Diffstat (limited to 'src')
-rw-r--r--  src/btree/bt_compact.c  2652
-rw-r--r--  src/btree/bt_compare.c  213
-rw-r--r--  src/btree/bt_compress.c  3173
-rw-r--r--  src/btree/bt_conv.c  95
-rw-r--r--  src/btree/bt_curadj.c  694
-rw-r--r--  src/btree/bt_cursor.c  3076
-rw-r--r--  src/btree/bt_delete.c  541
-rw-r--r--  src/btree/bt_method.c  745
-rw-r--r--  src/btree/bt_open.c  677
-rw-r--r--  src/btree/bt_put.c  1087
-rw-r--r--  src/btree/bt_rec.c  2036
-rw-r--r--  src/btree/bt_reclaim.c  98
-rw-r--r--  src/btree/bt_recno.c  1427
-rw-r--r--  src/btree/bt_rsearch.c  513
-rw-r--r--  src/btree/bt_search.c  1028
-rw-r--r--  src/btree/bt_split.c  1332
-rw-r--r--  src/btree/bt_stat.c  669
-rw-r--r--  src/btree/bt_upgrade.c  153
-rw-r--r--  src/btree/bt_verify.c  2805
-rw-r--r--  src/btree/btree.src  290
-rw-r--r--  src/btree/btree_auto.c  207
-rw-r--r--  src/btree/btree_autop.c  291
-rw-r--r--  src/clib/atoi.c  50
-rw-r--r--  src/clib/atol.c  50
-rw-r--r--  src/clib/bsearch.c  38
-rw-r--r--  src/clib/getcwd.c  261
-rw-r--r--  src/clib/getopt.c  153
-rw-r--r--  src/clib/isalpha.c  28
-rw-r--r--  src/clib/isdigit.c  28
-rw-r--r--  src/clib/isprint.c  28
-rw-r--r--  src/clib/isspace.c  26
-rw-r--r--  src/clib/memcmp.c  62
-rw-r--r--  src/clib/memmove.c  150
-rw-r--r--  src/clib/printf.c  116
-rw-r--r--  src/clib/qsort.c  181
-rw-r--r--  src/clib/raise.c  26
-rw-r--r--  src/clib/rand.c  25
-rw-r--r--  src/clib/snprintf.c  149
-rw-r--r--  src/clib/strcasecmp.c  97
-rw-r--r--  src/clib/strcat.c  53
-rw-r--r--  src/clib/strchr.c  57
-rw-r--r--  src/clib/strdup.c  59
-rw-r--r--  src/clib/strerror.c  225
-rw-r--r--  src/clib/strncat.c  69
-rw-r--r--  src/clib/strncmp.c  61
-rw-r--r--  src/clib/strrchr.c  58
-rw-r--r--  src/clib/strsep.c  80
-rw-r--r--  src/clib/strtol.c  142
-rw-r--r--  src/clib/strtoul.c  121
-rw-r--r--  src/clib/time.c  34
-rw-r--r--  src/common/clock.c  57
-rw-r--r--  src/common/crypto_stub.c  44
-rw-r--r--  src/common/db_byteorder.c  63
-rw-r--r--  src/common/db_compint.c  555
-rw-r--r--  src/common/db_err.c  1118
-rw-r--r--  src/common/db_getlong.c  146
-rw-r--r--  src/common/db_idspace.c  85
-rw-r--r--  src/common/db_log2.c  57
-rw-r--r--  src/common/db_shash.c  104
-rw-r--r--  src/common/dbt.c  74
-rw-r--r--  src/common/mkpath.c  68
-rw-r--r--  src/common/openflags.c  51
-rw-r--r--  src/common/os_method.c  270
-rw-r--r--  src/common/util_arg.c  56
-rw-r--r--  src/common/util_cache.c  47
-rw-r--r--  src/common/util_log.c  45
-rw-r--r--  src/common/util_sig.c  110
-rw-r--r--  src/common/zerofill.c  129
-rw-r--r--  src/crypto/aes_method.c  357
-rw-r--r--  src/crypto/crypto.c  411
-rw-r--r--  src/crypto/crypto.html  638
-rw-r--r--  src/crypto/mersenne/mt19937db.c  187
-rw-r--r--  src/crypto/rijndael/rijndael-alg-fst.c  1466
-rw-r--r--  src/crypto/rijndael/rijndael-alg-fst.h  40
-rw-r--r--  src/crypto/rijndael/rijndael-api-fst.c  491
-rw-r--r--  src/crypto/rijndael/rijndael-api-fst.h  91
-rw-r--r--  src/db/crdel.src  71
-rw-r--r--  src/db/crdel_auto.c  59
-rw-r--r--  src/db/crdel_autop.c  103
-rw-r--r--  src/db/crdel_rec.c  301
-rw-r--r--  src/db/db.c  1659
-rw-r--r--  src/db/db.src  431
-rw-r--r--  src/db/db_am.c  1150
-rw-r--r--  src/db/db_auto.c  276
-rw-r--r--  src/db/db_autop.c  441
-rw-r--r--  src/db/db_backup.c  775
-rw-r--r--  src/db/db_cam.c  3506
-rw-r--r--  src/db/db_cds.c  201
-rw-r--r--  src/db/db_compact.c  1087
-rw-r--r--  src/db/db_conv.c  890
-rw-r--r--  src/db/db_copy.c  31
-rw-r--r--  src/db/db_dispatch.c  977
-rw-r--r--  src/db/db_dup.c  214
-rw-r--r--  src/db/db_iface.c  3001
-rw-r--r--  src/db/db_join.c  940
-rw-r--r--  src/db/db_meta.c  1428
-rw-r--r--  src/db/db_method.c  1117
-rw-r--r--  src/db/db_open.c  857
-rw-r--r--  src/db/db_overflow.c  705
-rw-r--r--  src/db/db_ovfl_vrfy.c  410
-rw-r--r--  src/db/db_pr.c  1956
-rw-r--r--  src/db/db_rec.c  2796
-rw-r--r--  src/db/db_reclaim.c  245
-rw-r--r--  src/db/db_remove.c  515
-rw-r--r--  src/db/db_rename.c  383
-rw-r--r--  src/db/db_ret.c  169
-rw-r--r--  src/db/db_setid.c  213
-rw-r--r--  src/db/db_setlsn.c  137
-rw-r--r--  src/db/db_sort_multiple.c  327
-rw-r--r--  src/db/db_stati.c  502
-rw-r--r--  src/db/db_truncate.c  233
-rw-r--r--  src/db/db_upg.c  527
-rw-r--r--  src/db/db_upg_opd.c  343
-rw-r--r--  src/db/db_vrfy.c  3055
-rw-r--r--  src/db/db_vrfy_stub.c  120
-rw-r--r--  src/db/db_vrfyutil.c  932
-rw-r--r--  src/db/partition.c  2059
-rw-r--r--  src/dbinc/atomic.h  220
-rw-r--r--  src/dbinc/btree.h  553
-rw-r--r--  src/dbinc/clock.h  131
-rw-r--r--  src/dbinc/crypto.h  93
-rw-r--r--  src/dbinc/cxx_int.h  77
-rw-r--r--  src/dbinc/db.in  2810
-rw-r--r--  src/dbinc/db_185.in  176
-rw-r--r--  src/dbinc/db_am.h  327
-rw-r--r--  src/dbinc/db_cxx.in  1523
-rw-r--r--  src/dbinc/db_dispatch.h  97
-rw-r--r--  src/dbinc/db_int.in  1162
-rw-r--r--  src/dbinc/db_join.h  37
-rw-r--r--  src/dbinc/db_page.h  841
-rw-r--r--  src/dbinc/db_swap.h  262
-rw-r--r--  src/dbinc/db_upgrade.h  248
-rw-r--r--  src/dbinc/db_verify.h  210
-rw-r--r--  src/dbinc/debug.h  283
-rw-r--r--  src/dbinc/fop.h  32
-rw-r--r--  src/dbinc/globals.h  105
-rw-r--r--  src/dbinc/hash.h  173
-rw-r--r--  src/dbinc/heap.h  59
-rw-r--r--  src/dbinc/hmac.h  39
-rw-r--r--  src/dbinc/lock.h  326
-rw-r--r--  src/dbinc/log.h  463
-rw-r--r--  src/dbinc/log_verify.h  207
-rw-r--r--  src/dbinc/mp.h  700
-rw-r--r--  src/dbinc/mutex.h  305
-rw-r--r--  src/dbinc/mutex_int.h  1070
-rw-r--r--  src/dbinc/os.h  178
-rw-r--r--  src/dbinc/partition.h  57
-rw-r--r--  src/dbinc/perfmon.h  103
-rw-r--r--  src/dbinc/qam.h  203
-rw-r--r--  src/dbinc/queue.h  570
-rw-r--r--  src/dbinc/region.h  329
-rw-r--r--  src/dbinc/rep.h  1102
-rw-r--r--  src/dbinc/repmgr.h  843
-rw-r--r--  src/dbinc/shqueue.h  410
-rw-r--r--  src/dbinc/tcl_db.h  316
-rw-r--r--  src/dbinc/txn.h  288
-rw-r--r--  src/dbinc/win_db.h  148
-rw-r--r--  src/dbinc/xa.h  183
-rw-r--r--  src/dbinc_auto/api_flags.in  228
-rw-r--r--  src/dbinc_auto/btree_auto.h  456
-rw-r--r--  src/dbinc_auto/btree_ext.h  147
-rw-r--r--  src/dbinc_auto/clib_ext.h  113
-rw-r--r--  src/dbinc_auto/common_ext.h  75
-rw-r--r--  src/dbinc_auto/crdel_auto.h  127
-rw-r--r--  src/dbinc_auto/crypto_ext.h  38
-rw-r--r--  src/dbinc_auto/db_auto.h  666
-rw-r--r--  src/dbinc_auto/db_ext.h  346
-rw-r--r--  src/dbinc_auto/dbreg_auto.h  43
-rw-r--r--  src/dbinc_auto/dbreg_ext.h  46
-rw-r--r--  src/dbinc_auto/env_ext.h  158
-rw-r--r--  src/dbinc_auto/ext_185_def.in  12
-rw-r--r--  src/dbinc_auto/ext_185_prot.in  19
-rw-r--r--  src/dbinc_auto/ext_def.in  66
-rw-r--r--  src/dbinc_auto/ext_prot.in  73
-rw-r--r--  src/dbinc_auto/fileops_auto.h  262
-rw-r--r--  src/dbinc_auto/fileops_ext.h  44
-rw-r--r--  src/dbinc_auto/hash_auto.h  484
-rw-r--r--  src/dbinc_auto/hash_ext.h  129
-rw-r--r--  src/dbinc_auto/heap_auto.h  146
-rw-r--r--  src/dbinc_auto/heap_ext.h  58
-rw-r--r--  src/dbinc_auto/hmac_ext.h  20
-rw-r--r--  src/dbinc_auto/int_def.in  2265
-rw-r--r--  src/dbinc_auto/lock_ext.h  78
-rw-r--r--  src/dbinc_auto/log_ext.h  208
-rw-r--r--  src/dbinc_auto/mp_ext.h  106
-rw-r--r--  src/dbinc_auto/mutex_ext.h  91
-rw-r--r--  src/dbinc_auto/os_ext.h  84
-rw-r--r--  src/dbinc_auto/qam_auto.h  174
-rw-r--r--  src/dbinc_auto/qam_ext.h  68
-rw-r--r--  src/dbinc_auto/rep_automsg.h  120
-rw-r--r--  src/dbinc_auto/rep_ext.h  151
-rw-r--r--  src/dbinc_auto/repmgr_auto.h  41
-rw-r--r--  src/dbinc_auto/repmgr_automsg.h  113
-rw-r--r--  src/dbinc_auto/repmgr_ext.h  249
-rw-r--r--  src/dbinc_auto/sequence_ext.h  17
-rw-r--r--  src/dbinc_auto/tcl_ext.h  134
-rw-r--r--  src/dbinc_auto/txn_auto.h  220
-rw-r--r--  src/dbinc_auto/txn_ext.h  93
-rw-r--r--  src/dbinc_auto/xa_ext.h  18
-rw-r--r--  src/dbreg/dbreg.c  1012
-rw-r--r--  src/dbreg/dbreg.src  37
-rw-r--r--  src/dbreg/dbreg_auto.c  35
-rw-r--r--  src/dbreg/dbreg_autop.c  43
-rw-r--r--  src/dbreg/dbreg_rec.c  399
-rw-r--r--  src/dbreg/dbreg_stat.c  140
-rw-r--r--  src/dbreg/dbreg_util.c  847
-rw-r--r--  src/env/env_alloc.c  759
-rw-r--r--  src/env/env_backup.c  166
-rw-r--r--  src/env/env_config.c  737
-rw-r--r--  src/env/env_failchk.c  558
-rw-r--r--  src/env/env_file.c  128
-rw-r--r--  src/env/env_globals.c  66
-rw-r--r--  src/env/env_method.c  1918
-rw-r--r--  src/env/env_name.c  285
-rw-r--r--  src/env/env_open.c  1262
-rw-r--r--  src/env/env_recover.c  1093
-rw-r--r--  src/env/env_region.c  1497
-rw-r--r--  src/env/env_register.c  730
-rw-r--r--  src/env/env_sig.c  201
-rw-r--r--  src/env/env_stat.c  879
-rw-r--r--  src/fileops/fileops.src  137
-rw-r--r--  src/fileops/fileops_auto.c  118
-rw-r--r--  src/fileops/fileops_autop.c  177
-rw-r--r--  src/fileops/fop_basic.c  318
-rw-r--r--  src/fileops/fop_rec.c  697
-rw-r--r--  src/fileops/fop_util.c  1841
-rw-r--r--  src/hash/hash.c  2340
-rw-r--r--  src/hash/hash.src  328
-rw-r--r--  src/hash/hash_auto.c  209
-rw-r--r--  src/hash/hash_autop.c  314
-rw-r--r--  src/hash/hash_compact.c  549
-rw-r--r--  src/hash/hash_conv.c  110
-rw-r--r--  src/hash/hash_dup.c  943
-rw-r--r--  src/hash/hash_func.c  240
-rw-r--r--  src/hash/hash_meta.c  170
-rw-r--r--  src/hash/hash_method.c  250
-rw-r--r--  src/hash/hash_open.c  584
-rw-r--r--  src/hash/hash_page.c  3182
-rw-r--r--  src/hash/hash_rec.c  1896
-rw-r--r--  src/hash/hash_reclaim.c  98
-rw-r--r--  src/hash/hash_stat.c  518
-rw-r--r--  src/hash/hash_stub.c  470
-rw-r--r--  src/hash/hash_upgrade.c  323
-rw-r--r--  src/hash/hash_verify.c  1157
-rw-r--r--  src/heap/heap.c  2812
-rw-r--r--  src/heap/heap.src  101
-rw-r--r--  src/heap/heap_auto.c  73
-rw-r--r--  src/heap/heap_autop.c  105
-rw-r--r--  src/heap/heap_backup.c  66
-rw-r--r--  src/heap/heap_conv.c  93
-rw-r--r--  src/heap/heap_method.c  168
-rw-r--r--  src/heap/heap_open.c  439
-rw-r--r--  src/heap/heap_rec.c  386
-rw-r--r--  src/heap/heap_reclaim.c  152
-rw-r--r--  src/heap/heap_stat.c  289
-rw-r--r--  src/heap/heap_stub.c  328
-rw-r--r--  src/heap/heap_verify.c  468
-rw-r--r--  src/hmac/hmac.c  223
-rw-r--r--  src/hmac/sha1.c  289
-rw-r--r--  src/lock/Design  301
-rw-r--r--  src/lock/lock.c  2020
-rw-r--r--  src/lock/lock_alloc.incl  138
-rw-r--r--  src/lock/lock_deadlock.c  1063
-rw-r--r--  src/lock/lock_failchk.c  114
-rw-r--r--  src/lock/lock_id.c  572
-rw-r--r--  src/lock/lock_list.c  365
-rw-r--r--  src/lock/lock_method.c  630
-rw-r--r--  src/lock/lock_region.c  578
-rw-r--r--  src/lock/lock_stat.c  770
-rw-r--r--  src/lock/lock_stub.c  631
-rw-r--r--  src/lock/lock_timer.c  128
-rw-r--r--  src/lock/lock_util.c  98
-rw-r--r--  src/log/log.c  1727
-rw-r--r--  src/log/log_archive.c  643
-rw-r--r--  src/log/log_compare.c  66
-rw-r--r--  src/log/log_debug.c  146
-rw-r--r--  src/log/log_get.c  1626
-rw-r--r--  src/log/log_method.c  533
-rw-r--r--  src/log/log_print.c  380
-rw-r--r--  src/log/log_put.c  2041
-rw-r--r--  src/log/log_stat.c  336
-rw-r--r--  src/log/log_verify.c  437
-rw-r--r--  src/log/log_verify_auto.c  318
-rw-r--r--  src/log/log_verify_int.c  4353
-rw-r--r--  src/log/log_verify_stub.c  79
-rw-r--r--  src/log/log_verify_util.c  2234
-rw-r--r--  src/mp/mp_alloc.c  724
-rw-r--r--  src/mp/mp_backup.c  333
-rw-r--r--  src/mp/mp_bh.c  690
-rw-r--r--  src/mp/mp_fget.c  1230
-rw-r--r--  src/mp/mp_fmethod.c  589
-rw-r--r--  src/mp/mp_fopen.c  1220
-rw-r--r--  src/mp/mp_fput.c  374
-rw-r--r--  src/mp/mp_fset.c  170
-rw-r--r--  src/mp/mp_method.c  1091
-rw-r--r--  src/mp/mp_mvcc.c  636
-rw-r--r--  src/mp/mp_region.c  620
-rw-r--r--  src/mp/mp_register.c  116
-rw-r--r--  src/mp/mp_resize.c  605
-rw-r--r--  src/mp/mp_stat.c  905
-rw-r--r--  src/mp/mp_sync.c  965
-rw-r--r--  src/mp/mp_trickle.c  112
-rw-r--r--  src/mutex/README  110
-rw-r--r--  src/mutex/mut_alloc.c  291
-rw-r--r--  src/mutex/mut_failchk.c  76
-rw-r--r--  src/mutex/mut_fcntl.c  248
-rw-r--r--  src/mutex/mut_method.c  482
-rw-r--r--  src/mutex/mut_pthread.c  770
-rw-r--r--  src/mutex/mut_region.c  468
-rw-r--r--  src/mutex/mut_stat.c  579
-rw-r--r--  src/mutex/mut_stub.c  252
-rw-r--r--  src/mutex/mut_tas.c  608
-rw-r--r--  src/mutex/mut_win32.c  589
-rw-r--r--  src/mutex/test_mutex.c  1051
-rw-r--r--  src/mutex/uts4_cc.s  26
-rw-r--r--  src/os/os_abort.c  33
-rw-r--r--  src/os/os_abs.c  24
-rw-r--r--  src/os/os_addrinfo.c  179
-rw-r--r--  src/os/os_alloc.c  464
-rw-r--r--  src/os/os_clock.c  73
-rw-r--r--  src/os/os_config.c  70
-rw-r--r--  src/os/os_cpu.c  47
-rw-r--r--  src/os/os_ctime.c  47
-rw-r--r--  src/os/os_dir.c  140
-rw-r--r--  src/os/os_errno.c  129
-rw-r--r--  src/os/os_fid.c  135
-rw-r--r--  src/os/os_flock.c  64
-rw-r--r--  src/os/os_fsync.c  104
-rw-r--r--  src/os/os_getenv.c  58
-rw-r--r--  src/os/os_handle.c  243
-rw-r--r--  src/os/os_map.c  607
-rw-r--r--  src/os/os_mkdir.c  52
-rw-r--r--  src/os/os_open.c  162
-rw-r--r--  src/os/os_path.c  27
-rw-r--r--  src/os/os_pid.c  63
-rw-r--r--  src/os/os_rename.c  53
-rw-r--r--  src/os/os_root.c  27
-rw-r--r--  src/os/os_rpath.c  36
-rw-r--r--  src/os/os_rw.c  291
-rw-r--r--  src/os/os_seek.c  66
-rw-r--r--  src/os/os_stack.c  45
-rw-r--r--  src/os/os_stat.c  108
-rw-r--r--  src/os/os_tmpdir.c  141
-rw-r--r--  src/os/os_truncate.c  63
-rw-r--r--  src/os/os_uid.c  55
-rw-r--r--  src/os/os_unlink.c  80
-rw-r--r--  src/os/os_yield.c  95
-rw-r--r--  src/os_qnx/os_qnx_fsync.c  73
-rw-r--r--  src/os_qnx/os_qnx_open.c  79
-rw-r--r--  src/os_vxworks/os_vx_abs.c  42
-rw-r--r--  src/os_vxworks/os_vx_config.c  56
-rw-r--r--  src/os_vxworks/os_vx_map.c  436
-rw-r--r--  src/os_vxworks/os_vx_rpath.c  55
-rw-r--r--  src/os_vxworks/os_vx_yield.c  49
-rw-r--r--  src/os_windows/ce_ctime.c  87
-rw-r--r--  src/os_windows/os_abs.c  33
-rw-r--r--  src/os_windows/os_clock.c  79
-rw-r--r--  src/os_windows/os_config.c  133
-rw-r--r--  src/os_windows/os_cpu.c  27
-rw-r--r--  src/os_windows/os_dir.c  122
-rw-r--r--  src/os_windows/os_errno.c  428
-rw-r--r--  src/os_windows/os_fid.c  129
-rw-r--r--  src/os_windows/os_flock.c  90
-rw-r--r--  src/os_windows/os_fsync.c  44
-rw-r--r--  src/os_windows/os_getenv.c  103
-rw-r--r--  src/os_windows/os_handle.c  167
-rw-r--r--  src/os_windows/os_map.c  397
-rw-r--r--  src/os_windows/os_mkdir.c  44
-rw-r--r--  src/os_windows/os_open.c  258
-rw-r--r--  src/os_windows/os_rename.c  82
-rw-r--r--  src/os_windows/os_rw.c  218
-rw-r--r--  src/os_windows/os_seek.c  67
-rw-r--r--  src/os_windows/os_stat.c  231
-rw-r--r--  src/os_windows/os_truncate.c  99
-rw-r--r--  src/os_windows/os_unlink.c  123
-rw-r--r--  src/os_windows/os_yield.c  35
-rw-r--r--  src/qam/qam.c  1760
-rw-r--r--  src/qam/qam.src  89
-rw-r--r--  src/qam/qam_auto.c  83
-rw-r--r--  src/qam/qam_autop.c  126
-rw-r--r--  src/qam/qam_conv.c  79
-rw-r--r--  src/qam/qam_files.c  939
-rw-r--r--  src/qam/qam_method.c  399
-rw-r--r--  src/qam/qam_open.c  346
-rw-r--r--  src/qam/qam_rec.c  687
-rw-r--r--  src/qam/qam_stat.c  255
-rw-r--r--  src/qam/qam_stub.c  339
-rw-r--r--  src/qam/qam_upgrade.c  101
-rw-r--r--  src/qam/qam_verify.c  653
-rw-r--r--  src/rep/mlease.html  1198
-rw-r--r--  src/rep/rep.msg  160
-rw-r--r--  src/rep/rep_automsg.c  1041
-rw-r--r--  src/rep/rep_backup.c  3568
-rw-r--r--  src/rep/rep_elect.c  1486
-rw-r--r--  src/rep/rep_lease.c  545
-rw-r--r--  src/rep/rep_log.c  1060
-rw-r--r--  src/rep/rep_method.c  3032
-rw-r--r--  src/rep/rep_record.c  2586
-rw-r--r--  src/rep/rep_region.c  610
-rw-r--r--  src/rep/rep_stat.c  692
-rw-r--r--  src/rep/rep_stub.c  425
-rw-r--r--  src/rep/rep_util.c  2791
-rw-r--r--  src/rep/rep_verify.c  751
-rw-r--r--  src/repmgr/repmgr.msg  119
-rw-r--r--  src/repmgr/repmgr.src  23
-rw-r--r--  src/repmgr/repmgr_auto.c  32
-rw-r--r--  src/repmgr/repmgr_automsg.c  757
-rw-r--r--  src/repmgr/repmgr_autop.c  44
-rw-r--r--  src/repmgr/repmgr_elect.c  585
-rw-r--r--  src/repmgr/repmgr_method.c  3092
-rw-r--r--  src/repmgr/repmgr_msg.c  1655
-rw-r--r--  src/repmgr/repmgr_net.c  2043
-rw-r--r--  src/repmgr/repmgr_posix.c  804
-rw-r--r--  src/repmgr/repmgr_queue.c  180
-rw-r--r--  src/repmgr/repmgr_rec.c  45
-rw-r--r--  src/repmgr/repmgr_sel.c  2096
-rw-r--r--  src/repmgr/repmgr_stat.c  363
-rw-r--r--  src/repmgr/repmgr_stub.c  262
-rw-r--r--  src/repmgr/repmgr_util.c  2086
-rw-r--r--  src/repmgr/repmgr_windows.c  849
-rw-r--r--  src/sequence/seq_stat.c  275
-rw-r--r--  src/sequence/sequence.c  1011
-rw-r--r--  src/txn/txn.c  2169
-rw-r--r--  src/txn/txn.src  120
-rw-r--r--  src/txn/txn_auto.c  93
-rw-r--r--  src/txn/txn_autop.c  175
-rw-r--r--  src/txn/txn_chkpt.c  419
-rw-r--r--  src/txn/txn_failchk.c  101
-rw-r--r--  src/txn/txn_method.c  124
-rw-r--r--  src/txn/txn_rec.c  616
-rw-r--r--  src/txn/txn_recover.c  317
-rw-r--r--  src/txn/txn_region.c  518
-rw-r--r--  src/txn/txn_stat.c  461
-rw-r--r--  src/txn/txn_util.c  696
-rw-r--r--  src/xa/xa.c  1068
-rw-r--r--  src/xa/xa_map.c  152
436 files changed, 224104 insertions, 0 deletions
diff --git a/src/btree/bt_compact.c b/src/btree/bt_compact.c
new file mode 100644
index 00000000..b455ff23
--- /dev/null
+++ b/src/btree/bt_compact.c
@@ -0,0 +1,2652 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+static int __bam_compact_dups __P((DBC *,
+ PAGE **, u_int32_t, int, DB_COMPACT *, int *));
+static int __bam_compact_isdone __P((DBC *, DBT *, PAGE *, int *));
+static int __bam_csearch __P((DBC *, DBT *, u_int32_t, int));
+static int __bam_lock_tree __P((DBC *, EPG *, EPG *csp, u_int32_t, u_int32_t));
+static int __bam_lock_subtree __P((DBC *, PAGE *, u_int32_t, u_int32_t));
+static int __bam_merge __P((DBC *,
+ DBC *, u_int32_t, DBT *, DB_COMPACT *,int *));
+static int __bam_merge_internal __P((DBC *, DBC *, int, DB_COMPACT *, int *));
+static int __bam_merge_pages __P((DBC *, DBC *, DB_COMPACT *));
+static int __bam_merge_records __P((DBC *, DBC*, u_int32_t, DB_COMPACT *));
+static int __bam_truncate_internal_overflow __P((DBC *, PAGE *, DB_COMPACT *));
+static int __bam_truncate_root_page __P((DBC *,
+ PAGE *, u_int32_t, DB_COMPACT *));
+
+#ifdef HAVE_FTRUNCATE
+static int __bam_savekey __P((DBC *, int, DBT *));
+#endif
+
+/*
+ * __bam_csearch -- isolate search code for bam_compact.
+ * This routine hides the differences between searching
+ * a BTREE and a RECNO from the rest of the code.
+ */
+#define CS_READ 0 /* We are just reading. */
+#define CS_PARENT 1 /* We want the parent too, write lock. */
+#define CS_NEXT 2 /* Get the next page. */
+#define CS_NEXT_WRITE 3 /* Get the next page and write lock. */
+#define CS_DEL 4 /* Get a stack to delete a page. */
+#define CS_START 5 /* Starting level for stack, write lock. */
+#define CS_NEXT_BOTH 6 /* Get this page and the next, write lock. */
+#define CS_GETRECNO 0x80 /* Extract record number from start. */
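+/*
+ * (Note: CS_GETRECNO is a modifier or'd into the flags above -- the
+ * initial search in __bam_compact_int below uses CS_READ | CS_GETRECNO --
+ * and is stripped with FLD_CLR before the flag is mapped to SR_* values.)
+ */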
+
+static int
+__bam_csearch(dbc, start, sflag, level)
+ DBC *dbc;
+ DBT *start;
+ u_int32_t sflag;
+ int level;
+{
+ BTREE_CURSOR *cp;
+ int not_used, ret;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ if (dbc->dbtype == DB_RECNO) {
+		/* If GETRECNO is not set then cp->recno is what we want. */
+ if (FLD_ISSET(sflag, CS_GETRECNO)) {
+ if (start == NULL || start->size == 0)
+ cp->recno = 1;
+ else if ((ret =
+ __ram_getno(dbc, start, &cp->recno, 0)) != 0)
+ return (ret);
+ FLD_CLR(sflag, CS_GETRECNO);
+ }
+ switch (sflag) {
+ case CS_READ:
+ sflag = SR_READ;
+ break;
+ case CS_NEXT:
+ sflag = SR_PARENT | SR_READ;
+ break;
+ case CS_START:
+ level = LEAFLEVEL;
+ /* FALLTHROUGH */
+ case CS_DEL:
+ case CS_NEXT_WRITE:
+ sflag = SR_STACK;
+ break;
+ case CS_NEXT_BOTH:
+ sflag = SR_BOTH | SR_NEXT | SR_WRITE;
+ break;
+ case CS_PARENT:
+ sflag = SR_PARENT | SR_WRITE;
+ break;
+ default:
+ return (__env_panic(dbc->env, EINVAL));
+ }
+ if ((ret = __bam_rsearch(dbc,
+ &cp->recno, sflag, level, &not_used)) != 0)
+ return (ret);
+ /* Reset the cursor's recno to the beginning of the page. */
+ cp->recno -= cp->csp->indx;
+ } else {
+ FLD_CLR(sflag, CS_GETRECNO);
+ switch (sflag) {
+ case CS_READ:
+ sflag = SR_READ | SR_DUPFIRST;
+ break;
+ case CS_DEL:
+ sflag = SR_DEL;
+ break;
+ case CS_NEXT:
+ sflag = SR_NEXT;
+ break;
+ case CS_NEXT_WRITE:
+ sflag = SR_NEXT | SR_WRITE;
+ break;
+ case CS_NEXT_BOTH:
+ sflag = SR_BOTH | SR_NEXT | SR_WRITE;
+ break;
+ case CS_START:
+ sflag = SR_START | SR_WRITE;
+ break;
+ case CS_PARENT:
+ sflag = SR_PARENT | SR_WRITE;
+ break;
+ default:
+ return (__env_panic(dbc->env, EINVAL));
+ }
+ if (start == NULL || start->size == 0)
+ FLD_SET(sflag, SR_MIN);
+
+ if ((ret = __bam_search(dbc,
+ PGNO_INVALID, start, sflag, level, NULL, &not_used)) != 0)
+ return (ret);
+ }
+
+ return (0);
+}
+
+/*
+ * __bam_compact_int -- internal compaction routine.
+ * Called either with a cursor on the main database
+ * or a cursor initialized to the root of an off page duplicate
+ * tree.
+ * PUBLIC: int __bam_compact_int __P((DBC *,
+ * PUBLIC: DBT *, DBT *, u_int32_t, int *, DB_COMPACT *, int *));
+ */
+int
+__bam_compact_int(dbc, start, stop, factor, spanp, c_data, donep)
+ DBC *dbc;
+ DBT *start, *stop;
+ u_int32_t factor;
+ int *spanp;
+ DB_COMPACT *c_data;
+ int *donep;
+{
+ BTREE_CURSOR *cp, *ncp;
+ DB *dbp;
+ DBC *ndbc;
+ DB_LOCK metalock, next_lock, nnext_lock, prev_lock, saved_lock;
+ DB_MPOOLFILE *dbmp;
+ ENV *env;
+ EPG *epg;
+ PAGE *pg, *ppg, *npg;
+ db_pgno_t metapgno, npgno, nnext_pgno;
+ db_pgno_t pgno, prev_pgno, ppgno, saved_pgno;
+ db_recno_t next_recno;
+ u_int32_t nentry, sflag, pgs_free;
+ int check_dups, check_trunc, clear_root, do_commit, isdone;
+ int merged, next_p, pgs_done, ret, t_ret, tdone;
+
+#ifdef DEBUG
+#define CTRACE(dbc, location, t, start, f) do { \
+ DBT __trace; \
+ DB_SET_DBT(__trace, t, strlen(t)); \
+ DEBUG_LWRITE( \
+ dbc, (dbc)->txn, location, &__trace, start, f) \
+ } while (0)
+#define PTRACE(dbc, location, p, start, f) do { \
+ char __buf[32]; \
+ (void)snprintf(__buf, \
+ sizeof(__buf), "pgno: %lu", (u_long)p); \
+ CTRACE(dbc, location, __buf, start, f); \
+ } while (0)
+#else
+#define CTRACE(dbc, location, t, start, f)
+#define PTRACE(dbc, location, p, start, f)
+#endif
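+/*
+ * (Example from the loop below: PTRACE(dbc, "Next", PGNO(pg), start, 0)
+ * logs a "pgno: <n>" trace record along with the current start key when
+ * DEBUG is defined; both macros expand to nothing otherwise.)
+ */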
+
+ ndbc = NULL;
+ pg = NULL;
+ npg = NULL;
+
+ isdone = 0;
+ tdone = 0;
+ pgs_done = 0;
+ do_commit = 0;
+ next_recno = 0;
+ next_p = 0;
+ clear_root = 0;
+ metapgno = PGNO_BASE_MD;
+ ppgno = PGNO_INVALID;
+ LOCK_INIT(next_lock);
+ LOCK_INIT(nnext_lock);
+ LOCK_INIT(saved_lock);
+ LOCK_INIT(metalock);
+ LOCK_INIT(prev_lock);
+ check_trunc = c_data->compact_truncate != PGNO_INVALID;
+ check_dups = (!F_ISSET(dbc, DBC_OPD) &&
+ F_ISSET(dbc->dbp, DB_AM_DUP)) || check_trunc;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ dbmp = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ pgs_free = c_data->compact_pages_free;
+
+ /* Search down the tree for the starting point. */
+ if ((ret = __bam_csearch(dbc,
+ start, CS_READ | CS_GETRECNO, LEAFLEVEL)) != 0) {
+ /* Its not an error to compact an empty db. */
+		/* It's not an error to compact an empty db. */
+ ret = 0;
+ isdone = 1;
+ goto err;
+ }
+
+ /*
+	 * Get the first leaf page. The loop below will change pg, so
+	 * clear the stack reference to avoid putting a page twice.
+ */
+ pg = cp->csp->page;
+ cp->csp->page = NULL;
+ next_recno = cp->recno;
+next: /*
+ * This is the start of the main compaction loop. There are 3
+ * parts to the process:
+ * 1) Walk the leaf pages of the tree looking for a page to
+ * process. We do this with read locks. Save the
+ * key from the page and release it.
+ * 2) Set up a cursor stack which will write lock the page
+ * and enough of its ancestors to get the job done.
+ * This could go to the root if we might delete a subtree
+ * or we have record numbers to update.
+ * 3) Loop fetching pages after the above page and move enough
+ * data to fill it.
+ * We exit the loop if we are at the end of the leaf pages, are
+ * about to lock a new subtree (we span) or on error.
+ */
+
+ /* Walk the pages looking for something to fill up. */
+ while ((npgno = NEXT_PGNO(pg)) != PGNO_INVALID) {
+ c_data->compact_pages_examine++;
+ PTRACE(dbc, "Next", PGNO(pg), start, 0);
+
+ /* If we have fetched the next page, get the new key. */
+ if (next_p == 1 &&
+ dbc->dbtype != DB_RECNO && NUM_ENT(pg) != 0) {
+ if ((ret = __db_ret(dbc, pg, 0, start,
+ &start->data, &start->ulen)) != 0)
+ goto err;
+ }
+ next_recno += NUM_ENT(pg);
+ if (P_FREESPACE(dbp, pg) > factor ||
+ (check_trunc && PGNO(pg) > c_data->compact_truncate))
+ break;
+ if (stop != NULL && stop->size > 0) {
+ if ((ret = __bam_compact_isdone(dbc,
+ stop, pg, &isdone)) != 0)
+ goto err;
+ if (isdone)
+ goto done;
+ }
+
+ /*
+		 * The page does not need more data or to be swapped;
+		 * check to see if we want to look at possible duplicate
+		 * trees or overflow records, and then move on to the next page.
+ */
+ cp->recno += NUM_ENT(pg);
+ next_p = 1;
+ tdone = pgs_done;
+ PTRACE(dbc, "Dups", PGNO(pg), start, 0);
+ if (check_dups && (ret = __bam_compact_dups(
+ dbc, &pg, factor, 0, c_data, &pgs_done)) != 0)
+ goto err;
+ npgno = NEXT_PGNO(pg);
+ if ((ret = __memp_fput(dbmp,
+ dbc->thread_info, pg, dbc->priority)) != 0)
+ goto err;
+ pg = NULL;
+ /*
+ * If we don't do anything we don't need to hold
+ * the lock on the previous page, so couple always.
+ */
+ if ((ret = __db_lget(dbc,
+ tdone == pgs_done ? LCK_COUPLE_ALWAYS : LCK_COUPLE,
+ npgno, DB_LOCK_READ, 0, &cp->csp->lock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(dbmp, &npgno,
+ dbc->thread_info, dbc->txn, 0, &pg)) != 0)
+ goto err;
+ }
+
+ /*
+ * When we get here we have 3 cases:
+ * 1) We've reached the end of the leaf linked list and are done.
+ * 2) A page whose freespace exceeds our target and therefore needs
+ * to have data added to it.
+ * 3) A page that doesn't have too much free space but needs to be
+ * checked for truncation.
+ * In both cases 2 and 3, we need that page's first key or record
+	 * number. We may already have it; if not, get it here.
+ */
+ if ((nentry = NUM_ENT(pg)) != 0) {
+ /* Get a copy of the first recno on the page. */
+ if (dbc->dbtype == DB_RECNO) {
+ if ((ret = __db_retcopy(dbp->env, start,
+ &cp->recno, sizeof(cp->recno),
+ &start->data, &start->ulen)) != 0)
+ goto err;
+ } else if (((next_p == 1 && npgno == PGNO_INVALID) ||
+ start->size == 0) && (ret = __db_ret(dbc,
+ pg, 0, start, &start->data, &start->ulen)) != 0)
+ goto err;
+
+ next_p = 0;
+ /*
+ * If there is no next page we can stop unless there is
+ * a possibility of moving this data to a lower numbered
+ * page.
+ */
+ if (npgno == PGNO_INVALID &&
+ (!check_trunc || PGNO(pg) <= c_data->compact_truncate ||
+ PGNO(pg) == BAM_ROOT_PGNO(dbc))) {
+ /* End of the tree, check its duplicates and exit. */
+ PTRACE(dbc, "GoDone", PGNO(pg), start, 0);
+ if (check_dups && (ret = __bam_compact_dups(dbc,
+ &pg, factor, 0, c_data, &pgs_done)) != 0)
+ goto err;
+ c_data->compact_pages_examine++;
+ isdone = 1;
+ goto done;
+ }
+ }
+
+ /* Release the page so we don't deadlock getting its parent. */
+ if ((ret = __memp_fput(dbmp, dbc->thread_info, pg, dbc->priority)) != 0)
+ goto err;
+ if ((ret = __LPUT(dbc, cp->csp->lock)) != 0)
+ goto err;
+ BT_STK_CLR(cp);
+ pg = NULL;
+ saved_pgno = PGNO_INVALID;
+ prev_pgno = PGNO_INVALID;
+ nnext_pgno = PGNO_INVALID;
+
+ /*
+ * We must lock the metadata page first because we cannot block
+ * while holding interior nodes of the tree pinned.
+ */
+
+ if (!LOCK_ISSET(metalock) && pgs_free == c_data->compact_pages_free &&
+ (ret = __db_lget(dbc,
+ LCK_ALWAYS, metapgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
+ goto err;
+
+ /*
+ * Setup the cursor stack. There are 3 cases:
+ * 1) the page is empty and will be deleted: nentry == 0.
+ * 2) the next page has the same parent: *spanp == 0.
+ * 3) the next page has a different parent: *spanp == 1.
+ *
+ * We now need to search the tree again, getting a write lock
+ * on the page we are going to merge or delete. We do this by
+ * searching down the tree and locking as much of the subtree
+ * above the page as needed. In the case of a delete we will
+ * find the maximal subtree that can be deleted. In the case
+ * of merge if the current page and the next page are siblings
+ * with the same parent then we only need to lock the parent.
+ * Otherwise *span will be set and we need to search to find the
+ * lowest common ancestor. Dbc will be set to contain the subtree
+ * containing the page to be merged or deleted. Ndbc will contain
+ * the minimal subtree containing that page and its next sibling.
+ * In all cases for DB_RECNO we simplify things and get the whole
+ * tree if we need more than a single parent.
+ * The tree can collapse while we don't have it locked, so the
+ * page we are looking for may be gone. If so we are at
+ * the right most end of the leaf pages and are done.
+ */
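+	/*
+	 * (The TRY_LOCK and TRY_LOCK2 macros used below are defined outside
+	 * this hunk. As their use here suggests, when a no-wait lock attempt
+	 * fails they save the requested page number and lock -- e.g. in
+	 * saved_pgno/saved_lock -- and jump to the supplied label, "retry",
+	 * so the search and locking start over; hard errors leave ret != 0
+	 * for the check that follows each use.)
+	 */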
+
+retry: pg = NULL;
+ if (npg != NULL && (ret = __memp_fput(dbmp,
+ dbc->thread_info, npg, dbc->priority)) != 0)
+ goto err;
+ npg = NULL;
+ if (ndbc != NULL) {
+ ncp = (BTREE_CURSOR *)ndbc->internal;
+ if (clear_root == 1) {
+ ncp->sp->page = NULL;
+ LOCK_INIT(ncp->sp->lock);
+ }
+ if ((ret = __bam_stkrel(ndbc, 0)) != 0)
+ goto err;
+ }
+ clear_root = 0;
+ /* Case 1 -- page is empty. */
+ if (nentry == 0) {
+ CTRACE(dbc, "Empty", "", start, 0);
+ if (next_p == 1)
+ sflag = CS_NEXT_WRITE;
+ else
+ sflag = CS_DEL;
+ if ((ret = __bam_csearch(dbc, start, sflag, LEAFLEVEL)) != 0) {
+ isdone = 1;
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+ goto err;
+ }
+
+ pg = cp->csp->page;
+ /* Check to see if the page is still empty. */
+ if (NUM_ENT(pg) != 0)
+ npgno = PGNO(pg);
+ else {
+ npgno = NEXT_PGNO(pg);
+ /* If this is now the root, we are very done. */
+ if (PGNO(pg) == BAM_ROOT_PGNO(dbc))
+ isdone = 1;
+ else {
+ if (npgno != PGNO_INVALID) {
+ TRY_LOCK(dbc, npgno, saved_pgno,
+ next_lock, DB_LOCK_WRITE, retry);
+ if (ret != 0)
+ goto err;
+ }
+ if (PREV_PGNO(pg) != PGNO_INVALID) {
+ TRY_LOCK(dbc, PREV_PGNO(pg), prev_pgno,
+ prev_lock, DB_LOCK_WRITE, retry);
+ if (ret != 0)
+ goto err;
+ }
+ if ((ret =
+ __bam_dpages(dbc, 0, BTD_RELINK)) != 0)
+ goto err;
+ c_data->compact_pages_free++;
+ if ((ret = __TLPUT(dbc, prev_lock)) != 0)
+ goto err;
+ LOCK_INIT(prev_lock);
+ if ((ret = __TLPUT(dbc, next_lock)) != 0)
+ goto err;
+ LOCK_INIT(next_lock);
+ saved_pgno = PGNO_INVALID;
+ goto next_no_release;
+ }
+ }
+ goto next_page;
+ }
+
+ /* case 3 -- different parents. */
+ if (*spanp) {
+ CTRACE(dbc, "Span", "", start, 0);
+ /*
+		 * Search the tree looking for the page containing the
+		 * current key and for the next page after it.
+		 * The stack will be rooted at the page that spans
+		 * the current and next pages. The two subtrees
+		 * are returned below that. For BTREE the current
+		 * page subtree will be first, while for RECNO the
+		 * next page subtree will be first.
+ */
+ if (ndbc == NULL && (ret = __dbc_dup(dbc, &ndbc, 0)) != 0)
+ goto err;
+ DB_ASSERT(env, ndbc != NULL);
+ ncp = (BTREE_CURSOR *)ndbc->internal;
+
+ ncp->recno = cp->recno;
+ cp->recno = next_recno;
+
+ if ((ret = __bam_csearch(dbc, start, CS_NEXT_BOTH, 0)) != 0) {
+ if (ret == DB_NOTFOUND) {
+ isdone = 1;
+ ret = 0;
+ }
+ goto err;
+ }
+
+ /*
+ * Find the top of the stack for the second subtree.
+ */
+ for (epg = cp->csp - 1; epg > cp->sp; epg--)
+ if (LEVEL(epg->page) == LEAFLEVEL)
+ break;
+ DB_ASSERT(env, epg != cp->sp);
+
+ /*
+ * Copy the root. We will have two instances of the
+ * same page, be careful not to free both.
+ */
+ BT_STK_PUSH(env, ncp, cp->sp->page, cp->sp->indx,
+ cp->sp->lock, cp->sp->lock_mode, ret);
+ if (ret != 0)
+ goto err;
+ clear_root = 1;
+
+ /* Copy the stack containing the next page. */
+ for (epg++; epg <= cp->csp; epg++) {
+ BT_STK_PUSH(env, ncp, epg->page, epg->indx,
+ epg->lock, epg->lock_mode, ret);
+ if (ret != 0)
+ goto err;
+ }
+		/* Adjust the stack pointer to remove these items. */
+ ncp->csp--;
+ cp->csp -= ncp->csp - ncp->sp;
+
+ /*
+ * If this is RECNO then we want to swap the stacks.
+ */
+ if (dbc->dbtype == DB_RECNO) {
+ ndbc->internal = (DBC_INTERNAL *)cp;
+ dbc->internal = (DBC_INTERNAL *)ncp;
+ cp = ncp;
+ ncp = (BTREE_CURSOR *)ndbc->internal;
+ cp->sp->indx--;
+ } else
+ ncp->sp->indx++;
+
+ DB_ASSERT(env,
+ NEXT_PGNO(cp->csp->page) == PGNO(ncp->csp->page));
+ pg = cp->csp->page;
+
+ /*
+ * The page may have emptied while we waited for the
+ * lock or the record we are looking for may have
+ * moved.
+ * Reset npgno so we re-get this page when we go back
+ * to the top.
+ */
+ if (NUM_ENT(pg) == 0 ||
+ (dbc->dbtype == DB_RECNO &&
+ NEXT_PGNO(cp->csp->page) != PGNO(ncp->csp->page))) {
+ npgno = PGNO(pg);
+ *spanp = 0;
+ goto next_page;
+ }
+
+ if (check_trunc && PGNO(pg) > c_data->compact_truncate) {
+ if (PREV_PGNO(pg) != PGNO_INVALID) {
+ TRY_LOCK2(dbc, ndbc, PREV_PGNO(pg), prev_pgno,
+ prev_lock, DB_LOCK_WRITE, retry);
+ if (ret != 0)
+ goto err1;
+ }
+ pgs_done++;
+ /* Get a fresh low numbered page. */
+ if ((ret = __db_exchange_page(dbc,
+ &cp->csp->page, ncp->csp->page,
+ PGNO_INVALID, DB_EXCH_DEFAULT)) != 0)
+ goto err1;
+ if ((ret = __TLPUT(dbc, prev_lock)) != 0)
+ goto err1;
+ LOCK_INIT(prev_lock);
+ pg = cp->csp->page;
+ }
+ *spanp = 0;
+ PTRACE(dbc, "SDups", PGNO(ncp->csp->page), start, 0);
+ if (check_dups && (ret = __bam_compact_dups(ndbc,
+ &ncp->csp->page, factor, 1, c_data, &pgs_done)) != 0)
+ goto err1;
+
+ DB_ASSERT(env, ndbc != NULL);
+ /* Check to see if the tree collapsed. */
+ /*lint -e{794} */
+ if (PGNO(ncp->csp->page) == BAM_ROOT_PGNO(ndbc))
+ goto done;
+
+ pg = cp->csp->page;
+ npgno = NEXT_PGNO(pg);
+ PTRACE(dbc, "SDups", PGNO(pg), start, 0);
+ if (check_dups && (ret =
+ __bam_compact_dups(dbc, &cp->csp->page,
+ factor, 1, c_data, &pgs_done)) != 0)
+ goto err1;
+
+ /*
+ * We may have dropped our locks, check again
+ * to see if we still need to fill this page and
+ * we are in a spanning situation.
+ */
+
+ if (P_FREESPACE(dbp, pg) <= factor ||
+ cp->csp[-1].indx != NUM_ENT(cp->csp[-1].page) - 1)
+ goto next_page;
+
+ /*
+ * Try to move things into a single parent.
+ */
+ merged = 0;
+ for (epg = cp->sp; epg != cp->csp; epg++) {
+ PTRACE(dbc, "PMerge", PGNO(epg->page), start, 0);
+ if ((ret = __bam_merge_internal(dbc,
+ ndbc, LEVEL(epg->page), c_data, &merged)) != 0)
+ break;
+ if (merged)
+ break;
+ }
+
+ if (ret != 0 && ret != DB_LOCK_NOTGRANTED)
+ goto err1;
+ /*
+		 * If we merged the parent, then we no longer span.
+ * Otherwise if we tried to merge the parent but would
+ * block on one of the other leaf pages try again.
+ * If we did not merge any records of the parent,
+ * exit to commit any local transactions and try again.
+ */
+ if (merged || (pgs_done > 0 && ret == DB_LOCK_NOTGRANTED)) {
+ if (merged)
+ pgs_done++;
+ else
+ goto done;
+ if (cp->csp->page == NULL)
+ goto deleted;
+ npgno = PGNO(pg);
+ next_recno = cp->recno;
+ goto next_page;
+ }
+ PTRACE(dbc, "SMerge", PGNO(cp->csp->page), start, 0);
+
+		/* If we remove the next page, then we need its next page locked. */
+ npgno = NEXT_PGNO(ncp->csp->page);
+ if (npgno != PGNO_INVALID) {
+ TRY_LOCK2(dbc, ndbc, npgno,
+ nnext_pgno, nnext_lock, DB_LOCK_WRITE, retry);
+ if (ret != 0)
+ goto err1;
+ }
+ /*lint -e{794} */
+ if ((ret = __bam_merge(dbc,
+ ndbc, factor, stop, c_data, &isdone)) != 0)
+ goto err1;
+ pgs_done++;
+ /*
+ * __bam_merge could have freed our stack if it
+		 * deleted a page, possibly collapsing the tree.
+ */
+ if (cp->csp->page == NULL)
+ goto deleted;
+ cp->recno += NUM_ENT(pg);
+
+ if ((ret = __TLPUT(dbc, nnext_lock)) != 0)
+ goto err1;
+ LOCK_INIT(nnext_lock);
+ nnext_pgno = PGNO_INVALID;
+
+ /* If we did not bump to the next page something did not fit. */
+ if (npgno != NEXT_PGNO(pg)) {
+ npgno = NEXT_PGNO(pg);
+ goto next_page;
+ }
+ } else {
+ /* Case 2 -- same parents. */
+ CTRACE(dbc, "Sib", "", start, 0);
+ if ((ret =
+ __bam_csearch(dbc, start, CS_PARENT, LEAFLEVEL)) != 0) {
+ if (ret == DB_NOTFOUND) {
+ isdone = 1;
+ ret = 0;
+ }
+ goto err;
+ }
+
+ pg = cp->csp->page;
+ DB_ASSERT(env, IS_DIRTY(pg));
+ DB_ASSERT(env,
+ PGNO(pg) == BAM_ROOT_PGNO(dbc) ||
+ IS_DIRTY(cp->csp[-1].page));
+
+ /* Check to see if we moved to a new parent. */
+ if (PGNO(pg) != BAM_ROOT_PGNO(dbc) &&
+ ppgno != PGNO(cp->csp[-1].page) && pgs_done != 0) {
+ do_commit = 1;
+ goto next_page;
+ }
+
+ /* We now have a write lock, recheck the page. */
+ if ((nentry = NUM_ENT(pg)) == 0) {
+ npgno = PGNO(pg);
+ goto next_page;
+ }
+
+ /* Check duplicate trees, we have a write lock on the page. */
+ PTRACE(dbc, "SibDup", PGNO(pg), start, 0);
+ if (check_dups && (ret =
+ __bam_compact_dups(dbc, &cp->csp->page,
+ factor, 1, c_data, &pgs_done)) != 0)
+ goto err1;
+ pg = cp->csp->page;
+ npgno = NEXT_PGNO(pg);
+
+ /* Check to see if the tree collapsed. */
+ if (PGNO(pg) == BAM_ROOT_PGNO(dbc))
+ goto err1;
+ DB_ASSERT(env, cp->csp - cp->sp == 1);
+
+ /* After re-locking check to see if we still need to fill. */
+ if (P_FREESPACE(dbp, pg) <= factor) {
+ if (check_trunc &&
+ PGNO(pg) > c_data->compact_truncate) {
+ if (PREV_PGNO(pg) != PGNO_INVALID) {
+ TRY_LOCK(dbc, PREV_PGNO(pg), prev_pgno,
+ prev_lock, DB_LOCK_WRITE, retry);
+ if (ret != 0)
+ goto err1;
+ }
+ if (npgno != PGNO_INVALID) {
+ TRY_LOCK(dbc, npgno, saved_pgno,
+ next_lock, DB_LOCK_WRITE, retry);
+ if (ret != 0)
+ goto err1;
+ }
+ /* Get a fresh low numbered page. */
+ pgno = PGNO(pg);
+ if ((ret = __db_exchange_page(dbc,
+ &cp->csp->page, NULL,
+ PGNO_INVALID, DB_EXCH_DEFAULT)) != 0)
+ goto err1;
+ if ((ret = __TLPUT(dbc, prev_lock)) != 0)
+ goto err1;
+ LOCK_INIT(prev_lock);
+ prev_pgno = PGNO_INVALID;
+ if ((ret = __TLPUT(dbc, next_lock)) != 0)
+ goto err1;
+ LOCK_INIT(next_lock);
+ saved_pgno = PGNO_INVALID;
+ pg = cp->csp->page;
+ if (pgno != PGNO(pg)) {
+ pgs_done++;
+ pgno = PGNO(pg);
+ }
+ }
+ /*
+			 * If we are going to leave this parent, commit
+ * the current transaction before continuing.
+ */
+ epg = &cp->csp[-1];
+ if ((ppgno != PGNO(epg->page) &&
+ ppgno != PGNO_INVALID) ||
+ epg->indx == NUM_ENT(epg->page) - 1)
+ do_commit = 1;
+ ppgno = PGNO(epg->page);
+ goto next_page;
+ }
+
+		/* If they have the same parent, just dup the cursor. */
+ if (ndbc != NULL && (ret = __dbc_close(ndbc)) != 0)
+ goto err1;
+ if ((ret = __dbc_dup(dbc, &ndbc, DB_POSITION)) != 0)
+ goto err1;
+ ncp = (BTREE_CURSOR *)ndbc->internal;
+
+ /*
+ * ncp->recno needs to have the recno of the next page.
+ * Bump it by the number of records on the current page.
+ */
+ ncp->recno += NUM_ENT(pg);
+ }
+
+ pgno = PGNO(cp->csp->page);
+ ppgno = PGNO(cp->csp[-1].page);
+ /* Fetch pages until we fill this one. */
+ while (!isdone && npgno != PGNO_INVALID &&
+ P_FREESPACE(dbp, pg) > factor && c_data->compact_pages != 0) {
+ /*
+		 * Merging may have to free the parent page; if it does,
+		 * refetch it, but do so descending the tree.
+ */
+ epg = &cp->csp[-1];
+ if ((ppg = epg->page) == NULL) {
+ if ((ret = __memp_fput(dbmp, dbc->thread_info,
+ cp->csp->page, dbc->priority)) != 0)
+ goto err1;
+ pg = cp->csp->page = NULL;
+ if (F_ISSET(dbc->dbp, DB_AM_READ_UNCOMMITTED) &&
+ (ret = __db_lget(dbc, 0, ppgno,
+ DB_LOCK_WRITE, 0, &epg->lock)) != 0)
+ goto err1;
+ if ((ret = __memp_fget(dbmp, &ppgno, dbc->thread_info,
+ dbc->txn, DB_MPOOL_DIRTY, &ppg)) != 0)
+ goto err1;
+ if (F_ISSET(dbc->dbp, DB_AM_READ_UNCOMMITTED) &&
+ (ret = __db_lget(dbc, 0, pgno,
+ DB_LOCK_WRITE, 0, &cp->csp->lock)) != 0)
+ goto err1;
+ if ((ret = __memp_fget(dbmp, &pgno, dbc->thread_info,
+ dbc->txn, DB_MPOOL_DIRTY, &pg)) != 0)
+ goto err1;
+ epg->page = ppg;
+ cp->csp->page = pg;
+ }
+
+ /*
+ * If our current position is the last one on a parent
+ * page, then we are about to merge across different
+ * internal nodes. Thus, we need to lock higher up
+ * in the tree. We will exit the routine and commit
+ * what we have done so far. Set spanp so we know
+ * we are in this case when we come back.
+ */
+ if (epg->indx == NUM_ENT(ppg) - 1) {
+ *spanp = 1;
+ do_commit = 1;
+ npgno = PGNO(pg);
+ next_recno = cp->recno;
+ epg->page = ppg;
+ goto next_page;
+ }
+
+ /* Lock and get the next page. */
+ TRY_LOCK(dbc, npgno,
+ saved_pgno, saved_lock, DB_LOCK_WRITE, retry);
+ if (ret != 0)
+ goto err1;
+ if ((ret = __LPUT(dbc, ncp->lock)) != 0)
+ goto err1;
+ ncp->lock = saved_lock;
+ LOCK_INIT(saved_lock);
+ saved_pgno = PGNO_INVALID;
+
+ if ((ret = __memp_fget(dbmp, &npgno,
+ dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &npg)) != 0)
+ goto err1;
+
+ if (check_trunc &&
+ PGNO(pg) > c_data->compact_truncate) {
+ if (PREV_PGNO(pg) != PGNO_INVALID) {
+ TRY_LOCK(dbc, PREV_PGNO(pg),
+ prev_pgno, prev_lock, DB_LOCK_WRITE, retry);
+ if (ret != 0)
+ goto err1;
+ }
+ pgno = PGNO(pg);
+ /* Get a fresh low numbered page. */
+ if ((ret = __db_exchange_page(dbc, &cp->csp->page,
+ npg, PGNO_INVALID, DB_EXCH_DEFAULT)) != 0)
+ goto err1;
+ if ((ret = __TLPUT(dbc, prev_lock)) != 0)
+ goto err1;
+ LOCK_INIT(prev_lock);
+ prev_pgno = PGNO_INVALID;
+ pg = cp->csp->page;
+ if (pgno != PGNO(pg)) {
+ pgs_done++;
+ pgno = PGNO(pg);
+ }
+ }
+ c_data->compact_pages_examine++;
+
+ PTRACE(dbc, "MDups", PGNO(npg), start, 0);
+ if (check_dups && (ret = __bam_compact_dups(ndbc,
+ &npg, factor, 1, c_data, &pgs_done)) != 0)
+ goto err1;
+
+ npgno = NEXT_PGNO(npg);
+ if (npgno != PGNO_INVALID) {
+ TRY_LOCK(dbc, npgno,
+ nnext_pgno, nnext_lock, DB_LOCK_WRITE, retry);
+ if (ret != 0)
+ goto err1;
+ }
+
+		/* Copy the common parent to the stack. */
+ BT_STK_PUSH(env, ncp, ppg,
+ epg->indx + 1, epg->lock, epg->lock_mode, ret);
+ if (ret != 0)
+ goto err1;
+
+ /* Put the page on the stack. */
+ BT_STK_ENTER(env, ncp, npg, 0, ncp->lock, DB_LOCK_WRITE, ret);
+
+ LOCK_INIT(ncp->lock);
+ npg = NULL;
+
+ /*
+ * Merge the pages. This will either free the next
+ * page or just update its parent pointer.
+ */
+ PTRACE(dbc, "Merge", PGNO(cp->csp->page), start, 0);
+ if ((ret = __bam_merge(dbc,
+ ndbc, factor, stop, c_data, &isdone)) != 0)
+ goto err1;
+
+ pgs_done++;
+
+ if ((ret = __TLPUT(dbc, nnext_lock)) != 0)
+ goto err1;
+ LOCK_INIT(nnext_lock);
+ nnext_pgno = PGNO_INVALID;
+
+ /*
+ * __bam_merge could have freed our stack if it
+		 * deleted a page, possibly collapsing the tree.
+ */
+ if (cp->csp->page == NULL)
+ goto deleted;
+ /* If we did not bump to the next page something did not fit. */
+ if (npgno != NEXT_PGNO(pg))
+ break;
+ }
+
+ /* Bottom of the main loop. Move to the next page. */
+ npgno = NEXT_PGNO(pg);
+ cp->recno += NUM_ENT(pg);
+ next_recno = cp->recno;
+
+next_page:
+ if (ndbc != NULL) {
+ ncp = (BTREE_CURSOR *)ndbc->internal;
+ if (ncp->sp->page == cp->sp->page) {
+ ncp->sp->page = NULL;
+ LOCK_INIT(ncp->sp->lock);
+ }
+ if ((ret = __bam_stkrel(ndbc,
+ pgs_done == 0 ? STK_NOLOCK : 0)) != 0)
+ goto err;
+ }
+ /*
+ * Unlatch the tree before trying to lock the next page. We must
+ * unlatch to avoid a latch deadlock but we want to hold the
+ * lock on the parent node so this leaf cannot be unlinked.
+ */
+ pg = NULL;
+ if ((ret = __bam_stkrel(dbc, STK_PGONLY)) != 0)
+ goto err;
+ if (npgno != PGNO_INVALID &&
+ (ret = __db_lget(dbc, 0, npgno, DB_LOCK_READ, 0, &next_lock)) != 0)
+ goto err;
+ if ((ret = __bam_stkrel(dbc, pgs_done == 0 ? STK_NOLOCK : 0)) != 0)
+ goto err;
+ if ((ret = __TLPUT(dbc, saved_lock)) != 0)
+ goto err;
+ if ((ret = __TLPUT(dbc, prev_lock)) != 0)
+ goto err;
+
+next_no_release:
+ pg = NULL;
+
+ if (npgno == PGNO_INVALID || c_data->compact_pages == 0)
+ isdone = 1;
+ if (!isdone) {
+ /*
+ * If we are at the end of this parent commit the
+ * transaction so we don't tie things up.
+ */
+ if (do_commit && !F_ISSET(dbc, DBC_OPD) &&
+ (atomic_read(&dbp->mpf->mfp->multiversion) != 0 ||
+ pgs_done != 0)) {
+deleted: if (ndbc != NULL &&
+ ((ret = __bam_stkrel(ndbc, 0)) != 0 ||
+ (ret = __dbc_close(ndbc)) != 0))
+ goto err;
+ goto out;
+ }
+
+ /* Reget the next page to look at. */
+ cp->recno = next_recno;
+ if ((ret = __memp_fget(dbmp, &npgno,
+ dbc->thread_info, dbc->txn, 0, &pg)) != 0)
+ goto err;
+ cp->csp->lock = next_lock;
+ LOCK_INIT(next_lock);
+ next_p = 1;
+ do_commit = 0;
+ /* If we did not do anything we can drop the metalock. */
+ if (pgs_done == 0 && (ret = __LPUT(dbc, metalock)) != 0)
+ goto err;
+ goto next;
+ }
+
+done:
+ if (0) {
+ /*
+ * We come here if pg came from cp->csp->page and could
+ * have already been fput.
+ */
+err1: pg = NULL;
+ }
+err: /*
+	 * Don't release locks (STK_PGONLY) if we had an error; we could reveal
+ * a bad tree to a dirty reader. Wait till the abort to free the locks.
+ */
+ sflag = STK_CLRDBC;
+ if (dbc->txn != NULL && ret != 0)
+ sflag |= STK_PGONLY;
+ if (ndbc != NULL) {
+ ncp = (BTREE_CURSOR *)ndbc->internal;
+ if (npg == ncp->csp->page)
+ npg = NULL;
+ if (ncp->sp->page == cp->sp->page) {
+ ncp->sp->page = NULL;
+ LOCK_INIT(ncp->sp->lock);
+ }
+ if ((t_ret = __bam_stkrel(ndbc, sflag)) != 0 && ret == 0)
+ ret = t_ret;
+ else if ((t_ret = __dbc_close(ndbc)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ if (pg == cp->csp->page)
+ pg = NULL;
+ if ((t_ret = __bam_stkrel(dbc, sflag)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if ((t_ret = __TLPUT(dbc, metalock)) != 0 && ret == 0)
+ ret = t_ret;
+
+	if (pg != NULL && (t_ret =
+	    __memp_fput(dbmp,
+	    dbc->thread_info, pg, dbc->priority)) != 0 && ret == 0)
+		ret = t_ret;
+	if (npg != NULL && (t_ret =
+	    __memp_fput(dbmp,
+	    dbc->thread_info, npg, dbc->priority)) != 0 && ret == 0)
+		ret = t_ret;
+
+out: *donep = isdone;
+
+ /* For OPD trees return if we did anything in the span variable. */
+ if (F_ISSET(dbc, DBC_OPD))
+ *spanp = pgs_done;
+
+ return (ret);
+}
+
+/*
+ * __bam_merge -- do actual merging of leaf pages.
+ */
+static int
+__bam_merge(dbc, ndbc, factor, stop, c_data, donep)
+ DBC *dbc, *ndbc;
+ u_int32_t factor;
+ DBT *stop;
+ DB_COMPACT *c_data;
+ int *donep;
+{
+ BTREE_CURSOR *cp, *ncp;
+ DB *dbp;
+ PAGE *pg, *npg;
+ db_indx_t nent;
+ int ret;
+
+ DB_ASSERT(NULL, dbc != NULL);
+ DB_ASSERT(NULL, ndbc != NULL);
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ ncp = (BTREE_CURSOR *)ndbc->internal;
+ pg = cp->csp->page;
+ npg = ncp->csp->page;
+
+ nent = NUM_ENT(npg);
+
+ /* If the page is empty just throw it away. */
+ if (nent == 0)
+ goto free_page;
+
+ /* Find if the stopping point is on this page. */
+ if (stop != NULL && stop->size != 0) {
+ if ((ret = __bam_compact_isdone(dbc, stop, npg, donep)) != 0)
+ return (ret);
+ if (*donep)
+ return (0);
+ }
+
+ /*
+ * If there is too much data then just move records one at a time.
+ * Otherwise copy the data space over and fix up the index table.
+	 * If we are on the leftmost child we will affect our parent's
+ * index entry so we call merge_records to figure out key sizes.
+ */
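+	/*
+	 * (Illustrative arithmetic, not taken from the code: if pg has 900
+	 * bytes free and npg's items occupy 700 bytes, a whole-page copy
+	 * leaves 900 - 700 = 200 bytes free on pg, so any factor > 200
+	 * makes the test below choose __bam_merge_records instead.)
+	 */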
+ if ((dbc->dbtype == DB_BTREE &&
+ ncp->csp[-1].indx == 0 && ncp->csp[-1].entries != 1) ||
+ (int)(P_FREESPACE(dbp, pg) -
+ ((dbp->pgsize - P_OVERHEAD(dbp)) -
+ P_FREESPACE(dbp, npg))) < (int)factor)
+ ret = __bam_merge_records(dbc, ndbc, factor, c_data);
+ else
+ /*lint -e{794} */
+free_page: ret = __bam_merge_pages(dbc, ndbc, c_data);
+
+ return (ret);
+}
+
+static int
+__bam_merge_records(dbc, ndbc, factor, c_data)
+ DBC *dbc, *ndbc;
+ u_int32_t factor;
+ DB_COMPACT *c_data;
+{
+ BINTERNAL *bi;
+ BKEYDATA *bk, *tmp_bk;
+ BTREE *t;
+ BTREE_CURSOR *cp, *ncp;
+ DB *dbp;
+ DBT a, b, data, hdr;
+ ENV *env;
+ EPG *epg;
+ PAGE *pg, *npg;
+ db_indx_t adj, indx, nent, *ninp, pind;
+ int32_t adjust;
+ u_int32_t freespace, len, nksize, pfree, size;
+ int first_dup, is_dup, next_dup, n_ok, ret;
+ size_t (*func) __P((DB *, const DBT *, const DBT *));
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ t = dbp->bt_internal;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ ncp = (BTREE_CURSOR *)ndbc->internal;
+ pg = cp->csp->page;
+ memset(&hdr, 0, sizeof(hdr));
+ pind = NUM_ENT(pg);
+ n_ok = 0;
+ adjust = 0;
+ ret = 0;
+
+ /* See if we want to swap out this page. */
+ if (c_data->compact_truncate != PGNO_INVALID &&
+ PGNO(ncp->csp->page) > c_data->compact_truncate) {
+ /* Get a fresh low numbered page. */
+ if ((ret = __db_exchange_page(ndbc,
+ &ncp->csp->page, pg, PGNO_INVALID, DB_EXCH_DEFAULT)) != 0)
+ goto err;
+ }
+
+ npg = ncp->csp->page;
+ nent = NUM_ENT(npg);
+
+ DB_ASSERT(env, nent != 0);
+
+ ninp = P_INP(dbp, npg);
+
+ /*
+ * pg is the page that is being filled, it is in the stack in cp.
+ * npg is the next page, it is in the stack in ncp.
+ */
+ freespace = P_FREESPACE(dbp, pg);
+
+ adj = TYPE(npg) == P_LBTREE ? P_INDX : O_INDX;
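+	/*
+	 * (adj is the index-table stride of one logical record: P_INDX (2)
+	 * on P_LBTREE leaves, where entries are key/data pairs, and
+	 * O_INDX (1) on duplicate pages holding single items.)
+	 */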
+ /*
+ * Loop through the records and find the stopping point.
+ */
+ for (indx = 0; indx < nent; indx += adj) {
+ bk = GET_BKEYDATA(dbp, npg, indx);
+
+ /* Size of the key. */
+ size = BITEM_PSIZE(bk);
+
+ /* Size of the data. */
+ if (TYPE(pg) == P_LBTREE)
+ size += BITEM_PSIZE(GET_BKEYDATA(dbp, npg, indx + 1));
+ /*
+		 * If we are at a duplicate set, skip ahead and
+ * get the total size for the group.
+ */
+ n_ok = adj;
+ if (TYPE(pg) == P_LBTREE &&
+ indx < nent - adj &&
+ ninp[indx] == ninp[indx + adj]) {
+ do {
+ /* Size of index for key reference. */
+ size += sizeof(db_indx_t);
+ n_ok++;
+ /* Size of data item. */
+ size += BITEM_PSIZE(
+ GET_BKEYDATA(dbp, npg, indx + n_ok));
+ n_ok++;
+ } while (indx + n_ok < nent &&
+ ninp[indx] == ninp[indx + n_ok]);
+ }
+		/* If the next set will not fit on the page, we are done. */
+ if (freespace < size)
+ break;
+
+ /*
+ * Otherwise figure out if we are past the goal and if
+ * adding this set will put us closer to the goal than
+ * we are now.
+ */
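+		/*
+		 * (That is: freespace - factor is how far from the goal we
+		 * are if we stop now; factor - (freespace - size) is how far
+		 * from it adding the set would leave us. The code takes
+		 * whichever lands closer to the target free space, factor.)
+		 */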
+ if ((freespace - size) < factor) {
+ if (freespace - factor > factor - (freespace - size))
+ indx += n_ok;
+ break;
+ }
+ freespace -= size;
+ indx += n_ok - adj;
+ }
+
+ /* If we have hit the first record then there is nothing we can move. */
+ if (indx == 0)
+ goto done;
+ if (TYPE(pg) != P_LBTREE && TYPE(pg) != P_LDUP) {
+ if (indx == nent)
+ return (__bam_merge_pages(dbc, ndbc, c_data));
+ goto no_check;
+ }
+ /*
+ * We need to update npg's parent key. Avoid creating a new key
+ * that will be too big. Get what space will be available on the
+ * parents. Then if there will not be room for this key, see if
+	 * prefix compression will make it work; if not, back up till we
+ * find something that will. (Needless to say, this is a very
+ * unlikely event.) If we are deleting this page then we will
+ * need to propagate the next key to our grand parents, so we
+ * see if that will fit.
+ */
+ pfree = dbp->pgsize;
+ for (epg = &ncp->csp[-1]; epg >= ncp->sp; epg--)
+ if ((freespace = P_FREESPACE(dbp, epg->page)) < pfree) {
+ bi = GET_BINTERNAL(dbp, epg->page, epg->indx);
+ /* Add back in the key we will be deleting. */
+ freespace += BINTERNAL_PSIZE(bi->len);
+ if (freespace < pfree)
+ pfree = freespace;
+ if (epg->indx != 0)
+ break;
+ }
+
+ /*
+ * If we are at the end, we will delete this page. We need to
+ * check the next parent key only if we are the leftmost page and
+ * will therefore have to propagate the key up the tree.
+ */
+ if (indx == nent) {
+ if (ncp->csp[-1].indx != 0 || ncp->csp[-1].entries == 1 ||
+ BINTERNAL_PSIZE(GET_BINTERNAL(dbp,
+ ncp->csp[-1].page, 1)->len) <= pfree)
+ return (__bam_merge_pages(dbc, ndbc, c_data));
+ indx -= adj;
+ }
+ bk = GET_BKEYDATA(dbp, npg, indx);
+ len = (B_TYPE(bk->type) != B_KEYDATA) ? BOVERFLOW_SIZE : bk->len;
+ if (indx != 0 && BINTERNAL_SIZE(len) >= pfree) {
+ if (F_ISSET(dbc, DBC_OPD)) {
+ if (dbp->dup_compare == __bam_defcmp)
+ func = __bam_defpfx;
+ else
+ func = NULL;
+ } else
+ func = t->bt_prefix;
+ } else
+ func = NULL;
+
+ /* Skip to the beginning of a duplicate set. */
+ while (indx != 0 && ninp[indx] == ninp[indx - adj])
+ indx -= adj;
+
+ while (indx != 0 && BINTERNAL_SIZE(len) >= pfree) {
+ if (B_TYPE(bk->type) != B_KEYDATA)
+ goto noprefix;
+ /*
+ * Figure out if we can truncate this key.
+ * Code borrowed from bt_split.c
+ */
+ if (func == NULL)
+ goto noprefix;
+ tmp_bk = GET_BKEYDATA(dbp, npg, indx - adj);
+ if (B_TYPE(tmp_bk->type) != B_KEYDATA)
+ goto noprefix;
+ memset(&a, 0, sizeof(a));
+ a.size = tmp_bk->len;
+ a.data = tmp_bk->data;
+ memset(&b, 0, sizeof(b));
+ b.size = bk->len;
+ b.data = bk->data;
+ nksize = (u_int32_t)func(dbp, &a, &b);
+ if (BINTERNAL_PSIZE(nksize) < pfree)
+ break;
+noprefix:
+ /* Skip to the beginning of a duplicate set. */
+ do {
+ indx -= adj;
+ } while (indx != 0 && ninp[indx] == ninp[indx - adj]);
+
+ bk = GET_BKEYDATA(dbp, npg, indx);
+ len =
+ (B_TYPE(bk->type) != B_KEYDATA) ? BOVERFLOW_SIZE : bk->len;
+ }
+
+ /*
+ * indx references the first record that will not move to the previous
+ * page. If it is 0 then we could not find a key that would fit in
+ * the parent that would permit us to move any records.
+ */
+ if (indx == 0)
+ goto done;
+ DB_ASSERT(env, indx <= nent);
+
+ /* Loop through the records and move them from npg to pg. */
+no_check: is_dup = first_dup = next_dup = 0;
+ pg = cp->csp->page;
+ npg = ncp->csp->page;
+ DB_ASSERT(env, IS_DIRTY(pg));
+ DB_ASSERT(env, IS_DIRTY(npg));
+ ninp = P_INP(dbp, npg);
+ do {
+ bk = GET_BKEYDATA(dbp, npg, 0);
+ /* Figure out if we are in a duplicate group or not. */
+ if ((NUM_ENT(npg) % 2) == 0) {
+ if (NUM_ENT(npg) > 2 && ninp[0] == ninp[2]) {
+ if (!is_dup) {
+ first_dup = 1;
+ is_dup = 1;
+ } else
+ first_dup = 0;
+
+ next_dup = 1;
+ } else if (next_dup) {
+ is_dup = 1;
+ first_dup = 0;
+ next_dup = 0;
+ } else
+ is_dup = 0;
+ }
+
+ if (is_dup && !first_dup && (pind % 2) == 0) {
+ /* Duplicate key. */
+ if ((ret = __bam_adjindx(dbc,
+ pg, pind, pind - P_INDX, 1)) != 0)
+ goto err;
+ if (!next_dup)
+ is_dup = 0;
+ } else switch (B_TYPE(bk->type)) {
+ case B_KEYDATA:
+ hdr.data = bk;
+ hdr.size = SSZA(BKEYDATA, data);
+ data.size = bk->len;
+ data.data = bk->data;
+ if ((ret = __db_pitem(dbc, pg, pind,
+ BKEYDATA_SIZE(bk->len), &hdr, &data)) != 0)
+ goto err;
+ break;
+ case B_OVERFLOW:
+ case B_DUPLICATE:
+ data.size = BOVERFLOW_SIZE;
+ data.data = bk;
+ if ((ret = __db_pitem(dbc, pg, pind,
+ BOVERFLOW_SIZE, &data, NULL)) != 0)
+ goto err;
+ break;
+ default:
+ __db_errx(env, DB_STR_A("1022",
+ "Unknown record format, page %lu, indx 0",
+ "%lu"), (u_long)PGNO(pg));
+ ret = EINVAL;
+ goto err;
+ }
+ pind++;
+ if (next_dup && (NUM_ENT(npg) % 2) == 0) {
+ if ((ret = __bam_adjindx(ndbc,
+ npg, 0, O_INDX, 0)) != 0)
+ goto err;
+ } else {
+ if ((ret = __db_ditem(ndbc,
+ npg, 0, BITEM_SIZE(bk))) != 0)
+ goto err;
+ }
+ adjust++;
+ } while (--indx != 0);
+
+ DB_ASSERT(env, NUM_ENT(npg) != 0);
+
+ if (adjust != 0 &&
+ (F_ISSET(cp, C_RECNUM) || F_ISSET(dbc, DBC_OPD))) {
+ if (TYPE(pg) == P_LBTREE)
+ adjust /= P_INDX;
+ if ((ret = __bam_adjust(ndbc, -adjust)) != 0)
+ goto err;
+
+ if ((ret = __bam_adjust(dbc, adjust)) != 0)
+ goto err;
+ }
+
+ /* Update parent with new key. */
+ if (ndbc->dbtype == DB_BTREE &&
+ (ret = __bam_pupdate(ndbc, pg)) != 0)
+ goto err;
+
+done: if (cp->sp->page == ncp->sp->page) {
+ cp->sp->page = NULL;
+ LOCK_INIT(cp->sp->lock);
+ }
+ ret = __bam_stkrel(ndbc, STK_CLRDBC);
+
+err: return (ret);
+}
+
+static int
+__bam_merge_pages(dbc, ndbc, c_data)
+ DBC *dbc, *ndbc;
+ DB_COMPACT *c_data;
+{
+ BTREE_CURSOR *cp, *ncp;
+ DB *dbp;
+ DBT data, hdr;
+ DB_LOCK root_lock;
+ DB_MPOOLFILE *dbmp;
+ PAGE *pg, *npg;
+ db_indx_t nent, *ninp, *pinp;
+ db_pgno_t pgno, ppgno;
+ u_int8_t *bp;
+ u_int32_t len;
+ int i, level, ret;
+
+ LOCK_INIT(root_lock);
+ COMPQUIET(ppgno, PGNO_INVALID);
+ dbp = dbc->dbp;
+ dbmp = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ ncp = (BTREE_CURSOR *)ndbc->internal;
+ pg = cp->csp->page;
+ npg = ncp->csp->page;
+ memset(&hdr, 0, sizeof(hdr));
+ nent = NUM_ENT(npg);
+
+ /* If the page is empty just throw it away. */
+ if (nent == 0)
+ goto free_page;
+
+ pg = cp->csp->page;
+ npg = ncp->csp->page;
+ DB_ASSERT(dbp->env, IS_DIRTY(pg));
+ DB_ASSERT(dbp->env, IS_DIRTY(npg));
+ DB_ASSERT(dbp->env, nent == NUM_ENT(npg));
+
+ /* Bulk copy the data to the new page. */
+ len = dbp->pgsize - HOFFSET(npg);
+ if (DBC_LOGGING(dbc)) {
+ memset(&hdr, 0, sizeof(hdr));
+ hdr.data = npg;
+ hdr.size = LOFFSET(dbp, npg);
+ memset(&data, 0, sizeof(data));
+ data.data = (u_int8_t *)npg + HOFFSET(npg);
+ data.size = len;
+ if ((ret = __db_merge_log(dbp,
+ dbc->txn, &LSN(pg), 0, PGNO(pg),
+ &LSN(pg), PGNO(npg), &LSN(npg), &hdr, &data, 0)) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(pg));
+ LSN(npg) = LSN(pg);
+ bp = (u_int8_t *)pg + HOFFSET(pg) - len;
+ memcpy(bp, (u_int8_t *)npg + HOFFSET(npg), len);
+
+	/* Copy the index table, shifting offsets by the data already on pg. */
+ pinp = P_INP(dbp, pg) + NUM_ENT(pg);
+ ninp = P_INP(dbp, npg);
+ for (i = 0; i < NUM_ENT(npg); i++)
+ *pinp++ = *ninp++ - (dbp->pgsize - HOFFSET(pg));
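+	/*
+	 * (Each offset o from npg becomes o - (pgsize - HOFFSET(pg)): the
+	 * bulk copy above placed npg's data region, which began at
+	 * HOFFSET(npg) == pgsize - len, at HOFFSET(pg) - len on pg.)
+	 */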
+ HOFFSET(pg) -= len;
+ NUM_ENT(pg) += i;
+
+ NUM_ENT(npg) = 0;
+ HOFFSET(npg) += len;
+
+ if (F_ISSET(cp, C_RECNUM) || F_ISSET(dbc, DBC_OPD)) {
+ /*
+ * There are two cases here regarding the stack.
+		 * Either we have two two-level stacks but only ndbc
+		 * references the parent page, or we have a multilevel
+ * stack and only ndbc has an entry for the spanning
+ * page.
+ */
+ if (TYPE(pg) == P_LBTREE)
+ i /= P_INDX;
+ if ((ret = __bam_adjust(ndbc, -i)) != 0)
+ goto err;
+
+ if ((ret = __bam_adjust(dbc, i)) != 0)
+ goto err;
+ }
+
+free_page:
+ /*
+ * __bam_dpages may decide to collapse the tree.
+ * This can happen if we have the root and there
+ * are exactly 2 pointers left in it.
+ * If it can collapse the tree we must free the other
+	 * stack since it will no longer be valid. This
+	 * must be done beforehand because we cannot
+ * hold a page pinned if it might be truncated.
+ */
+ if ((ret = __db_relink(dbc,
+ ncp->csp->page, cp->csp->page, PGNO_INVALID)) != 0)
+ goto err;
+ /* Drop the duplicate reference to the sub tree root. */
+ cp->sp->page = NULL;
+ LOCK_INIT(cp->sp->lock);
+ if (PGNO(ncp->sp->page) == BAM_ROOT_PGNO(ndbc) &&
+ NUM_ENT(ncp->sp->page) == 2) {
+ if ((ret = __bam_stkrel(dbc, STK_CLRDBC | STK_PGONLY)) != 0)
+ goto err;
+ level = LEVEL(ncp->sp->page);
+ ppgno = PGNO(ncp->csp[-1].page);
+ } else
+ level = 0;
+ COMPACT_TRUNCATE(c_data);
+ if ((ret = __bam_dpages(ndbc,
+ 0, ndbc->dbtype == DB_RECNO ? 0 : BTD_UPDATE)) != 0)
+ goto err;
+ npg = NULL;
+ c_data->compact_pages_free++;
+ c_data->compact_pages--;
+ if (level != 0) {
+ pgno = PGNO_INVALID;
+ BAM_GET_ROOT(ndbc, pgno, npg, 0, DB_LOCK_READ, root_lock, ret);
+ if (ret != 0)
+ goto err;
+ DB_ASSERT(dbp->env, npg != NULL);
+ if (level == LEVEL(npg))
+ level = 0;
+ if ((ret = __memp_fput(dbmp,
+ dbc->thread_info, npg, dbc->priority)) != 0)
+ goto err;
+ if ((ret = __LPUT(ndbc, root_lock)) != 0)
+ goto err;
+ npg = NULL;
+ if (level != 0) {
+ c_data->compact_levels++;
+ c_data->compact_pages_free++;
+ COMPACT_TRUNCATE(c_data);
+ if (c_data->compact_pages != 0)
+ c_data->compact_pages--;
+ }
+ }
+
+err: return (ret);
+}
+
+/*
+ * __bam_merge_internal --
+ * Merge internal nodes of the tree.
+ */
+static int
+__bam_merge_internal(dbc, ndbc, level, c_data, merged)
+ DBC *dbc, *ndbc;
+ int level;
+ DB_COMPACT *c_data;
+ int *merged;
+{
+ BINTERNAL bi, *bip, *fip;
+ BTREE_CURSOR *cp, *ncp;
+ DB *dbp;
+ DBT data, hdr;
+ DB_LOCK root_lock;
+ DB_MPOOLFILE *dbmp;
+ EPG *epg, *save_csp, *nsave_csp;
+ PAGE *pg, *npg;
+ RINTERNAL *rk;
+ db_indx_t first, indx, pind;
+ db_pgno_t pgno, ppgno;
+ int32_t nrecs, trecs;
+ u_int16_t size;
+ u_int32_t freespace, pfree;
+ int ret;
+
+ COMPQUIET(bip, NULL);
+ COMPQUIET(ppgno, PGNO_INVALID);
+ DB_ASSERT(NULL, dbc != NULL);
+ DB_ASSERT(NULL, ndbc != NULL);
+ LOCK_INIT(root_lock);
+
+ /*
+	 * ndbc will contain the dominating parent of the subtree.
+ * dbc will have the tree containing the left child.
+ *
+ * The stacks descend to the leaf level.
+ * If this is a recno tree then both stacks will start at the root.
+ */
+ dbp = dbc->dbp;
+ dbmp = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ ncp = (BTREE_CURSOR *)ndbc->internal;
+ *merged = 0;
+ ret = 0;
+
+ /*
+ * Set the stacks to the level requested.
+ * Save the old value to restore when we exit.
+ */
+ save_csp = cp->csp;
+ cp->csp = &cp->csp[-level + 1];
+ pg = cp->csp->page;
+ pind = NUM_ENT(pg);
+
+ nsave_csp = ncp->csp;
+ ncp->csp = &ncp->csp[-level + 1];
+ npg = ncp->csp->page;
+ indx = NUM_ENT(npg);
+
+ /*
+	 * The caller may have two stacks that include common ancestors;
+	 * we check here for convenience.
+ */
+ if (npg == pg)
+ goto done;
+
+ if (TYPE(pg) == P_IBTREE) {
+ /*
+ * Check for overflow keys on both pages while we have
+ * them locked.
+ */
+ if ((ret =
+ __bam_truncate_internal_overflow(dbc, pg, c_data)) != 0)
+ goto err;
+ if ((ret =
+ __bam_truncate_internal_overflow(dbc, npg, c_data)) != 0)
+ goto err;
+ }
+
+ /*
+	 * If we are about to move data off the leftmost page of an
+	 * internal node we will need to update its parents; make sure
+	 * there will be room for the new key on all the parents in the
+	 * stack.  If not, move less data.
+ */
+ fip = NULL;
+ if (TYPE(pg) == P_IBTREE) {
+ /* See where we run out of space. */
+ freespace = P_FREESPACE(dbp, pg);
+ /*
+ * The leftmost key of an internal page is not accurate.
+ * Go up the tree to find a non-leftmost parent.
+ */
+ epg = ncp->csp;
+ while (--epg >= ncp->sp && epg->indx == 0)
+ continue;
+ fip = bip = GET_BINTERNAL(dbp, epg->page, epg->indx);
+ epg = ncp->csp;
+
+ for (indx = 0;;) {
+ size = BINTERNAL_PSIZE(bip->len);
+ if (size > freespace)
+ break;
+ freespace -= size;
+ if (++indx >= NUM_ENT(npg))
+ break;
+ bip = GET_BINTERNAL(dbp, npg, indx);
+ }
+
+		/* See if we are deleting the page and we are not leftmost. */
+ if (indx == NUM_ENT(npg) && epg[-1].indx != 0)
+ goto fits;
+
+ pfree = dbp->pgsize;
+ for (epg--; epg >= ncp->sp; epg--)
+ if ((freespace = P_FREESPACE(dbp, epg->page)) < pfree) {
+ bip = GET_BINTERNAL(dbp, epg->page, epg->indx);
+ /* Add back in the key we will be deleting. */
+ freespace += BINTERNAL_PSIZE(bip->len);
+ if (freespace < pfree)
+ pfree = freespace;
+ if (epg->indx != 0)
+ break;
+ }
+ epg = ncp->csp;
+
+ /* If we are at the end of the page we will delete it. */
+ if (indx == NUM_ENT(npg)) {
+ if (NUM_ENT(epg[-1].page) == 1)
+ goto fits;
+ bip =
+ GET_BINTERNAL(dbp, epg[-1].page, epg[-1].indx + 1);
+ } else
+ bip = GET_BINTERNAL(dbp, npg, indx);
+
+ /* Back up until we have a key that fits. */
+ while (indx != 0 && BINTERNAL_PSIZE(bip->len) > pfree) {
+ indx--;
+ bip = GET_BINTERNAL(dbp, npg, indx);
+ }
+ if (indx == 0)
+ goto done;
+ }
+
+fits: memset(&bi, 0, sizeof(bi));
+ memset(&hdr, 0, sizeof(hdr));
+ memset(&data, 0, sizeof(data));
+ trecs = 0;
+
+ /*
+ * Copy data between internal nodes till one is full
+ * or the other is empty.
+ */
+ first = 0;
+ nrecs = 0;
+ do {
+ if (dbc->dbtype == DB_BTREE) {
+ bip = GET_BINTERNAL(dbp, npg, 0);
+ size = fip == NULL ?
+ BINTERNAL_SIZE(bip->len) :
+ BINTERNAL_SIZE(fip->len);
+ if (P_FREESPACE(dbp, pg) < size + sizeof(db_indx_t))
+ break;
+
+ if (fip == NULL) {
+ data.size = bip->len;
+ data.data = bip->data;
+ } else {
+ data.size = fip->len;
+ data.data = fip->data;
+ }
+ bi.len = data.size;
+ B_TSET(bi.type, bip->type);
+ bi.pgno = bip->pgno;
+ bi.nrecs = bip->nrecs;
+ hdr.data = &bi;
+ hdr.size = SSZA(BINTERNAL, data);
+ if (F_ISSET(cp, C_RECNUM) || F_ISSET(dbc, DBC_OPD))
+ nrecs = (int32_t)bip->nrecs;
+ } else {
+ rk = GET_RINTERNAL(dbp, npg, 0);
+ size = RINTERNAL_SIZE;
+ if (P_FREESPACE(dbp, pg) < size + sizeof(db_indx_t))
+ break;
+
+ hdr.data = rk;
+ hdr.size = size;
+ nrecs = (int32_t)rk->nrecs;
+ }
+ /*
+ * Try to lock the subtree leaf records without waiting.
+ * We must lock the subtree below the record we are merging
+		 * and the one after it, since that is where a search will
+		 * wind up if it has already looked at our parent.  After the
+		 * first move we have the current subtree already locked.
+ * If we merged any records then we will revisit this
+ * node when we merge its leaves. If not we will return
+ * NOTGRANTED and our caller will do a retry. We only
+ * need to do this if we are in a transaction. If not then
+ * we cannot abort and things will be hosed up on error
+ * anyway.
+ */
+ if (dbc->txn != NULL && (ret = __bam_lock_tree(ndbc,
+ ncp->csp, nsave_csp, first,
+ NUM_ENT(ncp->csp->page) == 1 ? 1 : 2)) != 0) {
+ if (ret != DB_LOCK_NOTGRANTED)
+ goto err;
+ break;
+ }
+ first = 1;
+ if ((ret = __db_pitem(dbc, pg, pind, size, &hdr, &data)) != 0)
+ goto err;
+ pind++;
+ if (fip != NULL) {
+			/* Reset size to that of the record being deleted. */
+ size = BINTERNAL_SIZE(bip->len);
+ fip = NULL;
+ }
+ if ((ret = __db_ditem(ndbc, npg, 0, size)) != 0)
+ goto err;
+ *merged = 1;
+ trecs += nrecs;
+ } while (--indx != 0);
+
+ if (!*merged)
+ goto done;
+
+ if (trecs != 0) {
+ cp->csp--;
+ ret = __bam_adjust(dbc, trecs);
+ if (ret != 0)
+ goto err;
+ cp->csp++;
+ ncp->csp--;
+ if ((ret = __bam_adjust(ndbc, -trecs)) != 0)
+ goto err;
+ ncp->csp++;
+ }
+
+ /*
+ * Either we emptied the page or we need to update its
+ * parent to reflect the first page we now point to.
+	 * First get rid of the bottom of the stack;
+	 * __bam_dpages will clear the rest.  Maintain transactional
+	 * locks on the leaf pages to protect changes at this level.
+ */
+ do {
+ if ((ret = __memp_fput(dbmp, dbc->thread_info,
+ nsave_csp->page, dbc->priority)) != 0)
+ goto err;
+ nsave_csp->page = NULL;
+ if ((ret = __TLPUT(dbc, nsave_csp->lock)) != 0)
+ goto err;
+ LOCK_INIT(nsave_csp->lock);
+ nsave_csp--;
+ } while (nsave_csp != ncp->csp);
+
+ if (NUM_ENT(npg) == 0) {
+ /*
+		 * __bam_dpages may decide to collapse the tree,
+		 * so we need to free our other stack.  The tree
+		 * will change in height and our stack will no
+		 * longer be valid.
+ */
+ cp->csp = save_csp;
+ cp->sp->page = NULL;
+ LOCK_INIT(cp->sp->lock);
+ if (PGNO(ncp->sp->page) == BAM_ROOT_PGNO(ndbc) &&
+ NUM_ENT(ncp->sp->page) == 2) {
+ if ((ret = __bam_stkrel(dbc, STK_CLRDBC)) != 0)
+ goto err;
+ level = LEVEL(ncp->sp->page);
+ ppgno = PGNO(ncp->csp[-1].page);
+ } else
+ level = 0;
+
+ COMPACT_TRUNCATE(c_data);
+ ret = __bam_dpages(ndbc,
+ 0, ndbc->dbtype == DB_RECNO ?
+ BTD_RELINK : BTD_UPDATE | BTD_RELINK);
+ c_data->compact_pages_free++;
+ if (ret == 0 && level != 0) {
+ pgno = PGNO_INVALID;
+ BAM_GET_ROOT(ndbc,
+ pgno, npg, 0, DB_LOCK_READ, root_lock, ret);
+ if (ret != 0)
+ goto err;
+ if (level == LEVEL(npg))
+ level = 0;
+ if ((ret = __LPUT(ndbc, root_lock)) != 0)
+ goto err;
+ if ((ret = __memp_fput(dbmp,
+ dbc->thread_info, npg, dbc->priority)) != 0)
+ goto err;
+ npg = NULL;
+ if (level != 0) {
+ c_data->compact_levels++;
+ c_data->compact_pages_free++;
+ COMPACT_TRUNCATE(c_data);
+ if (c_data->compact_pages != 0)
+ c_data->compact_pages--;
+ }
+ }
+ } else {
+ ret = __bam_pupdate(ndbc, npg);
+
+ if (NUM_ENT(npg) != 0 &&
+ c_data->compact_truncate != PGNO_INVALID &&
+ PGNO(npg) > c_data->compact_truncate &&
+ ncp->csp != ncp->sp) {
+ if ((ret = __db_exchange_page(ndbc, &ncp->csp->page,
+ pg, PGNO_INVALID, DB_EXCH_DEFAULT)) != 0)
+ goto err;
+ }
+ if (c_data->compact_truncate != PGNO_INVALID &&
+ PGNO(pg) > c_data->compact_truncate && cp->csp != cp->sp) {
+ if ((ret = __db_exchange_page(dbc, &cp->csp->page,
+ ncp->csp->page,
+ PGNO_INVALID, DB_EXCH_DEFAULT)) != 0)
+ goto err;
+ }
+ }
+ cp->csp = save_csp;
+
+ return (ret);
+
+done:
+err: cp->csp = save_csp;
+ ncp->csp = nsave_csp;
+
+ return (ret);
+}
+
+/*
+ * __bam_compact_dups -- try to compress off page dup trees.
+ * We may or may not have a write lock on this page.
+ */
+static int
+__bam_compact_dups(dbc, ppg, factor, have_lock, c_data, donep)
+ DBC *dbc;
+ PAGE **ppg;
+ u_int32_t factor;
+ int have_lock;
+ DB_COMPACT *c_data;
+ int *donep;
+{
+ BOVERFLOW *bo;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DB_MPOOLFILE *dbmp;
+ db_indx_t i;
+ db_pgno_t pgno;
+ int ret;
+
+ ret = 0;
+
+ DB_ASSERT(NULL, dbc != NULL);
+ dbp = dbc->dbp;
+ dbmp = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ for (i = 0; i < NUM_ENT(*ppg); i++) {
+ bo = GET_BOVERFLOW(dbp, *ppg, i);
+ if (B_TYPE(bo->type) == B_KEYDATA)
+ continue;
+ c_data->compact_pages_examine++;
+ if (bo->pgno > c_data->compact_truncate) {
+ (*donep)++;
+ if (!have_lock) {
+ /*
+ * The caller should have the page at
+ * least read locked. Drop the buffer
+ * and get the write lock.
+ */
+ pgno = PGNO(*ppg);
+ if ((ret = __memp_fput(dbmp, dbc->thread_info,
+ *ppg, dbc->priority)) != 0)
+ goto err;
+ *ppg = NULL;
+ if ((ret = __db_lget(dbc, 0, pgno,
+ DB_LOCK_WRITE, 0, &cp->csp->lock)) != 0)
+ goto err;
+ have_lock = 1;
+ if ((ret = __memp_fget(dbmp, &pgno,
+ dbc->thread_info,
+ dbc->txn, DB_MPOOL_DIRTY, ppg)) != 0)
+ goto err;
+ }
+ if ((ret = __bam_truncate_root_page(dbc,
+ *ppg, i, c_data)) != 0)
+ goto err;
+ /* Just in case it should move. Could it? */
+ bo = GET_BOVERFLOW(dbp, *ppg, i);
+ }
+
+ if (B_TYPE(bo->type) == B_OVERFLOW) {
+ if ((ret = __db_truncate_overflow(dbc,
+ bo->pgno, have_lock ? NULL : ppg, c_data)) != 0)
+ goto err;
+ (*donep)++;
+ continue;
+ }
+ if ((ret = __bam_compact_opd(dbc, bo->pgno,
+ have_lock ? NULL : ppg, factor, c_data, donep)) != 0)
+ goto err;
+ }
+
+err:
+ return (ret);
+}
+
+/*
+ * __bam_compact_opd -- compact an off page duplicate tree.
+ *
+ * PUBLIC: int __bam_compact_opd __P((DBC *,
+ * PUBLIC: db_pgno_t, PAGE **, u_int32_t, DB_COMPACT *, int *));
+ */
+int
+__bam_compact_opd(dbc, root_pgno, ppg, factor, c_data, donep)
+ DBC *dbc;
+ db_pgno_t root_pgno;
+ PAGE **ppg;
+ u_int32_t factor;
+ DB_COMPACT *c_data;
+ int *donep;
+{
+ BTREE_CURSOR *cp;
+ DBC *opd;
+ DBT start;
+ DB_MPOOLFILE *dbmp;
+ ENV *env;
+ PAGE *dpg;
+ int isdone, level, ret, span, t_ret;
+ db_pgno_t pgno;
+
+ LOCK_CHECK_OFF(dbc->thread_info);
+
+ opd = NULL;
+ env = dbc->dbp->env;
+ dbmp = dbc->dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ /*
+	 * Take a peek at the root.  If it's a leaf then
+	 * there is no tree here; avoid all the trouble.
+ */
+ if ((ret = __memp_fget(dbmp, &root_pgno,
+ dbc->thread_info, dbc->txn, 0, &dpg)) != 0)
+ goto err;
+
+ level = dpg->level;
+ if ((ret = __memp_fput(dbmp,
+ dbc->thread_info, dpg, dbc->priority)) != 0)
+ goto err;
+ if (level == LEAFLEVEL)
+ goto done;
+ if ((ret = __dbc_newopd(dbc, root_pgno, NULL, &opd)) != 0)
+ goto err;
+ if (ppg != NULL) {
+ /*
+ * The caller should have the page at
+ * least read locked. Drop the buffer
+ * and get the write lock.
+ */
+ pgno = PGNO(*ppg);
+ if ((ret = __memp_fput(dbmp, dbc->thread_info,
+ *ppg, dbc->priority)) != 0)
+ goto err;
+ *ppg = NULL;
+ if ((ret = __db_lget(dbc, 0, pgno,
+ DB_LOCK_WRITE, 0, &cp->csp->lock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(dbmp, &pgno,
+ dbc->thread_info,
+ dbc->txn, DB_MPOOL_DIRTY, ppg)) != 0)
+ goto err;
+ }
+ memset(&start, 0, sizeof(start));
+ do {
+ span = 0;
+ if ((ret = __bam_compact_int(opd, &start,
+ NULL, factor, &span, c_data, &isdone)) != 0)
+ break;
+ /* For OPD the number of pages dirtied is returned in span. */
+ *donep += span;
+ } while (!isdone);
+
+ if (start.data != NULL)
+ __os_free(env, start.data);
+
+err: if (opd != NULL && (t_ret = __dbc_close(opd)) != 0 && ret == 0)
+ ret = t_ret;
+done:
+ LOCK_CHECK_ON(dbc->thread_info);
+
+ return (ret);
+}
+
+/*
+ * __bam_truncate_root_page -- swap a page which is
+ * the root of an off page dup tree or the head of an overflow.
+ * The page is referenced by the pg/indx passed in.
+ */
+static int
+__bam_truncate_root_page(dbc, pg, indx, c_data)
+ DBC *dbc;
+ PAGE *pg;
+ u_int32_t indx;
+ DB_COMPACT *c_data;
+{
+ BINTERNAL *bi;
+ BOVERFLOW *bo;
+ DB *dbp;
+ db_pgno_t *pgnop;
+ u_int32_t tlen;
+
+ COMPQUIET(c_data, NULL);
+ COMPQUIET(bo, NULL);
+ dbp = dbc->dbp;
+ if (TYPE(pg) == P_IBTREE) {
+ bi = GET_BINTERNAL(dbp, pg, indx);
+ if (B_TYPE(bi->type) == B_OVERFLOW) {
+ bo = (BOVERFLOW *)(bi->data);
+ pgnop = &bo->pgno;
+ tlen = bo->tlen;
+ } else {
+ /* Tlen is not used if this is not an overflow. */
+ tlen = 0;
+ pgnop = &bi->pgno;
+ }
+ } else {
+ bo = GET_BOVERFLOW(dbp, pg, indx);
+ pgnop = &bo->pgno;
+ tlen = bo->tlen;
+ }
+
+ DB_ASSERT(dbp->env, IS_DIRTY(pg));
+
+ return (__db_truncate_root(dbc, pg, indx, pgnop, tlen));
+}
+
+/*
+ * __bam_truncate_internal_overflow -- find overflow keys
+ *	on internal pages and, if they have high page
+ * numbers, swap them with lower pages and truncate them.
+ * Note that if there are overflow keys in the internal
+ * nodes they will get copied adding pages to the database.
+ */
+static int
+__bam_truncate_internal_overflow(dbc, page, c_data)
+ DBC *dbc;
+ PAGE *page;
+ DB_COMPACT *c_data;
+{
+ BINTERNAL *bi;
+ BOVERFLOW *bo;
+ db_indx_t indx;
+ int ret;
+
+ COMPQUIET(bo, NULL);
+ ret = 0;
+ for (indx = 0; indx < NUM_ENT(page); indx++) {
+ bi = GET_BINTERNAL(dbc->dbp, page, indx);
+ if (B_TYPE(bi->type) != B_OVERFLOW)
+ continue;
+ bo = (BOVERFLOW *)(bi->data);
+ if (bo->pgno > c_data->compact_truncate && (ret =
+ __bam_truncate_root_page(dbc, page, indx, c_data)) != 0)
+ break;
+ if ((ret = __db_truncate_overflow(
+ dbc, bo->pgno, NULL, c_data)) != 0)
+ break;
+ }
+ return (ret);
+}
+
+/*
+ * __bam_compact_isdone --
+ *
+ * Check to see if the stop key specified by the caller is on the
+ * current page, in which case we are done compacting.
+ */
+static int
+__bam_compact_isdone(dbc, stop, pg, isdone)
+ DBC *dbc;
+ DBT *stop;
+ PAGE *pg;
+ int *isdone;
+{
+ db_recno_t recno;
+ BTREE *t;
+ BTREE_CURSOR *cp;
+ int cmp, ret;
+
+ *isdone = 0;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ t = dbc->dbp->bt_internal;
+
+ if (dbc->dbtype == DB_RECNO) {
+ if ((ret = __ram_getno(dbc, stop, &recno, 0)) != 0)
+ return (ret);
+ *isdone = cp->recno > recno;
+ } else {
+ DB_ASSERT(dbc->dbp->env, TYPE(pg) == P_LBTREE);
+ if ((ret = __bam_cmp(dbc, stop, pg, 0,
+ t->bt_compare, &cmp)) != 0)
+ return (ret);
+
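+		/*
+		 * The stop key sorts at or before the first key on this
+		 * page, so compaction has reached it.
+		 */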
+ *isdone = cmp <= 0;
+ }
+ return (0);
+}
+
+/*
+ * Lock the subtrees from the top of the stack.
+ * The 0'th child may be in the stack and locked; otherwise iterate
+ * through the records by calling __bam_lock_subtree.
+ */
+static int
+__bam_lock_tree(dbc, sp, csp, start, stop)
+ DBC *dbc;
+ EPG *sp, *csp;
+ u_int32_t start, stop;
+{
+ PAGE *cpage;
+ db_pgno_t pgno;
+ int ret;
+
+ if (dbc->dbtype == DB_RECNO)
+ pgno = GET_RINTERNAL(dbc->dbp, sp->page, 0)->pgno;
+ else
+ pgno = GET_BINTERNAL(dbc->dbp, sp->page, 0)->pgno;
+ cpage = (sp + 1)->page;
+ /*
+	 * First recurse down the leftmost subtree if it is in the cursor
+	 * stack.  We already have these pages latched and locked if it's a
+	 * leaf.
+ */
+ if (start == 0 && sp + 1 != csp && pgno == PGNO(cpage) &&
+ (ret = __bam_lock_tree(dbc, sp + 1, csp, 0, NUM_ENT(cpage))) != 0)
+ return (ret);
+
+ /*
+ * Then recurse on the other records on the page if needed.
+	 * If the page is in the stack then it's already locked or
+ * was processed above.
+ */
+ if (start == 0 && pgno == PGNO(cpage))
+ start = 1;
+
+ if (start == stop)
+ return (0);
+	return (__bam_lock_subtree(dbc, sp->page, start, stop));
+}
+
+/*
+ * Lock the subtree from the current node.
+ */
+static int
+__bam_lock_subtree(dbc, page, indx, stop)
+ DBC *dbc;
+ PAGE *page;
+ u_int32_t indx, stop;
+{
+ DB *dbp;
+ DB_LOCK lock;
+ PAGE *cpage;
+ db_pgno_t pgno;
+ int ret, t_ret;
+
+ dbp = dbc->dbp;
+
+ for (; indx < stop; indx++) {
+ if (dbc->dbtype == DB_RECNO)
+ pgno = GET_RINTERNAL(dbc->dbp, page, indx)->pgno;
+ else
+ pgno = GET_BINTERNAL(dbc->dbp, page, indx)->pgno;
+ if (LEVEL(page) - 1 == LEAFLEVEL) {
+ if ((ret = __db_lget(dbc, 0, pgno,
+ DB_LOCK_WRITE, DB_LOCK_NOWAIT, &lock)) != 0) {
+ if (ret == DB_LOCK_DEADLOCK)
+ return (DB_LOCK_NOTGRANTED);
+ return (ret);
+ }
+ } else {
+ if ((ret = __memp_fget(dbp->mpf, &pgno,
+ dbc->thread_info, dbc->txn, 0, &cpage)) != 0)
+ return (ret);
+ ret = __bam_lock_subtree(dbc, cpage, 0, NUM_ENT(cpage));
+ if ((t_ret = __memp_fput(dbp->mpf, dbc->thread_info,
+ cpage, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ return (ret);
+ }
+ }
+ return (0);
+}
+
+#ifdef HAVE_FTRUNCATE
+/*
+ * __bam_savekey -- save the key from an internal page.
+ * We need to save information so that we can
+ * fetch the next internal node of the tree.  This means
+ * we need the btree key on the current page, or the
+ * next record number.
+ */
+static int
+__bam_savekey(dbc, next, start)
+ DBC *dbc;
+ int next;
+ DBT *start;
+{
+ BINTERNAL *bi;
+ BKEYDATA *bk;
+ BOVERFLOW *bo;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DB_LOCK lock;
+ ENV *env;
+ PAGE *pg;
+ RINTERNAL *ri;
+ db_indx_t indx, top;
+ db_pgno_t pgno, saved_pgno;
+ int ret, t_ret;
+ u_int32_t len;
+ u_int8_t *data;
+ int level;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ pg = cp->csp->page;
+ ret = 0;
+
+ if (dbc->dbtype == DB_RECNO) {
+ if (next)
+ for (indx = 0, top = NUM_ENT(pg); indx != top; indx++) {
+ ri = GET_RINTERNAL(dbp, pg, indx);
+ cp->recno += ri->nrecs;
+ }
+ return (__db_retcopy(env, start, &cp->recno,
+ sizeof(cp->recno), &start->data, &start->ulen));
+
+ }
+
+ bi = GET_BINTERNAL(dbp, pg, NUM_ENT(pg) - 1);
+ data = bi->data;
+ len = bi->len;
+ LOCK_INIT(lock);
+ saved_pgno = PGNO_INVALID;
+	/* If there is a single record on the page it may have an empty key. */
+ while (len == 0) {
+ /*
+		 * We should not have an empty data page, since we just
+		 * compacted things, but check anyway and punt.
+ */
+ if (NUM_ENT(pg) == 0)
+ goto no_key;
+ pgno = bi->pgno;
+ level = LEVEL(pg);
+ if (pg != cp->csp->page &&
+ (ret = __memp_fput(dbp->mpf,
+ dbc->thread_info, pg, dbc->priority)) != 0) {
+ pg = NULL;
+ goto err;
+ }
+ pg = NULL;
+ if (level - 1 == LEAFLEVEL) {
+ TRY_LOCK(dbc, pgno, saved_pgno,
+ lock, DB_LOCK_READ, retry);
+ if (ret != 0)
+ goto err;
+ }
+ if ((ret = __memp_fget(dbp->mpf, &pgno,
+ dbc->thread_info, dbc->txn, 0, &pg)) != 0)
+ goto err;
+
+ /*
+		 * At the data level use the last key to try to avoid the
+		 * possibility that the user has a zero-length key; if they
+		 * do, we punt.
+ */
+ if (pg->level == LEAFLEVEL) {
+ bk = GET_BKEYDATA(dbp, pg, NUM_ENT(pg) - 2);
+ data = bk->data;
+ len = bk->len;
+ if (len == 0) {
+no_key: __db_errx(env, DB_STR("1023",
+ "Compact cannot handle zero length key"));
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ } else {
+ bi = GET_BINTERNAL(dbp, pg, NUM_ENT(pg) - 1);
+ data = bi->data;
+ len = bi->len;
+ }
+ }
+ if (B_TYPE(bi->type) == B_OVERFLOW) {
+ bo = (BOVERFLOW *)(data);
+ ret = __db_goff(dbc, start, bo->tlen, bo->pgno,
+ &start->data, &start->ulen);
+ }
+ else
+ ret = __db_retcopy(env,
+ start, data, len, &start->data, &start->ulen);
+
+err: if (pg != NULL && pg != cp->csp->page &&
+ (t_ret = __memp_fput(dbp->mpf, dbc->thread_info,
+ pg, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+
+retry: return (DB_LOCK_NOTGRANTED);
+}
+
+/*
+ * __bam_truncate_ipages --
+ * Find high numbered pages in the internal nodes of a tree and
+ * swap them for lower numbered pages.
+ * PUBLIC: int __bam_truncate_ipages __P((DB *,
+ * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DB_COMPACT *));
+ */
+int
+__bam_truncate_ipages(dbp, ip, txn, c_data)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ DB_COMPACT *c_data;
+{
+ BTMETA *meta;
+ BTREE *bt;
+ BTREE_CURSOR *cp;
+ DBC *dbc;
+ DBMETA *dbmeta;
+ DBT start;
+ DB_LOCK meta_lock, root_lock;
+ DB_TXN *txn_orig;
+ PAGE *pg, *root;
+ db_pgno_t pgno;
+ u_int32_t sflag;
+ int level, local_txn, ret, rlevel, t_ret;
+
+ COMPQUIET(pg, NULL);
+ dbc = NULL;
+ memset(&start, 0, sizeof(start));
+ LOCK_INIT(root_lock);
+ txn_orig = txn;
+
+ if (IS_DB_AUTO_COMMIT(dbp, txn)) {
+ local_txn = 1;
+ txn = NULL;
+ } else
+ local_txn = 0;
+
+ level = LEAFLEVEL + 1;
+ sflag = CS_READ | CS_GETRECNO;
+ LOCK_INIT(meta_lock);
+ bt = dbp->bt_internal;
+ meta = NULL;
+ root = NULL;
+
+new_txn:
+ if (local_txn &&
+ (ret = __txn_begin(dbp->env, ip, txn_orig, &txn, 0)) != 0)
+ goto err;
+
+ if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0)
+ goto err;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ /*
+	 * If the root is a leaf we have nothing to do.
+ * Searching an empty RECNO tree will return NOTFOUND below and loop.
+ */
+ pgno = PGNO_INVALID;
+ BAM_GET_ROOT(dbc, pgno, root, 0, DB_LOCK_READ, root_lock, ret);
+ if (ret != 0)
+ goto err;
+
+ rlevel = LEVEL(root);
+ if ((ret = __memp_fput(dbp->mpf, ip, root, dbp->priority)) != 0)
+ goto err;
+ root = NULL;
+
+ if (rlevel == LEAFLEVEL)
+ goto again;
+
+ pgno = PGNO_INVALID;
+ do {
+ if ((ret = __bam_csearch(dbc, &start, sflag, level)) != 0) {
+ /* No more at this level, go up one. */
+ if (ret == DB_NOTFOUND) {
+ level++;
+ if (start.data != NULL)
+ __os_free(dbp->env, start.data);
+ memset(&start, 0, sizeof(start));
+ sflag = CS_READ | CS_GETRECNO;
+ continue;
+ }
+ goto err;
+ }
+ c_data->compact_pages_examine++;
+
+ pg = cp->csp->page;
+ pgno = PGNO(pg);
+
+ sflag = CS_NEXT | CS_GETRECNO;
+ /* Grab info about the page and drop the stack. */
+ if (pgno != BAM_ROOT_PGNO(dbc) && (ret = __bam_savekey(dbc,
+ pgno <= c_data->compact_truncate, &start)) != 0) {
+ if (ret == DB_LOCK_NOTGRANTED)
+ continue;
+ goto err;
+ }
+
+ /* We only got read locks so we can drop them. */
+ if ((ret = __bam_stkrel(dbc, STK_NOLOCK)) != 0)
+ goto err;
+ if (pgno == BAM_ROOT_PGNO(dbc))
+ break;
+
+ if (pgno <= c_data->compact_truncate)
+ continue;
+
+ /* Get the meta page lock before latching interior nodes. */
+ if (!LOCK_ISSET(meta_lock) && (ret = __db_lget(dbc,
+ 0, PGNO_BASE_MD, DB_LOCK_WRITE, 0, &meta_lock)) != 0)
+ goto err;
+
+ /* Reget the page with a write latch, and its parent too. */
+ if ((ret = __bam_csearch(dbc,
+ &start, CS_PARENT | CS_GETRECNO, level)) != 0) {
+ if (ret == DB_NOTFOUND) {
+ ret = 0;
+ }
+ goto err;
+ }
+ pgno = PGNO(cp->csp->page);
+
+ if (pgno > c_data->compact_truncate) {
+ if ((ret = __db_exchange_page(dbc, &cp->csp->page,
+ NULL, PGNO_INVALID, DB_EXCH_DEFAULT)) != 0)
+ goto err;
+ }
+
+ /*
+ * For RECNO we need to bump the saved key to the next
+ * page since CS_NEXT will not do that.
+ */
+ if (dbc->dbtype == DB_RECNO &&
+ (ret = __bam_savekey(dbc, 1, &start)) != 0)
+ goto err;
+
+ pg = cp->csp->page;
+ if ((ret = __bam_stkrel(dbc,
+ pgno != PGNO(pg) ? 0 : STK_NOLOCK)) != 0)
+ goto err;
+
+ /* We are locking subtrees, so drop the write locks asap. */
+ if (local_txn && pgno != PGNO(pg))
+ break;
+		/* Loop until we have worked back up to the root page. */
+ } while (pgno != BAM_ROOT_PGNO(dbc));
+
+ if ((ret = __LPUT(dbc, root_lock)) != 0)
+ goto err;
+ if ((ret = __dbc_close(dbc)) != 0)
+ goto err;
+ dbc = NULL;
+ if (local_txn) {
+ if ((ret = __txn_commit(txn, DB_TXN_NOSYNC)) != 0)
+ goto err;
+ txn = NULL;
+ LOCK_INIT(meta_lock);
+ }
+ if (pgno != bt->bt_root)
+ goto new_txn;
+
+ /*
+ * Attempt to move the subdatabase metadata and/or root pages.
+ * Grab the metadata page and verify the revision; if it's out
+ * of date, reopen and try again.
+ */
+again: if (F_ISSET(dbp, DB_AM_SUBDB) &&
+ (bt->bt_root > c_data->compact_truncate ||
+ bt->bt_meta > c_data->compact_truncate)) {
+ if (local_txn && txn == NULL &&
+ (ret = __txn_begin(dbp->env, ip, txn_orig, &txn, 0)) != 0)
+ goto err;
+ if (dbc == NULL &&
+ (ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0)
+ goto err;
+ if ((ret = __db_lget(dbc,
+ 0, bt->bt_meta, DB_LOCK_WRITE, 0, &meta_lock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(dbp->mpf, &bt->bt_meta,
+ dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &meta)) != 0)
+ goto err;
+ if (bt->revision != dbp->mpf->mfp->revision) {
+ if ((ret = __memp_fput(dbp->mpf,
+ ip, meta, dbp->priority)) != 0)
+ goto err;
+ meta = NULL;
+ if (local_txn) {
+ if ((ret = __dbc_close(dbc)) != 0)
+ goto err;
+ dbc = NULL;
+ ret = __txn_abort(txn);
+ txn = NULL;
+ if (ret != 0)
+ goto err;
+ } else {
+ if ((ret = __LPUT(dbc, meta_lock)) != 0)
+ goto err;
+ }
+ if ((ret = __db_reopen(dbc)) != 0)
+ goto err;
+ goto again;
+ }
+ if (PGNO(meta) > c_data->compact_truncate) {
+ dbmeta = (DBMETA *)meta;
+ ret = __db_move_metadata(dbc, &dbmeta, c_data);
+ meta = (BTMETA *)dbmeta;
+ if (ret != 0)
+ goto err;
+ }
+ if (bt->bt_root > c_data->compact_truncate) {
+ if ((ret = __db_lget(dbc, 0,
+ bt->bt_root, DB_LOCK_WRITE, 0, &root_lock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(dbp->mpf,
+ &bt->bt_root, dbc->thread_info,
+ dbc->txn, DB_MPOOL_DIRTY, &root)) != 0)
+ goto err;
+ c_data->compact_pages_examine++;
+ /*
+ * Bump the revision first since any reader will be
+ * blocked on the latch on the old page. That latch
+ * will get dropped when we free the page and the
+ * reader will do a __db_reopen and wait till the meta
+ * page latch is released.
+ */
+ ++dbp->mpf->mfp->revision;
+ if ((ret = __db_exchange_page(dbc,
+ &root, NULL, PGNO_INVALID, DB_EXCH_FREE)) != 0)
+ goto err;
+ if (PGNO(root) == bt->bt_root)
+ goto err;
+ if (DBC_LOGGING(dbc)) {
+ if ((ret =
+ __bam_root_log(dbp, txn, &LSN(meta), 0,
+ PGNO(meta), PGNO(root), &LSN(meta))) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(meta));
+ bt->bt_root = meta->root = PGNO(root);
+ bt->revision = dbp->mpf->mfp->revision;
+ if ((ret = __memp_fput(dbp->mpf,
+ ip, root, dbp->priority)) != 0)
+ goto err;
+ root = NULL;
+ if (txn == NULL && (ret = __LPUT(dbc, root_lock)) != 0)
+ goto err;
+
+ }
+ if ((ret = __memp_fput(dbp->mpf, ip, meta, dbp->priority)) != 0)
+ goto err;
+ meta = NULL;
+ if ((ret = __dbc_close(dbc)) != 0)
+ goto err;
+ dbc = NULL;
+ if (local_txn) {
+ ret = __txn_commit(txn, DB_TXN_NOSYNC);
+ txn = NULL;
+ LOCK_INIT(meta_lock);
+ LOCK_INIT(root_lock);
+ }
+ }
+
+err: if (txn != NULL && ret != 0)
+ sflag = STK_PGONLY;
+ else
+ sflag = 0;
+ if (txn == NULL) {
+ if (dbc != NULL &&
+ (t_ret = __LPUT(dbc, meta_lock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (dbc != NULL &&
+ (t_ret = __LPUT(dbc, root_lock)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ if (meta != NULL && (t_ret = __memp_fput(dbp->mpf,
+ ip, meta, dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (root != NULL && (t_ret = __memp_fput(dbp->mpf,
+ ip, root, dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (dbc != NULL && (t_ret = __bam_stkrel(dbc, sflag)) != 0 && ret == 0)
+ ret = t_ret;
+ if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (local_txn &&
+ txn != NULL && (t_ret = __txn_abort(txn)) != 0 && ret == 0)
+ ret = t_ret;
+ if (start.data != NULL)
+ __os_free(dbp->env, start.data);
+ return (ret);
+}
+
+#endif
diff --git a/src/btree/bt_compare.c b/src/btree/bt_compare.c
new file mode 100644
index 00000000..5c009071
--- /dev/null
+++ b/src/btree/bt_compare.c
@@ -0,0 +1,213 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+
+/*
+ * __bam_cmp --
+ * Compare a key to a given record.
+ *
+ * PUBLIC: int __bam_cmp __P((DBC *, const DBT *, PAGE *, u_int32_t,
+ * PUBLIC: int (*)(DB *, const DBT *, const DBT *), int *));
+ */
+int
+__bam_cmp(dbc, dbt, h, indx, func, cmpp)
+ DBC *dbc;
+ const DBT *dbt;
+ PAGE *h;
+ u_int32_t indx;
+ int (*func)__P((DB *, const DBT *, const DBT *));
+ int *cmpp;
+{
+ BINTERNAL *bi;
+ BKEYDATA *bk;
+ BOVERFLOW *bo;
+ DB *dbp;
+ DBT pg_dbt;
+
+ dbp = dbc->dbp;
+
+ /*
+ * Returns:
+ * < 0 if dbt is < page record
+ * = 0 if dbt is = page record
+ * > 0 if dbt is > page record
+ *
+ * !!!
+ * We do not clear the pg_dbt DBT even though it's likely to contain
+ * random bits. That should be okay, because the app's comparison
+ * routine had better not be looking at fields other than data, size
+ * and app_data. We don't clear it because we go through this path a
+ * lot and it's expensive.
+ */
+ switch (TYPE(h)) {
+ case P_LBTREE:
+ case P_LDUP:
+ case P_LRECNO:
+ bk = GET_BKEYDATA(dbp, h, indx);
+ if (B_TYPE(bk->type) == B_OVERFLOW)
+ bo = (BOVERFLOW *)bk;
+ else {
+ pg_dbt.app_data = NULL;
+ pg_dbt.data = bk->data;
+ pg_dbt.size = bk->len;
+ *cmpp = func(dbp, dbt, &pg_dbt);
+ return (0);
+ }
+ break;
+ case P_IBTREE:
+ /*
+ * The following code guarantees that the left-most key on an
+ * internal page at any place in the tree sorts less than any
+ * user-specified key. The reason is that if we have reached
+ * this internal page, we know the user key must sort greater
+ * than the key we're storing for this page in any internal
+ * pages at levels above us in the tree. It then follows that
+ * any user-specified key cannot sort less than the first page
+ * which we reference, and so there's no reason to call the
+ * comparison routine. While this may save us a comparison
+ * routine call or two, the real reason for this is because
+ * we don't maintain a copy of the smallest key in the tree,
+ * so that we don't have to update all the levels of the tree
+ * should the application store a new smallest key. And, so,
+ * we may not have a key to compare, which makes doing the
+ * comparison difficult and error prone.
+ */
+ if (indx == 0) {
+ *cmpp = 1;
+ return (0);
+ }
+
+ bi = GET_BINTERNAL(dbp, h, indx);
+ if (B_TYPE(bi->type) == B_OVERFLOW)
+ bo = (BOVERFLOW *)(bi->data);
+ else {
+ pg_dbt.app_data = NULL;
+ pg_dbt.data = bi->data;
+ pg_dbt.size = bi->len;
+ *cmpp = func(dbp, dbt, &pg_dbt);
+ return (0);
+ }
+ break;
+ default:
+ return (__db_pgfmt(dbp->env, PGNO(h)));
+ }
+
+ /*
+ * Overflow.
+ */
+ return (__db_moff(dbc, dbt, bo->pgno, bo->tlen,
+ func == __bam_defcmp ? NULL : func, cmpp));
+}
+
+/*
+ * __bam_defcmp --
+ * Default comparison routine.
+ *
+ * PUBLIC: int __bam_defcmp __P((DB *, const DBT *, const DBT *));
+ */
+int
+__bam_defcmp(dbp, a, b)
+ DB *dbp;
+ const DBT *a, *b;
+{
+ size_t len;
+ u_int8_t *p1, *p2;
+
+ COMPQUIET(dbp, NULL);
+
+ /*
+ * Returns:
+ * < 0 if a is < b
+ * = 0 if a is = b
+ * > 0 if a is > b
+ *
+ * XXX
+ * If a size_t doesn't fit into a long, or if the difference between
+ * any two characters doesn't fit into an int, this routine can lose.
+ * What we need is a signed integral type that's guaranteed to be at
+ * least as large as a size_t, and there is no such thing.
+ */
+ len = a->size > b->size ? b->size : a->size;
+ for (p1 = a->data, p2 = b->data; len--; ++p1, ++p2)
+ if (*p1 != *p2)
+ return ((long)*p1 - (long)*p2);
+ return ((long)a->size - (long)b->size);
+}
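+
+/*
+ * For illustration (hypothetical values): with a = { "ab", 2 } and
+ * b = { "abc", 3 }, no byte differs in the first two positions, so the
+ * size difference decides and the routine returns 2 - 3 = -1; "ab"
+ * sorts before "abc".
+ */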
+
+/*
+ * __bam_defpfx --
+ * Default prefix routine.
+ *
+ * PUBLIC: size_t __bam_defpfx __P((DB *, const DBT *, const DBT *));
+ */
+size_t
+__bam_defpfx(dbp, a, b)
+ DB *dbp;
+ const DBT *a, *b;
+{
+ size_t cnt, len;
+ u_int8_t *p1, *p2;
+
+ COMPQUIET(dbp, NULL);
+
+ cnt = 1;
+ len = a->size > b->size ? b->size : a->size;
+ for (p1 = a->data, p2 = b->data; len--; ++p1, ++p2, ++cnt)
+ if (*p1 != *p2)
+ return (cnt);
+
+ /*
+ * They match up to the smaller of the two sizes.
+ * Collate the longer after the shorter.
+ */
+ if (a->size < b->size)
+ return (a->size + 1);
+ if (b->size < a->size)
+ return (b->size + 1);
+ return (b->size);
+}
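+
+/*
+ * Worked example (hypothetical values): for a = "foo" (size 3) and
+ * b = "food" (size 4) every shared byte matches, so the routine
+ * returns a->size + 1 = 4, meaning four bytes of b are needed to
+ * distinguish the keys.  For a = "ab" and b = "ax" the second bytes
+ * differ and it returns 2.
+ */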
diff --git a/src/btree/bt_compress.c b/src/btree/bt_compress.c
new file mode 100644
index 00000000..3f293461
--- /dev/null
+++ b/src/btree/bt_compress.c
@@ -0,0 +1,3173 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+
+#ifdef HAVE_COMPRESSION
+
+static int __bam_compress_marshal_data __P((DB *, const DBT *, DBT *));
+static int __bam_compress_set_dbt __P((DB *, DBT *, const void *, u_int32_t));
+static int __bam_compress_check_sort_multiple_key __P((DB *, DBT *));
+static int __bam_compress_check_sort_multiple __P((DB *, DBT *, DBT *));
+static int __bam_compress_check_sort_multiple_keyonly __P((DB *, DBT *));
+static int __bamc_compress_del_and_get_next __P((DBC *, DBT *, DBT *));
+static int __bamc_compress_get_bothc __P((DBC *, DBT *, u_int32_t));
+static int __bamc_compress_get_multiple_key __P((DBC *, DBT *, u_int32_t));
+static int __bamc_compress_get_multiple __P((DBC *, DBT *, DBT *,u_int32_t));
+static int __bamc_compress_get_next __P((DBC *, u_int32_t));
+static int __bamc_compress_get_next_dup __P((DBC *, DBT *, u_int32_t));
+static int __bamc_compress_get_next_nodup __P((DBC *, u_int32_t));
+static int __bamc_compress_get_prev __P((DBC *, u_int32_t));
+static int __bamc_compress_get_prev_dup __P((DBC *, u_int32_t));
+static int __bamc_compress_get_prev_nodup __P((DBC *, u_int32_t));
+static int __bamc_compress_get_set __P((DBC *,
+ DBT *, DBT *, u_int32_t, u_int32_t));
+static int __bamc_compress_ibulk_del __P((DBC *, DBT *, u_int32_t));
+static int __bamc_compress_idel __P((DBC *, u_int32_t));
+static int __bamc_compress_iget __P((DBC *, DBT *, DBT *, u_int32_t));
+static int __bamc_compress_iput __P((DBC *, DBT *, DBT *, u_int32_t));
+static int __bamc_compress_relocate __P((DBC *));
+static void __bamc_compress_reset __P((DBC *));
+static int __bamc_compress_seek __P((DBC *,
+ const DBT *, const DBT *, u_int32_t));
+static int __bamc_compress_store __P((DBC *,
+ DBT *, DBT*, DBT **, DBT **, DBT *, DBT *));
+static int __bamc_next_decompress __P((DBC *));
+static int __bamc_start_decompress __P((DBC *));
+
+/*
+ * Call __dbc_iget(), resizing DBTs if DB_BUFFER_SMALL is returned.
+ * We're always using a transient cursor when this macro is used, so
+ * we have to replace the OP with DB_CURRENT when we retry.
+ */
+#define CMP_IGET_RETRY(ret, dbc, dbt1, dbt2, flags) do { \
+ DB_ASSERT((dbc)->env, F_ISSET((dbt1), DB_DBT_USERMEM)); \
+ DB_ASSERT((dbc)->env, F_ISSET((dbt2), DB_DBT_USERMEM)); \
+ if (((ret) =__dbc_iget((dbc), \
+ (dbt1), (dbt2), (flags))) == DB_BUFFER_SMALL) { \
+ if ((CMP_RESIZE_DBT((ret), (dbc)->env, (dbt1))) != 0) \
+ break; \
+ if ((CMP_RESIZE_DBT((ret), (dbc)->env, (dbt2))) != 0) \
+ break; \
+ (ret) = __dbc_iget((dbc), (dbt1), (dbt2), \
+ ((flags) & ~DB_OPFLAGS_MASK) | DB_CURRENT); \
+ } \
+} while (0)
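+
+/*
+ * A sketch of a call site (hypothetical): both DBTs must be
+ * DB_DBT_USERMEM.  On DB_BUFFER_SMALL the buffers are grown and the
+ * operation is retried as DB_CURRENT on the now-positioned cursor:
+ *
+ *	CMP_IGET_RETRY(ret, dbc, &cp->key1, &cp->compressed, DB_FIRST);
+ *	if (ret != 0)
+ *		goto err;
+ */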
+
+#define CMP_INIT_DBT(dbt) do { \
+ (dbt)->data = NULL; \
+ (dbt)->size = 0; \
+ (dbt)->ulen = 0; \
+ (dbt)->doff = 0; \
+ (dbt)->dlen = 0; \
+ (dbt)->flags = DB_DBT_USERMEM; \
+ (dbt)->app_data = NULL; \
+} while (0)
+
+#define CMP_FREE_DBT(env, dbt) do { \
+ DB_ASSERT((env), F_ISSET((dbt), DB_DBT_USERMEM)); \
+ __os_free((env), (dbt)->data); \
+} while (0)
+
+#define CMP_RESIZE_DBT(ret, env, dbt) \
+ (((dbt)->size > (dbt)->ulen) ? \
+ ((((ret) = __os_realloc((env), (dbt)->size, &(dbt)->data)) \
+ != 0) ? (ret) : (((dbt)->ulen = (dbt)->size), 0)) : 0)
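+
+/*
+ * Usage sketch (hypothetical caller): CMP_RESIZE_DBT grows a
+ * DB_DBT_USERMEM buffer to dbt->size when it is too small, updating
+ * dbt->ulen.  The expression evaluates to 0 on success, or to the
+ * __os_realloc() error:
+ *
+ *	dbt->size = needed;
+ *	if (CMP_RESIZE_DBT(ret, env, dbt) != 0)
+ *		return (ret);
+ */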
+
+static int
+__bam_compress_set_dbt(dbp, dbt, data, size)
+ DB *dbp;
+ DBT *dbt;
+ const void *data;
+ u_int32_t size;
+{
+ int ret;
+
+ ret = 0;
+ DB_ASSERT(dbp->env, F_ISSET(dbt, DB_DBT_USERMEM));
+
+ dbt->size = size;
+ if (CMP_RESIZE_DBT(ret, dbp->env, dbt) != 0)
+ return (ret);
+
+ memcpy(dbt->data, data, size);
+ return (0);
+}
+
+/******************************************************************************/
+
+/*
+ * Very simple key/data stream to give __bamc_compress_merge_insert()
+ * a source of data to work on.
+ */
+struct __bam_compress_stream;
+typedef struct __bam_compress_stream BTREE_COMPRESS_STREAM;
+struct __bam_compress_stream
+{
+ int (*next)(BTREE_COMPRESS_STREAM *, DBT *, DBT *);
+
+ void *kptr, *dptr;
+ DBT *key, *data;
+};
+
+/*
+ * These function prototypes cannot go at the beginning because they rely
+ * on BTREE_COMPRESS_STREAM, defined above.
+ * The prototypes are required to avoid the Microsoft C++ compiler generating
+ * warnings about mismatching parameter lists.
+ */
+static int __bam_cs_next_done __P((BTREE_COMPRESS_STREAM *, DBT *, DBT *));
+static int __bam_cs_single_next __P((BTREE_COMPRESS_STREAM *, DBT *, DBT *));
+static void __bam_cs_create_single
+ __P((BTREE_COMPRESS_STREAM *, DBT *, DBT *));
+static int __bam_cs_single_keyonly_next
+ __P((BTREE_COMPRESS_STREAM *, DBT *, DBT *));
+static void __bam_cs_create_single_keyonly
+ __P((BTREE_COMPRESS_STREAM *, DBT *));
+static int __bam_cs_multiple_key_next
+ __P((BTREE_COMPRESS_STREAM *, DBT *, DBT *));
+static void __bam_cs_create_multiple_key __P((BTREE_COMPRESS_STREAM *, DBT *));
+static int __bam_cs_multiple_next __P((BTREE_COMPRESS_STREAM *, DBT *, DBT *));
+static void __bam_cs_create_multiple
+ __P((BTREE_COMPRESS_STREAM *, DBT *, DBT *));
+static int __bam_cs_multiple_keyonly_next
+ __P((BTREE_COMPRESS_STREAM *, DBT *, DBT *));
+static void __bam_cs_create_multiple_keyonly
+ __P((BTREE_COMPRESS_STREAM *, DBT *));
+static int __bamc_compress_merge_insert
+ __P((DBC *, BTREE_COMPRESS_STREAM *, u_int32_t *, u_int32_t));
+static int __bamc_compress_merge_delete
+ __P((DBC *, BTREE_COMPRESS_STREAM *, u_int32_t *));
+static int __bamc_compress_merge_delete_dups
+ __P((DBC *, BTREE_COMPRESS_STREAM *, u_int32_t *));
+
+/* BTREE_COMPRESS_STREAM->next() for when the data has finished. */
+static int
+__bam_cs_next_done(stream, key, data)
+ BTREE_COMPRESS_STREAM *stream;
+ DBT *key, *data;
+{
+ COMPQUIET(stream, NULL);
+ COMPQUIET(key, NULL);
+ COMPQUIET(data, NULL);
+ return (0);
+}
+
+/* BTREE_COMPRESS_STREAM->next() for a single key/data pair. */
+static int
+__bam_cs_single_next(stream, key, data)
+ BTREE_COMPRESS_STREAM *stream;
+ DBT *key, *data;
+{
+ key->data = stream->key->data;
+ key->size = stream->key->size;
+ data->data = stream->data->data;
+ data->size = stream->data->size;
+ stream->next = __bam_cs_next_done;
+ return (1);
+}
+
+/* Create a BTREE_COMPRESS_STREAM for a single key/data pair. */
+static void
+__bam_cs_create_single(stream, key, data)
+ BTREE_COMPRESS_STREAM *stream;
+ DBT *key, *data;
+{
+ stream->next = __bam_cs_single_next;
+ stream->key = key;
+ stream->data = data;
+}
+
+/* BTREE_COMPRESS_STREAM->next() for a single key. */
+static int
+__bam_cs_single_keyonly_next(stream, key, data)
+ BTREE_COMPRESS_STREAM *stream;
+ DBT *key, *data;
+{
+ key->data = stream->key->data;
+ key->size = stream->key->size;
+ if (data != NULL) {
+ data->data = NULL;
+ data->size = 0;
+ }
+ stream->next = __bam_cs_next_done;
+ return (1);
+}
+
+/* Create a BTREE_COMPRESS_STREAM for a single key. */
+static void
+__bam_cs_create_single_keyonly(stream, key)
+ BTREE_COMPRESS_STREAM *stream;
+ DBT *key;
+{
+ stream->next = __bam_cs_single_keyonly_next;
+ stream->key = key;
+}
+
+/*
+ * BTREE_COMPRESS_STREAM->next() for a single buffer in the DB_MULTIPLE_KEY
+ * format.
+ */
+static int
+__bam_cs_multiple_key_next(stream, key, data)
+ BTREE_COMPRESS_STREAM *stream;
+ DBT *key, *data;
+{
+ DB_MULTIPLE_KEY_NEXT(stream->kptr, stream->key, key->data, key->size,
+ data->data, data->size);
+ if (key->data == NULL) {
+ stream->next = __bam_cs_next_done;
+ return (0);
+ }
+ return (1);
+}
+
+/*
+ * Create a BTREE_COMPRESS_STREAM for a single buffer in the DB_MULTIPLE_KEY
+ * format.
+ */
+static void
+__bam_cs_create_multiple_key(stream, multiple)
+ BTREE_COMPRESS_STREAM *stream;
+ DBT *multiple;
+{
+ stream->next = __bam_cs_multiple_key_next;
+ stream->key = multiple;
+ DB_MULTIPLE_INIT(stream->kptr, stream->key);
+}
+
+/* BTREE_COMPRESS_STREAM->next() for two buffers in the DB_MULTIPLE format. */
+static int
+__bam_cs_multiple_next(stream, key, data)
+ BTREE_COMPRESS_STREAM *stream;
+ DBT *key, *data;
+{
+ DB_MULTIPLE_NEXT(stream->kptr, stream->key, key->data, key->size);
+ DB_MULTIPLE_NEXT(stream->dptr, stream->data, data->data, data->size);
+ if (key->data == NULL || data->data == NULL) {
+ stream->next = __bam_cs_next_done;
+ return (0);
+ }
+ return (1);
+}
+
+/* Create a BTREE_COMPRESS_STREAM for two buffers in the DB_MULTIPLE format. */
+static void
+__bam_cs_create_multiple(stream, key, data)
+ BTREE_COMPRESS_STREAM *stream;
+ DBT *key, *data;
+{
+ stream->next = __bam_cs_multiple_next;
+ stream->key = key;
+ stream->data = data;
+ DB_MULTIPLE_INIT(stream->kptr, stream->key);
+ DB_MULTIPLE_INIT(stream->dptr, stream->data);
+}
+
+/*
+ * BTREE_COMPRESS_STREAM->next() for a single buffer in the DB_MULTIPLE
+ * format.
+ */
+static int
+__bam_cs_multiple_keyonly_next(stream, key, data)
+ BTREE_COMPRESS_STREAM *stream;
+ DBT *key, *data;
+{
+ DB_MULTIPLE_NEXT(stream->kptr, stream->key, key->data, key->size);
+ if (key->data == NULL) {
+ stream->next = __bam_cs_next_done;
+ return (0);
+ }
+ if (data != NULL) {
+ data->data = NULL;
+ data->size = 0;
+ }
+ return (1);
+}
+
+/*
+ * Create a BTREE_COMPRESS_STREAM for a single buffer in the DB_MULTIPLE
+ * format.
+ */
+static void
+__bam_cs_create_multiple_keyonly(stream, key)
+ BTREE_COMPRESS_STREAM *stream;
+ DBT *key;
+{
+ stream->next = __bam_cs_multiple_keyonly_next;
+ stream->key = key;
+ DB_MULTIPLE_INIT(stream->kptr, stream->key);
+}
+
+/******************************************************************************/
+
+/*
+ * Marshal data in initial data format into destbuf, resizing destbuf if
+ * necessary.
+ */
+static int
+__bam_compress_marshal_data(dbp, data, destbuf)
+ DB *dbp;
+ const DBT *data;
+ DBT *destbuf;
+{
+ int ret;
+ u_int8_t *ptr;
+
+ ret = 0;
+ DB_ASSERT(dbp->env, F_ISSET(destbuf, DB_DBT_USERMEM));
+
+ destbuf->size = __db_compress_count_int(data->size);
+ destbuf->size += data->size;
+ if (CMP_RESIZE_DBT(ret, dbp->env, destbuf) != 0)
+ return (ret);
+
+ ptr = (u_int8_t*)destbuf->data;
+ ptr += __db_compress_int(ptr, data->size);
+ memcpy(ptr, data->data, data->size);
+
+ return (0);
+}
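+
+/*
+ * For illustration (hypothetical bytes, assuming a small length
+ * encodes as a single literal byte): a 5-byte data item "hello"
+ * marshals as the length followed by the raw bytes,
+ * { 0x05, 'h', 'e', 'l', 'l', 'o' }.
+ */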
+
+/*
+ * Unmarshal initial data from source into data - does not copy, points
+ * into source.
+ */
+#define CMP_UNMARSHAL_DATA(src, dest) do { \
+ (dest)->data = ((u_int8_t*)(src)->data) + \
+ __db_decompress_int32((u_int8_t*)(src)->data, \
+ &(dest)->size); \
+} while (0)
+
+/******************************************************************************/
+
+/*
+ * __bam_compress_dupcmp --
+ * Duplicate comparison function for compressed BTrees.
+ *
+ * PUBLIC: int __bam_compress_dupcmp __P((DB *, const DBT *, const DBT *));
+ */
+int
+__bam_compress_dupcmp(db, a, b)
+ DB *db;
+ const DBT *a;
+ const DBT *b;
+{
+ DBT dcmp_a, dcmp_b;
+
+ /* Decompress the initial data in a */
+ CMP_UNMARSHAL_DATA(a, &dcmp_a);
+ dcmp_a.ulen = 0;
+ dcmp_a.doff = 0;
+ dcmp_a.dlen = 0;
+ dcmp_a.flags = 0;
+ dcmp_a.app_data = 0;
+
+ /* Decompress the initial data in b */
+ CMP_UNMARSHAL_DATA(b, &dcmp_b);
+ dcmp_b.ulen = 0;
+ dcmp_b.doff = 0;
+ dcmp_b.dlen = 0;
+ dcmp_b.flags = 0;
+ dcmp_b.app_data = 0;
+
+ /* Call the user's duplicate compare function */
+ return ((BTREE *)db->bt_internal)->
+ compress_dup_compare(db, &dcmp_a, &dcmp_b);
+}
+
+/*
+ * __bam_defcompress --
+ * Default compression routine.
+ *
+ * PUBLIC: int __bam_defcompress __P((DB *, const DBT *, const DBT *,
+ * PUBLIC: const DBT *, const DBT *, DBT *));
+ */
+int
+__bam_defcompress(dbp, prevKey, prevData, key, data, dest)
+ DB *dbp;
+ const DBT *prevKey, *prevData, *key, *data;
+ DBT *dest;
+{
+ u_int8_t *ptr;
+ const u_int8_t *k, *p;
+ size_t len, prefix, suffix;
+
+ COMPQUIET(dbp, NULL);
+
+ k = (const u_int8_t*)key->data;
+ p = (const u_int8_t*)prevKey->data;
+ len = key->size > prevKey->size ? prevKey->size : key->size;
+ for (; len-- && *k == *p; ++k, ++p)
+ continue;
+
+ prefix = (size_t)(k - (u_int8_t*)key->data);
+ suffix = key->size - prefix;
+
+ if (prefix == prevKey->size && suffix == 0) {
+ /* It's a duplicate - do prefix compression on the value */
+ k = (const u_int8_t*)data->data;
+ p = (const u_int8_t*)prevData->data;
+ len = data->size > prevData->size ? prevData->size : data->size;
+ for (; len-- && *k == *p; ++k, ++p)
+ continue;
+
+ prefix = (size_t)(k - (u_int8_t*)data->data);
+ suffix = data->size - prefix;
+
+ /* Check that we have enough space in dest */
+ dest->size = (u_int32_t)(1 + __db_compress_count_int(prefix) +
+ __db_compress_count_int(suffix) + suffix);
+ if (dest->size > dest->ulen)
+ return (DB_BUFFER_SMALL);
+
+ /* Magic identifying byte */
+ ptr = (u_int8_t*)dest->data;
+ *ptr = CMP_INT_SPARE_VAL;
+ ++ptr;
+
+ /* prefix length */
+ ptr += __db_compress_int(ptr, prefix);
+
+ /* suffix length */
+ ptr += __db_compress_int(ptr, suffix);
+
+ /* suffix */
+ memcpy(ptr, k, suffix);
+
+ return (0);
+ }
+
+ /* Check that we have enough space in dest */
+ dest->size = (u_int32_t)(__db_compress_count_int(prefix) +
+ __db_compress_count_int(suffix) +
+ __db_compress_count_int(data->size) + suffix + data->size);
+ if (dest->size > dest->ulen)
+ return (DB_BUFFER_SMALL);
+
+ /* prefix length */
+ ptr = (u_int8_t*)dest->data;
+ ptr += __db_compress_int(ptr, prefix);
+
+ /* suffix length */
+ ptr += __db_compress_int(ptr, suffix);
+
+ /* data length */
+ ptr += __db_compress_int(ptr, data->size);
+
+ /* suffix */
+ memcpy(ptr, k, suffix);
+ ptr += suffix;
+
+ /* data */
+ memcpy(ptr, data->data, data->size);
+
+ return (0);
+}
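+
+/*
+ * The layout produced above, summarized (integer widths depend on
+ * __db_compress_int()):
+ *	duplicate key:	CMP_INT_SPARE_VAL, prefix length, suffix length,
+ *			data suffix bytes
+ *	new key:	prefix length, suffix length, data length,
+ *			key suffix bytes, full data bytes
+ */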
+
+/*
+ * __bam_defdecompress --
+ * Default decompression routine.
+ *
+ * PUBLIC: int __bam_defdecompress __P((DB *, const DBT *, const DBT *, DBT *,
+ * PUBLIC: DBT *, DBT *));
+ */
+int
+__bam_defdecompress(dbp, prevKey, prevData, compressed, destKey, destData)
+ DB *dbp;
+ const DBT *prevKey, *prevData;
+ DBT *compressed, *destKey, *destData;
+{
+ u_int8_t *s, *d;
+ u_int32_t prefix, suffix, size;
+
+ COMPQUIET(dbp, NULL);
+
+ /*
+	 * Check for the magic identifying byte, which tells us that this is
+	 * a compressed duplicate value.
+ */
+ s = (u_int8_t*)compressed->data;
+ if (*s == CMP_INT_SPARE_VAL) {
+ ++s;
+ size = 1;
+
+ /* Unmarshal prefix and suffix */
+ size += __db_decompress_count_int(s);
+ if (size > compressed->size)
+ return (EINVAL);
+ s += __db_decompress_int32(s, &prefix);
+
+ size += __db_decompress_count_int(s);
+ if (size > compressed->size)
+ return (EINVAL);
+ s += __db_decompress_int32(s, &suffix);
+
+ /* Check destination lengths */
+ destKey->size = prevKey->size;
+ destData->size = prefix + suffix;
+ if (destKey->size > destKey->ulen ||
+ destData->size > destData->ulen)
+ return (DB_BUFFER_SMALL);
+
+ /* Write the key */
+ memcpy(destKey->data, prevKey->data, destKey->size);
+
+ /* Write the prefix */
+ if (prefix > prevData->size)
+ return (EINVAL);
+ d = (u_int8_t*)destData->data;
+ memcpy(d, prevData->data, prefix);
+ d += prefix;
+
+ /* Write the suffix */
+ size += suffix;
+ if (size > compressed->size)
+ return (EINVAL);
+ memcpy(d, s, suffix);
+ s += suffix;
+
+ /* Return bytes read */
+ compressed->size = (u_int32_t)(s - (u_int8_t*)compressed->data);
+ return (0);
+ }
+
+ /* Unmarshal prefix, suffix and data length */
+ size = __db_decompress_count_int(s);
+ if (size > compressed->size)
+ return (EINVAL);
+ s += __db_decompress_int32(s, &prefix);
+
+ size += __db_decompress_count_int(s);
+ if (size > compressed->size)
+ return (EINVAL);
+ s += __db_decompress_int32(s, &suffix);
+
+ size += __db_decompress_count_int(s);
+ if (size > compressed->size)
+ return (EINVAL);
+ s += __db_decompress_int32(s, &destData->size);
+
+ /* Check destination lengths */
+ destKey->size = prefix + suffix;
+ if (destKey->size > destKey->ulen || destData->size > destData->ulen)
+ return (DB_BUFFER_SMALL);
+
+ /* Write the prefix */
+ if (prefix > prevKey->size)
+ return (EINVAL);
+ d = (u_int8_t*)destKey->data;
+ memcpy(d, prevKey->data, prefix);
+ d += prefix;
+
+ /* Write the suffix */
+ size += suffix;
+ if (size > compressed->size)
+ return (EINVAL);
+ memcpy(d, s, suffix);
+ s += suffix;
+
+ /* Write the data */
+ size += destData->size;
+ if (size > compressed->size)
+ return (EINVAL);
+ memcpy(destData->data, s, destData->size);
+ s += destData->size;
+
+ /* Return bytes read */
+ compressed->size = (u_int32_t)(s - (u_int8_t*)compressed->data);
+ return (0);
+}
+
+/******************************************************************************/
+
+/*
+ * Set dbc up to start decompressing the compressed key/data pair in
+ * cp->key1 and cp->compressed.
+ */
+static int
+__bamc_start_decompress(dbc)
+ DBC *dbc;
+{
+ BTREE_CURSOR *cp;
+ int ret;
+ u_int32_t datasize;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ cp->prevKey = NULL;
+ cp->prevData = NULL;
+ cp->currentKey = &cp->key1;
+ cp->currentData = &cp->data1;
+ cp->compcursor = (u_int8_t*)cp->compressed.data;
+ cp->compend = cp->compcursor + cp->compressed.size;
+ cp->prevcursor = NULL;
+ cp->prev2cursor = NULL;
+
+ /* Unmarshal the first data */
+ cp->compcursor += __db_decompress_int32(cp->compcursor, &datasize);
+ ret = __bam_compress_set_dbt(dbc->dbp,
+ cp->currentData, cp->compcursor, datasize);
+
+ if (ret == 0)
+ cp->compcursor += datasize;
+ return (ret);
+}
+
+/* Decompress the next key/data pair from cp->compressed. */
+static int
+__bamc_next_decompress(dbc)
+ DBC *dbc;
+{
+ DBT compressed;
+ int ret;
+ BTREE_CURSOR *cp;
+ DB *db;
+
+ ret = 0;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ db = dbc->dbp;
+
+ if (cp->compcursor >= cp->compend)
+ return (DB_NOTFOUND);
+
+ cp->prevKey = cp->currentKey;
+ cp->prevData = cp->currentData;
+ cp->prev2cursor = cp->prevcursor;
+ cp->prevcursor = cp->compcursor;
+
+ if (cp->currentKey == &cp->key1) {
+ cp->currentKey = &cp->key2;
+ cp->currentData = &cp->data2;
+ } else {
+ cp->currentKey = &cp->key1;
+ cp->currentData = &cp->data1;
+ }
+
+ compressed.flags = DB_DBT_USERMEM;
+ compressed.data = (void*)cp->compcursor;
+ compressed.ulen = compressed.size =
+ (u_int32_t)(cp->compend - cp->compcursor);
+ compressed.app_data = NULL;
+
+ while ((ret = ((BTREE *)db->bt_internal)->bt_decompress(db,
+ cp->prevKey, cp->prevData, &compressed,
+ cp->currentKey, cp->currentData)) == DB_BUFFER_SMALL) {
+ if (CMP_RESIZE_DBT(ret, dbc->env, cp->currentKey) != 0)
+ break;
+ if (CMP_RESIZE_DBT(ret, dbc->env, cp->currentData) != 0)
+ break;
+ }
+
+ if (ret == 0)
+ cp->compcursor += compressed.size;
+ return (ret);
+}
+
+/*
+ * Store key and data into destkey and destbuf, using the compression
+ * callback given.
+ */
+static int
+__bamc_compress_store(dbc, key, data, prevKey, prevData, destkey, destbuf)
+ DBC *dbc;
+ DBT *key, *data;
+ DBT **prevKey, **prevData;
+ DBT *destkey, *destbuf;
+{
+ int ret;
+ DBT dest;
+
+	if (*prevKey == NULL) {
+ if ((ret = __bam_compress_set_dbt(dbc->dbp,
+ destkey, key->data, key->size)) != 0)
+ return (ret);
+
+ /* Marshal data - resize if it won't fit */
+ ret = __bam_compress_marshal_data(dbc->dbp, data, destbuf);
+
+ } else if (((BTREE_CURSOR *)dbc->internal)->ovflsize > destbuf->size) {
+ /*
+ * Don't write more than cp->ovflsize bytes to the destination
+ * buffer - destbuf must be at least cp->ovflsize in size.
+ */
+ dest.flags = DB_DBT_USERMEM;
+ dest.data = (u_int8_t*)destbuf->data + destbuf->size;
+ dest.ulen =
+ ((BTREE_CURSOR *)dbc->internal)->ovflsize - destbuf->size;
+ dest.size = 0;
+ dest.app_data = NULL;
+
+ ret = ((BTREE *)dbc->dbp->bt_internal)->bt_compress(
+ dbc->dbp, *prevKey, *prevData, key, data, &dest);
+
+ if (ret == 0)
+ destbuf->size += dest.size;
+ } else
+ ret = DB_BUFFER_SMALL;
+
+ if (ret == 0) {
+ *prevKey = key;
+ *prevData = data;
+ }
+
+ return (ret);
+}
+
+/*
+ * Move dbc to the correct position to start linear searching for
+ * seek_key/seek_data - the biggest key smaller than or equal to
+ * seek_key/seek_data.
+ */
+static int
+__bamc_compress_seek(dbc, seek_key, seek_data, flags)
+ DBC *dbc;
+ const DBT *seek_key;
+ const DBT *seek_data;
+ u_int32_t flags;
+{
+ int ret;
+ u_int32_t method;
+ DB *dbp;
+ BTREE_CURSOR *cp;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ if ((ret = __bam_compress_set_dbt(
+ dbp, &cp->key1, seek_key->data, seek_key->size)) != 0)
+ return (ret);
+
+ /*
+	 * We allow seek_data to be NULL for __bamc_compress_get_set() with
+	 * DB_SET.
+ */
+ if (F_ISSET(dbp, DB_AM_DUPSORT) && seek_data != NULL) {
+ if ((ret = __bam_compress_marshal_data(
+ dbp, seek_data, &cp->compressed)) != 0)
+ return (ret);
+
+ method = DB_GET_BOTH_LTE;
+ } else
+ method = DB_SET_LTE;
+
+ CMP_IGET_RETRY(ret, dbc, &cp->key1, &cp->compressed, method | flags);
+
+ if (ret == 0 &&
+ F_ISSET(dbp, DB_AM_DUPSORT) && seek_data == NULL &&
+ __db_compare_both(dbp, seek_key, 0, &cp->key1, 0) == 0) {
+ /*
+ * Some entries for seek_key might be in the previous chunk,
+ * so we need to start searching there.
+ */
+ CMP_IGET_RETRY(ret,
+ dbc, &cp->key1, &cp->compressed, DB_PREV | flags);
+ if (ret == DB_NOTFOUND) {
+			/* No previous chunk; we need the first entry. */
+ CMP_IGET_RETRY(ret,
+ dbc, &cp->key1, &cp->compressed, DB_FIRST | flags);
+ }
+ }
+
+ return (ret);
+}
+
+/* Reset the cursor to an uninitialized state */
+static void
+__bamc_compress_reset(dbc)
+ DBC *dbc;
+{
+ BTREE_CURSOR *cp;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ cp->prevKey = 0;
+ cp->prevData = 0;
+ cp->currentKey = 0;
+ cp->currentData = 0;
+ cp->compcursor = 0;
+ cp->compend = 0;
+ cp->prevcursor = 0;
+ cp->prev2cursor = 0;
+
+ F_CLR(cp, C_COMPRESS_DELETED|C_COMPRESS_MODIFIED);
+}
+
+/*
+ * Duplicate the cursor and delete the current entry, move the original cursor
+ * on and then close the cursor we used to delete. We do that to make sure that
+ * the close method runs __bamc_physdel(), and actually gets rid of the deleted
+ * entry!
+ */
+static int
+__bamc_compress_del_and_get_next(dbc, nextk, nextc)
+ DBC *dbc;
+ DBT *nextk, *nextc;
+{
+ int ret, ret_n;
+ DBC *dbc_n;
+
+ if ((ret = __dbc_dup(dbc, &dbc_n, DB_POSITION | DB_SHALLOW_DUP)) != 0)
+ return (ret);
+ F_SET(dbc_n, DBC_TRANSIENT);
+
+ if ((ret = __dbc_idel(dbc_n, 0)) != 0)
+ goto err;
+
+ /* Read the next position */
+ CMP_IGET_RETRY(ret, dbc, nextk, nextc, DB_NEXT);
+
+ err:
+ if ((ret_n = __dbc_close(dbc_n)) != 0 && ret == 0)
+ ret = ret_n;
+
+ /* No need to relocate this cursor */
+ F_CLR((BTREE_CURSOR *)dbc->internal, C_COMPRESS_MODIFIED);
+
+ return (ret);
+}
+
+/*
+ * Duplicate the cursor, re-locate the position that this cursor pointed to
+ * using the duplicate (it may have been deleted), and then swap
+ * the cursors. We do that to make sure that the close method runs
+ * __bamc_physdel(), and gets rid of the entry that may have been deleted.
+ */
+static int
+__bamc_compress_relocate(dbc)
+ DBC *dbc;
+{
+ int ret, t_ret;
+ BTREE_CURSOR *cp, *cp_n;
+ DBC *dbc_n;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ if ((ret = __dbc_dup(dbc, &dbc_n, 0)) != 0)
+ return (ret);
+ F_SET(dbc_n, DBC_TRANSIENT);
+
+ cp_n = (BTREE_CURSOR *)dbc_n->internal;
+
+ if (F_ISSET(cp, C_COMPRESS_DELETED)) {
+ /* Find the position after the deleted entry again */
+ ret = __bamc_compress_get_set(
+ dbc_n, &cp->del_key, &cp->del_data, 0, 0);
+ if (ret == DB_NOTFOUND) {
+ __bamc_compress_reset(dbc_n);
+ ret = 0;
+ } else if (ret != 0)
+ goto err;
+
+ F_SET(cp_n, C_COMPRESS_DELETED);
+
+ } else if (cp->currentKey != NULL) {
+ /* Find the current entry again */
+ ret = __bamc_compress_get_set(
+ dbc_n, cp->currentKey, cp->currentData,
+ F_ISSET(dbc->dbp, DB_AM_DUPSORT) ? DB_GET_BOTH : DB_SET, 0);
+
+ if (ret == DB_NOTFOUND) {
+ /* The current entry has been deleted */
+ if ((ret = __bam_compress_set_dbt(dbc_n->dbp,
+ &cp_n->del_key,
+ cp->currentKey->data, cp->currentKey->size)) != 0)
+ return (ret);
+ if ((ret = __bam_compress_set_dbt(dbc_n->dbp,
+ &cp_n->del_data, cp->currentData->data,
+ cp->currentData->size)) != 0)
+ return (ret);
+ F_SET(cp_n, C_COMPRESS_DELETED);
+ ret = 0;
+ } else if (ret != 0)
+ goto err;
+ }
+
+ err:
+ /*
+ * Cleanup and cursor resolution. This also clears the
+ * C_COMPRESS_MODIFIED flag.
+ */
+ if ((t_ret = __dbc_cleanup(dbc, dbc_n, ret)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/******************************************************************************/
+
+#define CMP_STORE(key, data) do { \
+ while ((ret = __bamc_compress_store(dbc, (key), (data), \
+ &prevDestKey, &prevDestData, &destkey, &destbuf)) \
+ == DB_BUFFER_SMALL) { \
+ if ((ret = __dbc_iput(dbc, \
+ &destkey, &destbuf, DB_KEYLAST)) != 0) \
+ goto end; \
+ prevDestKey = NULL; \
+ prevDestData = NULL; \
+ destbuf.size = 0; \
+ } \
+} while (0)
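+
+/*
+ * A note on CMP_STORE (a summary, not from the original comments): a store
+ * returns DB_BUFFER_SMALL when the chunk being built in destbuf is full,
+ * so the macro writes the finished chunk with __dbc_iput() and retries the
+ * store into the now-empty buffer; prevDestKey/prevDestData are cleared
+ * because the new chunk has no previous entry for the compression callback
+ * to delta against.
+ */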
+
+/* Merge the sorted key/data pairs from stream into the compressed database. */
+static int
+__bamc_compress_merge_insert(dbc, stream, countp, flags)
+ DBC *dbc;
+ BTREE_COMPRESS_STREAM *stream;
+ u_int32_t *countp;
+ u_int32_t flags;
+{
+ DBT ikey1, ikey2, idata1, idata2, nextk, nextc, nextd, destkey, destbuf;
+ DBT *ikey, *idata, *prevIkey, *prevIdata, *prevDestKey, *prevDestData;
+ int ret, bulk_ret, cmp, nextExists, moreCompressed, iSmallEnough;
+ int moreStream;
+ u_int32_t chunk_count;
+ ENV *env;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+
+ env = dbc->env;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ dbp = dbc->dbp;
+ bulk_ret = 0;
+
+ memset(&ikey1, 0, sizeof(DBT));
+ memset(&ikey2, 0, sizeof(DBT));
+ memset(&idata1, 0, sizeof(DBT));
+ memset(&idata2, 0, sizeof(DBT));
+
+ CMP_INIT_DBT(&nextk);
+ CMP_INIT_DBT(&nextc);
+ memset(&nextd, 0, sizeof(DBT));
+
+ CMP_INIT_DBT(&destkey);
+ CMP_INIT_DBT(&destbuf);
+ if ((ret = __os_malloc(env, cp->ovflsize, &destbuf.data)) != 0)
+ goto end;
+ destbuf.ulen = cp->ovflsize;
+
+ if (countp != NULL)
+ *countp = 0;
+ chunk_count = 0;
+
+ /* Get the first input key and data */
+ ret = 0;
+ prevIkey = NULL;
+ prevIdata = NULL;
+ ikey = &ikey1;
+ idata = &idata1;
+ if (stream->next(stream, ikey, idata) == 0)
+ goto end;
+
+ prevDestKey = NULL;
+ prevDestData = NULL;
+
+ moreStream = 1;
+ while (moreStream != 0) {
+ nextExists = 1;
+ moreCompressed = 1;
+
+ /* Seek the ikey/idata position */
+ ret = __bamc_compress_seek(dbc, ikey, idata, 0);
+ if (ret == 0) {
+ /*
+ * Delete the key - we might overwrite it below
+ * but it's safer to just always delete it, and it
+ * doesn't seem significantly slower to do so.
+ */
+ ret = __bamc_compress_del_and_get_next(dbc, &nextk,
+ &nextc);
+ if (ret == DB_NOTFOUND) {
+ ret = 0;
+ nextExists = 0;
+ } else if (ret == 0) {
+ CMP_UNMARSHAL_DATA(&nextc, &nextd);
+ } else
+ goto end;
+ ret = __bamc_start_decompress(dbc);
+ } else if (ret == DB_NOTFOUND) {
+ moreCompressed = 0;
+
+ /* Read the next position */
+ CMP_IGET_RETRY(ret, dbc, &nextk, &nextc, DB_FIRST);
+ if (ret == DB_NOTFOUND) {
+ ret = 0;
+ nextExists = 0;
+ } else if (ret == 0) {
+ CMP_UNMARSHAL_DATA(&nextc, &nextd);
+ }
+ }
+
+ if (ret != 0)
+ goto end;
+
+ /* !nextExists || ikey/idata < nextk/nextd */
+ iSmallEnough = 1;
+
+ while (moreCompressed != 0 || iSmallEnough != 0) {
+ if (moreCompressed == 0)
+ cmp = 1;
+ else if (iSmallEnough == 0)
+ cmp = -1;
+ else
+ cmp = __db_compare_both(dbp, cp->currentKey,
+ cp->currentData, ikey, idata);
+
+ if (cmp < 0) {
+store_current: CMP_STORE(cp->currentKey, cp->currentData);
+ if (ret != 0)
+ goto end;
+ } else {
+ switch (flags) {
+ case DB_KEYLAST:
+ case DB_KEYFIRST:
+ case DB_NODUPDATA:
+ if (cmp == 0 && bulk_ret == 0 &&
+ F_ISSET(dbp, DB_AM_DUPSORT)) {
+ bulk_ret = __db_duperr(dbp,
+ flags);
+
+ /*
+ * Continue until we store
+ * the current chunk,
+ * but don't insert any
+ * more entries.
+ */
+ moreStream = 0;
+ iSmallEnough = 0;
+
+ goto store_current;
+ }
+ break;
+ default:
+ break;
+ }
+
+ CMP_STORE(ikey, idata);
+ if (ret != 0)
+ goto end;
+ ++chunk_count;
+
+ /*
+ * prevDestKey/prevDestData now point to
+ * the same DBTs as ikey/idata. We don't
+ * want to overwrite them, so swap them
+ * to point to the other DBTs.
+ */
+ if (ikey == &ikey1) {
+ ikey = &ikey2;
+ idata = &idata2;
+ prevIkey = &ikey1;
+ prevIdata = &idata1;
+ } else {
+ ikey = &ikey1;
+ idata = &idata1;
+ prevIkey = &ikey2;
+ prevIdata = &idata2;
+ }
+
+ do {
+ /* Get the next input key and data */
+ if (stream->next(
+ stream, ikey, idata) == 0) {
+ moreStream = 0;
+ iSmallEnough = 0;
+ break;
+ }
+
+#ifdef DIAGNOSTIC
+ /* Check that the stream is sorted */
+ DB_ASSERT(env, __db_compare_both(dbp,
+ ikey, idata, prevIkey,
+ prevIdata) >= 0);
+#endif
+
+ /* Check for duplicates in the stream */
+ } while (__db_compare_both(dbp, ikey, idata,
+ prevIkey, prevIdata) == 0);
+
+ /*
+ * Check that !nextExists ||
+ * ikey/idata < nextk/nextd
+ */
+ if (moreStream != 0 && nextExists != 0 &&
+ __db_compare_both(dbp, ikey,
+ idata, &nextk, &nextd) >= 0)
+ iSmallEnough = 0;
+ }
+
+ if (cmp <= 0) {
+ ret = __bamc_next_decompress(dbc);
+ if (ret == DB_NOTFOUND) {
+ moreCompressed = 0;
+ ret = 0;
+ } else if (ret != 0)
+ goto end;
+
+ }
+ }
+
+ if (prevDestKey != NULL) {
+ if ((ret = __dbc_iput(
+ dbc, &destkey, &destbuf, DB_KEYLAST)) != 0)
+ goto end;
+
+ if (countp != NULL)
+ *countp += chunk_count;
+ chunk_count = 0;
+
+ prevDestKey = NULL;
+ prevDestData = NULL;
+ destbuf.size = 0;
+ }
+ }
+
+ end:
+ CMP_FREE_DBT(env, &destkey);
+ CMP_FREE_DBT(env, &destbuf);
+ CMP_FREE_DBT(env, &nextk);
+ CMP_FREE_DBT(env, &nextc);
+
+ return (ret != 0 ? ret : bulk_ret);
+}
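+
+/*
+ * In outline (a hand-written summary, not compiled): for each chunk that
+ * overlaps the stream, the loop above deletes the chunk, then merge-sorts
+ * its decompressed entries with the stream entries while re-compressing
+ * them into destbuf; whenever destbuf fills (CMP_STORE sees
+ * DB_BUFFER_SMALL) the finished chunk is written with __dbc_iput() and a
+ * fresh chunk is started, and any partial chunk left at the end of an
+ * iteration is flushed by the __dbc_iput() call at the bottom of the loop.
+ */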
+
+/******************************************************************************/
+
+/* Remove the sorted key/data pairs in stream from the compressed database. */
+static int
+__bamc_compress_merge_delete(dbc, stream, countp)
+ DBC *dbc;
+ BTREE_COMPRESS_STREAM *stream;
+ u_int32_t *countp;
+{
+ DBT ikey, idata, nextk, nextc, nextd, destkey, destbuf, pdestkey;
+ DBT pdestdata;
+#ifdef DIAGNOSTIC
+ DBT pikey, pidata;
+#endif
+ DBT *prevDestKey, *prevDestData;
+ int ret, bulk_ret, cmp, moreCompressed, moreStream, nextExists;
+ int iSmallEnough;
+ u_int32_t chunk_count;
+ ENV *env;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+
+ env = dbc->env;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ dbp = dbc->dbp;
+ bulk_ret = 0;
+
+ memset(&ikey, 0, sizeof(DBT));
+ memset(&idata, 0, sizeof(DBT));
+
+ CMP_INIT_DBT(&nextk);
+ CMP_INIT_DBT(&nextc);
+ memset(&nextd, 0, sizeof(DBT));
+
+ CMP_INIT_DBT(&pdestkey);
+ CMP_INIT_DBT(&pdestdata);
+
+ CMP_INIT_DBT(&destkey);
+ CMP_INIT_DBT(&destbuf);
+ if ((ret = __os_malloc(env, cp->ovflsize, &destbuf.data)) != 0)
+ goto end;
+ destbuf.ulen = cp->ovflsize;
+
+ if (countp != NULL)
+ *countp = 0;
+ chunk_count = 0;
+
+ /* Get the first input key and data */
+ ret = 0;
+ if (stream->next(stream, &ikey, &idata) == 0)
+ goto end;
+
+ prevDestKey = NULL;
+ prevDestData = NULL;
+
+ moreStream = 1;
+ while (moreStream != 0) {
+ nextExists = 1;
+ moreCompressed = 1;
+
+ /* Seek the ikey/idata position */
+ if ((ret = __bamc_compress_seek(dbc, &ikey, &idata, 0)) != 0)
+ goto end;
+
+ /*
+ * Delete the key - we might overwrite it below but it's safer
+ * to just always delete it, and it doesn't seem significantly
+ * slower to do so.
+ */
+ ret = __bamc_compress_del_and_get_next(dbc, &nextk, &nextc);
+ if (ret == DB_NOTFOUND) {
+ ret = 0;
+ nextExists = 0;
+ } else if (ret == 0) {
+ CMP_UNMARSHAL_DATA(&nextc, &nextd);
+ } else
+ goto end;
+
+ if ((ret = __bamc_start_decompress(dbc)) != 0)
+ goto end;
+
+ /* !nextExists || ikey/idata < nextk/nextd */
+ iSmallEnough = 1;
+
+ while (moreCompressed != 0 || iSmallEnough != 0) {
+ if (moreCompressed == 0)
+ cmp = 1;
+ else if (iSmallEnough == 0)
+ cmp = -1;
+ else
+ cmp = __db_compare_both(dbp, cp->currentKey,
+ cp->currentData, &ikey, &idata);
+
+ if (cmp < 0) {
+ CMP_STORE(cp->currentKey, cp->currentData);
+ if (ret != 0)
+ goto end;
+
+ if ((ret = __bam_compress_set_dbt(dbp,
+ &pdestkey, cp->currentKey->data,
+ cp->currentKey->size)) != 0)
+ goto end;
+ if ((ret = __bam_compress_set_dbt(dbp,
+ &pdestdata, cp->currentData->data,
+ cp->currentData->size)) != 0)
+ goto end;
+ prevDestKey = &pdestkey;
+ prevDestData = &pdestdata;
+ } else {
+ if (cmp != 0) {
+ /*
+ * Continue until we store the current
+ * chunk, but don't delete any more
+ * entries.
+ */
+ bulk_ret = DB_NOTFOUND;
+ moreStream = 0;
+ iSmallEnough = 0;
+ } else
+ ++chunk_count;
+
+#ifdef DIAGNOSTIC
+ pikey = ikey;
+ pidata = idata;
+#endif
+
+ /* Get the next input key and data */
+ if (stream->next(stream, &ikey, &idata) == 0) {
+ moreStream = 0;
+ iSmallEnough = 0;
+ }
+
+#ifdef DIAGNOSTIC
+ /* Check that the stream is sorted */
+ DB_ASSERT(env, moreStream == 0 ||
+ __db_compare_both(dbp, &ikey, &idata,
+ &pikey, &pidata) >= 0);
+#endif
+
+ /*
+ * Check that !nextExists ||
+ * ikey/idata < nextk/nextd
+ */
+ if (moreStream != 0 && nextExists != 0 &&
+ __db_compare_both(dbp, &ikey,
+ &idata, &nextk, &nextd) >= 0)
+ iSmallEnough = 0;
+ }
+
+ if (cmp <= 0) {
+ ret = __bamc_next_decompress(dbc);
+ if (ret == DB_NOTFOUND) {
+ moreCompressed = 0;
+ ret = 0;
+ } else if (ret != 0)
+ goto end;
+ }
+ }
+
+ if (prevDestKey != NULL) {
+ if ((ret = __dbc_iput(
+ dbc, &destkey, &destbuf, DB_KEYLAST)) != 0)
+ goto end;
+
+ if (countp)
+ *countp += chunk_count;
+ chunk_count = 0;
+
+ prevDestKey = NULL;
+ prevDestData = NULL;
+ destbuf.size = 0;
+ }
+ }
+
+ end:
+ CMP_FREE_DBT(env, &destkey);
+ CMP_FREE_DBT(env, &destbuf);
+ CMP_FREE_DBT(env, &pdestkey);
+ CMP_FREE_DBT(env, &pdestdata);
+ CMP_FREE_DBT(env, &nextk);
+ CMP_FREE_DBT(env, &nextc);
+
+ return (ret != 0 ? ret : bulk_ret);
+}
+
+/*
+ * Remove the sorted keys in stream along with all duplicate values from
+ * the compressed database.
+ */
+static int
+__bamc_compress_merge_delete_dups(dbc, stream, countp)
+ DBC *dbc;
+ BTREE_COMPRESS_STREAM *stream;
+ u_int32_t *countp;
+{
+ DBC *dbc_n;
+ DBT ikey, nextk, noread, destkey, destbuf, pdestkey, pdestdata;
+#ifdef DIAGNOSTIC
+ DBT pikey;
+#endif
+ DBT *prevDestKey, *prevDestData;
+ int ret, ret_n, bulk_ret, cmp, moreCompressed, moreStream, nextExists;
+ int iSmallEnough, ifound;
+ u_int32_t chunk_count;
+ ENV *env;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+
+ env = dbc->env;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ dbp = dbc->dbp;
+ bulk_ret = 0;
+
+ memset(&ikey, 0, sizeof(DBT));
+
+ CMP_INIT_DBT(&nextk);
+
+ memset(&noread, 0, sizeof(DBT));
+ noread.flags = DB_DBT_PARTIAL | DB_DBT_USERMEM;
+
+ CMP_INIT_DBT(&pdestkey);
+ CMP_INIT_DBT(&pdestdata);
+
+ CMP_INIT_DBT(&destkey);
+ CMP_INIT_DBT(&destbuf);
+ if ((ret = __os_malloc(env, cp->ovflsize, &destbuf.data)) != 0)
+ goto end;
+ destbuf.ulen = cp->ovflsize;
+
+ if (countp != NULL)
+ *countp = 0;
+ chunk_count = 0;
+
+ /* Get the first input key and data */
+ ret = 0;
+ if (stream->next(stream, &ikey, NULL) == 0)
+ goto end;
+ ifound = 0;
+
+ prevDestKey = NULL;
+ prevDestData = NULL;
+
+ moreStream = 1;
+ iSmallEnough = 0;
+ nextExists = 0;
+ while (moreStream != 0) {
+ if (iSmallEnough != 0) {
+ if (nextExists == 0) {
+ /*
+ * We've finished deleting the last key
+ * in the database
+ */
+ if (ifound == 0)
+ bulk_ret = DB_NOTFOUND;
+ else
+ ++chunk_count;
+ break;
+ }
+
+ /* Move to the next chunk */
+ CMP_IGET_RETRY(
+ ret, dbc, &cp->key1, &cp->compressed, DB_CURRENT);
+ if (ret == DB_NOTFOUND) {
+ ret = 0;
+ break;
+ } else if (ret != 0)
+ goto end;
+ } else
+ /* Seek the ikey position */
+ if ((ret =
+ __bamc_compress_seek(dbc, &ikey, NULL, 0)) != 0)
+ goto end;
+
+ nextExists = 1;
+ moreCompressed = 1;
+
+ /*
+ * Delete the key - we might overwrite it below but it's
+ * safer to just always delete it, and it doesn't seem
+ * significantly slower to do so.
+ */
+ ret = __bamc_compress_del_and_get_next(dbc, &nextk, &noread);
+ if (ret == DB_NOTFOUND) {
+ ret = 0;
+ nextExists = 0;
+ } else if (ret != 0)
+ goto end;
+
+ if ((ret = __bamc_start_decompress(dbc)) != 0)
+ goto end;
+
+ /* !nextExists || ikey <= nextk */
+ iSmallEnough = 1;
+
+ while (moreCompressed != 0) {
+ if (moreCompressed == 0)
+ cmp = 1;
+ else if (iSmallEnough == 0)
+ cmp = -1;
+ else
+ cmp = __db_compare_both(
+ dbp, cp->currentKey, NULL, &ikey, NULL);
+
+ if (cmp < 0) {
+ if ((ret = __bamc_compress_store(dbc,
+ cp->currentKey, cp->currentData,
+ &prevDestKey,
+ &prevDestData, &destkey, &destbuf)) != 0)
+ goto end;
+
+ if ((ret = __bam_compress_set_dbt(dbp,
+ &pdestkey, cp->currentKey->data,
+ cp->currentKey->size)) != 0)
+ goto end;
+ if ((ret = __bam_compress_set_dbt(dbp,
+ &pdestdata, cp->currentData->data,
+ cp->currentData->size)) != 0)
+ goto end;
+ prevDestKey = &pdestkey;
+ prevDestData = &pdestdata;
+ } else if (cmp > 0) {
+ if (ifound == 0) {
+ /*
+ * Continue until we store the
+ * current chunk, but don't delete
+ * any more entries.
+ */
+ bulk_ret = DB_NOTFOUND;
+ moreStream = 0;
+ iSmallEnough = 0;
+ } else
+ ++chunk_count;
+
+#ifdef DIAGNOSTIC
+ pikey = ikey;
+#endif
+
+ /* Get the next input key */
+ if (stream->next(stream, &ikey, NULL) == 0) {
+ moreStream = 0;
+ iSmallEnough = 0;
+ }
+ ifound = 0;
+
+#ifdef DIAGNOSTIC
+ /* Check that the stream is sorted */
+ DB_ASSERT(env, moreStream == 0 ||
+ __db_compare_both(dbp, &ikey, NULL,
+ &pikey, NULL) >= 0);
+#endif
+
+ /* Check that !nextExists || ikey <= nextk */
+ if (moreStream != 0 && nextExists != 0 &&
+ __db_compare_both(dbp,
+ &ikey, NULL, &nextk, NULL) > 0)
+ iSmallEnough = 0;
+ } else /* cmp == 0 */
+ ifound = 1;
+
+ if (cmp <= 0) {
+ ret = __bamc_next_decompress(dbc);
+ if (ret == DB_NOTFOUND) {
+ moreCompressed = 0;
+ ret = 0;
+ } else if (ret != 0)
+ goto end;
+ }
+ }
+
+ if (prevDestKey != NULL) {
+ /*
+ * Do the DBC->put() with a duplicate cursor, so that
+ * the main cursor's position isn't changed - we might
+ * need it to be the same in order to use DB_CURRENT
+ * above.
+ */
+ if ((ret = __dbc_dup(dbc, &dbc_n, 0)) != 0)
+ goto end;
+ F_SET(dbc_n, DBC_TRANSIENT);
+
+ ret = __dbc_iput(dbc_n, &destkey, &destbuf, DB_KEYLAST);
+
+ if ((ret_n = __dbc_close(dbc_n)) != 0 && ret == 0)
+ ret = ret_n;
+
+ if (ret != 0)
+ goto end;
+
+ if (countp)
+ *countp += chunk_count;
+ chunk_count = 0;
+
+ prevDestKey = NULL;
+ prevDestData = NULL;
+ destbuf.size = 0;
+ }
+ }
+
+ end:
+ CMP_FREE_DBT(env, &destkey);
+ CMP_FREE_DBT(env, &destbuf);
+ CMP_FREE_DBT(env, &pdestkey);
+ CMP_FREE_DBT(env, &pdestdata);
+ CMP_FREE_DBT(env, &nextk);
+
+ return (ret != 0 ? ret : bulk_ret);
+}
+
+/******************************************************************************/
+
+/* Implements DB_PREV and DB_LAST for __bamc_compress_get() */
+static int
+__bamc_compress_get_prev(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ int ret;
+ u_int32_t tofind;
+ BTREE_CURSOR *cp;
+
+ ret = 0;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ F_CLR(cp, C_COMPRESS_DELETED);
+
+ if (cp->prevKey != NULL) {
+ /* Return the stored previous key */
+ cp->currentKey = cp->prevKey;
+ cp->currentData = cp->prevData;
+ cp->compcursor = cp->prevcursor;
+ cp->prevKey = 0;
+ cp->prevData = 0;
+ cp->prevcursor = cp->prev2cursor;
+ cp->prev2cursor = 0;
+ } else {
+ if (cp->currentKey == NULL) {
+ /* No current key, so fetch the last key */
+ flags |= DB_LAST;
+ tofind = (u_int32_t)-1;
+ } else if (cp->prevcursor == 0) {
+ /*
+ * The current key is at the beginning of the
+ * compressed block, so get the last key from the
+ * previous block
+ */
+ flags |= DB_PREV;
+ tofind = (u_int32_t)-1;
+ } else {
+ /*
+ * We have to search for the previous key in the
+ * current block
+ */
+ flags |= DB_CURRENT;
+ tofind = (u_int32_t)
+ (cp->prevcursor - (u_int8_t*)cp->compressed.data);
+ }
+
+ CMP_IGET_RETRY(ret, dbc, &cp->key1, &cp->compressed, flags);
+ if (ret != 0)
+ return (ret);
+
+ /* Decompress until we reach tofind */
+ ret = __bamc_start_decompress(dbc);
+ while (ret == 0 && tofind > (u_int32_t)
+ (cp->compcursor - (u_int8_t*)cp->compressed.data)) {
+ ret = __bamc_next_decompress(dbc);
+ }
+
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+ }
+
+ return (ret);
+}
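+
+/*
+ * In outline (summary only): the previous entry is found by one of three
+ * routes - it was cached in prevKey/prevData by the last forward step; or
+ * the cursor is uninitialized or sits at the start of a chunk, so the last
+ * or previous chunk is fetched and decompressed to its end (tofind ==
+ * (u_int32_t)-1); or the current chunk is re-decompressed from its start
+ * up to the byte offset of the previous entry (tofind == the offset of
+ * prevcursor within the chunk).
+ */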
+
+/* Implements DB_PREV_DUP for __bamc_compress_get() */
+static int
+__bamc_compress_get_prev_dup(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ int ret;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ BTREE *t;
+
+ ret = 0;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ dbp = dbc->dbp;
+ t = (BTREE *)dbp->bt_internal;
+
+ if (cp->currentKey == 0)
+ return (EINVAL);
+
+ /*
+ * If this is a deleted entry, del_key is already set, otherwise we
+ * have to set it now.
+ */
+ if (!F_ISSET(cp, C_COMPRESS_DELETED)) {
+ if ((ret = __bam_compress_set_dbt(dbp, &cp->del_key,
+ cp->currentKey->data, cp->currentKey->size)) != 0)
+ return (ret);
+ }
+
+ if ((ret = __bamc_compress_get_prev(dbc, flags)) != 0)
+ return (ret);
+
+ if (t->bt_compare(dbp, cp->currentKey, &cp->del_key) != 0)
+ return (DB_NOTFOUND);
+
+ return (0);
+}
+
+/* Implements DB_PREV_NODUP for __bamc_compress_get() */
+static int
+__bamc_compress_get_prev_nodup(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ int ret;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ BTREE *t;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+ dbp = dbc->dbp;
+ t = (BTREE *)dbp->bt_internal;
+
+ if (cp->currentKey == 0)
+ return (__bamc_compress_get_prev(dbc, flags));
+
+ /*
+ * If this is a deleted entry, del_key is already set, otherwise we
+ * have to set it now.
+ */
+ if (!F_ISSET(cp, C_COMPRESS_DELETED))
+ if ((ret = __bam_compress_set_dbt(dbp, &cp->del_key,
+ cp->currentKey->data, cp->currentKey->size)) != 0)
+ return (ret);
+
+ /*
+ * Linear search for the next non-duplicate key - this is
+ * especially inefficient for DB_PREV_NODUP, since we have to
+ * decompress from the beginning of the chunk to find previous
+ * key/data pairs. Instead we could check for key equality as we
+ * decompress.
+ */
+ do
+ if ((ret = __bamc_compress_get_prev(dbc, flags)) != 0)
+ return (ret);
+ while (t->bt_compare(dbp, cp->currentKey, &cp->del_key) == 0);
+
+ return (0);
+}
+
+/* Implements DB_NEXT and DB_FIRST for __bamc_compress_get() */
+static int
+__bamc_compress_get_next(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ int ret;
+ BTREE_CURSOR *cp;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ if (F_ISSET(cp, C_COMPRESS_DELETED)) {
+ if (cp->currentKey == 0)
+ return (DB_NOTFOUND);
+ F_CLR(cp, C_COMPRESS_DELETED);
+ return (0);
+ } else if (cp->currentKey) {
+ ret = __bamc_next_decompress(dbc);
+ if (ret != DB_NOTFOUND)
+ return (ret);
+
+ flags |= DB_NEXT;
+ } else
+ flags |= DB_FIRST;
+
+ CMP_IGET_RETRY(ret, dbc, &cp->key1, &cp->compressed, flags);
+ if (ret == DB_NOTFOUND) {
+ /*
+ * Reset the cursor, so that
+ * __bamc_compress_get_multiple_key will end up pointing
+ * to the right place
+ */
+ __bamc_compress_reset(dbc);
+ return (DB_NOTFOUND);
+ } else if (ret != 0)
+ return (ret);
+
+ ret = __bamc_start_decompress(dbc);
+
+ return (ret);
+}
+
+/* Implements DB_NEXT_DUP for __bamc_compress_get() */
+static int
+__bamc_compress_get_next_dup(dbc, key, flags)
+ DBC *dbc;
+ DBT *key;
+ u_int32_t flags;
+{
+ int ret;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ BTREE *t;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+ dbp = dbc->dbp;
+ t = (BTREE *)dbp->bt_internal;
+
+ if (F_ISSET(cp, C_COMPRESS_DELETED)) {
+ /*
+ * Check that the next entry has the same key as the
+ * deleted entry.
+ */
+ if (cp->currentKey == 0)
+ return (DB_NOTFOUND);
+ F_CLR(cp, C_COMPRESS_DELETED);
+ return (t->bt_compare(dbp,
+ cp->currentKey, &cp->del_key) == 0 ? 0 : DB_NOTFOUND);
+ } else if (cp->currentKey == 0)
+ return (EINVAL);
+
+ /* Check that the next entry has the same key as the previous entry */
+ ret = __bamc_next_decompress(dbc);
+ if (ret == 0 && t->bt_compare(dbp, cp->currentKey, cp->prevKey) != 0)
+ return (DB_NOTFOUND);
+ if (ret != DB_NOTFOUND)
+ return (ret);
+
+ if (key == NULL) {
+ /* Copy the current key to del_key */
+ if ((ret = __bam_compress_set_dbt(dbp, &cp->del_key,
+ cp->currentKey->data, cp->currentKey->size)) != 0)
+ return (ret);
+ key = &cp->del_key;
+ }
+
+ /* Fetch the next chunk */
+ CMP_IGET_RETRY(ret, dbc, &cp->key1, &cp->compressed, DB_NEXT | flags);
+ if (ret == DB_NOTFOUND) {
+ /*
+ * Reset the cursor, so that __bamc_compress_get_multiple
+ * will end up pointing to the right place
+ */
+ __bamc_compress_reset(dbc);
+ return (DB_NOTFOUND);
+ } else if (ret != 0)
+ return (ret);
+
+ if ((ret = __bamc_start_decompress(dbc)) != 0)
+ return (ret);
+
+ /* Check the keys are the same */
+ if (t->bt_compare(dbp, cp->currentKey, key) != 0)
+ return (DB_NOTFOUND);
+
+ return (0);
+}
+
+/* Implements DB_NEXT_NODUP for __bamc_compress_get() */
+static int
+__bamc_compress_get_next_nodup(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ int ret;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ BTREE *t;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+ dbp = dbc->dbp;
+ t = (BTREE *)dbp->bt_internal;
+
+ if (cp->currentKey == 0)
+ return (__bamc_compress_get_next(dbc, flags));
+
+ /*
+ * If this is a deleted entry, del_key is already set, otherwise
+ * we have to set it now
+ */
+ if (!F_ISSET(cp, C_COMPRESS_DELETED))
+ if ((ret = __bam_compress_set_dbt(dbp, &cp->del_key,
+ cp->currentKey->data, cp->currentKey->size)) != 0)
+ return (ret);
+
+ /* Linear search for the next non-duplicate key */
+ do
+ if ((ret = __bamc_compress_get_next(dbc, flags)) != 0)
+ return (ret);
+ while (t->bt_compare(dbp, cp->currentKey, &cp->del_key) == 0);
+
+ return (ret);
+}
+
+/*
+ * Implements DB_SET, DB_SET_RANGE, DB_GET_BOTH, and DB_GET_BOTH_RANGE
+ * for __bamc_compress_get()
+ */
+static int
+__bamc_compress_get_set(dbc, key, data, method, flags)
+ DBC *dbc;
+ DBT *key;
+ DBT *data;
+ u_int32_t method;
+ u_int32_t flags;
+{
+ int ret, cmp;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+ dbp = dbc->dbp;
+
+ if (method == DB_SET || method == DB_SET_RANGE)
+ data = NULL;
+
+ F_CLR(cp, C_COMPRESS_DELETED);
+
+ ret = __bamc_compress_seek(dbc, key, data, flags);
+ if (ret == DB_NOTFOUND)
+ CMP_IGET_RETRY(ret, dbc,
+ &cp->key1, &cp->compressed, DB_FIRST | flags);
+ if (ret != 0)
+ return (ret);
+
+ /* Decompress and perform a linear search for the key */
+ cmp = 0;
+ ret = __bamc_start_decompress(dbc);
+ while (ret == 0 && (cmp = __db_compare_both(dbp,
+ cp->currentKey, cp->currentData, key, data)) < 0) {
+ ret = __bamc_next_decompress(dbc);
+ if (ret == DB_NOTFOUND) {
+ CMP_IGET_RETRY(ret, dbc,
+ &cp->key1, &cp->compressed, DB_NEXT | flags);
+ if (ret == 0)
+ ret = __bamc_start_decompress(dbc);
+ }
+ }
+
+ switch (method) {
+ case DB_SET:
+ case DB_GET_BOTH_RANGE:
+ /*
+ * We need to exactly match the key, and if cmp != 0 we
+ * might not have - so check again here.
+ */
+ if (ret == 0 &&
+ __db_compare_both(dbp, cp->currentKey, 0, key, 0) != 0) {
+ /* We didn't find the key */
+ ret = DB_NOTFOUND;
+ }
+ break;
+ case DB_GET_BOTH:
+ if (ret == 0 && (cmp != 0 || (!F_ISSET(dbp, DB_AM_DUPSORT) &&
+ __bam_defcmp(dbp, cp->currentData, data) != 0))) {
+ /* We didn't find the key/data pair */
+ ret = DB_NOTFOUND;
+ }
+ break;
+ default:
+ DB_ASSERT(dbp->env, method == 0 || method == DB_SET_RANGE);
+ }
+
+ return (ret);
+}
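+
+/*
+ * In outline (summary only): the code above seeks to the chunk that could
+ * contain key/data, decompresses forward - moving into following chunks as
+ * needed - until it reaches the first entry sorting greater than or equal
+ * to key/data, and the method-specific checks then decide whether that
+ * entry is an acceptable match for DB_SET, DB_GET_BOTH, etc.
+ */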
+
+/* Implements DB_GET_BOTHC for __bamc_compress_get() */
+static int
+__bamc_compress_get_bothc(dbc, data, flags)
+ DBC *dbc;
+ DBT *data;
+ u_int32_t flags;
+{
+ int ret, cmp;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+ dbp = dbc->dbp;
+
+ /*
+ * Check that the data we are looking for comes after the current
+ * position.
+ */
+ if (__db_compare_both(dbp, cp->currentKey,
+ cp->currentData, cp->currentKey, data) >= 0)
+ return (DB_NOTFOUND);
+
+ cmp = 0;
+ /* Perform a linear search for the data in the current chunk */
+ while ((ret = __bamc_next_decompress(dbc)) == 0 &&
+ (cmp = __db_compare_both(
+ dbp, cp->currentKey, cp->currentData, cp->prevKey, data)) < 0)
+ continue;
+
+ if (ret == 0)
+ return (cmp == 0 ? 0 : DB_NOTFOUND);
+ if (ret != DB_NOTFOUND)
+ return (ret);
+
+ /* Copy the current key to del_key */
+ if ((ret = __bam_compress_set_dbt(dbp, &cp->del_key,
+ cp->currentKey->data, cp->currentKey->size)) != 0)
+ return (ret);
+
+ /* Search for the data using DB_GET_BOTH */
+ return (__bamc_compress_get_set(
+ dbc, &cp->del_key, data, DB_GET_BOTH, flags));
+}
+
+/* Implements DB_MULTIPLE_KEY for __bamc_compress_get() */
+static int
+__bamc_compress_get_multiple_key(dbc, data, flags)
+ DBC *dbc;
+ DBT *data;
+ u_int32_t flags;
+{
+ int ret;
+ u_int8_t *writekey, *writedata;
+ void *mptr;
+ BTREE_CURSOR *cp;
+
+ ret = 0;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ DB_MULTIPLE_WRITE_INIT(mptr, data);
+ DB_MULTIPLE_KEY_RESERVE_NEXT(mptr, data, writekey, cp->currentKey->size,
+ writedata, cp->currentData->size);
+ if (writekey == NULL) {
+ data->size = cp->currentKey->size + cp->currentData->size +
+ 4 * sizeof(u_int32_t);
+ return (DB_BUFFER_SMALL);
+ }
+ DB_ASSERT(dbc->dbp->env, writedata != NULL);
+
+ memcpy(writekey, cp->currentKey->data, cp->currentKey->size);
+ memcpy(writedata, cp->currentData->data, cp->currentData->size);
+
+ while ((ret = __bamc_compress_get_next(dbc, flags)) == 0) {
+ DB_MULTIPLE_KEY_RESERVE_NEXT(mptr, data, writekey,
+ cp->currentKey->size, writedata, cp->currentData->size);
+ if (writekey == NULL)
+ break;
+ DB_ASSERT(dbc->dbp->env, writedata != NULL);
+
+ /*
+ * We could choose to optimize this by just storing one
+ * copy of a key for each set of duplicate data.
+ */
+ memcpy(writekey, cp->currentKey->data, cp->currentKey->size);
+ memcpy(writedata, cp->currentData->data, cp->currentData->size);
+ }
+
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+
+ if (ret == 0)
+ /*
+ * Rewind to the previous key/data, since we can't fit
+ * this one in the buffer
+ */
+ ret = __bamc_compress_get_prev(dbc, flags);
+
+ return (ret);
+}
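+
+/*
+ * A minimal sketch of how an application walks a buffer filled by the
+ * code above, using the public bulk macros from db.h; the "buf" DBT is
+ * hypothetical and must have been returned by a DB_MULTIPLE_KEY get:
+ *
+ *	void *p;
+ *	DBT k, d;
+ *	for (DB_MULTIPLE_INIT(p, &buf);;) {
+ *		DB_MULTIPLE_KEY_NEXT(p, &buf, k.data, k.size,
+ *		    d.data, d.size);
+ *		if (p == NULL)
+ *			break;
+ *		(process k.data/k.size and d.data/d.size)
+ *	}
+ */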
+
+/* Implements DB_MULTIPLE for __bamc_compress_get() */
+static int
+__bamc_compress_get_multiple(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ int ret;
+ u_int8_t *writedata;
+ void *mptr;
+ BTREE_CURSOR *cp;
+
+ ret = 0;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ data->size = 0;
+
+ DB_MULTIPLE_WRITE_INIT(mptr, data);
+ DB_MULTIPLE_RESERVE_NEXT(mptr, data, writedata, cp->currentData->size);
+ data->size += cp->currentData->size + 2 * sizeof(u_int32_t);
+ if (writedata == NULL)
+ return (DB_BUFFER_SMALL);
+
+ memcpy(writedata, cp->currentData->data, cp->currentData->size);
+
+ while ((ret = __bamc_compress_get_next_dup(dbc, key, flags)) == 0) {
+ DB_MULTIPLE_RESERVE_NEXT(
+ mptr, data, writedata, cp->currentData->size);
+ data->size += cp->currentData->size + 2 * sizeof(u_int32_t);
+ if (writedata == NULL) {
+ /*
+ * DBC_FROM_DB_GET indicates we need to fit all the
+ * duplicates into the buffer or return DB_BUFFER_SMALL.
+ * [#17039]
+ */
+ if (F_ISSET(dbc, DBC_FROM_DB_GET))
+ return (DB_BUFFER_SMALL);
+ break;
+ }
+
+ memcpy(writedata, cp->currentData->data, cp->currentData->size);
+ }
+
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+
+ if (ret == 0)
+ /*
+ * Rewind to the previous key/data, as that's now our current
+ * entry.
+ */
+ ret = __bamc_compress_get_prev(dbc, flags);
+
+ return (ret);
+}
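+
+/*
+ * Unlike the DB_MULTIPLE_KEY buffer above, a DB_MULTIPLE buffer carries
+ * data items only (the duplicates of the single key returned alongside
+ * it), so an application walks it with DB_MULTIPLE_NEXT; "buf" is again
+ * hypothetical:
+ *
+ *	void *p;
+ *	DBT d;
+ *	for (DB_MULTIPLE_INIT(p, &buf);;) {
+ *		DB_MULTIPLE_NEXT(p, &buf, d.data, d.size);
+ *		if (p == NULL)
+ *			break;
+ *		(process one duplicate data item)
+ *	}
+ */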
+
+/*
+ * __bamc_compress_iget --
+ * Get using a compressed cursor. (internal)
+ */
+static int
+__bamc_compress_iget(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ int ret;
+ u_int32_t multiple, method;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+ dbp = dbc->dbp;
+ ret = 0;
+
+ multiple = flags & (DB_MULTIPLE|DB_MULTIPLE_KEY);
+ method = flags & DB_OPFLAGS_MASK;
+ flags = flags & ~(DB_OPFLAGS_MASK|DB_MULTIPLE|DB_MULTIPLE_KEY);
+
+ switch (method) {
+ case DB_CURRENT:
+ if (F_ISSET(cp, C_COMPRESS_DELETED))
+ ret = DB_KEYEMPTY;
+ else if (cp->currentKey == NULL)
+ ret = EINVAL;
+ break;
+ case DB_FIRST:
+ __bamc_compress_reset(dbc);
+ ret = __bamc_compress_get_next(dbc, flags);
+ break;
+ case DB_NEXT:
+ ret = __bamc_compress_get_next(dbc, flags);
+ break;
+ case DB_NEXT_DUP:
+ ret = __bamc_compress_get_next_dup(dbc, 0, flags);
+ break;
+ case DB_NEXT_NODUP:
+ ret = __bamc_compress_get_next_nodup(dbc, flags);
+ break;
+ case DB_LAST:
+ __bamc_compress_reset(dbc);
+ ret = __bamc_compress_get_prev(dbc, flags);
+ break;
+ case DB_PREV:
+ ret = __bamc_compress_get_prev(dbc, flags);
+ break;
+ case DB_PREV_DUP:
+ ret = __bamc_compress_get_prev_dup(dbc, flags);
+ break;
+ case DB_PREV_NODUP:
+ ret = __bamc_compress_get_prev_nodup(dbc, flags);
+ break;
+ case DB_SET:
+ if (((BTREE *)
+ dbc->dbp->bt_internal)->bt_compare == __bam_defcmp)
+ F_SET(key, DB_DBT_ISSET);
+ /* FALL THROUGH */
+ case DB_SET_RANGE:
+ ret = __bamc_compress_get_set(dbc, key, 0, method, flags);
+ break;
+ case DB_GET_BOTH:
+ if (!F_ISSET(dbc->dbp, DB_AM_DUPSORT) || ((BTREE *)dbc->dbp->
+ bt_internal)->compress_dup_compare == __bam_defcmp)
+ F_SET(data, DB_DBT_ISSET);
+ /* FALL THROUGH */
+ case DB_GET_BOTH_RANGE:
+ if (((BTREE *)
+ dbc->dbp->bt_internal)->bt_compare == __bam_defcmp)
+ F_SET(key, DB_DBT_ISSET);
+ ret = __bamc_compress_get_set(dbc, key, data, method, flags);
+ break;
+ case DB_GET_BOTHC:
+ ret = __bamc_compress_get_bothc(dbc, data, flags);
+ break;
+ default:
+ ret = __db_unknown_flag(dbp->env, "__bamc_compress_iget",
+ method);
+ break;
+ }
+
+ if (ret != 0)
+ goto err;
+
+ switch (multiple) {
+ case 0:
+ if (!F_ISSET(key, DB_DBT_ISSET))
+ ret = __db_retcopy(dbc->env, key,
+ cp->currentKey->data, cp->currentKey->size,
+ &dbc->rkey->data, &dbc->rkey->ulen);
+ if (!F_ISSET(data, DB_DBT_ISSET) && ret == 0)
+ ret = __db_retcopy(dbc->env, data,
+ cp->currentData->data, cp->currentData->size,
+ &dbc->rdata->data, &dbc->rdata->ulen);
+ break;
+ case DB_MULTIPLE:
+ if (!F_ISSET(key, DB_DBT_ISSET))
+ ret = __db_retcopy(dbc->env, key,
+ cp->currentKey->data, cp->currentKey->size,
+ &dbc->rkey->data, &dbc->rkey->ulen);
+ if (ret == 0)
+ ret =
+ __bamc_compress_get_multiple(dbc, key, data, flags);
+ break;
+ case DB_MULTIPLE_KEY:
+ ret = __bamc_compress_get_multiple_key(dbc, data, flags);
+ break;
+ default:
+ ret = __db_unknown_flag(dbp->env, "__bamc_compress_iget",
+ multiple);
+ break;
+ }
+
+ err:
+ F_CLR(key, DB_DBT_ISSET);
+ F_CLR(data, DB_DBT_ISSET);
+
+ return (ret);
+}
+
+/*
+ * __bamc_compress_get --
+ * Get using a compressed cursor.
+ *
+ * PUBLIC: int __bamc_compress_get __P((DBC *, DBT *, DBT *, u_int32_t));
+ */
+int
+__bamc_compress_get(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DBC *dbc_n;
+ int ret, t_ret;
+ u_int32_t tmp_flags;
+
+ switch (flags & DB_OPFLAGS_MASK) {
+ case DB_CURRENT:
+ case DB_GET_BOTHC:
+ case DB_NEXT:
+ case DB_NEXT_DUP:
+ case DB_NEXT_NODUP:
+ case DB_PREV:
+ case DB_PREV_DUP:
+ case DB_PREV_NODUP:
+ if (F_ISSET((BTREE_CURSOR *)dbc->internal,
+ C_COMPRESS_MODIFIED) &&
+ (ret = __bamc_compress_relocate(dbc)) != 0)
+ return (ret);
+ tmp_flags = DB_POSITION;
+ break;
+ default:
+ F_CLR((BTREE_CURSOR *)dbc->internal, C_COMPRESS_MODIFIED);
+ tmp_flags = 0;
+ break;
+ }
+
+ if (F_ISSET(dbc, DBC_TRANSIENT))
+ dbc_n = dbc;
+ else {
+ if ((ret = __dbc_dup(dbc, &dbc_n, tmp_flags)) != 0)
+ goto err;
+
+ /*
+ * We don't care about preserving the cursor's position on
+ * error.
+ */
+ F_SET(dbc_n, DBC_TRANSIENT);
+
+ COPY_RET_MEM(dbc, dbc_n);
+ }
+
+ if ((ret = __bamc_compress_iget(dbc_n, key, data, flags)) != 0)
+ goto err;
+
+err:
+ /* Cleanup and cursor resolution. */
+ if ((t_ret = __dbc_cleanup(dbc, dbc_n, ret)) != 0 &&
+ (ret == 0 || ret == DB_BUFFER_SMALL))
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __bamc_compress_iput --
+ * Put using a compressed cursor (internal)
+ */
+static int
+__bamc_compress_iput(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ int ret;
+ u_int32_t multi;
+ DBT kcpy, pdata, empty;
+ BTREE_COMPRESS_STREAM stream;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ ENV *env;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+ dbp = dbc->dbp;
+ env = dbc->env;
+
+ memset(&pdata, 0, sizeof(DBT));
+ memset(&empty, 0, sizeof(DBT));
+
+ multi = LF_ISSET(DB_MULTIPLE|DB_MULTIPLE_KEY);
+ LF_CLR(DB_MULTIPLE|DB_MULTIPLE_KEY);
+ if (flags == 0)
+ flags = DB_KEYLAST;
+
+ switch (flags) {
+ case DB_CURRENT:
+ if (cp->currentKey == 0 || F_ISSET(cp, C_COMPRESS_DELETED)) {
+ ret = DB_NOTFOUND;
+ goto end;
+ }
+
+ if (F_ISSET(data, DB_DBT_PARTIAL)) {
+ if ((ret = __db_buildpartial(
+ dbp, cp->currentData, data, &pdata)) != 0)
+ goto end;
+ data = &pdata;
+ }
+
+ if (F_ISSET(dbp, DB_AM_DUPSORT) &&
+ ((BTREE *)dbp->bt_internal)->compress_dup_compare(
+ dbp, cp->currentData, data) != 0) {
+ __db_errx(env, DB_STR("1032",
+ "Existing data sorts differently from put data"));
+ ret = EINVAL;
+ goto end;
+ }
+ CMP_INIT_DBT(&kcpy);
+ if ((ret = __bam_compress_set_dbt(dbp,
+ &kcpy, cp->currentKey->data, cp->currentKey->size)) != 0)
+ goto end;
+
+ __bam_cs_create_single(&stream, &kcpy, data);
+ ret = __bamc_compress_merge_insert(dbc, &stream, NULL, flags);
+
+ if (ret == 0)
+ /* Position the cursor on the entry written */
+ ret = __bamc_compress_get_set(
+ dbc, &kcpy, data, DB_GET_BOTH_RANGE, 0);
+
+ CMP_FREE_DBT(env, &kcpy);
+ break;
+ case DB_KEYFIRST:
+ case DB_KEYLAST:
+ case DB_NODUPDATA:
+ case DB_OVERWRITE_DUP:
+ switch (multi) {
+ case 0:
+ if (F_ISSET(data, DB_DBT_PARTIAL)) {
+ if ((ret = __bamc_compress_get_set(dbc, key,
+ data, DB_SET, 0)) != 0 &&
+ ret != DB_NOTFOUND)
+ goto end;
+ if ((ret = __db_buildpartial(dbp,
+ ret == DB_NOTFOUND ? &empty :
+ cp->currentData, data, &pdata)) != 0)
+ goto end;
+ data = &pdata;
+ }
+
+ __bam_cs_create_single(&stream, key, data);
+ ret = __bamc_compress_merge_insert(
+ dbc, &stream, NULL, flags);
+
+ if (ret == 0)
+ /* Position the cursor on the entry written */
+ ret = __bamc_compress_get_set(
+ dbc, key, data, DB_GET_BOTH_RANGE, 0);
+ break;
+ case DB_MULTIPLE:
+ if ((ret = __bam_compress_check_sort_multiple(dbp,
+ key, data)) != 0)
+ goto end;
+ __bam_cs_create_multiple(&stream, key, data);
+ ret = __bamc_compress_merge_insert(
+ dbc, &stream, &key->doff, flags);
+ break;
+ case DB_MULTIPLE_KEY:
+ if ((ret = __bam_compress_check_sort_multiple_key(dbp,
+ key)) != 0)
+ goto end;
+ __bam_cs_create_multiple_key(&stream, key);
+ ret = __bamc_compress_merge_insert(
+ dbc, &stream, &key->doff, flags);
+ break;
+ default:
+ return (__db_unknown_flag(
+ dbp->env, "__bamc_compress_iput", multi));
+ }
+ break;
+ case DB_NOOVERWRITE:
+ /* Check key doesn't already exist */
+ ret = __bamc_compress_get_set(dbc, key, 0, DB_SET, 0);
+ if (ret != DB_NOTFOUND) {
+ if (ret == 0)
+ ret = DB_KEYEXIST;
+ goto end;
+ }
+
+ if (F_ISSET(data, DB_DBT_PARTIAL)) {
+ if ((ret = __db_buildpartial(
+ dbp, &empty, data, &pdata)) != 0)
+ goto end;
+ data = &pdata;
+ }
+
+ __bam_cs_create_single(&stream, key, data);
+ ret = __bamc_compress_merge_insert(dbc, &stream, NULL, flags);
+
+ if (ret == 0)
+ /* Position the cursor on the entry written */
+ ret = __bamc_compress_get_set(
+ dbc, key, data, DB_GET_BOTH_RANGE, 0);
+ break;
+ default:
+ return (__db_unknown_flag(
+ dbp->env, "__bamc_compress_iput", flags));
+ }
+
+ end:
+ if (pdata.data != NULL)
+ __os_free(env, pdata.data);
+ return (ret);
+}
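+
+/*
+ * A minimal sketch of building the sorted bulk buffer consumed by the
+ * DB_MULTIPLE_KEY case above, using the public write macros from db.h;
+ * the buffer size and the key/data strings are hypothetical, and the
+ * pairs must already be in sorted order:
+ *
+ *	DBT buf;
+ *	void *p;
+ *	memset(&buf, 0, sizeof(DBT));
+ *	buf.ulen = 64 * 1024;
+ *	buf.flags = DB_DBT_USERMEM | DB_DBT_BULK;
+ *	if ((buf.data = malloc(buf.ulen)) == NULL)
+ *		return (ENOMEM);
+ *	DB_MULTIPLE_WRITE_INIT(p, &buf);
+ *	DB_MULTIPLE_KEY_WRITE_NEXT(p, &buf, "apple", 5, "red", 3);
+ *	DB_MULTIPLE_KEY_WRITE_NEXT(p, &buf, "pear", 4, "green", 5);
+ *	(p == NULL here would mean the buffer was too small)
+ *	ret = dbp->put(dbp, NULL, &buf, NULL, DB_MULTIPLE_KEY);
+ */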
+
+/*
+ * __bamc_compress_put --
+ * Put using a compressed cursor.
+ *
+ * PUBLIC: int __bamc_compress_put __P((DBC *, DBT *, DBT *, u_int32_t));
+ */
+int
+__bamc_compress_put(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DBC *dbc_n;
+ int ret, t_ret;
+
+ if (F_ISSET((BTREE_CURSOR *)dbc->internal, C_COMPRESS_MODIFIED)) {
+ if ((flags & DB_OPFLAGS_MASK) == DB_CURRENT &&
+ (ret = __bamc_compress_relocate(dbc)) != 0)
+ return (ret);
+ F_CLR((BTREE_CURSOR *)dbc->internal, C_COMPRESS_MODIFIED);
+ }
+
+ if (F_ISSET(dbc, DBC_TRANSIENT))
+ dbc_n = dbc;
+ else {
+ if ((ret = __dbc_dup(dbc, &dbc_n,
+ (flags & DB_OPFLAGS_MASK) == DB_CURRENT ?
+ DB_POSITION : 0)) != 0)
+ goto err;
+
+ /*
+ * We don't care about preserving the cursor's position on
+ * error.
+ */
+ F_SET(dbc_n, DBC_TRANSIENT);
+ }
+
+ if ((ret = __bamc_compress_iput(dbc_n, key, data, flags)) != 0)
+ goto err;
+
+err:
+ /* Cleanup and cursor resolution. */
+ if ((t_ret = __dbc_cleanup(dbc, dbc_n, ret)) != 0 &&
+ (ret == 0 || ret == DB_BUFFER_SMALL))
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __bamc_compress_idel --
+ * Del using a compressed cursor. (internal)
+ */
+static int
+__bamc_compress_idel(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ int ret;
+ BTREE_COMPRESS_STREAM stream;
+ DB *dbp;
+ BTREE_CURSOR *cp;
+
+ COMPQUIET(flags, 0);
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ if (F_ISSET(cp, C_COMPRESS_DELETED))
+ return (DB_KEYEMPTY);
+ if (cp->currentKey == 0)
+ return (DB_NOTFOUND);
+
+ if ((ret = __bam_compress_set_dbt(dbp, &cp->del_key,
+ cp->currentKey->data, cp->currentKey->size)) != 0)
+ goto err;
+ if ((ret = __bam_compress_set_dbt(dbp, &cp->del_data,
+ cp->currentData->data, cp->currentData->size)) != 0)
+ goto err;
+
+ __bam_cs_create_single(&stream, &cp->del_key, &cp->del_data);
+ if ((ret = __bamc_compress_merge_delete(dbc, &stream, NULL)) != 0)
+ goto err;
+
+ /* Position the cursor on the entry after the key/data deleted */
+ ret = __bamc_compress_get_set(dbc, &cp->del_key, &cp->del_data, 0, 0);
+ if (ret == DB_NOTFOUND) {
+ __bamc_compress_reset(dbc);
+ ret = 0;
+ } else if (ret != 0)
+ goto err;
+
+ /* Mark current as being deleted */
+ F_SET(cp, C_COMPRESS_DELETED);
+
+ err:
+ return (ret);
+}
+
+/*
+ * __bamc_compress_del --
+ * Del using a compressed cursor.
+ *
+ * PUBLIC: int __bamc_compress_del __P((DBC *, u_int32_t));
+ */
+int
+__bamc_compress_del(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ int ret, t_ret;
+ DBC *dbc_n;
+
+ if (F_ISSET((BTREE_CURSOR *)dbc->internal, C_COMPRESS_MODIFIED) &&
+ (ret = __bamc_compress_relocate(dbc)) != 0)
+ return (ret);
+
+ if (F_ISSET(dbc, DBC_TRANSIENT))
+ dbc_n = dbc;
+ else {
+ if ((ret = __dbc_dup(dbc, &dbc_n, DB_POSITION)) != 0)
+ goto err;
+
+ /*
+ * We don't care about preserving the cursor's position on
+ * error.
+ */
+ F_SET(dbc_n, DBC_TRANSIENT);
+
+ COPY_RET_MEM(dbc, dbc_n);
+ }
+
+ if ((ret = __bamc_compress_idel(dbc_n, flags)) != 0)
+ goto err;
+
+err:
+ /* Cleanup and cursor resolution. */
+ if ((t_ret = __dbc_cleanup(dbc, dbc_n, ret)) != 0 &&
+ (ret == 0 || ret == DB_BUFFER_SMALL))
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __bamc_compress_ibulk_del --
+ * Bulk del using a compressed cursor. (internal)
+ */
+static int
+__bamc_compress_ibulk_del(dbc, key, flags)
+ DBC *dbc;
+ DBT *key;
+ u_int32_t flags;
+{
+ int ret;
+ BTREE_COMPRESS_STREAM stream;
+
+ switch (flags) {
+ case 0:
+ __bam_cs_create_single_keyonly(&stream, key);
+ return (__bamc_compress_merge_delete_dups(dbc, &stream, NULL));
+ case DB_MULTIPLE:
+ if ((ret = __bam_compress_check_sort_multiple_keyonly(
+ dbc->dbp, key)) != 0)
+ return (ret);
+ __bam_cs_create_multiple_keyonly(&stream, key);
+ return (__bamc_compress_merge_delete_dups(
+ dbc, &stream, &key->doff));
+ case DB_MULTIPLE_KEY:
+ if ((ret = __bam_compress_check_sort_multiple_key(
+ dbc->dbp, key)) != 0)
+ return (ret);
+ __bam_cs_create_multiple_key(&stream, key);
+ return (__bamc_compress_merge_delete(dbc, &stream, &key->doff));
+ default:
+ break;
+ }
+
+ return (__db_unknown_flag(
+ dbc->env, "__bamc_compress_ibulk_del", flags));
+}
+
+/*
+ * __bamc_compress_bulk_del --
+ * Bulk del using a compressed cursor.
+ *
+ * PUBLIC: int __bamc_compress_bulk_del __P((DBC *, DBT *, u_int32_t));
+ */
+int
+__bamc_compress_bulk_del(dbc, key, flags)
+ DBC *dbc;
+ DBT *key;
+ u_int32_t flags;
+{
+ int ret, t_ret;
+ DBC *dbc_n;
+
+ F_CLR((BTREE_CURSOR *)dbc->internal, C_COMPRESS_MODIFIED);
+
+ if (F_ISSET(dbc, DBC_TRANSIENT))
+ dbc_n = dbc;
+ else {
+ if ((ret = __dbc_dup(dbc, &dbc_n, 0)) != 0)
+ goto err;
+
+ /*
+ * We don't care about preserving the cursor's position on
+ * error.
+ */
+ F_SET(dbc_n, DBC_TRANSIENT);
+ }
+
+ if ((ret = __bamc_compress_ibulk_del(dbc_n, key, flags)) != 0)
+ goto err;
+
+err:
+ /* Cleanup and cursor resolution. */
+ if ((t_ret = __dbc_cleanup(dbc, dbc_n, ret)) != 0 &&
+ (ret == 0 || ret == DB_BUFFER_SMALL))
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __bamc_compress_count --
+ * Count using a compressed cursor.
+ *
+ * PUBLIC: int __bamc_compress_count __P((DBC *, db_recno_t *));
+ */
+int
+__bamc_compress_count(dbc, countp)
+ DBC *dbc;
+ db_recno_t *countp;
+{
+ int ret, t_ret;
+ db_recno_t count;
+ DBT *key;
+ DBC *dbc_n;
+ BTREE_CURSOR *cp;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ /*
+ * If the current entry is deleted use del_key, otherwise use
+ * currentKey.
+ */
+ if (F_ISSET(cp, C_COMPRESS_DELETED))
+ key = &cp->del_key;
+ else
+ key = cp->currentKey;
+
+ /* Duplicate the cursor */
+ if ((ret = __dbc_dup(dbc, &dbc_n, 0)) != 0)
+ return (ret);
+
+ /* We don't care about preserving the cursor's position on error */
+ F_SET(dbc_n, DBC_TRANSIENT);
+
+ /* Find the first duplicate */
+ if ((ret = __bamc_compress_get_set(dbc_n, key, 0, DB_SET, 0)) != 0)
+ goto err;
+ count = 1;
+
+ /* Count subsequent duplicates */
+ while ((ret = __bamc_compress_get_next_dup(dbc_n, key, 0)) == 0)
+ ++count;
+
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+ else if (ret != 0)
+ goto err;
+
+ *countp = count;
+
+ err:
+ if ((t_ret = __dbc_close(dbc_n)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
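+
+/*
+ * A sketch of the public call that reaches the function above; the cursor
+ * must already be positioned on an entry (the positioning get is shown
+ * with hypothetical key/data DBTs):
+ *
+ *	db_recno_t n;
+ *	if ((ret = dbc->get(dbc, &key, &data, DB_SET)) == 0)
+ *		ret = dbc->count(dbc, &n, 0);
+ */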
+
+/*
+ * __bamc_compress_cmp --
+ * Compare which compressed value is pointed to.
+ *
+ * PUBLIC: int __bamc_compress_cmp __P((DBC *, DBC *, int *));
+ */
+int
+__bamc_compress_cmp(dbc, other_dbc, result)
+ DBC *dbc, *other_dbc;
+ int *result;
+{
+ DB *dbp;
+ BTREE_CURSOR *cp, *ocp;
+
+ /*
+ * At this point, we already know that the cursors point to the same
+ * DB.
+ */
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ ocp = (BTREE_CURSOR *)other_dbc->internal;
+
+ if (F_ISSET(cp, C_COMPRESS_DELETED))
+ if (F_ISSET(ocp, C_COMPRESS_DELETED))
+ *result = __db_compare_both(
+ dbp, &cp->del_key, &cp->del_data,
+ &ocp->del_key, &ocp->del_data) == 0 ? 0 : 1;
+ else {
+ if (ocp->currentKey == 0)
+ goto err;
+
+ *result = __db_compare_both(
+ dbp, &cp->del_key, &cp->del_data,
+ ocp->currentKey, ocp->currentData) == 0 ? 0 : 1;
+ }
+ else {
+ if (cp->currentKey == 0)
+ goto err;
+
+ if (F_ISSET(ocp, C_COMPRESS_DELETED))
+ *result = __db_compare_both(
+ dbp, cp->currentKey, cp->currentData,
+ &ocp->del_key, &ocp->del_data) == 0 ? 0 : 1;
+ else {
+ if (ocp->currentKey == 0)
+ goto err;
+
+ *result = __db_compare_both(
+ dbp, cp->currentKey, cp->currentData,
+ ocp->currentKey, ocp->currentData) == 0 ? 0 : 1;
+ }
+ }
+ return (0);
+
+ err:
+ __db_errx(dbc->env, DB_STR("1033",
+ "Both cursors must be initialized before calling DBC->cmp."));
+ return (EINVAL);
+}
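+
+/*
+ * A sketch of the public call that reaches the function above; both
+ * cursors must be initialized, and *result is 0 only when they point to
+ * the same key/data pair:
+ *
+ *	int same;
+ *	if ((ret = dbc->cmp(dbc, other_dbc, &same, 0)) == 0 && same == 0)
+ *		(the two cursors are positioned on the same item)
+ */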
+
+/*
+ * __bamc_compress_dup --
+ * Duplicate the compression specific part of a btree cursor.
+ *
+ * PUBLIC: int __bamc_compress_dup __P((DBC *, DBC *, u_int32_t));
+ */
+int
+__bamc_compress_dup(orig_dbc, new_dbc, flags)
+ DBC *orig_dbc, *new_dbc;
+ u_int32_t flags;
+{
+ int ret;
+ DB *dbp;
+ BTREE_CURSOR *orig, *new;
+
+ dbp = new_dbc->dbp;
+
+ orig = (BTREE_CURSOR *)orig_dbc->internal;
+ new = (BTREE_CURSOR *)new_dbc->internal;
+
+ if (orig->currentKey != NULL && !LF_ISSET(DB_SHALLOW_DUP)) {
+ new->currentKey = &new->key1;
+ new->currentData = &new->data1;
+
+ if ((ret = __bam_compress_set_dbt(dbp, new->currentKey,
+ orig->currentKey->data, orig->currentKey->size)) != 0)
+ return (ret);
+ if ((ret = __bam_compress_set_dbt(dbp, new->currentData,
+ orig->currentData->data, orig->currentData->size)) != 0)
+ return (ret);
+
+ if (orig->prevKey) {
+ new->prevKey = &new->key2;
+ new->prevData = &new->data2;
+
+ if ((ret = __bam_compress_set_dbt(dbp, new->prevKey,
+ orig->prevKey->data, orig->prevKey->size)) != 0)
+ return (ret);
+ if ((ret = __bam_compress_set_dbt(dbp, new->prevData,
+ orig->prevData->data, orig->prevData->size)) != 0)
+ return (ret);
+ }
+
+ if ((ret = __bam_compress_set_dbt(dbp, &new->compressed,
+ orig->compressed.data, orig->compressed.size)) != 0)
+ return (ret);
+
+ new->compcursor = (u_int8_t*)new->compressed.data +
+ (orig->compcursor - (u_int8_t*)orig->compressed.data);
+ new->compend = (u_int8_t*)new->compressed.data +
+ (orig->compend - (u_int8_t*)orig->compressed.data);
+ new->prevcursor = orig->prevcursor == NULL ? NULL :
+ (u_int8_t*)new->compressed.data + (orig->prevcursor -
+ (u_int8_t*)orig->compressed.data);
+ new->prev2cursor = orig->prev2cursor == NULL ? NULL :
+ (u_int8_t*)new->compressed.data + (orig->prev2cursor -
+ (u_int8_t*)orig->compressed.data);
+
+ if (F_ISSET(orig, C_COMPRESS_DELETED)) {
+ if ((ret = __bam_compress_set_dbt(dbp, &new->del_key,
+ orig->del_key.data, orig->del_key.size)) != 0)
+ return (ret);
+ if ((ret = __bam_compress_set_dbt(dbp, &new->del_data,
+ orig->del_data.data, orig->del_data.size)) != 0)
+ return (ret);
+ }
+ }
+
+ return (0);
+}
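+
+/*
+ * A note on the pointer arithmetic above (summary): compcursor, compend,
+ * prevcursor and prev2cursor all point into compressed.data, so after the
+ * buffer is copied they are rebased by byte offset, equivalent to:
+ *
+ *	offset = orig->compcursor - (u_int8_t *)orig->compressed.data;
+ *	new->compcursor = (u_int8_t *)new->compressed.data + offset;
+ */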
+
+/*
+ * __bam_compress_salvage --
+ * Salvage the compressed data from the key/data pair
+ *
+ * PUBLIC: int __bam_compress_salvage __P((DB *, VRFY_DBINFO *,
+ * PUBLIC: void *, int (*)(void *, const void *), DBT *, DBT *));
+ */
+int
+__bam_compress_salvage(dbp, vdp, handle, callback, key, data)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ DBT *key, *data;
+{
+ DBT key1, key2, data1, data2, compressed;
+ DBT *currentKey, *currentData, *prevKey, *prevData;
+ ENV *env;
+ int ret, t_ret;
+ u_int8_t *compcursor, *compend;
+ u_int32_t datasize, size;
+
+ env = dbp->env;
+
+ memset(&key1, 0, sizeof(DBT));
+ memset(&key2, 0, sizeof(DBT));
+ memset(&data1, 0, sizeof(DBT));
+ memset(&data2, 0, sizeof(DBT));
+ memset(&compressed, 0, sizeof(DBT));
+
+ key1.flags = DB_DBT_USERMEM;
+ key2.flags = DB_DBT_USERMEM;
+ data1.flags = DB_DBT_USERMEM;
+ data2.flags = DB_DBT_USERMEM;
+ compressed.flags = DB_DBT_USERMEM;
+
+ prevKey = NULL;
+ prevData = NULL;
+ currentKey = key;
+ currentData = &data2;
+ compcursor = (u_int8_t*)data->data;
+ compend = compcursor + data->size;
+
+ if (data->size == 0) {
+ ret = DB_VERIFY_FATAL;
+ goto unknown_data;
+ }
+
+ /* Unmarshal the first data */
+ size = __db_decompress_count_int(compcursor);
+ if (size == 0xFF || compcursor + size > compend) {
+ ret = DB_VERIFY_FATAL;
+ goto unknown_data;
+ }
+ compcursor += __db_decompress_int32(compcursor, &datasize);
+
+ if (compcursor + datasize > compend) {
+ ret = DB_VERIFY_FATAL;
+ goto unknown_data;
+ }
+ if ((ret = __bam_compress_set_dbt(
+ dbp, currentData, compcursor, datasize)) != 0)
+ goto err;
+ compcursor += datasize;
+
+ /* Output first data (first key has already been output by our caller) */
+ if ((ret = __db_vrfy_prdbt(
+ currentData, 0, " ", handle, callback, 0, 0, vdp)) != 0)
+ goto err;
+
+ while (compcursor < compend) {
+ prevKey = currentKey;
+ prevData = currentData;
+
+ if (currentKey == &key1) {
+ currentKey = &key2;
+ currentData = &data2;
+ } else {
+ currentKey = &key1;
+ currentData = &data1;
+ }
+
+ compressed.data = (void*)compcursor;
+ compressed.ulen = compressed.size =
+ (u_int32_t)(compend - compcursor);
+
+ /* Decompress the next key/data pair */
+ while ((ret = ((BTREE *)dbp->bt_internal)->bt_decompress(
+ dbp, prevKey, prevData,
+ &compressed, currentKey, currentData)) == DB_BUFFER_SMALL) {
+ if (CMP_RESIZE_DBT(ret, env, currentKey) != 0)
+ break;
+ if (CMP_RESIZE_DBT(ret, env, currentData) != 0)
+ break;
+ }
+
+ if (ret == EINVAL) {
+ ret = DB_VERIFY_FATAL;
+ goto err;
+ }
+ if (ret != 0)
+ goto err;
+
+ compcursor += compressed.size;
+
+ if (compcursor > compend) {
+ ret = DB_VERIFY_FATAL;
+ goto err;
+ }
+
+ /* Output the next key/data pair */
+ if ((ret = __db_vrfy_prdbt(
+ currentKey, 0, " ", handle, callback, 0, 0, vdp)) != 0)
+ goto err;
+ if ((ret = __db_vrfy_prdbt(
+ currentData, 0, " ", handle, callback, 0, 0, vdp)) != 0)
+ goto err;
+ }
+
+ if (0) {
+ unknown_data:
+ /*
+ * Make sure we output a data value for the key that's
+ * already been output
+ */
+ DB_INIT_DBT(
+ compressed, "UNKNOWN_DATA", sizeof("UNKNOWN_DATA") - 1);
+ if ((t_ret = __db_vrfy_prdbt(
+ &compressed, 0, " ", handle, callback, 0, 0, vdp)) != 0)
+ ret = t_ret;
+ }
+
+ err:
+ __os_free(env, key1.data);
+ __os_free(env, key2.data);
+ __os_free(env, data1.data);
+ __os_free(env, data2.data);
+ return (ret);
+}
+
+/*
+ * __bam_compress_count --
+ * Calculate key and entry counts for the compressed BTree
+ *
+ * PUBLIC: int __bam_compress_count __P((DBC *, u_int32_t *, u_int32_t *));
+ */
+int
+__bam_compress_count(dbc, nkeysp, ndatap)
+ DBC *dbc;
+ u_int32_t *nkeysp, *ndatap;
+{
+ int ret, t_ret;
+ u_int32_t nkeys, ndata;
+ DB *dbp;
+ BTREE *t;
+ DBC *dbc_n;
+ BTREE_CURSOR *cp_n;
+
+ dbp = dbc->dbp;
+ t = (BTREE *)dbp->bt_internal;
+
+ /* Duplicate the cursor */
+ if ((ret = __dbc_dup(dbc, &dbc_n, 0)) != 0)
+ return (ret);
+
+ /* We don't care about preserving the cursor's position on error */
+ F_SET(dbc_n, DBC_TRANSIENT);
+
+ cp_n = (BTREE_CURSOR *)dbc_n->internal;
+
+ nkeys = 0;
+ ndata = 0;
+
+ CMP_IGET_RETRY(ret, dbc_n, &cp_n->key1, &cp_n->compressed, DB_FIRST);
+ if (ret != 0)
+ goto err;
+
+ if ((ret = __bamc_start_decompress(dbc_n)) != 0)
+ goto err;
+ nkeys += 1;
+
+ for (;;) {
+ ndata += 1;
+
+ ret = __bamc_next_decompress(dbc_n);
+ if (ret == DB_NOTFOUND) {
+ if (cp_n->currentKey == &cp_n->key1) {
+ /*
+ * Make sure that the previous key isn't
+ * overwritten when we fetch the next chunk.
+ */
+ if ((ret = __bam_compress_set_dbt(dbp,
+ &cp_n->key2, cp_n->key1.data,
+ cp_n->key1.size)) != 0)
+ goto err;
+ }
+
+ CMP_IGET_RETRY(ret, dbc_n, &cp_n->key1,
+ &cp_n->compressed, DB_NEXT);
+ if (ret != 0)
+ goto err;
+
+ ret = __bamc_start_decompress(dbc_n);
+
+ cp_n->prevKey = &cp_n->key2;
+ }
+
+ if (ret != 0)
+ goto err;
+
+ if (t->bt_compare(dbp, cp_n->currentKey, cp_n->prevKey) != 0)
+ nkeys += 1;
+ }
+
+err:
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+
+ if ((t_ret = __dbc_close(dbc_n)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (ret == 0) {
+ if (nkeysp != NULL)
+ *nkeysp = nkeys;
+ if (ndatap != NULL)
+ *ndatap = ndata;
+ }
+
+ return (ret);
+}
+
+/*
+ * Check if the key/data pairs in the bulk buffer are sorted.
+ */
+static int
+__bam_compress_check_sort_multiple_key(dbp, key)
+ DB *dbp;
+ DBT *key;
+{
+#ifdef DIAGNOSTIC
+ void *kptr;
+ DBT key1, data1, key2, data2;
+
+ memset(&key1, 0, sizeof(DBT));
+ memset(&data1, 0, sizeof(DBT));
+ memset(&key2, 0, sizeof(DBT));
+ memset(&data2, 0, sizeof(DBT));
+
+ DB_MULTIPLE_INIT(kptr, key);
+ DB_MULTIPLE_KEY_NEXT(kptr, key,
+ key2.data, key2.size, data2.data, data2.size);
+ /* No key/data pair in the bulk buffer */
+ if (kptr == NULL)
+ return (0);
+
+ for (;;) {
+ DB_MULTIPLE_KEY_NEXT(kptr, key,
+ key1.data, key1.size, data1.data, data1.size);
+ if (kptr == NULL)
+ break;
+ if (__db_compare_both(dbp, &key1, &data1, &key2, &data2) < 0) {
+ __db_errx(dbp->env, DB_STR("1170",
+ "The key/data pairs in the buffer are not sorted."));
+ return (EINVAL);
+ }
+ key2.data = key1.data;
+ key2.size = key1.size;
+ data2.data = data1.data;
+ data2.size = data1.size;
+ }
+#else
+ COMPQUIET(dbp, NULL);
+ COMPQUIET(key, NULL);
+#endif
+ return (0);
+}
+
+/*
+ * Check if the key/data pairs in the bulk buffer are sorted.
+ */
+static int
+__bam_compress_check_sort_multiple(dbp, key, data)
+ DB *dbp;
+ DBT *key, *data;
+{
+#ifdef DIAGNOSTIC
+ void *kptr, *dptr;
+ DBT key1, data1, key2, data2;
+
+ memset(&key1, 0, sizeof(DBT));
+ memset(&data1, 0, sizeof(DBT));
+ memset(&key2, 0, sizeof(DBT));
+ memset(&data2, 0, sizeof(DBT));
+
+ DB_MULTIPLE_INIT(kptr, key);
+ DB_MULTIPLE_INIT(dptr, data);
+ DB_MULTIPLE_NEXT(kptr, key, key2.data, key2.size);
+ DB_MULTIPLE_NEXT(dptr, data, data2.data, data2.size);
+ /* No key/data pair in the bulk buffer */
+ if (kptr == NULL || dptr == NULL)
+ return (0);
+
+ for (;;) {
+ DB_MULTIPLE_NEXT(kptr, key, key1.data, key1.size);
+ DB_MULTIPLE_NEXT(dptr, data, data1.data, data1.size);
+ if (kptr == NULL || dptr == NULL)
+ break;
+ if (__db_compare_both(dbp, &key1, &data1, &key2, &data2) < 0) {
+ __db_errx(dbp->env, DB_STR("1171",
+ "The key/data pairs in the buffer are not sorted."));
+ return (EINVAL);
+ }
+ key2.data = key1.data;
+ key2.size = key1.size;
+ data2.data = data1.data;
+ data2.size = data1.size;
+ }
+#else
+ COMPQUIET(dbp, NULL);
+ COMPQUIET(key, NULL);
+ COMPQUIET(data, NULL);
+#endif
+ return (0);
+}
+
+/*
+ * Check if the keys in the bulk buffer are sorted.
+ */
+static int
+__bam_compress_check_sort_multiple_keyonly(dbp, key)
+ DB *dbp;
+ DBT *key;
+{
+#ifdef DIAGNOSTIC
+ void *kptr;
+ DBT key1, key2;
+
+ memset(&key1, 0, sizeof(DBT));
+ memset(&key2, 0, sizeof(DBT));
+
+ DB_MULTIPLE_INIT(kptr, key);
+ DB_MULTIPLE_NEXT(kptr, key, key2.data, key2.size);
+ /* No DBT item in the bulk buffer */
+ if (kptr == NULL)
+ return (0);
+
+ for (;;) {
+ DB_MULTIPLE_NEXT(kptr, key, key1.data, key1.size);
+ if (kptr == NULL)
+ break;
+ if (__db_compare_both(dbp, &key1, NULL, &key2, NULL) < 0) {
+ __db_errx(dbp->env, DB_STR("1172",
+ "The DBT items in the buffer are not sorted"));
+ return (EINVAL);
+ }
+ key2.data = key1.data;
+ key2.size = key1.size;
+ }
+#else
+ COMPQUIET(dbp, NULL);
+ COMPQUIET(key, NULL);
+#endif
+ return (0);
+}
+
+#endif
diff --git a/src/btree/bt_conv.c b/src/btree/bt_conv.c
new file mode 100644
index 00000000..348ce5c2
--- /dev/null
+++ b/src/btree/bt_conv.c
@@ -0,0 +1,95 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_swap.h"
+#include "dbinc/btree.h"
+
+/*
+ * __bam_pgin --
+ * Convert host-specific page layout from the host-independent format
+ * stored on disk.
+ *
+ * PUBLIC: int __bam_pgin __P((DB *, db_pgno_t, void *, DBT *));
+ */
+int
+__bam_pgin(dbp, pg, pp, cookie)
+ DB *dbp;
+ db_pgno_t pg;
+ void *pp;
+ DBT *cookie;
+{
+ DB_PGINFO *pginfo;
+ PAGE *h;
+
+ pginfo = (DB_PGINFO *)cookie->data;
+ if (!F_ISSET(pginfo, DB_AM_SWAP))
+ return (0);
+
+ h = pp;
+ return (TYPE(h) == P_BTREEMETA ? __bam_mswap(dbp->env, pp) :
+ __db_byteswap(dbp, pg, pp, pginfo->db_pagesize, 1));
+}
+
+/*
+ * __bam_pgout --
+ * Convert host-specific page layout to the host-independent format
+ * stored on disk.
+ *
+ * PUBLIC: int __bam_pgout __P((DB *, db_pgno_t, void *, DBT *));
+ */
+int
+__bam_pgout(dbp, pg, pp, cookie)
+ DB *dbp;
+ db_pgno_t pg;
+ void *pp;
+ DBT *cookie;
+{
+ DB_PGINFO *pginfo;
+ PAGE *h;
+
+ pginfo = (DB_PGINFO *)cookie->data;
+ if (!F_ISSET(pginfo, DB_AM_SWAP))
+ return (0);
+
+ h = pp;
+ return (TYPE(h) == P_BTREEMETA ? __bam_mswap(dbp->env, pp) :
+ __db_byteswap(dbp, pg, pp, pginfo->db_pagesize, 0));
+}
+
+/*
+ * __bam_mswap --
+ * Swap the bytes on the btree metadata page.
+ *
+ * PUBLIC: int __bam_mswap __P((ENV *, PAGE *));
+ */
+int
+__bam_mswap(env, pg)
+ ENV *env;
+ PAGE *pg;
+{
+ u_int8_t *p;
+
+ COMPQUIET(env, NULL);
+
+ __db_metaswap(pg);
+ p = (u_int8_t *)pg + sizeof(DBMETA);
+
+ p += sizeof(u_int32_t); /* unused */
+ SWAP32(p); /* minkey */
+ SWAP32(p); /* re_len */
+ SWAP32(p); /* re_pad */
+ SWAP32(p); /* root */
+ p += 92 * sizeof(u_int32_t); /* unused */
+ SWAP32(p); /* crypto_magic */
+
+ return (0);
+}
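+
+/*
+ * For reference (see dbinc/db_swap.h): SWAP32 both byte-swaps the 32-bit
+ * value at "p" and advances "p" past it, which is why the fields above are
+ * swapped simply by naming them in their on-disk order.
+ */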
diff --git a/src/btree/bt_curadj.c b/src/btree/bt_curadj.c
new file mode 100644
index 00000000..78606009
--- /dev/null
+++ b/src/btree/bt_curadj.c
@@ -0,0 +1,694 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/mp.h"
+
+static int __bam_opd_cursor __P((DB *, DBC *, db_pgno_t, u_int32_t, u_int32_t));
+static int __bam_ca_delete_func
+ __P((DBC *, DBC *, u_int32_t *, db_pgno_t, u_int32_t, void *));
+static int __ram_ca_delete_func
+ __P((DBC *, DBC *, u_int32_t *, db_pgno_t, u_int32_t, void *));
+static int __bam_ca_di_func
+ __P((DBC *, DBC *, u_int32_t *, db_pgno_t, u_int32_t, void *));
+static int __bam_ca_dup_func
+ __P((DBC *, DBC *, u_int32_t *, db_pgno_t, u_int32_t, void *));
+static int __bam_ca_undodup_func
+ __P((DBC *, DBC *, u_int32_t *, db_pgno_t, u_int32_t, void *));
+static int __bam_ca_rsplit_func
+ __P((DBC *, DBC *, u_int32_t *, db_pgno_t, u_int32_t, void *));
+static int __bam_ca_split_func
+ __P((DBC *, DBC *, u_int32_t *, db_pgno_t, u_int32_t, void *));
+static int __bam_ca_undosplit_func
+ __P((DBC *, DBC *, u_int32_t *, db_pgno_t, u_int32_t, void *));
+
+/*
+ * Cursor adjustments are logged if they are for subtransactions. This is
+ * because it's possible for a subtransaction to adjust cursors which will
+ * still be active after the subtransaction aborts, and so which must be
+ * restored to their previous locations. Cursors that can be both affected
+ * by our cursor adjustments and active after our transaction aborts can
+ * only be found in our parent transaction -- cursors in other transactions,
+ * including other child transactions of our parent, must have conflicting
+ * locker IDs, and so cannot be affected by adjustments in this transaction.
+ */
+
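+/*
+ * Illustrative sketch (not from the original source; the variable names
+ * are hypothetical): the scenario described above arises from
+ * application code such as:
+ *
+ *	DB_TXN *parent, *child;
+ *	DBC *dbc;
+ *
+ *	env->txn_begin(env, NULL, &parent, 0);
+ *	dbp->cursor(dbp, parent, &dbc, 0);
+ *	env->txn_begin(env, parent, &child, 0);
+ *	(an insert or delete in child adjusts dbc, and the
+ *	adjustment is logged)
+ *	child->abort(child);
+ *	(recovery restores dbc to its pre-child position)
+ *
+ * Cursors in unrelated transactions hold conflicting locks on the
+ * pages child modifies, so only parent's cursors need restoring.
+ */
+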
+/*
+ * __bam_ca_delete_func --
+ * Callback function for walking cursors to update them due to a delete.
+ */
+static int
+__bam_ca_delete_func(dbc, my_dbc, countp, pgno, indx, args)
+ DBC *dbc, *my_dbc;
+ u_int32_t *countp;
+ db_pgno_t pgno;
+ u_int32_t indx;
+ void *args;
+{
+ BTREE_CURSOR *cp;
+ u_int32_t del;
+
+ COMPQUIET(my_dbc, NULL);
+ del = *(u_int32_t *)args;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+ if (cp->pgno == pgno && cp->indx == indx &&
+ !MVCC_SKIP_CURADJ(dbc, pgno)) {
+ /*
+ * [#8032] This assert is checking for possible race
+ * conditions where we hold a cursor position without
+ * a lock. Unfortunately, there are paths in the
+ * Btree code that do not satisfy these conditions.
+ * None of them are known to be a problem, but this
+ * assert should be re-activated when the Btree stack
+ * code is re-written.
+ DB_ASSERT(env, !STD_LOCKING(dbc) ||
+ cp->lock_mode != DB_LOCK_NG);
+ */
+ if (del) {
+ F_SET(cp, C_DELETED);
+ /*
+ * If we're deleting the item, we can't
+ * keep a streaming offset cached.
+ */
+ cp->stream_start_pgno = PGNO_INVALID;
+ } else
+ F_CLR(cp, C_DELETED);
+
+#ifdef HAVE_COMPRESSION
+ /*
+ * We also set the C_COMPRESS_MODIFIED flag, which
+ * prompts the compression code to look for it's
+ * current entry again if it needs to.
+ *
+ * The flag isn't cleared, because the compression
+ * code still needs to do that even for an entry that
+ * becomes undeleted.
+ *
+ * This flag also needs to be set if an entry is
+ * updated, but since the compression code always
+ * deletes before an update, setting it here is
+ * sufficient.
+ */
+ F_SET(cp, C_COMPRESS_MODIFIED);
+#endif
+
+ ++(*countp);
+ }
+ return (0);
+}
+
+/*
+ * __bam_ca_delete --
+ * Update the cursors when items are deleted and when already deleted
+ * items are overwritten. Return the number of relevant cursors found.
+ *
+ * PUBLIC: int __bam_ca_delete __P((DB *,
+ * PUBLIC: db_pgno_t, u_int32_t, int, u_int32_t *));
+ */
+int
+__bam_ca_delete(dbp, pgno, indx, del, countp)
+ DB *dbp;
+ db_pgno_t pgno;
+ u_int32_t indx;
+ int del;
+ u_int32_t *countp;
+{
+ int ret;
+ u_int32_t count;
+
+ /*
+ * Adjust the cursors. We have the page write locked, so the
+ * only other cursors that can be pointing at a page are
+ * those in the same thread of control. Unfortunately, we don't
+ * know that they're using the same DB handle, so traverse
+ * all matching DB handles in the same ENV, then all cursors
+ * on each matching DB handle.
+ *
+ * Each cursor is single-threaded, so we only need to lock the
+ * list of DBs and then the list of cursors in each DB.
+ */
+ if ((ret = __db_walk_cursors(dbp, NULL,
+ __bam_ca_delete_func, &count, pgno, indx, &del)) != 0)
+ return (ret);
+
+ if (countp != NULL)
+ *countp = count;
+ return (0);
+}
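+
+/*
+ * Illustrative sketch (not from the original source): the walk above
+ * is what gives duplicated cursors consistent delete semantics at the
+ * API level.  With error handling omitted:
+ *
+ *	DBC *c1, *c2;
+ *
+ *	dbp->cursor(dbp, txn, &c1, 0);
+ *	c1->get(c1, &key, &data, DB_FIRST);
+ *	c1->dup(c1, &c2, DB_POSITION);
+ *	c1->del(c1, 0);
+ *	ret = c2->get(c2, &key, &data, DB_CURRENT);
+ *	(ret == DB_KEYEMPTY: this walk set C_DELETED on c2 as well)
+ */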
+
+static int
+__ram_ca_delete_func(dbc, my_dbc, countp, root_pgno, indx, args)
+ DBC *dbc, *my_dbc;
+ u_int32_t *countp;
+ db_pgno_t root_pgno;
+ u_int32_t indx;
+ void *args;
+{
+ COMPQUIET(indx, 0);
+ COMPQUIET(my_dbc, NULL);
+ COMPQUIET(args, NULL);
+
+ if (dbc->internal->root == root_pgno &&
+ !MVCC_SKIP_CURADJ(dbc, root_pgno)) {
+ (*countp)++;
+ return (EEXIST);
+ }
+ return (0);
+}
+
+/*
+ * __ram_ca_delete --
+ * Return if any relevant cursors found.
+ *
+ * PUBLIC: int __ram_ca_delete __P((DB *, db_pgno_t, u_int32_t *));
+ */
+int
+__ram_ca_delete(dbp, root_pgno, foundp)
+ DB *dbp;
+ db_pgno_t root_pgno;
+ u_int32_t *foundp;
+{
+ int ret;
+
+ if ((ret = __db_walk_cursors(dbp, NULL, __ram_ca_delete_func,
+ foundp, root_pgno, 0, NULL)) != 0 && ret != EEXIST)
+ return (ret);
+
+ return (0);
+}
+
+struct __bam_ca_di_args {
+ int adjust;
+ DB_TXN *my_txn;
+};
+
+static int
+__bam_ca_di_func(dbc, my_dbc, foundp, pgno, indx, vargs)
+ DBC *dbc, *my_dbc;
+ u_int32_t *foundp;
+ db_pgno_t pgno;
+ u_int32_t indx;
+ void *vargs;
+{
+ DBC_INTERNAL *cp;
+ struct __bam_ca_di_args *args;
+
+ if (dbc->dbtype == DB_RECNO)
+ return (0);
+
+ cp = dbc->internal;
+ args = vargs;
+ if (cp->pgno == pgno && cp->indx >= indx &&
+ (dbc == my_dbc || !MVCC_SKIP_CURADJ(dbc, pgno))) {
+ /* Cursor indices should never be negative. */
+ DB_ASSERT(dbc->dbp->env, cp->indx != 0 || args->adjust > 0);
+ /* [#8032]
+ DB_ASSERT(env, !STD_LOCKING(dbc) ||
+ cp->lock_mode != DB_LOCK_NG);
+ */
+ cp->indx += args->adjust;
+ if (args->my_txn != NULL && args->my_txn != dbc->txn)
+ *foundp = 1;
+ }
+ return (0);
+}
+/*
+ * __bam_ca_di --
+ * Adjust the cursors during a delete or insert.
+ *
+ * PUBLIC: int __bam_ca_di __P((DBC *, db_pgno_t, u_int32_t, int));
+ */
+int
+__bam_ca_di(my_dbc, pgno, indx, adjust)
+ DBC *my_dbc;
+ db_pgno_t pgno;
+ u_int32_t indx;
+ int adjust;
+{
+ DB *dbp;
+ DB_LSN lsn;
+ int ret;
+ u_int32_t found;
+ struct __bam_ca_di_args args;
+
+ dbp = my_dbc->dbp;
+ args.adjust = adjust;
+ args.my_txn = IS_SUBTRANSACTION(my_dbc->txn) ? my_dbc->txn : NULL;
+
+ /*
+ * Adjust the cursors. See the comment in __bam_ca_delete().
+ */
+ if ((ret = __db_walk_cursors(dbp, my_dbc, __bam_ca_di_func,
+ &found, pgno, indx, &args)) != 0)
+ return (ret);
+
+ if (found != 0 && DBC_LOGGING(my_dbc)) {
+ if ((ret = __bam_curadj_log(dbp, my_dbc->txn, &lsn, 0,
+ DB_CA_DI, pgno, 0, 0, (u_int32_t)adjust, indx, 0)) != 0)
+ return (ret);
+ }
+
+ return (0);
+}
+
+/*
+ * __bam_opd_cursor -- create a new opd cursor.
+ */
+static int
+__bam_opd_cursor(dbp, dbc, first, tpgno, ti)
+ DB *dbp;
+ DBC *dbc;
+ db_pgno_t tpgno;
+ u_int32_t first, ti;
+{
+ BTREE_CURSOR *cp, *orig_cp;
+ DBC *dbc_nopd;
+ int ret;
+
+ orig_cp = (BTREE_CURSOR *)dbc->internal;
+ dbc_nopd = NULL;
+
+ /*
+ * Allocate a new cursor and create the stack. If duplicates
+ * are sorted, we've just created an off-page duplicate Btree.
+ * If duplicates aren't sorted, we've just created a Recno tree.
+ *
+ * Note that in order to get here at all, there shouldn't be
+ * an old off-page dup cursor--to augment the checking __dbc_newopd
+ * will do, assert this.
+ */
+ DB_ASSERT(dbp->env, orig_cp->opd == NULL);
+ if ((ret = __dbc_newopd(dbc, tpgno, orig_cp->opd, &dbc_nopd)) != 0)
+ return (ret);
+
+ cp = (BTREE_CURSOR *)dbc_nopd->internal;
+ cp->pgno = tpgno;
+ cp->indx = ti;
+
+ if (dbp->dup_compare == NULL) {
+ /*
+ * Converting to off-page Recno trees is tricky. The
+ * record number for the cursor is the index + 1 (to
+ * convert to 1-based record numbers).
+ */
+ cp->recno = ti + 1;
+ }
+
+ /*
+ * Transfer the deleted flag from the top-level cursor to the
+ * created one.
+ */
+ if (F_ISSET(orig_cp, C_DELETED)) {
+ F_SET(cp, C_DELETED);
+ F_CLR(orig_cp, C_DELETED);
+ }
+
+ /* Stack the cursors and reset the initial cursor's index. */
+ orig_cp->opd = dbc_nopd;
+ orig_cp->indx = first;
+ return (0);
+}
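+
+/*
+ * Worked example (illustrative, not from the original source): the
+ * conversion above is off-by-one by design.  A cursor on the third
+ * duplicate has ti == 2, and in an off-page Recno tree the same item
+ * is record number 3 (ti + 1), because record numbers are 1-based
+ * while page indices are 0-based.
+ */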
+
+struct __bam_ca_dup_args {
+ db_pgno_t tpgno;
+ db_indx_t first, ti;
+ DB_TXN *my_txn;
+};
+
+static int
+__bam_ca_dup_func(dbc, my_dbc, foundp, fpgno, fi, vargs)
+ DBC *dbc;
+ DBC *my_dbc;
+ u_int32_t *foundp;
+ db_pgno_t fpgno;
+ u_int32_t fi;
+ void *vargs;
+{
+ BTREE_CURSOR *orig_cp;
+ DB *dbp;
+ int ret;
+ struct __bam_ca_dup_args *args;
+
+ COMPQUIET(my_dbc, NULL);
+
+ /*
+ * Since we may rescan the list, check whether this cursor
+ * has already been converted.
+ */
+ orig_cp = (BTREE_CURSOR *)dbc->internal;
+ if (orig_cp->opd != NULL)
+ return (0);
+
+ /* Find cursors pointing to this record. */
+ if (orig_cp->pgno != fpgno || orig_cp->indx != fi ||
+ MVCC_SKIP_CURADJ(dbc, fpgno))
+ return (0);
+
+ dbp = dbc->dbp;
+ args = vargs;
+
+ MUTEX_UNLOCK(dbp->env, dbp->mutex);
+
+ if ((ret = __bam_opd_cursor(dbp,
+ dbc, args->first, args->tpgno, args->ti)) != 0) {
+ MUTEX_LOCK(dbp->env, dbp->mutex);
+ return (ret);
+ }
+ if (args->my_txn != NULL && args->my_txn != dbc->txn)
+ *foundp = 1;
+ /* We released the mutex to get a cursor, start over. */
+ return (DB_LOCK_NOTGRANTED);
+}
+
+/*
+ * __bam_ca_dup --
+ * Adjust the cursors when moving items from a leaf page to a duplicates
+ * page.
+ *
+ * PUBLIC: int __bam_ca_dup __P((DBC *,
+ * PUBLIC: u_int32_t, db_pgno_t, u_int32_t, db_pgno_t, u_int32_t));
+ */
+int
+__bam_ca_dup(my_dbc, first, fpgno, fi, tpgno, ti)
+ DBC *my_dbc;
+ db_pgno_t fpgno, tpgno;
+ u_int32_t first, fi, ti;
+{
+ DB *dbp;
+ DB_LSN lsn;
+ int ret, t_ret;
+ u_int32_t found;
+ struct __bam_ca_dup_args args;
+
+ dbp = my_dbc->dbp;
+
+ args.first = first;
+ args.tpgno = tpgno;
+ args.ti = ti;
+ args.my_txn = IS_SUBTRANSACTION(my_dbc->txn) ? my_dbc->txn : NULL;
+
+ if ((ret = __db_walk_cursors(dbp,
+ my_dbc, __bam_ca_dup_func, &found, fpgno, fi, &args)) != 0)
+ return (ret);
+
+ if (found != 0 && DBC_LOGGING(my_dbc)) {
+ if ((t_ret = __bam_curadj_log(dbp, my_dbc->txn,
+ &lsn, 0, DB_CA_DUP, fpgno, tpgno, 0, first, fi, ti)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ }
+
+ return (ret);
+}
+
+static int
+__bam_ca_undodup_func(dbc, my_dbc, countp, fpgno, fi, vargs)
+ DBC *dbc;
+ DBC *my_dbc;
+ u_int32_t *countp;
+ db_pgno_t fpgno;
+ u_int32_t fi;
+ void *vargs;
+{
+ BTREE_CURSOR *orig_cp;
+ DB *dbp;
+ int ret;
+ struct __bam_ca_dup_args *args;
+
+ COMPQUIET(my_dbc, NULL);
+ COMPQUIET(countp, NULL);
+
+ orig_cp = (BTREE_CURSOR *)dbc->internal;
+ dbp = dbc->dbp;
+ args = vargs;
+ /*
+ * A note on the orig_cp->opd != NULL requirement here:
+ * it's possible that there's a cursor that refers to
+ * the same duplicate set, but which has no opd cursor,
+ * because it refers to a different item and we took
+ * care of it while processing a previous record.
+ */
+ if (orig_cp->pgno != fpgno ||
+ orig_cp->indx != args->first ||
+ orig_cp->opd == NULL || ((BTREE_CURSOR *)
+ orig_cp->opd->internal)->indx != args->ti ||
+ MVCC_SKIP_CURADJ(dbc, fpgno))
+ return (0);
+ MUTEX_UNLOCK(dbp->env, dbp->mutex);
+ if ((ret = __dbc_close(orig_cp->opd)) != 0) {
+ MUTEX_LOCK(dbp->env, dbp->mutex);
+ return (ret);
+ }
+ orig_cp->opd = NULL;
+ orig_cp->indx = fi;
+ /*
+ * We released the mutex to free a cursor,
+ * start over.
+ */
+ return (DB_LOCK_NOTGRANTED);
+}
+
+/*
+ * __bam_ca_undodup --
+ * Adjust the cursors when returning items to a leaf page
+ * from a duplicate page.
+ * Called only during undo processing.
+ *
+ * PUBLIC: int __bam_ca_undodup __P((DB *,
+ * PUBLIC: u_int32_t, db_pgno_t, u_int32_t, u_int32_t));
+ */
+int
+__bam_ca_undodup(dbp, first, fpgno, fi, ti)
+ DB *dbp;
+ db_pgno_t fpgno;
+ u_int32_t first, fi, ti;
+{
+ u_int32_t count;
+ struct __bam_ca_dup_args args;
+
+ args.first = first;
+ args.ti = ti;
+ return (__db_walk_cursors(dbp, NULL,
+ __bam_ca_undodup_func, &count, fpgno, fi, &args));
+}
+
+static int
+__bam_ca_rsplit_func(dbc, my_dbc, foundp, fpgno, indx, args)
+ DBC *dbc;
+ DBC *my_dbc;
+ u_int32_t *foundp;
+ db_pgno_t fpgno;
+ u_int32_t indx;
+ void *args;
+{
+ db_pgno_t tpgno;
+
+ COMPQUIET(indx, 0);
+
+ if (dbc->dbtype == DB_RECNO)
+ return (0);
+
+ tpgno = *(db_pgno_t *)args;
+ if (dbc->internal->pgno == fpgno &&
+ !MVCC_SKIP_CURADJ(dbc, fpgno)) {
+ dbc->internal->pgno = tpgno;
+ /* [#8032]
+ DB_ASSERT(env, !STD_LOCKING(dbc) ||
+ dbc->internal->lock_mode != DB_LOCK_NG);
+ */
+ if (IS_SUBTRANSACTION(my_dbc->txn) && dbc->txn != my_dbc->txn)
+ *foundp = 1;
+ }
+ return (0);
+}
+
+/*
+ * __bam_ca_rsplit --
+ * Adjust the cursors when doing reverse splits.
+ *
+ * PUBLIC: int __bam_ca_rsplit __P((DBC *, db_pgno_t, db_pgno_t));
+ */
+int
+__bam_ca_rsplit(my_dbc, fpgno, tpgno)
+ DBC* my_dbc;
+ db_pgno_t fpgno, tpgno;
+{
+ DB *dbp;
+ DB_LSN lsn;
+ int ret;
+ u_int32_t found;
+
+ dbp = my_dbc->dbp;
+
+ if ((ret = __db_walk_cursors(dbp, my_dbc,
+ __bam_ca_rsplit_func, &found, fpgno, 0, &tpgno)) != 0)
+ return (ret);
+
+ if (found != 0 && DBC_LOGGING(my_dbc)) {
+ if ((ret = __bam_curadj_log(dbp, my_dbc->txn,
+ &lsn, 0, DB_CA_RSPLIT, fpgno, tpgno, 0, 0, 0, 0)) != 0)
+ return (ret);
+ }
+ return (0);
+}
+
+struct __bam_ca_split_args {
+ db_pgno_t lpgno, rpgno;
+ int cleft;
+ DB_TXN *my_txn;
+};
+
+static int
+__bam_ca_split_func(dbc, my_dbc, foundp, ppgno, split_indx, vargs)
+ DBC *dbc;
+ DBC *my_dbc;
+ u_int32_t *foundp;
+ db_pgno_t ppgno;
+ u_int32_t split_indx;
+ void *vargs;
+{
+ DBC_INTERNAL *cp;
+ struct __bam_ca_split_args *args;
+
+ COMPQUIET(my_dbc, NULL);
+
+ if (dbc->dbtype == DB_RECNO)
+ return (0);
+ cp = dbc->internal;
+ args = vargs;
+ if (cp->pgno == ppgno &&
+ !MVCC_SKIP_CURADJ(dbc, ppgno)) {
+ /* [#8032]
+ DB_ASSERT(env, !STD_LOCKING(dbc) ||
+ cp->lock_mode != DB_LOCK_NG);
+ */
+ if (args->my_txn != NULL && args->my_txn != dbc->txn)
+ *foundp = 1;
+ if (cp->indx < split_indx) {
+ if (args->cleft)
+ cp->pgno = args->lpgno;
+ } else {
+ cp->pgno = args->rpgno;
+ cp->indx -= split_indx;
+ }
+ }
+ return (0);
+}
+
+/*
+ * __bam_ca_split --
+ * Adjust the cursors when splitting a page.
+ *
+ * PUBLIC: int __bam_ca_split __P((DBC *,
+ * PUBLIC: db_pgno_t, db_pgno_t, db_pgno_t, u_int32_t, int));
+ */
+int
+__bam_ca_split(my_dbc, ppgno, lpgno, rpgno, split_indx, cleft)
+ DBC *my_dbc;
+ db_pgno_t ppgno, lpgno, rpgno;
+ u_int32_t split_indx;
+ int cleft;
+{
+ DB *dbp;
+ DB_LSN lsn;
+ int ret;
+ u_int32_t found;
+ struct __bam_ca_split_args args;
+
+ dbp = my_dbc->dbp;
+
+ /*
+ * If splitting the page that a cursor was on, the cursor has to be
+ * adjusted to point to the same record as before the split. Most
+ * of the time we don't adjust pointers to the left page, because
+ * we're going to copy its contents back over the original page. If
+ * the cursor is on the right page, it is decremented by the number of
+ * records split to the left page.
+ */
+ args.lpgno = lpgno;
+ args.rpgno = rpgno;
+ args.cleft = cleft;
+ args.my_txn = IS_SUBTRANSACTION(my_dbc->txn) ? my_dbc->txn : NULL;
+ if ((ret = __db_walk_cursors(dbp, my_dbc,
+ __bam_ca_split_func, &found, ppgno, split_indx, &args)) != 0)
+ return (ret);
+
+ if (found != 0 && DBC_LOGGING(my_dbc)) {
+ if ((ret = __bam_curadj_log(dbp,
+ my_dbc->txn, &lsn, 0, DB_CA_SPLIT, ppgno, rpgno,
+ cleft ? lpgno : PGNO_INVALID, 0, split_indx, 0)) != 0)
+ return (ret);
+ }
+
+ return (0);
+}
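+
+/*
+ * Worked example (illustrative, not from the original source): say
+ * page P splits at split_indx == 5 into left page L and right page R.
+ * A cursor at (P, 7) is past the split point, so it moves to
+ * (R, 7 - 5) == (R, 2).  A cursor at (P, 3) keeps its index and stays
+ * on P, unless cleft is set because the left page is really a new
+ * page, in which case it moves to (L, 3).
+ */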
+
+static int
+__bam_ca_undosplit_func(dbc, my_dbc, foundp, frompgno, split_indx, vargs)
+ DBC *dbc;
+ DBC *my_dbc;
+ u_int32_t *foundp;
+ db_pgno_t frompgno;
+ u_int32_t split_indx;
+ void *vargs;
+{
+ DBC_INTERNAL *cp;
+ struct __bam_ca_split_args *args;
+
+ COMPQUIET(my_dbc, NULL);
+ COMPQUIET(foundp, NULL);
+
+ if (dbc->dbtype == DB_RECNO)
+ return (0);
+ cp = dbc->internal;
+ args = vargs;
+ if (cp->pgno == args->rpgno &&
+ !MVCC_SKIP_CURADJ(dbc, args->rpgno)) {
+ cp->pgno = frompgno;
+ cp->indx += split_indx;
+ } else if (cp->pgno == args->lpgno &&
+ !MVCC_SKIP_CURADJ(dbc, args->lpgno))
+ cp->pgno = frompgno;
+
+ return (0);
+}
+
+/*
+ * __bam_ca_undosplit --
+ * Adjust the cursors when undoing a split of a page.
+ * If we grew a level we will execute this for both the
+ * left and the right pages.
+ * Called only during undo processing.
+ *
+ * PUBLIC: int __bam_ca_undosplit __P((DB *,
+ * PUBLIC: db_pgno_t, db_pgno_t, db_pgno_t, u_int32_t));
+ */
+int
+__bam_ca_undosplit(dbp, frompgno, topgno, lpgno, split_indx)
+ DB *dbp;
+ db_pgno_t frompgno, topgno, lpgno;
+ u_int32_t split_indx;
+{
+ u_int32_t count;
+ struct __bam_ca_split_args args;
+
+ /*
+ * When backing out a split, we move the cursor back
+ * to the original offset and bump it by the split_indx.
+ */
+ args.lpgno = lpgno;
+ args.rpgno = topgno;
+ return (__db_walk_cursors(dbp, NULL,
+ __bam_ca_undosplit_func, &count, frompgno, split_indx, &args));
+}
diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c
new file mode 100644
index 00000000..860c31ce
--- /dev/null
+++ b/src/btree/bt_cursor.c
@@ -0,0 +1,3076 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+
+static int __bam_bulk __P((DBC *, DBT *, u_int32_t));
+static int __bamc_close __P((DBC *, db_pgno_t, int *));
+static int __bamc_del __P((DBC *, u_int32_t));
+static int __bamc_destroy __P((DBC *));
+static int __bamc_get __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
+static int __bamc_getstack __P((DBC *));
+static int __bamc_next __P((DBC *, int, int));
+static int __bamc_physdel __P((DBC *));
+static int __bamc_prev __P((DBC *));
+static int __bamc_put __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
+static int __bamc_search __P((DBC *,
+ db_pgno_t, const DBT *, u_int32_t, int *));
+static int __bamc_writelock __P((DBC *));
+static int __bam_getboth_finddatum __P((DBC *, DBT *, u_int32_t));
+static int __bam_getbothc __P((DBC *, DBT *));
+static int __bam_get_prev __P((DBC *));
+static int __bam_isopd __P((DBC *, db_pgno_t *));
+#ifdef HAVE_COMPRESSION
+static int __bam_getlte __P((DBC *, DBT *, DBT *));
+#endif
+
+/*
+ * Acquire a new page/lock. If we hold a page/lock, discard the page, and
+ * lock-couple the lock.
+ *
+ * !!!
+ * We have to handle both where we have a lock to lock-couple and where we
+ * don't -- we don't duplicate locks when we duplicate cursors if we are
+ * running in a transaction environment as there's no point if locks are
+ * never discarded. This means that the cursor may or may not hold a lock.
+ * In the case where we are descending the tree we always want to unlock
+ * the held interior page so we use ACQUIRE_COUPLE.
+ */
+#undef ACQUIRE
+#define ACQUIRE(dbc, mode, lpgno, lock, fpgno, pagep, flags, ret) do { \
+ DB_MPOOLFILE *__mpf = (dbc)->dbp->mpf; \
+ if ((pagep) != NULL) { \
+ ret = __memp_fput(__mpf, \
+ (dbc)->thread_info, pagep, dbc->priority); \
+ pagep = NULL; \
+ } else \
+ ret = 0; \
+ if ((ret) == 0 && STD_LOCKING(dbc)) \
+ ret = __db_lget( \
+ dbc, LCK_COUPLE, lpgno, mode, flags, &(lock)); \
+ if ((ret) == 0) \
+ ret = __memp_fget(__mpf, &(fpgno), \
+ (dbc)->thread_info, (dbc)->txn, 0, &(pagep)); \
+} while (0)
+
+/* Acquire a new page/lock for a cursor. */
+#undef ACQUIRE_CUR
+#define ACQUIRE_CUR(dbc, mode, p, flags, ret) do { \
+ BTREE_CURSOR *__cp = (BTREE_CURSOR *)(dbc)->internal; \
+ if (p != __cp->pgno) \
+ __cp->pgno = PGNO_INVALID; \
+ ACQUIRE(dbc, mode, p, __cp->lock, p, __cp->page, flags, ret); \
+ if ((ret) == 0) { \
+ __cp->pgno = p; \
+ __cp->lock_mode = (mode); \
+ } \
+} while (0)
+
+/*
+ * Acquire a write lock if we don't already have one.
+ *
+ * !!!
+ * See ACQUIRE macro on why we handle cursors that don't have locks.
+ */
+#undef ACQUIRE_WRITE_LOCK
+#define ACQUIRE_WRITE_LOCK(dbc, ret) do { \
+ BTREE_CURSOR *__cp = (BTREE_CURSOR *)(dbc)->internal; \
+ DB_MPOOLFILE *__mpf = (dbc)->dbp->mpf; \
+ int __get_page = 0; \
+ ret = 0; \
+ if (STD_LOCKING(dbc) && __cp->lock_mode != DB_LOCK_WRITE) { \
+ if (__cp->page != NULL) { \
+ (ret) = __memp_fput(__mpf, (dbc)->thread_info, \
+ __cp->page, (dbc)->priority); \
+ __cp->page = NULL; \
+ __get_page = 1; \
+ if ((ret) != 0) \
+ break; \
+ } \
+ if (((ret) = __db_lget((dbc), \
+ LOCK_ISSET(__cp->lock) ? LCK_COUPLE : 0, \
+ __cp->pgno, DB_LOCK_WRITE, 0, &__cp->lock)) != 0) \
+ break; \
+ __cp->lock_mode = DB_LOCK_WRITE; \
+ if (__get_page == 0) \
+ break; \
+ (ret) = __memp_fget(__mpf, &__cp->pgno, \
+ (dbc)->thread_info, \
+ (dbc)->txn, DB_MPOOL_DIRTY, &__cp->page); \
+ } \
+} while (0)
+
+/* Discard the current page/lock for a cursor. */
+#undef DISCARD_CUR
+#define DISCARD_CUR(dbc, ret) do { \
+ BTREE_CURSOR *__cp = (BTREE_CURSOR *)(dbc)->internal; \
+ DB_MPOOLFILE *__mpf = (dbc)->dbp->mpf; \
+ int __t_ret; \
+ if ((__cp->page) != NULL) { \
+ __t_ret = __memp_fput(__mpf, \
+ (dbc)->thread_info, __cp->page, dbc->priority);\
+ __cp->page = NULL; \
+ } else \
+ __t_ret = 0; \
+ if (__t_ret != 0 && (ret) == 0) \
+ ret = __t_ret; \
+ __t_ret = __TLPUT((dbc), __cp->lock); \
+ if (__t_ret != 0 && (ret) == 0) \
+ ret = __t_ret; \
+ if ((ret) == 0 && !LOCK_ISSET(__cp->lock)) \
+ __cp->lock_mode = DB_LOCK_NG; \
+ __cp->stream_start_pgno = PGNO_INVALID; \
+} while (0)
+
+/* If on-page item is a deleted record. */
+#undef IS_DELETED
+#define IS_DELETED(dbp, page, indx) \
+ B_DISSET(GET_BKEYDATA(dbp, page, \
+ (indx) + (TYPE(page) == P_LBTREE ? O_INDX : 0))->type)
+#undef IS_CUR_DELETED
+#define IS_CUR_DELETED(dbc) \
+ IS_DELETED((dbc)->dbp, (dbc)->internal->page, (dbc)->internal->indx)
+
+/*
+ * Test to see if two cursors could point to duplicates of the same key.
+ * In the case of off-page duplicates they are the same, as the cursors
+ * will be in the same off-page duplicate tree. In the case of on-page
+ * duplicates, the key index offsets must be the same. For the last test,
+ * as the original cursor may not have a valid page pointer, we use the
+ * current cursor's.
+ */
+#undef IS_DUPLICATE
+#define IS_DUPLICATE(dbc, i1, i2) \
+ (P_INP((dbc)->dbp,((PAGE *)(dbc)->internal->page))[i1] == \
+ P_INP((dbc)->dbp,((PAGE *)(dbc)->internal->page))[i2])
+#undef IS_CUR_DUPLICATE
+#define IS_CUR_DUPLICATE(dbc, orig_pgno, orig_indx) \
+ (F_ISSET(dbc, DBC_OPD) || \
+ (orig_pgno == (dbc)->internal->pgno && \
+ IS_DUPLICATE(dbc, (dbc)->internal->indx, orig_indx)))
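+
+/*
+ * Illustrative note (not from the original source): on-page duplicates
+ * share one physical copy of the key, so the page's index array holds
+ * the same key offset for each pair.  A P_LBTREE page holding key K
+ * with two data items D1 and D2 has:
+ *
+ *	inp[0] == inp[2]	(both point at K)
+ *	inp[1] != inp[3]	(D1 and D2)
+ *
+ * which is exactly what IS_DUPLICATE(dbc, 0, 2) tests.
+ */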
+
+/*
+ * __bamc_init --
+ * Initialize the access private portion of a cursor
+ *
+ * PUBLIC: int __bamc_init __P((DBC *, DBTYPE));
+ */
+int
+__bamc_init(dbc, dbtype)
+ DBC *dbc;
+ DBTYPE dbtype;
+{
+ ENV *env;
+ int ret;
+#ifdef HAVE_COMPRESSION
+ BTREE_CURSOR *cp;
+#endif
+
+ env = dbc->env;
+
+ /* Allocate/initialize the internal structure. */
+ if (dbc->internal == NULL) {
+ if ((ret = __os_calloc(
+ env, 1, sizeof(BTREE_CURSOR), &dbc->internal)) != 0)
+ return (ret);
+
+#ifdef HAVE_COMPRESSION
+ cp = (BTREE_CURSOR*)dbc->internal;
+ cp->compressed.flags = DB_DBT_USERMEM;
+ cp->key1.flags = DB_DBT_USERMEM;
+ cp->key2.flags = DB_DBT_USERMEM;
+ cp->data1.flags = DB_DBT_USERMEM;
+ cp->data2.flags = DB_DBT_USERMEM;
+ cp->del_key.flags = DB_DBT_USERMEM;
+ cp->del_data.flags = DB_DBT_USERMEM;
+#endif
+ }
+
+ /* Initialize methods. */
+ dbc->close = dbc->c_close = __dbc_close_pp;
+ dbc->cmp = __dbc_cmp_pp;
+ dbc->count = dbc->c_count = __dbc_count_pp;
+ dbc->del = dbc->c_del = __dbc_del_pp;
+ dbc->dup = dbc->c_dup = __dbc_dup_pp;
+ dbc->get = dbc->c_get = __dbc_get_pp;
+ dbc->pget = dbc->c_pget = __dbc_pget_pp;
+ dbc->put = dbc->c_put = __dbc_put_pp;
+ if (dbtype == DB_BTREE) {
+ dbc->am_bulk = __bam_bulk;
+ dbc->am_close = __bamc_close;
+ dbc->am_del = __bamc_del;
+ dbc->am_destroy = __bamc_destroy;
+ dbc->am_get = __bamc_get;
+ dbc->am_put = __bamc_put;
+ dbc->am_writelock = __bamc_writelock;
+ } else {
+ dbc->am_bulk = __bam_bulk;
+ dbc->am_close = __bamc_close;
+ dbc->am_del = __ramc_del;
+ dbc->am_destroy = __bamc_destroy;
+ dbc->am_get = __ramc_get;
+ dbc->am_put = __ramc_put;
+ dbc->am_writelock = __bamc_writelock;
+ }
+
+ return (0);
+}
+
+/*
+ * __bamc_refresh
+ * Set things up properly for cursor re-use.
+ *
+ * PUBLIC: int __bamc_refresh __P((DBC *));
+ */
+int
+__bamc_refresh(dbc)
+ DBC *dbc;
+{
+ BTREE *t;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+
+ dbp = dbc->dbp;
+ t = dbp->bt_internal;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ /*
+ * If our caller set the root page number, it's because the root was
+ * known. This is always the case for off page dup cursors. Else,
+ * pull it out of our internal information, unless this is a subdb.
+ */
+ if (cp->root == PGNO_INVALID && t->bt_meta == PGNO_BASE_MD)
+ cp->root = t->bt_root;
+
+ LOCK_INIT(cp->lock);
+ cp->lock_mode = DB_LOCK_NG;
+
+ if (cp->sp == NULL) {
+ cp->sp = cp->stack;
+ cp->esp = cp->stack + sizeof(cp->stack) / sizeof(cp->stack[0]);
+ }
+ BT_STK_CLR(cp);
+
+#ifdef HAVE_COMPRESSION
+ /* Initialize compression */
+ cp->prevKey = 0;
+ cp->prevData = 0;
+ cp->currentKey = 0;
+ cp->currentData = 0;
+ cp->compcursor = 0;
+ cp->compend = 0;
+ cp->prevcursor = 0;
+ cp->prev2cursor = 0;
+#endif
+
+ /*
+ * The btree leaf page data structures require that two key/data pairs
+ * (or four items) fit on a page, but other than that there's no fixed
+ * requirement. The btree off-page duplicates only require two items,
+ * to be exact, but requiring four for them as well seems reasonable.
+ *
+ * Recno uses the btree bt_ovflsize value -- it's close enough.
+ */
+ cp->ovflsize = B_MINKEY_TO_OVFLSIZE(
+ dbp, F_ISSET(dbc, DBC_OPD) ? 2 : t->bt_minkey, dbp->pgsize);
+
+ cp->recno = RECNO_OOB;
+ cp->order = INVALID_ORDER;
+ cp->flags = 0;
+
+ /* Initialize for record numbers. */
+ if (F_ISSET(dbc, DBC_OPD) ||
+ dbc->dbtype == DB_RECNO || F_ISSET(dbp, DB_AM_RECNUM)) {
+ F_SET(cp, C_RECNUM);
+
+ /*
+ * All btrees that support record numbers, optionally standard
+ * recno trees, and all off-page duplicate recno trees have
+ * mutable record numbers.
+ */
+ if ((F_ISSET(dbc, DBC_OPD) && dbc->dbtype == DB_RECNO) ||
+ F_ISSET(dbp, DB_AM_RECNUM | DB_AM_RENUMBER))
+ F_SET(cp, C_RENUMBER);
+ }
+
+ return (0);
+}
+
+/*
+ * __bamc_close --
+ * Close down the cursor.
+ */
+static int
+__bamc_close(dbc, root_pgno, rmroot)
+ DBC *dbc;
+ db_pgno_t root_pgno;
+ int *rmroot;
+{
+ BTREE_CURSOR *cp, *cp_opd, *cp_c;
+ DB *dbp;
+ DBC *dbc_opd, *dbc_c;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ PAGE *h;
+ int cdb_lock, ret;
+ u_int32_t count;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ mpf = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ cp_opd = (dbc_opd = cp->opd) == NULL ?
+ NULL : (BTREE_CURSOR *)dbc_opd->internal;
+ cdb_lock = ret = 0;
+
+ /*
+ * There are 3 ways this function is called:
+ *
+ * 1. Closing a primary cursor: we get called with a pointer to a
+ * primary cursor that has a NULL opd field. This happens when
+ * closing a btree/recno database cursor without an associated
+ * off-page duplicate tree.
+ *
+ * 2. Closing a primary and an off-page duplicate cursor stack: we
+ * get called with a pointer to the primary cursor which has a
+ * non-NULL opd field. This happens when closing a btree cursor
+ * into database with an associated off-page btree/recno duplicate
+ * tree. (It can't be a primary recno database, recno databases
+ * don't support duplicates.)
+ *
+ * 3. Closing an off-page duplicate cursor stack: we get called with
+ * a pointer to the off-page duplicate cursor. This happens when
+ * closing a non-btree database that has an associated off-page
+ * btree/recno duplicate tree or for a btree database when the
+ * opd tree is not empty (root_pgno == PGNO_INVALID).
+ *
+ * If either the primary or off-page duplicate cursor deleted a btree
+ * key/data pair, check to see if the item is still referenced by a
+ * different cursor. If it is, confirm that cursor's delete flag is
+ * set and leave it to that cursor to do the delete.
+ *
+ * NB: The test for == 0 below is correct. Our caller already removed
+ * our cursor argument from the active queue, we won't find it when we
+ * search the queue in __bam_ca_delete().
+ * NB: It can't be true that both the primary and off-page duplicate
+ * cursors have deleted a btree key/data pair. Either the primary
+ * cursor may have deleted an item and there's no off-page duplicate
+ * cursor, or there's an off-page duplicate cursor and it may have
+ * deleted an item.
+ *
+ * Primary recno databases aren't an issue here. Recno keys are either
+ * deleted immediately or never deleted, and do not have to be handled
+ * here.
+ *
+ * Off-page duplicate recno databases are an issue here, cases #2 and
+ * #3 above can both be off-page recno databases. The problem is the
+ * same as the final problem for off-page duplicate btree databases.
+ * If we no longer need the off-page duplicate tree, we want to remove
+ * it. For off-page duplicate btrees, we are done with the tree when
+ * we delete the last item it contains, i.e., there can be no further
+ * references to it when it's empty. For off-page duplicate recnos,
+ * we remove items from the tree as the application calls the remove
+ * function, so we are done with the tree when we close the last cursor
+ * that references it.
+ *
+ * We optionally take the root page number from our caller. If the
+ * primary database is a btree, we can get it ourselves because dbc
+ * is the primary cursor. If the primary database is not a btree,
+ * the problem is that we may be dealing with a stack of pages. The
+ * cursor we're using to do the delete points at the bottom of that
+ * stack and we need the top of the stack.
+ */
+ if (F_ISSET(cp, C_DELETED)) {
+ dbc_c = dbc;
+ switch (dbc->dbtype) {
+ case DB_BTREE: /* Case #1, #3. */
+ if ((ret = __bam_ca_delete(
+ dbp, cp->pgno, cp->indx, 1, &count)) != 0)
+ goto err;
+ if (count == 0)
+ goto lock;
+ goto done;
+ case DB_RECNO:
+ if (!F_ISSET(dbc, DBC_OPD)) /* Case #1. */
+ goto done;
+ /* Case #3. */
+ if ((ret = __ram_ca_delete(dbp, cp->root, &count)) != 0)
+ goto err;
+ if (count == 0)
+ goto lock;
+ goto done;
+ case DB_HASH:
+ case DB_QUEUE:
+ case DB_UNKNOWN:
+ default:
+ ret = __db_unknown_type(
+ env, "DbCursor.close", dbc->dbtype);
+ goto err;
+ }
+ }
+
+ if (dbc_opd == NULL)
+ goto done;
+
+ if (F_ISSET(cp_opd, C_DELETED)) { /* Case #2. */
+ /*
+ * We will not have been provided a root page number. Acquire
+ * one from the primary database.
+ */
+ if ((h = cp->page) == NULL && (ret = __memp_fget(mpf, &cp->pgno,
+ dbc->thread_info, dbc->txn, 0, &h)) != 0)
+ goto err;
+ root_pgno = GET_BOVERFLOW(dbp, h, cp->indx + O_INDX)->pgno;
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority)) != 0)
+ goto err;
+ cp->page = NULL;
+
+ dbc_c = dbc_opd;
+ switch (dbc_opd->dbtype) {
+ case DB_BTREE:
+ if ((ret = __bam_ca_delete(
+ dbp, cp_opd->pgno, cp_opd->indx, 1, &count)) != 0)
+ goto err;
+ if (count == 0)
+ goto lock;
+ goto done;
+ case DB_RECNO:
+ if ((ret =
+ __ram_ca_delete(dbp, cp_opd->root, &count)) != 0)
+ goto err;
+ if (count == 0)
+ goto lock;
+ goto done;
+ case DB_HASH:
+ case DB_QUEUE:
+ case DB_UNKNOWN:
+ default:
+ ret = __db_unknown_type(
+ env, "DbCursor.close", dbc->dbtype);
+ goto err;
+ }
+ }
+ goto done;
+
+lock: cp_c = (BTREE_CURSOR *)dbc_c->internal;
+
+ /*
+ * If this is CDB, upgrade the lock if necessary. While we acquired
+ * the write lock to logically delete the record, we released it when
+ * we returned from that call, and so may not be holding a write lock
+ * at the moment.
+ */
+ if (CDB_LOCKING(env)) {
+ if (F_ISSET(dbc, DBC_WRITECURSOR)) {
+ if ((ret = __lock_get(env,
+ dbc->locker, DB_LOCK_UPGRADE, &dbc->lock_dbt,
+ DB_LOCK_WRITE, &dbc->mylock)) != 0)
+ goto err;
+ cdb_lock = 1;
+ }
+ goto do_del;
+ }
+
+ /*
+ * The variable dbc_c has been initialized to reference the cursor in
+ * which we're going to do the delete. Initialize the cursor's lock
+ * structures as necessary.
+ *
+ * First, we may not need to acquire any locks. If we're in case #3,
+ * that is, the primary database isn't a btree database, our caller
+ * is responsible for acquiring any necessary locks before calling us.
+ */
+ if (F_ISSET(dbc, DBC_OPD))
+ goto do_del;
+
+ /*
+ * Otherwise, acquire a write lock on the primary database's page.
+ *
+ * Lock the primary database page, regardless of whether we're deleting
+ * an item on a primary database page or an off-page duplicates page.
+ *
+ * If the cursor that did the initial logical deletion (and had a write
+ * lock) is not the same cursor doing the physical deletion (which may
+ * have only ever had a read lock on the item), we need to upgrade to a
+ * write lock. The confusion comes as follows:
+ *
+ * C1 created, acquires item read lock
+ * C2 dup C1, create C2, also has item read lock.
+ * C1 acquire write lock, delete item
+ * C1 close
+ * C2 close, needs a write lock to physically delete item.
+ *
+ * If we're in a TXN, we know that C2 will be able to acquire the write
+ * lock, because no locker other than the one shared by C1 and C2 can
+ * acquire a write lock -- the original write lock C1 acquired was never
+ * discarded.
+ *
+ * If we're not in a TXN, it's nastier. Other cursors might acquire
+ * read locks on the item after C1 closed, discarding its write lock,
+ * and such locks would prevent C2 from acquiring a read lock. That's
+ * OK, though, we'll simply wait until we can acquire a write lock, or
+ * we'll deadlock. (Which better not happen, since we're not in a TXN.)
+ *
+ * There are similar scenarios with dirty reads, where the cursor may
+ * have downgraded its write lock to a was-write lock.
+ */
+ if (STD_LOCKING(dbc))
+ if ((ret = __db_lget(dbc,
+ LCK_COUPLE, cp->pgno, DB_LOCK_WRITE, 0, &cp->lock)) != 0)
+ goto err;
+
+do_del: /*
+ * If the delete occurred in a Btree, we're going to look at the page
+ * to see if the item has to be physically deleted. Otherwise, we do
+ * not need the actual page (and it may not even exist, it might have
+ * been truncated from the file after an allocation aborted).
+ *
+ * Delete the on-page physical item referenced by the cursor.
+ */
+ if (F_ISSET(dbc_c, DBC_OPD))
+ LOCK_CHECK_OFF(dbc_c->thread_info);
+ if (dbc_c->dbtype == DB_BTREE) {
+ if ((ret = __memp_fget(mpf, &cp_c->pgno, dbc->thread_info,
+ dbc->txn, DB_MPOOL_DIRTY, &cp_c->page)) != 0)
+ goto err_c;
+ if ((ret = __bamc_physdel(dbc_c)) != 0)
+ goto err_c;
+ }
+
+ /*
+ * If we're not working in an off-page duplicate tree, then we're
+ * done.
+ */
+ if (!F_ISSET(dbc_c, DBC_OPD) || root_pgno == PGNO_INVALID)
+ goto done;
+
+ /*
+ * We may have just deleted the last element in the off-page duplicate
+ * tree, and closed the last cursor in the tree. For an off-page btree
+ * there are no other cursors in the tree by definition, if the tree is
+ * empty. For an off-page recno we know we have closed the last cursor
+ * in the tree because the __ram_ca_delete call above returned 0 only
+ * in that case. So, if the off-page duplicate tree is empty at this
+ * point, we want to remove it.
+ */
+ if (((h = dbc_c->internal->page) == NULL || h->pgno != root_pgno) &&
+ (ret = __memp_fget(mpf,
+ &root_pgno, dbc->thread_info, dbc->txn, 0, &h)) != 0)
+ goto err_c;
+ if ((count = NUM_ENT(h)) == 0) {
+ if (h != dbc_c->internal->page)
+ DISCARD_CUR(dbc_c, ret);
+ else
+ dbc_c->internal->page = NULL;
+ if (ret == 0)
+ ret = __db_free(dbc, h, 0);
+ } else if (h != dbc_c->internal->page)
+ ret = __memp_fput(mpf, dbc->thread_info, h, dbc->priority);
+
+err_c: if (F_ISSET(dbc_c, DBC_OPD))
+ LOCK_CHECK_ON(dbc_c->thread_info);
+ if (ret != 0)
+ goto err;
+
+ if (count != 0)
+ goto done;
+
+ /*
+ * When removing the tree, we have to do one of two things. If this is
+ * case #2, that is, the primary tree is a btree, delete the key that's
+ * associated with the tree from the btree leaf page. We know we are
+ * the only reference to it and we already have the correct lock. We
+ * detect this case because the cursor that was passed to us references
+ * an off-page duplicate cursor.
+ *
+ * If this is case #3, that is, the primary tree isn't a btree, pass
+ * the information back to our caller, it's their job to do cleanup on
+ * the primary page.
+ */
+ if (dbc_opd != NULL) {
+ if ((ret = __memp_fget(mpf, &cp->pgno, dbc->thread_info,
+ dbc->txn, DB_MPOOL_DIRTY, &cp->page)) != 0)
+ goto err;
+ if ((ret = __bamc_physdel(dbc)) != 0)
+ goto err;
+ } else
+ *rmroot = 1;
+err:
+done: /*
+ * Discard the page references and locks, and confirm that the stack
+ * has been emptied.
+ */
+ if (dbc_opd != NULL)
+ DISCARD_CUR(dbc_opd, ret);
+ DISCARD_CUR(dbc, ret);
+
+ /* Downgrade any CDB lock we acquired. */
+ if (cdb_lock)
+ (void)__lock_downgrade(env, &dbc->mylock, DB_LOCK_IWRITE, 0);
+
+ return (ret);
+}
+
+/*
+ * __bamc_cmp --
+ * Compare two btree cursors for equality.
+ *
+ * This function is only called with two cursors that point to the same item.
+ * It only distinguishes cursors pointing to deleted and undeleted items at
+ * the same location.
+ *
+ * PUBLIC: int __bamc_cmp __P((DBC *, DBC *, int *));
+ */
+int
+__bamc_cmp(dbc, other_dbc, result)
+ DBC *dbc, *other_dbc;
+ int *result;
+{
+ ENV *env;
+ BTREE_CURSOR *bcp, *obcp;
+
+ env = dbc->env;
+ bcp = (BTREE_CURSOR *)dbc->internal;
+ obcp = (BTREE_CURSOR *)other_dbc->internal;
+
+ DB_ASSERT(env, bcp->pgno == obcp->pgno);
+ DB_ASSERT(env, bcp->indx == obcp->indx);
+
+ /* Check to see if both cursors have the same deleted flag. */
+ *result =
+ ((F_ISSET(bcp, C_DELETED)) == F_ISSET(obcp, C_DELETED)) ? 0 : 1;
+ return (0);
+}
+
+/*
+ * __bamc_destroy --
+ * Close a single cursor -- internal version.
+ */
+static int
+__bamc_destroy(dbc)
+ DBC *dbc;
+{
+ BTREE_CURSOR *cp;
+ ENV *env;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+ env = dbc->env;
+
+ /* Discard the structures. */
+ if (cp->sp != cp->stack)
+ __os_free(env, cp->sp);
+
+#ifdef HAVE_COMPRESSION
+ /* Free the memory used for compression */
+ __os_free(env, cp->compressed.data);
+ __os_free(env, cp->key1.data);
+ __os_free(env, cp->key2.data);
+ __os_free(env, cp->data1.data);
+ __os_free(env, cp->data2.data);
+ __os_free(env, cp->del_key.data);
+ __os_free(env, cp->del_data.data);
+#endif
+
+ __os_free(env, cp);
+
+ return (0);
+}
+
+/*
+ * __bamc_count --
+ * Return a count of on and off-page duplicates.
+ *
+ * PUBLIC: int __bamc_count __P((DBC *, db_recno_t *));
+ */
+int
+__bamc_count(dbc, recnop)
+ DBC *dbc;
+ db_recno_t *recnop;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ db_indx_t indx, top;
+ db_recno_t recno;
+ int ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ /*
+ * Called with the top-level cursor that may reference an off-page
+ * duplicates tree. We don't have to acquire any new locks, we have
+ * to have a read lock to even get here.
+ */
+ if (cp->opd == NULL) {
+ /*
+ * On-page duplicates, get the page and count.
+ */
+ DB_ASSERT(dbp->env, cp->page == NULL);
+ if ((ret = __memp_fget(mpf, &cp->pgno,
+ dbc->thread_info, dbc->txn, 0, &cp->page)) != 0)
+ return (ret);
+
+ /*
+ * Move back to the beginning of the set of duplicates and
+ * then count forward.
+ */
+ for (indx = cp->indx;; indx -= P_INDX)
+ if (indx == 0 ||
+ !IS_DUPLICATE(dbc, indx, indx - P_INDX))
+ break;
+ for (recno = 0,
+ top = NUM_ENT(cp->page) - P_INDX;; indx += P_INDX) {
+ if (!IS_DELETED(dbp, cp->page, indx))
+ ++recno;
+ if (indx == top ||
+ !IS_DUPLICATE(dbc, indx, indx + P_INDX))
+ break;
+ }
+ } else {
+ /*
+ * Off-page duplicates tree, get the root page of the off-page
+ * duplicate tree.
+ */
+ if ((ret = __memp_fget(mpf, &cp->opd->internal->root,
+ dbc->thread_info, dbc->txn, 0, &cp->page)) != 0)
+ return (ret);
+
+ /*
+ * If the page is an internal page use the page's count as it's
+ * up-to-date and reflects the status of cursors in the tree.
+ * If the page is a leaf page for unsorted duplicates, use the
+ * page's count as well: cursors don't mark items deleted on such
+ * a page and wait, they delete items immediately.
+ * If the page is a leaf page for sorted duplicates, there may
+ * be cursors on the page marking deleted items -- count by hand.
+ */
+ if (TYPE(cp->page) == P_LDUP)
+ for (recno = 0, indx = 0,
+ top = NUM_ENT(cp->page) - O_INDX;; indx += O_INDX) {
+ if (!IS_DELETED(dbp, cp->page, indx))
+ ++recno;
+ if (indx == top)
+ break;
+ }
+ else
+ recno = RE_NREC(cp->page);
+ }
+
+ *recnop = recno;
+
+ ret = __memp_fput(mpf, dbc->thread_info, cp->page, dbc->priority);
+ cp->page = NULL;
+
+ return (ret);
+}
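+
+/*
+ * Illustrative sketch (not from the original source): this function
+ * backs the public DBcursor->count method.  Typical use, with error
+ * handling omitted:
+ *
+ *	db_recno_t count;
+ *
+ *	dbc->get(dbc, &key, &data, DB_SET);
+ *	dbc->count(dbc, &count, 0);
+ *	(count is the number of undeleted duplicates of key)
+ */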
+
+/*
+ * __bamc_del --
+ * Delete using a cursor.
+ */
+static int
+__bamc_del(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ int ret, t_ret;
+ u_int32_t count;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ ret = 0;
+ COMPQUIET(flags, 0);
+
+ /* If the item was already deleted, return failure. */
+ if (F_ISSET(cp, C_DELETED))
+ return (DB_KEYEMPTY);
+
+ /*
+ * This code is always called with a page lock but no page.
+ */
+ DB_ASSERT(dbp->env, cp->page == NULL);
+
+ if (F_ISSET(dbc, DBC_OPD))
+ LOCK_CHECK_OFF(dbc->thread_info);
+
+ /*
+ * We don't physically delete the record until the cursor moves, so
+ * we have to have a long-lived write lock on the page instead of
+ * a long-lived read lock. Note, we have to have a read lock to even
+ * get here.
+ *
+ * If we're maintaining record numbers, we lock the entire tree, else
+ * we lock the single page.
+ */
+ if (F_ISSET(cp, C_RECNUM)) {
+ if ((ret = __bamc_getstack(dbc)) != 0)
+ goto err;
+ cp->page = cp->csp->page;
+ } else {
+ ACQUIRE_CUR(dbc, DB_LOCK_WRITE, cp->pgno, 0, ret);
+ if (ret != 0)
+ goto err;
+ }
+
+ /* Mark the page dirty. */
+ if ((ret = __memp_dirty(mpf,
+ &cp->page, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ goto err;
+
+ /* Log the change. */
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __bam_cdel_log(dbp, dbc->txn, &LSN(cp->page), 0,
+ PGNO(cp->page), &LSN(cp->page), cp->indx)) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(cp->page));
+
+ /* Set the intent-to-delete flag on the page. */
+ if (TYPE(cp->page) == P_LBTREE)
+ B_DSET(GET_BKEYDATA(dbp, cp->page, cp->indx + O_INDX)->type);
+ else
+ B_DSET(GET_BKEYDATA(dbp, cp->page, cp->indx)->type);
+
+err: /*
+ * If we've been successful so far and the tree has record numbers,
+ * adjust the record counts. Either way, release acquired page(s).
+ */
+ if (F_ISSET(cp, C_RECNUM)) {
+ cp->csp->page = cp->page;
+ if (ret == 0)
+ ret = __bam_adjust(dbc, -1);
+ (void)__bam_stkrel(dbc, 0);
+ } else
+ if (cp->page != NULL &&
+ (t_ret = __memp_fput(mpf, dbc->thread_info,
+ cp->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ cp->page = NULL;
+
+ /*
+ * Update the cursors last, after all chance of recoverable failure
+ * is past.
+ */
+ if (ret == 0)
+ ret = __bam_ca_delete(dbp, cp->pgno, cp->indx, 1, &count);
+
+ if (F_ISSET(dbc, DBC_OPD))
+ LOCK_CHECK_ON(dbc->thread_info);
+ return (ret);
+}
+
+/*
+ * __bamc_dup --
+ * Duplicate a btree cursor, such that the new one holds appropriate
+ * locks for the position of the original.
+ *
+ * PUBLIC: int __bamc_dup __P((DBC *, DBC *, u_int32_t));
+ */
+int
+__bamc_dup(orig_dbc, new_dbc, flags)
+ DBC *orig_dbc, *new_dbc;
+ u_int32_t flags;
+{
+ BTREE_CURSOR *orig, *new;
+
+ orig = (BTREE_CURSOR *)orig_dbc->internal;
+ new = (BTREE_CURSOR *)new_dbc->internal;
+
+ new->ovflsize = orig->ovflsize;
+ new->recno = orig->recno;
+ new->flags = orig->flags;
+
+#ifdef HAVE_COMPRESSION
+ /* Copy the compression state */
+ return (__bamc_compress_dup(orig_dbc, new_dbc, flags));
+#else
+ COMPQUIET(flags, 0);
+
+ return (0);
+#endif
+}
+
+/*
+ * __bamc_get --
+ * Get using a cursor (btree).
+ */
+static int
+__bamc_get(dbc, key, data, flags, pgnop)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+ db_pgno_t *pgnop;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ db_pgno_t orig_pgno;
+ db_indx_t orig_indx;
+ int exact, newopd, ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ orig_pgno = cp->pgno;
+ orig_indx = cp->indx;
+
+ newopd = 0;
+ switch (flags) {
+ case DB_CURRENT:
+ /* It's not possible to return a deleted record. */
+ if (F_ISSET(cp, C_DELETED)) {
+ ret = DB_KEYEMPTY;
+ goto err;
+ }
+
+ /*
+ * Acquire the current page. We have at least a read-lock
+ * already. The caller may have set DB_RMW asking for a
+ * write lock, but upgrading to a write lock has no better
+ * chance of succeeding now instead of later, so don't try.
+ */
+ if ((ret = __memp_fget(mpf, &cp->pgno,
+ dbc->thread_info, dbc->txn, 0, &cp->page)) != 0)
+ goto err;
+ break;
+ case DB_FIRST:
+ newopd = 1;
+ if ((ret = __bamc_search(dbc,
+ PGNO_INVALID, NULL, flags, &exact)) != 0)
+ goto err;
+ break;
+ case DB_GET_BOTH:
+ case DB_GET_BOTH_RANGE:
+ /*
+ * There are two ways to get here based on DBcursor->get
+ * with the DB_GET_BOTH/DB_GET_BOTH_RANGE flags set:
+ *
+ * 1. Searching a sorted off-page duplicate tree: do a tree
+ * search.
+ *
+ * 2. Searching btree: do a tree search. If it returns a
+ * reference to off-page duplicate tree, return immediately
+ * and let our caller deal with it. If the search doesn't
+ * return a reference to off-page duplicate tree, continue
+ * with an on-page search.
+ */
+ if (F_ISSET(dbc, DBC_OPD)) {
+ if ((ret = __bamc_search(
+ dbc, PGNO_INVALID, data, flags, &exact)) != 0)
+ goto err;
+ if (flags == DB_GET_BOTH) {
+ if (!exact) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ break;
+ }
+
+ /*
+ * We didn't require an exact match, so the search may
+ * have returned an entry past the end of the page,
+ * or we may be referencing a deleted record. If so,
+ * move to the next entry.
+ */
+ if ((cp->indx == NUM_ENT(cp->page) ||
+ IS_CUR_DELETED(dbc)) &&
+ (ret = __bamc_next(dbc, 1, 0)) != 0)
+ goto err;
+ } else {
+ if ((ret = __bamc_search(
+ dbc, PGNO_INVALID, key, flags, &exact)) != 0)
+ return (ret);
+ if (!exact) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+
+ if (pgnop != NULL && __bam_isopd(dbc, pgnop)) {
+ newopd = 1;
+ break;
+ }
+ if ((ret =
+ __bam_getboth_finddatum(dbc, data, flags)) != 0)
+ goto err;
+ }
+ break;
+#ifdef HAVE_COMPRESSION
+ case DB_SET_LTE:
+ if ((ret = __bam_getlte(dbc, key, NULL)) != 0)
+ goto err;
+ break;
+ case DB_GET_BOTH_LTE:
+ if ((ret = __bam_getlte(dbc, key, data)) != 0)
+ goto err;
+ break;
+#endif
+ case DB_GET_BOTHC:
+ if ((ret = __bam_getbothc(dbc, data)) != 0)
+ goto err;
+ break;
+ case DB_LAST:
+ newopd = 1;
+ if ((ret = __bamc_search(dbc,
+ PGNO_INVALID, NULL, flags, &exact)) != 0)
+ goto err;
+ break;
+ case DB_NEXT:
+ newopd = 1;
+ if (cp->pgno == PGNO_INVALID) {
+ if ((ret = __bamc_search(dbc,
+ PGNO_INVALID, NULL, DB_FIRST, &exact)) != 0)
+ goto err;
+ } else
+ if ((ret = __bamc_next(dbc, 1, 0)) != 0)
+ goto err;
+ break;
+ case DB_NEXT_DUP:
+ if ((ret = __bamc_next(dbc, 1, 0)) != 0)
+ goto err;
+ if (!IS_CUR_DUPLICATE(dbc, orig_pgno, orig_indx)) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ break;
+ case DB_NEXT_NODUP:
+ newopd = 1;
+ if (cp->pgno == PGNO_INVALID) {
+ if ((ret = __bamc_search(dbc,
+ PGNO_INVALID, NULL, DB_FIRST, &exact)) != 0)
+ goto err;
+ } else
+ do {
+ if ((ret = __bamc_next(dbc, 1, 0)) != 0)
+ goto err;
+ } while (IS_CUR_DUPLICATE(dbc, orig_pgno, orig_indx));
+ break;
+ case DB_PREV:
+ newopd = 1;
+ if (cp->pgno == PGNO_INVALID) {
+ if ((ret = __bamc_search(dbc,
+ PGNO_INVALID, NULL, DB_LAST, &exact)) != 0)
+ goto err;
+ } else
+ if ((ret = __bamc_prev(dbc)) != 0)
+ goto err;
+ break;
+ case DB_PREV_DUP:
+ if ((ret = __bamc_prev(dbc)) != 0)
+ goto err;
+ if (!IS_CUR_DUPLICATE(dbc, orig_pgno, orig_indx)) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ break;
+ case DB_PREV_NODUP:
+ newopd = 1;
+ if (cp->pgno == PGNO_INVALID) {
+ if ((ret = __bamc_search(dbc,
+ PGNO_INVALID, NULL, DB_LAST, &exact)) != 0)
+ goto err;
+ } else
+ do {
+ if ((ret = __bamc_prev(dbc)) != 0)
+ goto err;
+ } while (IS_CUR_DUPLICATE(dbc, orig_pgno, orig_indx));
+ break;
+ case DB_SET:
+ case DB_SET_RECNO:
+ newopd = 1;
+ if ((ret = __bamc_search(dbc,
+ PGNO_INVALID, key, flags, &exact)) != 0)
+ goto err;
+ break;
+ case DB_SET_RANGE:
+ newopd = 1;
+ if ((ret = __bamc_search(dbc,
+ PGNO_INVALID, key, flags, &exact)) != 0)
+ goto err;
+
+ /*
+ * As we didn't require an exact match, the search function
+ * may have returned an entry past the end of the page. Or,
+ * we may be referencing a deleted record. If so, move to
+ * the next entry.
+ */
+ if (cp->indx == NUM_ENT(cp->page) || IS_CUR_DELETED(dbc))
+ if ((ret = __bamc_next(dbc, 0, 0)) != 0)
+ goto err;
+ break;
+ default:
+ ret = __db_unknown_flag(dbp->env, "__bamc_get", flags);
+ goto err;
+ }
+
+ /*
+ * We may have moved to an off-page duplicate tree. Return that
+ * information to our caller.
+ */
+ if (newopd && pgnop != NULL)
+ (void)__bam_isopd(dbc, pgnop);
+
+err: /*
+ * Regardless of whether we were successful or not, if the cursor
+ * moved, clear the delete flag, DBcursor->get never references a
+ * deleted key, if it moved at all.
+ */
+ if (F_ISSET(cp, C_DELETED) &&
+ (cp->pgno != orig_pgno || cp->indx != orig_indx))
+ F_CLR(cp, C_DELETED);
+
+ return (ret);
+}
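+
+/*
+ * Illustrative sketch (not from the original source): the DB_SET_RANGE
+ * case above is what positions a cursor on the smallest key greater
+ * than or equal to a search key:
+ *
+ *	DBT key, data;
+ *
+ *	memset(&key, 0, sizeof(key));
+ *	memset(&data, 0, sizeof(data));
+ *	key.data = "app";
+ *	key.size = 3;
+ *	ret = dbc->get(dbc, &key, &data, DB_SET_RANGE);
+ *	(on success, key/data hold the first pair with key >= "app";
+ *	an inexact match that lands past the last page entry falls
+ *	through to __bamc_next, as coded above)
+ */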
+
+static int
+__bam_get_prev(dbc)
+ DBC *dbc;
+{
+ BTREE_CURSOR *cp;
+ DBT key, data;
+ db_pgno_t pgno;
+ int ret;
+
+ if ((ret = __bamc_prev(dbc)) != 0)
+ return (ret);
+
+ if (__bam_isopd(dbc, &pgno)) {
+ cp = (BTREE_CURSOR *)dbc->internal;
+ if ((ret = __dbc_newopd(dbc, pgno, cp->opd, &cp->opd)) != 0)
+ return (ret);
+ if ((ret = cp->opd->am_get(cp->opd,
+ &key, &data, DB_LAST, NULL)) != 0)
+ return (ret);
+ }
+
+ return (0);
+}
+
+/*
+ * __bam_bulk -- Return bulk data from a btree.
+ */
+static int
+__bam_bulk(dbc, data, flags)
+ DBC *dbc;
+ DBT *data;
+ u_int32_t flags;
+{
+ BKEYDATA *bk;
+ BOVERFLOW *bo;
+ BTREE_CURSOR *cp;
+ PAGE *pg;
+ db_indx_t *inp, indx, pg_keyoff;
+ int32_t *endp, key_off, *offp, *saveoffp;
+ u_int8_t *dbuf, *dp, *np;
+ u_int32_t key_size, pagesize, size, space;
+ int adj, is_key, need_pg, next_key, no_dup, rec_key, ret;
+
+ ret = 0;
+ key_off = 0;
+ size = 0;
+ pagesize = dbc->dbp->pgsize;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ /*
+ * dp tracks the beginning of the page in the buffer.
+ * np is the next place to copy things into the buffer.
+ * dbuf always stays at the beginning of the buffer.
+ */
+ dbuf = data->data;
+ np = dp = dbuf;
+
+ /* Keep track of space that is left. There is a termination entry. */
+ space = data->ulen;
+ space -= sizeof(*offp);
+
+ /* Build the offset/size table from the end up. */
+ endp = (int32_t *)((u_int8_t *)dbuf + data->ulen);
+ endp--;
+ offp = endp;
+
+ key_size = 0;
+
+ /*
+ * Distinguish between BTREE and RECNO.
+ * There are no keys in RECNO. If MULTIPLE_KEY is specified
+ * then we return the record numbers.
+ * is_key indicates that multiple btree keys are returned.
+ * rec_key is set if we are returning record numbers.
+ * next_key is set if we are going after the next key rather than dup.
+ */
+ if (dbc->dbtype == DB_BTREE) {
+ is_key = LF_ISSET(DB_MULTIPLE_KEY) ? 1 : 0;
+ rec_key = 0;
+ next_key = is_key && LF_ISSET(DB_OPFLAGS_MASK) != DB_NEXT_DUP;
+ adj = 2;
+ } else {
+ is_key = 0;
+ rec_key = LF_ISSET(DB_MULTIPLE_KEY) ? 1 : 0;
+ next_key = LF_ISSET(DB_OPFLAGS_MASK) != DB_NEXT_DUP;
+ adj = 1;
+ }
+ no_dup = LF_ISSET(DB_OPFLAGS_MASK) == DB_NEXT_NODUP;
+
+next_pg:
+ indx = cp->indx;
+ pg = cp->page;
+
+ inp = P_INP(dbc->dbp, pg);
+ /* The current page is not yet in the buffer. */
+ need_pg = 1;
+
+ /*
+ * Keep track of the offset of the current key on the page.
+ * If we are returning keys, set it to 0 first so we force
+ * the copy of the key to the buffer.
+ */
+ pg_keyoff = 0;
+ if (is_key == 0)
+ pg_keyoff = inp[indx];
+
+ do {
+ if (IS_DELETED(dbc->dbp, pg, indx)) {
+ if (dbc->dbtype != DB_RECNO)
+ continue;
+
+ cp->recno++;
+ /*
+ * If we are not returning recnos then we
+ * need to fill in every slot so the user
+ * can calculate the record numbers.
+ */
+ if (rec_key != 0)
+ continue;
+
+ space -= 2 * sizeof(*offp);
+ /* Check if space has underflowed. */
+ if (space > data->ulen)
+ goto back_up;
+
+ /* Just mark the empty recno slots. */
+ *offp-- = 0;
+ *offp-- = 0;
+ continue;
+ }
+
+ /*
+ * Check to see if we have a new key.
+ * If so, then see if we need to put the
+ * key into the buffer. If it's already there
+ * then we just point to it.
+ */
+ if (is_key && pg_keyoff != inp[indx]) {
+ bk = GET_BKEYDATA(dbc->dbp, pg, indx);
+ if (B_TYPE(bk->type) == B_OVERFLOW) {
+ bo = (BOVERFLOW *)bk;
+ size = key_size = bo->tlen;
+ if (key_size > space)
+ goto get_key_space;
+ if ((ret = __bam_bulk_overflow(dbc,
+ bo->tlen, bo->pgno, np)) != 0)
+ return (ret);
+ space -= key_size;
+ key_off = (int32_t)(np - dbuf);
+ np += key_size;
+ } else {
+ if (need_pg) {
+ dp = np;
+ size = pagesize - HOFFSET(pg);
+ if (space < size) {
+get_key_space:
+ /* If nothing was added, then error. */
+ if (offp == endp) {
+ data->size = (u_int32_t)
+ DB_ALIGN(size +
+ pagesize, 1024);
+ return
+ (DB_BUFFER_SMALL);
+ }
+ /*
+ * We need to back up to the
+ * last record put into the
+ * buffer so that it is
+ * CURRENT.
+ */
+ if (indx != 0)
+ indx -= P_INDX;
+ else {
+ if ((ret =
+ __bam_get_prev(
+ dbc)) != 0)
+ return (ret);
+ indx = cp->indx;
+ pg = cp->page;
+ }
+ break;
+ }
+ /*
+ * Move the data part of the page
+ * to the buffer.
+ */
+ memcpy(dp,
+ (u_int8_t *)pg + HOFFSET(pg), size);
+ need_pg = 0;
+ space -= size;
+ np += size;
+ }
+ key_size = bk->len;
+ key_off = (int32_t)((inp[indx] - HOFFSET(pg))
+ + (dp - dbuf) + SSZA(BKEYDATA, data));
+ pg_keyoff = inp[indx];
+ }
+ }
+
+ /*
+ * Reserve space for the pointers and sizes.
+ * Either key/data pair or just for a data item.
+ */
+ space -= (is_key ? 4 : 2) * sizeof(*offp);
+ if (rec_key)
+ space -= sizeof(*offp);
+
+ /* Check to see if space has underflowed. */
+ if (space > data->ulen)
+ goto back_up;
+
+ /*
+ * Determine if the next record is in the
+ * buffer already or if it needs to be copied in.
+ * If we have an off page dup, then copy as many
+ * as will fit into the buffer.
+ */
+ bk = GET_BKEYDATA(dbc->dbp, pg, indx + adj - 1);
+ if (B_TYPE(bk->type) == B_DUPLICATE) {
+ bo = (BOVERFLOW *)bk;
+ if (is_key) {
+ *offp-- = (int32_t)key_off;
+ *offp-- = (int32_t)key_size;
+ }
+ /*
+ * We pass the offset of the current key.
+ * On return we check to see if offp has
+ * moved to see if any data fit.
+ */
+ saveoffp = offp;
+ if ((ret = __bam_bulk_duplicates(dbc, bo->pgno,
+ dbuf, is_key ? offp + P_INDX : NULL,
+ &offp, &np, &space, no_dup)) != 0) {
+ if (ret == DB_BUFFER_SMALL) {
+ size = space;
+ space = 0;
+ /* If nothing was added, then error. */
+ if (offp == saveoffp) {
+ offp += 2;
+ goto back_up;
+ }
+ goto get_space;
+ }
+ return (ret);
+ }
+ } else if (B_TYPE(bk->type) == B_OVERFLOW) {
+ bo = (BOVERFLOW *)bk;
+ size = bo->tlen;
+ if (size > space)
+ goto back_up;
+ if ((ret =
+ __bam_bulk_overflow(dbc,
+ bo->tlen, bo->pgno, np)) != 0)
+ return (ret);
+ space -= size;
+ if (is_key) {
+ *offp-- = (int32_t)key_off;
+ *offp-- = (int32_t)key_size;
+ } else if (rec_key)
+ *offp-- = (int32_t)cp->recno;
+ *offp-- = (int32_t)(np - dbuf);
+ np += size;
+ *offp-- = (int32_t)size;
+ } else {
+ if (need_pg) {
+ dp = np;
+ size = pagesize - HOFFSET(pg);
+ if (space < size) {
+back_up:
+ /*
+ * Back up the index so that the
+ * last record in the buffer is CURRENT
+ */
+ if (indx >= adj)
+ indx -= adj;
+ else {
+ if ((ret =
+ __bam_get_prev(dbc)) != 0 &&
+ ret != DB_NOTFOUND)
+ return (ret);
+ indx = cp->indx;
+ pg = cp->page;
+ }
+ if (dbc->dbtype == DB_RECNO)
+ cp->recno--;
+get_space:
+ /*
+ * Error if nothing was added to the
+ * buffer, or if this is a DBP->get,
+ * which must return all of the data.
+ */
+ if (offp >=
+ (is_key ? &endp[-1] : endp) ||
+ F_ISSET(dbc, DBC_FROM_DB_GET)) {
+ data->size = (u_int32_t)
+ DB_ALIGN(size +
+ data->ulen - space, 1024);
+ return (DB_BUFFER_SMALL);
+ }
+ break;
+ }
+ memcpy(dp, (u_int8_t *)pg + HOFFSET(pg), size);
+ need_pg = 0;
+ space -= size;
+ np += size;
+ }
+ /*
+ * Add the offsets and sizes to the end of the buffer.
+ * First add the key info then the data info.
+ */
+ if (is_key) {
+ *offp-- = (int32_t)key_off;
+ *offp-- = (int32_t)key_size;
+ } else if (rec_key)
+ *offp-- = (int32_t)cp->recno;
+ *offp-- = (int32_t)((inp[indx + adj - 1] - HOFFSET(pg))
+ + (dp - dbuf) + SSZA(BKEYDATA, data));
+ *offp-- = bk->len;
+ }
+ if (dbc->dbtype == DB_RECNO)
+ cp->recno++;
+ else if (no_dup) {
+ while (indx + adj < NUM_ENT(pg) &&
+ pg_keyoff == inp[indx + adj])
+ indx += adj;
+ }
+ /*
+ * Stop when we either run off the page or we move to the next key and
+ * we are not returning multiple keys.
+ */
+ } while ((indx += adj) < NUM_ENT(pg) &&
+ (next_key || pg_keyoff == inp[indx]));
+
+ /* If we ran off the page, try the next page. */
+ if (ret == 0 && next_key && indx >= NUM_ENT(pg)) {
+ cp->indx = indx;
+ ret = __bamc_next(dbc, 0, 1);
+ if (ret == 0)
+ goto next_pg;
+ if (ret != DB_NOTFOUND)
+ return (ret);
+ }
+
+ /*
+ * If we did a DBP->get, we must return an error if we did not
+ * return all the data for the current key: the caller has no
+ * way to know the fetch was incomplete, nor any interface to
+ * fetch the balance.
+ */
+
+ if (ret == 0 && indx < pg->entries &&
+ F_ISSET(dbc, DBC_TRANSIENT) && pg_keyoff == inp[indx]) {
+ data->size = (data->ulen - space) + size;
+ return (DB_BUFFER_SMALL);
+ }
+ /*
+ * Must leave the index pointing at the last record fetched.
+ * If we are not fetching keys, we may have stepped to the
+ * next key.
+ */
+ if (ret == DB_BUFFER_SMALL || next_key || pg_keyoff == inp[indx])
+ cp->indx = indx;
+ else
+ cp->indx = indx - P_INDX;
+
+ if (rec_key == 1)
+ *offp = RECNO_OOB;
+ else
+ *offp = -1;
+ return (0);
+}
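+
+/*
+ * Illustrative sketch (not from the original source): __bam_bulk fills
+ * a user buffer with page data and builds the offset/length table
+ * backward from the buffer's end.  Applications drive it through
+ * DBcursor->get with DB_MULTIPLE_KEY and unpack with the DB_MULTIPLE
+ * macros; the buffer size below is a hypothetical choice:
+ *
+ *	DBT key, data;
+ *	void *p, *kp, *dp;
+ *	size_t klen, dlen;
+ *	u_int8_t buf[64 * 1024];
+ *
+ *	memset(&key, 0, sizeof(key));
+ *	memset(&data, 0, sizeof(data));
+ *	data.data = buf;
+ *	data.ulen = sizeof(buf);
+ *	data.flags = DB_DBT_USERMEM;
+ *	while (dbc->get(dbc, &key, &data,
+ *	    DB_MULTIPLE_KEY | DB_NEXT) == 0) {
+ *		DB_MULTIPLE_INIT(p, &data);
+ *		for (;;) {
+ *			DB_MULTIPLE_KEY_NEXT(p, &data,
+ *			    kp, klen, dp, dlen);
+ *			if (kp == NULL)
+ *				break;
+ *			(process kp/klen and dp/dlen)
+ *		}
+ *	}
+ */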
+
+/*
+ * __bam_bulk_overflow --
+ * Dump overflow record into the buffer.
+ * The space requirements have already been checked.
+ * PUBLIC: int __bam_bulk_overflow
+ * PUBLIC: __P((DBC *, u_int32_t, db_pgno_t, u_int8_t *));
+ */
+int
+__bam_bulk_overflow(dbc, len, pgno, dp)
+ DBC *dbc;
+ u_int32_t len;
+ db_pgno_t pgno;
+ u_int8_t *dp;
+{
+ DBT dbt;
+
+ memset(&dbt, 0, sizeof(dbt));
+ F_SET(&dbt, DB_DBT_USERMEM);
+ dbt.ulen = len;
+ dbt.data = (void *)dp;
+ return (__db_goff(dbc, &dbt, len, pgno, NULL, NULL));
+}
+
+/*
+ * __bam_bulk_duplicates --
+ *	Put as many off-page duplicates as will fit into the buffer.
+ * This routine will adjust the cursor to reflect the position in
+ * the off-page duplicate tree.
+ * PUBLIC: int __bam_bulk_duplicates __P((DBC *,
+ * PUBLIC: db_pgno_t, u_int8_t *, int32_t *,
+ * PUBLIC: int32_t **, u_int8_t **, u_int32_t *, int));
+ */
+int
+__bam_bulk_duplicates(dbc, pgno, dbuf, keyoff, offpp, dpp, spacep, no_dup)
+ DBC *dbc;
+ db_pgno_t pgno;
+ u_int8_t *dbuf;
+ int32_t *keyoff, **offpp;
+ u_int8_t **dpp;
+ u_int32_t *spacep;
+ int no_dup;
+{
+ BKEYDATA *bk;
+ BOVERFLOW *bo;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DBC *opd;
+ DBT key, data;
+ PAGE *pg;
+ db_indx_t indx, *inp;
+ int32_t *offp;
+ u_int32_t pagesize, size, space;
+ u_int8_t *dp, *np;
+ int first, need_pg, ret, t_ret;
+
+ ret = 0;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ opd = cp->opd;
+
+ if (opd == NULL) {
+ if ((ret = __dbc_newopd(dbc, pgno, NULL, &opd)) != 0)
+ return (ret);
+ cp->opd = opd;
+ if ((ret = opd->am_get(opd,
+ &key, &data, DB_FIRST, NULL)) != 0)
+ goto close_opd;
+ }
+
+ pagesize = opd->dbp->pgsize;
+ cp = (BTREE_CURSOR *)opd->internal;
+ space = *spacep;
+ /* Get current offset slot. */
+ offp = *offpp;
+
+ /*
+ * np is the next place to put data.
+ * dp is the beginning of the current page in the buffer.
+ */
+ np = dp = *dpp;
+ first = 1;
+ indx = cp->indx;
+
+ do {
+ /* Fetch the current record. No initial move. */
+ if ((ret = __bamc_next(opd, 0, 0)) != 0)
+ break;
+ pg = cp->page;
+ indx = cp->indx;
+ inp = P_INP(dbp, pg);
+ /* We need to copy the page to the buffer. */
+ need_pg = 1;
+
+ do {
+ if (IS_DELETED(dbp, pg, indx))
+ goto contin;
+ bk = GET_BKEYDATA(dbp, pg, indx);
+ space -= 2 * sizeof(*offp);
+ /* Allocate space for key if needed. */
+ if (first == 0 && keyoff != NULL)
+ space -= 2 * sizeof(*offp);
+
+ /* Did space underflow? */
+ if (space > *spacep) {
+ ret = DB_BUFFER_SMALL;
+ if (first == 1) {
+ /* Get the absolute value. */
+ space = -(int32_t)space;
+ space = *spacep + space;
+ if (need_pg)
+ space += pagesize - HOFFSET(pg);
+ }
+ break;
+ }
+ if (B_TYPE(bk->type) == B_OVERFLOW) {
+ bo = (BOVERFLOW *)bk;
+ size = bo->tlen;
+ if (size > space) {
+ ret = DB_BUFFER_SMALL;
+ space = *spacep + size;
+ break;
+ }
+ if (first == 0 && keyoff != NULL) {
+ *offp-- = keyoff[0];
+ *offp-- = keyoff[-1];
+ }
+ if ((ret = __bam_bulk_overflow(dbc,
+ bo->tlen, bo->pgno, np)) != 0)
+ return (ret);
+ space -= size;
+ *offp-- = (int32_t)(np - dbuf);
+ np += size;
+ } else {
+ if (need_pg) {
+ dp = np;
+ size = pagesize - HOFFSET(pg);
+ if (space < size) {
+ ret = DB_BUFFER_SMALL;
+ /* Return space required. */
+ space = *spacep + size;
+ break;
+ }
+ memcpy(dp,
+ (u_int8_t *)pg + HOFFSET(pg), size);
+ need_pg = 0;
+ space -= size;
+ np += size;
+ }
+ if (first == 0 && keyoff != NULL) {
+ *offp-- = keyoff[0];
+ *offp-- = keyoff[-1];
+ }
+ size = bk->len;
+ *offp-- = (int32_t)((inp[indx] - HOFFSET(pg))
+ + (dp - dbuf) + SSZA(BKEYDATA, data));
+ }
+ *offp-- = (int32_t)size;
+ first = 0;
+ if (no_dup)
+ break;
+contin:
+ indx++;
+ if (opd->dbtype == DB_RECNO)
+ cp->recno++;
+ } while (indx < NUM_ENT(pg));
+ if (no_dup)
+ break;
+ cp->indx = indx;
+
+ } while (ret == 0);
+
+ /* Return the updated information. */
+ *spacep = space;
+ *offpp = offp;
+ *dpp = np;
+
+ /*
+	 * If we ran out of space, back up the cursor.
+	 * If we did not return any dups, or we reached the end, close the
+	 * opd cursor.
+ */
+ if (ret == DB_BUFFER_SMALL) {
+ if (opd->dbtype == DB_RECNO) {
+ if (--cp->recno == 0)
+ goto close_opd;
+ } else if (indx != 0)
+ cp->indx--;
+ else {
+ t_ret = __bamc_prev(opd);
+ if (t_ret == DB_NOTFOUND)
+ goto close_opd;
+ if (t_ret != 0)
+ ret = t_ret;
+ }
+ } else if (keyoff == NULL && ret == DB_NOTFOUND) {
+ cp->indx--;
+ if (opd->dbtype == DB_RECNO)
+ --cp->recno;
+ } else if (indx == 0 || ret == DB_NOTFOUND) {
+close_opd:
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+ if ((t_ret = __dbc_close(opd)) != 0 && ret == 0)
+ ret = t_ret;
+ ((BTREE_CURSOR *)dbc->internal)->opd = NULL;
+ }
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+
+ return (ret);
+}
+
+/*
+ * __bam_getbothc --
+ * Search for a matching data item on a join.
+ */
+static int
+__bam_getbothc(dbc, data)
+ DBC *dbc;
+ DBT *data;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ int cmp, exact, ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ /*
+ * Acquire the current page. We have at least a read-lock
+ * already. The caller may have set DB_RMW asking for a
+	 * write lock, but upgrading to a write lock has no better
+	 * chance of succeeding now than later, so don't try.
+ */
+ if ((ret = __memp_fget(mpf, &cp->pgno,
+ dbc->thread_info, dbc->txn, 0, &cp->page)) != 0)
+ return (ret);
+
+ /*
+ * An off-page duplicate cursor. Search the remaining duplicates
+ * for one which matches (do a normal btree search, then verify
+ * that the retrieved record is greater than the original one).
+ */
+ if (F_ISSET(dbc, DBC_OPD)) {
+ /*
+ * Check to make sure the desired item comes strictly after
+ * the current position; if it doesn't, return DB_NOTFOUND.
+ */
+ if ((ret = __bam_cmp(dbc, data, cp->page, cp->indx,
+ dbp->dup_compare == NULL ? __bam_defcmp : dbp->dup_compare,
+ &cmp)) != 0)
+ return (ret);
+
+ if (cmp <= 0)
+ return (DB_NOTFOUND);
+
+ /* Discard the current page, we're going to do a full search. */
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, cp->page, dbc->priority)) != 0)
+ return (ret);
+ cp->page = NULL;
+
+ return (__bamc_search(dbc,
+ PGNO_INVALID, data, DB_GET_BOTH, &exact));
+ }
+
+ /*
+ * We're doing a DBC->get(DB_GET_BOTHC) and we're already searching
+ * a set of on-page duplicates (either sorted or unsorted). Continue
+ * a linear search from after the current position.
+ *
+ * (Note that we could have just finished a "set" of one duplicate,
+ * i.e. not a duplicate at all, but the following check will always
+ * return DB_NOTFOUND in this case, which is the desired behavior.)
+ */
+ if (cp->indx + P_INDX >= NUM_ENT(cp->page) ||
+ !IS_DUPLICATE(dbc, cp->indx, cp->indx + P_INDX))
+ return (DB_NOTFOUND);
+ cp->indx += P_INDX;
+
+ return (__bam_getboth_finddatum(dbc, data, DB_GET_BOTH));
+}
+
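+/*
+ * Application-side sketch (illustrative, not part of the original source):
+ * the DB_GET_BOTH family of flags handled above is driven from DBC->get,
+ * e.g. positioning the cursor on an exact key/data pair:
+ *
+ *	memset(&key, 0, sizeof(key));
+ *	memset(&data, 0, sizeof(data));
+ *	key.data = "fruit";
+ *	key.size = sizeof("fruit") - 1;
+ *	data.data = "apple";
+ *	data.size = sizeof("apple") - 1;
+ *	ret = dbc->get(dbc, &key, &data, DB_GET_BOTH);
+ */
+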
+#ifdef HAVE_COMPRESSION
+/*
+ * __bam_getlte --
+ * Search for the largest entry <= key/data - used by compression.
+ *
+ * data == NULL indicates the DB_SET_LTE flag
+ * data != NULL indicates the DB_GET_BOTH_LTE flag
+ *
+ * Only works for a primary cursor - not an OPD cursor. Handles the
+ * OPD manipulation as well - no need to return to the caller to
+ * perform more OPD movements.
+ */
+static int
+__bam_getlte(dbc, key, data)
+ DBC *dbc;
+ DBT *key, *data;
+{
+ BTREE_CURSOR *cp, *ocp;
+ DB *dbp;
+ db_pgno_t pgno;
+ int exact, ret;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ /* Begin by searching for the key */
+ ret = __bamc_search(dbc, PGNO_INVALID, key, DB_SET_RANGE, &exact);
+ if (ret == DB_NOTFOUND)
+ goto find_last;
+ if (ret != 0)
+ goto end;
+
+ if (cp->indx == NUM_ENT(cp->page) || IS_CUR_DELETED(dbc)) {
+ /*
+ * Move to the next entry if we're past the end of the
+ * page or on a deleted entry.
+ */
+ ret = __bamc_next(dbc, 0, 0);
+ if (ret == DB_NOTFOUND)
+ goto find_last;
+ if (ret != 0)
+ goto end;
+
+		/*
+		 * Check if we're still on the correct key. Note that
+		 * "exact" first receives the comparison result here; it
+		 * is converted to an exact-match flag just below.
+		 */
+ if ((ret = __bam_cmp(dbc, key, cp->page, cp->indx,
+ ((BTREE*)dbp->bt_internal)->bt_compare, &exact)) != 0)
+ goto end;
+ exact = (exact == 0);
+ }
+
+ if (exact == 0) {
+ ret = __bam_get_prev(dbc);
+ goto end;
+ }
+
+ if (__bam_isopd(dbc, &pgno)) {
+ /*
+ * We want to do unusual things with off-page duplicates, so
+ * deal with them here rather than returning to handle them.
+ */
+ if ((ret = __dbc_newopd(dbc, pgno, cp->opd, &cp->opd)) != 0)
+ goto end;
+
+ /* Search for the correct duplicate */
+ ret = __bamc_search(cp->opd, PGNO_INVALID, data,
+ data == NULL ? DB_FIRST : DB_SET_RANGE, &exact);
+ if (ret == DB_NOTFOUND)
+ goto find_last_dup;
+ if (ret != 0)
+ goto end;
+
+ ocp = (BTREE_CURSOR *)cp->opd->internal;
+ if (ocp->indx == NUM_ENT(ocp->page) ||
+ IS_CUR_DELETED(cp->opd)) {
+ /*
+ * Move to the next entry if we're past the end of the
+ * page or on a deleted entry.
+ */
+ ret = __bamc_next(cp->opd, 0, 0);
+ if (ret == DB_NOTFOUND)
+ goto find_last_dup;
+ if (ret != 0)
+ goto end;
+
+ if (data != NULL) {
+ /* Check if we're still on the correct data */
+ if ((ret = __bam_cmp(
+ dbc, data, ocp->page, ocp->indx,
+ dbp->dup_compare, &exact)) != 0)
+ goto end;
+ exact = (exact == 0);
+ } else
+ exact = 1;
+ }
+
+ if (exact == 0) {
+ /* Move to the previous entry */
+ ret = __bamc_prev(cp->opd);
+ if (ret == DB_NOTFOUND) {
+ if ((ret = __dbc_close(cp->opd)) != 0)
+ goto end;
+ cp->opd = NULL;
+ ret = __bam_get_prev(dbc);
+ }
+ }
+ } else if (data != NULL) {
+ /*
+ * If we got an exact match with on-page duplicates, we need to
+ * search in them.
+ */
+ ret = __bam_getboth_finddatum(dbc, data, DB_GET_BOTH_RANGE);
+ if (ret == DB_NOTFOUND)
+ exact = 0;
+ else if (ret != 0)
+ goto end;
+ else {
+ /* Check if we're still on the correct data */
+ if ((ret = __bam_cmp(dbc, data, cp->page,
+ cp->indx + O_INDX, dbp->dup_compare, &exact)) != 0)
+ goto end;
+ exact = (exact == 0);
+ }
+
+ if (exact == 0) {
+ ret = __bam_get_prev(dbc);
+ }
+ }
+
+ end:
+ return (ret);
+
+ find_last:
+ if ((ret = __bamc_search(
+ dbc, PGNO_INVALID, NULL, DB_LAST, &exact)) != 0)
+ return (ret);
+
+ if (__bam_isopd(dbc, &pgno)) {
+ if ((ret = __dbc_newopd(dbc, pgno, cp->opd, &cp->opd)) != 0)
+ return (ret);
+ find_last_dup:
+ if ((ret = __bamc_search(
+ cp->opd, PGNO_INVALID, NULL, DB_LAST, &exact)) != 0)
+ return (ret);
+ }
+
+ return (ret);
+}
+#endif
+
+/*
+ * __bam_getboth_finddatum --
+ * Find a matching on-page data item.
+ */
+static int
+__bam_getboth_finddatum(dbc, data, flags)
+ DBC *dbc;
+ DBT *data;
+ u_int32_t flags;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ db_indx_t base, lim, top;
+ int cmp, ret;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ cmp = 0;
+
+ /*
+ * Called (sometimes indirectly) from DBC->get to search on-page data
+ * item(s) for a matching value. If the original flag was DB_GET_BOTH
+ * or DB_GET_BOTH_RANGE, the cursor is set to the first undeleted data
+ * item for the key. If the original flag was DB_GET_BOTHC, the cursor
+ * argument is set to the first data item we can potentially return.
+ * In both cases, there may or may not be additional duplicate data
+ * items to search.
+ *
+ * If the duplicates are not sorted, do a linear search.
+ */
+ if (dbp->dup_compare == NULL) {
+ for (;; cp->indx += P_INDX) {
+ if (!IS_CUR_DELETED(dbc)) {
+ if ((ret = __bam_cmp(
+ dbc, data, cp->page, cp->indx + O_INDX,
+ __bam_defcmp, &cmp)) != 0)
+ return (ret);
+ if (cmp == 0)
+ return (0);
+ }
+
+ if (cp->indx + P_INDX >= NUM_ENT(cp->page) ||
+ !IS_DUPLICATE(dbc, cp->indx, cp->indx + P_INDX))
+ break;
+ }
+ return (DB_NOTFOUND);
+ }
+
+ /*
+ * If the duplicates are sorted, do a binary search. The reason for
+ * this is that large pages and small key/data pairs result in large
+ * numbers of on-page duplicates before they get pushed off-page.
+ *
+ * Find the top and bottom of the duplicate set. Binary search
+ * requires at least two items, don't loop if there's only one.
+ */
+ for (base = top = cp->indx; top < NUM_ENT(cp->page); top += P_INDX)
+ if (!IS_DUPLICATE(dbc, cp->indx, top))
+ break;
+ if (base == (top - P_INDX)) {
+ if ((ret = __bam_cmp(dbc, data, cp->page,
+ cp->indx + O_INDX, dbp->dup_compare, &cmp)) != 0)
+ return (ret);
+ if (cmp == 0 || (cmp < 0 && flags == DB_GET_BOTH_RANGE))
+ return (0);
+ cp->indx = top;
+		return (DB_NOTFOUND);
+ }
+
+ for (lim = (top - base) / (db_indx_t)P_INDX; lim != 0; lim >>= 1) {
+ cp->indx = base + ((lim >> 1) * P_INDX);
+ if ((ret = __bam_cmp(dbc, data, cp->page,
+ cp->indx + O_INDX, dbp->dup_compare, &cmp)) != 0)
+ return (ret);
+ if (cmp == 0) {
+ /*
+ * XXX
+ * No duplicate duplicates in sorted duplicate sets,
+ * so there can be only one.
+ */
+ if (!IS_CUR_DELETED(dbc))
+ return (0);
+ break;
+ }
+ if (cmp > 0) {
+ base = cp->indx + P_INDX;
+ --lim;
+ }
+ }
+
+ /* No match found; if we're looking for an exact match, we're done. */
+ if (flags == DB_GET_BOTH)
+ return (DB_NOTFOUND);
+
+ /*
+ * Base is the smallest index greater than the data item, may be zero
+	 * Base is the smallest index greater than the data item; it may be
+	 * zero or a last + O_INDX index, and the item there may be deleted.
+	 * Find an undeleted item.
+ cp->indx = base;
+ while (cp->indx < top && IS_CUR_DELETED(dbc))
+ cp->indx += P_INDX;
+ return (cp->indx < top ? 0 : DB_NOTFOUND);
+}
+
+/*
+ * __bamc_put --
+ * Put using a cursor.
+ */
+static int
+__bamc_put(dbc, key, data, flags, pgnop)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+ db_pgno_t *pgnop;
+{
+ BTREE *t;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DBT dbt;
+ DB_MPOOLFILE *mpf;
+ db_pgno_t root_pgno;
+ int cmp, exact, own, ret, stack;
+ u_int32_t iiop;
+ void *arg;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ root_pgno = cp->root;
+
+split: ret = stack = 0;
+ switch (flags) {
+ case DB_CURRENT:
+ if (F_ISSET(cp, C_DELETED))
+ return (DB_NOTFOUND);
+ /* FALLTHROUGH */
+ case DB_AFTER:
+ case DB_BEFORE:
+ iiop = flags;
+ own = 1;
+
+ /* Acquire the current page with a write lock. */
+ ACQUIRE_WRITE_LOCK(dbc, ret);
+ if (ret != 0)
+ goto err;
+ if (cp->page == NULL && (ret = __memp_fget(mpf, &cp->pgno,
+ dbc->thread_info, dbc->txn, 0, &cp->page)) != 0)
+ goto err;
+ break;
+ case DB_KEYFIRST:
+ case DB_KEYLAST:
+ case DB_NODUPDATA:
+ case DB_NOOVERWRITE:
+ case DB_OVERWRITE_DUP:
+ own = 0;
+ /*
+ * Searching off-page, sorted duplicate tree: do a tree search
+ * for the correct item; __bamc_search returns the smallest
+ * slot greater than the key, use it.
+ *
+ * See comment below regarding where we can start the search.
+ */
+ if (F_ISSET(dbc, DBC_OPD)) {
+ if ((ret = __bamc_search(dbc,
+ F_ISSET(cp, C_RECNUM) ? cp->root : root_pgno,
+ data, flags, &exact)) != 0)
+ goto err;
+ stack = 1;
+
+ /* Disallow "sorted" duplicate duplicates. */
+ if (exact != 0) {
+ if (flags == DB_OVERWRITE_DUP ||
+ IS_DELETED(dbp, cp->page, cp->indx)) {
+ iiop = DB_CURRENT;
+ break;
+ }
+ ret = __db_duperr(dbp, flags);
+ goto err;
+ }
+ iiop = DB_BEFORE;
+ break;
+ }
+
+ /*
+ * Searching a btree.
+ *
+ * If we've done a split, we can start the search from the
+ * parent of the split page, which __bam_split returned
+ * for us in root_pgno, unless we're in a Btree with record
+ * numbering. In that case, we'll need the true root page
+ * in order to adjust the record count.
+ */
+ if ((ret = __bamc_search(dbc,
+ F_ISSET(cp, C_RECNUM) ? cp->root : root_pgno, key,
+ flags == DB_KEYFIRST || dbp->dup_compare != NULL ?
+ DB_KEYFIRST : DB_KEYLAST, &exact)) != 0)
+ goto err;
+ stack = 1;
+
+ /*
+ * If we don't have an exact match, __bamc_search returned
+ * the smallest slot greater than the key, use it.
+ */
+ if (!exact) {
+ iiop = DB_KEYFIRST;
+ break;
+
+ /*
+ * Check for NOOVERWRITE. It is possible that there
+ * is a key with an empty duplicate page attached.
+ */
+ } else if (flags == DB_NOOVERWRITE && !IS_CUR_DELETED(dbc)) {
+ if (pgnop != NULL && __bam_isopd(dbc, pgnop))
+ ret = __bam_opd_exists(dbc, *pgnop);
+ else
+ ret = DB_KEYEXIST;
+ if (ret != 0)
+ goto err;
+ }
+
+ /*
+ * If duplicates aren't supported, replace the current item.
+ */
+ if (!F_ISSET(dbp, DB_AM_DUP)) {
+ iiop = DB_CURRENT;
+ break;
+ }
+
+ /*
+ * If we find a matching entry, it may be an off-page duplicate
+ * tree. Return the page number to our caller, we need a new
+ * cursor.
+ */
+ if (pgnop != NULL && __bam_isopd(dbc, pgnop))
+ goto done;
+
+ /* If the duplicates aren't sorted, move to the right slot. */
+ if (dbp->dup_compare == NULL) {
+ if (flags == DB_KEYFIRST)
+ iiop = DB_BEFORE;
+ else
+ for (;; cp->indx += P_INDX)
+ if (cp->indx + P_INDX >=
+ NUM_ENT(cp->page) ||
+ !IS_DUPLICATE(dbc, cp->indx,
+ cp->indx + P_INDX)) {
+ iiop = DB_AFTER;
+ break;
+ }
+ break;
+ }
+
+ /*
+ * We know that we're looking at the first of a set of sorted
+ * on-page duplicates. Walk the list to find the right slot.
+ */
+ for (;; cp->indx += P_INDX) {
+ if ((ret = __bam_cmp(dbc, data, cp->page,
+ cp->indx + O_INDX, dbp->dup_compare, &cmp)) != 0)
+ goto err;
+ if (cmp < 0) {
+ iiop = DB_BEFORE;
+ break;
+ }
+
+ /* Disallow "sorted" duplicate duplicates. */
+ if (cmp == 0) {
+ if (flags == DB_OVERWRITE_DUP ||
+ IS_DELETED(dbp, cp->page, cp->indx)) {
+ iiop = DB_CURRENT;
+ break;
+ }
+ ret = __db_duperr(dbp, flags);
+ goto err;
+ }
+
+ if (cp->indx + P_INDX >= NUM_ENT(cp->page) ||
+ P_INP(dbp, ((PAGE *)cp->page))[cp->indx] !=
+ P_INP(dbp, ((PAGE *)cp->page))[cp->indx + P_INDX]) {
+ iiop = DB_AFTER;
+ break;
+ }
+ }
+ break;
+ default:
+ ret = __db_unknown_flag(dbp->env, "__bamc_put", flags);
+ goto err;
+ }
+
+ switch (ret = __bam_iitem(dbc, key, data, iiop, 0)) {
+ case 0:
+ break;
+ case DB_NEEDSPLIT:
+ /*
+ * To split, we need a key for the page. Either use the key
+ * argument or get a copy of the key from the page.
+ */
+ if (flags == DB_AFTER ||
+ flags == DB_BEFORE || flags == DB_CURRENT) {
+ memset(&dbt, 0, sizeof(DBT));
+ if ((ret = __db_ret(dbc, cp->page, 0, &dbt,
+ &dbc->my_rkey.data, &dbc->my_rkey.ulen)) != 0)
+ goto err;
+ arg = &dbt;
+ } else
+ arg = F_ISSET(dbc, DBC_OPD) ? data : key;
+
+ /*
+ * Discard any locks and pinned pages (the locks are discarded
+ * even if we're running with transactions, as they lock pages
+ * that we're sorry we ever acquired). If stack is set and the
+ * cursor entries are valid, they point to the same entries as
+ * the stack, don't free them twice.
+ */
+ if (stack)
+ ret = __bam_stkrel(dbc, STK_CLRDBC | STK_NOLOCK);
+ else
+ DISCARD_CUR(dbc, ret);
+ if (ret != 0)
+ goto err;
+
+ /*
+ * SR [#6059]
+ * If we do not own a lock on the page any more, then clear the
+ * cursor so we don't point at it. Even if we call __bam_stkrel
+ * above we still may have entered the routine with the cursor
+ * positioned to a particular record. This is in the case
+ * where C_RECNUM is set.
+ */
+ if (own == 0) {
+ cp->pgno = PGNO_INVALID;
+ cp->indx = 0;
+ }
+
+ /* Split the tree. */
+ if ((ret = __bam_split(dbc, arg, &root_pgno)) != 0)
+ return (ret);
+
+ goto split;
+ default:
+ goto err;
+ }
+
+err:
+done: /*
+ * If we inserted a key into the first or last slot of the tree,
+ * remember where it was so we can do it more quickly next time.
+ * If the tree has record numbers, we need a complete stack so
+ * that we can adjust the record counts, so skipping the tree search
+ * isn't possible. For subdatabases we need to be careful that the
+ * page does not move from one db to another, so we track its LSN.
+ *
+ * If there are duplicates and we are inserting into the last slot,
+ * the cursor will point _to_ the last item, not after it, which
+ * is why we subtract P_INDX below.
+ */
+
+ t = dbp->bt_internal;
+ if (ret == 0 && TYPE(cp->page) == P_LBTREE &&
+ (flags == DB_KEYFIRST || flags == DB_KEYLAST) &&
+ !F_ISSET(cp, C_RECNUM) &&
+ (!F_ISSET(dbp, DB_AM_SUBDB) ||
+ (LOGGING_ON(dbp->env) && !F_ISSET(dbp, DB_AM_NOT_DURABLE))) &&
+ ((NEXT_PGNO(cp->page) == PGNO_INVALID &&
+ cp->indx >= NUM_ENT(cp->page) - P_INDX) ||
+ (PREV_PGNO(cp->page) == PGNO_INVALID && cp->indx == 0))) {
+ t->bt_lpgno = cp->pgno;
+ if (F_ISSET(dbp, DB_AM_SUBDB))
+ t->bt_llsn = LSN(cp->page);
+ } else
+ t->bt_lpgno = PGNO_INVALID;
+ /*
+ * Discard any pages pinned in the tree and their locks, except for
+ * the leaf page. Note, the leaf page participated in any stack we
+ * acquired, and so we have to adjust the stack as necessary. If
+ * there was only a single page on the stack, we don't have to free
+ * further stack pages.
+ */
+ if (stack && BT_STK_POP(cp) != NULL)
+ (void)__bam_stkrel(dbc, 0);
+
+ /*
+ * Regardless of whether we were successful or not, clear the delete
+ * flag. If we're successful, we either moved the cursor or the item
+ * is no longer deleted. If we're not successful, then we're just a
+ * copy, no need to have the flag set.
+ *
+ * We may have instantiated off-page duplicate cursors during the put,
+ * so clear the deleted bit from the off-page duplicate cursor as well.
+ */
+ F_CLR(cp, C_DELETED);
+ if (cp->opd != NULL) {
+ cp = (BTREE_CURSOR *)cp->opd->internal;
+ F_CLR(cp, C_DELETED);
+ }
+
+ return (ret);
+}
+
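+/*
+ * Application-side sketch (illustrative, not part of the original source):
+ * the flags dispatched above arrive through DBC->put. With DB_CURRENT the
+ * key parameter is ignored and the data of the item at the cursor is
+ * replaced:
+ *
+ *	if ((ret = dbc->put(dbc, &key, &data, DB_KEYFIRST)) != 0)
+ *		goto err;
+ *	if ((ret = dbc->put(dbc, &key, &data, DB_CURRENT)) != 0)
+ *		goto err;
+ */
+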
+/*
+ * __bamc_rget --
+ * Return the record number for a cursor.
+ *
+ * PUBLIC: int __bamc_rget __P((DBC *, DBT *));
+ */
+int
+__bamc_rget(dbc, data)
+ DBC *dbc;
+ DBT *data;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DBT dbt;
+ DB_MPOOLFILE *mpf;
+ db_recno_t recno;
+ int exact, ret, t_ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ /*
+ * Get the page with the current item on it.
+ * Get a copy of the key.
+ * Release the page, making sure we don't release it twice.
+ */
+ if ((ret = __memp_fget(mpf, &cp->pgno,
+ dbc->thread_info, dbc->txn, 0, &cp->page)) != 0)
+ return (ret);
+ memset(&dbt, 0, sizeof(DBT));
+ if ((ret = __db_ret(dbc, cp->page, cp->indx, &dbt,
+ &dbc->my_rkey.data, &dbc->my_rkey.ulen)) != 0)
+ goto err;
+ ret = __memp_fput(mpf, dbc->thread_info, cp->page, dbc->priority);
+ cp->page = NULL;
+ if (ret != 0)
+ return (ret);
+
+ if ((ret = __bam_search(dbc, PGNO_INVALID, &dbt,
+ F_ISSET(dbc, DBC_RMW) ? SR_FIND_WR : SR_FIND,
+ 1, &recno, &exact)) != 0)
+ goto err;
+
+ ret = __db_retcopy(dbc->env, data,
+ &recno, sizeof(recno), &dbc->rdata->data, &dbc->rdata->ulen);
+
+ /* Release the stack. */
+err: if ((t_ret = __bam_stkrel(dbc, 0)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
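+/*
+ * Application-side sketch (illustrative, not part of the original source):
+ * with DB_RECNUM configured on the database, the record number computed
+ * above is requested through DBC->get with DB_GET_RECNO and returned in
+ * the data DBT:
+ *
+ *	db_recno_t recno;
+ *
+ *	if ((ret = dbc->get(dbc, &key, &data, DB_GET_RECNO)) == 0)
+ *		memcpy(&recno, data.data, sizeof(recno));
+ */
+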
+/*
+ * __bamc_writelock --
+ * Upgrade the cursor to a write lock.
+ */
+static int
+__bamc_writelock(dbc)
+ DBC *dbc;
+{
+ BTREE_CURSOR *cp;
+ int ret;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ if (cp->lock_mode == DB_LOCK_WRITE)
+ return (0);
+
+ /*
+ * When writing to an off-page duplicate tree, we need to have the
+ * appropriate page in the primary tree locked. The general DBC
+ * code calls us first with the primary cursor so we can acquire the
+ * appropriate lock.
+ */
+ ACQUIRE_WRITE_LOCK(dbc, ret);
+ return (ret);
+}
+
+/*
+ * __bamc_next --
+ * Move to the next record.
+ */
+static int
+__bamc_next(dbc, initial_move, deleted_okay)
+ DBC *dbc;
+ int initial_move, deleted_okay;
+{
+ BTREE_CURSOR *cp;
+ db_indx_t adjust;
+ db_lockmode_t lock_mode;
+ db_pgno_t pgno;
+ int ret;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+ ret = 0;
+
+ /*
+ * We're either moving through a page of duplicates or a btree leaf
+ * page.
+ *
+ * !!!
+ * This code handles empty pages and pages with only deleted entries.
+ */
+ if (F_ISSET(dbc, DBC_OPD)) {
+ adjust = O_INDX;
+ lock_mode = DB_LOCK_NG;
+ } else {
+ adjust = dbc->dbtype == DB_BTREE ? P_INDX : O_INDX;
+ lock_mode =
+ F_ISSET(dbc, DBC_RMW) ? DB_LOCK_WRITE : DB_LOCK_READ;
+ }
+ if (cp->page == NULL) {
+ ACQUIRE_CUR(dbc, lock_mode, cp->pgno, 0, ret);
+ if (ret != 0)
+ return (ret);
+ }
+
+ if (initial_move)
+ cp->indx += adjust;
+
+ for (;;) {
+ /*
+ * If at the end of the page, move to a subsequent page.
+ *
+ * !!!
+ * Check for >= NUM_ENT. If the original search landed us on
+ * NUM_ENT, we may have incremented indx before the test.
+ */
+ if (cp->indx >= NUM_ENT(cp->page)) {
+ if ((pgno = NEXT_PGNO(cp->page)) == PGNO_INVALID)
+ return (DB_NOTFOUND);
+
+ ACQUIRE_CUR(dbc, lock_mode, pgno, 0, ret);
+ if (ret != 0)
+ return (ret);
+ cp->indx = 0;
+ continue;
+ }
+ if (!deleted_okay && IS_CUR_DELETED(dbc)) {
+ cp->indx += adjust;
+ continue;
+ }
+ break;
+ }
+ return (0);
+}
+
+/*
+ * __bamc_prev --
+ * Move to the previous record.
+ */
+static int
+__bamc_prev(dbc)
+ DBC *dbc;
+{
+ BTREE_CURSOR *cp;
+ db_indx_t adjust;
+ db_lockmode_t lock_mode;
+ db_pgno_t pgno;
+ int ret;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+ ret = 0;
+
+ /*
+ * We're either moving through a page of duplicates or a btree leaf
+ * page.
+ *
+ * !!!
+ * This code handles empty pages and pages with only deleted entries.
+ */
+ if (F_ISSET(dbc, DBC_OPD)) {
+ adjust = O_INDX;
+ lock_mode = DB_LOCK_NG;
+ } else {
+ adjust = dbc->dbtype == DB_BTREE ? P_INDX : O_INDX;
+ lock_mode =
+ F_ISSET(dbc, DBC_RMW) ? DB_LOCK_WRITE : DB_LOCK_READ;
+ }
+ if (cp->page == NULL) {
+ ACQUIRE_CUR(dbc, lock_mode, cp->pgno, 0, ret);
+ if (ret != 0)
+ return (ret);
+ }
+
+ for (;;) {
+ /* If at the beginning of the page, move to a previous one. */
+ if (cp->indx == 0) {
+ if ((pgno =
+ PREV_PGNO(cp->page)) == PGNO_INVALID)
+ return (DB_NOTFOUND);
+
+ ACQUIRE_CUR(dbc, lock_mode, pgno, 0, ret);
+ if (ret != 0)
+ return (ret);
+
+ if ((cp->indx = NUM_ENT(cp->page)) == 0)
+ continue;
+ }
+
+ /* Ignore deleted records. */
+ cp->indx -= adjust;
+ if (IS_CUR_DELETED(dbc))
+ continue;
+
+ break;
+ }
+ return (0);
+}
+
+/*
+ * __bamc_search --
+ * Move to a specified record.
+ */
+static int
+__bamc_search(dbc, root_pgno, key, flags, exactp)
+ DBC *dbc;
+ db_pgno_t root_pgno;
+ const DBT *key;
+ u_int32_t flags;
+ int *exactp;
+{
+ BTREE *t;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ PAGE *h;
+ db_indx_t base, indx, *inp, lim;
+ db_pgno_t bt_lpgno;
+ db_recno_t recno;
+ u_int32_t sflags;
+ int bulk, cmp, ret, t_ret;
+
+ COMPQUIET(cmp, 0);
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ t = dbp->bt_internal;
+ ret = 0;
+ bulk = (F_ISSET(dbc, DBC_BULK) && cp->pgno != PGNO_INVALID);
+
+ /*
+ * Find an entry in the database. Discard any lock we currently hold,
+ * we're going to search the tree.
+ */
+ DISCARD_CUR(dbc, ret);
+ if (ret != 0)
+ return (ret);
+
+ switch (flags) {
+ case DB_FIRST:
+ sflags = (F_ISSET(dbc, DBC_RMW) ? SR_WRITE : SR_READ) | SR_MIN;
+ goto search;
+ case DB_LAST:
+ sflags = (F_ISSET(dbc, DBC_RMW) ? SR_WRITE : SR_READ) | SR_MAX;
+ goto search;
+ case DB_SET_RECNO:
+ if ((ret = __ram_getno(dbc, key, &recno, 0)) != 0)
+ return (ret);
+ sflags =
+ (F_ISSET(dbc, DBC_RMW) ? SR_FIND_WR : SR_FIND) | SR_EXACT;
+ if ((ret = __bam_rsearch(dbc, &recno, sflags, 1, exactp)) != 0)
+ return (ret);
+ goto done;
+ case DB_SET:
+ case DB_GET_BOTH:
+ sflags =
+ (F_ISSET(dbc, DBC_RMW) ? SR_FIND_WR : SR_FIND) | SR_EXACT;
+ if (bulk)
+ break;
+ goto search;
+ case DB_GET_BOTH_RANGE:
+ sflags = (F_ISSET(dbc, DBC_RMW) ? SR_FIND_WR : SR_FIND);
+ goto search;
+ case DB_SET_RANGE:
+ sflags =
+ (F_ISSET(dbc, DBC_RMW) ? SR_WRITE : SR_READ) | SR_DUPFIRST;
+ goto search;
+ case DB_KEYFIRST:
+ case DB_NOOVERWRITE:
+ sflags = SR_KEYFIRST;
+ break;
+ case DB_KEYLAST:
+ case DB_NODUPDATA:
+ case DB_OVERWRITE_DUP:
+ sflags = SR_KEYLAST;
+ break;
+ default:
+ return (__db_unknown_flag(dbp->env, "__bamc_search", flags));
+ }
+
+ /*
+ * If the application has a history of inserting into the first or last
+ * pages of the database, we check those pages first to avoid doing a
+ * full search. Similarly, if the cursor is configured as a bulk
+ * cursor, check whether this operation belongs on the same page as the
+ * last one.
+ */
+ if (bulk)
+ bt_lpgno = cp->pgno;
+ else {
+ if (F_ISSET(dbc, DBC_OPD))
+ goto search;
+
+ /*
+ * !!!
+ * We do not mutex protect the t->bt_lpgno field, which means
+ * that it can only be used in an advisory manner. If we find
+		 * that it can only be used in an advisory manner. If we find
+		 * a page we can use, great. If we don't, we don't care; we do
+ * variable, otherwise we might acquire a lock for a page and
+ * then read a different page because it changed underfoot.
+ */
+ bt_lpgno = t->bt_lpgno;
+ }
+
+ /*
+ * If the tree has no history of insertion, do it the slow way.
+ */
+ if (bt_lpgno == PGNO_INVALID)
+ goto search;
+
+ /*
+ * Lock and retrieve the page on which we last inserted.
+ *
+ * The page may not exist: if a transaction created the page
+ * and then aborted, the page might have been truncated from
+ * the end of the file. We don't want to wait on the lock.
+ * The page may not even be relevant to this search.
+ */
+ h = NULL;
+ ACQUIRE_CUR(dbc, DB_LOCK_WRITE, bt_lpgno, DB_LOCK_NOWAIT, ret);
+ if (ret != 0) {
+ if (ret == DB_LOCK_DEADLOCK ||
+ ret == DB_LOCK_NOTGRANTED ||
+ ret == DB_PAGE_NOTFOUND)
+ ret = 0;
+ goto fast_miss;
+ }
+
+ h = cp->page;
+ inp = P_INP(dbp, h);
+
+ /*
+ * It's okay if the page type isn't right or it's empty, it
+ * just means that the world changed.
+ */
+ if (TYPE(h) != P_LBTREE || NUM_ENT(h) == 0)
+ goto fast_miss;
+
+ /* Verify that this page cannot have moved to another db. */
+ if (F_ISSET(dbp, DB_AM_SUBDB) &&
+ LOG_COMPARE(&t->bt_llsn, &LSN(h)) != 0)
+ goto fast_miss;
+
+ /*
+	 * Test whether we're at the beginning or end of the tree and
+	 * whether the new item sorts before/after the first/last page
+	 * entry. We only try to catch inserts into the middle of the
+	 * tree for bulk cursors.
+ */
+ if (h->next_pgno == PGNO_INVALID) {
+ indx = NUM_ENT(h) - P_INDX;
+ if ((ret = __bam_cmp(dbc, key, h, indx,
+ t->bt_compare, &cmp)) != 0)
+ goto fast_miss;
+ if (cmp > 0) {
+ if (FLD_ISSET(sflags, SR_EXACT))
+ return (DB_NOTFOUND);
+ else
+ indx += P_INDX;
+ }
+ if (cmp >= 0)
+ goto fast_hit;
+ }
+ if (h->prev_pgno == PGNO_INVALID) {
+ indx = 0;
+ if ((ret = __bam_cmp(dbc, key, h, indx,
+ t->bt_compare, &cmp)) != 0)
+ goto fast_miss;
+ if (cmp < 0 && FLD_ISSET(sflags, SR_EXACT))
+ return (DB_NOTFOUND);
+ if (cmp <= 0)
+ goto fast_hit;
+ }
+ if (bulk) {
+ DB_BINARY_SEARCH_FOR(base, lim, NUM_ENT(h), P_INDX) {
+ DB_BINARY_SEARCH_INCR(indx, base, lim, P_INDX);
+ if ((ret = __bam_cmp(dbc, key, h, indx,
+ t->bt_compare, &cmp)) != 0)
+ goto fast_miss;
+
+ if (cmp == 0)
+ goto fast_hit;
+ if (cmp > 0)
+ DB_BINARY_SEARCH_SHIFT_BASE(indx, base,
+ lim, P_INDX);
+ }
+ /*
+ * No match found: base is the smallest index greater than
+ * the key and may be zero or NUM_ENT(h).
+ */
+ indx = base;
+ if (indx > 0 && indx < NUM_ENT(h)) {
+ if (FLD_ISSET(sflags, SR_EXACT))
+ return (DB_NOTFOUND);
+ goto fast_hit;
+ }
+ }
+ goto fast_miss;
+
+fast_hit:
+ if (cmp == 0) {
+ /*
+ * Found a duplicate. Deal with DB_KEYFIRST / DB_KEYLAST.
+ */
+ if (FLD_ISSET(sflags, SR_DUPFIRST))
+ while (indx > 0 && inp[indx - P_INDX] == inp[indx])
+ indx -= P_INDX;
+ else if (FLD_ISSET(sflags, SR_DUPLAST))
+ while (indx < (db_indx_t)(NUM_ENT(h) - P_INDX) &&
+ inp[indx] == inp[indx + P_INDX])
+ indx += P_INDX;
+ }
+
+ /* Set the exact match flag, we may have found a duplicate. */
+ *exactp = (cmp == 0);
+
+ /*
+ * Insert the entry in the stack. (Our caller is likely to
+ * call __bam_stkrel() after our return.)
+ */
+ BT_STK_CLR(cp);
+ BT_STK_ENTER(dbp->env,
+ cp, h, indx, cp->lock, cp->lock_mode, ret);
+ if (ret != 0)
+ return (ret);
+ goto done;
+
+fast_miss:
+ /*
+ * This was not the right page, so we do not need to retain
+ * the lock even in the presence of transactions.
+ *
+ * This is also an error path, so ret may have been set.
+ */
+ DISCARD_CUR(dbc, ret);
+ cp->pgno = PGNO_INVALID;
+ if ((t_ret = __LPUT(dbc, cp->lock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ return (ret);
+
+search:
+ if ((ret = __bam_search(dbc, root_pgno,
+ key, sflags, 1, NULL, exactp)) != 0)
+ return (ret);
+
+done: /* Initialize the cursor from the stack. */
+ cp->page = cp->csp->page;
+ cp->pgno = cp->csp->page->pgno;
+ cp->indx = cp->csp->indx;
+ cp->lock = cp->csp->lock;
+ cp->lock_mode = cp->csp->lock_mode;
+
+ /* If on an empty page or a deleted record, move to the next one. */
+ if (flags == DB_FIRST &&
+ (NUM_ENT(cp->page) == 0 || IS_CUR_DELETED(dbc)))
+ if ((ret = __bamc_next(dbc, 0, 0)) != 0)
+ return (ret);
+ if (flags == DB_LAST &&
+ (NUM_ENT(cp->page) == 0 || IS_CUR_DELETED(dbc)))
+ if ((ret = __bamc_prev(dbc)) != 0)
+ return (ret);
+
+ return (0);
+}
+
+/*
+ * __bamc_physdel --
+ * Physically remove an item from the page.
+ */
+static int
+__bamc_physdel(dbc)
+ DBC *dbc;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DBT key;
+ DB_LOCK next_lock, prev_lock;
+ db_pgno_t pgno;
+ int delete_page, empty_page, exact, ret;
+
+ dbp = dbc->dbp;
+ memset(&key, 0, sizeof(DBT));
+ cp = (BTREE_CURSOR *)dbc->internal;
+ delete_page = empty_page = ret = 0;
+ LOCK_INIT(next_lock);
+ LOCK_INIT(prev_lock);
+
+ /* If the page is going to be emptied, consider deleting it. */
+ delete_page = empty_page =
+ NUM_ENT(cp->page) == (TYPE(cp->page) == P_LBTREE ? 2 : 1);
+
+ /*
+ * Check if the application turned off reverse splits. Applications
+ * can't turn off reverse splits in off-page duplicate trees, that
+ * space will never be reused unless the exact same key is specified.
+ */
+ if (delete_page &&
+ !F_ISSET(dbc, DBC_OPD) && F_ISSET(dbp, DB_AM_REVSPLITOFF))
+ delete_page = 0;
+
+ /*
+ * We never delete the last leaf page. (Not really true -- we delete
+ * the last leaf page of off-page duplicate trees, but that's handled
+ * by our caller, not down here.)
+ */
+ if (delete_page && cp->pgno == BAM_ROOT_PGNO(dbc))
+ delete_page = 0;
+
+ /*
+ * To delete a leaf page other than an empty root page, we need a
+ * copy of a key from the page. Use the 0th page index since it's
+ * the last key the page held.
+ *
+ * !!!
+ * Note that because __bamc_physdel is always called from a cursor
+ * close, it should be safe to use the cursor's own "my_rkey" memory
+ * to temporarily hold this key. We shouldn't own any returned-data
+ * memory of interest--if we do, we're in trouble anyway.
+ */
+ if (delete_page) {
+ if ((ret = __db_ret(dbc, cp->page, 0, &key,
+ &dbc->my_rkey.data, &dbc->my_rkey.ulen)) != 0)
+ goto err;
+ }
+
+ /*
+	 * Delete the items. If the page isn't empty, we adjust the cursors.
+ *
+ * !!!
+ * The following operations to delete a page may deadlock. The easy
+ * scenario is if we're deleting an item because we're closing cursors
+ * because we've already deadlocked and want to call txn->abort. If
+ * we fail due to deadlock, we'll leave a locked, possibly empty page
+ * in the tree, which won't be empty long because we'll undo the delete
+ * when we undo the transaction's modifications.
+ *
+ * !!!
+ * Delete the key item first, otherwise the on-page duplicate checks
+ * in __bam_ditem() won't work!
+ */
+ if ((ret = __memp_dirty(dbp->mpf,
+ &cp->page, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ goto err;
+ if (TYPE(cp->page) == P_LBTREE) {
+ if ((ret = __bam_ditem(dbc, cp->page, cp->indx)) != 0)
+ goto err;
+ if (!empty_page)
+ if ((ret = __bam_ca_di(dbc,
+ PGNO(cp->page), cp->indx, -1)) != 0)
+ goto err;
+ }
+ if ((ret = __bam_ditem(dbc, cp->page, cp->indx)) != 0)
+ goto err;
+
+ /* Clear the deleted flag, the item is gone. */
+ F_CLR(cp, C_DELETED);
+
+ if (!empty_page)
+ if ((ret = __bam_ca_di(dbc, PGNO(cp->page), cp->indx, -1)) != 0)
+ goto err;
+
+ /*
+ * Need to downgrade write locks here or non-txn locks will get stuck.
+ */
+ if (F_ISSET(dbc->dbp, DB_AM_READ_UNCOMMITTED)) {
+ if ((ret = __TLPUT(dbc, cp->lock)) != 0)
+ goto err;
+ cp->lock_mode = DB_LOCK_WWRITE;
+ if (cp->page != NULL &&
+ (ret = __memp_shared(dbp->mpf, cp->page)) != 0)
+ goto err;
+ }
+ /* If we're not going to try and delete the page, we're done. */
+ if (!delete_page)
+ return (0);
+
+ /*
+ * Lock the previous and next pages before latching the parent
+ * sub tree.
+ */
+ if (STD_LOCKING(dbc)) {
+ if ((pgno = PREV_PGNO(cp->page)) != PGNO_INVALID &&
+ (ret = __db_lget(dbc,
+ 0, pgno, DB_LOCK_WRITE, 0, &prev_lock)) != 0)
+ return (ret);
+ if ((pgno = NEXT_PGNO(cp->page)) != PGNO_INVALID &&
+ (ret = __db_lget(dbc,
+ 0, pgno, DB_LOCK_WRITE, 0, &next_lock)) != 0) {
+			(void)__TLPUT(dbc, prev_lock);
+ return (ret);
+ }
+ }
+ DISCARD_CUR(dbc, ret);
+ if (ret != 0)
+ goto err;
+ ret = __bam_search(dbc, PGNO_INVALID, &key, SR_DEL, 0, NULL, &exact);
+
+ /*
+ * If everything worked, delete the stack, otherwise, release the
+ * stack and page locks without further damage.
+ */
+ if (ret == 0)
+ ret = __bam_dpages(dbc, 1, BTD_RELINK);
+ else
+ (void)__bam_stkrel(dbc, 0);
+
+err: if (ret != 0)
+ F_SET(dbc, DBC_ERROR);
+ (void)__TLPUT(dbc, prev_lock);
+ (void)__TLPUT(dbc, next_lock);
+ return (ret);
+}
+
+/*
+ * __bamc_getstack --
+ * Acquire a full stack for a cursor.
+ */
+static int
+__bamc_getstack(dbc)
+ DBC *dbc;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DBT dbt;
+ DB_MPOOLFILE *mpf;
+ PAGE *h;
+ int exact, ret, t_ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ /*
+ * Get the page with the current item on it. The caller of this
+ * routine has to already hold a read lock on the page, so there
+ * is no additional lock to acquire.
+ */
+ if ((ret = __memp_fget(mpf, &cp->pgno,
+ dbc->thread_info, dbc->txn, 0, &h)) != 0)
+ return (ret);
+
+ /* Get a copy of a key from the page. */
+ memset(&dbt, 0, sizeof(DBT));
+ ret = __db_ret(dbc, h, 0, &dbt,
+ &dbc->my_rkey.data, &dbc->my_rkey.ulen);
+ if ((t_ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ return (ret);
+
+ /* Get a write-locked stack for the page. */
+ exact = 0;
+ ret = __bam_search(dbc, PGNO_INVALID,
+ &dbt, SR_KEYFIRST, 1, NULL, &exact);
+
+ return (ret);
+}
+
+/*
+ * __bam_isopd --
+ * Return if the cursor references an off-page duplicate tree via its
+ * page number.
+ */
+static int
+__bam_isopd(dbc, pgnop)
+ DBC *dbc;
+ db_pgno_t *pgnop;
+{
+ BOVERFLOW *bo;
+
+ if (TYPE(dbc->internal->page) != P_LBTREE)
+ return (0);
+
+ bo = GET_BOVERFLOW(dbc->dbp,
+ dbc->internal->page, dbc->internal->indx + O_INDX);
+ if (B_TYPE(bo->type) == B_DUPLICATE) {
+ *pgnop = bo->pgno;
+ return (1);
+ }
+ return (0);
+}
+
+/*
+ * __bam_opd_exists --
+ * Return if the current position has any data.
+ * PUBLIC: int __bam_opd_exists __P((DBC *, db_pgno_t));
+ */
+int
+__bam_opd_exists(dbc, pgno)
+ DBC *dbc;
+ db_pgno_t pgno;
+{
+ PAGE *h;
+ int ret;
+
+ if ((ret = __memp_fget(dbc->dbp->mpf, &pgno,
+ dbc->thread_info, dbc->txn, 0, &h)) != 0)
+ return (ret);
+
+ /*
+ * We always collapse OPD trees so we only need to check
+ * the number of entries on the root. If there is a non-empty
+ * tree then there will be duplicates.
+ */
+ if (NUM_ENT(h) == 0)
+ ret = 0;
+ else
+ ret = DB_KEYEXIST;
+
+ (void)__memp_fput(dbc->dbp->mpf, dbc->thread_info, h, dbc->priority);
+
+ return (ret);
+}
diff --git a/src/btree/bt_delete.c b/src/btree/bt_delete.c
new file mode 100644
index 00000000..37496b3f
--- /dev/null
+++ b/src/btree/bt_delete.c
@@ -0,0 +1,541 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+
+/*
+ * __bam_ditem --
+ * Delete one or more entries from a page.
+ *
+ * PUBLIC: int __bam_ditem __P((DBC *, PAGE *, u_int32_t));
+ */
+int
+__bam_ditem(dbc, h, indx)
+ DBC *dbc;
+ PAGE *h;
+ u_int32_t indx;
+{
+ BINTERNAL *bi;
+ BKEYDATA *bk;
+ DB *dbp;
+ u_int32_t nbytes;
+ int ret;
+ db_indx_t *inp;
+
+ dbp = dbc->dbp;
+ inp = P_INP(dbp, h);
+
+ /* The page should already have been dirtied by our caller. */
+ DB_ASSERT(dbp->env, IS_DIRTY(h));
+
+ switch (TYPE(h)) {
+ case P_IBTREE:
+ bi = GET_BINTERNAL(dbp, h, indx);
+ switch (B_TYPE(bi->type)) {
+ case B_DUPLICATE:
+ case B_KEYDATA:
+ nbytes = BINTERNAL_SIZE(bi->len);
+ break;
+ case B_OVERFLOW:
+ nbytes = BINTERNAL_SIZE(bi->len);
+ if ((ret =
+ __db_doff(dbc, ((BOVERFLOW *)bi->data)->pgno)) != 0)
+ return (ret);
+ break;
+ default:
+ return (__db_pgfmt(dbp->env, PGNO(h)));
+ }
+ break;
+ case P_IRECNO:
+ nbytes = RINTERNAL_SIZE;
+ break;
+ case P_LBTREE:
+ /*
+ * If it's a duplicate key, discard the index and don't touch
+ * the actual page item.
+ *
+ * !!!
+ * This works because no data item can have an index matching
+ * any other index so even if the data item is in a key "slot",
+ * it won't match any other index.
+ */
+ if ((indx % 2) == 0) {
+ /*
+ * Check for a duplicate after us on the page. NOTE:
+ * we have to delete the key item before deleting the
+ * data item, otherwise the "indx + P_INDX" calculation
+ * won't work!
+ */
+ if (indx + P_INDX < (u_int32_t)NUM_ENT(h) &&
+ inp[indx] == inp[indx + P_INDX])
+ return (__bam_adjindx(dbc,
+ h, indx, indx + O_INDX, 0));
+ /*
+ * Check for a duplicate before us on the page. It
+ * doesn't matter if we delete the key item before or
+ * after the data item for the purposes of this one.
+ */
+ if (indx > 0 && inp[indx] == inp[indx - P_INDX])
+ return (__bam_adjindx(dbc,
+ h, indx, indx - P_INDX, 0));
+ }
+ /* FALLTHROUGH */
+ case P_LDUP:
+ case P_LRECNO:
+ bk = GET_BKEYDATA(dbp, h, indx);
+ switch (B_TYPE(bk->type)) {
+ case B_DUPLICATE:
+ nbytes = BOVERFLOW_SIZE;
+ break;
+ case B_OVERFLOW:
+ nbytes = BOVERFLOW_SIZE;
+ if ((ret = __db_doff(
+ dbc, (GET_BOVERFLOW(dbp, h, indx))->pgno)) != 0)
+ return (ret);
+ break;
+ case B_KEYDATA:
+ nbytes = BKEYDATA_SIZE(bk->len);
+ break;
+ default:
+ return (__db_pgfmt(dbp->env, PGNO(h)));
+ }
+ break;
+ default:
+ return (__db_pgfmt(dbp->env, PGNO(h)));
+ }
+
+ /* Delete the item and mark the page dirty. */
+ if ((ret = __db_ditem(dbc, h, indx, nbytes)) != 0)
+ return (ret);
+
+ return (0);
+}
+
+/*
+ * __bam_adjindx --
+ * Adjust an index on the page.
+ *
+ * PUBLIC: int __bam_adjindx __P((DBC *, PAGE *, u_int32_t, u_int32_t, int));
+ */
+int
+__bam_adjindx(dbc, h, indx, indx_copy, is_insert)
+ DBC *dbc;
+ PAGE *h;
+ u_int32_t indx, indx_copy;
+ int is_insert;
+{
+ DB *dbp;
+ db_indx_t copy, *inp;
+ int ret;
+
+ dbp = dbc->dbp;
+ inp = P_INP(dbp, h);
+
+ /* Log the change. */
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __bam_adj_log(dbp, dbc->txn, &LSN(h), 0,
+ PGNO(h), &LSN(h), indx, indx_copy, (u_int32_t)is_insert)) != 0)
+ return (ret);
+ } else
+ LSN_NOT_LOGGED(LSN(h));
+
+ /* Shuffle the indices and mark the page dirty. */
+ if (is_insert) {
+ copy = inp[indx_copy];
+ if (indx != NUM_ENT(h))
+ memmove(&inp[indx + O_INDX], &inp[indx],
+ sizeof(db_indx_t) * (NUM_ENT(h) - indx));
+ inp[indx] = copy;
+ ++NUM_ENT(h);
+ } else {
+ --NUM_ENT(h);
+ if (indx != NUM_ENT(h))
+ memmove(&inp[indx], &inp[indx + O_INDX],
+ sizeof(db_indx_t) * (NUM_ENT(h) - indx));
+ }
+
+ return (0);
+}
+
+/*
+ * __bam_dpages --
+ * Delete a set of locked pages.
+ *
+ * PUBLIC: int __bam_dpages __P((DBC *, int, int));
+ */
+int
+__bam_dpages(dbc, use_top, flags)
+ DBC *dbc;
+ int use_top;
+ int flags;
+{
+ BINTERNAL *bi;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DBT a, b;
+ DB_LOCK c_lock, p_lock;
+ DB_MPOOLFILE *mpf;
+ EPG *epg, *save_sp, *stack_epg;
+ PAGE *child, *parent;
+ db_indx_t nitems;
+ db_pgno_t pgno, root_pgno;
+ db_recno_t rcnt;
+ int done, ret, t_ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ nitems = 0;
+ pgno = PGNO_INVALID;
+
+ /*
+ * We have the entire stack of deletable pages locked.
+ *
+	 * Btree calls us with a stack in which the first page is to have a
+	 * single item deleted, and the rest of the pages are to be removed.
+	 *
+	 * Recno always has a stack to the root and __bam_merge operations
+	 * may have unneeded items in the stack. We find the lowest page
+	 * in the stack that has more than one record in it and start there.
+ */
+ ret = 0;
+ if (use_top)
+ stack_epg = cp->sp;
+ else
+ for (stack_epg = cp->csp; stack_epg > cp->sp; --stack_epg)
+ if (NUM_ENT(stack_epg->page) > 1)
+ break;
+ epg = stack_epg;
+ /*
+ * !!!
+ * There is an interesting deadlock situation here. We have to relink
+ * the leaf page chain around the leaf page being deleted. Consider
+ * a cursor walking through the leaf pages, that has the previous page
+ * read-locked and is waiting on a lock for the page we're deleting.
+ * It will deadlock here. Before we unlink the subtree, we relink the
+ * leaf page chain.
+ */
+ if (LF_ISSET(BTD_RELINK) && LEVEL(cp->csp->page) == 1 &&
+ (ret = __db_relink(dbc, cp->csp->page, NULL, PGNO_INVALID)) != 0)
+ goto discard;
+
+ /*
+ * Delete the last item that references the underlying pages that are
+ * to be deleted, and adjust cursors that reference that page. Then,
+ * save that page's page number and item count and release it. If
+ * the application isn't retaining locks because it's running without
+ * transactions, this lets the rest of the tree get back to business
+ * immediately.
+ */
+ if ((ret = __memp_dirty(mpf,
+ &epg->page, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ goto discard;
+ if ((ret = __bam_ditem(dbc, epg->page, epg->indx)) != 0)
+ goto discard;
+ if ((ret = __bam_ca_di(dbc, PGNO(epg->page), epg->indx, -1)) != 0)
+ goto discard;
+
+ if (LF_ISSET(BTD_UPDATE) && epg->indx == 0) {
+ save_sp = cp->csp;
+ cp->csp = epg;
+ ret = __bam_pupdate(dbc, epg->page);
+ cp->csp = save_sp;
+ if (ret != 0)
+ goto discard;
+ }
+
+ pgno = PGNO(epg->page);
+ nitems = NUM_ENT(epg->page);
+
+ ret = __memp_fput(mpf, dbc->thread_info, epg->page, dbc->priority);
+ epg->page = NULL;
+ if ((t_ret = __TLPUT(dbc, epg->lock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err_inc;
+
+ /* Then, discard any pages that we don't care about. */
+discard: for (epg = cp->sp; epg < stack_epg; ++epg) {
+ if ((t_ret = __memp_fput(mpf, dbc->thread_info,
+ epg->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ epg->page = NULL;
+ if ((t_ret = __TLPUT(dbc, epg->lock)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ if (ret != 0)
+ goto err;
+
+ /* Free the rest of the pages in the stack. */
+ while (++epg <= cp->csp) {
+ if ((ret = __memp_dirty(mpf, &epg->page,
+ dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ goto err;
+ /*
+ * Delete page entries so they will be restored as part of
+ * recovery. We don't need to do cursor adjustment here as
+ * the pages are being emptied by definition and so cannot
+ * be referenced by a cursor.
+ */
+ if (NUM_ENT(epg->page) != 0) {
+ DB_ASSERT(dbp->env, LEVEL(epg->page) != 1);
+
+ if ((ret = __bam_ditem(dbc, epg->page, epg->indx)) != 0)
+ goto err;
+ /*
+ * Sheer paranoia: if we find any pages that aren't
+ * emptied by the delete, someone else added an item
+ * while we were walking the tree, and we discontinue
+ * the delete. Shouldn't be possible, but we check
+ * regardless.
+ */
+ if (NUM_ENT(epg->page) != 0)
+ goto err;
+ }
+
+ ret = __db_free(dbc, epg->page, 0);
+ if (cp->page == epg->page)
+ cp->page = NULL;
+ epg->page = NULL;
+ if ((t_ret = __TLPUT(dbc, epg->lock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err_inc;
+ }
+
+ if (0) {
+err_inc: ++epg;
+err: for (; epg <= cp->csp; ++epg) {
+ if (epg->page != NULL) {
+ (void)__memp_fput(mpf, dbc->thread_info,
+ epg->page, dbc->priority);
+ epg->page = NULL;
+ }
+ (void)__TLPUT(dbc, epg->lock);
+ }
+ BT_STK_CLR(cp);
+ return (ret);
+ }
+ BT_STK_CLR(cp);
+
+ /*
+ * If we just deleted the next-to-last item from the root page, the
+ * tree can collapse one or more levels. While there remains only a
+ * single item on the root page, write lock the last page referenced
+ * by the root page and copy it over the root page.
+ * Note that if pgno is the root of a btree database then the root
+ * cannot change as we have it locked.
+ */
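+	/*
+	 * An illustrative before/after (not from the original source):
+	 *
+	 *	before:	root -> only-child -> subtree...
+	 *	after:	root (copy of only-child) -> subtree...
+	 *
+	 * The child page is then freed and any cursors referencing it are
+	 * adjusted to reference the root.
+	 */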
+ if (nitems != 1)
+ return (0);
+ root_pgno = BAM_ROOT_PGNO(dbc);
+ if (pgno != root_pgno)
+ return (0);
+
+ for (done = 0; !done;) {
+ /* Initialize. */
+ parent = child = NULL;
+ LOCK_INIT(p_lock);
+ LOCK_INIT(c_lock);
+
+ /* Get the root. */
+ root_pgno = cp->root;
+ BAM_GET_ROOT(dbc, root_pgno,
+ parent, DB_MPOOL_DIRTY, DB_LOCK_WRITE, p_lock, ret);
+
+ DB_ASSERT(dbp->env, parent != NULL);
+ if (ret != 0 || NUM_ENT(parent) != 1)
+ goto stop;
+
+ switch (TYPE(parent)) {
+ case P_IBTREE:
+ /*
+ * If this is overflow, then try to delete it.
+ * The child may or may not still point at it.
+ */
+ bi = GET_BINTERNAL(dbp, parent, 0);
+ if (B_TYPE(bi->type) == B_OVERFLOW)
+ if ((ret = __db_doff(dbc,
+ ((BOVERFLOW *)bi->data)->pgno)) != 0)
+ goto stop;
+ pgno = bi->pgno;
+ break;
+ case P_IRECNO:
+ pgno = GET_RINTERNAL(dbp, parent, 0)->pgno;
+ break;
+ default:
+ goto stop;
+ }
+
+ /* Lock the child page. */
+ if ((ret =
+ __db_lget(dbc, 0, pgno, DB_LOCK_WRITE, 0, &c_lock)) != 0)
+ goto stop;
+ if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
+ DB_MPOOL_DIRTY, &child)) != 0)
+ goto stop;
+
+ /* Log the change. */
+ if (DBC_LOGGING(dbc)) {
+ memset(&a, 0, sizeof(a));
+ a.data = child;
+ a.size = dbp->pgsize;
+ memset(&b, 0, sizeof(b));
+ b.data = P_ENTRY(dbp, parent, 0);
+ b.size = TYPE(parent) == P_IRECNO ? RINTERNAL_SIZE :
+ BINTERNAL_SIZE(((BINTERNAL *)b.data)->len);
+ if ((ret = __bam_rsplit_log(dbp, dbc->txn,
+ &child->lsn, 0, PGNO(child), &a, PGNO(parent),
+ RE_NREC(parent), &b, &parent->lsn)) != 0)
+ goto stop;
+ } else
+ LSN_NOT_LOGGED(child->lsn);
+
+ /*
+ * Make the switch.
+ *
+ * One fixup -- internal pages below the top level do not store
+ * a record count, so we have to preserve it if we're not
+ * converting to a leaf page. Note also that we are about to
+ * overwrite the parent page, including its LSN. This is OK
+ * because the log message we wrote describing this update
+ * stores its LSN on the child page. When the child is copied
+ * onto the parent, the correct LSN is copied into place.
+ */
+ COMPQUIET(rcnt, 0);
+ if (F_ISSET(cp, C_RECNUM) && LEVEL(child) > LEAFLEVEL)
+ rcnt = RE_NREC(parent);
+ memcpy(parent, child, dbp->pgsize);
+ PGNO(parent) = root_pgno;
+ if (F_ISSET(cp, C_RECNUM) && LEVEL(child) > LEAFLEVEL)
+ RE_NREC_SET(parent, rcnt);
+
+ /* Adjust the cursors. */
+ if ((ret = __bam_ca_rsplit(dbc, PGNO(child), root_pgno)) != 0)
+ goto stop;
+
+ /*
+ * Free the page copied onto the root page and discard its
+ * lock. (The call to __db_free() discards our reference
+ * to the page.)
+ */
+ if ((ret = __db_free(dbc, child, 0)) != 0) {
+ child = NULL;
+ goto stop;
+ }
+ child = NULL;
+
+ if (0) {
+stop: done = 1;
+ }
+ if ((t_ret = __TLPUT(dbc, p_lock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (parent != NULL &&
+ (t_ret = __memp_fput(mpf, dbc->thread_info,
+ parent, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __TLPUT(dbc, c_lock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (child != NULL &&
+ (t_ret = __memp_fput(mpf, dbc->thread_info,
+ child, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+
+ return (ret);
+}
+
+/*
+ * __bam_pupdate --
+ * Update parent key pointers up the tree.
+ *
+ * PUBLIC: int __bam_pupdate __P((DBC *, PAGE *));
+ */
+int
+__bam_pupdate(dbc, lpg)
+ DBC *dbc;
+ PAGE *lpg;
+{
+ BTREE_CURSOR *cp;
+ ENV *env;
+ EPG *epg;
+ int ret;
+
+ env = dbc->env;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ ret = 0;
+
+ /*
+ * Update the parents up the tree. __bam_pinsert only looks at the
+	 * left child if it is a leaf page, so we don't need to change it. We
+ * just do a delete and insert; a replace is possible but reusing
+ * pinsert is better.
+ */
+ for (epg = &cp->csp[-1]; epg >= cp->sp; epg--) {
+ if ((ret = __memp_dirty(dbc->dbp->mpf, &epg->page,
+ dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ return (ret);
+ epg->indx--;
+ if ((ret = __bam_pinsert(dbc, epg, 0,
+ lpg, epg[1].page, BPI_NORECNUM | BPI_REPLACE)) != 0) {
+ if (ret == DB_NEEDSPLIT) {
+ /* This should not happen. */
+ __db_errx(env, DB_STR_A("1020",
+ "Not enough room in parent: %s: page %lu",
+ "%s %lu"), dbc->dbp->fname,
+ (u_long)PGNO(epg->page));
+ ret = __env_panic(env, EINVAL);
+ }
+ epg->indx++;
+ return (ret);
+ }
+ epg->indx++;
+ }
+ return (ret);
+}
diff --git a/src/btree/bt_method.c b/src/btree/bt_method.c
new file mode 100644
index 00000000..5cf93d2e
--- /dev/null
+++ b/src/btree/bt_method.c
@@ -0,0 +1,745 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/qam.h"
+
+static int __bam_set_bt_minkey __P((DB *, u_int32_t));
+static int __bam_get_bt_compare
+ __P((DB *, int (**)(DB *, const DBT *, const DBT *)));
+static int __bam_get_bt_prefix
+ __P((DB *, size_t(**)(DB *, const DBT *, const DBT *)));
+static int __bam_set_bt_prefix
+ __P((DB *, size_t(*)(DB *, const DBT *, const DBT *)));
+static int __bam_get_bt_compress __P((DB *,
+ int (**)(DB *, const DBT *, const DBT *, const DBT *, const DBT *, DBT *),
+ int (**)(DB *, const DBT *, const DBT *, DBT *, DBT *, DBT *)));
+static int __ram_get_re_delim __P((DB *, int *));
+static int __ram_set_re_delim __P((DB *, int));
+static int __ram_set_re_len __P((DB *, u_int32_t));
+static int __ram_set_re_pad __P((DB *, int));
+static int __ram_get_re_source __P((DB *, const char **));
+static int __ram_set_re_source __P((DB *, const char *));
+
+/*
+ * __bam_db_create --
+ * Btree specific initialization of the DB structure.
+ *
+ * PUBLIC: int __bam_db_create __P((DB *));
+ */
+int
+__bam_db_create(dbp)
+ DB *dbp;
+{
+ BTREE *t;
+ int ret;
+
+ /* Allocate and initialize the private btree structure. */
+ if ((ret = __os_calloc(dbp->env, 1, sizeof(BTREE), &t)) != 0)
+ return (ret);
+ dbp->bt_internal = t;
+
+ t->bt_minkey = DEFMINKEYPAGE; /* Btree */
+ t->bt_compare = __bam_defcmp;
+ t->bt_prefix = __bam_defpfx;
+#ifdef HAVE_COMPRESSION
+ t->bt_compress = NULL;
+ t->bt_decompress = NULL;
+ t->compress_dup_compare = NULL;
+
+ /*
+ * DB_AM_COMPRESS may have been set in __bam_metachk before the
+ * bt_internal structure existed.
+ */
+ if (F_ISSET(dbp, DB_AM_COMPRESS) &&
+ (ret = __bam_set_bt_compress(dbp, NULL, NULL)) != 0)
+ return (ret);
+#endif
+
+ dbp->get_bt_compare = __bam_get_bt_compare;
+ dbp->set_bt_compare = __bam_set_bt_compare;
+ dbp->get_bt_minkey = __bam_get_bt_minkey;
+ dbp->set_bt_minkey = __bam_set_bt_minkey;
+ dbp->get_bt_prefix = __bam_get_bt_prefix;
+ dbp->set_bt_prefix = __bam_set_bt_prefix;
+ dbp->get_bt_compress = __bam_get_bt_compress;
+ dbp->set_bt_compress = __bam_set_bt_compress;
+
+ t->re_pad = ' '; /* Recno */
+ t->re_delim = '\n';
+ t->re_eof = 1;
+
+ dbp->get_re_delim = __ram_get_re_delim;
+ dbp->set_re_delim = __ram_set_re_delim;
+ dbp->get_re_len = __ram_get_re_len;
+ dbp->set_re_len = __ram_set_re_len;
+ dbp->get_re_pad = __ram_get_re_pad;
+ dbp->set_re_pad = __ram_set_re_pad;
+ dbp->get_re_source = __ram_get_re_source;
+ dbp->set_re_source = __ram_set_re_source;
+
+ return (0);
+}
+
+/*
+ * __bam_db_close --
+ * Btree specific discard of the DB structure.
+ *
+ * PUBLIC: int __bam_db_close __P((DB *));
+ */
+int
+__bam_db_close(dbp)
+ DB *dbp;
+{
+ BTREE *t;
+
+ if ((t = dbp->bt_internal) == NULL)
+ return (0);
+ /* Recno */
+ /* Close any backing source file descriptor. */
+ if (t->re_fp != NULL)
+ (void)fclose(t->re_fp);
+
+ /* Free any backing source file name. */
+ if (t->re_source != NULL)
+ __os_free(dbp->env, t->re_source);
+
+ __os_free(dbp->env, t);
+ dbp->bt_internal = NULL;
+
+ return (0);
+}
+
+/*
+ * __bam_map_flags --
+ * Map Btree specific flags from public to the internal values.
+ *
+ * PUBLIC: void __bam_map_flags __P((DB *, u_int32_t *, u_int32_t *));
+ */
+void
+__bam_map_flags(dbp, inflagsp, outflagsp)
+ DB *dbp;
+ u_int32_t *inflagsp, *outflagsp;
+{
+ COMPQUIET(dbp, NULL);
+
+ if (FLD_ISSET(*inflagsp, DB_DUP)) {
+ FLD_SET(*outflagsp, DB_AM_DUP);
+ FLD_CLR(*inflagsp, DB_DUP);
+ }
+ if (FLD_ISSET(*inflagsp, DB_DUPSORT)) {
+ FLD_SET(*outflagsp, DB_AM_DUP | DB_AM_DUPSORT);
+ FLD_CLR(*inflagsp, DB_DUPSORT);
+ }
+ if (FLD_ISSET(*inflagsp, DB_RECNUM)) {
+ FLD_SET(*outflagsp, DB_AM_RECNUM);
+ FLD_CLR(*inflagsp, DB_RECNUM);
+ }
+ if (FLD_ISSET(*inflagsp, DB_REVSPLITOFF)) {
+ FLD_SET(*outflagsp, DB_AM_REVSPLITOFF);
+ FLD_CLR(*inflagsp, DB_REVSPLITOFF);
+ }
+}
+
+/*
+ * __bam_set_flags --
+ * Set Btree specific flags.
+ *
+ * PUBLIC: int __bam_set_flags __P((DB *, u_int32_t *flagsp));
+ */
+int
+__bam_set_flags(dbp, flagsp)
+ DB *dbp;
+ u_int32_t *flagsp;
+{
+ BTREE *t;
+ u_int32_t flags;
+
+ t = dbp->bt_internal;
+
+ flags = *flagsp;
+ if (LF_ISSET(DB_DUP | DB_DUPSORT | DB_RECNUM | DB_REVSPLITOFF))
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_flags");
+
+ /*
+ * The DB_DUP and DB_DUPSORT flags are shared by the Hash
+ * and Btree access methods.
+ */
+ if (LF_ISSET(DB_DUP | DB_DUPSORT))
+ DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE | DB_OK_HASH);
+
+ if (LF_ISSET(DB_RECNUM | DB_REVSPLITOFF))
+		DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE);
+
+ /* DB_DUP/DB_DUPSORT is incompatible with DB_RECNUM. */
+ if (LF_ISSET(DB_DUP | DB_DUPSORT) && F_ISSET(dbp, DB_AM_RECNUM))
+ goto incompat;
+
+ /* DB_RECNUM is incompatible with DB_DUP/DB_DUPSORT. */
+ if (LF_ISSET(DB_RECNUM) && F_ISSET(dbp, DB_AM_DUP))
+ goto incompat;
+
+ /* DB_RECNUM is incompatible with DB_DUP/DB_DUPSORT. */
+ if (LF_ISSET(DB_RECNUM) && LF_ISSET(DB_DUP | DB_DUPSORT))
+ goto incompat;
+
+#ifdef HAVE_COMPRESSION
+ /* DB_RECNUM is incompatible with compression */
+ if (LF_ISSET(DB_RECNUM) && DB_IS_COMPRESSED(dbp)) {
+ __db_errx(dbp->env, DB_STR("1024",
+ "DB_RECNUM cannot be used with compression"));
+ return (EINVAL);
+ }
+
+ /* DB_DUP without DB_DUPSORT is incompatible with compression */
+ if (LF_ISSET(DB_DUP) && !LF_ISSET(DB_DUPSORT) &&
+ !F_ISSET(dbp, DB_AM_DUPSORT) && DB_IS_COMPRESSED(dbp)) {
+ __db_errx(dbp->env, DB_STR("1025",
+ "DB_DUP cannot be used with compression without DB_DUPSORT"));
+ return (EINVAL);
+ }
+#endif
+
+ if (LF_ISSET(DB_DUPSORT) && dbp->dup_compare == NULL) {
+#ifdef HAVE_COMPRESSION
+ if (DB_IS_COMPRESSED(dbp)) {
+ dbp->dup_compare = __bam_compress_dupcmp;
+ t->compress_dup_compare = __bam_defcmp;
+ } else
+#endif
+ dbp->dup_compare = __bam_defcmp;
+ }
+
+ __bam_map_flags(dbp, flagsp, &dbp->flags);
+ return (0);
+
+incompat:
+ return (__db_ferr(dbp->env, "DB->set_flags", 1));
+}
+
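+/*
+ * Usage sketch (illustrative only; not part of this file): an application
+ * enables sorted duplicates through the public method before DB->open,
+ * which dispatches to __bam_set_flags above:
+ *
+ *	DB *dbp;
+ *	int ret;
+ *
+ *	if ((ret = db_create(&dbp, NULL, 0)) != 0)
+ *		return (ret);
+ *	if ((ret = dbp->set_flags(dbp, DB_DUPSORT)) != 0)
+ *		goto err;
+ *
+ * When no duplicate comparator has been configured, DB_DUPSORT installs
+ * __bam_defcmp (or the compression-aware wrapper) as dup_compare.
+ */
+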
+/*
+ * __bam_get_bt_compare --
+ * Get the comparison function.
+ */
+static int
+__bam_get_bt_compare(dbp, funcp)
+ DB *dbp;
+ int (**funcp) __P((DB *, const DBT *, const DBT *));
+{
+ BTREE *t;
+
+ DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE);
+
+ t = dbp->bt_internal;
+
+ if (funcp != NULL)
+ *funcp = t->bt_compare;
+
+ return (0);
+}
+
+/*
+ * __bam_set_bt_compare --
+ * Set the comparison function.
+ *
+ * PUBLIC: int __bam_set_bt_compare
+ * PUBLIC: __P((DB *, int (*)(DB *, const DBT *, const DBT *)));
+ */
+int
+__bam_set_bt_compare(dbp, func)
+ DB *dbp;
+ int (*func) __P((DB *, const DBT *, const DBT *));
+{
+ BTREE *t;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_bt_compare");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE);
+
+ t = dbp->bt_internal;
+
+ /*
+ * Can't default the prefix routine if the user supplies a comparison
+ * routine; shortening the keys can break their comparison algorithm.
+ */
+ t->bt_compare = func;
+ if (t->bt_prefix == __bam_defpfx)
+ t->bt_prefix = NULL;
+
+ return (0);
+}
+
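+/*
+ * Usage sketch (illustrative only; "compare_uint32" is a hypothetical
+ * application callback): a custom comparator must impose a total order
+ * on all keys and is installed before DB->open:
+ *
+ *	static int
+ *	compare_uint32(dbp, a, b)
+ *		DB *dbp;
+ *		const DBT *a, *b;
+ *	{
+ *		u_int32_t ai, bi;
+ *
+ *		memcpy(&ai, a->data, sizeof(u_int32_t));
+ *		memcpy(&bi, b->data, sizeof(u_int32_t));
+ *		return (ai < bi ? -1 : (ai > bi ? 1 : 0));
+ *	}
+ *
+ *	if ((ret = dbp->set_bt_compare(dbp, compare_uint32)) != 0)
+ *		goto err;
+ *
+ * Note the side effect in __bam_set_bt_compare above: installing a
+ * comparator clears a defaulted prefix routine, since key shortening
+ * could break the new ordering.
+ */
+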
+/*
+ * __bam_get_bt_compress --
+ * Get the compression functions.
+ */
+static int
+__bam_get_bt_compress(dbp, compressp, decompressp)
+ DB *dbp;
+ int (**compressp) __P((DB *, const DBT *, const DBT *, const DBT *,
+ const DBT *, DBT *));
+ int (**decompressp) __P((DB *, const DBT *, const DBT *, DBT *, DBT *,
+ DBT *));
+{
+#ifdef HAVE_COMPRESSION
+ BTREE *t;
+
+ DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE);
+
+ t = dbp->bt_internal;
+
+ if (compressp != NULL)
+ *compressp = t->bt_compress;
+ if (decompressp != NULL)
+ *decompressp = t->bt_decompress;
+
+ return (0);
+#else
+ COMPQUIET(compressp, NULL);
+ COMPQUIET(decompressp, NULL);
+
+ __db_errx(dbp->env, DB_STR("1026",
+ "compression support has not been compiled in"));
+ return (EINVAL);
+#endif
+}
+
+/*
+ * __bam_set_bt_compress --
+ * Set the compression functions.
+ *
+ * PUBLIC: int __bam_set_bt_compress __P((DB *,
+ * PUBLIC: int (*)(DB *, const DBT *, const DBT *,
+ * PUBLIC: const DBT *, const DBT *, DBT *),
+ * PUBLIC: int (*)(DB *, const DBT *, const DBT *, DBT *, DBT *, DBT *)));
+ */
+int
+__bam_set_bt_compress(dbp, compress, decompress)
+ DB *dbp;
+ int (*compress) __P((DB *, const DBT *, const DBT *, const DBT *,
+ const DBT *, DBT *));
+ int (*decompress) __P((DB *, const DBT *, const DBT *, DBT *, DBT *,
+ DBT *));
+{
+#ifdef HAVE_COMPRESSION
+ BTREE *t;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_bt_compress");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE);
+
+ t = dbp->bt_internal;
+
+ /* compression is incompatible with DB_RECNUM */
+ if (F_ISSET(dbp, DB_AM_RECNUM)) {
+ __db_errx(dbp->env, DB_STR("1027",
+ "compression cannot be used with DB_RECNUM"));
+ return (EINVAL);
+ }
+
+ /* compression is incompatible with DB_DUP without DB_DUPSORT */
+ if (F_ISSET(dbp, DB_AM_DUP) && !F_ISSET(dbp, DB_AM_DUPSORT)) {
+ __db_errx(dbp->env, DB_STR("1028",
+ "compression cannot be used with DB_DUP without DB_DUPSORT"));
+ return (EINVAL);
+ }
+
+ if (compress != 0 && decompress != 0) {
+ t->bt_compress = compress;
+ t->bt_decompress = decompress;
+ } else if (compress == 0 && decompress == 0) {
+ t->bt_compress = __bam_defcompress;
+ t->bt_decompress = __bam_defdecompress;
+ } else {
+ __db_errx(dbp->env, DB_STR("1029",
+ "to enable compression you need to supply both function arguments"));
+ return (EINVAL);
+ }
+ F_SET(dbp, DB_AM_COMPRESS);
+
+	/*
+	 * Copy dup_compare to compress_dup_compare, and use the compression
+	 * duplicate compare.
+	 */
+ if (F_ISSET(dbp, DB_AM_DUPSORT)) {
+ t->compress_dup_compare = dbp->dup_compare;
+ dbp->dup_compare = __bam_compress_dupcmp;
+ }
+
+ return (0);
+#else
+ COMPQUIET(compress, NULL);
+ COMPQUIET(decompress, NULL);
+
+ __db_errx(dbp->env, DB_STR("1030",
+ "compression support has not been compiled in"));
+ return (EINVAL);
+#endif
+}
+
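+/*
+ * Usage sketch (illustrative only): passing NULL for both callbacks
+ * selects the built-in __bam_defcompress/__bam_defdecompress pair, the
+ * common way to turn compression on:
+ *
+ *	if ((ret = dbp->set_bt_compress(dbp, NULL, NULL)) != 0)
+ *		goto err;
+ *
+ * Supplying exactly one of the two callbacks is rejected with EINVAL by
+ * the code above; custom compression always requires both.
+ */
+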
+/*
+ * __bam_get_bt_minkey --
+ * Get the minimum keys per page.
+ *
+ * PUBLIC: int __bam_get_bt_minkey __P((DB *, u_int32_t *));
+ */
+int
+__bam_get_bt_minkey(dbp, bt_minkeyp)
+ DB *dbp;
+ u_int32_t *bt_minkeyp;
+{
+ BTREE *t;
+
+ DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE);
+
+ t = dbp->bt_internal;
+ *bt_minkeyp = t->bt_minkey;
+ return (0);
+}
+
+/*
+ * __bam_set_bt_minkey --
+ * Set the minimum keys per page.
+ */
+static int
+__bam_set_bt_minkey(dbp, bt_minkey)
+ DB *dbp;
+ u_int32_t bt_minkey;
+{
+ BTREE *t;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_bt_minkey");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE);
+
+ t = dbp->bt_internal;
+
+ if (bt_minkey < 2) {
+ __db_errx(dbp->env, DB_STR("1031",
+ "minimum bt_minkey value is 2"));
+ return (EINVAL);
+ }
+
+ t->bt_minkey = bt_minkey;
+ return (0);
+}
+
+/*
+ * __bam_get_bt_prefix --
+ * Get the prefix function.
+ */
+static int
+__bam_get_bt_prefix(dbp, funcp)
+ DB *dbp;
+ size_t (**funcp) __P((DB *, const DBT *, const DBT *));
+{
+ BTREE *t;
+
+ DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE);
+
+ t = dbp->bt_internal;
+ if (funcp != NULL)
+ *funcp = t->bt_prefix;
+ return (0);
+}
+
+/*
+ * __bam_set_bt_prefix --
+ * Set the prefix function.
+ */
+static int
+__bam_set_bt_prefix(dbp, func)
+ DB *dbp;
+ size_t (*func) __P((DB *, const DBT *, const DBT *));
+{
+ BTREE *t;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_bt_prefix");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE);
+
+ t = dbp->bt_internal;
+
+ t->bt_prefix = func;
+ return (0);
+}
+
+/*
+ * __bam_copy_config --
+ *	Copy the configuration of one DB handle to another.
+ *
+ * PUBLIC: void __bam_copy_config __P((DB *, DB *, u_int32_t));
+ */
+void
+__bam_copy_config(src, dst, nparts)
+ DB *src, *dst;
+ u_int32_t nparts;
+{
+ BTREE *s, *d;
+
+ COMPQUIET(nparts, 0);
+
+ s = src->bt_internal;
+ d = dst->bt_internal;
+ d->bt_compare = s->bt_compare;
+	d->bt_minkey = s->bt_minkey;
+ d->bt_prefix = s->bt_prefix;
+#ifdef HAVE_COMPRESSION
+ d->bt_compress = s->bt_compress;
+ d->bt_decompress = s->bt_decompress;
+ d->compress_dup_compare = s->compress_dup_compare;
+#endif
+}
+
+/*
+ * __ram_map_flags --
+ * Map Recno specific flags from public to the internal values.
+ *
+ * PUBLIC: void __ram_map_flags __P((DB *, u_int32_t *, u_int32_t *));
+ */
+void
+__ram_map_flags(dbp, inflagsp, outflagsp)
+ DB *dbp;
+ u_int32_t *inflagsp, *outflagsp;
+{
+ COMPQUIET(dbp, NULL);
+
+ if (FLD_ISSET(*inflagsp, DB_RENUMBER)) {
+ FLD_SET(*outflagsp, DB_AM_RENUMBER);
+ FLD_CLR(*inflagsp, DB_RENUMBER);
+ }
+ if (FLD_ISSET(*inflagsp, DB_SNAPSHOT)) {
+ FLD_SET(*outflagsp, DB_AM_SNAPSHOT);
+ FLD_CLR(*inflagsp, DB_SNAPSHOT);
+ }
+}
+
+/*
+ * __ram_set_flags --
+ * Set Recno specific flags.
+ *
+ * PUBLIC: int __ram_set_flags __P((DB *, u_int32_t *flagsp));
+ */
+int
+__ram_set_flags(dbp, flagsp)
+ DB *dbp;
+ u_int32_t *flagsp;
+{
+ u_int32_t flags;
+
+ flags = *flagsp;
+ if (LF_ISSET(DB_RENUMBER | DB_SNAPSHOT)) {
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_flags");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_RECNO);
+ }
+
+ __ram_map_flags(dbp, flagsp, &dbp->flags);
+ return (0);
+}
+
+/*
+ * __ram_get_re_delim --
+ * Get the variable-length input record delimiter.
+ */
+static int
+__ram_get_re_delim(dbp, re_delimp)
+ DB *dbp;
+ int *re_delimp;
+{
+ BTREE *t;
+
+ DB_ILLEGAL_METHOD(dbp, DB_OK_RECNO);
+ t = dbp->bt_internal;
+ *re_delimp = t->re_delim;
+ return (0);
+}
+
+/*
+ * __ram_set_re_delim --
+ * Set the variable-length input record delimiter.
+ */
+static int
+__ram_set_re_delim(dbp, re_delim)
+ DB *dbp;
+ int re_delim;
+{
+ BTREE *t;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_re_delim");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_RECNO);
+
+ t = dbp->bt_internal;
+
+ t->re_delim = re_delim;
+ F_SET(dbp, DB_AM_DELIMITER);
+
+ return (0);
+}
+
+/*
+ * __ram_get_re_len --
+ * Get the variable-length input record length.
+ *
+ * PUBLIC: int __ram_get_re_len __P((DB *, u_int32_t *));
+ */
+int
+__ram_get_re_len(dbp, re_lenp)
+ DB *dbp;
+ u_int32_t *re_lenp;
+{
+ BTREE *t;
+ QUEUE *q;
+
+ DB_ILLEGAL_METHOD(dbp, DB_OK_QUEUE | DB_OK_RECNO);
+
+ /*
+ * This has to work for all access methods, before or after opening the
+ * database. When the record length is set with __ram_set_re_len, the
+ * value in both the BTREE and QUEUE structs will be correct.
+	 * Otherwise, this only makes sense after the database is opened, in
+ * which case we know the type.
+ */
+ if (dbp->type == DB_QUEUE) {
+ q = dbp->q_internal;
+ *re_lenp = q->re_len;
+ } else {
+ t = dbp->bt_internal;
+ *re_lenp = t->re_len;
+ }
+
+ return (0);
+}
+
+/*
+ * __ram_set_re_len --
+ * Set the variable-length input record length.
+ */
+static int
+__ram_set_re_len(dbp, re_len)
+ DB *dbp;
+ u_int32_t re_len;
+{
+ BTREE *t;
+#ifdef HAVE_QUEUE
+ QUEUE *q;
+#endif
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_re_len");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_QUEUE | DB_OK_RECNO);
+
+ t = dbp->bt_internal;
+ t->re_len = re_len;
+
+#ifdef HAVE_QUEUE
+ q = dbp->q_internal;
+ q->re_len = re_len;
+#endif
+
+ F_SET(dbp, DB_AM_FIXEDLEN);
+
+ return (0);
+}
+
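+/*
+ * Usage sketch (illustrative only): fixed-length Recno (or Queue) records
+ * are configured before DB->open; short records are filled out with the
+ * pad character:
+ *
+ *	if ((ret = dbp->set_re_len(dbp, 64)) != 0)
+ *		goto err;
+ *	if ((ret = dbp->set_re_pad(dbp, ' ')) != 0)
+ *		goto err;
+ *
+ * Both values are mirrored into the QUEUE struct when Queue support is
+ * compiled in, which is why the get accessors work for either method.
+ */
+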
+/*
+ * __ram_get_re_pad --
+ * Get the fixed-length record pad character.
+ *
+ * PUBLIC: int __ram_get_re_pad __P((DB *, int *));
+ */
+int
+__ram_get_re_pad(dbp, re_padp)
+ DB *dbp;
+ int *re_padp;
+{
+ BTREE *t;
+ QUEUE *q;
+
+ DB_ILLEGAL_METHOD(dbp, DB_OK_QUEUE | DB_OK_RECNO);
+
+ /*
+ * This has to work for all access methods, before or after opening the
+	 * database. When the pad character is set with __ram_set_re_pad, the
+	 * value in both the BTREE and QUEUE structs will be correct.
+	 * Otherwise, this only makes sense after the database is opened, in
+ * which case we know the type.
+ */
+ if (dbp->type == DB_QUEUE) {
+ q = dbp->q_internal;
+ *re_padp = q->re_pad;
+ } else {
+ t = dbp->bt_internal;
+ *re_padp = t->re_pad;
+ }
+
+ return (0);
+}
+
+/*
+ * __ram_set_re_pad --
+ * Set the fixed-length record pad character.
+ */
+static int
+__ram_set_re_pad(dbp, re_pad)
+ DB *dbp;
+ int re_pad;
+{
+ BTREE *t;
+#ifdef HAVE_QUEUE
+ QUEUE *q;
+#endif
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_re_pad");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_QUEUE | DB_OK_RECNO);
+
+ t = dbp->bt_internal;
+ t->re_pad = re_pad;
+
+#ifdef HAVE_QUEUE
+ q = dbp->q_internal;
+ q->re_pad = re_pad;
+#endif
+
+ F_SET(dbp, DB_AM_PAD);
+
+ return (0);
+}
+
+/*
+ * __ram_get_re_source --
+ * Get the backing source file name.
+ */
+static int
+__ram_get_re_source(dbp, re_sourcep)
+ DB *dbp;
+ const char **re_sourcep;
+{
+ BTREE *t;
+
+ DB_ILLEGAL_METHOD(dbp, DB_OK_RECNO);
+
+ t = dbp->bt_internal;
+ *re_sourcep = t->re_source;
+ return (0);
+}
+
+/*
+ * __ram_set_re_source --
+ * Set the backing source file name.
+ */
+static int
+__ram_set_re_source(dbp, re_source)
+ DB *dbp;
+ const char *re_source;
+{
+ BTREE *t;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_re_source");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_RECNO);
+
+ t = dbp->bt_internal;
+
+ return (__os_strdup(dbp->env, re_source, &t->re_source));
+}
diff --git a/src/btree/bt_open.c b/src/btree/bt_open.c
new file mode 100644
index 00000000..7be141c1
--- /dev/null
+++ b/src/btree/bt_open.c
@@ -0,0 +1,677 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_swap.h"
+#include "dbinc/btree.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/partition.h"
+#include "dbinc/fop.h"
+
+static void __bam_init_meta __P((DB *, BTMETA *, db_pgno_t, DB_LSN *));
+
+/*
+ * __bam_open --
+ * Open a btree.
+ *
+ * PUBLIC: int __bam_open __P((DB *, DB_THREAD_INFO *,
+ * PUBLIC: DB_TXN *, const char *, db_pgno_t, u_int32_t));
+ */
+int
+__bam_open(dbp, ip, txn, name, base_pgno, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name;
+ db_pgno_t base_pgno;
+ u_int32_t flags;
+{
+ BTREE *t;
+
+ COMPQUIET(name, NULL);
+ t = dbp->bt_internal;
+
+ /*
+ * We don't permit the user to specify a prefix routine if they didn't
+	 * also specify a comparison routine; they can't know enough about our
+ * comparison routine to get it right.
+ */
+ if (t->bt_compare == __bam_defcmp && t->bt_prefix != __bam_defpfx) {
+ __db_errx(dbp->env, DB_STR("1006",
+"prefix comparison may not be specified for default comparison routine"));
+ return (EINVAL);
+ }
+
+ /*
+ * Verify that the bt_minkey value specified won't cause the
+ * calculation of ovflsize to underflow [#2406] for this pagesize.
+ */
+ if (B_MINKEY_TO_OVFLSIZE(dbp, t->bt_minkey, dbp->pgsize) >
+ B_MINKEY_TO_OVFLSIZE(dbp, DEFMINKEYPAGE, dbp->pgsize)) {
+ __db_errx(dbp->env, DB_STR_A("1007",
+ "bt_minkey value of %lu too high for page size of %lu",
+ "%lu %lu"), (u_long)t->bt_minkey, (u_long)dbp->pgsize);
+ return (EINVAL);
+ }
+
+ /* Start up the tree. */
+ return (__bam_read_root(dbp, ip, txn, base_pgno, flags));
+}
+
+/*
+ * __bam_metachk --
+ *	Check a btree metadata page, verifying its version and flags against
+ *	the application's DB handle.
+ *
+ * PUBLIC: int __bam_metachk __P((DB *, const char *, BTMETA *));
+ */
+int
+__bam_metachk(dbp, name, btm)
+ DB *dbp;
+ const char *name;
+ BTMETA *btm;
+{
+ ENV *env;
+ u_int32_t vers;
+ int ret;
+
+ env = dbp->env;
+
+ /*
+ * At this point, all we know is that the magic number is for a Btree.
+ * Check the version, the database may be out of date.
+ */
+ vers = btm->dbmeta.version;
+ if (F_ISSET(dbp, DB_AM_SWAP))
+ M_32_SWAP(vers);
+ switch (vers) {
+ case 6:
+ case 7:
+ __db_errx(env, DB_STR_A("1008",
+ "%s: btree version %lu requires a version upgrade",
+ "%s %lu"), name, (u_long)vers);
+ return (DB_OLD_VERSION);
+ case 8:
+ case 9:
+ break;
+ default:
+ __db_errx(env, DB_STR_A("1009",
+ "%s: unsupported btree version: %lu", "%s %lu"),
+ name, (u_long)vers);
+ return (EINVAL);
+ }
+
+ /* Swap the page if we need to. */
+ if (F_ISSET(dbp, DB_AM_SWAP) &&
+ (ret = __bam_mswap(env, (PAGE *)btm)) != 0)
+ return (ret);
+
+ /*
+ * Check application info against metadata info, and set info, flags,
+ * and type based on metadata info.
+ */
+ if ((ret =
+ __db_fchk(env, "DB->open", btm->dbmeta.flags, BTM_MASK)) != 0)
+ return (ret);
+
+ if (F_ISSET(&btm->dbmeta, BTM_RECNO)) {
+ if (dbp->type == DB_BTREE)
+ goto wrong_type;
+ dbp->type = DB_RECNO;
+ DB_ILLEGAL_METHOD(dbp, DB_OK_RECNO);
+ } else {
+ if (dbp->type == DB_RECNO)
+ goto wrong_type;
+ dbp->type = DB_BTREE;
+ DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE);
+ }
+
+ if (F_ISSET(&btm->dbmeta, BTM_DUP))
+ F_SET(dbp, DB_AM_DUP);
+ else
+ if (F_ISSET(dbp, DB_AM_DUP)) {
+ __db_errx(env, DB_STR_A("1010",
+ "%s: DB_DUP specified to open method but not set in database",
+ "%s"), name);
+ return (EINVAL);
+ }
+
+ if (F_ISSET(&btm->dbmeta, BTM_RECNUM)) {
+ if (dbp->type != DB_BTREE)
+ goto wrong_type;
+ F_SET(dbp, DB_AM_RECNUM);
+
+ if ((ret = __db_fcchk(env,
+ "DB->open", dbp->flags, DB_AM_DUP, DB_AM_RECNUM)) != 0)
+ return (ret);
+ } else
+ if (F_ISSET(dbp, DB_AM_RECNUM)) {
+ __db_errx(env, DB_STR_A("1011",
+ "%s: DB_RECNUM specified to open method but not set in database",
+ "%s"), name);
+ return (EINVAL);
+ }
+
+ if (F_ISSET(&btm->dbmeta, BTM_FIXEDLEN)) {
+ if (dbp->type != DB_RECNO)
+ goto wrong_type;
+ F_SET(dbp, DB_AM_FIXEDLEN);
+ } else
+ if (F_ISSET(dbp, DB_AM_FIXEDLEN)) {
+ __db_errx(env, DB_STR_A("1012",
+ "%s: DB_FIXEDLEN specified to open method but not set in database",
+ "%s"), name);
+ return (EINVAL);
+ }
+
+ if (F_ISSET(&btm->dbmeta, BTM_RENUMBER)) {
+ if (dbp->type != DB_RECNO)
+ goto wrong_type;
+ F_SET(dbp, DB_AM_RENUMBER);
+ } else
+ if (F_ISSET(dbp, DB_AM_RENUMBER)) {
+ __db_errx(env, DB_STR_A("1013",
+ "%s: DB_RENUMBER specified to open method but not set in database",
+ "%s"), name);
+ return (EINVAL);
+ }
+
+ if (F_ISSET(&btm->dbmeta, BTM_SUBDB))
+ F_SET(dbp, DB_AM_SUBDB);
+ else
+ if (F_ISSET(dbp, DB_AM_SUBDB)) {
+ __db_errx(env, DB_STR_A("1014",
+ "%s: multiple databases specified but not supported by file",
+ "%s"), name);
+ return (EINVAL);
+ }
+
+ if (F_ISSET(&btm->dbmeta, BTM_DUPSORT)) {
+ if (dbp->dup_compare == NULL)
+ dbp->dup_compare = __bam_defcmp;
+ F_SET(dbp, DB_AM_DUPSORT);
+ } else
+ if (dbp->dup_compare != NULL) {
+ __db_errx(env, DB_STR_A("1015",
+ "%s: duplicate sort specified but not supported in database",
+ "%s"), name);
+ return (EINVAL);
+ }
+
+#ifdef HAVE_COMPRESSION
+ if (F_ISSET(&btm->dbmeta, BTM_COMPRESS)) {
+ F_SET(dbp, DB_AM_COMPRESS);
+ if ((BTREE *)dbp->bt_internal != NULL &&
+ !DB_IS_COMPRESSED(dbp) &&
+ (ret = __bam_set_bt_compress(dbp, NULL, NULL)) != 0)
+ return (ret);
+ } else {
+ if ((BTREE *)dbp->bt_internal != NULL &&
+ DB_IS_COMPRESSED(dbp)) {
+ __db_errx(env, DB_STR_A("1016",
+	    "%s: compression specified to open method but not set in database",
+ "%s"), name);
+ return (EINVAL);
+ }
+ }
+#else
+ if (F_ISSET(&btm->dbmeta, BTM_COMPRESS)) {
+ __db_errx(env, DB_STR_A("1017",
+ "%s: compression support has not been compiled in", "%s"),
+ name);
+ return (EINVAL);
+ }
+#endif
+
+ /* Set the page size. */
+ dbp->pgsize = btm->dbmeta.pagesize;
+
+ /* Copy the file's ID. */
+ memcpy(dbp->fileid, btm->dbmeta.uid, DB_FILE_ID_LEN);
+
+ return (0);
+
+wrong_type:
+ if (dbp->type == DB_BTREE)
+ __db_errx(env, DB_STR("1018",
+ "open method type is Btree, database type is Recno"));
+ else
+ __db_errx(env, DB_STR("1019",
+ "open method type is Recno, database type is Btree"));
+ return (EINVAL);
+}
+
+/*
+ * __bam_read_root --
+ * Read the root page and check a tree.
+ *
+ * PUBLIC: int __bam_read_root __P((DB *,
+ * PUBLIC: DB_THREAD_INFO *, DB_TXN *, db_pgno_t, u_int32_t));
+ */
+int
+__bam_read_root(dbp, ip, txn, base_pgno, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ db_pgno_t base_pgno;
+ u_int32_t flags;
+{
+ BTMETA *meta;
+ BTREE *t;
+ DBC *dbc;
+ DB_LOCK metalock;
+ DB_MPOOLFILE *mpf;
+ int ret, t_ret;
+
+ COMPQUIET(flags, 0);
+
+ meta = NULL;
+ t = dbp->bt_internal;
+ LOCK_INIT(metalock);
+ mpf = dbp->mpf;
+ ret = 0;
+
+ /* Get a cursor. */
+ if ((ret = __db_cursor(dbp, ip, txn, &dbc,
+ F_ISSET(dbp, DB_AM_RECOVER) ? DB_RECOVER : 0)) != 0)
+ return (ret);
+
+ /* Get the metadata page. */
+ if ((ret =
+ __db_lget(dbc, 0, base_pgno, DB_LOCK_READ, 0, &metalock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &base_pgno, ip, dbc->txn, 0, &meta)) != 0)
+ goto err;
+
+ /*
+ * If the magic number is set, the tree has been created. Correct
+ * any fields that may not be right. Note, all of the local flags
+ * were set by DB->open.
+ *
+ * Otherwise, we'd better be in recovery or abort, in which case the
+ * metadata page will be created/initialized elsewhere.
+ *
+ * Ignore the last_pgno on the metadata page for snapshot transactions:
+ * we may be reading an old version of the page, and we've already
+ * set last_pgno from the file size. The only time this would matter
+ * is if we don't have ftruncate and there are some free pages at the
+ * end of the file: we could end up with holes.
+ */
+ if (meta->dbmeta.magic == DB_BTREEMAGIC) {
+ t->bt_minkey = meta->minkey;
+ t->re_pad = (int)meta->re_pad;
+ t->re_len = meta->re_len;
+
+ t->bt_meta = base_pgno;
+ t->bt_root = meta->root;
+ t->revision = dbp->mpf->mfp->revision;
+ if (PGNO(meta) == PGNO_BASE_MD &&
+ !F_ISSET(dbp, DB_AM_RECOVER) &&
+ (txn == NULL || !F_ISSET(txn, TXN_SNAPSHOT)) && (ret =
+ __memp_set_last_pgno(mpf, meta->dbmeta.last_pgno)) != 0)
+ goto err;
+ } else {
+ DB_ASSERT(dbp->env,
+ IS_RECOVERING(dbp->env) || F_ISSET(dbp, DB_AM_RECOVER));
+ }
+
+ /*
+ * !!!
+ * If creating a subdatabase, we've already done an insert when
+ * we put the subdatabase's entry into the master database, so
+ * our last-page-inserted value is wrongly initialized for the
+ * master database, not the subdatabase we're creating. I'm not
+ * sure where the *right* place to clear this value is, it's not
+ * intuitively obvious that it belongs here.
+ */
+ t->bt_lpgno = PGNO_INVALID;
+
+err: /* Put the metadata page back. */
+ if (meta != NULL && (t_ret = __memp_fput(mpf,
+ ip, meta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __LPUT(dbc, metalock)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __bam_init_meta --
+ *
+ * Initialize a btree meta-data page. The following fields may need
+ * to be updated later: last_pgno, root.
+ */
+static void
+__bam_init_meta(dbp, meta, pgno, lsnp)
+ DB *dbp;
+ BTMETA *meta;
+ db_pgno_t pgno;
+ DB_LSN *lsnp;
+{
+ BTREE *t;
+#ifdef HAVE_PARTITION
+ DB_PARTITION *part;
+#endif
+ ENV *env;
+
+ env = dbp->env;
+ t = dbp->bt_internal;
+
+ memset(meta, 0, sizeof(BTMETA));
+ meta->dbmeta.lsn = *lsnp;
+ meta->dbmeta.pgno = pgno;
+ meta->dbmeta.magic = DB_BTREEMAGIC;
+ meta->dbmeta.version = DB_BTREEVERSION;
+ meta->dbmeta.pagesize = dbp->pgsize;
+ if (F_ISSET(dbp, DB_AM_CHKSUM))
+ FLD_SET(meta->dbmeta.metaflags, DBMETA_CHKSUM);
+ if (F_ISSET(dbp, DB_AM_ENCRYPT)) {
+ meta->dbmeta.encrypt_alg = env->crypto_handle->alg;
+ DB_ASSERT(env, meta->dbmeta.encrypt_alg != 0);
+ meta->crypto_magic = meta->dbmeta.magic;
+ }
+ meta->dbmeta.type = P_BTREEMETA;
+ meta->dbmeta.free = PGNO_INVALID;
+ meta->dbmeta.last_pgno = pgno;
+ if (F_ISSET(dbp, DB_AM_DUP))
+ F_SET(&meta->dbmeta, BTM_DUP);
+ if (F_ISSET(dbp, DB_AM_FIXEDLEN))
+ F_SET(&meta->dbmeta, BTM_FIXEDLEN);
+ if (F_ISSET(dbp, DB_AM_RECNUM))
+ F_SET(&meta->dbmeta, BTM_RECNUM);
+ if (F_ISSET(dbp, DB_AM_RENUMBER))
+ F_SET(&meta->dbmeta, BTM_RENUMBER);
+ if (F_ISSET(dbp, DB_AM_SUBDB))
+ F_SET(&meta->dbmeta, BTM_SUBDB);
+ if (dbp->dup_compare != NULL)
+ F_SET(&meta->dbmeta, BTM_DUPSORT);
+#ifdef HAVE_COMPRESSION
+ if (DB_IS_COMPRESSED(dbp))
+ F_SET(&meta->dbmeta, BTM_COMPRESS);
+#endif
+ if (dbp->type == DB_RECNO)
+ F_SET(&meta->dbmeta, BTM_RECNO);
+ memcpy(meta->dbmeta.uid, dbp->fileid, DB_FILE_ID_LEN);
+
+ meta->minkey = t->bt_minkey;
+ meta->re_len = t->re_len;
+ meta->re_pad = (u_int32_t)t->re_pad;
+
+#ifdef HAVE_PARTITION
+ if ((part = dbp->p_internal) != NULL) {
+ meta->dbmeta.nparts = part->nparts;
+ if (F_ISSET(part, PART_CALLBACK))
+ FLD_SET(meta->dbmeta.metaflags, DBMETA_PART_CALLBACK);
+ if (F_ISSET(part, PART_RANGE))
+ FLD_SET(meta->dbmeta.metaflags, DBMETA_PART_RANGE);
+ }
+#endif
+}
+
+/*
+ * __bam_new_file --
+ * Create the necessary pages to begin a new database file.
+ *
+ * This code appears more complex than it is because of the two cases (named
+ * and unnamed). The way to read the code is that for each page being created,
+ * there are three parts: 1) a "get page" chunk (which either uses malloc'd
+ * memory or calls __memp_fget), 2) the initialization, and 3) the "put page"
+ * chunk which either does a fop write or an __memp_fput.
+ *
+ * PUBLIC: int __bam_new_file __P((DB *,
+ * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DB_FH *, const char *));
+ */
+int
+__bam_new_file(dbp, ip, txn, fhp, name)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ DB_FH *fhp;
+ const char *name;
+{
+ BTMETA *meta;
+ DBT pdbt;
+ DB_LSN lsn;
+ DB_MPOOLFILE *mpf;
+ DB_PGINFO pginfo;
+ ENV *env;
+ PAGE *root;
+ db_pgno_t pgno;
+ int ret, t_ret;
+ void *buf;
+
+ env = dbp->env;
+ mpf = dbp->mpf;
+ root = NULL;
+ meta = NULL;
+ buf = NULL;
+
+ if (F_ISSET(dbp, DB_AM_INMEM)) {
+ /* Build the meta-data page. */
+ pgno = PGNO_BASE_MD;
+ if ((ret = __memp_fget(mpf, &pgno,
+ ip, txn, DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &meta)) != 0)
+ return (ret);
+ LSN_NOT_LOGGED(lsn);
+ __bam_init_meta(dbp, meta, PGNO_BASE_MD, &lsn);
+ meta->root = 1;
+ meta->dbmeta.last_pgno = 1;
+ if ((ret =
+ __db_log_page(dbp, txn, &lsn, pgno, (PAGE *)meta)) != 0)
+ goto err;
+ ret = __memp_fput(mpf, ip, meta, dbp->priority);
+ meta = NULL;
+ if (ret != 0)
+ goto err;
+
+ /* Build the root page. */
+ pgno = 1;
+ if ((ret = __memp_fget(mpf, &pgno,
+ ip, txn, DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &root)) != 0)
+ goto err;
+ P_INIT(root, dbp->pgsize, 1, PGNO_INVALID, PGNO_INVALID,
+ LEAFLEVEL, dbp->type == DB_RECNO ? P_LRECNO : P_LBTREE);
+ LSN_NOT_LOGGED(root->lsn);
+ if ((ret =
+ __db_log_page(dbp, txn, &root->lsn, pgno, root)) != 0)
+ goto err;
+ ret = __memp_fput(mpf, ip, root, dbp->priority);
+ root = NULL;
+ if (ret != 0)
+ goto err;
+ } else {
+ memset(&pdbt, 0, sizeof(pdbt));
+
+ /* Build the meta-data page. */
+ pginfo.db_pagesize = dbp->pgsize;
+ pginfo.flags =
+ F_ISSET(dbp, (DB_AM_CHKSUM | DB_AM_ENCRYPT | DB_AM_SWAP));
+ pginfo.type = dbp->type;
+ pdbt.data = &pginfo;
+ pdbt.size = sizeof(pginfo);
+ if ((ret = __os_calloc(env, 1, dbp->pgsize, &buf)) != 0)
+ return (ret);
+ meta = (BTMETA *)buf;
+ LSN_NOT_LOGGED(lsn);
+ __bam_init_meta(dbp, meta, PGNO_BASE_MD, &lsn);
+ meta->root = 1;
+ meta->dbmeta.last_pgno = 1;
+ if ((ret = __db_pgout(
+ dbp->dbenv, PGNO_BASE_MD, meta, &pdbt)) != 0)
+ goto err;
+ if ((ret = __fop_write(env, txn, name, dbp->dirname,
+ DB_APP_DATA, fhp,
+ dbp->pgsize, 0, 0, buf, dbp->pgsize, 1, F_ISSET(
+ dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0)) != 0)
+ goto err;
+ meta = NULL;
+
+ /* Build the root page. */
+#ifdef DIAGNOSTIC
+ memset(buf, CLEAR_BYTE, dbp->pgsize);
+#endif
+ root = (PAGE *)buf;
+ P_INIT(root, dbp->pgsize, 1, PGNO_INVALID, PGNO_INVALID,
+ LEAFLEVEL, dbp->type == DB_RECNO ? P_LRECNO : P_LBTREE);
+ LSN_NOT_LOGGED(root->lsn);
+ if ((ret =
+ __db_pgout(dbp->dbenv, root->pgno, root, &pdbt)) != 0)
+ goto err;
+ if ((ret =
+ __fop_write(env, txn, name, dbp->dirname, DB_APP_DATA,
+ fhp, dbp->pgsize, 1, 0, buf, dbp->pgsize, 1, F_ISSET(
+ dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0)) != 0)
+ goto err;
+ root = NULL;
+ }
+
+err: if (buf != NULL)
+ __os_free(env, buf);
+ else {
+ if (meta != NULL &&
+ (t_ret = __memp_fput(mpf, ip,
+ meta, dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (root != NULL &&
+ (t_ret = __memp_fput(mpf, ip,
+ root, dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ return (ret);
+}
+
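+/*
+ * Layout sketch (informational): after __bam_new_file, a new database
+ * file contains exactly two pages:
+ *
+ *	pgno 0: BTMETA (magic DB_BTREEMAGIC, root = 1, last_pgno = 1)
+ *	pgno 1: empty leaf (P_LBTREE, or P_LRECNO for Recno, LEAFLEVEL)
+ *
+ * Subdatabases get the equivalent pair from __bam_new_subdb below, with
+ * both pages allocated out of the master database's file instead.
+ */
+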
+/*
+ * __bam_new_subdb --
+ * Create a metadata page and a root page for a new btree.
+ *
+ * PUBLIC: int __bam_new_subdb __P((DB *, DB *, DB_THREAD_INFO *, DB_TXN *));
+ */
+int
+__bam_new_subdb(mdbp, dbp, ip, txn)
+ DB *mdbp, *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+{
+ BTMETA *meta;
+ DBC *dbc;
+ DB_LOCK metalock;
+ DB_LSN lsn;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ PAGE *root;
+ int ret, t_ret;
+
+ env = mdbp->env;
+ mpf = mdbp->mpf;
+ dbc = NULL;
+ meta = NULL;
+ root = NULL;
+
+ if ((ret = __db_cursor(mdbp, ip, txn,
+ &dbc, CDB_LOCKING(env) ? DB_WRITECURSOR : 0)) != 0)
+ return (ret);
+
+ /* Get, and optionally create the metadata page. */
+ if ((ret = __db_lget(dbc,
+ 0, dbp->meta_pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &dbp->meta_pgno,
+ ip, txn, DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &meta)) != 0)
+ goto err;
+
+ /* Build meta-data page. */
+ lsn = meta->dbmeta.lsn;
+ __bam_init_meta(dbp, meta, dbp->meta_pgno, &lsn);
+ if ((ret = __db_log_page(mdbp,
+ txn, &meta->dbmeta.lsn, dbp->meta_pgno, (PAGE *)meta)) != 0)
+ goto err;
+
+ /* Create and initialize a root page. */
+ if ((ret = __db_new(dbc,
+ dbp->type == DB_RECNO ? P_LRECNO : P_LBTREE, NULL, &root)) != 0)
+ goto err;
+ root->level = LEAFLEVEL;
+
+ if (DBENV_LOGGING(env) &&
+#if !defined(DEBUG_WOP)
+ txn != NULL &&
+#endif
+ (ret = __bam_root_log(mdbp, txn, &meta->dbmeta.lsn, 0,
+ meta->dbmeta.pgno, root->pgno, &meta->dbmeta.lsn)) != 0)
+ goto err;
+
+ meta->root = root->pgno;
+ if ((ret =
+ __db_log_page(mdbp, txn, &root->lsn, root->pgno, root)) != 0)
+ goto err;
+
+ /* Release the metadata and root pages. */
+ if ((ret = __memp_fput(mpf, ip, meta, dbc->priority)) != 0)
+ goto err;
+ meta = NULL;
+ if ((ret = __memp_fput(mpf, ip, root, dbc->priority)) != 0)
+ goto err;
+ root = NULL;
+err:
+ if (meta != NULL)
+ if ((t_ret = __memp_fput(mpf, ip,
+ meta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (root != NULL)
+ if ((t_ret = __memp_fput(mpf, ip,
+ root, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __LPUT(dbc, metalock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (dbc != NULL)
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
diff --git a/src/btree/bt_put.c b/src/btree/bt_put.c
new file mode 100644
index 00000000..13316181
--- /dev/null
+++ b/src/btree/bt_put.c
@@ -0,0 +1,1087 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+
+static int __bam_build
+ __P((DBC *, u_int32_t, DBT *, PAGE *, u_int32_t, u_int32_t));
+static int __bam_dup_check __P((DBC *, u_int32_t,
+ PAGE *, u_int32_t, u_int32_t, db_indx_t *));
+static int __bam_dup_convert __P((DBC *, PAGE *, u_int32_t, u_int32_t));
+static int __bam_ovput
+ __P((DBC *, u_int32_t, db_pgno_t, PAGE *, u_int32_t, DBT *));
+static u_int32_t
+ __bam_partsize __P((DB *, u_int32_t, DBT *, PAGE *, u_int32_t));
+
+/*
+ * __bam_iitem --
+ * Insert an item into the tree.
+ *
+ * PUBLIC: int __bam_iitem __P((DBC *, DBT *, DBT *, u_int32_t, u_int32_t));
+ */
+int
+__bam_iitem(dbc, key, data, op, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t op, flags;
+{
+ BKEYDATA *bk, bk_tmp;
+ BTREE *t;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DBT bk_hdr, tdbt;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ PAGE *h;
+ db_indx_t cnt, indx;
+ u_int32_t data_size, have_bytes, need_bytes, needed, pages, pagespace;
+ char tmp_ch;
+ int cmp, bigkey, bigdata, del, dupadjust;
+ int padrec, replace, ret, t_ret, was_deleted;
+
+ COMPQUIET(cnt, 0);
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ mpf = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ t = dbp->bt_internal;
+ h = cp->page;
+ indx = cp->indx;
+ del = dupadjust = replace = was_deleted = 0;
+
+ /*
+ * Fixed-length records with partial puts: it's an error to specify
+	 * anything other than a simple overwrite.
+ */
+ if (F_ISSET(dbp, DB_AM_FIXEDLEN) &&
+ F_ISSET(data, DB_DBT_PARTIAL) && data->size != data->dlen)
+ return (__db_rec_repl(env, data->size, data->dlen));
+
+ /*
+ * Figure out how much space the data will take, including if it's a
+ * partial record.
+ *
+ * Fixed-length records: it's an error to specify a record that's
+ * longer than the fixed-length, and we never require less than
+ * the fixed-length record size.
+ */
+ data_size = F_ISSET(data, DB_DBT_PARTIAL) ?
+ __bam_partsize(dbp, op, data, h, indx) : data->size;
+ padrec = 0;
+ if (F_ISSET(dbp, DB_AM_FIXEDLEN)) {
+ if (data_size > t->re_len)
+ return (__db_rec_toobig(env, data_size, t->re_len));
+
+ /* Records that are deleted anyway needn't be padded out. */
+ if (!LF_ISSET(BI_DELETED) && data_size < t->re_len) {
+ padrec = 1;
+ data_size = t->re_len;
+ }
+ }
+
+ /*
+ * Handle partial puts or short fixed-length records: check whether we
+ * can just append the data or else build the real record. We can't
+ * append if there are secondaries: we need the whole data item for the
+ * application's secondary callback.
+ */
+ if (op == DB_CURRENT && dbp->dup_compare == NULL &&
+ F_ISSET(data, DB_DBT_PARTIAL) && !DB_IS_PRIMARY(dbp)) {
+ bk = GET_BKEYDATA(
+ dbp, h, indx + (TYPE(h) == P_LBTREE ? O_INDX : 0));
+ /*
+ * If the item is an overflow type, and the input DBT is
+ * partial, and begins at the length of the current item then
+ * it is an append. Avoid deleting and re-creating the entire
+ * offpage item.
+ */
+ if (B_TYPE(bk->type) == B_OVERFLOW &&
+ data->doff == ((BOVERFLOW *)bk)->tlen) {
+ /*
+ * If the cursor has not already cached the last page
+			 * in the offpage chain, we need to walk the chain
+ * to be sure that the page has been read.
+ */
+ if (cp->stream_start_pgno != ((BOVERFLOW *)bk)->pgno ||
+ cp->stream_off > data->doff || data->doff >
+ cp->stream_off + P_MAXSPACE(dbp, dbp->pgsize)) {
+ memset(&tdbt, 0, sizeof(DBT));
+ tdbt.doff = data->doff - 1;
+ /*
+ * Set the length to 1, to force __db_goff
+ * to do the traversal.
+ */
+ tdbt.dlen = tdbt.ulen = 1;
+ tdbt.data = &tmp_ch;
+ tdbt.flags = DB_DBT_PARTIAL | DB_DBT_USERMEM;
+
+ /*
+ * Read to the last page. It will be cached
+ * in the cursor.
+ */
+ if ((ret = __db_goff(
+ dbc, &tdbt, ((BOVERFLOW *)bk)->tlen,
+ ((BOVERFLOW *)bk)->pgno, NULL, NULL)) != 0)
+ return (ret);
+ }
+
+ /*
+ * Since this is an append, dlen is irrelevant (there
+ * are no bytes to overwrite). We need the caller's
+ * DBT size to end up with the total size of the item.
+ * From now on, use dlen as the length of the user's
+ * data that we are going to append.
+ * Don't futz with the caller's DBT any more than we
+ * have to in order to send back the size.
+ */
+ tdbt = *data;
+ tdbt.dlen = data->size;
+ tdbt.size = data_size;
+ data = &tdbt;
+ F_SET(data, DB_DBT_STREAMING);
+ }
+ }
+ if (!F_ISSET(data, DB_DBT_STREAMING) &&
+ (padrec || F_ISSET(data, DB_DBT_PARTIAL))) {
+ tdbt = *data;
+ if ((ret =
+ __bam_build(dbc, op, &tdbt, h, indx, data_size)) != 0)
+ return (ret);
+ data = &tdbt;
+ }
+
+ /*
+ * If the user has specified a duplicate comparison function, return
+ * an error if DB_CURRENT was specified and the replacement data
+ * doesn't compare equal to the current data. This stops apps from
+ * screwing up the duplicate sort order. We have to do this after
+ * we build the real record so that we're comparing the real items.
+ */
+ if (op == DB_CURRENT && dbp->dup_compare != NULL) {
+ if ((ret = __bam_cmp(dbc, data, h,
+ indx + (TYPE(h) == P_LBTREE ? O_INDX : 0),
+ dbp->dup_compare, &cmp)) != 0)
+ return (ret);
+ if (cmp != 0) {
+ __db_errx(env, DB_STR("1004",
+ "Existing data sorts differently from put data"));
+ return (EINVAL);
+ }
+ }
+
+ /*
+ * If the key or data item won't fit on a page, we'll have to store
+ * them on overflow pages.
+ */
+ needed = 0;
+ bigdata = data_size > cp->ovflsize;
+ switch (op) {
+ case DB_KEYFIRST:
+ /* We're adding a new key and data pair. */
+ bigkey = key->size > cp->ovflsize;
+ if (bigkey)
+ needed += BOVERFLOW_PSIZE;
+ else
+ needed += BKEYDATA_PSIZE(key->size);
+ if (bigdata)
+ needed += BOVERFLOW_PSIZE;
+ else
+ needed += BKEYDATA_PSIZE(data_size);
+ break;
+ case DB_AFTER:
+ case DB_BEFORE:
+ case DB_CURRENT:
+ /*
+ * We're either overwriting the data item of a key/data pair
+ * or we're creating a new on-page duplicate and only adding
+ * a data item.
+ *
+ * !!!
+ * We're not currently correcting for space reclaimed from
+ * already deleted items, but I don't think it's worth the
+ * complexity.
+ */
+ bigkey = 0;
+ if (op == DB_CURRENT) {
+ bk = GET_BKEYDATA(dbp, h,
+ indx + (TYPE(h) == P_LBTREE ? O_INDX : 0));
+ if (B_TYPE(bk->type) == B_KEYDATA)
+ have_bytes = BKEYDATA_PSIZE(bk->len);
+ else
+ have_bytes = BOVERFLOW_PSIZE;
+ need_bytes = 0;
+ } else {
+ have_bytes = 0;
+ need_bytes = sizeof(db_indx_t);
+ }
+ if (bigdata)
+ need_bytes += BOVERFLOW_PSIZE;
+ else
+ need_bytes += BKEYDATA_PSIZE(data_size);
+
+ if (have_bytes < need_bytes)
+ needed += need_bytes - have_bytes;
+ break;
+ default:
+ return (__db_unknown_flag(env, "DB->put", op));
+ }
+
+ /* Split the page if there's not enough room. */
+ if (P_FREESPACE(dbp, h) < needed)
+ return (DB_NEEDSPLIT);
+
+ /*
+ * Check to see if we will convert to off page duplicates -- if
+ * so, we'll need a page.
+ */
+ if (F_ISSET(dbp, DB_AM_DUP) &&
+ TYPE(h) == P_LBTREE && op != DB_KEYFIRST &&
+ P_FREESPACE(dbp, h) - needed <= dbp->pgsize / 2 &&
+ __bam_dup_check(dbc, op, h, indx, needed, &cnt)) {
+ pages = 1;
+ dupadjust = 1;
+ } else
+ pages = 0;
+
+ /*
+ * If we are not using transactions and there is a page limit
+ * set on the file, then figure out if things will fit before
+ * taking action.
+ */
+ if (dbc->txn == NULL && mpf->mfp->maxpgno != 0) {
+ pagespace = P_MAXSPACE(dbp, dbp->pgsize);
+ if (bigdata)
+ pages += ((data_size - 1) / pagespace) + 1;
+ if (bigkey)
+ pages += ((key->size - 1) / pagespace) + 1;
+
+ if (pages > (mpf->mfp->maxpgno - mpf->mfp->last_pgno))
+ return (__db_space_err(dbp));
+ }
+
+ if (F_ISSET(dbc, DBC_OPD))
+ LOCK_CHECK_OFF(dbc->thread_info);
+ ret = __memp_dirty(mpf, &h,
+ dbc->thread_info, dbc->txn, dbc->priority, 0);
+ if (cp->csp->page == cp->page)
+ cp->csp->page = h;
+ cp->page = h;
+ if (F_ISSET(dbc, DBC_OPD))
+ LOCK_CHECK_ON(dbc->thread_info);
+ if (ret != 0)
+ return (ret);
+
+ /*
+ * The code breaks it up into five cases:
+ *
+ * 1. Insert a new key/data pair.
+ * 2. Append a new data item (a new duplicate).
+ * 3. Insert a new data item (a new duplicate).
+ * 4. Delete and re-add the data item (overflow item).
+ * 5. Overwrite the data item.
+ */
+ switch (op) {
+ case DB_KEYFIRST: /* 1. Insert a new key/data pair. */
+ if (bigkey) {
+ if ((ret = __bam_ovput(dbc,
+ B_OVERFLOW, PGNO_INVALID, h, indx, key)) != 0)
+ return (ret);
+ } else
+ if ((ret = __db_pitem(dbc, h, indx,
+ BKEYDATA_SIZE(key->size), NULL, key)) != 0)
+ return (ret);
+
+ if ((ret = __bam_ca_di(dbc, PGNO(h), indx, 1)) != 0)
+ return (ret);
+ ++indx;
+ break;
+ case DB_AFTER: /* 2. Append a new data item. */
+ if (TYPE(h) == P_LBTREE) {
+ /* Copy the key for the duplicate and adjust cursors. */
+ if ((ret =
+ __bam_adjindx(dbc, h, indx + P_INDX, indx, 1)) != 0)
+ return (ret);
+ if ((ret =
+ __bam_ca_di(dbc, PGNO(h), indx + P_INDX, 1)) != 0)
+ return (ret);
+
+ indx += 3;
+
+ cp->indx += 2;
+ } else {
+ ++indx;
+ cp->indx += 1;
+ }
+ break;
+ case DB_BEFORE: /* 3. Insert a new data item. */
+ if (TYPE(h) == P_LBTREE) {
+ /* Copy the key for the duplicate and adjust cursors. */
+ if ((ret = __bam_adjindx(dbc, h, indx, indx, 1)) != 0)
+ return (ret);
+ if ((ret = __bam_ca_di(dbc, PGNO(h), indx, 1)) != 0)
+ return (ret);
+
+ ++indx;
+ }
+ break;
+ case DB_CURRENT:
+ /*
+ * Clear the cursor's deleted flag. The problem is that if
+ * we deadlock or fail while deleting the overflow item or
+ * replacing the non-overflow item, a subsequent cursor close
+ * will try and remove the item because the cursor's delete
+ * flag is set.
+ */
+ if ((ret = __bam_ca_delete(dbp, PGNO(h), indx, 0, NULL)) != 0)
+ return (ret);
+
+ if (TYPE(h) == P_LBTREE)
+ ++indx;
+ bk = GET_BKEYDATA(dbp, h, indx);
+
+ /*
+ * In a Btree deleted records aren't counted (deleted records
+ * are counted in a Recno because all accesses are based on
+ * record number). If it's a Btree and it's a DB_CURRENT
+ * operation overwriting a previously deleted record, increment
+ * the record count.
+ */
+ if (TYPE(h) == P_LBTREE || TYPE(h) == P_LDUP)
+ was_deleted = B_DISSET(bk->type);
+
+ /*
+ * 4. Delete and re-add the data item.
+ *
+ * If we're changing the type of the on-page structure, or we
+ * are referencing offpage items, we have to delete and then
+ * re-add the item. We do not do any cursor adjustments here
+ * because we're going to immediately re-add the item into the
+ * same slot.
+ */
+ if (bigdata || B_TYPE(bk->type) != B_KEYDATA) {
+ /*
+ * If streaming, don't delete the overflow item,
+ * just delete the item pointing to the overflow item.
+ * It will be added back in later, with the new size.
+ * We can't simply adjust the size of the item on the
+ * page, because there is no easy way to log a
+ * modification.
+ */
+ if (F_ISSET(data, DB_DBT_STREAMING)) {
+ if ((ret = __db_ditem(
+ dbc, h, indx, BOVERFLOW_SIZE)) != 0)
+ return (ret);
+ } else if ((ret = __bam_ditem(dbc, h, indx)) != 0)
+ return (ret);
+ del = 1;
+ break;
+ }
+
+ /* 5. Overwrite the data item. */
+ replace = 1;
+ break;
+ default:
+ return (__db_unknown_flag(env, "DB->put", op));
+ }
+
+ /* Add the data. */
+ if (bigdata) {
+ /*
+ * We do not have to handle deleted (BI_DELETED) records
+ * in this case; the actual records should never be created.
+ */
+ DB_ASSERT(env, !LF_ISSET(BI_DELETED));
+ ret = __bam_ovput(dbc,
+ B_OVERFLOW, PGNO_INVALID, h, indx, data);
+ } else {
+ if (LF_ISSET(BI_DELETED)) {
+ B_TSET_DELETED(bk_tmp.type, B_KEYDATA);
+ bk_tmp.len = data->size;
+ bk_hdr.data = &bk_tmp;
+ bk_hdr.size = SSZA(BKEYDATA, data);
+ ret = __db_pitem(dbc, h, indx,
+ BKEYDATA_SIZE(data->size), &bk_hdr, data);
+ } else if (replace)
+ ret = __bam_ritem(dbc, h, indx, data, 0);
+ else
+ ret = __db_pitem(dbc, h, indx,
+ BKEYDATA_SIZE(data->size), NULL, data);
+ }
+ if (ret != 0) {
+ if (del == 1 && (t_ret =
+ __bam_ca_di(dbc, PGNO(h), indx + 1, -1)) != 0) {
+ __db_err(env, t_ret, DB_STR("1005",
+ "cursor adjustment after delete failed"));
+ return (__env_panic(env, t_ret));
+ }
+ return (ret);
+ }
+
+ /*
+ * Re-position the cursors if necessary and reset the current cursor
+ * to point to the new item.
+ */
+ if (op != DB_CURRENT) {
+ if ((ret = __bam_ca_di(dbc, PGNO(h), indx, 1)) != 0)
+ return (ret);
+ cp->indx = TYPE(h) == P_LBTREE ? indx - O_INDX : indx;
+ }
+
+ /*
+ * If we've changed the record count, update the tree. There's no
+	 * need to adjust the count when the operation was performed on the
+	 * current record, unless that record was previously deleted.
+ */
+ if (F_ISSET(cp, C_RECNUM) && (op != DB_CURRENT || was_deleted))
+ if ((ret = __bam_adjust(dbc, 1)) != 0)
+ return (ret);
+
+ /*
+ * If a Btree leaf page is at least 50% full and we may have added or
+ * modified a duplicate data item, see if the set of duplicates takes
+ * up at least 25% of the space on the page. If it does, move it onto
+ * its own page.
+ */
+ if (dupadjust &&
+ (ret = __bam_dup_convert(dbc, h, indx - O_INDX, cnt)) != 0)
+ return (ret);
+
+ /* If we've modified a recno file, set the flag. */
+ if (dbc->dbtype == DB_RECNO)
+ t->re_modified = 1;
+
+ return (ret);
+}
+
+/*
+ * __bam_partsize --
+ * Figure out how much space a partial data item is in total.
+ */
+static u_int32_t
+__bam_partsize(dbp, op, data, h, indx)
+ DB *dbp;
+ u_int32_t op, indx;
+ DBT *data;
+ PAGE *h;
+{
+ BKEYDATA *bk;
+ u_int32_t nbytes;
+
+ /*
+ * If the record doesn't already exist, it's simply the data we're
+ * provided.
+ */
+ if (op != DB_CURRENT)
+ return (data->doff + data->size);
+
+ /*
+ * Otherwise, it's the data provided plus any already existing data
+ * that we're not replacing.
+ */
+ bk = GET_BKEYDATA(dbp, h, indx + (TYPE(h) == P_LBTREE ? O_INDX : 0));
+ nbytes =
+ B_TYPE(bk->type) == B_OVERFLOW ? ((BOVERFLOW *)bk)->tlen : bk->len;
+
+ return (__db_partsize(nbytes, data));
+}
+
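+/*
+ * Worked example (illustrative, assuming __db_partsize's usual arithmetic
+ * of replacing dlen bytes at doff with size bytes): for an existing
+ * 20-byte record and a partial put with doff = 10, dlen = 5 and size = 8,
+ * the result is 20 - 5 + 8 = 23 bytes.  If the put extends past the end
+ * of the record, the result is simply doff + size, and __bam_build below
+ * fills any gap with nul or pad bytes.
+ */
+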
+/*
+ * __bam_build --
+ * Build the real record for a partial put, or short fixed-length record.
+ */
+static int
+__bam_build(dbc, op, dbt, h, indx, nbytes)
+ DBC *dbc;
+ u_int32_t op, indx, nbytes;
+ DBT *dbt;
+ PAGE *h;
+{
+ BKEYDATA *bk, tbk;
+ BOVERFLOW *bo;
+ BTREE *t;
+ DB *dbp;
+ DBT copy, *rdata;
+ u_int32_t len, tlen;
+ u_int8_t *p;
+ int ret;
+
+ COMPQUIET(bo, NULL);
+
+ dbp = dbc->dbp;
+ t = dbp->bt_internal;
+
+	/* We use the record data return memory; it's only a short-term use. */
+ rdata = &dbc->my_rdata;
+ if (rdata->ulen < nbytes) {
+ if ((ret = __os_realloc(dbp->env,
+ nbytes, &rdata->data)) != 0) {
+ rdata->ulen = 0;
+ rdata->data = NULL;
+ return (ret);
+ }
+ rdata->ulen = nbytes;
+ }
+
+ /*
+ * We use nul or pad bytes for any part of the record that isn't
+ * specified; get it over with.
+ */
+ memset(rdata->data,
+ F_ISSET(dbp, DB_AM_FIXEDLEN) ? t->re_pad : 0, nbytes);
+
+ /*
+ * In the next clauses, we need to do three things: a) set p to point
+ * to the place at which to copy the user's data, b) set tlen to the
+ * total length of the record, not including the bytes contributed by
+ * the user, and c) copy any valid data from an existing record. If
+ * it's not a partial put (this code is called for both partial puts
+ * and fixed-length record padding) or it's a new key, we can cut to
+ * the chase.
+ */
+ if (!F_ISSET(dbt, DB_DBT_PARTIAL) || op != DB_CURRENT) {
+ p = (u_int8_t *)rdata->data + dbt->doff;
+ tlen = dbt->doff;
+ goto user_copy;
+ }
+
+ /* Find the current record. */
+ if (indx < NUM_ENT(h)) {
+ bk = GET_BKEYDATA(dbp, h, indx + (TYPE(h) == P_LBTREE ?
+ O_INDX : 0));
+ bo = (BOVERFLOW *)bk;
+ } else {
+ bk = &tbk;
+ B_TSET(bk->type, B_KEYDATA);
+ bk->len = 0;
+ }
+ if (B_TYPE(bk->type) == B_OVERFLOW) {
+ /*
+ * In the case of an overflow record, we shift things around
+ * in the current record rather than allocate a separate copy.
+ */
+ memset(&copy, 0, sizeof(copy));
+ if ((ret = __db_goff(dbc, &copy, bo->tlen, bo->pgno,
+ &rdata->data, &rdata->ulen)) != 0)
+ return (ret);
+
+ /* Skip any leading data from the original record. */
+ tlen = dbt->doff;
+ p = (u_int8_t *)rdata->data + dbt->doff;
+
+ /*
+ * Copy in any trailing data from the original record.
+ *
+ * If the original record was larger than the original offset
+ * plus the bytes being deleted, there is trailing data in the
+ * original record we need to preserve. If we aren't deleting
+ * the same number of bytes as we're inserting, copy it up or
+ * down, into place.
+ *
+ * Use memmove(), the regions may overlap.
+ */
+ if (bo->tlen > dbt->doff + dbt->dlen) {
+ len = bo->tlen - (dbt->doff + dbt->dlen);
+ if (dbt->dlen != dbt->size)
+ memmove(p + dbt->size, p + dbt->dlen, len);
+ tlen += len;
+ }
+ } else {
+ /* Copy in any leading data from the original record. */
+ memcpy(rdata->data,
+ bk->data, dbt->doff > bk->len ? bk->len : dbt->doff);
+ tlen = dbt->doff;
+ p = (u_int8_t *)rdata->data + dbt->doff;
+
+ /* Copy in any trailing data from the original record. */
+ len = dbt->doff + dbt->dlen;
+ if (bk->len > len) {
+ memcpy(p + dbt->size, bk->data + len, bk->len - len);
+ tlen += bk->len - len;
+ }
+ }
+
+user_copy:
+ /*
+ * Copy in the application provided data -- p and tlen must have been
+ * initialized above.
+ */
+ memcpy(p, dbt->data, dbt->size);
+ tlen += dbt->size;
+
+ /* Set the DBT to reference our new record. */
+ rdata->size = F_ISSET(dbp, DB_AM_FIXEDLEN) ? t->re_len : tlen;
+ rdata->dlen = 0;
+ rdata->doff = 0;
+ rdata->flags = 0;
+ *dbt = *rdata;
+ return (0);
+}
+
+/*
+ * __bam_ritem --
+ * Replace an item on a page.
+ *
+ * PUBLIC: int __bam_ritem __P((DBC *, PAGE *, u_int32_t, DBT *, u_int32_t));
+ */
+int
+__bam_ritem(dbc, h, indx, data, typeflag)
+ DBC *dbc;
+ PAGE *h;
+ u_int32_t indx;
+ DBT *data;
+ u_int32_t typeflag;
+{
+ BKEYDATA *bk;
+ DB *dbp;
+ DBT orig, repl;
+ db_indx_t min, prefix, suffix;
+ u_int32_t len;
+ int ret;
+ u_int8_t *dp, *p, *t, type;
+
+ dbp = dbc->dbp;
+
+ /*
+ * Replace a single item onto a page. The logic figuring out where
+ * to insert and whether it fits is handled in the caller. All we do
+ * here is manage the page shuffling.
+ */
+ bk = GET_BKEYDATA(dbp, h, indx);
+ len = bk->len;
+ dp = bk->data;
+ type = bk->type;
+ typeflag = B_DISSET(type);
+
+ /* Log the change. */
+ if (DBC_LOGGING(dbc)) {
+ /*
+ * We might as well check to see if the two data items share
+ * a common prefix and suffix -- it can save us a lot of log
+		 * space if they're large.
+ */
+ min = data->size < len ? data->size : len;
+ for (prefix = 0,
+ p = dp, t = data->data;
+ prefix < min && *p == *t; ++prefix, ++p, ++t)
+ ;
+
+ min -= prefix;
+ for (suffix = 0,
+ p = (u_int8_t *)dp + len - 1,
+ t = (u_int8_t *)data->data + data->size - 1;
+ suffix < min && *p == *t; ++suffix, --p, --t)
+ ;
+
+ /* We only log the parts of the keys that have changed. */
+ orig.data = (u_int8_t *)dp + prefix;
+ orig.size = len - (prefix + suffix);
+ repl.data = (u_int8_t *)data->data + prefix;
+ repl.size = data->size - (prefix + suffix);
+ if ((ret = __bam_repl_log(dbp, dbc->txn, &LSN(h), 0, PGNO(h),
+ &LSN(h), (u_int32_t)indx, typeflag,
+ &orig, &repl, (u_int32_t)prefix, (u_int32_t)suffix)) != 0)
+ return (ret);
+ } else
+ LSN_NOT_LOGGED(LSN(h));
+
+ return (__bam_ritem_nolog(dbc, h, indx, NULL, data, type));
+}
+
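+/*
+ * Worked example (illustrative only): replacing the 6-byte item "abcdef"
+ * with "abXYef" in __bam_ritem above yields prefix = 2 and suffix = 2, so
+ * only the changed 2-byte spans "cd" (orig) and "XY" (repl) are written
+ * to the log, rather than both full items.
+ */
+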
+/*
+ * __bam_ritem_nolog --
+ * Replace an item on a page.
+ *
+ * PUBLIC: int __bam_ritem_nolog __P((DBC *,
+ * PUBLIC: PAGE *, u_int32_t, DBT *, DBT *, u_int32_t));
+ */
+int
+__bam_ritem_nolog(dbc, h, indx, hdr, data, type)
+ DBC *dbc;
+ PAGE *h;
+ u_int32_t indx;
+ DBT *hdr, *data;
+ u_int32_t type;
+{
+ BKEYDATA *bk;
+ BINTERNAL *bi;
+ DB *dbp;
+ db_indx_t cnt, off, lo, ln;
+ db_indx_t *inp;
+ int32_t nbytes;
+ u_int8_t *p, *t;
+
+ dbp = dbc->dbp;
+ /*
+ * Set references to the first in-use byte on the page and the
+ * first byte of the item being replaced.
+ */
+ inp = P_INP(dbp, h);
+ p = (u_int8_t *)h + HOFFSET(h);
+ if (TYPE(h) == P_IBTREE) {
+ bi = GET_BINTERNAL(dbp, h, indx);
+ t = (u_int8_t *)bi;
+ lo = (db_indx_t)BINTERNAL_SIZE(bi->len);
+ if (data == NULL) {
+ DB_ASSERT(dbp->env, hdr != NULL);
+			bi = (BINTERNAL *)hdr->data;
+ P_16_COPY(&bi->len, &cnt);
+ ln = (db_indx_t)BINTERNAL_SIZE(cnt);
+ } else
+ ln = (db_indx_t)BINTERNAL_SIZE(data->size);
+ } else {
+ bk = GET_BKEYDATA(dbp, h, indx);
+ t = (u_int8_t *)bk;
+ lo = (db_indx_t)BKEYDATA_SIZE(bk->len);
+ ln = (db_indx_t)BKEYDATA_SIZE(data->size);
+ }
+
+ /*
+ * If the entry is growing in size, shift the beginning of the data
+ * part of the page down. If the entry is shrinking in size, shift
+ * the beginning of the data part of the page up. Use memmove(3),
+ * the regions overlap.
+ */
+ if (lo != ln) {
+ nbytes = (int32_t)(lo - ln); /* Signed difference. */
+ if (p == t) /* First index is fast. */
+ inp[indx] += (u_int32_t)nbytes;
+ else { /* Else, shift the page. */
+ memmove(p + nbytes, p, (size_t)(t - p));
+
+ /* Adjust the indices' offsets. */
+ off = (u_int32_t)inp[indx];
+ for (cnt = 0; cnt < NUM_ENT(h); ++cnt)
+ if (inp[cnt] <= off)
+ inp[cnt] += (u_int32_t)nbytes;
+ }
+
+ /* Clean up the page and adjust the item's reference. */
+ HOFFSET(h) += (u_int32_t)nbytes;
+ t += nbytes;
+ }
+
+ /* Copy the new item onto the page. */
+ if (TYPE(h) == P_IBTREE) {
+ DB_ASSERT(dbp->env, hdr != NULL);
+ memcpy(t, hdr->data, hdr->size);
+ bi = (BINTERNAL *)t;
+ if (data != NULL && data->size != 0)
+ memcpy(bi->data, data->data, data->size);
+ } else {
+ bk = (BKEYDATA *)t;
+ bk->len = data->size;
+ B_TSET(bk->type, type);
+ memcpy(bk->data, data->data, bk->len);
+ }
+
+ return (0);
+}
+
+/*
+ * __bam_irep --
+ * Replace an item on an internal page.
+ *
+ * PUBLIC: int __bam_irep __P((DBC *, PAGE *, u_int32_t, DBT *, DBT *));
+ */
+int
+__bam_irep(dbc, h, indx, hdr, data)
+ DBC *dbc;
+ PAGE *h;
+ u_int32_t indx;
+ DBT *hdr;
+ DBT *data;
+{
+ BINTERNAL *bi, *bn;
+ DB *dbp;
+ DBT dbt;
+ int ret;
+
+ dbp = dbc->dbp;
+
+ bi = GET_BINTERNAL(dbp, h, indx);
+	bn = (BINTERNAL *)hdr->data;
+
+ if (B_TYPE(bi->type) == B_OVERFLOW &&
+ (ret = __db_doff(dbc, ((BOVERFLOW *)bi->data)->pgno)) != 0)
+ return (ret);
+
+ if (DBC_LOGGING(dbc)) {
+ dbt.data = bi;
+ dbt.size = BINTERNAL_SIZE(bi->len);
+ if ((ret = __bam_irep_log(dbp, dbc->txn, &LSN(h), 0, PGNO(h),
+ &LSN(h), (u_int32_t)indx, TYPE(h), hdr, data, &dbt)) != 0)
+ return (ret);
+ } else
+ LSN_NOT_LOGGED(LSN(h));
+
+ return (__bam_ritem_nolog(dbc, h, indx, hdr, data, bn->type));
+}
+
+/*
+ * __bam_dup_check --
+ * Check to see if the duplicate set at indx should have its own page.
+ */
+static int
+__bam_dup_check(dbc, op, h, indx, sz, cntp)
+ DBC *dbc;
+ u_int32_t op;
+ PAGE *h;
+ u_int32_t indx, sz;
+ db_indx_t *cntp;
+{
+ BKEYDATA *bk;
+ DB *dbp;
+ db_indx_t cnt, first, *inp;
+
+ dbp = dbc->dbp;
+ inp = P_INP(dbp, h);
+
+ /*
+ * Count the duplicate records and calculate how much room they're
+ * using on the page.
+ */
+ while (indx > 0 && inp[indx] == inp[indx - P_INDX])
+ indx -= P_INDX;
+
+ /* Count the key once. */
+ bk = GET_BKEYDATA(dbp, h, indx);
+ sz += B_TYPE(bk->type) == B_KEYDATA ?
+ BKEYDATA_PSIZE(bk->len) : BOVERFLOW_PSIZE;
+
+ /*
+ * Sum up all the data items.
+ * Account for the record being inserted. If we are replacing it,
+ * don't count it twice.
+ *
+ * We execute the loop with first == indx to get the size of the
+ * first record.
+ */
+ cnt = op == DB_CURRENT ? 0 : 1;
+ for (first = indx;
+ indx < NUM_ENT(h) && inp[first] == inp[indx];
+ ++cnt, indx += P_INDX) {
+ bk = GET_BKEYDATA(dbp, h, indx + O_INDX);
+ sz += B_TYPE(bk->type) == B_KEYDATA ?
+ BKEYDATA_PSIZE(bk->len) : BOVERFLOW_PSIZE;
+ }
+
+ /*
+ * We have to do these checks when the user is replacing the cursor's
+ * data item -- if the application replaces a duplicate item with a
+ * larger data item, it can increase the amount of space used by the
+ * duplicates, requiring this check. But that means we may have done
+ * this check when it wasn't a duplicate item after all.
+ */
+ if (cnt == 1)
+ return (0);
+
+ /*
+ * If this set of duplicates is using more than 25% of the page, move
+ * them off. The choice of 25% is a WAG, but the value must be small
+ * enough that we can always split a page without putting duplicates
+ * on two different pages.
+ */
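+	/*
+	 * For example, with a hypothetical 8192-byte page, a duplicate
+	 * set whose key plus data items total 2048 bytes or more is
+	 * moved to its own page.
+	 */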
+ if (sz < dbp->pgsize / 4)
+ return (0);
+
+ *cntp = cnt;
+ return (1);
+}
+
+/*
+ * __bam_dup_convert --
+ * Move a set of duplicates off-page and into their own tree.
+ */
+static int
+__bam_dup_convert(dbc, h, indx, cnt)
+ DBC *dbc;
+ PAGE *h;
+ u_int32_t indx, cnt;
+{
+ BKEYDATA *bk;
+ DB *dbp;
+ DBT hdr;
+ DB_LOCK lock;
+ DB_MPOOLFILE *mpf;
+ PAGE *dp;
+ db_indx_t cpindx, dindx, first, *inp;
+ int ret, t_ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ inp = P_INP(dbp, h);
+
+ /* Move to the beginning of the dup set. */
+ while (indx > 0 && inp[indx] == inp[indx - P_INDX])
+ indx -= P_INDX;
+
+ /* Get a new page. */
+ if ((ret = __db_new(dbc,
+ dbp->dup_compare == NULL ? P_LRECNO : P_LDUP, &lock, &dp)) != 0)
+ return (ret);
+ P_INIT(dp, dbp->pgsize, dp->pgno,
+ PGNO_INVALID, PGNO_INVALID, LEAFLEVEL, TYPE(dp));
+
+ /*
+ * Move this set of duplicates off the page. First points to the first
+ * key of the first duplicate key/data pair, cnt is the number of pairs
+ * we're dealing with.
+ */
+ memset(&hdr, 0, sizeof(hdr));
+ first = indx;
+ dindx = indx;
+ cpindx = 0;
+ do {
+ /* Move cursors referencing the old entry to the new entry. */
+ if ((ret = __bam_ca_dup(dbc, first,
+ PGNO(h), indx, PGNO(dp), cpindx)) != 0)
+ goto err;
+
+		/*
+		 * Copy the entry to the new page.  If the off-page
+		 * duplicate page is a Btree page (i.e., dup_compare is
+		 * non-NULL; we use Btree pages for sorted dups and Recno
+		 * pages for unsorted dups), move all entries normally,
+		 * even deleted ones.  If it's a Recno page, deleted
+		 * entries are discarded (if the deleted entry is an
+		 * overflow item, free up those pages).
+		 */
+ bk = GET_BKEYDATA(dbp, h, dindx + 1);
+ hdr.data = bk;
+ hdr.size = B_TYPE(bk->type) == B_KEYDATA ?
+ BKEYDATA_SIZE(bk->len) : BOVERFLOW_SIZE;
+ if (dbp->dup_compare == NULL && B_DISSET(bk->type)) {
+ /*
+ * Unsorted dups, i.e. recno page, and we have
+ * a deleted entry, don't move it, but if it was
+ * an overflow entry, we need to free those pages.
+ */
+ if (B_TYPE(bk->type) == B_OVERFLOW &&
+ (ret = __db_doff(dbc,
+ (GET_BOVERFLOW(dbp, h, dindx + 1))->pgno)) != 0)
+ goto err;
+ } else {
+ if ((ret = __db_pitem(
+ dbc, dp, cpindx, hdr.size, &hdr, NULL)) != 0)
+ goto err;
+ ++cpindx;
+ }
+ /* Delete all but the last reference to the key. */
+ if (cnt != 1) {
+ if ((ret = __bam_adjindx(dbc,
+ h, dindx, first + 1, 0)) != 0)
+ goto err;
+ } else
+ dindx++;
+
+ /* Delete the data item. */
+ if ((ret = __db_ditem(dbc, h, dindx, hdr.size)) != 0)
+ goto err;
+ indx += P_INDX;
+ } while (--cnt);
+
+ /* Put in a new data item that points to the duplicates page. */
+ if ((ret = __bam_ovput(dbc,
+ B_DUPLICATE, dp->pgno, h, first + 1, NULL)) != 0)
+ goto err;
+
+ /* Adjust cursors for all the above movements. */
+ ret = __bam_ca_di(dbc,
+ PGNO(h), first + P_INDX, (int)(first + P_INDX - indx));
+
+err: if ((t_ret = __memp_fput(mpf,
+ dbc->thread_info, dp, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ (void)__TLPUT(dbc, lock);
+ return (ret);
+}
+
+/*
+ * __bam_ovput --
+ * Build an item for an off-page duplicates page or overflow page and
+ * insert it on the page.
+ */
+static int
+__bam_ovput(dbc, type, pgno, h, indx, item)
+ DBC *dbc;
+ u_int32_t type, indx;
+ db_pgno_t pgno;
+ PAGE *h;
+ DBT *item;
+{
+ BOVERFLOW bo;
+ DBT hdr;
+ int ret;
+
+ UMRW_SET(bo.unused1);
+ B_TSET(bo.type, type);
+ UMRW_SET(bo.unused2);
+
+	/*
+	 * If we're creating an overflow item, do so and acquire the page
+	 * number for it.  If we're creating an off-page duplicates tree,
+	 * we have been given the page number as an argument.
+	 */
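+	/*
+	 * For example, __bam_dup_convert above creates the on-page
+	 * reference to a new duplicates page with
+	 *
+	 *	__bam_ovput(dbc, B_DUPLICATE, dp->pgno, h, first + 1, NULL)
+	 *
+	 * while overflow callers pass B_OVERFLOW and the item to be
+	 * stored off-page.
+	 */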
+ if (type == B_OVERFLOW) {
+ if ((ret = __db_poff(dbc, item, &bo.pgno)) != 0)
+ return (ret);
+ bo.tlen = item->size;
+ } else {
+ bo.pgno = pgno;
+ bo.tlen = 0;
+ }
+
+ /* Store the new record on the page. */
+ memset(&hdr, 0, sizeof(hdr));
+ hdr.data = &bo;
+ hdr.size = BOVERFLOW_SIZE;
+ return (__db_pitem(dbc, h, indx, BOVERFLOW_SIZE, &hdr, NULL));
+}
diff --git a/src/btree/bt_rec.c b/src/btree/bt_rec.c
new file mode 100644
index 00000000..026564b6
--- /dev/null
+++ b/src/btree/bt_rec.c
@@ -0,0 +1,2036 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+
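+/*
+ * IS_BTREE_PAGE --
+ *	True for btree-format pages (internal, leaf or sorted-duplicate),
+ *	as opposed to recno-format pages.
+ */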
+#define IS_BTREE_PAGE(pagep) \
+ (TYPE(pagep) == P_IBTREE || \
+ TYPE(pagep) == P_LBTREE || TYPE(pagep) == P_LDUP)
+
+/*
+ * __bam_split_recover --
+ * Recovery function for split.
+ *
+ * PUBLIC: int __bam_split_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_split_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_split_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_LSN *plsnp;
+ DB_MPOOLFILE *mpf;
+ PAGE *_lp, *lp, *np, *pp, *_rp, *rp, *sp;
+ db_pgno_t pgno, parent_pgno;
+ u_int32_t opflags, size;
+ int cmp, l_update, p_update, r_update, ret, rootsplit, t_ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ REC_PRINT(__bam_split_print);
+
+ _lp = lp = np = pp = _rp = rp = NULL;
+ sp = NULL;
+
+ REC_INTRO(__bam_split_read, ip, 0);
+
+ opflags = OP_MODE_GET(argp->opflags);
+ if ((ret = __db_cursor_int(file_dbp, ip, NULL,
+ (opflags & SPL_RECNO) ? DB_RECNO : DB_BTREE,
+ PGNO_INVALID, DB_RECOVER, NULL, &dbc)) != 0)
+ goto out;
+ if (opflags & SPL_NRECS)
+ F_SET((BTREE_CURSOR *)dbc->internal, C_RECNUM);
+
+ /*
+ * There are two kinds of splits that we have to recover from. The
+ * first is a root-page split, where the root page is split from a
+ * leaf page into an internal page and two new leaf pages are created.
+ * The second is where a page is split into two pages, and a new key
+ * is inserted into the parent page.
+ *
+ * DBTs are not aligned in log records, so we need to copy the page
+ * so that we can access fields within it throughout this routine.
+ * Although we could hardcode the unaligned copies in this routine,
+ * we will be calling into regular btree functions with this page,
+ * so it's got to be aligned. Copying it into allocated memory is
+ * the only way to guarantee this.
+ */
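+	/*
+	 * A minimal sketch of the hazard being avoided (hypothetical
+	 * offsets): the DBT payload may begin at an odd byte offset in
+	 * the log record, so a direct cast such as
+	 *
+	 *	sp = (PAGE *)argp->pg.data;
+	 *
+	 * could fault on strict-alignment hardware; memory returned by
+	 * __os_malloc is suitably aligned for any structure.
+	 */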
+ if ((ret = __os_malloc(env, argp->pg.size, &sp)) != 0)
+ goto out;
+ memcpy(sp, argp->pg.data, argp->pg.size);
+
+ pgno = PGNO(sp);
+ parent_pgno = argp->ppgno;
+ rootsplit = parent_pgno == pgno;
+
+ /* Get the pages going down the tree. */
+ REC_FGET(mpf, ip, parent_pgno, &pp, left);
+left: REC_FGET(mpf, ip, argp->left, &lp, right);
+right: REC_FGET(mpf, ip, argp->right, &rp, redo);
+
+redo: if (DB_REDO(op)) {
+ l_update = r_update = p_update = 0;
+ /*
+ * Decide if we need to resplit the page.
+ *
+ * If this is a root split, then the root has to exist unless
+ * we have truncated it due to a future deallocation.
+ */
+ if (pp != NULL) {
+ if (rootsplit)
+ plsnp = &LSN(argp->pg.data);
+ else
+ plsnp = &argp->plsn;
+ cmp = LOG_COMPARE(&LSN(pp), plsnp);
+ CHECK_LSN(env, op, cmp, &LSN(pp), plsnp);
+ if (cmp == 0)
+ p_update = 1;
+ }
+
+ if (lp != NULL) {
+ cmp = LOG_COMPARE(&LSN(lp), &argp->llsn);
+ CHECK_LSN(env, op, cmp, &LSN(lp), &argp->llsn);
+ if (cmp == 0)
+ l_update = 1;
+ }
+
+ if (rp != NULL) {
+ cmp = LOG_COMPARE(&LSN(rp), &argp->rlsn);
+ CHECK_LSN(env, op, cmp, &LSN(rp), &argp->rlsn);
+ if (cmp == 0)
+ r_update = 1;
+ }
+
+ if (!p_update && !l_update && !r_update)
+ goto check_next;
+
+ /* Allocate and initialize new left/right child pages. */
+ if ((ret = __os_malloc(env, file_dbp->pgsize, &_lp)) != 0 ||
+ (ret = __os_malloc(env, file_dbp->pgsize, &_rp)) != 0)
+ goto out;
+ if (rootsplit) {
+ P_INIT(_lp, file_dbp->pgsize, argp->left,
+ PGNO_INVALID,
+ ISINTERNAL(sp) ? PGNO_INVALID : argp->right,
+ LEVEL(sp), TYPE(sp));
+ P_INIT(_rp, file_dbp->pgsize, argp->right,
+ ISINTERNAL(sp) ? PGNO_INVALID : argp->left,
+ PGNO_INVALID, LEVEL(sp), TYPE(sp));
+ } else {
+ P_INIT(_lp, file_dbp->pgsize, PGNO(sp),
+ ISINTERNAL(sp) ? PGNO_INVALID : PREV_PGNO(sp),
+ ISINTERNAL(sp) ? PGNO_INVALID : argp->right,
+ LEVEL(sp), TYPE(sp));
+ P_INIT(_rp, file_dbp->pgsize, argp->right,
+ ISINTERNAL(sp) ? PGNO_INVALID : sp->pgno,
+ ISINTERNAL(sp) ? PGNO_INVALID : NEXT_PGNO(sp),
+ LEVEL(sp), TYPE(sp));
+ }
+
+ /* Split the page. */
+ if ((ret = __bam_copy(file_dbp, sp, _lp, 0, argp->indx)) != 0 ||
+ (ret = __bam_copy(file_dbp, sp, _rp, argp->indx,
+ NUM_ENT(sp))) != 0)
+ goto out;
+
+ if (l_update) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &lp);
+ memcpy(lp, _lp, file_dbp->pgsize);
+ lp->lsn = *lsnp;
+ }
+
+ if (r_update) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &rp);
+ memcpy(rp, _rp, file_dbp->pgsize);
+ rp->lsn = *lsnp;
+ }
+
+ /*
+ * Drop the latches on the lower level pages before
+ * getting an exclusive latch on the higher level page.
+ */
+		if (lp != NULL && (ret = __memp_fput(mpf,
+		    ip, lp, file_dbp->priority)) != 0)
+			goto out;
+		lp = NULL;
+		if (rp != NULL && (ret = __memp_fput(mpf,
+		    ip, rp, file_dbp->priority)) != 0)
+			goto out;
+ rp = NULL;
+ /*
+ * If the parent page is wrong, update it.
+ * For recno the insert into an existing parent
+ * was logged separately.
+ * If it is a root page update initialize the page and
+ * update the record counts if needed.
+ * Then insert the record for the right hand child page.
+ */
+ if (p_update) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pp);
+
+ if (rootsplit) {
+ P_INIT(pp, file_dbp->pgsize, pgno, PGNO_INVALID,
+ PGNO_INVALID, _lp->level + 1,
+ (opflags & SPL_RECNO) ?
+ P_IRECNO : P_IBTREE);
+ if (opflags & SPL_NRECS) {
+ RE_NREC_SET(pp,
+ __bam_total(file_dbp, _lp) +
+ __bam_total(file_dbp, _rp));
+ }
+ if ((ret = __db_pitem_nolog(dbc, pp,
+ argp->pindx, argp->pentry.size,
+ &argp->pentry, NULL)) != 0)
+ goto out;
+
+ } else if (opflags & SPL_NRECS)
+ goto recno;
+ if ((ret = __db_pitem_nolog(dbc, pp, argp->pindx + 1,
+ argp->rentry.size, &argp->rentry, NULL)) != 0)
+ goto out;
+recno: pp->lsn = *lsnp;
+ }
+
+check_next: /*
+ * Finally, redo the next-page link if necessary. This is of
+ * interest only if it wasn't a root split -- inserting a new
+ * page in the tree requires that any following page have its
+ * previous-page pointer updated to our new page. The next
+ * page must exist because we're redoing the operation.
+ */
+ if (!rootsplit && argp->npgno != PGNO_INVALID) {
+ REC_FGET(mpf, ip, argp->npgno, &np, done);
+ cmp = LOG_COMPARE(&LSN(np), &argp->nlsn);
+ CHECK_LSN(env, op, cmp, &LSN(np), &argp->nlsn);
+ if (cmp == 0) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &np);
+ PREV_PGNO(np) = argp->right;
+ np->lsn = *lsnp;
+ }
+ }
+ } else {
+ /*
+ * If it's a root split and the left child ever existed, update
+		 * its LSN; otherwise it's the split page.  If the right
+		 * child ever existed, root split or not, update its LSN.
+ * The undo of the page allocation(s) will restore them to the
+ * free list.
+ */
+ if (rootsplit && lp != NULL &&
+ LOG_COMPARE(lsnp, &LSN(lp)) == 0) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &lp);
+ lp->lsn = argp->llsn;
+ }
+ if (rp != NULL &&
+ LOG_COMPARE(lsnp, &LSN(rp)) == 0) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &rp);
+ rp->lsn = argp->rlsn;
+ }
+ /*
+ * Drop the lower level pages before getting an exclusive
+ * latch on the parent.
+ */
+		if (rp != NULL && (ret = __memp_fput(mpf,
+		    ip, rp, file_dbp->priority)) != 0)
+			goto out;
+ rp = NULL;
+
+ /*
+		 * Check the state of the split page.  If it's a root split
+		 * then that's the root page; otherwise it's the left page.
+ */
+ if (rootsplit) {
+ DB_ASSERT(env, pgno == argp->ppgno);
+ if (lp != NULL && (ret = __memp_fput(mpf, ip,
+ lp, file_dbp->priority)) != 0)
+ goto out;
+ lp = pp;
+ pp = NULL;
+ }
+ if (lp != NULL) {
+ cmp = LOG_COMPARE(lsnp, &LSN(lp));
+ CHECK_ABORT(env, op, cmp, &LSN(lp), lsnp);
+ if (cmp == 0) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &lp);
+ memcpy(lp, argp->pg.data, argp->pg.size);
+				if ((ret = __memp_fput(mpf,
+				    ip, lp, file_dbp->priority)) != 0)
+ goto out;
+ lp = NULL;
+ }
+ }
+
+ /*
+		 * Next we can update the parent, removing the new index.
+		 * If the tree maintains record numbers, that update was
+		 * logged separately.
+ */
+ if (pp != NULL) {
+ DB_ASSERT(env, !rootsplit);
+ cmp = LOG_COMPARE(lsnp, &LSN(pp));
+ CHECK_ABORT(env, op, cmp, &LSN(pp), lsnp);
+ if (cmp == 0) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pp);
+ if ((opflags & SPL_NRECS) == 0) {
+ size = BINTERNAL_SIZE(
+ GET_BINTERNAL(file_dbp,
+ pp, argp->pindx + 1)->len);
+
+ if ((ret = __db_ditem(dbc, pp,
+ argp->pindx + 1, size)) != 0)
+ goto out;
+ }
+ pp->lsn = argp->plsn;
+ }
+ }
+
+ /*
+ * Finally, undo the next-page link if necessary. This is of
+ * interest only if it wasn't a root split -- inserting a new
+ * page in the tree requires that any following page have its
+ * previous-page pointer updated to our new page. Since it's
+ * possible that the next-page never existed, we ignore it as
+ * if there's nothing to undo.
+ */
+ if (!rootsplit && argp->npgno != PGNO_INVALID) {
+ if ((ret = __memp_fget(mpf, &argp->npgno,
+ ip, NULL, DB_MPOOL_EDIT, &np)) != 0) {
+ np = NULL;
+ goto done;
+ }
+ if (LOG_COMPARE(lsnp, &LSN(np)) == 0) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &np);
+ PREV_PGNO(np) = argp->left;
+ np->lsn = argp->nlsn;
+ }
+ }
+ }
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: /* Free any pages that are left. */
+ if (lp != NULL && (t_ret = __memp_fput(mpf,
+ ip, lp, file_dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (np != NULL && (t_ret = __memp_fput(mpf,
+ ip, np, file_dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (rp != NULL && (t_ret = __memp_fput(mpf,
+ ip, rp, file_dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (pp != NULL && (t_ret = __memp_fput(mpf,
+ ip, pp, file_dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Free any allocated space. */
+ if (_lp != NULL)
+ __os_free(env, _lp);
+ if (_rp != NULL)
+ __os_free(env, _rp);
+ if (sp != NULL)
+ __os_free(env, sp);
+
+ REC_CLOSE;
+}
+
+/*
+ * __bam_split_48_recover --
+ * Recovery function for split.
+ *
+ * PUBLIC: int __bam_split_48_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_split_48_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_split_48_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_LSN *plsnp;
+ DB_MPOOLFILE *mpf;
+ PAGE *_lp, *lp, *np, *pp, *_rp, *rp, *sp;
+ db_pgno_t pgno, parent_pgno;
+ u_int32_t ptype, size;
+ int cmp, l_update, p_update, r_update, ret, rootsplit, t_ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ REC_PRINT(__bam_split_print);
+
+ _lp = lp = np = pp = _rp = rp = NULL;
+ sp = NULL;
+
+ REC_INTRO(__bam_split_48_read, ip, 0);
+
+ if ((ret = __db_cursor_int(file_dbp, ip, NULL,
+ (argp->opflags & SPL_RECNO) ? DB_RECNO : DB_BTREE,
+ PGNO_INVALID, DB_RECOVER, NULL, &dbc)) != 0)
+ goto out;
+ if (argp->opflags & SPL_NRECS)
+ F_SET((BTREE_CURSOR *)dbc->internal, C_RECNUM);
+
+ /*
+ * There are two kinds of splits that we have to recover from. The
+ * first is a root-page split, where the root page is split from a
+ * leaf page into an internal page and two new leaf pages are created.
+ * The second is where a page is split into two pages, and a new key
+ * is inserted into the parent page.
+ *
+ * DBTs are not aligned in log records, so we need to copy the page
+ * so that we can access fields within it throughout this routine.
+ * Although we could hardcode the unaligned copies in this routine,
+ * we will be calling into regular btree functions with this page,
+ * so it's got to be aligned. Copying it into allocated memory is
+ * the only way to guarantee this.
+ */
+ if ((ret = __os_malloc(env, argp->pg.size, &sp)) != 0)
+ goto out;
+ memcpy(sp, argp->pg.data, argp->pg.size);
+
+ pgno = PGNO(sp);
+ parent_pgno = argp->ppgno;
+ rootsplit = parent_pgno == pgno;
+
+ /* Get the pages going down the tree. */
+ REC_FGET(mpf, ip, parent_pgno, &pp, left);
+left: REC_FGET(mpf, ip, argp->left, &lp, right);
+right: REC_FGET(mpf, ip, argp->right, &rp, redo);
+
+redo: if (DB_REDO(op)) {
+ l_update = r_update = p_update = 0;
+ /*
+ * Decide if we need to resplit the page.
+ *
+ * If this is a root split, then the root has to exist unless
+ * we have truncated it due to a future deallocation.
+ */
+ if (pp != NULL) {
+ if (rootsplit)
+ plsnp = &LSN(argp->pg.data);
+ else
+ plsnp = &argp->plsn;
+ cmp = LOG_COMPARE(&LSN(pp), plsnp);
+ CHECK_LSN(env, op, cmp, &LSN(pp), plsnp);
+ if (cmp == 0)
+ p_update = 1;
+ }
+
+ if (lp != NULL) {
+ cmp = LOG_COMPARE(&LSN(lp), &argp->llsn);
+ CHECK_LSN(env, op, cmp, &LSN(lp), &argp->llsn);
+ if (cmp == 0)
+ l_update = 1;
+ }
+
+ if (rp != NULL) {
+ cmp = LOG_COMPARE(&LSN(rp), &argp->rlsn);
+ CHECK_LSN(env, op, cmp, &LSN(rp), &argp->rlsn);
+ if (cmp == 0)
+ r_update = 1;
+ }
+
+ if (!p_update && !l_update && !r_update)
+ goto check_next;
+
+ /* Allocate and initialize new left/right child pages. */
+ if ((ret = __os_malloc(env, file_dbp->pgsize, &_lp)) != 0 ||
+ (ret = __os_malloc(env, file_dbp->pgsize, &_rp)) != 0)
+ goto out;
+ if (rootsplit) {
+ P_INIT(_lp, file_dbp->pgsize, argp->left,
+ PGNO_INVALID,
+ ISINTERNAL(sp) ? PGNO_INVALID : argp->right,
+ LEVEL(sp), TYPE(sp));
+ P_INIT(_rp, file_dbp->pgsize, argp->right,
+ ISINTERNAL(sp) ? PGNO_INVALID : argp->left,
+ PGNO_INVALID, LEVEL(sp), TYPE(sp));
+ } else {
+ P_INIT(_lp, file_dbp->pgsize, PGNO(sp),
+ ISINTERNAL(sp) ? PGNO_INVALID : PREV_PGNO(sp),
+ ISINTERNAL(sp) ? PGNO_INVALID : argp->right,
+ LEVEL(sp), TYPE(sp));
+ P_INIT(_rp, file_dbp->pgsize, argp->right,
+ ISINTERNAL(sp) ? PGNO_INVALID : sp->pgno,
+ ISINTERNAL(sp) ? PGNO_INVALID : NEXT_PGNO(sp),
+ LEVEL(sp), TYPE(sp));
+ }
+
+ /* Split the page. */
+ if ((ret = __bam_copy(file_dbp, sp, _lp, 0, argp->indx)) != 0 ||
+ (ret = __bam_copy(file_dbp, sp, _rp, argp->indx,
+ NUM_ENT(sp))) != 0)
+ goto out;
+
+ if (l_update) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &lp);
+ memcpy(lp, _lp, file_dbp->pgsize);
+ lp->lsn = *lsnp;
+ }
+
+ if (r_update) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &rp);
+ memcpy(rp, _rp, file_dbp->pgsize);
+ rp->lsn = *lsnp;
+ }
+
+ /*
+ * Drop the latches on the lower level pages before
+ * getting an exclusive latch on the higher level page.
+ */
+		if (lp != NULL && (ret = __memp_fput(mpf,
+		    ip, lp, file_dbp->priority)) != 0)
+			goto out;
+		lp = NULL;
+		if (rp != NULL && (ret = __memp_fput(mpf,
+		    ip, rp, file_dbp->priority)) != 0)
+			goto out;
+ rp = NULL;
+ /*
+ * If the parent page is wrong, update it.
+ * Initialize the page. If it is a root page update
+ * the record counts if needed and put the first record in.
+ * Then insert the record for the right hand child page.
+ */
+ if (p_update) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pp);
+ if (argp->opflags & SPL_RECNO)
+ ptype = P_IRECNO;
+ else
+ ptype = P_IBTREE;
+
+ if (rootsplit) {
+ P_INIT(pp, file_dbp->pgsize, pgno, PGNO_INVALID,
+ PGNO_INVALID, _lp->level + 1, ptype);
+ if (argp->opflags & SPL_NRECS) {
+ RE_NREC_SET(pp,
+ __bam_total(file_dbp, _lp) +
+ __bam_total(file_dbp, _rp));
+ }
+ if ((ret = __db_pitem_nolog(dbc, pp,
+ argp->pindx, argp->pentry.size,
+ &argp->pentry, NULL)) != 0)
+ goto out;
+
+ }
+ if ((ret = __db_pitem_nolog(dbc, pp, argp->pindx + 1,
+ argp->rentry.size, &argp->rentry, NULL)) != 0)
+ goto out;
+ pp->lsn = *lsnp;
+ }
+
+check_next: /*
+ * Finally, redo the next-page link if necessary. This is of
+ * interest only if it wasn't a root split -- inserting a new
+ * page in the tree requires that any following page have its
+ * previous-page pointer updated to our new page. The next
+ * page must exist because we're redoing the operation.
+ */
+ if (!rootsplit && argp->npgno != PGNO_INVALID) {
+ REC_FGET(mpf, ip, argp->npgno, &np, done);
+ cmp = LOG_COMPARE(&LSN(np), &argp->nlsn);
+ CHECK_LSN(env, op, cmp, &LSN(np), &argp->nlsn);
+ if (cmp == 0) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &np);
+ PREV_PGNO(np) = argp->right;
+ np->lsn = *lsnp;
+ }
+ }
+ } else {
+ /*
+ * If it's a root split and the left child ever existed, update
+		 * its LSN; otherwise it's the split page.  If the right
+		 * child ever existed, root split or not, update its LSN.
+ * The undo of the page allocation(s) will restore them to the
+ * free list.
+ */
+ if (rootsplit && lp != NULL &&
+ LOG_COMPARE(lsnp, &LSN(lp)) == 0) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &lp);
+ lp->lsn = argp->llsn;
+ }
+ if (rp != NULL &&
+ LOG_COMPARE(lsnp, &LSN(rp)) == 0) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &rp);
+ rp->lsn = argp->rlsn;
+ }
+ /*
+ * Drop the lower level pages before getting an exclusive
+ * latch on the parent.
+ */
+		if (rp != NULL && (ret = __memp_fput(mpf,
+		    ip, rp, file_dbp->priority)) != 0)
+			goto out;
+ rp = NULL;
+
+ /*
+		 * Check the state of the split page.  If it's a root split
+		 * then that's the root page; otherwise it's the left page.
+ */
+ if (rootsplit) {
+ DB_ASSERT(env, pgno == argp->ppgno);
+ if (lp != NULL && (ret = __memp_fput(mpf, ip,
+ lp, file_dbp->priority)) != 0)
+ goto out;
+ lp = pp;
+ pp = NULL;
+ }
+ if (lp != NULL) {
+ cmp = LOG_COMPARE(lsnp, &LSN(lp));
+ CHECK_ABORT(env, op, cmp, &LSN(lp), lsnp);
+ if (cmp == 0) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &lp);
+ memcpy(lp, argp->pg.data, argp->pg.size);
+				if ((ret = __memp_fput(mpf,
+				    ip, lp, file_dbp->priority)) != 0)
+ goto out;
+ lp = NULL;
+ }
+ }
+
+ /*
+		 * Next we can update the parent, removing the new index.
+ */
+ if (pp != NULL) {
+ DB_ASSERT(env, !rootsplit);
+ cmp = LOG_COMPARE(lsnp, &LSN(pp));
+ CHECK_ABORT(env, op, cmp, &LSN(pp), lsnp);
+ if (cmp == 0) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pp);
+ if (argp->opflags & SPL_RECNO)
+ size = RINTERNAL_SIZE;
+ else
+ size = BINTERNAL_SIZE(
+ GET_BINTERNAL(file_dbp,
+ pp, argp->pindx + 1)->len);
+
+ if ((ret = __db_ditem(dbc, pp,
+ argp->pindx + 1, size)) != 0)
+ goto out;
+ pp->lsn = argp->plsn;
+ }
+ }
+
+ /*
+ * Finally, undo the next-page link if necessary. This is of
+ * interest only if it wasn't a root split -- inserting a new
+ * page in the tree requires that any following page have its
+ * previous-page pointer updated to our new page. Since it's
+ * possible that the next-page never existed, we ignore it as
+ * if there's nothing to undo.
+ */
+ if (!rootsplit && argp->npgno != PGNO_INVALID) {
+ if ((ret = __memp_fget(mpf, &argp->npgno,
+ ip, NULL, DB_MPOOL_EDIT, &np)) != 0) {
+ np = NULL;
+ goto done;
+ }
+ if (LOG_COMPARE(lsnp, &LSN(np)) == 0) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &np);
+ PREV_PGNO(np) = argp->left;
+ np->lsn = argp->nlsn;
+ }
+ }
+ }
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: /* Free any pages that are left. */
+ if (lp != NULL && (t_ret = __memp_fput(mpf,
+ ip, lp, file_dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (np != NULL && (t_ret = __memp_fput(mpf,
+ ip, np, file_dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (rp != NULL && (t_ret = __memp_fput(mpf,
+ ip, rp, file_dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (pp != NULL && (t_ret = __memp_fput(mpf,
+ ip, pp, file_dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Free any allocated space. */
+ if (_lp != NULL)
+ __os_free(env, _lp);
+ if (_rp != NULL)
+ __os_free(env, _rp);
+ if (sp != NULL)
+ __os_free(env, sp);
+
+ REC_CLOSE;
+}
+
+/*
+ * __bam_split_42_recover --
+ * Recovery function for split.
+ *
+ * PUBLIC: int __bam_split_42_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_split_42_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_split_42_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *_lp, *lp, *np, *pp, *_rp, *rp, *sp;
+ db_pgno_t pgno, root_pgno;
+ u_int32_t ptype;
+ int cmp, l_update, p_update, r_update, rc, ret, rootsplit, t_ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ REC_PRINT(__bam_split_print);
+
+ _lp = lp = np = pp = _rp = rp = NULL;
+ sp = NULL;
+
+ REC_INTRO(__bam_split_42_read, ip, 0);
+
+ /*
+ * There are two kinds of splits that we have to recover from. The
+ * first is a root-page split, where the root page is split from a
+ * leaf page into an internal page and two new leaf pages are created.
+ * The second is where a page is split into two pages, and a new key
+ * is inserted into the parent page.
+ *
+ * DBTs are not aligned in log records, so we need to copy the page
+ * so that we can access fields within it throughout this routine.
+ * Although we could hardcode the unaligned copies in this routine,
+ * we will be calling into regular btree functions with this page,
+ * so it's got to be aligned. Copying it into allocated memory is
+ * the only way to guarantee this.
+ */
+ if ((ret = __os_malloc(env, argp->pg.size, &sp)) != 0)
+ goto out;
+ memcpy(sp, argp->pg.data, argp->pg.size);
+
+ pgno = PGNO(sp);
+ root_pgno = argp->root_pgno;
+ rootsplit = root_pgno != PGNO_INVALID;
+ REC_FGET(mpf, ip, argp->left, &lp, right);
+right: REC_FGET(mpf, ip, argp->right, &rp, redo);
+
+redo: if (DB_REDO(op)) {
+ l_update = r_update = p_update = 0;
+ /*
+ * Decide if we need to resplit the page.
+ *
+ * If this is a root split, then the root has to exist unless
+ * we have truncated it due to a future deallocation.
+ */
+ if (rootsplit) {
+ REC_FGET(mpf, ip, root_pgno, &pp, do_left);
+ cmp = LOG_COMPARE(&LSN(pp), &LSN(argp->pg.data));
+ CHECK_LSN(env, op,
+ cmp, &LSN(pp), &LSN(argp->pg.data));
+ p_update = cmp == 0;
+ }
+
+do_left: if (lp != NULL) {
+ cmp = LOG_COMPARE(&LSN(lp), &argp->llsn);
+ CHECK_LSN(env, op, cmp, &LSN(lp), &argp->llsn);
+ if (cmp == 0)
+ l_update = 1;
+ }
+
+ if (rp != NULL) {
+ cmp = LOG_COMPARE(&LSN(rp), &argp->rlsn);
+ CHECK_LSN(env, op, cmp, &LSN(rp), &argp->rlsn);
+ if (cmp == 0)
+ r_update = 1;
+ }
+
+ if (!p_update && !l_update && !r_update)
+ goto check_next;
+
+ /* Allocate and initialize new left/right child pages. */
+ if ((ret = __os_malloc(env, file_dbp->pgsize, &_lp)) != 0 ||
+ (ret = __os_malloc(env, file_dbp->pgsize, &_rp)) != 0)
+ goto out;
+ if (rootsplit) {
+ P_INIT(_lp, file_dbp->pgsize, argp->left,
+ PGNO_INVALID,
+ ISINTERNAL(sp) ? PGNO_INVALID : argp->right,
+ LEVEL(sp), TYPE(sp));
+ P_INIT(_rp, file_dbp->pgsize, argp->right,
+ ISINTERNAL(sp) ? PGNO_INVALID : argp->left,
+ PGNO_INVALID, LEVEL(sp), TYPE(sp));
+ } else {
+ P_INIT(_lp, file_dbp->pgsize, PGNO(sp),
+ ISINTERNAL(sp) ? PGNO_INVALID : PREV_PGNO(sp),
+ ISINTERNAL(sp) ? PGNO_INVALID : argp->right,
+ LEVEL(sp), TYPE(sp));
+ P_INIT(_rp, file_dbp->pgsize, argp->right,
+ ISINTERNAL(sp) ? PGNO_INVALID : sp->pgno,
+ ISINTERNAL(sp) ? PGNO_INVALID : NEXT_PGNO(sp),
+ LEVEL(sp), TYPE(sp));
+ }
+
+ /* Split the page. */
+ if ((ret = __bam_copy(file_dbp, sp, _lp, 0, argp->indx)) != 0 ||
+ (ret = __bam_copy(file_dbp, sp, _rp, argp->indx,
+ NUM_ENT(sp))) != 0)
+ goto out;
+
+ if (l_update) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &lp);
+ memcpy(lp, _lp, file_dbp->pgsize);
+ lp->lsn = *lsnp;
+ if ((ret = __memp_fput(mpf,
+ ip, lp, file_dbp->priority)) != 0)
+ goto out;
+ lp = NULL;
+ }
+
+ if (r_update) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &rp);
+ memcpy(rp, _rp, file_dbp->pgsize);
+ rp->lsn = *lsnp;
+ if ((ret = __memp_fput(mpf,
+ ip, rp, file_dbp->priority)) != 0)
+ goto out;
+ rp = NULL;
+ }
+
+ /*
+ * If the parent page is wrong, update it. This is of interest
+ * only if it was a root split, since root splits create parent
+ * pages. All other splits modify a parent page, but those are
+ * separately logged and recovered.
+ */
+ if (rootsplit && p_update) {
+ if (IS_BTREE_PAGE(sp)) {
+ ptype = P_IBTREE;
+ rc = argp->opflags & SPL_NRECS ? 1 : 0;
+ } else {
+ ptype = P_IRECNO;
+ rc = 1;
+ }
+
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pp);
+ P_INIT(pp, file_dbp->pgsize, root_pgno,
+ PGNO_INVALID, PGNO_INVALID, _lp->level + 1, ptype);
+ RE_NREC_SET(pp, rc ? __bam_total(file_dbp, _lp) +
+ __bam_total(file_dbp, _rp) : 0);
+
+ pp->lsn = *lsnp;
+ if ((ret = __memp_fput(mpf,
+ ip, pp, file_dbp->priority)) != 0)
+ goto out;
+ pp = NULL;
+ }
+
+check_next: /*
+ * Finally, redo the next-page link if necessary. This is of
+ * interest only if it wasn't a root split -- inserting a new
+ * page in the tree requires that any following page have its
+ * previous-page pointer updated to our new page. The next
+ * page must exist because we're redoing the operation.
+ */
+ if (!rootsplit && argp->npgno != PGNO_INVALID) {
+ if ((ret = __memp_fget(mpf, &argp->npgno,
+ ip, NULL, 0, &np)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+ ret = __db_pgerr(
+ file_dbp, argp->npgno, ret);
+ goto out;
+ } else
+ goto done;
+ }
+ cmp = LOG_COMPARE(&LSN(np), &argp->nlsn);
+ CHECK_LSN(env, op, cmp, &LSN(np), &argp->nlsn);
+ if (cmp == 0) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &np);
+ PREV_PGNO(np) = argp->right;
+ np->lsn = *lsnp;
+ if ((ret = __memp_fput(mpf, ip,
+ np, file_dbp->priority)) != 0)
+ goto out;
+ np = NULL;
+ }
+ }
+ } else {
+ /*
+ * If the split page is wrong, replace its contents with the
+ * logged page contents. If the page doesn't exist, it means
+ * that the create of the page never happened, nor did any of
+ * the adds onto the page that caused the split, and there's
+ * really no undo-ing to be done.
+ */
+ if ((ret = __memp_fget(mpf, &pgno, ip, NULL,
+ DB_MPOOL_EDIT, &pp)) != 0) {
+ pp = NULL;
+ goto lrundo;
+ }
+ if (LOG_COMPARE(lsnp, &LSN(pp)) == 0) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pp);
+ memcpy(pp, argp->pg.data, argp->pg.size);
+ if ((ret = __memp_fput(mpf,
+ ip, pp, file_dbp->priority)) != 0)
+ goto out;
+ pp = NULL;
+ }
+
+ /*
+ * If it's a root split and the left child ever existed, update
+ * its LSN. (If it's not a root split, we've updated the left
+ * page already -- it's the same as the split page.) If the
+ * right child ever existed, root split or not, update its LSN.
+ * The undo of the page allocation(s) will restore them to the
+ * free list.
+ */
+lrundo: if ((rootsplit && lp != NULL) || rp != NULL) {
+ if (rootsplit && lp != NULL &&
+ LOG_COMPARE(lsnp, &LSN(lp)) == 0) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &lp);
+ lp->lsn = argp->llsn;
+ if ((ret = __memp_fput(mpf, ip,
+ lp, file_dbp->priority)) != 0)
+ goto out;
+ lp = NULL;
+ }
+ if (rp != NULL &&
+ LOG_COMPARE(lsnp, &LSN(rp)) == 0) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &rp);
+ rp->lsn = argp->rlsn;
+ if ((ret = __memp_fput(mpf, ip,
+ rp, file_dbp->priority)) != 0)
+ goto out;
+ rp = NULL;
+ }
+ }
+
+ /*
+ * Finally, undo the next-page link if necessary. This is of
+ * interest only if it wasn't a root split -- inserting a new
+ * page in the tree requires that any following page have its
+ * previous-page pointer updated to our new page. Since it's
+ * possible that the next-page never existed, we ignore it as
+ * if there's nothing to undo.
+ */
+ if (!rootsplit && argp->npgno != PGNO_INVALID) {
+ if ((ret = __memp_fget(mpf, &argp->npgno,
+ ip, NULL, DB_MPOOL_EDIT, &np)) != 0) {
+ np = NULL;
+ goto done;
+ }
+ if (LOG_COMPARE(lsnp, &LSN(np)) == 0) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &np);
+ PREV_PGNO(np) = argp->left;
+ np->lsn = argp->nlsn;
+				if ((ret = __memp_fput(mpf,
+				    ip, np, file_dbp->priority)) != 0)
+ goto out;
+ np = NULL;
+ }
+ }
+ }
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: /* Free any pages that weren't dirtied. */
+ if (pp != NULL && (t_ret = __memp_fput(mpf,
+ ip, pp, file_dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (lp != NULL && (t_ret = __memp_fput(mpf,
+ ip, lp, file_dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (np != NULL && (t_ret = __memp_fput(mpf,
+ ip, np, file_dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (rp != NULL && (t_ret = __memp_fput(mpf,
+ ip, rp, file_dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Free any allocated space. */
+ if (_lp != NULL)
+ __os_free(env, _lp);
+ if (_rp != NULL)
+ __os_free(env, _rp);
+ if (sp != NULL)
+ __os_free(env, sp);
+
+ REC_CLOSE;
+}
+
+/*
+ * __bam_rsplit_recover --
+ * Recovery function for a reverse split.
+ *
+ * PUBLIC: int __bam_rsplit_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_rsplit_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_rsplit_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_LSN copy_lsn;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ db_pgno_t pgno, root_pgno;
+ db_recno_t rcnt;
+ int cmp_n, cmp_p, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__bam_rsplit_print);
+ REC_INTRO(__bam_rsplit_read, ip, 1);
+
+ /* Fix the root page. */
+ pgno = root_pgno = argp->root_pgno;
+ if ((ret = __memp_fget(mpf, &pgno, ip, NULL, 0, &pagep)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+ ret = __db_pgerr(file_dbp, pgno, ret);
+ goto out;
+ } else
+ goto do_page;
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->rootlsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->rootlsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /*
+ * Copy the new data to the root page. If it is not now a
+ * leaf page we need to restore the record number. We could
+ * try to determine if C_RECNUM was set in the btree, but
+ * that's not really necessary since the field is not used
+ * otherwise.
+ */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ rcnt = RE_NREC(pagep);
+ memcpy(pagep, argp->pgdbt.data, argp->pgdbt.size);
+ if (LEVEL(pagep) > LEAFLEVEL)
+ RE_NREC_SET(pagep, rcnt);
+ pagep->pgno = root_pgno;
+ pagep->lsn = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Need to undo update described. */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ P_INIT(pagep, file_dbp->pgsize, root_pgno,
+ argp->nrec, PGNO_INVALID, pagep->level + 1,
+ IS_BTREE_PAGE(pagep) ? P_IBTREE : P_IRECNO);
+ if ((ret = __db_pitem(dbc, pagep, 0,
+ argp->rootent.size, &argp->rootent, NULL)) != 0)
+ goto out;
+ pagep->lsn = argp->rootlsn;
+ }
+ if ((ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+do_page:
+ /*
+ * Fix the page copied over the root page. It's possible that the
+ * page never made it to disk, or was truncated so if the page
+ * doesn't exist, it's okay and there's nothing further to do.
+ */
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ } else
+ goto done;
+ }
+ (void)__ua_memcpy(&copy_lsn, &LSN(argp->pgdbt.data), sizeof(DB_LSN));
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &copy_lsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &copy_lsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Need to redo update described. */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ pagep->lsn = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Need to undo update described. */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ memcpy(pagep, argp->pgdbt.data, argp->pgdbt.size);
+ }
+ if ((ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, dbc->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __bam_adj_recover --
+ * Recovery function for adj.
+ *
+ * PUBLIC: int __bam_adj_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_adj_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_adj_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ int cmp_n, cmp_p, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__bam_adj_print);
+ REC_INTRO(__bam_adj_read, ip, 1);
+
+ /* Get the page; if it never existed and we're undoing, we're done. */
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ } else
+ goto done;
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Need to redo update described. */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ if ((ret = __bam_adjindx(dbc,
+ pagep, argp->indx, argp->indx_copy, argp->is_insert)) != 0)
+ goto out;
+
+ LSN(pagep) = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Need to undo update described. */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ if ((ret = __bam_adjindx(dbc,
+ pagep, argp->indx, argp->indx_copy, !argp->is_insert)) != 0)
+ goto out;
+
+ LSN(pagep) = argp->lsn;
+ }
+ if ((ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, dbc->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __bam_cadjust_recover --
+ * Recovery function for the adjust of a count change in an internal
+ * page.
+ *
+ * PUBLIC: int __bam_cadjust_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_cadjust_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_cadjust_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ int cmp_n, cmp_p, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__bam_cadjust_print);
+ REC_INTRO(__bam_cadjust_read, ip, 0);
+
+ /* Get the page; if it never existed and we're undoing, we're done. */
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ } else
+ goto done;
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Need to redo update described. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ if (IS_BTREE_PAGE(pagep)) {
+ GET_BINTERNAL(file_dbp, pagep, argp->indx)->nrecs +=
+ argp->adjust;
+ if (argp->opflags & CAD_UPDATEROOT)
+ RE_NREC_ADJ(pagep, argp->adjust);
+ } else {
+ GET_RINTERNAL(file_dbp, pagep, argp->indx)->nrecs +=
+ argp->adjust;
+ if (argp->opflags & CAD_UPDATEROOT)
+ RE_NREC_ADJ(pagep, argp->adjust);
+ }
+
+ LSN(pagep) = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Need to undo update described. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ if (IS_BTREE_PAGE(pagep)) {
+ GET_BINTERNAL(file_dbp, pagep, argp->indx)->nrecs -=
+ argp->adjust;
+ if (argp->opflags & CAD_UPDATEROOT)
+ RE_NREC_ADJ(pagep, -(argp->adjust));
+ } else {
+ GET_RINTERNAL(file_dbp, pagep, argp->indx)->nrecs -=
+ argp->adjust;
+ if (argp->opflags & CAD_UPDATEROOT)
+ RE_NREC_ADJ(pagep, -(argp->adjust));
+ }
+ LSN(pagep) = argp->lsn;
+ }
+ if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __bam_cdel_recover --
+ * Recovery function for the intent-to-delete of a cursor record.
+ *
+ * PUBLIC: int __bam_cdel_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_cdel_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_cdel_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ u_int32_t indx;
+ int cmp_n, cmp_p, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__bam_cdel_print);
+ REC_INTRO(__bam_cdel_read, ip, 0);
+
+ /* Get the page; if it never existed and we're undoing, we're done. */
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ } else
+ goto done;
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Need to redo update described. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
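+		/*
+		 * On a P_LBTREE leaf, entries are key/data pairs and the
+		 * delete flag lives on the data item, one slot (O_INDX)
+		 * past the key; other page types store single items, so
+		 * the logged index is used unchanged.
+		 */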
+ indx = argp->indx + (TYPE(pagep) == P_LBTREE ? O_INDX : 0);
+ B_DSET(GET_BKEYDATA(file_dbp, pagep, indx)->type);
+
+ LSN(pagep) = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Need to undo update described. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ indx = argp->indx + (TYPE(pagep) == P_LBTREE ? O_INDX : 0);
+ B_DCLR(GET_BKEYDATA(file_dbp, pagep, indx)->type);
+
+ if ((ret = __bam_ca_delete(
+ file_dbp, argp->pgno, argp->indx, 0, NULL)) != 0)
+ goto out;
+
+ LSN(pagep) = argp->lsn;
+ }
+ if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __bam_repl_recover --
+ * Recovery function for page item replacement.
+ *
+ * PUBLIC: int __bam_repl_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_repl_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_repl_args *argp;
+ DB_THREAD_INFO *ip;
+ BKEYDATA *bk;
+ DB *file_dbp;
+ DBC *dbc;
+ DBT dbt;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ int cmp_n, cmp_p, ret;
+ u_int32_t len;
+ u_int8_t *dp, *p;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__bam_repl_print);
+ REC_INTRO(__bam_repl_read, ip, 1);
+
+ /* Get the page; if it never existed and we're undoing, we're done. */
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ } else
+ goto done;
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /*
+ * Need to redo update described.
+ *
+ * Re-build the replacement item.
+ */
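+		/*
+		 * For illustration, with hypothetical values: given an
+		 * original item "abcdef", prefix 2, suffix 1 and a
+		 * replacement "XY", the rebuilt item is "ab" "XY" "f" and
+		 * dbt.size is 2 + 1 + 2 == 5.
+		 */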
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ bk = GET_BKEYDATA(file_dbp, pagep, argp->indx);
+ dp = bk->data;
+ len = bk->len;
+ memset(&dbt, 0, sizeof(dbt));
+ dbt.size = argp->prefix + argp->suffix + argp->repl.size;
+ if ((ret = __os_malloc(env, dbt.size, &dbt.data)) != 0)
+ goto out;
+ p = dbt.data;
+ memcpy(p, dp, argp->prefix);
+ p += argp->prefix;
+ memcpy(p, argp->repl.data, argp->repl.size);
+ p += argp->repl.size;
+ memcpy(p, dp + (len - argp->suffix), argp->suffix);
+
+ ret = __bam_ritem(dbc, pagep, argp->indx, &dbt, 0);
+ __os_free(env, dbt.data);
+ if (ret != 0)
+ goto out;
+
+ LSN(pagep) = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /*
+ * Need to undo update described.
+ *
+ * Re-build the original item.
+ */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ bk = GET_BKEYDATA(file_dbp, pagep, argp->indx);
+ dp = bk->data;
+ len = bk->len;
+ memset(&dbt, 0, sizeof(dbt));
+ dbt.size = argp->prefix + argp->suffix + argp->orig.size;
+ if ((ret = __os_malloc(env, dbt.size, &dbt.data)) != 0)
+ goto out;
+ p = dbt.data;
+ memcpy(p, dp, argp->prefix);
+ p += argp->prefix;
+ memcpy(p, argp->orig.data, argp->orig.size);
+ p += argp->orig.size;
+ memcpy(p, dp + (len - argp->suffix), argp->suffix);
+
+ ret = __bam_ritem(dbc, pagep, argp->indx, &dbt, 0);
+ __os_free(env, dbt.data);
+ if (ret != 0)
+ goto out;
+
+ /* Reset the deleted flag, if necessary. */
+ if (argp->isdeleted)
+ B_DSET(GET_BKEYDATA(file_dbp, pagep, argp->indx)->type);
+
+ LSN(pagep) = argp->lsn;
+ }
+ if ((ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, dbc->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __bam_irep_recover --
+ * Recovery function for internal page item replacement.
+ *
+ * PUBLIC: int __bam_irep_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_irep_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_irep_args *argp;
+ BINTERNAL *bn;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ int cmp_n, cmp_p, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__bam_irep_print);
+ REC_INTRO(__bam_irep_read, ip, 1);
+
+ /* Get the page; if it never existed and we're undoing, we're done. */
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ } else
+ goto done;
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ bn = (BINTERNAL *)argp->hdr.data;
+ if ((ret = __bam_ritem_nolog(dbc,
+ pagep, argp->indx, &argp->hdr, &argp->data, bn->type)) != 0)
+ goto out;
+ LSN(pagep) = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ bn = (BINTERNAL *)argp->old.data;
+ if ((ret = __bam_ritem_nolog(dbc,
+ pagep, argp->indx, &argp->old, NULL, bn->type)) != 0)
+ goto out;
+ LSN(pagep) = argp->lsn;
+ }
+
+ if ((ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, dbc->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __bam_root_recover --
+ * Recovery function for setting the root page on the meta-data page.
+ *
+ * PUBLIC: int __bam_root_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_root_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_root_args *argp;
+ DB_THREAD_INFO *ip;
+ BTMETA *meta;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ int cmp_n, cmp_p, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ meta = NULL;
+ REC_PRINT(__bam_root_print);
+ REC_INTRO(__bam_root_read, ip, 0);
+
+ if ((ret = __memp_fget(mpf, &argp->meta_pgno, ip, NULL,
+ 0, &meta)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+ ret = __db_pgerr(file_dbp, argp->meta_pgno, ret);
+ goto out;
+ } else
+ goto done;
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(meta));
+ cmp_p = LOG_COMPARE(&LSN(meta), &argp->meta_lsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(meta), &argp->meta_lsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(meta), lsnp);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Need to redo update described. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &meta);
+ meta->root = argp->root_pgno;
+ meta->dbmeta.lsn = *lsnp;
+ ((BTREE *)file_dbp->bt_internal)->bt_root = meta->root;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Nothing to undo except lsn. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &meta);
+ meta->dbmeta.lsn = argp->meta_lsn;
+ }
+ if ((ret = __memp_fput(mpf, ip, meta, file_dbp->priority)) != 0)
+ goto out;
+ meta = NULL;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (meta != NULL)
+ (void)__memp_fput(mpf, ip, meta, file_dbp->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __bam_curadj_recover --
+ * Transaction abort function to undo cursor adjustments.
+ * This should only be triggered by subtransaction aborts.
+ *
+ * PUBLIC: int __bam_curadj_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_curadj_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_curadj_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ int ret;
+
+ COMPQUIET(mpf, NULL);
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ REC_PRINT(__bam_curadj_print);
+ REC_INTRO(__bam_curadj_read, ip, 1);
+
+ ret = 0;
+ if (op != DB_TXN_ABORT)
+ goto done;
+
+ switch (argp->mode) {
+ case DB_CA_DI:
+ if ((ret = __bam_ca_di(dbc, argp->from_pgno,
+ argp->from_indx, -(int)argp->first_indx)) != 0)
+ goto out;
+ break;
+ case DB_CA_DUP:
+ if ((ret = __bam_ca_undodup(file_dbp, argp->first_indx,
+ argp->from_pgno, argp->from_indx, argp->to_indx)) != 0)
+ goto out;
+ break;
+
+ case DB_CA_RSPLIT:
+ if ((ret =
+ __bam_ca_rsplit(dbc, argp->to_pgno, argp->from_pgno)) != 0)
+ goto out;
+ break;
+
+ case DB_CA_SPLIT:
+ if ((ret = __bam_ca_undosplit(file_dbp, argp->from_pgno,
+ argp->to_pgno, argp->left_pgno, argp->from_indx)) != 0)
+ goto out;
+ break;
+ }
+
+done: *lsnp = argp->prev_lsn;
+out: REC_CLOSE;
+}
+
+/*
+ * __bam_rcuradj_recover --
+ * Transaction abort function to undo cursor adjustments in rrecno.
+ * This should only be triggered by subtransaction aborts.
+ *
+ * PUBLIC: int __bam_rcuradj_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_rcuradj_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_rcuradj_args *argp;
+ DB_THREAD_INFO *ip;
+ BTREE_CURSOR *cp;
+ DB *file_dbp;
+ DBC *dbc, *rdbc;
+ DB_MPOOLFILE *mpf;
+ int ret, t_ret;
+
+ COMPQUIET(mpf, NULL);
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ rdbc = NULL;
+ REC_PRINT(__bam_rcuradj_print);
+ REC_INTRO(__bam_rcuradj_read, ip, 1);
+
+ ret = t_ret = 0;
+
+ if (op != DB_TXN_ABORT)
+ goto done;
+
+ /*
+	 * We don't know whether we're in an offpage dup set, and thus
+	 * don't know whether the dbc REC_INTRO handed us is of a
+	 * reasonable type.  It's certainly unset, so if this is an
+	 * offpage dup set, we don't have an OPD cursor.  The simplest
+	 * solution is just to allocate a whole new cursor for our use;
+	 * we're only really using it to pass some state into __ram_ca,
+	 * and this way we don't need to make this function know
+	 * anything about how offpage dups work.
+ */
+ if ((ret = __db_cursor_int(file_dbp, NULL,
+ NULL, DB_RECNO, argp->root, DB_RECOVER, NULL, &rdbc)) != 0)
+ goto out;
+
+ cp = (BTREE_CURSOR *)rdbc->internal;
+ F_SET(cp, C_RENUMBER);
+ cp->recno = argp->recno;
+
+ switch (argp->mode) {
+ case CA_DELETE:
+ /*
+ * The way to undo a delete is with an insert. Since
+ * we're undoing it, the delete flag must be set.
+ */
+ F_SET(cp, C_DELETED);
+ F_SET(cp, C_RENUMBER); /* Just in case. */
+ cp->order = argp->order;
+ if ((ret = __ram_ca(rdbc, CA_ICURRENT, NULL)) != 0)
+ goto out;
+ break;
+ case CA_IAFTER:
+ case CA_IBEFORE:
+ case CA_ICURRENT:
+ /*
+ * The way to undo an insert is with a delete. The delete
+ * flag is unset to start with.
+ */
+ F_CLR(cp, C_DELETED);
+ cp->order = INVALID_ORDER;
+ if ((ret = __ram_ca(rdbc, CA_DELETE, NULL)) != 0)
+ goto out;
+ break;
+ }
+
+done: *lsnp = argp->prev_lsn;
+out: if (rdbc != NULL && (t_ret = __dbc_close(rdbc)) != 0 && ret == 0)
+ ret = t_ret;
+ REC_CLOSE;
+}
+
+/*
+ * __bam_merge_44_recover --
+ * Recovery function for merge.
+ *
+ * PUBLIC: int __bam_merge_44_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_merge_44_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_merge_44_args *argp;
+ DB_THREAD_INFO *ip;
+ BKEYDATA *bk;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ db_indx_t indx, *ninp, *pinp;
+ u_int32_t size;
+ u_int8_t *bp;
+ int cmp_n, cmp_p, i, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ REC_PRINT(__bam_merge_44_print);
+ REC_INTRO(__bam_merge_44_read, ip, 1);
+
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ } else
+ goto next;
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn);
+ CHECK_LSN(file_dbp->env, op, cmp_p, &LSN(pagep), &argp->lsn);
+
+ if (cmp_p == 0 && DB_REDO(op)) {
+		/*
+		 * If a header is provided, the page is empty; initialize
+		 * it and copy over the needed data.
+		 */
+ DB_ASSERT(env, argp->hdr.size == 0 || NUM_ENT(pagep) == 0);
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ if (argp->hdr.size != 0) {
+ P_INIT(pagep, file_dbp->pgsize, pagep->pgno,
+ PREV_PGNO(argp->hdr.data),
+ NEXT_PGNO(argp->hdr.data),
+ LEVEL(argp->hdr.data), TYPE(argp->hdr.data));
+ }
+ if (TYPE(pagep) == P_OVERFLOW) {
+ OV_REF(pagep) = OV_REF(argp->hdr.data);
+ OV_LEN(pagep) = OV_LEN(argp->hdr.data);
+ bp = (u_int8_t *) pagep + P_OVERHEAD(file_dbp);
+ memcpy(bp, argp->data.data, argp->data.size);
+ } else {
+ /* Copy the data segment. */
+ bp = (u_int8_t *)pagep +
+ (db_indx_t)(HOFFSET(pagep) - argp->data.size);
+ memcpy(bp, argp->data.data, argp->data.size);
+
+ /* Copy index table offset past the current entries. */
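+			/*
+			 * With hypothetical numbers: on a 4096-byte page
+			 * whose HOFFSET is currently 3896, a logged offset
+			 * of 4000 is stored as 4000 - (4096 - 3896) ==
+			 * 3800, shifting the logged offsets to account for
+			 * the data already on this page.
+			 */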
+ pinp = P_INP(file_dbp, pagep) + NUM_ENT(pagep);
+ ninp = argp->ind.data;
+ for (i = 0;
+ i < (int)(argp->ind.size / sizeof(*ninp)); i++)
+ *pinp++ = *ninp++
+ - (file_dbp->pgsize - HOFFSET(pagep));
+ HOFFSET(pagep) -= argp->data.size;
+ NUM_ENT(pagep) += i;
+ }
+ pagep->lsn = *lsnp;
+ } else if (cmp_n == 0 && !DB_REDO(op)) {
+ /*
+ * Since logging is logical at the page level
+ * we cannot just truncate the data space. Delete
+ * the proper number of items from the logical end
+ * of the page.
+ */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ for (i = 0; i < (int)(argp->ind.size / sizeof(*ninp)); i++) {
+ indx = NUM_ENT(pagep) - 1;
+ if (P_INP(file_dbp, pagep)[indx] ==
+ P_INP(file_dbp, pagep)[indx - P_INDX]) {
+ NUM_ENT(pagep)--;
+ continue;
+ }
+ switch (TYPE(pagep)) {
+ case P_LBTREE:
+ case P_LRECNO:
+ case P_LDUP:
+ bk = GET_BKEYDATA(file_dbp, pagep, indx);
+ size = BITEM_SIZE(bk);
+ break;
+
+ case P_IBTREE:
+ size = BINTERNAL_SIZE(
+ GET_BINTERNAL(file_dbp, pagep, indx)->len);
+ break;
+ case P_IRECNO:
+ size = RINTERNAL_SIZE;
+ break;
+
+ default:
+ ret = __db_pgfmt(env, PGNO(pagep));
+ goto out;
+ }
+ if ((ret =
+ __db_ditem(dbc, pagep, indx, size)) != 0)
+ goto out;
+ }
+ if (argp->ind.size == 0)
+ HOFFSET(pagep) = file_dbp->pgsize;
+ pagep->lsn = argp->lsn;
+ }
+
+ if ((ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
+ goto out;
+
+next: if ((ret = __memp_fget(mpf, &argp->npgno, ip, NULL, 0, &pagep)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+			ret = __db_pgerr(file_dbp, argp->npgno, ret);
+ goto out;
+ } else
+ goto done;
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->nlsn);
+ CHECK_LSN(file_dbp->env, op, cmp_p, &LSN(pagep), &argp->nlsn);
+
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Need to truncate the page. */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ HOFFSET(pagep) = file_dbp->pgsize;
+ NUM_ENT(pagep) = 0;
+ pagep->lsn = *lsnp;
+ } else if (cmp_n == 0 && !DB_REDO(op)) {
+ /* Need to put the data back on the page. */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ if (TYPE(pagep) == P_OVERFLOW) {
+ OV_REF(pagep) = OV_REF(argp->hdr.data);
+ OV_LEN(pagep) = OV_LEN(argp->hdr.data);
+ bp = (u_int8_t *) pagep + P_OVERHEAD(file_dbp);
+ memcpy(bp, argp->data.data, argp->data.size);
+ } else {
+ bp = (u_int8_t *)pagep +
+ (db_indx_t)(HOFFSET(pagep) - argp->data.size);
+ memcpy(bp, argp->data.data, argp->data.size);
+
+ /* Copy index table. */
+ pinp = P_INP(file_dbp, pagep) + NUM_ENT(pagep);
+ ninp = argp->ind.data;
+ for (i = 0;
+ i < (int)(argp->ind.size / sizeof(*ninp)); i++)
+ *pinp++ = *ninp++;
+ HOFFSET(pagep) -= argp->data.size;
+ NUM_ENT(pagep) = i;
+ }
+ pagep->lsn = argp->nlsn;
+ }
+
+ if ((ret = __memp_fput(mpf,
+ ip, pagep, dbc->priority)) != 0)
+ goto out;
+done:	*lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: REC_CLOSE;
+}
+
+/*
+ * __bam_relink_43_recover --
+ * Recovery function for relink.
+ *
+ * PUBLIC: int __bam_relink_43_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_relink_43_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_relink_43_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ int cmp_n, cmp_p, modified, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__bam_relink_43_print);
+ REC_INTRO(__bam_relink_43_read, ip, 0);
+
+ /*
+ * There are up to three pages we need to check -- the page, and the
+ * previous and next pages, if they existed. For a page add operation,
+ * the current page is the result of a split and is being recovered
+ * elsewhere, so all we need do is recover the next page.
+ */
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ } else
+ goto next2;
+ }
+
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Redo the relink. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ pagep->lsn = *lsnp;
+ } else if (LOG_COMPARE(lsnp, &LSN(pagep)) == 0 && DB_UNDO(op)) {
+ /* Undo the relink. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ pagep->next_pgno = argp->next;
+ pagep->prev_pgno = argp->prev;
+ pagep->lsn = argp->lsn;
+ }
+ if ((ret = __memp_fput(mpf,
+ ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+next2: if ((ret = __memp_fget(mpf, &argp->next, ip, NULL, 0, &pagep)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+ ret = __db_pgerr(file_dbp, argp->next, ret);
+ goto out;
+ } else
+ goto prev;
+ }
+
+ modified = 0;
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn_next);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn_next);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Redo the remove or undo the add. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ pagep->prev_pgno = argp->prev;
+ modified = 1;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Undo the remove or redo the add. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ pagep->prev_pgno = argp->pgno;
+ modified = 1;
+ }
+ if (modified) {
+ if (DB_UNDO(op))
+ pagep->lsn = argp->lsn_next;
+ else
+ pagep->lsn = *lsnp;
+ }
+ if ((ret = __memp_fput(mpf,
+ ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+prev: if ((ret = __memp_fget(mpf, &argp->prev, ip, NULL, 0, &pagep)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+ ret = __db_pgerr(file_dbp, argp->prev, ret);
+ goto out;
+ } else
+ goto done;
+ }
+
+ modified = 0;
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn_prev);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn_prev);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Redo the relink. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ pagep->next_pgno = argp->next;
+ modified = 1;
+ } else if (LOG_COMPARE(lsnp, &LSN(pagep)) == 0 && DB_UNDO(op)) {
+ /* Undo the relink. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ pagep->next_pgno = argp->pgno;
+ modified = 1;
+ }
+ if (modified) {
+ if (DB_UNDO(op))
+ pagep->lsn = argp->lsn_prev;
+ else
+ pagep->lsn = *lsnp;
+ }
+ if ((ret = __memp_fput(mpf,
+ ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+ REC_CLOSE;
+}
diff --git a/src/btree/bt_reclaim.c b/src/btree/bt_reclaim.c
new file mode 100644
index 00000000..f465cc5a
--- /dev/null
+++ b/src/btree/bt_reclaim.c
@@ -0,0 +1,98 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/lock.h"
+
+/*
+ * __bam_reclaim --
+ * Free a database.
+ *
+ * PUBLIC: int __bam_reclaim __P((DB *, DB_THREAD_INFO *, DB_TXN *, u_int32_t));
+ */
+int
+__bam_reclaim(dbp, ip, txn, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ u_int32_t flags;
+{
+ DBC *dbc;
+ DB_LOCK meta_lock;
+ int ret, t_ret;
+
+ /* Acquire a cursor. */
+ if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0)
+ return (ret);
+
+ /* Write lock the metapage for deallocations. */
+ if ((ret = __db_lget(dbc,
+ 0, PGNO_BASE_MD, DB_LOCK_WRITE, 0, &meta_lock)) != 0)
+ goto err;
+
+ /* Avoid locking every page; we have the handle locked exclusively. */
+ F_SET(dbc, DBC_DONTLOCK);
+
+ /* Walk the tree, freeing pages. */
+ ret = __bam_traverse(dbc, DB_LOCK_WRITE,
+ PGNO_INVALID, __db_reclaim_callback, &flags);
+
+ if ((t_ret = __TLPUT(dbc, meta_lock)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Discard the cursor. */
+err: if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __bam_truncate --
+ * Truncate a database.
+ *
+ * PUBLIC: int __bam_truncate __P((DBC *, u_int32_t *));
+ */
+int
+__bam_truncate(dbc, countp)
+ DBC *dbc;
+ u_int32_t *countp;
+{
+ u_int32_t count;
+ int ret;
+
+#ifdef HAVE_COMPRESSION
+ u_int32_t comp_count;
+
+ comp_count = 0;
+ if (DB_IS_COMPRESSED(dbc->dbp) &&
+ (ret = __bam_compress_count(dbc, NULL, &comp_count)) != 0)
+ return (ret);
+#endif
+
+ count = 0;
+
+ /* Walk the tree, freeing pages. */
+ ret = __bam_traverse(dbc,
+ DB_LOCK_WRITE, PGNO_INVALID, __db_truncate_callback, &count);
+
+#ifdef HAVE_COMPRESSION
+ if (DB_IS_COMPRESSED(dbc->dbp)) {
+ if (countp != NULL)
+ *countp = comp_count;
+ } else
+#endif
+ if (countp != NULL)
+ *countp = count;
+
+ return (ret);
+}
diff --git a/src/btree/bt_recno.c b/src/btree/bt_recno.c
new file mode 100644
index 00000000..9356a742
--- /dev/null
+++ b/src/btree/bt_recno.c
@@ -0,0 +1,1427 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+
+static int __ram_add __P((DBC *, db_recno_t *, DBT *, u_int32_t, u_int32_t));
+static int __ram_source __P((DB *));
+static int __ram_sread __P((DBC *, db_recno_t));
+static int __ram_update __P((DBC *, db_recno_t, int));
+static int __ram_ca_getorder
+ __P((DBC *, DBC *, u_int32_t *, db_pgno_t, u_int32_t, void *));
+static int __ram_ca_setorder
+ __P((DBC *, DBC *, u_int32_t *, db_pgno_t, u_int32_t, void *));
+/*
+ * In recno, there are two meanings to the on-page "deleted" flag. If we're
+ * re-numbering records, it means the record was implicitly created. We skip
+ * over implicitly created records if doing a cursor "next" or "prev", and
+ * return DB_KEYEMPTY if they're explicitly requested. If not re-numbering
+ * records, it means that the record was implicitly created, or was deleted.
+ * We skip over implicitly created or deleted records if doing a cursor "next"
+ * or "prev", and return DB_KEYEMPTY if they're explicitly requested.
+ *
+ * If we're re-numbering records, then we have to detect in the cursor that
+ * a record was deleted, and adjust the cursor as necessary on the next get.
+ * If we're not re-numbering records, then we can detect that a record has
+ * been deleted by looking at the actual on-page record, so we completely
+ * ignore the cursor's delete flag. This is different from the B+tree code.
+ * It also maintains whether the cursor references a deleted record in the
+ * cursor, and it doesn't always check the on-page value.
+ */
+#define CD_SET(cp) { \
+ if (F_ISSET(cp, C_RENUMBER)) \
+ F_SET(cp, C_DELETED); \
+}
+#define CD_CLR(cp) { \
+ if (F_ISSET(cp, C_RENUMBER)) { \
+ F_CLR(cp, C_DELETED); \
+ cp->order = INVALID_ORDER; \
+ } \
+}
+#define CD_ISSET(cp) \
+ (F_ISSET(cp, C_RENUMBER) && F_ISSET(cp, C_DELETED) ? 1 : 0)
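+
+/*
+ * A worked example (illustrative, not part of the original source): in
+ * a renumbering tree, deleting the record under a cursor leaves that
+ * cursor positioned "between" records, so CD_SET marks it C_DELETED;
+ * the next cursor motion calls CD_CLR, which clears the flag and
+ * resets the order to INVALID_ORDER. In a non-renumbering tree all
+ * three macros are no-ops (CD_ISSET is always 0), because the on-page
+ * delete marker carries the state instead.
+ */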
+
+/*
+ * Macros for comparing the ordering of two cursors.
+ * cp1 comes before cp2 iff one of the following holds:
+ * cp1's recno is less than cp2's recno
+ * recnos are equal, both deleted, and cp1's order is less than cp2's
+ * recnos are equal, cp1 deleted, and cp2 not deleted
+ */
+#define C_LESSTHAN(cp1, cp2) \
+ (((cp1)->recno < (cp2)->recno) || \
+ (((cp1)->recno == (cp2)->recno) && \
+ ((CD_ISSET((cp1)) && CD_ISSET((cp2)) && (cp1)->order < (cp2)->order) || \
+ (CD_ISSET((cp1)) && !CD_ISSET((cp2))))))
+
+/*
+ * cp1 is equal to cp2 iff their recnos and delete flags are identical,
+ * and if the delete flag is set their orders are also identical.
+ */
+#define C_EQUAL(cp1, cp2) \
+ ((cp1)->recno == (cp2)->recno && CD_ISSET((cp1)) == CD_ISSET((cp2)) && \
+ (!CD_ISSET((cp1)) || (cp1)->order == (cp2)->order))
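+
+/*
+ * A worked example of the ordering (illustrative only): given cursors
+ * A = {recno 3, deleted, order 1}, B = {recno 3, deleted, order 2} and
+ * C = {recno 3, not deleted}, C_LESSTHAN ranks A before B (equal
+ * recnos, both deleted, lower order first) and both A and B before C
+ * (at equal recnos a deleted cursor sorts before an undeleted one).
+ * C_EQUAL holds only for two cursors agreeing on recno, delete flag
+ * and, when deleted, order.
+ */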
+
+/*
+ * Do we need to log the current cursor adjustment?
+ */
+#define CURADJ_LOG(dbc) \
+ (DBC_LOGGING((dbc)) && (dbc)->txn != NULL && (dbc)->txn->parent != NULL)
+
+/*
+ * After a search, copy the found page into the cursor, discarding any
+ * currently held lock.
+ */
+#define STACK_TO_CURSOR(cp, ret) { \
+ int __t_ret; \
+ (cp)->page = (cp)->csp->page; \
+ (cp)->pgno = (cp)->csp->page->pgno; \
+ (cp)->indx = (cp)->csp->indx; \
+ if ((__t_ret = __TLPUT(dbc, (cp)->lock)) != 0 && (ret) == 0) \
+ ret = __t_ret; \
+ (cp)->lock = (cp)->csp->lock; \
+ (cp)->lock_mode = (cp)->csp->lock_mode; \
+}
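+
+/*
+ * The callers below use this macro in a fixed pattern, shown here for
+ * reference only:
+ *
+ *	STACK_TO_CURSOR(cp, ret);
+ *	if (ret != 0)
+ *		goto err;
+ *
+ * The macro can fail while discarding the previously held lock, so the
+ * error must be checked before the copied page is used.
+ */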
+
+/*
+ * __ram_open --
+ * Recno open function.
+ *
+ * PUBLIC: int __ram_open __P((DB *, DB_THREAD_INFO *,
+ * PUBLIC: DB_TXN *, const char *, db_pgno_t, u_int32_t));
+ */
+int
+__ram_open(dbp, ip, txn, name, base_pgno, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name;
+ db_pgno_t base_pgno;
+ u_int32_t flags;
+{
+ BTREE *t;
+ DBC *dbc;
+ int ret, t_ret;
+
+ COMPQUIET(name, NULL);
+ t = dbp->bt_internal;
+
+ /* Start up the tree. */
+ if ((ret = __bam_read_root(dbp, ip, txn, base_pgno, flags)) != 0)
+ return (ret);
+
+ /*
+ * If the user specified a source tree, open it and map it in.
+ *
+ * !!!
+ * We don't complain if the user specified transactions or threads.
+ * It's possible to make it work, but you'd better know what you're
+ * doing!
+ */
+ if (t->re_source != NULL && (ret = __ram_source(dbp)) != 0)
+ return (ret);
+
+ /* If we're snapshotting an underlying source file, do it now. */
+ if (F_ISSET(dbp, DB_AM_SNAPSHOT)) {
+ /* Allocate a cursor. */
+ if ((ret = __db_cursor(dbp, ip, NULL, &dbc, 0)) != 0)
+ return (ret);
+
+ /* Do the snapshot. */
+ if ((ret = __ram_update(dbc,
+ DB_MAX_RECORDS, 0)) != 0 && ret == DB_NOTFOUND)
+ ret = 0;
+
+ /* Discard the cursor. */
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+
+ return (ret);
+}
+
+/*
+ * __ram_append --
+ * Recno append function.
+ *
+ * PUBLIC: int __ram_append __P((DBC *, DBT *, DBT *));
+ */
+int
+__ram_append(dbc, key, data)
+ DBC *dbc;
+ DBT *key, *data;
+{
+ BTREE_CURSOR *cp;
+ int ret;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ /*
+ * Make sure we've read in all of the backing source file. If
+ * we found the record or it simply didn't exist, add the
+ * user's record.
+ */
+ ret = __ram_update(dbc, DB_MAX_RECORDS, 0);
+ if (ret == 0 || ret == DB_NOTFOUND)
+ ret = __ram_add(dbc, &cp->recno, data, DB_APPEND, 0);
+
+ /* Return the record number. */
+ if (ret == 0 && key != NULL)
+ ret = __db_retcopy(dbc->env, key, &cp->recno,
+ sizeof(cp->recno), &dbc->rkey->data, &dbc->rkey->ulen);
+
+ if (!DB_RETOK_DBCPUT(ret))
+ F_SET(dbc, DBC_ERROR);
+ return (ret);
+}
+
+/*
+ * __ramc_del --
+ * Recno DBC->del function.
+ *
+ * PUBLIC: int __ramc_del __P((DBC *, u_int32_t));
+ */
+int
+__ramc_del(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ BKEYDATA bk;
+ BTREE *t;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DBT hdr, data;
+ DB_LOCK next_lock, prev_lock;
+ DB_LSN lsn;
+ db_pgno_t npgno, ppgno, save_npgno, save_ppgno;
+ int exact, nc, ret, stack, t_ret;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ t = dbp->bt_internal;
+ stack = 0;
+ save_npgno = save_ppgno = PGNO_INVALID;
+ LOCK_INIT(next_lock);
+ LOCK_INIT(prev_lock);
+ COMPQUIET(flags, 0);
+
+ /*
+ * The semantics of cursors during delete are as follows: in
+ * non-renumbering recnos, records are replaced with a marker
+ * containing a delete flag. If the record referenced by this cursor
+ * has already been deleted, we will detect that as part of the delete
+ * operation, and fail.
+ *
+ * In renumbering recnos, cursors which represent deleted items
+ * are flagged with the C_DELETED flag, and it is an error to
+ * call c_del a second time without an intervening cursor motion.
+ */
+ if (CD_ISSET(cp))
+ return (DB_KEYEMPTY);
+
+ /* Search the tree for the key; delete only deletes exact matches. */
+retry: if ((ret = __bam_rsearch(dbc, &cp->recno, SR_DELETE, 1, &exact)) != 0)
+ goto err;
+ if (!exact) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ stack = 1;
+
+ /* Copy the page into the cursor. */
+ STACK_TO_CURSOR(cp, ret);
+ if (ret != 0)
+ goto err;
+
+ /*
+ * If re-numbering records, the on-page deleted flag can only mean
+ * that this record was implicitly created. Applications aren't
+ * permitted to delete records they never created; return an error.
+ *
+ * If not re-numbering records, the on-page deleted flag means that
+ * this record was implicitly created, or was deleted at some time.
+ * The former is an error because applications aren't permitted to
+ * delete records they never created; the latter is an error because
+ * if the record was "deleted", we could never have found it.
+ */
+ if (B_DISSET(GET_BKEYDATA(dbp, cp->page, cp->indx)->type)) {
+ ret = DB_KEYEMPTY;
+ goto err;
+ }
+
+ if (F_ISSET(cp, C_RENUMBER)) {
+ /* If we are going to drop the page, lock its neighbors. */
+ if (STD_LOCKING(dbc) && NUM_ENT(cp->page) == 1 &&
+ PGNO(cp->page) != BAM_ROOT_PGNO(dbc)) {
+ if ((npgno = NEXT_PGNO(cp->page)) != PGNO_INVALID)
+ TRY_LOCK(dbc, npgno, save_npgno,
+ next_lock, DB_LOCK_WRITE, retry);
+ if (ret != 0)
+ goto err;
+ if ((ppgno = PREV_PGNO(cp->page)) != PGNO_INVALID)
+ TRY_LOCK(dbc, ppgno, save_ppgno,
+ prev_lock, DB_LOCK_WRITE, retry);
+ if (ret != 0)
+ goto err;
+ }
+ /* Delete the item, adjust the counts, adjust the cursors. */
+ if ((ret = __bam_ditem(dbc, cp->page, cp->indx)) != 0)
+ goto err;
+ if ((ret = __bam_adjust(dbc, -1)) != 0)
+ goto err;
+ if ((ret = __ram_ca(dbc, CA_DELETE, &nc)) != 0)
+ goto err;
+ if (nc > 0 && CURADJ_LOG(dbc) &&
+ (ret = __bam_rcuradj_log(dbp, dbc->txn, &lsn, 0,
+ CA_DELETE, BAM_ROOT_PGNO(dbc), cp->recno, cp->order)) != 0)
+ goto err;
+
+ /*
+ * If the page is empty, delete it.
+ *
+ * We never delete a root page. First, root pages of primary
+ * databases never go away, recno or otherwise. However, if
+ * it's the root page of an off-page duplicates database, then
+ * it can be deleted. We don't delete it here because we have
+ * no way of telling the primary database page holder (e.g.,
+ * the hash access method) that its page element should be
+ * cleaned up because the underlying tree is gone. So, we keep
+ * the page around until the last cursor referencing the empty
+ * tree is closed, and then clean it up.
+ */
+ if (NUM_ENT(cp->page) == 0 &&
+ PGNO(cp->page) != BAM_ROOT_PGNO(dbc)) {
+ /*
+ * We want to delete a single item out of the last page
+ * that we're not deleting.
+ */
+ if (F_ISSET(dbc, DBC_OPD))
+ LOCK_CHECK_OFF(dbc->thread_info);
+ ret = __bam_dpages(dbc, 0, BTD_RELINK);
+ if (F_ISSET(dbc, DBC_OPD))
+ LOCK_CHECK_ON(dbc->thread_info);
+
+ /*
+ * Regardless of the return from __bam_dpages, it will
+ * discard our stack and pinned page.
+ */
+ stack = 0;
+ cp->page = NULL;
+ LOCK_INIT(cp->lock);
+ cp->lock_mode = DB_LOCK_NG;
+ }
+ } else {
+ /* Use a delete/put pair to replace the record with a marker. */
+ if ((ret = __bam_ditem(dbc, cp->page, cp->indx)) != 0)
+ goto err;
+
+ B_TSET_DELETED(bk.type, B_KEYDATA);
+ bk.len = 0;
+ DB_INIT_DBT(hdr, &bk, SSZA(BKEYDATA, data));
+ DB_INIT_DBT(data, "", 0);
+ if ((ret = __db_pitem(dbc,
+ cp->page, cp->indx, BKEYDATA_SIZE(0), &hdr, &data)) != 0)
+ goto err;
+ }
+
+ t->re_modified = 1;
+
+err: if (!DB_RETOK_DBCDEL(ret))
+ F_SET(dbc, DBC_ERROR);
+ if (stack && (t_ret = __bam_stkrel(dbc, STK_CLRDBC)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __TLPUT(dbc, next_lock)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __TLPUT(dbc, prev_lock)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __ramc_get --
+ * Recno DBC->get function.
+ *
+ * PUBLIC: int __ramc_get
+ * PUBLIC: __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
+ */
+int
+__ramc_get(dbc, key, data, flags, pgnop)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+ db_pgno_t *pgnop;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ int cmp, exact, ret;
+
+ COMPQUIET(pgnop, NULL);
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ LF_CLR(DB_MULTIPLE|DB_MULTIPLE_KEY);
+retry: switch (flags) {
+ case DB_CURRENT:
+ /*
+ * If we're using mutable records and the deleted flag is
+ * set, the cursor is pointing at a nonexistent record;
+ * return an error.
+ */
+ if (CD_ISSET(cp))
+ return (DB_KEYEMPTY);
+ break;
+ case DB_NEXT_DUP:
+ /*
+ * If we're not in an off-page dup set, we know there's no
+ * next duplicate since recnos don't have them. If we
+ * are in an off-page dup set, the next item assuredly is
+ * a dup, so we set flags to DB_NEXT and keep going.
+ */
+ if (!F_ISSET(dbc, DBC_OPD))
+ return (DB_NOTFOUND);
+ /* FALLTHROUGH */
+ case DB_NEXT_NODUP:
+ /*
+ * Recno databases don't have duplicates, set flags to DB_NEXT
+ * and keep going.
+ */
+ /* FALLTHROUGH */
+ case DB_NEXT:
+ flags = DB_NEXT;
+ /*
+ * If record numbers are mutable: if we just deleted a record,
+ * we have to avoid incrementing the record number so that we
+ * return the right record by virtue of renumbering the tree.
+ */
+ if (CD_ISSET(cp)) {
+ /*
+ * Clear the flag, we've moved off the deleted record.
+ */
+ CD_CLR(cp);
+ break;
+ }
+
+ if (cp->recno != RECNO_OOB) {
+ ++cp->recno;
+ break;
+ }
+ /* FALLTHROUGH */
+ case DB_FIRST:
+ flags = DB_NEXT;
+ cp->recno = 1;
+ break;
+ case DB_PREV_DUP:
+ /*
+ * If we're not in an off-page dup set, we know there's no
+ * previous duplicate since recnos don't have them. If we
+ * are in an off-page dup set, the previous item assuredly
+ * is a dup, so we set flags to DB_PREV and keep going.
+ */
+ if (!F_ISSET(dbc, DBC_OPD))
+ return (DB_NOTFOUND);
+ /* FALLTHROUGH */
+ case DB_PREV_NODUP:
+ /*
+ * Recno databases don't have duplicates, set flags to DB_PREV
+ * and keep going.
+ */
+ /* FALLTHROUGH */
+ case DB_PREV:
+ flags = DB_PREV;
+ if (cp->recno != RECNO_OOB) {
+ if (cp->recno == 1) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ --cp->recno;
+ break;
+ }
+ /* FALLTHROUGH */
+ case DB_LAST:
+ flags = DB_PREV;
+ if (((ret = __ram_update(dbc,
+ DB_MAX_RECORDS, 0)) != 0) && ret != DB_NOTFOUND)
+ goto err;
+ if ((ret = __bam_nrecs(dbc, &cp->recno)) != 0)
+ goto err;
+ if (cp->recno == 0) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ break;
+ case DB_GET_BOTHC:
+ /*
+ * If we're doing a join and these are offpage dups,
+ * we want to keep searching forward from after the
+ * current cursor position. Increment the recno by 1,
+ * then proceed as for a DB_SET.
+ *
+ * Otherwise, we know there is no additional matching
+ * data, as recnos don't have dups; return DB_NOTFOUND.
+ */
+ if (F_ISSET(dbc, DBC_OPD)) {
+ cp->recno++;
+ break;
+ }
+ ret = DB_NOTFOUND;
+ goto err;
+ /* NOTREACHED */
+ case DB_GET_BOTH:
+ case DB_GET_BOTH_RANGE:
+ /*
+ * If we're searching a set of off-page dups, we start
+ * a new linear search from the first record. Otherwise,
+ * we compare the single data item associated with the
+ * requested record for a match.
+ */
+ if (F_ISSET(dbc, DBC_OPD)) {
+ cp->recno = 1;
+ break;
+ }
+ /* FALLTHROUGH */
+ case DB_SET:
+ case DB_SET_RANGE:
+ if ((ret = __ram_getno(dbc, key, &cp->recno, 0)) != 0)
+ goto err;
+ break;
+ default:
+ ret = __db_unknown_flag(dbp->env, "__ramc_get", flags);
+ goto err;
+ }
+
+ /*
+ * For DB_PREV, DB_LAST, DB_SET and DB_SET_RANGE, we have already
+ * called __ram_update() to make sure sufficient records have been
+ * read from the backing source file. Do it now for DB_CURRENT (if
+ * the current record was deleted we may need more records from the
+ * backing file for a DB_CURRENT operation), DB_FIRST and DB_NEXT.
+ * (We don't have to test for flags == DB_FIRST, because the switch
+ * statement above re-set flags to DB_NEXT in that case.)
+ */
+ if ((flags == DB_NEXT || flags == DB_CURRENT) && ((ret =
+ __ram_update(dbc, cp->recno, 0)) != 0) && ret != DB_NOTFOUND)
+ goto err;
+
+ for (;; ++cp->recno) {
+ /* Search the tree for the record. */
+ if ((ret = __bam_rsearch(dbc, &cp->recno,
+ F_ISSET(dbc, DBC_RMW) ? SR_FIND_WR : SR_FIND,
+ 1, &exact)) != 0)
+ goto err;
+ if (!exact) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+
+ /* Copy the page into the cursor. */
+ STACK_TO_CURSOR(cp, ret);
+ if (ret != 0)
+ goto err;
+
+ /*
+ * If re-numbering records, the on-page deleted flag means this
+ * record was implicitly created. If not re-numbering records,
+ * the on-page deleted flag means this record was implicitly
+ * created, or it was deleted at some time. Regardless, we
+ * skip such records if doing cursor next/prev operations or
+ * walking through off-page duplicates, and fail if they were
+ * requested explicitly by the application.
+ */
+ if (B_DISSET(GET_BKEYDATA(dbp, cp->page, cp->indx)->type))
+ switch (flags) {
+ case DB_NEXT:
+ case DB_PREV:
+ (void)__bam_stkrel(dbc, STK_CLRDBC);
+ PERFMON4(env, race, ramc_get,
+ dbp->fname, dbp->dname, cp->page, flags);
+ goto retry;
+ case DB_GET_BOTH:
+ case DB_GET_BOTH_RANGE:
+ /*
+ * If we're an OPD tree, we don't care about
+ * matching a record number on a DB_GET_BOTH
+ * -- everything belongs to the same tree. A
+ * normal recno should give up and return
+ * DB_NOTFOUND if the matching recno is deleted.
+ */
+ if (F_ISSET(dbc, DBC_OPD)) {
+ (void)__bam_stkrel(dbc, STK_CLRDBC);
+ continue;
+ }
+ ret = DB_NOTFOUND;
+ goto err;
+ default:
+ ret = DB_KEYEMPTY;
+ goto err;
+ }
+
+ if (flags == DB_GET_BOTH ||
+ flags == DB_GET_BOTHC || flags == DB_GET_BOTH_RANGE) {
+ if ((ret = __bam_cmp(dbc, data, cp->page, cp->indx,
+ __bam_defcmp, &cmp)) != 0)
+ return (ret);
+ if (cmp == 0)
+ break;
+ if (!F_ISSET(dbc, DBC_OPD)) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ (void)__bam_stkrel(dbc, STK_CLRDBC);
+ } else
+ break;
+ }
+
+ /* Return the key if the user didn't give us one. */
+ if (!F_ISSET(dbc, DBC_OPD) && !F_ISSET(key, DB_DBT_ISSET)) {
+ ret = __db_retcopy(dbp->env,
+ key, &cp->recno, sizeof(cp->recno),
+ &dbc->rkey->data, &dbc->rkey->ulen);
+ F_SET(key, DB_DBT_ISSET);
+ }
+
+ /* The cursor was reset, no further delete adjustment is necessary. */
+err: CD_CLR(cp);
+
+ return (ret);
+}
+
+/*
+ * __ramc_put --
+ * Recno DBC->put function.
+ *
+ * PUBLIC: int __ramc_put __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
+ */
+int
+__ramc_put(dbc, key, data, flags, pgnop)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+ db_pgno_t *pgnop;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DB_LSN lsn;
+ ENV *env;
+ u_int32_t iiflags;
+ int exact, nc, ret, t_ret;
+ void *arg;
+
+ COMPQUIET(pgnop, NULL);
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ /*
+ * DB_KEYFIRST and DB_KEYLAST mean different things if they're
+ * used in an off-page duplicate tree. If we're an off-page
+ * duplicate tree, they really mean "put at the beginning of the
+ * tree" and "put at the end of the tree" respectively, so translate
+ * them to something else.
+ */
+ if (F_ISSET(dbc, DBC_OPD))
+ switch (flags) {
+ case DB_KEYFIRST:
+ cp->recno = 1;
+ flags = DB_BEFORE;
+ break;
+ case DB_KEYLAST:
+ if ((ret = __ram_add(dbc,
+ &cp->recno, data, DB_APPEND, 0)) != 0)
+ return (ret);
+ if (CURADJ_LOG(dbc) &&
+ (ret = __bam_rcuradj_log(dbp, dbc->txn,
+ &lsn, 0, CA_ICURRENT,
+ BAM_ROOT_PGNO(dbc), cp->recno, cp->order)) != 0)
+ return (ret);
+ return (0);
+ default:
+ break;
+ }
+
+ /*
+ * Handle normal DB_KEYFIRST/DB_KEYLAST; for a recno, which has
+ * no duplicates, these are identical and mean "put the given
+ * datum at the given recno".
+ */
+ if (flags == DB_KEYFIRST || flags == DB_KEYLAST ||
+ flags == DB_NOOVERWRITE || flags == DB_OVERWRITE_DUP) {
+ ret = __ram_getno(dbc, key, &cp->recno, 1);
+ if (ret == 0 || ret == DB_NOTFOUND)
+ ret = __ram_add(dbc, &cp->recno, data, flags, 0);
+ return (ret);
+ }
+
+ /*
+ * If we're putting with a cursor that's marked C_DELETED, we need to
+ * take special care; the cursor doesn't "really" reference the item
+ * corresponding to its current recno, but instead is "between" that
+ * record and the current one. Translate the actual insert into
+ * DB_BEFORE, and let the __ram_ca work out the gory details of what
+ * should wind up pointing where.
+ */
+ if (CD_ISSET(cp))
+ iiflags = DB_BEFORE;
+ else
+ iiflags = flags;
+
+split: if ((ret = __bam_rsearch(dbc, &cp->recno, SR_INSERT, 1, &exact)) != 0)
+ goto err;
+ /*
+ * An inexact match is okay; it just means we're one record past the
+ * end, which is reasonable if we're marked deleted.
+ */
+ DB_ASSERT(env, exact || CD_ISSET(cp));
+
+ /* Copy the page into the cursor. */
+ STACK_TO_CURSOR(cp, ret);
+ if (ret != 0)
+ goto err;
+
+ ret = __bam_iitem(dbc, key, data, iiflags, 0);
+ t_ret = __bam_stkrel(dbc, STK_CLRDBC);
+
+ if (t_ret != 0 && (ret == 0 || ret == DB_NEEDSPLIT))
+ ret = t_ret;
+ else if (ret == DB_NEEDSPLIT) {
+ arg = &cp->recno;
+ if ((ret = __bam_split(dbc, arg, NULL)) != 0)
+ goto err;
+ goto split;
+ }
+ if (ret != 0)
+ goto err;
+
+ switch (flags) { /* Adjust the cursors. */
+ case DB_AFTER:
+ if ((ret = __ram_ca(dbc, CA_IAFTER, &nc)) != 0)
+ goto err;
+
+ /*
+ * We only need to adjust this cursor forward if we truly added
+ * the item after the current recno, rather than remapping it
+ * to DB_BEFORE.
+ */
+ if (iiflags == DB_AFTER)
+ ++cp->recno;
+
+ /* Only log if __ram_ca found any relevant cursors. */
+ if (nc > 0 && CURADJ_LOG(dbc) &&
+ (ret = __bam_rcuradj_log(dbp, dbc->txn, &lsn, 0, CA_IAFTER,
+ BAM_ROOT_PGNO(dbc), cp->recno, cp->order)) != 0)
+ goto err;
+ break;
+ case DB_BEFORE:
+ if ((ret = __ram_ca(dbc, CA_IBEFORE, &nc)) != 0)
+ goto err;
+ --cp->recno;
+
+ /* Only log if __ram_ca found any relevant cursors. */
+ if (nc > 0 && CURADJ_LOG(dbc) &&
+ (ret = __bam_rcuradj_log(dbp, dbc->txn, &lsn, 0, CA_IBEFORE,
+ BAM_ROOT_PGNO(dbc), cp->recno, cp->order)) != 0)
+ goto err;
+ break;
+ case DB_CURRENT:
+ /*
+ * We only need to do an adjustment if we actually
+ * added an item, which we only would have done if the
+ * cursor was marked deleted.
+ */
+ if (!CD_ISSET(cp))
+ break;
+
+ /* Only log if __ram_ca found any relevant cursors. */
+ if ((ret = __ram_ca(dbc, CA_ICURRENT, &nc)) != 0)
+ goto err;
+ if (nc > 0 && CURADJ_LOG(dbc) && (ret = __bam_rcuradj_log(dbp,
+ dbc->txn, &lsn, 0, CA_ICURRENT,
+ BAM_ROOT_PGNO(dbc), cp->recno, cp->order)) != 0)
+ goto err;
+ break;
+ default:
+ break;
+ }
+
+ /* Return the key if we've created a new record. */
+ if (!F_ISSET(dbc, DBC_OPD) &&
+ (flags == DB_AFTER || flags == DB_BEFORE) && key != NULL)
+ ret = __db_retcopy(env, key, &cp->recno,
+ sizeof(cp->recno), &dbc->rkey->data, &dbc->rkey->ulen);
+
+ /* The cursor was reset, no further delete adjustment is necessary. */
+err: CD_CLR(cp);
+
+ if (!DB_RETOK_DBCDEL(ret))
+ F_SET(dbc, DBC_ERROR);
+ return (ret);
+}
+
+static int
+__ram_ca_getorder(dbc, my_dbc, orderp, root_pgno, recno, args)
+ DBC *dbc, *my_dbc;
+ u_int32_t *orderp;
+ db_pgno_t root_pgno;
+ u_int32_t recno;
+ void *args;
+{
+ BTREE_CURSOR *cp;
+
+ COMPQUIET(my_dbc, NULL);
+ COMPQUIET(args, NULL);
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+ if (root_pgno == BAM_ROOT_PGNO(dbc) &&
+ recno == cp->recno && CD_ISSET(cp) &&
+ *orderp <= cp->order &&
+ !MVCC_SKIP_CURADJ(dbc, BAM_ROOT_PGNO(dbc)))
+ *orderp = cp->order;
+ return (0);
+}
+
+static int
+__ram_ca_setorder(dbc, my_dbc, foundp, pgno, order, args)
+ DBC *dbc, *my_dbc;
+ u_int32_t *foundp;
+ db_pgno_t pgno;
+ u_int32_t order;
+ void *args;
+{
+ BTREE_CURSOR *cp, *cp_arg;
+ int adjusted;
+ ca_recno_arg op;
+ db_recno_t recno;
+
+ COMPQUIET(pgno, 0);
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+ cp_arg = (BTREE_CURSOR *)my_dbc->internal;
+ op = *(ca_recno_arg *)args;
+
+ if (cp_arg->root != cp->root ||
+ MVCC_SKIP_CURADJ(dbc, BAM_ROOT_PGNO(dbc)))
+ return (0);
+ ++(*foundp);
+ adjusted = 0;
+ recno = cp_arg->recno;
+ switch (op) {
+ case CA_DELETE:
+ if (recno < cp->recno) {
+ --cp->recno;
+ /*
+ * If the adjustment made them equal,
+ * we have to merge the orders.
+ */
+ if (recno == cp->recno && CD_ISSET(cp))
+ cp->order += order;
+ } else if (recno == cp->recno &&
+ !CD_ISSET(cp)) {
+ CD_SET(cp);
+ cp->order = order;
+ /*
+ * If we're deleting the item, we can't
+ * keep a streaming offset cached.
+ */
+ cp->stream_start_pgno = PGNO_INVALID;
+ }
+ break;
+ case CA_IBEFORE:
+ /*
+ * IBEFORE is just like IAFTER, except that we
+ * adjust cursors on the current record too.
+ */
+ if (C_EQUAL(cp_arg, cp)) {
+ ++cp->recno;
+ adjusted = 1;
+ }
+ goto iafter;
+ case CA_ICURRENT:
+ /*
+ * If the original cursor wasn't deleted, we
+ * just did a replacement and so there's no
+ * need to adjust anything--we shouldn't have
+ * gotten this far. Otherwise, we behave
+ * much like an IAFTER, except that all
+ * cursors pointing to the current item get
+ * marked undeleted and point to the new
+ * item.
+ */
+ DB_ASSERT(dbc->dbp->env, CD_ISSET(cp_arg));
+ if (C_EQUAL(cp_arg, cp)) {
+ CD_CLR(cp);
+ break;
+ }
+ /* FALLTHROUGH */
+ case CA_IAFTER:
+iafter: if (!adjusted && C_LESSTHAN(cp_arg, cp)) {
+ ++cp->recno;
+ adjusted = 1;
+ }
+ if (recno == cp->recno && adjusted)
+ /*
+ * If we've moved this cursor's recno,
+ * split its order number--i.e.,
+ * decrement it by enough so that
+ * the lowest cursor moved has order 1.
+ * cp_arg->order is the split point,
+ * so decrement by one less than that.
+ */
+ cp->order -= (cp_arg->order - 1);
+ break;
+ }
+ return (0);
+}
+
+/*
+ * __ram_ca --
+ * Adjust cursors. Returns the number of relevant cursors.
+ *
+ * PUBLIC: int __ram_ca __P((DBC *, ca_recno_arg, int *));
+ */
+int
+__ram_ca(dbc_arg, op, foundp)
+ DBC *dbc_arg;
+ ca_recno_arg op;
+ int *foundp;
+{
+ BTREE_CURSOR *cp_arg;
+ DB *dbp;
+ ENV *env;
+ db_recno_t recno;
+ u_int32_t found, order;
+ int ret;
+
+ dbp = dbc_arg->dbp;
+ env = dbp->env;
+ cp_arg = (BTREE_CURSOR *)dbc_arg->internal;
+ recno = cp_arg->recno;
+
+ /*
+ * It only makes sense to adjust cursors if we're a renumbering
+ * recno; we should only be called if this is one.
+ */
+ DB_ASSERT(env, F_ISSET(cp_arg, C_RENUMBER));
+
+ /*
+ * Adjust the cursors. See the comment in __bam_ca_delete().
+ *
+ * If we're doing a delete, we need to find the highest
+ * order of any cursor currently pointing at this item,
+ * so we can assign a higher order to the newly deleted
+ * cursor. Unfortunately, this requires a second pass through
+ * the cursor list.
+ */
+ if (op == CA_DELETE) {
+ if ((ret = __db_walk_cursors(dbp, NULL, __ram_ca_getorder,
+ &order, BAM_ROOT_PGNO(dbc_arg), recno, NULL)) != 0)
+ return (ret);
+ order++;
+ } else
+ order = INVALID_ORDER;
+
+ if ((ret = __db_walk_cursors(dbp, dbc_arg,
+ __ram_ca_setorder, &found, 0, order, &op)) != 0)
+ return (ret);
+ if (foundp != NULL)
+ *foundp = (int)found;
+ return (0);
+}
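+
+/*
+ * A worked example (illustrative, not from the original source): three
+ * cursors sit on recno 5, two of them already marked deleted with
+ * orders 1 and 2. A CA_DELETE through the third cursor makes the
+ * __ram_ca_getorder pass report 2 as the highest existing order, so
+ * the newly deleted cursor is assigned order 3, preserving the
+ * relative ordering that C_LESSTHAN depends on.
+ */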
+
+/*
+ * __ram_getno --
+ * Check the user's record number, and make sure we've seen it.
+ *
+ * PUBLIC: int __ram_getno __P((DBC *, const DBT *, db_recno_t *, int));
+ */
+int
+__ram_getno(dbc, key, rep, can_create)
+ DBC *dbc;
+ const DBT *key;
+ db_recno_t *rep;
+ int can_create;
+{
+ DB *dbp;
+ db_recno_t recno;
+
+ dbp = dbc->dbp;
+
+ /* If passed an empty DBT from Java, key->data may be NULL. */
+ if (key->size != sizeof(db_recno_t)) {
+ __db_errx(dbp->env, DB_STR("1001",
+ "illegal record number size"));
+ return (EINVAL);
+ }
+
+ /* Check the user's record number. */
+ if ((recno = *(db_recno_t *)key->data) == 0) {
+ __db_errx(dbp->env, DB_STR("1002",
+ "illegal record number of 0"));
+ return (EINVAL);
+ }
+ if (rep != NULL)
+ *rep = recno;
+
+ /*
+ * Btree can neither create records nor read them in. Recno can
+ * do both, see if we can find the record.
+ */
+ return (dbc->dbtype == DB_RECNO ?
+ __ram_update(dbc, recno, can_create) : 0);
+}
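+
+/*
+ * A minimal caller sketch (illustrative only; it assumes an open
+ * cursor dbc and shows that the key must carry a db_recno_t):
+ *
+ *	db_recno_t found, recno;
+ *	DBT key;
+ *
+ *	recno = 42;
+ *	memset(&key, 0, sizeof(key));
+ *	key.data = &recno;
+ *	key.size = sizeof(recno);
+ *	ret = __ram_getno(dbc, &key, &found, 0);
+ *
+ * A key of any other size, or a record number of 0, returns EINVAL.
+ */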
+
+/*
+ * __ram_update --
+ * Ensure the tree has records up to and including the specified one.
+ */
+static int
+__ram_update(dbc, recno, can_create)
+ DBC *dbc;
+ db_recno_t recno;
+ int can_create;
+{
+ BTREE *t;
+ DB *dbp;
+ DBT *rdata;
+ db_recno_t nrecs;
+ int ret;
+
+ dbp = dbc->dbp;
+ t = dbp->bt_internal;
+
+ /*
+ * If we can't create records and we've read the entire backing input
+ * file, we're done.
+ */
+ if (!can_create && t->re_eof)
+ return (0);
+
+ /*
+ * If we haven't seen this record yet, try to get it from the original
+ * file.
+ */
+ if ((ret = __bam_nrecs(dbc, &nrecs)) != 0)
+ return (ret);
+ if (!t->re_eof && recno > nrecs) {
+ if ((ret = __ram_sread(dbc, recno)) != 0 && ret != DB_NOTFOUND)
+ return (ret);
+ if ((ret = __bam_nrecs(dbc, &nrecs)) != 0)
+ return (ret);
+ }
+
+ /*
+ * If we can create records, create empty ones up to the requested
+ * record.
+ */
+ if (!can_create || recno <= nrecs + 1)
+ return (0);
+
+ rdata = &dbc->my_rdata;
+ rdata->flags = 0;
+ rdata->size = 0;
+
+ while (recno > ++nrecs)
+ if ((ret = __ram_add(dbc,
+ &nrecs, rdata, 0, BI_DELETED)) != 0)
+ return (ret);
+ return (0);
+}
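+
+/*
+ * For example (illustrative): with 10 records in the tree, asking for
+ * record 15 with can_create set appends the empty, implicitly created
+ * (BI_DELETED) records 11 through 14; record 15 itself is then added
+ * by the caller.
+ */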
+
+/*
+ * __ram_source --
+ * Load information about the backing file.
+ */
+static int
+__ram_source(dbp)
+ DB *dbp;
+{
+ BTREE *t;
+ ENV *env;
+ char *source;
+ int ret;
+
+ env = dbp->env;
+ t = dbp->bt_internal;
+
+ /* Find the real name, and swap out the one we had before. */
+ if ((ret = __db_appname(env,
+ DB_APP_DATA, t->re_source, NULL, &source)) != 0)
+ return (ret);
+ __os_free(env, t->re_source);
+ t->re_source = source;
+
+ /*
+ * !!!
+ * It's possible that the backing source file is read-only. We don't
+ * much care other than we'll complain if there are any modifications
+ * when it comes time to write the database back to the source.
+ */
+ if ((t->re_fp = fopen(t->re_source, "rb")) == NULL) {
+ ret = __os_get_errno();
+ __db_err(env, ret, "%s", t->re_source);
+ return (ret);
+ }
+
+ t->re_eof = 0;
+ return (0);
+}
+
+/*
+ * __ram_writeback --
+ * Rewrite the backing file.
+ *
+ * PUBLIC: int __ram_writeback __P((DB *));
+ */
+int
+__ram_writeback(dbp)
+ DB *dbp;
+{
+ BTREE *t;
+ DBC *dbc;
+ DBT key, data;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ FILE *fp;
+ db_recno_t keyno;
+ int ret, t_ret;
+ u_int8_t delim, *pad;
+
+ t = dbp->bt_internal;
+ env = dbp->env;
+ fp = NULL;
+ pad = NULL;
+
+ /* If the file wasn't modified, we're done. */
+ if (!t->re_modified)
+ return (0);
+
+ /* If there's no backing source file, we're done. */
+ if (t->re_source == NULL) {
+ t->re_modified = 0;
+ return (0);
+ }
+
+ /*
+ * We step through the records, writing each one out. Use the record
+ * number and the dbp->get() function, instead of a cursor, so we find
+ * and write out "deleted" or non-existent records. The DB handle may
+ * be threaded, so allocate memory as we go.
+ */
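+ /*
+ * For example (illustrative): in a fixed-length recno with record 3
+ * deleted, __db_get returns DB_KEYEMPTY for key 3 and the loop below
+ * writes re_len pad bytes in its place, so the backing file's record
+ * offsets stay intact.
+ */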
+ memset(&key, 0, sizeof(key));
+ key.size = sizeof(db_recno_t);
+ key.data = &keyno;
+ memset(&data, 0, sizeof(data));
+ F_SET(&data, DB_DBT_REALLOC);
+
+ /* Allocate a cursor. */
+ ENV_GET_THREAD_INFO(env, ip);
+ if ((ret = __db_cursor(dbp, ip, NULL, &dbc, 0)) != 0)
+ return (ret);
+
+ /*
+ * Read any remaining records into the tree.
+ *
+ * !!!
+ * This is why we can't support transactions when applications specify
+ * backing (re_source) files. At this point we have to read in the
+ * rest of the records from the file so that we can write all of the
+ * records back out again, which could modify a page for which we'd
+ * have to log changes and which we don't have locked. This could be
+ * partially fixed by taking a snapshot of the entire file during the
+ * DB->open, as DB->open is transaction protected. But if a checkpoint
+ * then occurs, the part of the log holding the copy of the file could
+ * be discarded, and that would make it impossible to recover in the
+ * face of disaster. This could all probably be fixed, but it would
+ * require transaction protecting the backing source file.
+ *
+ * XXX
+ * This could be made to work now that we have transactions protecting
+ * file operations. Margo has specifically asked for the privilege of
+ * doing this work.
+ */
+ if ((ret =
+ __ram_update(dbc, DB_MAX_RECORDS, 0)) != 0 && ret != DB_NOTFOUND)
+ goto err;
+
+ /*
+ * Close any existing file handle and re-open the file, truncating it.
+ */
+ if (t->re_fp != NULL) {
+ if (fclose(t->re_fp) != 0) {
+ ret = __os_get_errno();
+ __db_err(env, ret, "%s", t->re_source);
+ goto err;
+ }
+ t->re_fp = NULL;
+ }
+ if ((fp = fopen(t->re_source, "wb")) == NULL) {
+ ret = __os_get_errno();
+ __db_err(env, ret, "%s", t->re_source);
+ goto err;
+ }
+
+ /*
+ * We'll need the delimiter if we're doing variable-length records,
+ * and the pad character if we're doing fixed-length records.
+ */
+ delim = t->re_delim;
+ for (keyno = 1;; ++keyno) {
+ switch (ret = __db_get(dbp, ip, NULL, &key, &data, 0)) {
+ case 0:
+ if (data.size != 0 &&
+ fwrite(data.data, 1, data.size, fp) != data.size)
+ goto write_err;
+ break;
+ case DB_KEYEMPTY:
+ if (F_ISSET(dbp, DB_AM_FIXEDLEN)) {
+ if (pad == NULL) {
+ if ((ret = __os_malloc(
+ env, t->re_len, &pad)) != 0)
+ goto err;
+ memset(pad, t->re_pad, t->re_len);
+ }
+ if (fwrite(pad, 1, t->re_len, fp) != t->re_len)
+ goto write_err;
+ }
+ break;
+ case DB_NOTFOUND:
+ ret = 0;
+ goto done;
+ default:
+ goto err;
+ }
+ if (!F_ISSET(dbp, DB_AM_FIXEDLEN) &&
+ fwrite(&delim, 1, 1, fp) != 1) {
+write_err: ret = __os_get_errno();
+ __db_err(env, ret, DB_STR_A("1003",
+ "%s: write failed to backing file", "%s"),
+ t->re_source);
+ goto err;
+ }
+ }
+
+err:
+done: /* Close the file descriptor. */
+ if (fp != NULL && fclose(fp) != 0) {
+ t_ret = __os_get_errno();
+ __db_err(env, t_ret, "%s", t->re_source);
+ if (ret == 0)
+ ret = t_ret;
+ }
+
+ /* Discard the cursor. */
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Discard memory allocated to hold the data items. */
+ if (data.data != NULL)
+ __os_ufree(env, data.data);
+ if (pad != NULL)
+ __os_free(env, pad);
+
+ if (ret == 0)
+ t->re_modified = 0;
+
+ return (ret);
+}
+
+/*
+ * __ram_sread --
+ * Read records from a source file.
+ */
+static int
+__ram_sread(dbc, top)
+ DBC *dbc;
+ db_recno_t top;
+{
+ BTREE *t;
+ DB *dbp;
+ DBT data, *rdata;
+ db_recno_t recno;
+ size_t len;
+ int ch, ret, was_modified;
+
+ t = dbc->dbp->bt_internal;
+ dbp = dbc->dbp;
+ was_modified = t->re_modified;
+
+ if ((ret = __bam_nrecs(dbc, &recno)) != 0)
+ return (ret);
+
+ /*
+ * Use the record key return memory; it's only a short-term use.
+ * The record data return memory is used by __bam_iitem, which
+ * we'll indirectly call, so use the key so as not to collide.
+ */
+ len = F_ISSET(dbp, DB_AM_FIXEDLEN) ? t->re_len : 256;
+ rdata = &dbc->my_rkey;
+ if (rdata->ulen < len) {
+ if ((ret = __os_realloc(
+ dbp->env, len, &rdata->data)) != 0) {
+ rdata->ulen = 0;
+ rdata->data = NULL;
+ return (ret);
+ }
+ rdata->ulen = (u_int32_t)len;
+ }
+
+ memset(&data, 0, sizeof(data));
+ while (recno < top) {
+ data.data = rdata->data;
+ data.size = 0;
+ if (F_ISSET(dbp, DB_AM_FIXEDLEN))
+ for (len = t->re_len; len > 0; --len) {
+ if ((ch = fgetc(t->re_fp)) == EOF) {
+ if (data.size == 0)
+ goto eof;
+ break;
+ }
+ ((u_int8_t *)data.data)[data.size++] = ch;
+ }
+ else
+ for (;;) {
+ if ((ch = fgetc(t->re_fp)) == EOF) {
+ if (data.size == 0)
+ goto eof;
+ break;
+ }
+ if (ch == t->re_delim)
+ break;
+
+ ((u_int8_t *)data.data)[data.size++] = ch;
+ if (data.size == rdata->ulen) {
+ if ((ret = __os_realloc(dbp->env,
+ rdata->ulen *= 2,
+ &rdata->data)) != 0) {
+ rdata->ulen = 0;
+ rdata->data = NULL;
+ return (ret);
+ } else
+ data.data = rdata->data;
+ }
+ }
+
+ /*
+ * Another process may have read this record from the input
+ * file and stored it into the database already, in which
+ * case we don't need to repeat that operation. We detect
+ * this by checking if the last record we've read is greater
+ * than or equal to the number of records in the database.
+ */
+ if (t->re_last >= recno) {
+ ++recno;
+ if ((ret = __ram_add(dbc, &recno, &data, 0, 0)) != 0)
+ goto err;
+ }
+ ++t->re_last;
+ }
+
+ if (0) {
+eof: t->re_eof = 1;
+ ret = DB_NOTFOUND;
+ }
+err: if (!was_modified)
+ t->re_modified = 0;
+
+ return (ret);
+}
+
+/*
+ * __ram_add --
+ * Add records into the tree.
+ */
+static int
+__ram_add(dbc, recnop, data, flags, bi_flags)
+ DBC *dbc;
+ db_recno_t *recnop;
+ DBT *data;
+ u_int32_t flags, bi_flags;
+{
+ BTREE_CURSOR *cp;
+ int exact, ret, stack, t_ret;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+retry: /* Find the slot for insertion. */
+ if ((ret = __bam_rsearch(dbc, recnop,
+ SR_INSERT | (flags == DB_APPEND ? SR_APPEND : 0), 1, &exact)) != 0)
+ return (ret);
+ stack = 1;
+
+ /* Copy the page into the cursor. */
+ STACK_TO_CURSOR(cp, ret);
+ if (ret != 0)
+ goto err;
+
+ if (exact && flags == DB_NOOVERWRITE && !CD_ISSET(cp) &&
+ !B_DISSET(GET_BKEYDATA(dbc->dbp, cp->page, cp->indx)->type)) {
+ ret = DB_KEYEXIST;
+ goto err;
+ }
+
+ /*
+ * The application may modify the data based on the selected record
+ * number.
+ */
+ if (flags == DB_APPEND && dbc->dbp->db_append_recno != NULL &&
+ (ret = dbc->dbp->db_append_recno(dbc->dbp, data, *recnop)) != 0)
+ goto err;
+
+ /*
+ * Select the arguments for __bam_iitem() and do the insert. If the
+ * key is an exact match, or we're replacing the data item with a
+ * new data item, replace the current item. If the key isn't an exact
+ * match, we're inserting a new key/data pair before the search
+ * location.
+ */
+ switch (ret = __bam_iitem(dbc,
+ NULL, data, exact ? DB_CURRENT : DB_BEFORE, bi_flags)) {
+ case 0:
+ /*
+ * Don't adjust anything.
+ *
+ * If we inserted a record, no cursors need adjusting because
+ * the only new record it's possible to insert is at the very
+ * end of the tree. The necessary adjustments to the internal
+ * page counts were made by __bam_iitem().
+ *
+ * If we overwrote a record, no cursors need adjusting because
+ * future DBcursor->get calls will simply return the underlying
+ * record (there's no adjustment made for the DB_CURRENT flag
+ * when a cursor get operation immediately follows a cursor
+ * delete operation, and the normal adjustment for the DB_NEXT
+ * flag is still correct).
+ */
+ break;
+ case DB_NEEDSPLIT:
+ /* Discard the stack of pages and split the page. */
+ (void)__bam_stkrel(dbc, STK_CLRDBC);
+ stack = 0;
+
+ if ((ret = __bam_split(dbc, recnop, NULL)) != 0)
+ goto err;
+
+ goto retry;
+ /* NOTREACHED */
+ default:
+ goto err;
+ }
+
+err: if (stack && (t_ret = __bam_stkrel(dbc, STK_CLRDBC)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
diff --git a/src/btree/bt_rsearch.c b/src/btree/bt_rsearch.c
new file mode 100644
index 00000000..36d1c667
--- /dev/null
+++ b/src/btree/bt_rsearch.c
@@ -0,0 +1,513 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+
+/*
+ * __bam_rsearch --
+ * Search a btree for a record number.
+ *
+ * PUBLIC: int __bam_rsearch __P((DBC *, db_recno_t *, u_int32_t, int, int *));
+ */
+int
+__bam_rsearch(dbc, recnop, flags, stop, exactp)
+ DBC *dbc;
+ db_recno_t *recnop;
+ u_int32_t flags;
+ int stop, *exactp;
+{
+ BINTERNAL *bi;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DB_LOCK lock;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ PAGE *h;
+ RINTERNAL *ri;
+ db_indx_t adjust, deloffset, indx, top;
+ db_lockmode_t lock_mode;
+ db_pgno_t pg;
+ db_recno_t recno, t_recno, total;
+ u_int32_t get_mode;
+ int ret, stack, t_ret;
+
+ if (F_ISSET(dbc, DBC_OPD))
+ LOCK_CHECK_OFF(dbc->thread_info);
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ mpf = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ h = NULL;
+ ret = 0;
+
+ BT_STK_CLR(cp);
+
+ /*
+ * There are several ways we search a btree tree. The flags argument
+ * specifies if we're acquiring read or write locks and if we are
+ * locking pairs of pages. In addition, if we're adding or deleting
+ * an item, we have to lock the entire tree, regardless. See btree.h
+ * for more details.
+ *
+ * If write-locking pages, we need to know whether or not to acquire a
+ * write lock on a page before getting it. This depends on how deep it
+ * is in the tree, which we don't know until we acquire the root page. So,
+ * if we need to lock the root page we may have to upgrade it later,
+ * because we won't get the correct lock initially.
+ *
+ * Retrieve the root page.
+ */
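+ /*
+ * For example (from callers elsewhere in this change): __ramc_del
+ * searches with SR_DELETE, __ramc_put with SR_INSERT, __ram_add with
+ * SR_INSERT (plus SR_APPEND for DB_APPEND), and __ramc_get with
+ * SR_FIND, or SR_FIND_WR when the cursor is marked DBC_RMW.
+ */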
+
+ if ((ret = __bam_get_root(dbc, PGNO_INVALID, stop, flags, &stack)) != 0)
+ goto done;
+ lock_mode = cp->csp->lock_mode;
+ get_mode = lock_mode == DB_LOCK_WRITE ? DB_MPOOL_DIRTY : 0;
+ lock = cp->csp->lock;
+ h = cp->csp->page;
+
+ BT_STK_CLR(cp);
+ /*
+ * If appending to the tree, set the record number now -- we have the
+ * root page locked.
+ *
+ * Delete only deletes exact matches; read only returns exact matches.
+ * Note, this is different from __bam_search(), which returns non-exact
+ * matches for read.
+ *
+ * The record may not exist. We can only return the correct location
+ * for the record immediately after the last record in the tree, so do
+ * a fast check now.
+ */
+ total = RE_NREC(h);
+ if (LF_ISSET(SR_APPEND)) {
+ *exactp = 0;
+ *recnop = recno = total + 1;
+ } else {
+ recno = *recnop;
+ if (recno <= total)
+ *exactp = 1;
+ else {
+ *exactp = 0;
+ if (!LF_ISSET(SR_PAST_EOF) || recno > total + 1) {
+ /*
+ * Keep the page locked for serializability.
+ *
+ * XXX
+ * This leaves the root page locked, which will
+ * eliminate any concurrency. A possible fix
+ * would be to lock the last leaf page instead.
+ */
+ ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority);
+ if ((t_ret =
+ __TLPUT(dbc, lock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret == 0)
+ ret = DB_NOTFOUND;
+ goto done;
+ }
+ }
+ }
+
+ /*
+ * !!!
+ * Record numbers in the tree are 0-based, but the recno is
+ * 1-based. All of the calculations below have to take this
+ * into account.
+ */
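+ /*
+ * For example (illustrative): a search for recno 3 on a P_LRECNO
+ * leaf ends at page index 2.
+ */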
+ for (total = 0;;) {
+ switch (TYPE(h)) {
+ case P_LBTREE:
+ if (LF_ISSET(SR_MAX)) {
+ indx = NUM_ENT(h) - 2;
+ goto enter;
+ }
+ /* FALLTHROUGH */
+ case P_LDUP:
+ if (LF_ISSET(SR_MAX)) {
+ indx = NUM_ENT(h) - 1;
+ goto enter;
+ }
+ recno -= total;
+ /*
+ * There may be logically deleted records on the page.
+ * If there are enough, the record may not exist.
+ */
+ if (TYPE(h) == P_LBTREE) {
+ adjust = P_INDX;
+ deloffset = O_INDX;
+ } else {
+ adjust = O_INDX;
+ deloffset = 0;
+ }
+ for (t_recno = 0, indx = 0;; indx += adjust) {
+ if (indx >= NUM_ENT(h)) {
+ *exactp = 0;
+ if (!LF_ISSET(SR_PAST_EOF) ||
+ recno > t_recno + 1) {
+ ret = __memp_fput(mpf,
+ dbc->thread_info,
+ h, dbc->priority);
+ h = NULL;
+ if ((t_ret = __TLPUT(dbc,
+ lock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret == 0)
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ }
+ if (!B_DISSET(GET_BKEYDATA(dbp, h,
+ indx + deloffset)->type) &&
+ ++t_recno == recno)
+ break;
+ }
+
+ BT_STK_ENTER(env, cp, h, indx, lock, lock_mode, ret);
+ if (ret != 0)
+ goto err;
+ if (LF_ISSET(SR_BOTH))
+ goto get_prev;
+ goto done;
+ case P_IBTREE:
+ if (LF_ISSET(SR_MAX)) {
+ indx = NUM_ENT(h);
+ bi = GET_BINTERNAL(dbp, h, indx - 1);
+ } else for (indx = 0, top = NUM_ENT(h);;) {
+ bi = GET_BINTERNAL(dbp, h, indx);
+ if (++indx == top || total + bi->nrecs >= recno)
+ break;
+ total += bi->nrecs;
+ }
+ pg = bi->pgno;
+ break;
+ case P_LRECNO:
+ if (LF_ISSET(SR_MAX))
+ recno = NUM_ENT(h);
+ else
+ recno -= total;
+
+ /* Correct from 1-based to 0-based for a page offset. */
+ --recno;
+enter: BT_STK_ENTER(env, cp, h, recno, lock, lock_mode, ret);
+ if (ret != 0)
+ goto err;
+ if (LF_ISSET(SR_BOTH)) {
+get_prev: DB_ASSERT(env, LF_ISSET(SR_NEXT));
+ /*
+ * We have a NEXT tree; now add the subtree
+ * that points to the previous page.
+ */
+ cp->csp++;
+ indx = cp->sp->indx - 1;
+ h = cp->sp->page;
+ if (TYPE(h) == P_IRECNO) {
+ ri = GET_RINTERNAL(dbp, h, indx);
+ pg = ri->pgno;
+ } else {
+ DB_ASSERT(env, TYPE(h) == P_IBTREE);
+ bi = GET_BINTERNAL(dbp, h, indx);
+ pg = bi->pgno;
+ }
+ LF_CLR(SR_NEXT | SR_BOTH);
+ LF_SET(SR_MAX);
+ stack = 1;
+ h = NULL;
+ goto lock_next;
+ }
+ goto done;
+ case P_IRECNO:
+ if (LF_ISSET(SR_MAX)) {
+ indx = NUM_ENT(h);
+ ri = GET_RINTERNAL(dbp, h, indx - 1);
+ } else for (indx = 0, top = NUM_ENT(h);;) {
+ ri = GET_RINTERNAL(dbp, h, indx);
+ if (++indx == top || total + ri->nrecs >= recno)
+ break;
+ total += ri->nrecs;
+ }
+ pg = ri->pgno;
+ break;
+ default:
+ ret = __db_pgfmt(env, h->pgno);
+ goto done;
+ }
+ --indx;
+
+ /* Return if this is the lowest page wanted. */
+ if (stop == LEVEL(h)) {
+ BT_STK_ENTER(env, cp, h, indx, lock, lock_mode, ret);
+ if (ret != 0)
+ goto err;
+ goto done;
+ }
+ if (stack) {
+ BT_STK_PUSH(env, cp, h, indx, lock, lock_mode, ret);
+ if (ret != 0)
+ goto err;
+ h = NULL;
+
+ lock_mode = DB_LOCK_WRITE;
+ get_mode = DB_MPOOL_DIRTY;
+ if ((ret =
+ __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0)
+ goto err;
+ } else if (LF_ISSET(SR_NEXT)) {
+ /*
+ * For RECNO, if we are doing a NEXT search, the
+ * search recno is the one we are looking for,
+ * but we want to keep the stack from the spanning
+ * node on down. We only know we have the spanning
+ * node when its child's index is 0, so save
+ * each node and discard the tree when we find out
+ * it's not needed.
+ */
+ if (indx != 0 && cp->sp->page != NULL) {
+ BT_STK_POP(cp);
+ if ((ret = __bam_stkrel(dbc, STK_NOLOCK)) != 0)
+ goto err;
+ }
+
+ BT_STK_PUSH(env, cp, h, indx, lock, lock_mode, ret);
+ h = NULL;
+ if (ret != 0)
+ goto err;
+lock_next: if ((ret =
+ __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0)
+ goto err;
+ } else {
+ /*
+ * Decide if we want to return a pointer to the next
+ * page in the stack. If we do, write lock it and
+ * never unlock it.
+ */
+ if ((LF_ISSET(SR_PARENT) &&
+ (u_int8_t)(stop + 1) >= (u_int8_t)(LEVEL(h) - 1)) ||
+ (LEVEL(h) - 1) == LEAFLEVEL)
+ stack = 1;
+
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority)) != 0)
+ goto err;
+ h = NULL;
+
+ lock_mode = stack &&
+ LF_ISSET(SR_WRITE) ? DB_LOCK_WRITE : DB_LOCK_READ;
+ if (lock_mode == DB_LOCK_WRITE)
+ get_mode = DB_MPOOL_DIRTY;
+ if ((ret = __db_lget(dbc,
+ LCK_COUPLE_ALWAYS, pg, lock_mode, 0, &lock)) != 0) {
+ /*
+ * If we fail, discard the lock we held. This
+ * is OK because this only happens when we are
+ * descending the tree holding read-locks.
+ */
+ (void)__LPUT(dbc, lock);
+ goto err;
+ }
+ }
+
+ if ((ret = __memp_fget(mpf, &pg,
+ dbc->thread_info, dbc->txn, get_mode, &h)) != 0)
+ goto err;
+ }
+ /* NOTREACHED */
+
+err: if (h != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ BT_STK_POP(cp);
+ (void)__bam_stkrel(dbc, 0);
+
+done:
+ if (F_ISSET(dbc, DBC_OPD))
+ LOCK_CHECK_ON(dbc->thread_info);
+ return (ret);
+}
+
+/*
+ * __bam_adjust --
+ * Adjust the tree after adding or deleting a record.
+ *
+ * PUBLIC: int __bam_adjust __P((DBC *, int32_t));
+ */
+int
+__bam_adjust(dbc, adjust)
+ DBC *dbc;
+ int32_t adjust;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ EPG *epg;
+ PAGE *h;
+ db_pgno_t root_pgno;
+ int ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ root_pgno = BAM_ROOT_PGNO(dbc);
+
+ /* Update the record counts for the tree. */
+ for (epg = cp->sp; epg <= cp->csp; ++epg) {
+ h = epg->page;
+ if (TYPE(h) == P_IBTREE || TYPE(h) == P_IRECNO) {
+ ret = __memp_dirty(mpf, &h,
+ dbc->thread_info, dbc->txn, dbc->priority, 0);
+ epg->page = h;
+ if (ret != 0)
+ return (ret);
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __bam_cadjust_log(dbp, dbc->txn,
+ &LSN(h), 0, PGNO(h), &LSN(h),
+ (u_int32_t)epg->indx, adjust,
+ PGNO(h) == root_pgno ?
+ CAD_UPDATEROOT : 0)) != 0)
+ return (ret);
+ } else
+ LSN_NOT_LOGGED(LSN(h));
+
+ if (TYPE(h) == P_IBTREE)
+ GET_BINTERNAL(dbp, h, epg->indx)->nrecs +=
+ adjust;
+ else
+ GET_RINTERNAL(dbp, h, epg->indx)->nrecs +=
+ adjust;
+
+ if (PGNO(h) == root_pgno)
+ RE_NREC_ADJ(h, adjust);
+ }
+ }
+ return (0);
+}
+
+/*
+ * __bam_nrecs --
+ * Return the number of records in the tree.
+ *
+ * PUBLIC: int __bam_nrecs __P((DBC *, db_recno_t *));
+ */
+int
+__bam_nrecs(dbc, rep)
+ DBC *dbc;
+ db_recno_t *rep;
+{
+ DB *dbp;
+ DB_LOCK lock;
+ DB_MPOOLFILE *mpf;
+ PAGE *h;
+ db_pgno_t pgno;
+ int ret, t_ret;
+
+ COMPQUIET(h, NULL);
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ LOCK_INIT(lock);
+
+ pgno = PGNO_INVALID;
+ BAM_GET_ROOT(dbc, pgno, h, 0, DB_LOCK_READ, lock, ret);
+ if (ret != 0)
+ goto err;
+ DB_ASSERT(dbp->env, h != NULL);
+
+ *rep = RE_NREC(h);
+
+ ret = __memp_fput(mpf, dbc->thread_info, h, dbc->priority);
+err: if ((t_ret = __TLPUT(dbc, lock)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __bam_total --
+ * Return the number of records below a page.
+ *
+ * PUBLIC: db_recno_t __bam_total __P((DB *, PAGE *));
+ */
+db_recno_t
+__bam_total(dbp, h)
+ DB *dbp;
+ PAGE *h;
+{
+ db_recno_t nrecs;
+ db_indx_t indx, top;
+
+ nrecs = 0;
+ top = NUM_ENT(h);
+
+ switch (TYPE(h)) {
+ case P_LBTREE:
+ /* Check for logically deleted records. */
+ for (indx = 0; indx < top; indx += P_INDX)
+ if (!B_DISSET(
+ GET_BKEYDATA(dbp, h, indx + O_INDX)->type))
+ ++nrecs;
+ break;
+ case P_LDUP:
+ /* Check for logically deleted records. */
+ for (indx = 0; indx < top; indx += O_INDX)
+ if (!B_DISSET(GET_BKEYDATA(dbp, h, indx)->type))
+ ++nrecs;
+ break;
+ case P_IBTREE:
+ for (indx = 0; indx < top; indx += O_INDX)
+ nrecs += GET_BINTERNAL(dbp, h, indx)->nrecs;
+ break;
+ case P_LRECNO:
+ nrecs = NUM_ENT(h);
+ break;
+ case P_IRECNO:
+ for (indx = 0; indx < top; indx += O_INDX)
+ nrecs += GET_RINTERNAL(dbp, h, indx)->nrecs;
+ break;
+ }
+
+ return (nrecs);
+}
diff --git a/src/btree/bt_search.c b/src/btree/bt_search.c
new file mode 100644
index 00000000..e809a852
--- /dev/null
+++ b/src/btree/bt_search.c
@@ -0,0 +1,1028 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+
+/*
+ * __bam_get_root --
+ * Fetch the root of a tree and see if we want to keep
+ * it in the stack.
+ *
+ * PUBLIC: int __bam_get_root __P((DBC *, db_pgno_t, int, u_int32_t, int *));
+ */
+int
+__bam_get_root(dbc, root_pgno, slevel, flags, stack)
+ DBC *dbc;
+ db_pgno_t root_pgno;
+ int slevel;
+ u_int32_t flags;
+ int *stack;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DB_LOCK lock;
+ DB_MPOOLFILE *mpf;
+ PAGE *h;
+ db_lockmode_t lock_mode;
+ u_int32_t get_mode;
+ int ret, t_ret;
+
+ COMPQUIET(h, NULL);
+ LOCK_INIT(lock);
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ /*
+ * If write-locking pages, we need to know whether or not to acquire a
+ * write lock on a page before getting it. This depends on how deep it
+ * is in the tree, which we don't know until we acquire the root page. So,
+ * if we need to lock the root page we may have to upgrade it later,
+ * because we won't get the correct lock initially.
+ *
+ * Retrieve the root page.
+ */
+try_again:
+ *stack = LF_ISSET(SR_STACK) &&
+ (dbc->dbtype == DB_RECNO || F_ISSET(cp, C_RECNUM));
+ lock_mode = DB_LOCK_READ;
+ if (*stack ||
+ LF_ISSET(SR_DEL) || (LF_ISSET(SR_NEXT) && LF_ISSET(SR_WRITE)))
+ lock_mode = DB_LOCK_WRITE;
+
+ /*
+ * Get the root. If the root happens to be a leaf page then
+ * we are supposed to get a read lock on it before latching
+ * it. So if we have not locked it, do a try-get first.
+ * If we can't get the root shared, then get a lock on it and
+ * then wait for the latch.
+ */
+retry: if (lock_mode == DB_LOCK_WRITE)
+ get_mode = DB_MPOOL_DIRTY;
+ else if (LOCK_ISSET(lock) || !STD_LOCKING(dbc) ||
+ F_ISSET(dbc, DBC_DOWNREV) ||
+ dbc->dbtype == DB_RECNO || F_ISSET(cp, C_RECNUM))
+ get_mode = 0;
+ else
+ get_mode = DB_MPOOL_TRY;
+
+ BAM_GET_ROOT(dbc, root_pgno, h, get_mode, lock_mode, lock, ret);
+ if (ret == DB_LOCK_NOTGRANTED && get_mode == DB_MPOOL_TRY) {
+ DB_ASSERT(dbp->env, !LOCK_ISSET(lock));
+ if ((ret = __db_lget(dbc, 0,
+ root_pgno == PGNO_INVALID ? BAM_ROOT_PGNO(dbc) : root_pgno,
+ lock_mode, 0, &lock)) != 0)
+ return (ret);
+ goto retry;
+ }
+ if (ret != 0) {
+ /* Did not read it, so we can release the lock */
+ (void)__LPUT(dbc, lock);
+ return (ret);
+ }
+ DB_ASSERT(dbp->env, TYPE(h) == P_IBTREE || TYPE(h) == P_IRECNO ||
+ TYPE(h) == P_LBTREE || TYPE(h) == P_LRECNO || TYPE(h) == P_LDUP);
+
+ /*
+ * Decide if we need to dirty and/or lock this page.
+ * We must not hold the latch while we get the lock.
+ */
+ if (!*stack &&
+ ((LF_ISSET(SR_PARENT) && (u_int8_t)(slevel + 1) >= LEVEL(h)) ||
+ LEVEL(h) == LEAFLEVEL ||
+ (LF_ISSET(SR_START) && slevel == LEVEL(h)))) {
+ *stack = 1;
+ /* If we already have the write lock, we are done. */
+ if (dbc->dbtype == DB_RECNO || F_ISSET(cp, C_RECNUM)) {
+ if (lock_mode == DB_LOCK_WRITE)
+ goto done;
+ if ((ret = __LPUT(dbc, lock)) != 0)
+ return (ret);
+ }
+
+ /*
+ * Now that we know what level the root is at, do we need a
+ * write lock? If not, or if we got the lock before latching,
+ * we are done.
+ */
+ if (LEVEL(h) != LEAFLEVEL || LF_ISSET(SR_WRITE)) {
+ lock_mode = DB_LOCK_WRITE;
+ /* Drop the read lock if we got it above. */
+ if ((ret = __LPUT(dbc, lock)) != 0)
+ return (ret);
+ } else if (LOCK_ISSET(lock))
+ goto done;
+ if (!STD_LOCKING(dbc)) {
+ if (lock_mode != DB_LOCK_WRITE)
+ goto done;
+ if ((ret = __memp_dirty(mpf, &h, dbc->thread_info,
+ dbc->txn, dbc->priority, 0)) != 0) {
+ if (h != NULL)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority);
+ return (ret);
+ }
+ } else {
+ /* Try to lock the page without waiting first. */
+ if ((ret = __db_lget(dbc, 0, root_pgno,
+ lock_mode, DB_LOCK_NOWAIT, &lock)) == 0) {
+ if (lock_mode == DB_LOCK_WRITE && (ret =
+ __memp_dirty(mpf, &h, dbc->thread_info,
+ dbc->txn, dbc->priority, 0)) != 0) {
+ if (h != NULL)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, h,
+ dbc->priority);
+ return (ret);
+ }
+ goto done;
+ }
+
+ t_ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority);
+ h = NULL;
+
+ if (ret == DB_LOCK_DEADLOCK ||
+ ret == DB_LOCK_NOTGRANTED)
+ ret = 0;
+ if (ret == 0)
+ ret = t_ret;
+
+ if (ret != 0)
+ return (ret);
+ get_mode = 0;
+ if (lock_mode == DB_LOCK_WRITE)
+ get_mode = DB_MPOOL_DIRTY;
+
+ if ((ret = __db_lget(dbc,
+ 0, root_pgno, lock_mode, 0, &lock)) != 0)
+ return (ret);
+ if ((ret = __memp_fget(mpf,
+ &root_pgno, dbc->thread_info, dbc->txn,
+ (atomic_read(&mpf->mfp->multiversion) == 0 &&
+ lock_mode == DB_LOCK_WRITE) ? DB_MPOOL_DIRTY : 0,
+ &h)) != 0) {
+ /* Did not read it, release the lock */
+ (void)__LPUT(dbc, lock);
+ return (ret);
+ }
+ }
+ /*
+ * While dirtying or locking the page we had to drop the latch,
+ * so someone else may have gotten in and split the root.
+ */
+ if (!((LF_ISSET(SR_PARENT) &&
+ (u_int8_t)(slevel + 1) >= LEVEL(h)) ||
+ LEVEL(h) == LEAFLEVEL ||
+ (LF_ISSET(SR_START) && slevel == LEVEL(h)))) {
+ /* Someone else split the root, start over. */
+ ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority);
+ h = NULL;
+ if ((t_ret = __LPUT(dbc, lock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ return (ret);
+ goto try_again;
+ } else if (atomic_read(&mpf->mfp->multiversion) != 0 &&
+ lock_mode == DB_LOCK_WRITE && (ret = __memp_dirty(mpf, &h,
+ dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0) {
+ (void)__memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority);
+ (void)__LPUT(dbc, lock);
+ }
+ }
+
+done: BT_STK_ENTER(dbp->env, cp, h, 0, lock, lock_mode, ret);
+
+ return (ret);
+}
+
+/*
+ * __bam_search --
+ * Search a btree for a key.
+ *
+ * PUBLIC: int __bam_search __P((DBC *, db_pgno_t,
+ * PUBLIC: const DBT *, u_int32_t, int, db_recno_t *, int *));
+ */
+int
+__bam_search(dbc, root_pgno, key, flags, slevel, recnop, exactp)
+ DBC *dbc;
+ db_pgno_t root_pgno;
+ const DBT *key;
+ u_int32_t flags;
+ int slevel, *exactp;
+ db_recno_t *recnop;
+{
+ BTREE *t;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DB_LOCK lock, saved_lock;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ PAGE *h, *parent_h;
+ db_indx_t base, i, indx, *inp, lim;
+ db_lockmode_t lock_mode;
+ db_pgno_t pg, saved_pg, start_pgno;
+ db_recno_t recno;
+ int adjust, cmp, deloffset, ret, set_stack, stack, t_ret;
+ int getlock, was_next;
+ int (*func) __P((DB *, const DBT *, const DBT *));
+ u_int32_t get_mode, wait;
+ u_int8_t level, saved_level;
+
+ if (F_ISSET(dbc, DBC_OPD))
+ LOCK_CHECK_OFF(dbc->thread_info);
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ mpf = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ h = NULL;
+ parent_h = NULL;
+ t = dbp->bt_internal;
+ recno = 0;
+ t_ret = 0;
+
+ BT_STK_CLR(cp);
+ LOCK_INIT(saved_lock);
+ LOCK_INIT(lock);
+ was_next = LF_ISSET(SR_NEXT);
+ wait = DB_LOCK_NOWAIT;
+
+ /*
+ * There are several ways we search a btree. The flags argument
+ * specifies if we're acquiring read or write latches, if we position
+ * to the first or last item in a set of duplicates, if we return
+ * deleted items, and if we are latching pairs of pages. In addition,
+ * if we're modifying record numbers, we have to latch the entire tree
+ * regardless. See btree.h for more details.
+ */
+
+ start_pgno = saved_pg = root_pgno;
+ saved_level = MAXBTREELEVEL;
+retry: if ((ret = __bam_get_root(dbc, start_pgno, slevel, flags, &stack)) != 0)
+ goto err;
+ lock_mode = cp->csp->lock_mode;
+ get_mode = lock_mode == DB_LOCK_WRITE ? DB_MPOOL_DIRTY : 0;
+ h = cp->csp->page;
+ root_pgno = pg = PGNO(h);
+ lock = cp->csp->lock;
+ set_stack = stack;
+ /*
+ * Determine if we need to lock interior nodes.
+ * If we have record numbers we always lock. Otherwise we only
+ * need to do this if we are write locking and we are returning
+ * a stack of nodes. SR_NEXT will eventually get a stack and
+ * release the locks above that level.
+ */
+ if (F_ISSET(dbc, DBC_DOWNREV)) {
+ getlock = 1;
+ wait = 0;
+ } else
+ getlock = F_ISSET(cp, C_RECNUM) ||
+ (lock_mode == DB_LOCK_WRITE &&
+ (stack || LF_ISSET(SR_NEXT | SR_DEL)));
+
+ /*
+ * If we are asked for a level that is above the root,
+ * just return the root. This can happen if the tree
+ * collapses while we are trying to lock the root.
+ */
+ if (!LF_ISSET(SR_START) && LEVEL(h) < slevel)
+ goto done;
+
+ BT_STK_CLR(cp);
+
+ /* Choose a comparison function. */
+ func = F_ISSET(dbc, DBC_OPD) ?
+ (dbp->dup_compare == NULL ? __bam_defcmp : dbp->dup_compare) :
+ t->bt_compare;
+
+ for (;;) {
+ if (TYPE(h) == P_LBTREE)
+ adjust = P_INDX;
+ else {
+ /*
+ * It is possible to catch an internal page as a change
+ * is being backed out. Its leaf pages will be locked
+ * but we must be sure we get to one. If the page
+ * is not populated enough lock it.
+ */
+ if (TYPE(h) != P_LDUP && NUM_ENT(h) == 0) {
+ getlock = 1;
+ level = LEVEL(h) + 1;
+ if ((ret = __memp_fput(mpf, dbc->thread_info,
+ h, dbc->priority)) != 0)
+ goto err;
+ goto lock_next;
+ }
+ adjust = O_INDX;
+ }
+ inp = P_INP(dbp, h);
+ if (LF_ISSET(SR_MIN | SR_MAX)) {
+ if (LF_ISSET(SR_MIN) || NUM_ENT(h) == 0)
+ indx = 0;
+ else if (TYPE(h) == P_LBTREE)
+ indx = NUM_ENT(h) - 2;
+ else
+ indx = NUM_ENT(h) - 1;
+
+ if (LEVEL(h) == LEAFLEVEL ||
+ (!LF_ISSET(SR_START) && LEVEL(h) == slevel)) {
+ if (LF_ISSET(SR_NEXT))
+ goto get_next;
+ goto found;
+ }
+ goto next;
+ }
+ /*
+ * Do a binary search on the current page. If we're searching
+ * a Btree leaf page, we have to walk the indices in groups of
+ * two. If we're searching an internal page or a off-page dup
+ * page, they're an index per page item. If we find an exact
+ * match on a leaf page, we're done.
+ */
+ DB_BINARY_SEARCH_FOR(base, lim, NUM_ENT(h), adjust) {
+ DB_BINARY_SEARCH_INCR(indx, base, lim, adjust);
+ if ((ret = __bam_cmp(dbc, key, h, indx,
+ func, &cmp)) != 0)
+ goto err;
+ if (cmp == 0) {
+ if (LEVEL(h) == LEAFLEVEL ||
+ (!LF_ISSET(SR_START) &&
+ LEVEL(h) == slevel)) {
+ if (LF_ISSET(SR_NEXT))
+ goto get_next;
+ goto found;
+ }
+ goto next;
+ }
+ if (cmp > 0)
+ DB_BINARY_SEARCH_SHIFT_BASE(indx, base,
+ lim, adjust);
+ }
+
+ /*
+ * No match found. Base is the smallest index greater than
+ * key and may be zero or a last + O_INDX index.
+ *
+ * If it's a leaf page or the stopping point,
+ * return base as the "found" value.
+ * Delete only deletes exact matches.
+ */
+ if (LEVEL(h) == LEAFLEVEL ||
+ (!LF_ISSET(SR_START) && LEVEL(h) == slevel)) {
+ *exactp = 0;
+
+ if (LF_ISSET(SR_EXACT)) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+
+ if (LF_ISSET(SR_STK_ONLY)) {
+ BT_STK_NUM(env, cp, h, base, ret);
+ if ((t_ret =
+ __LPUT(dbc, lock)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __memp_fput(mpf, dbc->thread_info,
+ h, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ h = NULL;
+ if (ret != 0)
+ goto err;
+ goto done;
+ }
+ if (LF_ISSET(SR_NEXT)) {
+get_next: /*
+ * The caller could have asked for a NEXT
+ * at the root if the tree recently collapsed.
+ */
+ if (PGNO(h) == root_pgno) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+
+ indx = cp->sp->indx + 1;
+ if (indx == NUM_ENT(cp->sp->page)) {
+ ret = DB_NOTFOUND;
+ cp->csp++;
+ goto err;
+ }
+ /*
+ * If we want both the key page and the next
+ * page, push the key page on the stack;
+ * otherwise, save the root of the subtree
+ * and drop the rest of the subtree.
+ * Search down again starting at the
+ * next child of the root of this subtree.
+ */
+ LF_SET(SR_MIN);
+ LF_CLR(SR_NEXT);
+ set_stack = stack = 1;
+ if (LF_ISSET(SR_BOTH)) {
+ cp->csp++;
+ BT_STK_PUSH(env,
+ cp, h, indx, lock, lock_mode, ret);
+ if (ret != 0)
+ goto err;
+ LOCK_INIT(lock);
+ h = cp->sp->page;
+ pg = GET_BINTERNAL(dbp, h, indx)->pgno;
+ level = LEVEL(h);
+ h = NULL;
+ goto lock_next;
+ } else {
+ if ((ret = __LPUT(dbc, lock)) != 0)
+ goto err;
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info,
+ h, dbc->priority)) != 0)
+ goto err;
+ h = cp->sp->page;
+ cp->sp->page = NULL;
+ lock = cp->sp->lock;
+ LOCK_INIT(cp->sp->lock);
+ if ((ret = __bam_stkrel(dbc,
+ STK_NOLOCK)) != 0)
+ goto err;
+ goto next;
+ }
+ }
+
+ /*
+ * !!!
+ * Possibly returning a deleted record -- DB_SET_RANGE,
+ * DB_KEYFIRST and DB_KEYLAST don't require an exact
+ * match, and we don't want to walk multiple pages here
+ * to find an undeleted record. This is handled by the
+ * calling routine.
+ */
+ if (LF_ISSET(SR_DEL) && cp->csp == cp->sp)
+ cp->csp++;
+ BT_STK_ENTER(env, cp, h, base, lock, lock_mode, ret);
+ if (ret != 0)
+ goto err;
+ goto done;
+ }
+
+ /*
+ * If it's not a leaf page, record the internal page (which is
+ * a parent page for the key). Decrement the base by 1 if it's
+ * non-zero so that if a split later occurs, the inserted page
+ * will be to the right of the saved page.
+ */
+ indx = base > 0 ? base - O_INDX : base;
+
+ /*
+ * If we're trying to calculate the record number, sum up
+ * all the record numbers on this page up to the indx point.
+ */
+next: if (recnop != NULL)
+ for (i = 0; i < indx; ++i)
+ recno += GET_BINTERNAL(dbp, h, i)->nrecs;
+
+ pg = GET_BINTERNAL(dbp, h, indx)->pgno;
+ level = LEVEL(h);
+
+ /* See if we are at the level to start stacking. */
+ if (LF_ISSET(SR_START) && slevel == level)
+ set_stack = stack = 1;
+
+ if (LF_ISSET(SR_STK_ONLY)) {
+ if (slevel == LEVEL(h)) {
+ BT_STK_NUM(env, cp, h, indx, ret);
+ if ((t_ret = __memp_fput(mpf, dbc->thread_info,
+ h, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ h = NULL;
+ if (ret != 0)
+ goto err;
+ goto done;
+ }
+ BT_STK_NUMPUSH(env, cp, h, indx, ret);
+ (void)__memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority);
+ h = NULL;
+ } else if (stack) {
+ /* Return if this is the lowest page wanted. */
+ if (LF_ISSET(SR_PARENT) && slevel == level) {
+ BT_STK_ENTER(env,
+ cp, h, indx, lock, lock_mode, ret);
+ if (ret != 0)
+ goto err;
+ goto done;
+ }
+ if (LF_ISSET(SR_DEL) && NUM_ENT(h) > 1) {
+ /*
+ * There was a page with a singleton pointer
+ * to a non-empty subtree.
+ */
+ cp->csp--;
+ if ((ret = __bam_stkrel(dbc, STK_NOLOCK)) != 0)
+ goto err;
+ set_stack = stack = 0;
+ goto do_del;
+ }
+ BT_STK_PUSH(env,
+ cp, h, indx, lock, lock_mode, ret);
+ if (ret != 0)
+ goto err;
+
+ LOCK_INIT(lock);
+ get_mode = DB_MPOOL_DIRTY;
+ lock_mode = DB_LOCK_WRITE;
+ getlock = 1;
+ goto lock_next;
+ } else {
+ /*
+ * Decide if we want to return a reference to the next
+ * page in the return stack. If so, latch it and don't
+ * unlatch it. We will want to stack things on the
+ * next iteration. The stack variable cannot be
+ * set until we leave this clause. If we are locking
+ * then we must lock this level before getting the page.
+ */
+ if ((LF_ISSET(SR_PARENT) &&
+ (u_int8_t)(slevel + 1) >= (level - 1)) ||
+ (level - 1) == LEAFLEVEL)
+ set_stack = 1;
+
+ /*
+ * Check for a normal search. If so, we need to
+ * latch-couple the parent/child buffers.
+ */
+ if (!LF_ISSET(SR_DEL | SR_NEXT)) {
+ parent_h = h;
+ goto lock_next;
+ }
+
+ /*
+ * Returning a subtree. See if we have hit the start
+ * point; if so, save the parent and set stack.
+ * Otherwise, free the parent and temporarily
+ * save this one.
+ * For SR_DEL we need to find a page with one entry.
+ * For SR_NEXT we want to find the minimal subtree
+ * that contains the key and the next page.
+ * We save pages as long as we are at the right
+ * edge of the subtree. When we leave the right
+ * edge, then drop the subtree.
+ */
+
+ if (LF_ISSET(SR_DEL) && NUM_ENT(h) == 1) {
+ /*
+ * We are pushing things onto the stack;
+ * set the stack variable now to indicate this
+ * has happened.
+ */
+ stack = set_stack = 1;
+ LF_SET(SR_WRITE);
+ /* Push the parent. */
+ cp->csp++;
+ /* Push this node. */
+ BT_STK_PUSH(env, cp, h,
+ indx, lock, DB_LOCK_NG, ret);
+ if (ret != 0)
+ goto err;
+ LOCK_INIT(lock);
+ } else {
+ /*
+ * See if we want to save the tree so far.
+ * If we are looking for the next key,
+ * then we must save this node if we are
+ * at the end of the page. If not, then
+ * discard anything we have saved so far.
+ * For delete, only keep one node until
+ * we find a singleton.
+ */
+do_del: if (cp->csp->page != NULL) {
+ if (LF_ISSET(SR_NEXT) &&
+ indx == NUM_ENT(h) - 1)
+ cp->csp++;
+ else if ((ret =
+ __bam_stkrel(dbc, STK_NOLOCK)) != 0)
+ goto err;
+ }
+ /* Save this node. */
+ BT_STK_ENTER(env, cp,
+ h, indx, lock, lock_mode, ret);
+ if (ret != 0)
+ goto err;
+ LOCK_INIT(lock);
+ }
+
+lock_next: h = NULL;
+
+ if (set_stack && LF_ISSET(SR_WRITE)) {
+ lock_mode = DB_LOCK_WRITE;
+ get_mode = DB_MPOOL_DIRTY;
+ getlock = 1;
+ }
+ /*
+ * If we are retrying and we are back at the same
+ * page then we already have it locked. If we are
+ * at a different page we want to lock couple and
+ * release that lock.
+ */
+ if (level - 1 == saved_level) {
+ if ((ret = __LPUT(dbc, lock)) != 0)
+ goto err;
+ lock = saved_lock;
+ LOCK_INIT(saved_lock);
+ saved_level = MAXBTREELEVEL;
+ if (pg == saved_pg)
+ goto skip_lock;
+ }
+ if ((getlock || level - 1 == LEAFLEVEL) &&
+ (ret = __db_lget(dbc, LCK_COUPLE_ALWAYS,
+ pg, lock_mode, wait, &lock)) != 0) {
+ /*
+ * If we are doing DEL or NEXT then we
+ * have an extra level saved in the stack,
+ * push it so it will get freed.
+ */
+ if (LF_ISSET(SR_DEL | SR_NEXT) && !stack)
+ cp->csp++;
+ PERFMON6(env, race, bam_search, dbp->fname,
+ dbp->dname, ret, h, parent_h, flags);
+ /*
+ * If we fail, discard the lock we held.
+ * This is ok because we will either search
+ * again or exit without actually looking
+ * at the data.
+ */
+ if ((t_ret = __LPUT(dbc, lock)) != 0)
+ ret = t_ret;
+ /*
+ * If we blocked at a different level release
+ * the previous saved lock.
+ */
+ if ((t_ret = __LPUT(dbc, saved_lock)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ if (wait == 0 || (ret != DB_LOCK_NOTGRANTED &&
+ ret != DB_LOCK_DEADLOCK))
+ goto err;
+
+ /* Release the parent if we are holding it. */
+ if (parent_h != NULL &&
+ (ret = __memp_fput(mpf, dbc->thread_info,
+ parent_h, dbc->priority)) != 0)
+ goto err;
+ parent_h = NULL;
+
+ BT_STK_POP(cp);
+ if ((ret = __bam_stkrel(dbc, STK_NOLOCK)) != 0)
+ goto err;
+ if ((ret = __db_lget(dbc,
+ 0, pg, lock_mode, 0, &saved_lock)) != 0)
+ goto err;
+ /*
+ * A very strange case: if this page was
+ * freed while we waited, then we cannot hold
+ * the lock on it while we re-get the root
+ * latch, because allocation is one place
+ * where we lock while holding a latch.
+ * We want to hold the lock but must ensure
+ * that the page is not free or cannot become
+ * free. If we are at the LEAF level we can
+ * hold on to the lock if the page is still
+ * of the right type. Otherwise we need to
+ * be sure this page cannot move to an off-page
+ * duplicate tree (which is not locked) and
+ * masquerade as the page we want.
+ */
+
+ /*
+ * If the page is not at leaf level
+ * then see if OPD trees are around.
+ * If the page could appear as an
+ * interior offpage duplicate node
+ * at the right level, then it will
+ * not be locked and could subsequently be
+ * freed. If there are multiple
+ * databases in the file then they
+ * could have OPDs.
+ */
+ if (level - 1 > LEAFLEVEL &&
+ (F_ISSET(dbp, DB_AM_SUBDB) ||
+ (dbp->type == DB_BTREE &&
+ F_ISSET(dbp, DB_AM_DUPSORT))))
+ goto drop_lock;
+
+ /*
+ * Take a look at the page. If it got
+ * freed it may be entirely gone.
+ */
+ if ((ret = __memp_fget(mpf, &pg,
+ dbc->thread_info, dbc->txn, 0, &h)) != 0 &&
+ ret != DB_PAGE_NOTFOUND)
+ goto err;
+
+ /*
+ * Check for right level and page type.
+ */
+ if (ret != 0 || LEVEL(h) != level - 1 ||
+ (LEVEL(h) == LEAFLEVEL ?
+ TYPE(h) != (dbc->dbtype == DB_BTREE ?
+ P_LBTREE : P_LRECNO) :
+ TYPE(h) != (dbc->dbtype == DB_BTREE ?
+ P_IBTREE : P_IRECNO))) {
+drop_lock: ret = __LPUT(dbc, saved_lock);
+ if (ret != 0)
+ goto err;
+ pg = root_pgno;
+ saved_level = MAXBTREELEVEL;
+ }
+ if (h != NULL && (ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority)) != 0)
+ goto err;
+ h = NULL;
+
+ if (was_next) {
+ LF_CLR(SR_MIN);
+ LF_SET(SR_NEXT);
+ }
+ /*
+ * We have the lock but we dropped the
+ * latch so we need to search again. If
+ * we get back to the same page then all
+ * is good, otherwise we need to try to
+ * lock the new page.
+ */
+ saved_pg = pg;
+ saved_level = level - 1;
+ goto retry;
+ }
+skip_lock: stack = set_stack;
+ }
+ /* Get the child page. */
+ if ((ret = __memp_fget(mpf, &pg,
+ dbc->thread_info, dbc->txn, get_mode, &h)) != 0)
+ goto err;
+ /* Release the parent. */
+ if (parent_h != NULL && (ret = __memp_fput(mpf,
+ dbc->thread_info, parent_h, dbc->priority)) != 0)
+ goto err;
+ parent_h = NULL;
+ }
+ /* NOTREACHED */
+
+found: *exactp = 1;
+
+ /*
+ * If we got here, we know that we have a Btree leaf or off-page
+ * duplicates page. If it's a Btree leaf page, we have to handle
+ * on-page duplicates.
+ *
+ * If there are duplicates, go to the first/last one. This is
+ * safe because we know that we're not going to leave the page,
+ * all duplicate sets that are not on overflow pages exist on a
+ * single leaf page.
+ */
+ if (TYPE(h) == P_LBTREE && NUM_ENT(h) > P_INDX) {
+ if (LF_ISSET(SR_DUPLAST))
+ while (indx < (db_indx_t)(NUM_ENT(h) - P_INDX) &&
+ inp[indx] == inp[indx + P_INDX])
+ indx += P_INDX;
+ else if (LF_ISSET(SR_DUPFIRST))
+ while (indx > 0 &&
+ inp[indx] == inp[indx - P_INDX])
+ indx -= P_INDX;
+ }
+
+ /*
+ * Now check if we are allowed to return deleted items; if not, then
+ * find the next (or previous) non-deleted duplicate entry. (We do
+ * not move from the original found key on the basis of the SR_DELNO
+ * flag.)
+ */
+ DB_ASSERT(env, recnop == NULL || LF_ISSET(SR_DELNO));
+ if (LF_ISSET(SR_DELNO)) {
+ deloffset = TYPE(h) == P_LBTREE ? O_INDX : 0;
+ if (LF_ISSET(SR_DUPLAST))
+ while (B_DISSET(GET_BKEYDATA(dbp,
+ h, indx + deloffset)->type) && indx > 0 &&
+ inp[indx] == inp[indx - adjust])
+ indx -= adjust;
+ else
+ while (B_DISSET(GET_BKEYDATA(dbp,
+ h, indx + deloffset)->type) &&
+ indx < (db_indx_t)(NUM_ENT(h) - adjust) &&
+ inp[indx] == inp[indx + adjust])
+ indx += adjust;
+
+ /*
+ * If we weren't able to find a non-deleted duplicate, return
+ * DB_NOTFOUND.
+ */
+ if (B_DISSET(GET_BKEYDATA(dbp, h, indx + deloffset)->type)) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+
+ /*
+ * Increment the record counter to point to the found element.
+ * Ignore any deleted key/data pairs. There doesn't need to
+ * be any correction for duplicates, as Btree doesn't support
+ * duplicates and record numbers in the same tree.
+ */
+ if (recnop != NULL) {
+ DB_ASSERT(env, TYPE(h) == P_LBTREE);
+
+ for (i = 0; i < indx; i += P_INDX)
+ if (!B_DISSET(
+ GET_BKEYDATA(dbp, h, i + O_INDX)->type))
+ ++recno;
+
+ /* Correct the number for a 0-base. */
+ *recnop = recno + 1;
+ }
+ }
+
+ if (LF_ISSET(SR_STK_ONLY)) {
+ BT_STK_NUM(env, cp, h, indx, ret);
+ if ((t_ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ h = NULL;
+ } else {
+ if (LF_ISSET(SR_DEL) && cp->csp == cp->sp)
+ cp->csp++;
+ BT_STK_ENTER(env, cp, h, indx, lock, lock_mode, ret);
+ }
+ if (ret != 0)
+ goto err;
+
+ cp->csp->lock = lock;
+ DB_ASSERT(env, parent_h == NULL);
+
+done:
+ if (F_ISSET(dbc, DBC_OPD))
+ LOCK_CHECK_ON(dbc->thread_info);
+
+ if ((ret = __LPUT(dbc, saved_lock)) != 0)
+ return (ret);
+
+ return (0);
+
+err: if (ret == 0)
+ ret = t_ret;
+ if (h != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (parent_h != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, parent_h, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Keep any not-found page locked for serializability. */
+ if ((t_ret = __TLPUT(dbc, lock)) != 0 && ret == 0)
+ ret = t_ret;
+
+ (void)__LPUT(dbc, saved_lock);
+
+ BT_STK_POP(cp);
+ (void)__bam_stkrel(dbc, 0);
+
+ if (F_ISSET(dbc, DBC_OPD))
+ LOCK_CHECK_ON(dbc->thread_info);
+
+ return (ret);
+}
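+
+#ifdef BAM_SEARCH_EXAMPLE
+/*
+ * __bam_search_example --
+ * A minimal usage sketch, not part of the original source (hence the
+ * guard): latch a leaf page and its parent as a write-locked pair, the
+ * way __bam_split re-acquires pages. The function name, guard macro and
+ * error handling are illustrative assumptions.
+ */
+static int
+__bam_search_example(dbc, key)
+ DBC *dbc;
+ const DBT *key;
+{
+ int exact, ret;
+
+ /* Descend to the leaf, returning leaf and parent write locked. */
+ if ((ret = __bam_search(dbc, PGNO_INVALID,
+ key, SR_WRPAIR, LEAFLEVEL, NULL, &exact)) != 0)
+ return (ret);
+
+ /* The cursor's csp[0] is now the leaf and csp[-1] its parent. */
+ return (__bam_stkrel(dbc, 0));
+}
+#endif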
+
+/*
+ * __bam_stkrel --
+ * Release all pages currently held in the stack.
+ *
+ * PUBLIC: int __bam_stkrel __P((DBC *, u_int32_t));
+ */
+int
+__bam_stkrel(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ EPG *epg;
+ int ret, t_ret;
+
+ DB_ASSERT(NULL, dbc != NULL);
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ /*
+ * Release inner pages first.
+ *
+ * The caller must be sure that setting STK_NOLOCK will not affect
+ * either serializability or recoverability.
+ */
+ for (ret = 0, epg = cp->sp; epg <= cp->csp; ++epg) {
+ if (epg->page != NULL) {
+ if (LF_ISSET(STK_CLRDBC) && cp->page == epg->page) {
+ cp->page = NULL;
+ LOCK_INIT(cp->lock);
+ }
+ if ((t_ret = __memp_fput(mpf, dbc->thread_info,
+ epg->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ epg->page = NULL;
+ }
+ /*
+ * We set this if we need to release our pins,
+ * but are not logically ready to have the pages
+ * visible.
+ */
+ if (LF_ISSET(STK_PGONLY))
+ continue;
+ if (LF_ISSET(STK_NOLOCK) &&
+ (epg->lock.mode == DB_LOCK_READ ||
+ atomic_read(&mpf->mfp->multiversion) == 0)) {
+ if ((t_ret = __LPUT(dbc, epg->lock)) != 0 && ret == 0)
+ ret = t_ret;
+ } else
+ if ((t_ret = __TLPUT(dbc, epg->lock)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+
+ /* Clear the stack, all pages have been released. */
+ if (!LF_ISSET(STK_PGONLY))
+ BT_STK_CLR(cp);
+
+ return (ret);
+}
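+
+/*
+ * The flags above select how much state is released: STK_PGONLY drops
+ * only the buffer pins, STK_CLRDBC additionally clears the cursor's own
+ * reference to a page being released, and STK_NOLOCK discards read locks
+ * (and, on non-multiversion files, write locks) outright instead of
+ * handing them to __TLPUT.
+ */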
+
+/*
+ * __bam_stkgrow --
+ * Grow the stack.
+ *
+ * PUBLIC: int __bam_stkgrow __P((ENV *, BTREE_CURSOR *));
+ */
+int
+__bam_stkgrow(env, cp)
+ ENV *env;
+ BTREE_CURSOR *cp;
+{
+ EPG *p;
+ size_t entries;
+ int ret;
+
+ entries = cp->esp - cp->sp;
+
+ if ((ret = __os_calloc(env, entries * 2, sizeof(EPG), &p)) != 0)
+ return (ret);
+ memcpy(p, cp->sp, entries * sizeof(EPG));
+ if (cp->sp != cp->stack)
+ __os_free(env, cp->sp);
+ cp->sp = p;
+ cp->csp = p + entries;
+ cp->esp = p + entries * 2;
+ return (0);
+}
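+
+/*
+ * __bam_stkgrow assumes cp->csp has reached cp->esp: it doubles the
+ * array, copies the old entries, and leaves cp->csp pointing at the
+ * first free slot of the new, twice-as-large stack.
+ */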
diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c
new file mode 100644
index 00000000..8299c69a
--- /dev/null
+++ b/src/btree/bt_split.c
@@ -0,0 +1,1332 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/btree.h"
+
+static int __bam_page __P((DBC *, EPG *, EPG *));
+static int __bam_psplit __P((DBC *, EPG *, PAGE *, PAGE *, db_indx_t *));
+static int __bam_root __P((DBC *, EPG *));
+
+/*
+ * __bam_split --
+ * Split a page.
+ *
+ * PUBLIC: int __bam_split __P((DBC *, void *, db_pgno_t *));
+ */
+int
+__bam_split(dbc, arg, root_pgnop)
+ DBC *dbc;
+ void *arg;
+ db_pgno_t *root_pgnop;
+{
+ BTREE_CURSOR *cp;
+ DB_LOCK metalock, next_lock;
+ enum { UP, DOWN } dir;
+ db_pgno_t pgno, next_pgno, root_pgno;
+ int exact, level, ret;
+
+ if (F_ISSET(dbc, DBC_OPD))
+ LOCK_CHECK_OFF(dbc->thread_info);
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+ LOCK_INIT(next_lock);
+ next_pgno = PGNO_INVALID;
+
+ /*
+ * First get a lock on the metadata page, we will have to allocate
+ * pages and cannot get a lock while we have the search tree pinned.
+ */
+
+ pgno = PGNO_BASE_MD;
+ if ((ret = __db_lget(dbc,
+ 0, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
+ goto err;
+ root_pgno = BAM_ROOT_PGNO(dbc);
+
+ /*
+ * The locking protocol we use to avoid deadlock is to acquire locks by
+ * walking down the tree, but we do it as lazily as possible, locking
+ * the root only as a last resort. We expect all stack pages to have
+ * been discarded before we're called; we discard all short-term locks.
+ *
+ * When __bam_split is first called, we know that a leaf page was too
+ * full for an insert. We don't know what leaf page it was, but we
+ * have the key/recno that caused the problem. We call XX_search to
+ * reacquire the leaf page, but this time get both the leaf page and
+ * its parent, locked. We then split the leaf page and see if the new
+ * internal key will fit into the parent page. If it will, we're done.
+ *
+ * If it won't, we discard our current locks and repeat the process,
+ * only this time acquiring the parent page and its parent, locked.
+ * This process repeats until we succeed in the split, splitting the
+ * root page as the final resort. The entire process then repeats,
+ * as necessary, until we split a leaf page.
+ *
+ * XXX
+ * A traditional method of speeding this up is to maintain a stack of
+ * the pages traversed in the original search. You can detect if the
+ * stack is correct by storing the page's LSN when it was searched and
+ * comparing that LSN with the current one when it's locked during the
+ * split. This would be an easy change for this code, but I have no
+ * numbers that indicate it's worthwhile.
+ */
+ for (dir = UP, level = LEAFLEVEL;; dir == UP ? ++level : --level) {
+ /*
+ * Acquire a page and its parent, locked.
+ */
+retry: if ((ret = (dbc->dbtype == DB_BTREE ?
+ __bam_search(dbc, PGNO_INVALID,
+ arg, SR_WRPAIR, level, NULL, &exact) :
+ __bam_rsearch(dbc,
+ (db_recno_t *)arg, SR_WRPAIR, level, &exact))) != 0)
+ break;
+
+ if (cp->csp[0].page->pgno == root_pgno) {
+ /* We can overshoot the top of the tree. */
+ level = cp->csp[0].page->level;
+ if (root_pgnop != NULL)
+ *root_pgnop = root_pgno;
+ } else if (root_pgnop != NULL)
+ *root_pgnop = cp->csp[-1].page->pgno;
+
+ /*
+ * Split the page if it still needs it (it's possible another
+ * thread of control has already split the page). If we are
+ * guaranteed that two items will fit on the page, the split
+ * is no longer necessary.
+ */
+ if (2 * B_MAXSIZEONPAGE(cp->ovflsize)
+ <= (db_indx_t)P_FREESPACE(dbc->dbp, cp->csp[0].page)) {
+ if ((ret = __bam_stkrel(dbc, STK_NOLOCK)) != 0)
+ goto err;
+ goto no_split;
+ }
+
+ /*
+ * We need to try to lock the next page so we can update
+ * its PREV.
+ */
+ if (ISLEAF(cp->csp->page) &&
+ (pgno = NEXT_PGNO(cp->csp->page)) != PGNO_INVALID) {
+ TRY_LOCK(dbc, pgno,
+ next_pgno, next_lock, DB_LOCK_WRITE, retry);
+ if (ret != 0)
+ goto err;
+ }
+ ret = cp->csp[0].page->pgno == root_pgno ?
+ __bam_root(dbc, &cp->csp[0]) :
+ __bam_page(dbc, &cp->csp[-1], &cp->csp[0]);
+ BT_STK_CLR(cp);
+
+ switch (ret) {
+ case 0:
+no_split: /* Once we've split the leaf page, we're done. */
+ if (level == LEAFLEVEL)
+ goto done;
+
+ /* Switch directions. */
+ if (dir == UP)
+ dir = DOWN;
+ break;
+ case DB_NEEDSPLIT:
+ /*
+ * It's possible to fail to split repeatedly, as other
+ * threads may be modifying the tree, or the page usage
+ * is sufficiently bad that we don't get enough space
+ * the first time.
+ */
+ if (dir == DOWN)
+ dir = UP;
+ break;
+ default:
+ goto err;
+ }
+ }
+
+ if (root_pgnop != NULL)
+ *root_pgnop = BAM_ROOT_PGNO(dbc);
+err:
+done: (void)__LPUT(dbc, metalock);
+ (void)__TLPUT(dbc, next_lock);
+
+ if (F_ISSET(dbc, DBC_OPD))
+ LOCK_CHECK_ON(dbc->thread_info);
+ return (ret);
+}
+
+/*
+ * __bam_root --
+ * Split the root page of a btree.
+ */
+static int
+__bam_root(dbc, cp)
+ DBC *dbc;
+ EPG *cp;
+{
+ DB *dbp;
+ DBT log_dbt, rootent[2];
+ DB_LOCK llock, rlock;
+ DB_LSN log_lsn;
+ DB_MPOOLFILE *mpf;
+ PAGE *lp, *rp;
+ db_indx_t split;
+ u_int32_t opflags;
+ int ret, t_ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ lp = rp = NULL;
+ LOCK_INIT(llock);
+ LOCK_INIT(rlock);
+ COMPQUIET(log_dbt.data, NULL);
+
+ /* Yeah, right. */
+ if (cp->page->level >= MAXBTREELEVEL) {
+ __db_errx(dbp->env, DB_STR_A("1021",
+ "Too many btree levels: %d", "%d"), cp->page->level);
+ return (ENOSPC);
+ }
+
+ if ((ret = __memp_dirty(mpf,
+ &cp->page, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ goto err;
+
+ /* Create new left and right pages for the split. */
+ if ((ret = __db_new(dbc, TYPE(cp->page), &llock, &lp)) != 0 ||
+ (ret = __db_new(dbc, TYPE(cp->page), &rlock, &rp)) != 0)
+ goto err;
+ P_INIT(lp, dbp->pgsize, lp->pgno,
+ PGNO_INVALID, ISINTERNAL(cp->page) ? PGNO_INVALID : rp->pgno,
+ cp->page->level, TYPE(cp->page));
+ P_INIT(rp, dbp->pgsize, rp->pgno,
+ ISINTERNAL(cp->page) ? PGNO_INVALID : lp->pgno, PGNO_INVALID,
+ cp->page->level, TYPE(cp->page));
+
+ PERFMON5(env, alloc, btree_split,
+ dbp->fname, dbp->dname, lp->pgno, cp->page->pgno, lp->level);
+
+ /* Split the page. */
+ if ((ret = __bam_psplit(dbc, cp, lp, rp, &split)) != 0)
+ goto err;
+
+ if (DBC_LOGGING(dbc)) {
+ memset(&log_dbt, 0, sizeof(log_dbt));
+ if ((ret =
+ __os_malloc(dbp->env, dbp->pgsize, &log_dbt.data)) != 0)
+ goto err;
+ log_dbt.size = dbp->pgsize;
+ memcpy(log_dbt.data, cp->page, dbp->pgsize);
+ }
+
+ /* Clean up the new root page. */
+ if ((ret = (dbc->dbtype == DB_RECNO ?
+ __ram_root(dbc, cp->page, lp, rp) :
+ __bam_broot(dbc, cp->page, split, lp, rp))) != 0) {
+ if (DBC_LOGGING(dbc))
+ __os_free(dbp->env, log_dbt.data);
+ goto err;
+ }
+
+ /* Log the change. */
+ if (DBC_LOGGING(dbc)) {
+ memset(rootent, 0, sizeof(rootent));
+ rootent[0].data = GET_BINTERNAL(dbp, cp->page, 0);
+ rootent[1].data = GET_BINTERNAL(dbp, cp->page, 1);
+ if (dbc->dbtype == DB_RECNO)
+ rootent[0].size = rootent[1].size = RINTERNAL_SIZE;
+ else {
+ rootent[0].size = BINTERNAL_SIZE(
+ ((BINTERNAL *)rootent[0].data)->len);
+ rootent[1].size = BINTERNAL_SIZE(
+ ((BINTERNAL *)rootent[1].data)->len);
+ }
+ ZERO_LSN(log_lsn);
+ opflags = F_ISSET(
+ (BTREE_CURSOR *)dbc->internal, C_RECNUM) ? SPL_NRECS : 0;
+ if (dbc->dbtype == DB_RECNO)
+ opflags |= SPL_RECNO;
+ ret = __bam_split_log(dbp, dbc->txn, &LSN(cp->page), 0,
+ OP_SET(opflags, cp->page), PGNO(lp), &LSN(lp),
+ PGNO(rp), &LSN(rp), (u_int32_t)NUM_ENT(lp),
+ PGNO_INVALID, &log_lsn, PGNO(cp->page),
+ &LSN(cp->page), 0, &log_dbt, &rootent[0], &rootent[1]);
+
+ /* On failure, restore the page. */
+ if (ret != 0)
+ memcpy(cp->page, log_dbt.data, dbp->pgsize);
+ __os_free(dbp->env, log_dbt.data);
+
+ if (ret != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(cp->page));
+ LSN(lp) = LSN(cp->page);
+ LSN(rp) = LSN(cp->page);
+
+ /* Adjust any cursors. */
+ ret = __bam_ca_split(dbc, cp->page->pgno, lp->pgno, rp->pgno, split, 1);
+
+ /* Success or error: release pages and locks. */
+err: if (cp->page != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, cp->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ cp->page = NULL;
+
+ /*
+ * We are done. Put or downgrade all our locks and release
+ * the pages.
+ */
+ if ((t_ret = __TLPUT(dbc, llock)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __TLPUT(dbc, rlock)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __TLPUT(dbc, cp->lock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (lp != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, lp, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (rp != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, rp, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __bam_page --
+ * Split the non-root page of a btree.
+ */
+static int
+__bam_page(dbc, pp, cp)
+ DBC *dbc;
+ EPG *pp, *cp;
+{
+ BTREE_CURSOR *bc;
+ DB *dbp;
+ DBT log_dbt, rentry;
+ DB_LOCK rplock;
+ DB_LSN log_lsn;
+ DB_LSN save_lsn;
+ DB_MPOOLFILE *mpf;
+ PAGE *lp, *rp, *alloc_rp, *tp;
+ db_indx_t split;
+ u_int32_t opflags;
+ int ret, t_ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ alloc_rp = lp = rp = tp = NULL;
+ LOCK_INIT(rplock);
+ ret = -1;
+
+ /*
+ * Create a new left page for the split, and fill in everything
+ * except its LSN and next-page page number.
+ *
+ * Create a new right page for the split, and fill in everything
+ * except its LSN and page number.
+ *
+ * We malloc space for both the left and right pages, so we don't get
+ * a new page from the underlying buffer pool until we know the split
+ * is going to succeed. The reason is that we can't release locks
+ * acquired during the get-a-new-page process because metadata page
+ * locks can't be discarded on failure since we may have modified the
+ * free list. So, if you assume that we're holding a write lock on the
+ * leaf page which ran out of space and started this split (e.g., we
+ * have already written records to the page, or we retrieved a record
+ * from it with the DB_RMW flag set), failing in a split with both a
+ * leaf page locked and the metadata page locked can potentially lock
+ * up the tree badly, because we've violated the rule of always locking
+ * down the tree, and never up.
+ */
+ if ((ret = __os_malloc(dbp->env, dbp->pgsize * 2, &lp)) != 0)
+ goto err;
+ P_INIT(lp, dbp->pgsize, PGNO(cp->page),
+ ISINTERNAL(cp->page) ? PGNO_INVALID : PREV_PGNO(cp->page),
+ ISINTERNAL(cp->page) ? PGNO_INVALID : 0,
+ cp->page->level, TYPE(cp->page));
+
+ rp = (PAGE *)((u_int8_t *)lp + dbp->pgsize);
+ P_INIT(rp, dbp->pgsize, 0,
+ ISINTERNAL(cp->page) ? PGNO_INVALID : PGNO(cp->page),
+ ISINTERNAL(cp->page) ? PGNO_INVALID : NEXT_PGNO(cp->page),
+ cp->page->level, TYPE(cp->page));
+
+ /*
+ * Split right.
+ *
+ * Only the indices are sorted on the page, i.e., the key/data pairs
+ * aren't, so it's simpler to copy the data from the split page onto
+ * two new pages instead of copying half the data to a new right page
+ * and compacting the left page in place. Since the left page can't
+ * change, we swap the original and the allocated left page after the
+ * split.
+ */
+ if ((ret = __bam_psplit(dbc, cp, lp, rp, &split)) != 0)
+ goto err;
+
+ /*
+ * Test to see if we are going to be able to insert the new pages into
+ * the parent page. The interesting failure here is that the parent
+ * page can't hold the new keys, and has to be split in turn, in which
+ * case we want to release all the locks we can.
+ */
+ if ((ret = __bam_pinsert(dbc, pp, split, lp, rp, BPI_SPACEONLY)) != 0)
+ goto err;
+
+ /*
+ * We've got everything locked down we need, and we know the split
+ * is going to succeed. Go and get the additional page we'll need.
+ */
+ if ((ret = __db_new(dbc, TYPE(cp->page), &rplock, &alloc_rp)) != 0)
+ goto err;
+
+ /*
+ * Prepare to fix up the previous pointer of any leaf page following
+ * the split page. Our caller has already write locked the page so
+ * we can get it without deadlocking on the parent latch.
+ */
+ if (ISLEAF(cp->page) && NEXT_PGNO(cp->page) != PGNO_INVALID &&
+ (ret = __memp_fget(mpf, &NEXT_PGNO(cp->page),
+ dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &tp)) != 0)
+ goto err;
+
+ PERFMON5(env, alloc, btree_split, dbp->fname,
+ dbp->dname, cp->page->pgno, pp->page->pgno, cp->page->level);
+
+ /*
+ * Fix up the page numbers we didn't have before. We have to do this
+ * before calling __bam_pinsert because it may copy a page number onto
+ * the parent page and it takes the page number from its page argument.
+ */
+ PGNO(rp) = NEXT_PGNO(lp) = PGNO(alloc_rp);
+
+ DB_ASSERT(dbp->env, IS_DIRTY(cp->page));
+ DB_ASSERT(dbp->env, IS_DIRTY(pp->page));
+
+ bc = (BTREE_CURSOR *)dbc->internal;
+
+ /* Actually update the parent page. */
+ if ((ret = __bam_pinsert(dbc,
+ pp, split, lp, rp, F_ISSET(bc, C_RECNUM) ? 0 : BPI_NOLOGGING)) != 0)
+ goto err;
+
+ /* Log the change. */
+ if (DBC_LOGGING(dbc)) {
+ memset(&log_dbt, 0, sizeof(log_dbt));
+ log_dbt.data = cp->page;
+ log_dbt.size = dbp->pgsize;
+ memset(&rentry, 0, sizeof(rentry));
+ rentry.data = GET_BINTERNAL(dbp, pp->page, pp->indx + 1);
+ opflags = F_ISSET(bc, C_RECNUM) ? SPL_NRECS : 0;
+ if (dbc->dbtype == DB_RECNO) {
+ opflags |= SPL_RECNO;
+ rentry.size = RINTERNAL_SIZE;
+ } else
+ rentry.size =
+ BINTERNAL_SIZE(((BINTERNAL *)rentry.data)->len);
+ if (tp == NULL)
+ ZERO_LSN(log_lsn);
+ if ((ret = __bam_split_log(dbp, dbc->txn, &LSN(cp->page),
+ 0, OP_SET(opflags, pp->page), PGNO(cp->page),
+ &LSN(cp->page), PGNO(alloc_rp), &LSN(alloc_rp),
+ (u_int32_t)NUM_ENT(lp), tp == NULL ? 0 : PGNO(tp),
+ tp == NULL ? &log_lsn : &LSN(tp), PGNO(pp->page),
+ &LSN(pp->page), pp->indx, &log_dbt, NULL, &rentry)) != 0) {
+ /*
+ * If this is not RECNO then undo the update
+ * to the parent page, which has not been
+ * logged yet. This must succeed. Recno
+ * database trees are locked and therefore
+ * the parent can be logged independently.
+ */
+ if (F_ISSET(bc, C_RECNUM) == 0) {
+ t_ret = __db_ditem_nolog(dbc, pp->page,
+ pp->indx + 1, rentry.size);
+ DB_ASSERT(dbp->env, t_ret == 0);
+ }
+
+ goto err;
+ }
+
+ } else
+ LSN_NOT_LOGGED(LSN(cp->page));
+
+ /* Update the LSNs for all involved pages. */
+ LSN(alloc_rp) = LSN(cp->page);
+ LSN(lp) = LSN(cp->page);
+ LSN(rp) = LSN(cp->page);
+ LSN(pp->page) = LSN(cp->page);
+ if (tp != NULL) {
+ /* Log record has been written; so safe to update next page. */
+ PREV_PGNO(tp) = PGNO(rp);
+ LSN(tp) = LSN(cp->page);
+ }
+
+ /*
+ * Copy the left and right pages into place. There are two paths
+ * through here. Either we are logging and we set the LSNs in the
+ * logging path. However, if we are not logging, then we do not
+ * have valid LSNs on lp or rp. The correct LSNs to use are the
+ * ones on the page we got from __db_new or the one that was
+ * originally on cp->page. In both cases, we save the LSN from the
+ * real database page (not a malloc'd one) and reapply it after we
+ * do the copy.
+ */
+ save_lsn = alloc_rp->lsn;
+ memcpy(alloc_rp, rp, LOFFSET(dbp, rp));
+ memcpy((u_int8_t *)alloc_rp + HOFFSET(rp),
+ (u_int8_t *)rp + HOFFSET(rp), dbp->pgsize - HOFFSET(rp));
+ alloc_rp->lsn = save_lsn;
+
+ save_lsn = cp->page->lsn;
+ memcpy(cp->page, lp, LOFFSET(dbp, lp));
+ memcpy((u_int8_t *)cp->page + HOFFSET(lp),
+ (u_int8_t *)lp + HOFFSET(lp), dbp->pgsize - HOFFSET(lp));
+ cp->page->lsn = save_lsn;
+
+ /* Adjust any cursors. */
+ if ((ret = __bam_ca_split(dbc,
+ PGNO(cp->page), PGNO(cp->page), PGNO(rp), split, 0)) != 0)
+ goto err;
+
+ __os_free(dbp->env, lp);
+
+ /*
+ * Success -- write the real pages back to the store.
+ */
+ if ((t_ret = __memp_fput(mpf,
+ dbc->thread_info, alloc_rp, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __TLPUT(dbc, rplock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (tp != NULL) {
+ if ((t_ret = __memp_fput(mpf,
+ dbc->thread_info, tp, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ if ((t_ret = __bam_stkrel(dbc, STK_CLRDBC)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+
+err: if (lp != NULL)
+ __os_free(dbp->env, lp);
+ if (alloc_rp != NULL)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, alloc_rp, dbc->priority);
+ if (tp != NULL)
+ (void)__memp_fput(mpf, dbc->thread_info, tp, dbc->priority);
+
+ if (pp->page != NULL)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, pp->page, dbc->priority);
+
+ if (ret == DB_NEEDSPLIT && atomic_read(&mpf->mfp->multiversion) == 0)
+ (void)__LPUT(dbc, pp->lock);
+ else
+ (void)__TLPUT(dbc, pp->lock);
+
+ (void)__memp_fput(mpf, dbc->thread_info, cp->page, dbc->priority);
+
+ /*
+ * We don't drop the left and right page locks. If we are doing dirty
+ * reads then we need to hold the locks until we abort the transaction.
+ * If we are not transactional, we are hosed anyway as the tree
+ * is trashed. It may be better not to leak the locks.
+ */
+
+ if (dbc->txn == NULL)
+ (void)__LPUT(dbc, rplock);
+
+ if (dbc->txn == NULL || ret == DB_NEEDSPLIT)
+ (void)__LPUT(dbc, cp->lock);
+
+ return (ret);
+}
+
+/*
+ * __bam_broot --
+ * Fix up the btree root page after it has been split.
+ * PUBLIC: int __bam_broot __P((DBC *, PAGE *, u_int32_t, PAGE *, PAGE *));
+ */
+int
+__bam_broot(dbc, rootp, split, lp, rp)
+ DBC *dbc;
+ u_int32_t split;
+ PAGE *rootp, *lp, *rp;
+{
+ BINTERNAL bi, bi0, *child_bi;
+ BKEYDATA *child_bk;
+ BOVERFLOW bo, *child_bo;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DBT hdr, hdr0, data;
+ db_pgno_t root_pgno;
+ int ret;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ child_bo = NULL;
+ data.data = NULL;
+ memset(&bi, 0, sizeof(bi));
+
+ switch (TYPE(rootp)) {
+ case P_IBTREE:
+ /* Copy the first key of the child page onto the root page. */
+ child_bi = GET_BINTERNAL(dbp, rootp, split);
+ switch (B_TYPE(child_bi->type)) {
+ case B_KEYDATA:
+ bi.len = child_bi->len;
+ B_TSET(bi.type, B_KEYDATA);
+ bi.pgno = rp->pgno;
+ DB_SET_DBT(hdr, &bi, SSZA(BINTERNAL, data));
+ if ((ret = __os_malloc(dbp->env,
+ child_bi->len, &data.data)) != 0)
+ return (ret);
+ memcpy(data.data, child_bi->data, child_bi->len);
+ data.size = child_bi->len;
+ break;
+ case B_OVERFLOW:
+ /* Reuse the overflow key. */
+ child_bo = (BOVERFLOW *)child_bi->data;
+ memset(&bo, 0, sizeof(bo));
+ bo.type = B_OVERFLOW;
+ bo.tlen = child_bo->tlen;
+ bo.pgno = child_bo->pgno;
+ bi.len = BOVERFLOW_SIZE;
+ B_TSET(bi.type, B_OVERFLOW);
+ bi.pgno = rp->pgno;
+ DB_SET_DBT(hdr, &bi, SSZA(BINTERNAL, data));
+ DB_SET_DBT(data, &bo, BOVERFLOW_SIZE);
+ break;
+ case B_DUPLICATE:
+ default:
+ goto pgfmt;
+ }
+ break;
+ case P_LDUP:
+ case P_LBTREE:
+ /* Copy the first key of the child page onto the root page. */
+ child_bk = GET_BKEYDATA(dbp, rootp, split);
+ switch (B_TYPE(child_bk->type)) {
+ case B_KEYDATA:
+ bi.len = child_bk->len;
+ B_TSET(bi.type, B_KEYDATA);
+ bi.pgno = rp->pgno;
+ DB_SET_DBT(hdr, &bi, SSZA(BINTERNAL, data));
+ if ((ret = __os_malloc(dbp->env,
+ child_bk->len, &data.data)) != 0)
+ return (ret);
+ memcpy(data.data, child_bk->data, child_bk->len);
+ data.size = child_bk->len;
+ break;
+ case B_OVERFLOW:
+ /* Copy the overflow key. */
+ child_bo = (BOVERFLOW *)child_bk;
+ memset(&bo, 0, sizeof(bo));
+ bo.type = B_OVERFLOW;
+ bo.tlen = child_bo->tlen;
+ memset(&hdr, 0, sizeof(hdr));
+ if ((ret = __db_goff(dbc, &hdr, child_bo->tlen,
+ child_bo->pgno, &hdr.data, &hdr.size)) == 0)
+ ret = __db_poff(dbc, &hdr, &bo.pgno);
+
+ if (hdr.data != NULL)
+ __os_free(dbp->env, hdr.data);
+ if (ret != 0)
+ return (ret);
+
+ bi.len = BOVERFLOW_SIZE;
+ B_TSET(bi.type, B_OVERFLOW);
+ bi.pgno = rp->pgno;
+ DB_SET_DBT(hdr, &bi, SSZA(BINTERNAL, data));
+ DB_SET_DBT(data, &bo, BOVERFLOW_SIZE);
+ break;
+ case B_DUPLICATE:
+ default:
+ goto pgfmt;
+ }
+ break;
+ default:
+pgfmt: return (__db_pgfmt(dbp->env, rp->pgno));
+ }
+ /*
+ * If the root page was a leaf page, change it into an internal page.
+ * We copy the key we split on (but not the key's data, in the case of
+ * a leaf page) to the new root page.
+ */
+ root_pgno = BAM_ROOT_PGNO(dbc);
+ P_INIT(rootp, dbp->pgsize,
+ root_pgno, PGNO_INVALID, PGNO_INVALID, lp->level + 1, P_IBTREE);
+
+ /*
+ * The btree comparison code guarantees that the left-most key on any
+ * internal btree page is never used, so it doesn't need to be filled
+ * in. Set the record count if necessary.
+ */
+ memset(&bi0, 0, sizeof(bi0));
+ B_TSET(bi0.type, B_KEYDATA);
+ bi0.pgno = lp->pgno;
+ if (F_ISSET(cp, C_RECNUM)) {
+ bi0.nrecs = __bam_total(dbp, lp);
+ RE_NREC_SET(rootp, bi0.nrecs);
+ bi.nrecs = __bam_total(dbp, rp);
+ RE_NREC_ADJ(rootp, bi.nrecs);
+ }
+ DB_SET_DBT(hdr0, &bi0, SSZA(BINTERNAL, data));
+ if ((ret = __db_pitem_nolog(dbc, rootp,
+ 0, BINTERNAL_SIZE(0), &hdr0, NULL)) != 0)
+ goto err;
+ ret = __db_pitem_nolog(dbc, rootp, 1,
+ BINTERNAL_SIZE(data.size), &hdr, &data);
+
+err: if (data.data != NULL && child_bo == NULL)
+ __os_free(dbp->env, data.data);
+ return (ret);
+}
+
+/*
+ * __ram_root --
+ * Fix up the recno root page after it has been split.
+ * PUBLIC: int __ram_root __P((DBC *, PAGE *, PAGE *, PAGE *));
+ */
+int
+__ram_root(dbc, rootp, lp, rp)
+ DBC *dbc;
+ PAGE *rootp, *lp, *rp;
+{
+ DB *dbp;
+ DBT hdr;
+ RINTERNAL ri;
+ db_pgno_t root_pgno;
+ int ret;
+
+ dbp = dbc->dbp;
+ root_pgno = BAM_ROOT_PGNO(dbc);
+
+ /* Initialize the page. */
+ P_INIT(rootp, dbp->pgsize,
+ root_pgno, PGNO_INVALID, PGNO_INVALID, lp->level + 1, P_IRECNO);
+
+ /* Initialize the header. */
+ DB_SET_DBT(hdr, &ri, RINTERNAL_SIZE);
+
+ /* Insert the left and right keys, set the header information. */
+ ri.pgno = lp->pgno;
+ ri.nrecs = __bam_total(dbp, lp);
+ if ((ret = __db_pitem_nolog(dbc,
+ rootp, 0, RINTERNAL_SIZE, &hdr, NULL)) != 0)
+ return (ret);
+ RE_NREC_SET(rootp, ri.nrecs);
+ ri.pgno = rp->pgno;
+ ri.nrecs = __bam_total(dbp, rp);
+ if ((ret = __db_pitem_nolog(dbc,
+ rootp, 1, RINTERNAL_SIZE, &hdr, NULL)) != 0)
+ return (ret);
+ RE_NREC_ADJ(rootp, ri.nrecs);
+ return (0);
+}
+
+/*
+ * __bam_pinsert --
+ * Insert a new key into a parent page, completing the split.
+ *
+ * PUBLIC: int __bam_pinsert
+ * PUBLIC: __P((DBC *, EPG *, u_int32_t, PAGE *, PAGE *, int));
+ */
+int
+__bam_pinsert(dbc, parent, split, lchild, rchild, flags)
+ DBC *dbc;
+ EPG *parent;
+ u_int32_t split;
+ PAGE *lchild, *rchild;
+ int flags;
+{
+ BINTERNAL bi, *child_bi;
+ BKEYDATA *child_bk, *tmp_bk;
+ BOVERFLOW bo, *child_bo;
+ BTREE *t;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DBT a, b, hdr, data;
+ EPG *child;
+ PAGE *ppage;
+ RINTERNAL ri;
+ db_indx_t off;
+ db_recno_t nrecs;
+ size_t (*func) __P((DB *, const DBT *, const DBT *));
+ int (*pitem) __P((DBC *, PAGE *, u_int32_t, u_int32_t, DBT *, DBT *));
+ u_int32_t n, nbytes, nksize, oldsize, size;
+ int ret;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ t = dbp->bt_internal;
+ ppage = parent->page;
+ child = parent + 1;
+
+ /* If handling record numbers, count records split to the right page. */
+ nrecs = F_ISSET(cp, C_RECNUM) &&
+ !LF_ISSET(BPI_SPACEONLY) ? __bam_total(dbp, rchild) : 0;
+
+ /*
+ * Now we insert the new page's first key into the parent page, which
+ * completes the split. The parent points to a PAGE and a page index
+ * offset, where the new key goes ONE AFTER the index, because we split
+ * to the right.
+ *
+ * XXX
+ * Some btree algorithms replace the key for the old page as well as
+ * the new page. We don't, as there's no reason to believe that the
+ * first key on the old page is any better than the key we have, and,
+ * in the case of a key being placed at index 0 causing the split, the
+ * key is unavailable.
+ */
+ off = parent->indx + O_INDX;
+ if (LF_ISSET(BPI_REPLACE))
+ oldsize = TYPE(ppage) == P_IRECNO ? RINTERNAL_PSIZE :
+ BINTERNAL_PSIZE(GET_BINTERNAL(dbp, ppage, off)->len);
+ else
+ oldsize = 0;
+
+ /*
+ * Calculate the space needed on the parent page.
+ *
+ * Prefix trees: space hack used when inserting into BINTERNAL pages.
+ * Retain only what's needed to distinguish between the new entry and
+ * the LAST entry on the page to its left. If the keys compare equal,
+ * retain the entire key. We ignore overflow keys, and the entire key
+ * must be retained for the next-to-leftmost key on the leftmost page
+ * of each level, or the search will fail. Applicable ONLY to internal
+ * pages that have leaf pages as children. Further reduction of the
+ * key between pairs of internal pages loses too much information.
+ */
+ switch (TYPE(child->page)) {
+ case P_IBTREE:
+ child_bi = GET_BINTERNAL(dbp, child->page, split);
+ nbytes = BINTERNAL_PSIZE(child_bi->len);
+
+ if (P_FREESPACE(dbp, ppage) + oldsize < nbytes)
+ return (DB_NEEDSPLIT);
+ if (LF_ISSET(BPI_SPACEONLY))
+ return (0);
+
+ switch (B_TYPE(child_bi->type)) {
+ case B_KEYDATA:
+ /* Add a new record for the right page. */
+ memset(&bi, 0, sizeof(bi));
+ bi.len = child_bi->len;
+ B_TSET(bi.type, B_KEYDATA);
+ bi.pgno = rchild->pgno;
+ bi.nrecs = nrecs;
+ DB_SET_DBT(hdr, &bi, SSZA(BINTERNAL, data));
+ DB_SET_DBT(data, child_bi->data, child_bi->len);
+ size = BINTERNAL_SIZE(child_bi->len);
+ break;
+ case B_OVERFLOW:
+ /* Reuse the overflow key. */
+ child_bo = (BOVERFLOW *)child_bi->data;
+ memset(&bo, 0, sizeof(bo));
+ bo.type = B_OVERFLOW;
+ bo.tlen = child_bo->tlen;
+ bo.pgno = child_bo->pgno;
+ bi.len = BOVERFLOW_SIZE;
+ B_TSET(bi.type, B_OVERFLOW);
+ bi.pgno = rchild->pgno;
+ bi.nrecs = nrecs;
+ DB_SET_DBT(hdr, &bi, SSZA(BINTERNAL, data));
+ DB_SET_DBT(data, &bo, BOVERFLOW_SIZE);
+ size = BINTERNAL_SIZE(BOVERFLOW_SIZE);
+ break;
+ case B_DUPLICATE:
+ default:
+ goto pgfmt;
+ }
+ break;
+ case P_LDUP:
+ case P_LBTREE:
+ child_bk = GET_BKEYDATA(dbp, child->page, split);
+ switch (B_TYPE(child_bk->type)) {
+ case B_KEYDATA:
+ nbytes = BINTERNAL_PSIZE(child_bk->len);
+ nksize = child_bk->len;
+
+ /*
+ * Prefix compression:
+ * We set t->bt_prefix to NULL if we have a comparison
+ * callback but no prefix compression callback. But,
+ * if we're splitting in an off-page duplicates tree,
+ * we still have to do some checking. If using the
+ * default off-page duplicates comparison routine we
+ * can use the default prefix compression callback. If
+ * not using the default off-page duplicates comparison
+ * routine, we can't do any kind of prefix compression
+ * as there's no way for an application to specify a
+ * prefix compression callback that corresponds to its
+ * comparison callback.
+ *
+ * No prefix compression if we don't have a compression
+ * function, or the key we'd compress isn't a normal
+ * key (for example, it references an overflow page).
+ *
+ * Generate a parent page key for the right child page
+ * from a comparison of the last key on the left child
+ * page and the first key on the right child page.
+ */
+ if (F_ISSET(dbc, DBC_OPD)) {
+ if (dbp->dup_compare == __bam_defcmp)
+ func = __bam_defpfx;
+ else
+ func = NULL;
+ } else
+ func = t->bt_prefix;
+ if (func == NULL)
+ goto noprefix;
+ tmp_bk = GET_BKEYDATA(dbp, lchild, NUM_ENT(lchild) -
+ (TYPE(lchild) == P_LDUP ? O_INDX : P_INDX));
+ if (B_TYPE(tmp_bk->type) != B_KEYDATA)
+ goto noprefix;
+ DB_INIT_DBT(a, tmp_bk->data, tmp_bk->len);
+ DB_INIT_DBT(b, child_bk->data, child_bk->len);
+ nksize = (u_int32_t)func(dbp, &a, &b);
+ if ((n = BINTERNAL_PSIZE(nksize)) < nbytes)
+ nbytes = n;
+ else
+ nksize = child_bk->len;
+
+noprefix: if (P_FREESPACE(dbp, ppage) + oldsize < nbytes)
+ return (DB_NEEDSPLIT);
+ if (LF_ISSET(BPI_SPACEONLY))
+ return (0);
+
+ memset(&bi, 0, sizeof(bi));
+ bi.len = nksize;
+ B_TSET(bi.type, B_KEYDATA);
+ bi.pgno = rchild->pgno;
+ bi.nrecs = nrecs;
+ DB_SET_DBT(hdr, &bi, SSZA(BINTERNAL, data));
+ DB_SET_DBT(data, child_bk->data, nksize);
+ size = BINTERNAL_SIZE(nksize);
+ break;
+ case B_OVERFLOW:
+ nbytes = BINTERNAL_PSIZE(BOVERFLOW_SIZE);
+
+ if (P_FREESPACE(dbp, ppage) + oldsize < nbytes)
+ return (DB_NEEDSPLIT);
+ if (LF_ISSET(BPI_SPACEONLY))
+ return (0);
+
+ /* Copy the overflow key. */
+ child_bo = (BOVERFLOW *)child_bk;
+ memset(&bo, 0, sizeof(bo));
+ bo.type = B_OVERFLOW;
+ bo.tlen = child_bo->tlen;
+ memset(&hdr, 0, sizeof(hdr));
+ if ((ret = __db_goff(dbc, &hdr, child_bo->tlen,
+ child_bo->pgno, &hdr.data, &hdr.size)) == 0)
+ ret = __db_poff(dbc, &hdr, &bo.pgno);
+
+ if (hdr.data != NULL)
+ __os_free(dbp->env, hdr.data);
+ if (ret != 0)
+ return (ret);
+
+ memset(&bi, 0, sizeof(bi));
+ bi.len = BOVERFLOW_SIZE;
+ B_TSET(bi.type, B_OVERFLOW);
+ bi.pgno = rchild->pgno;
+ bi.nrecs = nrecs;
+ DB_SET_DBT(hdr, &bi, SSZA(BINTERNAL, data));
+ DB_SET_DBT(data, &bo, BOVERFLOW_SIZE);
+ size = BINTERNAL_SIZE(BOVERFLOW_SIZE);
+
+ break;
+ case B_DUPLICATE:
+ default:
+ goto pgfmt;
+ }
+ break;
+ case P_IRECNO:
+ case P_LRECNO:
+ nbytes = RINTERNAL_PSIZE;
+
+ if (P_FREESPACE(dbp, ppage) + oldsize < nbytes)
+ return (DB_NEEDSPLIT);
+ if (LF_ISSET(BPI_SPACEONLY))
+ return (0);
+
+ /* Add a new record for the right page. */
+ DB_SET_DBT(hdr, &ri, RINTERNAL_SIZE);
+ ri.pgno = rchild->pgno;
+ ri.nrecs = nrecs;
+ size = RINTERNAL_SIZE;
+ data.size = 0;
+ /*
+ * For now, we are locking internal recno nodes so
+ * use two steps.
+ */
+ if (LF_ISSET(BPI_REPLACE)) {
+ if ((ret = __bam_ditem(dbc, ppage, off)) != 0)
+ return (ret);
+ LF_CLR(BPI_REPLACE);
+ }
+ break;
+ default:
+pgfmt: return (__db_pgfmt(dbp->env, PGNO(child->page)));
+ }
+
+ if (LF_ISSET(BPI_REPLACE)) {
+ DB_ASSERT(dbp->env, !LF_ISSET(BPI_NOLOGGING));
+ if ((ret = __bam_irep(dbc, ppage, off, &hdr, &data)) != 0)
+ return (ret);
+ } else {
+ if (LF_ISSET(BPI_NOLOGGING))
+ pitem = __db_pitem_nolog;
+ else
+ pitem = __db_pitem;
+
+ if ((ret = pitem(dbc, ppage,
+ off, size, &hdr, data.size != 0 ? &data : NULL)) != 0)
+ return (ret);
+ }
+
+	/*
+	 * If this is a Recno page, a Btree page with record numbers, or
+	 * an off-page duplicates tree, adjust the parent page's left page
+	 * record count.
+	 */
+ if (F_ISSET(cp, C_RECNUM) && !LF_ISSET(BPI_NORECNUM)) {
+ /* Log the change. */
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __bam_cadjust_log(dbp, dbc->txn,
+ &LSN(ppage), 0, PGNO(ppage), &LSN(ppage),
+ parent->indx, -(int32_t)nrecs, 0)) != 0)
+ return (ret);
+ } else
+ LSN_NOT_LOGGED(LSN(ppage));
+
+ /* Update the left page count. */
+ if (dbc->dbtype == DB_RECNO)
+ GET_RINTERNAL(dbp, ppage, parent->indx)->nrecs -= nrecs;
+ else
+ GET_BINTERNAL(dbp, ppage, parent->indx)->nrecs -= nrecs;
+ }
+
+ return (0);
+}
+
+/*
+ * __bam_psplit --
+ * Do the real work of splitting the page.
+ */
+static int
+__bam_psplit(dbc, cp, lp, rp, splitret)
+ DBC *dbc;
+ EPG *cp;
+ PAGE *lp, *rp;
+ db_indx_t *splitret;
+{
+ DB *dbp;
+ PAGE *pp;
+ db_indx_t half, *inp, nbytes, off, splitp, top;
+ int adjust, cnt, iflag, isbigkey, ret;
+
+ dbp = dbc->dbp;
+ pp = cp->page;
+ inp = P_INP(dbp, pp);
+ adjust = TYPE(pp) == P_LBTREE ? P_INDX : O_INDX;
+
+ /*
+ * If we're splitting the first (last) page on a level because we're
+ * inserting (appending) a key to it, it's likely that the data is
+ * sorted. Moving a single item to the new page is less work and can
+	 * push the fill factor higher than normal. This is trivial when
+	 * we are splitting a new page before the beginning of the tree,
+	 * where all of the interesting tests are against values of 0.
+ *
+ * Catching appends to the tree is harder. In a simple append, we're
+ * inserting an item that sorts past the end of the tree; the cursor
+ * will point past the last element on the page. But, in trees with
+ * duplicates, the cursor may point to the last entry on the page --
+ * in this case, the entry will also be the last element of a duplicate
+ * set (the last because the search call specified the SR_DUPLAST flag).
+ * The only way to differentiate between an insert immediately before
+ * the last item in a tree or an append after a duplicate set which is
+ * also the last item in the tree is to call the comparison function.
+ * When splitting internal pages during an append, the search code
+ * guarantees the cursor always points to the largest page item less
+ * than the new internal entry. To summarize, we want to catch three
+ * possible index values:
+ *
+ * NUM_ENT(page) Btree/Recno leaf insert past end-of-tree
+ * NUM_ENT(page) - O_INDX Btree or Recno internal insert past EOT
+ * NUM_ENT(page) - P_INDX Btree leaf insert past EOT after a set
+ * of duplicates
+ *
+	 * two of which (NUM_ENT(page) - O_INDX or P_INDX) might be an
+	 * insert near the end of the tree, and not past the end of the
+	 * tree at all. We settle for a simple test that might be wrong,
+	 * because calling the comparison functions is expensive.
+	 * Regardless, it's not a big deal if we're wrong; we'll just do
+	 * the split the right way next time.
+ */
+ off = 0;
+ if (NEXT_PGNO(pp) == PGNO_INVALID && cp->indx >= NUM_ENT(pp) - adjust)
+ off = NUM_ENT(pp) - adjust;
+ else if (PREV_PGNO(pp) == PGNO_INVALID && cp->indx == 0)
+ off = adjust;
+ if (off != 0)
+ goto sort;
+
+ /*
+ * Split the data to the left and right pages. Try not to split on
+ * an overflow key. (Overflow keys on internal pages will slow down
+ * searches.) Refuse to split in the middle of a set of duplicates.
+ *
+ * First, find the optimum place to split.
+ *
+ * It's possible to try and split past the last record on the page if
+ * there's a very large record at the end of the page. Make sure this
+ * doesn't happen by bounding the check at the next-to-last entry on
+ * the page.
+ *
+ * Note, we try and split half the data present on the page. This is
+ * because another process may have already split the page and left
+ * it half empty. We don't try and skip the split -- we don't know
+ * how much space we're going to need on the page, and we may need up
+ * to half the page for a big item, so there's no easy test to decide
+ * if we need to split or not. Besides, if two threads are inserting
+ * data into the same place in the database, we're probably going to
+ * need more space soon anyway.
+ */
+ top = NUM_ENT(pp) - adjust;
+ half = (dbp->pgsize - HOFFSET(pp)) / 2;
+ for (nbytes = 0, off = 0; off < top && nbytes < half; ++off)
+ switch (TYPE(pp)) {
+ case P_IBTREE:
+ if (B_TYPE(
+ GET_BINTERNAL(dbp, pp, off)->type) == B_KEYDATA)
+ nbytes += BINTERNAL_SIZE(
+ GET_BINTERNAL(dbp, pp, off)->len);
+ else
+ nbytes += BINTERNAL_SIZE(BOVERFLOW_SIZE);
+ break;
+ case P_LBTREE:
+ if (B_TYPE(GET_BKEYDATA(dbp, pp, off)->type) ==
+ B_KEYDATA)
+ nbytes += BKEYDATA_SIZE(GET_BKEYDATA(dbp,
+ pp, off)->len);
+ else
+ nbytes += BOVERFLOW_SIZE;
+
+ ++off;
+ /* FALLTHROUGH */
+ case P_LDUP:
+ case P_LRECNO:
+ if (B_TYPE(GET_BKEYDATA(dbp, pp, off)->type) ==
+ B_KEYDATA)
+ nbytes += BKEYDATA_SIZE(GET_BKEYDATA(dbp,
+ pp, off)->len);
+ else
+ nbytes += BOVERFLOW_SIZE;
+ break;
+ case P_IRECNO:
+ nbytes += RINTERNAL_SIZE;
+ break;
+ default:
+ return (__db_pgfmt(dbp->env, pp->pgno));
+ }
+sort: splitp = off;
+
+ /*
+ * Splitp is either at or just past the optimum split point. If the
+ * tree type is such that we're going to promote a key to an internal
+ * page, and our current choice is an overflow key, look for something
+ * close by that's smaller.
+ */
+ switch (TYPE(pp)) {
+ case P_IBTREE:
+ iflag = 1;
+ isbigkey =
+ B_TYPE(GET_BINTERNAL(dbp, pp, off)->type) != B_KEYDATA;
+ break;
+ case P_LBTREE:
+ case P_LDUP:
+ iflag = 0;
+ isbigkey = B_TYPE(GET_BKEYDATA(dbp, pp, off)->type) !=
+ B_KEYDATA;
+ break;
+ default:
+ iflag = isbigkey = 0;
+ }
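+	/*
+	 * Probe up to three entries on either side of the optimum split
+	 * point (trying the later entry first at each distance), and
+	 * split at the first normal, non-overflow key found.
+	 */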
+ if (isbigkey)
+ for (cnt = 1; cnt <= 3; ++cnt) {
+ off = splitp + cnt * adjust;
+ if (off < (db_indx_t)NUM_ENT(pp) &&
+ ((iflag && B_TYPE(
+ GET_BINTERNAL(dbp, pp,off)->type) == B_KEYDATA) ||
+ B_TYPE(GET_BKEYDATA(dbp, pp, off)->type) ==
+ B_KEYDATA)) {
+ splitp = off;
+ break;
+ }
+ if (splitp <= (db_indx_t)(cnt * adjust))
+ continue;
+ off = splitp - cnt * adjust;
+ if (iflag ? B_TYPE(
+ GET_BINTERNAL(dbp, pp, off)->type) == B_KEYDATA :
+ B_TYPE(GET_BKEYDATA(dbp, pp, off)->type) ==
+ B_KEYDATA) {
+ splitp = off;
+ break;
+ }
+ }
+
+ /*
+	 * We can't split in the middle of a set of duplicates. We know that
+ * no duplicate set can take up more than about 25% of the page,
+ * because that's the point where we push it off onto a duplicate
+ * page set. So, this loop can't be unbounded.
+ */
+ if (TYPE(pp) == P_LBTREE &&
+ inp[splitp] == inp[splitp - adjust])
+ for (cnt = 1;; ++cnt) {
+ off = splitp + cnt * adjust;
+ if (off < NUM_ENT(pp) &&
+ inp[splitp] != inp[off]) {
+ splitp = off;
+ break;
+ }
+ if (splitp <= (db_indx_t)(cnt * adjust))
+ continue;
+ off = splitp - cnt * adjust;
+ if (inp[splitp] != inp[off]) {
+ splitp = off + adjust;
+ break;
+ }
+ }
+
+ /* We're going to split at splitp. */
+ if ((ret = __bam_copy(dbp, pp, lp, 0, splitp)) != 0)
+ return (ret);
+ if ((ret = __bam_copy(dbp, pp, rp, splitp, NUM_ENT(pp))) != 0)
+ return (ret);
+
+ *splitret = splitp;
+ return (0);
+}
+
+/*
+ * __bam_copy --
+ * Copy a set of records from one page to another.
+ *
+ * PUBLIC: int __bam_copy __P((DB *, PAGE *, PAGE *, u_int32_t, u_int32_t));
+ */
+int
+__bam_copy(dbp, pp, cp, nxt, stop)
+ DB *dbp;
+ PAGE *pp, *cp;
+ u_int32_t nxt, stop;
+{
+ BINTERNAL internal;
+ db_indx_t *cinp, nbytes, off, *pinp;
+
+ cinp = P_INP(dbp, cp);
+ pinp = P_INP(dbp, pp);
+	/*
+	 * Nxt is the index of the next record to be copied from the source
+	 * page; off is where it will be placed on the target page.
+	 */
+ for (off = 0; nxt < stop; ++nxt, ++NUM_ENT(cp), ++off) {
+ switch (TYPE(pp)) {
+ case P_IBTREE:
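+			/*
+			 * The first entry copied to a new internal page
+			 * becomes its leftmost entry.  The leftmost key is
+			 * never compared, so we need only room for a
+			 * zero-length key; the entry itself is rebuilt
+			 * below.
+			 */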
+ if (off == 0 && nxt != 0)
+ nbytes = BINTERNAL_SIZE(0);
+ else if (B_TYPE(
+ GET_BINTERNAL(dbp, pp, nxt)->type) == B_KEYDATA)
+ nbytes = BINTERNAL_SIZE(
+ GET_BINTERNAL(dbp, pp, nxt)->len);
+ else
+ nbytes = BINTERNAL_SIZE(BOVERFLOW_SIZE);
+ break;
+ case P_LBTREE:
+ /*
+ * If we're on a key and it's a duplicate, just copy
+ * the offset.
+ */
+ if (off != 0 && (nxt % P_INDX) == 0 &&
+ pinp[nxt] == pinp[nxt - P_INDX]) {
+ cinp[off] = cinp[off - P_INDX];
+ continue;
+ }
+ /* FALLTHROUGH */
+ case P_LDUP:
+ case P_LRECNO:
+ if (B_TYPE(GET_BKEYDATA(dbp, pp, nxt)->type) ==
+ B_KEYDATA)
+ nbytes = BKEYDATA_SIZE(GET_BKEYDATA(dbp,
+ pp, nxt)->len);
+ else
+ nbytes = BOVERFLOW_SIZE;
+ break;
+ case P_IRECNO:
+ nbytes = RINTERNAL_SIZE;
+ break;
+ default:
+ return (__db_pgfmt(dbp->env, pp->pgno));
+ }
+ cinp[off] = HOFFSET(cp) -= nbytes;
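+		/*
+		 * Rebuild the leftmost entry of a new internal page in
+		 * place, keeping the child page number and record count
+		 * but discarding the key.
+		 */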
+ if (off == 0 && nxt != 0 && TYPE(pp) == P_IBTREE) {
+ internal.len = 0;
+ UMRW_SET(internal.unused);
+ internal.type = B_KEYDATA;
+ internal.pgno = GET_BINTERNAL(dbp, pp, nxt)->pgno;
+ internal.nrecs = GET_BINTERNAL(dbp, pp, nxt)->nrecs;
+ memcpy(P_ENTRY(dbp, cp, off), &internal, nbytes);
+		} else
+			memcpy(P_ENTRY(dbp, cp, off),
+			    P_ENTRY(dbp, pp, nxt), nbytes);
+ }
+ return (0);
+}
diff --git a/src/btree/bt_stat.c b/src/btree/bt_stat.c
new file mode 100644
index 00000000..668c4fdb
--- /dev/null
+++ b/src/btree/bt_stat.c
@@ -0,0 +1,669 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/partition.h"
+
+#ifdef HAVE_STATISTICS
+/*
+ * __bam_stat --
+ * Gather/print the btree statistics
+ *
+ * PUBLIC: int __bam_stat __P((DBC *, void *, u_int32_t));
+ */
+int
+__bam_stat(dbc, spp, flags)
+ DBC *dbc;
+ void *spp;
+ u_int32_t flags;
+{
+ BTMETA *meta;
+ BTREE *t;
+ DB *dbp;
+ DB_BTREE_STAT *sp;
+ DB_LOCK lock, metalock;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ PAGE *h;
+ db_pgno_t pgno;
+ int ret, t_ret, write_meta;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ meta = NULL;
+ t = dbp->bt_internal;
+ sp = NULL;
+ LOCK_INIT(metalock);
+ LOCK_INIT(lock);
+ mpf = dbp->mpf;
+ h = NULL;
+ ret = write_meta = 0;
+
+ /* Allocate and clear the structure. */
+ if ((ret = __os_umalloc(env, sizeof(*sp), &sp)) != 0)
+ goto err;
+ memset(sp, 0, sizeof(*sp));
+
+ /* Get the metadata page for the entire database. */
+ pgno = PGNO_BASE_MD;
+ if ((ret = __db_lget(dbc, 0, pgno, DB_LOCK_READ, 0, &metalock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &pgno,
+ dbc->thread_info, dbc->txn, 0, &meta)) != 0)
+ goto err;
+
+ if (flags == DB_FAST_STAT)
+ goto meta_only;
+
+ /* Walk the metadata free list, counting pages. */
+ for (sp->bt_free = 0, pgno = meta->dbmeta.free; pgno != PGNO_INVALID;) {
+ ++sp->bt_free;
+
+ if ((ret = __memp_fget(mpf, &pgno,
+ dbc->thread_info, dbc->txn, 0, &h)) != 0)
+ goto err;
+
+ pgno = h->next_pgno;
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority)) != 0)
+ goto err;
+ h = NULL;
+ }
+
+ /* Get the root page. */
+ BAM_GET_ROOT(dbc, pgno, h, 0, DB_LOCK_READ, lock, ret);
+ if (ret != 0)
+ goto err;
+ DB_ASSERT(env, h != NULL);
+
+ /* Get the levels from the root page. */
+ sp->bt_levels = h->level;
+
+ /* Discard the root page. */
+ ret = __memp_fput(mpf, dbc->thread_info, h, dbc->priority);
+ h = NULL;
+ if ((t_ret = __LPUT(dbc, lock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err;
+
+ /* Discard the metadata page. */
+ ret = __memp_fput(mpf, dbc->thread_info, meta, dbc->priority);
+ meta = NULL;
+ if ((t_ret = __LPUT(dbc, metalock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err;
+
+ /* Walk the tree. */
+ if ((ret = __bam_traverse(dbc,
+ DB_LOCK_READ, PGNO_INVALID, __bam_stat_callback, sp)) != 0)
+ goto err;
+
+#ifdef HAVE_COMPRESSION
+ if (DB_IS_COMPRESSED(dbp) && (ret = __bam_compress_count(dbc,
+ &sp->bt_nkeys, &sp->bt_ndata)) != 0)
+ goto err;
+#endif
+
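+	/*
+	 * Only update the cached key and record counts on the metadata
+	 * page if the database is writable and, for a multiversion
+	 * database, only from within a transaction.
+	 */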
+	write_meta = !F_ISSET(dbp, DB_AM_RDONLY) &&
+	    (!MULTIVERSION(dbp) || dbc->txn != NULL);
+meta_only:
+	/*
+	 * Get the subdatabase metadata page if it's not the same as the
+	 * one we already have.
+	 */
+ if (meta == NULL || t->bt_meta != PGNO_BASE_MD || write_meta) {
+ if (meta != NULL) {
+ ret = __memp_fput(mpf,
+ dbc->thread_info, meta, dbc->priority);
+ meta = NULL;
+ if ((t_ret = __LPUT(dbc, metalock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err;
+ }
+
+ if ((ret = __db_lget(dbc,
+ 0, t->bt_meta, write_meta ? DB_LOCK_WRITE : DB_LOCK_READ,
+ 0, &metalock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &t->bt_meta,
+ dbc->thread_info, dbc->txn,
+ write_meta ? DB_MPOOL_DIRTY : 0, &meta)) != 0)
+ goto err;
+ }
+ if (flags == DB_FAST_STAT) {
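+		/*
+		 * For fast statistics, use the cached counts from the
+		 * metadata page, except for Recno and record-number
+		 * Btrees, where the record count in the root page is
+		 * authoritative.
+		 */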
+ if (dbp->type == DB_RECNO ||
+ (dbp->type == DB_BTREE && F_ISSET(dbp, DB_AM_RECNUM))) {
+ BAM_GET_ROOT(dbc, pgno, h, 0, DB_LOCK_READ, lock, ret);
+ if (ret != 0)
+ goto err;
+
+ sp->bt_nkeys = RE_NREC(h);
+ } else
+ sp->bt_nkeys = meta->dbmeta.key_count;
+
+ sp->bt_ndata = dbp->type == DB_RECNO ?
+ sp->bt_nkeys : meta->dbmeta.record_count;
+ }
+
+ /* Get metadata page statistics. */
+ sp->bt_metaflags = meta->dbmeta.flags;
+ sp->bt_minkey = meta->minkey;
+ sp->bt_re_len = meta->re_len;
+ sp->bt_re_pad = meta->re_pad;
+ /*
+ * Don't take the page number from the meta-data page -- that value is
+	 * only maintained in the primary database; we may have been called on
+ * a subdatabase. (Yes, I read the primary database meta-data page
+ * earlier in this function, but I'm asking the underlying cache so the
+ * code for the Hash and Btree methods is the same.)
+ */
+ if ((ret = __memp_get_last_pgno(dbp->mpf, &pgno)) != 0)
+ goto err;
+ sp->bt_pagecnt = pgno + 1;
+ sp->bt_pagesize = meta->dbmeta.pagesize;
+ sp->bt_magic = meta->dbmeta.magic;
+ sp->bt_version = meta->dbmeta.version;
+
+ if (write_meta != 0) {
+ meta->dbmeta.key_count = sp->bt_nkeys;
+ meta->dbmeta.record_count = sp->bt_ndata;
+ }
+
+ *(DB_BTREE_STAT **)spp = sp;
+
+err: /* Discard the second page. */
+ if (h != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __LPUT(dbc, lock)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Discard the metadata page. */
+ if (meta != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, meta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __LPUT(dbc, metalock)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (ret != 0 && sp != NULL) {
+ __os_ufree(env, sp);
+ *(DB_BTREE_STAT **)spp = NULL;
+ }
+
+ return (ret);
+}
+
+/*
+ * __bam_stat_print --
+ * Display btree/recno statistics.
+ *
+ * PUBLIC: int __bam_stat_print __P((DBC *, u_int32_t));
+ */
+int
+__bam_stat_print(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ static const FN fn[] = {
+ { BTM_DUP, "duplicates" },
+ { BTM_RECNO, "recno" },
+ { BTM_RECNUM, "record-numbers" },
+ { BTM_FIXEDLEN, "fixed-length" },
+ { BTM_RENUMBER, "renumber" },
+ { BTM_SUBDB, "multiple-databases" },
+ { BTM_DUPSORT, "sorted duplicates" },
+ { BTM_COMPRESS, "compressed" },
+ { 0, NULL }
+ };
+ DB *dbp;
+ DB_BTREE_STAT *sp;
+ ENV *env;
+ int lorder, ret;
+ const char *s;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+#ifdef HAVE_PARTITION
+ if (DB_IS_PARTITIONED(dbp)) {
+ if ((ret = __partition_stat(dbc, &sp, flags)) != 0)
+ return (ret);
+ } else
+#endif
+ if ((ret = __bam_stat(dbc, &sp, LF_ISSET(DB_FAST_STAT))) != 0)
+ return (ret);
+
+ if (LF_ISSET(DB_STAT_ALL)) {
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "Default Btree/Recno database information:");
+ }
+
+ __db_msg(env, "%lx\tBtree magic number", (u_long)sp->bt_magic);
+ __db_msg(env, "%lu\tBtree version number", (u_long)sp->bt_version);
+
+ (void)__db_get_lorder(dbp, &lorder);
+ switch (lorder) {
+ case 1234:
+ s = "Little-endian";
+ break;
+ case 4321:
+ s = "Big-endian";
+ break;
+ default:
+ s = "Unrecognized byte order";
+ break;
+ }
+ __db_msg(env, "%s\tByte order", s);
+ __db_prflags(env, NULL, sp->bt_metaflags, fn, NULL, "\tFlags");
+ if (dbp->type == DB_BTREE)
+ __db_dl(env, "Minimum keys per-page", (u_long)sp->bt_minkey);
+ if (dbp->type == DB_RECNO) {
+ __db_dl(env,
+ "Fixed-length record size", (u_long)sp->bt_re_len);
+ __db_msg(env,
+ "%#x\tFixed-length record pad", (u_int)sp->bt_re_pad);
+ }
+ __db_dl(env,
+ "Underlying database page size", (u_long)sp->bt_pagesize);
+ if (dbp->type == DB_BTREE)
+ __db_dl(env, "Overflow key/data size",
+ ((BTREE_CURSOR *)dbc->internal)->ovflsize);
+ __db_dl(env, "Number of levels in the tree", (u_long)sp->bt_levels);
+ __db_dl(env, dbp->type == DB_BTREE ?
+ "Number of unique keys in the tree" :
+ "Number of records in the tree", (u_long)sp->bt_nkeys);
+ __db_dl(env,
+ "Number of data items in the tree", (u_long)sp->bt_ndata);
+
+ __db_dl(env,
+ "Number of tree internal pages", (u_long)sp->bt_int_pg);
+ __db_dl_pct(env,
+ "Number of bytes free in tree internal pages",
+ (u_long)sp->bt_int_pgfree,
+ DB_PCT_PG(sp->bt_int_pgfree, sp->bt_int_pg, sp->bt_pagesize), "ff");
+
+ __db_dl(env,
+ "Number of tree leaf pages", (u_long)sp->bt_leaf_pg);
+ __db_dl_pct(env, "Number of bytes free in tree leaf pages",
+ (u_long)sp->bt_leaf_pgfree, DB_PCT_PG(
+ sp->bt_leaf_pgfree, sp->bt_leaf_pg, sp->bt_pagesize), "ff");
+
+ __db_dl(env,
+ "Number of tree duplicate pages", (u_long)sp->bt_dup_pg);
+ __db_dl_pct(env,
+ "Number of bytes free in tree duplicate pages",
+ (u_long)sp->bt_dup_pgfree,
+ DB_PCT_PG(sp->bt_dup_pgfree, sp->bt_dup_pg, sp->bt_pagesize), "ff");
+
+ __db_dl(env,
+ "Number of tree overflow pages", (u_long)sp->bt_over_pg);
+ __db_dl_pct(env, "Number of bytes free in tree overflow pages",
+ (u_long)sp->bt_over_pgfree, DB_PCT_PG(
+ sp->bt_over_pgfree, sp->bt_over_pg, sp->bt_pagesize), "ff");
+ __db_dl(env, "Number of empty pages", (u_long)sp->bt_empty_pg);
+
+ __db_dl(env, "Number of pages on the free list", (u_long)sp->bt_free);
+
+ __os_ufree(env, sp);
+
+ return (0);
+}
+
+/*
+ * __bam_stat_callback --
+ * Statistics callback.
+ *
+ * PUBLIC: int __bam_stat_callback __P((DBC *, PAGE *, void *, int *));
+ */
+int
+__bam_stat_callback(dbc, h, cookie, putp)
+ DBC *dbc;
+ PAGE *h;
+ void *cookie;
+ int *putp;
+{
+ DB *dbp;
+ DB_BTREE_STAT *sp;
+ db_indx_t indx, *inp, top;
+ u_int8_t type;
+
+ dbp = dbc->dbp;
+ sp = cookie;
+ *putp = 0;
+ top = NUM_ENT(h);
+ inp = P_INP(dbp, h);
+
+ switch (TYPE(h)) {
+ case P_IBTREE:
+ case P_IRECNO:
+ ++sp->bt_int_pg;
+ sp->bt_int_pgfree += P_FREESPACE(dbp, h);
+ break;
+ case P_LBTREE:
+ if (top == 0)
+ ++sp->bt_empty_pg;
+
+ /* Correct for on-page duplicates and deleted items. */
+ for (indx = 0; indx < top; indx += P_INDX) {
+ type = GET_BKEYDATA(dbp, h, indx + O_INDX)->type;
+ /* Ignore deleted items. */
+ if (B_DISSET(type))
+ continue;
+
+ /* Ignore duplicate keys. */
+ if (indx + P_INDX >= top ||
+ inp[indx] != inp[indx + P_INDX])
+ ++sp->bt_nkeys;
+
+ /* Ignore off-page duplicates. */
+ if (B_TYPE(type) != B_DUPLICATE)
+ ++sp->bt_ndata;
+ }
+
+ ++sp->bt_leaf_pg;
+ sp->bt_leaf_pgfree += P_FREESPACE(dbp, h);
+ break;
+ case P_LRECNO:
+ if (top == 0)
+ ++sp->bt_empty_pg;
+
+ /*
+ * If walking a recno tree, then each of these items is a key.
+ * Otherwise, we're walking an off-page duplicate set.
+ */
+ if (dbp->type == DB_RECNO) {
+ /*
+ * Correct for deleted items in non-renumbering Recno
+ * databases.
+ */
+ if (F_ISSET(dbp, DB_AM_RENUMBER)) {
+ sp->bt_nkeys += top;
+ sp->bt_ndata += top;
+ } else
+ for (indx = 0; indx < top; indx += O_INDX) {
+ type = GET_BKEYDATA(dbp, h, indx)->type;
+ if (!B_DISSET(type)) {
+ ++sp->bt_ndata;
+ ++sp->bt_nkeys;
+ }
+ }
+
+ ++sp->bt_leaf_pg;
+ sp->bt_leaf_pgfree += P_FREESPACE(dbp, h);
+ } else {
+ sp->bt_ndata += top;
+
+ ++sp->bt_dup_pg;
+ sp->bt_dup_pgfree += P_FREESPACE(dbp, h);
+ }
+ break;
+ case P_LDUP:
+ if (top == 0)
+ ++sp->bt_empty_pg;
+
+ /* Correct for deleted items. */
+ for (indx = 0; indx < top; indx += O_INDX)
+ if (!B_DISSET(GET_BKEYDATA(dbp, h, indx)->type))
+ ++sp->bt_ndata;
+
+ ++sp->bt_dup_pg;
+ sp->bt_dup_pgfree += P_FREESPACE(dbp, h);
+ break;
+ case P_OVERFLOW:
+ ++sp->bt_over_pg;
+ sp->bt_over_pgfree += P_OVFLSPACE(dbp, dbp->pgsize, h);
+ break;
+ default:
+ return (__db_pgfmt(dbp->env, h->pgno));
+ }
+ return (0);
+}
+
+/*
+ * __bam_print_cursor --
+ * Display the current internal cursor.
+ *
+ * PUBLIC: void __bam_print_cursor __P((DBC *));
+ */
+void
+__bam_print_cursor(dbc)
+ DBC *dbc;
+{
+ static const FN fn[] = {
+ { C_DELETED, "C_DELETED" },
+ { C_RECNUM, "C_RECNUM" },
+ { C_RENUMBER, "C_RENUMBER" },
+ { 0, NULL }
+ };
+ ENV *env;
+ BTREE_CURSOR *cp;
+
+ env = dbc->env;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ STAT_ULONG("Overflow size", cp->ovflsize);
+ if (dbc->dbtype == DB_RECNO)
+ STAT_ULONG("Recno", cp->recno);
+ STAT_ULONG("Order", cp->order);
+ __db_prflags(env, NULL, cp->flags, fn, NULL, "\tInternal Flags");
+}
+
+#else /* !HAVE_STATISTICS */
+
+int
+__bam_stat(dbc, spp, flags)
+ DBC *dbc;
+ void *spp;
+ u_int32_t flags;
+{
+ COMPQUIET(spp, NULL);
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(dbc->env));
+}
+
+int
+__bam_stat_print(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(dbc->env));
+}
+#endif
+
+/*
+ * __bam_key_range --
+ *	Return the proportion of keys relative to the given key. The
+ *	numbers are slightly skewed due to on-page duplicates.
+ *
+ * PUBLIC: int __bam_key_range __P((DBC *, DBT *, DB_KEY_RANGE *, u_int32_t));
+ */
+int
+__bam_key_range(dbc, dbt, kp, flags)
+ DBC *dbc;
+ DBT *dbt;
+ DB_KEY_RANGE *kp;
+ u_int32_t flags;
+{
+ BTREE_CURSOR *cp;
+ EPG *sp;
+ double factor;
+ int exact, ret;
+
+ COMPQUIET(flags, 0);
+
+ if ((ret = __bam_search(dbc, PGNO_INVALID,
+ dbt, SR_STK_ONLY, 1, NULL, &exact)) != 0)
+ return (ret);
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+ kp->less = kp->greater = 0.0;
+
+ factor = 1.0;
+
+	/*
+	 * Correct the leaf page: leaf entries are key/data pairs, so
+	 * halve the count and index to get key positions.
+	 */
+ cp->csp->entries /= 2;
+ cp->csp->indx /= 2;
+ for (sp = cp->sp; sp <= cp->csp; ++sp) {
+ /*
+ * At each level we know that pages greater than indx contain
+ * keys greater than what we are looking for and those less
+ * than indx are less than. The one pointed to by indx may
+ * have some less, some greater or even equal. If indx is
+ * equal to the number of entries, then the key is out of range
+ * and everything is less.
+ */
+ if (sp->indx == 0)
+ kp->greater += factor * (sp->entries - 1)/sp->entries;
+ else if (sp->indx == sp->entries)
+ kp->less += factor;
+ else {
+ kp->less += factor * sp->indx / sp->entries;
+ kp->greater += factor *
+ ((sp->entries - sp->indx) - 1) / sp->entries;
+ }
+ factor *= 1.0/sp->entries;
+ }
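+	/*
+	 * Illustrative example: in a two-level tree with 100 entries per
+	 * page after the leaf correction, a key found at indx 50 on both
+	 * levels accumulates less = 0.5 + 0.01 * 0.5 = 0.505, and factor
+	 * ends up at 0.0001, the weight of a single leaf entry.
+	 */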
+
+ /*
+ * If there was an exact match then assign 1 n'th to the key itself.
+ * Otherwise that factor belongs to those greater than the key, unless
+ * the key was out of range.
+ */
+ if (exact)
+ kp->equal = factor;
+ else {
+ if (kp->less != 1)
+ kp->greater += factor;
+ kp->equal = 0;
+ }
+
+ if ((ret = __bam_stkrel(dbc, 0)) != 0)
+ return (ret);
+
+ return (0);
+}
+
+/*
+ * __bam_traverse --
+ * Walk a Btree database.
+ *
+ * PUBLIC: int __bam_traverse __P((DBC *, db_lockmode_t,
+ * PUBLIC: db_pgno_t, int (*)(DBC *, PAGE *, void *, int *), void *));
+ */
+int
+__bam_traverse(dbc, mode, root_pgno, callback, cookie)
+ DBC *dbc;
+ db_lockmode_t mode;
+ db_pgno_t root_pgno;
+ int (*callback)__P((DBC *, PAGE *, void *, int *));
+ void *cookie;
+{
+ BINTERNAL *bi;
+ BKEYDATA *bk;
+ DB *dbp;
+ DB_LOCK lock;
+ DB_MPOOLFILE *mpf;
+ PAGE *h;
+ RINTERNAL *ri;
+ db_indx_t indx, *inp;
+ int already_put, ret, t_ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ already_put = 0;
+ LOCK_INIT(lock);
+
+ COMPQUIET(h, NULL);
+ BAM_GET_ROOT(dbc, root_pgno, h, 0, mode, lock, ret);
+ if (ret != 0)
+ goto err1;
+
+ switch (TYPE(h)) {
+ case P_IBTREE:
+ for (indx = 0; indx < NUM_ENT(h); indx += O_INDX) {
+ bi = GET_BINTERNAL(dbp, h, indx);
+ if (B_TYPE(bi->type) == B_OVERFLOW &&
+ (ret = __db_traverse_big(dbc,
+ ((BOVERFLOW *)bi->data)->pgno,
+ callback, cookie)) != 0)
+ goto err;
+ if ((ret = __bam_traverse(
+ dbc, mode, bi->pgno, callback, cookie)) != 0)
+ goto err;
+ }
+ break;
+ case P_IRECNO:
+ for (indx = 0; indx < NUM_ENT(h); indx += O_INDX) {
+ ri = GET_RINTERNAL(dbp, h, indx);
+ if ((ret = __bam_traverse(
+ dbc, mode, ri->pgno, callback, cookie)) != 0)
+ goto err;
+ }
+ break;
+ case P_LBTREE:
+ inp = P_INP(dbp, h);
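+		/*
+		 * An overflow key shared by several on-page duplicates is
+		 * traversed only at its last occurrence; off-page duplicate
+		 * trees and overflow data items are chased per entry.
+		 */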
+ for (indx = 0; indx < NUM_ENT(h); indx += P_INDX) {
+ bk = GET_BKEYDATA(dbp, h, indx);
+ if (B_TYPE(bk->type) == B_OVERFLOW &&
+ (indx + P_INDX >= NUM_ENT(h) ||
+ inp[indx] != inp[indx + P_INDX])) {
+ if ((ret = __db_traverse_big(dbc,
+ GET_BOVERFLOW(dbp, h, indx)->pgno,
+ callback, cookie)) != 0)
+ goto err;
+ }
+ bk = GET_BKEYDATA(dbp, h, indx + O_INDX);
+ if (B_TYPE(bk->type) == B_DUPLICATE &&
+ (ret = __bam_traverse(dbc, mode,
+ GET_BOVERFLOW(dbp, h, indx + O_INDX)->pgno,
+ callback, cookie)) != 0)
+ goto err;
+ if (B_TYPE(bk->type) == B_OVERFLOW &&
+ (ret = __db_traverse_big(dbc,
+ GET_BOVERFLOW(dbp, h, indx + O_INDX)->pgno,
+ callback, cookie)) != 0)
+ goto err;
+ }
+ break;
+ case P_LDUP:
+ case P_LRECNO:
+ for (indx = 0; indx < NUM_ENT(h); indx += O_INDX) {
+ bk = GET_BKEYDATA(dbp, h, indx);
+ if (B_TYPE(bk->type) == B_OVERFLOW &&
+ (ret = __db_traverse_big(dbc,
+ GET_BOVERFLOW(dbp, h, indx)->pgno,
+ callback, cookie)) != 0)
+ goto err;
+ }
+ break;
+ default:
+ return (__db_pgfmt(dbp->env, h->pgno));
+ }
+
+ ret = callback(dbc, h, cookie, &already_put);
+
+err: if (!already_put && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+err1: if ((t_ret = __TLPUT(dbc, lock)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
diff --git a/src/btree/bt_upgrade.c b/src/btree/bt_upgrade.c
new file mode 100644
index 00000000..c9123351
--- /dev/null
+++ b/src/btree/bt_upgrade.c
@@ -0,0 +1,153 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_upgrade.h"
+#include "dbinc/btree.h"
+
+/*
+ * __bam_30_btreemeta --
+ * Upgrade the metadata pages from version 6 to version 7.
+ *
+ * PUBLIC: int __bam_30_btreemeta __P((DB *, char *, u_int8_t *));
+ */
+int
+__bam_30_btreemeta(dbp, real_name, buf)
+ DB *dbp;
+ char *real_name;
+ u_int8_t *buf;
+{
+ BTMETA2X *oldmeta;
+ BTMETA30 *newmeta;
+ ENV *env;
+ int ret;
+
+ env = dbp->env;
+
+ newmeta = (BTMETA30 *)buf;
+ oldmeta = (BTMETA2X *)buf;
+
+	/*
+	 * Move fields from the end of the structure up, so we do not
+	 * overwrite anything we still need. We are going to create a
+	 * new uid, so the fields at the end of the structure can be
+	 * moved first, overwriting the old uid.
+	 */
+
+ newmeta->re_pad = oldmeta->re_pad;
+ newmeta->re_len = oldmeta->re_len;
+ newmeta->minkey = oldmeta->minkey;
+ newmeta->maxkey = oldmeta->maxkey;
+ newmeta->dbmeta.free = oldmeta->free;
+ newmeta->dbmeta.flags = oldmeta->flags;
+ newmeta->dbmeta.type = P_BTREEMETA;
+
+ newmeta->dbmeta.version = 7;
+ /* Replace the unique ID. */
+ if ((ret = __os_fileid(env, real_name, 1, buf + 36)) != 0)
+ return (ret);
+
+ newmeta->root = 1;
+
+ return (0);
+}
+
+/*
+ * __bam_31_btreemeta --
+ * Upgrade the database from version 7 to version 8.
+ *
+ * PUBLIC: int __bam_31_btreemeta
+ * PUBLIC: __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
+ */
+int
+__bam_31_btreemeta(dbp, real_name, flags, fhp, h, dirtyp)
+ DB *dbp;
+ char *real_name;
+ u_int32_t flags;
+ DB_FH *fhp;
+ PAGE *h;
+ int *dirtyp;
+{
+ BTMETA30 *oldmeta;
+ BTMETA31 *newmeta;
+
+ COMPQUIET(dbp, NULL);
+ COMPQUIET(real_name, NULL);
+ COMPQUIET(fhp, NULL);
+
+ newmeta = (BTMETA31 *)h;
+ oldmeta = (BTMETA30 *)h;
+
+ /*
+	 * Copy the affected fields down the page.
+	 * The fields may overlap each other, so we
+	 * start at the bottom and use memmove.
+ */
+ newmeta->root = oldmeta->root;
+ newmeta->re_pad = oldmeta->re_pad;
+ newmeta->re_len = oldmeta->re_len;
+ newmeta->minkey = oldmeta->minkey;
+ newmeta->maxkey = oldmeta->maxkey;
+ memmove(newmeta->dbmeta.uid,
+ oldmeta->dbmeta.uid, sizeof(oldmeta->dbmeta.uid));
+ newmeta->dbmeta.flags = oldmeta->dbmeta.flags;
+ newmeta->dbmeta.record_count = 0;
+ newmeta->dbmeta.key_count = 0;
+ ZERO_LSN(newmeta->dbmeta.unused3);
+
+ /* Set the version number. */
+ newmeta->dbmeta.version = 8;
+
+ /* Upgrade the flags. */
+ if (LF_ISSET(DB_DUPSORT))
+ F_SET(&newmeta->dbmeta, BTM_DUPSORT);
+
+ *dirtyp = 1;
+ return (0);
+}
+
+/*
+ * __bam_31_lbtree --
+ * Upgrade the database btree leaf pages.
+ *
+ * PUBLIC: int __bam_31_lbtree
+ * PUBLIC: __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
+ */
+int
+__bam_31_lbtree(dbp, real_name, flags, fhp, h, dirtyp)
+ DB *dbp;
+ char *real_name;
+ u_int32_t flags;
+ DB_FH *fhp;
+ PAGE *h;
+ int *dirtyp;
+{
+ BKEYDATA *bk;
+ db_pgno_t pgno;
+ db_indx_t indx;
+ int ret;
+
+ ret = 0;
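+	/*
+	 * Walk the data items (the odd inp entries) and convert any
+	 * off-page duplicate trees they reference, updating the on-page
+	 * reference if the subtree's root page moved.
+	 */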
+ for (indx = O_INDX; indx < NUM_ENT(h); indx += P_INDX) {
+ bk = GET_BKEYDATA(dbp, h, indx);
+ if (B_TYPE(bk->type) == B_DUPLICATE) {
+ pgno = GET_BOVERFLOW(dbp, h, indx)->pgno;
+ if ((ret = __db_31_offdup(dbp, real_name, fhp,
+ LF_ISSET(DB_DUPSORT) ? 1 : 0, &pgno)) != 0)
+ break;
+ if (pgno != GET_BOVERFLOW(dbp, h, indx)->pgno) {
+ *dirtyp = 1;
+ GET_BOVERFLOW(dbp, h, indx)->pgno = pgno;
+ }
+ }
+ }
+
+ return (ret);
+}
diff --git a/src/btree/bt_verify.c b/src/btree/bt_verify.c
new file mode 100644
index 00000000..99354a58
--- /dev/null
+++ b/src/btree/bt_verify.c
@@ -0,0 +1,2805 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_verify.h"
+#include "dbinc/btree.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+
+static int __bam_safe_getdata __P((DB *, DB_THREAD_INFO *,
+ PAGE *, u_int32_t, int, DBT *, int *));
+static int __bam_vrfy_inp __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t,
+ db_indx_t *, u_int32_t));
+static int __bam_vrfy_treeorder __P((DB *, DB_THREAD_INFO *, PAGE *,
+ BINTERNAL *, BINTERNAL *, int (*)(DB *, const DBT *, const DBT *),
+ u_int32_t));
+static int __ram_vrfy_inp __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t,
+ db_indx_t *, u_int32_t));
+
+/*
+ * __bam_vrfy_meta --
+ * Verify the btree-specific part of a metadata page.
+ *
+ * PUBLIC: int __bam_vrfy_meta __P((DB *, VRFY_DBINFO *, BTMETA *,
+ * PUBLIC: db_pgno_t, u_int32_t));
+ */
+int
+__bam_vrfy_meta(dbp, vdp, meta, pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ BTMETA *meta;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ ENV *env;
+ VRFY_PAGEINFO *pip;
+ int isbad, t_ret, ret;
+ db_indx_t ovflsize;
+
+ env = dbp->env;
+ isbad = 0;
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+
+ /*
+ * If we came through __db_vrfy_pagezero, we have already checked the
+	 * common fields. However, we used the on-disk metadata page, which
+	 * may have been stale. We now have the page from mpool, so check that.
+ */
+ if ((ret = __db_vrfy_meta(dbp, vdp, &meta->dbmeta, pgno, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+
+ /* bt_minkey: must be >= 2; must produce sensible ovflsize */
+
+ /* avoid division by zero */
+ ovflsize = meta->minkey > 0 ?
+ B_MINKEY_TO_OVFLSIZE(dbp, meta->minkey, dbp->pgsize) : 0;
+
+ if (meta->minkey < 2 ||
+ ovflsize > B_MINKEY_TO_OVFLSIZE(dbp, DEFMINKEYPAGE, dbp->pgsize)) {
+ pip->bt_minkey = 0;
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1034",
+ "Page %lu: nonsensical bt_minkey value %lu on metadata page",
+ "%lu %lu"), (u_long)pgno, (u_long)meta->minkey));
+ } else
+ pip->bt_minkey = meta->minkey;
+
+	/* re_len: no constraint (may be zero or huge); just record it. */
+ pip->re_pad = meta->re_pad;
+ pip->re_len = meta->re_len;
+
+ /*
+	 * The root must not be the current page or 0, and it must be
+	 * within the database. If this metadata page is the master
+	 * metadata page of the file, then the root page had better be
+	 * page 1.
+ */
+ pip->root = 0;
+ if (meta->root == PGNO_INVALID ||
+ meta->root == pgno || !IS_VALID_PGNO(meta->root) ||
+ (pgno == PGNO_BASE_MD && meta->root != 1)) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1035",
+ "Page %lu: nonsensical root page %lu on metadata page",
+ "%lu %lu"), (u_long)pgno, (u_long)meta->root));
+ } else
+ pip->root = meta->root;
+
+ /* Flags. */
+ if (F_ISSET(&meta->dbmeta, BTM_RENUMBER))
+ F_SET(pip, VRFY_IS_RRECNO);
+
+ if (F_ISSET(&meta->dbmeta, BTM_SUBDB)) {
+ /*
+ * If this is a master db meta page, it had better not have
+ * duplicates.
+ */
+ if (F_ISSET(&meta->dbmeta, BTM_DUP) && pgno == PGNO_BASE_MD) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1036",
+"Page %lu: Btree metadata page has both duplicates and multiple databases",
+ "%lu"), (u_long)pgno));
+ }
+ F_SET(pip, VRFY_HAS_SUBDBS);
+ }
+
+ if (F_ISSET(&meta->dbmeta, BTM_DUP))
+ F_SET(pip, VRFY_HAS_DUPS);
+ if (F_ISSET(&meta->dbmeta, BTM_DUPSORT))
+ F_SET(pip, VRFY_HAS_DUPSORT);
+ if (F_ISSET(&meta->dbmeta, BTM_RECNUM))
+ F_SET(pip, VRFY_HAS_RECNUMS);
+ if (F_ISSET(pip, VRFY_HAS_RECNUMS) && F_ISSET(pip, VRFY_HAS_DUPS)) {
+ EPRINT((env, DB_STR_A("1037",
+ "Page %lu: Btree metadata page illegally has both recnums and dups",
+ "%lu"), (u_long)pgno));
+ isbad = 1;
+ }
+
+ if (F_ISSET(&meta->dbmeta, BTM_RECNO)) {
+ F_SET(pip, VRFY_IS_RECNO);
+ dbp->type = DB_RECNO;
+ } else if (F_ISSET(pip, VRFY_IS_RRECNO)) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1038",
+ "Page %lu: metadata page has renumber flag set but is not recno",
+ "%lu"), (u_long)pgno));
+ }
+
+#ifdef HAVE_COMPRESSION
+ if (F_ISSET(&meta->dbmeta, BTM_COMPRESS)) {
+ F_SET(pip, VRFY_HAS_COMPRESS);
+ if (!DB_IS_COMPRESSED(dbp)) {
+ ((BTREE *)dbp->bt_internal)->bt_compress =
+ __bam_defcompress;
+ ((BTREE *)dbp->bt_internal)->bt_decompress =
+ __bam_defdecompress;
+ }
+ /*
+ * Copy dup_compare to compress_dup_compare, and use the
+ * compression duplicate compare.
+ */
+ if (F_ISSET(pip, VRFY_HAS_DUPSORT)) {
+ if (dbp->dup_compare == NULL)
+ dbp->dup_compare = __bam_defcmp;
+ if (((BTREE *)dbp->bt_internal)->compress_dup_compare
+ == NULL) {
+ ((BTREE *)dbp->bt_internal)->
+ compress_dup_compare = dbp->dup_compare;
+ dbp->dup_compare = __bam_compress_dupcmp;
+ }
+ }
+ }
+
+ if (F_ISSET(pip, VRFY_HAS_RECNUMS) && F_ISSET(pip, VRFY_HAS_COMPRESS)) {
+ EPRINT((env, DB_STR_A("1039",
+ "Page %lu: Btree metadata page illegally has both recnums and compression",
+ "%lu"), (u_long)pgno));
+ isbad = 1;
+ }
+ if (F_ISSET(pip, VRFY_HAS_DUPS) && !F_ISSET(pip, VRFY_HAS_DUPSORT) &&
+ F_ISSET(pip, VRFY_HAS_COMPRESS)) {
+ EPRINT((env, DB_STR_A("1040",
+ "Page %lu: Btree metadata page illegally has both "
+ "unsorted duplicates and compression",
+ "%lu"), (u_long)pgno));
+ isbad = 1;
+ }
+#endif
+
+ if (F_ISSET(pip, VRFY_IS_RECNO) && F_ISSET(pip, VRFY_HAS_DUPS)) {
+ EPRINT((env, DB_STR_A("1041",
+ "Page %lu: recno metadata page specifies duplicates",
+ "%lu"), (u_long)pgno));
+ isbad = 1;
+ }
+
+ if (F_ISSET(&meta->dbmeta, BTM_FIXEDLEN))
+ F_SET(pip, VRFY_IS_FIXEDLEN);
+ else if (pip->re_len > 0) {
+ /*
+ * It's wrong to have an re_len if it's not a fixed-length
+ * database
+ */
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1042",
+ "Page %lu: re_len of %lu in non-fixed-length database",
+ "%lu %lu"), (u_long)pgno, (u_long)pip->re_len));
+ }
+
+ /*
+ * We do not check that the rest of the page is 0, because it may
+ * not be and may still be correct.
+ */
+
+err: if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+ if (LF_ISSET(DB_SALVAGE) &&
+ (t_ret = __db_salvage_markdone(vdp, pgno)) != 0 && ret == 0)
+ ret = t_ret;
+ return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __ram_vrfy_leaf --
+ * Verify a recno leaf page.
+ *
+ * PUBLIC: int __ram_vrfy_leaf __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t,
+ * PUBLIC: u_int32_t));
+ */
+int
+__ram_vrfy_leaf(dbp, vdp, h, pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ PAGE *h;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ BKEYDATA *bk;
+ ENV *env;
+ VRFY_PAGEINFO *pip;
+ db_indx_t i;
+ int ret, t_ret, isbad;
+ u_int32_t re_len_guess, len;
+
+ env = dbp->env;
+ isbad = 0;
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+
+ if (TYPE(h) != P_LRECNO) {
+ ret = __db_unknown_path(env, "__ram_vrfy_leaf");
+ goto err;
+ }
+
+ /*
+ * Verify (and, if relevant, save off) page fields common to
+ * all PAGEs.
+ */
+ if ((ret = __db_vrfy_datapage(dbp, vdp, h, pgno, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+
+ /*
+ * Verify inp[]. Return immediately if it returns DB_VERIFY_BAD;
+ * further checks are dangerous.
+ */
+ if ((ret = __bam_vrfy_inp(dbp,
+ vdp, h, pgno, &pip->entries, flags)) != 0)
+ goto err;
+
+ if (F_ISSET(pip, VRFY_HAS_DUPS)) {
+ EPRINT((env, DB_STR_A("1043",
+ "Page %lu: Recno database has dups",
+ "%lu"), (u_long)pgno));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+
+ /*
+ * Walk through inp and see if the lengths of all the records are the
+ * same--if so, this may be a fixed-length database, and we want to
+ * save off this value. We know inp to be safe if we've gotten this
+ * far.
+ */
+ re_len_guess = 0;
+ for (i = 0; i < NUM_ENT(h); i++) {
+ bk = GET_BKEYDATA(dbp, h, i);
+ /* KEYEMPTY. Go on. */
+ if (B_DISSET(bk->type))
+ continue;
+ if (bk->type == B_OVERFLOW)
+ len = ((BOVERFLOW *)bk)->tlen;
+ else if (bk->type == B_KEYDATA)
+ len = bk->len;
+ else {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1044",
+ "Page %lu: nonsensical type for item %lu",
+ "%lu %lu"), (u_long)pgno, (u_long)i));
+ continue;
+ }
+ if (re_len_guess == 0)
+ re_len_guess = len;
+
+ /*
+ * Is this item's len the same as the last one's? If not,
+ * reset to 0 and break--we don't have a single re_len.
+ * Otherwise, go on to the next item.
+ */
+ if (re_len_guess != len) {
+ re_len_guess = 0;
+ break;
+ }
+ }
+ pip->re_len = re_len_guess;
+
+ /* Save off record count. */
+ pip->rec_cnt = NUM_ENT(h);
+
+err: if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+ return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __bam_vrfy --
+ * Verify a btree leaf or internal page.
+ *
+ * PUBLIC: int __bam_vrfy __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t,
+ * PUBLIC: u_int32_t));
+ */
+int
+__bam_vrfy(dbp, vdp, h, pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ PAGE *h;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ ENV *env;
+ VRFY_PAGEINFO *pip;
+ int ret, t_ret, isbad;
+
+ env = dbp->env;
+ isbad = 0;
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+
+ switch (TYPE(h)) {
+ case P_IBTREE:
+ case P_IRECNO:
+ case P_LBTREE:
+ case P_LDUP:
+ break;
+ default:
+ ret = __db_unknown_path(env, "__bam_vrfy");
+ goto err;
+ }
+
+ /*
+ * Verify (and, if relevant, save off) page fields common to
+ * all PAGEs.
+ */
+ if ((ret = __db_vrfy_datapage(dbp, vdp, h, pgno, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+
+ /*
+ * The record count is, on internal pages, stored in an overloaded
+ * next_pgno field. Save it off; we'll verify it when we check
+ * overall database structure. We could overload the field
+ * in VRFY_PAGEINFO, too, but this seems gross, and space
+ * is not at such a premium.
+ */
+ pip->rec_cnt = RE_NREC(h);
+
+ /*
+ * Verify inp[].
+ */
+ if (TYPE(h) == P_IRECNO) {
+ if ((ret = __ram_vrfy_inp(dbp,
+ vdp, h, pgno, &pip->entries, flags)) != 0)
+ goto err;
+ } else if ((ret = __bam_vrfy_inp(dbp,
+ vdp, h, pgno, &pip->entries, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ EPRINT((env, DB_STR_A("1045",
+ "Page %lu: item order check unsafe: skipping",
+ "%lu"), (u_long)pgno));
+ } else if (!LF_ISSET(DB_NOORDERCHK) && (ret =
+ __bam_vrfy_itemorder(dbp,
+ vdp, vdp->thread_info, h, pgno, 0, 0, 0, flags)) != 0) {
+ /*
+ * We know that the elements of inp are reasonable.
+ *
+ * Check that elements fall in the proper order.
+ */
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+
+err: if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+ return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __ram_vrfy_inp --
+ * Verify that all entries in a P_IRECNO inp[] array are reasonable,
+ * and count them. Note that P_LRECNO uses __bam_vrfy_inp;
+ * P_IRECNOs are a special, and simpler, case, since they have
+ * RINTERNALs rather than BKEYDATA/BINTERNALs.
+ */
+static int
+__ram_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ PAGE *h;
+ db_pgno_t pgno;
+ db_indx_t *nentriesp;
+ u_int32_t flags;
+{
+ ENV *env;
+ RINTERNAL *ri;
+ VRFY_CHILDINFO child;
+ VRFY_PAGEINFO *pip;
+ int ret, t_ret, isbad;
+ u_int32_t himark, i, offset, nentries;
+ db_indx_t *inp;
+ u_int8_t *pagelayout, *p;
+
+ env = dbp->env;
+ isbad = 0;
+ memset(&child, 0, sizeof(VRFY_CHILDINFO));
+ nentries = 0;
+ pagelayout = NULL;
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+
+ if (TYPE(h) != P_IRECNO) {
+ ret = __db_unknown_path(env, "__ram_vrfy_inp");
+ goto err;
+ }
+
+ himark = dbp->pgsize;
+ if ((ret = __os_malloc(env, dbp->pgsize, &pagelayout)) != 0)
+ goto err;
+ memset(pagelayout, 0, dbp->pgsize);
+ inp = P_INP(dbp, h);
+ for (i = 0; i < NUM_ENT(h); i++) {
+ if ((u_int8_t *)inp + i >= (u_int8_t *)h + himark) {
+ EPRINT((env, DB_STR_A("1046",
+ "Page %lu: entries listing %lu overlaps data",
+ "%lu %lu"), (u_long)pgno, (u_long)i));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+ offset = inp[i];
+ /*
+ * Check that the item offset is reasonable: it points
+ * somewhere after the inp array and before the end of the
+ * page.
+ */
+ if (offset <= (u_int32_t)((u_int8_t *)inp + i -
+ (u_int8_t *)h) ||
+ offset > (u_int32_t)(dbp->pgsize - RINTERNAL_SIZE)) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1047",
+ "Page %lu: bad offset %lu at index %lu",
+ "%lu %lu %lu"), (u_long)pgno, (u_long)offset,
+ (u_long)i));
+ continue;
+ }
+
+ /* Update the high-water mark (what HOFFSET should be) */
+ if (offset < himark)
+ himark = offset;
+
+ nentries++;
+
+ /* Make sure this RINTERNAL is not multiply referenced. */
+ ri = GET_RINTERNAL(dbp, h, i);
+ if (pagelayout[offset] == 0) {
+ pagelayout[offset] = 1;
+ child.pgno = ri->pgno;
+ child.type = V_RECNO;
+ child.nrecs = ri->nrecs;
+ if ((ret = __db_vrfy_childput(vdp, pgno, &child)) != 0)
+ goto err;
+ } else {
+ EPRINT((env, DB_STR_A("1048",
+ "Page %lu: RINTERNAL structure at offset %lu referenced twice",
+ "%lu %lu"), (u_long)pgno, (u_long)offset));
+ isbad = 1;
+ }
+ }
+
+ for (p = pagelayout + himark;
+ p < pagelayout + dbp->pgsize;
+ p += RINTERNAL_SIZE)
+ if (*p != 1) {
+ EPRINT((env, DB_STR_A("1049",
+ "Page %lu: gap between items at offset %lu",
+ "%lu %lu"), (u_long)pgno,
+ (u_long)(p - pagelayout)));
+ isbad = 1;
+ }
+
+ if ((db_indx_t)himark != HOFFSET(h)) {
+ EPRINT((env, DB_STR_A("1050",
+ "Page %lu: bad HOFFSET %lu, appears to be %lu",
+ "%lu %lu %lu"), (u_long)pgno, (u_long)(HOFFSET(h)),
+ (u_long)himark));
+ isbad = 1;
+ }
+
+ *nentriesp = nentries;
+
+err: if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+ if (pagelayout != NULL)
+ __os_free(env, pagelayout);
+ return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
+
+typedef enum { VRFY_ITEM_NOTSET=0, VRFY_ITEM_BEGIN, VRFY_ITEM_END } VRFY_ITEM;
+
+/*
+ * __bam_vrfy_inp --
+ * Verify that all entries in inp[] array are reasonable;
+ * count them.
+ */
+static int
+__bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ PAGE *h;
+ db_pgno_t pgno;
+ db_indx_t *nentriesp;
+ u_int32_t flags;
+{
+ BKEYDATA *bk;
+ BOVERFLOW *bo;
+ ENV *env;
+ VRFY_CHILDINFO child;
+ VRFY_ITEM *pagelayout;
+ VRFY_PAGEINFO *pip;
+ u_int32_t himark, offset; /*
+ * These would be db_indx_ts
+ * but for alignment.
+ */
+ u_int32_t i, endoff, nentries;
+ int isbad, initem, isdupitem, ret, t_ret;
+
+ env = dbp->env;
+ isbad = isdupitem = 0;
+ nentries = 0;
+ memset(&child, 0, sizeof(VRFY_CHILDINFO));
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+
+ switch (TYPE(h)) {
+ case P_IBTREE:
+ case P_LBTREE:
+ case P_LDUP:
+ case P_LRECNO:
+ break;
+ default:
+ /*
+ * In the salvager, we might call this from a page which
+ * we merely suspect is a btree page. Otherwise, it
+ * shouldn't get called--if it is, that's a verifier bug.
+ */
+ if (LF_ISSET(DB_SALVAGE))
+ break;
+ ret = __db_unknown_path(env, "__bam_vrfy_inp");
+ goto err;
+ }
+
+ /*
+ * Loop through inp[], the array of items, until we either
+ * run out of entries or collide with the data. Keep track
+ * of h_offset in himark.
+ *
+ * For each element in inp[i], make sure it references a region
+ * that starts after the end of the inp array (as defined by
+ * NUM_ENT(h)), ends before the beginning of the page, doesn't
+ * overlap any other regions, and doesn't have a gap between
+ * it and the region immediately after it.
+ */
+ himark = dbp->pgsize;
+ if ((ret = __os_calloc(
+ env, dbp->pgsize, sizeof(pagelayout[0]), &pagelayout)) != 0)
+ goto err;
+ for (i = 0; i < NUM_ENT(h); i++) {
+ switch (ret = __db_vrfy_inpitem(dbp,
+ h, pgno, i, 1, flags, &himark, &offset)) {
+ case 0:
+ break;
+ case DB_VERIFY_BAD:
+ isbad = 1;
+ continue;
+ case DB_VERIFY_FATAL:
+ isbad = 1;
+ goto err;
+ default:
+ DB_ASSERT(env, ret != 0);
+ break;
+ }
+
+ /*
+ * We now have a plausible beginning for the item, and we know
+ * its length is safe.
+ *
+ * Mark the beginning and end in pagelayout so we can make sure
+ * items have no overlaps or gaps.
+ */
+ bk = GET_BKEYDATA(dbp, h, i);
+ if (pagelayout[offset] == VRFY_ITEM_NOTSET)
+ pagelayout[offset] = VRFY_ITEM_BEGIN;
+ else if (pagelayout[offset] == VRFY_ITEM_BEGIN) {
+ /*
+ * Having two inp entries that point at the same patch
+ * of page is legal if and only if the page is
+ * a btree leaf and they're onpage duplicate keys--
+ * that is, if (i % P_INDX) == 0.
+ */
+ if ((i % P_INDX == 0) && (TYPE(h) == P_LBTREE)) {
+ /* Flag for later. */
+ F_SET(pip, VRFY_HAS_DUPS);
+
+ /* Bump up nentries so we don't undercount. */
+ nentries++;
+
+ /*
+ * We'll check to make sure the end is
+ * equal, too.
+ */
+ isdupitem = 1;
+ } else {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1051",
+ "Page %lu: duplicated item %lu",
+ "%lu %lu"), (u_long)pgno, (u_long)i));
+ }
+ }
+
+ /*
+ * Mark the end. Its location varies with the page type
+ * and the item type.
+ *
+ * If the end already has a sign other than 0, do nothing--
+ * it's an overlap that we'll catch later.
+ */
+ switch (B_TYPE(bk->type)) {
+ case B_KEYDATA:
+ if (TYPE(h) == P_IBTREE)
+ /* It's a BINTERNAL. */
+ endoff = offset + BINTERNAL_SIZE(bk->len) - 1;
+ else
+ endoff = offset + BKEYDATA_SIZE(bk->len) - 1;
+ break;
+ case B_DUPLICATE:
+ /*
+ * Flag that we have dups; we'll check whether
+ * that's okay during the structure check.
+ */
+ F_SET(pip, VRFY_HAS_DUPS);
+ /* FALLTHROUGH */
+ case B_OVERFLOW:
+ /*
+ * Overflow entries on internal pages are stored
+ * as the _data_ of a BINTERNAL; overflow entries
+ * on leaf pages are stored as the entire entry.
+ */
+ endoff = offset +
+ ((TYPE(h) == P_IBTREE) ?
+ BINTERNAL_SIZE(BOVERFLOW_SIZE) :
+ BOVERFLOW_SIZE) - 1;
+ break;
+ default:
+ /*
+ * We'll complain later; for now, just mark
+ * a minimum.
+ */
+ endoff = offset + BKEYDATA_SIZE(0) - 1;
+ break;
+ }
+
+ /*
+ * If this is an onpage duplicate key we've seen before,
+ * the end had better coincide too.
+ */
+ if (isdupitem && pagelayout[endoff] != VRFY_ITEM_END) {
+ EPRINT((env, DB_STR_A("1052",
+ "Page %lu: duplicated item %lu",
+ "%lu %lu"), (u_long)pgno, (u_long)i));
+ isbad = 1;
+ } else if (pagelayout[endoff] == VRFY_ITEM_NOTSET)
+ pagelayout[endoff] = VRFY_ITEM_END;
+ isdupitem = 0;
+
+ /*
+ * There should be no deleted items in a quiescent tree,
+ * except in recno.
+ */
+ if (B_DISSET(bk->type) && TYPE(h) != P_LRECNO) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1053",
+ "Page %lu: item %lu marked deleted", "%lu %lu"),
+ (u_long)pgno, (u_long)i));
+ }
+
+ /*
+ * Check the type and such of bk--make sure it's reasonable
+ * for the pagetype.
+ */
+ switch (B_TYPE(bk->type)) {
+ case B_KEYDATA:
+ /*
+ * This is a normal, non-overflow BKEYDATA or BINTERNAL.
+ * The only thing to check is the len, and that's
+ * already been done.
+ */
+ break;
+ case B_DUPLICATE:
+ if (TYPE(h) == P_IBTREE) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1054",
+ "Page %lu: duplicate page referenced by internal btree page at item %lu",
+ "%lu %lu"), (u_long)pgno, (u_long)i));
+ break;
+ } else if (TYPE(h) == P_LRECNO) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1055",
+ "Page %lu: duplicate page referenced by recno page at item %lu",
+ "%lu %lu"), (u_long)pgno, (u_long)i));
+ break;
+ }
+ /* FALLTHROUGH */
+ case B_OVERFLOW:
+ bo = (TYPE(h) == P_IBTREE) ?
+ (BOVERFLOW *)(((BINTERNAL *)bk)->data) :
+ (BOVERFLOW *)bk;
+
+ if (B_TYPE(bk->type) == B_OVERFLOW)
+ /* Make sure tlen is reasonable. */
+ if (bo->tlen > dbp->pgsize * vdp->last_pgno) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1056",
+ "Page %lu: impossible tlen %lu, item %lu",
+ "%lu %lu %lu"), (u_long)pgno,
+ (u_long)bo->tlen, (u_long)i));
+ /* Don't save as a child. */
+ break;
+ }
+
+ if (!IS_VALID_PGNO(bo->pgno) || bo->pgno == pgno ||
+ bo->pgno == PGNO_INVALID) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1057",
+ "Page %lu: offpage item %lu has bad pgno %lu",
+ "%lu %lu %lu"), (u_long)pgno, (u_long)i,
+ (u_long)bo->pgno));
+ /* Don't save as a child. */
+ break;
+ }
+
+ child.pgno = bo->pgno;
+ child.type = (B_TYPE(bk->type) == B_OVERFLOW ?
+ V_OVERFLOW : V_DUPLICATE);
+ child.tlen = bo->tlen;
+ if ((ret = __db_vrfy_childput(vdp, pgno, &child)) != 0)
+ goto err;
+ break;
+ default:
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1058",
+ "Page %lu: item %lu of invalid type %lu",
+ "%lu %lu %lu"), (u_long)pgno, (u_long)i,
+ (u_long)B_TYPE(bk->type)));
+ break;
+ }
+ }
+
+ /*
+ * Now, loop through and make sure the items are contiguous and
+ * non-overlapping.
+ */
+ initem = 0;
+ for (i = himark; i < dbp->pgsize; i++)
+ if (initem == 0)
+ switch (pagelayout[i]) {
+ case VRFY_ITEM_NOTSET:
+ /* May be just for alignment. */
+ if (i != DB_ALIGN(i, sizeof(u_int32_t)))
+ continue;
+
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1059",
+ "Page %lu: gap between items at offset %lu",
+ "%lu %lu"), (u_long)pgno, (u_long)i));
+ /* Find the end of the gap */
+				for (; (size_t)(i + 1) < dbp->pgsize &&
+				    pagelayout[i + 1] == VRFY_ITEM_NOTSET; i++)
+ ;
+ break;
+ case VRFY_ITEM_BEGIN:
+ /* We've found an item. Check its alignment. */
+ if (i != DB_ALIGN(i, sizeof(u_int32_t))) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1060",
+ "Page %lu: offset %lu unaligned",
+ "%lu %lu"), (u_long)pgno,
+ (u_long)i));
+ }
+ initem = 1;
+ nentries++;
+ break;
+ case VRFY_ITEM_END:
+ /*
+ * We've hit the end of an item even though
+ * we don't think we're in one; must
+ * be an overlap.
+ */
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1061",
+ "Page %lu: overlapping items at offset %lu",
+ "%lu %lu"), (u_long)pgno, (u_long)i));
+ break;
+ }
+ else
+ switch (pagelayout[i]) {
+ case VRFY_ITEM_NOTSET:
+ /* In the middle of an item somewhere. Okay. */
+ break;
+ case VRFY_ITEM_END:
+ /* End of an item; switch to out-of-item mode.*/
+ initem = 0;
+ break;
+ case VRFY_ITEM_BEGIN:
+ /*
+ * Hit a second item beginning without an
+ * end. Overlap.
+ */
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1062",
+ "Page %lu: overlapping items at offset %lu",
+ "%lu %lu"), (u_long)pgno, (u_long)i));
+ break;
+ }
+
+ __os_free(env, pagelayout);
+
+ /* Verify HOFFSET. */
+ if ((db_indx_t)himark != HOFFSET(h)) {
+ EPRINT((env, DB_STR_A("1063",
+ "Page %lu: bad HOFFSET %lu, appears to be %lu",
+ "%lu %lu %lu"), (u_long)pgno, (u_long)HOFFSET(h),
+ (u_long)himark));
+ isbad = 1;
+ }
+
+err: if (nentriesp != NULL)
+ *nentriesp = nentries;
+
+ if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return ((isbad == 1 && ret == 0) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __bam_vrfy_itemorder --
+ * Make sure the items on a page sort correctly.
+ *
+ * Assumes that NUM_ENT(h) and inp[0]..inp[NUM_ENT(h) - 1] are
+ * reasonable; be sure that __bam_vrfy_inp has been called first.
+ *
+ * If ovflok is set, it also assumes that overflow page chains
+ * hanging off the current page have been sanity-checked, and so we
+ * can use __bam_cmp to verify their ordering. If it is not set,
+ * and we run into an overflow page, carp and return DB_VERIFY_BAD;
+ * we shouldn't be called if any exist.
+ *
+ * PUBLIC: int __bam_vrfy_itemorder __P((DB *, VRFY_DBINFO *, DB_THREAD_INFO *,
+ * PUBLIC: PAGE *, db_pgno_t, u_int32_t, int, int, u_int32_t));
+ */
+int
+__bam_vrfy_itemorder(dbp, vdp, ip, h, pgno, nentries, ovflok, hasdups, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ DB_THREAD_INFO *ip;
+ PAGE *h;
+ db_pgno_t pgno;
+ u_int32_t nentries;
+ int ovflok, hasdups;
+ u_int32_t flags;
+{
+ BINTERNAL *bi;
+ BKEYDATA *bk;
+ BOVERFLOW *bo;
+ BTREE *bt;
+ DB_MPOOLFILE *mpf;
+ DBC *dbc;
+ DBT dbta, dbtb, dup_1, dup_2, *p1, *p2, *tmp;
+ ENV *env;
+ PAGE *child;
+ db_pgno_t cpgno;
+ VRFY_PAGEINFO *pip;
+ db_indx_t i, *inp;
+ int adj, cmp, freedup_1, freedup_2, isbad, ret, t_ret;
+ int (*dupfunc) __P((DB *, const DBT *, const DBT *));
+ int (*func) __P((DB *, const DBT *, const DBT *));
+ void *buf1, *buf2, *tmpbuf;
+
+ /*
+ * We need to work in the ORDERCHKONLY environment where we might
+ * not have a pip, but we also may need to work in contexts where
+ * NUM_ENT isn't safe.
+ */
+ if (vdp != NULL) {
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+ nentries = pip->entries;
+ } else
+ pip = NULL;
+
+ env = dbp->env;
+ ret = isbad = 0;
+ bo = NULL; /* Shut up compiler. */
+
+ memset(&dbta, 0, sizeof(DBT));
+ F_SET(&dbta, DB_DBT_REALLOC);
+
+ memset(&dbtb, 0, sizeof(DBT));
+ F_SET(&dbtb, DB_DBT_REALLOC);
+
+ buf1 = buf2 = NULL;
+
+ DB_ASSERT(env, !LF_ISSET(DB_NOORDERCHK));
+
+ dupfunc = (dbp->dup_compare == NULL) ? __bam_defcmp : dbp->dup_compare;
+ if (TYPE(h) == P_LDUP)
+ func = dupfunc;
+ else {
+ func = __bam_defcmp;
+ if (dbp->bt_internal != NULL) {
+ bt = (BTREE *)dbp->bt_internal;
+ if (TYPE(h) == P_IBTREE && (bt->bt_compare != NULL ||
+ dupfunc != __bam_defcmp)) {
+ /*
+ * The problem here is that we cannot
+ * tell the difference between an off
+ * page duplicate internal page and
+ * a main database internal page.
+ * Walk down the tree to figure it out.
+ */
+ mpf = dbp->mpf;
+ child = h;
+ while (TYPE(child) == P_IBTREE) {
+ bi = GET_BINTERNAL(dbp, child, 0);
+ cpgno = bi->pgno;
+ if (child != h &&
+ (ret = __memp_fput(mpf,
+ vdp->thread_info, child,
+ DB_PRIORITY_UNCHANGED)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf,
+ &cpgno, vdp->thread_info,
+ NULL, 0, &child)) != 0)
+ goto err;
+ }
+ if (TYPE(child) == P_LDUP)
+ func = dupfunc;
+ else if (bt->bt_compare != NULL)
+ func = bt->bt_compare;
+ if ((ret = __memp_fput(mpf, vdp->thread_info,
+ child, DB_PRIORITY_UNCHANGED)) != 0)
+ goto err;
+ } else if (bt->bt_compare != NULL)
+ func = bt->bt_compare;
+ }
+ }
+
+ /*
+ * We alternate our use of dbta and dbtb so that we can walk
+ * through the page key-by-key without copying a dbt twice.
+ * p1 is always the dbt for index i - 1, and p2 for index i.
+ * Reset the data pointers in case we are retrying.
+ */
+retry: p1 = &dbta;
+ p1->data = NULL;
+ p2 = &dbtb;
+ p2->data = NULL;
+
+ /*
+ * Loop through the entries. nentries ought to contain the
+ * actual count, and so is a safe way to terminate the loop; whether
+ * we inc. by one or two depends on whether we're a leaf page--
+ * on a leaf page, we care only about keys. On internal pages
+ * and LDUP pages, we want to check the order of all entries.
+ *
+ * Note that on IBTREE pages or the index page of a partitioned
+ * database, we start with item 1, since item 0 doesn't get looked
+ * at by __bam_cmp.
+ */
+ inp = P_INP(dbp, h);
+ adj = (TYPE(h) == P_LBTREE) ? P_INDX : O_INDX;
+ for (i = (TYPE(h) == P_IBTREE || dbp->p_internal != NULL) ? adj : 0;
+ i < nentries; i += adj) {
+ /*
+ * Put key i-1, now in p2, into p1, by swapping DBTs and bufs.
+ */
+ tmp = p1;
+ p1 = p2;
+ p2 = tmp;
+ tmpbuf = buf1;
+ buf1 = buf2;
+ buf2 = tmpbuf;
+
+ /*
+ * Get key i into p2.
+ */
+ switch (TYPE(h)) {
+ case P_IBTREE:
+ bi = GET_BINTERNAL(dbp, h, i);
+ if (B_TYPE(bi->type) == B_OVERFLOW) {
+ bo = (BOVERFLOW *)(bi->data);
+ goto overflow;
+ } else {
+ p2->data = bi->data;
+ p2->size = bi->len;
+ }
+
+ /*
+ * The leftmost key on an internal page must be
+ * len 0, since it's just a placeholder and
+ * automatically sorts less than all keys.
+ *
+ * XXX
+ * This criterion does not currently hold!
+ * See todo list item #1686. Meanwhile, it's harmless
+ * to just not check for it.
+ */
+#if 0
+ if (i == 0 && bi->len != 0) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1064",
+ "Page %lu: lowest key on internal page of nonzero length",
+ "%lu"), (u_long)pgno));
+ }
+#endif
+ break;
+ case P_LBTREE:
+ case P_LDUP:
+ bk = GET_BKEYDATA(dbp, h, i);
+ if (B_TYPE(bk->type) == B_OVERFLOW) {
+ bo = (BOVERFLOW *)bk;
+ goto overflow;
+ } else {
+ p2->data = bk->data;
+ p2->size = bk->len;
+ }
+ break;
+ default:
+ /*
+ * This means our caller screwed up and sent us
+ * an inappropriate page.
+ */
+ ret = __db_unknown_path(env, "__bam_vrfy_itemorder");
+ goto err;
+ }
+
+ if (0) {
+ /*
+ * If ovflok != 1, we can't safely go chasing
+ * overflow pages with the normal routines now;
+ * they might be unsafe or nonexistent. Mark this
+ * page as incomplete and return.
+ *
+ * Note that we don't need to worry about freeing
+ * buffers, since they can't have been allocated
+ * if overflow items are unsafe.
+ */
+overflow: if (!ovflok) {
+ if (pip != NULL)
+ F_SET(pip, VRFY_INCOMPLETE);
+ goto err;
+ }
+
+ /*
+ * Overflow items are safe to chase. Do so.
+ * Fetch the overflow item into p2->data,
+ * NULLing it or reallocing it as appropriate.
+ *
+ * (We set p2->data to buf2 before the call
+ * so we're sure to realloc if we can and if p2
+ * was just pointing at a non-overflow item.)
+ */
+ p2->data = buf2;
+ if ((ret = __db_cursor_int(dbp, ip, NULL, DB_BTREE,
+ PGNO_INVALID, 0, DB_LOCK_INVALIDID, &dbc)) != 0)
+ goto err;
+ if ((ret = __db_goff(dbc,
+ p2, bo->tlen, bo->pgno, NULL, NULL)) != 0) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1065",
+ "Page %lu: error %lu in fetching overflow item %lu",
+ "%lu %lu %lu"), (u_long)pgno, (u_long)ret,
+ (u_long)i));
+ }
+ /* In case it got realloc'ed and thus changed. */
+ buf2 = p2->data;
+ }
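+
+ /*
+ * (The "if (0)" wrapper above makes the overflow block
+ * reachable only through the goto, so non-overflow items
+ * bypass it entirely.)
+ */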
+
+ /* Compare with the last key. */
+ if (p1->data != NULL && p2->data != NULL) {
+ cmp = inp[i] == inp[i - adj] ? 0 : func(dbp, p1, p2);
+
+ /* comparison succeeded */
+ if (cmp > 0) {
+ /*
+ * If we are looking at an internal page, we
+ * don't know whether it is part of the main
+ * database or in an off-page-duplicate tree.
+ * If the main comparator fails, retry with
+ * the duplicate comparator.
+ */
+ if (TYPE(h) == P_IBTREE && func != dupfunc) {
+ func = dupfunc;
+ goto retry;
+ }
+
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1066",
+ "Page %lu: out-of-order key at entry %lu",
+ "%lu %lu"), (u_long)pgno, (u_long)i));
+ /* proceed */
+ } else if (cmp == 0) {
+ if (inp[i] != inp[i - adj]) {
+ /* See above. */
+ if (TYPE(h) == P_IBTREE &&
+ func != dupfunc) {
+ func = dupfunc;
+ goto retry;
+ }
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1067",
+ "Page %lu: non-dup dup key at entry %lu",
+ "%lu %lu"), (u_long)pgno,
+ (u_long)i));
+ }
+ /*
+ * If they compared equally, this
+ * had better be a (sub)database with dups.
+ * Mark it so we can check during the
+ * structure check.
+ */
+ if (pip != NULL)
+ F_SET(pip, VRFY_HAS_DUPS);
+ else if (hasdups == 0) {
+ /* See above. */
+ if (TYPE(h) == P_IBTREE &&
+ func != dupfunc) {
+ func = dupfunc;
+ goto retry;
+ }
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1068",
+ "Page %lu: database with no duplicates has duplicated keys",
+ "%lu"), (u_long)pgno));
+ }
+
+ /*
+ * If we're a btree leaf, check to see
+ * if the data items of these on-page dups are
+ * in sorted order. If not, flag this, so
+ * that we can make sure during the
+ * structure checks that the DUPSORT flag
+ * is unset.
+ *
+ * At this point i points to a duplicate key.
+ * Compare the datum before it (same key)
+ * to the datum after it, i.e. i-1 to i+1.
+ */
+ if (TYPE(h) == P_LBTREE) {
+ /*
+ * Unsafe; continue and we'll pick
+ * up the bogus nentries later.
+ */
+ if (i + 1 >= (db_indx_t)nentries)
+ continue;
+
+ /*
+ * We don't bother with clever memory
+ * management with on-page dups,
+ * as it's only really a big win
+ * in the overflow case, and overflow
+ * dups are probably (?) rare.
+ */
+ if (((ret = __bam_safe_getdata(dbp,
+ ip, h, i - 1, ovflok,
+ &dup_1, &freedup_1)) != 0) ||
+ ((ret = __bam_safe_getdata(dbp,
+ ip, h, i + 1, ovflok,
+ &dup_2, &freedup_2)) != 0))
+ goto err;
+
+ /*
+ * If either of the data are NULL,
+ * it's because they're overflows and
+ * it's not safe to chase them now.
+ * Mark an incomplete and return.
+ */
+ if (dup_1.data == NULL ||
+ dup_2.data == NULL) {
+ DB_ASSERT(env, !ovflok);
+ if (pip != NULL)
+ F_SET(pip,
+ VRFY_INCOMPLETE);
+ goto err;
+ }
+
+ /*
+ * If the dups are out of order,
+ * flag this. It's not an error
+ * until we do the structure check
+ * and see whether DUPSORT is set.
+ */
+ if (dupfunc(dbp, &dup_1, &dup_2) > 0 &&
+ pip != NULL)
+ F_SET(pip, VRFY_DUPS_UNSORTED);
+
+ if (freedup_1)
+ __os_ufree(env, dup_1.data);
+ if (freedup_2)
+ __os_ufree(env, dup_2.data);
+ }
+ }
+ }
+ }
+
+err: if (pip != NULL && ((t_ret =
+ __db_vrfy_putpageinfo(env, vdp, pip)) != 0) && ret == 0)
+ ret = t_ret;
+
+ if (buf1 != NULL)
+ __os_ufree(env, buf1);
+ if (buf2 != NULL)
+ __os_ufree(env, buf2);
+
+ return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __bam_vrfy_structure --
+ * Verify the tree structure of a btree database (including the master
+ * database containing subdbs).
+ *
+ * PUBLIC: int __bam_vrfy_structure __P((DB *, VRFY_DBINFO *, db_pgno_t,
+ * PUBLIC: void *, void *, u_int32_t));
+ */
+int
+__bam_vrfy_structure(dbp, vdp, meta_pgno, lp, rp, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ db_pgno_t meta_pgno;
+ void *lp, *rp;
+ u_int32_t flags;
+{
+ DB *pgset;
+ ENV *env;
+ VRFY_PAGEINFO *mip, *rip;
+ db_pgno_t root, p;
+ int t_ret, ret;
+ u_int32_t nrecs, level, relen, stflags;
+
+ env = dbp->env;
+ mip = rip = NULL;
+ pgset = vdp->pgset;
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, meta_pgno, &mip)) != 0)
+ return (ret);
+
+ if ((ret = __db_vrfy_pgset_get(pgset,
+ vdp->thread_info, vdp->txn, meta_pgno, (int *)&p)) != 0)
+ goto err;
+ if (p != 0) {
+ EPRINT((env, DB_STR_A("1069",
+ "Page %lu: btree metadata page observed twice",
+ "%lu"), (u_long)meta_pgno));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+ if ((ret = __db_vrfy_pgset_inc(
+ pgset, vdp->thread_info, vdp->txn, meta_pgno)) != 0)
+ goto err;
+
+ root = mip->root;
+
+ if (root == 0) {
+ EPRINT((env, DB_STR_A("1070",
+ "Page %lu: btree metadata page has no root",
+ "%lu"), (u_long)meta_pgno));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, root, &rip)) != 0)
+ goto err;
+
+ switch (rip->type) {
+ case P_IBTREE:
+ case P_LBTREE:
+ stflags = flags | DB_ST_TOPLEVEL;
+ if (F_ISSET(mip, VRFY_HAS_DUPS))
+ stflags |= DB_ST_DUPOK;
+ if (F_ISSET(mip, VRFY_HAS_DUPSORT))
+ stflags |= DB_ST_DUPSORT;
+ if (F_ISSET(mip, VRFY_HAS_RECNUMS))
+ stflags |= DB_ST_RECNUM;
+ ret = __bam_vrfy_subtree(dbp,
+ vdp, root, lp, rp, stflags, NULL, NULL, NULL);
+ break;
+ case P_IRECNO:
+ case P_LRECNO:
+ stflags =
+ flags | DB_ST_RECNUM | DB_ST_IS_RECNO | DB_ST_TOPLEVEL;
+ if (mip->re_len > 0)
+ stflags |= DB_ST_RELEN;
+ if ((ret = __bam_vrfy_subtree(dbp, vdp,
+ root, NULL, NULL, stflags, &level, &nrecs, &relen)) != 0)
+ goto err;
+ /*
+ * Even if mip->re_len > 0, re_len may come back zero if the
+ * tree is empty. It should be okay to just skip the check in
+ * that case: relen can only come back zero when the tree holds
+ * no non-deleted keys at all.
+ */
+ if (mip->re_len > 0 && relen > 0 && mip->re_len != relen) {
+ EPRINT((env, DB_STR_A("1071",
+ "Page %lu: recno database has bad re_len %lu",
+ "%lu %lu"), (u_long)meta_pgno, (u_long)relen));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+ ret = 0;
+ break;
+ case P_LDUP:
+ EPRINT((env, DB_STR_A("1072",
+ "Page %lu: duplicate tree referenced from metadata page",
+ "%lu"), (u_long)meta_pgno));
+ ret = DB_VERIFY_BAD;
+ break;
+ default:
+ EPRINT((env, DB_STR_A("1073",
+ "Page %lu: btree root of incorrect type %lu on metadata page",
+ "%lu %lu"), (u_long)meta_pgno, (u_long)rip->type));
+ ret = DB_VERIFY_BAD;
+ break;
+ }
+
+err: if (mip != NULL && ((t_ret =
+ __db_vrfy_putpageinfo(env, vdp, mip)) != 0) && ret == 0)
+ ret = t_ret;
+ if (rip != NULL && ((t_ret =
+ __db_vrfy_putpageinfo(env, vdp, rip)) != 0) && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
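+
+/*
+ * Note on the DB_ST_* flags assembled above: they encode what the metadata
+ * page claims about the tree (duplicate support, sorted duplicates, record
+ * numbering, fixed re_len), so that __bam_vrfy_subtree can flag any page
+ * whose contents contradict the claim.
+ */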
+
+/*
+ * __bam_vrfy_subtree --
+ * Verify a subtree (or entire) btree with specified root.
+ *
+ * Note that this is public because it must be called to verify
+ * offpage dup trees, including from hash.
+ *
+ * PUBLIC: int __bam_vrfy_subtree __P((DB *, VRFY_DBINFO *, db_pgno_t, void *,
+ * PUBLIC: void *, u_int32_t, u_int32_t *, u_int32_t *, u_int32_t *));
+ */
+int
+__bam_vrfy_subtree(dbp, vdp, pgno, l, r, flags, levelp, nrecsp, relenp)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ db_pgno_t pgno;
+ void *l, *r;
+ u_int32_t flags, *levelp, *nrecsp, *relenp;
+{
+ BINTERNAL *li, *ri;
+ DB *pgset;
+ DBC *cc;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ PAGE *h;
+ VRFY_CHILDINFO *child;
+ VRFY_PAGEINFO *pip;
+ db_indx_t i;
+ db_pgno_t next_pgno, prev_pgno;
+ db_recno_t child_nrecs, nrecs;
+ u_int32_t child_level, child_relen, j, level, relen, stflags;
+ u_int8_t leaf_type;
+ int (*func) __P((DB *, const DBT *, const DBT *));
+ int isbad, p, ret, t_ret, toplevel;
+
+ if (levelp != NULL) /* Don't leave uninitialized on error. */
+ *levelp = 0;
+ if (nrecsp != NULL)
+ *nrecsp = 0;
+
+ env = dbp->env;
+ mpf = dbp->mpf;
+ h = NULL;
+ next_pgno = prev_pgno = PGNO_INVALID;
+ nrecs = 0;
+ relen = 0;
+ leaf_type = P_INVALID;
+ isbad = ret = 0;
+
+ /* Provide feedback on our progress to the application. */
+ if (!LF_ISSET(DB_SALVAGE))
+ __db_vrfy_struct_feedback(dbp, vdp);
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+
+ cc = NULL;
+ level = pip->bt_level;
+
+ toplevel = LF_ISSET(DB_ST_TOPLEVEL) ? 1 : 0;
+ LF_CLR(DB_ST_TOPLEVEL);
+
+ /*
+ * If this is the root, initialize the vdp's prev- and next-pgno
+ * accounting.
+ *
+ * For each leaf page we hit, we'll want to make sure that
+ * vdp->prev_pgno is the same as pip->prev_pgno and vdp->next_pgno is
+ * our page number. Then, we'll set vdp->next_pgno to pip->next_pgno
+ * and vdp->prev_pgno to our page number, and the next leaf page in
+ * line should be able to do the same verification.
+ */
+ if (toplevel) {
+ /*
+ * Cache the values stored in the vdp so that if we're an
+ * auxiliary tree such as an off-page duplicate set, our
+ * caller's leaf page chain doesn't get lost.
+ */
+ prev_pgno = vdp->prev_pgno;
+ next_pgno = vdp->next_pgno;
+ leaf_type = vdp->leaf_type;
+ vdp->next_pgno = vdp->prev_pgno = PGNO_INVALID;
+ vdp->leaf_type = P_INVALID;
+ }
+
+ /*
+ * We are recursively descending a btree, starting from the root
+ * and working our way out to the leaves.
+ *
+ * There are five cases we need to deal with:
+ * 1. pgno is a recno leaf page. Any children are overflows.
+ * 2. pgno is a duplicate leaf page. Any children
+ * are overflow pages; traverse them, and then return
+ * level and nrecs.
+ * 3. pgno is an ordinary leaf page. Check whether dups are
+ * allowed, and if so, traverse any off-page dups or
+ * overflows. Then return nrecs and level.
+ * 4. pgno is a recno internal page. Recursively check any
+ * child pages, making sure their levels are one lower
+ * and their nrecs sum to ours.
+ * 5. pgno is a btree internal page. Same as #4, plus we
+ * must verify that for each pair of BINTERNAL entries
+ * N and N+1, the leftmost item on N's child sorts
+ * greater than N, and the rightmost item on N's child
+ * sorts less than N+1.
+ *
+ * Furthermore, in any sorted page type (P_LDUP, P_LBTREE, P_IBTREE),
+ * we need to verify the internal sort order is correct if,
+ * due to overflow items, we were not able to do so earlier.
+ */
+ switch (pip->type) {
+ case P_LRECNO:
+ case P_LDUP:
+ case P_LBTREE:
+ /*
+ * Cases 1, 2 and 3.
+ *
+ * We're some sort of leaf page; verify
+ * that our linked list of leaves is consistent.
+ */
+ if (vdp->leaf_type == P_INVALID) {
+ /*
+ * First leaf page. Set the type that all its
+ * successors should be, and verify that our prev_pgno
+ * is PGNO_INVALID.
+ */
+ vdp->leaf_type = pip->type;
+ if (pip->prev_pgno != PGNO_INVALID)
+ goto bad_prev;
+ } else {
+ /*
+ * Successor leaf page. Check our type, the previous
+ * page's next_pgno, and our prev_pgno.
+ */
+ if (pip->type != vdp->leaf_type) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1074",
+ "Page %lu: unexpected page type %lu found in leaf chain (expected %lu)",
+ "%lu %lu %lu"), (u_long)pip->pgno,
+ (u_long)pip->type,
+ (u_long)vdp->leaf_type));
+ }
+
+ /*
+ * Don't do the prev/next_pgno checks if we've lost
+ * leaf pages due to another corruption.
+ */
+ if (!F_ISSET(vdp, VRFY_LEAFCHAIN_BROKEN)) {
+ if (pip->pgno != vdp->next_pgno) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1075",
+ "Page %lu: incorrect next_pgno %lu found in leaf chain (should be %lu)",
+ "%lu %lu %lu"),
+ (u_long)vdp->prev_pgno,
+ (u_long)vdp->next_pgno,
+ (u_long)pip->pgno));
+ }
+ if (pip->prev_pgno != vdp->prev_pgno) {
+bad_prev: isbad = 1;
+ EPRINT((env, DB_STR_A("1076",
+ "Page %lu: incorrect prev_pgno %lu found in leaf chain (should be %lu)",
+ "%lu %lu %lu"),
+ (u_long)pip->pgno,
+ (u_long)pip->prev_pgno,
+ (u_long)vdp->prev_pgno));
+ }
+ }
+ }
+ vdp->prev_pgno = pip->pgno;
+ vdp->next_pgno = pip->next_pgno;
+ F_CLR(vdp, VRFY_LEAFCHAIN_BROKEN);
+
+ /*
+ * Overflow pages are common to all three leaf types;
+ * traverse the child list, looking for overflows.
+ */
+ if ((ret = __db_vrfy_childcursor(vdp, &cc)) != 0)
+ goto err;
+ for (ret = __db_vrfy_ccset(cc, pgno, &child); ret == 0;
+ ret = __db_vrfy_ccnext(cc, &child))
+ if (child->type == V_OVERFLOW &&
+ (ret = __db_vrfy_ovfl_structure(dbp, vdp,
+ child->pgno, child->tlen,
+ flags | DB_ST_OVFL_LEAF)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto done;
+ }
+
+ if ((ret = __db_vrfy_ccclose(cc)) != 0)
+ goto err;
+ cc = NULL;
+
+ /* Case 1 */
+ if (pip->type == P_LRECNO) {
+ if (!LF_ISSET(DB_ST_IS_RECNO) &&
+ !(LF_ISSET(DB_ST_DUPOK) &&
+ !LF_ISSET(DB_ST_DUPSORT))) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1077",
+ "Page %lu: recno leaf page non-recno tree",
+ "%lu"), (u_long)pgno));
+ goto done;
+ }
+ goto leaf;
+ } else if (LF_ISSET(DB_ST_IS_RECNO)) {
+ /*
+ * It's a non-recno leaf. Had better not be a recno
+ * subtree.
+ */
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1078",
+ "Page %lu: non-recno leaf page in recno tree",
+ "%lu"), (u_long)pgno));
+ goto done;
+ }
+
+ /* Case 2--no more work. */
+ if (pip->type == P_LDUP)
+ goto leaf;
+
+ /* Case 3 */
+
+ /* Check if we have any dups. */
+ if (F_ISSET(pip, VRFY_HAS_DUPS)) {
+ /* If dups aren't allowed in this btree, trouble. */
+ if (!LF_ISSET(DB_ST_DUPOK)) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1079",
+ "Page %lu: duplicates in non-dup btree",
+ "%lu"), (u_long)pgno));
+ } else {
+ /*
+ * We correctly have dups. If any are off-page,
+ * traverse those btrees recursively.
+ */
+ if ((ret =
+ __db_vrfy_childcursor(vdp, &cc)) != 0)
+ goto err;
+ for (ret = __db_vrfy_ccset(cc, pgno, &child);
+ ret == 0;
+ ret = __db_vrfy_ccnext(cc, &child)) {
+ stflags =
+ flags | DB_ST_RECNUM | DB_ST_DUPSET;
+ /* Skip any overflow entries. */
+ if (child->type == V_DUPLICATE) {
+ if ((ret = __db_vrfy_duptype(
+ dbp, vdp, child->pgno,
+ stflags)) != 0) {
+ isbad = 1;
+ /* Next child. */
+ continue;
+ }
+ if ((ret = __bam_vrfy_subtree(
+ dbp, vdp, child->pgno,
+ NULL, NULL,
+ stflags | DB_ST_TOPLEVEL,
+ NULL, NULL, NULL)) != 0) {
+ if (ret ==
+ DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+ }
+ }
+
+ if ((ret = __db_vrfy_ccclose(cc)) != 0)
+ goto err;
+ cc = NULL;
+
+ /*
+ * If VRFY_DUPS_UNSORTED is set,
+ * DB_ST_DUPSORT had better not be.
+ */
+ if (F_ISSET(pip, VRFY_DUPS_UNSORTED) &&
+ LF_ISSET(DB_ST_DUPSORT)) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1080",
+ "Page %lu: unsorted duplicate set in sorted-dup database",
+ "%lu"), (u_long)pgno));
+ }
+ }
+ }
+ goto leaf;
+ case P_IBTREE:
+ case P_IRECNO:
+ /* We handle these below. */
+ break;
+ default:
+ /*
+ * If a P_IBTREE or P_IRECNO contains a reference to an
+ * invalid page, we'll wind up here; handle it gracefully.
+ * Note that the code at the "done" label assumes that the
+ * current page is a btree/recno one of some sort; this
+ * is not the case here, so we goto err.
+ *
+ * If the page is entirely zeroed, its pip->type will be a lie
+ * (we assumed it was a hash page, as they're allowed to be
+ * zeroed); handle this case specially.
+ */
+ if (F_ISSET(pip, VRFY_IS_ALLZEROES))
+ ZEROPG_ERR_PRINT(env, pgno, DB_STR_P(
+ "btree or recno page"));
+ else
+ EPRINT((env, DB_STR_A("1081",
+ "Page %lu: btree or recno page is of inappropriate type %lu",
+ "%lu %lu"), (u_long)pgno, (u_long)pip->type));
+
+ /*
+ * We probably lost a leaf page (or more if this was an
+ * internal page) from our prev/next_pgno chain. Flag
+ * that this is expected; we don't want or need to
+ * spew error messages about erroneous prev/next_pgnos,
+ * since that's probably not the real problem.
+ */
+ F_SET(vdp, VRFY_LEAFCHAIN_BROKEN);
+
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+
+ /*
+ * Cases 4 & 5: This is a btree or recno internal page. For each child,
+ * recurse, keeping a running count of nrecs and making sure the level
+ * is always reasonable.
+ */
+ if ((ret = __db_vrfy_childcursor(vdp, &cc)) != 0)
+ goto err;
+ for (ret = __db_vrfy_ccset(cc, pgno, &child); ret == 0;
+ ret = __db_vrfy_ccnext(cc, &child))
+ if (child->type == V_RECNO) {
+ if (pip->type != P_IRECNO) {
+ ret = __db_unknown_path(
+ env, "__bam_vrfy_subtree");
+ goto err;
+ }
+ if ((ret = __bam_vrfy_subtree(dbp, vdp, child->pgno,
+ NULL, NULL, flags, &child_level, &child_nrecs,
+ &child_relen)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto done;
+ }
+
+ if (LF_ISSET(DB_ST_RELEN)) {
+ if (relen == 0)
+ relen = child_relen;
+ /*
+ * child_relen may be zero if the child subtree
+ * is empty.
+ */
+ else if (child_relen > 0 &&
+ relen != child_relen) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1082",
+ "Page %lu: recno page returned bad re_len %lu",
+ "%lu %lu"), (u_long)child->pgno,
+ (u_long)child_relen));
+ }
+ if (relenp)
+ *relenp = relen;
+ }
+ if (LF_ISSET(DB_ST_RECNUM)) {
+ if (child->nrecs != child_nrecs) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1083",
+ "Page %lu: record count incorrect: actual %lu, in record %lu",
+ "%lu %lu %lu"),
+ (u_long)child->pgno,
+ (u_long)child_nrecs,
+ (u_long)child->nrecs));
+ }
+ nrecs += child_nrecs;
+ }
+ if (isbad == 0 && level != child_level + 1) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1084",
+ "Page %lu: recno level incorrect: got %lu, expected %lu",
+ "%lu %lu %lu"),
+ (u_long)child->pgno, (u_long)child_level,
+ (u_long)(level - 1)));
+ }
+ } else if (child->type == V_OVERFLOW) {
+ /*
+ * It is possible for one internal page to reference
+ * a single overflow page twice, if all the items
+ * in the subtree referenced by slot 0 are deleted,
+ * then a similar number of items are put back
+ * before the key that formerly had been in slot 1.
+ *
+ * (Btree doesn't look at the key in slot 0, so the
+ * fact that the key formerly at slot 1 is the "wrong"
+ * parent of the stuff in the slot 0 subtree isn't
+ * really incorrect.)
+ *
+ * __db_vrfy_ovfl_structure is designed to be
+ * efficiently called multiple times for multiple
+ * references; call it here as many times as is
+ * appropriate.
+ */
+
+ /* Otherwise, __db_vrfy_childput would be broken. */
+ DB_ASSERT(env, child->refcnt >= 1);
+
+ /*
+ * An overflow referenced more than twice here
+ * shouldn't happen.
+ */
+ if (child->refcnt > 2) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1085",
+ "Page %lu: overflow page %lu referenced more than twice from internal page",
+ "%lu %lu"), (u_long)pgno,
+ (u_long)child->pgno));
+ } else
+ for (j = 0; j < child->refcnt; j++)
+ if ((ret = __db_vrfy_ovfl_structure(dbp,
+ vdp, child->pgno, child->tlen,
+ flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto done;
+ }
+ }
+
+ if ((ret = __db_vrfy_ccclose(cc)) != 0)
+ goto err;
+ cc = NULL;
+
+ /* We're done with case 4. */
+ if (pip->type == P_IRECNO)
+ goto done;
+
+ /*
+ * Case 5. Btree internal pages.
+ * As described above, we need to iterate through all the
+ * items on the page and make sure that our children sort appropriately
+ * with respect to them.
+ *
+ * For each entry, li will be the "left-hand" key for the entry
+ * itself, which must sort lower than all entries on its child;
+ * ri will be the key to its right, which must sort greater.
+ */
+ if (h == NULL &&
+ (ret = __memp_fget(mpf, &pgno, vdp->thread_info, NULL, 0, &h)) != 0)
+ goto err;
+ for (i = 0; i < pip->entries; i += O_INDX) {
+ li = GET_BINTERNAL(dbp, h, i);
+ ri = (i + O_INDX < pip->entries) ?
+ GET_BINTERNAL(dbp, h, i + O_INDX) : r;
+
+ /*
+ * The leftmost key is forcibly sorted less than all entries,
+ * so don't bother passing it.
+ */
+ if ((ret = __bam_vrfy_subtree(dbp, vdp, li->pgno,
+ i == 0 ? NULL : li, ri, flags, &child_level,
+ &child_nrecs, NULL)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto done;
+ }
+
+ if (LF_ISSET(DB_ST_RECNUM)) {
+ /*
+ * Keep a running tally on the actual record count so
+ * we can return it to our parent (if we have one) or
+ * compare it to the NRECS field if we're a root page.
+ */
+ nrecs += child_nrecs;
+
+ /*
+ * Make sure the actual record count of the child
+ * is equal to the value in the BINTERNAL structure.
+ */
+ if (li->nrecs != child_nrecs) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1086",
+ "Page %lu: item %lu has incorrect record count of %lu, should be %lu",
+ "%lu %lu %lu %lu"), (u_long)pgno,
+ (u_long)i, (u_long)li->nrecs,
+ (u_long)child_nrecs));
+ }
+ }
+
+ if (level != child_level + 1) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1087",
+ "Page %lu: Btree level incorrect: got %lu, expected %lu",
+ "%lu %lu %lu"), (u_long)li->pgno,
+ (u_long)child_level, (u_long)(level - 1)));
+ }
+ }
+
+ if (0) {
+leaf: level = LEAFLEVEL;
+ if (LF_ISSET(DB_ST_RECNUM))
+ nrecs = pip->rec_cnt;
+
+ /* XXX
+ * We should verify that the record count on a leaf page
+ * is the sum of the number of keys and the number of
+ * records in its off-page dups. This requires looking
+ * at the page again, however, and it may all be changing
+ * soon, so for now we don't bother.
+ */
+
+ if (LF_ISSET(DB_ST_RELEN) && relenp)
+ *relenp = pip->re_len;
+ }
+done: if (F_ISSET(pip, VRFY_INCOMPLETE) && isbad == 0 && ret == 0) {
+ /*
+ * During the page-by-page pass, item order verification was
+ * not finished due to the presence of overflow items. If
+ * isbad == 0, though, it's now safe to do so, as we've
+ * traversed any child overflow pages. Do it.
+ */
+ if (h == NULL && (ret = __memp_fget(mpf, &pgno,
+ vdp->thread_info, NULL, 0, &h)) != 0)
+ goto err;
+ if ((ret = __bam_vrfy_itemorder(dbp,
+ vdp, vdp->thread_info, h, pgno, 0, 1, 0, flags)) != 0)
+ goto err;
+ F_CLR(pip, VRFY_INCOMPLETE);
+ }
+
+ /*
+ * It's possible to get to this point with a page that has no
+ * items, but without having detected any sort of failure yet.
+ * Having zero items is legal if it's a leaf--it may be the
+ * root page in an empty tree, or the tree may have been
+ * modified with the DB_REVSPLITOFF flag set (there's no way
+ * to tell from what's on disk). For an internal page,
+ * though, having no items is a problem (all internal pages
+ * must have children).
+ */
+ if (isbad == 0 && ret == 0) {
+ if (h == NULL && (ret = __memp_fget(mpf, &pgno,
+ vdp->thread_info, NULL, 0, &h)) != 0)
+ goto err;
+
+ if (NUM_ENT(h) == 0 && ISINTERNAL(h)) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1088",
+ "Page %lu: internal page is empty and should not be",
+ "%lu"), (u_long)pgno));
+ goto err;
+ }
+ }
+
+ /*
+ * Our parent has sent us BINTERNAL pointers to parent records
+ * so that we can verify our place with respect to them. If it's
+ * appropriate--we have a default sort function--verify this.
+ */
+ if (isbad == 0 && ret == 0 && !LF_ISSET(DB_NOORDERCHK) &&
+ pip->type != P_IRECNO && pip->type != P_LRECNO) {
+ if (h == NULL && (ret = __memp_fget(mpf, &pgno,
+ vdp->thread_info, NULL, 0, &h)) != 0)
+ goto err;
+
+ /*
+ * __bam_vrfy_treeorder needs to know what comparison function
+ * to use. If DB_ST_DUPSET is set, we're in a duplicate tree
+ * and we use the duplicate comparison function; otherwise,
+ * use the btree one. If unset, use the default, of course.
+ */
+ func = LF_ISSET(DB_ST_DUPSET) ? dbp->dup_compare :
+ ((BTREE *)dbp->bt_internal)->bt_compare;
+ if (func == NULL)
+ func = __bam_defcmp;
+
+ if ((ret = __bam_vrfy_treeorder(dbp,
+ vdp->thread_info, h, l, r, func, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+ }
+
+ /*
+ * For leaf pages nrecs was taken directly from pip->rec_cnt above,
+ * so this check trivially succeeds there; no harm done.
+ *
+ * Internal pages below the top level do not store their own
+ * record numbers, so we skip them.
+ */
+ if (LF_ISSET(DB_ST_RECNUM) && nrecs != pip->rec_cnt && toplevel) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1089",
+ "Page %lu: bad record count: has %lu records, claims %lu",
+ "%lu %lu %lu"), (u_long)pgno, (u_long)nrecs,
+ (u_long)pip->rec_cnt));
+ }
+
+ if (levelp)
+ *levelp = level;
+ if (nrecsp)
+ *nrecsp = nrecs;
+
+ pgset = vdp->pgset;
+ if ((ret = __db_vrfy_pgset_get(pgset,
+ vdp->thread_info, vdp->txn, pgno, &p)) != 0)
+ goto err;
+ if (p != 0) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1090",
+ "Page %lu: linked twice", "%lu"), (u_long)pgno));
+ } else if ((ret =
+ __db_vrfy_pgset_inc(pgset, vdp->thread_info, vdp->txn, pgno)) != 0)
+ goto err;
+
+ if (toplevel)
+ /*
+ * The last page's next_pgno in the leaf chain should have been
+ * PGNO_INVALID.
+ */
+ if (vdp->next_pgno != PGNO_INVALID) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1091",
+ "Page %lu: unterminated leaf chain",
+ "%lu"), (u_long)vdp->prev_pgno));
+ }
+
+err: if (toplevel) {
+ /* Restore our caller's settings. */
+ vdp->next_pgno = next_pgno;
+ vdp->prev_pgno = prev_pgno;
+ vdp->leaf_type = leaf_type;
+ }
+
+ if (h != NULL && (t_ret = __memp_fput(mpf,
+ vdp->thread_info, h, DB_PRIORITY_UNCHANGED)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+ if (cc != NULL && ((t_ret = __db_vrfy_ccclose(cc)) != 0) && ret == 0)
+ ret = t_ret;
+ return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
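+
+/*
+ * Leaf-chain accounting used above, illustrated with hypothetical page
+ * numbers:
+ *
+ *	leaf 3 <-> leaf 7 <-> leaf 5
+ *
+ * When we visit leaf 7 we require vdp->next_pgno == 7 and
+ * pip->prev_pgno == vdp->prev_pgno (3); we then set vdp->prev_pgno to 7
+ * and vdp->next_pgno to pip->next_pgno (5), so the next leaf in line can
+ * repeat the check.
+ */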
+
+/*
+ * __bam_vrfy_treeorder --
+ * Verify that the lowest key on a page sorts greater than the
+ * BINTERNAL which points to it (lp), and the highest key
+ * sorts less than the BINTERNAL above that (rp).
+ *
+ * If lp is NULL, this means that it was the leftmost key on the
+ * parent, which (regardless of sort function) sorts less than
+ * all keys. No need to check it.
+ *
+ * If rp is NULL, lp was the highest key on the parent, so there's
+ * no higher key we must sort less than.
+ */
+static int
+__bam_vrfy_treeorder(dbp, ip, h, lp, rp, func, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ PAGE *h;
+ BINTERNAL *lp, *rp;
+ int (*func) __P((DB *, const DBT *, const DBT *));
+ u_int32_t flags;
+{
+ BOVERFLOW *bo;
+ DBC *dbc;
+ DBT dbt;
+ ENV *env;
+ db_indx_t last;
+ int ret, cmp;
+
+ env = dbp->env;
+ memset(&dbt, 0, sizeof(DBT));
+ F_SET(&dbt, DB_DBT_MALLOC);
+ ret = 0;
+
+ /*
+ * Empty pages are sorted correctly by definition. We check
+ * to see whether they ought to be empty elsewhere; leaf
+ * pages legally may be.
+ */
+ if (NUM_ENT(h) == 0)
+ return (0);
+
+ switch (TYPE(h)) {
+ case P_IBTREE:
+ case P_LDUP:
+ last = NUM_ENT(h) - O_INDX;
+ break;
+ case P_LBTREE:
+ last = NUM_ENT(h) - P_INDX;
+ break;
+ default:
+ return (__db_unknown_path(env, "__bam_vrfy_treeorder"));
+ }
+
+ /* Populate a dummy cursor. */
+ if ((ret = __db_cursor_int(dbp, ip, NULL, DB_BTREE,
+ PGNO_INVALID, 0, DB_LOCK_INVALIDID, &dbc)) != 0)
+ return (ret);
+ /*
+ * The key on page h, the child page, is more likely to be
+ * an overflow page, so we pass its offset, rather than lp/rp's,
+ * into __bam_cmp. This will take advantage of __db_moff.
+ */
+
+ /*
+ * Skip first-item check if we're an internal page--the first
+ * entry on an internal page is treated specially by __bam_cmp,
+ * so what's on the page shouldn't matter. (Plus, since we're passing
+ * our page and item 0 to __bam_cmp, we'll sort before our
+ * parent and falsely report a failure.)
+ */
+ if (lp != NULL && TYPE(h) != P_IBTREE) {
+ if (lp->type == B_KEYDATA) {
+ dbt.data = lp->data;
+ dbt.size = lp->len;
+ } else if (lp->type == B_OVERFLOW) {
+ bo = (BOVERFLOW *)lp->data;
+ if ((ret = __db_goff(dbc, &dbt,
+ bo->tlen, bo->pgno, NULL, NULL)) != 0)
+ return (ret);
+ } else
+ return (
+ __db_unknown_path(env, "__bam_vrfy_treeorder"));
+
+ /* On error, fall through, free if needed, and return. */
+ if ((ret = __bam_cmp(dbc, &dbt, h, 0, func, &cmp)) == 0) {
+ if (cmp > 0) {
+ EPRINT((env, DB_STR_A("1092",
+ "Page %lu: first item on page sorted greater than parent entry",
+ "%lu"), (u_long)PGNO(h)));
+ ret = DB_VERIFY_BAD;
+ }
+ } else
+ EPRINT((env, DB_STR_A("1093",
+ "Page %lu: first item on page had comparison error",
+ "%lu"), (u_long)PGNO(h)));
+
+ if (dbt.data != lp->data)
+ __os_ufree(env, dbt.data);
+ if (ret != 0)
+ return (ret);
+ }
+
+ if (rp != NULL) {
+ if (rp->type == B_KEYDATA) {
+ dbt.data = rp->data;
+ dbt.size = rp->len;
+ } else if (rp->type == B_OVERFLOW) {
+ bo = (BOVERFLOW *)rp->data;
+ if ((ret = __db_goff(dbc, &dbt,
+ bo->tlen, bo->pgno, NULL, NULL)) != 0)
+ return (ret);
+ } else
+ return (
+ __db_unknown_path(env, "__bam_vrfy_treeorder"));
+
+ /* On error, fall through, free if needed, and return. */
+ if ((ret = __bam_cmp(dbc, &dbt, h, last, func, &cmp)) == 0) {
+ if (cmp < 0) {
+ EPRINT((env, DB_STR_A("1094",
+ "Page %lu: last item on page sorted greater than parent entry",
+ "%lu"), (u_long)PGNO(h)));
+ ret = DB_VERIFY_BAD;
+ }
+ } else
+ EPRINT((env, DB_STR_A("1095",
+ "Page %lu: last item on page had comparison error",
+ "%lu"), (u_long)PGNO(h)));
+
+ if (dbt.data != rp->data)
+ __os_ufree(env, dbt.data);
+ }
+
+ return (ret);
+}
+
+/*
+ * __bam_salvage --
+ * Safely dump out anything that looks like a key on an alleged
+ * btree leaf page, also mark overflow pages as seen. For internal btree
+ * pages, just mark any overflow pages as seen.
+ *
+ * PUBLIC: int __bam_salvage __P((DB *, VRFY_DBINFO *,
+ * PUBLIC: db_pgno_t, u_int32_t, PAGE *, void *,
+ * PUBLIC: int (*)(void *, const void *), DBT *, u_int32_t));
+ */
+int
+__bam_salvage(dbp, vdp, pgno, pgtype, h, handle, callback, key, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ db_pgno_t pgno;
+ u_int32_t pgtype;
+ PAGE *h;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ DBT *key;
+ u_int32_t flags;
+{
+ BKEYDATA *bk;
+ BOVERFLOW *bo;
+ DBT dbt, repldbt, unknown_key, unknown_data;
+ ENV *env;
+ VRFY_ITEM *pgmap;
+ db_indx_t i, last, beg, end, *inp;
+ db_pgno_t ovflpg;
+ u_int32_t himark, ovfl_bufsz;
+ void *ovflbuf;
+ int adj, ret, t_ret, t2_ret;
+#ifdef HAVE_COMPRESSION
+ DBT kcpy, *last_key;
+ int unknown_dup_key;
+#endif
+
+ env = dbp->env;
+ ovflbuf = pgmap = NULL;
+ inp = P_INP(dbp, h);
+
+ memset(&dbt, 0, sizeof(DBT));
+ dbt.flags = DB_DBT_REALLOC;
+ memset(&repldbt, 0, sizeof(DBT));
+
+#ifdef HAVE_COMPRESSION
+ memset(&kcpy, 0, sizeof(DBT));
+ unknown_dup_key = LF_ISSET(DB_SA_UNKNOWNKEY);
+ last_key = unknown_dup_key ? NULL : key;
+#endif
+ LF_CLR(DB_SA_UNKNOWNKEY);
+
+ DB_INIT_DBT(unknown_key, "UNKNOWN_KEY", sizeof("UNKNOWN_KEY") - 1);
+ DB_INIT_DBT(unknown_data, "UNKNOWN_DATA", sizeof("UNKNOWN_DATA") - 1);
+
+ /*
+ * Allocate a buffer for overflow items. Start at one page;
+ * __db_safe_goff will realloc as needed.
+ */
+ if ((ret = __os_malloc(env, dbp->pgsize, &ovflbuf)) != 0)
+ goto err;
+ ovfl_bufsz = dbp->pgsize;
+
+ if (LF_ISSET(DB_AGGRESSIVE) && (ret =
+ __os_calloc(env, dbp->pgsize, sizeof(pgmap[0]), &pgmap)) != 0)
+ goto err;
+
+ /*
+ * Loop through the inp array, spitting out key/data pairs.
+ *
+ * If we're salvaging normally, loop from 0 through NUM_ENT(h). If
+ * we're being aggressive, loop until we hit the end of the page --
+ * NUM_ENT() may be bogus.
+ */
+ himark = dbp->pgsize;
+ for (i = 0, last = UINT16_MAX;; i += O_INDX) {
+ /*
+ * If we're not aggressive, or if we're on an internal page,
+ * break when we hit NUM_ENT(h).
+ */
+ if ((!LF_ISSET(DB_AGGRESSIVE) ||
+ pgtype == P_IBTREE) && i >= NUM_ENT(h))
+ break;
+
+ /* Verify the current item. */
+ t_ret =
+ __db_vrfy_inpitem(dbp, h, pgno, i, 1, flags, &himark, NULL);
+
+ if (t_ret != 0) {
+ /*
+ * If this is a btree leaf and we've printed out a key
+ * but not its associated data item, fix this imbalance
+ * by printing an "UNKNOWN_DATA".
+ */
+ if (pgtype == P_LBTREE && i % P_INDX == 1 &&
+ last == i - 1 && (t2_ret = __db_vrfy_prdbt(
+ &unknown_data,
+ 0, " ", handle, callback, 0, 0, vdp)) != 0) {
+ if (ret == 0)
+ ret = t2_ret;
+ goto err;
+ }
+
+ /*
+ * Don't return DB_VERIFY_FATAL; it's private and means
+ * only that we can't go on with this page, not with
+ * the whole database. It's not even an error if we've
+ * run into it after NUM_ENT(h).
+ */
+ if (t_ret == DB_VERIFY_FATAL) {
+ if (i < NUM_ENT(h) && ret == 0)
+ ret = DB_VERIFY_BAD;
+ break;
+ }
+ continue;
+ }
+
+ /*
+ * If this returned 0, it's safe to print or (carefully)
+ * try to fetch.
+ *
+ * We only print deleted items if DB_AGGRESSIVE is set.
+ */
+ bk = GET_BKEYDATA(dbp, h, i);
+ if (!LF_ISSET(DB_AGGRESSIVE) && B_DISSET(bk->type))
+ continue;
+
+ /*
+ * If this is a btree leaf and we're about to print out a data
+ * item for which we didn't print out a key, fix this imbalance
+ * by printing an "UNKNOWN_KEY".
+ */
+ if (pgtype == P_LBTREE && i % P_INDX == 1 && last != i - 1) {
+#ifdef HAVE_COMPRESSION
+ last_key = NULL;
+#endif
+ if ((t_ret = __db_vrfy_prdbt(&unknown_key,
+ 0, " ", handle, callback, 0, 0, vdp)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ goto err;
+ }
+ }
+ last = i;
+
+ /*
+ * We're going to go try to print the next item. If key is
+ * non-NULL, we're a dup page, so we've got to print the key
+ * first, unless DB_SA_SKIPFIRSTKEY is set and we're on the
+ * first entry.
+ */
+ if (key != NULL && (i != 0 || !LF_ISSET(DB_SA_SKIPFIRSTKEY))) {
+#ifdef HAVE_COMPRESSION
+ last_key = unknown_dup_key ? NULL : key;
+#endif
+ if ((t_ret = __db_vrfy_prdbt(key,
+ 0, " ", handle, callback, 0, 0, vdp)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ goto err;
+ }
+ }
+
+ beg = end = inp[i];
+ switch (B_TYPE(bk->type)) {
+ case B_DUPLICATE:
+ if (pgtype == P_IBTREE)
+ break;
+
+ end = beg + BOVERFLOW_SIZE - 1;
+ /*
+ * If we're not on a normal btree leaf page, there
+ * shouldn't be off-page dup sets. Something's
+ * confused; just drop it, and the code to pick up
+ * unlinked offpage dup sets will print it out
+ * with key "UNKNOWN" later.
+ */
+ if (pgtype != P_LBTREE)
+ break;
+
+ bo = (BOVERFLOW *)bk;
+
+ /*
+ * If the page number is unreasonable, or if this is
+ * supposed to be a key item, output "UNKNOWN_KEY" --
+ * the best we can do is run into the data items in
+ * the unlinked offpage dup pass.
+ */
+ if (!IS_VALID_PGNO(bo->pgno) || (i % P_INDX == 0)) {
+ /* Not much to do on failure. */
+#ifdef HAVE_COMPRESSION
+ if (key == NULL && i % P_INDX == 0)
+ last_key = NULL;
+#endif
+ if ((t_ret = __db_vrfy_prdbt(
+ i % P_INDX == 0 ? &unknown_key : &unknown_data,
+ 0, " ", handle, callback, 0, 0,vdp)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ goto err;
+ }
+ break;
+ }
+
+ /* Don't stop on error. */
+ if ((t_ret = __db_salvage_duptree(dbp,
+ vdp, bo->pgno, &dbt, handle, callback,
+ flags | DB_SA_SKIPFIRSTKEY
+#ifdef HAVE_COMPRESSION
+ | (last_key == NULL ? DB_SA_UNKNOWNKEY : 0)
+#endif
+ )) != 0 && ret == 0)
+ ret = t_ret;
+
+ break;
+ case B_KEYDATA:
+ if (pgtype == P_IBTREE)
+ break;
+
+ end = (db_indx_t)DB_ALIGN(
+ beg + bk->len, sizeof(u_int32_t)) - 1;
+
+ dbt.data = bk->data;
+ dbt.size = bk->len;
+
+#ifdef HAVE_COMPRESSION
+ if (DB_IS_COMPRESSED(dbp) && last_key != NULL &&
+ (key != NULL || (i % P_INDX == 1))) {
+ /*
+ * Decompress the key/data pair: the key is in
+ * last_key and the data is in dbt.
+ */
+ if ((t_ret = __bam_compress_salvage(dbp, vdp,
+ handle, callback, last_key, &dbt)) != 0) {
+ if (t_ret == DB_VERIFY_FATAL) {
+ if (ret == 0)
+ ret = DB_VERIFY_BAD;
+ if (!LF_ISSET(DB_AGGRESSIVE))
+ goto err;
+ } else if (ret == 0) {
+ ret = t_ret;
+ goto err;
+ }
+ }
+ } else {
+ if (key == NULL && i % P_INDX == 0) {
+ if ((ret = __os_realloc(
+ env, dbt.size, &kcpy.data)) != 0)
+ goto err;
+ memcpy(kcpy.data, dbt.data, dbt.size);
+ kcpy.size = dbt.size;
+ last_key = &kcpy;
+ }
+#endif
+
+ if ((t_ret = __db_vrfy_prdbt(&dbt,
+ 0, " ", handle, callback, 0, 0, vdp)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ goto err;
+ }
+#ifdef HAVE_COMPRESSION
+ }
+#endif
+ break;
+ case B_OVERFLOW:
+ if (pgtype != P_IBTREE)
+ end = beg + BOVERFLOW_SIZE - 1;
+ bo = (BOVERFLOW *)bk;
+
+ /*
+ * Check for replicated overflow keys, so that we only
+ * call __db_safe_goff once per overflow page. If we
+ * get the same offset as the previous key just re-use
+ * the previous dbt.
+ *
+ * P_IBTREE pages will never have replicated overflow
+ * keys.
+ */
+ adj = pgtype == P_IBTREE ? O_INDX : P_INDX;
+ if (pgtype == P_IBTREE) {
+ /*
+ * If we're looking at a P_IBTREE, we just want
+ * to mark the overflow page as seen.
+ *
+ * Note that this call to __db_safe_goff differs
+ * from the non-P_IBTREE call.
+ *
+ * Only call __db_safe_goff if the overflow page
+ * hasn't been seen.
+ */
+ ovflpg = ((BOVERFLOW *)
+ ((BINTERNAL *)bk)->data)->pgno;
+ if (__db_salvage_isdone(vdp, ovflpg) == 0 &&
+ (t_ret =__db_safe_goff(dbp, vdp, ovflpg,
+ &dbt, &ovflbuf,
+ &ovfl_bufsz, flags)) != 0 && ret == 0)
+ ret = t_ret;
+ break;
+ } else if (i > adj - 1 &&
+ i % adj == 0 && inp[i] == inp[i - adj])
+ dbt = repldbt;
+ else {
+ /* Don't stop on error. */
+ if ((t_ret = __db_safe_goff(dbp, vdp,
+ bo->pgno, &dbt, &ovflbuf,
+ &ovfl_bufsz, flags)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /*
+ * If this is a key, save it in case the next
+ * key is a replicated overflow, so we don't
+ * call __db_safe_goff again. Copy out dbt.data
+ * in case that pointer gets realloc'd when
+ * getting a data item.
+ */
+ if (i % P_INDX == 0) {
+ if (t_ret == 0) {
+ if ((t_ret = __os_realloc(env,
+ dbt.size,
+ &repldbt.data)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ goto err;
+ }
+ memcpy(repldbt.data,
+ dbt.data, dbt.size);
+ repldbt.size = dbt.size;
+ } else {
+ if (__os_realloc(env,
+ unknown_key.size,
+ &repldbt.data) != 0)
+ goto err;
+ memcpy(repldbt.data,
+ unknown_key.data,
+ unknown_key.size);
+ repldbt.size = unknown_key.size;
+ }
+ }
+
+ }
+
+#ifdef HAVE_COMPRESSION
+ if (DB_IS_COMPRESSED(dbp) && last_key && t_ret == 0 &&
+ (key != NULL || (i % P_INDX == 1))) {
+ /*
+ * Decompress the key/data pair: the key is in
+ * last_key and the data is in dbt.
+ */
+ if ((t_ret = __bam_compress_salvage(dbp, vdp,
+ handle, callback, last_key, &dbt)) != 0) {
+ if (t_ret == DB_VERIFY_FATAL) {
+ if (ret == 0)
+ ret = DB_VERIFY_BAD;
+ if (!LF_ISSET(DB_AGGRESSIVE))
+ goto err;
+ } else if (ret == 0) {
+ ret = t_ret;
+ goto err;
+ }
+ }
+ } else {
+ if (key == NULL && i % P_INDX == 0) {
+ if (t_ret == 0) {
+ if ((ret = __os_realloc(env,
+ dbt.size, &kcpy.data)) != 0)
+ goto err;
+ memcpy(kcpy.data, dbt.data,
+ dbt.size);
+ kcpy.size = dbt.size;
+ last_key = &kcpy;
+ } else
+ last_key = NULL;
+ }
+#endif
+
+ if ((t_ret = __db_vrfy_prdbt(
+ t_ret == 0 ? &dbt : &unknown_key,
+ 0, " ", handle, callback, 0, 0, vdp))
+ != 0 && ret == 0)
+ ret = t_ret;
+#ifdef HAVE_COMPRESSION
+ }
+#endif
+ break;
+ default:
+ /*
+ * We should never get here; __db_vrfy_inpitem should
+ * not be returning 0 if bk->type is unrecognizable.
+ */
+ t_ret = __db_unknown_path(env, "__bam_salvage");
+ if (ret == 0)
+ ret = t_ret;
+ goto err;
+ }
+
+ /*
+ * If we're being aggressive, mark the beginning and end of
+ * the item; we'll come back and print whatever "junk" is in
+ * the gaps in case we had any bogus inp elements and thereby
+ * missed stuff.
+ */
+ if (LF_ISSET(DB_AGGRESSIVE) && pgtype != P_IBTREE) {
+ pgmap[beg] = VRFY_ITEM_BEGIN;
+ pgmap[end] = VRFY_ITEM_END;
+ }
+ }
+
+err: if (pgmap != NULL)
+ __os_free(env, pgmap);
+ if (ovflbuf != NULL)
+ __os_free(env, ovflbuf);
+ if (repldbt.data != NULL)
+ __os_free(env, repldbt.data);
+#ifdef HAVE_COMPRESSION
+ if (kcpy.data != NULL)
+ __os_free(env, kcpy.data);
+#endif
+
+ /* Mark this page as done. */
+ if ((t_ret = __db_salvage_markdone(vdp, pgno)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
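+
+/*
+ * A note on the salvage output above: key and data lines are emitted in
+ * pairs through __db_vrfy_prdbt, and when one half of a pair cannot be
+ * recovered the literal placeholders "UNKNOWN_KEY"/"UNKNOWN_DATA" keep
+ * the dump pairwise aligned, e.g. (illustrative only):
+ *
+ *	mykey
+ *	UNKNOWN_DATA
+ *	UNKNOWN_KEY
+ *	mydata
+ */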
+
+/*
+ * __bam_salvage_walkdupint --
+ * Walk a known-good btree or recno internal page which is part of
+ * a dup tree, calling __db_salvage_duptree on each child page.
+ *
+ * PUBLIC: int __bam_salvage_walkdupint __P((DB *, VRFY_DBINFO *, PAGE *,
+ * PUBLIC: DBT *, void *, int (*)(void *, const void *), u_int32_t));
+ */
+int
+__bam_salvage_walkdupint(dbp, vdp, h, key, handle, callback, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ PAGE *h;
+ DBT *key;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ u_int32_t flags;
+{
+ BINTERNAL *bi;
+ ENV *env;
+ RINTERNAL *ri;
+ int ret, t_ret;
+ db_indx_t i;
+
+ env = dbp->env;
+ ret = 0;
+
+ for (i = 0; i < NUM_ENT(h); i++) {
+ switch (TYPE(h)) {
+ case P_IBTREE:
+ bi = GET_BINTERNAL(dbp, h, i);
+ if ((t_ret = __db_salvage_duptree(dbp,
+ vdp, bi->pgno, key, handle, callback, flags)) != 0)
+ ret = t_ret;
+ break;
+ case P_IRECNO:
+ ri = GET_RINTERNAL(dbp, h, i);
+ if ((t_ret = __db_salvage_duptree(dbp,
+ vdp, ri->pgno, key, handle, callback, flags)) != 0)
+ ret = t_ret;
+ break;
+ default:
+ return (__db_unknown_path(
+ env, "__bam_salvage_walkdupint"));
+ }
+ /* Pass DB_SA_SKIPFIRSTKEY, if set, on to the 0th child only. */
+ flags &= ~LF_ISSET(DB_SA_SKIPFIRSTKEY);
+ }
+
+ return (ret);
+}
+
+/*
+ * __bam_meta2pgset --
+ * Given a known-good meta page, return in pgsetp a 0-terminated list of
+ * db_pgno_t's corresponding to the pages in the btree.
+ *
+ * We do this by a somewhat sleazy method, to avoid having to traverse the
+ * btree structure neatly: we walk down the left side to the very
+ * first leaf page, then we mark all the pages in the chain of
+ * NEXT_PGNOs (being wary of cycles and invalid ones), then we
+ * consolidate our scratch array into a nice list, and return. This
+ * avoids the memory management hassles of recursion and the
+ * trouble of walking internal pages--they just don't matter, except
+ * for the left branch.
+ *
+ * PUBLIC: int __bam_meta2pgset __P((DB *, VRFY_DBINFO *, BTMETA *,
+ * PUBLIC: u_int32_t, DB *));
+ */
+int
+__bam_meta2pgset(dbp, vdp, btmeta, flags, pgset)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ BTMETA *btmeta;
+ u_int32_t flags;
+ DB *pgset;
+{
+ BINTERNAL *bi;
+ DB_MPOOLFILE *mpf;
+ PAGE *h;
+ RINTERNAL *ri;
+ db_pgno_t current, p;
+ int err_ret, ret;
+
+ DB_ASSERT(dbp->env, pgset != NULL);
+
+ mpf = dbp->mpf;
+ h = NULL;
+ ret = err_ret = 0;
+
+ for (current = btmeta->root;;) {
+ if (!IS_VALID_PGNO(current) || current == PGNO(btmeta)) {
+ err_ret = DB_VERIFY_BAD;
+ goto err;
+ }
+ if ((ret = __memp_fget(mpf, &current,
+ vdp->thread_info, NULL, 0, &h)) != 0) {
+ err_ret = ret;
+ goto err;
+ }
+
+ switch (TYPE(h)) {
+ case P_IBTREE:
+ case P_IRECNO:
+ if ((ret = __bam_vrfy(dbp,
+ vdp, h, current, flags | DB_NOORDERCHK)) != 0) {
+ err_ret = ret;
+ goto err;
+ }
+ if (TYPE(h) == P_IBTREE) {
+ bi = GET_BINTERNAL(dbp, h, 0);
+ current = bi->pgno;
+ } else { /* P_IRECNO */
+ ri = GET_RINTERNAL(dbp, h, 0);
+ current = ri->pgno;
+ }
+ break;
+ case P_LBTREE:
+ case P_LRECNO:
+ goto traverse;
+ default:
+ err_ret = DB_VERIFY_BAD;
+ goto err;
+ }
+
+ if ((ret = __memp_fput(mpf,
+ vdp->thread_info, h, DB_PRIORITY_UNCHANGED)) != 0)
+ err_ret = ret;
+ h = NULL;
+ }
+
+ /*
+ * At this point, current is the pgno of leaf page h, the 0th in the
+ * tree we're concerned with.
+ */
+traverse:
+ while (IS_VALID_PGNO(current) && current != PGNO_INVALID) {
+ if (h == NULL && (ret = __memp_fget(mpf,
+ &current, vdp->thread_info, NULL, 0, &h)) != 0) {
+ err_ret = ret;
+ break;
+ }
+
+ if ((ret = __db_vrfy_pgset_get(pgset,
+ vdp->thread_info, vdp->txn, current, (int *)&p)) != 0)
+ goto err;
+
+ if (p != 0) {
+ /*
+ * We've found a cycle. Return success anyway--
+ * our caller may as well use however much of
+ * the pgset we've come up with.
+ */
+ break;
+ }
+ if ((ret = __db_vrfy_pgset_inc(
+ pgset, vdp->thread_info, vdp->txn, current)) != 0)
+ goto err;
+
+ current = NEXT_PGNO(h);
+ if ((ret = __memp_fput(mpf,
+ vdp->thread_info, h, DB_PRIORITY_UNCHANGED)) != 0)
+ err_ret = ret;
+ h = NULL;
+ }
+
+err: if (h != NULL)
+ (void)__memp_fput(mpf,
+ vdp->thread_info, h, DB_PRIORITY_UNCHANGED);
+
+ return (ret == 0 ? err_ret : ret);
+}
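+
+/*
+ * The shape of the walk above, with hypothetical pages:
+ *
+ *	meta -> root -> internal -> ... -> leftmost leaf
+ *	                                        |
+ *	                                        leaf -> leaf -> ... (NEXT_PGNO)
+ *
+ * Internal pages are visited only to locate the leftmost leaf; the pgset
+ * itself is filled from the leaf chain.
+ */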
+
+/*
+ * __bam_safe_getdata --
+ *
+ * Utility function for __bam_vrfy_itemorder. Safely gets the datum at
+ * index i on page h and sticks it in DBT dbt. If ovflok is 1 and item i is
+ * an overflow item, we do a __db_goff to get the item and signal that we
+ * need to free dbt->data; if ovflok is 0, we leave the DBT zeroed.
+ */
+static int
+__bam_safe_getdata(dbp, ip, h, i, ovflok, dbt, freedbtp)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ PAGE *h;
+ u_int32_t i;
+ int ovflok;
+ DBT *dbt;
+ int *freedbtp;
+{
+ BKEYDATA *bk;
+ BOVERFLOW *bo;
+ DBC *dbc;
+ int ret;
+
+ memset(dbt, 0, sizeof(DBT));
+ *freedbtp = 0;
+
+ bk = GET_BKEYDATA(dbp, h, i);
+ if (B_TYPE(bk->type) == B_OVERFLOW) {
+ if (!ovflok)
+ return (0);
+
+ if ((ret = __db_cursor_int(dbp, ip, NULL, DB_BTREE,
+ PGNO_INVALID, 0, DB_LOCK_INVALIDID, &dbc)) != 0)
+ return (ret);
+ bo = (BOVERFLOW *)bk;
+ F_SET(dbt, DB_DBT_MALLOC);
+
+ *freedbtp = 1;
+ return (__db_goff(dbc, dbt, bo->tlen, bo->pgno, NULL, NULL));
+ } else {
+ dbt->data = bk->data;
+ dbt->size = bk->len;
+ }
+
+ return (0);
+}
diff --git a/src/btree/btree.src b/src/btree/btree.src
new file mode 100644
index 00000000..08e5a206
--- /dev/null
+++ b/src/btree/btree.src
@@ -0,0 +1,290 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+DBPRIVATE
+PREFIX __bam
+
+INCLUDE #include "db_int.h"
+INCLUDE #include "dbinc/crypto.h"
+INCLUDE #include "dbinc/db_page.h"
+INCLUDE #include "dbinc/db_am.h"
+INCLUDE #include "dbinc/btree.h"
+INCLUDE #include "dbinc/txn.h"
+INCLUDE
+
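+/*
+ * A rough guide to the record-description grammar used below, inferred
+ * from the entries themselves:
+ *
+ *	BEGIN <name> <version> <id>: start a log record type; the version
+ *	    appears to be the release that introduced the format (e.g. 50
+ *	    for 5.0). BEGIN_COMPAT describes the layout written by an
+ *	    older release, kept so old logs can still be recovered.
+ *	DB/OP/ARG/POINTER/DBT/PGDBT/HDR/DATA <field> <type> <fmt>:
+ *	    declare the record's fields in log order.
+ *	END: close the record.
+ */
+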
+/*
+ * BTREE-split: used to log a page split.
+ *
+ * left: the page number for the low-order contents.
+ * llsn: the left page's original LSN.
+ * right: the page number for the high-order contents.
+ * rlsn: the right page's original LSN.
+ * indx: the number of entries that went to the left page.
+ * npgno: the next page number.
+ * nlsn: the next page's original LSN (or 0 if no next page).
+ * ppgno: the parent page number.
+ * plsn: the parent page's original LSN.
+ * pg: the split page's contents before the split.
+ * opflags: SPL_NRECS: if splitting a tree that maintains a record count.
+ * pindx: index of new record in parent page.
+ */
+BEGIN split 50 62
+DB fileid int32_t ld
+OP opflags u_int32_t lu
+ARG left db_pgno_t lu
+POINTER llsn DB_LSN * lu
+ARG right db_pgno_t lu
+POINTER rlsn DB_LSN * lu
+ARG indx u_int32_t lu
+ARG npgno db_pgno_t lu
+POINTER nlsn DB_LSN * lu
+ARG ppgno db_pgno_t lu
+POINTER plsn DB_LSN * lu
+ARG pindx u_int32_t lu
+PGDBT pg DBT s
+HDR pentry DBT s
+HDR rentry DBT s
+END
+
+BEGIN_COMPAT split 48 62
+DB fileid int32_t ld
+ARG left db_pgno_t lu
+POINTER llsn DB_LSN * lu
+ARG right db_pgno_t lu
+POINTER rlsn DB_LSN * lu
+ARG indx u_int32_t lu
+ARG npgno db_pgno_t lu
+POINTER nlsn DB_LSN * lu
+ARG ppgno db_pgno_t lu
+POINTER plsn DB_LSN * lu
+ARG pindx u_int32_t lu
+PGDBT pg DBT s
+DBT pentry DBT s
+DBT rentry DBT s
+ARG opflags u_int32_t lu
+END
+
+BEGIN_COMPAT split 42 62
+DB fileid int32_t ld
+ARG left db_pgno_t lu
+POINTER llsn DB_LSN * lu
+ARG right db_pgno_t lu
+POINTER rlsn DB_LSN * lu
+ARG indx u_int32_t lu
+ARG npgno db_pgno_t lu
+POINTER nlsn DB_LSN * lu
+ARG root_pgno db_pgno_t lu
+PGDBT pg DBT s
+ARG opflags u_int32_t lu
+END
+
+/*
+ * BTREE-rsplit: used to log a reverse-split.
+ *
+ * pgno: the page number of the page copied over the root.
+ * pgdbt: the page being copied onto the root page.
+ * root_pgno: the root page number.
+ * nrec: the tree's record count.
+ * rootent: last entry on the root page.
+ * rootlsn: the root page's original lsn.
+ */
+BEGIN rsplit 42 63
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+PGDBT pgdbt DBT s
+ARG root_pgno db_pgno_t lu
+ARG nrec db_pgno_t lu
+DBT rootent DBT s
+POINTER rootlsn DB_LSN * lu
+END
+
+/*
+ * BTREE-adj: used to log the adjustment of an index.
+ *
+ * pgno: the page modified.
+ * lsn: the page's original lsn.
+ * indx: the index adjusted.
+ * indx_copy: the index to copy if inserting.
+ * is_insert: 0 if a delete, 1 if an insert.
+ */
+BEGIN adj 42 55
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER lsn DB_LSN * lu
+ARG indx u_int32_t lu
+ARG indx_copy u_int32_t lu
+ARG is_insert u_int32_t lu
+END
+
+/*
+ * BTREE-cadjust: used to adjust the count change in an internal page.
+ *
+ * pgno: the page modified.
+ * lsn: the page's original lsn.
+ * indx: the index to be adjusted.
+ * adjust: the signed adjustment.
+ * opflags: CAD_UPDATEROOT: if root page count was adjusted.
+ */
+BEGIN cadjust 42 56
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER lsn DB_LSN * lu
+ARG indx u_int32_t lu
+ARG adjust int32_t ld
+ARG opflags u_int32_t lu
+END
+
+/*
+ * BTREE-cdel: used to log the intent-to-delete of a cursor record.
+ *
+ * pgno: the page modified.
+ * lsn: the page's original lsn.
+ * indx: the index to be deleted.
+ */
+BEGIN cdel 42 57
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER lsn DB_LSN * lu
+ARG indx u_int32_t lu
+END
+
+/*
+ * BTREE-repl: used to log the replacement of an item.
+ *
+ * pgno: the page modified.
+ * lsn: the page's original lsn.
+ * indx: the index to be replaced.
+ * isdeleted: set if the record was previously deleted.
+ * orig: the original data.
+ * repl: the replacement data.
+ * prefix: the prefix of the replacement that matches the original.
+ * suffix: the suffix of the replacement that matches the original.
+ */
+BEGIN repl 42 58
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER lsn DB_LSN * lu
+ARG indx u_int32_t lu
+ARG isdeleted u_int32_t lu
+DBT orig DBT s
+DBT repl DBT s
+ARG prefix u_int32_t lu
+ARG suffix u_int32_t lu
+END
+
+/*
+ * BTREE-irep: used to log the replacement of an item on an internal page.
+ *
+ * pgno: the page modified.
+ * lsn: the page's original lsn.
+ * indx: the index to be replaced.
+ * ptype: type of the page.
+ * hdr: header of the record.
+ * data: data of the record.
+ */
+BEGIN irep 50 67
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER lsn DB_LSN * lu
+ARG indx u_int32_t lu
+OP ptype u_int32_t lu
+HDR hdr DBT s
+DATA data DBT s
+HDR old DBT s
+END
+
+/*
+ * BTREE-root: log the assignment of a root btree page.
+ */
+BEGIN root 42 59
+DB fileid int32_t ld
+ARG meta_pgno db_pgno_t lu
+ARG root_pgno db_pgno_t lu
+POINTER meta_lsn DB_LSN * lu
+END
+
+/*
+ * BTREE-curadj: undo cursor adjustments on txn abort.
+ * Should only be processed during DB_TXN_ABORT.
+ * NOTE: the first_indx field gets used to hold the
+ * signed index adjustment in one case;
+ * care should be taken if its size is changed.
+ */
+BEGIN curadj 42 64
+/* Fileid of db affected. */
+DB fileid int32_t ld
+/* Which adjustment. */
+ARG mode db_ca_mode ld
+/* Page entry is from. */
+ARG from_pgno db_pgno_t lu
+/* Page entry went to. */
+ARG to_pgno db_pgno_t lu
+/* Left page of root split. */
+ARG left_pgno db_pgno_t lu
+/* First index of dup set. Also used as adjustment. */
+ARG first_indx u_int32_t lu
+/* Index entry is from. */
+ARG from_indx u_int32_t lu
+/* Index where entry went. */
+ARG to_indx u_int32_t lu
+END
+
+/*
+ * BTREE-rcuradj: undo cursor adjustments on txn abort in
+ * renumbering recno trees.
+ * Should only be processed during DB_TXN_ABORT.
+ */
+BEGIN rcuradj 42 65
+/* Fileid of db affected. */
+DB fileid int32_t ld
+/* Which adjustment. */
+ARG mode ca_recno_arg ld
+/* Root page number. */
+ARG root db_pgno_t ld
+/* Recno of the adjustment. */
+ARG recno db_recno_t ld
+/* Order number of the adjustment. */
+ARG order u_int32_t lu
+END
+
+/*
+ * BTREE-relink -- Handles relinking around a deleted leaf page.
+ * Current routine moved to __db_relink.
+ *
+ */
+BEGIN_COMPAT relink 43 147
+/* Fileid of db affected. */
+DB fileid int32_t ld
+/* The page being removed. */
+ARG pgno db_pgno_t lu
+/* The page's original lsn. */
+POINTER lsn DB_LSN * lu
+/* The previous page. */
+ARG prev db_pgno_t lu
+/* The previous page's original lsn. */
+POINTER lsn_prev DB_LSN * lu
+/* The next page. */
+ARG next db_pgno_t lu
+ /* The next page's original lsn. */
+POINTER lsn_next DB_LSN * lu
+END
+
+/*
+ * BTREE-merge -- Handles merging of pages during a compaction.
+ * Current routine moved to __db_merge.
+ */
+BEGIN_COMPAT merge 44 148
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER lsn DB_LSN * lu
+ARG npgno db_pgno_t lu
+POINTER nlsn DB_LSN * lu
+DBT hdr DBT s
+DBT data DBT s
+DBT ind DBT s
+END
diff --git a/src/btree/btree_auto.c b/src/btree/btree_auto.c
new file mode 100644
index 00000000..e5e148c5
--- /dev/null
+++ b/src/btree/btree_auto.c
@@ -0,0 +1,207 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/btree.h"
+#include "dbinc/txn.h"
+
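+/*
+ * Each DB_LOG_RECSPEC array below mirrors one record description in
+ * btree.src: {field kind, offset of the field within the args struct
+ * (via SSZ), field name, printf format}, terminated by the LOGREC_Done
+ * sentinel. The generic log print and read code walks these tables.
+ */
+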
+DB_LOG_RECSPEC __bam_split_desc[] = {
+ {LOGREC_DB, SSZ(__bam_split_args, fileid), "fileid", ""},
+ {LOGREC_OP, SSZ(__bam_split_args, opflags), "opflags", "%lu"},
+ {LOGREC_ARG, SSZ(__bam_split_args, left), "left", "%lu"},
+ {LOGREC_POINTER, SSZ(__bam_split_args, llsn), "llsn", ""},
+ {LOGREC_ARG, SSZ(__bam_split_args, right), "right", "%lu"},
+ {LOGREC_POINTER, SSZ(__bam_split_args, rlsn), "rlsn", ""},
+ {LOGREC_ARG, SSZ(__bam_split_args, indx), "indx", "%lu"},
+ {LOGREC_ARG, SSZ(__bam_split_args, npgno), "npgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__bam_split_args, nlsn), "nlsn", ""},
+ {LOGREC_ARG, SSZ(__bam_split_args, ppgno), "ppgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__bam_split_args, plsn), "plsn", ""},
+ {LOGREC_ARG, SSZ(__bam_split_args, pindx), "pindx", "%lu"},
+ {LOGREC_PGDBT, SSZ(__bam_split_args, pg), "pg", ""},
+ {LOGREC_HDR, SSZ(__bam_split_args, pentry), "pentry", ""},
+ {LOGREC_HDR, SSZ(__bam_split_args, rentry), "rentry", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __bam_split_48_desc[] = {
+ {LOGREC_DB, SSZ(__bam_split_48_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__bam_split_48_args, left), "left", "%lu"},
+ {LOGREC_POINTER, SSZ(__bam_split_48_args, llsn), "llsn", ""},
+ {LOGREC_ARG, SSZ(__bam_split_48_args, right), "right", "%lu"},
+ {LOGREC_POINTER, SSZ(__bam_split_48_args, rlsn), "rlsn", ""},
+ {LOGREC_ARG, SSZ(__bam_split_48_args, indx), "indx", "%lu"},
+ {LOGREC_ARG, SSZ(__bam_split_48_args, npgno), "npgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__bam_split_48_args, nlsn), "nlsn", ""},
+ {LOGREC_ARG, SSZ(__bam_split_48_args, ppgno), "ppgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__bam_split_48_args, plsn), "plsn", ""},
+ {LOGREC_ARG, SSZ(__bam_split_48_args, pindx), "pindx", "%lu"},
+ {LOGREC_PGDBT, SSZ(__bam_split_48_args, pg), "pg", ""},
+ {LOGREC_DBT, SSZ(__bam_split_48_args, pentry), "pentry", ""},
+ {LOGREC_DBT, SSZ(__bam_split_48_args, rentry), "rentry", ""},
+ {LOGREC_ARG, SSZ(__bam_split_48_args, opflags), "opflags", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __bam_split_42_desc[] = {
+ {LOGREC_DB, SSZ(__bam_split_42_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__bam_split_42_args, left), "left", "%lu"},
+ {LOGREC_POINTER, SSZ(__bam_split_42_args, llsn), "llsn", ""},
+ {LOGREC_ARG, SSZ(__bam_split_42_args, right), "right", "%lu"},
+ {LOGREC_POINTER, SSZ(__bam_split_42_args, rlsn), "rlsn", ""},
+ {LOGREC_ARG, SSZ(__bam_split_42_args, indx), "indx", "%lu"},
+ {LOGREC_ARG, SSZ(__bam_split_42_args, npgno), "npgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__bam_split_42_args, nlsn), "nlsn", ""},
+ {LOGREC_ARG, SSZ(__bam_split_42_args, root_pgno), "root_pgno", "%lu"},
+ {LOGREC_PGDBT, SSZ(__bam_split_42_args, pg), "pg", ""},
+ {LOGREC_ARG, SSZ(__bam_split_42_args, opflags), "opflags", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __bam_rsplit_desc[] = {
+ {LOGREC_DB, SSZ(__bam_rsplit_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__bam_rsplit_args, pgno), "pgno", "%lu"},
+ {LOGREC_PGDBT, SSZ(__bam_rsplit_args, pgdbt), "pgdbt", ""},
+ {LOGREC_ARG, SSZ(__bam_rsplit_args, root_pgno), "root_pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__bam_rsplit_args, nrec), "nrec", "%lu"},
+ {LOGREC_DBT, SSZ(__bam_rsplit_args, rootent), "rootent", ""},
+ {LOGREC_POINTER, SSZ(__bam_rsplit_args, rootlsn), "rootlsn", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __bam_adj_desc[] = {
+ {LOGREC_DB, SSZ(__bam_adj_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__bam_adj_args, pgno), "pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__bam_adj_args, lsn), "lsn", ""},
+ {LOGREC_ARG, SSZ(__bam_adj_args, indx), "indx", "%lu"},
+ {LOGREC_ARG, SSZ(__bam_adj_args, indx_copy), "indx_copy", "%lu"},
+ {LOGREC_ARG, SSZ(__bam_adj_args, is_insert), "is_insert", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __bam_cadjust_desc[] = {
+ {LOGREC_DB, SSZ(__bam_cadjust_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__bam_cadjust_args, pgno), "pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__bam_cadjust_args, lsn), "lsn", ""},
+ {LOGREC_ARG, SSZ(__bam_cadjust_args, indx), "indx", "%lu"},
+ {LOGREC_ARG, SSZ(__bam_cadjust_args, adjust), "adjust", "%ld"},
+ {LOGREC_ARG, SSZ(__bam_cadjust_args, opflags), "opflags", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __bam_cdel_desc[] = {
+ {LOGREC_DB, SSZ(__bam_cdel_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__bam_cdel_args, pgno), "pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__bam_cdel_args, lsn), "lsn", ""},
+ {LOGREC_ARG, SSZ(__bam_cdel_args, indx), "indx", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __bam_repl_desc[] = {
+ {LOGREC_DB, SSZ(__bam_repl_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__bam_repl_args, pgno), "pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__bam_repl_args, lsn), "lsn", ""},
+ {LOGREC_ARG, SSZ(__bam_repl_args, indx), "indx", "%lu"},
+ {LOGREC_ARG, SSZ(__bam_repl_args, isdeleted), "isdeleted", "%lu"},
+ {LOGREC_DBT, SSZ(__bam_repl_args, orig), "orig", ""},
+ {LOGREC_DBT, SSZ(__bam_repl_args, repl), "repl", ""},
+ {LOGREC_ARG, SSZ(__bam_repl_args, prefix), "prefix", "%lu"},
+ {LOGREC_ARG, SSZ(__bam_repl_args, suffix), "suffix", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __bam_irep_desc[] = {
+ {LOGREC_DB, SSZ(__bam_irep_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__bam_irep_args, pgno), "pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__bam_irep_args, lsn), "lsn", ""},
+ {LOGREC_ARG, SSZ(__bam_irep_args, indx), "indx", "%lu"},
+ {LOGREC_OP, SSZ(__bam_irep_args, ptype), "ptype", "%lu"},
+ {LOGREC_HDR, SSZ(__bam_irep_args, hdr), "hdr", ""},
+ {LOGREC_DATA, SSZ(__bam_irep_args, data), "data", ""},
+ {LOGREC_HDR, SSZ(__bam_irep_args, old), "old", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __bam_root_desc[] = {
+ {LOGREC_DB, SSZ(__bam_root_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__bam_root_args, meta_pgno), "meta_pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__bam_root_args, root_pgno), "root_pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__bam_root_args, meta_lsn), "meta_lsn", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __bam_curadj_desc[] = {
+ {LOGREC_DB, SSZ(__bam_curadj_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__bam_curadj_args, mode), "mode", "%ld"},
+ {LOGREC_ARG, SSZ(__bam_curadj_args, from_pgno), "from_pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__bam_curadj_args, to_pgno), "to_pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__bam_curadj_args, left_pgno), "left_pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__bam_curadj_args, first_indx), "first_indx", "%lu"},
+ {LOGREC_ARG, SSZ(__bam_curadj_args, from_indx), "from_indx", "%lu"},
+ {LOGREC_ARG, SSZ(__bam_curadj_args, to_indx), "to_indx", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __bam_rcuradj_desc[] = {
+ {LOGREC_DB, SSZ(__bam_rcuradj_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__bam_rcuradj_args, mode), "mode", "%ld"},
+ {LOGREC_ARG, SSZ(__bam_rcuradj_args, root), "root", "%ld"},
+ {LOGREC_ARG, SSZ(__bam_rcuradj_args, recno), "recno", "%ld"},
+ {LOGREC_ARG, SSZ(__bam_rcuradj_args, order), "order", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __bam_relink_43_desc[] = {
+ {LOGREC_DB, SSZ(__bam_relink_43_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__bam_relink_43_args, pgno), "pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__bam_relink_43_args, lsn), "lsn", ""},
+ {LOGREC_ARG, SSZ(__bam_relink_43_args, prev), "prev", "%lu"},
+ {LOGREC_POINTER, SSZ(__bam_relink_43_args, lsn_prev), "lsn_prev", ""},
+ {LOGREC_ARG, SSZ(__bam_relink_43_args, next), "next", "%lu"},
+ {LOGREC_POINTER, SSZ(__bam_relink_43_args, lsn_next), "lsn_next", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __bam_merge_44_desc[] = {
+ {LOGREC_DB, SSZ(__bam_merge_44_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__bam_merge_44_args, pgno), "pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__bam_merge_44_args, lsn), "lsn", ""},
+ {LOGREC_ARG, SSZ(__bam_merge_44_args, npgno), "npgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__bam_merge_44_args, nlsn), "nlsn", ""},
+ {LOGREC_DBT, SSZ(__bam_merge_44_args, hdr), "hdr", ""},
+ {LOGREC_DBT, SSZ(__bam_merge_44_args, data), "data", ""},
+ {LOGREC_DBT, SSZ(__bam_merge_44_args, ind), "ind", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+/*
+ * PUBLIC: int __bam_init_recover __P((ENV *, DB_DISTAB *));
+ */
+int
+__bam_init_recover(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_split_recover, DB___bam_split)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_rsplit_recover, DB___bam_rsplit)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_adj_recover, DB___bam_adj)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_cadjust_recover, DB___bam_cadjust)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_cdel_recover, DB___bam_cdel)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_repl_recover, DB___bam_repl)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_irep_recover, DB___bam_irep)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_root_recover, DB___bam_root)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_curadj_recover, DB___bam_curadj)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_rcuradj_recover, DB___bam_rcuradj)) != 0)
+ return (ret);
+ return (0);
+}
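
The table-driven registration above is the entire recovery hookup for the btree access method: one recovery function per log-record type. As a minimal sketch of the dispatch pattern those calls build (the struct layout and names below are assumptions for illustration, not the real DB_DISTAB internals):

        typedef int (*recover_fn)(ENV *, DBT *, DB_LSN *, db_recops, void *);

        struct hypothetical_distab {            /* assumed layout */
                recover_fn *dispatch;           /* indexed by record type */
                size_t size;                    /* number of slots */
        };

        static int
        dispatch_record(struct hypothetical_distab *dtab, u_int32_t rectype,
            ENV *env, DBT *rec, DB_LSN *lsnp, db_recops op, void *info)
        {
                /* An unregistered record type is an error in recovery. */
                if (rectype >= dtab->size || dtab->dispatch[rectype] == NULL)
                        return (EINVAL);
                return (dtab->dispatch[rectype](env, rec, lsnp, op, info));
        }
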
diff --git a/src/btree/btree_autop.c b/src/btree/btree_autop.c
new file mode 100644
index 00000000..d2bee7d0
--- /dev/null
+++ b/src/btree/btree_autop.c
@@ -0,0 +1,291 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/btree.h"
+#include "dbinc/txn.h"
+
+/*
+ * PUBLIC: int __bam_split_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_split_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__bam_split", __bam_split_desc, info));
+}
+
+/*
+ * PUBLIC: int __bam_split_48_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_split_48_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__bam_split_48", __bam_split_48_desc, info));
+}
+
+/*
+ * PUBLIC: int __bam_split_42_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_split_42_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__bam_split_42", __bam_split_42_desc, info));
+}
+
+/*
+ * PUBLIC: int __bam_rsplit_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_rsplit_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__bam_rsplit", __bam_rsplit_desc, info));
+}
+
+/*
+ * PUBLIC: int __bam_adj_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_adj_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__bam_adj", __bam_adj_desc, info));
+}
+
+/*
+ * PUBLIC: int __bam_cadjust_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_cadjust_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__bam_cadjust", __bam_cadjust_desc, info));
+}
+
+/*
+ * PUBLIC: int __bam_cdel_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_cdel_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__bam_cdel", __bam_cdel_desc, info));
+}
+
+/*
+ * PUBLIC: int __bam_repl_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_repl_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__bam_repl", __bam_repl_desc, info));
+}
+
+/*
+ * PUBLIC: int __bam_irep_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_irep_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__bam_irep", __bam_irep_desc, info));
+}
+
+/*
+ * PUBLIC: int __bam_root_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_root_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__bam_root", __bam_root_desc, info));
+}
+
+/*
+ * PUBLIC: int __bam_curadj_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_curadj_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__bam_curadj", __bam_curadj_desc, info));
+}
+
+/*
+ * PUBLIC: int __bam_rcuradj_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_rcuradj_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__bam_rcuradj", __bam_rcuradj_desc, info));
+}
+
+/*
+ * PUBLIC: int __bam_relink_43_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_relink_43_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__bam_relink_43", __bam_relink_43_desc, info));
+}
+
+/*
+ * PUBLIC: int __bam_merge_44_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_merge_44_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__bam_merge_44", __bam_merge_44_desc, info));
+}
+
+/*
+ * PUBLIC: int __bam_init_print __P((ENV *, DB_DISTAB *));
+ */
+int
+__bam_init_print(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_split_print, DB___bam_split)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_rsplit_print, DB___bam_rsplit)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_adj_print, DB___bam_adj)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_cadjust_print, DB___bam_cadjust)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_cdel_print, DB___bam_cdel)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_repl_print, DB___bam_repl)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_irep_print, DB___bam_irep)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_root_print, DB___bam_root)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_curadj_print, DB___bam_curadj)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_rcuradj_print, DB___bam_rcuradj)) != 0)
+ return (ret);
+ return (0);
+}
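
Every *_print function above is the same three lines: it hands its DB_LOG_RECSPEC table to __log_print_record, which walks the table to format the decoded record. A sketch of that table-walking idea (the recspec field names and the uniform 32-bit decode are assumptions; real records mix field widths):

        static void
        print_by_spec(const DB_LOG_RECSPEC *sp, const void *argp)
        {
                /* The *_desc arrays end with a LOGREC_Done sentinel. */
                for (; sp->type != LOGREC_Done; ++sp)
                        if (sp->type == LOGREC_ARG) {
                                /* Numeric field at sp->offset, printed
                                 * with the table's format string.
                                 */
                                printf("\t%s: ", sp->name);
                                printf(sp->fmt, (u_long)*(const u_int32_t *)
                                    ((const u_int8_t *)argp + sp->offset));
                                printf("\n");
                        } else
                                /* DBTs, LSNs and page images need
                                 * type-specific formatting.
                                 */
                                printf("\t%s: <binary>\n", sp->name);
        }
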
diff --git a/src/clib/atoi.c b/src/clib/atoi.c
new file mode 100644
index 00000000..d064ffb0
--- /dev/null
+++ b/src/clib/atoi.c
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 1988, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * atoi --
+ *
+ * PUBLIC: #ifndef HAVE_ATOI
+ * PUBLIC: int atoi __P((const char *));
+ * PUBLIC: #endif
+ */
+int
+atoi(str)
+ const char *str;
+{
+ return (int)strtol(str, (char **)NULL, 10);
+}
diff --git a/src/clib/atol.c b/src/clib/atol.c
new file mode 100644
index 00000000..9aefcd5a
--- /dev/null
+++ b/src/clib/atol.c
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 1988, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * atol --
+ *
+ * PUBLIC: #ifndef HAVE_ATOL
+ * PUBLIC: long atol __P((const char *));
+ * PUBLIC: #endif
+ */
+long
+atol(str)
+ const char *str;
+{
+ return strtol(str, (char **)NULL, 10);
+}
diff --git a/src/clib/bsearch.c b/src/clib/bsearch.c
new file mode 100644
index 00000000..3e55009a
--- /dev/null
+++ b/src/clib/bsearch.c
@@ -0,0 +1,38 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * bsearch --
+ *
+ * PUBLIC: #ifndef HAVE_BSEARCH
+ * PUBLIC: void *bsearch __P((const void *, const void *, size_t,
+ * PUBLIC: size_t, int (*)(const void *, const void *)));
+ * PUBLIC: #endif
+ */
+void *
+bsearch(key, base, nmemb, size, cmp)
+ const void *key;
+ const void *base;
+ size_t nmemb;
+ size_t size;
+ int (*cmp) __P((const void *, const void *));
+{
+ size_t i;
+
+	/* Not a binary search: a linear scan, which is correct though O(n). */
+	for (i = 0; i < nmemb; i++) {
+ if ((*cmp)(key, (const void *)((char *)base + i * size)) == 0)
+ return ((void *)((char *)base + i * size));
+ }
+
+ return (NULL);
+}
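
A usage sketch for the replacement (hypothetical caller):

        static int
        cmp_int(const void *a, const void *b)
        {
                return (*(const int *)a - *(const int *)b);
        }

        static int *
        find_key(int *keys, size_t n, int key)
        {
                /* Returns a pointer to the match, or NULL.  Because the
                 * scan is linear, the array need not even be sorted.
                 */
                return (bsearch(&key, keys, n, sizeof(*keys), cmp_int));
        }
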
diff --git a/src/clib/getcwd.c b/src/clib/getcwd.c
new file mode 100644
index 00000000..83e8b62d
--- /dev/null
+++ b/src/clib/getcwd.c
@@ -0,0 +1,261 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1989, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+#ifdef HAVE_SYSTEM_INCLUDE_FILES
+#if HAVE_DIRENT_H
+# include <dirent.h>
+# define NAMLEN(dirent) strlen((dirent)->d_name)
+#else
+# define dirent direct
+# define NAMLEN(dirent) (dirent)->d_namlen
+# if HAVE_SYS_NDIR_H
+# include <sys/ndir.h>
+# endif
+# if HAVE_SYS_DIR_H
+# include <sys/dir.h>
+# endif
+# if HAVE_NDIR_H
+# include <ndir.h>
+# endif
+#endif
+#endif
+
+#define ISDOT(dp) \
+ (dp->d_name[0] == '.' && (dp->d_name[1] == '\0' || \
+ (dp->d_name[1] == '.' && dp->d_name[2] == '\0')))
+
+#ifndef dirfd
+#define dirfd(dirp) ((dirp)->dd_fd)
+#endif
+
+/*
+ * getcwd --
+ * Get the current working directory.
+ *
+ * PUBLIC: #ifndef HAVE_GETCWD
+ * PUBLIC: char *getcwd __P((char *, size_t));
+ * PUBLIC: #endif
+ */
+char *
+getcwd(pt, size)
+ char *pt;
+ size_t size;
+{
+ register struct dirent *dp;
+ register DIR *dir;
+ register dev_t dev;
+ register ino_t ino;
+ register int first;
+ register char *bpt, *bup;
+ struct stat s;
+ dev_t root_dev;
+ ino_t root_ino;
+ size_t ptsize, upsize;
+ int ret, save_errno;
+ char *ept, *eup, *up;
+
+ /*
+ * If no buffer specified by the user, allocate one as necessary.
+ * If a buffer is specified, the size has to be non-zero. The path
+ * is built from the end of the buffer backwards.
+ */
+ if (pt) {
+ ptsize = 0;
+ if (!size) {
+ __os_set_errno(EINVAL);
+ return (NULL);
+ }
+ if (size == 1) {
+ __os_set_errno(ERANGE);
+ return (NULL);
+ }
+ ept = pt + size;
+ } else {
+ if ((ret =
+ __os_malloc(NULL, ptsize = 1024 - 4, &pt)) != 0) {
+ __os_set_errno(ret);
+ return (NULL);
+ }
+ ept = pt + ptsize;
+ }
+ bpt = ept - 1;
+ *bpt = '\0';
+
+ /*
+ * Allocate bytes (1024 - malloc space) for the string of "../"'s.
+ * Should always be enough (it's 340 levels). If it's not, allocate
+ * as necessary. Special case the first stat, it's ".", not "..".
+ */
+ if ((ret = __os_malloc(NULL, upsize = 1024 - 4, &up)) != 0)
+ goto err;
+	eup = up + upsize;
+ bup = up;
+ up[0] = '.';
+ up[1] = '\0';
+
+ /* Save root values, so know when to stop. */
+ if (stat("/", &s))
+ goto err;
+ root_dev = s.st_dev;
+ root_ino = s.st_ino;
+
+ __os_set_errno(0); /* XXX readdir has no error return. */
+
+ for (first = 1;; first = 0) {
+ /* Stat the current level. */
+ if (lstat(up, &s))
+ goto err;
+
+ /* Save current node values. */
+ ino = s.st_ino;
+ dev = s.st_dev;
+
+ /* Check for reaching root. */
+ if (root_dev == dev && root_ino == ino) {
+ *--bpt = PATH_SEPARATOR[0];
+ /*
+ * It's unclear that it's a requirement to copy the
+ * path to the beginning of the buffer, but it's always
+ * been that way and stuff would probably break.
+ */
+ bcopy(bpt, pt, ept - bpt);
+ __os_free(NULL, up);
+ return (pt);
+ }
+
+ /*
+ * Build pointer to the parent directory, allocating memory
+ * as necessary. Max length is 3 for "../", the largest
+ * possible component name, plus a trailing NULL.
+ */
+ if (bup + 3 + MAXNAMLEN + 1 >= eup) {
+ if (__os_realloc(NULL, upsize *= 2, &up) != 0)
+ goto err;
+ bup = up;
+ eup = up + upsize;
+ }
+ *bup++ = '.';
+ *bup++ = '.';
+ *bup = '\0';
+
+ /* Open and stat parent directory. */
+ if (!(dir = opendir(up)) || fstat(dirfd(dir), &s))
+ goto err;
+
+ /* Add trailing slash for next directory. */
+ *bup++ = PATH_SEPARATOR[0];
+
+ /*
+ * If it's a mount point, have to stat each element because
+ * the inode number in the directory is for the entry in the
+ * parent directory, not the inode number of the mounted file.
+ */
+ save_errno = 0;
+ if (s.st_dev == dev) {
+ for (;;) {
+ if (!(dp = readdir(dir)))
+ goto notfound;
+ if (dp->d_fileno == ino)
+ break;
+ }
+ } else
+ for (;;) {
+ if (!(dp = readdir(dir)))
+ goto notfound;
+ if (ISDOT(dp))
+ continue;
+ bcopy(dp->d_name, bup, dp->d_namlen + 1);
+
+ /* Save the first error for later. */
+ if (lstat(up, &s)) {
+ if (save_errno == 0)
+ save_errno = __os_get_errno();
+ __os_set_errno(0);
+ continue;
+ }
+ if (s.st_dev == dev && s.st_ino == ino)
+ break;
+ }
+
+ /*
+ * Check for length of the current name, preceding slash,
+ * leading slash.
+ */
+ if (bpt - pt < dp->d_namlen + (first ? 1 : 2)) {
+ size_t len, off;
+
+ if (!ptsize) {
+ __os_set_errno(ERANGE);
+ goto err;
+ }
+ off = bpt - pt;
+ len = ept - bpt;
+ if (__os_realloc(NULL, ptsize *= 2, &pt) != 0)
+ goto err;
+ bpt = pt + off;
+ ept = pt + ptsize;
+ bcopy(bpt, ept - len, len);
+ bpt = ept - len;
+ }
+ if (!first)
+ *--bpt = PATH_SEPARATOR[0];
+ bpt -= dp->d_namlen;
+ bcopy(dp->d_name, bpt, dp->d_namlen);
+ (void)closedir(dir);
+
+ /* Truncate any file name. */
+ *bup = '\0';
+ }
+
+notfound:
+ /*
+ * If readdir set errno, use it, not any saved error; otherwise,
+ * didn't find the current directory in its parent directory, set
+ * errno to ENOENT.
+ */
+ if (__os_get_errno_ret_zero() == 0)
+ __os_set_errno(save_errno == 0 ? ENOENT : save_errno);
+ /* FALLTHROUGH */
+err:
+ if (ptsize)
+ __os_free(NULL, pt);
+ __os_free(NULL, up);
+ return (NULL);
+}
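
A usage sketch (assuming <stdio.h>; note that with a NULL buffer this replacement allocates the result with __os_malloc, which would then need to be released with __os_free):

        static void
        show_cwd(void)
        {
                char buf[1024];

                if (getcwd(buf, sizeof(buf)) == NULL)
                        perror("getcwd");   /* EINVAL, ERANGE, ENOENT, ... */
                else
                        (void)printf("%s\n", buf);
        }
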
diff --git a/src/clib/getopt.c b/src/clib/getopt.c
new file mode 100644
index 00000000..ca98e7f1
--- /dev/null
+++ b/src/clib/getopt.c
@@ -0,0 +1,153 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1987, 1993, 1994
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+/*
+ * Avoid including internal header files, since this
+ * file is also used by example code.
+ *
+ * Unconditional inclusion of stdio and string is
+ * OK in this file; it works on all platforms
+ * for which this file is used.
+ */
+extern char *__db_rpath(const char *);
+#include <stdio.h>
+#include <string.h>
+
+int __db_getopt_reset; /* global reset for VxWorks. */
+
+int opterr = 1, /* if error message should be printed */
+ optind = 1, /* index into parent argv vector */
+ optopt, /* character checked for validity */
+ optreset; /* reset getopt */
+char *optarg; /* argument associated with option */
+
+#undef BADCH
+#define BADCH (int)'?'
+#undef BADARG
+#define BADARG (int)':'
+#undef EMSG
+#define EMSG ""
+
+/*
+ * getopt --
+ * Parse argc/argv argument vector.
+ *
+ * PUBLIC: #ifndef HAVE_GETOPT
+ * PUBLIC: int getopt __P((int, char * const *, const char *));
+ * PUBLIC: #endif
+ */
+int
+getopt(nargc, nargv, ostr)
+ int nargc;
+ char * const *nargv;
+ const char *ostr;
+{
+ static char *progname;
+ static char *place = EMSG; /* option letter processing */
+ char *oli; /* option letter list index */
+
+ /*
+ * VxWorks needs to be able to repeatedly call getopt from multiple
+ * programs within its global name space.
+ */
+ if (__db_getopt_reset) {
+ __db_getopt_reset = 0;
+
+ opterr = optind = 1;
+ optopt = optreset = 0;
+ optarg = NULL;
+ progname = NULL;
+ place = EMSG;
+ }
+ if (!progname) {
+ if ((progname = __db_rpath(*nargv)) == NULL)
+ progname = *nargv;
+ else
+ ++progname;
+ }
+
+ if (optreset || !*place) { /* update scanning pointer */
+ optreset = 0;
+ if (optind >= nargc || *(place = nargv[optind]) != '-') {
+ place = EMSG;
+ return (EOF);
+ }
+ if (place[1] && *++place == '-') { /* found "--" */
+ ++optind;
+ place = EMSG;
+ return (EOF);
+ }
+ } /* option letter okay? */
+ if ((optopt = (int)*place++) == (int)':' ||
+ !(oli = strchr(ostr, optopt))) {
+ /*
+ * if the user didn't specify '-' as an option,
+ * assume it means EOF.
+ */
+ if (optopt == (int)'-')
+ return (EOF);
+ if (!*place)
+ ++optind;
+ if (opterr && *ostr != ':')
+ (void)fprintf(stderr,
+ "%s: illegal option -- %c\n", progname, optopt);
+ return (BADCH);
+ }
+ if (*++oli != ':') { /* don't need argument */
+ optarg = NULL;
+ if (!*place)
+ ++optind;
+ }
+ else { /* need an argument */
+ if (*place) /* no white space */
+ optarg = place;
+ else if (nargc <= ++optind) { /* no arg */
+ place = EMSG;
+ if (*ostr == ':')
+ return (BADARG);
+ if (opterr)
+ (void)fprintf(stderr,
+ "%s: option requires an argument -- %c\n",
+ progname, optopt);
+ return (BADCH);
+ }
+ else /* white space */
+ optarg = nargv[optind];
+ place = EMSG;
+ ++optind;
+ }
+ return (optopt); /* dump back option letter */
+}
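
A usage sketch (hypothetical program; in "ab:", -a is a flag and -b takes an argument):

        int
        main(int argc, char *argv[])
        {
                int ch;

                __db_getopt_reset = 1;  /* allow reparsing on VxWorks */
                while ((ch = getopt(argc, argv, "ab:")) != EOF)
                        switch (ch) {
                        case 'a':       /* simple flag */
                                break;
                        case 'b':       /* optarg points at -b's argument */
                                break;
                        default:        /* getopt printed the complaint */
                                return (1);
                        }
                /* argv[optind] is the first non-option argument. */
                return (0);
        }
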
diff --git a/src/clib/isalpha.c b/src/clib/isalpha.c
new file mode 100644
index 00000000..6bf1ffb7
--- /dev/null
+++ b/src/clib/isalpha.c
@@ -0,0 +1,28 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * isalpha --
+ *
+ * PUBLIC: #ifndef HAVE_ISALPHA
+ * PUBLIC: int isalpha __P((int));
+ * PUBLIC: #endif
+ */
+int
+isalpha(c)
+ int c;
+{
+ /*
+ * Depends on ASCII-like character ordering.
+ */
+ return ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ? 1 : 0);
+}
diff --git a/src/clib/isdigit.c b/src/clib/isdigit.c
new file mode 100644
index 00000000..d1b2a65e
--- /dev/null
+++ b/src/clib/isdigit.c
@@ -0,0 +1,28 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * isdigit --
+ *
+ * PUBLIC: #ifndef HAVE_ISDIGIT
+ * PUBLIC: int isdigit __P((int));
+ * PUBLIC: #endif
+ */
+int
+isdigit(c)
+ int c;
+{
+ /*
+ * Depends on ASCII-like character ordering.
+ */
+ return (c >= '0' && c <= '9' ? 1 : 0);
+}
diff --git a/src/clib/isprint.c b/src/clib/isprint.c
new file mode 100644
index 00000000..685e20ea
--- /dev/null
+++ b/src/clib/isprint.c
@@ -0,0 +1,28 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * isprint --
+ *
+ * PUBLIC: #ifndef HAVE_ISPRINT
+ * PUBLIC: int isprint __P((int));
+ * PUBLIC: #endif
+ */
+int
+isprint(c)
+ int c;
+{
+ /*
+ * Depends on ASCII character values.
+ */
+ return ((c >= ' ' && c <= '~') ? 1 : 0);
+}
diff --git a/src/clib/isspace.c b/src/clib/isspace.c
new file mode 100644
index 00000000..df450d3b
--- /dev/null
+++ b/src/clib/isspace.c
@@ -0,0 +1,26 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * isspace --
+ *
+ * PUBLIC: #ifndef HAVE_ISSPACE
+ * PUBLIC: int isspace __P((int));
+ * PUBLIC: #endif
+ */
+int
+isspace(c)
+ int c;
+{
+ return (c == '\t' || c == '\n' ||
+ c == '\v' || c == '\f' || c == '\r' || c == ' ' ? 1 : 0);
+}
diff --git a/src/clib/memcmp.c b/src/clib/memcmp.c
new file mode 100644
index 00000000..7fec827c
--- /dev/null
+++ b/src/clib/memcmp.c
@@ -0,0 +1,62 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * memcmp --
+ *
+ * PUBLIC: #ifndef HAVE_MEMCMP
+ * PUBLIC: int memcmp __P((const void *, const void *, size_t));
+ * PUBLIC: #endif
+ */
+int
+memcmp(s1, s2, n)
+ const void *s1, *s2;
+ size_t n;
+{
+ if (n != 0) {
+		const unsigned char *p1 = (const unsigned char *)s1,
+			*p2 = (const unsigned char *)s2;
+ do {
+ if (*p1++ != *p2++)
+ return (*--p1 - *--p2);
+ } while (--n != 0);
+ }
+ return (0);
+}
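
The return value is the difference of the first mismatching bytes compared as unsigned chars, so bytes above 0x7f sort high; a small check (assuming <assert.h> and <string.h>):

        static void
        memcmp_demo(void)
        {
                assert(memcmp("abc", "abc", 3) == 0);
                assert(memcmp("abc", "abd", 3) < 0);    /* 'c' < 'd' */
                assert(memcmp("\xff", "\x01", 1) > 0);  /* unsigned bytes */
        }
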
diff --git a/src/clib/memmove.c b/src/clib/memmove.c
new file mode 100644
index 00000000..34a181cc
--- /dev/null
+++ b/src/clib/memmove.c
@@ -0,0 +1,150 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * sizeof(word) MUST BE A POWER OF TWO
+ * SO THAT wmask BELOW IS ALL ONES
+ */
+typedef int word; /* "word" used for optimal copy speed */
+
+#undef wsize
+#define wsize sizeof(word)
+#undef wmask
+#define wmask (wsize - 1)
+
+/*
+ * Copy a block of memory, handling overlap.
+ * This is the routine that actually implements
+ * (the portable versions of) bcopy, memcpy, and memmove.
+ */
+#ifdef MEMCOPY
+/*
+ * PUBLIC: #ifndef HAVE_MEMCPY
+ * PUBLIC: void *memcpy __P((void *, const void *, size_t));
+ * PUBLIC: #endif
+ */
+void *
+memcpy(dst0, src0, length)
+#else
+#ifdef MEMMOVE
+/*
+ * PUBLIC: #ifndef HAVE_MEMMOVE
+ * PUBLIC: void *memmove __P((void *, const void *, size_t));
+ * PUBLIC: #endif
+ */
+void *
+memmove(dst0, src0, length)
+#else
+void
+bcopy(src0, dst0, length)
+#endif
+#endif
+ void *dst0;
+ const void *src0;
+ register size_t length;
+{
+ register char *dst = dst0;
+ register const char *src = src0;
+ register size_t t;
+
+ if (length == 0 || dst == src) /* nothing to do */
+ goto done;
+
+ /*
+ * Macros: loop-t-times; and loop-t-times, t>0
+ */
+#undef TLOOP
+#define TLOOP(s) if (t) TLOOP1(s)
+#undef TLOOP1
+#define TLOOP1(s) do { s; } while (--t)
+
+ if ((unsigned long)dst < (unsigned long)src) {
+ /*
+ * Copy forward.
+ */
+ t = (size_t)src; /* only need low bits */
+ if ((t | (size_t)dst) & wmask) {
+ /*
+ * Try to align operands. This cannot be done
+ * unless the low bits match.
+ */
+ if ((t ^ (size_t)dst) & wmask || length < wsize)
+ t = length;
+ else
+ t = wsize - (t & wmask);
+ length -= t;
+ TLOOP1(*dst++ = *src++);
+ }
+ /*
+ * Copy whole words, then mop up any trailing bytes.
+ */
+ t = length / wsize;
+ TLOOP(*(word *)dst = *(word *)src; src += wsize; dst += wsize);
+ t = length & wmask;
+ TLOOP(*dst++ = *src++);
+ } else {
+ /*
+ * Copy backwards. Otherwise essentially the same.
+ * Alignment works as before, except that it takes
+ * (t&wmask) bytes to align, not wsize-(t&wmask).
+ */
+ src += length;
+ dst += length;
+ t = (size_t)src;
+ if ((t | (size_t)dst) & wmask) {
+ if ((t ^ (size_t)dst) & wmask || length <= wsize)
+ t = length;
+ else
+ t &= wmask;
+ length -= t;
+ TLOOP1(*--dst = *--src);
+ }
+ t = length / wsize;
+ TLOOP(src -= wsize; dst -= wsize; *(word *)dst = *(word *)src);
+ t = length & wmask;
+ TLOOP(*--dst = *--src);
+ }
+done:
+#if defined(MEMCOPY) || defined(MEMMOVE)
+ return (dst0);
+#else
+ return;
+#endif
+}
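
Because the routine copies backwards when the destination overlaps the end of the source, in-place shifts are safe; a small demo (hypothetical caller):

        static void
        overlap_demo(void)
        {
                char buf[] = "abcdef";

                /* dst > src, so the backward-copy branch is taken. */
                memmove(buf + 1, buf, 5);       /* buf is now "aabcde" */
        }
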
diff --git a/src/clib/printf.c b/src/clib/printf.c
new file mode 100644
index 00000000..a2c01296
--- /dev/null
+++ b/src/clib/printf.c
@@ -0,0 +1,116 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * printf --
+ *
+ * PUBLIC: #ifndef HAVE_PRINTF
+ * PUBLIC: int printf __P((const char *, ...));
+ * PUBLIC: #endif
+ */
+#ifndef HAVE_PRINTF
+int
+#ifdef STDC_HEADERS
+printf(const char *fmt, ...)
+#else
+printf(fmt, va_alist)
+ const char *fmt;
+ va_dcl
+#endif
+{
+ va_list ap;
+ size_t len;
+ char buf[2048]; /* !!!: END OF THE STACK DON'T TRUST SPRINTF. */
+
+#ifdef STDC_HEADERS
+ va_start(ap, fmt);
+#else
+ va_start(ap);
+#endif
+ len = (size_t)vsnprintf(buf, sizeof(buf), fmt, ap);
+ va_end(ap);
+
+ /*
+ * We implement printf/fprintf with fwrite, because Berkeley DB uses
+ * fwrite in other places.
+ */
+ return (fwrite(
+ buf, sizeof(char), (size_t)len, stdout) == len ? (int)len: -1);
+}
+#endif /* HAVE_PRINTF */
+
+/*
+ * fprintf --
+ *
+ * PUBLIC: #ifndef HAVE_PRINTF
+ * PUBLIC: int fprintf __P((FILE *, const char *, ...));
+ * PUBLIC: #endif
+ */
+#ifndef HAVE_PRINTF
+int
+#ifdef STDC_HEADERS
+fprintf(FILE *fp, const char *fmt, ...)
+#else
+fprintf(fp, fmt, va_alist)
+ FILE *fp;
+ const char *fmt;
+ va_dcl
+#endif
+{
+ va_list ap;
+ size_t len;
+ char buf[2048]; /* !!!: END OF THE STACK DON'T TRUST SPRINTF. */
+
+#ifdef STDC_HEADERS
+ va_start(ap, fmt);
+#else
+ va_start(ap);
+#endif
+	len = (size_t)vsnprintf(buf, sizeof(buf), fmt, ap);
+ va_end(ap);
+
+ /*
+ * We implement printf/fprintf with fwrite, because Berkeley DB uses
+ * fwrite in other places.
+ */
+ return (fwrite(
+ buf, sizeof(char), (size_t)len, fp) == len ? (int)len: -1);
+}
+#endif /* HAVE_PRINTF */
+
+/*
+ * vfprintf --
+ *
+ * PUBLIC: #ifndef HAVE_PRINTF
+ * PUBLIC: int vfprintf __P((FILE *, const char *, va_list));
+ * PUBLIC: #endif
+ */
+#ifndef HAVE_PRINTF
+int
+vfprintf(fp, fmt, ap)
+ FILE *fp;
+ const char *fmt;
+ va_list ap;
+{
+ size_t len;
+ char buf[2048]; /* !!!: END OF THE STACK DON'T TRUST SPRINTF. */
+
+	len = (size_t)vsnprintf(buf, sizeof(buf), fmt, ap);
+
+ /*
+ * We implement printf/fprintf with fwrite, because Berkeley DB uses
+ * fwrite in other places.
+ */
+ return (fwrite(
+ buf, sizeof(char), (size_t)len, fp) == len ? (int)len: -1);
+}
+#endif /* HAVE_PRINTF */
diff --git a/src/clib/qsort.c b/src/clib/qsort.c
new file mode 100644
index 00000000..cec6288c
--- /dev/null
+++ b/src/clib/qsort.c
@@ -0,0 +1,181 @@
+/*-
+ * Copyright (c) 1992, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * static char sccsid[] = "@(#)qsort.c 8.1 (Berkeley) 6/4/93";
+ * Id: qsort.c,v 1.4 1996/04/19 18:40:20 bde
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+static char *med3 __P((char *,
+ char *, char *, int (*)(const void *, const void *)));
+static void swapfunc __P((char *, char *, int, int));
+
+#define min(a, b) ((a) < (b) ? (a) : (b))
+
+/*
+ * Qsort routine from Bentley & McIlroy's "Engineering a Sort Function".
+ */
+#define swapcode(TYPE, parmi, parmj, n) { \
+ long i = (n) / sizeof(TYPE); \
+ register TYPE *pi = (TYPE *) (parmi); \
+ register TYPE *pj = (TYPE *) (parmj); \
+ do { \
+ register TYPE t = *pi; \
+ *pi++ = *pj; \
+ *pj++ = t; \
+ } while (--i > 0); \
+}
+
+#define SWAPINIT(a, es) swaptype = ((char *)a - (char *)0) % sizeof(long) || \
+ es % sizeof(long) ? 2 : es == sizeof(long)? 0 : 1;
+
+static inline void
+swapfunc(a, b, n, swaptype)
+ char *a, *b;
+ int n, swaptype;
+{
+ if (swaptype <= 1)
+ swapcode(long, a, b, n)
+ else
+ swapcode(char, a, b, n)
+}
+
+#define swap(a, b) \
+ if (swaptype == 0) { \
+ long t = *(long *)(a); \
+ *(long *)(a) = *(long *)(b); \
+ *(long *)(b) = t; \
+ } else \
+ swapfunc(a, b, es, swaptype)
+
+#define vecswap(a, b, n) if ((n) > 0) swapfunc(a, b, n, swaptype)
+
+static inline char *
+med3(a, b, c, cmp)
+ char *a, *b, *c;
+ int (*cmp)(const void *, const void *);
+{
+ return cmp(a, b) < 0 ?
+ (cmp(b, c) < 0 ? b : (cmp(a, c) < 0 ? c : a ))
+ :(cmp(b, c) > 0 ? b : (cmp(a, c) < 0 ? a : c ));
+}
+
+/*
+ * PUBLIC: #ifndef HAVE_QSORT
+ * PUBLIC: void qsort __P((void *,
+ * PUBLIC: size_t, size_t, int(*)(const void *, const void *)));
+ * PUBLIC: #endif
+ */
+void
+qsort(a, n, es, cmp)
+ void *a;
+ size_t n, es;
+ int (*cmp) __P((const void *, const void *));
+{
+ char *pa, *pb, *pc, *pd, *pl, *pm, *pn;
+ int d, r, swaptype, swap_cnt;
+
+loop: SWAPINIT(a, es);
+ swap_cnt = 0;
+ if (n < 7) {
+ for (pm = (char *)a + es; pm < (char *)a + n * es; pm += es)
+ for (pl = pm; pl > (char *)a && cmp(pl - es, pl) > 0;
+ pl -= es)
+ swap(pl, pl - es);
+ return;
+ }
+ pm = (char *)a + (n / 2) * es;
+ if (n > 7) {
+ pl = a;
+ pn = (char *)a + (n - 1) * es;
+ if (n > 40) {
+ d = (n / 8) * es;
+ pl = med3(pl, pl + d, pl + 2 * d, cmp);
+ pm = med3(pm - d, pm, pm + d, cmp);
+ pn = med3(pn - 2 * d, pn - d, pn, cmp);
+ }
+ pm = med3(pl, pm, pn, cmp);
+ }
+ swap(a, pm);
+ pa = pb = (char *)a + es;
+
+ pc = pd = (char *)a + (n - 1) * es;
+ for (;;) {
+ while (pb <= pc && (r = cmp(pb, a)) <= 0) {
+ if (r == 0) {
+ swap_cnt = 1;
+ swap(pa, pb);
+ pa += es;
+ }
+ pb += es;
+ }
+ while (pb <= pc && (r = cmp(pc, a)) >= 0) {
+ if (r == 0) {
+ swap_cnt = 1;
+ swap(pc, pd);
+ pd -= es;
+ }
+ pc -= es;
+ }
+ if (pb > pc)
+ break;
+ swap(pb, pc);
+ swap_cnt = 1;
+ pb += es;
+ pc -= es;
+ }
+ if (swap_cnt == 0) { /* Switch to insertion sort */
+ for (pm = (char *)a + es; pm < (char *)a + n * es; pm += es)
+ for (pl = pm; pl > (char *)a && cmp(pl - es, pl) > 0;
+ pl -= es)
+ swap(pl, pl - es);
+ return;
+ }
+
+ pn = (char *)a + n * es;
+ r = min(pa - (char *)a, pb - pa);
+ vecswap(a, pb - r, r);
+ r = min((int)(pd - pc), (int)(pn - pd - es));
+ vecswap(pb, pn - r, r);
+ if ((r = (int)(pb - pa)) > (int)es)
+ qsort(a, r / es, es, cmp);
+ if ((r = (int)(pd - pc)) > (int)es) {
+ /* Iterate rather than recurse to save stack space */
+ a = pn - r;
+ n = r / es;
+ goto loop;
+ }
+/* qsort(pn - r, r / es, es, cmp);*/
+}
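
A usage sketch for the replacement qsort (hypothetical caller; same interface as qsort(3)):

        static int
        cmp_str(const void *a, const void *b)
        {
                return (strcmp(
                    *(const char * const *)a, *(const char * const *)b));
        }

        static void
        sort_demo(void)
        {
                const char *v[] = { "split", "adj", "cdel", "repl" };

                qsort(v, sizeof(v) / sizeof(v[0]), sizeof(v[0]), cmp_str);
                /* v is now { "adj", "cdel", "repl", "split" } */
        }
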
diff --git a/src/clib/raise.c b/src/clib/raise.c
new file mode 100644
index 00000000..ad0e567f
--- /dev/null
+++ b/src/clib/raise.c
@@ -0,0 +1,26 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * raise --
+ * Send a signal to the current process.
+ *
+ * PUBLIC: #ifndef HAVE_RAISE
+ * PUBLIC: int raise __P((int));
+ * PUBLIC: #endif
+ */
+int
+raise(s)
+ int s;
+{
+ return (kill(getpid(), s));
+}
diff --git a/src/clib/rand.c b/src/clib/rand.c
new file mode 100644
index 00000000..6b810060
--- /dev/null
+++ b/src/clib/rand.c
@@ -0,0 +1,25 @@
+/*
+ * Copied from the ANSI C standard 4.10.2.2.
+ */
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * rand, srand --
+ *
+ * PUBLIC: #ifndef HAVE_RAND
+ * PUBLIC: int rand __P((void));
+ * PUBLIC: void srand __P((unsigned int));
+ * PUBLIC: #endif
+ */
+int rand(void) /* RAND_MAX assumed to be 32767 */
+{
+ DB_GLOBAL(rand_next) = DB_GLOBAL(rand_next) * 1103515245 + 12345;
+ return (unsigned int) (DB_GLOBAL(rand_next)/65536) % 32768;
+}
+
+void srand(unsigned int seed)
+{
+ DB_GLOBAL(rand_next) = seed;
+}
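
The constants are the C standard's sample implementation; the output is bits 16..30 of rand_next, so the sequence does not depend on the (unsigned) width of rand_next. A check of the well-known first values (assuming <assert.h> and unsigned wraparound):

        static void
        rand_demo(void)
        {
                srand(1);
                /* rand_next = 1 * 1103515245 + 12345 = 1103527590,
                 * and 1103527590 / 65536 = 16838.
                 */
                assert(rand() == 16838);
                assert(rand() == 5758);
        }
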
diff --git a/src/clib/snprintf.c b/src/clib/snprintf.c
new file mode 100644
index 00000000..6b31d850
--- /dev/null
+++ b/src/clib/snprintf.c
@@ -0,0 +1,149 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+#if !defined(HAVE_SNPRINTF) || !defined(HAVE_VSNPRINTF)
+static void sprintf_overflow __P((void));
+static int sprintf_retcharpnt __P((void));
+#endif
+
+/*
+ * snprintf --
+ * Bounded version of sprintf.
+ *
+ * PUBLIC: #ifndef HAVE_SNPRINTF
+ * PUBLIC: int snprintf __P((char *, size_t, const char *, ...));
+ * PUBLIC: #endif
+ */
+#ifndef HAVE_SNPRINTF
+int
+#ifdef STDC_HEADERS
+snprintf(char *str, size_t n, const char *fmt, ...)
+#else
+snprintf(str, n, fmt, va_alist)
+ char *str;
+ size_t n;
+ const char *fmt;
+ va_dcl
+#endif
+{
+ static int ret_charpnt = -1;
+ va_list ap;
+ size_t len;
+
+ if (ret_charpnt == -1)
+ ret_charpnt = sprintf_retcharpnt();
+
+#ifdef STDC_HEADERS
+ va_start(ap, fmt);
+#else
+ va_start(ap);
+#endif
+ len = (size_t)vsprintf(str, fmt, ap);
+ if (ret_charpnt)
+ len = strlen(str);
+
+ va_end(ap);
+
+ if (len >= n) {
+ sprintf_overflow();
+ /* NOTREACHED */
+ }
+ return ((int)len);
+}
+#endif
+
+/*
+ * vsnprintf --
+ * Bounded version of vsprintf.
+ *
+ * PUBLIC: #ifndef HAVE_VSNPRINTF
+ * PUBLIC: int vsnprintf __P((char *, size_t, const char *, va_list));
+ * PUBLIC: #endif
+ */
+#ifndef HAVE_VSNPRINTF
+int
+vsnprintf(str, n, fmt, ap)
+ char *str;
+ size_t n;
+ const char *fmt;
+ va_list ap;
+{
+ static int ret_charpnt = -1;
+ size_t len;
+
+ if (ret_charpnt == -1)
+ ret_charpnt = sprintf_retcharpnt();
+
+ len = (size_t)vsprintf(str, fmt, ap);
+ if (ret_charpnt)
+ len = strlen(str);
+
+ if (len >= n) {
+ sprintf_overflow();
+ /* NOTREACHED */
+ }
+ return ((int)len);
+}
+#endif
+
+#if !defined(HAVE_SNPRINTF) || !defined(HAVE_VSNPRINTF)
+static void
+sprintf_overflow()
+{
+ /*
+ * !!!
+ * We're potentially manipulating strings handed us by the application,
+ * and on systems without a real snprintf() the sprintf() calls could
+ * have overflowed the buffer. We can't do anything about it now, but
+ * we don't want to return control to the application, we might have
+ * overwritten the stack with a Trojan horse. We're not trying to do
+ * anything recoverable here because systems without snprintf support
+ * are pretty rare anymore.
+ */
+#define OVERFLOW_ERROR "internal buffer overflow, process ended\n"
+#ifndef STDERR_FILENO
+#define STDERR_FILENO 2
+#endif
+ (void)write(STDERR_FILENO, OVERFLOW_ERROR, sizeof(OVERFLOW_ERROR) - 1);
+
+ /* Be polite. */
+ exit(1);
+
+ /* But firm. */
+ __os_abort(NULL);
+
+ /* NOTREACHED */
+}
+
+static int
+sprintf_retcharpnt()
+{
+ int ret_charpnt;
+ char buf[10];
+
+ /*
+ * Some old versions of sprintf return a pointer to the first argument
+ * instead of a character count. Assume the return value of snprintf,
+ * vsprintf, etc. will be the same as sprintf, and check the easy one.
+ *
+ * We do this test at run-time because it's not a test we can do in a
+ * cross-compilation environment.
+ */
+
+ ret_charpnt =
+ (int)sprintf(buf, "123") != 3 ||
+ (int)sprintf(buf, "123456789") != 9 ||
+ (int)sprintf(buf, "1234") != 4;
+
+ return (ret_charpnt);
+}
+#endif
diff --git a/src/clib/strcasecmp.c b/src/clib/strcasecmp.c
new file mode 100644
index 00000000..287895ce
--- /dev/null
+++ b/src/clib/strcasecmp.c
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 1987, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * strcasecmp --
+ * Do strcmp(3) in a case-insensitive manner.
+ *
+ * PUBLIC: #ifndef HAVE_STRCASECMP
+ * PUBLIC: int strcasecmp __P((const char *, const char *));
+ * PUBLIC: #endif
+ */
+int
+strcasecmp(s1, s2)
+ const char *s1, *s2;
+{
+ u_char s1ch, s2ch;
+
+ for (;;) {
+ s1ch = *s1++;
+ s2ch = *s2++;
+ if (s1ch >= 'A' && s1ch <= 'Z') /* tolower() */
+ s1ch += 32;
+ if (s2ch >= 'A' && s2ch <= 'Z') /* tolower() */
+ s2ch += 32;
+ if (s1ch != s2ch)
+ return (s1ch - s2ch);
+ if (s1ch == '\0')
+ return (0);
+ }
+ /* NOTREACHED */
+}
+
+/*
+ * strncasecmp --
+ * Do strncmp(3) in a case-insensitive manner.
+ *
+ * PUBLIC: #ifndef HAVE_STRCASECMP
+ * PUBLIC: int strncasecmp __P((const char *, const char *, size_t));
+ * PUBLIC: #endif
+ */
+int
+strncasecmp(s1, s2, n)
+ const char *s1, *s2;
+ register size_t n;
+{
+ u_char s1ch, s2ch;
+
+ for (; n != 0; --n) {
+ s1ch = *s1++;
+ s2ch = *s2++;
+ if (s1ch >= 'A' && s1ch <= 'Z') /* tolower() */
+ s1ch += 32;
+ if (s2ch >= 'A' && s2ch <= 'Z') /* tolower() */
+ s2ch += 32;
+ if (s1ch != s2ch)
+ return (s1ch - s2ch);
+ if (s1ch == '\0')
+ return (0);
+ }
+ return (0);
+}
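
The folding is ASCII-only ('A'..'Z' shifted down by 32), which is all Berkeley DB needs; a small check (assuming <assert.h>):

        static void
        casecmp_demo(void)
        {
                assert(strcasecmp("BTree", "btree") == 0);
                assert(strncasecmp("btree_autop", "BTREE_AUTOx", 10) == 0);
        }
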
diff --git a/src/clib/strcat.c b/src/clib/strcat.c
new file mode 100644
index 00000000..d99c9070
--- /dev/null
+++ b/src/clib/strcat.c
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 1988, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * strcat --
+ *
+ * PUBLIC: #ifndef HAVE_STRCAT
+ * PUBLIC: char *strcat __P((char *, const char *));
+ * PUBLIC: #endif
+ */
+char *
+strcat(char *s, const char *append)
+{
+ char *save = s;
+
+ for (; *s; ++s);
+ while ((*s++ = *append++));
+ return (save);
+}
diff --git a/src/clib/strchr.c b/src/clib/strchr.c
new file mode 100644
index 00000000..a8ac4ce0
--- /dev/null
+++ b/src/clib/strchr.c
@@ -0,0 +1,57 @@
+/*-
+ * Copyright (c) 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * strchr --
+ *	Return a pointer to the first occurrence of ch in p, or NULL.
+ *
+ * PUBLIC: #ifndef HAVE_STRCHR
+ * PUBLIC: char *strchr __P((const char *, int));
+ * PUBLIC: #endif
+ */
+char *
+strchr(const char *p, int ch)
+{
+ char c;
+
+ c = ch;
+ for (;; ++p) {
+ if (*p == c)
+ return ((char *)p);
+ if (*p == '\0')
+ return (NULL);
+ }
+ /* NOTREACHED */
+}
diff --git a/src/clib/strdup.c b/src/clib/strdup.c
new file mode 100644
index 00000000..5863340c
--- /dev/null
+++ b/src/clib/strdup.c
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 1988, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * strdup --
+ *	Return a malloc'd copy of a NUL-terminated string, or NULL.
+ *
+ * PUBLIC: #ifndef HAVE_STRDUP
+ * PUBLIC: char *strdup __P((const char *));
+ * PUBLIC: #endif
+ */
+char *
+strdup(str)
+ const char *str;
+{
+ size_t len;
+ char *copy;
+
+ len = strlen(str) + 1;
+ if (!(copy = malloc((u_int)len)))
+ return (NULL);
+ memcpy(copy, str, len);
+ return (copy);
+}
diff --git a/src/clib/strerror.c b/src/clib/strerror.c
new file mode 100644
index 00000000..62bd7dd5
--- /dev/null
+++ b/src/clib/strerror.c
@@ -0,0 +1,225 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1988, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+/*
+ * Copyright (c) 1982, 1985, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * __FBSDID("FreeBSD: /repoman/r/ncvs/src/lib/libc/gen/errlst.c,v 1.8 2005/04/02 12:33:28 das Exp $");
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * strerror --
+ * Return the string associated with an errno.
+ *
+ * PUBLIC: #ifndef HAVE_STRERROR
+ * PUBLIC: char *strerror __P((int));
+ * PUBLIC: #endif
+ */
+char *
+strerror(num)
+ int num;
+{
+#define ERRSTR(v, s) do { \
+ if (num == (v)) \
+ return (s); \
+} while (0)
+ ERRSTR(0, "Undefined error: 0");
+ ERRSTR(EPERM, "Operation not permitted");
+ ERRSTR(ENOENT, "No such file or directory");
+ ERRSTR(ESRCH, "No such process");
+ ERRSTR(EINTR, "Interrupted system call");
+ ERRSTR(EIO, "Input/output error");
+ ERRSTR(ENXIO, "Device not configured");
+ ERRSTR(E2BIG, "Argument list too long");
+ ERRSTR(ENOEXEC, "Exec format error");
+ ERRSTR(EBADF, "Bad file descriptor");
+ ERRSTR(ECHILD, "No child processes");
+ ERRSTR(EDEADLK, "Resource deadlock avoided");
+ ERRSTR(ENOMEM, "Cannot allocate memory");
+ ERRSTR(EACCES, "Permission denied");
+ ERRSTR(EFAULT, "Bad address");
+ ERRSTR(ENOTBLK, "Block device required");
+ ERRSTR(EBUSY, "Device busy");
+ ERRSTR(EEXIST, "File exists");
+ ERRSTR(EXDEV, "Cross-device link");
+ ERRSTR(ENODEV, "Operation not supported by device");
+ ERRSTR(ENOTDIR, "Not a directory");
+ ERRSTR(EISDIR, "Is a directory");
+ ERRSTR(EINVAL, "Invalid argument");
+ ERRSTR(ENFILE, "Too many open files in system");
+ ERRSTR(EMFILE, "Too many open files");
+ ERRSTR(ENOTTY, "Inappropriate ioctl for device");
+ ERRSTR(ETXTBSY, "Text file busy");
+ ERRSTR(EFBIG, "File too large");
+ ERRSTR(ENOSPC, "No space left on device");
+ ERRSTR(ESPIPE, "Illegal seek");
+ ERRSTR(EROFS, "Read-only file system");
+ ERRSTR(EMLINK, "Too many links");
+ ERRSTR(EPIPE, "Broken pipe");
+
+/* math software */
+ ERRSTR(EDOM, "Numerical argument out of domain");
+ ERRSTR(ERANGE, "Result too large");
+
+/* non-blocking and interrupt i/o */
+ ERRSTR(EAGAIN, "Resource temporarily unavailable");
+ ERRSTR(EWOULDBLOCK, "Resource temporarily unavailable");
+ ERRSTR(EINPROGRESS, "Operation now in progress");
+ ERRSTR(EALREADY, "Operation already in progress");
+
+/* ipc/network software -- argument errors */
+ ERRSTR(ENOTSOCK, "Socket operation on non-socket");
+ ERRSTR(EDESTADDRREQ, "Destination address required");
+ ERRSTR(EMSGSIZE, "Message too long");
+ ERRSTR(EPROTOTYPE, "Protocol wrong type for socket");
+ ERRSTR(ENOPROTOOPT, "Protocol not available");
+ ERRSTR(EPROTONOSUPPORT, "Protocol not supported");
+ ERRSTR(ESOCKTNOSUPPORT, "Socket type not supported");
+ ERRSTR(EOPNOTSUPP, "Operation not supported");
+ ERRSTR(EPFNOSUPPORT, "Protocol family not supported");
+ ERRSTR(EAFNOSUPPORT, "Address family not supported by protocol family");
+ ERRSTR(EADDRINUSE, "Address already in use");
+ ERRSTR(EADDRNOTAVAIL, "Can't assign requested address");
+
+/* ipc/network software -- operational errors */
+ ERRSTR(ENETDOWN, "Network is down");
+ ERRSTR(ENETUNREACH, "Network is unreachable");
+ ERRSTR(ENETRESET, "Network dropped connection on reset");
+ ERRSTR(ECONNABORTED, "Software caused connection abort");
+ ERRSTR(ECONNRESET, "Connection reset by peer");
+ ERRSTR(ENOBUFS, "No buffer space available");
+ ERRSTR(EISCONN, "Socket is already connected");
+ ERRSTR(ENOTCONN, "Socket is not connected");
+ ERRSTR(ESHUTDOWN, "Can't send after socket shutdown");
+ ERRSTR(ETOOMANYREFS, "Too many references: can't splice");
+ ERRSTR(ETIMEDOUT, "Operation timed out");
+ ERRSTR(ECONNREFUSED, "Connection refused");
+
+ ERRSTR(ELOOP, "Too many levels of symbolic links");
+ ERRSTR(ENAMETOOLONG, "File name too long");
+
+/* should be rearranged */
+ ERRSTR(EHOSTDOWN, "Host is down");
+ ERRSTR(EHOSTUNREACH, "No route to host");
+ ERRSTR(ENOTEMPTY, "Directory not empty");
+
+/* quotas & mush */
+ ERRSTR(EPROCLIM, "Too many processes");
+ ERRSTR(EUSERS, "Too many users");
+ ERRSTR(EDQUOT, "Disc quota exceeded");
+
+/* Network File System */
+ ERRSTR(ESTALE, "Stale NFS file handle");
+ ERRSTR(EREMOTE, "Too many levels of remote in path");
+ ERRSTR(EBADRPC, "RPC struct is bad");
+ ERRSTR(ERPCMISMATCH, "RPC version wrong");
+ ERRSTR(EPROGUNAVAIL, "RPC prog. not avail");
+ ERRSTR(EPROGMISMATCH, "Program version wrong");
+ ERRSTR(EPROCUNAVAIL, "Bad procedure for program");
+
+ ERRSTR(ENOLCK, "No locks available");
+ ERRSTR(ENOSYS, "Function not implemented");
+ ERRSTR(EFTYPE, "Inappropriate file type or format");
+#ifdef EAUTH
+ ERRSTR(EAUTH, "Authentication error");
+#endif
+#ifdef ENEEDAUTH
+ ERRSTR(ENEEDAUTH, "Need authenticator");
+#endif
+ ERRSTR(EIDRM, "Identifier removed");
+ ERRSTR(ENOMSG, "No message of desired type");
+#ifdef EOVERFLOW
+ ERRSTR(EOVERFLOW, "Value too large to be stored in data type");
+#endif
+ ERRSTR(ECANCELED, "Operation canceled");
+ ERRSTR(EILSEQ, "Illegal byte sequence");
+#ifdef ENOATTR
+ ERRSTR(ENOATTR, "Attribute not found");
+#endif
+
+/* General */
+#ifdef EDOOFUS
+ ERRSTR(EDOOFUS, "Programming error");
+#endif
+
+#ifdef EBADMSG
+ ERRSTR(EBADMSG, "Bad message");
+#endif
+#ifdef EMULTIHOP
+ ERRSTR(EMULTIHOP, "Multihop attempted");
+#endif
+#ifdef ENOLINK
+ ERRSTR(ENOLINK, "Link has been severed");
+#endif
+#ifdef EPROTO
+ ERRSTR(EPROTO, "Protocol error");
+#endif
+
+ return (__db_unknown_error(num));
+}
diff --git a/src/clib/strncat.c b/src/clib/strncat.c
new file mode 100644
index 00000000..ce8273a4
--- /dev/null
+++ b/src/clib/strncat.c
@@ -0,0 +1,69 @@
+/*-
+ * Copyright (c) 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Chris Torek.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * strncat --
+ *	Append at most n bytes of src to dst.
+ *
+ * PUBLIC: #ifndef HAVE_STRNCAT
+ * PUBLIC: char *strncat __P((char *, const char *, size_t));
+ * PUBLIC: #endif
+ */
+/*
+ * Concatenate src on the end of dst. At most strlen(dst)+n+1 bytes
+ * are written at dst (at most n+1 bytes being appended). Return dst.
+ */
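+/*
+ * Illustrative example: with dst containing "ab" (and room for 5 bytes),
+ * strncat(dst, "cdef", 2) appends 'c', 'd' and a NUL, leaving "abcd" --
+ * exactly the strlen(dst) + n + 1 bound of 5 bytes touched at dst.
+ */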
+char *
+strncat(char *dst, const char *src, size_t n)
+{
+ if (n != 0) {
+ char *d = dst;
+ const char *s = src;
+
+ while (*d != 0)
+ d++;
+ do {
+ if ((*d = *s++) == 0)
+ break;
+ d++;
+ } while (--n != 0);
+ *d = 0;
+ }
+ return (dst);
+}
diff --git a/src/clib/strncmp.c b/src/clib/strncmp.c
new file mode 100644
index 00000000..9738b5b2
--- /dev/null
+++ b/src/clib/strncmp.c
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * strncmp --
+ *	Compare at most n bytes of two strings.
+ *
+ * PUBLIC: #ifndef HAVE_STRNCMP
+ * PUBLIC: int strncmp __P((const char *, const char *, size_t));
+ * PUBLIC: #endif
+ */
+int
+strncmp(s1, s2, n)
+ const char *s1, *s2;
+ size_t n;
+{
+ if (n == 0)
+ return (0);
+ do {
+ if (*s1 != *s2++)
+ return (*(const unsigned char *)s1 -
+ *(const unsigned char *)(s2 - 1));
+ if (*s1++ == 0)
+ break;
+ } while (--n != 0);
+ return (0);
+}
diff --git a/src/clib/strrchr.c b/src/clib/strrchr.c
new file mode 100644
index 00000000..8753e943
--- /dev/null
+++ b/src/clib/strrchr.c
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 1988, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * strrchr --
+ *	Return a pointer to the last occurrence of ch in p, or NULL.
+ *
+ * PUBLIC: #ifndef HAVE_STRRCHR
+ * PUBLIC: char *strrchr __P((const char *, int));
+ * PUBLIC: #endif
+ */
+char *
+strrchr(const char *p, int ch)
+{
+ char *save;
+ char c;
+
+ c = ch;
+ for (save = NULL;; ++p) {
+ if (*p == c)
+ save = (char *)p;
+ if (*p == '\0')
+ return (save);
+ }
+ /* NOTREACHED */
+}
diff --git a/src/clib/strsep.c b/src/clib/strsep.c
new file mode 100644
index 00000000..f79d0f5c
--- /dev/null
+++ b/src/clib/strsep.c
@@ -0,0 +1,80 @@
+/*-
+ * Copyright (c) 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * Get next token from string *stringp, where tokens are possibly-empty
+ * strings separated by characters from delim.
+ *
+ * Writes NULs into the string at *stringp to end tokens.
+ * delim need not remain constant from call to call.
+ * On return, *stringp points past the last NUL written (if there might
+ * be further tokens), or is NULL (if there are definitely no more tokens).
+ *
+ * If *stringp is NULL, strsep returns NULL.
+ *
+ * PUBLIC: #ifndef HAVE_STRSEP
+ * PUBLIC: char *strsep __P((char **, const char *));
+ * PUBLIC: #endif
+ */
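+/*
+ * Illustrative example: given char buf[] = "a,,b" and char *p = buf,
+ * successive strsep(&p, ",") calls return "a", "" and "b", then NULL,
+ * overwriting each comma with a NUL as they go.
+ */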
+char *
+strsep(stringp, delim)
+ char **stringp;
+ const char *delim;
+{
+ char *s;
+ const char *spanp;
+ int c, sc;
+ char *tok;
+
+ if ((s = *stringp) == NULL)
+ return (NULL);
+ for (tok = s;;) {
+ c = *s++;
+ spanp = delim;
+ do {
+ if ((sc = *spanp++) == c) {
+ if (c == 0)
+ s = NULL;
+ else
+ s[-1] = 0;
+ *stringp = s;
+ return (tok);
+ }
+ } while (sc != 0);
+ }
+ /* NOTREACHED */
+}
diff --git a/src/clib/strtol.c b/src/clib/strtol.c
new file mode 100644
index 00000000..eb76b8f4
--- /dev/null
+++ b/src/clib/strtol.c
@@ -0,0 +1,142 @@
+/*-
+ * Copyright (c) 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * Convert a string to a long integer.
+ *
+ * Assumes that the upper and lower case
+ * alphabets and digits are each contiguous.
+ *
+ * PUBLIC: #ifndef HAVE_STRTOL
+ * PUBLIC: long strtol __P((const char *, char **, int));
+ * PUBLIC: #endif
+ */
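+/*
+ * Illustrative example: strtol("0x1a", &end, 0) recognizes the leading
+ * "0x", converts in base 16 and returns 26, leaving end pointing at the
+ * terminating NUL.
+ */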
+long
+strtol(nptr, endptr, base)
+	const char *nptr;
+	char **endptr;
+ int base;
+{
+ const char *s;
+ unsigned long acc;
+ char c;
+ unsigned long cutoff;
+ int neg, any, cutlim;
+
+ /*
+ * Skip white space and pick up leading +/- sign if any.
+ * If base is 0, allow 0x for hex and 0 for octal, else
+ * assume decimal; if base is already 16, allow 0x.
+ */
+ s = nptr;
+ do {
+ c = *s++;
+ } while (isspace((unsigned char)c));
+ if (c == '-') {
+ neg = 1;
+ c = *s++;
+ } else {
+ neg = 0;
+ if (c == '+')
+ c = *s++;
+ }
+ if ((base == 0 || base == 16) &&
+ c == '0' && (*s == 'x' || *s == 'X')) {
+ c = s[1];
+ s += 2;
+ base = 16;
+ }
+ if (base == 0)
+ base = c == '0' ? 8 : 10;
+ acc = any = 0;
+ if (base < 2 || base > 36)
+ goto noconv;
+
+ /*
+ * Compute the cutoff value between legal numbers and illegal
+ * numbers. That is the largest legal value, divided by the
+ * base. An input number that is greater than this value, if
+ * followed by a legal input character, is too big. One that
+ * is equal to this value may be valid or not; the limit
+ * between valid and invalid numbers is then based on the last
+ * digit. For instance, if the range for longs is
+ * [-2147483648..2147483647] and the input base is 10,
+ * cutoff will be set to 214748364 and cutlim to either
+ * 7 (neg==0) or 8 (neg==1), meaning that if we have accumulated
+ * a value > 214748364, or equal but the next digit is > 7 (or 8),
+ * the number is too big, and we will return a range error.
+ *
+ * Set 'any' if any `digits' consumed; make it negative to indicate
+ * overflow.
+ */
+ cutoff = neg ? (unsigned long)-(LONG_MIN + LONG_MAX) + LONG_MAX
+ : LONG_MAX;
+ cutlim = cutoff % base;
+ cutoff /= base;
+ for ( ; ; c = *s++) {
+ if (c >= '0' && c <= '9')
+ c -= '0';
+ else if (c >= 'A' && c <= 'Z')
+ c -= 'A' - 10;
+ else if (c >= 'a' && c <= 'z')
+ c -= 'a' - 10;
+ else
+ break;
+ if (c >= base)
+ break;
+ if (any < 0 || acc > cutoff || (acc == cutoff && c > cutlim))
+ any = -1;
+ else {
+ any = 1;
+ acc *= base;
+ acc += c;
+ }
+ }
+ if (any < 0) {
+ acc = neg ? LONG_MIN : LONG_MAX;
+ errno = ERANGE;
+ } else if (!any) {
+noconv:
+ errno = EINVAL;
+ } else if (neg)
+ acc = -(long)acc;
+ if (endptr != NULL)
+ *endptr = (char *)(any ? s - 1 : nptr);
+ return (acc);
+}
diff --git a/src/clib/strtoul.c b/src/clib/strtoul.c
new file mode 100644
index 00000000..d0495a33
--- /dev/null
+++ b/src/clib/strtoul.c
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * Convert a string to an unsigned long integer.
+ *
+ * Assumes that the upper and lower case
+ * alphabets and digits are each contiguous.
+ *
+ * PUBLIC: #ifndef HAVE_STRTOUL
+ * PUBLIC: unsigned long strtoul __P((const char *, char **, int));
+ * PUBLIC: #endif
+ */
+unsigned long
+strtoul(nptr, endptr, base)
+	const char *nptr;
+	char **endptr;
+ int base;
+{
+ const char *s;
+ unsigned long acc;
+ char c;
+ unsigned long cutoff;
+ int neg, any, cutlim;
+
+ /*
+ * See strtol for comments as to the logic used.
+ */
+ s = nptr;
+ do {
+ c = *s++;
+ } while (isspace((unsigned char)c));
+ if (c == '-') {
+ neg = 1;
+ c = *s++;
+ } else {
+ neg = 0;
+ if (c == '+')
+ c = *s++;
+ }
+ if ((base == 0 || base == 16) &&
+ c == '0' && (*s == 'x' || *s == 'X')) {
+ c = s[1];
+ s += 2;
+ base = 16;
+ }
+ if (base == 0)
+ base = c == '0' ? 8 : 10;
+ acc = any = 0;
+ if (base < 2 || base > 36)
+ goto noconv;
+
+ cutoff = ULONG_MAX / base;
+ cutlim = ULONG_MAX % base;
+ for ( ; ; c = *s++) {
+ if (c >= '0' && c <= '9')
+ c -= '0';
+ else if (c >= 'A' && c <= 'Z')
+ c -= 'A' - 10;
+ else if (c >= 'a' && c <= 'z')
+ c -= 'a' - 10;
+ else
+ break;
+ if (c >= base)
+ break;
+ if (any < 0 || acc > cutoff || (acc == cutoff && c > cutlim))
+ any = -1;
+ else {
+ any = 1;
+ acc *= base;
+ acc += c;
+ }
+ }
+ if (any < 0) {
+ acc = ULONG_MAX;
+ errno = ERANGE;
+ } else if (!any) {
+noconv:
+ errno = EINVAL;
+ } else if (neg)
+ acc = -acc;
+ if (endptr != NULL)
+ *endptr = (char *)(any ? s - 1 : nptr);
+ return (acc);
+}
diff --git a/src/clib/time.c b/src/clib/time.c
new file mode 100644
index 00000000..abc2ab2d
--- /dev/null
+++ b/src/clib/time.c
@@ -0,0 +1,34 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2006, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * time --
+ *	Return the current time in seconds, as time(3) does.
+ *
+ * PUBLIC: #ifndef HAVE_TIME
+ * PUBLIC: time_t time __P((time_t *));
+ * PUBLIC: #endif
+ */
+time_t
+time(nowp)
+ time_t *nowp;
+{
+ db_timespec t;
+ time_t res;
+
+ __os_gettime(NULL, &t, 0);
+
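+	/*
+	 * tv_nsec is always less than NS_PER_SEC, so the division below
+	 * truncates to zero: only whole seconds are returned.
+	 */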
+ res = t.tv_sec + t.tv_nsec / NS_PER_SEC;
+
+ if (nowp != NULL)
+ *nowp = res;
+ return (res);
+}
diff --git a/src/common/clock.c b/src/common/clock.c
new file mode 100644
index 00000000..e1f917af
--- /dev/null
+++ b/src/common/clock.c
@@ -0,0 +1,57 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __clock_set_expires --
+ * Set the expire time given the time to live.
+ *
+ * PUBLIC: void __clock_set_expires __P((ENV *, db_timespec *, db_timeout_t));
+ */
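+/*
+ * Illustrative example: a timeout of 1500000 (microseconds) adds
+ * { 1 sec, 500000000 nsec } to "now" in *timespecp.
+ */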
+void
+__clock_set_expires(env, timespecp, timeout)
+ ENV *env;
+ db_timespec *timespecp;
+ db_timeout_t timeout;
+{
+ db_timespec v;
+
+ /*
+ * If timespecp is set then it contains "now". This avoids repeated
+ * system calls to get the time.
+ */
+ if (!timespecisset(timespecp))
+ __os_gettime(env, timespecp, 1);
+
+ /* Convert the microsecond timeout argument to a timespec. */
+ DB_TIMEOUT_TO_TIMESPEC(timeout, &v);
+
+ /* Add the timeout to "now". */
+ timespecadd(timespecp, &v);
+}
+
+/*
+ * __clock_expired --
+ *	Determine if a timeout has expired.
+ *
+ * PUBLIC: int __clock_expired __P((ENV *, db_timespec *, db_timespec *));
+ */
+int
+__clock_expired(env, now, timespecp)
+ ENV *env;
+ db_timespec *now, *timespecp;
+{
+ if (!timespecisset(timespecp))
+ return (0);
+
+ if (!timespecisset(now))
+ __os_gettime(env, now, 1);
+
+ return (timespeccmp(now, timespecp, >=));
+}
diff --git a/src/common/crypto_stub.c b/src/common/crypto_stub.c
new file mode 100644
index 00000000..95faebdb
--- /dev/null
+++ b/src/common/crypto_stub.c
@@ -0,0 +1,44 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __crypto_region_init --
+ * Initialize crypto.
+ *
+ * !!!
+ * We don't put this stub file in the crypto/ directory of the distribution
+ * because that entire directory is removed for non-crypto distributions.
+ *
+ * PUBLIC: int __crypto_region_init __P((ENV *));
+ */
+int
+__crypto_region_init(env)
+ ENV *env;
+{
+ REGENV *renv;
+ REGINFO *infop;
+ int ret;
+
+ infop = env->reginfo;
+ renv = infop->primary;
+ MUTEX_LOCK(env, renv->mtx_regenv);
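+	/*
+	 * A cipher region offset other than INVALID_ROFF means the
+	 * environment was created with encryption enabled.
+	 */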
+ ret = !(renv->cipher_off == INVALID_ROFF);
+ MUTEX_UNLOCK(env, renv->mtx_regenv);
+
+ if (ret == 0)
+ return (0);
+
+ __db_errx(env, DB_STR("0040",
+"Encrypted environment: library build did not include cryptography support"));
+ return (DB_OPNOTSUP);
+}
diff --git a/src/common/db_byteorder.c b/src/common/db_byteorder.c
new file mode 100644
index 00000000..71428f0a
--- /dev/null
+++ b/src/common/db_byteorder.c
@@ -0,0 +1,63 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __db_isbigendian --
+ * Return 1 if big-endian (Motorola and Sparc), not little-endian
+ * (Intel and Vax). We do this work at run-time, rather than at
+ * configuration time so cross-compilation and general embedded
+ * system support is simpler.
+ *
+ * PUBLIC: int __db_isbigendian __P((void));
+ */
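+/*
+ * Illustration: with 4-byte longs, u.l = 1 is laid out in memory as
+ * 01 00 00 00 on a little-endian machine and 00 00 00 01 on a
+ * big-endian one, so the last byte is 1 only when big-endian.
+ */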
+int
+__db_isbigendian()
+{
+ union { /* From Harbison & Steele. */
+ long l;
+ char c[sizeof(long)];
+ } u;
+
+ u.l = 1;
+ return (u.c[sizeof(long) - 1] == 1);
+}
+
+/*
+ * __db_byteorder --
+ * Return if we need to do byte swapping, checking for illegal
+ * values.
+ *
+ * PUBLIC: int __db_byteorder __P((ENV *, int));
+ */
+int
+__db_byteorder(env, lorder)
+ ENV *env;
+ int lorder;
+{
+ switch (lorder) {
+ case 0:
+ break;
+ case 1234:
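+		/* Caller requested little-endian ("1234") order. */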
+ if (!F_ISSET(env, ENV_LITTLEENDIAN))
+ return (DB_SWAPBYTES);
+ break;
+ case 4321:
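+		/* Caller requested big-endian ("4321") order. */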
+ if (F_ISSET(env, ENV_LITTLEENDIAN))
+ return (DB_SWAPBYTES);
+ break;
+ default:
+ __db_errx(env, DB_STR("0041",
+ "unsupported byte order, only big and little-endian supported"));
+ return (EINVAL);
+ }
+ return (0);
+}
diff --git a/src/common/db_compint.c b/src/common/db_compint.c
new file mode 100644
index 00000000..9f5ccf9a
--- /dev/null
+++ b/src/common/db_compint.c
@@ -0,0 +1,555 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+#ifdef HAVE_COMPRESSION
+
+/*
+ * Integer compression
+ *
+ * First byte  | Next | Maximum
+ * byte        | bytes| value
+ * ------------+------+---------------------------------------------------------
+ * [0 xxxxxxx] | 0    | 2^7 - 1
+ * [10 xxxxxx] | 1    | 2^14 + 2^7 - 1
+ * [110 xxxxx] | 2    | 2^21 + 2^14 + 2^7 - 1
+ * [1110 xxxx] | 3    | 2^28 + 2^21 + 2^14 + 2^7 - 1
+ * [11110 xxx] | 4    | 2^35 + 2^28 + 2^21 + 2^14 + 2^7 - 1
+ * [11111 000] | 5    | 2^40 + 2^35 + 2^28 + 2^21 + 2^14 + 2^7 - 1
+ * [11111 001] | 6    | 2^48 + 2^40 + 2^35 + 2^28 + 2^21 + 2^14 + 2^7 - 1
+ * [11111 010] | 7    | 2^56 + 2^48 + 2^40 + 2^35 + 2^28 + 2^21 + 2^14 + 2^7 - 1
+ * [11111 011] | 8    | 2^64 + 2^56 + 2^48 + 2^40 + 2^35 + 2^28 + 2^21 + 2^14 +
+ *             |      | 2^7 - 1
+ *
+ * NOTE: this compression algorithm depends on big-endian order, so swap
+ * if necessary.
+ *
+ */
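+/*
+ * Worked example (illustrative): 200 exceeds the one-byte maximum of
+ * 2^7 - 1 = 127, so it takes the two-byte form.  200 - 128 = 72 =
+ * 0x0048; the high six bits land in the first byte under the "10" tag
+ * and the low eight in the second, giving { 0x80, 0x48 }.  Decoding
+ * reverses this: ((0x80 & 0x3F) << 8 | 0x48) + 128 = 72 + 128 = 200.
+ */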
+
+#define CMP_INT_1BYTE_MAX 0x7F
+#define CMP_INT_2BYTE_MAX 0x407F
+#define CMP_INT_3BYTE_MAX 0x20407F
+#define CMP_INT_4BYTE_MAX 0x1020407F
+
+#if defined(_MSC_VER) && _MSC_VER < 1300
+#define CMP_INT_5BYTE_MAX 0x081020407Fi64
+#define CMP_INT_6BYTE_MAX 0x01081020407Fi64
+#define CMP_INT_7BYTE_MAX 0x0101081020407Fi64
+#define CMP_INT_8BYTE_MAX 0x010101081020407Fi64
+#else
+#define CMP_INT_5BYTE_MAX 0x081020407FLL
+#define CMP_INT_6BYTE_MAX 0x01081020407FLL
+#define CMP_INT_7BYTE_MAX 0x0101081020407FLL
+#define CMP_INT_8BYTE_MAX 0x010101081020407FLL
+#endif
+
+#define CMP_INT_2BYTE_VAL 0x80
+#define CMP_INT_3BYTE_VAL 0xC0
+#define CMP_INT_4BYTE_VAL 0xE0
+#define CMP_INT_5BYTE_VAL 0xF0
+#define CMP_INT_6BYTE_VAL 0xF8
+#define CMP_INT_7BYTE_VAL 0xF9
+#define CMP_INT_8BYTE_VAL 0xFA
+#define CMP_INT_9BYTE_VAL 0xFB
+/* CMP_INT_SPARE_VAL is defined in db_int.h */
+
+#define CMP_INT_2BYTE_MASK 0x3F
+#define CMP_INT_3BYTE_MASK 0x1F
+#define CMP_INT_4BYTE_MASK 0x0F
+#define CMP_INT_5BYTE_MASK 0x07
+
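+/*
+ * Decoded length of a compressed integer, indexed by its first byte:
+ * 0x00-0x7F is 1 byte, 0x80-0xBF is 2, 0xC0-0xDF is 3, 0xE0-0xEF is 4,
+ * 0xF0-0xF7 is 5, and 0xF8-0xFB are 6 through 9; 0xFC-0xFF are unused
+ * and map to 0xFF.
+ */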
+static const u_int8_t __db_marshaled_int_size[] = {
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+
+ 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
+ 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
+ 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
+ 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
+
+ 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,
+ 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,
+
+ 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
+ 0x06, 0x07, 0x08, 0x09, 0xFF, 0xFF, 0xFF, 0xFF
+};
+
+/*
+ * __db_compress_count_int --
+ * Return the number of bytes that the compressed version
+ * of the argument will occupy.
+ *
+ * PUBLIC: u_int32_t __db_compress_count_int __P((u_int64_t));
+ */
+u_int32_t
+__db_compress_count_int(i)
+ u_int64_t i;
+{
+ if (i <= CMP_INT_1BYTE_MAX)
+ return 1;
+ else if (i <= CMP_INT_2BYTE_MAX)
+ return 2;
+ else if (i <= CMP_INT_3BYTE_MAX)
+ return 3;
+ else if (i <= CMP_INT_4BYTE_MAX)
+ return 4;
+ else if (i <= CMP_INT_5BYTE_MAX)
+ return 5;
+ else if (i <= CMP_INT_6BYTE_MAX)
+ return 6;
+ else if (i <= CMP_INT_7BYTE_MAX)
+ return 7;
+ else if (i <= CMP_INT_8BYTE_MAX)
+ return 8;
+ else
+ return 9;
+}
+
+/*
+ * __db_compress_int --
+ * Compresses the integer into the buffer, returning the number of
+ * bytes occupied.
+ *
+ * PUBLIC: int __db_compress_int __P((u_int8_t *, u_int64_t));
+ */
+int
+__db_compress_int(buf, i)
+ u_int8_t *buf;
+ u_int64_t i;
+{
+ if (i <= CMP_INT_1BYTE_MAX) {
+ /* no swapping for one byte value */
+ buf[0] = (u_int8_t)i;
+ return 1;
+ } else {
+ u_int8_t *p = (u_int8_t*)&i;
+ if (i <= CMP_INT_2BYTE_MAX) {
+ i -= CMP_INT_1BYTE_MAX + 1;
+ if (__db_isbigendian() != 0) {
+ buf[0] = p[6] | CMP_INT_2BYTE_VAL;
+ buf[1] = p[7];
+ } else {
+ buf[0] = p[1] | CMP_INT_2BYTE_VAL;
+ buf[1] = p[0];
+ }
+ return 2;
+ } else if (i <= CMP_INT_3BYTE_MAX) {
+ i -= CMP_INT_2BYTE_MAX + 1;
+ if (__db_isbigendian() != 0) {
+ buf[0] = p[5] | CMP_INT_3BYTE_VAL;
+ buf[1] = p[6];
+ buf[2] = p[7];
+ } else {
+ buf[0] = p[2] | CMP_INT_3BYTE_VAL;
+ buf[1] = p[1];
+ buf[2] = p[0];
+ }
+ return 3;
+ } else if (i <= CMP_INT_4BYTE_MAX) {
+ i -= CMP_INT_3BYTE_MAX + 1;
+ if (__db_isbigendian() != 0) {
+ buf[0] = p[4] | CMP_INT_4BYTE_VAL;
+ buf[1] = p[5];
+ buf[2] = p[6];
+ buf[3] = p[7];
+ } else {
+ buf[0] = p[3] | CMP_INT_4BYTE_VAL;
+ buf[1] = p[2];
+ buf[2] = p[1];
+ buf[3] = p[0];
+ }
+ return 4;
+ } else if (i <= CMP_INT_5BYTE_MAX) {
+ i -= CMP_INT_4BYTE_MAX + 1;
+ if (__db_isbigendian() != 0) {
+ buf[0] = p[3] | CMP_INT_5BYTE_VAL;
+ buf[1] = p[4];
+ buf[2] = p[5];
+ buf[3] = p[6];
+ buf[4] = p[7];
+ } else {
+ buf[0] = p[4] | CMP_INT_5BYTE_VAL;
+ buf[1] = p[3];
+ buf[2] = p[2];
+ buf[3] = p[1];
+ buf[4] = p[0];
+ }
+ return 5;
+ } else if (i <= CMP_INT_6BYTE_MAX) {
+ i -= CMP_INT_5BYTE_MAX + 1;
+ if (__db_isbigendian() != 0) {
+ buf[0] = CMP_INT_6BYTE_VAL;
+ buf[1] = p[3];
+ buf[2] = p[4];
+ buf[3] = p[5];
+ buf[4] = p[6];
+ buf[5] = p[7];
+ } else {
+ buf[0] = CMP_INT_6BYTE_VAL;
+ buf[1] = p[4];
+ buf[2] = p[3];
+ buf[3] = p[2];
+ buf[4] = p[1];
+ buf[5] = p[0];
+ }
+ return 6;
+ } else if (i <= CMP_INT_7BYTE_MAX) {
+ i -= CMP_INT_6BYTE_MAX + 1;
+ if (__db_isbigendian() != 0) {
+ buf[0] = CMP_INT_7BYTE_VAL;
+ buf[1] = p[2];
+ buf[2] = p[3];
+ buf[3] = p[4];
+ buf[4] = p[5];
+ buf[5] = p[6];
+ buf[6] = p[7];
+ } else {
+ buf[0] = CMP_INT_7BYTE_VAL;
+ buf[1] = p[5];
+ buf[2] = p[4];
+ buf[3] = p[3];
+ buf[4] = p[2];
+ buf[5] = p[1];
+ buf[6] = p[0];
+ }
+ return 7;
+ } else if (i <= CMP_INT_8BYTE_MAX) {
+ i -= CMP_INT_7BYTE_MAX + 1;
+ if (__db_isbigendian() != 0) {
+ buf[0] = CMP_INT_8BYTE_VAL;
+ buf[1] = p[1];
+ buf[2] = p[2];
+ buf[3] = p[3];
+ buf[4] = p[4];
+ buf[5] = p[5];
+ buf[6] = p[6];
+ buf[7] = p[7];
+ } else {
+ buf[0] = CMP_INT_8BYTE_VAL;
+ buf[1] = p[6];
+ buf[2] = p[5];
+ buf[3] = p[4];
+ buf[4] = p[3];
+ buf[5] = p[2];
+ buf[6] = p[1];
+ buf[7] = p[0];
+ }
+ return 8;
+ } else {
+ i -= CMP_INT_8BYTE_MAX + 1;
+ if (__db_isbigendian() != 0) {
+ buf[0] = CMP_INT_9BYTE_VAL;
+ buf[1] = p[0];
+ buf[2] = p[1];
+ buf[3] = p[2];
+ buf[4] = p[3];
+ buf[5] = p[4];
+ buf[6] = p[5];
+ buf[7] = p[6];
+ buf[8] = p[7];
+ } else {
+ buf[0] = CMP_INT_9BYTE_VAL;
+ buf[1] = p[7];
+ buf[2] = p[6];
+ buf[3] = p[5];
+ buf[4] = p[4];
+ buf[5] = p[3];
+ buf[6] = p[2];
+ buf[7] = p[1];
+ buf[8] = p[0];
+ }
+ return 9;
+ }
+ }
+}
+
+/*
+ * __db_decompress_count_int --
+ * Return the number of bytes occupied by the compressed
+ * integer pointed to by buf.
+ *
+ * PUBLIC: u_int32_t __db_decompress_count_int __P((const u_int8_t *));
+ */
+u_int32_t
+__db_decompress_count_int(buf)
+ const u_int8_t *buf;
+{
+ return __db_marshaled_int_size[*buf];
+}
+
+/*
+ * __db_decompress_int --
+ *	Decompresses the compressed integer pointed to by buf into i,
+ * returning the number of bytes read.
+ *
+ * PUBLIC: int __db_decompress_int __P((const u_int8_t *, u_int64_t *));
+ */
+int
+__db_decompress_int(buf, i)
+ const u_int8_t *buf;
+ u_int64_t *i;
+{
+ int len;
+ u_int64_t tmp;
+ u_int8_t *p;
+ u_int8_t c;
+
+ tmp = 0;
+ p = (u_int8_t*)&tmp;
+ c = buf[0];
+ len = __db_marshaled_int_size[c];
+
+ switch (len) {
+ case 1:
+ *i = c;
+ return 1;
+ case 2:
+ if (__db_isbigendian() != 0) {
+ p[6] = (c & CMP_INT_2BYTE_MASK);
+ p[7] = buf[1];
+ } else {
+ p[1] = (c & CMP_INT_2BYTE_MASK);
+ p[0] = buf[1];
+ }
+ tmp += CMP_INT_1BYTE_MAX + 1;
+ break;
+ case 3:
+ if (__db_isbigendian() != 0) {
+ p[5] = (c & CMP_INT_3BYTE_MASK);
+ p[6] = buf[1];
+ p[7] = buf[2];
+ } else {
+ p[2] = (c & CMP_INT_3BYTE_MASK);
+ p[1] = buf[1];
+ p[0] = buf[2];
+ }
+ tmp += CMP_INT_2BYTE_MAX + 1;
+ break;
+ case 4:
+ if (__db_isbigendian() != 0) {
+ p[4] = (c & CMP_INT_4BYTE_MASK);
+ p[5] = buf[1];
+ p[6] = buf[2];
+ p[7] = buf[3];
+ } else {
+ p[3] = (c & CMP_INT_4BYTE_MASK);
+ p[2] = buf[1];
+ p[1] = buf[2];
+ p[0] = buf[3];
+ }
+ tmp += CMP_INT_3BYTE_MAX + 1;
+ break;
+ case 5:
+ if (__db_isbigendian() != 0) {
+ p[3] = (c & CMP_INT_5BYTE_MASK);
+ p[4] = buf[1];
+ p[5] = buf[2];
+ p[6] = buf[3];
+ p[7] = buf[4];
+ } else {
+ p[4] = (c & CMP_INT_5BYTE_MASK);
+ p[3] = buf[1];
+ p[2] = buf[2];
+ p[1] = buf[3];
+ p[0] = buf[4];
+ }
+ tmp += CMP_INT_4BYTE_MAX + 1;
+ break;
+ case 6:
+ if (__db_isbigendian() != 0) {
+ p[3] = buf[1];
+ p[4] = buf[2];
+ p[5] = buf[3];
+ p[6] = buf[4];
+ p[7] = buf[5];
+ } else {
+ p[4] = buf[1];
+ p[3] = buf[2];
+ p[2] = buf[3];
+ p[1] = buf[4];
+ p[0] = buf[5];
+ }
+ tmp += CMP_INT_5BYTE_MAX + 1;
+ break;
+ case 7:
+ if (__db_isbigendian() != 0) {
+ p[2] = buf[1];
+ p[3] = buf[2];
+ p[4] = buf[3];
+ p[5] = buf[4];
+ p[6] = buf[5];
+ p[7] = buf[6];
+ } else {
+ p[5] = buf[1];
+ p[4] = buf[2];
+ p[3] = buf[3];
+ p[2] = buf[4];
+ p[1] = buf[5];
+ p[0] = buf[6];
+ }
+ tmp += CMP_INT_6BYTE_MAX + 1;
+ break;
+ case 8:
+ if (__db_isbigendian() != 0) {
+ p[1] = buf[1];
+ p[2] = buf[2];
+ p[3] = buf[3];
+ p[4] = buf[4];
+ p[5] = buf[5];
+ p[6] = buf[6];
+ p[7] = buf[7];
+ } else {
+ p[6] = buf[1];
+ p[5] = buf[2];
+ p[4] = buf[3];
+ p[3] = buf[4];
+ p[2] = buf[5];
+ p[1] = buf[6];
+ p[0] = buf[7];
+ }
+ tmp += CMP_INT_7BYTE_MAX + 1;
+ break;
+ case 9:
+ if (__db_isbigendian() != 0) {
+ p[0] = buf[1];
+ p[1] = buf[2];
+ p[2] = buf[3];
+ p[3] = buf[4];
+ p[4] = buf[5];
+ p[5] = buf[6];
+ p[6] = buf[7];
+ p[7] = buf[8];
+ } else {
+ p[7] = buf[1];
+ p[6] = buf[2];
+ p[5] = buf[3];
+ p[4] = buf[4];
+ p[3] = buf[5];
+ p[2] = buf[6];
+ p[1] = buf[7];
+ p[0] = buf[8];
+ }
+ tmp += CMP_INT_8BYTE_MAX + 1;
+ break;
+ default:
+ break;
+ }
+
+ *i = tmp;
+ return len;
+}
+
+/*
+ * __db_decompress_int32 --
+ *	Decompresses the compressed 32-bit integer pointed to by buf into i,
+ * returning the number of bytes read.
+ *
+ * PUBLIC: int __db_decompress_int32 __P((const u_int8_t *, u_int32_t *));
+ */
+int
+__db_decompress_int32(buf, i)
+ const u_int8_t *buf;
+ u_int32_t *i;
+{
+ int len;
+ u_int32_t tmp;
+ u_int8_t *p;
+ u_int8_t c;
+
+ tmp = 0;
+ p = (u_int8_t*)&tmp;
+ c = buf[0];
+ len = __db_marshaled_int_size[c];
+
+ switch (len) {
+ case 1:
+ *i = c;
+ return 1;
+ case 2:
+ if (__db_isbigendian() != 0) {
+ p[2] = (c & CMP_INT_2BYTE_MASK);
+ p[3] = buf[1];
+ } else {
+ p[1] = (c & CMP_INT_2BYTE_MASK);
+ p[0] = buf[1];
+ }
+ tmp += CMP_INT_1BYTE_MAX + 1;
+ break;
+ case 3:
+ if (__db_isbigendian() != 0) {
+ p[1] = (c & CMP_INT_3BYTE_MASK);
+ p[2] = buf[1];
+ p[3] = buf[2];
+ } else {
+ p[2] = (c & CMP_INT_3BYTE_MASK);
+ p[1] = buf[1];
+ p[0] = buf[2];
+ }
+ tmp += CMP_INT_2BYTE_MAX + 1;
+ break;
+ case 4:
+ if (__db_isbigendian() != 0) {
+ p[0] = (c & CMP_INT_4BYTE_MASK);
+ p[1] = buf[1];
+ p[2] = buf[2];
+ p[3] = buf[3];
+ } else {
+ p[3] = (c & CMP_INT_4BYTE_MASK);
+ p[2] = buf[1];
+ p[1] = buf[2];
+ p[0] = buf[3];
+ }
+ tmp += CMP_INT_3BYTE_MAX + 1;
+ break;
+ case 5:
+ if (__db_isbigendian() != 0) {
+ p[0] = buf[1];
+ p[1] = buf[2];
+ p[2] = buf[3];
+ p[3] = buf[4];
+ } else {
+ p[3] = buf[1];
+ p[2] = buf[2];
+ p[1] = buf[3];
+ p[0] = buf[4];
+ }
+ tmp += CMP_INT_4BYTE_MAX + 1;
+ break;
+ default:
+ break;
+ }
+
+ *i = tmp;
+ return len;
+}
+
+#endif
diff --git a/src/common/db_err.c b/src/common/db_err.c
new file mode 100644
index 00000000..6edc37b6
--- /dev/null
+++ b/src/common/db_err.c
@@ -0,0 +1,1118 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+static void __db_msgcall __P((const DB_ENV *, const char *, va_list));
+static void __db_msgfile __P((const DB_ENV *, const char *, va_list));
+
+/*
+ * __db_fchk --
+ * General flags checking routine.
+ *
+ * PUBLIC: int __db_fchk __P((ENV *, const char *, u_int32_t, u_int32_t));
+ */
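+/*
+ * Illustrative example: __db_fchk(env, "DB->open", flags,
+ * DB_CREATE | DB_EXCL) returns EINVAL via __db_ferr() whenever flags
+ * contains any bit outside those two.
+ */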
+int
+__db_fchk(env, name, flags, ok_flags)
+ ENV *env;
+ const char *name;
+ u_int32_t flags, ok_flags;
+{
+ return (LF_ISSET(~ok_flags) ? __db_ferr(env, name, 0) : 0);
+}
+
+/*
+ * __db_fcchk --
+ * General combination flags checking routine.
+ *
+ * PUBLIC: int __db_fcchk
+ * PUBLIC: __P((ENV *, const char *, u_int32_t, u_int32_t, u_int32_t));
+ */
+int
+__db_fcchk(env, name, flags, flag1, flag2)
+ ENV *env;
+ const char *name;
+ u_int32_t flags, flag1, flag2;
+{
+ return (LF_ISSET(flag1) &&
+ LF_ISSET(flag2) ? __db_ferr(env, name, 1) : 0);
+}
+
+/*
+ * __db_ferr --
+ * Common flag errors.
+ *
+ * PUBLIC: int __db_ferr __P((const ENV *, const char *, int));
+ */
+int
+__db_ferr(env, name, iscombo)
+ const ENV *env;
+ const char *name;
+ int iscombo;
+{
+ if (iscombo)
+ __db_errx(env, DB_STR_A("0054",
+ "illegal flag combination specified to %s", "%s"), name);
+ else
+ __db_errx(env, DB_STR_A("0055",
+ "illegal flag specified to %s", "%s"), name);
+
+ return (EINVAL);
+}
+
+/*
+ * __db_fnl --
+ * Common flag-needs-locking message.
+ *
+ * PUBLIC: int __db_fnl __P((const ENV *, const char *));
+ */
+int
+__db_fnl(env, name)
+ const ENV *env;
+ const char *name;
+{
+ __db_errx(env, DB_STR_A("0056",
+ "%s: DB_READ_COMMITTED, DB_READ_UNCOMMITTED and DB_RMW require locking",
+ "%s"), name);
+ return (EINVAL);
+}
+
+/*
+ * __db_pgerr --
+ * Error when unable to retrieve a specified page.
+ *
+ * PUBLIC: int __db_pgerr __P((DB *, db_pgno_t, int));
+ */
+int
+__db_pgerr(dbp, pgno, errval)
+ DB *dbp;
+ db_pgno_t pgno;
+ int errval;
+{
+ /*
+ * Three things are certain:
+ * Death, taxes, and lost data.
+ * Guess which has occurred.
+ */
+ __db_errx(dbp->env, DB_STR_A("0057",
+ "unable to create/retrieve page %lu", "%lu"), (u_long)pgno);
+ return (__env_panic(dbp->env, errval));
+}
+
+/*
+ * __db_pgfmt --
+ * Error when a page has the wrong format.
+ *
+ * PUBLIC: int __db_pgfmt __P((ENV *, db_pgno_t));
+ */
+int
+__db_pgfmt(env, pgno)
+ ENV *env;
+ db_pgno_t pgno;
+{
+ __db_errx(env, DB_STR_A("0058",
+ "page %lu: illegal page type or format", "%lu"), (u_long)pgno);
+ return (__env_panic(env, EINVAL));
+}
+
+#ifdef DIAGNOSTIC
+/*
+ * __db_assert --
+ * Error when an assertion fails. Only checked if #DIAGNOSTIC defined.
+ *
+ * PUBLIC: #ifdef DIAGNOSTIC
+ * PUBLIC: void __db_assert __P((ENV *, const char *, const char *, int));
+ * PUBLIC: #endif
+ */
+void
+__db_assert(env, e, file, line)
+ ENV *env;
+ const char *e, *file;
+ int line;
+{
+ if (DB_GLOBAL(j_assert) != NULL)
+ DB_GLOBAL(j_assert)(e, file, line);
+ else {
+ __db_errx(env, DB_STR_A("0059",
+ "assert failure: %s/%d: \"%s\"",
+ "%s %d %s"), file, line, e);
+
+ __os_abort(env);
+ /* NOTREACHED */
+ }
+}
+#endif
+
+/*
+ * __env_panic_msg --
+ *	Just report that someone else panicked.
+ *
+ * PUBLIC: int __env_panic_msg __P((ENV *));
+ */
+int
+__env_panic_msg(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+ int ret;
+
+ dbenv = env->dbenv;
+
+ ret = DB_RUNRECOVERY;
+
+ __db_errx(env, DB_STR("0060",
+ "PANIC: fatal region error detected; run recovery"));
+
+ if (dbenv->db_paniccall != NULL) /* Deprecated */
+ dbenv->db_paniccall(dbenv, ret);
+
+	/*
+	 * Must check for DB_EVENT_REG_PANIC first because it is never
+	 * set by itself.  If set, the panic came from DB_REGISTER code
+	 * only; otherwise it could be from many possible places in the
+	 * code.
+	 */
+ if ((env->reginfo != NULL) &&
+ (((REGENV *)env->reginfo->primary)->reg_panic))
+ DB_EVENT(env, DB_EVENT_REG_PANIC, &ret);
+ else
+ DB_EVENT(env, DB_EVENT_PANIC, &ret);
+
+ return (ret);
+}
+
+/*
+ * __env_panic --
+ * Lock out the database environment due to unrecoverable error.
+ *
+ * PUBLIC: int __env_panic __P((ENV *, int));
+ */
+int
+__env_panic(env, errval)
+ ENV *env;
+ int errval;
+{
+	DB_ENV *dbenv;
+
+	if (env != NULL) {
+		dbenv = env->dbenv;
+
+		__env_panic_set(env, 1);
+
+ __db_err(env, errval, DB_STR("0061", "PANIC"));
+
+ if (dbenv->db_paniccall != NULL) /* Deprecated */
+ dbenv->db_paniccall(dbenv, errval);
+
+		/*
+		 * Must check for DB_EVENT_REG_PANIC first because it is
+		 * never set by itself.  If set, the panic came from
+		 * DB_REGISTER code only; otherwise it could be from many
+		 * possible places in the code.
+		 */
+ if ((env->reginfo != NULL) &&
+ (((REGENV *)env->reginfo->primary)->reg_panic))
+ DB_EVENT(env, DB_EVENT_REG_PANIC, &errval);
+ else
+ DB_EVENT(env, DB_EVENT_PANIC, &errval);
+ }
+
+#if defined(DIAGNOSTIC) && !defined(CONFIG_TEST)
+ /*
+ * We want a stack trace of how this could possibly happen.
+ *
+ * Don't drop core if it's the test suite -- it's reasonable for the
+ * test suite to check to make sure that DB_RUNRECOVERY is returned
+ * under certain conditions.
+ */
+ __os_abort(env);
+ /* NOTREACHED */
+#endif
+
+ /*
+ * Chaos reigns within.
+ * Reflect, repent, and reboot.
+ * Order shall return.
+ */
+ return (DB_RUNRECOVERY);
+}
+
+/*
+ * db_strerror --
+ * ANSI C strerror(3) for DB.
+ *
+ * EXTERN: char *db_strerror __P((int));
+ */
+char *
+db_strerror(error)
+ int error;
+{
+ char *p;
+
+ if (error == 0)
+ return (DB_STR("0062", "Successful return: 0"));
+ if (error > 0) {
+ if ((p = strerror(error)) != NULL)
+ return (p);
+ return (__db_unknown_error(error));
+ }
+
+ /*
+ * !!!
+ * The Tcl API requires that some of these return strings be compared
+ * against strings stored in application scripts. So, any of these
+ * errors that do not invariably result in a Tcl exception may not be
+ * altered.
+ */
+ switch (error) {
+ case DB_BUFFER_SMALL:
+ return (DB_STR("0063",
+ "DB_BUFFER_SMALL: User memory too small for return value"));
+ case DB_DONOTINDEX:
+ return (DB_STR("0064",
+ "DB_DONOTINDEX: Secondary index callback returns null"));
+ case DB_FOREIGN_CONFLICT:
+ return (DB_STR("0065",
+ "DB_FOREIGN_CONFLICT: A foreign database constraint has been violated"));
+ case DB_HEAP_FULL:
+ return (DB_STR("0208","DB_HEAP_FULL: no free space in db"));
+ case DB_KEYEMPTY:
+ return (DB_STR("0066",
+ "DB_KEYEMPTY: Non-existent key/data pair"));
+ case DB_KEYEXIST:
+ return (DB_STR("0067",
+ "DB_KEYEXIST: Key/data pair already exists"));
+ case DB_LOCK_DEADLOCK:
+ return (DB_STR("0068",
+ "DB_LOCK_DEADLOCK: Locker killed to resolve a deadlock"));
+ case DB_LOCK_NOTGRANTED:
+ return (DB_STR("0069", "DB_LOCK_NOTGRANTED: Lock not granted"));
+ case DB_LOG_BUFFER_FULL:
+ return (DB_STR("0070",
+ "DB_LOG_BUFFER_FULL: In-memory log buffer is full"));
+ case DB_LOG_VERIFY_BAD:
+ return (DB_STR("0071",
+ "DB_LOG_VERIFY_BAD: Log verification failed"));
+ case DB_NOSERVER:
+ return (DB_STR("0072",
+ "DB_NOSERVER: No message dispatch call-back function has been configured"));
+ case DB_NOTFOUND:
+ return (DB_STR("0073",
+ "DB_NOTFOUND: No matching key/data pair found"));
+ case DB_OLD_VERSION:
+ return (DB_STR("0074",
+ "DB_OLDVERSION: Database requires a version upgrade"));
+ case DB_PAGE_NOTFOUND:
+ return (DB_STR("0075",
+ "DB_PAGE_NOTFOUND: Requested page not found"));
+ case DB_REP_DUPMASTER:
+ return (DB_STR("0076",
+ "DB_REP_DUPMASTER: A second master site appeared"));
+ case DB_REP_HANDLE_DEAD:
+ return (DB_STR("0077",
+ "DB_REP_HANDLE_DEAD: Handle is no longer valid"));
+ case DB_REP_HOLDELECTION:
+ return (DB_STR("0078",
+ "DB_REP_HOLDELECTION: Need to hold an election"));
+ case DB_REP_IGNORE:
+ return (DB_STR("0079",
+ "DB_REP_IGNORE: Replication record/operation ignored"));
+ case DB_REP_ISPERM:
+ return (DB_STR("0080",
+ "DB_REP_ISPERM: Permanent record written"));
+ case DB_REP_JOIN_FAILURE:
+ return (DB_STR("0081",
+ "DB_REP_JOIN_FAILURE: Unable to join replication group"));
+ case DB_REP_LEASE_EXPIRED:
+ return (DB_STR("0082",
+ "DB_REP_LEASE_EXPIRED: Replication leases have expired"));
+ case DB_REP_LOCKOUT:
+ return (DB_STR("0083",
+ "DB_REP_LOCKOUT: Waiting for replication recovery to complete"));
+ case DB_REP_NEWSITE:
+ return (DB_STR("0084",
+ "DB_REP_NEWSITE: A new site has entered the system"));
+ case DB_REP_NOTPERM:
+ return (DB_STR("0085",
+ "DB_REP_NOTPERM: Permanent log record not written"));
+ case DB_REP_UNAVAIL:
+ return (DB_STR("0086",
+ "DB_REP_UNAVAIL: Too few remote sites to complete operation"));
+ case DB_REP_WOULDROLLBACK: /* Undocumented; C API only. */
+ return (DB_STR("0207",
+ "DB_REP_WOULDROLLBACK: Client data has diverged"));
+ case DB_RUNRECOVERY:
+ return (DB_STR("0087",
+ "DB_RUNRECOVERY: Fatal error, run database recovery"));
+ case DB_SECONDARY_BAD:
+ return (DB_STR("0088",
+ "DB_SECONDARY_BAD: Secondary index inconsistent with primary"));
+ case DB_TIMEOUT:
+ return (DB_STR("0089", "DB_TIMEOUT: Operation timed out"));
+ case DB_VERIFY_BAD:
+ return (DB_STR("0090",
+ "DB_VERIFY_BAD: Database verification failed"));
+ case DB_VERSION_MISMATCH:
+ return (DB_STR("0091",
+ "DB_VERSION_MISMATCH: Database environment version mismatch"));
+ default:
+ break;
+ }
+
+ return (__db_unknown_error(error));
+}
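+
+/*
+ * Illustrative sketch (not part of the library source): any Berkeley DB
+ * return code, system errno value or DB-private code alike, can be
+ * turned into text with db_strerror().  The handle and DBT names below
+ * are assumptions:
+ *
+ *	int ret;
+ *
+ *	if ((ret = dbp->put(dbp, NULL, &key, &data, 0)) != 0)
+ *		fprintf(stderr, "put failed: %s\n", db_strerror(ret));
+ */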
+
+/*
+ * __db_unknown_error --
+ * Format an unknown error value into a static buffer.
+ *
+ * PUBLIC: char *__db_unknown_error __P((int));
+ */
+char *
+__db_unknown_error(error)
+ int error;
+{
+ /*
+ * !!!
+ * Room for a 64-bit number + slop. This buffer is only used
+ * if we're given an unknown error number, which should never
+ * happen.
+ *
+ * We're no longer thread-safe if it does happen, but the worst
+ * result is a corrupted error string because there will always
+ * be a trailing nul byte since the error buffer is nul filled
+ * and longer than any error message.
+ */
+ (void)snprintf(DB_GLOBAL(error_buf),
+ sizeof(DB_GLOBAL(error_buf)), DB_STR_A("0092",
+ "Unknown error: %d", "%d"), error);
+ return (DB_GLOBAL(error_buf));
+}
+
+/*
+ * __db_syserr --
+ * Standard error routine.
+ *
+ * PUBLIC: void __db_syserr __P((const ENV *, int, const char *, ...))
+ * PUBLIC: __attribute__ ((__format__ (__printf__, 3, 4)));
+ */
+void
+#ifdef STDC_HEADERS
+__db_syserr(const ENV *env, int error, const char *fmt, ...)
+#else
+__db_syserr(env, error, fmt, va_alist)
+ const ENV *env;
+ int error;
+ const char *fmt;
+ va_dcl
+#endif
+{
+ DB_ENV *dbenv;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+
+ /*
+ * The same as DB->err, except we don't default to writing to stderr
+ * after any output channel has been configured, and we use a system-
+ * specific function to translate errors to strings.
+ */
+ DB_REAL_ERR(dbenv, error, DB_ERROR_SYSTEM, 0, fmt);
+}
+
+/*
+ * __db_err --
+ * Standard error routine.
+ *
+ * PUBLIC: void __db_err __P((const ENV *, int, const char *, ...))
+ * PUBLIC: __attribute__ ((__format__ (__printf__, 3, 4)));
+ */
+void
+#ifdef STDC_HEADERS
+__db_err(const ENV *env, int error, const char *fmt, ...)
+#else
+__db_err(env, error, fmt, va_alist)
+ const ENV *env;
+ int error;
+ const char *fmt;
+ va_dcl
+#endif
+{
+ DB_ENV *dbenv;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+
+ /*
+ * The same as DB->err, except we don't default to writing to stderr
+ * once an output channel has been configured.
+ */
+ DB_REAL_ERR(dbenv, error, DB_ERROR_SET, 0, fmt);
+}
+
+/*
+ * __db_errx --
+ * Standard error routine.
+ *
+ * PUBLIC: void __db_errx __P((const ENV *, const char *, ...))
+ * PUBLIC: __attribute__ ((__format__ (__printf__, 2, 3)));
+ */
+void
+#ifdef STDC_HEADERS
+__db_errx(const ENV *env, const char *fmt, ...)
+#else
+__db_errx(env, fmt, va_alist)
+ const ENV *env;
+ const char *fmt;
+ va_dcl
+#endif
+{
+ DB_ENV *dbenv;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+
+ /*
+ * The same as DB->errx, except we don't default to writing to stderr
+ * once an output channel has been configured.
+ */
+ DB_REAL_ERR(dbenv, 0, DB_ERROR_NOT_SET, 0, fmt);
+}
+
+/*
+ * __db_errcall --
+ * Do the error message work for callback functions.
+ *
+ * PUBLIC: void __db_errcall
+ * PUBLIC: __P((const DB_ENV *, int, db_error_set_t, const char *, va_list));
+ */
+void
+__db_errcall(dbenv, error, error_set, fmt, ap)
+ const DB_ENV *dbenv;
+ int error;
+ db_error_set_t error_set;
+ const char *fmt;
+ va_list ap;
+{
+ char *p;
+ char buf[2048]; /* !!!: END OF THE STACK DON'T TRUST SPRINTF. */
+ char sysbuf[1024]; /* !!!: END OF THE STACK DON'T TRUST SPRINTF. */
+
+ p = buf;
+ if (fmt != NULL)
+ p += vsnprintf(buf, sizeof(buf), fmt, ap);
+ if (error_set != DB_ERROR_NOT_SET)
+ p += snprintf(p,
+ sizeof(buf) - (size_t)(p - buf), ": %s",
+ error_set == DB_ERROR_SET ? db_strerror(error) :
+ __os_strerror(error, sysbuf, sizeof(sysbuf)));
+
+ dbenv->db_errcall(dbenv, dbenv->db_errpfx, buf);
+}
+
+/*
+ * __db_errfile --
+ * Do the error message work for FILE *s.
+ *
+ * PUBLIC: void __db_errfile
+ * PUBLIC: __P((const DB_ENV *, int, db_error_set_t, const char *, va_list));
+ */
+void
+__db_errfile(dbenv, error, error_set, fmt, ap)
+ const DB_ENV *dbenv;
+ int error;
+ db_error_set_t error_set;
+ const char *fmt;
+ va_list ap;
+{
+ FILE *fp;
+ int need_sep;
+ char sysbuf[1024]; /* !!!: END OF THE STACK DON'T TRUST SPRINTF. */
+
+ fp = dbenv == NULL ||
+ dbenv->db_errfile == NULL ? stderr : dbenv->db_errfile;
+ need_sep = 0;
+
+ if (dbenv != NULL && dbenv->db_errpfx != NULL) {
+ (void)fprintf(fp, "%s", dbenv->db_errpfx);
+ need_sep = 1;
+ }
+ if (fmt != NULL && fmt[0] != '\0') {
+ if (need_sep)
+ (void)fprintf(fp, ": ");
+ need_sep = 1;
+ (void)vfprintf(fp, fmt, ap);
+ }
+ if (error_set != DB_ERROR_NOT_SET)
+ (void)fprintf(fp, "%s%s",
+ need_sep ? ": " : "",
+ error_set == DB_ERROR_SET ? db_strerror(error) :
+ __os_strerror(error, sysbuf, sizeof(sysbuf)));
+ (void)fprintf(fp, "\n");
+ (void)fflush(fp);
+}
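+
+/*
+ * Illustrative sketch (not part of the library source): the prefix and
+ * separator handling above is what an application sees after configuring
+ * an error channel.  Assuming a DB_ENV handle named "dbenv":
+ *
+ *	dbenv->set_errpfx(dbenv, "myapp");
+ *	dbenv->set_errfile(dbenv, stderr);
+ *
+ * a failed operation then prints a line of the form:
+ *
+ *	myapp: open: No such file or directory
+ */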
+
+/*
+ * __db_msgadd --
+ * Aggregate a set of strings into a buffer for the callback API.
+ *
+ * PUBLIC: void __db_msgadd __P((ENV *, DB_MSGBUF *, const char *, ...))
+ * PUBLIC: __attribute__ ((__format__ (__printf__, 3, 4)));
+ */
+void
+#ifdef STDC_HEADERS
+__db_msgadd(ENV *env, DB_MSGBUF *mbp, const char *fmt, ...)
+#else
+__db_msgadd(env, mbp, fmt, va_alist)
+ ENV *env;
+ DB_MSGBUF *mbp;
+ const char *fmt;
+ va_dcl
+#endif
+{
+ va_list ap;
+
+#ifdef STDC_HEADERS
+ va_start(ap, fmt);
+#else
+ va_start(ap);
+#endif
+ __db_msgadd_ap(env, mbp, fmt, ap);
+ va_end(ap);
+}
+
+/*
+ * __db_msgadd_ap --
+ * Aggregate a set of strings into a buffer for the callback API.
+ *
+ * PUBLIC: void __db_msgadd_ap
+ * PUBLIC: __P((ENV *, DB_MSGBUF *, const char *, va_list));
+ */
+void
+__db_msgadd_ap(env, mbp, fmt, ap)
+ ENV *env;
+ DB_MSGBUF *mbp;
+ const char *fmt;
+ va_list ap;
+{
+ size_t len, olen;
+ char buf[2048]; /* !!!: END OF THE STACK DON'T TRUST SPRINTF. */
+
+ len = (size_t)vsnprintf(buf, sizeof(buf), fmt, ap);
+
+ /*
+ * There's a heap buffer in the ENV handle we use to aggregate the
+ * message chunks. We maintain a pointer to the buffer, the next slot
+	 * to be filled in the buffer, and a total buffer length.
+ */
+ olen = (size_t)(mbp->cur - mbp->buf);
+ if (olen + len >= mbp->len) {
+ if (__os_realloc(env, mbp->len + len + 256, &mbp->buf))
+ return;
+ mbp->len += (len + 256);
+ mbp->cur = mbp->buf + olen;
+ }
+
+ memcpy(mbp->cur, buf, len + 1);
+ mbp->cur += len;
+}
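+
+/*
+ * Illustrative sketch (not part of the library source): the same
+ * grow-on-demand append pattern without the ENV internals.  Assumes
+ * <stdlib.h>, <string.h> and <errno.h>; all names are hypothetical.
+ *
+ *	struct msgbuf { char *buf, *cur; size_t len; };
+ *
+ *	static int
+ *	msgbuf_add(struct msgbuf *m, const char *s)
+ *	{
+ *		size_t slen, olen;
+ *		char *p;
+ *
+ *		slen = strlen(s);
+ *		olen = (size_t)(m->cur - m->buf);
+ *		if (olen + slen >= m->len) {	-- grow, with slack
+ *			if ((p =
+ *			    realloc(m->buf, m->len + slen + 256)) == NULL)
+ *				return (ENOMEM);
+ *			m->buf = p;
+ *			m->len += slen + 256;
+ *			m->cur = m->buf + olen;
+ *		}
+ *		memcpy(m->cur, s, slen + 1);	-- include the trailing nul
+ *		m->cur += slen;
+ *		return (0);
+ *	}
+ */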
+
+/*
+ * __db_msg --
+ * Standard DB stat message routine.
+ *
+ * PUBLIC: void __db_msg __P((const ENV *, const char *, ...))
+ * PUBLIC: __attribute__ ((__format__ (__printf__, 2, 3)));
+ */
+void
+#ifdef STDC_HEADERS
+__db_msg(const ENV *env, const char *fmt, ...)
+#else
+__db_msg(env, fmt, va_alist)
+ const ENV *env;
+ const char *fmt;
+ va_dcl
+#endif
+{
+ DB_ENV *dbenv;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+
+ DB_REAL_MSG(dbenv, fmt);
+}
+
+/*
+ * __db_repmsg --
+ * Replication system message routine.
+ *
+ * PUBLIC: void __db_repmsg __P((const ENV *, const char *, ...))
+ * PUBLIC: __attribute__ ((__format__ (__printf__, 2, 3)));
+ */
+void
+#ifdef STDC_HEADERS
+__db_repmsg(const ENV *env, const char *fmt, ...)
+#else
+__db_repmsg(env, fmt, va_alist)
+ const ENV *env;
+ const char *fmt;
+ va_dcl
+#endif
+{
+ va_list ap;
+ char buf[2048]; /* !!!: END OF THE STACK DON'T TRUST SPRINTF. */
+
+#ifdef STDC_HEADERS
+ va_start(ap, fmt);
+#else
+ va_start(ap);
+#endif
+ (void)vsnprintf(buf, sizeof(buf), fmt, ap);
+ __rep_msg(env, buf);
+ va_end(ap);
+}
+
+/*
+ * __db_msgcall --
+ * Do the message work for callback functions.
+ */
+static void
+__db_msgcall(dbenv, fmt, ap)
+ const DB_ENV *dbenv;
+ const char *fmt;
+ va_list ap;
+{
+ char buf[2048]; /* !!!: END OF THE STACK DON'T TRUST SPRINTF. */
+
+ (void)vsnprintf(buf, sizeof(buf), fmt, ap);
+
+ dbenv->db_msgcall(dbenv, buf);
+}
+
+/*
+ * __db_msgfile --
+ * Do the message work for FILE *s.
+ */
+static void
+__db_msgfile(dbenv, fmt, ap)
+ const DB_ENV *dbenv;
+ const char *fmt;
+ va_list ap;
+{
+ FILE *fp;
+
+ fp = dbenv == NULL ||
+ dbenv->db_msgfile == NULL ? stdout : dbenv->db_msgfile;
+ (void)vfprintf(fp, fmt, ap);
+
+ (void)fprintf(fp, "\n");
+ (void)fflush(fp);
+}
+
+/*
+ * __db_unknown_flag -- report internal error
+ *
+ * PUBLIC: int __db_unknown_flag __P((ENV *, char *, u_int32_t));
+ */
+int
+__db_unknown_flag(env, routine, flag)
+ ENV *env;
+ char *routine;
+ u_int32_t flag;
+{
+ __db_errx(env, DB_STR_A("0093", "%s: Unknown flag: %#x", "%s %#x"),
+ routine, (u_int)flag);
+
+#ifdef DIAGNOSTIC
+ __os_abort(env);
+ /* NOTREACHED */
+#endif
+ return (EINVAL);
+}
+
+/*
+ * __db_unknown_type -- report internal database type error
+ *
+ * PUBLIC: int __db_unknown_type __P((ENV *, char *, DBTYPE));
+ */
+int
+__db_unknown_type(env, routine, type)
+ ENV *env;
+ char *routine;
+ DBTYPE type;
+{
+ __db_errx(env, DB_STR_A("0094", "%s: Unexpected database type: %s",
+ "%s %s"), routine, __db_dbtype_to_string(type));
+
+#ifdef DIAGNOSTIC
+ __os_abort(env);
+ /* NOTREACHED */
+#endif
+ return (EINVAL);
+}
+
+/*
+ * __db_unknown_path -- report unexpected database code path error.
+ *
+ * PUBLIC: int __db_unknown_path __P((ENV *, char *));
+ */
+int
+__db_unknown_path(env, routine)
+ ENV *env;
+ char *routine;
+{
+ __db_errx(env, DB_STR_A("0095", "%s: Unexpected code path error",
+ "%s"), routine);
+
+#ifdef DIAGNOSTIC
+ __os_abort(env);
+ /* NOTREACHED */
+#endif
+ return (EINVAL);
+}
+
+/*
+ * __db_check_txn --
+ * Check for common transaction errors.
+ *
+ * PUBLIC: int __db_check_txn __P((DB *, DB_TXN *, DB_LOCKER *, int));
+ */
+int
+__db_check_txn(dbp, txn, assoc_locker, read_op)
+ DB *dbp;
+ DB_TXN *txn;
+ DB_LOCKER *assoc_locker;
+ int read_op;
+{
+ ENV *env;
+ int related, ret;
+
+ env = dbp->env;
+
+ /*
+	 * If we are in recovery or aborting a transaction, we don't
+	 * need to enforce the rules barring transactional operations
+	 * on non-transactional dbps and vice versa.  This happens all
+	 * the time: the dbp during an abort may be transactional, but
+	 * we undo operations outside a transaction since we're
+	 * aborting.
+ */
+ if (IS_RECOVERING(env) || F_ISSET(dbp, DB_AM_RECOVER))
+ return (0);
+
+ /*
+ * Check for common transaction errors:
+ * an operation on a handle whose open commit hasn't completed.
+ * a transaction handle in a non-transactional environment
+ * a transaction handle for a non-transactional database
+ */
+ if (!read_op && txn != NULL && F_ISSET(txn, TXN_READONLY)) {
+ __db_errx(env, DB_STR("0096",
+ "Read-only transaction cannot be used for an update"));
+ return (EINVAL);
+ } else if (txn == NULL || F_ISSET(txn, TXN_PRIVATE)) {
+ if (dbp->cur_locker != NULL &&
+ dbp->cur_locker->id >= TXN_MINIMUM)
+ goto open_err;
+
+ if (!read_op && F_ISSET(dbp, DB_AM_TXN)) {
+ __db_errx(env, DB_STR("0097",
+ "Transaction not specified for a transactional database"));
+ return (EINVAL);
+ }
+ } else if (F_ISSET(txn, TXN_FAMILY)) {
+ /*
+ * Family transaction handles can be passed to any method,
+ * since they only determine locker IDs.
+ */
+ return (0);
+ } else {
+ if (!TXN_ON(env))
+ return (__db_not_txn_env(env));
+
+ if (!F_ISSET(dbp, DB_AM_TXN)) {
+ __db_errx(env, DB_STR("0098",
+ "Transaction specified for a non-transactional database"));
+ return (EINVAL);
+ }
+
+ if (F_ISSET(txn, TXN_DEADLOCK))
+ return (__db_txn_deadlock_err(env, txn));
+
+ if (dbp->cur_locker != NULL &&
+ dbp->cur_locker->id >= TXN_MINIMUM &&
+ dbp->cur_locker->id != txn->txnid) {
+ if ((ret = __lock_locker_same_family(env,
+ dbp->cur_locker, txn->locker, &related)) != 0)
+ return (ret);
+ if (!related)
+ goto open_err;
+ }
+ }
+
+ /*
+ * If dbp->associate_locker is not NULL, that means we're in
+ * the middle of a DB->associate with DB_CREATE (i.e., a secondary index
+ * creation).
+ *
+ * In addition to the usual transaction rules, we need to lock out
+ * non-transactional updates that aren't part of the associate (and
+ * thus are using some other locker ID).
+ *
+ * Transactional updates should simply block; from the time we
+ * decide to build the secondary until commit, we'll hold a write
+ * lock on all of its pages, so it should be safe to attempt to update
+ * the secondary in another transaction (presumably by updating the
+ * primary).
+ */
+ if (!read_op && dbp->associate_locker != NULL &&
+ txn != NULL && dbp->associate_locker != assoc_locker) {
+ __db_errx(env, DB_STR("0099",
+ "Operation forbidden while secondary index is being created"));
+ return (EINVAL);
+ }
+
+ /*
+ * Check the txn and dbp are from the same env.
+ */
+ if (txn != NULL && env != txn->mgrp->env) {
+ __db_errx(env, DB_STR("0100",
+ "Transaction and database from different environments"));
+ return (EINVAL);
+ }
+
+ return (0);
+open_err:
+ if (F2_ISSET(dbp, DB2_AM_EXCL))
+ __db_errx(env, DB_STR("0209",
+"Exclusive database handles can only have one active transaction at a time."));
+ else
+ __db_errx(env, DB_STR("0101",
+ "Transaction that opened the DB handle is still active"));
+ return (EINVAL);
+}
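+
+/*
+ * Illustrative sketch (not part of the library source): the most common
+ * mistake the routine above catches.  Assuming a transactional
+ * environment "dbenv" and a handle "dbp" opened inside a transaction:
+ *
+ *	DB_TXN *txn;
+ *
+ *	dbenv->txn_begin(dbenv, NULL, &txn, 0);
+ *	dbp->open(dbp, txn, "a.db", NULL, DB_BTREE, DB_CREATE, 0644);
+ *
+ * calling dbp->put(dbp, NULL, &key, &data, 0) before txn commits fails
+ * with EINVAL and the "Transaction that opened the DB handle is still
+ * active" message; after txn->commit(txn, 0) the put is legal.
+ */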
+
+/*
+ * __db_txn_deadlock_err --
+ *	Transaction has already been deadlocked.
+ *
+ * PUBLIC: int __db_txn_deadlock_err __P((ENV *, DB_TXN *));
+ */
+int
+__db_txn_deadlock_err(env, txn)
+ ENV *env;
+ DB_TXN *txn;
+{
+ const char *name;
+
+ name = NULL;
+ (void)__txn_get_name(txn, &name);
+
+ __db_errx(env, DB_STR_A("0102",
+ "%s%sprevious transaction deadlock return not resolved",
+ "%s %s"), name == NULL ? "" : name, name == NULL ? "" : ": ");
+
+ return (EINVAL);
+}
+
+/*
+ * __db_not_txn_env --
+ * DB handle must be in an environment that supports transactions.
+ *
+ * PUBLIC: int __db_not_txn_env __P((ENV *));
+ */
+int
+__db_not_txn_env(env)
+ ENV *env;
+{
+ __db_errx(env, DB_STR("0103",
+ "DB environment not configured for transactions"));
+ return (EINVAL);
+}
+
+/*
+ * __db_rec_toobig --
+ * Fixed record length exceeded error message.
+ *
+ * PUBLIC: int __db_rec_toobig __P((ENV *, u_int32_t, u_int32_t));
+ */
+int
+__db_rec_toobig(env, data_len, fixed_rec_len)
+ ENV *env;
+ u_int32_t data_len, fixed_rec_len;
+{
+ __db_errx(env, DB_STR_A("0104",
+ "%lu larger than database's maximum record length %lu",
+ "%lu %lu"), (u_long)data_len, (u_long)fixed_rec_len);
+ return (EINVAL);
+}
+
+/*
+ * __db_rec_repl --
+ * Fixed record replacement length error message.
+ *
+ * PUBLIC: int __db_rec_repl __P((ENV *, u_int32_t, u_int32_t));
+ */
+int
+__db_rec_repl(env, data_size, data_dlen)
+ ENV *env;
+ u_int32_t data_size, data_dlen;
+{
+ __db_errx(env, DB_STR_A("0105",
+ "Record length error: "
+ "replacement length %lu differs from replaced length %lu",
+ "%lu %lu"), (u_long)data_size, (u_long)data_dlen);
+ return (EINVAL);
+}
+
+#if defined(DIAGNOSTIC) || defined(DEBUG_ROP) || defined(DEBUG_WOP)
+/*
+ * __dbc_logging --
+ * In DIAGNOSTIC mode, check for bad replication combinations.
+ *
+ * PUBLIC: int __dbc_logging __P((DBC *));
+ */
+int
+__dbc_logging(dbc)
+ DBC *dbc;
+{
+ DB_REP *db_rep;
+ ENV *env;
+ int ret;
+
+ env = dbc->env;
+ db_rep = env->rep_handle;
+
+ ret = LOGGING_ON(env) &&
+ !F_ISSET(dbc, DBC_RECOVER) && !IS_REP_CLIENT(env);
+
+ /*
+ * If we're not using replication or running recovery, return.
+ */
+ if (db_rep == NULL || F_ISSET(dbc, DBC_RECOVER))
+ return (ret);
+
+#ifndef DEBUG_ROP
+ /*
+ * Only check when DEBUG_ROP is not configured. People often do
+ * non-transactional reads, and debug_rop is going to write
+ * a log record.
+ */
+ {
+ REP *rep;
+
+ rep = db_rep->region;
+
+ /*
+	 * If we're a client updating a durable database, error.
+ */
+ if (IS_REP_CLIENT(env) && !F_ISSET(dbc->dbp, DB_AM_NOT_DURABLE)) {
+ __db_errx(env, DB_STR("0106",
+ "dbc_logging: Client update"));
+ goto err;
+ }
+
+#ifndef DEBUG_WOP
+ /*
+ * If DEBUG_WOP is enabled, then we'll generate debugging log records
+ * that are non-transactional. This is OK.
+ */
+ if (IS_REP_MASTER(env) &&
+ dbc->txn == NULL && !F_ISSET(dbc->dbp, DB_AM_NOT_DURABLE)) {
+ __db_errx(env, DB_STR("0107",
+			    "dbc_logging: Master non-txn update"));
+ goto err;
+ }
+#endif
+
+ if (0) {
+err: __db_errx(env, DB_STR_A("0108", "Rep: flags 0x%lx msg_th %lu",
+ "%lx %lu"), (u_long)rep->flags, (u_long)rep->msg_th);
+ __db_errx(env, DB_STR_A("0109", "Rep: handle %lu, opcnt %lu",
+ "%lu %lu"), (u_long)rep->handle_cnt, (u_long)rep->op_cnt);
+ __os_abort(env);
+ /* NOTREACHED */
+ }
+ }
+#endif
+ return (ret);
+}
+#endif
+
+/*
+ * __db_check_lsn --
+ * Display the log sequence error message.
+ *
+ * PUBLIC: int __db_check_lsn __P((ENV *, DB_LSN *, DB_LSN *));
+ */
+int
+__db_check_lsn(env, lsn, prev)
+ ENV *env;
+ DB_LSN *lsn, *prev;
+{
+ __db_errx(env, DB_STR_A("0110",
+ "Log sequence error: page LSN %lu %lu; previous LSN %lu %lu",
+ "%lu %lu %lu %lu"), (u_long)(lsn)->file,
+ (u_long)(lsn)->offset, (u_long)(prev)->file,
+ (u_long)(prev)->offset);
+ return (EINVAL);
+}
+
+/*
+ * __db_rdonly --
+ * Common readonly message.
+ * PUBLIC: int __db_rdonly __P((const ENV *, const char *));
+ */
+int
+__db_rdonly(env, name)
+ const ENV *env;
+ const char *name;
+{
+ __db_errx(env, DB_STR_A("0111",
+ "%s: attempt to modify a read-only database", "%s"), name);
+ return (EACCES);
+}
+
+/*
+ * __db_space_err --
+ * Common out of space message.
+ * PUBLIC: int __db_space_err __P((const DB *));
+ */
+int
+__db_space_err(dbp)
+ const DB *dbp;
+{
+ __db_errx(dbp->env, DB_STR_A("0112",
+ "%s: file limited to %lu pages", "%s %lu"),
+ dbp->fname, (u_long)dbp->mpf->mfp->maxpgno);
+ return (ENOSPC);
+}
+
+/*
+ * __db_failed --
+ * Common failed thread message.
+ *
+ * PUBLIC: int __db_failed __P((const ENV *,
+ * PUBLIC: const char *, pid_t, db_threadid_t));
+ */
+int
+__db_failed(env, msg, pid, tid)
+ const ENV *env;
+ const char *msg;
+ pid_t pid;
+ db_threadid_t tid;
+{
+ DB_ENV *dbenv;
+ char buf[DB_THREADID_STRLEN];
+
+ dbenv = env->dbenv;
+
+ __db_errx(env, DB_STR_A("0113", "Thread/process %s failed: %s",
+ "%s %s"), dbenv->thread_id_string(dbenv, pid, tid, buf), msg);
+ return (DB_RUNRECOVERY);
+}
diff --git a/src/common/db_getlong.c b/src/common/db_getlong.c
new file mode 100644
index 00000000..cac55a0e
--- /dev/null
+++ b/src/common/db_getlong.c
@@ -0,0 +1,146 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __db_getlong --
+ * Return a long value inside of basic parameters.
+ *
+ * PUBLIC: int __db_getlong
+ * PUBLIC: __P((DB_ENV *, const char *, char *, long, long, long *));
+ */
+int
+__db_getlong(dbenv, progname, p, min, max, storep)
+ DB_ENV *dbenv;
+ const char *progname;
+ char *p;
+ long min, max, *storep;
+{
+ long val;
+ char *end;
+
+ __os_set_errno(0);
+ val = strtol(p, &end, 10);
+ if ((val == LONG_MIN || val == LONG_MAX) &&
+ __os_get_errno() == ERANGE) {
+ if (dbenv == NULL)
+ fprintf(stderr, "%s: %s: %s\n",
+ progname, p, strerror(ERANGE));
+ else
+ dbenv->err(dbenv, ERANGE, "%s", p);
+ return (ERANGE);
+ }
+ if (p[0] == '\0' || (end[0] != '\0' && end[0] != '\n')) {
+ if (dbenv == NULL)
+ fprintf(stderr, DB_STR_A("0042",
+ "%s: %s: Invalid numeric argument\n",
+ "%s %s\n"), progname, p);
+ else
+ dbenv->errx(dbenv, DB_STR_A("0043",
+ "%s: Invalid numeric argument", "%s"), p);
+ return (EINVAL);
+ }
+ if (val < min) {
+ if (dbenv == NULL)
+ fprintf(stderr, DB_STR_A("0044",
+ "%s: %s: Less than minimum value (%ld)\n",
+ "%s %s %ld\n"), progname, p, min);
+ else
+ dbenv->errx(dbenv, DB_STR_A("0045",
+ "%s: Less than minimum value (%ld)",
+ "%s %ld"), p, min);
+ return (ERANGE);
+ }
+ if (val > max) {
+ if (dbenv == NULL)
+ fprintf(stderr, DB_STR_A("0046",
+ "%s: %s: Greater than maximum value (%ld)\n",
+ "%s %s %ld\n"), progname, p, max);
+ else
+ dbenv->errx(dbenv, DB_STR_A("0047",
+ "%s: Greater than maximum value (%ld)",
+ "%s %ld"), p, max);
+ return (ERANGE);
+ }
+ *storep = val;
+ return (0);
+}
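+
+/*
+ * Illustrative sketch (not part of the library source): typical use when
+ * parsing a numeric command-line argument; passing a NULL DB_ENV sends
+ * complaints to stderr.  "progname" and "optarg" are assumptions:
+ *
+ *	long cachesize;
+ *
+ *	if (__db_getlong(NULL,
+ *	    progname, optarg, 1, LONG_MAX, &cachesize) != 0)
+ *		return (EXIT_FAILURE);
+ */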
+
+/*
+ * __db_getulong --
+ * Return an unsigned long value inside of basic parameters.
+ *
+ * PUBLIC: int __db_getulong
+ * PUBLIC: __P((DB_ENV *, const char *, char *, u_long, u_long, u_long *));
+ */
+int
+__db_getulong(dbenv, progname, p, min, max, storep)
+ DB_ENV *dbenv;
+ const char *progname;
+ char *p;
+ u_long min, max, *storep;
+{
+ u_long val;
+ char *end;
+
+ __os_set_errno(0);
+ val = strtoul(p, &end, 10);
+ if (val == ULONG_MAX && __os_get_errno() == ERANGE) {
+ if (dbenv == NULL)
+ fprintf(stderr, "%s: %s: %s\n",
+ progname, p, strerror(ERANGE));
+ else
+ dbenv->err(dbenv, ERANGE, "%s", p);
+ return (ERANGE);
+ }
+ if (p[0] == '\0' || (end[0] != '\0' && end[0] != '\n')) {
+ if (dbenv == NULL)
+ fprintf(stderr, DB_STR_A("0048",
+ "%s: %s: Invalid numeric argument\n",
+ "%s %s\n"), progname, p);
+ else
+ dbenv->errx(dbenv, DB_STR_A("0049",
+ "%s: Invalid numeric argument",
+ "%s"), p);
+ return (EINVAL);
+ }
+ if (val < min) {
+ if (dbenv == NULL)
+ fprintf(stderr, DB_STR_A("0050",
+ "%s: %s: Less than minimum value (%lu)\n",
+ "%s %s %lu\n"), progname, p, min);
+ else
+ dbenv->errx(dbenv, DB_STR_A("0051",
+ "%s: Less than minimum value (%lu)",
+ "%s %lu"), p, min);
+ return (ERANGE);
+ }
+
+ /*
+ * We allow a 0 to substitute as a max value for ULONG_MAX because
+ * 1) accepting only a 0 value is unlikely to be necessary, and 2)
+ * we don't want callers to have to use ULONG_MAX explicitly, as it
+ * may not exist on all platforms.
+ */
+ if (max != 0 && val > max) {
+ if (dbenv == NULL)
+ fprintf(stderr, DB_STR_A("0052",
+ "%s: %s: Greater than maximum value (%lu)\n",
+ "%s %s %lu\n"), progname, p, max);
+ else
+ dbenv->errx(dbenv, DB_STR_A("0053",
+ "%s: Greater than maximum value (%lu)",
+ "%s %lu"), p, max);
+ return (ERANGE);
+ }
+ *storep = val;
+ return (0);
+}
diff --git a/src/common/db_idspace.c b/src/common/db_idspace.c
new file mode 100644
index 00000000..a9cbb1bf
--- /dev/null
+++ b/src/common/db_idspace.c
@@ -0,0 +1,85 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+static int __db_idcmp __P((const void *, const void *));
+
+static int
+__db_idcmp(a, b)
+ const void *a;
+ const void *b;
+{
+ u_int32_t i, j;
+
+ i = *(u_int32_t *)a;
+ j = *(u_int32_t *)b;
+
+ if (i < j)
+ return (-1);
+ else if (i > j)
+ return (1);
+ else
+ return (0);
+}
+
+/*
+ * __db_idspace --
+ *
+ * On input, minp and maxp contain the minimum and maximum valid values for
+ * the name space and on return, they contain the minimum and maximum ids
+ * available (by finding the biggest gap). The minimum can be an inuse
+ * value, but the maximum cannot be.
+ *
+ * PUBLIC: void __db_idspace __P((u_int32_t *, int, u_int32_t *, u_int32_t *));
+ */
+void
+__db_idspace(inuse, n, minp, maxp)
+ u_int32_t *inuse;
+ int n;
+ u_int32_t *minp, *maxp;
+{
+ int i, low;
+ u_int32_t gap, t;
+
+ /* A single locker ID is a special case. */
+ if (n == 1) {
+ /*
+ * If the single item in use is the last one in the range,
+		 * we have to wrap; the min stays at the minimum ID, which
+		 * is what we came in with, so we leave it alone.
+ */
+ if (inuse[0] != *maxp)
+ *minp = inuse[0];
+ *maxp = inuse[0] - 1;
+ return;
+ }
+
+ gap = 0;
+ low = 0;
+ qsort(inuse, (size_t)n, sizeof(u_int32_t), __db_idcmp);
+ for (i = 0; i < n - 1; i++)
+ if ((t = (inuse[i + 1] - inuse[i])) > gap) {
+ gap = t;
+ low = i;
+ }
+
+ /* Check for largest gap at the end of the space. */
+ if ((*maxp - inuse[n - 1]) + (inuse[0] - *minp) > gap) {
+ /* Do same check as we do in the n == 1 case. */
+ if (inuse[n - 1] != *maxp)
+ *minp = inuse[n - 1];
+ *maxp = inuse[0] - 1;
+ } else {
+ *minp = inuse[low];
+ *maxp = inuse[low + 1] - 1;
+ }
+}
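+
+/*
+ * Worked example (not part of the library source): with *minp = 1,
+ * *maxp = 10 and inuse = {2, 3, 7}, the largest in-range gap is
+ * 3..7 (size 4) and the wrapped end gap is (10 - 7) + (2 - 1) = 4;
+ * the tie goes to the in-range gap, so the routine sets *minp = 3 and
+ * *maxp = 6 -- the minimum may be the in-use value 3, the maximum
+ * stops short of the in-use value 7.
+ */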
diff --git a/src/common/db_log2.c b/src/common/db_log2.c
new file mode 100644
index 00000000..9c929f84
--- /dev/null
+++ b/src/common/db_log2.c
@@ -0,0 +1,57 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1995, 1996
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Margo Seltzer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * PUBLIC: u_int32_t __db_log2 __P((u_int32_t));
+ */
+u_int32_t
+__db_log2(num)
+ u_int32_t num;
+{
+ u_int32_t i, limit;
+
+ limit = 1;
+ for (i = 0; limit < num; limit = limit << 1)
+ ++i;
+ return (i);
+}
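+
+/*
+ * Worked example (not part of the library source): __db_log2() returns
+ * the ceiling of log2(num): __db_log2(1024) == 10, __db_log2(1025) == 11
+ * and __db_log2(1) == 0, which is handy for sizing power-of-two
+ * structures from a requested element count.
+ */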
diff --git a/src/common/db_shash.c b/src/common/db_shash.c
new file mode 100644
index 00000000..a056e4b1
--- /dev/null
+++ b/src/common/db_shash.c
@@ -0,0 +1,104 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __db_tablesize --
+ * Choose a size for the hash table.
+ *
+ * PUBLIC: u_int32_t __db_tablesize __P((u_int32_t));
+ */
+u_int32_t
+__db_tablesize(n_buckets)
+ u_int32_t n_buckets;
+{
+ /*
+ * We try to be clever about how big we make the hash tables. Use a
+ * prime number close to the "suggested" number of elements that will
+ * be in the hash table. Use 32 as the minimum hash table size.
+ *
+ * Ref: Sedgewick, Algorithms in C, "Hash Functions"
+ *
+ * Up to ~250,000 buckets, we use powers of 2. After that, we slow
+ * the rate of increase by half. For each choice, we then use a
+ * nearby prime number as the hash value.
+ *
+ * If a terabyte is the maximum cache we'll see, and we assume there
+ * are 10 1K buckets on each hash chain, then 107374182 is the maximum
+ * number of buckets we'll ever need.
+ *
+ * We don't use the obvious static data structure because some C
+ * compilers (and I use the term loosely), can't handle them.
+ */
+#define HASH_SIZE(power, prime) { \
+ if ((power) >= n_buckets) \
+ return (prime); \
+}
+ HASH_SIZE(32, 37); /* 2^5 */
+ HASH_SIZE(64, 67); /* 2^6 */
+ HASH_SIZE(128, 131); /* 2^7 */
+ HASH_SIZE(256, 257); /* 2^8 */
+ HASH_SIZE(512, 521); /* 2^9 */
+ HASH_SIZE(1024, 1031); /* 2^10 */
+ HASH_SIZE(2048, 2053); /* 2^11 */
+ HASH_SIZE(4096, 4099); /* 2^12 */
+ HASH_SIZE(8192, 8191); /* 2^13 */
+ HASH_SIZE(16384, 16381); /* 2^14 */
+ HASH_SIZE(32768, 32771); /* 2^15 */
+ HASH_SIZE(65536, 65537); /* 2^16 */
+ HASH_SIZE(131072, 131071); /* 2^17 */
+ HASH_SIZE(262144, 262147); /* 2^18 */
+ HASH_SIZE(393216, 393209); /* 2^18 + 2^18/2 */
+ HASH_SIZE(524288, 524287); /* 2^19 */
+ HASH_SIZE(786432, 786431); /* 2^19 + 2^19/2 */
+ HASH_SIZE(1048576, 1048573); /* 2^20 */
+ HASH_SIZE(1572864, 1572869); /* 2^20 + 2^20/2 */
+ HASH_SIZE(2097152, 2097169); /* 2^21 */
+ HASH_SIZE(3145728, 3145721); /* 2^21 + 2^21/2 */
+ HASH_SIZE(4194304, 4194301); /* 2^22 */
+ HASH_SIZE(6291456, 6291449); /* 2^22 + 2^22/2 */
+ HASH_SIZE(8388608, 8388617); /* 2^23 */
+ HASH_SIZE(12582912, 12582917); /* 2^23 + 2^23/2 */
+ HASH_SIZE(16777216, 16777213); /* 2^24 */
+ HASH_SIZE(25165824, 25165813); /* 2^24 + 2^24/2 */
+ HASH_SIZE(33554432, 33554393); /* 2^25 */
+ HASH_SIZE(50331648, 50331653); /* 2^25 + 2^25/2 */
+ HASH_SIZE(67108864, 67108859); /* 2^26 */
+ HASH_SIZE(100663296, 100663291); /* 2^26 + 2^26/2 */
+ HASH_SIZE(134217728, 134217757); /* 2^27 */
+ HASH_SIZE(201326592, 201326611); /* 2^27 + 2^27/2 */
+ HASH_SIZE(268435456, 268435459); /* 2^28 */
+ HASH_SIZE(402653184, 402653189); /* 2^28 + 2^28/2 */
+ HASH_SIZE(536870912, 536870909); /* 2^29 */
+ HASH_SIZE(805306368, 805306357); /* 2^29 + 2^29/2 */
+ HASH_SIZE(1073741824, 1073741827); /* 2^30 */
+ return (1073741827);
+}
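+
+/*
+ * Worked example (not part of the library source): a request for 1,000
+ * buckets walks the table to HASH_SIZE(1024, 1031) and returns the
+ * nearby prime 1031; a request for 300,000 buckets lands in the
+ * slower-growth region and returns 393209.
+ */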
+
+/*
+ * __db_hashinit --
+ * Initialize a hash table that resides in shared memory.
+ *
+ * PUBLIC: void __db_hashinit __P((void *, u_int32_t));
+ */
+void
+__db_hashinit(begin, nelements)
+ void *begin;
+ u_int32_t nelements;
+{
+ u_int32_t i;
+ SH_TAILQ_HEAD(hash_head) *headp;
+
+ headp = (struct hash_head *)begin;
+
+ for (i = 0; i < nelements; i++, headp++)
+ SH_TAILQ_INIT(headp);
+}
diff --git a/src/common/dbt.c b/src/common/dbt.c
new file mode 100644
index 00000000..90409f2c
--- /dev/null
+++ b/src/common/dbt.c
@@ -0,0 +1,74 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __dbt_usercopy --
+ * Take a copy of the user's data, if a callback is supplied.
+ *
+ * PUBLIC: int __dbt_usercopy __P((ENV *, DBT *));
+ */
+int
+__dbt_usercopy(env, dbt)
+ ENV *env;
+ DBT *dbt;
+{
+ void *buf;
+ int ret;
+
+ if (dbt == NULL || !F_ISSET(dbt, DB_DBT_USERCOPY) || dbt->size == 0 ||
+ dbt->data != NULL)
+ return (0);
+
+ buf = NULL;
+ if ((ret = __os_umalloc(env, dbt->size, &buf)) != 0 ||
+ (ret = env->dbt_usercopy(dbt, 0, buf, dbt->size,
+ DB_USERCOPY_GETDATA)) != 0)
+ goto err;
+ dbt->data = buf;
+
+ return (0);
+
+err: if (buf != NULL) {
+ __os_ufree(env, buf);
+ dbt->data = NULL;
+ }
+
+ return (ret);
+}
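+
+/*
+ * Illustrative sketch (not part of the library source): a usercopy
+ * callback consistent with the call above, which asks the application
+ * to copy dbt->size bytes of its own data into a library buffer.  The
+ * use of the DBT's app_data field to find the application's storage is
+ * an assumption here, as is how the callback gets registered:
+ *
+ *	static int
+ *	my_usercopy(DBT *dbt, u_int32_t offset, void *buf,
+ *	    u_int32_t size, u_int32_t flags)
+ *	{
+ *		if (flags == DB_USERCOPY_GETDATA)
+ *			memcpy(buf,
+ *			    (u_int8_t *)dbt->app_data + offset, size);
+ *		else
+ *			memcpy((u_int8_t *)dbt->app_data + offset,
+ *			    buf, size);
+ *		return (0);
+ *	}
+ */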
+
+/*
+ * __dbt_userfree --
+ * Free a copy of the user's data, if necessary.
+ *
+ * PUBLIC: void __dbt_userfree __P((ENV *, DBT *, DBT *, DBT *));
+ */
+void
+__dbt_userfree(env, key, pkey, data)
+ ENV *env;
+ DBT *key, *pkey, *data;
+{
+ if (key != NULL &&
+ F_ISSET(key, DB_DBT_USERCOPY) && key->data != NULL) {
+ __os_ufree(env, key->data);
+ key->data = NULL;
+ }
+ if (pkey != NULL &&
+ F_ISSET(pkey, DB_DBT_USERCOPY) && pkey->data != NULL) {
+ __os_ufree(env, pkey->data);
+ pkey->data = NULL;
+ }
+ if (data != NULL &&
+ F_ISSET(data, DB_DBT_USERCOPY) && data->data != NULL) {
+ __os_ufree(env, data->data);
+ data->data = NULL;
+ }
+}
diff --git a/src/common/mkpath.c b/src/common/mkpath.c
new file mode 100644
index 00000000..c684692c
--- /dev/null
+++ b/src/common/mkpath.c
@@ -0,0 +1,68 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __db_mkpath --
+ * Create intermediate directories.
+ *
+ * PUBLIC: int __db_mkpath __P((ENV *, const char *));
+ */
+int
+__db_mkpath(env, name)
+ ENV *env;
+ const char *name;
+{
+ size_t len;
+ int ret;
+ char *p, *t, savech;
+
+ /*
+ * Get a copy so we can modify the string. It's a path and potentially
+ * quite long, so don't allocate the space on the stack.
+ */
+ len = strlen(name) + 1;
+ if ((ret = __os_malloc(env, len, &t)) != 0)
+ return (ret);
+ memcpy(t, name, len);
+
+ /*
+ * Cycle through the path, creating intermediate directories.
+ *
+ * Skip the first byte if it's a path separator, it's the start of an
+ * absolute pathname.
+ */
+ if (PATH_SEPARATOR[1] == '\0') {
+ for (p = t + 1; p[0] != '\0'; ++p)
+ if (p[0] == PATH_SEPARATOR[0]) {
+ savech = *p;
+ *p = '\0';
+ if (__os_exists(env, t, NULL) &&
+ (ret = __os_mkdir(
+ env, t, env->dir_mode)) != 0)
+ break;
+ *p = savech;
+ }
+ } else
+ for (p = t + 1; p[0] != '\0'; ++p)
+ if (strchr(PATH_SEPARATOR, p[0]) != NULL) {
+ savech = *p;
+ *p = '\0';
+ if (__os_exists(env, t, NULL) &&
+ (ret = __os_mkdir(
+ env, t, env->dir_mode)) != 0)
+ break;
+ *p = savech;
+ }
+
+ __os_free(env, t);
+ return (ret);
+}
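+
+/*
+ * Worked example (not part of the library source): __db_mkpath() is in
+ * effect "mkdir -p" for everything but the final component.  Given
+ * name = "a/b/c/file", the loop truncates the copy at each separator in
+ * turn -- "a", then "a/b", then "a/b/c" -- creating whichever components
+ * __os_exists() reports missing, restoring the separator each time.
+ */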
diff --git a/src/common/openflags.c b/src/common/openflags.c
new file mode 100644
index 00000000..cec1f081
--- /dev/null
+++ b/src/common/openflags.c
@@ -0,0 +1,51 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __db_openflags --
+ * Convert open(2) flags to DB flags.
+ *
+ * PUBLIC: u_int32_t __db_openflags __P((int));
+ */
+u_int32_t
+__db_openflags(oflags)
+ int oflags;
+{
+ u_int32_t dbflags;
+
+ dbflags = 0;
+
+ if (oflags & O_CREAT)
+ dbflags |= DB_CREATE;
+
+ if (oflags & O_TRUNC)
+ dbflags |= DB_TRUNCATE;
+
+ /*
+ * !!!
+ * Convert POSIX 1003.1 open(2) mode flags to DB flags. This isn't
+ * an exact science as few POSIX implementations have a flag value
+ * for O_RDONLY, it's simply the lack of a write flag.
+ */
+#ifndef O_ACCMODE
+#define O_ACCMODE (O_RDONLY | O_RDWR | O_WRONLY)
+#endif
+ switch (oflags & O_ACCMODE) {
+ case O_RDWR:
+ case O_WRONLY:
+ break;
+ default:
+ dbflags |= DB_RDONLY;
+ break;
+ }
+ return (dbflags);
+}
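+
+/*
+ * Worked example (not part of the library source):
+ * __db_openflags(O_CREAT | O_TRUNC | O_RDWR) returns
+ * DB_CREATE | DB_TRUNCATE, while __db_openflags(O_RDONLY) -- on most
+ * systems the absence of any write bit -- returns DB_RDONLY.
+ */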
diff --git a/src/common/os_method.c b/src/common/os_method.c
new file mode 100644
index 00000000..1ee06d7a
--- /dev/null
+++ b/src/common/os_method.c
@@ -0,0 +1,270 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * EXTERN: int db_env_set_func_assert
+ * EXTERN: __P((void (*)(const char *, const char *, int)));
+ */
+int
+db_env_set_func_assert(func_assert)
+ void (*func_assert) __P((const char *, const char *, int));
+{
+ DB_GLOBAL(j_assert) = func_assert;
+ return (0);
+}
+
+/*
+ * EXTERN: int db_env_set_func_close __P((int (*)(int)));
+ */
+int
+db_env_set_func_close(func_close)
+ int (*func_close) __P((int));
+{
+ DB_GLOBAL(j_close) = func_close;
+ return (0);
+}
+
+/*
+ * EXTERN: int db_env_set_func_dirfree __P((void (*)(char **, int)));
+ */
+int
+db_env_set_func_dirfree(func_dirfree)
+ void (*func_dirfree) __P((char **, int));
+{
+ DB_GLOBAL(j_dirfree) = func_dirfree;
+ return (0);
+}
+
+/*
+ * EXTERN: int db_env_set_func_dirlist
+ * EXTERN: __P((int (*)(const char *, char ***, int *)));
+ */
+int
+db_env_set_func_dirlist(func_dirlist)
+ int (*func_dirlist) __P((const char *, char ***, int *));
+{
+ DB_GLOBAL(j_dirlist) = func_dirlist;
+ return (0);
+}
+
+/*
+ * EXTERN: int db_env_set_func_exists __P((int (*)(const char *, int *)));
+ */
+int
+db_env_set_func_exists(func_exists)
+ int (*func_exists) __P((const char *, int *));
+{
+ DB_GLOBAL(j_exists) = func_exists;
+ return (0);
+}
+
+/*
+ * EXTERN: int db_env_set_func_free __P((void (*)(void *)));
+ */
+int
+db_env_set_func_free(func_free)
+ void (*func_free) __P((void *));
+{
+ DB_GLOBAL(j_free) = func_free;
+ return (0);
+}
+
+/*
+ * EXTERN: int db_env_set_func_fsync __P((int (*)(int)));
+ */
+int
+db_env_set_func_fsync(func_fsync)
+ int (*func_fsync) __P((int));
+{
+ DB_GLOBAL(j_fsync) = func_fsync;
+ return (0);
+}
+
+/*
+ * EXTERN: int db_env_set_func_ftruncate __P((int (*)(int, off_t)));
+ */
+int
+db_env_set_func_ftruncate(func_ftruncate)
+ int (*func_ftruncate) __P((int, off_t));
+{
+ DB_GLOBAL(j_ftruncate) = func_ftruncate;
+ return (0);
+}
+
+/*
+ * EXTERN: int db_env_set_func_ioinfo __P((int (*)(const char *,
+ * EXTERN: int, u_int32_t *, u_int32_t *, u_int32_t *)));
+ */
+int
+db_env_set_func_ioinfo(func_ioinfo)
+ int (*func_ioinfo)
+ __P((const char *, int, u_int32_t *, u_int32_t *, u_int32_t *));
+{
+ DB_GLOBAL(j_ioinfo) = func_ioinfo;
+ return (0);
+}
+
+/*
+ * EXTERN: int db_env_set_func_malloc __P((void *(*)(size_t)));
+ */
+int
+db_env_set_func_malloc(func_malloc)
+ void *(*func_malloc) __P((size_t));
+{
+ DB_GLOBAL(j_malloc) = func_malloc;
+ return (0);
+}
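+
+/*
+ * Illustrative sketch (not part of the library source): the jump-table
+ * setters let an application interpose its own primitives, and should
+ * be called before any DB handles are created.  A counting allocator,
+ * with hypothetical names:
+ *
+ *	static size_t allocs;
+ *
+ *	static void *
+ *	counting_malloc(size_t n)
+ *	{
+ *		++allocs;
+ *		return (malloc(n));
+ *	}
+ *
+ *	(void)db_env_set_func_malloc(counting_malloc);
+ *	(void)db_env_set_func_free(free);
+ */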
+
+/*
+ * EXTERN: int db_env_set_func_file_map
+ * EXTERN: __P((int (*)(DB_ENV *, char *, size_t, int, void **),
+ * EXTERN: int (*)(DB_ENV *, void *)));
+ */
+int
+db_env_set_func_file_map(func_file_map, func_file_unmap)
+ int (*func_file_map) __P((DB_ENV *, char *, size_t, int, void **));
+ int (*func_file_unmap) __P((DB_ENV *, void *));
+{
+ DB_GLOBAL(j_file_map) = func_file_map;
+ DB_GLOBAL(j_file_unmap) = func_file_unmap;
+ return (0);
+}
+
+/*
+ * EXTERN: int db_env_set_func_region_map
+ * EXTERN: __P((int (*)(DB_ENV *, char *, size_t, int *, void **),
+ * EXTERN: int (*)(DB_ENV *, void *)));
+ */
+int
+db_env_set_func_region_map(func_region_map, func_region_unmap)
+ int (*func_region_map) __P((DB_ENV *, char *, size_t, int *, void **));
+ int (*func_region_unmap) __P((DB_ENV *, void *));
+{
+ DB_GLOBAL(j_region_map) = func_region_map;
+ DB_GLOBAL(j_region_unmap) = func_region_unmap;
+ return (0);
+}
+
+/*
+ * EXTERN: int db_env_set_func_pread
+ * EXTERN: __P((ssize_t (*)(int, void *, size_t, off_t)));
+ */
+int
+db_env_set_func_pread(func_pread)
+ ssize_t (*func_pread) __P((int, void *, size_t, off_t));
+{
+ DB_GLOBAL(j_pread) = func_pread;
+ return (0);
+}
+
+/*
+ * EXTERN: int db_env_set_func_pwrite
+ * EXTERN: __P((ssize_t (*)(int, const void *, size_t, off_t)));
+ */
+int
+db_env_set_func_pwrite(func_pwrite)
+ ssize_t (*func_pwrite) __P((int, const void *, size_t, off_t));
+{
+ DB_GLOBAL(j_pwrite) = func_pwrite;
+ return (0);
+}
+
+/*
+ * EXTERN: int db_env_set_func_open __P((int (*)(const char *, int, ...)));
+ */
+int
+db_env_set_func_open(func_open)
+ int (*func_open) __P((const char *, int, ...));
+{
+ DB_GLOBAL(j_open) = func_open;
+ return (0);
+}
+
+/*
+ * EXTERN: int db_env_set_func_read __P((ssize_t (*)(int, void *, size_t)));
+ */
+int
+db_env_set_func_read(func_read)
+ ssize_t (*func_read) __P((int, void *, size_t));
+{
+ DB_GLOBAL(j_read) = func_read;
+ return (0);
+}
+
+/*
+ * EXTERN: int db_env_set_func_realloc __P((void *(*)(void *, size_t)));
+ */
+int
+db_env_set_func_realloc(func_realloc)
+ void *(*func_realloc) __P((void *, size_t));
+{
+ DB_GLOBAL(j_realloc) = func_realloc;
+ return (0);
+}
+
+/*
+ * EXTERN: int db_env_set_func_rename
+ * EXTERN: __P((int (*)(const char *, const char *)));
+ */
+int
+db_env_set_func_rename(func_rename)
+ int (*func_rename) __P((const char *, const char *));
+{
+ DB_GLOBAL(j_rename) = func_rename;
+ return (0);
+}
+
+/*
+ * EXTERN: int db_env_set_func_seek
+ * EXTERN: __P((int (*)(int, off_t, int)));
+ */
+int
+db_env_set_func_seek(func_seek)
+ int (*func_seek) __P((int, off_t, int));
+{
+ DB_GLOBAL(j_seek) = func_seek;
+ return (0);
+}
+
+/*
+ * EXTERN: int db_env_set_func_unlink __P((int (*)(const char *)));
+ */
+int
+db_env_set_func_unlink(func_unlink)
+ int (*func_unlink) __P((const char *));
+{
+ DB_GLOBAL(j_unlink) = func_unlink;
+ return (0);
+}
+
+/*
+ * EXTERN: int db_env_set_func_write
+ * EXTERN: __P((ssize_t (*)(int, const void *, size_t)));
+ */
+int
+db_env_set_func_write(func_write)
+ ssize_t (*func_write) __P((int, const void *, size_t));
+{
+ DB_GLOBAL(j_write) = func_write;
+ return (0);
+}
+
+/*
+ * EXTERN: int db_env_set_func_yield __P((int (*)(u_long, u_long)));
+ */
+int
+db_env_set_func_yield(func_yield)
+ int (*func_yield) __P((u_long, u_long));
+{
+ DB_GLOBAL(j_yield) = func_yield;
+ return (0);
+}
diff --git a/src/common/util_arg.c b/src/common/util_arg.c
new file mode 100644
index 00000000..73416cb7
--- /dev/null
+++ b/src/common/util_arg.c
@@ -0,0 +1,56 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+#if DB_VERSION_MAJOR < 4 || DB_VERSION_MAJOR == 4 && DB_VERSION_MINOR < 5
+/*
+ * !!!
+ * We build this file in old versions of Berkeley DB when we're doing test
+ * runs using the test_micro tool. Without a prototype in place, we get
+ * warnings, and there's no simple workaround.
+ */
+char *strsep();
+#endif
+
+/*
+ * __db_util_arg --
+ * Convert a string into an argc/argv pair.
+ *
+ * PUBLIC: int __db_util_arg __P((char *, char *, int *, char ***));
+ */
+int
+__db_util_arg(arg0, str, argcp, argvp)
+ char *arg0, *str, ***argvp;
+ int *argcp;
+{
+ int n, ret;
+ char **ap, **argv;
+
+#define MAXARGS 25
+ if ((ret =
+ __os_malloc(NULL, (MAXARGS + 1) * sizeof(char **), &argv)) != 0)
+ return (ret);
+
+ ap = argv;
+ *ap++ = arg0;
+ for (n = 1; (*ap = strsep(&str, " \t")) != NULL;)
+ if (**ap != '\0') {
+ ++ap;
+ if (++n == MAXARGS)
+ break;
+ }
+ *ap = NULL;
+
+ *argcp = (int)(ap - argv);
+ *argvp = argv;
+
+ return (0);
+}
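+
+/*
+ * Worked example (not part of the library source): given arg0 = "prog"
+ * and str = "load  -h home", the strsep() loop splits on blanks and
+ * tabs, skips the empty token from the doubled space, and returns
+ * *argcp = 4 with argv = { "prog", "load", "-h", "home", NULL }.  Note
+ * that str itself is modified in place.
+ */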
diff --git a/src/common/util_cache.c b/src/common/util_cache.c
new file mode 100644
index 00000000..1206940b
--- /dev/null
+++ b/src/common/util_cache.c
@@ -0,0 +1,47 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2000, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __db_util_cache --
+ * Compute if we have enough cache.
+ *
+ * PUBLIC: int __db_util_cache __P((DB *, u_int32_t *, int *));
+ */
+int
+__db_util_cache(dbp, cachep, resizep)
+ DB *dbp;
+ u_int32_t *cachep;
+ int *resizep;
+{
+ u_int32_t pgsize;
+ int ret;
+
+ /* Get the current page size. */
+ if ((ret = dbp->get_pagesize(dbp, &pgsize)) != 0)
+ return (ret);
+
+ /*
+	 * The current cache size is in cachep.  If it's insufficient, set
+	 * the memory referenced by resizep to 1 and set cachep to the minimum
+ * size needed.
+ *
+ * Make sure our current cache is big enough. We want at least
+ * DB_MINPAGECACHE pages in the cache.
+ */
+ if ((*cachep / pgsize) < DB_MINPAGECACHE) {
+ *resizep = 1;
+ *cachep = pgsize * DB_MINPAGECACHE;
+ } else
+ *resizep = 0;
+
+ return (0);
+}
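+
+/*
+ * Worked example (not part of the library source): with an 8KB page
+ * size, a caller passing in *cachep = 64KB has only 8 pages of cache;
+ * if DB_MINPAGECACHE is larger than that, the routine sets *resizep to
+ * 1 and bumps *cachep to DB_MINPAGECACHE 8KB pages.
+ */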
diff --git a/src/common/util_log.c b/src/common/util_log.c
new file mode 100644
index 00000000..d158d3f0
--- /dev/null
+++ b/src/common/util_log.c
@@ -0,0 +1,45 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2000, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __db_util_logset --
+ * Log that we're running.
+ *
+ * PUBLIC: int __db_util_logset __P((const char *, char *));
+ */
+int
+__db_util_logset(progname, fname)
+ const char *progname;
+ char *fname;
+{
+ pid_t pid;
+ FILE *fp;
+ time_t now;
+ char time_buf[CTIME_BUFLEN];
+
+ if ((fp = fopen(fname, "w")) == NULL)
+ goto err;
+
+ (void)time(&now);
+
+ __os_id(NULL, &pid, NULL);
+ fprintf(fp,
+ "%s: %lu %s", progname, (u_long)pid, __os_ctime(&now, time_buf));
+
+ if (fclose(fp) == EOF)
+ goto err;
+
+ return (0);
+
+err: fprintf(stderr, "%s: %s: %s\n", progname, fname, strerror(errno));
+ return (1);
+}
diff --git a/src/common/util_sig.c b/src/common/util_sig.c
new file mode 100644
index 00000000..02a0fcb2
--- /dev/null
+++ b/src/common/util_sig.c
@@ -0,0 +1,110 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2000, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+static int interrupt;
+static void set_signal __P((int, int));
+static void signal_handler __P((int));
+
+/*
+ * signal_handler --
+ * Interrupt signal handler.
+ */
+static void
+signal_handler(signo)
+ int signo;
+{
+#ifndef HAVE_SIGACTION
+ /* Assume signal() is unreliable and reset it, first thing. */
+ set_signal(signo, 0);
+#endif
+ /* Some systems don't pass in the correct signal value -- check. */
+ if ((interrupt = signo) == 0)
+ interrupt = SIGINT;
+}
+
+/*
+ * set_signal
+ */
+static void
+set_signal(s, is_dflt)
+ int s, is_dflt;
+{
+ /*
+ * Use sigaction if it's available, otherwise use signal().
+ */
+#ifdef HAVE_SIGACTION
+ struct sigaction sa, osa;
+
+ sa.sa_handler = is_dflt ? SIG_DFL : signal_handler;
+ (void)sigemptyset(&sa.sa_mask);
+ sa.sa_flags = 0;
+ (void)sigaction(s, &sa, &osa);
+#else
+ (void)signal(s, is_dflt ? SIG_DFL : signal_handler);
+#endif
+}
+
+/*
+ * __db_util_siginit --
+ *
+ * PUBLIC: void __db_util_siginit __P((void));
+ */
+void
+__db_util_siginit()
+{
+ /*
+ * Initialize the set of signals for which we want to clean up.
+ * Generally, we try not to leave the shared regions locked if
+ * we can.
+ */
+#ifdef SIGHUP
+ set_signal(SIGHUP, 0);
+#endif
+#ifdef SIGINT
+ set_signal(SIGINT, 0);
+#endif
+#ifdef SIGPIPE
+ set_signal(SIGPIPE, 0);
+#endif
+#ifdef SIGTERM
+ set_signal(SIGTERM, 0);
+#endif
+}
+
+/*
+ * __db_util_interrupted --
+ * Return if interrupted.
+ *
+ * PUBLIC: int __db_util_interrupted __P((void));
+ */
+int
+__db_util_interrupted()
+{
+ return (interrupt != 0);
+}
+
+/*
+ * __db_util_sigresend --
+ *
+ * PUBLIC: void __db_util_sigresend __P((void));
+ */
+void
+__db_util_sigresend()
+{
+ /* Resend any caught signal. */
+ if (interrupt != 0) {
+ set_signal(interrupt, 1);
+
+ (void)raise(interrupt);
+ /* NOTREACHED */
+ }
+}
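+
+/*
+ * Illustrative sketch (not part of the library source): the utilities
+ * bracket their main work with these three calls so that a caught
+ * signal is re-delivered with default disposition only after cleanup.
+ * The loop helpers are hypothetical:
+ *
+ *	__db_util_siginit();
+ *	while (have_work()) {
+ *		if (__db_util_interrupted())
+ *			break;
+ *		do_one_unit();
+ *	}
+ *	cleanup();			-- e.g., release shared regions
+ *	__db_util_sigresend();		-- re-raise with SIG_DFL
+ */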
diff --git a/src/common/zerofill.c b/src/common/zerofill.c
new file mode 100644
index 00000000..37662ddc
--- /dev/null
+++ b/src/common/zerofill.c
@@ -0,0 +1,129 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __db_zero_fill --
+ * Zero out bytes in the file.
+ *
+ * On some systems, pages allocated by writing past end-of-file are not
+ * zeroed.  Recovery could theoretically be fooled by a page showing up
+ * that contained garbage, so we have to write the pages out to disk and
+ * flush them.  The flush matters because, without a sync, the allocation
+ * of a later page might reach the disk first; if we crashed at the wrong
+ * moment, this page -- implicitly allocated by the write of the page
+ * beyond it -- would be left containing garbage.
+ *
+ * PUBLIC: int __db_zero_fill __P((ENV *, DB_FH *));
+ */
+int
+__db_zero_fill(env, fhp)
+ ENV *env;
+ DB_FH *fhp;
+{
+#ifdef HAVE_FILESYSTEM_NOTZERO
+ off_t stat_offset, write_offset;
+ size_t blen, nw;
+ u_int32_t bytes, mbytes;
+ int group_sync, ret;
+ u_int8_t *bp;
+
+ /* Calculate the byte offset of the next write. */
+ write_offset = (off_t)fhp->pgno * fhp->pgsize + fhp->offset;
+
+ /* Stat the file. */
+ if ((ret = __os_ioinfo(env, NULL, fhp, &mbytes, &bytes, NULL)) != 0)
+ return (ret);
+ stat_offset = (off_t)mbytes * MEGABYTE + bytes;
+
+ /* Check if the file is large enough. */
+ if (stat_offset >= write_offset)
+ return (0);
+
+ /* Get a large buffer if we're writing lots of data. */
+#undef ZF_LARGE_WRITE
+#define ZF_LARGE_WRITE (64 * 1024)
+ if ((ret = __os_calloc(env, 1, ZF_LARGE_WRITE, &bp)) != 0)
+ return (ret);
+ blen = ZF_LARGE_WRITE;
+
+ /* Seek to the current end of the file. */
+ if ((ret = __os_seek(env, fhp, mbytes, MEGABYTE, bytes)) != 0)
+ goto err;
+
+ /*
+ * Hash is the only access method that allocates groups of pages. Hash
+ * uses the existence of the last page in a group to signify the entire
+ * group is OK; so, write all the pages but the last one in the group,
+ * flush them to disk, then write the last one to disk and flush it.
+ */
+ for (group_sync = 0; stat_offset < write_offset; group_sync = 1) {
+ if (write_offset - stat_offset <= (off_t)blen) {
+ blen = (size_t)(write_offset - stat_offset);
+ if (group_sync && (ret = __os_fsync(env, fhp)) != 0)
+ goto err;
+ }
+ if ((ret = __os_physwrite(env, fhp, bp, blen, &nw)) != 0)
+ goto err;
+ stat_offset += blen;
+ }
+ if ((ret = __os_fsync(env, fhp)) != 0)
+ goto err;
+
+ /* Seek back to where we started. */
+ mbytes = (u_int32_t)(write_offset / MEGABYTE);
+ bytes = (u_int32_t)(write_offset % MEGABYTE);
+ ret = __os_seek(env, fhp, mbytes, MEGABYTE, bytes);
+
+err: __os_free(env, bp);
+ return (ret);
+#else
+ COMPQUIET(env, NULL);
+ COMPQUIET(fhp, NULL);
+ return (0);
+#endif /* HAVE_FILESYSTEM_NOTZERO */
+}
+
+/*
+ * __db_zero_extend --
+ * Zero to the end of the file.
+ *
+ * PUBLIC: int __db_zero_extend __P((ENV *,
+ * PUBLIC: DB_FH *, db_pgno_t, db_pgno_t, u_int32_t));
+ */
+int
+__db_zero_extend(env, fhp, pgno, last_pgno, pgsize)
+ ENV *env;
+ DB_FH *fhp;
+ db_pgno_t pgno, last_pgno;
+ u_int32_t pgsize;
+{
+ int ret;
+ size_t nwrote;
+ u_int8_t *buf;
+
+	/* __os_calloc returns zeroed memory; no separate memset is needed. */
+	if ((ret = __os_calloc(env, 1, pgsize, &buf)) != 0)
+		return (ret);
+	for (; pgno <= last_pgno; pgno++) {
+		if ((ret = __os_io(env, DB_IO_WRITE,
+		    fhp, pgno, pgsize, 0, pgsize, buf, &nwrote)) != 0)
+			goto err;
+		if (nwrote != pgsize) {
+			ret = EIO;	/* A short write: fail with EIO. */
+			goto err;
+		}
+	}
+
+err: __os_free(env, buf);
+ return (ret);
+}
diff --git a/src/crypto/aes_method.c b/src/crypto/aes_method.c
new file mode 100644
index 00000000..47193539
--- /dev/null
+++ b/src/crypto/aes_method.c
@@ -0,0 +1,357 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * Some parts of this code originally written by Adam Stubblefield,
+ * -- astubble@rice.edu.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/hmac.h"
+
+#ifdef HAVE_CRYPTO_IPP
+#include <ippcp.h>
+#endif
+
+static void __aes_err __P((ENV *, int));
+static int __aes_derivekeys __P((ENV *, DB_CIPHER *, u_int8_t *, size_t));
+
+/*
+ * __aes_setup --
+ * Setup AES functions.
+ *
+ * PUBLIC: int __aes_setup __P((ENV *, DB_CIPHER *));
+ */
+int
+__aes_setup(env, db_cipher)
+ ENV *env;
+ DB_CIPHER *db_cipher;
+{
+ AES_CIPHER *aes_cipher;
+ int ret;
+#ifdef HAVE_CRYPTO_IPP
+ int ctx_size = 0;
+ IppStatus ipp_ret;
+#endif
+
+ db_cipher->adj_size = __aes_adj_size;
+ db_cipher->close = __aes_close;
+ db_cipher->decrypt = __aes_decrypt;
+ db_cipher->encrypt = __aes_encrypt;
+ db_cipher->init = __aes_init;
+ if ((ret = __os_calloc(env, 1, sizeof(AES_CIPHER), &aes_cipher)) != 0)
+ return (ret);
+#ifdef HAVE_CRYPTO_IPP
+ /*
+ * IPP AES encryption context size can only be obtained through this
+ * function call, cannot directly declare IppsRijndael128Spec within
+ * AES_CIPHER struct.
+ */
+ if ((ipp_ret = ippsRijndael128GetSize(&ctx_size)) != ippStsNoErr) {
+ __aes_err(env, (int)ipp_ret);
+ return (EAGAIN);
+ }
+ if ((ret = __os_malloc(env, ctx_size, &aes_cipher->ipp_ctx)) != 0) {
+ __os_free(env, aes_cipher);
+ return (ret);
+ }
+#endif
+ db_cipher->data = aes_cipher;
+ return (0);
+}
+
+/*
+ * __aes_adj_size --
+ * Given a size, return an addition amount needed to meet the
+ * "chunk" needs of the algorithm.
+ *
+ * PUBLIC: u_int __aes_adj_size __P((size_t));
+ */
+u_int
+__aes_adj_size(len)
+ size_t len;
+{
+ if (len % DB_AES_CHUNK == 0)
+ return (0);
+ return (DB_AES_CHUNK - (u_int)(len % DB_AES_CHUNK));
+}
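+
+/*
+ * Worked example (not part of the library source): with DB_AES_CHUNK
+ * being the 16-byte AES block size, __aes_adj_size(100) returns 12,
+ * padding 100 bytes up to the 112-byte block boundary, and
+ * __aes_adj_size(128) returns 0.
+ */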
+
+/*
+ * __aes_close --
+ * Destroy the AES encryption instantiation.
+ *
+ * PUBLIC: int __aes_close __P((ENV *, void *));
+ */
+int
+__aes_close(env, data)
+ ENV *env;
+ void *data;
+{
+#ifdef HAVE_CRYPTO_IPP
+ AES_CIPHER *aes_cipher = (AES_CIPHER *)data;
+ __os_free(env, aes_cipher->ipp_ctx);
+#endif
+ __os_free(env, data);
+ return (0);
+}
+
+/*
+ * __aes_decrypt --
+ * Decrypt data with AES.
+ *
+ * PUBLIC: int __aes_decrypt __P((ENV *, void *, void *,
+ * PUBLIC: u_int8_t *, size_t));
+ */
+int
+__aes_decrypt(env, aes_data, iv, cipher, cipher_len)
+ ENV *env;
+ void *aes_data;
+ void *iv;
+ u_int8_t *cipher;
+ size_t cipher_len;
+{
+ AES_CIPHER *aes;
+#ifdef HAVE_CRYPTO_IPP
+ IppStatus ipp_ret;
+#else
+ cipherInstance c;
+#endif
+ int ret;
+
+ aes = (AES_CIPHER *)aes_data;
+ if (iv == NULL || cipher == NULL)
+ return (EINVAL);
+ if ((cipher_len % DB_AES_CHUNK) != 0)
+ return (EINVAL);
+
+#ifdef HAVE_CRYPTO_IPP
+ if ((ipp_ret = ippsRijndael128DecryptCBC((const Ipp8u *)cipher,
+ (Ipp8u *)cipher, cipher_len, (IppsRijndael128Spec *)aes->ipp_ctx,
+ (const Ipp8u *)iv, 0)) != ippStsNoErr) {
+ __aes_err(env, (int)ipp_ret);
+ return (EAGAIN);
+ }
+#else
+ /*
+ * Initialize the cipher
+ */
+ if ((ret = __db_cipherInit(&c, MODE_CBC, iv)) < 0) {
+ __aes_err(env, ret);
+ return (EAGAIN);
+ }
+
+ /* Do the decryption */
+ if ((ret = __db_blockDecrypt(&c, &aes->decrypt_ki, cipher,
+ cipher_len * 8, cipher)) < 0) {
+ __aes_err(env, ret);
+ return (EAGAIN);
+ }
+#endif
+ return (0);
+}
+
+/*
+ * __aes_encrypt --
+ * Encrypt data with AES.
+ *
+ * PUBLIC: int __aes_encrypt __P((ENV *, void *, void *,
+ * PUBLIC: u_int8_t *, size_t));
+ */
+int
+__aes_encrypt(env, aes_data, iv, data, data_len)
+ ENV *env;
+ void *aes_data;
+ void *iv;
+ u_int8_t *data;
+ size_t data_len;
+{
+ AES_CIPHER *aes;
+#ifdef HAVE_CRYPTO_IPP
+ IppStatus ipp_ret;
+#else
+ cipherInstance c;
+#endif
+ u_int32_t tmp_iv[DB_IV_BYTES/4];
+ int ret;
+
+ aes = (AES_CIPHER *)aes_data;
+ if (aes == NULL || data == NULL)
+ return (EINVAL);
+ if ((data_len % DB_AES_CHUNK) != 0)
+ return (EINVAL);
+ /*
+	 * Generate the IV here.  We build it in a temporary buffer because
+	 * the IV may be stored within the data we are encrypting, so we
+	 * copy it to the given location only after encryption is done.
+	 * We generate it inside this routine, rather than in the caller,
+	 * because encryption algorithms someone might add later may not
+	 * use IVs, and we always want one here.
+ */
+ if ((ret = __db_generate_iv(env, tmp_iv)) != 0)
+ return (ret);
+
+#ifdef HAVE_CRYPTO_IPP
+ if ((ipp_ret = ippsRijndael128EncryptCBC((const Ipp8u *)data,
+ (Ipp8u *)data, data_len, (IppsRijndael128Spec *)aes->ipp_ctx,
+ (const Ipp8u *)tmp_iv, 0)) != ippStsNoErr) {
+ __aes_err(env, (int)ipp_ret);
+ return (EAGAIN);
+ }
+#else
+ /*
+ * Initialize the cipher
+ */
+ if ((ret = __db_cipherInit(&c, MODE_CBC, (char *)tmp_iv)) < 0) {
+ __aes_err(env, ret);
+ return (EAGAIN);
+ }
+
+ /* Do the encryption */
+ if ((ret = __db_blockEncrypt(&c, &aes->encrypt_ki, data, data_len * 8,
+ data)) < 0) {
+ __aes_err(env, ret);
+ return (EAGAIN);
+ }
+#endif
+ memcpy(iv, tmp_iv, DB_IV_BYTES);
+ return (0);
+}
+
+/*
+ * __aes_init --
+ * Initialize the AES encryption instantiation.
+ *
+ * PUBLIC: int __aes_init __P((ENV *, DB_CIPHER *));
+ */
+int
+__aes_init(env, db_cipher)
+ ENV *env;
+ DB_CIPHER *db_cipher;
+{
+ DB_ENV *dbenv;
+
+ dbenv = env->dbenv;
+
+ return (__aes_derivekeys(
+ env, db_cipher, (u_int8_t *)dbenv->passwd, dbenv->passwd_len));
+}
+
+static int
+__aes_derivekeys(env, db_cipher, passwd, plen)
+ ENV *env;
+ DB_CIPHER *db_cipher;
+ u_int8_t *passwd;
+ size_t plen;
+{
+ AES_CIPHER *aes;
+ SHA1_CTX ctx;
+#ifdef HAVE_CRYPTO_IPP
+ IppStatus ipp_ret;
+#else
+ int ret;
+#endif
+ u_int32_t temp[DB_MAC_KEY/4];
+
+ if (passwd == NULL)
+ return (EINVAL);
+
+ aes = (AES_CIPHER *)db_cipher->data;
+
+ /* Derive the crypto keys */
+ __db_SHA1Init(&ctx);
+ __db_SHA1Update(&ctx, passwd, plen);
+ __db_SHA1Update(&ctx, (u_int8_t *)DB_ENC_MAGIC, strlen(DB_ENC_MAGIC));
+ __db_SHA1Update(&ctx, passwd, plen);
+ __db_SHA1Final((u_int8_t *)temp, &ctx);
+
+#ifdef HAVE_CRYPTO_IPP
+ if ((ipp_ret = ippsRijndael128Init((const Ipp8u *)temp,
+ IppsRijndaelKey128, (IppsRijndael128Spec *)aes->ipp_ctx))
+ != ippStsNoErr) {
+ __aes_err(env, (int)ipp_ret);
+ return (EAGAIN);
+ }
+#else
+ if ((ret = __db_makeKey(&aes->encrypt_ki, DIR_ENCRYPT,
+ DB_AES_KEYLEN, (char *)temp)) != TRUE) {
+ __aes_err(env, ret);
+ return (EAGAIN);
+ }
+ if ((ret = __db_makeKey(&aes->decrypt_ki, DIR_DECRYPT,
+ DB_AES_KEYLEN, (char *)temp)) != TRUE) {
+ __aes_err(env, ret);
+ return (EAGAIN);
+ }
+#endif
+ return (0);
+}
+
+/*
+ * __aes_err --
+ * Handle AES-specific errors. Codes and messages derived from
+ * rijndael/rijndael-api-fst.h.
+ */
+static void
+__aes_err(env, err)
+ ENV *env;
+ int err;
+{
+ char *errstr;
+
+ switch (err) {
+#ifdef HAVE_CRYPTO_IPP
+ case ippStsNullPtrErr:
+ errstr = DB_STR("0182", "IPP AES NULL pointer error");
+ break;
+ case ippStsLengthErr:
+ errstr = DB_STR("0183", "IPP AES length error");
+ break;
+ case ippStsContextMatchErr:
+ errstr = DB_STR("0184",
+ "IPP AES context does not match operation");
+ break;
+ case ippStsUnderRunErr:
+ errstr = DB_STR("0185", "IPP AES srclen size error");
+ break;
+#else
+ case BAD_KEY_DIR:
+ errstr = DB_STR("0186", "AES key direction is invalid");
+ break;
+ case BAD_KEY_MAT:
+ errstr = DB_STR("0187",
+ "AES key material not of correct length");
+ break;
+ case BAD_KEY_INSTANCE:
+ errstr = DB_STR("0188", "AES key passwd not valid");
+ break;
+ case BAD_CIPHER_MODE:
+ errstr = DB_STR("0189",
+ "AES cipher in wrong state (not initialized)");
+ break;
+ case BAD_BLOCK_LENGTH:
+ errstr = DB_STR("0190", "AES bad block length");
+ break;
+ case BAD_CIPHER_INSTANCE:
+ errstr = DB_STR("0191", "AES cipher instance is invalid");
+ break;
+ case BAD_DATA:
+ errstr = DB_STR("0192", "AES data contents are invalid");
+ break;
+ case BAD_OTHER:
+ errstr = DB_STR("0193", "AES unknown error");
+ break;
+#endif
+ default:
+ errstr = DB_STR("0194", "AES error unrecognized");
+ break;
+ }
+ __db_errx(env, "%s", errstr);
+ return;
+}
diff --git a/src/crypto/crypto.c b/src/crypto/crypto.c
new file mode 100644
index 00000000..b731496f
--- /dev/null
+++ b/src/crypto/crypto.c
@@ -0,0 +1,411 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * Some parts of this code originally written by Adam Stubblefield
+ * -- astubble@rice.edu
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/crypto.h"
+
+/*
+ * __crypto_region_init --
+ * Initialize crypto.
+ */
+int
+__crypto_region_init(env)
+ ENV *env;
+{
+ CIPHER *cipher;
+ DB_CIPHER *db_cipher;
+ DB_ENV *dbenv;
+ REGENV *renv;
+ REGINFO *infop;
+ char *sh_passwd;
+ int ret;
+
+ dbenv = env->dbenv;
+ infop = env->reginfo;
+ renv = infop->primary;
+ db_cipher = env->crypto_handle;
+ ret = 0;
+
+ if (renv->cipher_off == INVALID_ROFF) {
+ if (!CRYPTO_ON(env))
+ return (0);
+ if (!F_ISSET(infop, REGION_CREATE)) {
+ __db_errx(env, DB_STR("0172",
+ "Joining non-encrypted environment with encryption key"));
+ return (EINVAL);
+ }
+ if (F_ISSET(db_cipher, CIPHER_ANY)) {
+ __db_errx(env, DB_STR("0173",
+ "Encryption algorithm not supplied"));
+ return (EINVAL);
+ }
+ /*
+ * Must create the shared information. We need: Shared cipher
+ * information that contains the passwd. After we copy the
+ * passwd, we smash and free the one in the env.
+ */
+ MUTEX_LOCK(env, renv->mtx_regenv);
+ if ((ret = __env_alloc(infop, sizeof(CIPHER), &cipher)) != 0) {
+ MUTEX_UNLOCK(env, renv->mtx_regenv);
+ return (ret);
+ }
+ memset(cipher, 0, sizeof(*cipher));
+ if ((ret =
+ __env_alloc(infop, dbenv->passwd_len, &sh_passwd)) != 0) {
+ __env_alloc_free(infop, cipher);
+ MUTEX_UNLOCK(env, renv->mtx_regenv);
+ return (ret);
+ }
+ MUTEX_UNLOCK(env, renv->mtx_regenv);
+ memset(sh_passwd, 0, dbenv->passwd_len);
+ cipher->passwd = R_OFFSET(infop, sh_passwd);
+ cipher->passwd_len = dbenv->passwd_len;
+ cipher->flags = db_cipher->alg;
+ memcpy(sh_passwd, dbenv->passwd, cipher->passwd_len);
+ renv->cipher_off = R_OFFSET(infop, cipher);
+ } else {
+ if (!CRYPTO_ON(env)) {
+ __db_errx(env, DB_STR("0174",
+ "Encrypted environment: no encryption key supplied"));
+ return (EINVAL);
+ }
+ cipher = R_ADDR(infop, renv->cipher_off);
+ sh_passwd = R_ADDR(infop, cipher->passwd);
+ if ((cipher->passwd_len != dbenv->passwd_len) ||
+ memcmp(dbenv->passwd, sh_passwd, cipher->passwd_len) != 0) {
+ __db_errx(env, DB_STR("0175", "Invalid password"));
+ return (EPERM);
+ }
+ if (!F_ISSET(db_cipher, CIPHER_ANY) &&
+ db_cipher->alg != cipher->flags) {
+ __db_errx(env, DB_STR("0176",
+ "Environment encrypted using a different algorithm"));
+ return (EINVAL);
+ }
+ if (F_ISSET(db_cipher, CIPHER_ANY))
+ /*
+ * We have CIPHER_ANY and we are joining the existing
+ * env. Setup our cipher structure for whatever
+ * algorithm this env has.
+ */
+ if ((ret = __crypto_algsetup(env, db_cipher,
+ cipher->flags, 0)) != 0)
+ return (ret);
+ }
+ ret = db_cipher->init(env, db_cipher);
+
+ /*
+ * On success, no matter if we allocated it or are using the already
+ * existing one, we are done with the passwd in the env. We smash
+ * N-1 bytes so that we don't overwrite the nul.
+ */
+ memset(dbenv->passwd, 0xff, dbenv->passwd_len-1);
+ __os_free(env, dbenv->passwd);
+ dbenv->passwd = NULL;
+ dbenv->passwd_len = 0;
+
+ return (ret);
+}
+
+/*
+ * __crypto_env_close --
+ * Crypto-specific destruction of ENV structure.
+ *
+ * PUBLIC: int __crypto_env_close __P((ENV *));
+ */
+int
+__crypto_env_close(env)
+ ENV *env;
+{
+ DB_CIPHER *db_cipher;
+ DB_ENV *dbenv;
+ int ret;
+
+ dbenv = env->dbenv;
+
+ if (dbenv->passwd != NULL) {
+ memset(dbenv->passwd, 0xff, dbenv->passwd_len-1);
+ __os_free(env, dbenv->passwd);
+ dbenv->passwd = NULL;
+ }
+
+ if (!CRYPTO_ON(env))
+ return (0);
+
+ ret = 0;
+ db_cipher = env->crypto_handle;
+ if (!F_ISSET(db_cipher, CIPHER_ANY))
+ ret = db_cipher->close(env, db_cipher->data);
+ __os_free(env, db_cipher);
+
+ env->crypto_handle = NULL;
+ return (ret);
+}
+
+/*
+ * __crypto_env_refresh --
+ *	Clean up after the crypto system on a close or failed open.
+ *
+ * PUBLIC: int __crypto_env_refresh __P((ENV *));
+ */
+int
+__crypto_env_refresh(env)
+ ENV *env;
+{
+ CIPHER *cipher;
+ REGENV *renv;
+ REGINFO *infop;
+
+ /*
+	 * If a private region, return the memory to the heap.  This is not
+	 * needed for filesystem-backed or system shared memory regions; that
+	 * memory isn't owned by any particular process.
+ */
+ if (F_ISSET(env, ENV_PRIVATE)) {
+ infop = env->reginfo;
+ renv = infop->primary;
+ if (renv->cipher_off != INVALID_ROFF) {
+ cipher = R_ADDR(infop, renv->cipher_off);
+ MUTEX_LOCK(env, renv->mtx_regenv);
+ __env_alloc_free(infop, R_ADDR(infop, cipher->passwd));
+ __env_alloc_free(infop, cipher);
+ MUTEX_UNLOCK(env, renv->mtx_regenv);
+ }
+ }
+ return (0);
+}
+
+/*
+ * __crypto_algsetup --
+ * Given a db_cipher structure and a valid algorithm flag, call
+ * the specific algorithm setup function.
+ *
+ * PUBLIC: int __crypto_algsetup __P((ENV *, DB_CIPHER *, u_int32_t, int));
+ */
+int
+__crypto_algsetup(env, db_cipher, alg, do_init)
+ ENV *env;
+ DB_CIPHER *db_cipher;
+ u_int32_t alg;
+ int do_init;
+{
+ int ret;
+
+ ret = 0;
+ if (!CRYPTO_ON(env)) {
+ __db_errx(env, DB_STR("0177",
+ "No cipher structure given"));
+ return (EINVAL);
+ }
+ F_CLR(db_cipher, CIPHER_ANY);
+ switch (alg) {
+ case CIPHER_AES:
+ db_cipher->alg = CIPHER_AES;
+ ret = __aes_setup(env, db_cipher);
+ break;
+ default:
+ ret = __env_panic(env, EINVAL);
+ break;
+ }
+ if (ret == 0 && do_init)
+ ret = db_cipher->init(env, db_cipher);
+ return (ret);
+}
+
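+/*
+ * Illustrative sketch only: how a caller that already knows the
+ * algorithm would use the routine above.  "db_cipher" is assumed to be
+ * the handle allocated by DB_ENV->set_encrypt(); passing do_init == 1
+ * both installs the AES functions and derives the keys.
+ *
+ *	if ((ret = __crypto_algsetup(env, db_cipher, CIPHER_AES, 1)) != 0)
+ *		return (ret);
+ */
+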
+/*
+ * __crypto_decrypt_meta --
+ * Perform decryption on a metapage if needed.
+ *
+ * PUBLIC: int __crypto_decrypt_meta __P((ENV *, DB *, u_int8_t *, int));
+ */
+int
+__crypto_decrypt_meta(env, dbp, mbuf, do_metachk)
+ ENV *env;
+ DB *dbp;
+ u_int8_t *mbuf;
+ int do_metachk;
+{
+ DB dummydb;
+ DBMETA *meta;
+ DB_CIPHER *db_cipher;
+ size_t pg_off;
+ int ret;
+ u_int8_t *iv;
+
+ /*
+ * If we weren't given a dbp, we just want to decrypt the page on
+ * behalf of some internal subsystem, not on behalf of a user with
+ * a dbp. Therefore, set up a dummy dbp so that the call to
+ * P_OVERHEAD below works.
+ */
+ if (dbp == NULL) {
+ memset(&dummydb, 0, sizeof(DB));
+ dbp = &dummydb;
+ }
+
+ ret = 0;
+ meta = (DBMETA *)mbuf;
+
+ /*
+ * !!!
+ * We used an "unused" field in the meta-data page to flag whether or
+ * not the database is encrypted. Unfortunately, that unused field
+ * was used in Berkeley DB releases before 3.0 (for example, 2.7.7).
+	 * It would have been OK, except that encryption doesn't follow the
+	 * usual rule of "upgrade before doing anything else": we check
+	 * encryption before checking for old versions of the database.
+ *
+ * We don't have to check Btree databases -- before 3.0, the field of
+ * interest was the bt_maxkey field (which was never supported and has
+ * since been removed).
+ *
+ * Ugly check to jump out if this format is older than what we support.
+ * This works because we do not encrypt the page header.
+ */
+ if (meta->magic == DB_HASHMAGIC && meta->version <= 5)
+ return (0);
+
+ /*
+ * Meta-pages may be encrypted for DBMETASIZE bytes. If we have a
+ * non-zero IV (that is written after encryption) then we decrypt (or
+ * error if the user isn't set up for security). We guarantee that
+ * the IV space on non-encrypted pages will be zero and a zero-IV is
+ * illegal for encryption. Therefore any non-zero IV means an
+ * encrypted database. This basically checks the passwd on the file
+ * if we cannot find a good magic number. We walk through all the
+ * algorithms we know about attempting to decrypt (and possibly
+ * byteswap).
+ *
+ * !!!
+	 * All access-method meta pages have the IV and checksum at the
+	 * exact same location; since those fields are not in DBMETA, we
+	 * use BTMETA to address them.
+ */
+ if (meta->encrypt_alg != 0) {
+ db_cipher = env->crypto_handle;
+ if (!F_ISSET(dbp, DB_AM_ENCRYPT)) {
+ if (!CRYPTO_ON(env)) {
+ __db_errx(env, DB_STR("0178",
+ "Encrypted database: no encryption flag specified"));
+ return (EINVAL);
+ }
+ /*
+			 * The user has a correct, secure env and has
+			 * encountered a secure database in that env, but
+			 * didn't call dbp->set_flags.  Since the database
+			 * already exists, use encryption if it is already
+			 * set up that way.
+ */
+ F_SET(dbp, DB_AM_ENCRYPT|DB_AM_CHKSUM);
+ }
+ /*
+ * This was checked in set_flags when DB_AM_ENCRYPT was set.
+ * So it better still be true here.
+ */
+ DB_ASSERT(env, CRYPTO_ON(env));
+ if (!F_ISSET(db_cipher, CIPHER_ANY) &&
+ meta->encrypt_alg != db_cipher->alg) {
+ __db_errx(env, DB_STR("0179",
+ "Database encrypted using a different algorithm"));
+ return (EINVAL);
+ }
+ DB_ASSERT(env, F_ISSET(dbp, DB_AM_CHKSUM));
+ iv = ((BTMETA *)mbuf)->iv;
+ /*
+ * For ALL pages, we do not encrypt the beginning of the page
+ * that contains overhead information. This is true of meta
+ * and all other pages.
+ */
+ pg_off = P_OVERHEAD(dbp);
+alg_retry:
+ /*
+ * If they asked for a specific algorithm, then
+ * use it. Otherwise walk through those we know.
+ */
+ if (!F_ISSET(db_cipher, CIPHER_ANY)) {
+ if (do_metachk && (ret = db_cipher->decrypt(env,
+ db_cipher->data, iv, mbuf + pg_off,
+ DBMETASIZE - pg_off)))
+ return (ret);
+ if (((BTMETA *)meta)->crypto_magic !=
+ meta->magic) {
+ __db_errx(env, DB_STR("0180",
+ "Invalid password"));
+ return (EINVAL);
+ }
+ /*
+ * Success here. The algorithm asked for and the one
+ * on the file match. We've just decrypted the meta
+ * page and checked the magic numbers. They match,
+ * indicating the password is right. All is right
+ * with the world.
+ */
+ return (0);
+ }
+ /*
+ * If we get here, CIPHER_ANY must be set.
+ */
+		if ((ret = __crypto_algsetup(env,
+		    db_cipher, meta->encrypt_alg, 1)) != 0)
+			return (ret);
+		goto alg_retry;
+ } else if (F_ISSET(dbp, DB_AM_ENCRYPT)) {
+ /*
+ * They gave us a passwd, but the database is not encrypted.
+ * This is an error. We do NOT want to silently allow them
+ * to write data in the clear when the user set up and expects
+ * encrypted data.
+ *
+ * This covers at least the following scenario.
+ * 1. User creates and sets up an encrypted database.
+ * 2. Attacker cannot read the actual data in the database
+ * because it is encrypted, but can remove/replace the file
+ * with an empty, unencrypted database file.
+ * 3. User sets encryption and we get to this code now.
+ * If we allowed the file to be used in the clear since
+ * it is that way on disk, the user would unsuspectingly
+ * write sensitive data in the clear.
+ * 4. Attacker reads data that user thought was encrypted.
+ *
+ * Therefore, asking for encryption with a database that
+ * was not encrypted is an error.
+ */
+ __db_errx(env, DB_STR("0181",
+ "Unencrypted database with a supplied encryption key"));
+ return (EINVAL);
+ }
+ return (ret);
+}
+
+/*
+ * __crypto_set_passwd --
+ * Get the password from the shared region; and set it in a new
+ * environment handle. Use this to duplicate environment handles.
+ *
+ * PUBLIC: int __crypto_set_passwd __P((ENV *, ENV *));
+ */
+int
+__crypto_set_passwd(env_src, env_dest)
+ ENV *env_src, *env_dest;
+{
+ CIPHER *cipher;
+ REGENV *renv;
+ REGINFO *infop;
+ char *sh_passwd;
+
+ infop = env_src->reginfo;
+ renv = infop->primary;
+
+ DB_ASSERT(env_src, CRYPTO_ON(env_src));
+
+ cipher = R_ADDR(infop, renv->cipher_off);
+ sh_passwd = R_ADDR(infop, cipher->passwd);
+ return (__env_set_encrypt(env_dest->dbenv, sh_passwd, DB_ENCRYPT_AES));
+}
diff --git a/src/crypto/crypto.html b/src/crypto/crypto.html
new file mode 100644
index 00000000..1a2dc0c1
--- /dev/null
+++ b/src/crypto/crypto.html
@@ -0,0 +1,638 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<html>
+<head>
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
+ <meta name="GENERATOR" content="Mozilla/4.76 [en] (X11; U; FreeBSD 4.3-RELEASE i386) [Netscape]">
+</head>
+<body>
+
+<center>
+<h1>
+&nbsp;Security Interface for Berkeley DB</h1></center>
+
+<center><i>Susan LoVerso</i>
+<br><i>Rev 1.6</i>
+<br><i>2002 Feb 26</i></center>
+
+<p>We provide an interface allowing secure access to Berkeley DB.&nbsp;&nbsp;
+Our goal is to allow users to have encrypted secure databases.&nbsp; In
+this document, the term <i>ciphering</i> means the act of encryption or
+decryption.&nbsp; They are equal but opposite actions, and the same issues
+apply to both, just in the opposite direction.
+<h3>
+Requirements</h3>
+The overriding requirement is to provide a simple mechanism to allow users
+to have a secure database.&nbsp; A secure database means that all of the
+pages of a database will be encrypted, and all of the log files will be
+encrypted.
+<p>Falling out from this work will be a simple mechanism to allow users
+to request that we checksum their data for additional error detection (without
+encryption/decryption).
+<p>We expect that data in process memory or stored in shared memory, potentially
+backed by disk, is not encrypted or secure.
+<h2>
+<a NAME="DB Modifications"></a>DB Method Interface Modifications</h2>
+With a logging environment, all database changes are recorded in the log
+files.&nbsp; Therefore, users requiring secure databases in such environments
+also require secure log files.
+<p>A prior thought had been to allow different passwords on the environment
+and the databases within.&nbsp; However, such a scheme then requires that
+the password be logged in order for recovery to be able to restore the
+database.&nbsp; Therefore, any application having the password for the
+log could get the password for any databases by reading the log.&nbsp;
+So having a different password on a database does not gain any additional
+security and it makes certain things harder and more complex.&nbsp; Some
+of those more complex things include the need to handle database and env
+passwords differently since they'd need to be stored and accessed from
+different places.&nbsp; Also resolving the issue of how <i>db_checkpoint</i>
+or <i>db_sync</i>, which flush database pages to disk, would find the passwords
+of various databases without any dbps was unsolved.&nbsp; The feature didn't
+gain anything and caused significant pain.&nbsp; Therefore the decision
+is that there will be a single password protecting an environment and all
+the logs and some databases within that environment.&nbsp; We do allow
+users to have a secure environment and clear databases.&nbsp; Users that
+want secure databases within a secure environment must set a flag.
+<p>Users wishing to enable encryption on a database in a secure environment
+or enable just checksumming on their database pages will use new flags
+to <a href="../docs/api_c/db_set_flags.html">DB->set_flags()</a>.&nbsp;
+Providing ciphering over an entire environment is accomplished by adding
+a single environment method: <a href="../docs/api_c/env_set_encrypt.html">DBENV->set_encrypt()</a>.&nbsp;
+Providing encryption for a database (not part of an environment) is accomplished
+by adding a new database method: <a href="../docs/api_c/db_set_encrypt.html">DB->set_encrypt()</a>.
+<p>Both of the <i>set_encrypt</i> methods must be called before their respective
+<i>open</i> calls.&nbsp; The environment method must be before the environment
+open because we must know about security before there is any possibility
+of writing any log records out.&nbsp; The database method must be before
+the database open in order to read the root page.&nbsp; The planned interfaces
+for these methods are:
+<pre>DBENV->set_encrypt(DBENV *dbenv,&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; /* DB_ENV structure */
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; char *passwd&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; /* Password */
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; u_int32_t flags);&nbsp;&nbsp;&nbsp;&nbsp; /* Flags */</pre>
+
+<pre>DB->set_encrypt(DB *dbp,&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; /* DB structure */
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; char *passwd&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; /* Password */
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; u_int32_t flags);&nbsp;&nbsp;&nbsp;&nbsp; /* Flags */</pre>
+The flags accepted by these functions are:
+<pre>#define DB_ENCRYPT_AES&nbsp; 0x00000001&nbsp; /* Use the AES encryption algorithm */</pre>
+Passwords are nul-terminated strings.&nbsp; NULL or zero-length strings
+are illegal.&nbsp; These flags enable the checksumming and encryption using
+the particular algorithms we have chosen for this implementation.&nbsp;
+The flags are named such that there is a logical naming pattern if additional
+checksum or encryption algorithms are used. If a user gives a flag of zero,
+it will behave in a manner similar to DB_UNKNOWN. It will be illegal if
+they are creating the environment or database, as an algorithm must be
+specified. If they are joining an existing environment or opening an existing
+database, they will use whatever algorithm is in force at the time.&nbsp;
+Using DB_ENCRYPT_AES automatically implies SHA1 checksumming.
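+<p>For example, a minimal setup of a secure environment might look like
+the following sketch (error handling omitted; the home path and password
+are illustrative):
+<pre>DB_ENV *dbenv;
+
+db_env_create(&amp;dbenv, 0);
+dbenv->set_encrypt(dbenv, "my_passwd", DB_ENCRYPT_AES);
+dbenv->open(dbenv, "/env/home", DB_CREATE | DB_INIT_MPOOL | DB_INIT_LOG, 0);</pre>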
+<p>These functions will perform several initialization steps.&nbsp; We
+will allocate crypto_handle for our env handle and set up our function
+pointers.&nbsp; We will allocate space and copy the password into our env
+handle password area.&nbsp; Similar to <i>DB->set_cachesize</i>, calling
+<i>DB->set_encrypt</i>
+will actually reflect back into the local environment created by DB.
+<p>Lastly, we will add a new flag, DB_OVERWRITE, to the <a href="../docs/api_c/env_remove.html">DBENV->remove</a>
+method.&nbsp; The purpose of this flag is to force all of the memory used
+by the shared regions to be overwritten before removal.&nbsp; We will use
+<i>rm_overwrite</i>,
+a function that overwrites and syncs a file 3 times with varying bit patterns
+to really remove a file.&nbsp; Additionally, this flag will force a sync
+of the overwritten regions to disk, if the regions are backed by the file
+system.&nbsp; That way there is no residual information left in the clear
+in memory or freed disk blocks.&nbsp; Although we expect that this flag
+will be used primarily by customers using security, its action does not
+depend on passwords or a secure setup, and so it can be used by anyone.
+<h4>
+Initialization of the Environment</h4>
+The setup of the security subsystem will be similar to replication initialization
+since it is a sort of subsystem, but it does not have its own region.&nbsp;
+When the environment handle is created via <i>db_env_create</i>, we initialize
+our <i>set_encrypt</i> method to be the RPC or local version.&nbsp; Therefore
+the <i>DB_ENV</i> structure needs a new pointer:
+<pre>&nbsp;&nbsp;&nbsp; void&nbsp;&nbsp;&nbsp; *crypto_handle;&nbsp;&nbsp; /* Security handle */</pre>
+The crypto handle will really point to a new <i>__db_cipher</i> structure
+that will contain a set of functions and a pointer to the in-memory information
+needed by the specific encryption algorithm.&nbsp; It will look like:
+<pre>typedef struct __db_cipher {
+&nbsp;&nbsp;&nbsp; int&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; (*init)__P((...));&nbsp;&nbsp;&nbsp; /* Alg-specific initialization function */
+&nbsp;&nbsp;&nbsp; int&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; (*encrypt)__P((...)); /* Alg-specific encryption algorithm */
+&nbsp;&nbsp;&nbsp; int&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; (*decrypt)__P((...)); /* Alg-specific decryption function */
+&nbsp;&nbsp;&nbsp; void&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; *data;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; /* Pointer to alg-specific information (AES_CIPHER) */
+&nbsp;&nbsp;&nbsp; u_int32_t flags;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; /* Cipher flags */
+} DB_CIPHER;</pre>
+
+<pre>#define DB_MAC_KEY&nbsp;&nbsp;&nbsp; 20&nbsp;&nbsp;&nbsp; /* Size of the MAC key */
+typedef struct __aes_cipher {
+&nbsp;&nbsp;&nbsp; keyInstance&nbsp;&nbsp;&nbsp; encrypt_ki;&nbsp;&nbsp; /* Encrypt keyInstance temp. */
+&nbsp;&nbsp;&nbsp; keyInstance&nbsp;&nbsp;&nbsp; decrypt_ki;&nbsp;&nbsp; /* Decrypt keyInstance temp. */
+&nbsp;&nbsp;&nbsp; u_int8_t&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; mac_key[DB_MAC_KEY]; /* MAC key */
+&nbsp;&nbsp;&nbsp; u_int32_t&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; flags;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; /* AES-specific flags */
+} AES_CIPHER;</pre>
+It should be noted that none of these structures have their own mutex.&nbsp;
+We hold the environment region locked while we are creating this, but once
+this is set up, it is read-only forever.
+<p>During <a href="../docs/api_c/env_set_encrypt.html">dbenv->set_encrypt</a>,
+we set the encryption, decryption and checksumming methods to the appropriate
+functions based on the flags.&nbsp; This function will allocate us a crypto
+handle that we store in the <i>DB_ENV</i> structure just like all the
+other subsystems.&nbsp; For now, only AES ciphering functions and SHA1
+checksumming functions are supported.&nbsp; Also we will copy the password
+into the <i>DB_ENV</i> structure.&nbsp; We ultimately need to keep the
+password in the environment's shared memory region or compare this one
+against the one that is there, if we are joining an existing environment,
+but we do not have it yet because open has not yet been called.&nbsp; We
+will allocate a structure that will be used in initialization and set up
+the function pointers to point to the algorithm-specific functions.
+<p>In the&nbsp; <i>__env_open</i> path, in <i>__db_e_attach</i>, if we
+are creating the region and the <i>dbenv->passwd</i> field is set, we need
+to use the length of the password in the initial computation of the environment's
+size.&nbsp; This guarantees sufficient space for storing the password in
+shared memory.&nbsp; Then we will call a new function to initialize the
+security region, <i>__crypto_region_init</i> in <i>__env_open</i>.&nbsp;
+If we are the creator, we will allocate space in the shared region to store
+the password and copy the password into that space.&nbsp; Or, if we are
+not the creator we will compare the password stored in the dbenv with the
+one in shared memory.&nbsp;&nbsp; Additionally, we will compare the ciphering
+algorithm to the one stored in the shared region.&nbsp; If either does not
+match, we return an error; in any case we'll smash the dbenv password and
+free it.&nbsp;
+If we are the creator we store the offset into the REGENV structure.&nbsp;
+Then <i>__crypto_region_init&nbsp;</i> will call the initialization function
+set up earlier based on the ciphering algorithm specified.&nbsp; For now
+we will call <i>__aes_init</i>.&nbsp; Additionally this function will allocate
+and set up the per-process state vector for this encryption's IVs.&nbsp;
+See <a href="#Generating the Initialization Vector">Generating the Initialization
+Vector</a> for a detailed description of the IV and state vector.
+<p>In the AES-specific initialization function, <i>__aes_init</i>,&nbsp;
+we will initialize it by calling
+<i>__aes_derivekeys</i> in order to fill
+in the keyInstance and mac_key fields in that structure.&nbsp; The REGENV
+structure will have one additional item
+<pre>&nbsp;&nbsp; roff_t&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; passwd_off;&nbsp;&nbsp; /* Offset of passwd */</pre>
+
+<h4>
+Initializing a Database</h4>
+During <a href="../docs/api_c/db_set_encrypt.html">db->set_encrypt</a>,
+we set the encryption, decryption and checksumming methods to the appropriate
+functions based on the flags.&nbsp; Basically, we test that we are not
+in an existing environment and we haven't called open.&nbsp; Then we just
+call through the environment handle to set the password.
+<p>Also, we will need to add a flag in the database meta-data page that
+indicates that the database is encrypted and what its algorithm is.&nbsp;
+This will be used when the meta-page is read after reopening a file. We
+need this information on the meta-page in order to detect a user opening
+a secure database without a password.&nbsp; I propose using the first unused1
+byte (renaming it too) in the meta page for this purpose.
+<p>The first 64 bytes of every page will not be encrypted.&nbsp;
+Database meta-pages will be encrypted on the first 512 bytes only.&nbsp;
+All meta-page types will have an IV, a checksum and a crypto magic number
+added within those first 512 bytes.&nbsp; This will expand the
+size of the meta-page from 256 bytes to 512 bytes. The page in/out routines,
+<i>__db_pgin</i> and <i>__db_pgout</i>, know the page type and
+will apply the 512-byte ciphering to meta pages.&nbsp; In <i>__db_pgout</i>,
+if we have a crypto handle in our (private) environment, we will apply
+ciphering to either the entire page, or the first 512 bytes if it is a
+meta-page.&nbsp; In <i>__db_pgin</i>, we will decrypt the page if we have
+a crypto handle.
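+<p>A sketch of the page-out ciphering decision (IS_META here is an
+illustrative stand-in, not a real macro; the real code also handles the
+16-byte alignment):
+<pre>pg_off = P_OVERHEAD(dbp);          /* never cipher the page overhead */
+pg_len = IS_META(pagep) ? 512 : dbp->pgsize;
+ret = db_cipher->encrypt(env, db_cipher->data, iv,
+    (u_int8_t *)pagep + pg_off, pg_len - pg_off);</pre>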
+<p>When multiple processes share a database, all must use the same password
+as the database creator. Using an existing database requires several conditions
+to be true.&nbsp; First, if the creator of the database did not create
+with security, then opening later with security is an error.&nbsp; Second,
+if the creator did create it with security, then opening later without
+security is an error.&nbsp; Third, we need to be able to test and check
+that when another process opens a secure database, the password it
+provides is the same as the one in use by the creator.
+<p>When reading the meta-page, in <i>__db_file_setup</i>, we do not go
+through the paging functions, but directly read via <i>__os_read</i>.&nbsp;
+It is at this point that we will determine if the user is configured correctly.&nbsp;
+If the meta-page we read has an IV and checksum, they better have a crypto
+handle.&nbsp; If they have a crypto handle, then the meta-page must have
+an IV and checksum.&nbsp; If both of those are true, we test the password.&nbsp;
+We compare the unencrypted magic number to the newly-decrypted crypto magic
+number and if they are not the same, then we report that the user gave
+us a bad password.
+<p>On a mostly unrelated topic, even when we go to very large pagesizes,
+the meta information will still be within a disk sector.&nbsp; So, after
+talking it over with Keith and Margo, we determined that unencrypted meta-pages
+still will not need a checksum.
+<h3>
+Encryption and Checksum Routines</h3>
+These routines are provided to us by Adam Stubblefield at Rice University
+(astubble@rice.edu).&nbsp; The functional interfaces are:
+<pre>__aes_derivekeys(DB_ENV *dbenv,&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; /* dbenv */
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; u_int8_t *passwd,&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; /* Password */
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; size_t passwd_len,&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; /* Length of passwd */
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; u_int8_t *mac_key,&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; /* 20 byte array to store MAC key */
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; keyInstance *encrypt_key,&nbsp;&nbsp; /* Encryption key of passwd */
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; keyInstance *decrypt_key);&nbsp; /* Decryption key of passwd */</pre>
+This is the only function requiring the textual user password.&nbsp; From
+the password, this function generates a key used in the checksum function,
+<i>__db_chksum</i>.&nbsp;
+It also fills in <i>keyInstance</i> structures which are then used in the
+encryption and decryption routines.&nbsp; The keyInstance structures must
+already be allocated.&nbsp; These will be stored in the AES_CIPHER structure.
+<pre>&nbsp;__db_chksum(u_int8_t *data,&nbsp;&nbsp;&nbsp; /* Data to checksum */
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; size_t data_len,&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; /* Length of data */
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; u_int8_t *mac_key,&nbsp;&nbsp;&nbsp; /* 20 byte array from __db_derive_keys */
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; u_int8_t *checksum);&nbsp; /* 20 byte array to store checksum */</pre>
+This function generates a checksum on the data given.&nbsp; This function
+will do double-duty for users that simply want error detection on their
+pages.&nbsp; When users are using encryption, the <i>mac_key </i>will contain
+the 20-byte key set up in <i>__aes_derivekeys</i>.&nbsp; If they just want
+checksumming, then <i>mac_key</i> will be NULL.&nbsp; According to Adam,
+we can safely use the first N-bytes of the checksum.&nbsp; So for seeding
+the generator for initialization vectors, we'll hash the time and then
+send in the first 4 bytes for the seed.&nbsp; I believe we can probably
+do the same thing for checksumming log records.&nbsp; We can only use 4
+bytes for the checksum in the non-secure case.&nbsp; So when we want to
+verify the log checksum we can compute the mac but just compare the first
+4 bytes to the one we read.&nbsp; All locations where we generate or check
+log record checksums that currently call <i>__ham_func4</i> will now call
+<i>__db_chksum</i>.&nbsp;
+I believe there are 5 such locations,
+<i>__log_put, __log_putr, __log_newfile,
+__log_rep_put
+</i>and<i> __txn_force_abort.</i>
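+<p>For instance, a sketch of verifying a non-secure log record checksum,
+comparing only the first 4 bytes of the computed MAC ("hdr" and its
+cksum field are illustrative names):
+<pre>u_int8_t mac[DB_MAC_KEY];
+
+__db_chksum(data, data_len, NULL, mac);   /* NULL key: checksum only */
+if (memcmp(mac, hdr.cksum, 4) != 0)
+    return (DB_RUNRECOVERY);</pre>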
+<pre>__aes_encrypt(DB_ENV *dbenv,&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; /* dbenv */
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; keyInstance *key,&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; /* Password key instance from __db_derive_keys */
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; u_int8_t *iv,&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; /* Initialization vector */
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; u_int8_t *data,&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; /* Data to encrypt */
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; size_t data_len);&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; /* Length of data to encrypt - 16 byte multiple */</pre>
+This is the function to encrypt data.&nbsp; It will be called to encrypt
+pages and log records.&nbsp; The <i>key</i> instance is initialized in
+<i>__aes_derivekeys</i>.&nbsp;
+The initialization vector, <i>iv</i>, is the 16 byte random value set up
+by the Mersenne Twister pseudo-random generator.&nbsp; Lastly, we pass
+in a pointer to the <i>data</i> to encrypt and its length in <i>data_len</i>.&nbsp;
+The <i>data_len</i> must be a multiple of 16 bytes. The encryption is done
+in-place so that when the encryption code returns our encrypted data is
+in the same location as the original data.
+<pre>__aes_decrypt(DB_ENV *dbenv,&nbsp;&nbsp;&nbsp; /* dbenv */
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; keyInstance *key,&nbsp; /* Password key instance from __db_derive_keys */
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; u_int8_t *iv,&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; /* Initialization vector */
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; u_int8_t *data,&nbsp;&nbsp;&nbsp; /* Data to decrypt */
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; size_t data_len);&nbsp; /* Length of data to decrypt - 16 byte multiple */</pre>
+This is the function to decrypt the data.&nbsp; It is exactly the same
+as the encryption function except for the action it performs.&nbsp; All
+of the args and issues are the same.&nbsp; It also decrypts in place.
+<h3>
+<a NAME="Generating the Initialization Vector"></a>Generating the Initialization
+Vector</h3>
+Internally, we need to provide a unique initialization vector (IV) of 16
+bytes every time we encrypt any data with the same password.&nbsp; For
+the IV we are planning on using mt19937, the Mersenne Twister, a random
+number generator that has a period of 2**19937-1. This package can be found
+at <a href="http://www.math.keio.ac.jp/~matumoto/emt.html">http://www.math.keio.ac.jp/~matumoto/emt.html</a>.&nbsp;
+Tests show that although it repeats a single integer every once in a while,
+after several million iterations it doesn't repeat any 4 integers
+that we'd be stuffing into our 16-byte IV.&nbsp; We plan on seeding this
+generator with the time (tv_sec) hashed through SHA1 when we create the
+environment.&nbsp; This package uses a global state vector that contains
+624 unsigned long integers.&nbsp; We do not allow a 16-byte IV of zero.&nbsp;
+It is simpler just to reject any 4-byte value of 0 and if we get one, just
+call the generator again and get a different number.&nbsp; We need to detect
+holes in files and if we read an IV of zero that is a simple indication
+that we need to check for an entire page of zero.&nbsp; The IVs are stored
+on the page after encryption and are not encrypted themselves so it is
+not possible for an entire encrypted page to be read as all zeroes, unless
+it was a hole in a file.&nbsp; See <a href="#Holes in Files">Holes in Files</a>
+for more details.
+<p>We will not be holding any locks when we need to generate our IV but
+we need to protect access to the state vector and the index.&nbsp; Calls
+to the MT code will come while encrypting some data in <i>__aes_encrypt.</i>&nbsp;&nbsp;
+The MT code will assume that all necessary locks are held in the caller.&nbsp;
+We will have per-process state vectors that are set up when a process begins.&nbsp;
+That way we minimize the contention and only multi-threaded processes need
+acquire locks for the IV.&nbsp; We will have the state vector in the environment
+handle in heap memory, as well as the index and there will be a mutex protecting
+it for threaded access.&nbsp; This will be added to the <i>DB_ENV</i>
+structure:
+<pre>&nbsp;&nbsp;&nbsp; DB_MUTEX&nbsp;&nbsp;&nbsp; *mt_mutexp;&nbsp;&nbsp; /* Mersenne Twister mutex */
+&nbsp;&nbsp;&nbsp; int&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; *mti;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; /* MT index */
+&nbsp;&nbsp;&nbsp; u_long&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; *mt;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; /* MT state vector */</pre>
+This portion of the environment will be initialized at the end of <i>__dbenv_open</i>,
+right after we initialize the other mutex for the <i>dblist</i>. When we
+allocate the space, we will generate our initial state vector. If we are
+multi-threaded we'll allocate and initialize our mutex also.
+<p>We need to make changes to the MT code to make it work in our namespace
+and to take a pointer to the location of the state vector and
+the index.&nbsp; There will be a wrapper function <i>__db_generate_iv</i>
+that DB will call and it will call the appropriate MT function.&nbsp; I
+am also going to change the default seed to use a hashed time instead of
+a hard coded value.&nbsp; I have looked at other implementations of the
+MT code available on the web site.&nbsp; The C++ version does a hash on
+the current time.&nbsp; I will modify our MT code to seed with the hashed
+time as well.&nbsp; That way the code to seed is contained within the MT
+code and we can just write the wrapper to get an IV.&nbsp; We will not
+be changing the core computational code of MT.
+<h2>
+DB Internal Issues</h2>
+
+<h4>
+When do we Cipher?</h4>
+All of the page ciphering is done in the <i>__db_pgin/__db_pgout</i> functions.&nbsp;
+We will encrypt after the method-specific function on page-out and decrypt
+before the method-specific function on page-in.&nbsp; We do not hold any
+locks when entering these functions.&nbsp; We determine that we need to
+cipher based on the existence of the encryption flag in the dbp.
+<p>For ciphering log records, the encryption will be done as the first
+thing (or a new wrapper) in <i>__log_put.&nbsp; </i>See <a href="#Log Record Encryption">Log
+Record Encryption</a> for those details.
+<br>&nbsp;
+<h4>
+Page Changes</h4>
+The checksum and IV values will be stored prior to the first index of the
+page.&nbsp; We have a new P_INP macro that replaces use of inp[X] in the
+code. &nbsp;This macro takes a dbp as an argument and determines where
+our first index is based on whether we have DB_AM_CHKSUM and DB_AM_ENCRYPT
+set.&nbsp; If neither is set, then our first index is where it always was.
+&nbsp;If just checksumming is set, then we reserve a 4-byte checksum.&nbsp;
+If encryption is set, then we reserve 36 bytes for our checksum/IV as well
+as some space to get proper alignment to encrypt on a 16-byte boundary.
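+<p>The following sketch conveys the idea of the macro (the real macro
+also accounts for the alignment padding; SIZEOF_PAGE stands for the fixed
+page-header size):
+<pre>#define P_INP(dbp, pg)                                  \
+    ((db_indx_t *)((u_int8_t *)(pg) + SIZEOF_PAGE +     \
+    (F_ISSET((dbp), DB_AM_ENCRYPT) ? 36 :               \
+    (F_ISSET((dbp), DB_AM_CHKSUM) ? 4 : 0))))</pre>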
+<p>Since several paging macros use inp[X] in them, those macros must now
+take a dbp.&nbsp; There are a lot of changes to make all the necessary
+paging macros take a dbp, although these changes are trivial in nature.
+<p>Also, there is a new function <i>__db_chk_meta</i> to perform checksumming
+and decryption checking on meta pages specifically.&nbsp; This function
+is where we check that the database algorithm matches what the user gave
+(or if they set DB_CIPHER_ANY then we set it), and where we perform other
+encryption-related tests for bad combinations of what is in the file
+versus what is in the user structures.
+<h4>
+Verification</h4>
+The verification code will also need to be updated to deal with secure
+pages.&nbsp; Basically when the verification code reads in the meta page
+it will call <i>__db_chk_meta</i> to perform any checksumming and decryption.
+<h4>
+<a NAME="Holes in Files"></a>Holes in Files</h4>
+Holes in files will be dealt with rather simply.&nbsp; We need to be able
+to distinguish reading a hole in a file from an encrypted page that happened
+to encrypt to all zero's.&nbsp; If we read a hole in a file, we do not
+want to send that empty page through the decryption routine.&nbsp; This
+can be determined simply without incurring the performance penalty of comparing
+every byte on a page on every read until we get a non-zero byte.
+<br>When we read a hole, the page handed to <i>__db_pgin</i> has the page
+type P_INVALID.&nbsp; So, if the page type, which is always unencrypted, is
+P_INVALID, then we do not perform any checksum verification or decryption.
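+<p>In code form, the check is simply (sketch):
+<pre>if (TYPE(pagep) == P_INVALID)      /* hole in the file */
+    return (0);                    /* no checksum check, no decryption */</pre>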
+<h4>
+Errors and Recovery</h4>
+Dealing with a checksum error is tricky.&nbsp; Ultimately, if a checksum
+error occurs it is extremely likely that the user must do catastrophic
+recovery.&nbsp; There is no other failure return other than&nbsp; DB_RUNRECOVERY
+for indicating that the user should run catastrophic recovery.&nbsp; We
+do not want to add a new error return for applications to check because
+a lot of applications already look for and deal with DB_RUNRECOVERY as
+an error condition and we want to fit ourselves into that application model.&nbsp;
+We already indicate to the user that when they get that error, then they
+need to run recovery.&nbsp; If recovery fails, then they need to run catastrophic
+recovery.&nbsp; We need to get ourselves to the point where users will
+run catastrophic recovery.
+<p>If we get a checksum error, then we need to log a message stating a
+checksum error occurred on page N.&nbsp; In <i>__db_pgin</i>, we can check
+if logging is on in the environment.&nbsp; If so, we want to log the message.
+<p>When the application gets the DB_RUNRECOVERY error, they'll have to
+shut down their application and run recovery.&nbsp; When the recovery encounters
+the record indicating checksum failure, then normal recovery will fail
+and the user will have to perform catastrophic recovery.&nbsp; When catastrophic
+recovery encounters that record, it will simply ignore it.
+<h4>
+<a NAME="Log Record Encryption"></a>Log Record Encryption</h4>
+Log records will be ciphered.&nbsp; It might make sense to wrap <i>__log_put</i>
+to encrypt the DBT we send down.&nbsp; The <i>__log_put </i>function is
+where the checksum is computed before acquiring the region lock.&nbsp;
+But also this function is where we call <i>__rep_send_message</i> to send
+the DBT to the replication clients.&nbsp; Therefore, we need the DBT to
+be encrypted prior to there.&nbsp; We also need it encrypted before checksumming.
+I think <i>__log_put </i>will become <i>__log_put_internal</i>, and the
+new <i>__log_put</i> will encrypt if needed and then call <i>__log_put_internal
+</i>(the
+function formerly known as <i>__log_put</i>).&nbsp; Log records are kept
+in a shared memory region buffer prior to going out to disk.&nbsp; Records
+in the buffer will be encrypted.&nbsp; No locks are held at the time we
+will need to encrypt.
+<p>On reading the log, via log cursors, the log code stores log records
+in the log buffer.&nbsp; Records in that buffer will be encrypted, so decryption
+will occur no matter whether we are returning records from the buffer or
+if we are returning log records directly from the disk. Current checksum
+checking is done in
+<i>__logc_get_int.</i>&nbsp; Decryption will be done
+after the checksum is checked.
+<p>There are currently two nasty issues with encrypted log records.&nbsp;
+The first is that <i>__txn_force_abort</i> overwrites a commit record in
+the log buffer with an abort record.&nbsp; Well, our log buffer will be
+encrypted.&nbsp; Therefore, <i>__txn_force_abort</i> is going to need to
+do encryption of its new record.&nbsp; This can be accomplished by sending
+in the dbenv handle to the function.&nbsp; It is available to us in <i>__log_flush_commit</i>
+and we can just pass it in.&nbsp; I don't like putting log encryption in
+the txn code, but the layering violation is already there.
+<p>The second issue is that the encryption code requires data that is a
+multiple of 16 bytes and log record lengths are variable.&nbsp; We will
+need to pad log records to meet the requirement.&nbsp; Since the callers
+of <i>__log_put</i> set up the given DBT it is a logical place to pad if
+necessary. We will modify the gen_rec.awk script to have all of the generated
+logging functions pad for us if we have a crypto handle. This padding will
+also expand the size of log files. Anyone calling <i>log_put</i> and using
+security from the application will have to pad on their own or it will
+return an error.
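+<p>A sketch of the padding computation the generated logging functions
+will perform (DB_AES_CHUNK is the 16-byte AES block):
+<pre>pad = DB_AES_CHUNK - (rec_len % DB_AES_CHUNK);
+if (pad != DB_AES_CHUNK)
+    rec_len += pad;                /* zero-fill the added bytes */</pre>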
+<p>When ciphering the log file, we will need a different header than the
+current one.&nbsp; The current header only has space for a 4 byte checksum.&nbsp;
+Our secure header will need space for the 16 byte IV and 20 byte checksum.&nbsp;
+This will blow up our log files when running securely since every single
+log record header will now consume 32 additional bytes.&nbsp; I believe
+that the log header does not need to be encrypted.&nbsp; It contains an
+offset, a length and our IV and checksum.&nbsp; Our IV and checksum are
+never encrypted.&nbsp; I don't believe there to be any risk in having the
+offset and length in the clear.
+<p>I would prefer not to have two types of log headers that are incompatible
+with each other.&nbsp; It is not acceptable to increase the log headers
+of all users from 12 bytes to 44 bytes.&nbsp; Such a change would also
+make log files incompatible with earlier releases.&nbsp; Worse even, is
+that the <i>cksum</i> field of the header is in between the offset and
+len.&nbsp; It would be really convenient if we could have just made a bigger
+cksum portion without affecting the location of the other fields.&nbsp;
+Oh well.&nbsp; Most customers will not be using encryption and we won't
+make them pay the price of the expanded header.&nbsp; Keith indicates that
+the log file format is changing with the next release so I will move the
+cksum field so it can at least be overlaid.
+<p>One method around this would be to have a single internal header that
+contains all the information both mechanisms need, but when we write out
+the header we choose which pieces to write.&nbsp; By appending the security
+information to the end of the existing structure, and adding a size field,
+we can modify a few places to use the size field to write out only the
+current first 12 bytes, or the entire security header needed.
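+<p>A sketch of that unified header (field names and order are
+illustrative; only "size" bytes are actually written):
+<pre>typedef struct __hdr {
+    u_int32_t prev;                /* Previous record offset */
+    u_int32_t len;                 /* Record length */
+    u_int8_t  chksum[20];          /* Only 4 bytes used in the clear case */
+    u_int8_t  iv[16];              /* IV, secure environments only */
+    u_int32_t size;                /* Bytes of header written to disk */
+} HDR;</pre>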
+<h4>
+Replication</h4>
+Replication clients are going to need to start all of their individual
+environment handles with the same password.&nbsp; The log records are going
+to be sent to the clients decrypted and the clients will have to encrypt
+them on their way to the client log files.&nbsp; We cannot send encrypted
+log records to clients. &nbsp;The reason is that the checksum and IV&nbsp;are
+stored in the log header and the master only sends the log record itself
+to the client. &nbsp;Therefore, the client has no way to decrypt a log
+record from the master. &nbsp;Therefore, anyone wanting to use truly secure
+replication is going to have to have a secure transport mechanism.&nbsp;
+By not encrypting records, clients can theoretically have different passwords
+and DB won't care.
+<p>On the master side we must copy the DBT sent in.&nbsp; We encrypt the
+original and send to clients the clear record.&nbsp; On the client side,
+support for encryption is added into <i>__log_rep_put</i>.
+<h4>
+Sharing the Environment</h4>
+When multiple processes join the environment, all must use the same password
+as the creator.
+<p>Joining an existing environment requires several conditions to be true.&nbsp;
+First, if the creator of the environment did not create with security,
+then joining later with security is an error.&nbsp; Second, if the creator
+did create it with security, then joining later without security is an
+error.&nbsp; Third, we need to be able to test and check that when another
+process joins a secure environment, the password it provides is the
+same as the one in use by the creator.
+<p>The first two scenarios should be fairly trivial to determine, if we
+aren't creating the environment, we can compare what is there with what
+we have.&nbsp; In the third case, the <i>__crypto_region_init</i> function
+will see that the environment region has a valid passwd_off and we'll then
+compare that password to the one we have in our dbenv handle.&nbsp; In
+any case we'll smash the dbenv handle's passwd and free that memory before
+returning whether we have a password match or not.
+<p>We need to store the passwords themselves in the region because multiple
+calls to the <i>__aes_derivekeys </i>function with the same password yields
+different keyInstance contents.&nbsp; Therefore we don't have any way to
+check passwords other than retaining and comparing the actual passwords.
+<h4>
+Other APIs</h4>
+All of the other APIs will need interface enhancements to support the new
+security methods.&nbsp; The Java and C++ interfaces will likely be done
+by Michael Cahill and Sue will implement the Tcl and RPC changes.&nbsp;
+Tcl will need the changes for testing purposes but the interface should
+be public, not test-only.&nbsp; RPC should fully support security.&nbsp;
+The biggest risk that I can see is that the client will send the password
+to the server in the clear.&nbsp; Anyone sniffing the wires or running
+tcpdump or other packet grabbing code could grab that.&nbsp; Someone really
+interested in using security over RPC probably ought to add authentication
+and other measures to the RPC server as well.
+<h4>
+<a NAME="Utilities"></a>Utilities</h4>
+All should take a -P flag to specify a password for the environment or
+database.&nbsp; Those that take an env and a database might need something
+more to distinguish between env passwds and db passwds. Here is what we
+do for each utility (see the example after this list):
+<ul>
+<li>
+berkeley_db_svc - Needs -P after each -h specified.</li>
+
+<li>
+db_archive - Needs -P if the env is encrypted.</li>
+
+<li>
+db_checkpoint - Needs -P if the env is encrypted.</li>
+
+<li>
+db_deadlock - No changes</li>
+
+<li>
+db_dump - Needs -P if the env or database is encrypted.</li>
+
+<li>
+db_load - Needs -P if the env or database is encrypted.</li>
+
+<li>
+db_printlog - Needs -P if the env is encrypted.</li>
+
+<li>
+db_recover - Needs -P if the env is encrypted.</li>
+
+<li>
+db_stat - Needs -P if the env or database is encrypted.</li>
+
+<li>
+db_upgrade - Needs -P if the env or database is encrypted.</li>
+
+<li>
+db_verify - Needs -P if the env or database is encrypted.</li>
+</ul>
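+<p>For example (sketch), dumping a database from an encrypted environment
+(paths and password illustrative):
+<pre>db_dump -P my_passwd -h /env/home encrypted.db > clear.dump</pre>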
+
+<h2>
+Testing</h2>
+All testing should be able to be accomplished via Tcl.&nbsp; The following
+tests (and probably others I haven't thought of yet) should be performed:
+<ul>
+<li>
+Basic functionality - basically a test001 but encrypted without an env</li>
+
+<li>
+Basic functionality, w/ env - like the previous test but with an env.</li>
+
+<li>
+Basic functionality, multiple processes - like first test, but make sure
+others can correctly join.</li>
+
+<li>
+Basic functionality, mult. processes - like above test, but initialize/close
+environment/database first so that the next test processes are all joiners
+of an existing env, but creator no longer exists and the shared region
+must be opened.</li>
+
+<li>
+Recovery test - Run recovery over an encrypted environment.</li>
+
+<li>
+Subdb test - Run with subdbs that are encrypted.</li>
+
+<li>
+Utility test - Verify the new options to all the utilities.</li>
+
+<li>
+Error handling - Test the basic setup errors for both env's and databases
+with multiple processes.&nbsp; They are:</li>
+
+<ol>
+<li>
+Attempt to set a NULL or zero-length passwd.</li>
+
+<li>
+Create Env w/ security and attempt to create database w/ its own password.</li>
+
+<li>
+Env/DB creates with security.&nbsp; Proc2 joins without - should get an
+error.</li>
+
+<li>
+Env/DB creates without security.&nbsp; Proc2 joins with - should get an
+error.</li>
+
+<li>
+Env/DB creates with security.&nbsp; Proc2 joins with different password
+- should get an error.</li>
+
+<li>
+Env/DB creates with security.&nbsp; Closes.&nbsp; Proc2 reopens with different
+password - should get an error.</li>
+
+<li>
+Env/DB creates with security.&nbsp; Closes.&nbsp; Tcl overwrites a page
+of the database with garbage.&nbsp; Proc2 reopens with the correct password.&nbsp;
+Code should detect checksum error.</li>
+
+<li>
+Env/DB creates with security.&nbsp; Open a 2nd identical DB with a different
+password.&nbsp; Put the exact same data into both databases.&nbsp; Close.&nbsp;
+Overwrite the identical page of DB1 with the one from DB2.&nbsp; Reopen
+the database with correct DB1 password.&nbsp; Code should detect an encryption
+error on that page.</li>
+</ol>
+</ul>
+
+<h2>
+Risks</h2>
+There are several holes in this design.&nbsp; It is important to document
+them clearly.
+<p>The first is that all of the pages are stored in memory and possibly
+the file system in the clear.&nbsp; The password is stored in the shared
+data regions in the clear.&nbsp; Therefore if an attacker can read the
+process memory, they can do whatever they want.&nbsp; If the attacker can
+read system memory or swap they can access the data as well.&nbsp; Since
+everything in the shared data regions (with the exception of the buffered
+log) will be in the clear, it is important to realize that file backed
+regions will be written in the clear, including the portion of the regions
+containing passwords.&nbsp; We recommend to users that they use system
+memory instead of file backed shared memory.
+</body>
+</html>
diff --git a/src/crypto/mersenne/mt19937db.c b/src/crypto/mersenne/mt19937db.c
new file mode 100644
index 00000000..2d53c312
--- /dev/null
+++ b/src/crypto/mersenne/mt19937db.c
@@ -0,0 +1,187 @@
+/*
+ * $Id$
+ */
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/hmac.h"
+
+/* A C-program for MT19937: Integer version (1999/10/28) */
+/* genrand() generates one pseudorandom unsigned integer (32bit) */
+/* which is uniformly distributed among 0 to 2^32-1 for each */
+/* call. sgenrand(seed) sets initial values to the working area */
+/* of 624 words. Before genrand(), sgenrand(seed) must be */
+/* called once. (seed is any 32-bit integer.) */
+/* Coded by Takuji Nishimura, considering the suggestions by */
+/* Topher Cooper and Marc Rieffel in July-Aug. 1997. */
+
+/* This library is free software under the Artistic license: */
+/* see the file COPYING distributed together with this code. */
+/* For the verification of the code, its output sequence file */
+/* mt19937int.out is attached (2001/4/2) */
+
+/* Copyright (C) 1997, 1999 Makoto Matsumoto and Takuji Nishimura. */
+/* Any feedback is very welcome. For any question, comments, */
+/* see http://www.math.keio.ac.jp/matumoto/emt.html or email */
+/* matumoto@math.keio.ac.jp */
+
+/* REFERENCE */
+/* M. Matsumoto and T. Nishimura, */
+/* "Mersenne Twister: A 623-Dimensionally Equidistributed Uniform */
+/* Pseudo-Random Number Generator", */
+/* ACM Transactions on Modeling and Computer Simulation, */
+/* Vol. 8, No. 1, January 1998, pp 3--30. */
+
+/* Period parameters */
+#define N 624
+#define M 397
+#define MATRIX_A 0x9908b0df /* constant vector a */
+#define UPPER_MASK 0x80000000 /* most significant w-r bits */
+#define LOWER_MASK 0x7fffffff /* least significant r bits */
+
+/* Tempering parameters */
+#define TEMPERING_MASK_B 0x9d2c5680
+#define TEMPERING_MASK_C 0xefc60000
+#define TEMPERING_SHIFT_U(y) (y >> 11)
+#define TEMPERING_SHIFT_S(y) (y << 7)
+#define TEMPERING_SHIFT_T(y) (y << 15)
+#define TEMPERING_SHIFT_L(y) (y >> 18)
+
+static void __db_sgenrand __P((unsigned long, unsigned long *, int *));
+#ifdef NOT_USED
+static void __db_lsgenrand __P((unsigned long *, unsigned long *, int *));
+#endif
+static unsigned long __db_genrand __P((ENV *));
+
+/*
+ * __db_generate_iv --
+ * Generate an initialization vector (IV)
+ *
+ * PUBLIC: int __db_generate_iv __P((ENV *, u_int32_t *));
+ */
+int
+__db_generate_iv(env, iv)
+ ENV *env;
+ u_int32_t *iv;
+{
+ int i, n, ret;
+
+ ret = 0;
+ n = DB_IV_BYTES / sizeof(u_int32_t);
+ MUTEX_LOCK(env, env->mtx_mt);
+ if (env->mt == NULL) {
+		if ((ret = __os_calloc(env, 1, N * sizeof(unsigned long),
+		    &env->mt)) != 0) {
+			MUTEX_UNLOCK(env, env->mtx_mt);
+			return (ret);
+		}
+ /* mti==N+1 means mt[N] is not initialized */
+ env->mti = N + 1;
+ }
+ for (i = 0; i < n; i++) {
+ /*
+ * We do not allow 0. If we get one just try again.
+ */
+ do {
+ iv[i] = (u_int32_t)__db_genrand(env);
+ } while (iv[i] == 0);
+ }
+
+ MUTEX_UNLOCK(env, env->mtx_mt);
+ return (0);
+}
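+
+/*
+ * Example (hypothetical caller, not part of this file): filling an IV
+ * before encrypting a page.  DB_IV_BYTES is assumed to be a multiple of
+ * sizeof(u_int32_t), as the computation of `n' above requires.
+ *
+ *	u_int32_t iv[DB_IV_BYTES / sizeof(u_int32_t)];
+ *
+ *	if ((ret = __db_generate_iv(env, iv)) != 0)
+ *		return (ret);
+ */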
+
+/* Initializing the array with a seed */
+static void
+__db_sgenrand(seed, mt, mtip)
+ unsigned long seed;
+ unsigned long mt[];
+ int *mtip;
+{
+ int i;
+
+ DB_ASSERT(NULL, seed != 0);
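+	/*
+	 * Fill each state word from two steps of a Knuth-style linear
+	 * congruential generator (multiplier 69069), keeping only the
+	 * high-order 16 bits of each step, which are the better
+	 * distributed ones.
+	 */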
+ for (i=0;i<N;i++) {
+ mt[i] = seed & 0xffff0000;
+ seed = 69069 * seed + 1;
+ mt[i] |= (seed & 0xffff0000) >> 16;
+ seed = 69069 * seed + 1;
+ }
+ *mtip = N;
+}
+
+#ifdef NOT_USED
+/* Initialization by "sgenrand()" is an example. Theoretically, */
+/* there are 2^19937-1 possible states as an initial state. */
+/* This function allows one to choose any of those 2^19937-1 states. */
+/* The essential bits in "seed_array[]" are the following 19937 bits: */
+/* (seed_array[0]&UPPER_MASK), seed_array[1], ..., seed_array[N-1]. */
+/* (seed_array[0]&LOWER_MASK) is discarded. */
+/* Theoretically, */
+/* (seed_array[0]&UPPER_MASK), seed_array[1], ..., seed_array[N-1] */
+/* can take any values except all zeros. */
+static void
+__db_lsgenrand(seed_array, mt, mtip)
+ unsigned long seed_array[];
+ unsigned long mt[];
+ int *mtip;
+ /* the length of seed_array[] must be at least N */
+{
+ int i;
+
+ for (i=0;i<N;i++)
+ mt[i] = seed_array[i];
+ *mtip=N;
+}
+#endif
+
+static unsigned long
+__db_genrand(env)
+ ENV *env;
+{
+ db_timespec ts;
+ unsigned long y;
+ static unsigned long mag01[2]={0x0, MATRIX_A};
+ /* mag01[x] = x * MATRIX_A for x=0,1 */
+ u_int32_t seed;
+
+ /*
+ * We are called with ENV->mtx_mt locked.
+ */
+ if (env->mti >= N) { /* generate N words at one time */
+ int kk;
+
+ if (env->mti == N+1) { /* if sgenrand() has not been called, */
+ /*
+			 * Seed the generator with the hashed time.  The
+			 * __db_chksum call below returns a 4-byte checksum
+			 * when no key is passed in.
+ */
+ do {
+ __os_gettime(env, &ts, 1);
+ __db_chksum(NULL, (u_int8_t *)&ts.tv_sec,
+ sizeof(ts.tv_sec), NULL, (u_int8_t *)&seed);
+ } while (seed == 0);
+ __db_sgenrand((unsigned long)seed, env->mt, &env->mti);
+ }
+
+ for (kk=0;kk<N-M;kk++) {
+ y = (env->mt[kk]&UPPER_MASK)|(env->mt[kk+1]&LOWER_MASK);
+ env->mt[kk] = env->mt[kk+M] ^ (y >> 1) ^ mag01[y & 0x1];
+ }
+ for (;kk<N-1;kk++) {
+ y = (env->mt[kk]&UPPER_MASK)|(env->mt[kk+1]&LOWER_MASK);
+ env->mt[kk] = env->mt[kk+(M-N)] ^ (y >> 1) ^ mag01[y & 0x1];
+ }
+ y = (env->mt[N-1]&UPPER_MASK)|(env->mt[0]&LOWER_MASK);
+ env->mt[N-1] = env->mt[M-1] ^ (y >> 1) ^ mag01[y & 0x1];
+
+ env->mti = 0;
+ }
+
+ y = env->mt[env->mti++];
+ y ^= TEMPERING_SHIFT_U(y);
+ y ^= TEMPERING_SHIFT_S(y) & TEMPERING_MASK_B;
+ y ^= TEMPERING_SHIFT_T(y) & TEMPERING_MASK_C;
+ y ^= TEMPERING_SHIFT_L(y);
+
+	return (y);
+}
diff --git a/src/crypto/rijndael/rijndael-alg-fst.c b/src/crypto/rijndael/rijndael-alg-fst.c
new file mode 100644
index 00000000..322ad5ff
--- /dev/null
+++ b/src/crypto/rijndael/rijndael-alg-fst.c
@@ -0,0 +1,1466 @@
+/**
+ * rijndael-alg-fst.c
+ *
+ * @version 3.0 (December 2000)
+ *
+ * Optimised ANSI C code for the Rijndael cipher (now AES)
+ *
+ * @author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
+ * @author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
+ * @author Paulo Barreto <paulo.barreto@terra.com.br>
+ *
+ * This code is hereby placed in the public domain.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+
+#include "crypto/rijndael/rijndael-alg-fst.h"
+
+/*
+Te0[x] = S [x].[02, 01, 01, 03];
+Te1[x] = S [x].[03, 02, 01, 01];
+Te2[x] = S [x].[01, 03, 02, 01];
+Te3[x] = S [x].[01, 01, 03, 02];
+Te4[x] = S [x].[01, 01, 01, 01];
+
+Td0[x] = Si[x].[0e, 09, 0d, 0b];
+Td1[x] = Si[x].[0b, 0e, 09, 0d];
+Td2[x] = Si[x].[0d, 0b, 0e, 09];
+Td3[x] = Si[x].[09, 0d, 0b, 0e];
+Td4[x] = Si[x].[01, 01, 01, 01];
+*/
+
+static const u32 Te0[256] = {
+ (u_int)0xc66363a5, (u_int)0xf87c7c84, (u_int)0xee777799, (u_int)0xf67b7b8d,
+ (u_int)0xfff2f20d, (u_int)0xd66b6bbd, (u_int)0xde6f6fb1, (u_int)0x91c5c554,
+ (u_int)0x60303050, (u_int)0x02010103, (u_int)0xce6767a9, (u_int)0x562b2b7d,
+ (u_int)0xe7fefe19, (u_int)0xb5d7d762, (u_int)0x4dababe6, (u_int)0xec76769a,
+ (u_int)0x8fcaca45, (u_int)0x1f82829d, (u_int)0x89c9c940, (u_int)0xfa7d7d87,
+ (u_int)0xeffafa15, (u_int)0xb25959eb, (u_int)0x8e4747c9, (u_int)0xfbf0f00b,
+ (u_int)0x41adadec, (u_int)0xb3d4d467, (u_int)0x5fa2a2fd, (u_int)0x45afafea,
+ (u_int)0x239c9cbf, (u_int)0x53a4a4f7, (u_int)0xe4727296, (u_int)0x9bc0c05b,
+ (u_int)0x75b7b7c2, (u_int)0xe1fdfd1c, (u_int)0x3d9393ae, (u_int)0x4c26266a,
+ (u_int)0x6c36365a, (u_int)0x7e3f3f41, (u_int)0xf5f7f702, (u_int)0x83cccc4f,
+ (u_int)0x6834345c, (u_int)0x51a5a5f4, (u_int)0xd1e5e534, (u_int)0xf9f1f108,
+ (u_int)0xe2717193, (u_int)0xabd8d873, (u_int)0x62313153, (u_int)0x2a15153f,
+ (u_int)0x0804040c, (u_int)0x95c7c752, (u_int)0x46232365, (u_int)0x9dc3c35e,
+ (u_int)0x30181828, (u_int)0x379696a1, (u_int)0x0a05050f, (u_int)0x2f9a9ab5,
+ (u_int)0x0e070709, (u_int)0x24121236, (u_int)0x1b80809b, (u_int)0xdfe2e23d,
+ (u_int)0xcdebeb26, (u_int)0x4e272769, (u_int)0x7fb2b2cd, (u_int)0xea75759f,
+ (u_int)0x1209091b, (u_int)0x1d83839e, (u_int)0x582c2c74, (u_int)0x341a1a2e,
+ (u_int)0x361b1b2d, (u_int)0xdc6e6eb2, (u_int)0xb45a5aee, (u_int)0x5ba0a0fb,
+ (u_int)0xa45252f6, (u_int)0x763b3b4d, (u_int)0xb7d6d661, (u_int)0x7db3b3ce,
+ (u_int)0x5229297b, (u_int)0xdde3e33e, (u_int)0x5e2f2f71, (u_int)0x13848497,
+ (u_int)0xa65353f5, (u_int)0xb9d1d168, (u_int)0x00000000, (u_int)0xc1eded2c,
+ (u_int)0x40202060, (u_int)0xe3fcfc1f, (u_int)0x79b1b1c8, (u_int)0xb65b5bed,
+ (u_int)0xd46a6abe, (u_int)0x8dcbcb46, (u_int)0x67bebed9, (u_int)0x7239394b,
+ (u_int)0x944a4ade, (u_int)0x984c4cd4, (u_int)0xb05858e8, (u_int)0x85cfcf4a,
+ (u_int)0xbbd0d06b, (u_int)0xc5efef2a, (u_int)0x4faaaae5, (u_int)0xedfbfb16,
+ (u_int)0x864343c5, (u_int)0x9a4d4dd7, (u_int)0x66333355, (u_int)0x11858594,
+ (u_int)0x8a4545cf, (u_int)0xe9f9f910, (u_int)0x04020206, (u_int)0xfe7f7f81,
+ (u_int)0xa05050f0, (u_int)0x783c3c44, (u_int)0x259f9fba, (u_int)0x4ba8a8e3,
+ (u_int)0xa25151f3, (u_int)0x5da3a3fe, (u_int)0x804040c0, (u_int)0x058f8f8a,
+ (u_int)0x3f9292ad, (u_int)0x219d9dbc, (u_int)0x70383848, (u_int)0xf1f5f504,
+ (u_int)0x63bcbcdf, (u_int)0x77b6b6c1, (u_int)0xafdada75, (u_int)0x42212163,
+ (u_int)0x20101030, (u_int)0xe5ffff1a, (u_int)0xfdf3f30e, (u_int)0xbfd2d26d,
+ (u_int)0x81cdcd4c, (u_int)0x180c0c14, (u_int)0x26131335, (u_int)0xc3ecec2f,
+ (u_int)0xbe5f5fe1, (u_int)0x359797a2, (u_int)0x884444cc, (u_int)0x2e171739,
+ (u_int)0x93c4c457, (u_int)0x55a7a7f2, (u_int)0xfc7e7e82, (u_int)0x7a3d3d47,
+ (u_int)0xc86464ac, (u_int)0xba5d5de7, (u_int)0x3219192b, (u_int)0xe6737395,
+ (u_int)0xc06060a0, (u_int)0x19818198, (u_int)0x9e4f4fd1, (u_int)0xa3dcdc7f,
+ (u_int)0x44222266, (u_int)0x542a2a7e, (u_int)0x3b9090ab, (u_int)0x0b888883,
+ (u_int)0x8c4646ca, (u_int)0xc7eeee29, (u_int)0x6bb8b8d3, (u_int)0x2814143c,
+ (u_int)0xa7dede79, (u_int)0xbc5e5ee2, (u_int)0x160b0b1d, (u_int)0xaddbdb76,
+ (u_int)0xdbe0e03b, (u_int)0x64323256, (u_int)0x743a3a4e, (u_int)0x140a0a1e,
+ (u_int)0x924949db, (u_int)0x0c06060a, (u_int)0x4824246c, (u_int)0xb85c5ce4,
+ (u_int)0x9fc2c25d, (u_int)0xbdd3d36e, (u_int)0x43acacef, (u_int)0xc46262a6,
+ (u_int)0x399191a8, (u_int)0x319595a4, (u_int)0xd3e4e437, (u_int)0xf279798b,
+ (u_int)0xd5e7e732, (u_int)0x8bc8c843, (u_int)0x6e373759, (u_int)0xda6d6db7,
+ (u_int)0x018d8d8c, (u_int)0xb1d5d564, (u_int)0x9c4e4ed2, (u_int)0x49a9a9e0,
+ (u_int)0xd86c6cb4, (u_int)0xac5656fa, (u_int)0xf3f4f407, (u_int)0xcfeaea25,
+ (u_int)0xca6565af, (u_int)0xf47a7a8e, (u_int)0x47aeaee9, (u_int)0x10080818,
+ (u_int)0x6fbabad5, (u_int)0xf0787888, (u_int)0x4a25256f, (u_int)0x5c2e2e72,
+ (u_int)0x381c1c24, (u_int)0x57a6a6f1, (u_int)0x73b4b4c7, (u_int)0x97c6c651,
+ (u_int)0xcbe8e823, (u_int)0xa1dddd7c, (u_int)0xe874749c, (u_int)0x3e1f1f21,
+ (u_int)0x964b4bdd, (u_int)0x61bdbddc, (u_int)0x0d8b8b86, (u_int)0x0f8a8a85,
+ (u_int)0xe0707090, (u_int)0x7c3e3e42, (u_int)0x71b5b5c4, (u_int)0xcc6666aa,
+ (u_int)0x904848d8, (u_int)0x06030305, (u_int)0xf7f6f601, (u_int)0x1c0e0e12,
+ (u_int)0xc26161a3, (u_int)0x6a35355f, (u_int)0xae5757f9, (u_int)0x69b9b9d0,
+ (u_int)0x17868691, (u_int)0x99c1c158, (u_int)0x3a1d1d27, (u_int)0x279e9eb9,
+ (u_int)0xd9e1e138, (u_int)0xebf8f813, (u_int)0x2b9898b3, (u_int)0x22111133,
+ (u_int)0xd26969bb, (u_int)0xa9d9d970, (u_int)0x078e8e89, (u_int)0x339494a7,
+ (u_int)0x2d9b9bb6, (u_int)0x3c1e1e22, (u_int)0x15878792, (u_int)0xc9e9e920,
+ (u_int)0x87cece49, (u_int)0xaa5555ff, (u_int)0x50282878, (u_int)0xa5dfdf7a,
+ (u_int)0x038c8c8f, (u_int)0x59a1a1f8, (u_int)0x09898980, (u_int)0x1a0d0d17,
+ (u_int)0x65bfbfda, (u_int)0xd7e6e631, (u_int)0x844242c6, (u_int)0xd06868b8,
+ (u_int)0x824141c3, (u_int)0x299999b0, (u_int)0x5a2d2d77, (u_int)0x1e0f0f11,
+ (u_int)0x7bb0b0cb, (u_int)0xa85454fc, (u_int)0x6dbbbbd6, (u_int)0x2c16163a,
+};
+static const u32 Te1[256] = {
+ (u_int)0xa5c66363, (u_int)0x84f87c7c, (u_int)0x99ee7777, (u_int)0x8df67b7b,
+ (u_int)0x0dfff2f2, (u_int)0xbdd66b6b, (u_int)0xb1de6f6f, (u_int)0x5491c5c5,
+ (u_int)0x50603030, (u_int)0x03020101, (u_int)0xa9ce6767, (u_int)0x7d562b2b,
+ (u_int)0x19e7fefe, (u_int)0x62b5d7d7, (u_int)0xe64dabab, (u_int)0x9aec7676,
+ (u_int)0x458fcaca, (u_int)0x9d1f8282, (u_int)0x4089c9c9, (u_int)0x87fa7d7d,
+ (u_int)0x15effafa, (u_int)0xebb25959, (u_int)0xc98e4747, (u_int)0x0bfbf0f0,
+ (u_int)0xec41adad, (u_int)0x67b3d4d4, (u_int)0xfd5fa2a2, (u_int)0xea45afaf,
+ (u_int)0xbf239c9c, (u_int)0xf753a4a4, (u_int)0x96e47272, (u_int)0x5b9bc0c0,
+ (u_int)0xc275b7b7, (u_int)0x1ce1fdfd, (u_int)0xae3d9393, (u_int)0x6a4c2626,
+ (u_int)0x5a6c3636, (u_int)0x417e3f3f, (u_int)0x02f5f7f7, (u_int)0x4f83cccc,
+ (u_int)0x5c683434, (u_int)0xf451a5a5, (u_int)0x34d1e5e5, (u_int)0x08f9f1f1,
+ (u_int)0x93e27171, (u_int)0x73abd8d8, (u_int)0x53623131, (u_int)0x3f2a1515,
+ (u_int)0x0c080404, (u_int)0x5295c7c7, (u_int)0x65462323, (u_int)0x5e9dc3c3,
+ (u_int)0x28301818, (u_int)0xa1379696, (u_int)0x0f0a0505, (u_int)0xb52f9a9a,
+ (u_int)0x090e0707, (u_int)0x36241212, (u_int)0x9b1b8080, (u_int)0x3ddfe2e2,
+ (u_int)0x26cdebeb, (u_int)0x694e2727, (u_int)0xcd7fb2b2, (u_int)0x9fea7575,
+ (u_int)0x1b120909, (u_int)0x9e1d8383, (u_int)0x74582c2c, (u_int)0x2e341a1a,
+ (u_int)0x2d361b1b, (u_int)0xb2dc6e6e, (u_int)0xeeb45a5a, (u_int)0xfb5ba0a0,
+ (u_int)0xf6a45252, (u_int)0x4d763b3b, (u_int)0x61b7d6d6, (u_int)0xce7db3b3,
+ (u_int)0x7b522929, (u_int)0x3edde3e3, (u_int)0x715e2f2f, (u_int)0x97138484,
+ (u_int)0xf5a65353, (u_int)0x68b9d1d1, (u_int)0x00000000, (u_int)0x2cc1eded,
+ (u_int)0x60402020, (u_int)0x1fe3fcfc, (u_int)0xc879b1b1, (u_int)0xedb65b5b,
+ (u_int)0xbed46a6a, (u_int)0x468dcbcb, (u_int)0xd967bebe, (u_int)0x4b723939,
+ (u_int)0xde944a4a, (u_int)0xd4984c4c, (u_int)0xe8b05858, (u_int)0x4a85cfcf,
+ (u_int)0x6bbbd0d0, (u_int)0x2ac5efef, (u_int)0xe54faaaa, (u_int)0x16edfbfb,
+ (u_int)0xc5864343, (u_int)0xd79a4d4d, (u_int)0x55663333, (u_int)0x94118585,
+ (u_int)0xcf8a4545, (u_int)0x10e9f9f9, (u_int)0x06040202, (u_int)0x81fe7f7f,
+ (u_int)0xf0a05050, (u_int)0x44783c3c, (u_int)0xba259f9f, (u_int)0xe34ba8a8,
+ (u_int)0xf3a25151, (u_int)0xfe5da3a3, (u_int)0xc0804040, (u_int)0x8a058f8f,
+ (u_int)0xad3f9292, (u_int)0xbc219d9d, (u_int)0x48703838, (u_int)0x04f1f5f5,
+ (u_int)0xdf63bcbc, (u_int)0xc177b6b6, (u_int)0x75afdada, (u_int)0x63422121,
+ (u_int)0x30201010, (u_int)0x1ae5ffff, (u_int)0x0efdf3f3, (u_int)0x6dbfd2d2,
+ (u_int)0x4c81cdcd, (u_int)0x14180c0c, (u_int)0x35261313, (u_int)0x2fc3ecec,
+ (u_int)0xe1be5f5f, (u_int)0xa2359797, (u_int)0xcc884444, (u_int)0x392e1717,
+ (u_int)0x5793c4c4, (u_int)0xf255a7a7, (u_int)0x82fc7e7e, (u_int)0x477a3d3d,
+ (u_int)0xacc86464, (u_int)0xe7ba5d5d, (u_int)0x2b321919, (u_int)0x95e67373,
+ (u_int)0xa0c06060, (u_int)0x98198181, (u_int)0xd19e4f4f, (u_int)0x7fa3dcdc,
+ (u_int)0x66442222, (u_int)0x7e542a2a, (u_int)0xab3b9090, (u_int)0x830b8888,
+ (u_int)0xca8c4646, (u_int)0x29c7eeee, (u_int)0xd36bb8b8, (u_int)0x3c281414,
+ (u_int)0x79a7dede, (u_int)0xe2bc5e5e, (u_int)0x1d160b0b, (u_int)0x76addbdb,
+ (u_int)0x3bdbe0e0, (u_int)0x56643232, (u_int)0x4e743a3a, (u_int)0x1e140a0a,
+ (u_int)0xdb924949, (u_int)0x0a0c0606, (u_int)0x6c482424, (u_int)0xe4b85c5c,
+ (u_int)0x5d9fc2c2, (u_int)0x6ebdd3d3, (u_int)0xef43acac, (u_int)0xa6c46262,
+ (u_int)0xa8399191, (u_int)0xa4319595, (u_int)0x37d3e4e4, (u_int)0x8bf27979,
+ (u_int)0x32d5e7e7, (u_int)0x438bc8c8, (u_int)0x596e3737, (u_int)0xb7da6d6d,
+ (u_int)0x8c018d8d, (u_int)0x64b1d5d5, (u_int)0xd29c4e4e, (u_int)0xe049a9a9,
+ (u_int)0xb4d86c6c, (u_int)0xfaac5656, (u_int)0x07f3f4f4, (u_int)0x25cfeaea,
+ (u_int)0xafca6565, (u_int)0x8ef47a7a, (u_int)0xe947aeae, (u_int)0x18100808,
+ (u_int)0xd56fbaba, (u_int)0x88f07878, (u_int)0x6f4a2525, (u_int)0x725c2e2e,
+ (u_int)0x24381c1c, (u_int)0xf157a6a6, (u_int)0xc773b4b4, (u_int)0x5197c6c6,
+ (u_int)0x23cbe8e8, (u_int)0x7ca1dddd, (u_int)0x9ce87474, (u_int)0x213e1f1f,
+ (u_int)0xdd964b4b, (u_int)0xdc61bdbd, (u_int)0x860d8b8b, (u_int)0x850f8a8a,
+ (u_int)0x90e07070, (u_int)0x427c3e3e, (u_int)0xc471b5b5, (u_int)0xaacc6666,
+ (u_int)0xd8904848, (u_int)0x05060303, (u_int)0x01f7f6f6, (u_int)0x121c0e0e,
+ (u_int)0xa3c26161, (u_int)0x5f6a3535, (u_int)0xf9ae5757, (u_int)0xd069b9b9,
+ (u_int)0x91178686, (u_int)0x5899c1c1, (u_int)0x273a1d1d, (u_int)0xb9279e9e,
+ (u_int)0x38d9e1e1, (u_int)0x13ebf8f8, (u_int)0xb32b9898, (u_int)0x33221111,
+ (u_int)0xbbd26969, (u_int)0x70a9d9d9, (u_int)0x89078e8e, (u_int)0xa7339494,
+ (u_int)0xb62d9b9b, (u_int)0x223c1e1e, (u_int)0x92158787, (u_int)0x20c9e9e9,
+ (u_int)0x4987cece, (u_int)0xffaa5555, (u_int)0x78502828, (u_int)0x7aa5dfdf,
+ (u_int)0x8f038c8c, (u_int)0xf859a1a1, (u_int)0x80098989, (u_int)0x171a0d0d,
+ (u_int)0xda65bfbf, (u_int)0x31d7e6e6, (u_int)0xc6844242, (u_int)0xb8d06868,
+ (u_int)0xc3824141, (u_int)0xb0299999, (u_int)0x775a2d2d, (u_int)0x111e0f0f,
+ (u_int)0xcb7bb0b0, (u_int)0xfca85454, (u_int)0xd66dbbbb, (u_int)0x3a2c1616,
+};
+static const u32 Te2[256] = {
+ (u_int)0x63a5c663, (u_int)0x7c84f87c, (u_int)0x7799ee77, (u_int)0x7b8df67b,
+ (u_int)0xf20dfff2, (u_int)0x6bbdd66b, (u_int)0x6fb1de6f, (u_int)0xc55491c5,
+ (u_int)0x30506030, (u_int)0x01030201, (u_int)0x67a9ce67, (u_int)0x2b7d562b,
+ (u_int)0xfe19e7fe, (u_int)0xd762b5d7, (u_int)0xabe64dab, (u_int)0x769aec76,
+ (u_int)0xca458fca, (u_int)0x829d1f82, (u_int)0xc94089c9, (u_int)0x7d87fa7d,
+ (u_int)0xfa15effa, (u_int)0x59ebb259, (u_int)0x47c98e47, (u_int)0xf00bfbf0,
+ (u_int)0xadec41ad, (u_int)0xd467b3d4, (u_int)0xa2fd5fa2, (u_int)0xafea45af,
+ (u_int)0x9cbf239c, (u_int)0xa4f753a4, (u_int)0x7296e472, (u_int)0xc05b9bc0,
+ (u_int)0xb7c275b7, (u_int)0xfd1ce1fd, (u_int)0x93ae3d93, (u_int)0x266a4c26,
+ (u_int)0x365a6c36, (u_int)0x3f417e3f, (u_int)0xf702f5f7, (u_int)0xcc4f83cc,
+ (u_int)0x345c6834, (u_int)0xa5f451a5, (u_int)0xe534d1e5, (u_int)0xf108f9f1,
+ (u_int)0x7193e271, (u_int)0xd873abd8, (u_int)0x31536231, (u_int)0x153f2a15,
+ (u_int)0x040c0804, (u_int)0xc75295c7, (u_int)0x23654623, (u_int)0xc35e9dc3,
+ (u_int)0x18283018, (u_int)0x96a13796, (u_int)0x050f0a05, (u_int)0x9ab52f9a,
+ (u_int)0x07090e07, (u_int)0x12362412, (u_int)0x809b1b80, (u_int)0xe23ddfe2,
+ (u_int)0xeb26cdeb, (u_int)0x27694e27, (u_int)0xb2cd7fb2, (u_int)0x759fea75,
+ (u_int)0x091b1209, (u_int)0x839e1d83, (u_int)0x2c74582c, (u_int)0x1a2e341a,
+ (u_int)0x1b2d361b, (u_int)0x6eb2dc6e, (u_int)0x5aeeb45a, (u_int)0xa0fb5ba0,
+ (u_int)0x52f6a452, (u_int)0x3b4d763b, (u_int)0xd661b7d6, (u_int)0xb3ce7db3,
+ (u_int)0x297b5229, (u_int)0xe33edde3, (u_int)0x2f715e2f, (u_int)0x84971384,
+ (u_int)0x53f5a653, (u_int)0xd168b9d1, (u_int)0x00000000, (u_int)0xed2cc1ed,
+ (u_int)0x20604020, (u_int)0xfc1fe3fc, (u_int)0xb1c879b1, (u_int)0x5bedb65b,
+ (u_int)0x6abed46a, (u_int)0xcb468dcb, (u_int)0xbed967be, (u_int)0x394b7239,
+ (u_int)0x4ade944a, (u_int)0x4cd4984c, (u_int)0x58e8b058, (u_int)0xcf4a85cf,
+ (u_int)0xd06bbbd0, (u_int)0xef2ac5ef, (u_int)0xaae54faa, (u_int)0xfb16edfb,
+ (u_int)0x43c58643, (u_int)0x4dd79a4d, (u_int)0x33556633, (u_int)0x85941185,
+ (u_int)0x45cf8a45, (u_int)0xf910e9f9, (u_int)0x02060402, (u_int)0x7f81fe7f,
+ (u_int)0x50f0a050, (u_int)0x3c44783c, (u_int)0x9fba259f, (u_int)0xa8e34ba8,
+ (u_int)0x51f3a251, (u_int)0xa3fe5da3, (u_int)0x40c08040, (u_int)0x8f8a058f,
+ (u_int)0x92ad3f92, (u_int)0x9dbc219d, (u_int)0x38487038, (u_int)0xf504f1f5,
+ (u_int)0xbcdf63bc, (u_int)0xb6c177b6, (u_int)0xda75afda, (u_int)0x21634221,
+ (u_int)0x10302010, (u_int)0xff1ae5ff, (u_int)0xf30efdf3, (u_int)0xd26dbfd2,
+ (u_int)0xcd4c81cd, (u_int)0x0c14180c, (u_int)0x13352613, (u_int)0xec2fc3ec,
+ (u_int)0x5fe1be5f, (u_int)0x97a23597, (u_int)0x44cc8844, (u_int)0x17392e17,
+ (u_int)0xc45793c4, (u_int)0xa7f255a7, (u_int)0x7e82fc7e, (u_int)0x3d477a3d,
+ (u_int)0x64acc864, (u_int)0x5de7ba5d, (u_int)0x192b3219, (u_int)0x7395e673,
+ (u_int)0x60a0c060, (u_int)0x81981981, (u_int)0x4fd19e4f, (u_int)0xdc7fa3dc,
+ (u_int)0x22664422, (u_int)0x2a7e542a, (u_int)0x90ab3b90, (u_int)0x88830b88,
+ (u_int)0x46ca8c46, (u_int)0xee29c7ee, (u_int)0xb8d36bb8, (u_int)0x143c2814,
+ (u_int)0xde79a7de, (u_int)0x5ee2bc5e, (u_int)0x0b1d160b, (u_int)0xdb76addb,
+ (u_int)0xe03bdbe0, (u_int)0x32566432, (u_int)0x3a4e743a, (u_int)0x0a1e140a,
+ (u_int)0x49db9249, (u_int)0x060a0c06, (u_int)0x246c4824, (u_int)0x5ce4b85c,
+ (u_int)0xc25d9fc2, (u_int)0xd36ebdd3, (u_int)0xacef43ac, (u_int)0x62a6c462,
+ (u_int)0x91a83991, (u_int)0x95a43195, (u_int)0xe437d3e4, (u_int)0x798bf279,
+ (u_int)0xe732d5e7, (u_int)0xc8438bc8, (u_int)0x37596e37, (u_int)0x6db7da6d,
+ (u_int)0x8d8c018d, (u_int)0xd564b1d5, (u_int)0x4ed29c4e, (u_int)0xa9e049a9,
+ (u_int)0x6cb4d86c, (u_int)0x56faac56, (u_int)0xf407f3f4, (u_int)0xea25cfea,
+ (u_int)0x65afca65, (u_int)0x7a8ef47a, (u_int)0xaee947ae, (u_int)0x08181008,
+ (u_int)0xbad56fba, (u_int)0x7888f078, (u_int)0x256f4a25, (u_int)0x2e725c2e,
+ (u_int)0x1c24381c, (u_int)0xa6f157a6, (u_int)0xb4c773b4, (u_int)0xc65197c6,
+ (u_int)0xe823cbe8, (u_int)0xdd7ca1dd, (u_int)0x749ce874, (u_int)0x1f213e1f,
+ (u_int)0x4bdd964b, (u_int)0xbddc61bd, (u_int)0x8b860d8b, (u_int)0x8a850f8a,
+ (u_int)0x7090e070, (u_int)0x3e427c3e, (u_int)0xb5c471b5, (u_int)0x66aacc66,
+ (u_int)0x48d89048, (u_int)0x03050603, (u_int)0xf601f7f6, (u_int)0x0e121c0e,
+ (u_int)0x61a3c261, (u_int)0x355f6a35, (u_int)0x57f9ae57, (u_int)0xb9d069b9,
+ (u_int)0x86911786, (u_int)0xc15899c1, (u_int)0x1d273a1d, (u_int)0x9eb9279e,
+ (u_int)0xe138d9e1, (u_int)0xf813ebf8, (u_int)0x98b32b98, (u_int)0x11332211,
+ (u_int)0x69bbd269, (u_int)0xd970a9d9, (u_int)0x8e89078e, (u_int)0x94a73394,
+ (u_int)0x9bb62d9b, (u_int)0x1e223c1e, (u_int)0x87921587, (u_int)0xe920c9e9,
+ (u_int)0xce4987ce, (u_int)0x55ffaa55, (u_int)0x28785028, (u_int)0xdf7aa5df,
+ (u_int)0x8c8f038c, (u_int)0xa1f859a1, (u_int)0x89800989, (u_int)0x0d171a0d,
+ (u_int)0xbfda65bf, (u_int)0xe631d7e6, (u_int)0x42c68442, (u_int)0x68b8d068,
+ (u_int)0x41c38241, (u_int)0x99b02999, (u_int)0x2d775a2d, (u_int)0x0f111e0f,
+ (u_int)0xb0cb7bb0, (u_int)0x54fca854, (u_int)0xbbd66dbb, (u_int)0x163a2c16,
+};
+static const u32 Te3[256] = {
+ (u_int)0x6363a5c6, (u_int)0x7c7c84f8, (u_int)0x777799ee, (u_int)0x7b7b8df6,
+ (u_int)0xf2f20dff, (u_int)0x6b6bbdd6, (u_int)0x6f6fb1de, (u_int)0xc5c55491,
+ (u_int)0x30305060, (u_int)0x01010302, (u_int)0x6767a9ce, (u_int)0x2b2b7d56,
+ (u_int)0xfefe19e7, (u_int)0xd7d762b5, (u_int)0xababe64d, (u_int)0x76769aec,
+ (u_int)0xcaca458f, (u_int)0x82829d1f, (u_int)0xc9c94089, (u_int)0x7d7d87fa,
+ (u_int)0xfafa15ef, (u_int)0x5959ebb2, (u_int)0x4747c98e, (u_int)0xf0f00bfb,
+ (u_int)0xadadec41, (u_int)0xd4d467b3, (u_int)0xa2a2fd5f, (u_int)0xafafea45,
+ (u_int)0x9c9cbf23, (u_int)0xa4a4f753, (u_int)0x727296e4, (u_int)0xc0c05b9b,
+ (u_int)0xb7b7c275, (u_int)0xfdfd1ce1, (u_int)0x9393ae3d, (u_int)0x26266a4c,
+ (u_int)0x36365a6c, (u_int)0x3f3f417e, (u_int)0xf7f702f5, (u_int)0xcccc4f83,
+ (u_int)0x34345c68, (u_int)0xa5a5f451, (u_int)0xe5e534d1, (u_int)0xf1f108f9,
+ (u_int)0x717193e2, (u_int)0xd8d873ab, (u_int)0x31315362, (u_int)0x15153f2a,
+ (u_int)0x04040c08, (u_int)0xc7c75295, (u_int)0x23236546, (u_int)0xc3c35e9d,
+ (u_int)0x18182830, (u_int)0x9696a137, (u_int)0x05050f0a, (u_int)0x9a9ab52f,
+ (u_int)0x0707090e, (u_int)0x12123624, (u_int)0x80809b1b, (u_int)0xe2e23ddf,
+ (u_int)0xebeb26cd, (u_int)0x2727694e, (u_int)0xb2b2cd7f, (u_int)0x75759fea,
+ (u_int)0x09091b12, (u_int)0x83839e1d, (u_int)0x2c2c7458, (u_int)0x1a1a2e34,
+ (u_int)0x1b1b2d36, (u_int)0x6e6eb2dc, (u_int)0x5a5aeeb4, (u_int)0xa0a0fb5b,
+ (u_int)0x5252f6a4, (u_int)0x3b3b4d76, (u_int)0xd6d661b7, (u_int)0xb3b3ce7d,
+ (u_int)0x29297b52, (u_int)0xe3e33edd, (u_int)0x2f2f715e, (u_int)0x84849713,
+ (u_int)0x5353f5a6, (u_int)0xd1d168b9, (u_int)0x00000000, (u_int)0xeded2cc1,
+ (u_int)0x20206040, (u_int)0xfcfc1fe3, (u_int)0xb1b1c879, (u_int)0x5b5bedb6,
+ (u_int)0x6a6abed4, (u_int)0xcbcb468d, (u_int)0xbebed967, (u_int)0x39394b72,
+ (u_int)0x4a4ade94, (u_int)0x4c4cd498, (u_int)0x5858e8b0, (u_int)0xcfcf4a85,
+ (u_int)0xd0d06bbb, (u_int)0xefef2ac5, (u_int)0xaaaae54f, (u_int)0xfbfb16ed,
+ (u_int)0x4343c586, (u_int)0x4d4dd79a, (u_int)0x33335566, (u_int)0x85859411,
+ (u_int)0x4545cf8a, (u_int)0xf9f910e9, (u_int)0x02020604, (u_int)0x7f7f81fe,
+ (u_int)0x5050f0a0, (u_int)0x3c3c4478, (u_int)0x9f9fba25, (u_int)0xa8a8e34b,
+ (u_int)0x5151f3a2, (u_int)0xa3a3fe5d, (u_int)0x4040c080, (u_int)0x8f8f8a05,
+ (u_int)0x9292ad3f, (u_int)0x9d9dbc21, (u_int)0x38384870, (u_int)0xf5f504f1,
+ (u_int)0xbcbcdf63, (u_int)0xb6b6c177, (u_int)0xdada75af, (u_int)0x21216342,
+ (u_int)0x10103020, (u_int)0xffff1ae5, (u_int)0xf3f30efd, (u_int)0xd2d26dbf,
+ (u_int)0xcdcd4c81, (u_int)0x0c0c1418, (u_int)0x13133526, (u_int)0xecec2fc3,
+ (u_int)0x5f5fe1be, (u_int)0x9797a235, (u_int)0x4444cc88, (u_int)0x1717392e,
+ (u_int)0xc4c45793, (u_int)0xa7a7f255, (u_int)0x7e7e82fc, (u_int)0x3d3d477a,
+ (u_int)0x6464acc8, (u_int)0x5d5de7ba, (u_int)0x19192b32, (u_int)0x737395e6,
+ (u_int)0x6060a0c0, (u_int)0x81819819, (u_int)0x4f4fd19e, (u_int)0xdcdc7fa3,
+ (u_int)0x22226644, (u_int)0x2a2a7e54, (u_int)0x9090ab3b, (u_int)0x8888830b,
+ (u_int)0x4646ca8c, (u_int)0xeeee29c7, (u_int)0xb8b8d36b, (u_int)0x14143c28,
+ (u_int)0xdede79a7, (u_int)0x5e5ee2bc, (u_int)0x0b0b1d16, (u_int)0xdbdb76ad,
+ (u_int)0xe0e03bdb, (u_int)0x32325664, (u_int)0x3a3a4e74, (u_int)0x0a0a1e14,
+ (u_int)0x4949db92, (u_int)0x06060a0c, (u_int)0x24246c48, (u_int)0x5c5ce4b8,
+ (u_int)0xc2c25d9f, (u_int)0xd3d36ebd, (u_int)0xacacef43, (u_int)0x6262a6c4,
+ (u_int)0x9191a839, (u_int)0x9595a431, (u_int)0xe4e437d3, (u_int)0x79798bf2,
+ (u_int)0xe7e732d5, (u_int)0xc8c8438b, (u_int)0x3737596e, (u_int)0x6d6db7da,
+ (u_int)0x8d8d8c01, (u_int)0xd5d564b1, (u_int)0x4e4ed29c, (u_int)0xa9a9e049,
+ (u_int)0x6c6cb4d8, (u_int)0x5656faac, (u_int)0xf4f407f3, (u_int)0xeaea25cf,
+ (u_int)0x6565afca, (u_int)0x7a7a8ef4, (u_int)0xaeaee947, (u_int)0x08081810,
+ (u_int)0xbabad56f, (u_int)0x787888f0, (u_int)0x25256f4a, (u_int)0x2e2e725c,
+ (u_int)0x1c1c2438, (u_int)0xa6a6f157, (u_int)0xb4b4c773, (u_int)0xc6c65197,
+ (u_int)0xe8e823cb, (u_int)0xdddd7ca1, (u_int)0x74749ce8, (u_int)0x1f1f213e,
+ (u_int)0x4b4bdd96, (u_int)0xbdbddc61, (u_int)0x8b8b860d, (u_int)0x8a8a850f,
+ (u_int)0x707090e0, (u_int)0x3e3e427c, (u_int)0xb5b5c471, (u_int)0x6666aacc,
+ (u_int)0x4848d890, (u_int)0x03030506, (u_int)0xf6f601f7, (u_int)0x0e0e121c,
+ (u_int)0x6161a3c2, (u_int)0x35355f6a, (u_int)0x5757f9ae, (u_int)0xb9b9d069,
+ (u_int)0x86869117, (u_int)0xc1c15899, (u_int)0x1d1d273a, (u_int)0x9e9eb927,
+ (u_int)0xe1e138d9, (u_int)0xf8f813eb, (u_int)0x9898b32b, (u_int)0x11113322,
+ (u_int)0x6969bbd2, (u_int)0xd9d970a9, (u_int)0x8e8e8907, (u_int)0x9494a733,
+ (u_int)0x9b9bb62d, (u_int)0x1e1e223c, (u_int)0x87879215, (u_int)0xe9e920c9,
+ (u_int)0xcece4987, (u_int)0x5555ffaa, (u_int)0x28287850, (u_int)0xdfdf7aa5,
+ (u_int)0x8c8c8f03, (u_int)0xa1a1f859, (u_int)0x89898009, (u_int)0x0d0d171a,
+ (u_int)0xbfbfda65, (u_int)0xe6e631d7, (u_int)0x4242c684, (u_int)0x6868b8d0,
+ (u_int)0x4141c382, (u_int)0x9999b029, (u_int)0x2d2d775a, (u_int)0x0f0f111e,
+ (u_int)0xb0b0cb7b, (u_int)0x5454fca8, (u_int)0xbbbbd66d, (u_int)0x16163a2c,
+};
+static const u32 Te4[256] = {
+ (u_int)0x63636363, (u_int)0x7c7c7c7c, (u_int)0x77777777, (u_int)0x7b7b7b7b,
+ (u_int)0xf2f2f2f2, (u_int)0x6b6b6b6b, (u_int)0x6f6f6f6f, (u_int)0xc5c5c5c5,
+ (u_int)0x30303030, (u_int)0x01010101, (u_int)0x67676767, (u_int)0x2b2b2b2b,
+ (u_int)0xfefefefe, (u_int)0xd7d7d7d7, (u_int)0xabababab, (u_int)0x76767676,
+ (u_int)0xcacacaca, (u_int)0x82828282, (u_int)0xc9c9c9c9, (u_int)0x7d7d7d7d,
+ (u_int)0xfafafafa, (u_int)0x59595959, (u_int)0x47474747, (u_int)0xf0f0f0f0,
+ (u_int)0xadadadad, (u_int)0xd4d4d4d4, (u_int)0xa2a2a2a2, (u_int)0xafafafaf,
+ (u_int)0x9c9c9c9c, (u_int)0xa4a4a4a4, (u_int)0x72727272, (u_int)0xc0c0c0c0,
+ (u_int)0xb7b7b7b7, (u_int)0xfdfdfdfd, (u_int)0x93939393, (u_int)0x26262626,
+ (u_int)0x36363636, (u_int)0x3f3f3f3f, (u_int)0xf7f7f7f7, (u_int)0xcccccccc,
+ (u_int)0x34343434, (u_int)0xa5a5a5a5, (u_int)0xe5e5e5e5, (u_int)0xf1f1f1f1,
+ (u_int)0x71717171, (u_int)0xd8d8d8d8, (u_int)0x31313131, (u_int)0x15151515,
+ (u_int)0x04040404, (u_int)0xc7c7c7c7, (u_int)0x23232323, (u_int)0xc3c3c3c3,
+ (u_int)0x18181818, (u_int)0x96969696, (u_int)0x05050505, (u_int)0x9a9a9a9a,
+ (u_int)0x07070707, (u_int)0x12121212, (u_int)0x80808080, (u_int)0xe2e2e2e2,
+ (u_int)0xebebebeb, (u_int)0x27272727, (u_int)0xb2b2b2b2, (u_int)0x75757575,
+ (u_int)0x09090909, (u_int)0x83838383, (u_int)0x2c2c2c2c, (u_int)0x1a1a1a1a,
+ (u_int)0x1b1b1b1b, (u_int)0x6e6e6e6e, (u_int)0x5a5a5a5a, (u_int)0xa0a0a0a0,
+ (u_int)0x52525252, (u_int)0x3b3b3b3b, (u_int)0xd6d6d6d6, (u_int)0xb3b3b3b3,
+ (u_int)0x29292929, (u_int)0xe3e3e3e3, (u_int)0x2f2f2f2f, (u_int)0x84848484,
+ (u_int)0x53535353, (u_int)0xd1d1d1d1, (u_int)0x00000000, (u_int)0xedededed,
+ (u_int)0x20202020, (u_int)0xfcfcfcfc, (u_int)0xb1b1b1b1, (u_int)0x5b5b5b5b,
+ (u_int)0x6a6a6a6a, (u_int)0xcbcbcbcb, (u_int)0xbebebebe, (u_int)0x39393939,
+ (u_int)0x4a4a4a4a, (u_int)0x4c4c4c4c, (u_int)0x58585858, (u_int)0xcfcfcfcf,
+ (u_int)0xd0d0d0d0, (u_int)0xefefefef, (u_int)0xaaaaaaaa, (u_int)0xfbfbfbfb,
+ (u_int)0x43434343, (u_int)0x4d4d4d4d, (u_int)0x33333333, (u_int)0x85858585,
+ (u_int)0x45454545, (u_int)0xf9f9f9f9, (u_int)0x02020202, (u_int)0x7f7f7f7f,
+ (u_int)0x50505050, (u_int)0x3c3c3c3c, (u_int)0x9f9f9f9f, (u_int)0xa8a8a8a8,
+ (u_int)0x51515151, (u_int)0xa3a3a3a3, (u_int)0x40404040, (u_int)0x8f8f8f8f,
+ (u_int)0x92929292, (u_int)0x9d9d9d9d, (u_int)0x38383838, (u_int)0xf5f5f5f5,
+ (u_int)0xbcbcbcbc, (u_int)0xb6b6b6b6, (u_int)0xdadadada, (u_int)0x21212121,
+ (u_int)0x10101010, (u_int)0xffffffff, (u_int)0xf3f3f3f3, (u_int)0xd2d2d2d2,
+ (u_int)0xcdcdcdcd, (u_int)0x0c0c0c0c, (u_int)0x13131313, (u_int)0xecececec,
+ (u_int)0x5f5f5f5f, (u_int)0x97979797, (u_int)0x44444444, (u_int)0x17171717,
+ (u_int)0xc4c4c4c4, (u_int)0xa7a7a7a7, (u_int)0x7e7e7e7e, (u_int)0x3d3d3d3d,
+ (u_int)0x64646464, (u_int)0x5d5d5d5d, (u_int)0x19191919, (u_int)0x73737373,
+ (u_int)0x60606060, (u_int)0x81818181, (u_int)0x4f4f4f4f, (u_int)0xdcdcdcdc,
+ (u_int)0x22222222, (u_int)0x2a2a2a2a, (u_int)0x90909090, (u_int)0x88888888,
+ (u_int)0x46464646, (u_int)0xeeeeeeee, (u_int)0xb8b8b8b8, (u_int)0x14141414,
+ (u_int)0xdededede, (u_int)0x5e5e5e5e, (u_int)0x0b0b0b0b, (u_int)0xdbdbdbdb,
+ (u_int)0xe0e0e0e0, (u_int)0x32323232, (u_int)0x3a3a3a3a, (u_int)0x0a0a0a0a,
+ (u_int)0x49494949, (u_int)0x06060606, (u_int)0x24242424, (u_int)0x5c5c5c5c,
+ (u_int)0xc2c2c2c2, (u_int)0xd3d3d3d3, (u_int)0xacacacac, (u_int)0x62626262,
+ (u_int)0x91919191, (u_int)0x95959595, (u_int)0xe4e4e4e4, (u_int)0x79797979,
+ (u_int)0xe7e7e7e7, (u_int)0xc8c8c8c8, (u_int)0x37373737, (u_int)0x6d6d6d6d,
+ (u_int)0x8d8d8d8d, (u_int)0xd5d5d5d5, (u_int)0x4e4e4e4e, (u_int)0xa9a9a9a9,
+ (u_int)0x6c6c6c6c, (u_int)0x56565656, (u_int)0xf4f4f4f4, (u_int)0xeaeaeaea,
+ (u_int)0x65656565, (u_int)0x7a7a7a7a, (u_int)0xaeaeaeae, (u_int)0x08080808,
+ (u_int)0xbabababa, (u_int)0x78787878, (u_int)0x25252525, (u_int)0x2e2e2e2e,
+ (u_int)0x1c1c1c1c, (u_int)0xa6a6a6a6, (u_int)0xb4b4b4b4, (u_int)0xc6c6c6c6,
+ (u_int)0xe8e8e8e8, (u_int)0xdddddddd, (u_int)0x74747474, (u_int)0x1f1f1f1f,
+ (u_int)0x4b4b4b4b, (u_int)0xbdbdbdbd, (u_int)0x8b8b8b8b, (u_int)0x8a8a8a8a,
+ (u_int)0x70707070, (u_int)0x3e3e3e3e, (u_int)0xb5b5b5b5, (u_int)0x66666666,
+ (u_int)0x48484848, (u_int)0x03030303, (u_int)0xf6f6f6f6, (u_int)0x0e0e0e0e,
+ (u_int)0x61616161, (u_int)0x35353535, (u_int)0x57575757, (u_int)0xb9b9b9b9,
+ (u_int)0x86868686, (u_int)0xc1c1c1c1, (u_int)0x1d1d1d1d, (u_int)0x9e9e9e9e,
+ (u_int)0xe1e1e1e1, (u_int)0xf8f8f8f8, (u_int)0x98989898, (u_int)0x11111111,
+ (u_int)0x69696969, (u_int)0xd9d9d9d9, (u_int)0x8e8e8e8e, (u_int)0x94949494,
+ (u_int)0x9b9b9b9b, (u_int)0x1e1e1e1e, (u_int)0x87878787, (u_int)0xe9e9e9e9,
+ (u_int)0xcececece, (u_int)0x55555555, (u_int)0x28282828, (u_int)0xdfdfdfdf,
+ (u_int)0x8c8c8c8c, (u_int)0xa1a1a1a1, (u_int)0x89898989, (u_int)0x0d0d0d0d,
+ (u_int)0xbfbfbfbf, (u_int)0xe6e6e6e6, (u_int)0x42424242, (u_int)0x68686868,
+ (u_int)0x41414141, (u_int)0x99999999, (u_int)0x2d2d2d2d, (u_int)0x0f0f0f0f,
+ (u_int)0xb0b0b0b0, (u_int)0x54545454, (u_int)0xbbbbbbbb, (u_int)0x16161616,
+};
+static const u32 Td0[256] = {
+ (u_int)0x51f4a750, (u_int)0x7e416553, (u_int)0x1a17a4c3, (u_int)0x3a275e96,
+ (u_int)0x3bab6bcb, (u_int)0x1f9d45f1, (u_int)0xacfa58ab, (u_int)0x4be30393,
+ (u_int)0x2030fa55, (u_int)0xad766df6, (u_int)0x88cc7691, (u_int)0xf5024c25,
+ (u_int)0x4fe5d7fc, (u_int)0xc52acbd7, (u_int)0x26354480, (u_int)0xb562a38f,
+ (u_int)0xdeb15a49, (u_int)0x25ba1b67, (u_int)0x45ea0e98, (u_int)0x5dfec0e1,
+ (u_int)0xc32f7502, (u_int)0x814cf012, (u_int)0x8d4697a3, (u_int)0x6bd3f9c6,
+ (u_int)0x038f5fe7, (u_int)0x15929c95, (u_int)0xbf6d7aeb, (u_int)0x955259da,
+ (u_int)0xd4be832d, (u_int)0x587421d3, (u_int)0x49e06929, (u_int)0x8ec9c844,
+ (u_int)0x75c2896a, (u_int)0xf48e7978, (u_int)0x99583e6b, (u_int)0x27b971dd,
+ (u_int)0xbee14fb6, (u_int)0xf088ad17, (u_int)0xc920ac66, (u_int)0x7dce3ab4,
+ (u_int)0x63df4a18, (u_int)0xe51a3182, (u_int)0x97513360, (u_int)0x62537f45,
+ (u_int)0xb16477e0, (u_int)0xbb6bae84, (u_int)0xfe81a01c, (u_int)0xf9082b94,
+ (u_int)0x70486858, (u_int)0x8f45fd19, (u_int)0x94de6c87, (u_int)0x527bf8b7,
+ (u_int)0xab73d323, (u_int)0x724b02e2, (u_int)0xe31f8f57, (u_int)0x6655ab2a,
+ (u_int)0xb2eb2807, (u_int)0x2fb5c203, (u_int)0x86c57b9a, (u_int)0xd33708a5,
+ (u_int)0x302887f2, (u_int)0x23bfa5b2, (u_int)0x02036aba, (u_int)0xed16825c,
+ (u_int)0x8acf1c2b, (u_int)0xa779b492, (u_int)0xf307f2f0, (u_int)0x4e69e2a1,
+ (u_int)0x65daf4cd, (u_int)0x0605bed5, (u_int)0xd134621f, (u_int)0xc4a6fe8a,
+ (u_int)0x342e539d, (u_int)0xa2f355a0, (u_int)0x058ae132, (u_int)0xa4f6eb75,
+ (u_int)0x0b83ec39, (u_int)0x4060efaa, (u_int)0x5e719f06, (u_int)0xbd6e1051,
+ (u_int)0x3e218af9, (u_int)0x96dd063d, (u_int)0xdd3e05ae, (u_int)0x4de6bd46,
+ (u_int)0x91548db5, (u_int)0x71c45d05, (u_int)0x0406d46f, (u_int)0x605015ff,
+ (u_int)0x1998fb24, (u_int)0xd6bde997, (u_int)0x894043cc, (u_int)0x67d99e77,
+ (u_int)0xb0e842bd, (u_int)0x07898b88, (u_int)0xe7195b38, (u_int)0x79c8eedb,
+ (u_int)0xa17c0a47, (u_int)0x7c420fe9, (u_int)0xf8841ec9, (u_int)0x00000000,
+ (u_int)0x09808683, (u_int)0x322bed48, (u_int)0x1e1170ac, (u_int)0x6c5a724e,
+ (u_int)0xfd0efffb, (u_int)0x0f853856, (u_int)0x3daed51e, (u_int)0x362d3927,
+ (u_int)0x0a0fd964, (u_int)0x685ca621, (u_int)0x9b5b54d1, (u_int)0x24362e3a,
+ (u_int)0x0c0a67b1, (u_int)0x9357e70f, (u_int)0xb4ee96d2, (u_int)0x1b9b919e,
+ (u_int)0x80c0c54f, (u_int)0x61dc20a2, (u_int)0x5a774b69, (u_int)0x1c121a16,
+ (u_int)0xe293ba0a, (u_int)0xc0a02ae5, (u_int)0x3c22e043, (u_int)0x121b171d,
+ (u_int)0x0e090d0b, (u_int)0xf28bc7ad, (u_int)0x2db6a8b9, (u_int)0x141ea9c8,
+ (u_int)0x57f11985, (u_int)0xaf75074c, (u_int)0xee99ddbb, (u_int)0xa37f60fd,
+ (u_int)0xf701269f, (u_int)0x5c72f5bc, (u_int)0x44663bc5, (u_int)0x5bfb7e34,
+ (u_int)0x8b432976, (u_int)0xcb23c6dc, (u_int)0xb6edfc68, (u_int)0xb8e4f163,
+ (u_int)0xd731dcca, (u_int)0x42638510, (u_int)0x13972240, (u_int)0x84c61120,
+ (u_int)0x854a247d, (u_int)0xd2bb3df8, (u_int)0xaef93211, (u_int)0xc729a16d,
+ (u_int)0x1d9e2f4b, (u_int)0xdcb230f3, (u_int)0x0d8652ec, (u_int)0x77c1e3d0,
+ (u_int)0x2bb3166c, (u_int)0xa970b999, (u_int)0x119448fa, (u_int)0x47e96422,
+ (u_int)0xa8fc8cc4, (u_int)0xa0f03f1a, (u_int)0x567d2cd8, (u_int)0x223390ef,
+ (u_int)0x87494ec7, (u_int)0xd938d1c1, (u_int)0x8ccaa2fe, (u_int)0x98d40b36,
+ (u_int)0xa6f581cf, (u_int)0xa57ade28, (u_int)0xdab78e26, (u_int)0x3fadbfa4,
+ (u_int)0x2c3a9de4, (u_int)0x5078920d, (u_int)0x6a5fcc9b, (u_int)0x547e4662,
+ (u_int)0xf68d13c2, (u_int)0x90d8b8e8, (u_int)0x2e39f75e, (u_int)0x82c3aff5,
+ (u_int)0x9f5d80be, (u_int)0x69d0937c, (u_int)0x6fd52da9, (u_int)0xcf2512b3,
+ (u_int)0xc8ac993b, (u_int)0x10187da7, (u_int)0xe89c636e, (u_int)0xdb3bbb7b,
+ (u_int)0xcd267809, (u_int)0x6e5918f4, (u_int)0xec9ab701, (u_int)0x834f9aa8,
+ (u_int)0xe6956e65, (u_int)0xaaffe67e, (u_int)0x21bccf08, (u_int)0xef15e8e6,
+ (u_int)0xbae79bd9, (u_int)0x4a6f36ce, (u_int)0xea9f09d4, (u_int)0x29b07cd6,
+ (u_int)0x31a4b2af, (u_int)0x2a3f2331, (u_int)0xc6a59430, (u_int)0x35a266c0,
+ (u_int)0x744ebc37, (u_int)0xfc82caa6, (u_int)0xe090d0b0, (u_int)0x33a7d815,
+ (u_int)0xf104984a, (u_int)0x41ecdaf7, (u_int)0x7fcd500e, (u_int)0x1791f62f,
+ (u_int)0x764dd68d, (u_int)0x43efb04d, (u_int)0xccaa4d54, (u_int)0xe49604df,
+ (u_int)0x9ed1b5e3, (u_int)0x4c6a881b, (u_int)0xc12c1fb8, (u_int)0x4665517f,
+ (u_int)0x9d5eea04, (u_int)0x018c355d, (u_int)0xfa877473, (u_int)0xfb0b412e,
+ (u_int)0xb3671d5a, (u_int)0x92dbd252, (u_int)0xe9105633, (u_int)0x6dd64713,
+ (u_int)0x9ad7618c, (u_int)0x37a10c7a, (u_int)0x59f8148e, (u_int)0xeb133c89,
+ (u_int)0xcea927ee, (u_int)0xb761c935, (u_int)0xe11ce5ed, (u_int)0x7a47b13c,
+ (u_int)0x9cd2df59, (u_int)0x55f2733f, (u_int)0x1814ce79, (u_int)0x73c737bf,
+ (u_int)0x53f7cdea, (u_int)0x5ffdaa5b, (u_int)0xdf3d6f14, (u_int)0x7844db86,
+ (u_int)0xcaaff381, (u_int)0xb968c43e, (u_int)0x3824342c, (u_int)0xc2a3405f,
+ (u_int)0x161dc372, (u_int)0xbce2250c, (u_int)0x283c498b, (u_int)0xff0d9541,
+ (u_int)0x39a80171, (u_int)0x080cb3de, (u_int)0xd8b4e49c, (u_int)0x6456c190,
+ (u_int)0x7bcb8461, (u_int)0xd532b670, (u_int)0x486c5c74, (u_int)0xd0b85742,
+};
+static const u32 Td1[256] = {
+ (u_int)0x5051f4a7, (u_int)0x537e4165, (u_int)0xc31a17a4, (u_int)0x963a275e,
+ (u_int)0xcb3bab6b, (u_int)0xf11f9d45, (u_int)0xabacfa58, (u_int)0x934be303,
+ (u_int)0x552030fa, (u_int)0xf6ad766d, (u_int)0x9188cc76, (u_int)0x25f5024c,
+ (u_int)0xfc4fe5d7, (u_int)0xd7c52acb, (u_int)0x80263544, (u_int)0x8fb562a3,
+ (u_int)0x49deb15a, (u_int)0x6725ba1b, (u_int)0x9845ea0e, (u_int)0xe15dfec0,
+ (u_int)0x02c32f75, (u_int)0x12814cf0, (u_int)0xa38d4697, (u_int)0xc66bd3f9,
+ (u_int)0xe7038f5f, (u_int)0x9515929c, (u_int)0xebbf6d7a, (u_int)0xda955259,
+ (u_int)0x2dd4be83, (u_int)0xd3587421, (u_int)0x2949e069, (u_int)0x448ec9c8,
+ (u_int)0x6a75c289, (u_int)0x78f48e79, (u_int)0x6b99583e, (u_int)0xdd27b971,
+ (u_int)0xb6bee14f, (u_int)0x17f088ad, (u_int)0x66c920ac, (u_int)0xb47dce3a,
+ (u_int)0x1863df4a, (u_int)0x82e51a31, (u_int)0x60975133, (u_int)0x4562537f,
+ (u_int)0xe0b16477, (u_int)0x84bb6bae, (u_int)0x1cfe81a0, (u_int)0x94f9082b,
+ (u_int)0x58704868, (u_int)0x198f45fd, (u_int)0x8794de6c, (u_int)0xb7527bf8,
+ (u_int)0x23ab73d3, (u_int)0xe2724b02, (u_int)0x57e31f8f, (u_int)0x2a6655ab,
+ (u_int)0x07b2eb28, (u_int)0x032fb5c2, (u_int)0x9a86c57b, (u_int)0xa5d33708,
+ (u_int)0xf2302887, (u_int)0xb223bfa5, (u_int)0xba02036a, (u_int)0x5ced1682,
+ (u_int)0x2b8acf1c, (u_int)0x92a779b4, (u_int)0xf0f307f2, (u_int)0xa14e69e2,
+ (u_int)0xcd65daf4, (u_int)0xd50605be, (u_int)0x1fd13462, (u_int)0x8ac4a6fe,
+ (u_int)0x9d342e53, (u_int)0xa0a2f355, (u_int)0x32058ae1, (u_int)0x75a4f6eb,
+ (u_int)0x390b83ec, (u_int)0xaa4060ef, (u_int)0x065e719f, (u_int)0x51bd6e10,
+ (u_int)0xf93e218a, (u_int)0x3d96dd06, (u_int)0xaedd3e05, (u_int)0x464de6bd,
+ (u_int)0xb591548d, (u_int)0x0571c45d, (u_int)0x6f0406d4, (u_int)0xff605015,
+ (u_int)0x241998fb, (u_int)0x97d6bde9, (u_int)0xcc894043, (u_int)0x7767d99e,
+ (u_int)0xbdb0e842, (u_int)0x8807898b, (u_int)0x38e7195b, (u_int)0xdb79c8ee,
+ (u_int)0x47a17c0a, (u_int)0xe97c420f, (u_int)0xc9f8841e, (u_int)0x00000000,
+ (u_int)0x83098086, (u_int)0x48322bed, (u_int)0xac1e1170, (u_int)0x4e6c5a72,
+ (u_int)0xfbfd0eff, (u_int)0x560f8538, (u_int)0x1e3daed5, (u_int)0x27362d39,
+ (u_int)0x640a0fd9, (u_int)0x21685ca6, (u_int)0xd19b5b54, (u_int)0x3a24362e,
+ (u_int)0xb10c0a67, (u_int)0x0f9357e7, (u_int)0xd2b4ee96, (u_int)0x9e1b9b91,
+ (u_int)0x4f80c0c5, (u_int)0xa261dc20, (u_int)0x695a774b, (u_int)0x161c121a,
+ (u_int)0x0ae293ba, (u_int)0xe5c0a02a, (u_int)0x433c22e0, (u_int)0x1d121b17,
+ (u_int)0x0b0e090d, (u_int)0xadf28bc7, (u_int)0xb92db6a8, (u_int)0xc8141ea9,
+ (u_int)0x8557f119, (u_int)0x4caf7507, (u_int)0xbbee99dd, (u_int)0xfda37f60,
+ (u_int)0x9ff70126, (u_int)0xbc5c72f5, (u_int)0xc544663b, (u_int)0x345bfb7e,
+ (u_int)0x768b4329, (u_int)0xdccb23c6, (u_int)0x68b6edfc, (u_int)0x63b8e4f1,
+ (u_int)0xcad731dc, (u_int)0x10426385, (u_int)0x40139722, (u_int)0x2084c611,
+ (u_int)0x7d854a24, (u_int)0xf8d2bb3d, (u_int)0x11aef932, (u_int)0x6dc729a1,
+ (u_int)0x4b1d9e2f, (u_int)0xf3dcb230, (u_int)0xec0d8652, (u_int)0xd077c1e3,
+ (u_int)0x6c2bb316, (u_int)0x99a970b9, (u_int)0xfa119448, (u_int)0x2247e964,
+ (u_int)0xc4a8fc8c, (u_int)0x1aa0f03f, (u_int)0xd8567d2c, (u_int)0xef223390,
+ (u_int)0xc787494e, (u_int)0xc1d938d1, (u_int)0xfe8ccaa2, (u_int)0x3698d40b,
+ (u_int)0xcfa6f581, (u_int)0x28a57ade, (u_int)0x26dab78e, (u_int)0xa43fadbf,
+ (u_int)0xe42c3a9d, (u_int)0x0d507892, (u_int)0x9b6a5fcc, (u_int)0x62547e46,
+ (u_int)0xc2f68d13, (u_int)0xe890d8b8, (u_int)0x5e2e39f7, (u_int)0xf582c3af,
+ (u_int)0xbe9f5d80, (u_int)0x7c69d093, (u_int)0xa96fd52d, (u_int)0xb3cf2512,
+ (u_int)0x3bc8ac99, (u_int)0xa710187d, (u_int)0x6ee89c63, (u_int)0x7bdb3bbb,
+ (u_int)0x09cd2678, (u_int)0xf46e5918, (u_int)0x01ec9ab7, (u_int)0xa8834f9a,
+ (u_int)0x65e6956e, (u_int)0x7eaaffe6, (u_int)0x0821bccf, (u_int)0xe6ef15e8,
+ (u_int)0xd9bae79b, (u_int)0xce4a6f36, (u_int)0xd4ea9f09, (u_int)0xd629b07c,
+ (u_int)0xaf31a4b2, (u_int)0x312a3f23, (u_int)0x30c6a594, (u_int)0xc035a266,
+ (u_int)0x37744ebc, (u_int)0xa6fc82ca, (u_int)0xb0e090d0, (u_int)0x1533a7d8,
+ (u_int)0x4af10498, (u_int)0xf741ecda, (u_int)0x0e7fcd50, (u_int)0x2f1791f6,
+ (u_int)0x8d764dd6, (u_int)0x4d43efb0, (u_int)0x54ccaa4d, (u_int)0xdfe49604,
+ (u_int)0xe39ed1b5, (u_int)0x1b4c6a88, (u_int)0xb8c12c1f, (u_int)0x7f466551,
+ (u_int)0x049d5eea, (u_int)0x5d018c35, (u_int)0x73fa8774, (u_int)0x2efb0b41,
+ (u_int)0x5ab3671d, (u_int)0x5292dbd2, (u_int)0x33e91056, (u_int)0x136dd647,
+ (u_int)0x8c9ad761, (u_int)0x7a37a10c, (u_int)0x8e59f814, (u_int)0x89eb133c,
+ (u_int)0xeecea927, (u_int)0x35b761c9, (u_int)0xede11ce5, (u_int)0x3c7a47b1,
+ (u_int)0x599cd2df, (u_int)0x3f55f273, (u_int)0x791814ce, (u_int)0xbf73c737,
+ (u_int)0xea53f7cd, (u_int)0x5b5ffdaa, (u_int)0x14df3d6f, (u_int)0x867844db,
+ (u_int)0x81caaff3, (u_int)0x3eb968c4, (u_int)0x2c382434, (u_int)0x5fc2a340,
+ (u_int)0x72161dc3, (u_int)0x0cbce225, (u_int)0x8b283c49, (u_int)0x41ff0d95,
+ (u_int)0x7139a801, (u_int)0xde080cb3, (u_int)0x9cd8b4e4, (u_int)0x906456c1,
+ (u_int)0x617bcb84, (u_int)0x70d532b6, (u_int)0x74486c5c, (u_int)0x42d0b857,
+};
+static const u32 Td2[256] = {
+ (u_int)0xa75051f4, (u_int)0x65537e41, (u_int)0xa4c31a17, (u_int)0x5e963a27,
+ (u_int)0x6bcb3bab, (u_int)0x45f11f9d, (u_int)0x58abacfa, (u_int)0x03934be3,
+ (u_int)0xfa552030, (u_int)0x6df6ad76, (u_int)0x769188cc, (u_int)0x4c25f502,
+ (u_int)0xd7fc4fe5, (u_int)0xcbd7c52a, (u_int)0x44802635, (u_int)0xa38fb562,
+ (u_int)0x5a49deb1, (u_int)0x1b6725ba, (u_int)0x0e9845ea, (u_int)0xc0e15dfe,
+ (u_int)0x7502c32f, (u_int)0xf012814c, (u_int)0x97a38d46, (u_int)0xf9c66bd3,
+ (u_int)0x5fe7038f, (u_int)0x9c951592, (u_int)0x7aebbf6d, (u_int)0x59da9552,
+ (u_int)0x832dd4be, (u_int)0x21d35874, (u_int)0x692949e0, (u_int)0xc8448ec9,
+ (u_int)0x896a75c2, (u_int)0x7978f48e, (u_int)0x3e6b9958, (u_int)0x71dd27b9,
+ (u_int)0x4fb6bee1, (u_int)0xad17f088, (u_int)0xac66c920, (u_int)0x3ab47dce,
+ (u_int)0x4a1863df, (u_int)0x3182e51a, (u_int)0x33609751, (u_int)0x7f456253,
+ (u_int)0x77e0b164, (u_int)0xae84bb6b, (u_int)0xa01cfe81, (u_int)0x2b94f908,
+ (u_int)0x68587048, (u_int)0xfd198f45, (u_int)0x6c8794de, (u_int)0xf8b7527b,
+ (u_int)0xd323ab73, (u_int)0x02e2724b, (u_int)0x8f57e31f, (u_int)0xab2a6655,
+ (u_int)0x2807b2eb, (u_int)0xc2032fb5, (u_int)0x7b9a86c5, (u_int)0x08a5d337,
+ (u_int)0x87f23028, (u_int)0xa5b223bf, (u_int)0x6aba0203, (u_int)0x825ced16,
+ (u_int)0x1c2b8acf, (u_int)0xb492a779, (u_int)0xf2f0f307, (u_int)0xe2a14e69,
+ (u_int)0xf4cd65da, (u_int)0xbed50605, (u_int)0x621fd134, (u_int)0xfe8ac4a6,
+ (u_int)0x539d342e, (u_int)0x55a0a2f3, (u_int)0xe132058a, (u_int)0xeb75a4f6,
+ (u_int)0xec390b83, (u_int)0xefaa4060, (u_int)0x9f065e71, (u_int)0x1051bd6e,
+ (u_int)0x8af93e21, (u_int)0x063d96dd, (u_int)0x05aedd3e, (u_int)0xbd464de6,
+ (u_int)0x8db59154, (u_int)0x5d0571c4, (u_int)0xd46f0406, (u_int)0x15ff6050,
+ (u_int)0xfb241998, (u_int)0xe997d6bd, (u_int)0x43cc8940, (u_int)0x9e7767d9,
+ (u_int)0x42bdb0e8, (u_int)0x8b880789, (u_int)0x5b38e719, (u_int)0xeedb79c8,
+ (u_int)0x0a47a17c, (u_int)0x0fe97c42, (u_int)0x1ec9f884, (u_int)0x00000000,
+ (u_int)0x86830980, (u_int)0xed48322b, (u_int)0x70ac1e11, (u_int)0x724e6c5a,
+ (u_int)0xfffbfd0e, (u_int)0x38560f85, (u_int)0xd51e3dae, (u_int)0x3927362d,
+ (u_int)0xd9640a0f, (u_int)0xa621685c, (u_int)0x54d19b5b, (u_int)0x2e3a2436,
+ (u_int)0x67b10c0a, (u_int)0xe70f9357, (u_int)0x96d2b4ee, (u_int)0x919e1b9b,
+ (u_int)0xc54f80c0, (u_int)0x20a261dc, (u_int)0x4b695a77, (u_int)0x1a161c12,
+ (u_int)0xba0ae293, (u_int)0x2ae5c0a0, (u_int)0xe0433c22, (u_int)0x171d121b,
+ (u_int)0x0d0b0e09, (u_int)0xc7adf28b, (u_int)0xa8b92db6, (u_int)0xa9c8141e,
+ (u_int)0x198557f1, (u_int)0x074caf75, (u_int)0xddbbee99, (u_int)0x60fda37f,
+ (u_int)0x269ff701, (u_int)0xf5bc5c72, (u_int)0x3bc54466, (u_int)0x7e345bfb,
+ (u_int)0x29768b43, (u_int)0xc6dccb23, (u_int)0xfc68b6ed, (u_int)0xf163b8e4,
+ (u_int)0xdccad731, (u_int)0x85104263, (u_int)0x22401397, (u_int)0x112084c6,
+ (u_int)0x247d854a, (u_int)0x3df8d2bb, (u_int)0x3211aef9, (u_int)0xa16dc729,
+ (u_int)0x2f4b1d9e, (u_int)0x30f3dcb2, (u_int)0x52ec0d86, (u_int)0xe3d077c1,
+ (u_int)0x166c2bb3, (u_int)0xb999a970, (u_int)0x48fa1194, (u_int)0x642247e9,
+ (u_int)0x8cc4a8fc, (u_int)0x3f1aa0f0, (u_int)0x2cd8567d, (u_int)0x90ef2233,
+ (u_int)0x4ec78749, (u_int)0xd1c1d938, (u_int)0xa2fe8cca, (u_int)0x0b3698d4,
+ (u_int)0x81cfa6f5, (u_int)0xde28a57a, (u_int)0x8e26dab7, (u_int)0xbfa43fad,
+ (u_int)0x9de42c3a, (u_int)0x920d5078, (u_int)0xcc9b6a5f, (u_int)0x4662547e,
+ (u_int)0x13c2f68d, (u_int)0xb8e890d8, (u_int)0xf75e2e39, (u_int)0xaff582c3,
+ (u_int)0x80be9f5d, (u_int)0x937c69d0, (u_int)0x2da96fd5, (u_int)0x12b3cf25,
+ (u_int)0x993bc8ac, (u_int)0x7da71018, (u_int)0x636ee89c, (u_int)0xbb7bdb3b,
+ (u_int)0x7809cd26, (u_int)0x18f46e59, (u_int)0xb701ec9a, (u_int)0x9aa8834f,
+ (u_int)0x6e65e695, (u_int)0xe67eaaff, (u_int)0xcf0821bc, (u_int)0xe8e6ef15,
+ (u_int)0x9bd9bae7, (u_int)0x36ce4a6f, (u_int)0x09d4ea9f, (u_int)0x7cd629b0,
+ (u_int)0xb2af31a4, (u_int)0x23312a3f, (u_int)0x9430c6a5, (u_int)0x66c035a2,
+ (u_int)0xbc37744e, (u_int)0xcaa6fc82, (u_int)0xd0b0e090, (u_int)0xd81533a7,
+ (u_int)0x984af104, (u_int)0xdaf741ec, (u_int)0x500e7fcd, (u_int)0xf62f1791,
+ (u_int)0xd68d764d, (u_int)0xb04d43ef, (u_int)0x4d54ccaa, (u_int)0x04dfe496,
+ (u_int)0xb5e39ed1, (u_int)0x881b4c6a, (u_int)0x1fb8c12c, (u_int)0x517f4665,
+ (u_int)0xea049d5e, (u_int)0x355d018c, (u_int)0x7473fa87, (u_int)0x412efb0b,
+ (u_int)0x1d5ab367, (u_int)0xd25292db, (u_int)0x5633e910, (u_int)0x47136dd6,
+ (u_int)0x618c9ad7, (u_int)0x0c7a37a1, (u_int)0x148e59f8, (u_int)0x3c89eb13,
+ (u_int)0x27eecea9, (u_int)0xc935b761, (u_int)0xe5ede11c, (u_int)0xb13c7a47,
+ (u_int)0xdf599cd2, (u_int)0x733f55f2, (u_int)0xce791814, (u_int)0x37bf73c7,
+ (u_int)0xcdea53f7, (u_int)0xaa5b5ffd, (u_int)0x6f14df3d, (u_int)0xdb867844,
+ (u_int)0xf381caaf, (u_int)0xc43eb968, (u_int)0x342c3824, (u_int)0x405fc2a3,
+ (u_int)0xc372161d, (u_int)0x250cbce2, (u_int)0x498b283c, (u_int)0x9541ff0d,
+ (u_int)0x017139a8, (u_int)0xb3de080c, (u_int)0xe49cd8b4, (u_int)0xc1906456,
+ (u_int)0x84617bcb, (u_int)0xb670d532, (u_int)0x5c74486c, (u_int)0x5742d0b8,
+};
+static const u32 Td3[256] = {
+ (u_int)0xf4a75051, (u_int)0x4165537e, (u_int)0x17a4c31a, (u_int)0x275e963a,
+ (u_int)0xab6bcb3b, (u_int)0x9d45f11f, (u_int)0xfa58abac, (u_int)0xe303934b,
+ (u_int)0x30fa5520, (u_int)0x766df6ad, (u_int)0xcc769188, (u_int)0x024c25f5,
+ (u_int)0xe5d7fc4f, (u_int)0x2acbd7c5, (u_int)0x35448026, (u_int)0x62a38fb5,
+ (u_int)0xb15a49de, (u_int)0xba1b6725, (u_int)0xea0e9845, (u_int)0xfec0e15d,
+ (u_int)0x2f7502c3, (u_int)0x4cf01281, (u_int)0x4697a38d, (u_int)0xd3f9c66b,
+ (u_int)0x8f5fe703, (u_int)0x929c9515, (u_int)0x6d7aebbf, (u_int)0x5259da95,
+ (u_int)0xbe832dd4, (u_int)0x7421d358, (u_int)0xe0692949, (u_int)0xc9c8448e,
+ (u_int)0xc2896a75, (u_int)0x8e7978f4, (u_int)0x583e6b99, (u_int)0xb971dd27,
+ (u_int)0xe14fb6be, (u_int)0x88ad17f0, (u_int)0x20ac66c9, (u_int)0xce3ab47d,
+ (u_int)0xdf4a1863, (u_int)0x1a3182e5, (u_int)0x51336097, (u_int)0x537f4562,
+ (u_int)0x6477e0b1, (u_int)0x6bae84bb, (u_int)0x81a01cfe, (u_int)0x082b94f9,
+ (u_int)0x48685870, (u_int)0x45fd198f, (u_int)0xde6c8794, (u_int)0x7bf8b752,
+ (u_int)0x73d323ab, (u_int)0x4b02e272, (u_int)0x1f8f57e3, (u_int)0x55ab2a66,
+ (u_int)0xeb2807b2, (u_int)0xb5c2032f, (u_int)0xc57b9a86, (u_int)0x3708a5d3,
+ (u_int)0x2887f230, (u_int)0xbfa5b223, (u_int)0x036aba02, (u_int)0x16825ced,
+ (u_int)0xcf1c2b8a, (u_int)0x79b492a7, (u_int)0x07f2f0f3, (u_int)0x69e2a14e,
+ (u_int)0xdaf4cd65, (u_int)0x05bed506, (u_int)0x34621fd1, (u_int)0xa6fe8ac4,
+ (u_int)0x2e539d34, (u_int)0xf355a0a2, (u_int)0x8ae13205, (u_int)0xf6eb75a4,
+ (u_int)0x83ec390b, (u_int)0x60efaa40, (u_int)0x719f065e, (u_int)0x6e1051bd,
+ (u_int)0x218af93e, (u_int)0xdd063d96, (u_int)0x3e05aedd, (u_int)0xe6bd464d,
+ (u_int)0x548db591, (u_int)0xc45d0571, (u_int)0x06d46f04, (u_int)0x5015ff60,
+ (u_int)0x98fb2419, (u_int)0xbde997d6, (u_int)0x4043cc89, (u_int)0xd99e7767,
+ (u_int)0xe842bdb0, (u_int)0x898b8807, (u_int)0x195b38e7, (u_int)0xc8eedb79,
+ (u_int)0x7c0a47a1, (u_int)0x420fe97c, (u_int)0x841ec9f8, (u_int)0x00000000,
+ (u_int)0x80868309, (u_int)0x2bed4832, (u_int)0x1170ac1e, (u_int)0x5a724e6c,
+ (u_int)0x0efffbfd, (u_int)0x8538560f, (u_int)0xaed51e3d, (u_int)0x2d392736,
+ (u_int)0x0fd9640a, (u_int)0x5ca62168, (u_int)0x5b54d19b, (u_int)0x362e3a24,
+ (u_int)0x0a67b10c, (u_int)0x57e70f93, (u_int)0xee96d2b4, (u_int)0x9b919e1b,
+ (u_int)0xc0c54f80, (u_int)0xdc20a261, (u_int)0x774b695a, (u_int)0x121a161c,
+ (u_int)0x93ba0ae2, (u_int)0xa02ae5c0, (u_int)0x22e0433c, (u_int)0x1b171d12,
+ (u_int)0x090d0b0e, (u_int)0x8bc7adf2, (u_int)0xb6a8b92d, (u_int)0x1ea9c814,
+ (u_int)0xf1198557, (u_int)0x75074caf, (u_int)0x99ddbbee, (u_int)0x7f60fda3,
+ (u_int)0x01269ff7, (u_int)0x72f5bc5c, (u_int)0x663bc544, (u_int)0xfb7e345b,
+ (u_int)0x4329768b, (u_int)0x23c6dccb, (u_int)0xedfc68b6, (u_int)0xe4f163b8,
+ (u_int)0x31dccad7, (u_int)0x63851042, (u_int)0x97224013, (u_int)0xc6112084,
+ (u_int)0x4a247d85, (u_int)0xbb3df8d2, (u_int)0xf93211ae, (u_int)0x29a16dc7,
+ (u_int)0x9e2f4b1d, (u_int)0xb230f3dc, (u_int)0x8652ec0d, (u_int)0xc1e3d077,
+ (u_int)0xb3166c2b, (u_int)0x70b999a9, (u_int)0x9448fa11, (u_int)0xe9642247,
+ (u_int)0xfc8cc4a8, (u_int)0xf03f1aa0, (u_int)0x7d2cd856, (u_int)0x3390ef22,
+ (u_int)0x494ec787, (u_int)0x38d1c1d9, (u_int)0xcaa2fe8c, (u_int)0xd40b3698,
+ (u_int)0xf581cfa6, (u_int)0x7ade28a5, (u_int)0xb78e26da, (u_int)0xadbfa43f,
+ (u_int)0x3a9de42c, (u_int)0x78920d50, (u_int)0x5fcc9b6a, (u_int)0x7e466254,
+ (u_int)0x8d13c2f6, (u_int)0xd8b8e890, (u_int)0x39f75e2e, (u_int)0xc3aff582,
+ (u_int)0x5d80be9f, (u_int)0xd0937c69, (u_int)0xd52da96f, (u_int)0x2512b3cf,
+ (u_int)0xac993bc8, (u_int)0x187da710, (u_int)0x9c636ee8, (u_int)0x3bbb7bdb,
+ (u_int)0x267809cd, (u_int)0x5918f46e, (u_int)0x9ab701ec, (u_int)0x4f9aa883,
+ (u_int)0x956e65e6, (u_int)0xffe67eaa, (u_int)0xbccf0821, (u_int)0x15e8e6ef,
+ (u_int)0xe79bd9ba, (u_int)0x6f36ce4a, (u_int)0x9f09d4ea, (u_int)0xb07cd629,
+ (u_int)0xa4b2af31, (u_int)0x3f23312a, (u_int)0xa59430c6, (u_int)0xa266c035,
+ (u_int)0x4ebc3774, (u_int)0x82caa6fc, (u_int)0x90d0b0e0, (u_int)0xa7d81533,
+ (u_int)0x04984af1, (u_int)0xecdaf741, (u_int)0xcd500e7f, (u_int)0x91f62f17,
+ (u_int)0x4dd68d76, (u_int)0xefb04d43, (u_int)0xaa4d54cc, (u_int)0x9604dfe4,
+ (u_int)0xd1b5e39e, (u_int)0x6a881b4c, (u_int)0x2c1fb8c1, (u_int)0x65517f46,
+ (u_int)0x5eea049d, (u_int)0x8c355d01, (u_int)0x877473fa, (u_int)0x0b412efb,
+ (u_int)0x671d5ab3, (u_int)0xdbd25292, (u_int)0x105633e9, (u_int)0xd647136d,
+ (u_int)0xd7618c9a, (u_int)0xa10c7a37, (u_int)0xf8148e59, (u_int)0x133c89eb,
+ (u_int)0xa927eece, (u_int)0x61c935b7, (u_int)0x1ce5ede1, (u_int)0x47b13c7a,
+ (u_int)0xd2df599c, (u_int)0xf2733f55, (u_int)0x14ce7918, (u_int)0xc737bf73,
+ (u_int)0xf7cdea53, (u_int)0xfdaa5b5f, (u_int)0x3d6f14df, (u_int)0x44db8678,
+ (u_int)0xaff381ca, (u_int)0x68c43eb9, (u_int)0x24342c38, (u_int)0xa3405fc2,
+ (u_int)0x1dc37216, (u_int)0xe2250cbc, (u_int)0x3c498b28, (u_int)0x0d9541ff,
+ (u_int)0xa8017139, (u_int)0x0cb3de08, (u_int)0xb4e49cd8, (u_int)0x56c19064,
+ (u_int)0xcb84617b, (u_int)0x32b670d5, (u_int)0x6c5c7448, (u_int)0xb85742d0,
+};
+static const u32 Td4[256] = {
+ (u_int)0x52525252, (u_int)0x09090909, (u_int)0x6a6a6a6a, (u_int)0xd5d5d5d5,
+ (u_int)0x30303030, (u_int)0x36363636, (u_int)0xa5a5a5a5, (u_int)0x38383838,
+ (u_int)0xbfbfbfbf, (u_int)0x40404040, (u_int)0xa3a3a3a3, (u_int)0x9e9e9e9e,
+ (u_int)0x81818181, (u_int)0xf3f3f3f3, (u_int)0xd7d7d7d7, (u_int)0xfbfbfbfb,
+ (u_int)0x7c7c7c7c, (u_int)0xe3e3e3e3, (u_int)0x39393939, (u_int)0x82828282,
+ (u_int)0x9b9b9b9b, (u_int)0x2f2f2f2f, (u_int)0xffffffff, (u_int)0x87878787,
+ (u_int)0x34343434, (u_int)0x8e8e8e8e, (u_int)0x43434343, (u_int)0x44444444,
+ (u_int)0xc4c4c4c4, (u_int)0xdededede, (u_int)0xe9e9e9e9, (u_int)0xcbcbcbcb,
+ (u_int)0x54545454, (u_int)0x7b7b7b7b, (u_int)0x94949494, (u_int)0x32323232,
+ (u_int)0xa6a6a6a6, (u_int)0xc2c2c2c2, (u_int)0x23232323, (u_int)0x3d3d3d3d,
+ (u_int)0xeeeeeeee, (u_int)0x4c4c4c4c, (u_int)0x95959595, (u_int)0x0b0b0b0b,
+ (u_int)0x42424242, (u_int)0xfafafafa, (u_int)0xc3c3c3c3, (u_int)0x4e4e4e4e,
+ (u_int)0x08080808, (u_int)0x2e2e2e2e, (u_int)0xa1a1a1a1, (u_int)0x66666666,
+ (u_int)0x28282828, (u_int)0xd9d9d9d9, (u_int)0x24242424, (u_int)0xb2b2b2b2,
+ (u_int)0x76767676, (u_int)0x5b5b5b5b, (u_int)0xa2a2a2a2, (u_int)0x49494949,
+ (u_int)0x6d6d6d6d, (u_int)0x8b8b8b8b, (u_int)0xd1d1d1d1, (u_int)0x25252525,
+ (u_int)0x72727272, (u_int)0xf8f8f8f8, (u_int)0xf6f6f6f6, (u_int)0x64646464,
+ (u_int)0x86868686, (u_int)0x68686868, (u_int)0x98989898, (u_int)0x16161616,
+ (u_int)0xd4d4d4d4, (u_int)0xa4a4a4a4, (u_int)0x5c5c5c5c, (u_int)0xcccccccc,
+ (u_int)0x5d5d5d5d, (u_int)0x65656565, (u_int)0xb6b6b6b6, (u_int)0x92929292,
+ (u_int)0x6c6c6c6c, (u_int)0x70707070, (u_int)0x48484848, (u_int)0x50505050,
+ (u_int)0xfdfdfdfd, (u_int)0xedededed, (u_int)0xb9b9b9b9, (u_int)0xdadadada,
+ (u_int)0x5e5e5e5e, (u_int)0x15151515, (u_int)0x46464646, (u_int)0x57575757,
+ (u_int)0xa7a7a7a7, (u_int)0x8d8d8d8d, (u_int)0x9d9d9d9d, (u_int)0x84848484,
+ (u_int)0x90909090, (u_int)0xd8d8d8d8, (u_int)0xabababab, (u_int)0x00000000,
+ (u_int)0x8c8c8c8c, (u_int)0xbcbcbcbc, (u_int)0xd3d3d3d3, (u_int)0x0a0a0a0a,
+ (u_int)0xf7f7f7f7, (u_int)0xe4e4e4e4, (u_int)0x58585858, (u_int)0x05050505,
+ (u_int)0xb8b8b8b8, (u_int)0xb3b3b3b3, (u_int)0x45454545, (u_int)0x06060606,
+ (u_int)0xd0d0d0d0, (u_int)0x2c2c2c2c, (u_int)0x1e1e1e1e, (u_int)0x8f8f8f8f,
+ (u_int)0xcacacaca, (u_int)0x3f3f3f3f, (u_int)0x0f0f0f0f, (u_int)0x02020202,
+ (u_int)0xc1c1c1c1, (u_int)0xafafafaf, (u_int)0xbdbdbdbd, (u_int)0x03030303,
+ (u_int)0x01010101, (u_int)0x13131313, (u_int)0x8a8a8a8a, (u_int)0x6b6b6b6b,
+ (u_int)0x3a3a3a3a, (u_int)0x91919191, (u_int)0x11111111, (u_int)0x41414141,
+ (u_int)0x4f4f4f4f, (u_int)0x67676767, (u_int)0xdcdcdcdc, (u_int)0xeaeaeaea,
+ (u_int)0x97979797, (u_int)0xf2f2f2f2, (u_int)0xcfcfcfcf, (u_int)0xcececece,
+ (u_int)0xf0f0f0f0, (u_int)0xb4b4b4b4, (u_int)0xe6e6e6e6, (u_int)0x73737373,
+ (u_int)0x96969696, (u_int)0xacacacac, (u_int)0x74747474, (u_int)0x22222222,
+ (u_int)0xe7e7e7e7, (u_int)0xadadadad, (u_int)0x35353535, (u_int)0x85858585,
+ (u_int)0xe2e2e2e2, (u_int)0xf9f9f9f9, (u_int)0x37373737, (u_int)0xe8e8e8e8,
+ (u_int)0x1c1c1c1c, (u_int)0x75757575, (u_int)0xdfdfdfdf, (u_int)0x6e6e6e6e,
+ (u_int)0x47474747, (u_int)0xf1f1f1f1, (u_int)0x1a1a1a1a, (u_int)0x71717171,
+ (u_int)0x1d1d1d1d, (u_int)0x29292929, (u_int)0xc5c5c5c5, (u_int)0x89898989,
+ (u_int)0x6f6f6f6f, (u_int)0xb7b7b7b7, (u_int)0x62626262, (u_int)0x0e0e0e0e,
+ (u_int)0xaaaaaaaa, (u_int)0x18181818, (u_int)0xbebebebe, (u_int)0x1b1b1b1b,
+ (u_int)0xfcfcfcfc, (u_int)0x56565656, (u_int)0x3e3e3e3e, (u_int)0x4b4b4b4b,
+ (u_int)0xc6c6c6c6, (u_int)0xd2d2d2d2, (u_int)0x79797979, (u_int)0x20202020,
+ (u_int)0x9a9a9a9a, (u_int)0xdbdbdbdb, (u_int)0xc0c0c0c0, (u_int)0xfefefefe,
+ (u_int)0x78787878, (u_int)0xcdcdcdcd, (u_int)0x5a5a5a5a, (u_int)0xf4f4f4f4,
+ (u_int)0x1f1f1f1f, (u_int)0xdddddddd, (u_int)0xa8a8a8a8, (u_int)0x33333333,
+ (u_int)0x88888888, (u_int)0x07070707, (u_int)0xc7c7c7c7, (u_int)0x31313131,
+ (u_int)0xb1b1b1b1, (u_int)0x12121212, (u_int)0x10101010, (u_int)0x59595959,
+ (u_int)0x27272727, (u_int)0x80808080, (u_int)0xecececec, (u_int)0x5f5f5f5f,
+ (u_int)0x60606060, (u_int)0x51515151, (u_int)0x7f7f7f7f, (u_int)0xa9a9a9a9,
+ (u_int)0x19191919, (u_int)0xb5b5b5b5, (u_int)0x4a4a4a4a, (u_int)0x0d0d0d0d,
+ (u_int)0x2d2d2d2d, (u_int)0xe5e5e5e5, (u_int)0x7a7a7a7a, (u_int)0x9f9f9f9f,
+ (u_int)0x93939393, (u_int)0xc9c9c9c9, (u_int)0x9c9c9c9c, (u_int)0xefefefef,
+ (u_int)0xa0a0a0a0, (u_int)0xe0e0e0e0, (u_int)0x3b3b3b3b, (u_int)0x4d4d4d4d,
+ (u_int)0xaeaeaeae, (u_int)0x2a2a2a2a, (u_int)0xf5f5f5f5, (u_int)0xb0b0b0b0,
+ (u_int)0xc8c8c8c8, (u_int)0xebebebeb, (u_int)0xbbbbbbbb, (u_int)0x3c3c3c3c,
+ (u_int)0x83838383, (u_int)0x53535353, (u_int)0x99999999, (u_int)0x61616161,
+ (u_int)0x17171717, (u_int)0x2b2b2b2b, (u_int)0x04040404, (u_int)0x7e7e7e7e,
+ (u_int)0xbabababa, (u_int)0x77777777, (u_int)0xd6d6d6d6, (u_int)0x26262626,
+ (u_int)0xe1e1e1e1, (u_int)0x69696969, (u_int)0x14141414, (u_int)0x63636363,
+ (u_int)0x55555555, (u_int)0x21212121, (u_int)0x0c0c0c0c, (u_int)0x7d7d7d7d,
+};
+static const u32 rcon[] = {
+ 0x01000000, 0x02000000, 0x04000000, 0x08000000,
+ 0x10000000, 0x20000000, 0x40000000, 0x80000000,
+ 0x1B000000, 0x36000000, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
+};
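+
+/*
+ * rcon[i] is x^i in GF(2^8), reduced modulo x^8 + x^4 + x^3 + x + 1 and
+ * placed in the most significant byte of the round-constant word.
+ */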
+
+#define SWAP(x) ((_lrotl(x, 8) & 0x00ff00ff) | (_lrotr(x, 8) & 0xff00ff00))
+
+#ifdef _MSC_VER
+#define GETU32(p) SWAP(*((u32 *)(p)))
+#define PUTU32(ct, st) { *((u32 *)(ct)) = SWAP((st)); }
+#else
+#define GETU32(pt) (((u32)(pt)[0] << 24) ^ ((u32)(pt)[1] << 16) ^ ((u32)(pt)[2] << 8) ^ ((u32)(pt)[3]))
+#define PUTU32(ct, st) { (ct)[0] = (u8)((st) >> 24); (ct)[1] = (u8)((st) >> 16); (ct)[2] = (u8)((st) >> 8); (ct)[3] = (u8)(st); }
+#endif
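+
+/*
+ * GETU32/PUTU32 convert between the byte stream and the cipher's 32-bit
+ * big-endian words; e.g. GETU32 applied to the bytes 01 02 03 04 yields
+ * 0x01020304.  The _MSC_VER variant byte-swaps a native 32-bit load to
+ * the same effect on little-endian machines.
+ */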
+
+/**
+ * Expand the cipher key into the encryption key schedule.
+ *
+ * @return the number of rounds for the given cipher key size.
+ */
+/*
+ * __db_rijndaelKeySetupEnc --
+ *
+ * PUBLIC: int __db_rijndaelKeySetupEnc __P((u32 *, const u8 *, int));
+ */
+int
+__db_rijndaelKeySetupEnc(rk, cipherKey, keyBits)
+ u32 *rk; /* rk[4*(Nr + 1)] */
+ const u8 *cipherKey;
+ int keyBits;
+{
+ int i = 0;
+ u32 temp;
+
+ rk[0] = GETU32(cipherKey );
+ rk[1] = GETU32(cipherKey + 4);
+ rk[2] = GETU32(cipherKey + 8);
+ rk[3] = GETU32(cipherKey + 12);
+ if (keyBits == 128) {
+ for (;;) {
+ temp = rk[3];
+ rk[4] = rk[0] ^
+ (Te4[(temp >> 16) & 0xff] & 0xff000000) ^
+ (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^
+ (Te4[(temp ) & 0xff] & 0x0000ff00) ^
+ (Te4[(temp >> 24) ] & 0x000000ff) ^
+ rcon[i];
+ rk[5] = rk[1] ^ rk[4];
+ rk[6] = rk[2] ^ rk[5];
+ rk[7] = rk[3] ^ rk[6];
+ if (++i == 10) {
+ return 10;
+ }
+ rk += 4;
+ }
+ }
+ rk[4] = GETU32(cipherKey + 16);
+ rk[5] = GETU32(cipherKey + 20);
+ if (keyBits == 192) {
+ for (;;) {
+ temp = rk[ 5];
+ rk[ 6] = rk[ 0] ^
+ (Te4[(temp >> 16) & 0xff] & 0xff000000) ^
+ (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^
+ (Te4[(temp ) & 0xff] & 0x0000ff00) ^
+ (Te4[(temp >> 24) ] & 0x000000ff) ^
+ rcon[i];
+ rk[ 7] = rk[ 1] ^ rk[ 6];
+ rk[ 8] = rk[ 2] ^ rk[ 7];
+ rk[ 9] = rk[ 3] ^ rk[ 8];
+ if (++i == 8) {
+ return 12;
+ }
+ rk[10] = rk[ 4] ^ rk[ 9];
+ rk[11] = rk[ 5] ^ rk[10];
+ rk += 6;
+ }
+ }
+ rk[6] = GETU32(cipherKey + 24);
+ rk[7] = GETU32(cipherKey + 28);
+ if (keyBits == 256) {
+ for (;;) {
+ temp = rk[ 7];
+ rk[ 8] = rk[ 0] ^
+ (Te4[(temp >> 16) & 0xff] & 0xff000000) ^
+ (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^
+ (Te4[(temp ) & 0xff] & 0x0000ff00) ^
+ (Te4[(temp >> 24) ] & 0x000000ff) ^
+ rcon[i];
+ rk[ 9] = rk[ 1] ^ rk[ 8];
+ rk[10] = rk[ 2] ^ rk[ 9];
+ rk[11] = rk[ 3] ^ rk[10];
+ if (++i == 7) {
+ return 14;
+ }
+ temp = rk[11];
+ rk[12] = rk[ 4] ^
+ (Te4[(temp >> 24) ] & 0xff000000) ^
+ (Te4[(temp >> 16) & 0xff] & 0x00ff0000) ^
+ (Te4[(temp >> 8) & 0xff] & 0x0000ff00) ^
+ (Te4[(temp ) & 0xff] & 0x000000ff);
+ rk[13] = rk[ 5] ^ rk[12];
+ rk[14] = rk[ 6] ^ rk[13];
+ rk[15] = rk[ 7] ^ rk[14];
+
+ rk += 8;
+ }
+ }
+ return 0;
+}
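+
+/*
+ * Usage sketch (hypothetical names, error handling elided): a 128-bit
+ * key expands into 4 * (10 + 1) = 44 round-key words:
+ *
+ *	u32 rk[4 * (MAXNR + 1)];
+ *	u8 key[16];
+ *	int Nr;
+ *
+ *	Nr = __db_rijndaelKeySetupEnc(rk, key, 128);
+ *
+ * Nr is 10, 12 or 14 for 128-, 192- and 256-bit keys; any other keyBits
+ * value falls through the three if-blocks above and returns 0.
+ */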
+
+/**
+ * Expand the cipher key into the decryption key schedule.
+ *
+ * @return the number of rounds for the given cipher key size.
+ */
+/*
+ * __db_rijndaelKeySetupDec --
+ *
+ * PUBLIC: int __db_rijndaelKeySetupDec __P((u32 *, const u8 *, int));
+ */
+int
+__db_rijndaelKeySetupDec(rk, cipherKey, keyBits)
+ u32 *rk; /* rk[4*(Nr + 1)] */
+ const u8 *cipherKey;
+ int keyBits;
+{
+ int Nr, i, j;
+ u32 temp;
+
+ /* expand the cipher key: */
+ Nr = __db_rijndaelKeySetupEnc(rk, cipherKey, keyBits);
+ /* invert the order of the round keys: */
+ for (i = 0, j = 4*Nr; i < j; i += 4, j -= 4) {
+ temp = rk[i ]; rk[i ] = rk[j ]; rk[j ] = temp;
+ temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp;
+ temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp;
+ temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp;
+ }
+ /* apply the inverse MixColumn transform to all round keys but the first and the last: */
+ for (i = 1; i < Nr; i++) {
+ rk += 4;
+ rk[0] =
+ Td0[Te4[(rk[0] >> 24) ] & 0xff] ^
+ Td1[Te4[(rk[0] >> 16) & 0xff] & 0xff] ^
+ Td2[Te4[(rk[0] >> 8) & 0xff] & 0xff] ^
+ Td3[Te4[(rk[0] ) & 0xff] & 0xff];
+ rk[1] =
+ Td0[Te4[(rk[1] >> 24) ] & 0xff] ^
+ Td1[Te4[(rk[1] >> 16) & 0xff] & 0xff] ^
+ Td2[Te4[(rk[1] >> 8) & 0xff] & 0xff] ^
+ Td3[Te4[(rk[1] ) & 0xff] & 0xff];
+ rk[2] =
+ Td0[Te4[(rk[2] >> 24) ] & 0xff] ^
+ Td1[Te4[(rk[2] >> 16) & 0xff] & 0xff] ^
+ Td2[Te4[(rk[2] >> 8) & 0xff] & 0xff] ^
+ Td3[Te4[(rk[2] ) & 0xff] & 0xff];
+ rk[3] =
+ Td0[Te4[(rk[3] >> 24) ] & 0xff] ^
+ Td1[Te4[(rk[3] >> 16) & 0xff] & 0xff] ^
+ Td2[Te4[(rk[3] >> 8) & 0xff] & 0xff] ^
+ Td3[Te4[(rk[3] ) & 0xff] & 0xff];
+ }
+ return Nr;
+}
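+
+/*
+ * Note on the loop above: Te4[b] replicates S[b] in all four bytes and
+ * the Td tables carry the inverse S-box internally, so the two boxes
+ * cancel and Td0[Te4[b] & 0xff] reduces to InvMixColumns applied to
+ * byte b in the high column position.  XORing the four per-byte lookups
+ * thus computes rk[i] = InvMixColumns(rk[i]), the "equivalent inverse
+ * cipher" key transform of FIPS-197, which lets __db_rijndaelDecrypt
+ * reuse the same table-lookup round structure as encryption.
+ */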
+
+/*
+ * __db_rijndaelEncrypt --
+ *
+ * PUBLIC: void __db_rijndaelEncrypt __P((u32 *, int, const u8 *, u8 *));
+ */
+void
+__db_rijndaelEncrypt(rk, Nr, pt, ct)
+ u32 *rk; /* rk[4*(Nr + 1)] */
+ int Nr;
+ const u8 *pt;
+ u8 *ct;
+{
+ u32 s0, s1, s2, s3, t0, t1, t2, t3;
+#ifndef FULL_UNROLL
+ int r;
+#endif /* ?FULL_UNROLL */
+
+ /*
+ * map byte array block to cipher state
+ * and add initial round key:
+ */
+ s0 = GETU32(pt ) ^ rk[0];
+ s1 = GETU32(pt + 4) ^ rk[1];
+ s2 = GETU32(pt + 8) ^ rk[2];
+ s3 = GETU32(pt + 12) ^ rk[3];
+#ifdef FULL_UNROLL
+ /* round 1: */
+ t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[ 4];
+ t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[ 5];
+ t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[ 6];
+ t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[ 7];
+ /* round 2: */
+ s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[ 8];
+ s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[ 9];
+ s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[10];
+ s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[11];
+ /* round 3: */
+ t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[12];
+ t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[13];
+ t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[14];
+ t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[15];
+ /* round 4: */
+ s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[16];
+ s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[17];
+ s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[18];
+ s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[19];
+ /* round 5: */
+ t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[20];
+ t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[21];
+ t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[22];
+ t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[23];
+ /* round 6: */
+ s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[24];
+ s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[25];
+ s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[26];
+ s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[27];
+ /* round 7: */
+ t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[28];
+ t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[29];
+ t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[30];
+ t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[31];
+ /* round 8: */
+ s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[32];
+ s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[33];
+ s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[34];
+ s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[35];
+ /* round 9: */
+ t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[36];
+ t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[37];
+ t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[38];
+ t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[39];
+ if (Nr > 10) {
+ /* round 10: */
+ s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[40];
+ s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[41];
+ s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[42];
+ s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[43];
+ /* round 11: */
+ t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[44];
+ t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[45];
+ t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[46];
+ t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[47];
+ if (Nr > 12) {
+ /* round 12: */
+ s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[48];
+ s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[49];
+ s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[50];
+ s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[51];
+ /* round 13: */
+ t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[52];
+ t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[53];
+ t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[54];
+ t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[55];
+ }
+ }
+ rk += Nr << 2;
+#else /* !FULL_UNROLL */
+ /*
+ * Nr - 1 full rounds:
+ */
+ r = Nr >> 1;
+ for (;;) {
+ t0 =
+ Te0[(s0 >> 24) ] ^
+ Te1[(s1 >> 16) & 0xff] ^
+ Te2[(s2 >> 8) & 0xff] ^
+ Te3[(s3 ) & 0xff] ^
+ rk[4];
+ t1 =
+ Te0[(s1 >> 24) ] ^
+ Te1[(s2 >> 16) & 0xff] ^
+ Te2[(s3 >> 8) & 0xff] ^
+ Te3[(s0 ) & 0xff] ^
+ rk[5];
+ t2 =
+ Te0[(s2 >> 24) ] ^
+ Te1[(s3 >> 16) & 0xff] ^
+ Te2[(s0 >> 8) & 0xff] ^
+ Te3[(s1 ) & 0xff] ^
+ rk[6];
+ t3 =
+ Te0[(s3 >> 24) ] ^
+ Te1[(s0 >> 16) & 0xff] ^
+ Te2[(s1 >> 8) & 0xff] ^
+ Te3[(s2 ) & 0xff] ^
+ rk[7];
+
+ rk += 8;
+ if (--r == 0) {
+ break;
+ }
+
+ s0 =
+ Te0[(t0 >> 24) ] ^
+ Te1[(t1 >> 16) & 0xff] ^
+ Te2[(t2 >> 8) & 0xff] ^
+ Te3[(t3 ) & 0xff] ^
+ rk[0];
+ s1 =
+ Te0[(t1 >> 24) ] ^
+ Te1[(t2 >> 16) & 0xff] ^
+ Te2[(t3 >> 8) & 0xff] ^
+ Te3[(t0 ) & 0xff] ^
+ rk[1];
+ s2 =
+ Te0[(t2 >> 24) ] ^
+ Te1[(t3 >> 16) & 0xff] ^
+ Te2[(t0 >> 8) & 0xff] ^
+ Te3[(t1 ) & 0xff] ^
+ rk[2];
+ s3 =
+ Te0[(t3 >> 24) ] ^
+ Te1[(t0 >> 16) & 0xff] ^
+ Te2[(t1 >> 8) & 0xff] ^
+ Te3[(t2 ) & 0xff] ^
+ rk[3];
+ }
+#endif /* ?FULL_UNROLL */
+ /*
+ * apply last round and
+ * map cipher state to byte array block:
+ */
+ s0 =
+ (Te4[(t0 >> 24) ] & 0xff000000) ^
+ (Te4[(t1 >> 16) & 0xff] & 0x00ff0000) ^
+ (Te4[(t2 >> 8) & 0xff] & 0x0000ff00) ^
+ (Te4[(t3 ) & 0xff] & 0x000000ff) ^
+ rk[0];
+ PUTU32(ct , s0);
+ s1 =
+ (Te4[(t1 >> 24) ] & 0xff000000) ^
+ (Te4[(t2 >> 16) & 0xff] & 0x00ff0000) ^
+ (Te4[(t3 >> 8) & 0xff] & 0x0000ff00) ^
+ (Te4[(t0 ) & 0xff] & 0x000000ff) ^
+ rk[1];
+ PUTU32(ct + 4, s1);
+ s2 =
+ (Te4[(t2 >> 24) ] & 0xff000000) ^
+ (Te4[(t3 >> 16) & 0xff] & 0x00ff0000) ^
+ (Te4[(t0 >> 8) & 0xff] & 0x0000ff00) ^
+ (Te4[(t1 ) & 0xff] & 0x000000ff) ^
+ rk[2];
+ PUTU32(ct + 8, s2);
+ s3 =
+ (Te4[(t3 >> 24) ] & 0xff000000) ^
+ (Te4[(t0 >> 16) & 0xff] & 0x00ff0000) ^
+ (Te4[(t1 >> 8) & 0xff] & 0x0000ff00) ^
+ (Te4[(t2 ) & 0xff] & 0x000000ff) ^
+ rk[3];
+ PUTU32(ct + 12, s3);
+}
+
+/*
+ * __db_rijndaelDecrypt --
+ *
+ * PUBLIC: void __db_rijndaelDecrypt __P((u32 *, int, const u8 *, u8 *));
+ */
+void
+__db_rijndaelDecrypt(rk, Nr, ct, pt)
+ u32 *rk; /* rk[4*(Nr + 1)] */
+ int Nr;
+ const u8 *ct;
+ u8 *pt;
+{
+ u32 s0, s1, s2, s3, t0, t1, t2, t3;
+#ifndef FULL_UNROLL
+ int r;
+#endif /* ?FULL_UNROLL */
+
+ /*
+ * map byte array block to cipher state
+ * and add initial round key:
+ */
+ s0 = GETU32(ct ) ^ rk[0];
+ s1 = GETU32(ct + 4) ^ rk[1];
+ s2 = GETU32(ct + 8) ^ rk[2];
+ s3 = GETU32(ct + 12) ^ rk[3];
+#ifdef FULL_UNROLL
+ /* round 1: */
+ t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[ 4];
+ t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[ 5];
+ t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[ 6];
+ t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[ 7];
+ /* round 2: */
+ s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[ 8];
+ s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[ 9];
+ s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[10];
+ s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[11];
+ /* round 3: */
+ t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[12];
+ t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[13];
+ t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[14];
+ t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[15];
+ /* round 4: */
+ s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[16];
+ s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[17];
+ s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[18];
+ s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[19];
+ /* round 5: */
+ t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[20];
+ t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[21];
+ t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[22];
+ t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[23];
+ /* round 6: */
+ s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[24];
+ s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[25];
+ s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[26];
+ s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[27];
+ /* round 7: */
+ t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[28];
+ t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[29];
+ t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[30];
+ t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[31];
+ /* round 8: */
+ s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[32];
+ s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[33];
+ s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[34];
+ s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[35];
+ /* round 9: */
+ t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[36];
+ t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[37];
+ t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[38];
+ t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[39];
+ if (Nr > 10) {
+ /* round 10: */
+ s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[40];
+ s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[41];
+ s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[42];
+ s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[43];
+ /* round 11: */
+ t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[44];
+ t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[45];
+ t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[46];
+ t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[47];
+ if (Nr > 12) {
+ /* round 12: */
+ s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[48];
+ s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[49];
+ s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[50];
+ s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[51];
+ /* round 13: */
+ t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[52];
+ t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[53];
+ t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[54];
+ t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[55];
+ }
+ }
+ rk += Nr << 2;
+#else /* !FULL_UNROLL */
+ /*
+ * Nr - 1 full rounds:
+ */
+ r = Nr >> 1;
+ for (;;) {
+ t0 =
+ Td0[(s0 >> 24) ] ^
+ Td1[(s3 >> 16) & 0xff] ^
+ Td2[(s2 >> 8) & 0xff] ^
+ Td3[(s1 ) & 0xff] ^
+ rk[4];
+ t1 =
+ Td0[(s1 >> 24) ] ^
+ Td1[(s0 >> 16) & 0xff] ^
+ Td2[(s3 >> 8) & 0xff] ^
+ Td3[(s2 ) & 0xff] ^
+ rk[5];
+ t2 =
+ Td0[(s2 >> 24) ] ^
+ Td1[(s1 >> 16) & 0xff] ^
+ Td2[(s0 >> 8) & 0xff] ^
+ Td3[(s3 ) & 0xff] ^
+ rk[6];
+ t3 =
+ Td0[(s3 >> 24) ] ^
+ Td1[(s2 >> 16) & 0xff] ^
+ Td2[(s1 >> 8) & 0xff] ^
+ Td3[(s0 ) & 0xff] ^
+ rk[7];
+
+ rk += 8;
+ if (--r == 0) {
+ break;
+ }
+
+ s0 =
+ Td0[(t0 >> 24) ] ^
+ Td1[(t3 >> 16) & 0xff] ^
+ Td2[(t2 >> 8) & 0xff] ^
+ Td3[(t1 ) & 0xff] ^
+ rk[0];
+ s1 =
+ Td0[(t1 >> 24) ] ^
+ Td1[(t0 >> 16) & 0xff] ^
+ Td2[(t3 >> 8) & 0xff] ^
+ Td3[(t2 ) & 0xff] ^
+ rk[1];
+ s2 =
+ Td0[(t2 >> 24) ] ^
+ Td1[(t1 >> 16) & 0xff] ^
+ Td2[(t0 >> 8) & 0xff] ^
+ Td3[(t3 ) & 0xff] ^
+ rk[2];
+ s3 =
+ Td0[(t3 >> 24) ] ^
+ Td1[(t2 >> 16) & 0xff] ^
+ Td2[(t1 >> 8) & 0xff] ^
+ Td3[(t0 ) & 0xff] ^
+ rk[3];
+ }
+#endif /* ?FULL_UNROLL */
+ /*
+ * apply last round and
+ * map cipher state to byte array block:
+ */
+ s0 =
+ (Td4[(t0 >> 24) ] & 0xff000000) ^
+ (Td4[(t3 >> 16) & 0xff] & 0x00ff0000) ^
+ (Td4[(t2 >> 8) & 0xff] & 0x0000ff00) ^
+ (Td4[(t1 ) & 0xff] & 0x000000ff) ^
+ rk[0];
+ PUTU32(pt , s0);
+ s1 =
+ (Td4[(t1 >> 24) ] & 0xff000000) ^
+ (Td4[(t0 >> 16) & 0xff] & 0x00ff0000) ^
+ (Td4[(t3 >> 8) & 0xff] & 0x0000ff00) ^
+ (Td4[(t2 ) & 0xff] & 0x000000ff) ^
+ rk[1];
+ PUTU32(pt + 4, s1);
+ s2 =
+ (Td4[(t2 >> 24) ] & 0xff000000) ^
+ (Td4[(t1 >> 16) & 0xff] & 0x00ff0000) ^
+ (Td4[(t0 >> 8) & 0xff] & 0x0000ff00) ^
+ (Td4[(t3 ) & 0xff] & 0x000000ff) ^
+ rk[2];
+ PUTU32(pt + 8, s2);
+ s3 =
+ (Td4[(t3 >> 24) ] & 0xff000000) ^
+ (Td4[(t2 >> 16) & 0xff] & 0x00ff0000) ^
+ (Td4[(t1 >> 8) & 0xff] & 0x0000ff00) ^
+ (Td4[(t0 ) & 0xff] & 0x000000ff) ^
+ rk[3];
+ PUTU32(pt + 12, s3);
+}
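+
+/*
+ * Round-trip sketch (hypothetical buffers): with schedules built from
+ * the same key, the two primitives invert each other on a 16-byte block:
+ *
+ *	u32 erk[4 * (MAXNR + 1)], drk[4 * (MAXNR + 1)];
+ *	u8 key[16], pt[16], ct[16], out[16];
+ *	int Nr;
+ *
+ *	Nr = __db_rijndaelKeySetupEnc(erk, key, 128);
+ *	(void)__db_rijndaelKeySetupDec(drk, key, 128);
+ *	__db_rijndaelEncrypt(erk, Nr, pt, ct);
+ *	__db_rijndaelDecrypt(drk, Nr, ct, out);
+ *
+ * after which memcmp(pt, out, 16) == 0.
+ */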
+
+#ifdef INTERMEDIATE_VALUE_KAT
+
+/*
+ * __db_rijndaelEncryptRound --
+ *
+ * PUBLIC: void __db_rijndaelEncryptRound __P((const u32 *, int, u8 *, int));
+ */
+void
+__db_rijndaelEncryptRound(rk, Nr, block, rounds)
+ const u32 *rk; /* rk[4*(Nr + 1)] */
+ int Nr;
+ u8 *block;
+ int rounds;
+{
+ int r;
+ u32 s0, s1, s2, s3, t0, t1, t2, t3;
+
+ /*
+ * map byte array block to cipher state
+ * and add initial round key:
+ */
+ s0 = GETU32(block ) ^ rk[0];
+ s1 = GETU32(block + 4) ^ rk[1];
+ s2 = GETU32(block + 8) ^ rk[2];
+ s3 = GETU32(block + 12) ^ rk[3];
+ rk += 4;
+
+ /*
+ * Nr - 1 full rounds:
+ */
+ for (r = (rounds < Nr ? rounds : Nr - 1); r > 0; r--) {
+ t0 =
+ Te0[(s0 >> 24) ] ^
+ Te1[(s1 >> 16) & 0xff] ^
+ Te2[(s2 >> 8) & 0xff] ^
+ Te3[(s3 ) & 0xff] ^
+ rk[0];
+ t1 =
+ Te0[(s1 >> 24) ] ^
+ Te1[(s2 >> 16) & 0xff] ^
+ Te2[(s3 >> 8) & 0xff] ^
+ Te3[(s0 ) & 0xff] ^
+ rk[1];
+ t2 =
+ Te0[(s2 >> 24) ] ^
+ Te1[(s3 >> 16) & 0xff] ^
+ Te2[(s0 >> 8) & 0xff] ^
+ Te3[(s1 ) & 0xff] ^
+ rk[2];
+ t3 =
+ Te0[(s3 >> 24) ] ^
+ Te1[(s0 >> 16) & 0xff] ^
+ Te2[(s1 >> 8) & 0xff] ^
+ Te3[(s2 ) & 0xff] ^
+ rk[3];
+
+ s0 = t0;
+ s1 = t1;
+ s2 = t2;
+ s3 = t3;
+ rk += 4;
+
+ }
+
+ /*
+ * apply last round and
+ * map cipher state to byte array block:
+ */
+ if (rounds == Nr) {
+ t0 =
+ (Te4[(s0 >> 24) ] & 0xff000000) ^
+ (Te4[(s1 >> 16) & 0xff] & 0x00ff0000) ^
+ (Te4[(s2 >> 8) & 0xff] & 0x0000ff00) ^
+ (Te4[(s3 ) & 0xff] & 0x000000ff) ^
+ rk[0];
+ t1 =
+ (Te4[(s1 >> 24) ] & 0xff000000) ^
+ (Te4[(s2 >> 16) & 0xff] & 0x00ff0000) ^
+ (Te4[(s3 >> 8) & 0xff] & 0x0000ff00) ^
+ (Te4[(s0 ) & 0xff] & 0x000000ff) ^
+ rk[1];
+ t2 =
+ (Te4[(s2 >> 24) ] & 0xff000000) ^
+ (Te4[(s3 >> 16) & 0xff] & 0x00ff0000) ^
+ (Te4[(s0 >> 8) & 0xff] & 0x0000ff00) ^
+ (Te4[(s1 ) & 0xff] & 0x000000ff) ^
+ rk[2];
+ t3 =
+ (Te4[(s3 >> 24) ] & 0xff000000) ^
+ (Te4[(s0 >> 16) & 0xff] & 0x00ff0000) ^
+ (Te4[(s1 >> 8) & 0xff] & 0x0000ff00) ^
+ (Te4[(s2 ) & 0xff] & 0x000000ff) ^
+ rk[3];
+
+ s0 = t0;
+ s1 = t1;
+ s2 = t2;
+ s3 = t3;
+ }
+
+ PUTU32(block , s0);
+ PUTU32(block + 4, s1);
+ PUTU32(block + 8, s2);
+ PUTU32(block + 12, s3);
+}
+
+/*
+ * __db_rijndaelDecryptRound --
+ *
+ * PUBLIC: void __db_rijndaelDecryptRound __P((const u32 *, int, u8 *, int));
+ */
+void
+__db_rijndaelDecryptRound(rk, Nr, block, rounds)
+ const u32 *rk; /* rk[4*(Nr + 1)] */
+ int Nr;
+ u8 *block;
+ int rounds;
+{
+ int r;
+ u32 s0, s1, s2, s3, t0, t1, t2, t3;
+
+ /*
+ * map byte array block to cipher state
+ * and add initial round key:
+ */
+ s0 = GETU32(block ) ^ rk[0];
+ s1 = GETU32(block + 4) ^ rk[1];
+ s2 = GETU32(block + 8) ^ rk[2];
+ s3 = GETU32(block + 12) ^ rk[3];
+ rk += 4;
+
+ /*
+ * Nr - 1 full rounds:
+ */
+ for (r = (rounds < Nr ? rounds : Nr) - 1; r > 0; r--) {
+ t0 =
+ Td0[(s0 >> 24) ] ^
+ Td1[(s3 >> 16) & 0xff] ^
+ Td2[(s2 >> 8) & 0xff] ^
+ Td3[(s1 ) & 0xff] ^
+ rk[0];
+ t1 =
+ Td0[(s1 >> 24) ] ^
+ Td1[(s0 >> 16) & 0xff] ^
+ Td2[(s3 >> 8) & 0xff] ^
+ Td3[(s2 ) & 0xff] ^
+ rk[1];
+ t2 =
+ Td0[(s2 >> 24) ] ^
+ Td1[(s1 >> 16) & 0xff] ^
+ Td2[(s0 >> 8) & 0xff] ^
+ Td3[(s3 ) & 0xff] ^
+ rk[2];
+ t3 =
+ Td0[(s3 >> 24) ] ^
+ Td1[(s2 >> 16) & 0xff] ^
+ Td2[(s1 >> 8) & 0xff] ^
+ Td3[(s0 ) & 0xff] ^
+ rk[3];
+
+ s0 = t0;
+ s1 = t1;
+ s2 = t2;
+ s3 = t3;
+ rk += 4;
+
+ }
+
+ /*
+ * complete the last round and
+ * map cipher state to byte array block:
+ */
+ t0 =
+ (Td4[(s0 >> 24) ] & 0xff000000) ^
+ (Td4[(s3 >> 16) & 0xff] & 0x00ff0000) ^
+ (Td4[(s2 >> 8) & 0xff] & 0x0000ff00) ^
+ (Td4[(s1 ) & 0xff] & 0x000000ff);
+ t1 =
+ (Td4[(s1 >> 24) ] & 0xff000000) ^
+ (Td4[(s0 >> 16) & 0xff] & 0x00ff0000) ^
+ (Td4[(s3 >> 8) & 0xff] & 0x0000ff00) ^
+ (Td4[(s2 ) & 0xff] & 0x000000ff);
+ t2 =
+ (Td4[(s2 >> 24) ] & 0xff000000) ^
+ (Td4[(s1 >> 16) & 0xff] & 0x00ff0000) ^
+ (Td4[(s0 >> 8) & 0xff] & 0x0000ff00) ^
+ (Td4[(s3 ) & 0xff] & 0x000000ff);
+ t3 =
+ (Td4[(s3 >> 24) ] & 0xff000000) ^
+ (Td4[(s2 >> 16) & 0xff] & 0x00ff0000) ^
+ (Td4[(s1 >> 8) & 0xff] & 0x0000ff00) ^
+ (Td4[(s0 ) & 0xff] & 0x000000ff);
+
+ if (rounds == Nr) {
+ t0 ^= rk[0];
+ t1 ^= rk[1];
+ t2 ^= rk[2];
+ t3 ^= rk[3];
+ }
+
+ PUTU32(block , t0);
+ PUTU32(block + 4, t1);
+ PUTU32(block + 8, t2);
+ PUTU32(block + 12, t3);
+}
+
+#endif /* INTERMEDIATE_VALUE_KAT */
diff --git a/src/crypto/rijndael/rijndael-alg-fst.h b/src/crypto/rijndael/rijndael-alg-fst.h
new file mode 100644
index 00000000..7d5e228c
--- /dev/null
+++ b/src/crypto/rijndael/rijndael-alg-fst.h
@@ -0,0 +1,40 @@
+/*
+ * $Id$
+ */
+/**
+ * rijndael-alg-fst.h
+ *
+ * @version 3.0 (December 2000)
+ *
+ * Optimised ANSI C code for the Rijndael cipher (now AES)
+ *
+ * @author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
+ * @author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
+ * @author Paulo Barreto <paulo.barreto@terra.com.br>
+ *
+ * This code is hereby placed in the public domain.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef __RIJNDAEL_ALG_FST_H
+#define __RIJNDAEL_ALG_FST_H
+
+#define MAXKC (256/32)
+#define MAXKB (256/8)
+#define MAXNR 14
+
+typedef u_int8_t u8;
+typedef u_int16_t u16;
+typedef u_int32_t u32;
+
+#endif /* __RIJNDAEL_ALG_FST_H */
diff --git a/src/crypto/rijndael/rijndael-api-fst.c b/src/crypto/rijndael/rijndael-api-fst.c
new file mode 100644
index 00000000..3fd6489d
--- /dev/null
+++ b/src/crypto/rijndael/rijndael-api-fst.c
@@ -0,0 +1,491 @@
+/**
+ * rijndael-api-fst.c
+ *
+ * @version 2.9 (December 2000)
+ *
+ * Optimised ANSI C code for the Rijndael cipher (now AES)
+ *
+ * @author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
+ * @author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
+ * @author Paulo Barreto <paulo.barreto@terra.com.br>
+ *
+ * This code is hereby placed in the public domain.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Acknowledgements:
+ *
+ * We are deeply indebted to the following people for their bug reports,
+ * fixes, and improvement suggestions to this implementation. Though we
+ * tried to list all contributions, we apologise in advance for any
+ * missing reference.
+ *
+ * Andrew Bales <Andrew.Bales@Honeywell.com>
+ * Markus Friedl <markus.friedl@informatik.uni-erlangen.de>
+ * John Skodon <skodonj@webquill.com>
+ */
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+
+#include "crypto/rijndael/rijndael-alg-fst.h"
+#include "crypto/rijndael/rijndael-api-fst.h"
+
+/*
+ * __db_makeKey --
+ *
+ * PUBLIC: int __db_makeKey __P((keyInstance *, int, int, char *));
+ */
+int
+__db_makeKey(key, direction, keyLen, keyMaterial)
+ keyInstance *key;
+ int direction;
+ int keyLen;
+ char *keyMaterial;
+{
+ u8 cipherKey[MAXKB];
+
+ if (key == NULL) {
+ return BAD_KEY_INSTANCE;
+ }
+
+ if ((direction == DIR_ENCRYPT) || (direction == DIR_DECRYPT)) {
+ key->direction = direction;
+ } else {
+ return BAD_KEY_DIR;
+ }
+
+ if ((keyLen == 128) || (keyLen == 192) || (keyLen == 256)) {
+ key->keyLen = keyLen;
+ } else {
+ return BAD_KEY_MAT;
+ }
+
+ if (keyMaterial != NULL) {
+ memcpy(cipherKey, keyMaterial, key->keyLen/8);
+ }
+
+ if (direction == DIR_ENCRYPT) {
+ key->Nr = __db_rijndaelKeySetupEnc(key->rk, cipherKey, keyLen);
+ } else {
+ key->Nr = __db_rijndaelKeySetupDec(key->rk, cipherKey, keyLen);
+ }
+ __db_rijndaelKeySetupEnc(key->ek, cipherKey, keyLen);
+ return TRUE;
+}
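+
+/*
+ * Usage sketch (hypothetical names): build the schedules for AES-128
+ * encryption; keyMaterial must supply at least keyLen/8 raw key bytes:
+ *
+ *	keyInstance ki;
+ *
+ *	if (__db_makeKey(&ki, DIR_ENCRYPT, 128, rawkey) != TRUE)
+ *		handle the BAD_KEY_* error;
+ *
+ * ki.rk then holds the direction-specific schedule, and ki.ek always
+ * holds the encryption schedule, which CFB1 mode needs in both
+ * directions.
+ */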
+
+/*
+ * __db_cipherInit --
+ *
+ * PUBLIC: int __db_cipherInit __P((cipherInstance *, int, char *));
+ */
+int
+__db_cipherInit(cipher, mode, IV)
+ cipherInstance *cipher;
+ int mode;
+ char *IV;
+{
+ if ((mode == MODE_ECB) || (mode == MODE_CBC) || (mode == MODE_CFB1)) {
+ cipher->mode = mode;
+ } else {
+ return BAD_CIPHER_MODE;
+ }
+ if (IV != NULL) {
+ memcpy(cipher->IV, IV, MAX_IV_SIZE);
+ }
+ return TRUE;
+}
+
+/*
+ * __db_blockEncrypt --
+ *
+ * PUBLIC: int __db_blockEncrypt __P((cipherInstance *, keyInstance *, u_int8_t *,
+ * PUBLIC: size_t, u_int8_t *));
+ */
+int
+__db_blockEncrypt(cipher, key, input, inputLen, outBuffer)
+ cipherInstance *cipher;
+ keyInstance *key;
+ u_int8_t *input;
+ size_t inputLen;
+ u_int8_t *outBuffer;
+{
+ int i, k, t, numBlocks;
+ u8 block[16], *iv;
+ u32 tmpiv[4];
+
+ if (cipher == NULL ||
+ key == NULL ||
+ key->direction == DIR_DECRYPT) {
+ return BAD_CIPHER_STATE;
+ }
+ if (input == NULL || inputLen == 0) {
+ return 0; /* nothing to do */
+ }
+
+ numBlocks = (int)(inputLen/128);
+
+ switch (cipher->mode) {
+ case MODE_ECB:
+ for (i = numBlocks; i > 0; i--) {
+ __db_rijndaelEncrypt(key->rk, key->Nr, input, outBuffer);
+ input += 16;
+ outBuffer += 16;
+ }
+ break;
+
+ case MODE_CBC:
+ iv = cipher->IV;
+ for (i = numBlocks; i > 0; i--) {
+ memcpy(tmpiv, iv, MAX_IV_SIZE);
+ ((u32*)block)[0] = ((u32*)input)[0] ^ tmpiv[0];
+ ((u32*)block)[1] = ((u32*)input)[1] ^ tmpiv[1];
+ ((u32*)block)[2] = ((u32*)input)[2] ^ tmpiv[2];
+ ((u32*)block)[3] = ((u32*)input)[3] ^ tmpiv[3];
+ __db_rijndaelEncrypt(key->rk, key->Nr, block, outBuffer);
+ iv = outBuffer;
+ input += 16;
+ outBuffer += 16;
+ }
+ break;
+
+ case MODE_CFB1:
+ iv = cipher->IV;
+ for (i = numBlocks; i > 0; i--) {
+ memcpy(outBuffer, input, 16);
+ for (k = 0; k < 128; k++) {
+ __db_rijndaelEncrypt(key->ek, key->Nr, iv, block);
+ outBuffer[k >> 3] ^= (block[0] & (u_int)0x80) >> (k & 7);
+ for (t = 0; t < 15; t++) {
+ iv[t] = (iv[t] << 1) | (iv[t + 1] >> 7);
+ }
+ iv[15] = (iv[15] << 1) | ((outBuffer[k >> 3] >> (7 - (k & 7))) & 1);
+ }
+ outBuffer += 16;
+ input += 16;
+ }
+ break;
+
+ default:
+ return BAD_CIPHER_STATE;
+ }
+
+ return 128*numBlocks;
+}
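+
+/*
+ * Caller's note: inputLen is measured in bits, not bytes, and only whole
+ * 128-bit blocks are processed; a trailing partial block is silently
+ * dropped.  A sketch (hypothetical buffers, ki from __db_makeKey above)
+ * CBC-encrypting 32 bytes:
+ *
+ *	cipherInstance ci;
+ *	u_int8_t in[32], out[32];
+ *	int n;
+ *
+ *	(void)__db_cipherInit(&ci, MODE_CBC, iv);
+ *	n = __db_blockEncrypt(&ci, &ki, in, 32 * 8, out);
+ *
+ * n is 256 (bits) on success.  CFB1 mode instead walks each block one
+ * bit at a time, feeding a sliding 128-bit IV through the forward
+ * cipher, which is why __db_blockDecrypt also uses key->ek for it.
+ */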
+
+/**
+ * Encrypt data partitioned in octets, using RFC 2040-like padding.
+ *
+ * @param input data to be encrypted (octet sequence)
+ * @param inputOctets input length in octets (not bits)
+ * @param outBuffer encrypted output data
+ *
+ * @return length in octets (not bits) of the encrypted output buffer.
+ */
+/*
+ * __db_padEncrypt --
+ *
+ * PUBLIC: int __db_padEncrypt __P((cipherInstance *, keyInstance *, u_int8_t *,
+ * PUBLIC: int, u_int8_t *));
+ */
+int
+__db_padEncrypt(cipher, key, input, inputOctets, outBuffer)
+ cipherInstance *cipher;
+ keyInstance *key;
+ u_int8_t *input;
+ int inputOctets;
+ u_int8_t *outBuffer;
+{
+ int i, numBlocks, padLen;
+ u8 block[16], *iv;
+ u32 tmpiv[4];
+
+ if (cipher == NULL ||
+ key == NULL ||
+ key->direction == DIR_DECRYPT) {
+ return BAD_CIPHER_STATE;
+ }
+ if (input == NULL || inputOctets <= 0) {
+ return 0; /* nothing to do */
+ }
+
+ numBlocks = inputOctets/16;
+
+ switch (cipher->mode) {
+ case MODE_ECB:
+ for (i = numBlocks; i > 0; i--) {
+ __db_rijndaelEncrypt(key->rk, key->Nr, input, outBuffer);
+ input += 16;
+ outBuffer += 16;
+ }
+ padLen = 16 - (inputOctets - 16*numBlocks);
+ DB_ASSERT(NULL, padLen > 0 && padLen <= 16);
+ memcpy(block, input, 16 - padLen);
+ memset(block + 16 - padLen, padLen, padLen);
+ __db_rijndaelEncrypt(key->rk, key->Nr, block, outBuffer);
+ break;
+
+ case MODE_CBC:
+ iv = cipher->IV;
+ for (i = numBlocks; i > 0; i--) {
+ memcpy(tmpiv, iv, MAX_IV_SIZE);
+ ((u32*)block)[0] = ((u32*)input)[0] ^ tmpiv[0];
+ ((u32*)block)[1] = ((u32*)input)[1] ^ tmpiv[1];
+ ((u32*)block)[2] = ((u32*)input)[2] ^ tmpiv[2];
+ ((u32*)block)[3] = ((u32*)input)[3] ^ tmpiv[3];
+ __db_rijndaelEncrypt(key->rk, key->Nr, block, outBuffer);
+ iv = outBuffer;
+ input += 16;
+ outBuffer += 16;
+ }
+ padLen = 16 - (inputOctets - 16*numBlocks);
+ DB_ASSERT(NULL, padLen > 0 && padLen <= 16);
+ for (i = 0; i < 16 - padLen; i++) {
+ block[i] = input[i] ^ iv[i];
+ }
+ for (i = 16 - padLen; i < 16; i++) {
+ block[i] = (u_int8_t)padLen ^ iv[i];
+ }
+ __db_rijndaelEncrypt(key->rk, key->Nr, block, outBuffer);
+ break;
+
+ default:
+ return BAD_CIPHER_STATE;
+ }
+
+ return 16*(numBlocks + 1);
+}
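+
+/*
+ * Padding example: the RFC 2040-style scheme always appends padLen
+ * bytes, each equal to padLen, where padLen = 16 - (inputOctets % 16)
+ * and is never zero.  Encrypting 21 octets therefore emits two blocks,
+ * the second holding 5 data bytes followed by 11 bytes of 0x0b, and
+ * the return value is 32.  An input that is already a multiple of 16
+ * gains a whole extra block of 0x10 pad bytes.
+ */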
+
+/*
+ * __db_blockDecrypt --
+ *
+ * PUBLIC: int __db_blockDecrypt __P((cipherInstance *, keyInstance *, u_int8_t *,
+ * PUBLIC: size_t, u_int8_t *));
+ */
+int
+__db_blockDecrypt(cipher, key, input, inputLen, outBuffer)
+ cipherInstance *cipher;
+ keyInstance *key;
+ u_int8_t *input;
+ size_t inputLen;
+ u_int8_t *outBuffer;
+{
+ int i, k, t, numBlocks;
+ u8 block[16], *iv;
+ u32 tmpiv[4];
+
+ if (cipher == NULL ||
+ key == NULL ||
+ (cipher->mode != MODE_CFB1 && key->direction == DIR_ENCRYPT)) {
+ return BAD_CIPHER_STATE;
+ }
+ if (input == NULL || inputLen == 0) {
+ return 0; /* nothing to do */
+ }
+
+ numBlocks = (int)(inputLen/128);
+
+ switch (cipher->mode) {
+ case MODE_ECB:
+ for (i = numBlocks; i > 0; i--) {
+ __db_rijndaelDecrypt(key->rk, key->Nr, input, outBuffer);
+ input += 16;
+ outBuffer += 16;
+ }
+ break;
+
+ case MODE_CBC:
+ memcpy(tmpiv, cipher->IV, MAX_IV_SIZE);
+ for (i = numBlocks; i > 0; i--) {
+ __db_rijndaelDecrypt(key->rk, key->Nr, input, block);
+ ((u32*)block)[0] ^= tmpiv[0];
+ ((u32*)block)[1] ^= tmpiv[1];
+ ((u32*)block)[2] ^= tmpiv[2];
+ ((u32*)block)[3] ^= tmpiv[3];
+ memcpy(tmpiv, input, 16);
+ memcpy(outBuffer, block, 16);
+ input += 16;
+ outBuffer += 16;
+ }
+ break;
+
+ case MODE_CFB1:
+ iv = cipher->IV;
+ for (i = numBlocks; i > 0; i--) {
+ memcpy(outBuffer, input, 16);
+ for (k = 0; k < 128; k++) {
+ __db_rijndaelEncrypt(key->ek, key->Nr, iv, block);
+ for (t = 0; t < 15; t++) {
+ iv[t] = (iv[t] << 1) | (iv[t + 1] >> 7);
+ }
+ iv[15] = (iv[15] << 1) | ((input[k >> 3] >> (7 - (k & 7))) & 1);
+ outBuffer[k >> 3] ^= (block[0] & (u_int)0x80) >> (k & 7);
+ }
+ outBuffer += 16;
+ input += 16;
+ }
+ break;
+
+ default:
+ return BAD_CIPHER_STATE;
+ }
+
+ return 128*numBlocks;
+}
+
+/*
+ * __db_padDecrypt --
+ *
+ * PUBLIC: int __db_padDecrypt __P((cipherInstance *, keyInstance *, u_int8_t *,
+ * PUBLIC: int, u_int8_t *));
+ */
+int
+__db_padDecrypt(cipher, key, input, inputOctets, outBuffer)
+ cipherInstance *cipher;
+ keyInstance *key;
+ u_int8_t *input;
+ int inputOctets;
+ u_int8_t *outBuffer;
+{
+ int i, numBlocks, padLen;
+ u8 block[16];
+ u32 tmpiv[4];
+
+ if (cipher == NULL ||
+ key == NULL ||
+ key->direction == DIR_ENCRYPT) {
+ return BAD_CIPHER_STATE;
+ }
+ if (input == NULL || inputOctets <= 0) {
+ return 0; /* nothing to do */
+ }
+ if (inputOctets % 16 != 0) {
+ return BAD_DATA;
+ }
+
+ numBlocks = inputOctets/16;
+
+ switch (cipher->mode) {
+ case MODE_ECB:
+ /* all blocks but last */
+ for (i = numBlocks - 1; i > 0; i--) {
+ __db_rijndaelDecrypt(key->rk, key->Nr, input, outBuffer);
+ input += 16;
+ outBuffer += 16;
+ }
+ /* last block */
+ __db_rijndaelDecrypt(key->rk, key->Nr, input, block);
+ padLen = block[15];
+ if (padLen >= 16) {
+ return BAD_DATA;
+ }
+ for (i = 16 - padLen; i < 16; i++) {
+ if (block[i] != padLen) {
+ return BAD_DATA;
+ }
+ }
+ memcpy(outBuffer, block, 16 - padLen);
+ break;
+
+ case MODE_CBC:
+ /* all blocks but last */
+ memcpy(tmpiv, cipher->IV, MAX_IV_SIZE);
+ for (i = numBlocks - 1; i > 0; i--) {
+ __db_rijndaelDecrypt(key->rk, key->Nr, input, block);
+ ((u32*)block)[0] ^= tmpiv[0];
+ ((u32*)block)[1] ^= tmpiv[1];
+ ((u32*)block)[2] ^= tmpiv[2];
+ ((u32*)block)[3] ^= tmpiv[3];
+ memcpy(tmpiv, input, 16);
+ memcpy(outBuffer, block, 16);
+ input += 16;
+ outBuffer += 16;
+ }
+ /* last block */
+ __db_rijndaelDecrypt(key->rk, key->Nr, input, block);
+ ((u32*)block)[0] ^= tmpiv[0];
+ ((u32*)block)[1] ^= tmpiv[1];
+ ((u32*)block)[2] ^= tmpiv[2];
+ ((u32*)block)[3] ^= tmpiv[3];
+ padLen = block[15];
+ if (padLen <= 0 || padLen > 16) {
+ return BAD_DATA;
+ }
+ for (i = 16 - padLen; i < 16; i++) {
+ if (block[i] != padLen) {
+ return BAD_DATA;
+ }
+ }
+ memcpy(outBuffer, block, 16 - padLen);
+ break;
+
+ default:
+ return BAD_CIPHER_STATE;
+ }
+
+ return 16*numBlocks - padLen;
+}
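+
+/*
+ * Validation example: the pad length is read from the last byte of the
+ * decrypted final block, and every pad byte must match it.  For the
+ * 21-octet example under __db_padEncrypt, block[15] is 0x0b, bytes
+ * 5..15 are checked against 0x0b, the 5 plaintext bytes are copied out,
+ * and the function returns 16*2 - 11 = 21.  Any mismatch (or an
+ * out-of-range length byte) yields BAD_DATA.
+ */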
+
+#ifdef INTERMEDIATE_VALUE_KAT
+/**
+ * cipherUpdateRounds:
+ *
+ * Encrypts/Decrypts exactly one full block for a specified number of rounds.
+ * Only used in the Intermediate Value Known Answer Test.
+ *
+ * Returns:
+ * TRUE - on success
+ * BAD_CIPHER_STATE - cipher in bad state (e.g., not initialized)
+ * BAD_KEY_DIR - key direction is invalid
+ */
+/*
+ * __db_cipherUpdateRounds --
+ *
+ * PUBLIC: int __db_cipherUpdateRounds __P((cipherInstance *, keyInstance *,
+ * PUBLIC: u_int8_t *, size_t, u_int8_t *, int));
+ */
+int
+__db_cipherUpdateRounds(cipher, key, input, inputLen, outBuffer, rounds)
+ cipherInstance *cipher;
+ keyInstance *key;
+ u_int8_t *input;
+ size_t inputLen;
+ u_int8_t *outBuffer;
+ int rounds;
+{
+ u8 block[16];
+
+ if (cipher == NULL || key == NULL) {
+ return BAD_CIPHER_STATE;
+ }
+
+ memcpy(block, input, 16);
+
+ switch (key->direction) {
+ case DIR_ENCRYPT:
+ __db_rijndaelEncryptRound(key->rk, key->Nr, block, rounds);
+ break;
+
+ case DIR_DECRYPT:
+ __db_rijndaelDecryptRound(key->rk, key->Nr, block, rounds);
+ break;
+
+ default:
+ return BAD_KEY_DIR;
+ }
+
+ memcpy(outBuffer, block, 16);
+
+ return TRUE;
+}
+#endif /* INTERMEDIATE_VALUE_KAT */
diff --git a/src/crypto/rijndael/rijndael-api-fst.h b/src/crypto/rijndael/rijndael-api-fst.h
new file mode 100644
index 00000000..3e31920a
--- /dev/null
+++ b/src/crypto/rijndael/rijndael-api-fst.h
@@ -0,0 +1,91 @@
+/*
+ * $Id$
+ */
+/**
+ * rijndael-api-fst.h
+ *
+ * @version 2.9 (December 2000)
+ *
+ * Optimised ANSI C code for the Rijndael cipher (now AES)
+ *
+ * @author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
+ * @author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
+ * @author Paulo Barreto <paulo.barreto@terra.com.br>
+ *
+ * This code is hereby placed in the public domain.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Acknowledgements:
+ *
+ * We are deeply indebted to the following people for their bug reports,
+ * fixes, and improvement suggestions to this implementation. Though we
+ * tried to list all contributions, we apologise in advance for any
+ * missing reference.
+ *
+ * Andrew Bales <Andrew.Bales@Honeywell.com>
+ * Markus Friedl <markus.friedl@informatik.uni-erlangen.de>
+ * John Skodon <skodonj@webquill.com>
+ */
+
+#ifndef __RIJNDAEL_API_FST_H
+#define __RIJNDAEL_API_FST_H
+
+#include "crypto/rijndael/rijndael-alg-fst.h"
+
+/* Generic Defines */
+#define DIR_ENCRYPT 0 /* Are we encrypting? */
+#define DIR_DECRYPT 1 /* Are we decrypting? */
+#define MODE_ECB 1 /* Are we ciphering in ECB mode? */
+#define MODE_CBC 2 /* Are we ciphering in CBC mode? */
+#define MODE_CFB1 3 /* Are we ciphering in 1-bit CFB mode? */
+#undef TRUE
+#define TRUE 1
+#undef FALSE
+#define FALSE 0
+#define BITSPERBLOCK 128 /* Default number of bits in a cipher block */
+
+/* Error Codes */
+#define BAD_KEY_DIR -1 /* Key direction is invalid, e.g., unknown value */
+#define BAD_KEY_MAT -2 /* Key material not of correct length */
+#define BAD_KEY_INSTANCE -3 /* Key passed is not valid */
+#define BAD_CIPHER_MODE -4 /* Params struct passed to cipherInit invalid */
+#define BAD_CIPHER_STATE -5 /* Cipher in wrong state (e.g., not initialized) */
+#define BAD_BLOCK_LENGTH -6
+#define BAD_CIPHER_INSTANCE -7
+#define BAD_DATA -8 /* Data contents are invalid, e.g., invalid padding */
+#define BAD_OTHER -9 /* Unknown error */
+
+/* Algorithm-specific Defines */
+#define MAX_KEY_SIZE 64 /* # of ASCII chars needed to represent a key */
+#define MAX_IV_SIZE 16 /* # bytes needed to represent an IV */
+
+/* Typedefs */
+
+/* The structure for key information */
+typedef struct {
+ u_int8_t direction; /* Key used for encrypting or decrypting? */
+ int keyLen; /* Length of the key */
+ char keyMaterial[MAX_KEY_SIZE+1]; /* Raw key data in ASCII, e.g., user input or KAT values */
+ int Nr; /* key-length-dependent number of rounds */
+ u32 rk[4*(MAXNR + 1)]; /* key schedule */
+ u32 ek[4*(MAXNR + 1)]; /* CFB1 key schedule (encryption only) */
+} keyInstance;
+
+/* The structure for cipher information */
+typedef struct { /* changed order of the components */
+ u_int8_t mode; /* MODE_ECB, MODE_CBC, or MODE_CFB1 */
+ u_int8_t IV[MAX_IV_SIZE]; /* A possible Initialization Vector for ciphering */
+} cipherInstance;
+
+#endif /* __RIJNDAEL_API_FST_H */
diff --git a/src/db/crdel.src b/src/db/crdel.src
new file mode 100644
index 00000000..70473899
--- /dev/null
+++ b/src/db/crdel.src
@@ -0,0 +1,71 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+DBPRIVATE
+PREFIX __crdel
+
+INCLUDE #include "db_int.h"
+INCLUDE #include "dbinc/crypto.h"
+INCLUDE #include "dbinc/db_page.h"
+INCLUDE #include "dbinc/db_dispatch.h"
+INCLUDE #include "dbinc/db_am.h"
+INCLUDE #include "dbinc/txn.h"
+INCLUDE
+
+/*
+ * Metasub: log the creation of a subdatabase meta-data page.
+ *
+ * fileid: identifies the file being acted upon.
+ * pgno: page number on which to write this meta-data page
+ * page: the actual meta-data page
+ * lsn: lsn of the page.
+ */
+BEGIN metasub 42 142
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+PGDBT page DBT s
+POINTER lsn DB_LSN * lu
+END
+
+/*
+ * Inmem_create: Log the creation of an in-memory database.
+ *
+ * fileid: identifies the file being created
+ * name: Name of the database
+ * fid: File id of the database
+ * pgsize: Page size of the database
+ */
+BEGIN inmem_create 44 138
+ARG fileid int32_t ld
+DBT name DBT s
+DBT fid DBT s
+ARG pgsize u_int32_t lu
+END
+
+/*
+ * Inmem_rename: Log the renaming of an in-memory only database.
+ *
+ * oldname: database's starting name
+ * newname: database's ending name
+ * fid: fileid
+ */
+BEGIN inmem_rename 44 139
+DBT oldname DBT s
+DBT newname DBT s
+DBT fid DBT s
+END
+
+/*
+ * Inmem_remove: Log the removal of an in-memory only database.
+ *
+ * name: database's ending name
+ * fid: fileid
+ */
+BEGIN inmem_remove 44 140
+DBT name DBT s
+DBT fid DBT s
+END
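+
+/*
+ * Reader's note: these specs are input to gen_rec.awk, which generates
+ * the crdel_auto.c and crdel_autop.c files that follow.  Each BEGIN
+ * line carries the record name, a log-version number, and the record's
+ * type id (e.g. 142 for metasub, the value behind DB___crdel_metasub in
+ * the dispatch tables); each field line gives the field class (ARG,
+ * DBT, PGDBT, POINTER, DB), the field name, its C type, and the printf
+ * length/conversion used when the record is dumped.
+ */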
+
diff --git a/src/db/crdel_auto.c b/src/db/crdel_auto.c
new file mode 100644
index 00000000..a2a3f54b
--- /dev/null
+++ b/src/db/crdel_auto.c
@@ -0,0 +1,59 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_dispatch.h"
+#include "dbinc/db_am.h"
+#include "dbinc/txn.h"
+
+DB_LOG_RECSPEC __crdel_metasub_desc[] = {
+ {LOGREC_DB, SSZ(__crdel_metasub_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__crdel_metasub_args, pgno), "pgno", "%lu"},
+ {LOGREC_PGDBT, SSZ(__crdel_metasub_args, page), "page", ""},
+ {LOGREC_POINTER, SSZ(__crdel_metasub_args, lsn), "lsn", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __crdel_inmem_create_desc[] = {
+ {LOGREC_ARG, SSZ(__crdel_inmem_create_args, fileid), "fileid", "%ld"},
+ {LOGREC_DBT, SSZ(__crdel_inmem_create_args, name), "name", ""},
+ {LOGREC_DBT, SSZ(__crdel_inmem_create_args, fid), "fid", ""},
+ {LOGREC_ARG, SSZ(__crdel_inmem_create_args, pgsize), "pgsize", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __crdel_inmem_rename_desc[] = {
+ {LOGREC_DBT, SSZ(__crdel_inmem_rename_args, oldname), "oldname", ""},
+ {LOGREC_DBT, SSZ(__crdel_inmem_rename_args, newname), "newname", ""},
+ {LOGREC_DBT, SSZ(__crdel_inmem_rename_args, fid), "fid", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __crdel_inmem_remove_desc[] = {
+ {LOGREC_DBT, SSZ(__crdel_inmem_remove_args, name), "name", ""},
+ {LOGREC_DBT, SSZ(__crdel_inmem_remove_args, fid), "fid", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+/*
+ * PUBLIC: int __crdel_init_recover __P((ENV *, DB_DISTAB *));
+ */
+int
+__crdel_init_recover(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __crdel_metasub_recover, DB___crdel_metasub)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __crdel_inmem_create_recover, DB___crdel_inmem_create)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __crdel_inmem_rename_recover, DB___crdel_inmem_rename)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __crdel_inmem_remove_recover, DB___crdel_inmem_remove)) != 0)
+ return (ret);
+ return (0);
+}
diff --git a/src/db/crdel_autop.c b/src/db/crdel_autop.c
new file mode 100644
index 00000000..79bd4d99
--- /dev/null
+++ b/src/db/crdel_autop.c
@@ -0,0 +1,103 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_dispatch.h"
+#include "dbinc/db_am.h"
+#include "dbinc/txn.h"
+
+/*
+ * PUBLIC: int __crdel_metasub_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__crdel_metasub_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__crdel_metasub", __crdel_metasub_desc, info));
+}
+
+/*
+ * PUBLIC: int __crdel_inmem_create_print __P((ENV *, DBT *,
+ * PUBLIC: DB_LSN *, db_recops, void *));
+ */
+int
+__crdel_inmem_create_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__crdel_inmem_create", __crdel_inmem_create_desc, info));
+}
+
+/*
+ * PUBLIC: int __crdel_inmem_rename_print __P((ENV *, DBT *,
+ * PUBLIC: DB_LSN *, db_recops, void *));
+ */
+int
+__crdel_inmem_rename_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__crdel_inmem_rename", __crdel_inmem_rename_desc, info));
+}
+
+/*
+ * PUBLIC: int __crdel_inmem_remove_print __P((ENV *, DBT *,
+ * PUBLIC: DB_LSN *, db_recops, void *));
+ */
+int
+__crdel_inmem_remove_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__crdel_inmem_remove", __crdel_inmem_remove_desc, info));
+}
+
+/*
+ * PUBLIC: int __crdel_init_print __P((ENV *, DB_DISTAB *));
+ */
+int
+__crdel_init_print(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __crdel_metasub_print, DB___crdel_metasub)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __crdel_inmem_create_print, DB___crdel_inmem_create)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __crdel_inmem_rename_print, DB___crdel_inmem_rename)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __crdel_inmem_remove_print, DB___crdel_inmem_remove)) != 0)
+ return (ret);
+ return (0);
+}
diff --git a/src/db/crdel_rec.c b/src/db/crdel_rec.c
new file mode 100644
index 00000000..08e7bae8
--- /dev/null
+++ b/src/db/crdel_rec.c
@@ -0,0 +1,301 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/fop.h"
+#include "dbinc/hash.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+/*
+ * __crdel_metasub_recover --
+ * Recovery function for metasub.
+ *
+ * PUBLIC: int __crdel_metasub_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__crdel_metasub_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __crdel_metasub_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ int cmp_p, ret, t_ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__crdel_metasub_print);
+ REC_INTRO(__crdel_metasub_read, ip, 0);
+
+ /*
+ * If we are undoing this operation, but the DB that we got back
+ * was never really opened, then this open was an in-memory open
+ * that did not finish. We can let the file creation take care
+ * of any necessary undo/cleanup.
+ */
+ if (DB_UNDO(op) && !F_ISSET(file_dbp, DB_AM_OPEN_CALLED))
+ goto done;
+
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
+ /*
+ * If this is an in-memory file, this might be OK.  Also, heap
+ * can get here through a truncate, and we have to redo page 1.
+ */
+ if ((file_dbp->type == DB_HEAP ||
+ F_ISSET(file_dbp, DB_AM_INMEM)) &&
+ (ret = __memp_fget(mpf, &argp->pgno, ip, NULL,
+ DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &pagep)) == 0) {
+ if (F_ISSET(file_dbp, DB_AM_INMEM))
+ LSN_NOT_LOGGED(LSN(pagep));
+ } else {
+ *lsnp = argp->prev_lsn;
+ ret = 0;
+ goto out;
+ }
+ }
+
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn);
+
+ if (cmp_p == 0 && DB_REDO(op)) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ memcpy(pagep, argp->page.data, argp->page.size);
+ LSN(pagep) = *lsnp;
+
+ /*
+ * If this was an in-memory database and we are re-creating
+ * and this is the meta-data page, then we need to set up a
+ * bunch of fields in the dbp as well.
+ */
+ if (F_ISSET(file_dbp, DB_AM_INMEM) &&
+ argp->pgno == PGNO_BASE_MD &&
+ (ret = __db_meta_setup(file_dbp->env, file_dbp,
+ file_dbp->dname, (DBMETA *)pagep, 0, DB_CHK_META)) != 0)
+ goto out;
+ } else if (DB_UNDO(op)) {
+ /*
+ * We want to undo this page creation. The page creation
+ * happened in two parts. First, we called __db_pg_alloc which
+ * was logged separately. Then we wrote the meta-data onto
+ * the page. So long as we restore the LSN, then the recovery
+ * for __db_pg_alloc will do everything else.
+ *
+ * Don't bother checking the lsn on the page. If we are
+ * rolling back the next thing is that this page will get
+ * freed. Opening the subdb will have reinitialized the
+ * page, but not the lsn.
+ */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ LSN(pagep) = argp->lsn;
+ }
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL && (t_ret = __memp_fput(mpf,
+ ip, pagep, file_dbp->priority)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+
+ REC_CLOSE;
+}
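+
+/*
+ * The function above follows the standard page-LSN recovery idiom:
+ * compare the on-page LSN with the LSN the log record remembers.  On
+ * redo, reapply the change only if the page still shows the old LSN,
+ * then stamp the page with this record's LSN; on undo, roll the page
+ * LSN back so the earlier __db_pg_alloc record can complete the
+ * cleanup.  Either way *lsnp is reset to prev_lsn so the dispatcher
+ * can keep walking the transaction's backward log chain.
+ */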
+
+/*
+ * __crdel_inmem_create_recover --
+ * Recovery function for inmem_create.
+ *
+ * PUBLIC: int __crdel_inmem_create_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__crdel_inmem_create_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __crdel_inmem_create_args *argp;
+ DB *dbp;
+ int do_close, ret, t_ret;
+
+ COMPQUIET(info, NULL);
+
+ dbp = NULL;
+ do_close = 0;
+ REC_PRINT(__crdel_inmem_create_print);
+ REC_NOOP_INTRO(__crdel_inmem_create_read);
+
+ /* First, see if the DB handle already exists. */
+ if (argp->fileid == DB_LOGFILEID_INVALID) {
+ if (DB_REDO(op))
+ ret = ENOENT;
+ else
+ ret = 0;
+ } else
+ ret = __dbreg_id_to_db(env, argp->txnp, &dbp, argp->fileid, 0);
+
+ if (DB_REDO(op)) {
+ /*
+ * If the dbreg failed, that means that we're creating a
+ * tmp file.
+ */
+ if (ret != 0) {
+ if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+ goto out;
+
+ F_SET(dbp, DB_AM_RECOVER | DB_AM_INMEM);
+ memcpy(dbp->fileid, argp->fid.data, DB_FILE_ID_LEN);
+ if (((ret = __os_strdup(env,
+ argp->name.data, &dbp->dname)) != 0))
+ goto out;
+
+ /*
+ * This DBP is never going to be entered into the
+ * dbentry table, so if we leave it open here,
+ * then we're going to lose it.
+ */
+ do_close = 1;
+ }
+
+ /* Now, set the fileid. */
+ memcpy(dbp->fileid, argp->fid.data, argp->fid.size);
+ if ((ret = __memp_set_fileid(dbp->mpf, dbp->fileid)) != 0)
+ goto out;
+ dbp->preserve_fid = 1;
+ MAKE_INMEM(dbp);
+ if ((ret = __env_setup(dbp,
+ NULL, NULL, argp->name.data, TXN_INVALID, 0)) != 0)
+ goto out;
+ ret = __env_mpool(dbp, argp->name.data, 0);
+
+ if (ret == ENOENT) {
+ dbp->pgsize = argp->pgsize;
+ if ((ret = __env_mpool(dbp,
+ argp->name.data, DB_CREATE)) != 0)
+ goto out;
+ } else if (ret != 0)
+ goto out;
+ }
+
+ if (DB_UNDO(op)) {
+ if (ret == 0)
+ ret = __memp_nameop(env, argp->fid.data, NULL,
+ (const char *)argp->name.data, NULL, 1);
+
+ if (ret == ENOENT || ret == DB_DELETED)
+ ret = 0;
+ else
+ goto out;
+ }
+
+ *lsnp = argp->prev_lsn;
+
+out: if (dbp != NULL) {
+ t_ret = 0;
+
+ if (do_close || ret != 0)
+ t_ret = __db_close(dbp, NULL, DB_NOSYNC);
+ if (t_ret != 0 && ret == 0)
+ ret = t_ret;
+ }
+ REC_NOOP_CLOSE;
+}
+
+/*
+ * __crdel_inmem_rename_recover --
+ * Recovery function for inmem_rename.
+ *
+ * PUBLIC: int __crdel_inmem_rename_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__crdel_inmem_rename_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __crdel_inmem_rename_args *argp;
+ u_int8_t *fileid;
+ int ret;
+
+ COMPQUIET(info, NULL);
+
+ REC_PRINT(__crdel_inmem_rename_print);
+ REC_NOOP_INTRO(__crdel_inmem_rename_read);
+ fileid = argp->fid.data;
+
+ /* Void out errors because the files may or may not still exist. */
+ if (DB_REDO(op))
+ (void)__memp_nameop(env, fileid,
+ (const char *)argp->newname.data,
+ (const char *)argp->oldname.data,
+ (const char *)argp->newname.data, 1);
+
+ if (DB_UNDO(op))
+ (void)__memp_nameop(env, fileid,
+ (const char *)argp->oldname.data,
+ (const char *)argp->newname.data,
+ (const char *)argp->oldname.data, 1);
+
+ *lsnp = argp->prev_lsn;
+ ret = 0;
+
+ REC_NOOP_CLOSE;
+}
+
+/*
+ * __crdel_inmem_remove_recover --
+ * Recovery function for inmem_remove.
+ *
+ * PUBLIC: int __crdel_inmem_remove_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__crdel_inmem_remove_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __crdel_inmem_remove_args *argp;
+ int ret;
+
+ COMPQUIET(info, NULL);
+
+ REC_PRINT(__crdel_inmem_remove_print);
+ REC_NOOP_INTRO(__crdel_inmem_remove_read);
+
+ /*
+ * Since removes are delayed, there is no undo for a remove, only redo.
+ * The remove may fail, which is OK.
+ */
+ if (DB_REDO(op)) {
+ (void)__memp_nameop(env,
+ argp->fid.data, NULL, argp->name.data, NULL, 1);
+ }
+
+ *lsnp = argp->prev_lsn;
+ ret = 0;
+
+ REC_NOOP_CLOSE;
+}
diff --git a/src/db/db.c b/src/db/db.c
new file mode 100644
index 00000000..0d9d1e6e
--- /dev/null
+++ b/src/db/db.c
@@ -0,0 +1,1659 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_swap.h"
+#include "dbinc/btree.h"
+#include "dbinc/fop.h"
+#include "dbinc/hash.h"
+#include "dbinc/heap.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/partition.h"
+#include "dbinc/qam.h"
+#include "dbinc/txn.h"
+
+static int __db_disassociate __P((DB *));
+static int __db_disassociate_foreign __P ((DB *));
+
+#ifdef CONFIG_TEST
+static int __db_makecopy __P((ENV *, const char *, const char *));
+static int __qam_testdocopy __P((DB *, const char *));
+#endif
+
+/*
+ * DB.C --
+ * This file contains the utility functions for the DBP layer.
+ */
+
+/*
+ * __db_master_open --
+ * Open up a handle on a master database.
+ *
+ * PUBLIC: int __db_master_open __P((DB *, DB_THREAD_INFO *,
+ * PUBLIC: DB_TXN *, const char *, u_int32_t, int, DB **));
+ */
+int
+__db_master_open(subdbp, ip, txn, name, flags, mode, dbpp)
+ DB *subdbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name;
+ u_int32_t flags;
+ int mode;
+ DB **dbpp;
+{
+ DB *dbp;
+ int ret;
+
+ *dbpp = NULL;
+
+ /* Open up a handle on the main database. */
+ if ((ret = __db_create_internal(&dbp, subdbp->env, 0)) != 0)
+ return (ret);
+
+ /*
+ * It's always a btree.
+ * Run in the transaction we've created.
+ * Set the pagesize in case we're creating a new database.
+ * Flag that we're creating a database with subdatabases.
+ */
+ dbp->pgsize = subdbp->pgsize;
+ F_SET(dbp, DB_AM_SUBDB);
+ F_SET(dbp, F_ISSET(subdbp,
+ DB_AM_RECOVER | DB_AM_SWAP |
+ DB_AM_ENCRYPT | DB_AM_CHKSUM | DB_AM_NOT_DURABLE));
+
+ /*
+ * If there was a subdb specified, then we only want to apply
+ * DB_EXCL to the subdb, not the actual file. We only got here
+ * because there was a subdb specified.
+ */
+ LF_CLR(DB_EXCL);
+ LF_SET(DB_RDWRMASTER);
+ if ((ret = __db_open(dbp, ip, txn,
+ name, NULL, DB_BTREE, flags, mode, PGNO_BASE_MD)) != 0)
+ goto err;
+
+ /*
+ * The items in dbp are initialized from the master file's meta page.
+ * Other items such as checksum and encryption are checked when we
+ * read the meta-page, so we do not check those here. However, if
+ * the meta-page caused checksumming to be turned on and it wasn't
+ * already, set it here.
+ */
+ if (F_ISSET(dbp, DB_AM_CHKSUM))
+ F_SET(subdbp, DB_AM_CHKSUM);
+
+ /*
+ * The user may have specified a page size for an existing file,
+ * which we want to ignore.
+ */
+ subdbp->pgsize = dbp->pgsize;
+ *dbpp = dbp;
+
+ if (0) {
+err: if (!F_ISSET(dbp, DB_AM_DISCARD))
+ (void)__db_close(dbp, txn, DB_NOSYNC);
+ }
+
+ return (ret);
+}
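
For orientation, `__db_master_open` is reached whenever a database (subdatabase) name is passed to the public `DB->open`. A minimal caller-side sketch; the file and subdatabase names here are hypothetical:

```c
#include <db.h>

/*
 * Open (or create) a named subdatabase. The container file's master
 * database is opened internally, as in __db_master_open above.
 */
int
open_subdb(DB_ENV *dbenv, DB_TXN *txn, DB **dbpp)
{
	DB *dbp;
	int ret;

	if ((ret = db_create(&dbp, dbenv, 0)) != 0)
		return (ret);
	if ((ret = dbp->open(dbp, txn,
	    "file.db", "subdb", DB_BTREE, DB_CREATE, 0644)) != 0) {
		(void)dbp->close(dbp, 0);  /* failed handles must still be closed */
		return (ret);
	}
	*dbpp = dbp;
	return (0);
}
```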
+
+/*
+ * __db_master_update --
+ * Add/Open/Remove a subdatabase from a master database.
+ *
+ * PUBLIC: int __db_master_update __P((DB *, DB *, DB_THREAD_INFO *, DB_TXN *,
+ * PUBLIC: const char *, DBTYPE, mu_action, const char *, u_int32_t));
+ */
+int
+__db_master_update(mdbp, sdbp, ip, txn, subdb, type, action, newname, flags)
+ DB *mdbp, *sdbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *subdb;
+ DBTYPE type;
+ mu_action action;
+ const char *newname;
+ u_int32_t flags;
+{
+ DBC *dbc, *ndbc;
+ DBT key, data, ndata;
+ ENV *env;
+ PAGE *p, *r;
+ db_pgno_t t_pgno;
+ int modify, ret, t_ret;
+
+ env = mdbp->env;
+ dbc = ndbc = NULL;
+ p = NULL;
+
+ /*
+ * Open up a cursor. If this is CDB and we're creating the database,
+ * make it an update cursor.
+ *
+ * Might we modify the master database? If so, we'll need to lock.
+ */
+ modify = (!F_ISSET(mdbp, DB_AM_RDONLY) &&
+ (action != MU_OPEN || LF_ISSET(DB_CREATE))) ? 1 : 0;
+
+ if ((ret = __db_cursor(mdbp, ip, txn, &dbc,
+ (CDB_LOCKING(env) && modify) ? DB_WRITECURSOR : 0)) != 0)
+ return (ret);
+
+ /*
+ * Point the cursor at the record.
+ *
+ * If we're removing or potentially creating an entry, lock the page
+ * with DB_RMW.
+ *
+ * We do multiple cursor operations with the cursor in some cases and
+ * subsequently access the data DBT information. Set DB_DBT_MALLOC so
+ * we don't risk modification of the data between our uses of it.
+ *
+ * !!!
+ * We don't include the name's nul termination in the database.
+ */
+ DB_INIT_DBT(key, subdb, strlen(subdb));
+ memset(&data, 0, sizeof(data));
+ F_SET(&data, DB_DBT_MALLOC);
+
+ ret = __dbc_get(dbc, &key, &data,
+ DB_SET | ((STD_LOCKING(dbc) && modify) ? DB_RMW : 0));
+
+ /*
+ * What we do next--whether or not we found a record for the
+ * specified subdatabase--depends on what the specified action is.
+ * Handle ret appropriately as the first statement of each case.
+ */
+ switch (action) {
+ case MU_REMOVE:
+ /*
+ * We should have found something if we're removing it. Note
+ * that in the common case where the DB we're asking to remove
+ * doesn't exist, we won't get this far; __db_subdb_remove
+ * will already have returned an error from __db_open.
+ */
+ if (ret != 0)
+ goto err;
+
+ /*
+ * Delete the subdatabase entry first; if this fails,
+ * we don't want to touch the actual subdb pages.
+ */
+ if ((ret = __dbc_del(dbc, 0)) != 0)
+ goto err;
+
+ /*
+ * We're handling actual data, not on-page meta-data,
+ * so it hasn't been converted to/from opposite
+ * endian architectures. Do it explicitly, now.
+ */
+ memcpy(&sdbp->meta_pgno, data.data, sizeof(db_pgno_t));
+ DB_NTOHL_SWAP(env, &sdbp->meta_pgno);
+ if ((ret = __memp_fget(mdbp->mpf, &sdbp->meta_pgno,
+ ip, dbc->txn, DB_MPOOL_DIRTY, &p)) != 0)
+ goto err;
+
+ /* Free the root on the master db if it was created. */
+ if (TYPE(p) == P_BTREEMETA &&
+ ((BTMETA *)p)->root != PGNO_INVALID) {
+ if ((ret = __memp_fget(mdbp->mpf,
+ &((BTMETA *)p)->root, ip, dbc->txn,
+ DB_MPOOL_DIRTY, &r)) != 0)
+ goto err;
+
+ /* Free and put the page. */
+ if ((ret = __db_free(dbc, r, 0)) != 0) {
+ r = NULL;
+ goto err;
+ }
+ }
+ /* Free and put the page. */
+ if ((ret = __db_free(dbc, p, 0)) != 0) {
+ p = NULL;
+ goto err;
+ }
+ p = NULL;
+ break;
+ case MU_RENAME:
+ /* We should have found something if we're renaming it. */
+ if (ret != 0)
+ goto err;
+
+ /*
+ * Before we rename, we need to make sure we're not
+ * overwriting another subdatabase, or else this operation
+ * won't be undoable. Open a second cursor and check
+ * for the existence of newname; it shouldn't appear under
+ * us since we hold the metadata lock.
+ */
+ if ((ret = __db_cursor(mdbp, ip, txn, &ndbc,
+ CDB_LOCKING(env) ? DB_WRITECURSOR : 0)) != 0)
+ goto err;
+ DB_SET_DBT(key, newname, strlen(newname));
+
+ /*
+ * We don't actually care what the meta page of the potentially-
+ * overwritten DB is; we just care about existence.
+ */
+ memset(&ndata, 0, sizeof(ndata));
+ F_SET(&ndata, DB_DBT_USERMEM | DB_DBT_PARTIAL);
+
+ if ((ret = __dbc_get(ndbc, &key, &ndata, DB_SET)) == 0) {
+ /* A subdb called newname exists. Bail. */
+ ret = EEXIST;
+ __db_errx(env, DB_STR_A("0673",
+ "rename: database %s exists", "%s"), newname);
+ goto err;
+ } else if (ret != DB_NOTFOUND)
+ goto err;
+
+ /*
+ * Now do the put first; we don't want to lose our only
+ * reference to the subdb. Use the second cursor so the
+ * first one continues to point to the old record.
+ */
+ if ((ret = __dbc_put(ndbc, &key, &data, DB_KEYFIRST)) != 0)
+ goto err;
+ if ((ret = __dbc_del(dbc, 0)) != 0) {
+ /*
+ * If the delete fails, try to delete the record
+ * we just put, in case we're not txn-protected.
+ */
+ (void)__dbc_del(ndbc, 0);
+ goto err;
+ }
+
+ break;
+ case MU_OPEN:
+ /*
+ * Get the subdatabase information. If it already exists,
+ * copy out the page number and we're done.
+ */
+ switch (ret) {
+ case 0:
+ if (LF_ISSET(DB_CREATE) && LF_ISSET(DB_EXCL)) {
+ ret = EEXIST;
+ goto err;
+ }
+ memcpy(&sdbp->meta_pgno, data.data, sizeof(db_pgno_t));
+ DB_NTOHL_SWAP(env, &sdbp->meta_pgno);
+ goto done;
+ case DB_NOTFOUND:
+ if (LF_ISSET(DB_CREATE))
+ break;
+ /*
+ * No db_err: this open may be on behalf of a remove, and it
+ * is reasonable to remove a nonexistent db.
+ */
+ ret = ENOENT;
+ goto err;
+ default:
+ goto err;
+ }
+
+ /* Create a subdatabase. */
+ if (F_ISSET(mdbp, DB_AM_RDONLY)) {
+ ret = EBADF;
+ goto err;
+ }
+ if ((ret = __db_new(dbc,
+ type == DB_HASH ? P_HASHMETA : P_BTREEMETA, NULL, &p)) != 0)
+ goto err;
+ sdbp->meta_pgno = PGNO(p);
+
+ /*
+ * XXX
+ * We're handling actual data, not on-page meta-data, so it
+ * hasn't been converted to/from opposite endian architectures.
+ * Do it explicitly, now.
+ */
+ t_pgno = PGNO(p);
+ DB_HTONL_SWAP(env, &t_pgno);
+ memset(&ndata, 0, sizeof(ndata));
+ ndata.data = &t_pgno;
+ ndata.size = sizeof(db_pgno_t);
+ if ((ret = __dbc_put(dbc, &key, &ndata, 0)) != 0)
+ goto err;
+ F_SET(sdbp, DB_AM_CREATED);
+ break;
+
+ case MU_MOVE:
+ /* We should have found something if we're moving it. */
+ if (ret != 0)
+ goto err;
+ t_pgno = sdbp->meta_pgno;
+ DB_HTONL_SWAP(env, &t_pgno);
+ memset(&ndata, 0, sizeof(ndata));
+ ndata.data = &t_pgno;
+ ndata.size = sizeof(db_pgno_t);
+ if ((ret = __dbc_put(dbc, &key, &ndata, 0)) != 0)
+ goto err;
+ mdbp->mpf->mfp->revision++;
+ break;
+ }
+
+err:
+done: /*
+ * If we allocated a page: if we're successful, mark the page dirty
+ * and return it to the cache, otherwise, discard/free it.
+ */
+ if (p != NULL && (t_ret = __memp_fput(mdbp->mpf,
+ dbc->thread_info, p, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Discard the cursor(s) and data. */
+ if (data.data != NULL)
+ __os_ufree(env, data.data);
+ if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ndbc != NULL && (t_ret = __dbc_close(ndbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
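
As the byte-swapping above shows, the master database is an ordinary btree whose keys are subdatabase names (stored without nul termination) and whose data items are meta page numbers in network byte order. The documented way to enumerate it is to open the file with a NULL database name; a sketch, with error handling compressed:

```c
#include <stdio.h>
#include <string.h>
#include <db.h>

/* Print the subdatabase names stored in a file's master database. */
int
list_subdbs(DB_ENV *dbenv, const char *file)
{
	DB *dbp;
	DBC *dbc;
	DBT key, data;
	int ret, t_ret;

	if ((ret = db_create(&dbp, dbenv, 0)) != 0)
		return (ret);
	/* A NULL database name yields the master database itself. */
	if ((ret = dbp->open(dbp,
	    NULL, file, NULL, DB_UNKNOWN, DB_RDONLY, 0)) != 0)
		goto err;
	if ((ret = dbp->cursor(dbp, NULL, &dbc, 0)) != 0)
		goto err;
	memset(&key, 0, sizeof(key));
	memset(&data, 0, sizeof(data));
	while ((ret = dbc->get(dbc, &key, &data, DB_NEXT)) == 0)
		printf("%.*s\n", (int)key.size, (char *)key.data);
	if (ret == DB_NOTFOUND)
		ret = 0;
	if ((t_ret = dbc->close(dbc)) != 0 && ret == 0)
		ret = t_ret;
err:	if ((t_ret = dbp->close(dbp, 0)) != 0 && ret == 0)
		ret = t_ret;
	return (ret);
}
```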
+
+/*
+ * __env_dbreg_setup --
+ * Set up dbreg bookkeeping (an FNAME entry and, when logging,
+ * a log fileid) for a database handle.
+ *
+ * PUBLIC: int __env_dbreg_setup __P((DB *,
+ * PUBLIC: DB_TXN *, const char *, const char *, u_int32_t));
+ */
+int
+__env_dbreg_setup(dbp, txn, fname, dname, id)
+ DB *dbp;
+ DB_TXN *txn;
+ const char *fname, *dname;
+ u_int32_t id;
+{
+ ENV *env;
+ int ret;
+
+ env = dbp->env;
+ if (dbp->log_filename == NULL
+#if !defined(DEBUG_ROP) && !defined(DEBUG_WOP) && !defined(DIAGNOSTIC)
+ && (txn != NULL || F_ISSET(dbp, DB_AM_RECOVER))
+#endif
+#if !defined(DEBUG_ROP)
+ && !F_ISSET(dbp, DB_AM_RDONLY)
+#endif
+ ) {
+ if ((ret = __dbreg_setup(dbp,
+ F_ISSET(dbp, DB_AM_INMEM) ? dname: fname,
+ F_ISSET(dbp, DB_AM_INMEM) ? NULL : dname, id)) != 0)
+ return (ret);
+
+ /*
+ * If we're actively logging and our caller isn't a
+ * recovery function that already did so, then assign
+ * this dbp a log fileid.
+ */
+ if (DBENV_LOGGING(env) && !F_ISSET(dbp, DB_AM_RECOVER) &&
+ (ret = __dbreg_new_id(dbp, txn)) != 0)
+ return (ret);
+ }
+ return (0);
+}
+
+/*
+ * __env_setup --
+ * Set up the underlying environment during a db_open.
+ *
+ * PUBLIC: int __env_setup __P((DB *,
+ * PUBLIC: DB_TXN *, const char *, const char *, u_int32_t, u_int32_t));
+ */
+int
+__env_setup(dbp, txn, fname, dname, id, flags)
+ DB *dbp;
+ DB_TXN *txn;
+ const char *fname, *dname;
+ u_int32_t id, flags;
+{
+ DB *ldbp;
+ DB_ENV *dbenv;
+ ENV *env;
+ u_int32_t maxid;
+ int ret;
+
+ env = dbp->env;
+ dbenv = env->dbenv;
+
+ /*
+ * When verifying an in-memory db, we need to pass dname to
+ * __env_mpool. That is the only time fname will be used.
+ */
+ if (F_ISSET(dbp, DB_AM_INMEM) && F_ISSET(dbp, DB_AM_VERIFYING))
+ fname = dname;
+
+ /* If we don't yet have an environment, it's time to create it. */
+ if (!F_ISSET(env, ENV_OPEN_CALLED)) {
+#if defined(HAVE_MIXED_SIZE_ADDRESSING) && (SIZEOF_CHAR_P == 8)
+ __db_errx(env, DB_STR("0701", "DB_PRIVATE is not supported by"
+ " 64-bit applications in mixed-size-addressing mode"));
+ return (EINVAL);
+#endif
+ /* Make sure we have at least DB_MINCACHE pages in our cache. */
+ if (dbenv->mp_gbytes == 0 &&
+ dbenv->mp_bytes < dbp->pgsize * DB_MINPAGECACHE &&
+ (ret = __memp_set_cachesize(
+ dbenv, 0, dbp->pgsize * DB_MINPAGECACHE, 0)) != 0)
+ return (ret);
+
+ if ((ret = __env_open(dbenv, NULL, DB_CREATE |
+ DB_INIT_MPOOL | DB_PRIVATE | LF_ISSET(DB_THREAD), 0)) != 0)
+ return (ret);
+ }
+
+ /* Join the underlying cache. */
+ if ((!F_ISSET(dbp, DB_AM_INMEM) || F_ISSET(dbp, DB_AM_VERIFYING) ||
+ dname == NULL) && (ret = __env_mpool(dbp, fname, flags)) != 0)
+ return (ret);
+
+ /* We may need a per-thread mutex. */
+ if (LF_ISSET(DB_THREAD) && (ret = __mutex_alloc(
+ env, MTX_DB_HANDLE, DB_MUTEX_PROCESS_ONLY, &dbp->mutex)) != 0)
+ return (ret);
+
+ /*
+ * Set up a bookkeeping entry for this database in the log region,
+ * if such a region exists. Note that even if we're in recovery
+ * or a replication client, where we won't log registries, we'll
+ * still need an FNAME struct, so LOGGING_ON is the correct macro.
+ */
+ if (LOGGING_ON(env) &&
+ (!F_ISSET(dbp, DB_AM_INMEM) || dname == NULL) &&
+ (ret = __env_dbreg_setup(dbp, txn, fname, dname, id)) != 0)
+ return (ret);
+
+ /*
+ * Insert ourselves into the ENV's dblist. We allocate a
+ * unique ID to each {fileid, meta page number} pair, and to
+ * each temporary file (since they all have a zero fileid).
+ * This ID gives us something to use to tell which DB handles
+ * go with which databases in all the cursor adjustment
+ * routines, where we don't want to do a lot of ugly and
+ * expensive memcmps.
+ */
+ MUTEX_LOCK(env, env->mtx_dblist);
+ maxid = 0;
+ TAILQ_FOREACH(ldbp, &env->dblist, dblistlinks) {
+ /*
+ * There are three cases: on-disk database (first clause),
+ * named in-memory database (second clause), temporary database
+ * (never matches; no clause).
+ */
+ if (!F_ISSET(dbp, DB_AM_INMEM)) {
+ if (memcmp(ldbp->fileid, dbp->fileid, DB_FILE_ID_LEN)
+ == 0 && ldbp->meta_pgno == dbp->meta_pgno)
+ break;
+ } else if (dname != NULL) {
+ if (F_ISSET(ldbp, DB_AM_INMEM) &&
+ ldbp->dname != NULL &&
+ strcmp(ldbp->dname, dname) == 0)
+ break;
+ }
+ if (ldbp->adj_fileid > maxid)
+ maxid = ldbp->adj_fileid;
+ }
+
+ /*
+ * If ldbp is NULL, we didn't find a match. Assign the dbp an
+ * adj_fileid one higher than the largest we found, and
+ * insert it at the head of the master dbp list.
+ *
+ * If ldbp is not NULL, it is a match for our dbp. Give dbp
+ * the same ID that ldbp has, and add it after ldbp so they're
+ * together in the list.
+ */
+ if (ldbp == NULL) {
+ dbp->adj_fileid = maxid + 1;
+ TAILQ_INSERT_HEAD(&env->dblist, dbp, dblistlinks);
+ } else {
+ dbp->adj_fileid = ldbp->adj_fileid;
+ TAILQ_INSERT_AFTER(&env->dblist, ldbp, dbp, dblistlinks);
+ }
+ MUTEX_UNLOCK(env, env->mtx_dblist);
+
+ return (0);
+}
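
Stripped of DB's structures, the adj_fileid assignment is a small pattern: scan the list, share the ID of the first matching handle, otherwise claim one greater than the maximum seen. The same logic with hypothetical types:

```c
#include <stddef.h>

/* `key' stands in for the {fileid, meta_pgno} / dname match above. */
struct handle {
	struct handle *next;
	unsigned int id;
	int key;
};

unsigned int
assign_id(struct handle *head, int key)
{
	struct handle *h;
	unsigned int maxid;

	maxid = 0;
	for (h = head; h != NULL; h = h->next) {
		if (h->key == key)
			return (h->id);		/* share the existing ID */
		if (h->id > maxid)
			maxid = h->id;
	}
	return (maxid + 1);			/* one above the largest seen */
}
```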
+
+/*
+ * __env_mpool --
+ * Set up the underlying environment cache during a db_open.
+ *
+ * PUBLIC: int __env_mpool __P((DB *, const char *, u_int32_t));
+ */
+int
+__env_mpool(dbp, fname, flags)
+ DB *dbp;
+ const char *fname;
+ u_int32_t flags;
+{
+ DBT pgcookie;
+ DB_MPOOLFILE *mpf;
+ DB_PGINFO pginfo;
+ ENV *env;
+ int fidset, ftype, ret;
+ int32_t lsn_off;
+ u_int8_t nullfid[DB_FILE_ID_LEN];
+ u_int32_t clear_len;
+
+ env = dbp->env;
+
+ /* The LSN is the first entry on a DB page, byte offset 0. */
+ lsn_off = F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LSN_OFF_NOTSET : 0;
+
+ /* It's possible that this database is already open. */
+ if (F_ISSET(dbp, DB_AM_OPEN_CALLED))
+ return (0);
+
+ /*
+ * If we need to pre- or post-process a file's pages on I/O, set the
+ * file type. If it's a hash file, always call the pgin and pgout
+ * routines. This means that hash files can never be mapped into
+ * process memory. If it's a btree file and requires swapping, we
+ * need to page the file in and out. This has to be right -- we can't
+ * mmap files that are being paged in and out.
+ */
+ switch (dbp->type) {
+ case DB_BTREE:
+ case DB_HEAP:
+ case DB_RECNO:
+ ftype = F_ISSET(dbp, DB_AM_SWAP | DB_AM_ENCRYPT | DB_AM_CHKSUM)
+ ? DB_FTYPE_SET : DB_FTYPE_NOTSET;
+ clear_len = CRYPTO_ON(env) ?
+ (dbp->pgsize != 0 ? dbp->pgsize : DB_CLEARLEN_NOTSET) :
+ DB_PAGE_DB_LEN;
+ break;
+ case DB_HASH:
+ ftype = DB_FTYPE_SET;
+ clear_len = CRYPTO_ON(env) ?
+ (dbp->pgsize != 0 ? dbp->pgsize : DB_CLEARLEN_NOTSET) :
+ DB_PAGE_DB_LEN;
+ break;
+ case DB_QUEUE:
+ ftype = F_ISSET(dbp,
+ DB_AM_SWAP | DB_AM_ENCRYPT | DB_AM_CHKSUM) ?
+ DB_FTYPE_SET : DB_FTYPE_NOTSET;
+
+ /*
+ * If we came in here without a pagesize set, then we need
+ * to mark the in-memory handle as having clear_len not
+ * set, because we don't really know the clear length or
+ * the page size yet (since the file doesn't yet exist).
+ */
+ clear_len = dbp->pgsize != 0 ? dbp->pgsize : DB_CLEARLEN_NOTSET;
+ break;
+ case DB_UNKNOWN:
+ /*
+ * If we're running in the verifier, our database might
+ * be corrupt and we might not know its type--but we may
+ * still want to be able to verify and salvage.
+ *
+ * If we can't identify the type, it's not going to be safe
+ * to call __db_pgin--we pretty much have to give up all
+ * hope of salvaging cross-endianness. Proceed anyway;
+ * at worst, the database will just appear more corrupt
+ * than it actually is, but at best, we may be able
+ * to salvage some data even with no metadata page.
+ */
+ if (F_ISSET(dbp, DB_AM_VERIFYING)) {
+ ftype = DB_FTYPE_NOTSET;
+ clear_len = DB_PAGE_DB_LEN;
+ break;
+ }
+
+ /*
+ * This might be an in-memory file and we won't know its
+ * file type until after we open it and read the meta-data
+ * page.
+ */
+ if (F_ISSET(dbp, DB_AM_INMEM)) {
+ clear_len = DB_CLEARLEN_NOTSET;
+ ftype = DB_FTYPE_NOTSET;
+ lsn_off = DB_LSN_OFF_NOTSET;
+ break;
+ }
+ /* FALLTHROUGH */
+ default:
+ return (__db_unknown_type(env, "DB->open", dbp->type));
+ }
+
+ mpf = dbp->mpf;
+
+ memset(nullfid, 0, DB_FILE_ID_LEN);
+ fidset = memcmp(nullfid, dbp->fileid, DB_FILE_ID_LEN);
+ if (fidset)
+ (void)__memp_set_fileid(mpf, dbp->fileid);
+
+ (void)__memp_set_clear_len(mpf, clear_len);
+ (void)__memp_set_ftype(mpf, ftype);
+ (void)__memp_set_lsn_offset(mpf, lsn_off);
+
+ pginfo.db_pagesize = dbp->pgsize;
+ pginfo.flags =
+ F_ISSET(dbp, (DB_AM_CHKSUM | DB_AM_ENCRYPT | DB_AM_SWAP));
+ pginfo.type = dbp->type;
+ pgcookie.data = &pginfo;
+ pgcookie.size = sizeof(DB_PGINFO);
+ (void)__memp_set_pgcookie(mpf, &pgcookie);
+
+#ifndef DIAG_MVCC
+ if (F_ISSET(env->dbenv, DB_ENV_MULTIVERSION))
+#endif
+ if (F_ISSET(dbp, DB_AM_TXN) &&
+ dbp->type != DB_QUEUE && dbp->type != DB_UNKNOWN)
+ LF_SET(DB_MULTIVERSION);
+
+ if ((ret = __memp_fopen(mpf, NULL, fname, &dbp->dirname,
+ LF_ISSET(DB_CREATE | DB_DURABLE_UNKNOWN | DB_MULTIVERSION |
+ DB_NOMMAP | DB_ODDFILESIZE | DB_RDONLY | DB_TRUNCATE) |
+ (F_ISSET(env->dbenv, DB_ENV_DIRECT_DB) ? DB_DIRECT : 0) |
+ (F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_TXN_NOT_DURABLE : 0),
+ 0, dbp->pgsize)) != 0) {
+ /*
+ * The open didn't work; we need to reset the mpf,
+ * retaining the in-memory semantics (if any).
+ */
+ (void)__memp_fclose(dbp->mpf, 0);
+ (void)__memp_fcreate(env, &dbp->mpf);
+ if (F_ISSET(dbp, DB_AM_INMEM))
+ MAKE_INMEM(dbp);
+ return (ret);
+ }
+
+ /*
+ * Set the open flag. We use it to mean that the dbp has gone
+ * through mpf setup, including dbreg_register. Also, below,
+ * the underlying access method open functions may want to do
+ * things like acquire cursors, so the open flag has to be set
+ * before calling them.
+ */
+ F_SET(dbp, DB_AM_OPEN_CALLED);
+ if (!fidset && fname != NULL) {
+ (void)__memp_get_fileid(dbp->mpf, dbp->fileid);
+ dbp->preserve_fid = 1;
+ }
+
+ return (0);
+}
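
The ftype and pgcookie settings above are the internal counterpart of the public mpool page-conversion interface: a file registered with a file type has its pgin/pgout callbacks run on every page read and write, which is also why such files can never be memory-mapped. A sketch of the application-level equivalent; MY_FTYPE and the callback bodies are placeholders:

```c
#include <db.h>

#define	MY_FTYPE	20		/* application-chosen file-type code */

static int
my_pgin(DB_ENV *dbenv, db_pgno_t pgno, void *pgaddr, DBT *pgcookie)
{
	/* Convert the page to its in-memory form after it is read. */
	return (0);
}

static int
my_pgout(DB_ENV *dbenv, db_pgno_t pgno, void *pgaddr, DBT *pgcookie)
{
	/* Convert the page back to its on-disk form before it is written. */
	return (0);
}

int
register_conversion(DB_ENV *dbenv)
{
	return (dbenv->memp_register(dbenv, MY_FTYPE, my_pgin, my_pgout));
}
```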
+
+/*
+ * __db_close --
+ * DB->close method.
+ *
+ * PUBLIC: int __db_close __P((DB *, DB_TXN *, u_int32_t));
+ */
+int
+__db_close(dbp, txn, flags)
+ DB *dbp;
+ DB_TXN *txn;
+ u_int32_t flags;
+{
+ ENV *env;
+ int db_ref, deferred_close, ret, t_ret;
+
+ env = dbp->env;
+ deferred_close = 0;
+
+ PERFMON4(env, db, close,
+ dbp->fname, dbp->dname, flags, &dbp->fileid[0]);
+
+ /* Refresh the structure and close any underlying resources. */
+ ret = __db_refresh(dbp, txn, flags, &deferred_close, 0);
+
+ /*
+ * If we've deferred the close because the logging of the close failed,
+ * return our failure right away without destroying the handle.
+ */
+ if (deferred_close)
+ return (ret);
+
+ /* !!!
+ * This code has an apparent race between the moment we read and
+ * decrement env->db_ref and the moment we check whether it's 0.
+ * However, if the environment is DBLOCAL, the user shouldn't have a
+ * reference to the env handle anyway; the only way we can get
+ * multiple dbps sharing a local env is if we open them internally
+ * during something like a subdatabase open. If any such thing is
+ * going on while the user is closing the original dbp with a local
+ * env, someone's already badly screwed up, so there's no reason
+ * to bother engineering around this possibility.
+ */
+ MUTEX_LOCK(env, env->mtx_dblist);
+ db_ref = --env->db_ref;
+ MUTEX_UNLOCK(env, env->mtx_dblist);
+ if (F_ISSET(env, ENV_DBLOCAL) && db_ref == 0 &&
+ (t_ret = __env_close(env->dbenv, 0)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Free the database handle. */
+ memset(dbp, CLEAR_BYTE, sizeof(*dbp));
+ __os_free(env, dbp);
+
+ return (ret);
+}
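
One consequence of the final memset/free pair: the handle is gone when `DB->close` returns, whether or not it reports an error (the deferred-close path above is the lone exception, kept alive for the abort). A caller-side sketch:

```c
#include <db.h>

/* Close a handle; DB_NOSYNC skips the final flush for discardable data. */
int
close_db(DB *dbp, int discard)
{
	int ret;

	ret = dbp->close(dbp, discard ? DB_NOSYNC : 0);
	/* dbp has been freed here; never touch it again, even on error. */
	return (ret);
}
```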
+
+/*
+ * __db_refresh --
+ * Refresh the DB structure, releasing any allocated resources.
+ * This does most of the work of closing files now because refresh
+ * is what is used during abort processing (since we can't destroy
+ * the actual handle) and during abort processing, we may have a
+ * fully opened handle.
+ *
+ * PUBLIC: int __db_refresh __P((DB *, DB_TXN *, u_int32_t, int *, int));
+ */
+int
+__db_refresh(dbp, txn, flags, deferred_closep, reuse)
+ DB *dbp;
+ DB_TXN *txn;
+ u_int32_t flags;
+ int *deferred_closep, reuse;
+{
+ DB *sdbp;
+ DBC *dbc;
+ DB_FOREIGN_INFO *f_info, *tmp;
+ DB_LOCKER *locker;
+ DB_LOCKREQ lreq;
+ ENV *env;
+ REGENV *renv;
+ REGINFO *infop;
+ u_int32_t save_flags;
+ int resync, ret, t_ret;
+
+ ret = 0;
+
+ env = dbp->env;
+ infop = env->reginfo;
+ if (infop != NULL)
+ renv = infop->primary;
+ else
+ renv = NULL;
+
+ /*
+ * If this dbp is not completely open, there is no mpool file to
+ * sync; set DB_NOSYNC so we don't trap trying to sync one.
+ */
+ if (dbp->mpf == NULL)
+ LF_SET(DB_NOSYNC);
+
+ /* If never opened, or not currently open, it's easy. */
+ if (!F_ISSET(dbp, DB_AM_OPEN_CALLED))
+ goto never_opened;
+
+ /*
+ * If we have any secondary indices, disassociate them from us.
+ * We don't bother with the mutex here; it only protects some
+ * of the ops that will make us core-dump mid-close anyway, and
+ * if you're trying to do something with a secondary *while* you're
+ * closing the primary, you deserve what you get. The disassociation
+ * is mostly done just so we can close primaries and secondaries in
+ * any order--but within one thread of control.
+ */
+ LIST_FOREACH(sdbp, &dbp->s_secondaries, s_links) {
+ LIST_REMOVE(sdbp, s_links);
+ if ((t_ret = __db_disassociate(sdbp)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ if (F_ISSET(dbp, DB_AM_SECONDARY))
+ LIST_REMOVE(dbp, s_links);
+
+ /*
+ * Disassociate ourself from any databases using us as a foreign key
+ * database by clearing the referring db's pointer. Reclaim memory.
+ */
+ f_info = LIST_FIRST(&dbp->f_primaries);
+ while (f_info != NULL) {
+ tmp = LIST_NEXT(f_info, f_links);
+ LIST_REMOVE(f_info, f_links);
+ f_info->dbp->s_foreign = NULL;
+ __os_free(env, f_info);
+ f_info = tmp;
+ }
+
+ if (dbp->s_foreign != NULL &&
+ (t_ret = __db_disassociate_foreign(dbp)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /*
+ * Sync the underlying access method. Do before closing the cursors
+ * because DB->sync allocates cursors in order to write Recno backing
+ * source text files.
+ *
+ * Sync is slow on some systems, notably Solaris filesystems where the
+ * entire buffer cache is searched. If we're in recovery, don't flush
+ * the file, it's not necessary.
+ */
+ if (!LF_ISSET(DB_NOSYNC) &&
+ !F_ISSET(dbp, DB_AM_DISCARD | DB_AM_RECOVER) &&
+ (t_ret = __db_sync(dbp)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /*
+ * Go through the active cursors, unregister each cursor from its
+ * transaction if any, and call the cursor recycle routine,
+ * which resolves pending operations and moves the cursors onto the
+ * free list. Then, walk the free list and call the cursor destroy
+ * routine. Note that any failure on a close is considered "really
+ * bad" and we just break out of the loop and force forward.
+ */
+ resync = TAILQ_FIRST(&dbp->active_queue) == NULL ? 0 : 1;
+ while ((dbc = TAILQ_FIRST(&dbp->active_queue)) != NULL) {
+ if (dbc->txn != NULL)
+ TAILQ_REMOVE(&(dbc->txn->my_cursors), dbc, txn_cursors);
+
+ if ((t_ret = __dbc_close(dbc)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ break;
+ }
+ }
+
+ while ((dbc = TAILQ_FIRST(&dbp->free_queue)) != NULL)
+ if ((t_ret = __dbc_destroy(dbc)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ break;
+ }
+
+ /*
+ * Close any outstanding join cursors. Join cursors destroy themselves
+ * on close and have no separate destroy routine. We don't have to set
+ * the resync flag here, because join cursors aren't write cursors.
+ */
+ while ((dbc = TAILQ_FIRST(&dbp->join_queue)) != NULL)
+ if ((t_ret = __db_join_close(dbc)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ break;
+ }
+
+ /*
+ * Sync the memory pool, even though we've already called DB->sync,
+ * because closing cursors can dirty pages by deleting items they
+ * referenced.
+ *
+ * Sync is slow on some systems, notably Solaris filesystems where the
+ * entire buffer cache is searched. If we're in recovery, don't flush
+ * the file, it's not necessary.
+ */
+ if (resync && !LF_ISSET(DB_NOSYNC) &&
+ !F_ISSET(dbp, DB_AM_DISCARD | DB_AM_RECOVER) &&
+ (t_ret = __memp_fsync(dbp->mpf)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /*
+ * If there is a file extension watermark associated with this
+ * database, we don't need it any more.
+ */
+ __txn_remove_fe_watermark(txn, dbp);
+
+never_opened:
+ MUTEX_LOCK(env, env->mtx_dblist);
+ /*
+ * At this point, we haven't done anything to render the DB handle
+ * unusable, at least by a transaction abort. Take the opportunity
+ * now to log the file close if we have initialized the logging
+ * information. If this log fails and we're in a transaction,
+ * we have to bail out of the attempted close; we'll need a dbp in
+ * order to successfully abort the transaction, and we can't conjure
+ * a new one up because we haven't gotten out the dbreg_register
+ * record that represents the close. In this case, we put off
+ * actually closing the dbp until we've performed the abort.
+ */
+ if (!reuse && LOGGING_ON(dbp->env) && dbp->log_filename != NULL) {
+ /*
+ * Discard the log file id, if any. We want to log the close
+ * if and only if this is not a recovery dbp, a client dbp,
+ * or a dead dbp handle.
+ */
+ DB_ASSERT(env, renv != NULL);
+ if (F_ISSET(dbp, DB_AM_RECOVER) || IS_REP_CLIENT(env) ||
+ dbp->timestamp != renv->rep_timestamp) {
+ if ((t_ret = __dbreg_revoke_id(dbp,
+ 0, DB_LOGFILEID_INVALID)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __dbreg_teardown(dbp)) != 0 && ret == 0)
+ ret = t_ret;
+ } else {
+ if ((t_ret = __dbreg_close_id(dbp,
+ txn, DBREG_CLOSE)) != 0 && txn != NULL) {
+ MUTEX_UNLOCK(env, env->mtx_dblist);
+ /*
+ * We're in a txn and the attempt to log the
+ * close failed; let the txn subsystem know
+ * that we need to destroy this dbp once we're
+ * done with the abort, then bail from the
+ * close.
+ *
+ * Note that if the attempt to put off the
+ * close -also- fails--which it won't unless
+ * we're out of heap memory--we're really
+ * screwed. Panic.
+ */
+ if ((ret =
+ __txn_closeevent(env, txn, dbp)) != 0)
+ return (__env_panic(env, ret));
+ if (deferred_closep != NULL)
+ *deferred_closep = 1;
+ return (t_ret);
+ }
+ /*
+ * If dbreg_close_id failed and we were not in a
+ * transaction, then we need to finish this close
+ * because the caller can't do anything with the
+ * handle after we return an error. We rely on
+ * dbreg_close_id to mark the entry in some manner
+ * so that we do not do a clean shutdown of this
+ * environment. If shutdown isn't clean, then the
+ * application *must* run recovery and that will
+ * generate the RCLOSE record.
+ */
+ }
+
+ }
+
+ /* Close any handle we've been holding since the open. */
+ if (dbp->saved_open_fhp != NULL &&
+ (t_ret = __os_closehandle(env, dbp->saved_open_fhp)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+
+ /*
+ * Remove this DB handle from the ENV's dblist, if it's been added.
+ *
+ * Close our reference to the underlying cache while locked, we don't
+ * want to race with a thread searching for our underlying cache link
+ * while opening a DB handle.
+ *
+ * The DB handle may not yet have been added to the ENV list, don't
+ * blindly call the underlying TAILQ_REMOVE macro. Explicitly reset
+ * the field values to NULL so that we can't call TAILQ_REMOVE twice.
+ */
+ if (!reuse &&
+ (dbp->dblistlinks.tqe_next != NULL ||
+ dbp->dblistlinks.tqe_prev != NULL)) {
+ TAILQ_REMOVE(&env->dblist, dbp, dblistlinks);
+ dbp->dblistlinks.tqe_next = NULL;
+ dbp->dblistlinks.tqe_prev = NULL;
+ }
+
+ /* Close the memory pool file handle. */
+ if (dbp->mpf != NULL) {
+ if ((t_ret = __memp_fclose(dbp->mpf,
+ F_ISSET(dbp, DB_AM_DISCARD) ? DB_MPOOL_DISCARD : 0)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ dbp->mpf = NULL;
+ if (reuse &&
+ (t_ret = __memp_fcreate(env, &dbp->mpf)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ }
+
+ MUTEX_UNLOCK(env, env->mtx_dblist);
+
+ /*
+ * Call the access specific close function.
+ *
+ * We do this here rather than in __db_close as we need to do this when
+ * aborting an open so that file descriptors are closed and abort of
+ * renames can succeed on platforms that lock open files (such as
+ * Windows). In particular, we need to ensure that all the extents
+ * associated with a queue are closed so that queue renames can be
+ * aborted.
+ *
+ * It is also important that we do this before releasing the handle
+ * lock, because dbremove and dbrename assume that once they have the
+ * handle lock, it is safe to modify the underlying file(s).
+ *
+ * !!!
+ * Because of where these functions are called in the DB handle close
+ * process, these routines can't do anything that would dirty pages or
+ * otherwise affect closing down the database. Specifically, we can't
+ * abort and recover any of the information they control.
+ */
+#ifdef HAVE_PARTITION
+ if (dbp->p_internal != NULL &&
+ (t_ret = __partition_close(dbp, txn, flags)) != 0 && ret == 0)
+ ret = t_ret;
+#endif
+ if ((t_ret = __bam_db_close(dbp)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __ham_db_close(dbp)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __heap_db_close(dbp)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __qam_db_close(dbp, dbp->flags)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /*
+ * !!!
+ * At this point, the access-method specific information has been
+ * freed. From now on, we can use the dbp, but not touch any
+ * access-method specific data.
+ */
+
+ if (!reuse && dbp->locker != NULL) {
+ /* We may have pending trade operations on this dbp. */
+ if (txn == NULL)
+ txn = dbp->cur_txn;
+ if (IS_REAL_TXN(txn))
+ __txn_remlock(env,
+ txn, &dbp->handle_lock, dbp->locker);
+
+ /* We may be holding the handle lock; release it. */
+ lreq.op = DB_LOCK_PUT_ALL;
+ lreq.obj = NULL;
+ if ((t_ret = __lock_vec(env,
+ dbp->locker, 0, &lreq, 1, NULL)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if ((t_ret =
+ __lock_id_free(env, dbp->locker)) != 0 && ret == 0)
+ ret = t_ret;
+ dbp->locker = NULL;
+ LOCK_INIT(dbp->handle_lock);
+ }
+
+ /*
+ * If this is a temporary file (un-named in-memory file), then
+ * discard the locker ID allocated as the fileid.
+ */
+ if (LOCKING_ON(env) &&
+ F_ISSET(dbp, DB_AM_INMEM) && !dbp->preserve_fid &&
+ *(u_int32_t *)dbp->fileid != DB_LOCK_INVALIDID) {
+ if ((t_ret = __lock_getlocker(env->lk_handle,
+ *(u_int32_t *)dbp->fileid, 0, &locker)) == 0)
+ t_ret = __lock_id_free(env, locker);
+ if (ret == 0)
+ ret = t_ret;
+ }
+
+ if (reuse) {
+ /*
+ * If we are reusing this dbp, then we're done now. Re-init
+ * the handle, preserving important flags, and then return.
+ * This code is borrowed from __db_init, which does more
+ * than we can do here.
+ */
+ save_flags = F_ISSET(dbp, DB_AM_INMEM |
+ DB_AM_RDONLY | DB_AM_TXN);
+
+ if ((ret = __bam_db_create(dbp)) != 0)
+ return (ret);
+ if ((ret = __ham_db_create(dbp)) != 0)
+ return (ret);
+ if ((ret = __heap_db_create(dbp)) != 0)
+ return (ret);
+ if ((ret = __qam_db_create(dbp)) != 0)
+ return (ret);
+
+ /* Restore flags */
+ dbp->flags = dbp->orig_flags | save_flags;
+
+ if (FLD_ISSET(save_flags, DB_AM_INMEM)) {
+ /*
+ * If this is inmem, then it may have a fileid
+ * even if it was never opened, and we need to
+ * clear out that fileid.
+ */
+ memset(dbp->fileid, 0, sizeof(dbp->fileid));
+ MAKE_INMEM(dbp);
+ }
+ return (ret);
+ }
+
+ dbp->type = DB_UNKNOWN;
+
+ /*
+ * The thread mutex may have been invalidated in __dbreg_close_id if
+ * the fname refcount did not go to 0.  If it was not invalidated,
+ * discard it now.
+ */
+ if ((t_ret = __mutex_free(env, &dbp->mutex)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Discard any memory allocated for the file and database names. */
+ if (dbp->fname != NULL) {
+ __os_free(dbp->env, dbp->fname);
+ dbp->fname = NULL;
+ }
+ if (dbp->dname != NULL) {
+ __os_free(dbp->env, dbp->dname);
+ dbp->dname = NULL;
+ }
+
+ /* Discard any memory used to store returned data. */
+ if (dbp->my_rskey.data != NULL)
+ __os_free(dbp->env, dbp->my_rskey.data);
+ if (dbp->my_rkey.data != NULL)
+ __os_free(dbp->env, dbp->my_rkey.data);
+ if (dbp->my_rdata.data != NULL)
+ __os_free(dbp->env, dbp->my_rdata.data);
+
+ /* For safety's sake; we may refresh twice. */
+ memset(&dbp->my_rskey, 0, sizeof(DBT));
+ memset(&dbp->my_rkey, 0, sizeof(DBT));
+ memset(&dbp->my_rdata, 0, sizeof(DBT));
+
+ /* Clear out fields that normally get set during open. */
+ memset(dbp->fileid, 0, sizeof(dbp->fileid));
+ dbp->adj_fileid = 0;
+ dbp->meta_pgno = 0;
+ dbp->cur_locker = NULL;
+ dbp->cur_txn = NULL;
+ dbp->associate_locker = NULL;
+ dbp->open_flags = 0;
+
+ /*
+ * If we are being refreshed with a txn specified, then we need
+ * to make sure that we clear out the lock handle field, because
+ * releasing all the locks for this transaction will release this
+ * lock and we don't want close to stumble upon this handle and
+ * try to close it.
+ */
+ if (txn != NULL)
+ LOCK_INIT(dbp->handle_lock);
+
+ /* Reset flags to whatever the user configured. */
+ dbp->flags = dbp->orig_flags;
+
+ return (ret);
+}
+
+/*
+ * __db_disassociate --
+ * Destroy the association between a given secondary and its primary.
+ */
+static int
+__db_disassociate(sdbp)
+ DB *sdbp;
+{
+ DBC *dbc;
+ int ret, t_ret;
+
+ ret = 0;
+
+ sdbp->s_callback = NULL;
+ sdbp->s_primary = NULL;
+ sdbp->get = sdbp->stored_get;
+ sdbp->close = sdbp->stored_close;
+
+ /*
+ * Complain, but proceed, if we have any active cursors. (We're in
+ * the middle of a close, so there's really no turning back.)
+ */
+ if (sdbp->s_refcnt != 1 ||
+ TAILQ_FIRST(&sdbp->active_queue) != NULL ||
+ TAILQ_FIRST(&sdbp->join_queue) != NULL) {
+ __db_errx(sdbp->env, DB_STR("0674",
+"Closing a primary DB while a secondary DB has active cursors is unsafe"));
+ ret = EINVAL;
+ }
+ sdbp->s_refcnt = 0;
+
+ while ((dbc = TAILQ_FIRST(&sdbp->free_queue)) != NULL)
+ if ((t_ret = __dbc_destroy(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ F_CLR(sdbp, DB_AM_SECONDARY);
+ return (ret);
+}
+
+/*
+ * __db_disassociate_foreign --
+ * Destroy the association between a given secondary and its foreign.
+ */
+static int
+__db_disassociate_foreign(sdbp)
+ DB *sdbp;
+{
+ DB *fdbp;
+ DB_FOREIGN_INFO *f_info, *tmp;
+ int ret;
+
+ if (sdbp->s_foreign == NULL)
+ return (0);
+
+ fdbp = sdbp->s_foreign;
+ ret = 0;
+ f_info = LIST_FIRST(&fdbp->f_primaries);
+ while (f_info != NULL) {
+ tmp = LIST_NEXT(f_info, f_links);
+ if (f_info->dbp == sdbp) {
+ LIST_REMOVE(f_info, f_links);
+ __os_free(sdbp->env, f_info);
+ }
+ f_info = tmp;
+ }
+
+ return (ret);
+}
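
This routine undoes the linkage created by `DB->associate_foreign`. A hedged sketch of the forward direction, which populates the f_primaries list walked above; the callback shape and DB_FOREIGN_NULLIFY flag follow the public API, but verify against db.h:

```c
#include <db.h>

/*
 * Nullify callback: invoked when a foreign key is deleted so matching
 * secondary records can be rewritten in place. Body is a placeholder.
 */
static int
nullify_cb(DB *sdbp, const DBT *key, DBT *data, const DBT *fkey, int *changedp)
{
	/* Blank the foreign-key portion of `data' and set *changedp = 1. */
	*changedp = 0;
	return (0);
}

int
hook_foreign(DB *fdbp, DB *sdbp)
{
	/* Adds an f_primaries entry in fdbp pointing back at sdbp. */
	return (fdbp->associate_foreign(fdbp, sdbp,
	    nullify_cb, DB_FOREIGN_NULLIFY));
}
```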
+
+/*
+ * __db_log_page --
+ * Log a meta-data or root page during a subdatabase create operation.
+ *
+ * PUBLIC: int __db_log_page __P((DB *, DB_TXN *, DB_LSN *, db_pgno_t, PAGE *));
+ */
+int
+__db_log_page(dbp, txn, lsn, pgno, page)
+ DB *dbp;
+ DB_TXN *txn;
+ DB_LSN *lsn;
+ db_pgno_t pgno;
+ PAGE *page;
+{
+ DBT page_dbt;
+ DB_LSN new_lsn;
+ int ret;
+
+ if (!LOGGING_ON(dbp->env) || txn == NULL)
+ return (0);
+
+ memset(&page_dbt, 0, sizeof(page_dbt));
+ page_dbt.size = dbp->pgsize;
+ page_dbt.data = page;
+
+ ret = __crdel_metasub_log(dbp, txn, &new_lsn, F_ISSET(dbp,
+ DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0, pgno, &page_dbt, lsn);
+
+ if (ret == 0)
+ page->lsn = new_lsn;
+ return (ret);
+}
+
+/*
+ * __db_walk_cursors --
+ * Walk all cursors for a database.
+ *
+ * PUBLIC: int __db_walk_cursors __P((DB *, DBC *,
+ * PUBLIC: int (*) __P((DBC *, DBC *,
+ * PUBLIC: u_int32_t *, db_pgno_t, u_int32_t, void *)),
+ * PUBLIC: u_int32_t *, db_pgno_t, u_int32_t, void *));
+ */
+int
+__db_walk_cursors(dbp, my_dbc, func, countp, pgno, indx, args)
+ DB *dbp;
+ DBC *my_dbc;
+ int (*func)__P((DBC *, DBC *,
+ u_int32_t *, db_pgno_t, u_int32_t, void *));
+ u_int32_t *countp;
+ db_pgno_t pgno;
+ u_int32_t indx;
+ void *args;
+{
+ ENV *env;
+ DB *ldbp;
+ DBC *dbc;
+ int ret;
+
+ env = dbp->env;
+ ret = 0;
+
+ MUTEX_LOCK(env, env->mtx_dblist);
+ FIND_FIRST_DB_MATCH(env, dbp, ldbp);
+ for (*countp = 0;
+ ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid;
+ ldbp = TAILQ_NEXT(ldbp, dblistlinks)) {
+loop: MUTEX_LOCK(env, ldbp->mutex);
+ TAILQ_FOREACH(dbc, &ldbp->active_queue, links)
+ if ((ret = (func)(dbc, my_dbc,
+ countp, pgno, indx, args)) != 0)
+ break;
+ /*
+ * We use the error to communicate that function
+ * dropped the mutex.
+ */
+ if (ret == DB_LOCK_NOTGRANTED)
+ goto loop;
+ MUTEX_UNLOCK(env, ldbp->mutex);
+ if (ret != 0)
+ break;
+ }
+ MUTEX_UNLOCK(env, env->mtx_dblist);
+ return (ret);
+}
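
Callbacks passed to `__db_walk_cursors` must match the declared signature, and returning DB_LOCK_NOTGRANTED is reserved to mean "the handle mutex was dropped, rescan this handle". A hypothetical callback, for shape only:

```c
/* Count cursors (other than the caller's own) positioned on page `pgno'. */
static int
count_on_page(dbc, my_dbc, countp, pgno, indx, args)
	DBC *dbc, *my_dbc;
	u_int32_t *countp;
	db_pgno_t pgno;
	u_int32_t indx;
	void *args;
{
	COMPQUIET(indx, 0);
	COMPQUIET(args, NULL);

	if (dbc != my_dbc && dbc->internal->pgno == pgno)
		++*countp;
	return (0);
}
```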
+
+/*
+ * __db_backup_name --
+ * Create the backup file name for a given file.
+ *
+ * PUBLIC: int __db_backup_name __P((ENV *,
+ * PUBLIC: const char *, DB_TXN *, char **));
+ */
+#undef BACKUP_PREFIX
+#define BACKUP_PREFIX "__db."
+
+#undef MAX_INT_TO_HEX
+#define MAX_INT_TO_HEX 8
+
+int
+__db_backup_name(env, name, txn, backup)
+ ENV *env;
+ const char *name;
+ DB_TXN *txn;
+ char **backup;
+{
+ u_int32_t id;
+ size_t len;
+ int ret;
+ char *p, *retp;
+
+ *backup = NULL;
+
+ /*
+ * Part of the name may be a full path, so we need to make sure that
+ * we allocate enough space for it, even in the case where we don't
+ * use the entire filename for the backup name.
+ */
+ len = strlen(name) + strlen(BACKUP_PREFIX) + 2 * MAX_INT_TO_HEX + 1;
+ if ((ret = __os_malloc(env, len, &retp)) != 0)
+ return (ret);
+
+ /*
+ * Create the name. Backup file names are in one of 2 forms: in a
+ * transactional env "__db.TXNID.ID", where ID is a random number,
+ * and in any other env "__db.FILENAME".
+ *
+ * In addition, the name passed may contain an env-relative path.
+ * In that case, put the "__db." in the right place (in the last
+ * component of the pathname).
+ *
+ * There are four cases here:
+ * 1. simple path w/out transaction
+ * 2. simple path + transaction
+ * 3. multi-component path w/out transaction
+ * 4. multi-component path + transaction
+ */
+ p = __db_rpath(name);
+ if (IS_REAL_TXN(txn)) {
+ __os_unique_id(env, &id);
+ if (p == NULL) /* Case 2. */
+ snprintf(retp, len, "%s%x.%x",
+ BACKUP_PREFIX, txn->txnid, id);
+ else /* Case 4. */
+ snprintf(retp, len, "%.*s%x.%x",
+ (int)(p - name) + 1, name, txn->txnid, id);
+ } else {
+ if (p == NULL) /* Case 1. */
+ snprintf(retp, len, "%s%s", BACKUP_PREFIX, name);
+ else /* Case 3. */
+ snprintf(retp, len, "%.*s%s%s",
+ (int)(p - name) + 1, name, BACKUP_PREFIX, p + 1);
+ }
+
+ *backup = retp;
+ return (0);
+}
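
Concretely, the four cases expand as follows (hex values hypothetical). Note that, as written, case 4 omits BACKUP_PREFIX entirely and relies on the txnid/unique-id pair alone for uniqueness:

```c
__db_backup_name(env, "file.db", NULL, &bp);	/* "__db.file.db"	 (1) */
__db_backup_name(env, "file.db", txn, &bp);	/* "__db.80000001.3a7f"	 (2) */
__db_backup_name(env, "a/file.db", NULL, &bp);	/* "a/__db.file.db"	 (3) */
__db_backup_name(env, "a/file.db", txn, &bp);	/* "a/80000001.3a7f"	 (4) */
```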
+
+#ifdef CONFIG_TEST
+/*
+ * __db_testcopy --
+ * Create a copy of all backup files and our "main" DB.
+ *
+ * PUBLIC: #ifdef CONFIG_TEST
+ * PUBLIC: int __db_testcopy __P((ENV *, DB *, const char *));
+ * PUBLIC: #endif
+ */
+int
+__db_testcopy(env, dbp, name)
+ ENV *env;
+ DB *dbp;
+ const char *name;
+{
+ DB_MPOOL *dbmp;
+ DB_MPOOLFILE *mpf;
+
+ DB_ASSERT(env, dbp != NULL || name != NULL);
+
+ if (name == NULL) {
+ dbmp = env->mp_handle;
+ mpf = dbp->mpf;
+ name = R_ADDR(dbmp->reginfo, mpf->mfp->path_off);
+ }
+
+ if (dbp != NULL && dbp->type == DB_QUEUE)
+ return (__qam_testdocopy(dbp, name));
+ else
+#ifdef HAVE_PARTITION
+ if (dbp != NULL && DB_IS_PARTITIONED(dbp))
+ return (__part_testdocopy(dbp, name));
+ else
+#endif
+ return (__db_testdocopy(env, name));
+}
+
+static int
+__qam_testdocopy(dbp, name)
+ DB *dbp;
+ const char *name;
+{
+ DB_THREAD_INFO *ip;
+ QUEUE_FILELIST *filelist, *fp;
+ int ret;
+ char buf[DB_MAXPATHLEN], *dir;
+
+ filelist = NULL;
+ if ((ret = __db_testdocopy(dbp->env, name)) != 0)
+ return (ret);
+
+ /* Call ENV_GET_THREAD_INFO to get a valid DB_THREAD_INFO */
+ ENV_GET_THREAD_INFO(dbp->env, ip);
+ if (dbp->mpf != NULL &&
+ (ret = __qam_gen_filelist(dbp, ip, &filelist)) != 0)
+ goto done;
+
+ if (filelist == NULL)
+ return (0);
+ dir = ((QUEUE *)dbp->q_internal)->dir;
+ for (fp = filelist; fp->mpf != NULL; fp++) {
+ snprintf(buf, sizeof(buf),
+ QUEUE_EXTENT, dir, PATH_SEPARATOR[0], name, fp->id);
+ if ((ret = __db_testdocopy(dbp->env, buf)) != 0)
+ break;
+ }
+
+done: __os_free(dbp->env, filelist);
+ return (ret);
+}
+
+/*
+ * __db_testdocopy --
+ * Create a copy of all backup files and our "main" DB.
+ * PUBLIC: int __db_testdocopy __P((ENV *, const char *));
+ */
+int
+__db_testdocopy(env, name)
+ ENV *env;
+ const char *name;
+{
+ size_t len;
+ int dircnt, i, ret;
+ char *copy, **namesp, *p, *real_name;
+
+ dircnt = 0;
+ copy = NULL;
+ namesp = NULL;
+
+ /* Create the real backing file name. */
+ if ((ret = __db_appname(env,
+ DB_APP_DATA, name, NULL, &real_name)) != 0)
+ return (ret);
+
+ /*
+ * !!!
+ * There are tests that attempt to copy non-existent files. I'd guess
+ * it's a testing bug, but I don't have time to figure it out. Block
+ * the case here.
+ */
+ if (__os_exists(env, real_name, NULL) != 0) {
+ __os_free(env, real_name);
+ return (0);
+ }
+
+ /*
+ * Copy the file itself.
+ *
+ * Allocate space for the file name, including adding an ".afterop" and
+ * trailing nul byte.
+ */
+ len = strlen(real_name) + sizeof(".afterop");
+ if ((ret = __os_malloc(env, len, &copy)) != 0)
+ goto err;
+ snprintf(copy, len, "%s.afterop", real_name);
+ if ((ret = __db_makecopy(env, real_name, copy)) != 0)
+ goto err;
+
+ /*
+ * Get the directory path to call __os_dirlist().
+ */
+ if ((p = __db_rpath(real_name)) != NULL)
+ *p = '\0';
+ if ((ret = __os_dirlist(env, real_name, 0, &namesp, &dircnt)) != 0)
+ goto err;
+
+ /*
+ * Walk the directory looking for backup files. Backup file names in
+ * transactional environments are of the form:
+ *
+ * BACKUP_PREFIX.TXNID.ID
+ */
+ for (i = 0; i < dircnt; i++) {
+ /* Check for a related backup file name. */
+ if (strncmp(
+ namesp[i], BACKUP_PREFIX, sizeof(BACKUP_PREFIX) - 1) != 0)
+ continue;
+ p = namesp[i] + sizeof(BACKUP_PREFIX) - 1;
+ p += strspn(p, "0123456789ABCDEFabcdef");
+ if (*p != '.')
+ continue;
+ ++p;
+ p += strspn(p, "0123456789ABCDEFabcdef");
+ if (*p != '\0')
+ continue;
+
+ /*
+ * Copy the backup file.
+ *
+ * Allocate space for the file name, including adding a
+ * ".afterop" and trailing nul byte.
+ */
+ if (real_name != NULL) {
+ __os_free(env, real_name);
+ real_name = NULL;
+ }
+ if ((ret = __db_appname(env,
+ DB_APP_DATA, namesp[i], NULL, &real_name)) != 0)
+ goto err;
+ if (copy != NULL) {
+ __os_free(env, copy);
+ copy = NULL;
+ }
+ len = strlen(real_name) + sizeof(".afterop");
+ if ((ret = __os_malloc(env, len, &copy)) != 0)
+ goto err;
+ snprintf(copy, len, "%s.afterop", real_name);
+ if ((ret = __db_makecopy(env, real_name, copy)) != 0)
+ goto err;
+ }
+
+err: if (namesp != NULL)
+ __os_dirfree(env, namesp, dircnt);
+ if (copy != NULL)
+ __os_free(env, copy);
+ if (real_name != NULL)
+ __os_free(env, real_name);
+ return (ret);
+}
+
+static int
+__db_makecopy(env, src, dest)
+ ENV *env;
+ const char *src, *dest;
+{
+ DB_FH *rfhp, *wfhp;
+ size_t rcnt, wcnt;
+ int ret;
+ char *buf;
+
+ rfhp = wfhp = NULL;
+
+ if ((ret = __os_malloc(env, 64 * 1024, &buf)) != 0)
+ goto err;
+
+ if ((ret = __os_open(env, src, 0,
+ DB_OSO_RDONLY, DB_MODE_600, &rfhp)) != 0)
+ goto err;
+ if ((ret = __os_open(env, dest, 0,
+ DB_OSO_CREATE | DB_OSO_TRUNC, DB_MODE_600, &wfhp)) != 0)
+ goto err;
+
+ for (;;) {
+ /*
+ * Note: buf is a char pointer, so sizeof(buf) would transfer
+ * only 4 or 8 bytes per pass; use the allocated buffer size
+ * on reads and write back only the bytes actually read.
+ */
+ if ((ret =
+ __os_read(env, rfhp, buf, 64 * 1024, &rcnt)) != 0)
+ goto err;
+ if (rcnt == 0)
+ break;
+ if ((ret =
+ __os_write(env, wfhp, buf, rcnt, &wcnt)) != 0)
+ goto err;
+ }
+
+ if (0) {
+err: __db_err(env, ret, "__db_makecopy: %s -> %s", src, dest);
+ }
+
+ if (buf != NULL)
+ __os_free(env, buf);
+ if (rfhp != NULL)
+ (void)__os_closehandle(env, rfhp);
+ if (wfhp != NULL)
+ (void)__os_closehandle(env, wfhp);
+ return (ret);
+}
+#endif
diff --git a/src/db/db.src b/src/db/db.src
new file mode 100644
index 00000000..879c7856
--- /dev/null
+++ b/src/db/db.src
@@ -0,0 +1,431 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+DBPRIVATE
+PREFIX __db
+
+INCLUDE #include "db_int.h"
+INCLUDE #include "dbinc/crypto.h"
+INCLUDE #include "dbinc/db_page.h"
+INCLUDE #include "dbinc/db_dispatch.h"
+INCLUDE #include "dbinc/db_am.h"
+INCLUDE #include "dbinc/txn.h"
+INCLUDE
+
+/*
+ * addrem -- Add or remove an entry from a duplicate page.
+ *
+ * opcode: identifies if this is an add or delete.
+ * fileid: file identifier of the file being modified.
+ * pgno: duplicate page number.
+ * indx: location at which to insert or delete.
+ * nbytes: number of bytes added/removed to/from the page.
+ * hdr: header for the data item.
+ * dbt: data that is deleted or is to be added.
+ * pagelsn: former lsn of the page.
+ *
+ * If the hdr is NULL, then the dbt is a regular B_KEYDATA.
+ * If the dbt is NULL, then the hdr is a complete item to be
+ * pasted on the page.
+ */
+BEGIN addrem 50 41
+OP opcode u_int32_t lu
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+ARG indx u_int32_t lu
+ARG nbytes u_int32_t lu
+HDR hdr DBT s
+DBT dbt DBT s
+POINTER pagelsn DB_LSN * lu
+END
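
Each BEGIN/END description in this file is input to the log-record generator (dist/gen_rec.awk), which emits the matching _log, _print, and _read functions into db_auto.c; by the convention used here, the two numbers after a record name are the first release that writes the format (50 = 5.0, 42 = 4.2) and the record's type id. As an assumption-laden sketch, the addrem block above yields a logging function with roughly this prototype, one argument per field in order; check src/db/db_auto.c for the generated source:

```c
/* Sketch of the generated prototype; not copied from db_auto.c. */
int __db_addrem_log __P((DB *, DB_TXN *, DB_LSN *, u_int32_t /* flags */,
    u_int32_t /* opcode */, db_pgno_t /* pgno */, u_int32_t /* indx */,
    u_int32_t /* nbytes */, const DBT * /* hdr */, const DBT * /* dbt */,
    DB_LSN * /* pagelsn */));
```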
+
+BEGIN_COMPAT addrem 42 41
+ARG opcode u_int32_t lu
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+ARG indx u_int32_t lu
+ARG nbytes u_int32_t lu
+DBT hdr DBT s
+DBT dbt DBT s
+POINTER pagelsn DB_LSN * lu
+END
+
+/*
+ * big -- Handles addition and deletion of big key/data items.
+ *
+ * opcode: identifies get/put.
+ * fileid: file identifier of the file being modified.
+ * pgno: page onto which data is being added/removed.
+ * prev_pgno: the page before the one we are logging.
+ * next_pgno: the page after the one we are logging.
+ * dbt: data being written onto the page.
+ * pagelsn: former lsn of the orig_page.
+ * prevlsn: former lsn of the prev_pgno.
+ * nextlsn: former lsn of the next_pgno. This is not currently used, but
+ * may be used later if we actually do overwrites of big key/
+ * data items in place.
+ */
+BEGIN big 50 43
+OP opcode u_int32_t lu
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+ARG prev_pgno db_pgno_t lu
+ARG next_pgno db_pgno_t lu
+HDR dbt DBT s
+POINTER pagelsn DB_LSN * lu
+POINTER prevlsn DB_LSN * lu
+POINTER nextlsn DB_LSN * lu
+END
+
+BEGIN_COMPAT big 42 43
+ARG opcode u_int32_t lu
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+ARG prev_pgno db_pgno_t lu
+ARG next_pgno db_pgno_t lu
+DBT dbt DBT s
+POINTER pagelsn DB_LSN * lu
+POINTER prevlsn DB_LSN * lu
+POINTER nextlsn DB_LSN * lu
+END
+
+/*
+ * ovref -- Handles increment/decrement of overflow page reference count.
+ *
+ * fileid: identifies the file being modified.
+ * pgno: page number whose ref count is being incremented/decremented.
+ * adjust: the adjustment being made.
+ * lsn: the page's original lsn.
+ */
+BEGIN ovref 42 44
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+ARG adjust int32_t ld
+POINTER lsn DB_LSN * lu
+END
+
+/*
+ * relink -- Handles relinking around a page.
+ *
+ * opcode: indicates if this is an add-page or a delete-page operation.
+ * pgno: the page being changed.
+ * lsn: the page's original lsn.
+ * prev: the previous page.
+ * lsn_prev: the previous page's original lsn.
+ * next: the next page.
+ * lsn_next: the next page's original lsn.
+ */
+BEGIN_COMPAT relink 42 45
+ARG opcode u_int32_t lu
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER lsn DB_LSN * lu
+ARG prev db_pgno_t lu
+POINTER lsn_prev DB_LSN * lu
+ARG next db_pgno_t lu
+POINTER lsn_next DB_LSN * lu
+END
+
+/*
+ * Debug -- log an operation upon entering an access method.
+ * op: Operation (cursor, c_close, c_get, c_put, c_del,
+ * get, put, delete).
+ * fileid: identifies the file being acted upon.
+ * key: key parameter
+ * data: data parameter
+ * flags: flags parameter
+ */
+BEGIN debug 42 47
+DBT op DBT s
+ARG fileid int32_t ld
+DBT key DBT s
+DBT data DBT s
+ARG arg_flags u_int32_t lu
+END
+
+/*
+ * noop -- do nothing, but get an LSN.
+ */
+BEGIN noop 42 48
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER prevlsn DB_LSN * lu
+END
+
+/*
+ * pg_alloc: used to record allocating a new page.
+ *
+ * meta_lsn: the original lsn of the page referenced by meta_pgno.
+ * meta_pgno: the page pointing at the allocated page in the free list.
+ * If the list is unsorted this is the metadata page.
+ * page_lsn: the allocated page's original lsn.
+ * pgno: the page allocated.
+ * ptype: the type of the page allocated.
+ * next: the next page on the free list.
+ * last_pgno: the last page in the file after this op (4.3+).
+ */
+BEGIN_COMPAT pg_alloc 42 49
+DB fileid int32_t ld
+POINTER meta_lsn DB_LSN * lu
+ARG meta_pgno db_pgno_t lu
+POINTER page_lsn DB_LSN * lu
+ARG pgno db_pgno_t lu
+ARG ptype u_int32_t lu
+ARG next db_pgno_t lu
+END
+
+BEGIN pg_alloc 43 49
+DB fileid int32_t ld
+POINTER meta_lsn DB_LSN * lu
+ARG meta_pgno db_pgno_t lu
+POINTER page_lsn DB_LSN * lu
+ARG pgno db_pgno_t lu
+ARG ptype u_int32_t lu
+ARG next db_pgno_t lu
+ARG last_pgno db_pgno_t lu
+END
+
+/*
+ * pg_free: used to record freeing a page.
+ * If we are maintaining a sorted free list (during compact) meta_pgno
+ * will be non-zero and refer to the page that precedes the one we are freeing
+ * in the free list. Meta_lsn will then be the lsn of that page.
+ *
+ * pgno: the page being freed.
+ * meta_lsn: the meta-data page's original lsn.
+ * meta_pgno: the meta-data page number.
+ * header: the header from the free'd page.
+ * next: the previous next pointer on the metadata page.
+ * last_pgno: the last page in the file before this op (4.3+).
+ */
+BEGIN_COMPAT pg_free 42 50
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER meta_lsn DB_LSN * lu
+ARG meta_pgno db_pgno_t lu
+PGDBT header DBT s
+ARG next db_pgno_t lu
+END
+
+BEGIN pg_free 43 50
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER meta_lsn DB_LSN * lu
+ARG meta_pgno db_pgno_t lu
+PGDBT header DBT s
+ARG next db_pgno_t lu
+ARG last_pgno db_pgno_t lu
+END
+
+/*
+ * cksum --
+ * This log record is written when we're unable to checksum a page,
+ * before returning DB_RUNRECOVERY. This log record causes normal
+ * recovery to itself return DB_RUNRECOVERY, as only catastrophic
+ * recovery can fix things.
+ */
+BEGIN cksum 42 51
+END
+
+/*
+ * pg_freedata: used to record freeing a page with data on it.
+ *
+ * pgno: the page being freed.
+ * meta_lsn: the meta-data page's original lsn.
+ * meta_pgno: the meta-data page number.
+ * header: the header and index entries from the free'd page.
+ * data: the data from the free'd page.
+ * next: the previous next pointer on the metadata page.
+ * last_pgno: the last page in the file before this op (4.3+).
+ */
+BEGIN_COMPAT pg_freedata 42 52
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER meta_lsn DB_LSN * lu
+ARG meta_pgno db_pgno_t lu
+PGDBT header DBT s
+ARG next db_pgno_t lu
+PGDDBT data DBT s
+END
+
+BEGIN pg_freedata 43 52
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER meta_lsn DB_LSN * lu
+ARG meta_pgno db_pgno_t lu
+PGDBT header DBT s
+ARG next db_pgno_t lu
+ARG last_pgno db_pgno_t lu
+PGDDBT data DBT s
+END
+
+/*
+ * pg_prepare: used to record an aborted page in a prepared transaction.
+ *
+ * pgno: the page being freed.
+ */
+X BEGIN pg_prepare 42 53
+X DB fileid int32_t ld
+X ARG pgno db_pgno_t lu
+X END
+
+/*
+ * pg_new: used to record a new page put on the free list.
+ *
+ * pgno: the page being freed.
+ * meta_lsn: the meta-data page's original lsn.
+ * meta_pgno: the meta-data page number.
+ * header: the header from the free'd page.
+ * next: the previous next pointer on the metadata page.
+ */
+X BEGIN pg_new 42 54
+X DB fileid int32_t ld
+X ARG pgno db_pgno_t lu
+X POINTER meta_lsn DB_LSN * lu
+X ARG meta_pgno db_pgno_t lu
+X PGDBT header DBT s
+X ARG next db_pgno_t lu
+X END
+
+/*
+ * pg_init: used to reinitialize a page during truncate.
+ *
+ * pgno: the page being initialized.
+ * header: the header from the page.
+ * data: data that used to be on the page.
+ */
+BEGIN pg_init 43 60
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+PGDBT header DBT s
+PGDDBT data DBT s
+END
+
+/*
+ * pg_sort: sort the free list
+ *
+ * meta: meta page number
+ * meta_lsn: lsn on meta page.
+ * last_free: page number of new last free page.
+ * last_lsn: lsn of the last free page.
+ * last_pgno: current last page number.
+ * list: list of pages and lsns to sort.
+ */
+BEGIN_COMPAT pg_sort 44 61
+DB fileid int32_t ld
+ARG meta db_pgno_t lu
+POINTER meta_lsn DB_LSN * lu
+ARG last_free db_pgno_t lu
+POINTER last_lsn DB_LSN * lu
+ARG last_pgno db_pgno_t lu
+DBT list DBT s
+END
+
+
+/*
+ * pg_trunc: truncate the free list
+ *
+ * meta: meta page number
+ * meta_lsn: lsn on meta page.
+ * last_free: page number of new last free page.
+ * last_lsn: lsn of the last free page.
+ * last_pgno: current last page number.
+ * list: list of pages and lsns on free list.
+ */
+BEGIN pg_trunc 50 66
+DB fileid int32_t ld
+ARG meta db_pgno_t lu
+POINTER meta_lsn DB_LSN * lu
+ARG last_free db_pgno_t lu
+POINTER last_lsn DB_LSN * lu
+ARG next_free db_pgno_t lu
+ARG last_pgno db_pgno_t lu
+PGLIST list DBT s
+END
+
+/*
+ * realloc: allocate a range of pages from the free list
+ * prev_pgno: page number of the page preceding the set of pages to
+ * be allocated
+ * page_lsn: LSN from the prev_pgno page
+ * next_free: page number of the page immediately following the set
+ * of pages to be allocated
+ * ptype: the type of page being allocated
+ * list: pairs of page numbers and LSNs corresponding to the pages on
+ * the free list that are being reallocated
+ */
+BEGIN realloc 50 36
+DB fileid int32_t ld
+ARG prev_pgno db_pgno_t lu
+POINTER page_lsn DB_LSN * lu
+ARG next_free db_pgno_t lu
+ARG ptype u_int32_t lu
+PGLIST list DBT s
+END
+
+/*
+ * relink: relink next and previous page pointers
+ * NOTE: moved from btree so its number is from that range.
+ * pgno: The page being removed.
+ * new_pgno: The new page number, if any.
+ * prev_pgno: The previous page, if any.
+ * lsn_prev: The previous page's original lsn.
+ * next_pgno: The next page, if any.
+ * lsn_next: The next page's original lsn.
+ */
+BEGIN relink 44 147
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+ARG new_pgno db_pgno_t lu
+ARG prev_pgno db_pgno_t lu
+POINTER lsn_prev DB_LSN * lu
+ARG next_pgno db_pgno_t lu
+POINTER lsn_next DB_LSN * lu
+END
+
+/*
+ * Merge: merge two pages.
+ * NOTE: moved from btree so its number is from that range.
+ * pgno: The page number of the target page.
+ * lsn: Original LSN of the page.
+ * npgno: The page number of the next, or merged, page.
+ * nlsn: The LSN of the next page.
+ * hdr: The page header of the next page.
+ * data: The data from the next page.
+ * pg_copy: If 1, then the whole page was copied.
+ */
+BEGIN merge 47 148
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER lsn DB_LSN * lu
+ARG npgno db_pgno_t lu
+POINTER nlsn DB_LSN * lu
+PGDBT hdr DBT s
+PGDDBT data DBT s
+ARG pg_copy int32_t lu
+END
+
+
+/*
+ * pgno -- Handles replacing a page number in the record
+ * reference at index indx on page pgno.
+ * NOTE: moved from btree so its number is from that range.
+ * pgno: The page that is being updated.
+ * lsn: The LSN of the page.
+ * indx: The index of the record being updated.
+ * opgno: Old page number.
+ * npgno: New page number.
+ */
+BEGIN pgno 44 149
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER lsn DB_LSN * lu
+ARG indx u_int32_t lu
+ARG opgno db_pgno_t lu
+ARG npgno db_pgno_t lu
+END
diff --git a/src/db/db_am.c b/src/db/db_am.c
new file mode 100644
index 00000000..1cf3a505
--- /dev/null
+++ b/src/db/db_am.c
@@ -0,0 +1,1150 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/heap.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/partition.h"
+#include "dbinc/qam.h"
+#include "dbinc/txn.h"
+
+static int __db_secondary_get __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
+static int __dbc_set_priority __P((DBC *, DB_CACHE_PRIORITY));
+static int __dbc_get_priority __P((DBC *, DB_CACHE_PRIORITY *));
+
+/*
+ * __db_cursor_int --
+ * Internal routine to create a cursor.
+ *
+ * PUBLIC: int __db_cursor_int __P((DB *, DB_THREAD_INFO *,
+ * PUBLIC: DB_TXN *, DBTYPE, db_pgno_t, int, DB_LOCKER *, DBC **));
+ */
+int
+__db_cursor_int(dbp, ip, txn, dbtype, root, flags, locker, dbcp)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ DBTYPE dbtype;
+ db_pgno_t root;
+ int flags;
+ DB_LOCKER *locker;
+ DBC **dbcp;
+{
+ DBC *dbc;
+ DBC_INTERNAL *cp;
+ DB_LOCKREQ req;
+ ENV *env;
+ db_threadid_t tid;
+ int allocated, envlid, ret;
+ pid_t pid;
+
+ env = dbp->env;
+ allocated = envlid = 0;
+
+ /*
+ * If dbcp is non-NULL it is assumed to point to an area to initialize
+ * as a cursor.
+ *
+ * Take one from the free list if it's available. Take only the
+ * right type. With off page dups we may have different kinds
+ * of cursors on the queue for a single database.
+ */
+ MUTEX_LOCK(env, dbp->mutex);
+
+#ifndef HAVE_NO_DB_REFCOUNT
+ /*
+ * If this DBP is being logged then refcount the log filename
+ * relative to this transaction. We do this here because we have
+ * the dbp->mutex which protects the refcount. We want to avoid
+ * calling the function if the transaction handle has a shared parent
+ * locker or we are duplicating a cursor. This includes the case of
+ * creating an off page duplicate cursor.
+ * If we knew this cursor will not be used in an update, we could avoid
+ * this, but we don't have that information.
+ */
+ if (IS_REAL_TXN(txn) &&
+ !LF_ISSET(DBC_OPD | DBC_DUPLICATE) &&
+ !F_ISSET(dbp, DB_AM_RECOVER) &&
+ dbp->log_filename != NULL && !IS_REP_CLIENT(env) &&
+ (ret = __txn_record_fname(env, txn, dbp->log_filename)) != 0) {
+ MUTEX_UNLOCK(env, dbp->mutex);
+ return (ret);
+ }
+
+#endif
+
+ TAILQ_FOREACH(dbc, &dbp->free_queue, links)
+ if (dbtype == dbc->dbtype) {
+ TAILQ_REMOVE(&dbp->free_queue, dbc, links);
+ F_CLR(dbc, ~DBC_OWN_LID);
+ break;
+ }
+ MUTEX_UNLOCK(env, dbp->mutex);
+
+ if (dbc == NULL) {
+ if ((ret = __os_calloc(env, 1, sizeof(DBC), &dbc)) != 0)
+ return (ret);
+ allocated = 1;
+ dbc->flags = 0;
+
+ dbc->dbp = dbp;
+ dbc->dbenv = dbp->dbenv;
+ dbc->env = dbp->env;
+
+ /* Set up locking information. */
+ if (LOCKING_ON(env)) {
+ /*
+ * If we are not threaded, we share a locker ID among
+ * all cursors opened in the environment handle,
+ * allocating one if this is the first cursor.
+ *
+ * This relies on the fact that non-threaded DB handles
+ * always have non-threaded environment handles, since
+ * we set DB_THREAD on DB handles created with threaded
+ * environment handles.
+ */
+ if (!DB_IS_THREADED(dbp)) {
+ if (env->env_lref == NULL) {
+ if ((ret = __lock_id(env,
+ NULL, &env->env_lref)) != 0)
+ goto err;
+ envlid = 1;
+ }
+ dbc->lref = env->env_lref;
+ }
+
+ /*
+ * In CDB, secondary indices should share a lock file
+ * ID with the primary; otherwise we're susceptible
+ * to deadlocks. We also use __db_cursor_int rather
+ * than __db_cursor to create secondary update cursors
+ * in c_put and c_del; these won't acquire a new lock.
+ *
+ * !!!
+ * Since this is in the one-time cursor allocation
+ * code, we need to be sure to destroy, not just
+ * close, all cursors in the secondary when we
+ * associate.
+ */
+ if (CDB_LOCKING(env) &&
+ F_ISSET(dbp, DB_AM_SECONDARY))
+ memcpy(dbc->lock.fileid,
+ dbp->s_primary->fileid, DB_FILE_ID_LEN);
+ else
+ memcpy(dbc->lock.fileid,
+ dbp->fileid, DB_FILE_ID_LEN);
+
+ if (CDB_LOCKING(env)) {
+ if (F_ISSET(env->dbenv, DB_ENV_CDB_ALLDB)) {
+ /*
+ * If we are doing a single lock per
+ * environment, set up the global
+ * lock object just like we do to
+ * single thread creates.
+ */
+ DB_ASSERT(env, sizeof(db_pgno_t) ==
+ sizeof(u_int32_t));
+ dbc->lock_dbt.size = sizeof(u_int32_t);
+ dbc->lock_dbt.data = &dbc->lock.pgno;
+ dbc->lock.pgno = 0;
+ } else {
+ dbc->lock_dbt.size = DB_FILE_ID_LEN;
+ dbc->lock_dbt.data = dbc->lock.fileid;
+ }
+ } else {
+ dbc->lock.type = DB_PAGE_LOCK;
+ dbc->lock_dbt.size = sizeof(dbc->lock);
+ dbc->lock_dbt.data = &dbc->lock;
+ }
+ }
+ /* Init the DBC internal structure. */
+#ifdef HAVE_PARTITION
+ if (DB_IS_PARTITIONED(dbp)) {
+ if ((ret = __partc_init(dbc)) != 0)
+ goto err;
+ } else
+#endif
+ switch (dbtype) {
+ case DB_BTREE:
+ case DB_RECNO:
+ if ((ret = __bamc_init(dbc, dbtype)) != 0)
+ goto err;
+ break;
+ case DB_HASH:
+ if ((ret = __hamc_init(dbc)) != 0)
+ goto err;
+ break;
+ case DB_HEAP:
+ if ((ret = __heapc_init(dbc)) != 0)
+ goto err;
+ break;
+ case DB_QUEUE:
+ if ((ret = __qamc_init(dbc)) != 0)
+ goto err;
+ break;
+ case DB_UNKNOWN:
+ default:
+ ret = __db_unknown_type(env, "DB->cursor", dbtype);
+ goto err;
+ }
+
+ cp = dbc->internal;
+ }
+
+ /* Refresh the DBC structure. */
+ dbc->dbtype = dbtype;
+ RESET_RET_MEM(dbc);
+ dbc->set_priority = __dbc_set_priority;
+ dbc->get_priority = __dbc_get_priority;
+ dbc->priority = dbp->priority;
+ dbc->txn_cursors.tqe_next = NULL;
+ dbc->txn_cursors.tqe_prev = NULL;
+
+ /*
+ * If the DB handle is not threaded, there is one locker ID for the
+	 * whole environment. There should be only one family transaction active
+ * as well. This doesn't apply to CDS group transactions, where the
+ * cursor can simply use the transaction's locker directly.
+ */
+ if (!CDB_LOCKING(env) && txn != NULL && F_ISSET(txn, TXN_FAMILY) &&
+ (F_ISSET(dbc, DBC_OWN_LID) || dbc->lref == NULL || envlid)) {
+ if (LOCKING_ON(env)) {
+ if (dbc->lref == NULL) {
+ if ((ret =
+ __lock_id(env, NULL, &dbc->lref)) != 0)
+ goto err;
+ F_SET(dbc, DBC_OWN_LID);
+ }
+ if ((ret = __lock_addfamilylocker(env,
+ txn->txnid, dbc->lref->id, 1)) != 0)
+ goto err;
+ }
+ F_SET(dbc, DBC_FAMILY);
+ txn = NULL;
+ }
+
+ if ((dbc->txn = txn) != NULL)
+ dbc->locker = txn->locker;
+ else if (LOCKING_ON(env)) {
+ /*
+ * There are certain cases in which we want to create a
+ * new cursor with a particular locker ID that is known
+ * to be the same as (and thus not conflict with) an
+ * open cursor.
+ *
+ * The most obvious case is cursor duplication; when we
+ * call DBC->dup or __dbc_idup, we want to use the original
+ * cursor's locker ID.
+ *
+ * Another case is when updating secondary indices. Standard
+ * CDB locking would mean that we might block ourself: we need
+ * to open an update cursor in the secondary while an update
+ * cursor in the primary is open, and when the secondary and
+ * primary are subdatabases or we're using env-wide locking,
+ * this is disastrous.
+ *
+ * In these cases, our caller will pass a nonzero locker
+ * ID into this function. Use this locker ID instead of
+ * the default as the locker ID for our new cursor.
+ */
+ if (locker != NULL)
+ dbc->locker = locker;
+ else if (LF_ISSET(DB_RECOVER))
+ dbc->locker = NULL;
+ else {
+ if (dbc->lref == NULL) {
+ if ((ret =
+ __lock_id(env, NULL, &dbc->lref)) != 0)
+ goto err;
+ F_SET(dbc, DBC_OWN_LID);
+ }
+ /*
+ * If we are threaded then we need to set the
+ * proper thread id into the locker.
+ */
+ if (DB_IS_THREADED(dbp)) {
+ env->dbenv->thread_id(env->dbenv, &pid, &tid);
+ __lock_set_thread_id(dbc->lref, pid, tid);
+ }
+ dbc->locker = dbc->lref;
+ }
+ }
+
+ /*
+ * These fields change when we are used as a secondary index, so
+ * if the DB is a secondary, make sure they're set properly just
+ * in case we opened some cursors before we were associated.
+ *
+ * __dbc_get is used by all access methods, so this should be safe.
+ */
+ if (F_ISSET(dbp, DB_AM_SECONDARY))
+ dbc->get = dbc->c_get = __dbc_secondary_get_pp;
+
+ /*
+ * Don't enable bulk for btrees with record numbering, since avoiding
+ * a full search avoids taking write locks necessary to maintain
+ * consistent numbering.
+ */
+ if (LF_ISSET(DB_CURSOR_BULK) && dbtype == DB_BTREE &&
+ !F_ISSET(dbp, DB_AM_RECNUM))
+ F_SET(dbc, DBC_BULK);
+ if (LF_ISSET(DB_CURSOR_TRANSIENT))
+ F_SET(dbc, DBC_TRANSIENT);
+ if (LF_ISSET(DBC_OPD))
+ F_SET(dbc, DBC_OPD);
+ if (F_ISSET(dbp, DB_AM_RECOVER) || LF_ISSET(DB_RECOVER))
+ F_SET(dbc, DBC_RECOVER);
+ if (F_ISSET(dbp, DB_AM_COMPENSATE))
+ F_SET(dbc, DBC_DONTLOCK);
+ /*
+ * If this database is exclusive then the cursor
+ * does not need to get locks.
+ */
+ if (F2_ISSET(dbp, DB2_AM_EXCL)) {
+ F_SET(dbc, DBC_DONTLOCK);
+		if (IS_REAL_TXN(txn) && !LF_ISSET(DBC_OPD | DBC_DUPLICATE)) {
+ /*
+ * Exclusive databases can only have one active
+ * transaction at a time since there are no internal
+ * locks to prevent one transaction from reading and
+ * writing another's uncommitted changes.
+ */
+ if (dbp->cur_txn != NULL && dbp->cur_txn != txn) {
+ __db_errx(env, DB_STR("0749",
+"Exclusive database handles can only have one active transaction at a time."));
+ ret = EINVAL;
+ goto err;
+ }
+ /* Do not trade a second time. */
+ if (dbp->cur_txn != txn) {
+ /* Trade the handle lock to the txn locker. */
+ memset(&req, 0, sizeof(req));
+ req.lock = dbp->handle_lock;
+ req.op = DB_LOCK_TRADE;
+ if ((ret = __lock_vec(env, txn->locker, 0,
+ &req, 1, 0)) != 0)
+ goto err;
+ dbp->cur_txn = txn;
+ dbp->cur_locker = txn->locker;
+ if ((ret = __txn_lockevent(env, txn, dbp,
+ &dbp->handle_lock, dbp->locker)) != 0)
+ goto err;
+ }
+ }
+ }
+#ifdef HAVE_REPLICATION
+ /*
+ * If we are replicating from a down rev version then we must
+ * use old locking protocols.
+ */
+ if (LOGGING_ON(env) &&
+ ((LOG *)env->lg_handle->
+ reginfo.primary)->persist.version < DB_LOGVERSION_LATCHING)
+ F_SET(dbc, DBC_DOWNREV);
+#endif
+
+ /* Refresh the DBC internal structure. */
+ cp = dbc->internal;
+ cp->opd = NULL;
+ cp->pdbc = NULL;
+
+ cp->indx = 0;
+ cp->page = NULL;
+ cp->pgno = PGNO_INVALID;
+ cp->root = root;
+ cp->stream_start_pgno = cp->stream_curr_pgno = PGNO_INVALID;
+ cp->stream_off = 0;
+
+ if (DB_IS_PARTITIONED(dbp)) {
+ DBC_PART_REFRESH(dbc);
+ } else switch (dbtype) {
+ case DB_BTREE:
+ case DB_RECNO:
+ if ((ret = __bamc_refresh(dbc)) != 0)
+ goto err;
+ break;
+ case DB_HEAP:
+ if ((ret = __heapc_refresh(dbc)) != 0)
+ goto err;
+ break;
+ case DB_HASH:
+ case DB_QUEUE:
+ break;
+ case DB_UNKNOWN:
+ default:
+ ret = __db_unknown_type(env, "DB->cursor", dbp->type);
+ goto err;
+ }
+
+ /*
+ * The transaction keeps track of how many cursors were opened within
+ * it to catch application errors where the cursor isn't closed when
+ * the transaction is resolved.
+ */
+ if (txn != NULL)
+ ++txn->cursors;
+ if (ip != NULL) {
+ dbc->thread_info = ip;
+#ifdef DIAGNOSTIC
+ if (dbc->locker != NULL)
+ ip->dbth_locker =
+ R_OFFSET(&(env->lk_handle->reginfo), dbc->locker);
+ else
+ ip->dbth_locker = INVALID_ROFF;
+#endif
+ } else if (txn != NULL)
+ dbc->thread_info = txn->thread_info;
+ else
+ ENV_GET_THREAD_INFO(env, dbc->thread_info);
+
+ MUTEX_LOCK(env, dbp->mutex);
+ TAILQ_INSERT_TAIL(&dbp->active_queue, dbc, links);
+ F_SET(dbc, DBC_ACTIVE);
+ MUTEX_UNLOCK(env, dbp->mutex);
+
+ *dbcp = dbc;
+ return (0);
+
+err: if (allocated)
+ __os_free(env, dbc);
+ return (ret);
+}
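For reference, the flags interpreted above arrive through the public cursor method. A minimal application-side sketch of requesting the bulk-optimized cursor (handle names hypothetical, error handling elided):

	#include <db.h>

	/*
	 * Ask for a cursor tuned for many successive writes; for btrees
	 * without record numbering, __db_cursor_int sets DBC_BULK.
	 */
	int
	open_bulk_cursor(DB *dbp, DB_TXN *txn, DBC **dbcp)
	{
		return (dbp->cursor(dbp, txn, dbcp, DB_CURSOR_BULK));
	}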
+
+/*
+ * __db_put --
+ * Store a key/data pair.
+ *
+ * PUBLIC: int __db_put __P((DB *,
+ * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DBT *, DBT *, u_int32_t));
+ */
+int
+__db_put(dbp, ip, txn, key, data, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DB_HEAP_RID rid;
+ DBC *dbc;
+ DBT tdata, tkey;
+ ENV *env;
+ void *bulk_kptr, *bulk_ptr;
+ db_recno_t recno;
+ u_int32_t cursor_flags;
+ int ret, t_ret;
+
+ env = dbp->env;
+
+ /*
+ * See the comment in __db_get() regarding DB_CURSOR_TRANSIENT.
+ *
+ * Note that the get in the DB_NOOVERWRITE case is safe to do with this
+ * flag set; if it errors in any way other than DB_NOTFOUND, we're
+ * going to close the cursor without doing anything else, and if it
+ * returns DB_NOTFOUND then it's safe to do a c_put(DB_KEYLAST) even if
+ * an access method moved the cursor, since that's not
+ * position-dependent.
+ */
+ cursor_flags = DB_WRITELOCK;
+ if (LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY))
+ cursor_flags |= DB_CURSOR_BULK;
+ else
+ cursor_flags |= DB_CURSOR_TRANSIENT;
+ if ((ret = __db_cursor(dbp, ip, txn, &dbc, cursor_flags)) != 0)
+ return (ret);
+
+ DEBUG_LWRITE(dbc, txn, "DB->put", key, data, flags);
+ PERFMON6(env, db, put, dbp->fname,
+ dbp->dname, txn == NULL ? 0 : txn->txnid, key, data, flags);
+
+ SET_RET_MEM(dbc, dbp);
+
+ if (flags == DB_APPEND && !DB_IS_PRIMARY(dbp)) {
+ /*
+ * If there is an append callback, the value stored in
+ * data->data may be replaced and then freed. To avoid
+ * passing a freed pointer back to the user, just operate
+ * on a copy of the data DBT.
+ */
+ tdata = *data;
+
+ /*
+ * Append isn't a normal put operation; call the appropriate
+ * access method's append function.
+ */
+ switch (dbp->type) {
+ case DB_HEAP:
+ if ((ret = __heap_append(dbc, key, &tdata)) != 0)
+ goto err;
+ break;
+ case DB_QUEUE:
+ if ((ret = __qam_append(dbc, key, &tdata)) != 0)
+ goto err;
+ break;
+ case DB_RECNO:
+ if ((ret = __ram_append(dbc, key, &tdata)) != 0)
+ goto err;
+ break;
+ case DB_BTREE:
+ case DB_HASH:
+ case DB_UNKNOWN:
+ default:
+ /* The interface should prevent this. */
+ DB_ASSERT(env,
+ dbp->type == DB_QUEUE || dbp->type == DB_RECNO);
+
+ ret = __db_ferr(env, "DB->put", 0);
+ goto err;
+ }
+
+ /*
+ * The append callback, if one exists, may have allocated
+ * a new tdata.data buffer. If so, free it.
+ */
+ FREE_IF_NEEDED(env, &tdata);
+
+ /* No need for a cursor put; we're done. */
+#ifdef HAVE_COMPRESSION
+ } else if (DB_IS_COMPRESSED(dbp) && !F_ISSET(dbp, DB_AM_SECONDARY) &&
+ !DB_IS_PRIMARY(dbp) && LIST_FIRST(&dbp->f_primaries) == NULL) {
+ ret = __dbc_put(dbc, key, data, flags);
+#endif
+ } else if (LF_ISSET(DB_MULTIPLE)) {
+ ret = 0;
+ memset(&tkey, 0, sizeof(tkey));
+ if (dbp->type == DB_QUEUE || dbp->type == DB_RECNO) {
+ tkey.data = &recno;
+ tkey.size = sizeof(recno);
+ }
+ memset(&tdata, 0, sizeof(tdata));
+ DB_MULTIPLE_INIT(bulk_kptr, key);
+ DB_MULTIPLE_INIT(bulk_ptr, data);
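+		/* We return the number of pairs put in doff. */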
+ key->doff = 0;
+ while (ret == 0) {
+ if (dbp->type == DB_QUEUE || dbp->type == DB_RECNO)
+ DB_MULTIPLE_RECNO_NEXT(bulk_kptr, key,
+ recno, tdata.data, tdata.size);
+ else
+ DB_MULTIPLE_NEXT(bulk_kptr, key,
+ tkey.data, tkey.size);
+ DB_MULTIPLE_NEXT(bulk_ptr, data,
+ tdata.data, tdata.size);
+ if (bulk_kptr == NULL || bulk_ptr == NULL)
+ break;
+ if (dbp->type == DB_HEAP) {
+ memcpy(&rid, tkey.data, sizeof(DB_HEAP_RID));
+ tkey.data = &rid;
+ }
+ ret = __dbc_put(dbc, &tkey, &tdata,
+ LF_ISSET(DB_OPFLAGS_MASK));
+ if (ret == 0)
+ ++key->doff;
+ }
+ } else if (LF_ISSET(DB_MULTIPLE_KEY)) {
+ ret = 0;
+ memset(&tkey, 0, sizeof(tkey));
+ if (dbp->type == DB_QUEUE || dbp->type == DB_RECNO) {
+ tkey.data = &recno;
+ tkey.size = sizeof(recno);
+ }
+ memset(&tdata, 0, sizeof(tdata));
+ DB_MULTIPLE_INIT(bulk_ptr, key);
+ while (ret == 0) {
+ if (dbp->type == DB_QUEUE || dbp->type == DB_RECNO)
+ DB_MULTIPLE_RECNO_NEXT(bulk_ptr, key, recno,
+ tdata.data, tdata.size);
+ else
+ DB_MULTIPLE_KEY_NEXT(bulk_ptr, key, tkey.data,
+ tkey.size, tdata.data, tdata.size);
+ if (bulk_ptr == NULL)
+ break;
+ if (dbp->type == DB_HEAP) {
+ memcpy(&rid, tkey.data, sizeof(DB_HEAP_RID));
+ tkey.data = &rid;
+ }
+ ret = __dbc_put(dbc, &tkey, &tdata,
+ LF_ISSET(DB_OPFLAGS_MASK));
+ if (ret == 0)
+ ++key->doff;
+ }
+ } else
+ ret = __dbc_put(dbc, key, data, flags);
+
+err: /* Close the cursor. */
+ if (!DB_RETOK_DBPUT(ret))
+ F_SET(dbc, DBC_ERROR);
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
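The DB_MULTIPLE branches above consume bulk buffers packed by the caller. A sketch of the application side, assuming nul-terminated string keys and values and eliding buffer-overflow handling (the write macros set the pointer to NULL when the buffer fills; ulen must be a multiple of 1024 and large enough for all pairs):

	#include <errno.h>
	#include <stdlib.h>
	#include <string.h>
	#include <db.h>

	int
	bulk_put_pairs(DB *dbp, DB_TXN *txn, char **keys, char **vals, int n)
	{
		DBT key, data;
		void *kp, *dp;
		int i, ret;

		memset(&key, 0, sizeof(key));
		memset(&data, 0, sizeof(data));
		key.ulen = data.ulen = 64 * 1024;
		key.flags = data.flags = DB_DBT_USERMEM;
		if ((key.data = malloc(key.ulen)) == NULL ||
		    (data.data = malloc(data.ulen)) == NULL) {
			free(key.data);		/* free(NULL) is a no-op */
			return (ENOMEM);
		}

		/* Pack the pairs; the macros advance kp/dp through the buffers. */
		DB_MULTIPLE_WRITE_INIT(kp, &key);
		DB_MULTIPLE_WRITE_INIT(dp, &data);
		for (i = 0; i < n; i++) {
			DB_MULTIPLE_WRITE_NEXT(kp, &key,
			    keys[i], strlen(keys[i]) + 1);
			DB_MULTIPLE_WRITE_NEXT(dp, &data,
			    vals[i], strlen(vals[i]) + 1);
		}

		ret = dbp->put(dbp, txn, &key, &data, DB_MULTIPLE);
		free(key.data);
		free(data.data);
		return (ret);
	}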
+
+/*
+ * __db_del --
+ * Delete the items referenced by a key.
+ *
+ * PUBLIC: int __db_del __P((DB *,
+ * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DBT *, u_int32_t));
+ */
+int
+__db_del(dbp, ip, txn, key, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ DBT *key;
+ u_int32_t flags;
+{
+ DB_HEAP_RID rid;
+ DBC *dbc;
+ DBT data, tkey;
+ void *bulk_ptr;
+ db_recno_t recno;
+ u_int32_t cursor_flags, f_init, f_next;
+ int ret, t_ret;
+
+ COMPQUIET(bulk_ptr, NULL);
+ /* Allocate a cursor. */
+ cursor_flags = DB_WRITELOCK;
+ if (LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY))
+ cursor_flags |= DB_CURSOR_BULK;
+ if ((ret = __db_cursor(dbp, ip, txn, &dbc, cursor_flags)) != 0)
+ return (ret);
+
+ DEBUG_LWRITE(dbc, txn, "DB->del", key, NULL, flags);
+ PERFMON5(env, db, del,
+ dbp->fname, dbp->dname, txn == NULL ? 0 : txn->txnid, key, flags);
+
+#ifdef HAVE_COMPRESSION
+ if (DB_IS_COMPRESSED(dbp) && !F_ISSET(dbp, DB_AM_SECONDARY) &&
+ !DB_IS_PRIMARY(dbp) && LIST_FIRST(&dbp->f_primaries) == NULL) {
+ F_SET(dbc, DBC_TRANSIENT);
+ ret = __dbc_bulk_del(dbc, key, flags);
+ goto err;
+ }
+#endif
+
+ /*
+ * Walk a cursor through the key/data pairs, deleting as we go. Set
+ * the DB_DBT_USERMEM flag, as this might be a threaded application
+ * and the flags checking will catch us. We don't actually want the
+	 * keys or data, so set DB_DBT_ISSET. We rely on __dbc_get to clear
+ * this.
+ */
+ memset(&data, 0, sizeof(data));
+ F_SET(&data, DB_DBT_USERMEM);
+ tkey = *key;
+
+ f_init = LF_ISSET(DB_MULTIPLE_KEY) ? DB_GET_BOTH : DB_SET;
+ f_next = DB_NEXT_DUP;
+
+ /*
+ * If locking (and we haven't already acquired CDB locks), set the
+ * read-modify-write flag.
+ */
+ if (STD_LOCKING(dbc)) {
+ f_init |= DB_RMW;
+ f_next |= DB_RMW;
+ }
+
+ if (LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY)) {
+ if (dbp->type == DB_QUEUE || dbp->type == DB_RECNO) {
+ memset(&tkey, 0, sizeof(tkey));
+ tkey.data = &recno;
+ tkey.size = sizeof(recno);
+ }
+ DB_MULTIPLE_INIT(bulk_ptr, key);
+ /* We return the number of keys deleted in doff. */
+ key->doff = 0;
+bulk_next: if (dbp->type == DB_QUEUE || dbp->type == DB_RECNO)
+ DB_MULTIPLE_RECNO_NEXT(bulk_ptr, key,
+ recno, data.data, data.size);
+ else if (LF_ISSET(DB_MULTIPLE))
+ DB_MULTIPLE_NEXT(bulk_ptr, key, tkey.data, tkey.size);
+ else
+ DB_MULTIPLE_KEY_NEXT(bulk_ptr, key,
+ tkey.data, tkey.size, data.data, data.size);
+ if (bulk_ptr == NULL)
+ goto err;
+ if (dbp->type == DB_HEAP) {
+ memcpy(&rid, tkey.data, sizeof(DB_HEAP_RID));
+ tkey.data = &rid;
+ }
+	}
+
+ /* We're not interested in the data -- do not return it. */
+ F_SET(&tkey, DB_DBT_ISSET);
+ F_SET(&data, DB_DBT_ISSET);
+
+ /*
+ * Optimize the simple cases. For all AMs if we don't have secondaries
+ * and are not a secondary and we aren't a foreign database and there
+ * are no dups then we can avoid a bunch of overhead. For queue we
+ * don't need to fetch the record since we delete by direct calculation
+ * from the record number.
+ *
+ * Hash permits an optimization in DB->del: since on-page duplicates are
+ * stored in a single HKEYDATA structure, it's possible to delete an
+ * entire set of them at once, and as the HKEYDATA has to be rebuilt
+ * and re-put each time it changes, this is much faster than deleting
+ * the duplicates one by one. Thus, if not pointing at an off-page
+ * duplicate set, and we're not using secondary indices (in which case
+ * we'd have to examine the items one by one anyway), let hash do this
+ * "quick delete".
+ *
+ * !!!
+ * Note that this is the only application-executed delete call in
+ * Berkeley DB that does not go through the __dbc_del function.
+ * If anything other than the delete itself (like a secondary index
+ * update) has to happen there in a particular situation, the
+ * conditions here should be modified not to use these optimizations.
+ * The ordinary AM-independent alternative will work just fine;
+ * it'll just be slower.
+ */
+ if (!F_ISSET(dbp, DB_AM_SECONDARY) && !DB_IS_PRIMARY(dbp) &&
+ LIST_FIRST(&dbp->f_primaries) == NULL) {
+#ifdef HAVE_QUEUE
+ if (dbp->type == DB_QUEUE) {
+ ret = __qam_delete(dbc, &tkey, flags);
+ goto next;
+ }
+#endif
+
+ /* Fetch the first record. */
+ if ((ret = __dbc_get(dbc, &tkey, &data, f_init)) != 0)
+ goto err;
+
+#ifdef HAVE_HASH
+ /*
+ * Hash "quick delete" removes all on-page duplicates. We
+ * can't do that if deleting specific key/data pairs.
+ */
+ if (dbp->type == DB_HASH && !LF_ISSET(DB_MULTIPLE_KEY)) {
+ DBC *sdbc;
+ sdbc = dbc;
+#ifdef HAVE_PARTITION
+ if (F_ISSET(dbc, DBC_PARTITIONED))
+ sdbc =
+ ((PART_CURSOR*)dbc->internal)->sub_cursor;
+#endif
+ if (sdbc->internal->opd == NULL) {
+ ret = __ham_quick_delete(sdbc);
+ goto next;
+ }
+ }
+#endif
+
+ if (!F_ISSET(dbp, DB_AM_DUP)) {
+ ret = dbc->am_del(dbc, 0);
+ goto next;
+ }
+ } else if ((ret = __dbc_get(dbc, &tkey, &data, f_init)) != 0)
+ goto err;
+
+ /* Walk through the set of key/data pairs, deleting as we go. */
+ for (;;) {
+ if ((ret = __dbc_del(dbc, flags)) != 0)
+ break;
+ /*
+ * With DB_MULTIPLE_KEY, the application has specified the
+ * exact records they want deleted. We don't need to walk
+ * through a set of duplicates.
+ */
+ if (LF_ISSET(DB_MULTIPLE_KEY))
+ break;
+
+ F_SET(&tkey, DB_DBT_ISSET);
+ F_SET(&data, DB_DBT_ISSET);
+ if ((ret = __dbc_get(dbc, &tkey, &data, f_next)) != 0) {
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+ break;
+ }
+ }
+
+next: if (ret == 0 && LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY)) {
+ ++key->doff;
+ goto bulk_next;
+ }
+err: /* Discard the cursor. */
+ if (!DB_RETOK_DBDEL(ret))
+ F_SET(dbc, DBC_ERROR);
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
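Correspondingly, the bulk branch of __db_del is driven by a key-only DB_MULTIPLE buffer. A sketch, with the same caveats as the put example:

	#include <errno.h>
	#include <stdlib.h>
	#include <string.h>
	#include <db.h>

	int
	bulk_del_keys(DB *dbp, DB_TXN *txn, char **keys, int n)
	{
		DBT key;
		void *kp;
		int i, ret;

		memset(&key, 0, sizeof(key));
		key.ulen = 16 * 1024;	/* must be a multiple of 1024 */
		key.flags = DB_DBT_USERMEM;
		if ((key.data = malloc(key.ulen)) == NULL)
			return (ENOMEM);

		DB_MULTIPLE_WRITE_INIT(kp, &key);
		for (i = 0; i < n; i++)
			DB_MULTIPLE_WRITE_NEXT(kp, &key,
			    keys[i], strlen(keys[i]) + 1);

		/* On success, key.doff holds the number of keys deleted. */
		ret = dbp->del(dbp, txn, &key, DB_MULTIPLE);
		free(key.data);
		return (ret);
	}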
+
+/*
+ * __db_sync --
+ * Flush the database cache.
+ *
+ * PUBLIC: int __db_sync __P((DB *));
+ */
+int
+__db_sync(dbp)
+ DB *dbp;
+{
+ int ret, t_ret;
+
+ ret = 0;
+
+ /* If the database was read-only, we're done. */
+ if (F_ISSET(dbp, DB_AM_RDONLY))
+ return (0);
+
+ /* If it's a Recno tree, write the backing source text file. */
+ if (dbp->type == DB_RECNO)
+ ret = __ram_writeback(dbp);
+
+ /* If the database was never backed by a database file, we're done. */
+ if (F_ISSET(dbp, DB_AM_INMEM))
+ return (ret);
+#ifdef HAVE_PARTITION
+ if (DB_IS_PARTITIONED(dbp))
+ ret = __partition_sync(dbp);
+ else
+#endif
+ if (dbp->type == DB_QUEUE)
+ ret = __qam_sync(dbp);
+ else
+ /* Flush any dirty pages from the cache to the backing file. */
+ if ((t_ret = __memp_fsync(dbp->mpf)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __db_associate --
+ * Associate another database as a secondary index to this one.
+ *
+ * PUBLIC: int __db_associate __P((DB *, DB_THREAD_INFO *, DB_TXN *, DB *,
+ * PUBLIC: int (*)(DB *, const DBT *, const DBT *, DBT *), u_int32_t));
+ */
+int
+__db_associate(dbp, ip, txn, sdbp, callback, flags)
+ DB *dbp, *sdbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ int (*callback) __P((DB *, const DBT *, const DBT *, DBT *));
+ u_int32_t flags;
+{
+ DBC *pdbc, *sdbc;
+ DBT key, data, skey, *tskeyp;
+ ENV *env;
+ int build, ret, t_ret;
+ u_int32_t nskey;
+
+ env = dbp->env;
+ pdbc = sdbc = NULL;
+ ret = 0;
+
+ memset(&skey, 0, sizeof(DBT));
+ nskey = 0;
+ tskeyp = NULL;
+
+ /*
+ * Check to see if the secondary is empty -- and thus if we should
+ * build it -- before we link it in and risk making it show up in other
+ * threads. Do this first so that the databases remain unassociated on
+ * error.
+ */
+ build = 0;
+ if (LF_ISSET(DB_CREATE)) {
+ FLD_SET(sdbp->s_assoc_flags, DB_ASSOC_CREATE);
+
+ if ((ret = __db_cursor(sdbp, ip, txn, &sdbc, 0)) != 0)
+ goto err;
+
+ /*
+ * We don't care about key or data; we're just doing
+ * an existence check.
+ */
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+ F_SET(&key, DB_DBT_PARTIAL | DB_DBT_USERMEM);
+ F_SET(&data, DB_DBT_PARTIAL | DB_DBT_USERMEM);
+ if ((ret = __dbc_get(sdbc, &key, &data,
+ (STD_LOCKING(sdbc) ? DB_RMW : 0) |
+ DB_FIRST)) == DB_NOTFOUND) {
+ build = 1;
+ ret = 0;
+ }
+
+ if (ret != 0)
+ F_SET(sdbc, DBC_ERROR);
+ if ((t_ret = __dbc_close(sdbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Reset for later error check. */
+ sdbc = NULL;
+
+ if (ret != 0)
+ goto err;
+ }
+
+ /*
+ * Set up the database handle as a secondary.
+ */
+ sdbp->s_callback = callback;
+ sdbp->s_primary = dbp;
+
+ sdbp->stored_get = sdbp->get;
+ sdbp->get = __db_secondary_get;
+
+ sdbp->stored_close = sdbp->close;
+ sdbp->close = __db_secondary_close_pp;
+
+ F_SET(sdbp, DB_AM_SECONDARY);
+
+ if (LF_ISSET(DB_IMMUTABLE_KEY))
+ FLD_SET(sdbp->s_assoc_flags, DB_ASSOC_IMMUTABLE_KEY);
+
+ /*
+ * Add the secondary to the list on the primary. Do it here
+ * so that we see any updates that occur while we're walking
+ * the primary.
+ */
+ MUTEX_LOCK(env, dbp->mutex);
+
+ /* See __db_s_next for an explanation of secondary refcounting. */
+ DB_ASSERT(env, sdbp->s_refcnt == 0);
+ sdbp->s_refcnt = 1;
+ LIST_INSERT_HEAD(&dbp->s_secondaries, sdbp, s_links);
+ MUTEX_UNLOCK(env, dbp->mutex);
+
+ if (build) {
+ /*
+ * We loop through the primary, putting each item we
+ * find into the new secondary.
+ *
+ * If we're using CDB, opening these two cursors puts us
+ * in a bit of a locking tangle: CDB locks are done on the
+ * primary, so that we stay deadlock-free, but that means
+ * that updating the secondary while we have a read cursor
+ * open on the primary will self-block. To get around this,
+ * we force the primary cursor to use the same locker ID
+ * as the secondary, so they won't conflict. This should
+ * be harmless even if we're not using CDB.
+ */
+ if ((ret = __db_cursor(sdbp, ip, txn, &sdbc,
+ CDB_LOCKING(sdbp->env) ? DB_WRITECURSOR : 0)) != 0)
+ goto err;
+ if ((ret = __db_cursor_int(dbp, ip,
+ txn, dbp->type, PGNO_INVALID, 0, sdbc->locker, &pdbc)) != 0)
+ goto err;
+
+ /* Lock out other threads, now that we have a locker. */
+ dbp->associate_locker = sdbc->locker;
+
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+ while ((ret = __dbc_get(pdbc, &key, &data, DB_NEXT)) == 0) {
+ if ((ret = callback(sdbp, &key, &data, &skey)) != 0) {
+ if (ret == DB_DONOTINDEX)
+ continue;
+ goto err;
+ }
+ if (F_ISSET(&skey, DB_DBT_MULTIPLE)) {
+#ifdef DIAGNOSTIC
+ __db_check_skeyset(sdbp, &skey);
+#endif
+ nskey = skey.size;
+ tskeyp = (DBT *)skey.data;
+ } else {
+ nskey = 1;
+ tskeyp = &skey;
+ }
+ SWAP_IF_NEEDED(sdbp, &key);
+ for (; nskey > 0; nskey--, tskeyp++) {
+ if ((ret = __dbc_put(sdbc,
+ tskeyp, &key, DB_UPDATE_SECONDARY)) != 0)
+ goto err;
+ FREE_IF_NEEDED(env, tskeyp);
+ }
+ SWAP_IF_NEEDED(sdbp, &key);
+ FREE_IF_NEEDED(env, &skey);
+ }
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+ }
+
+err: if (sdbc != NULL && (t_ret = __dbc_close(sdbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (pdbc != NULL && (t_ret = __dbc_close(pdbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ dbp->associate_locker = NULL;
+
+ for (; nskey > 0; nskey--, tskeyp++)
+ FREE_IF_NEEDED(env, tskeyp);
+ FREE_IF_NEEDED(env, &skey);
+
+ return (ret);
+}
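A sketch of the application side of this routine: a callback that extracts a fixed-width field as the secondary key, plus the DB->associate call that triggers the build loop above when DB_CREATE is set. The record layout is hypothetical:

	#include <string.h>
	#include <db.h>

	struct person {			/* hypothetical record layout */
		char first_name[64];
		char last_name[64];
	};

	static int
	by_last_name(DB *sdbp, const DBT *pkey, const DBT *pdata, DBT *skey)
	{
		(void)sdbp; (void)pkey;
		memset(skey, 0, sizeof(DBT));
		skey->data = ((struct person *)pdata->data)->last_name;
		skey->size = sizeof(((struct person *)pdata->data)->last_name);
		return (0);
	}

	int
	link_secondary(DB *primary, DB *secondary, DB_TXN *txn)
	{
		/* DB_CREATE bulk-builds the secondary if it is empty. */
		return (primary->associate(primary,
		    txn, secondary, by_last_name, DB_CREATE));
	}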
+
+/*
+ * __db_secondary_get --
+ * This wrapper function for DB->pget() is the DB->get() function
+ * on a database which has been made into a secondary index.
+ */
+static int
+__db_secondary_get(sdbp, txn, skey, data, flags)
+ DB *sdbp;
+ DB_TXN *txn;
+ DBT *skey, *data;
+ u_int32_t flags;
+{
+ DB_ASSERT(sdbp->env, F_ISSET(sdbp, DB_AM_SECONDARY));
+ return (__db_pget_pp(sdbp, txn, skey, NULL, data, flags));
+}
+
+/*
+ * __db_secondary_close --
+ * Wrapper function for DB->close() which we use on secondaries to
+ * manage refcounting and make sure we don't close them underneath
+ * a primary that is updating.
+ *
+ * PUBLIC: int __db_secondary_close __P((DB *, u_int32_t));
+ */
+int
+__db_secondary_close(sdbp, flags)
+ DB *sdbp;
+ u_int32_t flags;
+{
+ DB *primary;
+ ENV *env;
+ int doclose;
+
+ /*
+ * If the opening transaction is rolled back then the db handle
+	 * will have already been refreshed; we just need to call
+ * __db_close to free the data.
+ */
+ if (!F_ISSET(sdbp, DB_AM_OPEN_CALLED)) {
+ doclose = 1;
+ goto done;
+ }
+ doclose = 0;
+ primary = sdbp->s_primary;
+ env = primary->env;
+
+ MUTEX_LOCK(env, primary->mutex);
+ /*
+ * Check the refcount--if it was at 1 when we were called, no
+ * thread is currently updating this secondary through the primary,
+ * so it's safe to close it for real.
+ *
+ * If it's not safe to do the close now, we do nothing; the
+ * database will actually be closed when the refcount is decremented,
+ * which can happen in either __db_s_next or __db_s_done.
+ */
+ DB_ASSERT(env, sdbp->s_refcnt != 0);
+ if (--sdbp->s_refcnt == 0) {
+ LIST_REMOVE(sdbp, s_links);
+ /* We don't want to call close while the mutex is held. */
+ doclose = 1;
+ }
+ MUTEX_UNLOCK(env, primary->mutex);
+
+ /*
+ * sdbp->close is this function; call the real one explicitly if
+ * need be.
+ */
+done: return (doclose ? __db_close(sdbp, NULL, flags) : 0);
+}
+
+/*
+ * __db_associate_foreign --
+ * Associate this database (fdbp) as a foreign constraint to another
+ * database (pdbp). That is, dbp's keys appear as foreign key values in
+ * pdbp.
+ *
+ * PUBLIC: int __db_associate_foreign __P((DB *, DB *,
+ * PUBLIC: int (*)(DB *, const DBT *, DBT *, const DBT *, int *),
+ * PUBLIC: u_int32_t));
+ */
+int
+__db_associate_foreign(fdbp, pdbp, callback, flags)
+ DB *fdbp, *pdbp;
+ int (*callback)(DB *, const DBT *, DBT *, const DBT *, int *);
+ u_int32_t flags;
+{
+ DB_FOREIGN_INFO *f_info;
+ ENV *env;
+ int ret;
+
+ env = fdbp->env;
+ ret = 0;
+
+ if ((ret = __os_malloc(env, sizeof(DB_FOREIGN_INFO), &f_info)) != 0) {
+ return (ret);
+ }
+ memset(f_info, 0, sizeof(DB_FOREIGN_INFO));
+
+ f_info->dbp = pdbp;
+ f_info->callback = callback;
+
+ /*
+ * It might be wise to filter this, but for now the flags only
+ * set the delete action type.
+ */
+ FLD_SET(f_info->flags, flags);
+
+ /*
+ * Add f_info to the foreign database's list of primaries. That is to
+ * say, fdbp->f_primaries lists all databases for which fdbp is a
+ * foreign constraint.
+ */
+ MUTEX_LOCK(env, fdbp->mutex);
+ LIST_INSERT_HEAD(&fdbp->f_primaries, f_info, f_links);
+ MUTEX_UNLOCK(env, fdbp->mutex);
+
+ /*
+ * Associate fdbp as pdbp's foreign db, for referential integrity
+ * checks. We don't allow the foreign db to be changed, because we
+ * currently have no way of removing pdbp from the old foreign db's list
+ * of primaries.
+ */
+ if (pdbp->s_foreign != NULL)
+ return (EINVAL);
+ pdbp->s_foreign = fdbp;
+
+ return (ret);
+}
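A sketch of wiring this up through the public handle method. With DB_FOREIGN_CASCADE no nullify callback is needed, so NULL is passed; handle names are hypothetical:

	#include <db.h>

	/*
	 * fdbp holds the allowed foreign keys; sdbp is a secondary whose
	 * keys must appear in fdbp. Deleting a key from fdbp cascades the
	 * delete into the secondary's primary.
	 */
	int
	link_foreign(DB *fdbp, DB *sdbp)
	{
		return (fdbp->associate_foreign(fdbp,
		    sdbp, NULL, DB_FOREIGN_CASCADE));
	}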
+
+static int
+__dbc_set_priority(dbc, priority)
+ DBC *dbc;
+ DB_CACHE_PRIORITY priority;
+{
+ dbc->priority = priority;
+ return (0);
+}
+
+static int
+__dbc_get_priority(dbc, priority)
+ DBC *dbc;
+ DB_CACHE_PRIORITY *priority;
+{
+ if (dbc->priority == DB_PRIORITY_UNCHANGED)
+ return (__memp_get_priority(dbc->dbp->mpf, priority));
+ else
+ *priority = dbc->priority;
+
+ return (0);
+}
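These two functions back the public DBC->set_priority/get_priority pair. For example, an application making a single scan over a large database might mark the cursor's pages as the first cache victims (a sketch, names hypothetical):

	#include <db.h>

	int
	mark_scan_cursor(DBC *dbc)
	{
		/* Pages touched by this cursor become cheap to evict. */
		return (dbc->set_priority(dbc, DB_PRIORITY_VERY_LOW));
	}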
diff --git a/src/db/db_auto.c b/src/db/db_auto.c
new file mode 100644
index 00000000..7c6b7e66
--- /dev/null
+++ b/src/db/db_auto.c
@@ -0,0 +1,276 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_dispatch.h"
+#include "dbinc/db_am.h"
+#include "dbinc/txn.h"
+
+DB_LOG_RECSPEC __db_addrem_desc[] = {
+ {LOGREC_OP, SSZ(__db_addrem_args, opcode), "opcode", "%lu"},
+ {LOGREC_DB, SSZ(__db_addrem_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__db_addrem_args, pgno), "pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__db_addrem_args, indx), "indx", "%lu"},
+ {LOGREC_ARG, SSZ(__db_addrem_args, nbytes), "nbytes", "%lu"},
+ {LOGREC_HDR, SSZ(__db_addrem_args, hdr), "hdr", ""},
+ {LOGREC_DBT, SSZ(__db_addrem_args, dbt), "dbt", ""},
+ {LOGREC_POINTER, SSZ(__db_addrem_args, pagelsn), "pagelsn", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __db_addrem_42_desc[] = {
+ {LOGREC_ARG, SSZ(__db_addrem_42_args, opcode), "opcode", "%lu"},
+ {LOGREC_DB, SSZ(__db_addrem_42_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__db_addrem_42_args, pgno), "pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__db_addrem_42_args, indx), "indx", "%lu"},
+ {LOGREC_ARG, SSZ(__db_addrem_42_args, nbytes), "nbytes", "%lu"},
+ {LOGREC_DBT, SSZ(__db_addrem_42_args, hdr), "hdr", ""},
+ {LOGREC_DBT, SSZ(__db_addrem_42_args, dbt), "dbt", ""},
+ {LOGREC_POINTER, SSZ(__db_addrem_42_args, pagelsn), "pagelsn", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __db_big_desc[] = {
+ {LOGREC_OP, SSZ(__db_big_args, opcode), "opcode", "%lu"},
+ {LOGREC_DB, SSZ(__db_big_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__db_big_args, pgno), "pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__db_big_args, prev_pgno), "prev_pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__db_big_args, next_pgno), "next_pgno", "%lu"},
+ {LOGREC_HDR, SSZ(__db_big_args, dbt), "dbt", ""},
+ {LOGREC_POINTER, SSZ(__db_big_args, pagelsn), "pagelsn", ""},
+ {LOGREC_POINTER, SSZ(__db_big_args, prevlsn), "prevlsn", ""},
+ {LOGREC_POINTER, SSZ(__db_big_args, nextlsn), "nextlsn", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __db_big_42_desc[] = {
+ {LOGREC_ARG, SSZ(__db_big_42_args, opcode), "opcode", "%lu"},
+ {LOGREC_DB, SSZ(__db_big_42_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__db_big_42_args, pgno), "pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__db_big_42_args, prev_pgno), "prev_pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__db_big_42_args, next_pgno), "next_pgno", "%lu"},
+ {LOGREC_DBT, SSZ(__db_big_42_args, dbt), "dbt", ""},
+ {LOGREC_POINTER, SSZ(__db_big_42_args, pagelsn), "pagelsn", ""},
+ {LOGREC_POINTER, SSZ(__db_big_42_args, prevlsn), "prevlsn", ""},
+ {LOGREC_POINTER, SSZ(__db_big_42_args, nextlsn), "nextlsn", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __db_ovref_desc[] = {
+ {LOGREC_DB, SSZ(__db_ovref_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__db_ovref_args, pgno), "pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__db_ovref_args, adjust), "adjust", "%ld"},
+ {LOGREC_POINTER, SSZ(__db_ovref_args, lsn), "lsn", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __db_relink_42_desc[] = {
+ {LOGREC_ARG, SSZ(__db_relink_42_args, opcode), "opcode", "%lu"},
+ {LOGREC_DB, SSZ(__db_relink_42_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__db_relink_42_args, pgno), "pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__db_relink_42_args, lsn), "lsn", ""},
+ {LOGREC_ARG, SSZ(__db_relink_42_args, prev), "prev", "%lu"},
+ {LOGREC_POINTER, SSZ(__db_relink_42_args, lsn_prev), "lsn_prev", ""},
+ {LOGREC_ARG, SSZ(__db_relink_42_args, next), "next", "%lu"},
+ {LOGREC_POINTER, SSZ(__db_relink_42_args, lsn_next), "lsn_next", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __db_debug_desc[] = {
+ {LOGREC_DBT, SSZ(__db_debug_args, op), "op", ""},
+ {LOGREC_ARG, SSZ(__db_debug_args, fileid), "fileid", "%ld"},
+ {LOGREC_DBT, SSZ(__db_debug_args, key), "key", ""},
+ {LOGREC_DBT, SSZ(__db_debug_args, data), "data", ""},
+ {LOGREC_ARG, SSZ(__db_debug_args, arg_flags), "arg_flags", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __db_noop_desc[] = {
+ {LOGREC_DB, SSZ(__db_noop_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__db_noop_args, pgno), "pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__db_noop_args, prevlsn), "prevlsn", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __db_pg_alloc_42_desc[] = {
+ {LOGREC_DB, SSZ(__db_pg_alloc_42_args, fileid), "fileid", ""},
+ {LOGREC_POINTER, SSZ(__db_pg_alloc_42_args, meta_lsn), "meta_lsn", ""},
+ {LOGREC_ARG, SSZ(__db_pg_alloc_42_args, meta_pgno), "meta_pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__db_pg_alloc_42_args, page_lsn), "page_lsn", ""},
+ {LOGREC_ARG, SSZ(__db_pg_alloc_42_args, pgno), "pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__db_pg_alloc_42_args, ptype), "ptype", "%lu"},
+ {LOGREC_ARG, SSZ(__db_pg_alloc_42_args, next), "next", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __db_pg_alloc_desc[] = {
+ {LOGREC_DB, SSZ(__db_pg_alloc_args, fileid), "fileid", ""},
+ {LOGREC_POINTER, SSZ(__db_pg_alloc_args, meta_lsn), "meta_lsn", ""},
+ {LOGREC_ARG, SSZ(__db_pg_alloc_args, meta_pgno), "meta_pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__db_pg_alloc_args, page_lsn), "page_lsn", ""},
+ {LOGREC_ARG, SSZ(__db_pg_alloc_args, pgno), "pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__db_pg_alloc_args, ptype), "ptype", "%lu"},
+ {LOGREC_ARG, SSZ(__db_pg_alloc_args, next), "next", "%lu"},
+ {LOGREC_ARG, SSZ(__db_pg_alloc_args, last_pgno), "last_pgno", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __db_pg_free_42_desc[] = {
+ {LOGREC_DB, SSZ(__db_pg_free_42_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__db_pg_free_42_args, pgno), "pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__db_pg_free_42_args, meta_lsn), "meta_lsn", ""},
+ {LOGREC_ARG, SSZ(__db_pg_free_42_args, meta_pgno), "meta_pgno", "%lu"},
+ {LOGREC_PGDBT, SSZ(__db_pg_free_42_args, header), "header", ""},
+ {LOGREC_ARG, SSZ(__db_pg_free_42_args, next), "next", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __db_pg_free_desc[] = {
+ {LOGREC_DB, SSZ(__db_pg_free_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__db_pg_free_args, pgno), "pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__db_pg_free_args, meta_lsn), "meta_lsn", ""},
+ {LOGREC_ARG, SSZ(__db_pg_free_args, meta_pgno), "meta_pgno", "%lu"},
+ {LOGREC_PGDBT, SSZ(__db_pg_free_args, header), "header", ""},
+ {LOGREC_ARG, SSZ(__db_pg_free_args, next), "next", "%lu"},
+ {LOGREC_ARG, SSZ(__db_pg_free_args, last_pgno), "last_pgno", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __db_cksum_desc[] = {
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __db_pg_freedata_42_desc[] = {
+ {LOGREC_DB, SSZ(__db_pg_freedata_42_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__db_pg_freedata_42_args, pgno), "pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__db_pg_freedata_42_args, meta_lsn), "meta_lsn", ""},
+ {LOGREC_ARG, SSZ(__db_pg_freedata_42_args, meta_pgno), "meta_pgno", "%lu"},
+ {LOGREC_PGDBT, SSZ(__db_pg_freedata_42_args, header), "header", ""},
+ {LOGREC_ARG, SSZ(__db_pg_freedata_42_args, next), "next", "%lu"},
+ {LOGREC_PGDDBT, SSZ(__db_pg_freedata_42_args, data), "data", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __db_pg_freedata_desc[] = {
+ {LOGREC_DB, SSZ(__db_pg_freedata_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__db_pg_freedata_args, pgno), "pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__db_pg_freedata_args, meta_lsn), "meta_lsn", ""},
+ {LOGREC_ARG, SSZ(__db_pg_freedata_args, meta_pgno), "meta_pgno", "%lu"},
+ {LOGREC_PGDBT, SSZ(__db_pg_freedata_args, header), "header", ""},
+ {LOGREC_ARG, SSZ(__db_pg_freedata_args, next), "next", "%lu"},
+ {LOGREC_ARG, SSZ(__db_pg_freedata_args, last_pgno), "last_pgno", "%lu"},
+ {LOGREC_PGDDBT, SSZ(__db_pg_freedata_args, data), "data", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __db_pg_init_desc[] = {
+ {LOGREC_DB, SSZ(__db_pg_init_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__db_pg_init_args, pgno), "pgno", "%lu"},
+ {LOGREC_PGDBT, SSZ(__db_pg_init_args, header), "header", ""},
+ {LOGREC_PGDDBT, SSZ(__db_pg_init_args, data), "data", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __db_pg_sort_44_desc[] = {
+ {LOGREC_DB, SSZ(__db_pg_sort_44_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__db_pg_sort_44_args, meta), "meta", "%lu"},
+ {LOGREC_POINTER, SSZ(__db_pg_sort_44_args, meta_lsn), "meta_lsn", ""},
+ {LOGREC_ARG, SSZ(__db_pg_sort_44_args, last_free), "last_free", "%lu"},
+ {LOGREC_POINTER, SSZ(__db_pg_sort_44_args, last_lsn), "last_lsn", ""},
+ {LOGREC_ARG, SSZ(__db_pg_sort_44_args, last_pgno), "last_pgno", "%lu"},
+ {LOGREC_DBT, SSZ(__db_pg_sort_44_args, list), "list", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __db_pg_trunc_desc[] = {
+ {LOGREC_DB, SSZ(__db_pg_trunc_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__db_pg_trunc_args, meta), "meta", "%lu"},
+ {LOGREC_POINTER, SSZ(__db_pg_trunc_args, meta_lsn), "meta_lsn", ""},
+ {LOGREC_ARG, SSZ(__db_pg_trunc_args, last_free), "last_free", "%lu"},
+ {LOGREC_POINTER, SSZ(__db_pg_trunc_args, last_lsn), "last_lsn", ""},
+ {LOGREC_ARG, SSZ(__db_pg_trunc_args, next_free), "next_free", "%lu"},
+ {LOGREC_ARG, SSZ(__db_pg_trunc_args, last_pgno), "last_pgno", "%lu"},
+ {LOGREC_PGLIST, SSZ(__db_pg_trunc_args, list), "list", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __db_realloc_desc[] = {
+ {LOGREC_DB, SSZ(__db_realloc_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__db_realloc_args, prev_pgno), "prev_pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__db_realloc_args, page_lsn), "page_lsn", ""},
+ {LOGREC_ARG, SSZ(__db_realloc_args, next_free), "next_free", "%lu"},
+ {LOGREC_ARG, SSZ(__db_realloc_args, ptype), "ptype", "%lu"},
+ {LOGREC_PGLIST, SSZ(__db_realloc_args, list), "list", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __db_relink_desc[] = {
+ {LOGREC_DB, SSZ(__db_relink_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__db_relink_args, pgno), "pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__db_relink_args, new_pgno), "new_pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__db_relink_args, prev_pgno), "prev_pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__db_relink_args, lsn_prev), "lsn_prev", ""},
+ {LOGREC_ARG, SSZ(__db_relink_args, next_pgno), "next_pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__db_relink_args, lsn_next), "lsn_next", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __db_merge_desc[] = {
+ {LOGREC_DB, SSZ(__db_merge_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__db_merge_args, pgno), "pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__db_merge_args, lsn), "lsn", ""},
+ {LOGREC_ARG, SSZ(__db_merge_args, npgno), "npgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__db_merge_args, nlsn), "nlsn", ""},
+ {LOGREC_PGDBT, SSZ(__db_merge_args, hdr), "hdr", ""},
+ {LOGREC_PGDDBT, SSZ(__db_merge_args, data), "data", ""},
+ {LOGREC_ARG, SSZ(__db_merge_args, pg_copy), "pg_copy", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __db_pgno_desc[] = {
+ {LOGREC_DB, SSZ(__db_pgno_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__db_pgno_args, pgno), "pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__db_pgno_args, lsn), "lsn", ""},
+ {LOGREC_ARG, SSZ(__db_pgno_args, indx), "indx", "%lu"},
+ {LOGREC_ARG, SSZ(__db_pgno_args, opgno), "opgno", "%lu"},
+ {LOGREC_ARG, SSZ(__db_pgno_args, npgno), "npgno", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+/*
+ * PUBLIC: int __db_init_recover __P((ENV *, DB_DISTAB *));
+ */
+int
+__db_init_recover(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_addrem_recover, DB___db_addrem)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_big_recover, DB___db_big)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_ovref_recover, DB___db_ovref)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_debug_recover, DB___db_debug)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_noop_recover, DB___db_noop)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pg_alloc_recover, DB___db_pg_alloc)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pg_free_recover, DB___db_pg_free)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_cksum_recover, DB___db_cksum)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pg_freedata_recover, DB___db_pg_freedata)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pg_init_recover, DB___db_pg_init)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pg_trunc_recover, DB___db_pg_trunc)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_realloc_recover, DB___db_realloc)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_relink_recover, DB___db_relink)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_merge_recover, DB___db_merge)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pgno_recover, DB___db_pgno)) != 0)
+ return (ret);
+ return (0);
+}
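Each call above binds one log record type to its recovery function in the DB_DISTAB. A simplified model of the dispatch this enables (field and function names hypothetical; the real table and types live in dbinc/db_dispatch.h):

	#include <stddef.h>
	#include <errno.h>

	typedef int (*recover_fn)(void *env, void *rec, void *lsnp,
	    int op, void *info);

	struct distab_model {
		recover_fn *dispatch;	/* indexed by log record type */
		size_t size;
	};

	static int
	dispatch_record(struct distab_model *dtab, unsigned rectype,
	    void *env, void *rec, void *lsnp, int op, void *info)
	{
		if (rectype >= dtab->size || dtab->dispatch[rectype] == NULL)
			return (EINVAL);	/* nothing registered */
		return (dtab->dispatch[rectype](env, rec, lsnp, op, info));
	}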
diff --git a/src/db/db_autop.c b/src/db/db_autop.c
new file mode 100644
index 00000000..6fe77039
--- /dev/null
+++ b/src/db/db_autop.c
@@ -0,0 +1,441 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_dispatch.h"
+#include "dbinc/db_am.h"
+#include "dbinc/txn.h"
+
+/*
+ * PUBLIC: int __db_addrem_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_addrem_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__db_addrem", __db_addrem_desc, info));
+}
+
+/*
+ * PUBLIC: int __db_addrem_42_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_addrem_42_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__db_addrem_42", __db_addrem_42_desc, info));
+}
+
+/*
+ * PUBLIC: int __db_big_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_big_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__db_big", __db_big_desc, info));
+}
+
+/*
+ * PUBLIC: int __db_big_42_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_big_42_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__db_big_42", __db_big_42_desc, info));
+}
+
+/*
+ * PUBLIC: int __db_ovref_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_ovref_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__db_ovref", __db_ovref_desc, info));
+}
+
+/*
+ * PUBLIC: int __db_relink_42_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_relink_42_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__db_relink_42", __db_relink_42_desc, info));
+}
+
+/*
+ * PUBLIC: int __db_debug_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_debug_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__db_debug", __db_debug_desc, info));
+}
+
+/*
+ * PUBLIC: int __db_noop_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_noop_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__db_noop", __db_noop_desc, info));
+}
+
+/*
+ * PUBLIC: int __db_pg_alloc_42_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_pg_alloc_42_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__db_pg_alloc_42", __db_pg_alloc_42_desc, info));
+}
+
+/*
+ * PUBLIC: int __db_pg_alloc_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_pg_alloc_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__db_pg_alloc", __db_pg_alloc_desc, info));
+}
+
+/*
+ * PUBLIC: int __db_pg_free_42_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_pg_free_42_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__db_pg_free_42", __db_pg_free_42_desc, info));
+}
+
+/*
+ * PUBLIC: int __db_pg_free_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_pg_free_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__db_pg_free", __db_pg_free_desc, info));
+}
+
+/*
+ * PUBLIC: int __db_cksum_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_cksum_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__db_cksum", __db_cksum_desc, info));
+}
+
+/*
+ * PUBLIC: int __db_pg_freedata_42_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_pg_freedata_42_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__db_pg_freedata_42", __db_pg_freedata_42_desc, info));
+}
+
+/*
+ * PUBLIC: int __db_pg_freedata_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_pg_freedata_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__db_pg_freedata", __db_pg_freedata_desc, info));
+}
+
+/*
+ * PUBLIC: int __db_pg_init_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_pg_init_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__db_pg_init", __db_pg_init_desc, info));
+}
+
+/*
+ * PUBLIC: int __db_pg_sort_44_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_pg_sort_44_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__db_pg_sort_44", __db_pg_sort_44_desc, info));
+}
+
+/*
+ * PUBLIC: int __db_pg_trunc_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_pg_trunc_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__db_pg_trunc", __db_pg_trunc_desc, info));
+}
+
+/*
+ * PUBLIC: int __db_realloc_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_realloc_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__db_realloc", __db_realloc_desc, info));
+}
+
+/*
+ * PUBLIC: int __db_relink_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_relink_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__db_relink", __db_relink_desc, info));
+}
+
+/*
+ * PUBLIC: int __db_merge_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_merge_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__db_merge", __db_merge_desc, info));
+}
+
+/*
+ * PUBLIC: int __db_pgno_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_pgno_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__db_pgno", __db_pgno_desc, info));
+}
+
+/*
+ * PUBLIC: int __db_init_print __P((ENV *, DB_DISTAB *));
+ */
+int
+__db_init_print(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_addrem_print, DB___db_addrem)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_big_print, DB___db_big)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_ovref_print, DB___db_ovref)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_debug_print, DB___db_debug)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_noop_print, DB___db_noop)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pg_alloc_print, DB___db_pg_alloc)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pg_free_print, DB___db_pg_free)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_cksum_print, DB___db_cksum)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pg_freedata_print, DB___db_pg_freedata)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pg_init_print, DB___db_pg_init)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pg_trunc_print, DB___db_pg_trunc)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_realloc_print, DB___db_realloc)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_relink_print, DB___db_relink)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_merge_print, DB___db_merge)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pgno_print, DB___db_pgno)) != 0)
+ return (ret);
+ return (0);
+}
diff --git a/src/db/db_backup.c b/src/db/db_backup.c
new file mode 100644
index 00000000..66d7382a
--- /dev/null
+++ b/src/db/db_backup.c
@@ -0,0 +1,775 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2011, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/heap.h"
+#include "dbinc/mp.h"
+#include "dbinc/partition.h"
+
+#ifdef HAVE_QUEUE
+#include "dbinc/qam.h"
+#endif
+
+static void save_error __P((const DB_ENV *, const char *, const char *));
+static int backup_read_log_dir __P((DB_ENV *, const char *, int *, u_int32_t));
+static int backup_read_data_dir
+ __P((DB_ENV *, DB_THREAD_INFO *, const char *, const char *, u_int32_t));
+static int backup_dir_clean
+ __P((DB_ENV *, const char *, const char *, int *, u_int32_t));
+static int backup_data_copy
+ __P((DB_ENV *, const char *, const char *, const char *, int));
+
+/*
+ * __db_dbbackup_pp --
+ * Copy a database file coordinated with mpool.
+ *
+ * PUBLIC: int __db_dbbackup_pp __P((DB_ENV *,
+ * PUBLIC: const char *, const char *, u_int32_t));
+ */
+int
+__db_dbbackup_pp(dbenv, dbfile, target, flags)
+ DB_ENV *dbenv;
+ const char *dbfile, *target;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ int ret;
+
+ if ((ret = __db_fchk(dbenv->env,
+ "DB_ENV->dbbackup", flags, DB_EXCL)) != 0)
+ return (ret);
+ ENV_ENTER(dbenv->env, ip);
+
+ ret = __db_dbbackup(dbenv, ip, dbfile, target, flags);
+
+ ENV_LEAVE(dbenv->env, ip);
+ return (ret);
+}
+
+/*
+ * __db_dbbackup --
+ * Copy a database file coordinated with mpool.
+ *
+ * PUBLIC: int __db_dbbackup __P((DB_ENV *, DB_THREAD_INFO *,
+ * PUBLIC: const char *, const char *, u_int32_t));
+ */
+int
+__db_dbbackup(dbenv, ip, dbfile, target, flags)
+ DB_ENV *dbenv;
+ DB_THREAD_INFO *ip;
+ const char *dbfile, *target;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_FH *fp;
+ void *handle;
+ int ret, retry_count, t_ret;
+
+ dbp = NULL;
+ retry_count = 0;
+
+retry: if ((ret = __db_create_internal(&dbp, dbenv->env, 0)) == 0 &&
+ (ret = __db_open(dbp, ip, NULL, dbfile, NULL,
+ DB_UNKNOWN, DB_AUTO_COMMIT | DB_RDONLY, 0, PGNO_BASE_MD)) != 0) {
+ if (ret == DB_LOCK_DEADLOCK || ret == DB_LOCK_NOTGRANTED) {
+ (void)__db_close(dbp, NULL, DB_NOSYNC);
+ dbp = NULL;
+ if (++retry_count > 100)
+ return (ret);
+ __db_errx(dbenv->env, DB_STR_A("0702",
+ "Deadlock while opening %s, retrying", "%s"), dbfile);
+ __os_yield(dbenv->env, 1, 0);
+ goto retry;
+ }
+ }
+
+ if (ret == 0) {
+ if ((ret = __memp_backup_open(dbenv->env,
+ dbp->mpf, dbfile, target, flags, &fp, &handle)) == 0) {
+ if (dbp->type == DB_HEAP)
+ ret = __heap_backup(
+ dbenv, dbp, ip, fp, handle, flags);
+ else
+ ret = __memp_backup_mpf(
+ dbenv->env, dbp->mpf,
+ ip, 0, dbp->mpf->mfp->last_pgno,
+ fp, handle, flags);
+ }
+ if ((t_ret = __memp_backup_close(dbenv->env,
+ dbp->mpf, dbfile, fp, handle)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+
+#ifdef HAVE_QUEUE
+ /*
+	 * For compatibility with the 5.2 and patch versions of db_copy,
+ * dump the queue extents here.
+ */
+ if (ret == 0 && dbp->type == DB_QUEUE)
+ ret = __qam_backup_extents(dbp, ip, target, flags);
+#endif
+
+ if (dbp != NULL &&
+ (t_ret = __db_close(dbp, NULL, DB_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (ret != 0)
+ __db_err(dbenv->env, ret, "Backup Failed");
+ return (ret);
+}
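The corresponding public call, for reference (paths illustrative; DB_EXCL is the only flag __db_dbbackup_pp accepts):

	#include <db.h>

	int
	backup_one_db(DB_ENV *dbenv)
	{
		/* Hot-copy accounts.db into /backup, coordinated with mpool. */
		return (dbenv->dbbackup(dbenv, "accounts.db", "/backup", 0));
	}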
+
+/*
+ * backup_dir_clean --
+ * Clean out the backup directory.
+ */
+static int
+backup_dir_clean(dbenv, backup_dir, log_dir, remove_maxp, flags)
+ DB_ENV *dbenv;
+ const char *backup_dir, *log_dir;
+ int *remove_maxp;
+ u_int32_t flags;
+{
+ ENV *env;
+ int cnt, fcnt, ret, v;
+ const char *dir;
+ char **names, buf[DB_MAXPATHLEN], path[DB_MAXPATHLEN];
+
+ env = dbenv->env;
+
+ /* We may be cleaning a log directory separate from the target. */
+ if (log_dir != NULL) {
+ if ((ret = __os_concat_path(buf,
+ sizeof(buf), backup_dir, log_dir)) != 0) {
+ buf[sizeof(buf) - 1] = '\0';
+ __db_errx(env, DB_STR_A("0717",
+ "%s: path too long", "%s"), buf);
+ return (EINVAL);
+ }
+ dir = buf;
+ } else
+ dir = backup_dir;
+
+ /* Get a list of file names. */
+ if ((ret = __os_dirlist(env, dir, 0, &names, &fcnt)) != 0) {
+ if (log_dir != NULL && !LF_ISSET(DB_BACKUP_UPDATE))
+ return (0);
+ __db_err(env,
+ ret, DB_STR_A("0718", "%s: directory read", "%s"), dir);
+ return (ret);
+ }
+ for (cnt = fcnt; --cnt >= 0;) {
+ /*
+ * Skip non-log files (if update was specified).
+ */
+ if (!IS_LOG_FILE(names[cnt])) {
+ if (LF_ISSET(DB_BACKUP_UPDATE))
+ continue;
+ } else {
+ /* Track the highest-numbered log file removed. */
+ v = atoi(names[cnt] + sizeof(LFPREFIX) - 1);
+ if (*remove_maxp < v)
+ *remove_maxp = v;
+ }
+ if ((ret = __os_concat_path(path,
+ sizeof(path), dir, names[cnt])) != 0) {
+ path[sizeof(path) - 1] = '\0';
+ __db_errx(env, DB_STR_A("0714",
+ "%s: path too long", "%s"), path);
+ return (EINVAL);
+ }
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_BACKUP))
+ __db_msg(env, DB_STR_A("0715", "removing %s",
+ "%s"), path);
+ if ((ret = __os_unlink(env, path, 0)) != 0)
+ return (ret);
+ }
+
+ __os_dirfree(env, names, fcnt);
+
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_BACKUP) && *remove_maxp != 0)
+ __db_msg(env, DB_STR_A("0719",
+ "highest numbered log file removed: %d", "%d"),
+ *remove_maxp);
+
+ return (0);
+}
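+
+/*
+ * Note on the log-name parsing above: log file names are LFPREFIX
+ * followed by a decimal sequence number (e.g. "log.0000000007" when
+ * LFPREFIX is "log."), so
+ *
+ *	v = atoi(name + sizeof(LFPREFIX) - 1);
+ *
+ * skips the prefix (sizeof includes the trailing NUL, hence the - 1)
+ * and yields 7 for that example.
+ */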
+
+/*
+ * backup_data_copy --
+ * Copy a non-database file into the backup directory.
+ */
+static int
+backup_data_copy(dbenv, file, from_dir, to_dir, log)
+ DB_ENV *dbenv;
+ const char *file, *from_dir, *to_dir;
+ int log;
+{
+ DB_BACKUP *backup;
+ DB_FH *rfhp, *wfhp;
+ ENV *env;
+ u_int32_t gigs, off;
+ size_t nr, nw;
+ int ret, t_ret;
+ char *buf;
+ void *handle;
+ char from[DB_MAXPATHLEN], to[DB_MAXPATHLEN];
+
+ rfhp = wfhp = NULL;
+ handle = NULL;
+ buf = NULL;
+ env = dbenv->env;
+ backup = env->backup_handle;
+
+ if ((ret = __os_concat_path(from,
+ sizeof(from), from_dir, file)) != 0) {
+ from[sizeof(from) - 1] = '\0';
+ __db_errx(env, DB_STR_A("0728",
+ "%s: path too long", "%s"), from);
+ goto err;
+ }
+ if ((ret = __os_concat_path(to,
+ sizeof(to), to_dir, file)) != 0) {
+ to[sizeof(to) - 1] = '\0';
+ __db_errx(env, DB_STR_A("0729",
+ "%s: path too long", "%s"), to);
+ goto err;
+ }
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_BACKUP))
+ __db_msg(env, DB_STR_A("0726",
+ "copying %s to %s", "%s %s"), from, to);
+
+ if ((ret = __os_malloc(env, MEGABYTE, &buf)) != 0) {
+ __db_err(env, ret, DB_STR_A("0727",
+ "%lu buffer allocation", "%lu"), (u_long)MEGABYTE);
+ return (ret);
+ }
+
+ /* Open the input file. */
+ if ((ret = __os_open(env, from, 0, DB_OSO_RDONLY, 0, &rfhp)) != 0) {
+ if (ret == ENOENT && !log) {
+ ret = 0;
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_BACKUP))
+ __db_msg(env, DB_STR_A("0730",
+ "%s%c%s not present", "%s %c %s"),
+ from_dir, PATH_SEPARATOR[0], file);
+ goto done;
+ }
+		__db_err(env, ret, "%s", from);
+ goto err;
+ }
+
+ /* Open the output file. */
+ if (backup != NULL && backup->open != NULL)
+ ret = backup->open(env->dbenv, file, to_dir, &handle);
+ else {
+ if ((ret = __os_open(env, to, 0,
+ DB_OSO_CREATE | DB_OSO_TRUNC, DB_MODE_600, &wfhp)) != 0) {
+ __db_err(env, ret, "%s", to);
+ goto err;
+ }
+ }
+
+ off = 0;
+ gigs = 0;
+ /* Copy the data. */
+ while ((ret = __os_read(env, rfhp, buf, MEGABYTE, &nr)) == 0 &&
+ nr > 0) {
+ if (backup != NULL && backup->write != NULL) {
+ if ((ret = backup->write(env->dbenv, gigs,
+ off, (u_int32_t)nr, (u_int8_t *)buf, handle)) != 0)
+ break;
+ } else {
+ if ((ret = __os_write(env, wfhp, buf, nr, &nw)) != 0)
+ break;
+ if (nr != nw) {
+ ret = EIO;
+ break;
+ }
+ }
+ off += (u_int32_t)nr;
+ if (off >= GIGABYTE) {
+ gigs++;
+ off -= GIGABYTE;
+ }
+ }
+ if (ret != 0)
+ __db_err(env, ret, DB_STR("0748", "Write failed."));
+
+err:
+done: if (buf != NULL)
+ __os_free(env, buf);
+
+ if (backup != NULL && backup->close != NULL &&
+	    (t_ret = backup->close(env->dbenv, file, handle)) != 0 && ret == 0)
+ ret = t_ret;
+ if (rfhp != NULL &&
+ (t_ret = __os_closehandle(env, rfhp)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* We may be running on a remote filesystem; force the flush. */
+ if (ret == 0 && wfhp != NULL) {
+ ret = __os_fsync(env, wfhp);
+ if (ret != 0)
+ __db_err(env, ret, DB_STR("0731", "Sync failed"));
+ }
+ if (wfhp != NULL &&
+ (t_ret = __os_closehandle(env, wfhp)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
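+
+/*
+ * The backup->open/write/close calls above come from an optional
+ * application-supplied handler stored in env->backup_handle; a minimal
+ * sketch of installing one (the my_backup_* callbacks are hypothetical):
+ *
+ *	if ((ret = dbenv->set_backup_callbacks(dbenv,
+ *	    my_backup_open, my_backup_write, my_backup_close)) != 0)
+ *		dbenv->err(dbenv, ret, "DB_ENV->set_backup_callbacks");
+ *
+ * When no handler is set, the plain __os_open/__os_write path is used.
+ */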
+
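+/*
+ * save_error --
+ *	Error callback used to capture, rather than print, error messages
+ *	while probing files that may not be databases.
+ */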
+static void
+save_error(dbenv, prefix, errstr)
+ const DB_ENV *dbenv;
+ const char *prefix;
+ const char *errstr;
+{
+ COMPQUIET(prefix, NULL);
+ if (DB_GLOBAL(saved_errstr) != NULL)
+ __os_free(dbenv->env, DB_GLOBAL(saved_errstr));
+ (void)__os_strdup(dbenv->env, errstr, &DB_GLOBAL(saved_errstr));
+}
+
+/*
+ * backup_read_data_dir --
+ * Read a directory looking for databases to copy.
+ */
+static int
+backup_read_data_dir(dbenv, ip, dir, backup_dir, flags)
+ DB_ENV *dbenv;
+ DB_THREAD_INFO *ip;
+ const char *dir, *backup_dir;
+ u_int32_t flags;
+{
+ DB_MSGBUF mb;
+ ENV *env;
+ FILE *savefile;
+ int fcnt, ret;
+ size_t cnt;
+ const char *bd;
+ char **names, buf[DB_MAXPATHLEN], bbuf[DB_MAXPATHLEN];
+ void (*savecall) (const DB_ENV *, const char *, const char *);
+
+ env = dbenv->env;
+ memset(bbuf, 0, sizeof(bbuf));
+
+ bd = backup_dir;
+ if (!LF_ISSET(DB_BACKUP_SINGLE_DIR) && dir != env->db_home) {
+ cnt = sizeof(bbuf);
+ /* Build a path name to the destination. */
+ if ((ret = __os_concat_path(bbuf, sizeof(bbuf),
+ backup_dir, dir)) != 0 ||
+ (((cnt = strlen(bbuf)) == sizeof(bbuf) ||
+ (cnt == sizeof(bbuf) - 1 &&
+ strchr(PATH_SEPARATOR, bbuf[cnt - 1]) == NULL)) &&
+ LF_ISSET(DB_CREATE))) {
+ bbuf[sizeof(bbuf) - 1] = '\0';
+ __db_errx(env, DB_STR_A("0720",
+ "%s: path too long", "%s"), bbuf);
+			return (EINVAL);
+ }
+
+ /* Create the path. */
+ if (LF_ISSET(DB_CREATE)) {
+ if (strchr(PATH_SEPARATOR, bbuf[cnt - 1]) == NULL)
+ bbuf[cnt] = PATH_SEPARATOR[0];
+
+ if ((ret = __db_mkpath(env, bbuf)) != 0) {
+ __db_err(env, ret, DB_STR_A("0721",
+ "%s: cannot create", "%s"), bbuf);
+ return (ret);
+ }
+ /* step on the trailing '/' */
+ bbuf[cnt] = '\0';
+ }
+ bd = bbuf;
+	}
+ if (!__os_abspath(dir) && dir != env->db_home) {
+ /* Build a path name to the source. */
+ if ((ret = __os_concat_path(buf,
+ sizeof(buf), env->db_home, dir)) != 0) {
+ buf[sizeof(buf) - 1] = '\0';
+ __db_errx(env, DB_STR_A("0722",
+ "%s: path too long", "%s"), buf);
+ return (EINVAL);
+ }
+ dir = buf;
+ }
+ /* Get a list of file names. */
+ if ((ret = __os_dirlist(env, dir, 0, &names, &fcnt)) != 0) {
+ __db_err(env, ret, DB_STR_A("0723", "%s: directory read",
+ "%s"), dir);
+ return (ret);
+ }
+ for (cnt = (size_t)fcnt; cnt-- > 0;) {
+ /*
+ * Skip files in DB's name space, except replication dbs.
+ */
+ if (IS_LOG_FILE(names[cnt]))
+ continue;
+ if (IS_DB_FILE(names[cnt]) && !IS_REP_FILE(names[cnt])
+#ifdef HAVE_PARTITION
+ && !IS_PARTITION_DB_FILE(names[cnt])
+#endif
+ )
+ continue;
+
+ /*
+ * Skip DB_CONFIG.
+ */
+ if (LF_ISSET(DB_BACKUP_SINGLE_DIR) &&
+ !strncmp(names[cnt], "DB_CONFIG", sizeof("DB_CONFIG")))
+ continue;
+
+ /*
+ * Copy the database.
+ */
+
+ DB_MSGBUF_INIT(&mb);
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_BACKUP))
+ __db_msgadd(env, &mb, DB_STR_A("0724",
+ "copying database %s%c%s to %s%c%s",
+ "%s%c%s %s%c%s"),
+ dir, PATH_SEPARATOR[0], names[cnt],
+ bd, PATH_SEPARATOR[0], names[cnt]);
+
+ /*
+ * Suppress errors on non-db files.
+ */
+ savecall = dbenv->db_errcall;
+ dbenv->db_errcall = save_error;
+ savefile = dbenv->db_errfile;
+ dbenv->db_errfile = NULL;
+
+ ret = __db_dbbackup(dbenv, ip, names[cnt], bd, flags);
+
+ dbenv->db_errcall = savecall;
+ dbenv->db_errfile = savefile;
+
+ /* The file might not be a database. */
+ if (ret == ENOENT || ret == EINVAL) {
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_BACKUP)) {
+ __db_msgadd(env, &mb, " -- Not a database");
+ DB_MSGBUF_FLUSH(env, &mb);
+ }
+ if (LF_ISSET(DB_BACKUP_FILES))
+ ret = backup_data_copy(
+ dbenv, names[cnt], dir, bd, 0);
+ else
+ ret = 0;
+ } else if (FLD_ISSET(dbenv->verbose, DB_VERB_BACKUP))
+ DB_MSGBUF_FLUSH(env, &mb);
+
+ if (ret != 0) {
+ if (DB_GLOBAL(saved_errstr) != NULL) {
+ __db_errx(env, "%s", DB_GLOBAL(saved_errstr));
+ __os_free(env, DB_GLOBAL(saved_errstr));
+ DB_GLOBAL(saved_errstr) = NULL;
+ }
+ break;
+ }
+ }
+
+ __os_dirfree(env, names, fcnt);
+
+ return (ret);
+}
+
+/*
+ * backup_read_log_dir --
+ * Read a directory looking for log files to copy.
+ */
+static int
+backup_read_log_dir(dbenv, backup_dir, copy_minp, flags)
+ DB_ENV *dbenv;
+ const char *backup_dir;
+ int *copy_minp;
+ u_int32_t flags;
+{
+ ENV *env;
+ u_int32_t aflag;
+ size_t cnt;
+ int ret, update, v;
+ const char *backupd;
+ char **begin, **names, *logd;
+ char from[DB_MAXPATHLEN], to[DB_MAXPATHLEN];
+
+ env = dbenv->env;
+ ret = 0;
+ begin = NULL;
+ memset(to, 0, sizeof(to));
+
+ /*
+ * Figure out where the log files are and create the log
+ * destination directory if necessary.
+ */
+ backupd = backup_dir;
+ if ((logd = dbenv->db_log_dir) == NULL)
+ logd = env->db_home;
+ else {
+ if (!LF_ISSET(DB_BACKUP_SINGLE_DIR)) {
+ cnt = sizeof(to);
+ if ((ret = __os_concat_path(to,
+ sizeof(to), backup_dir, logd)) != 0 ||
+ (((cnt = strlen(to)) == sizeof(to) ||
+ (cnt == sizeof(to) - 1 &&
+ strchr(PATH_SEPARATOR, to[cnt - 1]) == NULL)) &&
+ LF_ISSET(DB_CREATE))) {
+ to[sizeof(to) - 1] = '\0';
+ __db_errx(env, DB_STR_A("0733",
+ "%s: path too long", "%s"), to);
+ goto err;
+ }
+ if (LF_ISSET(DB_CREATE)) {
+ if (strchr(PATH_SEPARATOR, to[cnt - 1]) == NULL)
+ to[cnt] = PATH_SEPARATOR[0];
+
+ if ((ret = __db_mkpath(env, to)) != 0) {
+ __db_err(env, ret, DB_STR_A("0734",
+ "%s: cannot create", "%s"), to);
+ goto err;
+ }
+ to[cnt] = '\0';
+ }
+			if ((ret = __os_strdup(env, to, (void *)&backupd)) != 0)
+ goto err;
+ }
+ if (!__os_abspath(logd)) {
+ if ((ret = __os_concat_path(from,
+ sizeof(from), env->db_home, logd)) != 0) {
+ from[sizeof(from) - 1] = '\0';
+ __db_errx(env, DB_STR_A("0732",
+ "%s: path too long", "%s"), from);
+ goto err;
+ }
+ if ((ret = __os_strdup(env, from, &logd)) != 0)
+ goto err;
+ }
+ }
+
+ update = LF_ISSET(DB_BACKUP_UPDATE);
+again: aflag = DB_ARCH_LOG;
+
+ /*
+ * If this is an update and we are deleting files, first process
+ * those files that can be removed, then repeat with the rest.
+ */
+ if (update)
+ aflag = 0;
+
+ /* Flush the log to get latest info. */
+ if ((ret = __log_flush(env, NULL)) != 0) {
+ __db_err(env, ret, DB_STR("0735", "Can't flush log"));
+ goto err;
+ }
+
+ /* Get a list of file names to be copied. */
+ if ((ret = __log_archive(env, &names, aflag)) != 0) {
+ __db_err(env, ret, DB_STR("0736", "Can't get log file names"));
+ goto err;
+ }
+ if (names == NULL)
+ goto done;
+ begin = names;
+ for (; *names != NULL; names++) {
+ /* Track the lowest-numbered log file copied. */
+ v = atoi(*names + sizeof(LFPREFIX) - 1);
+ if (*copy_minp == 0 || *copy_minp > v)
+ *copy_minp = v;
+
+ if ((ret = __os_concat_path(from,
+ sizeof(from), logd, *names)) != 0) {
+ from[sizeof(from) - 1] = '\0';
+ __db_errx(env, DB_STR_A("0737",
+ "%s: path too long", "%s"), from);
+ goto err;
+ }
+
+ /*
+ * If we're going to remove the file, attempt to rename it
+ * instead of copying and then removing. The likely failure
+ * is EXDEV (source and destination are on different volumes).
+ * Fall back to a copy, regardless of the error. We don't
+	 * worry about partial contents; the copy truncates the file
+ * on open.
+ */
+ if (update) {
+ if ((ret = __os_concat_path(to,
+ sizeof(to), backupd, *names)) != 0) {
+ to[sizeof(to) - 1] = '\0';
+ __db_errx(env, DB_STR_A("0738",
+ "%s: path too long", "%s"), to);
+ goto err;
+ }
+ if (__os_rename(env, from, to, 1) == 0) {
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_BACKUP))
+ __db_msg(env, DB_STR_A("0739",
+ "moving %s to %s",
+ "%s %s"), from, to);
+ continue;
+ }
+ }
+
+ /* Copy the file. */
+		if ((ret = backup_data_copy(dbenv,
+		    *names, logd, backupd, 1)) != 0)
+			goto err;
+
+ if (update) {
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_BACKUP))
+ __db_msg(env, DB_STR_A("0740",
+ "removing %s", "%s"), from);
+ if ((ret = __os_unlink(env, from, 0)) != 0) {
+ __db_err(env, ret, DB_STR_A("0741",
+ "unlink of %s failed", "%s"), from);
+ goto err;
+ }
+ }
+	}
+
+ __os_ufree(env, begin);
+ begin = NULL;
+done: if (update) {
+ update = 0;
+ goto again;
+ }
+
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_BACKUP) && *copy_minp != 0)
+ __db_msg(env, DB_STR_A("0742",
+ "lowest numbered log file copied: %d", "%d"),
+ *copy_minp);
+err: if (logd != dbenv->db_log_dir && logd != env->db_home)
+ __os_free(env, logd);
+ if (backupd != NULL && backupd != backup_dir)
+ __os_free(env, (void *)backupd);
+ if (begin != NULL)
+ __os_ufree(env, begin);
+
+ return (ret);
+}
+
+/*
+ * __db_backup --
+ *	Back up databases in the environment.
+ *
+ * PUBLIC: int __db_backup __P((DB_ENV *, const char *, u_int32_t));
+ */
+int
+__db_backup(dbenv, target, flags)
+ DB_ENV *dbenv;
+ const char *target;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int copy_min, remove_max, ret;
+ char **dir;
+
+ env = dbenv->env;
+ remove_max = copy_min = 0;
+
+#undef OKFLAGS
+#define OKFLAGS \
+ (DB_CREATE | DB_EXCL | DB_BACKUP_FILES | DB_BACKUP_SINGLE_DIR | \
+ DB_BACKUP_UPDATE | DB_BACKUP_NO_LOGS | DB_BACKUP_CLEAN)
+
+ if ((ret = __db_fchk(env, "DB_ENV->backup", flags, OKFLAGS)) != 0)
+ return (ret);
+
+ if (target == NULL) {
+ __db_errx(env,
+ DB_STR("0716", "Target directory may not be null."));
+ return (EINVAL);
+ }
+
+ /*
+ * If the target directory for the backup does not exist, create it
+ * with mode read-write-execute for the owner. Ignore errors here,
+ * it's simpler and more portable to just always try the create. If
+ * there's a problem, we'll fail with reasonable errors later.
+ */
+ if (LF_ISSET(DB_CREATE))
+ (void)__os_mkdir(NULL, target, DB_MODE_700);
+
+ if (LF_ISSET(DB_BACKUP_CLEAN)) {
+ if (!LF_ISSET(DB_BACKUP_SINGLE_DIR) &&
+ dbenv->db_log_dir != NULL &&
+ (ret = backup_dir_clean(dbenv, target,
+ dbenv->db_log_dir, &remove_max, flags)) != 0)
+ return (ret);
+ if ((ret = backup_dir_clean(dbenv,
+ target, NULL, &remove_max, flags)) != 0)
+ return (ret);
+	}
+
+ ENV_ENTER(env, ip);
+
+ /*
+ * If the UPDATE option was not specified, copy all database
+ * files found in the database environment home directory and
+	 * data directories.
+ */
+ if ((ret = __env_set_backup(env, 1)) != 0)
+ goto end;
+ F_SET(dbenv, DB_ENV_HOTBACKUP);
+ if (!LF_ISSET(DB_BACKUP_UPDATE)) {
+ if ((ret = backup_read_data_dir(dbenv,
+ ip, env->db_home, target, flags)) != 0)
+ goto err;
+ for (dir = dbenv->db_data_dir;
+ dir != NULL && *dir != NULL; ++dir) {
+ /*
+ * Don't allow absolute path names taken from the
+			 * environment -- running recovery with them would
+ * corrupt the source files.
+ */
+ if (!LF_ISSET(DB_BACKUP_SINGLE_DIR)
+ && __os_abspath(*dir)) {
+ __db_errx(env, DB_STR_A("0725",
+"data directory '%s' is absolute path, not permitted unless backup is to a single directory",
+ "%s"), *dir);
+ ret = EINVAL;
+ goto err;
+ }
+ if ((ret = backup_read_data_dir(
+ dbenv, ip, *dir, target, flags)) != 0)
+ goto err;
+ }
+ }
+
+ /*
+ * Copy all log files found in the log directory.
+ * The log directory defaults to the home directory.
+ */
+ if ((ret = backup_read_log_dir(dbenv, target, &copy_min, flags)) != 0)
+ goto err;
+ /*
+ * If we're updating a snapshot, the lowest-numbered log file copied
+ * into the backup directory should be less than, or equal to, the
+ * highest-numbered log file removed from the backup directory during
+ * cleanup.
+ */
+ if (LF_ISSET(DB_BACKUP_UPDATE) && remove_max < copy_min &&
+ !(remove_max == 0 && copy_min == 1)) {
+ __db_errx(env, DB_STR_A("0743",
+"the largest log file removed (%d) must be greater than or equal the smallest log file copied (%d)",
+ "%d %d"), remove_max, copy_min);
+ ret = EINVAL;
+ }
+
+err: F_CLR(dbenv, DB_ENV_HOTBACKUP);
+ (void)__env_set_backup(env, 0);
+end: ENV_LEAVE(env, ip);
+ return (ret);
+}
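+
+/*
+ * A minimal usage sketch of the public method this implements, assuming
+ * a configured environment handle "dbenv" (hypothetical): create the
+ * target if necessary and take a full backup, including non-database
+ * files:
+ *
+ *	if ((ret = dbenv->backup(dbenv,
+ *	    "/var/backup", DB_CREATE | DB_BACKUP_FILES)) != 0)
+ *		dbenv->err(dbenv, ret, "DB_ENV->backup");
+ *
+ * A later call with DB_BACKUP_UPDATE | DB_BACKUP_CLEAN against the same
+ * target refreshes only the log files, as implemented above.
+ */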
diff --git a/src/db/db_cam.c b/src/db/db_cam.c
new file mode 100644
index 00000000..6ee8b579
--- /dev/null
+++ b/src/db/db_cam.c
@@ -0,0 +1,3506 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2000, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/heap.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/partition.h"
+#include "dbinc/qam.h"
+#include "dbinc/txn.h"
+
+static int __db_s_count __P((DB *));
+static int __db_wrlock_err __P((ENV *));
+static int __dbc_del_foreign __P((DBC *));
+static int __dbc_del_oldskey __P((DB *, DBC *, DBT *, DBT *, DBT *));
+static int __dbc_del_secondary __P((DBC *));
+static int __dbc_pget_recno __P((DBC *, DBT *, DBT *, u_int32_t));
+static inline int __dbc_put_append __P((DBC *,
+ DBT *, DBT *, u_int32_t *, u_int32_t));
+static inline int __dbc_put_fixed_len __P((DBC *, DBT *, DBT *));
+static inline int __dbc_put_partial __P((DBC *,
+ DBT *, DBT *, DBT *, DBT *, u_int32_t *, u_int32_t));
+static int __dbc_put_primary __P((DBC *, DBT *, DBT *, u_int32_t));
+static inline int __dbc_put_resolve_key __P((DBC *,
+ DBT *, DBT *, u_int32_t *, u_int32_t));
+static inline int __dbc_put_secondaries __P((DBC *,
+ DBT *, DBT *, DBT *, int, DBT *, u_int32_t *));
+
+#define CDB_LOCKING_INIT(env, dbc) \
+ /* \
+ * If we are running CDB, this had better be either a write \
+ * cursor or an immediate writer. If it's a regular writer, \
+ * that means we have an IWRITE lock and we need to upgrade \
+ * it to a write lock. \
+ */ \
+ if (CDB_LOCKING(env)) { \
+ if (!F_ISSET(dbc, DBC_WRITECURSOR | DBC_WRITER)) \
+ return (__db_wrlock_err(env)); \
+ \
+ if (F_ISSET(dbc, DBC_WRITECURSOR) && \
+ (ret = __lock_get(env, \
+ (dbc)->locker, DB_LOCK_UPGRADE, &(dbc)->lock_dbt, \
+ DB_LOCK_WRITE, &(dbc)->mylock)) != 0) \
+ return (ret); \
+ }
+#define CDB_LOCKING_DONE(env, dbc) \
+ /* Release the upgraded lock. */ \
+ if (F_ISSET(dbc, DBC_WRITECURSOR)) \
+ (void)__lock_downgrade( \
+ env, &(dbc)->mylock, DB_LOCK_IWRITE, 0);
+
+#define SET_READ_LOCKING_FLAGS(dbc, var) do { \
+ var = 0; \
+ if (!F_ISSET(dbc, DBC_READ_COMMITTED | DBC_READ_UNCOMMITTED)) { \
+ if (LF_ISSET(DB_READ_COMMITTED)) \
+ var = DBC_READ_COMMITTED | DBC_WAS_READ_COMMITTED; \
+ if (LF_ISSET(DB_READ_UNCOMMITTED)) \
+ var = DBC_READ_UNCOMMITTED; \
+ } \
+ LF_CLR(DB_READ_COMMITTED | DB_READ_UNCOMMITTED); \
+} while (0)
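+
+/*
+ * The CDB_LOCKING_* macros only matter in Concurrent Data Store
+ * environments, where a cursor that writes must have been created with
+ * DB_WRITECURSOR; a minimal sketch (dbp, dbc, key and data are
+ * hypothetical, initialized handles and DBTs):
+ *
+ *	if ((ret = dbp->cursor(dbp, NULL, &dbc, DB_WRITECURSOR)) == 0 &&
+ *	    (ret = dbc->get(dbc, &key, &data, DB_FIRST)) == 0)
+ *		ret = dbc->del(dbc, 0);
+ *
+ * CDB_LOCKING_INIT upgrades such a cursor's IWRITE lock to a write lock
+ * for the duration of the operation; CDB_LOCKING_DONE downgrades it.
+ */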
+
+/*
+ * __dbc_close --
+ * DBC->close.
+ *
+ * PUBLIC: int __dbc_close __P((DBC *));
+ */
+int
+__dbc_close(dbc)
+ DBC *dbc;
+{
+ DB *dbp;
+ DBC *opd;
+ DBC_INTERNAL *cp;
+ DB_TXN *txn;
+ ENV *env;
+ int ret, t_ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ cp = dbc->internal;
+ opd = cp->opd;
+ ret = 0;
+
+ /*
+ * Remove the cursor(s) from the active queue. We may be closing two
+ * cursors at once here, a top-level one and a lower-level, off-page
+ * duplicate one. The access-method specific cursor close routine must
+ * close both of them in a single call.
+ *
+ * !!!
+ * Cursors must be removed from the active queue before calling the
+ * access specific cursor close routine, btree depends on having that
+ * order of operations.
+ */
+ MUTEX_LOCK(env, dbp->mutex);
+
+ if (opd != NULL) {
+ DB_ASSERT(env, F_ISSET(opd, DBC_ACTIVE));
+ F_CLR(opd, DBC_ACTIVE);
+ TAILQ_REMOVE(&dbp->active_queue, opd, links);
+ }
+ DB_ASSERT(env, F_ISSET(dbc, DBC_ACTIVE));
+ F_CLR(dbc, DBC_ACTIVE);
+ TAILQ_REMOVE(&dbp->active_queue, dbc, links);
+
+ MUTEX_UNLOCK(env, dbp->mutex);
+
+ /* Call the access specific cursor close routine. */
+ if ((t_ret =
+ dbc->am_close(dbc, PGNO_INVALID, NULL)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /*
+ * Release the lock after calling the access method specific close
+ * routine, a Btree cursor may have had pending deletes.
+ *
+ * Also, be sure not to free anything if mylock.off is INVALID; in
+ * some cases, such as idup'ed read cursors and secondary update
+ * cursors, a cursor in a CDB environment may not have a lock at all.
+ */
+ if (LOCK_ISSET(dbc->mylock)) {
+ if ((t_ret = __LPUT(dbc, dbc->mylock)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* For safety's sake, since this is going on the free queue. */
+ memset(&dbc->mylock, 0, sizeof(dbc->mylock));
+ if (opd != NULL)
+ memset(&opd->mylock, 0, sizeof(opd->mylock));
+ }
+
+ /*
+ * Remove this cursor's locker ID from its family.
+ */
+ if (F_ISSET(dbc, DBC_OWN_LID) && F_ISSET(dbc, DBC_FAMILY)) {
+ if ((t_ret = __lock_familyremove(env->lk_handle,
+ dbc->lref)) != 0 && ret == 0)
+ ret = t_ret;
+ F_CLR(dbc, DBC_FAMILY);
+ }
+
+ if ((txn = dbc->txn) != NULL)
+ txn->cursors--;
+
+ /* Move the cursor(s) to the free queue. */
+ MUTEX_LOCK(env, dbp->mutex);
+ if (opd != NULL) {
+ if (txn != NULL)
+ txn->cursors--;
+ TAILQ_INSERT_TAIL(&dbp->free_queue, opd, links);
+ }
+ TAILQ_INSERT_TAIL(&dbp->free_queue, dbc, links);
+ MUTEX_UNLOCK(env, dbp->mutex);
+
+ if (txn != NULL && F_ISSET(txn, TXN_PRIVATE) && txn->cursors == 0 &&
+ (t_ret = __txn_commit(txn, 0)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __dbc_destroy --
+ * Destroy the cursor, called after DBC->close.
+ *
+ * PUBLIC: int __dbc_destroy __P((DBC *));
+ */
+int
+__dbc_destroy(dbc)
+ DBC *dbc;
+{
+ DB *dbp;
+ ENV *env;
+ int ret, t_ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ /* Remove the cursor from the free queue. */
+ MUTEX_LOCK(env, dbp->mutex);
+ TAILQ_REMOVE(&dbp->free_queue, dbc, links);
+ MUTEX_UNLOCK(env, dbp->mutex);
+
+ /* Free up allocated memory. */
+ if (dbc->my_rskey.data != NULL)
+ __os_free(env, dbc->my_rskey.data);
+ if (dbc->my_rkey.data != NULL)
+ __os_free(env, dbc->my_rkey.data);
+ if (dbc->my_rdata.data != NULL)
+ __os_free(env, dbc->my_rdata.data);
+
+ /* Call the access specific cursor destroy routine. */
+ ret = dbc->am_destroy == NULL ? 0 : dbc->am_destroy(dbc);
+
+ /*
+ * Release the lock id for this cursor.
+ */
+ if (LOCKING_ON(env) &&
+ F_ISSET(dbc, DBC_OWN_LID) &&
+ (t_ret = __lock_id_free(env, dbc->lref)) != 0 && ret == 0)
+ ret = t_ret;
+
+ __os_free(env, dbc);
+
+ return (ret);
+}
+
+/*
+ * __dbc_cmp --
+ * Compare the position of two cursors. Return whether two cursors are
+ * pointing to the same key/data pair.
+ *
+ * result == 0 if both cursors refer to the same item.
+ *	result == 1 otherwise.
+ *
+ * PUBLIC: int __dbc_cmp __P((DBC *, DBC *, int *));
+ */
+int
+__dbc_cmp(dbc, other_dbc, result)
+ DBC *dbc, *other_dbc;
+ int *result;
+{
+ DBC *curr_dbc, *curr_odbc;
+ DBC_INTERNAL *dbc_int, *odbc_int;
+ ENV *env;
+ int ret;
+
+ env = dbc->env;
+ ret = 0;
+
+#ifdef HAVE_PARTITION
+ if (DB_IS_PARTITIONED(dbc->dbp)) {
+ dbc = ((PART_CURSOR *)dbc->internal)->sub_cursor;
+ other_dbc = ((PART_CURSOR *)other_dbc->internal)->sub_cursor;
+ }
+ /* Both cursors must still be valid. */
+ if (dbc == NULL || other_dbc == NULL) {
+ __db_errx(env, DB_STR("0692",
+"Both cursors must be initialized before calling DBC->cmp."));
+ return (EINVAL);
+ }
+
+ if (dbc->dbp != other_dbc->dbp) {
+ *result = 1;
+ return (0);
+ }
+#endif
+
+#ifdef HAVE_COMPRESSION
+ if (DB_IS_COMPRESSED(dbc->dbp))
+ return (__bamc_compress_cmp(dbc, other_dbc, result));
+#endif
+
+ curr_dbc = dbc;
+ curr_odbc = other_dbc;
+ dbc_int = dbc->internal;
+ odbc_int = other_dbc->internal;
+
+ /* Both cursors must be on valid positions. */
+ if (dbc_int->pgno == PGNO_INVALID || odbc_int->pgno == PGNO_INVALID) {
+ __db_errx(env, DB_STR("0693",
+"Both cursors must be initialized before calling DBC->cmp."));
+ return (EINVAL);
+ }
+
+ /*
+	 * Use a loop since cursors can be nested. Off-page duplicate
+	 * sets can only be nested one level deep, so it is safe to use a
+	 * while (1) loop.
+ */
+ while (1) {
+ if (dbc_int->pgno == odbc_int->pgno &&
+ dbc_int->indx == odbc_int->indx) {
+ /*
+ * If one cursor is sitting on an off page duplicate
+ * set, the other will be pointing to the same set. Be
+ * careful, and check anyway.
+ */
+ if (dbc_int->opd != NULL && odbc_int->opd != NULL) {
+ curr_dbc = dbc_int->opd;
+ curr_odbc = odbc_int->opd;
+ dbc_int = dbc_int->opd->internal;
+				odbc_int = odbc_int->opd->internal;
+ continue;
+ } else if (dbc_int->opd == NULL &&
+ odbc_int->opd == NULL)
+ *result = 0;
+ else {
+ __db_errx(env, DB_STR("0694",
+ "DBCursor->cmp mismatched off page duplicate cursor pointers."));
+ return (EINVAL);
+ }
+
+ switch (curr_dbc->dbtype) {
+ case DB_HASH:
+ /*
+ * Make sure that on-page duplicate data
+ * indexes match, and that the deleted
+ * flags are consistent.
+ */
+ ret = __hamc_cmp(curr_dbc, curr_odbc, result);
+ break;
+ case DB_BTREE:
+ case DB_RECNO:
+ /*
+			 * Check for consistent deleted flags on
+			 * btree-specific cursors.
+ */
+ ret = __bamc_cmp(curr_dbc, curr_odbc, result);
+ break;
+ default:
+ /* NO-OP break out. */
+ break;
+ }
+ } else
+ *result = 1;
+ return (ret);
+ }
+	/* NOTREACHED */
+ return (ret);
+}
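+
+/*
+ * A minimal usage sketch of the public method this implements, with two
+ * positioned cursors "dbc1" and "dbc2" (hypothetical):
+ *
+ *	if ((ret = dbc1->cmp(dbc1, dbc2, &result, 0)) == 0 && result == 0)
+ *		...both cursors refer to the same item...
+ */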
+
+/*
+ * __dbc_count --
+ * Return a count of duplicate data items.
+ *
+ * PUBLIC: int __dbc_count __P((DBC *, db_recno_t *));
+ */
+int
+__dbc_count(dbc, recnop)
+ DBC *dbc;
+ db_recno_t *recnop;
+{
+ ENV *env;
+ int ret;
+
+ env = dbc->env;
+
+#ifdef HAVE_PARTITION
+ if (DB_IS_PARTITIONED(dbc->dbp))
+ dbc = ((PART_CURSOR *)dbc->internal)->sub_cursor;
+#endif
+ /*
+ * Cursor Cleanup Note:
+ * All of the cursors passed to the underlying access methods by this
+ * routine are not duplicated and will not be cleaned up on return.
+ * So, pages/locks that the cursor references must be resolved by the
+ * underlying functions.
+ */
+ switch (dbc->dbtype) {
+ case DB_HEAP:
+ case DB_QUEUE:
+ case DB_RECNO:
+ *recnop = 1;
+ break;
+ case DB_HASH:
+ if (dbc->internal->opd == NULL) {
+ if ((ret = __hamc_count(dbc, recnop)) != 0)
+ return (ret);
+ break;
+ }
+ /* FALLTHROUGH */
+ case DB_BTREE:
+#ifdef HAVE_COMPRESSION
+ if (DB_IS_COMPRESSED(dbc->dbp))
+ return (__bamc_compress_count(dbc, recnop));
+#endif
+ if ((ret = __bamc_count(dbc, recnop)) != 0)
+ return (ret);
+ break;
+ case DB_UNKNOWN:
+ default:
+ return (__db_unknown_type(env, "__dbc_count", dbc->dbtype));
+ }
+ return (0);
+}
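+
+/*
+ * A minimal usage sketch of the public method this implements
+ * (DBC->count), assuming a positioned cursor "dbc" (hypothetical):
+ *
+ *	db_recno_t count;
+ *
+ *	if ((ret = dbc->count(dbc, &count, 0)) == 0)
+ *		printf("%lu duplicates\n", (u_long)count);
+ */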
+
+/*
+ * __dbc_del --
+ * DBC->del.
+ *
+ * PUBLIC: int __dbc_del __P((DBC *, u_int32_t));
+ */
+int
+__dbc_del(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ DB *dbp;
+ ENV *env;
+ int ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ CDB_LOCKING_INIT(env, dbc);
+ F_CLR(dbc, DBC_ERROR);
+
+ /*
+ * If we're a secondary index, and DB_UPDATE_SECONDARY isn't set
+ * (which it only is if we're being called from a primary update),
+ * then we need to call through to the primary and delete the item.
+ *
+ * Note that this will delete the current item; we don't need to
+ * delete it ourselves as well, so we can just goto done.
+ */
+ if (flags != DB_UPDATE_SECONDARY && F_ISSET(dbp, DB_AM_SECONDARY)) {
+ ret = __dbc_del_secondary(dbc);
+ goto done;
+ }
+
+ /*
+ * If we are a foreign db, go through and check any foreign key
+ * constraints first, which will make rolling back changes on an abort
+ * simpler.
+ */
+ if (LIST_FIRST(&dbp->f_primaries) != NULL &&
+ (ret = __dbc_del_foreign(dbc)) != 0)
+ goto done;
+
+ /*
+ * If we are a primary and have secondary indices, go through
+ * and delete any secondary keys that point at the current record.
+ */
+ if (DB_IS_PRIMARY(dbp) &&
+ (ret = __dbc_del_primary(dbc)) != 0)
+ goto done;
+
+#ifdef HAVE_COMPRESSION
+ if (DB_IS_COMPRESSED(dbp))
+ ret = __bamc_compress_del(dbc, flags);
+ else
+#endif
+ ret = __dbc_idel(dbc, flags);
+
+done: CDB_LOCKING_DONE(env, dbc);
+
+ if (!DB_RETOK_DBCDEL(ret))
+ F_SET(dbc, DBC_ERROR);
+ return (ret);
+}
+
+/*
+ * __dbc_idel --
+ *	Implementation of DBC->del.
+ *
+ * PUBLIC: int __dbc_idel __P((DBC *, u_int32_t));
+ */
+int
+__dbc_idel(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DBC *opd;
+ int ret, t_ret;
+
+ COMPQUIET(flags, 0);
+
+ dbp = dbc->dbp;
+
+ /*
+ * Cursor Cleanup Note:
+ * All of the cursors passed to the underlying access methods by this
+ * routine are not duplicated and will not be cleaned up on return.
+ * So, pages/locks that the cursor references must be resolved by the
+ * underlying functions.
+ */
+
+ /*
+ * Off-page duplicate trees are locked in the primary tree, that is,
+ * we acquire a write lock in the primary tree and no locks in the
+ * off-page dup tree. If the del operation is done in an off-page
+ * duplicate tree, call the primary cursor's upgrade routine first.
+ */
+ opd = dbc->internal->opd;
+ if (opd == NULL)
+ ret = dbc->am_del(dbc, flags);
+ else if ((ret = dbc->am_writelock(dbc)) == 0)
+ ret = opd->am_del(opd, flags);
+
+ /*
+ * If this was an update that is supporting dirty reads
+ * then we may have just swapped our read for a write lock
+ * which is held by the surviving cursor. We need
+ * to explicitly downgrade this lock. The closed cursor
+ * may only have had a read lock.
+ */
+ if (ret == 0 && F_ISSET(dbp, DB_AM_READ_UNCOMMITTED) &&
+ dbc->internal->lock_mode == DB_LOCK_WRITE) {
+ if ((ret = __TLPUT(dbc, dbc->internal->lock)) == 0)
+ dbc->internal->lock_mode = DB_LOCK_WWRITE;
+ if (dbc->internal->page != NULL && (t_ret =
+ __memp_shared(dbp->mpf, dbc->internal->page)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ }
+
+ return (ret);
+}
+
+#ifdef HAVE_COMPRESSION
+/*
+ * __dbc_bulk_del --
+ * Bulk del for a cursor.
+ *
+ *	Only implemented for compressed BTrees. It is in this file in
+ *	order to use the CDB_LOCKING_* macros.
+ *
+ * PUBLIC: #ifdef HAVE_COMPRESSION
+ * PUBLIC: int __dbc_bulk_del __P((DBC *, DBT *, u_int32_t));
+ * PUBLIC: #endif
+ */
+int
+__dbc_bulk_del(dbc, key, flags)
+ DBC *dbc;
+ DBT *key;
+ u_int32_t flags;
+{
+ ENV *env;
+ int ret;
+
+ env = dbc->env;
+
+ DB_ASSERT(env, DB_IS_COMPRESSED(dbc->dbp));
+
+ CDB_LOCKING_INIT(env, dbc);
+ F_CLR(dbc, DBC_ERROR);
+
+ ret = __bamc_compress_bulk_del(dbc, key, flags);
+
+ CDB_LOCKING_DONE(env, dbc);
+
+ return (ret);
+}
+#endif
+
+/*
+ * __dbc_dup --
+ *	Duplicate a cursor.
+ *
+ * PUBLIC: int __dbc_dup __P((DBC *, DBC **, u_int32_t));
+ */
+int
+__dbc_dup(dbc_orig, dbcp, flags)
+ DBC *dbc_orig;
+ DBC **dbcp;
+ u_int32_t flags;
+{
+ DBC *dbc_n, *dbc_nopd;
+ int ret;
+
+ dbc_n = dbc_nopd = NULL;
+
+ /* Allocate a new cursor and initialize it. */
+ if ((ret = __dbc_idup(dbc_orig, &dbc_n, flags)) != 0)
+ goto err;
+ *dbcp = dbc_n;
+
+ /*
+ * If the cursor references an off-page duplicate tree, allocate a
+ * new cursor for that tree and initialize it.
+ */
+ if (dbc_orig->internal->opd != NULL) {
+ if ((ret =
+ __dbc_idup(dbc_orig->internal->opd, &dbc_nopd, flags)) != 0)
+ goto err;
+ dbc_n->internal->opd = dbc_nopd;
+ dbc_nopd->internal->pdbc = dbc_n;
+ }
+ return (0);
+
+err: if (dbc_n != NULL)
+ (void)__dbc_close(dbc_n);
+ if (dbc_nopd != NULL)
+ (void)__dbc_close(dbc_nopd);
+
+ return (ret);
+}
+
+/*
+ * __dbc_idup --
+ * Internal version of __dbc_dup.
+ *
+ * PUBLIC: int __dbc_idup __P((DBC *, DBC **, u_int32_t));
+ */
+int
+__dbc_idup(dbc_orig, dbcp, flags)
+ DBC *dbc_orig, **dbcp;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DBC *dbc_n;
+ DBC_INTERNAL *int_n, *int_orig;
+ ENV *env;
+ int ret;
+
+ dbp = dbc_orig->dbp;
+ dbc_n = *dbcp;
+ env = dbp->env;
+
+ if ((ret = __db_cursor_int(dbp, dbc_orig->thread_info,
+ dbc_orig->txn, dbc_orig->dbtype, dbc_orig->internal->root,
+ F_ISSET(dbc_orig, DBC_OPD) | DBC_DUPLICATE,
+ dbc_orig->locker, &dbc_n)) != 0)
+ return (ret);
+
+ /* Position the cursor if requested, acquiring the necessary locks. */
+ if (LF_ISSET(DB_POSITION)) {
+ int_n = dbc_n->internal;
+ int_orig = dbc_orig->internal;
+
+ dbc_n->flags |= dbc_orig->flags & ~DBC_OWN_LID;
+
+ int_n->indx = int_orig->indx;
+ int_n->pgno = int_orig->pgno;
+ int_n->root = int_orig->root;
+ int_n->lock_mode = int_orig->lock_mode;
+
+ int_n->stream_start_pgno = int_orig->stream_start_pgno;
+ int_n->stream_off = int_orig->stream_off;
+ int_n->stream_curr_pgno = int_orig->stream_curr_pgno;
+
+ switch (dbc_orig->dbtype) {
+ case DB_QUEUE:
+ if ((ret = __qamc_dup(dbc_orig, dbc_n)) != 0)
+ goto err;
+ break;
+ case DB_BTREE:
+ case DB_RECNO:
+ if ((ret = __bamc_dup(dbc_orig, dbc_n, flags)) != 0)
+ goto err;
+ break;
+ case DB_HASH:
+ if ((ret = __hamc_dup(dbc_orig, dbc_n)) != 0)
+ goto err;
+ break;
+ case DB_HEAP:
+ if ((ret = __heapc_dup(dbc_orig, dbc_n)) != 0)
+ goto err;
+ break;
+ case DB_UNKNOWN:
+ default:
+ ret = __db_unknown_type(env,
+ "__dbc_idup", dbc_orig->dbtype);
+ goto err;
+ }
+ } else if (F_ISSET(dbc_orig, DBC_BULK)) {
+ /*
+		 * For bulk cursors, remember what page we're on, even if we
+ * don't know that the next operation will be nearby.
+ */
+ dbc_n->internal->pgno = dbc_orig->internal->pgno;
+ }
+
+ /* Copy the locking flags to the new cursor. */
+ F_SET(dbc_n, F_ISSET(dbc_orig, DBC_BULK |
+ DBC_READ_COMMITTED | DBC_READ_UNCOMMITTED | DBC_WRITECURSOR));
+
+ /*
+ * If we're in CDB and this isn't an offpage dup cursor, then
+ * we need to get a lock for the duplicated cursor.
+ */
+ if (CDB_LOCKING(env) && !F_ISSET(dbc_n, DBC_OPD) &&
+ (ret = __lock_get(env, dbc_n->locker, 0,
+ &dbc_n->lock_dbt, F_ISSET(dbc_orig, DBC_WRITECURSOR) ?
+ DB_LOCK_IWRITE : DB_LOCK_READ, &dbc_n->mylock)) != 0)
+ goto err;
+
+ dbc_n->priority = dbc_orig->priority;
+ dbc_n->internal->pdbc = dbc_orig->internal->pdbc;
+ *dbcp = dbc_n;
+ return (0);
+
+err: (void)__dbc_close(dbc_n);
+ return (ret);
+}
+
+/*
+ * __dbc_newopd --
+ * Create a new off-page duplicate cursor.
+ *
+ * PUBLIC: int __dbc_newopd __P((DBC *, db_pgno_t, DBC *, DBC **));
+ */
+int
+__dbc_newopd(dbc_parent, root, oldopd, dbcp)
+ DBC *dbc_parent;
+ db_pgno_t root;
+ DBC *oldopd;
+ DBC **dbcp;
+{
+ DB *dbp;
+ DBC *opd;
+ DBTYPE dbtype;
+ int ret;
+
+ dbp = dbc_parent->dbp;
+ dbtype = (dbp->dup_compare == NULL) ? DB_RECNO : DB_BTREE;
+
+ /*
+ * On failure, we want to default to returning the old off-page dup
+ * cursor, if any; our caller can't be left with a dangling pointer
+ * to a freed cursor. On error the only allowable behavior is to
+ * close the cursor (and the old OPD cursor it in turn points to), so
+ * this should be safe.
+ */
+ *dbcp = oldopd;
+
+ if ((ret = __db_cursor_int(dbp, dbc_parent->thread_info,
+ dbc_parent->txn,
+ dbtype, root, DBC_OPD, dbc_parent->locker, &opd)) != 0)
+ return (ret);
+
+ opd->priority = dbc_parent->priority;
+ opd->internal->pdbc = dbc_parent;
+ *dbcp = opd;
+
+ /*
+ * Check to see if we already have an off-page dup cursor that we've
+ * passed in. If we do, close it. It'd be nice to use it again
+ * if it's a cursor belonging to the right tree, but if we're doing
+ * a cursor-relative operation this might not be safe, so for now
+ * we'll take the easy way out and always close and reopen.
+ *
+ * Note that under no circumstances do we want to close the old
+ * cursor without returning a valid new one; we don't want to
+ * leave the main cursor in our caller with a non-NULL pointer
+ * to a freed off-page dup cursor.
+ */
+ if (oldopd != NULL && (ret = __dbc_close(oldopd)) != 0)
+ return (ret);
+
+ return (0);
+}
+
+/*
+ * __dbc_get --
+ * Get using a cursor.
+ *
+ * PUBLIC: int __dbc_get __P((DBC *, DBT *, DBT *, u_int32_t));
+ */
+int
+__dbc_get(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ F_CLR(dbc, DBC_ERROR);
+#ifdef HAVE_PARTITION
+ if (F_ISSET(dbc, DBC_PARTITIONED))
+ return (__partc_get(dbc, key, data, flags));
+#endif
+
+#ifdef HAVE_COMPRESSION
+ if (DB_IS_COMPRESSED(dbc->dbp))
+ return (__bamc_compress_get(dbc, key, data, flags));
+#endif
+
+ return (__dbc_iget(dbc, key, data, flags));
+}
+
+/*
+ * __dbc_iget --
+ * Implementation of get using a cursor.
+ *
+ * PUBLIC: int __dbc_iget __P((DBC *, DBT *, DBT *, u_int32_t));
+ */
+int
+__dbc_iget(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DBC *ddbc, *dbc_n, *opd;
+ DBC_INTERNAL *cp, *cp_n;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ db_pgno_t pgno;
+ db_indx_t indx_off;
+ u_int32_t multi, orig_ulen, tmp_flags, tmp_read_locking, tmp_rmw;
+ u_int8_t type;
+ int key_small, ret, t_ret;
+
+ COMPQUIET(orig_ulen, 0);
+
+ key_small = 0;
+
+ /*
+ * Cursor Cleanup Note:
+ * All of the cursors passed to the underlying access methods by this
+ * routine are duplicated cursors. On return, any referenced pages
+ * will be discarded, and, if the cursor is not intended to be used
+ * again, the close function will be called. So, pages/locks that
+ * the cursor references do not need to be resolved by the underlying
+ * functions.
+ */
+ dbp = dbc->dbp;
+ env = dbp->env;
+ mpf = dbp->mpf;
+ dbc_n = NULL;
+ opd = NULL;
+
+ PERFMON6(env, db, get, dbp->fname, dbp->dname,
+ dbc->txn == NULL ? 0 : dbc->txn->txnid, key, data, flags);
+
+ /* Clear OR'd in additional bits so we can check for flag equality. */
+ tmp_rmw = LF_ISSET(DB_RMW);
+ LF_CLR(DB_RMW);
+
+ SET_READ_LOCKING_FLAGS(dbc, tmp_read_locking);
+
+ multi = LF_ISSET(DB_MULTIPLE|DB_MULTIPLE_KEY);
+ LF_CLR(DB_MULTIPLE|DB_MULTIPLE_KEY);
+
+ /*
+ * Return a cursor's record number. It has nothing to do with the
+ * cursor get code except that it was put into the interface.
+ */
+ if (flags == DB_GET_RECNO) {
+ if (tmp_rmw)
+ F_SET(dbc, DBC_RMW);
+ F_SET(dbc, tmp_read_locking);
+ ret = __bamc_rget(dbc, data);
+ if (tmp_rmw)
+ F_CLR(dbc, DBC_RMW);
+ /* Clear the temp flags, but leave WAS_READ_COMMITTED. */
+ F_CLR(dbc, tmp_read_locking & ~DBC_WAS_READ_COMMITTED);
+ return (ret);
+ }
+
+ if (flags == DB_CONSUME || flags == DB_CONSUME_WAIT)
+ CDB_LOCKING_INIT(env, dbc);
+
+ /* Don't return the key or data if it was passed to us. */
+ if (!DB_RETURNS_A_KEY(dbp, flags))
+ F_SET(key, DB_DBT_ISSET);
+ if (flags == DB_GET_BOTH &&
+ (dbp->dup_compare == NULL || dbp->dup_compare == __bam_defcmp))
+ F_SET(data, DB_DBT_ISSET);
+
+ /*
+ * If we have an off-page duplicates cursor, and the operation applies
+ * to it, perform the operation. Duplicate the cursor and call the
+ * underlying function.
+ *
+ * Off-page duplicate trees are locked in the primary tree, that is,
+ * we acquire a write lock in the primary tree and no locks in the
+ * off-page dup tree. If the DB_RMW flag was specified and the get
+ * operation is done in an off-page duplicate tree, call the primary
+ * cursor's upgrade routine first.
+ */
+ cp = dbc->internal;
+ if (cp->opd != NULL &&
+ (flags == DB_CURRENT || flags == DB_GET_BOTHC ||
+ flags == DB_NEXT || flags == DB_NEXT_DUP ||
+ flags == DB_PREV || flags == DB_PREV_DUP)) {
+ if (tmp_rmw && (ret = dbc->am_writelock(dbc)) != 0)
+ goto err;
+ if (F_ISSET(dbc, DBC_TRANSIENT))
+ opd = cp->opd;
+ else if ((ret = __dbc_idup(cp->opd, &opd, DB_POSITION)) != 0)
+ goto err;
+
+ if ((ret = opd->am_get(opd, key, data, flags, NULL)) == 0)
+ goto done;
+ /*
+ * Another cursor may have deleted all of the off-page
+ * duplicates, so for operations that are moving a cursor, we
+ * need to skip the empty tree and retry on the parent cursor.
+ */
+ if (ret == DB_NOTFOUND &&
+ (flags == DB_PREV || flags == DB_NEXT)) {
+ ret = __dbc_close(opd);
+ opd = NULL;
+ if (F_ISSET(dbc, DBC_TRANSIENT))
+ cp->opd = NULL;
+ }
+ if (ret != 0)
+ goto err;
+ } else if (cp->opd != NULL && F_ISSET(dbc, DBC_TRANSIENT)) {
+ if ((ret = __dbc_close(cp->opd)) != 0)
+ goto err;
+ cp->opd = NULL;
+ }
+
+ /*
+ * Perform an operation on the main cursor. Duplicate the cursor,
+ * upgrade the lock as required, and call the underlying function.
+ */
+ switch (flags) {
+ case DB_CURRENT:
+ case DB_GET_BOTHC:
+ case DB_NEXT:
+ case DB_NEXT_DUP:
+ case DB_NEXT_NODUP:
+ case DB_PREV:
+ case DB_PREV_DUP:
+ case DB_PREV_NODUP:
+ tmp_flags = DB_POSITION;
+ break;
+ default:
+ tmp_flags = 0;
+ break;
+ }
+
+ /*
+ * If this cursor is going to be closed immediately, we don't
+ * need to take precautions to clean it up on error.
+ */
+ if (F_ISSET(dbc, DBC_TRANSIENT | DBC_PARTITIONED))
+ dbc_n = dbc;
+ else {
+ ret = __dbc_idup(dbc, &dbc_n, tmp_flags);
+
+ if (ret != 0)
+ goto err;
+ COPY_RET_MEM(dbc, dbc_n);
+ }
+
+ if (tmp_rmw)
+ F_SET(dbc_n, DBC_RMW);
+ F_SET(dbc_n, tmp_read_locking);
+
+ switch (multi) {
+ case DB_MULTIPLE:
+ F_SET(dbc_n, DBC_MULTIPLE);
+ break;
+ case DB_MULTIPLE_KEY:
+ F_SET(dbc_n, DBC_MULTIPLE_KEY);
+ break;
+ case DB_MULTIPLE | DB_MULTIPLE_KEY:
+ F_SET(dbc_n, DBC_MULTIPLE|DBC_MULTIPLE_KEY);
+ break;
+ case 0:
+ default:
+ break;
+ }
+
+retry: pgno = PGNO_INVALID;
+ ret = dbc_n->am_get(dbc_n, key, data, flags, &pgno);
+ if (tmp_rmw)
+ F_CLR(dbc_n, DBC_RMW);
+ /*
+ * Clear the temporary locking flags in the new cursor. The user's
+ * (old) cursor needs to have the WAS_READ_COMMITTED flag because this
+ * is used on the next call on that cursor.
+ */
+ F_CLR(dbc_n, tmp_read_locking);
+ F_SET(dbc, tmp_read_locking & DBC_WAS_READ_COMMITTED);
+ F_CLR(dbc_n, DBC_MULTIPLE|DBC_MULTIPLE_KEY);
+ if (ret != 0)
+ goto err;
+
+ cp_n = dbc_n->internal;
+
+ /*
+ * We may be referencing a new off-page duplicates tree. Acquire
+ * a new cursor and call the underlying function.
+ */
+ if (pgno != PGNO_INVALID) {
+ if ((ret = __dbc_newopd(dbc,
+ pgno, cp_n->opd, &cp_n->opd)) != 0)
+ goto err;
+
+ switch (flags) {
+ case DB_FIRST:
+ case DB_NEXT:
+ case DB_NEXT_NODUP:
+ case DB_SET:
+ case DB_SET_RECNO:
+ case DB_SET_RANGE:
+ tmp_flags = DB_FIRST;
+ break;
+ case DB_LAST:
+ case DB_PREV:
+ case DB_PREV_NODUP:
+ tmp_flags = DB_LAST;
+ break;
+ case DB_GET_BOTH:
+ case DB_GET_BOTHC:
+ case DB_GET_BOTH_RANGE:
+ tmp_flags = flags;
+ break;
+ default:
+ ret = __db_unknown_flag(env, "__dbc_get", flags);
+ goto err;
+ }
+ ret = cp_n->opd->am_get(cp_n->opd, key, data, tmp_flags, NULL);
+ /*
+ * Another cursor may have deleted all of the off-page
+ * duplicates, so for operations that are moving a cursor, we
+ * need to skip the empty tree and retry on the parent cursor.
+ */
+ if (ret == DB_NOTFOUND) {
+ PERFMON5(env, race, dbc_get,
+ dbp->fname, dbp->dname, ret, tmp_flags, key);
+
+ switch (flags) {
+ case DB_FIRST:
+ case DB_NEXT:
+ case DB_NEXT_NODUP:
+ flags = DB_NEXT;
+ break;
+ case DB_LAST:
+ case DB_PREV:
+ case DB_PREV_NODUP:
+ flags = DB_PREV;
+ break;
+ default:
+ goto err;
+ }
+
+ ret = __dbc_close(cp_n->opd);
+ cp_n->opd = NULL;
+ if (ret == 0)
+ goto retry;
+ }
+ if (ret != 0)
+ goto err;
+ }
+
+done: /*
+ * Return a key/data item. The only exception is that we don't return
+ * a key if the user already gave us one, that is, if the DB_SET flag
+ * was set. The DB_SET flag is necessary. In a Btree, the user's key
+	 * doesn't have to be the same as the key stored in the tree, depending
+ * the magic performed by the comparison function. As we may not have
+ * done any key-oriented operation here, the page reference may not be
+ * valid. Fill it in as necessary. We don't have to worry about any
+ * locks, the cursor must already be holding appropriate locks.
+ *
+ * XXX
+ * If not a Btree and DB_SET_RANGE is set, we shouldn't return a key
+ * either, should we?
+ */
+ cp_n = dbc_n == NULL ? dbc->internal : dbc_n->internal;
+ if (!F_ISSET(key, DB_DBT_ISSET)) {
+ if (cp_n->page == NULL && (ret = __memp_fget(mpf, &cp_n->pgno,
+ dbc->thread_info, dbc->txn, 0, &cp_n->page)) != 0)
+ goto err;
+
+ if ((ret = __db_ret(dbc, cp_n->page, cp_n->indx, key,
+ &dbc->rkey->data, &dbc->rkey->ulen)) != 0) {
+ /*
+ * If the key DBT is too small, we still want to return
+ * the size of the data. Otherwise applications are
+ * forced to check each one with a separate call. We
+ * don't want to copy the data, so we set the ulen to
+ * zero before calling __db_ret.
+ */
+ if (ret == DB_BUFFER_SMALL &&
+ F_ISSET(data, DB_DBT_USERMEM)) {
+ key_small = 1;
+ orig_ulen = data->ulen;
+ data->ulen = 0;
+ } else
+ goto err;
+ }
+ }
+ if (multi != 0 && dbc->am_bulk != NULL) {
+ /*
+ * Even if fetching from the OPD cursor we need a duplicate
+ * primary cursor if we are going after multiple keys.
+ */
+ if (dbc_n == NULL) {
+ /*
+ * Non-"_KEY" DB_MULTIPLE doesn't move the main cursor,
+ * so it's safe to just use dbc, unless the cursor
+ * has an open off-page duplicate cursor whose state
+ * might need to be preserved.
+ */
+ if ((!(multi & DB_MULTIPLE_KEY) &&
+ dbc->internal->opd == NULL) ||
+ F_ISSET(dbc, DBC_TRANSIENT | DBC_PARTITIONED))
+ dbc_n = dbc;
+ else {
+ if ((ret = __dbc_idup(dbc,
+ &dbc_n, DB_POSITION)) != 0)
+ goto err;
+ if ((ret = dbc_n->am_get(dbc_n,
+ key, data, DB_CURRENT, &pgno)) != 0)
+ goto err;
+ }
+ cp_n = dbc_n->internal;
+ }
+
+ /*
+ * If opd is set then we dupped the opd that we came in with.
+ * When we return we may have a new opd if we went to another
+ * key.
+ */
+ if (opd != NULL) {
+ DB_ASSERT(env, cp_n->opd == NULL);
+ cp_n->opd = opd;
+ opd = NULL;
+ }
+
+ /*
+ * Bulk get doesn't use __db_retcopy, so data.size won't
+ * get set up unless there is an error. Assume success
+ * here. This is the only call to am_bulk, and it avoids
+		 * setting it exactly the same everywhere. If we have a
+ * DB_BUFFER_SMALL error, it'll get overwritten with the
+ * needed value.
+ */
+ data->size = data->ulen;
+ ret = dbc_n->am_bulk(dbc_n, data, flags | multi);
+ } else if (!F_ISSET(data, DB_DBT_ISSET)) {
+ ddbc = opd != NULL ? opd :
+ cp_n->opd != NULL ? cp_n->opd : dbc_n;
+ cp = ddbc->internal;
+ if (cp->page == NULL &&
+ (ret = __memp_fget(mpf, &cp->pgno,
+ dbc->thread_info, ddbc->txn, 0, &cp->page)) != 0)
+ goto err;
+
+ type = TYPE(cp->page);
+ indx_off = ((type == P_LBTREE ||
+ type == P_HASH || type == P_HASH_UNSORTED) ? O_INDX : 0);
+ ret = __db_ret(ddbc, cp->page, cp->indx + indx_off,
+ data, &dbc->rdata->data, &dbc->rdata->ulen);
+ }
+
+err: /* Don't pass DB_DBT_ISSET back to application level, error or no. */
+ F_CLR(key, DB_DBT_ISSET);
+ F_CLR(data, DB_DBT_ISSET);
+
+ /* Cleanup and cursor resolution. */
+ if (opd != NULL) {
+ /*
+ * To support dirty reads we must reget the write lock
+ * if we have just stepped off a deleted record.
+ * Since the OPD cursor does not know anything
+ * about the referencing page or cursor we need
+ * to peek at the OPD cursor and get the lock here.
+ */
+ if (F_ISSET(dbp, DB_AM_READ_UNCOMMITTED) &&
+ F_ISSET((BTREE_CURSOR *)
+ dbc->internal->opd->internal, C_DELETED))
+ if ((t_ret =
+ dbc->am_writelock(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __dbc_cleanup(
+ dbc->internal->opd, opd, ret)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+
+ if (key_small) {
+ data->ulen = orig_ulen;
+ if (ret == 0)
+ ret = DB_BUFFER_SMALL;
+ }
+
+ if ((t_ret = __dbc_cleanup(dbc, dbc_n, ret)) != 0 &&
+ (ret == 0 || ret == DB_BUFFER_SMALL))
+ ret = t_ret;
+
+ if (flags == DB_CONSUME || flags == DB_CONSUME_WAIT)
+ CDB_LOCKING_DONE(env, dbc);
+ return (ret);
+}
+
+/* Internal flags shared by the dbc_put functions. */
+#define DBC_PUT_RMW 0x001
+#define DBC_PUT_NODEL 0x002
+#define DBC_PUT_HAVEREC 0x004
+
+/*
+ * __dbc_put_resolve_key --
+ * Get the current key and data so that we can correctly update the
+ * secondary and foreign databases.
+ */
+static inline int
+__dbc_put_resolve_key(dbc, oldkey, olddata, put_statep, flags)
+ DBC *dbc;
+ DBT *oldkey, *olddata;
+ u_int32_t flags, *put_statep;
+{
+ DB *dbp;
+ ENV *env;
+ int ret, rmw;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ rmw = FLD_ISSET(*put_statep, DBC_PUT_RMW) ? DB_RMW : 0;
+
+ DB_ASSERT(env, flags == DB_CURRENT);
+ COMPQUIET(flags, 0);
+
+ /*
+ * This is safe to do on the cursor we already have;
+ * error or no, it won't move.
+ *
+ * We use DB_RMW for all of these gets because we'll be
+ * writing soon enough in the "normal" put code. In
+ * transactional databases we'll hold those write locks
+ * even if we close the cursor we're reading with.
+ *
+ * The DB_KEYEMPTY return needs special handling -- if the
+ * cursor is on a deleted key, we return DB_NOTFOUND.
+ */
+ memset(oldkey, 0, sizeof(DBT));
+ if ((ret = __dbc_get(dbc, oldkey, olddata, rmw | DB_CURRENT)) != 0)
+ return (ret == DB_KEYEMPTY ? DB_NOTFOUND : ret);
+
+ /* Record that we've looked for the old record. */
+ FLD_SET(*put_statep, DBC_PUT_HAVEREC);
+ return (0);
+}
+
+/*
+ * __dbc_put_append --
+ * Handle an append to a primary.
+ */
+static inline int
+__dbc_put_append(dbc, key, data, put_statep, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags, *put_statep;
+{
+ DB *dbp;
+ ENV *env;
+ DBC *dbc_n;
+ DBT tdata;
+ int ret, t_ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ ret = 0;
+ dbc_n = NULL;
+
+ DB_ASSERT(env, flags == DB_APPEND);
+ COMPQUIET(flags, 0);
+
+ /*
+ * With DB_APPEND, we need to do the insert to populate the key value.
+ * So we swap the 'normal' order of updating secondary / verifying
+ * foreign databases and inserting.
+ *
+ * If there is an append callback, the value stored in data->data may
+ * be replaced and then freed. To avoid passing a freed pointer back
+ * to the user, just operate on a copy of the data DBT.
+ */
+ tdata = *data;
+
+ /*
+ * If this cursor is going to be closed immediately, we don't
+ * need to take precautions to clean it up on error.
+ */
+ if (F_ISSET(dbc, DBC_TRANSIENT))
+ dbc_n = dbc;
+ else if ((ret = __dbc_idup(dbc, &dbc_n, 0)) != 0)
+ goto err;
+
+ /*
+ * Append isn't a normal put operation; call the appropriate access
+ * method's append function.
+ */
+ switch (dbp->type) {
+ case DB_HEAP:
+ if ((ret = __heap_append(dbc_n, key, &tdata)) != 0)
+ goto err;
+ break;
+ case DB_QUEUE:
+ if ((ret = __qam_append(dbc_n, key, &tdata)) != 0)
+ goto err;
+ break;
+ case DB_RECNO:
+ if ((ret = __ram_append(dbc_n, key, &tdata)) != 0)
+ goto err;
+ break;
+ default:
+ /* The interface should prevent this. */
+ DB_ASSERT(env,
+ dbp->type == DB_QUEUE || dbp->type == DB_RECNO);
+
+ ret = __db_ferr(env, "DBC->put", 0);
+ goto err;
+ }
+
+ /*
+ * The append callback, if one exists, may have allocated a new
+ * tdata.data buffer. If so, free it.
+ */
+ FREE_IF_NEEDED(env, &tdata);
+
+ /*
+ * The key value may have been generated by the above operation, but
+ * not set in the data buffer. Make sure it is there so that secondary
+ * updates can complete.
+ */
+ __dbt_userfree(env, key, NULL, NULL);
+ if ((ret = __dbt_usercopy(env, key)) != 0)
+ goto err;
+
+ /* An append cannot be replacing an existing item. */
+ FLD_SET(*put_statep, DBC_PUT_NODEL);
+
+err: if (dbc_n != NULL &&
+ (t_ret = __dbc_cleanup(dbc, dbc_n, ret)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __dbc_put_partial --
+ * Ensure that the data item we are using is complete and correct.
+ * Otherwise we could break the secondary constraints.
+ */
+static inline int
+__dbc_put_partial(dbc, pkey, data, orig_data, out_data, put_statep, flags)
+ DBC *dbc;
+ DBT *pkey, *data, *orig_data, *out_data;
+ u_int32_t *put_statep, flags;
+{
+ DB *dbp;
+ DBC *pdbc;
+ ENV *env;
+ int ret, rmw, t_ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ ret = t_ret = 0;
+ rmw = FLD_ISSET(*put_statep, DBC_PUT_RMW) ? DB_RMW : 0;
+
+ if (!FLD_ISSET(*put_statep, DBC_PUT_HAVEREC) &&
+ !FLD_ISSET(*put_statep, DBC_PUT_NODEL)) {
+ /*
+ * We're going to have to search the tree for the
+ * specified key. Dup a cursor (so we have the same
+ * locking info) and do a c_get.
+ */
+ if ((ret = __dbc_idup(dbc, &pdbc, 0)) != 0)
+ return (ret);
+
+ /*
+ * When doing a put with DB_CURRENT, partial data items have
+ * already been resolved.
+ */
+ DB_ASSERT(env, flags != DB_CURRENT);
+
+ F_SET(pkey, DB_DBT_ISSET);
+ ret = __dbc_get(pdbc, pkey, orig_data, rmw | DB_SET);
+ if (ret == DB_KEYEMPTY || ret == DB_NOTFOUND) {
+ FLD_SET(*put_statep, DBC_PUT_NODEL);
+ ret = 0;
+ }
+ if ((t_ret = __dbc_close(pdbc)) != 0)
+ ret = t_ret;
+ if (ret != 0)
+ return (ret);
+
+ FLD_SET(*put_statep, DBC_PUT_HAVEREC);
+ }
+
+ COMPQUIET(flags, 0);
+
+ /*
+ * Now build the new datum from orig_data and the partial data
+ * we were given. It's okay to do this if no record was
+ * returned above: a partial put on an empty record is allowed,
+ * if a little strange. The data is zero-padded.
+ */
+ return (__db_buildpartial(dbp, orig_data, data, out_data));
+}
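+
+/*
+ * Worked example of the partial-put resolution above: if the stored
+ * record is "abcdef" and the caller's DBT has DB_DBT_PARTIAL set with
+ * doff == 2, dlen == 3 and two bytes of data "XY", __db_buildpartial
+ * yields "abXYf" -- the three bytes at offset 2 are replaced by the two
+ * supplied bytes. A partial put beyond the end of an empty record is
+ * zero-padded, as noted above.
+ */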
+
+/*
+ * __dbc_put_fixed_len --
+ * Handle padding for fixed-length records.
+ */
+static inline int
+__dbc_put_fixed_len(dbc, data, out_data)
+ DBC *dbc;
+ DBT *data, *out_data;
+{
+ DB *dbp;
+ ENV *env;
+ int re_pad, ret;
+ u_int32_t re_len, size;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ ret = 0;
+
+ /*
+ * Handle fixed-length records. If the primary database has
+ * fixed-length records, we need to pad out the datum before
+ * we pass it into the callback function; we always index the
+ * "real" record.
+ */
+ if (dbp->type == DB_QUEUE) {
+ re_len = ((QUEUE *)dbp->q_internal)->re_len;
+ re_pad = ((QUEUE *)dbp->q_internal)->re_pad;
+ } else {
+ re_len = ((BTREE *)dbp->bt_internal)->re_len;
+ re_pad = ((BTREE *)dbp->bt_internal)->re_pad;
+ }
+
+ size = data->size;
+ if (size > re_len) {
+ ret = __db_rec_toobig(env, size, re_len);
+ return (ret);
+ } else if (size < re_len) {
+ /*
+ * If we're not doing a partial put, copy data->data into
+ * out_data->data, then pad out out_data->data. This overrides
+ * the assignment made above, which is used in the more common
+ * case when padding is not needed.
+ *
+ * If we're doing a partial put, the data we want are already
+ * in out_data.data; we just need to pad.
+ */
+ if (F_ISSET(data, DB_DBT_PARTIAL)) {
+ if ((ret = __os_realloc(
+ env, re_len, &out_data->data)) != 0)
+ return (ret);
+ /*
+ * In the partial case, we have built the item into
+ * out_data already using __db_buildpartial. Just need
+ * to pad from the end of out_data, not from data->size.
+ */
+ size = out_data->size;
+ } else {
+ if ((ret = __os_malloc(
+ env, re_len, &out_data->data)) != 0)
+ return (ret);
+ memcpy(out_data->data, data->data, size);
+ }
+ memset((u_int8_t *)out_data->data + size, re_pad,
+ re_len - size);
+ out_data->size = re_len;
+ }
+
+ return (ret);
+}
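+
+/*
+ * Worked example of the padding above: with re_len == 8 and
+ * re_pad == ' ', a 3-byte datum "abc" is stored as "abc     " -- three
+ * data bytes followed by five pad bytes. A datum larger than re_len
+ * fails with __db_rec_toobig.
+ */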
+
+/*
+ * __dbc_put_secondaries --
+ * Insert the secondary keys, and validate the foreign key constraints.
+ */
+static inline int
+__dbc_put_secondaries(dbc,
+ pkey, data, orig_data, s_count, s_keys_buf, put_statep)
+ DBC *dbc;
+ DBT *pkey, *data, *orig_data, *s_keys_buf;
+ int s_count;
+ u_int32_t *put_statep;
+{
+ DB *dbp, *sdbp;
+ DBC *fdbc, *sdbc;
+ DBT fdata, oldpkey, *skeyp, temppkey, tempskey, *tskeyp;
+ ENV *env;
+ int cmp, ret, rmw, t_ret;
+ u_int32_t nskey;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ fdbc = sdbc = NULL;
+ sdbp = NULL;
+ t_ret = 0;
+ rmw = FLD_ISSET(*put_statep, DBC_PUT_RMW) ? DB_RMW : 0;
+
+ /*
+ * Loop through the secondaries. (Step 3.)
+ *
+ * Note that __db_s_first and __db_s_next will take care of
+ * thread-locking and refcounting issues.
+ */
+ for (ret = __db_s_first(dbp, &sdbp), skeyp = s_keys_buf;
+ sdbp != NULL && ret == 0;
+ ret = __db_s_next(&sdbp, dbc->txn), ++skeyp) {
+ DB_ASSERT(env, skeyp - s_keys_buf < s_count);
+ /*
+ * Don't process this secondary if the key is immutable and we
+ * know that the old record exists. This optimization can't be
+ * used if we have not checked for the old record yet.
+ */
+ if (FLD_ISSET(*put_statep, DBC_PUT_HAVEREC) &&
+ !FLD_ISSET(*put_statep, DBC_PUT_NODEL) &&
+ FLD_ISSET(sdbp->s_assoc_flags, DB_ASSOC_IMMUTABLE_KEY))
+ continue;
+
+ /*
+ * Call the callback for this secondary, to get the
+ * appropriate secondary key.
+ */
+ if ((ret = sdbp->s_callback(sdbp,
+ pkey, data, skeyp)) != 0) {
+ /* Not indexing is equivalent to an empty key set. */
+ if (ret == DB_DONOTINDEX) {
+ F_SET(skeyp, DB_DBT_MULTIPLE);
+ skeyp->size = 0;
+ ret = 0;
+ } else
+ goto err;
+ }
+
+ if (sdbp->s_foreign != NULL &&
+ (ret = __db_cursor_int(sdbp->s_foreign,
+ dbc->thread_info, dbc->txn, sdbp->s_foreign->type,
+ PGNO_INVALID, 0, dbc->locker, &fdbc)) != 0)
+ goto err;
+
+ /*
+ * Mark the secondary key DBT(s) as set -- that is, the
+ * callback returned at least one secondary key.
+ *
+ * Also, if this secondary index is associated with a foreign
+ * database, check that the foreign db contains the key(s) to
+ * maintain referential integrity. Set flags in fdata to avoid
+ * mem copying, we just need to know existence. We need to do
+	 * mem copying; we just need to know existence. We need to do
+ * will overwrite the flag values.
+ */
+ if (F_ISSET(skeyp, DB_DBT_MULTIPLE)) {
+#ifdef DIAGNOSTIC
+ __db_check_skeyset(sdbp, skeyp);
+#endif
+ for (tskeyp = (DBT *)skeyp->data, nskey = skeyp->size;
+ nskey > 0; nskey--, tskeyp++) {
+ if (fdbc != NULL) {
+ memset(&fdata, 0, sizeof(DBT));
+ F_SET(&fdata,
+ DB_DBT_PARTIAL | DB_DBT_USERMEM);
+ if ((ret = __dbc_get(
+ fdbc, tskeyp, &fdata,
+ DB_SET | rmw)) == DB_NOTFOUND ||
+ ret == DB_KEYEMPTY) {
+ ret = DB_FOREIGN_CONFLICT;
+ break;
+ }
+ }
+ F_SET(tskeyp, DB_DBT_ISSET);
+ }
+ tskeyp = (DBT *)skeyp->data;
+ nskey = skeyp->size;
+ } else {
+ if (fdbc != NULL) {
+ memset(&fdata, 0, sizeof(DBT));
+ F_SET(&fdata, DB_DBT_PARTIAL | DB_DBT_USERMEM);
+ if ((ret = __dbc_get(fdbc, skeyp, &fdata,
+ DB_SET | rmw)) == DB_NOTFOUND ||
+ ret == DB_KEYEMPTY)
+ ret = DB_FOREIGN_CONFLICT;
+ }
+ F_SET(skeyp, DB_DBT_ISSET);
+ tskeyp = skeyp;
+ nskey = 1;
+ }
+ if (fdbc != NULL && (t_ret = __dbc_close(fdbc)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ fdbc = NULL;
+ if (ret != 0)
+ goto err;
+
+ /*
+ * If we have the old record, we can generate and remove any
+ * old secondary key(s) now. We can also skip the secondary
+ * put if there is no change.
+ */
+ if (FLD_ISSET(*put_statep, DBC_PUT_HAVEREC)) {
+ if ((ret = __dbc_del_oldskey(sdbp, dbc,
+ skeyp, pkey, orig_data)) == DB_KEYEXIST)
+ continue;
+ else if (ret != 0)
+ goto err;
+ }
+ if (nskey == 0)
+ continue;
+
+ /*
+ * Open a cursor in this secondary.
+ *
+ * Use the same locker ID as our primary cursor, so that
+ * we're guaranteed that the locks don't conflict (e.g. in CDB
+ * or if we're subdatabases that share and want to lock a
+ * metadata page).
+ */
+ if ((ret = __db_cursor_int(sdbp, dbc->thread_info, dbc->txn,
+ sdbp->type, PGNO_INVALID, 0, dbc->locker, &sdbc)) != 0)
+ goto err;
+
+ /*
+ * If we're in CDB, updates will fail since the new cursor
+ * isn't a writer. However, we hold the WRITE lock in the
+ * primary and will for as long as our new cursor lasts,
+ * and the primary and secondary share a lock file ID,
+ * so it's safe to consider this a WRITER. The close
+ * routine won't try to put anything because we don't
+ * really have a lock.
+ */
+ if (CDB_LOCKING(env)) {
+ DB_ASSERT(env, sdbc->mylock.off == LOCK_INVALID);
+ F_SET(sdbc, DBC_WRITER);
+ }
+
+ /*
+ * Swap the primary key to the byte order of this secondary, if
+ * necessary. By doing this now, we can compare directly
+ * against the data already in the secondary without having to
+ * swap it after reading.
+ */
+ SWAP_IF_NEEDED(sdbp, pkey);
+
+ for (; nskey > 0 && ret == 0; nskey--, tskeyp++) {
+ /* Skip this key if it is already in the database. */
+ if (!F_ISSET(tskeyp, DB_DBT_ISSET))
+ continue;
+
+ /*
+ * There are three cases here--
+ * 1) The secondary supports sorted duplicates.
+ * If we attempt to put a secondary/primary pair
+ * that already exists, that's a duplicate
+ * duplicate, and c_put will return DB_KEYEXIST
+ * (see __db_duperr). This will leave us with
+ * exactly one copy of the secondary/primary pair,
+ * and this is just right--we'll avoid deleting it
+ * later, as the old and new secondaries will
+ * match (since the old secondary is the dup dup
+ * that's already there).
+ * 2) The secondary supports duplicates, but they're not
+ * sorted. We need to avoid putting a duplicate
+ * duplicate, because the matching old and new
+ * secondaries will prevent us from deleting
+ * anything and we'll wind up with two secondary
+ * records that point to the same primary key. Do
+ * a c_get(DB_GET_BOTH); only do the put if the
+ * secondary doesn't exist.
+ * 3) The secondary doesn't support duplicates at all.
+ * In this case, secondary keys must be unique;
+ * if another primary key already exists for this
+ * secondary key, we have to either overwrite it
+ * or not put this one, and in either case we've
+ * corrupted the secondary index. Do a
+ * c_get(DB_SET). If the secondary/primary pair
+ * already exists, do nothing; if the secondary
+ * exists with a different primary, return an
+ * error; and if the secondary does not exist,
+ * put it.
+ */
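+			/*
+			 * (Editor's note) The same three cases in tabular
+			 * form:
+			 *
+			 *	DUP	DUPSORT	pre-put check
+			 *	no	-	c_get(DB_SET)		case 3
+			 *	yes	no	c_get(DB_GET_BOTH)	case 2
+			 *	yes	yes	none (KEYEXIST benign)	case 1
+			 */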
+ if (!F_ISSET(sdbp, DB_AM_DUP)) {
+ /* Case 3. */
+ memset(&oldpkey, 0, sizeof(DBT));
+ F_SET(&oldpkey, DB_DBT_MALLOC);
+ ret = __dbc_get(sdbc,
+ tskeyp, &oldpkey, rmw | DB_SET);
+ if (ret == 0) {
+ cmp = __bam_defcmp(sdbp,
+ &oldpkey, pkey);
+ __os_ufree(env, oldpkey.data);
+ /*
+ * If the secondary key is unchanged,
+ * skip the put and go on to the next
+ * one.
+ */
+ if (cmp == 0)
+ continue;
+
+ __db_errx(env, DB_STR("0695",
+ "Put results in a non-unique secondary key in an "
+ "index not configured to support duplicates"));
+ ret = EINVAL;
+ }
+ if (ret != DB_NOTFOUND && ret != DB_KEYEMPTY)
+ break;
+ } else if (!F_ISSET(sdbp, DB_AM_DUPSORT)) {
+ /* Case 2. */
+ DB_INIT_DBT(tempskey,
+ tskeyp->data, tskeyp->size);
+ DB_INIT_DBT(temppkey,
+ pkey->data, pkey->size);
+ ret = __dbc_get(sdbc, &tempskey, &temppkey,
+ rmw | DB_GET_BOTH);
+ if (ret != DB_NOTFOUND && ret != DB_KEYEMPTY)
+ break;
+ }
+
+ ret = __dbc_put(sdbc, tskeyp, pkey,
+ DB_UPDATE_SECONDARY);
+
+ /*
+ * We don't know yet whether this was a put-overwrite
+ * that in fact changed nothing. If it was, we may get
+ * DB_KEYEXIST. This is not an error.
+ */
+ if (ret == DB_KEYEXIST)
+ ret = 0;
+ }
+
+ /* Make sure the primary key is back in native byte-order. */
+ SWAP_IF_NEEDED(sdbp, pkey);
+
+ if ((t_ret = __dbc_close(sdbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (ret != 0)
+ goto err;
+
+ /*
+ * Mark that we have a key for this secondary so we can check
+ * it later before deleting the old one. We can't set it
+ * earlier or it would be cleared in the calls above.
+ */
+ F_SET(skeyp, DB_DBT_ISSET);
+ }
+err: if (sdbp != NULL &&
+ (t_ret = __db_s_done(sdbp, dbc->txn)) != 0 && ret == 0)
+ ret = t_ret;
+ COMPQUIET(s_count, 0);
+ return (ret);
+}
+
+static int
+__dbc_put_primary(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DB *dbp, *sdbp;
+ DBC *dbc_n, *pdbc;
+ DBT oldkey, olddata, newdata;
+ DBT *all_skeys, *skeyp, *tskeyp;
+ ENV *env;
+ int ret, t_ret, s_count;
+ u_int32_t nskey, put_state, rmw;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ t_ret = 0;
+ put_state = 0;
+ sdbp = NULL;
+ pdbc = dbc_n = NULL;
+ all_skeys = NULL;
+ memset(&newdata, 0, sizeof(DBT));
+ memset(&olddata, 0, sizeof(DBT));
+
+ /*
+ * We do multiple cursor operations in some cases and subsequently
+ * access the data DBT information. Set DB_DBT_MALLOC so we don't risk
+ * modification of the data between our uses of it.
+ */
+ F_SET(&olddata, DB_DBT_MALLOC);
+
+ /*
+ * We have at least one secondary which we may need to update.
+ *
+ * There is a rather vile locking issue here. Secondary gets
+ * will always involve acquiring a read lock in the secondary,
+ * then acquiring a read lock in the primary. Ideally, we
+ * would likewise perform puts by updating all the secondaries
+ * first, then doing the actual put in the primary, to avoid
+ * deadlock (since having multiple threads doing secondary
+ * gets and puts simultaneously is probably a common case).
+ *
+ * However, if this put is a put-overwrite--and we have no way to
+ * tell in advance whether it will be--we may need to delete
+ * an outdated secondary key. In order to find that old
+ * secondary key, we need to get the record we're overwriting,
+ * before we overwrite it.
+ *
+ * (XXX: It would be nice to avoid this extra get, and have the
+ * underlying put routines somehow pass us the old record
+ * since they need to traverse the tree anyway. I'm saving
+ * this optimization for later, as it's a lot of work, and it
+ * would be hard to fit into this locking paradigm anyway.)
+ *
+ * The simple thing to do would be to go get the old record before
+ * we do anything else. Unfortunately, though, doing so would
+ * violate our "secondary, then primary" lock acquisition
+ * ordering--even in the common case where no old primary record
+ * exists, we'll still acquire and keep a lock on the page where
+ * we're about to do the primary insert.
+ *
+ * To get around this, we do the following gyrations, which
+ * hopefully solve this problem in the common case:
+ *
+ * 1) If this is a c_put(DB_CURRENT), go ahead and get the
+ * old record. We already hold the lock on this page in
+ * the primary, so no harm done, and we'll need the primary
+ * key (which we weren't passed in this case) to do any
+ * secondary puts anyway.
+	 *    If this is a put(DB_APPEND), then we need to insert the item
+	 *    first, so that we know the key value.  (A put(DB_APPEND) on a
+	 *    database without secondaries is instead handled in the __db_put
+	 *    method as an optimization.)
+ *
+ * 2) If we're doing a partial put, we need to perform the
+ * get on the primary key right away, since we don't have
+ * the whole datum that the secondary key is based on.
+ * We may also need to pad out the record if the primary
+ * has a fixed record length.
+ *
+ * 3) Loop through the secondary indices, putting into each a
+ * new secondary key that corresponds to the new record.
+ *
+ * 4) If we haven't done so in (1) or (2), get the old primary
+ * key/data pair. If one does not exist--the common case--we're
+ * done with secondary indices, and can go straight on to the
+ * primary put.
+ *
+ * 5) If we do have an old primary key/data pair, however, we need
+ * to loop through all the secondaries a second time and delete
+ * the old secondary in each.
+ */
+ s_count = __db_s_count(dbp);
+ if ((ret = __os_calloc(env,
+ (u_int)s_count, sizeof(DBT), &all_skeys)) != 0)
+ goto err;
+
+ /*
+ * Primary indices can't have duplicates, so only DB_APPEND,
+ * DB_CURRENT, DB_KEYFIRST, and DB_KEYLAST make any sense. Other flags
+ * should have been caught by the checking routine, but
+ * add a sprinkling of paranoia.
+ */
+ DB_ASSERT(env, flags == DB_APPEND || flags == DB_CURRENT ||
+ flags == DB_KEYFIRST || flags == DB_KEYLAST ||
+ flags == DB_NOOVERWRITE || flags == DB_OVERWRITE_DUP);
+
+ /*
+ * We'll want to use DB_RMW in a few places, but it's only legal
+ * when locking is on.
+ */
+ rmw = STD_LOCKING(dbc) ? DB_RMW : 0;
+ if (rmw)
+ FLD_SET(put_state, DBC_PUT_RMW);
+
+ /* Resolve the primary key if required (Step 1). */
+ if (flags == DB_CURRENT) {
+ if ((ret = __dbc_put_resolve_key(dbc,
+ &oldkey, &olddata, &put_state, flags)) != 0)
+ goto err;
+ key = &oldkey;
+ } else if (flags == DB_APPEND) {
+ if ((ret = __dbc_put_append(dbc,
+ key, data, &put_state, flags)) != 0)
+ goto err;
+ }
+
+ /*
+ * PUT_NOOVERWRITE with secondaries is a troublesome case. We need
+ * to check that the insert will work prior to making any changes
+ * to secondaries. Try to work within the locking constraints outlined
+ * above.
+ *
+ * This is DB->put (DB_NOOVERWRITE). DBC->put(DB_NODUPDATA) is not
+ * relevant since it is only valid on DBs that support duplicates,
+ * which primaries with secondaries can't have.
+ */
+ if (flags == DB_NOOVERWRITE) {
+ /* Don't bother retrieving the data. */
+ F_SET(key, DB_DBT_ISSET);
+ olddata.dlen = 0;
+ olddata.flags = DB_DBT_PARTIAL | DB_DBT_USERMEM;
+ ret = __dbc_get(dbc, key, &olddata, DB_SET);
+ if (ret == 0) {
+ ret = DB_KEYEXIST;
+ goto done;
+ } else if (ret != DB_NOTFOUND && ret != DB_KEYEMPTY)
+ goto err;
+ }
+
+ /*
+ * Check for partial puts using DB_DBT_PARTIAL (Step 2).
+ */
+ if (F_ISSET(data, DB_DBT_PARTIAL)) {
+ if ((ret = __dbc_put_partial(dbc,
+ key, data, &olddata, &newdata, &put_state, flags)) != 0)
+ goto err;
+ } else {
+ newdata = *data;
+ }
+
+ /*
+ * Check for partial puts, with fixed length record databases (Step 2).
+ */
+ if ((dbp->type == DB_RECNO && F_ISSET(dbp, DB_AM_FIXEDLEN)) ||
+ (dbp->type == DB_QUEUE)) {
+ if ((ret = __dbc_put_fixed_len(dbc, data, &newdata)) != 0)
+ goto err;
+ }
+
+	/* Validate any foreign databases and update secondaries (Step 3). */
+	if ((ret = __dbc_put_secondaries(dbc, key, &newdata,
+	    &olddata, s_count, all_skeys, &put_state)) != 0)
+		goto err;
+ /*
+ * If we've already got the old primary key/data pair, the secondary
+ * updates are already done.
+ */
+ if (FLD_ISSET(put_state, DBC_PUT_HAVEREC))
+ goto done;
+
+ /*
+ * If still necessary, go get the old primary key/data. (Step 4.)
+ *
+	 * See the comments in step 2; this get closely mirrors that one.
+ */
+ if ((ret = __dbc_idup(dbc, &pdbc, 0)) != 0)
+ goto err;
+ DB_ASSERT(env, flags != DB_CURRENT);
+ F_SET(key, DB_DBT_ISSET);
+ ret = __dbc_get(pdbc, key, &olddata, rmw | DB_SET);
+ if (ret == DB_KEYEMPTY || ret == DB_NOTFOUND) {
+ FLD_SET(put_state, DBC_PUT_NODEL);
+ ret = 0;
+ }
+ if ((t_ret = __dbc_close(pdbc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err;
+
+ /*
+ * Check whether we do in fact have an old record we may need to
+ * delete. (Step 5).
+ */
+ if (FLD_ISSET(put_state, DBC_PUT_NODEL))
+ goto done;
+
+ for (ret = __db_s_first(dbp, &sdbp), skeyp = all_skeys;
+ sdbp != NULL && ret == 0;
+ ret = __db_s_next(&sdbp, dbc->txn), skeyp++) {
+ DB_ASSERT(env, skeyp - all_skeys < s_count);
+ /*
+ * Don't process this secondary if the key is immutable. We
+ * know that the old record exists, so this optimization can
+ * always be used.
+ */
+ if (FLD_ISSET(sdbp->s_assoc_flags, DB_ASSOC_IMMUTABLE_KEY))
+ continue;
+
+ if ((ret = __dbc_del_oldskey(sdbp, dbc,
+ skeyp, key, &olddata)) != 0 && ret != DB_KEYEXIST)
+ goto err;
+ }
+ if (ret != 0)
+ goto err;
+
+done:
+err:
+ if ((t_ret = __dbc_cleanup(dbc, dbc_n, ret)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* If newdata or olddata were used, free their buffers. */
+ if (newdata.data != NULL && newdata.data != data->data)
+ __os_free(env, newdata.data);
+ if (olddata.data != NULL)
+ __os_ufree(env, olddata.data);
+
+ CDB_LOCKING_DONE(env, dbc);
+
+ if (sdbp != NULL &&
+ (t_ret = __db_s_done(sdbp, dbc->txn)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (all_skeys != NULL) {
+ for (skeyp = all_skeys; skeyp - all_skeys < s_count; skeyp++) {
+ if (F_ISSET(skeyp, DB_DBT_MULTIPLE)) {
+ for (nskey = skeyp->size,
+ tskeyp = (DBT *)skeyp->data;
+ nskey > 0;
+ nskey--, tskeyp++)
+ FREE_IF_NEEDED(env, tskeyp);
+ }
+ FREE_IF_NEEDED(env, skeyp);
+ }
+ __os_free(env, all_skeys);
+ }
+ return (ret);
+}
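+
+#ifdef EXAMPLE_CODE_ONLY	/* Editor's sketch, hypothetical; never compiled. */
+/*
+ * The update path above only runs for a primary with associated
+ * secondaries.  A minimal association that would drive it might look like
+ * this; the record layout and all example_* names are invented for
+ * illustration.
+ */
+struct example_rec {
+	u_int32_t id;			/* Primary key. */
+	char last_name[64];		/* Secondary key. */
+};
+
+static int
+example_getname(sdbp, pkey, pdata, skey)
+	DB *sdbp;
+	const DBT *pkey, *pdata;
+	DBT *skey;
+{
+	struct example_rec *rec;
+
+	COMPQUIET(sdbp, NULL);
+	COMPQUIET(pkey, NULL);
+	rec = pdata->data;
+	memset(skey, 0, sizeof(DBT));
+	skey->data = rec->last_name;
+	skey->size = (u_int32_t)strlen(rec->last_name) + 1;
+	return (0);
+}
+
+/*
+ * An application would then call
+ *	primary->associate(primary, txn, secondary, example_getname, 0);
+ * after which every put through the primary maintains the secondary via
+ * __dbc_put_secondaries and __dbc_put_primary above.
+ */
+#endif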
+
+/*
+ * __dbc_put --
+ * Put using a cursor.
+ *
+ * PUBLIC: int __dbc_put __P((DBC *, DBT *, DBT *, u_int32_t));
+ */
+int
+__dbc_put(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DB *dbp;
+ int ret;
+
+ dbp = dbc->dbp;
+ ret = 0;
+ F_CLR(dbc, DBC_ERROR);
+
+ /*
+ * Putting to secondary indices is forbidden; when we need to
+ * internally update one, we're called with a private flag,
+ * DB_UPDATE_SECONDARY, which does the right thing but won't return an
+ * error during flag checking.
+ *
+ * As a convenience, many places that want the default DB_KEYLAST
+ * behavior call DBC->put with flags == 0. Protect lower-level code
+ * here by translating that.
+ *
+ * Lastly, the DB_OVERWRITE_DUP flag is equivalent to DB_KEYLAST unless
+ * there are sorted duplicates. Limit the number of places that need
+ * to test for it explicitly.
+ */
+ if (flags == DB_UPDATE_SECONDARY || flags == 0 ||
+ (flags == DB_OVERWRITE_DUP && !F_ISSET(dbp, DB_AM_DUPSORT)))
+ flags = DB_KEYLAST;
+
+ CDB_LOCKING_INIT(dbc->env, dbc);
+
+ PERFMON6(env, db, put, dbp->fname, dbp->dname,
+ dbc->txn == NULL ? 0 : dbc->txn->txnid, key, data, flags);
+ /*
+ * Check to see if we are a primary and have secondary indices.
+ * If we are not, we save ourselves a good bit of trouble and
+ * just skip to the "normal" put.
+ */
+ if (DB_IS_PRIMARY(dbp) &&
+ ((ret = __dbc_put_primary(dbc, key, data, flags)) != 0))
+ return (ret);
+
+ /*
+ * If this is an append operation, the insert was done prior to the
+ * secondary updates, so we are finished.
+ */
+ if (flags == DB_APPEND)
+ return (ret);
+
+#ifdef HAVE_COMPRESSION
+ if (DB_IS_COMPRESSED(dbp))
+ return (__bamc_compress_put(dbc, key, data, flags));
+#endif
+
+ return (__dbc_iput(dbc, key, data, flags));
+}
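+
+#ifdef EXAMPLE_CODE_ONLY	/* Editor's sketch, hypothetical; never compiled. */
+/*
+ * An ordinary application cursor put that exercises the routine above:
+ * with secondaries associated it is routed to __dbc_put_primary, otherwise
+ * it falls through to __dbc_iput below.
+ */
+static int
+example_cursor_put(dbc, key, data)
+	DBC *dbc;
+	DBT *key, *data;
+{
+	return (dbc->put(dbc, key, data, DB_KEYFIRST));
+}
+#endif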
+
+/*
+ * __dbc_iput --
+ * Implementation of put using a cursor.
+ *
+ * PUBLIC: int __dbc_iput __P((DBC *, DBT *, DBT *, u_int32_t));
+ */
+int
+__dbc_iput(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DBC *dbc_n, *oldopd, *opd;
+ db_pgno_t pgno;
+ int ret, t_ret;
+ u_int32_t tmp_flags;
+
+ /*
+ * Cursor Cleanup Note:
+ * All of the cursors passed to the underlying access methods by this
+ * routine are duplicated cursors. On return, any referenced pages
+ * will be discarded, and, if the cursor is not intended to be used
+ * again, the close function will be called. So, pages/locks that
+ * the cursor references do not need to be resolved by the underlying
+ * functions.
+ */
+ dbc_n = NULL;
+ ret = t_ret = 0;
+
+ /*
+ * If we have an off-page duplicates cursor, and the operation applies
+ * to it, perform the operation. Duplicate the cursor and call the
+ * underlying function.
+ *
+ * Off-page duplicate trees are locked in the primary tree, that is,
+ * we acquire a write lock in the primary tree and no locks in the
+ * off-page dup tree. If the put operation is done in an off-page
+ * duplicate tree, call the primary cursor's upgrade routine first.
+ */
+ if (dbc->internal->opd != NULL &&
+ (flags == DB_AFTER || flags == DB_BEFORE || flags == DB_CURRENT)) {
+ /*
+ * A special case for hash off-page duplicates. Hash doesn't
+ * support (and is documented not to support) put operations
+ * relative to a cursor which references an already deleted
+ * item. For consistency, apply the same criteria to off-page
+ * duplicates as well.
+ */
+ if (dbc->dbtype == DB_HASH && F_ISSET(
+ ((BTREE_CURSOR *)(dbc->internal->opd->internal)),
+ C_DELETED)) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+
+ if ((ret = dbc->am_writelock(dbc)) != 0 ||
+ (ret = __dbc_dup(dbc, &dbc_n, DB_POSITION)) != 0)
+ goto err;
+ opd = dbc_n->internal->opd;
+ if ((ret = opd->am_put(
+ opd, key, data, flags, NULL)) != 0)
+ goto err;
+ goto done;
+ }
+
+ /*
+ * Perform an operation on the main cursor. Duplicate the cursor,
+ * and call the underlying function.
+ */
+ if (flags == DB_AFTER || flags == DB_BEFORE || flags == DB_CURRENT)
+ tmp_flags = DB_POSITION;
+ else
+ tmp_flags = 0;
+
+ /*
+ * If this cursor is going to be closed immediately, we don't
+ * need to take precautions to clean it up on error.
+ */
+ if (F_ISSET(dbc, DBC_TRANSIENT | DBC_PARTITIONED))
+ dbc_n = dbc;
+ else if ((ret = __dbc_idup(dbc, &dbc_n, tmp_flags)) != 0)
+ goto err;
+
+ pgno = PGNO_INVALID;
+ if ((ret = dbc_n->am_put(dbc_n, key, data, flags, &pgno)) != 0)
+ goto err;
+
+ /*
+ * We may be referencing a new off-page duplicates tree. Acquire
+ * a new cursor and call the underlying function.
+ */
+ if (pgno != PGNO_INVALID) {
+ oldopd = dbc_n->internal->opd;
+ if ((ret = __dbc_newopd(dbc, pgno, oldopd, &opd)) != 0) {
+ dbc_n->internal->opd = opd;
+ goto err;
+ }
+
+ dbc_n->internal->opd = opd;
+ opd->internal->pdbc = dbc_n;
+
+ if (flags == DB_NOOVERWRITE)
+ flags = DB_KEYLAST;
+ if ((ret = opd->am_put(
+ opd, key, data, flags, NULL)) != 0)
+ goto err;
+ }
+
+done:
+err: /* Cleanup and cursor resolution. */
+ if (dbc_n != NULL && !DB_RETOK_DBCPUT(ret))
+ F_SET(dbc_n, DBC_ERROR);
+ if ((t_ret = __dbc_cleanup(dbc, dbc_n, ret)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __dbc_del_oldskey --
+ * Delete an old secondary key, if necessary.
+ *	Returns DB_KEYEXIST if the new and old keys match.
+ */
+static int
+__dbc_del_oldskey(sdbp, dbc, skey, pkey, olddata)
+ DB *sdbp;
+ DBC *dbc;
+ DBT *skey, *pkey, *olddata;
+{
+ DB *dbp;
+ DBC *sdbc;
+ DBT *toldskeyp, *tskeyp;
+ DBT oldskey, temppkey, tempskey;
+ ENV *env;
+ int ret, t_ret;
+ u_int32_t i, noldskey, nsame, nskey, rmw;
+
+ sdbc = NULL;
+ dbp = sdbp->s_primary;
+ env = dbp->env;
+ nsame = 0;
+ rmw = STD_LOCKING(dbc) ? DB_RMW : 0;
+
+ /*
+ * Get the old secondary key.
+ */
+ memset(&oldskey, 0, sizeof(DBT));
+ if ((ret = sdbp->s_callback(sdbp, pkey, olddata, &oldskey)) != 0) {
+ if (ret == DB_DONOTINDEX ||
+ (F_ISSET(&oldskey, DB_DBT_MULTIPLE) && oldskey.size == 0))
+ /* There's no old key to delete. */
+ ret = 0;
+ return (ret);
+ }
+
+ if (F_ISSET(&oldskey, DB_DBT_MULTIPLE)) {
+#ifdef DIAGNOSTIC
+ __db_check_skeyset(sdbp, &oldskey);
+#endif
+ toldskeyp = (DBT *)oldskey.data;
+ noldskey = oldskey.size;
+ } else {
+ toldskeyp = &oldskey;
+ noldskey = 1;
+ }
+
+ if (F_ISSET(skey, DB_DBT_MULTIPLE)) {
+ nskey = skey->size;
+ skey = (DBT *)skey->data;
+ } else
+ nskey = F_ISSET(skey, DB_DBT_ISSET) ? 1 : 0;
+
+ for (; noldskey > 0 && ret == 0; noldskey--, toldskeyp++) {
+ /*
+ * Check whether this old secondary key is also a new key
+ * before we delete it. Note that bt_compare is (and must be)
+ * set no matter what access method we're in.
+ */
+ for (i = 0, tskeyp = skey; i < nskey; i++, tskeyp++)
+ if (((BTREE *)sdbp->bt_internal)->bt_compare(sdbp,
+ toldskeyp, tskeyp) == 0) {
+ nsame++;
+ F_CLR(tskeyp, DB_DBT_ISSET);
+ break;
+ }
+
+ if (i < nskey) {
+ FREE_IF_NEEDED(env, toldskeyp);
+ continue;
+ }
+
+ if (sdbc == NULL) {
+ if ((ret = __db_cursor_int(sdbp,
+ dbc->thread_info, dbc->txn, sdbp->type,
+ PGNO_INVALID, 0, dbc->locker, &sdbc)) != 0)
+ goto err;
+ if (CDB_LOCKING(env)) {
+ DB_ASSERT(env,
+ sdbc->mylock.off == LOCK_INVALID);
+ F_SET(sdbc, DBC_WRITER);
+ }
+ }
+
+ /*
+ * Don't let c_get(DB_GET_BOTH) stomp on our data. Use
+ * temporary DBTs instead.
+ */
+ SWAP_IF_NEEDED(sdbp, pkey);
+ DB_INIT_DBT(temppkey, pkey->data, pkey->size);
+ DB_INIT_DBT(tempskey, toldskeyp->data, toldskeyp->size);
+ if ((ret = __dbc_get(sdbc,
+ &tempskey, &temppkey, rmw | DB_GET_BOTH)) == 0)
+ ret = __dbc_del(sdbc, DB_UPDATE_SECONDARY);
+ else if (ret == DB_NOTFOUND)
+ ret = __db_secondary_corrupt(dbp);
+ SWAP_IF_NEEDED(sdbp, pkey);
+ FREE_IF_NEEDED(env, toldskeyp);
+ }
+
+err: for (; noldskey > 0; noldskey--, toldskeyp++)
+ FREE_IF_NEEDED(env, toldskeyp);
+ FREE_IF_NEEDED(env, &oldskey);
+ if (sdbc != NULL && (t_ret = __dbc_close(sdbc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret == 0 && nsame == nskey)
+ return (DB_KEYEXIST);
+ return (ret);
+}
+
+/*
+ * __db_duperr()
+ * Error message: we don't currently support sorted duplicate duplicates.
+ * PUBLIC: int __db_duperr __P((DB *, u_int32_t));
+ */
+int
+__db_duperr(dbp, flags)
+ DB *dbp;
+ u_int32_t flags;
+{
+ /*
+ * If we run into this error while updating a secondary index,
+ * don't yell--there's no clean way to pass DB_NODUPDATA in along
+ * with DB_UPDATE_SECONDARY, but we may run into this problem
+ * in a normal, non-error course of events.
+ *
+ * !!!
+ * If and when we ever permit duplicate duplicates in sorted-dup
+ * databases, we need to either change the secondary index code
+ * to check for dup dups, or we need to maintain the implicit
+ * "DB_NODUPDATA" behavior for databases with DB_AM_SECONDARY set.
+ */
+ if (flags != DB_NODUPDATA && !F_ISSET(dbp, DB_AM_SECONDARY))
+ __db_errx(dbp->env, DB_STR("0696",
+ "Duplicate data items are not supported with sorted data"));
+ return (DB_KEYEXIST);
+}
+
+/*
+ * __dbc_cleanup --
+ * Clean up duplicate cursors.
+ *
+ * PUBLIC: int __dbc_cleanup __P((DBC *, DBC *, int));
+ */
+int
+__dbc_cleanup(dbc, dbc_n, failed)
+ DBC *dbc, *dbc_n;
+ int failed;
+{
+ DB *dbp;
+ DBC *opd;
+ DBC_INTERNAL *internal;
+ DB_MPOOLFILE *mpf;
+ int ret, t_ret;
+
+ if (F_ISSET(dbc, DBC_OPD))
+ LOCK_CHECK_OFF(dbc->thread_info);
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ internal = dbc->internal;
+ ret = 0;
+
+ /* Discard any pages we're holding. */
+ if (internal->page != NULL) {
+ if ((t_ret = __memp_fput(mpf, dbc->thread_info,
+ internal->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ internal->page = NULL;
+ }
+ opd = internal->opd;
+ if (opd != NULL && opd->internal->page != NULL) {
+ if ((t_ret = __memp_fput(mpf, dbc->thread_info,
+ opd->internal->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ opd->internal->page = NULL;
+ }
+
+ /*
+ * If dbc_n is NULL, there's no internal cursor swapping to be done
+ * and no dbc_n to close--we probably did the entire operation on an
+ * offpage duplicate cursor. Just return.
+ *
+ * If dbc and dbc_n are the same, we're either inside a DB->{put/get}
+ * operation, and as an optimization we performed the operation on
+ * the main cursor rather than on a duplicated one, or we're in a
+ * bulk get that can't have moved the cursor (DB_MULTIPLE with the
+ * initial c_get operation on an off-page dup cursor). Just
+ * return--either we know we didn't move the cursor, or we're going
+ * to close it before we return to application code, so we're sure
+ * not to visibly violate the "cursor stays put on error" rule.
+ */
+ if (dbc_n == NULL || dbc == dbc_n)
+ goto done;
+
+ if (dbc_n->internal->page != NULL) {
+ if ((t_ret = __memp_fput(mpf, dbc->thread_info,
+ dbc_n->internal->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ dbc_n->internal->page = NULL;
+ }
+ opd = dbc_n->internal->opd;
+ if (opd != NULL && opd->internal->page != NULL) {
+ if ((t_ret = __memp_fput(mpf, dbc->thread_info,
+ opd->internal->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ opd->internal->page = NULL;
+ }
+
+ /*
+ * If we didn't fail before entering this routine or just now when
+ * freeing pages, swap the interesting contents of the old and new
+ * cursors.
+ */
+ if (!failed && ret == 0) {
+ if (opd != NULL)
+ opd->internal->pdbc = dbc;
+ if (internal->opd != NULL)
+ internal->opd->internal->pdbc = dbc_n;
+ dbc->internal = dbc_n->internal;
+ dbc_n->internal = internal;
+ }
+
+ /*
+ * Close the cursor we don't care about anymore. The close can fail,
+ * but we only expect DB_LOCK_DEADLOCK failures. This violates our
+ * "the cursor is unchanged on error" semantics, but since all you can
+ * do with a DB_LOCK_DEADLOCK failure is close the cursor, I believe
+ * that's OK.
+ *
+ * XXX
+ * There's no way to recover from failure to close the old cursor.
+ * All we can do is move to the new position and return an error.
+ *
+ * XXX
+ * We might want to consider adding a flag to the cursor, so that any
+ * subsequent operations other than close just return an error?
+ */
+ if ((t_ret = __dbc_close(dbc_n)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /*
+ * If this was an update that is supporting dirty reads
+ * then we may have just swapped our read for a write lock
+ * which is held by the surviving cursor. We need
+ * to explicitly downgrade this lock. The closed cursor
+ * may only have had a read lock.
+ */
+ if (ret == 0 && failed == 0 && F_ISSET(dbp, DB_AM_READ_UNCOMMITTED) &&
+ dbc->internal->lock_mode == DB_LOCK_WRITE &&
+ (ret = __TLPUT(dbc, dbc->internal->lock)) == 0)
+ dbc->internal->lock_mode = DB_LOCK_WWRITE;
+
+done:
+ if (F_ISSET(dbc, DBC_OPD))
+ LOCK_CHECK_ON(dbc->thread_info);
+
+ return (ret);
+}
+
+/*
+ * __dbc_secondary_get_pp --
+ * This wrapper function for DBC->pget() is the DBC->get() function
+ * for a secondary index cursor.
+ *
+ * PUBLIC: int __dbc_secondary_get_pp __P((DBC *, DBT *, DBT *, u_int32_t));
+ */
+int
+__dbc_secondary_get_pp(dbc, skey, data, flags)
+ DBC *dbc;
+ DBT *skey, *data;
+ u_int32_t flags;
+{
+ DB_ASSERT(dbc->env, F_ISSET(dbc->dbp, DB_AM_SECONDARY));
+ return (__dbc_pget_pp(dbc, skey, NULL, data, flags));
+}
+
+/*
+ * __dbc_pget --
+ * Get a primary key/data pair through a secondary index.
+ *
+ * PUBLIC: int __dbc_pget __P((DBC *, DBT *, DBT *, DBT *, u_int32_t));
+ */
+int
+__dbc_pget(dbc, skey, pkey, data, flags)
+ DBC *dbc;
+ DBT *skey, *pkey, *data;
+ u_int32_t flags;
+{
+ DB *pdbp, *sdbp;
+ DBC *dbc_n, *pdbc;
+ DBT nullpkey, *save_data;
+ u_int32_t save_pkey_flags, tmp_flags, tmp_read_locking, tmp_rmw;
+ int pkeymalloc, ret, t_ret;
+
+ sdbp = dbc->dbp;
+ pdbp = sdbp->s_primary;
+ dbc_n = NULL;
+ save_data = NULL;
+ pkeymalloc = t_ret = 0;
+
+ /*
+ * The challenging part of this function is getting the behavior
+ * right for all the various permutations of DBT flags. The
+ * next several blocks handle the various cases we need to
+ * deal with specially.
+ */
+
+ /*
+ * We may be called with a NULL pkey argument, if we've been
+ * wrapped by a 2-DBT get call. If so, we need to use our
+ * own DBT.
+ */
+ if (pkey == NULL) {
+ memset(&nullpkey, 0, sizeof(DBT));
+ pkey = &nullpkey;
+ }
+
+ /* Clear OR'd in additional bits so we can check for flag equality. */
+ tmp_rmw = LF_ISSET(DB_RMW);
+ LF_CLR(DB_RMW);
+
+ SET_READ_LOCKING_FLAGS(dbc, tmp_read_locking);
+ /*
+ * DB_GET_RECNO is a special case, because we're interested not in
+ * the primary key/data pair, but rather in the primary's record
+ * number.
+ */
+ if (flags == DB_GET_RECNO) {
+ if (tmp_rmw)
+ F_SET(dbc, DBC_RMW);
+ F_SET(dbc, tmp_read_locking);
+ ret = __dbc_pget_recno(dbc, pkey, data, flags);
+ if (tmp_rmw)
+ F_CLR(dbc, DBC_RMW);
+ /* Clear the temp flags, but leave WAS_READ_COMMITTED. */
+ F_CLR(dbc, tmp_read_locking & ~DBC_WAS_READ_COMMITTED);
+ return (ret);
+ }
+
+ /*
+ * If the DBTs we've been passed don't have any of the
+ * user-specified memory management flags set, we want to make sure
+ * we return values using the DBTs dbc->rskey, dbc->rkey, and
+ * dbc->rdata, respectively.
+ *
+ * There are two tricky aspects to this: first, we need to pass
+ * skey and pkey *in* to the initial c_get on the secondary key,
+ * since either or both may be looked at by it (depending on the
+ * get flag). Second, we must not use a normal DB->get call
+ * on the secondary, even though that's what we want to accomplish,
+ * because the DB handle may be free-threaded. Instead,
+ * we open a cursor, then take steps to ensure that we actually use
+ * the rkey/rdata from the *secondary* cursor.
+ *
+ * We accomplish all this by passing in the DBTs we started out
+ * with to the c_get, but swapping the contents of rskey and rkey,
+ * respectively, into rkey and rdata; __db_ret will treat them like
+ * the normal key/data pair in a c_get call, and will realloc them as
+ * need be (this is "step 1"). Then, for "step 2", we swap back
+ * rskey/rkey/rdata to normal, and do a get on the primary with the
+ * secondary dbc appointed as the owner of the returned-data memory.
+ *
+ * Note that in step 2, we copy the flags field in case we need to
+ * pass down a DB_DBT_PARTIAL or other flag that is compatible with
+ * letting DB do the memory management.
+ */
+
+ /*
+ * It is correct, though slightly sick, to attempt a partial get of a
+ * primary key. However, if we do so here, we'll never find the
+ * primary record; clear the DB_DBT_PARTIAL field of pkey just for the
+ * duration of the next call.
+ */
+ save_pkey_flags = pkey->flags;
+ F_CLR(pkey, DB_DBT_PARTIAL);
+
+ /*
+ * Now we can go ahead with the meat of this call. First, get the
+ * primary key from the secondary index. (What exactly we get depends
+ * on the flags, but the underlying cursor get will take care of the
+ * dirty work.) Duplicate the cursor, in case the later get on the
+ * primary fails.
+ */
+ switch (flags) {
+ case DB_CURRENT:
+ case DB_GET_BOTHC:
+ case DB_NEXT:
+ case DB_NEXT_DUP:
+ case DB_NEXT_NODUP:
+ case DB_PREV:
+ case DB_PREV_DUP:
+ case DB_PREV_NODUP:
+ tmp_flags = DB_POSITION;
+ break;
+ default:
+ tmp_flags = 0;
+ break;
+ }
+
+ if (dbc->internal->opd != NULL ||
+ F_ISSET(dbc, DBC_PARTITIONED | DBC_TRANSIENT)) {
+ dbc_n = dbc;
+ save_data = dbc_n->rdata;
+ } else {
+ if ((ret = __dbc_dup(dbc, &dbc_n, tmp_flags)) != 0)
+ return (ret);
+ F_SET(dbc_n, DBC_TRANSIENT);
+ }
+ dbc_n->rdata = dbc->rkey;
+ dbc_n->rkey = dbc->rskey;
+
+ if (tmp_rmw)
+ F_SET(dbc_n, DBC_RMW);
+ F_SET(dbc_n, tmp_read_locking);
+
+ /*
+ * If we've been handed a primary key, it will be in native byte order,
+ * so we need to swap it before reading from the secondary.
+ */
+ if (flags == DB_GET_BOTH || flags == DB_GET_BOTHC ||
+ flags == DB_GET_BOTH_RANGE)
+ SWAP_IF_NEEDED(sdbp, pkey);
+
+retry: /* Step 1. */
+ ret = __dbc_get(dbc_n, skey, pkey, flags);
+ /* Restore pkey's flags in case we stomped the PARTIAL flag. */
+ pkey->flags = save_pkey_flags;
+
+ /*
+ * We need to swap the primary key to native byte order if we read it
+ * successfully, or if we swapped it on entry above. We can't return
+ * with the application's data modified.
+ */
+ if (ret == 0 || flags == DB_GET_BOTH || flags == DB_GET_BOTHC ||
+ flags == DB_GET_BOTH_RANGE)
+ SWAP_IF_NEEDED(sdbp, pkey);
+
+ if (ret != 0)
+ goto err;
+
+ /*
+ * Now we're ready for "step 2". If either or both of pkey and data do
+ * not have memory management flags set--that is, if DB is managing
+ * their memory--we need to swap around the rkey/rdata structures so
+ * that we don't wind up trying to use memory managed by the primary
+ * database cursor, which we'll close before we return.
+ *
+ * !!!
+ * If you're carefully following the bouncing ball, you'll note that in
+ * the DB-managed case, the buffer hanging off of pkey is the same as
+ * dbc->rkey->data. This is just fine; we may well realloc and stomp
+ * on it when we return, if we're doing a DB_GET_BOTH and need to
+ * return a different partial or key (depending on the comparison
+ * function), but this is safe.
+ *
+ * !!!
+ * We need to use __db_cursor_int here rather than simply calling
+ * pdbp->cursor, because otherwise, if we're in CDB, we'll allocate a
+ * new locker ID and leave ourselves open to deadlocks. (Even though
+ * we're only acquiring read locks, we'll still block if there are any
+ * waiters.)
+ */
+ if ((ret = __db_cursor_int(pdbp, dbc->thread_info,
+ dbc->txn, pdbp->type, PGNO_INVALID, 0, dbc->locker, &pdbc)) != 0)
+ goto err;
+
+ F_SET(pdbc, tmp_read_locking |
+ F_ISSET(dbc, DBC_READ_UNCOMMITTED | DBC_READ_COMMITTED | DBC_RMW));
+
+ /*
+ * We're about to use pkey a second time. If DB_DBT_MALLOC is set on
+ * it, we'll leak the memory we allocated the first time. Thus, set
+ * DB_DBT_REALLOC instead so that we reuse that memory instead of
+ * leaking it.
+ *
+ * Alternatively, if the application is handling copying for pkey, we
+ * need to take a copy now. The copy will be freed on exit from
+ * __dbc_pget_pp (and we must be coming through there if DB_DBT_USERCOPY
+ * is set). In the case of DB_GET_BOTH_RANGE, the pkey supplied by
+ * the application has already been copied in but the value may have
+ * changed in the search. In that case, free the original copy and get
+ * a new one.
+ *
+ * !!!
+ * This assumes that the user must always specify a compatible realloc
+ * function if a malloc function is specified. I think this is a
+ * reasonable requirement.
+ */
+ if (F_ISSET(pkey, DB_DBT_MALLOC)) {
+ F_CLR(pkey, DB_DBT_MALLOC);
+ F_SET(pkey, DB_DBT_REALLOC);
+ pkeymalloc = 1;
+ } else if (F_ISSET(pkey, DB_DBT_USERCOPY)) {
+ if (flags == DB_GET_BOTH_RANGE)
+ __dbt_userfree(sdbp->env, NULL, pkey, NULL);
+ if ((ret = __dbt_usercopy(sdbp->env, pkey)) != 0)
+ goto err;
+ }
+
+ /*
+ * Do the actual get. Set DBC_TRANSIENT since we don't care about
+ * preserving the position on error, and it's faster. SET_RET_MEM so
+ * that the secondary DBC owns any returned-data memory.
+ */
+ F_SET(pdbc, DBC_TRANSIENT);
+ SET_RET_MEM(pdbc, dbc);
+ ret = __dbc_get(pdbc, pkey, data, DB_SET);
+ DB_ASSERT(pdbp->env, ret != DB_PAGE_NOTFOUND);
+
+	/* Now close the primary cursor. */
+	if ((t_ret = __dbc_close(pdbc)) != 0 && ret == 0)
+		ret = t_ret;
+	/*
+	 * If the item wasn't found in the primary, this is a bug; our
+	 * secondary has somehow gotten corrupted, and contains elements that
+	 * don't correspond to anything in the primary.  Complain, unless a
+	 * racing update under a dirty read may explain it, in which case
+	 * retry.
+	 */
+	else if (ret == DB_NOTFOUND) {
+ if (!F_ISSET(dbc, DBC_READ_UNCOMMITTED))
+ ret = __db_secondary_corrupt(pdbp);
+ else switch (flags) {
+ case DB_GET_BOTHC:
+ case DB_NEXT:
+ case DB_NEXT_DUP:
+ case DB_NEXT_NODUP:
+ case DB_PREV:
+ case DB_PREV_DUP:
+ case DB_PREV_NODUP:
+ PERFMON5(pdbp->env, race, dbc_get,
+ sdbp->fname, sdbp->dname, ret, flags, pkey);
+ goto retry;
+ default:
+ break;
+ }
+ }
+
+err: /* Cleanup and cursor resolution. */
+ if (dbc_n == dbc) {
+ dbc_n->rkey = dbc_n->rdata;
+ dbc_n->rdata = save_data;
+ }
+ if ((t_ret = __dbc_cleanup(dbc, dbc_n, ret)) != 0 && ret == 0)
+ ret = t_ret;
+ if (pkeymalloc) {
+ /*
+ * If pkey had a MALLOC flag, we need to restore it; otherwise,
+ * if the user frees the buffer but reuses the DBT without
+ * NULL'ing its data field or changing the flags, we may drop
+ * core.
+ */
+ F_CLR(pkey, DB_DBT_REALLOC);
+ F_SET(pkey, DB_DBT_MALLOC);
+ }
+
+ return (ret);
+}
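+
+#ifdef EXAMPLE_CODE_ONLY	/* Editor's sketch, hypothetical; never compiled. */
+/*
+ * Hypothetical application use of the pget path above: iterate a secondary
+ * index, retrieving the secondary key, the primary key and the primary
+ * data together.  All example_* names are invented for illustration.
+ */
+static int
+example_pget_scan(sdbp, txn)
+	DB *sdbp;
+	DB_TXN *txn;
+{
+	DBC *sdbc;
+	DBT data, pkey, skey;
+	int ret, t_ret;
+
+	if ((ret = sdbp->cursor(sdbp, txn, &sdbc, 0)) != 0)
+		return (ret);
+	memset(&skey, 0, sizeof(DBT));
+	memset(&pkey, 0, sizeof(DBT));
+	memset(&data, 0, sizeof(DBT));
+	while ((ret = sdbc->pget(sdbc, &skey, &pkey, &data, DB_NEXT)) == 0) {
+		/* skey, pkey and data are all valid here. */
+	}
+	if (ret == DB_NOTFOUND)			/* End of the index. */
+		ret = 0;
+	if ((t_ret = sdbc->close(sdbc)) != 0 && ret == 0)
+		ret = t_ret;
+	return (ret);
+}
+#endif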
+
+/*
+ * __dbc_pget_recno --
+ * Perform a DB_GET_RECNO c_pget on a secondary index. Returns
+ * the secondary's record number in the pkey field and the primary's
+ * in the data field.
+ */
+static int
+__dbc_pget_recno(sdbc, pkey, data, flags)
+ DBC *sdbc;
+ DBT *pkey, *data;
+ u_int32_t flags;
+{
+ DB *pdbp, *sdbp;
+ DBC *pdbc;
+ DBT discardme, primary_key;
+ ENV *env;
+ db_recno_t oob;
+ u_int32_t rmw;
+ int ret, t_ret;
+
+ sdbp = sdbc->dbp;
+ pdbp = sdbp->s_primary;
+ env = sdbp->env;
+ pdbc = NULL;
+ ret = t_ret = 0;
+
+ rmw = LF_ISSET(DB_RMW);
+
+ memset(&discardme, 0, sizeof(DBT));
+ F_SET(&discardme, DB_DBT_USERMEM | DB_DBT_PARTIAL);
+
+ oob = RECNO_OOB;
+
+ /*
+ * If the primary is an rbtree, we want its record number, whether
+ * or not the secondary is one too. Fetch the recno into "data".
+ *
+ * If it's not an rbtree, return RECNO_OOB in "data".
+ */
+ if (F_ISSET(pdbp, DB_AM_RECNUM)) {
+ /*
+ * Get the primary key, so we can find the record number
+ * in the primary. (We're uninterested in the secondary key.)
+ */
+ memset(&primary_key, 0, sizeof(DBT));
+ F_SET(&primary_key, DB_DBT_MALLOC);
+ if ((ret = __dbc_get(sdbc,
+ &discardme, &primary_key, rmw | DB_CURRENT)) != 0)
+ return (ret);
+
+ /*
+ * Open a cursor on the primary, set it to the right record,
+ * and fetch its recno into "data".
+ *
+ * (See __dbc_pget for comments on the use of __db_cursor_int.)
+ *
+ * SET_RET_MEM so that the secondary DBC owns any returned-data
+ * memory.
+ */
+ if ((ret = __db_cursor_int(pdbp, sdbc->thread_info, sdbc->txn,
+ pdbp->type, PGNO_INVALID, 0, sdbc->locker, &pdbc)) != 0)
+ goto perr;
+ SET_RET_MEM(pdbc, sdbc);
+ if ((ret = __dbc_get(pdbc,
+ &primary_key, &discardme, rmw | DB_SET)) != 0)
+ goto perr;
+
+ ret = __dbc_get(pdbc, &discardme, data, rmw | DB_GET_RECNO);
+
+perr: __os_ufree(env, primary_key.data);
+ if (pdbc != NULL &&
+ (t_ret = __dbc_close(pdbc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ return (ret);
+ } else if ((ret = __db_retcopy(env, data, &oob,
+ sizeof(oob), &sdbc->rkey->data, &sdbc->rkey->ulen)) != 0)
+ return (ret);
+
+ /*
+ * If the secondary is an rbtree, we want its record number, whether
+ * or not the primary is one too. Fetch the recno into "pkey".
+ *
+ * If it's not an rbtree, return RECNO_OOB in "pkey".
+ */
+ if (F_ISSET(sdbp, DB_AM_RECNUM))
+ return (__dbc_get(sdbc, &discardme, pkey, flags));
+ else
+ return (__db_retcopy(env, pkey, &oob,
+ sizeof(oob), &sdbc->rdata->data, &sdbc->rdata->ulen));
+}
+
+/*
+ * __db_wrlock_err -- do not have a write lock.
+ */
+static int
+__db_wrlock_err(env)
+ ENV *env;
+{
+ __db_errx(env, DB_STR("0697", "Write attempted on read-only cursor"));
+ return (EPERM);
+}
+
+/*
+ * __dbc_del_secondary --
+ * Perform a delete operation on a secondary index: call through
+ * to the primary and delete the primary record that this record
+ * points to.
+ *
+ * Note that deleting the primary record will call c_del on all
+ * the secondaries, including this one; thus, it is not necessary
+ * to execute both this function and an actual delete.
+ */
+static int
+__dbc_del_secondary(dbc)
+ DBC *dbc;
+{
+ DB *pdbp;
+ DBC *pdbc;
+ DBT skey, pkey;
+ ENV *env;
+ int ret, t_ret;
+ u_int32_t rmw;
+
+ pdbp = dbc->dbp->s_primary;
+ env = pdbp->env;
+ rmw = STD_LOCKING(dbc) ? DB_RMW : 0;
+
+ /*
+ * Get the current item that we're pointing at.
+ * We don't actually care about the secondary key, just
+ * the primary.
+ */
+ memset(&skey, 0, sizeof(DBT));
+ memset(&pkey, 0, sizeof(DBT));
+ F_SET(&skey, DB_DBT_PARTIAL | DB_DBT_USERMEM);
+ if ((ret = __dbc_get(dbc, &skey, &pkey, DB_CURRENT)) != 0)
+ return (ret);
+
+ SWAP_IF_NEEDED(dbc->dbp, &pkey);
+ DEBUG_LWRITE(dbc, dbc->txn, "del_secondary", &skey, &pkey, 0);
+
+ /*
+ * Create a cursor on the primary with our locker ID,
+ * so that when it calls back, we don't conflict.
+ *
+ * We create a cursor explicitly because there's no
+ * way to specify the same locker ID if we're using
+ * locking but not transactions if we use the DB->del
+ * interface. This shouldn't be any less efficient
+ * anyway.
+ */
+ if ((ret = __db_cursor_int(pdbp, dbc->thread_info, dbc->txn,
+ pdbp->type, PGNO_INVALID, 0, dbc->locker, &pdbc)) != 0)
+ return (ret);
+
+ /*
+ * See comment in __dbc_put--if we're in CDB,
+ * we already hold the locks we need, and we need to flag
+ * the cursor as a WRITER so we don't run into errors
+ * when we try to delete.
+ */
+ if (CDB_LOCKING(env)) {
+ DB_ASSERT(env, pdbc->mylock.off == LOCK_INVALID);
+ F_SET(pdbc, DBC_WRITER);
+ }
+
+ /*
+ * Set the new cursor to the correct primary key. Then
+ * delete it. We don't really care about the datum;
+ * just reuse our skey DBT.
+ *
+ * If the primary get returns DB_NOTFOUND, something is amiss--
+ * every record in the secondary should correspond to some record
+ * in the primary.
+ */
+ if ((ret = __dbc_get(pdbc, &pkey, &skey, DB_SET | rmw)) == 0)
+ ret = __dbc_del(pdbc, 0);
+ else if (ret == DB_NOTFOUND)
+ ret = __db_secondary_corrupt(pdbp);
+
+ if ((t_ret = __dbc_close(pdbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
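+
+#ifdef EXAMPLE_CODE_ONLY	/* Editor's sketch, hypothetical; never compiled. */
+/*
+ * Hypothetical application-level counterpart of the above: position a
+ * cursor in a secondary index and delete.  The delete is redirected to
+ * the primary, which in turn removes the entries in every associated
+ * secondary, including this one.
+ */
+static int
+example_delete_via_secondary(sdbp, txn, skey)
+	DB *sdbp;
+	DB_TXN *txn;
+	DBT *skey;
+{
+	DBC *sdbc;
+	DBT data;
+	int ret, t_ret;
+
+	if ((ret = sdbp->cursor(sdbp, txn, &sdbc, 0)) != 0)
+		return (ret);
+	memset(&data, 0, sizeof(DBT));
+	if ((ret = sdbc->get(sdbc, skey, &data, DB_SET)) == 0)
+		ret = sdbc->del(sdbc, 0);
+	if ((t_ret = sdbc->close(sdbc)) != 0 && ret == 0)
+		ret = t_ret;
+	return (ret);
+}
+#endif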
+
+/*
+ * __dbc_del_primary --
+ * Perform a delete operation on a primary index. Loop through
+ * all the secondary indices which correspond to this primary
+ * database, and delete any secondary keys that point at the current
+ * record.
+ *
+ * PUBLIC: int __dbc_del_primary __P((DBC *));
+ */
+int
+__dbc_del_primary(dbc)
+ DBC *dbc;
+{
+ DB *dbp, *sdbp;
+ DBC *sdbc;
+ DBT *tskeyp;
+ DBT data, pkey, skey, temppkey, tempskey;
+ ENV *env;
+ u_int32_t nskey, rmw;
+ int ret, t_ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ sdbp = NULL;
+ rmw = STD_LOCKING(dbc) ? DB_RMW : 0;
+
+ /*
+ * If we're called at all, we have at least one secondary.
+ * (Unfortunately, we can't assert this without grabbing the mutex.)
+ * Get the current record so that we can construct appropriate
+ * secondary keys as needed.
+ */
+ memset(&pkey, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+ if ((ret = __dbc_get(dbc, &pkey, &data, DB_CURRENT)) != 0)
+ return (ret);
+
+ memset(&skey, 0, sizeof(DBT));
+ for (ret = __db_s_first(dbp, &sdbp);
+ sdbp != NULL && ret == 0;
+ ret = __db_s_next(&sdbp, dbc->txn)) {
+ /*
+ * Get the secondary key for this secondary and the current
+ * item.
+ */
+ if ((ret = sdbp->s_callback(sdbp, &pkey, &data, &skey)) != 0) {
+ /* Not indexing is equivalent to an empty key set. */
+ if (ret == DB_DONOTINDEX) {
+ F_SET(&skey, DB_DBT_MULTIPLE);
+ skey.size = 0;
+ } else /* We had a substantive error. Bail. */
+ goto err;
+ }
+
+#ifdef DIAGNOSTIC
+ if (F_ISSET(&skey, DB_DBT_MULTIPLE))
+ __db_check_skeyset(sdbp, &skey);
+#endif
+
+ if (F_ISSET(&skey, DB_DBT_MULTIPLE)) {
+ tskeyp = (DBT *)skey.data;
+ nskey = skey.size;
+ if (nskey == 0)
+ continue;
+ } else {
+ tskeyp = &skey;
+ nskey = 1;
+ }
+
+ /* Open a secondary cursor. */
+ if ((ret = __db_cursor_int(sdbp,
+ dbc->thread_info, dbc->txn, sdbp->type,
+ PGNO_INVALID, 0, dbc->locker, &sdbc)) != 0)
+ goto err;
+ /* See comment above and in __dbc_put. */
+ if (CDB_LOCKING(env)) {
+ DB_ASSERT(env, sdbc->mylock.off == LOCK_INVALID);
+ F_SET(sdbc, DBC_WRITER);
+ }
+
+ for (; nskey > 0; nskey--, tskeyp++) {
+ /*
+ * Set the secondary cursor to the appropriate item.
+ * Delete it.
+ *
+ * We want to use DB_RMW if locking is on; it's only
+ * legal then, though.
+ *
+ * !!!
+ * Don't stomp on any callback-allocated buffer in skey
+ * when we do a c_get(DB_GET_BOTH); use a temp DBT
+ * instead. Similarly, don't allow pkey to be
+ * invalidated when the cursor is closed.
+ */
+ DB_INIT_DBT(tempskey, tskeyp->data, tskeyp->size);
+ SWAP_IF_NEEDED(sdbp, &pkey);
+ DB_INIT_DBT(temppkey, pkey.data, pkey.size);
+ if ((ret = __dbc_get(sdbc, &tempskey, &temppkey,
+ DB_GET_BOTH | rmw)) == 0)
+ ret = __dbc_del(sdbc, DB_UPDATE_SECONDARY);
+ else if (ret == DB_NOTFOUND)
+ ret = __db_secondary_corrupt(dbp);
+ SWAP_IF_NEEDED(sdbp, &pkey);
+ FREE_IF_NEEDED(env, tskeyp);
+ }
+
+ if ((t_ret = __dbc_close(sdbc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err;
+
+ /*
+ * In the common case where there is a single secondary key, we
+ * will have freed any application-allocated data in skey
+ * already. In the multiple key case, we need to free it here.
+ * It is safe to do this twice as the macro resets the data
+ * field.
+ */
+ FREE_IF_NEEDED(env, &skey);
+ }
+
+err: if (sdbp != NULL &&
+ (t_ret = __db_s_done(sdbp, dbc->txn)) != 0 && ret == 0)
+ ret = t_ret;
+ FREE_IF_NEEDED(env, &skey);
+ return (ret);
+}
+
+/*
+ * __dbc_del_foreign --
+ * Apply the foreign database constraints for a particular foreign
+ * database when an item is being deleted (dbc points at item being deleted
+ * in the foreign database.)
+ *
+ * Delete happens in dbp; check for occurrences of the key in pdbp.
+ * Terminology:
+ * Foreign db = Where delete occurs (dbp).
+ * Secondary db = Where references to dbp occur (sdbp, a secondary)
+ * Primary db = sdbp's primary database, references to dbp are secondary
+ * keys here
+ * Foreign Key = Key being deleted in dbp (fkey)
+ * Primary Key = Key of the corresponding entry in sdbp's primary (pkey).
+ */
+static int
+__dbc_del_foreign(dbc)
+ DBC *dbc;
+{
+ DB_FOREIGN_INFO *f_info;
+ DB *dbp, *pdbp, *sdbp;
+ DBC *pdbc, *sdbc;
+ DBT data, fkey, pkey;
+ ENV *env;
+ u_int32_t flags, rmw;
+ int changed, ret, t_ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ memset(&fkey, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+ if ((ret = __dbc_get(dbc, &fkey, &data, DB_CURRENT)) != 0)
+ return (ret);
+
+ LIST_FOREACH(f_info, &(dbp->f_primaries), f_links) {
+ sdbp = f_info->dbp;
+ pdbp = sdbp->s_primary;
+ flags = f_info->flags;
+
+ rmw = (STD_LOCKING(dbc) &&
+ !LF_ISSET(DB_FOREIGN_ABORT)) ? DB_RMW : 0;
+
+ /*
+ * Handle CDB locking. Some of this is copied from
+ * __dbc_del_primary, but a bit more acrobatics are required.
+ * If we're not going to abort, then we need to get a write
+ * cursor. If CDB_ALLDB is set, then only one write cursor is
+ * allowed and we hold it, so we fudge things and promote the
+		 * cursor on the other DBs manually; it won't cause a problem.
+ * If CDB_ALLDB is not set, then we go through the usual route
+ * to make sure we block as necessary. If there are any open
+ * read cursors on sdbp, the delete or put call later will
+ * block.
+ *
+ * If NULLIFY is set, we'll need a cursor on the primary to
+ * update it with the nullified data. Because primary and
+ * secondary dbs share a lock file ID in CDB, we open a cursor
+ * on the secondary and then get another writable cursor on the
+ * primary via __db_cursor_int to avoid deadlocking.
+ */
+ sdbc = pdbc = NULL;
+ if (!LF_ISSET(DB_FOREIGN_ABORT) && CDB_LOCKING(env) &&
+ !F_ISSET(env->dbenv, DB_ENV_CDB_ALLDB)) {
+ ret = __db_cursor(sdbp,
+ dbc->thread_info, dbc->txn, &sdbc, DB_WRITECURSOR);
+ if (LF_ISSET(DB_FOREIGN_NULLIFY) && ret == 0) {
+ ret = __db_cursor_int(pdbp,
+ dbc->thread_info, dbc->txn, pdbp->type,
+ PGNO_INVALID, 0, dbc->locker, &pdbc);
+ F_SET(pdbc, DBC_WRITER);
+ }
+ } else {
+ ret = __db_cursor_int(sdbp, dbc->thread_info, dbc->txn,
+ sdbp->type, PGNO_INVALID, 0, dbc->locker, &sdbc);
+ if (LF_ISSET(DB_FOREIGN_NULLIFY) && ret == 0)
+ ret = __db_cursor_int(pdbp, dbc->thread_info,
+ dbc->txn, pdbp->type, PGNO_INVALID, 0,
+ dbc->locker, &pdbc);
+ }
+ if (ret != 0) {
+ if (sdbc != NULL)
+ (void)__dbc_close(sdbc);
+ return (ret);
+ }
+ if (CDB_LOCKING(env) && F_ISSET(env->dbenv, DB_ENV_CDB_ALLDB)) {
+ DB_ASSERT(env, sdbc->mylock.off == LOCK_INVALID);
+ F_SET(sdbc, DBC_WRITER);
+ if (LF_ISSET(DB_FOREIGN_NULLIFY) && pdbc != NULL) {
+ DB_ASSERT(env,
+ pdbc->mylock.off == LOCK_INVALID);
+ F_SET(pdbc, DBC_WRITER);
+ }
+ }
+
+ /*
+		 * There are three possible actions when records in the
+		 * associated databases correspond to the deleted foreign key:
+		 *	DB_FOREIGN_ABORT - The delete operation should be aborted.
+		 *	DB_FOREIGN_CASCADE - All corresponding foreign items should
+		 *	    be deleted.
+		 *	DB_FOREIGN_NULLIFY - A callback needs to be made, allowing
+		 *	    the application to modify the data DBT from the
+		 *	    associated database.  If the callback makes a
+		 *	    modification, the updated item replaces the
+		 *	    original item in that database.
+ */
+ memset(&pkey, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+ ret = __dbc_pget(sdbc, &fkey, &pkey, &data, DB_SET|rmw);
+
+ if (ret == DB_NOTFOUND) {
+ /* No entry means no constraint */
+ ret = __dbc_close(sdbc);
+ if (LF_ISSET(DB_FOREIGN_NULLIFY) &&
+ (t_ret = __dbc_close(pdbc)) != 0)
+ ret = t_ret;
+ if (ret != 0)
+ return (ret);
+ continue;
+ } else if (ret != 0) {
+ /* Just return the error code from the pget */
+ (void)__dbc_close(sdbc);
+ if (LF_ISSET(DB_FOREIGN_NULLIFY))
+ (void)__dbc_close(pdbc);
+ return (ret);
+ } else if (LF_ISSET(DB_FOREIGN_ABORT)) {
+ /* If the record exists and ABORT is set, we're done */
+ if ((ret = __dbc_close(sdbc)) != 0)
+ return (ret);
+ return (DB_FOREIGN_CONFLICT);
+ }
+
+ /*
+ * There were matching items in the primary DB, and the action
+ * is either DB_FOREIGN_CASCADE or DB_FOREIGN_NULLIFY.
+ */
+ while (ret == 0) {
+ if (LF_ISSET(DB_FOREIGN_CASCADE)) {
+ /*
+ * Don't use the DB_UPDATE_SECONDARY flag,
+ * since we want the delete to cascade into the
+ * secondary's primary.
+ */
+ if ((ret = __dbc_del(sdbc, 0)) != 0) {
+ __db_err(env, ret, DB_STR("0698",
+ "Attempt to execute cascading delete in a foreign index failed"));
+ break;
+ }
+ } else if (LF_ISSET(DB_FOREIGN_NULLIFY)) {
+ changed = 0;
+ if ((ret = f_info->callback(sdbp,
+ &pkey, &data, &fkey, &changed)) != 0) {
+ __db_err(env, ret, DB_STR("0699",
+ "Foreign database application callback"));
+ break;
+ }
+
+				/*
+				 * If the user callback modified the DBT,
+				 * write the nullified record back to the
+				 * primary and report any failure of that
+				 * put.
+				 */
+ if (changed && (ret = __dbc_put(pdbc,
+ &pkey, &data, DB_KEYFIRST)) != 0) {
+ __db_err(env, ret, DB_STR("0700",
+"Attempt to overwrite item in foreign database with nullified value failed"));
+ break;
+ }
+ }
+			/* Retrieve the next matching item from the primary. */
+ memset(&pkey, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+ ret = __dbc_pget(sdbc,
+ &fkey, &pkey, &data, DB_NEXT_DUP|rmw);
+ }
+
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+ if ((t_ret = __dbc_close(sdbc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (LF_ISSET(DB_FOREIGN_NULLIFY) &&
+ (t_ret = __dbc_close(pdbc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ return (ret);
+ }
+
+ return (ret);
+}
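+
+#ifdef EXAMPLE_CODE_ONLY	/* Editor's sketch, hypothetical; never compiled. */
+/*
+ * Hypothetical setup for the constraint enforced above: fdbp is the
+ * foreign database, sdbp a secondary whose keys must exist in fdbp.  With
+ * DB_FOREIGN_CASCADE no callback is needed; deleting a key from fdbp then
+ * deletes every primary record whose secondary key matches it.
+ */
+static int
+example_foreign_setup(fdbp, sdbp)
+	DB *fdbp, *sdbp;
+{
+	return (fdbp->associate_foreign(fdbp, sdbp, NULL, DB_FOREIGN_CASCADE));
+}
+#endif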
+
+/*
+ * __db_s_first --
+ * Get the first secondary, if any are present, from the primary.
+ *
+ * PUBLIC: int __db_s_first __P((DB *, DB **));
+ */
+int
+__db_s_first(pdbp, sdbpp)
+ DB *pdbp, **sdbpp;
+{
+ DB *sdbp;
+
+ MUTEX_LOCK(pdbp->env, pdbp->mutex);
+ sdbp = LIST_FIRST(&pdbp->s_secondaries);
+
+ /* See __db_s_next. */
+ if (sdbp != NULL)
+ sdbp->s_refcnt++;
+ MUTEX_UNLOCK(pdbp->env, pdbp->mutex);
+
+ *sdbpp = sdbp;
+
+ return (0);
+}
+
+/*
+ * __db_s_next --
+ * Get the next secondary in the list.
+ *
+ * PUBLIC: int __db_s_next __P((DB **, DB_TXN *));
+ */
+int
+__db_s_next(sdbpp, txn)
+ DB **sdbpp;
+ DB_TXN *txn;
+{
+ DB *sdbp, *pdbp, *closeme;
+ ENV *env;
+ int ret;
+
+ /*
+ * Secondary indices are kept in a linked list, s_secondaries,
+ * off each primary DB handle. If a primary is free-threaded,
+ * this list may only be traversed or modified while the primary's
+ * thread mutex is held.
+ *
+ * The tricky part is that we don't want to hold the thread mutex
+ * across the full set of secondary puts necessary for each primary
+ * put, or we'll wind up essentially single-threading all the puts
+ * to the handle; the secondary puts will each take about as
+ * long as the primary does, and may require I/O. So we instead
+ * hold the thread mutex only long enough to follow one link to the
+ * next secondary, and then we release it before performing the
+ * actual secondary put.
+ *
+ * The only danger here is that we might legitimately close a
+ * secondary index in one thread while another thread is performing
+ * a put and trying to update that same secondary index. To
+ * prevent this from happening, we refcount the secondary handles.
+ * If close is called on a secondary index handle while we're putting
+ * to it, it won't really be closed--the refcount will simply drop,
+ * and we'll be responsible for closing it here.
+ */
+ sdbp = *sdbpp;
+ pdbp = sdbp->s_primary;
+ env = pdbp->env;
+ closeme = NULL;
+
+ MUTEX_LOCK(env, pdbp->mutex);
+ DB_ASSERT(env, sdbp->s_refcnt != 0);
+ if (--sdbp->s_refcnt == 0) {
+ LIST_REMOVE(sdbp, s_links);
+ closeme = sdbp;
+ }
+ sdbp = LIST_NEXT(sdbp, s_links);
+ if (sdbp != NULL)
+ sdbp->s_refcnt++;
+ MUTEX_UNLOCK(env, pdbp->mutex);
+
+ *sdbpp = sdbp;
+
+ /*
+ * closeme->close() is a wrapper; call __db_close explicitly.
+ */
+ if (closeme == NULL)
+ ret = 0;
+ else
+ ret = __db_close(closeme, txn, 0);
+
+ return (ret);
+}
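+
+/*
+ * (Editor's note) The canonical traversal, used repeatedly in this file:
+ *
+ *	for (ret = __db_s_first(pdbp, &sdbp);
+ *	    sdbp != NULL && ret == 0;
+ *	    ret = __db_s_next(&sdbp, txn))
+ *		...
+ *
+ * with __db_s_done called on any early exit so the refcount is released.
+ */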
+
+/*
+ * __db_s_done --
+ * Properly decrement the refcount on a secondary database handle we're
+ * using, without calling __db_s_next.
+ *
+ * PUBLIC: int __db_s_done __P((DB *, DB_TXN *));
+ */
+int
+__db_s_done(sdbp, txn)
+ DB *sdbp;
+ DB_TXN *txn;
+{
+ DB *pdbp;
+ ENV *env;
+ int doclose, ret;
+
+ pdbp = sdbp->s_primary;
+ env = pdbp->env;
+ doclose = 0;
+
+ MUTEX_LOCK(env, pdbp->mutex);
+ DB_ASSERT(env, sdbp->s_refcnt != 0);
+ if (--sdbp->s_refcnt == 0) {
+ LIST_REMOVE(sdbp, s_links);
+ doclose = 1;
+ }
+ MUTEX_UNLOCK(env, pdbp->mutex);
+
+ if (doclose == 0)
+ ret = 0;
+ else
+ ret = __db_close(sdbp, txn, 0);
+ return (ret);
+}
+
+/*
+ * __db_s_count --
+ * Count the number of secondaries associated with a given primary.
+ */
+static int
+__db_s_count(pdbp)
+ DB *pdbp;
+{
+ DB *sdbp;
+ ENV *env;
+ int count;
+
+ env = pdbp->env;
+ count = 0;
+
+ MUTEX_LOCK(env, pdbp->mutex);
+ for (sdbp = LIST_FIRST(&pdbp->s_secondaries);
+ sdbp != NULL;
+ sdbp = LIST_NEXT(sdbp, s_links))
+ ++count;
+ MUTEX_UNLOCK(env, pdbp->mutex);
+
+ return (count);
+}
+
+/*
+ * __db_buildpartial --
+ * Build the record that will result after a partial put is applied to
+ * an existing record.
+ *
+ * This should probably be merged with __bam_build, but that requires
+ * a little trickery if we plan to keep the overflow-record optimization
+ * in that function.
+ *
+ * PUBLIC: int __db_buildpartial __P((DB *, DBT *, DBT *, DBT *));
+ */
+int
+__db_buildpartial(dbp, oldrec, partial, newrec)
+ DB *dbp;
+ DBT *oldrec, *partial, *newrec;
+{
+ ENV *env;
+ u_int32_t len, nbytes;
+ u_int8_t *buf;
+ int ret;
+
+ env = dbp->env;
+
+ DB_ASSERT(env, F_ISSET(partial, DB_DBT_PARTIAL));
+
+ memset(newrec, 0, sizeof(DBT));
+
+ nbytes = __db_partsize(oldrec->size, partial);
+ newrec->size = nbytes;
+
+ if ((ret = __os_malloc(env, nbytes, &buf)) != 0)
+ return (ret);
+ newrec->data = buf;
+
+	/* NUL out or pad the buffer for any part that isn't specified. */
+ memset(buf,
+ F_ISSET(dbp, DB_AM_FIXEDLEN) ? ((BTREE *)dbp->bt_internal)->re_pad :
+ 0, nbytes);
+
+ /* Copy in any leading data from the original record. */
+ memcpy(buf, oldrec->data,
+ partial->doff > oldrec->size ? oldrec->size : partial->doff);
+
+ /* Copy the data from partial. */
+ memcpy(buf + partial->doff, partial->data, partial->size);
+
+ /* Copy any trailing data from the original record. */
+ len = partial->doff + partial->dlen;
+ if (oldrec->size > len)
+ memcpy(buf + partial->doff + partial->size,
+ (u_int8_t *)oldrec->data + len, oldrec->size - len);
+
+ return (0);
+}
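+
+/*
+ * (Editor's note) A worked example: with oldrec = "ABCDEFGH" (size 8) and
+ * a partial DBT where doff = 2, dlen = 3, size = 2, data = "xy", the new
+ * size is 8 - 3 + 2 = 7; "AB" is copied in, "xy" lands at offset 2, and
+ * the trailing bytes from old offset 5 ("FGH") follow, yielding "ABxyFGH".
+ */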
+
+/*
+ * __db_partsize --
+ * Given the number of bytes in an existing record and a DBT that
+ * is about to be partial-put, calculate the size of the record
+ * after the put.
+ *
+ * This code is called from __bam_partsize.
+ *
+ * PUBLIC: u_int32_t __db_partsize __P((u_int32_t, DBT *));
+ */
+u_int32_t
+__db_partsize(nbytes, data)
+ u_int32_t nbytes;
+ DBT *data;
+{
+
+ /*
+ * There are really two cases here:
+ *
+ * Case 1: We are replacing some bytes that do not exist (i.e., they
+ * are past the end of the record). In this case the number of bytes
+ * we are replacing is irrelevant and all we care about is how many
+ * bytes we are going to add from offset. So, the new record length
+ * is going to be the size of the new bytes (size) plus wherever those
+ * new bytes begin (doff).
+ *
+ * Case 2: All the bytes we are replacing exist. Therefore, the new
+ * size is the oldsize (nbytes) minus the bytes we are replacing (dlen)
+ * plus the bytes we are adding (size).
+ */
+ if (nbytes < data->doff + data->dlen) /* Case 1 */
+ return (data->doff + data->size);
+
+ return (nbytes + data->size - data->dlen); /* Case 2 */
+}
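+
+/*
+ * (Editor's note) Numeric examples: with nbytes = 10, doff = 20, dlen = 5,
+ * size = 3, the replaced bytes lie past the end of the record, so the new
+ * size is doff + size = 23 (case 1).  With nbytes = 10, doff = 2, dlen = 5,
+ * size = 3, all replaced bytes exist, so the result is 10 - 5 + 3 = 8
+ * (case 2).
+ */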
+
+#ifdef DIAGNOSTIC
+/*
+ * __db_check_skeyset --
+ * Diagnostic check that the application's callback returns a set of
+ * secondary keys without repeats.
+ *
+ * PUBLIC: #ifdef DIAGNOSTIC
+ * PUBLIC: void __db_check_skeyset __P((DB *, DBT *));
+ * PUBLIC: #endif
+ */
+void
+__db_check_skeyset(sdbp, skeyp)
+ DB *sdbp;
+ DBT *skeyp;
+{
+ DBT *first_key, *last_key, *key1, *key2;
+ ENV *env;
+
+ env = sdbp->env;
+
+ first_key = (DBT *)skeyp->data;
+ last_key = first_key + skeyp->size;
+ for (key1 = first_key; key1 < last_key; key1++)
+ for (key2 = key1 + 1; key2 < last_key; key2++)
+ DB_ASSERT(env,
+ ((BTREE *)sdbp->bt_internal)->bt_compare(sdbp,
+ key1, key2) != 0);
+}
+#endif
diff --git a/src/db/db_cds.c b/src/db/db_cds.c
new file mode 100644
index 00000000..185d5487
--- /dev/null
+++ b/src/db/db_cds.c
@@ -0,0 +1,201 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2000, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/lock.h"
+#include "dbinc/txn.h"
+
+static int __cdsgroup_abort __P((DB_TXN *txn));
+static int __cdsgroup_commit __P((DB_TXN *txn, u_int32_t flags));
+static int __cdsgroup_discard __P((DB_TXN *txn, u_int32_t flags));
+static u_int32_t __cdsgroup_id __P((DB_TXN *txn));
+static int __cdsgroup_notsup __P((ENV *env, const char *meth));
+static int __cdsgroup_prepare __P((DB_TXN *txn, u_int8_t *gid));
+static int __cdsgroup_get_name __P((DB_TXN *txn, const char **namep));
+static int __cdsgroup_set_name __P((DB_TXN *txn, const char *name));
+static int __cdsgroup_set_timeout
+ __P((DB_TXN *txn, db_timeout_t timeout, u_int32_t flags));
+
+/*
+ * __cdsgroup_notsup --
+ * Error when CDS groups don't support a method.
+ */
+static int
+__cdsgroup_notsup(env, meth)
+ ENV *env;
+ const char *meth;
+{
+ __db_errx(env, DB_STR_A("0687", "CDS groups do not support %s", "%s"),
+ meth);
+ return (DB_OPNOTSUP);
+}
+
+static int
+__cdsgroup_abort(txn)
+ DB_TXN *txn;
+{
+ return (__cdsgroup_notsup(txn->mgrp->env, "abort"));
+}
+
+static int
+__cdsgroup_commit(txn, flags)
+ DB_TXN *txn;
+ u_int32_t flags;
+{
+ DB_LOCKER *locker;
+ DB_LOCKREQ lreq;
+ ENV *env;
+ int ret, t_ret;
+
+ COMPQUIET(flags, 0);
+ env = txn->mgrp->env;
+
+ /* Check for live cursors. */
+ if (txn->cursors != 0) {
+ __db_errx(env, DB_STR("0688", "CDS group has active cursors"));
+ return (EINVAL);
+ }
+
+ /* We may be holding handle locks; release them. */
+ lreq.op = DB_LOCK_PUT_ALL;
+ lreq.obj = NULL;
+ ret = __lock_vec(env, txn->locker, 0, &lreq, 1, NULL);
+
+	locker = txn->locker;
+ __os_free(env, txn->mgrp);
+ __os_free(env, txn);
+ if ((t_ret = __lock_id_free(env, locker)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+static int
+__cdsgroup_discard(txn, flags)
+ DB_TXN *txn;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+ return (__cdsgroup_notsup(txn->mgrp->env, "discard"));
+}
+
+static u_int32_t
+__cdsgroup_id(txn)
+ DB_TXN *txn;
+{
+ return (txn->txnid);
+}
+
+static int
+__cdsgroup_prepare(txn, gid)
+ DB_TXN *txn;
+ u_int8_t *gid;
+{
+ COMPQUIET(gid, NULL);
+ return (__cdsgroup_notsup(txn->mgrp->env, "prepare"));
+}
+
+static int
+__cdsgroup_get_name(txn, namep)
+ DB_TXN *txn;
+ const char **namep;
+{
+ COMPQUIET(namep, NULL);
+ return (__cdsgroup_notsup(txn->mgrp->env, "get_name"));
+}
+
+static int
+__cdsgroup_set_name(txn, name)
+ DB_TXN *txn;
+ const char *name;
+{
+ COMPQUIET(name, NULL);
+ return (__cdsgroup_notsup(txn->mgrp->env, "set_name"));
+}
+
+static int
+__cdsgroup_set_timeout(txn, timeout, flags)
+ DB_TXN *txn;
+ db_timeout_t timeout;
+ u_int32_t flags;
+{
+ COMPQUIET(timeout, 0);
+ COMPQUIET(flags, 0);
+ return (__cdsgroup_notsup(txn->mgrp->env, "set_timeout"));
+}
+
+/*
+ * PUBLIC: int __cdsgroup_begin __P((ENV *, DB_TXN **));
+ */
+int
+__cdsgroup_begin(env, txnpp)
+ ENV *env;
+ DB_TXN **txnpp;
+{
+ DB_TXN *txn;
+ int ret;
+
+ *txnpp = txn = NULL;
+ if ((ret = __os_calloc(env, 1, sizeof(DB_TXN), &txn)) != 0)
+ goto err;
+ /*
+ * We need a dummy DB_TXNMGR -- it's the only way to get from a
+ * transaction handle to the environment handle.
+ */
+ if ((ret = __os_calloc(env, 1, sizeof(DB_TXNMGR), &txn->mgrp)) != 0)
+ goto err;
+ txn->mgrp->env = env;
+
+ if ((ret = __lock_id(env, &txn->txnid, &txn->locker)) != 0)
+ goto err;
+
+ txn->flags = TXN_FAMILY;
+ txn->abort = __cdsgroup_abort;
+ txn->commit = __cdsgroup_commit;
+ txn->discard = __cdsgroup_discard;
+ txn->id = __cdsgroup_id;
+ txn->prepare = __cdsgroup_prepare;
+ txn->get_name = __cdsgroup_get_name;
+ txn->set_name = __cdsgroup_set_name;
+ txn->set_timeout = __cdsgroup_set_timeout;
+
+ *txnpp = txn;
+
+ if (0) {
+err: if (txn != NULL) {
+ if (txn->mgrp != NULL)
+ __os_free(env, txn->mgrp);
+ __os_free(env, txn);
+ }
+ }
+ return (ret);
+}
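+
+/*
+ * Editor's note: the "if (0) { err: ... }" construct above is a
+ * cleanup idiom used throughout this code: the error label is only
+ * reachable by goto, so the success path skips the cleanup without
+ * needing a second return. A minimal sketch of the pattern (the
+ * helper names are placeholders):
+ *
+ *    if ((ret = step1()) != 0)
+ *        goto err;
+ *    if ((ret = step2()) != 0)
+ *        goto err;
+ *    *resultp = result;
+ *    if (0) {
+ *err:      undo_partial_work();
+ *    }
+ *    return (ret);
+ */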
+
+/*
+ * __cdsgroup_begin_pp --
+ * DB_ENV->cdsgroup_begin
+ *
+ * PUBLIC: int __cdsgroup_begin_pp __P((DB_ENV *, DB_TXN **));
+ */
+int
+__cdsgroup_begin_pp(dbenv, txnpp)
+ DB_ENV *dbenv;
+ DB_TXN **txnpp;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_BEFORE_OPEN(env, "cdsgroup_begin");
+ if (!CDB_LOCKING(env))
+ return (__env_not_config(env, "cdsgroup_begin", DB_INIT_CDB));
+
+ ENV_ENTER(env, ip);
+ ret = __cdsgroup_begin(env, txnpp);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
diff --git a/src/db/db_compact.c b/src/db/db_compact.c
new file mode 100644
index 00000000..d0f4801e
--- /dev/null
+++ b/src/db/db_compact.c
@@ -0,0 +1,1087 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+#include "dbinc/fop.h"
+#ifdef HAVE_FTRUNCATE
+static int __db_free_freelist __P((DB *, DB_THREAD_INFO *, DB_TXN *));
+static int __db_setup_freelist __P((DB *, db_pglist_t *, u_int32_t));
+#endif
+
+#define SAVE_START \
+ do { \
+ save_data = *c_data; \
+ ret = __db_retcopy(env, \
+ &save_start, current.data, current.size, \
+ &save_start.data, &save_start.ulen); \
+ } while (0)
+
+/*
+ * Only restore those things that are negated by aborting the
+ * transaction. We don't restore the number of deadlocks, for example.
+ */
+
+#define RESTORE_START \
+ do { \
+ c_data->compact_pages_free = \
+ save_data.compact_pages_free; \
+ c_data->compact_levels = save_data.compact_levels; \
+ c_data->compact_truncate = save_data.compact_truncate; \
+ c_data->compact_empty_buckets = \
+ save_data.compact_empty_buckets; \
+ ret = __db_retcopy(env, &current, \
+ save_start.data, save_start.size, \
+ &current.data, &current.ulen); \
+ } while (0)
+
+/*
+ * __db_compact_int -- compact a database.
+ *
+ * PUBLIC: int __db_compact_int __P((DB *, DB_THREAD_INFO *, DB_TXN *,
+ * PUBLIC: DBT *, DBT *, DB_COMPACT *, u_int32_t, DBT *));
+ */
+int
+__db_compact_int(dbp, ip, txn, start, stop, c_data, flags, end)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ DBT *start, *stop;
+ DB_COMPACT *c_data;
+ u_int32_t flags;
+ DBT *end;
+{
+ DBC *dbc;
+ DBT current, save_start;
+ DB_COMPACT save_data;
+ DB_TXN *txn_orig;
+ ENV *env;
+ u_int32_t empty_buckets, factor, retry;
+ int deadlock, have_freelist, isdone, ret, span, t_ret, txn_local;
+
+#ifdef HAVE_FTRUNCATE
+ db_pglist_t *list;
+ db_pgno_t last_pgno;
+ u_int32_t nelems, truncated;
+#endif
+
+ env = dbp->env;
+
+ memset(&current, 0, sizeof(current));
+ memset(&save_start, 0, sizeof(save_start));
+ dbc = NULL;
+ factor = 0;
+ have_freelist = deadlock = isdone = span = 0;
+ ret = retry = 0;
+ txn_orig = txn;
+
+#ifdef HAVE_FTRUNCATE
+ list = NULL;
+ last_pgno = 0;
+ nelems = truncated = 0;
+#endif
+
+ /*
+ * We pass "current" to the internal routine, indicating where that
+ * routine should begin its work and expecting that it will return to
+ * us the last key that it processed.
+ */
+ if (start != NULL && (ret = __db_retcopy(env,
+ &current, start->data, start->size,
+ &current.data, &current.ulen)) != 0)
+ return (ret);
+
+ empty_buckets = c_data->compact_empty_buckets;
+
+ if (IS_DB_AUTO_COMMIT(dbp, txn)) {
+ txn_local = 1;
+ LF_SET(DB_AUTO_COMMIT);
+ } else
+ txn_local = 0;
+ if (!LF_ISSET(DB_FREE_SPACE | DB_FREELIST_ONLY))
+ goto no_free;
+ if (LF_ISSET(DB_FREELIST_ONLY))
+ LF_SET(DB_FREE_SPACE);
+
+#ifdef HAVE_FTRUNCATE
+ /* Sort the freelist and set up the in-memory list representation. */
+ if (txn_local && (ret = __txn_begin(env, ip, txn_orig, &txn, 0)) != 0)
+ goto err;
+
+ if ((ret = __db_free_truncate(dbp, ip,
+ txn, flags, c_data, &list, &nelems, &last_pgno)) != 0) {
+ LF_CLR(DB_FREE_SPACE);
+ goto terr;
+ }
+
+ /* If the freelist is empty and we are not filling, get out. */
+ if (nelems == 0 && LF_ISSET(DB_FREELIST_ONLY)) {
+ ret = 0;
+ LF_CLR(DB_FREE_SPACE);
+ goto terr;
+ }
+ if ((ret = __db_setup_freelist(dbp, list, nelems)) != 0) {
+ /* Someone else owns the free list. */
+ if (ret == EBUSY)
+ ret = 0;
+ }
+ if (ret == 0)
+ have_freelist = 1;
+
+ /* Commit the txn and release the meta page lock. */
+terr: if (txn_local) {
+ if ((t_ret = __txn_commit(txn, DB_TXN_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+ txn = NULL;
+ }
+ if (ret != 0)
+ goto err;
+
+ /* Save the number truncated so far, we will add what we get below. */
+ truncated = c_data->compact_pages_truncated;
+ if (LF_ISSET(DB_FREELIST_ONLY))
+ goto done;
+#endif
+
+ /*
+ * We want factor to be the target number of free bytes on each page,
+ * so we know when to stop adding items to a page. Make sure to
+ * subtract the page overhead when computing this target. This can
+ * result in a 1-2% error on the smallest page.
+ * First figure out how many bytes we should use:
+ */
+no_free:
+ factor = dbp->pgsize - SIZEOF_PAGE;
+ if (c_data->compact_fillpercent != 0) {
+ factor *= c_data->compact_fillpercent;
+ factor /= 100;
+ }
+ /* Now convert to the number of free bytes to target. */
+ factor = (dbp->pgsize - SIZEOF_PAGE) - factor;
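+	/*
+	 * Editor's example (illustrative numbers, not in the original
+	 * source): with a 4096-byte page and a SIZEOF_PAGE of 26, usable
+	 * space is 4070 bytes; with compact_fillpercent 90 the fill
+	 * target is 3663, so factor becomes 4070 - 3663 = 407 free bytes
+	 * to leave on each page.
+	 */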
+
+ if (c_data->compact_pages == 0)
+ c_data->compact_pages = DB_MAX_PAGES;
+
+ do {
+ deadlock = 0;
+
+ SAVE_START;
+ if (ret != 0)
+ break;
+
+ if (txn_local) {
+ if ((ret =
+ __txn_begin(env, ip, txn_orig, &txn, 0)) != 0)
+ break;
+
+ if (c_data->compact_timeout != 0 &&
+ (ret = __txn_set_timeout(txn,
+ c_data->compact_timeout, DB_SET_LOCK_TIMEOUT)) != 0)
+ goto err;
+ }
+
+ if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0)
+ goto err;
+
+#ifdef HAVE_HASH
+ if (dbp->type == DB_HASH)
+ ret = __ham_compact_int(dbc,
+ &current, stop, factor, c_data, &isdone, flags);
+ else
+#endif
+ ret = __bam_compact_int(dbc, &current, stop, factor,
+ &span, c_data, &isdone);
+ if (ret == DB_LOCK_DEADLOCK && txn_local) {
+ /*
+ * We retry on deadlock. Cancel the statistics
+ * and reset the start point to before this
+ * iteration.
+ */
+ deadlock = 1;
+ c_data->compact_deadlock++;
+ RESTORE_START;
+ }
+ /*
+ * If we could not get a lock while holding an internal
+		 * node latched, commit the current local transaction;
+		 * otherwise report a deadlock.
+ */
+ if (ret == DB_LOCK_NOTGRANTED) {
+ if (txn_local || retry++ < 5)
+ ret = 0;
+ else
+ ret = DB_LOCK_DEADLOCK;
+ } else
+ retry = 0;
+
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+err: if (txn_local && txn != NULL) {
+ if (ret == 0 && deadlock == 0)
+ ret = __txn_commit(txn, DB_TXN_NOSYNC);
+ else if ((t_ret = __txn_abort(txn)) != 0 && ret == 0)
+ ret = t_ret;
+ txn = NULL;
+ }
+ DB_ASSERT(env, ip == NULL || ip->dbth_pincount == 0);
+ } while (ret == 0 && !isdone);
+
+ if (ret == 0 && end != NULL)
+ ret = __db_retcopy(env, end, current.data, current.size,
+ &end->data, &end->ulen);
+ if (current.data != NULL)
+ __os_free(env, current.data);
+ if (save_start.data != NULL)
+ __os_free(env, save_start.data);
+
+#ifdef HAVE_FTRUNCATE
+ /*
+ * Finish up truncation work. If there are pages left in the free
+ * list we can try to move the internal structures around so that we
+ * can remove more pages from the file.
+	 * For BTREE, search the internal nodes of the tree, as we may have
+	 * missed some while walking the leaf nodes.
+	 * For HASH, we compact the hash table itself, moving segments
+	 * to lower-numbered pages where possible.
+ * Then calculate how many pages we have truncated and release
+ * the in-memory free list.
+ */
+done: if (LF_ISSET(DB_FREE_SPACE)) {
+ DBMETA *meta;
+ db_pgno_t pgno;
+
+ pgno = PGNO_BASE_MD;
+ isdone = 1;
+ if (ret == 0 && !LF_ISSET(DB_FREELIST_ONLY) &&
+ __memp_fget(dbp->mpf, &pgno, ip, txn, 0, &meta) == 0) {
+ isdone = meta->free == PGNO_INVALID;
+ ret = __memp_fput(dbp->mpf, ip, meta, dbp->priority);
+ }
+
+#ifdef HAVE_HASH
+ if (dbp->type == DB_HASH) {
+ c_data->compact_empty_buckets -= empty_buckets;
+ if (!isdone || c_data->compact_empty_buckets != 0)
+ ret = __ham_compact_hash(dbp,
+ ip, txn_orig, c_data);
+ c_data->compact_empty_buckets += empty_buckets;
+ } else
+#endif
+ if (!isdone)
+ ret = __bam_truncate_ipages(dbp, ip, txn_orig, c_data);
+
+ /* Clean up the free list. */
+ if (list != NULL)
+ __os_free(env, list);
+
+ if ((t_ret =
+ __memp_fget(dbp->mpf, &pgno, ip, txn, 0, &meta)) == 0) {
+ c_data->compact_pages_truncated =
+ truncated + last_pgno - meta->last_pgno;
+ if ((t_ret = __memp_fput(dbp->mpf, ip,
+ meta, dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ } else if (ret == 0)
+ ret = t_ret;
+
+ if (have_freelist && (t_ret =
+ __db_free_freelist(dbp, ip, txn_orig)) != 0 && ret == 0)
+			ret = t_ret;
+ }
+#endif
+
+ return (ret);
+}
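+
+/*
+ * Editor's note: the driver loop above follows a save/try/restore
+ * shape that generalizes as the following pseudocode (a simplified
+ * sketch, not the original control flow in full detail):
+ *
+ *    do {
+ *        save_progress();                -- SAVE_START
+ *        begin_txn();
+ *        ret = compact_one_chunk(&isdone);
+ *        if (ret == DB_LOCK_DEADLOCK) {
+ *            restore_progress();         -- RESTORE_START
+ *            abort_txn();
+ *            ret = 0;                    -- retry the chunk
+ *        } else
+ *            commit_txn();
+ *    } while (ret == 0 && !isdone);
+ */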
+
+#ifdef HAVE_FTRUNCATE
+static int
+__db_setup_freelist(dbp, list, nelems)
+ DB *dbp;
+ db_pglist_t *list;
+ u_int32_t nelems;
+{
+ DB_MPOOLFILE *mpf;
+ db_pgno_t *plist;
+ int ret;
+
+ mpf = dbp->mpf;
+
+ if ((ret = __memp_alloc_freelist(mpf, nelems, &plist)) != 0)
+ return (ret);
+
+ while (nelems-- != 0)
+ *plist++ = list++->pgno;
+
+ return (0);
+}
+
+static int
+__db_free_freelist(dbp, ip, txn)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+{
+ DBC *dbc;
+ DB_LOCK lock;
+ int auto_commit, ret, t_ret;
+
+ LOCK_INIT(lock);
+ auto_commit = ret = 0;
+
+ /*
+ * If we are not in a transaction then we need to get
+ * a lock on the meta page, otherwise we should already
+ * have the lock.
+ */
+
+ dbc = NULL;
+ if (IS_DB_AUTO_COMMIT(dbp, txn)) {
+ /*
+ * We must not timeout the lock or we will not free the list.
+ * We ignore errors from txn_begin as there is little that
+ * the application can do with the error and we want to
+ * get the lock and free the list if at all possible.
+ */
+ if (__txn_begin(dbp->env, ip, txn, &txn, 0) == 0) {
+ (void)__lock_set_timeout(dbp->env,
+ txn->locker, 0, DB_SET_TXN_TIMEOUT);
+ (void)__lock_set_timeout(dbp->env,
+ txn->locker, 0, DB_SET_LOCK_TIMEOUT);
+ auto_commit = 1;
+ }
+ /* Get a cursor so we can call __db_lget. */
+ if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0)
+ return (ret);
+
+ if ((ret = __db_lget(dbc,
+ 0, PGNO_BASE_MD, DB_LOCK_WRITE, 0, &lock)) != 0)
+ goto err;
+ }
+
+ ret = __memp_free_freelist(dbp->mpf);
+
+err: if (dbc != NULL && (t_ret = __LPUT(dbc, lock)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (auto_commit && (t_ret = __txn_abort(txn)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+#endif
+
+/*
+ * __db_exchange_page -- swap a page with a lower numbered page.
+ * The routine will optionally free the higher numbered page. The cursor
+ * has a stack which includes at least the immediate parent of this page.
+ * PUBLIC: int __db_exchange_page __P((DBC *, PAGE **, PAGE *, db_pgno_t, int));
+ */
+int
+__db_exchange_page(dbc, pgp, opg, newpgno, flags)
+ DBC *dbc;
+ PAGE **pgp, *opg;
+ db_pgno_t newpgno;
+ int flags;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DBT data, *dp, hdr;
+ DB_LSN lsn;
+ DB_LOCK lock;
+ EPG *epg;
+ PAGE *newpage;
+ db_pgno_t oldpgno, *pgnop;
+ int ret;
+
+ DB_ASSERT(NULL, dbc != NULL);
+ dbp = dbc->dbp;
+ LOCK_INIT(lock);
+
+ /*
+ * We want to free a page that lives in the part of the file that
+ * can be truncated, so we're going to move it onto a free page
+ * that is in the part of the file that need not be truncated.
+ * In the case of compacting hash table segments the caller already
+	 * identified a contiguous set of pages to use. Otherwise,
+	 * since the freelist is now sorted, we can simply call __db_new,
+	 * which will grab the first element off the freelist; we know this
+	 * is the lowest-numbered free page.
+ */
+ if (newpgno != PGNO_INVALID) {
+ if ((ret = __memp_fget(dbp->mpf, &newpgno,
+ dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &newpage)) != 0)
+ return (ret);
+ } else if ((ret = __db_new(dbc, P_DONTEXTEND | TYPE(*pgp),
+ STD_LOCKING(dbc) && TYPE(*pgp) != P_OVERFLOW ? &lock : NULL,
+ &newpage)) != 0)
+ return (ret);
+
+ /*
+ * If newpage is null then __db_new would have had to allocate
+ * a new page from the filesystem, so there is no reason
+ * to continue this action.
+ */
+ if (newpage == NULL)
+ return (0);
+
+ /*
+ * It is possible that a higher page is allocated if other threads
+ * are allocating at the same time, if so, just put it back.
+ */
+ if (PGNO(newpage) > PGNO(*pgp)) {
+		/* It's unfortunate, but you can't just free a new overflow. */
+ if (TYPE(newpage) == P_OVERFLOW)
+ OV_LEN(newpage) = 0;
+ if ((ret = __LPUT(dbc, lock)) != 0)
+ return (ret);
+ return (__db_free(dbc, newpage, 0));
+ }
+
+ /* Log if necessary. */
+ if (DBC_LOGGING(dbc)) {
+ memset(&hdr, 0, sizeof(hdr));
+ hdr.data = *pgp;
+ hdr.size = P_OVERHEAD(dbp);
+ memset(&data, 0, sizeof(data));
+ dp = &data;
+ switch (TYPE(*pgp)) {
+ case P_OVERFLOW:
+ data.data = (u_int8_t *)*pgp + P_OVERHEAD(dbp);
+ data.size = OV_LEN(*pgp);
+ break;
+ case P_BTREEMETA:
+ hdr.size = sizeof(BTMETA);
+ dp = NULL;
+ break;
+ case P_HASHMETA:
+ hdr.size = sizeof(HMETA);
+ dp = NULL;
+ break;
+ default:
+ data.data = (u_int8_t *)*pgp + HOFFSET(*pgp);
+ data.size = dbp->pgsize - HOFFSET(*pgp);
+ hdr.size += NUM_ENT(*pgp) * sizeof(db_indx_t);
+ }
+ if ((ret = __db_merge_log(dbp, dbc->txn,
+ &LSN(newpage), 0, PGNO(newpage), &LSN(newpage),
+ PGNO(*pgp), &LSN(*pgp), &hdr, dp, 1)) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(newpage));
+
+ oldpgno = PGNO(*pgp);
+ newpgno = PGNO(newpage);
+ lsn = LSN(newpage);
+ memcpy(newpage, *pgp, dbp->pgsize);
+ PGNO(newpage) = newpgno;
+ LSN(newpage) = lsn;
+
+ /* Empty the old page. */
+ if ((ret = __memp_dirty(dbp->mpf,
+ pgp, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ goto err;
+ if (TYPE(*pgp) == P_OVERFLOW)
+ OV_LEN(*pgp) = 0;
+ else {
+ HOFFSET(*pgp) = dbp->pgsize;
+ NUM_ENT(*pgp) = 0;
+ }
+ LSN(*pgp) = lsn;
+
+ /* Update siblings. */
+ switch (TYPE(newpage)) {
+ case P_OVERFLOW:
+ case P_LBTREE:
+ case P_LRECNO:
+ case P_LDUP:
+ case P_HASH:
+ if (NEXT_PGNO(newpage) == PGNO_INVALID &&
+ PREV_PGNO(newpage) == PGNO_INVALID)
+ break;
+ if ((ret = __db_relink(dbc, *pgp, opg, PGNO(newpage))) != 0)
+ goto err;
+ break;
+ default:
+ break;
+ }
+
+ /*
+ * For HASH we may reuse the old page for an even higher numbered
+ * page. Otherwise we free the old page.
+ */
+ if (!LF_ISSET(DB_EXCH_FREE)) {
+ NEXT_PGNO(*pgp) = PREV_PGNO(*pgp) = PGNO_INVALID;
+ ret = __memp_fput(dbp->mpf,
+ dbc->thread_info, *pgp, dbc->priority);
+ } else
+ ret = __db_free(dbc, *pgp, 0);
+ *pgp = newpage;
+
+ if (ret != 0)
+ return (ret);
+
+ if (!LF_ISSET(DB_EXCH_PARENT))
+ goto done;
+
+ /* Update the parent. */
+ cp = (BTREE_CURSOR *)dbc->internal;
+ epg = &cp->csp[-1];
+
+ switch (TYPE(epg->page)) {
+ case P_IBTREE:
+ pgnop = &GET_BINTERNAL(dbp, epg->page, epg->indx)->pgno;
+ break;
+ case P_IRECNO:
+ pgnop = &GET_RINTERNAL(dbp, epg->page, epg->indx)->pgno;
+ break;
+ case P_LBTREE:
+ case P_LRECNO:
+ case P_LDUP:
+ pgnop = &GET_BOVERFLOW(dbp, epg->page, epg->indx)->pgno;
+ break;
+ default:
+ return (__db_pgfmt(dbp->env, PGNO(epg->page)));
+ }
+ DB_ASSERT(dbp->env, oldpgno == *pgnop);
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __db_pgno_log(dbp, dbc->txn, &LSN(epg->page),
+ 0, PGNO(epg->page), &LSN(epg->page), (u_int32_t)epg->indx,
+ *pgnop, PGNO(newpage))) != 0)
+ return (ret);
+ } else
+ LSN_NOT_LOGGED(LSN(epg->page));
+
+ *pgnop = PGNO(newpage);
+ cp->csp->page = newpage;
+ if ((ret = __TLPUT(dbc, lock)) != 0)
+ return (ret);
+
+done: return (0);
+
+err: (void)__memp_fput(dbp->mpf, dbc->thread_info, newpage, dbc->priority);
+ (void)__TLPUT(dbc, lock);
+ return (ret);
+}
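+
+/*
+ * Editor's note: stripped of logging, locking and error paths, the
+ * swap above is "copy the image, keep the destination's identity"
+ * (sketch only):
+ *
+ *    pgno = PGNO(newpage);               -- save identity
+ *    lsn = LSN(newpage);
+ *    memcpy(newpage, oldpage, pgsize);   -- take the old image
+ *    PGNO(newpage) = pgno;               -- restore identity
+ *    LSN(newpage) = lsn;
+ *
+ * after which the old page is emptied and freed, sibling links are
+ * repaired via __db_relink, and the parent's child pointer is updated.
+ */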
+
+/*
+ * __db_truncate_overflow -- find overflow pages to truncate.
+ * Walk the pages of an overflow chain and swap out
+ * high numbered pages. We are passed the first page
+ * but only deal with the second and subsequent pages.
+ * PUBLIC: int __db_truncate_overflow __P((DBC *,
+ * PUBLIC: db_pgno_t, PAGE **, DB_COMPACT *));
+ */
+int
+__db_truncate_overflow(dbc, pgno, ppg, c_data)
+ DBC *dbc;
+ db_pgno_t pgno;
+ PAGE **ppg;
+ DB_COMPACT *c_data;
+{
+ DB *dbp;
+ DB_LOCK lock;
+ PAGE *page;
+ db_pgno_t ppgno;
+ int have_lock, ret, t_ret;
+
+ dbp = dbc->dbp;
+ page = NULL;
+ LOCK_INIT(lock);
+ have_lock = ppg == NULL;
+
+ if ((ret = __memp_fget(dbp->mpf, &pgno,
+ dbc->thread_info, dbc->txn, 0, &page)) != 0)
+ return (ret);
+
+ while ((pgno = NEXT_PGNO(page)) != PGNO_INVALID) {
+ if ((ret = __memp_fput(dbp->mpf,
+ dbc->thread_info, page, dbc->priority)) != 0)
+ return (ret);
+ if ((ret = __memp_fget(dbp->mpf, &pgno,
+ dbc->thread_info, dbc->txn, 0, &page)) != 0)
+ return (ret);
+ if (pgno <= c_data->compact_truncate)
+ continue;
+ if (have_lock == 0) {
+ DB_ASSERT(dbp->env, ppg != NULL);
+ ppgno = PGNO(*ppg);
+ if ((ret = __memp_fput(dbp->mpf, dbc->thread_info,
+ *ppg, dbc->priority)) != 0)
+ goto err;
+ *ppg = NULL;
+ if ((ret = __db_lget(dbc, 0, ppgno,
+ DB_LOCK_WRITE, 0, &lock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(dbp->mpf, &ppgno,
+ dbc->thread_info,
+ dbc->txn, DB_MPOOL_DIRTY, ppg)) != 0)
+ goto err;
+ have_lock = 1;
+ }
+ if ((ret = __db_exchange_page(dbc,
+ &page, NULL, PGNO_INVALID, DB_EXCH_FREE)) != 0)
+ break;
+ }
+
+err: if (page != NULL &&
+	    (t_ret = __memp_fput(dbp->mpf,
+ dbc->thread_info, page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __TLPUT(dbc, lock)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __db_truncate_root -- swap a root page for a lower numbered page.
+ * PUBLIC: int __db_truncate_root __P((DBC *,
+ * PUBLIC: PAGE *, u_int32_t, db_pgno_t *, u_int32_t));
+ */
+int
+__db_truncate_root(dbc, ppg, indx, pgnop, tlen)
+ DBC *dbc;
+ PAGE *ppg;
+ u_int32_t indx;
+ db_pgno_t *pgnop;
+ u_int32_t tlen;
+{
+ DB *dbp;
+ DBT orig;
+ PAGE *page;
+ int ret, t_ret;
+ db_pgno_t newpgno;
+
+ dbp = dbc->dbp;
+
+ DB_ASSERT(dbc->dbp->env, IS_DIRTY(ppg));
+ if ((ret = __memp_fget(dbp->mpf, pgnop,
+ dbc->thread_info, dbc->txn, 0, &page)) != 0)
+ goto err;
+
+ /*
+	 * If this is a multiply referenced overflow key, then we will just
+ * copy it and decrement the reference count. This is part of a
+ * fix to get rid of multiple references.
+ */
+ if (TYPE(page) == P_OVERFLOW && OV_REF(page) > 1) {
+ COMPQUIET(newpgno, 0);
+ if ((ret = __db_ovref(dbc, *pgnop)) != 0)
+ goto err;
+ memset(&orig, 0, sizeof(orig));
+ if ((ret = __db_goff(dbc, &orig, tlen, *pgnop,
+ &orig.data, &orig.size)) == 0)
+ ret = __db_poff(dbc, &orig, &newpgno);
+ if (orig.data != NULL)
+ __os_free(dbp->env, orig.data);
+ if (ret != 0)
+ goto err;
+ } else {
+ LOCK_CHECK_OFF(dbc->thread_info);
+ ret = __db_exchange_page(dbc,
+ &page, NULL, PGNO_INVALID, DB_EXCH_FREE);
+ LOCK_CHECK_ON(dbc->thread_info);
+ if (ret != 0)
+ goto err;
+ newpgno = PGNO(page);
+		/* If we could not allocate from the free list, give up. */
+ if (newpgno == *pgnop)
+ goto err;
+ }
+
+ /* Update the reference. */
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __db_pgno_log(dbp,
+ dbc->txn, &LSN(ppg), 0, PGNO(ppg),
+ &LSN(ppg), (u_int32_t)indx, *pgnop, newpgno)) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(ppg));
+
+ *pgnop = newpgno;
+
+err: if (page != NULL && (t_ret =
+ __memp_fput(dbp->mpf, dbc->thread_info,
+ page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+#ifdef HAVE_FTRUNCATE
+/*
+ * __db_find_free --
+ *	Find a contiguous "size" range of free pages numbered lower
+ * than the pages starting at "bstart". We can also return a set of pages
+ * that overlaps with the pages at "bstart".
+ * PUBLIC: int __db_find_free __P((DBC *, u_int32_t,
+ * PUBLIC: u_int32_t, db_pgno_t, db_pgno_t *));
+ */
+int
+__db_find_free(dbc, type, size, bstart, freep)
+ DBC *dbc;
+ u_int32_t type;
+ u_int32_t size;
+ db_pgno_t bstart, *freep;
+{
+ DB *dbp;
+ DBMETA *meta;
+ DBT listdbt;
+ DB_LOCK metalock;
+ DB_LSN lsn;
+ DB_MPOOLFILE *mpf;
+ PAGE *page, *freepg;
+ u_int32_t i, j, start, nelems;
+ db_pgno_t *list, next_free, pgno;
+ db_pglist_t *lp, *pglist;
+ int hash, ret, t_ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ nelems = 0;
+ hash = 0;
+ page = NULL;
+ pglist = NULL;
+ meta = NULL;
+ LOCK_INIT(metalock);
+
+#ifdef HAVE_HASH
+ if (dbp->type == DB_HASH) {
+ if ((ret = __ham_return_meta(dbc, DB_MPOOL_DIRTY, &meta)) != 0)
+ return (ret);
+ if (meta != NULL)
+ hash = 1;
+ }
+#endif
+ if (meta == NULL) {
+ pgno = PGNO_BASE_MD;
+ if ((ret = __db_lget(dbc,
+ LCK_ALWAYS, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
+ DB_MPOOL_DIRTY, &meta)) != 0)
+ goto err;
+ }
+
+ if ((ret = __memp_get_freelist(mpf, &nelems, &list)) != 0)
+ goto err;
+
+ if (nelems == 0) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+
+ for (i = 0; i < nelems; i++) {
+ if (list[i] > bstart) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ start = i;
+ if (size == 1)
+ goto found;
+ while (i < nelems - 1 && list[i] + 1 == list[i + 1]) {
+ i++;
+ if (i - start == size - 1)
+ goto found;
+ }
+ if (i - start == size - 1)
+ goto found;
+ /*
+ * If the last set of contiguous free pages we found
+ * are contiguous to the chunk we are trying to move,
+ * then we can slide the allocated chunk back some number
+ * of pages -- figure out how many by calculating the
+ * number of pages before the allocated ones that we have
+ * found in the free list.
+ */
+ if (list[i] == bstart - 1) {
+ size = (i - start) + 1;
+ goto found;
+ }
+ }
+ ret = DB_NOTFOUND;
+ goto err;
+
+found:	/* We have a run of "size" contiguous pages. Remove them. */
+ next_free = i == nelems - 1 ? PGNO_INVALID : list[i + 1];
+ *freep = list[start];
+ if (start == 0) {
+ page = (PAGE *)meta;
+ } else if ((ret = __memp_fget(mpf, &list[start - 1],
+ dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &page)) != 0)
+		goto err;
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __os_malloc(dbp->env,
+ size * sizeof(db_pglist_t), &pglist)) != 0)
+ goto err;
+ lp = pglist;
+ for (j = start; j < start + size; j++, lp++) {
+ if ((ret = __memp_fget(mpf, &list[j],
+ dbc->thread_info, dbc->txn, 0, &freepg)) != 0)
+ goto err;
+ lp->pgno = PGNO(freepg);
+ lp->next_pgno = NEXT_PGNO(freepg);
+ lp->lsn = LSN(freepg);
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, freepg, dbc->priority)) != 0)
+ goto err;
+ }
+ listdbt.size = size * sizeof(*pglist);
+ listdbt.data = pglist;
+ if ((ret = __db_realloc_log(dbp, dbc->txn, &lsn, 0,
+ PGNO(page), &LSN(page), next_free, type, &listdbt)) != 0)
+ goto err;
+ __os_free(dbp->env, pglist);
+ pglist = NULL;
+ } else
+ LSN_NOT_LOGGED(lsn);
+
+ LSN(page) = lsn;
+ if (start == 0)
+ meta->free = next_free;
+ else
+ NEXT_PGNO(page) = next_free;
+
+ if (page != (PAGE *)meta && (ret = __memp_fput(mpf,
+ dbc->thread_info, page, dbc->priority)) != 0)
+ goto err;
+
+ for (j = start; j < start + size; j++) {
+ if ((ret = __memp_fget(mpf,
+ &list[j], dbc->thread_info,
+ dbc->txn, DB_MPOOL_DIRTY, &freepg)) != 0)
+ goto err;
+ P_INIT(freepg, dbp->pgsize,
+ list[j], PGNO_INVALID, PGNO_INVALID, 0, type);
+ LSN(freepg) = lsn;
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, freepg, dbc->priority)) != 0)
+ goto err;
+ }
+
+ if (++i != nelems)
+ memmove(&list[start], &list[i], (nelems - i) * sizeof(*list));
+ if ((ret = __memp_extend_freelist(mpf, nelems - size, &list)) != 0)
+ goto err;
+ if (hash == 0)
+ ret = __memp_fput(mpf, dbc->thread_info, meta, dbc->priority);
+ t_ret = __TLPUT(dbc, metalock);
+
+ return (ret == 0 ? t_ret : ret);
+
+err: if (page != NULL && page != (PAGE *)meta)
+ (void)__memp_fput(mpf, dbc->thread_info, page, dbc->priority);
+ if (pglist != NULL)
+ __os_free(dbp->env, pglist);
+ if (meta != NULL && hash == 0)
+ (void)__memp_fput(mpf, dbc->thread_info, meta, dbc->priority);
+ (void)__TLPUT(dbc, metalock);
+ return (ret);
+}
+#endif
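+
+/*
+ * Editor's sketch (illustration, not part of the original source): the
+ * scan above, reduced to finding a run of "size" consecutive page
+ * numbers in a sorted list, all below "bstart":
+ *
+ *    static int
+ *    find_run(const db_pgno_t *list, u_int32_t n, u_int32_t size,
+ *        db_pgno_t bstart, u_int32_t *startp)
+ *    {
+ *        u_int32_t i, start;
+ *
+ *        for (i = 0; i < n; i++) {
+ *            if (list[i] > bstart)
+ *                return (0);
+ *            start = i;
+ *            while (i < n - 1 && list[i] + 1 == list[i + 1] &&
+ *                i - start < size - 1)
+ *                i++;
+ *            if (i - start == size - 1) {
+ *                *startp = start;
+ *                return (1);
+ *            }
+ *        }
+ *        return (0);
+ *    }
+ *
+ * The original additionally accepts a shorter run that abuts bstart,
+ * since the allocated chunk can then slide down over those pages.
+ */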
+
+/*
+ * __db_relink --
+ * Relink around a deleted page.
+ *	Otherp can be either the previous or the next page to use if
+ *	the caller already holds that page.
+ *
+ * PUBLIC: int __db_relink __P((DBC *, PAGE *, PAGE *, db_pgno_t));
+ */
+int
+__db_relink(dbc, pagep, otherp, new_pgno)
+ DBC *dbc;
+ PAGE *pagep, *otherp;
+ db_pgno_t new_pgno;
+{
+ DB *dbp;
+ DB_LOCK npl, ppl;
+ DB_LSN *nlsnp, *plsnp, ret_lsn;
+ DB_MPOOLFILE *mpf;
+ PAGE *np, *pp;
+ int ret, t_ret;
+
+ dbp = dbc->dbp;
+ np = pp = NULL;
+ LOCK_INIT(npl);
+ LOCK_INIT(ppl);
+ nlsnp = plsnp = NULL;
+ mpf = dbp->mpf;
+ ret = 0;
+
+ /*
+ * Retrieve the one/two pages. The caller must have them locked
+ * because the parent is latched. For a remove, we may need
+ * two pages (the before and after). For an add, we only need one
+	 * because the split took care of the prev.
+ */
+ if (pagep->next_pgno != PGNO_INVALID) {
+ if (((np = otherp) == NULL ||
+ PGNO(otherp) != pagep->next_pgno) &&
+ (ret = __memp_fget(mpf, &pagep->next_pgno,
+ dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &np)) != 0) {
+ ret = __db_pgerr(dbp, pagep->next_pgno, ret);
+ goto err;
+ }
+ nlsnp = &np->lsn;
+ }
+ if (pagep->prev_pgno != PGNO_INVALID) {
+ if (((pp = otherp) == NULL ||
+ PGNO(otherp) != pagep->prev_pgno) &&
+ (ret = __memp_fget(mpf, &pagep->prev_pgno,
+ dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &pp)) != 0) {
+ ret = __db_pgerr(dbp, pagep->prev_pgno, ret);
+ goto err;
+ }
+ plsnp = &pp->lsn;
+ }
+
+ /* Log the change. */
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __db_relink_log(dbp, dbc->txn, &ret_lsn, 0,
+ pagep->pgno, new_pgno, pagep->prev_pgno, plsnp,
+ pagep->next_pgno, nlsnp)) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(ret_lsn);
+ if (np != NULL)
+ np->lsn = ret_lsn;
+ if (pp != NULL)
+ pp->lsn = ret_lsn;
+
+ /*
+ * Modify and release the two pages.
+ */
+ if (np != NULL) {
+ if (new_pgno == PGNO_INVALID)
+ np->prev_pgno = pagep->prev_pgno;
+ else
+ np->prev_pgno = new_pgno;
+ if (np != otherp)
+ ret = __memp_fput(mpf,
+ dbc->thread_info, np, dbc->priority);
+ if ((t_ret = __TLPUT(dbc, npl)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err;
+ }
+
+ if (pp != NULL) {
+ if (new_pgno == PGNO_INVALID)
+ pp->next_pgno = pagep->next_pgno;
+ else
+ pp->next_pgno = new_pgno;
+ if (pp != otherp)
+ ret = __memp_fput(mpf,
+ dbc->thread_info, pp, dbc->priority);
+ if ((t_ret = __TLPUT(dbc, ppl)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err;
+ }
+ return (0);
+
+err: if (np != NULL && np != otherp)
+ (void)__memp_fput(mpf, dbc->thread_info, np, dbc->priority);
+ if (pp != NULL && pp != otherp)
+ (void)__memp_fput(mpf, dbc->thread_info, pp, dbc->priority);
+ return (ret);
+}
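+
+/*
+ * Editor's sketch (not in the original source): ignoring locking,
+ * logging and the otherp shortcut, the pointer surgery is a standard
+ * doubly-linked-list splice, where new_pgno == PGNO_INVALID means
+ * "unlink the page" and any other value means "replace it":
+ *
+ *    if (np != NULL)
+ *        np->prev_pgno = new_pgno == PGNO_INVALID ?
+ *            pagep->prev_pgno : new_pgno;
+ *    if (pp != NULL)
+ *        pp->next_pgno = new_pgno == PGNO_INVALID ?
+ *            pagep->next_pgno : new_pgno;
+ */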
+
+/*
+ * __db_move_metadata -- move a metadata page to a lower page number.
+ * The meta data page must be exclusively latched on entry.
+ *
+ * PUBLIC: int __db_move_metadata __P((DBC *, DBMETA **, DB_COMPACT *));
+ */
+int
+__db_move_metadata(dbc, metap, c_data)
+ DBC *dbc;
+ DBMETA **metap;
+ DB_COMPACT *c_data;
+{
+ BTREE *bt;
+ DB *dbp, *mdbp;
+ DB_LOCK handle_lock;
+ HASH *ht;
+ int ret, t_ret;
+
+ dbp = dbc->dbp;
+
+ c_data->compact_pages_examine++;
+ if ((ret = __db_exchange_page(dbc,
+ (PAGE**)metap, NULL, PGNO_INVALID, DB_EXCH_FREE)) != 0)
+ return (ret);
+
+ if (PGNO(*metap) == dbp->meta_pgno)
+ return (0);
+
+ if ((ret = __db_master_open(dbp,
+ dbc->thread_info, dbc->txn, dbp->fname, 0, 0, &mdbp)) != 0)
+ return (ret);
+
+ dbp->meta_pgno = PGNO(*metap);
+
+ if ((ret = __db_master_update(mdbp, dbp, dbc->thread_info,
+ dbc->txn, dbp->dname, dbp->type, MU_MOVE, NULL, 0)) != 0)
+ goto err;
+
+ /*
+ * The handle lock for subdb's depends on the metadata page number:
+ * swap the old one for the new one.
+ */
+ if (STD_LOCKING(dbc)) {
+ /*
+ * If this dbp is still in an opening transaction we need to
+ * change its lock in the event.
+ */
+ if (dbp->cur_txn != NULL)
+ __txn_remlock(dbp->env,
+ dbp->cur_txn, &dbp->handle_lock, DB_LOCK_INVALIDID);
+
+ handle_lock = dbp->handle_lock;
+ if ((ret = __fop_lock_handle(dbp->env, dbp,
+ dbp->cur_locker != NULL ? dbp->cur_locker : dbp->locker,
+ dbp->cur_txn != NULL ? DB_LOCK_WRITE : DB_LOCK_READ,
+ NULL, 0)) != 0)
+ goto err;
+
+ /* Move all the other handles to the new lock. */
+ if ((ret = __lock_change(dbp->env,
+ &handle_lock, &dbp->handle_lock)) != 0)
+ goto err;
+
+ /* Reregister the event. */
+ if (dbp->cur_txn != NULL)
+ ret = __txn_lockevent(dbp->env,
+ dbp->cur_txn, dbp, &dbp->handle_lock, dbp->locker);
+ }
+ if (dbp->log_filename != NULL)
+ dbp->log_filename->meta_pgno = dbp->meta_pgno;
+ if (dbp->type == DB_HASH) {
+ ht = dbp->h_internal;
+ ht->meta_pgno = dbp->meta_pgno;
+ ht->revision = ++dbp->mpf->mfp->revision;
+ } else {
+ bt = dbp->bt_internal;
+ bt->bt_meta = dbp->meta_pgno;
+ bt->revision = ++dbp->mpf->mfp->revision;
+ }
+
+err: if ((t_ret = __db_close(mdbp, dbc->txn, DB_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
diff --git a/src/db/db_conv.c b/src/db/db_conv.c
new file mode 100644
index 00000000..210b4d6e
--- /dev/null
+++ b/src/db/db_conv.c
@@ -0,0 +1,890 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/hmac.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_swap.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/heap.h"
+#include "dbinc/qam.h"
+
+/*
+ * __db_pgin --
+ * Primary page-swap routine.
+ *
+ * PUBLIC: int __db_pgin __P((DB_ENV *, db_pgno_t, void *, DBT *));
+ */
+int
+__db_pgin(dbenv, pg, pp, cookie)
+ DB_ENV *dbenv;
+ db_pgno_t pg;
+ void *pp;
+ DBT *cookie;
+{
+ DB dummydb, *dbp;
+ DB_CIPHER *db_cipher;
+ DB_LSN not_used;
+ DB_PGINFO *pginfo;
+ ENV *env;
+ PAGE *pagep;
+ size_t sum_len;
+ int is_hmac, ret;
+ u_int8_t *chksum;
+
+ pginfo = (DB_PGINFO *)cookie->data;
+ env = dbenv->env;
+ pagep = (PAGE *)pp;
+
+ ret = is_hmac = 0;
+ chksum = NULL;
+ memset(&dummydb, 0, sizeof(DB));
+ dbp = &dummydb;
+ dbp->dbenv = dbenv;
+ dbp->env = env;
+ dbp->flags = pginfo->flags;
+ dbp->pgsize = pginfo->db_pagesize;
+ db_cipher = env->crypto_handle;
+ switch (pagep->type) {
+ case P_HASHMETA:
+ case P_HEAPMETA:
+ case P_BTREEMETA:
+ case P_QAMMETA:
+ /*
+ * If checksumming is set on the meta-page, we must set
+ * it in the dbp.
+ */
+ if (FLD_ISSET(((DBMETA *)pp)->metaflags, DBMETA_CHKSUM))
+ F_SET(dbp, DB_AM_CHKSUM);
+ else
+ F_CLR(dbp, DB_AM_CHKSUM);
+ if (((DBMETA *)pp)->encrypt_alg != 0 ||
+ F_ISSET(dbp, DB_AM_ENCRYPT))
+ is_hmac = 1;
+ /*
+ * !!!
+ * For all meta pages it is required that the chksum
+ * be at the same location. Use BTMETA to get to it
+ * for any meta type.
+ */
+ chksum = ((BTMETA *)pp)->chksum;
+ sum_len = DBMETASIZE;
+ break;
+ case P_INVALID:
+ /*
+ * We assume that we've read a file hole if we have
+ * a zero LSN, zero page number and P_INVALID. Otherwise
+ * we have an invalid page that might contain real data.
+ */
+ if (IS_ZERO_LSN(LSN(pagep)) && pagep->pgno == PGNO_INVALID) {
+ sum_len = 0;
+ break;
+ }
+ /* FALLTHROUGH */
+ default:
+ chksum = P_CHKSUM(dbp, pagep);
+ sum_len = pginfo->db_pagesize;
+ /*
+		 * If we are reading in a non-meta page and we have a
+		 * db_cipher, then we are using hmac.
+ */
+ is_hmac = CRYPTO_ON(env) ? 1 : 0;
+ break;
+ }
+
+ /*
+ * We expect a checksum error if there was a configuration problem.
+ * If there is no configuration problem and we don't get a match,
+ * it's fatal: panic the system.
+ */
+ if (F_ISSET(dbp, DB_AM_CHKSUM) && sum_len != 0) {
+ if (F_ISSET(dbp, DB_AM_SWAP) && is_hmac == 0)
+ P_32_SWAP(chksum);
+ switch (ret = __db_check_chksum(
+ env, NULL, db_cipher, chksum, pp, sum_len, is_hmac)) {
+ case 0:
+ break;
+ case -1:
+ if (DBENV_LOGGING(env))
+ (void)__db_cksum_log(
+ env, NULL, &not_used, DB_FLUSH);
+ __db_errx(env, DB_STR_A("0684",
+ "checksum error: page %lu: catastrophic recovery required",
+ "%lu"), (u_long)pg);
+ return (__env_panic(env, DB_RUNRECOVERY));
+ default:
+ return (ret);
+ }
+ }
+ if ((ret = __db_decrypt_pg(env, dbp, pagep)) != 0)
+ return (ret);
+ switch (pagep->type) {
+ case P_INVALID:
+ if (pginfo->type == DB_QUEUE)
+ return (__qam_pgin_out(env, pg, pp, cookie));
+ else if (pginfo->type == DB_HEAP)
+ return (__heap_pgin(dbp, pg, pp, cookie));
+ /*
+ * This page is either newly allocated from the end of the
+ * file, or from the free list, or it is an as-yet unwritten
+ * hash bucket page. In this last case it needs to be
+ * initialized, but never byte-swapped. Otherwise the header
+ * may need swapping. It will not be a metadata page, so the
+ * byte swapping code of __ham_pgin is adequate. If hash
+ * is not configured fall back to btree swapping.
+ */
+#ifdef HAVE_HASH
+ return (__ham_pgin(dbp, pg, pp, cookie));
+#else
+ return (__bam_pgin(dbp, pg, pp, cookie));
+#endif
+ /* NOTREACHED. */
+ break;
+ case P_HASH_UNSORTED:
+ case P_HASH:
+ case P_HASHMETA:
+ return (__ham_pgin(dbp, pg, pp, cookie));
+ case P_HEAP:
+ case P_HEAPMETA:
+ case P_IHEAP:
+ return (__heap_pgin(dbp, pg, pp, cookie));
+ case P_BTREEMETA:
+ case P_IBTREE:
+ case P_IRECNO:
+ case P_LBTREE:
+ case P_LDUP:
+ case P_LRECNO:
+ case P_OVERFLOW:
+ return (__bam_pgin(dbp, pg, pp, cookie));
+ case P_QAMMETA:
+ case P_QAMDATA:
+ return (__qam_pgin_out(env, pg, pp, cookie));
+ default:
+ break;
+ }
+ return (__db_pgfmt(env, pg));
+}
+
+/*
+ * __db_pgout --
+ * Primary page-swap routine.
+ *
+ * PUBLIC: int __db_pgout __P((DB_ENV *, db_pgno_t, void *, DBT *));
+ */
+int
+__db_pgout(dbenv, pg, pp, cookie)
+ DB_ENV *dbenv;
+ db_pgno_t pg;
+ void *pp;
+ DBT *cookie;
+{
+ DB dummydb, *dbp;
+ DB_PGINFO *pginfo;
+ ENV *env;
+ PAGE *pagep;
+ int ret;
+
+ pginfo = (DB_PGINFO *)cookie->data;
+ env = dbenv->env;
+ pagep = (PAGE *)pp;
+
+ memset(&dummydb, 0, sizeof(DB));
+ dbp = &dummydb;
+ dbp->dbenv = dbenv;
+ dbp->env = env;
+ dbp->flags = pginfo->flags;
+ dbp->pgsize = pginfo->db_pagesize;
+ ret = 0;
+ switch (pagep->type) {
+ case P_INVALID:
+ switch (pginfo->type) {
+ case DB_QUEUE:
+ ret = __qam_pgin_out(env, pg, pp, cookie);
+ break;
+#ifdef HAVE_HASH
+ case DB_HASH:
+ ret = __ham_pgout(dbp, pg, pp, cookie);
+ break;
+#endif
+#ifdef HAVE_HEAP
+ case DB_HEAP:
+ ret = __heap_pgout(dbp, pg, pp, cookie);
+ break;
+#endif
+ case DB_BTREE:
+ case DB_RECNO:
+ ret = __bam_pgout(dbp, pg, pp, cookie);
+ break;
+ default:
+ return (__db_pgfmt(env, pg));
+ }
+ break;
+ case P_HASH:
+ case P_HASH_UNSORTED:
+ /*
+		 * Support pgout of unsorted hash pages, since online
+ * replication upgrade can cause pages of this type to be
+ * written out.
+ *
+ * FALLTHROUGH
+ */
+ case P_HASHMETA:
+ ret = __ham_pgout(dbp, pg, pp, cookie);
+ break;
+ case P_HEAP:
+ case P_HEAPMETA:
+ case P_IHEAP:
+ ret = __heap_pgout(dbp, pg, pp, cookie);
+ break;
+ case P_BTREEMETA:
+ case P_IBTREE:
+ case P_IRECNO:
+ case P_LBTREE:
+ case P_LDUP:
+ case P_LRECNO:
+ case P_OVERFLOW:
+ ret = __bam_pgout(dbp, pg, pp, cookie);
+ break;
+ case P_QAMMETA:
+ case P_QAMDATA:
+ ret = __qam_pgin_out(env, pg, pp, cookie);
+ break;
+ default:
+ return (__db_pgfmt(env, pg));
+ }
+ if (ret)
+ return (ret);
+
+ return (__db_encrypt_and_checksum_pg(env, dbp, pagep));
+}
+
+/*
+ * __db_decrypt_pg --
+ * Utility function to decrypt a db page.
+ *
+ * PUBLIC: int __db_decrypt_pg __P((ENV *, DB *, PAGE *));
+ */
+int
+__db_decrypt_pg(env, dbp, pagep)
+ ENV *env;
+ DB *dbp;
+ PAGE *pagep;
+{
+ DB_CIPHER *db_cipher;
+ size_t pg_len, pg_off;
+ u_int8_t *iv;
+ int ret;
+
+ db_cipher = env->crypto_handle;
+ ret = 0;
+ iv = NULL;
+ if (F_ISSET(dbp, DB_AM_ENCRYPT)) {
+ DB_ASSERT(env, db_cipher != NULL);
+ DB_ASSERT(env, F_ISSET(dbp, DB_AM_CHKSUM));
+
+ pg_off = P_OVERHEAD(dbp);
+ DB_ASSERT(env, db_cipher->adj_size(pg_off) == 0);
+
+ switch (pagep->type) {
+ case P_HASHMETA:
+ case P_HEAPMETA:
+ case P_BTREEMETA:
+ case P_QAMMETA:
+ /*
+ * !!!
+ * For all meta pages it is required that the iv
+ * be at the same location. Use BTMETA to get to it
+ * for any meta type.
+ */
+ iv = ((BTMETA *)pagep)->iv;
+ pg_len = DBMETASIZE;
+ break;
+ case P_INVALID:
+ if (IS_ZERO_LSN(LSN(pagep)) &&
+ pagep->pgno == PGNO_INVALID) {
+ pg_len = 0;
+ break;
+ }
+ /* FALLTHROUGH */
+ default:
+ iv = P_IV(dbp, pagep);
+ pg_len = dbp->pgsize;
+ break;
+ }
+ if (pg_len != 0)
+ ret = db_cipher->decrypt(env, db_cipher->data,
+ iv, ((u_int8_t *)pagep) + pg_off,
+ pg_len - pg_off);
+ }
+ return (ret);
+}
+
+/*
+ * __db_encrypt_and_checksum_pg --
+ * Utility function to encrypt and checksum a db page.
+ *
+ * PUBLIC: int __db_encrypt_and_checksum_pg
+ * PUBLIC: __P((ENV *, DB *, PAGE *));
+ */
+int
+__db_encrypt_and_checksum_pg(env, dbp, pagep)
+ ENV *env;
+ DB *dbp;
+ PAGE *pagep;
+{
+ DB_CIPHER *db_cipher;
+ int ret;
+ size_t pg_off, pg_len, sum_len;
+ u_int8_t *chksum, *iv, *key;
+
+ chksum = iv = key = NULL;
+ db_cipher = env->crypto_handle;
+
+ if (F_ISSET(dbp, DB_AM_ENCRYPT)) {
+ DB_ASSERT(env, db_cipher != NULL);
+ DB_ASSERT(env, F_ISSET(dbp, DB_AM_CHKSUM));
+
+ pg_off = P_OVERHEAD(dbp);
+ DB_ASSERT(env, db_cipher->adj_size(pg_off) == 0);
+
+ key = db_cipher->mac_key;
+
+ switch (pagep->type) {
+ case P_HASHMETA:
+ case P_HEAPMETA:
+ case P_BTREEMETA:
+ case P_QAMMETA:
+ /*
+ * !!!
+ * For all meta pages it is required that the iv
+ * be at the same location. Use BTMETA to get to it
+ * for any meta type.
+ */
+ iv = ((BTMETA *)pagep)->iv;
+ pg_len = DBMETASIZE;
+ break;
+ default:
+ iv = P_IV(dbp, pagep);
+ pg_len = dbp->pgsize;
+ break;
+ }
+ if ((ret = db_cipher->encrypt(env, db_cipher->data,
+ iv, ((u_int8_t *)pagep) + pg_off, pg_len - pg_off)) != 0)
+ return (ret);
+ }
+ if (F_ISSET(dbp, DB_AM_CHKSUM)) {
+ switch (pagep->type) {
+ case P_HASHMETA:
+ case P_HEAPMETA:
+ case P_BTREEMETA:
+ case P_QAMMETA:
+ /*
+ * !!!
+ * For all meta pages it is required that the chksum
+ * be at the same location. Use BTMETA to get to it
+ * for any meta type.
+ */
+ chksum = ((BTMETA *)pagep)->chksum;
+ sum_len = DBMETASIZE;
+ break;
+ default:
+ chksum = P_CHKSUM(dbp, pagep);
+ sum_len = dbp->pgsize;
+ break;
+ }
+ __db_chksum(NULL, (u_int8_t *)pagep, sum_len, key, chksum);
+ if (F_ISSET(dbp, DB_AM_SWAP) && !F_ISSET(dbp, DB_AM_ENCRYPT))
+ P_32_SWAP(chksum);
+ }
+ return (0);
+}
+
+/*
+ * __db_metaswap --
+ * Byteswap the common part of the meta-data page.
+ *
+ * PUBLIC: void __db_metaswap __P((PAGE *));
+ */
+void
+__db_metaswap(pg)
+ PAGE *pg;
+{
+ u_int8_t *p;
+
+ p = (u_int8_t *)pg;
+
+ /* Swap the meta-data information. */
+ SWAP32(p); /* lsn.file */
+ SWAP32(p); /* lsn.offset */
+ SWAP32(p); /* pgno */
+ SWAP32(p); /* magic */
+ SWAP32(p); /* version */
+ SWAP32(p); /* pagesize */
+ p += 4; /* unused, page type, unused, unused */
+ SWAP32(p); /* free */
+ SWAP32(p); /* alloc_lsn part 1 */
+ SWAP32(p); /* alloc_lsn part 2 */
+ SWAP32(p); /* cached key count */
+ SWAP32(p); /* cached record count */
+ SWAP32(p); /* flags */
+}
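+
+/*
+ * Editor's note: SWAP32 both byte-reverses the 32-bit value at p and
+ * advances p past it, so the routine above reads as a walk down the
+ * common meta-page layout. An open-coded equivalent of one step
+ * (sketch only):
+ *
+ *    u_int32_t v;
+ *
+ *    memcpy(&v, p, sizeof(v));
+ *    v = ((v & 0xff000000) >> 24) | ((v & 0x00ff0000) >> 8) |
+ *        ((v & 0x0000ff00) << 8) | ((v & 0x000000ff) << 24);
+ *    memcpy(p, &v, sizeof(v));
+ *    p += sizeof(v);
+ */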
+
+/*
+ * __db_byteswap --
+ * Byteswap an ordinary database page.
+ *
+ * PUBLIC: int __db_byteswap
+ * PUBLIC: __P((DB *, db_pgno_t, PAGE *, size_t, int));
+ */
+int
+__db_byteswap(dbp, pg, h, pagesize, pgin)
+ DB *dbp;
+ db_pgno_t pg;
+ PAGE *h;
+ size_t pagesize;
+ int pgin;
+{
+ ENV *env;
+ BINTERNAL *bi;
+ BKEYDATA *bk;
+ BOVERFLOW *bo;
+ RINTERNAL *ri;
+ db_indx_t i, *inp, len, tmp;
+ u_int8_t *end, *p, *pgend;
+
+ if (pagesize == 0)
+ return (0);
+
+ if (pgin) {
+ M_32_SWAP(h->lsn.file);
+ M_32_SWAP(h->lsn.offset);
+ M_32_SWAP(h->pgno);
+ M_32_SWAP(h->prev_pgno);
+ M_32_SWAP(h->next_pgno);
+ M_16_SWAP(h->entries);
+ M_16_SWAP(h->hf_offset);
+ }
+
+ if (dbp == NULL)
+ return (0);
+ env = dbp->env;
+
+ pgend = (u_int8_t *)h + pagesize;
+
+ inp = P_INP(dbp, h);
+ if ((u_int8_t *)inp >= pgend)
+ goto out;
+
+ switch (TYPE(h)) {
+ case P_HASH_UNSORTED:
+ case P_HASH:
+ for (i = 0; i < NUM_ENT(h); i++) {
+ if (pgin)
+ M_16_SWAP(inp[i]);
+
+ if (P_ENTRY(dbp, h, i) >= pgend)
+ continue;
+
+ switch (HPAGE_TYPE(dbp, h, i)) {
+ case H_KEYDATA:
+ break;
+ case H_DUPLICATE:
+ len = LEN_HKEYDATA(dbp, h, pagesize, i);
+ p = HKEYDATA_DATA(P_ENTRY(dbp, h, i));
+ for (end = p + len; p < end;) {
+ if (pgin) {
+ P_16_SWAP(p);
+ memcpy(&tmp,
+ p, sizeof(db_indx_t));
+ p += sizeof(db_indx_t);
+ } else {
+ memcpy(&tmp,
+ p, sizeof(db_indx_t));
+ SWAP16(p);
+ }
+ p += tmp;
+ SWAP16(p);
+ }
+ break;
+ case H_OFFDUP:
+ p = HOFFPAGE_PGNO(P_ENTRY(dbp, h, i));
+ SWAP32(p); /* pgno */
+ break;
+ case H_OFFPAGE:
+ p = HOFFPAGE_PGNO(P_ENTRY(dbp, h, i));
+ SWAP32(p); /* pgno */
+ SWAP32(p); /* tlen */
+ break;
+ default:
+ return (__db_pgfmt(env, pg));
+ }
+
+ }
+
+ /*
+ * The offsets in the inp array are used to determine
+ * the size of entries on a page; therefore they
+ * cannot be converted until we've done all the
+ * entries.
+ */
+ if (!pgin)
+ for (i = 0; i < NUM_ENT(h); i++)
+ M_16_SWAP(inp[i]);
+ break;
+ case P_LBTREE:
+ case P_LDUP:
+ case P_LRECNO:
+ for (i = 0; i < NUM_ENT(h); i++) {
+ if (pgin)
+ M_16_SWAP(inp[i]);
+
+ /*
+ * In the case of on-page duplicates, key information
+ * should only be swapped once.
+ */
+ if (h->type == P_LBTREE && i > 1) {
+ if (pgin) {
+ if (inp[i] == inp[i - 2])
+ continue;
+ } else {
+ M_16_SWAP(inp[i]);
+ if (inp[i] == inp[i - 2])
+ continue;
+ M_16_SWAP(inp[i]);
+ }
+ }
+
+ bk = GET_BKEYDATA(dbp, h, i);
+ if ((u_int8_t *)bk >= pgend)
+ continue;
+ switch (B_TYPE(bk->type)) {
+ case B_KEYDATA:
+ M_16_SWAP(bk->len);
+ break;
+ case B_DUPLICATE:
+ case B_OVERFLOW:
+ bo = (BOVERFLOW *)bk;
+ M_32_SWAP(bo->pgno);
+ M_32_SWAP(bo->tlen);
+ break;
+ default:
+ return (__db_pgfmt(env, pg));
+ }
+
+ if (!pgin)
+ M_16_SWAP(inp[i]);
+ }
+ break;
+ case P_IBTREE:
+ for (i = 0; i < NUM_ENT(h); i++) {
+ if (pgin)
+ M_16_SWAP(inp[i]);
+
+ bi = GET_BINTERNAL(dbp, h, i);
+ if ((u_int8_t *)bi >= pgend)
+ continue;
+
+ M_16_SWAP(bi->len);
+ M_32_SWAP(bi->pgno);
+ M_32_SWAP(bi->nrecs);
+
+ switch (B_TYPE(bi->type)) {
+ case B_KEYDATA:
+ break;
+ case B_DUPLICATE:
+ case B_OVERFLOW:
+ bo = (BOVERFLOW *)bi->data;
+ M_32_SWAP(bo->pgno);
+ M_32_SWAP(bo->tlen);
+ break;
+ default:
+ return (__db_pgfmt(env, pg));
+ }
+
+ if (!pgin)
+ M_16_SWAP(inp[i]);
+ }
+ break;
+ case P_IRECNO:
+ for (i = 0; i < NUM_ENT(h); i++) {
+ if (pgin)
+ M_16_SWAP(inp[i]);
+
+ ri = GET_RINTERNAL(dbp, h, i);
+ if ((u_int8_t *)ri >= pgend)
+ continue;
+
+ M_32_SWAP(ri->pgno);
+ M_32_SWAP(ri->nrecs);
+
+ if (!pgin)
+ M_16_SWAP(inp[i]);
+ }
+ break;
+ case P_HEAP:
+ case P_IHEAP:
+ case P_INVALID:
+ case P_OVERFLOW:
+ case P_QAMDATA:
+ /* Nothing to do. */
+ break;
+ default:
+ return (__db_pgfmt(env, pg));
+ }
+
+out: if (!pgin) {
+ /* Swap the header information. */
+ M_32_SWAP(h->lsn.file);
+ M_32_SWAP(h->lsn.offset);
+ M_32_SWAP(h->pgno);
+ M_32_SWAP(h->prev_pgno);
+ M_32_SWAP(h->next_pgno);
+ M_16_SWAP(h->entries);
+ M_16_SWAP(h->hf_offset);
+ }
+ return (0);
+}
+
+/*
+ * __db_pageswap --
+ * Byteswap any database page. Normally, the page to be swapped will be
+ * referenced by the "pp" argument and the pdata argument will be NULL.
+ * This function is also called by automatically generated log functions,
+ * where the page may be split into separate header and data parts. In
+ * that case, pdata is not NULL and we reconstitute the full page from
+ * the two parts before swapping it.
+ *
+ * PUBLIC: int __db_pageswap
+ * PUBLIC: __P((ENV *, DB *, void *, size_t, DBT *, int));
+ */
+int
+__db_pageswap(env, dbp, pp, len, pdata, pgin)
+ ENV *env;
+ DB *dbp;
+ void *pp;
+ size_t len;
+ DBT *pdata;
+ int pgin;
+{
+ db_pgno_t pg;
+ size_t pgsize;
+ void *pgcopy;
+ int ret;
+ u_int16_t hoffset;
+
+ switch (TYPE(pp)) {
+ case P_BTREEMETA:
+ return (__bam_mswap(env, pp));
+
+ case P_HASHMETA:
+ return (__ham_mswap(env, pp));
+
+ case P_QAMMETA:
+ return (__qam_mswap(env, pp));
+
+ case P_INVALID:
+ case P_OVERFLOW:
+ case P_QAMDATA:
+ /*
+ * We may have been passed an invalid page, or a queue data
+ * page, or an overflow page where fields like hoffset have a
+ * special meaning. In that case, no swapping of the page data
+ * is required, just the fields in the page header.
+ */
+ pdata = NULL;
+ break;
+
+ default:
+ break;
+ }
+
+ if (pgin) {
+ P_32_COPYSWAP(&PGNO(pp), &pg);
+ P_16_COPYSWAP(&HOFFSET(pp), &hoffset);
+ } else {
+ pg = PGNO(pp);
+ hoffset = HOFFSET(pp);
+ }
+
+ if (pdata == NULL)
+ ret = __db_byteswap(dbp, pg, (PAGE *)pp, len, pgin);
+ else {
+ pgsize = hoffset + pdata->size;
+ if ((ret = __os_malloc(env, pgsize, &pgcopy)) != 0)
+ return (ret);
+ memset(pgcopy, 0, pgsize);
+ memcpy(pgcopy, pp, len);
+ memcpy((u_int8_t *)pgcopy + hoffset, pdata->data, pdata->size);
+
+ ret = __db_byteswap(dbp, pg, (PAGE *)pgcopy, pgsize, pgin);
+ memcpy(pp, pgcopy, len);
+
+ /*
+ * If we are swapping data to be written to the log, we can't
+ * overwrite the buffer that was passed in: it may be a pointer
+ * into a page in cache. We set DB_DBT_APPMALLOC here so that
+ * the calling code can free the memory we allocate here.
+ */
+ if (!pgin) {
+ if ((ret =
+ __os_malloc(env, pdata->size, &pdata->data)) != 0) {
+ __os_free(env, pgcopy);
+ return (ret);
+ }
+ F_SET(pdata, DB_DBT_APPMALLOC);
+ }
+ memcpy(pdata->data, (u_int8_t *)pgcopy + hoffset, pdata->size);
+ __os_free(env, pgcopy);
+ }
+
+ return (ret);
+}
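+
+/*
+ * Editor's note: item data grows down from the end of a page, so in
+ * the split case the bytes between the header (length "len") and
+ * HOFFSET are an unused gap. The reassembled image looks like
+ * (sketch):
+ *
+ *    [ header | ...zero gap... | data                  ]
+ *    0         len              hoffset   hoffset+size
+ *
+ * which is why pgcopy is zero-filled before the two memcpy calls, and
+ * why the swapped header and data are copied back out separately.
+ */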
+
+/*
+ * __db_recordswap --
+ * Byteswap any database record.
+ *
+ * PUBLIC: void __db_recordswap __P((u_int32_t,
+ * PUBLIC: u_int32_t, void *, void *, u_int32_t));
+ */
+void
+__db_recordswap(op, size, hdr, data, pgin)
+ u_int32_t op;
+ u_int32_t size;
+ void *hdr, *data;
+ u_int32_t pgin;
+{
+ BKEYDATA *bk;
+ BOVERFLOW *bo;
+ BINTERNAL *bi;
+ RINTERNAL *ri;
+ db_indx_t tmp;
+ u_int8_t *p, *end;
+
+ if (size == 0)
+ return;
+ switch (OP_PAGE_GET(op)) {
+ case P_LDUP:
+ case P_LBTREE:
+ case P_LRECNO:
+ bk = (BKEYDATA *)hdr;
+ switch (B_TYPE(bk->type)) {
+ case B_KEYDATA:
+ M_16_SWAP(bk->len);
+ break;
+ case B_DUPLICATE:
+ case B_OVERFLOW:
+ bo = (BOVERFLOW *)hdr;
+ M_32_SWAP(bo->pgno);
+ M_32_SWAP(bo->tlen);
+ break;
+ default:
+ DB_ASSERT(NULL, bk->type != bk->type);
+ }
+ break;
+ case P_IBTREE:
+ bi = (BINTERNAL *)hdr;
+ M_16_SWAP(bi->len);
+ M_32_SWAP(bi->pgno);
+ M_32_SWAP(bi->nrecs);
+ if (B_TYPE(bi->type) == B_OVERFLOW) {
+ if (data == NULL) {
+ DB_ASSERT(NULL,
+ size == BINTERNAL_SIZE(BOVERFLOW_SIZE));
+ bo = (BOVERFLOW *)bi->data;
+ } else
+ bo = (BOVERFLOW *)data;
+ M_32_SWAP(bo->pgno);
+ }
+ break;
+ case P_IRECNO:
+ ri = (RINTERNAL *)hdr;
+ M_32_SWAP(ri->pgno);
+ M_32_SWAP(ri->nrecs);
+ break;
+ case P_OVERFLOW:
+ break;
+ case P_HASH:
+ case P_HASH_UNSORTED:
+ switch (OP_MODE_GET(op)) {
+ /* KEYDATA and DUPLICATE records do not include the header. */
+ case H_KEYDATA:
+ break;
+ case H_DUPLICATE:
+ p = (u_int8_t *)hdr;
+ for (end = p + size; p < end;) {
+ if (pgin) {
+ P_16_SWAP(p);
+ memcpy(&tmp,
+ p, sizeof(db_indx_t));
+ p += sizeof(db_indx_t);
+ } else {
+ memcpy(&tmp,
+ p, sizeof(db_indx_t));
+ SWAP16(p);
+ }
+ p += tmp;
+ SWAP16(p);
+ }
+ break;
+ /* These two record types include the full header. */
+ case H_OFFDUP:
+ p = (u_int8_t *)hdr;
+ p += SSZ(HOFFPAGE, pgno);
+ SWAP32(p); /* pgno */
+ break;
+ case H_OFFPAGE:
+ p = (u_int8_t *)hdr;
+ p += SSZ(HOFFPAGE, pgno);
+ SWAP32(p); /* pgno */
+ SWAP32(p); /* tlen */
+ break;
+ default:
+ DB_ASSERT(NULL, op != op);
+ }
+ break;
+
+ default:
+ DB_ASSERT(NULL, op != op);
+ }
+}
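+
+/*
+ * Editor's note: an on-page duplicate set is a sequence of elements of
+ * the form [len][data][len], with the 16-bit length stored both before
+ * and after the data so the set can be walked in either direction.
+ * A forward walk in native byte order (sketch, not original code):
+ *
+ *    u_int8_t *p, *end;
+ *    db_indx_t len;
+ *
+ *    for (p = first, end = p + size; p < end;) {
+ *        memcpy(&len, p, sizeof(len));    -- leading length
+ *        p += sizeof(len) + len;          -- skip the data
+ *        p += sizeof(len);                -- trailing length
+ *    }
+ *
+ * which is why the swap loops above flip two 16-bit values per element.
+ */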
diff --git a/src/db/db_copy.c b/src/db/db_copy.c
new file mode 100644
index 00000000..359c74be
--- /dev/null
+++ b/src/db/db_copy.c
@@ -0,0 +1,31 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2011, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+
+/*
+ * db_copy --
+ * Copy a database file coordinated with mpool.
+ * This is for backward compatibility to the quick fix in 5.2.
+ *
+ * EXTERN: int db_copy __P((DB_ENV *,
+ * EXTERN: const char *, const char *, const char *));
+ */
+int
+db_copy(dbenv, dbfile, target, passwd)
+ DB_ENV *dbenv;
+ const char *dbfile;
+ const char *target;
+ const char *passwd;
+{
+ COMPQUIET(passwd, NULL);
+ return (__db_dbbackup_pp(dbenv, dbfile, target, 0));
+}
diff --git a/src/db/db_dispatch.c b/src/db/db_dispatch.c
new file mode 100644
index 00000000..06de4ef7
--- /dev/null
+++ b/src/db/db_dispatch.c
@@ -0,0 +1,977 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1995, 1996
+ * The President and Fellows of Harvard University. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Margo Seltzer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/hash.h"
+#include "dbinc/fop.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+#include "dbinc/log_verify.h"
+
+static int __db_txnlist_find_internal __P((ENV *, DB_TXNHEAD *,
+ db_txnlist_type, u_int32_t, DB_TXNLIST **,
+ int, u_int32_t *));
+
+/*
+ * __db_dispatch --
+ *
+ * This is the transaction dispatch function used by the db access methods.
+ * It is designed to handle the record format used by all the access
+ * methods (the one automatically generated by the db_{h,log,read}.sh
+ * scripts in the tools directory). An application using a different
+ * recovery paradigm will supply a different dispatch function to txn_open.
+ *
+ * PUBLIC: int __db_dispatch __P((ENV *,
+ * PUBLIC: DB_DISTAB *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_dispatch(env, dtab, db, lsnp, redo, params)
+ ENV *env; /* The environment. */
+ DB_DISTAB *dtab;
+ DBT *db; /* The log record upon which to dispatch. */
+ DB_LSN *lsnp; /* The lsn of the record being dispatched. */
+ db_recops redo; /* Redo this op (or undo it). */
+ void *params;
+{
+ DB_ENV *dbenv;
+ DB_TXNHEAD *info; /* Transaction list. */
+ DB_LOG_VRFY_INFO *lvh;
+ DB_LSN prev_lsn;
+ u_int32_t rectype, status, txnid, urectype;
+ int make_call, ret;
+
+ dbenv = env->dbenv;
+ make_call = ret = 0;
+ lvh = NULL;
+ info = NULL;
+ LOGCOPY_32(env, &rectype, db->data);
+ LOGCOPY_32(env, &txnid, (u_int8_t *)db->data + sizeof(rectype));
+
+ /*
+ * Log verification passes a DB_LOG_VRFY_INFO structure, others
+ * pass a DB_TXNHEAD structure.
+ */
+ if (redo != DB_TXN_LOG_VERIFY)
+ info = (DB_TXNHEAD *)params;
+ else
+ lvh = (DB_LOG_VRFY_INFO *)params;
+
+ /* If we don't have a dispatch table, it's hard to dispatch. */
+ DB_ASSERT(env, dtab != NULL);
+
+ /*
+ * If we find a record that is in the user's number space and they
+ * have specified a recovery routine, let them handle it. If they
+ * didn't specify a recovery routine, then we expect that they've
+ * followed all our rules and registered new recovery functions.
+ */
+ switch (redo) {
+ case DB_TXN_ABORT:
+ case DB_TXN_APPLY:
+ case DB_TXN_LOG_VERIFY:
+ case DB_TXN_PRINT:
+ make_call = 1;
+ break;
+ case DB_TXN_OPENFILES:
+ /*
+ * We collect all the transactions that have
+ * "begin" records, those with no previous LSN,
+ * so that we do not abort partial transactions.
+ * These are known to be undone, otherwise the
+ * log would not have been freeable.
+ */
+ LOGCOPY_TOLSN(env, &prev_lsn, (u_int8_t *)db->data +
+ sizeof(rectype) + sizeof(txnid));
+ if (txnid != 0 && prev_lsn.file == 0 && (ret =
+ __db_txnlist_add(env, info, txnid, TXN_OK, NULL)) != 0)
+ return (ret);
+
+ /* FALLTHROUGH */
+ case DB_TXN_POPENFILES:
+ if (rectype == DB___dbreg_register ||
+ rectype == DB___txn_child ||
+ rectype == DB___txn_ckp || rectype == DB___txn_recycle)
+ return ((dtab->int_dispatch[rectype])(env,
+ db, lsnp, redo, info));
+ break;
+ case DB_TXN_BACKWARD_ROLL:
+ /*
+ * Running full recovery in the backward pass. In general,
+ * we only process records during this pass that belong
+ * to aborted transactions. Unfortunately, there are several
+ * exceptions:
+ * 1. If this is a meta-record, one not associated with
+ * a transaction, then we must always process it.
+ * 2. If this is a transaction commit/abort, we must
+ * always process it, so that we know the status of
+ * every transaction.
+ * 3. If this is a child commit, we need to process it
+ * because the outcome of the child transaction depends
+ * on the outcome of the parent.
+	 * 4. If this is a dbreg_register record, we must always
+	 *    process it because they contain non-transactional
+	 *    closes that must be properly handled.
+	 * 5. If this is a noop, we must always undo it so that we
+	 *    properly handle any aborts before a file was closed.
+	 * 6. If this is a file remove, we need to process it to
+ * determine if the on-disk file is the same as the
+ * one being described.
+ */
+ switch (rectype) {
+ /*
+ * These either do not belong to a transaction or (regop)
+ * must be processed regardless of the status of the
+ * transaction.
+ */
+ case DB___txn_regop:
+ case DB___txn_recycle:
+ case DB___txn_ckp:
+ make_call = 1;
+ break;
+ /*
+ * These belong to a transaction whose status must be
+ * checked.
+ */
+ case DB___txn_child:
+ case DB___db_noop:
+ case DB___fop_file_remove:
+ case DB___dbreg_register:
+ make_call = 1;
+
+ /* FALLTHROUGH */
+ default:
+ if (txnid == 0)
+ break;
+
+ ret = __db_txnlist_find(env, info, txnid, &status);
+
+ /* If not found, this is an incomplete abort. */
+ if (ret == DB_NOTFOUND)
+ return (__db_txnlist_add(env,
+ info, txnid, TXN_IGNORE, lsnp));
+ if (ret != 0)
+ return (ret);
+
+ /*
+ * If we ignore the transaction, ignore the operation
+ * UNLESS this is a child commit in which case we need
+ * to make sure that the child also gets marked as
+ * ignore.
+ */
+ if (status == TXN_IGNORE && rectype != DB___txn_child) {
+ make_call = 0;
+ break;
+ }
+ if (status == TXN_COMMIT)
+ break;
+
+			/* Set make_call in case we fell through from default. */
+ make_call = 1;
+ if (status == TXN_OK &&
+ (ret = __db_txnlist_update(env,
+ info, txnid, rectype == DB___txn_prepare ?
+ TXN_PREPARE : TXN_ABORT, NULL, &status, 0)) != 0)
+ return (ret);
+ }
+ break;
+ case DB_TXN_FORWARD_ROLL:
+ /*
+ * In the forward pass, if we haven't seen the transaction,
+ * do nothing, else recover it.
+ *
+ * We need to always redo DB___db_noop records, so that we
+ * properly handle any commits after the file was closed.
+ */
+ switch (rectype) {
+ case DB___txn_recycle:
+ case DB___txn_ckp:
+ case DB___db_noop:
+ case DB___dbreg_register:
+ make_call = 1;
+ break;
+
+ default:
+ if (txnid == 0)
+ status = 0;
+ else {
+ ret = __db_txnlist_find(env,
+ info, txnid, &status);
+
+ if (ret == DB_NOTFOUND)
+					/* Break out of the if clause. */
+ ;
+ else if (ret != 0)
+ return (ret);
+ else if (status == TXN_COMMIT) {
+ make_call = 1;
+ break;
+ }
+ }
+
+ }
+ break;
+ default:
+ return (__db_unknown_flag(
+ env, "__db_dispatch", (u_int32_t)redo));
+ }
+
+ if (make_call) {
+ /*
+ * If the debug flag is set then we are logging
+ * records for a non-durable update so that they
+ * may be examined for diagnostic purposes.
+ * So only make the call if we are printing,
+ * otherwise we need to extract the previous
+ * lsn so undo will work properly.
+ */
+ if (rectype & DB_debug_FLAG) {
+ if (redo == DB_TXN_PRINT)
+ rectype &= ~DB_debug_FLAG;
+ else {
+ LOGCOPY_TOLSN(env, lsnp,
+ (u_int8_t *)db->data +
+ sizeof(rectype) +
+ sizeof(txnid));
+ return (0);
+ }
+ }
+ if (rectype >= DB_user_BEGIN) {
+ /*
+ * Increment user log count, we can't pass any extra
+ * args into app_dispatch, so this has to be done here.
+ */
+ if (redo == DB_TXN_LOG_VERIFY)
+ lvh->external_logrec_cnt++;
+ if (dbenv->app_dispatch != NULL)
+ return (dbenv->app_dispatch(dbenv,
+ db, lsnp, redo));
+
+ /* No application-specific dispatch */
+ urectype = rectype - DB_user_BEGIN;
+			if (urectype >= dtab->ext_size ||
+ dtab->ext_dispatch[urectype] == NULL) {
+ __db_errx(env, DB_STR_A("0512",
+ "Illegal application-specific record type %lu in log",
+ "%lu"), (u_long)rectype);
+ return (EINVAL);
+ }
+
+ return ((dtab->ext_dispatch[urectype])(dbenv,
+ db, lsnp, redo));
+ } else {
+			if (rectype >= dtab->int_size ||
+ dtab->int_dispatch[rectype] == NULL) {
+ __db_errx(env, DB_STR_A("0513",
+ "Illegal record type %lu in log", "%lu"),
+ (u_long)rectype);
+ if (redo == DB_TXN_LOG_VERIFY)
+ lvh->unknown_logrec_cnt++;
+
+ return (EINVAL);
+ }
+
+ return ((dtab->int_dispatch[rectype])(env,
+ db, lsnp, redo, params));
+ }
+ }
+
+ return (0);
+}
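+
+/*
+ * A minimal sketch (an illustration, not part of the original source)
+ * of the fixed header shared by every log record dispatched above; it
+ * is what lets __db_dispatch read the record type, transaction ID and
+ * previous LSN without knowing a record's full layout:
+ *
+ *	u_int32_t rectype;	record type; DB_user_BEGIN splits the space
+ *	u_int32_t txnid;	0 if not part of a transaction
+ *	DB_LSN prev_lsn;	LSN of the transaction's previous record
+ *
+ *	LOGCOPY_32(env, &rectype, db->data);
+ *	LOGCOPY_32(env, &txnid, (u_int8_t *)db->data + sizeof(rectype));
+ *	LOGCOPY_TOLSN(env, &prev_lsn, (u_int8_t *)db->data +
+ *	    sizeof(rectype) + sizeof(txnid));
+ */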
+
+/*
+ * __db_add_recovery -- Add recovery functions to the dispatch table.
+ *
+ * We have two versions of this, an external one and an internal one,
+ * because application-specific functions take different arguments
+ * for dispatch (ENV versus DB_ENV).
+ *
+ * This is the external version.
+ *
+ * PUBLIC: int __db_add_recovery __P((DB_ENV *, DB_DISTAB *,
+ * PUBLIC: int (*)(DB_ENV *, DBT *, DB_LSN *, db_recops), u_int32_t));
+ */
+int
+__db_add_recovery(dbenv, dtab, func, ndx)
+ DB_ENV *dbenv;
+ DB_DISTAB *dtab;
+ int (*func) __P((DB_ENV *, DBT *, DB_LSN *, db_recops));
+ u_int32_t ndx;
+{
+ size_t i, nsize;
+ int ret;
+
+ /* Make sure this is an application-specific record. */
+ if (ndx < DB_user_BEGIN) {
+ __db_errx(dbenv->env, DB_STR_A("0514",
+ "Attempting to add application-specific record with invalid type %lu",
+ "%lu"), (u_long)ndx);
+ return (EINVAL);
+ }
+ ndx -= DB_user_BEGIN;
+
+ /* Check if we have to grow the table. */
+ if (ndx >= dtab->ext_size) {
+ nsize = ndx + 40;
+ if ((ret =
+ __os_realloc(dbenv->env, nsize *
+ sizeof((dtab->ext_dispatch)[0]), &dtab->ext_dispatch))
+ != 0)
+ return (ret);
+ for (i = dtab->ext_size; i < nsize; ++i)
+ (dtab->ext_dispatch)[i] = NULL;
+ dtab->ext_size = nsize;
+ }
+
+ (dtab->ext_dispatch)[ndx] = func;
+ return (0);
+}
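+
+/*
+ * A usage sketch (illustrative; my_rec and the chosen record type are
+ * hypothetical): an application registers a recovery function for a
+ * record type at or above DB_user_BEGIN, after which __db_dispatch
+ * routes matching records through ext_dispatch whenever no
+ * dbenv->app_dispatch callback is configured:
+ *
+ *	int my_rec __P((DB_ENV *, DBT *, DB_LSN *, db_recops));
+ *
+ *	if ((ret = __db_add_recovery(dbenv,
+ *	    dtab, my_rec, DB_user_BEGIN + 1)) != 0)
+ *		return (ret);
+ */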
+
+/*
+ * __db_add_recovery_int --
+ *
+ * Internal version of dispatch addition function.
+ *
+ * PUBLIC: int __db_add_recovery_int __P((ENV *, DB_DISTAB *,
+ * PUBLIC: int (*)(ENV *, DBT *, DB_LSN *, db_recops, void *), u_int32_t));
+ */
+int
+__db_add_recovery_int(env, dtab, func, ndx)
+ ENV *env;
+ DB_DISTAB *dtab;
+ int (*func) __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ u_int32_t ndx;
+{
+ size_t i, nsize;
+ int ret;
+
+ if (ndx >= DB_user_BEGIN) {
+ __db_errx(env, DB_STR_A("0515",
+ "Attempting to add internal record with invalid type %lu",
+ "%lu"), (u_long)ndx);
+ return (EINVAL);
+ }
+
+ /* Check if we have to grow the table. */
+ if (ndx >= dtab->int_size) {
+ nsize = ndx + 40;
+ if ((ret =
+ __os_realloc(env, nsize * sizeof((dtab->int_dispatch)[0]),
+ &dtab->int_dispatch)) != 0)
+ return (ret);
+ for (i = dtab->int_size; i < nsize; ++i)
+ (dtab->int_dispatch)[i] = NULL;
+ dtab->int_size = nsize;
+ }
+
+ (dtab->int_dispatch)[ndx] = func;
+ return (0);
+}
+
+/*
+ * __db_txnlist_init --
+ * Initialize transaction linked list.
+ *
+ * PUBLIC: int __db_txnlist_init __P((ENV *, DB_THREAD_INFO *,
+ * PUBLIC: u_int32_t, u_int32_t, DB_LSN *, DB_TXNHEAD **));
+ */
+int
+__db_txnlist_init(env, ip, low_txn, hi_txn, trunc_lsn, retp)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ u_int32_t low_txn, hi_txn;
+ DB_LSN *trunc_lsn;
+ DB_TXNHEAD **retp;
+{
+ DB_TXNHEAD *headp;
+ u_int32_t size, tmp;
+ int ret;
+
+ /*
+ * Size a hash table.
+ * If low is zero then we are being called during rollback
+ * and we need only one slot.
+	 * Hi may be lower than low if we have recycled txnids.
+	 * The numbers here are guesses about txn density; we can afford
+ * to look at a few entries in each slot.
+ */
+ if (low_txn == 0)
+ size = 1;
+ else {
+ if (hi_txn < low_txn) {
+ tmp = hi_txn;
+ hi_txn = low_txn;
+ low_txn = tmp;
+ }
+ tmp = hi_txn - low_txn;
+ /* See if we wrapped around. */
+ if (tmp > (TXN_MAXIMUM - TXN_MINIMUM) / 2)
+ tmp = (low_txn - TXN_MINIMUM) + (TXN_MAXIMUM - hi_txn);
+ size = tmp / 5;
+ if (size < 100)
+ size = 100;
+ }
+ if ((ret = __os_malloc(env,
+ sizeof(DB_TXNHEAD) + size * sizeof(headp->head), &headp)) != 0)
+ return (ret);
+
+ memset(headp, 0, sizeof(DB_TXNHEAD) + size * sizeof(headp->head));
+ headp->maxid = hi_txn;
+ headp->generation = 0;
+ headp->nslots = size;
+ headp->gen_alloc = 8;
+ headp->thread_info = ip;
+ if ((ret = __os_malloc(env, headp->gen_alloc *
+ sizeof(headp->gen_array[0]), &headp->gen_array)) != 0) {
+ __os_free(env, headp);
+ return (ret);
+ }
+ headp->gen_array[0].generation = 0;
+ headp->gen_array[0].txn_min = TXN_MINIMUM;
+ headp->gen_array[0].txn_max = TXN_MAXIMUM;
+ if (trunc_lsn != NULL) {
+ headp->trunc_lsn = *trunc_lsn;
+ headp->maxlsn = *trunc_lsn;
+ } else {
+ ZERO_LSN(headp->trunc_lsn);
+ ZERO_LSN(headp->maxlsn);
+ }
+ ZERO_LSN(headp->ckplsn);
+
+ *retp = headp;
+ return (0);
+}
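+
+/*
+ * A worked example of the sizing heuristic above (the numbers are
+ * illustrative): with low_txn 0x80000100 and hi_txn 0x80025000 the
+ * spread is 0x24f00 (151,296), giving size = 151296 / 5 = 30,259
+ * slots, or about five entries per slot if every txnid is present.
+ * A spread of 79 yields a quotient below 100, so the 100-slot floor
+ * applies instead.
+ */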
+
+#define FIND_GENERATION(hp, txnid, gen) do { \
+ u_int32_t __i; \
+ for (__i = 0; __i <= (hp)->generation; __i++) \
+ /* The range may wrap around the end. */ \
+ if ((hp)->gen_array[__i].txn_min < \
+ (hp)->gen_array[__i].txn_max ? \
+ ((txnid) >= (hp)->gen_array[__i].txn_min && \
+ (txnid) <= (hp)->gen_array[__i].txn_max) : \
+ ((txnid) >= (hp)->gen_array[__i].txn_min || \
+ (txnid) <= (hp)->gen_array[__i].txn_max)) \
+ break; \
+ DB_ASSERT(env, __i <= (hp)->generation); \
+ gen = (hp)->gen_array[__i].generation; \
+} while (0)
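+
+/*
+ * A worked example of the wrap test above (values are illustrative):
+ * a generation whose range wrapped past TXN_MAXIMUM might have
+ * txn_min 0xfffffff0 and txn_max 0x80000100.  Since txn_min > txn_max,
+ * the "min || max" arm applies and matches both txnid 0xfffffffa
+ * (>= txn_min) and txnid 0x80000004 (<= txn_max); the "min && max"
+ * arm handles ordinary, non-wrapped ranges.
+ */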
+
+/*
+ * __db_txnlist_add --
+ * Add an element to our transaction linked list.
+ *
+ * PUBLIC: int __db_txnlist_add __P((ENV *,
+ * PUBLIC: DB_TXNHEAD *, u_int32_t, u_int32_t, DB_LSN *));
+ */
+int
+__db_txnlist_add(env, hp, txnid, status, lsn)
+ ENV *env;
+ DB_TXNHEAD *hp;
+ u_int32_t txnid, status;
+ DB_LSN *lsn;
+{
+ DB_TXNLIST *elp;
+ int ret;
+
+ if ((ret = __os_malloc(env, sizeof(DB_TXNLIST), &elp)) != 0)
+ return (ret);
+
+ LIST_INSERT_HEAD(&hp->head[DB_TXNLIST_MASK(hp, txnid)], elp, links);
+
+ /* Find the most recent generation containing this ID */
+ FIND_GENERATION(hp, txnid, elp->u.t.generation);
+ elp->type = TXNLIST_TXNID;
+ elp->u.t.txnid = txnid;
+ elp->u.t.status = status;
+ if (txnid > hp->maxid)
+ hp->maxid = txnid;
+ if (lsn != NULL && IS_ZERO_LSN(hp->maxlsn) && status == TXN_COMMIT)
+ hp->maxlsn = *lsn;
+
+ DB_ASSERT(env, lsn == NULL ||
+ status != TXN_COMMIT || LOG_COMPARE(&hp->maxlsn, lsn) >= 0);
+
+ return (0);
+}
+
+/*
+ * __db_txnlist_remove --
+ * Remove an element from our transaction linked list.
+ *
+ * PUBLIC: int __db_txnlist_remove __P((ENV *, DB_TXNHEAD *, u_int32_t));
+ */
+int
+__db_txnlist_remove(env, hp, txnid)
+ ENV *env;
+ DB_TXNHEAD *hp;
+ u_int32_t txnid;
+{
+ DB_TXNLIST *entry;
+ u_int32_t status;
+
+ return (__db_txnlist_find_internal(env,
+ hp, TXNLIST_TXNID, txnid, &entry, 1, &status));
+}
+
+/*
+ * __db_txnlist_ckp --
+ * Used to record the maximum checkpoint that will be retained
+ * after recovery. Typically this is simply the max checkpoint, but
+ * if we are doing client replication recovery or timestamp-based
+ * recovery, we are going to virtually truncate the log and we need
+ * to retain the last checkpoint before the truncation point.
+ *
+ * PUBLIC: void __db_txnlist_ckp __P((ENV *, DB_TXNHEAD *, DB_LSN *));
+ */
+void
+__db_txnlist_ckp(env, hp, ckp_lsn)
+ ENV *env;
+ DB_TXNHEAD *hp;
+ DB_LSN *ckp_lsn;
+{
+
+ COMPQUIET(env, NULL);
+
+ if (IS_ZERO_LSN(hp->ckplsn) && !IS_ZERO_LSN(hp->maxlsn) &&
+ LOG_COMPARE(&hp->maxlsn, ckp_lsn) >= 0)
+ hp->ckplsn = *ckp_lsn;
+}
+
+/*
+ * __db_txnlist_end --
+ * Discard transaction linked list.
+ *
+ * PUBLIC: void __db_txnlist_end __P((ENV *, DB_TXNHEAD *));
+ */
+void
+__db_txnlist_end(env, hp)
+ ENV *env;
+ DB_TXNHEAD *hp;
+{
+ u_int32_t i;
+ DB_TXNLIST *p;
+
+ if (hp == NULL)
+ return;
+
+ for (i = 0; i < hp->nslots; i++)
+ while (hp != NULL && (p = LIST_FIRST(&hp->head[i])) != NULL) {
+ switch (p->type) {
+ case TXNLIST_LSN:
+ __os_free(env, p->u.l.lsn_stack);
+ break;
+ case TXNLIST_DELETE:
+ case TXNLIST_TXNID:
+ default:
+ /*
+ * Possibly an incomplete DB_TXNLIST; just
+ * free it.
+ */
+ break;
+ }
+ LIST_REMOVE(p, links);
+ __os_free(env, p);
+ }
+
+ if (hp->gen_array != NULL)
+ __os_free(env, hp->gen_array);
+ __os_free(env, hp);
+}
+
+/*
+ * __db_txnlist_find --
+ * Checks to see if a txnid with the current generation is in the
+ * txnid list. This returns DB_NOTFOUND if the item isn't in the
+ * list otherwise it returns (like __db_txnlist_find_internal)
+ *	list; otherwise it returns (like __db_txnlist_find_internal)
+ * was generated while not in a transaction.
+ *
+ * PUBLIC: int __db_txnlist_find __P((ENV *,
+ * PUBLIC: DB_TXNHEAD *, u_int32_t, u_int32_t *));
+ */
+int
+__db_txnlist_find(env, hp, txnid, statusp)
+ ENV *env;
+ DB_TXNHEAD *hp;
+ u_int32_t txnid, *statusp;
+{
+ DB_TXNLIST *entry;
+
+ if (txnid == 0)
+ return (DB_NOTFOUND);
+
+ return (__db_txnlist_find_internal(env, hp,
+ TXNLIST_TXNID, txnid, &entry, 0, statusp));
+}
+
+/*
+ * __db_txnlist_update --
+ * Change the status of an existing transaction entry.
+ * Returns DB_NOTFOUND if no such entry exists.
+ *
+ * PUBLIC: int __db_txnlist_update __P((ENV *, DB_TXNHEAD *,
+ * PUBLIC: u_int32_t, u_int32_t, DB_LSN *, u_int32_t *, int));
+ */
+int
+__db_txnlist_update(env, hp, txnid, status, lsn, ret_status, add_ok)
+ ENV *env;
+ DB_TXNHEAD *hp;
+ u_int32_t txnid, status;
+ DB_LSN *lsn;
+ u_int32_t *ret_status;
+ int add_ok;
+{
+ DB_TXNLIST *elp;
+ int ret;
+
+ if (txnid == 0)
+ return (DB_NOTFOUND);
+
+ ret = __db_txnlist_find_internal(env,
+ hp, TXNLIST_TXNID, txnid, &elp, 0, ret_status);
+
+ if (ret == DB_NOTFOUND && add_ok) {
+ *ret_status = status;
+ return (__db_txnlist_add(env, hp, txnid, status, lsn));
+ }
+ if (ret != 0)
+ return (ret);
+
+ if (*ret_status == TXN_IGNORE)
+ return (0);
+
+ elp->u.t.status = status;
+
+ if (lsn != NULL && IS_ZERO_LSN(hp->maxlsn) && status == TXN_COMMIT)
+ hp->maxlsn = *lsn;
+
+ return (ret);
+}
+
+/*
+ * __db_txnlist_find_internal --
+ * Find an entry on the transaction list. If the entry is not there or
+ * the list pointer is not initialized we return DB_NOTFOUND. If the
+ * item is found, we return the status. Currently we always call this
+ * with an initialized list pointer but checking for NULL keeps it general.
+ */
+static int
+__db_txnlist_find_internal(env,
+ hp, type, txnid, txnlistp, del, statusp)
+ ENV *env;
+ DB_TXNHEAD *hp;
+ db_txnlist_type type;
+ u_int32_t txnid;
+ DB_TXNLIST **txnlistp;
+ int del;
+ u_int32_t *statusp;
+{
+ struct __db_headlink *head;
+ DB_TXNLIST *p;
+ u_int32_t generation, hash;
+ int ret;
+
+ ret = 0;
+
+ if (hp == NULL)
+ return (DB_NOTFOUND);
+
+ switch (type) {
+ case TXNLIST_TXNID:
+ hash = txnid;
+ FIND_GENERATION(hp, txnid, generation);
+ break;
+ case TXNLIST_DELETE:
+ case TXNLIST_LSN:
+ default:
+ return (__env_panic(env, EINVAL));
+ }
+
+ head = &hp->head[DB_TXNLIST_MASK(hp, hash)];
+ LIST_FOREACH(p, head, links) {
+ if (p->type != type)
+ continue;
+ switch (type) {
+ case TXNLIST_TXNID:
+ if (p->u.t.txnid != txnid ||
+ generation != p->u.t.generation)
+ continue;
+ *statusp = p->u.t.status;
+ break;
+
+ case TXNLIST_DELETE:
+ case TXNLIST_LSN:
+ default:
+ return (__env_panic(env, EINVAL));
+ }
+ if (del == 1) {
+ LIST_REMOVE(p, links);
+ __os_free(env, p);
+ *txnlistp = NULL;
+ } else if (p != LIST_FIRST(head)) {
+ /* Move it to head of list. */
+ LIST_REMOVE(p, links);
+ LIST_INSERT_HEAD(head, p, links);
+ *txnlistp = p;
+ } else
+ *txnlistp = p;
+ return (ret);
+ }
+
+ return (DB_NOTFOUND);
+}
+
+/*
+ * __db_txnlist_gen --
+ * Change the current generation number.
+ *
+ * PUBLIC: int __db_txnlist_gen __P((ENV *,
+ * PUBLIC: DB_TXNHEAD *, int, u_int32_t, u_int32_t));
+ */
+int
+__db_txnlist_gen(env, hp, incr, min, max)
+ ENV *env;
+ DB_TXNHEAD *hp;
+ int incr;
+ u_int32_t min, max;
+{
+ int ret;
+
+ /*
+ * During recovery generation numbers keep track of "restart"
+ * checkpoints and recycle records. Restart checkpoints occur
+ * whenever we take a checkpoint and there are no outstanding
+ * transactions. When that happens, we can reset transaction IDs
+	 * back to TXN_MINIMUM.  Currently we only do the reset
+	 * at the end of recovery.  Recycle records occur when txnids
+ * are exhausted during runtime. A free range of ids is identified
+ * and logged. This code maintains a stack of ranges. A txnid
+ * is given the generation number of the first range it falls into
+ * in the stack.
+ */
+ if (incr < 0) {
+ --hp->generation;
+ memmove(hp->gen_array, &hp->gen_array[1],
+ (hp->generation + 1) * sizeof(hp->gen_array[0]));
+ } else {
+ ++hp->generation;
+ if (hp->generation >= hp->gen_alloc) {
+ hp->gen_alloc *= 2;
+ if ((ret = __os_realloc(env, hp->gen_alloc *
+ sizeof(hp->gen_array[0]), &hp->gen_array)) != 0)
+ return (ret);
+ }
+ memmove(&hp->gen_array[1], &hp->gen_array[0],
+ hp->generation * sizeof(hp->gen_array[0]));
+ hp->gen_array[0].generation = hp->generation;
+ hp->gen_array[0].txn_min = min;
+ hp->gen_array[0].txn_max = max;
+ }
+ return (0);
+}
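+
+/*
+ * A worked example of the range stack (illustrative values): after a
+ * recycle record is processed with
+ *
+ *	__db_txnlist_gen(env, hp, 1, 0x80001000, 0x80004000);
+ *
+ * generation 1 owns [0x80001000, 0x80004000] in gen_array[0], and the
+ * initial full range from __db_txnlist_init drops to gen_array[1].
+ * FIND_GENERATION then assigns a txnid the generation of the first
+ * (newest) range it falls into, scanning from gen_array[0].
+ */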
+
+/*
+ * __db_txnlist_lsnadd --
+ * Save the prev_lsn from a txn_child record.
+ *
+ * PUBLIC: int __db_txnlist_lsnadd __P((ENV *, DB_TXNHEAD *, DB_LSN *));
+ */
+int
+__db_txnlist_lsnadd(env, hp, lsnp)
+ ENV *env;
+ DB_TXNHEAD *hp;
+ DB_LSN *lsnp;
+{
+ DB_TXNLIST *elp;
+ int ret;
+
+ if (IS_ZERO_LSN(*lsnp))
+ return (0);
+
+ LIST_FOREACH(elp, &hp->head[0], links)
+ if (elp->type == TXNLIST_LSN)
+ break;
+
+ if (elp == NULL) {
+ if ((ret = __db_txnlist_lsninit(env, hp, lsnp)) != 0)
+ return (ret);
+ return (DB_SURPRISE_KID);
+ }
+
+ if (elp->u.l.stack_indx == elp->u.l.stack_size) {
+ elp->u.l.stack_size <<= 1;
+ if ((ret = __os_realloc(env, sizeof(DB_LSN) *
+ elp->u.l.stack_size, &elp->u.l.lsn_stack)) != 0) {
+ __db_txnlist_end(env, hp);
+ return (ret);
+ }
+ }
+ elp->u.l.lsn_stack[elp->u.l.stack_indx++] = *lsnp;
+
+ return (0);
+}
+
+/*
+ * __db_txnlist_lsnget --
+ *	Get the lsn saved from a txn_child record.
+ *
+ * PUBLIC: int __db_txnlist_lsnget __P((ENV *,
+ * PUBLIC: DB_TXNHEAD *, DB_LSN *, u_int32_t));
+ */
+int
+__db_txnlist_lsnget(env, hp, lsnp, flags)
+ ENV *env;
+ DB_TXNHEAD *hp;
+ DB_LSN *lsnp;
+ u_int32_t flags;
+{
+ DB_TXNLIST *elp;
+
+ COMPQUIET(env, NULL);
+ COMPQUIET(flags, 0);
+
+ LIST_FOREACH(elp, &hp->head[0], links)
+ if (elp->type == TXNLIST_LSN)
+ break;
+
+ if (elp == NULL || elp->u.l.stack_indx == 0) {
+ ZERO_LSN(*lsnp);
+ return (0);
+ }
+
+ *lsnp = elp->u.l.lsn_stack[--elp->u.l.stack_indx];
+
+ return (0);
+}
+
+/*
+ * __db_txnlist_lsninit --
+ * Initialize a transaction list with an lsn array entry.
+ *
+ * PUBLIC: int __db_txnlist_lsninit __P((ENV *, DB_TXNHEAD *, DB_LSN *));
+ */
+int
+__db_txnlist_lsninit(env, hp, lsnp)
+ ENV *env;
+ DB_TXNHEAD *hp;
+ DB_LSN *lsnp;
+{
+ DB_TXNLIST *elp;
+ int ret;
+
+ elp = NULL;
+
+ if ((ret = __os_malloc(env, sizeof(DB_TXNLIST), &elp)) != 0)
+ goto err;
+ LIST_INSERT_HEAD(&hp->head[0], elp, links);
+
+ elp->type = TXNLIST_LSN;
+ if ((ret = __os_malloc(env,
+ sizeof(DB_LSN) * DB_LSN_STACK_SIZE, &elp->u.l.lsn_stack)) != 0)
+ goto err;
+ elp->u.l.stack_indx = 1;
+ elp->u.l.stack_size = DB_LSN_STACK_SIZE;
+ elp->u.l.lsn_stack[0] = *lsnp;
+
+ return (0);
+
+err: __db_txnlist_end(env, hp);
+ return (ret);
+}
+
+#ifdef DEBUG
+/*
+ * __db_txnlist_print --
+ * Print out the transaction list.
+ *
+ * PUBLIC: void __db_txnlist_print __P((DB_TXNHEAD *));
+ */
+void
+__db_txnlist_print(hp)
+ DB_TXNHEAD *hp;
+{
+ DB_TXNLIST *p;
+ u_int32_t i;
+ char *txntype;
+
+ printf("Maxid: %lu Generation: %lu\n",
+ (u_long)hp->maxid, (u_long)hp->generation);
+ for (i = 0; i < hp->nslots; i++)
+ LIST_FOREACH(p, &hp->head[i], links) {
+ if (p->type != TXNLIST_TXNID) {
+ printf("Unrecognized type: %d\n", p->type);
+ continue;
+ }
+ switch (p->u.t.status) {
+ case TXN_OK:
+ txntype = "OK";
+ break;
+ case TXN_COMMIT:
+ txntype = "commit";
+ break;
+ case TXN_PREPARE:
+ txntype = "prepare";
+ break;
+ case TXN_ABORT:
+ txntype = "abort";
+ break;
+ case TXN_IGNORE:
+ txntype = "ignore";
+ break;
+ case TXN_EXPECTED:
+ txntype = "expected";
+ break;
+ case TXN_UNEXPECTED:
+ txntype = "unexpected";
+ break;
+ default:
+ txntype = "UNKNOWN";
+ break;
+ }
+ printf("TXNID: %lx(%lu): %s\n",
+ (u_long)p->u.t.txnid,
+ (u_long)p->u.t.generation, txntype);
+ }
+}
+#endif
diff --git a/src/db/db_dup.c b/src/db/db_dup.c
new file mode 100644
index 00000000..9fd04791
--- /dev/null
+++ b/src/db/db_dup.c
@@ -0,0 +1,214 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/mp.h"
+#include "dbinc/log.h"
+#include "dbinc/db_am.h"
+#include "dbinc/txn.h"
+
+/*
+ * __db_ditem_nolog --
+ * Remove an item from a page without affecting its recoverability.
+ *
+ * PUBLIC: int __db_ditem_nolog __P((DBC *, PAGE *, u_int32_t, u_int32_t));
+ */
+int
+__db_ditem_nolog(dbc, pagep, indx, nbytes)
+ DBC *dbc;
+ PAGE *pagep;
+ u_int32_t indx, nbytes;
+{
+ DB *dbp;
+ db_indx_t cnt, *inp, offset;
+ u_int8_t *from;
+
+ dbp = dbc->dbp;
+ DB_ASSERT(dbp->env, IS_DIRTY(pagep));
+ DB_ASSERT(dbp->env, indx < NUM_ENT(pagep));
+
+ /*
+ * If there's only a single item on the page, we don't have to
+ * work hard.
+ */
+ if (NUM_ENT(pagep) == 1) {
+ NUM_ENT(pagep) = 0;
+ HOFFSET(pagep) = dbp->pgsize;
+ return (0);
+ }
+
+ inp = P_INP(dbp, pagep);
+ /*
+ * Pack the remaining key/data items at the end of the page. Use
+	 * memmove(3); the regions may overlap.
+ */
+ from = (u_int8_t *)pagep + HOFFSET(pagep);
+ DB_ASSERT(dbp->env, inp[indx] >= HOFFSET(pagep));
+ memmove(from + nbytes, from, inp[indx] - HOFFSET(pagep));
+ HOFFSET(pagep) += nbytes;
+
+ /* Adjust the indices' offsets. */
+ offset = inp[indx];
+ for (cnt = 0; cnt < NUM_ENT(pagep); ++cnt)
+ if (inp[cnt] < offset)
+ inp[cnt] += nbytes;
+
+ /* Shift the indices down. */
+ --NUM_ENT(pagep);
+ if (indx != NUM_ENT(pagep))
+ memmove(&inp[indx], &inp[indx + 1],
+ sizeof(db_indx_t) * (NUM_ENT(pagep) - indx));
+
+ return (0);
+}
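+
+/*
+ * A worked example of the compaction above (illustrative numbers): on
+ * a 4096-byte page with three items, inp[] = {4000, 3900, 3800} and
+ * HOFFSET = 3800, deleting indx 1 with nbytes = 100 memmoves the 100
+ * bytes at offsets 3800..3899 (item 2) up to 3900..3999, raises
+ * HOFFSET to 3900, bumps inp[2] from 3800 to 3900 because it sat
+ * below the deleted item, and shifts the index array down, leaving
+ * inp[] = {4000, 3900} and NUM_ENT = 2.
+ */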
+
+/*
+ * __db_ditem --
+ * Remove an item from a page, logging it if enabled.
+ *
+ * PUBLIC: int __db_ditem __P((DBC *, PAGE *, u_int32_t, u_int32_t));
+ */
+int
+__db_ditem(dbc, pagep, indx, nbytes)
+ DBC *dbc;
+ PAGE *pagep;
+ u_int32_t indx, nbytes;
+{
+ DB *dbp;
+ DBT ldbt;
+ int ret;
+
+ dbp = dbc->dbp;
+
+ if (DBC_LOGGING(dbc)) {
+ ldbt.data = P_ENTRY(dbp, pagep, indx);
+ ldbt.size = nbytes;
+ if ((ret = __db_addrem_log(dbp, dbc->txn, &LSN(pagep), 0,
+ OP_SET(DB_REM_DUP, pagep), PGNO(pagep),
+ (u_int32_t)indx, nbytes, &ldbt, NULL, &LSN(pagep))) != 0)
+ return (ret);
+ } else
+ LSN_NOT_LOGGED(LSN(pagep));
+
+ return (__db_ditem_nolog(dbc, pagep, indx, nbytes));
+}
+
+/*
+ * __db_pitem_nolog --
+ * Put an item on a page without logging.
+ *
+ * PUBLIC: int __db_pitem_nolog
+ * PUBLIC: __P((DBC *, PAGE *, u_int32_t, u_int32_t, DBT *, DBT *));
+ */
+int
+__db_pitem_nolog(dbc, pagep, indx, nbytes, hdr, data)
+ DBC *dbc;
+ PAGE *pagep;
+ u_int32_t indx;
+ u_int32_t nbytes;
+ DBT *hdr, *data;
+{
+ BKEYDATA bk;
+ DB *dbp;
+ DBT thdr;
+ db_indx_t *inp;
+ u_int8_t *p;
+
+ dbp = dbc->dbp;
+
+ DB_ASSERT(dbp->env, IS_DIRTY(pagep));
+
+ if (nbytes > P_FREESPACE(dbp, pagep)) {
+ DB_ASSERT(dbp->env, nbytes <= P_FREESPACE(dbp, pagep));
+ return (EINVAL);
+ }
+
+ if (hdr == NULL) {
+ B_TSET(bk.type, B_KEYDATA);
+ bk.len = data == NULL ? 0 : data->size;
+
+ thdr.data = &bk;
+ thdr.size = SSZA(BKEYDATA, data);
+ hdr = &thdr;
+ }
+ inp = P_INP(dbp, pagep);
+
+ /* Adjust the index table, then put the item on the page. */
+ if (indx != NUM_ENT(pagep))
+ memmove(&inp[indx + 1], &inp[indx],
+ sizeof(db_indx_t) * (NUM_ENT(pagep) - indx));
+ HOFFSET(pagep) -= nbytes;
+ inp[indx] = HOFFSET(pagep);
+ ++NUM_ENT(pagep);
+
+ p = P_ENTRY(dbp, pagep, indx);
+ memcpy(p, hdr->data, hdr->size);
+ if (data != NULL)
+ memcpy(p + hdr->size, data->data, data->size);
+
+ return (0);
+}
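+
+/*
+ * Continuing the deletion example above (illustrative numbers):
+ * re-inserting a 100-byte item at indx 1 on that page shifts inp[1]
+ * up into inp[2], lowers HOFFSET from 3900 back to 3800, stores
+ * inp[1] = 3800 and copies the header (plus data, if non-NULL) into
+ * offsets 3800..3899, giving inp[] = {4000, 3800, 3900} and
+ * NUM_ENT = 3.  Note the new item's bytes land at the bottom of the
+ * data region regardless of its index position.
+ */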
+
+/*
+ * __db_pitem --
+ * Put an item on a page.
+ *
+ * PUBLIC: int __db_pitem
+ * PUBLIC: __P((DBC *, PAGE *, u_int32_t, u_int32_t, DBT *, DBT *));
+ */
+int
+__db_pitem(dbc, pagep, indx, nbytes, hdr, data)
+ DBC *dbc;
+ PAGE *pagep;
+ u_int32_t indx;
+ u_int32_t nbytes;
+ DBT *hdr, *data;
+{
+ DB *dbp;
+ MPOOLFILE *mpf;
+ int ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf->mfp;
+ /*
+ * Put a single item onto a page. The logic figuring out where to
+ * insert and whether it fits is handled in the caller. All we do
+ * here is manage the page shuffling. We cheat a little bit in that
+ * we don't want to copy the dbt on a normal put twice. If hdr is
+ * NULL, we create a BKEYDATA structure on the page, otherwise, just
+ * copy the caller's information onto the page.
+ *
+ * This routine is also used to put entries onto the page where the
+ * entry is pre-built, e.g., during recovery. In this case, the hdr
+ * will point to the entry, and the data argument will be NULL.
+ *
+ * If transactional bulk loading is enabled in this
+ * transaction, and the page is above the file's extension
+ * watermark, skip logging, but do not invoke LSN_NOT_LOGGED.
+ *
+ * !!!
+ * There's a tremendous potential for off-by-one errors here, since
+ * the passed in header sizes must be adjusted for the structure's
+ * placeholder for the trailing variable-length data field.
+ */
+ if (DBC_LOGGING(dbc)) {
+ if (__txn_pg_above_fe_watermark(dbc->txn, mpf, PGNO(pagep))) {
+ mpf->fe_nlws++; /* Note that logging was skipped. */
+ } else if ((ret = __db_addrem_log(dbp, dbc->txn, &LSN(pagep),
+ 0, OP_SET(DB_ADD_DUP, pagep), PGNO(pagep),
+ (u_int32_t)indx, nbytes, hdr, data, &LSN(pagep)))) {
+ return (ret);
+ }
+ } else
+ LSN_NOT_LOGGED(LSN(pagep));
+
+ return (__db_pitem_nolog(dbc, pagep, indx, nbytes, hdr, data));
+}
diff --git a/src/db/db_iface.c b/src/db/db_iface.c
new file mode 100644
index 00000000..59e0ba53
--- /dev/null
+++ b/src/db/db_iface.c
@@ -0,0 +1,3001 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#ifndef HAVE_QUEUE
+#include "dbinc/qam.h" /* For __db_no_queue_am(). */
+#endif
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/partition.h"
+#include "dbinc/txn.h"
+
+static int __db_associate_arg __P((DB *, DB *,
+ int (*)(DB *, const DBT *, const DBT *, DBT *), u_int32_t));
+static int __dbc_del_arg __P((DBC *, u_int32_t));
+static int __dbc_pget_arg __P((DBC *, DBT *, u_int32_t));
+static int __dbc_put_arg __P((DBC *, DBT *, DBT *, u_int32_t));
+static int __db_curinval __P((const ENV *));
+static int __db_cursor_arg __P((DB *, u_int32_t));
+static int __db_del_arg __P((DB *, DBT *, u_int32_t));
+static int __db_get_arg __P((const DB *, DBT *, DBT *, u_int32_t));
+static int __db_join_arg __P((DB *, DBC **, u_int32_t));
+static int __db_open_arg __P((DB *,
+ DB_TXN *, const char *, const char *, DBTYPE, u_int32_t));
+static int __db_pget_arg __P((DB *, DBT *, u_int32_t));
+static int __db_put_arg __P((DB *, DBT *, DBT *, u_int32_t));
+static int __dbt_ferr __P((const DB *, const char *, const DBT *, int));
+static int __db_compact_func
+ __P((DBC *, DBC *, u_int32_t *, db_pgno_t, u_int32_t, void *));
+static int __db_associate_foreign_arg __P((DB *, DB *,
+ int (*)(DB *, const DBT *, DBT *, const DBT *, int *),
+ u_int32_t));
+
+/*
+ * These functions implement the Berkeley DB API. They are organized in a
+ * layered fashion. The interface functions (XXX_pp) perform all generic
+ * error checks (for example, PANIC'd region, replication state change
+ * in progress, inconsistent transaction usage), call function-specific
+ * check routines (_arg) to check for proper flag usage, etc., do pre-amble
+ * processing (incrementing handle counts, handling local transactions),
+ * call the function and then do post-amble processing (local transactions,
+ * decrement handle counts).
+ *
+ * The basic structure is:
+ * Check for simple/generic errors (PANIC'd region)
+ * Check if replication is changing state (increment handle count).
+ * Call function-specific argument checking routine
+ * Create internal transaction if necessary
+ * Call underlying worker function
+ * Commit/abort internal transaction if necessary
+ * Decrement handle count
+ */
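+
+/*
+ * A skeletal sketch of the layering just described (illustrative
+ * only; each _pp function below fills it in with method-specific
+ * details):
+ *
+ *	ENV_ENTER(env, ip);
+ *	if (IS_ENV_REPLICATED(env) &&
+ *	    (ret = __db_rep_enter(dbp, ...)) != 0)
+ *		goto err;
+ *	if ((ret = __xxx_arg(dbp, ...)) != 0)	check flags
+ *		goto err;
+ *	if (IS_DB_AUTO_COMMIT(dbp, txn)) {	local transaction
+ *		if ((ret = __txn_begin(env, ip, NULL, &txn, 0)) != 0)
+ *			goto err;
+ *		txn_local = 1;
+ *	}
+ *	ret = __xxx(dbp, ip, txn, ...);		worker function
+ * err:	resolve the local transaction, release the replication
+ *	block, ENV_LEAVE(env, ip) and return.
+ */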
+
+/*
+ * __db_associate_pp --
+ * DB->associate pre/post processing.
+ *
+ * PUBLIC: int __db_associate_pp __P((DB *, DB_TXN *, DB *,
+ * PUBLIC: int (*)(DB *, const DBT *, const DBT *, DBT *), u_int32_t));
+ */
+int
+__db_associate_pp(dbp, txn, sdbp, callback, flags)
+ DB *dbp, *sdbp;
+ DB_TXN *txn;
+ int (*callback) __P((DB *, const DBT *, const DBT *, DBT *));
+ u_int32_t flags;
+{
+ DBC *sdbc;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ret, t_ret, txn_local;
+
+ env = dbp->env;
+ txn_local = 0;
+
+ STRIP_AUTO_COMMIT(flags);
+
+ ENV_ENTER(env, ip);
+ XA_CHECK_TXN(ip, txn);
+
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check &&
+ (ret = __db_rep_enter(dbp, 1, 0, IS_REAL_TXN(txn))) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ /*
+ * Secondary cursors may have the primary's lock file ID, so we need
+ * to make sure that no older cursors are lying around when we make
+ * the transition.
+ */
+ if (TAILQ_FIRST(&sdbp->active_queue) != NULL ||
+ TAILQ_FIRST(&sdbp->join_queue) != NULL) {
+ __db_errx(env, DB_STR("0572",
+ "Databases may not become secondary indices while cursors are open"));
+ ret = EINVAL;
+ goto err;
+ }
+
+ if ((ret = __db_associate_arg(dbp, sdbp, callback, flags)) != 0)
+ goto err;
+
+ /*
+ * Create a local transaction as necessary, check for consistent
+ * transaction usage, and, if we have no transaction but do have
+ * locking on, acquire a locker id for the handle lock acquisition.
+ */
+ if (IS_DB_AUTO_COMMIT(dbp, txn)) {
+ if ((ret = __txn_begin(env, ip, NULL, &txn, 0)) != 0)
+ goto err;
+ txn_local = 1;
+ }
+
+ /* Check for consistent transaction usage. */
+ if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 0)) != 0)
+ goto err;
+
+ while ((sdbc = TAILQ_FIRST(&sdbp->free_queue)) != NULL)
+ if ((ret = __dbc_destroy(sdbc)) != 0)
+ goto err;
+
+ ret = __db_associate(dbp, ip, txn, sdbp, callback, flags);
+
+err: if (txn_local &&
+ (t_ret = __db_txn_auto_resolve(env, txn, 0, ret)) && ret == 0)
+ ret = t_ret;
+
+ /* Release replication block. */
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __db_associate_arg --
+ * Check DB->associate arguments.
+ */
+static int
+__db_associate_arg(dbp, sdbp, callback, flags)
+ DB *dbp, *sdbp;
+ int (*callback) __P((DB *, const DBT *, const DBT *, DBT *));
+ u_int32_t flags;
+{
+ ENV *env;
+ int ret;
+
+ env = dbp->env;
+
+ if (sdbp->type == DB_HEAP) {
+ __db_errx(env,
+ "Heap databases may not be used as secondary databases");
+ return (EINVAL);
+ }
+
+ if (F_ISSET(sdbp, DB_AM_SECONDARY)) {
+ __db_errx(env, DB_STR("0573",
+ "Secondary index handles may not be re-associated"));
+ return (EINVAL);
+ }
+ if (F_ISSET(dbp, DB_AM_SECONDARY)) {
+ __db_errx(env, DB_STR("0574",
+ "Secondary indices may not be used as primary databases"));
+ return (EINVAL);
+ }
+ if (F_ISSET(dbp, DB_AM_DUP)) {
+ __db_errx(env, DB_STR("0575",
+ "Primary databases may not be configured with duplicates"));
+ return (EINVAL);
+ }
+ if (F_ISSET(dbp, DB_AM_RENUMBER)) {
+ __db_errx(env, DB_STR("0576",
+ "Renumbering recno databases may not be used as primary databases"));
+ return (EINVAL);
+ }
+
+ /*
+ * It's OK for the primary and secondary to not share an environment IFF
+ * the environments are local to the DB handle. (Specifically, cursor
+ * adjustment will work correctly in this case.) The environment being
+ * local implies the environment is not configured for either locking or
+ * transactions, as neither of those could work correctly.
+ */
+ if (dbp->env != sdbp->env &&
+ (!F_ISSET(dbp->env, ENV_DBLOCAL) ||
+ !F_ISSET(sdbp->env, ENV_DBLOCAL))) {
+ __db_errx(env, DB_STR("0577",
+ "The primary and secondary must be opened in the same environment"));
+ return (EINVAL);
+ }
+ if ((DB_IS_THREADED(dbp) && !DB_IS_THREADED(sdbp)) ||
+ (!DB_IS_THREADED(dbp) && DB_IS_THREADED(sdbp))) {
+ __db_errx(env, DB_STR("0578",
+ "The DB_THREAD setting must be the same for primary and secondary"));
+ return (EINVAL);
+ }
+ if (callback == NULL &&
+ (!F_ISSET(dbp, DB_AM_RDONLY) || !F_ISSET(sdbp, DB_AM_RDONLY))) {
+ __db_errx(env, DB_STR("0579",
+"Callback function may be NULL only when database handles are read-only"));
+ return (EINVAL);
+ }
+
+ if ((ret = __db_fchk(env, "DB->associate", flags, DB_CREATE |
+ DB_IMMUTABLE_KEY)) != 0)
+ return (ret);
+
+ return (0);
+}
+
+/*
+ * __db_close_pp --
+ * DB->close pre/post processing.
+ *
+ * PUBLIC: int __db_close_pp __P((DB *, u_int32_t));
+ */
+int
+__db_close_pp(dbp, flags)
+ DB *dbp;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ret, t_ret;
+
+ env = dbp->env;
+ ret = 0;
+
+ /*
+ * Close a DB handle -- as a handle destructor, we can't fail.
+ *
+ * !!!
+ * The actual argument checking is simple, do it inline, outside of
+ * the replication block.
+ */
+ if (flags != 0 && flags != DB_NOSYNC)
+ ret = __db_ferr(env, "DB->close", 0);
+
+ ENV_ENTER(env, ip);
+
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check && (t_ret = __db_rep_enter(dbp, 0, 0, 0)) != 0) {
+ handle_check = 0;
+ if (ret == 0)
+ ret = t_ret;
+ }
+
+ if ((t_ret = __db_close(dbp, NULL, flags)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Release replication block. */
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __db_cursor_pp --
+ * DB->cursor pre/post processing.
+ *
+ * PUBLIC: int __db_cursor_pp __P((DB *, DB_TXN *, DBC **, u_int32_t));
+ */
+int
+__db_cursor_pp(dbp, txn, dbcp, flags)
+ DB *dbp;
+ DB_TXN *txn;
+ DBC **dbcp;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ REGENV *renv;
+ int rep_blocked, ret;
+
+ env = dbp->env;
+
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->cursor");
+
+ ENV_ENTER(env, ip);
+ XA_CHECK_TXN(ip, txn);
+
+ /* Check for replication block. */
+ rep_blocked = 0;
+ if (IS_ENV_REPLICATED(env)) {
+ if (!IS_REAL_TXN(txn)) {
+ if ((ret = __op_rep_enter(env, 0, 1)) != 0)
+ goto err;
+ rep_blocked = 1;
+ }
+ renv = env->reginfo->primary;
+ if (dbp->timestamp != renv->rep_timestamp) {
+ __db_errx(env, DB_STR("0580",
+			    "replication recovery unrolled committed transactions; "
+ "open DB and DBcursor handles must be closed"));
+ ret = DB_REP_HANDLE_DEAD;
+ goto err;
+ }
+ }
+ if ((ret = __db_cursor_arg(dbp, flags)) != 0)
+ goto err;
+
+ /*
+ * Check for consistent transaction usage. For now, assume this
+ * cursor might be used for read operations only (in which case
+ * it may not require a txn). We'll check more stringently in
+ * c_del and c_put. (Note this means the read-op txn tests have
+ * to be a subset of the write-op ones.)
+ */
+ if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 1)) != 0)
+ goto err;
+
+ ret = __db_cursor(dbp, ip, txn, dbcp, flags);
+
+ /*
+ * Register externally created cursors into the valid transaction.
+ * If a family transaction was passed in, the transaction handle in
+ * the cursor may not match.
+ */
+ txn = (*dbcp)->txn;
+ if (txn != NULL && ret == 0)
+ TAILQ_INSERT_HEAD(&(txn->my_cursors), *dbcp, txn_cursors);
+
+err: /* Release replication block on error. */
+ if (ret != 0 && rep_blocked)
+ (void)__op_rep_exit(env);
+
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __db_cursor --
+ * DB->cursor.
+ *
+ * PUBLIC: int __db_cursor __P((DB *,
+ * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DBC **, u_int32_t));
+ */
+int
+__db_cursor(dbp, ip, txn, dbcp, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ DBC **dbcp;
+ u_int32_t flags;
+{
+ DBC *dbc;
+ ENV *env;
+ db_lockmode_t mode;
+ int ret;
+
+ env = dbp->env;
+
+ if (MULTIVERSION(dbp) && txn == NULL && (LF_ISSET(DB_TXN_SNAPSHOT) ||
+ F_ISSET(env->dbenv, DB_ENV_TXN_SNAPSHOT))) {
+ if ((ret =
+ __txn_begin(env, ip, NULL, &txn, DB_TXN_SNAPSHOT)) != 0)
+ return (ret);
+ F_SET(txn, TXN_PRIVATE);
+ }
+
+ PERFMON5(env, db, cursor, dbp->fname,
+ dbp->dname, txn == NULL ? 0 : txn->txnid, flags, &dbp->fileid[0]);
+
+ if ((ret = __db_cursor_int(dbp, ip, txn, dbp->type, PGNO_INVALID,
+ LF_ISSET(DB_CURSOR_BULK | DB_CURSOR_TRANSIENT | DB_RECOVER),
+ NULL, &dbc)) != 0)
+ return (ret);
+
+ /*
+ * If this is CDB, do all the locking in the interface, which is
+ * right here.
+ */
+ if (CDB_LOCKING(env)) {
+ mode = (LF_ISSET(DB_WRITELOCK)) ? DB_LOCK_WRITE :
+ ((LF_ISSET(DB_WRITECURSOR) || txn != NULL) ?
+ DB_LOCK_IWRITE : DB_LOCK_READ);
+ if ((ret = __lock_get(env, dbc->locker, 0,
+ &dbc->lock_dbt, mode, &dbc->mylock)) != 0)
+ goto err;
+ if (LF_ISSET(DB_WRITECURSOR))
+ F_SET(dbc, DBC_WRITECURSOR);
+ if (LF_ISSET(DB_WRITELOCK))
+ F_SET(dbc, DBC_WRITER);
+ }
+
+ if (LF_ISSET(DB_READ_UNCOMMITTED) ||
+ (txn != NULL && F_ISSET(txn, TXN_READ_UNCOMMITTED)))
+ F_SET(dbc, DBC_READ_UNCOMMITTED);
+
+ if (LF_ISSET(DB_READ_COMMITTED) ||
+ (txn != NULL && F_ISSET(txn, TXN_READ_COMMITTED)))
+ F_SET(dbc, DBC_READ_COMMITTED);
+
+ *dbcp = dbc;
+ return (0);
+
+err: (void)__dbc_close(dbc);
+ return (ret);
+}
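+
+/*
+ * The CDB lock-mode selection above, tabulated (a restatement of the
+ * code, not new behavior):
+ *
+ *	DB_WRITELOCK			DB_LOCK_WRITE
+ *	DB_WRITECURSOR or txn != NULL	DB_LOCK_IWRITE
+ *	otherwise			DB_LOCK_READ
+ *
+ * An intention-write lock lets a write cursor coexist with readers
+ * until an actual write forces an upgrade to a write lock.
+ */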
+
+/*
+ * __db_cursor_arg --
+ * Check DB->cursor arguments.
+ */
+static int
+__db_cursor_arg(dbp, flags)
+ DB *dbp;
+ u_int32_t flags;
+{
+ ENV *env;
+
+ env = dbp->env;
+
+ /*
+ * DB_READ_COMMITTED and DB_READ_UNCOMMITTED require locking.
+ */
+ if (LF_ISSET(DB_READ_COMMITTED | DB_READ_UNCOMMITTED)) {
+ if (!LOCKING_ON(env))
+ return (__db_fnl(env, "DB->cursor"));
+ }
+
+ LF_CLR(DB_CURSOR_BULK |
+ DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_TXN_SNAPSHOT);
+
+ /* Check for invalid function flags. */
+ if (LF_ISSET(DB_WRITECURSOR)) {
+ if (DB_IS_READONLY(dbp))
+ return (__db_rdonly(env, "DB->cursor"));
+ if (!CDB_LOCKING(env))
+ return (__db_ferr(env, "DB->cursor", 0));
+ LF_CLR(DB_WRITECURSOR);
+ } else if (LF_ISSET(DB_WRITELOCK)) {
+ if (DB_IS_READONLY(dbp))
+ return (__db_rdonly(env, "DB->cursor"));
+ LF_CLR(DB_WRITELOCK);
+ }
+
+ if (flags != 0)
+ return (__db_ferr(env, "DB->cursor", 0));
+
+ return (0);
+}
+
+/*
+ * __db_del_pp --
+ * DB->del pre/post processing.
+ *
+ * PUBLIC: int __db_del_pp __P((DB *, DB_TXN *, DBT *, u_int32_t));
+ */
+int
+__db_del_pp(dbp, txn, key, flags)
+ DB *dbp;
+ DB_TXN *txn;
+ DBT *key;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ret, t_ret, txn_local;
+
+ env = dbp->env;
+ txn_local = 0;
+
+ STRIP_AUTO_COMMIT(flags);
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->del");
+
+#ifdef CONFIG_TEST
+ if (IS_REP_MASTER(env))
+ DB_TEST_WAIT(env, env->test_check);
+#endif
+ ENV_ENTER(env, ip);
+ XA_CHECK_TXN(ip, txn);
+
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check &&
+ (ret = __db_rep_enter(dbp, 1, 0, IS_REAL_TXN(txn))) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ if ((ret = __db_del_arg(dbp, key, flags)) != 0)
+ goto err;
+
+ /* Create local transaction as necessary. */
+ if (IS_DB_AUTO_COMMIT(dbp, txn)) {
+ if ((ret = __txn_begin(env, ip, NULL, &txn, 0)) != 0)
+ goto err;
+ txn_local = 1;
+ }
+
+ /* Check for consistent transaction usage. */
+ if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 0)) != 0)
+ goto err;
+
+ ret = __db_del(dbp, ip, txn, key, flags);
+
+err: if (txn_local &&
+ (t_ret = __db_txn_auto_resolve(env, txn, 0, ret)) && ret == 0)
+ ret = t_ret;
+
+ /* Release replication block. */
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+ ENV_LEAVE(env, ip);
+ __dbt_userfree(env, key, NULL, NULL);
+ return (ret);
+}
+
+/*
+ * __db_del_arg --
+ * Check DB->delete arguments.
+ */
+static int
+__db_del_arg(dbp, key, flags)
+ DB *dbp;
+ DBT *key;
+ u_int32_t flags;
+{
+ ENV *env;
+ int ret;
+
+ env = dbp->env;
+
+ /* Check for changes to a read-only tree. */
+ if (DB_IS_READONLY(dbp))
+ return (__db_rdonly(env, "DB->del"));
+
+ /* Check for invalid function flags. */
+ switch (flags) {
+ case DB_CONSUME:
+ if (dbp->type != DB_QUEUE)
+ return (__db_ferr(env, "DB->del", 0));
+ goto copy;
+ case DB_MULTIPLE:
+ case DB_MULTIPLE_KEY:
+ if (!F_ISSET(key, DB_DBT_BULK)) {
+ __db_errx(env, DB_STR("0581",
+ "DB->del with DB_MULTIPLE(_KEY) requires multiple key records"));
+ return (EINVAL);
+ }
+ /* FALL THROUGH */
+ case 0:
+copy: if ((ret = __dbt_usercopy(env, key)) != 0)
+ return (ret);
+ break;
+ default:
+ return (__db_ferr(env, "DB->del", 0));
+ }
+
+ return (0);
+}
+
+/*
+ * __db_exists --
+ * DB->exists implementation.
+ *
+ * PUBLIC: int __db_exists __P((DB *, DB_TXN *, DBT *, u_int32_t));
+ */
+int
+__db_exists(dbp, txn, key, flags)
+ DB *dbp;
+ DB_TXN *txn;
+ DBT *key;
+ u_int32_t flags;
+{
+ DBT data;
+ int ret;
+
+ /*
+ * Most flag checking is done in the DB->get call, we only check for
+ * specific incompatibilities here. This saves making __get_arg
+ * aware of the exist method's API constraints.
+ */
+ STRIP_AUTO_COMMIT(flags);
+
+ if ((ret = __db_fchk(dbp->env, "DB->exists", flags,
+ DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW)) != 0)
+ return (ret);
+
+ /*
+ * Configure a data DBT that returns no bytes so there's no copy
+ * of the data.
+ */
+ memset(&data, 0, sizeof(data));
+ data.dlen = 0;
+ data.flags = DB_DBT_PARTIAL | DB_DBT_USERMEM;
+
+ return (dbp->get(dbp, txn, key, &data, flags));
+}
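+
+/*
+ * The same zero-copy probe written at the application level (a
+ * sketch; the dbp, txn and key setup are assumed to exist elsewhere):
+ * a data DBT with dlen = 0 and DB_DBT_PARTIAL | DB_DBT_USERMEM turns
+ * the get into a pure existence test, exactly as DB->exists does
+ * above:
+ *
+ *	DBT data;
+ *
+ *	memset(&data, 0, sizeof(data));
+ *	data.dlen = 0;
+ *	data.flags = DB_DBT_PARTIAL | DB_DBT_USERMEM;
+ *	ret = dbp->get(dbp, txn, &key, &data, 0);	0 means found
+ */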
+
+/*
+ * __db_fd_pp --
+ * DB->fd pre/post processing.
+ *
+ * PUBLIC: int __db_fd_pp __P((DB *, int *));
+ */
+int
+__db_fd_pp(dbp, fdp)
+ DB *dbp;
+ int *fdp;
+{
+ DB_FH *fhp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ret, t_ret;
+
+ env = dbp->env;
+
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->fd");
+
+ ENV_ENTER(env, ip);
+
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check && (ret = __db_rep_enter(dbp, 1, 0, 0)) != 0)
+ goto err;
+
+ /*
+ * !!!
+ * There's no argument checking to be done.
+ *
+ * !!!
+ * The actual method call is simple, do it inline.
+ *
+ * XXX
+ * Truly spectacular layering violation.
+ */
+ if ((ret = __mp_xxx_fh(dbp->mpf, &fhp)) == 0) {
+ if (fhp == NULL) {
+ *fdp = -1;
+ __db_errx(env, DB_STR("0582",
+ "Database does not have a valid file handle"));
+ ret = ENOENT;
+ } else
+ *fdp = fhp->fd;
+ }
+
+ /* Release replication block. */
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+err: ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __db_get_pp --
+ * DB->get pre/post processing.
+ *
+ * PUBLIC: int __db_get_pp __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
+ */
+int
+__db_get_pp(dbp, txn, key, data, flags)
+ DB *dbp;
+ DB_TXN *txn;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ u_int32_t mode;
+ int handle_check, ignore_lease, ret, t_ret, txn_local;
+
+ env = dbp->env;
+ mode = 0;
+ txn_local = 0;
+
+ STRIP_AUTO_COMMIT(flags);
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->get");
+
+ ignore_lease = LF_ISSET(DB_IGNORE_LEASE) ? 1 : 0;
+ LF_CLR(DB_IGNORE_LEASE);
+
+ if ((ret = __db_get_arg(dbp, key, data, flags)) != 0) {
+ __dbt_userfree(env, key, NULL, data);
+ return (ret);
+ }
+
+ ENV_ENTER(env, ip);
+ XA_CHECK_TXN(ip, txn);
+
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check &&
+ (ret = __db_rep_enter(dbp, 1, 0, IS_REAL_TXN(txn))) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ if (LF_ISSET(DB_READ_UNCOMMITTED))
+ mode = DB_READ_UNCOMMITTED;
+ else if ((flags & DB_OPFLAGS_MASK) == DB_CONSUME ||
+ (flags & DB_OPFLAGS_MASK) == DB_CONSUME_WAIT) {
+ mode = DB_WRITELOCK;
+ if (IS_DB_AUTO_COMMIT(dbp, txn)) {
+ if ((ret = __txn_begin(env, ip, NULL, &txn, 0)) != 0)
+ goto err;
+ txn_local = 1;
+ }
+ }
+
+ /* Check for consistent transaction usage. */
+ if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID,
+ mode == DB_WRITELOCK || LF_ISSET(DB_RMW) ? 0 : 1)) != 0)
+ goto err;
+
+ ret = __db_get(dbp, ip, txn, key, data, flags);
+ /*
+ * Check for master leases.
+ */
+ if (ret == 0 &&
+ IS_REP_MASTER(env) && IS_USING_LEASES(env) && !ignore_lease)
+ ret = __rep_lease_check(env, 1);
+
+err: if (txn_local &&
+ (t_ret = __db_txn_auto_resolve(env, txn, 0, ret)) && ret == 0)
+ ret = t_ret;
+
+ /* Release replication block. */
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ ENV_LEAVE(env, ip);
+ __dbt_userfree(env, key, NULL, data);
+ return (ret);
+}
+
+/*
+ * __db_get --
+ * DB->get.
+ *
+ * PUBLIC: int __db_get __P((DB *,
+ * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DBT *, DBT *, u_int32_t));
+ */
+int
+__db_get(dbp, ip, txn, key, data, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DBC *dbc;
+ u_int32_t mode;
+ int ret, t_ret;
+
+ /*
+ * The DB_CURSOR_TRANSIENT flag indicates that we're just doing a single
+ * operation with this cursor, and that in case of error we don't need
+ * to restore it to its old position. Thus, we can perform the get
+ * without duplicating the cursor, saving some cycles in this common
+ * case.
+ */
+ mode = DB_CURSOR_TRANSIENT;
+ if (LF_ISSET(DB_READ_UNCOMMITTED)) {
+ mode |= DB_READ_UNCOMMITTED;
+ LF_CLR(DB_READ_UNCOMMITTED);
+ } else if (LF_ISSET(DB_READ_COMMITTED)) {
+ mode |= DB_READ_COMMITTED;
+ LF_CLR(DB_READ_COMMITTED);
+ } else if ((flags & DB_OPFLAGS_MASK) == DB_CONSUME ||
+ (flags & DB_OPFLAGS_MASK) == DB_CONSUME_WAIT)
+ mode |= DB_WRITELOCK;
+
+ if ((ret = __db_cursor(dbp, ip, txn, &dbc, mode)) != 0)
+ return (ret);
+
+ DEBUG_LREAD(dbc, txn, "DB->get", key, NULL, flags);
+
+ /*
+ * The semantics of bulk gets are different for DB->get vs DBC->get.
+ * Mark the cursor so the low-level bulk get routines know which
+ * behavior we want.
+ */
+ F_SET(dbc, DBC_FROM_DB_GET);
+
+ /*
+ * SET_RET_MEM indicates that if key and/or data have no DBT
+ * flags set and DB manages the returned-data memory, that memory
+ * will belong to this handle, not to the underlying cursor.
+ */
+ SET_RET_MEM(dbc, dbp);
+
+ if (LF_ISSET(~(DB_RMW | DB_MULTIPLE)) == 0)
+ LF_SET(DB_SET);
+
+#ifdef HAVE_PARTITION
+ if (F_ISSET(dbc, DBC_PARTITIONED))
+ ret = __partc_get(dbc, key, data, flags);
+ else
+#endif
+ ret = __dbc_get(dbc, key, data, flags);
+
+ if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __db_get_arg --
+ * DB->get argument checking, used by both DB->get and DB->pget.
+ */
+static int
+__db_get_arg(dbp, key, data, flags)
+ const DB *dbp;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ ENV *env;
+ int dirty, multi, ret;
+
+ env = dbp->env;
+
+ /*
+ * Check for read-modify-write validity. DB_RMW doesn't make sense
+ * with CDB cursors since if you're going to write the cursor, you
+ * had to create it with DB_WRITECURSOR. Regardless, we check for
+ * LOCKING_ON and not STD_LOCKING, as we don't want to disallow it.
+ * If this changes, confirm that DB does not itself set the DB_RMW
+ * flag in a path where CDB may have been configured.
+ */
+ dirty = 0;
+ if (LF_ISSET(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW)) {
+ if (!LOCKING_ON(env))
+ return (__db_fnl(env, "DB->get"));
+ if ((ret = __db_fcchk(env, "DB->get",
+ flags, DB_READ_UNCOMMITTED, DB_READ_COMMITTED)) != 0)
+ return (ret);
+ if (LF_ISSET(DB_READ_COMMITTED | DB_READ_UNCOMMITTED))
+ dirty = 1;
+ LF_CLR(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW);
+ }
+
+ multi = 0;
+ if (LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY)) {
+ if (LF_ISSET(DB_MULTIPLE_KEY))
+ goto multi_err;
+ multi = LF_ISSET(DB_MULTIPLE) ? 1 : 0;
+ LF_CLR(DB_MULTIPLE);
+ }
+
+ /* Check for invalid function flags. */
+ switch (flags) {
+ case DB_GET_BOTH:
+ if ((ret = __dbt_usercopy(env, data)) != 0)
+ return (ret);
+ /* FALLTHROUGH */
+ case 0:
+ if ((ret = __dbt_usercopy(env, key)) != 0) {
+ __dbt_userfree(env, key, NULL, data);
+ return (ret);
+ }
+ break;
+ case DB_SET_RECNO:
+ if (!F_ISSET(dbp, DB_AM_RECNUM))
+ goto err;
+ if ((ret = __dbt_usercopy(env, key)) != 0)
+ return (ret);
+ break;
+ case DB_CONSUME:
+ case DB_CONSUME_WAIT:
+ if (dirty) {
+ __db_errx(env, DB_STR_A("0583",
+ "%s is not supported with DB_CONSUME or DB_CONSUME_WAIT",
+ "%s"), LF_ISSET(DB_READ_UNCOMMITTED) ?
+ "DB_READ_UNCOMMITTED" : "DB_READ_COMMITTED");
+ return (EINVAL);
+ }
+ if (multi)
+multi_err: return (__db_ferr(env, "DB->get", 1));
+ if (dbp->type == DB_QUEUE)
+ break;
+ /* FALLTHROUGH */
+ default:
+err: return (__db_ferr(env, "DB->get", 0));
+ }
+
+ /*
+ * Check for invalid key/data flags.
+ */
+ if ((ret =
+ __dbt_ferr(dbp, "key", key, DB_RETURNS_A_KEY(dbp, flags))) != 0)
+ return (ret);
+
+ if (F_ISSET(data, DB_DBT_READONLY)) {
+ __db_errx(env, DB_STR("0584",
+ "DB_DBT_READONLY should not be set on data DBT."));
+ return (EINVAL);
+ }
+ if ((ret = __dbt_ferr(dbp, "data", data, 1)) != 0)
+ return (ret);
+
+ if (multi) {
+ if (!F_ISSET(data, DB_DBT_USERMEM)) {
+ __db_errx(env, DB_STR("0585",
+ "DB_MULTIPLE requires DB_DBT_USERMEM be set"));
+ return (EINVAL);
+ }
+ if (F_ISSET(key, DB_DBT_PARTIAL) ||
+ F_ISSET(data, DB_DBT_PARTIAL)) {
+ __db_errx(env, DB_STR("0586",
+ "DB_MULTIPLE does not support DB_DBT_PARTIAL"));
+ return (EINVAL);
+ }
+ if (data->ulen < 1024 ||
+ data->ulen < dbp->pgsize || data->ulen % 1024 != 0) {
+ __db_errx(env, DB_STR("0587",
+ "DB_MULTIPLE buffers must be aligned, "
+ "at least page size and multiples of 1KB"));
+ return (EINVAL);
+ }
+ }
+
+ /* Check invalid partial key. */
+	if (F_ISSET(key, DB_DBT_PARTIAL) && !(LF_ISSET(DB_CONSUME) ||
+	    LF_ISSET(DB_CONSUME_WAIT) || LF_ISSET(DB_SET_RECNO))) {
+ __db_errx(env, DB_STR("0708",
+ "Invalid positioning flag combined with DB_DBT_PARTIAL"));
+ return (EINVAL);
+ }
+
+ return (0);
+}
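+
+/*
+ * A worked example of the DB_MULTIPLE buffer rule above (illustrative
+ * sizes): with a 4096-byte page, ulen values of 4096, 8192 or 65536
+ * pass, while 1000 (under 1KB), 2048 (smaller than the page) and
+ * 5000 (not a multiple of 1024) are each rejected with EINVAL.
+ */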
+
+/*
+ * __db_join_pp --
+ * DB->join pre/post processing.
+ *
+ * PUBLIC: int __db_join_pp __P((DB *, DBC **, DBC **, u_int32_t));
+ */
+int
+__db_join_pp(primary, curslist, dbcp, flags)
+ DB *primary;
+ DBC **curslist, **dbcp;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ret, t_ret;
+
+ env = primary->env;
+
+ ENV_ENTER(env, ip);
+
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check && (ret = __db_rep_enter(
+ primary, 1, 0, IS_REAL_TXN(curslist[0]->txn))) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ if ((ret = __db_join_arg(primary, curslist, flags)) == 0)
+ ret = __db_join(primary, curslist, dbcp, flags);
+
+ /* Release replication block. */
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+err: ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __db_join_arg --
+ * Check DB->join arguments.
+ */
+static int
+__db_join_arg(primary, curslist, flags)
+ DB *primary;
+ DBC **curslist;
+ u_int32_t flags;
+{
+ DB_TXN *txn;
+ ENV *env;
+ int i;
+
+ env = primary->env;
+
+ switch (flags) {
+ case 0:
+ case DB_JOIN_NOSORT:
+ break;
+ default:
+ return (__db_ferr(env, "DB->join", 0));
+ }
+
+ if (curslist == NULL || curslist[0] == NULL) {
+ __db_errx(env, DB_STR("0588",
+ "At least one secondary cursor must be specified to DB->join"));
+ return (EINVAL);
+ }
+
+ txn = curslist[0]->txn;
+ for (i = 1; curslist[i] != NULL; i++)
+ if (curslist[i]->txn != txn) {
+ __db_errx(env, DB_STR("0589",
+ "All secondary cursors must share the same transaction"));
+ return (EINVAL);
+ }
+
+ return (0);
+}
+
+/*
+ * __db_key_range_pp --
+ * DB->key_range pre/post processing.
+ *
+ * PUBLIC: int __db_key_range_pp
+ * PUBLIC: __P((DB *, DB_TXN *, DBT *, DB_KEY_RANGE *, u_int32_t));
+ */
+int
+__db_key_range_pp(dbp, txn, key, kr, flags)
+ DB *dbp;
+ DB_TXN *txn;
+ DBT *key;
+ DB_KEY_RANGE *kr;
+ u_int32_t flags;
+{
+ DBC *dbc;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ret, t_ret;
+
+ env = dbp->env;
+
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->key_range");
+
+ /*
+ * !!!
+ * The actual argument checking is simple, do it inline, outside of
+ * the replication block.
+ */
+ if (flags != 0)
+ return (__db_ferr(env, "DB->key_range", 0));
+
+ ENV_ENTER(env, ip);
+ XA_CHECK_TXN(ip, txn);
+
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check &&
+ (ret = __db_rep_enter(dbp, 1, 0, IS_REAL_TXN(txn))) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ /* Check for consistent transaction usage. */
+ if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 1)) != 0)
+ goto err;
+
+ /*
+ * !!!
+ * The actual method call is simple, do it inline.
+ */
+ switch (dbp->type) {
+ case DB_BTREE:
+ if ((ret = __dbt_usercopy(env, key)) != 0)
+ goto err;
+
+ /* Acquire a cursor. */
+ if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0) {
+ __dbt_userfree(env, key, NULL, NULL);
+ break;
+ }
+
+ DEBUG_LWRITE(dbc, NULL, "bam_key_range", NULL, NULL, 0);
+#ifdef HAVE_PARTITION
+ if (DB_IS_PARTITIONED(dbp))
+ ret = __part_key_range(dbc, key, kr, flags);
+ else
+#endif
+ ret = __bam_key_range(dbc, key, kr, flags);
+
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ __dbt_userfree(env, key, NULL, NULL);
+ break;
+ case DB_HASH:
+ case DB_QUEUE:
+ case DB_RECNO:
+ ret = __dbh_am_chk(dbp, DB_OK_BTREE);
+ break;
+ case DB_UNKNOWN:
+ default:
+ ret = __db_unknown_type(env, "DB->key_range", dbp->type);
+ break;
+ }
+
+err: /* Release replication block. */
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __db_open_pp --
+ * DB->open pre/post processing.
+ *
+ * PUBLIC: int __db_open_pp __P((DB *, DB_TXN *,
+ * PUBLIC: const char *, const char *, DBTYPE, u_int32_t, int));
+ */
+int
+__db_open_pp(dbp, txn, fname, dname, type, flags, mode)
+ DB *dbp;
+ DB_TXN *txn;
+ const char *fname, *dname;
+ DBTYPE type;
+ u_int32_t flags;
+ int mode;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, nosync, remove_me, ret, t_ret, txn_local;
+
+ env = dbp->env;
+ nosync = 1;
+ handle_check = remove_me = txn_local = 0;
+
+ ENV_ENTER(env, ip);
+
+ /*
+ * Save the flags. We do this here because we don't pass all of the
+	 * flags down into the actual DB->open method call; we strip
+ * DB_AUTO_COMMIT at this layer.
+ */
+ dbp->open_flags = flags;
+
+ /* Save the current DB handle flags for refresh. */
+ dbp->orig_flags = dbp->flags;
+
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check &&
+ (ret = __db_rep_enter(dbp, 1, 0, IS_REAL_TXN(txn))) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ /*
+ * A replication client can't create a database, but it's convenient to
+ * allow a repmgr application to specify DB_CREATE anyway. Thus for
+ * such an application the meaning of DB_CREATE becomes "create it if
+ * I'm a master, and otherwise ignore the flag". A repmgr application
+ * running as master can't be sure that it won't spontaneously become a
+ * client, so there's a race condition.
+ */
+ if (IS_REP_CLIENT(env) && !F_ISSET(dbp, DB_AM_NOT_DURABLE))
+ LF_CLR(DB_CREATE);
+
+ /*
+ * Create local transaction as necessary, check for consistent
+ * transaction usage.
+ */
+ if (IS_ENV_AUTO_COMMIT(env, txn, flags)) {
+ if ((ret = __db_txn_auto_init(env, ip, &txn)) != 0)
+ goto err;
+ txn_local = 1;
+ } else if (txn != NULL && !TXN_ON(env) &&
+ (!CDB_LOCKING(env) || !F_ISSET(txn, TXN_FAMILY))) {
+ ret = __db_not_txn_env(env);
+ goto err;
+ }
+ LF_CLR(DB_AUTO_COMMIT);
+
+ /*
+ * We check arguments after possibly creating a local transaction,
+ * which is unusual -- the reason is some flags are illegal if any
+ * kind of transaction is in effect.
+ */
+ if ((ret = __db_open_arg(dbp, txn, fname, dname, type, flags)) == 0)
+ if ((ret = __db_open(dbp, ip, txn, fname, dname, type,
+ flags, mode, PGNO_BASE_MD)) != 0)
+ goto txnerr;
+
+ /*
+ * You can open the database that describes the subdatabases in the
+ * rest of the file read-only. The content of each key's data is
+ * unspecified and applications should never be adding new records
+ * or updating existing records. However, during recovery, we need
+ * to open these databases R/W so we can redo/undo changes in them.
+ * Likewise, we need to open master databases read/write during
+ * rename and remove so we can be sure they're fully sync'ed, so
+ * we provide an override flag for the purpose.
+ */
+ if (dname == NULL && !IS_RECOVERING(env) && !LF_ISSET(DB_RDONLY) &&
+ !LF_ISSET(DB_RDWRMASTER) && F_ISSET(dbp, DB_AM_SUBDB)) {
+ __db_errx(env, DB_STR("0590",
+ "files containing multiple databases may only be opened read-only"));
+ ret = EINVAL;
+ goto txnerr;
+ }
+
+ /*
+ * Success: file creations have to be synchronous, otherwise we don't
+ * care.
+ */
+ if (F_ISSET(dbp, DB_AM_CREATED | DB_AM_CREATED_MSTR))
+ nosync = 0;
+
+ /* Success: don't discard the file on close. */
+ F_CLR(dbp, DB_AM_DISCARD | DB_AM_CREATED | DB_AM_CREATED_MSTR);
+
+ /*
+ * If not transactional, remove the database/subdatabase if it is
+ * persistent. If we're transactional, the child transaction abort
+ * cleans up.
+ */
+txnerr: if (ret != 0 && !IS_REAL_TXN(txn)) {
+ remove_me = (F_ISSET(dbp, DB_AM_CREATED) &&
+ (fname != NULL || dname != NULL)) ? 1 : 0;
+ if (F_ISSET(dbp, DB_AM_CREATED_MSTR) ||
+ (dname == NULL && remove_me))
+ /* Remove file. */
+ (void)__db_remove_int(dbp,
+ ip, txn, fname, NULL, DB_FORCE);
+ else if (remove_me)
+ /* Remove subdatabase. */
+ (void)__db_remove_int(dbp,
+ ip, txn, fname, dname, DB_FORCE);
+ }
+
+ if (txn_local && (t_ret =
+ __db_txn_auto_resolve(env, txn, nosync, ret)) && ret == 0)
+ ret = t_ret;
+
+err: /* Release replication block. */
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
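+
+/*
+ * An illustrative sketch, not part of the library source: the pre/post
+ * processing above guards an application-level open such as the
+ * following, where dbenv, dbp and the file name are hypothetical.
+ *
+ *     DB *dbp;
+ *     int ret;
+ *
+ *     if ((ret = db_create(&dbp, dbenv, 0)) != 0)
+ *         return (ret);
+ *     ret = dbp->open(dbp, NULL,
+ *         "access.db", NULL, DB_BTREE, DB_CREATE | DB_AUTO_COMMIT, 0644);
+ *
+ * With DB_AUTO_COMMIT and no transaction handle, the open runs inside
+ * a local transaction created by __db_txn_auto_init and resolved by
+ * __db_txn_auto_resolve, as coded above.
+ */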
+
+/*
+ * __db_open_arg --
+ * Check DB->open arguments.
+ */
+static int
+__db_open_arg(dbp, txn, fname, dname, type, flags)
+ DB *dbp;
+ DB_TXN *txn;
+ const char *fname, *dname;
+ DBTYPE type;
+ u_int32_t flags;
+{
+ ENV *env;
+ u_int32_t ok_flags;
+ int ret;
+
+ env = dbp->env;
+
+ /* Validate arguments. */
+#undef OKFLAGS
+#define OKFLAGS \
+ (DB_AUTO_COMMIT | DB_CREATE | DB_EXCL | DB_FCNTL_LOCKING | \
+ DB_MULTIVERSION | DB_NOMMAP | DB_NO_AUTO_COMMIT | DB_RDONLY | \
+ DB_RDWRMASTER | DB_READ_UNCOMMITTED | DB_THREAD | DB_TRUNCATE)
+ if ((ret = __db_fchk(env, "DB->open", flags, OKFLAGS)) != 0)
+ return (ret);
+ if (LF_ISSET(DB_EXCL) && !LF_ISSET(DB_CREATE))
+ return (__db_ferr(env, "DB->open", 1));
+ if (LF_ISSET(DB_RDONLY) && LF_ISSET(DB_CREATE))
+ return (__db_ferr(env, "DB->open", 1));
+
+#ifdef HAVE_VXWORKS
+ if (LF_ISSET(DB_TRUNCATE)) {
+ __db_errx(env, DB_STR("0591",
+ "DB_TRUNCATE not supported on VxWorks"));
+ return (DB_OPNOTSUP);
+ }
+#endif
+ switch (type) {
+ case DB_UNKNOWN:
+ if (LF_ISSET(DB_CREATE|DB_TRUNCATE)) {
+ __db_errx(env, DB_STR("0592",
+ "DB_UNKNOWN type specified with DB_CREATE or DB_TRUNCATE"));
+ return (EINVAL);
+ }
+ ok_flags = 0;
+ break;
+ case DB_BTREE:
+ ok_flags = DB_OK_BTREE;
+ break;
+ case DB_HASH:
+#ifndef HAVE_HASH
+ return (__db_no_hash_am(env));
+#endif
+ ok_flags = DB_OK_HASH;
+ break;
+ case DB_HEAP:
+ ok_flags = DB_OK_HEAP;
+ break;
+ case DB_QUEUE:
+#ifndef HAVE_QUEUE
+ return (__db_no_queue_am(env));
+#endif
+ ok_flags = DB_OK_QUEUE;
+ break;
+ case DB_RECNO:
+ ok_flags = DB_OK_RECNO;
+ break;
+ default:
+ __db_errx(env, DB_STR_A("0593",
+ "unknown type: %lu", "%lu"), (u_long)type);
+ return (EINVAL);
+ }
+ if (ok_flags)
+ DB_ILLEGAL_METHOD(dbp, ok_flags);
+
+ /* The environment may have been created, but never opened. */
+ if (!F_ISSET(env, ENV_DBLOCAL | ENV_OPEN_CALLED)) {
+ __db_errx(env, DB_STR("0594",
+ "database environment not yet opened"));
+ return (EINVAL);
+ }
+
+ /*
+ * Historically, you could pass in an environment that didn't have a
+ * mpool, and DB would create a private one behind the scenes. This
+ * no longer works.
+ */
+ if (!F_ISSET(env, ENV_DBLOCAL) && !MPOOL_ON(env)) {
+ __db_errx(env, DB_STR("0595",
+ "environment did not include a memory pool"));
+ return (EINVAL);
+ }
+
+ /*
+ * You can't specify threads during DB->open if subsystems in the
+ * environment weren't configured with them.
+ */
+ if (LF_ISSET(DB_THREAD) && !F_ISSET(env, ENV_DBLOCAL | ENV_THREAD)) {
+ __db_errx(env, DB_STR("0596",
+ "environment not created using DB_THREAD"));
+ return (EINVAL);
+ }
+
+ /* Exclusive database handles cannot be threaded. */
+ if (LF_ISSET(DB_THREAD) && F2_ISSET(dbp, DB2_AM_EXCL)) {
+ __db_errx(env, DB_STR("0744",
+ "Exclusive database handles cannot be threaded."));
+ return (EINVAL);
+ }
+
+ /* Exclusive database handles require transactional environments. */
+ if (F2_ISSET(dbp, DB2_AM_EXCL) && !TXN_ON(env)) {
+ __db_errx(env, DB_STR("0745",
+ "Exclusive database handles require transactional environments."));
+ return (EINVAL);
+ }
+
+ /* Replication clients cannot open exclusive database handles. */
+ if (F2_ISSET(dbp, DB2_AM_EXCL) && IS_REP_CLIENT(env)) {
+ __db_errx(env, DB_STR("0746",
+"Exclusive database handles cannot be opened on replication clients."));
+ return (EINVAL);
+ }
+
+ /* DB_MULTIVERSION requires a database configured for transactions. */
+ if (LF_ISSET(DB_MULTIVERSION) && !IS_REAL_TXN(txn)) {
+ __db_errx(env, DB_STR("0597",
+ "DB_MULTIVERSION illegal without a transaction specified"));
+ return (EINVAL);
+ }
+
+ if (LF_ISSET(DB_MULTIVERSION) && type == DB_QUEUE) {
+ __db_errx(env, DB_STR("0598",
+ "DB_MULTIVERSION illegal with queue databases"));
+ return (EINVAL);
+ }
+
+ /* DB_TRUNCATE is neither transaction recoverable nor lockable. */
+ if (LF_ISSET(DB_TRUNCATE) && (LOCKING_ON(env) || txn != NULL)) {
+ __db_errx(env, DB_STR_A("0599",
+ "DB_TRUNCATE illegal with %s specified", "%s"),
+ LOCKING_ON(env) ? "locking" : "transactions");
+ return (EINVAL);
+ }
+
+ /* Subdatabase checks. */
+ if (dname != NULL) {
+ /* QAM can only be done on in-memory subdatabases. */
+ if (type == DB_QUEUE && fname != NULL) {
+ __db_errx(env, DB_STR("0600",
+ "Queue databases must be one-per-file"));
+ return (EINVAL);
+ }
+
+ /*
+ * Named in-memory databases can't support certain flags,
+ * so check here.
+ */
+ if (fname == NULL)
+ F_CLR(dbp, DB_AM_CHKSUM | DB_AM_ENCRYPT);
+ }
+
+ return (0);
+}
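+
+/*
+ * An illustrative sketch, not part of the library source: two of the
+ * flag combinations __db_open_arg rejects with EINVAL, using a
+ * hypothetical handle dbp.
+ *
+ *     DB_EXCL without DB_CREATE:
+ *         ret = dbp->open(dbp, NULL,
+ *             "a.db", NULL, DB_BTREE, DB_EXCL, 0644);
+ *     DB_RDONLY combined with DB_CREATE:
+ *         ret = dbp->open(dbp, NULL,
+ *             "a.db", NULL, DB_BTREE, DB_RDONLY | DB_CREATE, 0644);
+ */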
+
+/*
+ * __db_pget_pp --
+ * DB->pget pre/post processing.
+ *
+ * PUBLIC: int __db_pget_pp
+ * PUBLIC: __P((DB *, DB_TXN *, DBT *, DBT *, DBT *, u_int32_t));
+ */
+int
+__db_pget_pp(dbp, txn, skey, pkey, data, flags)
+ DB *dbp;
+ DB_TXN *txn;
+ DBT *skey, *pkey, *data;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ignore_lease, ret, t_ret;
+
+ env = dbp->env;
+
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->pget");
+
+ ignore_lease = LF_ISSET(DB_IGNORE_LEASE) ? 1 : 0;
+ LF_CLR(DB_IGNORE_LEASE);
+
+ if ((ret = __db_pget_arg(dbp, pkey, flags)) != 0 ||
+ (ret = __db_get_arg(dbp, skey, data, flags)) != 0) {
+ __dbt_userfree(env, skey, pkey, data);
+ return (ret);
+ }
+
+ ENV_ENTER(env, ip);
+ XA_CHECK_TXN(ip, txn);
+
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check &&
+ (ret = __db_rep_enter(dbp, 1, 0, IS_REAL_TXN(txn))) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ ret = __db_pget(dbp, ip, txn, skey, pkey, data, flags);
+ /*
+ * Check for master leases.
+ */
+ if (ret == 0 &&
+ IS_REP_MASTER(env) && IS_USING_LEASES(env) && !ignore_lease)
+ ret = __rep_lease_check(env, 1);
+
+err: /* Release replication block. */
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ ENV_LEAVE(env, ip);
+ __dbt_userfree(env, skey, pkey, data);
+ return (ret);
+}
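+
+/*
+ * An illustrative sketch, not part of the library source: DB->pget on
+ * a secondary index, with hypothetical handles and key contents. The
+ * secondary key is looked up in sdbp and the matching primary
+ * key/data pair is returned.
+ *
+ *     DBT skey, pkey, data;
+ *
+ *     memset(&skey, 0, sizeof(skey));
+ *     memset(&pkey, 0, sizeof(pkey));
+ *     memset(&data, 0, sizeof(data));
+ *     skey.data = "smith";
+ *     skey.size = 5;
+ *     ret = sdbp->pget(sdbp, NULL, &skey, &pkey, &data, 0);
+ */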
+
+/*
+ * __db_pget --
+ * DB->pget.
+ *
+ * PUBLIC: int __db_pget __P((DB *,
+ * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DBT *, DBT *, DBT *, u_int32_t));
+ */
+int
+__db_pget(dbp, ip, txn, skey, pkey, data, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ DBT *skey, *pkey, *data;
+ u_int32_t flags;
+{
+ DBC *dbc;
+ u_int32_t mode;
+ int ret, t_ret;
+
+ mode = DB_CURSOR_TRANSIENT;
+ if (LF_ISSET(DB_READ_UNCOMMITTED)) {
+ mode |= DB_READ_UNCOMMITTED;
+ LF_CLR(DB_READ_UNCOMMITTED);
+ } else if (LF_ISSET(DB_READ_COMMITTED)) {
+ mode |= DB_READ_COMMITTED;
+ LF_CLR(DB_READ_COMMITTED);
+ }
+
+ if ((ret = __db_cursor(dbp, ip, txn, &dbc, mode)) != 0)
+ return (ret);
+
+ SET_RET_MEM(dbc, dbp);
+
+ DEBUG_LREAD(dbc, txn, "__db_pget", skey, NULL, flags);
+
+ /*
+ * !!!
+ * The actual method call is simple; do it inline.
+ *
+ * The underlying cursor pget will fill in a default DBT for null
+ * pkeys, and use the cursor's returned-key memory internally to
+ * store any intermediate primary keys. However, we've just set
+ * the returned-key memory to the DB handle's key memory, which
+ * is unsafe to use if the DB handle is threaded. If the pkey
+ * argument is NULL, use the DBC-owned returned-key memory
+ * instead; it'll go away when we close the cursor before we
+ * return, but in this case that's just fine, as we're not
+ * returning the primary key.
+ */
+ if (pkey == NULL)
+ dbc->rkey = &dbc->my_rkey;
+
+ /*
+ * The cursor is just a perfectly ordinary secondary database cursor.
+ * Call its c_pget() method to do the dirty work.
+ */
+ if (flags == 0 || flags == DB_RMW)
+ flags |= DB_SET;
+
+ ret = __dbc_pget(dbc, skey, pkey, data, flags);
+
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __db_pget_arg --
+ * Check DB->pget arguments.
+ */
+static int
+__db_pget_arg(dbp, pkey, flags)
+ DB *dbp;
+ DBT *pkey;
+ u_int32_t flags;
+{
+ ENV *env;
+ int ret;
+
+ env = dbp->env;
+
+ if (!F_ISSET(dbp, DB_AM_SECONDARY)) {
+ __db_errx(env, DB_STR("0601",
+ "DB->pget may only be used on secondary indices"));
+ return (EINVAL);
+ }
+
+ if (LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY)) {
+ __db_errx(env,DB_STR("0602",
+"DB_MULTIPLE and DB_MULTIPLE_KEY may not be used on secondary indices"));
+ return (EINVAL);
+ }
+
+ /* DB_CONSUME makes no sense on a secondary index. */
+ LF_CLR(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW);
+ switch (flags) {
+ case DB_CONSUME:
+ case DB_CONSUME_WAIT:
+ return (__db_ferr(env, "DB->pget", 0));
+ default:
+ /* __db_get_arg will catch the rest. */
+ break;
+ }
+
+ /*
+ * We allow the pkey field to be NULL, so that we can make the
+ * two-DBT get calls into wrappers for the three-DBT ones.
+ */
+ if (pkey != NULL &&
+ (ret = __dbt_ferr(dbp, "primary key", pkey, 1)) != 0)
+ return (ret);
+
+ /* Check invalid partial pkey. */
+ if (pkey != NULL && F_ISSET(pkey, DB_DBT_PARTIAL)) {
+ __db_errx(env, DB_STR("0709",
+ "The primary key returned by pget can't be partial"));
+ return (EINVAL);
+ }
+
+ if (flags == DB_GET_BOTH) {
+ /* The pkey field can't be NULL if we're doing a DB_GET_BOTH. */
+ if (pkey == NULL) {
+ __db_errx(env, DB_STR("0603",
+ "DB_GET_BOTH on a secondary index requires a primary key"));
+ return (EINVAL);
+ }
+ if ((ret = __dbt_usercopy(env, pkey)) != 0)
+ return (ret);
+ }
+
+ return (0);
+}
+
+/*
+ * __db_put_pp --
+ * DB->put pre/post processing.
+ *
+ * PUBLIC: int __db_put_pp __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
+ */
+int
+__db_put_pp(dbp, txn, key, data, flags)
+ DB *dbp;
+ DB_TXN *txn;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ret, txn_local, t_ret;
+
+ env = dbp->env;
+ txn_local = 0;
+
+ STRIP_AUTO_COMMIT(flags);
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->put");
+
+ if ((ret = __db_put_arg(dbp, key, data, flags)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+ XA_CHECK_TXN(ip, txn);
+
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check &&
+ (ret = __db_rep_enter(dbp, 1, 0, IS_REAL_TXN(txn))) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ /* Create local transaction as necessary. */
+ if (IS_DB_AUTO_COMMIT(dbp, txn)) {
+ if ((ret = __txn_begin(env, ip, NULL, &txn, 0)) != 0)
+ goto err;
+ txn_local = 1;
+ }
+
+ /* Check for consistent transaction usage. */
+ if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 0)) != 0)
+ goto err;
+
+ ret = __db_put(dbp, ip, txn, key, data, flags);
+
+err: if (txn_local &&
+ (t_ret = __db_txn_auto_resolve(env, txn, 0, ret)) && ret == 0)
+ ret = t_ret;
+
+ /* Release replication block. */
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ ENV_LEAVE(env, ip);
+ __dbt_userfree(env, key, NULL, data);
+ return (ret);
+}
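+
+/*
+ * An illustrative sketch, not part of the library source: a simple
+ * DB->put guarded by the checks above, with hypothetical handle and
+ * contents. DB_NOOVERWRITE makes the put return DB_KEYEXIST instead
+ * of replacing an existing record.
+ *
+ *     DBT key, data;
+ *
+ *     memset(&key, 0, sizeof(key));
+ *     memset(&data, 0, sizeof(data));
+ *     key.data = "fruit";
+ *     key.size = 5;
+ *     data.data = "apple";
+ *     data.size = 5;
+ *     ret = dbp->put(dbp, NULL, &key, &data, DB_NOOVERWRITE);
+ */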
+
+/*
+ * __db_put_arg --
+ * Check DB->put arguments.
+ */
+static int
+__db_put_arg(dbp, key, data, flags)
+ DB *dbp;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ ENV *env;
+ int ret, returnkey;
+
+ env = dbp->env;
+ returnkey = 0;
+
+ /* Check for changes to a read-only tree. */
+ if (DB_IS_READONLY(dbp))
+ return (__db_rdonly(env, "DB->put"));
+
+ /* Check for puts on a secondary. */
+ if (F_ISSET(dbp, DB_AM_SECONDARY)) {
+ __db_errx(env, DB_STR("0604",
+ "DB->put forbidden on secondary indices"));
+ return (EINVAL);
+ }
+
+ if (LF_ISSET(DB_MULTIPLE_KEY | DB_MULTIPLE)) {
+ if (LF_ISSET(DB_MULTIPLE) && LF_ISSET(DB_MULTIPLE_KEY))
+ goto err;
+
+ switch (LF_ISSET(DB_OPFLAGS_MASK)) {
+ case 0:
+ case DB_OVERWRITE_DUP:
+ break;
+ default:
+ __db_errx(env, DB_STR("0605",
+"DB->put: DB_MULTIPLE(_KEY) can only be combined with DB_OVERWRITE_DUP"));
+ return (EINVAL);
+ }
+
+ if (!F_ISSET(key, DB_DBT_BULK)) {
+ __db_errx(env, DB_STR("0606",
+ "DB->put with DB_MULTIPLE(_KEY) requires a bulk key buffer"));
+ return (EINVAL);
+ }
+ }
+ if (LF_ISSET(DB_MULTIPLE)) {
+ if (!F_ISSET(data, DB_DBT_BULK)) {
+ __db_errx(env, DB_STR("0607",
+ "DB->put with DB_MULTIPLE requires a bulk data buffer"));
+ return (EINVAL);
+ }
+ }
+
+ /* Check for invalid function flags. */
+ switch (LF_ISSET(DB_OPFLAGS_MASK)) {
+ case 0:
+ case DB_NOOVERWRITE:
+ case DB_OVERWRITE_DUP:
+ break;
+ case DB_APPEND:
+ if (dbp->type != DB_RECNO &&
+ dbp->type != DB_QUEUE && dbp->type != DB_HEAP)
+ goto err;
+ returnkey = 1;
+ break;
+ case DB_NODUPDATA:
+ if (F_ISSET(dbp, DB_AM_DUPSORT))
+ break;
+ /* FALLTHROUGH */
+ default:
+err: return (__db_ferr(env, "DB->put", 0));
+ }
+
+ /*
+ * Check for invalid key/data flags. The key may reasonably be NULL
+ * if DB_APPEND is set and the application doesn't care about the
+ * returned key.
+ */
+ if (((returnkey && key != NULL) || !returnkey) &&
+ (ret = __dbt_ferr(dbp, "key", key, returnkey)) != 0)
+ return (ret);
+ if (!LF_ISSET(DB_MULTIPLE_KEY) &&
+ (ret = __dbt_ferr(dbp, "data", data, 0)) != 0)
+ return (ret);
+
+ /*
+ * The key parameter should not be NULL or have the "partial" flag set
+ * in a put call unless the user doesn't care about a key value we'd
+ * return. The user tells us they don't care about the returned key by
+ * setting the key parameter to NULL or configuring the key DBT to not
+ * return any information. (Returned keys from a put are always record
+ * numbers, and returning part of a record number doesn't make sense:
+ * only accept a partial return if the length returned is 0.)
+ */
+ if ((returnkey &&
+ key != NULL && F_ISSET(key, DB_DBT_PARTIAL) && key->dlen != 0) ||
+ (!returnkey && F_ISSET(key, DB_DBT_PARTIAL)))
+ return (__db_ferr(env, "key DBT", 0));
+
+ /* Check for partial puts in the presence of duplicates. */
+ if (data != NULL && F_ISSET(data, DB_DBT_PARTIAL) &&
+ (F_ISSET(dbp, DB_AM_DUP) || F_ISSET(key, DB_DBT_DUPOK))) {
+ __db_errx(env, DB_STR("0608",
+"a partial put in the presence of duplicates requires a cursor operation"));
+ return (EINVAL);
+ }
+
+ if ((flags != DB_APPEND && (ret = __dbt_usercopy(env, key)) != 0) ||
+ (!LF_ISSET(DB_MULTIPLE_KEY) &&
+ (ret = __dbt_usercopy(env, data)) != 0))
+ return (ret);
+
+ return (0);
+}
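+
+/*
+ * An illustrative sketch, not part of the library source: the bulk-key
+ * check above corresponds to an application buffer built with the
+ * DB_MULTIPLE_WRITE_INIT/DB_MULTIPLE_KEY_WRITE_NEXT macros, which mark
+ * the DBT with DB_DBT_BULK. Buffer size and contents are hypothetical;
+ * the data DBT is ignored for DB_MULTIPLE_KEY.
+ *
+ *     DBT key, data;
+ *     void *p;
+ *     char buf[64 * 1024];
+ *
+ *     memset(&key, 0, sizeof(key));
+ *     key.data = buf;
+ *     key.ulen = sizeof(buf);
+ *     key.flags = DB_DBT_USERMEM;
+ *     DB_MULTIPLE_WRITE_INIT(p, &key);
+ *     DB_MULTIPLE_KEY_WRITE_NEXT(p, &key, "k1", 2, "v1", 2);
+ *     DB_MULTIPLE_KEY_WRITE_NEXT(p, &key, "k2", 2, "v2", 2);
+ *     memset(&data, 0, sizeof(data));
+ *     ret = dbp->put(dbp, NULL, &key, &data, DB_MULTIPLE_KEY);
+ */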
+
+/*
+ * __db_compact_func --
+ * Callback routine to report if the txn has open cursors.
+ */
+static int
+__db_compact_func(dbc, my_dbc, countp, pgno, indx, args)
+ DBC *dbc, *my_dbc;
+ u_int32_t *countp;
+ db_pgno_t pgno;
+ u_int32_t indx;
+ void *args;
+{
+ DB_TXN *txn;
+
+ COMPQUIET(my_dbc, NULL);
+ COMPQUIET(countp, NULL);
+ COMPQUIET(pgno, 0);
+ COMPQUIET(indx, 0);
+
+ txn = (DB_TXN *)args;
+
+ if (txn == dbc->txn)
+ return (EEXIST);
+ return (0);
+}
+
+/*
+ * __db_compact_pp --
+ * DB->compact pre/post processing.
+ *
+ * PUBLIC: int __db_compact_pp __P((DB *, DB_TXN *,
+ * PUBLIC: DBT *, DBT *, DB_COMPACT *, u_int32_t, DBT *));
+ */
+int
+__db_compact_pp(dbp, txn, start, stop, c_data, flags, end)
+ DB *dbp;
+ DB_TXN *txn;
+ DBT *start, *stop;
+ DB_COMPACT *c_data;
+ u_int32_t flags;
+ DBT *end;
+{
+ DB_COMPACT *dp, l_data;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ret, t_ret;
+ u_int32_t count;
+
+ env = dbp->env;
+
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->compact");
+
+ /*
+ * !!!
+ * The actual argument checking is simple; do it inline, outside of
+ * the replication block.
+ */
+ if ((ret = __db_fchk(
+ env, "DB->compact", flags, DB_FREELIST_ONLY | DB_FREE_SPACE)) != 0)
+ return (ret);
+
+ /* Check for changes to a read-only database. */
+ if (DB_IS_READONLY(dbp))
+ return (__db_rdonly(env, "DB->compact"));
+
+ if (start != NULL && (ret = __dbt_usercopy(env, start)) != 0)
+ return (ret);
+ if (stop != NULL && (ret = __dbt_usercopy(env, stop)) != 0) {
+ __dbt_userfree(env, start, NULL, NULL);
+ return (ret);
+ }
+
+ ENV_ENTER(env, ip);
+ XA_CHECK_TXN(ip, txn);
+
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check && (ret = __db_rep_enter(dbp, 1, 0,
+ IS_REAL_TXN(txn))) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ if (txn != NULL) {
+ if ((ret = __db_walk_cursors(dbp,
+ NULL, __db_compact_func, &count, 0, 0, txn)) != 0) {
+ if (ret == EEXIST) {
+ __db_errx(env, DB_STR("0609",
+"DB->compact may not be called with active cursors in the transaction."));
+ ret = EINVAL;
+ }
+ goto err;
+ }
+ }
+
+ if (c_data == NULL) {
+ dp = &l_data;
+ memset(dp, 0, sizeof(*dp));
+ } else
+ dp = c_data;
+#ifdef HAVE_PARTITION
+ if (DB_IS_PARTITIONED(dbp))
+ ret = __part_compact(dbp, ip, txn, start, stop, dp, flags, end);
+ else
+#endif
+ switch (dbp->type) {
+ case DB_HASH:
+ case DB_BTREE:
+ case DB_RECNO:
+ ret = __db_compact_int(dbp, ip,
+ txn, start, stop, dp, flags, end);
+ break;
+ case DB_HEAP:
+ break;
+ default:
+ ret = __dbh_am_chk(dbp, DB_OK_BTREE);
+ break;
+ }
+
+ /* Release replication block. */
+err: if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ ENV_LEAVE(env, ip);
+ __dbt_userfree(env, start, stop, NULL);
+ return (ret);
+}
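+
+/*
+ * An illustrative sketch, not part of the library source: a whole-file
+ * compaction that returns freed pages to the filesystem, with a
+ * hypothetical handle.
+ *
+ *     DB_COMPACT c_data;
+ *
+ *     memset(&c_data, 0, sizeof(c_data));
+ *     ret = dbp->compact(dbp,
+ *         NULL, NULL, NULL, &c_data, DB_FREE_SPACE, NULL);
+ *
+ * NULL start and stop keys compact the entire database;
+ * c_data.compact_pages_truncated reports how many pages were returned.
+ */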
+
+/*
+ * __db_associate_foreign_pp --
+ * DB->associate_foreign pre/post processing.
+ *
+ * PUBLIC: int __db_associate_foreign_pp __P((DB *, DB *,
+ * PUBLIC: int (*)(DB *, const DBT *, DBT *, const DBT *, int *),
+ * PUBLIC: u_int32_t));
+ */
+int
+__db_associate_foreign_pp(fdbp, dbp, callback, flags)
+ DB *dbp, *fdbp;
+ int (*callback) __P((DB *, const DBT *, DBT *, const DBT *, int *));
+ u_int32_t flags;
+{
+ /* Most of this is based on the implementation of associate. */
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ret, t_ret;
+
+ env = dbp->env;
+
+ PANIC_CHECK(env);
+ STRIP_AUTO_COMMIT(flags);
+
+ ENV_ENTER(env, ip);
+
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check &&
+ (ret = __db_rep_enter(dbp, 1, 0, 0)) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ if ((ret = __db_associate_foreign_arg(fdbp, dbp, callback, flags)) != 0)
+ goto err;
+
+ ret = __db_associate_foreign(fdbp, dbp, callback, flags);
+
+err: /* Release replication block. */
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __db_associate_foreign_arg --
+ * DB->associate_foreign argument checking.
+ */
+static int
+__db_associate_foreign_arg(fdbp, dbp, callback, flags)
+ DB *dbp, *fdbp;
+ int (*callback) __P((DB *, const DBT *, DBT *, const DBT *, int *));
+ u_int32_t flags;
+{
+ ENV *env;
+
+ env = fdbp->env;
+
+ if (F_ISSET(fdbp, DB_AM_SECONDARY)) {
+ __db_errx(env, DB_STR("0610",
+ "Secondary indices may not be used as foreign databases"));
+ return (EINVAL);
+ }
+ if (F_ISSET(fdbp, DB_AM_DUP)) {
+ __db_errx(env, DB_STR("0611",
+ "Foreign databases may not be configured with duplicates"));
+ return (EINVAL);
+ }
+ if (F_ISSET(fdbp, DB_AM_RENUMBER)) {
+ __db_errx(env, DB_STR("0612",
+ "Renumbering recno databases may not be used as foreign databases"));
+ return (EINVAL);
+ }
+ if (!F_ISSET(dbp, DB_AM_SECONDARY)) {
+ __db_errx(env, DB_STR("0613",
+ "The associating database must be a secondary index."));
+ return (EINVAL);
+ }
+ if (LF_ISSET(DB_FOREIGN_NULLIFY) && callback == NULL) {
+ __db_errx(env, DB_STR("0614",
+ "When specifying a delete action of nullify, a callback "
+ "function needs to be configured"));
+ return (EINVAL);
+ } else if (!LF_ISSET(DB_FOREIGN_NULLIFY) && callback != NULL) {
+ __db_errx(env, DB_STR("0615",
+ "When not specifying a delete action of nullify, a "
+ "callback function cannot be configured"));
+ return (EINVAL);
+ }
+
+ return (0);
+}
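+
+/*
+ * An illustrative sketch, not part of the library source: the pairing
+ * the checks above enforce, with hypothetical handles and callback.
+ * DB_FOREIGN_NULLIFY requires a callback; the other delete actions
+ * forbid one.
+ *
+ *     ret = fdbp->associate_foreign(fdbp, sdbp,
+ *         nullify_cb, DB_FOREIGN_NULLIFY);
+ *     ret = fdbp->associate_foreign(fdbp, sdbp,
+ *         NULL, DB_FOREIGN_CASCADE);
+ */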
+
+/*
+ * __db_sync_pp --
+ * DB->sync pre/post processing.
+ *
+ * PUBLIC: int __db_sync_pp __P((DB *, u_int32_t));
+ */
+int
+__db_sync_pp(dbp, flags)
+ DB *dbp;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ret, t_ret;
+
+ env = dbp->env;
+
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->sync");
+
+ /*
+ * !!!
+ * The actual argument checking is simple; do it inline, outside of
+ * the replication block.
+ */
+ if (flags != 0)
+ return (__db_ferr(env, "DB->sync", 0));
+
+ ENV_ENTER(env, ip);
+
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check && (ret = __db_rep_enter(dbp, 1, 0, 0)) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ ret = __db_sync(dbp);
+
+ /* Release replication block. */
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+err: ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __dbc_close_pp --
+ * DBC->close pre/post processing.
+ *
+ * PUBLIC: int __dbc_close_pp __P((DBC *));
+ */
+int
+__dbc_close_pp(dbc)
+ DBC *dbc;
+{
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ DB_TXN *txn;
+ int handle_check, ret, t_ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ txn = dbc->txn;
+
+ /*
+ * If the cursor is already closed, we have a serious problem, and we
+ * assume that the cursor isn't on the active queue. Don't do any of
+ * the remaining cursor close processing.
+ */
+ if (!F_ISSET(dbc, DBC_ACTIVE)) {
+ __db_errx(env, DB_STR("0616",
+ "Closing already-closed cursor"));
+ return (EINVAL);
+ }
+
+ ENV_ENTER(env, ip);
+
+ /* Check for replication block. */
+ handle_check = !IS_REAL_TXN(dbc->txn) && IS_ENV_REPLICATED(env);
+
+ /* Unregister the cursor from its transaction, regardless of ret. */
+ if (txn != NULL) {
+ TAILQ_REMOVE(&(txn->my_cursors), dbc, txn_cursors);
+ dbc->txn_cursors.tqe_next = NULL;
+ dbc->txn_cursors.tqe_prev = NULL;
+ } else {
+ DB_ASSERT(env, dbc->txn_cursors.tqe_next == NULL &&
+ dbc->txn_cursors.tqe_prev == NULL);
+ }
+
+ ret = __dbc_close(dbc);
+
+ /* Release replication block. */
+ if (handle_check &&
+ (t_ret = __op_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __dbc_cmp_pp --
+ * DBC->cmp pre/post processing.
+ *
+ * PUBLIC: int __dbc_cmp_pp __P((DBC *, DBC *, int*, u_int32_t));
+ */
+int
+__dbc_cmp_pp(dbc, other_cursor, result, flags)
+ DBC *dbc, *other_cursor;
+ int *result;
+ u_int32_t flags;
+{
+ DB *dbp, *odbp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ if (flags != 0)
+ return (__db_ferr(env, "DBcursor->cmp", 0));
+
+ if (other_cursor == NULL) {
+ __db_errx(env, DB_STR("0617",
+ "DBcursor->cmp dbc pointer must not be null"));
+ return (EINVAL);
+ }
+
+ /* Dereference other_cursor only after the NULL check above. */
+ odbp = other_cursor->dbp;
+
+ if (dbp != odbp) {
+ __db_errx(env, DB_STR("0618",
+"DBcursor->cmp both cursors must refer to the same database."));
+ return (EINVAL);
+ }
+
+ ENV_ENTER(env, ip);
+ ret = __dbc_cmp(dbc, other_cursor, result);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
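+
+/*
+ * An illustrative sketch, not part of the library source: comparing
+ * two cursors over the same database, with hypothetical handles.
+ * result is set to 0 when both cursors reference the same item.
+ *
+ *     int result;
+ *
+ *     ret = dbc->cmp(dbc, other_dbc, &result, 0);
+ *     if (ret == 0 && result == 0)
+ *         ...both cursors are positioned on the same item...
+ */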
+
+/*
+ * __dbc_count_pp --
+ * DBC->count pre/post processing.
+ *
+ * PUBLIC: int __dbc_count_pp __P((DBC *, db_recno_t *, u_int32_t));
+ */
+int
+__dbc_count_pp(dbc, recnop, flags)
+ DBC *dbc;
+ db_recno_t *recnop;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ /*
+ * !!!
+ * The actual argument checking is simple; do it inline, outside of
+ * the replication block.
+ *
+ * The cursor must be initialized; return EINVAL for an invalid cursor.
+ */
+ if (flags != 0)
+ return (__db_ferr(env, "DBcursor->count", 0));
+
+ if (!IS_INITIALIZED(dbc))
+ return (__db_curinval(env));
+
+ ENV_ENTER(env, ip);
+ ret = __dbc_count(dbc, recnop);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __dbc_del_pp --
+ * DBC->del pre/post processing.
+ *
+ * PUBLIC: int __dbc_del_pp __P((DBC *, u_int32_t));
+ */
+int
+__dbc_del_pp(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ if ((ret = __dbc_del_arg(dbc, flags)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+
+ /* Check for consistent transaction usage. */
+ if ((ret = __db_check_txn(dbp, dbc->txn, dbc->locker, 0)) != 0)
+ goto err;
+
+ DEBUG_LWRITE(dbc, dbc->txn, "DBcursor->del", NULL, NULL, flags);
+ ret = __dbc_del(dbc, flags);
+
+err: ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __dbc_del_arg --
+ * Check DBC->del arguments.
+ */
+static int
+__dbc_del_arg(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ DB *dbp;
+ ENV *env;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ /* Check for changes to a read-only tree. */
+ if (DB_IS_READONLY(dbp))
+ return (__db_rdonly(env, "DBcursor->del"));
+
+ /* Check for invalid function flags. */
+ switch (flags) {
+ case 0:
+ break;
+ case DB_CONSUME:
+ if (dbp->type != DB_QUEUE)
+ return (__db_ferr(env, "DBC->del", 0));
+ break;
+ case DB_UPDATE_SECONDARY:
+ DB_ASSERT(env, F_ISSET(dbp, DB_AM_SECONDARY));
+ break;
+ default:
+ return (__db_ferr(env, "DBcursor->del", 0));
+ }
+
+ /*
+ * The cursor must be initialized; return EINVAL for an invalid cursor,
+ * otherwise 0.
+ */
+ if (!IS_INITIALIZED(dbc))
+ return (__db_curinval(env));
+
+ return (0);
+}
+
+/*
+ * __dbc_dup_pp --
+ * DBC->dup pre/post processing.
+ *
+ * PUBLIC: int __dbc_dup_pp __P((DBC *, DBC **, u_int32_t));
+ */
+int
+__dbc_dup_pp(dbc, dbcp, flags)
+ DBC *dbc, **dbcp;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int rep_blocked, ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ /*
+ * !!!
+ * The actual argument checking is simple; do it inline, outside of
+ * the replication block.
+ */
+ if (flags != 0 && flags != DB_POSITION)
+ return (__db_ferr(env, "DBcursor->dup", 0));
+
+ ENV_ENTER(env, ip);
+ rep_blocked = 0;
+ if (dbc->txn == NULL && IS_ENV_REPLICATED(env)) {
+ if ((ret = __op_rep_enter(env, 1, 1)) != 0)
+ goto err;
+ rep_blocked = 1;
+ }
+ ret = __dbc_dup(dbc, dbcp, flags);
+
+ /* Register externally created cursors into the valid transaction. */
+ DB_ASSERT(env, (*dbcp)->txn == dbc->txn);
+ if ((*dbcp)->txn != NULL && ret == 0)
+ TAILQ_INSERT_HEAD(&((*dbcp)->txn->my_cursors), *dbcp,
+ txn_cursors);
+err:
+ if (ret != 0 && rep_blocked)
+ (void)__op_rep_exit(env);
+
+ ENV_LEAVE(env, ip);
+
+ return (ret);
+}
+
+/*
+ * __dbc_get_pp --
+ * DBC->get pre/post processing.
+ *
+ * PUBLIC: int __dbc_get_pp __P((DBC *, DBT *, DBT *, u_int32_t));
+ */
+int
+__dbc_get_pp(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ignore_lease, ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ ignore_lease = LF_ISSET(DB_IGNORE_LEASE) ? 1 : 0;
+ LF_CLR(DB_IGNORE_LEASE);
+ if ((ret = __dbc_get_arg(dbc, key, data, flags)) != 0) {
+ __dbt_userfree(env, key, NULL, data);
+ return (ret);
+ }
+
+ ENV_ENTER(env, ip);
+
+ DEBUG_LREAD(dbc, dbc->txn, "DBcursor->get",
+ flags == DB_SET || flags == DB_SET_RANGE ? key : NULL, NULL, flags);
+ ret = __dbc_get(dbc, key, data, flags);
+
+ /*
+ * Check for master leases.
+ */
+ if (ret == 0 &&
+ IS_REP_MASTER(env) && IS_USING_LEASES(env) && !ignore_lease)
+ ret = __rep_lease_check(env, 1);
+
+ ENV_LEAVE(env, ip);
+ __dbt_userfree(env, key, NULL, data);
+ return (ret);
+}
+
+/*
+ * __dbc_get_arg --
+ * Common DBC->get argument checking, used by both DBC->get and DBC->pget.
+ * PUBLIC: int __dbc_get_arg __P((DBC *, DBT *, DBT *, u_int32_t));
+ */
+int
+__dbc_get_arg(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DB *dbp;
+ ENV *env;
+ int dirty, multi, ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ /*
+ * Typically in checking routines that modify the flags, we have
+ * to save them and restore them, because the checking routine
+ * calls the work routine. However, this is a pure-checking
+ * routine which returns to a function that calls the work routine,
+ * so it's OK that we do not save and restore the flags, even though
+ * we modify them.
+ *
+ * Check for read-modify-write validity. DB_RMW doesn't make sense
+ * with CDB cursors since if you're going to write the cursor, you
+ * had to create it with DB_WRITECURSOR. Regardless, we check for
+ * LOCKING_ON and not STD_LOCKING, as we don't want to disallow it.
+ * If this changes, confirm that DB does not itself set the DB_RMW
+ * flag in a path where CDB may have been configured.
+ */
+ dirty = 0;
+ if (LF_ISSET(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW)) {
+ if (!LOCKING_ON(env))
+ return (__db_fnl(env, "DBcursor->get"));
+ if (LF_ISSET(DB_READ_UNCOMMITTED))
+ dirty = 1;
+ LF_CLR(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW);
+ }
+
+ multi = 0;
+ if (LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY)) {
+ multi = 1;
+ if (LF_ISSET(DB_MULTIPLE) && LF_ISSET(DB_MULTIPLE_KEY))
+ goto multi_err;
+ LF_CLR(DB_MULTIPLE | DB_MULTIPLE_KEY);
+ }
+
+ /* Check for invalid function flags. */
+ switch (flags) {
+ case DB_CONSUME:
+ case DB_CONSUME_WAIT:
+ if (dirty) {
+ __db_errx(env, DB_STR("0619",
+"DB_READ_UNCOMMITTED is not supported with DB_CONSUME or DB_CONSUME_WAIT"));
+ return (EINVAL);
+ }
+ if (dbp->type != DB_QUEUE)
+ goto err;
+ break;
+ case DB_CURRENT:
+ case DB_FIRST:
+ case DB_NEXT:
+ case DB_NEXT_DUP:
+ case DB_NEXT_NODUP:
+ break;
+ case DB_LAST:
+ case DB_PREV:
+ case DB_PREV_DUP:
+ case DB_PREV_NODUP:
+ if (multi)
+multi_err: return (__db_ferr(env, "DBcursor->get", 1));
+ break;
+ case DB_GET_BOTHC:
+ if (dbp->type == DB_QUEUE)
+ goto err;
+ /* FALLTHROUGH */
+ case DB_GET_BOTH:
+ case DB_GET_BOTH_RANGE:
+ if ((ret = __dbt_usercopy(env, data)) != 0)
+ goto err;
+ /* FALLTHROUGH */
+ case DB_SET:
+ case DB_SET_RANGE:
+ if ((ret = __dbt_usercopy(env, key)) != 0)
+ goto err;
+ break;
+ case DB_GET_RECNO:
+ /*
+ * The one situation in which this might be legal with a
+ * non-RECNUM dbp is if dbp is a secondary and its primary is
+ * DB_AM_RECNUM.
+ */
+ if (!F_ISSET(dbp, DB_AM_RECNUM) &&
+ (!F_ISSET(dbp, DB_AM_SECONDARY) ||
+ !F_ISSET(dbp->s_primary, DB_AM_RECNUM)))
+ goto err;
+ break;
+ case DB_SET_RECNO:
+ if (!F_ISSET(dbp, DB_AM_RECNUM))
+ goto err;
+ if ((ret = __dbt_usercopy(env, key)) != 0)
+ goto err;
+ break;
+ default:
+err: __dbt_userfree(env, key, NULL, data);
+ return (__db_ferr(env, "DBcursor->get", 0));
+ }
+
+ /* Check for invalid key/data flags. */
+ if ((ret = __dbt_ferr(dbp, "key", key, 0)) != 0)
+ return (ret);
+ if (F_ISSET(data, DB_DBT_READONLY)) {
+ __db_errx(env, DB_STR("0620",
+ "DB_DBT_READONLY should not be set on data DBT."));
+ return (EINVAL);
+ }
+ if ((ret = __dbt_ferr(dbp, "data", data, 0)) != 0)
+ return (ret);
+
+ if (multi) {
+ if (!F_ISSET(data, DB_DBT_USERMEM)) {
+ __db_errx(env, DB_STR("0621",
+ "DB_MULTIPLE/DB_MULTIPLE_KEY require DB_DBT_USERMEM be set"));
+ return (EINVAL);
+ }
+ if (F_ISSET(key, DB_DBT_PARTIAL) ||
+ F_ISSET(data, DB_DBT_PARTIAL)) {
+ __db_errx(env, DB_STR("0622",
+ "DB_MULTIPLE/DB_MULTIPLE_KEY do not support DB_DBT_PARTIAL"));
+ return (EINVAL);
+ }
+ if (data->ulen < 1024 ||
+ data->ulen < dbp->pgsize || data->ulen % 1024 != 0) {
+ __db_errx(env, DB_STR("0623",
+ "DB_MULTIPLE/DB_MULTIPLE_KEY buffers must be "
+ "aligned, at least page size and multiples of 1KB"));
+ return (EINVAL);
+ }
+ }
+
+ /* Check compatible flags for partial key. */
+ if (F_ISSET(key, DB_DBT_PARTIAL) && (flags == DB_GET_BOTH ||
+ flags == DB_GET_BOTH_RANGE || flags == DB_SET)) {
+ __db_errx(env, DB_STR("0710",
+ "Invalid positioning flag combined with DB_DBT_PARTIAL"));
+ return (EINVAL);
+ }
+
+ /*
+ * The cursor must be initialized for DB_CURRENT, DB_GET_RECNO,
+ * DB_PREV_DUP and DB_NEXT_DUP. Return EINVAL for an invalid
+ * cursor, otherwise 0.
+ */
+ if (!IS_INITIALIZED(dbc) && (flags == DB_CURRENT ||
+ flags == DB_GET_RECNO ||
+ flags == DB_NEXT_DUP || flags == DB_PREV_DUP))
+ return (__db_curinval(env));
+
+ /* Check for consistent transaction usage. */
+ if (LF_ISSET(DB_RMW) &&
+ (ret = __db_check_txn(dbp, dbc->txn, dbc->locker, 0)) != 0)
+ return (ret);
+
+ return (0);
+}
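+
+/*
+ * An illustrative sketch, not part of the library source: a bulk-get
+ * buffer satisfying the DB_MULTIPLE checks above -- DB_DBT_USERMEM,
+ * at least the page size and a multiple of 1KB. The size is
+ * hypothetical.
+ *
+ *     DBT key, data;
+ *     char buf[256 * 1024];
+ *
+ *     memset(&key, 0, sizeof(key));
+ *     memset(&data, 0, sizeof(data));
+ *     data.data = buf;
+ *     data.ulen = sizeof(buf);
+ *     data.flags = DB_DBT_USERMEM;
+ *     ret = dbc->get(dbc, &key, &data, DB_MULTIPLE_KEY | DB_NEXT);
+ */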
+
+/*
+ * __db_secondary_close_pp --
+ * DB->close for secondaries
+ *
+ * PUBLIC: int __db_secondary_close_pp __P((DB *, u_int32_t));
+ */
+int
+__db_secondary_close_pp(dbp, flags)
+ DB *dbp;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ret, t_ret;
+
+ env = dbp->env;
+ ret = 0;
+
+ /*
+ * As a DB handle destructor, we can't fail.
+ *
+ * !!!
+ * The actual argument checking is simple; do it inline, outside of
+ * the replication block.
+ */
+ if (flags != 0 && flags != DB_NOSYNC)
+ ret = __db_ferr(env, "DB->close", 0);
+
+ ENV_ENTER(env, ip);
+
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check && (t_ret = __db_rep_enter(dbp, 0, 0, 0)) != 0) {
+ handle_check = 0;
+ if (ret == 0)
+ ret = t_ret;
+ }
+
+ if ((t_ret = __db_secondary_close(dbp, flags)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Release replication block. */
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __dbc_pget_pp --
+ * DBC->pget pre/post processing.
+ *
+ * PUBLIC: int __dbc_pget_pp __P((DBC *, DBT *, DBT *, DBT *, u_int32_t));
+ */
+int
+__dbc_pget_pp(dbc, skey, pkey, data, flags)
+ DBC *dbc;
+ DBT *skey, *pkey, *data;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ignore_lease, ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ ignore_lease = LF_ISSET(DB_IGNORE_LEASE) ? 1 : 0;
+ LF_CLR(DB_IGNORE_LEASE);
+ if ((ret = __dbc_pget_arg(dbc, pkey, flags)) != 0 ||
+ (ret = __dbc_get_arg(dbc, skey, data, flags)) != 0) {
+ __dbt_userfree(env, skey, pkey, data);
+ return (ret);
+ }
+
+ ENV_ENTER(env, ip);
+ DEBUG_LREAD(dbc, dbc->txn, "DBcursor->pget",
+ flags == DB_SET ||
+ flags == DB_SET_RANGE ? skey : NULL, NULL, flags);
+ ret = __dbc_pget(dbc, skey, pkey, data, flags);
+ /*
+ * Check for master leases.
+ */
+ if (ret == 0 &&
+ IS_REP_MASTER(env) && IS_USING_LEASES(env) && !ignore_lease)
+ ret = __rep_lease_check(env, 1);
+
+ ENV_LEAVE(env, ip);
+
+ __dbt_userfree(env, skey, pkey, data);
+ return (ret);
+}
+
+/*
+ * __dbc_pget_arg --
+ * Check DBC->pget arguments.
+ */
+static int
+__dbc_pget_arg(dbc, pkey, flags)
+ DBC *dbc;
+ DBT *pkey;
+ u_int32_t flags;
+{
+ DB *dbp;
+ ENV *env;
+ int ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ if (!F_ISSET(dbp, DB_AM_SECONDARY)) {
+ __db_errx(env, DB_STR("0624",
+ "DBcursor->pget may only be used on secondary indices"));
+ return (EINVAL);
+ }
+
+ if (LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY)) {
+ __db_errx(env, DB_STR("0625",
+ "DB_MULTIPLE and DB_MULTIPLE_KEY may not be used on secondary indices"));
+ return (EINVAL);
+ }
+
+ switch (LF_ISSET(DB_OPFLAGS_MASK)) {
+ case DB_CONSUME:
+ case DB_CONSUME_WAIT:
+ /* These flags make no sense on a secondary index. */
+ return (__db_ferr(env, "DBcursor->pget", 0));
+ case DB_GET_BOTH:
+ case DB_GET_BOTH_RANGE:
+ /* BOTH is "get both the primary and the secondary". */
+ if (pkey == NULL) {
+ __db_errx(env, DB_STR_A("0626",
+ "%s requires both a secondary and a primary key",
+ "%s"), LF_ISSET(DB_GET_BOTH) ?
+ "DB_GET_BOTH" : "DB_GET_BOTH_RANGE");
+ return (EINVAL);
+ }
+ if ((ret = __dbt_usercopy(env, pkey)) != 0)
+ return (ret);
+ break;
+ default:
+ /* __dbc_get_arg will catch the rest. */
+ break;
+ }
+
+ /*
+ * We allow the pkey field to be NULL, so that we can make the
+ * two-DBT get calls into wrappers for the three-DBT ones.
+ */
+ if (pkey != NULL &&
+ (ret = __dbt_ferr(dbp, "primary key", pkey, 0)) != 0)
+ return (ret);
+
+ /* Check invalid partial pkey. */
+ if (pkey != NULL && F_ISSET(pkey, DB_DBT_PARTIAL)) {
+ __db_errx(env, DB_STR("0711",
+ "The primary key returned by pget can't be partial."));
+ return (EINVAL);
+ }
+
+ /* But the pkey field can't be NULL if we're doing a DB_GET_BOTH. */
+ if (pkey == NULL && (flags & DB_OPFLAGS_MASK) == DB_GET_BOTH) {
+ __db_errx(env, DB_STR("0627",
+ "DB_GET_BOTH on a secondary index requires a primary key"));
+ return (EINVAL);
+ }
+
+ return (0);
+}
+
+/*
+ * __dbc_put_pp --
+ * DBC->put pre/post processing.
+ *
+ * PUBLIC: int __dbc_put_pp __P((DBC *, DBT *, DBT *, u_int32_t));
+ */
+int
+__dbc_put_pp(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ if ((ret = __dbc_put_arg(dbc, key, data, flags)) != 0) {
+ __dbt_userfree(env, key, NULL, data);
+ return (ret);
+ }
+
+ ENV_ENTER(env, ip);
+
+ /* Check for consistent transaction usage. */
+ if ((ret = __db_check_txn(dbp, dbc->txn, dbc->locker, 0)) != 0)
+ goto err;
+
+ DEBUG_LWRITE(dbc, dbc->txn, "DBcursor->put",
+ flags == DB_KEYFIRST || flags == DB_KEYLAST ||
+ flags == DB_NODUPDATA || flags == DB_UPDATE_SECONDARY ?
+ key : NULL, data, flags);
+ ret = __dbc_put(dbc, key, data, flags);
+
+err: ENV_LEAVE(env, ip);
+ __dbt_userfree(env, key, NULL, data);
+ return (ret);
+}
+
+/*
+ * __dbc_put_arg --
+ * Check DBC->put arguments.
+ */
+static int
+__dbc_put_arg(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DB *dbp;
+ ENV *env;
+ int key_flags, ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ key_flags = 0;
+
+ /* Check for changes to a read-only tree. */
+ if (DB_IS_READONLY(dbp))
+ return (__db_rdonly(env, "DBcursor->put"));
+
+ /* Check for puts on a secondary. */
+ if (F_ISSET(dbp, DB_AM_SECONDARY)) {
+ if (flags == DB_UPDATE_SECONDARY)
+ flags = 0;
+ else {
+ __db_errx(env, DB_STR("0628",
+ "DBcursor->put forbidden on secondary indices"));
+ return (EINVAL);
+ }
+ }
+
+ if ((ret = __dbt_usercopy(env, data)) != 0)
+ return (ret);
+
+ /* Check for invalid function flags. */
+ switch (flags) {
+ case DB_AFTER:
+ case DB_BEFORE:
+ switch (dbp->type) {
+ case DB_BTREE:
+ case DB_HASH: /* Only with unsorted duplicates. */
+ if (!F_ISSET(dbp, DB_AM_DUP))
+ goto err;
+ if (dbp->dup_compare != NULL)
+ goto err;
+ break;
+ case DB_QUEUE: /* Not permitted. */
+ goto err;
+ case DB_RECNO: /* Only with mutable record numbers. */
+ if (!F_ISSET(dbp, DB_AM_RENUMBER))
+ goto err;
+ key_flags = key == NULL ? 0 : 1;
+ break;
+ case DB_UNKNOWN:
+ default:
+ goto err;
+ }
+ break;
+ case DB_CURRENT:
+ /*
+ * If there is a comparison function, doing a DB_CURRENT
+ * must not change the part of the data item that is used
+ * for the comparison.
+ */
+ break;
+ case DB_NODUPDATA:
+ if (!F_ISSET(dbp, DB_AM_DUPSORT))
+ goto err;
+ /* FALLTHROUGH */
+ case DB_KEYFIRST:
+ case DB_KEYLAST:
+ case DB_OVERWRITE_DUP:
+ key_flags = 1;
+ if ((ret = __dbt_usercopy(env, key)) != 0)
+ return (ret);
+ break;
+ default:
+err: return (__db_ferr(env, "DBcursor->put", 0));
+ }
+
+ /*
+ * Check for invalid key/data flags. The key may reasonably be NULL
+ * if DB_AFTER or DB_BEFORE is set and the application doesn't care
+ * about the returned key, or if the DB_CURRENT flag is set.
+ */
+ if (key_flags && (ret = __dbt_ferr(dbp, "key", key, 0)) != 0)
+ return (ret);
+ if ((ret = __dbt_ferr(dbp, "data", data, 0)) != 0)
+ return (ret);
+
+ /*
+ * The key parameter should not be NULL or have the "partial" flag set
+ * in a put call unless the user doesn't care about a key value we'd
+ * return. The user tells us they don't care about the returned key by
+ * setting the key parameter to NULL or configuring the key DBT to not
+ * return any information. (Returned keys from a put are always record
+ * numbers, and returning part of a record number doesn't make sense:
+ * only accept a partial return if the length returned is 0.)
+ */
+ if (key_flags && F_ISSET(key, DB_DBT_PARTIAL) && key->dlen != 0)
+ return (__db_ferr(env, "key DBT", 0));
+
+ /*
+ * The cursor must be initialized for anything other than DB_KEYFIRST,
+ * DB_KEYLAST or zero: return EINVAL for an invalid cursor, otherwise 0.
+ */
+ if (!IS_INITIALIZED(dbc) && flags != 0 && flags != DB_KEYFIRST &&
+ flags != DB_KEYLAST && flags != DB_NODUPDATA &&
+ flags != DB_OVERWRITE_DUP)
+ return (__db_curinval(env));
+
+ return (0);
+}
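+
+/*
+ * An illustrative sketch, not part of the library source: DB_CURRENT
+ * requires an initialized cursor, so this hypothetical snippet
+ * positions with DB_SET before overwriting the record in place.
+ *
+ *     DBT key, data;
+ *
+ *     memset(&key, 0, sizeof(key));
+ *     memset(&data, 0, sizeof(data));
+ *     key.data = "fruit";
+ *     key.size = 5;
+ *     if ((ret = dbc->get(dbc, &key, &data, DB_SET)) == 0) {
+ *         data.data = "pear";
+ *         data.size = 4;
+ *         ret = dbc->put(dbc, &key, &data, DB_CURRENT);
+ *     }
+ */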
+
+/*
+ * __dbt_ferr --
+ * Check a DBT for flag errors.
+ */
+static int
+__dbt_ferr(dbp, name, dbt, check_thread)
+ const DB *dbp;
+ const char *name;
+ const DBT *dbt;
+ int check_thread;
+{
+ ENV *env;
+ int ret;
+
+ env = dbp->env;
+
+ /*
+ * Check for invalid DBT flags. We allow any of the flags to be
+ * specified to any DB or DBcursor call so that applications can
+ * set DB_DBT_MALLOC when retrieving a data item from a secondary
+ * database and then specify that same DBT as a key to a primary
+ * database, without having to clear flags.
+ */
+ if ((ret = __db_fchk(env, name, dbt->flags,
+ DB_DBT_APPMALLOC | DB_DBT_BULK | DB_DBT_DUPOK |
+ DB_DBT_MALLOC | DB_DBT_REALLOC | DB_DBT_USERCOPY |
+ DB_DBT_USERMEM | DB_DBT_PARTIAL | DB_DBT_READONLY)) != 0)
+ return (ret);
+ switch (F_ISSET(dbt, DB_DBT_MALLOC | DB_DBT_REALLOC |
+ DB_DBT_USERCOPY | DB_DBT_USERMEM)) {
+ case 0:
+ case DB_DBT_MALLOC:
+ case DB_DBT_REALLOC:
+ case DB_DBT_USERCOPY:
+ case DB_DBT_USERMEM:
+ break;
+ default:
+ return (__db_ferr(env, name, 1));
+ }
+
+ if (F_ISSET(dbt, DB_DBT_BULK) && F_ISSET(dbt, DB_DBT_PARTIAL)) {
+ __db_errx(env, DB_STR_A("0629",
+ "Bulk and partial operations cannot be combined on %s DBT",
+ "%s"), name);
+ return (EINVAL);
+ }
+
+ if (check_thread && DB_IS_THREADED(dbp) &&
+ !F_ISSET(dbt, DB_DBT_MALLOC | DB_DBT_REALLOC |
+ DB_DBT_USERCOPY | DB_DBT_USERMEM | DB_DBT_READONLY)) {
+ __db_errx(env, DB_STR_A("0630",
+ "DB_THREAD mandates memory allocation flag on %s DBT",
+ "%s"), name);
+ return (EINVAL);
+ }
+ return (0);
+}
+
+/*
+ * __db_curinval --
+ * Report that a cursor is in an invalid state.
+ */
+static int
+__db_curinval(env)
+ const ENV *env;
+{
+ __db_errx(env, DB_STR("0631",
+ "Cursor position must be set before performing this operation"));
+ return (EINVAL);
+}
+
+/*
+ * __db_txn_auto_init --
+ * Handle DB_AUTO_COMMIT initialization.
+ *
+ * PUBLIC: int __db_txn_auto_init __P((ENV *, DB_THREAD_INFO *, DB_TXN **));
+ */
+int
+__db_txn_auto_init(env, ip, txnidp)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ DB_TXN **txnidp;
+{
+ /*
+ * Method calls where applications explicitly specify DB_AUTO_COMMIT
+ * require additional validation: the DB_AUTO_COMMIT flag cannot be
+ * specified if a transaction cookie is also specified, nor can the
+ * flag be specified in a non-transactional environment.
+ */
+ if (*txnidp != NULL && !F_ISSET(*txnidp, TXN_FAMILY)) {
+ __db_errx(env, DB_STR("0632",
+ "DB_AUTO_COMMIT may not be specified along with a transaction handle"));
+ return (EINVAL);
+ }
+
+ if (!TXN_ON(env)) {
+ __db_errx(env, DB_STR("0633",
+ "DB_AUTO_COMMIT may not be specified in non-transactional environment"));
+ return (EINVAL);
+ }
+
+ /*
+ * Our caller checked to see if replication is making a state change.
+ * Don't call the user-level API (which would repeat that check).
+ */
+ return (__txn_begin(env, ip, *txnidp, txnidp, 0));
+}
+
+/*
+ * __db_txn_auto_resolve --
+ * Resolve local transactions.
+ *
+ * PUBLIC: int __db_txn_auto_resolve __P((ENV *, DB_TXN *, int, int));
+ */
+int
+__db_txn_auto_resolve(env, txn, nosync, ret)
+ ENV *env;
+ DB_TXN *txn;
+ int nosync, ret;
+{
+ int t_ret;
+
+ if (ret == 0)
+ return (__txn_commit(txn, nosync ? DB_TXN_NOSYNC : 0));
+
+ if ((t_ret = __txn_abort(txn)) != 0)
+ return (__env_panic(env, t_ret));
+
+ return (ret);
+}
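+
+/*
+ * An illustrative sketch, not part of the library source: together the
+ * two routines above implement auto-commit. A method called with a
+ * NULL transaction in a transactional environment effectively runs as
+ *
+ *     __db_txn_auto_init(env, ip, &txn);        begin a local txn
+ *     ret = ...the work routine...;
+ *     __db_txn_auto_resolve(env, txn, 0, ret);  commit on 0, else abort
+ *
+ * and a failure of the abort itself panics the environment, as coded
+ * in __db_txn_auto_resolve.
+ */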
diff --git a/src/db/db_join.c b/src/db/db_join.c
new file mode 100644
index 00000000..751cf9e2
--- /dev/null
+++ b/src/db/db_join.c
@@ -0,0 +1,940 @@
+/*
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_join.h"
+#include "dbinc/btree.h"
+#include "dbinc/lock.h"
+
+static int __db_join_close_pp __P((DBC *));
+static int __db_join_cmp __P((const void *, const void *));
+static int __db_join_del __P((DBC *, u_int32_t));
+static int __db_join_get __P((DBC *, DBT *, DBT *, u_int32_t));
+static int __db_join_get_pp __P((DBC *, DBT *, DBT *, u_int32_t));
+static int __db_join_getnext __P((DBC *, DBT *, DBT *, u_int32_t, u_int32_t));
+static int __db_join_primget __P((DB *, DB_THREAD_INFO *,
+ DB_TXN *, DB_LOCKER *, DBT *, DBT *, u_int32_t));
+static int __db_join_put __P((DBC *, DBT *, DBT *, u_int32_t));
+
+/*
+ * Check to see if the Nth secondary cursor of join cursor jc is pointing
+ * to a sorted duplicate set.
+ */
+#define SORTED_SET(jc, n) ((jc)->j_curslist[(n)]->dbp->dup_compare != NULL)
+
+/*
+ * This is the duplicate-assisted join functionality. Right now we're
+ * going to write it such that we return one item at a time, although
+ * I think we may need to optimize it to return them all at once.
+ * It should be easier to get it working this way, and I believe that
+ * changing it should be fairly straightforward.
+ *
+ * We optimize the join by sorting cursors from smallest to largest
+ * cardinality. In most cases, this is indeed optimal. However, if
+ * a cursor with large cardinality has very few data in common with the
+ * first cursor, it is possible that the join will be made faster by
+ * putting it earlier in the cursor list. Since we have no way to detect
+ * cases like this, we simply provide a flag, DB_JOIN_NOSORT, which retains
+ * the sort order specified by the caller, who may know more about the
+ * structure of the data.
+ *
+ * The first cursor moves sequentially through the duplicate set while
+ * the others search explicitly for the duplicate in question.
+ *
+ */
+
+/*
+ * __db_join --
+ * This is the interface to the duplicate-assisted join functionality.
+ * In the same way that cursors mark a position in a database, a cursor
+ * can mark a position in a join. While most cursors are created by the
+ * cursor method of a DB, join cursors are created through an explicit
+ * call to DB->join.
+ *
+ * The curslist is an array of existing, initialized cursors and primary
+ * is the DB of the primary file. The data item that joins all the
+ * cursors in the curslist is used as the key into the primary and that
+ * key and data are returned. When no more items are left in the join
+ * set, the c_next operation off the join cursor will return DB_NOTFOUND.
+ *
+ * PUBLIC: int __db_join __P((DB *, DBC **, DBC **, u_int32_t));
+ */
+int
+__db_join(primary, curslist, dbcp, flags)
+ DB *primary;
+ DBC **curslist, **dbcp;
+ u_int32_t flags;
+{
+ DBC *dbc;
+ ENV *env;
+ JOIN_CURSOR *jc;
+ size_t ncurs, nslots;
+ u_int32_t i;
+ int ret;
+
+ env = primary->env;
+ dbc = NULL;
+ jc = NULL;
+
+ if ((ret = __os_calloc(env, 1, sizeof(DBC), &dbc)) != 0)
+ goto err;
+
+ if ((ret = __os_calloc(env, 1, sizeof(JOIN_CURSOR), &jc)) != 0)
+ goto err;
+
+ if ((ret = __os_malloc(env, 256, &jc->j_key.data)) != 0)
+ goto err;
+ jc->j_key.ulen = 256;
+ F_SET(&jc->j_key, DB_DBT_USERMEM);
+
+ F_SET(&jc->j_rdata, DB_DBT_REALLOC);
+
+ for (jc->j_curslist = curslist;
+ *jc->j_curslist != NULL; jc->j_curslist++)
+ ;
+
+ /*
+ * The number of cursor slots we allocate is one greater than
+ * the number of cursors involved in the join, because the
+ * list is NULL-terminated.
+ */
+ ncurs = (size_t)(jc->j_curslist - curslist);
+ nslots = ncurs + 1;
+
+ /*
+ * !!! -- A note on the various lists hanging off jc.
+ *
+ * j_curslist is the initial NULL-terminated list of cursors passed
+ * into __db_join. The original cursors are not modified; pristine
+ * copies are required because, in databases with unsorted dups, we
+ * must reset all of the secondary cursors after the first each
+ * time the first one is incremented, or else we will lose data
+ * which happen to be sorted differently in two different cursors.
+ *
+ * j_workcurs is where we put those copies that we're planning to
+ * work with. They're lazily c_dup'ed from j_curslist as we need
+ * them, and closed when the join cursor is closed or when we need
+ * to reset them to their original values (in which case we just
+ * c_dup afresh).
+ *
+ * j_fdupcurs is an array of cursors which point to the first
+ * duplicate in the duplicate set that contains the data value
+ * we're currently interested in. We need this to make
+ * __db_join_get correctly return duplicate duplicates; i.e., if a
+ * given data value occurs twice in the set belonging to cursor #2,
+ * and thrice in the set belonging to cursor #3, and once in all
+ * the other cursors, successive calls to __db_join_get need to
+ * return that data item six times. To make this happen, each time
+ * cursor N is allowed to advance to a new datum, all cursors M
+ * such that M > N have to be reset to the first duplicate with
+ * that datum, so __db_join_get will return all the dup-dups again.
+ * We could just reset them to the original cursor from j_curslist,
+ * but that would be a bit slower in the unsorted case and a LOT
+ * slower in the sorted one.
+ *
+ * j_exhausted is a list of boolean values which represent
+ * whether or not their corresponding cursors are "exhausted",
+ * i.e. whether the datum under the corresponding cursor has
+ * been found not to exist in any unreturned combinations of
+ * later secondary cursors, in which case they are ready to be
+ * incremented.
+ */
+
+ /* We don't want to free regions whose callocs have failed. */
+ jc->j_curslist = NULL;
+ jc->j_workcurs = NULL;
+ jc->j_fdupcurs = NULL;
+ jc->j_exhausted = NULL;
+
+ if ((ret = __os_calloc(env, nslots, sizeof(DBC *),
+ &jc->j_curslist)) != 0)
+ goto err;
+ if ((ret = __os_calloc(env, nslots, sizeof(DBC *),
+ &jc->j_workcurs)) != 0)
+ goto err;
+ if ((ret = __os_calloc(env, nslots, sizeof(DBC *),
+ &jc->j_fdupcurs)) != 0)
+ goto err;
+ if ((ret = __os_calloc(env, nslots, sizeof(u_int8_t),
+ &jc->j_exhausted)) != 0)
+ goto err;
+ for (i = 0; curslist[i] != NULL; i++) {
+ jc->j_curslist[i] = curslist[i];
+ jc->j_workcurs[i] = NULL;
+ jc->j_fdupcurs[i] = NULL;
+ jc->j_exhausted[i] = 0;
+ }
+ jc->j_ncurs = (u_int32_t)ncurs;
+
+ /*
+ * If DB_JOIN_NOSORT is not set, optimize secondary cursors by
+ * sorting in order of increasing cardinality.
+ */
+ if (!LF_ISSET(DB_JOIN_NOSORT))
+ qsort(jc->j_curslist, ncurs, sizeof(DBC *), __db_join_cmp);
+
+ /*
+ * We never need to reset the 0th cursor, so there's no
+ * solid reason to use workcurs[0] rather than curslist[0] in
+ * join_get. Nonetheless, it feels cleaner to do it for symmetry,
+ * and this is the most logical place to copy it.
+ *
+ * !!!
+ * There's no need to close the new cursor if we goto err only
+ * because this is the last thing that can fail. Modifier of this
+ * function beware!
+ */
+ if ((ret =
+ __dbc_dup(jc->j_curslist[0], jc->j_workcurs, DB_POSITION)) != 0)
+ goto err;
+
+ dbc->close = dbc->c_close = __db_join_close_pp;
+ dbc->del = dbc->c_del = __db_join_del;
+ dbc->get = dbc->c_get = __db_join_get_pp;
+ dbc->put = dbc->c_put = __db_join_put;
+ dbc->internal = (DBC_INTERNAL *)jc;
+ dbc->dbp = primary;
+ jc->j_primary = primary;
+
+ /* Stash the first cursor's transaction here for easy access. */
+ dbc->txn = curslist[0]->txn;
+
+ *dbcp = dbc;
+
+ MUTEX_LOCK(env, primary->mutex);
+ TAILQ_INSERT_TAIL(&primary->join_queue, dbc, links);
+ MUTEX_UNLOCK(env, primary->mutex);
+
+ return (0);
+
+err: if (jc != NULL) {
+ if (jc->j_curslist != NULL)
+ __os_free(env, jc->j_curslist);
+ if (jc->j_workcurs != NULL) {
+ if (jc->j_workcurs[0] != NULL)
+ (void)__dbc_close(jc->j_workcurs[0]);
+ __os_free(env, jc->j_workcurs);
+ }
+ if (jc->j_fdupcurs != NULL)
+ __os_free(env, jc->j_fdupcurs);
+ if (jc->j_exhausted != NULL)
+ __os_free(env, jc->j_exhausted);
+ __os_free(env, jc);
+ }
+ if (dbc != NULL)
+ __os_free(env, dbc);
+ return (ret);
+}
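+
+/*
+ * An illustrative sketch, not part of the library source: a typical
+ * DB->join over two secondary cursors, with hypothetical handles. The
+ * cursor list must be NULL-terminated, as __db_join assumes above.
+ *
+ *     DBC *curslist[3], *join_curs;
+ *     DBT key, data;
+ *
+ *     memset(&key, 0, sizeof(key));
+ *     memset(&data, 0, sizeof(data));
+ *     curslist[0] = color_curs;    positioned with DB_SET beforehand
+ *     curslist[1] = size_curs;     positioned with DB_SET beforehand
+ *     curslist[2] = NULL;
+ *     if ((ret = primary->join(primary, curslist, &join_curs, 0)) == 0)
+ *         while ((ret = join_curs->get(join_curs,
+ *             &key, &data, 0)) == 0)
+ *             ...process the matching primary key/data pair...
+ */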
+
+/*
+ * __db_join_close_pp --
+ * DBC->close pre/post processing for join cursors.
+ */
+static int
+__db_join_close_pp(dbc)
+ DBC *dbc;
+{
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ret, t_ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ ENV_ENTER(env, ip);
+
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check &&
+ (ret = __db_rep_enter(dbp, 1, 0, IS_REAL_TXN(dbc->txn))) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ ret = __db_join_close(dbc);
+
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+err: ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+static int
+__db_join_put(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key;
+ DBT *data;
+ u_int32_t flags;
+{
+ COMPQUIET(dbc, NULL);
+ COMPQUIET(key, NULL);
+ COMPQUIET(data, NULL);
+ COMPQUIET(flags, 0);
+ return (EINVAL);
+}
+
+static int
+__db_join_del(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ COMPQUIET(dbc, NULL);
+ COMPQUIET(flags, 0);
+ return (EINVAL);
+}
+
+/*
+ * __db_join_get_pp --
+ * DBjoin->get pre/post processing.
+ */
+static int
+__db_join_get_pp(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ u_int32_t handle_check, save_flags;
+ int ret, t_ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ /* Save the original flags value. */
+ save_flags = flags;
+
+ if (LF_ISSET(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW)) {
+ if (!LOCKING_ON(env))
+ return (__db_fnl(env, "DBC->get"));
+ LF_CLR(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW);
+ }
+
+ switch (flags) {
+ case 0:
+ case DB_JOIN_ITEM:
+ break;
+ default:
+ return (__db_ferr(env, "DBC->get", 0));
+ }
+
+ /*
+ * A partial get of the key of a join cursor doesn't make much sense;
+ * the entire key is necessary to query the primary database
+ * and find the datum, and so regardless of the size of the key
+ * it would not be a performance improvement. Since it would require
+ * special handling, we simply disallow it.
+ *
+ * A partial get of the data, however, potentially makes sense (if
+ * all possible data are a predictably large structure, for instance)
+ * and causes us no headaches, so we permit it.
+ */
+ if (F_ISSET(key, DB_DBT_PARTIAL)) {
+ __db_errx(env, DB_STR("0516",
+ "DB_DBT_PARTIAL may not be set on key during join_get"));
+ return (EINVAL);
+ }
+
+ ENV_ENTER(env, ip);
+
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check &&
+ (ret = __db_rep_enter(dbp, 1, 0, IS_REAL_TXN(dbc->txn))) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ /* Restore the original flags value. */
+ flags = save_flags;
+
+ ret = __db_join_get(dbc, key, data, flags);
+
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+err: ENV_LEAVE(env, ip);
+ __dbt_userfree(env, key, NULL, NULL);
+ return (ret);
+}
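+
+/*
+ * An illustrative sketch, not part of the library source: passing
+ * DB_JOIN_ITEM to the join cursor's get returns only the next joined
+ * key and skips the lookup in the primary, so the data DBT is left
+ * untouched. Handles are hypothetical.
+ *
+ *     while ((ret = join_curs->get(join_curs,
+ *         &key, &data, DB_JOIN_ITEM)) == 0)
+ *         ...key holds the next matching primary key...
+ */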
+
+static int
+__db_join_get(dbc, key_arg, data_arg, flags)
+ DBC *dbc;
+ DBT *key_arg, *data_arg;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DBC *cp;
+ DBT *key_n, key_n_mem;
+ ENV *env;
+ JOIN_CURSOR *jc;
+ int db_manage_data, ret;
+ u_int32_t i, j, operation, opmods;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ jc = (JOIN_CURSOR *)dbc->internal;
+
+ operation = LF_ISSET(DB_OPFLAGS_MASK);
+
+ /* !!!
+ * If the set of flags here changes, check that __db_join_primget
+ * is updated to handle them properly.
+ */
+ opmods = LF_ISSET(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW);
+
+ /*
+ * Since we are fetching the key as a datum in the secondary indices,
+ * we must be careful of caller-specified DB_DBT_* memory
+ * management flags. If necessary, use a stack-allocated DBT;
+ * we'll appropriately copy and/or allocate the data later.
+ */
+ if (F_ISSET(key_arg,
+ DB_DBT_MALLOC | DB_DBT_USERCOPY | DB_DBT_USERMEM)) {
+ /* We just use the default buffer; no need to go malloc. */
+ key_n = &key_n_mem;
+ memset(key_n, 0, sizeof(DBT));
+ } else {
+ /*
+ * Either DB_DBT_REALLOC or the default buffer will work
+ * fine if we have to reuse it, as we do.
+ */
+ key_n = key_arg;
+ }
+ if (F_ISSET(key_arg, DB_DBT_USERCOPY))
+ key_arg->data = NULL;
+
+ /*
+ * If our last attempt to do a get on the primary key failed,
+ * short-circuit the join and try again with the same key.
+ */
+ if (F_ISSET(jc, JOIN_RETRY))
+ goto samekey;
+ F_CLR(jc, JOIN_RETRY);
+
+retry: ret = __dbc_get(jc->j_workcurs[0], &jc->j_key, key_n,
+ opmods | (jc->j_exhausted[0] ? DB_NEXT_DUP : DB_CURRENT));
+
+ if (ret == DB_BUFFER_SMALL) {
+ jc->j_key.ulen <<= 1;
+ if ((ret = __os_realloc(env,
+ jc->j_key.ulen, &jc->j_key.data)) != 0)
+ goto mem_err;
+ goto retry;
+ }
+
+ /*
+ * If ret == DB_NOTFOUND, we're out of elements of the first
+ * secondary cursor. This is how we finally finish the join
+ * if all goes well.
+ */
+ if (ret != 0)
+ goto err;
+
+ /*
+ * If jc->j_exhausted[0] == 1, we've just advanced the first cursor,
+ * and we're going to want to advance all the cursors that point to
+ * the first member of a duplicate duplicate set (j_fdupcurs[1..N]).
+ * Close all the cursors in j_fdupcurs; we'll reopen them the
+ * first time through the upcoming loop.
+ */
+ for (i = 1; i < jc->j_ncurs; i++) {
+ if (jc->j_fdupcurs[i] != NULL &&
+ (ret = __dbc_close(jc->j_fdupcurs[i])) != 0)
+ goto err;
+ jc->j_fdupcurs[i] = NULL;
+ }
+
+ /*
+ * If jc->j_curslist[1] == NULL, we have only one cursor in the join.
+ * Thus, we can safely increment that one cursor on each call
+ * to __db_join_get, and we signal this by setting jc->j_exhausted[0]
+ * right away.
+ *
+ * Otherwise, reset jc->j_exhausted[0] to 0, so that we don't
+ * increment it until we know we're ready to.
+ */
+ if (jc->j_curslist[1] == NULL)
+ jc->j_exhausted[0] = 1;
+ else
+ jc->j_exhausted[0] = 0;
+
+ /* We have the first element; now look for it in the other cursors. */
+ for (i = 1; i < jc->j_ncurs; i++) {
+ DB_ASSERT(env, jc->j_curslist[i] != NULL);
+ if (jc->j_workcurs[i] == NULL)
+ /* If this is NULL, we need to dup curslist into it. */
+ if ((ret = __dbc_dup(jc->j_curslist[i],
+ &jc->j_workcurs[i], DB_POSITION)) != 0)
+ goto err;
+
+retry2: cp = jc->j_workcurs[i];
+
+ if ((ret = __db_join_getnext(cp, &jc->j_key, key_n,
+ jc->j_exhausted[i], opmods)) == DB_NOTFOUND) {
+ /*
+ * jc->j_workcurs[i] has no more of the datum we're
+ * interested in. Go back one cursor and get
+ * a new dup. We can't just move to a new
+ * element of the outer relation, because that way
+ * we might miss duplicate duplicates in cursor i-1.
+ *
+ * If this takes us back to the first cursor,
+ * -then- we can move to a new element of the outer
+ * relation.
+ */
+ --i;
+ jc->j_exhausted[i] = 1;
+
+ if (i == 0) {
+ for (j = 1; jc->j_workcurs[j] != NULL; j++) {
+ /*
+ * We're moving to a new element of
+ * the first secondary cursor. If
+ * that cursor is sorted, then any
+ * other sorted cursors can be safely
+ * reset to the first duplicate
+ * duplicate in the current set if we
+ * have a pointer to it (we can't just
+ * leave them be, or we'll miss
+ * duplicate duplicates in the outer
+ * relation).
+ *
+ * If the first cursor is unsorted, or
+ * if cursor j is unsorted, we can
+ * make no assumptions about what
+ * we're looking for next or where it
+ * will be, so we reset to the very
+ * beginning (setting workcurs NULL
+ * will achieve this next go-round).
+ *
+ * XXX: This is likely to break
+ * horribly if any two cursors are
+ * both sorted, but have different
+ * specified sort functions. For
+ * now, we dismiss this as pathology
+ * and let strange things happen--we
+ * can't make rope childproof.
+ */
+ if ((ret = __dbc_close(
+ jc->j_workcurs[j])) != 0)
+ goto err;
+ if (!SORTED_SET(jc, 0) ||
+ !SORTED_SET(jc, j) ||
+ jc->j_fdupcurs[j] == NULL)
+ /*
+ * Unsafe conditions;
+ * reset fully.
+ */
+ jc->j_workcurs[j] = NULL;
+ else
+ /* Partial reset suffices. */
+ if ((ret = __dbc_dup(
+ jc->j_fdupcurs[j],
+ &jc->j_workcurs[j],
+ DB_POSITION)) != 0)
+ goto err;
+ jc->j_exhausted[j] = 0;
+ }
+ goto retry;
+ /* NOTREACHED */
+ }
+
+ /*
+ * We're about to advance the cursor and need to
+ * reset all of the workcurs[j] where j>i, so that
+ * we don't miss any duplicate duplicates.
+ */
+ for (j = i + 1;
+ jc->j_workcurs[j] != NULL;
+ j++) {
+ if ((ret =
+ __dbc_close(jc->j_workcurs[j])) != 0)
+ goto err;
+ jc->j_exhausted[j] = 0;
+ if (jc->j_fdupcurs[j] == NULL)
+ jc->j_workcurs[j] = NULL;
+ else if ((ret = __dbc_dup(jc->j_fdupcurs[j],
+ &jc->j_workcurs[j], DB_POSITION)) != 0)
+ goto err;
+ }
+ goto retry2;
+ /* NOTREACHED */
+ }
+
+ if (ret == DB_BUFFER_SMALL) {
+ jc->j_key.ulen <<= 1;
+ if ((ret = __os_realloc(env, jc->j_key.ulen,
+ &jc->j_key.data)) != 0) {
+mem_err: __db_errx(env, DB_STR_A("0517",
+ "Allocation failed for join key, len = %lu",
+ "%lu"), (u_long)jc->j_key.ulen);
+ goto err;
+ }
+ goto retry2;
+ }
+
+ if (ret != 0)
+ goto err;
+
+ /*
+ * If we made it this far, we've found a matching
+ * datum in cursor i. Mark the current cursor
+ * unexhausted, so we don't miss any duplicate
+ * duplicates the next go-round--unless this is the
+ * very last cursor, in which case there are none to
+ * miss, and we'll need that exhausted flag to finally
+ * get a DB_NOTFOUND and move on to the next datum in
+ * the outermost cursor.
+ */
+ if (i + 1 != jc->j_ncurs)
+ jc->j_exhausted[i] = 0;
+ else
+ jc->j_exhausted[i] = 1;
+
+ /*
+ * If jc->j_fdupcurs[i] is NULL and the ith cursor's dups are
+ * sorted, then we're here for the first time since advancing
+ * cursor 0, and we have a new datum of interest.
+ * jc->j_workcurs[i] points to the beginning of a set of
+ * duplicate duplicates; store this into jc->j_fdupcurs[i].
+ */
+ if (SORTED_SET(jc, i) && jc->j_fdupcurs[i] == NULL && (ret =
+ __dbc_dup(cp, &jc->j_fdupcurs[i], DB_POSITION)) != 0)
+ goto err;
+ }
+
+err: if (ret != 0)
+ return (ret);
+
+ if (0) {
+samekey: /*
+ * Get the key we tried and failed to return last time;
+ * it should be the current datum of all the secondary cursors.
+ */
+ if ((ret = __dbc_get(jc->j_workcurs[0],
+ &jc->j_key, key_n, DB_CURRENT | opmods)) != 0)
+ return (ret);
+ F_CLR(jc, JOIN_RETRY);
+ }
+
+ /*
+ * ret == 0; we have a key to return.
+ *
+ * If DB_DBT_USERMEM or DB_DBT_MALLOC is set, we need to copy the key
+ * back into the dbt we were given for the key; call __db_retcopy.
+ * Otherwise, assert that we do not need to copy anything and proceed.
+ */
+ DB_ASSERT(env, F_ISSET(key_arg, DB_DBT_USERMEM | DB_DBT_MALLOC |
+ DB_DBT_USERCOPY) || key_n == key_arg);
+
+ if ((F_ISSET(key_arg, DB_DBT_USERMEM | DB_DBT_MALLOC |
+ DB_DBT_USERCOPY)) &&
+ (ret = __db_retcopy(env,
+ key_arg, key_n->data, key_n->size, NULL, NULL)) != 0) {
+ /*
+ * The retcopy failed, most commonly because we have a user
+ * buffer for the key which is too small. Set things up to
+ * retry next time, and return.
+ */
+ F_SET(jc, JOIN_RETRY);
+ return (ret);
+ }
+
+ /*
+ * If DB_JOIN_ITEM is set, we return it; otherwise we do the lookup
+ * in the primary and then return.
+ */
+ if (operation == DB_JOIN_ITEM)
+ return (0);
+
+ /*
+ * If data_arg->flags == 0--that is, if DB is managing the
+ * data DBT's memory--it's not safe to just pass the DBT
+ * through to the primary get call, since we don't want that
+ * memory to belong to the primary DB handle (and if the primary
+ * is free-threaded, it can't anyway).
+ *
+ * Instead, use memory that is managed by the join cursor, in
+ * jc->j_rdata.
+ */
+ if (!F_ISSET(data_arg, DB_DBT_MALLOC | DB_DBT_REALLOC |
+ DB_DBT_USERMEM | DB_DBT_USERCOPY))
+ db_manage_data = 1;
+ else
+ db_manage_data = 0;
+ if ((ret = __db_join_primget(jc->j_primary, dbc->thread_info,
+ jc->j_curslist[0]->txn, jc->j_curslist[0]->locker, key_n,
+ db_manage_data ? &jc->j_rdata : data_arg, opmods)) != 0) {
+ if (ret == DB_NOTFOUND) {
+ if (LF_ISSET(DB_READ_UNCOMMITTED) ||
+ (jc->j_curslist[0]->txn != NULL && F_ISSET(
+ jc->j_curslist[0]->txn, TXN_READ_UNCOMMITTED)))
+ goto retry;
+ /*
+ * If ret == DB_NOTFOUND, the primary and secondary
+ * are out of sync; every item in each secondary
+ * should correspond to something in the primary,
+ * or we shouldn't have done the join this way.
+ * Wail.
+ */
+ ret = __db_secondary_corrupt(jc->j_primary);
+ } else
+ /*
+ * The get on the primary failed for some other
+ * reason, most commonly because we're using a user
+ * buffer that's not big enough. Flag our failure
+ * so we can return the same key next time.
+ */
+ F_SET(jc, JOIN_RETRY);
+ }
+ if (db_manage_data && ret == 0) {
+ data_arg->data = jc->j_rdata.data;
+ data_arg->size = jc->j_rdata.size;
+ }
+
+ return (ret);
+}
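+
+/*
+ * A minimal sketch of how an application might drive the join cursor
+ * implemented above through the public API. The pers_db primary and
+ * the job_curs/name_curs secondary cursors are hypothetical and
+ * assumed to be open and positioned with DB_SET.
+ */
+#if 0
+ DBC *curs[3], *join_curs;
+ DBT key, data;
+ int ret;
+
+ curs[0] = job_curs;
+ curs[1] = name_curs;
+ curs[2] = NULL; /* The cursor list is NULL-terminated. */
+
+ if ((ret = pers_db->join(pers_db, curs, &join_curs, 0)) != 0)
+ goto err;
+
+ /* Each get returns a primary key/data pair matching every cursor. */
+ memset(&key, 0, sizeof(key));
+ memset(&data, 0, sizeof(data));
+ while ((ret = join_curs->get(join_curs, &key, &data, 0)) == 0)
+ ; /* Process the key/data pair. */
+ if (ret == DB_NOTFOUND) /* Normal end of the join. */
+ ret = 0;
+ (void)join_curs->close(join_curs);
+#endif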
+
+/*
+ * __db_join_close --
+ * DBC->close for join cursors.
+ *
+ * PUBLIC: int __db_join_close __P((DBC *));
+ */
+int
+__db_join_close(dbc)
+ DBC *dbc;
+{
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ JOIN_CURSOR *jc;
+ int ret, t_ret;
+ u_int32_t i;
+
+ jc = (JOIN_CURSOR *)dbc->internal;
+ dbp = dbc->dbp;
+ env = dbp->env;
+ ret = t_ret = 0;
+
+ /*
+ * Remove from active list of join cursors. Note that this
+ * must happen before any action that can fail and return, or else
+ * __db_close may loop indefinitely.
+ */
+ MUTEX_LOCK(env, dbp->mutex);
+ TAILQ_REMOVE(&dbp->join_queue, dbc, links);
+ MUTEX_UNLOCK(env, dbp->mutex);
+
+ ENV_ENTER(env, ip);
+ /*
+ * Close any open scratch cursors. In each case, there may
+ * not be as many outstanding as there are cursors in
+ * curslist, but we want to close whatever's there.
+ *
+ * If any close fails, there's no reason not to close everything else;
+ * we'll just return the error code of the last one to fail. There's
+ * not much the caller can do anyway, since these cursors only exist
+ * hanging off a db-internal data structure that they shouldn't be
+ * mucking with.
+ */
+ for (i = 0; i < jc->j_ncurs; i++) {
+ if (jc->j_workcurs[i] != NULL &&
+ (t_ret = __dbc_close(jc->j_workcurs[i])) != 0)
+ ret = t_ret;
+ if (jc->j_fdupcurs[i] != NULL &&
+ (t_ret = __dbc_close(jc->j_fdupcurs[i])) != 0)
+ ret = t_ret;
+ }
+ ENV_LEAVE(env, ip);
+
+ __os_free(env, jc->j_exhausted);
+ __os_free(env, jc->j_curslist);
+ __os_free(env, jc->j_workcurs);
+ __os_free(env, jc->j_fdupcurs);
+ __os_free(env, jc->j_key.data);
+ if (jc->j_rdata.data != NULL)
+ __os_ufree(env, jc->j_rdata.data);
+ __os_free(env, jc);
+ __os_free(env, dbc);
+
+ return (ret);
+}
+
+/*
+ * __db_join_getnext --
+ * This function replaces the DBC_CONTINUE and DBC_KEYSET
+ * functionality inside the various cursor get routines.
+ *
+ * If exhausted == 0, we're not done with the current datum;
+ * return it if it matches "matching", otherwise search
+ * using DB_GET_BOTHC (which is faster than iteratively doing
+ * DB_NEXT_DUP) forward until we find one that does.
+ *
+ * If exhausted == 1, we are done with the current datum, so just
+ * leap forward to searching NEXT_DUPs.
+ *
+ * If no matching datum exists, returns DB_NOTFOUND, else 0.
+ */
+static int
+__db_join_getnext(dbc, key, data, exhausted, opmods)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t exhausted, opmods;
+{
+ int ret, cmp;
+ DB *dbp;
+ DBT ldata;
+ int (*func) __P((DB *, const DBT *, const DBT *));
+
+ dbp = dbc->dbp;
+ func = (dbp->dup_compare == NULL) ? __bam_defcmp : dbp->dup_compare;
+
+ switch (exhausted) {
+ case 0:
+ /*
+ * We don't want to step on data->data; use a new
+ * DBT and malloc so we don't step on dbc's rdata memory.
+ */
+ memset(&ldata, 0, sizeof(DBT));
+ F_SET(&ldata, DB_DBT_MALLOC);
+ if ((ret = __dbc_get(dbc,
+ key, &ldata, opmods | DB_CURRENT)) != 0)
+ break;
+ cmp = func(dbp, data, &ldata);
+ if (cmp == 0) {
+ /*
+ * We have to return the real data value. Copy
+ * it into data, then free the buffer we malloc'ed
+ * above.
+ */
+ if ((ret = __db_retcopy(dbp->env, data, ldata.data,
+ ldata.size, &data->data, &data->size)) != 0)
+ return (ret);
+ __os_ufree(dbp->env, ldata.data);
+ return (0);
+ }
+
+ /*
+ * Didn't match--we want to fall through and search future
+ * dups. We just forget about ldata and free
+ * its buffer--data contains the value we're searching for.
+ */
+ __os_ufree(dbp->env, ldata.data);
+ /* FALLTHROUGH */
+ case 1:
+ ret = __dbc_get(dbc, key, data, opmods | DB_GET_BOTHC);
+ break;
+ default:
+ ret = EINVAL;
+ break;
+ }
+
+ return (ret);
+}
+
+/*
+ * __db_join_cmp --
+ * Comparison function for sorting DBCs in cardinality order.
+ */
+static int
+__db_join_cmp(a, b)
+ const void *a, *b;
+{
+ DBC *dbca, *dbcb;
+ db_recno_t counta, countb;
+
+ dbca = *((DBC * const *)a);
+ dbcb = *((DBC * const *)b);
+
+ if (__dbc_count(dbca, &counta) != 0 ||
+ __dbc_count(dbcb, &countb) != 0)
+ return (0);
+
+ return ((long)counta - (long)countb);
+}
+
+/*
+ * __db_join_primget --
+ * Perform a DB->get in the primary, being careful not to use a new
+ * locker ID if we're doing CDB locking.
+ */
+static int
+__db_join_primget(dbp, ip, txn, locker, key, data, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ DB_LOCKER *locker;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DBC *dbc;
+ u_int32_t rmw;
+ int ret, t_ret;
+
+ if ((ret = __db_cursor_int(dbp, ip,
+ txn, dbp->type, PGNO_INVALID, 0, locker, &dbc)) != 0)
+ return (ret);
+
+ /*
+ * The only allowable flags here are the flags copied into "opmods" in
+ * __db_join_get: DB_RMW, DB_READ_COMMITTED and DB_READ_UNCOMMITTED.
+ * DB_RMW is an op on the get call; the isolation flags are set on the
+ * cursor itself. It's a DB bug if we allow any other flags down in here.
+ */
+ rmw = LF_ISSET(DB_RMW);
+ if (LF_ISSET(DB_READ_UNCOMMITTED) ||
+ (txn != NULL && F_ISSET(txn, TXN_READ_UNCOMMITTED)))
+ F_SET(dbc, DBC_READ_UNCOMMITTED);
+
+ if (LF_ISSET(DB_READ_COMMITTED) ||
+ (txn != NULL && F_ISSET(txn, TXN_READ_COMMITTED)))
+ F_SET(dbc, DBC_READ_COMMITTED);
+
+ LF_CLR(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW);
+ DB_ASSERT(dbp->env, flags == 0);
+
+ F_SET(dbc, DBC_TRANSIENT);
+
+ /*
+ * This shouldn't be necessary, thanks to the fact that join cursors
+ * swap in their own DB_DBT_REALLOC'ed buffers, but just for form's
+ * sake, we mirror what __db_get does.
+ */
+ SET_RET_MEM(dbc, dbp);
+
+ ret = __dbc_get(dbc, key, data, DB_SET | rmw);
+
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __db_secondary_corrupt --
+ * Report primary/secondary inconsistencies.
+ *
+ * PUBLIC: int __db_secondary_corrupt __P((DB *));
+ */
+int
+__db_secondary_corrupt(dbp)
+ DB *dbp;
+{
+ __db_err(dbp->env, DB_SECONDARY_BAD, "%s%s%s",
+ dbp->fname == NULL ? "unnamed" : dbp->fname,
+ dbp->dname == NULL ? "" : "/",
+ dbp->dname == NULL ? "" : dbp->dname);
+ return (DB_SECONDARY_BAD);
+}
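+
+/*
+ * A sketch of the DB_JOIN_ITEM path through __db_join_get above: the
+ * matching primary key is returned without the final lookup in the
+ * primary, so the data DBT is not filled in. The join_curs handle is
+ * hypothetical, as in the sketch following __db_join_get.
+ */
+#if 0
+ while ((ret = join_curs->get(join_curs,
+ &key, &data, DB_JOIN_ITEM)) == 0)
+ ; /* key.data holds a matching primary key. */
+#endif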
diff --git a/src/db/db_meta.c b/src/db/db_meta.c
new file mode 100644
index 00000000..8f97ebd8
--- /dev/null
+++ b/src/db/db_meta.c
@@ -0,0 +1,1428 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+#include "dbinc/db_am.h"
+#include "dbinc/hash.h"
+
+static void __db_init_meta __P((DB *, void *, db_pgno_t, u_int32_t));
+#ifdef HAVE_FTRUNCATE
+static int __db_pglistcmp __P((const void *, const void *));
+static int __db_truncate_freelist __P((DBC *, DBMETA *,
+ PAGE *, db_pgno_t *, u_int32_t, u_int32_t));
+#endif
+
+/*
+ * __db_init_meta --
+ * Helper function for __db_new that initializes the important fields in
+ * a meta-data page (used instead of P_INIT). We need to make sure that we
+ * retain the page number and LSN of the existing page.
+ */
+static void
+__db_init_meta(dbp, p, pgno, pgtype)
+ DB *dbp;
+ void *p;
+ db_pgno_t pgno;
+ u_int32_t pgtype;
+{
+ DBMETA *meta;
+ DB_LSN save_lsn;
+
+ meta = (DBMETA *)p;
+ save_lsn = meta->lsn;
+ memset(meta, 0, sizeof(DBMETA));
+ meta->lsn = save_lsn;
+ meta->pagesize = dbp->pgsize;
+ if (F_ISSET(dbp, DB_AM_CHKSUM))
+ FLD_SET(meta->metaflags, DBMETA_CHKSUM);
+ meta->pgno = pgno;
+ meta->type = (u_int8_t)pgtype;
+}
+
+/*
+ * __db_new --
+ * Get a new page, preferably from the freelist.
+ *
+ * PUBLIC: int __db_new __P((DBC *, u_int32_t, DB_LOCK *, PAGE **));
+ */
+int
+__db_new(dbc, type, lockp, pagepp)
+ DBC *dbc;
+ u_int32_t type;
+ DB_LOCK *lockp;
+ PAGE **pagepp;
+{
+ DB *dbp;
+ DBMETA *meta;
+ DB_LOCK metalock;
+ DB_LSN lsn;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ PAGE *h;
+ db_pgno_t last, *list, pgno, newnext;
+ int extend, hash, ret;
+
+ meta = NULL;
+ dbp = dbc->dbp;
+ env = dbp->env;
+ mpf = dbp->mpf;
+ h = NULL;
+ newnext = PGNO_INVALID;
+ if (lockp != NULL)
+ LOCK_INIT(*lockp);
+
+ hash = 0;
+ ret = 0;
+ LOCK_INIT(metalock);
+
+#ifdef HAVE_HASH
+ if (dbp->type == DB_HASH) {
+ if ((ret = __ham_return_meta(dbc, DB_MPOOL_DIRTY, &meta)) != 0)
+ goto err;
+ if (meta != NULL)
+ hash = 1;
+ }
+#endif
+ if (meta == NULL) {
+ pgno = PGNO_BASE_MD;
+ if ((ret = __db_lget(dbc,
+ LCK_ALWAYS, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
+ DB_MPOOL_DIRTY, &meta)) != 0)
+ goto err;
+ }
+
+ last = meta->last_pgno;
+ if (meta->free == PGNO_INVALID) {
+ if (FLD_ISSET(type, P_DONTEXTEND)) {
+ *pagepp = NULL;
+ goto err;
+ }
+ last = pgno = meta->last_pgno + 1;
+ ZERO_LSN(lsn);
+ extend = 1;
+ } else {
+ pgno = meta->free;
+ /*
+ * Lock the new page. Do this here because we must do it
+ * before getting the page and the caller may need the lock
+ * to keep readers from seeing the page before the transaction
+ * commits. We can do this because no one will hold a free
+ * page locked.
+ */
+ if (lockp != NULL && (ret =
+ __db_lget(dbc, 0, pgno, DB_LOCK_WRITE, 0, lockp)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
+ DB_MPOOL_DIRTY, &h)) != 0)
+ goto err;
+
+ /*
+ * We want to take the first page off the free list and
+ * then set meta->free to that page's next_pgno, but
+ * we need to log the change first.
+ */
+ newnext = h->next_pgno;
+ lsn = h->lsn;
+ extend = 0;
+ DB_ASSERT(env, TYPE(h) == P_INVALID);
+
+ if (TYPE(h) != P_INVALID) {
+ __db_errx(env, DB_STR_A("0689",
+ "%s page %lu is on free list with type %lu",
+ "%s %lu %lu"), dbp->fname, (u_long)PGNO(h),
+ (u_long)TYPE(h));
+ return (__env_panic(env, EINVAL));
+ }
+
+ }
+
+ FLD_CLR(type, P_DONTEXTEND);
+
+ /*
+ * Log the allocation before fetching the new page. If we
+ * don't have room in the log then we don't want to tell
+ * mpool to extend the file.
+ */
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __db_pg_alloc_log(dbp, dbc->txn, &LSN(meta), 0,
+ &LSN(meta), PGNO_BASE_MD, &lsn,
+ pgno, (u_int32_t)type, newnext, meta->last_pgno)) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(meta));
+
+ meta->free = newnext;
+
+ if (extend == 1) {
+ if (lockp != NULL && (ret =
+ __db_lget(dbc, 0, pgno, DB_LOCK_WRITE, 0, lockp)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
+ DB_MPOOL_NEW, &h)) != 0)
+ goto err;
+ DB_ASSERT(env, last == pgno);
+ meta->last_pgno = pgno;
+ ZERO_LSN(h->lsn);
+ h->pgno = pgno;
+
+ /*
+ * If the file was extended for the first time in this
+ * transaction, set the MPOOLFILE's file extension
+ * watermark.
+ */
+ __txn_add_fe_watermark(dbc->txn, dbp, h->pgno);
+
+ }
+ LSN(h) = LSN(meta);
+
+ if (hash == 0 && (ret = __memp_fput(mpf,
+ dbc->thread_info, meta, dbc->priority)) != 0)
+ goto err;
+ meta = NULL;
+
+ switch (type) {
+ case P_BTREEMETA:
+ case P_HASHMETA:
+ case P_QAMMETA:
+ __db_init_meta(dbp, h, h->pgno, type);
+ break;
+ default:
+ P_INIT(h, dbp->pgsize,
+ h->pgno, PGNO_INVALID, PGNO_INVALID, 0, type);
+ break;
+ }
+
+ /* Fix up the sorted free list if necessary. */
+#ifdef HAVE_FTRUNCATE
+ if (extend == 0) {
+ u_int32_t nelems = 0;
+
+ if ((ret = __memp_get_freelist(dbp->mpf, &nelems, &list)) != 0)
+ goto err;
+ if (nelems != 0) {
+ DB_ASSERT(env, h->pgno == list[0]);
+ memmove(list, &list[1], (nelems - 1) * sizeof(*list));
+ if ((ret = __memp_extend_freelist(
+ dbp->mpf, nelems - 1, &list)) != 0)
+ goto err;
+ }
+ }
+#else
+ COMPQUIET(list, NULL);
+#endif
+
+ if ((ret = __TLPUT(dbc, metalock)) != 0)
+ return (ret);
+ *pagepp = h;
+ PERFMON6(env, alloc, new, dbp->fname, dbp->dname, pgno, type, h, 0);
+ return (0);
+
+err: if (h != NULL)
+ (void)__memp_fput(mpf, dbc->thread_info, h, dbc->priority);
+ if (meta != NULL && hash == 0)
+ (void)__memp_fput(mpf, dbc->thread_info, meta, dbc->priority);
+ (void)__TLPUT(dbc, metalock);
+ if (lockp != NULL)
+ (void)__LPUT(dbc, *lockp);
+ /* Failure return - report 0 pgno, null page address. */
+ PERFMON6(env, alloc, new, dbp->fname, dbp->dname, 0, type, NULL, ret);
+ return (ret);
+}
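+
+/*
+ * A distilled sketch of the two allocation paths in __db_new, with
+ * locking, logging and error handling omitted: either pop the head
+ * of the free list, or extend the file one page past last_pgno.
+ */
+#if 0
+ if (meta->free != PGNO_INVALID) {
+ pgno = meta->free; /* Reuse the first free page... */
+ meta->free = h->next_pgno; /* ...its successor heads the list. */
+ } else
+ pgno = ++meta->last_pgno; /* Grow the file by one page. */
+#endif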
+
+/*
+ * __db_free --
+ * Add a page to the head of the freelist.
+ *
+ * PUBLIC: int __db_free __P((DBC *, PAGE *, u_int32_t));
+ */
+int
+__db_free(dbc, h, flags)
+ DBC *dbc;
+ PAGE *h;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DBMETA *meta;
+ DBT ddbt, ldbt;
+ DB_LOCK metalock;
+ DB_LSN *lsnp;
+ DB_MPOOLFILE *mpf;
+ PAGE *prev;
+ db_pgno_t last_pgno, next_pgno, pgno, prev_pgno;
+ u_int32_t lflag;
+ int hash, ret, t_ret;
+#ifdef HAVE_FTRUNCATE
+ db_pgno_t *list, *lp;
+ u_int32_t nelem, position, start;
+ int do_truncate;
+#endif
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ prev_pgno = PGNO_INVALID;
+ meta = NULL;
+ prev = NULL;
+ LOCK_INIT(metalock);
+#ifdef HAVE_FTRUNCATE
+ lp = NULL;
+ nelem = 0;
+ do_truncate = 0;
+#endif
+
+ /*
+ * Retrieve the metadata page. If we are not keeping a sorted
+ * free list, put the page at the head of the free list.
+ * If we are keeping a sorted free list (to support truncation),
+ * figure out where this page belongs and either
+ * link it in or truncate the file as much as possible.
+ * If either the lock get or page get routines
+ * fail, then we need to put the page with which we were called
+ * back because our caller assumes we take care of it.
+ */
+ hash = 0;
+
+ pgno = PGNO_BASE_MD;
+ if ((ret = __db_lget(dbc,
+ LCK_ALWAYS, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
+ goto err;
+
+#ifdef HAVE_HASH
+ if (dbp->type == DB_HASH) {
+ if ((ret = __ham_return_meta(dbc,
+#ifdef HAVE_FTRUNCATE
+ 0,
+#else
+ DB_MPOOL_DIRTY,
+#endif
+ &meta)) != 0)
+ goto err;
+ if (meta != NULL)
+ hash = 1;
+ }
+#endif
+ if (meta == NULL) {
+ /* If we support truncate, we might not dirty the meta page. */
+ if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
+#ifdef HAVE_FTRUNCATE
+ 0,
+#else
+ DB_MPOOL_DIRTY,
+#endif
+ &meta)) != 0)
+ goto err1;
+ }
+
+ last_pgno = meta->last_pgno;
+ next_pgno = meta->free;
+ /*
+ * Assign lsnp here so it is always initialized when
+ * HAVE_FTRUNCATE is not defined.
+ */
+ lsnp = &LSN(meta);
+
+ DB_ASSERT(dbp->env, h->pgno != next_pgno);
+
+#ifdef HAVE_FTRUNCATE
+ /*
+ * If we are maintaining a sorted free list, see if we either have a
+ * new truncation point or the page goes somewhere in the middle of
+ * the list. If it goes in the middle of the list, we will drop the
+ * meta page and get the previous page.
+ */
+ COMPQUIET(position, 0);
+ if ((ret = __memp_get_freelist(mpf, &nelem, &list)) != 0)
+ goto err1;
+ if (list == NULL)
+ goto no_sort;
+
+ if (h->pgno != last_pgno) {
+ /*
+ * Put the page number in the sorted list. Find its
+ * position and the previous page. After logging we
+ * will extend the list, make room and insert the page in
+ * the list.
+ */
+ position = 0;
+ if (nelem != 0) {
+ __db_freelist_pos(h->pgno, list, nelem, &position);
+
+ DB_ASSERT(dbp->env, h->pgno != list[position]);
+
+ /* Get the previous page if this is not the smallest. */
+ if (position != 0 || h->pgno > list[0])
+ prev_pgno = list[position];
+ }
+
+ } else if (nelem != 0) {
+ /* Find the truncation point. */
+ for (lp = &list[nelem - 1]; lp >= list; lp--)
+ if (--last_pgno != *lp)
+ break;
+ if (lp < list || last_pgno < h->pgno - 1)
+ do_truncate = 1;
+ last_pgno = meta->last_pgno;
+ }
+
+no_sort:
+ if (prev_pgno == PGNO_INVALID) {
+#ifdef HAVE_HASH
+ if (hash) {
+ if ((ret =
+ __ham_return_meta(dbc, DB_MPOOL_DIRTY, &meta)) != 0)
+ goto err1;
+ } else
+#endif
+ if ((ret = __memp_dirty(mpf,
+ &meta, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ goto err1;
+ lsnp = &LSN(meta);
+ } else {
+ pgno = prev_pgno;
+ if ((ret = __memp_fget(mpf, &pgno,
+ dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &prev)) != 0)
+ goto err1;
+ next_pgno = NEXT_PGNO(prev);
+ lsnp = &LSN(prev);
+ }
+#endif
+
+ /*
+ * Log the change.
+ * We are either logging an update to the metapage or to the
+ * previous page in the sorted list.
+ */
+ if (DBC_LOGGING(dbc)) {
+ memset(&ldbt, 0, sizeof(ldbt));
+ ldbt.data = h;
+ ldbt.size = P_OVERHEAD(dbp);
+ /*
+ * If we are removing pages from the file, we need to make
+ * sure the logging happens before the truncation. If we
+ * are truncating multiple pages we don't need to flush the
+ * log here as it will be flushed by __db_truncate_freelist.
+ */
+ lflag = 0;
+
+#ifdef HAVE_FTRUNCATE
+ if (h->pgno == last_pgno && do_truncate == 0)
+ lflag = DB_FLUSH;
+#endif
+ switch (h->type) {
+ case P_HASH:
+ case P_IBTREE:
+ case P_IRECNO:
+ case P_LBTREE:
+ case P_LRECNO:
+ case P_LDUP:
+ if (h->entries > 0 && (h->pgno == last_pgno ||
+ !LF_ISSET(DB_LOG_NO_DATA))) {
+ ldbt.size += h->entries * sizeof(db_indx_t);
+ ddbt.data = (u_int8_t *)h + HOFFSET(h);
+ ddbt.size = dbp->pgsize - HOFFSET(h);
+ if ((ret = __db_pg_freedata_log(dbp, dbc->txn,
+ lsnp, lflag,
+ h->pgno, lsnp, pgno,
+ &ldbt, next_pgno, last_pgno, &ddbt)) != 0)
+ goto err1;
+ goto logged;
+ }
+ break;
+ case P_HASHMETA:
+ ldbt.size = sizeof(HMETA);
+ break;
+ case P_BTREEMETA:
+ ldbt.size = sizeof(BTMETA);
+ break;
+ case P_OVERFLOW:
+ ldbt.size += OV_LEN(h);
+ break;
+ default:
+ DB_ASSERT(dbp->env, h->type != P_QAMDATA);
+ }
+
+ if ((ret = __db_pg_free_log(dbp,
+ dbc->txn, lsnp, lflag, h->pgno,
+ lsnp, pgno, &ldbt, next_pgno, last_pgno)) != 0)
+ goto err1;
+ } else
+ LSN_NOT_LOGGED(*lsnp);
+
+logged:
+#ifdef HAVE_FTRUNCATE
+ if (do_truncate) {
+ start = (u_int32_t) (lp - list) + 1;
+ meta->last_pgno--;
+ ret = __db_truncate_freelist(
+ dbc, meta, h, list, start, nelem);
+ h = NULL;
+ } else if (h->pgno == last_pgno) {
+ /*
+ * We are going to throw this page away, but if we are
+ * using MVCC then this version may stick around and we
+ * might have to make a copy.
+ */
+ if (atomic_read(&mpf->mfp->multiversion) &&
+ (ret = __memp_dirty(mpf,
+ &h, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ goto err1;
+ LSN(h) = *lsnp;
+ P_INIT(h, dbp->pgsize,
+ h->pgno, PGNO_INVALID, next_pgno, 0, P_INVALID);
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, h, DB_PRIORITY_VERY_LOW)) != 0)
+ goto err1;
+ h = NULL;
+ /* Give the page back to the OS. */
+ if ((ret = __memp_ftruncate(mpf, dbc->txn, dbc->thread_info,
+ last_pgno, 0)) != 0)
+ goto err1;
+ DB_ASSERT(dbp->env, meta->pgno == PGNO_BASE_MD);
+ meta->last_pgno--;
+ } else {
+ if (list != NULL) {
+ /* Put the page number into the list. */
+ if ((ret =
+ __memp_extend_freelist(mpf, nelem + 1, &list)) != 0)
+ goto err1;
+ if (prev_pgno != PGNO_INVALID)
+ lp = &list[position + 1];
+ else
+ lp = list;
+ if (nelem != 0 && position != nelem)
+ memmove(lp + 1, lp, (size_t)
+ ((u_int8_t*)&list[nelem] - (u_int8_t*)lp));
+ *lp = h->pgno;
+ }
+#else
+ {
+#endif
+ /*
+ * If we are not truncating the page then we
+ * reinitialize it and put it at the head of
+ * the free list.
+ */
+ if ((ret = __memp_dirty(mpf,
+ &h, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ goto err1;
+ LSN(h) = *lsnp;
+ P_INIT(h, dbp->pgsize,
+ h->pgno, PGNO_INVALID, next_pgno, 0, P_INVALID);
+#ifdef DIAGNOSTIC
+ memset((u_int8_t *) h + P_OVERHEAD(dbp),
+ CLEAR_BYTE, dbp->pgsize - P_OVERHEAD(dbp));
+#endif
+ if (prev_pgno == PGNO_INVALID)
+ meta->free = h->pgno;
+ else
+ NEXT_PGNO(prev) = h->pgno;
+ }
+
+ /* Discard the metadata or previous page. */
+err1: if (hash == 0 && meta != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, (PAGE *)meta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __TLPUT(dbc, metalock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (prev != (PAGE*) meta && prev != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, prev, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Discard the caller's page reference. */
+err: if (h != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ PERFMON4(dbp->env, alloc, free, dbp->fname, dbp->dname, pgno, ret);
+ /*
+ * XXX
+ * We have to unlock the caller's page in the caller!
+ */
+ return (ret);
+}
+
+#ifdef HAVE_FTRUNCATE
+/*
+ * __db_freelist_pos -- find the position of a page in the freelist.
+ * The list is sorted, so we do a binary search.
+ *
+ * PUBLIC: #ifdef HAVE_FTRUNCATE
+ * PUBLIC: void __db_freelist_pos __P((db_pgno_t,
+ * PUBLIC: db_pgno_t *, u_int32_t, u_int32_t *));
+ * PUBLIC: #endif
+ */
+void
+__db_freelist_pos(pgno, list, nelem, posp)
+ db_pgno_t pgno;
+ db_pgno_t *list;
+ u_int32_t nelem;
+ u_int32_t *posp;
+{
+ u_int32_t base, indx, lim;
+
+ indx = 0;
+ for (base = 0, lim = nelem; lim != 0; lim >>= 1) {
+ indx = base + (lim >> 1);
+ if (pgno == list[indx]) {
+ *posp = indx;
+ return;
+ }
+ if (pgno > list[indx]) {
+ base = indx + 1;
+ --lim;
+ }
+ }
+ if (base != 0)
+ base--;
+ *posp = base;
+ return;
+}
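+
+/*
+ * A sketch of the position contract above: an exact match returns its
+ * index; otherwise the index of the largest page number smaller than
+ * pgno is returned, or 0, which the caller must check against list[0].
+ */
+#if 0
+ db_pgno_t list[] = { 2, 5, 9, 12 };
+ u_int32_t pos;
+
+ __db_freelist_pos(9, list, 4, &pos); /* pos == 2, exact match. */
+ __db_freelist_pos(7, list, 4, &pos); /* pos == 1, list[1] == 5 < 7. */
+ __db_freelist_pos(1, list, 4, &pos); /* pos == 0, but list[0] > 1. */
+#endif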
+
+static int
+__db_pglistcmp(a, b)
+ const void *a, *b;
+{
+ db_pglist_t *ap, *bp;
+
+ ap = (db_pglist_t *)a;
+ bp = (db_pglist_t *)b;
+
+ return ((ap->pgno > bp->pgno) ? 1 : (ap->pgno < bp->pgno) ? -1: 0);
+}
+
+/*
+ * __db_freelist_sort -- sort a list of free pages.
+ * PUBLIC: void __db_freelist_sort __P((db_pglist_t *, u_int32_t));
+ */
+void
+__db_freelist_sort(list, nelems)
+ db_pglist_t *list;
+ u_int32_t nelems;
+{
+ qsort(list, (size_t)nelems, sizeof(db_pglist_t), __db_pglistcmp);
+}
+
+/*
+ * __db_pg_truncate -- find the truncation point in a sorted freelist.
+ *
+ * PUBLIC: #ifdef HAVE_FTRUNCATE
+ * PUBLIC: int __db_pg_truncate __P((DBC *, DB_TXN *,
+ * PUBLIC: db_pglist_t *, DB_COMPACT *, u_int32_t *,
+ * PUBLIC: db_pgno_t , db_pgno_t *, DB_LSN *, int));
+ * PUBLIC: #endif
+ */
+int
+__db_pg_truncate(dbc, txn,
+ list, c_data, nelemp, free_pgno, last_pgno, lsnp, in_recovery)
+ DBC *dbc;
+ DB_TXN *txn;
+ db_pglist_t *list;
+ DB_COMPACT *c_data;
+ u_int32_t *nelemp;
+ db_pgno_t free_pgno, *last_pgno;
+ DB_LSN *lsnp;
+ int in_recovery;
+{
+ DB *dbp;
+ DBT ddbt;
+ DB_LSN null_lsn;
+ DB_MPOOLFILE *mpf;
+ PAGE *h;
+ db_pglist_t *lp, *slp;
+ db_pgno_t lpgno, pgno;
+ u_int32_t elems, log_size, tpoint;
+ int last, ret;
+
+ ret = 0;
+ h = NULL;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ elems = tpoint = *nelemp;
+
+ /*
+ * Figure out what (if any) pages can be truncated immediately and
+ * record the place from which we can truncate, so we can do the
+ * memp_ftruncate below. We also use this to avoid ever putting
+ * these pages on the freelist, which we are about to relink.
+ */
+ pgno = *last_pgno;
+ lp = &list[elems - 1];
+ last = 1;
+ while (tpoint != 0) {
+ if (lp->pgno != pgno)
+ break;
+ pgno--;
+ tpoint--;
+ lp--;
+ }
+
+ lp = list;
+ slp = &list[elems];
+ /*
+ * Log the sorted list. We log the whole list so it can be rebuilt.
+ * Don't overflow the log file.
+ */
+again: if (DBC_LOGGING(dbc)) {
+ last = 1;
+ lpgno = *last_pgno;
+ ddbt.size = elems * sizeof(*lp);
+ ddbt.data = lp;
+ log_size = ((LOG *)dbc->env->
+ lg_handle->reginfo.primary)->log_size;
+ if (ddbt.size > log_size / 2) {
+ elems = (log_size / 2) / sizeof(*lp);
+ ddbt.size = elems * sizeof(*lp);
+ last = 0;
+ /*
+ * If we stopped after the truncation point
+ * then we need to truncate from here.
+ */
+ if (lp + elems >= &list[tpoint])
+ lpgno = lp[elems - 1].pgno;
+ }
+ /*
+ * If this is not the beginning of the list, fetch the end
+ * of the previous segment. This page becomes the last_free
+ * page and will link to this segment if it is not truncated.
+ */
+ if (lp != list) {
+ if ((ret = __memp_fget(mpf, &lp[-1].pgno,
+ dbc->thread_info, txn, 0, &h)) != 0)
+ goto err;
+ }
+
+ slp = &lp[elems];
+
+ ZERO_LSN(null_lsn);
+ if ((ret = __db_pg_trunc_log(dbp, dbc->txn,
+ lsnp, last == 1 ? DB_FLUSH : 0, PGNO_BASE_MD,
+ lsnp, h != NULL ? PGNO(h) : PGNO_INVALID,
+ h != NULL ? &LSN(h) : &null_lsn,
+ free_pgno, lpgno, &ddbt)) != 0)
+ goto err;
+ if (h != NULL) {
+ LSN(h) = *lsnp;
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority)) != 0)
+ goto err;
+ }
+ h = NULL;
+ } else if (!in_recovery)
+ LSN_NOT_LOGGED(*lsnp);
+
+ for (; lp < slp && lp < &list[tpoint]; lp++) {
+ if ((ret = __memp_fget(mpf, &lp->pgno, dbc->thread_info,
+ txn, !in_recovery ? DB_MPOOL_DIRTY : 0, &h)) != 0) {
+ /* Page may have been truncated later. */
+ if (in_recovery && ret == DB_PAGE_NOTFOUND) {
+ ret = 0;
+ continue;
+ }
+ goto err;
+ }
+ if (in_recovery) {
+ if (LOG_COMPARE(&LSN(h), &lp->lsn) == 0) {
+ if ((ret = __memp_dirty(mpf, &h,
+ dbc->thread_info,
+ txn, dbp->priority, 0)) != 0) {
+ (void)__memp_fput(mpf,
+ dbc->thread_info, h, dbp->priority);
+ goto err;
+ }
+ } else
+ goto skip;
+ }
+
+ if (lp == &list[tpoint - 1])
+ NEXT_PGNO(h) = PGNO_INVALID;
+ else
+ NEXT_PGNO(h) = lp[1].pgno;
+ DB_ASSERT(mpf->env, NEXT_PGNO(h) < *last_pgno);
+
+ LSN(h) = *lsnp;
+skip: if ((ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbp->priority)) != 0)
+ goto err;
+ h = NULL;
+ }
+
+ /*
+ * If we did not log everything, try again. We start from slp and
+ * try to go to the end of the list.
+ */
+ if (last == 0) {
+ elems = (u_int32_t)(&list[*nelemp] - slp);
+ lp = slp;
+ goto again;
+ }
+
+ /*
+ * Truncate the file. It's possible that the last page is the
+ * only one that got truncated, and that's done in the caller.
+ */
+ if (pgno != *last_pgno) {
+ if (tpoint != *nelemp &&
+ (ret = __memp_ftruncate(mpf, dbc->txn, dbc->thread_info,
+ pgno + 1, in_recovery ? MP_TRUNC_RECOVER : 0)) != 0)
+ goto err;
+ if (c_data)
+ c_data->compact_pages_truncated += *last_pgno - pgno;
+ *last_pgno = pgno;
+ }
+ *nelemp = tpoint;
+
+ if (0) {
+err: if (h != NULL)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority);
+ }
+ return (ret);
+}
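+
+/*
+ * A worked example of the segmenting above, assuming (hypothetically)
+ * a 10MB log file and 16-byte db_pglist_t entries: at most half the
+ * log is consumed per __db_pg_trunc_log record, and longer free lists
+ * are logged in further segments via the "again" loop.
+ */
+#if 0
+ log_size = 10 * 1024 * 1024; /* Hypothetical 10MB log. */
+ elems = (log_size / 2) / 16; /* 327,680 entries per record. */
+#endif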
+
+/*
+ * __db_free_truncate --
+ * Build a sorted free list and truncate free pages at the end
+ * of the file.
+ *
+ * PUBLIC: #ifdef HAVE_FTRUNCATE
+ * PUBLIC: int __db_free_truncate __P((DB *, DB_THREAD_INFO *, DB_TXN *,
+ * PUBLIC: u_int32_t, DB_COMPACT *, db_pglist_t **, u_int32_t *,
+ * PUBLIC: db_pgno_t *));
+ * PUBLIC: #endif
+ */
+int
+__db_free_truncate(dbp, ip, txn, flags, c_data, listp, nelemp, last_pgnop)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ u_int32_t flags;
+ DB_COMPACT *c_data;
+ db_pglist_t **listp;
+ u_int32_t *nelemp;
+ db_pgno_t *last_pgnop;
+{
+ DBC *dbc;
+ DBMETA *meta;
+ DB_LOCK metalock;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ PAGE *h;
+ db_pglist_t *list, *lp;
+ db_pgno_t pgno;
+ u_int32_t nelems;
+ int ret, t_ret;
+ size_t size;
+
+ COMPQUIET(flags, 0);
+ list = NULL;
+ meta = NULL;
+ env = dbp->env;
+ mpf = dbp->mpf;
+ h = NULL;
+ nelems = 0;
+ if (listp != NULL) {
+ *listp = NULL;
+ DB_ASSERT(env, nelemp != NULL);
+ *nelemp = 0;
+ }
+
+ if ((ret = __db_cursor(dbp, ip, txn, &dbc, DB_WRITELOCK)) != 0)
+ return (ret);
+
+ pgno = PGNO_BASE_MD;
+ if ((ret = __db_lget(dbc,
+ LCK_ALWAYS, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn, 0,
+ &meta)) != 0)
+ goto err;
+
+ if (last_pgnop != NULL)
+ *last_pgnop = meta->last_pgno;
+ if ((pgno = meta->free) == PGNO_INVALID)
+ goto done;
+
+ size = 128;
+ if ((ret = __os_malloc(env, size * sizeof(*list), &list)) != 0)
+ goto err;
+ lp = list;
+
+ do {
+ if (lp == &list[size]) {
+ size *= 2;
+ if ((ret = __os_realloc(env,
+ size * sizeof(*list), &list)) != 0)
+ goto err;
+ lp = &list[size / 2];
+ }
+ if ((ret = __memp_fget(mpf, &pgno,
+ dbc->thread_info, dbc->txn, 0, &h)) != 0)
+ goto err;
+
+ lp->pgno = pgno;
+ lp->next_pgno = NEXT_PGNO(h);
+ lp->lsn = LSN(h);
+ pgno = NEXT_PGNO(h);
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority)) != 0)
+ goto err;
+ lp++;
+ } while (pgno != PGNO_INVALID);
+ nelems = (u_int32_t)(lp - list);
+
+ if ((ret = __memp_dirty(mpf,
+ &meta, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ goto err;
+
+ /* Sort the list */
+ __db_freelist_sort(list, nelems);
+
+ if ((ret = __db_pg_truncate(dbc, txn, list, c_data,
+ &nelems, meta->free, &meta->last_pgno, &LSN(meta), 0)) != 0)
+ goto err;
+
+ if (nelems == 0)
+ meta->free = PGNO_INVALID;
+ else
+ meta->free = list[0].pgno;
+
+done: if (last_pgnop != NULL)
+ *last_pgnop = meta->last_pgno;
+
+ /*
+ * The truncate point is the number of pages in the free
+ * list back from the last page. The number of pages
+ * in the free list is the number that we can swap in.
+ * Adjust it down slightly, so that if we find higher-numbered
+ * pages early and then free other pages later, we can still
+ * truncate them.
+ */
+ if (c_data) {
+ c_data->compact_truncate = (u_int32_t)meta->last_pgno - nelems;
+ if (c_data->compact_truncate > nelems >> 2)
+ c_data->compact_truncate -= nelems >> 2;
+ }
+
+ if (nelems != 0 && listp != NULL) {
+ *listp = list;
+ *nelemp = nelems;
+ list = NULL;
+ }
+
+err: if (list != NULL)
+ __os_free(env, list);
+ if (meta != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, (PAGE *)meta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __TLPUT(dbc, metalock)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
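+
+/*
+ * A worked example of the truncate-point arithmetic above: with
+ * last_pgno == 1000 and nelems == 100, compact_truncate starts at
+ * 900 and, since 900 > 100 / 4, is lowered by 25 to 875, leaving
+ * headroom to truncate pages freed later during compaction.
+ */
+#if 0
+ c_data->compact_truncate = 1000 - 100; /* 900 */
+ if (c_data->compact_truncate > 100 >> 2) /* 900 > 25 */
+ c_data->compact_truncate -= 100 >> 2; /* 875 */
+#endif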
+
+static int
+__db_truncate_freelist(dbc, meta, h, list, start, nelem)
+ DBC *dbc;
+ DBMETA *meta;
+ PAGE *h;
+ db_pgno_t *list;
+ u_int32_t start, nelem;
+{
+ DB *dbp;
+ DBT ddbt;
+ DB_LSN null_lsn;
+ DB_MPOOLFILE *mpf;
+ PAGE *last_free, *pg;
+ db_pgno_t *lp, free_pgno, lpgno;
+ db_pglist_t *plist, *pp, *spp;
+ u_int32_t elem, log_size;
+ int last, ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ plist = NULL;
+ last_free = NULL;
+ pg = NULL;
+
+ if (start != 0 &&
+ (ret = __memp_fget(mpf, &list[start - 1],
+ dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &last_free)) != 0)
+ goto err;
+
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __os_malloc(dbp->env,
+ (nelem - start) * sizeof(*pp), &plist)) != 0)
+ goto err;
+
+ pp = plist;
+ for (lp = &list[start]; lp < &list[nelem]; lp++) {
+ pp->pgno = *lp;
+ if ((ret = __memp_fget(mpf, lp,
+ dbc->thread_info, dbc->txn, 0, &pg)) != 0)
+ goto err;
+ pp->lsn = LSN(pg);
+ pp->next_pgno = NEXT_PGNO(pg);
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, pg, DB_PRIORITY_VERY_LOW)) != 0)
+ goto err;
+ pg = NULL;
+ pp++;
+ }
+ ZERO_LSN(null_lsn);
+ pp = plist;
+ elem = nelem - start;
+ log_size = ((LOG *)dbc->env->
+ lg_handle->reginfo.primary)->log_size;
+again: ddbt.data = spp = pp;
+ free_pgno = pp->pgno;
+ lpgno = meta->last_pgno;
+ ddbt.size = elem * sizeof(*pp);
+ if (ddbt.size > log_size / 2) {
+ elem = (log_size / 2) / (u_int32_t)sizeof(*pp);
+ ddbt.size = elem * sizeof(*pp);
+ pp += elem;
+ elem = (nelem - start) - (u_int32_t)(pp - plist);
+ lpgno = pp[-1].pgno;
+ last = 0;
+ } else
+ last = 1;
+ /*
+ * Get the page which will link to this section if we abort.
+ * If this is the first segment then its last_free.
+ */
+ if (spp == plist)
+ pg = last_free;
+ else if ((ret = __memp_fget(mpf, &spp[-1].pgno,
+ dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &pg)) != 0)
+ goto err;
+
+ if ((ret = __db_pg_trunc_log(dbp, dbc->txn,
+ &LSN(meta), last == 1 ? DB_FLUSH : 0,
+ PGNO(meta), &LSN(meta),
+ pg != NULL ? PGNO(pg) : PGNO_INVALID,
+ pg != NULL ? &LSN(pg) : &null_lsn,
+ free_pgno, lpgno, &ddbt)) != 0)
+ goto err;
+ if (pg != NULL) {
+ LSN(pg) = LSN(meta);
+ if (pg != last_free && (ret = __memp_fput(mpf,
+ dbc->thread_info, pg, DB_PRIORITY_VERY_LOW)) != 0)
+ goto err;
+ pg = NULL;
+ }
+ if (last == 0)
+ goto again;
+ } else
+ LSN_NOT_LOGGED(LSN(meta));
+
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, h, DB_PRIORITY_VERY_LOW)) != 0)
+ goto err;
+ h = NULL;
+ if ((ret = __memp_ftruncate(mpf, dbc->txn, dbc->thread_info,
+ list[start], 0)) != 0)
+ goto err;
+ meta->last_pgno = list[start] - 1;
+
+ if (start == 0)
+ meta->free = PGNO_INVALID;
+ else {
+ NEXT_PGNO(last_free) = PGNO_INVALID;
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, last_free, dbc->priority)) != 0)
+ goto err;
+ last_free = NULL;
+ }
+
+ /* Shrink the number of elements in the list. */
+ ret = __memp_extend_freelist(mpf, start, &list);
+
+err: if (plist != NULL)
+ __os_free(dbp->env, plist);
+
+ /* We need to put the page on error. */
+ if (h != NULL)
+ (void)__memp_fput(mpf, dbc->thread_info, h, dbc->priority);
+ if (pg != NULL && pg != last_free)
+ (void)__memp_fput(mpf, dbc->thread_info, pg, dbc->priority);
+ if (last_free != NULL)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, last_free, dbc->priority);
+
+ return (ret);
+}
+#endif
+
+#ifdef DEBUG
+/*
+ * __db_lprint --
+ * Print out the list of locks currently held by a cursor.
+ *
+ * PUBLIC: int __db_lprint __P((DBC *));
+ */
+int
+__db_lprint(dbc)
+ DBC *dbc;
+{
+ DB *dbp;
+ DB_LOCKREQ req;
+ ENV *env;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ if (LOCKING_ON(env)) {
+ req.op = DB_LOCK_DUMP;
+ (void)__lock_vec(env, dbc->locker, 0, &req, 1, NULL);
+ }
+ return (0);
+}
+#endif
+
+/*
+ * __db_lget --
+ * The standard lock get call.
+ *
+ * PUBLIC: int __db_lget __P((DBC *,
+ * PUBLIC: int, db_pgno_t, db_lockmode_t, u_int32_t, DB_LOCK *));
+ */
+int
+__db_lget(dbc, action, pgno, mode, lkflags, lockp)
+ DBC *dbc;
+ int action;
+ db_pgno_t pgno;
+ db_lockmode_t mode;
+ u_int32_t lkflags;
+ DB_LOCK *lockp;
+{
+ DB *dbp;
+ DB_LOCKREQ couple[3], *reqp;
+ DB_TXN *txn;
+ ENV *env;
+ int has_timeout, i, ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ txn = dbc->txn;
+
+ /*
+ * We do not always check if we're configured for locking before
+ * calling __db_lget to acquire the lock.
+ */
+ if (CDB_LOCKING(env) || !LOCKING_ON(env) ||
+ (MULTIVERSION(dbp) && mode == DB_LOCK_READ &&
+ dbc->txn != NULL && F_ISSET(dbc->txn, TXN_SNAPSHOT)) ||
+ F_ISSET(dbc, DBC_DONTLOCK) || (F_ISSET(dbc, DBC_RECOVER) &&
+ (action != LCK_ROLLBACK || IS_REP_CLIENT(env))) ||
+ (action != LCK_ALWAYS && F_ISSET(dbc, DBC_OPD))) {
+ LOCK_INIT(*lockp);
+ return (0);
+ }
+
+ /*
+ * If the transaction enclosing this cursor has DB_LOCK_NOWAIT set,
+ * pass that along to the lock call.
+ */
+ if (DB_NONBLOCK(dbc))
+ lkflags |= DB_LOCK_NOWAIT;
+
+ /*
+ * If we're trying to run in exclusive mode, attempt to get an
+ * exclusive database lock. If it is not available then wait
+ * for the lock on the database and clear the exclusive bit.
+ *
+ * If we get an exclusive lock on the database, mark the cursor
+ * with DBC_DONTLOCK to avoid any further locking.
+ */
+ if (F_ISSET(dbp->mpf->mfp, MP_DATABASE_LOCKING)) {
+ dbc->lock.type = DB_DATABASE_LOCK;
+ dbc->lock.pgno = PGNO_BASE_MD;
+ if ((ret = __lock_get(env, dbc->locker, DB_LOCK_NOWAIT,
+ &dbc->lock_dbt, F_ISSET(dbp, DB_AM_RDONLY) ?
+ DB_LOCK_READ : DB_LOCK_WRITE, lockp)) == 0) {
+ if (F_ISSET(dbp->mpf->mfp, MP_DATABASE_LOCKING)) {
+ F_SET(dbc, DBC_DONTLOCK);
+ if (!IS_REAL_TXN(txn))
+ dbc->mylock = *lockp;
+ LOCK_INIT(*lockp);
+ return (0);
+ }
+ } else if (ret == DB_LOCK_NOTGRANTED &&
+ (lkflags & DB_LOCK_NOWAIT) == 0) {
+ if ((ret = __lock_get(env, dbc->locker, 0,
+ &dbc->lock_dbt, DB_LOCK_WRITE, lockp)) != 0)
+ return (ret);
+ F_CLR(dbp->mpf->mfp, MP_DATABASE_LOCKING);
+ if ((ret = __lock_put(env, lockp)) != 0)
+ return (ret);
+ LOCK_INIT(*lockp);
+ } else if (ret != 0)
+ return (ret);
+ }
+
+ dbc->lock.pgno = pgno;
+ if (lkflags & DB_LOCK_RECORD)
+ dbc->lock.type = DB_RECORD_LOCK;
+ else
+ dbc->lock.type = DB_PAGE_LOCK;
+ lkflags &= ~DB_LOCK_RECORD;
+
+ if (F_ISSET(dbc, DBC_READ_UNCOMMITTED) && mode == DB_LOCK_READ)
+ mode = DB_LOCK_READ_UNCOMMITTED;
+
+ has_timeout = F_ISSET(dbc, DBC_RECOVER) ||
+ (txn != NULL && F_ISSET(txn, TXN_LOCKTIMEOUT));
+
+ /*
+ * Transactional locking.
+ * Hold on to the previous read lock only if we are in full isolation.
+ * COUPLE_ALWAYS indicates we are holding an interior node which need
+ * not be isolated.
+ * Downgrade write locks if we are supporting dirty readers and the
+ * update did not have an error.
+ */
+ if ((action != LCK_COUPLE && action != LCK_COUPLE_ALWAYS) ||
+ !LOCK_ISSET(*lockp))
+ action = 0;
+ else if (dbc->txn == NULL || action == LCK_COUPLE_ALWAYS)
+ action = LCK_COUPLE;
+ else if (F_ISSET(dbc, DBC_READ_COMMITTED | DBC_WAS_READ_COMMITTED) &&
+ lockp->mode == DB_LOCK_READ)
+ action = LCK_COUPLE;
+ else if (lockp->mode == DB_LOCK_READ_UNCOMMITTED)
+ action = LCK_COUPLE;
+ else if (F_ISSET(dbc->dbp, DB_AM_READ_UNCOMMITTED) &&
+ !F_ISSET(dbc, DBC_ERROR) && lockp->mode == DB_LOCK_WRITE)
+ action = LCK_DOWNGRADE;
+ else
+ action = 0;
+
+ i = 0;
+ switch (action) {
+ default:
+ if (has_timeout)
+ goto do_couple;
+ ret = __lock_get(env,
+ dbc->locker, lkflags, &dbc->lock_dbt, mode, lockp);
+ break;
+
+ case LCK_DOWNGRADE:
+ couple[0].op = DB_LOCK_GET;
+ couple[0].obj = NULL;
+ couple[0].lock = *lockp;
+ couple[0].mode = DB_LOCK_WWRITE;
+ UMRW_SET(couple[0].timeout);
+ i++;
+ /* FALLTHROUGH */
+ case LCK_COUPLE:
+do_couple: couple[i].op = has_timeout ? DB_LOCK_GET_TIMEOUT : DB_LOCK_GET;
+ couple[i].obj = &dbc->lock_dbt;
+ couple[i].mode = mode;
+ UMRW_SET(couple[i].timeout);
+ i++;
+ if (has_timeout)
+ couple[0].timeout =
+ F_ISSET(dbc, DBC_RECOVER) ? 0 : txn->lock_timeout;
+ if (action == LCK_COUPLE || action == LCK_DOWNGRADE) {
+ couple[i].op = DB_LOCK_PUT;
+ couple[i].lock = *lockp;
+ i++;
+ }
+
+ ret = __lock_vec(env,
+ dbc->locker, lkflags, couple, i, &reqp);
+ if (ret == 0 || reqp == &couple[i - 1])
+ *lockp = i == 1 ? couple[0].lock : couple[i - 2].lock;
+ break;
+ }
+
+ if (txn != NULL && ret == DB_LOCK_DEADLOCK)
+ F_SET(txn, TXN_DEADLOCK);
+ return ((ret == DB_LOCK_NOTGRANTED && !F_ISSET(env->dbenv,
+ DB_ENV_TIME_NOTGRANTED)) ? DB_LOCK_DEADLOCK : ret);
+}
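+
+/*
+ * A distilled sketch of the LCK_COUPLE request built above: a single
+ * __lock_vec call atomically acquires the lock on the new page and
+ * releases the old one, as when descending from a parent page to a
+ * child page.
+ */
+#if 0
+ DB_LOCKREQ couple[2], *reqp;
+
+ couple[0].op = DB_LOCK_GET; /* Get the new page lock... */
+ couple[0].obj = &dbc->lock_dbt;
+ couple[0].mode = mode;
+ UMRW_SET(couple[0].timeout);
+ couple[1].op = DB_LOCK_PUT; /* ...then release the old one. */
+ couple[1].lock = *lockp;
+ ret = __lock_vec(env, dbc->locker, lkflags, couple, 2, &reqp);
+#endif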
+
+#ifdef DIAGNOSTIC
+/*
+ * __db_haslock --
+ * Determine if this locker holds a particular lock.
+ * Returns 0 if lock is held, non-zero otherwise.
+ *
+ * PUBLIC: #ifdef DIAGNOSTIC
+ * PUBLIC: int __db_haslock __P((ENV *, DB_LOCKER *,
+ * PUBLIC: DB_MPOOLFILE *, db_pgno_t, db_lockmode_t, u_int32_t));
+ * PUBLIC: #endif
+ */
+int
+__db_haslock(env, locker, dbmfp, pgno, mode, type)
+ ENV *env;
+ DB_LOCKER *locker;
+ DB_MPOOLFILE *dbmfp;
+ db_pgno_t pgno;
+ db_lockmode_t mode;
+ u_int32_t type;
+{
+ DBT lkdata;
+ DB_LOCK lock;
+ DB_LOCK_ILOCK ilock;
+
+ memset(&lkdata, 0, sizeof(lkdata));
+ lkdata.data = &ilock;
+ lkdata.size = sizeof(ilock);
+
+ memcpy(ilock.fileid, dbmfp->fileid, DB_FILE_ID_LEN);
+ ilock.pgno = pgno;
+ ilock.type = type;
+
+ return (__lock_get(env, locker, DB_LOCK_CHECK, &lkdata, mode, &lock));
+}
+/*
+ * __db_has_pagelock --
+ * Determine if this locker holds a particular page lock.
+ * Returns 0 if lock is held, non-zero otherwise.
+ *
+ * PUBLIC: #ifdef DIAGNOSTIC
+ * PUBLIC: int __db_has_pagelock __P((ENV *, DB_LOCKER *,
+ * PUBLIC: DB_MPOOLFILE *, PAGE *, db_lockmode_t));
+ * PUBLIC: #endif
+ */
+int
+__db_has_pagelock(env, locker, dbmfp, pagep, mode)
+ ENV *env;
+ DB_LOCKER *locker;
+ DB_MPOOLFILE *dbmfp;
+ PAGE *pagep;
+ db_lockmode_t mode;
+{
+ int ret;
+
+ switch (pagep->type) {
+ case P_OVERFLOW:
+ case P_INVALID:
+ case P_QAMDATA:
+ case P_QAMMETA:
+ case P_IHEAP:
+ return (0);
+ case P_HASH:
+ if (PREV_PGNO(pagep) != PGNO_INVALID)
+ return (0);
+ break;
+ default:
+ break;
+ }
+ if ((ret = __db_haslock(env,
+ locker, dbmfp, pagep->pgno, mode, DB_PAGE_LOCK)) != 0)
+ ret = __db_haslock(env,
+ locker, dbmfp, PGNO_BASE_MD, mode, DB_DATABASE_LOCK);
+ return (ret);
+}
+#endif
+
+/*
+ * __db_lput --
+ * The standard lock put call.
+ *
+ * PUBLIC: int __db_lput __P((DBC *, DB_LOCK *));
+ */
+int
+__db_lput(dbc, lockp)
+ DBC *dbc;
+ DB_LOCK *lockp;
+{
+ DB_LOCKREQ couple[2], *reqp;
+ ENV *env;
+ int action, ret;
+
+ /*
+ * Transactional locking.
+ * Hold on to the read locks only if we are in full isolation.
+ * Downgrade write locks if we are supporting dirty readers unless
+ * there was an error.
+ */
+ if (F_ISSET(dbc->dbp, DB_AM_READ_UNCOMMITTED) &&
+ !F_ISSET(dbc, DBC_ERROR) && lockp->mode == DB_LOCK_WRITE)
+ action = LCK_DOWNGRADE;
+ else if (dbc->txn == NULL)
+ action = LCK_COUPLE;
+ else if (F_ISSET(dbc, DBC_READ_COMMITTED | DBC_WAS_READ_COMMITTED) &&
+ lockp->mode == DB_LOCK_READ)
+ action = LCK_COUPLE;
+ else if (lockp->mode == DB_LOCK_READ_UNCOMMITTED)
+ action = LCK_COUPLE;
+ else
+ action = 0;
+
+ env = dbc->env;
+ switch (action) {
+ case LCK_COUPLE:
+ ret = __lock_put(env, lockp);
+ break;
+ case LCK_DOWNGRADE:
+ couple[0].op = DB_LOCK_GET;
+ couple[0].obj = NULL;
+ couple[0].mode = DB_LOCK_WWRITE;
+ couple[0].lock = *lockp;
+ UMRW_SET(couple[0].timeout);
+ couple[1].op = DB_LOCK_PUT;
+ couple[1].lock = *lockp;
+ ret = __lock_vec(env, dbc->locker, 0, couple, 2, &reqp);
+ if (ret == 0 || reqp == &couple[1])
+ *lockp = couple[0].lock;
+ break;
+ default:
+ ret = 0;
+ break;
+ }
+
+ return (ret);
+}
diff --git a/src/db/db_method.c b/src/db/db_method.c
new file mode 100644
index 00000000..82d03e5f
--- /dev/null
+++ b/src/db/db_method.c
@@ -0,0 +1,1117 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/heap.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/qam.h"
+#include "dbinc/txn.h"
+
+static int __db_get_byteswapped __P((DB *, int *));
+static int __db_get_dbname __P((DB *, const char **, const char **));
+static DB_ENV *__db_get_env __P((DB *));
+static void __db_get_msgcall
+ __P((DB *, void (**)(const DB_ENV *, const char *)));
+static DB_MPOOLFILE *__db_get_mpf __P((DB *));
+static int __db_get_multiple __P((DB *));
+static int __db_get_transactional __P((DB *));
+static int __db_get_type __P((DB *, DBTYPE *dbtype));
+static int __db_init __P((DB *, u_int32_t));
+static int __db_get_alloc __P((DB *, void *(**)(size_t),
+ void *(**)(void *, size_t), void (**)(void *)));
+static int __db_set_alloc __P((DB *, void *(*)(size_t),
+ void *(*)(void *, size_t), void (*)(void *)));
+static int __db_get_append_recno __P((DB *,
+ int (**)(DB *, DBT *, db_recno_t)));
+static int __db_set_append_recno __P((DB *, int (*)(DB *, DBT *, db_recno_t)));
+static int __db_get_cachesize __P((DB *, u_int32_t *, u_int32_t *, int *));
+static int __db_set_cachesize __P((DB *, u_int32_t, u_int32_t, int));
+static int __db_get_create_dir __P((DB *, const char **));
+static int __db_set_create_dir __P((DB *, const char *));
+static int __db_get_dup_compare
+ __P((DB *, int (**)(DB *, const DBT *, const DBT *)));
+static int __db_set_dup_compare
+ __P((DB *, int (*)(DB *, const DBT *, const DBT *)));
+static int __db_get_encrypt_flags __P((DB *, u_int32_t *));
+static int __db_set_encrypt __P((DB *, const char *, u_int32_t));
+static int __db_get_feedback __P((DB *, void (**)(DB *, int, int)));
+static int __db_set_feedback __P((DB *, void (*)(DB *, int, int)));
+static int __db_get_lk_exclusive __P((DB *, int *, int *));
+static int __db_set_lk_exclusive __P((DB *, int));
+static void __db_map_flags __P((DB *, u_int32_t *, u_int32_t *));
+static int __db_get_pagesize __P((DB *, u_int32_t *));
+static int __db_set_paniccall __P((DB *, void (*)(DB_ENV *, int)));
+static int __db_set_priority __P((DB *, DB_CACHE_PRIORITY));
+static int __db_get_priority __P((DB *, DB_CACHE_PRIORITY *));
+static void __db_get_errcall __P((DB *,
+ void (**)(const DB_ENV *, const char *, const char *)));
+static void __db_set_errcall
+ __P((DB *, void (*)(const DB_ENV *, const char *, const char *)));
+static void __db_get_errfile __P((DB *, FILE **));
+static void __db_set_errfile __P((DB *, FILE *));
+static void __db_get_errpfx __P((DB *, const char **));
+static void __db_set_errpfx __P((DB *, const char *));
+static void __db_set_msgcall
+ __P((DB *, void (*)(const DB_ENV *, const char *)));
+static void __db_get_msgfile __P((DB *, FILE **));
+static void __db_set_msgfile __P((DB *, FILE *));
+static int __db_get_assoc_flags __P((DB *, u_int32_t *));
+static void __dbh_err __P((DB *, int, const char *, ...));
+static void __dbh_errx __P((DB *, const char *, ...));
+
+/*
+ * db_create --
+ * DB constructor.
+ *
+ * EXTERN: int db_create __P((DB **, DB_ENV *, u_int32_t));
+ */
+int
+db_create(dbpp, dbenv, flags)
+ DB **dbpp;
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ ip = NULL;
+ env = dbenv == NULL ? NULL : dbenv->env;
+
+ /* Check for invalid function flags. */
+ switch (flags) {
+ case 0:
+ break;
+ case DB_XA_CREATE:
+ if (dbenv != NULL) {
+ __db_errx(env, DB_STR("0504",
+ "XA applications may not specify an environment to db_create"));
+ return (EINVAL);
+ }
+
+ /*
+ * If it's an XA database, open it within the XA environment,
+ * taken from the global list of environments. (When the XA
+ * transaction manager called our xa_start() routine the
+ * "current" environment was moved to the start of the list.
+ */
+ env = TAILQ_FIRST(&DB_GLOBAL(envq));
+ if (env == NULL) {
+ __db_errx(env, DB_STR("0505",
+ "Cannot open XA database before XA is enabled"));
+ return (EINVAL);
+ }
+ break;
+ default:
+ return (__db_ferr(env, "db_create", 0));
+ }
+
+ if (env != NULL)
+ ENV_ENTER(env, ip);
+
+ /*
+ * If we are opening an XA database, make sure we don't have a global XA
+ * transaction running.
+ */
+ if (LF_ISSET(DB_XA_CREATE)) {
+ XA_NO_TXN(ip, ret);
+ if (ret != 0)
+ goto err;
+ }
+
+ ret = __db_create_internal(dbpp, env, flags);
+err: if (env != NULL)
+ ENV_LEAVE(env, ip);
+
+ return (ret);
+}
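+
+/*
+ * A minimal usage sketch for the constructor above: create a handle
+ * with no environment, open a btree and close it. The file name
+ * "example.db" is hypothetical.
+ */
+#if 0
+ DB *dbp;
+ int ret;
+
+ if ((ret = db_create(&dbp, NULL, 0)) != 0)
+ return (ret);
+ if ((ret = dbp->open(dbp,
+ NULL, "example.db", NULL, DB_BTREE, DB_CREATE, 0664)) != 0) {
+ (void)dbp->close(dbp, 0);
+ return (ret);
+ }
+ /* ... reads and writes on dbp ... */
+ ret = dbp->close(dbp, 0);
+#endif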
+
+/*
+ * __db_create_internal --
+ * DB constructor internal routine.
+ *
+ * PUBLIC: int __db_create_internal __P((DB **, ENV *, u_int32_t));
+ */
+int
+__db_create_internal(dbpp, env, flags)
+ DB **dbpp;
+ ENV *env;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_ENV *dbenv;
+ DB_REP *db_rep;
+ int ret;
+
+ *dbpp = NULL;
+
+ /* If we don't have an environment yet, allocate a local one. */
+ if (env == NULL) {
+ if ((ret = db_env_create(&dbenv, 0)) != 0)
+ return (ret);
+ env = dbenv->env;
+ F_SET(env, ENV_DBLOCAL);
+ } else
+ dbenv = env->dbenv;
+
+ /* Allocate and initialize the DB handle. */
+ if ((ret = __os_calloc(env, 1, sizeof(*dbp), &dbp)) != 0)
+ goto err;
+
+ dbp->dbenv = env->dbenv;
+ dbp->env = env;
+ if ((ret = __db_init(dbp, flags)) != 0)
+ goto err;
+
+ MUTEX_LOCK(env, env->mtx_dblist);
+ ++env->db_ref;
+ MUTEX_UNLOCK(env, env->mtx_dblist);
+
+ /*
+ * Set the replication timestamp; it's 0 if we're not in a replicated
+ * environment. Don't acquire a lock to read the value, even though
+ * it's opaque: all we check later is value equality, nothing else.
+ */
+ dbp->timestamp = REP_ON(env) ?
+ ((REGENV *)env->reginfo->primary)->rep_timestamp : 0;
+ /*
+ * Set the replication generation number for fid management; valid
+ * replication generations start at 1. Don't acquire a lock to
+ * read the value. All we check later is value equality.
+ */
+ db_rep = env->rep_handle;
+ dbp->fid_gen = REP_ON(env) ? ((REP *)db_rep->region)->gen : 0;
+
+ /* Open a backing DB_MPOOLFILE handle in the memory pool. */
+ if ((ret = __memp_fcreate(env, &dbp->mpf)) != 0)
+ goto err;
+
+ dbp->type = DB_UNKNOWN;
+
+ *dbpp = dbp;
+ return (0);
+
+err: if (dbp != NULL) {
+ if (dbp->mpf != NULL)
+ (void)__memp_fclose(dbp->mpf, 0);
+ __os_free(env, dbp);
+ }
+
+ if (F_ISSET(env, ENV_DBLOCAL))
+ (void)__env_close(dbenv, 0);
+
+ return (ret);
+}
+
+/*
+ * __db_init --
+ * Initialize a DB structure.
+ */
+static int
+__db_init(dbp, flags)
+ DB *dbp;
+ u_int32_t flags;
+{
+ int ret;
+
+ dbp->locker = NULL;
+ dbp->alt_close = NULL;
+ LOCK_INIT(dbp->handle_lock);
+
+ TAILQ_INIT(&dbp->free_queue);
+ TAILQ_INIT(&dbp->active_queue);
+ TAILQ_INIT(&dbp->join_queue);
+ LIST_INIT(&dbp->s_secondaries);
+
+ FLD_SET(dbp->am_ok,
+ DB_OK_BTREE | DB_OK_HASH | DB_OK_HEAP | DB_OK_QUEUE | DB_OK_RECNO);
+
+ /* DB PUBLIC HANDLE LIST BEGIN */
+ dbp->associate = __db_associate_pp;
+ dbp->associate_foreign = __db_associate_foreign_pp;
+ dbp->close = __db_close_pp;
+ dbp->compact = __db_compact_pp;
+ dbp->cursor = __db_cursor_pp;
+ dbp->del = __db_del_pp;
+ dbp->dump = __db_dump_pp;
+ dbp->err = __dbh_err;
+ dbp->errx = __dbh_errx;
+ dbp->exists = __db_exists;
+ dbp->fd = __db_fd_pp;
+ dbp->get = __db_get_pp;
+ dbp->get_alloc = __db_get_alloc;
+ dbp->get_append_recno = __db_get_append_recno;
+ dbp->get_assoc_flags = __db_get_assoc_flags;
+ dbp->get_byteswapped = __db_get_byteswapped;
+ dbp->get_cachesize = __db_get_cachesize;
+ dbp->get_create_dir = __db_get_create_dir;
+ dbp->get_dbname = __db_get_dbname;
+ dbp->get_dup_compare = __db_get_dup_compare;
+ dbp->get_encrypt_flags = __db_get_encrypt_flags;
+ dbp->get_env = __db_get_env;
+ dbp->get_errcall = __db_get_errcall;
+ dbp->get_errfile = __db_get_errfile;
+ dbp->get_errpfx = __db_get_errpfx;
+ dbp->get_feedback = __db_get_feedback;
+ dbp->get_flags = __db_get_flags;
+ dbp->get_lorder = __db_get_lorder;
+ dbp->get_mpf = __db_get_mpf;
+ dbp->get_msgcall = __db_get_msgcall;
+ dbp->get_msgfile = __db_get_msgfile;
+ dbp->get_multiple = __db_get_multiple;
+ dbp->get_open_flags = __db_get_open_flags;
+ dbp->get_partition_dirs = __partition_get_dirs;
+ dbp->get_partition_callback = __partition_get_callback;
+ dbp->get_partition_keys = __partition_get_keys;
+ dbp->get_pagesize = __db_get_pagesize;
+ dbp->get_priority = __db_get_priority;
+ dbp->get_transactional = __db_get_transactional;
+ dbp->get_type = __db_get_type;
+ dbp->join = __db_join_pp;
+ dbp->key_range = __db_key_range_pp;
+ dbp->get_lk_exclusive = __db_get_lk_exclusive;
+ dbp->set_lk_exclusive = __db_set_lk_exclusive;
+ dbp->open = __db_open_pp;
+ dbp->pget = __db_pget_pp;
+ dbp->put = __db_put_pp;
+ dbp->remove = __db_remove_pp;
+ dbp->rename = __db_rename_pp;
+ dbp->set_alloc = __db_set_alloc;
+ dbp->set_append_recno = __db_set_append_recno;
+ dbp->set_cachesize = __db_set_cachesize;
+ dbp->set_create_dir = __db_set_create_dir;
+ dbp->set_dup_compare = __db_set_dup_compare;
+ dbp->set_encrypt = __db_set_encrypt;
+ dbp->set_errcall = __db_set_errcall;
+ dbp->set_errfile = __db_set_errfile;
+ dbp->set_errpfx = __db_set_errpfx;
+ dbp->set_feedback = __db_set_feedback;
+ dbp->set_flags = __db_set_flags;
+ dbp->set_lorder = __db_set_lorder;
+ dbp->set_msgcall = __db_set_msgcall;
+ dbp->set_msgfile = __db_set_msgfile;
+ dbp->set_pagesize = __db_set_pagesize;
+ dbp->set_paniccall = __db_set_paniccall;
+ dbp->set_partition = __partition_set;
+ dbp->set_partition_dirs = __partition_set_dirs;
+ dbp->set_priority = __db_set_priority;
+ dbp->sort_multiple = __db_sort_multiple;
+ dbp->stat = __db_stat_pp;
+ dbp->stat_print = __db_stat_print_pp;
+ dbp->sync = __db_sync_pp;
+ dbp->truncate = __db_truncate_pp;
+ dbp->upgrade = __db_upgrade_pp;
+ dbp->verify = __db_verify_pp;
+ /* DB PUBLIC HANDLE LIST END */
+
+ /* Access method specific. */
+ if ((ret = __bam_db_create(dbp)) != 0)
+ return (ret);
+ if ((ret = __ham_db_create(dbp)) != 0)
+ return (ret);
+ if ((ret = __heap_db_create(dbp)) != 0)
+ return (ret);
+ if ((ret = __qam_db_create(dbp)) != 0)
+ return (ret);
+
+ COMPQUIET(flags, 0);
+
+ return (0);
+}
+
+/*
+ * __dbh_am_chk --
+ * Error if an unreasonable method is called.
+ *
+ * PUBLIC: int __dbh_am_chk __P((DB *, u_int32_t));
+ */
+int
+__dbh_am_chk(dbp, flags)
+ DB *dbp;
+ u_int32_t flags;
+{
+ /*
+ * We start out allowing any access methods to be called, and as the
+ * application calls the methods the options become restricted. The
+ * idea is to quit as soon as an illegal method combination is called.
+ */
+ if ((LF_ISSET(DB_OK_BTREE) && FLD_ISSET(dbp->am_ok, DB_OK_BTREE)) ||
+ (LF_ISSET(DB_OK_HASH) && FLD_ISSET(dbp->am_ok, DB_OK_HASH)) ||
+ (LF_ISSET(DB_OK_HEAP) && FLD_ISSET(dbp->am_ok, DB_OK_HEAP)) ||
+ (LF_ISSET(DB_OK_QUEUE) && FLD_ISSET(dbp->am_ok, DB_OK_QUEUE)) ||
+ (LF_ISSET(DB_OK_RECNO) && FLD_ISSET(dbp->am_ok, DB_OK_RECNO))) {
+ FLD_CLR(dbp->am_ok, ~flags);
+ return (0);
+ }
+
+ __db_errx(dbp->env, DB_STR("0506",
+"call implies an access method which is inconsistent with previous calls"));
+ return (EINVAL);
+}
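+
+/*
+ * Editor's illustration of the narrowing above (a sketch; "my_cmp" and
+ * "my_append" are hypothetical callbacks): DB->set_dup_compare is legal
+ * only for btree and hash, so after it succeeds am_ok has shrunk to
+ * DB_OK_BTREE | DB_OK_HASH. A later call to a queue/recno-only method
+ * such as DB->set_append_recno then fails with EINVAL:
+ *
+ *	ret = dbp->set_dup_compare(dbp, my_cmp);	-- am_ok narrowed
+ *	ret = dbp->set_append_recno(dbp, my_append);	-- EINVAL
+ */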
+
+/*
+ * __dbh_err --
+ * Db.err method.
+ */
+static void
+#ifdef STDC_HEADERS
+__dbh_err(DB *dbp, int error, const char *fmt, ...)
+#else
+__dbh_err(dbp, error, fmt, va_alist)
+ DB *dbp;
+ int error;
+ const char *fmt;
+ va_dcl
+#endif
+{
+ /* Message with error string, to stderr by default. */
+ DB_REAL_ERR(dbp->dbenv, error, DB_ERROR_SET, 1, fmt);
+}
+
+/*
+ * __dbh_errx --
+ * Db.errx method.
+ */
+static void
+#ifdef STDC_HEADERS
+__dbh_errx(DB *dbp, const char *fmt, ...)
+#else
+__dbh_errx(dbp, fmt, va_alist)
+ DB *dbp;
+ const char *fmt;
+ va_dcl
+#endif
+{
+ /* Message without error string, to stderr by default. */
+ DB_REAL_ERR(dbp->dbenv, 0, DB_ERROR_NOT_SET, 1, fmt);
+}
+
+/*
+ * __db_get_byteswapped --
+ * Return if database requires byte swapping.
+ */
+static int
+__db_get_byteswapped(dbp, isswapped)
+ DB *dbp;
+ int *isswapped;
+{
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->get_byteswapped");
+
+ *isswapped = F_ISSET(dbp, DB_AM_SWAP) ? 1 : 0;
+ return (0);
+}
+
+/*
+ * __db_get_dbname --
+ * Get the name of the database as passed to DB->open.
+ */
+static int
+__db_get_dbname(dbp, fnamep, dnamep)
+ DB *dbp;
+ const char **fnamep, **dnamep;
+{
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->get_dbname");
+
+ if (fnamep != NULL)
+ *fnamep = dbp->fname;
+ if (dnamep != NULL)
+ *dnamep = dbp->dname;
+ return (0);
+}
+
+/*
+ * __db_get_env --
+ * Get the DB_ENV handle that was passed to db_create.
+ */
+static DB_ENV *
+__db_get_env(dbp)
+ DB *dbp;
+{
+ return (dbp->dbenv);
+}
+
+/*
+ * __db_get_mpf --
+ * Get the underlying DB_MPOOLFILE handle.
+ */
+static DB_MPOOLFILE *
+__db_get_mpf(dbp)
+ DB *dbp;
+{
+ return (dbp->mpf);
+}
+
+/*
+ * __db_get_multiple --
+ * Return whether this DB handle references a physical file with multiple
+ * databases.
+ */
+static int
+__db_get_multiple(dbp)
+ DB *dbp;
+{
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->get_multiple");
+
+ /*
+ * Only return TRUE if the handle is for the master database, not for
+ * any subdatabase in the physical file. If it's a Btree, with the
+ * subdatabases flag set, and the meta-data page has the right value,
+ * return TRUE. (We don't need to check it's a Btree, I suppose, but
+ * it doesn't hurt.)
+ */
+ return (dbp->type == DB_BTREE &&
+ F_ISSET(dbp, DB_AM_SUBDB) &&
+ dbp->meta_pgno == PGNO_BASE_MD ? 1 : 0);
+}
+
+/*
+ * __db_get_transactional --
+ * Return whether this database was created in a transaction.
+ */
+static int
+__db_get_transactional(dbp)
+ DB *dbp;
+{
+ return (F_ISSET(dbp, DB_AM_TXN) ? 1 : 0);
+}
+
+/*
+ * __db_get_type --
+ * Return type of underlying database.
+ */
+static int
+__db_get_type(dbp, dbtype)
+ DB *dbp;
+ DBTYPE *dbtype;
+{
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->get_type");
+
+ *dbtype = dbp->type;
+ return (0);
+}
+
+/*
+ * __db_get_append_recno --
+ * Get record number append routine.
+ */
+static int
+__db_get_append_recno(dbp, funcp)
+ DB *dbp;
+ int (**funcp) __P((DB *, DBT *, db_recno_t));
+{
+ DB_ILLEGAL_METHOD(dbp, DB_OK_QUEUE | DB_OK_RECNO);
+ if (funcp)
+ *funcp = dbp->db_append_recno;
+
+ return (0);
+}
+
+/*
+ * __db_set_append_recno --
+ * Set record number append routine.
+ */
+static int
+__db_set_append_recno(dbp, func)
+ DB *dbp;
+ int (*func) __P((DB *, DBT *, db_recno_t));
+{
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_append_recno");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_QUEUE | DB_OK_RECNO);
+
+ dbp->db_append_recno = func;
+
+ return (0);
+}
+
+/*
+ * __db_get_cachesize --
+ * Get underlying cache size.
+ */
+static int
+__db_get_cachesize(dbp, cache_gbytesp, cache_bytesp, ncachep)
+ DB *dbp;
+ u_int32_t *cache_gbytesp, *cache_bytesp;
+ int *ncachep;
+{
+ DB_ILLEGAL_IN_ENV(dbp, "DB->get_cachesize");
+
+ return (__memp_get_cachesize(dbp->dbenv,
+ cache_gbytesp, cache_bytesp, ncachep));
+}
+
+/*
+ * __db_set_cachesize --
+ * Set underlying cache size.
+ */
+static int
+__db_set_cachesize(dbp, cache_gbytes, cache_bytes, ncache)
+ DB *dbp;
+ u_int32_t cache_gbytes, cache_bytes;
+ int ncache;
+{
+ DB_ILLEGAL_IN_ENV(dbp, "DB->set_cachesize");
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_cachesize");
+
+ return (__memp_set_cachesize(
+ dbp->dbenv, cache_gbytes, cache_bytes, ncache));
+}
+
+static int
+__db_set_create_dir(dbp, dir)
+ DB *dbp;
+ const char *dir;
+{
+ DB_ENV *dbenv;
+ int i;
+
+ dbenv = dbp->dbenv;
+
+ for (i = 0; i < dbenv->data_next; i++)
+ if (strcmp(dir, dbenv->db_data_dir[i]) == 0)
+ break;
+
+ if (i == dbenv->data_next) {
+ __db_errx(dbp->env, DB_STR_A("0507",
+ "Directory %s not in environment list.", "%s"), dir);
+ return (EINVAL);
+ }
+
+ dbp->dirname = dbenv->db_data_dir[i];
+ return (0);
+}
+
+static int
+__db_get_create_dir(dbp, dirp)
+ DB *dbp;
+ const char **dirp;
+{
+ *dirp = dbp->dirname;
+ return (0);
+}
+
+/*
+ * __db_get_dup_compare --
+ * Get duplicate comparison routine.
+ */
+static int
+__db_get_dup_compare(dbp, funcp)
+ DB *dbp;
+ int (**funcp) __P((DB *, const DBT *, const DBT *));
+{
+ DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE | DB_OK_HASH);
+
+ if (funcp != NULL) {
+#ifdef HAVE_COMPRESSION
+ if (DB_IS_COMPRESSED(dbp)) {
+ *funcp =
+ ((BTREE *)dbp->bt_internal)->compress_dup_compare;
+ } else
+#endif
+ *funcp = dbp->dup_compare;
+ }
+
+ return (0);
+}
+
+/*
+ * __db_set_dup_compare --
+ * Set duplicate comparison routine.
+ */
+static int
+__db_set_dup_compare(dbp, func)
+ DB *dbp;
+ int (*func) __P((DB *, const DBT *, const DBT *));
+{
+ int ret;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_dup_compare");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE | DB_OK_HASH);
+
+ if ((ret = __db_set_flags(dbp, DB_DUPSORT)) != 0)
+ return (ret);
+
+#ifdef HAVE_COMPRESSION
+ if (DB_IS_COMPRESSED(dbp)) {
+ dbp->dup_compare = __bam_compress_dupcmp;
+ ((BTREE *)dbp->bt_internal)->compress_dup_compare = func;
+ } else
+#endif
+ dbp->dup_compare = func;
+
+ return (0);
+}
+
+/*
+ * __db_get_encrypt_flags --
+ *	Get the database encryption flags.
+ */
+static int
+__db_get_encrypt_flags(dbp, flagsp)
+ DB *dbp;
+ u_int32_t *flagsp;
+{
+ DB_ILLEGAL_IN_ENV(dbp, "DB->get_encrypt_flags");
+
+ return (__env_get_encrypt_flags(dbp->dbenv, flagsp));
+}
+
+/*
+ * __db_set_encrypt --
+ * Set database passwd.
+ */
+static int
+__db_set_encrypt(dbp, passwd, flags)
+ DB *dbp;
+ const char *passwd;
+ u_int32_t flags;
+{
+ DB_CIPHER *db_cipher;
+ int ret;
+
+ DB_ILLEGAL_IN_ENV(dbp, "DB->set_encrypt");
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_encrypt");
+
+ if ((ret = __env_set_encrypt(dbp->dbenv, passwd, flags)) != 0)
+ return (ret);
+
+ /*
+ * In a real env, this gets initialized with the region. In a local
+ * env, we must do it here.
+ */
+ db_cipher = dbp->env->crypto_handle;
+ if (!F_ISSET(db_cipher, CIPHER_ANY) &&
+ (ret = db_cipher->init(dbp->env, db_cipher)) != 0)
+ return (ret);
+
+ return (__db_set_flags(dbp, DB_ENCRYPT));
+}
+
+static void
+__db_get_errcall(dbp, errcallp)
+ DB *dbp;
+ void (**errcallp) __P((const DB_ENV *, const char *, const char *));
+{
+ __env_get_errcall(dbp->dbenv, errcallp);
+}
+
+static void
+__db_set_errcall(dbp, errcall)
+ DB *dbp;
+ void (*errcall) __P((const DB_ENV *, const char *, const char *));
+{
+ __env_set_errcall(dbp->dbenv, errcall);
+}
+
+static void
+__db_get_errfile(dbp, errfilep)
+ DB *dbp;
+ FILE **errfilep;
+{
+ __env_get_errfile(dbp->dbenv, errfilep);
+}
+
+static void
+__db_set_errfile(dbp, errfile)
+ DB *dbp;
+ FILE *errfile;
+{
+ __env_set_errfile(dbp->dbenv, errfile);
+}
+
+static void
+__db_get_errpfx(dbp, errpfxp)
+ DB *dbp;
+ const char **errpfxp;
+{
+ __env_get_errpfx(dbp->dbenv, errpfxp);
+}
+
+static void
+__db_set_errpfx(dbp, errpfx)
+ DB *dbp;
+ const char *errpfx;
+{
+ __env_set_errpfx(dbp->dbenv, errpfx);
+}
+
+static int
+__db_get_feedback(dbp, feedbackp)
+ DB *dbp;
+ void (**feedbackp) __P((DB *, int, int));
+{
+ if (feedbackp != NULL)
+ *feedbackp = dbp->db_feedback;
+ return (0);
+}
+
+static int
+__db_set_feedback(dbp, feedback)
+ DB *dbp;
+ void (*feedback) __P((DB *, int, int));
+{
+ dbp->db_feedback = feedback;
+ return (0);
+}
+
+static int
+__db_get_lk_exclusive(dbp, onoff, nowait)
+ DB *dbp;
+ int *onoff;
+ int *nowait;
+{
+ *onoff = (F2_ISSET(dbp, DB2_AM_EXCL) ? 1 : 0);
+ *nowait = (F2_ISSET(dbp, DB2_AM_NOWAIT) ? 1 : 0);
+ return (0);
+}
+
+static int
+__db_set_lk_exclusive(dbp, nowait)
+ DB *dbp;
+ int nowait;
+{
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_lk_exclusive");
+
+ F2_CLR(dbp, DB2_AM_NOWAIT);
+ F2_SET(dbp, (nowait ? DB2_AM_NOWAIT|DB2_AM_EXCL :
+ DB2_AM_EXCL));
+ return (0);
+}
+
+/*
+ * __db_map_flags --
+ * Maps between public and internal flag values.
+ * This function doesn't check for validity, so it can't fail.
+ */
+static void
+__db_map_flags(dbp, inflagsp, outflagsp)
+ DB *dbp;
+ u_int32_t *inflagsp, *outflagsp;
+{
+ COMPQUIET(dbp, NULL);
+
+ if (FLD_ISSET(*inflagsp, DB_CHKSUM)) {
+ FLD_SET(*outflagsp, DB_AM_CHKSUM);
+ FLD_CLR(*inflagsp, DB_CHKSUM);
+ }
+ if (FLD_ISSET(*inflagsp, DB_ENCRYPT)) {
+ FLD_SET(*outflagsp, DB_AM_ENCRYPT | DB_AM_CHKSUM);
+ FLD_CLR(*inflagsp, DB_ENCRYPT);
+ }
+ if (FLD_ISSET(*inflagsp, DB_TXN_NOT_DURABLE)) {
+ FLD_SET(*outflagsp, DB_AM_NOT_DURABLE);
+ FLD_CLR(*inflagsp, DB_TXN_NOT_DURABLE);
+ }
+}
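+
+/*
+ * Editor's note: the mapping is consuming -- each public flag is cleared
+ * from *inflagsp as its internal DB_AM_* equivalent is set in *outflagsp,
+ * so whatever remains in *inflagsp afterward is unrecognized. A sketch:
+ *
+ *	u_int32_t in = DB_ENCRYPT, out = 0;
+ *
+ *	__db_map_flags(dbp, &in, &out);
+ *	-- now in == 0 and out == (DB_AM_ENCRYPT | DB_AM_CHKSUM),
+ *	-- since encrypted databases always carry checksums.
+ */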
+
+/*
+ * __db_get_assoc_flags --
+ *	Get the flags passed to DB->associate.
+ */
+static int
+__db_get_assoc_flags(dbp, flagsp)
+ DB *dbp;
+ u_int32_t *flagsp;
+{
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->get_assoc_flags");
+
+ *flagsp = dbp->s_assoc_flags;
+ return (0);
+}
+
+/*
+ * __db_get_flags --
+ * The DB->get_flags method.
+ *
+ * PUBLIC: int __db_get_flags __P((DB *, u_int32_t *));
+ */
+int
+__db_get_flags(dbp, flagsp)
+ DB *dbp;
+ u_int32_t *flagsp;
+{
+ static const u_int32_t db_flags[] = {
+ DB_CHKSUM,
+ DB_DUP,
+ DB_DUPSORT,
+ DB_ENCRYPT,
+#ifdef HAVE_QUEUE
+ DB_INORDER,
+#endif
+ DB_RECNUM,
+ DB_RENUMBER,
+ DB_REVSPLITOFF,
+ DB_SNAPSHOT,
+ DB_TXN_NOT_DURABLE,
+ 0
+ };
+ u_int32_t f, flags, mapped_flag;
+ int i;
+
+ flags = 0;
+ for (i = 0; (f = db_flags[i]) != 0; i++) {
+ mapped_flag = 0;
+ __db_map_flags(dbp, &f, &mapped_flag);
+ __bam_map_flags(dbp, &f, &mapped_flag);
+ __ram_map_flags(dbp, &f, &mapped_flag);
+#ifdef HAVE_QUEUE
+ __qam_map_flags(dbp, &f, &mapped_flag);
+#endif
+ DB_ASSERT(dbp->env, f == 0);
+ if (F_ISSET(dbp, mapped_flag) == mapped_flag)
+ LF_SET(db_flags[i]);
+ }
+
+ *flagsp = flags;
+ return (0);
+}
+
+/*
+ * __db_set_flags --
+ * DB->set_flags.
+ *
+ * PUBLIC: int __db_set_flags __P((DB *, u_int32_t));
+ */
+int
+__db_set_flags(dbp, flags)
+ DB *dbp;
+ u_int32_t flags;
+{
+ ENV *env;
+ int ret;
+
+ env = dbp->env;
+
+ if (LF_ISSET(DB_ENCRYPT) && !CRYPTO_ON(env)) {
+ __db_errx(env, DB_STR("0508",
+ "Database environment not configured for encryption"));
+ return (EINVAL);
+ }
+ if (LF_ISSET(DB_TXN_NOT_DURABLE))
+ ENV_REQUIRES_CONFIG(env,
+ env->tx_handle, "DB_NOT_DURABLE", DB_INIT_TXN);
+
+ __db_map_flags(dbp, &flags, &dbp->flags);
+
+ if ((ret = __bam_set_flags(dbp, &flags)) != 0)
+ return (ret);
+ if ((ret = __ram_set_flags(dbp, &flags)) != 0)
+ return (ret);
+#ifdef HAVE_QUEUE
+ if ((ret = __qam_set_flags(dbp, &flags)) != 0)
+ return (ret);
+#endif
+
+ return (flags == 0 ? 0 : __db_ferr(env, "DB->set_flags", 0));
+}
+
+/*
+ * __db_get_lorder --
+ *	Get the database byte order (1234 or 4321).
+ *
+ * PUBLIC: int __db_get_lorder __P((DB *, int *));
+ */
+int
+__db_get_lorder(dbp, db_lorderp)
+ DB *dbp;
+ int *db_lorderp;
+{
+ int ret;
+
+ /* Flag if the specified byte order requires swapping. */
+ switch (ret = __db_byteorder(dbp->env, 1234)) {
+ case 0:
+ *db_lorderp = F_ISSET(dbp, DB_AM_SWAP) ? 4321 : 1234;
+ break;
+ case DB_SWAPBYTES:
+ *db_lorderp = F_ISSET(dbp, DB_AM_SWAP) ? 1234 : 4321;
+ break;
+ default:
+ return (ret);
+ /* NOTREACHED */
+ }
+
+ return (0);
+}
+
+/*
+ * __db_set_lorder --
+ *	Set the database byte order.
+ *
+ * PUBLIC: int __db_set_lorder __P((DB *, int));
+ */
+int
+__db_set_lorder(dbp, db_lorder)
+ DB *dbp;
+ int db_lorder;
+{
+ int ret;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_lorder");
+
+ /* Flag if the specified byte order requires swapping. */
+ switch (ret = __db_byteorder(dbp->env, db_lorder)) {
+ case 0:
+ F_CLR(dbp, DB_AM_SWAP);
+ break;
+ case DB_SWAPBYTES:
+ F_SET(dbp, DB_AM_SWAP);
+ break;
+ default:
+ return (ret);
+ /* NOTREACHED */
+ }
+ return (0);
+}
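+
+/*
+ * Editor's example: __db_byteorder returns DB_SWAPBYTES exactly when the
+ * requested order differs from the host's native order, so on a
+ * little-endian host:
+ *
+ *	dbp->set_lorder(dbp, 1234);	-- native order, clears DB_AM_SWAP
+ *	dbp->set_lorder(dbp, 4321);	-- foreign order, sets DB_AM_SWAP
+ */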
+
+static int
+__db_get_alloc(dbp, mal_funcp, real_funcp, free_funcp)
+ DB *dbp;
+ void *(**mal_funcp) __P((size_t));
+ void *(**real_funcp) __P((void *, size_t));
+ void (**free_funcp) __P((void *));
+{
+ DB_ILLEGAL_IN_ENV(dbp, "DB->get_alloc");
+
+ return (__env_get_alloc(dbp->dbenv, mal_funcp,
+ real_funcp, free_funcp));
+}
+
+static int
+__db_set_alloc(dbp, mal_func, real_func, free_func)
+ DB *dbp;
+ void *(*mal_func) __P((size_t));
+ void *(*real_func) __P((void *, size_t));
+ void (*free_func) __P((void *));
+{
+ DB_ILLEGAL_IN_ENV(dbp, "DB->set_alloc");
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_alloc");
+
+ return (__env_set_alloc(dbp->dbenv, mal_func, real_func, free_func));
+}
+
+static void
+__db_get_msgcall(dbp, msgcallp)
+ DB *dbp;
+ void (**msgcallp) __P((const DB_ENV *, const char *));
+{
+ __env_get_msgcall(dbp->dbenv, msgcallp);
+}
+
+static void
+__db_set_msgcall(dbp, msgcall)
+ DB *dbp;
+ void (*msgcall) __P((const DB_ENV *, const char *));
+{
+ __env_set_msgcall(dbp->dbenv, msgcall);
+}
+
+static void
+__db_get_msgfile(dbp, msgfilep)
+ DB *dbp;
+ FILE **msgfilep;
+{
+ __env_get_msgfile(dbp->dbenv, msgfilep);
+}
+
+static void
+__db_set_msgfile(dbp, msgfile)
+ DB *dbp;
+ FILE *msgfile;
+{
+ __env_set_msgfile(dbp->dbenv, msgfile);
+}
+
+static int
+__db_get_pagesize(dbp, db_pagesizep)
+ DB *dbp;
+ u_int32_t *db_pagesizep;
+{
+ *db_pagesizep = dbp->pgsize;
+ return (0);
+}
+
+/*
+ * __db_set_pagesize --
+ * DB->set_pagesize
+ *
+ * PUBLIC: int __db_set_pagesize __P((DB *, u_int32_t));
+ */
+int
+__db_set_pagesize(dbp, db_pagesize)
+ DB *dbp;
+ u_int32_t db_pagesize;
+{
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_pagesize");
+
+ if (db_pagesize < DB_MIN_PGSIZE) {
+ __db_errx(dbp->env, DB_STR_A("0509",
+ "page sizes may not be smaller than %lu", "%lu"),
+ (u_long)DB_MIN_PGSIZE);
+ return (EINVAL);
+ }
+ if (db_pagesize > DB_MAX_PGSIZE) {
+ __db_errx(dbp->env, DB_STR_A("0510",
+ "page sizes may not be larger than %lu", "%lu"),
+ (u_long)DB_MAX_PGSIZE);
+ return (EINVAL);
+ }
+
+ /*
+ * We don't want anything that's not a power-of-2, as we rely on that
+ * for alignment of various types on the pages.
+ */
+ if (!POWER_OF_TWO(db_pagesize)) {
+ __db_errx(dbp->env, DB_STR("0511",
+ "page sizes must be a power-of-2"));
+ return (EINVAL);
+ }
+
+ /*
+ * XXX
+ * Should we be checking for a page size that's not a multiple of 512,
+ * so that we never try to write less than a disk sector?
+ */
+ dbp->pgsize = db_pagesize;
+
+ return (0);
+}
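+
+/*
+ * Editor's sketch of the accepted range (DB_MIN_PGSIZE and DB_MAX_PGSIZE
+ * are 512 bytes and 64KB, respectively):
+ *
+ *	dbp->set_pagesize(dbp, 4096);	-- ok, power of two in range
+ *	dbp->set_pagesize(dbp, 4000);	-- EINVAL, not a power of two
+ *	dbp->set_pagesize(dbp, 256);	-- EINVAL, below DB_MIN_PGSIZE
+ */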
+
+static int
+__db_set_paniccall(dbp, paniccall)
+ DB *dbp;
+ void (*paniccall) __P((DB_ENV *, int));
+{
+ return (__env_set_paniccall(dbp->dbenv, paniccall));
+}
+
+static int
+__db_set_priority(dbp, priority)
+ DB *dbp;
+ DB_CACHE_PRIORITY priority;
+{
+ dbp->priority = priority;
+ return (0);
+}
+
+static int
+__db_get_priority(dbp, priority)
+ DB *dbp;
+ DB_CACHE_PRIORITY *priority;
+{
+ if (dbp->priority == DB_PRIORITY_UNCHANGED)
+ return (__memp_get_priority(dbp->mpf, priority));
+ else
+ *priority = dbp->priority;
+
+ return (0);
+}
diff --git a/src/db/db_open.c b/src/db/db_open.c
new file mode 100644
index 00000000..fefda48f
--- /dev/null
+++ b/src/db/db_open.c
@@ -0,0 +1,857 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_swap.h"
+#include "dbinc/btree.h"
+#include "dbinc/crypto.h"
+#include "dbinc/hmac.h"
+#include "dbinc/fop.h"
+#include "dbinc/hash.h"
+#include "dbinc/heap.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/qam.h"
+#include "dbinc/txn.h"
+
+static int __db_handle_lock __P((DB *));
+
+/*
+ * __db_open --
+ * DB->open method.
+ *
+ * This routine gets called in six different ways:
+ *
+ * 1. It can be called to open a file/database. In this case, subdb will
+ * be NULL and meta_pgno will be PGNO_BASE_MD.
+ * 2. It can be called to open a subdatabase during normal operation. In
+ * this case, name and subname will both be non-NULL and meta_pgno will
+ * be PGNO_BASE_MD (also PGNO_INVALID).
+ * 3. It can be called to open an in-memory database (name == NULL;
+ * subname = name).
+ * 4. It can be called during recovery to open a file/database, in which case
+ * name will be non-NULL, subname will be NULL, and meta-pgno will be
+ * PGNO_BASE_MD.
+ * 5. It can be called during recovery to open a subdatabase, in which case
+ * name will be non-NULL, subname may be NULL and meta-pgno will be
+ * a valid pgno (i.e., not PGNO_BASE_MD).
+ * 6. It can be called during recovery to open an in-memory database.
+ *
+ * PUBLIC: int __db_open __P((DB *, DB_THREAD_INFO *, DB_TXN *,
+ * PUBLIC: const char *, const char *, DBTYPE, u_int32_t, int, db_pgno_t));
+ */
+int
+__db_open(dbp, ip, txn, fname, dname, type, flags, mode, meta_pgno)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *fname, *dname;
+ DBTYPE type;
+ u_int32_t flags;
+ int mode;
+ db_pgno_t meta_pgno;
+{
+ DB *tdbp;
+ ENV *env;
+ int ret;
+ u_int32_t id;
+
+ env = dbp->env;
+ id = TXN_INVALID;
+
+ /*
+ * We must flush any existing pages before truncating the file
+ * since they could age out of mpool and overwrite new pages.
+ */
+ if (LF_ISSET(DB_TRUNCATE)) {
+ if ((ret = __db_create_internal(&tdbp, dbp->env, 0)) != 0)
+ goto err;
+ ret = __db_open(tdbp, ip, txn, fname, dname, DB_UNKNOWN,
+ DB_NOERROR | (flags & ~(DB_TRUNCATE|DB_CREATE)),
+ mode, meta_pgno);
+ if (ret == 0)
+ ret = __memp_ftruncate(tdbp->mpf, txn, ip, 0, 0);
+ (void)__db_close(tdbp, txn, DB_NOSYNC);
+ if (ret != 0 && ret != ENOENT && ret != EINVAL)
+ goto err;
+ ret = 0;
+ }
+
+ DB_TEST_RECOVERY(dbp, DB_TEST_PREOPEN, ret, fname);
+
+ /*
+ * If the environment was configured with threads, the DB handle
+ * must also be free-threaded, so we force the DB_THREAD flag on.
+ * (See SR #2033 for why this is a requirement--recovery needs
+ * to be able to grab a dbp using __db_fileid_to_dbp, and it has
+ * no way of knowing which dbp goes with which thread, so whichever
+ * one it finds has to be usable in any of them.)
+ */
+ if (F_ISSET(env, ENV_THREAD))
+ LF_SET(DB_THREAD);
+
+ /* Convert any DB->open flags. */
+ if (LF_ISSET(DB_RDONLY))
+ F_SET(dbp, DB_AM_RDONLY);
+ if (LF_ISSET(DB_READ_UNCOMMITTED))
+ F_SET(dbp, DB_AM_READ_UNCOMMITTED);
+
+ if (IS_REAL_TXN(txn))
+ F_SET(dbp, DB_AM_TXN);
+
+ /* Fill in the type. */
+ dbp->type = type;
+
+ /* Save the file and database names. */
+ if ((fname != NULL &&
+ (ret = __os_strdup(env, fname, &dbp->fname)) != 0))
+ goto err;
+ if ((dname != NULL &&
+ (ret = __os_strdup(env, dname, &dbp->dname)) != 0))
+ goto err;
+
+ /*
+	 * If both fname and dname are NULL, it's always a create, so make
+	 * sure that we have both DB_CREATE and a type specified. It would
+	 * be nice if this checking were done with the rest of the interface
+	 * checking in DB->open's pre/post-processing, but this internal
+	 * routine is also used by the recovery and limbo system, so we need
+	 * to safeguard it as well.
+ */
+ if (fname == NULL) {
+ if (dbp->p_internal != NULL) {
+ __db_errx(env, DB_STR("0634",
+ "Partitioned databases may not be in memory."));
+ return (ENOENT);
+ }
+ if (dname == NULL) {
+ if (!LF_ISSET(DB_CREATE)) {
+ __db_errx(env, DB_STR("0635",
+ "DB_CREATE must be specified to create databases."));
+ return (ENOENT);
+ }
+
+ F_SET(dbp, DB_AM_INMEM);
+ F_SET(dbp, DB_AM_CREATED);
+
+ if (dbp->type == DB_UNKNOWN) {
+ __db_errx(env, DB_STR("0636",
+ "DBTYPE of unknown without existing file"));
+ return (EINVAL);
+ }
+
+ if (dbp->pgsize == 0)
+ dbp->pgsize = DB_DEF_IOSIZE;
+
+ /*
+ * If the file is a temporary file and we're
+ * doing locking, then we have to create a
+ * unique file ID. We can't use our normal
+ * dev/inode pair (or whatever this OS uses
+ * in place of dev/inode pairs) because no
+ * backing file will be created until the
+ * mpool cache is filled forcing the buffers
+ * to disk. Grab a random locker ID to use
+ * as a file ID. The created ID must never
+ * match a potential real file ID -- we know
+ * it won't because real file IDs contain a
+ * time stamp after the dev/inode pair, and
+ * we're simply storing a 4-byte value.
+	 *
+ * !!!
+ * Store the locker in the file id structure
+ * -- we can get it from there as necessary,
+ * and it saves having two copies.
+ */
+ if (LOCKING_ON(env) && (ret = __lock_id(env,
+ (u_int32_t *)dbp->fileid, NULL)) != 0)
+ return (ret);
+ } else
+ MAKE_INMEM(dbp);
+
+ /*
+ * Normally we would do handle locking here, however, with
+ * in-memory files, we cannot do any database manipulation
+ * until the mpool is open, so it happens later.
+ */
+ } else if (dname == NULL && meta_pgno == PGNO_BASE_MD) {
+ /* Open/create the underlying file. Acquire locks. */
+ if ((ret = __fop_file_setup(dbp, ip,
+ txn, fname, mode, flags, &id)) != 0)
+ return (ret);
+ /*
+ * If we are creating the first sub-db then this is the
+ * call to create the master db and we tried to open it
+		 * read-only. The create will force it to be read/write,
+		 * so clear the RDONLY flag if we just created it.
+ */
+ if (!F_ISSET(dbp, DB_AM_RDONLY))
+ LF_CLR(DB_RDONLY);
+ } else {
+ if (dbp->p_internal != NULL) {
+ __db_errx(env, DB_STR("0637",
+ "Partitioned databases may not be included with multiple databases."));
+ return (ENOENT);
+ }
+ if ((ret = __fop_subdb_setup(dbp, ip,
+ txn, fname, dname, mode, flags)) != 0)
+ return (ret);
+ meta_pgno = dbp->meta_pgno;
+ }
+
+ /* Set up the underlying environment. */
+ if ((ret = __env_setup(dbp, txn, fname, dname, id, flags)) != 0)
+ return (ret);
+
+ /* For in-memory databases, we now need to open/create the database. */
+ if (F_ISSET(dbp, DB_AM_INMEM)) {
+ if (dname == NULL)
+ ret = __db_new_file(dbp, ip, txn, NULL, NULL);
+ else {
+ id = TXN_INVALID;
+ ret = __fop_file_setup(dbp,
+ ip, txn, dname, mode, flags, &id);
+ }
+ if (ret != 0)
+ goto err;
+ }
+
+ /*
+ * Internal exclusive databases need to use the shared
+ * memory pool to lock out existing database handles before
+ * it gets its handle lock. So getting the lock is delayed
+ * until after the memory pool is allocated.
+ */
+ if (F2_ISSET(dbp, DB2_AM_INTEXCL) &&
+ (ret = __db_handle_lock(dbp)) != 0)
+ goto err;
+
+ switch (dbp->type) {
+ case DB_BTREE:
+ ret = __bam_open(dbp, ip, txn, fname, meta_pgno, flags);
+ break;
+ case DB_HASH:
+ ret = __ham_open(dbp, ip, txn, fname, meta_pgno, flags);
+ break;
+ case DB_HEAP:
+ ret = __heap_open(dbp,
+ ip, txn, fname, meta_pgno, flags);
+ break;
+ case DB_RECNO:
+ ret = __ram_open(dbp, ip, txn, fname, meta_pgno, flags);
+ break;
+ case DB_QUEUE:
+ ret = __qam_open(
+ dbp, ip, txn, fname, meta_pgno, mode, flags);
+ break;
+ case DB_UNKNOWN:
+ return (
+ __db_unknown_type(env, "__db_dbopen", dbp->type));
+ }
+ if (ret != 0)
+ goto err;
+
+#ifdef HAVE_PARTITION
+ if (dbp->p_internal != NULL && (ret =
+ __partition_open(dbp, ip, txn, fname, type, flags, mode, 1)) != 0)
+ goto err;
+#endif
+ DB_TEST_RECOVERY(dbp, DB_TEST_POSTOPEN, ret, fname);
+
+ /*
+ * Temporary files don't need handle locks, so we only have to check
+ * for a handle lock downgrade or lockevent in the case of named
+ * files.
+ */
+ if (!F_ISSET(dbp, DB_AM_RECOVER) && (fname != NULL || dname != NULL) &&
+ LOCK_ISSET(dbp->handle_lock)) {
+ if (IS_REAL_TXN(txn))
+ ret = __txn_lockevent(env,
+ txn, dbp, &dbp->handle_lock, dbp->locker);
+ else if (LOCKING_ON(env) && !F2_ISSET(dbp, DB2_AM_EXCL))
+ /*
+ * Trade write handle lock for read handle lock,
+ * unless this is an exclusive database handle.
+ */
+ ret = __lock_downgrade(env,
+ &dbp->handle_lock, DB_LOCK_READ, 0);
+ }
+DB_TEST_RECOVERY_LABEL
+err:
+ PERFMON4(env,
+ db, open, (char *) fname, (char *) dname, flags, &dbp->fileid[0]);
+ return (ret);
+}
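+
+/*
+ * Editor's sketch of an application-level open that reaches this routine
+ * as case 1 above (dname == NULL, meta_pgno == PGNO_BASE_MD); "dbenv" and
+ * the file name "data.db" are hypothetical:
+ *
+ *	DB *dbp;
+ *	int ret;
+ *
+ *	if ((ret = db_create(&dbp, dbenv, 0)) != 0)
+ *		return (ret);
+ *	if ((ret = dbp->open(dbp, NULL, "data.db", NULL,
+ *	    DB_BTREE, DB_CREATE | DB_AUTO_COMMIT, 0644)) != 0) {
+ *		(void)dbp->close(dbp, 0);
+ *		return (ret);
+ *	}
+ */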
+
+/*
+ * __db_get_open_flags --
+ * Accessor for flags passed into DB->open call
+ *
+ * PUBLIC: int __db_get_open_flags __P((DB *, u_int32_t *));
+ */
+int
+__db_get_open_flags(dbp, flagsp)
+ DB *dbp;
+ u_int32_t *flagsp;
+{
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->get_open_flags");
+
+ *flagsp = dbp->open_flags;
+ return (0);
+}
+
+/*
+ * __db_new_file --
+ * Create a new database file.
+ *
+ * PUBLIC: int __db_new_file __P((DB *,
+ * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DB_FH *, const char *));
+ */
+int
+__db_new_file(dbp, ip, txn, fhp, name)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ DB_FH *fhp;
+ const char *name;
+{
+ int ret;
+
+ /*
+	 * In-memory databases are created by mpool and don't take any
+	 * locks, so temporarily turn off lock checking here.
+ */
+ if (F_ISSET(dbp, DB_AM_INMEM))
+ LOCK_CHECK_OFF(ip);
+
+ switch (dbp->type) {
+ case DB_BTREE:
+ case DB_RECNO:
+ ret = __bam_new_file(dbp, ip, txn, fhp, name);
+ break;
+ case DB_HASH:
+ ret = __ham_new_file(dbp, ip, txn, fhp, name);
+ break;
+ case DB_HEAP:
+ ret = __heap_new_file(dbp, ip, txn, fhp, name);
+ break;
+ case DB_QUEUE:
+ ret = __qam_new_file(dbp, ip, txn, fhp, name);
+ break;
+ case DB_UNKNOWN:
+ default:
+ __db_errx(dbp->env, DB_STR_A("0638",
+ "%s: Invalid type %d specified", "%s %d"),
+ name, dbp->type);
+ ret = EINVAL;
+ break;
+ }
+
+ DB_TEST_RECOVERY(dbp, DB_TEST_POSTLOGMETA, ret, name);
+ /* Sync the file in preparation for moving it into place. */
+ if (ret == 0 && fhp != NULL)
+ ret = __os_fsync(dbp->env, fhp);
+
+ DB_TEST_RECOVERY(dbp, DB_TEST_POSTSYNC, ret, name);
+
+ if (F_ISSET(dbp, DB_AM_INMEM))
+ LOCK_CHECK_ON(ip);
+
+DB_TEST_RECOVERY_LABEL
+ return (ret);
+}
+
+/*
+ * __db_init_subdb --
+ * Initialize the dbp for a subdb.
+ *
+ * PUBLIC: int __db_init_subdb __P((DB *,
+ * PUBLIC: DB *, const char *, DB_THREAD_INFO *, DB_TXN *));
+ */
+int
+__db_init_subdb(mdbp, dbp, name, ip, txn)
+ DB *mdbp, *dbp;
+ const char *name;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+{
+ DBMETA *meta;
+ DB_MPOOLFILE *mpf;
+ int ret, t_ret;
+
+ ret = 0;
+ if (!F_ISSET(dbp, DB_AM_CREATED)) {
+ /* Subdb exists; read meta-data page and initialize. */
+ mpf = mdbp->mpf;
+ if ((ret = __memp_fget(mpf, &dbp->meta_pgno,
+ ip, txn, 0, &meta)) != 0)
+ goto err;
+ ret = __db_meta_setup(mdbp->env, dbp, name, meta, 0, 0);
+ if ((t_ret = __memp_fput(mpf,
+ ip, meta, dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ /*
+ * If __db_meta_setup found that the meta-page hadn't
+ * been written out during recovery, we can just return.
+ */
+ if (ret == ENOENT)
+ ret = 0;
+ goto err;
+ }
+
+ /* Handle the create case here. */
+ switch (dbp->type) {
+ case DB_BTREE:
+ case DB_RECNO:
+ ret = __bam_new_subdb(mdbp, dbp, ip, txn);
+ break;
+ case DB_HASH:
+ ret = __ham_new_subdb(mdbp, dbp, ip, txn);
+ break;
+ case DB_QUEUE:
+ ret = EINVAL;
+ break;
+ case DB_UNKNOWN:
+ default:
+ __db_errx(dbp->env, DB_STR_A("0639",
+ "Invalid subdatabase type %d specified", "%d"),
+ dbp->type);
+ return (EINVAL);
+ }
+
+err: return (ret);
+}
+
+/*
+ * __db_chk_meta --
+ *	Take a buffer containing a meta-data page, check its LSN and checksum
+ *	(verifying the checksum if necessary), and possibly decrypt it.
+ *
+ * Return 0 on success, or a nonzero error value (e.g. DB_CHKSUM_FAIL).
+ *
+ * PUBLIC: int __db_chk_meta __P((ENV *, DB *, DBMETA *, u_int32_t));
+ */
+int
+__db_chk_meta(env, dbp, meta, flags)
+ ENV *env;
+ DB *dbp;
+ DBMETA *meta;
+ u_int32_t flags;
+{
+ DB_LSN swap_lsn;
+ int is_hmac, ret, swapped;
+ u_int32_t magic, orig_chk;
+ u_int8_t *chksum;
+
+ ret = 0;
+ swapped = 0;
+
+ if (FLD_ISSET(meta->metaflags, DBMETA_CHKSUM)) {
+ if (dbp != NULL)
+ F_SET(dbp, DB_AM_CHKSUM);
+
+ is_hmac = meta->encrypt_alg == 0 ? 0 : 1;
+ chksum = ((BTMETA *)meta)->chksum;
+
+ /*
+ * If we need to swap, the checksum function overwrites the
+ * original checksum with 0, so we need to save a copy of the
+ * original for swapping later.
+ */
+ orig_chk = *(u_int32_t *)chksum;
+
+ /*
+ * We cannot add this to __db_metaswap because that gets done
+ * later after we've verified the checksum or decrypted.
+ */
+ if (LF_ISSET(DB_CHK_META)) {
+ swapped = 0;
+chk_retry: if ((ret =
+ __db_check_chksum(env, NULL, env->crypto_handle,
+ chksum, meta, DBMETASIZE, is_hmac)) != 0) {
+ if (is_hmac || swapped)
+ return (DB_CHKSUM_FAIL);
+
+ M_32_SWAP(orig_chk);
+ swapped = 1;
+ *(u_int32_t *)chksum = orig_chk;
+ goto chk_retry;
+ }
+ }
+ } else if (dbp != NULL)
+ F_CLR(dbp, DB_AM_CHKSUM);
+
+#ifdef HAVE_CRYPTO
+ if (__crypto_decrypt_meta(env,
+ dbp, (u_int8_t *)meta, LF_ISSET(DB_CHK_META)) != 0)
+ ret = DB_CHKSUM_FAIL;
+ else
+#endif
+
+ /* Now that we're decrypted, we can check LSN. */
+ if (LOGGING_ON(env) && !LF_ISSET(DB_CHK_NOLSN)) {
+ /*
+ * This gets called both before and after swapping, so we
+ * need to check ourselves. If we already swapped it above,
+ * we'll know that here.
+ */
+
+ swap_lsn = meta->lsn;
+ magic = meta->magic;
+lsn_retry:
+ if (swapped) {
+ M_32_SWAP(swap_lsn.file);
+ M_32_SWAP(swap_lsn.offset);
+ M_32_SWAP(magic);
+ }
+ switch (magic) {
+ case DB_BTREEMAGIC:
+ case DB_HASHMAGIC:
+ case DB_HEAPMAGIC:
+ case DB_QAMMAGIC:
+ case DB_RENAMEMAGIC:
+ break;
+ default:
+ if (swapped)
+ return (EINVAL);
+ swapped = 1;
+ goto lsn_retry;
+ }
+ if (!IS_REP_CLIENT(env) &&
+ !IS_NOT_LOGGED_LSN(swap_lsn) && !IS_ZERO_LSN(swap_lsn))
+			/* We need to check the page LSN. */
+ ret = __log_check_page_lsn(env, dbp, &swap_lsn);
+ }
+ return (ret);
+}
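+
+/*
+ * Editor's note on the checksum retry above: the stored checksum is a
+ * 4-byte value, so a metadata page written on an opposite-endian host
+ * fails the first verification. M_32_SWAP(orig_chk) reverses its byte
+ * order (0x11223344 becomes 0x44332211) and the check is retried once
+ * before DB_CHKSUM_FAIL is returned.
+ */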
+
+/*
+ * __db_meta_setup --
+ *
+ * Take a buffer containing a meta-data page and figure out if it's
+ * valid, and if so, initialize the dbp from the meta-data page.
+ *
+ * PUBLIC: int __db_meta_setup __P((ENV *,
+ * PUBLIC: DB *, const char *, DBMETA *, u_int32_t, u_int32_t));
+ */
+int
+__db_meta_setup(env, dbp, name, meta, oflags, flags)
+ ENV *env;
+ DB *dbp;
+ const char *name;
+ DBMETA *meta;
+ u_int32_t oflags;
+ u_int32_t flags;
+{
+ u_int32_t magic;
+ int ret;
+
+ ret = 0;
+
+ /*
+ * Figure out what access method we're dealing with, and then
+ * call access method specific code to check error conditions
+ * based on conflicts between the found file and application
+ * arguments. A found file overrides some user information --
+ * we don't consider it an error, for example, if the user set
+ * an expected byte order and the found file doesn't match it.
+ */
+ F_CLR(dbp, DB_AM_SWAP | DB_AM_IN_RENAME);
+ magic = meta->magic;
+
+swap_retry:
+ switch (magic) {
+ case DB_BTREEMAGIC:
+ case DB_HASHMAGIC:
+ case DB_HEAPMAGIC:
+ case DB_QAMMAGIC:
+ case DB_RENAMEMAGIC:
+ break;
+ case 0:
+ /*
+ * The only time this should be 0 is if we're in the
+ * midst of opening a subdb during recovery and that
+ * subdatabase had its meta-data page allocated, but
+ * not yet initialized.
+ */
+ if (F_ISSET(dbp, DB_AM_SUBDB) && ((IS_RECOVERING(env) &&
+ F_ISSET(env->lg_handle, DBLOG_FORCE_OPEN)) ||
+ meta->pgno != PGNO_INVALID))
+ return (ENOENT);
+
+ goto bad_format;
+ default:
+ if (F_ISSET(dbp, DB_AM_SWAP))
+ goto bad_format;
+
+ M_32_SWAP(magic);
+ F_SET(dbp, DB_AM_SWAP);
+ goto swap_retry;
+ }
+
+ /*
+ * We can only check the meta page if we are sure we have a meta page.
+ * If it is random data, then this check can fail. So only now can we
+ * checksum and decrypt. Don't distinguish between configuration and
+ * checksum match errors here, because we haven't opened the database
+ * and even a checksum error isn't a reason to panic the environment.
+ * If DB_SKIP_CHK is set, it means the checksum was already checked
+ * and the page was already decrypted.
+ */
+ if (!LF_ISSET(DB_SKIP_CHK) &&
+ (ret = __db_chk_meta(env, dbp, meta, flags)) != 0) {
+ if (ret == DB_CHKSUM_FAIL)
+ __db_errx(env, DB_STR_A("0640",
+ "%s: metadata page checksum error", "%s"), name);
+ goto bad_format;
+ }
+
+ switch (magic) {
+ case DB_BTREEMAGIC:
+ if (dbp->type != DB_UNKNOWN &&
+ dbp->type != DB_RECNO && dbp->type != DB_BTREE)
+ goto bad_format;
+
+ flags = meta->flags;
+ if (F_ISSET(dbp, DB_AM_SWAP))
+ M_32_SWAP(flags);
+ if (LF_ISSET(BTM_RECNO))
+ dbp->type = DB_RECNO;
+ else
+ dbp->type = DB_BTREE;
+ if ((oflags & DB_TRUNCATE) == 0 && (ret =
+ __bam_metachk(dbp, name, (BTMETA *)meta)) != 0)
+ return (ret);
+ break;
+ case DB_HASHMAGIC:
+ if (dbp->type != DB_UNKNOWN && dbp->type != DB_HASH)
+ goto bad_format;
+
+ dbp->type = DB_HASH;
+ if ((oflags & DB_TRUNCATE) == 0 && (ret =
+ __ham_metachk(dbp, name, (HMETA *)meta)) != 0)
+ return (ret);
+ break;
+ case DB_HEAPMAGIC:
+ if (dbp->type != DB_UNKNOWN && dbp->type != DB_HEAP)
+ goto bad_format;
+
+ dbp->type = DB_HEAP;
+ if ((oflags & DB_TRUNCATE) == 0 && (ret =
+ __heap_metachk(dbp, name, (HEAPMETA *)meta)) != 0)
+ return (ret);
+ break;
+ case DB_QAMMAGIC:
+ if (dbp->type != DB_UNKNOWN && dbp->type != DB_QUEUE)
+ goto bad_format;
+ dbp->type = DB_QUEUE;
+ if ((oflags & DB_TRUNCATE) == 0 && (ret =
+ __qam_metachk(dbp, name, (QMETA *)meta)) != 0)
+ return (ret);
+ break;
+ case DB_RENAMEMAGIC:
+ F_SET(dbp, DB_AM_IN_RENAME);
+
+ /* Copy the file's ID. */
+ memcpy(dbp->fileid, ((DBMETA *)meta)->uid, DB_FILE_ID_LEN);
+
+ break;
+ default:
+ goto bad_format;
+ }
+
+ if (FLD_ISSET(meta->metaflags,
+ DBMETA_PART_RANGE | DBMETA_PART_CALLBACK))
+ if ((ret =
+ __partition_init(dbp, meta->metaflags)) != 0)
+ return (ret);
+ return (0);
+
+bad_format:
+ if (F_ISSET(dbp, DB_AM_RECOVER))
+ ret = ENOENT;
+ else
+ __db_errx(env, DB_STR_A("0641",
+ "__db_meta_setup: %s: unexpected file type or format",
+ "%s"), name);
+ return (ret == 0 ? EINVAL : ret);
+}
+
+/*
+ * __db_reopen --
+ * Reopen a subdatabase if its meta/root pages move.
+ * PUBLIC: int __db_reopen __P((DBC *));
+ */
+int
+__db_reopen(arg_dbc)
+ DBC *arg_dbc;
+{
+ BTREE *bt;
+ DBC *dbc;
+ DB_TXN *txn;
+ HASH *ht;
+ DB *dbp, *mdbp;
+ DB_LOCK new_lock, old_lock;
+ PAGE *new_page, *old_page;
+ db_pgno_t newpgno, oldpgno;
+ int ret, t_ret;
+
+ dbc = arg_dbc;
+ dbp = dbc->dbp;
+ old_page = new_page = NULL;
+ mdbp = NULL;
+
+ COMPQUIET(bt, NULL);
+ COMPQUIET(ht, NULL);
+ COMPQUIET(txn, NULL);
+ LOCK_INIT(new_lock);
+ LOCK_INIT(old_lock);
+
+ /*
+ * This must be done in the context of a transaction. If the
+ * requester does not have a transaction, create one.
+ */
+
+ if (TXN_ON(dbp->env) && (txn = dbc->txn) == NULL) {
+ if ((ret = __txn_begin(dbp->env,
+ dbc->thread_info, NULL, &txn, 0)) != 0)
+ return (ret);
+ if ((ret = __db_cursor(dbp,
+ dbc->thread_info, txn, &dbc, 0)) != 0) {
+ (void)__txn_abort(txn);
+ return (ret);
+ }
+ }
+
+ /*
+ * Lock and latch the old metadata page before re-opening the
+ * database so that the information is stable. Then lock
+ * and latch the new page before getting the revision so that
+ * it cannot change.
+ */
+
+ if (dbp->type == DB_HASH) {
+ ht = (HASH*)dbp->h_internal;
+ oldpgno = ht->meta_pgno;
+ } else {
+ bt = (BTREE *)dbp->bt_internal;
+ oldpgno = bt->bt_root;
+ }
+ if (STD_LOCKING(dbc) && (ret = __db_lget(dbc,
+ 0, oldpgno, DB_LOCK_READ, 0, &old_lock)) != 0)
+ goto err;
+
+ if ((ret = __memp_fget(dbp->mpf, &oldpgno,
+ dbc->thread_info, dbc->txn, 0, &old_page)) != 0 &&
+ ret != DB_PAGE_NOTFOUND)
+ goto err;
+
+ /* If the page is free we must not hold its lock. */
+ if (ret == DB_PAGE_NOTFOUND || TYPE(old_page) == P_INVALID) {
+ if ((ret = __LPUT(dbc, old_lock)) != 0)
+ goto err;
+ /* Drop the latch too. */
+ if (old_page != NULL && (ret = __memp_fput(dbp->mpf,
+ dbc->thread_info, old_page, dbc->priority)) != 0)
+ goto err;
+ old_page = NULL;
+ }
+
+ if ((ret = __db_master_open(dbp,
+ dbc->thread_info, dbc->txn, dbp->fname, 0, 0, &mdbp)) != 0)
+ goto err;
+
+ if ((ret = __db_master_update(mdbp, dbp, dbc->thread_info,
+ dbc->txn, dbp->dname, dbp->type, MU_OPEN, NULL, 0)) != 0)
+ goto err;
+
+ if (dbp->type == DB_HASH)
+ newpgno = ht->meta_pgno = dbp->meta_pgno;
+ else {
+ bt->bt_meta = dbp->meta_pgno;
+ if ((ret = __bam_read_root(dbp,
+ dbc->thread_info, dbc->txn, bt->bt_meta, 0)) != 0)
+ goto err;
+ newpgno = bt->bt_root;
+ }
+
+ if (oldpgno == newpgno)
+ goto done;
+
+ if (STD_LOCKING(dbc) && (ret = __db_lget(dbc,
+ 0, newpgno, DB_LOCK_READ, 0, &new_lock)) != 0)
+ goto err;
+
+ if ((ret = __memp_fget(dbp->mpf, &newpgno,
+ dbc->thread_info, dbc->txn, 0, &new_page)) != 0)
+ goto err;
+
+done: if (dbp->type == DB_HASH)
+ ht->revision = dbp->mpf->mfp->revision;
+ else
+ bt->revision = dbp->mpf->mfp->revision;
+
+err: if (old_page != NULL && (t_ret = __memp_fput(dbp->mpf,
+ dbc->thread_info, old_page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (new_page != NULL && (t_ret = __memp_fput(dbp->mpf,
+ dbc->thread_info, new_page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (mdbp != NULL &&
+ (t_ret = __db_close(mdbp, dbc->txn, DB_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (dbc != arg_dbc) {
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __txn_commit(txn, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ return (ret);
+}
+
+static int
+__db_handle_lock(dbp)
+ DB *dbp;
+{
+ ENV *env;
+ int ret;
+ u_int32_t old_flags;
+
+ env = dbp->env;
+ ret = 0;
+ old_flags = dbp->flags;
+
+ /*
+ * Internal exclusive database handles need to get and hold
+ * their own handle locks so that the client cannot open any
+ * external handles on that database.
+ */
+ F_CLR(dbp, DB_AM_RECOVER);
+ F_SET(dbp, DB_AM_NOT_DURABLE);
+
+ /* Begin exclusive handle lockout. */
+ dbp->mpf->mfp->excl_lockout = 1;
+
+ if ((ret = __lock_id(env, NULL, &dbp->locker)) != 0)
+ goto err;
+ LOCK_INIT(dbp->handle_lock);
+ if ((ret = __fop_lock_handle(env, dbp, dbp->locker, DB_LOCK_WRITE,
+ NULL, 0))!= 0)
+ goto err;
+
+err: /* End exclusive handle lockout. */
+ dbp->mpf->mfp->excl_lockout = 0;
+ dbp->flags = old_flags;
+
+ return (ret);
+}
diff --git a/src/db/db_overflow.c b/src/db/db_overflow.c
new file mode 100644
index 00000000..d992ec0d
--- /dev/null
+++ b/src/db/db_overflow.c
@@ -0,0 +1,705 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/mp.h"
+
+/*
+ * Big key/data code.
+ *
+ * Big key and data entries are stored on linked lists of pages. The initial
+ * reference is a structure with the total length of the item and the page
+ * number where it begins. Each entry in the linked list contains a pointer
+ * to the next page of data, and so on.
+ */
+
+/*
+ * __db_goff --
+ * Get an offpage item.
+ *
+ * PUBLIC: int __db_goff __P((DBC *,
+ * PUBLIC: DBT *, u_int32_t, db_pgno_t, void **, u_int32_t *));
+ */
+int
+__db_goff(dbc, dbt, tlen, pgno, bpp, bpsz)
+ DBC *dbc;
+ DBT *dbt;
+ u_int32_t tlen;
+ db_pgno_t pgno;
+ void **bpp;
+ u_int32_t *bpsz;
+{
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ DB_TXN *txn;
+ DBC_INTERNAL *cp;
+ ENV *env;
+ PAGE *h;
+ DB_THREAD_INFO *ip;
+ db_indx_t bytes;
+ u_int32_t curoff, needed, start;
+ u_int8_t *p, *src;
+ int ret;
+
+ dbp = dbc->dbp;
+ cp = dbc->internal;
+ env = dbp->env;
+ ip = dbc->thread_info;
+ mpf = dbp->mpf;
+ txn = dbc->txn;
+
+ /*
+ * Check if the buffer is big enough; if it is not and we are
+ * allowed to malloc space, then we'll malloc it. If we are
+ * not (DB_DBT_USERMEM), then we'll set the dbt and return
+ * appropriately.
+ */
+ if (F_ISSET(dbt, DB_DBT_PARTIAL)) {
+ start = dbt->doff;
+ if (start > tlen)
+ needed = 0;
+ else if (dbt->dlen > tlen - start)
+ needed = tlen - start;
+ else
+ needed = dbt->dlen;
+ } else {
+ start = 0;
+ needed = tlen;
+ }
+
+ /*
+ * If the caller has not requested any data, return success. This
+ * "early-out" also avoids setting up the streaming optimization when
+	 * no page would be retrieved. If this early return were removed, the
+	 * streaming setup below would have to run only when needed is non-zero.
+ */
+ if (needed == 0) {
+ dbt->size = 0;
+ return (0);
+ }
+
+ if (F_ISSET(dbt, DB_DBT_USERCOPY))
+ goto skip_alloc;
+
+ /* Allocate any necessary memory. */
+ if (F_ISSET(dbt, DB_DBT_USERMEM)) {
+ if (needed > dbt->ulen) {
+ dbt->size = needed;
+ return (DB_BUFFER_SMALL);
+ }
+ } else if (F_ISSET(dbt, DB_DBT_MALLOC)) {
+ if ((ret = __os_umalloc(env, needed, &dbt->data)) != 0)
+ return (ret);
+ } else if (F_ISSET(dbt, DB_DBT_REALLOC)) {
+ if ((ret = __os_urealloc(env, needed, &dbt->data)) != 0)
+ return (ret);
+ } else if (bpsz != NULL && (*bpsz == 0 || *bpsz < needed)) {
+ if ((ret = __os_realloc(env, needed, bpp)) != 0)
+ return (ret);
+ *bpsz = needed;
+ dbt->data = *bpp;
+ } else if (bpp != NULL)
+ dbt->data = *bpp;
+ else {
+ DB_ASSERT(env,
+ F_ISSET(dbt,
+ DB_DBT_USERMEM | DB_DBT_MALLOC | DB_DBT_REALLOC) ||
+ bpsz != NULL);
+ return (DB_BUFFER_SMALL);
+ }
+
+skip_alloc:
+ /* Set up a start page in the overflow chain if streaming. */
+ if (cp->stream_start_pgno != PGNO_INVALID &&
+ pgno == cp->stream_start_pgno && start >= cp->stream_off &&
+ start < cp->stream_off + P_MAXSPACE(dbp, dbp->pgsize)) {
+ pgno = cp->stream_curr_pgno;
+ curoff = cp->stream_off;
+ } else {
+ cp->stream_start_pgno = cp->stream_curr_pgno = pgno;
+ cp->stream_off = curoff = 0;
+ }
+
+ /*
+ * Step through the linked list of pages, copying the data on each
+ * one into the buffer. Never copy more than the total data length.
+ */
+ dbt->size = needed;
+ for (p = dbt->data; pgno != PGNO_INVALID && needed > 0;) {
+ if ((ret = __memp_fget(mpf,
+ &pgno, ip, txn, 0, &h)) != 0)
+ return (ret);
+ DB_ASSERT(env, TYPE(h) == P_OVERFLOW);
+
+ /* Check if we need any bytes from this page. */
+ if (curoff + OV_LEN(h) >= start) {
+ bytes = OV_LEN(h);
+ src = (u_int8_t *)h + P_OVERHEAD(dbp);
+ if (start > curoff) {
+ src += start - curoff;
+ bytes -= start - curoff;
+ }
+ if (bytes > needed)
+ bytes = needed;
+ if (F_ISSET(dbt, DB_DBT_USERCOPY)) {
+ /*
+ * The offset into the DBT is the total size
+ * less the amount of data still needed. Care
+ * needs to be taken if doing a partial copy
+ * beginning at an offset other than 0.
+ */
+ if ((ret = env->dbt_usercopy(
+ dbt, dbt->size - needed,
+ src, bytes, DB_USERCOPY_SETDATA)) != 0) {
+ (void)__memp_fput(mpf,
+ ip, h, dbp->priority);
+ return (ret);
+ }
+ } else
+ memcpy(p, src, bytes);
+ p += bytes;
+ needed -= bytes;
+ }
+ cp->stream_off = curoff;
+ curoff += OV_LEN(h);
+ cp->stream_curr_pgno = pgno;
+ pgno = h->next_pgno;
+ (void)__memp_fput(mpf, ip, h, dbp->priority);
+ }
+
+ return (0);
+}
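+
+/*
+ * Editor's example of the DB_DBT_PARTIAL arithmetic above: with
+ * tlen == 1000, a caller asking for 200 bytes at offset 900 gets only
+ * the needed = tlen - start = 100 bytes that exist past the offset,
+ * while an offset beyond the item (start > tlen) yields needed == 0 and
+ * an empty DBT without any page being fetched. The request would be set
+ * up as:
+ *
+ *	memset(&dbt, 0, sizeof(dbt));
+ *	dbt.flags = DB_DBT_PARTIAL | DB_DBT_MALLOC;
+ *	dbt.doff = 900;
+ *	dbt.dlen = 200;
+ */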
+
+/*
+ * __db_poff --
+ * Put an offpage item.
+ *
+ * PUBLIC: int __db_poff __P((DBC *, const DBT *, db_pgno_t *));
+ */
+int
+__db_poff(dbc, dbt, pgnop)
+ DBC *dbc;
+ const DBT *dbt;
+ db_pgno_t *pgnop;
+{
+ DB *dbp;
+ DBT tmp_dbt;
+ DB_LSN null_lsn;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep, *lastp;
+ db_indx_t pagespace;
+ db_pgno_t pgno;
+ u_int32_t space, sz, tlen;
+ u_int8_t *p;
+ int ret, t_ret;
+
+ /*
+ * Allocate pages and copy the key/data item into them. Calculate the
+ * number of bytes we get for pages we fill completely with a single
+ * item.
+ */
+ dbp = dbc->dbp;
+ lastp = NULL;
+ mpf = dbp->mpf;
+ pagespace = P_MAXSPACE(dbp, dbp->pgsize);
+ p = dbt->data;
+ sz = dbt->size;
+
+ /*
+ * Check whether we are streaming at the end of the overflow item.
+ * If so, the last pgno and offset will be cached in the cursor.
+ */
+ if (F_ISSET(dbt, DB_DBT_STREAMING)) {
+ tlen = dbt->size - dbt->dlen;
+ pgno = dbc->internal->stream_curr_pgno;
+ if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info,
+ dbc->txn, DB_MPOOL_DIRTY, &lastp)) != 0)
+ return (ret);
+
+ /*
+ * Calculate how much we can write on the last page of the
+ * overflow item.
+ */
+ DB_ASSERT(dbp->env,
+ OV_LEN(lastp) == (tlen - dbc->internal->stream_off));
+ space = pagespace - OV_LEN(lastp);
+
+ /* Only copy as much data as we have. */
+ if (space > dbt->dlen)
+ space = dbt->dlen;
+
+ if (DBC_LOGGING(dbc)) {
+ tmp_dbt.data = dbt->data;
+ tmp_dbt.size = space;
+ ZERO_LSN(null_lsn);
+ if ((ret = __db_big_log(dbp, dbc->txn, &LSN(lastp), 0,
+ OP_SET(DB_APPEND_BIG, lastp), pgno,
+ PGNO_INVALID, PGNO_INVALID, &tmp_dbt,
+ &LSN(lastp), &null_lsn, &null_lsn)) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(lastp));
+
+ memcpy((u_int8_t *)lastp + P_OVERHEAD(dbp) + OV_LEN(lastp),
+ dbt->data, space);
+ OV_LEN(lastp) += space;
+ sz -= space + dbt->doff;
+ p += space;
+ *pgnop = dbc->internal->stream_start_pgno;
+ }
+
+ ret = 0;
+ for (; sz > 0; p += pagespace, sz -= pagespace) {
+ /*
+ * Reduce pagespace so we terminate the loop correctly and
+ * don't copy too much data.
+ */
+ if (sz < pagespace)
+ pagespace = sz;
+
+ /*
+ * Allocate and initialize a new page and copy all or part of
+ * the item onto the page. If sz is less than pagespace, we
+ * have a partial record.
+ */
+ if ((ret = __db_new(dbc, P_OVERFLOW, NULL, &pagep)) != 0)
+ break;
+ if (DBC_LOGGING(dbc)) {
+ tmp_dbt.data = p;
+ tmp_dbt.size = pagespace;
+ ZERO_LSN(null_lsn);
+ if ((ret = __db_big_log(dbp, dbc->txn, &LSN(pagep), 0,
+ OP_SET(DB_ADD_BIG, pagep),
+ PGNO(pagep), lastp ? PGNO(lastp) : PGNO_INVALID,
+ PGNO_INVALID, &tmp_dbt, &LSN(pagep),
+ lastp == NULL ? &null_lsn : &LSN(lastp),
+ &null_lsn)) != 0) {
+ (void)__memp_fput(mpf, dbc->thread_info,
+ pagep, dbc->priority);
+ goto err;
+ }
+ } else
+ LSN_NOT_LOGGED(LSN(pagep));
+
+ /* Move LSN onto page. */
+ if (lastp != NULL)
+ LSN(lastp) = LSN(pagep);
+
+ OV_LEN(pagep) = pagespace;
+ OV_REF(pagep) = 1;
+ memcpy((u_int8_t *)pagep + P_OVERHEAD(dbp), p, pagespace);
+
+ /*
+ * If this is the first entry, update the user's info and
+ * initialize the cursor to allow for streaming of subsequent
+ * updates. Otherwise, update the entry on the last page
+ * filled in and release that page.
+ */
+ if (lastp == NULL) {
+ *pgnop = PGNO(pagep);
+ dbc->internal->stream_start_pgno =
+ dbc->internal->stream_curr_pgno = *pgnop;
+ dbc->internal->stream_off = 0;
+ } else {
+ lastp->next_pgno = PGNO(pagep);
+ pagep->prev_pgno = PGNO(lastp);
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, lastp, dbc->priority)) != 0) {
+ lastp = NULL;
+ goto err;
+ }
+ }
+ lastp = pagep;
+ }
+err: if (lastp != NULL) {
+ if (ret == 0) {
+ dbc->internal->stream_curr_pgno = PGNO(lastp);
+ dbc->internal->stream_off = dbt->size - OV_LEN(lastp);
+ }
+
+ if ((t_ret = __memp_fput(mpf, dbc->thread_info, lastp,
+ dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ return (ret);
+}
+
+/*
+ * __db_ovref --
+ * Decrement the reference count on an overflow page.
+ *
+ * PUBLIC: int __db_ovref __P((DBC *, db_pgno_t));
+ */
+int
+__db_ovref(dbc, pgno)
+ DBC *dbc;
+ db_pgno_t pgno;
+{
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ PAGE *h;
+ int ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+
+ if ((ret = __memp_fget(mpf, &pgno,
+ dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &h)) != 0)
+ return (ret);
+
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __db_ovref_log(dbp,
+ dbc->txn, &LSN(h), 0, h->pgno, -1, &LSN(h))) != 0) {
+ (void)__memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority);
+ return (ret);
+ }
+ } else
+ LSN_NOT_LOGGED(LSN(h));
+
+ /*
+ * In BDB releases before 4.5, the overflow reference counts were
+ * incremented when an overflow item was split onto an internal
+ * page. There was a lock race in that code, and rather than fix
+ * the race, we changed BDB to copy overflow items when splitting
+ * them onto internal pages. The code to decrement reference
+ * counts remains so databases already in the field continue to
+ * work.
+ */
+ --OV_REF(h);
+
+ return (__memp_fput(mpf, dbc->thread_info, h, dbc->priority));
+}
+
+/*
+ * __db_doff --
+ * Delete an offpage chain of overflow pages.
+ *
+ * PUBLIC: int __db_doff __P((DBC *, db_pgno_t));
+ */
+int
+__db_doff(dbc, pgno)
+ DBC *dbc;
+ db_pgno_t pgno;
+{
+ DB *dbp;
+ DBT tmp_dbt;
+ DB_LSN null_lsn;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ int ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+
+ do {
+ if ((ret = __memp_fget(mpf, &pgno,
+ dbc->thread_info, dbc->txn, 0, &pagep)) != 0)
+ return (ret);
+
+ DB_ASSERT(dbp->env, TYPE(pagep) == P_OVERFLOW);
+ /*
+ * If it's referenced by more than one key/data item,
+ * decrement the reference count and return.
+ */
+ if (OV_REF(pagep) > 1) {
+ (void)__memp_fput(mpf,
+ dbc->thread_info, pagep, dbc->priority);
+ return (__db_ovref(dbc, pgno));
+ }
+
+ if ((ret = __memp_dirty(mpf, &pagep,
+ dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0) {
+ if (pagep != NULL)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, pagep, dbc->priority);
+ return (ret);
+ }
+
+ if (DBC_LOGGING(dbc)) {
+ tmp_dbt.data = (u_int8_t *)pagep + P_OVERHEAD(dbp);
+ tmp_dbt.size = OV_LEN(pagep);
+ ZERO_LSN(null_lsn);
+ if ((ret = __db_big_log(dbp, dbc->txn, &LSN(pagep), 0,
+ OP_SET(DB_REM_BIG, pagep), PGNO(pagep),
+ PREV_PGNO(pagep), NEXT_PGNO(pagep), &tmp_dbt,
+ &LSN(pagep), &null_lsn, &null_lsn)) != 0) {
+ (void)__memp_fput(mpf,
+ dbc->thread_info, pagep, dbc->priority);
+ return (ret);
+ }
+ } else
+ LSN_NOT_LOGGED(LSN(pagep));
+ pgno = pagep->next_pgno;
+ OV_LEN(pagep) = 0;
+ if ((ret = __db_free(dbc, pagep, 0)) != 0)
+ return (ret);
+ } while (pgno != PGNO_INVALID);
+
+ return (0);
+}
+
+/*
+ * __db_moff --
+ * Match on overflow pages.
+ *
+ * Given a starting page number and a key, return <0, 0, >0 to indicate if the
+ * key on the page is less than, equal to or greater than the key specified.
+ * We optimize this by doing chunk at a time comparison unless the user has
+ * specified a comparison function. In this case, we need to materialize
+ * the entire object and call their comparison routine.
+ *
+ * __db_moff and __db_coff are generic functions useful in searching and
+ * ordering off page items. __db_moff matches an overflow DBT with an offpage
+ * item. __db_coff compares two offpage items for lexicographic sort order.
+ *
+ * PUBLIC: int __db_moff __P((DBC *, const DBT *, db_pgno_t, u_int32_t,
+ * PUBLIC: int (*)(DB *, const DBT *, const DBT *), int *));
+ */
+int
+__db_moff(dbc, dbt, pgno, tlen, cmpfunc, cmpp)
+ DBC *dbc;
+ const DBT *dbt;
+ db_pgno_t pgno;
+ u_int32_t tlen;
+ int (*cmpfunc) __P((DB *, const DBT *, const DBT *)), *cmpp;
+{
+ DB *dbp;
+ DBT local_dbt;
+ DB_MPOOLFILE *mpf;
+ DB_THREAD_INFO *ip;
+ PAGE *pagep;
+ void *buf;
+ u_int32_t bufsize, cmp_bytes, key_left;
+ u_int8_t *p1, *p2;
+ int ret;
+
+ dbp = dbc->dbp;
+ ip = dbc->thread_info;
+ mpf = dbp->mpf;
+
+ /*
+ * If there is a user-specified comparison function, build a
+ * contiguous copy of the key, and call it.
+ */
+ if (cmpfunc != NULL) {
+ memset(&local_dbt, 0, sizeof(local_dbt));
+ buf = NULL;
+ bufsize = 0;
+
+ if ((ret = __db_goff(dbc,
+ &local_dbt, tlen, pgno, &buf, &bufsize)) != 0)
+ return (ret);
+		/* Pass the key as the first argument. */
+ *cmpp = cmpfunc(dbp, dbt, &local_dbt);
+ __os_free(dbp->env, buf);
+ return (0);
+ }
+
+ /* While there are both keys to compare. */
+ for (*cmpp = 0, p1 = dbt->data,
+ key_left = dbt->size; key_left > 0 && pgno != PGNO_INVALID;) {
+ if ((ret =
+ __memp_fget(mpf, &pgno, ip, dbc->txn, 0, &pagep)) != 0)
+ return (ret);
+
+ cmp_bytes = OV_LEN(pagep) < key_left ? OV_LEN(pagep) : key_left;
+ tlen -= cmp_bytes;
+ key_left -= cmp_bytes;
+ for (p2 = (u_int8_t *)pagep + P_OVERHEAD(dbp);
+ cmp_bytes-- > 0; ++p1, ++p2)
+ if (*p1 != *p2) {
+ *cmpp = (long)*p1 - (long)*p2;
+ break;
+ }
+ pgno = NEXT_PGNO(pagep);
+ if ((ret = __memp_fput(mpf, ip, pagep, dbp->priority)) != 0)
+ return (ret);
+ if (*cmpp != 0)
+ return (0);
+ }
+ if (key_left > 0) /* DBT is longer than the page key. */
+ *cmpp = 1;
+ else if (tlen > 0) /* DBT is shorter than the page key. */
+ *cmpp = -1;
+ else
+ *cmpp = 0;
+
+ return (0);
+}
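+
+/*
+ * Editor's example of the trailing-length rule above: comparing the DBT
+ * "abc" against a two-byte overflow key "ab" exhausts the page data
+ * first (tlen reaches 0 while key_left == 1), so *cmpp is set to 1 and
+ * the longer DBT sorts after its prefix -- the usual compare-then-length
+ * semantics.
+ */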
+
+/*
+ * __db_coff --
+ * Match two offpage dbts.
+ *
+ * The DBTs must both refer to offpage items.
+ * The match happens a chunk (page) at a time unless a user defined comparison
+ * function exists. It is not possible to optimize this comparison away when
+ * a lexicographic sort order is required on mismatch.
+ *
+ * NOTE: For now this function only works for H_OFFPAGE type items. It would
+ * be simple to extend it for use with B_OVERFLOW type items. It would only
+ * require extracting the total length and page number according to the
+ * DBT type.
+ *
+ * PUBLIC: int __db_coff __P((DBC *, const DBT *, const DBT *,
+ * PUBLIC: int (*)(DB *, const DBT *, const DBT *), int *));
+ */
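+/*
+ * Note (describing the code below): on success *cmpp is set to <0, 0 or >0,
+ * ordering dbt relative to match; when all shared bytes are equal, the tie
+ * is broken by the total lengths taken from the H_OFFPAGE headers.
+ */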
+int
+__db_coff(dbc, dbt, match, cmpfunc, cmpp)
+ DBC *dbc;
+ const DBT *dbt, *match;
+ int (*cmpfunc) __P((DB *, const DBT *, const DBT *)), *cmpp;
+{
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_MPOOLFILE *mpf;
+ DB_TXN *txn;
+ DBT local_key, local_match;
+ PAGE *dbt_pagep, *match_pagep;
+ db_pgno_t dbt_pgno, match_pgno;
+ u_int32_t cmp_bytes, dbt_bufsz, dbt_len, match_bufsz;
+ u_int32_t match_len, max_data, page_space;
+ u_int8_t *p1, *p2;
+ int ret;
+ void *dbt_buf, *match_buf;
+
+ dbp = dbc->dbp;
+ ip = dbc->thread_info;
+ txn = dbc->txn;
+ mpf = dbp->mpf;
+ page_space = P_MAXSPACE(dbp, dbp->pgsize);
+ *cmpp = 0;
+ dbt_buf = match_buf = NULL;
+
+ DB_ASSERT(dbp->env, HPAGE_PTYPE(dbt->data) == H_OFFPAGE);
+ DB_ASSERT(dbp->env, HPAGE_PTYPE(match->data) == H_OFFPAGE);
+
+ /* Extract potentially unaligned length and pgno fields from DBTs */
+ memcpy(&dbt_len, HOFFPAGE_TLEN(dbt->data), sizeof(u_int32_t));
+ memcpy(&dbt_pgno, HOFFPAGE_PGNO(dbt->data), sizeof(db_pgno_t));
+ memcpy(&match_len, HOFFPAGE_TLEN(match->data), sizeof(u_int32_t));
+ memcpy(&match_pgno, HOFFPAGE_PGNO(match->data), sizeof(db_pgno_t));
+ max_data = (dbt_len < match_len ? dbt_len : match_len);
+
+ /*
+ * If there is a custom comparator, fully resolve both DBTs.
+ * Then call the user's comparator.
+ */
+ if (cmpfunc != NULL) {
+ memset(&local_key, 0, sizeof(local_key));
+ memset(&local_match, 0, sizeof(local_match));
+ dbt_buf = match_buf = NULL;
+ dbt_bufsz = match_bufsz = 0;
+
+ if ((ret = __db_goff(dbc, &local_key, dbt_len,
+ dbt_pgno, &dbt_buf, &dbt_bufsz)) != 0)
+ goto err1;
+ if ((ret = __db_goff(dbc, &local_match, match_len,
+ match_pgno, &match_buf, &match_bufsz)) != 0)
+ goto err1;
+ /* The key needs to be the first argument for sort order */
+ *cmpp = cmpfunc(dbp, &local_key, &local_match);
+
+err1: if (dbt_buf != NULL)
+ __os_free(dbp->env, dbt_buf);
+ if (match_buf != NULL)
+ __os_free(dbp->env, match_buf);
+ return (ret);
+ }
+
+ /* Match the offpage DBTs a page at a time. */
+ while (dbt_pgno != PGNO_INVALID && match_pgno != PGNO_INVALID) {
+ if ((ret =
+ __memp_fget(mpf, &dbt_pgno, ip, txn, 0, &dbt_pagep)) != 0)
+ return (ret);
+ if ((ret =
+ __memp_fget(mpf, &match_pgno,
+ ip, txn, 0, &match_pagep)) != 0) {
+ (void)__memp_fput(
+ mpf, ip, dbt_pagep, DB_PRIORITY_UNCHANGED);
+ return (ret);
+ }
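+ /* Compare no more bytes than a page holds, or than the shorter item has left. */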
+ cmp_bytes = page_space < max_data ? page_space : max_data;
+ for (p1 = (u_int8_t *)dbt_pagep + P_OVERHEAD(dbp),
+ p2 = (u_int8_t *)match_pagep + P_OVERHEAD(dbp);
+ cmp_bytes-- > 0; ++p1, ++p2)
+ if (*p1 != *p2) {
+ *cmpp = (long)*p1 - (long)*p2;
+ break;
+ }
+
+ dbt_pgno = NEXT_PGNO(dbt_pagep);
+ match_pgno = NEXT_PGNO(match_pagep);
+ max_data -= page_space;
+ if ((ret = __memp_fput(mpf,
+ ip, dbt_pagep, DB_PRIORITY_UNCHANGED)) != 0) {
+ (void)__memp_fput(mpf,
+ ip, match_pagep, DB_PRIORITY_UNCHANGED);
+ return (ret);
+ }
+ if ((ret = __memp_fput(mpf,
+ ip, match_pagep, DB_PRIORITY_UNCHANGED)) != 0)
+ return (ret);
+ if (*cmpp != 0)
+ return (0);
+ }
+
+ /*
+ * If a lexicographic mismatch was found, then the result has already
+ * been returned. If the DBTs matched, consider the lengths of the
+ * items, and return appropriately.
+ */
+ if (dbt_len > match_len) /* DBT is longer than the match key. */
+ *cmpp = 1;
+ else if (match_len > dbt_len) /* DBT is shorter than the match key. */
+ *cmpp = -1;
+ else
+ *cmpp = 0;
+
+ return (0);
+}
diff --git a/src/db/db_ovfl_vrfy.c b/src/db/db_ovfl_vrfy.c
new file mode 100644
index 00000000..fa630f7b
--- /dev/null
+++ b/src/db/db_ovfl_vrfy.c
@@ -0,0 +1,410 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/db_verify.h"
+#include "dbinc/mp.h"
+
+/*
+ * __db_vrfy_overflow --
+ * Verify overflow page.
+ *
+ * PUBLIC: int __db_vrfy_overflow __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t,
+ * PUBLIC: u_int32_t));
+ */
+int
+__db_vrfy_overflow(dbp, vdp, h, pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ PAGE *h;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ VRFY_PAGEINFO *pip;
+ int isbad, ret, t_ret;
+
+ isbad = 0;
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+
+ if ((ret = __db_vrfy_datapage(dbp, vdp, h, pgno, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+
+ pip->refcount = OV_REF(h);
+ if (pip->refcount < 1) {
+ EPRINT((dbp->env, DB_STR_A("0676",
+ "Page %lu: overflow page has zero reference count", "%lu"),
+ (u_long)pgno));
+ isbad = 1;
+ }
+
+ /* Just store for now. */
+ pip->olen = HOFFSET(h);
+
+err: if ((t_ret = __db_vrfy_putpageinfo(dbp->env, vdp, pip)) != 0)
+ ret = t_ret;
+ return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __db_vrfy_ovfl_structure --
+ * Walk a list of overflow pages, avoiding cycles and marking
+ * pages seen.
+ *
+ * PUBLIC: int __db_vrfy_ovfl_structure
+ * PUBLIC: __P((DB *, VRFY_DBINFO *, db_pgno_t, u_int32_t, u_int32_t));
+ */
+int
+__db_vrfy_ovfl_structure(dbp, vdp, pgno, tlen, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ db_pgno_t pgno;
+ u_int32_t tlen;
+ u_int32_t flags;
+{
+ DB *pgset;
+ ENV *env;
+ VRFY_PAGEINFO *pip;
+ db_pgno_t next, prev;
+ int isbad, ret, seen_cnt, t_ret;
+ u_int32_t refcount;
+
+ env = dbp->env;
+ pgset = vdp->pgset;
+ DB_ASSERT(env, pgset != NULL);
+ isbad = 0;
+
+ /* This shouldn't happen, but just to be sure. */
+ if (!IS_VALID_PGNO(pgno))
+ return (DB_VERIFY_BAD);
+
+ /*
+ * Check the first prev_pgno; it ought to be PGNO_INVALID,
+ * since there's no prev page.
+ */
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+
+ /* The refcount is stored on the first overflow page. */
+ refcount = pip->refcount;
+
+ if (pip->type != P_OVERFLOW) {
+ EPRINT((env, DB_STR_A("0677",
+ "Page %lu: overflow page of invalid type %lu", "%lu %lu"),
+ (u_long)pgno, (u_long)pip->type));
+ ret = DB_VERIFY_BAD;
+ goto err; /* Unsafe to continue. */
+ }
+
+ prev = pip->prev_pgno;
+ if (prev != PGNO_INVALID) {
+ EPRINT((env, DB_STR_A("0678",
+ "Page %lu: first page in overflow chain has a prev_pgno %lu",
+ "%lu %lu"), (u_long)pgno, (u_long)prev));
+ isbad = 1;
+ }
+
+ for (;;) {
+ /*
+ * We may have seen this page elsewhere, if the overflow entry
+ * has been promoted to an internal page; we just want to
+ * make sure that each overflow page is seen exactly as many
+ * times as its refcount dictates.
+ *
+ * Note that this code also serves to keep us from looping
+ * infinitely if there's a cycle in an overflow chain.
+ */
+ if ((ret = __db_vrfy_pgset_get(pgset,
+ vdp->thread_info, vdp->txn, pgno, &seen_cnt)) != 0)
+ goto err;
+ if ((u_int32_t)seen_cnt > refcount) {
+ EPRINT((env, DB_STR_A("0679",
+ "Page %lu: encountered too many times in overflow traversal",
+ "%lu"), (u_long)pgno));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+ if ((ret = __db_vrfy_pgset_inc(
+ pgset, vdp->thread_info, vdp->txn, pgno)) != 0)
+ goto err;
+
+ /*
+ * Each overflow page can be referenced multiple times,
+ * because it's possible for overflow Btree keys to get
+ * promoted to internal pages. We want to make sure that
+ * each page is referenced from a Btree leaf (or Hash data
+ * page, which we consider a "leaf" here) exactly once; if
+ * the parent was a leaf, set a flag to indicate that we've
+ * seen this page in a leaf context.
+ *
+ * If the parent is not a leaf--in which case it's a Btree
+ * internal page--we don't need to bother doing any further
+ * verification, as we'll do it when we hit the leaf (or
+ * complain that we never saw the leaf). Only the first
+ * page in an overflow chain should ever have a refcount
+ * greater than 1, and the combination of the LEAFSEEN check
+ * and the fact that we bail after the first page for
+ * non-leaves should ensure this.
+ *
+ * Note that each "child" of a page, such as an overflow page,
+ * is stored and verified in a structure check exactly once,
+ * so this code does not need to contend with the fact that
+ * overflow chains used as Btree duplicate keys may be
+ * referenced multiply from a single Btree leaf page.
+ */
+ if (LF_ISSET(DB_ST_OVFL_LEAF)) {
+ if (F_ISSET(pip, VRFY_OVFL_LEAFSEEN)) {
+ EPRINT((env, DB_STR_A("0680",
+ "Page %lu: overflow page linked twice from leaf or data page",
+ "%lu"), (u_long)pgno));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+ F_SET(pip, VRFY_OVFL_LEAFSEEN);
+ }
+
+ /*
+ * We want to verify each overflow chain only once, and
+ * although no chain should be linked more than once from a
+ * leaf page, we can't guarantee that it'll be linked that
+ * once if it's linked from an internal page and the key
+ * is gone.
+ *
+ * seen_cnt is the number of times we'd encountered this page
+ * before calling this function.
+ */
+ if (seen_cnt == 0) {
+ /*
+ * Keep a running tab on how much of the item we've
+ * seen.
+ */
+ tlen -= pip->olen;
+
+ /* Send the application feedback about our progress. */
+ if (!LF_ISSET(DB_SALVAGE))
+ __db_vrfy_struct_feedback(dbp, vdp);
+ } else
+ goto done;
+
+ next = pip->next_pgno;
+
+ /* Are we there yet? */
+ if (next == PGNO_INVALID)
+ break;
+
+ /*
+ * We've already checked this when we saved it, but just
+ * to be sure...
+ */
+ if (!IS_VALID_PGNO(next)) {
+ EPRINT((env, DB_STR_A("0681",
+ "Page %lu: bad next_pgno %lu on overflow page",
+ "%lu %lu"), (u_long)pgno, (u_long)next));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+
+ if ((ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 ||
+ (ret = __db_vrfy_getpageinfo(vdp, next, &pip)) != 0)
+ return (ret);
+ if (pip->prev_pgno != pgno) {
+ EPRINT((env, DB_STR_A("0682",
+ "Page %lu: bad prev_pgno %lu on overflow page (should be %lu)",
+ "%lu %lu %lu"), (u_long)next,
+ (u_long)pip->prev_pgno, (u_long)pgno));
+ isbad = 1;
+ /*
+ * It's safe to continue because we have separate
+ * cycle detection.
+ */
+ }
+
+ pgno = next;
+ }
+
+ if (tlen > 0) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("0683",
+ "Page %lu: overflow item incomplete", "%lu"),
+ (u_long)pgno));
+ }
+
+done:
+err: if ((t_ret =
+ __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+ return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __db_safe_goff --
+ * Get an overflow item, very carefully, from an untrusted database,
+ * in the context of the salvager.
+ *
+ * PUBLIC: int __db_safe_goff __P((DB *, VRFY_DBINFO *,
+ * PUBLIC: db_pgno_t, DBT *, void *, u_int32_t *, u_int32_t));
+ */
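+/*
+ * Note (describing the code below): despite its void * type, buf is really
+ * the address of the caller's buffer pointer; the buffer may be grown with
+ * __os_realloc here, so on return the caller owns *buf and *bufsz reflects
+ * its allocated size.
+ */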
+int
+__db_safe_goff(dbp, vdp, pgno, dbt, buf, bufsz, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ db_pgno_t pgno;
+ DBT *dbt;
+ void *buf;
+ u_int32_t *bufsz;
+ u_int32_t flags;
+{
+ DB_MPOOLFILE *mpf;
+ PAGE *h;
+ int ret, t_ret;
+ u_int32_t bytesgot, bytes;
+ u_int8_t *src, *dest;
+
+ mpf = dbp->mpf;
+ h = NULL;
+ ret = t_ret = 0;
+ bytesgot = bytes = 0;
+
+ DB_ASSERT(dbp->env, bufsz != NULL);
+
+ /*
+ * Back up to the start of the overflow chain (if necessary) via the
+ * prev pointer of the overflow page. This guarantees we traverse the
+ * longest possible chains of overflow pages and won't be called again
+ * with a pgno earlier in the chain, stepping on ourselves.
+ */
+ for (;;) {
+ if ((ret = __memp_fget(
+ mpf, &pgno, vdp->thread_info, NULL, 0, &h)) != 0)
+ return (ret);
+
+ if (PREV_PGNO(h) == PGNO_INVALID ||
+ !IS_VALID_PGNO(PREV_PGNO(h)))
+ break;
+
+ pgno = PREV_PGNO(h);
+
+ if ((ret = __memp_fput(mpf,
+ vdp->thread_info, h, DB_PRIORITY_UNCHANGED)) != 0)
+ return (ret);
+ }
+ if ((ret = __memp_fput(
+ mpf, vdp->thread_info, h, DB_PRIORITY_UNCHANGED)) != 0)
+ return (ret);
+
+ h = NULL;
+
+ while (pgno != PGNO_INVALID && IS_VALID_PGNO(pgno)) {
+ /*
+ * Mark that we're looking at this page; if we've seen it
+ * already, quit.
+ */
+ if ((ret = __db_salvage_markdone(vdp, pgno)) != 0)
+ break;
+
+ if ((ret = __memp_fget(mpf, &pgno,
+ vdp->thread_info, NULL, 0, &h)) != 0)
+ break;
+
+ /*
+ * Make sure it's really an overflow page, unless we're
+ * being aggressive, in which case we pretend it is.
+ */
+ if (!LF_ISSET(DB_AGGRESSIVE) && TYPE(h) != P_OVERFLOW) {
+ ret = DB_VERIFY_BAD;
+ break;
+ }
+
+ src = (u_int8_t *)h + P_OVERHEAD(dbp);
+ bytes = OV_LEN(h);
+
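+ /* Clamp a corrupt length so the copy cannot run past the page. */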
+ if (bytes + P_OVERHEAD(dbp) > dbp->pgsize)
+ bytes = dbp->pgsize - P_OVERHEAD(dbp);
+
+ /* Realloc if the buffer is too small. */
+ if (bytesgot + bytes > *bufsz) {
+ if ((ret =
+ __os_realloc(dbp->env, bytesgot + bytes, buf)) != 0)
+ break;
+ *bufsz = bytesgot + bytes;
+ }
+
+ dest = *(u_int8_t **)buf + bytesgot;
+ bytesgot += bytes;
+
+ memcpy(dest, src, bytes);
+
+ pgno = NEXT_PGNO(h);
+
+ if ((ret = __memp_fput(mpf,
+ vdp->thread_info, h, DB_PRIORITY_UNCHANGED)) != 0)
+ break;
+ h = NULL;
+ }
+
+ /*
+ * If we're being aggressive, salvage a partial datum if there
+ * was an error somewhere along the way.
+ */
+ if (ret == 0 || LF_ISSET(DB_AGGRESSIVE)) {
+ dbt->size = bytesgot;
+ dbt->data = *(void **)buf;
+ }
+
+ /* If we broke out on error, don't leave pages pinned. */
+ if (h != NULL && (t_ret = __memp_fput(mpf,
+ vdp->thread_info, h, DB_PRIORITY_UNCHANGED)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
diff --git a/src/db/db_pr.c b/src/db/db_pr.c
new file mode 100644
index 00000000..d95440f9
--- /dev/null
+++ b/src/db/db_pr.c
@@ -0,0 +1,1956 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/heap.h"
+#include "dbinc/mp.h"
+#include "dbinc/partition.h"
+#include "dbinc/qam.h"
+#include "dbinc/db_verify.h"
+
+static int __db_bmeta __P((ENV *, DB *, BTMETA *, u_int32_t));
+static int __db_heapmeta __P((ENV *, DB *, HEAPMETA *, u_int32_t));
+static int __db_heapint __P((DB *, HEAPPG *, u_int32_t));
+static int __db_hmeta __P((ENV *, DB *, HMETA *, u_int32_t));
+static void __db_meta __P((ENV *, DB *, DBMETA *, FN const *, u_int32_t));
+static void __db_proff __P((ENV *, DB_MSGBUF *, void *));
+static int __db_qmeta __P((ENV *, DB *, QMETA *, u_int32_t));
+#ifdef HAVE_STATISTICS
+static void __db_prdb __P((DB *, u_int32_t));
+static int __db_prtree __P((DB *, DB_TXN *,
+ u_int32_t, db_pgno_t, db_pgno_t));
+#endif
+
+/*
+ * __db_loadme --
+ * A nice place to put a breakpoint.
+ *
+ * PUBLIC: void __db_loadme __P((void));
+ */
+void
+__db_loadme()
+{
+ pid_t pid;
+
+ __os_id(NULL, &pid, NULL);
+}
+
+#ifdef HAVE_STATISTICS
+/*
+ * __db_dumptree --
+ * Dump the tree to a file.
+ *
+ * PUBLIC: int __db_dumptree __P((DB *, DB_TXN *,
+ * PUBLIC: char *, char *, db_pgno_t, db_pgno_t));
+ */
+int
+__db_dumptree(dbp, txn, op, name, first, last)
+ DB *dbp;
+ DB_TXN *txn;
+ char *op, *name;
+ db_pgno_t first, last;
+{
+ ENV *env;
+ FILE *fp, *orig_fp;
+ u_int32_t flags;
+ int ret;
+
+ env = dbp->env;
+
+ for (flags = 0; *op != '\0'; ++op)
+ switch (*op) {
+ case 'a':
+ LF_SET(DB_PR_PAGE);
+ break;
+ case 'h':
+ break;
+ case 'r':
+ LF_SET(DB_PR_RECOVERYTEST);
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ if (name != NULL) {
+ if ((fp = fopen(name, "w")) == NULL)
+ return (__os_get_errno());
+
+ orig_fp = dbp->dbenv->db_msgfile;
+ dbp->dbenv->db_msgfile = fp;
+ } else
+ fp = orig_fp = NULL;
+
+ __db_prdb(dbp, flags);
+
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+
+ ret = __db_prtree(dbp, txn, flags, first, last);
+
+ if (fp != NULL) {
+ (void)fclose(fp);
+ env->dbenv->db_msgfile = orig_fp;
+ }
+
+ return (ret);
+}
+
+static const FN __db_flags_fn[] = {
+ { DB_AM_CHKSUM, "checksumming" },
+ { DB_AM_COMPENSATE, "created by compensating transaction" },
+ { DB_AM_CREATED, "database created" },
+ { DB_AM_CREATED_MSTR, "encompassing file created" },
+ { DB_AM_DBM_ERROR, "dbm/ndbm error" },
+ { DB_AM_DELIMITER, "variable length" },
+ { DB_AM_DISCARD, "discard cached pages" },
+ { DB_AM_DUP, "duplicates" },
+ { DB_AM_DUPSORT, "sorted duplicates" },
+ { DB_AM_ENCRYPT, "encrypted" },
+ { DB_AM_FIXEDLEN, "fixed-length records" },
+ { DB_AM_INMEM, "in-memory" },
+ { DB_AM_IN_RENAME, "file is being renamed" },
+ { DB_AM_NOT_DURABLE, "changes not logged" },
+ { DB_AM_OPEN_CALLED, "open called" },
+ { DB_AM_PAD, "pad value" },
+ { DB_AM_PGDEF, "default page size" },
+ { DB_AM_RDONLY, "read-only" },
+ { DB_AM_READ_UNCOMMITTED, "read-uncommitted" },
+ { DB_AM_RECNUM, "Btree record numbers" },
+ { DB_AM_RECOVER, "opened for recovery" },
+ { DB_AM_RENUMBER, "renumber" },
+ { DB_AM_REVSPLITOFF, "no reverse splits" },
+ { DB_AM_SECONDARY, "secondary" },
+ { DB_AM_SNAPSHOT, "load on open" },
+ { DB_AM_SUBDB, "subdatabases" },
+ { DB_AM_SWAP, "needswap" },
+ { DB_AM_TXN, "transactional" },
+ { DB_AM_VERIFYING, "verifier" },
+ { 0, NULL }
+};
+
+/*
+ * __db_get_flags_fn --
+ * Return the __db_flags_fn array.
+ *
+ * PUBLIC: const FN * __db_get_flags_fn __P((void));
+ */
+const FN *
+__db_get_flags_fn()
+{
+ return (__db_flags_fn);
+}
+
+/*
+ * __db_prdb --
+ * Print out the DB structure information.
+ */
+static void
+__db_prdb(dbp, flags)
+ DB *dbp;
+ u_int32_t flags;
+{
+ BTREE *bt;
+ DB_MSGBUF mb;
+ ENV *env;
+ HASH *h;
+ QUEUE *q;
+ HEAP *hp;
+
+ env = dbp->env;
+
+ DB_MSGBUF_INIT(&mb);
+ __db_msg(env, "In-memory DB structure:");
+ __db_msgadd(env, &mb, "%s: %#lx",
+ __db_dbtype_to_string(dbp->type), (u_long)dbp->flags);
+ __db_prflags(env, &mb, dbp->flags, __db_flags_fn, " (", ")");
+ DB_MSGBUF_FLUSH(env, &mb);
+
+ switch (dbp->type) {
+ case DB_BTREE:
+ case DB_RECNO:
+ bt = dbp->bt_internal;
+ __db_msg(env, "bt_meta: %lu bt_root: %lu",
+ (u_long)bt->bt_meta, (u_long)bt->bt_root);
+ __db_msg(env, "bt_minkey: %lu", (u_long)bt->bt_minkey);
+ if (!LF_ISSET(DB_PR_RECOVERYTEST))
+ __db_msg(env, "bt_compare: %#lx bt_prefix: %#lx",
+ P_TO_ULONG(bt->bt_compare),
+ P_TO_ULONG(bt->bt_prefix));
+#ifdef HAVE_COMPRESSION
+ if (!LF_ISSET(DB_PR_RECOVERYTEST))
+ __db_msg(env, "bt_compress: %#lx bt_decompress: %#lx",
+ P_TO_ULONG(bt->bt_compress),
+ P_TO_ULONG(bt->bt_decompress));
+#endif
+ __db_msg(env, "bt_lpgno: %lu", (u_long)bt->bt_lpgno);
+ if (dbp->type == DB_RECNO) {
+ __db_msg(env,
+ "re_pad: %#lx re_delim: %#lx re_len: %lu re_source: %s",
+ (u_long)bt->re_pad, (u_long)bt->re_delim,
+ (u_long)bt->re_len,
+ bt->re_source == NULL ? "" : bt->re_source);
+ __db_msg(env,
+ "re_modified: %d re_eof: %d re_last: %lu",
+ bt->re_modified, bt->re_eof, (u_long)bt->re_last);
+ }
+ break;
+ case DB_HASH:
+ h = dbp->h_internal;
+ __db_msg(env, "meta_pgno: %lu", (u_long)h->meta_pgno);
+ __db_msg(env, "h_ffactor: %lu", (u_long)h->h_ffactor);
+ __db_msg(env, "h_nelem: %lu", (u_long)h->h_nelem);
+ if (!LF_ISSET(DB_PR_RECOVERYTEST))
+ __db_msg(env, "h_hash: %#lx", P_TO_ULONG(h->h_hash));
+ break;
+ case DB_QUEUE:
+ q = dbp->q_internal;
+ __db_msg(env, "q_meta: %lu", (u_long)q->q_meta);
+ __db_msg(env, "q_root: %lu", (u_long)q->q_root);
+ __db_msg(env, "re_pad: %#lx re_len: %lu",
+ (u_long)q->re_pad, (u_long)q->re_len);
+ __db_msg(env, "rec_page: %lu", (u_long)q->rec_page);
+ __db_msg(env, "page_ext: %lu", (u_long)q->page_ext);
+ break;
+ case DB_HEAP:
+ hp = dbp->heap_internal;
+ __db_msg(env, "gbytes: %lu", (u_long)hp->gbytes);
+ __db_msg(env, "bytes: %lu", (u_long)hp->bytes);
+ __db_msg(env, "curregion: %lu", (u_long)hp->curregion);
+ __db_msg(env, "region_size: %lu", (u_long)hp->region_size);
+ __db_msg(env, "maxpgno: %lu", (u_long)hp->maxpgno);
+ break;
+ case DB_UNKNOWN:
+ default:
+ break;
+ }
+}
+
+/*
+ * __db_prtree --
+ * Print out the entire tree.
+ */
+static int
+__db_prtree(dbp, txn, flags, first, last)
+ DB *dbp;
+ DB_TXN *txn;
+ u_int32_t flags;
+ db_pgno_t first, last;
+{
+ DB_MPOOLFILE *mpf;
+ PAGE *h;
+ db_pgno_t i;
+ int ret;
+
+ mpf = dbp->mpf;
+
+ if (dbp->type == DB_QUEUE)
+ return (__db_prqueue(dbp, flags));
+
+ /*
+ * Find out the page number of the last page in the database, then
+ * dump each page.
+ */
+ if (last == PGNO_INVALID &&
+ (ret = __memp_get_last_pgno(mpf, &last)) != 0)
+ return (ret);
+ for (i = first; i <= last; ++i) {
+ if ((ret = __memp_fget(mpf, &i, NULL, txn, 0, &h)) != 0)
+ return (ret);
+ (void)__db_prpage(dbp, h, flags);
+ if ((ret = __memp_fput(mpf, NULL, h, dbp->priority)) != 0)
+ return (ret);
+ }
+
+ return (0);
+}
+
+/*
+ * __db_prnpage
+ * -- Print out a specific page.
+ *
+ * PUBLIC: int __db_prnpage __P((DB *, DB_TXN *, db_pgno_t));
+ */
+int
+__db_prnpage(dbp, txn, pgno)
+ DB *dbp;
+ DB_TXN *txn;
+ db_pgno_t pgno;
+{
+ DB_MPOOLFILE *mpf;
+ PAGE *h;
+ int ret, t_ret;
+
+ mpf = dbp->mpf;
+
+ if ((ret = __memp_fget(mpf, &pgno, NULL, txn, 0, &h)) != 0)
+ return (ret);
+
+ ret = __db_prpage(dbp, h, DB_PR_PAGE);
+
+ if ((t_ret = __memp_fput(mpf, NULL, h, dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __db_prpage
+ * -- Print out a page.
+ *
+ * PUBLIC: int __db_prpage __P((DB *, PAGE *, u_int32_t));
+ */
+int
+__db_prpage(dbp, h, flags)
+ DB *dbp;
+ PAGE *h;
+ u_int32_t flags;
+{
+ DB_MSGBUF mb;
+ u_int32_t pagesize;
+ /*
+ * !!!
+ * Find out the page size. We don't want to do it the "right" way,
+ * by reading the value from the meta-data page, because that would
+ * be slow. Reach down into the mpool region instead.
+ */
+ pagesize = (u_int32_t)dbp->mpf->mfp->pagesize;
+ DB_MSGBUF_INIT(&mb);
+ return (__db_prpage_int(dbp->env,
+ &mb, dbp, "", h, pagesize, NULL, flags));
+}
+
+/*
+ * __db_lockmode_to_string --
+ * Return the name of the lock mode.
+ *
+ * PUBLIC: const char * __db_lockmode_to_string __P((db_lockmode_t));
+ */
+const char *
+__db_lockmode_to_string(mode)
+ db_lockmode_t mode;
+{
+ switch (mode) {
+ case DB_LOCK_NG:
+ return ("Not granted");
+ case DB_LOCK_READ:
+ return ("Shared/read");
+ case DB_LOCK_WRITE:
+ return ("Exclusive/write");
+ case DB_LOCK_WAIT:
+ return ("Wait for event");
+ case DB_LOCK_IWRITE:
+ return ("Intent exclusive/write");
+ case DB_LOCK_IREAD:
+ return ("Intent shared/read");
+ case DB_LOCK_IWR:
+ return ("Intent to read/write");
+ case DB_LOCK_READ_UNCOMMITTED:
+ return ("Read uncommitted");
+ case DB_LOCK_WWRITE:
+ return ("Was written");
+ default:
+ break;
+ }
+ return ("UNKNOWN LOCK MODE");
+}
+
+#else /* !HAVE_STATISTICS */
+
+/*
+ * __db_dumptree --
+ * Dump the tree to a file.
+ *
+ * PUBLIC: int __db_dumptree __P((DB *, DB_TXN *,
+ * PUBLIC: char *, char *, db_pgno_t, db_pgno_t));
+ */
+int
+__db_dumptree(dbp, txn, op, name, first, last)
+ DB *dbp;
+ DB_TXN *txn;
+ char *op, *name;
+ db_pgno_t first, last;
+{
+ COMPQUIET(txn, NULL);
+ COMPQUIET(op, NULL);
+ COMPQUIET(name, NULL);
+ COMPQUIET(first, last);
+
+ return (__db_stat_not_built(dbp->env));
+}
+
+/*
+ * __db_get_flags_fn --
+ * Return the __db_flags_fn array.
+ *
+ * PUBLIC: const FN * __db_get_flags_fn __P((void));
+ */
+const FN *
+__db_get_flags_fn()
+{
+ /*
+ * !!!
+ * The Tcl API uses this interface, stub it off.
+ */
+ return (NULL);
+}
+#endif
+
+/*
+ * __db_meta --
+ * Print out common metadata information.
+ */
+static void
+__db_meta(env, dbp, dbmeta, fn, flags)
+ ENV *env;
+ DB *dbp;
+ DBMETA *dbmeta;
+ FN const *fn;
+ u_int32_t flags;
+{
+ DB_MPOOLFILE *mpf;
+ DB_MSGBUF mb;
+ PAGE *h;
+ db_pgno_t pgno;
+ u_int8_t *p;
+ int cnt, ret;
+ const char *sep;
+
+ DB_MSGBUF_INIT(&mb);
+
+ __db_msg(env, "\tmagic: %#lx", (u_long)dbmeta->magic);
+ __db_msg(env, "\tversion: %lu", (u_long)dbmeta->version);
+ __db_msg(env, "\tpagesize: %lu", (u_long)dbmeta->pagesize);
+ __db_msg(env, "\ttype: %lu", (u_long)dbmeta->type);
+ __db_msg(env, "\tmetaflags %#lx", (u_long)dbmeta->metaflags);
+ __db_msg(env, "\tkeys: %lu\trecords: %lu",
+ (u_long)dbmeta->key_count, (u_long)dbmeta->record_count);
+ if (dbmeta->nparts)
+ __db_msg(env, "\tnparts: %lu", (u_long)dbmeta->nparts);
+
+ /*
+ * If we're doing recovery testing, don't display the free list;
+ * it may have changed, which would break the dump diff.
+ */
+ if (dbp != NULL && !LF_ISSET(DB_PR_RECOVERYTEST)) {
+ mpf = dbp->mpf;
+ __db_msgadd(
+ env, &mb, "\tfree list: %lu", (u_long)dbmeta->free);
+ for (pgno = dbmeta->free,
+ cnt = 0, sep = ", "; pgno != PGNO_INVALID;) {
+ if ((ret = __memp_fget(mpf,
+ &pgno, NULL, NULL, 0, &h)) != 0) {
+ DB_MSGBUF_FLUSH(env, &mb);
+ __db_msg(env,
+ "Unable to retrieve free-list page: %lu: %s",
+ (u_long)pgno, db_strerror(ret));
+ break;
+ }
+ pgno = h->next_pgno;
+ (void)__memp_fput(mpf, NULL, h, dbp->priority);
+ __db_msgadd(env, &mb, "%s%lu", sep, (u_long)pgno);
+ if (++cnt % 10 == 0) {
+ DB_MSGBUF_FLUSH(env, &mb);
+ cnt = 0;
+ sep = "\t";
+ } else
+ sep = ", ";
+ }
+ DB_MSGBUF_FLUSH(env, &mb);
+ __db_msg(env, "\tlast_pgno: %lu", (u_long)dbmeta->last_pgno);
+ }
+
+ if (fn != NULL) {
+ DB_MSGBUF_FLUSH(env, &mb);
+ __db_msgadd(env, &mb, "\tflags: %#lx", (u_long)dbmeta->flags);
+ __db_prflags(env, &mb, dbmeta->flags, fn, " (", ")");
+ }
+
+ DB_MSGBUF_FLUSH(env, &mb);
+ __db_msgadd(env, &mb, "\tuid: ");
+ for (p = (u_int8_t *)dbmeta->uid,
+ cnt = 0; cnt < DB_FILE_ID_LEN; ++cnt) {
+ __db_msgadd(env, &mb, "%x", *p++);
+ if (cnt < DB_FILE_ID_LEN - 1)
+ __db_msgadd(env, &mb, " ");
+ }
+ DB_MSGBUF_FLUSH(env, &mb);
+}
+
+/*
+ * __db_bmeta --
+ * Print out the btree meta-data page.
+ */
+static int
+__db_bmeta(env, dbp, h, flags)
+ ENV *env;
+ DB *dbp;
+ BTMETA *h;
+ u_int32_t flags;
+{
+ static const FN fn[] = {
+ { BTM_DUP, "duplicates" },
+ { BTM_RECNO, "recno" },
+ { BTM_RECNUM, "btree:recnum" },
+ { BTM_FIXEDLEN, "recno:fixed-length" },
+ { BTM_RENUMBER, "recno:renumber" },
+ { BTM_SUBDB, "multiple-databases" },
+ { BTM_DUPSORT, "sorted duplicates" },
+ { BTM_COMPRESS, "compressed" },
+ { 0, NULL }
+ };
+
+ __db_meta(env, dbp, (DBMETA *)h, fn, flags);
+
+ __db_msg(env, "\tminkey: %lu", (u_long)h->minkey);
+ if (F_ISSET(&h->dbmeta, BTM_RECNO))
+ __db_msg(env, "\tre_len: %#lx re_pad: %#lx",
+ (u_long)h->re_len, (u_long)h->re_pad);
+ __db_msg(env, "\troot: %lu", (u_long)h->root);
+
+ return (0);
+}
+
+/*
+ * __db_hmeta --
+ * Print out the hash meta-data page.
+ */
+static int
+__db_hmeta(env, dbp, h, flags)
+ ENV *env;
+ DB *dbp;
+ HMETA *h;
+ u_int32_t flags;
+{
+ static const FN fn[] = {
+ { DB_HASH_DUP, "duplicates" },
+ { DB_HASH_SUBDB, "multiple-databases" },
+ { DB_HASH_DUPSORT, "sorted duplicates" },
+ { 0, NULL }
+ };
+ DB_MSGBUF mb;
+ int i;
+
+ DB_MSGBUF_INIT(&mb);
+
+ __db_meta(env, dbp, (DBMETA *)h, fn, flags);
+
+ __db_msg(env, "\tmax_bucket: %lu", (u_long)h->max_bucket);
+ __db_msg(env, "\thigh_mask: %#lx", (u_long)h->high_mask);
+ __db_msg(env, "\tlow_mask: %#lx", (u_long)h->low_mask);
+ __db_msg(env, "\tffactor: %lu", (u_long)h->ffactor);
+ __db_msg(env, "\tnelem: %lu", (u_long)h->nelem);
+ __db_msg(env, "\th_charkey: %#lx", (u_long)h->h_charkey);
+ __db_msgadd(env, &mb, "\tspare points:\n\t");
+ for (i = 0; i < NCACHED; i++) {
+ __db_msgadd(env, &mb, "%lu (%lu) ", (u_long)h->spares[i],
+ (u_long)(h->spares[i] == 0 ?
+ 0 : h->spares[i] + (i == 0 ? 0 : 1 << (i-1))));
+ if ((i + 1) % 8 == 0)
+ __db_msgadd(env, &mb, "\n\t");
+ }
+ DB_MSGBUF_FLUSH(env, &mb);
+
+ return (0);
+}
+
+/*
+ * __db_qmeta --
+ * Print out the queue meta-data page.
+ */
+static int
+__db_qmeta(env, dbp, h, flags)
+ ENV *env;
+ DB *dbp;
+ QMETA *h;
+ u_int32_t flags;
+{
+ __db_meta(env, dbp, (DBMETA *)h, NULL, flags);
+
+ __db_msg(env, "\tfirst_recno: %lu", (u_long)h->first_recno);
+ __db_msg(env, "\tcur_recno: %lu", (u_long)h->cur_recno);
+ __db_msg(env, "\tre_len: %#lx re_pad: %lu",
+ (u_long)h->re_len, (u_long)h->re_pad);
+ __db_msg(env, "\trec_page: %lu", (u_long)h->rec_page);
+ __db_msg(env, "\tpage_ext: %lu", (u_long)h->page_ext);
+
+ return (0);
+}
+
+/*
+ * __db_heapmeta --
+ * Print out the heap meta-data page.
+ */
+static int
+__db_heapmeta(env, dbp, h, flags)
+ ENV *env;
+ DB *dbp;
+ HEAPMETA *h;
+ u_int32_t flags;
+{
+ __db_meta(env, dbp, (DBMETA *)h, NULL, flags);
+
+ __db_msg(env, "\tcurregion: %lu", (u_long)h->curregion);
+ __db_msg(env, "\tregion_size: %lu", (u_long)h->region_size);
+ __db_msg(env, "\tnregions: %lu", (u_long)h->nregions);
+ __db_msg(env, "\tgbytes: %lu", (u_long)h->gbytes);
+ __db_msg(env, "\tbytes: %lu", (u_long)h->bytes);
+
+ return (0);
+}
+
+/*
+ * __db_heapint --
+ * Print out the heap internal-data page.
+ */
+static int
+__db_heapint(dbp, h, flags)
+ DB *dbp;
+ HEAPPG *h;
+ u_int32_t flags;
+{
+ DB_MSGBUF mb;
+ ENV *env;
+ int count, printed;
+ u_int32_t i, max;
+ u_int8_t avail;
+
+ env = dbp->env;
+ DB_MSGBUF_INIT(&mb);
+ count = printed = 0;
+ COMPQUIET(flags, 0);
+
+ __db_msgadd(env, &mb, "\thigh: %4lu\n", (u_long)h->high_pgno);
+ /* How many entries could there be on a page? */
+ max = HEAP_REGION_SIZE(dbp);
+
+ for (i = 0; i < max; i++, count++) {
+ avail = HEAP_SPACE(dbp, h, i);
+ if (avail != 0) {
+ __db_msgadd(env, &mb,
+ "%5lu:%1lu ", (u_long)i, (u_long)avail);
+ printed = 1;
+ }
+ /* We get 10 entries per line this way */
+ if (count == 9) {
+ DB_MSGBUF_FLUSH(env, &mb);
+ count = -1;
+ }
+ }
+ /* All pages were less than 33% full */
+ if (printed == 0)
+ __db_msgadd(env, &mb,
+ "All pages in this region less than 33 percent full");
+
+ DB_MSGBUF_FLUSH(env, &mb);
+ return (0);
+}
+
+/*
+ * For printing pages from the log we may be passed the data segment
+ * separate from the header, if so then it starts at HOFFSET.
+ */
+#define PR_ENTRY(dbp, h, i, data) \
+ (data == NULL ? P_ENTRY(dbp, h, i) : \
+ (u_int8_t *)data + P_INP(dbp, h)[i] - HOFFSET(h))
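+/*
+ * For example (illustrative numbers): if HOFFSET(h) is 900 and
+ * P_INP(dbp, h)[i] is 940, the entry begins 40 bytes into the separately
+ * passed data segment.
+ */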
+/*
+ * __db_prpage_int
+ * -- Print out a page.
+ *
+ * PUBLIC: int __db_prpage_int __P((ENV *, DB_MSGBUF *,
+ * PUBLIC: DB *, char *, PAGE *, u_int32_t, u_int8_t *, u_int32_t));
+ */
+int
+__db_prpage_int(env, mbp, dbp, lead, h, pagesize, data, flags)
+ ENV *env;
+ DB_MSGBUF *mbp;
+ DB *dbp;
+ char *lead;
+ PAGE *h;
+ u_int32_t pagesize;
+ u_int8_t *data;
+ u_int32_t flags;
+{
+ BINTERNAL *bi;
+ BKEYDATA *bk;
+ HOFFPAGE a_hkd;
+ QAMDATA *qp, *qep;
+ RINTERNAL *ri;
+ HEAPHDR *hh;
+ HEAPSPLITHDR *hs;
+ db_indx_t dlen, len, i, *inp, max;
+ db_pgno_t pgno;
+ db_recno_t recno;
+ u_int32_t qlen;
+ u_int8_t *ep, *hk, *p;
+ int deleted, ret;
+ const char *s;
+ void *hdata, *sp;
+
+ /*
+ * If we're doing recovery testing and this page is P_INVALID,
+ * assume it's a page that's on the free list, and don't display it.
+ */
+ if (LF_ISSET(DB_PR_RECOVERYTEST) && TYPE(h) == P_INVALID)
+ return (0);
+
+ if ((s = __db_pagetype_to_string(TYPE(h))) == NULL) {
+ __db_msg(env, "%sILLEGAL PAGE TYPE: page: %lu type: %lu",
+ lead, (u_long)h->pgno, (u_long)TYPE(h));
+ return (EINVAL);
+ }
+
+ /* Page number, page type. */
+ __db_msgadd(env, mbp, "%spage %lu: %s:", lead, (u_long)h->pgno, s);
+
+ /*
+ * LSNs on a metadata page will be different from the original after an
+ * abort, in some cases. Don't display them if we're testing recovery.
+ */
+ if (!LF_ISSET(DB_PR_RECOVERYTEST) ||
+ (TYPE(h) != P_BTREEMETA && TYPE(h) != P_HASHMETA &&
+ TYPE(h) != P_QAMMETA && TYPE(h) != P_QAMDATA &&
+ TYPE(h) != P_HEAPMETA))
+ __db_msgadd(env, mbp, " LSN [%lu][%lu]:",
+ (u_long)LSN(h).file, (u_long)LSN(h).offset);
+
+ /*
+ * Page level (only applicable to Btree/Recno, but we always display
+ * it, for no particular reason, except for Heap).
+ */
+ if (!HEAPTYPE(h))
+ __db_msgadd(env, mbp, " level %lu", (u_long)h->level);
+
+ /* Record count. */
+ if (TYPE(h) == P_IBTREE || TYPE(h) == P_IRECNO ||
+ (dbp != NULL && TYPE(h) == P_LRECNO &&
+ h->pgno == ((BTREE *)dbp->bt_internal)->bt_root))
+ __db_msgadd(env, mbp, " records: %lu", (u_long)RE_NREC(h));
+ DB_MSGBUF_FLUSH(env, mbp);
+
+ switch (TYPE(h)) {
+ case P_BTREEMETA:
+ return (__db_bmeta(env, dbp, (BTMETA *)h, flags));
+ case P_HASHMETA:
+ return (__db_hmeta(env, dbp, (HMETA *)h, flags));
+ case P_QAMMETA:
+ return (__db_qmeta(env, dbp, (QMETA *)h, flags));
+ case P_QAMDATA: /* Should be meta->start. */
+ if (!LF_ISSET(DB_PR_PAGE) || dbp == NULL)
+ return (0);
+
+ qlen = ((QUEUE *)dbp->q_internal)->re_len;
+ recno = (h->pgno - 1) * QAM_RECNO_PER_PAGE(dbp) + 1;
+ i = 0;
+ qep = (QAMDATA *)((u_int8_t *)h + pagesize - qlen);
+ for (qp = QAM_GET_RECORD(dbp, h, i); qp < qep;
+ recno++, i++, qp = QAM_GET_RECORD(dbp, h, i)) {
+ if (!F_ISSET(qp, QAM_SET))
+ continue;
+
+ __db_msgadd(env, mbp, "%s",
+ F_ISSET(qp, QAM_VALID) ? "\t" : " D");
+ __db_msgadd(env, mbp, "[%03lu] %4lu ", (u_long)recno,
+ (u_long)((u_int8_t *)qp - (u_int8_t *)h));
+ __db_prbytes(env, mbp, qp->data, qlen);
+ }
+ return (0);
+ case P_HEAPMETA:
+ return (__db_heapmeta(env, dbp, (HEAPMETA *)h, flags));
+ case P_IHEAP:
+ if (!LF_ISSET(DB_PR_PAGE) || dbp == NULL)
+ return (0);
+ return (__db_heapint(dbp, (HEAPPG *)h, flags));
+ default:
+ break;
+ }
+
+ s = "\t";
+ if (!HEAPTYPE(h) && TYPE(h) != P_IBTREE && TYPE(h) != P_IRECNO) {
+ __db_msgadd(env, mbp, "%sprev: %4lu next: %4lu",
+ s, (u_long)PREV_PGNO(h), (u_long)NEXT_PGNO(h));
+ s = " ";
+ }
+
+ if (HEAPTYPE(h)) {
+ __db_msgadd(env, mbp, "%shigh indx: %4lu free indx: %4lu", s,
+ (u_long)HEAP_HIGHINDX(h), (u_long)HEAP_FREEINDX(h));
+ s = " ";
+ }
+
+ if (TYPE(h) == P_OVERFLOW) {
+ __db_msgadd(env, mbp,
+ "%sref cnt: %4lu ", s, (u_long)OV_REF(h));
+ if (dbp == NULL)
+ __db_msgadd(env, mbp,
+ " len: %4lu ", (u_long)OV_LEN(h));
+ else
+ __db_prbytes(env,
+ mbp, (u_int8_t *)h + P_OVERHEAD(dbp), OV_LEN(h));
+ return (0);
+ }
+ __db_msgadd(env, mbp, "%sentries: %4lu", s, (u_long)NUM_ENT(h));
+ __db_msgadd(env, mbp, " offset: %4lu", (u_long)HOFFSET(h));
+ DB_MSGBUF_FLUSH(env, mbp);
+
+ if (dbp == NULL || TYPE(h) == P_INVALID || !LF_ISSET(DB_PR_PAGE))
+ return (0);
+
+ if (data != NULL)
+ pagesize += HOFFSET(h);
+ else if (pagesize < HOFFSET(h))
+ return (0);
+
+ ret = 0;
+ inp = P_INP(dbp, h);
+ max = TYPE(h) == P_HEAP ? HEAP_HIGHINDX(h) + 1 : NUM_ENT(h);
+ for (i = 0; i < max; i++) {
+ if (TYPE(h) == P_HEAP && inp[i] == 0)
+ continue;
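+ /* Sanity check: the entry offset must land inside the page proper. */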
+ if ((uintptr_t)(P_ENTRY(dbp, h, i) - (u_int8_t *)h) <
+ (uintptr_t)(P_OVERHEAD(dbp)) ||
+ (size_t)(P_ENTRY(dbp, h, i) - (u_int8_t *)h) >= pagesize) {
+ __db_msg(env,
+ "ILLEGAL PAGE OFFSET: indx: %lu of %lu",
+ (u_long)i, (u_long)inp[i]);
+ ret = EINVAL;
+ continue;
+ }
+ deleted = 0;
+ switch (TYPE(h)) {
+ case P_HASH_UNSORTED:
+ case P_HASH:
+ case P_IBTREE:
+ case P_IRECNO:
+ sp = PR_ENTRY(dbp, h, i, data);
+ break;
+ case P_HEAP:
+ sp = P_ENTRY(dbp, h, i);
+ break;
+ case P_LBTREE:
+ sp = PR_ENTRY(dbp, h, i, data);
+ deleted = i % 2 == 0 &&
+ B_DISSET(GET_BKEYDATA(dbp, h, i + O_INDX)->type);
+ break;
+ case P_LDUP:
+ case P_LRECNO:
+ sp = PR_ENTRY(dbp, h, i, data);
+ deleted = B_DISSET(GET_BKEYDATA(dbp, h, i)->type);
+ break;
+ default:
+ goto type_err;
+ }
+ __db_msgadd(env, mbp, "%s", deleted ? " D" : "\t");
+ __db_msgadd(
+ env, mbp, "[%03lu] %4lu ", (u_long)i, (u_long)inp[i]);
+ switch (TYPE(h)) {
+ case P_HASH_UNSORTED:
+ case P_HASH:
+ hk = sp;
+ switch (HPAGE_PTYPE(hk)) {
+ case H_OFFDUP:
+ memcpy(&pgno,
+ HOFFDUP_PGNO(hk), sizeof(db_pgno_t));
+ __db_msgadd(env, mbp,
+ "%4lu [offpage dups]", (u_long)pgno);
+ DB_MSGBUF_FLUSH(env, mbp);
+ break;
+ case H_DUPLICATE:
+ /*
+ * If this is the first item on a page, then
+ * we cannot figure out how long it is, so
+ * we only print the first one in the duplicate
+ * set.
+ */
+ if (i != 0)
+ len = LEN_HKEYDATA(dbp, h, 0, i);
+ else
+ len = 1;
+
+ __db_msgadd(env, mbp, "Duplicates:");
+ DB_MSGBUF_FLUSH(env, mbp);
+ for (p = HKEYDATA_DATA(hk),
+ ep = p + len; p < ep;) {
+ memcpy(&dlen, p, sizeof(db_indx_t));
+ p += sizeof(db_indx_t);
+ __db_msgadd(env, mbp, "\t\t");
+ __db_prbytes(env, mbp, p, dlen);
+ p += sizeof(db_indx_t) + dlen;
+ }
+ break;
+ case H_KEYDATA:
+ __db_prbytes(env, mbp, HKEYDATA_DATA(hk),
+ LEN_HKEYDATA(dbp, h, i == 0 ?
+ pagesize : 0, i));
+ break;
+ case H_OFFPAGE:
+ memcpy(&a_hkd, hk, HOFFPAGE_SIZE);
+ __db_msgadd(env, mbp,
+ "overflow: total len: %4lu page: %4lu",
+ (u_long)a_hkd.tlen, (u_long)a_hkd.pgno);
+ DB_MSGBUF_FLUSH(env, mbp);
+ break;
+ default:
+ DB_MSGBUF_FLUSH(env, mbp);
+ __db_msg(env, "ILLEGAL HASH PAGE TYPE: %lu",
+ (u_long)HPAGE_PTYPE(hk));
+ ret = EINVAL;
+ break;
+ }
+ break;
+ case P_IBTREE:
+ bi = sp;
+
+ if (F_ISSET(dbp, DB_AM_RECNUM))
+ __db_msgadd(env, mbp,
+ "count: %4lu ", (u_long)bi->nrecs);
+ __db_msgadd(env, mbp,
+ "pgno: %4lu type: %lu ",
+ (u_long)bi->pgno, (u_long)bi->type);
+ switch (B_TYPE(bi->type)) {
+ case B_KEYDATA:
+ __db_prbytes(env, mbp, bi->data, bi->len);
+ break;
+ case B_DUPLICATE:
+ case B_OVERFLOW:
+ __db_proff(env, mbp, bi->data);
+ break;
+ default:
+ DB_MSGBUF_FLUSH(env, mbp);
+ __db_msg(env, "ILLEGAL BINTERNAL TYPE: %lu",
+ (u_long)B_TYPE(bi->type));
+ ret = EINVAL;
+ break;
+ }
+ break;
+ case P_IRECNO:
+ ri = sp;
+ __db_msgadd(env, mbp, "entries %4lu pgno %4lu",
+ (u_long)ri->nrecs, (u_long)ri->pgno);
+ DB_MSGBUF_FLUSH(env, mbp);
+ break;
+ case P_LBTREE:
+ case P_LDUP:
+ case P_LRECNO:
+ bk = sp;
+ switch (B_TYPE(bk->type)) {
+ case B_KEYDATA:
+ __db_prbytes(env, mbp, bk->data, bk->len);
+ break;
+ case B_DUPLICATE:
+ case B_OVERFLOW:
+ __db_proff(env, mbp, bk);
+ break;
+ default:
+ DB_MSGBUF_FLUSH(env, mbp);
+ __db_msg(env,
+ "ILLEGAL DUPLICATE/LBTREE/LRECNO TYPE: %lu",
+ (u_long)B_TYPE(bk->type));
+ ret = EINVAL;
+ break;
+ }
+ break;
+ case P_HEAP:
+ hh = sp;
+ if (!F_ISSET(hh, HEAP_RECSPLIT))
+ hdata = (u_int8_t *)hh + sizeof(HEAPHDR);
+ else {
+ hs = sp;
+ __db_msgadd(env, mbp,
+ "split: 0x%02x tsize: %lu next: %lu.%lu ",
+ hh->flags, (u_long)hs->tsize,
+ (u_long)hs->nextpg, (u_long)hs->nextindx);
+
+ hdata = (u_int8_t *)hh + sizeof(HEAPSPLITHDR);
+ }
+ __db_prbytes(env, mbp, hdata, hh->size);
+ break;
+ default:
+type_err: DB_MSGBUF_FLUSH(env, mbp);
+ __db_msg(env,
+ "ILLEGAL PAGE TYPE: %lu", (u_long)TYPE(h));
+ ret = EINVAL;
+ continue;
+ }
+ }
+ return (ret);
+}
+
+/*
+ * __db_prbytes --
+ * Print out a data element.
+ *
+ * PUBLIC: void __db_prbytes __P((ENV *, DB_MSGBUF *, u_int8_t *, u_int32_t));
+ */
+void
+__db_prbytes(env, mbp, bytes, len)
+ ENV *env;
+ DB_MSGBUF *mbp;
+ u_int8_t *bytes;
+ u_int32_t len;
+{
+ u_int8_t *p;
+ u_int32_t i, not_printable;
+ int msg_truncated;
+
+ __db_msgadd(env, mbp, "len: %3lu", (u_long)len);
+ if (len != 0) {
+ __db_msgadd(env, mbp, " data: ");
+
+ /*
+ * Print the first N bytes of the data. If that
+ * chunk is at least 3/4 printable characters, print
+ * it as text, else print it in hex. We have this
+ * heuristic because we're displaying things like
+ * lock objects that could be either text or data.
+ */
+ if (len > env->data_len) {
+ len = env->data_len;
+ msg_truncated = 1;
+ } else
+ msg_truncated = 0;
+ not_printable = 0;
+ for (p = bytes, i = 0; i < len; ++i, ++p) {
+ if (!isprint((int)*p) && *p != '\t' && *p != '\n') {
+ if (i == len - 1 && *p == '\0')
+ break;
+ if (++not_printable >= (len >> 2))
+ break;
+ }
+ }
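+ /* Print as text only if fewer than a quarter of the bytes are unprintable. */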
+ if (not_printable < (len >> 2))
+ for (p = bytes, i = len; i > 0; --i, ++p) {
+ if (isprint((int)*p))
+ __db_msgadd(env, mbp, "%c", *p);
+ else
+ __db_msgadd(env,
+ mbp, "\\%x", (u_int)*p);
+ }
+ else
+ for (p = bytes, i = len; i > 0; --i, ++p)
+ __db_msgadd(env, mbp, "%.2x", (u_int)*p);
+ if (msg_truncated)
+ __db_msgadd(env, mbp, "...");
+ }
+ DB_MSGBUF_FLUSH(env, mbp);
+}
+
+/*
+ * __db_proff --
+ * Print out an off-page element.
+ */
+static void
+__db_proff(env, mbp, vp)
+ ENV *env;
+ DB_MSGBUF *mbp;
+ void *vp;
+{
+ BOVERFLOW *bo;
+
+ bo = vp;
+ switch (B_TYPE(bo->type)) {
+ case B_OVERFLOW:
+ __db_msgadd(env, mbp, "overflow: total len: %4lu page: %4lu",
+ (u_long)bo->tlen, (u_long)bo->pgno);
+ break;
+ case B_DUPLICATE:
+ __db_msgadd(
+ env, mbp, "duplicate: page: %4lu", (u_long)bo->pgno);
+ break;
+ default:
+ /* NOTREACHED */
+ break;
+ }
+ DB_MSGBUF_FLUSH(env, mbp);
+}
+
+/*
+ * __db_prflags --
+ * Print out flags values.
+ *
+ * PUBLIC: void __db_prflags __P((ENV *, DB_MSGBUF *,
+ * PUBLIC: u_int32_t, const FN *, const char *, const char *));
+ */
+void
+__db_prflags(env, mbp, flags, fn, prefix, suffix)
+ ENV *env;
+ DB_MSGBUF *mbp;
+ u_int32_t flags;
+ FN const *fn;
+ const char *prefix, *suffix;
+{
+ DB_MSGBUF mb;
+ const FN *fnp;
+ int found, standalone;
+ const char *sep;
+
+ if (fn == NULL)
+ return;
+
+ /*
+ * If it's a standalone message, output the suffix (which will be the
+ * label), regardless of whether we found anything or not, and flush
+ * the line.
+ */
+ if (mbp == NULL) {
+ standalone = 1;
+ mbp = &mb;
+ DB_MSGBUF_INIT(mbp);
+ } else
+ standalone = 0;
+
+ sep = prefix == NULL ? "" : prefix;
+ for (found = 0, fnp = fn; fnp->mask != 0; ++fnp)
+ if (LF_ISSET(fnp->mask)) {
+ __db_msgadd(env, mbp, "%s%s", sep, fnp->name);
+ sep = ", ";
+ found = 1;
+ }
+
+ if ((standalone || found) && suffix != NULL)
+ __db_msgadd(env, mbp, "%s", suffix);
+ if (standalone)
+ DB_MSGBUF_FLUSH(env, mbp);
+}
+
+/*
+ * __db_name_to_val --
+ * Return the integral value associated with the string, or -1 if missing.
+ * It is intended for looking up string names of enums and single-bit
+ * flags in order to get a numeric value.
+ *
+ * PUBLIC: int __db_name_to_val __P((FN const *, char *));
+ */
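+/*
+ * For example (illustrative): __db_name_to_val(__db_flags_fn, "duplicates")
+ * returns (int)DB_AM_DUP, while an unrecognized name returns -1.
+ */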
+int
+__db_name_to_val(strtable, s)
+ FN const *strtable;
+ char *s;
+{
+ if (s != NULL) {
+ do {
+ if (strcasecmp(strtable->name, s) == 0)
+ return ((int)strtable->mask);
+ } while ((++strtable)->name != NULL);
+ }
+ return (-1);
+}
+
+/*
+ * __db_pagetype_to_string --
+ * Return the name of the specified page type.
+ * PUBLIC: const char *__db_pagetype_to_string __P((u_int32_t));
+ */
+const char *
+__db_pagetype_to_string(type)
+ u_int32_t type;
+{
+ char *s;
+
+ s = NULL;
+ switch (type) {
+ case P_BTREEMETA:
+ s = "btree metadata";
+ break;
+ case P_LDUP:
+ s = "duplicate";
+ break;
+ case P_HASH_UNSORTED:
+ s = "hash unsorted";
+ break;
+ case P_HASH:
+ s = "hash";
+ break;
+ case P_HASHMETA:
+ s = "hash metadata";
+ break;
+ case P_IBTREE:
+ s = "btree internal";
+ break;
+ case P_INVALID:
+ s = "invalid";
+ break;
+ case P_IRECNO:
+ s = "recno internal";
+ break;
+ case P_LBTREE:
+ s = "btree leaf";
+ break;
+ case P_LRECNO:
+ s = "recno leaf";
+ break;
+ case P_OVERFLOW:
+ s = "overflow";
+ break;
+ case P_QAMMETA:
+ s = "queue metadata";
+ break;
+ case P_QAMDATA:
+ s = "queue";
+ break;
+ case P_HEAPMETA:
+ s = "heap metadata";
+ break;
+ case P_HEAP:
+ s = "heap data";
+ break;
+ case P_IHEAP:
+ s = "heap internal";
+ break;
+ default:
+ /* Just return NULL. */
+ break;
+ }
+ return (s);
+}
+
+/*
+ * __db_dump_pp --
+ * DB->dump pre/post processing.
+ *
+ * PUBLIC: int __db_dump_pp __P((DB *, const char *,
+ * PUBLIC: int (*)(void *, const void *), void *, int, int));
+ */
+int
+__db_dump_pp(dbp, subname, callback, handle, pflag, keyflag)
+ DB *dbp;
+ const char *subname;
+ int (*callback) __P((void *, const void *));
+ void *handle;
+ int pflag, keyflag;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ret, t_ret;
+
+ env = dbp->env;
+
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->dump");
+
+ ENV_ENTER(env, ip);
+
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check && (ret = __db_rep_enter(dbp, 1, 0, 1)) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ ret = __db_dump(dbp, subname, callback, handle, pflag, keyflag);
+
+ /* Release replication block. */
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+err: ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __db_dump --
+ * DB->dump.
+ *
+ * PUBLIC: int __db_dump __P((DB *, const char *,
+ * PUBLIC: int (*)(void *, const void *), void *, int, int));
+ */
+int
+__db_dump(dbp, subname, callback, handle, pflag, keyflag)
+ DB *dbp;
+ const char *subname;
+ int (*callback) __P((void *, const void *));
+ void *handle;
+ int pflag, keyflag;
+{
+ DBC *dbcp;
+ DBT key, data;
+ DBT keyret, dataret;
+ DB_HEAP_RID rid;
+ ENV *env;
+ db_recno_t recno;
+ int is_recno, is_heap, ret, t_ret;
+ void *pointer;
+
+ env = dbp->env;
+ is_heap = 0;
+
+ if ((ret = __db_prheader(
+ dbp, subname, pflag, keyflag, handle, callback, NULL, 0)) != 0)
+ return (ret);
+
+ /*
+ * Get a cursor and step through the database, printing out each
+ * key/data pair.
+ */
+ if ((ret = __db_cursor(dbp, NULL, NULL, &dbcp, 0)) != 0)
+ return (ret);
+
+ memset(&key, 0, sizeof(key));
+ memset(&data, 0, sizeof(data));
+ if ((ret = __os_malloc(env, 1024 * 1024, &data.data)) != 0)
+ goto err;
+ data.ulen = 1024 * 1024;
+ data.flags = DB_DBT_USERMEM;
+ is_recno = (dbp->type == DB_RECNO || dbp->type == DB_QUEUE);
+ keyflag = is_recno ? keyflag : 1;
+ if (is_recno) {
+ keyret.data = &recno;
+ keyret.size = sizeof(recno);
+ }
+
+ if (dbp->type == DB_HEAP) {
+ is_heap = 1;
+ key.data = &rid;
+ key.size = key.ulen = sizeof(DB_HEAP_RID);
+ key.flags = DB_DBT_USERMEM;
+ }
+
+retry: while ((ret =
+ __dbc_get(dbcp, &key, &data,
+ !is_heap ? DB_NEXT | DB_MULTIPLE_KEY : DB_NEXT)) == 0) {
+ if (is_heap) {
+ /* Never dump keys for HEAP */
+ if ((ret = __db_prdbt(
+ &data, pflag, " ", handle, callback, 0, 0)) != 0)
+ goto err;
+ continue;
+ }
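+ /* Walk every key/data pair returned by the bulk get. */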
+ DB_MULTIPLE_INIT(pointer, &data);
+ for (;;) {
+ if (is_recno)
+ DB_MULTIPLE_RECNO_NEXT(pointer, &data,
+ recno, dataret.data, dataret.size);
+ else
+ DB_MULTIPLE_KEY_NEXT(pointer, &data,
+ keyret.data, keyret.size,
+ dataret.data, dataret.size);
+
+ if (dataret.data == NULL)
+ break;
+
+ if ((keyflag &&
+ (ret = __db_prdbt(&keyret, pflag, " ",
+ handle, callback, is_recno, 0)) != 0) ||
+ (ret = __db_prdbt(&dataret, pflag, " ",
+ handle, callback, 0, 0)) != 0)
+ goto err;
+ }
+ }
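+ /* The bulk buffer was too small: round it up and retry the get. */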
+ if (ret == DB_BUFFER_SMALL) {
+ data.size = (u_int32_t)DB_ALIGN(data.size, 1024);
+ if ((ret = __os_realloc(env, data.size, &data.data)) != 0)
+ goto err;
+ data.ulen = data.size;
+ goto retry;
+ }
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+
+ if ((t_ret = __db_prfooter(handle, callback)) != 0 && ret == 0)
+ ret = t_ret;
+
+err: if ((t_ret = __dbc_close(dbcp)) != 0 && ret == 0)
+ ret = t_ret;
+ if (data.data != NULL)
+ __os_free(env, data.data);
+
+ return (ret);
+}
+
+/*
+ * __db_prdbt --
+ * Print out a DBT data element.
+ *
+ * PUBLIC: int __db_prdbt __P((DBT *, int, const char *, void *,
+ * PUBLIC: int (*)(void *, const void *), int, int));
+ */
+int
+__db_prdbt(dbtp, checkprint, prefix, handle, callback, is_recno, is_heap)
+ DBT *dbtp;
+ int checkprint;
+ const char *prefix;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ int is_recno;
+ int is_heap;
+{
+ static const u_char hex[] = "0123456789abcdef";
+ db_recno_t recno;
+ DB_HEAP_RID rid;
+ size_t len;
+ int ret;
+#define DBTBUFLEN 100
+ u_int8_t *p, *hp;
+ char buf[DBTBUFLEN], hbuf[DBTBUFLEN];
+
+ /*
+ * !!!
+ * This is the routine that dumps out items in the format
+ * used by db_dump(1) and db_load(1). This means that the format
+ * cannot change.
+ */
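+/*
+ * For example (derived from the loops below): in bytevalue format each byte
+ * becomes two lowercase hex digits, so the two-byte datum {0x01, 'a'} is
+ * written as the line "0161".
+ */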
+ if (prefix != NULL && (ret = callback(handle, prefix)) != 0)
+ return (ret);
+ if (is_recno) {
+ /*
+ * We're printing a record number, and this has to be done
+ * in a platform-independent way. So we use the numeral in
+ * straight ASCII.
+ */
+ (void)__ua_memcpy(&recno, dbtp->data, sizeof(recno));
+ snprintf(buf, DBTBUFLEN, "%lu", (u_long)recno);
+
+ /* If we're printing data as hex, print keys as hex too. */
+ if (!checkprint) {
+ for (len = strlen(buf), p = (u_int8_t *)buf,
+ hp = (u_int8_t *)hbuf; len-- > 0; ++p) {
+ *hp++ = hex[(u_int8_t)(*p & 0xf0) >> 4];
+ *hp++ = hex[*p & 0x0f];
+ }
+ *hp = '\0';
+ ret = callback(handle, hbuf);
+ } else
+ ret = callback(handle, buf);
+
+ if (ret != 0)
+ return (ret);
+ } else if (is_heap) {
+ /*
+ * We're printing a heap record number, and this has to be
+ * done in a platform-independent way. So we use the numeral
+ * in straight ASCII.
+ */
+ (void)__ua_memcpy(&rid, dbtp->data, sizeof(rid));
+ snprintf(buf, DBTBUFLEN, "%lu %hu",
+ (u_long)rid.pgno, (u_short)rid.indx);
+
+ /* If we're printing data as hex, print keys as hex too. */
+ if (!checkprint) {
+ for (len = strlen(buf), p = (u_int8_t *)buf,
+ hp = (u_int8_t *)hbuf; len-- > 0; ++p) {
+ *hp++ = hex[(u_int8_t)(*p & 0xf0) >> 4];
+ *hp++ = hex[*p & 0x0f];
+ }
+ *hp = '\0';
+ ret = callback(handle, hbuf);
+ } else
+ ret = callback(handle, buf);
+
+ if (ret != 0)
+ return (ret);
+ } else if (checkprint) {
+ for (len = dbtp->size, p = dbtp->data; len--; ++p)
+ if (isprint((int)*p)) {
+ if (*p == '\\' &&
+ (ret = callback(handle, "\\")) != 0)
+ return (ret);
+ snprintf(buf, DBTBUFLEN, "%c", *p);
+ if ((ret = callback(handle, buf)) != 0)
+ return (ret);
+ } else {
+ snprintf(buf, DBTBUFLEN, "\\%c%c",
+ hex[(u_int8_t)(*p & 0xf0) >> 4],
+ hex[*p & 0x0f]);
+ if ((ret = callback(handle, buf)) != 0)
+ return (ret);
+ }
+ } else
+ for (len = dbtp->size, p = dbtp->data; len--; ++p) {
+ snprintf(buf, DBTBUFLEN, "%c%c",
+ hex[(u_int8_t)(*p & 0xf0) >> 4],
+ hex[*p & 0x0f]);
+ if ((ret = callback(handle, buf)) != 0)
+ return (ret);
+ }
+
+ return (callback(handle, "\n"));
+}
+
+/*
+ * __db_prheader --
+ * Write out header information in the format expected by db_load.
+ *
+ * PUBLIC: int __db_prheader __P((DB *, const char *, int, int, void *,
+ * PUBLIC: int (*)(void *, const void *), VRFY_DBINFO *, db_pgno_t));
+ */
+int
+__db_prheader(dbp, subname, pflag, keyflag, handle, callback, vdp, meta_pgno)
+ DB *dbp;
+ const char *subname;
+ int pflag, keyflag;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ VRFY_DBINFO *vdp;
+ db_pgno_t meta_pgno;
+{
+ DBT dbt;
+ DBTYPE dbtype;
+ ENV *env;
+ VRFY_PAGEINFO *pip;
+ u_int32_t flags, tmp_u_int32;
+ size_t buflen;
+ char *buf;
+ int using_vdp, ret, t_ret, tmp_int;
+#ifdef HAVE_HEAP
+ u_int32_t tmp2_u_int32;
+#endif
+
+ ret = 0;
+ buf = NULL;
+ COMPQUIET(buflen, 0);
+
+ /*
+ * If dbp is NULL, then pip is guaranteed to be non-NULL; we only ever
+ * call __db_prheader with a NULL dbp from one case inside __db_prdbt,
+ * and this is a special subdatabase for "lost" items. In this case
+ * we have a vdp (from which we'll get a pip). In all other cases, we
+ * will have a non-NULL dbp (and vdp may or may not be NULL depending
+ * on whether we're salvaging).
+ */
+ if (dbp == NULL)
+ env = NULL;
+ else
+ env = dbp->env;
+ DB_ASSERT(env, dbp != NULL || vdp != NULL);
+
+ /*
+ * If we've been passed a verifier statistics object, use that; we're
+ * being called in a context where dbp->stat is unsafe.
+ *
+ * Also, the verifier may set the pflag on a per-salvage basis. If so,
+ * respect that.
+ */
+ if (vdp != NULL) {
+ if ((ret = __db_vrfy_getpageinfo(vdp, meta_pgno, &pip)) != 0)
+ return (ret);
+
+ if (F_ISSET(vdp, SALVAGE_PRINTABLE))
+ pflag = 1;
+ using_vdp = 1;
+ } else {
+ pip = NULL;
+ using_vdp = 0;
+ }
+
+ /*
+ * If dbp is NULL, make it a btree. Otherwise, set dbtype to whatever
+ * appropriate type for the specified meta page, or the type of the dbp.
+ */
+ if (dbp == NULL)
+ dbtype = DB_BTREE;
+ else if (using_vdp)
+ switch (pip->type) {
+ case P_BTREEMETA:
+ if (F_ISSET(pip, VRFY_IS_RECNO))
+ dbtype = DB_RECNO;
+ else
+ dbtype = DB_BTREE;
+ break;
+ case P_HASHMETA:
+ dbtype = DB_HASH;
+ break;
+ case P_HEAPMETA:
+ dbtype = DB_HEAP;
+ break;
+ case P_QAMMETA:
+ dbtype = DB_QUEUE;
+ break;
+ default:
+ /*
+ * If the meta page is of a bogus type, it's because
+ * we have a badly corrupt database. (We must be in
+ * the verifier for pip to be non-NULL.) Pretend we're
+ * a Btree and salvage what we can.
+ */
+ DB_ASSERT(env, F_ISSET(dbp, DB_AM_VERIFYING));
+ dbtype = DB_BTREE;
+ break;
+ }
+ else
+ dbtype = dbp->type;
+
+ if ((ret = callback(handle, "VERSION=3\n")) != 0)
+ goto err;
+ if (pflag) {
+ if ((ret = callback(handle, "format=print\n")) != 0)
+ goto err;
+ } else if ((ret = callback(handle, "format=bytevalue\n")) != 0)
+ goto err;
+
+ /*
+ * 64 bytes is long enough, as a minimum bound, for any of the
+ * fields besides subname. Subname uses __db_prdbt and therefore
+ * does not need buffer space here.
+ */
+ buflen = 64;
+ if ((ret = __os_malloc(env, buflen, &buf)) != 0)
+ goto err;
+ if (subname != NULL) {
+ snprintf(buf, buflen, "database=");
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
+ DB_INIT_DBT(dbt, subname, strlen(subname));
+ if ((ret = __db_prdbt(&dbt, 1,
+ NULL, handle, callback, 0, 0)) != 0)
+ goto err;
+ }
+ switch (dbtype) {
+ case DB_BTREE:
+ if ((ret = callback(handle, "type=btree\n")) != 0)
+ goto err;
+ if (using_vdp)
+ tmp_int = F_ISSET(pip, VRFY_HAS_RECNUMS) ? 1 : 0;
+ else {
+ if ((ret = __db_get_flags(dbp, &flags)) != 0) {
+ __db_err(env, ret, "DB->get_flags");
+ goto err;
+ }
+ tmp_int = F_ISSET(dbp, DB_AM_RECNUM) ? 1 : 0;
+ }
+ if (tmp_int && (ret = callback(handle, "recnum=1\n")) != 0)
+ goto err;
+
+ if (using_vdp)
+ tmp_u_int32 = pip->bt_minkey;
+ else
+ if ((ret =
+ __bam_get_bt_minkey(dbp, &tmp_u_int32)) != 0) {
+ __db_err(env, ret, "DB->get_bt_minkey");
+ goto err;
+ }
+ if (tmp_u_int32 != 0 && tmp_u_int32 != DEFMINKEYPAGE) {
+ snprintf(buf, buflen,
+ "bt_minkey=%lu\n", (u_long)tmp_u_int32);
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
+ }
+ break;
+ case DB_HASH:
+#ifdef HAVE_HASH
+ if ((ret = callback(handle, "type=hash\n")) != 0)
+ goto err;
+ if (using_vdp)
+ tmp_u_int32 = pip->h_ffactor;
+ else
+ if ((ret =
+ __ham_get_h_ffactor(dbp, &tmp_u_int32)) != 0) {
+ __db_err(env, ret, "DB->get_h_ffactor");
+ goto err;
+ }
+ if (tmp_u_int32 != 0) {
+ snprintf(buf, buflen,
+ "h_ffactor=%lu\n", (u_long)tmp_u_int32);
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
+ }
+
+ if (using_vdp)
+ tmp_u_int32 = pip->h_nelem;
+ else
+ if ((ret = __ham_get_h_nelem(dbp, &tmp_u_int32)) != 0) {
+ __db_err(env, ret, "DB->get_h_nelem");
+ goto err;
+ }
+ /*
+ * Hash databases have an h_nelem field of 0 or 1; neither
+ * of those values is interesting.
+ */
+ if (tmp_u_int32 > 1) {
+ snprintf(buf, buflen,
+ "h_nelem=%lu\n", (u_long)tmp_u_int32);
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
+ }
+ break;
+#else
+ ret = __db_no_hash_am(env);
+ goto err;
+#endif
+ case DB_HEAP:
+#ifdef HAVE_HEAP
+ if ((ret = callback(handle, "type=heap\n")) != 0)
+ goto err;
+
+ if ((ret = __heap_get_heapsize(
+ dbp, &tmp_u_int32, &tmp2_u_int32)) != 0) {
+ __db_err(env, ret, "DB->get_heapsize");
+ goto err;
+ }
+ if (tmp_u_int32 != 0) {
+ snprintf(buf,
+ buflen, "heap_gbytes=%lu\n", (u_long)tmp_u_int32);
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
+ }
+ if (tmp2_u_int32 != 0) {
+ snprintf(buf,
+ buflen, "heap_bytes=%lu\n", (u_long)tmp2_u_int32);
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
+ }
+
+ if ((ret =
+ __heap_get_heap_regionsize(dbp, &tmp_u_int32)) != 0) {
+ __db_err(env, ret, "DB->get_heap_regionsize");
+ goto err;
+ }
+ if (tmp_u_int32 != 0) {
+ snprintf(buf, buflen,
+ "heap_regionsize=%lu\n", (u_long)tmp_u_int32);
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
+ }
+ break;
+#else
+ ret = __db_no_heap_am(env);
+ goto err;
+#endif
+ case DB_QUEUE:
+#ifdef HAVE_QUEUE
+ if ((ret = callback(handle, "type=queue\n")) != 0)
+ goto err;
+ if (using_vdp)
+ tmp_u_int32 = vdp->re_len;
+ else
+ if ((ret = __ram_get_re_len(dbp, &tmp_u_int32)) != 0) {
+ __db_err(env, ret, "DB->get_re_len");
+ goto err;
+ }
+ snprintf(buf, buflen, "re_len=%lu\n", (u_long)tmp_u_int32);
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
+
+ if (using_vdp)
+ tmp_int = (int)vdp->re_pad;
+ else
+ if ((ret = __ram_get_re_pad(dbp, &tmp_int)) != 0) {
+ __db_err(env, ret, "DB->get_re_pad");
+ goto err;
+ }
+ if (tmp_int != 0 && tmp_int != ' ') {
+ snprintf(buf, buflen, "re_pad=%#x\n", tmp_int);
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
+ }
+
+ if (using_vdp)
+ tmp_u_int32 = vdp->page_ext;
+ else
+ if ((ret =
+ __qam_get_extentsize(dbp, &tmp_u_int32)) != 0) {
+ __db_err(env, ret, "DB->get_q_extentsize");
+ goto err;
+ }
+ if (tmp_u_int32 != 0) {
+ snprintf(buf, buflen,
+ "extentsize=%lu\n", (u_long)tmp_u_int32);
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
+ }
+ break;
+#else
+ ret = __db_no_queue_am(env);
+ goto err;
+#endif
+ case DB_RECNO:
+ if ((ret = callback(handle, "type=recno\n")) != 0)
+ goto err;
+ if (using_vdp)
+ tmp_int = F_ISSET(pip, VRFY_IS_RRECNO) ? 1 : 0;
+ else
+ tmp_int = F_ISSET(dbp, DB_AM_RENUMBER) ? 1 : 0;
+ if (tmp_int != 0 &&
+ (ret = callback(handle, "renumber=1\n")) != 0)
+ goto err;
+
+ if (using_vdp)
+ tmp_int = F_ISSET(pip, VRFY_IS_FIXEDLEN) ? 1 : 0;
+ else
+ tmp_int = F_ISSET(dbp, DB_AM_FIXEDLEN) ? 1 : 0;
+ if (tmp_int) {
+ if (using_vdp)
+ tmp_u_int32 = pip->re_len;
+ else
+ if ((ret =
+ __ram_get_re_len(dbp, &tmp_u_int32)) != 0) {
+ __db_err(env, ret, "DB->get_re_len");
+ goto err;
+ }
+ snprintf(buf, buflen,
+ "re_len=%lu\n", (u_long)tmp_u_int32);
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
+
+ if (using_vdp)
+ tmp_int = (int)pip->re_pad;
+ else
+ if ((ret =
+ __ram_get_re_pad(dbp, &tmp_int)) != 0) {
+ __db_err(env, ret, "DB->get_re_pad");
+ goto err;
+ }
+ if (tmp_int != 0 && tmp_int != ' ') {
+ snprintf(buf,
+ buflen, "re_pad=%#x\n", (u_int)tmp_int);
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
+ }
+ }
+ break;
+ case DB_UNKNOWN: /* Impossible. */
+ ret = __db_unknown_path(env, "__db_prheader");
+ goto err;
+ }
+
+ if (using_vdp) {
+ if (F_ISSET(pip, VRFY_HAS_CHKSUM))
+ if ((ret = callback(handle, "chksum=1\n")) != 0)
+ goto err;
+ if (F_ISSET(pip, VRFY_HAS_DUPS))
+ if ((ret = callback(handle, "duplicates=1\n")) != 0)
+ goto err;
+ if (F_ISSET(pip, VRFY_HAS_DUPSORT))
+ if ((ret = callback(handle, "dupsort=1\n")) != 0)
+ goto err;
+#ifdef HAVE_COMPRESSION
+ if (F_ISSET(pip, VRFY_HAS_COMPRESS))
+ if ((ret = callback(handle, "compressed=1\n")) != 0)
+ goto err;
+#endif
+ /*
+ * !!!
+ * We don't know if the page size was the default if we're
+ * salvaging. It doesn't seem that interesting to have, so
+ * we ignore it for now.
+ */
+ } else {
+ if (F_ISSET(dbp, DB_AM_CHKSUM))
+ if ((ret = callback(handle, "chksum=1\n")) != 0)
+ goto err;
+ if (F_ISSET(dbp, DB_AM_DUP))
+ if ((ret = callback(handle, "duplicates=1\n")) != 0)
+ goto err;
+ if (F_ISSET(dbp, DB_AM_DUPSORT))
+ if ((ret = callback(handle, "dupsort=1\n")) != 0)
+ goto err;
+#ifdef HAVE_COMPRESSION
+ if (DB_IS_COMPRESSED(dbp))
+ if ((ret = callback(handle, "compressed=1\n")) != 0)
+ goto err;
+#endif
+ if (!F_ISSET(dbp, DB_AM_PGDEF)) {
+ snprintf(buf, buflen,
+ "db_pagesize=%lu\n", (u_long)dbp->pgsize);
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
+ }
+ }
+
+#ifdef HAVE_PARTITION
+ if (dbp != NULL && DB_IS_PARTITIONED(dbp) &&
+ F_ISSET((DB_PARTITION *)dbp->p_internal, PART_RANGE)) {
+ DBT *keys;
+ u_int32_t i;
+
+ if ((ret = __partition_get_keys(dbp, &tmp_u_int32, &keys)) != 0)
+ goto err;
+ if (tmp_u_int32 != 0) {
+ snprintf(buf,
+ buflen, "nparts=%lu\n", (u_long)tmp_u_int32);
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
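+			/* n partitions are separated by n - 1 range keys. */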
+ for (i = 0; i < tmp_u_int32 - 1; i++)
+ if ((ret = __db_prdbt(&keys[i],
+ pflag, " ", handle, callback, 0, 0)) != 0)
+ goto err;
+ }
+ }
+#endif
+
+ if (keyflag && (ret = callback(handle, "keys=1\n")) != 0)
+ goto err;
+
+ ret = callback(handle, "HEADER=END\n");
+
+err: if (using_vdp &&
+ (t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+ if (buf != NULL)
+ __os_free(env, buf);
+
+ return (ret);
+}
+
+/*
+ * __db_prfooter --
+ * Print the footer that marks the end of a DB dump. This is trivial,
+ * but for consistency's sake we don't want to put its literal contents
+ * in multiple places.
+ *
+ * PUBLIC: int __db_prfooter __P((void *, int (*)(void *, const void *)));
+ */
+int
+__db_prfooter(handle, callback)
+ void *handle;
+ int (*callback) __P((void *, const void *));
+{
+ return (callback(handle, "DATA=END\n"));
+}
+
+/*
+ * __db_pr_callback --
+ * Callback function for using pr_* functions from C.
+ *
+ * PUBLIC: int __db_pr_callback __P((void *, const void *));
+ */
+int
+__db_pr_callback(handle, str_arg)
+ void *handle;
+ const void *str_arg;
+{
+ char *str;
+ FILE *f;
+
+ str = (char *)str_arg;
+ f = (FILE *)handle;
+
+ if (fprintf(f, "%s", str) != (int)strlen(str))
+ return (EIO);
+
+ return (0);
+}
+
+/*
+ * __db_dbtype_to_string --
+ * Return the name of the database type.
+ *
+ * PUBLIC: const char * __db_dbtype_to_string __P((DBTYPE));
+ */
+const char *
+__db_dbtype_to_string(type)
+ DBTYPE type;
+{
+ switch (type) {
+ case DB_BTREE:
+ return ("btree");
+ case DB_HASH:
+ return ("hash");
+ case DB_RECNO:
+ return ("recno");
+ case DB_QUEUE:
+ return ("queue");
+ case DB_HEAP:
+ return ("heap");
+ case DB_UNKNOWN:
+ default:
+ break;
+ }
+ return ("UNKNOWN TYPE");
+}
diff --git a/src/db/db_rec.c b/src/db/db_rec.c
new file mode 100644
index 00000000..8ba1124e
--- /dev/null
+++ b/src/db/db_rec.c
@@ -0,0 +1,2796 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/lock.h"
+#include "dbinc/fop.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+
+static int __db_pg_free_recover_int __P((ENV *, DB_THREAD_INFO *,
+ __db_pg_freedata_args *, DB *, DB_LSN *, DB_MPOOLFILE *, db_recops, int));
+static int __db_pg_free_recover_42_int __P((ENV *, DB_THREAD_INFO *,
+ __db_pg_freedata_42_args *,
+ DB *, DB_LSN *, DB_MPOOLFILE *, db_recops, int));
+
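+/*
+ * A note on the LSN checks throughout this file: cmp_p compares a page's
+ * current LSN with the LSN the log record says the page had before the
+ * change, so cmp_p == 0 during roll-forward means the change has not yet
+ * been applied; cmp_n compares the page's LSN with this record's own LSN,
+ * so cmp_n == 0 during roll-back means the change is on the page and must
+ * be undone.
+ */
+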
+/*
+ * PUBLIC: int __db_addrem_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ *
+ * This log message is generated whenever we add or remove a duplicate
+ * to/from a duplicate page. On recovery, we just do the opposite.
+ */
+int
+__db_addrem_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __db_addrem_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ int cmp_n, cmp_p, modified, ret;
+ u_int32_t opcode;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__db_addrem_print);
+ REC_INTRO(__db_addrem_read, ip, 1);
+
+ REC_FGET(mpf, ip, argp->pgno, &pagep, done);
+ modified = 0;
+
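+	/* Extract the operation from the packed opcode field. */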
+ opcode = OP_MODE_GET(argp->opcode);
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+ if ((cmp_p == 0 && DB_REDO(op) && opcode == DB_ADD_DUP) ||
+ (cmp_n == 0 && DB_UNDO(op) && opcode == DB_REM_DUP)) {
+ /* Need to redo an add, or undo a delete. */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ if ((ret = __db_pitem(dbc, pagep, argp->indx, argp->nbytes,
+ argp->hdr.size == 0 ? NULL : &argp->hdr,
+ argp->dbt.size == 0 ? NULL : &argp->dbt)) != 0)
+ goto out;
+ modified = 1;
+
+ } else if ((cmp_n == 0 && DB_UNDO(op) && opcode == DB_ADD_DUP) ||
+ (cmp_p == 0 && DB_REDO(op) && opcode == DB_REM_DUP)) {
+ /* Need to undo an add, or redo a delete. */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ if ((ret = __db_ditem(dbc,
+ pagep, argp->indx, argp->nbytes)) != 0)
+ goto out;
+ modified = 1;
+ }
+
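+	/*
+	 * If we changed the page, roll its LSN forward to this record on
+	 * redo, or back to the logged pre-update LSN on undo.
+	 */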
+ if (modified) {
+ if (DB_REDO(op))
+ LSN(pagep) = *lsnp;
+ else
+ LSN(pagep) = argp->pagelsn;
+ }
+
+ if ((ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, dbc->priority);
+ REC_CLOSE;
+}
+
+/*
+ * PUBLIC: int __db_addrem_42_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ *
+ * This log message is generated whenever we add or remove a duplicate
+ * to/from a duplicate page. On recovery, we just do the opposite.
+ */
+int
+__db_addrem_42_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __db_addrem_42_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ int cmp_n, cmp_p, modified, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__db_addrem_print);
+ REC_INTRO(__db_addrem_42_read, ip, 1);
+
+ REC_FGET(mpf, ip, argp->pgno, &pagep, done);
+ modified = 0;
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+ if ((cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_ADD_DUP) ||
+ (cmp_n == 0 && DB_UNDO(op) && argp->opcode == DB_REM_DUP)) {
+ /* Need to redo an add, or undo a delete. */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ if ((ret = __db_pitem(dbc, pagep, argp->indx, argp->nbytes,
+ argp->hdr.size == 0 ? NULL : &argp->hdr,
+ argp->dbt.size == 0 ? NULL : &argp->dbt)) != 0)
+ goto out;
+ modified = 1;
+
+ } else if ((cmp_n == 0 && DB_UNDO(op) && argp->opcode == DB_ADD_DUP) ||
+ (cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_REM_DUP)) {
+ /* Need to undo an add, or redo a delete. */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ if ((ret = __db_ditem(dbc,
+ pagep, argp->indx, argp->nbytes)) != 0)
+ goto out;
+ modified = 1;
+ }
+
+ if (modified) {
+ if (DB_REDO(op))
+ LSN(pagep) = *lsnp;
+ else
+ LSN(pagep) = argp->pagelsn;
+ }
+
+ if ((ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, dbc->priority);
+ REC_CLOSE;
+}
+
+/*
+ * PUBLIC: int __db_big_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_big_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __db_big_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ int cmp_n, cmp_p, modified, ret;
+ u_int32_t opcode;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__db_big_print);
+ REC_INTRO(__db_big_read, ip, 0);
+
+ opcode = OP_MODE_GET(argp->opcode);
+ REC_FGET(mpf, ip, argp->pgno, &pagep, ppage);
+ modified = 0;
+
+ /*
+	 * There are three pages we need to check: the one on which we are
+	 * adding data, the previous one whose next_pointer may have
+ * been updated, and the next one whose prev_pointer may have
+ * been updated.
+ */
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+ if ((cmp_p == 0 && DB_REDO(op) && opcode == DB_ADD_BIG) ||
+ (cmp_n == 0 && DB_UNDO(op) && opcode == DB_REM_BIG)) {
+ /* We are either redo-ing an add, or undoing a delete. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ P_INIT(pagep, file_dbp->pgsize, argp->pgno, argp->prev_pgno,
+ argp->next_pgno, 0, P_OVERFLOW);
+ OV_LEN(pagep) = argp->dbt.size;
+ OV_REF(pagep) = 1;
+ memcpy((u_int8_t *)pagep + P_OVERHEAD(file_dbp), argp->dbt.data,
+ argp->dbt.size);
+ PREV_PGNO(pagep) = argp->prev_pgno;
+ modified = 1;
+ } else if ((cmp_n == 0 && DB_UNDO(op) && opcode == DB_ADD_BIG) ||
+ (cmp_p == 0 && DB_REDO(op) && opcode == DB_REM_BIG)) {
+ /*
+ * We are either undo-ing an add or redo-ing a delete.
+ * The page is about to be reclaimed in either case, so
+ * there really isn't anything to do here.
+ */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ modified = 1;
+ } else if (cmp_p == 0 && DB_REDO(op) && opcode == DB_APPEND_BIG) {
+ /* We are redoing an append. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ memcpy((u_int8_t *)pagep + P_OVERHEAD(file_dbp) +
+ OV_LEN(pagep), argp->dbt.data, argp->dbt.size);
+ OV_LEN(pagep) += argp->dbt.size;
+ modified = 1;
+ } else if (cmp_n == 0 && DB_UNDO(op) && opcode == DB_APPEND_BIG) {
+ /* We are undoing an append. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ OV_LEN(pagep) -= argp->dbt.size;
+ memset((u_int8_t *)pagep + P_OVERHEAD(file_dbp) +
+ OV_LEN(pagep), 0, argp->dbt.size);
+ modified = 1;
+ }
+ if (modified)
+ LSN(pagep) = DB_REDO(op) ? *lsnp : argp->pagelsn;
+
+ ret = __memp_fput(mpf, ip, pagep, file_dbp->priority);
+ pagep = NULL;
+ if (ret != 0)
+ goto out;
+
+ /*
+ * We only delete a whole chain of overflow items, and appends only
+ * apply to a single page. Adding a page is the only case that
+ * needs to update the chain.
+ */
+ppage: if (opcode != DB_ADD_BIG)
+ goto done;
+
+ /* Now check the previous page. */
+ if (argp->prev_pgno != PGNO_INVALID) {
+ REC_FGET(mpf, ip, argp->prev_pgno, &pagep, npage);
+ modified = 0;
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->prevlsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->prevlsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+
+ if (cmp_p == 0 && DB_REDO(op) && opcode == DB_ADD_BIG) {
+ /* Redo add, undo delete. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ NEXT_PGNO(pagep) = argp->pgno;
+ modified = 1;
+ } else if (cmp_n == 0 &&
+ DB_UNDO(op) && opcode == DB_ADD_BIG) {
+ /* Redo delete, undo add. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ NEXT_PGNO(pagep) = argp->next_pgno;
+ modified = 1;
+ }
+ if (modified)
+ LSN(pagep) = DB_REDO(op) ? *lsnp : argp->prevlsn;
+ ret = __memp_fput(mpf, ip, pagep, file_dbp->priority);
+ pagep = NULL;
+ if (ret != 0)
+ goto out;
+ }
+ pagep = NULL;
+
+ /* Now check the next page. Can only be set on a delete. */
+npage: if (argp->next_pgno != PGNO_INVALID) {
+ REC_FGET(mpf, ip, argp->next_pgno, &pagep, done);
+ modified = 0;
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->nextlsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->nextlsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ PREV_PGNO(pagep) = PGNO_INVALID;
+ modified = 1;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ PREV_PGNO(pagep) = argp->pgno;
+ modified = 1;
+ }
+ if (modified)
+ LSN(pagep) = DB_REDO(op) ? *lsnp : argp->nextlsn;
+ ret = __memp_fput(mpf, ip, pagep, file_dbp->priority);
+ pagep = NULL;
+ if (ret != 0)
+ goto out;
+ }
+ pagep = NULL;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+ REC_CLOSE;
+}
+
+/*
+ * PUBLIC: int __db_big_42_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_big_42_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __db_big_42_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ int cmp_n, cmp_p, modified, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__db_big_print);
+ REC_INTRO(__db_big_42_read, ip, 0);
+
+ REC_FGET(mpf, ip, argp->pgno, &pagep, ppage);
+ modified = 0;
+
+ /*
+	 * There are three pages we need to check: the one on which we are
+	 * adding data, the previous one whose next_pointer may have
+ * been updated, and the next one whose prev_pointer may have
+ * been updated.
+ */
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+ if ((cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_ADD_BIG) ||
+ (cmp_n == 0 && DB_UNDO(op) && argp->opcode == DB_REM_BIG)) {
+ /* We are either redo-ing an add, or undoing a delete. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ P_INIT(pagep, file_dbp->pgsize, argp->pgno, argp->prev_pgno,
+ argp->next_pgno, 0, P_OVERFLOW);
+ OV_LEN(pagep) = argp->dbt.size;
+ OV_REF(pagep) = 1;
+ memcpy((u_int8_t *)pagep + P_OVERHEAD(file_dbp), argp->dbt.data,
+ argp->dbt.size);
+ PREV_PGNO(pagep) = argp->prev_pgno;
+ modified = 1;
+ } else if ((cmp_n == 0 && DB_UNDO(op) && argp->opcode == DB_ADD_BIG) ||
+ (cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_REM_BIG)) {
+ /*
+ * We are either undo-ing an add or redo-ing a delete.
+ * The page is about to be reclaimed in either case, so
+ * there really isn't anything to do here.
+ */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ modified = 1;
+ } else if (cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_APPEND_BIG) {
+ /* We are redoing an append. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ memcpy((u_int8_t *)pagep + P_OVERHEAD(file_dbp) +
+ OV_LEN(pagep), argp->dbt.data, argp->dbt.size);
+ OV_LEN(pagep) += argp->dbt.size;
+ modified = 1;
+ } else if (cmp_n == 0 && DB_UNDO(op) && argp->opcode == DB_APPEND_BIG) {
+ /* We are undoing an append. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ OV_LEN(pagep) -= argp->dbt.size;
+ memset((u_int8_t *)pagep + P_OVERHEAD(file_dbp) +
+ OV_LEN(pagep), 0, argp->dbt.size);
+ modified = 1;
+ }
+ if (modified)
+ LSN(pagep) = DB_REDO(op) ? *lsnp : argp->pagelsn;
+
+ ret = __memp_fput(mpf, ip, pagep, file_dbp->priority);
+ pagep = NULL;
+ if (ret != 0)
+ goto out;
+
+ /*
+ * We only delete a whole chain of overflow items, and appends only
+ * apply to a single page. Adding a page is the only case that
+ * needs to update the chain.
+ */
+ppage: if (argp->opcode != DB_ADD_BIG)
+ goto done;
+
+ /* Now check the previous page. */
+ if (argp->prev_pgno != PGNO_INVALID) {
+ REC_FGET(mpf, ip, argp->prev_pgno, &pagep, npage);
+ modified = 0;
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->prevlsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->prevlsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+
+ if (cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_ADD_BIG) {
+ /* Redo add, undo delete. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ NEXT_PGNO(pagep) = argp->pgno;
+ modified = 1;
+ } else if (cmp_n == 0 &&
+ DB_UNDO(op) && argp->opcode == DB_ADD_BIG) {
+ /* Redo delete, undo add. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ NEXT_PGNO(pagep) = argp->next_pgno;
+ modified = 1;
+ }
+ if (modified)
+ LSN(pagep) = DB_REDO(op) ? *lsnp : argp->prevlsn;
+ ret = __memp_fput(mpf, ip, pagep, file_dbp->priority);
+ pagep = NULL;
+ if (ret != 0)
+ goto out;
+ }
+ pagep = NULL;
+
+ /* Now check the next page. Can only be set on a delete. */
+npage: if (argp->next_pgno != PGNO_INVALID) {
+ REC_FGET(mpf, ip, argp->next_pgno, &pagep, done);
+ modified = 0;
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->nextlsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->nextlsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ PREV_PGNO(pagep) = PGNO_INVALID;
+ modified = 1;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ PREV_PGNO(pagep) = argp->pgno;
+ modified = 1;
+ }
+ if (modified)
+ LSN(pagep) = DB_REDO(op) ? *lsnp : argp->nextlsn;
+ ret = __memp_fput(mpf, ip, pagep, file_dbp->priority);
+ pagep = NULL;
+ if (ret != 0)
+ goto out;
+ }
+ pagep = NULL;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __db_ovref_recover --
+ * Recovery function for __db_ovref().
+ *
+ * PUBLIC: int __db_ovref_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_ovref_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __db_ovref_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ int cmp, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__db_ovref_print);
+ REC_INTRO(__db_ovref_read, ip, 0);
+
+ REC_FGET(mpf, ip, argp->pgno, &pagep, done);
+
+ cmp = LOG_COMPARE(&LSN(pagep), &argp->lsn);
+ CHECK_LSN(env, op, cmp, &LSN(pagep), &argp->lsn);
+ if (cmp == 0 && DB_REDO(op)) {
+ /* Need to redo update described. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ OV_REF(pagep) += argp->adjust;
+ pagep->lsn = *lsnp;
+ } else if (LOG_COMPARE(lsnp, &LSN(pagep)) == 0 && DB_UNDO(op)) {
+ /* Need to undo update described. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ OV_REF(pagep) -= argp->adjust;
+ pagep->lsn = argp->lsn;
+ }
+ ret = __memp_fput(mpf, ip, pagep, file_dbp->priority);
+ pagep = NULL;
+ if (ret != 0)
+ goto out;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __db_debug_recover --
+ * Recovery function for debug.
+ *
+ * PUBLIC: int __db_debug_recover __P((ENV *,
+ * PUBLIC: DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_debug_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __db_debug_args *argp;
+ int ret;
+
+ COMPQUIET(op, DB_TXN_ABORT);
+ COMPQUIET(info, NULL);
+
+ REC_PRINT(__db_debug_print);
+ REC_NOOP_INTRO(__db_debug_read);
+
+ *lsnp = argp->prev_lsn;
+ ret = 0;
+
+ REC_NOOP_CLOSE;
+}
+
+/*
+ * __db_noop_recover --
+ *	Recovery function for noop records, which log no change other
+ *	than a page LSN update.
+ *
+ * PUBLIC: int __db_noop_recover __P((ENV *,
+ * PUBLIC: DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_noop_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __db_noop_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ int cmp_n, cmp_p, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__db_noop_print);
+ REC_INTRO(__db_noop_read, ip, 0);
+
+ REC_FGET(mpf, ip, argp->pgno, &pagep, done);
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->prevlsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->prevlsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ LSN(pagep) = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ LSN(pagep) = argp->prevlsn;
+ }
+ ret = __memp_fput(mpf, ip, pagep, file_dbp->priority);
+ pagep = NULL;
+
+done: *lsnp = argp->prev_lsn;
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf,
+ ip, pagep, file_dbp->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __db_pg_alloc_recover --
+ * Recovery function for pg_alloc.
+ *
+ * PUBLIC: int __db_pg_alloc_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_pg_alloc_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __db_pg_alloc_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DBMETA *meta;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ db_pgno_t pgno;
+ int cmp_n, cmp_p, created, level, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ meta = NULL;
+ pagep = NULL;
+ created = 0;
+ REC_PRINT(__db_pg_alloc_print);
+ REC_INTRO(__db_pg_alloc_read, ip, 0);
+
+ /*
+ * Fix up the metadata page. If we're redoing the operation, we have
+ * to get the metadata page and update its LSN and its free pointer.
+ * If we're undoing the operation and the page was ever created, we put
+ * it on the freelist.
+ */
+ pgno = PGNO_BASE_MD;
+ if ((ret = __memp_fget(mpf, &pgno, ip, NULL, 0, &meta)) != 0) {
+ /* The metadata page must always exist on redo. */
+ if (DB_REDO(op)) {
+ ret = __db_pgerr(file_dbp, pgno, ret);
+ goto out;
+ } else
+ goto done;
+ }
+ cmp_n = LOG_COMPARE(lsnp, &LSN(meta));
+ cmp_p = LOG_COMPARE(&LSN(meta), &argp->meta_lsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(meta), &argp->meta_lsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(meta), lsnp);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Need to redo update described. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &meta);
+ LSN(meta) = *lsnp;
+ meta->free = argp->next;
+ if (argp->pgno > meta->last_pgno)
+ meta->last_pgno = argp->pgno;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Need to undo update described. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &meta);
+ LSN(meta) = argp->meta_lsn;
+ /*
+		 * If the page has a zero LSN then it was newly created and
+		 * will be truncated rather than put on the free list.
+ */
+ if (!IS_ZERO_LSN(argp->page_lsn))
+ meta->free = argp->pgno;
+ meta->last_pgno = argp->last_pgno;
+ }
+
+#ifdef HAVE_FTRUNCATE
+ /*
+	 * Check to see if we are keeping a sorted freelist; if so, put
+	 * this page back in the in-memory list. It must be the first element.
+ */
+ if (op == DB_TXN_ABORT && !IS_ZERO_LSN(argp->page_lsn)) {
+ db_pgno_t *list;
+ u_int32_t nelem;
+
+ if ((ret = __memp_get_freelist(mpf, &nelem, &list)) != 0)
+ goto out;
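+		/* Insert the page at the head unless it is already there. */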
+ if (list != NULL && (nelem == 0 || *list != argp->pgno)) {
+ if ((ret =
+ __memp_extend_freelist(mpf, nelem + 1, &list)) != 0)
+ goto out;
+ if (nelem != 0)
+ memmove(list + 1, list, nelem * sizeof(*list));
+ *list = argp->pgno;
+ }
+ }
+#endif
+
+ /*
+ * Fix up the allocated page. If the page does not exist
+	 * and we can truncate it, then don't create it.
+ * Otherwise if we're redoing the operation, we have
+ * to get the page (creating it if it doesn't exist), and update its
+ * LSN. If we're undoing the operation, we have to reset the page's
+ * LSN and put it on the free list.
+ */
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
+ /*
+ * We have to be able to identify if a page was newly
+ * created so we can recover it properly. We cannot simply
+ * look for an empty header, because hash uses a pgin
+ * function that will set the header. Instead, we explicitly
+ * try for the page without CREATE and if that fails, then
+ * create it.
+ */
+ if (DB_UNDO(op))
+ goto do_truncate;
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL,
+ DB_MPOOL_CREATE, &pagep)) != 0) {
+ if (DB_UNDO(op) && ret == ENOSPC)
+ goto do_truncate;
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ }
+ created = 1;
+ }
+
+ /* Fix up the allocated page. */
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->page_lsn);
+
+ /*
+ * If an initial allocation is aborted and then reallocated during
+ * an archival restore the log record will have an LSN for the page
+ * but the page will be empty.
+ */
+ if (IS_ZERO_LSN(LSN(pagep)))
+ cmp_p = 0;
+
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->page_lsn);
+ /*
+ * Another special case we have to handle is if we ended up with a
+ * page of all 0's which can happen if we abort between allocating a
+ * page in mpool and initializing it. In that case, even if we're
+ * undoing, we need to re-initialize the page.
+ */
+ if (DB_REDO(op) && cmp_p == 0) {
+ /* Need to redo update described. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ switch (argp->ptype) {
+ case P_LBTREE:
+ case P_LRECNO:
+ case P_LDUP:
+ level = LEAFLEVEL;
+ break;
+ default:
+ level = 0;
+ break;
+ }
+ P_INIT(pagep, file_dbp->pgsize,
+ argp->pgno, PGNO_INVALID, PGNO_INVALID, level, argp->ptype);
+
+ pagep->lsn = *lsnp;
+ } else if (DB_UNDO(op) && (cmp_n == 0 || created)) {
+ /*
+ * This is where we handle the case of a 0'd page (pagep->pgno
+ * is equal to PGNO_INVALID).
+ * Undo the allocation, reinitialize the page and
+ * link its next pointer to the free list.
+ */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ P_INIT(pagep, file_dbp->pgsize,
+ argp->pgno, PGNO_INVALID, argp->next, 0, P_INVALID);
+
+ pagep->lsn = argp->page_lsn;
+ }
+
+do_truncate:
+ /*
+ * If the page was newly created, give it back.
+ */
+ if ((pagep == NULL || IS_ZERO_LSN(LSN(pagep))) &&
+ IS_ZERO_LSN(argp->page_lsn) && DB_UNDO(op)) {
+ /* Discard the page. */
+ if (pagep != NULL) {
+ if ((ret = __memp_fput(mpf, ip,
+ pagep, DB_PRIORITY_VERY_LOW)) != 0)
+ goto out;
+ pagep = NULL;
+ }
+ /* Give the page back to the OS. */
+ if (meta->last_pgno <= argp->pgno && (ret = __memp_ftruncate(
+ mpf, NULL, ip, argp->pgno, MP_TRUNC_RECOVER)) != 0)
+ goto out;
+ }
+
+ if (pagep != NULL) {
+ ret = __memp_fput(mpf, ip, pagep, file_dbp->priority);
+ pagep = NULL;
+ if (ret != 0)
+ goto out;
+ }
+
+ ret = __memp_fput(mpf, ip, meta, file_dbp->priority);
+ meta = NULL;
+ if (ret != 0)
+ goto out;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+ if (meta != NULL)
+ (void)__memp_fput(mpf, ip, meta, file_dbp->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __db_pg_free_recover_int --
+ */
+static int
+__db_pg_free_recover_int(env, ip, argp, file_dbp, lsnp, mpf, op, data)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ __db_pg_freedata_args *argp;
+ DB *file_dbp;
+ DB_LSN *lsnp;
+ DB_MPOOLFILE *mpf;
+ db_recops op;
+ int data;
+{
+ DBMETA *meta;
+ DB_LSN copy_lsn;
+ PAGE *pagep, *prevp;
+ int cmp_n, cmp_p, is_meta, ret;
+
+ meta = NULL;
+ pagep = prevp = NULL;
+
+ /*
+ * Get the "metapage". This will either be the metapage
+ * or the previous page in the free list if we are doing
+ * sorted allocations. If its a previous page then
+ * we will not be truncating.
+ */
+ is_meta = argp->meta_pgno == PGNO_BASE_MD;
+
+ REC_FGET(mpf, ip, argp->meta_pgno, &meta, check_meta);
+
+ if (argp->meta_pgno != PGNO_BASE_MD)
+ prevp = (PAGE *)meta;
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(meta));
+ cmp_p = LOG_COMPARE(&LSN(meta), &argp->meta_lsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(meta), &argp->meta_lsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(meta), lsnp);
+
+ /*
+ * Fix up the metadata page. If we're redoing or undoing the operation
+ * we get the page and update its LSN, last and free pointer.
+ */
+ if (cmp_p == 0 && DB_REDO(op)) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &meta);
+ /*
+ * If we are at the end of the file truncate, otherwise
+ * put on the free list.
+ */
+#ifdef HAVE_FTRUNCATE
+ if (argp->pgno == argp->last_pgno)
+ meta->last_pgno = argp->pgno - 1;
+ else
+#endif
+ if (is_meta)
+ meta->free = argp->pgno;
+ else
+ NEXT_PGNO(prevp) = argp->pgno;
+ LSN(meta) = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Need to undo the deallocation. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &meta);
+ if (is_meta) {
+ if (meta->last_pgno < argp->pgno)
+ meta->last_pgno = argp->pgno;
+ meta->free = argp->next;
+ } else
+ NEXT_PGNO(prevp) = argp->next;
+ LSN(meta) = argp->meta_lsn;
+ }
+
+check_meta:
+ if (ret != 0 && is_meta) {
+ /* The metadata page must always exist. */
+ ret = __db_pgerr(file_dbp, argp->meta_pgno, ret);
+ goto out;
+ }
+
+ /*
+ * Get the freed page. Don't create the page if we are going to
+ * free it. If we're redoing the operation we get the page and
+ * explicitly discard its contents, then update its LSN. If we're
+ * undoing the operation, we get the page and restore its header.
+ */
+ if (DB_REDO(op) || (is_meta && meta->last_pgno < argp->pgno)) {
+ if ((ret = __memp_fget(mpf, &argp->pgno,
+ ip, NULL, 0, &pagep)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND)
+ goto out;
+#ifdef HAVE_FTRUNCATE
+ if (is_meta &&
+ DB_REDO(op) && meta->last_pgno <= argp->pgno)
+ goto trunc;
+#endif
+ goto done;
+ }
+ } else if ((ret = __memp_fget(mpf, &argp->pgno,
+ ip, NULL, DB_MPOOL_CREATE, &pagep)) != 0)
+ goto out;
+
+ (void)__ua_memcpy(&copy_lsn, &LSN(argp->header.data), sizeof(DB_LSN));
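+	/*
+	 * Treat a zero page LSN as matching this record so that a freshly
+	 * created page can still be restored on undo.
+	 */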
+ cmp_n = IS_ZERO_LSN(LSN(pagep)) ? 0 : LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &copy_lsn);
+
+ /*
+ * This page got extended by a later allocation,
+ * but its allocation was not in the scope of this
+ * recovery pass.
+ */
+ if (IS_ZERO_LSN(LSN(pagep)))
+ cmp_p = 0;
+
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &copy_lsn);
+ /*
+	 * We need to check that the page could have the current LSN,
+	 * which was copied before it was truncated, in addition to
+	 * the usual check for the previous LSN.
+ */
+ if (DB_REDO(op) &&
+ (cmp_p == 0 || cmp_n == 0 ||
+ (IS_ZERO_LSN(copy_lsn) &&
+ LOG_COMPARE(&LSN(pagep), &argp->meta_lsn) <= 0))) {
+ /* Need to redo the deallocation. */
+ /*
+ * The page can be truncated if it was truncated at runtime
+ * and the current metapage reflects the truncation.
+ */
+#ifdef HAVE_FTRUNCATE
+ if (is_meta && meta->last_pgno <= argp->pgno &&
+ argp->last_pgno <= argp->pgno) {
+ if ((ret = __memp_fput(mpf, ip,
+ pagep, DB_PRIORITY_VERY_LOW)) != 0)
+ goto out;
+ pagep = NULL;
+trunc: if ((ret = __memp_ftruncate(mpf, NULL, ip,
+ argp->pgno, MP_TRUNC_RECOVER)) != 0)
+ goto out;
+ } else if (argp->last_pgno == argp->pgno) {
+ /* The page was truncated at runtime, zero it out. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ P_INIT(pagep, 0, PGNO_INVALID,
+ PGNO_INVALID, PGNO_INVALID, 0, P_INVALID);
+ ZERO_LSN(pagep->lsn);
+ } else
+#endif
+ if (cmp_p == 0 || IS_ZERO_LSN(LSN(pagep))) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ P_INIT(pagep, file_dbp->pgsize,
+ argp->pgno, PGNO_INVALID, argp->next, 0, P_INVALID);
+ pagep->lsn = *lsnp;
+
+ }
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Need to reallocate the page. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ memcpy(pagep, argp->header.data, argp->header.size);
+ if (data)
+ memcpy((u_int8_t*)pagep + HOFFSET(pagep),
+ argp->data.data, argp->data.size);
+ }
+ if (pagep != NULL &&
+ (ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+
+ pagep = NULL;
+#ifdef HAVE_FTRUNCATE
+ /*
+	 * If we are keeping an in-memory free list, remove this
+	 * element from the list.
+ */
+ if (op == DB_TXN_ABORT && argp->pgno != argp->last_pgno) {
+ db_pgno_t *lp;
+ u_int32_t nelem, pos;
+
+ if ((ret = __memp_get_freelist(mpf, &nelem, &lp)) != 0)
+ goto out;
+ if (lp != NULL) {
+ pos = 0;
+ if (!is_meta) {
+ __db_freelist_pos(argp->pgno, lp, nelem, &pos);
+
+ /*
+ * If we aborted after logging but before
+ * updating the free list don't do anything.
+ */
+ if (argp->pgno != lp[pos]) {
+ DB_ASSERT(env,
+ argp->meta_pgno == lp[pos]);
+ goto done;
+ }
+ DB_ASSERT(env,
+ argp->meta_pgno == lp[pos - 1]);
+ } else if (nelem != 0 && argp->pgno != lp[pos])
+ goto done;
+
+ if (pos < nelem)
+ memmove(&lp[pos], &lp[pos + 1],
+ ((nelem - pos) - 1) * sizeof(*lp));
+
+ /* Shrink the list */
+ if ((ret =
+ __memp_extend_freelist(mpf, nelem - 1, &lp)) != 0)
+ goto out;
+ }
+ }
+#endif
+done:
+ if (meta != NULL &&
+ (ret = __memp_fput(mpf, ip, meta, file_dbp->priority)) != 0)
+ goto out;
+ meta = NULL;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+ if (meta != NULL)
+ (void)__memp_fput(mpf, ip, meta, file_dbp->priority);
+
+ return (ret);
+}
+
+/*
+ * __db_pg_free_recover --
+ * Recovery function for pg_free.
+ *
+ * PUBLIC: int __db_pg_free_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_pg_free_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __db_pg_free_args *argp;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ DB_THREAD_INFO *ip;
+ int ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ REC_PRINT(__db_pg_free_print);
+ REC_INTRO(__db_pg_free_read, ip, 0);
+
+ if ((ret = __db_pg_free_recover_int(env, ip,
+ (__db_pg_freedata_args *)argp, file_dbp, lsnp, mpf, op, 0)) != 0)
+ goto out;
+
+done: *lsnp = argp->prev_lsn;
+out:
+ REC_CLOSE;
+}
+
+/*
+ * __db_pg_freedata_recover --
+ * Recovery function for pg_freedata.
+ *
+ * PUBLIC: int __db_pg_freedata_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_pg_freedata_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __db_pg_freedata_args *argp;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ DB_THREAD_INFO *ip;
+ int ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ REC_PRINT(__db_pg_freedata_print);
+ REC_INTRO(__db_pg_freedata_read, ip, 0);
+
+ if ((ret = __db_pg_free_recover_int(env,
+ ip, argp, file_dbp, lsnp, mpf, op, 1)) != 0)
+ goto out;
+
+done: *lsnp = argp->prev_lsn;
+out:
+ REC_CLOSE;
+}
+
+/*
+ * __db_cksum_recover --
+ * Recovery function for checksum failure log record.
+ *
+ * PUBLIC: int __db_cksum_recover __P((ENV *,
+ * PUBLIC: DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_cksum_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __db_cksum_args *argp;
+ int ret;
+
+ COMPQUIET(info, NULL);
+ COMPQUIET(lsnp, NULL);
+ COMPQUIET(op, DB_TXN_ABORT);
+
+ REC_PRINT(__db_cksum_print);
+
+ if ((ret = __db_cksum_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ /*
+ * We had a checksum failure -- the only option is to run catastrophic
+ * recovery.
+ */
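+	/* Under catastrophic (fatal) recovery this record is expected. */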
+ if (F_ISSET(env, ENV_RECOVER_FATAL))
+ ret = 0;
+ else {
+ __db_errx(env, DB_STR("0642",
+ "Checksum failure requires catastrophic recovery"));
+ ret = __env_panic(env, DB_RUNRECOVERY);
+ }
+
+ __os_free(env, argp);
+ return (ret);
+}
+
+/*
+ * __db_pg_init_recover --
+ * Recovery function to reinit pages after truncation.
+ *
+ * PUBLIC: int __db_pg_init_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_pg_init_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __db_pg_init_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_LSN copy_lsn;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ int cmp_n, cmp_p, ret, type;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ REC_PRINT(__db_pg_init_print);
+ REC_INTRO(__db_pg_init_read, ip, 0);
+
+ mpf = file_dbp->mpf;
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
+ if (DB_UNDO(op)) {
+ if (ret == DB_PAGE_NOTFOUND)
+ goto done;
+ else {
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ }
+ }
+
+ /*
+ * This page was truncated and may simply not have
+ * had an item written to it yet. This should only
+ * happen on hash databases, so confirm that.
+ */
+ DB_ASSERT(env, file_dbp->type == DB_HASH);
+ if ((ret = __memp_fget(mpf, &argp->pgno,
+ ip, NULL, DB_MPOOL_CREATE, &pagep)) != 0) {
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ }
+ }
+
+ (void)__ua_memcpy(&copy_lsn, &LSN(argp->header.data), sizeof(DB_LSN));
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &copy_lsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &copy_lsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+
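+	/*
+	 * On redo, reinitialize the page as empty, preserving a hash page's
+	 * type; any other page becomes a leaf of the access method's type.
+	 */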
+ if (cmp_p == 0 && DB_REDO(op)) {
+ if (TYPE(pagep) == P_HASH)
+ type = P_HASH;
+ else
+ type = file_dbp->type == DB_RECNO ? P_LRECNO : P_LBTREE;
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ P_INIT(pagep, file_dbp->pgsize, PGNO(pagep), PGNO_INVALID,
+ PGNO_INVALID, TYPE(pagep) == P_HASH ? 0 : 1, type);
+ pagep->lsn = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Put the data back on the page. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ memcpy(pagep, argp->header.data, argp->header.size);
+ if (argp->data.size > 0)
+ memcpy((u_int8_t*)pagep + HOFFSET(pagep),
+ argp->data.data, argp->data.size);
+ }
+ if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+
+done: *lsnp = argp->prev_lsn;
+out:
+ REC_CLOSE;
+}
+
+/*
+ * __db_pg_trunc_recover --
+ * Recovery function for pg_trunc.
+ *
+ * PUBLIC: int __db_pg_trunc_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_pg_trunc_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+#ifdef HAVE_FTRUNCATE
+ __db_pg_trunc_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DBMETA *meta;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ db_pglist_t *pglist, *lp;
+ db_pgno_t last_pgno, *list;
+ u_int32_t felem, nelem, pos;
+ int ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ REC_PRINT(__db_pg_trunc_print);
+ REC_INTRO(__db_pg_trunc_read, ip, 1);
+
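+	/* The log record carries the list of freed pages being truncated. */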
+ pglist = (db_pglist_t *) argp->list.data;
+ nelem = argp->list.size / sizeof(db_pglist_t);
+ if (DB_REDO(op)) {
+ /*
+ * First call __db_pg_truncate to find the truncation
+ * point, truncate the file and return the new last_pgno.
+ */
+ last_pgno = argp->last_pgno;
+ if ((ret = __db_pg_truncate(dbc, NULL, pglist,
+ NULL, &nelem, argp->next_free, &last_pgno, lsnp, 1)) != 0)
+ goto out;
+
+ if (argp->last_free != PGNO_INVALID) {
+ /*
+ * Update the next pointer of the last page in
+			 * the freelist. If the truncation point is
+			 * beyond next_free then this page is still in the
+			 * freelist; otherwise the last_free page is at the end.
+ */
+ if ((ret = __memp_fget(mpf,
+ &argp->last_free, ip, NULL, 0, &meta)) == 0) {
+ if (LOG_COMPARE(&LSN(meta),
+ &argp->last_lsn) == 0) {
+ REC_DIRTY(mpf,
+ ip, dbc->priority, &meta);
+ if (pglist->pgno > last_pgno)
+ NEXT_PGNO(meta) = PGNO_INVALID;
+ else
+ NEXT_PGNO(meta) = pglist->pgno;
+ LSN(meta) = *lsnp;
+ }
+ if ((ret = __memp_fput(mpf, ip,
+ meta, file_dbp->priority)) != 0)
+ goto out;
+ meta = NULL;
+ } else if (ret != DB_PAGE_NOTFOUND)
+ goto out;
+ }
+ if ((ret = __memp_fget(mpf, &argp->meta, ip, NULL,
+ 0, &meta)) != 0)
+ goto out;
+ if (LOG_COMPARE(&LSN(meta), &argp->meta_lsn) == 0) {
+ REC_DIRTY(mpf, ip, dbc->priority, &meta);
+ if (argp->last_free == PGNO_INVALID) {
+ if (nelem == 0)
+ meta->free = PGNO_INVALID;
+ else
+ meta->free = pglist->pgno;
+ }
+ /*
+			 * If this is part of a multi-record truncate,
+			 * this could be just the last page of this record;
+			 * don't move the meta->last_pgno forward.
+ */
+ if (meta->last_pgno > last_pgno)
+ meta->last_pgno = last_pgno;
+ LSN(meta) = *lsnp;
+ }
+ } else {
+ /* Put the free list back in its original order. */
+ for (lp = pglist; lp < &pglist[nelem]; lp++) {
+ if ((ret = __memp_fget(mpf, &lp->pgno, ip,
+ NULL, DB_MPOOL_CREATE, &pagep)) != 0)
+ goto out;
+ if (IS_ZERO_LSN(LSN(pagep)) ||
+ LOG_COMPARE(&LSN(pagep), lsnp) == 0) {
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ P_INIT(pagep, file_dbp->pgsize, lp->pgno,
+ PGNO_INVALID, lp->next_pgno, 0, P_INVALID);
+ LSN(pagep) = lp->lsn;
+ }
+ if ((ret = __memp_fput(mpf,
+ ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ }
+ /*
+ * Link the truncated part back into the free list.
+		 * It's either after the last_free page or directly
+ * linked to the metadata page.
+ */
+ if (argp->last_free != PGNO_INVALID) {
+ if ((ret = __memp_fget(mpf, &argp->last_free,
+ ip, NULL, DB_MPOOL_EDIT, &meta)) == 0) {
+ if (LOG_COMPARE(&LSN(meta), lsnp) == 0) {
+ NEXT_PGNO(meta) = argp->next_free;
+ LSN(meta) = argp->last_lsn;
+ }
+ if ((ret = __memp_fput(mpf, ip,
+ meta, file_dbp->priority)) != 0)
+ goto out;
+ } else if (ret != DB_PAGE_NOTFOUND)
+ goto out;
+ meta = NULL;
+ }
+ if ((ret = __memp_fget(mpf, &argp->meta,
+ ip, NULL, DB_MPOOL_EDIT, &meta)) != 0)
+ goto out;
+ if (LOG_COMPARE(&LSN(meta), lsnp) == 0) {
+ REC_DIRTY(mpf, ip, dbc->priority, &meta);
+ /*
+			 * If we had to break up the list, last_pgno
+			 * may represent only the end of this block.
+ */
+ if (meta->last_pgno < argp->last_pgno)
+ meta->last_pgno = argp->last_pgno;
+ if (argp->last_free == PGNO_INVALID)
+ meta->free = argp->next_free;
+ LSN(meta) = argp->meta_lsn;
+ }
+ }
+
+ if ((ret = __memp_fput(mpf, ip, meta, file_dbp->priority)) != 0)
+ goto out;
+
+ if (op == DB_TXN_ABORT) {
+ /*
+		 * Put the pages back on the in-memory free list.
+		 * If this is part of a multi-record truncate then
+		 * we need to find this batch; it may not be at the end.
+ * If we aborted while writing one of the log records
+ * then this set may still be in the list.
+ */
+ if ((ret = __memp_get_freelist(mpf, &felem, &list)) != 0)
+ goto out;
+ if (list != NULL) {
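+			/*
+			 * Find where this batch belongs in the sorted list;
+			 * skip the insert if the batch is already present.
+			 */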
+ if (felem != 0 && list[felem - 1] > pglist->pgno) {
+ __db_freelist_pos(
+ pglist->pgno, list, felem, &pos);
+ DB_ASSERT(env, pos < felem);
+ if (pglist->pgno == list[pos])
+ goto done;
+ pos++;
+ } else if (felem != 0 &&
+ list[felem - 1] == pglist->pgno)
+ goto done;
+ else
+ pos = felem;
+ if ((ret = __memp_extend_freelist(
+ mpf, felem + nelem, &list)) != 0)
+ goto out;
+ if (pos != felem)
+ memmove(&list[nelem + pos], &list[pos],
+ sizeof(*list) * (felem - pos));
+ for (lp = pglist; lp < &pglist[nelem]; lp++)
+ list[pos++] = lp->pgno;
+ }
+ }
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: REC_CLOSE;
+#else
+ /*
+ * If HAVE_FTRUNCATE is not defined, we'll never see pg_trunc records
+ * to recover.
+ */
+ COMPQUIET(env, NULL);
+ COMPQUIET(dbtp, NULL);
+ COMPQUIET(lsnp, NULL);
+ COMPQUIET(op, DB_TXN_ABORT);
+ COMPQUIET(info, NULL);
+ return (EINVAL);
+#endif
+}
+
+/*
+ * __db_realloc_recover --
+ * Recovery function for realloc.
+ *
+ * PUBLIC: int __db_realloc_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_realloc_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __db_realloc_args *argp;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ DB_THREAD_INFO *ip;
+ PAGE *pagep;
+ db_pglist_t *pglist, *lp;
+#ifdef HAVE_FTRUNCATE
+ db_pgno_t *list;
+ u_int32_t felem, pos;
+#endif
+ u_int32_t nelem;
+ int cmp_n, cmp_p, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+
+ REC_PRINT(__db_realloc_print);
+ REC_INTRO(__db_realloc_read, ip, 1);
+ mpf = file_dbp->mpf;
+
+ /*
+ * First, iterate over all the pages and make sure they are all in
+ * their prior or new states (according to the op).
+ */
+ pglist = (db_pglist_t *) argp->list.data;
+ nelem = argp->list.size / sizeof(db_pglist_t);
+ for (lp = pglist; lp < &pglist[nelem]; lp++) {
+ if ((ret = __memp_fget(mpf, &lp->pgno, ip,
+ NULL, DB_MPOOL_CREATE, &pagep)) != 0)
+ goto out;
+ if (DB_REDO(op) && LOG_COMPARE(&LSN(pagep), &lp->lsn) == 0) {
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ P_INIT(pagep, file_dbp->pgsize, lp->pgno,
+ PGNO_INVALID, PGNO_INVALID, 0, argp->ptype);
+ LSN(pagep) = *lsnp;
+ } else if (DB_UNDO(op) && (IS_ZERO_LSN(LSN(pagep)) ||
+ LOG_COMPARE(&LSN(pagep), lsnp) == 0)) {
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ P_INIT(pagep, file_dbp->pgsize, lp->pgno,
+ PGNO_INVALID, lp->next_pgno, 0, P_INVALID);
+ LSN(pagep) = lp->lsn;
+ }
+ if ((ret = __memp_fput(mpf,
+ ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ }
+
+ /* Now, fix up the free list. */
+ if ((ret = __memp_fget(mpf,
+ &argp->prev_pgno, ip, NULL, 0, &pagep)) != 0)
+ goto out;
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->page_lsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->page_lsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+
+ if (DB_REDO(op) && cmp_p == 0) {
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ if (argp->prev_pgno == PGNO_BASE_MD)
+ ((DBMETA *)pagep)->free = argp->next_free;
+ else
+ NEXT_PGNO(pagep) = argp->next_free;
+ LSN(pagep) = *lsnp;
+ } else if (DB_UNDO(op) && cmp_n == 0) {
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ if (argp->prev_pgno == PGNO_BASE_MD)
+ ((DBMETA *)pagep)->free = pglist->pgno;
+ else
+ NEXT_PGNO(pagep) = pglist->pgno;
+ LSN(pagep) = argp->page_lsn;
+ }
+ if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+
+#ifdef HAVE_FTRUNCATE
+ if (op == DB_TXN_ABORT) {
+ /* Put the pages back in the sorted list. */
+ if ((ret = __memp_get_freelist(mpf, &felem, &list)) != 0)
+ goto out;
+ if (list != NULL) {
+ __db_freelist_pos(pglist->pgno, list, felem, &pos);
+ if (pglist->pgno == list[pos])
+ goto done;
+ if ((ret = __memp_extend_freelist(
+ mpf, felem + nelem, &list)) != 0)
+ goto out;
+ pos++;
+ if (pos != felem)
+ memmove(&list[pos+nelem],
+ &list[pos], nelem * sizeof(*list));
+ for (lp = pglist; lp < &pglist[nelem]; lp++)
+ list[pos++] = lp->pgno;
+ }
+ }
+#endif
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: REC_CLOSE;
+}
+
+/*
+ * __db_pg_sort_44_recover --
+ * Recovery function for pg_sort.
+ * This is deprecated and kept for replication upgrades.
+ *
+ * PUBLIC: int __db_pg_sort_44_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_pg_sort_44_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+#ifdef HAVE_FTRUNCATE
+ __db_pg_sort_44_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DBMETA *meta;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ db_pglist_t *pglist, *lp;
+ db_pgno_t pgno, *list;
+ u_int32_t felem, nelem;
+ int ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ REC_PRINT(__db_pg_sort_44_print);
+ REC_INTRO(__db_pg_sort_44_read, ip, 1);
+
+ pglist = (db_pglist_t *) argp->list.data;
+ nelem = argp->list.size / sizeof(db_pglist_t);
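+	/* On redo, sort the logged list and re-apply the truncation. */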
+ if (DB_REDO(op)) {
+ pgno = argp->last_pgno;
+ __db_freelist_sort(pglist, nelem);
+ if ((ret = __db_pg_truncate(dbc, NULL,
+ pglist, NULL, &nelem, PGNO_INVALID, &pgno, lsnp, 1)) != 0)
+ goto out;
+
+ if (argp->last_free != PGNO_INVALID) {
+ if ((ret = __memp_fget(mpf,
+ &argp->last_free, ip, NULL, 0, &meta)) == 0) {
+ if (LOG_COMPARE(&LSN(meta),
+ &argp->last_lsn) == 0) {
+ REC_DIRTY(mpf,
+ ip, dbc->priority, &meta);
+ NEXT_PGNO(meta) = PGNO_INVALID;
+ LSN(meta) = *lsnp;
+ }
+ if ((ret = __memp_fput(mpf, ip,
+ meta, file_dbp->priority)) != 0)
+ goto out;
+ meta = NULL;
+ } else if (ret != DB_PAGE_NOTFOUND)
+ goto out;
+ }
+ if ((ret = __memp_fget(mpf, &argp->meta, ip, NULL,
+ 0, &meta)) != 0)
+ goto out;
+ if (LOG_COMPARE(&LSN(meta), &argp->meta_lsn) == 0) {
+ REC_DIRTY(mpf, ip, dbc->priority, &meta);
+ if (argp->last_free == PGNO_INVALID) {
+ if (nelem == 0)
+ meta->free = PGNO_INVALID;
+ else
+ meta->free = pglist->pgno;
+ }
+ meta->last_pgno = pgno;
+ LSN(meta) = *lsnp;
+ }
+ } else {
+ /* Put the free list back in its original order. */
+ for (lp = pglist; lp < &pglist[nelem]; lp++) {
+ if ((ret = __memp_fget(mpf, &lp->pgno, ip,
+ NULL, DB_MPOOL_CREATE, &pagep)) != 0)
+ goto out;
+ if (IS_ZERO_LSN(LSN(pagep)) ||
+ LOG_COMPARE(&LSN(pagep), lsnp) == 0) {
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ if (lp == &pglist[nelem - 1])
+ pgno = PGNO_INVALID;
+ else
+ pgno = lp[1].pgno;
+
+ P_INIT(pagep, file_dbp->pgsize,
+ lp->pgno, PGNO_INVALID, pgno, 0, P_INVALID);
+ LSN(pagep) = lp->lsn;
+ }
+ if ((ret = __memp_fput(mpf,
+ ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ }
+ if (argp->last_free != PGNO_INVALID) {
+ if ((ret = __memp_fget(mpf, &argp->last_free,
+ ip, NULL, DB_MPOOL_EDIT, &meta)) == 0) {
+ if (LOG_COMPARE(&LSN(meta), lsnp) == 0) {
+ NEXT_PGNO(meta) = pglist->pgno;
+ LSN(meta) = argp->last_lsn;
+ }
+ if ((ret = __memp_fput(mpf, ip,
+ meta, file_dbp->priority)) != 0)
+ goto out;
+ } else if (ret != DB_PAGE_NOTFOUND)
+ goto out;
+ meta = NULL;
+ }
+ if ((ret = __memp_fget(mpf, &argp->meta,
+ ip, NULL, DB_MPOOL_EDIT, &meta)) != 0)
+ goto out;
+ if (LOG_COMPARE(&LSN(meta), lsnp) == 0) {
+ REC_DIRTY(mpf, ip, dbc->priority, &meta);
+ meta->last_pgno = argp->last_pgno;
+ if (argp->last_free == PGNO_INVALID)
+ meta->free = pglist->pgno;
+ LSN(meta) = argp->meta_lsn;
+ }
+ }
+ if (op == DB_TXN_ABORT) {
+ if ((ret = __memp_get_freelist(mpf, &felem, &list)) != 0)
+ goto out;
+ if (list != NULL) {
+ DB_ASSERT(env, felem == 0 ||
+ argp->last_free == list[felem - 1]);
+ if ((ret = __memp_extend_freelist(
+ mpf, felem + nelem, &list)) != 0)
+ goto out;
+ for (lp = pglist; lp < &pglist[nelem]; lp++)
+ list[felem++] = lp->pgno;
+ }
+ }
+
+ if ((ret = __memp_fput(mpf, ip, meta, file_dbp->priority)) != 0)
+ goto out;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: REC_CLOSE;
+#else
+ /*
+ * If HAVE_FTRUNCATE is not defined, we'll never see pg_sort records
+ * to recover.
+ */
+ COMPQUIET(env, NULL);
+ COMPQUIET(dbtp, NULL);
+ COMPQUIET(lsnp, NULL);
+ COMPQUIET(op, DB_TXN_ABORT);
+ COMPQUIET(info, NULL);
+ return (EINVAL);
+#endif
+}
+
+/*
+ * __db_pg_alloc_42_recover --
+ * Recovery function for pg_alloc.
+ *
+ * PUBLIC: int __db_pg_alloc_42_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_pg_alloc_42_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __db_pg_alloc_42_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DBMETA *meta;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ db_pgno_t pgno;
+ int cmp_n, cmp_p, created, level, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ meta = NULL;
+ pagep = NULL;
+ created = 0;
+ REC_PRINT(__db_pg_alloc_42_print);
+ REC_INTRO(__db_pg_alloc_42_read, ip, 0);
+
+ /*
+ * Fix up the metadata page. If we're redoing the operation, we have
+ * to get the metadata page and update its LSN and its free pointer.
+ * If we're undoing the operation and the page was ever created, we put
+ * it on the freelist.
+ */
+ pgno = PGNO_BASE_MD;
+ if ((ret = __memp_fget(mpf, &pgno, ip, NULL, 0, &meta)) != 0) {
+ /* The metadata page must always exist on redo. */
+ if (DB_REDO(op)) {
+ ret = __db_pgerr(file_dbp, pgno, ret);
+ goto out;
+ } else
+ goto done;
+ }
+ cmp_n = LOG_COMPARE(lsnp, &LSN(meta));
+ cmp_p = LOG_COMPARE(&LSN(meta), &argp->meta_lsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(meta), &argp->meta_lsn);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Need to redo update described. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &meta);
+ LSN(meta) = *lsnp;
+ meta->free = argp->next;
+ if (argp->pgno > meta->last_pgno)
+ meta->last_pgno = argp->pgno;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ goto no_rollback;
+ }
+
+ /*
+ * Fix up the allocated page. If the page does not exist
+	 * and we can truncate it, then don't create it.
+ * Otherwise if we're redoing the operation, we have
+ * to get the page (creating it if it doesn't exist), and update its
+ * LSN. If we're undoing the operation, we have to reset the page's
+ * LSN and put it on the free list, or truncate it.
+ */
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
+ /*
+ * We have to be able to identify if a page was newly
+ * created so we can recover it properly. We cannot simply
+ * look for an empty header, because hash uses a pgin
+ * function that will set the header. Instead, we explicitly
+ * try for the page without CREATE and if that fails, then
+ * create it.
+ */
+ if ((ret = __memp_fget(mpf, &argp->pgno,
+ ip, NULL, DB_MPOOL_CREATE, &pagep)) != 0) {
+ if (DB_UNDO(op) && ret == ENOSPC)
+ goto do_truncate;
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ }
+ created = 1;
+ }
+
+ /* Fix up the allocated page. */
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->page_lsn);
+
+ /*
+ * If an initial allocation is aborted and then reallocated during
+ * an archival restore the log record will have an LSN for the page
+ * but the page will be empty.
+ */
+ if (IS_ZERO_LSN(LSN(pagep)) ||
+ (IS_ZERO_LSN(argp->page_lsn) && IS_INIT_LSN(LSN(pagep))))
+ cmp_p = 0;
+
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->page_lsn);
+ /*
+ * Another special case we have to handle is if we ended up with a
+ * page of all 0's which can happen if we abort between allocating a
+ * page in mpool and initializing it. In that case, even if we're
+ * undoing, we need to re-initialize the page.
+ */
+ if (DB_REDO(op) && cmp_p == 0) {
+ /* Need to redo update described. */
+ switch (argp->ptype) {
+ case P_LBTREE:
+ case P_LRECNO:
+ case P_LDUP:
+ level = LEAFLEVEL;
+ break;
+ default:
+ level = 0;
+ break;
+ }
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ P_INIT(pagep, file_dbp->pgsize,
+ argp->pgno, PGNO_INVALID, PGNO_INVALID, level, argp->ptype);
+
+ pagep->lsn = *lsnp;
+ } else if (DB_UNDO(op) && (cmp_n == 0 || created)) {
+ /*
+ * This is where we handle the case of a 0'd page (pagep->pgno
+ * is equal to PGNO_INVALID).
+ * Undo the allocation, reinitialize the page and
+ * link its next pointer to the free list.
+ */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ P_INIT(pagep, file_dbp->pgsize,
+ argp->pgno, PGNO_INVALID, argp->next, 0, P_INVALID);
+
+ pagep->lsn = argp->page_lsn;
+ }
+
+do_truncate:
+ /*
+	 * We cannot undo things from 4.2 land, because we no longer
+	 * have limbo processing.
+ */
+ if ((pagep == NULL || IS_ZERO_LSN(LSN(pagep))) &&
+ IS_ZERO_LSN(argp->page_lsn) && DB_UNDO(op)) {
+no_rollback: __db_errx(env, DB_STR("0643",
+"Cannot replicate prepared transactions from master running release 4.2 "));
+ ret = __env_panic(env, EINVAL);
+ }
+
+ if (pagep != NULL &&
+ (ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+ if ((ret = __memp_fput(mpf, ip, meta, file_dbp->priority)) != 0)
+ goto out;
+ meta = NULL;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+ if (meta != NULL)
+ (void)__memp_fput(mpf, ip, meta, file_dbp->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __db_pg_free_recover_42_int --
+ */
+static int
+__db_pg_free_recover_42_int(env, ip, argp, file_dbp, lsnp, mpf, op, data)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ __db_pg_freedata_42_args *argp;
+ DB *file_dbp;
+ DB_LSN *lsnp;
+ DB_MPOOLFILE *mpf;
+ db_recops op;
+ int data;
+{
+ DBMETA *meta;
+ DB_LSN copy_lsn;
+ PAGE *pagep, *prevp;
+ int cmp_n, cmp_p, is_meta, ret;
+
+ meta = NULL;
+ pagep = NULL;
+ prevp = NULL;
+
+ /*
+	 * Get the "metapage". This will either be the metapage or the
+	 * previous page in the free list if we are doing sorted
+	 * allocations. If it's a previous page, then we will not be
+	 * truncating.
+ */
+ is_meta = argp->meta_pgno == PGNO_BASE_MD;
+
+ REC_FGET(mpf, ip, argp->meta_pgno, &meta, check_meta);
+
+ if (argp->meta_pgno != PGNO_BASE_MD)
+ prevp = (PAGE *)meta;
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(meta));
+ cmp_p = LOG_COMPARE(&LSN(meta), &argp->meta_lsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(meta), &argp->meta_lsn);
+
+ /*
+ * Fix up the metadata page. If we're redoing or undoing the operation
+ * we get the page and update its LSN, last and free pointer.
+ */
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Need to redo the deallocation. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &meta);
+ if (prevp == NULL)
+ meta->free = argp->pgno;
+ else
+ NEXT_PGNO(prevp) = argp->pgno;
+ /*
+ * If this was a compensating transaction and
+ * we are a replica, then we never executed the
+ * original allocation which incremented meta->free.
+ */
+ if (prevp == NULL && meta->last_pgno < meta->free)
+ meta->last_pgno = meta->free;
+ LSN(meta) = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Need to undo the deallocation. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &meta);
+ if (prevp == NULL)
+ meta->free = argp->next;
+ else
+ NEXT_PGNO(prevp) = argp->next;
+ LSN(meta) = argp->meta_lsn;
+ if (prevp == NULL && meta->last_pgno < argp->pgno)
+ meta->last_pgno = argp->pgno;
+ }
+
+check_meta:
+ if (ret != 0 && is_meta) {
+ /* The metadata page must always exist. */
+ ret = __db_pgerr(file_dbp, argp->meta_pgno, ret);
+ goto out;
+ }
+
+ /*
+ * Get the freed page. If we support truncate then don't
+ * create the page if we are going to free it. If we're
+ * redoing the operation we get the page and explicitly discard
+ * its contents, then update its LSN. If we're undoing the
+ * operation, we get the page and restore its header.
+ * If we don't support truncate, then we must create the page
+ * and roll it back.
+ */
+ if ((ret = __memp_fget(mpf, &argp->pgno,
+ ip, NULL, DB_MPOOL_CREATE, &pagep)) != 0)
+ goto out;
+
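+	/*
+	 * The page header logged in the record need not be properly
+	 * aligned, so pull its LSN out with an unaligned copy.
+	 */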
+ (void)__ua_memcpy(&copy_lsn, &LSN(argp->header.data), sizeof(DB_LSN));
+ cmp_n = IS_ZERO_LSN(LSN(pagep)) ? 0 : LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &copy_lsn);
+
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &copy_lsn);
+ if (DB_REDO(op) &&
+ (cmp_p == 0 ||
+ (IS_ZERO_LSN(copy_lsn) &&
+ LOG_COMPARE(&LSN(pagep), &argp->meta_lsn) <= 0))) {
+ /* Need to redo the deallocation. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ P_INIT(pagep, file_dbp->pgsize,
+ argp->pgno, PGNO_INVALID, argp->next, 0, P_INVALID);
+ pagep->lsn = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Need to reallocate the page. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ memcpy(pagep, argp->header.data, argp->header.size);
+ if (data)
+ memcpy((u_int8_t*)pagep + HOFFSET(pagep),
+ argp->data.data, argp->data.size);
+ }
+ if (pagep != NULL &&
+ (ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+
+ pagep = NULL;
+ if (meta != NULL &&
+ (ret = __memp_fput(mpf, ip, meta, file_dbp->priority)) != 0)
+ goto out;
+ meta = NULL;
+
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+ if (meta != NULL)
+ (void)__memp_fput(mpf, ip, meta, file_dbp->priority);
+
+ return (ret);
+}
+
+/*
+ * __db_pg_free_42_recover --
+ * Recovery function for pg_free.
+ *
+ * PUBLIC: int __db_pg_free_42_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_pg_free_42_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __db_pg_free_42_args *argp;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ DB_THREAD_INFO *ip;
+ int ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ REC_PRINT(__db_pg_free_42_print);
+ REC_INTRO(__db_pg_free_42_read, ip, 0);
+
+ ret = __db_pg_free_recover_42_int(env, ip,
+ (__db_pg_freedata_42_args *)argp, file_dbp, lsnp, mpf, op, 0);
+
+done: *lsnp = argp->prev_lsn;
+out:
+ REC_CLOSE;
+}
+
+/*
+ * __db_pg_freedata_42_recover --
+ * Recovery function for pg_freedata.
+ *
+ * PUBLIC: int __db_pg_freedata_42_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_pg_freedata_42_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __db_pg_freedata_42_args *argp;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ DB_THREAD_INFO *ip;
+ int ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ REC_PRINT(__db_pg_freedata_42_print);
+ REC_INTRO(__db_pg_freedata_42_read, ip, 0);
+
+ ret = __db_pg_free_recover_42_int(
+ env, ip, argp, file_dbp, lsnp, mpf, op, 1);
+
+done: *lsnp = argp->prev_lsn;
+out:
+ REC_CLOSE;
+}
+
+/*
+ * __db_relink_42_recover --
+ * Recovery function for relink.
+ *
+ * PUBLIC: int __db_relink_42_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_relink_42_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __db_relink_42_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ int cmp_n, cmp_p, modified, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__db_relink_42_print);
+ REC_INTRO(__db_relink_42_read, ip, 0);
+
+ /*
+ * There are up to three pages we need to check -- the page, and the
+ * previous and next pages, if they existed. For a page add operation,
+ * the current page is the result of a split and is being recovered
+ * elsewhere, so all we need do is recover the next page.
+ */
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
+ if (DB_REDO(op)) {
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ }
+ goto next2;
+ }
+ if (argp->opcode == DB_ADD_PAGE_COMPAT)
+ goto next1;
+
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Redo the relink. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ pagep->lsn = *lsnp;
+ } else if (LOG_COMPARE(lsnp, &LSN(pagep)) == 0 && DB_UNDO(op)) {
+ /* Undo the relink. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ pagep->next_pgno = argp->next;
+ pagep->prev_pgno = argp->prev;
+ pagep->lsn = argp->lsn;
+ }
+next1: if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+next2: if ((ret = __memp_fget(mpf, &argp->next, ip, NULL, 0, &pagep)) != 0) {
+ if (DB_REDO(op)) {
+ ret = __db_pgerr(file_dbp, argp->next, ret);
+ goto out;
+ }
+ goto prev;
+ }
+ modified = 0;
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn_next);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn_next);
+ if ((argp->opcode == DB_REM_PAGE_COMPAT && cmp_p == 0 && DB_REDO(op)) ||
+ (argp->opcode == DB_ADD_PAGE_COMPAT && cmp_n == 0 && DB_UNDO(op))) {
+ /* Redo the remove or undo the add. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ pagep->prev_pgno = argp->prev;
+ modified = 1;
+ } else if ((argp->opcode == DB_REM_PAGE_COMPAT &&
+ cmp_n == 0 && DB_UNDO(op)) ||
+ (argp->opcode == DB_ADD_PAGE_COMPAT && cmp_p == 0 && DB_REDO(op))) {
+ /* Undo the remove or redo the add. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ pagep->prev_pgno = argp->pgno;
+ modified = 1;
+ }
+ if (modified) {
+ if (DB_UNDO(op))
+ pagep->lsn = argp->lsn_next;
+ else
+ pagep->lsn = *lsnp;
+ }
+ if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+ if (argp->opcode == DB_ADD_PAGE_COMPAT)
+ goto done;
+
+prev: if ((ret = __memp_fget(mpf, &argp->prev, ip, NULL, 0, &pagep)) != 0) {
+ if (DB_REDO(op)) {
+ ret = __db_pgerr(file_dbp, argp->prev, ret);
+ goto out;
+ }
+ goto done;
+ }
+ modified = 0;
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn_prev);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn_prev);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Redo the relink. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ pagep->next_pgno = argp->next;
+ modified = 1;
+ } else if (LOG_COMPARE(lsnp, &LSN(pagep)) == 0 && DB_UNDO(op)) {
+ /* Undo the relink. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ pagep->next_pgno = argp->pgno;
+ modified = 1;
+ }
+ if (modified) {
+ if (DB_UNDO(op))
+ pagep->lsn = argp->lsn_prev;
+ else
+ pagep->lsn = *lsnp;
+ }
+ if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __db_relink_recover --
+ * Recovery function for relink.
+ *
+ * PUBLIC: int __db_relink_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_relink_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __db_relink_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ int cmp_n, cmp_p, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__db_relink_print);
+ REC_INTRO(__db_relink_read, ip, 0);
+
+ /*
+ * There are up to three pages we need to check -- the page, and the
+ * previous and next pages, if they existed. For a page add operation,
+ * the current page is the result of a split and is being recovered
+ * elsewhere, so all we need do is recover the next page.
+ */
+ if (argp->next_pgno == PGNO_INVALID)
+ goto prev;
+ if ((ret = __memp_fget(mpf,
+ &argp->next_pgno, ip, NULL, 0, &pagep)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+ ret = __db_pgerr(file_dbp, argp->next_pgno, ret);
+ goto out;
+ } else
+ goto prev;
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn_next);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn_next);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
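+	/*
+	 * A new_pgno of PGNO_INVALID means the page is simply being
+	 * removed from the chain; otherwise it is being replaced by
+	 * new_pgno.
+	 */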
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Redo the remove or replace. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ if (argp->new_pgno == PGNO_INVALID)
+ pagep->prev_pgno = argp->prev_pgno;
+ else
+ pagep->prev_pgno = argp->new_pgno;
+
+ pagep->lsn = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Undo the remove or replace. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ pagep->prev_pgno = argp->pgno;
+
+ pagep->lsn = argp->lsn_next;
+ }
+
+ if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+prev: if (argp->prev_pgno == PGNO_INVALID)
+ goto done;
+ if ((ret = __memp_fget(mpf,
+ &argp->prev_pgno, ip, NULL, 0, &pagep)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+ ret = __db_pgerr(file_dbp, argp->prev_pgno, ret);
+ goto out;
+ } else
+ goto done;
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn_prev);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn_prev);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Redo the relink. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ if (argp->new_pgno == PGNO_INVALID)
+ pagep->next_pgno = argp->next_pgno;
+ else
+ pagep->next_pgno = argp->new_pgno;
+
+ pagep->lsn = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Undo the relink. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ pagep->next_pgno = argp->pgno;
+ pagep->lsn = argp->lsn_prev;
+ }
+
+ if ((ret = __memp_fput(mpf,
+ ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __db_merge_recover --
+ * Recovery function for merge.
+ *
+ * PUBLIC: int __db_merge_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_merge_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __db_merge_args *argp;
+ BTREE *bt;
+ DB_THREAD_INFO *ip;
+ BKEYDATA *bk;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_LOCK handle_lock;
+ DB_LOCKREQ request;
+ DB_MPOOLFILE *mpf;
+ HASH *ht;
+ PAGE *pagep;
+ db_indx_t indx, *ninp, *pinp;
+ u_int32_t size;
+ u_int8_t *bp;
+ int cmp_n, cmp_p, i, ret, t_ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ REC_PRINT(__db_merge_print);
+ REC_INTRO(__db_merge_read, ip, op != DB_TXN_APPLY);
+
+ /* Allocate our own cursor without DB_RECOVER as we need a locker. */
+ if (op == DB_TXN_APPLY && (ret = __db_cursor_int(file_dbp, ip, NULL,
+ DB_QUEUE, PGNO_INVALID, 0, NULL, &dbc)) != 0)
+ goto out;
+ F_SET(dbc, DBC_RECOVER);
+
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ } else
+ goto next;
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn);
+ CHECK_LSN(file_dbp->env, op, cmp_p, &LSN(pagep), &argp->lsn);
+ CHECK_ABORT(file_dbp->env, op, cmp_n, &LSN(pagep), lsnp);
+
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /*
+ * When pg_copy is set, we are copying onto a new page.
+ */
+ DB_ASSERT(env, !argp->pg_copy || NUM_ENT(pagep) == 0);
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ if (argp->pg_copy) {
+ if (argp->data.size == 0) {
+ memcpy(pagep, argp->hdr.data, argp->hdr.size);
+ pagep->pgno = argp->pgno;
+ goto do_lsn;
+ }
+ P_INIT(pagep, file_dbp->pgsize, pagep->pgno,
+ PREV_PGNO(argp->hdr.data),
+ NEXT_PGNO(argp->hdr.data),
+ LEVEL(argp->hdr.data), TYPE(argp->hdr.data));
+ }
+ if (TYPE(pagep) == P_OVERFLOW) {
+ OV_REF(pagep) = OV_REF(argp->hdr.data);
+ OV_LEN(pagep) = OV_LEN(argp->hdr.data);
+ bp = (u_int8_t *)pagep + P_OVERHEAD(file_dbp);
+ memcpy(bp, argp->data.data, argp->data.size);
+ } else {
+ /* Copy the data segment. */
+ bp = (u_int8_t *)pagep +
+ (db_indx_t)(HOFFSET(pagep) - argp->data.size);
+ memcpy(bp, argp->data.data, argp->data.size);
+
+			/*
+			 * Copy the index table in past the current
+			 * entries, adjusting each logged offset for the
+			 * data already on the page.
+			 */
+ pinp = P_INP(file_dbp, pagep) + NUM_ENT(pagep);
+ ninp = P_INP(file_dbp, argp->hdr.data);
+ for (i = 0; i < NUM_ENT(argp->hdr.data); i++)
+ *pinp++ = *ninp++
+ - (file_dbp->pgsize - HOFFSET(pagep));
+ HOFFSET(pagep) -= argp->data.size;
+ NUM_ENT(pagep) += i;
+ }
+do_lsn: pagep->lsn = *lsnp;
+ if (op == DB_TXN_APPLY) {
+ /*
+ * If applying to an active system we must bump
+ * the revision number so that the db will get
+ * reopened. We also need to move the handle
+ * locks. Note that the dbp will not have a
+ * locker in a replication client apply thread.
+ */
+ if (file_dbp->type == DB_HASH) {
+ if (argp->npgno == file_dbp->meta_pgno)
+ file_dbp->mpf->mfp->revision++;
+ } else {
+ bt = file_dbp->bt_internal;
+ if (argp->npgno == bt->bt_meta ||
+ argp->npgno == bt->bt_root)
+ file_dbp->mpf->mfp->revision++;
+ }
+ if (argp->npgno == file_dbp->meta_pgno) {
+ F_CLR(file_dbp, DB_AM_RECOVER);
+ if ((ret = __fop_lock_handle(file_dbp->env,
+ file_dbp, dbc->locker, DB_LOCK_READ,
+ NULL, 0)) != 0)
+ goto err;
+ handle_lock = file_dbp->handle_lock;
+
+ file_dbp->meta_pgno = argp->pgno;
+ if ((ret = __fop_lock_handle(file_dbp->env,
+ file_dbp, dbc->locker, DB_LOCK_READ,
+ NULL, 0)) != 0)
+ goto err;
+
+ /* Move the other handles to the new lock. */
+ ret = __lock_change(file_dbp->env,
+ &handle_lock, &file_dbp->handle_lock);
+
+err: memset(&request, 0, sizeof(request));
+ request.op = DB_LOCK_PUT_ALL;
+ if ((t_ret = __lock_vec(
+ file_dbp->env, dbc->locker,
+ 0, &request, 1, NULL)) != 0 && ret == 0)
+ ret = t_ret;
+ F_SET(file_dbp, DB_AM_RECOVER);
+ if (ret != 0)
+ goto out;
+ }
+ }
+
+ } else if (cmp_n == 0 && !DB_REDO(op)) {
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ if (TYPE(pagep) == P_OVERFLOW) {
+ HOFFSET(pagep) = file_dbp->pgsize;
+ goto setlsn;
+ }
+
+ if (argp->pg_copy) {
+ /* The page was empty when we started. */
+ P_INIT(pagep, file_dbp->pgsize,
+ pagep->pgno, PGNO_INVALID,
+ PGNO_INVALID, 0, TYPE(argp->hdr.data));
+ goto setlsn;
+ }
+
+ /*
+ * Since logging is logical at the page level we cannot just
+ * truncate the data space. Delete the proper number of items
+ * from the logical end of the page.
+ */
+ for (i = 0; i < NUM_ENT(argp->hdr.data); i++) {
+ indx = NUM_ENT(pagep) - 1;
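+			/*
+			 * On a btree leaf a key shared with the previous
+			 * pair appears twice in the index table; drop the
+			 * extra reference without deleting any item.
+			 */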
+ if (TYPE(pagep) == P_LBTREE && indx != 0 &&
+ P_INP(file_dbp, pagep)[indx] ==
+ P_INP(file_dbp, pagep)[indx - P_INDX]) {
+ NUM_ENT(pagep)--;
+ continue;
+ }
+ switch (TYPE(pagep)) {
+ case P_LBTREE:
+ case P_LRECNO:
+ case P_LDUP:
+ bk = GET_BKEYDATA(file_dbp, pagep, indx);
+ size = BITEM_SIZE(bk);
+ break;
+
+ case P_IBTREE:
+ size = BINTERNAL_SIZE(
+ GET_BINTERNAL(file_dbp, pagep, indx)->len);
+ break;
+ case P_IRECNO:
+ size = RINTERNAL_SIZE;
+ break;
+ case P_HASH:
+ size = LEN_HITEM(file_dbp,
+ pagep, file_dbp->pgsize, indx);
+ break;
+ default:
+ ret = __db_pgfmt(env, PGNO(pagep));
+ goto out;
+ }
+ if ((ret = __db_ditem(dbc, pagep, indx, size)) != 0)
+ goto out;
+ }
+setlsn: pagep->lsn = argp->lsn;
+ }
+
+ if ((ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
+ goto out;
+
+next: if ((ret = __memp_fget(mpf, &argp->npgno, ip, NULL, 0, &pagep)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ } else
+ goto done;
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->nlsn);
+ CHECK_LSN(file_dbp->env, op, cmp_p, &LSN(pagep), &argp->nlsn);
+
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Need to truncate the page. */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ HOFFSET(pagep) = file_dbp->pgsize;
+ NUM_ENT(pagep) = 0;
+ pagep->lsn = *lsnp;
+ } else if (cmp_n == 0 && !DB_REDO(op)) {
+ /* Need to put the data back on the page. */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ if (TYPE(pagep) == P_OVERFLOW) {
+ OV_REF(pagep) = OV_REF(argp->hdr.data);
+ OV_LEN(pagep) = OV_LEN(argp->hdr.data);
+ bp = (u_int8_t *)pagep + P_OVERHEAD(file_dbp);
+ memcpy(bp, argp->data.data, argp->data.size);
+ } else {
+ bp = (u_int8_t *)pagep +
+ (db_indx_t)(HOFFSET(pagep) - argp->data.size);
+ memcpy(bp, argp->data.data, argp->data.size);
+
+ if (argp->pg_copy)
+ memcpy(pagep, argp->hdr.data, argp->hdr.size);
+ else {
+ /* Copy index table. */
+ pinp = P_INP(file_dbp, pagep) + NUM_ENT(pagep);
+ ninp = P_INP(file_dbp, argp->hdr.data);
+ for (i = 0; i < NUM_ENT(argp->hdr.data); i++)
+ *pinp++ = *ninp++;
+ HOFFSET(pagep) -= argp->data.size;
+ NUM_ENT(pagep) += i;
+ }
+ }
+ pagep->lsn = argp->nlsn;
+ if (op == DB_TXN_ABORT) {
+ /*
+ * If we are undoing a meta/root page move we must
+ * bump the revision number. Put the handle
+ * locks back to their original state if we
+ * moved the metadata page.
+ */
+ i = 0;
+ if (file_dbp->type == DB_HASH) {
+ ht = file_dbp->h_internal;
+ if (argp->pgno == ht->meta_pgno) {
+ ht->meta_pgno = argp->npgno;
+ file_dbp->mpf->mfp->revision++;
+ i = 1;
+ }
+ } else {
+ bt = file_dbp->bt_internal;
+ if (argp->pgno == bt->bt_meta) {
+ file_dbp->mpf->mfp->revision++;
+ bt->bt_meta = argp->npgno;
+ i = 1;
+ } else if (argp->pgno == bt->bt_root) {
+ file_dbp->mpf->mfp->revision++;
+ bt->bt_root = argp->npgno;
+ }
+ }
+ if (argp->pgno == file_dbp->meta_pgno)
+ file_dbp->meta_pgno = argp->npgno;
+
+ /*
+ * If we detected a metadata page above, move
+ * the handle locks to the new page.
+ */
+ if (i == 1) {
+ handle_lock = file_dbp->handle_lock;
+ if ((ret = __fop_lock_handle(file_dbp->env,
+ file_dbp, file_dbp->locker, DB_LOCK_READ,
+ NULL, 0)) != 0)
+ goto out;
+
+ /* Move the other handles to the new lock. */
+ if ((ret = __lock_change(file_dbp->env,
+ &handle_lock, &file_dbp->handle_lock)) != 0)
+ goto out;
+ }
+ }
+ }
+
+ if ((ret = __memp_fput(mpf,
+ ip, pagep, dbc->priority)) != 0)
+ goto out;
+done:
+ *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: REC_CLOSE;
+}
+
+/*
+ * __db_pgno_recover --
+ *	Recovery function for page number replacement.
+ *
+ * PUBLIC: int __db_pgno_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_pgno_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ BINTERNAL *bi;
+ __db_pgno_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep, *npagep;
+ db_pgno_t pgno, *pgnop;
+ int cmp_n, cmp_p, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ REC_PRINT(__db_pgno_print);
+ REC_INTRO(__db_pgno_read, ip, 0);
+
+ REC_FGET(mpf, ip, argp->pgno, &pagep, done);
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn);
+ CHECK_LSN(file_dbp->env, op, cmp_p, &LSN(pagep), &argp->lsn);
+ CHECK_ABORT(file_dbp->env, op, cmp_n, &LSN(pagep), lsnp);
+
+ if ((cmp_p == 0 && DB_REDO(op)) || (cmp_n == 0 && !DB_REDO(op))) {
+ switch (TYPE(pagep)) {
+ case P_IBTREE:
+ /*
+			 * An internal record can have both an overflow
+			 * and a child pointer. Fetch the page to see
+ * which it is.
+ */
+ bi = GET_BINTERNAL(file_dbp, pagep, argp->indx);
+ if (B_TYPE(bi->type) == B_OVERFLOW) {
+ REC_FGET(mpf, ip, argp->npgno, &npagep, out);
+
+ if (TYPE(npagep) == P_OVERFLOW)
+ pgnop =
+ &((BOVERFLOW *)(bi->data))->pgno;
+ else
+ pgnop = &bi->pgno;
+ if ((ret = __memp_fput(mpf, ip,
+ npagep, file_dbp->priority)) != 0)
+ goto out;
+ break;
+ }
+ pgnop = &bi->pgno;
+ break;
+ case P_IRECNO:
+ pgnop =
+ &GET_RINTERNAL(file_dbp, pagep, argp->indx)->pgno;
+ break;
+ case P_HASH:
+ pgnop = &pgno;
+ break;
+ default:
+ pgnop =
+ &GET_BOVERFLOW(file_dbp, pagep, argp->indx)->pgno;
+ break;
+ }
+
+ if (DB_REDO(op)) {
+ /* Need to redo update described. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ *pgnop = argp->npgno;
+ pagep->lsn = *lsnp;
+ } else {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ *pgnop = argp->opgno;
+ pagep->lsn = argp->lsn;
+ }
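+		/*
+		 * Hash entries need not be aligned, so for P_HASH the
+		 * new page number above was built in a local (pgno) and
+		 * is copied into the entry byte-by-byte here.
+		 */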
+ if (TYPE(pagep) == P_HASH)
+ memcpy(HOFFDUP_PGNO(P_ENTRY(file_dbp,
+ pagep, argp->indx)), pgnop, sizeof(db_pgno_t));
+ }
+
+ if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+
+done:
+ *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: REC_CLOSE;
+}
+
+/*
+ * __db_pglist_swap -- swap a list of freelist pages.
+ * PUBLIC: void __db_pglist_swap __P((u_int32_t, void *));
+ */
+void
+__db_pglist_swap(size, list)
+ u_int32_t size;
+ void *list;
+{
+ db_pglist_t *lp;
+ u_int32_t nelem;
+
+ nelem = size / sizeof(db_pglist_t);
+
+ lp = (db_pglist_t *)list;
+ while (nelem-- > 0) {
+ P_32_SWAP(&lp->pgno);
+ P_32_SWAP(&lp->lsn.file);
+ P_32_SWAP(&lp->lsn.offset);
+ lp++;
+ }
+}
+
+/*
+ * __db_pglist_print -- print a list of freelist pages.
+ * PUBLIC: void __db_pglist_print __P((ENV *, DB_MSGBUF *, DBT *));
+ */
+void
+__db_pglist_print(env, mbp, list)
+ ENV *env;
+ DB_MSGBUF *mbp;
+ DBT *list;
+{
+ db_pglist_t *lp;
+ u_int32_t nelem;
+
+ nelem = list->size / sizeof(db_pglist_t);
+ lp = (db_pglist_t *)list->data;
+ __db_msgadd(env, mbp, "\t");
+ while (nelem-- > 0) {
+ __db_msgadd(env, mbp, "%lu [%lu][%lu]", (u_long)lp->pgno,
+ (u_long)lp->lsn.file, (u_long)lp->lsn.offset);
+ if (nelem % 4 == 0)
+ __db_msgadd(env, mbp, "\n\t");
+ else
+ __db_msgadd(env, mbp, " ");
+ lp++;
+ }
+}
diff --git a/src/db/db_reclaim.c b/src/db/db_reclaim.c
new file mode 100644
index 00000000..b902769a
--- /dev/null
+++ b/src/db/db_reclaim.c
@@ -0,0 +1,245 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/mp.h"
+
+/*
+ * __db_traverse_big
+ * Traverse a chain of overflow pages and call the callback routine
+ * on each one. The calling convention for the callback is:
+ * callback(dbc, page, cookie, did_put),
+ * where did_put is a return value indicating if the page in question has
+ * already been returned to the mpool.
+ *
+ * PUBLIC: int __db_traverse_big __P((DBC *, db_pgno_t,
+ * PUBLIC: int (*)(DBC *, PAGE *, void *, int *), void *));
+ */
+int
+__db_traverse_big(dbc, pgno, callback, cookie)
+ DBC *dbc;
+ db_pgno_t pgno;
+ int (*callback) __P((DBC *, PAGE *, void *, int *));
+ void *cookie;
+{
+ DB_MPOOLFILE *mpf;
+ PAGE *p;
+ int did_put, ret;
+
+ mpf = dbc->dbp->mpf;
+
+ do {
+ did_put = 0;
+ if ((ret = __memp_fget(mpf,
+ &pgno, dbc->thread_info, dbc->txn, 0, &p)) != 0)
+ return (ret);
+ /*
+ * If we are freeing pages only process the overflow
+ * chain if the head of the chain has a refcount of 1.
+ */
+ pgno = NEXT_PGNO(p);
+ if (callback == __db_truncate_callback && OV_REF(p) != 1)
+ pgno = PGNO_INVALID;
+ if ((ret = callback(dbc, p, cookie, &did_put)) == 0 &&
+ !did_put)
+ ret = __memp_fput(mpf,
+ dbc->thread_info, p, dbc->priority);
+ } while (ret == 0 && pgno != PGNO_INVALID);
+
+ return (ret);
+}
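+
+/*
+ * An illustrative sketch (not part of this change): a minimal callback
+ * matching the convention above, counting the pages in a chain.  It
+ * assumes the cookie points at a u_int32_t and leaves every page for
+ * __db_traverse_big to return to the mpool, so *putp stays zero.
+ *
+ *	static int
+ *	__count_pages_callback(dbc, p, cookie, putp)
+ *		DBC *dbc;
+ *		PAGE *p;
+ *		void *cookie;
+ *		int *putp;
+ *	{
+ *		COMPQUIET(dbc, NULL);
+ *		COMPQUIET(p, NULL);
+ *		++*(u_int32_t *)cookie;
+ *		*putp = 0;
+ *		return (0);
+ *	}
+ */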
+
+/*
+ * __db_reclaim_callback
+ * This is the callback routine used during a delete of a subdatabase.
+ * we are traversing a btree or hash table and trying to free all the
+ * pages. Since they share common code for duplicates and overflow
+ * items, we traverse them identically and use this routine to do the
+ * actual free. The reason that this is callback is because hash uses
+ * the same traversal code for statistics gathering.
+ *
+ * PUBLIC: int __db_reclaim_callback __P((DBC *, PAGE *, void *, int *));
+ */
+int
+__db_reclaim_callback(dbc, p, cookie, putp)
+ DBC *dbc;
+ PAGE *p;
+ void *cookie;
+ int *putp;
+{
+ DB *dbp;
+ int ret;
+
+ dbp = dbc->dbp;
+
+ /*
+ * We don't want to log the free of the root with the subdb.
+ * If we abort then the subdb may not be openable to undo
+ * the free.
+ */
+ if ((dbp->type == DB_BTREE || dbp->type == DB_RECNO) &&
+ PGNO(p) == ((BTREE *)dbp->bt_internal)->bt_root)
+ return (0);
+ if ((ret = __db_free(dbc, p, *(u_int32_t *)cookie)) != 0)
+ return (ret);
+ *putp = 1;
+
+ return (0);
+}
+
+/*
+ * __db_truncate_callback
+ * This is the callback routine used during a truncate.
+ * We are traversing a btree or hash table and trying to free all the
+ * pages.
+ *
+ * PUBLIC: int __db_truncate_callback __P((DBC *, PAGE *, void *, int *));
+ */
+int
+__db_truncate_callback(dbc, p, cookie, putp)
+ DBC *dbc;
+ PAGE *p;
+ void *cookie;
+ int *putp;
+{
+ DB *dbp;
+ DBT ddbt, ldbt;
+ DB_MPOOLFILE *mpf;
+ db_indx_t indx, len, off, tlen, top;
+ u_int8_t *hk, type;
+ u_int32_t *countp;
+ int ret;
+
+ top = NUM_ENT(p);
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ countp = cookie;
+ *putp = 1;
+
+ switch (TYPE(p)) {
+ case P_LBTREE:
+ /* Skip for off-page duplicates and deleted items. */
+ for (indx = 0; indx < top; indx += P_INDX) {
+ type = GET_BKEYDATA(dbp, p, indx + O_INDX)->type;
+ if (!B_DISSET(type) && B_TYPE(type) != B_DUPLICATE)
+ ++*countp;
+ }
+ /* FALLTHROUGH */
+ case P_IBTREE:
+ case P_IRECNO:
+ case P_INVALID:
+ if (dbp->type != DB_HASH &&
+ ((BTREE *)dbp->bt_internal)->bt_root == PGNO(p)) {
+ type = dbp->type == DB_RECNO ? P_LRECNO : P_LBTREE;
+ goto reinit;
+ }
+ break;
+ case P_OVERFLOW:
+ if ((ret = __memp_dirty(mpf,
+ &p, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ return (ret);
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __db_ovref_log(dbp, dbc->txn,
+ &LSN(p), 0, p->pgno, -1, &LSN(p))) != 0)
+ return (ret);
+ } else
+ LSN_NOT_LOGGED(LSN(p));
+ if (--OV_REF(p) != 0)
+ *putp = 0;
+ break;
+ case P_LRECNO:
+ for (indx = 0; indx < top; indx += O_INDX) {
+ type = GET_BKEYDATA(dbp, p, indx)->type;
+ if (!B_DISSET(type))
+ ++*countp;
+ }
+
+ if (((BTREE *)dbp->bt_internal)->bt_root == PGNO(p)) {
+ type = P_LRECNO;
+ goto reinit;
+ }
+ break;
+ case P_LDUP:
+ /* Correct for deleted items. */
+ for (indx = 0; indx < top; indx += O_INDX)
+ if (!B_DISSET(GET_BKEYDATA(dbp, p, indx)->type))
+ ++*countp;
+
+ break;
+ case P_HASH:
+ /* Correct for on-page duplicates and deleted items. */
+ for (indx = 0; indx < top; indx += P_INDX) {
+ switch (*H_PAIRDATA(dbp, p, indx)) {
+ case H_OFFDUP:
+ break;
+ case H_OFFPAGE:
+ case H_KEYDATA:
+ ++*countp;
+ break;
+ case H_DUPLICATE:
+ tlen = LEN_HDATA(dbp, p, 0, indx);
+ hk = H_PAIRDATA(dbp, p, indx);
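+				/*
+				 * Each element of an on-page duplicate
+				 * set is stored as a length, the data
+				 * and a trailing length, so step by the
+				 * data length plus two size fields.
+				 */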
+ for (off = 0; off < tlen;
+ off += len + 2 * sizeof(db_indx_t)) {
+ ++*countp;
+ memcpy(&len,
+ HKEYDATA_DATA(hk)
+ + off, sizeof(db_indx_t));
+ }
+ break;
+ default:
+ return (__db_pgfmt(dbp->env, p->pgno));
+ }
+ }
+ /* Don't free the head of the bucket. */
+ if (PREV_PGNO(p) == PGNO_INVALID) {
+ type = P_HASH;
+
+reinit: if ((ret = __memp_dirty(mpf, &p,
+ dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ return (ret);
+ *putp = 0;
+ if (DBC_LOGGING(dbc)) {
+ memset(&ldbt, 0, sizeof(ldbt));
+ memset(&ddbt, 0, sizeof(ddbt));
+ ldbt.data = p;
+ ldbt.size = P_OVERHEAD(dbp);
+ ldbt.size += p->entries * sizeof(db_indx_t);
+ ddbt.data = (u_int8_t *)p + HOFFSET(p);
+ ddbt.size = dbp->pgsize - HOFFSET(p);
+ if ((ret = __db_pg_init_log(dbp,
+ dbc->txn, &LSN(p), 0,
+ p->pgno, &ldbt, &ddbt)) != 0)
+ return (ret);
+ } else
+ LSN_NOT_LOGGED(LSN(p));
+
+ P_INIT(p, dbp->pgsize, PGNO(p), PGNO_INVALID,
+ PGNO_INVALID, type == P_HASH ? 0 : 1, type);
+ }
+ break;
+ default:
+ return (__db_pgfmt(dbp->env, p->pgno));
+ }
+
+ if (*putp == 1) {
+ if ((ret = __db_free(dbc, p, 0)) != 0)
+ return (ret);
+ } else {
+ if ((ret = __memp_fput(mpf, dbc->thread_info, p,
+ dbc->priority)) != 0)
+ return (ret);
+ *putp = 1;
+ }
+
+ return (0);
+}
diff --git a/src/db/db_remove.c b/src/db/db_remove.c
new file mode 100644
index 00000000..591a29b2
--- /dev/null
+++ b/src/db/db_remove.c
@@ -0,0 +1,515 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/fop.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+static int __db_dbtxn_remove __P((DB *,
+ DB_THREAD_INFO *, DB_TXN *, const char *, const char *));
+static int __db_subdb_remove __P((DB *,
+ DB_THREAD_INFO *, DB_TXN *, const char *, const char *, u_int32_t));
+
+/*
+ * __env_dbremove_pp
+ * ENV->dbremove pre/post processing.
+ *
+ * PUBLIC: int __env_dbremove_pp __P((DB_ENV *,
+ * PUBLIC: DB_TXN *, const char *, const char *, u_int32_t));
+ */
+int
+__env_dbremove_pp(dbenv, txn, name, subdb, flags)
+ DB_ENV *dbenv;
+ DB_TXN *txn;
+ const char *name, *subdb;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ret, t_ret, txn_local;
+
+ dbp = NULL;
+ env = dbenv->env;
+ txn_local = 0;
+ handle_check = 0;
+
+ ENV_ILLEGAL_BEFORE_OPEN(env, "DB_ENV->dbremove");
+
+ /*
+ * The actual argument checking is simple, do it inline, outside of
+ * the replication block.
+ */
+ if ((ret = __db_fchk(env, "DB->remove", flags,
+ DB_AUTO_COMMIT | DB_LOG_NO_DATA |
+ DB_NOSYNC | DB_TXN_NOT_DURABLE)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+ XA_NO_TXN(ip, ret);
+ if (ret != 0)
+ goto err;
+
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check && (ret = __env_rep_enter(env, 1)) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ /*
+ * Create local transaction as necessary, check for consistent
+ * transaction usage.
+ */
+ if (IS_ENV_AUTO_COMMIT(env, txn, flags)) {
+ if ((ret = __db_txn_auto_init(env, ip, &txn)) != 0)
+ goto err;
+ txn_local = 1;
+ } else if (txn != NULL && !TXN_ON(env) &&
+ (!CDB_LOCKING(env) || !F_ISSET(txn, TXN_FAMILY))) {
+ ret = __db_not_txn_env(env);
+ goto err;
+ } else if (txn != NULL && LF_ISSET(DB_LOG_NO_DATA)) {
+ ret = EINVAL;
+ __db_errx(env, DB_STR("0690",
+ "DB_LOG_NO_DATA may not be specified within a transaction."));
+ goto err;
+ }
+ LF_CLR(DB_AUTO_COMMIT);
+
+ if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+ goto err;
+ if (LF_ISSET(DB_TXN_NOT_DURABLE) &&
+ (ret = __db_set_flags(dbp, DB_TXN_NOT_DURABLE)) != 0)
+ goto err;
+ LF_CLR(DB_TXN_NOT_DURABLE);
+
+ ret = __db_remove_int(dbp, ip, txn, name, subdb, flags);
+
+ if (txn_local) {
+ /*
+ * We created the DBP here and when we commit/abort, we'll
+ * release all the transactional locks, including the handle
+ * lock; mark the handle cleared explicitly.
+ */
+ LOCK_INIT(dbp->handle_lock);
+ dbp->locker = NULL;
+ } else if (IS_REAL_TXN(txn)) {
+ /*
+ * We created this handle locally so we need to close it
+ * and clean it up. Unfortunately, it's holding transactional
+ * locks that need to persist until the end of transaction.
+ * If we invalidate the locker id (dbp->locker), then the close
+ * won't free these locks prematurely.
+ */
+ dbp->locker = NULL;
+ }
+
+err: if (txn_local && (t_ret =
+ __db_txn_auto_resolve(env, txn, 0, ret)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /*
+ * We never opened this dbp for real, so don't include a transaction
+ * handle, and use NOSYNC to avoid calling into mpool.
+ *
+ * !!!
+ * Note we're reversing the order of operations: we started the txn and
+ * then opened the DB handle; we're resolving the txn and then closing
+	 * then opened the DB handle; we're resolving the txn and then
+	 * closing the DB handle -- a DB handle cannot be closed before
+ */
+ if (dbp != NULL &&
+ (t_ret = __db_close(dbp, NULL, DB_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __db_remove_pp
+ * DB->remove pre/post processing.
+ *
+ * PUBLIC: int __db_remove_pp
+ * PUBLIC: __P((DB *, const char *, const char *, u_int32_t));
+ */
+int
+__db_remove_pp(dbp, name, subdb, flags)
+ DB *dbp;
+ const char *name, *subdb;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ret, t_ret;
+
+ env = dbp->env;
+
+ /*
+ * Validate arguments, continuing to destroy the handle on failure.
+ *
+ * Cannot use DB_ILLEGAL_AFTER_OPEN directly because it returns.
+ *
+ * !!!
+ * We have a serious problem if we're here with a handle used to open
+ * a database -- we'll destroy the handle, and the application won't
+ * ever be able to close the database.
+ */
+ if (F_ISSET(dbp, DB_AM_OPEN_CALLED))
+ return (__db_mi_open(env, "DB->remove", 1));
+
+ /* Validate arguments. */
+ if ((ret = __db_fchk(env, "DB->remove", flags, DB_NOSYNC)) != 0)
+ return (ret);
+
+ /* Check for consistent transaction usage. */
+ if ((ret = __db_check_txn(dbp, NULL, DB_LOCK_INVALIDID, 0)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check && (ret = __db_rep_enter(dbp, 1, 1, 0)) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ /* Remove the file. */
+ ret = __db_remove(dbp, ip, NULL, name, subdb, flags);
+
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+err: ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __db_remove
+ * DB->remove method.
+ *
+ * PUBLIC: int __db_remove __P((DB *, DB_THREAD_INFO *,
+ * PUBLIC: DB_TXN *, const char *, const char *, u_int32_t));
+ */
+int
+__db_remove(dbp, ip, txn, name, subdb, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name, *subdb;
+ u_int32_t flags;
+{
+ int ret, t_ret;
+
+ ret = __db_remove_int(dbp, ip, txn, name, subdb, flags);
+
+ if ((t_ret = __db_close(dbp, txn, DB_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __db_remove_int
+ * Worker function for the DB->remove method.
+ *
+ * PUBLIC: int __db_remove_int __P((DB *, DB_THREAD_INFO *,
+ * PUBLIC: DB_TXN *, const char *, const char *, u_int32_t));
+ */
+int
+__db_remove_int(dbp, ip, txn, name, subdb, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name, *subdb;
+ u_int32_t flags;
+{
+ ENV *env;
+ int ret;
+ char *real_name, *tmpname;
+
+ env = dbp->env;
+ real_name = tmpname = NULL;
+
+ if (name == NULL && subdb == NULL) {
+ __db_errx(env, DB_STR("0691",
+ "Remove on temporary files invalid"));
+ ret = EINVAL;
+ goto err;
+ }
+
+ if (name == NULL) {
+ MAKE_INMEM(dbp);
+ real_name = (char *)subdb;
+ } else if (subdb != NULL) {
+ ret = __db_subdb_remove(dbp, ip, txn, name, subdb, flags);
+ goto err;
+ }
+
+ /* Handle transactional file removes separately. */
+ if (IS_REAL_TXN(txn)) {
+ ret = __db_dbtxn_remove(dbp, ip, txn, name, subdb);
+ goto err;
+ }
+
+ /*
+ * The remaining case is a non-transactional file remove.
+ *
+ * Find the real name of the file.
+ */
+ if (!F_ISSET(dbp, DB_AM_INMEM) && (ret = __db_appname(env,
+ DB_APP_DATA, name, &dbp->dirname, &real_name)) != 0)
+ goto err;
+
+ /*
+ * If this is a file and force is set, remove the temporary file, which
+ * may have been left around. Ignore errors because the temporary file
+ * might not exist.
+ */
+ if (!F_ISSET(dbp, DB_AM_INMEM) && LF_ISSET(DB_FORCE) &&
+ (ret = __db_backup_name(env, real_name, NULL, &tmpname)) == 0)
+ (void)__os_unlink(env, tmpname, 0);
+
+ if ((ret = __fop_remove_setup(dbp, NULL, real_name, 0)) != 0)
+ goto err;
+
+ if (dbp->db_am_remove != NULL &&
+ (ret = dbp->db_am_remove(dbp, ip, NULL, name, subdb, flags)) != 0)
+ goto err;
+
+ ret = F_ISSET(dbp, DB_AM_INMEM) ?
+ __db_inmem_remove(dbp, NULL, real_name) :
+ __fop_remove(env,
+ NULL, dbp->fileid, name, &dbp->dirname, DB_APP_DATA,
+ F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0);
+
+err: if (!F_ISSET(dbp, DB_AM_INMEM) && real_name != NULL)
+ __os_free(env, real_name);
+ if (tmpname != NULL)
+ __os_free(env, tmpname);
+
+ return (ret);
+}
+
+/*
+ * __db_inmem_remove --
+ * Removal of a named in-memory database.
+ *
+ * PUBLIC: int __db_inmem_remove __P((DB *, DB_TXN *, const char *));
+ */
+int
+__db_inmem_remove(dbp, txn, name)
+ DB *dbp;
+ DB_TXN *txn;
+ const char *name;
+{
+ DBT fid_dbt, name_dbt;
+ DB_LOCKER *locker;
+ DB_LSN lsn;
+ ENV *env;
+ int ret;
+
+ env = dbp->env;
+ locker = NULL;
+
+ DB_ASSERT(env, name != NULL);
+
+ /* This had better exist if we are trying to do a remove. */
+ (void)__memp_set_flags(dbp->mpf, DB_MPOOL_NOFILE, 1);
+ if ((ret = __memp_fopen(dbp->mpf, NULL,
+ name, &dbp->dirname, 0, 0, 0)) != 0)
+ return (ret);
+ if ((ret = __memp_get_fileid(dbp->mpf, dbp->fileid)) != 0)
+ return (ret);
+ dbp->preserve_fid = 1;
+
+ if (LOCKING_ON(env)) {
+ if (dbp->locker == NULL &&
+ (ret = __lock_id(env, NULL, &dbp->locker)) != 0)
+ return (ret);
+ if (!CDB_LOCKING(env) &&
+ txn != NULL && F_ISSET(txn, TXN_INFAMILY)) {
+ if ((ret = __lock_addfamilylocker(env,
+ txn->txnid, dbp->locker->id, 1)) != 0)
+ return (ret);
+ txn = NULL;
+ }
+ locker = txn == NULL ? dbp->locker : txn->locker;
+ }
+
+ /*
+ * In a transactional environment, we'll play the same game we play
+ * for databases in the file system -- create a temporary database
+ * and put it in with the current name and then rename this one to
+ * another name. We'll then use a commit-time event to remove the
+ * entry.
+ */
+ if ((ret =
+ __fop_lock_handle(env, dbp, locker, DB_LOCK_WRITE, NULL, 0)) != 0)
+ return (ret);
+
+ if (!IS_REAL_TXN(txn))
+ ret = __memp_nameop(env, dbp->fileid, NULL, name, NULL, 1);
+ else if (LOGGING_ON(env)) {
+ if (txn != NULL && (ret =
+ __txn_remevent(env, txn, name, dbp->fileid, 1)) != 0)
+ return (ret);
+
+ DB_INIT_DBT(name_dbt, name, strlen(name) + 1);
+ DB_INIT_DBT(fid_dbt, dbp->fileid, DB_FILE_ID_LEN);
+ ret = __crdel_inmem_remove_log(
+ env, txn, &lsn, 0, &name_dbt, &fid_dbt);
+ }
+
+ return (ret);
+}
+
+/*
+ * __db_subdb_remove --
+ * Remove a subdatabase.
+ */
+static int
+__db_subdb_remove(dbp, ip, txn, name, subdb, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name, *subdb;
+ u_int32_t flags;
+{
+ DB *mdbp, *sdbp;
+ int ret, t_ret;
+
+ mdbp = sdbp = NULL;
+
+ /* Open the subdatabase. */
+ if ((ret = __db_create_internal(&sdbp, dbp->env, 0)) != 0)
+ goto err;
+ if (F_ISSET(dbp, DB_AM_NOT_DURABLE) &&
+ (ret = __db_set_flags(sdbp, DB_TXN_NOT_DURABLE)) != 0)
+ goto err;
+ if ((ret = __db_open(sdbp, ip,
+ txn, name, subdb, DB_UNKNOWN, DB_WRITEOPEN, 0, PGNO_BASE_MD)) != 0)
+ goto err;
+
+ DB_TEST_RECOVERY(sdbp, DB_TEST_PREDESTROY, ret, name);
+
+ /* Have the handle locked so we will not lock pages. */
+ LOCK_CHECK_OFF(ip);
+
+ /* Free up the pages in the subdatabase. */
+ switch (sdbp->type) {
+ case DB_BTREE:
+ case DB_RECNO:
+ if ((ret = __bam_reclaim(sdbp, ip, txn, flags)) != 0)
+ goto err;
+ break;
+ case DB_HASH:
+ if ((ret = __ham_reclaim(sdbp, ip, txn, flags)) != 0)
+ goto err;
+ break;
+ case DB_QUEUE:
+ case DB_UNKNOWN:
+ default:
+ ret = __db_unknown_type(
+ sdbp->env, "__db_subdb_remove", sdbp->type);
+ goto err;
+ }
+
+ /*
+ * Remove the entry from the main database and free the subdatabase
+ * metadata page.
+ */
+ if ((ret = __db_master_open(sdbp, ip, txn, name, 0, 0, &mdbp)) != 0)
+ goto err;
+
+ if ((ret = __db_master_update(mdbp,
+ sdbp, ip, txn, subdb, sdbp->type, MU_REMOVE, NULL, 0)) != 0)
+ goto err;
+
+ DB_TEST_RECOVERY(sdbp, DB_TEST_POSTDESTROY, ret, name);
+
+DB_TEST_RECOVERY_LABEL
+err:
+ /* Close the main and subdatabases. */
+ if ((t_ret = __db_close(sdbp, txn, DB_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (mdbp != NULL && (t_ret = __db_close(mdbp, txn,
+ (LF_ISSET(DB_NOSYNC) || txn != NULL) ? DB_NOSYNC : 0)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+
+ LOCK_CHECK_ON(ip);
+ return (ret);
+}
+
+static int
+__db_dbtxn_remove(dbp, ip, txn, name, subdb)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name, *subdb;
+{
+ ENV *env;
+ int ret;
+ char *tmpname;
+
+ env = dbp->env;
+ tmpname = NULL;
+
+ /*
+ * This is a transactional remove, so we have to keep the name
+ * of the file locked until the transaction commits. As a result,
+ * we implement remove by renaming the file to some other name
+ * (which creates a dummy named file as a placeholder for the
+	 * file being renamed/removed) and then deleting that file as
+ * a delayed remove at commit.
+ */
+ if ((ret = __db_backup_name(env,
+ F_ISSET(dbp, DB_AM_INMEM) ? subdb : name, txn, &tmpname)) != 0)
+ return (ret);
+
+ DB_TEST_RECOVERY(dbp, DB_TEST_PREDESTROY, ret, name);
+
+ if ((ret = __db_rename_int(dbp,
+ txn->thread_info, txn, name, subdb, tmpname, DB_NOSYNC)) != 0)
+ goto err;
+
+ /*
+ * The internal removes will also translate into delayed removes.
+ */
+ if (dbp->db_am_remove != NULL &&
+ (ret = dbp->db_am_remove(dbp, ip, txn, tmpname, NULL, 0)) != 0)
+ goto err;
+
+ ret = F_ISSET(dbp, DB_AM_INMEM) ?
+ __db_inmem_remove(dbp, txn, tmpname) :
+ __fop_remove(env,
+ txn, dbp->fileid, tmpname, &dbp->dirname, DB_APP_DATA,
+ F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0);
+
+ DB_TEST_RECOVERY(dbp, DB_TEST_POSTDESTROY, ret, name);
+
+err:
+DB_TEST_RECOVERY_LABEL
+ if (tmpname != NULL)
+ __os_free(env, tmpname);
+
+ return (ret);
+}
diff --git a/src/db/db_rename.c b/src/db/db_rename.c
new file mode 100644
index 00000000..2812b948
--- /dev/null
+++ b/src/db/db_rename.c
@@ -0,0 +1,383 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/fop.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+static int __db_rename __P((DB *, DB_THREAD_INFO *,
+ DB_TXN *, const char *, const char *, const char *, u_int32_t));
+static int __db_subdb_rename __P((DB *, DB_THREAD_INFO *,
+ DB_TXN *, const char *, const char *, const char *, u_int32_t));
+
+/*
+ * __env_dbrename_pp
+ * ENV->dbrename pre/post processing.
+ *
+ * PUBLIC: int __env_dbrename_pp __P((DB_ENV *, DB_TXN *,
+ * PUBLIC: const char *, const char *, const char *, u_int32_t));
+ */
+int
+__env_dbrename_pp(dbenv, txn, name, subdb, newname, flags)
+ DB_ENV *dbenv;
+ DB_TXN *txn;
+ const char *name, *subdb, *newname;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ret, t_ret, txn_local;
+
+ env = dbenv->env;
+ dbp = NULL;
+ txn_local = 0;
+ handle_check = 0;
+
+ ENV_ILLEGAL_BEFORE_OPEN(env, "DB_ENV->dbrename");
+
+ /*
+ * The actual argument checking is simple, do it inline, outside of
+ * the replication block.
+ */
+ if ((ret = __db_fchk(env, "DB->rename", flags,
+ DB_AUTO_COMMIT | DB_NOSYNC)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+ XA_NO_TXN(ip, ret);
+ if (ret != 0)
+ goto err;
+
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check && (ret = __env_rep_enter(env, 1)) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ /*
+ * Create local transaction as necessary, check for consistent
+ * transaction usage.
+ */
+ if (IS_ENV_AUTO_COMMIT(env, txn, flags)) {
+ if ((ret = __db_txn_auto_init(env, ip, &txn)) != 0)
+ goto err;
+ txn_local = 1;
+ } else
+ if (txn != NULL && !TXN_ON(env) &&
+ (!CDB_LOCKING(env) || !F_ISSET(txn, TXN_FAMILY))) {
+ ret = __db_not_txn_env(env);
+ goto err;
+ }
+
+ LF_CLR(DB_AUTO_COMMIT);
+
+ if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+ goto err;
+
+ ret = __db_rename_int(dbp, ip, txn, name, subdb, newname, flags);
+
+ if (txn_local) {
+ /*
+ * We created the DBP here and when we commit/abort, we'll
+ * release all the transactional locks, including the handle
+ * lock; mark the handle cleared explicitly.
+ */
+ LOCK_INIT(dbp->handle_lock);
+ dbp->locker = NULL;
+ } else if (IS_REAL_TXN(txn)) {
+ /*
+ * We created this handle locally so we need to close it and
+ * clean it up. Unfortunately, it's holding transactional
+ * or CDS group locks that need to persist until the end of
+ * transaction. If we invalidate the locker (dbp->locker),
+ * then the close won't free these locks prematurely.
+ */
+ dbp->locker = NULL;
+ }
+
+err: if (txn_local && (t_ret =
+ __db_txn_auto_resolve(env, txn, 0, ret)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /*
+ * We never opened this dbp for real, so don't include a transaction
+ * handle, and use NOSYNC to avoid calling into mpool.
+ *
+ * !!!
+ * Note we're reversing the order of operations: we started the txn and
+	 * then opened the DB handle; we're resolving the txn and then
+	 * closing the DB handle -- it's safer.
+ */
+ if (dbp != NULL &&
+ (t_ret = __db_close(dbp, NULL, DB_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __db_rename_pp
+ * DB->rename pre/post processing.
+ *
+ * PUBLIC: int __db_rename_pp __P((DB *,
+ * PUBLIC: const char *, const char *, const char *, u_int32_t));
+ */
+int
+__db_rename_pp(dbp, name, subdb, newname, flags)
+ DB *dbp;
+ const char *name, *subdb, *newname;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ret, t_ret;
+
+ env = dbp->env;
+ handle_check = 0;
+
+ /*
+ * Validate arguments, continuing to destroy the handle on failure.
+ *
+ * Cannot use DB_ILLEGAL_AFTER_OPEN directly because it returns.
+ *
+ * !!!
+ * We have a serious problem if we're here with a handle used to open
+ * a database -- we'll destroy the handle, and the application won't
+ * ever be able to close the database.
+ */
+ if (F_ISSET(dbp, DB_AM_OPEN_CALLED))
+ return (__db_mi_open(env, "DB->rename", 1));
+
+ /* Validate arguments. */
+ if ((ret = __db_fchk(env, "DB->rename", flags, DB_NOSYNC)) != 0)
+ return (ret);
+
+ /* Check for consistent transaction usage. */
+ if ((ret = __db_check_txn(dbp, NULL, DB_LOCK_INVALIDID, 0)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check && (ret = __db_rep_enter(dbp, 1, 1, 0)) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ /* Rename the file. */
+ ret = __db_rename(dbp, ip, NULL, name, subdb, newname, flags);
+
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+err: ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __db_rename
+ * DB->rename method.
+ *
+ */
+static int
+__db_rename(dbp, ip, txn, name, subdb, newname, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name, *subdb, *newname;
+ u_int32_t flags;
+{
+ int ret, t_ret;
+
+ ret = __db_rename_int(dbp, ip, txn, name, subdb, newname, flags);
+
+ if ((t_ret = __db_close(dbp, txn, DB_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __db_rename_int
+ * Worker function for DB->rename method; the close of the dbp is
+ * left in the wrapper routine.
+ *
+ * PUBLIC: int __db_rename_int __P((DB *, DB_THREAD_INFO *,
+ * PUBLIC: DB_TXN *, const char *, const char *, const char *, u_int32_t));
+ */
+int
+__db_rename_int(dbp, ip, txn, name, subdb, newname, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name, *subdb, *newname;
+ u_int32_t flags;
+{
+ ENV *env;
+ int ret;
+ char *old, *real_name;
+
+ env = dbp->env;
+ real_name = NULL;
+
+ DB_TEST_RECOVERY(dbp, DB_TEST_PREDESTROY, ret, name);
+
+ if (name == NULL && subdb == NULL) {
+ __db_errx(env, DB_STR("0503",
+ "Rename on temporary files invalid"));
+ ret = EINVAL;
+ goto err;
+ }
+
+ if (name == NULL)
+ MAKE_INMEM(dbp);
+ else if (subdb != NULL) {
+ ret = __db_subdb_rename(dbp, ip,
+ txn, name, subdb, newname, flags);
+ goto err;
+ }
+
+ /*
+ * From here on down, this pertains to files or in-memory databases.
+ *
+ * Find the real name of the file.
+ */
+ if (F_ISSET(dbp, DB_AM_INMEM)) {
+ old = (char *)subdb;
+ real_name = (char *)subdb;
+ } else {
+ if ((ret = __db_appname(env, DB_APP_DATA,
+ name, &dbp->dirname, &real_name)) != 0)
+ goto err;
+ old = (char *)name;
+ }
+ DB_ASSERT(env, old != NULL);
+
+ if ((ret = __fop_remove_setup(dbp, txn, real_name, 0)) != 0)
+ goto err;
+
+ if (dbp->db_am_rename != NULL &&
+ (ret = dbp->db_am_rename(dbp, ip, txn, name, subdb, newname)) != 0)
+ goto err;
+
+ /*
+ * The transactional case and non-transactional case are
+ * quite different. In the non-transactional case, we simply
+ * do the rename. In the transactional case, since we need
+ * the ability to back out and maintain locking, we have to
+ * create a temporary object as a placeholder. This is all
+ * taken care of in the fop layer.
+ */
+ if (IS_REAL_TXN(txn)) {
+ if ((ret = __fop_dummy(dbp, txn, old, newname)) != 0)
+ goto err;
+ } else {
+ if ((ret = __fop_dbrename(dbp, old, newname)) != 0)
+ goto err;
+ }
+
+ /*
+ * I am pretty sure that we haven't gotten a dbreg id, so calling
+ * dbreg_filelist_update is not necessary.
+ */
+ DB_ASSERT(env, dbp->log_filename == NULL ||
+ dbp->log_filename->id == DB_LOGFILEID_INVALID);
+
+ DB_TEST_RECOVERY(dbp, DB_TEST_POSTDESTROY, ret, newname);
+
+DB_TEST_RECOVERY_LABEL
+err: if (!F_ISSET(dbp, DB_AM_INMEM) && real_name != NULL)
+ __os_free(env, real_name);
+
+ return (ret);
+}
+
+/*
+ * __db_subdb_rename --
+ * Rename a subdatabase.
+ */
+static int
+__db_subdb_rename(dbp, ip, txn, name, subdb, newname, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name, *subdb, *newname;
+ u_int32_t flags;
+{
+ DB *mdbp;
+ ENV *env;
+ PAGE *meta;
+ int ret, t_ret;
+
+ mdbp = NULL;
+ meta = NULL;
+ env = dbp->env;
+
+ /*
+ * We have not opened this dbp so it isn't marked as a subdb,
+ * but it ought to be.
+ */
+ F_SET(dbp, DB_AM_SUBDB);
+
+ /*
+ * Rename the entry in the main database. We need to first
+ * get the meta-data page number (via MU_OPEN) so that we can
+ * read the meta-data page and obtain a handle lock. Once we've
+ * done that, we can proceed to do the rename in the master.
+ */
+ if ((ret = __db_master_open(dbp, ip, txn, name, 0, 0, &mdbp)) != 0)
+ goto err;
+
+ if ((ret = __db_master_update(mdbp, dbp, ip, txn, subdb, dbp->type,
+ MU_OPEN, NULL, 0)) != 0)
+ goto err;
+
+ if ((ret = __memp_fget(mdbp->mpf, &dbp->meta_pgno,
+ ip, txn, 0, &meta)) != 0)
+ goto err;
+ memcpy(dbp->fileid, ((DBMETA *)meta)->uid, DB_FILE_ID_LEN);
+ if ((ret = __fop_lock_handle(env, dbp,
+ (mdbp->cur_locker != NULL) ? mdbp->cur_locker : mdbp->locker,
+ DB_LOCK_WRITE, NULL, NOWAIT_FLAG(txn))) != 0)
+ goto err;
+
+ ret = __memp_fput(mdbp->mpf, ip, meta, dbp->priority);
+ meta = NULL;
+ if (ret != 0)
+ goto err;
+
+ if ((ret = __db_master_update(mdbp, dbp, ip, txn,
+ subdb, dbp->type, MU_RENAME, newname, 0)) != 0)
+ goto err;
+
+ DB_TEST_RECOVERY(dbp, DB_TEST_POSTDESTROY, ret, name);
+
+DB_TEST_RECOVERY_LABEL
+err:
+ if (meta != NULL && (t_ret =
+ __memp_fput(mdbp->mpf, ip, meta, dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (mdbp != NULL && (t_ret = __db_close(mdbp, txn,
+ (LF_ISSET(DB_NOSYNC) || txn != NULL) ? DB_NOSYNC : 0)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
diff --git a/src/db/db_ret.c b/src/db/db_ret.c
new file mode 100644
index 00000000..709605f6
--- /dev/null
+++ b/src/db/db_ret.c
@@ -0,0 +1,169 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/heap.h"
+
+/*
+ * __db_ret --
+ * Build return DBT.
+ *
+ * PUBLIC: int __db_ret __P((DBC *,
+ * PUBLIC: PAGE *, u_int32_t, DBT *, void **, u_int32_t *));
+ */
+int
+__db_ret(dbc, h, indx, dbt, memp, memsize)
+ DBC *dbc;
+ PAGE *h;
+ u_int32_t indx;
+ DBT *dbt;
+ void **memp;
+ u_int32_t *memsize;
+{
+ BKEYDATA *bk;
+ BOVERFLOW *bo;
+ DB *dbp;
+ HEAPHDR *hdr;
+ HOFFPAGE ho;
+ u_int32_t len;
+ u_int8_t *hk;
+ void *data;
+
+ if (F_ISSET(dbt, DB_DBT_READONLY))
+ return (0);
+ dbp = dbc->dbp;
+
+ switch (TYPE(h)) {
+ case P_HASH_UNSORTED:
+ case P_HASH:
+ hk = P_ENTRY(dbp, h, indx);
+ if (HPAGE_PTYPE(hk) == H_OFFPAGE) {
+ memcpy(&ho, hk, sizeof(HOFFPAGE));
+ return (__db_goff(dbc, dbt,
+ ho.tlen, ho.pgno, memp, memsize));
+ }
+ len = LEN_HKEYDATA(dbp, h, dbp->pgsize, indx);
+ data = HKEYDATA_DATA(hk);
+ break;
+ case P_HEAP:
+ hdr = (HEAPHDR *)P_ENTRY(dbp, h, indx);
+ if (F_ISSET(hdr,(HEAP_RECSPLIT | HEAP_RECFIRST)))
+ return (__heapc_gsplit(dbc, dbt, memp, memsize));
+ len = hdr->size;
+ data = (u_int8_t *)hdr + sizeof(HEAPHDR);
+ break;
+ case P_LBTREE:
+ case P_LDUP:
+ case P_LRECNO:
+ bk = GET_BKEYDATA(dbp, h, indx);
+ if (B_TYPE(bk->type) == B_OVERFLOW) {
+ bo = (BOVERFLOW *)bk;
+ return (__db_goff(dbc, dbt,
+ bo->tlen, bo->pgno, memp, memsize));
+ }
+ len = bk->len;
+ data = bk->data;
+ break;
+ default:
+ return (__db_pgfmt(dbp->env, h->pgno));
+ }
+
+ return (__db_retcopy(dbp->env, dbt, data, len, memp, memsize));
+}
+
+/*
+ * __db_retcopy --
+ * Copy the returned data into the user's DBT, handling special flags.
+ *
+ * PUBLIC: int __db_retcopy __P((ENV *, DBT *,
+ * PUBLIC: void *, u_int32_t, void **, u_int32_t *));
+ */
+int
+__db_retcopy(env, dbt, data, len, memp, memsize)
+ ENV *env;
+ DBT *dbt;
+ void *data;
+ u_int32_t len;
+ void **memp;
+ u_int32_t *memsize;
+{
+ int ret;
+
+ if (F_ISSET(dbt, DB_DBT_READONLY))
+ return (0);
+ ret = 0;
+
+ /* If returning a partial record, reset the length. */
+ if (F_ISSET(dbt, DB_DBT_PARTIAL)) {
+ data = (u_int8_t *)data + dbt->doff;
+ if (len > dbt->doff) {
+ len -= dbt->doff;
+ if (len > dbt->dlen)
+ len = dbt->dlen;
+ } else
+ len = 0;
+ }
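+	/*
+	 * For example (illustrative only): for a 10-byte record with
+	 * doff == 4 and dlen == 3, bytes 4 through 6 are returned; if
+	 * doff is past the end of the record, nothing is returned.
+	 */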
+
+ /*
+ * Allocate memory to be owned by the application: DB_DBT_MALLOC,
+ * DB_DBT_REALLOC.
+ *
+ * !!!
+ * We always allocate memory, even if we're copying out 0 bytes. This
+ * guarantees consistency, i.e., the application can always free memory
+ * without concern as to how many bytes of the record were requested.
+ *
+ * Use the memory specified by the application: DB_DBT_USERMEM.
+ *
+ * !!!
+ * If the length we're going to copy is 0, the application-supplied
+ * memory pointer is allowed to be NULL.
+ */
+ if (F_ISSET(dbt, DB_DBT_USERCOPY)) {
+ dbt->size = len;
+ return (len == 0 ? 0 : env->dbt_usercopy(dbt, 0, data,
+ len, DB_USERCOPY_SETDATA));
+
+ } else if (F_ISSET(dbt, DB_DBT_MALLOC))
+ ret = __os_umalloc(env, len, &dbt->data);
+ else if (F_ISSET(dbt, DB_DBT_REALLOC)) {
+ if (dbt->data == NULL || dbt->size == 0 || dbt->size < len)
+ ret = __os_urealloc(env, len, &dbt->data);
+ } else if (F_ISSET(dbt, DB_DBT_USERMEM)) {
+ if (len != 0 && (dbt->data == NULL || dbt->ulen < len))
+ ret = DB_BUFFER_SMALL;
+ } else if (memp == NULL || memsize == NULL)
+ ret = EINVAL;
+ else {
+ if (len != 0 && (*memsize == 0 || *memsize < len)) {
+ if ((ret = __os_realloc(env, len, memp)) == 0)
+ *memsize = len;
+ else
+ *memsize = 0;
+ }
+ if (ret == 0)
+ dbt->data = *memp;
+ }
+
+ if (ret == 0 && len != 0)
+ memcpy(dbt->data, data, len);
+
+ /*
+ * Return the length of the returned record in the DBT size field.
+ * This satisfies the requirement that if we're using user memory
+ * and insufficient memory was provided, return the amount necessary
+ * in the size field.
+ */
+ dbt->size = len;
+
+ return (ret);
+}
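+
+/*
+ * An illustrative caller-side sketch (not part of this change) of the
+ * DB_DBT_USERMEM contract enforced above: on DB_BUFFER_SMALL, the size
+ * field holds the length required, so a caller can grow its buffer and
+ * retry.  The names key, buf and buflen are hypothetical.
+ *
+ *	DBT data;
+ *
+ *	memset(&data, 0, sizeof(data));
+ *	data.flags = DB_DBT_USERMEM;
+ *	data.data = buf;
+ *	data.ulen = buflen;
+ *	if ((ret = dbp->get(dbp, NULL, &key, &data, 0)) ==
+ *	    DB_BUFFER_SMALL) {
+ *		if ((buf = realloc(buf, data.size)) == NULL)
+ *			return (ENOMEM);
+ *		data.data = buf;
+ *		data.ulen = data.size;
+ *		ret = dbp->get(dbp, NULL, &key, &data, 0);
+ *	}
+ */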
diff --git a/src/db/db_setid.c b/src/db/db_setid.c
new file mode 100644
index 00000000..697c3ff7
--- /dev/null
+++ b/src/db/db_setid.c
@@ -0,0 +1,213 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2000, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_swap.h"
+#include "dbinc/db_am.h"
+#include "dbinc/mp.h"
+
+/*
+ * __env_fileid_reset_pp --
+ * ENV->fileid_reset pre/post processing.
+ *
+ * PUBLIC: int __env_fileid_reset_pp __P((DB_ENV *, const char *, u_int32_t));
+ */
+int
+__env_fileid_reset_pp(dbenv, name, flags)
+ DB_ENV *dbenv;
+ const char *name;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_BEFORE_OPEN(env, "DB_ENV->fileid_reset");
+
+ /*
+ * !!!
+ * The actual argument checking is simple, do it inline, outside of
+ * the replication block.
+ */
+ if (flags != 0 && flags != DB_ENCRYPT)
+ return (__db_ferr(env, "DB_ENV->fileid_reset", 0));
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env,
+ (__env_fileid_reset(env, ip, name, LF_ISSET(DB_ENCRYPT) ? 1 : 0)),
+ 1, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
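+/*
+ * Typical use, as a sketch: an application that has physically copied a
+ * database file resets the copy's file ID before opening it in the same
+ * environment, so the copy is not confused with the original in the cache:
+ *
+ *	if ((ret = dbenv->fileid_reset(dbenv, "copy.db", 0)) != 0)
+ *		goto err;
+ *
+ * ("copy.db" is a hypothetical file name.)
+ */
+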
+/*
+ * __env_fileid_reset --
+ * Reset the file IDs for every database in the file.
+ * PUBLIC: int __env_fileid_reset
+ * PUBLIC: __P((ENV *, DB_THREAD_INFO *, const char *, int));
+ */
+int
+__env_fileid_reset(env, ip, name, encrypted)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ const char *name;
+ int encrypted;
+{
+ DB *dbp;
+ DBC *dbcp;
+ DBMETA *meta;
+ DBT key, data;
+ DB_FH *fhp;
+ DB_MPOOLFILE *mpf;
+ DB_PGINFO cookie;
+ db_pgno_t pgno;
+ int subdb, t_ret, ret;
+ size_t n;
+ char *real_name;
+ u_int8_t fileid[DB_FILE_ID_LEN], mbuf[DBMETASIZE];
+ void *pagep;
+
+ dbp = NULL;
+ dbcp = NULL;
+ fhp = NULL;
+ real_name = NULL;
+
+ /* Get the real backing file name. */
+ if ((ret = __db_appname(env,
+ DB_APP_DATA, name, NULL, &real_name)) != 0)
+ return (ret);
+
+ /* Get a new file ID. */
+ if ((ret = __os_fileid(env, real_name, 1, fileid)) != 0)
+ goto err;
+
+ /*
+ * The user may have physically copied a file currently open in the
+ * cache, which means if we open this file through the cache before
+ * updating the file ID on page 0, we might connect to the file from
+ * which the copy was made.
+ */
+ if ((ret = __os_open(env, real_name, 0, 0, 0, &fhp)) != 0) {
+ __db_err(env, ret, "%s", real_name);
+ goto err;
+ }
+ if ((ret = __os_read(env, fhp, mbuf, sizeof(mbuf), &n)) != 0)
+ goto err;
+
+ if (n != sizeof(mbuf)) {
+ ret = EINVAL;
+ __db_errx(env, DB_STR_A("0675",
+ "__env_fileid_reset: %s: unexpected file type or format",
+ "%s"), real_name);
+ goto err;
+ }
+
+ /*
+ * Create the DB object.
+ */
+ if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+ goto err;
+
+ /* If configured with a password, the databases are encrypted. */
+ if (encrypted && (ret = __db_set_flags(dbp, DB_ENCRYPT)) != 0)
+ goto err;
+
+ if ((ret = __db_meta_setup(env,
+ dbp, real_name, (DBMETA *)mbuf, 0, DB_CHK_META)) != 0)
+ goto err;
+
+ meta = (DBMETA *)mbuf;
+ if (FLD_ISSET(meta->metaflags,
+ DBMETA_PART_RANGE | DBMETA_PART_CALLBACK) && (ret =
+ __part_fileid_reset(env, ip, name, meta->nparts, encrypted)) != 0)
+ goto err;
+
+ subdb = meta->type == P_BTREEMETA && F_ISSET(meta, BTM_SUBDB);
+
+ memcpy(meta->uid, fileid, DB_FILE_ID_LEN);
+ cookie.db_pagesize = sizeof(mbuf);
+ cookie.flags = dbp->flags;
+ cookie.type = dbp->type;
+ key.data = &cookie;
+
+ if ((ret = __db_pgout(env->dbenv, 0, mbuf, &key)) != 0)
+ goto err;
+ if ((ret = __os_seek(env, fhp, 0, 0, 0)) != 0)
+ goto err;
+ if ((ret = __os_write(env, fhp, mbuf, sizeof(mbuf), &n)) != 0)
+ goto err;
+ if ((ret = __os_fsync(env, fhp)) != 0)
+ goto err;
+
+ /*
+ * Page 0 of the file has an updated file ID, and we can open it in
+ * the cache without connecting to a different, existing file. Open
+ * the file in the cache, and update the file IDs for subdatabases.
+ */
+
+ /*
+ * If the database file doesn't support subdatabases, we only have
+ * to update a single metadata page. Otherwise, we have to open a
+ * cursor and step through the master database, and update all of
+ * the subdatabases' metadata pages.
+ */
+ if (!subdb)
+ goto err;
+
+ /*
+ * Open the DB file.
+ *
+ * !!!
+ * Note DB_RDWRMASTER flag, we need to open the master database file
+ * for writing in this case.
+ */
+ if ((ret = __db_open(dbp, ip, NULL,
+ name, NULL, DB_UNKNOWN, DB_RDWRMASTER, 0, PGNO_BASE_MD)) != 0)
+ goto err;
+
+ mpf = dbp->mpf;
+ memset(&key, 0, sizeof(key));
+ memset(&data, 0, sizeof(data));
+ if ((ret = __db_cursor(dbp, ip, NULL, &dbcp, 0)) != 0)
+ goto err;
+ while ((ret = __dbc_get(dbcp, &key, &data, DB_NEXT)) == 0) {
+ /*
+ * XXX
+ * We're handling actual data, not on-page meta-data, so it
+ * hasn't been converted to/from opposite endian architectures.
+ * Do it explicitly, now.
+ */
+ memcpy(&pgno, data.data, sizeof(db_pgno_t));
+ DB_NTOHL_SWAP(env, &pgno);
+ if ((ret = __memp_fget(mpf, &pgno, ip, NULL,
+ DB_MPOOL_DIRTY, &pagep)) != 0)
+ goto err;
+ memcpy(((DBMETA *)pagep)->uid, fileid, DB_FILE_ID_LEN);
+ if ((ret = __memp_fput(mpf, ip, pagep, dbcp->priority)) != 0)
+ goto err;
+ }
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+
+err: if (dbcp != NULL && (t_ret = __dbc_close(dbcp)) != 0 && ret == 0)
+ ret = t_ret;
+ if (dbp != NULL && (t_ret = __db_close(dbp, NULL, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ if (fhp != NULL &&
+ (t_ret = __os_closehandle(env, fhp)) != 0 && ret == 0)
+ ret = t_ret;
+ if (real_name != NULL)
+ __os_free(env, real_name);
+
+ return (ret);
+}
diff --git a/src/db/db_setlsn.c b/src/db/db_setlsn.c
new file mode 100644
index 00000000..1a3280ed
--- /dev/null
+++ b/src/db/db_setlsn.c
@@ -0,0 +1,137 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2000, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/mp.h"
+#include "dbinc/partition.h"
+#include "dbinc/qam.h"
+
+static int __env_lsn_reset __P((ENV *, DB_THREAD_INFO *, const char *, int));
+
+/*
+ * __env_lsn_reset_pp --
+ * ENV->lsn_reset pre/post processing.
+ *
+ * PUBLIC: int __env_lsn_reset_pp __P((DB_ENV *, const char *, u_int32_t));
+ */
+int
+__env_lsn_reset_pp(dbenv, name, flags)
+ DB_ENV *dbenv;
+ const char *name;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_BEFORE_OPEN(env, "DB_ENV->lsn_reset");
+
+ /*
+ * !!!
+ * The actual argument checking is simple, do it inline, outside of
+ * the replication block.
+ */
+ if (flags != 0 && flags != DB_ENCRYPT)
+ return (__db_ferr(env, "DB_ENV->lsn_reset", 0));
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env,
+ (__env_lsn_reset(env, ip, name, LF_ISSET(DB_ENCRYPT) ? 1 : 0)),
+ 1, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
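+/*
+ * Typical use, as a sketch: DB_ENV->lsn_reset is run on a database file
+ * being moved into a different environment, so its pages' LSNs no longer
+ * refer to the old environment's log:
+ *
+ *	if ((ret = dbenv->lsn_reset(dbenv, "moved.db", 0)) != 0)
+ *		goto err;
+ *
+ * ("moved.db" is a hypothetical file name.)
+ */
+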
+/*
+ * __env_lsn_reset --
+ * Reset the LSNs for every page in the file.
+ */
+static int
+__env_lsn_reset(env, ip, name, encrypted)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ const char *name;
+ int encrypted;
+{
+ DB *dbp;
+ int t_ret, ret;
+
+ /* Create the DB object. */
+ if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+ return (ret);
+
+ /* If configured with a password, the databases are encrypted. */
+ if (encrypted && (ret = __db_set_flags(dbp, DB_ENCRYPT)) != 0)
+ goto err;
+
+ /*
+ * Open the DB file.
+ *
+ * !!!
+ * Note DB_RDWRMASTER flag, we need to open the master database file
+ * for writing in this case.
+ */
+ if ((ret = __db_open(dbp, ip, NULL,
+ name, NULL, DB_UNKNOWN, DB_RDWRMASTER, 0, PGNO_BASE_MD)) != 0) {
+ __db_err(env, ret, "%s", name);
+ goto err;
+ }
+
+ ret = __db_lsn_reset(dbp->mpf, ip);
+#ifdef HAVE_PARTITION
+ if (ret == 0 && DB_IS_PARTITIONED(dbp))
+ ret = __part_lsn_reset(dbp, ip);
+ else
+#endif
+ if (ret == 0 && dbp->type == DB_QUEUE)
+#ifdef HAVE_QUEUE
+ ret = __qam_lsn_reset(dbp, ip);
+#else
+ ret = __db_no_queue_am(env);
+#endif
+
+err: if ((t_ret = __db_close(dbp, NULL, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __db_lsn_reset -- reset the lsn for a db mpool handle.
+ * PUBLIC: int __db_lsn_reset __P((DB_MPOOLFILE *, DB_THREAD_INFO *));
+ */
+int
+__db_lsn_reset(mpf, ip)
+ DB_MPOOLFILE *mpf;
+ DB_THREAD_INFO *ip;
+{
+ PAGE *pagep;
+ db_pgno_t pgno;
+ int ret;
+
+ /* Reset the LSN on every page of the database file. */
+ for (pgno = 0;
+ (ret = __memp_fget(mpf,
+ &pgno, ip, NULL, DB_MPOOL_DIRTY, &pagep)) == 0;
+ ++pgno) {
+ LSN_NOT_LOGGED(pagep->lsn);
+ if ((ret = __memp_fput(mpf,
+ ip, pagep, DB_PRIORITY_UNCHANGED)) != 0)
+ break;
+ }
+
+ if (ret == DB_PAGE_NOTFOUND)
+ ret = 0;
+
+ return (ret);
+}
diff --git a/src/db/db_sort_multiple.c b/src/db/db_sort_multiple.c
new file mode 100644
index 00000000..c5e2e941
--- /dev/null
+++ b/src/db/db_sort_multiple.c
@@ -0,0 +1,327 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+
+static int __db_quicksort __P((DB *, DBT *, DBT *, u_int32_t *, u_int32_t *,
+ u_int32_t *, u_int32_t *, u_int32_t));
+
+/*
+ * __db_compare_both --
+ *	Use the comparison functions from db to compare akey and bkey and,
+ *	if DB_DUPSORT is set, adata and bdata.
+ *
+ * PUBLIC: int __db_compare_both __P((DB *, const DBT *, const DBT *,
+ * PUBLIC: const DBT *, const DBT *));
+ */
+int
+__db_compare_both(db, akey, adata, bkey, bdata)
+ DB *db;
+ const DBT *akey;
+ const DBT *adata;
+ const DBT *bkey;
+ const DBT *bdata;
+{
+ BTREE *t;
+ int cmp;
+
+ t = (BTREE *)db->bt_internal;
+
+	cmp = t->bt_compare(db, akey, bkey);
+	if (cmp != 0)
+		return (cmp);
+	if (!F_ISSET(db, DB_AM_DUPSORT))
+		return (0);
+
+	if (adata == NULL)
+		return (bdata == NULL ? 0 : -1);
+	if (bdata == NULL)
+		return (1);
+
+#ifdef HAVE_COMPRESSION
+	if (DB_IS_COMPRESSED(db))
+		return (t->compress_dup_compare(db, adata, bdata));
+#endif
+	return (db->dup_compare(db, adata, bdata));
+}
+
+#define DB_SORT_SWAP(a, ad, b, bd) \
+do { \
+ tmp = (a)[0]; (a)[0] = (b)[0]; (b)[0] = tmp; \
+ tmp = (a)[-1]; (a)[-1] = (b)[-1]; (b)[-1] = tmp; \
+ if (data != NULL) { \
+ tmp = (ad)[0]; (ad)[0] = (bd)[0]; (bd)[0] = tmp; \
+ tmp = (ad)[-1]; (ad)[-1] = (bd)[-1]; (bd)[-1] = tmp; \
+ } \
+} while (0)
+
+#define DB_SORT_LOAD_DBT(a, ad, aptr, adptr) \
+do { \
+ (a).data = (u_int8_t*)key->data + (aptr)[0]; \
+ (a).size = (aptr)[-1]; \
+ if (data != NULL) { \
+ (ad).data = (u_int8_t*)data->data + (adptr)[0]; \
+ (ad).size = (adptr)[-1]; \
+ } \
+} while (0)
+
+#define DB_SORT_COMPARE(a, ad, b, bd) (data != NULL ? \
+ __db_compare_both(db, &(a), &(ad), &(b), &(bd)) : \
+ __db_compare_both(db, &(a), 0, &(b), 0))
+
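+/*
+ * The macros above assume the DB_MULTIPLE bulk-buffer layout: an array of
+ * u_int32_t index entries grows down from the end of the buffer, each item
+ * described by an (offset, length) pair -- ptr[0] is the item's byte offset
+ * in the buffer and ptr[-1] its length -- terminated by a (u_int32_t)-1
+ * offset.  DB_MULTIPLE_KEY interleaves key and data entries, which is why
+ * __db_sort_multiple() below walks with a stride of 4 in that case.
+ */
+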
+#define DB_SORT_STACKSIZE 32
+
+/*
+ * __db_quicksort --
+ * The quicksort implementation for __db_sort_multiple() and
+ * __db_sort_multiple_key().
+ */
+static int
+__db_quicksort(db, key, data, kstart, kend, dstart, dend, size)
+ DB *db;
+ DBT *key, *data;
+ u_int32_t *kstart, *kend, *dstart, *dend;
+ u_int32_t size;
+{
+ int ret, cmp;
+ u_int32_t tmp, len;
+ u_int32_t *kptr, *dptr, *kl, *dl, *kr, *dr;
+ DBT a, ad, b, bd, m, md;
+ ENV *env;
+
+ struct DB_SORT_quicksort_stack {
+ u_int32_t *kstart;
+ u_int32_t *kend;
+ u_int32_t *dstart;
+ u_int32_t *dend;
+ } stackbuf[DB_SORT_STACKSIZE], *stack;
+ u_int32_t soff, slen;
+
+ ret = 0;
+ env = db->env;
+
+ memset(&a, 0, sizeof(DBT));
+ memset(&ad, 0, sizeof(DBT));
+ memset(&b, 0, sizeof(DBT));
+ memset(&bd, 0, sizeof(DBT));
+ memset(&m, 0, sizeof(DBT));
+ memset(&md, 0, sizeof(DBT));
+
+	/*
+	 * NB: the index arrays grow downward, so "end" is a lower address
+	 * than "start".
+	 */
+
+ stack = stackbuf;
+ soff = 0;
+ slen = DB_SORT_STACKSIZE;
+
+ start:
+ if (kend >= kstart) goto pop;
+
+ /* If there's only one value, it's already sorted */
+ len = (u_int32_t)(kstart - kend) / size;
+ if (len == 1) goto pop;
+
+ DB_SORT_LOAD_DBT(a, ad, kstart, dstart);
+ DB_SORT_LOAD_DBT(b, bd, kend + size, dend + size);
+
+ if (len == 2) {
+		/* Special-case the sorting of two-value sequences. */
+ if (DB_SORT_COMPARE(a, ad, b, bd) > 0) {
+ DB_SORT_SWAP(kstart, dstart, kend + size,
+ dend + size);
+ }
+ goto pop;
+ }
+
+ kptr = kstart - (len / 2) * size;
+ dptr = dstart - (len / 2) * size;
+ DB_SORT_LOAD_DBT(m, md, kptr, dptr);
+
+ /* Find the median of three */
+ if (DB_SORT_COMPARE(a, ad, b, bd) < 0) {
+ if (DB_SORT_COMPARE(m, md, a, ad) < 0) {
+ /* m < a < b */
+ if (len == 3) {
+ DB_SORT_SWAP(kstart, dstart, kptr, dptr);
+ goto pop;
+ }
+ DB_SORT_SWAP(kstart, dstart, kend + size, dend + size);
+ } else if (DB_SORT_COMPARE(m, md, b, bd) < 0) {
+ /* a <= m < b */
+ if (len == 3) {
+ goto pop;
+ }
+ DB_SORT_SWAP(kptr, dptr, kend + size, dend + size);
+ } else {
+ /* a < b <= m */
+ if (len == 3) {
+ DB_SORT_SWAP(kptr, dptr, kend + size,
+ dend + size);
+ goto pop;
+ }
+ /* Do nothing */
+ }
+ } else {
+ if (DB_SORT_COMPARE(a, ad, m, md) < 0) {
+ /* b <= a < m */
+ DB_SORT_SWAP(kstart, dstart, kend + size,
+ dend + size);
+ if (len == 3) {
+ DB_SORT_SWAP(kptr, dptr, kend + size,
+ dend + size);
+ goto pop;
+ }
+ } else if (DB_SORT_COMPARE(b, bd, m, md) < 0) {
+ /* b < m <= a */
+ if (len == 3) {
+ DB_SORT_SWAP(kstart, dstart, kend + size,
+ dend + size);
+ goto pop;
+ }
+ DB_SORT_SWAP(kptr, dptr, kend + size, dend + size);
+ } else {
+ /* m <= b <= a */
+ if (len == 3) {
+ DB_SORT_SWAP(kstart, dstart, kptr, dptr);
+ DB_SORT_SWAP(kptr, dptr, kend + size,
+ dend + size);
+ goto pop;
+ }
+ /* Do nothing */
+ }
+ }
+
+ /* partition */
+ DB_SORT_LOAD_DBT(b, bd, kend + size, dend + size);
+ kl = kstart;
+ dl = dstart;
+ kr = kend + size;
+ dr = dend + size;
+ kptr = kstart;
+ dptr = dstart;
+ while (kptr >= kr) {
+ DB_SORT_LOAD_DBT(a, ad, kptr, dptr);
+ cmp = DB_SORT_COMPARE(a, ad, b, bd);
+ if (cmp < 0) {
+ DB_SORT_SWAP(kl, dl, kptr, dptr);
+ kl -= size;
+ dl -= size;
+ kptr -= size;
+ dptr -= size;
+ } else if (cmp > 0) {
+ DB_SORT_SWAP(kr, dr, kptr, dptr);
+ kr += size;
+ dr += size;
+ } else {
+ kptr -= size;
+ dptr -= size;
+ }
+ }
+
+ if (soff == slen) {
+ /* Grow the stack */
+ slen = slen * 2;
+ if (stack == stackbuf) {
+ ret = __os_malloc(env, slen *
+ sizeof(struct DB_SORT_quicksort_stack), &stack);
+ if (ret != 0) goto error;
+ memcpy(stack, stackbuf, soff *
+ sizeof(struct DB_SORT_quicksort_stack));
+ } else {
+ ret = __os_realloc(env, slen *
+ sizeof(struct DB_SORT_quicksort_stack), &stack);
+ if (ret != 0) goto error;
+ }
+ }
+
+ /* divide and conquer */
+ stack[soff].kstart = kr - size;
+ stack[soff].kend = kend;
+ stack[soff].dstart = dr - size;
+ stack[soff].dend = dend;
+ ++soff;
+
+ kend = kl;
+ dend = dl;
+
+ goto start;
+
+ pop:
+ if (soff != 0) {
+ --soff;
+ kstart = stack[soff].kstart;
+ kend = stack[soff].kend;
+ dstart = stack[soff].dstart;
+ dend = stack[soff].dend;
+ goto start;
+ }
+
+ error:
+ if (stack != stackbuf)
+ __os_free(env, stack);
+
+ return (ret);
+}
+
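+/*
+ * Design note: __db_quicksort eliminates recursion with an explicit stack.
+ * After partitioning, one side is pushed and the other is iterated on
+ * directly (the "goto start"); the stack begins as a fixed-size local
+ * array of DB_SORT_STACKSIZE frames and is moved to heap memory only if
+ * the input is large enough to overflow it.
+ */
+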
+#undef DB_SORT_SWAP
+#undef DB_SORT_LOAD_DBT
+
+/*
+ * __db_sort_multiple --
+ * If flags == DB_MULTIPLE_KEY, sorts a DB_MULTIPLE_KEY format DBT using
+ * the BTree comparison function and duplicate comparison function.
+ *
+ * If flags == DB_MULTIPLE, sorts one or two DB_MULTIPLE format DBTs using
+ * the BTree comparison function and duplicate comparison function. Will
+ * assume key and data specifies pairs of key/data to sort together. If
+ * data is NULL, will just sort key according to the btree comparison
+ * function.
+ *
+ * Uses an in-place quicksort algorithm, with median of three for the pivot
+ * point.
+ *
+ * PUBLIC: int __db_sort_multiple __P((DB *, DBT *, DBT *, u_int32_t));
+ */
+int
+__db_sort_multiple(db, key, data, flags)
+ DB *db;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ u_int32_t *kstart, *kend, *dstart, *dend;
+
+ /* TODO: sanity checks on the DBTs */
+ /* DB_ILLEGAL_METHOD(db, DB_OK_BTREE); */
+
+ kstart = (u_int32_t*)((u_int8_t *)key->data + key->ulen) - 1;
+
+ switch (flags) {
+ case DB_MULTIPLE:
+ if (data != NULL)
+ dstart = (u_int32_t*)((u_int8_t *)data->data +
+ data->ulen) - 1;
+ else
+ dstart = kstart;
+
+ /* Find the end */
+ for (kend = kstart, dend = dstart;
+ *kend != (u_int32_t)-1 && *dend != (u_int32_t)-1;
+ kend -= 2, dend -= 2)
+ ;
+
+ return (__db_quicksort(db, key, data, kstart, kend, dstart,
+ dend, 2));
+ case DB_MULTIPLE_KEY:
+ /* Find the end */
+ for (kend = kstart; *kend != (u_int32_t)-1; kend -= 4)
+ ;
+
+ return (__db_quicksort(db, key, key, kstart, kend, kstart - 2,
+ kend - 2, 4));
+ default:
+ return (__db_ferr(db->env, "DB->sort_multiple", 0));
+ }
+}
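+
+/*
+ * Illustrative use, as a hypothetical sketch: an application might fill a
+ * bulk buffer with the DB_MULTIPLE_KEY write macros and sort it before a
+ * bulk put, e.g.
+ *
+ *	DBT key;
+ *	void *p;
+ *
+ *	memset(&key, 0, sizeof(key));
+ *	key.data = buf;
+ *	key.ulen = sizeof(buf);
+ *	key.flags = DB_DBT_USERMEM;
+ *	DB_MULTIPLE_WRITE_INIT(p, &key);
+ *	DB_MULTIPLE_KEY_WRITE_NEXT(p, &key, k, klen, d, dlen);
+ *	...
+ *	ret = __db_sort_multiple(dbp, &key, NULL, DB_MULTIPLE_KEY);
+ *
+ * (buf, k, klen, d and dlen are placeholders.)
+ */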
diff --git a/src/db/db_stati.c b/src/db/db_stati.c
new file mode 100644
index 00000000..61744e81
--- /dev/null
+++ b/src/db/db_stati.c
@@ -0,0 +1,502 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/heap.h"
+#include "dbinc/qam.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/partition.h"
+
+#ifdef HAVE_STATISTICS
+static int __db_print_all __P((DB *, u_int32_t));
+static int __db_print_citem __P((DBC *));
+static int __db_print_cursor __P((DB *));
+static int __db_print_stats __P((DB *, DB_THREAD_INFO *, u_int32_t));
+static int __db_stat __P((DB *, DB_THREAD_INFO *, DB_TXN *, void *, u_int32_t));
+static int __db_stat_arg __P((DB *, u_int32_t));
+
+/*
+ * __db_stat_pp --
+ * DB->stat pre/post processing.
+ *
+ * PUBLIC: int __db_stat_pp __P((DB *, DB_TXN *, void *, u_int32_t));
+ */
+int
+__db_stat_pp(dbp, txn, spp, flags)
+ DB *dbp;
+ DB_TXN *txn;
+ void *spp;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ret, t_ret;
+
+ env = dbp->env;
+
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->stat");
+
+ if ((ret = __db_stat_arg(dbp, flags)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check && (ret = __db_rep_enter(dbp, 1, 0,
+ IS_REAL_TXN(txn))) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ ret = __db_stat(dbp, ip, txn, spp, flags);
+
+ /* Release replication block. */
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+err: ENV_LEAVE(env, ip);
+ return (ret);
+}
+
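+/*
+ * Illustrative call of the public API this backs, as a sketch:
+ *
+ *	DB_BTREE_STAT *sp;
+ *
+ *	if ((ret = dbp->stat(dbp, NULL, &sp, DB_FAST_STAT)) == 0) {
+ *		printf("%lu keys\n", (u_long)sp->bt_nkeys);
+ *		free(sp);
+ *	}
+ *
+ * The statistics structure is allocated on the caller's behalf and must
+ * be freed by the caller.
+ */
+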
+/*
+ * __db_stat --
+ * DB->stat.
+ *
+ */
+static int
+__db_stat(dbp, ip, txn, spp, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ void *spp;
+ u_int32_t flags;
+{
+ DBC *dbc;
+ ENV *env;
+ int ret, t_ret;
+
+ env = dbp->env;
+
+ /* Acquire a cursor. */
+ if ((ret = __db_cursor(dbp, ip, txn,
+ &dbc, LF_ISSET(DB_READ_COMMITTED | DB_READ_UNCOMMITTED))) != 0)
+ return (ret);
+
+ DEBUG_LWRITE(dbc, NULL, "DB->stat", NULL, NULL, flags);
+ LF_CLR(DB_READ_COMMITTED | DB_READ_UNCOMMITTED);
+#ifdef HAVE_PARTITION
+ if (DB_IS_PARTITIONED(dbp))
+ ret = __partition_stat(dbc, spp, flags);
+ else
+#endif
+ switch (dbp->type) {
+ case DB_BTREE:
+ case DB_RECNO:
+ ret = __bam_stat(dbc, spp, flags);
+ break;
+ case DB_HASH:
+ ret = __ham_stat(dbc, spp, flags);
+ break;
+ case DB_HEAP:
+ ret = __heap_stat(dbc, spp, flags);
+ break;
+ case DB_QUEUE:
+ ret = __qam_stat(dbc, spp, flags);
+ break;
+ case DB_UNKNOWN:
+ default:
+ ret = (__db_unknown_type(env, "DB->stat", dbp->type));
+ break;
+ }
+
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __db_stat_arg --
+ * Check DB->stat arguments.
+ */
+static int
+__db_stat_arg(dbp, flags)
+ DB *dbp;
+ u_int32_t flags;
+{
+ ENV *env;
+
+ env = dbp->env;
+
+ /* Check for invalid function flags. */
+ LF_CLR(DB_READ_COMMITTED | DB_READ_UNCOMMITTED);
+ switch (flags) {
+ case 0:
+ case DB_FAST_STAT:
+ break;
+ default:
+ return (__db_ferr(env, "DB->stat", 0));
+ }
+
+ return (0);
+}
+
+/*
+ * __db_stat_print_pp --
+ * DB->stat_print pre/post processing.
+ *
+ * PUBLIC: int __db_stat_print_pp __P((DB *, u_int32_t));
+ */
+int
+__db_stat_print_pp(dbp, flags)
+ DB *dbp;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ret, t_ret;
+
+ env = dbp->env;
+
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->stat_print");
+
+ /*
+ * !!!
+ * The actual argument checking is simple, do it inline.
+ */
+ if ((ret = __db_fchk(env,
+ "DB->stat_print", flags, DB_FAST_STAT | DB_STAT_ALL)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check && (ret = __db_rep_enter(dbp, 1, 0, 0)) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ ret = __db_stat_print(dbp, ip, flags);
+
+ /* Release replication block. */
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+err: ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __db_stat_print --
+ * DB->stat_print.
+ *
+ * PUBLIC: int __db_stat_print __P((DB *, DB_THREAD_INFO *, u_int32_t));
+ */
+int
+__db_stat_print(dbp, ip, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ u_int32_t flags;
+{
+ time_t now;
+ int ret;
+ char time_buf[CTIME_BUFLEN];
+
+ (void)time(&now);
+ __db_msg(dbp->env, "%.24s\tLocal time", __os_ctime(&now, time_buf));
+
+ if (LF_ISSET(DB_STAT_ALL) && (ret = __db_print_all(dbp, flags)) != 0)
+ return (ret);
+
+ if ((ret = __db_print_stats(dbp, ip, flags)) != 0)
+ return (ret);
+
+ return (0);
+}
+
+/*
+ * __db_print_stats --
+ * Display default DB handle statistics.
+ */
+static int
+__db_print_stats(dbp, ip, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ u_int32_t flags;
+{
+ DBC *dbc;
+ ENV *env;
+ int ret, t_ret;
+
+ env = dbp->env;
+
+ /* Acquire a cursor. */
+ if ((ret = __db_cursor(dbp, ip, NULL, &dbc, 0)) != 0)
+ return (ret);
+
+ DEBUG_LWRITE(dbc, NULL, "DB->stat_print", NULL, NULL, 0);
+
+ switch (dbp->type) {
+ case DB_BTREE:
+ case DB_RECNO:
+ ret = __bam_stat_print(dbc, flags);
+ break;
+ case DB_HASH:
+ ret = __ham_stat_print(dbc, flags);
+ break;
+ case DB_HEAP:
+ ret = __heap_stat_print(dbc, flags);
+ break;
+ case DB_QUEUE:
+ ret = __qam_stat_print(dbc, flags);
+ break;
+ case DB_UNKNOWN:
+ default:
+ ret = (__db_unknown_type(env, "DB->stat_print", dbp->type));
+ break;
+ }
+
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __db_print_all --
+ * Display debugging DB handle statistics.
+ */
+static int
+__db_print_all(dbp, flags)
+ DB *dbp;
+ u_int32_t flags;
+{
+ static const FN fn[] = {
+ { DB_AM_CHKSUM, "DB_AM_CHKSUM" },
+ { DB_AM_COMPENSATE, "DB_AM_COMPENSATE" },
+ { DB_AM_CREATED, "DB_AM_CREATED" },
+ { DB_AM_CREATED_MSTR, "DB_AM_CREATED_MSTR" },
+ { DB_AM_DBM_ERROR, "DB_AM_DBM_ERROR" },
+ { DB_AM_DELIMITER, "DB_AM_DELIMITER" },
+ { DB_AM_DISCARD, "DB_AM_DISCARD" },
+ { DB_AM_DUP, "DB_AM_DUP" },
+ { DB_AM_DUPSORT, "DB_AM_DUPSORT" },
+ { DB_AM_ENCRYPT, "DB_AM_ENCRYPT" },
+ { DB_AM_FIXEDLEN, "DB_AM_FIXEDLEN" },
+ { DB_AM_INMEM, "DB_AM_INMEM" },
+ { DB_AM_IN_RENAME, "DB_AM_IN_RENAME" },
+ { DB_AM_NOT_DURABLE, "DB_AM_NOT_DURABLE" },
+ { DB_AM_OPEN_CALLED, "DB_AM_OPEN_CALLED" },
+ { DB_AM_PAD, "DB_AM_PAD" },
+ { DB_AM_PGDEF, "DB_AM_PGDEF" },
+ { DB_AM_RDONLY, "DB_AM_RDONLY" },
+ { DB_AM_READ_UNCOMMITTED, "DB_AM_READ_UNCOMMITTED" },
+ { DB_AM_RECNUM, "DB_AM_RECNUM" },
+ { DB_AM_RECOVER, "DB_AM_RECOVER" },
+ { DB_AM_RENUMBER, "DB_AM_RENUMBER" },
+ { DB_AM_REVSPLITOFF, "DB_AM_REVSPLITOFF" },
+ { DB_AM_SECONDARY, "DB_AM_SECONDARY" },
+ { DB_AM_SNAPSHOT, "DB_AM_SNAPSHOT" },
+ { DB_AM_SUBDB, "DB_AM_SUBDB" },
+ { DB_AM_SWAP, "DB_AM_SWAP" },
+ { DB_AM_TXN, "DB_AM_TXN" },
+ { DB_AM_VERIFYING, "DB_AM_VERIFYING" },
+ { 0, NULL }
+ };
+ ENV *env;
+ char time_buf[CTIME_BUFLEN];
+
+ env = dbp->env;
+
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "DB handle information:");
+ STAT_ULONG("Page size", dbp->pgsize);
+ STAT_ISSET("Append recno", dbp->db_append_recno);
+ STAT_ISSET("Feedback", dbp->db_feedback);
+ STAT_ISSET("Dup compare", dbp->dup_compare);
+ STAT_ISSET("App private", dbp->app_private);
+ STAT_ISSET("DbEnv", dbp->env);
+ STAT_STRING("Type", __db_dbtype_to_string(dbp->type));
+
+ __mutex_print_debug_single(env, "Thread mutex", dbp->mutex, flags);
+
+ STAT_STRING("File", dbp->fname);
+ STAT_STRING("Database", dbp->dname);
+ STAT_HEX("Open flags", dbp->open_flags);
+
+ __db_print_fileid(env, dbp->fileid, "\tFile ID");
+
+ STAT_ULONG("Cursor adjust ID", dbp->adj_fileid);
+ STAT_ULONG("Meta pgno", dbp->meta_pgno);
+ if (dbp->locker != NULL)
+ STAT_ULONG("Locker ID", dbp->locker->id);
+ if (dbp->cur_locker != NULL)
+ STAT_ULONG("Handle lock", dbp->cur_locker->id);
+ if (dbp->associate_locker != NULL)
+ STAT_ULONG("Associate lock", dbp->associate_locker->id);
+
+ __db_msg(env,
+ "%.24s\tReplication handle timestamp",
+ dbp->timestamp == 0 ? "0" : __os_ctime(&dbp->timestamp, time_buf));
+
+ STAT_ISSET("Secondary callback", dbp->s_callback);
+ STAT_ISSET("Primary handle", dbp->s_primary);
+
+ STAT_ISSET("api internal", dbp->api_internal);
+ STAT_ISSET("Btree/Recno internal", dbp->bt_internal);
+ STAT_ISSET("Hash internal", dbp->h_internal);
+ STAT_ISSET("Queue internal", dbp->q_internal);
+
+ __db_prflags(env, NULL, dbp->flags, fn, NULL, "\tFlags");
+
+ if (dbp->log_filename == NULL)
+ STAT_ISSET("File naming information", dbp->log_filename);
+ else
+ __dbreg_print_fname(env, dbp->log_filename);
+
+ (void)__db_print_cursor(dbp);
+
+ return (0);
+}
+
+/*
+ * __db_print_cursor --
+ * Display the cursor active and free queues.
+ */
+static int
+__db_print_cursor(dbp)
+ DB *dbp;
+{
+ DBC *dbc;
+ ENV *env;
+ int ret, t_ret;
+
+ env = dbp->env;
+
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "DB handle cursors:");
+
+ ret = 0;
+ MUTEX_LOCK(dbp->env, dbp->mutex);
+ __db_msg(env, "Active queue:");
+ TAILQ_FOREACH(dbc, &dbp->active_queue, links)
+ if ((t_ret = __db_print_citem(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ __db_msg(env, "Join queue:");
+ TAILQ_FOREACH(dbc, &dbp->join_queue, links)
+ if ((t_ret = __db_print_citem(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ __db_msg(env, "Free queue:");
+ TAILQ_FOREACH(dbc, &dbp->free_queue, links)
+ if ((t_ret = __db_print_citem(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ MUTEX_UNLOCK(dbp->env, dbp->mutex);
+
+ return (ret);
+}
+
+static int
+__db_print_citem(dbc)
+ DBC *dbc;
+{
+ static const FN fn[] = {
+ { DBC_ACTIVE, "DBC_ACTIVE" },
+ { DBC_DONTLOCK, "DBC_DONTLOCK" },
+ { DBC_MULTIPLE, "DBC_MULTIPLE" },
+ { DBC_MULTIPLE_KEY, "DBC_MULTIPLE_KEY" },
+ { DBC_OPD, "DBC_OPD" },
+ { DBC_OWN_LID, "DBC_OWN_LID" },
+ { DBC_READ_COMMITTED, "DBC_READ_COMMITTED" },
+ { DBC_READ_UNCOMMITTED, "DBC_READ_UNCOMMITTED" },
+ { DBC_RECOVER, "DBC_RECOVER" },
+ { DBC_RMW, "DBC_RMW" },
+ { DBC_TRANSIENT, "DBC_TRANSIENT" },
+ { DBC_WAS_READ_COMMITTED,"DBC_WAS_READ_COMMITTED" },
+ { DBC_WRITECURSOR, "DBC_WRITECURSOR" },
+ { DBC_WRITER, "DBC_WRITER" },
+ { 0, NULL }
+ };
+ DB *dbp;
+ DBC_INTERNAL *cp;
+ ENV *env;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ cp = dbc->internal;
+
+ STAT_POINTER("DBC", dbc);
+ STAT_POINTER("Associated dbp", dbc->dbp);
+ STAT_POINTER("Associated txn", dbc->txn);
+ STAT_POINTER("Internal", cp);
+ STAT_HEX("Default locker ID", dbc->lref == NULL ? 0 : dbc->lref->id);
+ STAT_HEX("Locker", dbc->locker == NULL ? 0 : dbc->locker->id);
+ STAT_STRING("Type", __db_dbtype_to_string(dbc->dbtype));
+
+ STAT_POINTER("Off-page duplicate cursor", cp->opd);
+ STAT_POINTER("Referenced page", cp->page);
+ STAT_ULONG("Root", cp->root);
+ STAT_ULONG("Page number", cp->pgno);
+ STAT_ULONG("Page index", cp->indx);
+ STAT_STRING("Lock mode", __db_lockmode_to_string(cp->lock_mode));
+ __db_prflags(env, NULL, dbc->flags, fn, NULL, "\tFlags");
+
+ switch (dbc->dbtype) {
+ case DB_BTREE:
+ case DB_RECNO:
+ __bam_print_cursor(dbc);
+ break;
+ case DB_HASH:
+ __ham_print_cursor(dbc);
+ break;
+ case DB_HEAP:
+ __heap_print_cursor(dbc);
+ break;
+ case DB_UNKNOWN:
+ DB_ASSERT(env, dbp->type != DB_UNKNOWN);
+ /* FALLTHROUGH */
+ case DB_QUEUE:
+ default:
+ break;
+ }
+ return (0);
+}
+
+#else /* !HAVE_STATISTICS */
+
+int
+__db_stat_pp(dbp, txn, spp, flags)
+ DB *dbp;
+ DB_TXN *txn;
+ void *spp;
+ u_int32_t flags;
+{
+ COMPQUIET(spp, NULL);
+ COMPQUIET(txn, NULL);
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(dbp->env));
+}
+
+int
+__db_stat_print_pp(dbp, flags)
+ DB *dbp;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(dbp->env));
+}
+#endif
diff --git a/src/db/db_truncate.c b/src/db/db_truncate.c
new file mode 100644
index 00000000..0eeb0c64
--- /dev/null
+++ b/src/db/db_truncate.c
@@ -0,0 +1,233 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/heap.h"
+#include "dbinc/qam.h"
+#include "dbinc/lock.h"
+#include "dbinc/partition.h"
+#include "dbinc/txn.h"
+
+static int __db_cursor_check_func
+ __P((DBC *, DBC *, u_int32_t *, db_pgno_t, u_int32_t, void *));
+static int __db_cursor_check __P((DB *));
+
+/*
+ * __db_truncate_pp
+ * DB->truncate pre/post processing.
+ *
+ * PUBLIC: int __db_truncate_pp __P((DB *, DB_TXN *, u_int32_t *, u_int32_t));
+ */
+int
+__db_truncate_pp(dbp, txn, countp, flags)
+ DB *dbp;
+ DB_TXN *txn;
+ u_int32_t *countp, flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ret, t_ret, txn_local;
+
+ env = dbp->env;
+ handle_check = txn_local = 0;
+
+ STRIP_AUTO_COMMIT(flags);
+
+ /* Check for invalid flags. */
+ if (F_ISSET(dbp, DB_AM_SECONDARY)) {
+ __db_errx(env, DB_STR("0685",
+ "DB->truncate forbidden on secondary indices"));
+ return (EINVAL);
+ }
+ if ((ret = __db_fchk(env, "DB->truncate", flags, 0)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+ XA_CHECK_TXN(ip, txn);
+
+ /*
+ * Make sure there are no active cursors on this db. Since we drop
+ * pages we cannot really adjust cursors.
+ */
+ if ((ret = __db_cursor_check(dbp)) != 0) {
+ __db_errx(env, DB_STR("0686",
+ "DB->truncate not permitted with active cursors"));
+ goto err;
+ }
+
+#ifdef CONFIG_TEST
+ if (IS_REP_MASTER(env))
+ DB_TEST_WAIT(env, env->test_check);
+#endif
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check &&
+ (ret = __db_rep_enter(dbp, 1, 0, IS_REAL_TXN(txn))) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ /*
+ * Check for changes to a read-only database. This must be after the
+ * replication block so that we cannot race master/client state changes.
+ */
+ if (DB_IS_READONLY(dbp)) {
+ ret = __db_rdonly(env, "DB->truncate");
+ goto err;
+ }
+
+ /*
+ * Create local transaction as necessary, check for consistent
+ * transaction usage.
+ */
+ if (IS_DB_AUTO_COMMIT(dbp, txn)) {
+ if ((ret = __txn_begin(env, ip, NULL, &txn, 0)) != 0)
+ goto err;
+ txn_local = 1;
+ }
+
+ /* Check for consistent transaction usage. */
+ if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 0)) != 0)
+ goto err;
+
+ ret = __db_truncate(dbp, ip, txn, countp);
+
+err: if (txn_local &&
+ (t_ret = __db_txn_auto_resolve(env, txn, 0, ret)) && ret == 0)
+ ret = t_ret;
+
+ /* Release replication block. */
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
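+/*
+ * Illustrative call of the public API this backs, as a sketch:
+ *
+ *	u_int32_t count;
+ *
+ *	if ((ret = dbp->truncate(dbp, NULL, &count, 0)) == 0)
+ *		printf("discarded %lu records\n", (u_long)count);
+ */
+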
+/*
+ * __db_truncate
+ * DB->truncate.
+ *
+ * PUBLIC: int __db_truncate __P((DB *, DB_THREAD_INFO *, DB_TXN *,
+ * PUBLIC: u_int32_t *));
+ */
+int
+__db_truncate(dbp, ip, txn, countp)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ u_int32_t *countp;
+{
+ DB *sdbp;
+ DBC *dbc;
+ ENV *env;
+ u_int32_t scount;
+ int ret, t_ret;
+
+ env = dbp->env;
+ dbc = NULL;
+ ret = 0;
+
+ /*
+ * Run through all secondaries and truncate them first. The count
+ * returned is the count of the primary only. QUEUE uses normal
+ * processing to truncate so it will update the secondaries normally.
+ */
+ if (dbp->type != DB_QUEUE && DB_IS_PRIMARY(dbp)) {
+ if ((ret = __db_s_first(dbp, &sdbp)) != 0)
+ return (ret);
+ for (; sdbp != NULL && ret == 0; ret = __db_s_next(&sdbp, txn))
+ if ((ret = __db_truncate(sdbp, ip, txn, &scount)) != 0)
+ break;
+ if (sdbp != NULL)
+ (void)__db_s_done(sdbp, txn);
+ if (ret != 0)
+ return (ret);
+ }
+
+ DB_TEST_RECOVERY(dbp, DB_TEST_PREDESTROY, ret, NULL);
+
+ /* Acquire a cursor. */
+ if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0)
+ return (ret);
+
+ DEBUG_LWRITE(dbc, txn, "DB->truncate", NULL, NULL, 0);
+#ifdef HAVE_PARTITION
+ if (DB_IS_PARTITIONED(dbp))
+ ret = __part_truncate(dbc, countp);
+ else
+#endif
+ switch (dbp->type) {
+ case DB_BTREE:
+ case DB_RECNO:
+ ret = __bam_truncate(dbc, countp);
+ break;
+ case DB_HASH:
+ ret = __ham_truncate(dbc, countp);
+ break;
+ case DB_HEAP:
+ ret = __heap_truncate(dbc, countp);
+ break;
+ case DB_QUEUE:
+ ret = __qam_truncate(dbc, countp);
+ break;
+ case DB_UNKNOWN:
+ default:
+ ret = __db_unknown_type(env, "DB->truncate", dbp->type);
+ break;
+ }
+
+ /* Discard the cursor. */
+ if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ DB_TEST_RECOVERY(dbp, DB_TEST_POSTDESTROY, ret, NULL);
+
+DB_TEST_RECOVERY_LABEL
+
+ return (ret);
+}
+
+static int
+__db_cursor_check_func(dbc, my_dbc, foundp, pgno, indx, args)
+ DBC *dbc, *my_dbc;
+ u_int32_t *foundp;
+ db_pgno_t pgno;
+ u_int32_t indx;
+ void *args;
+{
+ COMPQUIET(my_dbc, NULL);
+ COMPQUIET(args, NULL);
+ COMPQUIET(pgno, 0);
+ COMPQUIET(indx, 0);
+ if (IS_INITIALIZED(dbc)) {
+ *foundp = 1;
+ return (EEXIST);
+ }
+ return (0);
+}
+/*
+ * __db_cursor_check --
+ * See if there are any active cursors on this db.
+ */
+static int
+__db_cursor_check(dbp)
+ DB *dbp;
+{
+ int ret;
+ u_int32_t found;
+
+ ret = __db_walk_cursors(dbp, NULL,
+ __db_cursor_check_func, &found, 0, 0, NULL);
+ return (ret == EEXIST ? EINVAL : ret);
+}
diff --git a/src/db/db_upg.c b/src/db/db_upg.c
new file mode 100644
index 00000000..de5d0dc7
--- /dev/null
+++ b/src/db/db_upg.c
@@ -0,0 +1,527 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_swap.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/qam.h"
+
+/*
+ * __db_upgrade_pp --
+ * DB->upgrade pre/post processing.
+ *
+ * PUBLIC: int __db_upgrade_pp __P((DB *, const char *, u_int32_t));
+ */
+int
+__db_upgrade_pp(dbp, fname, flags)
+ DB *dbp;
+ const char *fname;
+ u_int32_t flags;
+{
+#ifdef HAVE_UPGRADE_SUPPORT
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbp->env;
+
+ /*
+ * !!!
+ * The actual argument checking is simple, do it inline.
+ */
+ if ((ret = __db_fchk(env, "DB->upgrade", flags, DB_DUPSORT)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+ ret = __db_upgrade(dbp, fname, flags);
+ ENV_LEAVE(env, ip);
+ return (ret);
+#else
+	/* dbp is dereferenced below, so it must not be COMPQUIET'd away. */
+	COMPQUIET(fname, NULL);
+	COMPQUIET(flags, 0);
+
+	__db_errx(dbp->env, DB_STR("0665", "upgrade not supported"));
+ return (EINVAL);
+#endif
+}
+
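+/*
+ * Illustrative call of the public API this backs, as a sketch:
+ *
+ *	if ((ret = dbp->upgrade(dbp, "file.db", 0)) != 0)
+ *		dbp->err(dbp, ret, "DB->upgrade: file.db");
+ *
+ * ("file.db" is a hypothetical file name; upgrades are done in place, so
+ * the file should be backed up first.)
+ */
+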
+#ifdef HAVE_UPGRADE_SUPPORT
+static int (* const func_31_list[P_PAGETYPE_MAX])
+ __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *)) = {
+ NULL, /* P_INVALID */
+ NULL, /* __P_DUPLICATE */
+ __ham_31_hash, /* P_HASH_UNSORTED */
+ NULL, /* P_IBTREE */
+ NULL, /* P_IRECNO */
+ __bam_31_lbtree, /* P_LBTREE */
+ NULL, /* P_LRECNO */
+ NULL, /* P_OVERFLOW */
+ __ham_31_hashmeta, /* P_HASHMETA */
+ __bam_31_btreemeta, /* P_BTREEMETA */
+ NULL, /* P_QAMMETA */
+ NULL, /* P_QAMDATA */
+ NULL, /* P_LDUP */
+ NULL, /* P_HASH */
+ NULL, /* P_HEAPMETA */
+ NULL, /* P_HEAP */
+ NULL, /* P_IHEAP */
+};
+
+static int (* const func_46_list[P_PAGETYPE_MAX])
+ __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *)) = {
+ NULL, /* P_INVALID */
+ NULL, /* __P_DUPLICATE */
+ __ham_46_hash, /* P_HASH_UNSORTED */
+ NULL, /* P_IBTREE */
+ NULL, /* P_IRECNO */
+ NULL, /* P_LBTREE */
+ NULL, /* P_LRECNO */
+ NULL, /* P_OVERFLOW */
+ __ham_46_hashmeta, /* P_HASHMETA */
+ NULL, /* P_BTREEMETA */
+ NULL, /* P_QAMMETA */
+ NULL, /* P_QAMDATA */
+ NULL, /* P_LDUP */
+ NULL, /* P_HASH */
+ NULL, /* P_HEAPMETA */
+ NULL, /* P_HEAP */
+ NULL, /* P_IHEAP */
+};
+
+static int __db_page_pass __P((DB *, char *, u_int32_t, int (* const [])
+ (DB *, char *, u_int32_t, DB_FH *, PAGE *, int *), DB_FH *));
+static int __db_set_lastpgno __P((DB *, char *, DB_FH *));
+
+/*
+ * __db_upgrade --
+ * Upgrade an existing database.
+ *
+ * PUBLIC: int __db_upgrade __P((DB *, const char *, u_int32_t));
+ */
+int
+__db_upgrade(dbp, fname, flags)
+ DB *dbp;
+ const char *fname;
+ u_int32_t flags;
+{
+ DBMETA *meta;
+ DB_FH *fhp;
+ ENV *env;
+ size_t n;
+ int ret, t_ret, use_mp_open;
+ u_int8_t mbuf[256], tmpflags;
+ char *real_name;
+
+ use_mp_open = 0;
+ env = dbp->env;
+ fhp = NULL;
+
+ /* Get the real backing file name. */
+ if ((ret = __db_appname(env,
+ DB_APP_DATA, fname, NULL, &real_name)) != 0)
+ return (ret);
+
+ /* Open the file. */
+ if ((ret = __os_open(env, real_name, 0, 0, 0, &fhp)) != 0) {
+ __db_err(env, ret, "%s", real_name);
+ return (ret);
+ }
+
+ /* Initialize the feedback. */
+ if (dbp->db_feedback != NULL)
+ dbp->db_feedback(dbp, DB_UPGRADE, 0);
+
+ /*
+	 * Read the metadata page.  We read 256 bytes, which is larger than
+	 * any access method's metadata structure and smaller than any disk
+	 * sector.
+ */
+ if ((ret = __os_read(env, fhp, mbuf, sizeof(mbuf), &n)) != 0)
+ goto err;
+
+ switch (((DBMETA *)mbuf)->magic) {
+ case DB_BTREEMAGIC:
+ switch (((DBMETA *)mbuf)->version) {
+ case 6:
+ /*
+ * Before V7 not all pages had page types, so we do the
+ * single meta-data page by hand.
+ */
+ if ((ret =
+ __bam_30_btreemeta(dbp, real_name, mbuf)) != 0)
+ goto err;
+ if ((ret = __os_seek(env, fhp, 0, 0, 0)) != 0)
+ goto err;
+ if ((ret = __os_write(env, fhp, mbuf, 256, &n)) != 0)
+ goto err;
+ /* FALLTHROUGH */
+ case 7:
+ /*
+ * We need the page size to do more. Rip it out of
+ * the meta-data page.
+ */
+ memcpy(&dbp->pgsize, mbuf + 20, sizeof(u_int32_t));
+
+ if ((ret = __db_page_pass(
+ dbp, real_name, flags, func_31_list, fhp)) != 0)
+ goto err;
+ /* FALLTHROUGH */
+ case 8:
+ if ((ret =
+ __db_set_lastpgno(dbp, real_name, fhp)) != 0)
+ goto err;
+ /* FALLTHROUGH */
+ case 9:
+ break;
+ default:
+ __db_errx(env, DB_STR_A("0666",
+ "%s: unsupported btree version: %lu", "%s %lu"),
+ real_name, (u_long)((DBMETA *)mbuf)->version);
+ ret = DB_OLD_VERSION;
+ goto err;
+ }
+ break;
+ case DB_HASHMAGIC:
+ switch (((DBMETA *)mbuf)->version) {
+ case 4:
+ case 5:
+ /*
+ * Before V6 not all pages had page types, so we do the
+ * single meta-data page by hand.
+ */
+ if ((ret =
+ __ham_30_hashmeta(dbp, real_name, mbuf)) != 0)
+ goto err;
+ if ((ret = __os_seek(env, fhp, 0, 0, 0)) != 0)
+ goto err;
+ if ((ret = __os_write(env, fhp, mbuf, 256, &n)) != 0)
+ goto err;
+
+ /*
+ * Before V6, we created hash pages one by one as they
+ * were needed, using hashhdr.ovfl_point to reserve
+ * a block of page numbers for them. A consequence
+ * of this was that, if no overflow pages had been
+ * created, the current doubling might extend past
+ * the end of the database file.
+ *
+ * In DB 3.X, we now create all the hash pages
+ * belonging to a doubling atomically; it's not
+ * safe to just save them for later, because when
+ * we create an overflow page we'll just create
+ * a new last page (whatever that may be). Grow
+ * the database to the end of the current doubling.
+ */
+ if ((ret =
+ __ham_30_sizefix(dbp, fhp, real_name, mbuf)) != 0)
+ goto err;
+ /* FALLTHROUGH */
+ case 6:
+ /*
+ * We need the page size to do more. Rip it out of
+ * the meta-data page.
+ */
+ memcpy(&dbp->pgsize, mbuf + 20, sizeof(u_int32_t));
+
+ if ((ret = __db_page_pass(
+ dbp, real_name, flags, func_31_list, fhp)) != 0)
+ goto err;
+ /* FALLTHROUGH */
+ case 7:
+ if ((ret =
+ __db_set_lastpgno(dbp, real_name, fhp)) != 0)
+ goto err;
+ /* FALLTHROUGH */
+ case 8:
+ /*
+ * Any upgrade that has proceeded this far has metadata
+ * pages compatible with hash version 8 metadata pages,
+ * so casting mbuf to a dbmeta is safe.
+ * If a newer revision moves the pagesize, checksum or
+ * encrypt_alg flags in the metadata, then the
+ * extraction of the fields will need to use hard coded
+ * offsets.
+ */
+ meta = (DBMETA*)mbuf;
+ /*
+ * We need the page size to do more. Extract it from
+ * the meta-data page.
+ */
+ memcpy(&dbp->pgsize, &meta->pagesize,
+ sizeof(u_int32_t));
+ /*
+			 * Rip out the checksum (metaflags) and encrypt_alg
+			 * fields from the metadata page, so the upgrade can
+			 * know how big the per-page metadata pre-amble is.
+			 * Any upgrade that has proceeded this far has
+			 * metadata pages compatible with hash version 8
+			 * metadata pages, so extracting the fields is safe.
+ */
+ memcpy(&tmpflags, &meta->metaflags, sizeof(u_int8_t));
+ if (FLD_ISSET(tmpflags, DBMETA_CHKSUM))
+ F_SET(dbp, DB_AM_CHKSUM);
+ memcpy(&tmpflags, &meta->encrypt_alg, sizeof(u_int8_t));
+ if (tmpflags != 0) {
+ if (!CRYPTO_ON(dbp->env)) {
+ __db_errx(env, DB_STR("0667",
+"Attempt to upgrade an encrypted database without providing a password."));
+ ret = EINVAL;
+ goto err;
+ }
+ F_SET(dbp, DB_AM_ENCRYPT);
+ }
+
+ /*
+ * This is ugly. It is necessary to have a usable
+ * mpool in the dbp to upgrade from an unsorted
+ * to a sorted hash database. The mpool file is used
+ * to resolve offpage key items, which are needed to
+ * determine sort order. Having mpool open and access
+ * the file does not affect the page pass, since the
+ * page pass only updates DB_HASH_UNSORTED pages
+ * in-place, and the mpool file is only used to read
+ * OFFPAGE items.
+ */
+ use_mp_open = 1;
+ if ((ret = __os_closehandle(env, fhp)) != 0)
+ return (ret);
+ dbp->type = DB_HASH;
+ if ((ret = __env_mpool(dbp, fname,
+ DB_AM_NOT_DURABLE | DB_AM_VERIFYING)) != 0)
+ return (ret);
+ fhp = dbp->mpf->fhp;
+
+ /* Do the actual conversion pass. */
+ if ((ret = __db_page_pass(
+ dbp, real_name, flags, func_46_list, fhp)) != 0)
+ goto err;
+
+ /* FALLTHROUGH */
+ case 9:
+ break;
+ default:
+ __db_errx(env, DB_STR_A("0668",
+ "%s: unsupported hash version: %lu", "%s %lu"),
+ real_name, (u_long)((DBMETA *)mbuf)->version);
+ ret = DB_OLD_VERSION;
+ goto err;
+ }
+ break;
+ case DB_HEAPMAGIC:
+ /*
+ * There's no upgrade needed for Heap yet.
+ */
+ break;
+ case DB_QAMMAGIC:
+ switch (((DBMETA *)mbuf)->version) {
+ case 1:
+ /*
+ * If we're in a Queue database, the only page that
+ * needs upgrading is the meta-database page, don't
+ * bother with a full pass.
+ */
+ if ((ret = __qam_31_qammeta(dbp, real_name, mbuf)) != 0)
+ return (ret);
+ /* FALLTHROUGH */
+ case 2:
+ if ((ret = __qam_32_qammeta(dbp, real_name, mbuf)) != 0)
+ return (ret);
+ if ((ret = __os_seek(env, fhp, 0, 0, 0)) != 0)
+ goto err;
+ if ((ret = __os_write(env, fhp, mbuf, 256, &n)) != 0)
+ goto err;
+ /* FALLTHROUGH */
+ case 3:
+ case 4:
+ break;
+ default:
+ __db_errx(env, DB_STR_A("0669",
+ "%s: unsupported queue version: %lu",
+ "%s %lu"), real_name,
+ (u_long)((DBMETA *)mbuf)->version);
+ ret = DB_OLD_VERSION;
+ goto err;
+ }
+ break;
+ default:
+ M_32_SWAP(((DBMETA *)mbuf)->magic);
+ switch (((DBMETA *)mbuf)->magic) {
+ case DB_BTREEMAGIC:
+ case DB_HASHMAGIC:
+ case DB_HEAPMAGIC:
+ case DB_QAMMAGIC:
+ __db_errx(env, DB_STR_A("0670",
+ "%s: DB->upgrade only supported on native byte-order systems",
+ "%s"), real_name);
+ break;
+ default:
+ __db_errx(env, DB_STR_A("0671",
+ "%s: unrecognized file type", "%s"), real_name);
+ break;
+ }
+ ret = EINVAL;
+ goto err;
+ }
+
+ ret = __os_fsync(env, fhp);
+
+ /*
+ * If mp_open was used, then rely on the database close to clean up
+ * any file handles.
+ */
+err: if (use_mp_open == 0 && fhp != NULL &&
+ (t_ret = __os_closehandle(env, fhp)) != 0 && ret == 0)
+ ret = t_ret;
+ __os_free(env, real_name);
+
+ /* We're done. */
+ if (dbp->db_feedback != NULL)
+ dbp->db_feedback(dbp, DB_UPGRADE, 100);
+
+ return (ret);
+}
+
+/*
+ * __db_page_pass --
+ * Walk the pages of the database, upgrading whatever needs it.
+ */
+static int
+__db_page_pass(dbp, real_name, flags, fl, fhp)
+ DB *dbp;
+ char *real_name;
+ u_int32_t flags;
+ int (* const fl[P_PAGETYPE_MAX])
+ __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
+ DB_FH *fhp;
+{
+ ENV *env;
+ PAGE *page;
+ db_pgno_t i, pgno_last;
+ size_t n;
+ int dirty, ret;
+
+ env = dbp->env;
+
+ /* Determine the last page of the file. */
+ if ((ret = __db_lastpgno(dbp, real_name, fhp, &pgno_last)) != 0)
+ return (ret);
+
+ /* Allocate memory for a single page. */
+ if ((ret = __os_malloc(env, dbp->pgsize, &page)) != 0)
+ return (ret);
+
+ /* Walk the file, calling the underlying conversion functions. */
+ for (i = 0; i < pgno_last; ++i) {
+ if (dbp->db_feedback != NULL)
+ dbp->db_feedback(
+ dbp, DB_UPGRADE, (int)((i * 100)/pgno_last));
+ if ((ret = __os_seek(env, fhp, i, dbp->pgsize, 0)) != 0)
+ break;
+ if ((ret = __os_read(env, fhp, page, dbp->pgsize, &n)) != 0)
+ break;
+ dirty = 0;
+ /* Always decrypt the page. */
+ if ((ret = __db_decrypt_pg(env, dbp, page)) != 0)
+ break;
+ if (fl[TYPE(page)] != NULL && (ret = fl[TYPE(page)]
+ (dbp, real_name, flags, fhp, page, &dirty)) != 0)
+ break;
+ if (dirty) {
+ if ((ret = __db_encrypt_and_checksum_pg(
+ env, dbp, page)) != 0)
+ break;
+ if ((ret =
+ __os_seek(env, fhp, i, dbp->pgsize, 0)) != 0)
+ break;
+ if ((ret = __os_write(env,
+ fhp, page, dbp->pgsize, &n)) != 0)
+ break;
+ }
+ }
+
+ __os_free(dbp->env, page);
+ return (ret);
+}
+
+/*
+ * __db_lastpgno --
+ * Return the current last page number of the file.
+ *
+ * PUBLIC: int __db_lastpgno __P((DB *, char *, DB_FH *, db_pgno_t *));
+ */
+int
+__db_lastpgno(dbp, real_name, fhp, pgno_lastp)
+ DB *dbp;
+ char *real_name;
+ DB_FH *fhp;
+ db_pgno_t *pgno_lastp;
+{
+ ENV *env;
+ db_pgno_t pgno_last;
+ u_int32_t mbytes, bytes;
+ int ret;
+
+ env = dbp->env;
+
+ if ((ret = __os_ioinfo(env,
+ real_name, fhp, &mbytes, &bytes, NULL)) != 0) {
+ __db_err(env, ret, "%s", real_name);
+ return (ret);
+ }
+
+	/* The file size must be a multiple of the page size. */
+ if (bytes % dbp->pgsize != 0) {
+ __db_errx(env, DB_STR_A("0672",
+ "%s: file size not a multiple of the pagesize", "%s"),
+ real_name);
+ return (EINVAL);
+ }
+ pgno_last = mbytes * (MEGABYTE / dbp->pgsize);
+ pgno_last += bytes / dbp->pgsize;
+
+ *pgno_lastp = pgno_last;
+ return (0);
+}
+
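+/*
+ * Worked example of the arithmetic above: __os_ioinfo reports the file
+ * size as whole megabytes plus leftover bytes.  With a 4096-byte page
+ * size, a file of 3 megabytes plus 8192 bytes gives
+ *
+ *	pgno_last = 3 * (MEGABYTE / 4096) + 8192 / 4096
+ *		  = 3 * 256 + 2 = 770
+ *
+ * pages, numbered 0 through 769.
+ */
+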
+/*
+ * __db_set_lastpgno --
+ * Update the meta->last_pgno field.
+ *
+ * Code assumes that we do not have checksums/crypto on the page.
+ */
+static int
+__db_set_lastpgno(dbp, real_name, fhp)
+ DB *dbp;
+ char *real_name;
+ DB_FH *fhp;
+{
+ DBMETA meta;
+ ENV *env;
+ int ret;
+ size_t n;
+
+ env = dbp->env;
+ if ((ret = __os_seek(env, fhp, 0, 0, 0)) != 0)
+ return (ret);
+ if ((ret = __os_read(env, fhp, &meta, sizeof(meta), &n)) != 0)
+ return (ret);
+ dbp->pgsize = meta.pagesize;
+ if ((ret = __db_lastpgno(dbp, real_name, fhp, &meta.last_pgno)) != 0)
+ return (ret);
+ if ((ret = __os_seek(env, fhp, 0, 0, 0)) != 0)
+ return (ret);
+ if ((ret = __os_write(env, fhp, &meta, sizeof(meta), &n)) != 0)
+ return (ret);
+
+ return (0);
+}
+#endif /* HAVE_UPGRADE_SUPPORT */
diff --git a/src/db/db_upg_opd.c b/src/db/db_upg_opd.c
new file mode 100644
index 00000000..992115ad
--- /dev/null
+++ b/src/db/db_upg_opd.c
@@ -0,0 +1,343 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+
+static int __db_build_bi __P((DB *, DB_FH *, PAGE *, PAGE *, u_int32_t, int *));
+static int __db_build_ri __P((DB *, DB_FH *, PAGE *, PAGE *, u_int32_t, int *));
+static int __db_up_ovref __P((DB *, DB_FH *, db_pgno_t));
+
+#define GET_PAGE(dbp, fhp, pgno, page) { \
+ if ((ret = __os_seek( \
+ dbp->env, fhp, pgno, (dbp)->pgsize, 0)) != 0) \
+ goto err; \
+ if ((ret = __os_read(dbp->env, \
+ fhp, page, (dbp)->pgsize, &n)) != 0) \
+ goto err; \
+}
+#define PUT_PAGE(dbp, fhp, pgno, page) { \
+ if ((ret = __os_seek( \
+ dbp->env, fhp, pgno, (dbp)->pgsize, 0)) != 0) \
+ goto err; \
+ if ((ret = __os_write(dbp->env, \
+ fhp, page, (dbp)->pgsize, &n)) != 0) \
+ goto err; \
+}
+
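+/*
+ * NB: GET_PAGE and PUT_PAGE assume the caller declares "int ret" and
+ * "size_t n" and provides an "err" label; they jump to it on any seek,
+ * read or write failure.
+ */
+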
+/*
+ * __db_31_offdup --
+ * Convert 3.0 off-page duplicates to 3.1 off-page duplicates.
+ *
+ * PUBLIC: int __db_31_offdup __P((DB *, char *, DB_FH *, int, db_pgno_t *));
+ */
+int
+__db_31_offdup(dbp, real_name, fhp, sorted, pgnop)
+ DB *dbp;
+ char *real_name;
+ DB_FH *fhp;
+ int sorted;
+ db_pgno_t *pgnop;
+{
+ PAGE *ipage, *page;
+ db_indx_t indx;
+ db_pgno_t cur_cnt, i, next_cnt, pgno, *pgno_cur, pgno_last;
+ db_pgno_t *pgno_next, pgno_max, *tmp;
+ db_recno_t nrecs;
+ size_t n;
+ int level, nomem, ret;
+
+ ipage = page = NULL;
+ pgno_cur = pgno_next = NULL;
+
+ /* Allocate room to hold a page. */
+ if ((ret = __os_malloc(dbp->env, dbp->pgsize, &page)) != 0)
+ goto err;
+
+ /*
+ * Walk the chain of 3.0 off-page duplicates. Each one is converted
+ * in place to a 3.1 off-page duplicate page. If the duplicates are
+ * sorted, they are converted to a Btree leaf page, otherwise to a
+ * Recno leaf page.
+ */
+ for (nrecs = 0, cur_cnt = pgno_max = 0,
+ pgno = *pgnop; pgno != PGNO_INVALID;) {
+ if (pgno_max == cur_cnt) {
+ pgno_max += 20;
+ if ((ret = __os_realloc(dbp->env, pgno_max *
+ sizeof(db_pgno_t), &pgno_cur)) != 0)
+ goto err;
+ }
+ pgno_cur[cur_cnt++] = pgno;
+
+ GET_PAGE(dbp, fhp, pgno, page);
+ nrecs += NUM_ENT(page);
+ LEVEL(page) = LEAFLEVEL;
+ TYPE(page) = sorted ? P_LDUP : P_LRECNO;
+ /*
+ * !!!
+ * DB didn't zero the LSNs on off-page duplicates pages.
+ */
+ ZERO_LSN(LSN(page));
+ PUT_PAGE(dbp, fhp, pgno, page);
+
+ pgno = NEXT_PGNO(page);
+ }
+
+ /* If we only have a single page, it's easy. */
+ if (cur_cnt <= 1)
+ goto done;
+
+ /*
+ * pgno_cur is the list of pages we just converted. We're
+ * going to walk that list, but we'll need to create a new
+ * list while we do so.
+ */
+ if ((ret = __os_malloc(dbp->env,
+ cur_cnt * sizeof(db_pgno_t), &pgno_next)) != 0)
+ goto err;
+
+ /* Figure out where we can start allocating new pages. */
+ if ((ret = __db_lastpgno(dbp, real_name, fhp, &pgno_last)) != 0)
+ goto err;
+
+ /* Allocate room for an internal page. */
+ if ((ret = __os_malloc(dbp->env, dbp->pgsize, &ipage)) != 0)
+ goto err;
+ PGNO(ipage) = PGNO_INVALID;
+
+ /*
+ * Repeatedly walk the list of pages, building internal pages, until
+ * there's only one page at a level.
+ */
+ for (level = LEAFLEVEL + 1; cur_cnt > 1; ++level) {
+ for (indx = 0, i = next_cnt = 0; i < cur_cnt;) {
+ if (indx == 0) {
+ P_INIT(ipage, dbp->pgsize, pgno_last,
+ PGNO_INVALID, PGNO_INVALID,
+ level, sorted ? P_IBTREE : P_IRECNO);
+ ZERO_LSN(LSN(ipage));
+
+ pgno_next[next_cnt++] = pgno_last++;
+ }
+
+ GET_PAGE(dbp, fhp, pgno_cur[i], page);
+
+ /*
+ * If the duplicates are sorted, put the first item on
+ * the lower-level page onto a Btree internal page. If
+ * the duplicates are not sorted, create an internal
+ * Recno structure on the page. If either case doesn't
+ * fit, push out the current page and start a new one.
+ */
+ nomem = 0;
+ if (sorted) {
+ if ((ret = __db_build_bi(
+ dbp, fhp, ipage, page, indx, &nomem)) != 0)
+ goto err;
+ } else
+ if ((ret = __db_build_ri(
+ dbp, fhp, ipage, page, indx, &nomem)) != 0)
+ goto err;
+ if (nomem) {
+ indx = 0;
+ PUT_PAGE(dbp, fhp, PGNO(ipage), ipage);
+ } else {
+ ++indx;
+ ++NUM_ENT(ipage);
+ ++i;
+ }
+ }
+
+ /*
+ * Push out the last internal page. Set the top-level record
+ * count if we've reached the top.
+ */
+ if (next_cnt == 1)
+ RE_NREC_SET(ipage, nrecs);
+ PUT_PAGE(dbp, fhp, PGNO(ipage), ipage);
+
+ /* Swap the current and next page number arrays. */
+ cur_cnt = next_cnt;
+ tmp = pgno_cur;
+ pgno_cur = pgno_next;
+ pgno_next = tmp;
+ }
+
+done: *pgnop = pgno_cur[0];
+
+err: if (pgno_cur != NULL)
+ __os_free(dbp->env, pgno_cur);
+ if (pgno_next != NULL)
+ __os_free(dbp->env, pgno_next);
+ if (ipage != NULL)
+ __os_free(dbp->env, ipage);
+ if (page != NULL)
+ __os_free(dbp->env, page);
+
+ return (ret);
+}
+
+/*
+ * __db_build_bi --
+ * Build a BINTERNAL entry for a parent page.
+ */
+static int
+__db_build_bi(dbp, fhp, ipage, page, indx, nomemp)
+ DB *dbp;
+ DB_FH *fhp;
+ PAGE *ipage, *page;
+ u_int32_t indx;
+ int *nomemp;
+{
+ BINTERNAL bi, *child_bi;
+ BKEYDATA *child_bk;
+ u_int8_t *p;
+ int ret;
+ db_indx_t *inp;
+
+ inp = P_INP(dbp, ipage);
+ switch (TYPE(page)) {
+ case P_IBTREE:
+ child_bi = GET_BINTERNAL(dbp, page, 0);
+ if (P_FREESPACE(dbp, ipage) < BINTERNAL_PSIZE(child_bi->len)) {
+ *nomemp = 1;
+ return (0);
+ }
+ inp[indx] =
+ HOFFSET(ipage) -= BINTERNAL_SIZE(child_bi->len);
+ p = P_ENTRY(dbp, ipage, indx);
+
+ bi.len = child_bi->len;
+ B_TSET(bi.type, child_bi->type);
+ bi.pgno = PGNO(page);
+ bi.nrecs = __bam_total(dbp, page);
+ memcpy(p, &bi, SSZA(BINTERNAL, data));
+ p += SSZA(BINTERNAL, data);
+ memcpy(p, child_bi->data, child_bi->len);
+
+ /* Increment the overflow ref count. */
+ if (B_TYPE(child_bi->type) == B_OVERFLOW)
+ if ((ret = __db_up_ovref(dbp, fhp,
+ ((BOVERFLOW *)(child_bi->data))->pgno)) != 0)
+ return (ret);
+ break;
+ case P_LDUP:
+ child_bk = GET_BKEYDATA(dbp, page, 0);
+ switch (B_TYPE(child_bk->type)) {
+ case B_KEYDATA:
+ if (P_FREESPACE(dbp, ipage) <
+ BINTERNAL_PSIZE(child_bk->len)) {
+ *nomemp = 1;
+ return (0);
+ }
+ inp[indx] =
+ HOFFSET(ipage) -= BINTERNAL_SIZE(child_bk->len);
+ p = P_ENTRY(dbp, ipage, indx);
+
+ bi.len = child_bk->len;
+ B_TSET(bi.type, child_bk->type);
+ bi.pgno = PGNO(page);
+ bi.nrecs = __bam_total(dbp, page);
+ memcpy(p, &bi, SSZA(BINTERNAL, data));
+ p += SSZA(BINTERNAL, data);
+ memcpy(p, child_bk->data, child_bk->len);
+ break;
+ case B_OVERFLOW:
+ if (P_FREESPACE(dbp, ipage) <
+ BINTERNAL_PSIZE(BOVERFLOW_SIZE)) {
+ *nomemp = 1;
+ return (0);
+ }
+ inp[indx] =
+ HOFFSET(ipage) -= BINTERNAL_SIZE(BOVERFLOW_SIZE);
+ p = P_ENTRY(dbp, ipage, indx);
+
+ bi.len = BOVERFLOW_SIZE;
+ B_TSET(bi.type, child_bk->type);
+ bi.pgno = PGNO(page);
+ bi.nrecs = __bam_total(dbp, page);
+ memcpy(p, &bi, SSZA(BINTERNAL, data));
+ p += SSZA(BINTERNAL, data);
+ memcpy(p, child_bk, BOVERFLOW_SIZE);
+
+ /* Increment the overflow ref count. */
+ if ((ret = __db_up_ovref(dbp, fhp,
+ ((BOVERFLOW *)child_bk)->pgno)) != 0)
+ return (ret);
+ break;
+ default:
+ return (__db_pgfmt(dbp->env, PGNO(page)));
+ }
+ break;
+ default:
+ return (__db_pgfmt(dbp->env, PGNO(page)));
+ }
+
+ return (0);
+}
+
+/*
+ * __db_build_ri --
+ * Build a RINTERNAL entry for an internal parent page.
+ */
+static int
+__db_build_ri(dbp, fhp, ipage, page, indx, nomemp)
+ DB *dbp;
+ DB_FH *fhp;
+ PAGE *ipage, *page;
+ u_int32_t indx;
+ int *nomemp;
+{
+ RINTERNAL ri;
+ db_indx_t *inp;
+
+ COMPQUIET(fhp, NULL);
+ inp = P_INP(dbp, ipage);
+ if (P_FREESPACE(dbp, ipage) < RINTERNAL_PSIZE) {
+ *nomemp = 1;
+ return (0);
+ }
+
+ ri.pgno = PGNO(page);
+ ri.nrecs = __bam_total(dbp, page);
+ inp[indx] = HOFFSET(ipage) -= RINTERNAL_SIZE;
+ memcpy(P_ENTRY(dbp, ipage, indx), &ri, RINTERNAL_SIZE);
+
+ return (0);
+}
+
+/*
+ * __db_up_ovref --
+ *	Increment the reference count on an overflow page.
+ */
+static int
+__db_up_ovref(dbp, fhp, pgno)
+ DB *dbp;
+ DB_FH *fhp;
+ db_pgno_t pgno;
+{
+ PAGE *page;
+ size_t n;
+ int ret;
+
+ /* Allocate room to hold a page. */
+ if ((ret = __os_malloc(dbp->env, dbp->pgsize, &page)) != 0)
+ return (ret);
+
+ GET_PAGE(dbp, fhp, pgno, page);
+ ++OV_REF(page);
+ PUT_PAGE(dbp, fhp, pgno, page);
+
+err: __os_free(dbp->env, page);
+
+ return (ret);
+}
diff --git a/src/db/db_vrfy.c b/src/db/db_vrfy.c
new file mode 100644
index 00000000..9cb94ad2
--- /dev/null
+++ b/src/db/db_vrfy.c
@@ -0,0 +1,3055 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2000, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_swap.h"
+#include "dbinc/db_verify.h"
+#include "dbinc/btree.h"
+#include "dbinc/fop.h"
+#include "dbinc/hash.h"
+#include "dbinc/heap.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/qam.h"
+#include "dbinc/txn.h"
+
+/*
+ * This is the code for DB->verify, the DB database consistency checker.
+ * For now, it checks all subdatabases in a database, and verifies
+ * everything it knows how to (i.e. it's all-or-nothing, and one can't
+ * check only for a subset of possible problems).
+ */
+
+static u_int __db_guesspgsize __P((ENV *, DB_FH *));
+static int __db_is_valid_magicno __P((u_int32_t, DBTYPE *));
+static int __db_meta2pgset
+ __P((DB *, VRFY_DBINFO *, db_pgno_t, u_int32_t, DB *));
+static int __db_salvage __P((DB *, VRFY_DBINFO *,
+ db_pgno_t, void *, int (*)(void *, const void *), u_int32_t));
+static int __db_salvage_subdbpg __P((DB *, VRFY_DBINFO *,
+ PAGE *, void *, int (*)(void *, const void *), u_int32_t));
+static int __db_salvage_all __P((DB *, VRFY_DBINFO *, void *,
+ int(*)(void *, const void *), u_int32_t, int *));
+static int __db_salvage_unknowns __P((DB *, VRFY_DBINFO *, void *,
+ int (*)(void *, const void *), u_int32_t));
+static int __db_verify_arg __P((DB *, const char *, void *, u_int32_t));
+static int __db_vrfy_freelist
+ __P((DB *, VRFY_DBINFO *, db_pgno_t, u_int32_t));
+static int __db_vrfy_getpagezero
+ __P((DB *, DB_FH *, const char *, u_int8_t *, u_int32_t));
+static int __db_vrfy_invalid
+ __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, u_int32_t));
+static int __db_vrfy_orderchkonly __P((DB *,
+ VRFY_DBINFO *, const char *, const char *, u_int32_t));
+static int __db_vrfy_pagezero __P((DB *,
+ VRFY_DBINFO *, DB_FH *, const char *, u_int32_t));
+static int __db_vrfy_subdbs
+ __P((DB *, VRFY_DBINFO *, const char *, u_int32_t));
+static int __db_vrfy_structure __P((DB *, VRFY_DBINFO *,
+ const char *, db_pgno_t, void *, void *, u_int32_t));
+static int __db_vrfy_walkpages __P((DB *, VRFY_DBINFO *,
+ void *, int (*)(void *, const void *), u_int32_t));
+
+#define VERIFY_FLAGS \
+ (DB_AGGRESSIVE | \
+ DB_NOORDERCHK | DB_ORDERCHKONLY | DB_PRINTABLE | DB_SALVAGE | DB_UNREF)
+
+/*
+ * __db_verify_pp --
+ * DB->verify public interface.
+ *
+ * PUBLIC: int __db_verify_pp
+ * PUBLIC: __P((DB *, const char *, const char *, FILE *, u_int32_t));
+ */
+int
+__db_verify_pp(dbp, file, database, outfile, flags)
+ DB *dbp;
+ const char *file, *database;
+ FILE *outfile;
+ u_int32_t flags;
+{
+ /*
+ * __db_verify_pp is a wrapper to __db_verify_internal, which lets
+ * us pass appropriate equivalents to FILE * in from the non-C APIs.
+ * That's why the usual ENV_ENTER macros are in __db_verify_internal,
+ * not here.
+ */
+ return (__db_verify_internal(dbp,
+ file, database, outfile, __db_pr_callback, flags));
+}
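+
+/*
+ * A minimal usage sketch of the public interface, assuming <db.h> and
+ * <stdio.h> and a hypothetical database file name. Because DB->verify is
+ * a handle destructor (see __db_verify_internal below), the handle must
+ * not be reused afterward, even on error.
+ *
+ *	int
+ *	verify_example(path)
+ *		const char *path;
+ *	{
+ *		DB *dbp;
+ *		int ret;
+ *
+ *		if ((ret = db_create(&dbp, NULL, 0)) != 0)
+ *			return (ret);
+ *		return (dbp->verify(dbp, path, NULL, stdout, 0));
+ *	}
+ */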
+
+/*
+ * __db_verify_internal --
+ *
+ * PUBLIC: int __db_verify_internal __P((DB *, const char *,
+ * PUBLIC: const char *, void *, int (*)(void *, const void *), u_int32_t));
+ */
+int
+__db_verify_internal(dbp, fname, dname, handle, callback, flags)
+ DB *dbp;
+ const char *fname, *dname;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret, t_ret;
+
+ env = dbp->env;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->verify");
+
+ if (!LF_ISSET(DB_SALVAGE))
+ LF_SET(DB_UNREF);
+
+ ENV_ENTER(env, ip);
+
+ if ((ret = __db_verify_arg(dbp, dname, handle, flags)) == 0)
+ ret = __db_verify(dbp, ip,
+ fname, dname, handle, callback, NULL, NULL, flags);
+
+ /* Db.verify is a DB handle destructor. */
+ if ((t_ret = __db_close(dbp, NULL, 0)) != 0 && ret == 0)
+ ret = t_ret;
+
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __db_verify_arg --
+ * Check DB->verify arguments.
+ */
+static int
+__db_verify_arg(dbp, dname, handle, flags)
+ DB *dbp;
+ const char *dname;
+ void *handle;
+ u_int32_t flags;
+{
+ ENV *env;
+ int ret;
+
+ env = dbp->env;
+
+ if ((ret = __db_fchk(env, "DB->verify", flags, VERIFY_FLAGS)) != 0)
+ return (ret);
+
+ /*
+ * DB_SALVAGE is mutually exclusive with the other flags except
+ * DB_AGGRESSIVE, DB_PRINTABLE.
+ *
+ * DB_AGGRESSIVE and DB_PRINTABLE are only meaningful when salvaging.
+ *
+ * DB_SALVAGE requires an output stream.
+ */
+ if (LF_ISSET(DB_SALVAGE)) {
+ if (LF_ISSET(~(DB_AGGRESSIVE | DB_PRINTABLE | DB_SALVAGE)))
+ return (__db_ferr(env, "DB->verify", 1));
+ if (handle == NULL) {
+ __db_errx(env, DB_STR("0518",
+ "DB_SALVAGE requires a an output handle"));
+ return (EINVAL);
+ }
+ } else
+ if (LF_ISSET(DB_AGGRESSIVE | DB_PRINTABLE))
+ return (__db_ferr(env, "DB->verify", 1));
+
+ /*
+ * DB_ORDERCHKONLY is mutually exclusive with DB_SALVAGE and
+ * DB_NOORDERCHK, and requires a database name.
+ */
+ if ((ret = __db_fcchk(env, "DB->verify", flags,
+ DB_ORDERCHKONLY, DB_SALVAGE | DB_NOORDERCHK)) != 0)
+ return (ret);
+ if (LF_ISSET(DB_ORDERCHKONLY) && dname == NULL) {
+ __db_errx(env, DB_STR("0519",
+ "DB_ORDERCHKONLY requires a database name"));
+ return (EINVAL);
+ }
+ return (0);
+}
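+
+/*
+ * The rules above, by example (not an exhaustive list):
+ *
+ *	DB_SALVAGE | DB_AGGRESSIVE		legal, if handle != NULL
+ *	DB_SALVAGE | DB_NOORDERCHK		rejected by __db_ferr
+ *	DB_PRINTABLE without DB_SALVAGE		rejected by __db_ferr
+ *	DB_ORDERCHKONLY with dname == NULL	rejected with EINVAL
+ */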
+
+/*
+ * __db_verify --
+ * Walk the entire file page-by-page, either verifying with or without
+ * dumping in db_dump -d format, or DB_SALVAGE-ing whatever key/data
+ * pairs can be found and dumping them in standard (db_load-ready)
+ * dump format.
+ *
+ * (Salvaging isn't really a verification operation, but we put it
+ * here anyway because it requires essentially identical top-level
+ * code.)
+ *
+ * flags may be 0, DB_NOORDERCHK, DB_ORDERCHKONLY, or DB_SALVAGE
+ * (and optionally DB_AGGRESSIVE).
+ * PUBLIC: int __db_verify __P((DB *, DB_THREAD_INFO *, const char *,
+ * PUBLIC: const char *, void *, int (*)(void *, const void *),
+ * PUBLIC: void *, void *, u_int32_t));
+ */
+int
+__db_verify(dbp, ip, name, subdb, handle, callback, lp, rp, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ const char *name, *subdb;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ void *lp, *rp;
+ u_int32_t flags;
+{
+ DB_FH *fhp;
+ ENV *env;
+ VRFY_DBINFO *vdp;
+ u_int32_t sflags;
+ int has_subdbs, isbad, ret, t_ret;
+ char *real_name;
+
+ env = dbp->env;
+ fhp = NULL;
+ vdp = NULL;
+ real_name = NULL;
+ has_subdbs = isbad = ret = t_ret = 0;
+
+ F_SET(dbp, DB_AM_VERIFYING);
+
+ /* Initialize any feedback function. */
+ if (!LF_ISSET(DB_SALVAGE) && dbp->db_feedback != NULL)
+ dbp->db_feedback(dbp, DB_VERIFY, 0);
+
+ /*
+ * We don't know how large the cache is, and if the database
+ * in question uses a small page size--which we don't know
+ * yet!--it may be uncomfortably small for the default page
+ * size [#2143]. However, the things we need temporary
+ * databases for in dbinfo are largely tiny, so using a
+ * 1024-byte pagesize is probably not going to be a big hit,
+ * and will make us fit better into small spaces.
+ */
+ if ((ret = __db_vrfy_dbinfo_create(env, ip, 1024, &vdp)) != 0)
+ goto err;
+
+ /*
+ * Note whether the user has requested that we use printable
+ * chars where possible. We won't get here with this flag if
+ * we're not salvaging.
+ */
+ if (LF_ISSET(DB_PRINTABLE))
+ F_SET(vdp, SALVAGE_PRINTABLE);
+
+ if (name != NULL) {
+ /* Find the real name of the file. */
+ if ((ret = __db_appname(env,
+ DB_APP_DATA, name, &dbp->dirname, &real_name)) != 0)
+ goto err;
+
+ /*
+ * Our first order of business is to verify page 0, which is the
+ * metadata page for the master database of subdatabases or of
+ * the only database in the file. We want to do this by hand
+ * rather than just calling __db_open in case it's
+ * corrupt--various things in __db_open might act funny.
+ *
+ * Once we know the metadata page is healthy, I believe that
+ * it's safe to open the database normally and then use the page
+ * swapping code, which makes life easier.
+ */
+ if ((ret = __os_open(env,
+ real_name, 0, DB_OSO_RDONLY, 0, &fhp)) != 0)
+ goto err;
+ } else {
+ MAKE_INMEM(dbp);
+ }
+
+ /* Verify the metadata page 0; set pagesize and type. */
+ if ((ret = __db_vrfy_pagezero(dbp, vdp, fhp, subdb, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+
+ /*
+ * We can assume at this point that dbp->pagesize and dbp->type are
+ * set correctly, or at least as well as they can be, and that
+	 * locking, logging, and txns are not in use. Thus we can trust
+	 * the memp code not to look at the page, and therefore that it
+	 * is safe enough to use.
+ *
+ * The dbp is not open, but the file is open in the fhp, and we
+ * cannot assume that __db_open is safe. Call __env_setup,
+ * the [safe] part of __db_open that initializes the environment--
+ * and the mpool--manually.
+ */
+ if ((ret = __env_setup(dbp, NULL,
+ name, subdb, TXN_INVALID, DB_ODDFILESIZE | DB_RDONLY)) != 0)
+ goto err;
+
+ /*
+ * Set our name in the Queue subsystem; we may need it later
+ * to deal with extents. In-memory databases are not allowed to have
+ * extents.
+ */
+ if (dbp->type == DB_QUEUE && name != NULL &&
+ (ret = __qam_set_ext_data(dbp, name)) != 0)
+ goto err;
+
+ /* Mark the dbp as opened, so that we correctly handle its close. */
+ F_SET(dbp, DB_AM_OPEN_CALLED);
+
+ /*
+ * Find out the page number of the last page in the database. We'll
+ * use this later to verify the metadata page. We don't verify now
+ * because the data from __db_vrfy_pagezero could be stale.
+ */
+ if ((ret = __memp_get_last_pgno(dbp->mpf, &vdp->last_pgno)) != 0)
+ goto err;
+ /*
+ * DB_ORDERCHKONLY is a special case; our file consists of
+ * several subdatabases, which use different hash, bt_compare,
+ * and/or dup_compare functions. Consequently, we couldn't verify
+ * sorting and hashing simply by calling DB->verify() on the file.
+ * DB_ORDERCHKONLY allows us to come back and check those things; it
+ * requires a subdatabase, and assumes that everything but that
+ * database's sorting/hashing is correct.
+ */
+ if (LF_ISSET(DB_ORDERCHKONLY)) {
+ ret = __db_vrfy_orderchkonly(dbp, vdp, name, subdb, flags);
+ goto done;
+ }
+
+ sflags = flags;
+ if (dbp->p_internal != NULL)
+ LF_CLR(DB_SALVAGE);
+
+ /*
+ * When salvaging, we use a db to keep track of whether we've seen a
+ * given overflow or dup page in the course of traversing normal data.
+ * If in the end we have not, we assume its key got lost and print it
+ * with key "UNKNOWN".
+ */
+ if (LF_ISSET(DB_SALVAGE)) {
+ if ((ret = __db_salvage_init(vdp)) != 0)
+ goto err;
+
+ /*
+ * If we're not being aggressive, salvage by walking the tree
+ * and only printing the leaves we find. "has_subdbs" will
+ * indicate whether we found subdatabases.
+ */
+ if (!LF_ISSET(DB_AGGRESSIVE) && __db_salvage_all(
+ dbp, vdp, handle, callback, flags, &has_subdbs) != 0)
+ isbad = 1;
+
+ /*
+ * If we have subdatabases, flag if any keys are found that
+ * don't belong to a subdatabase -- they'll need to have an
+ * "__OTHER__" subdatabase header printed first.
+ */
+ if (has_subdbs) {
+ F_SET(vdp, SALVAGE_PRINTHEADER);
+ F_SET(vdp, SALVAGE_HASSUBDBS);
+ }
+ }
+
+	/* Walk all the pages; even if one cannot be read, verify structure. */
+ if ((ret =
+ __db_vrfy_walkpages(dbp, vdp, handle, callback, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else if (ret != DB_PAGE_NOTFOUND)
+ goto err;
+ }
+
+ /* If we're verifying, verify inter-page structure. */
+ if (!LF_ISSET(DB_SALVAGE) && isbad == 0)
+ if ((t_ret = __db_vrfy_structure(dbp,
+ vdp, name, 0, lp, rp, flags)) != 0) {
+ if (t_ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+
+ /*
+ * If we're salvaging, output with key UNKNOWN any overflow or dup pages
+ * we haven't been able to put in context. Then destroy the salvager's
+ * state-saving database.
+ */
+ if (LF_ISSET(DB_SALVAGE)) {
+ if ((ret = __db_salvage_unknowns(dbp,
+ vdp, handle, callback, flags)) != 0)
+ isbad = 1;
+ }
+
+ flags = sflags;
+
+#ifdef HAVE_PARTITION
+ if (t_ret == 0 && dbp->p_internal != NULL)
+ t_ret = __part_verify(dbp, vdp, name, handle, callback, flags);
+#endif
+
+ if (ret == 0)
+ ret = t_ret;
+
+ /* Don't display a footer for a database holding other databases. */
+ if (LF_ISSET(DB_SALVAGE | DB_VERIFY_PARTITION) == DB_SALVAGE &&
+ (!has_subdbs || F_ISSET(vdp, SALVAGE_PRINTFOOTER)))
+ (void)__db_prfooter(handle, callback);
+
+done: err:
+ /* Send feedback that we're done. */
+ if (!LF_ISSET(DB_SALVAGE) && dbp->db_feedback != NULL)
+ dbp->db_feedback(dbp, DB_VERIFY, 100);
+
+ if (LF_ISSET(DB_SALVAGE) &&
+ (t_ret = __db_salvage_destroy(vdp)) != 0 && ret == 0)
+ ret = t_ret;
+ if (fhp != NULL &&
+ (t_ret = __os_closehandle(env, fhp)) != 0 && ret == 0)
+ ret = t_ret;
+ if (vdp != NULL &&
+ (t_ret = __db_vrfy_dbinfo_destroy(env, vdp)) != 0 && ret == 0)
+ ret = t_ret;
+ if (real_name != NULL)
+ __os_free(env, real_name);
+
+ /*
+ * DB_VERIFY_FATAL is a private error, translate to a public one.
+ *
+	 * If we didn't find a page, it's probably because a page number was
+	 * corrupted. Return the standard corruption error.
+ *
+ * Otherwise, if we found corruption along the way, set the return.
+ */
+ if (ret == DB_VERIFY_FATAL ||
+ ret == DB_PAGE_NOTFOUND || (ret == 0 && isbad == 1))
+ ret = DB_VERIFY_BAD;
+
+ /* Make sure there's a public complaint if we found corruption. */
+ if (ret != 0)
+ __db_err(env, ret, "%s", name);
+
+ return (ret);
+}
+
+/*
+ * __db_vrfy_getpagezero --
+ *	Store the master metadata page in a local buffer. For safety, skip
+ *	the regular DB paging code and read the page directly from disk via
+ *	seek and read; for in-memory databases, which have no file, read it
+ *	from the mpool instead.
+ */
+static int
+__db_vrfy_getpagezero(dbp, fhp, name, mbuf, flags)
+ DB *dbp;
+ DB_FH *fhp;
+ const char *name;
+ u_int8_t *mbuf;
+ u_int32_t flags;
+{
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ PAGE *h;
+ db_pgno_t pgno;
+ int ret, t_ret;
+ size_t nr;
+
+ env = dbp->env;
+
+ if (F_ISSET(dbp, DB_AM_INMEM)) {
+ /*
+ * Now get the metadata page from the cache, if possible. If
+ * we're verifying an in-memory db, this is the only metadata
+ * page we have.
+		 *
+ * Open the in-memory db file and get the metadata page.
+ */
+ if ((ret = __memp_fcreate_pp(env->dbenv, &mpf, DB_VERIFY)) != 0)
+ return (ret);
+ if ((ret = __memp_set_flags(mpf, DB_MPOOL_NOFILE, 1)) != 0)
+ goto mpf_err;
+ if ((ret = __memp_fopen_pp(mpf,
+ name, DB_ODDFILESIZE | DB_RDONLY, 0, 0)) != 0)
+ goto mpf_err;
+ pgno = PGNO_BASE_MD;
+ if ((ret = __memp_fget_pp(mpf, &pgno, NULL, 0, &h)) != 0) {
+ __db_err(env, ret, DB_STR_A("0747",
+ "Metadata page %lu cannot be read from mpool",
+ "%lu"), (u_long)pgno);
+ goto mpf_err;
+ }
+ memcpy(mbuf, (u_int8_t *)h, DBMETASIZE);
+ ret = __memp_fput_pp(mpf, h, DB_PRIORITY_UNCHANGED, 0);
+mpf_err: if ((t_ret = __memp_fclose_pp(mpf, 0)) != 0 || ret != 0) {
+ return (ret == 0 ? t_ret : ret);
+ }
+ } else {
+ /*
+ * Seek to the metadata page.
+ *
+ * Note that if we're just starting a verification, dbp->pgsize
+ * may be zero; this is okay, as we want page zero anyway and
+ * 0*0 == 0.
+ */
+ if ((ret = __os_seek(env, fhp, 0, 0, 0)) != 0 ||
+ (ret = __os_read(env, fhp, mbuf, DBMETASIZE, &nr)) != 0) {
+ __db_err(env, ret, DB_STR_A("0520",
+ "Metadata page %lu cannot be read", "%lu"),
+ (u_long)PGNO_BASE_MD);
+ return (ret);
+ }
+
+ if (nr != DBMETASIZE) {
+ EPRINT((env, DB_STR_A("0521",
+ "Page %lu: Incomplete metadata page", "%lu"),
+ (u_long)PGNO_BASE_MD));
+ return (DB_VERIFY_FATAL);
+ }
+ }
+
+ return (ret);
+}
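+
+/*
+ * A standalone sketch of the same read-page-zero-by-hand idea, using raw
+ * POSIX calls instead of the __os_* wrappers. It assumes <errno.h>,
+ * <fcntl.h>, <string.h> and <unistd.h>; "path" is a hypothetical file
+ * name, 512 is DBMETASIZE, and the magic number occupies bytes 12-15 of
+ * the metadata page (see __db_vrfy_pagezero below).
+ *
+ *	int
+ *	read_meta_magic(path, magicp)
+ *		const char *path;
+ *		u_int32_t *magicp;
+ *	{
+ *		u_int8_t mbuf[512];
+ *		int fd;
+ *
+ *		if ((fd = open(path, O_RDONLY)) == -1)
+ *			return (errno);
+ *		if (pread(fd, mbuf, sizeof(mbuf), 0) !=
+ *		    (ssize_t)sizeof(mbuf)) {
+ *			(void)close(fd);
+ *			return (EIO);
+ *		}
+ *		(void)close(fd);
+ *		memcpy(magicp, mbuf + 12, sizeof(*magicp));
+ *		return (0);
+ *	}
+ *
+ * The result, byte-swapped if necessary, can then be compared against
+ * DB_BTREEMAGIC, DB_HASHMAGIC, DB_HEAPMAGIC and DB_QAMMAGIC.
+ */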
+
+/*
+ * __db_vrfy_pagezero --
+ * Verify the master metadata page. Use seek, read, and a local buffer
+ * rather than the DB paging code, for safety.
+ *
+ * Must correctly (or best-guess) set dbp->type and dbp->pagesize.
+ */
+static int
+__db_vrfy_pagezero(dbp, vdp, fhp, name, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ DB_FH *fhp;
+ const char *name;
+ u_int32_t flags;
+{
+ DBMETA *meta;
+ ENV *env;
+ VRFY_PAGEINFO *pip;
+ db_pgno_t freelist;
+ int isbad, ret, swapped;
+ u_int8_t mbuf[DBMETASIZE];
+
+ isbad = ret = swapped = 0;
+ freelist = 0;
+ env = dbp->env;
+ meta = (DBMETA *)mbuf;
+ dbp->type = DB_UNKNOWN;
+
+ if ((ret = __db_vrfy_getpagezero(dbp, fhp, name, mbuf, flags)) != 0)
+ return (ret);
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, PGNO_BASE_MD, &pip)) != 0)
+ return (ret);
+
+ if ((ret = __db_chk_meta(env, dbp, meta, 1)) != 0) {
+ EPRINT((env, DB_STR_A("0522",
+ "Page %lu: metadata page corrupted", "%lu"),
+ (u_long)PGNO_BASE_MD));
+ isbad = 1;
+ if (ret != DB_CHKSUM_FAIL) {
+ EPRINT((env, DB_STR_A("0523",
+ "Page %lu: could not check metadata page", "%lu"),
+ (u_long)PGNO_BASE_MD));
+ return (DB_VERIFY_FATAL);
+ }
+ }
+
+ /*
+ * Check all of the fields that we can.
+ *
+ * 08-11: Current page number. Must == pgno.
+ * Note that endianness doesn't matter--it's zero.
+ */
+ if (meta->pgno != PGNO_BASE_MD) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("0524",
+ "Page %lu: pgno incorrectly set to %lu", "%lu %lu"),
+ (u_long)PGNO_BASE_MD, (u_long)meta->pgno));
+ }
+
+ /* 12-15: Magic number. Must be one of valid set. */
+ if (__db_is_valid_magicno(meta->magic, &dbp->type))
+ swapped = 0;
+ else {
+ M_32_SWAP(meta->magic);
+ if (__db_is_valid_magicno(meta->magic,
+ &dbp->type))
+ swapped = 1;
+ else {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("0525",
+ "Page %lu: bad magic number %lu", "%lu %lu"),
+ (u_long)PGNO_BASE_MD, (u_long)meta->magic));
+ }
+ }
+
+ /*
+ * 16-19: Version. Must be current; for now, we
+ * don't support verification of old versions.
+ */
+ if (swapped)
+ M_32_SWAP(meta->version);
+ if ((dbp->type == DB_BTREE &&
+ (meta->version > DB_BTREEVERSION ||
+ meta->version < DB_BTREEOLDVER)) ||
+ (dbp->type == DB_HASH &&
+ (meta->version > DB_HASHVERSION ||
+ meta->version < DB_HASHOLDVER)) ||
+ (dbp->type == DB_HEAP &&
+ (meta->version > DB_HEAPVERSION ||
+ meta->version < DB_HEAPOLDVER)) ||
+ (dbp->type == DB_QUEUE &&
+ (meta->version > DB_QAMVERSION ||
+ meta->version < DB_QAMOLDVER))) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("0526",
+ "Page %lu: unsupported DB version %lu; extraneous errors may result",
+ "%lu %lu"), (u_long)PGNO_BASE_MD, (u_long)meta->version));
+ }
+
+ /*
+	 * 20-23: Pagesize. Must be a power of two,
+	 * at least 512, and at most 64K.
+ */
+ if (swapped)
+ M_32_SWAP(meta->pagesize);
+ if (IS_VALID_PAGESIZE(meta->pagesize))
+ dbp->pgsize = meta->pagesize;
+ else {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("0527", "Page %lu: bad page size %lu",
+ "%lu %lu"), (u_long)PGNO_BASE_MD, (u_long)meta->pagesize));
+
+ /*
+ * Now try to settle on a pagesize to use.
+ * If the user-supplied one is reasonable,
+ * use it; else, guess.
+ */
+ if (!IS_VALID_PAGESIZE(dbp->pgsize))
+ dbp->pgsize = __db_guesspgsize(env, fhp);
+ }
+
+ /*
+ * 25: Page type. Must be correct for dbp->type,
+ * which is by now set as well as it can be.
+ */
+ /* Needs no swapping--only one byte! */
+ if ((dbp->type == DB_BTREE && meta->type != P_BTREEMETA) ||
+ (dbp->type == DB_HASH && meta->type != P_HASHMETA) ||
+ (dbp->type == DB_HEAP && meta->type != P_HEAPMETA) ||
+ (dbp->type == DB_QUEUE && meta->type != P_QAMMETA)) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("0528", "Page %lu: bad page type %lu",
+ "%lu %lu"), (u_long)PGNO_BASE_MD, (u_long)meta->type));
+ }
+
+ /*
+ * 26: Meta-flags.
+ */
+ if (meta->metaflags != 0) {
+ if (FLD_ISSET(meta->metaflags,
+ ~(DBMETA_CHKSUM|DBMETA_PART_RANGE|DBMETA_PART_CALLBACK))) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("0529",
+ "Page %lu: bad meta-data flags value %#lx",
+ "%lu %#lx"), (u_long)PGNO_BASE_MD,
+ (u_long)meta->metaflags));
+ }
+ if (FLD_ISSET(meta->metaflags, DBMETA_CHKSUM))
+ F_SET(pip, VRFY_HAS_CHKSUM);
+ if (FLD_ISSET(meta->metaflags, DBMETA_PART_RANGE))
+ F_SET(pip, VRFY_HAS_PART_RANGE);
+ if (FLD_ISSET(meta->metaflags, DBMETA_PART_CALLBACK))
+ F_SET(pip, VRFY_HAS_PART_CALLBACK);
+
+ if (FLD_ISSET(meta->metaflags,
+ DBMETA_PART_RANGE | DBMETA_PART_CALLBACK) &&
+ (ret = __partition_init(dbp, meta->metaflags)) != 0)
+ return (ret);
+ }
+
+ /*
+ * 28-31: Free list page number.
+ * 32-35: Last page in database file.
+ * We'll verify last_pgno once we open the db in the mpool;
+ * for now, just store it.
+ */
+ if (swapped)
+ M_32_SWAP(meta->free);
+ freelist = meta->free;
+ if (swapped)
+ M_32_SWAP(meta->last_pgno);
+ vdp->meta_last_pgno = meta->last_pgno;
+
+ /*
+ * Initialize vdp->pages to fit a single pageinfo structure for
+ * this one page. We'll realloc later when we know how many
+ * pages there are.
+ */
+ pip->pgno = PGNO_BASE_MD;
+ pip->type = meta->type;
+
+ /*
+ * Signal that we still have to check the info specific to
+ * a given type of meta page.
+ */
+ F_SET(pip, VRFY_INCOMPLETE);
+
+ pip->free = freelist;
+
+ if ((ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0)
+ return (ret);
+
+ /* Set up the dbp's fileid. We don't use the regular open path. */
+ memcpy(dbp->fileid, meta->uid, DB_FILE_ID_LEN);
+ dbp->preserve_fid = 1;
+
+ if (swapped == 1)
+ F_SET(dbp, DB_AM_SWAP);
+
+ return (isbad ? DB_VERIFY_BAD : 0);
+}
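+
+/*
+ * Swap detection above, by example: DB_BTREEMAGIC is 0x00053162. If the
+ * file was written on a machine of the opposite byte order, the raw read
+ * yields 0x62310500, which matches no known magic; after M_32_SWAP it is
+ * 0x00053162 again, so swapped is set to 1 and DB_AM_SWAP is set on the
+ * handle at the end of the function.
+ */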
+
+/*
+ * __db_vrfy_walkpages --
+ * Main loop of the verifier/salvager. Walks through,
+ * page by page, and verifies all pages and/or prints all data pages.
+ */
+static int
+__db_vrfy_walkpages(dbp, vdp, handle, callback, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ u_int32_t flags;
+{
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ PAGE *h;
+ VRFY_PAGEINFO *pip;
+ db_pgno_t i;
+ int ret, t_ret, isbad;
+
+ env = dbp->env;
+ mpf = dbp->mpf;
+ h = NULL;
+ ret = isbad = t_ret = 0;
+
+ for (i = 0; i <= vdp->last_pgno; i++) {
+ /*
+ * If DB_SALVAGE is set, we inspect our database of completed
+ * pages, and skip any we've already printed in the subdb pass.
+ */
+ if (LF_ISSET(DB_SALVAGE) && (__db_salvage_isdone(vdp, i) != 0))
+ continue;
+
+ /*
+ * An individual page get can fail if:
+		 * * This is a hash database: empty buckets are expected
+		 *    and don't have allocated pages. Create a dummy page
+		 *    so the verification can proceed.
+		 * * We are salvaging: flag the error and continue.
+ */
+ if ((t_ret = __memp_fget(mpf, &i,
+ vdp->thread_info, NULL, 0, &h)) != 0) {
+ if (dbp->type == DB_HASH ||
+ (dbp->type == DB_QUEUE &&
+ F_ISSET(dbp, DB_AM_INMEM))) {
+ if ((t_ret =
+ __db_vrfy_getpageinfo(vdp, i, &pip)) != 0)
+ goto err1;
+ pip->type = P_INVALID;
+ pip->pgno = i;
+ F_CLR(pip, VRFY_IS_ALLZEROES);
+ F_SET(pip, VRFY_NONEXISTENT);
+ if ((t_ret = __db_vrfy_putpageinfo(
+ env, vdp, pip)) != 0)
+ goto err1;
+ continue;
+ }
+ if (t_ret == DB_PAGE_NOTFOUND) {
+ EPRINT((env, DB_STR_A("0530",
+ "Page %lu: beyond the end of the file, metadata page has last page as %lu",
+ "%lu %lu"), (u_long)i,
+ (u_long)vdp->last_pgno));
+ if (ret == 0)
+ return (t_ret);
+ }
+
+err1: if (ret == 0)
+ ret = t_ret;
+ if (LF_ISSET(DB_SALVAGE))
+ continue;
+ return (ret);
+ }
+
+ if (LF_ISSET(DB_SALVAGE)) {
+ /*
+ * We pretty much don't want to quit unless a
+ * bomb hits. May as well return that something
+ * was screwy, however.
+ */
+ if ((t_ret = __db_salvage_pg(dbp,
+ vdp, i, h, handle, callback, flags)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ isbad = 1;
+ }
+ } else {
+ /*
+ * If we are not salvaging, and we get any error
+ * other than DB_VERIFY_BAD, return immediately;
+ * it may not be safe to proceed. If we get
+ * DB_VERIFY_BAD, keep going; listing more errors
+ * may make it easier to diagnose problems and
+ * determine the magnitude of the corruption.
+ *
+ * Verify info common to all page types.
+ */
+ if (i != PGNO_BASE_MD) {
+ ret = __db_vrfy_common(dbp, vdp, h, i, flags);
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else if (ret != 0)
+ goto err;
+ }
+
+ switch (TYPE(h)) {
+ case P_INVALID:
+ ret = __db_vrfy_invalid(dbp, vdp, h, i, flags);
+ break;
+ case __P_DUPLICATE:
+ isbad = 1;
+ EPRINT((env, DB_STR_A("0531",
+ "Page %lu: old-style duplicate page",
+ "%lu"), (u_long)i));
+ break;
+ case P_HASH_UNSORTED:
+ case P_HASH:
+ ret = __ham_vrfy(dbp, vdp, h, i, flags);
+ break;
+ case P_HEAP:
+ case P_IHEAP:
+ ret = __heap_vrfy(dbp, vdp, h, i, flags);
+ break;
+ case P_IBTREE:
+ case P_IRECNO:
+ case P_LBTREE:
+ case P_LDUP:
+ ret = __bam_vrfy(dbp, vdp, h, i, flags);
+ break;
+ case P_LRECNO:
+ ret = __ram_vrfy_leaf(dbp, vdp, h, i, flags);
+ break;
+ case P_OVERFLOW:
+ ret = __db_vrfy_overflow(dbp, vdp, h, i, flags);
+ break;
+ case P_HASHMETA:
+ ret = __ham_vrfy_meta(dbp,
+ vdp, (HMETA *)h, i, flags);
+ break;
+ case P_HEAPMETA:
+ ret = __heap_vrfy_meta(dbp,
+ vdp, (HEAPMETA *)h, i, flags);
+ break;
+ case P_BTREEMETA:
+ ret = __bam_vrfy_meta(dbp,
+ vdp, (BTMETA *)h, i, flags);
+ break;
+ case P_QAMMETA:
+ ret = __qam_vrfy_meta(dbp,
+ vdp, (QMETA *)h, i, flags);
+ break;
+ case P_QAMDATA:
+ ret = __qam_vrfy_data(dbp,
+ vdp, (QPAGE *)h, i, flags);
+ break;
+ default:
+ EPRINT((env, DB_STR_A("0532",
+ "Page %lu: unknown page type %lu",
+ "%lu %lu"), (u_long)i, (u_long)TYPE(h)));
+ isbad = 1;
+ break;
+ }
+
+ /*
+ * Set up error return.
+ */
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else if (ret != 0)
+ goto err;
+
+ /*
+ * Provide feedback to the application about our
+ * progress. The range 0-50% comes from the fact
+ * that this is the first of two passes through the
+ * database (front-to-back, then top-to-bottom).
+ */
+ if (dbp->db_feedback != NULL)
+ dbp->db_feedback(dbp, DB_VERIFY,
+ (int)((i + 1) * 50 / (vdp->last_pgno + 1)));
+ }
+
+ /*
+ * Just as with the page get, bail if and only if we're
+ * not salvaging.
+ */
+ if ((t_ret = __memp_fput(mpf,
+ vdp->thread_info, h, dbp->priority)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ if (!LF_ISSET(DB_SALVAGE))
+ return (ret);
+ }
+ }
+
+ /*
+ * If we've seen a Queue metadata page, we may need to walk Queue
+ * extent pages that won't show up between 0 and vdp->last_pgno.
+ */
+ if (F_ISSET(vdp, VRFY_QMETA_SET) && (t_ret =
+ __qam_vrfy_walkqueue(dbp, vdp, handle, callback, flags)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ if (t_ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else if (!LF_ISSET(DB_SALVAGE))
+ return (ret);
+ }
+
+ if (0) {
+err: if (h != NULL && (t_ret = __memp_fput(mpf,
+ vdp->thread_info, h, dbp->priority)) != 0)
+ return (ret == 0 ? t_ret : ret);
+ }
+
+ return ((isbad == 1 && ret == 0) ? DB_VERIFY_BAD : ret);
+}
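+
+/*
+ * Feedback arithmetic above, by example: with last_pgno == 199 (a
+ * 200-page file), finishing page i == 99 reports
+ * (99 + 1) * 50 / (199 + 1) == 25, i.e. 25%. The second, top-down pass
+ * reports the remaining 50-100% range via __db_vrfy_struct_feedback.
+ */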
+
+/*
+ * __db_vrfy_structure --
+ * After a beginning-to-end walk through the database has been
+ * completed, put together the information that has been collected
+ * to verify the overall database structure.
+ *
+ * Should only be called if we want to do a database verification,
+ * i.e. if DB_SALVAGE is not set.
+ */
+static int
+__db_vrfy_structure(dbp, vdp, dbname, meta_pgno, lp, rp, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ const char *dbname;
+ db_pgno_t meta_pgno;
+ void *lp, *rp;
+ u_int32_t flags;
+{
+ DB *pgset;
+ ENV *env;
+ VRFY_PAGEINFO *pip;
+ db_pgno_t i;
+ int ret, isbad, hassubs, p;
+
+ isbad = 0;
+ pip = NULL;
+ env = dbp->env;
+ pgset = vdp->pgset;
+
+ /*
+ * Providing feedback here is tricky; in most situations,
+ * we fetch each page one more time, but we do so in a top-down
+ * order that depends on the access method. Worse, we do this
+ * recursively in btree, such that on any call where we're traversing
+ * a subtree we don't know where that subtree is in the whole database;
+ * worse still, any given database may be one of several subdbs.
+ *
+ * The solution is to decrement a counter vdp->pgs_remaining each time
+ * we verify (and call feedback on) a page. We may over- or
+ * under-count, but the structure feedback function will ensure that we
+ * never give a percentage under 50 or over 100. (The first pass
+ * covered the range 0-50%.)
+ */
+ if (dbp->db_feedback != NULL)
+ vdp->pgs_remaining = vdp->last_pgno + 1;
+
+ /*
+ * Call the appropriate function to downwards-traverse the db type.
+ */
+ switch (dbp->type) {
+ case DB_BTREE:
+ case DB_RECNO:
+ if ((ret =
+ __bam_vrfy_structure(dbp, vdp, 0, lp, rp, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+
+ /*
+ * If we have subdatabases and we know that the database is,
+ * thus far, sound, it's safe to walk the tree of subdatabases.
+ * Do so, and verify the structure of the databases within.
+ */
+ if ((ret = __db_vrfy_getpageinfo(vdp, 0, &pip)) != 0)
+ goto err;
+ hassubs = F_ISSET(pip, VRFY_HAS_SUBDBS) ? 1 : 0;
+ if ((ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0)
+ goto err;
+ pip = NULL;
+
+ if (isbad == 0 && hassubs)
+ if ((ret =
+ __db_vrfy_subdbs(dbp, vdp, dbname, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+ break;
+ case DB_HASH:
+ if ((ret = __ham_vrfy_structure(dbp, vdp, 0, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+ break;
+ case DB_HEAP:
+ if ((ret = __heap_vrfy_structure(dbp, vdp, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ }
+ /* Skip the freelist check for heap, it doesn't apply. */
+ goto err;
+ case DB_QUEUE:
+ if ((ret = __qam_vrfy_structure(dbp, vdp, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ }
+
+ /*
+ * Queue pages may be unreferenced and totally zeroed, if
+ * they're empty; queue doesn't have much structure, so
+ * this is unlikely to be wrong in any troublesome sense.
+ * Skip to "err".
+ */
+ goto err;
+ case DB_UNKNOWN:
+ default:
+ ret = __db_unknown_path(env, "__db_vrfy_structure");
+ goto err;
+ }
+
+ /* Walk free list. */
+ if ((ret =
+ __db_vrfy_freelist(dbp, vdp, meta_pgno, flags)) == DB_VERIFY_BAD)
+ isbad = 1;
+
+ /*
+ * If structure checks up until now have failed, it's likely that
+ * checking what pages have been missed will result in oodles of
+ * extraneous error messages being EPRINTed. Skip to the end
+ * if this is the case; we're going to be printing at least one
+ * error anyway, and probably all the more salient ones.
+ */
+ if (ret != 0 || isbad == 1)
+ goto err;
+
+ /*
+ * Make sure no page has been missed and that no page is still marked
+ * "all zeroes" unless we are looking at unused hash bucket pages or
+	 * pages off the end of the database.
+ */
+ for (i = 0; i < vdp->last_pgno + 1; i++) {
+ if ((ret = __db_vrfy_getpageinfo(vdp, i, &pip)) != 0)
+ goto err;
+ if ((ret = __db_vrfy_pgset_get(pgset,
+ vdp->thread_info, vdp->txn, i, &p)) != 0)
+ goto err;
+ if (pip->type == P_OVERFLOW) {
+ if ((u_int32_t)p != pip->refcount) {
+ EPRINT((env, DB_STR_A("0533",
+ "Page %lu: overflow refcount %lu, referenced %lu times",
+ "%lu %lu %lu"), (u_long)i,
+ (u_long)pip->refcount, (u_long)p));
+ isbad = 1;
+ }
+ } else if (p == 0 &&
+#ifndef HAVE_FTRUNCATE
+ !(i > vdp->meta_last_pgno &&
+ (F_ISSET(pip, VRFY_IS_ALLZEROES) || pip->type == P_HASH)) &&
+#endif
+ !(dbp->type == DB_HASH &&
+ (pip->type == P_HASH || pip->type == P_INVALID))) {
+ /*
+ * It is OK for unreferenced hash buckets to be
+ * marked invalid and unreferenced.
+ */
+ EPRINT((env, DB_STR_A("0534",
+ "Page %lu: unreferenced page", "%lu"), (u_long)i));
+ isbad = 1;
+ }
+
+ if (F_ISSET(pip, VRFY_IS_ALLZEROES)
+#ifndef HAVE_FTRUNCATE
+ && i <= vdp->meta_last_pgno
+#endif
+ ) {
+ EPRINT((env, DB_STR_A("0535",
+ "Page %lu: totally zeroed page", "%lu"),
+ (u_long)i));
+ isbad = 1;
+ }
+ if ((ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0)
+ goto err;
+ pip = NULL;
+ }
+
+err: if (pip != NULL)
+ (void)__db_vrfy_putpageinfo(env, vdp, pip);
+
+ return ((isbad == 1 && ret == 0) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __db_is_valid_magicno --
+ *	Return 1 and set *typep if magic is a recognized database magic
+ *	number; otherwise return 0 and set *typep to DB_UNKNOWN.
+ */
+static int
+__db_is_valid_magicno(magic, typep)
+ u_int32_t magic;
+ DBTYPE *typep;
+{
+ switch (magic) {
+ case DB_BTREEMAGIC:
+ *typep = DB_BTREE;
+ return (1);
+ case DB_HASHMAGIC:
+ *typep = DB_HASH;
+ return (1);
+ case DB_HEAPMAGIC:
+ *typep = DB_HEAP;
+ return (1);
+ case DB_QAMMAGIC:
+ *typep = DB_QUEUE;
+ return (1);
+ default:
+ break;
+ }
+ *typep = DB_UNKNOWN;
+ return (0);
+}
+
+/*
+ * __db_vrfy_common --
+ * Verify info common to all page types.
+ *
+ * PUBLIC: int __db_vrfy_common
+ * PUBLIC: __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, u_int32_t));
+ */
+int
+__db_vrfy_common(dbp, vdp, h, pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ PAGE *h;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ ENV *env;
+ VRFY_PAGEINFO *pip;
+ int ret, t_ret;
+ u_int8_t *p;
+
+ env = dbp->env;
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+
+ pip->pgno = pgno;
+ F_CLR(pip, VRFY_IS_ALLZEROES);
+
+ /*
+ * Hash expands the table by leaving some pages between the
+ * old last and the new last totally zeroed. These pages may
+ * not be all zero if they were used, freed and then reallocated.
+ *
+ * Queue will create sparse files if sparse record numbers are used.
+ */
+ if (pgno != 0 && PGNO(h) == 0) {
+ F_SET(pip, VRFY_IS_ALLZEROES);
+ for (p = (u_int8_t *)h; p < (u_int8_t *)h + dbp->pgsize; p++)
+ if (*p != 0) {
+ F_CLR(pip, VRFY_IS_ALLZEROES);
+ break;
+ }
+ /*
+ * Mark it as a hash, and we'll
+ * check that that makes sense structurally later.
+ * (The queue verification doesn't care, since queues
+ * don't really have much in the way of structure.)
+ */
+ if (dbp->type != DB_HEAP)
+ pip->type = P_HASH;
+ ret = 0;
+ goto err; /* well, not really an err. */
+ }
+
+ if (PGNO(h) != pgno) {
+ EPRINT((env, DB_STR_A("0536", "Page %lu: bad page number %lu",
+ "%lu %lu"), (u_long)pgno, (u_long)h->pgno));
+ ret = DB_VERIFY_BAD;
+ }
+
+ switch (h->type) {
+ case P_INVALID: /* Order matches ordinal value. */
+ case P_HASH_UNSORTED:
+ case P_IBTREE:
+ case P_IRECNO:
+ case P_LBTREE:
+ case P_LRECNO:
+ case P_OVERFLOW:
+ case P_HASHMETA:
+ case P_BTREEMETA:
+ case P_QAMMETA:
+ case P_QAMDATA:
+ case P_LDUP:
+ case P_HASH:
+ case P_HEAP:
+ case P_IHEAP:
+ case P_HEAPMETA:
+ break;
+ default:
+ EPRINT((env, DB_STR_A("0537", "Page %lu: bad page type %lu",
+ "%lu %lu"), (u_long)pgno, (u_long)h->type));
+ ret = DB_VERIFY_BAD;
+ }
+ pip->type = h->type;
+
+err: if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __db_vrfy_invalid --
+ * Verify P_INVALID page.
+ * (Yes, there's not much to do here.)
+ */
+static int
+__db_vrfy_invalid(dbp, vdp, h, pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ PAGE *h;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ ENV *env;
+ VRFY_PAGEINFO *pip;
+ int ret, t_ret;
+
+ env = dbp->env;
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+ pip->next_pgno = pip->prev_pgno = 0;
+
+ if (!IS_VALID_PGNO(NEXT_PGNO(h))) {
+ EPRINT((env, DB_STR_A("0538", "Page %lu: invalid next_pgno %lu",
+ "%lu %lu"), (u_long)pgno, (u_long)NEXT_PGNO(h)));
+ ret = DB_VERIFY_BAD;
+ } else
+ pip->next_pgno = NEXT_PGNO(h);
+
+ if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __db_vrfy_datapage --
+ * Verify elements common to data pages (P_HASH, P_LBTREE,
+ * P_IBTREE, P_IRECNO, P_LRECNO, P_OVERFLOW, P_DUPLICATE)--i.e.,
+ * those defined in the PAGE structure.
+ *
+ * Called from each of the per-page routines, after the
+ * all-page-type-common elements of pip have been verified and filled
+ * in.
+ *
+ * PUBLIC: int __db_vrfy_datapage
+ * PUBLIC: __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, u_int32_t));
+ */
+int
+__db_vrfy_datapage(dbp, vdp, h, pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ PAGE *h;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ ENV *env;
+ VRFY_PAGEINFO *pip;
+ u_int32_t smallest_entry;
+ int isbad, ret, t_ret;
+
+ env = dbp->env;
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+ isbad = 0;
+
+ /*
+ * prev_pgno and next_pgno: store for inter-page checks,
+ * verify that they point to actual pages and not to self.
+ *
+ * !!!
+ * Internal btree pages, as well as heap pages, do not maintain these
+ * fields (indeed, they overload them). Skip.
+ */
+ if (TYPE(h) != P_IBTREE &&
+ TYPE(h) != P_IRECNO && TYPE(h) != P_HEAP && TYPE(h) != P_IHEAP) {
+ if (!IS_VALID_PGNO(PREV_PGNO(h)) || PREV_PGNO(h) == pip->pgno) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("0539",
+ "Page %lu: invalid prev_pgno %lu", "%lu %lu"),
+ (u_long)pip->pgno, (u_long)PREV_PGNO(h)));
+ }
+ if (!IS_VALID_PGNO(NEXT_PGNO(h)) || NEXT_PGNO(h) == pip->pgno) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("0540",
+ "Page %lu: invalid next_pgno %lu", "%lu %lu"),
+ (u_long)pip->pgno, (u_long)NEXT_PGNO(h)));
+ }
+ pip->prev_pgno = PREV_PGNO(h);
+ pip->next_pgno = NEXT_PGNO(h);
+ }
+
+ /*
+ * Verify the number of entries on the page: there's no good way to
+ * determine if this is accurate. The best we can do is verify that
+ * it's not more than can, in theory, fit on the page. Then, we make
+ * sure there are at least this many valid elements in inp[], and
+ * hope the test catches most cases.
+ */
+ switch (TYPE(h)) {
+ case P_HASH_UNSORTED:
+ case P_HASH:
+ smallest_entry = HKEYDATA_PSIZE(0);
+ break;
+ case P_HEAP:
+ smallest_entry = sizeof(HEAPHDR) + sizeof(db_indx_t);
+ break;
+ case P_IHEAP:
+ /* Really high_pgno. */
+ pip->prev_pgno = PREV_PGNO(h);
+ smallest_entry = 0;
+ break;
+ case P_IBTREE:
+ smallest_entry = BINTERNAL_PSIZE(0);
+ break;
+ case P_IRECNO:
+ smallest_entry = RINTERNAL_PSIZE;
+ break;
+ case P_LBTREE:
+ case P_LDUP:
+ case P_LRECNO:
+ smallest_entry = BKEYDATA_PSIZE(0);
+ break;
+ default:
+ smallest_entry = 0;
+ break;
+ }
+ if (smallest_entry * NUM_ENT(h) / 2 > dbp->pgsize) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("0541",
+ "Page %lu: too many entries: %lu",
+ "%lu %lu"), (u_long)pgno, (u_long)NUM_ENT(h)));
+ }
+
+ if (TYPE(h) != P_OVERFLOW)
+ pip->entries = NUM_ENT(h);
+
+ /*
+ * btree level. Should be zero unless we're a btree;
+ * if we are a btree, should be between LEAFLEVEL and MAXBTREELEVEL,
+ * and we need to save it off.
+ */
+ switch (TYPE(h)) {
+ case P_IBTREE:
+ case P_IRECNO:
+ if (LEVEL(h) < LEAFLEVEL + 1) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("0542",
+ "Page %lu: bad btree level %lu", "%lu %lu"),
+ (u_long)pgno, (u_long)LEVEL(h)));
+ }
+ pip->bt_level = LEVEL(h);
+ break;
+ case P_LBTREE:
+ case P_LDUP:
+ case P_LRECNO:
+ if (LEVEL(h) != LEAFLEVEL) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("0543",
+ "Page %lu: btree leaf page has incorrect level %lu",
+ "%lu %lu"), (u_long)pgno, (u_long)LEVEL(h)));
+ }
+ break;
+ default:
+ if (LEVEL(h) != 0) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("0544",
+ "Page %lu: nonzero level %lu in non-btree database",
+ "%lu %lu"), (u_long)pgno, (u_long)LEVEL(h)));
+ }
+ break;
+ }
+
+ /*
+ * Even though inp[] occurs in all PAGEs, we look at it in the
+ * access-method-specific code, since btree and hash treat
+ * item lengths very differently, and one of the most important
+ * things we want to verify is that the data--as specified
+ * by offset and length--cover the right part of the page
+ * without overlaps, gaps, or violations of the page boundary.
+ */
+ if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
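+
+/*
+ * The entry-count plausibility check above, by example: if smallest_entry
+ * were 8 bytes (a made-up value; the real minimum depends on the page
+ * type), then on a 4096-byte page the test 8 * NUM_ENT / 2 > 4096 flags
+ * any NUM_ENT above 1024 -- a factor-of-two slack over what could fit
+ * even if every item were empty.
+ */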
+
+/*
+ * __db_vrfy_meta --
+ * Verify the access-method common parts of a meta page, using
+ * normal mpool routines.
+ *
+ * PUBLIC: int __db_vrfy_meta
+ * PUBLIC: __P((DB *, VRFY_DBINFO *, DBMETA *, db_pgno_t, u_int32_t));
+ */
+int
+__db_vrfy_meta(dbp, vdp, meta, pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ DBMETA *meta;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ DBTYPE dbtype, magtype;
+ ENV *env;
+ VRFY_PAGEINFO *pip;
+ int isbad, ret, t_ret;
+
+ isbad = 0;
+ env = dbp->env;
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+
+ /* type plausible for a meta page */
+ switch (meta->type) {
+ case P_BTREEMETA:
+ dbtype = DB_BTREE;
+ break;
+ case P_HASHMETA:
+ dbtype = DB_HASH;
+ break;
+ case P_HEAPMETA:
+ dbtype = DB_HEAP;
+ break;
+ case P_QAMMETA:
+ dbtype = DB_QUEUE;
+ break;
+ default:
+ ret = __db_unknown_path(env, "__db_vrfy_meta");
+ goto err;
+ }
+
+ /* magic number valid */
+ if (!__db_is_valid_magicno(meta->magic, &magtype)) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("0545", "Page %lu: invalid magic number",
+ "%lu"), (u_long)pgno));
+ }
+ if (magtype != dbtype) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("0546",
+ "Page %lu: magic number does not match database type",
+ "%lu"), (u_long)pgno));
+ }
+
+ /* version */
+ if ((dbtype == DB_BTREE &&
+ (meta->version > DB_BTREEVERSION ||
+ meta->version < DB_BTREEOLDVER)) ||
+ (dbtype == DB_HASH &&
+ (meta->version > DB_HASHVERSION ||
+ meta->version < DB_HASHOLDVER)) ||
+ (dbtype == DB_HEAP &&
+ (meta->version > DB_HEAPVERSION ||
+ meta->version < DB_HEAPOLDVER)) ||
+ (dbtype == DB_QUEUE &&
+ (meta->version > DB_QAMVERSION ||
+ meta->version < DB_QAMOLDVER))) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("0547",
+ "Page %lu: unsupported database version %lu; extraneous errors may result",
+ "%lu %lu"), (u_long)pgno, (u_long)meta->version));
+ }
+
+ /* pagesize */
+ if (meta->pagesize != dbp->pgsize) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("0548", "Page %lu: invalid pagesize %lu",
+ "%lu %lu"), (u_long)pgno, (u_long)meta->pagesize));
+ }
+
+ /* Flags */
+ if (meta->metaflags != 0) {
+ if (FLD_ISSET(meta->metaflags,
+ ~(DBMETA_CHKSUM|DBMETA_PART_RANGE|DBMETA_PART_CALLBACK))) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("0549",
+ "Page %lu: bad meta-data flags value %#lx",
+ "%lu %#lx"), (u_long)PGNO_BASE_MD,
+ (u_long)meta->metaflags));
+ }
+ if (FLD_ISSET(meta->metaflags, DBMETA_CHKSUM))
+ F_SET(pip, VRFY_HAS_CHKSUM);
+ if (FLD_ISSET(meta->metaflags, DBMETA_PART_RANGE))
+ F_SET(pip, VRFY_HAS_PART_RANGE);
+ if (FLD_ISSET(meta->metaflags, DBMETA_PART_CALLBACK))
+ F_SET(pip, VRFY_HAS_PART_CALLBACK);
+ }
+
+ /*
+ * Free list.
+ *
+ * If this is not the main, master-database meta page, it
+ * should not have a free list.
+ */
+ if (pgno != PGNO_BASE_MD && meta->free != PGNO_INVALID) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("0550",
+ "Page %lu: nonempty free list on subdatabase metadata page",
+ "%lu"), (u_long)pgno));
+ }
+
+ /* Can correctly be PGNO_INVALID--that's just the end of the list. */
+ if (IS_VALID_PGNO(meta->free))
+ pip->free = meta->free;
+ else {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("0551",
+ "Page %lu: nonsensical free list pgno %lu", "%lu %lu"),
+ (u_long)pgno, (u_long)meta->free));
+ }
+
+ /*
+ * Check that the meta page agrees with what we got from mpool.
+ * If we don't have FTRUNCATE then mpool could include some
+	 * zeroed pages at the end of the file, so we assume the meta page
+ * is correct. Queue does not update the meta page's last_pgno.
+ */
+ if (pgno == PGNO_BASE_MD &&
+ dbtype != DB_QUEUE && meta->last_pgno != vdp->last_pgno) {
+#ifdef HAVE_FTRUNCATE
+ isbad = 1;
+ EPRINT((env, DB_STR_A("0552",
+ "Page %lu: last_pgno is not correct: %lu != %lu",
+ "%lu %lu %lu"), (u_long)pgno,
+ (u_long)meta->last_pgno, (u_long)vdp->last_pgno));
+#endif
+ vdp->meta_last_pgno = meta->last_pgno;
+ }
+
+ /*
+ * We have now verified the common fields of the metadata page.
+ * Clear the flag that told us they had been incompletely checked.
+ */
+ F_CLR(pip, VRFY_INCOMPLETE);
+
+err: if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __db_vrfy_freelist --
+ * Walk free list, checking off pages and verifying absence of
+ * loops.
+ */
+static int
+__db_vrfy_freelist(dbp, vdp, meta, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ db_pgno_t meta;
+ u_int32_t flags;
+{
+ DB *pgset;
+ ENV *env;
+ VRFY_PAGEINFO *pip;
+ db_pgno_t cur_pgno, next_pgno;
+ int p, ret, t_ret;
+
+ env = dbp->env;
+ pgset = vdp->pgset;
+ DB_ASSERT(env, pgset != NULL);
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, meta, &pip)) != 0)
+ return (ret);
+ for (next_pgno = pip->free;
+ next_pgno != PGNO_INVALID; next_pgno = pip->next_pgno) {
+ cur_pgno = pip->pgno;
+ if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0)
+ return (t_ret);
+
+ /* This shouldn't happen, but just in case. */
+ if (!IS_VALID_PGNO(next_pgno)) {
+ EPRINT((env, DB_STR_A("0553",
+ "Page %lu: invalid next_pgno %lu on free list page",
+ "%lu %lu"), (u_long)cur_pgno, (u_long)next_pgno));
+ return (DB_VERIFY_BAD);
+ }
+
+ if (next_pgno > vdp->last_pgno) {
+ EPRINT((env, DB_STR_A("0713",
+ "Page %lu: page %lu on free list beyond last_pgno %lu",
+ "%lu %lu %lu"), (u_long)cur_pgno,
+ (u_long)next_pgno, (u_long)vdp->last_pgno));
+ ret = DB_VERIFY_BAD;
+ }
+ /* Detect cycles. */
+ if ((t_ret = __db_vrfy_pgset_get(pgset,
+ vdp->thread_info, vdp->txn, next_pgno, &p)) != 0)
+ return (t_ret);
+ if (p != 0) {
+ EPRINT((env, DB_STR_A("0554",
+ "Page %lu: page %lu encountered a second time on free list",
+ "%lu %lu"), (u_long)cur_pgno, (u_long)next_pgno));
+ return (DB_VERIFY_BAD);
+ }
+ if ((t_ret = __db_vrfy_pgset_inc(pgset,
+ vdp->thread_info, vdp->txn, next_pgno)) != 0)
+ return (t_ret);
+
+ if ((t_ret = __db_vrfy_getpageinfo(vdp, next_pgno, &pip)) != 0)
+ return (t_ret);
+
+ if (pip->type != P_INVALID) {
+ EPRINT((env, DB_STR_A("0555",
+ "Page %lu: non-invalid page %lu on free list",
+ "%lu %lu"), (u_long)cur_pgno, (u_long)next_pgno));
+ ret = DB_VERIFY_BAD; /* unsafe to continue */
+ break;
+ }
+ }
+
+ if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
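+
+/*
+ * Why the pgset counting above detects cycles, by example: walking a
+ * corrupt free list 7 -> 12 -> 7 (page numbers made up), the first visit
+ * to page 7 increments its pgset count to 1; when page 12 points back to
+ * 7, __db_vrfy_pgset_get returns p == 1 and we fail with DB_VERIFY_BAD
+ * instead of looping forever.
+ */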
+
+/*
+ * __db_vrfy_subdbs --
+ * Walk the known-safe master database of subdbs with a cursor,
+ * verifying the structure of each subdatabase we encounter.
+ */
+static int
+__db_vrfy_subdbs(dbp, vdp, dbname, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ const char *dbname;
+ u_int32_t flags;
+{
+ DB *mdbp;
+ DBC *dbc;
+ DBT key, data;
+ ENV *env;
+ VRFY_PAGEINFO *pip;
+ db_pgno_t meta_pgno;
+ int ret, t_ret, isbad;
+ u_int8_t type;
+
+ isbad = 0;
+ dbc = NULL;
+ env = dbp->env;
+
+ if ((ret = __db_master_open(dbp,
+ vdp->thread_info, NULL, dbname, DB_RDONLY, 0, &mdbp)) != 0)
+ return (ret);
+
+ if ((ret = __db_cursor_int(mdbp, NULL,
+ vdp->txn, DB_BTREE, PGNO_INVALID, 0, DB_LOCK_INVALIDID, &dbc)) != 0)
+ goto err;
+
+ memset(&key, 0, sizeof(key));
+ memset(&data, 0, sizeof(data));
+ while ((ret = __dbc_get(dbc, &key, &data, DB_NEXT)) == 0) {
+ if (data.size != sizeof(db_pgno_t)) {
+ EPRINT((env, DB_STR("0556",
+ "Subdatabase entry not page-number size")));
+ isbad = 1;
+ goto err;
+ }
+ memcpy(&meta_pgno, data.data, data.size);
+ /*
+ * Subdatabase meta pgnos are stored in network byte
+ * order for cross-endian compatibility. Swap if appropriate.
+ */
+ DB_NTOHL_SWAP(env, &meta_pgno);
+ if (meta_pgno == PGNO_INVALID || meta_pgno > vdp->last_pgno) {
+ EPRINT((env, DB_STR_A("0557",
+ "Subdatabase entry references invalid page %lu",
+ "%lu"), (u_long)meta_pgno));
+ isbad = 1;
+ goto err;
+ }
+ if ((ret = __db_vrfy_getpageinfo(vdp, meta_pgno, &pip)) != 0)
+ goto err;
+ type = pip->type;
+ if ((ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0)
+ goto err;
+ switch (type) {
+ case P_BTREEMETA:
+ if ((ret = __bam_vrfy_structure(
+ dbp, vdp, meta_pgno, NULL, NULL, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+ break;
+ case P_HASHMETA:
+ if ((ret = __ham_vrfy_structure(
+ dbp, vdp, meta_pgno, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+ break;
+ case P_QAMMETA:
+ default:
+ EPRINT((env, DB_STR_A("0558",
+ "Subdatabase entry references page %lu of invalid type %lu",
+ "%lu %lu"), (u_long)meta_pgno, (u_long)type));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+ }
+
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+
+err: if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if ((t_ret = __db_close(mdbp, NULL, 0)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
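+
+/*
+ * Byte-order note, by example: a subdatabase whose meta page is pgno 3 is
+ * stored in the master database's data DBT as the big-endian bytes
+ * 00 00 00 03 on every platform; DB_NTOHL_SWAP above turns those back
+ * into a native db_pgno_t on little-endian hosts and is a no-op on
+ * big-endian ones.
+ */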
+
+/*
+ * __db_vrfy_struct_feedback --
+ * Provide feedback during top-down database structure traversal.
+ * (See comment at the beginning of __db_vrfy_structure.)
+ *
+ * PUBLIC: void __db_vrfy_struct_feedback __P((DB *, VRFY_DBINFO *));
+ */
+void
+__db_vrfy_struct_feedback(dbp, vdp)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+{
+ int progress;
+
+ if (dbp->db_feedback == NULL)
+ return;
+
+ if (vdp->pgs_remaining > 0)
+ vdp->pgs_remaining--;
+
+ /* Don't allow a feedback call of 100 until we're really done. */
+ progress = 100 - (int)(vdp->pgs_remaining * 50 / (vdp->last_pgno + 1));
+ dbp->db_feedback(dbp, DB_VERIFY, progress == 100 ? 99 : progress);
+}
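+
+/*
+ * By example: with last_pgno == 199 and pgs_remaining == 100, progress is
+ * 100 - (100 * 50 / 200) == 75. When pgs_remaining reaches zero the
+ * formula yields 100, which the clamp above reports as 99 until the
+ * verification really is complete.
+ */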
+
+/*
+ * __db_vrfy_orderchkonly --
+ *	Do a sort-order/hashing check on a known-otherwise-good subdb.
+ */
+static int
+__db_vrfy_orderchkonly(dbp, vdp, name, subdb, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ const char *name, *subdb;
+ u_int32_t flags;
+{
+ BTMETA *btmeta;
+ DB *mdbp, *pgset;
+ DBC *pgsc;
+ DBT key, data;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ HASH *h_internal;
+ HMETA *hmeta;
+ PAGE *h, *currpg;
+ db_pgno_t meta_pgno, p, pgno;
+ u_int32_t bucket;
+ int t_ret, ret;
+
+ pgset = NULL;
+ pgsc = NULL;
+ env = dbp->env;
+ mpf = dbp->mpf;
+ currpg = h = NULL;
+
+ LF_CLR(DB_NOORDERCHK);
+
+ /* Open the master database and get the meta_pgno for the subdb. */
+ if ((ret = __db_master_open(dbp,
+ vdp->thread_info, NULL, name, DB_RDONLY, 0, &mdbp)) != 0)
+ goto err;
+
+ DB_INIT_DBT(key, subdb, strlen(subdb));
+ memset(&data, 0, sizeof(data));
+ if ((ret = __db_get(mdbp,
+ vdp->thread_info, NULL, &key, &data, 0)) != 0) {
+ if (ret == DB_NOTFOUND)
+ ret = ENOENT;
+ goto err;
+ }
+
+ if (data.size != sizeof(db_pgno_t)) {
+ EPRINT((env, DB_STR("0559",
+ "Subdatabase entry of invalid size")));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+
+ memcpy(&meta_pgno, data.data, data.size);
+
+ /*
+ * Subdatabase meta pgnos are stored in network byte
+ * order for cross-endian compatibility. Swap if appropriate.
+ */
+ DB_NTOHL_SWAP(env, &meta_pgno);
+
+ if ((ret = __memp_fget(mpf,
+ &meta_pgno, vdp->thread_info, NULL, 0, &h)) != 0)
+ goto err;
+
+ if ((ret = __db_vrfy_pgset(env,
+ vdp->thread_info, dbp->pgsize, &pgset)) != 0)
+ goto err;
+
+ switch (TYPE(h)) {
+ case P_BTREEMETA:
+ btmeta = (BTMETA *)h;
+ if (F_ISSET(&btmeta->dbmeta, BTM_RECNO)) {
+ /* Recnos have no order to check. */
+ ret = 0;
+ goto err;
+ }
+ if ((ret =
+ __db_meta2pgset(dbp, vdp, meta_pgno, flags, pgset)) != 0)
+ goto err;
+ if ((ret = __db_cursor_int(pgset, NULL, vdp->txn, dbp->type,
+ PGNO_INVALID, 0, DB_LOCK_INVALIDID, &pgsc)) != 0)
+ goto err;
+ while ((ret = __db_vrfy_pgset_next(pgsc, &p)) == 0) {
+ if ((ret = __memp_fget(mpf, &p,
+ vdp->thread_info, NULL, 0, &currpg)) != 0)
+ goto err;
+ if ((ret = __bam_vrfy_itemorder(dbp, NULL,
+ vdp->thread_info, currpg, p, NUM_ENT(currpg), 1,
+ F_ISSET(&btmeta->dbmeta, BTM_DUP), flags)) != 0)
+ goto err;
+ if ((ret = __memp_fput(mpf,
+ vdp->thread_info, currpg, dbp->priority)) != 0)
+ goto err;
+ currpg = NULL;
+ }
+
+ /*
+ * The normal exit condition for the loop above is DB_NOTFOUND.
+ * If we see that, zero it and continue on to cleanup.
+ * Otherwise, it's a real error and will be returned.
+ */
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+ break;
+ case P_HASHMETA:
+ hmeta = (HMETA *)h;
+ h_internal = (HASH *)dbp->h_internal;
+ /*
+ * Make sure h_charkey is right.
+ */
+ if (h_internal == NULL) {
+ EPRINT((env, DB_STR_A("0560",
+ "Page %lu: DB->h_internal field is NULL", "%lu"),
+ (u_long)meta_pgno));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+ if (h_internal->h_hash == NULL)
+ h_internal->h_hash = hmeta->dbmeta.version < 5
+ ? __ham_func4 : __ham_func5;
+ if (hmeta->h_charkey !=
+ h_internal->h_hash(dbp, CHARKEY, sizeof(CHARKEY))) {
+ EPRINT((env, DB_STR_A("0561",
+ "Page %lu: incorrect hash function for database",
+ "%lu"), (u_long)meta_pgno));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+
+ /*
+		 * For each bucket, verify hashing on each page in the
+ * corresponding chain of pages.
+ */
+ if ((ret = __db_cursor_int(dbp, NULL, vdp->txn, dbp->type,
+ PGNO_INVALID, 0, DB_LOCK_INVALIDID, &pgsc)) != 0)
+ goto err;
+ for (bucket = 0; bucket <= hmeta->max_bucket; bucket++) {
+ pgno = BS_TO_PAGE(bucket, hmeta->spares);
+ while (pgno != PGNO_INVALID) {
+ if ((ret = __memp_fget(mpf, &pgno,
+ vdp->thread_info, NULL, 0, &currpg)) != 0)
+ goto err;
+ if ((ret = __ham_vrfy_hashing(pgsc,
+ NUM_ENT(currpg), hmeta, bucket, pgno,
+ flags, h_internal->h_hash)) != 0)
+ goto err;
+ pgno = NEXT_PGNO(currpg);
+ if ((ret = __memp_fput(mpf, vdp->thread_info,
+ currpg, dbp->priority)) != 0)
+ goto err;
+ currpg = NULL;
+ }
+ }
+ break;
+ default:
+ EPRINT((env, DB_STR_A("0562",
+ "Page %lu: database metapage of bad type %lu",
+ "%lu %lu"), (u_long)meta_pgno, (u_long)TYPE(h)));
+ ret = DB_VERIFY_BAD;
+ break;
+ }
+
+err: if (pgsc != NULL && (t_ret = __dbc_close(pgsc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (pgset != NULL &&
+ (t_ret = __db_close(pgset, NULL, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ if (h != NULL && (t_ret = __memp_fput(mpf,
+ vdp->thread_info, h, dbp->priority)) != 0)
+ ret = t_ret;
+ if (currpg != NULL &&
+ (t_ret = __memp_fput(mpf,
+ vdp->thread_info, currpg, dbp->priority)) != 0)
+ ret = t_ret;
+ if ((t_ret = __db_close(mdbp, NULL, 0)) != 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __db_salvage_pg --
+ * Walk through a page, salvaging all likely or plausible (w/
+ * DB_AGGRESSIVE) key/data pairs and marking seen pages in vdp.
+ *
+ * PUBLIC: int __db_salvage_pg __P((DB *, VRFY_DBINFO *, db_pgno_t,
+ * PUBLIC: PAGE *, void *, int (*)(void *, const void *), u_int32_t));
+ */
+int
+__db_salvage_pg(dbp, vdp, pgno, h, handle, callback, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ db_pgno_t pgno;
+ PAGE *h;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ u_int32_t flags;
+{
+ ENV *env;
+ VRFY_PAGEINFO *pip;
+ int keyflag, ret, t_ret;
+
+ env = dbp->env;
+ DB_ASSERT(env, LF_ISSET(DB_SALVAGE));
+
+ /*
+ * !!!
+ * We dump record numbers when salvaging Queue databases, but not for
+ * immutable Recno databases. The problem is we can't figure out the
+ * record number from the database page in the Recno case, while the
+ * offset in the file is sufficient for Queue.
+ */
+ keyflag = 0;
+
+ /* If we got this page in the subdb pass, we can safely skip it. */
+ if (__db_salvage_isdone(vdp, pgno))
+ return (0);
+
+ switch (TYPE(h)) {
+ case P_BTREEMETA:
+ ret = __bam_vrfy_meta(dbp, vdp, (BTMETA *)h, pgno, flags);
+ break;
+ case P_HASH:
+ case P_HASH_UNSORTED:
+ case P_HEAP:
+ case P_LBTREE:
+ case P_QAMDATA:
+ return (__db_salvage_leaf(dbp,
+ vdp, pgno, h, handle, callback, flags));
+ case P_HASHMETA:
+ ret = __ham_vrfy_meta(dbp, vdp, (HMETA *)h, pgno, flags);
+ break;
+ case P_HEAPMETA:
+ ret = __heap_vrfy_meta(dbp, vdp, (HEAPMETA *)h, pgno, flags);
+ break;
+ case P_IBTREE:
+ /*
+ * We need to mark any overflow keys on internal pages as seen,
+ * so we don't print them out in __db_salvage_unknowns. But if
+ * we're an upgraded database, a P_LBTREE page may very well
+ * have a reference to the same overflow pages (this practice
+ * stopped somewhere around db4.5). To give P_LBTREEs a chance
+ * to print out any keys on shared pages, mark the page now and
+ * deal with it at the end.
+ */
+ return (__db_salvage_markneeded(vdp, pgno, SALVAGE_IBTREE));
+ case P_IHEAP:
+ /*
+ * There's nothing to salvage from heap region pages. Just mark
+ * that we've seen the page.
+ */
+ return (__db_salvage_markdone(vdp, pgno));
+ case P_LDUP:
+ return (__db_salvage_markneeded(vdp, pgno, SALVAGE_LDUP));
+ case P_LRECNO:
+ /*
+ * Recno leaves are tough, because the leaf could be (1) a dup
+ * page, or it could be (2) a regular database leaf page.
+ * Fortunately, RECNO databases are not allowed to have
+ * duplicates.
+ *
+ * If there are no subdatabases, dump the page immediately if
+ * it's a leaf in a RECNO database, otherwise wait and hopefully
+ * it will be dumped by the leaf page that refers to it,
+ * otherwise we'll get it with the unknowns.
+ *
+ * If there are subdatabases, there might be mixed types and
+ * dbp->type can't be trusted. We'll only get here after
+ * salvaging each database, though, so salvaging this page
+ * immediately isn't important. If this page is a dup, it might
+ * get salvaged later on, otherwise the unknowns pass will pick
+ * it up. Note that SALVAGE_HASSUBDBS won't get set if we're
+ * salvaging aggressively.
+ *
+ * If we're salvaging aggressively, we don't know whether or not
+ * there's subdatabases, so we wait on all recno pages.
+ */
+ if (!LF_ISSET(DB_AGGRESSIVE) &&
+ !F_ISSET(vdp, SALVAGE_HASSUBDBS) && dbp->type == DB_RECNO)
+ return (__db_salvage_leaf(dbp,
+ vdp, pgno, h, handle, callback, flags));
+ return (__db_salvage_markneeded(vdp, pgno, SALVAGE_LRECNODUP));
+ case P_OVERFLOW:
+ return (__db_salvage_markneeded(vdp, pgno, SALVAGE_OVERFLOW));
+ case P_QAMMETA:
+ keyflag = 1;
+ ret = __qam_vrfy_meta(dbp, vdp, (QMETA *)h, pgno, flags);
+ break;
+ case P_INVALID:
+ case P_IRECNO:
+ case __P_DUPLICATE:
+ default:
+ /*
+ * There's no need to display an error, the page type was
+ * already checked and reported on.
+ */
+ return (0);
+ }
+ if (ret != 0)
+ return (ret);
+
+ /*
+ * We have to display the dump header if it's a metadata page. It's
+ * our last chance as the page was marked "seen" in the vrfy routine,
+ * and we won't see the page again. We don't display headers for
+ * the first database in a multi-database file, that database simply
+ * contains a list of subdatabases.
+ */
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+ if (!F_ISSET(pip, VRFY_HAS_SUBDBS) && !LF_ISSET(DB_VERIFY_PARTITION))
+ ret = __db_prheader(
+ dbp, NULL, 0, keyflag, handle, callback, vdp, pgno);
+ if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __db_salvage_leaf --
+ * Walk through a leaf, salvaging all likely key/data pairs and marking
+ * seen pages in vdp.
+ *
+ * PUBLIC: int __db_salvage_leaf __P((DB *, VRFY_DBINFO *, db_pgno_t,
+ * PUBLIC: PAGE *, void *, int (*)(void *, const void *), u_int32_t));
+ */
+int
+__db_salvage_leaf(dbp, vdp, pgno, h, handle, callback, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ db_pgno_t pgno;
+ PAGE *h;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ u_int32_t flags;
+{
+ ENV *env;
+
+ env = dbp->env;
+ DB_ASSERT(env, LF_ISSET(DB_SALVAGE));
+
+ /* If we got this page in the subdb pass, we can safely skip it. */
+ if (__db_salvage_isdone(vdp, pgno))
+ return (0);
+
+ switch (TYPE(h)) {
+ case P_HASH_UNSORTED:
+ case P_HASH:
+ return (__ham_salvage(dbp, vdp,
+ pgno, h, handle, callback, flags));
+ case P_HEAP:
+ return (__heap_salvage(dbp, vdp,
+ pgno, h, handle, callback, flags));
+ case P_LBTREE:
+ case P_LRECNO:
+ return (__bam_salvage(dbp, vdp,
+ pgno, TYPE(h), h, handle, callback, NULL, flags));
+ case P_QAMDATA:
+ return (__qam_salvage(dbp, vdp,
+ pgno, h, handle, callback, flags));
+ default:
+ /*
+ * There's no need to display an error, the page type was
+ * already checked and reported on.
+ */
+ return (0);
+ }
+}
+
+/*
+ * __db_salvage_unknowns --
+ * Walk through the salvager database, printing with key "UNKNOWN"
+ * any pages we haven't dealt with.
+ */
+static int
+__db_salvage_unknowns(dbp, vdp, handle, callback, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ u_int32_t flags;
+{
+ DBC *dbc;
+ DBT unkdbt, key, *dbt;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ PAGE *h;
+ db_pgno_t pgno;
+ u_int32_t pgtype, ovfl_bufsz, tmp_flags;
+ int ret, t_ret;
+ void *ovflbuf;
+
+ dbc = NULL;
+ env = dbp->env;
+ mpf = dbp->mpf;
+
+ DB_INIT_DBT(unkdbt, "UNKNOWN", sizeof("UNKNOWN") - 1);
+
+ if ((ret = __os_malloc(env, dbp->pgsize, &ovflbuf)) != 0)
+ return (ret);
+ ovfl_bufsz = dbp->pgsize;
+
+ /*
+ * We make two passes -- in the first pass, skip SALVAGE_OVERFLOW
+ * pages, because they may be referenced by the standard database
+ * pages that we're resolving.
+ */
+ while ((t_ret =
+ __db_salvage_getnext(vdp, &dbc, &pgno, &pgtype, 1)) == 0) {
+ if ((t_ret = __memp_fget(mpf,
+ &pgno, vdp->thread_info, NULL, 0, &h)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ continue;
+ }
+
+ dbt = NULL;
+ tmp_flags = 0;
+ switch (pgtype) {
+ case SALVAGE_LDUP:
+ case SALVAGE_LRECNODUP:
+ dbt = &unkdbt;
+ tmp_flags = DB_SA_UNKNOWNKEY;
+ /* FALLTHROUGH */
+ case SALVAGE_IBTREE:
+ case SALVAGE_LBTREE:
+ case SALVAGE_LRECNO:
+ if ((t_ret = __bam_salvage(
+ dbp, vdp, pgno, pgtype, h, handle,
+ callback, dbt, tmp_flags | flags)) != 0 && ret == 0)
+ ret = t_ret;
+ break;
+ case SALVAGE_OVERFLOW:
+ DB_ASSERT(env, 0); /* Shouldn't ever happen. */
+ break;
+ case SALVAGE_HASH:
+ if ((t_ret = __ham_salvage(dbp, vdp,
+ pgno, h, handle, callback, flags)) != 0 && ret == 0)
+ ret = t_ret;
+ break;
+ case SALVAGE_INVALID:
+ case SALVAGE_IGNORE:
+ default:
+ /*
+ * Shouldn't happen, but if it does, just do what the
+ * nice man says.
+ */
+ DB_ASSERT(env, 0);
+ break;
+ }
+ if ((t_ret = __memp_fput(mpf,
+ vdp->thread_info, h, dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+
+ /* We should have reached the end of the database. */
+ if (t_ret == DB_NOTFOUND)
+ t_ret = 0;
+ if (t_ret != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Re-open the cursor so we traverse the database again. */
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ dbc = NULL;
+
+ /* Now, deal with any remaining overflow pages. */
+ while ((t_ret =
+ __db_salvage_getnext(vdp, &dbc, &pgno, &pgtype, 0)) == 0) {
+ if ((t_ret = __memp_fget(mpf,
+ &pgno, vdp->thread_info, NULL, 0, &h)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ continue;
+ }
+
+ switch (pgtype) {
+ case SALVAGE_OVERFLOW:
+ /*
+ * XXX:
+ * This may generate multiple "UNKNOWN" keys in
+ * a database with no dups. What to do?
+ */
+ if ((t_ret = __db_safe_goff(dbp, vdp,
+ pgno, &key, &ovflbuf, &ovfl_bufsz, flags)) != 0 ||
+ ((vdp->type == DB_BTREE || vdp->type == DB_HASH) &&
+ (t_ret = __db_vrfy_prdbt(&unkdbt,
+ 0, " ", handle, callback, 0, 0, vdp)) != 0) ||
+ (t_ret = __db_vrfy_prdbt(
+ &key, 0, " ", handle, callback, 0, 0, vdp)) != 0)
+ if (ret == 0)
+ ret = t_ret;
+ break;
+ default:
+ DB_ASSERT(env, 0); /* Shouldn't ever happen. */
+ break;
+ }
+ if ((t_ret = __memp_fput(mpf,
+ vdp->thread_info, h, dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+
+ /* We should have reached the end of the database. */
+ if (t_ret == DB_NOTFOUND)
+ t_ret = 0;
+ if (t_ret != 0 && ret == 0)
+ ret = t_ret;
+
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ __os_free(env, ovflbuf);
+
+ return (ret);
+}
+
+/*
+ * Offset of the ith inp array entry, which we can compare to the offset
+ * the entry stores.
+ */
+#define INP_OFFSET(dbp, h, i) \
+ ((db_indx_t)((u_int8_t *)((P_INP(dbp,(h))) + (i)) - (u_int8_t *)(h)))
+
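+/*
+ * For illustration (assuming the usual 26-byte page header, SIZEOF_PAGE):
+ * entry i's inp slot lives at byte offset SIZEOF_PAGE + i * sizeof(db_indx_t)
+ * from the start of the page, which is the value INP_OFFSET computes. A
+ * sane entry must point past its own slot, which is exactly the
+ * "offset <= INP_OFFSET(dbp, h, i)" check in __db_vrfy_inpitem below.
+ */
+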
+/*
+ * __db_vrfy_inpitem --
+ * Verify that a single entry in the inp array is sane, and update
+ * the high water mark and current item offset. (The former of these is
+ * used for state information between calls, and is required; it must
+ * be initialized to the pagesize before the first call.)
+ *
+ * Returns DB_VERIFY_FATAL if inp has collided with the data,
+ * since verification can't continue from there; returns DB_VERIFY_BAD
+ * if anything else is wrong.
+ *
+ * PUBLIC: int __db_vrfy_inpitem __P((DB *, PAGE *,
+ * PUBLIC: db_pgno_t, u_int32_t, int, u_int32_t, u_int32_t *, u_int32_t *));
+ */
+int
+__db_vrfy_inpitem(dbp, h, pgno, i, is_btree, flags, himarkp, offsetp)
+ DB *dbp;
+ PAGE *h;
+ db_pgno_t pgno;
+ u_int32_t i;
+ int is_btree;
+ u_int32_t flags, *himarkp, *offsetp;
+{
+ BKEYDATA *bk;
+ ENV *env;
+ db_indx_t *inp, offset, len;
+
+ env = dbp->env;
+
+ DB_ASSERT(env, himarkp != NULL);
+ inp = P_INP(dbp, h);
+
+ /*
+ * Check that the inp array, which grows from the beginning of the
+ * page forward, has not collided with the data, which grows from the
+ * end of the page backward.
+ */
+ if (inp + i >= (db_indx_t *)((u_int8_t *)h + *himarkp)) {
+ /* We've collided with the data. We need to bail. */
+ EPRINT((env, DB_STR_A("0563",
+ "Page %lu: entries listing %lu overlaps data",
+ "%lu %lu"), (u_long)pgno, (u_long)i));
+ return (DB_VERIFY_FATAL);
+ }
+
+ offset = inp[i];
+
+ /*
+ * Check that the item offset is reasonable: it points somewhere
+ * after the inp array and before the end of the page.
+ */
+ if (offset <= INP_OFFSET(dbp, h, i) || offset >= dbp->pgsize) {
+ EPRINT((env, DB_STR_A("0564",
+ "Page %lu: bad offset %lu at page index %lu",
+ "%lu %lu %lu"), (u_long)pgno, (u_long)offset, (u_long)i));
+ return (DB_VERIFY_BAD);
+ }
+
+ /* Update the high-water mark (what HOFFSET should be) */
+ if (offset < *himarkp)
+ *himarkp = offset;
+
+ if (is_btree) {
+ /*
+ * Check alignment; if it's unaligned, it's unsafe to
+ * manipulate this item.
+ */
+ if (offset != DB_ALIGN(offset, sizeof(u_int32_t))) {
+ EPRINT((env, DB_STR_A("0565",
+ "Page %lu: unaligned offset %lu at page index %lu",
+ "%lu %lu %lu"), (u_long)pgno, (u_long)offset,
+ (u_long)i));
+ return (DB_VERIFY_BAD);
+ }
+
+ /*
+ * Check that the item length remains on-page.
+ */
+ bk = GET_BKEYDATA(dbp, h, i);
+
+ /*
+ * We need to verify the type of the item here;
+ * we can't simply assume that it will be one of the
+ * expected three. If it's not a recognizable type,
+ * it can't be considered to have a verifiable
+ * length, so it's not possible to certify it as safe.
+ */
+ switch (B_TYPE(bk->type)) {
+ case B_KEYDATA:
+ len = bk->len;
+ break;
+ case B_DUPLICATE:
+ case B_OVERFLOW:
+ len = BOVERFLOW_SIZE;
+ break;
+ default:
+ EPRINT((env, DB_STR_A("0566",
+ "Page %lu: item %lu of unrecognizable type",
+ "%lu %lu"), (u_long)pgno, (u_long)i));
+ return (DB_VERIFY_BAD);
+ }
+
+ if ((size_t)(offset + len) > dbp->pgsize) {
+ EPRINT((env, DB_STR_A("0567",
+ "Page %lu: item %lu extends past page boundary",
+ "%lu %lu"), (u_long)pgno, (u_long)i));
+ return (DB_VERIFY_BAD);
+ }
+ }
+
+ if (offsetp != NULL)
+ *offsetp = offset;
+ return (0);
+}
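+
+/*
+ * A minimal caller sketch (hypothetical, for illustration only): himark
+ * must start at the page size and is ratcheted downward as entries are
+ * checked, so a final comparison against HOFFSET(h) can catch
+ * unaccounted-for space:
+ *
+ *	himark = dbp->pgsize;
+ *	for (i = 0; i < NUM_ENT(h); i++)
+ *		if ((ret = __db_vrfy_inpitem(dbp,
+ *		    h, pgno, i, 1, flags, &himark, NULL)) != 0)
+ *			return (ret);
+ */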
+
+/*
+ * __db_vrfy_duptype --
+ * Given a page number and a set of flags to __bam_vrfy_subtree,
+ * verify that the dup tree type is correct--i.e., it's a recno
+ * if DUPSORT is not set and a btree if it is.
+ *
+ * PUBLIC: int __db_vrfy_duptype
+ * PUBLIC: __P((DB *, VRFY_DBINFO *, db_pgno_t, u_int32_t));
+ */
+int
+__db_vrfy_duptype(dbp, vdp, pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ ENV *env;
+ VRFY_PAGEINFO *pip;
+ int ret, isbad;
+
+ env = dbp->env;
+ isbad = 0;
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+
+ switch (pip->type) {
+ case P_IBTREE:
+ case P_LDUP:
+ if (!LF_ISSET(DB_ST_DUPSORT)) {
+ EPRINT((env, DB_STR_A("0568",
+ "Page %lu: sorted duplicate set in unsorted-dup database",
+ "%lu"), (u_long)pgno));
+ isbad = 1;
+ }
+ break;
+ case P_IRECNO:
+ case P_LRECNO:
+ if (LF_ISSET(DB_ST_DUPSORT)) {
+ EPRINT((env, DB_STR_A("0569",
+ "Page %lu: unsorted duplicate set in sorted-dup database",
+ "%lu"), (u_long)pgno));
+ isbad = 1;
+ }
+ break;
+ default:
+ /*
+ * If the page is entirely zeroed, its pip->type will be a lie
+ * (we assumed it was a hash page, as they're allowed to be
+ * zeroed); handle this case specially.
+ */
+ if (F_ISSET(pip, VRFY_IS_ALLZEROES))
+ ZEROPG_ERR_PRINT(env, pgno, DB_STR_P("duplicate page"));
+ else
+ EPRINT((env, DB_STR_A("0570",
+ "Page %lu: duplicate page of inappropriate type %lu",
+ "%lu %lu"), (u_long)pgno, (u_long)pip->type));
+ isbad = 1;
+ break;
+ }
+
+ if ((ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0)
+ return (ret);
+ return (isbad == 1 ? DB_VERIFY_BAD : 0);
+}
+
+/*
+ * __db_salvage_duptree --
+ * Attempt to salvage a given duplicate tree, given its alleged root.
+ *
+ * The key that corresponds to this dup set has been passed to us
+ * in DBT *key. Because data items follow keys, though, it has been
+ * printed once already.
+ *
+ * The basic idea here is that pgno ought to be a P_LDUP, a P_LRECNO, a
+ * P_IBTREE, or a P_IRECNO. If it's an internal page, use the verifier
+ * functions to make sure it's safe; if it's not, we simply bail and the
+ * data will have to be printed with no key later on. If it is safe,
+ * recurse on each of its children.
+ *
+ * Whether or not it's safe, if it's a leaf page, __bam_salvage it.
+ *
+ * At all times, use the DB hanging off vdp to mark and check what we've
+ * done, so each page gets printed exactly once and we don't get caught
+ * in any cycles.
+ *
+ * PUBLIC: int __db_salvage_duptree __P((DB *, VRFY_DBINFO *, db_pgno_t,
+ * PUBLIC: DBT *, void *, int (*)(void *, const void *), u_int32_t));
+ */
+int
+__db_salvage_duptree(dbp, vdp, pgno, key, handle, callback, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ db_pgno_t pgno;
+ DBT *key;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ u_int32_t flags;
+{
+ DB_MPOOLFILE *mpf;
+ PAGE *h;
+ int ret, t_ret;
+
+ mpf = dbp->mpf;
+
+ if (pgno == PGNO_INVALID || !IS_VALID_PGNO(pgno))
+ return (DB_VERIFY_BAD);
+
+ /* We have a plausible page. Try it. */
+ if ((ret = __memp_fget(mpf, &pgno, vdp->thread_info, NULL, 0, &h)) != 0)
+ return (ret);
+
+ switch (TYPE(h)) {
+ case P_IBTREE:
+ case P_IRECNO:
+ if ((ret = __db_vrfy_common(dbp, vdp, h, pgno, flags)) != 0)
+ goto err;
+ if ((ret = __bam_vrfy(dbp,
+ vdp, h, pgno, flags | DB_NOORDERCHK)) != 0 ||
+ (ret = __db_salvage_markdone(vdp, pgno)) != 0)
+ goto err;
+ /*
+ * We have a known-healthy internal page. Walk it.
+ */
+ if ((ret = __bam_salvage_walkdupint(dbp, vdp, h, key,
+ handle, callback, flags)) != 0)
+ goto err;
+ break;
+ case P_LRECNO:
+ case P_LDUP:
+ if ((ret = __bam_salvage(dbp,
+ vdp, pgno, TYPE(h), h, handle, callback, key, flags)) != 0)
+ goto err;
+ break;
+ default:
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+
+err: if ((t_ret = __memp_fput(mpf,
+ vdp->thread_info, h, dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __db_salvage_all --
+ * Salvage only the leaves we find by walking the tree. If we have subdbs,
+ * salvage each of them individually.
+ */
+static int
+__db_salvage_all(dbp, vdp, handle, callback, flags, hassubsp)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ u_int32_t flags;
+ int *hassubsp;
+{
+ DB *pgset;
+ DBC *pgsc;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ PAGE *h;
+ VRFY_PAGEINFO *pip;
+ db_pgno_t p, meta_pgno;
+ int ret, t_ret;
+
+ *hassubsp = 0;
+
+ env = dbp->env;
+ pgset = NULL;
+ pgsc = NULL;
+ mpf = dbp->mpf;
+ h = NULL;
+ pip = NULL;
+ ret = 0;
+
+ /*
+ * Check to make sure the page is OK and find out if it contains
+ * subdatabases.
+ */
+ meta_pgno = PGNO_BASE_MD;
+ if ((t_ret = __memp_fget(mpf,
+ &meta_pgno, vdp->thread_info, NULL, 0, &h)) == 0 &&
+ (t_ret = __db_vrfy_common(dbp, vdp, h, PGNO_BASE_MD, flags)) == 0 &&
+ (t_ret = __db_salvage_pg(
+ dbp, vdp, PGNO_BASE_MD, h, handle, callback, flags)) == 0 &&
+ (t_ret = __db_vrfy_getpageinfo(vdp, 0, &pip)) == 0)
+ if (F_ISSET(pip, VRFY_HAS_SUBDBS))
+ *hassubsp = 1;
+ if (pip != NULL &&
+ (t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+ if (h != NULL) {
+ if ((t_ret = __memp_fput(mpf,
+ vdp->thread_info, h, dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ h = NULL;
+ }
+ if (ret != 0)
+ return (ret);
+
+ /* Without subdatabases, we can just dump from the meta pgno. */
+ if (*hassubsp == 0)
+ return (__db_salvage(dbp,
+ vdp, PGNO_BASE_MD, handle, callback, flags));
+
+ /*
+ * We have subdbs. Try to crack them.
+ *
+ * To do so, get a set of leaf pages in the master database, and then
+ * walk each of the valid ones, salvaging subdbs as we go. If any
+ * prove invalid, just drop them; we'll pick them up on a later pass.
+ */
+ if ((ret = __db_vrfy_pgset(env,
+ vdp->thread_info, dbp->pgsize, &pgset)) != 0)
+ goto err;
+ if ((ret = __db_meta2pgset(dbp, vdp, PGNO_BASE_MD, flags, pgset)) != 0)
+ goto err;
+ if ((ret = __db_cursor(pgset, vdp->thread_info, NULL, &pgsc, 0)) != 0)
+ goto err;
+ while ((t_ret = __db_vrfy_pgset_next(pgsc, &p)) == 0) {
+ if ((t_ret = __memp_fget(mpf,
+ &p, vdp->thread_info, NULL, 0, &h)) == 0 &&
+ (t_ret = __db_vrfy_common(dbp, vdp, h, p, flags)) == 0 &&
+ (t_ret =
+ __bam_vrfy(dbp, vdp, h, p, flags | DB_NOORDERCHK)) == 0)
+ t_ret = __db_salvage_subdbpg(
+ dbp, vdp, h, handle, callback, flags);
+ if (t_ret != 0 && ret == 0)
+ ret = t_ret;
+ if (h != NULL) {
+ if ((t_ret = __memp_fput(mpf, vdp->thread_info,
+ h, dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ h = NULL;
+ }
+ }
+
+ if (t_ret != DB_NOTFOUND && ret == 0)
+ ret = t_ret;
+
+err: if (pgsc != NULL && (t_ret = __dbc_close(pgsc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (pgset != NULL &&
+ (t_ret = __db_close(pgset, NULL, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ if (h != NULL &&
+ (t_ret = __memp_fput(mpf,
+ vdp->thread_info, h, dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __db_salvage_subdbpg --
+ * Given a known-good leaf page in the master database, salvage all
+ * leaf pages corresponding to each subdb.
+ */
+static int
+__db_salvage_subdbpg(dbp, vdp, master, handle, callback, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ PAGE *master;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ u_int32_t flags;
+{
+ BKEYDATA *bkkey, *bkdata;
+ BOVERFLOW *bo;
+ DB *pgset;
+ DBC *pgsc;
+ DBT key;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ PAGE *subpg;
+ db_indx_t i;
+ db_pgno_t meta_pgno;
+ int ret, err_ret, t_ret;
+ char *subdbname;
+ u_int32_t ovfl_bufsz;
+
+ env = dbp->env;
+ mpf = dbp->mpf;
+ ret = err_ret = 0;
+ subdbname = NULL;
+ pgsc = NULL;
+ pgset = NULL;
+ ovfl_bufsz = 0;
+
+ /*
+ * For each entry, get and salvage the set of pages
+ * corresponding to that entry.
+ */
+ for (i = 0; i < NUM_ENT(master); i += P_INDX) {
+ bkkey = GET_BKEYDATA(dbp, master, i);
+ bkdata = GET_BKEYDATA(dbp, master, i + O_INDX);
+
+ /* Get the subdatabase name. */
+ if (B_TYPE(bkkey->type) == B_OVERFLOW) {
+ /*
+ * We can, in principle anyway, have a subdb
+ * name so long it overflows. Ick.
+ */
+ bo = (BOVERFLOW *)bkkey;
+ if ((ret = __db_safe_goff(dbp, vdp, bo->pgno,
+ &key, &subdbname, &ovfl_bufsz, flags)) != 0) {
+ err_ret = DB_VERIFY_BAD;
+ continue;
+ }
+
+ /* Nul-terminate it. */
+ if (ovfl_bufsz < key.size + 1) {
+ if ((ret = __os_realloc(env,
+ key.size + 1, &subdbname)) != 0)
+ goto err;
+ ovfl_bufsz = key.size + 1;
+ }
+ subdbname[key.size] = '\0';
+ } else if (B_TYPE(bkkey->type) == B_KEYDATA) {
+ if (ovfl_bufsz < (u_int32_t)bkkey->len + 1) {
+ if ((ret = __os_realloc(env,
+ bkkey->len + 1, &subdbname)) != 0)
+ goto err;
+ ovfl_bufsz = bkkey->len + 1;
+ }
+ DB_ASSERT(env, subdbname != NULL);
+ memcpy(subdbname, bkkey->data, bkkey->len);
+ subdbname[bkkey->len] = '\0';
+ }
+
+ /* Get the corresponding pgno. */
+ if (bkdata->len != sizeof(db_pgno_t)) {
+ err_ret = DB_VERIFY_BAD;
+ continue;
+ }
+ memcpy(&meta_pgno,
+ (db_pgno_t *)bkdata->data, sizeof(db_pgno_t));
+
+ /*
+ * Subdatabase meta pgnos are stored in network byte
+ * order for cross-endian compatibility. Swap if appropriate.
+ */
+ DB_NTOHL_SWAP(env, &meta_pgno);
+
+ /* If we can't get the subdb meta page, just skip the subdb. */
+ if (!IS_VALID_PGNO(meta_pgno) || (ret = __memp_fget(mpf,
+ &meta_pgno, vdp->thread_info, NULL, 0, &subpg)) != 0) {
+ err_ret = ret;
+ continue;
+ }
+
+ /*
+ * Verify the subdatabase meta page. This has two functions.
+ * First, if it's bad, we have no choice but to skip the subdb
+ * and let the pages just get printed on a later pass. Second,
+ * the access-method-specific meta verification routines record
+ * the various state info (such as the presence of dups)
+ * that we need for __db_prheader().
+ */
+ if ((ret =
+ __db_vrfy_common(dbp, vdp, subpg, meta_pgno, flags)) != 0) {
+ err_ret = ret;
+ (void)__memp_fput(mpf,
+ vdp->thread_info, subpg, dbp->priority);
+ continue;
+ }
+ switch (TYPE(subpg)) {
+ case P_BTREEMETA:
+ if ((ret = __bam_vrfy_meta(dbp,
+ vdp, (BTMETA *)subpg, meta_pgno, flags)) != 0) {
+ err_ret = ret;
+ (void)__memp_fput(mpf,
+ vdp->thread_info, subpg, dbp->priority);
+ continue;
+ }
+ break;
+ case P_HASHMETA:
+ if ((ret = __ham_vrfy_meta(dbp,
+ vdp, (HMETA *)subpg, meta_pgno, flags)) != 0) {
+ err_ret = ret;
+ (void)__memp_fput(mpf,
+ vdp->thread_info, subpg, dbp->priority);
+ continue;
+ }
+ break;
+ default:
+ /* This isn't an appropriate page; skip this subdb. */
+ err_ret = DB_VERIFY_BAD;
+ (void)__memp_fput(mpf,
+ vdp->thread_info, subpg, dbp->priority);
+ continue;
+ }
+
+ if ((ret = __memp_fput(mpf,
+ vdp->thread_info, subpg, dbp->priority)) != 0) {
+ err_ret = ret;
+ continue;
+ }
+
+ /* Print a subdatabase header. */
+ if ((ret = __db_prheader(dbp,
+ subdbname, 0, 0, handle, callback, vdp, meta_pgno)) != 0)
+ goto err;
+
+ /* Salvage meta_pgno's tree. */
+ if ((ret = __db_salvage(dbp,
+ vdp, meta_pgno, handle, callback, flags)) != 0)
+ err_ret = ret;
+
+ /* Print a subdatabase footer. */
+ if ((ret = __db_prfooter(handle, callback)) != 0)
+ goto err;
+ }
+
+err: if (subdbname != NULL)
+ __os_free(env, subdbname);
+
+ if (pgsc != NULL && (t_ret = __dbc_close(pgsc)) != 0)
+ ret = t_ret;
+
+ if (pgset != NULL && (t_ret = __db_close(pgset, NULL, 0)) != 0)
+ ret = t_ret;
+
+ if ((t_ret = __db_salvage_markdone(vdp, PGNO(master))) != 0)
+ return (t_ret);
+
+ return ((err_ret != 0) ? err_ret : ret);
+}
+
+/*
+ * __db_salvage --
+ * Given a meta page number, salvage all data from leaf pages found by
+ * walking the meta page's tree.
+ */
+static int
+__db_salvage(dbp, vdp, meta_pgno, handle, callback, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ db_pgno_t meta_pgno;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ u_int32_t flags;
+
+{
+ DB *pgset;
+ DBC *dbc, *pgsc;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ PAGE *subpg;
+ db_pgno_t p;
+ int err_ret, ret, t_ret;
+
+ env = dbp->env;
+ mpf = dbp->mpf;
+ err_ret = ret = t_ret = 0;
+ pgsc = NULL;
+ pgset = NULL;
+ dbc = NULL;
+
+ if ((ret = __db_vrfy_pgset(env,
+ vdp->thread_info, dbp->pgsize, &pgset)) != 0)
+ goto err;
+
+ /* Get all page numbers referenced from this meta page. */
+ if ((ret = __db_meta2pgset(dbp, vdp, meta_pgno,
+ flags, pgset)) != 0) {
+ err_ret = ret;
+ goto err;
+ }
+
+ if ((ret = __db_cursor(pgset,
+ vdp->thread_info, NULL, &pgsc, 0)) != 0)
+ goto err;
+
+ if (dbp->type == DB_QUEUE &&
+ (ret = __db_cursor(dbp, vdp->thread_info, NULL, &dbc, 0)) != 0)
+ goto err;
+
+ /* Salvage every page in pgset. */
+ while ((ret = __db_vrfy_pgset_next(pgsc, &p)) == 0) {
+ if (dbp->type == DB_QUEUE) {
+#ifdef HAVE_QUEUE
+ ret = __qam_fget(dbc, &p, 0, &subpg);
+#else
+ ret = __db_no_queue_am(env);
+#endif
+ /*
+ * Don't report an error for pages not found in a queue.
+ * The pgset is a best guess; it doesn't know about
+ * deleted extents, which is what leads to this error.
+ */
+ if (ret == ENOENT || ret == DB_PAGE_NOTFOUND)
+ continue;
+ } else
+ ret = __memp_fget(mpf,
+ &p, vdp->thread_info, NULL, 0, &subpg);
+ if (ret != 0) {
+ err_ret = ret;
+ continue;
+ }
+
+ if ((ret = __db_salvage_pg(dbp, vdp, p, subpg,
+ handle, callback, flags)) != 0)
+ err_ret = ret;
+
+ if (dbp->type == DB_QUEUE)
+#ifdef HAVE_QUEUE
+ ret = __qam_fput(dbc, p, subpg, dbp->priority);
+#else
+ ret = __db_no_queue_am(env);
+#endif
+ else
+ ret = __memp_fput(mpf,
+ vdp->thread_info, subpg, dbp->priority);
+ if (ret != 0)
+ err_ret = ret;
+ }
+
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+
+err:
+ if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0)
+ ret = t_ret;
+ if (pgsc != NULL && (t_ret = __dbc_close(pgsc)) != 0)
+ ret = t_ret;
+ if (pgset != NULL && (t_ret = __db_close(pgset, NULL, 0)) != 0)
+ ret = t_ret;
+
+ return ((err_ret != 0) ? err_ret : ret);
+}
+
+/*
+ * __db_meta2pgset --
+ * Given a known-safe meta page number, return the set of pages
+ * corresponding to the database it represents. Return DB_VERIFY_BAD if
+ * it's not a suitable meta page or is invalid.
+ */
+static int
+__db_meta2pgset(dbp, vdp, pgno, flags, pgset)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ db_pgno_t pgno;
+ u_int32_t flags;
+ DB *pgset;
+{
+ DB_MPOOLFILE *mpf;
+ PAGE *h;
+ int ret, t_ret;
+
+ mpf = dbp->mpf;
+
+ if ((ret = __memp_fget(mpf, &pgno, vdp->thread_info, NULL, 0, &h)) != 0)
+ return (ret);
+
+ switch (TYPE(h)) {
+ case P_BTREEMETA:
+ ret = __bam_meta2pgset(dbp, vdp, (BTMETA *)h, flags, pgset);
+ break;
+ case P_HASHMETA:
+ ret = __ham_meta2pgset(dbp, vdp, (HMETA *)h, flags, pgset);
+ break;
+ case P_HEAPMETA:
+ ret = __heap_meta2pgset(dbp, vdp, (HEAPMETA *)h, pgset);
+ break;
+ case P_QAMMETA:
+#ifdef HAVE_QUEUE
+ ret = __qam_meta2pgset(dbp, vdp, pgset);
+ break;
+#endif
+ default:
+ ret = DB_VERIFY_BAD;
+ break;
+ }
+
+ if ((t_ret = __memp_fput(mpf, vdp->thread_info, h, dbp->priority)) != 0)
+ return (t_ret);
+ return (ret);
+}
+
+/*
+ * __db_guesspgsize --
+ * Try to guess what the pagesize is if the one on the meta page
+ * and the one in the db are invalid.
+ */
+static u_int
+__db_guesspgsize(env, fhp)
+ ENV *env;
+ DB_FH *fhp;
+{
+ db_pgno_t i;
+ size_t nr;
+ u_int32_t guess;
+ u_int8_t type;
+
+ for (guess = DB_MAX_PGSIZE; guess >= DB_MIN_PGSIZE; guess >>= 1) {
+ /*
+ * We try to read three pages ahead after the first one
+ * and make sure we have plausible types for all of them.
+ * If the seeks fail, continue with a smaller size;
+ * we're probably just looking past the end of the database.
+ * If they succeed and the types are reasonable, also continue
+ * with a smaller size; we may be looking at pages N,
+ * 2N, and 3N for some N > 1.
+ *
+ * As soon as we hit an invalid type, we stop and return
+ * our previous guess; that last one was probably the page size.
+ */
+ for (i = 1; i <= 3; i++) {
+ if (__os_seek(
+ env, fhp, i, guess, SSZ(DBMETA, type)) != 0)
+ break;
+ if (__os_read(env,
+ fhp, &type, 1, &nr) != 0 || nr == 0)
+ break;
+ if (type == P_INVALID || type >= P_PAGETYPE_MAX)
+ return (guess << 1);
+ }
+ }
+
+ /*
+ * If we're just totally confused--the corruption takes up most of the
+ * beginning pages of the database--go with the default size.
+ */
+ return (DB_DEF_IOSIZE);
+}
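+
+/*
+ * Worked example (hypothetical numbers): with a true page size of 8KB,
+ * probes at guesses of 64KB, 32KB, and 16KB either seek past the end of
+ * a small file or happen to land on real page boundaries (pages 2, 4,
+ * 8, ...), so the loop keeps shrinking the guess. At a guess of 8KB all
+ * three probes also land on valid page-type bytes, but at 4KB the probe
+ * at offset 4KB lands mid-page, where the type byte is unlikely to be
+ * plausible, so the function returns guess << 1 = 8KB.
+ */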
diff --git a/src/db/db_vrfy_stub.c b/src/db/db_vrfy_stub.c
new file mode 100644
index 00000000..5037f33e
--- /dev/null
+++ b/src/db/db_vrfy_stub.c
@@ -0,0 +1,120 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef HAVE_VERIFY
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/db_verify.h"
+
+/*
+ * If the library wasn't compiled with the verification support, various
+ * routines aren't available. Stub them here, returning an appropriate
+ * error.
+ */
+
+static int __db_novrfy __P((ENV *));
+
+/*
+ * __db_novrfy --
+ * Error when a Berkeley DB build doesn't include verification support.
+ */
+static int
+__db_novrfy(env)
+ ENV *env;
+{
+ __db_errx(env, DB_STR("0571",
+ "library build did not include support for database verification"));
+ return (DB_OPNOTSUP);
+}
+
+int
+__db_verify_pp(dbp, file, database, outfile, flags)
+ DB *dbp;
+ const char *file, *database;
+ FILE *outfile;
+ u_int32_t flags;
+{
+ int ret;
+
+ COMPQUIET(file, NULL);
+ COMPQUIET(database, NULL);
+ COMPQUIET(outfile, NULL);
+ COMPQUIET(flags, 0);
+
+ ret = __db_novrfy(dbp->env);
+
+ /* The verify method is a destructor. */
+ (void)__db_close(dbp, NULL, 0);
+
+ return (ret);
+}
+
+int
+__db_verify_internal(dbp, name, subdb, handle, callback, flags)
+ DB *dbp;
+ const char *name, *subdb;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ u_int32_t flags;
+{
+ COMPQUIET(dbp, NULL);
+ COMPQUIET(name, NULL);
+ COMPQUIET(subdb, NULL);
+ COMPQUIET(handle, NULL);
+ COMPQUIET(callback, NULL);
+ COMPQUIET(flags, 0);
+ return (0);
+}
+
+int
+__db_vrfy_getpageinfo(vdp, pgno, pipp)
+ VRFY_DBINFO *vdp;
+ db_pgno_t pgno;
+ VRFY_PAGEINFO **pipp;
+{
+ COMPQUIET(pgno, 0);
+ COMPQUIET(pipp, NULL);
+ return (__db_novrfy(vdp->pgdbp->env));
+}
+
+int
+__db_vrfy_putpageinfo(env, vdp, pip)
+ ENV *env;
+ VRFY_DBINFO *vdp;
+ VRFY_PAGEINFO *pip;
+{
+ COMPQUIET(vdp, NULL);
+ COMPQUIET(pip, NULL);
+ return (__db_novrfy(env));
+}
+
+int
+__db_vrfy_prdbt(dbtp, checkprint, prefix,
+ handle, callback, is_recno, is_heap, vdp)
+ DBT *dbtp;
+ int checkprint;
+ const char *prefix;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ int is_recno;
+ int is_heap;
+ VRFY_DBINFO *vdp;
+{
+ COMPQUIET(dbtp, NULL);
+ COMPQUIET(checkprint, 0);
+ COMPQUIET(prefix, NULL);
+ COMPQUIET(handle, NULL);
+ COMPQUIET(callback, NULL);
+ COMPQUIET(is_recno, 0);
+ COMPQUIET(is_heap, 0);
+ return (__db_novrfy(vdp->pgdbp->env));
+}
+#endif /* !HAVE_VERIFY */
diff --git a/src/db/db_vrfyutil.c b/src/db/db_vrfyutil.c
new file mode 100644
index 00000000..d72e1188
--- /dev/null
+++ b/src/db/db_vrfyutil.c
@@ -0,0 +1,932 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2000, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_verify.h"
+#include "dbinc/db_am.h"
+
+static int __db_vrfy_childinc __P((DBC *, VRFY_CHILDINFO *));
+static int __db_vrfy_pageinfo_create __P((ENV *, VRFY_PAGEINFO **));
+
+/*
+ * __db_vrfy_dbinfo_create --
+ * Allocate and initialize a VRFY_DBINFO structure.
+ *
+ * PUBLIC: int __db_vrfy_dbinfo_create
+ * PUBLIC: __P((ENV *, DB_THREAD_INFO *, u_int32_t, VRFY_DBINFO **));
+ */
+int
+__db_vrfy_dbinfo_create(env, ip, pgsize, vdpp)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ u_int32_t pgsize;
+ VRFY_DBINFO **vdpp;
+{
+ DB *cdbp, *pgdbp, *pgset;
+ VRFY_DBINFO *vdp;
+ int ret;
+
+ vdp = NULL;
+ cdbp = pgdbp = pgset = NULL;
+
+ if ((ret = __os_calloc(NULL, 1, sizeof(VRFY_DBINFO), &vdp)) != 0)
+ goto err;
+
+ if ((ret = __db_create_internal(&cdbp, env, 0)) != 0)
+ goto err;
+
+ if ((ret = __db_set_flags(cdbp, DB_DUP)) != 0)
+ goto err;
+
+ if ((ret = __db_set_pagesize(cdbp, pgsize)) != 0)
+ goto err;
+
+ /* If transactional, make sure we don't log. */
+ if (TXN_ON(env) &&
+ (ret = __db_set_flags(cdbp, DB_TXN_NOT_DURABLE)) != 0)
+ goto err;
+ if ((ret = __db_open(cdbp, ip,
+ NULL, NULL, NULL, DB_BTREE, DB_CREATE, 0600, PGNO_BASE_MD)) != 0)
+ goto err;
+
+ if ((ret = __db_create_internal(&pgdbp, env, 0)) != 0)
+ goto err;
+
+ if ((ret = __db_set_pagesize(pgdbp, pgsize)) != 0)
+ goto err;
+
+ /* If transactional, make sure we don't log. */
+ if (TXN_ON(env) &&
+ (ret = __db_set_flags(pgdbp, DB_TXN_NOT_DURABLE)) != 0)
+ goto err;
+
+ if ((ret = __db_open(pgdbp, ip,
+ NULL, NULL, NULL, DB_BTREE, DB_CREATE, 0600, PGNO_BASE_MD)) != 0)
+ goto err;
+
+ if ((ret = __db_vrfy_pgset(env, ip, pgsize, &pgset)) != 0)
+ goto err;
+
+ if (CDB_LOCKING(env) &&
+ (ret = __cdsgroup_begin(env, &vdp->txn)) != 0)
+ goto err;
+
+ LIST_INIT(&vdp->subdbs);
+ LIST_INIT(&vdp->activepips);
+
+ vdp->cdbp = cdbp;
+ vdp->pgdbp = pgdbp;
+ vdp->pgset = pgset;
+ vdp->thread_info = ip;
+ *vdpp = vdp;
+ return (0);
+
+err: if (cdbp != NULL)
+ (void)__db_close(cdbp, NULL, 0);
+ if (pgdbp != NULL)
+ (void)__db_close(pgdbp, NULL, 0);
+ if (pgset != NULL)
+ (void)__db_close(pgset, NULL, 0);
+ if (vdp != NULL) {
+ if (vdp->txn != NULL)
+ (void)vdp->txn->commit(vdp->txn, 0);
+ __os_free(env, vdp);
+ }
+ return (ret);
+}
+
+/*
+ * __db_vrfy_dbinfo_destroy --
+ * Destructor for VRFY_DBINFO. Destroys VRFY_PAGEINFOs and deallocates
+ * structure.
+ *
+ * PUBLIC: int __db_vrfy_dbinfo_destroy __P((ENV *, VRFY_DBINFO *));
+ */
+int
+__db_vrfy_dbinfo_destroy(env, vdp)
+ ENV *env;
+ VRFY_DBINFO *vdp;
+{
+ VRFY_CHILDINFO *c;
+ int t_ret, ret;
+
+ ret = 0;
+
+ /*
+ * Discard active page structures. Ideally there wouldn't be any,
+ * but in some error cases we may not have cleared them all out.
+ */
+ while (LIST_FIRST(&vdp->activepips) != NULL)
+ if ((t_ret = __db_vrfy_putpageinfo(
+ env, vdp, LIST_FIRST(&vdp->activepips))) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ break;
+ }
+
+ /* Discard subdatabase list structures. */
+ while ((c = LIST_FIRST(&vdp->subdbs)) != NULL) {
+ LIST_REMOVE(c, links);
+ __os_free(NULL, c);
+ }
+
+ if ((t_ret = __db_close(vdp->pgdbp, NULL, 0)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if ((t_ret = __db_close(vdp->cdbp, NULL, 0)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if ((t_ret = __db_close(vdp->pgset, NULL, 0)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (vdp->txn != NULL &&
+ (t_ret = vdp->txn->commit(vdp->txn, 0)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (vdp->extents != NULL)
+ __os_free(env, vdp->extents);
+ __os_free(env, vdp);
+ return (ret);
+}
+
+/*
+ * __db_vrfy_getpageinfo --
+ * Get a PAGEINFO structure for a given page, creating it if necessary.
+ *
+ * PUBLIC: int __db_vrfy_getpageinfo
+ * PUBLIC: __P((VRFY_DBINFO *, db_pgno_t, VRFY_PAGEINFO **));
+ */
+int
+__db_vrfy_getpageinfo(vdp, pgno, pipp)
+ VRFY_DBINFO *vdp;
+ db_pgno_t pgno;
+ VRFY_PAGEINFO **pipp;
+{
+ DB *pgdbp;
+ DBT key, data;
+ ENV *env;
+ VRFY_PAGEINFO *pip;
+ int ret;
+
+ /*
+ * We want a page info struct. There are three places to get it from,
+ * in decreasing order of preference:
+ *
+ * 1. vdp->activepips. If it's already "checked out", we're
+ * already using it, so we return the same structure with a
+ * bumped refcount. This is necessary because this code is
+ * replacing array accesses, and it's common for f() to make some
+ * changes to a pip, and then call g() and h() which each make
+ * changes to the same pip. vdps are never shared between threads
+ * (they're never returned to the application), so this is safe.
+ * 2. The pgdbp. It's not in memory, but it's in the database, so
+ * get it, give it a refcount of 1, and stick it on activepips.
+ * 3. malloc. It doesn't exist yet; create it, then stick it on
+ * activepips. We'll put it in the database when we putpageinfo
+ * later.
+ */
+
+ /* Case 1. */
+ LIST_FOREACH(pip, &vdp->activepips, links)
+ if (pip->pgno == pgno)
+ goto found;
+
+ /* Case 2. */
+ pgdbp = vdp->pgdbp;
+ env = pgdbp->env;
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+ F_SET(&data, DB_DBT_MALLOC);
+ key.data = &pgno;
+ key.size = sizeof(db_pgno_t);
+
+ if ((ret = __db_get(pgdbp,
+ vdp->thread_info, vdp->txn, &key, &data, 0)) == 0) {
+ /* Found it. */
+ DB_ASSERT(env, data.size == sizeof(VRFY_PAGEINFO));
+ pip = data.data;
+ LIST_INSERT_HEAD(&vdp->activepips, pip, links);
+ goto found;
+ } else if (ret != DB_NOTFOUND) /* Something nasty happened. */
+ return (ret);
+
+ /* Case 3 */
+ if ((ret = __db_vrfy_pageinfo_create(env, &pip)) != 0)
+ return (ret);
+
+ LIST_INSERT_HEAD(&vdp->activepips, pip, links);
+found: pip->pi_refcount++;
+
+ *pipp = pip;
+ return (0);
+}
+
+/*
+ * __db_vrfy_putpageinfo --
+ * Put back a VRFY_PAGEINFO that we're done with.
+ *
+ * PUBLIC: int __db_vrfy_putpageinfo __P((ENV *,
+ * PUBLIC: VRFY_DBINFO *, VRFY_PAGEINFO *));
+ */
+int
+__db_vrfy_putpageinfo(env, vdp, pip)
+ ENV *env;
+ VRFY_DBINFO *vdp;
+ VRFY_PAGEINFO *pip;
+{
+ DB *pgdbp;
+ DBT key, data;
+ VRFY_PAGEINFO *p;
+ int ret;
+
+ if (--pip->pi_refcount > 0)
+ return (0);
+
+ pgdbp = vdp->pgdbp;
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+
+ key.data = &pip->pgno;
+ key.size = sizeof(db_pgno_t);
+ data.data = pip;
+ data.size = sizeof(VRFY_PAGEINFO);
+
+ if ((ret = __db_put(pgdbp,
+ vdp->thread_info, vdp->txn, &key, &data, 0)) != 0)
+ return (ret);
+
+ LIST_FOREACH(p, &vdp->activepips, links)
+ if (p == pip)
+ break;
+ if (p != NULL)
+ LIST_REMOVE(p, links);
+
+ __os_ufree(env, pip);
+ return (0);
+}
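+
+/*
+ * Typical get/put pairing (illustrative only): getpageinfo bumps a
+ * refcount and putpageinfo only writes the struct back to pgdbp once
+ * the count drops to zero, so nested users of the same pip are cheap:
+ *
+ *	if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ *		return (ret);
+ *	...inspect or update pip fields...
+ *	ret = __db_vrfy_putpageinfo(env, vdp, pip);
+ */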
+
+/*
+ * __db_vrfy_pgset --
+ * Create a temporary database for the storing of sets of page numbers.
+ * (A mapping from page number to int, used by the *_meta2pgset functions,
+ * as well as for keeping track of which pages the verifier has seen.)
+ *
+ * PUBLIC: int __db_vrfy_pgset __P((ENV *,
+ * PUBLIC: DB_THREAD_INFO *, u_int32_t, DB **));
+ */
+int
+__db_vrfy_pgset(env, ip, pgsize, dbpp)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ u_int32_t pgsize;
+ DB **dbpp;
+{
+ DB *dbp;
+ int ret;
+
+ if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+ return (ret);
+ if ((ret = __db_set_pagesize(dbp, pgsize)) != 0)
+ goto err;
+
+ /* If transactional, make sure we don't log. */
+ if (TXN_ON(env) &&
+ (ret = __db_set_flags(dbp, DB_TXN_NOT_DURABLE)) != 0)
+ goto err;
+ if ((ret = __db_open(dbp, ip,
+ NULL, NULL, NULL, DB_BTREE, DB_CREATE, 0600, PGNO_BASE_MD)) == 0)
+ *dbpp = dbp;
+ else
+err: (void)__db_close(dbp, NULL, 0);
+
+ return (ret);
+}
+
+/*
+ * __db_vrfy_pgset_get --
+ * Get the value associated in a page set with a given pgno. Return
+ * a 0 value (and succeed) if we've never heard of this page.
+ *
+ * PUBLIC: int __db_vrfy_pgset_get __P((DB *, DB_THREAD_INFO *, DB_TXN *,
+ * PUBLIC: db_pgno_t, int *));
+ */
+int
+__db_vrfy_pgset_get(dbp, ip, txn, pgno, valp)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ db_pgno_t pgno;
+ int *valp;
+{
+ DBT key, data;
+ int ret, val;
+
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+
+ key.data = &pgno;
+ key.size = sizeof(db_pgno_t);
+ data.data = &val;
+ data.ulen = sizeof(int);
+ F_SET(&data, DB_DBT_USERMEM);
+
+ if ((ret = __db_get(dbp, ip, txn, &key, &data, 0)) == 0) {
+ DB_ASSERT(dbp->env, data.size == sizeof(int));
+ } else if (ret == DB_NOTFOUND)
+ val = 0;
+ else
+ return (ret);
+
+ *valp = val;
+ return (0);
+}
+
+/*
+ * __db_vrfy_pgset_inc --
+ * Increment the value associated with a pgno by 1.
+ *
+ * PUBLIC: int __db_vrfy_pgset_inc __P((DB *, DB_THREAD_INFO *, DB_TXN *,
+ * PUBLIC: db_pgno_t));
+ */
+int
+__db_vrfy_pgset_inc(dbp, ip, txn, pgno)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ db_pgno_t pgno;
+{
+ DBT key, data;
+ int ret;
+ int val;
+
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+
+ val = 0;
+
+ key.data = &pgno;
+ key.size = sizeof(db_pgno_t);
+ data.data = &val;
+ data.ulen = sizeof(int);
+ F_SET(&data, DB_DBT_USERMEM);
+
+ if ((ret = __db_get(dbp, ip, txn, &key, &data, 0)) == 0) {
+ DB_ASSERT(dbp->env, data.size == sizeof(int));
+ } else if (ret != DB_NOTFOUND)
+ return (ret);
+
+ data.size = sizeof(int);
+ ++val;
+
+ return (__db_put(dbp, ip, txn, &key, &data, 0));
+}
+
+/*
+ * __db_vrfy_pgset_next --
+ * Given a cursor open in a pgset database, get the next page in the
+ * set.
+ *
+ * PUBLIC: int __db_vrfy_pgset_next __P((DBC *, db_pgno_t *));
+ */
+int
+__db_vrfy_pgset_next(dbc, pgnop)
+ DBC *dbc;
+ db_pgno_t *pgnop;
+{
+ DBT key, data;
+ db_pgno_t pgno;
+ int ret;
+
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+ /* We don't care about the data, just the keys. */
+ F_SET(&data, DB_DBT_USERMEM | DB_DBT_PARTIAL);
+ F_SET(&key, DB_DBT_USERMEM);
+ key.data = &pgno;
+ key.ulen = sizeof(db_pgno_t);
+
+ if ((ret = __dbc_get(dbc, &key, &data, DB_NEXT)) != 0)
+ return (ret);
+
+ DB_ASSERT(dbc->env, key.size == sizeof(db_pgno_t));
+ *pgnop = pgno;
+
+ return (0);
+}
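+
+/*
+ * Illustrative iteration pattern (a sketch; the salvage routines use
+ * exactly this shape): open a cursor on the pgset database and walk the
+ * keys until DB_NOTFOUND:
+ *
+ *	if ((ret = __db_cursor(pgset, ip, NULL, &dbc, 0)) == 0)
+ *		while ((ret = __db_vrfy_pgset_next(dbc, &p)) == 0)
+ *			...process page p...
+ *	if (ret == DB_NOTFOUND)
+ *		ret = 0;
+ */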
+
+/*
+ * __db_vrfy_childcursor --
+ * Create a cursor to walk the child list with. Returns with a nonzero
+ * final argument if the specified page has no children.
+ *
+ * PUBLIC: int __db_vrfy_childcursor __P((VRFY_DBINFO *, DBC **));
+ */
+int
+__db_vrfy_childcursor(vdp, dbcp)
+ VRFY_DBINFO *vdp;
+ DBC **dbcp;
+{
+ DB *cdbp;
+ DBC *dbc;
+ int ret;
+
+ cdbp = vdp->cdbp;
+
+ if ((ret = __db_cursor(cdbp, vdp->thread_info, vdp->txn, &dbc, 0)) == 0)
+ *dbcp = dbc;
+
+ return (ret);
+}
+
+/*
+ * __db_vrfy_childput --
+ * Add a child structure to the set for a given page.
+ *
+ * PUBLIC: int __db_vrfy_childput
+ * PUBLIC: __P((VRFY_DBINFO *, db_pgno_t, VRFY_CHILDINFO *));
+ */
+int
+__db_vrfy_childput(vdp, pgno, cip)
+ VRFY_DBINFO *vdp;
+ db_pgno_t pgno;
+ VRFY_CHILDINFO *cip;
+{
+ DB *cdbp;
+ DBC *cc;
+ DBT key, data;
+ VRFY_CHILDINFO *oldcip;
+ int ret;
+
+ cdbp = vdp->cdbp;
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+
+ key.data = &pgno;
+ key.size = sizeof(db_pgno_t);
+
+ /*
+ * We want to avoid adding multiple entries for a single child page;
+ * we only need to verify each child once, even if a child (such
+ * as an overflow key) is multiply referenced.
+ *
+ * However, we also need to make sure that when walking the list
+ * of children, we encounter them in the order they're referenced
+ * on a page. (This permits us, for example, to verify the
+ * prev_pgno/next_pgno chain of Btree leaf pages.)
+ *
+ * Check the child database to make sure that this page isn't
+ * already a child of the specified page number. If it's not,
+ * put it at the end of the duplicate set.
+ */
+ if ((ret = __db_vrfy_childcursor(vdp, &cc)) != 0)
+ return (ret);
+ for (ret = __db_vrfy_ccset(cc, pgno, &oldcip); ret == 0;
+ ret = __db_vrfy_ccnext(cc, &oldcip))
+ if (oldcip->pgno == cip->pgno) {
+ /*
+ * Found a matching child. Increment its reference
+ * count--we've run into it again--but don't put it
+ * again.
+ */
+ if ((ret = __db_vrfy_childinc(cc, oldcip)) != 0 ||
+ (ret = __db_vrfy_ccclose(cc)) != 0)
+ return (ret);
+ return (0);
+ }
+ if (ret != DB_NOTFOUND) {
+ (void)__db_vrfy_ccclose(cc);
+ return (ret);
+ }
+ if ((ret = __db_vrfy_ccclose(cc)) != 0)
+ return (ret);
+
+ cip->refcnt = 1;
+ data.data = cip;
+ data.size = sizeof(VRFY_CHILDINFO);
+
+ return (__db_put(cdbp, vdp->thread_info, vdp->txn, &key, &data, 0));
+}
+
+/*
+ * __db_vrfy_childinc --
+ * Increment the refcount of the VRFY_CHILDINFO struct that the child
+ * cursor is pointing to. (The caller has just retrieved this struct, and
+ * passes it in as cip to save us a get.)
+ */
+static int
+__db_vrfy_childinc(dbc, cip)
+ DBC *dbc;
+ VRFY_CHILDINFO *cip;
+{
+ DBT key, data;
+
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+
+ cip->refcnt++;
+ data.data = cip;
+ data.size = sizeof(VRFY_CHILDINFO);
+
+ return (__dbc_put(dbc, &key, &data, DB_CURRENT));
+}
+
+/*
+ * __db_vrfy_ccset --
+ * Sets a cursor created with __db_vrfy_childcursor to the first
+ * child of the given pgno, and returns it in the third arg.
+ *
+ * PUBLIC: int __db_vrfy_ccset __P((DBC *, db_pgno_t, VRFY_CHILDINFO **));
+ */
+int
+__db_vrfy_ccset(dbc, pgno, cipp)
+ DBC *dbc;
+ db_pgno_t pgno;
+ VRFY_CHILDINFO **cipp;
+{
+ DBT key, data;
+ int ret;
+
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+
+ key.data = &pgno;
+ key.size = sizeof(db_pgno_t);
+
+ if ((ret = __dbc_get(dbc, &key, &data, DB_SET)) != 0)
+ return (ret);
+
+ DB_ASSERT(dbc->env, data.size == sizeof(VRFY_CHILDINFO));
+ *cipp = (VRFY_CHILDINFO *)data.data;
+
+ return (0);
+}
+
+/*
+ * __db_vrfy_ccnext --
+ * Gets the next child of the given cursor created with
+ * __db_vrfy_childcursor, and returns it in the memory provided in the
+ * second arg.
+ *
+ * PUBLIC: int __db_vrfy_ccnext __P((DBC *, VRFY_CHILDINFO **));
+ */
+int
+__db_vrfy_ccnext(dbc, cipp)
+ DBC *dbc;
+ VRFY_CHILDINFO **cipp;
+{
+ DBT key, data;
+ int ret;
+
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+
+ if ((ret = __dbc_get(dbc, &key, &data, DB_NEXT_DUP)) != 0)
+ return (ret);
+
+ DB_ASSERT(dbc->env, data.size == sizeof(VRFY_CHILDINFO));
+ *cipp = (VRFY_CHILDINFO *)data.data;
+
+ return (0);
+}
+
+/*
+ * __db_vrfy_ccclose --
+ * Closes the cursor created with __db_vrfy_childcursor.
+ *
+ * This doesn't actually do anything interesting now, but it's
+ * not inconceivable that we might change the internal database usage
+ * and keep the interfaces the same, and a function call here or there
+ * seldom hurts anyone.
+ *
+ * PUBLIC: int __db_vrfy_ccclose __P((DBC *));
+ */
+int
+__db_vrfy_ccclose(dbc)
+ DBC *dbc;
+{
+
+ return (__dbc_close(dbc));
+}
+
+/*
+ * __db_vrfy_pageinfo_create --
+ * Constructor for VRFY_PAGEINFO; allocates and initializes.
+ */
+static int
+__db_vrfy_pageinfo_create(env, pipp)
+ ENV *env;
+ VRFY_PAGEINFO **pipp;
+{
+ VRFY_PAGEINFO *pip;
+ int ret;
+
+ /*
+ * pageinfo structs are sometimes allocated here and sometimes
+ * allocated by fetching them from a database with DB_DBT_MALLOC.
+ * There's no easy way for the destructor to tell which was
+ * used, and so we always allocate with __os_umalloc so we can free
+ * with __os_ufree.
+ */
+ if ((ret = __os_umalloc(env, sizeof(VRFY_PAGEINFO), &pip)) != 0)
+ return (ret);
+ memset(pip, 0, sizeof(VRFY_PAGEINFO));
+
+ *pipp = pip;
+ return (0);
+}
+
+/*
+ * __db_salvage_init --
+ * Set up salvager database.
+ *
+ * PUBLIC: int __db_salvage_init __P((VRFY_DBINFO *));
+ */
+int
+__db_salvage_init(vdp)
+ VRFY_DBINFO *vdp;
+{
+ DB *dbp;
+ int ret;
+
+ if ((ret = __db_create_internal(&dbp, NULL, 0)) != 0)
+ return (ret);
+
+ if ((ret = __db_set_pagesize(dbp, 1024)) != 0)
+ goto err;
+
+ if ((ret = __db_open(dbp, vdp->thread_info,
+ NULL, NULL, NULL, DB_BTREE, DB_CREATE, 0, PGNO_BASE_MD)) != 0)
+ goto err;
+
+ vdp->salvage_pages = dbp;
+ return (0);
+
+err: (void)__db_close(dbp, NULL, 0);
+ return (ret);
+}
+
+/*
+ * __db_salvage_destroy --
+ * Close salvager database.
+ * PUBLIC: int __db_salvage_destroy __P((VRFY_DBINFO *));
+ */
+int
+__db_salvage_destroy(vdp)
+ VRFY_DBINFO *vdp;
+{
+ return (vdp->salvage_pages == NULL ? 0 :
+ __db_close(vdp->salvage_pages, NULL, 0));
+}
+
+/*
+ * __db_salvage_getnext --
+ * Get the next (first) unprinted page in the database of pages we still
+ * need to print. Delete entries for any already-printed pages we encounter
+ * in this search, as well as for the page we return.
+ *
+ * PUBLIC: int __db_salvage_getnext
+ * PUBLIC: __P((VRFY_DBINFO *, DBC **, db_pgno_t *, u_int32_t *, int));
+ */
+int
+__db_salvage_getnext(vdp, dbcp, pgnop, pgtypep, skip_overflow)
+ VRFY_DBINFO *vdp;
+ DBC **dbcp;
+ db_pgno_t *pgnop;
+ u_int32_t *pgtypep;
+ int skip_overflow;
+{
+ DB *dbp;
+ DBT key, data;
+ int ret;
+ u_int32_t pgtype;
+
+ dbp = vdp->salvage_pages;
+
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+
+ if (*dbcp == NULL &&
+ (ret = __db_cursor(dbp, vdp->thread_info, vdp->txn, dbcp, 0)) != 0)
+ return (ret);
+
+ while ((ret = __dbc_get(*dbcp, &key, &data, DB_NEXT)) == 0) {
+ DB_ASSERT(dbp->env, data.size == sizeof(u_int32_t));
+ memcpy(&pgtype, data.data, sizeof(pgtype));
+
+ if (skip_overflow && pgtype == SALVAGE_OVERFLOW)
+ continue;
+
+ if ((ret = __dbc_del(*dbcp, 0)) != 0)
+ return (ret);
+ if (pgtype != SALVAGE_IGNORE) {
+ DB_ASSERT(dbp->env, key.size == sizeof(db_pgno_t));
+ DB_ASSERT(dbp->env, data.size == sizeof(u_int32_t));
+
+ *pgnop = *(db_pgno_t *)key.data;
+ *pgtypep = *(u_int32_t *)data.data;
+ break;
+ }
+ }
+
+ return (ret);
+}
+
+/*
+ * __db_salvage_isdone --
+ * Return whether or not the given pgno is already marked
+ * SALVAGE_IGNORE (meaning that we don't need to print it again).
+ *
+ * Returns DB_KEYEXIST if it is marked, 0 if not, or another error
+ * code on failure.
+ *
+ * PUBLIC: int __db_salvage_isdone __P((VRFY_DBINFO *, db_pgno_t));
+ */
+int
+__db_salvage_isdone(vdp, pgno)
+ VRFY_DBINFO *vdp;
+ db_pgno_t pgno;
+{
+ DB *dbp;
+ DBT key, data;
+ int ret;
+ u_int32_t currtype;
+
+ dbp = vdp->salvage_pages;
+
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+
+ currtype = SALVAGE_INVALID;
+ data.data = &currtype;
+ data.ulen = sizeof(u_int32_t);
+ data.flags = DB_DBT_USERMEM;
+
+ key.data = &pgno;
+ key.size = sizeof(db_pgno_t);
+
+ /*
+ * Look up this page in the salvage database. If it has an entry
+ * and is marked SALVAGE_IGNORE, it has already been printed;
+ * report that with DB_KEYEXIST. Any other mark, or no entry at
+ * all, means the page still needs to be dealt with.
+ */
+ if ((ret = __db_get(dbp,
+ vdp->thread_info, vdp->txn, &key, &data, 0)) == 0) {
+ /*
+ * The key's already here. Check and see if it's already
+ * marked done. If it is, return DB_KEYEXIST. If it's not,
+ * return 0.
+ */
+ if (currtype == SALVAGE_IGNORE)
+ return (DB_KEYEXIST);
+ else
+ return (0);
+ } else if (ret != DB_NOTFOUND)
+ return (ret);
+
+ /* The pgno is not yet marked anything; return 0. */
+ return (0);
+}
+
+/*
+ * __db_salvage_markdone --
+ * Mark a given page as done.
+ *
+ * PUBLIC: int __db_salvage_markdone __P((VRFY_DBINFO *, db_pgno_t));
+ */
+int
+__db_salvage_markdone(vdp, pgno)
+ VRFY_DBINFO *vdp;
+ db_pgno_t pgno;
+{
+ DB *dbp;
+ DBT key, data;
+ int pgtype, ret;
+ u_int32_t currtype;
+
+ pgtype = SALVAGE_IGNORE;
+ dbp = vdp->salvage_pages;
+
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+
+ currtype = SALVAGE_INVALID;
+ data.data = &currtype;
+ data.ulen = sizeof(u_int32_t);
+ data.flags = DB_DBT_USERMEM;
+
+ key.data = &pgno;
+ key.size = sizeof(db_pgno_t);
+
+ /*
+ * Put an entry for this page, with pgno as key and type as data,
+ * unless it's already there and is marked done.
+ * If it's there and is marked anything else, that's fine--we
+ * want to mark it done, but __db_salvage_isdone only lets
+ * us know if it's marked IGNORE.
+ *
+ * We don't want to return DB_KEYEXIST, though; this will
+ * likely get passed up all the way and make no sense to the
+ * application. Instead, use DB_VERIFY_BAD to indicate that
+ * we've seen this page already--it probably indicates a
+ * multiply-linked page.
+ */
+ if ((ret = __db_salvage_isdone(vdp, pgno)) != 0)
+ return (ret == DB_KEYEXIST ? DB_VERIFY_BAD : ret);
+
+ data.size = sizeof(u_int32_t);
+ data.data = &pgtype;
+
+ return (__db_put(dbp, vdp->thread_info, vdp->txn, &key, &data, 0));
+}
+
+/*
+ * __db_salvage_markneeded --
+ * If it has not yet been printed, make note of the fact that a page
+ * must be dealt with later.
+ *
+ * PUBLIC: int __db_salvage_markneeded
+ * PUBLIC: __P((VRFY_DBINFO *, db_pgno_t, u_int32_t));
+ */
+int
+__db_salvage_markneeded(vdp, pgno, pgtype)
+ VRFY_DBINFO *vdp;
+ db_pgno_t pgno;
+ u_int32_t pgtype;
+{
+ DB *dbp;
+ DBT key, data;
+ int ret;
+
+ dbp = vdp->salvage_pages;
+
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+
+ key.data = &pgno;
+ key.size = sizeof(db_pgno_t);
+
+ data.data = &pgtype;
+ data.size = sizeof(u_int32_t);
+
+ /*
+ * Put an entry for this page, with pgno as key and type as data,
+ * unless it's already there, in which case its existing mark
+ * (still needed, or already done) stands.
+ */
+ ret = __db_put(dbp,
+ vdp->thread_info, vdp->txn, &key, &data, DB_NOOVERWRITE);
+ return (ret == DB_KEYEXIST ? 0 : ret);
+}
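+
+/*
+ * Taken together, the salvage_pages database acts as a small state
+ * machine over page numbers (sketch, for illustration): markneeded(pgno,
+ * type) records a page we expect to deal with later; printing it and
+ * calling markdone(pgno) overwrites the type with SALVAGE_IGNORE; and
+ * getnext() deletes entries as it hands them out, skipping IGNORE ones,
+ * so each page is printed at most once even if multiply referenced.
+ */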
+
+/*
+ * __db_vrfy_prdbt --
+ * Print out a DBT data element from a verification routine.
+ *
+ * PUBLIC: int __db_vrfy_prdbt __P((DBT *, int, const char *, void *,
+ * PUBLIC: int (*)(void *, const void *), int, int, VRFY_DBINFO *));
+ */
+int
+__db_vrfy_prdbt(dbtp, checkprint, prefix,
+ handle, callback, is_recno, is_heap, vdp)
+ DBT *dbtp;
+ int checkprint;
+ const char *prefix;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ int is_recno;
+ int is_heap;
+ VRFY_DBINFO *vdp;
+{
+ if (vdp != NULL) {
+ /*
+ * If vdp is non-NULL, we might be the first key in the
+ * "fake" subdatabase used for key/data pairs we can't
+ * associate with a known subdb.
+ *
+ * Check and clear the SALVAGE_PRINTHEADER flag; if
+ * it was set, print a subdatabase header.
+ */
+ if (F_ISSET(vdp, SALVAGE_PRINTHEADER)) {
+ (void)__db_prheader(
+ NULL, "__OTHER__", 0, 0, handle, callback, vdp, 0);
+ F_CLR(vdp, SALVAGE_PRINTHEADER);
+ F_SET(vdp, SALVAGE_PRINTFOOTER);
+ }
+
+ /*
+ * Even if the printable flag wasn't set by our immediate
+ * caller, it may be set on a salvage-wide basis.
+ */
+ if (F_ISSET(vdp, SALVAGE_PRINTABLE))
+ checkprint = 1;
+ }
+ return (
+ __db_prdbt(dbtp, checkprint,
+ prefix, handle, callback, is_recno, is_heap));
+}
diff --git a/src/db/partition.c b/src/db/partition.c
new file mode 100644
index 00000000..f8beaf16
--- /dev/null
+++ b/src/db/partition.c
@@ -0,0 +1,2059 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_verify.h"
+#include "dbinc/btree.h"
+#ifdef HAVE_HASH
+#include "dbinc/hash.h"
+#endif
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/partition.h"
+#include "dbinc/txn.h"
+#ifdef HAVE_PARTITION
+
+static int __part_rr __P((DB *, DB_THREAD_INFO *, DB_TXN *,
+ const char *, const char *, const char *, u_int32_t));
+static int __partc_close __P((DBC *, db_pgno_t, int *));
+static int __partc_del __P((DBC*, u_int32_t));
+static int __partc_destroy __P((DBC*));
+static int __partc_get_pp __P((DBC*, DBT *, DBT *, u_int32_t));
+static int __partc_put __P((DBC*, DBT *, DBT *, u_int32_t, db_pgno_t *));
+static int __partc_writelock __P((DBC*));
+static int __partition_chk_meta __P((DB *,
+ DB_THREAD_INFO *, DB_TXN *, u_int32_t));
+static int __partition_setup_keys __P((DBC *,
+ DB_PARTITION *, DBMETA *, u_int32_t));
+static int __part_key_cmp __P((const void *, const void *));
+static inline void __part_search __P((DB *,
+ DB_PARTITION *, DBT *, u_int32_t *));
+
+static char *Alloc_err = DB_STR_A("0644",
+ "Partition open failed to allocate %d bytes", "%d");
+
+/*
+ * Allocate a partition cursor and copy flags to the partition cursor.
+ * Not passed:
+ * DBC_PARTITIONED -- the subcursors are not.
+ * DBC_OWN_LID -- the arg dbc owns the lock id.
+ * DBC_WRITECURSOR DBC_WRITER -- CDS locking happens on
+ * the whole DB, not the partition.
+ */
+#define GET_PART_CURSOR(dbc, new_dbc, part_id) do { \
+ DB *__part_dbp; \
+ __part_dbp = part->handles[part_id]; \
+ if ((ret = __db_cursor_int(__part_dbp, \
+ (dbc)->thread_info, (dbc)->txn, __part_dbp->type, \
+ PGNO_INVALID, 0, (dbc)->locker, &new_dbc)) != 0) \
+ goto err; \
+ (new_dbc)->flags = (dbc)->flags & \
+ ~(DBC_PARTITIONED|DBC_OWN_LID|DBC_WRITECURSOR|DBC_WRITER); \
+} while (0)
+
+/*
+ * Search for the correct partition.
+ */
+static inline void __part_search(dbp, part, key, part_idp)
+ DB *dbp;
+ DB_PARTITION *part;
+ DBT *key;
+ u_int32_t *part_idp;
+{
+ db_indx_t base, indx, limit;
+ int cmp;
+ int (*func) __P((DB *, const DBT *, const DBT *));
+
+ DB_ASSERT(dbp->env, part->nparts != 0);
+ COMPQUIET(cmp, 0);
+ COMPQUIET(indx, 0);
+
+ func = ((BTREE *)dbp->bt_internal)->bt_compare;
+ DB_BINARY_SEARCH_FOR(base, limit, part->nparts, O_INDX) {
+ DB_BINARY_SEARCH_INCR(indx, base, limit, O_INDX);
+ cmp = func(dbp, key, &part->keys[indx]);
+ if (cmp == 0)
+ break;
+ if (cmp > 0)
+ DB_BINARY_SEARCH_SHIFT_BASE(indx, base, limit, O_INDX);
+ }
+ if (cmp == 0)
+ *part_idp = indx;
+ else if ((*part_idp = base) != 0)
+ (*part_idp)--;
+}
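+
+/*
+ * Worked example (hypothetical range keys): with nparts == 3 and keys
+ * { "", "f", "p" }, a search key of "m" compares greater than "f" and
+ * less than "p", so the binary search leaves base just past "f" and we
+ * return partition 1; keys below "f" map to partition 0, and keys at or
+ * above "p" map to partition 2.
+ */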
+
+/*
+ * __partition_init --
+ * Initialize the partition structure.
+ * Called when the meta data page is read in during database open or
+ * when partition keys or a callback are set.
+ *
+ * PUBLIC: int __partition_init __P((DB *, u_int32_t));
+ */
+int
+__partition_init(dbp, flags)
+ DB *dbp;
+ u_int32_t flags;
+{
+ DB_PARTITION *part;
+ int ret;
+
+ if ((part = dbp->p_internal) != NULL) {
+ if ((LF_ISSET(DBMETA_PART_RANGE) &&
+ F_ISSET(part, PART_CALLBACK)) ||
+ (LF_ISSET(DBMETA_PART_CALLBACK) &&
+ F_ISSET(part, PART_RANGE))) {
+ __db_errx(dbp->env, DB_STR("0645",
+ "Cannot specify callback and range keys."));
+ return (EINVAL);
+ }
+ } else if ((ret = __os_calloc(dbp->env, 1, sizeof(*part), &part)) != 0)
+ return (ret);
+
+ if (LF_ISSET(DBMETA_PART_RANGE))
+ F_SET(part, PART_RANGE);
+ if (LF_ISSET(DBMETA_PART_CALLBACK))
+ F_SET(part, PART_CALLBACK);
+ dbp->p_internal = part;
+ /* Set up AM-specific methods that do not require an open. */
+ dbp->db_am_rename = __part_rename;
+ dbp->db_am_remove = __part_remove;
+ return (0);
+}
+
+/*
+ * __partition_set --
+ * Set the partitioning keys or callback function.
+ * This routine must be called prior to creating the database.
+ * PUBLIC: int __partition_set __P((DB *, u_int32_t, DBT *,
+ * PUBLIC: u_int32_t (*callback)(DB *, DBT *key)));
+ */
+int
+__partition_set(dbp, parts, keys, callback)
+ DB *dbp;
+ u_int32_t parts;
+ DBT *keys;
+ u_int32_t (*callback)(DB *, DBT *key);
+{
+ DB_PARTITION *part;
+ ENV *env;
+ int ret;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_partition");
+ env = dbp->dbenv->env;
+
+ if (parts < 2) {
+ __db_errx(env, DB_STR("0646",
+ "Must specify at least 2 partitions."));
+ return (EINVAL);
+ }
+
+ if (keys == NULL && callback == NULL) {
+ __db_errx(env, DB_STR("0647",
+ "Must specify either keys or a callback."));
+ return (EINVAL);
+ }
+ if (keys != NULL && callback != NULL) {
+bad: __db_errx(env, DB_STR("0648",
+ "May not specify both keys and a callback."));
+ return (EINVAL);
+ }
+
+ if ((ret = __partition_init(dbp,
+ keys != NULL ?
+ DBMETA_PART_RANGE : DBMETA_PART_CALLBACK)) != 0)
+ return (ret);
+ part = dbp->p_internal;
+
+ if ((part->keys != NULL && callback != NULL) ||
+ (part->callback != NULL && keys != NULL))
+ goto bad;
+
+ part->nparts = parts;
+ part->keys = keys;
+ part->callback = callback;
+
+ return (0);
+}
+
+/*
+ * __partition_set_dirs --
+ * Set the directories for creating the partition databases.
+ * They must be in the environment.
+ * PUBLIC: int __partition_set_dirs __P((DB *, const char **));
+ */
+int
+__partition_set_dirs(dbp, dirp)
+ DB *dbp;
+ const char **dirp;
+{
+ DB_ENV *dbenv;
+ DB_PARTITION *part;
+ ENV *env;
+ u_int32_t ndirs, slen;
+ int i, ret;
+ const char **dir;
+ char *cp, **part_dirs, **pd;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_partition_dirs");
+ dbenv = dbp->dbenv;
+ env = dbp->env;
+
+ ndirs = 1;
+ slen = 0;
+ for (dir = dirp; *dir != NULL; dir++) {
+ if (F_ISSET(env, ENV_DBLOCAL))
+ slen += (u_int32_t)strlen(*dir) + 1;
+ ndirs++;
+ }
+
+ slen += sizeof(char *) * ndirs;
+ if ((ret = __os_malloc(env, slen, &part_dirs)) != 0)
+ return (ret);
+ memset(part_dirs, 0, slen);
+
+ cp = (char *) part_dirs + (sizeof(char *) * ndirs);
+ pd = part_dirs;
+ for (dir = dirp; *dir != NULL; dir++, pd++) {
+ if (F_ISSET(env, ENV_DBLOCAL)) {
+ (void)strcpy(cp, *dir);
+ *pd = cp;
+ cp += strlen(*dir) + 1;
+ continue;
+ }
+ for (i = 0; i < dbenv->data_next; i++)
+ if (strcmp(*dir, dbenv->db_data_dir[i]) == 0)
+ break;
+ if (i == dbenv->data_next) {
+ __db_errx(dbp->env, DB_STR_A("0649",
+ "Directory not in environment list %s",
+ "%s"), *dir);
+ __os_free(env, part_dirs);
+ return (EINVAL);
+ }
+ *pd = dbenv->db_data_dir[i];
+ }
+
+ if ((part = dbp->p_internal) == NULL) {
+ if ((ret = __partition_init(dbp, 0)) != 0)
+ return (ret);
+ part = dbp->p_internal;
+ }
+
+ part->dirs = (const char **)part_dirs;
+
+ return (0);
+}
+
+/*
+ * __partition_open --
+ * Open/create a partitioned database.
+ * PUBLIC: int __partition_open __P((DB *, DB_THREAD_INFO *,
+ * PUBLIC: DB_TXN *, const char *, DBTYPE, u_int32_t, int, int));
+ */
+int
+__partition_open(dbp, ip, txn, fname, type, flags, mode, do_open)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *fname;
+ DBTYPE type;
+ u_int32_t flags;
+ int mode, do_open;
+{
+ DB *part_db;
+ DB_PARTITION *part;
+ DBC *dbc;
+ ENV *env;
+ u_int32_t part_id;
+ int ret;
+ char *name, *sp;
+ const char **dirp, *np;
+
+ part = dbp->p_internal;
+ env = dbp->dbenv->env;
+ name = NULL;
+
+ if ((ret = __partition_chk_meta(dbp, ip, txn, flags)) != 0 && do_open)
+ goto err;
+
+ if ((ret = __os_calloc(env,
+ part->nparts, sizeof(*part->handles), &part->handles)) != 0) {
+ __db_errx(env,
+ Alloc_err, part->nparts * sizeof(*part->handles));
+ goto err;
+ }
+
+ DB_ASSERT(env, fname != NULL);
+ if ((ret = __os_malloc(env,
+ strlen(fname) + PART_LEN + 1, &name)) != 0) {
+ __db_errx(env, Alloc_err, strlen(fname) + PART_LEN + 1);
+ goto err;
+ }
+
+ sp = name;
+ np = __db_rpath(fname);
+ if (np == NULL)
+ np = fname;
+ else {
+ np++;
+ (void)strncpy(name, fname, (size_t)(np - fname));
+ sp = name + (np - fname);
+ }
+
+ if (F_ISSET(dbp, DB_AM_RECOVER))
+ goto done;
+ dirp = part->dirs;
+ for (part_id = 0; part_id < part->nparts; part_id++) {
+ if ((ret = __db_create_internal(
+ &part->handles[part_id], dbp->env, 0)) != 0)
+ goto err;
+
+ part_db = part->handles[part_id];
+ part_db->flags = F_ISSET(dbp,
+ ~(DB_AM_CREATED | DB_AM_CREATED_MSTR | DB_AM_OPEN_CALLED));
+ F_SET(part_db, DB_AM_PARTDB);
+ part_db->adj_fileid = dbp->adj_fileid;
+ part_db->pgsize = dbp->pgsize;
+ part_db->priority = dbp->priority;
+ part_db->db_append_recno = dbp->db_append_recno;
+ part_db->db_feedback = dbp->db_feedback;
+ part_db->dup_compare = dbp->dup_compare;
+ part_db->app_private = dbp->app_private;
+ part_db->api_internal = dbp->api_internal;
+
+ if (dbp->type == DB_BTREE)
+ __bam_copy_config(dbp, part_db, part->nparts);
+#ifdef HAVE_HASH
+ if (dbp->type == DB_HASH)
+ __ham_copy_config(dbp, part_db, part->nparts);
+#endif
+
+ (void)sprintf(sp, PART_NAME, np, part_id);
+ if (do_open) {
+ /*
+ * Cycle through the directory names passed in,
+ * if any.
+ */
+ if (dirp != NULL &&
+ (part_db->dirname = *dirp++) == NULL) {
+ part_db->dirname = *(dirp = part->dirs);
+ dirp++;
+ }
+ if ((ret = __db_open(part_db, ip, txn,
+ name, NULL, type, flags, mode, PGNO_BASE_MD)) != 0)
+ goto err;
+ } else if ((ret = __os_strdup(env, name, &part_db->fname)) != 0)
+ goto err;
+ }
+
+	/* Discard the cursor used to open the database; it's the wrong type. */
+done: while ((dbc = TAILQ_FIRST(&dbp->free_queue)) != NULL)
+ if ((ret = __dbc_destroy(dbc)) != 0)
+ break;
+
+ if (0) {
+err: (void)__partition_close(dbp, txn, 0);
+ }
+ if (name != NULL)
+ __os_free(env, name);
+ return (ret);
+}
+
+/*
+ * __partition_chk_meta --
+ * Check for a consistent meta data page and parameters when opening a
+ * partitioned database.
+ */
+static int
+__partition_chk_meta(dbp, ip, txn, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ u_int32_t flags;
+{
+ DBMETA *meta;
+ DB_PARTITION *part;
+ DBC *dbc;
+ DB_LOCK metalock;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ db_pgno_t base_pgno;
+ int ret, t_ret;
+
+ dbc = NULL;
+ meta = NULL;
+ LOCK_INIT(metalock);
+ part = dbp->p_internal;
+ mpf = dbp->mpf;
+ env = dbp->env;
+ ret = 0;
+
+ /* Get a cursor on the main db. */
+ dbp->p_internal = NULL;
+ if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0)
+ goto err;
+
+ /* Get the metadata page. */
+ base_pgno = PGNO_BASE_MD;
+ if ((ret =
+ __db_lget(dbc, 0, base_pgno, DB_LOCK_READ, 0, &metalock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &base_pgno, ip, dbc->txn, 0, &meta)) != 0)
+ goto err;
+
+ if (meta->magic != DB_HASHMAGIC &&
+ (meta->magic != DB_BTREEMAGIC || F_ISSET(meta, BTM_RECNO))) {
+ __db_errx(env, DB_STR("0650",
+ "Partitioning may only specified on BTREE and HASH databases."));
+ ret = EINVAL;
+ goto err;
+ }
+ if (!FLD_ISSET(meta->metaflags,
+ DBMETA_PART_RANGE | DBMETA_PART_CALLBACK)) {
+ __db_errx(env, DB_STR("0651",
+ "Partitioning specified on a non-partitioned database."));
+ ret = EINVAL;
+ goto err;
+ }
+
+ if ((F_ISSET(part, PART_RANGE) &&
+ FLD_ISSET(meta->metaflags, DBMETA_PART_CALLBACK)) ||
+ (F_ISSET(part, PART_CALLBACK) &&
+ FLD_ISSET(meta->metaflags, DBMETA_PART_RANGE))) {
+ __db_errx(env, DB_STR("0652",
+ "Incompatible partitioning specified."));
+ ret = EINVAL;
+ goto err;
+ }
+
+ if (FLD_ISSET(meta->metaflags, DBMETA_PART_CALLBACK) &&
+ part->callback == NULL && !IS_RECOVERING(env) &&
+ !F_ISSET(dbp, DB_AM_RECOVER) && !LF_ISSET(DB_RDWRMASTER)) {
+ __db_errx(env, DB_STR("0653",
+ "Partition callback not specified."));
+ ret = EINVAL;
+ goto err;
+ }
+
+ if (F_ISSET(dbp, DB_AM_RECNUM)) {
+ __db_errx(env, DB_STR("0654",
+ "Record numbers are not supported in partitioned databases."));
+ ret = EINVAL;
+ goto err;
+ }
+
+ if (part->nparts == 0) {
+ if (LF_ISSET(DB_CREATE) && meta->nparts == 0) {
+ __db_errx(env, DB_STR("0655",
+ "Zero paritions specified."));
+ ret = EINVAL;
+ goto err;
+ } else
+ part->nparts = meta->nparts;
+ } else if (meta->nparts != 0 && part->nparts != meta->nparts) {
+ __db_errx(env, DB_STR("0656",
+ "Number of partitions does not match."));
+ ret = EINVAL;
+ goto err;
+ }
+
+ if (meta->magic == DB_HASHMAGIC) {
+ if (!F_ISSET(part, PART_CALLBACK)) {
+ __db_errx(env, DB_STR("0657",
+ "Hash database must specify a partition callback."));
+ ret = EINVAL;
+ }
+ } else if (meta->magic != DB_BTREEMAGIC) {
+ __db_errx(env, DB_STR("0658",
+ "Partitioning only supported on BTREE nad HASH."));
+ ret = EINVAL;
+ } else
+ ret = __partition_setup_keys(dbc, part, meta, flags);
+
+err: /* Put the metadata page back. */
+ if (meta != NULL && (t_ret = __memp_fput(mpf,
+ ip, meta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __LPUT(dbc, metalock)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ dbp->p_internal = part;
+ return (ret);
+}
+
+/*
+ * Support for sorting keys. Keys must be sorted using the btree
+ * compare function so if we call qsort in __partition_setup_keys
+ * we use this structure to pass the DBP and compare function.
+ */
+struct key_sort {
+ DB *dbp;
+ DBT *key;
+ int (*compare) __P((DB *, const DBT *, const DBT *));
+};
+
+static int __part_key_cmp(a, b)
+ const void *a, *b;
+{
+ const struct key_sort *ka, *kb;
+
+ ka = a;
+ kb = b;
+ return (ka->compare(ka->dbp, ka->key, kb->key));
+}
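+
+/*
+ * A condensed sketch of how the trampoline is used (see
+ * __partition_setup_keys below for the real call):
+ *
+ *	ks[j].dbp = dbp;
+ *	ks[j].compare = t->bt_compare;
+ *	ks[j].key = &keys[j];
+ *	...
+ *	qsort(ks, nkeys, sizeof(struct key_sort), __part_key_cmp);
+ */
+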
+/*
+ * __partition_setup_keys --
+ * Get the partition keys into memory, or put them to disk if we
+ * are creating a partitioned database.
+ */
+static int
+__partition_setup_keys(dbc, part, meta, flags)
+ DBC *dbc;
+ DB_PARTITION *part;
+ DBMETA *meta;
+ u_int32_t flags;
+{
+ BTREE *t;
+ DB *dbp;
+ DBT data, key, *keys, *kp;
+ ENV *env;
+ u_int32_t ds, i, j;
+ u_int8_t *dd;
+ struct key_sort *ks;
+ int have_keys, ret;
+ int (*compare) __P((DB *, const DBT *, const DBT *));
+ void *dp;
+
+ COMPQUIET(dd, NULL);
+ COMPQUIET(ds, 0);
+ memset(&data, 0, sizeof(data));
+ memset(&key, 0, sizeof(key));
+ ks = NULL;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ /* Need to just read the main database. */
+ dbp->p_internal = NULL;
+ have_keys = 0;
+
+	/* First verify that things are what we expect. */
+ if ((ret = __dbc_get(dbc, &key, &data, DB_FIRST)) != 0) {
+ if (ret != DB_NOTFOUND)
+ goto err;
+ if (F_ISSET(part, PART_CALLBACK)) {
+ ret = 0;
+ goto done;
+ }
+ if (!LF_ISSET(DB_CREATE) && !F_ISSET(dbp, DB_AM_RECOVER) &&
+ !LF_ISSET(DB_RDWRMASTER)) {
+ __db_errx(env, DB_STR("0659", "No range keys found."));
+ ret = EINVAL;
+ goto err;
+ }
+ } else {
+ if (F_ISSET(part, PART_CALLBACK)) {
+ __db_errx(env, DB_STR("0660",
+ "Keys found and callback set."));
+ ret = EINVAL;
+ goto err;
+ }
+ if (key.size != 0) {
+ __db_errx(env, DB_STR("0661",
+ "Partition key 0 is not empty."));
+ ret = EINVAL;
+ goto err;
+ }
+ have_keys = 1;
+ }
+
+ if (LF_ISSET(DB_CREATE) && have_keys == 0) {
+ /* Insert the keys into the master database. */
+ for (i = 0; i < part->nparts - 1; i++) {
+ if ((ret = __db_put(dbp, dbc->thread_info,
+ dbc->txn, &part->keys[i], &data, 0)) != 0)
+ goto err;
+ }
+
+ /*
+ * Insert the "0" pointer. All records less than the first
+ * given key go into this partition. We must use the default
+ * compare to insert this key, otherwise it might not be first.
+ */
+ t = dbc->dbp->bt_internal;
+ compare = t->bt_compare;
+ t->bt_compare = __bam_defcmp;
+ memset(&key, 0, sizeof(key));
+ ret = __db_put(dbp, dbc->thread_info, dbc->txn, &key, &data, 0);
+ t->bt_compare = compare;
+ if (ret != 0)
+ goto err;
+ }
+done: if (F_ISSET(part, PART_RANGE)) {
+ /*
+ * Allocate one page to hold the keys plus space at the
+ * end of the buffer to put an array of DBTs. If there
+ * is not enough space __dbc_get will return how much
+ * is needed and we realloc.
+ */
+ if ((ret = __os_malloc(env,
+ meta->pagesize + (sizeof(DBT) * part->nparts),
+ &part->data)) != 0) {
+ __db_errx(env, Alloc_err, meta->pagesize);
+ goto err;
+ }
+ memset(&key, 0, sizeof(key));
+ memset(&data, 0, sizeof(data));
+ data.data = part->data;
+ data.ulen = meta->pagesize;
+ data.flags = DB_DBT_USERMEM;
+again: if ((ret = __dbc_get(dbc, &key, &data,
+ DB_FIRST | DB_MULTIPLE_KEY)) == DB_BUFFER_SMALL) {
+ if ((ret = __os_realloc(env,
+ data.size + (sizeof(DBT) * part->nparts),
+ &part->data)) != 0)
+ goto err;
+ data.data = part->data;
+ data.ulen = data.size;
+ goto again;
+ }
+ if (ret == 0) {
+			/*
+			 * If the caller passed in keys, they must match
+			 * the keys already stored in the database.
+			 */
+ keys = NULL;
+ compare = NULL;
+ if (have_keys == 1 && (keys = part->keys) != NULL) {
+ t = dbc->dbp->bt_internal;
+ compare = t->bt_compare;
+ if ((ret = __os_malloc(env, (part->nparts - 1)
+ * sizeof(struct key_sort), &ks)) != 0)
+ goto err;
+ for (j = 0; j < part->nparts - 1; j++) {
+ ks[j].dbp = dbc->dbp;
+ ks[j].compare = compare;
+ ks[j].key = &keys[j];
+ }
+
+ qsort(ks, (size_t)part->nparts - 1,
+ sizeof(struct key_sort), __part_key_cmp);
+ }
+ DB_MULTIPLE_INIT(dp, &data);
+ part->keys = (DBT *)
+ ((u_int8_t *)part->data + data.size);
+ j = 0;
+ for (kp = part->keys;
+ kp < &part->keys[part->nparts]; kp++, j++) {
+ DB_MULTIPLE_KEY_NEXT(dp,
+ &data, kp->data, kp->size, dd, ds);
+ if (dp == NULL) {
+ ret = DB_NOTFOUND;
+ break;
+ }
+ if (keys != NULL && j != 0 &&
+ compare(dbc->dbp, ks[j - 1].key, kp) != 0) {
+ if (kp->data == NULL &&
+ F_ISSET(dbp, DB_AM_RECOVER))
+ goto err;
+ __db_errx(env, DB_STR_A("0662",
+ "Partition key %d does not match",
+ "%d"), j);
+ ret = EINVAL;
+ goto err;
+ }
+ }
+ }
+ }
+ if (ret == DB_NOTFOUND && F_ISSET(dbp, DB_AM_RECOVER))
+ ret = 0;
+
+err: dbp->p_internal = part;
+ if (ks != NULL)
+ __os_free(env, ks);
+ return (ret);
+}
+
+/*
+ * __partition_get_callback --
+ * Get the partition callback function.
+ * PUBLIC: int __partition_get_callback __P((DB *,
+ * PUBLIC: u_int32_t *, u_int32_t (**callback)(DB *, DBT *key)));
+ */
+int
+__partition_get_callback(dbp, parts, callback)
+ DB *dbp;
+ u_int32_t *parts;
+ u_int32_t (**callback)(DB *, DBT *key);
+{
+ DB_PARTITION *part;
+
+ part = dbp->p_internal;
+ /* Only return populated results if partitioned using callbacks. */
+ if (part != NULL && !F_ISSET(part, PART_CALLBACK))
+ part = NULL;
+ if (parts != NULL)
+ *parts = (part != NULL ? part->nparts : 0);
+ if (callback != NULL)
+ *callback = (part != NULL ? part->callback : NULL);
+
+ return (0);
+}
+
+/*
+ * __partition_get_keys --
+ * Get partition keys.
+ * PUBLIC: int __partition_get_keys __P((DB *, u_int32_t *, DBT **));
+ */
+int
+__partition_get_keys(dbp, parts, keys)
+ DB *dbp;
+ u_int32_t *parts;
+ DBT **keys;
+{
+ DB_PARTITION *part;
+
+ part = dbp->p_internal;
+ /* Only return populated results if partitioned using ranges. */
+ if (part != NULL && !F_ISSET(part, PART_RANGE))
+ part = NULL;
+ if (parts != NULL)
+ *parts = (part != NULL ? part->nparts : 0);
+ if (keys != NULL)
+ *keys = (part != NULL ? &part->keys[1] : NULL);
+
+ return (0);
+}
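+
+/*
+ * An application-level sketch (illustrative only).  As the code above
+ * shows, the returned array skips the implicit empty key of partition 0,
+ * so keys[0] is the lower bound of partition 1:
+ *
+ *	u_int32_t nparts;
+ *	DBT *keys;
+ *	if ((ret = dbp->get_partition_keys(dbp, &nparts, &keys)) != 0)
+ *		goto err;
+ */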
+
+/*
+ * __partition_get_dirs --
+ * Get partition dirs.
+ * PUBLIC: int __partition_get_dirs __P((DB *, const char ***));
+ */
+int
+__partition_get_dirs(dbp, dirpp)
+ DB *dbp;
+ const char ***dirpp;
+{
+ DB_PARTITION *part;
+ ENV *env;
+ u_int32_t i;
+ int ret;
+
+ env = dbp->env;
+ if ((part = dbp->p_internal) == NULL) {
+ *dirpp = NULL;
+ return (0);
+ }
+ if (!F_ISSET(dbp, DB_AM_OPEN_CALLED)) {
+ *dirpp = part->dirs;
+ return (0);
+ }
+
+ /*
+ * We build a list once when asked. The original directory list,
+ * if any, was discarded at open time.
+ */
+ if ((*dirpp = part->dirs) != NULL)
+ return (0);
+
+ if ((ret = __os_calloc(env,
+ sizeof(char *), part->nparts + 1, (void *) &part->dirs)) != 0)
+ return (ret);
+
+ for (i = 0; i < part->nparts; i++)
+ part->dirs[i] = part->handles[i]->dirname;
+
+ *dirpp = part->dirs;
+ return (0);
+}
+
+/*
+ * __partc_init --
+ * Initialize the access private portion of a cursor
+ *
+ * PUBLIC: int __partc_init __P((DBC *));
+ */
+int
+__partc_init(dbc)
+ DBC *dbc;
+{
+ ENV *env;
+ int ret;
+
+ env = dbc->env;
+
+ /* Allocate/initialize the internal structure. */
+ if (dbc->internal == NULL && (ret =
+ __os_calloc(env, 1, sizeof(PART_CURSOR), &dbc->internal)) != 0)
+ return (ret);
+
+ /* Initialize methods. */
+ dbc->close = dbc->c_close = __dbc_close_pp;
+ dbc->cmp = __dbc_cmp_pp;
+ dbc->count = dbc->c_count = __dbc_count_pp;
+ dbc->del = dbc->c_del = __dbc_del_pp;
+ dbc->dup = dbc->c_dup = __dbc_dup_pp;
+ dbc->get = dbc->c_get = __partc_get_pp;
+ dbc->pget = dbc->c_pget = __dbc_pget_pp;
+ dbc->put = dbc->c_put = __dbc_put_pp;
+ dbc->am_bulk = NULL;
+ dbc->am_close = __partc_close;
+ dbc->am_del = __partc_del;
+ dbc->am_destroy = __partc_destroy;
+ dbc->am_get = NULL;
+ dbc->am_put = __partc_put;
+ dbc->am_writelock = __partc_writelock;
+
+	/* Avoid swapping partition cursors; we swap the sub-cursors instead. */
+ F_SET(dbc, DBC_PARTITIONED);
+
+ return (0);
+}
+
+/*
+ * __partc_get_pp --
+ *	Cursor get operation on a partitioned database.
+ */
+static int
+__partc_get_pp(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ignore_lease, ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ ignore_lease = LF_ISSET(DB_IGNORE_LEASE) ? 1 : 0;
+ LF_CLR(DB_IGNORE_LEASE);
+ if ((ret = __dbc_get_arg(dbc, key, data, flags)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+
+ DEBUG_LREAD(dbc, dbc->txn, "DBcursor->get",
+ flags == DB_SET || flags == DB_SET_RANGE ? key : NULL, NULL, flags);
+
+ ret = __partc_get(dbc, key, data, flags);
+ /*
+ * Check for master leases.
+ */
+ if (ret == 0 &&
+ IS_REP_MASTER(env) && IS_USING_LEASES(env) && !ignore_lease)
+ ret = __rep_lease_check(env, 1);
+
+ ENV_LEAVE(env, ip);
+ __dbt_userfree(env, key, NULL, data);
+ return (ret);
+}
+
+/*
+ * __partc_get --
+ *	Cursor get operation on a partitioned database.
+ *
+ * PUBLIC: int __partc_get __P((DBC*, DBT *, DBT *, u_int32_t));
+ */
+int
+__partc_get(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DBC *orig_dbc, *new_dbc;
+ DB_PARTITION *part;
+ PART_CURSOR *cp;
+ u_int32_t multi, part_id;
+ int ret, retry, search;
+
+ dbp = dbc->dbp;
+ cp = (PART_CURSOR*)dbc->internal;
+ orig_dbc = cp->sub_cursor;
+ part = dbp->p_internal;
+
+ new_dbc = NULL;
+ retry = search = 0;
+ part_id = cp->part_id;
+ multi = flags & ~DB_OPFLAGS_MASK;
+
+ switch (flags & DB_OPFLAGS_MASK) {
+ case DB_CURRENT:
+ break;
+ case DB_FIRST:
+ part_id = 0;
+ retry = 1;
+ break;
+ case DB_GET_BOTH:
+ case DB_GET_BOTHC:
+ case DB_GET_BOTH_RANGE:
+ search = 1;
+ break;
+ case DB_SET_RANGE:
+ search = 1;
+ retry = 1;
+ break;
+ case DB_LAST:
+ part_id = part->nparts - 1;
+ retry = 1;
+ break;
+ case DB_NEXT:
+ case DB_NEXT_NODUP:
+ if (orig_dbc == NULL)
+ part_id = 0;
+ else
+ part_id = cp->part_id;
+ retry = 1;
+ break;
+ case DB_NEXT_DUP:
+ break;
+ case DB_PREV:
+ case DB_PREV_NODUP:
+ if (orig_dbc == NULL)
+ part_id = part->nparts - 1;
+ else
+ part_id = cp->part_id;
+ retry = 1;
+ break;
+ case DB_PREV_DUP:
+ break;
+ case DB_SET:
+ search = 1;
+ break;
+ default:
+ return (__db_unknown_flag(dbp->env, "__partc_get", flags));
+ }
+
+	/*
+	 * If we need to find the partition to start on, do a binary
+	 * search of the in-memory partition table.
+	 */
+ if (search == 1 && F_ISSET(part, PART_CALLBACK))
+ part_id = part->callback(dbp, key) % part->nparts;
+ else if (search == 1)
+ __part_search(dbp, part, key, &part_id);
+
+ /* Get a new cursor if necessary */
+ if (orig_dbc == NULL || cp->part_id != part_id) {
+ GET_PART_CURSOR(dbc, new_dbc, part_id);
+ } else
+ new_dbc = orig_dbc;
+
+ while ((ret = __dbc_get(new_dbc,
+ key, data, flags)) == DB_NOTFOUND && retry == 1) {
+ switch (flags & DB_OPFLAGS_MASK) {
+ case DB_FIRST:
+ case DB_NEXT:
+ case DB_NEXT_NODUP:
+ case DB_SET_RANGE:
+ if (++part_id < part->nparts) {
+ flags = DB_FIRST | multi;
+ break;
+ }
+ goto err;
+ case DB_LAST:
+ case DB_PREV:
+ case DB_PREV_NODUP:
+ if (part_id-- > 0) {
+ flags = DB_LAST | multi;
+ break;
+ }
+ goto err;
+ default:
+ goto err;
+ }
+
+ if (new_dbc != orig_dbc && (ret = __dbc_close(new_dbc)) != 0)
+ goto err;
+ GET_PART_CURSOR(dbc, new_dbc, part_id);
+ }
+
+ if (ret != 0)
+ goto err;
+
+ /* Success: swap original and new cursors. */
+ if (new_dbc != orig_dbc) {
+ if (orig_dbc != NULL) {
+ cp->sub_cursor = NULL;
+ if ((ret = __dbc_close(orig_dbc)) != 0)
+ goto err;
+ }
+ cp->sub_cursor = new_dbc;
+ cp->part_id = part_id;
+ }
+
+ return (0);
+
+err: if (new_dbc != NULL && new_dbc != orig_dbc)
+ (void)__dbc_close(new_dbc);
+ return (ret);
+}
+
+/*
+ * __partc_put --
+ *	Cursor put operation on a partitioned cursor.
+ *
+ */
+static int
+__partc_put(dbc, key, data, flags, pgnop)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+ db_pgno_t *pgnop;
+{
+ DB *dbp;
+ DB_PARTITION *part;
+ DBC *new_dbc;
+ PART_CURSOR *cp;
+ u_int32_t part_id;
+ int ret;
+
+ dbp = dbc->dbp;
+ cp = (PART_CURSOR*)dbc->internal;
+ part_id = cp->part_id;
+ part = dbp->p_internal;
+ *pgnop = PGNO_INVALID;
+
+ switch (flags) {
+ case DB_KEYFIRST:
+ case DB_KEYLAST:
+ case DB_NODUPDATA:
+ case DB_NOOVERWRITE:
+ case DB_OVERWRITE_DUP:
+ if (F_ISSET(part, PART_CALLBACK)) {
+ part_id = part->callback(dbp, key) % part->nparts;
+ break;
+ }
+ __part_search(dbp, part, key, &part_id);
+ break;
+ default:
+ break;
+ }
+
+ if ((new_dbc = cp->sub_cursor) == NULL || cp->part_id != part_id) {
+ if ((ret = __db_cursor_int(part->handles[part_id],
+ dbc->thread_info, dbc->txn, part->handles[part_id]->type,
+ PGNO_INVALID, 0, dbc->locker, &new_dbc)) != 0)
+ goto err;
+ }
+
+ if (F_ISSET(dbc, DBC_WRITER | DBC_WRITECURSOR))
+ F_SET(new_dbc, DBC_WRITER);
+ if ((ret = __dbc_put(new_dbc, key, data, flags)) != 0)
+ goto err;
+
+ if (new_dbc != cp->sub_cursor) {
+ if (cp->sub_cursor != NULL) {
+ if ((ret = __dbc_close(cp->sub_cursor)) != 0)
+ goto err;
+ cp->sub_cursor = NULL;
+ }
+ cp->sub_cursor = new_dbc;
+ cp->part_id = part_id;
+ }
+
+ return (0);
+
+err: if (new_dbc != NULL && cp->sub_cursor != new_dbc)
+ (void)__dbc_close(new_dbc);
+ return (ret);
+}
+
+/*
+ * __partc_del
+ * Delete interface to partitioned cursors.
+ *
+ */
+static int
+__partc_del(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ PART_CURSOR *cp;
+ cp = (PART_CURSOR*)dbc->internal;
+
+ if (F_ISSET(dbc, DBC_WRITER | DBC_WRITECURSOR))
+ F_SET(cp->sub_cursor, DBC_WRITER);
+ return (__dbc_del(cp->sub_cursor, flags));
+}
+
+/*
+ * __partc_writelock
+ * Writelock interface to partitioned cursors.
+ *
+ */
+static int
+__partc_writelock(dbc)
+ DBC *dbc;
+{
+ PART_CURSOR *cp;
+ cp = (PART_CURSOR*)dbc->internal;
+
+ return (cp->sub_cursor->am_writelock(cp->sub_cursor));
+}
+
+/*
+ * __partc_close
+ * Close interface to partitioned cursors.
+ *
+ */
+static int
+__partc_close(dbc, root_pgno, rmroot)
+ DBC *dbc;
+ db_pgno_t root_pgno;
+ int *rmroot;
+{
+ PART_CURSOR *cp;
+ int ret;
+
+ COMPQUIET(root_pgno, 0);
+ COMPQUIET(rmroot, NULL);
+
+ cp = (PART_CURSOR*)dbc->internal;
+
+ if (cp->sub_cursor == NULL)
+ return (0);
+ ret = __dbc_close(cp->sub_cursor);
+ cp->sub_cursor = NULL;
+ return (ret);
+}
+
+/*
+ * __partc_destroy --
+ * Destroy a single cursor.
+ */
+static int
+__partc_destroy(dbc)
+ DBC *dbc;
+{
+ PART_CURSOR *cp;
+ ENV *env;
+
+ cp = (PART_CURSOR *)dbc->internal;
+ env = dbc->env;
+
+ /* Discard the structure. Don't recurse. */
+ __os_free(env, cp);
+
+ return (0);
+}
+
+/*
+ * __partition_close
+ * Close a partitioned database.
+ *
+ * PUBLIC: int __partition_close __P((DB *, DB_TXN *, u_int32_t));
+ */
+int
+__partition_close(dbp, txn, flags)
+ DB *dbp;
+ DB_TXN *txn;
+ u_int32_t flags;
+{
+ DB **pdbp;
+ DB_PARTITION *part;
+ ENV *env;
+ u_int32_t i;
+ int ret, t_ret;
+
+ if ((part = dbp->p_internal) == NULL)
+ return (0);
+
+ env = dbp->env;
+ ret = 0;
+
+ if ((pdbp = part->handles) != NULL) {
+ for (i = 0; i < part->nparts; i++, pdbp++)
+ if (*pdbp != NULL && (t_ret =
+ __db_close(*pdbp, txn, flags)) != 0 && ret == 0)
+ ret = t_ret;
+ __os_free(env, part->handles);
+ }
+ if (part->dirs != NULL)
+ __os_free(env, (char **)part->dirs);
+ if (part->data != NULL)
+ __os_free(env, (char **)part->data);
+ __os_free(env, part);
+ dbp->p_internal = NULL;
+
+ return (ret);
+}
+
+/*
+ * __partition_sync
+ * Sync a partitioned database.
+ *
+ * PUBLIC: int __partition_sync __P((DB *));
+ */
+int
+__partition_sync(dbp)
+ DB *dbp;
+{
+ DB **pdbp;
+ DB_PARTITION *part;
+ u_int32_t i;
+ int ret, t_ret;
+
+ ret = 0;
+ part = dbp->p_internal;
+
+ if ((pdbp = part->handles) != NULL) {
+ for (i = 0; i < part->nparts; i++, pdbp++)
+ if (*pdbp != NULL &&
+ F_ISSET(*pdbp, DB_AM_OPEN_CALLED) && (t_ret =
+ __memp_fsync((*pdbp)->mpf)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ if ((t_ret = __memp_fsync(dbp->mpf)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __partition_stat
+ * Stat a partitioned database.
+ *
+ * PUBLIC: int __partition_stat __P((DBC *, void *, u_int32_t));
+ */
+int
+__partition_stat(dbc, spp, flags)
+ DBC *dbc;
+ void *spp;
+ u_int32_t flags;
+{
+ DB *dbp, **pdbp;
+ DB_BTREE_STAT *fsp, *bsp;
+#ifdef HAVE_HASH
+ DB_HASH_STAT *hfsp, *hsp;
+#endif
+ DB_PARTITION *part;
+ DBC *new_dbc;
+ ENV *env;
+ u_int32_t i;
+ int ret;
+
+ dbp = dbc->dbp;
+ part = dbp->p_internal;
+ env = dbp->env;
+ fsp = NULL;
+#ifdef HAVE_HASH
+ hfsp = NULL;
+#endif
+
+ pdbp = part->handles;
+ for (i = 0; i < part->nparts; i++, pdbp++) {
+ if ((ret = __db_cursor_int(*pdbp, dbc->thread_info, dbc->txn,
+ (*pdbp)->type, PGNO_INVALID,
+ 0, dbc->locker, &new_dbc)) != 0)
+ goto err;
+ switch (new_dbc->dbtype) {
+ case DB_BTREE:
+ if ((ret = __bam_stat(new_dbc, &bsp, flags)) != 0)
+ goto err;
+ if (fsp == NULL) {
+ fsp = bsp;
+ *(DB_BTREE_STAT **)spp = fsp;
+ } else {
+ fsp->bt_nkeys += bsp->bt_nkeys;
+ fsp->bt_ndata += bsp->bt_ndata;
+ fsp->bt_pagecnt += bsp->bt_pagecnt;
+ if (fsp->bt_levels < bsp->bt_levels)
+ fsp->bt_levels = bsp->bt_levels;
+ fsp->bt_int_pg += bsp->bt_int_pg;
+ fsp->bt_leaf_pg += bsp->bt_leaf_pg;
+ fsp->bt_dup_pg += bsp->bt_dup_pg;
+ fsp->bt_over_pg += bsp->bt_over_pg;
+ fsp->bt_free += bsp->bt_free;
+ fsp->bt_int_pgfree += bsp->bt_int_pgfree;
+ fsp->bt_leaf_pgfree += bsp->bt_leaf_pgfree;
+ fsp->bt_dup_pgfree += bsp->bt_dup_pgfree;
+ fsp->bt_over_pgfree += bsp->bt_over_pgfree;
+ __os_ufree(env, bsp);
+ }
+ break;
+#ifdef HAVE_HASH
+ case DB_HASH:
+ if ((ret = __ham_stat(new_dbc, &hsp, flags)) != 0)
+ goto err;
+ if (hfsp == NULL) {
+ hfsp = hsp;
+ *(DB_HASH_STAT **)spp = hfsp;
+ } else {
+ hfsp->hash_nkeys += hsp->hash_nkeys;
+ hfsp->hash_ndata += hsp->hash_ndata;
+ hfsp->hash_pagecnt += hsp->hash_pagecnt;
+ hfsp->hash_ffactor += hsp->hash_ffactor;
+ hfsp->hash_buckets += hsp->hash_buckets;
+ hfsp->hash_free += hsp->hash_free;
+ hfsp->hash_bfree += hsp->hash_bfree;
+ hfsp->hash_bigpages += hsp->hash_bigpages;
+ hfsp->hash_big_bfree += hsp->hash_big_bfree;
+ hfsp->hash_overflows += hsp->hash_overflows;
+ hfsp->hash_ovfl_free += hsp->hash_ovfl_free;
+ hfsp->hash_dup += hsp->hash_dup;
+ hfsp->hash_dup_free += hsp->hash_dup_free;
+ __os_ufree(env, hsp);
+ }
+ break;
+#endif
+ default:
+ break;
+ }
+ if ((ret = __dbc_close(new_dbc)) != 0)
+ goto err;
+ }
+ return (0);
+
+err:
+	if (fsp != NULL)
+		__os_ufree(env, fsp);
+#ifdef HAVE_HASH
+	if (hfsp != NULL)
+		__os_ufree(env, hfsp);
+#endif
+	*(DB_BTREE_STAT **)spp = NULL;
+ return (ret);
+}
+
+/*
+ * __part_truncate --
+ * Truncate a database.
+ *
+ * PUBLIC: int __part_truncate __P((DBC *, u_int32_t *));
+ */
+int
+__part_truncate(dbc, countp)
+ DBC *dbc;
+ u_int32_t *countp;
+{
+ DB *dbp, **pdbp;
+ DB_PARTITION *part;
+ DBC *new_dbc;
+ u_int32_t count, i;
+ int ret, t_ret;
+
+ dbp = dbc->dbp;
+ part = dbp->p_internal;
+ pdbp = part->handles;
+ ret = 0;
+
+ if (countp != NULL)
+ *countp = 0;
+ for (i = 0; ret == 0 && i < part->nparts; i++, pdbp++) {
+ if ((ret = __db_cursor_int(*pdbp, dbc->thread_info, dbc->txn,
+ (*pdbp)->type, PGNO_INVALID,
+ 0, dbc->locker, &new_dbc)) != 0)
+ break;
+ switch (dbp->type) {
+ case DB_BTREE:
+ case DB_RECNO:
+ ret = __bam_truncate(new_dbc, &count);
+ break;
+ case DB_HASH:
+#ifdef HAVE_HASH
+ ret = __ham_truncate(new_dbc, &count);
+ break;
+#endif
+ case DB_QUEUE:
+ case DB_UNKNOWN:
+ default:
+ ret = __db_unknown_type(dbp->env,
+ "DB->truncate", dbp->type);
+ count = 0;
+ break;
+ }
+ if ((t_ret = __dbc_close(new_dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (countp != NULL)
+ *countp += count;
+ }
+
+ return (ret);
+}
+
+/*
+ * __part_compact --
+ *	Compact a partitioned database.
+ *
+ * PUBLIC: int __part_compact __P((DB *, DB_THREAD_INFO *, DB_TXN *,
+ * PUBLIC: DBT *, DBT *, DB_COMPACT *, u_int32_t, DBT *));
+ */
+int
+__part_compact(dbp, ip, txn, start, stop, c_data, flags, end)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ DBT *start, *stop;
+ DB_COMPACT *c_data;
+ u_int32_t flags;
+ DBT *end;
+{
+ DB **pdbp;
+ DB_PARTITION *part;
+ u_int32_t i;
+ int ret;
+
+ part = dbp->p_internal;
+ pdbp = part->handles;
+ ret = 0;
+
+ for (i = 0; ret == 0 && i < part->nparts; i++, pdbp++) {
+ switch (dbp->type) {
+ case DB_HASH:
+ case DB_BTREE:
+ case DB_RECNO:
+ ret = __db_compact_int(*pdbp,
+ ip, txn, start, stop, c_data, flags, end);
+ break;
+
+ default:
+ ret = __dbh_am_chk(dbp, DB_OK_BTREE);
+ break;
+ }
+ }
+ return (ret);
+}
+
+/*
+ * __part_lsn_reset --
+ * reset the lsns on each partition.
+ *
+ * PUBLIC: int __part_lsn_reset __P((DB *, DB_THREAD_INFO *));
+ */
+int
+__part_lsn_reset(dbp, ip)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+{
+ DB **pdbp;
+ DB_PARTITION *part;
+ u_int32_t i;
+ int ret;
+
+ part = dbp->p_internal;
+ pdbp = part->handles;
+ ret = 0;
+
+ for (i = 0; ret == 0 && i < part->nparts; i++, pdbp++)
+ ret = __db_lsn_reset((*pdbp)->mpf, ip);
+
+ return (ret);
+}
+
+/*
+ * __part_fileid_reset --
+ * reset the fileid on each partition.
+ *
+ * PUBLIC: int __part_fileid_reset
+ * PUBLIC: __P((ENV *, DB_THREAD_INFO *, const char *, u_int32_t, int));
+ */
+int
+__part_fileid_reset(env, ip, fname, nparts, encrypted)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ const char *fname;
+ u_int32_t nparts;
+ int encrypted;
+{
+ int ret;
+ u_int32_t part_id;
+ char *name, *sp;
+ const char *np;
+
+ if ((ret = __os_malloc(env,
+ strlen(fname) + PART_LEN + 1, &name)) != 0) {
+ __db_errx(env, Alloc_err, strlen(fname) + PART_LEN + 1);
+ return (ret);
+ }
+
+ sp = name;
+ np = __db_rpath(fname);
+ if (np == NULL)
+ np = fname;
+ else {
+ np++;
+ (void)strncpy(name, fname, (size_t)(np - fname));
+ sp = name + (np - fname);
+ }
+
+ for (part_id = 0; ret == 0 && part_id < nparts; part_id++) {
+ (void)sprintf(sp, PART_NAME, np, part_id);
+ ret = __env_fileid_reset(env, ip, sp, encrypted);
+ }
+
+ __os_free(env, name);
+ return (ret);
+}
+
+/*
+ * __part_key_range --
+ * Return proportion of keys relative to given key.
+ *
+ * PUBLIC: int __part_key_range __P((DBC *, DBT *, DB_KEY_RANGE *, u_int32_t));
+ */
+int
+__part_key_range(dbc, dbt, kp, flags)
+ DBC *dbc;
+ DBT *dbt;
+ DB_KEY_RANGE *kp;
+ u_int32_t flags;
+{
+ BTREE_CURSOR *cp;
+ DBC *new_dbc;
+ DB_PARTITION *part;
+ PAGE *h;
+ u_int32_t id, part_id;
+ u_int32_t elems, empty, less_elems, my_elems, greater_elems;
+ u_int32_t levels, max_levels, my_levels;
+ db_pgno_t root_pgno;
+ int ret;
+ double total_elems;
+
+ COMPQUIET(flags, 0);
+
+ part = dbc->dbp->p_internal;
+
+ /*
+ * First we find the key range for the partition that contains the
+ * key. Then we scale based on estimates of the other partitions.
+ */
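+	/*
+	 * A worked example, with illustrative numbers only: if the key
+	 * falls in a subtree holding 100 of an estimated 400 top-level
+	 * records, with 120 records estimated to its left and 180 to its
+	 * right, the less/equal/greater values computed for that subtree
+	 * are each scaled by 100/400, then 120/400 is added to "less" and
+	 * 180/400 to "greater".
+	 */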
+ if (F_ISSET(part, PART_CALLBACK))
+ part_id = part->callback(dbc->dbp, dbt) % part->nparts;
+ else
+ __part_search(dbc->dbp, part, dbt, &part_id);
+ GET_PART_CURSOR(dbc, new_dbc, part_id);
+
+ if ((ret = __bam_key_range(new_dbc, dbt, kp, flags)) != 0)
+ goto err;
+
+ cp = (BTREE_CURSOR *)new_dbc->internal;
+
+ root_pgno = BAM_ROOT_PGNO(new_dbc);
+ if ((ret = __memp_fget(new_dbc->dbp->mpf, &root_pgno,
+ new_dbc->thread_info, new_dbc->txn, 0, &h)) != 0)
+ goto c_err;
+
+ my_elems = NUM_ENT(h);
+ my_levels = LEVEL(h);
+ max_levels = my_levels;
+
+ if ((ret = __memp_fput(new_dbc->dbp->mpf,
+ new_dbc->thread_info, h, new_dbc->priority)) != 0)
+ goto c_err;
+
+ if ((ret = __dbc_close(new_dbc)) != 0)
+ goto err;
+ /*
+ * We have the range within one subtree. Now estimate
+ * what part of the whole range that subtree is. Figure
+ * out how many levels each part has and how many entries
+ * in the level below the root.
+ */
+ empty = less_elems = greater_elems = 0;
+ for (id = 0; id < part->nparts; id++) {
+ if (id == part_id) {
+ empty = 0;
+ continue;
+ }
+ GET_PART_CURSOR(dbc, new_dbc, id);
+ cp = (BTREE_CURSOR *)new_dbc->internal;
+ if ((ret = __memp_fget(new_dbc->dbp->mpf, &cp->root,
+ new_dbc->thread_info, new_dbc->txn, 0, &h)) != 0)
+ goto c_err;
+
+ elems = NUM_ENT(h);
+ levels = LEVEL(h);
+ if (levels == 1)
+ elems /= 2;
+
+ if ((ret = __memp_fput(new_dbc->dbp->mpf,
+ new_dbc->thread_info, h, new_dbc->priority)) != 0)
+ goto c_err;
+
+ if ((ret = __dbc_close(new_dbc)) != 0)
+ goto err;
+
+ /* If the tree is empty, ignore it. */
+ if (elems == 0) {
+ empty++;
+ continue;
+ }
+
+ /*
+ * If a tree has fewer levels than the max just count
+ * it as a single element in the higher level.
+ */
+ if (id < part_id) {
+ if (levels > max_levels) {
+ max_levels = levels;
+ less_elems = id + elems - empty;
+ } else if (levels < max_levels)
+ less_elems++;
+ else
+ less_elems += elems;
+ } else {
+ if (levels > max_levels) {
+ max_levels = levels;
+ greater_elems = (id - part_id) + elems - empty;
+ } else if (levels < max_levels)
+ greater_elems++;
+ else
+ greater_elems += elems;
+ }
+
+ }
+
+ if (my_levels < max_levels) {
+ /*
+ * The subtree containing the key is not the tallest one.
+ * Reduce its share by the number of records at the highest
+ * level. Scale the greater and lesser components up
+ * by the number of records on either side of this
+ * subtree.
+ */
+ total_elems = 1 + greater_elems + less_elems;
+ kp->equal /= total_elems;
+ kp->less /= total_elems;
+ kp->less += less_elems/total_elems;
+ kp->greater /= total_elems;
+ kp->greater += greater_elems/total_elems;
+ } else if (my_levels == max_levels) {
+ /*
+ * The key is in one of the tallest subtrees. We will
+ * scale the values by the ratio of the records at the
+		 * top of this subtree to the number of records at the
+ * highest level.
+ */
+ total_elems = greater_elems + less_elems;
+ if (total_elems != 0) {
+ /*
+ * First scale down by the fraction of elements
+ * in this subtree.
+ */
+ total_elems += my_elems;
+ kp->equal *= my_elems;
+ kp->equal /= total_elems;
+ kp->less *= my_elems;
+ kp->less /= total_elems;
+ kp->greater *= my_elems;
+ kp->greater /= total_elems;
+ /*
+ * Proportionally add weight from the subtrees to the
+ * left and right of this one.
+ */
+ kp->less += less_elems / total_elems;
+ kp->greater += greater_elems / total_elems;
+ }
+ }
+
+ if (0) {
+c_err: (void)__dbc_close(new_dbc);
+ }
+
+err: return (ret);
+}
+
+/*
+ * __part_remove --
+ * Remove method for a partitioned database.
+ *
+ * PUBLIC: int __part_remove __P((DB *, DB_THREAD_INFO *,
+ * PUBLIC: DB_TXN *, const char *, const char *, u_int32_t));
+ */
+int
+__part_remove(dbp, ip, txn, name, subdb, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name, *subdb;
+ u_int32_t flags;
+{
+ return (__part_rr(dbp, ip, txn, name, subdb, NULL, flags));
+}
+
+/*
+ * __part_rename --
+ * Rename method for a partitioned database.
+ *
+ * PUBLIC: int __part_rename __P((DB *, DB_THREAD_INFO *,
+ * PUBLIC: DB_TXN *, const char *, const char *, const char *));
+ */
+int
+__part_rename(dbp, ip, txn, name, subdb, newname)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name, *subdb, *newname;
+{
+ return (__part_rr(dbp, ip, txn, name, subdb, newname, 0));
+}
+
+/*
+ * __part_rr --
+ * Remove/Rename method for a partitioned database.
+ */
+static int
+__part_rr(dbp, ip, txn, name, subdb, newname, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name, *subdb, *newname;
+ u_int32_t flags;
+{
+ DB **pdbp, *ptmpdbp, *tmpdbp;
+ DB_PARTITION *part;
+ ENV *env;
+ u_int32_t i;
+ int ret, t_ret;
+ char *np;
+
+ env = dbp->env;
+ ret = 0;
+
+ if (subdb != NULL && name != NULL) {
+ __db_errx(env, DB_STR("0663",
+ "A partitioned database can not be in a multiple databases file"));
+ return (EINVAL);
+ }
+ ENV_GET_THREAD_INFO(env, ip);
+
+ /*
+ * Since rename no longer opens the database, we have
+ * to do it here.
+ */
+ if ((ret = __db_create_internal(&tmpdbp, env, 0)) != 0)
+ return (ret);
+
+ /*
+ * We need to make sure we don't self-deadlock, so give
+ * this dbp the same locker as the incoming one.
+ */
+ tmpdbp->locker = dbp->locker;
+ if ((ret = __db_open(tmpdbp, ip, txn, name, NULL, dbp->type,
+ DB_RDWRMASTER | DB_RDONLY, 0, PGNO_BASE_MD)) != 0)
+ goto err;
+
+ part = tmpdbp->p_internal;
+ pdbp = part->handles;
+ COMPQUIET(np, NULL);
+ if (newname != NULL && (ret = __os_malloc(env,
+ strlen(newname) + PART_LEN + 1, &np)) != 0) {
+ __db_errx(env, Alloc_err, strlen(newname) + PART_LEN + 1);
+ goto err;
+ }
+ for (i = 0; i < part->nparts; i++, pdbp++) {
+ if ((ret = __db_create_internal(&ptmpdbp, env, 0)) != 0)
+ break;
+ ptmpdbp->locker = (*pdbp)->locker;
+ if (newname == NULL)
+ ret = __db_remove_int(ptmpdbp,
+ ip, txn, (*pdbp)->fname, NULL, flags);
+ else {
+ DB_ASSERT(env, np != NULL);
+ (void)sprintf(np, PART_NAME, newname, i);
+ ret = __db_rename_int(ptmpdbp,
+ ip, txn, (*pdbp)->fname, NULL, np, flags);
+ }
+ ptmpdbp->locker = NULL;
+ (void)__db_close(ptmpdbp, NULL, DB_NOSYNC);
+ if (ret != 0)
+ break;
+ }
+
+ if (newname != NULL)
+ __os_free(env, np);
+
+ if (!F_ISSET(dbp, DB_AM_OPEN_CALLED)) {
+err: /*
+ * Since we copied the locker ID from the dbp, we'd better not
+ * free it here.
+ */
+ tmpdbp->locker = NULL;
+
+ /* We need to remove the lock event we associated with this. */
+ if (txn != NULL)
+ __txn_remlock(env,
+ txn, &tmpdbp->handle_lock, DB_LOCK_INVALIDID);
+
+ if ((t_ret = __db_close(tmpdbp,
+ txn, DB_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ return (ret);
+}
+#ifdef HAVE_VERIFY
+/*
+ * __part_verify --
+ * Verify a partitioned database.
+ *
+ * PUBLIC: int __part_verify __P((DB *, VRFY_DBINFO *, const char *,
+ * PUBLIC: void *, int (*)(void *, const void *), u_int32_t));
+ */
+int
+__part_verify(dbp, vdp, fname, handle, callback, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ const char *fname;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ u_int32_t flags;
+{
+ BINTERNAL *lp, *rp;
+ DB **pdbp;
+ DB_PARTITION *part;
+ DBC *dbc;
+ DBT *key;
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ u_int32_t i;
+ int ret, t_ret;
+
+ env = dbp->env;
+ lp = rp = NULL;
+ dbc = NULL;
+ ip = vdp->thread_info;
+
+ if (dbp->type == DB_BTREE) {
+ if ((ret = __bam_open(dbp, ip,
+ NULL, fname, PGNO_BASE_MD, flags)) != 0)
+ goto err;
+ }
+#ifdef HAVE_HASH
+ else if ((ret = __ham_open(dbp, ip,
+ NULL, fname, PGNO_BASE_MD, flags)) != 0)
+ goto err;
+#endif
+
+ /*
+	 * Initialize partition db handles and get the names. Set DB_RDWRMASTER
+ * because we may not have the partition callback, but we can still
+ * look at the structure of the tree.
+ */
+ if ((ret = __partition_open(dbp,
+ ip, NULL, fname, dbp->type, flags | DB_RDWRMASTER, 0, 0)) != 0)
+ goto err;
+ part = dbp->p_internal;
+
+ if (LF_ISSET(DB_SALVAGE)) {
+ /* If we are being aggressive we don't want to dump the keys. */
+ if (LF_ISSET(DB_AGGRESSIVE))
+ dbp->p_internal = NULL;
+ ret = __db_prheader(dbp,
+ NULL, 0, 0, handle, callback, vdp, PGNO_BASE_MD);
+ dbp->p_internal = part;
+ if (ret != 0)
+ goto err;
+ }
+
+ if ((ret = __db_cursor(dbp, ip, NULL, &dbc, 0)) != 0)
+ goto err;
+
+ pdbp = part->handles;
+ for (i = 0; i < part->nparts; i++, pdbp++) {
+ if (!F_ISSET(part, PART_RANGE) || part->keys == NULL)
+ goto vrfy;
+ if (lp != NULL)
+ __os_free(env, lp);
+ lp = rp;
+ rp = NULL;
+ if (i + 1 < part->nparts) {
+ key = &part->keys[i + 1];
+ if ((ret = __os_malloc(env,
+ BINTERNAL_SIZE(key->size), &rp)) != 0)
+ goto err;
+ rp->len = key->size;
+ memcpy(rp->data, key->data, key->size);
+ B_TSET(rp->type, B_KEYDATA);
+ }
+vrfy: if ((t_ret = __db_verify(*pdbp, ip, (*pdbp)->fname,
+ NULL, handle, callback,
+ lp, rp, flags | DB_VERIFY_PARTITION)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+
+err: if (lp != NULL)
+ __os_free(env, lp);
+ if (rp != NULL)
+ __os_free(env, rp);
+ return (ret);
+}
+#endif
+
+#ifdef CONFIG_TEST
+/*
+ * __part_testdocopy -- copy all partitions for testing purposes.
+ *
+ * PUBLIC: int __part_testdocopy __P((DB *, const char *));
+ */
+int
+__part_testdocopy(dbp, name)
+ DB *dbp;
+ const char *name;
+{
+ DB **pdbp;
+ DB_PARTITION *part;
+ u_int32_t i;
+ int ret;
+
+ if ((ret = __db_testdocopy(dbp->env, name)) != 0)
+ return (ret);
+
+ part = dbp->p_internal;
+ pdbp = part->handles;
+ for (i = 0; i < part->nparts; i++, pdbp++)
+ if ((ret = __db_testdocopy(dbp->env, (*pdbp)->fname)) != 0)
+ return (ret);
+
+ return (0);
+}
+#endif
+#else
+/*
+ * __db_no_partition --
+ * Error when a Berkeley DB build doesn't include partitioning.
+ *
+ * PUBLIC: int __db_no_partition __P((ENV *));
+ */
+int
+__db_no_partition(env)
+ ENV *env;
+{
+ __db_errx(env, DB_STR("0664",
+ "library build did not include support for the database partitioning"));
+ return (DB_OPNOTSUP);
+}
+
+/*
+ * __partition_set --
+ * Set the partitioning keys or callback function.
+ * This routine must be called prior to creating the database.
+ * PUBLIC: int __partition_set __P((DB *, u_int32_t, DBT *,
+ * PUBLIC: u_int32_t (*callback)(DB *, DBT *key)));
+ */
+
+int
+__partition_set(dbp, parts, keys, callback)
+ DB *dbp;
+ u_int32_t parts;
+ DBT *keys;
+ u_int32_t (*callback)(DB *, DBT *key);
+{
+ COMPQUIET(parts, 0);
+ COMPQUIET(keys, NULL);
+ COMPQUIET(callback, NULL);
+
+ return (__db_no_partition(dbp->env));
+}
+
+/*
+ * __partition_get_callback --
+ *	Get the partition callback function.
+ * PUBLIC: int __partition_get_callback __P((DB *,
+ * PUBLIC: u_int32_t *, u_int32_t (**callback)(DB *, DBT *key)));
+ */
+int
+__partition_get_callback(dbp, parts, callback)
+ DB *dbp;
+ u_int32_t *parts;
+ u_int32_t (**callback)(DB *, DBT *key);
+{
+ COMPQUIET(parts, NULL);
+ COMPQUIET(callback, NULL);
+
+ return (__db_no_partition(dbp->env));
+}
+
+/*
+ * __partition_get_dirs --
+ * Get partition dirs.
+ * PUBLIC: int __partition_get_dirs __P((DB *, const char ***));
+ */
+int
+__partition_get_dirs(dbp, dirpp)
+ DB *dbp;
+ const char ***dirpp;
+{
+ COMPQUIET(dirpp, NULL);
+ return (__db_no_partition(dbp->env));
+}
+
+/*
+ * __partition_get_keys --
+ * Get partition keys.
+ * PUBLIC: int __partition_get_keys __P((DB *, u_int32_t *, DBT **));
+ */
+int
+__partition_get_keys(dbp, parts, keys)
+ DB *dbp;
+ u_int32_t *parts;
+ DBT **keys;
+{
+ COMPQUIET(parts, NULL);
+ COMPQUIET(keys, NULL);
+
+ return (__db_no_partition(dbp->env));
+}
+
+/*
+ * __partition_init --
+ * Initialize the partition structure.
+ * Called when the meta data page is read in during database open or
+ * when partition keys or a callback are set.
+ *
+ * PUBLIC: int __partition_init __P((DB *, u_int32_t));
+ */
+int
+__partition_init(dbp, flags)
+ DB *dbp;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+
+ return (__db_no_partition(dbp->env));
+}
+
+/*
+ * __part_fileid_reset --
+ * reset the fileid on each partition.
+ *
+ * PUBLIC: int __part_fileid_reset
+ * PUBLIC: __P((ENV *, DB_THREAD_INFO *, const char *, u_int32_t, int));
+ */
+int
+__part_fileid_reset(env, ip, fname, nparts, encrypted)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ const char *fname;
+ u_int32_t nparts;
+ int encrypted;
+{
+ COMPQUIET(ip, NULL);
+ COMPQUIET(fname, NULL);
+ COMPQUIET(nparts, 0);
+ COMPQUIET(encrypted, 0);
+
+ return (__db_no_partition(env));
+}
+/*
+ * __partition_set_dirs --
+ * Set the directories for creating the partition databases.
+ * They must be in the environment.
+ * PUBLIC: int __partition_set_dirs __P((DB *, const char **));
+ */
+int
+__partition_set_dirs(dbp, dirp)
+ DB *dbp;
+ const char **dirp;
+{
+ COMPQUIET(dirp, NULL);
+
+ return (__db_no_partition(dbp->env));
+}
+#endif
diff --git a/src/dbinc/atomic.h b/src/dbinc/atomic.h
new file mode 100644
index 00000000..096176a5
--- /dev/null
+++ b/src/dbinc/atomic.h
@@ -0,0 +1,220 @@
+/*
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2009, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_ATOMIC_H_
+#define _DB_ATOMIC_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * Atomic operation support for Oracle Berkeley DB
+ *
+ * HAVE_ATOMIC_SUPPORT configures whether to use the assembly language
+ * or system calls to perform:
+ *
+ * atomic_inc(env, valueptr)
+ * Adds 1 to the db_atomic_t value, returning the new value.
+ *
+ * atomic_dec(env, valueptr)
+ * Subtracts 1 from the db_atomic_t value, returning the new value.
+ *
+ * atomic_compare_exchange(env, valueptr, oldval, newval)
+ * If the db_atomic_t's value is still oldval, set it to newval.
+ * It returns 1 for success or 0 for failure.
+ *
+ * The ENV * parameter is used only when HAVE_ATOMIC_SUPPORT is undefined.
+ *
+ * If the platform does not natively support any one of these operations,
+ * then atomic operations will be emulated with this sequence:
+ * MUTEX_LOCK()
+ * <op>
+ * MUTEX_UNLOCK();
+ * Uses where mutexes are not available (e.g. the environment has not yet
+ * attached to the mutex region) must be avoided.
+ */
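+
+/*
+ * A usage sketch, illustrative only (the cache's buffer reference
+ * counts are one style of consumer):
+ *
+ *	db_atomic_t ref;
+ *	atomic_init(&ref, 0);
+ *	(void)atomic_inc(env, &ref);
+ *	if (atomic_compare_exchange(env, &ref, 1, 0)) {
+ *		...the count was 1 and has atomically become 0...
+ *	}
+ */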
+#if defined(DB_WIN32)
+typedef DWORD atomic_value_t;
+#else
+typedef int32_t atomic_value_t;
+#endif
+
+/*
+ * Windows CE has strange issues using the Interlocked APIs with variables
+ * stored in shared memory. It seems like the page needs to have been written
+ * prior to the API working as expected. Work around this by allocating an
+ * additional 32-bit value that can be harmlessly written for each value
+ * used in Interlocked instructions.
+ */
+#if defined(DB_WINCE)
+typedef struct {
+ volatile atomic_value_t value;
+ volatile atomic_value_t dummy;
+} db_atomic_t;
+#else
+typedef struct {
+ volatile atomic_value_t value;
+} db_atomic_t;
+#endif
+
+/*
+ * These macros hide the db_atomic_t structure layout and help detect
+ * non-atomic_t actual arguments to the atomic_xxx() calls. DB requires
+ * aligned 32-bit reads to be atomic even outside of explicit 'atomic' calls.
+ * These have no memory barriers; the caller must include them when necessary.
+ */
+#define atomic_read(p) ((p)->value)
+#define atomic_init(p, val) ((p)->value = (val))
+
+#ifdef HAVE_ATOMIC_SUPPORT
+
+#if defined(DB_WIN32)
+#if defined(DB_WINCE)
+#define WINCE_ATOMIC_MAGIC(p) \
+ /* \
+ * Memory mapped regions on Windows CE cause problems with \
+ * InterlockedXXX calls. Each page in a mapped region needs to \
+ * have been written to prior to an InterlockedXXX call, or the \
+ * InterlockedXXX call hangs. This does not seem to be \
+ * documented anywhere. For now, read/write a non-critical \
+ * piece of memory from the shared region prior to attempting \
+ * shared region prior to attempting an InterlockedExchange \
+ * InterlockedXXX operation. \
+ */ \
+ (p)->dummy = 0
+#else
+#define WINCE_ATOMIC_MAGIC(p) 0
+#endif
+
+#if defined(DB_WINCE) || (defined(_MSC_VER) && _MSC_VER < 1300)
+/*
+ * The Interlocked instructions on Windows CE have different parameter
+ * definitions. The parameters lost their 'volatile' qualifier;
+ * cast it away to avoid compiler warnings.
+ * These definitions should match those in dbinc/mutex_int.h for tsl_t, except
+ * that the WINCE version drops the volatile qualifier.
+ */
+typedef PLONG interlocked_val;
+#define atomic_inc(env, p) \
+ (WINCE_ATOMIC_MAGIC(p), \
+ InterlockedIncrement((interlocked_val)(&(p)->value)))
+
+#else
+typedef LONG volatile *interlocked_val;
+#define atomic_inc(env, p) \
+ InterlockedIncrement((interlocked_val)(&(p)->value))
+#endif
+
+#define atomic_dec(env, p) \
+ (WINCE_ATOMIC_MAGIC(p), \
+ InterlockedDecrement((interlocked_val)(&(p)->value)))
+#if defined(_MSC_VER) && _MSC_VER < 1300
+#define atomic_compare_exchange(env, p, oldval, newval) \
+ (WINCE_ATOMIC_MAGIC(p), \
+ (InterlockedCompareExchange((PVOID *)(&(p)->value), \
+ (PVOID)(newval), (PVOID)(oldval)) == (PVOID)(oldval)))
+#else
+#define atomic_compare_exchange(env, p, oldval, newval) \
+ (WINCE_ATOMIC_MAGIC(p), \
+ (InterlockedCompareExchange((interlocked_val)(&(p)->value), \
+ (newval), (oldval)) == (oldval)))
+#endif
+#endif
+
+#if defined(HAVE_ATOMIC_SOLARIS)
+/* Solaris sparc & x86/64 */
+#include <atomic.h>
+#define atomic_inc(env, p) \
+ atomic_inc_uint_nv((volatile unsigned int *) &(p)->value)
+#define atomic_dec(env, p) \
+ atomic_dec_uint_nv((volatile unsigned int *) &(p)->value)
+#define atomic_compare_exchange(env, p, oval, nval) \
+ (atomic_cas_32((volatile unsigned int *) &(p)->value, \
+ (oval), (nval)) == (oval))
+#endif
+
+#if defined(HAVE_ATOMIC_X86_GCC_ASSEMBLY)
+/* x86/x86_64 gcc */
+#define atomic_inc(env, p) __atomic_inc(p)
+#define atomic_dec(env, p) __atomic_dec(p)
+#define atomic_compare_exchange(env, p, o, n) \
+ __atomic_compare_exchange((p), (o), (n))
+static inline int __atomic_inc(db_atomic_t *p)
+{
+ int temp;
+
+ temp = 1;
+ __asm__ __volatile__("lock; xadd %0, (%1)"
+ : "+r"(temp)
+ : "r"(p));
+ return (temp + 1);
+}
+
+static inline int __atomic_dec(db_atomic_t *p)
+{
+ int temp;
+
+ temp = -1;
+ __asm__ __volatile__("lock; xadd %0, (%1)"
+ : "+r"(temp)
+ : "r"(p));
+ return (temp - 1);
+}
+
+/*
+ * x86/gcc Compare exchange for shared latches. i486+
+ * Returns 1 for success, 0 for failure
+ *
+ * GCC 4.1+ has an equivalent __sync_bool_compare_and_swap(), as well as
+ * __sync_val_compare_and_swap(), which returns the value read from *dest:
+ * http://gcc.gnu.org/onlinedocs/gcc-4.1.0/gcc/Atomic-Builtins.html
+ * Configure could be changed to use either builtin.
+ */
+static inline int __atomic_compare_exchange(
+ db_atomic_t *p, atomic_value_t oldval, atomic_value_t newval)
+{
+ atomic_value_t was;
+
+ if (p->value != oldval) /* check without expensive cache line locking */
+ return 0;
+ __asm__ __volatile__("lock; cmpxchgl %1, (%2);"
+ :"=a"(was)
+ :"r"(newval), "r"(p), "a"(oldval)
+ :"memory", "cc");
+ return (was == oldval);
+}
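+
+/*
+ * For comparison, a sketch of the same operation written with the
+ * gcc 4.1+ builtin mentioned above (not what configure selects today):
+ *
+ *	static inline int __atomic_compare_exchange(
+ *	    db_atomic_t *p, atomic_value_t oldval, atomic_value_t newval)
+ *	{
+ *		return (__sync_bool_compare_and_swap(
+ *		    &p->value, oldval, newval));
+ *	}
+ */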
+#endif
+
+#else
+/*
+ * No native hardware support for atomic increment, decrement, and
+ * compare-exchange. Emulate them when mutexes are supported;
+ * do them without concern for atomicity when no mutexes.
+ */
+#ifndef HAVE_MUTEX_SUPPORT
+/*
+ * These minimal versions are correct to use only for single-threaded,
+ * single-process environments.
+ */
+#define atomic_inc(env, p) (++(p)->value)
+#define atomic_dec(env, p) (--(p)->value)
+#define atomic_compare_exchange(env, p, oldval, newval) \
+ (DB_ASSERT(env, atomic_read(p) == (oldval)), \
+ atomic_init(p, (newval)), 1)
+#else
+#define atomic_inc(env, p) __atomic_inc(env, p)
+#define atomic_dec(env, p) __atomic_dec(env, p)
+#endif
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* !_DB_ATOMIC_H_ */
diff --git a/src/dbinc/btree.h b/src/dbinc/btree.h
new file mode 100644
index 00000000..86bbec14
--- /dev/null
+++ b/src/dbinc/btree.h
@@ -0,0 +1,553 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+#ifndef _DB_BTREE_H_
+#define _DB_BTREE_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* Forward structure declarations. */
+struct __btree; typedef struct __btree BTREE;
+struct __cursor; typedef struct __cursor BTREE_CURSOR;
+struct __epg; typedef struct __epg EPG;
+
+#define DEFMINKEYPAGE (2)
+
+/*
+ * A recno order of 0 indicates that we don't have an order, not that we've
+ * an order less than 1.
+ */
+#define INVALID_ORDER 0
+
+#define ISINTERNAL(p) (TYPE(p) == P_IBTREE || TYPE(p) == P_IRECNO)
+#define ISLEAF(p) (TYPE(p) == P_LBTREE || \
+ TYPE(p) == P_LRECNO || TYPE(p) == P_LDUP)
+
+/* Flags for __bam_cadjust_log(). */
+#define CAD_UPDATEROOT 0x01 /* Root page count was updated. */
+
+/* Flags for __bam_split_log(). */
+#define SPL_NRECS 0x01 /* Split tree has record count. */
+#define SPL_RECNO 0x02 /* This is a Recno cursor. */
+
+/* Flags for __bam_iitem(). */
+#define BI_DELETED 0x01 /* Key/data pair only placeholder. */
+
+/* Flags for __bam_stkrel(). */
+#define STK_CLRDBC 0x01 /* Clear dbc->page reference. */
+#define STK_NOLOCK 0x02 /* Don't retain locks. */
+#define STK_PGONLY 0x04
+
+/* Flags for __ram_ca(). These get logged, so make the values explicit. */
+typedef enum {
+ CA_DELETE = 0, /* Delete the current record. */
+	CA_IAFTER = 1,		/* Insert after the current record. */
+	CA_IBEFORE = 2,		/* Insert before the current record. */
+ CA_ICURRENT = 3 /* Overwrite the current record. */
+} ca_recno_arg;
+
+/*
+ * Flags for __bam_search() and __bam_rsearch().
+ *
+ * Note, internal page searches must find the largest record less than key in
+ * the tree so that descents work. Leaf page searches must find the smallest
+ * record greater than key so that the returned index is the record's correct
+ * position for insertion.
+ *
+ * The flags parameter to the search routines describes three aspects of the
+ * search: the type of locking required (including if we're locking a pair of
+ * pages), the item to return in the presence of duplicates and whether or not
+ * to return deleted entries. To simplify both the mnemonic representation
+ * and the code that checks for various cases, we construct a set of bitmasks.
+ */
+#define SR_READ 0x00001 /* Read locks. */
+#define SR_WRITE 0x00002 /* Write locks. */
+
+#define SR_APPEND 0x00040 /* Append to the tree. */
+#define SR_DELNO 0x00080 /* Don't return deleted items. */
+#define SR_DUPFIRST 0x00100 /* Return first duplicate. */
+#define SR_DUPLAST 0x00200 /* Return last duplicate. */
+#define SR_EXACT 0x00400 /* Exact items only. */
+#define SR_PARENT 0x00800 /* Lock page pair. */
+#define SR_STACK 0x01000 /* Need a complete stack. */
+#define SR_PAST_EOF 0x02000 /* If doing insert search (or keyfirst
+ * or keylast operations), or a split
+ * on behalf of an insert, it's okay to
+ * return an entry one past end-of-page.
+ */
+#define SR_STK_ONLY 0x04000 /* Just return info in the stack */
+#define SR_MAX 0x08000 /* Get the right most key */
+#define SR_MIN 0x10000 /* Get the left most key */
+#define SR_NEXT 0x20000 /* Get the page after this key */
+#define SR_DEL 0x40000 /* Get the tree to delete this key. */
+#define SR_START 0x80000 /* Level to start stack. */
+#define SR_BOTH 0x100000 /* Get this and the NEXT page */
+
+#define SR_DELETE \
+ (SR_WRITE | SR_DUPFIRST | SR_DELNO | SR_EXACT | SR_STACK)
+#define SR_FIND (SR_READ | SR_DUPFIRST | SR_DELNO)
+#define SR_FIND_WR (SR_WRITE | SR_DUPFIRST | SR_DELNO)
+#define SR_INSERT (SR_WRITE | SR_DUPLAST | SR_PAST_EOF | SR_STACK)
+#define SR_KEYFIRST (SR_WRITE | SR_DUPFIRST | SR_PAST_EOF | SR_STACK)
+#define SR_KEYLAST (SR_WRITE | SR_DUPLAST | SR_PAST_EOF | SR_STACK)
+#define SR_WRPAIR (SR_WRITE | SR_DUPLAST | SR_PAST_EOF | SR_PARENT)
+
+/*
+ * Various routines pass around page references. A page reference is
+ * a pointer to the page, and the indx indicates an item on the page.
+ * Each page reference may include a lock.
+ */
+struct __epg {
+ PAGE *page; /* The page. */
+ db_indx_t indx; /* The index on the page. */
+ db_indx_t entries; /* The number of entries on page */
+ DB_LOCK lock; /* The page's lock. */
+ db_lockmode_t lock_mode; /* The lock mode. */
+};
+
+/*
+ * We maintain a stack of the pages that we're locking in the tree. Grow
+ * the stack as necessary.
+ *
+ * XXX
+ * Temporary fix for #3243 -- clear the page and lock from the stack entry.
+ * The correct fix is to never release a stack that doesn't hold items.
+ */
+#define BT_STK_CLR(c) do { \
+ (c)->csp = (c)->sp; \
+ (c)->csp->page = NULL; \
+ LOCK_INIT((c)->csp->lock); \
+} while (0)
+
+#define BT_STK_ENTER(env, c, pagep, page_indx, l, mode, ret) do { \
+ if ((ret = ((c)->csp == (c)->esp ? \
+ __bam_stkgrow(env, c) : 0)) == 0) { \
+ (c)->csp->page = pagep; \
+ (c)->csp->indx = (page_indx); \
+ (c)->csp->entries = NUM_ENT(pagep); \
+ (c)->csp->lock = l; \
+ (c)->csp->lock_mode = mode; \
+ } \
+} while (0)
+
+#define BT_STK_PUSH(env, c, pagep, page_indx, lock, mode, ret) do { \
+ BT_STK_ENTER(env, c, pagep, page_indx, lock, mode, ret); \
+ ++(c)->csp; \
+} while (0)
+
+#define BT_STK_NUM(env, c, pagep, page_indx, ret) do { \
+ if ((ret = ((c)->csp == \
+ (c)->esp ? __bam_stkgrow(env, c) : 0)) == 0) { \
+ (c)->csp->page = NULL; \
+ (c)->csp->indx = (page_indx); \
+ (c)->csp->entries = NUM_ENT(pagep); \
+ LOCK_INIT((c)->csp->lock); \
+ (c)->csp->lock_mode = DB_LOCK_NG; \
+ } \
+} while (0)
+
+#define BT_STK_NUMPUSH(env, c, pagep, page_indx, ret) do { \
+	BT_STK_NUM(env, c, pagep, page_indx, ret);			\
+ ++(c)->csp; \
+} while (0)
+
+#define BT_STK_POP(c) \
+ ((c)->csp == (c)->sp ? NULL : --(c)->csp)
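+
+/*
+ * A condensed sketch of the push/pop pattern (error handling elided,
+ * names illustrative); in the source, __bam_stkrel() walks the stack
+ * to release the pages and locks:
+ *
+ *	BT_STK_PUSH(env, cp, h, indx, lock, lock_mode, ret);
+ *	...descend to the next level...
+ *	while ((epg = BT_STK_POP(cp)) != NULL)
+ *		(void)__memp_fput(mpf, ip, epg->page, dbc->priority);
+ */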
+
+/*
+ * Flags for __bam_dpages.
+ */
+#define BTD_UPDATE 0x0001 /* Update parents. */
+#define BTD_RELINK 0x0002 /* Relink leaf pages. */
+
+/*
+ * TRY_LOCK
+ * When holding a stack we have pages latched but not locked so
+ * we must avoid an undetectable deadlock by not then blocking on a
+ * lock.
+ */
+#define TRY_LOCK(dbc, pgno, saved_pgno, saved_lock, lock_mode, label) \
+ TRY_LOCK2(dbc, NULL, pgno, saved_pgno, saved_lock, lock_mode, label)
+/*
+ * TRY_LOCK2
+ * This is a special call for __bam_compact_int which uses 2
+ * overlapping stacks.
+ */
+
+#ifdef BTREE_DEBUG
+#define TRY_LOCK2(dbc, ndbc, pgno, \
+ saved_pgno, saved_lock, lock_mode, label) do { \
+ static int BTcount = 0; \
+ if ((pgno) != (saved_pgno) && \
+ ((BTcount++ % 5) == 0 || \
+ (ret = __db_lget(dbc, LCK_COUPLE_ALWAYS, pgno, \
+ lock_mode, DB_LOCK_NOWAIT, &(saved_lock))) != 0)) { \
+ if (ret != 0 && ret != DB_LOCK_NOTGRANTED && \
+ ret != DB_LOCK_DEADLOCK) \
+ break; \
+ if ((ndbc) != NULL) { \
+ BTREE_CURSOR *__cp; \
+ __cp = (BTREE_CURSOR *) (dbc)->internal; \
+ __cp->sp->page = NULL; \
+ LOCK_INIT(__cp->sp->lock); \
+ if ((ret = __bam_stkrel(ndbc, 0)) != 0) \
+ break; \
+ } \
+ if ((ret = __bam_stkrel(dbc, 0)) != 0) \
+ break; \
+ if ((ret = __db_lget(dbc, LCK_COUPLE_ALWAYS, pgno, \
+ lock_mode, 0, &(saved_lock))) != 0) \
+ break; \
+ saved_pgno = pgno; \
+ goto label; \
+ } \
+ saved_pgno = pgno; \
+} while (0)
+#else
+#define TRY_LOCK2(dbc, ndbc, pgno, \
+ saved_pgno, saved_lock, lock_mode, label) do { \
+ if ((pgno) != (saved_pgno) && \
+ (ret = __db_lget(dbc, LCK_COUPLE_ALWAYS, pgno, \
+ lock_mode, DB_LOCK_NOWAIT, &(saved_lock))) != 0) { \
+ if (ret != DB_LOCK_NOTGRANTED && \
+ ret != DB_LOCK_DEADLOCK) \
+ break; \
+ if ((ndbc) != NULL) { \
+ BTREE_CURSOR *__cp; \
+ __cp = (BTREE_CURSOR *) (dbc)->internal; \
+ __cp->sp->page = NULL; \
+ LOCK_INIT(__cp->sp->lock); \
+ if ((ret = __bam_stkrel(ndbc, 0)) != 0) \
+ break; \
+ } \
+ if ((ret = __bam_stkrel(dbc, 0)) != 0) \
+ break; \
+ if ((ret = __db_lget(dbc, LCK_COUPLE_ALWAYS, pgno, \
+ lock_mode, 0, &(saved_lock))) != 0) \
+ break; \
+ saved_pgno = pgno; \
+ goto label; \
+ } \
+ saved_pgno = pgno; \
+} while (0)
+#endif
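+
+/*
+ * A condensed caller sketch, as in the compaction code (names are
+ * illustrative):
+ *
+ *	retry:	...
+ *		TRY_LOCK(dbc, npgno, saved_pgno, saved_lock,
+ *		    DB_LOCK_WRITE, retry);
+ *		if (ret != 0)
+ *			goto err;
+ */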
+
+/* Btree/Recno cursor. */
+struct __cursor {
+ /* struct __dbc_internal */
+ __DBC_INTERNAL
+
+ /* btree private part */
+ EPG *sp; /* Stack pointer. */
+ EPG *csp; /* Current stack entry. */
+ EPG *esp; /* End stack pointer. */
+ EPG stack[5];
+
+ db_indx_t ovflsize; /* Maximum key/data on-page size. */
+
+ db_recno_t recno; /* Current record number. */
+ u_int32_t order; /* Relative order among deleted curs. */
+
+#ifdef HAVE_COMPRESSION
+ /*
+ * Compression:
+ *
+ * We need to hold the current compressed chunk, as well as the previous
+ * key/data, in order to decompress the next key/data. We do that by
+ * swapping whether prevKey/Data and currentKey/Data point to
+ * key1/data1, or key2/data2.
+ *
+ * We store prevcursor in order to be able to perform one level of
+ * DB_PREV by returning prevKey/prevData. We need prev2cursor to more
+ * efficiently do a subsequent DB_PREV with a linear search from the
+ * beginning of the compressed chunk.
+ *
+ * When we delete entries, we set the cursor to point to the next entry
+ * after the last deleted key, and set C_COMPRESS_DELETED. The del_key
+ * DBT holds the key of the deleted entry supposedly pointed to by a
+ * compressed cursor, and is used to implement DB_PREV_DUP,
+ * DB_PREV_NODUP, DB_NEXT_DUP, and DB_NEXT_NODUP on a deleted entry.
+ */
+ DBT compressed; /* Current compressed chunk */
+ DBT key1; /* Holds prevKey or currentKey */
+ DBT key2; /* Holds prevKey or currentKey */
+ DBT data1; /* Holds prevData or currentData */
+ DBT data2; /* Holds prevData or currentData */
+ DBT del_key; /* Holds key from the deleted entry */
+ DBT del_data; /* Holds data from the deleted entry */
+ DBT *prevKey; /* Previous key decompressed */
+ DBT *prevData; /* Previous data decompressed */
+ DBT *currentKey; /* Current key decompressed */
+ DBT *currentData; /* Current data decompressed */
+ u_int8_t *compcursor; /* Current position in compressed */
+ u_int8_t *compend; /* End of compressed */
+ u_int8_t *prevcursor; /* Previous current position */
+ u_int8_t *prev2cursor; /* Previous previous current position */
+#endif
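+
+/*
+ * A minimal sketch of the buffer swap described above (illustrative
+ * only): after decompressing an entry, the two key buffers exchange
+ * roles instead of copying data, and likewise for the data buffers.
+ *
+ *	DBT *tmp;
+ *	tmp = cp->prevKey;
+ *	cp->prevKey = cp->currentKey;	(old current becomes previous)
+ *	cp->currentKey = tmp;		(next entry decompresses here)
+ */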
+
+ /*
+ * Btree:
+ * We set a flag in the cursor structure if the underlying object has
+ * been deleted. It's not strictly necessary, we could get the same
+ * information by looking at the page itself, but this method doesn't
+ * require us to retrieve the page on cursor delete.
+ *
+ * Recno:
+ * When renumbering recno databases during deletes, cursors referencing
+ * "deleted" records end up positioned between two records, and so must
+ * be specially adjusted on the next operation.
+ */
+#define C_DELETED 0x0001 /* Record was deleted. */
+ /*
+ * There are three tree types that require maintaining record numbers.
+ * Recno AM trees, Btree AM trees for which the DB_RECNUM flag was set,
+ * and Btree off-page duplicate trees.
+ */
+#define C_RECNUM 0x0002 /* Tree requires record counts. */
+ /*
+ * Recno trees have immutable record numbers by default, but optionally
+ * support mutable record numbers. Off-page duplicate Recno trees have
+ * mutable record numbers. All Btrees with record numbers (including
+ * off-page duplicate trees) are mutable by design, no flag is needed.
+ */
+#define C_RENUMBER 0x0004 /* Tree records are mutable. */
+ /*
+ * The current compressed key/data could be deleted, as well as the
+ * key/data that the underlying BTree cursor points to.
+ */
+#define C_COMPRESS_DELETED 0x0008 /* Compressed record was deleted. */
+ /*
+ * The current compressed chunk has been modified by another DBC. A
+ * compressed cursor will have to seek its position again if necessary
+ * when it is next accessed.
+ */
+#define C_COMPRESS_MODIFIED 0x0010 /* Compressed record was modified. */
+ u_int32_t flags;
+};
+
+/*
+ * Threshold value, as a function of bt_minkey, of the number of
+ * bytes a key/data pair can use before being placed on an overflow
+ * page. Assume every item requires the maximum alignment for
+ * padding, out of sheer paranoia.
+ */
+#define B_MINKEY_TO_OVFLSIZE(dbp, minkey, pgsize) \
+ ((u_int16_t)(((pgsize) - P_OVERHEAD(dbp)) / ((minkey) * P_INDX) -\
+ (BKEYDATA_PSIZE(0) + DB_ALIGN(1, sizeof(int32_t)))))
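+
+/*
+ * Worked example (approximate; the overhead constants are not spelled
+ * out here): with a 4096-byte page and bt_minkey == 2, minkey * P_INDX
+ * is 4 index entries per page, so each key or data item may occupy
+ * roughly (4096 - page overhead) / 4 less the per-item overhead, i.e.,
+ * a bit under 1KB, before it is pushed to an overflow page.
+ */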
+
+/*
+ * The maximum space that a single item can ever take up on one page.
+ * Used by __bam_split to determine whether a split is still necessary.
+ */
+#define B_MAX(a,b) (((a) > (b)) ? (a) : (b))
+#define B_MAXSIZEONPAGE(ovflsize) \
+ (B_MAX(BOVERFLOW_PSIZE, BKEYDATA_PSIZE(ovflsize)))
+
+/*
+ * BAM_GET_ROOT --
+ * This macro is used to isolate the fact that the root page of
+ * a subdatabase may move if DB->compact is called on it.
+ * The dbp->mpf->mfp->revision will be incremented every time
+ * a subdatabase root or meta page moves. If this is the case then
+ * we must call __db_reopen to read the master database to find it.
+ * We leave the loop only by breaking out, either because we do not
+ * have a subdb or because we are sure we have the right revision.
+ *
+ * It must be guaranteed that we cannot read an old root pgno and a
+ * current revision number. We note that the global revision number
+ * and DB handle information are only updated while holding the latches
+ * and locks of the master database pages.
+ * If another thread is synchronizing the DB handle with the master
+ * database it will exclusively latch both the old and new pages so we will
+ * synchronize on that.
+ */
+#define BAM_GET_ROOT(dbc, root_pgno, \
+ page, get_mode, lock_mode, lock, ret) do { \
+ BTREE *__t = (dbc)->dbp->bt_internal; \
+ BTREE_CURSOR *__cp = (BTREE_CURSOR *)(dbc)->internal; \
+ db_pgno_t __root; \
+ u_int32_t __rev = 0; \
+ if ((root_pgno) == PGNO_INVALID) { \
+ if (__cp->root == PGNO_INVALID) { \
+ __root = __t->bt_root; \
+ __rev = __t->revision; \
+ } else \
+ __root = root_pgno = __cp->root; \
+ } else \
+ __root = root_pgno; \
+ if (STD_LOCKING(dbc) && \
+ ((lock_mode) == DB_LOCK_WRITE || F_ISSET(dbc, DBC_DOWNREV) \
+ || dbc->dbtype == DB_RECNO || F_ISSET(__cp, C_RECNUM)) && \
+ (ret = \
+ __db_lget(dbc, 0, __root, lock_mode, 0, &(lock))) != 0) \
+ break; \
+ if ((ret = __memp_fget((dbc)->dbp->mpf, &__root, \
+ (dbc)->thread_info, dbc->txn, get_mode, &page)) == 0) { \
+ if (__root == root_pgno) \
+ break; \
+ if (F_ISSET(dbc, DBC_OPD) || \
+ !F_ISSET((dbc)->dbp, DB_AM_SUBDB) || \
+ (__t->bt_root == __root && \
+ (LEVEL(page) == LEAFLEVEL || TYPE(page) == \
+ (dbc->dbtype == DB_BTREE ? P_IBTREE : P_IRECNO)) &&\
+ __rev == (dbc)->dbp->mpf->mfp->revision)) { \
+ root_pgno = __root; \
+ break; \
+ } \
+ if ((ret = __memp_fput((dbc)->dbp->mpf, \
+ (dbc)->thread_info, page, (dbc)->priority)) != 0) \
+ break; \
+ } else if (ret != DB_PAGE_NOTFOUND) \
+ break; \
+ if ((ret = __LPUT(dbc, lock)) != 0) \
+ break; \
+ if ((ret = __db_reopen(dbc)) != 0) \
+ break; \
+} while (1)
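+
+/*
+ * Typical use (a sketch; the get_mode argument of 0 is illustrative):
+ * the macro is a statement that loops internally until the root page
+ * is pinned or an error is set, so the caller only tests ret after.
+ *
+ *	BAM_GET_ROOT(dbc, root_pgno, h, 0, DB_LOCK_READ, lock, ret);
+ *	if (ret != 0)
+ *		goto err;
+ */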
+
+/*
+ * Return the root of this tree. If this is an off-page duplicate tree
+ * then the root is in the cursor; otherwise we must look in the db handle.
+ */
+#define BAM_ROOT_PGNO(dbc) \
+ (((BTREE_CURSOR *)(dbc)->internal)->root == PGNO_INVALID ? \
+ ((BTREE*)(dbc)->dbp->bt_internal)->bt_root : \
+ ((BTREE_CURSOR *)(dbc)->internal)->root)
+
+/*
+ * The in-memory, per-tree btree/recno data structure.
+ */
+struct __btree { /* Btree access method. */
+ /*
+ * These fields may change if this is a subdatabase and
+ * it gets compacted.
+ */
+ db_pgno_t bt_meta; /* Database meta-data page. */
+ db_pgno_t bt_root; /* Database root page. */
+ u_int32_t revision; /* Revision of root/meta. */
+
+ u_int32_t bt_minkey; /* Minimum keys per page. */
+
+ /* Btree comparison function. */
+ int (*bt_compare) __P((DB *, const DBT *, const DBT *));
+ /* Btree prefix function. */
+ size_t (*bt_prefix) __P((DB *, const DBT *, const DBT *));
+ /* Btree compress function. */
+#ifdef HAVE_COMPRESSION
+ int (*bt_compress) __P((DB *, const DBT *, const DBT *, const DBT *,
+ const DBT *, DBT *));
+ /* Btree decompress function. */
+ int (*bt_decompress) __P((DB *, const DBT *, const DBT *, DBT *, DBT *,
+ DBT *));
+ /* dup_compare for compression */
+ int (*compress_dup_compare) __P((DB *, const DBT *, const DBT *));
+#endif
+
+ /* Recno access method. */
+ int re_pad; /* Fixed-length padding byte. */
+ int re_delim; /* Variable-length delimiting byte. */
+ u_int32_t re_len; /* Length for fixed-length records. */
+ char *re_source; /* Source file name. */
+
+ /*
+ * !!!
+ * The bt_lpgno field is NOT protected by any mutex, and for this
+ * reason must be advisory only, so, while it is read/written by
+ * multiple threads, DB is completely indifferent to the quality
+ * of its information.
+ */
+ db_pgno_t bt_lpgno; /* Last insert location. */
+ DB_LSN bt_llsn; /* Last insert LSN. */
+
+ /*
+ * !!!
+ * The re_modified field is NOT protected by any mutex, and for this
+ * reason cannot be anything more complicated than a zero/non-zero
+ * value. The actual writing of the backing source file cannot be
+ * threaded, so clearing the flag isn't a problem.
+ */
+ int re_modified; /* If the tree was modified. */
+
+ /*
+ * !!!
+ * These fields are ignored as far as multi-threading is concerned.
+ * There are no transaction semantics associated with backing files,
+ * nor is there any thread protection.
+ */
+ FILE *re_fp; /* Source file handle. */
+ int re_eof; /* Backing source file EOF reached. */
+ db_recno_t re_last; /* Last record number read. */
+
+};
+
+/*
+ * Modes for the __bam_curadj recovery records (btree_curadj).
+ * These appear in log records, so we wire the values and
+ * do not leave it up to the compiler.
+ */
+typedef enum {
+ DB_CA_DI = 1,
+ DB_CA_DUP = 2,
+ DB_CA_RSPLIT = 3,
+ DB_CA_SPLIT = 4
+} db_ca_mode;
+
+/*
+ * Flags for __bam_pinsert.
+ */
+#define BPI_SPACEONLY 0x01 /* Only check for space to update. */
+#define BPI_NORECNUM 0x02 /* Don't update the recnum on the left. */
+#define BPI_NOLOGGING 0x04 /* Don't log the update. */
+#define BPI_REPLACE 0x08 /* Replace the record. */
+
+#if defined(__cplusplus)
+}
+#endif
+
+#include "dbinc_auto/btree_auto.h"
+#include "dbinc_auto/btree_ext.h"
+#include "dbinc/db_am.h"
+#endif /* !_DB_BTREE_H_ */
diff --git a/src/dbinc/clock.h b/src/dbinc/clock.h
new file mode 100644
index 00000000..caeaee70
--- /dev/null
+++ b/src/dbinc/clock.h
@@ -0,0 +1,131 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+/*
+ * Copyright (c) 1982, 1986, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)time.h 8.5 (Berkeley) 5/4/95
+ * FreeBSD: src/sys/sys/time.h,v 1.65 2004/04/07 04:19:49 imp Exp
+ */
+
+#ifndef _DB_CLOCK_H_
+#define _DB_CLOCK_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * This declaration is POSIX-compatible. Because there are lots of different
+ * time.h include file patterns out there, it's easier to declare our own name
+ * in all cases than to try to discover whether a system has a struct timespec.
+ * For the same reason, and because we'd have to #include <sys/time.h> in db.h,
+ * we don't export any timespec structures in the DB API, even in places where
+ * it would make sense, like the replication statistics information.
+ */
+typedef struct {
+ time_t tv_sec; /* seconds */
+#ifdef HAVE_MIXED_SIZE_ADDRESSING
+ int32_t tv_nsec;
+#else
+ long tv_nsec; /* nanoseconds */
+#endif
+} db_timespec;
+
+/* Operations on timespecs */
+#undef timespecclear
+#define timespecclear(tvp) ((tvp)->tv_sec = (tvp)->tv_nsec = 0)
+#undef timespecisset
+#define timespecisset(tvp) ((tvp)->tv_sec || (tvp)->tv_nsec)
+#undef timespeccmp
+#define timespeccmp(tvp, uvp, cmp) \
+ (((tvp)->tv_sec == (uvp)->tv_sec) ? \
+ ((tvp)->tv_nsec cmp (uvp)->tv_nsec) : \
+ ((tvp)->tv_sec cmp (uvp)->tv_sec))
+#undef timespecadd
+/*
+ * Note that timespecadd must support adding a timespec to itself
+ * (i.e., doubling).
+ */
+#define timespecadd(vvp, uvp) \
+ do { \
+ (vvp)->tv_sec += (uvp)->tv_sec; \
+ (vvp)->tv_nsec += (uvp)->tv_nsec; \
+ if ((vvp)->tv_nsec >= 1000000000) { \
+ (vvp)->tv_sec++; \
+ (vvp)->tv_nsec -= 1000000000; \
+ } \
+ } while (0)
+#undef timespecsub
+#define timespecsub(vvp, uvp) \
+ do { \
+ (vvp)->tv_sec -= (uvp)->tv_sec; \
+ (vvp)->tv_nsec -= (uvp)->tv_nsec; \
+ if ((vvp)->tv_nsec < 0) { \
+ (vvp)->tv_sec--; \
+ (vvp)->tv_nsec += 1000000000; \
+ } \
+ } while (0)
+
+#undef timespecset
+#define timespecset(vvp, sec, nsec) \
+ do { \
+ (vvp)->tv_sec = (time_t)(sec); \
+ (vvp)->tv_nsec = (long)(nsec); \
+ } while (0)
+
+#define DB_TIMEOUT_TO_TIMESPEC(t, vvp) \
+ do { \
+ (vvp)->tv_sec = (time_t)((t) / 1000000); \
+ (vvp)->tv_nsec = (long)(((t) % 1000000) * 1000); \
+ } while (0)
+
+#define DB_TIMESPEC_TO_TIMEOUT(t, vvp, prec) \
+ do { \
+ t = (u_long)((vvp)->tv_sec * 1000000); \
+ t += (u_long)((vvp)->tv_nsec / 1000); \
+ /* Add in 1 usec for lost nsec precision if wanted. */ \
+ if (prec) \
+ t++; \
+ } while (0)
+
+#define TIMESPEC_ADD_DB_TIMEOUT(vvp, t) \
+ do { \
+ db_timespec __tmp; \
+ DB_TIMEOUT_TO_TIMESPEC(t, &__tmp); \
+ timespecadd((vvp), &__tmp); \
+ } while (0)
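+
+/*
+ * An illustrative use of the macros above (variable names are
+ * hypothetical).  DB_TIMEOUT_TO_TIMESPEC splits t = 2500000 usec into
+ * tv_sec = 2 and tv_nsec = 500000000:
+ *
+ *	db_timespec start, end, limit;
+ *	... fill in start and end from the platform clock ...
+ *	timespecsub(&end, &start);	(end now holds the elapsed time)
+ *	DB_TIMEOUT_TO_TIMESPEC(2500000, &limit);
+ *	if (timespeccmp(&end, &limit, >))
+ *		... the operation took longer than 2.5 seconds ...
+ */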
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_DB_CLOCK_H_ */
diff --git a/src/dbinc/crypto.h b/src/dbinc/crypto.h
new file mode 100644
index 00000000..ea7a9cf0
--- /dev/null
+++ b/src/dbinc/crypto.h
@@ -0,0 +1,93 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_CRYPTO_H_
+#define _DB_CRYPTO_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#ifdef HAVE_CRYPTO_IPP
+#include <ippcp.h>
+#endif
+
+/*
+ * !!!
+ * These are the internal representations of the algorithm flags.
+ * They are used in both the DB_CIPHER structure and the CIPHER
+ * structure so we can tell if users specified both passwd and alg
+ * correctly.
+ *
+ * CIPHER_ANY is used when an app joins an existing env but doesn't
+ * know the algorithm originally used. This is only valid in the
+ * DB_CIPHER structure until we open and can set the alg.
+ */
+/*
+ * We store the algorithm in an 8-bit field on the meta-page, so we
+ * use a numeric value, not bit fields: bits would allow only 8
+ * algorithms, while numeric values allow 256, which should be plenty.
+ * It is okay for the CIPHER_ANY flag to lie outside that range since
+ * it is never stored on disk.
+ */
+
+/*
+ * This structure is per-process, not in shared memory.
+ */
+struct __db_cipher {
+ u_int (*adj_size) __P((size_t));
+ int (*close) __P((ENV *, void *));
+ int (*decrypt) __P((ENV *, void *, void *, u_int8_t *, size_t));
+ int (*encrypt) __P((ENV *, void *, void *, u_int8_t *, size_t));
+ int (*init) __P((ENV *, DB_CIPHER *));
+
+ u_int8_t mac_key[DB_MAC_KEY]; /* MAC key. */
+ void *data; /* Algorithm-specific information */
+
+#define CIPHER_AES 1 /* AES algorithm */
+ u_int8_t alg; /* Algorithm used - See above */
+ u_int8_t spare[3]; /* Spares */
+
+#define CIPHER_ANY 0x00000001 /* Only for DB_CIPHER */
+ u_int32_t flags; /* Other flags */
+};
+
+#ifdef HAVE_CRYPTO
+
+#include "crypto/rijndael/rijndael-api-fst.h"
+
+/*
+ * Shared ciphering structure
+ * No mutex needed because all information is read-only after creation.
+ */
+typedef struct __cipher {
+ roff_t passwd; /* Offset to shared passwd */
+ size_t passwd_len; /* Length of passwd */
+ u_int32_t flags; /* Algorithm used - see above */
+} CIPHER;
+
+#define DB_AES_KEYLEN 128 /* AES key length */
+#define DB_AES_CHUNK 16 /* AES byte unit size */
+
+typedef struct __aes_cipher {
+#ifdef HAVE_CRYPTO_IPP
+ void *ipp_ctx; /* IPP key instance */
+#else
+ keyInstance decrypt_ki; /* Decryption key instance */
+ keyInstance encrypt_ki; /* Encryption key instance */
+#endif
+ u_int32_t flags; /* AES-specific flags */
+} AES_CIPHER;
+
+#include "dbinc_auto/crypto_ext.h"
+#endif /* HAVE_CRYPTO */
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_DB_CRYPTO_H_ */
diff --git a/src/dbinc/cxx_int.h b/src/dbinc/cxx_int.h
new file mode 100644
index 00000000..5492ead7
--- /dev/null
+++ b/src/dbinc/cxx_int.h
@@ -0,0 +1,77 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_CXX_INT_H_
+#define _DB_CXX_INT_H_
+
+// private data structures known to the implementation only
+
+//
+// Using FooImp classes will allow the implementation to change in the
+// future without any modification to user code or even to header files
+// that the user includes. FooImp * is just like void * except that it
+// provides a little extra protection, since you cannot randomly assign
+// any old pointer to a FooImp* as you can with void *. Currently, a
+// pointer to such an opaque class is always just a pointer to the
+// appropriate underlying implementation struct. These are converted
+// back and forth using the various overloaded wrap()/unwrap() methods.
+// This is essentially a use of the "Bridge" Design Pattern.
+//
+// WRAPPED_CLASS implements the appropriate wrap() and unwrap() methods
+// for a wrapper class that has an underlying pointer representation.
+//
+#define WRAPPED_CLASS(_WRAPPER_CLASS, _IMP_CLASS, _WRAPPED_TYPE) \
+ class _IMP_CLASS {}; \
+ \
+ inline _WRAPPED_TYPE *unwrap(_WRAPPER_CLASS *val) \
+ { \
+ if (!val) return (0); \
+ return (val->get_##_WRAPPED_TYPE()); \
+ } \
+ \
+ inline const _WRAPPED_TYPE *unwrapConst(const _WRAPPER_CLASS *val) \
+ { \
+ if (!val) return (0); \
+ return (val->get_const_##_WRAPPED_TYPE()); \
+ }
+
+WRAPPED_CLASS(Db, DbImp, DB)
+WRAPPED_CLASS(DbChannel, DbChannelImp, DB_CHANNEL)
+WRAPPED_CLASS(DbEnv, DbEnvImp, DB_ENV)
+WRAPPED_CLASS(DbMpoolFile, DbMpoolFileImp, DB_MPOOLFILE)
+WRAPPED_CLASS(DbSequence, DbSequenceImp, DB_SEQUENCE)
+WRAPPED_CLASS(DbSite, DbSiteImp, DB_SITE)
+WRAPPED_CLASS(DbTxn, DbTxnImp, DB_TXN)
+
+// A tristate integer value used by the DB_ERROR macro below.
+// We chose not to make this an enumerated type so it can
+// be kept private, even though methods that return the
+// tristate int can be declared in db_cxx.h.
+//
+#define ON_ERROR_THROW 1
+#define ON_ERROR_RETURN 0
+#define ON_ERROR_UNKNOWN (-1)
+
+// Macros that handle detected errors, in case we want to
+// change the default behavior. The 'policy' is one of
+// the tristate values given above. If UNKNOWN is specified,
+// the behavior is taken from the last initialized DbEnv.
+//
+#define DB_ERROR(dbenv, caller, ecode, policy) \
+ DbEnv::runtime_error(dbenv, caller, ecode, policy)
+
+#define DB_ERROR_DBT(dbenv, caller, dbt, policy) \
+ DbEnv::runtime_error_dbt(dbenv, caller, dbt, policy)
+
+#define DB_OVERFLOWED_DBT(dbt) \
+ (F_ISSET(dbt, DB_DBT_USERMEM) && dbt->size > dbt->ulen)
+
+/* values for Db::flags_ */
+#define DB_CXX_PRIVATE_ENV 0x00000001
+
+#endif /* !_DB_CXX_INT_H_ */
diff --git a/src/dbinc/db.in b/src/dbinc/db.in
new file mode 100644
index 00000000..a948910e
--- /dev/null
+++ b/src/dbinc/db.in
@@ -0,0 +1,2810 @@
+/*
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ *
+ * db.h include file layout:
+ * General.
+ * Database Environment.
+ * Locking subsystem.
+ * Logging subsystem.
+ * Shared buffer cache (mpool) subsystem.
+ * Transaction subsystem.
+ * Access methods.
+ * Access method cursors.
+ * Dbm/Ndbm, Hsearch historic interfaces.
+ */
+
+#ifndef _DB_H_
+#define _DB_H_
+
+#ifndef __NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+@inttypes_h_decl@
+@stdint_h_decl@
+@stddef_h_decl@
+#include <stdio.h>
+@unistd_h_decl@
+@thread_h_decl@
+#endif
+
+@platform_header@
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+@DB_CONST@
+@DB_PROTO1@
+@DB_PROTO2@
+
+/*
+ * Berkeley DB version information.
+ */
+#define DB_VERSION_FAMILY @DB_VERSION_FAMILY@
+#define DB_VERSION_RELEASE @DB_VERSION_RELEASE@
+#define DB_VERSION_MAJOR @DB_VERSION_MAJOR@
+#define DB_VERSION_MINOR @DB_VERSION_MINOR@
+#define DB_VERSION_PATCH @DB_VERSION_PATCH@
+#define DB_VERSION_STRING @DB_VERSION_STRING@
+#define DB_VERSION_FULL_STRING @DB_VERSION_FULL_STRING@
+
+/*
+ * !!!
+ * Berkeley DB uses specifically sized types. If they're not provided by
+ * the system, typedef them here.
+ *
+ * We protect them against multiple inclusion using __BIT_TYPES_DEFINED__,
+ * as do BIND and Kerberos, since we don't know for sure what #include
+ * files the user is using.
+ *
+ * !!!
+ * We also provide the standard u_int, u_long etc., if they're not provided
+ * by the system.
+ */
+#ifndef __BIT_TYPES_DEFINED__
+#define __BIT_TYPES_DEFINED__
+@u_int8_decl@
+@int16_decl@
+@u_int16_decl@
+@int32_decl@
+@u_int32_decl@
+@int64_decl@
+@u_int64_decl@
+#endif
+
+@u_char_decl@
+@u_int_decl@
+@u_long_decl@
+@u_short_decl@
+
+/*
+ * Missing ANSI types.
+ *
+ * uintmax_t --
+ * Largest unsigned type, used to align structures in memory. We don't store
+ * floating point types in structures, so integral types should be sufficient
+ * (and we don't have to worry about systems that store floats in other than
+ * power-of-2 numbers of bytes). Additionally this fixes compilers that rewrite
+ * structure assignments and ANSI C memcpy calls to be in-line instructions
+ * that happen to require alignment.
+ *
+ * uintptr_t --
+ * Unsigned type that's the same size as a pointer. There are places where
+ * DB modifies pointers by discarding the bottom bits to guarantee alignment.
+ * We can't use uintmax_t; it may be larger than the pointer, and compilers
+ * get upset about that. So far we haven't run on any machine where there's
+ * no unsigned type the same size as a pointer -- here's hoping.
+ */
+@uintmax_t_decl@
+@uintptr_t_decl@
+
+@FILE_t_decl@
+@off_t_decl@
+@pid_t_decl@
+@size_t_decl@
+#ifdef HAVE_MIXED_SIZE_ADDRESSING
+typedef u_int32_t db_size_t;
+#else
+typedef size_t db_size_t;
+#endif
+@ssize_t_decl@
+#ifdef HAVE_MIXED_SIZE_ADDRESSING
+typedef int32_t db_ssize_t;
+#else
+typedef ssize_t db_ssize_t;
+#endif
+@time_t_decl@
+
+/*
+ * Sequences are only available on machines with 64-bit integral types.
+ */
+@db_seq_decl@
+
+/* Thread and process identification. */
+@db_threadid_t_decl@
+
+/* Basic types that are exported or quasi-exported. */
+typedef u_int32_t db_pgno_t; /* Page number type. */
+typedef u_int16_t db_indx_t; /* Page offset type. */
+#define DB_MAX_PAGES 0xffffffff /* >= # of pages in a file */
+
+typedef u_int32_t db_recno_t; /* Record number type. */
+#define DB_MAX_RECORDS 0xffffffff /* >= # of records in a tree */
+
+typedef u_int32_t db_timeout_t; /* Type of a timeout. */
+
+/*
+ * Region offsets are the difference between a pointer in a region and the
+ * region's base address. With private environments, both addresses are the
+ * result of calling malloc, and we can't assume anything about what malloc
+ * will return, so region offsets have to be able to hold differences between
+ * arbitrary pointers.
+ */
+typedef db_size_t roff_t;
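+
+/*
+ * Illustrative conversions between a region offset and a pointer,
+ * given the region's base address (the names here are hypothetical):
+ *
+ *	ptr = (void *)((u_int8_t *)base + off);
+ *	off = (roff_t)((u_int8_t *)ptr - (u_int8_t *)base);
+ */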
+
+/*
+ * Forward structure declarations, so we can declare pointers and
+ * applications can get type checking.
+ */
+struct __channel; typedef struct __channel CHANNEL;
+struct __db; typedef struct __db DB;
+struct __db_bt_stat; typedef struct __db_bt_stat DB_BTREE_STAT;
+struct __db_channel; typedef struct __db_channel DB_CHANNEL;
+struct __db_cipher; typedef struct __db_cipher DB_CIPHER;
+struct __db_compact; typedef struct __db_compact DB_COMPACT;
+struct __db_dbt; typedef struct __db_dbt DBT;
+struct __db_distab; typedef struct __db_distab DB_DISTAB;
+struct __db_env; typedef struct __db_env DB_ENV;
+struct __db_h_stat; typedef struct __db_h_stat DB_HASH_STAT;
+struct __db_heap_rid; typedef struct __db_heap_rid DB_HEAP_RID;
+struct __db_heap_stat; typedef struct __db_heap_stat DB_HEAP_STAT;
+struct __db_ilock; typedef struct __db_ilock DB_LOCK_ILOCK;
+struct __db_lock_hstat; typedef struct __db_lock_hstat DB_LOCK_HSTAT;
+struct __db_lock_pstat; typedef struct __db_lock_pstat DB_LOCK_PSTAT;
+struct __db_lock_stat; typedef struct __db_lock_stat DB_LOCK_STAT;
+struct __db_lock_u; typedef struct __db_lock_u DB_LOCK;
+struct __db_locker; typedef struct __db_locker DB_LOCKER;
+struct __db_lockreq; typedef struct __db_lockreq DB_LOCKREQ;
+struct __db_locktab; typedef struct __db_locktab DB_LOCKTAB;
+struct __db_log; typedef struct __db_log DB_LOG;
+struct __db_log_cursor; typedef struct __db_log_cursor DB_LOGC;
+struct __db_log_stat; typedef struct __db_log_stat DB_LOG_STAT;
+struct __db_lsn; typedef struct __db_lsn DB_LSN;
+struct __db_mpool; typedef struct __db_mpool DB_MPOOL;
+struct __db_mpool_fstat;typedef struct __db_mpool_fstat DB_MPOOL_FSTAT;
+struct __db_mpool_stat; typedef struct __db_mpool_stat DB_MPOOL_STAT;
+struct __db_mpoolfile; typedef struct __db_mpoolfile DB_MPOOLFILE;
+struct __db_mutex_stat; typedef struct __db_mutex_stat DB_MUTEX_STAT;
+struct __db_mutex_t; typedef struct __db_mutex_t DB_MUTEX;
+struct __db_mutexmgr; typedef struct __db_mutexmgr DB_MUTEXMGR;
+struct __db_preplist; typedef struct __db_preplist DB_PREPLIST;
+struct __db_qam_stat; typedef struct __db_qam_stat DB_QUEUE_STAT;
+struct __db_rep; typedef struct __db_rep DB_REP;
+struct __db_rep_stat; typedef struct __db_rep_stat DB_REP_STAT;
+struct __db_repmgr_conn_err;
+ typedef struct __db_repmgr_conn_err DB_REPMGR_CONN_ERR;
+struct __db_repmgr_site;typedef struct __db_repmgr_site DB_REPMGR_SITE;
+struct __db_repmgr_stat;typedef struct __db_repmgr_stat DB_REPMGR_STAT;
+struct __db_seq_record; typedef struct __db_seq_record DB_SEQ_RECORD;
+struct __db_seq_stat; typedef struct __db_seq_stat DB_SEQUENCE_STAT;
+struct __db_site; typedef struct __db_site DB_SITE;
+struct __db_sequence; typedef struct __db_sequence DB_SEQUENCE;
+struct __db_thread_info;typedef struct __db_thread_info DB_THREAD_INFO;
+struct __db_txn; typedef struct __db_txn DB_TXN;
+struct __db_txn_active; typedef struct __db_txn_active DB_TXN_ACTIVE;
+struct __db_txn_stat; typedef struct __db_txn_stat DB_TXN_STAT;
+struct __db_txn_token; typedef struct __db_txn_token DB_TXN_TOKEN;
+struct __db_txnmgr; typedef struct __db_txnmgr DB_TXNMGR;
+struct __dbc; typedef struct __dbc DBC;
+struct __dbc_internal; typedef struct __dbc_internal DBC_INTERNAL;
+struct __env; typedef struct __env ENV;
+struct __fh_t; typedef struct __fh_t DB_FH;
+struct __fname; typedef struct __fname FNAME;
+struct __key_range; typedef struct __key_range DB_KEY_RANGE;
+struct __mpoolfile; typedef struct __mpoolfile MPOOLFILE;
+struct __db_logvrfy_config;
+typedef struct __db_logvrfy_config DB_LOG_VERIFY_CONFIG;
+
+/*
+ * The Berkeley DB API flags are automatically generated -- the following flag
+ * names are no longer used, but remain for compatibility reasons.
+ */
+#define DB_DEGREE_2 DB_READ_COMMITTED
+#define DB_DIRTY_READ DB_READ_UNCOMMITTED
+#define DB_JOINENV 0x0
+
+/* Key/data structure -- a Data-Base Thang. */
+struct __db_dbt {
+ void *data; /* Key/data */
+ u_int32_t size; /* key/data length */
+
+ u_int32_t ulen; /* RO: length of user buffer. */
+ u_int32_t dlen; /* RO: get/put record length. */
+ u_int32_t doff; /* RO: get/put record offset. */
+
+ void *app_data;
+
+#define DB_DBT_APPMALLOC 0x001 /* Callback allocated memory. */
+#define DB_DBT_BULK 0x002 /* Internal: Bulk operation. */
+#define DB_DBT_DUPOK 0x004 /* Internal: Insert if duplicate. */
+#define DB_DBT_ISSET 0x008 /* Lower level calls set value. */
+#define DB_DBT_MALLOC 0x010 /* Return in malloc'd memory. */
+#define DB_DBT_MULTIPLE 0x020 /* References multiple records. */
+#define DB_DBT_PARTIAL 0x040 /* Partial put/get. */
+#define DB_DBT_REALLOC 0x080 /* Return in realloc'd memory. */
+#define DB_DBT_READONLY 0x100 /* Readonly, don't update. */
+#define DB_DBT_STREAMING 0x200 /* Internal: DBT is being streamed. */
+#define DB_DBT_USERCOPY 0x400 /* Use the user-supplied callback. */
+#define DB_DBT_USERMEM 0x800 /* Return in user's memory. */
+ u_int32_t flags;
+};
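+
+/*
+ * Canonical initialization of a DBT before use (the key value is
+ * illustrative).  Unused fields must be zeroed, which is why DBTs are
+ * conventionally cleared with memset before being filled in:
+ *
+ *	DBT key, data;
+ *	memset(&key, 0, sizeof(key));
+ *	memset(&data, 0, sizeof(data));
+ *	key.data = "fruit";
+ *	key.size = (u_int32_t)strlen("fruit") + 1;
+ *	data.flags = DB_DBT_MALLOC;	(DB allocates the return buffer)
+ */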
+
+/*******************************************************
+ * Mutexes.
+ *******************************************************/
+/*
+ * When mixed-size addressing is supported, mutexes need to be the same
+ * size regardless of the process's address size.
+ */
+#ifdef HAVE_MIXED_SIZE_ADDRESSING
+typedef db_size_t db_mutex_t;
+#else
+typedef uintptr_t db_mutex_t;
+#endif
+
+struct __db_mutex_stat { /* SHARED */
+ /* The following fields are maintained in the region's copy. */
+ u_int32_t st_mutex_align; /* Mutex alignment */
+ u_int32_t st_mutex_tas_spins; /* Mutex test-and-set spins */
+ u_int32_t st_mutex_init; /* Initial mutex count */
+ u_int32_t st_mutex_cnt; /* Mutex count */
+ u_int32_t st_mutex_max; /* Mutex max */
+ u_int32_t st_mutex_free; /* Available mutexes */
+ u_int32_t st_mutex_inuse; /* Mutexes in use */
+ u_int32_t st_mutex_inuse_max; /* Maximum mutexes ever in use */
+
+ /* The following fields are filled-in from other places. */
+#ifndef __TEST_DB_NO_STATISTICS
+ uintmax_t st_region_wait; /* Region lock granted after wait. */
+ uintmax_t st_region_nowait; /* Region lock granted without wait. */
+ roff_t st_regsize; /* Region size. */
+ roff_t st_regmax; /* Region max. */
+#endif
+};
+
+/* This is the length of the buffer passed to DB_ENV->thread_id_string() */
+#define DB_THREADID_STRLEN 128
+
+/*******************************************************
+ * Locking.
+ *******************************************************/
+#define DB_LOCKVERSION 1
+
+#define DB_FILE_ID_LEN 20 /* Unique file ID length. */
+
+/*
+ * Deadlock detector modes; used in the DB_ENV structure to configure the
+ * locking subsystem.
+ */
+#define DB_LOCK_NORUN 0
+#define DB_LOCK_DEFAULT 1 /* Default policy. */
+#define DB_LOCK_EXPIRE 2 /* Only expire locks, no detection. */
+#define DB_LOCK_MAXLOCKS 3 /* Select locker with max locks. */
+#define DB_LOCK_MAXWRITE 4 /* Select locker with max writelocks. */
+#define DB_LOCK_MINLOCKS 5 /* Select locker with min locks. */
+#define DB_LOCK_MINWRITE 6 /* Select locker with min writelocks. */
+#define DB_LOCK_OLDEST 7 /* Select oldest locker. */
+#define DB_LOCK_RANDOM 8 /* Select random locker. */
+#define DB_LOCK_YOUNGEST 9 /* Select youngest locker. */
+
+/*
+ * Simple R/W lock modes, plus modes for multi-granularity intention locking.
+ *
+ * !!!
+ * These values are NOT random, as they are used as an index into the lock
+ * conflicts arrays, i.e., DB_LOCK_IWRITE must be == 4, and DB_LOCK_IREAD
+ * must be == 5, matching the enum below.
+ */
+typedef enum {
+ DB_LOCK_NG=0, /* Not granted. */
+ DB_LOCK_READ=1, /* Shared/read. */
+ DB_LOCK_WRITE=2, /* Exclusive/write. */
+ DB_LOCK_WAIT=3, /* Wait for event */
+ DB_LOCK_IWRITE=4, /* Intent exclusive/write. */
+ DB_LOCK_IREAD=5, /* Intent to share/read. */
+ DB_LOCK_IWR=6, /* Intent to read and write. */
+ DB_LOCK_READ_UNCOMMITTED=7, /* Degree 1 isolation. */
+ DB_LOCK_WWRITE=8 /* Was Written. */
+} db_lockmode_t;
+
+/*
+ * Request types.
+ */
+typedef enum {
+ DB_LOCK_DUMP=0, /* Display held locks. */
+ DB_LOCK_GET=1, /* Get the lock. */
+ DB_LOCK_GET_TIMEOUT=2, /* Get lock with a timeout. */
+ DB_LOCK_INHERIT=3, /* Pass locks to parent. */
+ DB_LOCK_PUT=4, /* Release the lock. */
+ DB_LOCK_PUT_ALL=5, /* Release locker's locks. */
+ DB_LOCK_PUT_OBJ=6, /* Release locker's locks on obj. */
+ DB_LOCK_PUT_READ=7, /* Release locker's read locks. */
+ DB_LOCK_TIMEOUT=8, /* Force a txn to timeout. */
+ DB_LOCK_TRADE=9, /* Trade locker ids on a lock. */
+ DB_LOCK_UPGRADE_WRITE=10 /* Upgrade writes for dirty reads. */
+} db_lockop_t;
+
+/*
+ * Status of a lock.
+ */
+typedef enum {
+ DB_LSTAT_ABORTED=1, /* Lock belongs to an aborted txn. */
+ DB_LSTAT_EXPIRED=2, /* Lock has expired. */
+ DB_LSTAT_FREE=3, /* Lock is unallocated. */
+ DB_LSTAT_HELD=4, /* Lock is currently held. */
+ DB_LSTAT_PENDING=5, /* Lock was waiting and has been
+ * promoted; waiting for the owner
+ * to run and upgrade it to held. */
+ DB_LSTAT_WAITING=6 /* Lock is on the wait queue. */
+}db_status_t;
+
+/* Lock statistics structure. */
+struct __db_lock_stat { /* SHARED */
+ u_int32_t st_id; /* Last allocated locker ID. */
+ u_int32_t st_cur_maxid; /* Current maximum unused ID. */
+ u_int32_t st_initlocks; /* Initial number of locks in table. */
+ u_int32_t st_initlockers; /* Initial num of lockers in table. */
+ u_int32_t st_initobjects; /* Initial num of objects in table. */
+ u_int32_t st_locks; /* Current number of locks in table. */
+ u_int32_t st_lockers; /* Current num of lockers in table. */
+ u_int32_t st_objects; /* Current num of objects in table. */
+ u_int32_t st_maxlocks; /* Maximum number of locks in table. */
+ u_int32_t st_maxlockers; /* Maximum num of lockers in table. */
+ u_int32_t st_maxobjects; /* Maximum num of objects in table. */
+ u_int32_t st_partitions; /* number of partitions. */
+ u_int32_t st_tablesize; /* Size of object hash table. */
+ int32_t st_nmodes; /* Number of lock modes. */
+ u_int32_t st_nlockers; /* Current number of lockers. */
+#ifndef __TEST_DB_NO_STATISTICS
+ u_int32_t st_nlocks; /* Current number of locks. */
+ u_int32_t st_maxnlocks; /* Maximum number of locks so far. */
+ u_int32_t st_maxhlocks; /* Maximum number of locks in any bucket. */
+ uintmax_t st_locksteals; /* Number of lock steals so far. */
+	uintmax_t st_maxlsteals;	/* Maximum number of steals in any partition. */
+ u_int32_t st_maxnlockers; /* Maximum number of lockers so far. */
+ u_int32_t st_nobjects; /* Current number of objects. */
+ u_int32_t st_maxnobjects; /* Maximum number of objects so far. */
+	u_int32_t st_maxhobjects;	/* Maximum number of objects in any bucket. */
+ uintmax_t st_objectsteals; /* Number of objects steals so far. */
+ uintmax_t st_maxosteals; /* Maximum number of steals in any partition. */
+ uintmax_t st_nrequests; /* Number of lock gets. */
+ uintmax_t st_nreleases; /* Number of lock puts. */
+ uintmax_t st_nupgrade; /* Number of lock upgrades. */
+ uintmax_t st_ndowngrade; /* Number of lock downgrades. */
+ uintmax_t st_lock_wait; /* Lock conflicts w/ subsequent wait */
+ uintmax_t st_lock_nowait; /* Lock conflicts w/o subsequent wait */
+ uintmax_t st_ndeadlocks; /* Number of lock deadlocks. */
+ db_timeout_t st_locktimeout; /* Lock timeout. */
+ uintmax_t st_nlocktimeouts; /* Number of lock timeouts. */
+ db_timeout_t st_txntimeout; /* Transaction timeout. */
+ uintmax_t st_ntxntimeouts; /* Number of transaction timeouts. */
+ uintmax_t st_part_wait; /* Partition lock granted after wait. */
+ uintmax_t st_part_nowait; /* Partition lock granted without wait. */
+ uintmax_t st_part_max_wait; /* Max partition lock granted after wait. */
+ uintmax_t st_part_max_nowait; /* Max partition lock granted without wait. */
+ uintmax_t st_objs_wait; /* Object lock granted after wait. */
+ uintmax_t st_objs_nowait; /* Object lock granted without wait. */
+ uintmax_t st_lockers_wait; /* Locker lock granted after wait. */
+ uintmax_t st_lockers_nowait; /* Locker lock granted without wait. */
+ uintmax_t st_region_wait; /* Region lock granted after wait. */
+ uintmax_t st_region_nowait; /* Region lock granted without wait. */
+ u_int32_t st_hash_len; /* Max length of bucket. */
+ roff_t st_regsize; /* Region size. */
+#endif
+};
+
+struct __db_lock_hstat { /* SHARED */
+ uintmax_t st_nrequests; /* Number of lock gets. */
+ uintmax_t st_nreleases; /* Number of lock puts. */
+ uintmax_t st_nupgrade; /* Number of lock upgrades. */
+ uintmax_t st_ndowngrade; /* Number of lock downgrades. */
+ u_int32_t st_nlocks; /* Current number of locks. */
+ u_int32_t st_maxnlocks; /* Maximum number of locks so far. */
+ u_int32_t st_nobjects; /* Current number of objects. */
+ u_int32_t st_maxnobjects; /* Maximum number of objects so far. */
+ uintmax_t st_lock_wait; /* Lock conflicts w/ subsequent wait */
+ uintmax_t st_lock_nowait; /* Lock conflicts w/o subsequent wait */
+ uintmax_t st_nlocktimeouts; /* Number of lock timeouts. */
+ uintmax_t st_ntxntimeouts; /* Number of transaction timeouts. */
+ u_int32_t st_hash_len; /* Max length of bucket. */
+};
+
+struct __db_lock_pstat { /* SHARED */
+ u_int32_t st_nlocks; /* Current number of locks. */
+ u_int32_t st_maxnlocks; /* Maximum number of locks so far. */
+ u_int32_t st_nobjects; /* Current number of objects. */
+ u_int32_t st_maxnobjects; /* Maximum number of objects so far. */
+ uintmax_t st_locksteals; /* Number of lock steals so far. */
+ uintmax_t st_objectsteals; /* Number of objects steals so far. */
+};
+
+/*
+ * DB_LOCK_ILOCK --
+ * Internal DB access method lock.
+ */
+struct __db_ilock { /* SHARED */
+ db_pgno_t pgno; /* Page being locked. */
+ u_int8_t fileid[DB_FILE_ID_LEN];/* File id. */
+#define DB_HANDLE_LOCK 1
+#define DB_RECORD_LOCK 2
+#define DB_PAGE_LOCK 3
+#define DB_DATABASE_LOCK 4
+ u_int32_t type; /* Type of lock. */
+};
+
+/*
+ * DB_LOCK --
+ * The structure is allocated by the caller and filled in during a
+ * lock_get request (or a lock_vec/DB_LOCK_GET).
+ */
+struct __db_lock_u { /* SHARED */
+ roff_t off; /* Offset of the lock in the region */
+ u_int32_t ndx; /* Index of the object referenced by
+ * this lock; used for locking. */
+ u_int32_t gen; /* Generation number of this lock. */
+ db_lockmode_t mode; /* mode of this lock. */
+};
+
+/* Lock request structure. */
+struct __db_lockreq {
+ db_lockop_t op; /* Operation. */
+ db_lockmode_t mode; /* Requested mode. */
+ db_timeout_t timeout; /* Time to expire lock. */
+ DBT *obj; /* Object being locked. */
+ DB_LOCK lock; /* Lock returned. */
+};
+
+/*******************************************************
+ * Logging.
+ *******************************************************/
+#define DB_LOGVERSION 19 /* Current log version. */
+#define DB_LOGVERSION_LATCHING 15 /* Log version using latching: db-4.8 */
+#define DB_LOGCHKSUM 12 /* Check sum headers: db-4.5 */
+#define DB_LOGOLDVER 8 /* Oldest version supported: db-4.2 */
+#define DB_LOGMAGIC 0x040988
+
+/*
+ * A DB_LSN has two parts, a fileid which identifies a specific file, and an
+ * offset within that file. The fileid is an unsigned 4-byte quantity that
+ * uniquely identifies a file within the log directory -- currently a simple
+ * counter inside the log. The offset is also an unsigned 4-byte value. The
+ * log manager guarantees the offset never overflows 4 bytes by switching
+ * to a new log file before the maximum length imposed by an unsigned 4-byte
+ * offset is reached.
+ */
+struct __db_lsn { /* SHARED */
+ u_int32_t file; /* File ID. */
+ u_int32_t offset; /* File offset. */
+};
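+
+/*
+ * LSNs order first by file, then by offset, so comparison is two
+ * integer compares; a sketch equivalent to the exported log_compare
+ * function:
+ *
+ *	if (lsn0->file != lsn1->file)
+ *		return (lsn0->file < lsn1->file ? -1 : 1);
+ *	if (lsn0->offset != lsn1->offset)
+ *		return (lsn0->offset < lsn1->offset ? -1 : 1);
+ *	return (0);
+ */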
+
+/*
+ * Application-specified log record types start at DB_user_BEGIN, and must not
+ * equal or exceed DB_debug_FLAG.
+ *
+ * DB_debug_FLAG is the high-bit of the u_int32_t that specifies a log record
+ * type. If the flag is set, it's a log record that was logged for debugging
+ * purposes only, even if it reflects a database change -- the change was part
+ * of a non-durable transaction.
+ */
+#define DB_user_BEGIN 10000
+#define DB_debug_FLAG 0x80000000
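+
+/*
+ * A sketch of how a log dispatcher treats debug-only records (the
+ * variable name is illustrative): the flag is tested and masked off
+ * before the record type is interpreted.
+ *
+ *	if (rectype & DB_debug_FLAG) {
+ *		rectype &= ~DB_debug_FLAG;
+ *		... process only when printing/debugging the log ...
+ *	}
+ */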
+
+/*
+ * DB_LOGC --
+ * Log cursor.
+ */
+struct __db_log_cursor {
+ ENV *env; /* Environment */
+
+ DB_FH *fhp; /* File handle. */
+ DB_LSN lsn; /* Cursor: LSN */
+ u_int32_t len; /* Cursor: record length */
+ u_int32_t prev; /* Cursor: previous record's offset */
+
+ DBT dbt; /* Return DBT. */
+ DB_LSN p_lsn; /* Persist LSN. */
+ u_int32_t p_version; /* Persist version. */
+
+ u_int8_t *bp; /* Allocated read buffer. */
+ u_int32_t bp_size; /* Read buffer length in bytes. */
+ u_int32_t bp_rlen; /* Read buffer valid data length. */
+ DB_LSN bp_lsn; /* Read buffer first byte LSN. */
+
+ u_int32_t bp_maxrec; /* Max record length in the log file. */
+
+ /* DB_LOGC PUBLIC HANDLE LIST BEGIN */
+ int (*close) __P((DB_LOGC *, u_int32_t));
+ int (*get) __P((DB_LOGC *, DB_LSN *, DBT *, u_int32_t));
+ int (*version) __P((DB_LOGC *, u_int32_t *, u_int32_t));
+ /* DB_LOGC PUBLIC HANDLE LIST END */
+
+#define DB_LOG_DISK 0x01 /* Log record came from disk. */
+#define DB_LOG_LOCKED 0x02 /* Log region already locked */
+#define DB_LOG_SILENT_ERR 0x04 /* Turn off error messages. */
+ u_int32_t flags;
+};
+
+/* Log statistics structure. */
+struct __db_log_stat { /* SHARED */
+ u_int32_t st_magic; /* Log file magic number. */
+ u_int32_t st_version; /* Log file version number. */
+ int32_t st_mode; /* Log file permissions mode. */
+ u_int32_t st_lg_bsize; /* Log buffer size. */
+ u_int32_t st_lg_size; /* Log file size. */
+ u_int32_t st_wc_bytes; /* Bytes to log since checkpoint. */
+ u_int32_t st_wc_mbytes; /* Megabytes to log since checkpoint. */
+ u_int32_t st_fileid_init; /* Initial allocation for fileids. */
+#ifndef __TEST_DB_NO_STATISTICS
+ u_int32_t st_nfileid; /* Current number of fileids. */
+ u_int32_t st_maxnfileid; /* Maximum number of fileids used. */
+ uintmax_t st_record; /* Records entered into the log. */
+ u_int32_t st_w_bytes; /* Bytes to log. */
+ u_int32_t st_w_mbytes; /* Megabytes to log. */
+ uintmax_t st_wcount; /* Total I/O writes to the log. */
+ uintmax_t st_wcount_fill; /* Overflow writes to the log. */
+ uintmax_t st_rcount; /* Total I/O reads from the log. */
+ uintmax_t st_scount; /* Total syncs to the log. */
+ uintmax_t st_region_wait; /* Region lock granted after wait. */
+ uintmax_t st_region_nowait; /* Region lock granted without wait. */
+ u_int32_t st_cur_file; /* Current log file number. */
+ u_int32_t st_cur_offset; /* Current log file offset. */
+ u_int32_t st_disk_file; /* Known on disk log file number. */
+ u_int32_t st_disk_offset; /* Known on disk log file offset. */
+ u_int32_t st_maxcommitperflush; /* Max number of commits in a flush. */
+ u_int32_t st_mincommitperflush; /* Min number of commits in a flush. */
+ roff_t st_regsize; /* Region size. */
+#endif
+};
+
+/*
+ * We need to record the first log record of a transaction.  For
+ * user-defined logging this macro returns, via blsnp, the place to
+ * put that information if it is needed, and otherwise leaves it
+ * unchanged.  We also need to track the last record of the
+ * transaction; llsnp returns the place to put that info.
+ */
+#define DB_SET_TXN_LSNP(txn, blsnp, llsnp) \
+ ((txn)->set_txn_lsnp(txn, blsnp, llsnp))
+
+/*
+ * Definition of the structure which specifies marshalling of log records.
+ */
+typedef enum {
+ LOGREC_Done,
+ LOGREC_ARG,
+ LOGREC_HDR,
+ LOGREC_DATA,
+ LOGREC_DB,
+ LOGREC_DBOP,
+ LOGREC_DBT,
+ LOGREC_LOCKS,
+ LOGREC_OP,
+ LOGREC_PGDBT,
+ LOGREC_PGDDBT,
+ LOGREC_PGLIST,
+ LOGREC_POINTER,
+ LOGREC_TIME
+} log_rec_type_t;
+
+typedef const struct __log_rec_spec {
+ log_rec_type_t type;
+ u_int32_t offset;
+ const char *name;
+ const char fmt[4];
+} DB_LOG_RECSPEC;
+
+/*
+ * Size of a DBT in a log record.
+ */
+#define LOG_DBT_SIZE(dbt) \
+ (sizeof(u_int32_t) + ((dbt) == NULL ? 0 : (dbt)->size))
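+
+/*
+ * E.g., a 10-byte DBT marshals into 14 bytes here: a 4-byte length
+ * word followed by the data; a NULL DBT costs the 4-byte length alone.
+ */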
+
+/*******************************************************
+ * Shared buffer cache (mpool).
+ *******************************************************/
+/* Priority values for DB_MPOOLFILE->{put,set_priority}. */
+typedef enum {
+ DB_PRIORITY_UNCHANGED=0,
+ DB_PRIORITY_VERY_LOW=1,
+ DB_PRIORITY_LOW=2,
+ DB_PRIORITY_DEFAULT=3,
+ DB_PRIORITY_HIGH=4,
+ DB_PRIORITY_VERY_HIGH=5
+} DB_CACHE_PRIORITY;
+
+/* Per-process DB_MPOOLFILE information. */
+struct __db_mpoolfile {
+ DB_FH *fhp; /* Underlying file handle. */
+
+ /*
+ * !!!
+ * The ref, pinref and q fields are protected by the region lock.
+ */
+ u_int32_t ref; /* Reference count. */
+
+ u_int32_t pinref; /* Pinned block reference count. */
+
+ /*
+ * !!!
+ * Explicit representations of structures from queue.h.
+ * TAILQ_ENTRY(__db_mpoolfile) q;
+ */
+ struct {
+ struct __db_mpoolfile *tqe_next;
+ struct __db_mpoolfile **tqe_prev;
+ } q; /* Linked list of DB_MPOOLFILE's. */
+
+ /*
+ * !!!
+ * The rest of the fields (with the exception of the MP_FLUSH flag)
+ * are not thread-protected, even when they may be modified at any
+ * time by the application. The reason is the DB_MPOOLFILE handle
+ * is single-threaded from the viewpoint of the application, and so
+ * the only fields needing to be thread-protected are those accessed
+ * by checkpoint or sync threads when using DB_MPOOLFILE structures
+ * to flush buffers from the cache.
+ */
+ ENV *env; /* Environment */
+ MPOOLFILE *mfp; /* Underlying MPOOLFILE. */
+
+ u_int32_t clear_len; /* Cleared length on created pages. */
+ u_int8_t /* Unique file ID. */
+ fileid[DB_FILE_ID_LEN];
+ int ftype; /* File type. */
+ int32_t lsn_offset; /* LSN offset in page. */
+ u_int32_t gbytes, bytes; /* Maximum file size. */
+ DBT *pgcookie; /* Byte-string passed to pgin/pgout. */
+ int32_t priority; /* Cache priority. */
+
+ void *addr; /* Address of mmap'd region. */
+ size_t len; /* Length of mmap'd region. */
+
+ u_int32_t config_flags; /* Flags to DB_MPOOLFILE->set_flags. */
+
+ /* DB_MPOOLFILE PUBLIC HANDLE LIST BEGIN */
+ int (*close) __P((DB_MPOOLFILE *, u_int32_t));
+ int (*get)
+ __P((DB_MPOOLFILE *, db_pgno_t *, DB_TXN *, u_int32_t, void *));
+ int (*get_clear_len) __P((DB_MPOOLFILE *, u_int32_t *));
+ int (*get_fileid) __P((DB_MPOOLFILE *, u_int8_t *));
+ int (*get_flags) __P((DB_MPOOLFILE *, u_int32_t *));
+ int (*get_ftype) __P((DB_MPOOLFILE *, int *));
+ int (*get_last_pgno) __P((DB_MPOOLFILE *, db_pgno_t *));
+ int (*get_lsn_offset) __P((DB_MPOOLFILE *, int32_t *));
+ int (*get_maxsize) __P((DB_MPOOLFILE *, u_int32_t *, u_int32_t *));
+ int (*get_pgcookie) __P((DB_MPOOLFILE *, DBT *));
+ int (*get_priority) __P((DB_MPOOLFILE *, DB_CACHE_PRIORITY *));
+ int (*open) __P((DB_MPOOLFILE *, const char *, u_int32_t, int, size_t));
+ int (*put) __P((DB_MPOOLFILE *, void *, DB_CACHE_PRIORITY, u_int32_t));
+ int (*set_clear_len) __P((DB_MPOOLFILE *, u_int32_t));
+ int (*set_fileid) __P((DB_MPOOLFILE *, u_int8_t *));
+ int (*set_flags) __P((DB_MPOOLFILE *, u_int32_t, int));
+ int (*set_ftype) __P((DB_MPOOLFILE *, int));
+ int (*set_lsn_offset) __P((DB_MPOOLFILE *, int32_t));
+ int (*set_maxsize) __P((DB_MPOOLFILE *, u_int32_t, u_int32_t));
+ int (*set_pgcookie) __P((DB_MPOOLFILE *, DBT *));
+ int (*set_priority) __P((DB_MPOOLFILE *, DB_CACHE_PRIORITY));
+ int (*sync) __P((DB_MPOOLFILE *));
+ /* DB_MPOOLFILE PUBLIC HANDLE LIST END */
+
+ /*
+ * MP_FILEID_SET, MP_OPEN_CALLED and MP_READONLY do not need to be
+ * thread protected because they are initialized before the file is
+ * linked onto the per-process lists, and never modified.
+ *
+ * MP_FLUSH is thread protected because it is potentially read/set by
+ * multiple threads of control.
+ */
+#define MP_FILEID_SET 0x001 /* Application supplied a file ID. */
+#define MP_FLUSH 0x002 /* Was used to flush a buffer. */
+#define MP_FOR_FLUSH 0x004 /* Was opened to flush a buffer. */
+#define MP_MULTIVERSION 0x008 /* Opened for multiversion access. */
+#define MP_OPEN_CALLED 0x010 /* File opened. */
+#define MP_READONLY 0x020 /* File is readonly. */
+#define MP_DUMMY 0x040 /* File is dummy for __memp_fput. */
+ u_int32_t flags;
+};
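+
+/*
+ * A sketch of the page get/put cycle through the method table above
+ * (error handling elided; the variable names are illustrative):
+ *
+ *	db_pgno_t pgno = 1;
+ *	void *addr;
+ *	(void)mpf->get(mpf, &pgno, NULL, 0, &addr);
+ *	... read or modify the page through addr ...
+ *	(void)mpf->put(mpf, addr, DB_PRIORITY_UNCHANGED, 0);
+ */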
+
+/* Mpool statistics structure. */
+struct __db_mpool_stat { /* SHARED */
+ u_int32_t st_gbytes; /* Total cache size: GB. */
+ u_int32_t st_bytes; /* Total cache size: B. */
+ u_int32_t st_ncache; /* Number of cache regions. */
+ u_int32_t st_max_ncache; /* Maximum number of regions. */
+ db_size_t st_mmapsize; /* Maximum file size for mmap. */
+ int32_t st_maxopenfd; /* Maximum number of open fd's. */
+ int32_t st_maxwrite; /* Maximum buffers to write. */
+ db_timeout_t st_maxwrite_sleep; /* Sleep after writing max buffers. */
+ u_int32_t st_pages; /* Total number of pages. */
+#ifndef __TEST_DB_NO_STATISTICS
+ u_int32_t st_map; /* Pages from mapped files. */
+ uintmax_t st_cache_hit; /* Pages found in the cache. */
+ uintmax_t st_cache_miss; /* Pages not found in the cache. */
+ uintmax_t st_page_create; /* Pages created in the cache. */
+ uintmax_t st_page_in; /* Pages read in. */
+ uintmax_t st_page_out; /* Pages written out. */
+ uintmax_t st_ro_evict; /* Clean pages forced from the cache. */
+ uintmax_t st_rw_evict; /* Dirty pages forced from the cache. */
+ uintmax_t st_page_trickle; /* Pages written by memp_trickle. */
+ u_int32_t st_page_clean; /* Clean pages. */
+ u_int32_t st_page_dirty; /* Dirty pages. */
+ u_int32_t st_hash_buckets; /* Number of hash buckets. */
+ u_int32_t st_hash_mutexes; /* Number of hash bucket mutexes. */
+ u_int32_t st_pagesize; /* Assumed page size. */
+ u_int32_t st_hash_searches; /* Total hash chain searches. */
+ u_int32_t st_hash_longest; /* Longest hash chain searched. */
+ uintmax_t st_hash_examined; /* Total hash entries searched. */
+ uintmax_t st_hash_nowait; /* Hash lock granted with nowait. */
+ uintmax_t st_hash_wait; /* Hash lock granted after wait. */
+ uintmax_t st_hash_max_nowait; /* Max hash lock granted with nowait. */
+ uintmax_t st_hash_max_wait; /* Max hash lock granted after wait. */
+ uintmax_t st_region_nowait; /* Region lock granted with nowait. */
+ uintmax_t st_region_wait; /* Region lock granted after wait. */
+ uintmax_t st_mvcc_frozen; /* Buffers frozen. */
+ uintmax_t st_mvcc_thawed; /* Buffers thawed. */
+ uintmax_t st_mvcc_freed; /* Frozen buffers freed. */
+ uintmax_t st_alloc; /* Number of page allocations. */
+ uintmax_t st_alloc_buckets; /* Buckets checked during allocation. */
+ uintmax_t st_alloc_max_buckets;/* Max checked during allocation. */
+ uintmax_t st_alloc_pages; /* Pages checked during allocation. */
+ uintmax_t st_alloc_max_pages; /* Max checked during allocation. */
+ uintmax_t st_io_wait; /* Thread waited on buffer I/O. */
+ uintmax_t st_sync_interrupted; /* Number of times sync interrupted. */
+ roff_t st_regsize; /* Region size. */
+ roff_t st_regmax; /* Region max. */
+#endif
+};
+
+/*
+ * Mpool file statistics structure.
+ * The first fields in this structure must mirror the __db_mpool_fstat_int
+ * structure, since content is copied between the two with memcpy.
+ */
+struct __db_mpool_fstat {
+ u_int32_t st_pagesize; /* Page size. */
+#ifndef __TEST_DB_NO_STATISTICS
+ u_int32_t st_map; /* Pages from mapped files. */
+ uintmax_t st_cache_hit; /* Pages found in the cache. */
+ uintmax_t st_cache_miss; /* Pages not found in the cache. */
+ uintmax_t st_page_create; /* Pages created in the cache. */
+ uintmax_t st_page_in; /* Pages read in. */
+ uintmax_t st_page_out; /* Pages written out. */
+ uintmax_t st_backup_spins; /* Number of spins during a copy. */
+#endif
+ char *file_name; /* File name. */
+};
+
+/*******************************************************
+ * Transactions and recovery.
+ *******************************************************/
+#define DB_TXNVERSION 1
+
+typedef enum {
+ DB_TXN_ABORT=0, /* Public. */
+ DB_TXN_APPLY=1, /* Public. */
+ DB_TXN_BACKWARD_ROLL=3, /* Public. */
+ DB_TXN_FORWARD_ROLL=4, /* Public. */
+ DB_TXN_OPENFILES=5, /* Internal. */
+ DB_TXN_POPENFILES=6, /* Internal. */
+ DB_TXN_PRINT=7, /* Public. */
+ DB_TXN_LOG_VERIFY=8 /* Internal. */
+} db_recops;
+
+/*
+ * BACKWARD_ALLOC is used during the forward pass to pick up any aborted
+ * allocations for files that were created during the forward pass.
+ * The main difference between _ALLOC and _ROLL is that the entry for
+ * the file may not exist during the rollforward pass.
+ */
+#define DB_UNDO(op) ((op) == DB_TXN_ABORT || (op) == DB_TXN_BACKWARD_ROLL)
+#define DB_REDO(op) ((op) == DB_TXN_FORWARD_ROLL || (op) == DB_TXN_APPLY)
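+
+/*
+ * A recovery-function sketch using these predicates (the LSN tests
+ * are schematic, not lifted from any particular *_rec.c file):
+ *
+ *	if (DB_REDO(op) && the page LSN predates this record)
+ *		... reapply the change and advance the page LSN ...
+ *	else if (DB_UNDO(op) && the page LSN shows the change applied)
+ *		... roll the change back and restore the prior LSN ...
+ */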
+
+struct __db_txn {
+ DB_TXNMGR *mgrp; /* Pointer to transaction manager. */
+ DB_TXN *parent; /* Pointer to transaction's parent. */
+ DB_THREAD_INFO *thread_info; /* Pointer to thread information. */
+
+ u_int32_t txnid; /* Unique transaction id. */
+ char *name; /* Transaction name. */
+ DB_LOCKER *locker; /* Locker for this txn. */
+
+ void *td; /* Detail structure within region. */
+ db_timeout_t lock_timeout; /* Timeout for locks for this txn. */
+ void *txn_list; /* Undo information for parent. */
+
+ /*
+ * !!!
+ * Explicit representations of structures from queue.h.
+ * TAILQ_ENTRY(__db_txn) links;
+ */
+ struct {
+ struct __db_txn *tqe_next;
+ struct __db_txn **tqe_prev;
+ } links; /* Links transactions off manager. */
+
+ /*
+ * !!!
+ * Explicit representations of structures from shqueue.h.
+ * SH_TAILQ_ENTRY xa_links;
+ * These links link together transactions that are active in
+ * the same thread of control.
+ */
+ struct {
+ db_ssize_t stqe_next;
+ db_ssize_t stqe_prev;
+ } xa_links; /* Links XA transactions. */
+
+ /*
+ * !!!
+ * Explicit representations of structures from queue.h.
+ * TAILQ_HEAD(__kids, __db_txn) kids;
+ */
+ struct __kids {
+ struct __db_txn *tqh_first;
+ struct __db_txn **tqh_last;
+ } kids;
+
+ /*
+ * !!!
+ * Explicit representations of structures from queue.h.
+ * TAILQ_HEAD(__events, __txn_event) events;
+ */
+ struct {
+ struct __txn_event *tqh_first;
+ struct __txn_event **tqh_last;
+ } events; /* Links deferred events. */
+
+ /*
+ * !!!
+ * Explicit representations of structures from queue.h.
+ * STAILQ_HEAD(__logrec, __txn_logrec) logs;
+ */
+ struct {
+ struct __txn_logrec *stqh_first;
+ struct __txn_logrec **stqh_last;
+ } logs; /* Links in memory log records. */
+
+ /*
+ * !!!
+ * Explicit representations of structures from queue.h.
+ * TAILQ_ENTRY(__db_txn) klinks;
+ */
+ struct {
+ struct __db_txn *tqe_next;
+ struct __db_txn **tqe_prev;
+ } klinks; /* Links of children in parent. */
+
+ /*
+ * !!!
+ * Explicit representations of structures from queue.h.
+ * TAILQ_HEAD(__my_cursors, __dbc) my_cursors;
+ */
+ struct __my_cursors {
+ struct __dbc *tqh_first;
+ struct __dbc **tqh_last;
+ } my_cursors;
+
+ /*
+ * !!!
+ * Explicit representations of structures from queue.h.
+ * TAILQ_HEAD(__femfs, MPOOLFILE) femfs;
+ *
+ * These are DBs involved in file extension in this transaction.
+ */
+ struct __femfs {
+ DB *tqh_first;
+ DB **tqh_last;
+ } femfs;
+
+ DB_TXN_TOKEN *token_buffer; /* User's commit token buffer. */
+ void *api_internal; /* C++ API private. */
+ void *xml_internal; /* XML API private. */
+
+ u_int32_t cursors; /* Number of cursors open for txn */
+
+ /* DB_TXN PUBLIC HANDLE LIST BEGIN */
+ int (*abort) __P((DB_TXN *));
+ int (*commit) __P((DB_TXN *, u_int32_t));
+ int (*discard) __P((DB_TXN *, u_int32_t));
+ int (*get_name) __P((DB_TXN *, const char **));
+ int (*get_priority) __P((DB_TXN *, u_int32_t *));
+ u_int32_t (*id) __P((DB_TXN *));
+ int (*prepare) __P((DB_TXN *, u_int8_t *));
+ int (*set_commit_token) __P((DB_TXN *, DB_TXN_TOKEN *));
+ int (*set_name) __P((DB_TXN *, const char *));
+ int (*set_priority) __P((DB_TXN *, u_int32_t));
+ int (*set_timeout) __P((DB_TXN *, db_timeout_t, u_int32_t));
+ /* DB_TXN PUBLIC HANDLE LIST END */
+
+ /* DB_TXN PRIVATE HANDLE LIST BEGIN */
+ void (*set_txn_lsnp) __P((DB_TXN *txn, DB_LSN **, DB_LSN **));
+ /* DB_TXN PRIVATE HANDLE LIST END */
+
+#define TXN_XA_THREAD_NOTA 0
+#define TXN_XA_THREAD_ASSOCIATED 1
+#define TXN_XA_THREAD_SUSPENDED 2
+#define TXN_XA_THREAD_UNASSOCIATED 3
+ u_int32_t xa_thr_status;
+
+#define TXN_CHILDCOMMIT 0x00001 /* Txn has committed. */
+#define TXN_COMPENSATE 0x00002 /* Compensating transaction. */
+#define TXN_DEADLOCK 0x00004 /* Txn has deadlocked. */
+#define TXN_FAMILY 0x00008 /* Cursors/children are independent. */
+#define TXN_IGNORE_LEASE 0x00010 /* Skip lease check at commit time. */
+#define TXN_INFAMILY 0x00020 /* Part of a transaction family. */
+#define TXN_LOCKTIMEOUT 0x00040 /* Txn has a lock timeout. */
+#define TXN_MALLOC 0x00080 /* Structure allocated by TXN system. */
+#define TXN_NOSYNC 0x00100 /* Do not sync on prepare and commit. */
+#define TXN_NOWAIT 0x00200 /* Do not wait on locks. */
+#define TXN_PRIVATE 0x00400 /* Txn owned by cursor. */
+#define TXN_READONLY 0x00800 /* CDS group handle. */
+#define TXN_READ_COMMITTED 0x01000 /* Txn has degree 2 isolation. */
+#define TXN_READ_UNCOMMITTED 0x02000 /* Txn has degree 1 isolation. */
+#define TXN_RESTORED 0x04000 /* Txn has been restored. */
+#define TXN_SNAPSHOT 0x08000 /* Snapshot Isolation. */
+#define TXN_SYNC 0x10000 /* Write and sync on prepare/commit. */
+#define TXN_WRITE_NOSYNC 0x20000 /* Write only on prepare/commit. */
+#define TXN_BULK 0x40000 /* Enable bulk loading optimization. */
+ u_int32_t flags;
+};
+
+#define TXN_SYNC_FLAGS (TXN_SYNC | TXN_NOSYNC | TXN_WRITE_NOSYNC)
+
+/*
+ * Structure used for two phase commit interface.
+ * We set the size of our global transaction id (gid) to be 128 in order
+ * to match that defined by the XA X/Open standard.
+ */
+#define DB_GID_SIZE 128
+struct __db_preplist {
+ DB_TXN *txn;
+ u_int8_t gid[DB_GID_SIZE];
+};
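+
+/*
+ * After recovery, prepared-but-unresolved transactions are retrieved
+ * with DB_ENV->txn_recover and resolved one by one.  A sketch (the
+ * array size is illustrative):
+ *
+ *	DB_PREPLIST prep[32];
+ *	long i, nprep;
+ *	(void)dbenv->txn_recover(dbenv, prep, 32, &nprep, DB_FIRST);
+ *	for (i = 0; i < nprep; i++)
+ *		(void)prep[i].txn->abort(prep[i].txn);
+ */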
+
+/* Transaction statistics structure. */
+struct __db_txn_active {
+ u_int32_t txnid; /* Transaction ID */
+ u_int32_t parentid; /* Transaction ID of parent */
+ pid_t pid; /* Process owning txn ID */
+ db_threadid_t tid; /* Thread owning txn ID */
+
+ DB_LSN lsn; /* LSN when transaction began */
+
+ DB_LSN read_lsn; /* Read LSN for MVCC */
+ u_int32_t mvcc_ref; /* MVCC reference count */
+
+ u_int32_t priority; /* Deadlock resolution priority */
+
+#define TXN_ABORTED 1
+#define TXN_COMMITTED 2
+#define TXN_NEED_ABORT 3
+#define TXN_PREPARED 4
+#define TXN_RUNNING 5
+ u_int32_t status; /* Status of the transaction */
+
+#define TXN_XA_ACTIVE 1
+#define TXN_XA_DEADLOCKED 2
+#define TXN_XA_IDLE 3
+#define TXN_XA_PREPARED 4
+#define TXN_XA_ROLLEDBACK 5
+ u_int32_t xa_status; /* XA status */
+
+ u_int8_t gid[DB_GID_SIZE]; /* Global transaction ID */
+ char name[51]; /* 50 bytes of name, nul termination */
+};
+
+struct __db_txn_stat {
+ u_int32_t st_nrestores; /* number of restored transactions
+ after recovery. */
+#ifndef __TEST_DB_NO_STATISTICS
+ DB_LSN st_last_ckp; /* lsn of the last checkpoint */
+ time_t st_time_ckp; /* time of last checkpoint */
+ u_int32_t st_last_txnid; /* last transaction id given out */
+ u_int32_t st_inittxns; /* initial txns allocated */
+ u_int32_t st_maxtxns; /* maximum txns possible */
+ uintmax_t st_naborts; /* number of aborted transactions */
+ uintmax_t st_nbegins; /* number of begun transactions */
+ uintmax_t st_ncommits; /* number of committed transactions */
+ u_int32_t st_nactive; /* number of active transactions */
+ u_int32_t st_nsnapshot; /* number of snapshot transactions */
+ u_int32_t st_maxnactive; /* maximum active transactions */
+ u_int32_t st_maxnsnapshot; /* maximum snapshot transactions */
+ uintmax_t st_region_wait; /* Region lock granted after wait. */
+ uintmax_t st_region_nowait; /* Region lock granted without wait. */
+ roff_t st_regsize; /* Region size. */
+ DB_TXN_ACTIVE *st_txnarray; /* array of active transactions */
+#endif
+};
+
+#define DB_TXN_TOKEN_SIZE 20
+struct __db_txn_token {
+ u_int8_t buf[DB_TXN_TOKEN_SIZE];
+};
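+
+/*
+ * Editorial sketch (assumed usage, not in the original header): a token is
+ * filled in at commit time when registered with DB_TXN->set_commit_token(),
+ * and can later be handed to DB_ENV->txn_applied() on a replica to ask
+ * whether that transaction has been applied there:
+ *
+ *	DB_TXN_TOKEN token;
+ *
+ *	txn->set_commit_token(txn, &token);
+ *	txn->commit(txn, 0);
+ *	...ship the token to the replica...
+ *	ret = dbenv->txn_applied(dbenv, &token, 0, 0);
+ */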
+
+/*******************************************************
+ * Replication.
+ *******************************************************/
+/* Special, out-of-band environment IDs. */
+#define DB_EID_BROADCAST -1
+#define DB_EID_INVALID -2
+#define DB_EID_MASTER -3
+
+#define DB_REP_DEFAULT_PRIORITY 100
+
+/* Acknowledgement policies; 0 reserved as OOB. */
+#define DB_REPMGR_ACKS_ALL 1
+#define DB_REPMGR_ACKS_ALL_AVAILABLE 2
+#define DB_REPMGR_ACKS_ALL_PEERS 3
+#define DB_REPMGR_ACKS_NONE 4
+#define DB_REPMGR_ACKS_ONE 5
+#define DB_REPMGR_ACKS_ONE_PEER 6
+#define DB_REPMGR_ACKS_QUORUM 7
+
+/* Replication timeout configuration values. */
+#define DB_REP_ACK_TIMEOUT 1 /* RepMgr acknowledgements. */
+#define DB_REP_CHECKPOINT_DELAY 2 /* Master checkpoint delay. */
+#define DB_REP_CONNECTION_RETRY 3 /* RepMgr connections. */
+#define DB_REP_ELECTION_RETRY 4 /* RepMgr elect retries. */
+#define DB_REP_ELECTION_TIMEOUT 5 /* Rep normal elections. */
+#define DB_REP_FULL_ELECTION_TIMEOUT 6 /* Rep full elections. */
+#define DB_REP_HEARTBEAT_MONITOR 7 /* RepMgr client HB monitor. */
+#define DB_REP_HEARTBEAT_SEND 8 /* RepMgr master send freq. */
+#define DB_REP_LEASE_TIMEOUT 9 /* Master leases. */
+
+/*
+ * Event notification types. (Tcl testing interface currently assumes there are
+ * no more than 32 of these.)
+ */
+#define DB_EVENT_PANIC 0
+#define DB_EVENT_REG_ALIVE 1
+#define DB_EVENT_REG_PANIC 2
+#define DB_EVENT_REP_CLIENT 3
+#define DB_EVENT_REP_CONNECT_BROKEN 4
+#define DB_EVENT_REP_CONNECT_ESTD 5
+#define DB_EVENT_REP_CONNECT_TRY_FAILED 6
+#define DB_EVENT_REP_DUPMASTER 7
+#define DB_EVENT_REP_ELECTED 8
+#define DB_EVENT_REP_ELECTION_FAILED 9
+#define DB_EVENT_REP_INIT_DONE 10
+#define DB_EVENT_REP_JOIN_FAILURE 11
+#define DB_EVENT_REP_LOCAL_SITE_REMOVED 12
+#define DB_EVENT_REP_MASTER 13
+#define DB_EVENT_REP_MASTER_FAILURE 14
+#define DB_EVENT_REP_NEWMASTER 15
+#define DB_EVENT_REP_PERM_FAILED 16
+#define DB_EVENT_REP_SITE_ADDED 17
+#define DB_EVENT_REP_SITE_REMOVED 18
+#define DB_EVENT_REP_STARTUPDONE 19
+#define DB_EVENT_REP_WOULD_ROLLBACK 20 /* Undocumented; C API only. */
+#define DB_EVENT_WRITE_FAILED 21
+#define DB_EVENT_NO_SUCH_EVENT 0xffffffff /* OOB sentinel value */
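+
+/*
+ * Editorial sketch (not part of the original header): events are delivered
+ * asynchronously to the callback registered with DB_ENV->set_event_notify():
+ *
+ *	void event_cb(DB_ENV *dbenv, u_int32_t event, void *info)
+ *	{
+ *		switch (event) {
+ *		case DB_EVENT_REP_MASTER:
+ *			...this site is now master...
+ *			break;
+ *		case DB_EVENT_PANIC:
+ *			...run recovery...
+ *			break;
+ *		}
+ *	}
+ *
+ *	(void)dbenv->set_event_notify(dbenv, event_cb);
+ */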
+
+/* Replication Manager site status. */
+struct __db_repmgr_site {
+ int eid;
+ char *host;
+ u_int port;
+
+#define DB_REPMGR_CONNECTED 1
+#define DB_REPMGR_DISCONNECTED 2
+ u_int32_t status;
+
+#define DB_REPMGR_ISPEER 0x01
+ u_int32_t flags;
+};
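+
+/*
+ * Editorial sketch (assumed usage, not in the original header): a snapshot
+ * of the group's sites is returned as an allocated array of these
+ * structures by DB_ENV->repmgr_site_list():
+ *
+ *	DB_REPMGR_SITE *list;
+ *	u_int count, i;
+ *
+ *	if (dbenv->repmgr_site_list(dbenv, &count, &list) == 0) {
+ *		for (i = 0; i < count; i++)
+ *			...list[i].host, list[i].port, list[i].status...
+ *		free(list);
+ *	}
+ */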
+
+/* Replication statistics. */
+struct __db_rep_stat { /* SHARED */
+ /* !!!
+ * Many replication statistics fields cannot be protected by a mutex
+ * without an unacceptable performance penalty, since most message
+ * processing is done without the need to hold a region-wide lock.
+ * Fields whose comments end with a '+' may be updated without holding
+ * the replication or log mutexes (as appropriate), and thus may be
+ * off somewhat (or, on unreasonable architectures under unlucky
+ * circumstances, garbaged).
+ */
+ u_int32_t st_startup_complete; /* Site completed client sync-up. */
+#ifndef __TEST_DB_NO_STATISTICS
+ uintmax_t st_log_queued; /* Log records currently queued.+ */
+ u_int32_t st_status; /* Current replication status. */
+ DB_LSN st_next_lsn; /* Next LSN to use or expect. */
+ DB_LSN st_waiting_lsn; /* LSN we're awaiting, if any. */
+ DB_LSN st_max_perm_lsn; /* Maximum permanent LSN. */
+ db_pgno_t st_next_pg; /* Next pg we expect. */
+ db_pgno_t st_waiting_pg; /* pg we're awaiting, if any. */
+
+ u_int32_t st_dupmasters; /* # of times a duplicate master
+ condition was detected.+ */
+ db_ssize_t st_env_id; /* Current environment ID. */
+ u_int32_t st_env_priority; /* Current environment priority. */
+ uintmax_t st_bulk_fills; /* Bulk buffer fills. */
+ uintmax_t st_bulk_overflows; /* Bulk buffer overflows. */
+ uintmax_t st_bulk_records; /* Bulk records stored. */
+ uintmax_t st_bulk_transfers; /* Transfers of bulk buffers. */
+ uintmax_t st_client_rerequests;/* Number of forced rerequests. */
+ uintmax_t st_client_svc_req; /* Number of client service requests
+ received by this client. */
+ uintmax_t st_client_svc_miss; /* Number of client service requests
+ missing on this client. */
+ u_int32_t st_gen; /* Current generation number. */
+ u_int32_t st_egen; /* Current election gen number. */
+ uintmax_t st_lease_chk; /* Lease validity checks. */
+ uintmax_t st_lease_chk_misses; /* Lease checks invalid. */
+ uintmax_t st_lease_chk_refresh; /* Lease refresh attempts. */
+ uintmax_t st_lease_sends; /* Lease messages sent live. */
+
+ uintmax_t st_log_duplicated; /* Log records received multiply.+ */
+ uintmax_t st_log_queued_max; /* Max. log records queued at once.+ */
+ uintmax_t st_log_queued_total; /* Total # of log recs. ever queued.+ */
+ uintmax_t st_log_records; /* Log records received and put.+ */
+ uintmax_t st_log_requested; /* Log recs. missed and requested.+ */
+ db_ssize_t st_master; /* Env. ID of the current master. */
+ uintmax_t st_master_changes; /* # of times we've switched masters. */
+ uintmax_t st_msgs_badgen; /* Messages with a bad generation #.+ */
+ uintmax_t st_msgs_processed; /* Messages received and processed.+ */
+ uintmax_t st_msgs_recover; /* Messages ignored because this site
+ was a client in recovery.+ */
+ uintmax_t st_msgs_send_failures;/* # of failed message sends.+ */
+ uintmax_t st_msgs_sent; /* # of successful message sends.+ */
+ uintmax_t st_newsites; /* # of NEWSITE msgs. received.+ */
+ u_int32_t st_nsites; /* Current number of sites we will
+ assume during elections. */
+ uintmax_t st_nthrottles; /* # of times we were throttled. */
+ uintmax_t st_outdated; /* # of times we detected and returned
+ an OUTDATED condition.+ */
+ uintmax_t st_pg_duplicated; /* Pages received multiply.+ */
+ uintmax_t st_pg_records; /* Pages received and stored.+ */
+ uintmax_t st_pg_requested; /* Pages missed and requested.+ */
+ uintmax_t st_txns_applied; /* # of transactions applied.+ */
+ uintmax_t st_startsync_delayed;/* # of STARTSYNC msgs delayed.+ */
+
+ /* Elections generally. */
+ uintmax_t st_elections; /* # of elections held.+ */
+ uintmax_t st_elections_won; /* # of elections won by this site.+ */
+
+ /* Statistics about an in-progress election. */
+ db_ssize_t st_election_cur_winner; /* Current front-runner. */
+ u_int32_t st_election_gen; /* Election generation number. */
+ u_int32_t st_election_datagen; /* Election data generation number. */
+ DB_LSN st_election_lsn; /* Max. LSN of current winner. */
+ u_int32_t st_election_nsites; /* # of "registered voters". */
+ u_int32_t st_election_nvotes; /* # of "registered voters" needed. */
+ u_int32_t st_election_priority; /* Current election priority. */
+ int32_t st_election_status; /* Current election status. */
+ u_int32_t st_election_tiebreaker;/* Election tiebreaker value. */
+ u_int32_t st_election_votes; /* Votes received in this round. */
+ u_int32_t st_election_sec; /* Last election time seconds. */
+ u_int32_t st_election_usec; /* Last election time useconds. */
+ u_int32_t st_max_lease_sec; /* Maximum lease timestamp seconds. */
+ u_int32_t st_max_lease_usec; /* Maximum lease timestamp useconds. */
+
+ /* Undocumented statistics only used by the test system. */
+#ifdef CONFIG_TEST
+ u_int32_t st_filefail_cleanups; /* # of FILE_FAIL cleanups done. */
+#endif
+#endif
+};
+
+/* Replication Manager statistics. */
+struct __db_repmgr_stat { /* SHARED */
+ uintmax_t st_perm_failed; /* # of insufficiently ack'ed msgs. */
+ uintmax_t st_msgs_queued; /* # msgs queued for network delay. */
+ uintmax_t st_msgs_dropped; /* # msgs discarded due to excessive
+ queue length. */
+ uintmax_t st_connection_drop; /* Existing connections dropped. */
+ uintmax_t st_connect_fail; /* Failed new connection attempts. */
+ uintmax_t st_elect_threads; /* # of active election threads. */
+ uintmax_t st_max_elect_threads; /* Max concurrent e-threads ever. */
+};
+
+/* Replication Manager connection error. */
+struct __db_repmgr_conn_err {
+ int eid; /* Replication Environment ID. */
+ int error; /* System networking error code. */
+};
+
+/*******************************************************
+ * Sequences.
+ *******************************************************/
+/*
+ * The storage record for a sequence.
+ */
+struct __db_seq_record {
+ u_int32_t seq_version; /* Version size/number. */
+ u_int32_t flags; /* DB_SEQ_XXX Flags. */
+ db_seq_t seq_value; /* Current value. */
+ db_seq_t seq_max; /* Max permitted. */
+ db_seq_t seq_min; /* Min permitted. */
+};
+
+/*
+ * Handle for a sequence object.
+ */
+struct __db_sequence {
+ DB *seq_dbp; /* DB handle for this sequence. */
+ db_mutex_t mtx_seq; /* Mutex if sequence is threaded. */
+ DB_SEQ_RECORD *seq_rp; /* Pointer to current data. */
+ DB_SEQ_RECORD seq_record; /* Data from DB_SEQUENCE. */
+ int32_t seq_cache_size; /* Number of values cached. */
+ db_seq_t seq_last_value; /* Last value cached. */
+ db_seq_t seq_prev_value; /* Last value returned. */
+ DBT seq_key; /* DBT pointing to sequence key. */
+ DBT seq_data; /* DBT pointing to seq_record. */
+
+ /* API-private structure: used by C++ and Java. */
+ void *api_internal;
+
+ /* DB_SEQUENCE PUBLIC HANDLE LIST BEGIN */
+ int (*close) __P((DB_SEQUENCE *, u_int32_t));
+ int (*get) __P((DB_SEQUENCE *,
+ DB_TXN *, int32_t, db_seq_t *, u_int32_t));
+ int (*get_cachesize) __P((DB_SEQUENCE *, int32_t *));
+ int (*get_db) __P((DB_SEQUENCE *, DB **));
+ int (*get_flags) __P((DB_SEQUENCE *, u_int32_t *));
+ int (*get_key) __P((DB_SEQUENCE *, DBT *));
+ int (*get_range) __P((DB_SEQUENCE *,
+ db_seq_t *, db_seq_t *));
+ int (*initial_value) __P((DB_SEQUENCE *, db_seq_t));
+ int (*open) __P((DB_SEQUENCE *,
+ DB_TXN *, DBT *, u_int32_t));
+ int (*remove) __P((DB_SEQUENCE *, DB_TXN *, u_int32_t));
+ int (*set_cachesize) __P((DB_SEQUENCE *, int32_t));
+ int (*set_flags) __P((DB_SEQUENCE *, u_int32_t));
+ int (*set_range) __P((DB_SEQUENCE *, db_seq_t, db_seq_t));
+ int (*stat) __P((DB_SEQUENCE *,
+ DB_SEQUENCE_STAT **, u_int32_t));
+ int (*stat_print) __P((DB_SEQUENCE *, u_int32_t));
+ /* DB_SEQUENCE PUBLIC HANDLE LIST END */
+};
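+
+/*
+ * Editorial sketch (not part of the original header): a sequence handle is
+ * created against an open DB and stored under an application-chosen key;
+ * the key DBT below is assumed to be initialized by the caller:
+ *
+ *	DB_SEQUENCE *seq;
+ *	db_seq_t value;
+ *
+ *	if (db_sequence_create(&seq, dbp, 0) == 0 &&
+ *	    seq->open(seq, NULL, &key, DB_CREATE) == 0)
+ *		(void)seq->get(seq, NULL, 1, &value, 0);
+ */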
+
+struct __db_seq_stat { /* SHARED */
+ uintmax_t st_wait; /* Sequence lock granted after wait. */
+ uintmax_t st_nowait; /* Sequence lock granted w/o wait. */
+ db_seq_t st_current; /* Current value in db. */
+ db_seq_t st_value; /* Current cached value. */
+ db_seq_t st_last_value; /* Last cached value. */
+ db_seq_t st_min; /* Minimum value. */
+ db_seq_t st_max; /* Maximum value. */
+ int32_t st_cache_size; /* Cache size. */
+ u_int32_t st_flags; /* Flag value. */
+};
+
+/*******************************************************
+ * Access methods.
+ *******************************************************/
+/*
+ * Any new access method type must retain the original numbering: the
+ * type is written into log records, so existing values must be maintained.
+ */
+typedef enum {
+ DB_BTREE=1,
+ DB_HASH=2,
+ DB_HEAP=6,
+ DB_RECNO=3,
+ DB_QUEUE=4,
+ DB_UNKNOWN=5 /* Figure it out on open. */
+} DBTYPE;
+
+#define DB_RENAMEMAGIC 0x030800 /* File has been renamed. */
+
+#define DB_BTREEVERSION 9 /* Current btree version. */
+#define DB_BTREEOLDVER 8 /* Oldest btree version supported. */
+#define DB_BTREEMAGIC 0x053162
+
+#define DB_HASHVERSION 9 /* Current hash version. */
+#define DB_HASHOLDVER 7 /* Oldest hash version supported. */
+#define DB_HASHMAGIC 0x061561
+
+#define DB_HEAPVERSION 1 /* Current heap version. */
+#define DB_HEAPOLDVER 1 /* Oldest heap version supported. */
+#define DB_HEAPMAGIC 0x074582
+
+#define DB_QAMVERSION 4 /* Current queue version. */
+#define DB_QAMOLDVER 3 /* Oldest queue version supported. */
+#define DB_QAMMAGIC 0x042253
+
+#define DB_SEQUENCE_VERSION 2 /* Current sequence version. */
+#define DB_SEQUENCE_OLDVER 1 /* Oldest sequence version supported. */
+
+/*
+ * DB access method and cursor operation values. Each value is an operation
+ * code to which additional bit flags are added.
+ */
+#define DB_AFTER 1 /* Dbc.put */
+#define DB_APPEND 2 /* Db.put */
+#define DB_BEFORE 3 /* Dbc.put */
+#define DB_CONSUME 4 /* Db.get */
+#define DB_CONSUME_WAIT 5 /* Db.get */
+#define DB_CURRENT 6 /* Dbc.get, Dbc.put, DbLogc.get */
+#define DB_FIRST 7 /* Dbc.get, DbLogc->get */
+#define DB_GET_BOTH 8 /* Db.get, Dbc.get */
+#define DB_GET_BOTHC 9 /* Dbc.get (internal) */
+#define DB_GET_BOTH_RANGE 10 /* Db.get, Dbc.get */
+#define DB_GET_RECNO 11 /* Dbc.get */
+#define DB_JOIN_ITEM 12 /* Dbc.get; don't do primary lookup */
+#define DB_KEYFIRST 13 /* Dbc.put */
+#define DB_KEYLAST 14 /* Dbc.put */
+#define DB_LAST 15 /* Dbc.get, DbLogc->get */
+#define DB_NEXT 16 /* Dbc.get, DbLogc->get */
+#define DB_NEXT_DUP 17 /* Dbc.get */
+#define DB_NEXT_NODUP 18 /* Dbc.get */
+#define DB_NODUPDATA 19 /* Db.put, Dbc.put */
+#define DB_NOOVERWRITE 20 /* Db.put */
+#define DB_OVERWRITE_DUP 21 /* Dbc.put, Db.put; no DB_KEYEXIST */
+#define DB_POSITION 22 /* Dbc.dup */
+#define DB_PREV 23 /* Dbc.get, DbLogc->get */
+#define DB_PREV_DUP 24 /* Dbc.get */
+#define DB_PREV_NODUP 25 /* Dbc.get */
+#define DB_SET 26 /* Dbc.get, DbLogc->get */
+#define DB_SET_RANGE 27 /* Dbc.get */
+#define DB_SET_RECNO 28 /* Db.get, Dbc.get */
+#define DB_UPDATE_SECONDARY 29 /* Dbc.get, Dbc.del (internal) */
+#define DB_SET_LTE 30 /* Dbc.get (internal) */
+#define DB_GET_BOTH_LTE 31 /* Dbc.get (internal) */
+
+/* This has to change when the max opcode hits 255. */
+#define DB_OPFLAGS_MASK 0x000000ff /* Mask for operations flags. */
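+
+/*
+ * Editorial note (illustrative, not in the original header): the low byte
+ * carries the operation code and the remaining bits carry modifiers such
+ * as DB_RMW, so dispatch code extracts the opcode with the mask:
+ *
+ *	switch (flags & DB_OPFLAGS_MASK) {
+ *	case DB_SET:
+ *		...
+ *	}
+ */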
+
+/*
+ * DB (user visible) error return codes.
+ *
+ * !!!
+ * We don't want our error returns to conflict with other packages where
+ * possible, so pick a base error value that's hopefully not common. We
+ * document that we own the error name space from -30,800 to -30,999.
+ */
+/* DB (public) error return codes. */
+#define DB_BUFFER_SMALL (-30999)/* User memory too small for return. */
+#define DB_DONOTINDEX (-30998)/* "Null" return from 2ndary callbk. */
+#define DB_FOREIGN_CONFLICT (-30997)/* A foreign db constraint triggered. */
+#define DB_HEAP_FULL (-30996)/* No free space in a heap file. */
+#define DB_KEYEMPTY (-30995)/* Key/data deleted or never created. */
+#define DB_KEYEXIST (-30994)/* The key/data pair already exists. */
+#define DB_LOCK_DEADLOCK (-30993)/* Deadlock. */
+#define DB_LOCK_NOTGRANTED (-30992)/* Lock unavailable. */
+#define DB_LOG_BUFFER_FULL (-30991)/* In-memory log buffer full. */
+#define DB_LOG_VERIFY_BAD (-30990)/* Log verification failed. */
+#define DB_NOSERVER (-30989)/* Server panic return. */
+#define DB_NOTFOUND (-30988)/* Key/data pair not found (EOF). */
+#define DB_OLD_VERSION (-30987)/* Out-of-date version. */
+#define DB_PAGE_NOTFOUND (-30986)/* Requested page not found. */
+#define DB_REP_DUPMASTER (-30985)/* There are two masters. */
+#define DB_REP_HANDLE_DEAD (-30984)/* Rolled back a commit. */
+#define DB_REP_HOLDELECTION (-30983)/* Time to hold an election. */
+#define DB_REP_IGNORE (-30982)/* This msg should be ignored.*/
+#define DB_REP_ISPERM (-30981)/* Cached perm record now written. */
+#define DB_REP_JOIN_FAILURE (-30980)/* Unable to join replication group. */
+#define DB_REP_LEASE_EXPIRED (-30979)/* Master lease has expired. */
+#define DB_REP_LOCKOUT (-30978)/* API/Replication lockout now. */
+#define DB_REP_NEWSITE (-30977)/* New site entered system. */
+#define DB_REP_NOTPERM (-30976)/* Permanent log record not written. */
+#define DB_REP_UNAVAIL (-30975)/* Site cannot currently be reached. */
+#define DB_REP_WOULDROLLBACK (-30974)/* UNDOC: rollback inhibited by app. */
+#define DB_RUNRECOVERY (-30973)/* Panic return. */
+#define DB_SECONDARY_BAD (-30972)/* Secondary index corrupt. */
+#define DB_TIMEOUT (-30971)/* Timed out on read consistency. */
+#define DB_VERIFY_BAD (-30970)/* Verify failed; bad format. */
+#define DB_VERSION_MISMATCH (-30969)/* Environment version mismatch. */
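+
+/*
+ * Editorial sketch (not part of the original header): these codes come back
+ * as ordinary int returns, distinct from errno values by their range, e.g.:
+ *
+ *	switch (ret = dbp->get(dbp, NULL, &key, &data, 0)) {
+ *	case 0:
+ *		break;
+ *	case DB_NOTFOUND:
+ *		...key absent...
+ *		break;
+ *	default:
+ *		dbp->err(dbp, ret, "DB->get");
+ *	}
+ */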
+
+/* DB (private) error return codes. */
+#define DB_ALREADY_ABORTED (-30899)
+#define DB_CHKSUM_FAIL (-30898)/* Checksum failed. */
+#define DB_DELETED (-30897)/* Recovery file marked deleted. */
+#define DB_EVENT_NOT_HANDLED (-30896)/* Forward event to application. */
+#define DB_NEEDSPLIT (-30895)/* Page needs to be split. */
+#define DB_REP_BULKOVF (-30894)/* Rep bulk buffer overflow. */
+#define DB_REP_LOGREADY (-30893)/* Rep log ready for recovery. */
+#define DB_REP_NEWMASTER (-30892)/* We have learned of a new master. */
+#define DB_REP_PAGEDONE (-30891)/* This page was already done. */
+#define DB_SURPRISE_KID (-30890)/* Child commit where parent
+ didn't know it was a parent. */
+#define DB_SWAPBYTES (-30889)/* Database needs byte swapping. */
+#define DB_TXN_CKP (-30888)/* Encountered ckp record in log. */
+#define DB_VERIFY_FATAL (-30887)/* DB->verify cannot proceed. */
+
+/* Database handle. */
+struct __db {
+ /*******************************************************
+ * Public: owned by the application.
+ *******************************************************/
+ u_int32_t pgsize; /* Database logical page size. */
+ DB_CACHE_PRIORITY priority; /* Database priority in cache. */
+
+ /* Callbacks. */
+ int (*db_append_recno) __P((DB *, DBT *, db_recno_t));
+ void (*db_feedback) __P((DB *, int, int));
+ int (*dup_compare) __P((DB *, const DBT *, const DBT *));
+
+ void *app_private; /* Application-private handle. */
+
+ /*******************************************************
+ * Private: owned by DB.
+ *******************************************************/
+ DB_ENV *dbenv; /* Backing public environment. */
+ ENV *env; /* Backing private environment. */
+
+ DBTYPE type; /* DB access method type. */
+
+ DB_MPOOLFILE *mpf; /* Backing buffer pool. */
+
+ db_mutex_t mutex; /* Synchronization for free threading */
+
+ char *fname, *dname; /* File/database passed to DB->open. */
+ const char *dirname; /* Directory of DB file. */
+ u_int32_t open_flags; /* Flags passed to DB->open. */
+
+ u_int8_t fileid[DB_FILE_ID_LEN];/* File's unique ID for locking. */
+
+ u_int32_t adj_fileid; /* File's unique ID for curs. adj. */
+
+#define DB_LOGFILEID_INVALID -1
+ FNAME *log_filename; /* File's naming info for logging. */
+
+ db_pgno_t meta_pgno; /* Meta page number */
+ DB_LOCKER *locker; /* Locker for handle locking. */
+ DB_LOCKER *cur_locker; /* Current handle lock holder. */
+ DB_TXN *cur_txn; /* Opening transaction. */
+ DB_LOCKER *associate_locker; /* Locker for DB->associate call. */
+ DB_LOCK handle_lock; /* Lock held on this handle. */
+
+ time_t timestamp; /* Handle timestamp for replication. */
+ u_int32_t fid_gen; /* Rep generation number for fids. */
+
+ /*
+ * Returned data memory for DB->get() and friends.
+ */
+ DBT my_rskey; /* Secondary key. */
+ DBT my_rkey; /* [Primary] key. */
+ DBT my_rdata; /* Data. */
+
+ /*
+ * !!!
+ * Some applications use DB but implement their own locking outside of
+ * DB. If they're using fcntl(2) locking on the underlying database
+ * file, and we open and close a file descriptor for that file, we will
+ * discard their locks. The DB_FCNTL_LOCKING flag to DB->open is an
+ * undocumented interface to support this usage which leaves any file
+ * descriptors we open until DB->close. This will only work with the
+ * DB->open interface and simple caches, e.g., creating a transaction
+ * thread may open/close file descriptors this flag doesn't protect.
+ * Locking with fcntl(2) on a file that you don't own is a very, very
+ * unsafe thing to do. 'Nuff said.
+ */
+ DB_FH *saved_open_fhp; /* Saved file handle. */
+
+ /*
+ * Linked list of DBP's, linked from the ENV, used to keep track
+ * of all open db handles for cursor adjustment.
+ *
+ * !!!
+ * Explicit representations of structures from queue.h.
+ * TAILQ_ENTRY(__db) dblistlinks;
+ */
+ struct {
+ struct __db *tqe_next;
+ struct __db **tqe_prev;
+ } dblistlinks;
+
+ /*
+ * Cursor queues.
+ *
+ * !!!
+ * Explicit representations of structures from queue.h.
+ * TAILQ_HEAD(__cq_fq, __dbc) free_queue;
+ * TAILQ_HEAD(__cq_aq, __dbc) active_queue;
+ * TAILQ_HEAD(__cq_jq, __dbc) join_queue;
+ */
+ struct __cq_fq {
+ struct __dbc *tqh_first;
+ struct __dbc **tqh_last;
+ } free_queue;
+ struct __cq_aq {
+ struct __dbc *tqh_first;
+ struct __dbc **tqh_last;
+ } active_queue;
+ struct __cq_jq {
+ struct __dbc *tqh_first;
+ struct __dbc **tqh_last;
+ } join_queue;
+
+ /*
+ * Secondary index support.
+ *
+ * Linked list of secondary indices -- set in the primary.
+ *
+ * !!!
+ * Explicit representations of structures from queue.h.
+ * LIST_HEAD(s_secondaries, __db);
+ */
+ struct {
+ struct __db *lh_first;
+ } s_secondaries;
+
+ /*
+ * List entries for secondaries, and reference count of how many
+ * threads are updating this secondary (see Dbc.put).
+ *
+ * !!!
+ * Note that these are synchronized by the primary's mutex, but
+ * filled in in the secondaries.
+ *
+ * !!!
+ * Explicit representations of structures from queue.h.
+ * LIST_ENTRY(__db) s_links;
+ */
+ struct {
+ struct __db *le_next;
+ struct __db **le_prev;
+ } s_links;
+ u_int32_t s_refcnt;
+
+ /* Secondary callback and free functions -- set in the secondary. */
+ int (*s_callback) __P((DB *, const DBT *, const DBT *, DBT *));
+
+ /* Reference to primary -- set in the secondary. */
+ DB *s_primary;
+
+#define DB_ASSOC_IMMUTABLE_KEY 0x00000001 /* Secondary key is immutable. */
+#define DB_ASSOC_CREATE 0x00000002 /* Secondary db populated on open. */
+
+ /* Flags passed to associate -- set in the secondary. */
+ u_int32_t s_assoc_flags;
+
+ /*
+ * Foreign key support.
+ *
+ * Linked list of primary dbs -- set in the foreign db
+ *
+ * !!!
+ * Explicit representations of structures from queue.h.
+ * LIST_HEAD(f_primaries, __db);
+ */
+ struct {
+ struct __db_foreign_info *lh_first;
+ } f_primaries;
+
+ /*
+ * !!!
+ * Explicit representations of structures from queue.h.
+ * TAILQ_ENTRY(__db) felink;
+ *
+ * Links in a list of DBs involved in file extension
+ * during a transaction. These are to be used only while the
+ * metadata is locked.
+ */
+ struct {
+ struct __db *tqe_next;
+ struct __db **tqe_prev;
+ } felink;
+
+ /* Reference to foreign -- set in the secondary. */
+ DB *s_foreign;
+
+ /* API-private structure: used by DB 1.85, C++, Java, Perl and Tcl */
+ void *api_internal;
+
+ /* Subsystem-private structure. */
+ void *bt_internal; /* Btree/Recno access method. */
+ void *h_internal; /* Hash access method. */
+ void *heap_internal; /* Heap access method. */
+ void *p_internal; /* Partition information. */
+ void *q_internal; /* Queue access method. */
+
+ /* DB PUBLIC HANDLE LIST BEGIN */
+ int (*associate) __P((DB *, DB_TXN *, DB *,
+ int (*)(DB *, const DBT *, const DBT *, DBT *), u_int32_t));
+ int (*associate_foreign) __P((DB *, DB *,
+ int (*)(DB *, const DBT *, DBT *, const DBT *, int *),
+ u_int32_t));
+ int (*close) __P((DB *, u_int32_t));
+ int (*compact) __P((DB *,
+ DB_TXN *, DBT *, DBT *, DB_COMPACT *, u_int32_t, DBT *));
+ int (*cursor) __P((DB *, DB_TXN *, DBC **, u_int32_t));
+ int (*del) __P((DB *, DB_TXN *, DBT *, u_int32_t));
+ void (*err) __P((DB *, int, const char *, ...));
+ void (*errx) __P((DB *, const char *, ...));
+ int (*exists) __P((DB *, DB_TXN *, DBT *, u_int32_t));
+ int (*fd) __P((DB *, int *));
+ int (*get) __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
+ int (*get_alloc) __P((DB *, void *(**)(size_t),
+ void *(**)(void *, size_t), void (**)(void *)));
+ int (*get_append_recno) __P((DB *, int (**)(DB *, DBT *, db_recno_t)));
+ int (*get_assoc_flags) __P((DB *, u_int32_t *));
+ int (*get_bt_compare)
+ __P((DB *, int (**)(DB *, const DBT *, const DBT *)));
+ int (*get_bt_compress) __P((DB *,
+ int (**)(DB *,
+ const DBT *, const DBT *, const DBT *, const DBT *, DBT *),
+ int (**)(DB *, const DBT *, const DBT *, DBT *, DBT *, DBT *)));
+ int (*get_bt_minkey) __P((DB *, u_int32_t *));
+ int (*get_bt_prefix)
+ __P((DB *, size_t (**)(DB *, const DBT *, const DBT *)));
+ int (*get_byteswapped) __P((DB *, int *));
+ int (*get_cachesize) __P((DB *, u_int32_t *, u_int32_t *, int *));
+ int (*get_create_dir) __P((DB *, const char **));
+ int (*get_dbname) __P((DB *, const char **, const char **));
+ int (*get_dup_compare)
+ __P((DB *, int (**)(DB *, const DBT *, const DBT *)));
+ int (*get_encrypt_flags) __P((DB *, u_int32_t *));
+ DB_ENV *(*get_env) __P((DB *));
+ void (*get_errcall) __P((DB *,
+ void (**)(const DB_ENV *, const char *, const char *)));
+ void (*get_errfile) __P((DB *, FILE **));
+ void (*get_errpfx) __P((DB *, const char **));
+ int (*get_feedback) __P((DB *, void (**)(DB *, int, int)));
+ int (*get_flags) __P((DB *, u_int32_t *));
+ int (*get_h_compare)
+ __P((DB *, int (**)(DB *, const DBT *, const DBT *)));
+ int (*get_h_ffactor) __P((DB *, u_int32_t *));
+ int (*get_h_hash)
+ __P((DB *, u_int32_t (**)(DB *, const void *, u_int32_t)));
+ int (*get_h_nelem) __P((DB *, u_int32_t *));
+ int (*get_heapsize) __P((DB *, u_int32_t *, u_int32_t *));
+ int (*get_heap_regionsize) __P((DB *, u_int32_t *));
+ int (*get_lk_exclusive) __P((DB *, int *, int *));
+ int (*get_lorder) __P((DB *, int *));
+ DB_MPOOLFILE *(*get_mpf) __P((DB *));
+ void (*get_msgcall) __P((DB *,
+ void (**)(const DB_ENV *, const char *)));
+ void (*get_msgfile) __P((DB *, FILE **));
+ int (*get_multiple) __P((DB *));
+ int (*get_open_flags) __P((DB *, u_int32_t *));
+ int (*get_pagesize) __P((DB *, u_int32_t *));
+ int (*get_partition_callback) __P((DB *,
+ u_int32_t *, u_int32_t (**)(DB *, DBT *key)));
+ int (*get_partition_dirs) __P((DB *, const char ***));
+ int (*get_partition_keys) __P((DB *, u_int32_t *, DBT **));
+ int (*get_priority) __P((DB *, DB_CACHE_PRIORITY *));
+ int (*get_q_extentsize) __P((DB *, u_int32_t *));
+ int (*get_re_delim) __P((DB *, int *));
+ int (*get_re_len) __P((DB *, u_int32_t *));
+ int (*get_re_pad) __P((DB *, int *));
+ int (*get_re_source) __P((DB *, const char **));
+ int (*get_transactional) __P((DB *));
+ int (*get_type) __P((DB *, DBTYPE *));
+ int (*join) __P((DB *, DBC **, DBC **, u_int32_t));
+ int (*key_range)
+ __P((DB *, DB_TXN *, DBT *, DB_KEY_RANGE *, u_int32_t));
+ int (*open) __P((DB *,
+ DB_TXN *, const char *, const char *, DBTYPE, u_int32_t, int));
+ int (*pget) __P((DB *, DB_TXN *, DBT *, DBT *, DBT *, u_int32_t));
+ int (*put) __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
+ int (*remove) __P((DB *, const char *, const char *, u_int32_t));
+ int (*rename) __P((DB *,
+ const char *, const char *, const char *, u_int32_t));
+ int (*set_alloc) __P((DB *, void *(*)(size_t),
+ void *(*)(void *, size_t), void (*)(void *)));
+ int (*set_append_recno) __P((DB *, int (*)(DB *, DBT *, db_recno_t)));
+ int (*set_bt_compare)
+ __P((DB *, int (*)(DB *, const DBT *, const DBT *)));
+ int (*set_bt_compress) __P((DB *,
+ int (*)(DB *, const DBT *, const DBT *, const DBT *, const DBT *, DBT *),
+ int (*)(DB *, const DBT *, const DBT *, DBT *, DBT *, DBT *)));
+ int (*set_bt_minkey) __P((DB *, u_int32_t));
+ int (*set_bt_prefix)
+ __P((DB *, size_t (*)(DB *, const DBT *, const DBT *)));
+ int (*set_cachesize) __P((DB *, u_int32_t, u_int32_t, int));
+ int (*set_create_dir) __P((DB *, const char *));
+ int (*set_dup_compare)
+ __P((DB *, int (*)(DB *, const DBT *, const DBT *)));
+ int (*set_encrypt) __P((DB *, const char *, u_int32_t));
+ void (*set_errcall) __P((DB *,
+ void (*)(const DB_ENV *, const char *, const char *)));
+ void (*set_errfile) __P((DB *, FILE *));
+ void (*set_errpfx) __P((DB *, const char *));
+ int (*set_feedback) __P((DB *, void (*)(DB *, int, int)));
+ int (*set_flags) __P((DB *, u_int32_t));
+ int (*set_h_compare)
+ __P((DB *, int (*)(DB *, const DBT *, const DBT *)));
+ int (*set_h_ffactor) __P((DB *, u_int32_t));
+ int (*set_h_hash)
+ __P((DB *, u_int32_t (*)(DB *, const void *, u_int32_t)));
+ int (*set_h_nelem) __P((DB *, u_int32_t));
+ int (*set_heapsize) __P((DB *, u_int32_t, u_int32_t, u_int32_t));
+ int (*set_heap_regionsize) __P((DB *, u_int32_t));
+ int (*set_lk_exclusive) __P((DB *, int));
+ int (*set_lorder) __P((DB *, int));
+ void (*set_msgcall) __P((DB *, void (*)(const DB_ENV *, const char *)));
+ void (*set_msgfile) __P((DB *, FILE *));
+ int (*set_pagesize) __P((DB *, u_int32_t));
+ int (*set_paniccall) __P((DB *, void (*)(DB_ENV *, int)));
+ int (*set_partition) __P((DB *,
+ u_int32_t, DBT *, u_int32_t (*)(DB *, DBT *key)));
+ int (*set_partition_dirs) __P((DB *, const char **));
+ int (*set_priority) __P((DB *, DB_CACHE_PRIORITY));
+ int (*set_q_extentsize) __P((DB *, u_int32_t));
+ int (*set_re_delim) __P((DB *, int));
+ int (*set_re_len) __P((DB *, u_int32_t));
+ int (*set_re_pad) __P((DB *, int));
+ int (*set_re_source) __P((DB *, const char *));
+ int (*sort_multiple) __P((DB *, DBT *, DBT *, u_int32_t));
+ int (*stat) __P((DB *, DB_TXN *, void *, u_int32_t));
+ int (*stat_print) __P((DB *, u_int32_t));
+ int (*sync) __P((DB *, u_int32_t));
+ int (*truncate) __P((DB *, DB_TXN *, u_int32_t *, u_int32_t));
+ int (*upgrade) __P((DB *, const char *, u_int32_t));
+ int (*verify)
+ __P((DB *, const char *, const char *, FILE *, u_int32_t));
+ /* DB PUBLIC HANDLE LIST END */
+
+ /* DB PRIVATE HANDLE LIST BEGIN */
+ int (*dump) __P((DB *, const char *,
+ int (*)(void *, const void *), void *, int, int));
+ int (*db_am_remove) __P((DB *, DB_THREAD_INFO *,
+ DB_TXN *, const char *, const char *, u_int32_t));
+ int (*db_am_rename) __P((DB *, DB_THREAD_INFO *,
+ DB_TXN *, const char *, const char *, const char *));
+ /* DB PRIVATE HANDLE LIST END */
+
+ /*
+ * Never called; these are a place to save function pointers
+ * so that we can undo an associate.
+ */
+ int (*stored_get) __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
+ int (*stored_close) __P((DB *, u_int32_t));
+
+ /* Alternative handle close function, used by C++ API. */
+ int (*alt_close) __P((DB *, u_int32_t));
+
+#define DB_OK_BTREE 0x01
+#define DB_OK_HASH 0x02
+#define DB_OK_HEAP 0x04
+#define DB_OK_QUEUE 0x08
+#define DB_OK_RECNO 0x10
+ u_int32_t am_ok; /* Legal AM choices. */
+
+ /*
+ * This field really ought to be an AM_FLAG, but we have
+ * run out of bits. If/when we decide to split up
+ * the flags, we can incorporate it.
+ */
+ int preserve_fid; /* Do not free fileid on close. */
+
+#define DB_AM_CHKSUM 0x00000001 /* Checksumming */
+#define DB_AM_COMPENSATE 0x00000002 /* Created by compensating txn */
+#define DB_AM_COMPRESS 0x00000004 /* Compressed BTree */
+#define DB_AM_CREATED 0x00000008 /* Database was created upon open */
+#define DB_AM_CREATED_MSTR 0x00000010 /* Encompassing file was created */
+#define DB_AM_DBM_ERROR 0x00000020 /* Error in DBM/NDBM database */
+#define DB_AM_DELIMITER 0x00000040 /* Variable length delimiter set */
+#define DB_AM_DISCARD 0x00000080 /* Discard any cached pages */
+#define DB_AM_DUP 0x00000100 /* DB_DUP */
+#define DB_AM_DUPSORT 0x00000200 /* DB_DUPSORT */
+#define DB_AM_ENCRYPT 0x00000400 /* Encryption */
+#define DB_AM_FIXEDLEN 0x00000800 /* Fixed-length records */
+#define DB_AM_INMEM 0x00001000 /* In-memory; no sync on close */
+#define DB_AM_INORDER 0x00002000 /* DB_INORDER */
+#define DB_AM_IN_RENAME 0x00004000 /* File is being renamed */
+#define DB_AM_NOT_DURABLE 0x00008000 /* Do not log changes */
+#define DB_AM_OPEN_CALLED 0x00010000 /* DB->open called */
+#define DB_AM_PAD 0x00020000 /* Fixed-length record pad */
+#define DB_AM_PARTDB 0x00040000 /* Handle for a database partition */
+#define DB_AM_PGDEF 0x00080000 /* Page size was defaulted */
+#define DB_AM_RDONLY 0x00100000 /* Database is readonly */
+#define DB_AM_READ_UNCOMMITTED 0x00200000 /* Support degree 1 isolation */
+#define DB_AM_RECNUM 0x00400000 /* DB_RECNUM */
+#define DB_AM_RECOVER 0x00800000 /* DB opened by recovery routine */
+#define DB_AM_RENUMBER 0x01000000 /* DB_RENUMBER */
+#define DB_AM_REVSPLITOFF 0x02000000 /* DB_REVSPLITOFF */
+#define DB_AM_SECONDARY 0x04000000 /* Database is a secondary index */
+#define DB_AM_SNAPSHOT 0x08000000 /* DB_SNAPSHOT */
+#define DB_AM_SUBDB 0x10000000 /* Subdatabases supported */
+#define DB_AM_SWAP 0x20000000 /* Pages need to be byte-swapped */
+#define DB_AM_TXN 0x40000000 /* Opened in a transaction */
+#define DB_AM_VERIFYING 0x80000000 /* DB handle is in the verifier */
+ u_int32_t orig_flags; /* Flags at open, for refresh */
+ u_int32_t flags;
+
+#define DB2_AM_EXCL 0x00000001 /* Exclusively lock the handle */
+#define DB2_AM_INTEXCL 0x00000002 /* Internal exclusive lock. */
+#define DB2_AM_NOWAIT 0x00000004 /* Do not wait for handle lock */
+ u_int32_t orig_flags2; /* Second flags word; for refresh */
+ u_int32_t flags2; /* Second flags word */
+};
+
+/*
+ * Macros for bulk operations. These are only intended for the C API.
+ * For C++, use DbMultiple*Iterator or DbMultiple*Builder.
+ *
+ * Bulk operations store multiple entries into a single DBT structure. The
+ * following macros assist with creating and reading these Multiple DBTs.
+ *
+ * The basic layout for single data items is:
+ *
+ * -------------------------------------------------------------------------
+ * | data1 | ... | dataN | ..... |-1 | dNLen | dNOff | ... | d1Len | d1Off |
+ * -------------------------------------------------------------------------
+ *
+ * For the DB_MULTIPLE_KEY* macros, the items are in key/data pairs, so data1
+ * would be a key, and data2 its corresponding value (N is always even).
+ *
+ * For the DB_MULTIPLE_RECNO* macros, the record number is stored along with
+ * the len/off pair in the "header" section, and the list is zero terminated
+ * (since -1 is a valid record number):
+ *
+ * --------------------------------------------------------------------------
+ * | d1 |..| dN |..| 0 | dNLen | dNOff | recnoN |..| d1Len | d1Off | recno1 |
+ * --------------------------------------------------------------------------
+ */
+#define DB_MULTIPLE_INIT(pointer, dbt) \
+ (pointer = (u_int8_t *)(dbt)->data + \
+ (dbt)->ulen - sizeof(u_int32_t))
+
+#define DB_MULTIPLE_NEXT(pointer, dbt, retdata, retdlen) \
+ do { \
+ u_int32_t *__p = (u_int32_t *)(pointer); \
+ if (*__p == (u_int32_t)-1) { \
+ retdata = NULL; \
+ pointer = NULL; \
+ break; \
+ } \
+ retdata = (u_int8_t *)(dbt)->data + *__p--; \
+ retdlen = *__p--; \
+ pointer = __p; \
+ if (retdlen == 0 && retdata == (u_int8_t *)(dbt)->data) \
+ retdata = NULL; \
+ } while (0)
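+
+/*
+ * Editorial sketch (not part of the original header): iterating a bulk
+ * buffer returned by DB->get() with the DB_MULTIPLE flag, assuming "data"
+ * is a user-owned DBT (DB_DBT_USERMEM) sized for several items:
+ *
+ *	void *p, *retdata;
+ *	u_int32_t retdlen;
+ *
+ *	DB_MULTIPLE_INIT(p, &data);
+ *	while (p != NULL) {
+ *		DB_MULTIPLE_NEXT(p, &data, retdata, retdlen);
+ *		if (retdata != NULL)
+ *			...process retdlen bytes at retdata...
+ *	}
+ */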
+
+#define DB_MULTIPLE_KEY_NEXT(pointer, dbt, retkey, retklen, retdata, retdlen) \
+ do { \
+ u_int32_t *__p = (u_int32_t *)(pointer); \
+ if (*__p == (u_int32_t)-1) { \
+ retdata = NULL; \
+ retkey = NULL; \
+ pointer = NULL; \
+ break; \
+ } \
+ retkey = (u_int8_t *)(dbt)->data + *__p--; \
+ retklen = *__p--; \
+ retdata = (u_int8_t *)(dbt)->data + *__p--; \
+ retdlen = *__p--; \
+ pointer = __p; \
+ } while (0)
+
+#define DB_MULTIPLE_RECNO_NEXT(pointer, dbt, recno, retdata, retdlen) \
+ do { \
+ u_int32_t *__p = (u_int32_t *)(pointer); \
+ if (*__p == (u_int32_t)0) { \
+ recno = 0; \
+ retdata = NULL; \
+ pointer = NULL; \
+ break; \
+ } \
+ recno = *__p--; \
+ retdata = (u_int8_t *)(dbt)->data + *__p--; \
+ retdlen = *__p--; \
+ pointer = __p; \
+ } while (0)
+
+#define DB_MULTIPLE_WRITE_INIT(pointer, dbt) \
+ do { \
+ (dbt)->flags |= DB_DBT_BULK; \
+ pointer = (u_int8_t *)(dbt)->data + \
+ (dbt)->ulen - sizeof(u_int32_t); \
+ *(u_int32_t *)(pointer) = (u_int32_t)-1; \
+ } while (0)
+
+#define DB_MULTIPLE_RESERVE_NEXT(pointer, dbt, writedata, writedlen) \
+ do { \
+ u_int32_t *__p = (u_int32_t *)(pointer); \
+ u_int32_t __off = ((pointer) == (u_int8_t *)(dbt)->data +\
+ (dbt)->ulen - sizeof(u_int32_t)) ? 0 : __p[1] + __p[2];\
+ if ((u_int8_t *)(dbt)->data + __off + (writedlen) > \
+ (u_int8_t *)(__p - 2)) \
+ writedata = NULL; \
+ else { \
+ writedata = (u_int8_t *)(dbt)->data + __off; \
+ __p[0] = __off; \
+ __p[-1] = (u_int32_t)(writedlen); \
+ __p[-2] = (u_int32_t)-1; \
+ pointer = __p - 2; \
+ } \
+ } while (0)
+
+#define DB_MULTIPLE_WRITE_NEXT(pointer, dbt, writedata, writedlen) \
+ do { \
+ void *__destd; \
+ DB_MULTIPLE_RESERVE_NEXT((pointer), (dbt), \
+ __destd, (writedlen)); \
+ if (__destd == NULL) \
+ pointer = NULL; \
+ else \
+ memcpy(__destd, (writedata), (writedlen)); \
+ } while (0)
+
+#define DB_MULTIPLE_KEY_RESERVE_NEXT(pointer, dbt, writekey, writeklen, writedata, writedlen) \
+ do { \
+ u_int32_t *__p = (u_int32_t *)(pointer); \
+ u_int32_t __off = ((pointer) == (u_int8_t *)(dbt)->data +\
+ (dbt)->ulen - sizeof(u_int32_t)) ? 0 : __p[1] + __p[2];\
+ if ((u_int8_t *)(dbt)->data + __off + (writeklen) + \
+ (writedlen) > (u_int8_t *)(__p - 4)) { \
+ writekey = NULL; \
+ writedata = NULL; \
+ } else { \
+ writekey = (u_int8_t *)(dbt)->data + __off; \
+ __p[0] = __off; \
+ __p[-1] = (u_int32_t)(writeklen); \
+ __p -= 2; \
+ __off += (u_int32_t)(writeklen); \
+ writedata = (u_int8_t *)(dbt)->data + __off; \
+ __p[0] = __off; \
+ __p[-1] = (u_int32_t)(writedlen); \
+ __p[-2] = (u_int32_t)-1; \
+ pointer = __p - 2; \
+ } \
+ } while (0)
+
+#define DB_MULTIPLE_KEY_WRITE_NEXT(pointer, dbt, writekey, writeklen, writedata, writedlen) \
+ do { \
+ void *__destk, *__destd; \
+ DB_MULTIPLE_KEY_RESERVE_NEXT((pointer), (dbt), \
+ __destk, (writeklen), __destd, (writedlen)); \
+ if (__destk == NULL) \
+ pointer = NULL; \
+ else { \
+ memcpy(__destk, (writekey), (writeklen)); \
+ if (__destd != NULL) \
+ memcpy(__destd, (writedata), (writedlen));\
+ } \
+ } while (0)
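+
+/*
+ * Editorial sketch (assumed usage, not in the original header): building a
+ * key/data bulk buffer and storing it in a single call; under
+ * DB_MULTIPLE_KEY the data parameter of DB->put() is assumed unused:
+ *
+ *	void *p;
+ *
+ *	DB_MULTIPLE_WRITE_INIT(p, &multi);
+ *	DB_MULTIPLE_KEY_WRITE_NEXT(p, &multi, k1, k1len, d1, d1len);
+ *	DB_MULTIPLE_KEY_WRITE_NEXT(p, &multi, k2, k2len, d2, d2len);
+ *	if (p != NULL)
+ *		ret = dbp->put(dbp, NULL, &multi, NULL, DB_MULTIPLE_KEY);
+ */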
+
+#define DB_MULTIPLE_RECNO_WRITE_INIT(pointer, dbt) \
+ do { \
+ (dbt)->flags |= DB_DBT_BULK; \
+ pointer = (u_int8_t *)(dbt)->data + \
+ (dbt)->ulen - sizeof(u_int32_t); \
+ *(u_int32_t *)(pointer) = 0; \
+ } while (0)
+
+#define DB_MULTIPLE_RECNO_RESERVE_NEXT(pointer, dbt, recno, writedata, writedlen) \
+ do { \
+ u_int32_t *__p = (u_int32_t *)(pointer); \
+ u_int32_t __off = ((pointer) == (u_int8_t *)(dbt)->data +\
+ (dbt)->ulen - sizeof(u_int32_t)) ? 0 : __p[1] + __p[2]; \
+ if (((u_int8_t *)(dbt)->data + __off) + (writedlen) > \
+ (u_int8_t *)(__p - 3)) \
+ writedata = NULL; \
+ else { \
+ writedata = (u_int8_t *)(dbt)->data + __off; \
+ __p[0] = (u_int32_t)(recno); \
+ __p[-1] = __off; \
+ __p[-2] = (u_int32_t)(writedlen); \
+ __p[-3] = 0; \
+ pointer = __p - 3; \
+ } \
+ } while (0)
+
+#define DB_MULTIPLE_RECNO_WRITE_NEXT(pointer, dbt, recno, writedata, writedlen)\
+ do { \
+ void *__destd; \
+ DB_MULTIPLE_RECNO_RESERVE_NEXT((pointer), (dbt), \
+ (recno), __destd, (writedlen)); \
+ if (__destd == NULL) \
+ pointer = NULL; \
+ else if ((writedlen) != 0) \
+ memcpy(__destd, (writedata), (writedlen)); \
+ } while (0)
+
+struct __db_heap_rid {
+ db_pgno_t pgno; /* Page number. */
+ db_indx_t indx; /* Index in the offset table. */
+};
+#define DB_HEAP_RID_SZ (sizeof(db_pgno_t) + sizeof(db_indx_t))
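+
+/*
+ * Editorial note (illustrative, not in the original header): heap records
+ * are addressed by RID, so a heap key DBT wraps this structure directly:
+ *
+ *	DB_HEAP_RID rid;
+ *	DBT key;
+ *
+ *	memset(&key, 0, sizeof(key));
+ *	key.data = &rid;
+ *	key.size = key.ulen = DB_HEAP_RID_SZ;
+ *	key.flags = DB_DBT_USERMEM;
+ */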
+
+/*******************************************************
+ * Access method cursors.
+ *******************************************************/
+struct __dbc {
+ DB *dbp; /* Backing database */
+ DB_ENV *dbenv; /* Backing environment */
+ ENV *env; /* Backing environment */
+
+ DB_THREAD_INFO *thread_info; /* Thread that owns this cursor. */
+ DB_TXN *txn; /* Associated transaction. */
+ DB_CACHE_PRIORITY priority; /* Priority in cache. */
+
+ /*
+ * Active/free cursor queues.
+ *
+ * !!!
+ * Explicit representations of structures from queue.h.
+ * TAILQ_ENTRY(__dbc) links;
+ */
+ struct {
+ DBC *tqe_next;
+ DBC **tqe_prev;
+ } links;
+
+ /*
+ * Cursor queue of the owning transaction.
+ *
+ * !!!
+ * Explicit representations of structures from queue.h.
+ * TAILQ_ENTRY(__dbc) txn_cursors;
+ */
+ struct {
+ DBC *tqe_next; /* next element */
+ DBC **tqe_prev; /* address of previous next element */
+ } txn_cursors;
+
+ /*
+ * The DBT *'s below are used by the cursor routines to return
+ * data to the user when DBT flags indicate that DB should manage
+ * the returned memory. They point at a DBT containing the buffer
+ * and length that will be used, and "belonging" to the handle that
+ * should "own" this memory. This may be a "my_*" field of this
+ * cursor--the default--or it may be the corresponding field of
+ * another cursor, a DB handle, a join cursor, etc. In general, it
+ * will be whatever handle the user originally used for the current
+ * DB interface call.
+ */
+ DBT *rskey; /* Returned secondary key. */
+ DBT *rkey; /* Returned [primary] key. */
+ DBT *rdata; /* Returned data. */
+
+ DBT my_rskey; /* Space for returned secondary key. */
+ DBT my_rkey; /* Space for returned [primary] key. */
+ DBT my_rdata; /* Space for returned data. */
+
+ DB_LOCKER *lref; /* Reference to default locker. */
+ DB_LOCKER *locker; /* Locker for this operation. */
+ DBT lock_dbt; /* DBT referencing lock. */
+ DB_LOCK_ILOCK lock; /* Object to be locked. */
+ DB_LOCK mylock; /* CDB lock held on this cursor. */
+
+ DBTYPE dbtype; /* Cursor type. */
+
+ DBC_INTERNAL *internal; /* Access method private. */
+
+ /* DBC PUBLIC HANDLE LIST BEGIN */
+ int (*close) __P((DBC *));
+ int (*cmp) __P((DBC *, DBC *, int *, u_int32_t));
+ int (*count) __P((DBC *, db_recno_t *, u_int32_t));
+ int (*del) __P((DBC *, u_int32_t));
+ int (*dup) __P((DBC *, DBC **, u_int32_t));
+ int (*get) __P((DBC *, DBT *, DBT *, u_int32_t));
+ int (*get_priority) __P((DBC *, DB_CACHE_PRIORITY *));
+ int (*pget) __P((DBC *, DBT *, DBT *, DBT *, u_int32_t));
+ int (*put) __P((DBC *, DBT *, DBT *, u_int32_t));
+ int (*set_priority) __P((DBC *, DB_CACHE_PRIORITY));
+ /* DBC PUBLIC HANDLE LIST END */
+
+ /* The following are the method names deprecated in the 4.6 release. */
+ int (*c_close) __P((DBC *));
+ int (*c_count) __P((DBC *, db_recno_t *, u_int32_t));
+ int (*c_del) __P((DBC *, u_int32_t));
+ int (*c_dup) __P((DBC *, DBC **, u_int32_t));
+ int (*c_get) __P((DBC *, DBT *, DBT *, u_int32_t));
+ int (*c_pget) __P((DBC *, DBT *, DBT *, DBT *, u_int32_t));
+ int (*c_put) __P((DBC *, DBT *, DBT *, u_int32_t));
+
+ /* DBC PRIVATE HANDLE LIST BEGIN */
+ int (*am_bulk) __P((DBC *, DBT *, u_int32_t));
+ int (*am_close) __P((DBC *, db_pgno_t, int *));
+ int (*am_del) __P((DBC *, u_int32_t));
+ int (*am_destroy) __P((DBC *));
+ int (*am_get) __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
+ int (*am_put) __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
+ int (*am_writelock) __P((DBC *));
+ /* DBC PRIVATE HANDLE LIST END */
+
+/*
+ * DBC_DONTLOCK and DBC_RECOVER are used during recovery and transaction
+ * abort. If a transaction is being aborted or recovered then DBC_RECOVER
+ * will be set and locking and logging will be disabled on this cursor. If
+ * we are performing a compensating transaction (e.g. free page processing)
+ * then DBC_DONTLOCK will be set to inhibit locking, but logging will still
+ * be required. DBC_DONTLOCK is also used if the whole database is locked.
+ */
+#define DBC_ACTIVE 0x00001 /* Cursor in use. */
+#define DBC_BULK 0x00002 /* Bulk update cursor. */
+#define DBC_DONTLOCK 0x00004 /* Don't lock on this cursor. */
+#define DBC_DOWNREV 0x00008 /* Down rev replication master. */
+#define DBC_DUPLICATE 0x00010 /* Create a duplicate cursor. */
+#define DBC_ERROR 0x00020 /* Error in this request. */
+#define DBC_FAMILY 0x00040 /* Part of a locker family. */
+#define DBC_FROM_DB_GET 0x00080 /* Called from the DB->get() method. */
+#define DBC_MULTIPLE 0x00100 /* Return Multiple data. */
+#define DBC_MULTIPLE_KEY 0x00200 /* Return Multiple keys and data. */
+#define DBC_OPD 0x00400 /* Cursor references off-page dups. */
+#define DBC_OWN_LID 0x00800 /* Free lock id on destroy. */
+#define DBC_PARTITIONED 0x01000 /* Cursor for a partitioned db. */
+#define DBC_READ_COMMITTED 0x02000 /* Cursor has degree 2 isolation. */
+#define DBC_READ_UNCOMMITTED 0x04000 /* Cursor has degree 1 isolation. */
+#define DBC_RECOVER 0x08000 /* Recovery cursor; don't log/lock. */
+#define DBC_RMW 0x10000 /* Acquire write flag in read op. */
+#define DBC_TRANSIENT 0x20000 /* Cursor is transient. */
+#define DBC_WAS_READ_COMMITTED 0x40000 /* Cursor holds a read committed lock. */
+#define DBC_WRITECURSOR 0x80000 /* Cursor may be used to write (CDB). */
+#define DBC_WRITER 0x100000 /* Cursor immediately writing (CDB). */
+ u_int32_t flags;
+};
+
+/* Key range statistics structure */
+struct __key_range {
+ double less;
+ double equal;
+ double greater;
+};
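+
+/*
+ * Editorial sketch (not part of the original header): filled in by
+ * DB->key_range(); the three fractions are estimates and sum to 1:
+ *
+ *	DB_KEY_RANGE range;
+ *
+ *	if (dbp->key_range(dbp, NULL, &key, &range, 0) == 0)
+ *		...range.less + range.equal + range.greater == 1.0...
+ */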
+
+/* Btree/Recno statistics structure. */
+struct __db_bt_stat { /* SHARED */
+ u_int32_t bt_magic; /* Magic number. */
+ u_int32_t bt_version; /* Version number. */
+ u_int32_t bt_metaflags; /* Metadata flags. */
+ u_int32_t bt_nkeys; /* Number of unique keys. */
+ u_int32_t bt_ndata; /* Number of data items. */
+ u_int32_t bt_pagecnt; /* Page count. */
+ u_int32_t bt_pagesize; /* Page size. */
+ u_int32_t bt_minkey; /* Minkey value. */
+ u_int32_t bt_re_len; /* Fixed-length record length. */
+ u_int32_t bt_re_pad; /* Fixed-length record pad. */
+ u_int32_t bt_levels; /* Tree levels. */
+ u_int32_t bt_int_pg; /* Internal pages. */
+ u_int32_t bt_leaf_pg; /* Leaf pages. */
+ u_int32_t bt_dup_pg; /* Duplicate pages. */
+ u_int32_t bt_over_pg; /* Overflow pages. */
+ u_int32_t bt_empty_pg; /* Empty pages. */
+ u_int32_t bt_free; /* Pages on the free list. */
+ uintmax_t bt_int_pgfree; /* Bytes free in internal pages. */
+ uintmax_t bt_leaf_pgfree; /* Bytes free in leaf pages. */
+ uintmax_t bt_dup_pgfree; /* Bytes free in duplicate pages. */
+ uintmax_t bt_over_pgfree; /* Bytes free in overflow pages. */
+};
+
+struct __db_compact {
+ /* Input Parameters. */
+ u_int32_t compact_fillpercent; /* Desired fillfactor: 1-100 */
+ db_timeout_t compact_timeout; /* Lock timeout. */
+ u_int32_t compact_pages; /* Max pages to process. */
+ /* Output Stats. */
+ u_int32_t compact_empty_buckets; /* Empty hash buckets found. */
+ u_int32_t compact_pages_free; /* Number of pages freed. */
+ u_int32_t compact_pages_examine; /* Number of pages examined. */
+ u_int32_t compact_levels; /* Number of levels removed. */
+ u_int32_t compact_deadlock; /* Number of deadlocks. */
+ db_pgno_t compact_pages_truncated; /* Pages truncated to OS. */
+ /* Internal. */
+ db_pgno_t compact_truncate; /* Page number for truncation */
+};
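+
+/*
+ * Editorial sketch (not part of the original header): zero the structure,
+ * set any input limits, and pass it to DB->compact(); the DB_FREE_SPACE
+ * flag additionally returns freed pages to the filesystem:
+ *
+ *	DB_COMPACT c_data;
+ *
+ *	memset(&c_data, 0, sizeof(c_data));
+ *	c_data.compact_fillpercent = 80;
+ *	ret = dbp->compact(dbp, NULL, NULL, NULL, &c_data, DB_FREE_SPACE, NULL);
+ */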
+
+/* Hash statistics structure. */
+struct __db_h_stat { /* SHARED */
+ u_int32_t hash_magic; /* Magic number. */
+ u_int32_t hash_version; /* Version number. */
+ u_int32_t hash_metaflags; /* Metadata flags. */
+ u_int32_t hash_nkeys; /* Number of unique keys. */
+ u_int32_t hash_ndata; /* Number of data items. */
+ u_int32_t hash_pagecnt; /* Page count. */
+ u_int32_t hash_pagesize; /* Page size. */
+ u_int32_t hash_ffactor; /* Fill factor specified at create. */
+ u_int32_t hash_buckets; /* Number of hash buckets. */
+ u_int32_t hash_free; /* Pages on the free list. */
+ uintmax_t hash_bfree; /* Bytes free on bucket pages. */
+ u_int32_t hash_bigpages; /* Number of big key/data pages. */
+ uintmax_t hash_big_bfree; /* Bytes free on big item pages. */
+ u_int32_t hash_overflows; /* Number of overflow pages. */
+ uintmax_t hash_ovfl_free; /* Bytes free on ovfl pages. */
+ u_int32_t hash_dup; /* Number of dup pages. */
+ uintmax_t hash_dup_free; /* Bytes free on duplicate pages. */
+};
+
+/* Heap statistics structure. */
+struct __db_heap_stat { /* SHARED */
+ u_int32_t heap_magic; /* Magic number. */
+ u_int32_t heap_version; /* Version number. */
+ u_int32_t heap_metaflags; /* Metadata flags. */
+ u_int32_t heap_nrecs; /* Number of records. */
+ u_int32_t heap_pagecnt; /* Page count. */
+ u_int32_t heap_pagesize; /* Page size. */
+ u_int32_t heap_nregions; /* Number of regions. */
+ u_int32_t heap_regionsize; /* Number of pages in a region. */
+};
+
+/* Queue statistics structure. */
+struct __db_qam_stat { /* SHARED */
+ u_int32_t qs_magic; /* Magic number. */
+ u_int32_t qs_version; /* Version number. */
+ u_int32_t qs_metaflags; /* Metadata flags. */
+ u_int32_t qs_nkeys; /* Number of unique keys. */
+ u_int32_t qs_ndata; /* Number of data items. */
+ u_int32_t qs_pagesize; /* Page size. */
+ u_int32_t qs_extentsize; /* Pages per extent. */
+ u_int32_t qs_pages; /* Data pages. */
+ u_int32_t qs_re_len; /* Fixed-length record length. */
+ u_int32_t qs_re_pad; /* Fixed-length record pad. */
+ u_int32_t qs_pgfree; /* Bytes free in data pages. */
+ u_int32_t qs_first_recno; /* First not deleted record. */
+ u_int32_t qs_cur_recno; /* Next available record number. */
+};
+
+/*******************************************************
+ * Environment.
+ *******************************************************/
+#define DB_REGION_MAGIC 0x120897 /* Environment magic number. */
+
+/*
+ * Database environment structure.
+ *
+ * This is the public database environment handle. The private environment
+ * handle is the ENV structure. The user owns this structure, the library
+ * owns the ENV structure. The reason there are two structures is because
+ * the user's configuration outlives any particular DB_ENV->open call, and
+ * separate structures allows us to easily discard internal information without
+ * discarding the user's configuration.
+ *
+ * Fields in the DB_ENV structure should normally be set only by application
+ * DB_ENV handle methods.
+ */
+
+/*
+ * Memory configuration types.
+ */
+typedef enum {
+ DB_MEM_LOCK=1,
+ DB_MEM_LOCKOBJECT=2,
+ DB_MEM_LOCKER=3,
+ DB_MEM_LOGID=4,
+ DB_MEM_TRANSACTION=5,
+ DB_MEM_THREAD=6
+} DB_MEM_CONFIG;
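+
+/*
+ * Editorial sketch (assumed usage, not in the original header): these
+ * values select which resource DB_ENV->set_memory_init() sizes before the
+ * environment is created, e.g.:
+ *
+ *	ret = dbenv->set_memory_init(dbenv, DB_MEM_LOCK, 10000);
+ */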
+
+/*
+ * Backup configuration types.
+ */
+typedef enum {
+ DB_BACKUP_READ_COUNT = 1,
+ DB_BACKUP_READ_SLEEP = 2,
+ DB_BACKUP_SIZE = 3,
+ DB_BACKUP_WRITE_DIRECT = 4
+} DB_BACKUP_CONFIG;
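+
+/*
+ * Editorial sketch (assumed usage, not in the original header): these
+ * values select which hot-backup tunable DB_ENV->set_backup_config() and
+ * DB_ENV->get_backup_config() operate on, e.g.:
+ *
+ *	ret = dbenv->set_backup_config(dbenv, DB_BACKUP_SIZE, 1024 * 1024);
+ */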
+
+struct __db_env {
+ ENV *env; /* Linked ENV structure */
+
+ /*
+ * The DB_ENV structure can be used concurrently, so field access is
+ * protected.
+ */
+ db_mutex_t mtx_db_env; /* DB_ENV structure mutex */
+
+ /* Error message callback */
+ void (*db_errcall) __P((const DB_ENV *, const char *, const char *));
+ FILE *db_errfile; /* Error message file stream */
+ const char *db_errpfx; /* Error message prefix */
+
+ /* Other message callback */
+ void (*db_msgcall) __P((const DB_ENV *, const char *));
+ FILE *db_msgfile; /* Other message file stream */
+
+ /* Other application callback functions */
+ int (*app_dispatch) __P((DB_ENV *, DBT *, DB_LSN *, db_recops));
+ void (*db_event_func) __P((DB_ENV *, u_int32_t, void *));
+ void (*db_feedback) __P((DB_ENV *, int, int));
+ void (*db_free) __P((void *));
+ void (*db_paniccall) __P((DB_ENV *, int));
+ void *(*db_malloc) __P((size_t));
+ void *(*db_realloc) __P((void *, size_t));
+ int (*is_alive) __P((DB_ENV *, pid_t, db_threadid_t, u_int32_t));
+ void (*thread_id) __P((DB_ENV *, pid_t *, db_threadid_t *));
+ char *(*thread_id_string) __P((DB_ENV *, pid_t, db_threadid_t, char *));
+
+ /* Application specified paths */
+ char *db_log_dir; /* Database log file directory */
+ char *db_md_dir; /* Persistent metadata directory */
+ char *db_tmp_dir; /* Database tmp file directory */
+
+ char *db_create_dir; /* Create directory for data files */
+ char **db_data_dir; /* Database data file directories */
+ int data_cnt; /* Database data file slots */
+ int data_next; /* Next database data file slot */
+
+ char *intermediate_dir_mode; /* Intermediate directory perms */
+
+ long shm_key; /* shmget key */
+
+ char *passwd; /* Cryptography support */
+ size_t passwd_len;
+
+ /* Private handle references */
+ void *app_private; /* Application-private handle */
+ void *api1_internal; /* C++, Perl API private */
+ void *api2_internal; /* Java API private */
+
+ u_int32_t verbose; /* DB_VERB_XXX flags */
+
+ /* Mutex configuration */
+ u_int32_t mutex_align; /* Mutex alignment */
+ u_int32_t mutex_cnt; /* Number of mutexes to configure */
+ u_int32_t mutex_inc; /* Number of mutexes to add */
+ u_int32_t mutex_max; /* Max number of mutexes */
+ u_int32_t mutex_tas_spins;/* Test-and-set spin count */
+
+ /* Locking configuration */
+ u_int8_t *lk_conflicts; /* Two dimensional conflict matrix */
+ int lk_modes; /* Number of lock modes in table */
+ u_int32_t lk_detect; /* Deadlock detect on all conflicts */
+ u_int32_t lk_max; /* Maximum number of locks */
+ u_int32_t lk_max_lockers;/* Maximum number of lockers */
+ u_int32_t lk_max_objects;/* Maximum number of locked objects */
+ u_int32_t lk_init; /* Initial number of locks */
+ u_int32_t lk_init_lockers;/* Initial number of lockers */
+ u_int32_t lk_init_objects;/* Initial number of locked objects */
+ u_int32_t lk_partitions;/* Number of object partitions */
+ db_timeout_t lk_timeout; /* Lock timeout period */
+ /* Used during initialization */
+ u_int32_t locker_t_size; /* Locker hash table size. */
+ u_int32_t object_t_size; /* Object hash table size. */
+
+ /* Logging configuration */
+ u_int32_t lg_bsize; /* Buffer size */
+ u_int32_t lg_fileid_init; /* Initial allocation for fname structs */
+ int lg_filemode; /* Log file permission mode */
+ u_int32_t lg_regionmax; /* Region size */
+ u_int32_t lg_size; /* Log file size */
+ u_int32_t lg_flags; /* Log configuration */
+
+ /* Memory pool configuration */
+ u_int32_t mp_gbytes; /* Cache size: GB */
+ u_int32_t mp_bytes; /* Cache size: bytes */
+ u_int32_t mp_max_gbytes; /* Maximum cache size: GB */
+ u_int32_t mp_max_bytes; /* Maximum cache size: bytes */
+ size_t mp_mmapsize; /* Maximum file size for mmap */
+ int mp_maxopenfd; /* Maximum open file descriptors */
+ int mp_maxwrite; /* Maximum buffers to write */
+ u_int mp_ncache; /* Initial number of cache regions */
+ u_int32_t mp_pagesize; /* Average page size */
+ u_int32_t mp_tablesize; /* Approximate hash table size */
+ u_int32_t mp_mtxcount; /* Number of mutexes */
+ /* Sleep after writing max buffers */
+ db_timeout_t mp_maxwrite_sleep;
+
+ /* Transaction configuration */
+ u_int32_t tx_init; /* Initial number of transactions */
+ u_int32_t tx_max; /* Maximum number of transactions */
+ time_t tx_timestamp; /* Recover to specific timestamp */
+ db_timeout_t tx_timeout; /* Timeout for transactions */
+
+ /* Thread tracking configuration */
+ u_int32_t thr_init; /* Thread count */
+ u_int32_t thr_max; /* Thread max */
+ roff_t memory_max; /* Maximum region memory */
+
+ /*
+ * The following fields are not strictly user-owned, but they outlive
+ * the ENV structure, and so are stored here.
+ */
+ DB_FH *registry; /* DB_REGISTER file handle */
+ u_int32_t registry_off; /*
+ * Offset of our slot. We can't use
+ * off_t because its size depends on
+ * build settings.
+ */
+ db_timeout_t envreg_timeout; /* DB_REGISTER wait timeout */
+
+#define DB_ENV_AUTO_COMMIT 0x00000001 /* DB_AUTO_COMMIT */
+#define DB_ENV_CDB_ALLDB 0x00000002 /* CDB environment wide locking */
+#define DB_ENV_FAILCHK 0x00000004 /* Failchk is running */
+#define DB_ENV_DIRECT_DB 0x00000008 /* DB_DIRECT_DB set */
+#define DB_ENV_DSYNC_DB 0x00000010 /* DB_DSYNC_DB set */
+#define DB_ENV_DATABASE_LOCKING 0x00000020 /* Try database-level locking */
+#define DB_ENV_MULTIVERSION 0x00000040 /* DB_MULTIVERSION set */
+#define DB_ENV_NOLOCKING 0x00000080 /* DB_NOLOCKING set */
+#define DB_ENV_NOMMAP 0x00000100 /* DB_NOMMAP set */
+#define DB_ENV_NOPANIC 0x00000200 /* Okay if panic set */
+#define DB_ENV_OVERWRITE 0x00000400 /* DB_OVERWRITE set */
+#define DB_ENV_REGION_INIT 0x00000800 /* DB_REGION_INIT set */
+#define DB_ENV_TIME_NOTGRANTED 0x00001000 /* DB_TIME_NOTGRANTED set */
+#define DB_ENV_TXN_NOSYNC 0x00002000 /* DB_TXN_NOSYNC set */
+#define DB_ENV_TXN_NOWAIT 0x00004000 /* DB_TXN_NOWAIT set */
+#define DB_ENV_TXN_SNAPSHOT 0x00008000 /* DB_TXN_SNAPSHOT set */
+#define DB_ENV_TXN_WRITE_NOSYNC 0x00010000 /* DB_TXN_WRITE_NOSYNC set */
+#define DB_ENV_YIELDCPU 0x00020000 /* DB_YIELDCPU set */
+#define DB_ENV_HOTBACKUP 0x00040000 /* DB_HOTBACKUP_IN_PROGRESS set */
+#define DB_ENV_NOFLUSH 0x00080000 /* DB_NOFLUSH set */
+ u_int32_t flags;
+
+ /* DB_ENV PUBLIC HANDLE LIST BEGIN */
+ int (*add_data_dir) __P((DB_ENV *, const char *));
+ int (*backup) __P((DB_ENV *, const char *, u_int32_t));
+ int (*cdsgroup_begin) __P((DB_ENV *, DB_TXN **));
+ int (*close) __P((DB_ENV *, u_int32_t));
+ int (*dbbackup) __P((DB_ENV *, const char *, const char *, u_int32_t));
+ int (*dbremove) __P((DB_ENV *,
+ DB_TXN *, const char *, const char *, u_int32_t));
+ int (*dbrename) __P((DB_ENV *,
+ DB_TXN *, const char *, const char *, const char *, u_int32_t));
+ void (*err) __P((const DB_ENV *, int, const char *, ...));
+ void (*errx) __P((const DB_ENV *, const char *, ...));
+ int (*failchk) __P((DB_ENV *, u_int32_t));
+ int (*fileid_reset) __P((DB_ENV *, const char *, u_int32_t));
+ int (*get_alloc) __P((DB_ENV *, void *(**)(size_t),
+ void *(**)(void *, size_t), void (**)(void *)));
+ int (*get_app_dispatch)
+ __P((DB_ENV *, int (**)(DB_ENV *, DBT *, DB_LSN *, db_recops)));
+ int (*get_cache_max) __P((DB_ENV *, u_int32_t *, u_int32_t *));
+ int (*get_cachesize) __P((DB_ENV *, u_int32_t *, u_int32_t *, int *));
+ int (*get_create_dir) __P((DB_ENV *, const char **));
+ int (*get_data_dirs) __P((DB_ENV *, const char ***));
+ int (*get_data_len) __P((DB_ENV *, u_int32_t *));
+ int (*get_backup_callbacks) __P((DB_ENV *,
+ int (**)(DB_ENV *, const char *, const char *, void **),
+ int (**)(DB_ENV *, u_int32_t, u_int32_t, u_int32_t, u_int8_t *, void *),
+ int (**)(DB_ENV *, const char *, void *)));
+ int (*get_backup_config) __P((DB_ENV *, DB_BACKUP_CONFIG, u_int32_t *));
+ int (*get_encrypt_flags) __P((DB_ENV *, u_int32_t *));
+ void (*get_errcall) __P((DB_ENV *,
+ void (**)(const DB_ENV *, const char *, const char *)));
+ void (*get_errfile) __P((DB_ENV *, FILE **));
+ void (*get_errpfx) __P((DB_ENV *, const char **));
+ int (*get_flags) __P((DB_ENV *, u_int32_t *));
+ int (*get_feedback) __P((DB_ENV *, void (**)(DB_ENV *, int, int)));
+ int (*get_home) __P((DB_ENV *, const char **));
+ int (*get_intermediate_dir_mode) __P((DB_ENV *, const char **));
+ int (*get_isalive) __P((DB_ENV *,
+ int (**)(DB_ENV *, pid_t, db_threadid_t, u_int32_t)));
+ int (*get_lg_bsize) __P((DB_ENV *, u_int32_t *));
+ int (*get_lg_dir) __P((DB_ENV *, const char **));
+ int (*get_lg_filemode) __P((DB_ENV *, int *));
+ int (*get_lg_max) __P((DB_ENV *, u_int32_t *));
+ int (*get_lg_regionmax) __P((DB_ENV *, u_int32_t *));
+ int (*get_lk_conflicts) __P((DB_ENV *, const u_int8_t **, int *));
+ int (*get_lk_detect) __P((DB_ENV *, u_int32_t *));
+ int (*get_lk_max_lockers) __P((DB_ENV *, u_int32_t *));
+ int (*get_lk_max_locks) __P((DB_ENV *, u_int32_t *));
+ int (*get_lk_max_objects) __P((DB_ENV *, u_int32_t *));
+ int (*get_lk_partitions) __P((DB_ENV *, u_int32_t *));
+ int (*get_lk_priority) __P((DB_ENV *, u_int32_t, u_int32_t *));
+ int (*get_lk_tablesize) __P((DB_ENV *, u_int32_t *));
+ int (*get_memory_init) __P((DB_ENV *, DB_MEM_CONFIG, u_int32_t *));
+ int (*get_memory_max) __P((DB_ENV *, u_int32_t *, u_int32_t *));
+ int (*get_metadata_dir) __P((DB_ENV *, const char **));
+ int (*get_mp_max_openfd) __P((DB_ENV *, int *));
+ int (*get_mp_max_write) __P((DB_ENV *, int *, db_timeout_t *));
+ int (*get_mp_mmapsize) __P((DB_ENV *, size_t *));
+ int (*get_mp_mtxcount) __P((DB_ENV *, u_int32_t *));
+ int (*get_mp_pagesize) __P((DB_ENV *, u_int32_t *));
+ int (*get_mp_tablesize) __P((DB_ENV *, u_int32_t *));
+ void (*get_msgcall)
+ __P((DB_ENV *, void (**)(const DB_ENV *, const char *)));
+ void (*get_msgfile) __P((DB_ENV *, FILE **));
+ int (*get_open_flags) __P((DB_ENV *, u_int32_t *));
+ int (*get_shm_key) __P((DB_ENV *, long *));
+ int (*get_thread_count) __P((DB_ENV *, u_int32_t *));
+ int (*get_thread_id_fn)
+ __P((DB_ENV *, void (**)(DB_ENV *, pid_t *, db_threadid_t *)));
+ int (*get_thread_id_string_fn) __P((DB_ENV *,
+ char *(**)(DB_ENV *, pid_t, db_threadid_t, char *)));
+ int (*get_timeout) __P((DB_ENV *, db_timeout_t *, u_int32_t));
+ int (*get_tmp_dir) __P((DB_ENV *, const char **));
+ int (*get_tx_max) __P((DB_ENV *, u_int32_t *));
+ int (*get_tx_timestamp) __P((DB_ENV *, time_t *));
+ int (*get_verbose) __P((DB_ENV *, u_int32_t, int *));
+ int (*is_bigendian) __P((void));
+ int (*lock_detect) __P((DB_ENV *, u_int32_t, u_int32_t, int *));
+ int (*lock_get) __P((DB_ENV *,
+ u_int32_t, u_int32_t, DBT *, db_lockmode_t, DB_LOCK *));
+ int (*lock_id) __P((DB_ENV *, u_int32_t *));
+ int (*lock_id_free) __P((DB_ENV *, u_int32_t));
+ int (*lock_put) __P((DB_ENV *, DB_LOCK *));
+ int (*lock_stat) __P((DB_ENV *, DB_LOCK_STAT **, u_int32_t));
+ int (*lock_stat_print) __P((DB_ENV *, u_int32_t));
+ int (*lock_vec) __P((DB_ENV *,
+ u_int32_t, u_int32_t, DB_LOCKREQ *, int, DB_LOCKREQ **));
+ int (*log_archive) __P((DB_ENV *, char **[], u_int32_t));
+ int (*log_cursor) __P((DB_ENV *, DB_LOGC **, u_int32_t));
+ int (*log_file) __P((DB_ENV *, const DB_LSN *, char *, size_t));
+ int (*log_flush) __P((DB_ENV *, const DB_LSN *));
+ int (*log_get_config) __P((DB_ENV *, u_int32_t, int *));
+ int (*log_printf) __P((DB_ENV *, DB_TXN *, const char *, ...));
+ int (*log_put) __P((DB_ENV *, DB_LSN *, const DBT *, u_int32_t));
+ int (*log_put_record) __P((DB_ENV *, DB *, DB_TXN *, DB_LSN *,
+ u_int32_t, u_int32_t, u_int32_t, u_int32_t,
+ DB_LOG_RECSPEC *, ...));
+ int (*log_read_record) __P((DB_ENV *, DB **,
+ void *, void *, DB_LOG_RECSPEC *, u_int32_t, void **));
+ int (*log_set_config) __P((DB_ENV *, u_int32_t, int));
+ int (*log_stat) __P((DB_ENV *, DB_LOG_STAT **, u_int32_t));
+ int (*log_stat_print) __P((DB_ENV *, u_int32_t));
+ int (*log_verify) __P((DB_ENV *, const DB_LOG_VERIFY_CONFIG *));
+ int (*lsn_reset) __P((DB_ENV *, const char *, u_int32_t));
+ int (*memp_fcreate) __P((DB_ENV *, DB_MPOOLFILE **, u_int32_t));
+ int (*memp_register) __P((DB_ENV *, int, int (*)(DB_ENV *, db_pgno_t,
+ void *, DBT *), int (*)(DB_ENV *, db_pgno_t, void *, DBT *)));
+ int (*memp_stat) __P((DB_ENV *,
+ DB_MPOOL_STAT **, DB_MPOOL_FSTAT ***, u_int32_t));
+ int (*memp_stat_print) __P((DB_ENV *, u_int32_t));
+ int (*memp_sync) __P((DB_ENV *, DB_LSN *));
+ int (*memp_trickle) __P((DB_ENV *, int, int *));
+ int (*mutex_alloc) __P((DB_ENV *, u_int32_t, db_mutex_t *));
+ int (*mutex_free) __P((DB_ENV *, db_mutex_t));
+ int (*mutex_get_align) __P((DB_ENV *, u_int32_t *));
+ int (*mutex_get_increment) __P((DB_ENV *, u_int32_t *));
+ int (*mutex_get_init) __P((DB_ENV *, u_int32_t *));
+ int (*mutex_get_max) __P((DB_ENV *, u_int32_t *));
+ int (*mutex_get_tas_spins) __P((DB_ENV *, u_int32_t *));
+ int (*mutex_lock) __P((DB_ENV *, db_mutex_t));
+ int (*mutex_set_align) __P((DB_ENV *, u_int32_t));
+ int (*mutex_set_increment) __P((DB_ENV *, u_int32_t));
+ int (*mutex_set_init) __P((DB_ENV *, u_int32_t));
+ int (*mutex_set_max) __P((DB_ENV *, u_int32_t));
+ int (*mutex_set_tas_spins) __P((DB_ENV *, u_int32_t));
+ int (*mutex_stat) __P((DB_ENV *, DB_MUTEX_STAT **, u_int32_t));
+ int (*mutex_stat_print) __P((DB_ENV *, u_int32_t));
+ int (*mutex_unlock) __P((DB_ENV *, db_mutex_t));
+ int (*open) __P((DB_ENV *, const char *, u_int32_t, int));
+ int (*remove) __P((DB_ENV *, const char *, u_int32_t));
+ int (*rep_elect) __P((DB_ENV *, u_int32_t, u_int32_t, u_int32_t));
+ int (*rep_flush) __P((DB_ENV *));
+ int (*rep_get_clockskew) __P((DB_ENV *, u_int32_t *, u_int32_t *));
+ int (*rep_get_config) __P((DB_ENV *, u_int32_t, int *));
+ int (*rep_get_limit) __P((DB_ENV *, u_int32_t *, u_int32_t *));
+ int (*rep_get_nsites) __P((DB_ENV *, u_int32_t *));
+ int (*rep_get_priority) __P((DB_ENV *, u_int32_t *));
+ int (*rep_get_request) __P((DB_ENV *, u_int32_t *, u_int32_t *));
+ int (*rep_get_timeout) __P((DB_ENV *, int, u_int32_t *));
+ int (*rep_process_message)
+ __P((DB_ENV *, DBT *, DBT *, int, DB_LSN *));
+ int (*rep_set_clockskew) __P((DB_ENV *, u_int32_t, u_int32_t));
+ int (*rep_set_config) __P((DB_ENV *, u_int32_t, int));
+ int (*rep_set_limit) __P((DB_ENV *, u_int32_t, u_int32_t));
+ int (*rep_set_nsites) __P((DB_ENV *, u_int32_t));
+ int (*rep_set_priority) __P((DB_ENV *, u_int32_t));
+ int (*rep_set_request) __P((DB_ENV *, u_int32_t, u_int32_t));
+ int (*rep_set_timeout) __P((DB_ENV *, int, db_timeout_t));
+ int (*rep_set_transport) __P((DB_ENV *, int, int (*)(DB_ENV *,
+ const DBT *, const DBT *, const DB_LSN *, int, u_int32_t)));
+ int (*rep_start) __P((DB_ENV *, DBT *, u_int32_t));
+ int (*rep_stat) __P((DB_ENV *, DB_REP_STAT **, u_int32_t));
+ int (*rep_stat_print) __P((DB_ENV *, u_int32_t));
+ int (*rep_sync) __P((DB_ENV *, u_int32_t));
+ int (*repmgr_channel) __P((DB_ENV *, int, DB_CHANNEL **, u_int32_t));
+ int (*repmgr_get_ack_policy) __P((DB_ENV *, int *));
+ int (*repmgr_local_site) __P((DB_ENV *, DB_SITE **));
+ int (*repmgr_msg_dispatch) __P((DB_ENV *,
+ void (*)(DB_ENV *, DB_CHANNEL *, DBT *, u_int32_t, u_int32_t),
+ u_int32_t));
+ int (*repmgr_set_ack_policy) __P((DB_ENV *, int));
+ int (*repmgr_site)
+ __P((DB_ENV *, const char *, u_int, DB_SITE**, u_int32_t));
+ int (*repmgr_site_by_eid) __P((DB_ENV *, int, DB_SITE**));
+ int (*repmgr_site_list) __P((DB_ENV *, u_int *, DB_REPMGR_SITE **));
+ int (*repmgr_start) __P((DB_ENV *, int, u_int32_t));
+ int (*repmgr_stat) __P((DB_ENV *, DB_REPMGR_STAT **, u_int32_t));
+ int (*repmgr_stat_print) __P((DB_ENV *, u_int32_t));
+ int (*set_alloc) __P((DB_ENV *, void *(*)(size_t),
+ void *(*)(void *, size_t), void (*)(void *)));
+ int (*set_app_dispatch)
+ __P((DB_ENV *, int (*)(DB_ENV *, DBT *, DB_LSN *, db_recops)));
+ int (*set_cache_max) __P((DB_ENV *, u_int32_t, u_int32_t));
+ int (*set_cachesize) __P((DB_ENV *, u_int32_t, u_int32_t, int));
+ int (*set_create_dir) __P((DB_ENV *, const char *));
+ int (*set_data_dir) __P((DB_ENV *, const char *));
+ int (*set_data_len) __P((DB_ENV *, u_int32_t));
+ int (*set_backup_callbacks) __P((DB_ENV *,
+ int (*)(DB_ENV *, const char *, const char *, void **),
+ int (*)(DB_ENV *, u_int32_t,
+ u_int32_t, u_int32_t, u_int8_t *, void *),
+ int (*)(DB_ENV *, const char *, void *)));
+ int (*set_backup_config) __P((DB_ENV *, DB_BACKUP_CONFIG, u_int32_t));
+ int (*set_encrypt) __P((DB_ENV *, const char *, u_int32_t));
+ void (*set_errcall) __P((DB_ENV *,
+ void (*)(const DB_ENV *, const char *, const char *)));
+ void (*set_errfile) __P((DB_ENV *, FILE *));
+ void (*set_errpfx) __P((DB_ENV *, const char *));
+ int (*set_event_notify)
+ __P((DB_ENV *, void (*)(DB_ENV *, u_int32_t, void *)));
+ int (*set_feedback) __P((DB_ENV *, void (*)(DB_ENV *, int, int)));
+ int (*set_flags) __P((DB_ENV *, u_int32_t, int));
+ int (*set_intermediate_dir_mode) __P((DB_ENV *, const char *));
+ int (*set_isalive) __P((DB_ENV *,
+ int (*)(DB_ENV *, pid_t, db_threadid_t, u_int32_t)));
+ int (*set_lg_bsize) __P((DB_ENV *, u_int32_t));
+ int (*set_lg_dir) __P((DB_ENV *, const char *));
+ int (*set_lg_filemode) __P((DB_ENV *, int));
+ int (*set_lg_max) __P((DB_ENV *, u_int32_t));
+ int (*set_lg_regionmax) __P((DB_ENV *, u_int32_t));
+ int (*set_lk_conflicts) __P((DB_ENV *, u_int8_t *, int));
+ int (*set_lk_detect) __P((DB_ENV *, u_int32_t));
+ int (*set_lk_max_lockers) __P((DB_ENV *, u_int32_t));
+ int (*set_lk_max_locks) __P((DB_ENV *, u_int32_t));
+ int (*set_lk_max_objects) __P((DB_ENV *, u_int32_t));
+ int (*set_lk_partitions) __P((DB_ENV *, u_int32_t));
+ int (*set_lk_priority) __P((DB_ENV *, u_int32_t, u_int32_t));
+ int (*set_lk_tablesize) __P((DB_ENV *, u_int32_t));
+ int (*set_memory_init) __P((DB_ENV *, DB_MEM_CONFIG, u_int32_t));
+ int (*set_memory_max) __P((DB_ENV *, u_int32_t, u_int32_t));
+ int (*set_metadata_dir) __P((DB_ENV *, const char *));
+ int (*set_mp_max_openfd) __P((DB_ENV *, int));
+ int (*set_mp_max_write) __P((DB_ENV *, int, db_timeout_t));
+ int (*set_mp_mmapsize) __P((DB_ENV *, size_t));
+ int (*set_mp_mtxcount) __P((DB_ENV *, u_int32_t));
+ int (*set_mp_pagesize) __P((DB_ENV *, u_int32_t));
+ int (*set_mp_tablesize) __P((DB_ENV *, u_int32_t));
+ void (*set_msgcall)
+ __P((DB_ENV *, void (*)(const DB_ENV *, const char *)));
+ void (*set_msgfile) __P((DB_ENV *, FILE *));
+ int (*set_paniccall) __P((DB_ENV *, void (*)(DB_ENV *, int)));
+ int (*set_shm_key) __P((DB_ENV *, long));
+ int (*set_thread_count) __P((DB_ENV *, u_int32_t));
+ int (*set_thread_id)
+ __P((DB_ENV *, void (*)(DB_ENV *, pid_t *, db_threadid_t *)));
+ int (*set_thread_id_string) __P((DB_ENV *,
+ char *(*)(DB_ENV *, pid_t, db_threadid_t, char *)));
+ int (*set_timeout) __P((DB_ENV *, db_timeout_t, u_int32_t));
+ int (*set_tmp_dir) __P((DB_ENV *, const char *));
+ int (*set_tx_max) __P((DB_ENV *, u_int32_t));
+ int (*set_tx_timestamp) __P((DB_ENV *, time_t *));
+ int (*set_verbose) __P((DB_ENV *, u_int32_t, int));
+ int (*txn_applied) __P((DB_ENV *,
+ DB_TXN_TOKEN *, db_timeout_t, u_int32_t));
+ int (*stat_print) __P((DB_ENV *, u_int32_t));
+ int (*txn_begin) __P((DB_ENV *, DB_TXN *, DB_TXN **, u_int32_t));
+ int (*txn_checkpoint) __P((DB_ENV *, u_int32_t, u_int32_t, u_int32_t));
+ int (*txn_recover) __P((DB_ENV *,
+ DB_PREPLIST *, long, long *, u_int32_t));
+ int (*txn_stat) __P((DB_ENV *, DB_TXN_STAT **, u_int32_t));
+ int (*txn_stat_print) __P((DB_ENV *, u_int32_t));
+ /* DB_ENV PUBLIC HANDLE LIST END */
+
+ /* DB_ENV PRIVATE HANDLE LIST BEGIN */
+ int (*prdbt) __P((DBT *, int,
+ const char *, void *, int (*)(void *, const void *), int, int));
+ /* DB_ENV PRIVATE HANDLE LIST END */
+};
+
+/*
+ * Dispatch structure for recovery, log verification and print routines. Since
+ * internal and external routines take different arguments (ENV versus DB_ENV),
+ * we need something more elaborate than a single pointer and size.
+ */
+struct __db_distab {
+ int (**int_dispatch) __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ size_t int_size;
+ int (**ext_dispatch) __P((DB_ENV *, DBT *, DB_LSN *, db_recops));
+ size_t ext_size;
+};
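+
+/*
+ * A lookup sketch (illustrative only, not code from the library): both
+ * tables are indexed by log record type, so a dispatcher holding a
+ * struct __db_distab might do something like
+ *
+ *    if (rectype < dtab.int_size && dtab.int_dispatch[rectype] != NULL)
+ *        ret = dtab.int_dispatch[rectype](env, dbtp, lsnp, op, info);
+ */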
+
+/*
+ * Log verification configuration structure.
+ */
+struct __db_logvrfy_config {
+ int continue_after_fail, verbose;
+ u_int32_t cachesize;
+ const char *temp_envhome;
+ const char *dbfile, *dbname;
+ DB_LSN start_lsn, end_lsn;
+ time_t start_time, end_time;
+};
+
+struct __db_channel {
+ CHANNEL *channel; /* Pointer to internal state details. */
+ int eid; /* Env. ID passed in constructor. */
+ db_timeout_t timeout;
+
+ /* DB_CHANNEL PUBLIC HANDLE LIST BEGIN */
+ int (*close) __P((DB_CHANNEL *, u_int32_t));
+ int (*send_msg) __P((DB_CHANNEL *, DBT *, u_int32_t, u_int32_t));
+ int (*send_request) __P((DB_CHANNEL *,
+ DBT *, u_int32_t, DBT *, db_timeout_t, u_int32_t));
+ int (*set_timeout) __P((DB_CHANNEL *, db_timeout_t));
+ /* DB_CHANNEL PUBLIC HANDLE LIST END */
+};
+
+struct __db_site {
+ ENV *env;
+ int eid;
+ const char *host;
+ u_int port;
+ u_int32_t flags;
+
+ /* DB_SITE PUBLIC HANDLE LIST BEGIN */
+ int (*get_address) __P((DB_SITE *, const char **, u_int *));
+ int (*get_config) __P((DB_SITE *, u_int32_t, u_int32_t *));
+ int (*get_eid) __P((DB_SITE *, int *));
+ int (*set_config) __P((DB_SITE *, u_int32_t, u_int32_t));
+ int (*remove) __P((DB_SITE *));
+ int (*close) __P((DB_SITE *));
+ /* DB_SITE PUBLIC HANDLE LIST END */
+};
+
+#if DB_DBM_HSEARCH != 0
+/*******************************************************
+ * Dbm/Ndbm historic interfaces.
+ *******************************************************/
+typedef struct __db DBM;
+
+#define DBM_INSERT 0 /* Flags to dbm_store(). */
+#define DBM_REPLACE 1
+
+/*
+ * The DB support for ndbm(3) always appends this suffix to the
+ * file name to avoid overwriting the user's original database.
+ */
+#define DBM_SUFFIX ".db"
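+
+/*
+ * For example, a dbm_open of "data" operates on the file "data.db".
+ */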
+
+#if defined(_XPG4_2)
+typedef struct {
+ char *dptr;
+ size_t dsize;
+} datum;
+#else
+typedef struct {
+ char *dptr;
+ int dsize;
+} datum;
+#endif
+
+/*
+ * Translate NDBM calls into DB calls so that DB doesn't step on the
+ * application's name space.
+ */
+#define dbm_clearerr(a) __db_ndbm_clearerr@DB_VERSION_UNIQUE_NAME@(a)
+#define dbm_close(a) __db_ndbm_close@DB_VERSION_UNIQUE_NAME@(a)
+#define dbm_delete(a, b) __db_ndbm_delete@DB_VERSION_UNIQUE_NAME@(a, b)
+#define dbm_dirfno(a) __db_ndbm_dirfno@DB_VERSION_UNIQUE_NAME@(a)
+#define dbm_error(a) __db_ndbm_error@DB_VERSION_UNIQUE_NAME@(a)
+#define dbm_fetch(a, b) __db_ndbm_fetch@DB_VERSION_UNIQUE_NAME@(a, b)
+#define dbm_firstkey(a) __db_ndbm_firstkey@DB_VERSION_UNIQUE_NAME@(a)
+#define dbm_nextkey(a) __db_ndbm_nextkey@DB_VERSION_UNIQUE_NAME@(a)
+#define dbm_open(a, b, c) __db_ndbm_open@DB_VERSION_UNIQUE_NAME@(a, b, c)
+#define dbm_pagfno(a) __db_ndbm_pagfno@DB_VERSION_UNIQUE_NAME@(a)
+#define dbm_rdonly(a) __db_ndbm_rdonly@DB_VERSION_UNIQUE_NAME@(a)
+#define dbm_store(a, b, c, d) \
+ __db_ndbm_store@DB_VERSION_UNIQUE_NAME@(a, b, c, d)
+
+/*
+ * Translate DBM calls into DB calls so that DB doesn't step on the
+ * application's name space.
+ *
+ * The global variables dbrdonly, dirf and pagf were not retained when 4BSD
+ * replaced the dbm interface with ndbm, and are not supported here.
+ */
+#define dbminit(a) __db_dbm_init@DB_VERSION_UNIQUE_NAME@(a)
+#define dbmclose __db_dbm_close@DB_VERSION_UNIQUE_NAME@
+#if !defined(__cplusplus)
+#define delete(a) __db_dbm_delete@DB_VERSION_UNIQUE_NAME@(a)
+#endif
+#define fetch(a) __db_dbm_fetch@DB_VERSION_UNIQUE_NAME@(a)
+#define firstkey __db_dbm_firstkey@DB_VERSION_UNIQUE_NAME@
+#define nextkey(a) __db_dbm_nextkey@DB_VERSION_UNIQUE_NAME@(a)
+#define store(a, b) __db_dbm_store@DB_VERSION_UNIQUE_NAME@(a, b)
+
+/*******************************************************
+ * Hsearch historic interface.
+ *******************************************************/
+typedef enum {
+ FIND, ENTER
+} ACTION;
+
+typedef struct entry {
+ char *key;
+ char *data;
+} ENTRY;
+
+#define hcreate(a) __db_hcreate@DB_VERSION_UNIQUE_NAME@(a)
+#define hdestroy __db_hdestroy@DB_VERSION_UNIQUE_NAME@
+#define hsearch(a, b) __db_hsearch@DB_VERSION_UNIQUE_NAME@(a, b)
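+
+/*
+ * Usage sketch (illustrative): these calls follow the classic hsearch(3)
+ * interface, routed to DB's implementations by the macros above.
+ *
+ *    ENTRY item, *found;
+ *
+ *    (void)hcreate(128);
+ *    item.key = "fruit";
+ *    item.data = "apple";
+ *    (void)hsearch(item, ENTER);
+ *    found = hsearch(item, FIND);
+ *    hdestroy();
+ */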
+
+#endif /* DB_DBM_HSEARCH */
+
+#if defined(__cplusplus)
+}
+#endif
+
+@platform_footer@
+#endif /* !_DB_H_ */
diff --git a/src/dbinc/db_185.in b/src/dbinc/db_185.in
new file mode 100644
index 00000000..43735344
--- /dev/null
+++ b/src/dbinc/db_185.in
@@ -0,0 +1,176 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_185_H_
+#define _DB_185_H_
+
+#include <sys/types.h>
+
+#include <limits.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * XXX
+ * Handle function prototypes and the keyword "const". This steps on name
+ * space that DB doesn't control, but all of the other solutions are worse.
+ */
+#undef __P
+#if defined(__STDC__) || defined(__cplusplus)
+#define __P(protos) protos /* ANSI C prototypes */
+#else
+#define const
+#define __P(protos) () /* K&R C preprocessor */
+#endif
+
+#define RET_ERROR -1 /* Return values. */
+#define RET_SUCCESS 0
+#define RET_SPECIAL 1
+
+#ifndef __BIT_TYPES_DEFINED__
+#define __BIT_TYPES_DEFINED__
+@u_int8_decl@
+@int16_decl@
+@u_int16_decl@
+@int32_decl@
+@u_int32_decl@
+#endif
+
+/*
+ * XXX
+ * SGI/IRIX already has a pgno_t.
+ */
+#ifdef __sgi
+#define pgno_t db_pgno_t
+#endif
+
+#define MAX_PAGE_NUMBER 0xffffffff /* >= # of pages in a file */
+typedef u_int32_t pgno_t;
+#define MAX_PAGE_OFFSET 65535 /* >= # of bytes in a page */
+typedef u_int16_t indx_t;
+#define MAX_REC_NUMBER 0xffffffff /* >= # of records in a tree */
+typedef u_int32_t recno_t;
+
+/* Key/data structure -- a Data-Base Thang. */
+typedef struct {
+ void *data; /* data */
+ size_t size; /* data length */
+} DBT;
+
+/* Routine flags. */
+#define R_CURSOR 1 /* del, put, seq */
+#define __R_UNUSED 2 /* UNUSED */
+#define R_FIRST 3 /* seq */
+#define R_IAFTER 4 /* put (RECNO) */
+#define R_IBEFORE 5 /* put (RECNO) */
+#define R_LAST 6 /* seq (BTREE, RECNO) */
+#define R_NEXT 7 /* seq */
+#define R_NOOVERWRITE 8 /* put */
+#define R_PREV 9 /* seq (BTREE, RECNO) */
+#define R_SETCURSOR 10 /* put (RECNO) */
+#define R_RECNOSYNC 11 /* sync (RECNO) */
+
+typedef enum { DB_BTREE, DB_HASH, DB_RECNO } DBTYPE;
+
+/* Access method description structure. */
+typedef struct __db {
+ DBTYPE type; /* Underlying db type. */
+ int (*close) __P((struct __db *));
+ int (*del) __P((const struct __db *, const DBT *, u_int));
+ int (*get) __P((const struct __db *, const DBT *, DBT *, u_int));
+ int (*put) __P((const struct __db *, DBT *, const DBT *, u_int));
+ int (*seq) __P((const struct __db *, DBT *, DBT *, u_int));
+ int (*sync) __P((const struct __db *, u_int));
+ void *internal; /* Access method private. */
+ int (*fd) __P((const struct __db *));
+} DB;
+
+#define BTREEMAGIC 0x053162
+#define BTREEVERSION 3
+
+/* Structure used to pass parameters to the btree routines. */
+typedef struct {
+#define R_DUP 0x01 /* duplicate keys */
+ u_int32_t flags;
+ u_int32_t cachesize; /* bytes to cache */
+ u_int32_t maxkeypage; /* maximum keys per page */
+ u_int32_t minkeypage; /* minimum keys per page */
+ u_int32_t psize; /* page size */
+ int (*compare) /* comparison function */
+ __P((const DBT *, const DBT *));
+ size_t (*prefix) /* prefix function */
+ __P((const DBT *, const DBT *));
+ int lorder; /* byte order */
+} BTREEINFO;
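+
+/*
+ * Usage sketch (illustrative; assumes <fcntl.h> for the open flags and
+ * <string.h> for memset):
+ *
+ *    BTREEINFO bti;
+ *    DB *dbp;
+ *
+ *    memset(&bti, 0, sizeof(bti));
+ *    bti.cachesize = 64 * 1024;
+ *    if ((dbp = dbopen("a.db",
+ *        O_CREAT | O_RDWR, 0644, DB_BTREE, &bti)) != NULL)
+ *        (void)dbp->close(dbp);
+ */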
+
+#define HASHMAGIC 0x061561
+#define HASHVERSION 2
+
+/* Structure used to pass parameters to the hashing routines. */
+typedef struct {
+ u_int32_t bsize; /* bucket size */
+ u_int32_t ffactor; /* fill factor */
+ u_int32_t nelem; /* number of elements */
+ u_int32_t cachesize; /* bytes to cache */
+ u_int32_t /* hash function */
+ (*hash) __P((const void *, size_t));
+ int lorder; /* byte order */
+} HASHINFO;
+
+/* Structure used to pass parameters to the record routines. */
+typedef struct {
+#define R_FIXEDLEN 0x01 /* fixed-length records */
+#define R_NOKEY 0x02 /* key not required */
+#define R_SNAPSHOT 0x04 /* snapshot the input */
+ u_int32_t flags;
+ u_int32_t cachesize; /* bytes to cache */
+ u_int32_t psize; /* page size */
+ int lorder; /* byte order */
+ size_t reclen; /* record length (fixed-length records) */
+ u_char bval; /* delimiting byte (variable-length records) */
+ char *bfname; /* btree file name */
+} RECNOINFO;
+
+/* Re-define the user's dbopen calls. */
+#define dbopen __db185_open@DB_VERSION_UNIQUE_NAME@
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* !_DB_185_H_ */
diff --git a/src/dbinc/db_am.h b/src/dbinc/db_am.h
new file mode 100644
index 00000000..f34578c4
--- /dev/null
+++ b/src/dbinc/db_am.h
@@ -0,0 +1,327 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+#ifndef _DB_AM_H_
+#define _DB_AM_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+struct __db_foreign_info;
+typedef struct __db_foreign_info DB_FOREIGN_INFO;
+
+/*
+ * Keep track of information for foreign keys. Used to maintain a linked list
+ * of 'primary' DBs which reference this 'foreign' DB.
+ */
+struct __db_foreign_info {
+ DB *dbp;
+ u_int32_t flags;
+ int (*callback) __P((DB *, const DBT *, DBT *, const DBT *, int *));
+
+ /*
+ * List entries for foreign key.
+ *
+ * !!!
+ * Explicit representations of structures from queue.h.
+ * LIST_ENTRY(__db) s_links;
+ */
+ struct {
+ struct __db_foreign_info *le_next;
+ struct __db_foreign_info **le_prev;
+ } f_links;
+};
+
+/*
+ * IS_ENV_AUTO_COMMIT --
+ * Auto-commit test for environment operations: DbEnv::{open,remove,rename}
+ */
+#define IS_ENV_AUTO_COMMIT(env, txn, flags) \
+ (LF_ISSET(DB_AUTO_COMMIT) || \
+ (((txn) == NULL || F_ISSET((txn), TXN_FAMILY)) && \
+ F_ISSET((env)->dbenv, DB_ENV_AUTO_COMMIT) && \
+ !LF_ISSET(DB_NO_AUTO_COMMIT)))
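+
+/*
+ * For example (illustrative): after
+ *
+ *    dbenv->set_flags(dbenv, DB_AUTO_COMMIT, 1);
+ *
+ * an environment operation invoked with a NULL txn satisfies this test,
+ * unless the caller also passed DB_NO_AUTO_COMMIT.
+ */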
+
+/*
+ * IS_DB_AUTO_COMMIT --
+ * Auto-commit test for database operations.
+ */
+#define IS_DB_AUTO_COMMIT(dbp, txn) \
+ (((txn) == NULL || F_ISSET((txn), TXN_FAMILY)) && \
+ F_ISSET((dbp), DB_AM_TXN))
+
+/*
+ * STRIP_AUTO_COMMIT --
+ * Releases after 4.3 no longer require DB operations to specify the
+ * DB_AUTO_COMMIT flag, but the API continues to allow it to be specified.
+ */
+#define STRIP_AUTO_COMMIT(f) FLD_CLR((f), DB_AUTO_COMMIT)
+
+/* DB recovery operation codes. */
+#define DB_ADD_DUP 1
+#define DB_REM_DUP 2
+#define DB_ADD_BIG 3
+#define DB_REM_BIG 4
+#define DB_ADD_PAGE_COMPAT 5 /* Compatibility for 4.2 db_relink */
+#define DB_REM_PAGE_COMPAT 6 /* Compatibility for 4.2 db_relink */
+#define DB_APPEND_BIG 7
+#define DB_ADD_HEAP 8
+#define DB_REM_HEAP 9
+
+#define OP_MODE_SHIFT 8
+#define OP_PAGE_MASK 0xff
+
+#define OP_SET(mode, page) (((mode) << OP_MODE_SHIFT) | (TYPE(page)))
+#define OP_MODE_GET(mode) ((mode) >> OP_MODE_SHIFT)
+#define OP_PAGE_GET(mode) ((mode) & OP_PAGE_MASK)
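+
+/*
+ * Worked example (assuming a page whose TYPE() is 5): OP_SET(DB_ADD_BIG,
+ * page) yields (3 << 8) | 5 == 0x0305; OP_MODE_GET(0x0305) recovers 3
+ * (DB_ADD_BIG) and OP_PAGE_GET(0x0305) recovers 5.
+ */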
+
+/*
+ * Standard initialization and shutdown macros for all recovery functions.
+ */
+#define REC_INTRO(func, ip, do_cursor) do { \
+ argp = NULL; \
+ dbc = NULL; \
+ file_dbp = NULL; \
+ COMPQUIET(mpf, NULL); /* Not all recovery routines use mpf. */\
+ if ((ret = func(env, &file_dbp, \
+ (info != NULL) ? ((DB_TXNHEAD *)info)->td : NULL, \
+ dbtp->data, &argp)) != 0) { \
+ if (ret == DB_DELETED) { \
+ ret = 0; \
+ goto done; \
+ } \
+ goto out; \
+ } \
+ if (do_cursor) { \
+ if ((ret = __db_cursor(file_dbp, \
+ ip, NULL, &dbc, DB_RECOVER)) != 0) \
+ goto out; \
+ } \
+ mpf = file_dbp->mpf; \
+} while (0)
+
+#define REC_CLOSE { \
+ int __t_ret; \
+ if (argp != NULL) \
+ __os_free(env, argp); \
+ if (dbc != NULL && \
+ (__t_ret = __dbc_close(dbc)) != 0 && ret == 0) \
+ ret = __t_ret; \
+ } \
+ return (ret)
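+
+/*
+ * A skeleton (illustrative; the __example_* names are hypothetical) showing
+ * how these macros assemble a recovery function:
+ *
+ *    int
+ *    __example_recover(env, dbtp, lsnp, op, info)
+ *        ENV *env;
+ *        DBT *dbtp;
+ *        DB_LSN *lsnp;
+ *        db_recops op;
+ *        void *info;
+ *    {
+ *        __example_args *argp;
+ *        DB *file_dbp;
+ *        DBC *dbc;
+ *        DB_MPOOLFILE *mpf;
+ *        int ret;
+ *
+ *        REC_PRINT(__example_print);
+ *        REC_INTRO(__example_read, NULL, 0);
+ *        ... compare *lsnp with the page LSN, redo or undo ...
+ *        *lsnp = argp->prev_lsn;
+ *        ret = 0;
+ *    done:
+ *    out:  REC_CLOSE;
+ *    }
+ */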
+
+/*
+ * No-op versions of the same macros.
+ */
+#define REC_NOOP_INTRO(func) do { \
+ argp = NULL; \
+ if ((ret = func(env, dbtp->data, &argp)) != 0) \
+ return (ret); \
+} while (0)
+#define REC_NOOP_CLOSE \
+ if (argp != NULL) \
+ __os_free(env, argp); \
+ return (ret)
+
+/*
+ * Macro for reading pages during recovery. In most cases we
+ * want to avoid an error if the page is not found during rollback.
+ */
+#define REC_FGET(mpf, ip, pgno, pagep, cont) \
+ if ((ret = __memp_fget(mpf, \
+ &(pgno), ip, NULL, 0, pagep)) != 0) { \
+ if (ret != DB_PAGE_NOTFOUND) { \
+ ret = __db_pgerr(file_dbp, pgno, ret); \
+ goto out; \
+ } else \
+ goto cont; \
+ }
+#define REC_DIRTY(mpf, ip, priority, pagep) \
+ if ((ret = __memp_dirty(mpf, \
+ pagep, ip, NULL, priority, DB_MPOOL_EDIT)) != 0) { \
+ ret = __db_pgerr(file_dbp, PGNO(*(pagep)), ret); \
+ goto out; \
+ }
+
+/*
+ * Standard debugging macro for all recovery functions.
+ */
+#ifdef DEBUG_RECOVER
+#define REC_PRINT(func) \
+ (void)func(env, dbtp, lsnp, op, info);
+#else
+#define REC_PRINT(func)
+#endif
+
+/*
+ * Actions to __db_lget
+ */
+#define LCK_ALWAYS 1 /* Lock even for off page dup cursors */
+#define LCK_COUPLE 2 /* Lock Couple */
+#define LCK_COUPLE_ALWAYS 3 /* Lock Couple even in txn. */
+#define LCK_DOWNGRADE 4 /* Downgrade the lock. (internal) */
+#define LCK_ROLLBACK 5 /* Lock even if in rollback */
+
+/*
+ * If doing transactions we have to hold the locks associated with a data item
+ * from a page for the entire transaction. However, we don't have to hold the
+ * locks associated with walking the tree. Distinguish between the two so that
+ * we don't tie up the internal pages of the tree longer than necessary.
+ */
+#define __LPUT(dbc, lock) \
+ __ENV_LPUT((dbc)->env, lock)
+
+#define __ENV_LPUT(env, lock) \
+ (LOCK_ISSET(lock) ? __lock_put(env, &(lock)) : 0)
+
+/*
+ * __TLPUT -- transactional lock put
+ * If the lock is valid then
+ * If we are not in a transaction put the lock.
+ * Else if the cursor is doing dirty reads and this was a read then
+ * put the lock.
+ * Else if the db is supporting dirty reads and this is a write then
+ * downgrade it.
+ * Else do nothing.
+ */
+#define __TLPUT(dbc, lock) \
+ (LOCK_ISSET(lock) ? __db_lput(dbc, &(lock)) : 0)
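+
+/*
+ * For example (illustrative): a cursor descending the tree drops each
+ * interior-page lock with __LPUT once the child page is locked, while the
+ * lock protecting the data item it modifies is released through __TLPUT,
+ * which keeps it until the enclosing transaction resolves.
+ */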
+
+/*
+ * Check whether a database is a primary (that is, has associated secondaries).
+ */
+#define DB_IS_PRIMARY(dbp) (LIST_FIRST(&(dbp)->s_secondaries) != NULL)
+/*
+ * A database must be treated as read-only if it was explicitly opened
+ * that way, or if we are a client in a replicated environment and the
+ * user did not specify DB_TXN_NOT_DURABLE.
+ */
+#define DB_IS_READONLY(dbp) \
+ (F_ISSET(dbp, DB_AM_RDONLY) || \
+ (IS_REP_CLIENT((dbp)->env) && !F_ISSET((dbp), DB_AM_NOT_DURABLE)))
+
+#ifdef HAVE_COMPRESSION
+/*
+ * Check whether a database is compressed (btree only)
+ */
+#define DB_IS_COMPRESSED(dbp) \
+ (((BTREE *)(dbp)->bt_internal)->bt_compress != NULL)
+#endif
+
+/*
+ * We copy the key out if there's any chance the key in the database is not
+ * the same as the user-specified key. If there is a custom comparator we
+ * return a key, as the user-specified key might be a partial key, containing
+ * only the unique identifier. [#13572] [#15770]
+ *
+ * The test for (flags != 0) is necessary for Db.{get,pget}, but it's not
+ * legal to pass a non-zero flags value to Dbc.{get,pget}.
+ *
+ * We need to split out the hash component, since it is possible to build
+ * without hash support enabled, in which case the dereference of the hash
+ * internals would be a null pointer access.
+ */
+#ifdef HAVE_HASH
+#define DB_RETURNS_A_KEY_HASH(dbp) \
+ ((HASH *)(dbp)->h_internal)->h_compare != NULL
+#else
+#define DB_RETURNS_A_KEY_HASH(dbp) 0
+#endif
+#define DB_RETURNS_A_KEY(dbp, flags) \
+ (((flags) != 0 && (flags) != DB_GET_BOTH && \
+ (flags) != DB_GET_BOTH_RANGE && (flags) != DB_SET) || \
+ ((BTREE *)(dbp)->bt_internal)->bt_compare != __bam_defcmp ||\
+ DB_RETURNS_A_KEY_HASH(dbp))
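+
+/*
+ * Example (illustrative): with a custom bt_compare that orders records by a
+ * leading 4-byte identifier only, an application may search with just that
+ * identifier; the stored key (identifier plus payload) differs from the
+ * search key, so the stored key must be copied back out.
+ */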
+
+/*
+ * For portability, primary keys that are record numbers are stored in
+ * secondaries in the same byte order as the secondary database. As a
+ * consequence, we need to swap the byte order of these keys before attempting
+ * to use them for lookups in the primary. We also need to swap user-supplied
+ * primary keys that are used in secondary lookups (for example, with the
+ * DB_GET_BOTH flag on a secondary get).
+ */
+#include "dbinc/db_swap.h"
+
+#define SWAP_IF_NEEDED(sdbp, pkey) \
+ do { \
+ if (((sdbp)->s_primary->type == DB_QUEUE || \
+ (sdbp)->s_primary->type == DB_RECNO) && \
+ F_ISSET((sdbp), DB_AM_SWAP)) \
+ P_32_SWAP((pkey)->data); \
+ } while (0)
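+
+/*
+ * Worked example (illustrative): record number 1 stored as a primary key in
+ * a little-endian secondary occupies the bytes 01 00 00 00.  If that
+ * secondary is later opened on a big-endian host, DB_AM_SWAP is set and
+ * P_32_SWAP reverses the bytes before the lookup in the primary.
+ */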
+
+/*
+ * Cursor adjustment:
+ * Return the first DB handle in the sorted ENV list of DB
+ * handles that has a matching file ID.
+ */
+#define FIND_FIRST_DB_MATCH(env, dbp, tdbp) do { \
+ for ((tdbp) = (dbp); \
+ TAILQ_PREV((tdbp), __dblist, dblistlinks) != NULL && \
+ TAILQ_PREV((tdbp), \
+ __dblist, dblistlinks)->adj_fileid == (dbp)->adj_fileid;\
+ (tdbp) = TAILQ_PREV((tdbp), __dblist, dblistlinks)) \
+ ; \
+} while (0)
+
+/*
+ * Macros used to implement a binary search algorithm. Shared between the
+ * btree and hash implementations.
+ */
+#define DB_BINARY_SEARCH_FOR(base, limit, nument, adjust) \
+ for (base = 0, limit = (nument) / (db_indx_t)(adjust); \
+ (limit) != 0; (limit) >>= 1)
+
+#define DB_BINARY_SEARCH_INCR(index, base, limit, adjust) \
+ index = (base) + (((limit) >> 1) * (adjust))
+
+#define DB_BINARY_SEARCH_SHIFT_BASE(index, base, limit, adjust) do { \
+ base = (index) + (adjust); \
+ --(limit); \
+} while (0)
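+
+/*
+ * A composition sketch (illustrative), loosely following the btree search
+ * loop; compare() and item() stand in for the real comparator and page
+ * accessor:
+ *
+ *    DB_BINARY_SEARCH_FOR(base, limit, nument, adjust) {
+ *        DB_BINARY_SEARCH_INCR(indx, base, limit, adjust);
+ *        if ((cmp = compare(key, item(indx))) == 0)
+ *            break;
+ *        if (cmp > 0)
+ *            DB_BINARY_SEARCH_SHIFT_BASE(indx, base, limit, adjust);
+ *    }
+ */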
+
+/*
+ * Sequence macros, shared between sequence.c and seq_stat.c
+ */
+#define SEQ_IS_OPEN(seq) ((seq)->seq_key.data != NULL)
+
+#define SEQ_ILLEGAL_AFTER_OPEN(seq, name) \
+ if (SEQ_IS_OPEN(seq)) \
+ return (__db_mi_open((seq)->seq_dbp->env, name, 1));
+
+#define SEQ_ILLEGAL_BEFORE_OPEN(seq, name) \
+ if (!SEQ_IS_OPEN(seq)) \
+ return (__db_mi_open((seq)->seq_dbp->env, name, 0));
+
+/*
+ * Flags to __db_chk_meta.
+ */
+#define DB_CHK_META 0x01 /* Checksum the meta page. */
+#define DB_CHK_NOLSN 0x02 /* Don't check the LSN. */
+#define DB_CHK_ONLY 0x04 /* Only do the checksum. */
+#define DB_SKIP_CHK 0x08 /* Don't checksum or decrypt the meta page. */
+
+/*
+ * Flags to __db_truncate_page.
+ */
+#define DB_EXCH_FREE 0x01 /* Free the old page. */
+#define DB_EXCH_PARENT 0x02 /* There is a parent to update. */
+
+/* We usually want to do these operations. */
+#define DB_EXCH_DEFAULT (DB_EXCH_FREE | DB_EXCH_PARENT)
+
+#if defined(__cplusplus)
+}
+#endif
+
+#include "dbinc/db_dispatch.h"
+#include "dbinc_auto/db_auto.h"
+#include "dbinc_auto/crdel_auto.h"
+#include "dbinc_auto/db_ext.h"
+#endif /* !_DB_AM_H_ */
diff --git a/src/dbinc/db_cxx.in b/src/dbinc/db_cxx.in
new file mode 100644
index 00000000..84fc0f88
--- /dev/null
+++ b/src/dbinc/db_cxx.in
@@ -0,0 +1,1523 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_CXX_H_
+#define _DB_CXX_H_
+//
+// C++ assumptions:
+//
+// To ensure portability to many platforms, both new and old, we make
+// few assumptions about the C++ compiler and library. For example,
+// we do not expect STL, templates or namespaces to be available. The
+// "newest" C++ feature used is exceptions, which are used liberally
+// to transmit error information. Even the use of exceptions can be
+// disabled at runtime; to do so, pass the DB_CXX_NO_EXCEPTIONS flag
+// to the DbEnv or Db constructor.
+//
+// C++ naming conventions:
+//
+// - All top level class names start with Db.
+// - All class members start with lower case letter.
+// - All private data members are suffixed with underscore.
+// - Use underscores to divide names into multiple words.
+// - Simple data accessors are named with get_ or set_ prefix.
+// - All method names are taken from names of functions in the C
+// layer of db (usually by dropping a prefix like "db_").
+// These methods have the same argument types and order,
+// other than dropping the explicit arg that acts as "this".
+//
+// As a rule, each DbFoo object has exactly one underlying DB_FOO struct
+// (defined in db.h) associated with it. In some cases, we inherit directly
+// from the DB_FOO structure to make this relationship explicit. Often,
+// the underlying C layer allocates and deallocates these structures, so
+// there is no easy way to add any data to the DbFoo class. When you see
+// a comment about whether data is permitted to be added, this is what
+// is going on. Of course, if we need to add data to such C++ classes
+// in the future, we will arrange to have an indirect pointer to the
+// DB_FOO struct (as some of the classes already have).
+//
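+// For example (an illustrative pairing, following the conventions above),
+// the C call
+//	dbenv->txn_begin(dbenv, parent, &txn, flags)
+// corresponds to the C++ method call
+//	dbenv.txn_begin(parent, &txn, flags);
+//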
+
+////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////
+//
+// Forward declarations
+//
+
+#include <stdarg.h>
+
+@cxx_have_stdheaders@
+#ifdef HAVE_CXX_STDHEADERS
+#include <iostream>
+#include <exception>
+#define __DB_STD(x) std::x
+#else
+#include <iostream.h>
+#include <exception.h>
+#define __DB_STD(x) x
+#endif
+
+#include "db.h"
+
+class Db; // forward
+class Dbc; // forward
+class DbChannel; // forward
+class DbEnv; // forward
+class DbHeapRecordId; // forward
+class DbInfo; // forward
+class DbLock; // forward
+class DbLogc; // forward
+class DbLsn; // forward
+class DbMpoolFile; // forward
+class DbPreplist; // forward
+class DbSequence; // forward
+class DbSite; // forward
+class Dbt; // forward
+class DbTxn; // forward
+
+class DbMultipleIterator; // forward
+class DbMultipleKeyDataIterator; // forward
+class DbMultipleRecnoDataIterator; // forward
+class DbMultipleDataIterator; // forward
+
+class DbException; // forward
+class DbDeadlockException; // forward
+class DbLockNotGrantedException; // forward
+class DbMemoryException; // forward
+class DbRepHandleDeadException; // forward
+class DbRunRecoveryException; // forward
+
+////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////
+//
+// Turn off inappropriate compiler warnings
+//
+
+#ifdef _MSC_VER
+
+// These are level 4 warnings that are explicitly disabled.
+// With Visual C++, warnings above level 3 are not shown by default
+// unless you use /W4. But we like to compile with the highest warning
+// level to catch other errors.
+//
+// 4201: nameless struct/union
+// triggered by standard include file <winnt.h>
+//
+// 4514: unreferenced inline function has been removed
+// certain include files in MSVC define methods that are not called
+//
+#pragma warning(push)
+#pragma warning(disable: 4201 4514)
+
+#endif
+
+////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////
+//
+// Mechanisms for declaring classes
+//
+
+//
+// Every class defined in this file has an _exported next to the class name.
+// This is needed for WinTel machines so that the class methods can
+// be exported or imported in a DLL as appropriate. Users of the DLL
+// use the define DB_USE_DLL. When the DLL is built, DB_CREATE_DLL
+// must be defined.
+//
+#if defined(_MSC_VER)
+
+# if defined(DB_CREATE_DLL)
+# define _exported __declspec(dllexport) // creator of dll
+# elif defined(DB_USE_DLL)
+# define _exported __declspec(dllimport) // user of dll
+# else
+# define _exported // static lib creator or user
+# endif
+
+#else /* _MSC_VER */
+
+# define _exported
+
+#endif /* _MSC_VER */
+
+// Some interfaces can be customized by allowing users to define
+// callback functions. For performance and logistical reasons, some
+// callback functions must be declared in extern "C" blocks. For others,
+// we allow you to declare the callbacks in C++ or C (or an extern "C"
+// block) as you wish. See the set methods for the callbacks for
+// the choices.
+//
+extern "C" {
+ typedef void * (*db_malloc_fcn_type)
+ (size_t);
+ typedef void * (*db_realloc_fcn_type)
+ (void *, size_t);
+ typedef void (*db_free_fcn_type)
+ (void *);
+ typedef int (*bt_compare_fcn_type) /*C++ version available*/
+ (DB *, const DBT *, const DBT *);
+ typedef size_t (*bt_prefix_fcn_type) /*C++ version available*/
+ (DB *, const DBT *, const DBT *);
+ typedef int (*dup_compare_fcn_type) /*C++ version available*/
+ (DB *, const DBT *, const DBT *);
+ typedef int (*h_compare_fcn_type) /*C++ version available*/
+ (DB *, const DBT *, const DBT *);
+ typedef u_int32_t (*h_hash_fcn_type) /*C++ version available*/
+ (DB *, const void *, u_int32_t);
+ typedef int (*pgin_fcn_type)
+ (DB_ENV *dbenv, db_pgno_t pgno, void *pgaddr, DBT *pgcookie);
+ typedef int (*pgout_fcn_type)
+ (DB_ENV *dbenv, db_pgno_t pgno, void *pgaddr, DBT *pgcookie);
+}
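+
+// For instance (a sketch, not part of the header proper), a comparison
+// callback meant for the deprecated Db::set_bt_compare(bt_compare_fcn_type)
+// overload can be declared in an extern "C" block; dbp is unused here:
+//
+//	extern "C" int
+//	my_bt_compare(DB *dbp, const DBT *a, const DBT *b)
+//	{
+//		size_t len = a->size < b->size ? a->size : b->size;
+//		int cmp = memcmp(a->data, b->data, len);
+//		return (cmp != 0 ? cmp : (int)a->size - (int)b->size);
+//	}
+//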
+
+//
+// Represents a database table = a set of keys with associated values.
+//
+class _exported Db
+{
+ friend class DbEnv;
+
+public:
+ Db(DbEnv*, u_int32_t); // Create a Db object.
+ virtual ~Db(); // Calls close() if the user hasn't.
+
+ // These methods exactly match those in the C interface.
+ //
+ virtual int associate(DbTxn *txn, Db *secondary, int (*callback)
+ (Db *, const Dbt *, const Dbt *, Dbt *), u_int32_t flags);
+ virtual int associate_foreign(Db *foreign, int (*callback)
+ (Db *, const Dbt *, Dbt *, const Dbt *, int *), u_int32_t flags);
+ virtual int close(u_int32_t flags);
+ virtual int compact(DbTxn *txnid, Dbt *start,
+ Dbt *stop, DB_COMPACT *c_data, u_int32_t flags, Dbt *end);
+ virtual int cursor(DbTxn *txnid, Dbc **cursorp, u_int32_t flags);
+ virtual int del(DbTxn *txnid, Dbt *key, u_int32_t flags);
+ virtual void err(int, const char *, ...);
+ virtual void errx(const char *, ...);
+ virtual int exists(DbTxn *txnid, Dbt *key, u_int32_t flags);
+ virtual int fd(int *fdp);
+ virtual int get(DbTxn *txnid, Dbt *key, Dbt *data, u_int32_t flags);
+ virtual int get_alloc(
+ db_malloc_fcn_type *, db_realloc_fcn_type *, db_free_fcn_type *);
+ virtual int get_append_recno(int (**)(Db *, Dbt *, db_recno_t));
+ virtual int get_bt_compare(int (**)(Db *, const Dbt *, const Dbt *));
+ virtual int get_bt_compress(
+ int (**)(
+ Db *, const Dbt *, const Dbt *, const Dbt *, const Dbt *, Dbt *),
+ int (**)(Db *, const Dbt *, const Dbt *, Dbt *, Dbt *, Dbt *));
+ virtual int get_bt_minkey(u_int32_t *);
+ virtual int get_bt_prefix(size_t (**)(Db *, const Dbt *, const Dbt *));
+ virtual int get_byteswapped(int *);
+ virtual int get_cachesize(u_int32_t *, u_int32_t *, int *);
+ virtual int get_create_dir(const char **);
+ virtual int get_dbname(const char **, const char **);
+ virtual int get_dup_compare(int (**)(Db *, const Dbt *, const Dbt *));
+ virtual int get_encrypt_flags(u_int32_t *);
+ virtual void get_errcall(
+ void (**)(const DbEnv *, const char *, const char *));
+ virtual void get_errfile(FILE **);
+ virtual void get_errpfx(const char **);
+ virtual int get_feedback(void (**)(Db *, int, int));
+ virtual int get_flags(u_int32_t *);
+ virtual int get_heapsize(u_int32_t *, u_int32_t *);
+ virtual int get_heap_regionsize(u_int32_t *);
+ virtual int get_h_compare(int (**)(Db *, const Dbt *, const Dbt *));
+ virtual int get_h_ffactor(u_int32_t *);
+ virtual int get_h_hash(u_int32_t (**)(Db *, const void *, u_int32_t));
+ virtual int get_h_nelem(u_int32_t *);
+ virtual int get_lk_exclusive(bool *, bool *);
+ virtual int get_lorder(int *);
+ virtual void get_msgcall(void (**)(const DbEnv *, const char *));
+ virtual void get_msgfile(FILE **);
+ virtual int get_multiple();
+ virtual int get_open_flags(u_int32_t *);
+ virtual int get_pagesize(u_int32_t *);
+ virtual int get_partition_callback(
+ u_int32_t *, u_int32_t (**)(Db *, Dbt *key));
+ virtual int get_partition_dirs(const char ***);
+ virtual int get_partition_keys(u_int32_t *, Dbt **);
+ virtual int get_priority(DB_CACHE_PRIORITY *);
+ virtual int get_q_extentsize(u_int32_t *);
+ virtual int get_re_delim(int *);
+ virtual int get_re_len(u_int32_t *);
+ virtual int get_re_pad(int *);
+ virtual int get_re_source(const char **);
+ virtual int get_transactional();
+ virtual int get_type(DBTYPE *);
+ virtual int join(Dbc **curslist, Dbc **dbcp, u_int32_t flags);
+ virtual int key_range(DbTxn *, Dbt *, DB_KEY_RANGE *, u_int32_t);
+ virtual int open(DbTxn *txnid,
+ const char *, const char *subname, DBTYPE, u_int32_t, int);
+ virtual int pget(DbTxn *txnid,
+ Dbt *key, Dbt *pkey, Dbt *data, u_int32_t flags);
+ virtual int put(DbTxn *, Dbt *, Dbt *, u_int32_t);
+ virtual int remove(const char *, const char *, u_int32_t);
+ virtual int rename(const char *, const char *, const char *, u_int32_t);
+ virtual int set_alloc(
+ db_malloc_fcn_type, db_realloc_fcn_type, db_free_fcn_type);
+ virtual void set_app_private(void *);
+ virtual int set_append_recno(int (*)(Db *, Dbt *, db_recno_t));
+ virtual int set_bt_compare(bt_compare_fcn_type); /*deprecated*/
+ virtual int set_bt_compare(int (*)(Db *, const Dbt *, const Dbt *));
+ virtual int set_bt_compress(
+ int (*)
+ (Db *, const Dbt *, const Dbt *, const Dbt *, const Dbt *, Dbt *),
+ int (*)(Db *, const Dbt *, const Dbt *, Dbt *, Dbt *, Dbt *));
+ virtual int set_bt_minkey(u_int32_t);
+ virtual int set_bt_prefix(bt_prefix_fcn_type); /*deprecated*/
+ virtual int set_bt_prefix(size_t (*)(Db *, const Dbt *, const Dbt *));
+ virtual int set_cachesize(u_int32_t, u_int32_t, int);
+ virtual int set_create_dir(const char *);
+ virtual int set_dup_compare(dup_compare_fcn_type); /*deprecated*/
+ virtual int set_dup_compare(int (*)(Db *, const Dbt *, const Dbt *));
+ virtual int set_encrypt(const char *, u_int32_t);
+ virtual void set_errcall(
+ void (*)(const DbEnv *, const char *, const char *));
+ virtual void set_errfile(FILE *);
+ virtual void set_errpfx(const char *);
+ virtual int set_feedback(void (*)(Db *, int, int));
+ virtual int set_flags(u_int32_t);
+ virtual int set_heapsize(u_int32_t, u_int32_t);
+ virtual int set_heap_regionsize(u_int32_t);
+ virtual int set_h_compare(h_compare_fcn_type); /*deprecated*/
+ virtual int set_h_compare(int (*)(Db *, const Dbt *, const Dbt *));
+ virtual int set_h_ffactor(u_int32_t);
+ virtual int set_h_hash(h_hash_fcn_type); /*deprecated*/
+ virtual int set_h_hash(u_int32_t (*)(Db *, const void *, u_int32_t));
+ virtual int set_h_nelem(u_int32_t);
+ virtual int set_lk_exclusive(bool);
+ virtual int set_lorder(int);
+ virtual void set_msgcall(void (*)(const DbEnv *, const char *));
+ virtual void set_msgfile(FILE *);
+ virtual int set_pagesize(u_int32_t);
+ virtual int set_paniccall(void (*)(DbEnv *, int));
+ virtual int set_partition(
+ u_int32_t, Dbt *, u_int32_t (*)(Db *, Dbt *));
+ virtual int set_partition_dirs(const char **);
+ virtual int set_priority(DB_CACHE_PRIORITY);
+ virtual int set_q_extentsize(u_int32_t);
+ virtual int set_re_delim(int);
+ virtual int set_re_len(u_int32_t);
+ virtual int set_re_pad(int);
+ virtual int set_re_source(const char *);
+ virtual int sort_multiple(Dbt *, Dbt *, u_int32_t);
+ virtual int stat(DbTxn *, void *sp, u_int32_t flags);
+ virtual int stat_print(u_int32_t flags);
+ virtual int sync(u_int32_t flags);
+ virtual int truncate(DbTxn *, u_int32_t *, u_int32_t);
+ virtual int upgrade(const char *name, u_int32_t flags);
+ virtual int verify(
+ const char *, const char *, __DB_STD(ostream) *, u_int32_t);
+
+ // These additional methods are not in the C interface, and
+ // are only available for C++.
+ //
+ virtual void *get_app_private() const;
+ virtual __DB_STD(ostream) *get_error_stream();
+ virtual void set_error_stream(__DB_STD(ostream) *);
+ virtual __DB_STD(ostream) *get_message_stream();
+ virtual void set_message_stream(__DB_STD(ostream) *);
+
+ virtual DbEnv *get_env();
+ virtual DbMpoolFile *get_mpf();
+
+ virtual ENV *get_ENV()
+ {
+ return imp_->env;
+ }
+
+ virtual DB *get_DB()
+ {
+ return imp_;
+ }
+
+ virtual const DB *get_const_DB() const
+ {
+ return imp_;
+ }
+
+ static Db* get_Db(DB *db)
+ {
+ return (Db *)db->api_internal;
+ }
+
+ static const Db* get_const_Db(const DB *db)
+ {
+ return (const Db *)db->api_internal;
+ }
+
+ u_int32_t get_create_flags() const
+ {
+ return construct_flags_;
+ }
+
+private:
+ // no copying
+ Db(const Db &);
+ Db &operator = (const Db &);
+
+ void cleanup();
+ int initialize();
+ int error_policy();
+
+ // instance data
+ DB *imp_;
+ DbEnv *dbenv_;
+ DbMpoolFile *mpf_;
+ int construct_error_;
+ u_int32_t flags_;
+ u_int32_t construct_flags_;
+
+ static int alt_close(DB *, u_int32_t);
+
+public:
+ // These are public only because they need to be called
+ // via C callback functions. They should never be used by
+ // external users of this class.
+ //
+ int (*append_recno_callback_)(Db *, Dbt *, db_recno_t);
+ int (*associate_callback_)(Db *, const Dbt *, const Dbt *, Dbt *);
+ int (*associate_foreign_callback_)
+ (Db *, const Dbt *, Dbt *, const Dbt *, int *);
+ int (*bt_compare_callback_)(Db *, const Dbt *, const Dbt *);
+ int (*bt_compress_callback_)(
+ Db *, const Dbt *, const Dbt *, const Dbt *, const Dbt *, Dbt *);
+ int (*bt_decompress_callback_)(
+ Db *, const Dbt *, const Dbt *, Dbt *, Dbt *, Dbt *);
+ size_t (*bt_prefix_callback_)(Db *, const Dbt *, const Dbt *);
+ u_int32_t (*db_partition_callback_)(Db *, Dbt *);
+ int (*dup_compare_callback_)(Db *, const Dbt *, const Dbt *);
+ void (*feedback_callback_)(Db *, int, int);
+ int (*h_compare_callback_)(Db *, const Dbt *, const Dbt *);
+ u_int32_t (*h_hash_callback_)(Db *, const void *, u_int32_t);
+};
+
+//
+// Cursor
+//
+class _exported Dbc : protected DBC
+{
+ friend class Db;
+
+public:
+ int close();
+ int cmp(Dbc *other_csr, int *result, u_int32_t flags);
+ int count(db_recno_t *countp, u_int32_t flags);
+ int del(u_int32_t flags);
+ int dup(Dbc** cursorp, u_int32_t flags);
+ int get(Dbt* key, Dbt *data, u_int32_t flags);
+ int get_priority(DB_CACHE_PRIORITY *priorityp);
+ int pget(Dbt* key, Dbt* pkey, Dbt *data, u_int32_t flags);
+ int put(Dbt* key, Dbt *data, u_int32_t flags);
+ int set_priority(DB_CACHE_PRIORITY priority);
+
+private:
+ // No data is permitted in this class (see comment at top)
+
+ // Note: use Db::cursor() to get pointers to a Dbc,
+ // and call Dbc::close() rather than delete to release them.
+ //
+ Dbc();
+ ~Dbc();
+
+ // no copying
+ Dbc(const Dbc &);
+ Dbc &operator = (const Dbc &);
+};
+
+//
+// A channel in replication group
+//
+class _exported DbChannel
+{
+ friend class DbEnv;
+
+public:
+ int close();
+ int send_msg(Dbt *msg, u_int32_t nmsg, u_int32_t flags);
+ int send_request(Dbt *request, u_int32_t nrequest, Dbt *response,
+ db_timeout_t timeout, u_int32_t flags);
+ int set_timeout(db_timeout_t timeout);
+
+ virtual DB_CHANNEL *get_DB_CHANNEL()
+ {
+ return imp_;
+ }
+
+ virtual const DB_CHANNEL *get_const_DB_CHANNEL() const
+ {
+ return imp_;
+ }
+
+private:
+ DbChannel();
+ virtual ~DbChannel();
+
+ // no copying
+ DbChannel(const DbChannel &);
+ DbChannel &operator = (const DbChannel &);
+ DB_CHANNEL *imp_;
+ DbEnv *dbenv_;
+};
+
+//
+// Berkeley DB environment class. Provides functions for opening databases.
+// Users of this library can use this class as a starting point for
+// developing a DB application: derive an application class from this
+// one and add application control logic.
+//
+// Note that if you use the default constructor, you must explicitly
+// call appinit() before any other db activity (e.g. opening files)
+//
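+// A minimal usage sketch (illustrative; the home path is hypothetical and
+// error handling is omitted):
+//
+//	DbEnv env(0);
+//	env.set_cachesize(0, 1024 * 1024, 1);
+//	env.open("/home/db", DB_CREATE | DB_INIT_MPOOL, 0);
+//	...
+//	env.close(0);
+//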
+class _exported DbEnv
+{
+ friend class Db;
+ friend class DbLock;
+ friend class DbMpoolFile;
+
+public:
+ // After using this constructor, you can set any needed
+ // parameters for the environment using the set_* methods.
+ // Then call open() to finish initializing the environment
+ // and attaching it to underlying files.
+ //
+ DbEnv(u_int32_t flags);
+
+ virtual ~DbEnv();
+
+ // These methods match those in the C interface.
+ //
+ virtual int add_data_dir(const char *);
+ virtual int backup(const char *target, u_int32_t flags);
+ virtual int cdsgroup_begin(DbTxn **tid);
+ virtual int close(u_int32_t);
+ virtual int dbbackup(
+ const char *dbfile, const char *target, u_int32_t flags);
+ virtual int dbremove(DbTxn *txn, const char *name, const char *subdb,
+ u_int32_t flags);
+ virtual int dbrename(DbTxn *txn, const char *name, const char *subdb,
+ const char *newname, u_int32_t flags);
+ virtual void err(int, const char *, ...);
+ virtual void errx(const char *, ...);
+ virtual int failchk(u_int32_t);
+ virtual int fileid_reset(const char *, u_int32_t);
+ virtual int get_alloc(db_malloc_fcn_type *, db_realloc_fcn_type *,
+ db_free_fcn_type *);
+ virtual void *get_app_private() const;
+ virtual int get_home(const char **);
+ virtual int get_open_flags(u_int32_t *);
+ virtual int open(const char *, u_int32_t, int);
+ virtual int remove(const char *, u_int32_t);
+ virtual int stat_print(u_int32_t flags);
+
+ virtual int set_alloc(db_malloc_fcn_type, db_realloc_fcn_type,
+ db_free_fcn_type);
+ virtual void set_app_private(void *);
+ virtual int get_backup_callbacks(
+ int (**)(DbEnv *, const char *, const char *, void **),
+ int (**)(DbEnv *, u_int32_t, u_int32_t, u_int32_t, u_int8_t *, void *),
+ int (**)(DbEnv *, const char *, void *));
+ virtual int set_backup_callbacks(
+ int (*)(DbEnv *, const char *, const char *, void **),
+ int (*)(DbEnv *, u_int32_t, u_int32_t, u_int32_t, u_int8_t *, void *),
+ int (*)(DbEnv *, const char *, void *));
+ virtual int get_backup_config(DB_BACKUP_CONFIG, u_int32_t *);
+ virtual int set_backup_config(DB_BACKUP_CONFIG, u_int32_t);
+ virtual int get_cachesize(u_int32_t *, u_int32_t *, int *);
+ virtual int set_cachesize(u_int32_t, u_int32_t, int);
+ virtual int get_cache_max(u_int32_t *, u_int32_t *);
+ virtual int set_cache_max(u_int32_t, u_int32_t);
+ virtual int get_create_dir(const char **);
+ virtual int set_create_dir(const char *);
+ virtual int get_data_dirs(const char ***);
+ virtual int set_data_dir(const char *);
+ virtual int get_encrypt_flags(u_int32_t *);
+ virtual int get_intermediate_dir_mode(const char **);
+ virtual int set_intermediate_dir_mode(const char *);
+ virtual int get_isalive(
+ int (**)(DbEnv *, pid_t, db_threadid_t, u_int32_t));
+ virtual int set_isalive(
+ int (*)(DbEnv *, pid_t, db_threadid_t, u_int32_t));
+ virtual int set_encrypt(const char *, u_int32_t);
+ virtual void get_errcall(
+ void (**)(const DbEnv *, const char *, const char *));
+ virtual void set_errcall(
+ void (*)(const DbEnv *, const char *, const char *));
+ virtual void get_errfile(FILE **);
+ virtual void set_errfile(FILE *);
+ virtual void get_errpfx(const char **);
+ virtual void set_errpfx(const char *);
+ virtual int set_event_notify(void (*)(DbEnv *, u_int32_t, void *));
+ virtual int get_flags(u_int32_t *);
+ virtual int set_flags(u_int32_t, int);
+ virtual bool is_bigendian();
+ virtual int lsn_reset(const char *, u_int32_t);
+ virtual int get_feedback(void (**)(DbEnv *, int, int));
+ virtual int set_feedback(void (*)(DbEnv *, int, int));
+ virtual int get_lg_bsize(u_int32_t *);
+ virtual int set_lg_bsize(u_int32_t);
+ virtual int get_lg_dir(const char **);
+ virtual int set_lg_dir(const char *);
+ virtual int get_lg_filemode(int *);
+ virtual int set_lg_filemode(int);
+ virtual int get_lg_max(u_int32_t *);
+ virtual int set_lg_max(u_int32_t);
+ virtual int get_lg_regionmax(u_int32_t *);
+ virtual int set_lg_regionmax(u_int32_t);
+ virtual int get_lk_conflicts(const u_int8_t **, int *);
+ virtual int set_lk_conflicts(u_int8_t *, int);
+ virtual int get_lk_detect(u_int32_t *);
+ virtual int set_lk_detect(u_int32_t);
+ virtual int get_lk_max_lockers(u_int32_t *);
+ virtual int set_lk_max_lockers(u_int32_t);
+ virtual int get_lk_max_locks(u_int32_t *);
+ virtual int set_lk_max_locks(u_int32_t);
+ virtual int get_lk_max_objects(u_int32_t *);
+ virtual int set_lk_max_objects(u_int32_t);
+ virtual int get_lk_partitions(u_int32_t *);
+ virtual int set_lk_partitions(u_int32_t);
+ virtual int get_lk_priority(u_int32_t, u_int32_t *);
+ virtual int set_lk_priority(u_int32_t, u_int32_t);
+ virtual int get_lk_tablesize(u_int32_t *);
+ virtual int set_lk_tablesize(u_int32_t);
+ virtual int get_memory_init(DB_MEM_CONFIG, u_int32_t *);
+ virtual int set_memory_init(DB_MEM_CONFIG, u_int32_t);
+ virtual int get_memory_max(u_int32_t *, u_int32_t *);
+ virtual int set_memory_max(u_int32_t, u_int32_t);
+ virtual int get_metadata_dir(const char **);
+ virtual int set_metadata_dir(const char *);
+ virtual int get_mp_mmapsize(size_t *);
+ virtual int set_mp_mmapsize(size_t);
+ virtual int get_mp_max_openfd(int *);
+ virtual int set_mp_max_openfd(int);
+ virtual int get_mp_max_write(int *, db_timeout_t *);
+ virtual int set_mp_max_write(int, db_timeout_t);
+ virtual int get_mp_pagesize(u_int32_t *);
+ virtual int set_mp_pagesize(u_int32_t);
+ virtual int get_mp_tablesize(u_int32_t *);
+ virtual int set_mp_tablesize(u_int32_t);
+ virtual void get_msgcall(void (**)(const DbEnv *, const char *));
+ virtual void set_msgcall(void (*)(const DbEnv *, const char *));
+ virtual void get_msgfile(FILE **);
+ virtual void set_msgfile(FILE *);
+ virtual int set_paniccall(void (*)(DbEnv *, int));
+ virtual int get_shm_key(long *);
+ virtual int set_shm_key(long);
+ virtual int get_timeout(db_timeout_t *, u_int32_t);
+ virtual int set_timeout(db_timeout_t, u_int32_t);
+ virtual int get_tmp_dir(const char **);
+ virtual int set_tmp_dir(const char *);
+ virtual int get_tx_max(u_int32_t *);
+ virtual int set_tx_max(u_int32_t);
+ virtual int get_app_dispatch(
+ int (**)(DbEnv *, Dbt *, DbLsn *, db_recops));
+ virtual int set_app_dispatch(int (*)(DbEnv *,
+ Dbt *, DbLsn *, db_recops));
+ virtual int get_tx_timestamp(time_t *);
+ virtual int set_tx_timestamp(time_t *);
+ virtual int get_verbose(u_int32_t which, int *);
+ virtual int set_verbose(u_int32_t which, int);
+
+ // Version information. Static methods, can be called at any time.
+ //
+ static char *version(int *major, int *minor, int *patch);
+ static char *full_version(int *family, int *release,
+ int *major, int *minor, int *patch);
+
+ // Convert DB errors to strings
+ static char *strerror(int);
+
+ // If an error is detected and the error call function
+ // or stream is set, a message is dispatched or printed.
+ // If a prefix is set, each message is prefixed.
+ //
+ // You can use set_errcall() or set_errfile() above to control
+ // error functionality. Alternatively, you can call
+ // set_error_stream() to force all errors to a C++ stream.
+ // It is unwise to mix these approaches.
+ //
+ virtual __DB_STD(ostream) *get_error_stream();
+ virtual void set_error_stream(__DB_STD(ostream) *);
+ virtual __DB_STD(ostream) *get_message_stream();
+ virtual void set_message_stream(__DB_STD(ostream) *);
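+
+	// A minimal usage sketch (illustrative; the prefix string and
+	// stream choice are assumptions, not part of this header):
+	//
+	//	env.set_errpfx("myapp");
+	//	env.set_error_stream(&std::cerr);
+	//	// Subsequent errors print as "myapp: <message>" on stderr.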
+
+	// For internal use only.
+ static void runtime_error(DbEnv *dbenv, const char *caller, int err,
+ int error_policy);
+ static void runtime_error_dbt(DbEnv *dbenv, const char *caller, Dbt *dbt,
+ int error_policy);
+ static void runtime_error_lock_get(DbEnv *dbenv, const char *caller,
+ int err, db_lockop_t op, db_lockmode_t mode,
+ Dbt *obj, DbLock lock, int index,
+ int error_policy);
+
+ // Lock functions
+ //
+ virtual int lock_detect(u_int32_t flags, u_int32_t atype, int *aborted);
+ virtual int lock_get(u_int32_t locker, u_int32_t flags, Dbt *obj,
+ db_lockmode_t lock_mode, DbLock *lock);
+ virtual int lock_id(u_int32_t *idp);
+ virtual int lock_id_free(u_int32_t id);
+ virtual int lock_put(DbLock *lock);
+ virtual int lock_stat(DB_LOCK_STAT **statp, u_int32_t flags);
+ virtual int lock_stat_print(u_int32_t flags);
+ virtual int lock_vec(u_int32_t locker, u_int32_t flags,
+ DB_LOCKREQ list[], int nlist, DB_LOCKREQ **elistp);
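+
+	// A hedged sketch of direct lock-subsystem use (the object name
+	// and handle names are illustrative assumptions):
+	//
+	//	u_int32_t locker;
+	//	DbLock lock;
+	//	Dbt obj((void *)"object-name", 11);
+	//	env.lock_id(&locker);
+	//	env.lock_get(locker, 0, &obj, DB_LOCK_WRITE, &lock);
+	//	// ...critical section...
+	//	env.lock_put(&lock);
+	//	env.lock_id_free(locker);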
+
+ // Log functions
+ //
+ virtual int log_archive(char **list[], u_int32_t flags);
+ static int log_compare(const DbLsn *lsn0, const DbLsn *lsn1);
+ virtual int log_cursor(DbLogc **cursorp, u_int32_t flags);
+ virtual int log_file(DbLsn *lsn, char *namep, size_t len);
+ virtual int log_flush(const DbLsn *lsn);
+ virtual int log_get_config(u_int32_t, int *);
+ virtual int log_put(DbLsn *lsn, const Dbt *data, u_int32_t flags);
+ virtual int log_printf(DbTxn *, const char *, ...);
+ virtual int log_set_config(u_int32_t, int);
+ virtual int log_stat(DB_LOG_STAT **spp, u_int32_t flags);
+ virtual int log_stat_print(u_int32_t flags);
+ virtual int log_verify(DB_LOG_VERIFY_CONFIG *);
+
+ // Mpool functions
+ //
+ virtual int memp_fcreate(DbMpoolFile **dbmfp, u_int32_t flags);
+ virtual int memp_register(int ftype,
+ pgin_fcn_type pgin_fcn,
+ pgout_fcn_type pgout_fcn);
+ virtual int memp_stat(DB_MPOOL_STAT
+ **gsp, DB_MPOOL_FSTAT ***fsp, u_int32_t flags);
+ virtual int memp_stat_print(u_int32_t flags);
+ virtual int memp_sync(DbLsn *lsn);
+ virtual int memp_trickle(int pct, int *nwrotep);
+
+	// Mutex functions
+ //
+ virtual int mutex_alloc(u_int32_t, db_mutex_t *);
+ virtual int mutex_free(db_mutex_t);
+ virtual int mutex_get_align(u_int32_t *);
+ virtual int mutex_get_increment(u_int32_t *);
+ virtual int mutex_get_init(u_int32_t *);
+ virtual int mutex_get_max(u_int32_t *);
+ virtual int mutex_get_tas_spins(u_int32_t *);
+ virtual int mutex_lock(db_mutex_t);
+ virtual int mutex_set_align(u_int32_t);
+ virtual int mutex_set_increment(u_int32_t);
+ virtual int mutex_set_init(u_int32_t);
+ virtual int mutex_set_max(u_int32_t);
+ virtual int mutex_set_tas_spins(u_int32_t);
+ virtual int mutex_stat(DB_MUTEX_STAT **, u_int32_t);
+ virtual int mutex_stat_print(u_int32_t);
+ virtual int mutex_unlock(db_mutex_t);
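+
+	// An illustrative pairing of the mutex calls above (the flag
+	// choice is an assumption):
+	//
+	//	db_mutex_t m;
+	//	env.mutex_alloc(DB_MUTEX_PROCESS_ONLY, &m);
+	//	env.mutex_lock(m);
+	//	// ...critical section...
+	//	env.mutex_unlock(m);
+	//	env.mutex_free(m);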
+
+ // Transaction functions
+ //
+ virtual int txn_begin(DbTxn *pid, DbTxn **tid, u_int32_t flags);
+ virtual int txn_checkpoint(u_int32_t kbyte, u_int32_t min,
+ u_int32_t flags);
+ virtual int txn_recover(DbPreplist *preplist, long count,
+ long *retp, u_int32_t flags);
+ virtual int txn_stat(DB_TXN_STAT **statp, u_int32_t flags);
+ virtual int txn_stat_print(u_int32_t flags);
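+
+	// A hedged sketch of the usual transaction lifecycle (the Db
+	// handle and the key/data Dbts are assumptions):
+	//
+	//	DbTxn *txn;
+	//	env.txn_begin(NULL, &txn, 0);
+	//	if (db->put(txn, &key, &data, 0) == 0)
+	//		txn->commit(0);
+	//	else
+	//		txn->abort();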
+
+ // Replication functions
+ //
+ virtual int rep_elect(u_int32_t, u_int32_t, u_int32_t);
+ virtual int rep_flush();
+ virtual int rep_process_message(Dbt *, Dbt *, int, DbLsn *);
+ virtual int rep_start(Dbt *, u_int32_t);
+ virtual int rep_stat(DB_REP_STAT **statp, u_int32_t flags);
+ virtual int rep_stat_print(u_int32_t flags);
+ virtual int rep_get_clockskew(u_int32_t *, u_int32_t *);
+ virtual int rep_set_clockskew(u_int32_t, u_int32_t);
+ virtual int rep_get_limit(u_int32_t *, u_int32_t *);
+ virtual int rep_set_limit(u_int32_t, u_int32_t);
+ virtual int rep_set_transport(int, int (*)(DbEnv *,
+ const Dbt *, const Dbt *, const DbLsn *, int, u_int32_t));
+ virtual int rep_set_request(u_int32_t, u_int32_t);
+ virtual int rep_get_request(u_int32_t *, u_int32_t *);
+ virtual int get_thread_count(u_int32_t *);
+ virtual int set_thread_count(u_int32_t);
+ virtual int get_thread_id_fn(
+ void (**)(DbEnv *, pid_t *, db_threadid_t *));
+ virtual int set_thread_id(void (*)(DbEnv *, pid_t *, db_threadid_t *));
+ virtual int get_thread_id_string_fn(
+ char *(**)(DbEnv *, pid_t, db_threadid_t, char *));
+ virtual int set_thread_id_string(char *(*)(DbEnv *,
+ pid_t, db_threadid_t, char *));
+ virtual int rep_set_config(u_int32_t, int);
+ virtual int rep_get_config(u_int32_t, int *);
+ virtual int rep_sync(u_int32_t flags);
+
+ // Advanced replication functions
+ //
+ virtual int rep_get_nsites(u_int32_t *n);
+ virtual int rep_set_nsites(u_int32_t n);
+ virtual int rep_get_priority(u_int32_t *priorityp);
+ virtual int rep_set_priority(u_int32_t priority);
+ virtual int rep_get_timeout(int which, db_timeout_t *timeout);
+ virtual int rep_set_timeout(int which, db_timeout_t timeout);
+ virtual int repmgr_channel(int eid, DbChannel **channel,
+ u_int32_t flags);
+ virtual int repmgr_get_ack_policy(int *policy);
+ virtual int repmgr_set_ack_policy(int policy);
+ virtual int repmgr_local_site(DbSite **site);
+ virtual int repmgr_msg_dispatch(void (*) (DbEnv *,
+ DbChannel *, Dbt *, u_int32_t, u_int32_t), u_int32_t flags);
+ virtual int repmgr_site(const char *host, u_int port, DbSite **site,
+ u_int32_t flags);
+ virtual int repmgr_site_by_eid(int eid, DbSite **site);
+ virtual int repmgr_site_list(u_int *countp, DB_REPMGR_SITE **listp);
+ virtual int repmgr_start(int nthreads, u_int32_t flags);
+ virtual int repmgr_stat(DB_REPMGR_STAT **statp, u_int32_t flags);
+ virtual int repmgr_stat_print(u_int32_t flags);
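+
+	// A hedged sketch of bringing up a replication manager site
+	// (host, port, and thread count are illustrative assumptions):
+	//
+	//	DbSite *site;
+	//	env.repmgr_site("localhost", 5001, &site, 0);
+	//	site->set_config(DB_LOCAL_SITE, 1);
+	//	site->close();
+	//	env.repmgr_start(3, DB_REP_MASTER);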
+
+ // Conversion functions
+ //
+ virtual ENV *get_ENV()
+ {
+ return imp_->env;
+ }
+
+ virtual DB_ENV *get_DB_ENV()
+ {
+ return imp_;
+ }
+
+ virtual const DB_ENV *get_const_DB_ENV() const
+ {
+ return imp_;
+ }
+
+ static DbEnv* get_DbEnv(DB_ENV *dbenv)
+ {
+ return dbenv ? (DbEnv *)dbenv->api1_internal : 0;
+ }
+
+ static const DbEnv* get_const_DbEnv(const DB_ENV *dbenv)
+ {
+ return dbenv ? (const DbEnv *)dbenv->api1_internal : 0;
+ }
+
+ u_int32_t get_create_flags() const
+ {
+ return construct_flags_;
+ }
+
+ // For internal use only.
+ static DbEnv* wrap_DB_ENV(DB_ENV *dbenv);
+
+ // These are public only because they need to be called
+ // via C functions. They should never be called by users
+ // of this class.
+ //
+ static int _app_dispatch_intercept(DB_ENV *dbenv, DBT *dbt, DB_LSN *lsn,
+ db_recops op);
+ static int _backup_close_intercept(DB_ENV *dbenv,
+ const char *dbname, void *handle);
+ static int _backup_open_intercept(DB_ENV *dbenv,
+ const char *dbname, const char *target, void **handle);
+ static int _backup_write_intercept(DB_ENV *dbenv, u_int32_t off_gbytes,
+ u_int32_t off_bytes, u_int32_t size, u_int8_t *buf, void *handle);
+ static void _paniccall_intercept(DB_ENV *dbenv, int errval);
+ static void _feedback_intercept(DB_ENV *dbenv, int opcode, int pct);
+ static void _event_func_intercept(DB_ENV *dbenv, u_int32_t, void *);
+ static int _isalive_intercept(DB_ENV *dbenv, pid_t pid,
+ db_threadid_t thrid, u_int32_t flags);
+ static int _rep_send_intercept(DB_ENV *dbenv, const DBT *cntrl,
+ const DBT *data, const DB_LSN *lsn, int id, u_int32_t flags);
+ static void _stream_error_function(const DB_ENV *dbenv,
+ const char *prefix, const char *message);
+ static void _stream_message_function(const DB_ENV *dbenv,
+ const char *message);
+ static void _thread_id_intercept(DB_ENV *dbenv, pid_t *pidp,
+ db_threadid_t *thridp);
+ static char *_thread_id_string_intercept(DB_ENV *dbenv, pid_t pid,
+ db_threadid_t thrid, char *buf);
+ static void _message_dispatch_intercept(DB_ENV *dbenv,
+ DB_CHANNEL *dbchannel, DBT *request, u_int32_t nrequest,
+ u_int32_t cb_flags);
+
+private:
+ void cleanup();
+ int initialize(DB_ENV *dbenv);
+ int error_policy();
+
+ // For internal use only.
+ DbEnv(DB_ENV *, u_int32_t flags);
+
+ // no copying
+ DbEnv(const DbEnv &);
+ void operator = (const DbEnv &);
+
+ // instance data
+ DB_ENV *imp_;
+ int construct_error_;
+ u_int32_t construct_flags_;
+ __DB_STD(ostream) *error_stream_;
+ __DB_STD(ostream) *message_stream_;
+
+ int (*app_dispatch_callback_)(DbEnv *, Dbt *, DbLsn *, db_recops);
+ int (*backup_close_callback_)(DbEnv *, const char *, void *);
+ int (*backup_open_callback_)(
+ DbEnv *, const char *, const char *, void **);
+ int (*backup_write_callback_)(
+ DbEnv *, u_int32_t, u_int32_t, u_int32_t, u_int8_t *, void *);
+ int (*isalive_callback_)(DbEnv *, pid_t, db_threadid_t, u_int32_t);
+ void (*error_callback_)(const DbEnv *, const char *, const char *);
+ void (*feedback_callback_)(DbEnv *, int, int);
+ void (*message_callback_)(const DbEnv *, const char *);
+ void (*paniccall_callback_)(DbEnv *, int);
+ void (*event_func_callback_)(DbEnv *, u_int32_t, void *);
+ int (*rep_send_callback_)(DbEnv *, const Dbt *, const Dbt *,
+ const DbLsn *, int, u_int32_t);
+ void (*thread_id_callback_)(DbEnv *, pid_t *, db_threadid_t *);
+ char *(*thread_id_string_callback_)(DbEnv *, pid_t, db_threadid_t,
+ char *);
+ void (*message_dispatch_callback_)(DbEnv *, DbChannel *, Dbt *,
+ u_int32_t, u_int32_t);
+};
+
+//
+// Heap record id
+//
+class _exported DbHeapRecordId : private DB_HEAP_RID
+{
+public:
+ db_pgno_t get_pgno() const { return pgno; }
+ void set_pgno(db_pgno_t value) { pgno = value; }
+
+ db_indx_t get_indx() const { return indx; }
+ void set_indx(db_indx_t value) { indx = value; }
+
+ DB_HEAP_RID *get_DB_HEAP_RID() { return (DB_HEAP_RID *)this; }
+ const DB_HEAP_RID *get_const_DB_HEAP_RID() const
+ { return (const DB_HEAP_RID *)this; }
+
+ static DbHeapRecordId* get_DbHeapRecordId(DB_HEAP_RID *rid)
+ { return (DbHeapRecordId *)rid; }
+ static const DbHeapRecordId* get_const_DbHeapRecordId(DB_HEAP_RID *rid)
+ { return (const DbHeapRecordId *)rid; }
+
+ DbHeapRecordId(db_pgno_t pgno, db_indx_t indx);
+ DbHeapRecordId();
+ ~DbHeapRecordId();
+ DbHeapRecordId(const DbHeapRecordId &);
+ DbHeapRecordId &operator = (const DbHeapRecordId &);
+};
+
+//
+// Lock
+//
+class _exported DbLock
+{
+ friend class DbEnv;
+
+public:
+ DbLock();
+ DbLock(const DbLock &);
+ DbLock &operator = (const DbLock &);
+
+protected:
+ // We can add data to this class if needed
+ // since its contained class is not allocated by db.
+ // (see comment at top)
+
+ DbLock(DB_LOCK);
+ DB_LOCK lock_;
+};
+
+//
+// Log cursor
+//
+class _exported DbLogc : protected DB_LOGC
+{
+ friend class DbEnv;
+
+public:
+ int close(u_int32_t _flags);
+ int get(DbLsn *lsn, Dbt *data, u_int32_t _flags);
+ int version(u_int32_t *versionp, u_int32_t _flags);
+
+private:
+ // No data is permitted in this class (see comment at top)
+
+	// Note: use DbEnv::log_cursor() to get pointers to a DbLogc,
+	// and call DbLogc::close() rather than delete to release them.
+ //
+ DbLogc();
+ ~DbLogc();
+
+ // no copying
+	DbLogc(const DbLogc &);
+	DbLogc &operator = (const DbLogc &);
+};
+
+//
+// Log sequence number
+//
+class _exported DbLsn : public DB_LSN
+{
+ friend class DbEnv; // friendship needed to cast to base class
+ friend class DbLogc; // friendship needed to cast to base class
+};
+
+//
+// Memory pool file
+//
+class _exported DbMpoolFile
+{
+ friend class DbEnv;
+ friend class Db;
+
+public:
+ int close(u_int32_t flags);
+ int get(db_pgno_t *pgnoaddr, DbTxn *txn, u_int32_t flags, void *pagep);
+ int get_clear_len(u_int32_t *len);
+ int get_fileid(u_int8_t *fileid);
+ int get_flags(u_int32_t *flagsp);
+ int get_ftype(int *ftype);
+ int get_last_pgno(db_pgno_t *pgnop);
+ int get_lsn_offset(int32_t *offsetp);
+ int get_maxsize(u_int32_t *gbytes, u_int32_t *bytes);
+ int get_pgcookie(DBT *dbt);
+ int get_priority(DB_CACHE_PRIORITY *priorityp);
+ int get_transactional(void);
+ int open(const char *file, u_int32_t flags, int mode, size_t pagesize);
+ int put(void *pgaddr, DB_CACHE_PRIORITY priority, u_int32_t flags);
+ int set_clear_len(u_int32_t len);
+ int set_fileid(u_int8_t *fileid);
+ int set_flags(u_int32_t flags, int onoff);
+ int set_ftype(int ftype);
+ int set_lsn_offset(int32_t offset);
+ int set_maxsize(u_int32_t gbytes, u_int32_t bytes);
+ int set_pgcookie(DBT *dbt);
+ int set_priority(DB_CACHE_PRIORITY priority);
+ int sync();
+
+ virtual DB_MPOOLFILE *get_DB_MPOOLFILE()
+ {
+ return imp_;
+ }
+
+ virtual const DB_MPOOLFILE *get_const_DB_MPOOLFILE() const
+ {
+ return imp_;
+ }
+
+private:
+ DB_MPOOLFILE *imp_;
+
+ // We can add data to this class if needed
+ // since it is implemented via a pointer.
+ // (see comment at top)
+
+ // Note: use DbEnv::memp_fcreate() to get pointers to a DbMpoolFile,
+ // and call DbMpoolFile::close() rather than delete to release them.
+ //
+ DbMpoolFile();
+
+ // Shut g++ up.
+protected:
+ virtual ~DbMpoolFile();
+
+private:
+ // no copying
+ DbMpoolFile(const DbMpoolFile &);
+ void operator = (const DbMpoolFile &);
+};
+
+//
+// This is filled in and returned by the DbEnv::txn_recover() method.
+//
+class _exported DbPreplist
+{
+public:
+ DbTxn *txn;
+ u_int8_t gid[DB_GID_SIZE];
+};
+
+//
+// A sequence record in a database
+//
+class _exported DbSequence
+{
+public:
+ DbSequence(Db *db, u_int32_t flags);
+ virtual ~DbSequence();
+
+ int open(DbTxn *txnid, Dbt *key, u_int32_t flags);
+ int initial_value(db_seq_t value);
+ int close(u_int32_t flags);
+ int remove(DbTxn *txnid, u_int32_t flags);
+ int stat(DB_SEQUENCE_STAT **sp, u_int32_t flags);
+ int stat_print(u_int32_t flags);
+
+ int get(DbTxn *txnid, int32_t delta, db_seq_t *retp, u_int32_t flags);
+ int get_cachesize(int32_t *sizep);
+ int set_cachesize(int32_t size);
+ int get_flags(u_int32_t *flagsp);
+ int set_flags(u_int32_t flags);
+ int get_range(db_seq_t *minp, db_seq_t *maxp);
+ int set_range(db_seq_t min, db_seq_t max);
+
+ Db *get_db();
+ Dbt *get_key();
+
+ virtual DB_SEQUENCE *get_DB_SEQUENCE()
+ {
+ return imp_;
+ }
+
+ virtual const DB_SEQUENCE *get_const_DB_SEQUENCE() const
+ {
+ return imp_;
+ }
+
+ static DbSequence* get_DbSequence(DB_SEQUENCE *seq)
+ {
+ return (DbSequence *)seq->api_internal;
+ }
+
+ static const DbSequence* get_const_DbSequence(const DB_SEQUENCE *seq)
+ {
+ return (const DbSequence *)seq->api_internal;
+ }
+
+ // For internal use only.
+ static DbSequence* wrap_DB_SEQUENCE(DB_SEQUENCE *seq);
+
+private:
+ DbSequence(DB_SEQUENCE *seq);
+ // no copying
+ DbSequence(const DbSequence &);
+ DbSequence &operator = (const DbSequence &);
+
+ DB_SEQUENCE *imp_;
+ DBT key_;
+};
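+
+// A hedged usage sketch (the Db handle `db` and key Dbt are
+// assumptions, not part of this header):
+//
+//	DbSequence seq(db, 0);
+//	seq.initial_value(1);
+//	seq.open(NULL, &key, DB_CREATE);
+//	db_seq_t id;
+//	seq.get(NULL, 1, &id, 0);	// atomically allocate the next id
+//	seq.close(0);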
+
+//
+// A site in a replication group
+//
+class _exported DbSite
+{
+ friend class DbEnv;
+
+public:
+ int close();
+ int get_address(const char **hostp, u_int *port);
+ int get_config(u_int32_t which, u_int32_t *value);
+ int get_eid(int *eidp);
+ int remove();
+ int set_config(u_int32_t which, u_int32_t value);
+
+ virtual DB_SITE *get_DB_SITE()
+ {
+ return imp_;
+ }
+
+ virtual const DB_SITE *get_const_DB_SITE() const
+ {
+ return imp_;
+ }
+
+private:
+ DbSite();
+ virtual ~DbSite();
+
+ // no copying
+ DbSite(const DbSite &);
+ DbSite &operator = (const DbSite &);
+ DB_SITE *imp_;
+};
+
+//
+// Transaction
+//
+class _exported DbTxn
+{
+ friend class DbEnv;
+
+public:
+ int abort();
+ int commit(u_int32_t flags);
+ int discard(u_int32_t flags);
+ u_int32_t id();
+ int get_name(const char **namep);
+ int get_priority(u_int32_t *priorityp);
+ int prepare(u_int8_t *gid);
+ int set_name(const char *name);
+ int set_priority(u_int32_t priority);
+ int set_timeout(db_timeout_t timeout, u_int32_t flags);
+
+ virtual DB_TXN *get_DB_TXN()
+ {
+ return imp_;
+ }
+
+ virtual const DB_TXN *get_const_DB_TXN() const
+ {
+ return imp_;
+ }
+
+ static DbTxn* get_DbTxn(DB_TXN *txn)
+ {
+ return (DbTxn *)txn->api_internal;
+ }
+
+ static const DbTxn* get_const_DbTxn(const DB_TXN *txn)
+ {
+ return (const DbTxn *)txn->api_internal;
+ }
+
+ // For internal use only.
+ static DbTxn* wrap_DB_TXN(DB_TXN *txn);
+ void remove_child_txn(DbTxn *kid);
+ void add_child_txn(DbTxn *kid);
+
+ void set_parent(DbTxn *ptxn)
+ {
+ parent_txn_ = ptxn;
+ }
+
+private:
+ DB_TXN *imp_;
+
+	// We use a TAILQ to store this object's child DbTxn objects, and
+	// each child's "parent_txn_" points back to this DbTxn object.
+ //
+	// If imp_ has a parent transaction that is not wrapped by the
+	// DbTxn class, parent_txn_ will be NULL, since we don't need to
+	// maintain the parent-child relationship. The relationship only
+	// helps to delete unresolved children when the parent is resolved.
+ DbTxn *parent_txn_;
+
+ // We can add data to this class if needed
+ // since it is implemented via a pointer.
+ // (see comment at top)
+
+ // Note: use DbEnv::txn_begin() to get pointers to a DbTxn,
+	// and call DbTxn::abort() or DbTxn::commit() rather than
+ // delete to release them.
+ //
+ DbTxn(DbTxn *ptxn);
+ // For internal use only.
+ DbTxn(DB_TXN *txn, DbTxn *ptxn);
+ virtual ~DbTxn();
+
+ // no copying
+ DbTxn(const DbTxn &);
+ void operator = (const DbTxn &);
+
+ /*
+ * !!!
+ * Explicit representations of structures from queue.h.
+ * TAILQ_HEAD(__children, DbTxn) children;
+ */
+ struct __children {
+ DbTxn *tqh_first;
+ DbTxn **tqh_last;
+ } children;
+
+ /*
+ * !!!
+ * Explicit representations of structures from queue.h.
+ * TAILQ_ENTRY(DbTxn) child_entry;
+ */
+ struct {
+ DbTxn *tqe_next;
+ DbTxn **tqe_prev;
+ } child_entry;
+};
+
+//
+// A chunk of data, maybe a key or value.
+//
+class _exported Dbt : private DBT
+{
+ friend class Db;
+ friend class Dbc;
+ friend class DbEnv;
+ friend class DbLogc;
+ friend class DbSequence;
+
+public:
+ // key/data
+ void *get_data() const { return data; }
+ void set_data(void *value) { data = value; }
+
+ // key/data length
+ u_int32_t get_size() const { return size; }
+ void set_size(u_int32_t value) { size = value; }
+
+ // RO: length of user buffer.
+ u_int32_t get_ulen() const { return ulen; }
+ void set_ulen(u_int32_t value) { ulen = value; }
+
+ // RO: get/put record length.
+ u_int32_t get_dlen() const { return dlen; }
+ void set_dlen(u_int32_t value) { dlen = value; }
+
+ // RO: get/put record offset.
+ u_int32_t get_doff() const { return doff; }
+ void set_doff(u_int32_t value) { doff = value; }
+
+ // flags
+ u_int32_t get_flags() const { return flags; }
+ void set_flags(u_int32_t value) { flags = value; }
+
+ // Conversion functions
+ DBT *get_DBT() { return (DBT *)this; }
+ const DBT *get_const_DBT() const { return (const DBT *)this; }
+
+ static Dbt* get_Dbt(DBT *dbt) { return (Dbt *)dbt; }
+ static const Dbt* get_const_Dbt(const DBT *dbt)
+ { return (const Dbt *)dbt; }
+
+ Dbt(void *data, u_int32_t size);
+ Dbt();
+ ~Dbt();
+ Dbt(const Dbt &);
+ Dbt &operator = (const Dbt &);
+
+private:
+ // Note: no extra data appears in this class (other than
+ // inherited from DBT) since we need DBT and Dbt objects
+	// to have interchangeable pointers.
+ //
+ // When subclassing this class, remember that callback
+ // methods like bt_compare, bt_prefix, dup_compare may
+ // internally manufacture DBT objects (which later are
+ // cast to Dbt), so such callbacks might receive objects
+ // not of your subclassed type.
+};
+
+////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////
+//
+// multiple key/data/recno iterator classes
+//
+
+// DbMultipleIterator is a shared private base class for the three types
+// of bulk-return iterator; it should never be instantiated directly,
+// but it handles the functionality shared by its subclasses.
+class _exported DbMultipleIterator
+{
+public:
+ DbMultipleIterator(const Dbt &dbt);
+protected:
+ u_int8_t *data_;
+ u_int32_t *p_;
+};
+
+class _exported DbMultipleKeyDataIterator : private DbMultipleIterator
+{
+public:
+ DbMultipleKeyDataIterator(const Dbt &dbt) : DbMultipleIterator(dbt) {}
+ bool next(Dbt &key, Dbt &data);
+};
+
+class _exported DbMultipleRecnoDataIterator : private DbMultipleIterator
+{
+public:
+ DbMultipleRecnoDataIterator(const Dbt &dbt) : DbMultipleIterator(dbt) {}
+ bool next(db_recno_t &recno, Dbt &data);
+};
+
+class _exported DbMultipleDataIterator : private DbMultipleIterator
+{
+public:
+ DbMultipleDataIterator(const Dbt &dbt) : DbMultipleIterator(dbt) {}
+ bool next(Dbt &data);
+};
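+
+// A hedged sketch of consuming a bulk-return buffer with the iterators
+// above (the cursor and buffer setup are illustrative assumptions):
+//
+//	Dbt key, buf;
+//	buf.set_data(bulk_mem);
+//	buf.set_ulen(bulk_len);		// a multiple of 1024, >= page size
+//	buf.set_flags(DB_DBT_USERMEM);
+//	while (dbc->get(&key, &buf, DB_MULTIPLE_KEY | DB_NEXT) == 0) {
+//		DbMultipleKeyDataIterator it(buf);
+//		Dbt k, d;
+//		while (it.next(k, d))
+//			;	// process one key/data pair
+//	}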
+
+////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////
+//
+// multiple key/data/recno builder classes
+//
+
+// DbMultipleBuilder is a shared private base class for the three types
+// of bulk buffer builders; it should never be instantiated directly,
+// but it handles the functionality shared by its subclasses.
+class _exported DbMultipleBuilder
+{
+public:
+ DbMultipleBuilder(Dbt &dbt);
+protected:
+ Dbt &dbt_;
+ void *p_;
+};
+
+class _exported DbMultipleDataBuilder : DbMultipleBuilder
+{
+public:
+ DbMultipleDataBuilder(Dbt &dbt) : DbMultipleBuilder(dbt) {}
+ bool append(void *dbuf, size_t dlen);
+ bool reserve(void *&ddest, size_t dlen);
+};
+
+class _exported DbMultipleKeyDataBuilder : DbMultipleBuilder
+{
+public:
+ DbMultipleKeyDataBuilder(Dbt &dbt) : DbMultipleBuilder(dbt) {}
+ bool append(void *kbuf, size_t klen, void *dbuf, size_t dlen);
+ bool reserve(void *&kdest, size_t klen, void *&ddest, size_t dlen);
+};
+
+class _exported DbMultipleRecnoDataBuilder
+{
+public:
+ DbMultipleRecnoDataBuilder(Dbt &dbt);
+ bool append(db_recno_t recno, void *dbuf, size_t dlen);
+ bool reserve(db_recno_t recno, void *&ddest, size_t dlen);
+protected:
+ Dbt &dbt_;
+ void *p_;
+};
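+
+// A hedged sketch of filling a bulk buffer for a multi-item put (the
+// memory, lengths, and handles are illustrative assumptions; the data
+// Dbt is ignored for DB_MULTIPLE_KEY puts):
+//
+//	Dbt buf, unused;
+//	buf.set_data(bulk_mem);
+//	buf.set_ulen(bulk_len);
+//	buf.set_flags(DB_DBT_USERMEM);
+//	DbMultipleKeyDataBuilder b(buf);
+//	b.append(k1, k1len, d1, d1len);
+//	b.append(k2, k2len, d2, d2len);
+//	db->put(NULL, &buf, &unused, DB_MULTIPLE_KEY);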
+
+////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////
+//
+// Exception classes
+//
+
+// Almost any error in the DB library throws a DbException.
+// Every exception should be considered an abnormality
+// (e.g. bug, misuse of DB, file system error).
+//
+class _exported DbException : public __DB_STD(exception)
+{
+public:
+ virtual ~DbException() throw();
+ DbException(int err);
+ DbException(const char *description);
+ DbException(const char *description, int err);
+ DbException(const char *prefix, const char *description, int err);
+ int get_errno() const;
+ virtual const char *what() const throw();
+ DbEnv *get_env() const;
+ void set_env(DbEnv *dbenv);
+
+ DbException(const DbException &);
+ DbException &operator = (const DbException &);
+
+private:
+ void describe(const char *prefix, const char *description);
+
+ char *what_;
+ int err_; // errno
+ DbEnv *dbenv_;
+};
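+
+// A hedged sketch of the usual catch pattern (handle names are
+// illustrative assumptions):
+//
+//	try {
+//		db->put(txn, &key, &data, 0);
+//	} catch (DbDeadlockException &) {
+//		txn->abort();	// then retry the transaction
+//	} catch (DbException &e) {
+//		std::cerr << e.what() << std::endl;
+//	}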
+
+//
+// A specific sort of exception that occurs when
+// an operation is aborted to resolve a deadlock.
+//
+class _exported DbDeadlockException : public DbException
+{
+public:
+ virtual ~DbDeadlockException() throw();
+ DbDeadlockException(const char *description);
+
+ DbDeadlockException(const DbDeadlockException &);
+ DbDeadlockException &operator = (const DbDeadlockException &);
+};
+
+//
+// A specific sort of exception that occurs when
+// a lock is not granted, e.g. by lock_get or lock_vec.
+// Note that the Dbt returned by get_obj() is only valid as long as the
+// Dbt used in the offending call.
+//
+class _exported DbLockNotGrantedException : public DbException
+{
+public:
+ virtual ~DbLockNotGrantedException() throw();
+ DbLockNotGrantedException(const char *prefix, db_lockop_t op,
+ db_lockmode_t mode, const Dbt *obj, const DbLock lock, int index);
+ DbLockNotGrantedException(const char *description);
+
+ DbLockNotGrantedException(const DbLockNotGrantedException &);
+ DbLockNotGrantedException &operator =
+ (const DbLockNotGrantedException &);
+
+ db_lockop_t get_op() const;
+ db_lockmode_t get_mode() const;
+ const Dbt* get_obj() const;
+ DbLock *get_lock() const;
+ int get_index() const;
+
+private:
+ db_lockop_t op_;
+ db_lockmode_t mode_;
+ const Dbt *obj_;
+ DbLock *lock_;
+ int index_;
+};
+
+//
+// A specific sort of exception that occurs when
+// user declared memory is insufficient in a Dbt.
+//
+class _exported DbMemoryException : public DbException
+{
+public:
+ virtual ~DbMemoryException() throw();
+ DbMemoryException(Dbt *dbt);
+ DbMemoryException(const char *prefix, Dbt *dbt);
+
+ DbMemoryException(const DbMemoryException &);
+ DbMemoryException &operator = (const DbMemoryException &);
+
+ Dbt *get_dbt() const;
+private:
+ Dbt *dbt_;
+};
+
+//
+// A specific sort of exception that occurs when a change of replication
+// master requires that all handles be re-opened.
+//
+class _exported DbRepHandleDeadException : public DbException
+{
+public:
+ virtual ~DbRepHandleDeadException() throw();
+ DbRepHandleDeadException(const char *description);
+
+ DbRepHandleDeadException(const DbRepHandleDeadException &);
+ DbRepHandleDeadException &operator = (const DbRepHandleDeadException &);
+};
+
+//
+// A specific sort of exception that occurs when
+// recovery is required before continuing DB activity.
+//
+class _exported DbRunRecoveryException : public DbException
+{
+public:
+ virtual ~DbRunRecoveryException() throw();
+ DbRunRecoveryException(const char *description);
+
+ DbRunRecoveryException(const DbRunRecoveryException &);
+ DbRunRecoveryException &operator = (const DbRunRecoveryException &);
+};
+
+////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////
+//
+// Restore default compiler warnings
+//
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+#endif /* !_DB_CXX_H_ */
diff --git a/src/dbinc/db_dispatch.h b/src/dbinc/db_dispatch.h
new file mode 100644
index 00000000..b6382871
--- /dev/null
+++ b/src/dbinc/db_dispatch.h
@@ -0,0 +1,97 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1995, 1996
+ * The President and Fellows of Harvard University. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_DISPATCH_H_
+#define _DB_DISPATCH_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * Declarations and typedefs for the list of transaction IDs used during
+ * recovery. This is a generic list used to pass along whatever information
+ * we need during recovery.
+ */
+typedef enum {
+ TXNLIST_DELETE,
+ TXNLIST_LSN,
+ TXNLIST_TXNID
+} db_txnlist_type;
+
+#define DB_TXNLIST_MASK(hp, n) ((n) % (hp)->nslots)
+struct __db_txnhead {
+ void *td; /* If abort, the detail for the txn. */
+ DB_THREAD_INFO *thread_info; /* Thread information. */
+ u_int32_t maxid; /* Maximum transaction id. */
+ DB_LSN maxlsn; /* Maximum commit lsn. */
+ DB_LSN ckplsn; /* LSN of last retained checkpoint. */
+ DB_LSN trunc_lsn; /* Lsn to which we are going to truncate;
+ * make sure we abort anyone after this. */
+ u_int32_t generation; /* Current generation number. */
+ u_int32_t gen_alloc; /* Number of generations allocated. */
+ struct {
+ u_int32_t generation;
+ u_int32_t txn_min;
+ u_int32_t txn_max;
+ } *gen_array; /* Array of txnids associated with a gen. */
+ u_int nslots;
+ LIST_HEAD(__db_headlink, __db_txnlist) head[1];
+};
+
+#define DB_LSN_STACK_SIZE 4
+struct __db_txnlist {
+ db_txnlist_type type;
+ LIST_ENTRY(__db_txnlist) links;
+ union {
+ struct {
+ u_int32_t txnid;
+ u_int32_t generation;
+ u_int32_t status;
+ } t;
+ struct {
+ u_int32_t stack_size;
+ u_int32_t stack_indx;
+ DB_LSN *lsn_stack;
+ } l;
+ } u;
+};
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* !_DB_DISPATCH_H_ */
diff --git a/src/dbinc/db_int.in b/src/dbinc/db_int.in
new file mode 100644
index 00000000..42439107
--- /dev/null
+++ b/src/dbinc/db_int.in
@@ -0,0 +1,1162 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_INT_H_
+#define _DB_INT_H_
+
+/*******************************************************
+ * Berkeley DB ANSI/POSIX include files.
+ *******************************************************/
+#ifdef HAVE_SYSTEM_INCLUDE_FILES
+#include <sys/types.h>
+#ifdef DIAG_MVCC
+#include <sys/mman.h>
+#endif
+#include <sys/stat.h>
+
+#if defined(HAVE_REPLICATION_THREADS)
+#ifdef HAVE_SYS_SELECT_H
+#include <sys/select.h>
+#endif
+#ifdef HAVE_VXWORKS
+#include <selectLib.h>
+#endif
+#endif
+
+#if TIME_WITH_SYS_TIME
+#include <sys/time.h>
+#include <time.h>
+#else
+#if HAVE_SYS_TIME_H
+#include <sys/time.h>
+#else
+#include <time.h>
+#endif
+#endif
+
+#ifdef HAVE_VXWORKS
+#include <net/uio.h>
+#else
+#include <sys/uio.h>
+#endif
+
+#if defined(HAVE_REPLICATION_THREADS)
+#ifdef HAVE_SYS_SOCKET_H
+#include <sys/socket.h>
+#endif
+#include <netinet/in.h>
+#include <netdb.h>
+#include <arpa/inet.h>
+#endif
+
+#if defined(STDC_HEADERS) || defined(__cplusplus)
+#include <stdarg.h>
+#else
+#include <varargs.h>
+#endif
+
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <signal.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#endif /* !HAVE_SYSTEM_INCLUDE_FILES */
+
+#ifdef DB_WIN32
+#include "dbinc/win_db.h"
+#endif
+
+#ifdef HAVE_DBM
+#undef DB_DBM_HSEARCH
+#define DB_DBM_HSEARCH 1
+#endif
+
+#include "db.h"
+#include "clib_port.h"
+
+#include "dbinc/queue.h"
+#include "dbinc/shqueue.h"
+#include "dbinc/perfmon.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * The Windows compiler needs to be told about structures that are available
+ * outside a dll.
+ */
+#if defined(DB_WIN32) && defined(_MSC_VER) && \
+ !defined(DB_CREATE_DLL) && !defined(_LIB)
+#define __DB_IMPORT __declspec(dllimport)
+#else
+#define __DB_IMPORT
+#endif
+
+/*******************************************************
+ * Forward structure declarations.
+ *******************************************************/
+struct __db_commit_info; typedef struct __db_commit_info DB_COMMIT_INFO;
+struct __db_reginfo_t; typedef struct __db_reginfo_t REGINFO;
+struct __db_txnhead; typedef struct __db_txnhead DB_TXNHEAD;
+struct __db_txnlist; typedef struct __db_txnlist DB_TXNLIST;
+struct __vrfy_childinfo;typedef struct __vrfy_childinfo VRFY_CHILDINFO;
+struct __vrfy_dbinfo; typedef struct __vrfy_dbinfo VRFY_DBINFO;
+struct __vrfy_pageinfo; typedef struct __vrfy_pageinfo VRFY_PAGEINFO;
+
+struct __db_log_verify_info;
+struct __txn_verify_info;
+struct __lv_filereg_info;
+struct __lv_ckp_info;
+struct __lv_timestamp_info;
+typedef struct __db_log_verify_info DB_LOG_VRFY_INFO;
+typedef struct __txn_verify_info VRFY_TXN_INFO;
+typedef struct __lv_filereg_info VRFY_FILEREG_INFO;
+typedef struct __lv_filelife VRFY_FILELIFE;
+typedef struct __lv_ckp_info VRFY_CKP_INFO;
+typedef struct __lv_timestamp_info VRFY_TIMESTAMP_INFO;
+
+/*
+ * TXNINFO_HANDLER --
+ * Callback function pointer type for __iterate_txninfo.
+ */
+typedef int (*TXNINFO_HANDLER) __P((DB_LOG_VRFY_INFO *, VRFY_TXN_INFO *, void *));
+
+typedef SH_TAILQ_HEAD(__hash_head) DB_HASHTAB;
+
+/*******************************************************
+ * General purpose constants and macros.
+ *******************************************************/
+#undef FALSE
+#define FALSE 0
+#undef TRUE
+#define TRUE (!FALSE)
+
+#define MEGABYTE 1048576
+#define GIGABYTE 1073741824
+
+#define NS_PER_MS 1000000 /* Nanoseconds in a millisecond */
+#define NS_PER_US 1000 /* Nanoseconds in a microsecond */
+#define NS_PER_SEC 1000000000 /* Nanoseconds in a second */
+#define US_PER_MS 1000 /* Microseconds in a millisecond */
+#define US_PER_SEC 1000000 /* Microseconds in a second */
+#define MS_PER_SEC 1000 /* Milliseconds in a second */
+
+#define RECNO_OOB 0 /* Illegal record number. */
+
+/*
+ * Define a macro which has no runtime effect, yet avoids triggering empty
+ * statement compiler warnings. Use it as the text of conditionally-null macros.
+ */
+#define NOP_STATEMENT do { } while (0)
+
+/* Test for a power-of-two (tests true for zero, which doesn't matter here). */
+#define POWER_OF_TWO(x) (((x) & ((x) - 1)) == 0)
+
+/* Test for valid page sizes. */
+#define DB_MIN_PGSIZE 0x000200 /* Minimum page size (512). */
+#define DB_MAX_PGSIZE 0x010000 /* Maximum page size (65536). */
+#define IS_VALID_PAGESIZE(x) \
+ (POWER_OF_TWO(x) && (x) >= DB_MIN_PGSIZE && ((x) <= DB_MAX_PGSIZE))
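+
+/*
+ * Worked examples (illustrative, not from the original source):
+ * IS_VALID_PAGESIZE(4096) is true; IS_VALID_PAGESIZE(3000) is false
+ * (not a power of two); IS_VALID_PAGESIZE(256) is false (too small).
+ */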
+
+/* Minimum number of pages cached, by default. */
+#define DB_MINPAGECACHE 16
+
+/*
+ * If we are unable to determine the underlying filesystem block size, use
+ * 8K on the grounds that most OS's use less than 8K for a VM page size.
+ */
+#define DB_DEF_IOSIZE (8 * 1024)
+
+/* Align an integer to a specific boundary. */
+#undef DB_ALIGN
+#define DB_ALIGN(v, bound) \
+ (((v) + (bound) - 1) & ~(((uintmax_t)(bound)) - 1))
+
+/* Increment a pointer to a specific boundary. */
+#undef ALIGNP_INC
+#define ALIGNP_INC(p, bound) \
+ (void *)(((uintptr_t)(p) + (bound) - 1) & ~(((uintptr_t)(bound)) - 1))
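+
+/*
+ * Worked example (illustrative): DB_ALIGN(13, 8) == 16 and
+ * DB_ALIGN(16, 8) == 16; ALIGNP_INC rounds pointers up the same way.
+ */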
+
+/*
+ * DB_ALIGN8 adjusts structure alignments to make sure shared structures have
+ * fixed sizes and field offsets on both 32-bit and 64-bit platforms when
+ * HAVE_MIXED_SIZE_ADDRESSING is defined.
+ */
+#ifdef HAVE_MIXED_SIZE_ADDRESSING
+#define DB_ALIGN8 @DB_STRUCT_ALIGN8@
+#else
+#define DB_ALIGN8
+#endif
+
+/*
+ * Berkeley DB uses the va_copy macro from C99; not all compilers provide
+ * it, so add a simple implementation compatible with pre-C99 compilers.
+ */
+#ifndef va_copy
+#define va_copy(d, s) ((d) = (s))
+#endif
+
+/*
+ * Print an address as a u_long (a u_long is the largest type we can print
+ * portably). Most 64-bit systems have made longs 64-bits, so this should
+ * work.
+ */
+#define P_TO_ULONG(p) ((u_long)(uintptr_t)(p))
+
+/*
+ * Convert a pointer to an integral value.
+ *
+ * The (u_int16_t)(uintptr_t) cast avoids warnings: the (uintptr_t) cast
+ * converts the value to an integral type, and the (u_int16_t) cast converts
+ * it to a small integral type so we don't get complaints when we assign the
+ * final result to an integral type smaller than uintptr_t.
+ */
+#define P_TO_UINT32(p) ((u_int32_t)(uintptr_t)(p))
+#define P_TO_UINT16(p) ((u_int16_t)(uintptr_t)(p))
+#define P_TO_ROFF(p) ((roff_t)(uintptr_t)(p))
+
+/* The converse of P_TO_ROFF() above. */
+#define ROFF_TO_P(roff) ((void *)(uintptr_t)(roff))
+
+/*
+ * There are several on-page structures that are declared to have a number of
+ * fields followed by a variable length array of items. The structure size
+ * without including the variable length array or the address of the first of
+ * those elements can be found using SSZ.
+ *
+ * This macro can also be used to find the offset of a structure element in a
+ * structure. This is used in various places to copy structure elements from
+ * unaligned memory references, e.g., pointers into a packed page.
+ *
+ * There are two versions because compilers object if you take the address of
+ * an array.
+ */
+#undef SSZ
+#define SSZ(name, field) P_TO_UINT16(&(((name *)0)->field))
+
+#undef SSZA
+#define SSZA(name, field) P_TO_UINT16(&(((name *)0)->field[0]))
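+
+/*
+ * Worked example (illustrative): given
+ *	typedef struct { u_int32_t a; u_int8_t b[1]; } EX;
+ * SSZ(EX, a) yields 0 and SSZA(EX, b) yields 4 -- the same results
+ * offsetof() would give for those fields.
+ */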
+
+/* Structure used to print flag values. */
+typedef struct __fn {
+ u_int32_t mask; /* Flag value. */
+ const char *name; /* Flag name. */
+} FN;
+
+/* Set, clear and test flags. */
+#define FLD_CLR(fld, f) (fld) &= ~(f)
+#define FLD_ISSET(fld, f) ((fld) & (f))
+#define FLD_SET(fld, f) (fld) |= (f)
+#define F_CLR(p, f) (p)->flags &= ~(f)
+#define F_ISSET(p, f) ((p)->flags & (f))
+#define F_SET(p, f) (p)->flags |= (f)
+#define F2_CLR(p, f) ((p)->flags2 &= ~(f))
+#define F2_ISSET(p, f) ((p)->flags2 & (f))
+#define F2_SET(p, f) ((p)->flags2 |= (f))
+#define LF_CLR(f) ((flags) &= ~(f))
+#define LF_ISSET(f) ((flags) & (f))
+#define LF_SET(f) ((flags) |= (f))
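+
+/*
+ * Illustrative use (DB_AM_DUP is just an example flag): given a
+ * structure with a "flags" field, F_SET(dbp, DB_AM_DUP) sets the bit
+ * and F_ISSET(dbp, DB_AM_DUP) tests it; the LF_* variants operate on
+ * a local variable named "flags".
+ */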
+
+/*
+ * Calculate a percentage. The values can overflow 32-bit integer arithmetic
+ * so we use floating point.
+ *
+ * When calculating a bytes-vs-page size percentage, we're getting the inverse
+ * of the percentage in all cases, that is, we want 100 minus the percentage we
+ * calculate.
+ */
+#define DB_PCT(v, total) \
+ ((int)((total) == 0 ? 0 : ((double)(v) * 100) / (total)))
+#define DB_PCT_PG(v, total, pgsize) \
+ ((int)((total) == 0 ? 0 : \
+ 100 - ((double)(v) * 100) / (((double)total) * (pgsize))))
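+
+/*
+ * Worked example (illustrative): DB_PCT(1, 4) == 25, and
+ * DB_PCT(3, 0) == 0 rather than dividing by zero.
+ */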
+
+/*
+ * Statistics update shared memory and so are expensive -- don't update the
+ * values unless we're going to display the results.
+ * When performance monitoring is enabled, the changed value can be published
+ * (via DTrace or SystemTap) along with another associated value or two.
+ */
+#undef STAT
+#ifdef HAVE_STATISTICS
+#define STAT(x) x
+#define STAT_ADJUST(env, cat, subcat, val, amount, id) \
+ do { \
+ (val) += (amount); \
+ STAT_PERFMON2((env), cat, subcat, (val), (id)); \
+ } while (0)
+#define STAT_ADJUST_VERB(env, cat, subcat, val, amount, id1, id2) \
+ do { \
+ (val) += (amount); \
+ STAT_PERFMON3((env), cat, subcat, (val), (id1), (id2)); \
+ } while (0)
+#define STAT_INC(env, cat, subcat, val, id) \
+ STAT_ADJUST(env, cat, subcat, (val), 1, (id))
+#define STAT_INC_VERB(env, cat, subcat, val, id1, id2) \
+ STAT_ADJUST_VERB((env), cat, subcat, (val), 1, (id1), (id2))
+/*
+ * STAT_DEC() subtracts one rather than adding (-1) with STAT_ADJUST(); the
+ * latter might generate a compilation warning for an unsigned value.
+ */
+#define STAT_DEC(env, cat, subcat, val, id) \
+ do { \
+ (val)--; \
+ STAT_PERFMON2((env), cat, subcat, (val), (id)); \
+ } while (0)
+/* N.B.: Add a verbose version of STAT_DEC() when needed. */
+
+#define STAT_SET(env, cat, subcat, val, newval, id) \
+ do { \
+ (val) = (newval); \
+ STAT_PERFMON2((env), cat, subcat, (val), (id)); \
+ } while (0)
+#define STAT_SET_VERB(env, cat, subcat, val, newval, id1, id2) \
+ do { \
+ (val) = (newval); \
+ STAT_PERFMON3((env), cat, subcat, (val), (id1), (id2)); \
+ } while (0)
+#else
+#define STAT(x) NOP_STATEMENT
+#define STAT_ADJUST(env, cat, subcat, val, amt, id) NOP_STATEMENT
+#define STAT_ADJUST_VERB(env, cat, subcat, val, amt, id1, id2) NOP_STATEMENT
+#define STAT_INC(env, cat, subcat, val, id) NOP_STATEMENT
+#define STAT_INC_VERB(env, cat, subcat, val, id1, id2) NOP_STATEMENT
+#define STAT_DEC(env, cat, subcat, val, id) NOP_STATEMENT
+#define STAT_SET(env, cat, subcat, val, newval, id) NOP_STATEMENT
+#define STAT_SET_VERB(env, cat, subcat, val, newval, id1, id2) NOP_STATEMENT
+#endif
+
+#if defined HAVE_SIMPLE_THREAD_TYPE
+#define DB_THREADID_INIT(t) COMPQUIET((t), 0)
+#else
+#define DB_THREADID_INIT(t) memset(&(t), 0, sizeof(t))
+#endif
+
+/*
+ * These macros are used when an error condition is first noticed. They allow
+ * one to be notified (via e.g. DTrace, SystemTap, ...) when an error occurs
+ * deep inside DB, rather than when it is returned back through the API.
+ *
+ * The second actual argument to these is the second part of the error or
+ * warning event name. They work when 'errcode' is a symbolic name, e.g.,
+ * EINVAL or DB_LOCK_DEADLOCK, not a variable. Noticing system call failures
+ * would be handled by tracing on syscall exit, e.g., when it returns < 0.
+ */
+#define ERR_ORIGIN(env, errcode) \
+ (PERFMON0(env, error, errcode), errcode)
+
+#define ERR_ORIGIN_MSG(env, errcode, msg) \
+ (PERFMON1(env, error, errcode, msg), errcode)
+
+#define WARNING_ORIGIN(env, errcode) \
+ (PERFMON0(env, warning, errcode), errcode)
+
+/*
+ * Structure used for callback message aggregation.
+ *
+ * Display values in XXX_stat_print calls.
+ */
+typedef struct __db_msgbuf {
+ char *buf; /* Heap allocated buffer. */
+ char *cur; /* Current end of message. */
+ size_t len; /* Allocated length of buffer. */
+} DB_MSGBUF;
+#define DB_MSGBUF_INIT(a) do { \
+ (a)->buf = (a)->cur = NULL; \
+ (a)->len = 0; \
+} while (0)
+#define DB_MSGBUF_FLUSH(env, a) do { \
+ if ((a)->buf != NULL) { \
+ if ((a)->cur != (a)->buf) \
+ __db_msg(env, "%s", (a)->buf); \
+ __os_free(env, (a)->buf); \
+ DB_MSGBUF_INIT(a); \
+ } \
+} while (0)
+#define DB_MSGBUF_REP_FLUSH(env, a, diag_msg, regular_msg) do { \
+ if ((a)->buf != NULL) { \
+ if ((a)->cur != (a)->buf && diag_msg) \
+ __db_repmsg(env, "%s", (a)->buf); \
+ if (regular_msg) \
+ DB_MSGBUF_FLUSH(env, a); \
+ else { \
+ __os_free(env, (a)->buf); \
+ DB_MSGBUF_INIT(a); \
+ } \
+ } \
+} while (0)
+#define STAT_FMT(msg, fmt, type, v) do { \
+ DB_MSGBUF __mb; \
+ DB_MSGBUF_INIT(&__mb); \
+ __db_msgadd(env, &__mb, fmt, (type)(v)); \
+ __db_msgadd(env, &__mb, "\t%s", msg); \
+ DB_MSGBUF_FLUSH(env, &__mb); \
+} while (0)
+#define STAT_HEX(msg, v) \
+ __db_msg(env, "%#lx\t%s", (u_long)(v), msg)
+#define STAT_ISSET(msg, p) \
+ __db_msg(env, "%sSet\t%s", (p) == NULL ? "!" : " ", msg)
+#define STAT_LONG(msg, v) \
+ __db_msg(env, "%ld\t%s", (long)(v), msg)
+#define STAT_LSN(msg, lsnp) \
+ __db_msg(env, "%lu/%lu\t%s", \
+ (u_long)(lsnp)->file, (u_long)(lsnp)->offset, msg)
+#define STAT_POINTER(msg, v) \
+ __db_msg(env, "%#lx\t%s", P_TO_ULONG(v), msg)
+#define STAT_STRING(msg, p) do { \
+ const char *__p = p; /* p may be a function call. */ \
+ __db_msg(env, "%s\t%s", __p == NULL ? "!Set" : __p, msg); \
+} while (0)
+#define STAT_ULONG(msg, v) \
+ __db_msg(env, "%lu\t%s", (u_long)(v), msg)
+
+/*
+ * The following macros are used to control how error and message strings are
+ * output by Berkeley DB. There are essentially three different controls
+ * available:
+ * - Default behavior is to output error strings with their unique identifiers.
+ * - If HAVE_STRIPPED_MESSAGES is enabled, a unique identifier along with any
+ * parameters to the error string will be output.
+ * - If HAVE_LOCALIZATION is defined, and the '_()' macro is implemented, a
+ * gettext or ICU style translation will be done.
+ *
+ * Each new string that will be output should be wrapped in a DB_STR* macro.
+ * There are three versions of this macro for different scenarios:
+ * - DB_STR for strings that need an identifier, and don't have any argument.
+ * - DB_STR_A for strings that need an identifier, and have argument(s).
+ * - DB_STR_P for strings that don't need an identifier, and don't have
+ * arguments.
+ *
+ * Error message IDs are automatically assigned by dist/s_message_id script.
+ */
+#ifdef HAVE_LOCALIZATION
+#define _(msg) msg /* Replace with localization function. */
+#else
+#define _(msg) msg
+#endif
+
+#ifdef HAVE_STRIPPED_MESSAGES
+#define DB_STR_C(msg, fmt) fmt
+#else
+#define DB_STR_C(msg, fmt) _(msg)
+#endif
+
+#define DB_MSGID(id) "BDB" id
+
+#define DB_STR(id, msg) DB_MSGID(id) " " DB_STR_C(msg, "")
+
+#define DB_STR_A(id, msg, fmt) DB_MSGID(id) " " DB_STR_C(msg, fmt)
+
+#define DB_STR_P(msg) _(msg)
+
+/*
+ * There are quite a few places in Berkeley DB where we want to initialize
+ * a DBT from a string or other random pointer type, using a length typed
+ * to size_t in most cases. This macro avoids a lot of casting. The macro
+ * comes in two flavors because we often want to clear the DBT first.
+ */
+#define DB_SET_DBT(dbt, d, s) do { \
+ (dbt).data = (void *)(d); \
+ (dbt).size = (u_int32_t)(s); \
+} while (0)
+#define DB_INIT_DBT(dbt, d, s) do { \
+ memset(&(dbt), 0, sizeof(dbt)); \
+ DB_SET_DBT(dbt, d, s); \
+} while (0)
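+
+/*
+ * Illustrative use (the key string is an assumption):
+ *
+ *	DBT key;
+ *	DB_INIT_DBT(key, "customer", sizeof("customer") - 1);
+ */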
+
+/*******************************************************
+ * API return values
+ *******************************************************/
+/*
+ * Return values that are OK for each different call. Most calls have a
+ * standard "a return of 0 is the only OK value", but some, like db->get,
+ * have DB_NOTFOUND as a return value that really isn't an error.
+ */
+#define DB_RETOK_STD(ret) ((ret) == 0)
+#define DB_RETOK_DBCDEL(ret) ((ret) == 0 || (ret) == DB_KEYEMPTY || \
+ (ret) == DB_NOTFOUND)
+#define DB_RETOK_DBCGET(ret) ((ret) == 0 || (ret) == DB_KEYEMPTY || \
+ (ret) == DB_NOTFOUND)
+#define DB_RETOK_DBCPUT(ret) ((ret) == 0 || (ret) == DB_KEYEXIST || \
+ (ret) == DB_NOTFOUND)
+#define DB_RETOK_DBDEL(ret) DB_RETOK_DBCDEL(ret)
+#define DB_RETOK_DBGET(ret) DB_RETOK_DBCGET(ret)
+#define DB_RETOK_DBPUT(ret) ((ret) == 0 || (ret) == DB_KEYEXIST)
+#define DB_RETOK_EXISTS(ret) DB_RETOK_DBCGET(ret)
+#define DB_RETOK_LGGET(ret) ((ret) == 0 || (ret) == DB_NOTFOUND)
+#define DB_RETOK_MPGET(ret) ((ret) == 0 || (ret) == DB_PAGE_NOTFOUND)
+#define DB_RETOK_REPPMSG(ret) ((ret) == 0 || \
+ (ret) == DB_REP_IGNORE || \
+ (ret) == DB_REP_ISPERM || \
+ (ret) == DB_REP_NEWMASTER || \
+ (ret) == DB_REP_NEWSITE || \
+ (ret) == DB_REP_NOTPERM || \
+ (ret) == DB_REP_WOULDROLLBACK)
+#define DB_RETOK_REPMGR_LOCALSITE(ret) ((ret) == 0 || (ret) == DB_NOTFOUND)
+#define DB_RETOK_REPMGR_START(ret) ((ret) == 0 || (ret) == DB_REP_IGNORE)
+#define DB_RETOK_TXNAPPLIED(ret) ((ret) == 0 || \
+ (ret) == DB_NOTFOUND || \
+ (ret) == DB_TIMEOUT || \
+ (ret) == DB_KEYEMPTY)
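+
+/*
+ * Illustrative use (a hypothetical call site): treat DB_NOTFOUND from
+ * a get as a non-error, but report anything else:
+ *
+ *	ret = dbp->get(dbp, txn, &key, &data, 0);
+ *	if (!DB_RETOK_DBGET(ret))
+ *		goto err;
+ */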
+
+/* Find a reasonable operation-not-supported error. */
+#ifdef EOPNOTSUPP
+#define DB_OPNOTSUP EOPNOTSUPP
+#else
+#ifdef ENOTSUP
+#define DB_OPNOTSUP ENOTSUP
+#else
+#define DB_OPNOTSUP EINVAL
+#endif
+#endif
+
+/*******************************************************
+ * Files.
+ *******************************************************/
+/*
+ * We use 1024 as the maximum path length. It's too hard to figure out what
+ * the real path length is, as it was traditionally stored in <sys/param.h>,
+ * and that file isn't always available.
+ */
+#define DB_MAXPATHLEN 1024
+
+#define PATH_DOT "." /* Current working directory. */
+ /* Path separator character(s). */
+#define PATH_SEPARATOR "@PATH_SEPARATOR@"
+
+/*******************************************************
+ * Environment.
+ *******************************************************/
+/* Type passed to __db_appname(). */
+typedef enum {
+ DB_APP_NONE=0, /* No type (region). */
+ DB_APP_DATA, /* Data file. */
+ DB_APP_LOG, /* Log file. */
+ DB_APP_META, /* Persistent metadata file. */
+ DB_APP_RECOVER, /* We are in recovery. */
+ DB_APP_TMP /* Temporary file. */
+} APPNAME;
+
+/*
+ * A set of macros to check if various functionality has been configured.
+ *
+ * ALIVE_ON The is_alive function is configured.
+ * CDB_LOCKING CDB product locking.
+ * CRYPTO_ON Security has been configured.
+ * LOCKING_ON Locking has been configured.
+ * LOGGING_ON Logging has been configured.
+ * MUTEX_ON Mutexes have been configured.
+ * MPOOL_ON Memory pool has been configured.
+ * REP_ON Replication has been configured.
+ * TXN_ON Transactions have been configured.
+ *
+ * REP_ON is more complex than most: if the BDB library was compiled without
+ * replication support, ENV->rep_handle will be NULL; if the BDB library has
+ * replication support, but it was not configured, the region reference will
+ * be NULL.
+ */
+#define ALIVE_ON(env) ((env)->dbenv->is_alive != NULL)
+#define CDB_LOCKING(env) F_ISSET(env, ENV_CDB)
+#define CRYPTO_ON(env) ((env)->crypto_handle != NULL)
+#define LOCKING_ON(env) ((env)->lk_handle != NULL)
+#define LOGGING_ON(env) ((env)->lg_handle != NULL)
+#define MPOOL_ON(env) ((env)->mp_handle != NULL)
+#define MUTEX_ON(env) ((env)->mutex_handle != NULL)
+#define REP_ON(env) \
+ ((env)->rep_handle != NULL && (env)->rep_handle->region != NULL)
+#define TXN_ON(env) ((env)->tx_handle != NULL)
+
+/*
+ * STD_LOCKING Standard locking, that is, locking was configured and CDB
+ * was not. We do not do locking in off-page duplicate trees,
+ * so we check for that in the cursor first.
+ */
+#define STD_LOCKING(dbc) \
+ (!F_ISSET(dbc, DBC_OPD) && \
+ !CDB_LOCKING((dbc)->env) && LOCKING_ON((dbc)->env))
+
+/*
+ * IS_RECOVERING: The system is running recovery.
+ */
+#define IS_RECOVERING(env) \
+ (LOGGING_ON(env) && F_ISSET((env)->lg_handle, DBLOG_RECOVER))
+
+/* Initialization methods are often illegal before/after open is called. */
+#define ENV_ILLEGAL_AFTER_OPEN(env, name) \
+ if (F_ISSET((env), ENV_OPEN_CALLED)) \
+ return (__db_mi_open(env, name, 1));
+#define ENV_ILLEGAL_BEFORE_OPEN(env, name) \
+ if (!F_ISSET((env), ENV_OPEN_CALLED)) \
+ return (__db_mi_open(env, name, 0));
+
+/* We're not actually user hostile, honest. */
+#define ENV_REQUIRES_CONFIG(env, handle, i, flags) \
+ if (handle == NULL) \
+ return (__env_not_config(env, i, flags));
+#define ENV_REQUIRES_CONFIG_XX(env, handle, i, flags) \
+ if ((env)->handle->region == NULL) \
+ return (__env_not_config(env, i, flags));
+#define ENV_NOT_CONFIGURED(env, handle, i, flags) \
+ if (F_ISSET((env), ENV_OPEN_CALLED)) \
+ ENV_REQUIRES_CONFIG(env, handle, i, flags)
+
+#define ENV_ENTER_RET(env, ip, ret) do { \
+ ret = 0; \
+ PANIC_CHECK_RET(env, ret); \
+ if (ret == 0) { \
+ if ((env)->thr_hashtab == NULL) \
+ ip = NULL; \
+ else \
+ ret = __env_set_state(env, &(ip), THREAD_ACTIVE);\
+ } \
+} while (0)
+
+#define ENV_ENTER(env, ip) do { \
+ int __ret; \
+ ip = NULL; \
+ ENV_ENTER_RET(env, ip, __ret); \
+ if (__ret != 0) \
+ return (__ret); \
+} while (0)
+
+#define FAILCHK_THREAD(env, ip) do { \
+ if ((ip) != NULL) \
+ (ip)->dbth_state = THREAD_FAILCHK; \
+} while (0)
+
+#define ENV_GET_THREAD_INFO(env, ip) ENV_ENTER(env, ip)
+
+#ifdef DIAGNOSTIC
+#define ENV_LEAVE(env, ip) do { \
+ if ((ip) != NULL) { \
+ DB_ASSERT(env, ((ip)->dbth_state == THREAD_ACTIVE || \
+ (ip)->dbth_state == THREAD_FAILCHK)); \
+ (ip)->dbth_state = THREAD_OUT; \
+ } \
+} while (0)
+#else
+#define ENV_LEAVE(env, ip) do { \
+ if ((ip) != NULL) \
+ (ip)->dbth_state = THREAD_OUT; \
+} while (0)
+#endif
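+
+/*
+ * Illustrative pairing (a hypothetical call site; __env_do_work is
+ * not a real function): public API entry points bracket their work
+ * with ENV_ENTER()/ENV_LEAVE():
+ *
+ *	DB_THREAD_INFO *ip;
+ *	ENV_ENTER(env, ip);
+ *	ret = __env_do_work(env);
+ *	ENV_LEAVE(env, ip);
+ *	return (ret);
+ */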
+#ifdef DIAGNOSTIC
+#define CHECK_THREAD(env) do { \
+ if ((env)->thr_hashtab != NULL) \
+ (void)__env_set_state(env, NULL, THREAD_VERIFY); \
+} while (0)
+#ifdef HAVE_STATISTICS
+#define CHECK_MTX_THREAD(env, mtx) do { \
+ if (mtx->alloc_id != MTX_MUTEX_REGION && \
+ mtx->alloc_id != MTX_ENV_REGION && \
+ mtx->alloc_id != MTX_APPLICATION) \
+ CHECK_THREAD(env); \
+} while (0)
+#else
+#define CHECK_MTX_THREAD(env, mtx) NOP_STATEMENT
+#endif
+#else
+#define CHECK_THREAD(env) NOP_STATEMENT
+#define CHECK_MTX_THREAD(env, mtx) NOP_STATEMENT
+#endif
+
+typedef enum {
+ THREAD_SLOT_NOT_IN_USE=0,
+ THREAD_OUT,
+ THREAD_ACTIVE,
+ THREAD_BLOCKED,
+ THREAD_BLOCKED_DEAD,
+ THREAD_FAILCHK,
+ THREAD_VERIFY
+} DB_THREAD_STATE;
+
+typedef struct __pin_list {
+ roff_t b_ref; /* offset to buffer. */
+ int region; /* region containing buffer. */
+} PIN_LIST;
+#define PINMAX 4
+
+struct __db_thread_info { /* SHARED */
+ pid_t dbth_pid;
+ db_threadid_t dbth_tid;
+ DB_THREAD_STATE dbth_state;
+ SH_TAILQ_ENTRY dbth_links;
+ /*
+ * The next field contains the (process local) reference to the XA
+ * transaction currently associated with this thread of control.
+ */
+ SH_TAILQ_HEAD(__dbth_xatxn) dbth_xatxn;
+ u_int32_t dbth_xa_status;
+ /*
+ * The following fields track which buffers this thread of
+ * control has pinned in the mpool buffer cache.
+ */
+ u_int16_t dbth_pincount; /* Number of pins for this thread. */
+ u_int16_t dbth_pinmax; /* Number of slots allocated. */
+ roff_t dbth_pinlist; /* List of pins. */
+ PIN_LIST dbth_pinarray[PINMAX]; /* Initial array of slots. */
+#ifdef DIAGNOSTIC
+ roff_t dbth_locker; /* Current locker for this thread. */
+ u_int32_t dbth_check_off; /* Count of number of LOCK_OFF calls. */
+#endif
+};
+#ifdef DIAGNOSTIC
+#define LOCK_CHECK_OFF(ip) if ((ip) != NULL) \
+ (ip)->dbth_check_off++
+
+#define LOCK_CHECK_ON(ip) if ((ip) != NULL) \
+ (ip)->dbth_check_off--
+
+#define LOCK_CHECK(dbc, pgno, mode, type) \
+ DB_ASSERT((dbc)->dbp->env, (dbc)->locker == NULL || \
+ __db_haslock((dbc)->dbp->env, \
+ (dbc)->locker, (dbc)->dbp->mpf, pgno, mode, type) == 0)
+#else
+#define LOCK_CHECK_OFF(ip) NOP_STATEMENT
+#define LOCK_CHECK_ON(ip) NOP_STATEMENT
+#define LOCK_CHECK(dbc, pgno, mode, type) NOP_STATEMENT
+#endif
+
+typedef struct __env_thread_info {
+ u_int32_t thr_count;
+ u_int32_t thr_init;
+ u_int32_t thr_max;
+ u_int32_t thr_nbucket;
+ roff_t thr_hashoff;
+} THREAD_INFO;
+
+#define DB_EVENT(env, e, einfo) do { \
+ DB_ENV *__dbenv = (env)->dbenv; \
+ if (__dbenv->db_event_func != NULL) \
+ __dbenv->db_event_func(__dbenv, e, einfo); \
+} while (0)
+
+typedef struct __flag_map {
+ u_int32_t inflag, outflag;
+} FLAG_MAP;
+
+typedef struct __db_backup_handle {
+ int (*open) __P((DB_ENV *, const char *, const char *, void **));
+ int (*write) __P((DB_ENV *,
+ u_int32_t, u_int32_t, u_int32_t, u_int8_t *, void *));
+ int (*close) __P((DB_ENV *, const char *, void *));
+ u_int32_t size;
+ u_int32_t read_count;
+ u_int32_t read_sleep;
+#define BACKUP_WRITE_DIRECT 0x0001
+ int flags;
+} DB_BACKUP;
+
+/*
+ * Internal database environment structure.
+ *
+ * This is the private database environment handle. The public environment
+ * handle is the DB_ENV structure. The library owns this structure, the user
+ * owns the DB_ENV structure. The reason there are two structures is that
+ * the user's configuration outlives any particular DB_ENV->open call, and
+ * separate structures allow us to easily discard internal information without
+ * discarding the user's configuration.
+ */
+struct __env {
+ DB_ENV *dbenv; /* Linked DB_ENV structure */
+
+ /*
+ * The ENV structure can be used concurrently, so field access is
+ * protected.
+ */
+ db_mutex_t mtx_env; /* ENV structure mutex */
+
+ /*
+ * Some fields are included in the ENV structure rather than in the
+ * DB_ENV structure because they are only set as arguments to the
+ * DB_ENV->open method. In other words, because of the historic API,
+ * not for any rational reason.
+ *
+ * Arguments to DB_ENV->open.
+ */
+ char *db_home; /* Database home */
+ u_int32_t open_flags; /* Flags */
+ int db_mode; /* Default open permissions */
+
+ pid_t pid_cache; /* Cached process ID */
+
+ DB_FH *lockfhp; /* fcntl(2) locking file handle */
+
+ DB_LOCKER *env_lref; /* Locker in non-threaded handles */
+
+ DB_DISTAB recover_dtab; /* Dispatch table for recover funcs */
+
+ int dir_mode; /* Intermediate directory perms. */
+
+#define ENV_DEF_DATA_LEN 100
+ u_int32_t data_len; /* Data length in __db_prbytes. */
+
+ /* Thread tracking */
+ u_int32_t thr_nbucket; /* Number of hash buckets */
+ DB_HASHTAB *thr_hashtab; /* Hash table of DB_THREAD_INFO */
+
+ /*
+ * List of open DB handles for this ENV, used for cursor
+ * adjustment. Must be protected for multi-threaded support.
+ */
+ db_mutex_t mtx_dblist;
+ int db_ref; /* DB handle reference count */
+ TAILQ_HEAD(__dblist, __db) dblist;
+
+ /*
+ * List of open file handles for this ENV. Must be protected
+ * for multi-threaded support.
+ */
+ TAILQ_HEAD(__fdlist, __fh_t) fdlist;
+
+ db_mutex_t mtx_mt; /* Mersenne Twister mutex */
+ int mti; /* Mersenne Twister index */
+ u_long *mt; /* Mersenne Twister state vector */
+
+ DB_CIPHER *crypto_handle; /* Crypto handle */
+ DB_LOCKTAB *lk_handle; /* Lock handle */
+ DB_LOG *lg_handle; /* Log handle */
+ DB_MPOOL *mp_handle; /* Mpool handle */
+ DB_MUTEXMGR *mutex_handle; /* Mutex handle */
+ DB_REP *rep_handle; /* Replication handle */
+ DB_TXNMGR *tx_handle; /* Txn handle */
+
+ DB_BACKUP *backup_handle; /* Database copy configuration. */
+
+ /*
+ * XA support.
+ */
+ int xa_rmid; /* XA Resource Manager ID */
+ int xa_ref; /* XA Reference count */
+ TAILQ_ENTRY(__env) links; /* XA environments */
+
+ /* Application callback to copy data to/from a custom data source */
+#define DB_USERCOPY_GETDATA 0x0001
+#define DB_USERCOPY_SETDATA 0x0002
+ int (*dbt_usercopy)
+ __P((DBT *, u_int32_t, void *, u_int32_t, u_int32_t));
+
+ int (*log_verify_wrap) __P((ENV *, const char *, u_int32_t,
+ const char *, const char *, time_t, time_t, u_int32_t, u_int32_t,
+ u_int32_t, u_int32_t, int, int));
+
+ REGINFO *reginfo; /* REGINFO structure reference */
+
+#define DB_TEST_ELECTINIT 1 /* after __rep_elect_init */
+#define DB_TEST_ELECTVOTE1 2 /* after sending VOTE1 */
+#define DB_TEST_NO_PAGES 3 /* before sending PAGE */
+#define DB_TEST_POSTDESTROY 4 /* after destroy op */
+#define DB_TEST_POSTLOG 5 /* after logging all pages */
+#define DB_TEST_POSTLOGMETA 6 /* after logging meta in btree */
+#define DB_TEST_POSTOPEN 7 /* after __os_open */
+#define DB_TEST_POSTSYNC 8 /* after syncing the log */
+#define DB_TEST_PREDESTROY 9 /* before destroy op */
+#define DB_TEST_PREOPEN 10 /* before __os_open */
+#define DB_TEST_REPMGR_PERM 11 /* repmgr perm/archiving tests */
+#define DB_TEST_SUBDB_LOCKS 12 /* subdb locking tests */
+ int test_abort; /* Abort value for testing */
+ int test_check; /* Checkpoint value for testing */
+ int test_copy; /* Copy value for testing */
+
+#define ENV_CDB 0x00000001 /* DB_INIT_CDB */
+#define ENV_DBLOCAL 0x00000002 /* Environment for a private DB */
+#define ENV_LITTLEENDIAN 0x00000004 /* Little endian system. */
+#define ENV_LOCKDOWN 0x00000008 /* DB_LOCKDOWN set */
+#define ENV_NO_OUTPUT_SET 0x00000010 /* No output channel set */
+#define ENV_OPEN_CALLED 0x00000020 /* DB_ENV->open called */
+#define ENV_PRIVATE 0x00000040 /* DB_PRIVATE set */
+#define ENV_RECOVER_FATAL 0x00000080 /* Doing fatal recovery in env */
+#define ENV_REF_COUNTED 0x00000100 /* Region references this handle */
+#define ENV_SYSTEM_MEM 0x00000200 /* DB_SYSTEM_MEM set */
+#define ENV_THREAD 0x00000400 /* DB_THREAD set */
+#define ENV_FORCE_TXN_BULK 0x00000800 /* Txns use bulk mode-for testing */
+ u_int32_t flags;
+};
+
+/*******************************************************
+ * Database Access Methods.
+ *******************************************************/
+/*
+ * DB_IS_THREADED --
+ * The database handle is free-threaded (was opened with DB_THREAD).
+ */
+#define DB_IS_THREADED(dbp) \
+ ((dbp)->mutex != MUTEX_INVALID)
+
+/* Initialization methods are often illegal before/after open is called. */
+#define DB_ILLEGAL_AFTER_OPEN(dbp, name) \
+ if (F_ISSET((dbp), DB_AM_OPEN_CALLED)) \
+ return (__db_mi_open((dbp)->env, name, 1));
+#define DB_ILLEGAL_BEFORE_OPEN(dbp, name) \
+ if (!F_ISSET((dbp), DB_AM_OPEN_CALLED)) \
+ return (__db_mi_open((dbp)->env, name, 0));
+/* Some initialization methods are illegal if the environment isn't local. */
+#define DB_ILLEGAL_IN_ENV(dbp, name) \
+ if (!F_ISSET((dbp)->env, ENV_DBLOCAL)) \
+ return (__db_mi_env((dbp)->env, name));
+#define DB_ILLEGAL_METHOD(dbp, flags) { \
+ int __ret; \
+ if ((__ret = __dbh_am_chk(dbp, flags)) != 0) \
+ return (__ret); \
+}
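+
+/*
+ * Editor's sketch (illustrative): the guards above sit at the top of the
+ * configuration methods; a hypothetical method would read:
+ *
+ * int
+ * __example_db_set_foo(DB *dbp, u_int32_t foo)
+ * {
+ *	DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_foo");
+ *	... record "foo" in the handle ...
+ *	return (0);
+ * }
+ */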
+
+/*
+ * Common DBC->internal fields. Each access method adds additional fields
+ * to this list, but the initial fields are common.
+ */
+#define __DBC_INTERNAL \
+ DBC *opd; /* Off-page duplicate cursor. */\
+ DBC *pdbc; /* Pointer to parent cursor. */ \
+ \
+ void *page; /* Referenced page. */ \
+ u_int32_t part; /* Partition number. */ \
+ db_pgno_t root; /* Tree root. */ \
+ db_pgno_t pgno; /* Referenced page number. */ \
+ db_indx_t indx; /* Referenced key item index. */\
+ \
+ /* Streaming -- cache last position. */ \
+ db_pgno_t stream_start_pgno; /* Last start pgno. */ \
+ u_int32_t stream_off; /* Current offset. */ \
+ db_pgno_t stream_curr_pgno; /* Current overflow page. */ \
+ \
+ DB_LOCK lock; /* Cursor lock. */ \
+ db_lockmode_t lock_mode; /* Lock mode. */
+
+struct __dbc_internal {
+ __DBC_INTERNAL
+};
+
+/* Actions that __db_master_update can take. */
+typedef enum { MU_REMOVE, MU_RENAME, MU_OPEN, MU_MOVE } mu_action;
+
+/*
+ * Access-method-common macro for determining whether a cursor
+ * has been initialized.
+ */
+#ifdef HAVE_PARTITION
+#define IS_INITIALIZED(dbc) (DB_IS_PARTITIONED((dbc)->dbp) ? \
+ ((PART_CURSOR *)(dbc)->internal)->sub_cursor != NULL && \
+ ((PART_CURSOR *)(dbc)->internal)->sub_cursor-> \
+ internal->pgno != PGNO_INVALID : \
+ (dbc)->internal->pgno != PGNO_INVALID)
+#else
+#define IS_INITIALIZED(dbc) ((dbc)->internal->pgno != PGNO_INVALID)
+#endif
+
+/* Free the callback-allocated buffer, if necessary, hanging off of a DBT. */
+#define FREE_IF_NEEDED(env, dbt) \
+ if (F_ISSET((dbt), DB_DBT_APPMALLOC)) { \
+ __os_ufree((env), (dbt)->data); \
+ F_CLR((dbt), DB_DBT_APPMALLOC); \
+ }
+
+/*
+ * Use memory belonging to object "owner" to return the results of
+ * any no-DBT-flag get ops on cursor "dbc".
+ */
+#define SET_RET_MEM(dbc, owner) \
+ do { \
+ (dbc)->rskey = &(owner)->my_rskey; \
+ (dbc)->rkey = &(owner)->my_rkey; \
+ (dbc)->rdata = &(owner)->my_rdata; \
+ } while (0)
+
+/* Make "dest" use the return-data memory that "src" is currently set to use. */
+#define COPY_RET_MEM(src, dest) \
+ do { \
+ (dest)->rskey = (src)->rskey; \
+ (dest)->rkey = (src)->rkey; \
+ (dest)->rdata = (src)->rdata; \
+ } while (0)
+
+/* Reset the returned-memory pointers to their defaults. */
+#define RESET_RET_MEM(dbc) \
+ do { \
+ (dbc)->rskey = &(dbc)->my_rskey; \
+ (dbc)->rkey = &(dbc)->my_rkey; \
+ (dbc)->rdata = &(dbc)->my_rdata; \
+ } while (0)
+
+#define COMPACT_TRUNCATE(c_data) do { \
+ if (c_data->compact_truncate > 1) \
+ c_data->compact_truncate--; \
+} while (0)
+
+/*******************************************************
+ * Mpool.
+ *******************************************************/
+/*
+ * File types for DB access methods. Negative numbers are reserved to DB.
+ */
+#define DB_FTYPE_SET -1 /* Call pgin/pgout functions. */
+#define DB_FTYPE_NOTSET 0 /* Don't call... */
+#define DB_LSN_OFF_NOTSET -1 /* Not yet set. */
+#define DB_CLEARLEN_NOTSET UINT32_MAX /* Not yet set. */
+
+/* Structure used as the DB pgin/pgout pgcookie. */
+typedef struct __dbpginfo {
+ u_int32_t db_pagesize; /* Underlying page size. */
+ u_int32_t flags; /* Some DB_AM flags needed. */
+ DBTYPE type; /* DB type */
+} DB_PGINFO;
+
+/*******************************************************
+ * Log.
+ *******************************************************/
+/* Initialize an LSN to 'zero'. */
+#define ZERO_LSN(LSN) do { \
+ (LSN).file = 0; \
+ (LSN).offset = 0; \
+} while (0)
+#define IS_ZERO_LSN(LSN) ((LSN).file == 0 && (LSN).offset == 0)
+
+#define IS_INIT_LSN(LSN) ((LSN).file == 1 && (LSN).offset == 0)
+#define INIT_LSN(LSN) do { \
+ (LSN).file = 1; \
+ (LSN).offset = 0; \
+} while (0)
+
+#define MAX_LSN(LSN) do { \
+ (LSN).file = UINT32_MAX; \
+ (LSN).offset = UINT32_MAX; \
+} while (0)
+#define IS_MAX_LSN(LSN) \
+ ((LSN).file == UINT32_MAX && (LSN).offset == UINT32_MAX)
+
+/* If logging is turned off, smash the lsn. */
+#define LSN_NOT_LOGGED(LSN) do { \
+ (LSN).file = 0; \
+ (LSN).offset = 1; \
+} while (0)
+#define IS_NOT_LOGGED_LSN(LSN) \
+ ((LSN).file == 0 && (LSN).offset == 1)
+
+/*
+ * LOG_COMPARE -- compare two LSNs.
+ */
+#define LOG_COMPARE(lsn0, lsn1) \
+ ((lsn0)->file != (lsn1)->file ? \
+ ((lsn0)->file < (lsn1)->file ? -1 : 1) : \
+ ((lsn0)->offset != (lsn1)->offset ? \
+ ((lsn0)->offset < (lsn1)->offset ? -1 : 1) : 0))
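+
+/*
+ * Editor's sketch (illustrative; the function name is hypothetical):
+ * LOG_COMPARE orders LSNs the way strcmp orders strings -- first by
+ * file, then by offset within the file.
+ */
+static int
+__example_lsn_newer(const DB_LSN *a, const DB_LSN *b)
+{
+	/* Non-zero iff "a" is strictly later in the log than "b". */
+	return (LOG_COMPARE(a, b) > 0);
+}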
+
+/*******************************************************
+ * Txn.
+ *******************************************************/
+#define DB_NONBLOCK(C) ((C)->txn != NULL && F_ISSET((C)->txn, TXN_NOWAIT))
+#define NOWAIT_FLAG(txn) \
+ ((txn) != NULL && F_ISSET((txn), TXN_NOWAIT) ? DB_LOCK_NOWAIT : 0)
+#define IS_REAL_TXN(txn) \
+ ((txn) != NULL && !F_ISSET(txn, TXN_FAMILY))
+#define IS_SUBTRANSACTION(txn) \
+ ((txn) != NULL && (txn)->parent != NULL)
+
+/* Checks for existence of an XA transaction in access method interfaces. */
+#define XA_CHECK_TXN(ip, txn) \
+ if ((ip) != NULL && (txn) == NULL) { \
+ (txn) = SH_TAILQ_FIRST(&(ip)->dbth_xatxn, __db_txn); \
+ DB_ASSERT(env, txn == NULL || \
+ txn->xa_thr_status == TXN_XA_THREAD_ASSOCIATED); \
+ }
+
+/* Ensure that there is no XA transaction active. */
+#define XA_NO_TXN(ip, retval) { \
+ DB_TXN *__txn; \
+ retval = 0; \
+ if ((ip) != NULL) { \
+ __txn = SH_TAILQ_FIRST(&(ip)->dbth_xatxn, __db_txn); \
+ if (__txn != NULL && \
+ __txn->xa_thr_status == TXN_XA_THREAD_ASSOCIATED) \
+ retval = EINVAL; \
+ } \
+}
+
+/*******************************************************
+ * Crypto.
+ *******************************************************/
+#define DB_IV_BYTES 16 /* Bytes per IV */
+#define DB_MAC_KEY 20 /* Bytes per MAC checksum */
+
+/*******************************************************
+ * Compression
+ *******************************************************/
+#define CMP_INT_SPARE_VAL 0xFC /* Smallest byte value that the integer
+ compression algorithm doesn't use */
+
+#if defined(__cplusplus)
+}
+#endif
+
+/*******************************************************
+ * Remaining general DB includes.
+ *******************************************************/
+@db_int_def@
+
+#include "dbinc/globals.h"
+#include "dbinc/clock.h"
+#include "dbinc/debug.h"
+#include "dbinc/region.h"
+#include "dbinc_auto/env_ext.h"
+#include "dbinc/mutex.h"
+#ifdef HAVE_REPLICATION_THREADS
+#include "dbinc/repmgr.h"
+#endif
+#include "dbinc/rep.h"
+#include "dbinc/os.h"
+#include "dbinc_auto/clib_ext.h"
+#include "dbinc_auto/common_ext.h"
+
+/*******************************************************
+ * Remaining Log.
+ * These need to be defined after the general includes
+ * because they need rep.h from above.
+ *******************************************************/
+/*
+ * Test if the environment is currently logging changes. If we're in recovery
+ * or we're a replication client, we don't need to log changes because they're
+ * already in the log, even though we have a fully functional log system.
+ */
+#define DBENV_LOGGING(env) \
+ (LOGGING_ON(env) && !IS_REP_CLIENT(env) && (!IS_RECOVERING(env)))
+
+/*
+ * Test if we need to log a change. By default, we don't log operations without
+ * associated transactions, unless DIAGNOSTIC, DEBUG_ROP or DEBUG_WOP is
+ * defined. In those builds we want log records for read/write operations
+ * because, when we are trying to debug something, more information is
+ * always better.
+ *
+ * The DBC_RECOVER flag is set when we're in abort, as well as during recovery;
+ * thus DBC_LOGGING may be false for a particular dbc even when DBENV_LOGGING
+ * is true.
+ *
+ * We explicitly use LOGGING_ON/IS_REP_CLIENT here because we don't want to pull
+ * in the log headers, which IS_RECOVERING (and thus DBENV_LOGGING) rely on, and
+ * because DBC_RECOVER should be set anytime IS_RECOVERING would be true.
+ *
+ * If we're not in recovery (i.e., a master doing an abort or a client
+ * applying a txn), then a client's only path through here is on an internal
+ * operation, and a master's only path through here is a transactional
+ * operation. Detect if either is not the case.
+ */
+#if defined(DIAGNOSTIC) || defined(DEBUG_ROP) || defined(DEBUG_WOP)
+#define DBC_LOGGING(dbc) __dbc_logging(dbc)
+#else
+#define DBC_LOGGING(dbc) \
+ ((dbc)->txn != NULL && LOGGING_ON((dbc)->env) && \
+ !F_ISSET((dbc), DBC_RECOVER) && !IS_REP_CLIENT((dbc)->env))
+#endif
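+
+/*
+ * Editor's sketch (illustrative): the idiom used throughout the access
+ * methods.  "__example_op_log" stands in for a generated log routine and
+ * is hypothetical; when we aren't logging, the page LSN is smashed so
+ * recovery never tries to match it against a log record:
+ *
+ * if (DBC_LOGGING(dbc)) {
+ *	if ((ret = __example_op_log(dbc, &lsn, pgno)) != 0)
+ *		return (ret);
+ * } else
+ *	LSN_NOT_LOGGED(lsn);
+ */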
+
+#endif /* !_DB_INT_H_ */
diff --git a/src/dbinc/db_join.h b/src/dbinc/db_join.h
new file mode 100644
index 00000000..aecf059a
--- /dev/null
+++ b/src/dbinc/db_join.h
@@ -0,0 +1,37 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_JOIN_H_
+#define _DB_JOIN_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * Joins use a join cursor that is similar to a regular DB cursor except
+ * that it only supports c_get and c_close functionality. Also, it does
+ * not support the full range of flags for get.
+ */
+typedef struct __join_cursor {
+ u_int8_t *j_exhausted; /* Array of flags; is cursor i exhausted? */
+ DBC **j_curslist; /* Array of cursors in the join: constant. */
+ DBC **j_fdupcurs; /* Cursors w/ first instances of current dup. */
+ DBC **j_workcurs; /* Scratch cursor copies to muck with. */
+ DB *j_primary; /* Primary dbp. */
+ DBT j_key; /* Used to do lookups. */
+ DBT j_rdata; /* Memory used for data return. */
+ u_int32_t j_ncurs; /* How many cursors do we have? */
+#define JOIN_RETRY 0x01 /* Error on primary get; re-return same key. */
+ u_int32_t flags;
+} JOIN_CURSOR;
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_DB_JOIN_H_ */
diff --git a/src/dbinc/db_page.h b/src/dbinc/db_page.h
new file mode 100644
index 00000000..2d4de2e5
--- /dev/null
+++ b/src/dbinc/db_page.h
@@ -0,0 +1,841 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_PAGE_H_
+#define _DB_PAGE_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * DB page formats.
+ *
+ * !!!
+ * This implementation requires that values within the following structures
+ * NOT be padded -- note, ANSI C permits random padding within structures.
+ * If your compiler pads randomly you can just forget ever making DB run on
+ * your system. In addition, no data type can require larger alignment than
+ * its own size, e.g., a 4-byte data element may not require 8-byte alignment.
+ *
+ * Note that key/data lengths are often stored in db_indx_t's -- this is
+ * not accidental, nor does it limit the key/data size. If the key/data
+ * item fits on a page, it's guaranteed to be small enough to fit into a
+ * db_indx_t, and storing it in one saves space.
+ */
+
+#define PGNO_INVALID 0 /* Invalid page number in any database. */
+#define PGNO_BASE_MD 0 /* Base database: metadata page number. */
+
+/* Page types. */
+#define P_INVALID 0 /* Invalid page type. */
+#define __P_DUPLICATE 1 /* Duplicate. DEPRECATED in 3.1 */
+#define P_HASH_UNSORTED 2 /* Hash pages created pre 4.6. DEPRECATED */
+#define P_IBTREE 3 /* Btree internal. */
+#define P_IRECNO 4 /* Recno internal. */
+#define P_LBTREE 5 /* Btree leaf. */
+#define P_LRECNO 6 /* Recno leaf. */
+#define P_OVERFLOW 7 /* Overflow. */
+#define P_HASHMETA 8 /* Hash metadata page. */
+#define P_BTREEMETA 9 /* Btree metadata page. */
+#define P_QAMMETA 10 /* Queue metadata page. */
+#define P_QAMDATA 11 /* Queue data page. */
+#define P_LDUP 12 /* Off-page duplicate leaf. */
+#define P_HASH 13 /* Sorted hash page. */
+#define P_HEAPMETA 14 /* Heap metadata page. */
+#define P_HEAP 15 /* Heap data page. */
+#define P_IHEAP 16 /* Heap internal. */
+#define P_PAGETYPE_MAX 17
+/* Flag to __db_new */
+#define P_DONTEXTEND 0x8000 /* Don't allocate if there are no free pages. */
+
+/*
+ * When we create pages in mpool, we ask mpool to clear some number of bytes
+ * in the header. This number must be at least as big as the regular page
+ * headers and cover enough of the btree and hash meta-data pages to obliterate
+ * the page type.
+ */
+#define DB_PAGE_DB_LEN 32
+#define DB_PAGE_QUEUE_LEN 0
+
+/************************************************************************
+ GENERIC METADATA PAGE HEADER
+ *
+ * !!!
+ * The magic and version numbers have to be in the same place in all versions
+ * of the metadata page as the application may not have upgraded the database.
+ ************************************************************************/
+typedef struct _dbmeta33 {
+ DB_LSN lsn; /* 00-07: LSN. */
+ db_pgno_t pgno; /* 08-11: Current page number. */
+ u_int32_t magic; /* 12-15: Magic number. */
+ u_int32_t version; /* 16-19: Version. */
+ u_int32_t pagesize; /* 20-23: Pagesize. */
+ u_int8_t encrypt_alg; /* 24: Encryption algorithm. */
+ u_int8_t type; /* 25: Page type. */
+#define DBMETA_CHKSUM 0x01
+#define DBMETA_PART_RANGE 0x02
+#define DBMETA_PART_CALLBACK 0x04
+ u_int8_t metaflags; /* 26: Meta-only flags */
+ u_int8_t unused1; /* 27: Unused. */
+ u_int32_t free; /* 28-31: Free list page number. */
+ db_pgno_t last_pgno; /* 32-35: Page number of last page in db. */
+ u_int32_t nparts; /* 36-39: Number of partitions. */
+ u_int32_t key_count; /* 40-43: Cached key count. */
+ u_int32_t record_count; /* 44-47: Cached record count. */
+ u_int32_t flags; /* 48-51: Flags: unique to each AM. */
+ /* 52-71: Unique file ID. */
+ u_int8_t uid[DB_FILE_ID_LEN];
+} DBMETA33, DBMETA;
+
+/************************************************************************
+ BTREE METADATA PAGE LAYOUT
+ ************************************************************************/
+typedef struct _btmeta33 {
+#define BTM_DUP 0x001 /* Duplicates. */
+#define BTM_RECNO 0x002 /* Recno tree. */
+#define BTM_RECNUM 0x004 /* Btree: maintain record count. */
+#define BTM_FIXEDLEN 0x008 /* Recno: fixed length records. */
+#define BTM_RENUMBER 0x010 /* Recno: renumber on insert/delete. */
+#define BTM_SUBDB 0x020 /* Subdatabases. */
+#define BTM_DUPSORT 0x040 /* Duplicates are sorted. */
+#define BTM_COMPRESS 0x080 /* Compressed. */
+#define BTM_MASK 0x0ff
+ DBMETA dbmeta; /* 00-71: Generic meta-data header. */
+
+ u_int32_t unused1; /* 72-75: Unused space. */
+ u_int32_t minkey; /* 76-79: Btree: Minkey. */
+ u_int32_t re_len; /* 80-83: Recno: fixed-length record length. */
+ u_int32_t re_pad; /* 84-87: Recno: fixed-length record pad. */
+ u_int32_t root; /* 88-91: Root page. */
+ u_int32_t unused2[92]; /* 92-459: Unused space. */
+ u_int32_t crypto_magic; /* 460-463: Crypto magic number */
+ u_int32_t trash[3]; /* 464-475: Trash space - Do not use */
+ u_int8_t iv[DB_IV_BYTES]; /* 476-495: Crypto IV */
+ u_int8_t chksum[DB_MAC_KEY]; /* 496-511: Page chksum */
+
+ /*
+ * Minimum page size is 512.
+ */
+} BTMETA33, BTMETA;
+
+/************************************************************************
+ HASH METADATA PAGE LAYOUT
+ ************************************************************************/
+typedef struct _hashmeta33 {
+#define DB_HASH_DUP 0x01 /* Duplicates. */
+#define DB_HASH_SUBDB 0x02 /* Subdatabases. */
+#define DB_HASH_DUPSORT 0x04 /* Duplicates are sorted. */
+ DBMETA dbmeta; /* 00-71: Generic meta-data page header. */
+
+ u_int32_t max_bucket; /* 72-75: ID of Maximum bucket in use */
+ u_int32_t high_mask; /* 76-79: Modulo mask into table */
+ u_int32_t low_mask; /* 80-83: Modulo mask into table lower half */
+ u_int32_t ffactor; /* 84-87: Fill factor */
+ u_int32_t nelem; /* 88-91: Number of keys in hash table */
+ u_int32_t h_charkey; /* 92-95: Value of hash(CHARKEY) */
+#define NCACHED 32 /* number of spare points */
+ /* 96-223: Spare pages for overflow */
+ u_int32_t spares[NCACHED];
+ u_int32_t unused[59]; /* 224-459: Unused space */
+ u_int32_t crypto_magic; /* 460-463: Crypto magic number */
+ u_int32_t trash[3]; /* 464-475: Trash space - Do not use */
+ u_int8_t iv[DB_IV_BYTES]; /* 476-495: Crypto IV */
+ u_int8_t chksum[DB_MAC_KEY]; /* 496-511: Page chksum */
+
+ /*
+ * Minimum page size is 512.
+ */
+} HMETA33, HMETA;
+
+/************************************************************************
+ HEAP METADATA PAGE LAYOUT
+*************************************************************************/
+/*
+ * Heap Meta data page structure
+ *
+ */
+typedef struct _heapmeta {
+ DBMETA dbmeta; /* 00-71: Generic meta-data header. */
+
+ db_pgno_t curregion; /* 72-75: Current region pgno. */
+ u_int32_t nregions; /* 76-79: Number of regions. */
+ u_int32_t gbytes; /* 80-83: GBytes for fixed size heap. */
+ u_int32_t bytes; /* 84-87: Bytes for fixed size heap. */
+ u_int32_t region_size; /* 88-91: Max region size. */
+ u_int32_t unused2[92]; /* 92-459: Unused space. */
+ u_int32_t crypto_magic; /* 460-463: Crypto magic number */
+ u_int32_t trash[3]; /* 464-475: Trash space - Do not use */
+ u_int8_t iv[DB_IV_BYTES]; /* 476-495: Crypto IV */
+ u_int8_t chksum[DB_MAC_KEY]; /* 496-511: Page chksum */
+
+ /*
+ * Minimum page size is 512.
+ */
+} HEAPMETA;
+
+/************************************************************************
+ QUEUE METADATA PAGE LAYOUT
+ ************************************************************************/
+/*
+ * QAM Meta data page structure
+ *
+ */
+typedef struct _qmeta33 {
+ DBMETA dbmeta; /* 00-71: Generic meta-data header. */
+
+ u_int32_t first_recno; /* 72-75: First not deleted record. */
+ u_int32_t cur_recno; /* 76-79: Next recno to be allocated. */
+ u_int32_t re_len; /* 80-83: Fixed-length record length. */
+ u_int32_t re_pad; /* 84-87: Fixed-length record pad. */
+ u_int32_t rec_page; /* 88-91: Records Per Page. */
+ u_int32_t page_ext; /* 92-95: Pages per extent */
+
+ u_int32_t unused[91]; /* 96-459: Unused space */
+ u_int32_t crypto_magic; /* 460-463: Crypto magic number */
+ u_int32_t trash[3]; /* 464-475: Trash space - Do not use */
+ u_int8_t iv[DB_IV_BYTES]; /* 476-495: Crypto IV */
+ u_int8_t chksum[DB_MAC_KEY]; /* 496-511: Page chksum */
+ /*
+ * Minimum page size is 512.
+ */
+} QMETA33, QMETA;
+
+/*
+ * DBMETASIZE is a constant used by __db_file_setup and DB->verify
+ * as a buffer which is guaranteed to be larger than any possible
+ * metadata page size and smaller than any disk sector.
+ */
+#define DBMETASIZE 512
+
+/************************************************************************
+ BTREE/HASH MAIN PAGE LAYOUT
+ ************************************************************************/
+/*
+ * +-----------------------------------+
+ * | lsn | pgno | prev pgno |
+ * +-----------------------------------+
+ * | next pgno | entries | hf offset |
+ * +-----------------------------------+
+ * | level | type | chksum |
+ * +-----------------------------------+
+ * | iv | index | free --> |
+ * +-----------+-----------------------+
+ * | F R E E A R E A |
+ * +-----------------------------------+
+ * | <-- free | item |
+ * +-----------------------------------+
+ * | item | item | item |
+ * +-----------------------------------+
+ *
+ * The page header proper is 26 bytes (SIZEOF_PAGE, below), followed by
+ * possibly 20 bytes of checksum and possibly 16 bytes of IV (+ 2 bytes
+ * for alignment), and the following indices
+ * are guaranteed to be two-byte aligned. If we aren't doing crypto or
+ * checksumming the bytes are reclaimed for data storage.
+ *
+ * For hash and btree leaf pages, index items are paired, e.g., inp[0] is the
+ * key for inp[1]'s data. All other types of pages only contain single items.
+ */
+typedef struct __pg_chksum {
+ u_int8_t unused[2]; /* 26-27: For alignment */
+ u_int8_t chksum[4]; /* 28-31: Checksum */
+} PG_CHKSUM;
+
+typedef struct __pg_crypto {
+ u_int8_t unused[2]; /* 26-27: For alignment */
+ u_int8_t chksum[DB_MAC_KEY]; /* 28-47: Checksum */
+ u_int8_t iv[DB_IV_BYTES]; /* 48-63: IV */
+ /* !!!
+ * Must be 16-byte aligned for crypto
+ */
+} PG_CRYPTO;
+
+typedef struct _db_page {
+ DB_LSN lsn; /* 00-07: Log sequence number. */
+ db_pgno_t pgno; /* 08-11: Current page number. */
+ db_pgno_t prev_pgno; /* 12-15: Previous page number. */
+ db_pgno_t next_pgno; /* 16-19: Next page number. */
+ db_indx_t entries; /* 20-21: Number of items on the page. */
+ db_indx_t hf_offset; /* 22-23: High free byte page offset. */
+
+ /*
+ * The btree levels are numbered from the leaf to the root, starting
+ * with 1, so the leaf is level 1, its parent is level 2, and so on.
+ * We maintain this level on all btree pages, but the only place that
+ * we actually need it is on the root page. It would not be difficult
+ * to hide the byte on the root page once it becomes an internal page,
+ * so we could get this byte back if we needed it for something else.
+ */
+#define LEAFLEVEL 1
+#define MAXBTREELEVEL 255
+ u_int8_t level; /* 24: Btree tree level. */
+ u_int8_t type; /* 25: Page type. */
+} PAGE;
+
+/*
+ * With many compilers sizeof(PAGE) == 28, while SIZEOF_PAGE == 26.
+ * We add other things directly after the page header and so need
+ * SIZEOF_PAGE. When taking sizeof(PAGE), many compilers pad the
+ * result out to the next 4-byte boundary.
+ */
+#define SIZEOF_PAGE 26
+/*
+ * !!!
+ * DB_AM_ENCRYPT always implies DB_AM_CHKSUM so that must come first.
+ */
+#define P_INP(dbp, pg) \
+ ((db_indx_t *)((u_int8_t *)(pg) + SIZEOF_PAGE + \
+ (F_ISSET((dbp), DB_AM_ENCRYPT) ? sizeof(PG_CRYPTO) : \
+ (F_ISSET((dbp), DB_AM_CHKSUM) ? sizeof(PG_CHKSUM) : 0))))
+
+#define P_IV(dbp, pg) \
+ (F_ISSET((dbp), DB_AM_ENCRYPT) ? ((u_int8_t *)(pg) + \
+ SIZEOF_PAGE + SSZA(PG_CRYPTO, iv)) \
+ : NULL)
+
+#define P_CHKSUM(dbp, pg) \
+ (F_ISSET((dbp), DB_AM_ENCRYPT) ? ((u_int8_t *)(pg) + \
+ SIZEOF_PAGE + SSZA(PG_CRYPTO, chksum)) : \
+ (F_ISSET((dbp), DB_AM_CHKSUM) ? ((u_int8_t *)(pg) + \
+ SIZEOF_PAGE + SSZA(PG_CHKSUM, chksum)) \
+ : NULL))
+
+/* PAGE element macros. */
+#define LSN(p) (((PAGE *)p)->lsn)
+#define PGNO(p) (((PAGE *)p)->pgno)
+#define PREV_PGNO(p) (((PAGE *)p)->prev_pgno)
+#define NEXT_PGNO(p) (((PAGE *)p)->next_pgno)
+#define NUM_ENT(p) (((PAGE *)p)->entries)
+#define HOFFSET(p) (((PAGE *)p)->hf_offset)
+#define LEVEL(p) (((PAGE *)p)->level)
+#define TYPE(p) (((PAGE *)p)->type)
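+
+/*
+ * Editor's sketch (illustrative; the function name is hypothetical): the
+ * inp[] array returned by P_INP holds the page offsets of the items, so
+ * finding an item is an array lookup.
+ */
+static u_int8_t *
+__example_item_ptr(DB *dbp, PAGE *h, u_int32_t indx)
+{
+	db_indx_t *inp;
+
+	inp = P_INP(dbp, h);	/* Skips checksum/IV space when present. */
+	return ((u_int8_t *)h + inp[indx]);	/* Same as P_ENTRY, below. */
+}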
+
+/************************************************************************
+ HEAP PAGE LAYOUT
+ ************************************************************************/
+#define HEAPPG_NORMAL 26
+#define HEAPPG_CHKSUM 48
+#define HEAPPG_SEC 64
+
+/*
+ * +0-----------2------------4-----------6-----------7+
+ * | lsn |
+ * +-------------------------+------------------------+
+ * | pgno | high_pgno |
+ * +-------------+-----------+-----------+------------+
+ * | high_indx | free_indx | entries | hf offset |
+ * +-------+-----+-----------+-----------+------------+
+ * |unused2|type | unused3 | ...chksum... |
+ * +-------+-----+-----------+------------------------+
+ * | ...iv... | offset table / free space map |
+ * +-------------+------------------------------------+
+ * |free-> F R E E A R E A |
+ * +--------------------------------------------------+
+ * | <-- free | item |
+ * +-------------------------+------------------------+
+ * | item | item |
+ * +-------------------------+------------------------+
+ *
+ * The page layout of both heap internal and data pages. If not using
+ * crypto, iv will be overwritten with data. If not using checksumming,
+ * unused3 and chksum will also be overwritten with data and data will start at
+ * 26. Note that this layout lets us re-use a lot of the PAGE element macros
+ * defined above.
+ */
+typedef struct _heappg {
+ DB_LSN lsn; /* 00-07: Log sequence number. */
+ db_pgno_t pgno; /* 08-11: Current page number. */
+ u_int32_t high_pgno; /* 12-15: Highest page in region. */
+ u_int16_t high_indx; /* 16-17: Highest index in the offset table. */
+ db_indx_t free_indx; /* 18-19: First available index. */
+ db_indx_t entries; /* 20-21: Number of items on the page. */
+ db_indx_t hf_offset; /* 22-23: High free byte page offset. */
+ u_int8_t unused2[1]; /* 24: Unused. */
+ u_int8_t type; /* 25: Page type. */
+ u_int8_t unused3[2]; /* 26-27: Never used, just checksum alignment. */
+ u_int8_t chksum[DB_MAC_KEY]; /* 28-47: Checksum */
+ u_int8_t iv[DB_IV_BYTES]; /* 48-63: IV */
+} HEAPPG;
+
+/* First possible pages for heap: page 0 is the metapage, page 1 the first region page. */
+#define FIRST_HEAP_RPAGE 1
+#define FIRST_HEAP_DPAGE 2
+
+typedef struct __heaphdr {
+#define HEAP_RECSPLIT 0x01 /* Heap data record is split */
+#define HEAP_RECFIRST 0x02 /* First piece of a split record */
+#define HEAP_RECLAST 0x04 /* Last piece of a split record */
+ u_int8_t flags; /* 00: Flags describing record. */
+ u_int8_t unused; /* 01: Padding. */
+ u_int16_t size; /* 02-03: The size of the stored data piece. */
+} HEAPHDR;
+
+typedef struct __heaphdrsplt {
+ HEAPHDR std_hdr; /* 00-03: The standard data header */
+ u_int32_t tsize; /* 04-07: Total record size, 1st piece only */
+ db_pgno_t nextpg; /* 08-11: RID.pgno of the next record piece */
+ db_indx_t nextindx; /* 12-13: RID.indx of the next record piece */
+ u_int16_t unused; /* 14-15: Padding. */
+} HEAPSPLITHDR;
+
+#define HEAP_HDRSIZE(hdr) \
+ (F_ISSET((hdr), HEAP_RECSPLIT) ? sizeof(HEAPSPLITHDR) : sizeof(HEAPHDR))
+
+#define HEAPPG_SZ(dbp) \
+ (F_ISSET((dbp), DB_AM_ENCRYPT) ? HEAPPG_SEC : \
+ F_ISSET((dbp), DB_AM_CHKSUM) ? HEAPPG_CHKSUM : HEAPPG_NORMAL)
+
+/* Each byte in the bitmap describes 4 pages (2 bits per page). */
+#define HEAP_REGION_COUNT(dbp, size) (((size) - HEAPPG_SZ(dbp)) * 4)
+#define HEAP_DEFAULT_REGION_MAX(dbp) \
+ (HEAP_REGION_COUNT(dbp, (u_int32_t)8 * 1024))
+#define HEAP_REGION_SIZE(dbp) (((HEAP*) (dbp)->heap_internal)->region_size)
+
+/* Figure out which region a given page belongs to. */
+#define HEAP_REGION_PGNO(dbp, p) \
+ ((((p) - 1) / (HEAP_REGION_SIZE(dbp) + 1)) * \
+ (HEAP_REGION_SIZE(dbp) + 1) + 1)
+/* Translate a region pgno to region number */
+#define HEAP_REGION_NUM(dbp, pgno) \
+ ((((pgno) - 1) / (HEAP_REGION_SIZE((dbp)) + 1)) + 1)
+/*
+ * Given an internal heap page and a page number relative to that page,
+ * return the bits from the map describing free space on the nth page.
+ * Each byte in the map
+ * describes 4 pages. Point at the correct byte and mask the correct 2 bits.
+ */
+#define HEAP_SPACE(dbp, pg, n) \
+ (HEAP_SPACEMAP((dbp), (pg))[(n) / 4] >> (2 * ((n) % 4)) & 3)
+
+#define HEAP_SETSPACE(dbp, pg, n, b) do { \
+ HEAP_SPACEMAP((dbp), (pg))[(n) / 4] &= ~(3 << (2 * ((n) % 4))); \
+ HEAP_SPACEMAP((dbp), (pg))[(n) / 4] |= ((b & 3) << (2 * ((n) % 4))); \
+} while (0)
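+
+/*
+ * Editor's worked example (illustrative): for page n == 5 the bits live
+ * in map byte 5 / 4 == 1, at bit offset 2 * (5 % 4) == 2.  HEAP_SPACE
+ * shifts that byte right by 2 and masks with 3 to recover the 2-bit
+ * value; HEAP_SETSPACE clears those same 2 bits and ORs in the new one.
+ */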
+
+/* Return the bitmap describing free space on heap data pages. */
+#define HEAP_SPACEMAP(dbp, pg) ((u_int8_t *)P_INP((dbp), (pg)))
+
+/* Return the offset table for a heap data page. */
+#define HEAP_OFFSETTBL(dbp, pg) P_INP((dbp), (pg))
+
+/*
+ * Calculate the % of a page a given size occupies and translate that to the
+ * corresponding bitmap value.
+ */
+#define HEAP_CALCSPACEBITS(dbp, sz, space) do { \
+ (space) = 100 * (sz) / (dbp)->pgsize; \
+ if ((space) <= HEAP_PG_FULL_PCT) \
+ (space) = HEAP_PG_FULL; \
+ else if ((space) <= HEAP_PG_GT66_PCT) \
+ (space) = HEAP_PG_GT66; \
+ else if ((space) <= HEAP_PG_GT33_PCT) \
+ (space) = HEAP_PG_GT33; \
+ else \
+ (space) = HEAP_PG_LT33; \
+} while (0)
+
+/* Return the amount of free space on a heap data page. */
+#define HEAP_FREESPACE(dbp, p) \
+ (HOFFSET(p) - HEAPPG_SZ(dbp) - \
+ (NUM_ENT(p) == 0 ? 0 : ((HEAP_HIGHINDX(p) + 1) * sizeof(db_indx_t))))
+
+/* The maximum amount of data that can fit on an empty heap data page. */
+#define HEAP_MAXDATASIZE(dbp) \
+ ((dbp)->pgsize - HEAPPG_SZ(dbp) - sizeof(db_indx_t))
+
+#define HEAP_FREEINDX(p) (((HEAPPG *)p)->free_indx)
+#define HEAP_HIGHINDX(p) (((HEAPPG *)p)->high_indx)
+
+/* True if we have a page that deals with heap */
+#define HEAPTYPE(h) \
+ (TYPE(h) == P_HEAPMETA || TYPE(h) == P_HEAP || TYPE(h) == P_IHEAP)
+
+/************************************************************************
+ QUEUE MAIN PAGE LAYOUT
+ ************************************************************************/
+/*
+ * Sizes of page below. Used to reclaim space if not doing
+ * crypto or checksumming. If you change the QPAGE below you
+ * MUST adjust this too.
+ */
+#define QPAGE_NORMAL 28
+#define QPAGE_CHKSUM 48
+#define QPAGE_SEC 64
+
+typedef struct _qpage {
+ DB_LSN lsn; /* 00-07: Log sequence number. */
+ db_pgno_t pgno; /* 08-11: Current page number. */
+ u_int32_t unused0[3]; /* 12-23: Unused. */
+ u_int8_t unused1[1]; /* 24: Unused. */
+ u_int8_t type; /* 25: Page type. */
+ u_int8_t unused2[2]; /* 26-27: Unused. */
+ u_int8_t chksum[DB_MAC_KEY]; /* 28-47: Checksum */
+ u_int8_t iv[DB_IV_BYTES]; /* 48-63: IV */
+} QPAGE;
+
+#define QPAGE_SZ(dbp) \
+ (F_ISSET((dbp), DB_AM_ENCRYPT) ? QPAGE_SEC : \
+ F_ISSET((dbp), DB_AM_CHKSUM) ? QPAGE_CHKSUM : QPAGE_NORMAL)
+/*
+ * !!!
+ * The next_pgno and prev_pgno fields are not maintained for btree and recno
+ * internal pages. Doing so only provides a minor performance improvement,
+ * it's hard to do when deleting internal pages, and it increases the chance
+ * of deadlock during deletes and splits because we have to re-link pages at
+ * more than the leaf level.
+ *
+ * !!!
+ * The btree/recno access method needs db_recno_t bytes of space on the root
+ * page to specify how many records are stored in the tree. (The alternative
+ * is to store the number of records in the meta-data page, which will create
+ * a second hot spot in trees being actively modified, or to recalculate it from
+ * the BINTERNAL fields on each access.) Overload the PREV_PGNO field.
+ */
+#define RE_NREC(p) \
+ ((TYPE(p) == P_IBTREE || TYPE(p) == P_IRECNO) ? PREV_PGNO(p) : \
+ (db_pgno_t)(TYPE(p) == P_LBTREE ? NUM_ENT(p) / 2 : NUM_ENT(p)))
+#define RE_NREC_ADJ(p, adj) \
+ PREV_PGNO(p) += adj;
+#define RE_NREC_SET(p, num) \
+ PREV_PGNO(p) = (num);
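+
+/*
+ * Editor's sketch (illustrative; the function name is hypothetical):
+ * because of the overload above, the record count of a subtree is read
+ * the same way no matter which page type is in hand.
+ */
+static db_recno_t
+__example_page_nrecs(PAGE *h)
+{
+	return ((db_recno_t)RE_NREC(h));
+}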
+
+/*
+ * Initialize a page.
+ *
+ * !!!
+ * Don't modify the page's LSN, code depends on it being unchanged after a
+ * P_INIT call.
+ */
+#define P_INIT(pg, pg_size, n, pg_prev, pg_next, btl, pg_type) do { \
+ PGNO(pg) = (n); \
+ PREV_PGNO(pg) = (pg_prev); \
+ NEXT_PGNO(pg) = (pg_next); \
+ NUM_ENT(pg) = (0); \
+ HOFFSET(pg) = (db_indx_t)(pg_size); \
+ LEVEL(pg) = (btl); \
+ TYPE(pg) = (pg_type); \
+} while (0)
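+
+/*
+ * Editor's sketch (illustrative; the function name is hypothetical):
+ * formatting a freshly allocated buffer of dbp->pgsize bytes as an
+ * empty btree leaf.
+ */
+static void
+__example_init_leaf(DB *dbp, PAGE *h, db_pgno_t pgno)
+{
+	P_INIT(h, dbp->pgsize, pgno, PGNO_INVALID, PGNO_INVALID,
+	    LEAFLEVEL, P_LBTREE);
+}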
+
+/* Page header length (offset to first index). */
+#define P_OVERHEAD(dbp) P_TO_UINT16(P_INP(dbp, 0))
+
+/* First free byte. */
+#define LOFFSET(dbp, pg) \
+ (P_OVERHEAD(dbp) + NUM_ENT(pg) * sizeof(db_indx_t))
+
+/* Free space on a regular page. */
+#define P_FREESPACE(dbp, pg) (HOFFSET(pg) - LOFFSET(dbp, pg))
+
+/* Get a pointer to the bytes at a specific index. */
+#define P_ENTRY(dbp, pg, indx) ((u_int8_t *)pg + P_INP(dbp, pg)[indx])
+
+/************************************************************************
+ OVERFLOW PAGE LAYOUT
+ ************************************************************************/
+
+/*
+ * Overflow items are referenced by HOFFPAGE and BOVERFLOW structures, which
+ * store a page number (the first page of the overflow item) and a length
+ * (the total length of the overflow item). The overflow item consists of
+ * some number of overflow pages, linked by the next_pgno field of the page.
+ * A next_pgno field of PGNO_INVALID flags the end of the overflow item.
+ *
+ * Overflow page overloads:
+ * The amount of overflow data stored on each page is stored in the
+ * hf_offset field.
+ *
+ * The implementation reference counts overflow items as it's possible
+ * for them to be promoted onto btree internal pages. The reference
+ * count is stored in the entries field.
+ */
+#define OV_LEN(p) (((PAGE *)p)->hf_offset)
+#define OV_REF(p) (((PAGE *)p)->entries)
+
+/* Maximum number of bytes that you can put on an overflow page. */
+#define P_MAXSPACE(dbp, psize) ((psize) - P_OVERHEAD(dbp))
+
+/* Free space on an overflow page. */
+#define P_OVFLSPACE(dbp, psize, pg) (P_MAXSPACE(dbp, psize) - HOFFSET(pg))
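+
+/*
+ * Editor's sketch (illustrative): reading an overflow item is a walk of
+ * the page chain, copying OV_LEN(p) bytes from each page until the next
+ * page number is PGNO_INVALID.  The mpool fetch/release of each page is
+ * elided here:
+ *
+ * for (pgno = first_pgno; pgno != PGNO_INVALID; pgno = NEXT_PGNO(p)) {
+ *	... fetch page "pgno" into "p" via mpool ...
+ *	memcpy(dest, (u_int8_t *)p + P_OVERHEAD(dbp), OV_LEN(p));
+ *	dest += OV_LEN(p);
+ * }
+ */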
+
+/************************************************************************
+ HASH PAGE LAYOUT
+ ************************************************************************/
+
+/* Each index references a group of bytes on the page. */
+#define H_KEYDATA 1 /* Key/data item. */
+#define H_DUPLICATE 2 /* Duplicate key/data item. */
+#define H_OFFPAGE 3 /* Overflow key/data item. */
+#define H_OFFDUP 4 /* Overflow page of duplicates. */
+
+/*
+ * !!!
+ * Items on hash pages are (potentially) unaligned, so we can never cast the
+ * (page + offset) pointer to an HKEYDATA, HOFFPAGE or HOFFDUP structure, as
+ * we do with B+tree on-page structures. Because we frequently want the type
+ * field, which requires no alignment and is in the same location in all three
+ * structures, there's a pair of macros.
+ */
+#define HPAGE_PTYPE(p) (*(u_int8_t *)p)
+#define HPAGE_TYPE(dbp, pg, indx) (*P_ENTRY(dbp, pg, indx))
+
+/*
+ * The first and second types are H_KEYDATA and H_DUPLICATE, represented
+ * by the HKEYDATA structure:
+ *
+ * +-----------------------------------+
+ * | type | key/data ... |
+ * +-----------------------------------+
+ *
+ * For duplicates, the data field encodes duplicate elements in the data
+ * field:
+ *
+ * +---------------------------------------------------------------+
+ * | type | len1 | element1 | len1 | len2 | element2 | len2 |
+ * +---------------------------------------------------------------+
+ *
+ * Thus, by keeping track of the offset in the element, we can do both
+ * backward and forward traversal.
+ */
+typedef struct _hkeydata {
+ u_int8_t type; /* 00: Page type. */
+ u_int8_t data[1]; /* Variable length key/data item. */
+} HKEYDATA;
+#define HKEYDATA_DATA(p) (((u_int8_t *)p) + SSZA(HKEYDATA, data))
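+
+/*
+ * Editor's sketch (illustrative): stepping forward through an on-page
+ * duplicate set.  Each element is bracketed by two copies of its length,
+ * and the lengths may be unaligned, hence the memcpy:
+ *
+ * p = HKEYDATA_DATA(P_ENTRY(dbp, pg, indx));
+ * memcpy(&len, p, sizeof(db_indx_t));
+ * element = p + sizeof(db_indx_t);
+ * next = element + len + sizeof(db_indx_t);
+ */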
+
+/*
+ * The length of any HKEYDATA item. Note that indx is an element index,
+ * not a PAIR index.
+ */
+#define LEN_HITEM(dbp, pg, pgsize, indx) \
+ (((indx) == 0 ? (pgsize) : \
+ (P_INP(dbp, pg)[(indx) - 1])) - (P_INP(dbp, pg)[indx]))
+
+#define LEN_HKEYDATA(dbp, pg, psize, indx) \
+ (db_indx_t)(LEN_HITEM(dbp, pg, psize, indx) - HKEYDATA_SIZE(0))
+
+/*
+ * Page space required to add a new HKEYDATA item to the page, with and
+ * without the index value.
+ */
+#define HKEYDATA_SIZE(len) \
+ ((len) + SSZA(HKEYDATA, data))
+#define HKEYDATA_PSIZE(len) \
+ (HKEYDATA_SIZE(len) + sizeof(db_indx_t))
+
+/* Put a HKEYDATA item at the location referenced by a page entry. */
+#define PUT_HKEYDATA(pe, kd, len, etype) { \
+ ((HKEYDATA *)(pe))->type = etype; \
+ memcpy((u_int8_t *)(pe) + sizeof(u_int8_t), kd, len); \
+}
+
+/*
+ * Macros that describe the page layout in terms of key-data pairs.
+ */
+#define H_NUMPAIRS(pg) (NUM_ENT(pg) / 2)
+#define H_KEYINDEX(indx) (indx)
+#define H_DATAINDEX(indx) ((indx) + 1)
+#define H_PAIRKEY(dbp, pg, indx) P_ENTRY(dbp, pg, H_KEYINDEX(indx))
+#define H_PAIRDATA(dbp, pg, indx) P_ENTRY(dbp, pg, H_DATAINDEX(indx))
+#define H_PAIRSIZE(dbp, pg, psize, indx) \
+ (LEN_HITEM(dbp, pg, psize, H_KEYINDEX(indx)) + \
+ LEN_HITEM(dbp, pg, psize, H_DATAINDEX(indx)))
+#define LEN_HDATA(dbp, p, psize, indx) \
+ LEN_HKEYDATA(dbp, p, psize, H_DATAINDEX(indx))
+#define LEN_HKEY(dbp, p, psize, indx) \
+ LEN_HKEYDATA(dbp, p, psize, H_KEYINDEX(indx))
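+
+/*
+ * Editor's sketch (illustrative; the function name is hypothetical):
+ * pulling both lengths of a pair whose key sits at element index "indx".
+ * Only valid when the items are of type H_KEYDATA.
+ */
+static void
+__example_pair_lens(DB *dbp, PAGE *pg, u_int32_t indx,
+    db_indx_t *klenp, db_indx_t *dlenp)
+{
+	*klenp = LEN_HKEY(dbp, pg, dbp->pgsize, indx);
+	*dlenp = LEN_HDATA(dbp, pg, dbp->pgsize, indx);
+}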
+
+/*
+ * The third type is the H_OFFPAGE, represented by the HOFFPAGE structure:
+ */
+typedef struct _hoffpage {
+ u_int8_t type; /* 00: Page type and delete flag. */
+ u_int8_t unused[3]; /* 01-03: Padding, unused. */
+ db_pgno_t pgno; /* 04-07: Offpage page number. */
+ u_int32_t tlen; /* 08-11: Total length of item. */
+} HOFFPAGE;
+
+#define HOFFPAGE_PGNO(p) (((u_int8_t *)p) + SSZ(HOFFPAGE, pgno))
+#define HOFFPAGE_TLEN(p) (((u_int8_t *)p) + SSZ(HOFFPAGE, tlen))
+
+/*
+ * Page space required to add a new HOFFPAGE item to the page, with and
+ * without the index value.
+ */
+#define HOFFPAGE_SIZE (sizeof(HOFFPAGE))
+#define HOFFPAGE_PSIZE (HOFFPAGE_SIZE + sizeof(db_indx_t))
+
+/*
+ * The fourth type is H_OFFDUP represented by the HOFFDUP structure:
+ */
+typedef struct _hoffdup {
+ u_int8_t type; /* 00: Page type and delete flag. */
+ u_int8_t unused[3]; /* 01-03: Padding, unused. */
+ db_pgno_t pgno; /* 04-07: Offpage page number. */
+} HOFFDUP;
+#define HOFFDUP_PGNO(p) (((u_int8_t *)p) + SSZ(HOFFDUP, pgno))
+
+/*
+ * Page space required to add a new HOFFDUP item to the page, with and
+ * without the index value.
+ */
+#define HOFFDUP_SIZE (sizeof(HOFFDUP))
+
+/************************************************************************
+ BTREE PAGE LAYOUT
+ ************************************************************************/
+
+/* Each index references a group of bytes on the page. */
+#define B_KEYDATA 1 /* Key/data item. */
+#define B_DUPLICATE 2 /* Duplicate key/data item. */
+#define B_OVERFLOW 3 /* Overflow key/data item. */
+
+/*
+ * We have to store a deleted entry flag in the page. The reason is complex,
+ * but the simple version is that we can't delete on-page items referenced by
+ * a cursor -- the return order of subsequent insertions might be wrong. The
+ * delete flag is an overload of the top bit of the type byte.
+ */
+#define B_DELETE (0x80)
+#define B_DCLR(t) (t) &= ~B_DELETE
+#define B_DSET(t) (t) |= B_DELETE
+#define B_DISSET(t) ((t) & B_DELETE)
+
+#define B_TYPE(t) ((t) & ~B_DELETE)
+#define B_TSET(t, type) ((t) = B_TYPE(type))
+#define B_TSET_DELETED(t, type) ((t) = (type) | B_DELETE)
+
+/*
+ * The first type is B_KEYDATA, represented by the BKEYDATA structure:
+ */
+typedef struct _bkeydata {
+ db_indx_t len; /* 00-01: Key/data item length. */
+ u_int8_t type; /* 02: Page type AND DELETE FLAG. */
+ u_int8_t data[1]; /* Variable length key/data item. */
+} BKEYDATA;
+
+/* Get a BKEYDATA item for a specific index. */
+#define GET_BKEYDATA(dbp, pg, indx) \
+ ((BKEYDATA *)P_ENTRY(dbp, pg, indx))
+
+/*
+ * Page space required to add a new BKEYDATA item to the page, with and
+ * without the index value. The (u_int16_t) cast avoids warnings: DB_ALIGN
+ * casts to uintmax_t, the cast converts it to a small integral type so we
+ * don't get complaints when we assign the final result to an integral type
+ * smaller than uintmax_t.
+ */
+#define BKEYDATA_SIZE(len) \
+ (u_int16_t)DB_ALIGN((len) + SSZA(BKEYDATA, data), sizeof(u_int32_t))
+#define BKEYDATA_PSIZE(len) \
+ (BKEYDATA_SIZE(len) + sizeof(db_indx_t))
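+
+/*
+ * Editor's sketch (illustrative; the function name is hypothetical): the
+ * usual scan pattern -- mask off the delete bit before switching on the
+ * type, and skip items a cursor has marked deleted.
+ */
+static int
+__example_item_is_live_keydata(DB *dbp, PAGE *pg, u_int32_t indx)
+{
+	BKEYDATA *bk;
+
+	bk = GET_BKEYDATA(dbp, pg, indx);
+	if (B_DISSET(bk->type))
+		return (0);
+	return (B_TYPE(bk->type) == B_KEYDATA);
+}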
+
+/*
+ * The second and third types are B_DUPLICATE and B_OVERFLOW, represented
+ * by the BOVERFLOW structure.
+ */
+typedef struct _boverflow {
+ db_indx_t unused1; /* 00-01: Padding, unused. */
+ u_int8_t type; /* 02: Page type AND DELETE FLAG. */
+ u_int8_t unused2; /* 03: Padding, unused. */
+ db_pgno_t pgno; /* 04-07: Next page number. */
+ u_int32_t tlen; /* 08-11: Total length of item. */
+} BOVERFLOW;
+
+/* Get a BOVERFLOW item for a specific index. */
+#define GET_BOVERFLOW(dbp, pg, indx) \
+ ((BOVERFLOW *)P_ENTRY(dbp, pg, indx))
+
+/*
+ * Page space required to add a new BOVERFLOW item to the page, with and
+ * without the index value.
+ */
+#define BOVERFLOW_SIZE \
+ ((u_int16_t)DB_ALIGN(sizeof(BOVERFLOW), sizeof(u_int32_t)))
+#define BOVERFLOW_PSIZE \
+ (BOVERFLOW_SIZE + sizeof(db_indx_t))
+
+#define BITEM_SIZE(bk) \
+ (B_TYPE((bk)->type) != B_KEYDATA ? BOVERFLOW_SIZE : \
+ BKEYDATA_SIZE((bk)->len))
+
+#define BITEM_PSIZE(bk) \
+ (B_TYPE((bk)->type) != B_KEYDATA ? BOVERFLOW_PSIZE : \
+ BKEYDATA_PSIZE((bk)->len))
+
+/*
+ * Btree leaf and hash page layouts group indices in sets of two, one for the
+ * key and one for the data. Everything else does it in sets of one to save
+ * space. Use the following macros so that it's obvious what's going on.
+ */
+#define O_INDX 1
+#define P_INDX 2
+
+/************************************************************************
+ BTREE INTERNAL PAGE LAYOUT
+ ************************************************************************/
+
+/*
+ * Btree internal entry.
+ */
+typedef struct _binternal {
+ db_indx_t len; /* 00-01: Key/data item length. */
+ u_int8_t type; /* 02: Page type AND DELETE FLAG. */
+ u_int8_t unused; /* 03: Padding, unused. */
+ db_pgno_t pgno; /* 04-07: Page number of referenced page. */
+ db_recno_t nrecs; /* 08-11: Subtree record count. */
+ u_int8_t data[1]; /* Variable length key item. */
+} BINTERNAL;
+
+/* Get a BINTERNAL item for a specific index. */
+#define GET_BINTERNAL(dbp, pg, indx) \
+ ((BINTERNAL *)P_ENTRY(dbp, pg, indx))
+
+/*
+ * Page space required to add a new BINTERNAL item to the page, with and
+ * without the index value.
+ */
+#define BINTERNAL_SIZE(len) \
+ (u_int16_t)DB_ALIGN((len) + SSZA(BINTERNAL, data), sizeof(u_int32_t))
+#define BINTERNAL_PSIZE(len) \
+ (BINTERNAL_SIZE(len) + sizeof(db_indx_t))
+
+/************************************************************************
+ RECNO INTERNAL PAGE LAYOUT
+ ************************************************************************/
+
+/*
+ * The recno internal entry.
+ */
+typedef struct _rinternal {
+ db_pgno_t pgno; /* 00-03: Page number of referenced page. */
+ db_recno_t nrecs; /* 04-07: Subtree record count. */
+} RINTERNAL;
+
+/* Get a RINTERNAL item for a specific index. */
+#define GET_RINTERNAL(dbp, pg, indx) \
+ ((RINTERNAL *)P_ENTRY(dbp, pg, indx))
+
+/*
+ * Page space required to add a new RINTERNAL item to the page, with and
+ * without the index value.
+ */
+#define RINTERNAL_SIZE \
+ (u_int16_t)DB_ALIGN(sizeof(RINTERNAL), sizeof(u_int32_t))
+#define RINTERNAL_PSIZE \
+ (RINTERNAL_SIZE + sizeof(db_indx_t))
+
+typedef struct __pglist {
+ db_pgno_t pgno, next_pgno;
+ DB_LSN lsn;
+} db_pglist_t;
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* !_DB_PAGE_H_ */
diff --git a/src/dbinc/db_swap.h b/src/dbinc/db_swap.h
new file mode 100644
index 00000000..352ae227
--- /dev/null
+++ b/src/dbinc/db_swap.h
@@ -0,0 +1,262 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_SWAP_H_
+#define _DB_SWAP_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * Little endian <==> big endian 64-bit swap macros.
+ * M_64_SWAP swap a memory location
+ * P_64_COPY copy potentially unaligned 4 byte quantities
+ * P_64_SWAP swap a referenced memory location
+ */
+#undef M_64_SWAP
+#define M_64_SWAP(a) { \
+ u_int64_t _tmp; \
+ _tmp = (u_int64_t)a; \
+ ((u_int8_t *)&a)[0] = ((u_int8_t *)&_tmp)[7]; \
+ ((u_int8_t *)&a)[1] = ((u_int8_t *)&_tmp)[6]; \
+ ((u_int8_t *)&a)[2] = ((u_int8_t *)&_tmp)[5]; \
+ ((u_int8_t *)&a)[3] = ((u_int8_t *)&_tmp)[4]; \
+ ((u_int8_t *)&a)[4] = ((u_int8_t *)&_tmp)[3]; \
+ ((u_int8_t *)&a)[5] = ((u_int8_t *)&_tmp)[2]; \
+ ((u_int8_t *)&a)[6] = ((u_int8_t *)&_tmp)[1]; \
+ ((u_int8_t *)&a)[7] = ((u_int8_t *)&_tmp)[0]; \
+}
+#undef P_64_COPY
+#define P_64_COPY(a, b) { \
+ ((u_int8_t *)b)[0] = ((u_int8_t *)a)[0]; \
+ ((u_int8_t *)b)[1] = ((u_int8_t *)a)[1]; \
+ ((u_int8_t *)b)[2] = ((u_int8_t *)a)[2]; \
+ ((u_int8_t *)b)[3] = ((u_int8_t *)a)[3]; \
+ ((u_int8_t *)b)[4] = ((u_int8_t *)a)[4]; \
+ ((u_int8_t *)b)[5] = ((u_int8_t *)a)[5]; \
+ ((u_int8_t *)b)[6] = ((u_int8_t *)a)[6]; \
+ ((u_int8_t *)b)[7] = ((u_int8_t *)a)[7]; \
+}
+#undef P_64_SWAP
+#define P_64_SWAP(a) { \
+ u_int64_t _tmp; \
+ P_64_COPY(a, &_tmp); \
+ ((u_int8_t *)a)[0] = ((u_int8_t *)&_tmp)[7]; \
+ ((u_int8_t *)a)[1] = ((u_int8_t *)&_tmp)[6]; \
+ ((u_int8_t *)a)[2] = ((u_int8_t *)&_tmp)[5]; \
+ ((u_int8_t *)a)[3] = ((u_int8_t *)&_tmp)[4]; \
+ ((u_int8_t *)a)[4] = ((u_int8_t *)&_tmp)[3]; \
+ ((u_int8_t *)a)[5] = ((u_int8_t *)&_tmp)[2]; \
+ ((u_int8_t *)a)[6] = ((u_int8_t *)&_tmp)[1]; \
+ ((u_int8_t *)a)[7] = ((u_int8_t *)&_tmp)[0]; \
+}
+
+/*
+ * Little endian <==> big endian 32-bit swap macros.
+ * P_32_COPY copy potentially unaligned 4 byte quantities
+ * P_32_COPYSWAP copy and swap potentially unaligned 4 byte quantities
+ * P_32_SWAP swap a referenced memory location
+ * M_32_SWAP swap a memory location
+ */
+#undef P_32_COPY
+#define P_32_COPY(a, b) do { \
+ ((u_int8_t *)b)[0] = ((u_int8_t *)a)[0]; \
+ ((u_int8_t *)b)[1] = ((u_int8_t *)a)[1]; \
+ ((u_int8_t *)b)[2] = ((u_int8_t *)a)[2]; \
+ ((u_int8_t *)b)[3] = ((u_int8_t *)a)[3]; \
+} while (0)
+#undef P_32_COPYSWAP
+#define P_32_COPYSWAP(a, b) do { \
+ ((u_int8_t *)b)[0] = ((u_int8_t *)a)[3]; \
+ ((u_int8_t *)b)[1] = ((u_int8_t *)a)[2]; \
+ ((u_int8_t *)b)[2] = ((u_int8_t *)a)[1]; \
+ ((u_int8_t *)b)[3] = ((u_int8_t *)a)[0]; \
+} while (0)
+#undef P_32_SWAP
+#define P_32_SWAP(a) do { \
+ u_int32_t _tmp; \
+ P_32_COPY(a, &_tmp); \
+ P_32_COPYSWAP(&_tmp, a); \
+} while (0)
+#undef M_32_SWAP
+#define M_32_SWAP(a) P_32_SWAP(&a)
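+
+/*
+ * Editor's sketch (illustrative; the function name is hypothetical):
+ * M_32_SWAP reverses the bytes in place, so 0x11223344 becomes
+ * 0x44332211.
+ */
+static u_int32_t
+__example_swap32(u_int32_t v)
+{
+	M_32_SWAP(v);
+	return (v);
+}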
+
+/*
+ * Little endian <==> big endian 16-bit swap macros.
+ * P_16_COPY copy potentially unaligned 2 byte quantities
+ * P_16_COPYSWAP copy and swap potentially unaligned 2 byte quantities
+ * P_16_SWAP swap a referenced memory location
+ * M_16_SWAP swap a memory location
+ */
+#undef P_16_COPY
+#define P_16_COPY(a, b) do { \
+ ((u_int8_t *)b)[0] = ((u_int8_t *)a)[0]; \
+ ((u_int8_t *)b)[1] = ((u_int8_t *)a)[1]; \
+} while (0)
+#undef P_16_COPYSWAP
+#define P_16_COPYSWAP(a, b) do { \
+ ((u_int8_t *)b)[0] = ((u_int8_t *)a)[1]; \
+ ((u_int8_t *)b)[1] = ((u_int8_t *)a)[0]; \
+} while (0)
+#undef P_16_SWAP
+#define P_16_SWAP(a) do { \
+ u_int16_t _tmp; \
+ P_16_COPY(a, &_tmp); \
+ P_16_COPYSWAP(&_tmp, a); \
+} while (0)
+#undef M_16_SWAP
+#define M_16_SWAP(a) P_16_SWAP(&a)
+
+#undef SWAP32
+#define SWAP32(p) { \
+ P_32_SWAP(p); \
+ (p) += sizeof(u_int32_t); \
+}
+#undef SWAP16
+#define SWAP16(p) { \
+ P_16_SWAP(p); \
+ (p) += sizeof(u_int16_t); \
+}
+
+/*
+ * Berkeley DB has local versions of htonl() and ntohl() that operate on
+ * pointers to the right size memory locations; the portability magic for
+ * finding the real system functions isn't worth the effort.
+ */
+#undef DB_HTONL_SWAP
+#define DB_HTONL_SWAP(env, p) do { \
+ if (F_ISSET((env), ENV_LITTLEENDIAN)) \
+ P_32_SWAP(p); \
+} while (0)
+#undef DB_NTOHL_SWAP
+#define DB_NTOHL_SWAP(env, p) do { \
+ if (F_ISSET((env), ENV_LITTLEENDIAN)) \
+ P_32_SWAP(p); \
+} while (0)
+
+#undef DB_NTOHL_COPYIN
+#define DB_NTOHL_COPYIN(env, i, p) do { \
+ u_int8_t *tmp; \
+ tmp = (u_int8_t *)&(i); \
+ if (F_ISSET(env, ENV_LITTLEENDIAN)) { \
+ tmp[3] = *p++; \
+ tmp[2] = *p++; \
+ tmp[1] = *p++; \
+ tmp[0] = *p++; \
+ } else { \
+ memcpy(&i, p, sizeof(u_int32_t)); \
+ p = (u_int8_t *)p + sizeof(u_int32_t); \
+ } \
+} while (0)
+
+#undef DB_NTOHS_COPYIN
+#define DB_NTOHS_COPYIN(env, i, p) do { \
+ u_int8_t *tmp; \
+ tmp = (u_int8_t *)&(i); \
+ if (F_ISSET(env, ENV_LITTLEENDIAN)) { \
+ tmp[1] = *p++; \
+ tmp[0] = *p++; \
+ } else { \
+ memcpy(&i, p, sizeof(u_int16_t)); \
+ p = (u_int8_t *)p + sizeof(u_int16_t); \
+ } \
+} while (0)
+
+#undef DB_HTONL_COPYOUT
+#define DB_HTONL_COPYOUT(env, p, i) do { \
+ u_int8_t *tmp; \
+ tmp = (u_int8_t *)p; \
+ if (F_ISSET(env, ENV_LITTLEENDIAN)) { \
+ *tmp++ = ((u_int8_t *)&(i))[3]; \
+ *tmp++ = ((u_int8_t *)&(i))[2]; \
+ *tmp++ = ((u_int8_t *)&(i))[1]; \
+ *tmp++ = ((u_int8_t *)&(i))[0]; \
+ } else \
+ memcpy(p, &i, sizeof(u_int32_t)); \
+ p = (u_int8_t *)p + sizeof(u_int32_t); \
+} while (0)
+
+#undef DB_HTONS_COPYOUT
+#define DB_HTONS_COPYOUT(env, p, i) do { \
+ u_int8_t *tmp; \
+ tmp = (u_int8_t *)p; \
+ if (F_ISSET(env, ENV_LITTLEENDIAN)) { \
+ *tmp++ = ((u_int8_t *)&(i))[1]; \
+ *tmp++ = ((u_int8_t *)&(i))[0]; \
+ } else \
+ memcpy(p, &i, sizeof(u_int16_t)); \
+ p = (u_int8_t *)p + sizeof(u_int16_t); \
+} while (0)
+
+/*
+ * Helper macros for swapped logs. We write logs in little endian format to
+ * minimize disruption on x86 when upgrading from native byte order to
+ * platform-independent logs.
+ */
+#define LOG_SWAPPED(env) !F_ISSET(env, ENV_LITTLEENDIAN)
+
+#define LOGCOPY_32(env, x, p) do { \
+ if (LOG_SWAPPED(env)) \
+ P_32_COPYSWAP((p), (x)); \
+ else \
+ memcpy((x), (p), sizeof(u_int32_t)); \
+} while (0)
+
+#define LOGCOPY_16(env, x, p) do { \
+ if (LOG_SWAPPED(env)) \
+ P_16_COPYSWAP((p), (x)); \
+ else \
+ memcpy((x), (p), sizeof(u_int16_t)); \
+} while (0)
+
+#define LOGCOPY_TOLSN(env, lsnp, p) do { \
+ LOGCOPY_32((env), &(lsnp)->file, (p)); \
+ LOGCOPY_32((env), &(lsnp)->offset, \
+ (u_int8_t *)(p) + sizeof(u_int32_t)); \
+} while (0)
+
+#define LOGCOPY_FROMLSN(env, p, lsnp) do { \
+ LOGCOPY_32((env), (p), &(lsnp)->file); \
+ LOGCOPY_32((env), \
+ (u_int8_t *)(p) + sizeof(u_int32_t), &(lsnp)->offset); \
+} while (0)
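+
+/*
+ * Editor's sketch (illustrative; the function name is hypothetical):
+ * unmarshalling the LSN stored at "bp" in a serialized log record image,
+ * swapping only when the host is big-endian.
+ */
+static void
+__example_read_lsn(ENV *env, u_int8_t *bp, DB_LSN *lsnp)
+{
+	LOGCOPY_TOLSN(env, lsnp, bp);
+}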
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* !_DB_SWAP_H_ */
diff --git a/src/dbinc/db_upgrade.h b/src/dbinc/db_upgrade.h
new file mode 100644
index 00000000..45fb624d
--- /dev/null
+++ b/src/dbinc/db_upgrade.h
@@ -0,0 +1,248 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_UPGRADE_H_
+#define _DB_UPGRADE_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * This file defines the metadata pages from the previous release.
+ * These structures are only used to upgrade old versions of databases.
+ */
+
+/* Structures from the 3.1 release */
+typedef struct _dbmeta31 {
+ DB_LSN lsn; /* 00-07: LSN. */
+ db_pgno_t pgno; /* 08-11: Current page number. */
+ u_int32_t magic; /* 12-15: Magic number. */
+ u_int32_t version; /* 16-19: Version. */
+ u_int32_t pagesize; /* 20-23: Pagesize. */
+ u_int8_t unused1[1]; /* 24: Unused. */
+ u_int8_t type; /* 25: Page type. */
+ u_int8_t unused2[2]; /* 26-27: Unused. */
+ u_int32_t free; /* 28-31: Free list page number. */
+ DB_LSN unused3; /* 32-39: Unused. */
+ u_int32_t key_count; /* 40-43: Cached key count. */
+ u_int32_t record_count; /* 44-47: Cached record count. */
+ u_int32_t flags; /* 48-51: Flags: unique to each AM. */
+ /* 52-71: Unique file ID. */
+ u_int8_t uid[DB_FILE_ID_LEN];
+} DBMETA31;
+
+typedef struct _btmeta31 {
+ DBMETA31 dbmeta; /* 00-71: Generic meta-data header. */
+
+ u_int32_t maxkey; /* 72-75: Btree: Maxkey. */
+ u_int32_t minkey; /* 76-79: Btree: Minkey. */
+ u_int32_t re_len; /* 80-83: Recno: fixed-length record length. */
+ u_int32_t re_pad; /* 84-87: Recno: fixed-length record pad. */
+ u_int32_t root; /* 88-91: Root page. */
+
+ /*
+ * Minimum page size is 128.
+ */
+} BTMETA31;
+
+/************************************************************************
+ HASH METADATA PAGE LAYOUT
+ ************************************************************************/
+typedef struct _hashmeta31 {
+ DBMETA31 dbmeta; /* 00-71: Generic meta-data page header. */
+
+ u_int32_t max_bucket; /* 72-75: ID of Maximum bucket in use */
+ u_int32_t high_mask; /* 76-79: Modulo mask into table */
+ u_int32_t low_mask; /* 80-83: Modulo mask into table lower half */
+ u_int32_t ffactor; /* 84-87: Fill factor */
+ u_int32_t nelem; /* 88-91: Number of keys in hash table */
+ u_int32_t h_charkey; /* 92-95: Value of hash(CHARKEY) */
+#define NCACHED 32 /* number of spare points */
+ /* 96-223: Spare pages for overflow */
+ u_int32_t spares[NCACHED];
+
+ /*
+ * Minimum page size is 256.
+ */
+} HMETA31;
+
+/*
+ * QAM Meta data page structure
+ *
+ */
+typedef struct _qmeta31 {
+ DBMETA31 dbmeta; /* 00-71: Generic meta-data header. */
+
+ u_int32_t start; /* 72-75: Start offset. */
+ u_int32_t first_recno; /* 76-79: First not deleted record. */
+ u_int32_t cur_recno; /* 80-83: Last recno allocated. */
+ u_int32_t re_len; /* 84-87: Fixed-length record length. */
+ u_int32_t re_pad; /* 88-91: Fixed-length record pad. */
+ u_int32_t rec_page; /* 92-95: Records Per Page. */
+
+ /*
+ * Minimum page size is 128.
+ */
+} QMETA31;
+/* Structures from the 3.2 release */
+typedef struct _qmeta32 {
+ DBMETA31 dbmeta; /* 00-71: Generic meta-data header. */
+
+ u_int32_t first_recno; /* 72-75: First not deleted record. */
+ u_int32_t cur_recno; /* 76-79: Last recno allocated. */
+ u_int32_t re_len; /* 80-83: Fixed-length record length. */
+ u_int32_t re_pad; /* 84-87: Fixed-length record pad. */
+ u_int32_t rec_page; /* 88-91: Records Per Page. */
+ u_int32_t page_ext; /* 92-95: Pages per extent */
+
+ /*
+ * Minimum page size is 128.
+ */
+} QMETA32;
+
+/* Structures from the 3.0 release */
+
+typedef struct _dbmeta30 {
+ DB_LSN lsn; /* 00-07: LSN. */
+ db_pgno_t pgno; /* 08-11: Current page number. */
+ u_int32_t magic; /* 12-15: Magic number. */
+ u_int32_t version; /* 16-19: Version. */
+ u_int32_t pagesize; /* 20-23: Pagesize. */
+ u_int8_t unused1[1]; /* 24: Unused. */
+ u_int8_t type; /* 25: Page type. */
+ u_int8_t unused2[2]; /* 26-27: Unused. */
+ u_int32_t free; /* 28-31: Free list page number. */
+ u_int32_t flags; /* 32-35: Flags: unique to each AM. */
+ /* 36-55: Unique file ID. */
+ u_int8_t uid[DB_FILE_ID_LEN];
+} DBMETA30;
+
+/************************************************************************
+ BTREE METADATA PAGE LAYOUT
+ ************************************************************************/
+typedef struct _btmeta30 {
+ DBMETA30 dbmeta; /* 00-55: Generic meta-data header. */
+
+ u_int32_t maxkey; /* 56-59: Btree: Maxkey. */
+ u_int32_t minkey; /* 60-63: Btree: Minkey. */
+ u_int32_t re_len; /* 64-67: Recno: fixed-length record length. */
+ u_int32_t re_pad; /* 68-71: Recno: fixed-length record pad. */
+ u_int32_t root; /* 72-75: Root page. */
+
+ /*
+ * Minimum page size is 128.
+ */
+} BTMETA30;
+
+/************************************************************************
+ HASH METADATA PAGE LAYOUT
+ ************************************************************************/
+typedef struct _hashmeta30 {
+ DBMETA30 dbmeta; /* 00-55: Generic meta-data page header. */
+
+ u_int32_t max_bucket; /* 56-59: ID of Maximum bucket in use */
+ u_int32_t high_mask; /* 60-63: Modulo mask into table */
+ u_int32_t low_mask; /* 64-67: Modulo mask into table lower half */
+ u_int32_t ffactor; /* 68-71: Fill factor */
+ u_int32_t nelem; /* 72-75: Number of keys in hash table */
+ u_int32_t h_charkey; /* 76-79: Value of hash(CHARKEY) */
+#define NCACHED30 32 /* number of spare points */
+ /* 80-207: Spare pages for overflow */
+ u_int32_t spares[NCACHED30];
+
+ /*
+ * Minimum page size is 256.
+ */
+} HMETA30;
+
+/************************************************************************
+ QUEUE METADATA PAGE LAYOUT
+ ************************************************************************/
+/*
+ * QAM metadata page structure.
+ */
+typedef struct _qmeta30 {
+ DBMETA30 dbmeta; /* 00-55: Generic meta-data header. */
+
+ u_int32_t start; /* 56-59: Start offset. */
+ u_int32_t first_recno; /* 60-63: First not deleted record. */
+ u_int32_t cur_recno; /* 64-67: Last recno allocated. */
+ u_int32_t re_len; /* 68-71: Fixed-length record length. */
+ u_int32_t re_pad; /* 72-75: Fixed-length record pad. */
+ u_int32_t rec_page; /* 76-79: Records Per Page. */
+
+ /*
+ * Minimum page size is 128.
+ */
+} QMETA30;
+
+/* Structures from Release 2.x */
+
+/************************************************************************
+ BTREE METADATA PAGE LAYOUT
+ ************************************************************************/
+
+/*
+ * Btree metadata page layout:
+ */
+typedef struct _btmeta2X {
+ DB_LSN lsn; /* 00-07: LSN. */
+ db_pgno_t pgno; /* 08-11: Current page number. */
+ u_int32_t magic; /* 12-15: Magic number. */
+ u_int32_t version; /* 16-19: Version. */
+ u_int32_t pagesize; /* 20-23: Pagesize. */
+ u_int32_t maxkey; /* 24-27: Btree: Maxkey. */
+ u_int32_t minkey; /* 28-31: Btree: Minkey. */
+ u_int32_t free; /* 32-35: Free list page number. */
+ u_int32_t flags; /* 36-39: Flags. */
+ u_int32_t re_len; /* 40-43: Recno: fixed-length record length. */
+ u_int32_t re_pad; /* 44-47: Recno: fixed-length record pad. */
+ /* 48-67: Unique file ID. */
+ u_int8_t uid[DB_FILE_ID_LEN];
+} BTMETA2X;
+
+/************************************************************************
+ HASH METADATA PAGE LAYOUT
+ ************************************************************************/
+
+/*
+ * Hash metadata page layout:
+ */
+/* Hash Table Information */
+typedef struct hashhdr { /* Disk resident portion */
+ DB_LSN lsn; /* 00-07: LSN of the header page */
+ db_pgno_t pgno; /* 08-11: Page number (btree compatibility). */
+ u_int32_t magic; /* 12-15: Magic NO for hash tables */
+ u_int32_t version; /* 16-19: Version ID */
+ u_int32_t pagesize; /* 20-23: Bucket/Page Size */
+ u_int32_t ovfl_point; /* 24-27: Overflow page allocation location */
+ u_int32_t last_freed; /* 28-31: Last freed overflow page pgno */
+ u_int32_t max_bucket; /* 32-35: ID of Maximum bucket in use */
+ u_int32_t high_mask; /* 36-39: Modulo mask into table */
+ u_int32_t low_mask; /* 40-43: Modulo mask into table lower half */
+ u_int32_t ffactor; /* 44-47: Fill factor */
+ u_int32_t nelem; /* 48-51: Number of keys in hash table */
+ u_int32_t h_charkey; /* 52-55: Value of hash(CHARKEY) */
+ u_int32_t flags; /* 56-59: Allow duplicates. */
+#define NCACHED2X 32 /* number of spare points */
+ /* 60-187: Spare pages for overflow */
+ u_int32_t spares[NCACHED2X];
+ /* 188-207: Unique file ID. */
+ u_int8_t uid[DB_FILE_ID_LEN];
+
+ /*
+ * Minimum page size is 256.
+ */
+} HASHHDR;
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_DB_UPGRADE_H_ */
diff --git a/src/dbinc/db_verify.h b/src/dbinc/db_verify.h
new file mode 100644
index 00000000..68acbf6c
--- /dev/null
+++ b/src/dbinc/db_verify.h
@@ -0,0 +1,210 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_VERIFY_H_
+#define _DB_VERIFY_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * Structures and macros for the storage and retrieval of all information
+ * needed for inter-page verification of a database.
+ */
+
+/*
+ * EPRINT is the macro for error printing. Takes as an arg the arg set
+ * for DB->err.
+ */
+#define EPRINT(x) do { \
+ if (!LF_ISSET(DB_SALVAGE)) \
+ __db_errx x; \
+} while (0)
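+
+/* Note: LF_ISSET here assumes the conventional "flags" variable in scope. */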
+
+/* Complain about a totally zeroed page where we don't expect one. */
+#define ZEROPG_ERR_PRINT(dbenv, pgno, str) do { \
+ EPRINT(((dbenv), DB_STR_A("0501", \
+ "Page %lu: %s is of inappropriate type %lu", "%lu %s %lu"), \
+ (u_long)(pgno), str, (u_long)P_INVALID)); \
+ EPRINT(((dbenv), DB_STR_A("0502", \
+ "Page %lu: totally zeroed page", \
+ "%lu"), (u_long)(pgno))); \
+} while (0)
+
+/*
+ * Note that 0 is, in general, a valid pgno, despite equaling PGNO_INVALID;
+ * we have to test it separately where it's not appropriate.
+ */
+#define IS_VALID_PGNO(x) ((x) <= vdp->last_pgno)
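+
+/* Note: the macro assumes a VRFY_DBINFO pointer named "vdp" in scope. */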
+
+/*
+ * VRFY_DBINFO is the fundamental structure; it either represents the database
+ * of subdatabases, or the sole database if there are no subdatabases.
+ */
+struct __vrfy_dbinfo {
+ DB_THREAD_INFO *thread_info;
+ /* Info about this database in particular. */
+ DBTYPE type;
+
+ /* List of subdatabase meta pages, if any. */
+ LIST_HEAD(__subdbs, __vrfy_childinfo) subdbs;
+
+ /* Transaction handle for CDS group. */
+ DB_TXN *txn;
+
+ /* File-global info--stores VRFY_PAGEINFOs for each page. */
+ DB *pgdbp;
+
+ /* Child database--stores VRFY_CHILDINFOs of each page. */
+ DB *cdbp;
+
+ /* Page info structures currently in use. */
+ LIST_HEAD(__activepips, __vrfy_pageinfo) activepips;
+
+ /*
+ * DB we use to keep track of which pages are linked somehow
+ * during verification. 0 is the default, "unseen"; 1 is seen.
+ */
+ DB *pgset;
+
+ /*
+ * This is a database we use during salvaging to keep track of which
+ * overflow and dup pages we need to come back to at the end and print
+ * with key "UNKNOWN". Pages which print with a good key get set
+ * to SALVAGE_IGNORE; others get set, as appropriate, to SALVAGE_LDUP,
+ * SALVAGE_LRECNODUP, SALVAGE_OVERFLOW for normal db overflow pages,
+ * and SALVAGE_BTREE, SALVAGE_LRECNO, and SALVAGE_HASH for subdb
+ * pages.
+ */
+#define SALVAGE_INVALID 0
+#define SALVAGE_IGNORE 1
+#define SALVAGE_LDUP 2
+#define SALVAGE_IBTREE 3
+#define SALVAGE_OVERFLOW 4
+#define SALVAGE_LBTREE 5
+#define SALVAGE_HASH 6
+#define SALVAGE_LRECNO 7
+#define SALVAGE_LRECNODUP 8
+ DB *salvage_pages;
+
+ db_pgno_t last_pgno;
+ db_pgno_t meta_last_pgno;
+ db_pgno_t pgs_remaining; /* For dbp->db_feedback(). */
+
+ /*
+ * These are used during __bam_vrfy_subtree to keep track, while
+ * walking up and down the Btree structure, of the prev- and next-page
+ * chain of leaf pages and verify that it's intact. Also, make sure
+ * that this chain contains pages of only one type.
+ */
+ db_pgno_t prev_pgno;
+ db_pgno_t next_pgno;
+ u_int8_t leaf_type;
+
+ /* Queue needs these to verify data pages in the first pass. */
+ u_int32_t re_pad; /* Record pad character. */
+ u_int32_t re_len; /* Record length. */
+ u_int32_t rec_page;
+ u_int32_t page_ext;
+ u_int32_t first_recno;
+ u_int32_t last_recno;
+ int nextents;
+ db_pgno_t *extents;
+
+#define SALVAGE_PRINTABLE 0x01 /* Output printable chars literally. */
+#define SALVAGE_PRINTHEADER 0x02 /* Print the unknown-key header. */
+#define SALVAGE_PRINTFOOTER 0x04 /* Print the unknown-key footer. */
+#define SALVAGE_HASSUBDBS 0x08 /* There are subdatabases to salvage. */
+#define VRFY_LEAFCHAIN_BROKEN 0x10 /* Lost one or more Btree leaf pgs. */
+#define VRFY_QMETA_SET 0x20 /* We've seen a QUEUE meta page and
+ set things up for it. */
+ u_int32_t flags;
+}; /* VRFY_DBINFO */
+
+/*
+ * The amount of state information we need per-page is small enough that
+ * it's not worth the trouble to define separate structures for each
+ * possible type of page, and since we're doing verification with these we
+ * have to be open to the possibility that page N will be of a completely
+ * unexpected type anyway. So we define one structure here with all the
+ * info we need for inter-page verification.
+ */
+struct __vrfy_pageinfo {
+ u_int8_t type;
+ u_int8_t bt_level;
+ u_int8_t unused1;
+ u_int8_t unused2;
+ db_pgno_t pgno;
+ db_pgno_t prev_pgno;
+ db_pgno_t next_pgno;
+
+ /* meta pages */
+ db_pgno_t root;
+ db_pgno_t free; /* Free list head. */
+
+ db_indx_t entries; /* Actual number of entries. */
+ u_int16_t unused;
+ db_recno_t rec_cnt; /* Record count. */
+ u_int32_t re_pad; /* Record pad character. */
+ u_int32_t re_len; /* Record length. */
+ u_int32_t bt_minkey;
+ u_int32_t h_ffactor;
+ u_int32_t h_nelem;
+
+ /* overflow pages */
+ /*
+ * Note that refcount is the refcount for an overflow page; pi_refcount
+ * is this structure's own refcount!
+ */
+ u_int32_t refcount;
+ u_int32_t olen;
+
+#define VRFY_DUPS_UNSORTED 0x0001 /* Have to flag the negative! */
+#define VRFY_HAS_CHKSUM 0x0002
+#define VRFY_HAS_DUPS 0x0004
+#define VRFY_HAS_DUPSORT 0x0008 /* Has the flag set. */
+#define VRFY_HAS_PART_RANGE 0x0010 /* Has the flag set. */
+#define VRFY_HAS_PART_CALLBACK 0x0020 /* Has the flag set. */
+#define VRFY_HAS_RECNUMS 0x0040
+#define VRFY_HAS_SUBDBS 0x0080
+#define VRFY_INCOMPLETE 0x0100 /* Meta or item order checks incomp. */
+#define VRFY_IS_ALLZEROES 0x0200 /* Hash page we haven't touched? */
+#define VRFY_IS_FIXEDLEN 0x0400
+#define VRFY_IS_RECNO 0x0800
+#define VRFY_IS_RRECNO 0x1000
+#define VRFY_OVFL_LEAFSEEN 0x2000
+#define VRFY_HAS_COMPRESS 0x4000
+#define VRFY_NONEXISTENT 0x8000
+ u_int32_t flags;
+
+ LIST_ENTRY(__vrfy_pageinfo) links;
+ u_int32_t pi_refcount;
+}; /* VRFY_PAGEINFO */
+
+struct __vrfy_childinfo {
+ /* The following fields are set by the caller of __db_vrfy_childput. */
+ db_pgno_t pgno;
+
+#define V_DUPLICATE 1 /* off-page dup metadata */
+#define V_OVERFLOW 2 /* overflow page */
+#define V_RECNO 3 /* btree internal or leaf page */
+ u_int32_t type;
+ db_recno_t nrecs; /* record count on a btree subtree */
+ u_int32_t tlen; /* ovfl. item total size */
+
+ /* The following field is maintained by __db_vrfy_childput. */
+ u_int32_t refcnt; /* # of times parent points to child. */
+
+ LIST_ENTRY(__vrfy_childinfo) links;
+}; /* VRFY_CHILDINFO */
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_DB_VERIFY_H_ */
diff --git a/src/dbinc/debug.h b/src/dbinc/debug.h
new file mode 100644
index 00000000..a8da000d
--- /dev/null
+++ b/src/dbinc/debug.h
@@ -0,0 +1,283 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_DEBUG_H_
+#define _DB_DEBUG_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * Turn on additional error checking via __attribute__ in gcc 3.X; on
+ * compilers without __attribute__ support (gcc before 2.5, or non-gcc
+ * compilers), define the keyword away.
+ */
+#if !defined(__GNUC__) || __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 5)
+#define __attribute__(s)
+#endif
+
+/*
+ * When running with #DIAGNOSTIC defined, we smash memory and do memory
+ * guarding with a special byte value.
+ */
+#define CLEAR_BYTE 0xdb
+#define GUARD_BYTE 0xdc
+
+/*
+ * DB assertions.
+ *
+ * Use __STDC__ rather than STDC_HEADERS; the #e construct is ANSI C specific.
+ */
+#if defined(DIAGNOSTIC) && defined(__STDC__)
+#define DB_ASSERT(env, e) \
+ ((e) ? (void)0 : __db_assert(env, #e, __FILE__, __LINE__))
+#else
+#define DB_ASSERT(env, e) NOP_STATEMENT
+#endif
+
+/*
+ * "Shut that bloody compiler up!"
+ *
+ * Unused, or not-used-yet variable. We need to write and then read the
+ * variable, some compilers are too bloody clever by half.
+ */
+#define COMPQUIET(n, v) do { \
+ (n) = (v); \
+ (n) = (n); \
+} while (0)
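+
+/*
+ * For example (illustrative):
+ *	COMPQUIET(ret, 0);
+ * quiets "set but not used" warnings for a variable that is only
+ * consumed on some compilation paths.
+ */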
+
+/*
+ * Purify and other run-time tools complain about uninitialized reads/writes
+ * of structure fields whose only purpose is padding, as well as when heap
+ * memory that was never initialized is written to disk.
+ */
+#ifdef UMRW
+#define UMRW_SET(v) (v) = 0
+#else
+#define UMRW_SET(v) NOP_STATEMENT
+#endif
+
+/*
+ * Errors are in one of two areas: a Berkeley DB error, or a system-level
+ * error. We use db_strerror to translate the former and __os_strerror to
+ * translate the latter.
+ */
+typedef enum {
+ DB_ERROR_NOT_SET=0,
+ DB_ERROR_SET=1,
+ DB_ERROR_SYSTEM=2
+} db_error_set_t;
+
+/*
+ * Message handling. Use a macro instead of a function because va_list
+ * references to variadic arguments cannot be reset to the beginning of the
+ * variadic argument list (and then rescanned), by functions other than the
+ * original routine that took the variadic list of arguments.
+ */
+#if defined(STDC_HEADERS) || defined(__cplusplus)
+#define DB_REAL_ERR(dbenv, error, error_set, app_call, fmt) { \
+ va_list __ap; \
+ \
+ /* Call the application's callback function, if specified. */ \
+ va_start(__ap, fmt); \
+ if ((dbenv) != NULL && (dbenv)->db_errcall != NULL) \
+ __db_errcall(dbenv, error, error_set, fmt, __ap); \
+ va_end(__ap); \
+ \
+ /* \
+ * If the application specified a file descriptor, write to it. \
+ * If we wrote to neither the application's callback routine nor \
+ * its file descriptor, and it's an application error message \
+ * using {DbEnv,Db}.{err,errx} or the application has never \
+ * configured an output channel, default by writing to stderr. \
+ */ \
+ va_start(__ap, fmt); \
+ if ((dbenv) == NULL || \
+ (dbenv)->db_errfile != NULL || \
+ ((dbenv)->db_errcall == NULL && \
+ ((app_call) || F_ISSET((dbenv)->env, ENV_NO_OUTPUT_SET)))) \
+ __db_errfile(dbenv, error, error_set, fmt, __ap); \
+ va_end(__ap); \
+}
+#else
+#define DB_REAL_ERR(dbenv, error, error_set, app_call, fmt) { \
+ va_list __ap; \
+ \
+ /* Call the application's callback function, if specified. */ \
+ va_start(__ap); \
+ if ((dbenv) != NULL && (dbenv)->db_errcall != NULL) \
+ __db_errcall(dbenv, error, error_set, fmt, __ap); \
+ va_end(__ap); \
+ \
+ /* \
+ * If the application specified a file descriptor, write to it. \
+ * If we wrote to neither the application's callback routine nor \
+ * its file descriptor, and it's an application error message \
+ * using {DbEnv,Db}.{err,errx} or the application has never \
+ * configured an output channel, default by writing to stderr. \
+ */ \
+ va_start(__ap); \
+ if ((dbenv) == NULL || \
+ (dbenv)->db_errfile != NULL || \
+ ((dbenv)->db_errcall == NULL && \
+ ((app_call) || F_ISSET((dbenv)->env, ENV_NO_OUTPUT_SET)))) \
+ __db_errfile(dbenv, error, error_set, fmt, __ap); \
+ va_end(__ap); \
+}
+#endif
+#if defined(STDC_HEADERS) || defined(__cplusplus)
+#define DB_REAL_MSG(dbenv, fmt) { \
+ va_list __ap; \
+ \
+ /* Call the application's callback function, if specified. */ \
+ va_start(__ap, fmt); \
+ if ((dbenv) != NULL && (dbenv)->db_msgcall != NULL) \
+ __db_msgcall(dbenv, fmt, __ap); \
+ va_end(__ap); \
+ \
+ /* \
+ * If the application specified a file descriptor, write to it. \
+ * If we wrote to neither the application's callback routine nor \
+ * its file descriptor, write to stdout. \
+ */ \
+ va_start(__ap, fmt); \
+ if ((dbenv) == NULL || \
+ (dbenv)->db_msgfile != NULL || \
+ (dbenv)->db_msgcall == NULL) { \
+ __db_msgfile(dbenv, fmt, __ap); \
+ } \
+ va_end(__ap); \
+}
+#else
+#define DB_REAL_MSG(dbenv, fmt) { \
+ va_list __ap; \
+ \
+ /* Call the application's callback function, if specified. */ \
+ va_start(__ap); \
+ if ((dbenv) != NULL && (dbenv)->db_msgcall != NULL) \
+ __db_msgcall(dbenv, fmt, __ap); \
+ va_end(__ap); \
+ \
+ /* \
+ * If the application specified a file descriptor, write to it. \
+ * If we wrote to neither the application's callback routine nor \
+ * its file descriptor, write to stdout. \
+ */ \
+ va_start(__ap); \
+ if ((dbenv) == NULL || \
+ (dbenv)->db_msgfile != NULL || \
+ (dbenv)->db_msgcall == NULL) { \
+ __db_msgfile(dbenv, fmt, __ap); \
+ } \
+ va_end(__ap); \
+}
+#endif
+
+/*
+ * Debugging macro to log operations.
+ * If DEBUG_WOP is defined, log operations that modify the database.
+ * If DEBUG_ROP is defined, log operations that read the database.
+ *
+ * C cursor (DBC)
+ * T txn
+ * O operation (string)
+ * K key
+ * A data
+ * F flags
+ */
+#define LOG_OP(C, T, O, K, A, F) { \
+ DB_LSN __lsn; \
+ DBT __op; \
+ if (DBC_LOGGING((C))) { \
+ memset(&__op, 0, sizeof(__op)); \
+ __op.data = O; \
+ __op.size = (u_int32_t)strlen(O) + 1; \
+ (void)__db_debug_log((C)->env, T, &__lsn, 0, \
+ &__op, (C)->dbp->log_filename->id, K, A, F); \
+ } \
+}
+#ifdef DEBUG_ROP
+#define DEBUG_LREAD(C, T, O, K, A, F) LOG_OP(C, T, O, K, A, F)
+#else
+#define DEBUG_LREAD(C, T, O, K, A, F)
+#endif
+#ifdef DEBUG_WOP
+#define DEBUG_LWRITE(C, T, O, K, A, F) LOG_OP(C, T, O, K, A, F)
+#else
+#define DEBUG_LWRITE(C, T, O, K, A, F)
+#endif
+
+/*
+ * Hook for testing recovery at various places in the create/delete paths.
+ * Hook for testing subdb locks.
+ */
+#if CONFIG_TEST
+#define DB_TEST_SUBLOCKS(env, flags) do { \
+ if ((env)->test_abort == DB_TEST_SUBDB_LOCKS) \
+ (flags) |= DB_LOCK_NOWAIT; \
+} while (0)
+
+#define DB_ENV_TEST_RECOVERY(env, val, ret, name) do { \
+ int __ret; \
+ PANIC_CHECK((env)); \
+ if ((env)->test_copy == (val)) { \
+ /* COPY the FILE */ \
+ if ((__ret = __db_testcopy((env), NULL, (name))) != 0) \
+ (ret) = __env_panic((env), __ret); \
+ } \
+ if ((env)->test_abort == (val)) { \
+ /* ABORT the TXN */ \
+ (env)->test_abort = 0; \
+ (ret) = EINVAL; \
+ goto db_tr_err; \
+ } \
+} while (0)
+
+#define DB_TEST_RECOVERY(dbp, val, ret, name) do { \
+ ENV *__env = (dbp)->env; \
+ int __ret; \
+ PANIC_CHECK(__env); \
+ if (__env->test_copy == (val)) { \
+ /* Copy the file. */ \
+ if (F_ISSET((dbp), \
+ DB_AM_OPEN_CALLED) && (dbp)->mpf != NULL) \
+ (void)__db_sync(dbp); \
+ if ((__ret = \
+ __db_testcopy(__env, (dbp), (name))) != 0) \
+ (ret) = __env_panic(__env, __ret); \
+ } \
+ if (__env->test_abort == (val)) { \
+ /* Abort the transaction. */ \
+ __env->test_abort = 0; \
+ (ret) = EINVAL; \
+ goto db_tr_err; \
+ } \
+} while (0)
+
+#define DB_TEST_RECOVERY_LABEL db_tr_err:
+
+#define DB_TEST_SET(field, val) do { \
+ if (field == (val)) \
+ goto db_tr_err; \
+} while (0)
+
+#define DB_TEST_WAIT(env, val) \
+ if ((val) != 0) \
+ __os_yield((env), (u_long)(val), 0)
+#else
+#define DB_TEST_SUBLOCKS(env, flags)
+#define DB_ENV_TEST_RECOVERY(env, val, ret, name)
+#define DB_TEST_RECOVERY(dbp, val, ret, name)
+#define DB_TEST_RECOVERY_LABEL
+#define DB_TEST_SET(field, val)
+#define DB_TEST_WAIT(env, val)
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_DB_DEBUG_H_ */
diff --git a/src/dbinc/fop.h b/src/dbinc/fop.h
new file mode 100644
index 00000000..94f27f9f
--- /dev/null
+++ b/src/dbinc/fop.h
@@ -0,0 +1,32 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_FOP_H_
+#define _DB_FOP_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#define MAKE_INMEM(D) do { \
+ F_SET((D), DB_AM_INMEM); \
+ (void)__memp_set_flags((D)->mpf, DB_MPOOL_NOFILE, 1); \
+} while (0)
+
+#define CLR_INMEM(D) do { \
+ F_CLR((D), DB_AM_INMEM); \
+ (void)__memp_set_flags((D)->mpf, DB_MPOOL_NOFILE, 0); \
+} while (0)
+
+#include "dbinc_auto/fileops_auto.h"
+#include "dbinc_auto/fileops_ext.h"
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_DB_FOP_H_ */
diff --git a/src/dbinc/globals.h b/src/dbinc/globals.h
new file mode 100644
index 00000000..95e5c118
--- /dev/null
+++ b/src/dbinc/globals.h
@@ -0,0 +1,105 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_GLOBALS_H_
+#define _DB_GLOBALS_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*******************************************************
+ * Global variables.
+ *
+ * Held in a single structure to minimize the name-space pollution.
+ *******************************************************/
+#ifdef HAVE_VXWORKS
+#include "semLib.h"
+#endif
+
+typedef struct __db_globals {
+#ifdef HAVE_VXWORKS
+ u_int32_t db_global_init; /* VxWorks: inited */
+ SEM_ID db_global_lock; /* VxWorks: global semaphore */
+#endif
+#ifdef DB_WIN32
+#ifndef DB_WINCE
+ /*
+ * These fields are used by the Windows implementation of mutexes.
+ * Usually they are initialized by the first DB API call to lock a
+ * mutex. If that would result in the mutexes being inaccessible by
+ * other threads (e.g., ones which have lesser privileges) the
+ * application may first call db_env_set_win_security().
+ */
+ SECURITY_DESCRIPTOR win_default_sec_desc;
+ SECURITY_ATTRIBUTES win_default_sec_attr;
+#endif
+ SECURITY_ATTRIBUTES *win_sec_attr;
+#endif
+
+ /* TAILQ_HEAD(__envq, __dbenv) envq; */
+ struct __envq {
+ struct __env *tqh_first;
+ struct __env **tqh_last;
+ } envq;
+
+ char *db_line; /* DB display string. */
+
+ char error_buf[40]; /* Error string buffer. */
+
+ int uid_init; /* srand set in UID generator */
+
+ u_long rand_next; /* rand/srand value */
+
+ u_int32_t fid_serial; /* file id counter */
+
+ int db_errno; /* Errno value if not available */
+
+ size_t num_active_pids; /* number of entries in active_pids */
+
+ size_t size_active_pids; /* allocated size of active_pids */
+
+ pid_t *active_pids; /* array of active pids */
+
+ char *saved_errstr; /* saved error string from backup */
+
+ /* Underlying OS interface jump table. */
+ void (*j_assert) __P((const char *, const char *, int));
+ int (*j_close) __P((int));
+ void (*j_dirfree) __P((char **, int));
+ int (*j_dirlist) __P((const char *, char ***, int *));
+ int (*j_exists) __P((const char *, int *));
+ void (*j_free) __P((void *));
+ int (*j_fsync) __P((int));
+ int (*j_ftruncate) __P((int, off_t));
+ int (*j_ioinfo) __P((const char *,
+ int, u_int32_t *, u_int32_t *, u_int32_t *));
+ void *(*j_malloc) __P((size_t));
+ int (*j_file_map) __P((DB_ENV *, char *, size_t, int, void **));
+ int (*j_file_unmap) __P((DB_ENV *, void *));
+ int (*j_open) __P((const char *, int, ...));
+ ssize_t (*j_pread) __P((int, void *, size_t, off_t));
+ ssize_t (*j_pwrite) __P((int, const void *, size_t, off_t));
+ ssize_t (*j_read) __P((int, void *, size_t));
+ void *(*j_realloc) __P((void *, size_t));
+ int (*j_region_map) __P((DB_ENV *, char *, size_t, int *, void **));
+ int (*j_region_unmap) __P((DB_ENV *, void *));
+ int (*j_rename) __P((const char *, const char *));
+ int (*j_seek) __P((int, off_t, int));
+ int (*j_unlink) __P((const char *));
+ ssize_t (*j_write) __P((int, const void *, size_t));
+ int (*j_yield) __P((u_long, u_long));
+} DB_GLOBALS;
+
+extern DB_GLOBALS __db_global_values;
+#define DB_GLOBAL(v) __db_global_values.v
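+
+/*
+ * For example, DB_GLOBAL(fid_serial) expands to
+ * __db_global_values.fid_serial.
+ */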
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_DB_GLOBALS_H_ */
diff --git a/src/dbinc/hash.h b/src/dbinc/hash.h
new file mode 100644
index 00000000..f485128a
--- /dev/null
+++ b/src/dbinc/hash.h
@@ -0,0 +1,173 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994
+ * Margo Seltzer. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Margo Seltzer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_HASH_H_
+#define _DB_HASH_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* Hash internal structure. */
+typedef struct hash_t {
+ db_pgno_t meta_pgno; /* Page number of the meta data page. */
+ u_int32_t revision; /* Revision of subdb metadata. */
+ u_int32_t h_ffactor; /* Fill factor. */
+ u_int32_t h_nelem; /* Number of elements. */
+ /* Hash and compare functions. */
+ u_int32_t (*h_hash) __P((DB *, const void *, u_int32_t));
+ int (*h_compare) __P((DB *, const DBT *, const DBT *));
+} HASH;
+
+/* Cursor structure definitions. */
+typedef struct cursor_t {
+ /* struct __dbc_internal */
+ __DBC_INTERNAL
+
+ /* Hash private part */
+
+ /* Per-thread information */
+ DB_LOCK hlock; /* Metadata page lock. */
+ HMETA *hdr; /* Pointer to meta-data page. */
+ PAGE *split_buf; /* Temporary buffer for splits. */
+
+ /* Hash cursor information */
+ db_pgno_t bucket; /* Bucket we are traversing. */
+ db_pgno_t lbucket; /* Bucket for which we are locked. */
+ db_indx_t dup_off; /* Offset within a duplicate set. */
+ db_indx_t dup_len; /* Length of current duplicate. */
+ db_indx_t dup_tlen; /* Total length of duplicate entry. */
+ u_int32_t seek_size; /* Number of bytes we need for add. */
+ db_pgno_t seek_found_page;/* Page on which we can insert. */
+ db_indx_t seek_found_indx;/* Insert position for item. */
+ u_int32_t order; /* Relative order among deleted curs. */
+
+#define H_CONTINUE 0x0001 /* Join--search strictly fwd for data */
+#define H_CONTRACT 0x0002 /* Table contracted.*/
+#define H_DELETED 0x0004 /* Cursor item is deleted. */
+#define H_DUPONLY 0x0008 /* Dups only; do not change key. */
+#define H_EXPAND 0x0010 /* Table expanded. */
+#define H_ISDUP 0x0020 /* Cursor is within duplicate set. */
+#define H_NEXT_NODUP 0x0040 /* Get next non-dup entry. */
+#define H_NOMORE 0x0080 /* No more entries in bucket. */
+#define H_OK 0x0100 /* Request succeeded. */
+ u_int32_t flags;
+} HASH_CURSOR;
+
+/* Test string. */
+#define CHARKEY "%$sniglet^&"
+
+/* Overflow management */
+/*
+ * The spares table indicates the page number at which each doubling begins.
+ * From this page number we subtract the number of buckets already allocated
+ * so that we can do a simple addition to calculate the page number here.
+ */
+#define BS_TO_PAGE(bucket, spares) \
+ ((bucket) + (spares)[__db_log2((bucket) + 1)])
+#define BUCKET_TO_PAGE(I, B) (BS_TO_PAGE((B), (I)->hdr->spares))
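+
+/*
+ * For illustration: bucket 5 belongs to doubling __db_log2(5 + 1) == 3,
+ * so it lives on page 5 + spares[3]; each spares[] entry already has
+ * the number of buckets allocated before its doubling subtracted out.
+ */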
+
+/* Constraints on how much data goes on a page. */
+
+#define MINFILL 4
+#define ISBIG(I, N) (((N) > ((I)->hdr->dbmeta.pagesize / MINFILL)) ? 1 : 0)
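+
+/*
+ * For example, with a 4096-byte page and MINFILL of 4, any item larger
+ * than 1024 bytes is "big" and is stored off-page.
+ */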
+
+/* Shorthands for accessing structure */
+#define NDX_INVALID 0xFFFF
+#define BUCKET_INVALID 0xFFFFFFFF
+
+/* On-page duplicates are stored as a string of size-data-size triples. */
+#define DUP_SIZE(len) ((len) + 2 * sizeof(db_indx_t))
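+
+/*
+ * For illustration: with a 2-byte db_indx_t, a 10-byte duplicate
+ * occupies DUP_SIZE(10) == 14 bytes on the page.
+ */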
+
+/* Log messages types (these are subtypes within a record type) */
+/* These bits are obsolete and are only needed for down rev logs. */
+#define PAIR_KEYMASK 0x1
+#define PAIR_DATAMASK 0x2
+#define PAIR_DUPMASK 0x4
+#define PAIR_MASK 0xf
+#define PAIR_ISKEYBIG(N) (N & PAIR_KEYMASK)
+#define PAIR_ISDATABIG(N) (N & PAIR_DATAMASK)
+#define PAIR_ISDATADUP(N) (N & PAIR_DUPMASK)
+#define OPCODE_OF(N) (N & ~PAIR_MASK)
+
+/* Operators for hash recover routines. */
+#define PUTPAIR 0x20
+#define DELPAIR 0x30
+#define PUTOVFL 0x40
+#define DELOVFL 0x50
+#define HASH_UNUSED1 0x60
+#define HASH_UNUSED2 0x70
+#define SPLITOLD 0x80
+#define SPLITNEW 0x90
+#define SORTPAGE 0x100
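+
+/*
+ * For example, a down-rev log subtype of (PUTPAIR | PAIR_KEYMASK),
+ * i.e. 0x21, marked a put whose key was a big (off-page) item;
+ * OPCODE_OF(0x21) recovers PUTPAIR.
+ */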
+
+/* Flags to control behavior of __ham_del_pair */
+#define HAM_DEL_NO_CURSOR 0x01 /* Don't do any cursor adjustment */
+#define HAM_DEL_NO_RECLAIM 0x02 /* Don't reclaim empty pages */
+/* Just delete onpage items (even if they are references to off-page items). */
+#define HAM_DEL_IGNORE_OFFPAGE 0x04
+
+typedef enum {
+ DB_HAM_CURADJ_DEL = 1,
+ DB_HAM_CURADJ_ADD = 2,
+ DB_HAM_CURADJ_ADDMOD = 3,
+ DB_HAM_CURADJ_DELMOD = 4
+} db_ham_curadj;
+
+typedef enum {
+ DB_HAM_CHGPG = 1,
+ DB_HAM_DELFIRSTPG = 2,
+ DB_HAM_DELMIDPG = 3,
+ DB_HAM_DELLASTPG = 4,
+ DB_HAM_DUP = 5,
+ DB_HAM_SPLIT = 6
+} db_ham_mode;
+
+#if defined(__cplusplus)
+}
+#endif
+
+#include "dbinc_auto/hash_auto.h"
+#include "dbinc_auto/hash_ext.h"
+#include "dbinc/db_am.h"
+#endif /* !_DB_HASH_H_ */
diff --git a/src/dbinc/heap.h b/src/dbinc/heap.h
new file mode 100644
index 00000000..ca3407e0
--- /dev/null
+++ b/src/dbinc/heap.h
@@ -0,0 +1,59 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _DB_HEAP_H_
+#define _DB_HEAP_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* Forward structure declarations. */
+struct __heap; typedef struct __heap HEAP;
+struct __heap_cursor; typedef struct __heap_cursor HEAP_CURSOR;
+
+/*
+ * The in-memory, per-heap data structure.
+ */
+struct __heap { /* Heap access method. */
+
+ u_int32_t gbytes; /* Initial heap size: gigabytes component. */
+ u_int32_t bytes; /* Initial heap size: bytes component. */
+ u_int32_t region_size; /* Size of each region. */
+
+ db_pgno_t curregion; /* The region of the next insert. */
+ db_pgno_t maxpgno; /* Maximum page number of a fixed size heap. */
+ int curpgindx; /* The last used offset in the region's space bitmap. */
+};
+
+struct __heap_cursor {
+ /* struct __dbc_internal */
+ __DBC_INTERNAL
+
+ /* Heap private part */
+
+ u_int32_t flags;
+};
+
+#define HEAP_PG_FULL 3 /* No space on page. */
+#define HEAP_PG_GT66 2 /* Page greater than 66% full */
+#define HEAP_PG_GT33 1 /* Page greater than 33% full */
+#define HEAP_PG_LT33 0 /* Page less than 33% full */
+
+#define HEAP_PG_FULL_PCT 5 /* Less than 5% of page is free. */
+#define HEAP_PG_GT66_PCT 33 /* Less than 33% of page is free. */
+#define HEAP_PG_GT33_PCT 66 /* Less than 66% of page is free. */
+
+#if defined(__cplusplus)
+}
+#endif
+
+#include "dbinc_auto/heap_auto.h"
+#include "dbinc_auto/heap_ext.h"
+#include "dbinc/db_am.h"
+#endif /* !_DB_HEAP_H_ */
diff --git a/src/dbinc/hmac.h b/src/dbinc/hmac.h
new file mode 100644
index 00000000..2a495b17
--- /dev/null
+++ b/src/dbinc/hmac.h
@@ -0,0 +1,39 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_HMAC_H_
+#define _DB_HMAC_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * Algorithm specific information.
+ */
+/*
+ * SHA1 checksumming
+ */
+typedef struct {
+ u_int32_t state[5];
+ u_int32_t count[2];
+ unsigned char buffer[64];
+} SHA1_CTX;
+
+/*
+ * AES assumes the SHA1 checksumming (also called MAC)
+ */
+#define DB_MAC_MAGIC "mac derivation key magic value"
+#define DB_ENC_MAGIC "encryption and decryption key value magic"
+
+#if defined(__cplusplus)
+}
+#endif
+
+#include "dbinc_auto/hmac_ext.h"
+#endif /* !_DB_HMAC_H_ */
diff --git a/src/dbinc/lock.h b/src/dbinc/lock.h
new file mode 100644
index 00000000..eab51832
--- /dev/null
+++ b/src/dbinc/lock.h
@@ -0,0 +1,326 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_LOCK_H_
+#define _DB_LOCK_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#define DB_LOCK_DEFAULT_N 1000 /* Default # of locks in region. */
+
+/*
+ * The locker id space is divided between the transaction manager and the lock
+ * manager. Lock IDs start at 1 and go to DB_LOCK_MAXID. Txn IDs start at
+ * DB_LOCK_MAXID + 1 and go up to TXN_MAXIMUM.
+ */
+#define DB_LOCK_INVALIDID 0
+#define DB_LOCK_MAXID 0x7fffffff
+
+/*
+ * A locker's deadlock resolution priority is stored as a 32 bit unsigned
+ * integer. The maximum priority is DB_LOCK_MAXPRIORITY and the default
+ * priority is DB_LOCK_DEFPRIORITY.
+ */
+#define DB_LOCK_DEFPRIORITY 100
+#define DB_LOCK_MAXPRIORITY UINT32_MAX
+
+/*
+ * Out of band value for a lock. Locks contain an offset into a lock region,
+ * so we use an invalid region offset to indicate an invalid or unset lock.
+ */
+#define LOCK_INVALID INVALID_ROFF
+#define LOCK_ISSET(lock) ((lock).off != LOCK_INVALID)
+#define LOCK_INIT(lock) ((lock).off = LOCK_INVALID)
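+
+/*
+ * Typical use (illustrative):
+ *	DB_LOCK lock;
+ *	LOCK_INIT(lock);
+ *	...
+ *	if (LOCK_ISSET(lock))
+ *		(void)__lock_put(env, &lock);
+ */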
+
+/*
+ * Macro to identify a write lock for the purpose of counting locks
+ * for the NUMWRITES option to deadlock detection.
+ */
+#define IS_WRITELOCK(m) \
+ ((m) == DB_LOCK_WRITE || (m) == DB_LOCK_WWRITE || \
+ (m) == DB_LOCK_IWRITE || (m) == DB_LOCK_IWR)
+
+/*
+ * Macros to lock/unlock the lock region as a whole. Mostly used for
+ * initialization.
+ */
+#define LOCK_REGION_LOCK(env) \
+ MUTEX_LOCK(env, ((DB_LOCKREGION *) \
+ (env)->lk_handle->reginfo.primary)->mtx_region)
+#define LOCK_REGION_UNLOCK(env) \
+ MUTEX_UNLOCK(env, ((DB_LOCKREGION *) \
+ (env)->lk_handle->reginfo.primary)->mtx_region)
+
+/*
+ * DB_LOCKREGION --
+ * The lock shared region.
+ */
+
+typedef struct __db_lockregion { /* SHARED */
+ db_mutex_t mtx_region; /* Region mutex. */
+
+ u_int32_t need_dd; /* flag for deadlock detector */
+ u_int32_t detect; /* run dd on every conflict */
+ db_timespec next_timeout; /* next time to expire a lock */
+ db_mutex_t mtx_dd; /* mutex for lock object dd list. */
+ db_mutex_t mtx_lockers; /* mutex for locker allocation. */
+ SH_TAILQ_HEAD(__dobj) dd_objs; /* objects with waiters */
+ /* free locker header */
+ roff_t locker_mem_off; /* block memory for lockers */
+ SH_TAILQ_HEAD(__flocker) free_lockers;
+ SH_TAILQ_HEAD(__lkrs) lockers; /* list of lockers */
+
+ db_timeout_t lk_timeout; /* timeout for locks. */
+ db_timeout_t tx_timeout; /* timeout for txns. */
+
+ u_int32_t locker_t_size; /* size of locker hash table */
+ u_int32_t object_t_size; /* size of object hash table */
+ u_int32_t part_t_size; /* number of partitions */
+
+ roff_t conf_off; /* offset of conflicts array */
+ roff_t obj_off; /* offset of object hash table */
+ roff_t part_off; /* offset of partition array */
+ roff_t stat_off; /* offset to object hash stats */
+ roff_t locker_off; /* offset of locker hash table */
+
+ u_int32_t lock_id; /* Current lock(er) id to allocate. */
+ u_int32_t cur_maxid; /* Current max lock(er) id. */
+ u_int32_t nlockers; /* Current number of lockers. */
+ int32_t nmodes; /* Number of modes in conflict table. */
+ DB_LOCK_STAT stat; /* stats about locking. */
+} DB_LOCKREGION;
+
+/*
+ * Since we will store DBTs in shared memory, we need the equivalent of a
+ * DBT that will work in shared memory.
+ */
+typedef struct __sh_dbt { /* SHARED */
+ u_int32_t size; /* Byte length. */
+ roff_t off; /* Region offset. */
+} SH_DBT;
+
+#define SH_DBT_PTR(p) ((void *)(((u_int8_t *)(p)) + (p)->off))
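+
+/*
+ * The offset is self-relative: SH_DBT_PTR yields the address "off"
+ * bytes past the SH_DBT structure itself, so the macro works no matter
+ * where the shared region happens to be mapped.
+ */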
+
+/*
+ * Object structures; these live in the object hash table.
+ */
+typedef struct __db_lockobj { /* SHARED */
+ u_int32_t indx; /* Hash index of this object. */
+ u_int32_t generation; /* Generation of this object. */
+ SH_DBT lockobj; /* Identifies object locked. */
+ SH_TAILQ_ENTRY links; /* Links for free list or hash list. */
+ SH_TAILQ_ENTRY dd_links; /* Links for dd list. */
+ SH_TAILQ_HEAD(__waitl) waiters; /* List of waiting locks. */
+ SH_TAILQ_HEAD(__holdl) holders; /* List of held locks. */
+ /* Declare room in the object to hold
+ * typical DB lock structures so that
+ * we do not have to allocate them from
+ * shalloc at run-time. */
+ u_int8_t objdata[sizeof(struct __db_ilock)];
+} DB_LOCKOBJ;
+
+/*
+ * Locker structures; these live in the locker hash table.
+ */
+struct __db_locker { /* SHARED */
+ u_int32_t id; /* Locker id. */
+
+ pid_t pid; /* Process owning locker ID */
+ db_threadid_t tid; /* Thread owning locker ID */
+ db_mutex_t mtx_locker; /* Mutex to block on. */
+
+ u_int32_t dd_id; /* Deadlock detector id. */
+
+ u_int32_t nlocks; /* Number of locks held. */
+ u_int32_t nwrites; /* Number of write locks held. */
+ u_int32_t priority; /* Deadlock resolution priority. */
+ u_int32_t nrequest; /* number of requests. */
+
+ roff_t master_locker; /* Locker of master transaction. */
+ roff_t parent_locker; /* Parent of this child. */
+ SH_LIST_HEAD(_child) child_locker; /* List of descendant txns;
+ only used in a "master"
+ txn. */
+ SH_LIST_ENTRY child_link; /* Links transactions in the family;
+ elements of the child_locker
+ list. */
+ SH_TAILQ_ENTRY links; /* Links for free and hash list. */
+ SH_TAILQ_ENTRY ulinks; /* Links in-use list. */
+ SH_LIST_HEAD(_held) heldby; /* Locks held by this locker. */
+ db_timespec lk_expire; /* When current lock expires. */
+ db_timespec tx_expire; /* When this txn expires. */
+ db_timeout_t lk_timeout; /* How long do we let locks live. */
+
+#define DB_LOCKER_DIRTY 0x0001 /* Has write locks. */
+#define DB_LOCKER_INABORT 0x0002 /* Is aborting, don't abort again. */
+#define DB_LOCKER_TIMEOUT 0x0004 /* Has timeout set. */
+#define DB_LOCKER_FAMILY_LOCKER 0x0008 /* Part of a family of lockers. */
+#define DB_LOCKER_HANDLE_LOCKER 0x0010 /* Not associated with a thread. */
+ u_int32_t flags;
+};
+
+/*
+ * Map a hash index into a partition.
+ */
+#define LOCK_PART(reg, ndx) ((ndx) % (reg)->part_t_size)
+
+/*
+ * Structure that contains information about a lock table partition.
+ */
+typedef struct __db_lockpart{ /* SHARED */
+ db_mutex_t mtx_part; /* mutex for partition*/
+ /* free lock header */
+ SH_TAILQ_HEAD(__flock) free_locks;
+ /* free obj header */
+ SH_TAILQ_HEAD(__fobj) free_objs;
+ roff_t lock_mem_off; /* block memory for locks */
+ roff_t lockobj_mem_off;/* block memory for lockobjs */
+#ifdef HAVE_STATISTICS
+ DB_LOCK_PSTAT part_stat; /* Partition stats. */
+#endif
+} DB_LOCKPART;
+
+#define FREE_LOCKS(lt, part) ((lt)->part_array[part].free_locks)
+#define FREE_OBJS(lt, part) ((lt)->part_array[part].free_objs)
+
+/*
+ * DB_LOCKTAB --
+ * The primary library lock data structure (i.e., the one referenced
+ * by the environment, as opposed to the internal one laid out in the region.)
+ */
+struct __db_locktab {
+ ENV *env; /* Environment. */
+ REGINFO reginfo; /* Region information. */
+ u_int8_t *conflicts; /* Pointer to conflict matrix. */
+ DB_LOCKPART *part_array; /* Beginning of partition array. */
+#ifdef HAVE_STATISTICS
+ DB_LOCK_HSTAT *obj_stat; /* Object hash stats array. */
+#endif
+ DB_HASHTAB *obj_tab; /* Beginning of object hash table. */
+ DB_HASHTAB *locker_tab; /* Beginning of locker hash table. */
+};
+
+/*
+ * Test for conflicts.
+ *
+ * Cast HELD and WANTED to ints, they are usually db_lockmode_t enums.
+ */
+#define CONFLICTS(T, R, HELD, WANTED) \
+ (T)->conflicts[((int)HELD) * (R)->nmodes + ((int)WANTED)]
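+
+/*
+ * For illustration: the matrix is stored row-major with nmodes columns
+ * per row, so CONFLICTS(lt, region, DB_LOCK_READ, DB_LOCK_WRITE) is the
+ * entry saying whether a requested write conflicts with a held read.
+ */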
+
+#define OBJ_LINKS_VALID(L) ((L)->links.stqe_prev != -1)
+
+struct __db_lock { /* SHARED */
+ /*
+ * Wait on mutex to wait on lock. You reference your own mutex with
+ * ID 0 and others reference your mutex with ID 1.
+ */
+ db_mutex_t mtx_lock;
+
+ roff_t holder; /* Who holds this lock. */
+ u_int32_t gen; /* Generation count. */
+ SH_TAILQ_ENTRY links; /* Free or holder/waiter list. */
+ SH_LIST_ENTRY locker_links; /* List of locks held by a locker. */
+ u_int32_t refcount; /* Reference count the lock. */
+ db_lockmode_t mode; /* What sort of lock. */
+ roff_t obj; /* Relative offset of object struct. */
+ u_int32_t indx; /* Hash index of this object. */
+ db_status_t status; /* Status of this lock. */
+};
+
+/*
+ * Flag values for __lock_put_internal:
+ * DB_LOCK_DOALL: Unlock all references in this lock (instead of only 1).
+ * DB_LOCK_FREE: Free the lock (used in checklocker).
+ * DB_LOCK_NOPROMOTE: Don't bother running promotion when releasing locks
+ * (used by __lock_put_internal).
+ * DB_LOCK_UNLINK: Remove from the locker links (used in checklocker).
+ * Make sure that these do not conflict with the interface flags because
+ * we pass some of those around.
+ */
+#define DB_LOCK_DOALL 0x010000
+#define DB_LOCK_FREE 0x040000
+#define DB_LOCK_NOPROMOTE 0x080000
+#define DB_LOCK_UNLINK 0x100000
+#define DB_LOCK_ONEWAITER 0x400000
+
+/*
+ * Macros to get/release different types of mutexes.
+ */
+/*
+ * Operations on lock objects must be protected by a mutex, either on their
+ * partition or on the lock region. Lock structures associated with that
+ * object are protected as well. Each partition has a free list of objects
+ * and lock structures protected by that mutex. We want to avoid getting
+ * multiple mutexes, particularly in __lock_vec, when there is only a
+ * single partition. If there is only one partition, then all the calls
+ * to LOCK_SYSTEM_LOCK(UNLOCK) actually acquire(release) a lock system
+ * wide mutex and MUTEX_LOCK(UNLOCK)_PARTITION are no-ops. If the number
+ * of partitions is greater than one, then LOCK_SYSTEM_LOCK(UNLOCK) is a
+ * no-op, and MUTEX_LOCK(UNLOCK)_PARTITION acquire a mutex on a particular
+ * partition of the lock table.
+ */
+#define LOCK_SYSTEM_LOCK(lt, reg) do { \
+ if ((reg)->part_t_size == 1) \
+ MUTEX_LOCK((lt)->env, (reg)->mtx_region); \
+} while (0)
+#define LOCK_SYSTEM_UNLOCK(lt, reg) do { \
+ if ((reg)->part_t_size == 1) \
+ MUTEX_UNLOCK((lt)->env, (reg)->mtx_region); \
+} while (0)
+#define MUTEX_LOCK_PARTITION(lt, reg, p) do { \
+ if ((reg)->part_t_size != 1) \
+ MUTEX_LOCK((lt)->env, (lt)->part_array[p].mtx_part); \
+} while (0)
+#define MUTEX_UNLOCK_PARTITION(lt, reg, p) do { \
+ if ((reg)->part_t_size != 1) \
+ MUTEX_UNLOCK((lt)->env, (lt)->part_array[p].mtx_part); \
+} while (0)
+
+#define OBJECT_LOCK(lt, reg, obj, ndx) do { \
+ ndx = __lock_ohash(obj) % (reg)->object_t_size; \
+ MUTEX_LOCK_PARTITION(lt, reg, LOCK_PART(reg, ndx)); \
+} while (0)
+
+#define OBJECT_LOCK_NDX(lt, reg, ndx) \
+ MUTEX_LOCK_PARTITION(lt, reg, LOCK_PART(reg, ndx));
+
+#define OBJECT_UNLOCK(lt, reg, ndx) \
+ MUTEX_UNLOCK_PARTITION(lt, reg, LOCK_PART(reg, ndx));
+
+/*
+ * Protect the object deadlock detector queue and the locker allocation
+ * and active queues
+ */
+#define LOCK_DD(env, region) \
+ MUTEX_LOCK(env, (region)->mtx_dd)
+#define UNLOCK_DD(env, region) \
+ MUTEX_UNLOCK(env, (region)->mtx_dd)
+#define LOCK_LOCKERS(env, region) \
+ MUTEX_LOCK(env, (region)->mtx_lockers)
+#define UNLOCK_LOCKERS(env, region) \
+ MUTEX_UNLOCK(env, (region)->mtx_lockers)
+
+/*
+ * __lock_locker_hash --
+ * Hash function for entering lockers into the locker hash table.
+ * Since these are simply 32-bit unsigned integers at the moment,
+ * just return the locker value.
+ */
+#define __lock_locker_hash(locker) (locker)
+#define LOCKER_HASH(lt, reg, locker, ndx) \
+ ndx = __lock_locker_hash(locker) % (reg)->locker_t_size;
+
+#if defined(__cplusplus)
+}
+#endif
+
+#include "dbinc_auto/lock_ext.h"
+#endif /* !_DB_LOCK_H_ */
diff --git a/src/dbinc/log.h b/src/dbinc/log.h
new file mode 100644
index 00000000..c4dea6fc
--- /dev/null
+++ b/src/dbinc/log.h
@@ -0,0 +1,463 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_LOG_H_
+#define _DB_LOG_H_
+
+#include "dbinc/db_swap.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*******************************************************
+ * DBREG:
+ * The DB file register code keeps track of open files. It's stored
+ * in the log subsystem's shared region, and so appears in the log.h
+ * header file, but is logically separate.
+ * The dbp may not be open if we are recovering the abort of a create.
+ *******************************************************/
+/*
+ * The per-process table that maps log file-id's to DB structures.
+ */
+typedef struct __db_entry {
+ DB *dbp; /* Open dbp for this file id. */
+ int deleted; /* File was not found during open. */
+} DB_ENTRY;
+
+/*
+ * FNAME --
+ * File name and id.
+ */
+struct __fname {
+ SH_TAILQ_ENTRY q; /* File name queue. */
+
+ pid_t pid; /* Process that owns this. */
+ int32_t id; /* Logging file id. */
+ int32_t old_id; /* Saved logging file id. */
+ DBTYPE s_type; /* Saved DB type. */
+
+ roff_t fname_off; /* File name offset. */
+ roff_t dname_off; /* Database name offset. */
+ db_pgno_t meta_pgno; /* Page number of the meta page. */
+ u_int8_t ufid[DB_FILE_ID_LEN]; /* Unique file id. */
+
+ u_int32_t create_txnid; /*
+ * Txn ID of the DB create, stored so
+ * we can log it at register time.
+ */
+ db_mutex_t mutex; /* mutex from db handle. */
+ /* Number of txns referencing, plus 1 for the db handle. */
+ u_int32_t txn_ref;
+
+#define DB_FNAME_CLOSED 0x01 /* DBP was closed. */
+#define DB_FNAME_DURABLE 0x02 /* File is durable. */
+#define DB_FNAME_INMEM 0x04 /* File is in memory. */
+#define DB_FNAME_NOTLOGGED 0x08 /* Log of close failed. */
+#define DB_FNAME_RECOVER 0x10 /* File was opened by recovery code. */
+#define DB_FNAME_RESTORED 0x20 /* File may be in restored txn. */
+#define DB_FNAME_DBREG_MASK 0xf000 /* These bits come from DBREG below. */
+ u_int32_t flags;
+};
+
+/* File open/close register log record opcodes. */
+#define DBREG_CHKPNT 1 /* Checkpoint: file name/id dump. */
+#define DBREG_CLOSE 2 /* File close. */
+#define DBREG_OPEN 3 /* File open. */
+#define DBREG_PREOPEN 4 /* Open in mpool only. */
+#define DBREG_RCLOSE 5 /* File close after recovery. */
+#define DBREG_REOPEN 6 /* Open for in-memory database. */
+#define DBREG_XCHKPNT 7 /* Checkpoint of exclusive file. */
+#define DBREG_XOPEN 8 /* File exclusive open. */
+#define DBREG_XREOPEN 9 /* File exclusive open in-memory. */
+
+/* These bits are logged so db_printlog can handle page data. */
+#define DBREG_OP_MASK 0xf /* Opcode mask */
+#define DBREG_BIGEND 0x1000 /* Db Big endian. */
+#define DBREG_CHKSUM 0x2000 /* Db is checksummed. */
+#define DBREG_ENCRYPT 0x4000 /* Db is encrypted. */
+#define DBREG_EXCL 0x8000 /* Db is exclusive. */
+
+/*******************************************************
+ * LOG:
+ * The log subsystem information.
+ *******************************************************/
+struct __hdr; typedef struct __hdr HDR;
+struct __log; typedef struct __log LOG;
+struct __log_persist; typedef struct __log_persist LOGP;
+
+#define LFPREFIX "log." /* Log file name prefix. */
+#define LFNAME "log.%010d" /* Log file name template. */
+#define LFNAME_V1 "log.%05d" /* Log file name template, rev 1. */
+#define IS_LOG_FILE(name) (strncmp(name, LFPREFIX, sizeof(LFPREFIX) - 1) == 0)
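+
+/* For example, LFNAME formats log file number 1 as "log.0000000001". */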
+
+#define LG_MAX_DEFAULT (10 * MEGABYTE) /* 10 MB. */
+#define LG_MAX_INMEM (256 * 1024) /* 256 KB. */
+#define LG_BSIZE_INMEM (1 * MEGABYTE) /* 1 MB. */
+
+/*
+ * Allocate a few bytes under a power-of-two value. BDB doesn't care if it's
+ * a power-of-two or not, and requesting slightly under a power-of-two allows
+ * stupid allocators to avoid wasting space.
+ */
+#define LG_BASE_REGION_SIZE (130000) /* 128KB - 1072B */
+#define LG_BSIZE_DEFAULT (32000) /* 32 KB - 768B */
+#define LG_CURSOR_BUF_SIZE (32000) /* 32 KB - 768B */
+
+/*
+ * DB_LOG
+ * Per-process log structure.
+ */
+struct __db_log {
+ /*
+ * These fields need to be protected for multi-threaded support.
+ */
+ db_mutex_t mtx_dbreg; /* Mutex for thread protection. */
+
+ DB_ENTRY *dbentry; /* Recovery file-id mapping. */
+#define DB_GROW_SIZE 64
+ int32_t dbentry_cnt; /* Entries. Grows by DB_GROW_SIZE. */
+
+ /*
+ * These fields are only accessed when the region lock is held, so
+ * they do not have to be protected by the thread lock as well.
+ */
+ u_int32_t lfname; /* Log file "name". */
+ DB_FH *lfhp; /* Log file handle. */
+ time_t lf_timestamp; /* Log file timestamp. */
+
+ u_int8_t *bufp; /* Region buffer. */
+
+ /* These fields are not thread protected. */
+ ENV *env; /* Environment */
+ REGINFO reginfo; /* Region information. */
+
+#define DBLOG_AUTOREMOVE 0x01 /* Autoremove log files. */
+#define DBLOG_DIRECT 0x02 /* Do direct I/O on the log. */
+#define DBLOG_DSYNC 0x04 /* Set OS_DSYNC on the log. */
+#define DBLOG_FORCE_OPEN 0x08 /* Force the DB open even if it appears
+ * to be deleted. */
+#define DBLOG_INMEMORY 0x10 /* Logging is in memory. */
+#define DBLOG_OPENFILES 0x20 /* Prepared files need to be open. */
+#define DBLOG_RECOVER 0x40 /* We are in recovery. */
+#define DBLOG_ZERO 0x80 /* Zero fill the log. */
+#define DBLOG_VERIFYING 0x100 /* The log is being verified. */
+ u_int32_t flags;
+};
+
+/*
+ * HDR --
+ * Log record header.
+ */
+struct __hdr {
+ u_int32_t prev; /* Previous offset. */
+ u_int32_t len; /* Current length. */
+ u_int8_t chksum[DB_MAC_KEY]; /* Current checksum. */
+ u_int8_t iv[DB_IV_BYTES]; /* IV */
+ u_int32_t orig_size; /* Original size of log record */
+ /* !!! - 'size' is not written to log, must be last in hdr */
+ size_t size; /* Size of header to use */
+};
+
+/*
+ * LOG_HDR_SUM -- XOR in prev and len
+ * This helps avoid the race of misreading the log while it
+ * is being updated.
+ */
+#define LOG_HDR_SUM(crypto, hdr, sum) do { \
+ if (crypto) { \
+ ((u_int32_t *)sum)[0] ^= ((HDR *)hdr)->prev; \
+ ((u_int32_t *)sum)[1] ^= ((HDR *)hdr)->len; \
+ } else { \
+ ((u_int32_t *)sum)[0] ^= \
+ ((HDR *)hdr)->prev ^ ((HDR *)hdr)->len; \
+ } \
+} while (0)
+
+/*
+ * We use HDR internally, and then when we write out, we write out
+ * prev, len, and then a 4-byte checksum if normal operation or
+ * a crypto-checksum and IV and original size if running in crypto
+ * mode. We must store the original size in case we pad. Set the
+ * size when we set up the header. We compute a DB_MAC_KEY size
+ * checksum regardless, but we can safely just use the first 4 bytes.
+ */
+#define HDR_NORMAL_SZ 12
+#define HDR_CRYPTO_SZ (12 + DB_MAC_KEY + DB_IV_BYTES)
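+
+/*
+ * For illustration: with the usual DB_MAC_KEY of 20 bytes and
+ * DB_IV_BYTES of 16, HDR_CRYPTO_SZ comes to 48 bytes versus
+ * HDR_NORMAL_SZ's 12.
+ */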
+
+struct __log_persist {
+ u_int32_t magic; /* DB_LOGMAGIC */
+ u_int32_t version; /* DB_LOGVERSION */
+
+ u_int32_t log_size; /* Log file size. */
+ u_int32_t notused; /* Historically the log file mode. */
+};
+
+/* Macros to lock/unlock the log region as a whole. */
+#define LOG_SYSTEM_LOCK(env) \
+ MUTEX_LOCK(env, ((LOG *) \
+ (env)->lg_handle->reginfo.primary)->mtx_region)
+#define LOG_SYSTEM_UNLOCK(env) \
+ MUTEX_UNLOCK(env, ((LOG *) \
+ (env)->lg_handle->reginfo.primary)->mtx_region)
+
+/*
+ * LOG --
+ * Shared log region. One of these is allocated in shared memory,
+ * and describes the log.
+ */
+struct __log { /* SHARED */
+ db_mutex_t mtx_region; /* Region mutex. */
+
+ db_mutex_t mtx_filelist; /* Mutex guarding file name list. */
+
+ LOGP persist; /* Persistent information. */
+
+ SH_TAILQ_HEAD(__fq1) fq; /* List of file names. */
+ int32_t fid_max; /* Max fid allocated. */
+ roff_t free_fid_stack; /* Stack of free file ids. */
+ u_int32_t free_fids; /* Height of free fid stack. */
+ u_int32_t free_fids_alloced; /* N free fid slots allocated. */
+
+ /*
+ * The lsn LSN is the file offset that we're about to write and which
+ * we will return to the user.
+ */
+ DB_LSN lsn; /* LSN at current file offset. */
+
+ /*
+ * The f_lsn LSN is the LSN (returned to the user) that "owns" the
+ * first byte of the buffer. If the record associated with the LSN
+ * spans buffers, it may not reflect the physical file location of
+ * the first byte of the buffer.
+ */
+ DB_LSN f_lsn; /* LSN of first byte in the buffer. */
+ db_size_t b_off; /* Current offset in the buffer. */
+ u_int32_t w_off; /* Current write offset in the file. */
+ u_int32_t len; /* Length of the last record. */
+
+ DB_LSN active_lsn; /* Oldest active LSN in the buffer. */
+ db_size_t a_off; /* Offset in the buffer of first active
+ file. */
+
+ /*
+ * The s_lsn LSN is the last LSN that we know is on disk, not just
+ * written, but synced. This field is protected by the flush mutex
+ * rather than by the region mutex.
+ */
+ db_mutex_t mtx_flush; /* Mutex guarding flushing. */
+ int32_t in_flush; /* Log flush in progress. */
+ DB_LSN s_lsn; /* LSN of the last sync. */
+
+ DB_LOG_STAT stat; /* Log statistics. */
+
+ /*
+ * This timestamp is updated anytime someone unlinks log
+ * files. This can happen when calling __log_vtruncate
+ * or replication internal init when it unlinks log files.
+ *
+ * The timestamp is used so that other processes that might
+ * have file handles to log files know to close/reopen them
+ * so they're not potentially writing to now-removed files.
+ */
+ time_t timestamp; /* Log trunc timestamp. */
+
+ /*
+ * !!!
+ * NOTE: the next group of fields are NOT protected by the log
+ * region lock. They are protected by REP->mtx_clientdb. If you
+ * need access to both, you must acquire REP->mtx_clientdb
+ * before acquiring the log region lock.
+ *
+ * The waiting_lsn is used by the replication system. It is the
+ * first LSN that we are holding without putting in the log, because
+ * we received one or more log records out of order. Associated with
+ * the waiting_lsn is the number of log records that we still have to
+ * receive before we decide that we should request it again.
+ *
+ * The max_wait_lsn is used to control retransmission in the face
+ * of dropped messages. If we are requesting all records from the
+ * current gap (i.e., chunk of the log that we are missing), then
+ * the max_wait_lsn contains the first LSN that we are known to have
+ * in the __db.rep.db. If we requested only a single record, then
+ * the max_wait_lsn has the LSN of that record we requested.
+ */
+ /* BEGIN fields protected by rep->mtx_clientdb. */
+ DB_LSN waiting_lsn; /* First log record after a gap. */
+ DB_LSN verify_lsn; /* LSN we are waiting to verify. */
+ DB_LSN prev_ckp; /* LSN of ckp preceding verify_lsn. */
+ DB_LSN max_wait_lsn; /* Maximum LSN requested. */
+ DB_LSN max_perm_lsn; /* Maximum PERMANENT LSN processed. */
+ db_timespec max_lease_ts; /* Maximum Lease timestamp seen. */
+ db_timespec wait_ts; /* Time to wait before requesting. */
+ db_timespec rcvd_ts; /* Initial received time to wait. */
+ db_timespec last_ts; /* Last time of insert in temp db. */
+ /*
+ * The ready_lsn is also used by the replication system. It is the
+ * next LSN we expect to receive. It's normally equal to "lsn",
+ * except at the beginning of a log file, at which point it's set
+ * to the LSN of the first record of the new file (after the
+ * header), rather than to 0.
+ */
+ DB_LSN ready_lsn;
+ /*
+ * The bulk_buf is used by replication for bulk transfer. While this
+ * is protected by REP->mtx_clientdb, this doesn't contend with the
+ * above fields because the above are used by clients and the bulk
+ * fields below are used by a master.
+ */
+ roff_t bulk_buf; /* Bulk transfer buffer in region. */
+ roff_t bulk_off; /* Current offset into bulk buffer. */
+ u_int32_t bulk_len; /* Length of buffer. */
+ u_int32_t bulk_flags; /* Bulk buffer flags. */
+ /* END fields protected by rep->mtx_clientdb. */
+
+ /*
+ * During initialization, the log system walks forward through the
+ * last log file to find its end. If it runs into a checkpoint
+ * while it's doing so, it caches it here so that the transaction
+ * system doesn't need to walk through the file again on its
+ * initialization.
+ */
+ DB_LSN cached_ckp_lsn;
+
+ u_int32_t regionmax; /* Configured size of the region. */
+
+ roff_t buffer_off; /* Log buffer offset in the region. */
+ u_int32_t buffer_size; /* Log buffer size. */
+
+ u_int32_t log_size; /* Log file's size. */
+ u_int32_t log_nsize; /* Next log file's size. */
+
+ int filemode; /* Log file permissions mode. */
+
+ /*
+ * DB_LOG_AUTOREMOVE and DB_LOG_INMEMORY: not protected by a mutex,
+ * all we care about is if they're zero or non-zero.
+ */
+ int32_t db_log_autoremove;
+ int32_t db_log_inmemory;
+
+ u_int32_t ncommit; /* Number of txns waiting to commit. */
+	DB_LSN t_lsn; /* LSN of first commit. */
+ SH_TAILQ_HEAD(__commit) commits;/* list of txns waiting to commit. */
+ SH_TAILQ_HEAD(__free) free_commits;/* free list of commit structs. */
+
+ /*
+ * In-memory logs maintain a list of the start positions of all log
+ * files currently active in the in-memory buffer. This is to make the
+ * lookup from LSN to log buffer offset efficient.
+ */
+ SH_TAILQ_HEAD(__logfile) logfiles;
+ SH_TAILQ_HEAD(__free_logfile) free_logfiles;
+};
+
+/*
+ * __db_commit structure --
+ * One of these is allocated for each transaction waiting to commit.
+ */
+struct __db_commit {
+ db_mutex_t mtx_txnwait; /* Mutex for txn to wait on. */
+ DB_LSN lsn; /* LSN of commit record. */
+ SH_TAILQ_ENTRY links; /* Either on free or waiting list. */
+
+#define DB_COMMIT_FLUSH 0x0001 /* Flush the log when you wake up. */
+ u_int32_t flags;
+};
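+
+/*
+ * Group-commit sketch (illustrative only, not the actual txn code):
+ * a committing thread queues one of these on LOG->commits and sleeps
+ * on its mutex until the thread flushing the log wakes it:
+ *
+ *	commit->lsn = my_commit_lsn;
+ *	SH_TAILQ_INSERT_HEAD(&lp->commits, commit, links, __db_commit);
+ *	lp->ncommit++;
+ *	MUTEX_WAIT(env, commit->mtx_txnwait, timeout);
+ *	(the flusher walks lp->commits and wakes each waiter whose
+ *	LSN is now on disk)
+ */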
+
+/*
+ * Check for the proper progression of Log Sequence Numbers.
+ * If we are rolling forward the LSN on the page must be greater
+ * than or equal to the previous LSN in log record.
+ * We ignore NOT LOGGED LSNs: they mean the user did an unlogged
+ * update, and we should eventually see a log record that matches
+ * and can continue forward from there.
+ * A ZERO LSN implies a page that was allocated prior to the recovery
+ * start point and then truncated later in the log. An allocation of a
+ * page after this page will extend the file, leaving a hole. We want to
+ * ignore this page until it is truncated again.
+ */
+
+#define CHECK_LSN(e, redo, cmp, lsn, prev) \
+ if (DB_REDO(redo) && (cmp) < 0 && \
+ ((!IS_NOT_LOGGED_LSN(*(lsn)) && !IS_ZERO_LSN(*(lsn))) || \
+ IS_REP_CLIENT(e))) { \
+ ret = __db_check_lsn(e, lsn, prev); \
+ goto out; \
+ }
+#define CHECK_ABORT(e, redo, cmp, lsn, prev) \
+ if (redo == DB_TXN_ABORT && (cmp) != 0 && \
+ ((!IS_NOT_LOGGED_LSN(*(lsn)) && !IS_ZERO_LSN(*(lsn))) || \
+ IS_REP_CLIENT(e))) { \
+ ret = __db_check_lsn(e, lsn, prev); \
+ goto out; \
+ }
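+
+/*
+ * Typical use in a recovery function (a sketch; argp and pagep stand
+ * in for the usual unmarshalled log record and page variables):
+ *
+ *	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn);
+ *	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn);
+ *	if (cmp_p == 0 && DB_REDO(op)) {
+ *		(redo the operation, then roll the page LSN forward)
+ *	}
+ */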
+
+/*
+ * __db_filestart --
+ *	The start position of a log file within the in-memory log
+ *	buffer. One of these is kept on the LOG->logfiles list for
+ *	each log file currently held in the buffer.
+ */
+struct __db_filestart {
+ u_int32_t file;
+ size_t b_off;
+
+ SH_TAILQ_ENTRY links; /* Either on free or waiting list. */
+};
+
+/*
+ * RINGBUF_LEN --
+ *	The number of bytes in use between offsets "start" and "end" of
+ *	the circular in-memory log buffer, accounting for wrap-around.
+ */
+#define RINGBUF_LEN(lp, start, end) \
+	((start) < (end) ? \
+	    (end) - (start) : (lp)->buffer_size - ((start) - (end)))
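+
+/*
+ * For example, with a 1000-byte buffer, start == 900 and end == 100
+ * means the data has wrapped: RINGBUF_LEN yields 1000 - (900 - 100)
+ * == 200 bytes in use. With start == 100 and end == 900 it yields
+ * the unwrapped 800 bytes.
+ */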
+
+/*
+ * Internal macro used by the generated logging routines: set the
+ * pointer to the transaction family's begin_lsn unless begin_lsn
+ * has already been set, and always return a pointer to the
+ * transaction's last_lsn as well.
+ */
+#undef DB_SET_TXN_LSNP
+#define DB_SET_TXN_LSNP(txn, blsnp, llsnp) do { \
+ DB_LSN *__lsnp; \
+ TXN_DETAIL *__td; \
+ __td = (txn)->td; \
+ *(llsnp) = &__td->last_lsn; \
+ while (__td->parent != INVALID_ROFF) \
+ __td = R_ADDR(&(txn)->mgrp->reginfo, __td->parent); \
+ __lsnp = &__td->begin_lsn; \
+ if (IS_ZERO_LSN(*__lsnp)) \
+ *(blsnp) = __lsnp; \
+} while (0)
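+
+/*
+ * Usage sketch (as in the generated __*_log functions; the variable
+ * names here are illustrative):
+ *
+ *	DB_LSN *begin_lsnp, *last_lsnp;
+ *	begin_lsnp = NULL;
+ *	DB_SET_TXN_LSNP(txn, &begin_lsnp, &last_lsnp);
+ *	(begin_lsnp is non-NULL only if the family's begin_lsn still
+ *	needs to be filled in; last_lsnp always points at last_lsn)
+ */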
+
+/*
+ * Status codes indicating the validity of a log file examined by
+ * __log_valid().
+ */
+typedef enum {
+ DB_LV_INCOMPLETE,
+ DB_LV_NONEXISTENT,
+ DB_LV_NORMAL,
+ DB_LV_OLD_READABLE,
+ DB_LV_OLD_UNREADABLE
+} logfile_validity;
+
+/*
+ * All log records have these fields.
+ */
+typedef struct __log_rec_hdr {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+} LOG_REC_HEADER;
+
+#if defined(__cplusplus)
+}
+#endif
+
+#include "dbinc_auto/log_ext.h"
+#include "dbinc_auto/dbreg_auto.h"
+#include "dbinc_auto/dbreg_ext.h"
+#endif /* !_DB_LOG_H_ */
diff --git a/src/dbinc/log_verify.h b/src/dbinc/log_verify.h
new file mode 100644
index 00000000..fa90ace4
--- /dev/null
+++ b/src/dbinc/log_verify.h
@@ -0,0 +1,207 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+#ifndef _DB_LOG_VERIFY_H_
+#define _DB_LOG_VERIFY_H_
+
+#include "db_config.h"
+#include "db_int.h"
+
+/*
+ * Log verification handle; one is shared among all verification
+ * functions during a single verification run.
+ */
+struct __db_log_verify_info {
+ DB_ENV *dbenv; /* The database environment. */
+ DB *txninfo; /* (txnid, __txn_verify_info) map. */
+ DB *ckps; /* (ckp lrid, __ckpinfo) map. */
+ DB *fileregs; /* (file-uid, __file_reg_info) map. */
+ DB *fnameuid; /* (fname, fuid), secondary db of fileregs. */
+ /* (dbreg-id, __file_reg_info) map, NOT the sec db for fileregs. */
+ DB *dbregids;
+ DB *pgtxn; /* (fileid-pageno, txnid) map. */
+ DB *txnpg; /* (txnid, fileid-pageno), sec db of pgtxn. */
+ /* lsn, (time-stamp, logtype(txn_regop or txn_ckp)) map. */
+ DB *lsntime;
+ /* Secondary db of lsntime, use timestamp as secindex. */
+ DB *timelsn;
+
+ /* Time range database, (u_int32_t, __lv_txnrange) db. */
+ DB *txnrngs;
+ /* Store abort txn (lsn, txnid) map. */
+ DB *txnaborts;
+ DB_LSN last_lsn; /* Lsn of last log record we verified. */
+ /* The number of active, abort, commit and prepared txns. */
+ u_int32_t ntxn_active, ntxn_abort, ntxn_commit, ntxn_prep;
+ u_int32_t nckp; /* The number of checkpoint log records. */
+	/*
+	 * Target database file unique id. Set when verifying only the
+	 * log records of a single database.
+	 */
+ u_int8_t target_dbid[DB_FILE_ID_LEN];
+	u_int32_t non_txnup_cnt;/* Number of non-transactional log records. */
+	u_int32_t unknown_logrec_cnt;/* Number of unknown log records. */
+	u_int32_t external_logrec_cnt;/* Number of external log records. */
+	/*
+	 * (Log type, record count) map. Type ids are contiguous
+	 * integers, so an array of 256 slots is large enough.
+	 */
+ u_int32_t lrtypes[256];
+ u_int32_t aborted_txnid;/* The last aborted txnid. */
+ DB_LSN aborted_txnlsn; /* Last aborted txn's last log. */
+	DB_LSN valid_lsn; /* Unset DB_LOG_VERIFY_PARTIAL when we reach this LSN. */
+	char *logtype_names[256];/* The type name string of each log type. */
+ const DB_LOG_VERIFY_CONFIG *lv_config;
+ DB_THREAD_INFO *ip;
+ u_int32_t flags; /* The result of the verification. */
+};
+
+/* Transaction information. */
+struct __txn_verify_info {
+#define TXN_VERIFY_INFO_FIXSIZE (4 * sizeof(DB_LSN) + 9 * sizeof(u_int32_t))
+#define TXN_VERIFY_INFO_TOTSIZE(s) \
+ (TXN_VERIFY_INFO_FIXSIZE + (s).num_recycle * sizeof(DB_LSN) + \
+ __lv_dbt_arrsz((s).fileups, (s).filenum) + \
+ sizeof(int32_t) * (s).filenum)
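+
+/*
+ * For example (a sketch; __lv_dbt_arrsz is the helper summing the
+ * marshalled sizes of a DBT array): a txn with num_recycle == 2 and
+ * filenum == 3 occupies
+ *
+ *	TXN_VERIFY_INFO_FIXSIZE + 2 * sizeof(DB_LSN) +
+ *	    __lv_dbt_arrsz((s).fileups, 3) + 3 * sizeof(int32_t)
+ *
+ * bytes when marshalled into the txninfo database.
+ */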
+
+ u_int32_t txnid; /* The key, also stored in data here. */
+ u_int32_t ptxnid; /* The parent txn id. */
+
+ DB_LSN first_lsn; /* Lsn of the first log record of this txn. */
+ DB_LSN last_lsn; /* Last lsn of the txn. */
+ DB_LSN prep_lsn; /* txn_prepare's lsn.*/
+ DB_LSN cur_lsn; /* The lsn of the latest db op of this txn. */
+
+ u_int32_t num_recycle; /* The number of recycle lsns. */
+ u_int32_t filenum; /* The number of files updated. */
+
+#define TXN_STAT_ACTIVE 0
+#define TXN_STAT_ABORT 1
+#define TXN_STAT_COMMIT 2
+#define TXN_STAT_PREPARE 3
+ u_int32_t status; /* Txn status */
+
+ /* The number of active, abort and commit children. */
+ u_int32_t nchild_active;
+ u_int32_t nchild_abort;
+ u_int32_t nchild_commit;
+
+ u_int32_t flags; /* Copied from the DB_TXN::flags member. */
+
+ DB_LSN *recycle_lsns; /* The array of txn_recycle records' lsns. */
+ /* The array of file unique ids of files updated by this txn. */
+ DBT *fileups;
+ int32_t *dbregid;/* The array of dbreg file ids updated by this txn. */
+};
+
+/* Database file information. */
+struct __lv_filereg_info {
+#define FILE_REG_INFO_FIXSIZE (sizeof(u_int32_t))
+#define FILE_REG_INFO_TOTSIZE(s) (FILE_REG_INFO_FIXSIZE + (s).fileid.size + \
+ sizeof((s).fileid.size) + sizeof(int32_t) * (s).regcnt + \
+ strlen((s).fname) + 1)
+
+ u_int32_t regcnt; /* The number of dbregids for this file-uid. */
+ int32_t *dbregids;
+ DBT fileid; /* The file unique id. */
+ char *fname; /* Database file name. */
+};
+
+/* Database file dbreg_register information. */
+struct __lv_filelife {
+ int32_t dbregid; /* The primary key. */
+ DBTYPE dbtype; /* The database type. */
+ u_int32_t lifetime; /* DBREG_CHKPNT, DBREG_CLOSE, DBREG_OPEN, DBREG_XCHKPNT, DBREG_XOPEN */
+	db_pgno_t meta_pgno; /* The meta page number. */
+ u_int8_t fileid[DB_FILE_ID_LEN];
+ DB_LSN lsn; /* The lsn of log updating lifetime. */
+};
+
+/* Checkpoint information. */
+struct __lv_ckp_info {
+ int32_t timestamp;
+ DB_LSN lsn, ckplsn; /* Lsn member is the primary key. */
+};
+
+/*
+ * General information from log records which have timestamps.
+ * We use it to do time range verifications. Such information is
+ * acquired when backward-playing the logs before verification.
+ */
+struct __lv_timestamp_info {
+ DB_LSN lsn; /* The primary key. */
+ int32_t timestamp; /* The secondary key. */
+
+ /*
+ * The log types containing a time stamp, so far only txn_ckp
+ * and txn_regop types.
+ */
+ u_int32_t logtype;
+};
+
+/*
+ * Transaction ranges. Such information is acquired when backward-playing the
+ * logs before verification. Can be used to find aborted txns.
+ */
+struct __lv_txnrange {
+ /*
+	/*
+	 * Transaction ID, the primary key. The db storing records of this
+	 * type must allow duplicates, since txnids may be reused.
+	 */
+ u_int32_t txnid;
+
+ /*
+ * The parent txn id, ptxnid is the parent of txnid
+ * during [begin, end].
+ */
+ u_int32_t ptxnid;
+
+	/*
+	 * The first and last lsn. The end lsn is used to sort duplicate
+	 * data items because it is seen before begin in a backward
+	 * playback, and [begin, end] intervals never overlap.
+	 */
+ */
+ DB_LSN begin, end;
+
+ int32_t when_commit;/* The time of the commit, 0 if aborted. */
+};
+
+/* Parameter types for __iterate_txninfo function. */
+struct __add_recycle_params {
+ u_int32_t min, max;/* The recycled txnid range. */
+ /* The array of txn info to update into db. */
+ VRFY_TXN_INFO **ti2u;
+ u_int32_t ti2ui, ti2ul;/* The effective length and array length. */
+ DB_LSN recycle_lsn;
+};
+
+struct __ckp_verify_params {
+ DB_LSN lsn, ckp_lsn;
+ ENV *env;
+};
+
+/* Helper macros. */
+#define LOGTYPE_NAME(lvh, type) (lvh->logtype_names[type] == NULL ? \
+ NULL : lvh->logtype_names[type] + 3)
+#define NUMCMP(i1, i2) ((i1) > (i2) ? 1 : ((i1) < (i2) ? -1 : 0))
+
+#define INVAL_DBREGID -1
+
+/*
+ * During recovery, DBREG_CHKPNT and DBREG_XCHKPNT can be treated as
+ * open operations; each is eventually followed by a DBREG_RCLOSE or
+ * DBREG_CLOSE.
+ */
+#define IS_DBREG_OPEN(opcode) (opcode == DBREG_OPEN || opcode == \
+ DBREG_PREOPEN || opcode == DBREG_REOPEN || opcode == DBREG_CHKPNT \
+ || opcode == DBREG_XCHKPNT || opcode == DBREG_XOPEN || \
+ opcode == DBREG_XREOPEN)
+#define IS_DBREG_CLOSE(opcode) (opcode == DBREG_CLOSE || opcode == DBREG_RCLOSE)
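+
+/*
+ * Sketch of how the predicates above are used when scanning
+ * dbreg_register log records (argp names the unmarshalled record):
+ *
+ *	if (IS_DBREG_OPEN(argp->opcode))
+ *		(the dbreg id maps to an open file from here on)
+ *	else if (IS_DBREG_CLOSE(argp->opcode))
+ *		(the file's lifetime ends at this record)
+ */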
+
+#define IS_LOG_VRFY_SUPPORTED(version) ((version) == DB_LOGVERSION)
+
+#endif /* !_DB_LOG_VERIFY_H_ */
diff --git a/src/dbinc/mp.h b/src/dbinc/mp.h
new file mode 100644
index 00000000..9a10c6d9
--- /dev/null
+++ b/src/dbinc/mp.h
@@ -0,0 +1,700 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_MP_H_
+#define _DB_MP_H_
+
+#include "dbinc/atomic.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+struct __bh; typedef struct __bh BH;
+struct __bh_frozen_p; typedef struct __bh_frozen_p BH_FROZEN_PAGE;
+struct __bh_frozen_a; typedef struct __bh_frozen_a BH_FROZEN_ALLOC;
+struct __db_mpool_hash; typedef struct __db_mpool_hash DB_MPOOL_HASH;
+struct __db_mpool_fstat_int;
+typedef struct __db_mpool_fstat_int DB_MPOOL_FSTAT_INT;
+struct __db_mpreg; typedef struct __db_mpreg DB_MPREG;
+struct __mpool; typedef struct __mpool MPOOL;
+
+/* We require at least 20KB of cache. */
+#define DB_CACHESIZE_MIN (20 * 1024)
+
+/*
+ * DB_MPOOLFILE initialization methods cannot be called after open is
+ * called; other methods cannot be called before open is called.
+ */
+#define MPF_ILLEGAL_AFTER_OPEN(dbmfp, name) \
+ if (F_ISSET(dbmfp, MP_OPEN_CALLED)) \
+ return (__db_mi_open((dbmfp)->env, name, 1));
+#define MPF_ILLEGAL_BEFORE_OPEN(dbmfp, name) \
+ if (!F_ISSET(dbmfp, MP_OPEN_CALLED)) \
+ return (__db_mi_open((dbmfp)->env, name, 0));
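+
+/*
+ * For example, a DB_MPOOLFILE configuration method guards itself like
+ * this (a representative sketch of the pattern in mp_fmethod.c):
+ *
+ *	MPF_ILLEGAL_AFTER_OPEN(dbmfp, "DB_MPOOLFILE->set_clear_len");
+ *	dbmfp->clear_len = clear_len;
+ *	return (0);
+ */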
+
+/*
+ * Cache flush operations, plus modifiers.
+ */
+#define DB_SYNC_ALLOC 0x0001 /* Flush for allocation. */
+#define DB_SYNC_CACHE 0x0002 /* Flush entire cache. */
+#define DB_SYNC_CHECKPOINT 0x0004 /* Checkpoint. */
+#define DB_SYNC_FILE 0x0008 /* Flush file. */
+#define DB_SYNC_INTERRUPT_OK 0x0010 /* Allow interrupt and return OK. */
+#define DB_SYNC_QUEUE_EXTENT 0x0020 /* Flush a queue file with extents. */
+#define DB_SYNC_SUPPRESS_WRITE 0x0040 /* Ignore max-write configuration. */
+#define DB_SYNC_TRICKLE 0x0080 /* Trickle sync. */
+
+/*
+ * DB_MPOOL --
+ * Per-process memory pool structure.
+ */
+struct __db_mpool {
+ /* These fields need to be protected for multi-threaded support. */
+ db_mutex_t mutex; /* Thread mutex. */
+
+ /*
+ * DB_MPREG structure for the DB pgin/pgout routines.
+ *
+ * Linked list of application-specified pgin/pgout routines.
+ */
+ DB_MPREG *pg_inout;
+ LIST_HEAD(__db_mpregh, __db_mpreg) dbregq;
+
+ /* List of DB_MPOOLFILE's. */
+ TAILQ_HEAD(__db_mpoolfileh, __db_mpoolfile) dbmfq;
+
+ /*
+ * The env and reginfo fields are not thread protected, as they are
+ * initialized during mpool creation, and not modified again.
+ */
+ ENV *env; /* Enclosing environment. */
+ REGINFO *reginfo; /* Underlying cache regions. */
+};
+
+/*
+ * DB_MPREG --
+ * DB_MPOOL registry of pgin/pgout functions.
+ */
+struct __db_mpreg {
+ LIST_ENTRY(__db_mpreg) q; /* Linked list. */
+
+ int32_t ftype; /* File type. */
+ /* Pgin, pgout routines. */
+ int (*pgin) __P((DB_ENV *, db_pgno_t, void *, DBT *));
+ int (*pgout) __P((DB_ENV *, db_pgno_t, void *, DBT *));
+};
+
+/*
+ * File hashing --
+ *	We hash each file to a hash bucket based on its fileid
+ *	or, in the case of in-memory files, its name.
+ */
+
+/* Number of file hash buckets, a small prime number. */
+#define MPOOL_FILE_BUCKETS 17
+
+#define FHASH(id, len) __ham_func5(NULL, id, (u_int32_t)(len))
+
+#define FNBUCKET(id, len) \
+ (FHASH(id, len) % MPOOL_FILE_BUCKETS)
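+
+/*
+ * E.g., an on-disk file hashes by its unique file id and an in-memory
+ * file by its name (a sketch):
+ *
+ *	bucket = FNBUCKET(fileid, DB_FILE_ID_LEN);
+ *	bucket = FNBUCKET(path, strlen(path));
+ */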
+
+/* Macros to lock/unlock the mpool region as a whole. */
+#define MPOOL_SYSTEM_LOCK(env) \
+ MUTEX_LOCK(env, ((MPOOL *) \
+ (env)->mp_handle->reginfo[0].primary)->mtx_region)
+#define MPOOL_SYSTEM_UNLOCK(env) \
+ MUTEX_UNLOCK(env, ((MPOOL *) \
+ (env)->mp_handle->reginfo[0].primary)->mtx_region)
+
+/* Macros to lock/unlock a specific mpool region. */
+#define MPOOL_REGION_LOCK(env, infop) \
+ MUTEX_LOCK(env, ((MPOOL *)(infop)->primary)->mtx_region)
+#define MPOOL_REGION_UNLOCK(env, infop) \
+ MUTEX_UNLOCK(env, ((MPOOL *)(infop)->primary)->mtx_region)
+
+/*
+ * MPOOL --
+ * Shared memory pool region.
+ */
+struct __mpool { /* SHARED */
+	/*
+	 * The memory pool can be broken up into individual pieces/files.
+	 * There are two reasons for this: first, on Solaris you can
+	 * allocate only a little more than 2GB of memory in a contiguous
+	 * chunk, and I expect to see more systems with similar issues.
+	 * Second, applications can add/remove pieces to dynamically
+	 * resize the cache.
+	 *
+	 * While this structure is duplicated in each piece of the cache,
+	 * the first of these pieces/files describes the entire pool, and
+	 * each subsequent one describes only its own piece of the cache.
+	 */
+ db_mutex_t mtx_region; /* Region mutex. */
+ db_mutex_t mtx_resize; /* Resizing mutex. */
+
+ /*
+ * The lsn field and list of underlying MPOOLFILEs are thread protected
+ * by the region lock.
+ */
+ DB_LSN lsn; /* Maximum checkpoint LSN. */
+
+ /* Configuration information: protected by the region lock. */
+ u_int32_t max_nreg; /* Maximum number of regions. */
+ u_int32_t gbytes; /* Number of gigabytes in cache. */
+ u_int32_t bytes; /* Number of bytes in cache. */
+ u_int32_t pagesize; /* Default page size. */
+ db_size_t mp_mmapsize; /* Maximum file size for mmap. */
+ int32_t mp_maxopenfd; /* Maximum open file descriptors. */
+ int32_t mp_maxwrite; /* Maximum buffers to write. */
+ db_timeout_t mp_maxwrite_sleep; /* Sleep after writing max buffers. */
+
+ /*
+ * The number of regions and the total number of hash buckets across
+ * all regions.
+ * These fields are not protected by a mutex because we assume that we
+ * can read a 32-bit value atomically. They are only modified by cache
+ * resizing which holds the mpool resizing mutex to ensure that
+ * resizing is single-threaded. See the comment in mp_resize.c for
+ * more information.
+ */
+ u_int32_t nreg; /* Number of underlying REGIONS. */
+ u_int32_t nbuckets; /* Total number of hash buckets. */
+
+ /*
+	 * The regids field is protected by the resize mutex.
+ */
+ roff_t regids; /* Array of underlying REGION Ids. */
+
+ roff_t ftab; /* Hash table of files. */
+
+ /*
+ * The following fields describe the per-cache portion of the region.
+ *
+ * The htab and htab_buckets fields are not thread protected as they
+ * are initialized during mpool creation, and not modified again.
+ *
+ * The last_checked, lru_priority, and lru_generation fields are thread
+ * protected by the region lock.
+ */
+ roff_t htab; /* Hash table offset. */
+ u_int32_t htab_buckets; /* Number of hash table entries. */
+ u_int32_t last_checked; /* Last bucket checked for free. */
+ u_int32_t lru_priority; /* Priority counter for buffer LRU. */
+ u_int32_t lru_generation; /* Allocation race condition detector. */
+ u_int32_t htab_mutexes; /* Number of hash mutexes per region. */
+
+ /*
+ * The pages field keeps track of the number of pages in the cache
+ * and is protected by the region lock. It is accessed for reading
+ * without the lock to return statistics.
+ */
+ u_int32_t pages; /* Number of pages in the cache. */
+
+ /*
+ * The stat fields are not thread protected, and cannot be trusted.
+ */
+ DB_MPOOL_STAT stat; /* Per-cache mpool statistics. */
+
+ /*
+ * We track page puts so that we can decide when allocation is never
+ * going to succeed. We don't lock the field, all we care about is
+ * if it changes.
+ */
+ u_int32_t put_counter; /* Count of page put calls. */
+
+ /*
+ * Cache flush operations take a long time...
+ *
+ * Some cache flush operations want to ignore the app's configured
+ * max-write parameters (they are trying to quickly shut down an
+ * environment, for example). We can't specify that as an argument
+ * to the cache region functions, because we may decide to ignore
+ * the max-write configuration after the cache operation has begun.
+	 * If the DB_MEMP_SUPPRESS_WRITE flag is set in config_flags,
+	 * ignore the application's max-write configuration.
+ *
+ * We may want to interrupt cache flush operations in high-availability
+ * configurations.
+ */
+#define DB_MEMP_SUPPRESS_WRITE 0x01
+#define DB_MEMP_SYNC_INTERRUPT 0x02
+ u_int32_t config_flags;
+
+ /* Free frozen buffer headers, protected by the region lock. */
+ SH_TAILQ_HEAD(__free_frozen) free_frozen;
+
+ /* Allocated blocks of frozen buffer headers. */
+ SH_TAILQ_HEAD(__alloc_frozen) alloc_frozen;
+};
+
+/*
+ * NREGION --
+ * Select a cache region given the bucket number.
+ */
+#define NREGION(mp, bucket) \
+ ((bucket) / (mp)->htab_buckets)
+
+/*
+ * MP_HASH --
+ * We make the assumption that early pages of the file are more likely
+ * to be retrieved than the later pages, which means the top bits will
+ * be more interesting for hashing as they're less likely to collide.
+ * That said, since 512 8KB pages represent a 4MB file, only reasonably
+ * large files will have page numbers with any but the bottom 9
+ * bits set. We XOR in the MPOOL offset of the MPOOLFILE that backs the
+ * page, since that should also be unique for the page. We don't want
+ * to do anything very fancy -- speed is more important to us than using
+ * good hashing.
+ *
+ * Since moving to a dynamic hash, which boils down to using some of the
+ * least significant bits of the hash value, we no longer want to use a
+ * simple shift here, because it's likely with a bit shift that mf_offset
+ * will be ignored, and pages from different files end up in the same
+ * hash bucket. Use a nearby prime instead.
+ */
+#define MP_HASH(mf_offset, pgno) \
+ ((((pgno) << 8) ^ (pgno)) ^ (((u_int32_t) mf_offset) * 509))
+
+/*
+ * Inline the calculation of the mask, since we can't reliably store the mask
+ * with the number of buckets in the region.
+ *
+ * This is equivalent to:
+ * mask = (1 << __db_log2(nbuckets)) - 1;
+ */
+#define MP_MASK(nbuckets, mask) do { \
+ for (mask = 1; mask < (nbuckets); mask = (mask << 1) | 1) \
+ ; \
+} while (0)
+
+#define MP_HASH_BUCKET(hash, nbuckets, mask, bucket) do { \
+ (bucket) = (hash) & (mask); \
+ if ((bucket) >= (nbuckets)) \
+ (bucket) &= ((mask) >> 1); \
+} while (0)
+
+#define MP_BUCKET(mf_offset, pgno, nbuckets, bucket) do { \
+ u_int32_t __mask; \
+ MP_MASK(nbuckets, __mask); \
+ MP_HASH_BUCKET(MP_HASH(mf_offset, pgno), nbuckets, \
+ __mask, bucket); \
+} while (0)
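+
+/*
+ * Worked example (illustrative arithmetic only): with nbuckets == 37,
+ * MP_MASK computes mask == 63. If MP_HASH(mf_offset, pgno) == 100:
+ *
+ *	bucket = 100 & 63 == 36;	36 < 37, so it is kept.
+ *
+ * Were the hash 38, then 38 & 63 == 38 >= 37, so the bucket is folded
+ * with the smaller mask: 38 & (63 >> 1) == 38 & 31 == 6.
+ */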
+
+/*
+ * MP_GET_REGION --
+ * Select the region for a given page.
+ */
+#define MP_GET_REGION(dbmfp, pgno, infopp, ret) do { \
+ DB_MPOOL *__t_dbmp; \
+ MPOOL *__t_mp; \
+ \
+ __t_dbmp = dbmfp->env->mp_handle; \
+ __t_mp = __t_dbmp->reginfo[0].primary; \
+ if (__t_mp->max_nreg == 1) { \
+ *(infopp) = &__t_dbmp->reginfo[0]; \
+ } else \
+ ret = __memp_get_bucket((dbmfp)->env, \
+ (dbmfp)->mfp, (pgno), (infopp), NULL, NULL); \
+} while (0)
+
+/*
+ * MP_GET_BUCKET --
+ * Select and lock the bucket for a given page.
+ */
+#define MP_GET_BUCKET(env, mfp, pgno, infopp, hp, bucket, ret) do { \
+ DB_MPOOL *__t_dbmp; \
+ MPOOL *__t_mp; \
+ roff_t __t_mf_offset; \
+ \
+ __t_dbmp = (env)->mp_handle; \
+ __t_mp = __t_dbmp->reginfo[0].primary; \
+ if (__t_mp->max_nreg == 1) { \
+ *(infopp) = &__t_dbmp->reginfo[0]; \
+ __t_mf_offset = R_OFFSET(*(infopp), (mfp)); \
+ MP_BUCKET(__t_mf_offset, \
+ (pgno), __t_mp->nbuckets, bucket); \
+ (hp) = R_ADDR(*(infopp), __t_mp->htab); \
+ (hp) = &(hp)[bucket]; \
+ MUTEX_READLOCK(env, (hp)->mtx_hash); \
+ ret = 0; \
+ } else \
+ ret = __memp_get_bucket((env), \
+ (mfp), (pgno), (infopp), &(hp), &(bucket)); \
+} while (0)
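+
+/*
+ * Callers use the macro like this (a sketch; on success the hash
+ * bucket mutex is held read-locked and must be released):
+ *
+ *	MP_GET_BUCKET(env, mfp, pgno, &infop, hp, bucket, ret);
+ *	if (ret != 0)
+ *		return (ret);
+ *	(search hp->hash_bucket for the BH of pgno)
+ *	MUTEX_UNLOCK(env, hp->mtx_hash);
+ */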
+
+struct __db_mpool_hash {
+ db_mutex_t mtx_hash; /* Per-bucket mutex. */
+
+ DB_HASHTAB hash_bucket; /* Head of bucket. */
+
+ db_atomic_t hash_page_dirty;/* Count of dirty pages. */
+
+#ifndef __TEST_DB_NO_STATISTICS
+ u_int32_t hash_io_wait; /* Count of I/O waits. */
+ u_int32_t hash_frozen; /* Count of frozen buffers. */
+ u_int32_t hash_thawed; /* Count of thawed buffers. */
+ u_int32_t hash_frozen_freed;/* Count of freed frozen buffers. */
+#endif
+
+ DB_LSN old_reader; /* Oldest snapshot reader (cached). */
+
+ u_int32_t flags;
+};
+
+/*
+ * Mpool file statistics structure for use in shared memory.
+ * This structure must contain the same fields as the __db_mpool_fstat struct
+ * except for any pointer fields that are filled in only when the struct is
+ * being populated for output through the API.
+ */
+struct __db_mpool_fstat_int { /* SHARED */
+ u_int32_t st_pagesize; /* Page size. */
+#ifndef __TEST_DB_NO_STATISTICS
+ u_int32_t st_map; /* Pages from mapped files. */
+ uintmax_t st_cache_hit; /* Pages found in the cache. */
+ uintmax_t st_cache_miss; /* Pages not found in the cache. */
+ uintmax_t st_page_create; /* Pages created in the cache. */
+ uintmax_t st_page_in; /* Pages read in. */
+ uintmax_t st_page_out; /* Pages written out. */
+ uintmax_t st_backup_spins; /* Number of spins by a backup. */
+#endif
+};
+
+/*
+ * The base mpool priority is 1/4th of the name space, or just under 2^30. When
+ * the LRU priority counter is about to wrap (within a 128-entry 'red zone'
+ * area) we adjust everybody down so that no one is larger than the new LRU
+ * priority.
+ */
+#define MPOOL_LRU_MAX UINT32_MAX
+#define MPOOL_LRU_REDZONE (MPOOL_LRU_MAX - 128)
+#define MPOOL_LRU_BASE (MPOOL_LRU_MAX / 4)
+#define MPOOL_LRU_DECREMENT (MPOOL_LRU_MAX - MPOOL_LRU_BASE)
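+
+/*
+ * Worked example of the wrap handling (illustrative arithmetic):
+ * when lru_priority crosses MPOOL_LRU_REDZONE (UINT32_MAX - 128),
+ * every buffer's priority is reduced by MPOOL_LRU_DECREMENT, i.e. by
+ * UINT32_MAX - UINT32_MAX / 4, so the hottest buffers land at about
+ * MPOOL_LRU_BASE (just under 2^30) and aging can continue.
+ */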
+
+/*
+ * Mpool priorities from low to high. Defined in terms of fractions of the
+ * buffers in the pool.
+ */
+#define MPOOL_PRI_VERY_LOW -1 /* Dead duck. Check and set to 0. */
+#define MPOOL_PRI_LOW -2 /* Low. */
+#define MPOOL_PRI_DEFAULT 0 /* No adjustment -- special case.*/
+#define MPOOL_PRI_HIGH 10 /* With the dirty buffers. */
+#define MPOOL_PRI_DIRTY 10 /* Dirty gets a 10% boost. */
+#define MPOOL_PRI_VERY_HIGH 1 /* Add number of buffers in pool. */
+
+/*
+ * MPOOLFILE --
+ * Shared DB_MPOOLFILE information.
+ */
+struct __mpoolfile { /* SHARED */
+ db_mutex_t mutex; /* MPOOLFILE mutex. */
+
+#ifndef HAVE_ATOMICFILEREAD
+ /* Information to synchronize backups. */
+ u_int32_t backup_in_progress; /* Backup running. */
+ pid_t pid; /* Process doing backup. */
+ db_threadid_t tid; /* Thread doing backup. */
+ db_atomic_t writers; /* Number of current writers. */
+	db_mutex_t mtx_write; /* Block writers while updating. */
+ db_pgno_t low_pgno, high_pgno;/* Low and high backup range.*/
+#endif
+
+ /* Protected by MPOOLFILE mutex. */
+	u_int32_t revision; /* Bumped on any subdb movement. */
+ u_int32_t mpf_cnt; /* Ref count: DB_MPOOLFILEs. */
+ u_int32_t neutral_cnt; /* Ref count: refs that don't care about
+ * MVCC or DURABLE. That is, read-only
+ * or write behind references.
+ */
+ u_int32_t block_cnt; /* Ref count: blocks in cache. */
+ db_pgno_t last_pgno; /* Last page in the file. */
+ db_pgno_t last_flushed_pgno; /* Last page flushed to disk. */
+ db_pgno_t orig_last_pgno; /* Original last page in the file. */
+ db_pgno_t maxpgno; /* Maximum page number. */
+ u_int8_t excl_lockout; /* Internal exclusive db lockout. */
+
+ roff_t path_off; /* File name location. */
+
+ /* Protected by hash bucket mutex. */
+ SH_TAILQ_ENTRY q; /* List of MPOOLFILEs */
+
+ /*
+ * The following are used for file compaction processing.
+ * They are only used when a thread is in the process
+ * of trying to move free pages to the end of the file.
+ * Other threads may look here when freeing a page.
+ * Protected by a lock on the metapage.
+ */
+ u_int32_t free_ref; /* Refcount to freelist. */
+ u_int32_t free_cnt; /* Count of free pages. */
+ db_size_t free_size; /* Allocated size of free list. */
+ roff_t free_list; /* Offset to free list. */
+
+ /*
+ * We normally don't lock the deadfile field when we read it since we
+ * only care if the field is zero or non-zero. We do lock on read when
+ * searching for a matching MPOOLFILE -- see that code for more detail.
+ */
+ int32_t deadfile; /* Dirty pages can be discarded. */
+
+	u_int32_t bucket; /* Hash bucket for this file. */
+
+ /*
+ * None of the following fields are thread protected.
+ *
+ * There are potential races with the ftype field because it's read
+ * without holding a lock. However, it has to be set before adding
+ * any buffers to the cache that depend on it being set, so there
+ * would need to be incorrect operation ordering to have a problem.
+ */
+ int32_t ftype; /* File type. */
+
+ /*
+ * There are potential races with the priority field because it's read
+ * without holding a lock. However, a collision is unlikely and if it
+ * happens is of little consequence.
+ */
+ int32_t priority; /* Priority when unpinning buffer. */
+
+ /*
+ * There are potential races with the file_written field (many threads
+ * may be writing blocks at the same time), and with no_backing_file
+ * and unlink_on_close fields, as they may be set while other threads
+ * are reading them. However, we only care if the field value is zero
+ * or non-zero, so don't lock the memory.
+ *
+ * !!!
+ * Theoretically, a 64-bit architecture could put two of these fields
+ * in a single memory operation and we could race. I have never seen
+ * an architecture where that's a problem, and I believe Java requires
+ * that to never be the case.
+ *
+ * File_written is set whenever a buffer is marked dirty in the cache.
+ * It can be cleared in some cases, after all dirty buffers have been
+ * written AND the file has been flushed to disk.
+ */
+ int32_t file_written; /* File was written. */
+ int32_t no_backing_file; /* Never open a backing file. */
+ int32_t unlink_on_close; /* Unlink file on last close. */
+ db_atomic_t multiversion; /* Number of DB_MULTIVERSION handles. */
+
+ /*
+ * We do not protect the statistics in "stat" because of the cost of
+ * the mutex in the get/put routines. There is a chance that a count
+ * will get lost.
+ */
+ DB_MPOOL_FSTAT_INT stat; /* Per-file mpool statistics. */
+
+ /*
+ * The remaining fields are initialized at open and never subsequently
+ * modified.
+ */
+ int32_t lsn_off; /* Page's LSN offset. */
+ u_int32_t clear_len; /* Bytes to clear on page create. */
+
+ roff_t fileid_off; /* File ID string location. */
+
+ u_int32_t pagesize; /* Underlying pagesize. */
+ roff_t pgcookie_len; /* Pgin/pgout cookie length. */
+ roff_t pgcookie_off; /* Pgin/pgout cookie location. */
+
+ /*
+ * The flags are initialized at open and never subsequently modified.
+ */
+#define MP_CAN_MMAP 0x001 /* If the file can be mmap'd. */
+#define MP_DATABASE_LOCKING 0x002 /* Lock in exclusive mode. */
+#define MP_DIRECT 0x004 /* No OS buffering. */
+#define MP_DURABLE_UNKNOWN 0x008 /* We don't care about durability. */
+#define MP_EXTENT 0x010 /* Extent file. */
+#define MP_FAKE_DEADFILE 0x020 /* Deadfile field: fake flag. */
+#define MP_FAKE_FILEWRITTEN 0x040 /* File_written field: fake flag. */
+#define MP_FAKE_NB 0x080 /* No_backing_file field: fake flag. */
+#define MP_FAKE_UOC 0x100 /* Unlink_on_close field: fake flag. */
+#define MP_NOT_DURABLE 0x200 /* File is not durable. */
+#define MP_TEMP 0x400 /* Backing file is a temporary. */
+ u_int32_t flags;
+
+ db_pgno_t fe_watermark; /* File extension watermark. */
+ u_int32_t fe_txnid; /* Transaction that set watermark. */
+ u_int32_t fe_nlws; /* Number of log writes suppressed. */
+};
+
+/*
+ * Flags to __memp_bh_free.
+ */
+#define BH_FREE_FREEMEM 0x01
+#define BH_FREE_REUSE 0x02
+#define BH_FREE_UNLOCKED 0x04
+
+/*
+ * BH --
+ * Buffer header.
+ */
+struct __bh { /* SHARED */
+	db_mutex_t mtx_buf; /* Shared/exclusive mutex. */
+ db_atomic_t ref; /* Reference count. */
+#define BH_REFCOUNT(bhp) atomic_read(&(bhp)->ref)
+
+#define BH_CALLPGIN 0x001 /* Convert the page before use. */
+#define BH_DIRTY 0x002 /* Page is modified. */
+#define BH_DIRTY_CREATE 0x004 /* Page was created as dirty. */
+#define BH_DISCARD 0x008 /* Page is useless. */
+#define BH_EXCLUSIVE 0x010 /* Exclusive access acquired. */
+#define BH_FREED 0x020 /* Page was freed. */
+#define BH_FROZEN 0x040 /* Frozen buffer: allocate & re-read. */
+#define BH_TRASH 0x080 /* Page is garbage. */
+#define BH_THAWED 0x100 /* Page was thawed. */
+ u_int16_t flags;
+
+ u_int32_t priority; /* Priority. */
+ SH_TAILQ_ENTRY hq; /* MPOOL hash bucket queue. */
+
+ db_pgno_t pgno; /* Underlying MPOOLFILE page number. */
+ roff_t mf_offset; /* Associated MPOOLFILE offset. */
+ u_int32_t bucket; /* Hash bucket containing header. */
+ int region; /* Region containing header. */
+
+ roff_t td_off; /* MVCC: creating TXN_DETAIL offset. */
+ SH_CHAIN_ENTRY vc; /* MVCC: version chain. */
+#ifdef DIAG_MVCC
+ u_int16_t align_off; /* Alignment offset for diagnostics.*/
+#endif
+
+ /*
+ * !!!
+ * This array must be at least size_t aligned -- the DB access methods
+ * put PAGE and other structures into it, and then access them directly.
+ * (We guarantee size_t alignment to applications in the documentation,
+ * too.)
+ */
+ DB_ALIGN8 u_int8_t buf[1]; /* Variable length data. */
+};
+
+/*
+ * BH_FROZEN_PAGE --
+ * Data used to find a frozen buffer header.
+ */
+struct __bh_frozen_p {
+ BH header;
+ db_pgno_t spgno; /* Page number in freezer file. */
+};
+
+/*
+ * BH_FROZEN_ALLOC --
+ * Frozen buffer headers are allocated a page at a time in general. This
+ * structure is allocated at the beginning of the page so that the
+ * allocation chunks can be tracked and freed (for private environments).
+ */
+struct __bh_frozen_a {
+ SH_TAILQ_ENTRY links;
+};
+
+#define MULTIVERSION(dbp) atomic_read(&(dbp)->mpf->mfp->multiversion)
+
+#define PAGE_TO_BH(p) (BH *)((u_int8_t *)(p) - SSZA(BH, buf))
+#define IS_DIRTY(p) \
+ (F_ISSET(PAGE_TO_BH(p), BH_DIRTY|BH_EXCLUSIVE) == (BH_DIRTY|BH_EXCLUSIVE))
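+
+/*
+ * E.g., given the page pointer an access method got back from
+ * __memp_fget, the buffer header sits immediately before the page
+ * data (a sketch):
+ *
+ *	BH *bhp;
+ *	bhp = PAGE_TO_BH(pagep);	(pagep == bhp->buf)
+ *	if (F_ISSET(bhp, BH_DIRTY))
+ *		(the page has been modified in cache)
+ */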
+
+#define BH_OWNER(env, bhp) \
+ ((TXN_DETAIL *)R_ADDR(&env->tx_handle->reginfo, bhp->td_off))
+
+#define BH_OWNED_BY(env, bhp, txn) ((txn) != NULL && \
+ (bhp)->td_off != INVALID_ROFF && \
+ (txn)->td == BH_OWNER(env, bhp))
+
+#define VISIBLE_LSN(env, bhp) \
+ (&BH_OWNER(env, bhp)->visible_lsn)
+
+/*
+ * Make a copy of the buffer's visible LSN, one field at a time. We rely on the
+ * 32-bit operations being atomic. The visible_lsn starts at MAX_LSN and is
+ * set during commit or abort to the current LSN.
+ *
+ * If we race with a commit / abort, we may see either the file or the offset
+ * still at UINT32_MAX, so vlsn is guaranteed to be in the future. That's OK,
+ * since we had to take the log region lock to allocate the read LSN so we were
+ * never going to see this buffer anyway.
+ */
+#define BH_VISIBLE(env, bhp, read_lsnp, vlsn) \
+ (bhp->td_off == INVALID_ROFF || \
+ ((vlsn).file = VISIBLE_LSN(env, bhp)->file, \
+ (vlsn).offset = VISIBLE_LSN(env, bhp)->offset, \
+ LOG_COMPARE((read_lsnp), &(vlsn)) >= 0))
+
+#define BH_OBSOLETE(bhp, old_lsn, vlsn) (SH_CHAIN_HASNEXT(bhp, vc) ? \
+ BH_VISIBLE(env, SH_CHAIN_NEXTP(bhp, vc, __bh), &(old_lsn), vlsn) :\
+ BH_VISIBLE(env, bhp, &(old_lsn), vlsn))
+
+#define MVCC_SKIP_CURADJ(dbc, pgno) (dbc->txn != NULL && \
+ F_ISSET(dbc->txn, TXN_SNAPSHOT) && MULTIVERSION(dbc->dbp) && \
+ dbc->txn->td != NULL && __memp_skip_curadj(dbc, pgno))
+
+#if defined(DIAG_MVCC) && defined(HAVE_MPROTECT)
+#define VM_PAGESIZE 4096
+#define MVCC_BHSIZE(mfp, sz) do { \
+ sz += VM_PAGESIZE + sizeof(BH); \
+ if (mfp->pagesize < VM_PAGESIZE) \
+ sz += VM_PAGESIZE - mfp->pagesize; \
+} while (0)
+
+#define MVCC_BHALIGN(p) do { \
+ BH *__bhp; \
+ void *__orig = (p); \
+ p = ALIGNP_INC(p, VM_PAGESIZE); \
+ if ((u_int8_t *)p < (u_int8_t *)__orig + sizeof(BH)) \
+ p = (u_int8_t *)p + VM_PAGESIZE; \
+ __bhp = (BH *)((u_int8_t *)p - SSZA(BH, buf)); \
+ DB_ASSERT(env, \
+ ((uintptr_t)__bhp->buf & (VM_PAGESIZE - 1)) == 0); \
+ DB_ASSERT(env, \
+ (u_int8_t *)__bhp >= (u_int8_t *)__orig); \
+ DB_ASSERT(env, (u_int8_t *)p + mfp->pagesize < \
+ (u_int8_t *)__orig + len); \
+ __bhp->align_off = \
+ (u_int16_t)((u_int8_t *)__bhp - (u_int8_t *)__orig); \
+ p = __bhp; \
+} while (0)
+
+#define MVCC_BHUNALIGN(bhp) do { \
+ (bhp) = (BH *)((u_int8_t *)(bhp) - (bhp)->align_off); \
+} while (0)
+
+#ifdef linux
+#define MVCC_MPROTECT(buf, sz, mode) do { \
+ int __ret = mprotect((buf), (sz), (mode)); \
+ DB_ASSERT(env, __ret == 0); \
+} while (0)
+#else
+#define MVCC_MPROTECT(buf, sz, mode) do { \
+ if (!F_ISSET(env, ENV_PRIVATE | ENV_SYSTEM_MEM)) { \
+ int __ret = mprotect((buf), (sz), (mode)); \
+ DB_ASSERT(env, __ret == 0); \
+ } \
+} while (0)
+#endif /* linux */
+
+#else /* defined(DIAG_MVCC) && defined(HAVE_MPROTECT) */
+#define MVCC_BHSIZE(mfp, sz) do {} while (0)
+#define MVCC_BHALIGN(p) do {} while (0)
+#define MVCC_BHUNALIGN(bhp) do {} while (0)
+#define MVCC_MPROTECT(buf, size, mode) do {} while (0)
+#endif
+
+/*
+ * Flags to __memp_ftruncate.
+ */
+#define MP_TRUNC_NOCACHE 0x01
+#define MP_TRUNC_RECOVER 0x02
+
+#if defined(__cplusplus)
+}
+#endif
+
+#include "dbinc_auto/mp_ext.h"
+#endif /* !_DB_MP_H_ */
diff --git a/src/dbinc/mutex.h b/src/dbinc/mutex.h
new file mode 100644
index 00000000..b699142c
--- /dev/null
+++ b/src/dbinc/mutex.h
@@ -0,0 +1,305 @@
+/*
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_MUTEX_H_
+#define _DB_MUTEX_H_
+
+#ifdef HAVE_MUTEX_SUPPORT
+/* The inlined trylock calls need access to the details of mutexes. */
+#define LOAD_ACTUAL_MUTEX_CODE
+#include "dbinc/mutex_int.h"
+
+#ifndef HAVE_SHARED_LATCHES
+ #error "Shared latches are required in DB 4.8 and above"
+#endif
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * By default, spin 50 times per processor when failing to acquire a
+ * test-and-set mutex; we have anecdotal evidence that this is a
+ * reasonable value.
+ */
+#define MUTEX_SPINS_PER_PROCESSOR 50
+
+/*
+ * Mutexes are represented by unsigned, 32-bit integral values. As the
+ * OOB value is 0, mutexes can be initialized by zero-ing out the memory
+ * in which they reside.
+ */
+#define MUTEX_INVALID 0
+
+/*
+ * We track mutex allocations by ID.
+ */
+#define MTX_APPLICATION 1
+#define MTX_ATOMIC_EMULATION 2
+#define MTX_DB_HANDLE 3
+#define MTX_ENV_DBLIST 4
+#define MTX_ENV_EXCLDBLIST 5
+#define MTX_ENV_HANDLE 6
+#define MTX_ENV_REGION 7
+#define MTX_LOCK_REGION 8
+#define MTX_LOGICAL_LOCK 9
+#define MTX_LOG_FILENAME 10
+#define MTX_LOG_FLUSH 11
+#define MTX_LOG_HANDLE 12
+#define MTX_LOG_REGION 13
+#define MTX_MPOOLFILE_HANDLE 14
+#define MTX_MPOOL_BH 15
+#define MTX_MPOOL_FH 16
+#define MTX_MPOOL_FILE_BUCKET 17
+#define MTX_MPOOL_HANDLE 18
+#define MTX_MPOOL_HASH_BUCKET 19
+#define MTX_MPOOL_REGION 20
+#define MTX_MUTEX_REGION 21
+#define MTX_MUTEX_TEST 22
+#define MTX_REP_CHKPT 23
+#define MTX_REP_DATABASE 24
+#define MTX_REP_DIAG 25
+#define MTX_REP_EVENT 26
+#define MTX_REP_REGION 27
+#define MTX_REP_START 28
+#define MTX_REP_WAITER 29
+#define MTX_REPMGR 30
+#define MTX_SEQUENCE 31
+#define MTX_TWISTER 32
+#define MTX_TCL_EVENTS 33
+#define MTX_TXN_ACTIVE 34
+#define MTX_TXN_CHKPT 35
+#define MTX_TXN_COMMIT 36
+#define MTX_TXN_MVCC 37
+#define MTX_TXN_REGION 38
+
+#define MTX_MAX_ENTRY 38
+
+/* The following macros are defined on some platforms, e.g. QNX. */
+#undef __mutex_init
+#undef __mutex_lock
+#undef __mutex_timedlock
+#undef __mutex_unlock
+#undef __mutex_destroy
+#undef __mutex_trylock
+
+/* Redirect mutex calls to the correct functions. */
+#if !defined(HAVE_MUTEX_HYBRID) && ( \
+ defined(HAVE_MUTEX_PTHREADS) || \
+ defined(HAVE_MUTEX_SOLARIS_LWP) || \
+ defined(HAVE_MUTEX_UI_THREADS))
+#define __mutex_init(a, b, c) __db_pthread_mutex_init(a, b, c)
+#define __mutex_lock(a, b) __db_pthread_mutex_lock(a, b, 0)
+#define __mutex_timedlock(a, b, c) __db_pthread_mutex_lock(a, b, c)
+#define __mutex_unlock(a, b) __db_pthread_mutex_unlock(a, b)
+#define __mutex_destroy(a, b) __db_pthread_mutex_destroy(a, b)
+#define __mutex_trylock(a, b) __db_pthread_mutex_trylock(a, b)
+/*
+ * These trylock versions do not support DB_ENV_FAILCHK. Callers that
+ * loop checking mutexes held by dead processes or threads might spin.
+ * These have ANSI-style definitions because this file can be included by
+ * C++ files, and extern "C" affects linkage only, not argument typing.
+ */
+static inline int __db_pthread_mutex_trylock(ENV *env, db_mutex_t mutex)
+{
+ int ret;
+ DB_MUTEX *mutexp;
+ if (!MUTEX_ON(env) || F_ISSET(env->dbenv, DB_ENV_NOLOCKING))
+ return (0);
+ mutexp = MUTEXP_SET(env, mutex);
+#ifdef HAVE_SHARED_LATCHES
+ if (F_ISSET(mutexp, DB_MUTEX_SHARED))
+ ret = pthread_rwlock_trywrlock(&mutexp->u.rwlock);
+ else
+#endif
+ ret = pthread_mutex_trylock(&mutexp->u.m.mutex);
+ if (ret == EBUSY)
+ ret = DB_LOCK_NOTGRANTED;
+ else if (ret == 0) {
+ F_SET(mutexp, DB_MUTEX_LOCKED);
+ env->dbenv->thread_id(env->dbenv, &mutexp->pid, &mutexp->tid);
+ STAT_INC(env,
+ mutex, set_nowait, mutexp->mutex_set_nowait, mutex);
+ }
+ return (ret);
+}
+#ifdef HAVE_SHARED_LATCHES
+#define __mutex_rdlock(a, b) __db_pthread_mutex_readlock(a, b)
+#define __mutex_tryrdlock(a, b) __db_pthread_mutex_tryreadlock(a, b)
+static inline int __db_pthread_mutex_tryreadlock(ENV *env, db_mutex_t mutex)
+{
+ int ret;
+ DB_MUTEX *mutexp;
+ if (!MUTEX_ON(env) || F_ISSET(env->dbenv, DB_ENV_NOLOCKING))
+ return (0);
+ mutexp = MUTEXP_SET(env, mutex);
+ if (F_ISSET(mutexp, DB_MUTEX_SHARED))
+ ret = pthread_rwlock_tryrdlock(&mutexp->u.rwlock);
+ else
+ return (EINVAL);
+ if (ret == EBUSY)
+ ret = DB_LOCK_NOTGRANTED;
+#ifdef HAVE_STATISTICS
+ if (ret == 0)
+ STAT_INC(env,
+ mutex, set_rd_nowait, mutexp->mutex_set_nowait, mutex);
+#endif
+ return (ret);
+}
+#endif
+#elif defined(HAVE_MUTEX_WIN32) || defined(HAVE_MUTEX_WIN32_GCC)
+#define __mutex_init(a, b, c) __db_win32_mutex_init(a, b, c)
+#define __mutex_lock(a, b) __db_win32_mutex_lock(a, b, 0)
+#define __mutex_timedlock(a, b, c) __db_win32_mutex_lock(a, b, c)
+#define __mutex_trylock(a, b) __db_win32_mutex_trylock(a, b)
+#define __mutex_unlock(a, b) __db_win32_mutex_unlock(a, b)
+#define __mutex_destroy(a, b) __db_win32_mutex_destroy(a, b)
+#ifdef HAVE_SHARED_LATCHES
+#define __mutex_rdlock(a, b) __db_win32_mutex_readlock(a, b)
+#define __mutex_tryrdlock(a, b) __db_win32_mutex_tryreadlock(a, b)
+#endif
+#elif defined(HAVE_MUTEX_FCNTL)
+#define __mutex_init(a, b, c) __db_fcntl_mutex_init(a, b, c)
+#define __mutex_lock(a, b) __db_fcntl_mutex_lock(a, b, 0)
+#define __mutex_timedlock(a, b, c) __db_fcntl_lock(a, b, c)
+#define __mutex_trylock(a, b) __db_fcntl_mutex_trylock(a, b)
+#define __mutex_unlock(a, b) __db_fcntl_mutex_unlock(a, b)
+#define __mutex_destroy(a, b) __db_fcntl_mutex_destroy(a, b)
+#else
+#define __mutex_init(a, b, c) __db_tas_mutex_init(a, b, c)
+#define __mutex_lock(a, b) __db_tas_mutex_lock(a, b, 0)
+#define __mutex_timedlock(a, b, c) __db_tas_mutex_lock(a, b, c)
+#define __mutex_trylock(a, b) __db_tas_mutex_trylock(a, b)
+#define __mutex_unlock(a, b) __db_tas_mutex_unlock(a, b)
+#define __mutex_destroy(a, b) __db_tas_mutex_destroy(a, b)
+#if defined(HAVE_SHARED_LATCHES)
+#define __mutex_rdlock(a, b) __db_tas_mutex_readlock(a, b)
+#define __mutex_tryrdlock(a,b) __db_tas_mutex_tryreadlock(a, b)
+#endif
+#endif
+
+/*
+ * When there is no method to get a shared latch, fall back to
+ * implementing __mutex_rdlock() as getting an exclusive one.
+ * This occurs either when !HAVE_SHARED_LATCHES or HAVE_MUTEX_FCNTL.
+ */
+#ifndef __mutex_rdlock
+#define __mutex_rdlock(a, b) __mutex_lock(a, b)
+#endif
+#ifndef __mutex_tryrdlock
+#define __mutex_tryrdlock(a, b) __mutex_trylock(a, b)
+#endif
+
+/*
+ * Lock/unlock a mutex. If the mutex was never required, the thread of
+ * control can proceed without it.
+ *
+ * We never fail to acquire or release a mutex without panicking. Simplify
+ * the macros to always return a panic value rather than saving the actual
+ * return value of the mutex routine.
+ */
+#ifdef HAVE_MUTEX_SUPPORT
+#define MUTEX_LOCK(env, mutex) do { \
+ if ((mutex) != MUTEX_INVALID && \
+ __mutex_lock(env, mutex) != 0) \
+ return (DB_RUNRECOVERY); \
+} while (0)
+
+/*
+ * Always check the return value of MUTEX_TRYLOCK()! Expect 0 on success,
+ * or DB_LOCK_NOTGRANTED, or possibly DB_RUNRECOVERY for failchk.
+ */
+#define MUTEX_TRYLOCK(env, mutex) \
+ (((mutex) == MUTEX_INVALID) ? 0 : __mutex_trylock(env, mutex))
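+
+/*
+ * For example (a sketch of the required error handling):
+ *
+ *	if ((ret = MUTEX_TRYLOCK(env, mutex)) != 0) {
+ *		if (ret != DB_LOCK_NOTGRANTED)
+ *			return (ret);	(e.g., DB_RUNRECOVERY)
+ *		(the mutex is busy; take the slow path)
+ *	}
+ */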
+
+/*
+ * Acquire a DB_MUTEX_SHARED "mutex" in shared mode.
+ */
+#define MUTEX_READLOCK(env, mutex) do { \
+ if ((mutex) != MUTEX_INVALID && \
+ __mutex_rdlock(env, mutex) != 0) \
+ return (DB_RUNRECOVERY); \
+} while (0)
+#define MUTEX_TRY_READLOCK(env, mutex) \
+ ((mutex) != MUTEX_INVALID ? __mutex_tryrdlock(env, mutex) : 0)
+
+#define MUTEX_UNLOCK(env, mutex) do { \
+ if ((mutex) != MUTEX_INVALID && \
+ __mutex_unlock(env, mutex) != 0) \
+ return (DB_RUNRECOVERY); \
+} while (0)
+
+#define MUTEX_WAIT(env, mutex, duration) do { \
+ int __ret; \
+ if ((mutex) != MUTEX_INVALID && \
+ (__ret = __mutex_timedlock(env, mutex, duration)) != 0 && \
+ __ret != DB_TIMEOUT) \
+ return (DB_RUNRECOVERY); \
+} while (0)
+#else
+/*
+ * There are calls to lock/unlock mutexes outside of #ifdef's -- replace
+ * the call with something the compiler can discard, but which will make
+ * if-then-else blocks work correctly.
+ */
+#define MUTEX_LOCK(env, mutex) (mutex) = (mutex)
+#define MUTEX_TRYLOCK(env, mutex) (mutex) = (mutex)
+#define MUTEX_READLOCK(env, mutex) (mutex) = (mutex)
+#define MUTEX_TRY_READLOCK(env, mutex) (mutex) = (mutex)
+#define MUTEX_UNLOCK(env, mutex) (mutex) = (mutex)
+#define MUTEX_REQUIRED(env, mutex) (mutex) = (mutex)
+#define MUTEX_REQUIRED_READ(env, mutex) (mutex) = (mutex)
+#define MUTEX_WAIT(env, mutex, duration) (mutex) = (mutex)
+#endif
+
+/*
+ * Berkeley DB ports may require single-threading at places in the code.
+ */
+#ifdef HAVE_MUTEX_VXWORKS
+#include "taskLib.h"
+/*
+ * Use the taskLock() mutex to eliminate a race where two tasks are
+ * trying to initialize the global lock at the same time.
+ */
+#define DB_BEGIN_SINGLE_THREAD do { \
+ if (DB_GLOBAL(db_global_init)) \
+ (void)semTake(DB_GLOBAL(db_global_lock), WAIT_FOREVER); \
+ else { \
+ taskLock(); \
+ if (DB_GLOBAL(db_global_init)) { \
+ taskUnlock(); \
+ (void)semTake(DB_GLOBAL(db_global_lock), \
+ WAIT_FOREVER); \
+ continue; \
+ } \
+ DB_GLOBAL(db_global_lock) = \
+ semBCreate(SEM_Q_FIFO, SEM_EMPTY); \
+ if (DB_GLOBAL(db_global_lock) != NULL) \
+ DB_GLOBAL(db_global_init) = 1; \
+ taskUnlock(); \
+ } \
+} while (DB_GLOBAL(db_global_init) == 0)
+#define DB_END_SINGLE_THREAD (void)semGive(DB_GLOBAL(db_global_lock))
+#endif
+
+/*
+ * Single-threading defaults to a no-op.
+ */
+#ifndef DB_BEGIN_SINGLE_THREAD
+#define DB_BEGIN_SINGLE_THREAD
+#endif
+#ifndef DB_END_SINGLE_THREAD
+#define DB_END_SINGLE_THREAD
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#include "dbinc_auto/mutex_ext.h"
+#endif /* !_DB_MUTEX_H_ */
diff --git a/src/dbinc/mutex_int.h b/src/dbinc/mutex_int.h
new file mode 100644
index 00000000..b9bccdf7
--- /dev/null
+++ b/src/dbinc/mutex_int.h
@@ -0,0 +1,1070 @@
+/*
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_MUTEX_INT_H_
+#define _DB_MUTEX_INT_H_
+
+#include "dbinc/atomic.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * Mutexes and Shared Latches
+ *
+ * Mutexes may be test-and-set (spinning & yielding when busy),
+ * native versions (pthreads, WaitForSingleObject)
+ * or a hybrid which has the lower no-contention overhead of test-and-set
+ * mutexes, using operating system calls only to block and wake up.
+ *
+ * Hybrid exclusive-only mutexes include a 'tas' field.
+ * Hybrid DB_MUTEX_SHARED latches also include a 'shared' field.
+ */
+
+/*********************************************************************
+ * POSIX.1 pthreads interface.
+ *********************************************************************/
+#if defined(HAVE_MUTEX_PTHREADS)
+/*
+ * Pthreads-based mutexes (exclusive-only) and latches (possibly shared)
+ * have the same MUTEX_FIELDS union. Different parts of the union are used
+ * depending on:
+ * - whether HAVE_SHARED_LATCHES is defined, and
+ * - if HAVE_SHARED_LATCHES, whether this particular instance of a mutex
+ *   is a shared latch (DB_MUTEX_SHARED).
+ *
+ * The rwlock part of the union is used *only* for non-hybrid shared latches;
+ * in all other cases the mutex and cond fields are the only ones used.
+ *
+ *	configuration			which fields of the union are used
+ *					mutex	cond	rwlock	tas
+ *	Native mutexes			  y	  y
+ *	Hybrid mutexes			  y	  y		 y
+ *	Native shared latches				  y
+ *	Hybrid shared latches		  y	  y		 y
+ *
+ * They all have a condition variable which is used only for
+ * DB_MUTEX_SELF_BLOCK waits.
+ *
+ * There can be no self-blocking shared latches: the pthread_cond_wait()
+ * would require getting a pthread_mutex_t, and it would not make sense
+ * anyway.
+ */
+#define MUTEX_FIELDS \
+ union { \
+ struct { \
+ pthread_mutex_t mutex; /* Mutex */ \
+ pthread_cond_t cond; /* Condition variable */ \
+ } m; \
+ pthread_rwlock_t rwlock; /* Read/write lock */ \
+ } u;
+
+#if defined(HAVE_SHARED_LATCHES) && !defined(HAVE_MUTEX_HYBRID)
+#define RET_SET_PTHREAD_LOCK(mutexp, ret) do { \
+ if (F_ISSET(mutexp, DB_MUTEX_SHARED)) \
+ RET_SET((pthread_rwlock_wrlock(&(mutexp)->u.rwlock)), \
+ ret); \
+ else \
+ RET_SET((pthread_mutex_lock(&(mutexp)->u.m.mutex)), ret); \
+} while (0)
+#define RET_SET_PTHREAD_TRYLOCK(mutexp, ret) do { \
+ if (F_ISSET(mutexp, DB_MUTEX_SHARED)) \
+ RET_SET((pthread_rwlock_trywrlock(&(mutexp)->u.rwlock)), \
+ ret); \
+ else \
+ RET_SET((pthread_mutex_trylock(&(mutexp)->u.m.mutex)), \
+ ret); \
+} while (0)
+#else
+#define RET_SET_PTHREAD_LOCK(mutexp, ret) \
+ RET_SET(pthread_mutex_lock(&(mutexp)->u.m.mutex), ret);
+#define RET_SET_PTHREAD_TRYLOCK(mutexp, ret) \
+ RET_SET(pthread_mutex_trylock(&(mutexp)->u.m.mutex), ret);
+#endif
+#endif
+
+#ifdef HAVE_MUTEX_UI_THREADS
+#include <thread.h>
+#endif
+
+/*********************************************************************
+ * Solaris lwp threads interface.
+ *
+ * !!!
+ * We use LWP mutexes on Solaris instead of UI or POSIX mutexes (both of
+ * which are available), for two reasons. First, the Solaris C library
+ * includes versions of both the UI and POSIX thread mutex interfaces, but
+ * they are broken in that they don't support inter-process locking, and
+ * there's no way to detect it, e.g., calls to configure the mutexes for
+ * inter-process locking succeed without error. So, we use LWP mutexes so
+ * that we don't fail in fairly undetectable ways because the application
+ * wasn't linked with the appropriate threads library. Second, there were
+ * bugs in SunOS 5.7 (Solaris 7) where if an application loaded the C library
+ * before loading the libthread/libpthread threads libraries (e.g., by using
+ * dlopen to load the DB library), the pwrite64 interface would be translated
+ * into a call to pwrite and DB would drop core.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_SOLARIS_LWP
+/*
+ * XXX
+ * Don't change <synch.h> to <sys/lwp.h> -- although lwp.h is listed in the
+ * Solaris manual page as the correct include to use, it causes the Solaris
+ * compiler on SunOS 2.6 to fail.
+ */
+#include <synch.h>
+
+#define MUTEX_FIELDS \
+ lwp_mutex_t mutex; /* Mutex. */ \
+ lwp_cond_t cond; /* Condition variable. */
+#endif
+
+/*********************************************************************
+ * Solaris/Unixware threads interface.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_UI_THREADS
+#include <thread.h>
+#include <synch.h>
+
+#define MUTEX_FIELDS \
+ mutex_t mutex; /* Mutex. */ \
+ cond_t cond; /* Condition variable. */
+#endif
+
+/*********************************************************************
+ * AIX C library functions.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_AIX_CHECK_LOCK
+#include <sys/atomic_op.h>
+typedef int tsl_t;
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+#define MUTEX_INIT(x) 0
+#define MUTEX_SET(x) (!_check_lock(x, 0, 1))
+#define MUTEX_UNSET(x) _clear_lock(x, 0)
+#endif
+#endif
+
+/*********************************************************************
+ * Apple/Darwin library functions.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_DARWIN_SPIN_LOCK_TRY
+typedef u_int32_t tsl_t;
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+extern int _spin_lock_try(tsl_t *);
+extern void _spin_unlock(tsl_t *);
+#define MUTEX_SET(tsl) _spin_lock_try(tsl)
+#define MUTEX_UNSET(tsl) _spin_unlock(tsl)
+#define MUTEX_INIT(tsl) (MUTEX_UNSET(tsl), 0)
+#endif
+#endif
+
+/*********************************************************************
+ * General C library functions (msemaphore).
+ *
+ * !!!
+ * Check for HPPA as a special case, because it requires unusual alignment,
+ * and doesn't support semaphores in malloc(3) or shmget(2) memory.
+ *
+ * !!!
+ * Do not remove the MSEM_IF_NOWAIT flag. The problem is that if a single
+ * process makes two msem_lock() calls in a row, the second one returns an
+ * error. We depend on the fact that we can lock against ourselves in the
+ * locking subsystem, where we set up a mutex so that we can block ourselves.
+ * Tested on OSF1 v4.0.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_HPPA_MSEM_INIT
+#define MUTEX_ALIGN 16
+#endif
+
+#if defined(HAVE_MUTEX_MSEM_INIT) || defined(HAVE_MUTEX_HPPA_MSEM_INIT)
+#include <sys/mman.h>
+typedef msemaphore tsl_t;
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+#define MUTEX_INIT(x) (msem_init(x, MSEM_UNLOCKED) <= (msemaphore *)0)
+#define MUTEX_SET(x) (!msem_lock(x, MSEM_IF_NOWAIT))
+#define MUTEX_UNSET(x) msem_unlock(x, 0)
+#endif
+#endif
+
+/*********************************************************************
+ * Plan 9 library functions.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_PLAN9
+typedef Lock tsl_t;
+
+#define MUTEX_INIT(x) (memset(x, 0, sizeof(Lock)), 0)
+#define MUTEX_SET(x) canlock(x)
+#define MUTEX_UNSET(x) unlock(x)
+#endif
+
+/*********************************************************************
+ * Reliant UNIX C library functions.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_RELIANTUNIX_INITSPIN
+#include <ulocks.h>
+typedef spinlock_t tsl_t;
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+#define MUTEX_INIT(x) (initspin(x, 1), 0)
+#define MUTEX_SET(x) (cspinlock(x) == 0)
+#define MUTEX_UNSET(x) spinunlock(x)
+#endif
+#endif
+
+/*********************************************************************
+ * General C library functions (POSIX 1003.1 sema_XXX).
+ *
+ * !!!
+ * Never selected by autoconfig in this release (semaphore calls are known
+ * to not work in Solaris 5.5).
+ *********************************************************************/
+#ifdef HAVE_MUTEX_SEMA_INIT
+#include <synch.h>
+typedef sema_t tsl_t;
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+#define MUTEX_DESTROY(x) sema_destroy(x)
+#define MUTEX_INIT(x) (sema_init(x, 1, USYNC_PROCESS, NULL) != 0)
+#define MUTEX_SET(x) (sema_wait(x) == 0)
+#define MUTEX_UNSET(x) sema_post(x)
+#endif
+#endif
+
+/*********************************************************************
+ * SGI C library functions.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_SGI_INIT_LOCK
+#include <abi_mutex.h>
+typedef abilock_t tsl_t;
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+#define MUTEX_INIT(x) (init_lock(x) != 0)
+#define MUTEX_SET(x) (!acquire_lock(x))
+#define MUTEX_UNSET(x) release_lock(x)
+#endif
+#endif
+
+/*********************************************************************
+ * Solaris C library functions.
+ *
+ * !!!
+ * These are undocumented functions, but they're the only ones that work
+ * correctly as far as we know.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_SOLARIS_LOCK_TRY
+#include <sys/atomic.h>
+#define MUTEX_MEMBAR(x) membar_enter()
+#define MEMBAR_ENTER() membar_enter()
+#define MEMBAR_EXIT() membar_exit()
+#include <sys/machlock.h>
+typedef lock_t tsl_t;
+
+/*
+ * The functions are declared in <sys/machlock.h>, but under #ifdef KERNEL.
+ * Re-declare them here to avoid warnings.
+ */
+extern int _lock_try(lock_t *);
+extern void _lock_clear(lock_t *);
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+#define MUTEX_INIT(x) 0
+#define MUTEX_SET(x) _lock_try(x)
+#define MUTEX_UNSET(x) _lock_clear(x)
+#endif
+#endif
+
+/*********************************************************************
+ * VMS.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_VMS
+#include <sys/mman.h>
+#include <builtins.h>
+typedef volatile unsigned char tsl_t;
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+#ifdef __ALPHA
+#define MUTEX_SET(tsl) (!__TESTBITSSI(tsl, 0))
+#else /* __VAX */
+#define MUTEX_SET(tsl) (!(int)_BBSSI(0, tsl))
+#endif
+#define MUTEX_UNSET(tsl) (*(tsl) = 0)
+#define MUTEX_INIT(tsl) (MUTEX_UNSET(tsl), 0)
+#endif
+#endif
+
+/*********************************************************************
+ * VxWorks
+ * Use basic binary semaphores in VxWorks, as we currently do not need
+ * any special features. We do need the ability to single-thread the
+ * entire system, however, because VxWorks doesn't support the open(2)
+ * flag O_EXCL, the mechanism we normally use to single thread access
+ * when we're first looking for a DB environment.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_VXWORKS
+#include "taskLib.h"
+typedef SEM_ID tsl_t;
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+/*
+ * Uses of this MUTEX_SET() need to have a local 'nowait' variable,
+ * which determines whether to return right away when the semaphore
+ * is busy or to wait until it is available.
+ */
+#define MUTEX_SET(tsl) \
+ (semTake((*(tsl)), nowait ? NO_WAIT : WAIT_FOREVER) == OK)
+#define MUTEX_UNSET(tsl) (semGive((*tsl)))
+#define MUTEX_INIT(tsl) \
+ ((*(tsl) = semBCreate(SEM_Q_FIFO, SEM_FULL)) == NULL)
+#define MUTEX_DESTROY(tsl) semDelete(*tsl)
+#endif
+#endif
+
+/*********************************************************************
+ * Win16
+ *
+ * Win16 spinlocks are simple because we cannot possibly be preempted.
+ *
+ * !!!
+ * We should simplify this by always returning a no-need-to-lock lock
+ * when we initialize the mutex.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_WIN16
+typedef unsigned int tsl_t;
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+#define MUTEX_INIT(x) 0
+#define MUTEX_SET(tsl) (*(tsl) = 1)
+#define MUTEX_UNSET(tsl) (*(tsl) = 0)
+#endif
+#endif
+
+/*********************************************************************
+ * Win32 - always a hybrid mutex
+ *********************************************************************/
+#if defined(HAVE_MUTEX_WIN32) || defined(HAVE_MUTEX_WIN32_GCC)
+typedef LONG volatile tsl_t;
+#define MUTEX_FIELDS \
+ LONG nwaiters; \
+ u_int32_t id; /* ID used for creating events */ \
+
+#if defined(LOAD_ACTUAL_MUTEX_CODE)
+#define MUTEX_SET(tsl) (!InterlockedExchange((PLONG)tsl, 1))
+#define MUTEX_UNSET(tsl) InterlockedExchange((PLONG)tsl, 0)
+#define MUTEX_INIT(tsl) MUTEX_UNSET(tsl)
+
+/*
+ * From Intel's performance tuning documentation (and see SR #6975):
+ * ftp://download.intel.com/design/perftool/cbts/appnotes/sse2/w_spinlock.pdf
+ *
+ * "For this reason, it is highly recommended that you insert the PAUSE
+ * instruction into all spin-wait code immediately. Using the PAUSE
+ * instruction does not affect the correctness of programs on existing
+ * platforms, and it improves performance on Pentium 4 processor platforms."
+ */
+#ifdef HAVE_MUTEX_WIN32
+#if !defined(_WIN64) && !defined(DB_WINCE)
+#define MUTEX_PAUSE {__asm{_emit 0xf3}; __asm{_emit 0x90}}
+#endif
+#endif
+#ifdef HAVE_MUTEX_WIN32_GCC
+#define MUTEX_PAUSE __asm__ volatile ("rep; nop" : : );
+#endif
+#endif
+#endif
+
+/*********************************************************************
+ * 68K/gcc assembly.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_68K_GCC_ASSEMBLY
+typedef unsigned char tsl_t;
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+/* gcc/68K: 0 is clear, 1 is set. */
+#define MUTEX_SET(tsl) ({ \
+ register tsl_t *__l = (tsl); \
+ int __r; \
+ __asm__ volatile("tas %1; \n \
+ seq %0" \
+ : "=dm" (__r), "=m" (*__l) \
+ : "1" (*__l) \
+ ); \
+ __r & 1; \
+})
+
+#define MUTEX_UNSET(tsl) (*(tsl) = 0)
+#define MUTEX_INIT(tsl) (MUTEX_UNSET(tsl), 0)
+#endif
+#endif
+
+/*********************************************************************
+ * ALPHA/gcc assembly.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_ALPHA_GCC_ASSEMBLY
+typedef u_int32_t tsl_t;
+
+#define MUTEX_ALIGN 4
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+/*
+ * For gcc/alpha. Returns 0 if the lock could not be acquired, 1 if it
+ * was acquired successfully.
+ */
+static inline int
+MUTEX_SET(tsl_t *tsl) {
+ register tsl_t *__l = tsl;
+ register tsl_t __r;
+ __asm__ volatile(
+ "1: ldl_l %0,%2\n"
+ " blbs %0,2f\n"
+ " or $31,1,%0\n"
+ " stl_c %0,%1\n"
+ " beq %0,3f\n"
+ " mb\n"
+ " br 3f\n"
+ "2: xor %0,%0\n"
+ "3:"
+ : "=&r"(__r), "=m"(*__l) : "1"(*__l) : "memory");
+ return __r;
+}
+
+/*
+ * Unset the mutex. Judging by the Alpha Architecture Handbook, the mb
+ * instruction may be necessary before unlocking.
+ */
+static inline int
+MUTEX_UNSET(tsl_t *tsl) {
+ __asm__ volatile(" mb\n");
+ return *tsl = 0;
+}
+
+#define MUTEX_INIT(tsl) MUTEX_UNSET(tsl)
+#endif
+#endif
+
+/*********************************************************************
+ * Tru64/cc assembly.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_TRU64_CC_ASSEMBLY
+typedef volatile u_int32_t tsl_t;
+
+#define MUTEX_ALIGN 4
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+#include <alpha/builtins.h>
+#define MUTEX_SET(tsl) (__LOCK_LONG_RETRY((tsl), 1) != 0)
+#define MUTEX_UNSET(tsl) (__UNLOCK_LONG(tsl))
+
+#define MUTEX_INIT(tsl) (MUTEX_UNSET(tsl), 0)
+#endif
+#endif
+
+/*********************************************************************
+ * ARM/gcc assembly.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_ARM_GCC_ASSEMBLY
+typedef unsigned char tsl_t;
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+/* gcc/arm: 0 is clear, 1 is set. */
+#define MUTEX_SET(tsl) ({ \
+ int __r; \
+ __asm__ volatile( \
+ "swpb %0, %1, [%2]\n\t" \
+ "eor %0, %0, #1\n\t" \
+ : "=&r" (__r) \
+ : "r" (1), "r" (tsl) \
+ ); \
+ __r & 1; \
+})
+
+#define MUTEX_UNSET(tsl) (*(volatile tsl_t *)(tsl) = 0)
+#define MUTEX_INIT(tsl) (MUTEX_UNSET(tsl), 0)
+#endif
+#endif
+
+/*********************************************************************
+ * HPPA/gcc assembly.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_HPPA_GCC_ASSEMBLY
+typedef u_int32_t tsl_t;
+
+#define MUTEX_ALIGN 16
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+/*
+ * The PA-RISC has a "load and clear" instead of a "test and set" instruction.
+ * The 32-bit word used by that instruction must be 16-byte aligned. We could
+ * use the "aligned" attribute in GCC but that doesn't work for stack variables.
+ */
+#define MUTEX_SET(tsl) ({ \
+ register tsl_t *__l = (tsl); \
+ int __r; \
+ __asm__ volatile("ldcws 0(%1),%0" : "=r" (__r) : "r" (__l)); \
+ __r & 1; \
+})
+
+#define MUTEX_UNSET(tsl) (*(volatile tsl_t *)(tsl) = -1)
+#define MUTEX_INIT(tsl) (MUTEX_UNSET(tsl), 0)
+#endif
+#endif
+
+/*********************************************************************
+ * IA64/gcc assembly.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_IA64_GCC_ASSEMBLY
+typedef volatile unsigned char tsl_t;
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+/* gcc/ia64: 0 is clear, 1 is set. */
+#define MUTEX_SET(tsl) ({ \
+ register tsl_t *__l = (tsl); \
+ long __r; \
+ __asm__ volatile("xchg1 %0=%1,%2" : \
+ "=r"(__r), "+m"(*__l) : "r"(1)); \
+ __r ^ 1; \
+})
+
+/*
+ * Store through a "volatile" pointer so we get a store with "release"
+ * semantics.
+ */
+#define MUTEX_UNSET(tsl) (*(tsl_t *)(tsl) = 0)
+#define MUTEX_INIT(tsl) (MUTEX_UNSET(tsl), 0)
+#endif
+#endif
+
+/*********************************************************************
+ * PowerPC/gcc assembly.
+ *********************************************************************/
+#if defined(HAVE_MUTEX_PPC_GCC_ASSEMBLY)
+typedef u_int32_t tsl_t;
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+/*
+ * The PowerPC does a sort of pseudo-atomic locking. You set up a
+ * 'reservation' on a chunk of memory containing a mutex by loading the
+ * mutex value with LWARX. If the mutex has an 'unlocked' (arbitrary)
+ * value, you then try storing into it with STWCX. If no other process or
+ * thread broke your 'reservation' by modifying the memory containing the
+ * mutex, then the STWCX succeeds; otherwise it fails and you try to get
+ * a reservation again.
+ *
+ * While mutexes are explicitly 4 bytes, a 'reservation' applies to an
+ * entire cache line, normally 32 bytes, aligned naturally. If the mutex
+ * lives near data that gets changed a lot, there's a chance that you'll
+ * see more broken reservations than you might otherwise. The only
+ * situation in which this might be a problem is if one processor is
+ * beating on a variable in the same cache block as the mutex while another
+ * processor tries to acquire the mutex. That's bad news regardless
+ * because of the way it bashes caches, but if you can't guarantee that a
+ * mutex will reside in a relatively quiescent cache line, you might
+ * consider padding the mutex to force it to live in a cache line by
+ * itself. No, you aren't guaranteed that cache lines are 32 bytes. Some
+ * embedded processors use 16-byte cache lines, while some 64-bit
+ * processors use 128-byte cache lines. But assuming a 32-byte cache line
+ * won't get you into trouble for now.
+ *
+ * If mutex locking is a bottleneck, then you can speed it up by adding a
+ * regular LWZ load before the LWARX load, so that you can test for the
+ * common case of a locked mutex without wasting cycles making a reservation.
+ *
+ * gcc/ppc: 0 is clear, 1 is set.
+ */
+static inline int
+MUTEX_SET(int *tsl) {
+ int __r;
+ __asm__ volatile (
+"0: \n\t"
+" lwarx %0,0,%1 \n\t"
+" cmpwi %0,0 \n\t"
+" bne- 1f \n\t"
+" stwcx. %1,0,%1 \n\t"
+" isync \n\t"
+" beq+ 2f \n\t"
+" b 0b \n\t"
+"1: \n\t"
+" li %1,0 \n\t"
+"2: \n\t"
+ : "=&r" (__r), "+r" (tsl)
+ :
+ : "cr0", "memory");
+ return (int)tsl;
+}
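+
+/*
+ * Note on the return value above: on success the asm leaves 'tsl' holding
+ * the (non-zero) mutex address, while the failure path clears it to 0, so
+ * the cast produces the expected non-zero/zero result.
+ */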
+
+static inline int
+MUTEX_UNSET(tsl_t *tsl) {
+ __asm__ volatile("sync" : : : "memory");
+ return *tsl = 0;
+}
+#define MUTEX_INIT(tsl) MUTEX_UNSET(tsl)
+#endif
+#endif
+
+/*********************************************************************
+ * OS/390 C.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_S390_CC_ASSEMBLY
+typedef int tsl_t;
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+/*
+ * cs() is declared in <stdlib.h> but is built in to the compiler.
+ * Must use LANGLVL(EXTENDED) to get its declaration.
+ */
+#define MUTEX_SET(tsl) (!cs(&zero, (tsl), 1))
+#define MUTEX_UNSET(tsl) (*(tsl) = 0)
+#define MUTEX_INIT(tsl) (MUTEX_UNSET(tsl), 0)
+#endif
+#endif
+
+/*********************************************************************
+ * S/390 32-bit assembly.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_S390_GCC_ASSEMBLY
+typedef int tsl_t;
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+/* gcc/S390: 0 is clear, 1 is set. */
+static inline int
+MUTEX_SET(tsl_t *tsl) {
+	register tsl_t *__l = (tsl);
+	int __r;
+	__asm__ volatile(
+	    "    la    1,%1\n"
+	    "    lhi   0,1\n"
+	    "    l     %0,%1\n"
+	    "0:  cs    %0,0,0(1)\n"
+	    "    jl    0b"
+	    : "=&d" (__r), "+m" (*__l)
+	    : : "0", "1", "cc");
+	return !__r;
+}
+
+#define MUTEX_UNSET(tsl) (*(tsl) = 0)
+#define MUTEX_INIT(tsl) (MUTEX_UNSET(tsl), 0)
+#endif
+#endif
+
+/*********************************************************************
+ * SCO/cc assembly.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_SCO_X86_CC_ASSEMBLY
+typedef unsigned char tsl_t;
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+/*
+ * UnixWare has threads in libthread, but OpenServer doesn't (yet).
+ *
+ * cc/x86: 0 is clear, 1 is set.
+ */
+#if defined(__USLC__)
+asm int
+_tsl_set(void *tsl)
+{
+%mem tsl
+ movl tsl, %ecx
+ movl $1, %eax
+ lock
+ xchgb (%ecx),%al
+ xorl $1,%eax
+}
+#endif
+
+#define MUTEX_SET(tsl) _tsl_set(tsl)
+#define MUTEX_UNSET(tsl) (*(tsl) = 0)
+#define MUTEX_INIT(tsl) (MUTEX_UNSET(tsl), 0)
+#endif
+#endif
+
+/*********************************************************************
+ * Sparc/gcc assembly.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_SPARC_GCC_ASSEMBLY
+typedef unsigned char tsl_t;
+
+#define MUTEX_ALIGN 8
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+/*
+ * The ldstub instruction takes the location specified by its first argument
+ * (a register containing a memory address) and loads its contents into its
+ * second argument (a register) and atomically sets the contents of the
+ * specified by its first argument to a byte of 1s. (The value in the second
+ * argument is never read, but only overwritten.)
+ *
+ * Hybrid mutexes require membar #StoreLoad and #LoadStore ordering on multi-
+ * processor v9 systems.
+ *
+ * gcc/sparc: 0 is clear, 1 is set.
+ */
+#define MUTEX_SET(tsl) ({ \
+ register tsl_t *__l = (tsl); \
+ register tsl_t __r; \
+ __asm__ volatile \
+ ("ldstub [%1],%0; stbar" \
+ : "=r"( __r) : "r" (__l)); \
+ !__r; \
+})
+
+#define MUTEX_UNSET(tsl) (*(tsl) = 0, MUTEX_MEMBAR(tsl))
+#define MUTEX_INIT(tsl) (MUTEX_UNSET(tsl), 0)
+#define MUTEX_MEMBAR(x) \
+ ({ __asm__ volatile ("membar #StoreStore|#StoreLoad|#LoadStore"); })
+#define MEMBAR_ENTER() \
+ ({ __asm__ volatile ("membar #StoreStore|#StoreLoad"); })
+#define MEMBAR_EXIT() \
+ ({ __asm__ volatile ("membar #StoreStore|#LoadStore"); })
+#endif
+#endif
+
+/*********************************************************************
+ * UTS/cc assembly.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_UTS_CC_ASSEMBLY
+typedef int tsl_t;
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+#define MUTEX_INIT(x) 0
+#define MUTEX_SET(x) (!uts_lock(x, 1))
+#define MUTEX_UNSET(x) (*(x) = 0)
+#endif
+#endif
+
+/*********************************************************************
+ * MIPS/gcc assembly.
+ *********************************************************************/
+#ifdef HAVE_MUTEX_MIPS_GCC_ASSEMBLY
+typedef u_int32_t tsl_t;
+
+#define MUTEX_ALIGN 4
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+/*
+ * For gcc/MIPS. Returns 0 if the lock could not be acquired, 1 if it
+ * was acquired successfully.
+ */
+static inline int
+MUTEX_SET(tsl_t *tsl) {
+ register tsl_t *__l = tsl;
+ register tsl_t __r, __t;
+ __asm__ volatile(
+ " .set push \n"
+ " .set mips2 \n"
+ " .set noreorder \n"
+ " .set nomacro \n"
+ "1: ll %0, %3 \n"
+ " ori %2, %0, 1 \n"
+ " sc %2, %1 \n"
+ " beqzl %2, 1b \n"
+ " nop \n"
+ " andi %2, %0, 1 \n"
+ " sync \n"
+ " .set reorder \n"
+ " .set pop \n"
+ : "=&r" (__t), "=m" (*tsl), "=&r" (__r)
+ : "m" (*tsl)
+ : "memory");
+ return (!__r);
+}
+
+static inline void
+MUTEX_UNSET(tsl_t *tsl) {
+ __asm__ volatile(
+ " .set noreorder \n"
+ " sync \n"
+ " sw $0, %0 \n"
+ " .set reorder \n"
+ : "=m" (*tsl)
+ : "m" (*tsl)
+ : "memory");
+}
+
+#define MUTEX_INIT(tsl) (*(tsl) = 0)
+#endif
+#endif
+
+/*********************************************************************
+ * x86/gcc (32- and 64-bit) assembly.
+ *********************************************************************/
+#if defined(HAVE_MUTEX_X86_GCC_ASSEMBLY) || \
+ defined(HAVE_MUTEX_X86_64_GCC_ASSEMBLY)
+typedef volatile unsigned char tsl_t;
+
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+/* gcc/x86: 0 is clear, 1 is set. */
+#define MUTEX_SET(tsl) ({ \
+ tsl_t __r; \
+ __asm__ volatile("movb $1, %b0\n\t" \
+ "xchgb %b0,%1" \
+ : "=&q" (__r) \
+ : "m" (*(tsl_t *)(tsl)) \
+ : "memory", "cc"); \
+ !__r; /* return 1 on success, 0 on failure */ \
+})
+
+#define MUTEX_UNSET(tsl) (*(tsl_t *)(tsl) = 0)
+#define MUTEX_INIT(tsl) (MUTEX_UNSET(tsl), 0)
+/*
+ * We need to pass a valid address to generate the memory barrier
+ * otherwise PURIFY will complain. Use something referenced recently
+ * and initialized.
+ */
+#if defined(HAVE_MUTEX_X86_GCC_ASSEMBLY)
+#define MUTEX_MEMBAR(addr) \
+ ({ __asm__ volatile ("lock; addl $0, %0" ::"m" (addr): "memory"); 1; })
+#else
+#define MUTEX_MEMBAR(addr) \
+ ({ __asm__ volatile ("mfence" ::: "memory"); 1; })
+#endif
+
+/*
+ * From Intel's performance tuning documentation (and see SR #6975):
+ * ftp://download.intel.com/design/perftool/cbts/appnotes/sse2/w_spinlock.pdf
+ *
+ * "For this reason, it is highly recommended that you insert the PAUSE
+ * instruction into all spin-wait code immediately. Using the PAUSE
+ * instruction does not affect the correctness of programs on existing
+ * platforms, and it improves performance on Pentium 4 processor platforms."
+ */
+#define MUTEX_PAUSE __asm__ volatile ("rep; nop" : : );
+#endif
+#endif
+
+/* End of operating system & hardware architecture-specific definitions */
+
+/*
+ * Mutex alignment defaults to sizeof(unsigned int).
+ *
+ * !!!
+ * Various systems require different alignments for mutexes (the worst we've
+ * seen so far is 16 bytes on some HP architectures). Malloc(3) is assumed
+ * to return reasonable alignment; all other mutex users must ensure proper
+ * alignment locally.
+ */
+#ifndef MUTEX_ALIGN
+#define MUTEX_ALIGN sizeof(unsigned int)
+#endif
+
+/*
+ * Mutex destruction defaults to a no-op.
+ */
+#ifndef MUTEX_DESTROY
+#define MUTEX_DESTROY(x)
+#endif
+
+/*
+ * Mutex pause defaults to a no-op.
+ */
+#ifndef MUTEX_PAUSE
+#define MUTEX_PAUSE
+#endif
+
+/*
+ * If no native atomic support is available then use mutexes to
+ * emulate atomic increment, decrement, and compare-and-exchange.
+ * The address of the atomic value selects which of a small number
+ * of mutexes to use to protect the updates.
+ * The number of mutexes should be somewhat larger than the number of
+ * processors in the system in order to minimize unnecessary contention.
+ * Unless it has already been defined (e.g., in db_config.h), it defaults
+ * to a single mutex, which serializes all emulated atomic operations.
+ */
+#if !defined(HAVE_ATOMIC_SUPPORT) && defined(HAVE_MUTEX_SUPPORT) && \
+ !defined(MAX_ATOMIC_MUTEXES)
+#define MAX_ATOMIC_MUTEXES 1
+#endif
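+
+/*
+ * Hypothetical selection sketch (names illustrative only): hash the
+ * address of the atomic value down to one of the emulation mutexes, e.g.
+ *
+ *	mutex = mtxregion->mtx_atomic[
+ *	    ((uintptr_t)v >> 2) % MAX_ATOMIC_MUTEXES];
+ *	MUTEX_LOCK(env, mutex);
+ *	(atomically update *v here)
+ *	MUTEX_UNLOCK(env, mutex);
+ */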
+
+/*
+ * DB_MUTEXMGR --
+ * The mutex manager encapsulates the mutex system.
+ */
+struct __db_mutexmgr {
+ /* These fields are never updated after creation, so not protected. */
+ DB_ENV *dbenv; /* Environment */
+ REGINFO reginfo; /* Region information */
+
+ void *mutex_array; /* Base of the mutex array */
+};
+
+/* Macros to lock/unlock the mutex region as a whole. */
+#define MUTEX_SYSTEM_LOCK(dbenv) \
+ MUTEX_LOCK(dbenv, ((DB_MUTEXREGION *) \
+ (dbenv)->mutex_handle->reginfo.primary)->mtx_region)
+#define MUTEX_SYSTEM_UNLOCK(dbenv) \
+ MUTEX_UNLOCK(dbenv, ((DB_MUTEXREGION *) \
+ (dbenv)->mutex_handle->reginfo.primary)->mtx_region)
+
+/*
+ * DB_MUTEXREGION --
+ * The primary mutex data structure in the shared memory region.
+ */
+typedef struct __db_mutexregion { /* SHARED */
+ /* These fields are initialized at create time and never modified. */
+ roff_t mutex_off_alloc;/* Offset of mutex array */
+ roff_t mutex_off; /* Adjusted offset of mutex array */
+ db_size_t mutex_size; /* Size of the aligned mutex */
+ roff_t thread_off; /* Offset of the thread area. */
+
+ db_mutex_t mtx_region; /* Region mutex. */
+
+ /* Protected using the region mutex. */
+ db_mutex_t mutex_next; /* Next free mutex */
+
+#if !defined(HAVE_ATOMIC_SUPPORT) && defined(HAVE_MUTEX_SUPPORT)
+ /* Mutexes for emulating atomic operations. */
+ db_mutex_t mtx_atomic[MAX_ATOMIC_MUTEXES];
+#endif
+
+ DB_MUTEX_STAT stat; /* Mutex statistics */
+} DB_MUTEXREGION;
+
+#ifdef HAVE_MUTEX_SUPPORT
+struct __db_mutex_t { /* SHARED */ /* Mutex. */
+#ifdef MUTEX_FIELDS
+ MUTEX_FIELDS /* Opaque thread mutex structures. */
+#endif
+#ifndef HAVE_MUTEX_FCNTL
+#if defined(HAVE_MUTEX_HYBRID) || \
+ (defined(HAVE_SHARED_LATCHES) && !defined(HAVE_MUTEX_PTHREADS))
+ /*
+ * For hybrid and test-and-set shared latches it is a counter:
+ * 0 means it is free,
+ * -1 is exclusively locked,
+ * > 0 is the number of shared readers.
+ * Pthreads shared latches use pthread_rwlock instead.
+ */
+ tsl_t tas;
+ db_atomic_t sharecount;
+#elif !defined(MUTEX_FIELDS)
+ /*
+ * This is the Test and Set flag for exclusive latches (mutexes):
+ * there is a free value (often 0, 1, or -1) and a set value.
+ */
+ tsl_t tas;
+#endif
+#endif
+#ifdef HAVE_MUTEX_HYBRID
+ volatile u_int32_t wait; /* Count of waiters. */
+#endif
+ pid_t pid; /* Process owning mutex */
+ db_threadid_t tid; /* Thread owning mutex */
+
+ db_mutex_t mutex_next_link; /* Linked list of free mutexes. */
+
+#ifdef HAVE_STATISTICS
+ int alloc_id; /* Allocation ID. */
+
+ u_int32_t mutex_set_wait; /* Granted after wait. */
+ u_int32_t mutex_set_nowait; /* Granted without waiting. */
+#ifdef HAVE_SHARED_LATCHES
+ u_int32_t mutex_set_rd_wait; /* Granted shared lock after wait. */
+ u_int32_t mutex_set_rd_nowait; /* Granted shared lock w/out waiting. */
+#endif
+#ifdef HAVE_MUTEX_HYBRID
+ u_int32_t hybrid_wait;
+ u_int32_t hybrid_wakeup; /* for counting spurious wakeups */
+#endif
+#endif
+
+ /*
+ * A subset of the flag arguments for __mutex_alloc().
+ *
+	 * Flags should be an unsigned integer even if the possible flag
+	 * values don't require it: fetching a single byte is expensive on
+	 * some machines, and the mutex structure is an MP hot spot.
+ */
+ volatile u_int32_t flags; /* MUTEX_XXX */
+};
+#endif
+
+/* Macro to get a reference to a specific mutex. */
+#define MUTEXP_SET(env, indx) \
+ (F_ISSET(env, ENV_PRIVATE) ? (DB_MUTEX *) indx : \
+ (DB_MUTEX *)((u_int8_t *)env->mutex_handle->mutex_array + \
+ (indx) * \
+ ((DB_MUTEXREGION *)env->mutex_handle->reginfo.primary)->mutex_size))
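+
+/*
+ * Illustrative example: in a shared environment with a mutex_size of, say,
+ * 64 bytes, MUTEXP_SET(env, 7) resolves to mutex_array + 7 * 64; a private
+ * (heap) environment instead stores the DB_MUTEX address itself in 'indx'.
+ */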
+
+/*
+ * Check that a particular mutex is exclusively held at least by someone, not
+ * necessarily the current thread.
+ */
+#ifdef HAVE_MUTEX_SUPPORT
+#define MUTEX_IS_OWNED(env, mutex) \
+ (mutex == MUTEX_INVALID || !MUTEX_ON(env) || \
+ F_ISSET(env->dbenv, DB_ENV_NOLOCKING) || \
+ F_ISSET(MUTEXP_SET(env, mutex), DB_MUTEX_LOCKED))
+#else
+#define MUTEX_IS_OWNED(env, mutex) 0
+#endif
+
+#if defined(HAVE_MUTEX_HYBRID) || defined(DB_WIN32) || \
+ (defined(HAVE_SHARED_LATCHES) && !defined(HAVE_MUTEX_PTHREADS))
+#define MUTEXP_IS_BUSY(mutexp) \
+ (F_ISSET(mutexp, DB_MUTEX_SHARED) ? \
+ (atomic_read(&(mutexp)->sharecount) != 0) : \
+ F_ISSET(mutexp, DB_MUTEX_LOCKED))
+#define MUTEXP_BUSY_FIELD(mutexp) \
+ (F_ISSET(mutexp, DB_MUTEX_SHARED) ? \
+ (atomic_read(&(mutexp)->sharecount)) : (mutexp)->flags)
+#else
+/* Pthread_rwlocks don't have a low-cost 'is it being shared?' predicate. */
+#define MUTEXP_IS_BUSY(mutexp) (F_ISSET((mutexp), DB_MUTEX_LOCKED))
+#define MUTEXP_BUSY_FIELD(mutexp) ((mutexp)->flags)
+#endif
+
+#define MUTEX_IS_BUSY(env, mutex) \
+ (mutex == MUTEX_INVALID || !MUTEX_ON(env) || \
+ F_ISSET(env->dbenv, DB_ENV_NOLOCKING) || \
+ MUTEXP_IS_BUSY(MUTEXP_SET(env, mutex)))
+
+#define MUTEX_REQUIRED(env, mutex) \
+ DB_ASSERT(env, MUTEX_IS_OWNED(env, mutex))
+
+#define MUTEX_REQUIRED_READ(env, mutex) \
+ DB_ASSERT(env, MUTEX_IS_OWNED(env, mutex) || MUTEX_IS_BUSY(env, mutex))
+
+/*
+ * Test and set (and thus hybrid) shared latches use compare & exchange
+ * to acquire; the others the mutex-setting primitive defined above.
+ */
+#ifdef LOAD_ACTUAL_MUTEX_CODE
+
+#if defined(HAVE_SHARED_LATCHES)
+/*
+ * This is the value of the 'sharecount' of an exclusively held tas latch.
+ * The particular value is not special; it is just unlikely to be caused
+ * by releasing or acquiring a shared latch too many times.
+ */
+#define MUTEX_SHARE_ISEXCLUSIVE (-1024)
+
+/*
+ * Get an exclusive lock on a possibly sharable latch. We use the native
+ * MUTEX_SET() operation for non-sharable latches; it usually is faster.
+ */
+#define MUTEXP_ACQUIRE(mutexp) \
+ (F_ISSET(mutexp, DB_MUTEX_SHARED) ? \
+ atomic_compare_exchange(env, \
+ &(mutexp)->sharecount, 0, MUTEX_SHARE_ISEXCLUSIVE) : \
+ MUTEX_SET(&(mutexp)->tas))
+#else
+#define MUTEXP_ACQUIRE(mutexp) MUTEX_SET(&(mutexp)->tas)
+#endif
+
+#ifndef MEMBAR_ENTER
+#define MEMBAR_ENTER()
+#define MEMBAR_EXIT()
+#endif
+
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_DB_MUTEX_INT_H_ */
diff --git a/src/dbinc/os.h b/src/dbinc/os.h
new file mode 100644
index 00000000..2515e6ee
--- /dev/null
+++ b/src/dbinc/os.h
@@ -0,0 +1,178 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_OS_H_
+#define _DB_OS_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* Number of times to retry system calls that return EINTR or EBUSY. */
+#define DB_RETRY 100
+
+#ifdef __TANDEM
+/*
+ * OSS Tandem problem: fsync can return a Guardian file system error of 70,
+ * which has no symbolic name in OSS. HP says to retry the fsync. [#12957]
+ */
+#define RETRY_CHK(op, ret) do { \
+ int __retries, __t_ret; \
+ for ((ret) = 0, __retries = DB_RETRY;;) { \
+ if ((op) == 0) \
+ break; \
+ (ret) = __os_get_syserr(); \
+ if (((__t_ret = __os_posix_err(ret)) == EAGAIN || \
+ __t_ret == EBUSY || __t_ret == EINTR || \
+ __t_ret == EIO || __t_ret == 70) && --__retries > 0)\
+ continue; \
+ break; \
+ } \
+} while (0)
+#else
+#define RETRY_CHK(op, ret) do { \
+ int __retries, __t_ret; \
+ for ((ret) = 0, __retries = DB_RETRY;;) { \
+ if ((op) == 0) \
+ break; \
+ (ret) = __os_get_syserr(); \
+ if (((__t_ret = __os_posix_err(ret)) == EAGAIN || \
+ __t_ret == EBUSY || __t_ret == EINTR || \
+ __t_ret == EIO) && --__retries > 0) \
+ continue; \
+ break; \
+ } \
+} while (0)
+#endif
+
+#define RETRY_CHK_EINTR_ONLY(op, ret) do { \
+ int __retries; \
+ for ((ret) = 0, __retries = DB_RETRY;;) { \
+ if ((op) == 0) \
+ break; \
+ (ret) = __os_get_syserr(); \
+ if (__os_posix_err(ret) == EINTR && --__retries > 0) \
+ continue; \
+ break; \
+ } \
+} while (0)
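+
+/*
+ * A typical call site (illustrative sketch): wrap the raw system call in
+ * RETRY_CHK and test the captured errno-style result, e.g.
+ *
+ *	RETRY_CHK((close(fhp->fd)), ret);
+ *	if (ret != 0)
+ *		goto err;
+ *
+ * The 'op' expression must evaluate to 0 on success, non-zero on failure.
+ */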
+
+/*
+ * Flags understood by __os_open.
+ */
+#define DB_OSO_ABSMODE 0x0001 /* Absolute mode specified. */
+#define DB_OSO_CREATE 0x0002 /* POSIX: O_CREAT */
+#define DB_OSO_DIRECT 0x0004 /* Don't buffer the file in the OS. */
+#define DB_OSO_DSYNC 0x0008 /* POSIX: O_DSYNC. */
+#define DB_OSO_EXCL 0x0010 /* POSIX: O_EXCL */
+#define DB_OSO_RDONLY 0x0020 /* POSIX: O_RDONLY */
+#define DB_OSO_REGION 0x0040 /* Opening a region file. */
+#define DB_OSO_SEQ 0x0080 /* Expected sequential access. */
+#define DB_OSO_TEMP 0x0100 /* Remove after last close. */
+#define DB_OSO_TRUNC 0x0200 /* POSIX: O_TRUNC */
+
+/*
+ * File modes.
+ */
+#define DB_MODE_400 (S_IRUSR)
+#define DB_MODE_600 (S_IRUSR|S_IWUSR)
+#define DB_MODE_660 (S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP)
+#define DB_MODE_666 (S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH)
+#define DB_MODE_700 (S_IRUSR|S_IWUSR|S_IXUSR)
+
+/*
+ * We group certain seek/write calls into a single function so that we
+ * can use pread(2)/pwrite(2) where they're available.
+ */
+#define DB_IO_READ 1
+#define DB_IO_WRITE 2
+
+/*
+ * Make a last "panic" check. Imagine a thread of control running in Berkeley
+ * DB, going to sleep. Another thread of control decides to run recovery
+ * because the environment is broken. The first thing recovery does is panic
+ * the existing environment, but we only check the panic flag when crossing the
+ * public API. If the sleeping thread wakes up and writes something, we could
+ * have two threads of control writing the log files at the same time. So,
+ * before reading or writing, make a last panic check. Obviously, there's still
+ * a window, but it's very, very small.
+ */
+#define LAST_PANIC_CHECK_BEFORE_IO(env) \
+ PANIC_CHECK(env); \
+ if (env != NULL && \
+ F_ISSET((env)->dbenv, DB_ENV_NOFLUSH)) \
+		return (0)
+
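+/*
+ * Illustrative use (sketch): read/write paths invoke the check immediately
+ * before issuing the I/O, e.g.
+ *
+ *	LAST_PANIC_CHECK_BEFORE_IO(env);
+ *	nw = write(fhp->fd, buf, len);
+ */
+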
+/* DB filehandle. */
+struct __fh_t {
+ /*
+ * Linked list of DB_FH's, linked from the DB_ENV, used to keep track
+ * of all open file handles for resource cleanup.
+ */
+ TAILQ_ENTRY(__fh_t) q;
+
+ /*
+ * The file-handle mutex is only used to protect the handle/fd
+	 * across seek and read/write pairs; it does not protect the
+	 * reference count or any other fields in the structure.
+ */
+ db_mutex_t mtx_fh; /* Mutex to lock. */
+
+ int ref; /* Reference count. */
+
+#if defined(DB_WIN32)
+ HANDLE handle; /* Windows/32 file handle. */
+ HANDLE trunc_handle; /* Handle for truncate calls. */
+#endif
+ int fd; /* POSIX file descriptor. */
+
+ char *name; /* File name at open. */
+
+ /*
+ * Last seek statistics, used for zero-filling on filesystems
+ * that don't support it directly.
+ */
+ db_pgno_t pgno;
+ u_int32_t pgsize;
+ off_t offset;
+
+#ifdef HAVE_STATISTICS
+ u_int32_t seek_count; /* I/O statistics */
+ u_int32_t read_count;
+ u_int32_t write_count;
+#endif
+
+#define DB_FH_ENVLINK 0x01 /* We're linked on the DB_ENV. */
+#define DB_FH_NOSYNC 0x02 /* Handle doesn't need to be sync'd. */
+#define DB_FH_OPENED 0x04 /* Handle is valid. */
+#define DB_FH_UNLINK 0x08 /* Unlink on close */
+#define DB_FH_REGION 0x10 /* Opened to contain a region */
+ u_int8_t flags;
+};
+
+/* Standard buffer size for ctime/ctime_r function calls. */
+#define CTIME_BUFLEN 26
+
+/*
+ * VxWorks requires we cast (const char *) variables to (char *) in order to
+ * pass them to system calls like stat, read and write.
+ */
+#ifdef HAVE_VXWORKS
+#define CHAR_STAR_CAST (char *)
+#define VOID_STAR_CAST (void *)
+#else
+#define CHAR_STAR_CAST
+#define VOID_STAR_CAST
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#include "dbinc_auto/os_ext.h"
+#endif /* !_DB_OS_H_ */
diff --git a/src/dbinc/partition.h b/src/dbinc/partition.h
new file mode 100644
index 00000000..09e42573
--- /dev/null
+++ b/src/dbinc/partition.h
@@ -0,0 +1,57 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * $Id$
+ */
+#ifndef _DB_PART_H_
+#define _DB_PART_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+typedef struct __db_partition {
+ u_int32_t nparts; /* number of partitions. */
+ DBT *keys; /* array of range keys. */
+ void *data; /* the partition info. */
+ const char **dirs; /* locations for partitions. */
+ DB **handles; /* array of partition handles. */
+ u_int32_t (*callback) (DB *, DBT *);
+#define PART_CALLBACK 0x01
+#define PART_RANGE 0x02
+ u_int32_t flags;
+} DB_PARTITION;
+
+/*
+ * Internal part of a partitioned cursor.
+ */
+typedef struct __part_internal {
+ __DBC_INTERNAL
+ u_int32_t part_id;
+ DBC *sub_cursor;
+} PART_CURSOR;
+
+#ifdef HAVE_PARTITION
+#define PART_NAME "__dbp.%s.%03d"
+#define PART_LEN (strlen("__dbp..")+3)
+#define PART_PREFIX "__dbp."
+#define IS_PARTITION_DB_FILE(name) (strncmp(name, PART_PREFIX, \
+ sizeof(PART_PREFIX) - 1) == 0)
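+
+/*
+ * Illustrative sketch: partition 3 of a database named "mydb" lives in the
+ * file produced by
+ *
+ *	snprintf(buf, sizeof(buf), PART_NAME, "mydb", 3);
+ *
+ * namely "__dbp.mydb.003".
+ */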
+
+#define DB_IS_PARTITIONED(dbp) \
+ (dbp->p_internal != NULL && \
+ ((DB_PARTITION *)dbp->p_internal)->handles != NULL)
+
+#define DBC_PART_REFRESH(dbc) (F_SET(dbc, DBC_PARTITIONED))
+#else
+#define DBC_PART_REFRESH(dbc)
+#define DB_IS_PARTITIONED(dbp) (0)
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+#endif
diff --git a/src/dbinc/perfmon.h b/src/dbinc/perfmon.h
new file mode 100644
index 00000000..c3b9b9fa
--- /dev/null
+++ b/src/dbinc/perfmon.h
@@ -0,0 +1,103 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_PERFMON_H_
+#define _DB_PERFMON_H_
+
+/*******************************************************
+ * Oracle Berkeley DB Performance Event Monitoring
+ *
+ * Some events inside of Oracle Berkeley DB can be 'published'
+ * to the operating environment's performance tracing system
+ * as they occur. Current support includes
+ * --enable-dtrace
+ * Solaris
+ * Linux (via SystemTap's dtrace wrappers)
+ * Darwin (Mac OS X)
+ * QNX(?)
+ *
+ ******************************************************/
+
+/*
+ * The performance monitoring system can display many of the statistics which
+ * are obtainable through the {DB,DB_ENV}->xxx_stat() functions. By default
+ * they are excluded. They can be enabled with --enable-perfmon-statistics.
+ */
+#ifdef HAVE_PERFMON_STATISTICS
+#define STAT_PERFMON1(env, cat, id, a1) PERFMON1(env, cat, id, (a1))
+#define STAT_PERFMON2(env, cat, id, a1, a2) \
+ PERFMON2(env, cat, id, (a1), (a2))
+#define STAT_PERFMON3(env, cat, id, a1, a2, a3) \
+ PERFMON3(env, cat, id, (a1), (a2), (a3))
+#else
+#define STAT_PERFMON1(env, cat, id, a1) NOP_STATEMENT
+#define STAT_PERFMON2(env, cat, id, a1, a2) NOP_STATEMENT
+#define STAT_PERFMON3(env, cat, id, a1, a2, a3) NOP_STATEMENT
+#endif
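+
+/*
+ * Illustrative sketch (hypothetical category/event names): a statistics
+ * update site can publish its new value with, e.g.,
+ *
+ *	STAT_PERFMON1(env, mpool, miss, ++mp->stat.st_cache_miss);
+ *
+ * which compiles to NOP_STATEMENT unless --enable-perfmon-statistics was
+ * configured.
+ */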
+
+#if defined(HAVE_PERFMON) && defined(HAVE_STATISTICS)
+/*
+ * The DTrace macros which are generated at configure time in db_provider.h can
+ * have full function signatures. These declarations are needed for compilation
+ * when DTrace support is enabled. It is "too early" in the include sequence
+ * to include the header files which define these structs.
+ */
+struct _db_page;
+struct __bh;
+struct __db_dbt;
+struct __sh_dbt;
+struct __db_mutex_t;
+
+#if defined(HAVE_DTRACE)
+/*
+ * Solaris 10, Darwin/Mac OS X starting in 10.6 (Snow Leopard), Linux with
+ * the DTrace-compatible version of SystemTap, possibly QNX.
+ */
+#include "db_provider.h"
+
+#define PERFMON0(env, cat, id) bdb_##cat##_##id()
+#define PERFMON1(env, cat, id, a1) bdb_##cat##_##id(a1)
+#define PERFMON2(env, cat, id, a1, a2) \
+ bdb_##cat##_##id((a1), (a2))
+#define PERFMON3(env, cat, id, a1, a2, a3) \
+ do { \
+ if (PERFMON_ENABLED(env, cat, id)) \
+ bdb_##cat##_##id((a1), (a2), (a3)); \
+ } while (0)
+#define PERFMON4(env, cat, id, a1, a2, a3, a4) \
+ do { \
+ if (PERFMON_ENABLED(env, cat, id)) \
+ bdb_##cat##_##id((a1), (a2), (a3), (a4)); \
+ } while (0)
+#define PERFMON5(env, cat, id, a1, a2, a3, a4, a5) \
+ do { \
+ if (PERFMON_ENABLED(env, cat, id)) \
+ bdb_##cat##_##id((a1), (a2), (a3), (a4), (a5)); \
+ } while (0)
+#define PERFMON6(env, cat, id, a1, a2, a3, a4, a5, a6) \
+ do { \
+ if (PERFMON_ENABLED(env, cat, id)) \
+ bdb_##cat##_##id((a1), (a2), (a3), (a4), (a5), (a6)); \
+ } while (0)
+#define PERFMON_ENABLED(env, cat, id) bdb_##cat##_##id##_enabled()
+#endif
+
+#else
+/* Without HAVE_PERFMON or HAVE_STATISTICS these macros map to null bodies. */
+#define PERFMON0(env, cat, id) NOP_STATEMENT
+#define PERFMON1(env, cat, id, a1) NOP_STATEMENT
+#define PERFMON2(env, cat, id, a1, a2) NOP_STATEMENT
+#define PERFMON3(env, cat, id, a1, a2, a3) NOP_STATEMENT
+#define PERFMON4(env, cat, id, a1, a2, a3, a4) NOP_STATEMENT
+#define PERFMON5(env, cat, id, a1, a2, a3, a4, a5) NOP_STATEMENT
+#define PERFMON6(env, cat, id, a1, a2, a3, a4, a5, a6) NOP_STATEMENT
+#define PERFMON_ENABLED(env, cat, id) FALSE
+#endif
+
+#endif
diff --git a/src/dbinc/qam.h b/src/dbinc/qam.h
new file mode 100644
index 00000000..657c11e2
--- /dev/null
+++ b/src/dbinc/qam.h
@@ -0,0 +1,203 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_QAM_H_
+#define _DB_QAM_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * QAM data elements: a status field and the data.
+ */
+typedef struct _qamdata {
+ u_int8_t flags; /* 00: delete bit. */
+#define QAM_VALID 0x01
+#define QAM_SET 0x02
+ u_int8_t data[1]; /* Record. */
+} QAMDATA;
+
+struct __queue; typedef struct __queue QUEUE;
+struct __qcursor; typedef struct __qcursor QUEUE_CURSOR;
+
+struct __qcursor {
+ /* struct __dbc_internal */
+ __DBC_INTERNAL
+
+ /* Queue private part */
+
+ /* Per-thread information: queue private. */
+ db_recno_t recno; /* Current record number. */
+
+ u_int32_t flags;
+};
+
+typedef struct __mpfarray {
+ u_int32_t n_extent; /* Number of extents in table. */
+ u_int32_t low_extent; /* First extent open. */
+ u_int32_t hi_extent; /* Last extent open. */
+ struct __qmpf {
+ int pinref;
+ DB_MPOOLFILE *mpf;
+ } *mpfarray; /* Array of open extents. */
+} MPFARRAY;
+
+/*
+ * The in-memory, per-tree queue data structure.
+ */
+struct __queue {
+ db_pgno_t q_meta; /* Database meta-data page. */
+ db_pgno_t q_root; /* Database root page. */
+
+ int re_pad; /* Fixed-length padding byte. */
+ u_int32_t re_len; /* Length for fixed-length records. */
+ u_int32_t rec_page; /* records per page */
+ u_int32_t page_ext; /* Pages per extent */
+ MPFARRAY array1, array2; /* File arrays. */
+
+ /* Extent file configuration: */
+ DBT pgcookie; /* Initialized pgcookie. */
+ DB_PGINFO pginfo; /* Initialized pginfo struct. */
+
+ char *path; /* Space allocated to file pathname. */
+ char *name; /* The name of the file. */
+ char *dir; /* The dir of the file. */
+ int mode; /* Mode to open extents. */
+};
+
+/* Format for queue extent names. */
+#define QUEUE_EXTENT "%s%c__dbq.%s.%d"
+#define QUEUE_EXTENT_HEAD "__dbq.%s."
+#define QUEUE_EXTENT_PREFIX "__dbq."
+
+typedef struct __qam_filelist {
+ DB_MPOOLFILE *mpf;
+ u_int32_t id;
+} QUEUE_FILELIST;
+
+/*
+ * Calculate the page number of a recno.
+ *
+ * Number of records per page =
+ * Divide the available space on the page by the record len + header.
+ *
+ * Page number for record =
+ * divide the physical record number by the records per page
+ * add the root page number
+ * For now the root page will always be 1, but we might want to change
+ * that in the future (e.g., multiple fixed-length queues per file).
+ *
+ * Index of record on page =
+ * physical record number, less the logical pno times records/page
+ */
+#define CALC_QAM_RECNO_PER_PAGE(dbp) \
+ (((dbp)->pgsize - QPAGE_SZ(dbp)) / \
+ (u_int32_t)DB_ALIGN((uintmax_t)SSZA(QAMDATA, data) + \
+ ((QUEUE *)(dbp)->q_internal)->re_len, sizeof(u_int32_t)))
+
+#define QAM_RECNO_PER_PAGE(dbp) (((QUEUE*)(dbp)->q_internal)->rec_page)
+
+#define QAM_RECNO_PAGE(dbp, recno) \
+ (((QUEUE *)(dbp)->q_internal)->q_root \
+ + (((recno) - 1) / QAM_RECNO_PER_PAGE(dbp)))
+
+#define QAM_PAGE_EXTENT(dbp, pgno) \
+ (((pgno) - 1) / ((QUEUE *)(dbp)->q_internal)->page_ext)
+
+#define QAM_RECNO_EXTENT(dbp, recno) \
+ QAM_PAGE_EXTENT(dbp, QAM_RECNO_PAGE(dbp, recno))
+
+#define QAM_RECNO_INDEX(dbp, pgno, recno) \
+ (u_int32_t)(((recno) - 1) - (QAM_RECNO_PER_PAGE(dbp) \
+	* ((pgno) - ((QUEUE *)(dbp)->q_internal)->q_root)))
+
+#define QAM_GET_RECORD(dbp, page, index) \
+ ((QAMDATA *)((u_int8_t *)(page) + (QPAGE_SZ(dbp) + \
+ (DB_ALIGN((uintmax_t)SSZA(QAMDATA, data) + \
+	((QUEUE *)(dbp)->q_internal)->re_len, sizeof(u_int32_t)) * (index)))))
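+
+/*
+ * Worked example (illustrative, assuming a 4096-byte page, a QPAGE_SZ(dbp)
+ * of 28 bytes and an re_len of 100): each record occupies
+ * DB_ALIGN(SSZA(QAMDATA, data) + 100, 4) = 104 bytes, so
+ * (4096 - 28) / 104 = 39 records fit per page; recno 100 then maps to page
+ * q_root + (99 / 39) = q_root + 2, at index 99 - 2 * 39 = 21 on that page.
+ */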
+
+#define QAM_OUTSIDE_QUEUE(meta, recno) \
+ (((meta)->cur_recno >= (meta)->first_recno ? \
+ ((recno) < (meta)->first_recno || \
+ (recno) > (meta)->cur_recno) : \
+ ((recno) > (meta)->cur_recno && \
+ (recno) < (meta)->first_recno)))
+
+#define QAM_AFTER_CURRENT(meta, recno) \
+ ((recno) == (meta)->cur_recno || \
+ (QAM_OUTSIDE_QUEUE(meta, recno) && \
+ ((recno) - (meta)->cur_recno) <= ((meta)->first_recno - (recno))))
+
+#define QAM_BEFORE_FIRST(meta, recno) \
+ (QAM_OUTSIDE_QUEUE(meta, recno) && \
+ ((meta)->first_recno - (recno)) < ((recno) - (meta)->cur_recno))
+
+#define QAM_NOT_VALID(meta, recno) \
+ (recno == RECNO_OOB || \
+ QAM_BEFORE_FIRST(meta, recno) || QAM_AFTER_CURRENT(meta, recno))
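+
+/*
+ * Worked example (illustrative): with first_recno = 4294967000 and
+ * cur_recno = 100 the queue has wrapped, so QAM_OUTSIDE_QUEUE() holds only
+ * for recnos strictly between 100 and 4294967000. A recno of 50 is inside
+ * the queue; 200 is outside and, being closer to cur_recno than to
+ * first_recno, satisfies QAM_AFTER_CURRENT().
+ */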
+
+#define QAM_WAKEUP(dbc, ret) do { \
+ if (STD_LOCKING(dbc)) { \
+ dbc->lock.pgno = PGNO_INVALID; \
+ dbc->lock.type = DB_PAGE_LOCK; \
+ ret = __lock_wakeup((dbc)->dbp->env, &(dbc)->lock_dbt); \
+ } else \
+ ret = 0; \
+} while (0)
+
+/* Handle wrap around. */
+#define QAM_INC_RECNO(recno) do { \
+ recno++; \
+} while (recno == RECNO_OOB)
+
+#define QAM_DEC_RECNO(recno) do { \
+ recno--; \
+} while (recno == RECNO_OOB)
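+
+/*
+ * For example (illustrative): incrementing a recno of UINT32_MAX wraps to
+ * RECNO_OOB (0), so the loop runs once more and leaves it at 1; likewise
+ * QAM_DEC_RECNO skips from 1 over 0 down to UINT32_MAX.
+ */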
+
+/*
+ * Log opcodes for the mvptr routine.
+ */
+#define QAM_SETFIRST 0x01
+#define QAM_SETCUR 0x02
+#define QAM_TRUNCATE 0x04
+
+typedef enum {
+ QAM_PROBE_GET,
+ QAM_PROBE_PUT,
+ QAM_PROBE_DIRTY,
+ QAM_PROBE_MPF
+} qam_probe_mode;
+
+/*
+ * Ops for __qam_nameop.
+ */
+typedef enum {
+ QAM_NAME_DISCARD,
+ QAM_NAME_RENAME,
+ QAM_NAME_REMOVE
+} qam_name_op;
+
+#define __qam_fget(dbc, pgnoaddr, flags, addrp) \
+ __qam_fprobe(dbc, *pgnoaddr, \
+ addrp, QAM_PROBE_GET, DB_PRIORITY_UNCHANGED, flags)
+
+#define __qam_fput(dbc, pgno, addrp, priority) \
+ __qam_fprobe(dbc, pgno, addrp, QAM_PROBE_PUT, priority, 0)
+
+#define __qam_dirty(dbc, pgno, pagep, priority) \
+ __qam_fprobe(dbc, pgno, pagep, QAM_PROBE_DIRTY, priority, 0)
+
+#if defined(__cplusplus)
+}
+#endif
+
+#include "dbinc_auto/qam_auto.h"
+#include "dbinc_auto/qam_ext.h"
+#endif /* !_DB_QAM_H_ */
diff --git a/src/dbinc/queue.h b/src/dbinc/queue.h
new file mode 100644
index 00000000..5a62741a
--- /dev/null
+++ b/src/dbinc/queue.h
@@ -0,0 +1,570 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)queue.h 8.5 (Berkeley) 8/20/94
+ * $FreeBSD: src/sys/sys/queue.h,v 1.54 2002/08/05 05:18:43 alfred Exp $
+ */
+
+#ifndef _DB_QUEUE_H_
+#define _DB_QUEUE_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * This file defines four types of data structures: singly-linked lists,
+ * singly-linked tail queues, lists and tail queues.
+ *
+ * A singly-linked list is headed by a single forward pointer. The elements
+ * are singly linked for minimum space and pointer manipulation overhead at
+ * the expense of O(n) removal for arbitrary elements. New elements can be
+ * added to the list after an existing element or at the head of the list.
+ * Elements being removed from the head of the list should use the explicit
+ * macro for this purpose for optimum efficiency. A singly-linked list may
+ * only be traversed in the forward direction. Singly-linked lists are ideal
+ * for applications with large datasets and few or no removals or for
+ * implementing a LIFO queue.
+ *
+ * A singly-linked tail queue is headed by a pair of pointers, one to the
+ * head of the list and the other to the tail of the list. The elements are
+ * singly linked for minimum space and pointer manipulation overhead at the
+ * expense of O(n) removal for arbitrary elements. New elements can be added
+ * to the list after an existing element, at the head of the list, or at the
+ * end of the list. Elements being removed from the head of the tail queue
+ * should use the explicit macro for this purpose for optimum efficiency.
+ * A singly-linked tail queue may only be traversed in the forward direction.
+ * Singly-linked tail queues are ideal for applications with large datasets
+ * and few or no removals or for implementing a FIFO queue.
+ *
+ * A list is headed by a single forward pointer (or an array of forward
+ * pointers for a hash table header). The elements are doubly linked
+ * so that an arbitrary element can be removed without a need to
+ * traverse the list. New elements can be added to the list before
+ * or after an existing element or at the head of the list. A list
+ * may only be traversed in the forward direction.
+ *
+ * A tail queue is headed by a pair of pointers, one to the head of the
+ * list and the other to the tail of the list. The elements are doubly
+ * linked so that an arbitrary element can be removed without a need to
+ * traverse the list. New elements can be added to the list before or
+ * after an existing element, at the head of the list, or at the end of
+ * the list. A tail queue may be traversed in either direction.
+ *
+ * For details on the use of these macros, see the queue(3) manual page.
+ *
+ *
+ * SLIST LIST STAILQ TAILQ
+ * _HEAD + + + +
+ * _HEAD_INITIALIZER + + + +
+ * _ENTRY + + + +
+ * _INIT + + + +
+ * _EMPTY + + + +
+ * _FIRST + + + +
+ * _NEXT + + + +
+ * _PREV - - - +
+ * _LAST - - + +
+ * _FOREACH + + + +
+ * _FOREACH_REVERSE - - - +
+ * _INSERT_HEAD + + + +
+ * _INSERT_BEFORE - + - +
+ * _INSERT_AFTER + + + +
+ * _INSERT_TAIL - - + +
+ * _CONCAT - - + +
+ * _REMOVE_HEAD + - + -
+ * _REMOVE + + + +
+ *
+ */
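+
+/*
+ * Minimal usage sketch (illustrative only, not part of this header):
+ *
+ *	struct entry {
+ *		int value;
+ *		TAILQ_ENTRY(entry) links;
+ *	};
+ *	TAILQ_HEAD(entry_list, entry) head = TAILQ_HEAD_INITIALIZER(head);
+ *
+ *	struct entry *e;
+ *	if ((e = malloc(sizeof(*e))) != NULL) {
+ *		e->value = 42;
+ *		TAILQ_INSERT_TAIL(&head, e, links);
+ *	}
+ *	TAILQ_FOREACH(e, &head, links)
+ *		printf("%d\n", e->value);
+ */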
+
+/*
+ * XXX
+ * We #undef all of the macros because there are incompatible versions of this
+ * file and these macros on various systems. What makes the problem worse is
+ * they are included and/or defined by system include files which we may have
+ * already loaded into Berkeley DB before getting here. For example, FreeBSD's
+ * <rpc/rpc.h> includes its system <sys/queue.h>, and VxWorks UnixLib.h defines
+ * several of the LIST_XXX macros. Visual C.NET 7.0 also defines some of these
+ * same macros in Vc7\PlatformSDK\Include\WinNT.h. Make sure we use ours.
+ */
+#undef LIST_EMPTY
+#undef LIST_ENTRY
+#undef LIST_FIRST
+#undef LIST_FOREACH
+#undef LIST_HEAD
+#undef LIST_HEAD_INITIALIZER
+#undef LIST_INIT
+#undef LIST_INSERT_AFTER
+#undef LIST_INSERT_BEFORE
+#undef LIST_INSERT_HEAD
+#undef LIST_NEXT
+#undef LIST_REMOVE
+#undef QMD_TRACE_ELEM
+#undef QMD_TRACE_HEAD
+#undef QUEUE_MACRO_DEBUG
+#undef SLIST_EMPTY
+#undef SLIST_ENTRY
+#undef SLIST_FIRST
+#undef SLIST_FOREACH
+#undef SLIST_FOREACH_PREVPTR
+#undef SLIST_HEAD
+#undef SLIST_HEAD_INITIALIZER
+#undef SLIST_INIT
+#undef SLIST_INSERT_AFTER
+#undef SLIST_INSERT_HEAD
+#undef SLIST_NEXT
+#undef SLIST_REMOVE
+#undef SLIST_REMOVE_HEAD
+#undef STAILQ_CONCAT
+#undef STAILQ_EMPTY
+#undef STAILQ_ENTRY
+#undef STAILQ_FIRST
+#undef STAILQ_FOREACH
+#undef STAILQ_HEAD
+#undef STAILQ_HEAD_INITIALIZER
+#undef STAILQ_INIT
+#undef STAILQ_INSERT_AFTER
+#undef STAILQ_INSERT_HEAD
+#undef STAILQ_INSERT_TAIL
+#undef STAILQ_LAST
+#undef STAILQ_NEXT
+#undef STAILQ_REMOVE
+#undef STAILQ_REMOVE_HEAD
+#undef STAILQ_REMOVE_HEAD_UNTIL
+#undef TAILQ_CONCAT
+#undef TAILQ_EMPTY
+#undef TAILQ_ENTRY
+#undef TAILQ_FIRST
+#undef TAILQ_FOREACH
+#undef TAILQ_FOREACH_REVERSE
+#undef TAILQ_HEAD
+#undef TAILQ_HEAD_INITIALIZER
+#undef TAILQ_INIT
+#undef TAILQ_INSERT_AFTER
+#undef TAILQ_INSERT_BEFORE
+#undef TAILQ_INSERT_HEAD
+#undef TAILQ_INSERT_TAIL
+#undef TAILQ_LAST
+#undef TAILQ_NEXT
+#undef TAILQ_PREV
+#undef TAILQ_REMOVE
+#undef TRACEBUF
+#undef TRASHIT
+
+#define QUEUE_MACRO_DEBUG 0
+#if QUEUE_MACRO_DEBUG
+/* Store the last 2 places the queue element or head was altered */
+struct qm_trace {
+ char * lastfile;
+ int lastline;
+ char * prevfile;
+ int prevline;
+};
+
+#define TRACEBUF struct qm_trace trace;
+#define TRASHIT(x) do {(x) = (void *)-1;} while (0)
+
+#define QMD_TRACE_HEAD(head) do { \
+ (head)->trace.prevline = (head)->trace.lastline; \
+ (head)->trace.prevfile = (head)->trace.lastfile; \
+ (head)->trace.lastline = __LINE__; \
+ (head)->trace.lastfile = __FILE__; \
+} while (0)
+
+#define QMD_TRACE_ELEM(elem) do { \
+ (elem)->trace.prevline = (elem)->trace.lastline; \
+ (elem)->trace.prevfile = (elem)->trace.lastfile; \
+ (elem)->trace.lastline = __LINE__; \
+ (elem)->trace.lastfile = __FILE__; \
+} while (0)
+
+#else
+#define QMD_TRACE_ELEM(elem)
+#define QMD_TRACE_HEAD(head)
+#define TRACEBUF
+#define TRASHIT(x)
+#endif /* QUEUE_MACRO_DEBUG */
+
+/*
+ * Singly-linked List declarations.
+ */
+#define SLIST_HEAD(name, type) \
+struct name { \
+ struct type *slh_first; /* first element */ \
+}
+
+#define SLIST_HEAD_INITIALIZER(head) \
+ { NULL }
+
+#define SLIST_ENTRY(type) \
+struct { \
+ struct type *sle_next; /* next element */ \
+}
+
+/*
+ * Singly-linked List functions.
+ */
+#define SLIST_EMPTY(head) ((head)->slh_first == NULL)
+
+#define SLIST_FIRST(head) ((head)->slh_first)
+
+#define SLIST_FOREACH(var, head, field) \
+ for ((var) = SLIST_FIRST((head)); \
+ (var); \
+ (var) = SLIST_NEXT((var), field))
+
+#define SLIST_FOREACH_PREVPTR(var, varp, head, field) \
+ for ((varp) = &SLIST_FIRST((head)); \
+ ((var) = *(varp)) != NULL; \
+ (varp) = &SLIST_NEXT((var), field))
+
+#define SLIST_INIT(head) do { \
+ SLIST_FIRST((head)) = NULL; \
+} while (0)
+
+#define SLIST_INSERT_AFTER(slistelm, elm, field) do { \
+ SLIST_NEXT((elm), field) = SLIST_NEXT((slistelm), field); \
+ SLIST_NEXT((slistelm), field) = (elm); \
+} while (0)
+
+#define SLIST_INSERT_HEAD(head, elm, field) do { \
+ SLIST_NEXT((elm), field) = SLIST_FIRST((head)); \
+ SLIST_FIRST((head)) = (elm); \
+} while (0)
+
+#define SLIST_NEXT(elm, field) ((elm)->field.sle_next)
+
+#define SLIST_REMOVE(head, elm, type, field) do { \
+ if (SLIST_FIRST((head)) == (elm)) { \
+ SLIST_REMOVE_HEAD((head), field); \
+ } \
+ else { \
+ struct type *curelm = SLIST_FIRST((head)); \
+ while (curelm != NULL && \
+ SLIST_NEXT(curelm, field) != (elm)) \
+ curelm = SLIST_NEXT(curelm, field); \
+ if (curelm != NULL) \
+ SLIST_NEXT(curelm, field) = \
+ SLIST_NEXT(SLIST_NEXT(curelm, field), field);\
+ } \
+} while (0)
+
+#define SLIST_REMOVE_HEAD(head, field) do { \
+ SLIST_FIRST((head)) = SLIST_NEXT(SLIST_FIRST((head)), field); \
+} while (0)
+
+/*
+ * Singly-linked Tail queue declarations.
+ */
+#define STAILQ_HEAD(name, type) \
+struct name { \
+ struct type *stqh_first;/* first element */ \
+ struct type **stqh_last;/* addr of last next element */ \
+}
+
+#define STAILQ_HEAD_INITIALIZER(head) \
+ { NULL, &(head).stqh_first }
+
+#define STAILQ_ENTRY(type) \
+struct { \
+ struct type *stqe_next; /* next element */ \
+}
+
+/*
+ * Singly-linked Tail queue functions.
+ */
+#define STAILQ_CONCAT(head1, head2) do { \
+ if (!STAILQ_EMPTY((head2))) { \
+ *(head1)->stqh_last = (head2)->stqh_first; \
+ (head1)->stqh_last = (head2)->stqh_last; \
+ STAILQ_INIT((head2)); \
+ } \
+} while (0)
+
+#define STAILQ_EMPTY(head) ((head)->stqh_first == NULL)
+
+#define STAILQ_FIRST(head) ((head)->stqh_first)
+
+#define STAILQ_FOREACH(var, head, field) \
+ for ((var) = STAILQ_FIRST((head)); \
+ (var); \
+ (var) = STAILQ_NEXT((var), field))
+
+#define STAILQ_INIT(head) do { \
+ STAILQ_FIRST((head)) = NULL; \
+ (head)->stqh_last = &STAILQ_FIRST((head)); \
+} while (0)
+
+#define STAILQ_INSERT_AFTER(head, tqelm, elm, field) do { \
+ if ((STAILQ_NEXT((elm), field) = STAILQ_NEXT((tqelm), field)) == NULL)\
+ (head)->stqh_last = &STAILQ_NEXT((elm), field); \
+ STAILQ_NEXT((tqelm), field) = (elm); \
+} while (0)
+
+#define STAILQ_INSERT_HEAD(head, elm, field) do { \
+ if ((STAILQ_NEXT((elm), field) = STAILQ_FIRST((head))) == NULL) \
+ (head)->stqh_last = &STAILQ_NEXT((elm), field); \
+ STAILQ_FIRST((head)) = (elm); \
+} while (0)
+
+#define STAILQ_INSERT_TAIL(head, elm, field) do { \
+ STAILQ_NEXT((elm), field) = NULL; \
+ *(head)->stqh_last = (elm); \
+ (head)->stqh_last = &STAILQ_NEXT((elm), field); \
+} while (0)
+
+#define STAILQ_LAST(head, type, field) \
+ (STAILQ_EMPTY((head)) ? \
+ NULL : \
+ ((struct type *) \
+ ((char *)((head)->stqh_last) - __offsetof(struct type, field))))
+
+#define STAILQ_NEXT(elm, field) ((elm)->field.stqe_next)
+
+#define STAILQ_REMOVE(head, elm, type, field) do { \
+ if (STAILQ_FIRST((head)) == (elm)) { \
+ STAILQ_REMOVE_HEAD((head), field); \
+ } \
+ else { \
+ struct type *curelm = STAILQ_FIRST((head)); \
+ while (STAILQ_NEXT(curelm, field) != (elm)) \
+ curelm = STAILQ_NEXT(curelm, field); \
+ if ((STAILQ_NEXT(curelm, field) = \
+ STAILQ_NEXT(STAILQ_NEXT(curelm, field), field)) == NULL)\
+ (head)->stqh_last = &STAILQ_NEXT((curelm), field);\
+ } \
+} while (0)
+
+#define STAILQ_REMOVE_HEAD(head, field) do { \
+ if ((STAILQ_FIRST((head)) = \
+ STAILQ_NEXT(STAILQ_FIRST((head)), field)) == NULL) \
+ (head)->stqh_last = &STAILQ_FIRST((head)); \
+} while (0)
+
+#define STAILQ_REMOVE_HEAD_UNTIL(head, elm, field) do { \
+ if ((STAILQ_FIRST((head)) = STAILQ_NEXT((elm), field)) == NULL) \
+ (head)->stqh_last = &STAILQ_FIRST((head)); \
+} while (0)
+
+/*
+ * List declarations.
+ */
+#define LIST_HEAD(name, type) \
+struct name { \
+ struct type *lh_first; /* first element */ \
+}
+
+#define LIST_HEAD_INITIALIZER(head) \
+ { NULL }
+
+#define LIST_ENTRY(type) \
+struct { \
+ struct type *le_next; /* next element */ \
+ struct type **le_prev; /* address of previous next element */ \
+}
+
+/*
+ * List functions.
+ */
+
+#define LIST_EMPTY(head) ((head)->lh_first == NULL)
+
+#define LIST_FIRST(head) ((head)->lh_first)
+
+#define LIST_FOREACH(var, head, field) \
+ for ((var) = LIST_FIRST((head)); \
+ (var); \
+ (var) = LIST_NEXT((var), field))
+
+#define LIST_INIT(head) do { \
+ LIST_FIRST((head)) = NULL; \
+} while (0)
+
+#define LIST_INSERT_AFTER(listelm, elm, field) do { \
+ if ((LIST_NEXT((elm), field) = LIST_NEXT((listelm), field)) != NULL)\
+ LIST_NEXT((listelm), field)->field.le_prev = \
+ &LIST_NEXT((elm), field); \
+ LIST_NEXT((listelm), field) = (elm); \
+ (elm)->field.le_prev = &LIST_NEXT((listelm), field); \
+} while (0)
+
+#define LIST_INSERT_BEFORE(listelm, elm, field) do { \
+ (elm)->field.le_prev = (listelm)->field.le_prev; \
+ LIST_NEXT((elm), field) = (listelm); \
+ *(listelm)->field.le_prev = (elm); \
+ (listelm)->field.le_prev = &LIST_NEXT((elm), field); \
+} while (0)
+
+#define LIST_INSERT_HEAD(head, elm, field) do { \
+ if ((LIST_NEXT((elm), field) = LIST_FIRST((head))) != NULL) \
+ LIST_FIRST((head))->field.le_prev = &LIST_NEXT((elm), field);\
+ LIST_FIRST((head)) = (elm); \
+ (elm)->field.le_prev = &LIST_FIRST((head)); \
+} while (0)
+
+#define LIST_NEXT(elm, field) ((elm)->field.le_next)
+
+#define LIST_REMOVE(elm, field) do { \
+ if (LIST_NEXT((elm), field) != NULL) \
+ LIST_NEXT((elm), field)->field.le_prev = \
+ (elm)->field.le_prev; \
+ *(elm)->field.le_prev = LIST_NEXT((elm), field); \
+} while (0)
+
+/*
+ * Tail queue declarations.
+ */
+#define TAILQ_HEAD(name, type) \
+struct name { \
+ struct type *tqh_first; /* first element */ \
+ struct type **tqh_last; /* addr of last next element */ \
+ TRACEBUF \
+}
+
+#define TAILQ_HEAD_INITIALIZER(head) \
+ { NULL, &(head).tqh_first }
+
+#define TAILQ_ENTRY(type) \
+struct { \
+ struct type *tqe_next; /* next element */ \
+ struct type **tqe_prev; /* address of previous next element */ \
+ TRACEBUF \
+}
+
+/*
+ * Tail queue functions.
+ */
+#define TAILQ_CONCAT(head1, head2, field) do { \
+ if (!TAILQ_EMPTY(head2)) { \
+ *(head1)->tqh_last = (head2)->tqh_first; \
+ (head2)->tqh_first->field.tqe_prev = (head1)->tqh_last; \
+ (head1)->tqh_last = (head2)->tqh_last; \
+ TAILQ_INIT((head2)); \
+		QMD_TRACE_HEAD(head1); \
+ QMD_TRACE_HEAD(head2); \
+ } \
+} while (0)
+
+#define TAILQ_EMPTY(head) ((head)->tqh_first == NULL)
+
+#define TAILQ_FIRST(head) ((head)->tqh_first)
+
+#define TAILQ_FOREACH(var, head, field) \
+ for ((var) = TAILQ_FIRST((head)); \
+ (var); \
+ (var) = TAILQ_NEXT((var), field))
+
+#define TAILQ_FOREACH_REVERSE(var, head, headname, field) \
+ for ((var) = TAILQ_LAST((head), headname); \
+ (var); \
+ (var) = TAILQ_PREV((var), headname, field))
+
+#define TAILQ_INIT(head) do { \
+ TAILQ_FIRST((head)) = NULL; \
+ (head)->tqh_last = &TAILQ_FIRST((head)); \
+ QMD_TRACE_HEAD(head); \
+} while (0)
+
+#define TAILQ_INSERT_AFTER(head, listelm, elm, field) do { \
+ if ((TAILQ_NEXT((elm), field) = TAILQ_NEXT((listelm), field)) != NULL)\
+ TAILQ_NEXT((elm), field)->field.tqe_prev = \
+ &TAILQ_NEXT((elm), field); \
+ else { \
+ (head)->tqh_last = &TAILQ_NEXT((elm), field); \
+ QMD_TRACE_HEAD(head); \
+ } \
+ TAILQ_NEXT((listelm), field) = (elm); \
+ (elm)->field.tqe_prev = &TAILQ_NEXT((listelm), field); \
+ QMD_TRACE_ELEM(&(elm)->field); \
+ QMD_TRACE_ELEM(&listelm->field); \
+} while (0)
+
+#define TAILQ_INSERT_BEFORE(listelm, elm, field) do { \
+ (elm)->field.tqe_prev = (listelm)->field.tqe_prev; \
+ TAILQ_NEXT((elm), field) = (listelm); \
+ *(listelm)->field.tqe_prev = (elm); \
+ (listelm)->field.tqe_prev = &TAILQ_NEXT((elm), field); \
+ QMD_TRACE_ELEM(&(elm)->field); \
+ QMD_TRACE_ELEM(&listelm->field); \
+} while (0)
+
+#define TAILQ_INSERT_HEAD(head, elm, field) do { \
+ if ((TAILQ_NEXT((elm), field) = TAILQ_FIRST((head))) != NULL) \
+ TAILQ_FIRST((head))->field.tqe_prev = \
+ &TAILQ_NEXT((elm), field); \
+ else \
+ (head)->tqh_last = &TAILQ_NEXT((elm), field); \
+ TAILQ_FIRST((head)) = (elm); \
+ (elm)->field.tqe_prev = &TAILQ_FIRST((head)); \
+ QMD_TRACE_HEAD(head); \
+ QMD_TRACE_ELEM(&(elm)->field); \
+} while (0)
+
+#define TAILQ_INSERT_TAIL(head, elm, field) do { \
+ TAILQ_NEXT((elm), field) = NULL; \
+ (elm)->field.tqe_prev = (head)->tqh_last; \
+ *(head)->tqh_last = (elm); \
+ (head)->tqh_last = &TAILQ_NEXT((elm), field); \
+ QMD_TRACE_HEAD(head); \
+ QMD_TRACE_ELEM(&(elm)->field); \
+} while (0)
+
+#define TAILQ_LAST(head, headname) \
+ (*(((struct headname *)((head)->tqh_last))->tqh_last))
+
+#define TAILQ_NEXT(elm, field) ((elm)->field.tqe_next)
+
+#define TAILQ_PREV(elm, headname, field) \
+ (*(((struct headname *)((elm)->field.tqe_prev))->tqh_last))
+
+#define TAILQ_REMOVE(head, elm, field) do { \
+ if ((TAILQ_NEXT((elm), field)) != NULL) \
+ TAILQ_NEXT((elm), field)->field.tqe_prev = \
+ (elm)->field.tqe_prev; \
+ else { \
+ (head)->tqh_last = (elm)->field.tqe_prev; \
+ QMD_TRACE_HEAD(head); \
+ } \
+ *(elm)->field.tqe_prev = TAILQ_NEXT((elm), field); \
+ TRASHIT((elm)->field.tqe_next); \
+ TRASHIT((elm)->field.tqe_prev); \
+ QMD_TRACE_ELEM(&(elm)->field); \
+} while (0)
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_DB_QUEUE_H_ */
diff --git a/src/dbinc/region.h b/src/dbinc/region.h
new file mode 100644
index 00000000..ac0ff16f
--- /dev/null
+++ b/src/dbinc/region.h
@@ -0,0 +1,329 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_REGION_H_
+#define _DB_REGION_H_
+
+/*
+ * The DB environment consists of some number of "regions", which are described
+ * by the following four structures:
+ *
+ * REGENV -- shared information about the environment
+ * REGENV_REF -- file describing system memory version of REGENV
+ * REGION -- shared information about a single region
+ * REGINFO -- per-process information about a REGION
+ *
+ * There are three types of memory that hold regions:
+ * per-process heap (malloc)
+ * file mapped into memory (mmap, MapViewOfFile)
+ * system memory (shmget, CreateFileMapping)
+ *
+ * By default, regions are created in filesystem-backed shared memory. They
+ * can also be created in system shared memory (DB_SYSTEM_MEM), or, if private
+ * to a process, in heap memory (DB_PRIVATE).
+ *
+ * Regions in the filesystem are named "__db.001", "__db.002" and so on. If
+ * we're not using a private environment allocated in heap, "__db.001" will
+ * always exist, as we use it to synchronize on the regions, whether they are
+ * in filesystem-backed memory or system memory.
+ *
+ * The file "__db.001" contains a REGENV structure pointing to an
+ * array of REGION structures. Each REGION structure describes an
+ * underlying chunk of shared memory.
+ *
+ * __db.001
+ * +---------+
+ * |REGENV |
+ * +---------+
+ * |
+ * \/
+ * +---------+ +----------+
+ * |REGION |-> | __db.001 |
+ * | | +----------+
+ * +---------+ +----------+
+ * |REGION |-> | __db.002 |
+ * | | +----------+
+ * +---------+ +----------+
+ * |REGION |-> | __db.003 |
+ * | | +----------+
+ * +---------+ +----------+
+ * |REGION |-> | __db.004 |
+ * | | +----------+
+ * +---------+
+ *
+ * The tricky part about manipulating the regions is creating or joining the
+ * database environment. We have to be sure only a single thread of control
+ * creates and/or recovers a database environment. All other threads should
+ * then join without seeing inconsistent data.
+ *
+ * We do this in two parts: first, we use the underlying O_EXCL flag to the
+ * open system call to serialize creation of the __db.001 file. The thread
+ * of control creating that file then proceeds to create the remaining
+ * regions in the environment, including the mutex region. Once the mutex
+ * region has been created, the creating thread of control fills in the
+ * __db.001 file's magic number. Other threads of control (the ones that
+ * didn't create the __db.001 file), wait on the initialization of the
+ * __db.001 file's magic number. After it has been initialized, all threads
+ * of control can proceed, using normal shared mutex locking procedures for
+ * exclusion.
+ *
+ * REGIONs are not moved or removed during the life of the environment, and
+ * so processes can have long-lived references to them.
+ *
+ * One of the REGION structures describes the environment region itself.
+ *
+ * The REGION array is not locked in any way. It's an array so we don't have
+ * to manipulate data structures after a crash -- on some systems, we have to
+ * join and clean up the mutex region after application failure. Using an
+ * array means we don't have to worry about broken links or other nastiness
+ * after the failure.
+ *
+ * All requests to create or join a region return a REGINFO structure, which
+ * is held by the caller and used to open and subsequently close the reference
+ * to the region. The REGINFO structure contains the per-process information
+ * that we need to access the region.
+ *
+ * The one remaining complication. If the regions (including the environment
+ * region) live in system memory, and the system memory isn't "named" somehow
+ * in the filesystem name space, we need some way of finding it. We do this
+ * by writing the REGENV_REF structure into the "__db.001" file. When we find
+ * a __db.001 file that is too small to be a real, on-disk environment, we use
+ * the information it contains to redirect to the real "__db.001" file/memory.
+ * This currently only happens when the REGENV file is in shared system memory.
+ *
+ * Although DB does not currently grow regions when they run out of memory, it
+ * would be possible to do so. To grow a region, allocate a new region of the
+ * appropriate size, then copy the old region over it and insert the additional
+ * memory into the already existing shalloc arena. Region users must reset
+ * their base addresses and any local pointers into the memory, of course.
+ * This failed in historic versions of DB because the region mutexes lived in
+ * the mapped memory, and when it was unmapped and remapped (or copied),
+ * threads could lose track of it. Also, some systems didn't support mutex
+ * copying, e.g., from OSF1 V4.0:
+ *
+ * The address of an msemaphore structure may be significant. If the
+ * msemaphore structure contains any value copied from an msemaphore
+ * structure at a different address, the result is undefined.
+ *
+ * All mutexes are now maintained in a separate region which is never unmapped,
+ * so growing regions should be possible.
+ */
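+
+/*
+ * A minimal sketch of the create-or-join step described above (added for
+ * illustration only; the real logic handles many more failure cases):
+ *
+ *	fd = open("__db.001", O_RDWR | O_CREAT | O_EXCL, mode);
+ *	if (fd != -1) {
+ *		... create the remaining regions, then set the magic ...
+ *	} else if (errno == EEXIST) {
+ *		... open the file, wait for a non-zero magic, then join ...
+ *	}
+ */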
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#define DB_REGION_PREFIX "__db" /* DB file name prefix. */
+#define DB_REGION_FMT "__db.%03d" /* Region file name format. */
+#define DB_REGION_ENV "__db.001" /* Primary environment name. */
+#define IS_DB_FILE(name) (strncmp(name, DB_REGION_PREFIX, \
+ sizeof(DB_REGION_PREFIX) - 1) == 0)
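+/*
+ * Added note: region file names are generated from DB_REGION_FMT, e.g.
+ * snprintf(buf, sizeof(buf), DB_REGION_FMT, 2) yields "__db.002".
+ */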
+
+#define INVALID_REGION_ID 0 /* Out-of-band region ID. */
+#define REGION_ID_ENV 1 /* Primary environment ID. */
+
+typedef enum {
+ INVALID_REGION_TYPE=0, /* Region type. */
+ REGION_TYPE_ENV,
+ REGION_TYPE_LOCK,
+ REGION_TYPE_LOG,
+ REGION_TYPE_MPOOL,
+ REGION_TYPE_MUTEX,
+ REGION_TYPE_TXN } reg_type_t;
+
+#define INVALID_REGION_SEGID -1 /* Segment IDs are either shmget(2) or
+ * Win16 segment identifiers. They are
+ * both stored in a "long", and we need
+ * an out-of-band value.
+ */
+/*
+ * Nothing can live at region offset 0, because, in all cases, that's where
+ * we store *something*. Lots of code needs an out-of-band value for region
+ * offsets, so we use 0.
+ */
+#define INVALID_ROFF 0
+
+/* Reference describing system memory version of REGENV. */
+typedef struct __db_reg_env_ref {
+ roff_t size; /* Region size. */
+ roff_t max; /* Region max in bytes. */
+ long segid; /* UNIX shmget ID, VxWorks ID. */
+} REGENV_REF;
+
+/* Per-environment region information. */
+typedef struct __db_reg_env { /* SHARED */
+ /*
+ * !!!
+ * The magic, panic, version, envid and signature fields of the region
+ * are fixed in size, the timestamp field is the first field which is
+ * variable length. These fields must never change in order, to
+ * guarantee we can always read them, no matter what release we have.
+ *
+ * !!!
+ * The magic and panic fields are NOT protected by any mutex, and for
+ * this reason cannot be anything more complicated than zero/non-zero.
+ */
+ u_int32_t magic; /* Valid region magic number. */
+ u_int32_t panic; /* Environment is dead. */
+
+ u_int32_t majver; /* Major DB version number. */
+ u_int32_t minver; /* Minor DB version number. */
+ u_int32_t patchver; /* Patch DB version number. */
+
+ u_int32_t envid; /* Unique environment ID. */
+
+ u_int32_t signature; /* Structure signatures. */
+
+ time_t timestamp; /* Creation time. */
+
+ /*
+ * Flags saved in the init_flags field of the environment, representing
+ * flags to DB_ENV->set_flags and DB_ENV->open that need to be set.
+ */
+ u_int32_t init_flags;
+#define DB_INITENV_CDB 0x0001 /* DB_INIT_CDB */
+#define DB_INITENV_CDB_ALLDB 0x0002 /* DB_INIT_CDB_ALLDB */
+#define DB_INITENV_LOCK 0x0004 /* DB_INIT_LOCK */
+#define DB_INITENV_LOG 0x0008 /* DB_INIT_LOG */
+#define DB_INITENV_MPOOL 0x0010 /* DB_INIT_MPOOL */
+#define DB_INITENV_REP 0x0020 /* DB_INIT_REP */
+#define DB_INITENV_TXN 0x0040 /* DB_INIT_TXN */
+
+ /*
+ * The mtx_regenv mutex protects the environment reference count and
+ * memory allocation from the primary shared region (the crypto, thread
+ * control block and replication implementations allocate memory from
+ * the primary shared region).
+ *
+ * The rest of the fields are initialized at creation time, and don't
+ * need mutex protection. The flags, op_timestamp and rep_timestamp
+ * fields are used by replication only and are protected by the
+ * replication mutex. The rep_timestamp is not protected when it
+ * is used in recovery, as recovery is already single-threaded.
+ */
+ db_mutex_t mtx_regenv; /* Refcnt, region allocation mutex. */
+ u_int32_t refcnt; /* References to the environment. */
+
+ u_int32_t region_cnt; /* Number of REGIONs. */
+ roff_t region_off; /* Offset of region array */
+ roff_t lt_primary; /* Lock primary. */
+ roff_t lg_primary; /* Log primary. */
+ roff_t tx_primary; /* Txn primary. */
+
+ roff_t cipher_off; /* Offset of cipher area */
+
+ roff_t thread_off; /* Offset of the thread area. */
+
+ roff_t rep_off; /* Offset of the replication area. */
+#define DB_REGENV_REPLOCKED 0x0001 /* Env locked for rep backup. */
+ u_int32_t flags; /* Shared environment flags. */
+#define DB_REGENV_TIMEOUT 30 /* Backup timeout. */
+ time_t op_timestamp; /* Timestamp for operations. */
+ time_t rep_timestamp; /* Timestamp for rep db handles. */
+ u_int32_t reg_panic; /* DB_REGISTER triggered panic */
+ uintmax_t unused; /* The ALLOC_LAYOUT structure follows
+ * the REGENV structure in memory and
+ * contains uintmax_t fields. Force
+ * proper alignment of that structure.
+ */
+} REGENV;
+
+/* Per-region shared region information. */
+typedef struct __db_region { /* SHARED */
+ roff_t size; /* Region size in bytes. */
+ roff_t max; /* Region max in bytes. */
+ long segid; /* UNIX shmget(2), Win16 segment ID. */
+
+ u_int32_t id; /* Region id. */
+ reg_type_t type; /* Region type. */
+
+ roff_t primary; /* Primary data structure offset. */
+ roff_t alloc; /* Region allocation size in bytes. */
+} REGION;
+
+/*
+ * Per-process/per-attachment information about a single region.
+ */
+
+/*
+ * Structure used for tracking allocations in DB_PRIVATE regions.
+ */
+struct __db_region_mem_t; typedef struct __db_region_mem_t REGION_MEM;
+struct __db_region_mem_t {
+ REGION_MEM *next;
+};
+
+struct __db_reginfo_t { /* __env_region_attach IN parameters. */
+ ENV *env; /* Enclosing environment. */
+ reg_type_t type; /* Region type. */
+ u_int32_t id; /* Region id. */
+
+ /* __env_region_attach OUT parameters. */
+ REGION *rp; /* Shared region. */
+
+ char *name; /* Region file name. */
+ DB_FH *fhp; /* Region file handle */
+
+ void *addr; /* Region address. */
+ void *head; /* Head of the allocation struct. */
+ void *primary; /* Primary data structure address. */
+
+ /* Private Memory Tracking. */
+ size_t max_alloc; /* Maximum bytes allocated. */
+ size_t allocated; /* Bytes allocated. */
+ REGION_MEM *mem; /* List of memory to free */
+
+ db_mutex_t mtx_alloc; /* Mutex for region allocation. */
+
+#ifdef DB_WIN32
+ HANDLE wnt_handle; /* Win/NT HANDLE. */
+#endif
+
+#define REGION_CREATE 0x01 /* Caller created region. */
+#define REGION_CREATE_OK 0x02 /* Caller willing to create region. */
+#define REGION_JOIN_OK 0x04 /* Caller is looking for a match. */
+#define REGION_SHARED 0x08 /* Region is shared. */
+#define REGION_TRACKED 0x10 /* Region private memory is tracked. */
+ u_int32_t flags;
+};
+
+/*
+ * R_ADDR Return a per-process address for a shared region offset.
+ * R_OFFSET Return a shared region offset for a per-process address.
+ */
+#define R_ADDR(reginfop, offset) \
+ (F_ISSET((reginfop)->env, ENV_PRIVATE) ? \
+ ROFF_TO_P(offset) : \
+ (void *)((u_int8_t *)((reginfop)->addr) + (offset)))
+#define R_OFFSET(reginfop, p) \
+ (F_ISSET((reginfop)->env, ENV_PRIVATE) ? \
+ P_TO_ROFF(p) : \
+ (roff_t)((u_int8_t *)(p) - (u_int8_t *)(reginfop)->addr))
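+
+/*
+ * Illustrative round trip (added note): for any valid offset in a region,
+ * R_OFFSET inverts R_ADDR, e.g.
+ *
+ *	void *p;
+ *	p = R_ADDR(infop, roff);
+ *	DB_ASSERT(env, R_OFFSET(infop, p) == roff);
+ *
+ * where "infop", "roff" and "env" stand for a caller's REGINFO pointer,
+ * region offset and ENV handle.
+ */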
+
+/*
+ * PANIC_ISSET, PANIC_CHECK:
+ * Check to see if the DB environment is dead.
+ */
+#define PANIC_ISSET(env) \
+ ((env) != NULL && (env)->reginfo != NULL && \
+ ((REGENV *)(env)->reginfo->primary)->panic != 0 && \
+ !F_ISSET((env)->dbenv, DB_ENV_NOPANIC))
+
+#define PANIC_CHECK(env) \
+ if (PANIC_ISSET(env)) \
+ return (__env_panic_msg(env));
+
+#define PANIC_CHECK_RET(env, ret) \
+ if (PANIC_ISSET(env)) \
+ ret = (__env_panic_msg(env));
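+
+/*
+ * Typical use (added note): an API entry point can begin with
+ *
+ *	PANIC_CHECK(env);
+ *
+ * so that calls against a dead environment return the panic error
+ * immediately instead of touching shared state.
+ */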
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_DB_REGION_H_ */
diff --git a/src/dbinc/rep.h b/src/dbinc/rep.h
new file mode 100644
index 00000000..75004239
--- /dev/null
+++ b/src/dbinc/rep.h
@@ -0,0 +1,1102 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_REP_H_
+#define _DB_REP_H_
+
+#include "dbinc_auto/rep_automsg.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * Names of client temp databases.
+ */
+#define REPFILEPREFIX "__db.rep"
+#define REPDBNAME "__db.rep.db"
+#define REPPAGENAME "__db.reppg.db"
+
+/*
+ * Name of replicated system database file, and LSN history subdatabase within
+ * it. If the INMEM config flag is set, we create the database in memory under
+ * the REPLSNHIST name (which is why that name also follows the __db naming
+ * convention).
+ */
+#define REPSYSDBNAME "__db.rep.system"
+#define REPLSNHIST "__db.lsn.history"
+#define REPMEMBERSHIP "__db.membership"
+#define REPSYSDBPGSZ 1024
+#define IS_REP_FILE(name) (strcmp(name, REPSYSDBNAME) == 0)
+
+/* Current version of commit token format, and LSN history database format. */
+#define REP_COMMIT_TOKEN_FMT_VERSION 1
+#define REP_LSN_HISTORY_FMT_VERSION 1
+
+/*
+ * Message types
+ */
+#define REP_INVALID 0 /* Invalid message type. */
+#define REP_ALIVE 1 /* I am alive message. */
+#define REP_ALIVE_REQ 2 /* Request for alive messages. */
+#define REP_ALL_REQ 3 /* Request all log records greater than LSN. */
+#define REP_BULK_LOG 4 /* Bulk transfer of log records. */
+#define REP_BULK_PAGE 5 /* Bulk transfer of pages. */
+#define REP_DUPMASTER 6 /* Duplicate master detected; propagate. */
+#define REP_FILE 7 /* Page of a database file. NOTUSED */
+#define REP_FILE_FAIL 8 /* File requested does not exist. */
+#define REP_FILE_REQ 9 /* Request for a database file. NOTUSED */
+#define REP_LEASE_GRANT 10 /* Client grants a lease to a master. */
+#define REP_LOG 11 /* Log record. */
+#define REP_LOG_MORE 12 /* There are more log records to request. */
+#define REP_LOG_REQ 13 /* Request for a log record. */
+#define REP_MASTER_REQ 14 /* Who is the master? */
+#define REP_NEWCLIENT 15 /* Announces the presence of a new client. */
+#define REP_NEWFILE 16 /* Announce a log file change. */
+#define REP_NEWMASTER 17 /* Announces who the master is. */
+#define REP_NEWSITE 18 /* Announces that a site has heard from a new
+ * site; like NEWCLIENT, but indirect. A
+ * NEWCLIENT message comes directly from the new
+ * client while a NEWSITE comes indirectly from
+ * someone who heard about a NEWSITE.
+ */
+#define REP_PAGE 19 /* Database page. */
+#define REP_PAGE_FAIL 20 /* Requested page does not exist. */
+#define REP_PAGE_MORE 21 /* There are more pages to request. */
+#define REP_PAGE_REQ 22 /* Request for a database page. */
+#define REP_REREQUEST 23 /* Force rerequest. */
+#define REP_START_SYNC 24 /* Tell client to begin syncing a ckp.*/
+#define REP_UPDATE 25 /* Environment hotcopy information. */
+#define REP_UPDATE_REQ 26 /* Request for hotcopy information. */
+#define REP_VERIFY 27 /* A log record for verification. */
+#define REP_VERIFY_FAIL 28 /* The client is outdated. */
+#define REP_VERIFY_REQ 29 /* Request for a log record to verify. */
+#define REP_VOTE1 30 /* Send out your information for an election. */
+#define REP_VOTE2 31 /* Send a "you are master" vote. */
+/*
+ * Maximum message number for conversion tables. Update this
+ * value as the largest message number above increases.
+ * It might make processing messages more straightforward if
+ * the *_MORE and BULK* messages were flags within the regular
+ * message type instead of separate message types themselves.
+ *
+ * !!!
+ * NOTE: When changing messages above, the two tables for upgrade support
+ * need adjusting. They are in rep_util.c.
+ */
+#define REP_MAX_MSG 31
+
+/*
+ * This is the list of client-to-client request messages.
+ * We use this to decide if we're doing client-to-client and
+ * might need to send a rerequest.
+ */
+#define REP_MSG_REQ(rectype) \
+ (rectype == REP_ALL_REQ || \
+ rectype == REP_LOG_REQ || \
+ rectype == REP_PAGE_REQ || \
+ rectype == REP_VERIFY_REQ)
+
+/*
+ * Note that the version information should be at the beginning of the
+ * structure, so that we can rearrange the rest of it while letting the
+ * version checks continue to work. DB_REPVERSION should be revved any time
+ * the rest of the structure changes or when the message numbers change.
+ *
+ * Also define the corresponding log versions that are tied to the
+ * replication/release versions. These are needed only in replication,
+ * which is why they're defined here. db_printlog takes notice as well.
+ */
+#define DB_LOGVERSION_42 8
+#define DB_LOGVERSION_43 10
+#define DB_LOGVERSION_44 11
+#define DB_LOGVERSION_45 12
+#define DB_LOGVERSION_46 13
+#define DB_LOGVERSION_47 14
+#define DB_LOGVERSION_48 15
+#define DB_LOGVERSION_48p2 16
+#define DB_LOGVERSION_50 17
+#define DB_LOGVERSION_51 17
+#define DB_LOGVERSION_52 18
+#define DB_LOGVERSION_53 19
+#define DB_LOGVERSION_MIN DB_LOGVERSION_44
+#define DB_REPVERSION_INVALID 0
+#define DB_REPVERSION_44 3
+#define DB_REPVERSION_45 3
+#define DB_REPVERSION_46 4
+#define DB_REPVERSION_47 5
+#define DB_REPVERSION_48 5
+#define DB_REPVERSION_50 5
+#define DB_REPVERSION_51 5
+#define DB_REPVERSION_52 6
+#define DB_REPVERSION_53 7
+#define DB_REPVERSION DB_REPVERSION_53
+#define DB_REPVERSION_MIN DB_REPVERSION_44
+
+/*
+ * RPRINT - Replication diagnostic output
+ * VPRINT - Replication verbose output (superset of RPRINT).
+ * REP_PRINT_MESSAGE
+ * Macros for verbose replication messages.
+ *
+ * Everything using RPRINT will go to the system diag file (if it
+ * is configured) and also to the user's verbose output if
+ * they have that verbose level configured.
+ * Messages using VPRINT do not ever go to the system diag file,
+ * but will go to the user's verbose output if configured.
+ *
+ * Use VPRINT for anything that might be printed on a standard,
+ * successful transaction. Use RPRINT for error paths, rep
+ * state changes, elections, etc.
+ */
+#define REP_DIAGNAME "__db.rep.diag%02d"
+#define REP_DIAGSIZE MEGABYTE
+#define RPRINT(env, x) do { \
+ if ((env)->dbenv->verbose != 0) \
+ (void)__rep_print_system x; \
+} while (0)
+#define VPRINT(env, x) do { \
+ if ((env)->dbenv->verbose != 0) \
+ (void)__rep_print x; \
+} while (0)
+#define REP_PRINT_MESSAGE(env, eid, rp, str, fl) do { \
+ if ((env)->dbenv->verbose != 0) \
+ __rep_print_message(env, eid, rp, str, fl); \
+} while (0)
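+
+/*
+ * Illustrative call (added note; the verbose category and message are
+ * examples only): the "x" argument is a complete, parenthesized argument
+ * list for the underlying print function, e.g.
+ *
+ *	VPRINT(env, (env, DB_VERB_REP_MSGS, "msg %lu", (u_long)rectype));
+ */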
+
+/*
+ * Election gen file name
+ * The file contains an egen number for an election this client has NOT
+ * participated in. I.e. it is the number of a future election. We
+ * create it when we create the rep region, if it doesn't already exist
+ * and initialize egen to 1. If it does exist, we read it when we create
+ * the rep region. We write it immediately before sending our VOTE1 in
+ * an election. That way, if a client has ever sent a vote for any
+ * election, the file is already going to be updated to reflect a future
+ * election, should it crash.
+ */
+#define REP_EGENNAME "__db.rep.egen"
+#define REP_GENNAME "__db.rep.gen"
+
+/*
+ * Internal init flag file name:
+ * The existence of this file serves as an indication that the client is in the
+ * process of Internal Initialization, in case it crashes before completing.
+ * During internal init the client's partially reconstructed database pages and
+ * logs may be in an inconsistent state, so much so that running recovery must
+ * be avoided. Furthermore, there is no other way to reliably recognize this
+ * condition. Therefore, when we open an environment, and we're just about to
+ * run recovery, we check for this file first. If it exists we must discard all
+ * logs and databases. This avoids the recovery problems, and leads to a fresh
+ * attempt at internal init if the environment becomes a replication client and
+ * finds a master. The list of databases which may need to be removed is stored
+ * in this file.
+ */
+#define REP_INITNAME "__db.rep.init"
+#define REP_INITVERSION_46 1
+#define REP_INITVERSION_47 2
+#define REP_INITVERSION 3
+
+/*
+ * Database types for __rep_client_dbinit
+ */
+typedef enum {
+ REP_DB, /* Log record database. */
+ REP_PG /* Pg database. */
+} repdb_t;
+
+/* Macros to lock/unlock the replication region as a whole. */
+#define REP_SYSTEM_LOCK(env) \
+ MUTEX_LOCK(env, (env)->rep_handle->region->mtx_region)
+#define REP_SYSTEM_UNLOCK(env) \
+ MUTEX_UNLOCK(env, (env)->rep_handle->region->mtx_region)
+
+/*
+ * Macros for manipulating the event synchronization. We use a separate mutex
+ * so that an application's call-back function can be invoked without locking
+ * the whole region.
+ */
+#define REP_EVENT_LOCK(env) \
+ MUTEX_LOCK(env, (env)->rep_handle->region->mtx_event)
+#define REP_EVENT_UNLOCK(env) \
+ MUTEX_UNLOCK(env, (env)->rep_handle->region->mtx_event)
+
+/*
+ * Synchronization states
+ * Please change __rep_syncstate_to_string (rep_stat.c) to track any changes
+ * made to these states.
+ *
+ * The states are in alphabetical order (except for OFF). The usual
+ * order of progression for a full internal init is:
+ * VERIFY, UPDATE, PAGE, LOG (then back to OFF when we're done).
+ */
+typedef enum {
+ SYNC_OFF, /* No recovery. */
+ SYNC_LOG, /* Recovery - log. */
+ SYNC_PAGE, /* Recovery - pages. */
+ SYNC_UPDATE, /* Recovery - update. */
+ SYNC_VERIFY /* Recovery - verify. */
+} repsync_t;
+
+/*
+ * A record of the contents of the VOTE1 msg we sent out at current egen, in
+ * case we need to send out a duplicate VOTE1 to a late-joining client in a full
+ * election. The nsites, nvotes, and priority fields of the REP struct can't be
+ * used, because those could change. It's only safe to send out a dup if we
+ * send out the exact same info.
+ */
+typedef struct {
+ DB_LSN lsn;
+ u_int32_t nsites;
+ u_int32_t nvotes;
+ u_int32_t priority;
+ u_int32_t tiebreaker;
+ u_int32_t ctlflags;
+ u_int32_t data_gen;
+} VOTE1_CONTENT;
+
+/*
+ * REP --
+ * Shared replication structure.
+ */
+typedef struct __rep { /* SHARED */
+ db_mutex_t mtx_region; /* Region mutex. */
+ db_mutex_t mtx_clientdb; /* Client database mutex. */
+ db_mutex_t mtx_ckp; /* Checkpoint mutex. */
+ db_mutex_t mtx_diag; /* Diagnostic message mutex. */
+ db_mutex_t mtx_repstart; /* Role change mutex. */
+ int diag_index; /* Diagnostic file index. */
+ off_t diag_off; /* Diagnostic message offset. */
+ roff_t lease_off; /* Offset of the lease table. */
+ roff_t tally_off; /* Offset of the tally region. */
+ roff_t v2tally_off; /* Offset of the vote2 tally region. */
+ int eid; /* Environment id. */
+ int master_id; /* ID of the master site. */
+ u_int32_t version; /* Current replication version. */
+ u_int32_t egen; /* Replication election generation. */
+ u_int32_t spent_egen; /* Egen satisfied by rep_elect call. */
+ u_int32_t gen; /* Replication generation number. */
+ u_int32_t mgen; /* Master gen seen by client. */
+ u_int32_t asites; /* Space allocated for sites. */
+ u_int32_t nsites; /* Number of sites in group. */
+ u_int32_t nvotes; /* Number of votes needed. */
+ u_int32_t priority; /* My priority in an election. */
+ u_int32_t config_nsites;
+
+ db_timeout_t elect_timeout; /* Normal/full election timeouts. */
+ db_timeout_t full_elect_timeout;
+
+ db_timeout_t chkpt_delay; /* Master checkpoint delay. */
+
+#define REP_DEFAULT_THROTTLE (10 * MEGABYTE) /* Default value is < 1Gig. */
+ u_int32_t gbytes; /* Limit on data sent in single... */
+ u_int32_t bytes; /* __rep_process_message call. */
+#define DB_REP_REQUEST_GAP 40000 /* 40 msecs */
+#define DB_REP_MAX_GAP 1280000 /* 1.28 seconds */
+ db_timespec request_gap; /* Minimum time to wait before we
+ * request a missing log record. */
+ db_timespec max_gap; /* Maximum time to wait before
+ * requesting a missing log record. */
+ /* Status change information */
+ u_int32_t apply_th; /* Number of callers in rep_apply. */
+ u_int32_t arch_th; /* Number of callers in log_archive. */
+ u_int32_t elect_th; /* Elect threads in lock-out. */
+ u_int32_t msg_th; /* Number of callers in rep_proc_msg.*/
+ u_int32_t handle_cnt; /* Count of handles in library. */
+ u_int32_t op_cnt; /* Multi-step operation count.*/
+ DB_LSN ckp_lsn; /* LSN for syncing a checkpoint. */
+ DB_LSN max_prep_lsn; /* Max LSN of txn_prepare record. */
+
+ /*
+ * Event notification synchronization: the mtx_event and the associated
+ * fields it protects govern event notification to the
+ * application. They form a guarantee that no matter how crazy the
+ * thread scheduling gets, the application sees a sensible, orderly
+ * progression of events.
+ */
+ db_mutex_t mtx_event; /* Serializes event notification. */
+ /*
+ * Latest generation whose NEWMASTER event the application has been
+ * notified of. Also serves to force STARTUPDONE to occur after
+ * NEWMASTER.
+ */
+ u_int32_t newmaster_event_gen;
+ /*
+ * Latest local victory of an election that the application has been
+ * notified of, expressed as the election generation number. This
+ * ensures we notify the application exactly once when it wins an
+ * election.
+ */
+ u_int32_t notified_egen;
+
+ /* Internal init information. */
+ u_int32_t nfiles; /* Number of files we have info on. */
+ u_int32_t curfile; /* Cur file we're getting (0-based). */
+ roff_t originfo_off; /* Offset of original file info. */
+ u_int32_t infolen; /* Remaining length file info buffer. */
+ u_int32_t originfolen; /* Original length file info buffer. */
+ u_int32_t infoversion; /* Original file info version. */
+ DB_LSN first_lsn; /* Earliest LSN we need. */
+ u_int32_t first_vers; /* Log version of first log file. */
+ DB_LSN last_lsn; /* Latest LSN we need. */
+ /* These are protected by mtx_clientdb. */
+ db_timespec last_pg_ts; /* Last page stored timestamp. */
+ db_pgno_t ready_pg; /* Next pg expected. */
+ db_pgno_t waiting_pg; /* First pg after gap. */
+ db_pgno_t max_wait_pg; /* Maximum pg requested. */
+ u_int32_t npages; /* Num of pages rcvd for this file. */
+ roff_t curinfo_off; /* Offset of current file info. */
+ /* Always access with GET_CURINFO(). */
+
+ /* Vote tallying information. */
+ u_int32_t sites; /* Sites heard from. */
+ int winner; /* Current winner EID. */
+ u_int32_t w_priority; /* Winner priority. */
+ u_int32_t w_gen; /* Winner generation. */
+ u_int32_t w_datagen; /* Winner data generation. */
+ DB_LSN w_lsn; /* Winner LSN. */
+ u_int32_t w_tiebreaker; /* Winner tiebreaking value. */
+ u_int32_t votes; /* Number of votes for this site. */
+
+ VOTE1_CONTENT vote1; /* Valid until rep->egen changes. */
+
+ db_timespec etime; /* Election start timestamp. */
+ int full_elect; /* Is current election a "full" one? */
+
+ /* Leases. */
+ db_timeout_t lease_timeout; /* Lease timeout. */
+ db_timespec lease_duration; /* Lease timeout with clock skew. */
+ u_int32_t clock_skew; /* Clock skew. */
+ u_int32_t clock_base; /* Clock scale factor base. */
+ db_timespec grant_expire; /* Local grant expiration time. */
+
+ /* Cached LSN history, matching current gen. */
+ DB_LSN gen_base_lsn; /* Base LSN of current generation. */
+ u_int32_t master_envid; /* Current master's "unique" env ID. */
+
+ SH_TAILQ_HEAD(__wait) waiters; /* List of threads in txn_applied(). */
+ SH_TAILQ_HEAD(__wfree) free_waiters;/* Free list of waiter structs. */
+
+#ifdef HAVE_REPLICATION_THREADS
+ /*
+ * Replication Framework (repmgr) shared config information.
+ */
+ db_mutex_t mtx_repmgr; /* Region mutex. */
+ roff_t siteinfo_off; /* Offset of site array region. */
+ u_int site_cnt; /* Array slots in use. */
+ u_int site_max; /* Total array slots allocated. */
+ int self_eid; /* Where to find the local site. */
+ u_int siteinfo_seq; /* Number of updates to this info. */
+ u_int32_t min_log_file; /* Earliest log needed by repgroup. */
+
+ pid_t listener;
+
+ int perm_policy;
+ db_timeout_t ack_timeout;
+ db_timeout_t election_retry_wait;
+ db_timeout_t connection_retry_wait;
+ db_timeout_t heartbeat_frequency; /* Max period between msgs. */
+ db_timeout_t heartbeat_monitor_timeout;
+#endif /* HAVE_REPLICATION_THREADS */
+
+ /* Statistics. */
+ DB_REP_STAT stat;
+#if defined(HAVE_REPLICATION_THREADS) && defined(HAVE_STATISTICS)
+ DB_REPMGR_STAT mstat;
+#endif
+
+ /*
+ * Please change __rep_print_all (rep_stat.c) to track any changes made
+ * to all these flag families below.
+ */
+ /* Configuration. */
+#define REP_C_2SITE_STRICT 0x00001 /* Don't cheat on elections. */
+#define REP_C_AUTOINIT 0x00002 /* Auto initialization. */
+#define REP_C_AUTOROLLBACK 0x00004 /* Discard client txns: sync. */
+#define REP_C_BULK 0x00008 /* Bulk transfer. */
+#define REP_C_DELAYCLIENT 0x00010 /* Delay client sync-up. */
+#define REP_C_ELECTIONS 0x00020 /* Repmgr to use elections. */
+#define REP_C_INMEM 0x00040 /* In-memory replication. */
+#define REP_C_LEASE 0x00080 /* Leases configured. */
+#define REP_C_NOWAIT 0x00100 /* Immediate error return. */
+ u_int32_t config; /* Configuration flags. */
+
+ /* Election. */
+#define REP_E_PHASE0 0x00000001 /* In phase 0 of election. */
+#define REP_E_PHASE1 0x00000002 /* In phase 1 of election. */
+#define REP_E_PHASE2 0x00000004 /* In phase 2 of election. */
+#define REP_E_TALLY 0x00000008 /* Tallied vote before elect. */
+ u_int32_t elect_flags; /* Election flags. */
+
+ /* Lockout. */
+#define REP_LOCKOUT_API 0x00000001 /* BDB API - handle_cnt. */
+#define REP_LOCKOUT_APPLY 0x00000002 /* apply msgs - apply_th. */
+#define REP_LOCKOUT_ARCHIVE 0x00000004 /* log_archive. */
+#define REP_LOCKOUT_MSG 0x00000008 /* Message process - msg_th. */
+#define REP_LOCKOUT_OP 0x00000010 /* BDB ops txn,curs - op_cnt. */
+ u_int32_t lockout_flags; /* Lockout flags. */
+
+ /* See above for enumerated sync states. */
+ repsync_t sync_state; /* Recovery/synchronization flags. */
+
+ /*
+ * When adding a new flag value, consider whether it should be
+ * cleared in rep_start() when starting as a master or a client.
+ */
+#define REP_F_ABBREVIATED 0x00000001 /* Recover NIMDB pages only. */
+#define REP_F_APP_BASEAPI 0x00000002 /* Base API application. */
+#define REP_F_APP_REPMGR 0x00000004 /* repmgr application. */
+#define REP_F_CLIENT 0x00000008 /* Client replica. */
+#define REP_F_DELAY 0x00000010 /* Delaying client sync-up. */
+#define REP_F_GROUP_ESTD 0x00000020 /* Rep group is established. */
+#define REP_F_INUPDREQ 0x00000040 /* Thread in rep_update_req. */
+#define REP_F_LEASE_EXPIRED 0x00000080 /* Leases guaranteed expired. */
+#define REP_F_MASTER 0x00000100 /* Master replica. */
+#define REP_F_MASTERELECT 0x00000200 /* Master elect. */
+#define REP_F_NEWFILE 0x00000400 /* Newfile in progress. */
+#define REP_F_NIMDBS_LOADED 0x00000800 /* NIMDBs are materialized. */
+#define REP_F_SKIPPED_APPLY 0x00001000 /* Skipped applying a record. */
+#define REP_F_START_CALLED 0x00002000 /* Rep_start called. */
+#define REP_F_SYS_DB_OP 0x00004000 /* Operation in progress. */
+ u_int32_t flags;
+} REP;
+
+/* Information about a thread waiting in txn_applied(). */
+typedef enum {
+ AWAIT_GEN, /* Client's gen is behind token gen. */
+ AWAIT_HISTORY, /* Haven't received master's LSN db update. */
+ AWAIT_LSN, /* Awaiting replication of user txn. */
+ AWAIT_NIMDB, /* LSN db missing: maybe it's INMEM. */
+ LOCKOUT /* Thread awoken due to pending lockout. */
+} rep_waitreason_t;
+
+struct rep_waitgoal {
+ rep_waitreason_t why;
+ union {
+ DB_LSN lsn; /* For AWAIT_LSN and AWAIT_HISTORY. */
+ u_int32_t gen; /* AWAIT_GEN */
+ } u;
+};
+
+struct __rep_waiter {
+ db_mutex_t mtx_repwait; /* Self-blocking mutex. */
+ struct rep_waitgoal goal;
+ SH_TAILQ_ENTRY links; /* On either free or waiting list. */
+
+#define REP_F_PENDING_LOCKOUT 0x00000001
+#define REP_F_WOKEN 0x00000002
+ u_int32_t flags;
+};
+
+/*
+ * Macros to check and clear the BDB lockouts. Currently they are
+ * locked out/set individually because they pertain to different pieces of
+ * the BDB API; otherwise they are always checked and cleared together.
+ */
+#define ISSET_LOCKOUT_BDB(R) \
+ (FLD_ISSET((R)->lockout_flags, (REP_LOCKOUT_API | REP_LOCKOUT_OP)))
+
+#define CLR_LOCKOUT_BDB(R) \
+ (FLD_CLR((R)->lockout_flags, (REP_LOCKOUT_API | REP_LOCKOUT_OP)))
+
+/*
+ * Macros to check or clear any/all recovery-related settings, i.e.,
+ * REP_LOCKOUT_{API|OP} and the SYNC_* sync state. This must change if the values
+ * of the flags change. NOTE: We do not include REP_LOCKOUT_MSG in
+ * this mask because it is used frequently in non-recovery related
+ * areas and we want to manipulate it separately (see especially
+ * in __rep_new_master).
+ */
+#define CLR_RECOVERY_SETTINGS(R) \
+do { \
+ (R)->sync_state = SYNC_OFF; \
+ CLR_LOCKOUT_BDB(R); \
+} while (0)
+
+#define IS_REP_RECOVERING(R) \
+ ((R)->sync_state != SYNC_OFF || ISSET_LOCKOUT_BDB(R))
+
+/*
+ * REP_E_PHASE0 is not a *real* election phase. It is used for
+ * master leases, allowing the client to find the master or to
+ * expire its lease. However, REP_E_PHASE0 is cleared by __rep_elect_done.
+ */
+#define IN_ELECTION(R) \
+ FLD_ISSET((R)->elect_flags, REP_E_PHASE1 | REP_E_PHASE2)
+#define IN_ELECTION_TALLY(R) \
+ FLD_ISSET((R)->elect_flags, REP_E_PHASE1 | REP_E_PHASE2 | REP_E_TALLY)
+#define ELECTION_MAJORITY(n) (((n) / 2) + 1)
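+/* Added note: e.g., ELECTION_MAJORITY(4) == 3 and ELECTION_MAJORITY(5) == 3. */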
+
+#define IN_INTERNAL_INIT(R) \
+ ((R)->sync_state == SYNC_LOG || (R)->sync_state == SYNC_PAGE)
+
+#define IS_REP_MASTER(env) \
+ (REP_ON(env) && \
+ F_ISSET(((env)->rep_handle->region), REP_F_MASTER))
+
+#define IS_REP_CLIENT(env) \
+ (REP_ON(env) && \
+ F_ISSET(((env)->rep_handle->region), REP_F_CLIENT))
+
+#define IS_REP_STARTED(env) \
+ (REP_ON(env) && \
+ F_ISSET(((env)->rep_handle->region), REP_F_START_CALLED))
+
+#define IS_USING_LEASES(env) \
+ (REP_ON(env) && \
+ FLD_ISSET(((env)->rep_handle->region)->config, REP_C_LEASE))
+
+#define IS_CLIENT_PGRECOVER(env) \
+ (IS_REP_CLIENT(env) && \
+ (((env)->rep_handle->region)->sync_state == SYNC_PAGE))
+
+/*
+ * Macros to figure out if we need to do replication pre/post-amble processing.
+ * Skip for specific DB handles owned by the replication layer, either because
+ * replication is running recovery or because it's a handle entirely owned by
+ * the replication code (replication opens its own databases to track state).
+ */
+#define REP_FLAGS_SET(env) \
+ ((env)->rep_handle->region->flags != 0 || \
+ (env)->rep_handle->region->elect_flags != 0 || \
+ (env)->rep_handle->region->lockout_flags != 0)
+
+#define IS_ENV_REPLICATED(env) \
+ (REP_ON(env) && REP_FLAGS_SET(env))
+
+/*
+ * Update the temporary log archive block timer.
+ */
+#define MASTER_UPDATE(env, renv) do { \
+ REP_SYSTEM_LOCK(env); \
+ F_SET((renv), DB_REGENV_REPLOCKED); \
+ (void)time(&(renv)->op_timestamp); \
+ REP_SYSTEM_UNLOCK(env); \
+} while (0)
+
+/*
+ * Macro to set a new generation number. Cached values from the LSN history
+ * database are associated with the current gen, so when the gen changes we must
+ * invalidate the cache. Use this macro for all gen changes, to avoid
+ * forgetting to do so. This macro should be used while holding the rep system
+ * mutex (unless we know we're single-threaded for some other reason, like at
+ * region create time).
+ */
+#define SET_GEN(g) do { \
+ rep->gen = (g); \
+ ZERO_LSN(rep->gen_base_lsn); \
+} while (0)
+
+/*
+ * Gap processing flags. These provide control over the basic
+ * gap processing algorithm for some special cases.
+ */
+#define REP_GAP_FORCE 0x001 /* Force a request for a gap. */
+#define REP_GAP_REREQUEST 0x002 /* Gap request is a forced rerequest. */
+ /* REREQUEST is a superset of FORCE. */
+
+/*
+ * Flags indicating what kind of record we want to back up to, in the log.
+ */
+#define REP_REC_COMMIT 0x001 /* Most recent commit record. */
+#define REP_REC_PERM 0x002 /* Most recent perm record. */
+ /* PERM is a superset of COMMIT. */
+
+/*
+ * Basic pre/post-amble processing.
+ */
+#define REPLICATION_WRAP(env, func_call, checklock, ret) do { \
+ int __rep_check, __t_ret; \
+ __rep_check = IS_ENV_REPLICATED(env) ? 1 : 0; \
+ (ret) = __rep_check ? __env_rep_enter(env, checklock) : 0; \
+ if ((ret) == 0) { \
+ (ret) = func_call; \
+ if (__rep_check && (__t_ret = \
+ __env_db_rep_exit(env)) != 0 && (ret) == 0) \
+ (ret) = __t_ret; \
+ } \
+} while (0)
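+
+/*
+ * Illustrative call (added note; "__foo" and its arguments are hypothetical):
+ * a public API method wraps its internal worker as
+ *
+ *	REPLICATION_WRAP(env, (__foo(dbp, flags)), 0, ret);
+ *
+ * so that replication entry/exit bookkeeping brackets the real work and any
+ * exit error is folded into "ret".
+ */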
+
+/*
+ * Macro to safely access curinfo and its internal DBT pointers from
+ * any process. This should always be used to access curinfo. If
+ * the internal DBT pointers are to be used, mtx_clientdb must be held
+ * between the time of this call and the use of the pointers.
+ *
+ * The current file information (curinfo) is stored in shared region
+ * memory and accessed via an offset. It contains DBTs that themselves
+ * point to allocated data. __rep_nextfile() manages this information in a
+ * single chunk of shared memory.
+ *
+ * If different processes access curinfo, they may have different shared
+ * region addresses. This means that curinfo and its pointers to DBT data
+ * must be recalculated for each process starting with the offset.
+ */
+#define GET_CURINFO(rep, infop, curinfo) \
+do { \
+ curinfo = R_ADDR(infop, rep->curinfo_off); \
+ if ((curinfo)->uid.size > 0) \
+ (curinfo)->uid.data = R_ADDR(infop, \
+ rep->curinfo_off + sizeof(__rep_fileinfo_args)); \
+ else \
+ (curinfo)->uid.data = NULL; \
+ if ((curinfo)->info.size > 0) \
+ (curinfo)->info.data = R_ADDR(infop, rep->curinfo_off + \
+ sizeof(__rep_fileinfo_args) + (curinfo)->uid.size); \
+ else \
+ (curinfo)->info.data = NULL; \
+ if ((curinfo)->dir.size > 0) \
+ (curinfo)->dir.data = R_ADDR(infop, rep->curinfo_off + \
+ sizeof(__rep_fileinfo_args) + (curinfo)->uid.size + \
+ (curinfo)->info.size); \
+ else \
+ (curinfo)->dir.data = NULL; \
+} while (0)
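+
+/*
+ * Illustrative use (added note): callers recompute the per-process addresses
+ * from the shared offset each time, holding mtx_clientdb while the DBT
+ * pointers are in use:
+ *
+ *	__rep_fileinfo_args *curinfo;
+ *
+ *	MUTEX_LOCK(env, rep->mtx_clientdb);
+ *	GET_CURINFO(rep, infop, curinfo);
+ *	... read curinfo->uid / curinfo->info / curinfo->dir ...
+ *	MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ */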
+
+/*
+ * Per-process replication structure.
+ *
+ * There are 2 mutexes used in the Base replication API. (See LOCK_MUTEX in
+ * repmgr.h for a discussion of repmgr.)
+ * 1. mtx_region - This protects the fields of the rep region above.
+ * 2. mtx_clientdb - This protects the per-process flags, and bookkeeping
+ * database and all of the components that maintain it. Those
+ * components include the following fields in the log region (see log.h):
+ * a. ready_lsn
+ * b. waiting_lsn
+ * c. verify_lsn
+ * d. wait_recs
+ * e. rcvd_recs
+ * f. max_wait_lsn
+ * These fields in the log region are NOT protected by the log region lock at
+ * all.
+ *
+ * Note that the per-process flags should truly be protected by a special
+ * per-process thread mutex, but it is currently set in so isolated a manner
+ * that it didn't make sense to do so, and in most cases we're already holding
+ * the mtx_clientdb anyway.
+ *
+ * The lock ordering protocol is that mtx_clientdb must be acquired first,
+ * and then either REP->mtx_region or LOG->mtx_region may be acquired if
+ * necessary.
+ *
+ * Note that the appropriate mutex is needed any time one or more related
+ * values are read or written that could possibly use more than one atomic
+ * machine instruction. A single 32-bit integer value is safe without a
+ * mutex, but most other types of value should use a mutex.
+ *
+ * Any use of a mutex must be inside a matched pair of ENV_ENTER() and
+ * ENV_LEAVE() macros. This ensures that if a thread dies while holding
+ * a lock (i.e. a mutex), recovery can clean it up so that it does not
+ * indefinitely block other threads.
+ */
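+/*
+ * Lock-ordering sketch (added note), per the protocol above:
+ *
+ *	MUTEX_LOCK(env, rep->mtx_clientdb);	(first)
+ *	REP_SYSTEM_LOCK(env);			(then REP->mtx_region)
+ *	...
+ *	REP_SYSTEM_UNLOCK(env);
+ *	MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ */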
+struct __db_rep {
+ /*
+ * Shared configuration information -- copied to and maintained in the
+ * shared region as soon as the shared region is created.
+ */
+ int eid; /* Environment ID. */
+
+ u_int32_t gbytes; /* Limit on data sent in single... */
+ u_int32_t bytes; /* __rep_process_message call. */
+
+ db_timespec request_gap; /* Minimum time to wait before we
+ * request a missing log record. */
+ db_timespec max_gap; /* Maximum time to wait before
+ * requesting a missing log record. */
+
+ u_int32_t clock_skew; /* Clock skew factor. */
+ u_int32_t clock_base; /* Clock skew base. */
+ u_int32_t config; /* Configuration flags. */
+ u_int32_t config_nsites;
+
+ db_timeout_t elect_timeout; /* Normal/full election timeouts. */
+ db_timeout_t full_elect_timeout;
+
+ db_timeout_t chkpt_delay; /* Master checkpoint delay. */
+
+ u_int32_t my_priority;
+ db_timeout_t lease_timeout; /* Master leases. */
+ /*
+ * End of shared configuration information.
+ */
+ int (*send) /* Send function. */
+ __P((DB_ENV *, const DBT *, const DBT *,
+ const DB_LSN *, int, u_int32_t));
+
+ DB *rep_db; /* Bookkeeping database. */
+ DB *lsn_db; /* (Replicated) LSN history database. */
+
+ REP *region; /* In memory structure. */
+ u_int8_t *bulk; /* Shared memory bulk area. */
+
+#define DBREP_DIAG_FILES 2
+ DB_FH *diagfile[DBREP_DIAG_FILES]; /* Diag files fhp. */
+ off_t diag_off; /* Current diag file offset. */
+
+ /* These are protected by mtx_clientdb. */
+ DB_MPOOLFILE *file_mpf; /* Mpoolfile for current database. */
+ DB *file_dbp; /* This file's page info. */
+ DBC *queue_dbc; /* Dbc for a queue file. */
+
+ /*
+ * Please change __rep_print_all (rep_stat.c) to track any changes made
+ * to these flags.
+ */
+#define DBREP_APP_BASEAPI 0x0001 /* Base API application. */
+#define DBREP_APP_REPMGR 0x0002 /* repmgr application. */
+#define DBREP_OPENFILES 0x0004 /* This handle has opened files. */
+ u_int32_t flags; /* per-process flags. */
+
+#ifdef HAVE_REPLICATION_THREADS
+ /*
+ * Replication Framework (repmgr) per-process information.
+ */
+ u_int nthreads; /* Msg processing threads. */
+ u_int athreads; /* Space allocated for msg threads. */
+ u_int non_rep_th; /* Threads in GMDB or channel msgs. */
+ u_int aelect_threads; /* Space allocated for elect threads. */
+ u_int32_t init_policy;
+ int perm_policy;
+ DB_LSN perm_lsn; /* Last perm LSN we've announced. */
+ db_timeout_t ack_timeout;
+ db_timeout_t election_retry_wait;
+ db_timeout_t connection_retry_wait;
+ db_timeout_t heartbeat_frequency; /* Max period between msgs. */
+ db_timeout_t heartbeat_monitor_timeout;
+
+ /* Thread synchronization. */
+ REPMGR_RUNNABLE *selector, **messengers, **elect_threads;
+ REPMGR_RUNNABLE *preferred_elect_thr;
+ db_timespec repstart_time;
+ mgr_mutex_t *mutex;
+ cond_var_t check_election, gmdb_idle, msg_avail;
+ waiter_t ack_waiters; /* For threads awaiting PERM acks. */
+#ifdef DB_WIN32
+ HANDLE signaler;
+#else
+ int read_pipe, write_pipe;
+#endif
+
+ /* Operational stuff. */
+ REPMGR_SITE *sites; /* Array of known sites. */
+ u_int site_cnt; /* Array slots in use. */
+ u_int site_max; /* Total array slots allocated. */
+ int self_eid; /* Where to find the local site. */
+ u_int siteinfo_seq; /* Last known update to this list. */
+
+ /*
+ * The connections list contains only those connections not actively
+ * associated with a known site (see repmgr.h).
+ */
+ CONNECTION_LIST connections;
+ RETRY_Q_HEADER retries; /* Sites needing connection retry. */
+ struct {
+ int size;
+ STAILQ_HEAD(__repmgr_q_header, __repmgr_message) header;
+ } input_queue;
+
+ socket_t listen_fd;
+ db_timespec last_bcast; /* Time of last broadcast msg. */
+
+ /*
+ * Status of repmgr. It is ready when repmgr is not yet started. It
+ * is running after repmgr is (re)started. It is stopped if the env
+ * of the running repmgr is closed, or the site is removed.
+ */
+ enum { ready, running, stopped } repmgr_status;
+ int new_connection; /* Since last master seek attempt. */
+ int takeover_pending; /* We've been elected master. */
+ int gmdb_busy;
+ int client_intent; /* Will relinquish master role. */
+ int gmdb_dirty;
+ int have_gmdb;
+ int seen_repmsg;
+
+ /*
+ * Flag to show what kind of transaction is currently in progress.
+ * Primary means we're doing the first (critical) phase of a membership
+ * DB update, where we care about perm failures. In the secondary phase
+ * we don't care. Usually the value is "none", when normal user
+ * transactions are happening. We need to use this global flag because
+ * we don't have a proper, direct channel to communicate information
+ * between the originator of a transaction and the replication send()
+ * function that has to wait for acks and decide what to do about them.
+ */
+ enum { none, gmdb_primary, gmdb_secondary } active_gmdb_update;
+ int limbo_resolution_needed;
+
+ /*
+ * GMDB update sequence count. On creation we write version 1; so, once
+ * repmgr has started and tried to read, a 0 here can be taken to mean
+ * that the DB doesn't exist yet.
+ */
+ u_int32_t membership_version;
+ u_int32_t member_version_gen;
+
+ /* LSN of GMDB txn that got a perm failure. */
+ DB_LSN limbo_failure;
+ /* EID whose membership status is therefore unresolved */
+ int limbo_victim;
+ /* LSN of a later txn that achieves perm success. */
+ DB_LSN durable_lsn;
+ DB *gmdb; /* Membership database handle. */
+ /*
+ * Membership list restored from init file after crash during internal init.
+ */
+ u_int8_t *restored_list;
+ size_t restored_list_length;
+
+ /* Application's message dispatch call-back function. */
+ void (*msg_dispatch) __P((DB_ENV *, DB_CHANNEL *,
+ DBT *, u_int32_t, u_int32_t));
+#endif /* HAVE_REPLICATION_THREADS */
+};
+
+/*
+ * Determine whether application is repmgr or base replication API. If
+ * repmgr was configured, base the test on internal replication flags for
+ * APP_REPMGR and APP_BASEAPI. These flags get set by the appropriate parts
+ * of the various replication APIs.
+ */
+#ifdef HAVE_REPLICATION_THREADS
+/*
+ * Application type is set to be repmgr when:
+ * 1. A local site is defined.
+ * 2. A remote site is defined.
+ * 3. An acknowledgement policy is configured.
+ * 4. A repmgr flag is configured.
+ * 5. A timeout value is configured for one of the repmgr timeouts.
+ */
+#define APP_IS_REPMGR(env) \
+ (REP_ON(env) ? \
+ F_ISSET((env)->rep_handle->region, REP_F_APP_REPMGR) : \
+ F_ISSET((env)->rep_handle, DBREP_APP_REPMGR))
+
+/*
+ * Application type is set to be base replication API when:
+ * 1. Transport send function is defined and is not the repmgr send
+ * function.
+ */
+#define APP_IS_BASEAPI(env) \
+ (REP_ON(env) ? \
+ F_ISSET((env)->rep_handle->region, REP_F_APP_BASEAPI) : \
+ F_ISSET((env)->rep_handle, DBREP_APP_BASEAPI))
+
+/*
+ * Set application type. These macros do extra checking to guarantee that
+ * only one application type is ever set.
+ */
+#define APP_SET_REPMGR(env) do { \
+ if (REP_ON(env)) { \
+ ENV_ENTER(env, ip); \
+ REP_SYSTEM_LOCK(env); \
+ if (!F_ISSET((env)->rep_handle->region, \
+ REP_F_APP_BASEAPI)) \
+ F_SET((env)->rep_handle->region, \
+ REP_F_APP_REPMGR); \
+ REP_SYSTEM_UNLOCK(env); \
+ ENV_LEAVE(env, ip); \
+ } else if (!F_ISSET((env)->rep_handle, DBREP_APP_BASEAPI)) \
+ F_SET((env)->rep_handle, DBREP_APP_REPMGR); \
+} while (0)
+#define APP_SET_BASEAPI(env) do { \
+ if (REP_ON(env)) { \
+ ENV_ENTER(env, ip); \
+ REP_SYSTEM_LOCK(env); \
+ if (!F_ISSET((env)->rep_handle->region, \
+ REP_F_APP_REPMGR)) \
+ F_SET((env)->rep_handle->region, \
+ REP_F_APP_BASEAPI); \
+ REP_SYSTEM_UNLOCK(env); \
+ ENV_LEAVE(env, ip); \
+ } else if (!F_ISSET((env)->rep_handle, DBREP_APP_REPMGR)) \
+ F_SET((env)->rep_handle, DBREP_APP_BASEAPI); \
+} while (0)
+
+#else
+/*
+ * We did not configure repmgr, application must be base replication API.
+ * The APP_SET_* macros are noops in this case, but they must be defined
+ * with a null body to avoid compiler warnings on some platforms.
+ */
+#define APP_IS_REPMGR(env) 0
+#define APP_SET_REPMGR(env) do { \
+ ; \
+} while (0)
+#define APP_IS_BASEAPI(env) 1
+#define APP_SET_BASEAPI(env) do { \
+ ; \
+} while (0)
+#endif /* HAVE_REPLICATION_THREADS */
+
+/*
+ * Control structure flags for replication communication infrastructure.
+ */
+/*
+ * Define old DB_LOG_ values that we must support here. For reasons of
+ * compatibility with old versions, these values must be reserved explicitly in
+ * the list of flag values (below)
+ */
+#define DB_LOG_PERM_42_44 0x20
+#define DB_LOG_RESEND_42_44 0x40
+#define REPCTL_INIT_45 0x02 /* Back compatible flag value. */
+
+#define REPCTL_ELECTABLE 0x01 /* Upgraded client is electable. */
+#define REPCTL_FLUSH 0x02 /* Record should be flushed. */
+#define REPCTL_GROUP_ESTD 0x04 /* Message from site in a group. */
+#define REPCTL_INIT 0x08 /* Internal init message. */
+#define REPCTL_LEASE 0x10 /* Lease-related message. */
+ /*
+ * Skip over reserved values 0x20
+ * and 0x40, as explained above.
+ */
+#define REPCTL_LOG_END 0x80 /* Approximate end of group-wide log. */
+#define REPCTL_PERM DB_LOG_PERM_42_44
+#define REPCTL_RESEND DB_LOG_RESEND_42_44
+
+/*
+ * File info flags for internal init. The per-database (i.e., file) flag
+ * represents the on-disk format of the file, and is conveyed from the master to
+ * the initializing client in the UPDATE message, so that the client can know
+ * how to create the file. The per-page flag is conveyed along with each PAGE
+ * message, describing the format of the page image being transmitted; it is of
+ * course set by the site serving the PAGE_REQ. The serving site gets the page
+ * image from its own mpool, and thus the page is in the native format of the
+ * serving site. This format may be different (i.e., opposite) from the on-disk
+ * format, and in fact can vary per-page, since with client-to-client sync it is
+ * possible for various different sites to serve the various PAGE_REQ requests.
+ */
+#define REPINFO_DB_LITTLEENDIAN 0x0001 /* File is little-endian lorder. */
+#define REPINFO_PG_LITTLEENDIAN 0x0002 /* Page is little-endian lorder. */
+
+/*
+ * Control message format for the 4.6 release. The db_timespec is
+ * not a portable structure. Therefore, in 4.6, replication among
+ * mixed OSs such as Linux and Windows, which have different time_t
+ * sizes, does not work.
+ */
+typedef struct {
+ u_int32_t rep_version; /* Replication version number. */
+ u_int32_t log_version; /* Log version number. */
+
+ DB_LSN lsn; /* Log sequence number. */
+ u_int32_t rectype; /* Message type. */
+ u_int32_t gen; /* Generation number. */
+ db_timespec msg_time; /* Timestamp seconds for leases. */
+ u_int32_t flags; /* log_put flag value. */
+} REP_46_CONTROL;
+
+/*
+ * Control message format for 4.5 release and earlier.
+ */
+typedef struct {
+ u_int32_t rep_version; /* Replication version number. */
+ u_int32_t log_version; /* Log version number. */
+
+ DB_LSN lsn; /* Log sequence number. */
+ u_int32_t rectype; /* Message type. */
+ u_int32_t gen; /* Generation number. */
+ u_int32_t flags; /* log_put flag value. */
+} REP_OLD_CONTROL;
+
+#define LEASE_REFRESH_MIN 30 /* Minimum number of refresh retries. */
+#define LEASE_REFRESH_USEC 50000 /* Microseconds between refresh tries. */
+
+/* Master granted lease information. */
+typedef struct __rep_lease_entry {
+ int eid; /* EID of client grantor. */
+ db_timespec start_time; /* Start time clients echo back. */
+ db_timespec end_time; /* Master lease expiration time. */
+ DB_LSN lease_lsn; /* Durable LSN lease applies to. */
+} REP_LEASE_ENTRY;
+
+/*
+ * Old vote info where some fields were not fixed size.
+ */
+typedef struct {
+ u_int32_t egen; /* Election generation. */
+ int nsites; /* Number of sites I've been in
+ * communication with. */
+ int nvotes; /* Number of votes needed to win. */
+ int priority; /* My site's priority. */
+ u_int32_t tiebreaker; /* Tie-breaking quasi-random value. */
+} REP_OLD_VOTE_INFO;
+
+typedef struct {
+ u_int32_t egen; /* Voter's election generation. */
+ int eid; /* Voter's ID. */
+} REP_VTALLY;
+
+/*
+ * The REP_THROTTLE_ONLY flag is used to do throttle processing only.
+ * If set, it will only allow sending the REP_*_MORE message, but not
+ * the normal, non-throttled message. It is used to support throttling
+ * with bulk transfer.
+ */
+/* Flags for __rep_send_throttle. */
+#define REP_THROTTLE_ONLY 0x0001 /* Send _MORE message only. */
+
+/* Throttled message processing information. */
+typedef struct {
+ DB_LSN lsn; /* LSN of this record. */
+ DBT *data_dbt; /* DBT of this record. */
+ u_int32_t gbytes; /* This call's max gbytes sent. */
+ u_int32_t bytes; /* This call's max bytes sent. */
+ u_int32_t type; /* Record type. */
+} REP_THROTTLE;
+
+/* Bulk processing information. */
+/*
+ * !!!
+ * We use a roff_t for the offset. We'd really like to use a ptrdiff_t
+ * since that really is what it is. But ptrdiff_t is not portable and
+ * doesn't exist everywhere.
+ */
+typedef struct {
+ u_int8_t *addr; /* Address of bulk buffer. */
+ roff_t *offp; /* Ptr to current offset into buffer. */
+ u_int32_t len; /* Bulk buffer length. */
+ u_int32_t type; /* Item type in buffer (log, page). */
+ DB_LSN lsn; /* First LSN in buffer. */
+ int eid; /* ID of potential recipients. */
+#define BULK_XMIT 0x001 /* Buffer in transit. */
+ u_int32_t *flagsp; /* Buffer flags. */
+} REP_BULK;
+
+/*
+ * This structure takes care of representing a transaction.
+ * It holds all the records, sorted by page number so that
+ * we can obtain locks and apply updates in a deadlock free
+ * order.
+ */
+typedef struct {
+ u_int nlsns;
+ u_int nalloc;
+ DB_LSN *array;
+} LSN_COLLECTION;
+
+/*
+ * This is used by the page-prep routines to do the lock_vec call to
+ * apply the updates for a single transaction or a collection of
+ * transactions.
+ */
+typedef struct {
+ int n;
+ DB_LOCKREQ *reqs;
+ DBT *objs;
+} linfo_t;
+
+#if defined(__cplusplus)
+}
+#endif
+
+#include "dbinc_auto/rep_ext.h"
+#endif /* !_DB_REP_H_ */
diff --git a/src/dbinc/repmgr.h b/src/dbinc/repmgr.h
new file mode 100644
index 00000000..d8fd199c
--- /dev/null
+++ b/src/dbinc/repmgr.h
@@ -0,0 +1,843 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2006, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_REPMGR_H_
+#define _DB_REPMGR_H_
+
+#include "dbinc_auto/repmgr_automsg.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * Replication Manager message format types. These few format codes identify
+ * enough information to describe, at the lowest level, how a message should be
+ * read from the wire, including how much memory should be allocated to hold the
+ * result. (Often we want to allocate more than just enough to hold the
+ * received bytes, if we know that we will need more during processing.)
+ *
+ * These values are transmitted between sites, even sites running differing BDB
+ * versions. Therefore, once assigned, the values are permanently "frozen".
+ *
+ * For example, in repmgr wire protocol version 1 the highest assigned message
+ * type value was 3, for REPMGR_REP_MESSAGE. Wire protocol version 2 added the
+ * HEARTBEAT message type (4).
+ *
+ * New message types added in later versions always get new (higher) values. We
+ * still list them in alphabetical order, for ease of reference. But this
+ * generally does not correspond to numerical order.
+ */
+#define REPMGR_APP_MESSAGE 5 /* Msg sent from app. on DB_CHANNEL. */
+#define REPMGR_APP_RESPONSE 6 /* Response to a channel request. */
+#define REPMGR_HANDSHAKE 2 /* Connection establishment sequence. */
+#define REPMGR_HEARTBEAT 4 /* Monitor connection health. */
+#define REPMGR_OWN_MSG 8 /* Repmgr's own messages, to peers. */
+#define REPMGR_PERMLSN 1 /* My perm LSN. */
+#define REPMGR_REP_MESSAGE 3 /* Normal replication message. */
+#define REPMGR_RESP_ERROR 7 /* Sys-gen'd error resp to request. */
+
+/*
+ * Largest known message type code known in each protocol version we support.
+ * In protocol version one there were only three message types: 1, 2, and 3; so
+ * 3 was the max. In protocol version 2 we introduced heartbeats, type 4.
+ * (Protocol version 3 did not introduce any new message types.) In version 4
+ * we introduced a few more new message types, the largest of which had value 7.
+ */
+#define REPMGR_MAX_V1_MSG_TYPE 3
+#define REPMGR_MAX_V2_MSG_TYPE 4
+#define REPMGR_MAX_V3_MSG_TYPE 4
+#define REPMGR_MAX_V4_MSG_TYPE 8
+#define HEARTBEAT_MIN_VERSION 2
+#define CHANNEL_MIN_VERSION 4
+#define CONN_COLLISION_VERSION 4
+#define GM_MIN_VERSION 4
+#define OWN_MIN_VERSION 4
+
+/* The range of protocol versions we're willing to support. */
+#define DB_REPMGR_VERSION 4
+#define DB_REPMGR_MIN_VERSION 1
+
+/*
+ * For messages with the "REPMGR_OWN_MSG" format code, a message type (see
+ * REPMGR_OWN_MSG_TYPE, below) is included in the header. While at the lowest
+ * level, the format codes identify only enough to read and allocate memory, at
+ * the next higher level the following message type codes identify the content
+ * of the message: how to unmarshal and dispatch it.
+ *
+ * Like the message format types, these message type values should be
+ * permanently frozen.
+ */
+#define REPMGR_CONNECT_REJECT 1
+#define REPMGR_GM_FAILURE 2
+#define REPMGR_GM_FORWARD 3
+#define REPMGR_JOIN_REQUEST 4
+#define REPMGR_JOIN_SUCCESS 5
+#define REPMGR_PARM_REFRESH 6
+#define REPMGR_REJOIN 7
+#define REPMGR_REMOVE_REQUEST 8
+#define REPMGR_REMOVE_SUCCESS 9
+#define REPMGR_RESOLVE_LIMBO 10
+#define REPMGR_SHARING 11
+
+struct __repmgr_connection;
+ typedef struct __repmgr_connection REPMGR_CONNECTION;
+struct __repmgr_queue; typedef struct __repmgr_queue REPMGR_QUEUE;
+struct __queued_output; typedef struct __queued_output QUEUED_OUTPUT;
+struct __repmgr_response; typedef struct __repmgr_response REPMGR_RESPONSE;
+struct __repmgr_retry; typedef struct __repmgr_retry REPMGR_RETRY;
+struct __repmgr_runnable; typedef struct __repmgr_runnable REPMGR_RUNNABLE;
+struct __repmgr_site; typedef struct __repmgr_site REPMGR_SITE;
+struct __cond_waiters_table;
+ typedef struct __cond_waiters_table COND_WAITERS_TABLE;
+
+/* Current Group Membership DB format ID. */
+#define REPMGR_GMDB_FMT_VERSION 1
+
+#ifdef DB_WIN32
+typedef SOCKET socket_t;
+typedef HANDLE thread_id_t;
+typedef HANDLE mgr_mutex_t;
+typedef HANDLE cond_var_t;
+
+typedef COND_WAITERS_TABLE *waiter_t;
+typedef WSABUF db_iovec_t;
+#else
+typedef int socket_t;
+typedef pthread_t thread_id_t;
+typedef pthread_mutex_t mgr_mutex_t;
+typedef pthread_cond_t cond_var_t;
+typedef pthread_cond_t waiter_t;
+typedef struct iovec db_iovec_t;
+#endif
+
+/*
+ * The (arbitrary) maximum number of outgoing messages we're willing to hold, on
+ * a queue per connection, waiting for TCP buffer space to become available in
+ * the kernel. Rather than exceeding this limit, we simply discard additional
+ * messages (since this is always allowed by the replication protocol).
+ * As a special dispensation, if a message is destined for a specific remote
+ * site (i.e., it's not a broadcast), then we first try blocking the sending
+ * thread, waiting for space to become available (though we only wait a limited
+ * time). This lets us handle the immediate flood of (a potentially large
+ * number of) outgoing messages that replication generates, in a tight loop,
+ * when handling PAGE_REQ, LOG_REQ and ALL_REQ requests.
+ */
+#define OUT_QUEUE_LIMIT 10
+
+/*
+ * The system value is available from sysconf(_SC_HOST_NAME_MAX).
+ * Historically, the maximum host name length was 256.
+ */
+#ifndef MAXHOSTNAMELEN
+#define MAXHOSTNAMELEN 256
+#endif
+
+/* A buffer big enough for the string "site host.domain.com:65535". */
+#define MAX_SITE_LOC_STRING (MAXHOSTNAMELEN+20)
+typedef char SITE_STRING_BUFFER[MAX_SITE_LOC_STRING+1];
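+
+/*
+ * Illustrative sketch (not from the original source): the buffer is sized so
+ * that rendering a host:port pair cannot overflow; the site pointer and field
+ * access shown are hypothetical.
+ *
+ *	SITE_STRING_BUFFER buffer;
+ *
+ *	(void)snprintf(buffer, sizeof(buffer), "site %s:%u",
+ *	    site->net_addr.host, (u_int)site->net_addr.port);
+ */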
+
+#define MAX_MSG_BUF (__REPMGR_MAXMSG_SIZE + MAXHOSTNAMELEN + 1)
+
+/* Default timeout values, in microseconds. */
+#define DB_REPMGR_DEFAULT_ACK_TIMEOUT (1 * US_PER_SEC)
+#define DB_REPMGR_DEFAULT_CONNECTION_RETRY (30 * US_PER_SEC)
+#define DB_REPMGR_DEFAULT_ELECTION_RETRY (10 * US_PER_SEC)
+#define DB_REPMGR_DEFAULT_CHANNEL_TIMEOUT (5 * US_PER_SEC)
+
+typedef TAILQ_HEAD(__repmgr_conn_list, __repmgr_connection) CONNECTION_LIST;
+typedef STAILQ_HEAD(__repmgr_out_q_head, __queued_output) OUT_Q_HEADER;
+typedef TAILQ_HEAD(__repmgr_retry_q, __repmgr_retry) RETRY_Q_HEADER;
+
+/* Information about threads managed by Replication Framework. */
+struct __repmgr_runnable {
+ ENV *env;
+ thread_id_t thread_id;
+ void *(*run) __P((void *));
+ int finished; /* Boolean: thread is exiting, may be joined. */
+ int quit_requested; /* Boolean: thread has been asked to quit. */
+#ifdef DB_WIN32
+ HANDLE quit_event;
+#endif
+ union {
+
+/*
+ * Options governing requested behavior of election thread.
+ */
+#define ELECT_F_EVENT_NOTIFY 0x01 /* Notify application of master failure. */
+#define ELECT_F_FAST 0x02 /* First election "fast" (n-1 trick). */
+#define ELECT_F_IMMED 0x04 /* Start with immediate election. */
+#define ELECT_F_INVITEE 0x08 /* Honor (remote) inviter's nsites. */
+#define ELECT_F_STARTUP 0x10 /* Observe repmgr_start() policy. */
+ u_int32_t flags;
+
+ int eid; /* For Connector thread. */
+
+ /*
+ * Args for other thread types can be added here in the future
+ * as needed.
+ */
+ } args;
+};
+
+/*
+ * Information about pending connection establishment retry operations.
+ *
+ * We keep these in order by time. This works, under the assumption that the
+ * DB_REP_CONNECTION_RETRY timeout never changes once we get going (though that
+ * assumption is of course wrong, so this needs to be fixed).
+ *
+ * Usually, we put things onto the tail end of the list. But when we add a new
+ * site while threads are running, we trigger its first connection attempt by
+ * scheduling a retry for "0" microseconds from now, putting its retry element
+ * at the head of the list instead.
+ *
+ * TODO: I think this can be fixed by defining "time" to be the time the element
+ * was added (with some convention like "0" meaning immediate), rather than the
+ * deadline time.
+ */
+struct __repmgr_retry {
+ TAILQ_ENTRY(__repmgr_retry) entries;
+ int eid;
+ db_timespec time;
+};
+
+/*
+ * We use scatter/gather I/O for both reading and writing. Repmgr messages
+ * (including rep messages) use 3 segments: envelope, control and rec.
+ * Application messages can have any number of segments (the number they
+ * specify, plus 1 for our envelope). REPMGR_IOVECS_ALLOC_SZ should (only) be
+ * used when n > 3.
+ */
+#define REPMGR_IOVECS_ALLOC_SZ(n) \
+ (sizeof(REPMGR_IOVECS) + ((n) - MIN_IOVEC) * sizeof(db_iovec_t))
+typedef struct {
+ /*
+ * Index of the first iovec to be used. Initially of course this is
+ * zero. But as we progress through partial I/O transfers, it ends up
+ * pointing to the first iovec to be used on the next operation.
+ */
+ int offset;
+
+ /*
+ * Total number of pieces defined for this message; equal to the number
+ * of times add_buffer and/or add_dbt were called to populate it. We do
+ * *NOT* revise this as we go along. So subsequent I/O operations must
+ * use count-offset to get the number of active vector pieces still
+ * remaining.
+ */
+ int count;
+
+ /*
+ * Total number of bytes accounted for in all the pieces of this
+ * message. We do *NOT* revise this as we go along.
+ */
+ size_t total_bytes;
+
+#define MIN_IOVEC 3
+ db_iovec_t vectors[MIN_IOVEC]; /* Variable length array. */
+} REPMGR_IOVECS;
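+
+/*
+ * Illustrative sketch (not from the original source): allocating the
+ * variable-length struct for n > MIN_IOVEC segments and resuming after a
+ * partial transfer.  Error handling is elided, and the segment-filling
+ * calls are only named, not shown.
+ *
+ *	REPMGR_IOVECS *v;
+ *
+ *	__os_malloc(env, REPMGR_IOVECS_ALLOC_SZ(n), &v);
+ *	v->offset = 0;
+ *	v->count = 0;
+ *	v->total_bytes = 0;
+ *	... add_buffer()/add_dbt() calls fill vectors[] and bump count ...
+ *
+ *	After a partial transfer, advance v->offset past the fully
+ *	transferred vectors and adjust iov_base/iov_len of the first
+ *	partially transferred one; the next operation then starts at
+ *	&v->vectors[v->offset] with (v->count - v->offset) pieces.
+ */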
+
+typedef struct {
+ size_t length; /* number of bytes in data */
+ int ref_count; /* # of sites' send queues pointing to us */
+ u_int8_t data[1]; /* variable size data area */
+} REPMGR_FLAT;
+
+struct __queued_output {
+ STAILQ_ENTRY(__queued_output) entries;
+ REPMGR_FLAT *msg;
+ size_t offset;
+};
+
+/*
+ * The following is for input. Once we know the sizes of the pieces of an
+ * incoming message, we can create this struct (and also the data areas for the
+ * pieces themselves, in the same memory allocation). This is also the struct
+ * in which the message lives while it's waiting to be processed by message
+ * threads.
+ */
+typedef struct __repmgr_message {
+ STAILQ_ENTRY(__repmgr_message) entries;
+ __repmgr_msg_hdr_args msg_hdr;
+ union {
+ struct {
+ int originating_eid;
+ DBT control, rec;
+ } repmsg;
+ struct {
+ REPMGR_CONNECTION *conn;
+ DBT request;
+ } gmdb_msg;
+ struct {
+ /*
+ * Connection from which the message arrived; NULL if
+ * generated on the local site.
+ */
+ REPMGR_CONNECTION *conn;
+
+ DBT buf; /* for reading */
+ DBT segments[1]; /* expanded in msg th. before callbk */
+ } appmsg;
+ } v; /* Variants */
+} REPMGR_MESSAGE;
+
+typedef enum {
+ SIZES_PHASE,
+ DATA_PHASE
+} phase_t;
+
+typedef enum {
+ APP_CONNECTION,
+ REP_CONNECTION,
+ UNKNOWN_CONN_TYPE
+} conn_type_t;
+
+struct __repmgr_connection {
+ TAILQ_ENTRY(__repmgr_connection) entries;
+
+ socket_t fd;
+#ifdef DB_WIN32
+ WSAEVENT event_object;
+#endif
+
+ /*
+ * Number of other structures referring to this conn struct. This
+ * ref_count must be reduced to zero before this conn struct can be
+ * destroyed. Referents include:
+ *
+ * - the select() loop, which owns the right to do all reading, as well
+ * as the exclusive right to eventually close the socket
+ *
+ * - a "channel" that owns this APP_CONNECTION (on the originating side)
+ *
+ * - a message received on this APP_CONNECTION, queued for processing
+ *
+ * - any writer blocked on waiting for the outbound queue to drain
+ */
+ u_int32_t ref_count;
+
+ conn_type_t type;
+ u_int32_t version; /* Wire protocol version on this connection. */
+ /* (0 means not yet determined.) */
+
+/*
+ * When we make an outgoing connection, it starts in CONNECTED state. When we
+ * get the response to our version negotiation, we move to READY.
+ * For incoming connections that we accept, we start in NEGOTIATE, then to
+ * PARAMETERS, and then to READY.
+ * CONGESTED is a hierarchical substate of READY: it's just like READY, with
+ * the additional wrinkle that we don't bother waiting for the outgoing queue to
+ * drain in certain circumstances.
+ */
+#define CONN_CONGESTED 1 /* Long-lived full outgoing queue. */
+#define CONN_CONNECTED 2 /* Awaiting reply to our version negotiation. */
+#define CONN_DEFUNCT 3 /* Basically dead, awaiting clean-up. */
+#define CONN_NEGOTIATE 4 /* Awaiting version proposal. */
+#define CONN_PARAMETERS 5 /* Awaiting parameters handshake. */
+#define CONN_READY 6 /* Everything's fine. */
+ int state;
+
+ /*
+ * Input: while we're reading a message, we keep track of what phase
+ * we're in. In both phases, we use a REPMGR_IOVECS to keep track of
+ * our progress within the phase. Depending upon the message type, we
+ * end up with either a rep_message (which is a wrapper for the control
+ * and rec DBTs), or a single generic DBT.
+ * Any time we're in DATA_PHASE, it means we have already received
+ * the message header (consisting of msg_type and 2 sizes), and
+ * therefore we have allocated buffer space to read the data. (This is
+ * important for resource clean-up.)
+ */
+ phase_t reading_phase;
+ REPMGR_IOVECS iovecs;
+
+ u_int8_t msg_type;
+ u_int8_t msg_hdr_buf[__REPMGR_MSG_HDR_SIZE];
+
+ union {
+ REPMGR_MESSAGE *rep_message;
+ struct {
+ DBT cntrl, rec;
+ } repmgr_msg;
+ } input;
+
+ /*
+	 * Output: usually we simply write messages in line, in the
+ * send() function's thread. But if TCP doesn't have enough network
+ * buffer space for us when we first try it, we instead allocate some
+ * memory, and copy the message, and then send it as space becomes
+ * available in our main select() thread. In some cases, if the queue
+ * gets too long we wait until it's drained, and then append to it.
+ * This condition variable's associated mutex is the normal per-repmgr
+ * db_rep->mutex, because that mutex is always held anyway whenever the
+ * output queue is consulted.
+ */
+ OUT_Q_HEADER outbound_queue;
+ int out_queue_length;
+ cond_var_t drained;
+
+ /* =-=-=-=-= app-channel stuff =-=-=-=-= */
+ waiter_t response_waiters;
+
+ /*
+ * Array of info about pending responses to requests. This info is here
+ * (rather than on the stack of the thread calling send_request())
+ * because it provides an easy way to allocate available numbers for
+ * message tags, and also so that we can easily find the right info when
+ * we get the tag back in the msg header of the response.
+ */
+ REPMGR_RESPONSE *responses;
+ u_int32_t aresp; /* Array size. */
+	u_int32_t cur_resp;	/* Index of response currently being read. */
+
+ /* =-=-=-=-= for normal repmgr connections =-=-=-=-= */
+ /*
+ * Generally on a REP_CONNECTION type, we have an associated EID (which
+ * is an index into the sites array, by the way). When we initiate the
+ * connection ("outgoing"), we know from the start what the EID is; the
+ * connection struct is linked from the site struct. On the other hand,
+ * when we receive an incoming connection, we don't know at first what
+ * site it may be associated with (or even whether it's an
+ * APP_CONNECTION or REP_CONNECTION, for that matter). During that
+ * initial uncertain time, the eid is -1. Also, when a connection
+ * becomes defunct, but the conn struct hasn't yet been destroyed, the
+ * eid also becomes -1.
+ *
+ * The eid should be -1 if and only if the connection is on the orphans
+ * list.
+ */
+ int eid;
+
+};
+
+#define IS_READY_STATE(s) ((s) == CONN_READY || (s) == CONN_CONGESTED)
+
+#ifdef HAVE_GETADDRINFO
+typedef struct addrinfo ADDRINFO;
+typedef struct sockaddr_storage ACCEPT_ADDR;
+#else
+typedef struct sockaddr_in ACCEPT_ADDR;
+/*
+ * Some Windows platforms have getaddrinfo (Windows XP), some don't. We don't
+ * support conditional compilation in our Windows build, so we always use our
+ * own getaddrinfo implementation. Rename everything so that we don't collide
+ * with the system libraries.
+ */
+#undef AI_PASSIVE
+#define AI_PASSIVE 0x01
+#undef AI_CANONNAME
+#define AI_CANONNAME 0x02
+#undef AI_NUMERICHOST
+#define AI_NUMERICHOST 0x04
+
+typedef struct __addrinfo {
+ int ai_flags; /* AI_PASSIVE, AI_CANONNAME, AI_NUMERICHOST */
+ int ai_family; /* PF_xxx */
+ int ai_socktype; /* SOCK_xxx */
+ int ai_protocol; /* 0 or IPPROTO_xxx for IPv4 and IPv6 */
+ size_t ai_addrlen; /* length of ai_addr */
+ char *ai_canonname; /* canonical name for nodename */
+ struct sockaddr *ai_addr; /* binary address */
+ struct __addrinfo *ai_next; /* next structure in linked list */
+} ADDRINFO;
+#endif /* HAVE_GETADDRINFO */
+
+/*
+ * Unprocessed network address configuration.
+ */
+typedef struct {
+ roff_t host; /* Separately allocated copy of string. */
+ u_int16_t port; /* Stored in plain old host-byte-order. */
+} SITEADDR;
+
+/*
+ * Site information, as stored in shared region.
+ */
+typedef struct {
+ SITEADDR addr; /* Unprocessed network address of site. */
+ u_int32_t config; /* Configuration flags: peer, helper, etc. */
+ u_int32_t status; /* Group membership status. */
+} SITEINFO;
+
+/*
+ * A site address, as stored locally.
+ */
+typedef struct {
+ char *host; /* Separately allocated copy of string. */
+ u_int16_t port; /* Stored in plain old host-byte-order. */
+} repmgr_netaddr_t;
+
+/*
+ * We store site structs in a dynamically allocated, growable array, indexed by
+ * EID. We allocate EID numbers for all sites simply according to their
+ * index within this array.
+ */
+#define SITE_FROM_EID(eid) (&db_rep->sites[eid])
+#define EID_FROM_SITE(s) ((int)((s) - (&db_rep->sites[0])))
+#define IS_VALID_EID(e) ((e) >= 0)
+#define IS_KNOWN_REMOTE_SITE(e) ((e) >= 0 && ((e) != db_rep->self_eid) && \
+ (((u_int)(e)) < db_rep->site_cnt))
+#define FOR_EACH_REMOTE_SITE_INDEX(i) \
+ for ((i) = (db_rep->self_eid == 0 ? 1 : 0); \
+ ((u_int)i) < db_rep->site_cnt; \
+ (int)(++(i)) == db_rep->self_eid ? ++(i) : i)
+
+struct __repmgr_site {
+ repmgr_netaddr_t net_addr;
+
+ /*
+ * Group membership status: a copy of the status from the membership
+ * database, or the out-of-band value 0, meaning that it doesn't exist.
+ * We keep track of a "non-existent" site because the associated
+ * host/port network address is promised to be associated with the
+ * locally known EID for the life of the environment.
+ */
+ u_int32_t membership; /* Status flags from GMDB. */
+ u_int32_t config; /* Flags from site->set_config() */
+
+ /*
+ * Everything below here is applicable only to remote sites.
+ */
+ DB_LSN max_ack; /* Best ack we've heard from this site. */
+ int ack_policy; /* Or 0 if unknown. */
+ u_int16_t alignment; /* Requirements for app channel msgs. */
+ db_timespec last_rcvd_timestamp;
+
+	/* Contents depend on state. */
+ struct {
+ struct { /* when CONNECTED */
+ /*
+ * The only time we ever have two connections is in case
+ * of a "collision" on the "server" side. In that case,
+ * the incoming connection either will be closed
+ * promptly by the remote "client", or it is a half-open
+ * connection due to the remote client system having
+ * crashed and rebooted, in which case KEEPALIVE will
+ * eventually clear it.
+ */
+ REPMGR_CONNECTION *in; /* incoming connection */
+ REPMGR_CONNECTION *out; /* outgoing connection */
+ } conn;
+ REPMGR_RETRY *retry; /* when PAUSING */
+ /* Unused when CONNECTING. */
+ } ref;
+
+ /*
+ * Subordinate connections (connections from subordinate processes at a
+ * multi-process site). Note that the SITE_CONNECTED state, and all the
+ * ref.retry stuff above is irrelevant to subordinate connections. If a
+ * connection is on this list, it exists; and we never bother trying to
+ * reconnect lost connections (indeed we can't, for these are always
+ * incoming-only).
+ */
+ CONNECTION_LIST sub_conns;
+ REPMGR_RUNNABLE *connector; /* Thread to open a connection. */
+
+#define SITE_CONNECTED 1 /* We have a (main) connection. */
+#define SITE_CONNECTING 2 /* Trying to establish (main) connection. */
+#define SITE_IDLE 3 /* Doing nothing. */
+#define SITE_PAUSING 4 /* Waiting until time to retry connecting. */
+ int state;
+
+#define SITE_HAS_PRIO 0x01 /* Set if "electable" flag bit is valid. */
+#define SITE_ELECTABLE 0x02
+#define SITE_TOUCHED 0x04 /* Seen GMDB record during present scan. */
+ u_int32_t flags;
+};
+
+/*
+ * Flag values for the public DB_SITE handle.
+ */
+#define DB_SITE_PREOPEN 0x01 /* Provisional EID; may change at env open. */
+
+struct __repmgr_response {
+ DBT dbt;
+ int ret;
+
+#define RESP_COMPLETE 0x01
+#define RESP_DUMMY_BUF 0x02
+#define RESP_IN_USE 0x04
+#define RESP_READING 0x08
+#define RESP_THREAD_WAITING 0x10
+ u_int32_t flags;
+};
+
+/*
+ * Private structure for managing comms "channels." This is separate from
+ * DB_CHANNEL so as to avoid dragging other private structures (e.g.,
+ * REPMGR_CONNECTION) into db.h, similar to the relationship between DB_ENV and
+ * ENV.
+ */
+struct __channel {
+ DB_CHANNEL *db_channel;
+ ENV *env;
+
+ union {
+ /* For simple, specific-EID channels. */
+ REPMGR_CONNECTION *conn;
+
+ /* For EID_MASTER or EID_BROADCAST channels. */
+ struct {
+ mgr_mutex_t *mutex; /* For connection establishment. */
+ REPMGR_CONNECTION **array;
+ u_int32_t cnt;
+ } conns;
+ } c;
+ REPMGR_MESSAGE *msg; /* Incoming channel only; NULL otherwise. */
+ int responded; /* Boolean flag. */
+ __repmgr_msg_metadata_args *meta;
+
+ /* Used only in send-to-self request case. */
+ struct __repmgr_response response;
+};
+
+/*
+ * Repmgr keeps track of references to connection information (instances
+ * of struct __repmgr_connection). There are three kinds of places
+ * connections may be found: (1) SITE->ref.conn, (2) SITE->sub_conns, and
+ * (3) db_rep->connections.
+ *
+ * 1. SITE->ref.conn points to our connection with the main process running
+ * at the given site, if such a connection exists. We may have initiated
+ * the connection to the site ourselves, or we may have received it as an
+ * incoming connection. Once it is established there is very little
+ * difference between those two cases.
+ *
+ * 2. SITE->sub_conns is a list of connections we have with subordinate
+ * processes running at the given site. There can be any number of these
+ * connections, one per subordinate process. Note that these connections
+ * are always incoming: there's no way for us to initiate this kind of
+ * connection because subordinate processes do not "listen".
+ *
+ * 3. The db_rep->connections list contains the references to any
+ * connections that are not actively associated with any site (we
+ *    sometimes call these "orphans"). There are two times when this can
+ *    happen:
+ *
+ * a) When we accept an incoming connection, we don't know what site it
+ * comes from until we read the initial handshake message.
+ *
+ * b) When an error occurs on a connection, we first mark it as DEFUNCT
+ * and stop using it. Then, at a later, well-defined time, we close
+ * the connection's file descriptor and get rid of the connection
+ * struct.
+ *
+ * In light of the above, we can see that the following describes the
+ * rules for how connections may be moved among these three kinds of
+ * "places":
+ *
+ * - when we initiate an outgoing connection, we of course know what site
+ *   it's destined for, and so we immediately put the pointer to
+ * the connection struct into SITE->ref.conn
+ *
+ * - when we accept an incoming connection, we don't immediately know
+ * whom it's from, so we have to put it on the orphans list
+ * (db_rep->connections).
+ *
+ * - (incoming, cont.) But as soon as we complete the initial "handshake"
+ * message exchange, we will know which site it's from and whether it's
+ * a subordinate or main connection. At that point we remove it from
+ * db_rep->connections and either point to it by SITE->ref.conn, or add
+ * it to the SITE->sub_conns list.
+ *
+ * - (for any active connection) when an error occurs, we move the
+ * connection to the orphans list until we have a chance to close it.
+ */
+
+/*
+ * Repmgr message formats.
+ *
+ * Declarative definitions of current message formats appear in repmgr.msg.
+ * (The s_message/gen_msg.awk utility generates C code.) In general, we send
+ * the buffers marshaled from those structure formats in the "control" portion
+ * of a message.
+ *
+ * Each message is prefaced by a 9-byte message header (as described in
+ * repmgr_net.c). Different message types use the two available 32-bit integers
+ * in different ways, as codified here:
+ */
+#define REPMGR_HDR1(hdr) ((hdr).word1)
+#define REPMGR_HDR2(hdr) ((hdr).word2)
+
+/* REPMGR_APP_MESSAGE */
+#define APP_MSG_BUFFER_SIZE REPMGR_HDR1
+#define APP_MSG_SEGMENT_COUNT REPMGR_HDR2
+
+/* REPMGR_REP_MESSAGE and the other traditional repmgr message types. */
+#define REP_MSG_CONTROL_SIZE REPMGR_HDR1
+#define REP_MSG_REC_SIZE REPMGR_HDR2
+
+/* REPMGR_APP_RESPONSE */
+#define APP_RESP_BUFFER_SIZE REPMGR_HDR1
+#define APP_RESP_TAG REPMGR_HDR2
+
+/* REPMGR_RESP_ERROR. Note that a zero-length message body is implied. */
+#define RESP_ERROR_CODE REPMGR_HDR1
+#define RESP_ERROR_TAG REPMGR_HDR2
+
+/* REPMGR_OWN_MSG */
+#define REPMGR_OWN_BUF_SIZE REPMGR_HDR1
+#define REPMGR_OWN_MSG_TYPE REPMGR_HDR2
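+
+/*
+ * Illustrative sketch (not from the original source): once the 9-byte header
+ * of an incoming REPMGR_APP_MESSAGE has been unmarshaled into a header struct,
+ * the aliases above name its two words; variable names are hypothetical.
+ *
+ *	__repmgr_msg_hdr_args msg_hdr;
+ *	u_int32_t buf_size, nsegs;
+ *
+ *	... unmarshal the header bytes into msg_hdr ...
+ *	buf_size = APP_MSG_BUFFER_SIZE(msg_hdr);
+ *	nsegs = APP_MSG_SEGMENT_COUNT(msg_hdr);
+ */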
+
+/*
+ * Flags for the handshake message. As with repmgr message types, these values
+ * are transmitted between sites, and must therefore be "frozen" permanently.
+ * Names are alphabetized here for easy reference, but values reflect historical
+ * usage.
+ */
+#define APP_CHANNEL_CONNECTION 0x02 /* Connection used for app channel. */
+#define ELECTABLE_SITE 0x04
+#define REPMGR_SUBORDINATE 0x01 /* This is a subordinate connection. */
+
+/*
+ * Flags for application-message meta-data.
+ */
+#define REPMGR_MULTI_RESP 0x01
+#define REPMGR_REQUEST_MSG_TYPE 0x02
+#define REPMGR_RESPONSE_LIMIT 0x04
+
+/*
+ * Legacy V1 handshake message format. For compatibility, we send this as part
+ * of version negotiation upon connection establishment.
+ */
+typedef struct {
+ u_int32_t version;
+ u_int16_t port;
+ u_int32_t priority;
+} DB_REPMGR_V1_HANDSHAKE;
+
+/*
+ * Storage formats.
+ *
+ * As with message formats, stored formats are defined in repmgr.msg.
+ */
+/*
+ * Flags for the Group Membership data portion of a record. Like message type
+ * codes, these values are frozen across releases, in order to avoid pointless
+ * churn.
+ */
+#define SITE_ADDING 0x01
+#define SITE_DELETING 0x02
+#define SITE_PRESENT 0x04
+
+/*
+ * Message types whose processing could take a long time. We're careful to
+ * avoid using up all our message processing threads on these message types, so
+ * that we don't starve out the more important rep messages.
+ */
+#define IS_DEFERRABLE(t) ((t) == REPMGR_OWN_MSG || (t) == REPMGR_APP_MESSAGE)
+/*
+ * When using leases there are times when a thread processing a message
+ * must block, waiting for leases to be refreshed. But refreshing the
+ * leases requires another thread to accept the lease grant messages.
+ */
+#define RESERVED_MSG_TH(env) (IS_USING_LEASES(env) ? 2 : 1)
+
+#define IS_SUBORDINATE(db_rep) (db_rep->listen_fd == INVALID_SOCKET)
+
+#define IS_PEER_POLICY(p) ((p) == DB_REPMGR_ACKS_ALL_PEERS || \
+ (p) == DB_REPMGR_ACKS_QUORUM || \
+ (p) == DB_REPMGR_ACKS_ONE_PEER)
+
+/*
+ * Most of the code in repmgr runs while holding repmgr's main mutex, which
+ * resides in db_rep->mutex. This mutex is owned by a single repmgr process,
+ * and serializes access to the (large) critical sections among threads in the
+ * process. Unlike many other mutexes in DB, it is specifically coded as either
+ * a POSIX threads mutex or a Win32 mutex. Note that although it's a large
+ * fraction of the code, it's a tiny fraction of the time: repmgr spends most of
+ * its time in a call to select(), and a bit in calls into the Base
+ * replication API. All of those release the mutex.
+ * Access to repmgr's shared list of site addresses is protected by
+ * another mutex: mtx_repmgr. And, when changing space allocation for that site
+ * list we conform to the convention of acquiring renv->mtx_regenv. These are
+ * of course acquired less frequently.
+ * When it's necessary to acquire more than one of these mutexes, the
+ * ordering priority (or "lock ordering protocol") is:
+ * db_rep->mutex (first)
+ * mtx_repmgr (briefly)
+ * mtx_regenv (last, and most briefly)
+ *
+ * There are also mutexes for app message "channels". Each channel has a mutex,
+ * which is used to serialize any connection re-establishment that may become
+ * necessary during its lifetime (such as when a master changes). This never
+ * happens on a simple, specific-EID channel, but in other cases multiple app
+ * threads could be making send_xxx() calls concurrently, and it would not do to
+ * have two of them try to re-connect concurrently.
+ * When re-establishing a connection, the channel lock is held while
+ * grabbing first the mtx_repmgr, and then the db_rep mutex (but not both
+ * together). I.e., we have:
+ * channel->mutex (first)
+ * [mtx_repmgr (very briefly)] and then [db_rep->mutex (very briefly)]
+ */
+
+#define LOCK_MUTEX(m) do { \
+ if (__repmgr_lock_mutex(m) != 0) \
+ return (DB_RUNRECOVERY); \
+} while (0)
+
+#define UNLOCK_MUTEX(m) do { \
+ if (__repmgr_unlock_mutex(m) != 0) \
+ return (DB_RUNRECOVERY); \
+} while (0)
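+
+/*
+ * Illustrative sketch (not from the original source): a thread that needs
+ * both the main repmgr mutex and the site-list mutex follows the lock
+ * ordering protocol described above, releasing in reverse order.  The
+ * rep->mtx_repmgr handle shown is an assumed spelling of the mtx_repmgr
+ * mentioned in the text.
+ *
+ *	LOCK_MUTEX(db_rep->mutex);
+ *	MUTEX_LOCK(env, rep->mtx_repmgr);
+ *	... consult the shared list of site addresses ...
+ *	MUTEX_UNLOCK(env, rep->mtx_repmgr);
+ *	UNLOCK_MUTEX(db_rep->mutex);
+ */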
+
+/* POSIX/Win32 socket (and other) portability. */
+#ifdef DB_WIN32
+#define WOULDBLOCK WSAEWOULDBLOCK
+#undef DB_REPMGR_EAGAIN
+
+#define net_errno WSAGetLastError()
+typedef int socklen_t;
+typedef char * sockopt_t;
+#define sendsocket(s, buf, len, flags) send((s), (buf), (int)(len), (flags))
+
+#define iov_len len
+#define iov_base buf
+
+typedef DWORD threadsync_timeout_t;
+
+#define REPMGR_INITED(db_rep) (db_rep->signaler != NULL)
+#else
+
+#define INVALID_SOCKET -1
+#define SOCKET_ERROR -1
+#define WOULDBLOCK EWOULDBLOCK
+#define DB_REPMGR_EAGAIN EAGAIN
+
+#define net_errno errno
+typedef void * sockopt_t;
+
+#define sendsocket(s, buf, len, flags) send((s), (buf), (len), (flags))
+#define closesocket(fd) close(fd)
+
+typedef struct timespec threadsync_timeout_t;
+
+#define REPMGR_INITED(db_rep) (db_rep->read_pipe >= 0)
+#endif
+
+#define SELECTOR_RUNNING(db_rep) ((db_rep)->selector != NULL)
+
+/*
+ * Generic definition of some action to be performed on each connection, in the
+ * form of a call-back function.
+ */
+typedef int (*CONNECTION_ACTION) __P((ENV *, REPMGR_CONNECTION *, void *));
+
+/*
+ * Generic predicate to test a condition that a thread is waiting for.
+ */
+typedef int (*PREDICATE) __P((ENV *, void *));
+
+#include "dbinc_auto/repmgr_ext.h"
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_DB_REPMGR_H_ */
diff --git a/src/dbinc/shqueue.h b/src/dbinc/shqueue.h
new file mode 100644
index 00000000..22464462
--- /dev/null
+++ b/src/dbinc/shqueue.h
@@ -0,0 +1,410 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_SHQUEUE_H_
+#define _DB_SHQUEUE_H_
+
+/*
+ * This file defines three types of data structures: chains, lists and
+ * tail queues, similar to those in the include file <sys/queue.h>.
+ *
+ * The difference is that this set of macros can be used for structures that
+ * reside in shared memory that may be mapped at different addresses in each
+ * process. In most cases, the macros for shared structures exactly mirror
+ * the normal macros, although the macro calls require an additional type
+ * parameter, which the standard macros need only in HEAD and ENTRY.
+ *
+ * Since we use relative offsets of type ssize_t rather than pointers, 0
+ * (aka NULL) is a valid offset and cannot be used to indicate the end
+ * of a list. Therefore, we use -1 to indicate end of list.
+ *
+ * The macros ending in "P" return pointers without checking for end or
+ * beginning of lists, the others check for end of list and evaluate to
+ * either a pointer or NULL.
+ *
+ * For details on the use of these macros, see the queue(3) manual page.
+ */
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#define SH_PTR_TO_OFF(src, dest) \
+ ((db_ssize_t)(((u_int8_t *)(dest)) - ((u_int8_t *)(src))))
+
+#define SH_OFF_TO_PTR(base, off, type) \
+ ((type *) (((u_int8_t *)(base)) + (db_ssize_t) (off)))
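+
+/*
+ * Illustrative sketch (hypothetical type, not from the original source):
+ * because offsets are stored relative to the referencing structure, the
+ * same linkage is valid in every process, whatever address the shared
+ * region is mapped at.
+ *
+ *	struct myelem {
+ *		SH_CHAIN_ENTRY field;
+ *	} *a, *b, *b2;
+ *
+ *	a->field.sce_next = SH_PTR_TO_OFF(a, b);
+ *	b2 = SH_OFF_TO_PTR(a, a->field.sce_next, struct myelem);
+ *	... b2 == b holds no matter where the region is mapped ...
+ */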
+
+/*
+ * Shared memory chain definitions.
+ */
+#define SH_CHAIN_ENTRY \
+struct { \
+ db_ssize_t sce_next; /* relative offset to next element */ \
+ db_ssize_t sce_prev; /* relative offset of prev element */ \
+}
+
+#define SH_CHAIN_INIT(elm, field) \
+ (elm)->field.sce_next = (elm)->field.sce_prev = -1
+
+#define SH_CHAIN_HASNEXT(elm, field) ((elm)->field.sce_next != -1)
+#define SH_CHAIN_NEXTP(elm, field, type) \
+ ((struct type *)((u_int8_t *)(elm) + (elm)->field.sce_next))
+#define SH_CHAIN_NEXT(elm, field, type) (SH_CHAIN_HASNEXT(elm, field) ? \
+ SH_CHAIN_NEXTP(elm, field, type) : (struct type *)NULL)
+
+#define SH_CHAIN_HASPREV(elm, field) ((elm)->field.sce_prev != -1)
+#define SH_CHAIN_PREVP(elm, field, type) \
+ ((struct type *)((u_int8_t *)(elm) + (elm)->field.sce_prev))
+#define SH_CHAIN_PREV(elm, field, type) (SH_CHAIN_HASPREV(elm, field) ? \
+ SH_CHAIN_PREVP(elm, field, type) : (struct type *)NULL)
+
+#define SH_CHAIN_SINGLETON(elm, field) \
+ (!(SH_CHAIN_HASNEXT(elm, field) || SH_CHAIN_HASPREV(elm, field)))
+
+#define SH_CHAIN_INSERT_AFTER(listelm, elm, field, type) do { \
+ struct type *__next = SH_CHAIN_NEXT(listelm, field, type); \
+ if (__next != NULL) { \
+ (elm)->field.sce_next = SH_PTR_TO_OFF(elm, __next); \
+ __next->field.sce_prev = SH_PTR_TO_OFF(__next, elm); \
+ } else \
+ (elm)->field.sce_next = -1; \
+ (elm)->field.sce_prev = SH_PTR_TO_OFF(elm, listelm); \
+ (listelm)->field.sce_next = SH_PTR_TO_OFF(listelm, elm); \
+} while (0)
+
+#define SH_CHAIN_INSERT_BEFORE(listelm, elm, field, type) do { \
+ struct type *__prev = SH_CHAIN_PREV(listelm, field, type); \
+ if (__prev != NULL) { \
+ (elm)->field.sce_prev = SH_PTR_TO_OFF(elm, __prev); \
+ __prev->field.sce_next = SH_PTR_TO_OFF(__prev, elm); \
+ } else \
+ (elm)->field.sce_prev = -1; \
+ (elm)->field.sce_next = SH_PTR_TO_OFF(elm, listelm); \
+ (listelm)->field.sce_prev = SH_PTR_TO_OFF(listelm, elm); \
+} while (0)
+
+#define SH_CHAIN_REMOVE(elm, field, type) do { \
+ struct type *__prev = SH_CHAIN_PREV(elm, field, type); \
+ struct type *__next = SH_CHAIN_NEXT(elm, field, type); \
+ if (__next != NULL) \
+ __next->field.sce_prev = (__prev == NULL) ? -1 : \
+ SH_PTR_TO_OFF(__next, __prev); \
+ if (__prev != NULL) \
+ __prev->field.sce_next = (__next == NULL) ? -1 : \
+ SH_PTR_TO_OFF(__prev, __next); \
+ SH_CHAIN_INIT(elm, field); \
+} while (0)
+
+/*
+ * Shared memory list definitions.
+ */
+#define SH_LIST_HEAD(name) \
+struct name { \
+ db_ssize_t slh_first; /* first element */ \
+}
+
+#define SH_LIST_HEAD_INITIALIZER(head) \
+ { -1 }
+
+#define SH_LIST_ENTRY \
+struct { \
+ db_ssize_t sle_next; /* relative offset to next element */ \
+ db_ssize_t sle_prev; /* relative offset of prev element */ \
+}
+
+/*
+ * Shared memory list functions.
+ */
+#define SH_LIST_EMPTY(head) \
+ ((head)->slh_first == -1)
+
+#define SH_LIST_FIRSTP(head, type) \
+ ((struct type *)(((u_int8_t *)(head)) + (head)->slh_first))
+
+#define SH_LIST_FIRST(head, type) \
+ (SH_LIST_EMPTY(head) ? NULL : \
+ ((struct type *)(((u_int8_t *)(head)) + (head)->slh_first)))
+
+#define SH_LIST_NEXTP(elm, field, type) \
+ ((struct type *)(((u_int8_t *)(elm)) + (elm)->field.sle_next))
+
+#define SH_LIST_NEXT(elm, field, type) \
+ ((elm)->field.sle_next == -1 ? NULL : \
+ ((struct type *)(((u_int8_t *)(elm)) + (elm)->field.sle_next)))
+
+ /*
+ * __SH_LIST_PREV_OFF is private API. It calculates the address of
+ * the elm->field.sle_next member of a SH_LIST structure. All offsets
+ * between elements are relative to that point in SH_LIST structures.
+ */
+#define __SH_LIST_PREV_OFF(elm, field) \
+ ((db_ssize_t *)(((u_int8_t *)(elm)) + (elm)->field.sle_prev))
+
+#define SH_LIST_PREV(elm, field, type) \
+ (struct type *)((db_ssize_t)(elm) - (*__SH_LIST_PREV_OFF(elm, field)))
+
+#define SH_LIST_FOREACH(var, head, field, type) \
+ for ((var) = SH_LIST_FIRST((head), type); \
+ (var) != NULL; \
+ (var) = SH_LIST_NEXT((var), field, type))
+
+/*
+ * Given a correct A.next in a list [A, B]: B.prev = SH_LIST_NEXT_TO_PREV(A).
+ * The prev value is always the offset from an element to its preceding
+ * element's next location, not to the beginning of the structure. To get
+ * to the beginning of an element structure in memory, given an element,
+ * do the following:
+ *	A = B - (B.prev + (&B.next - B))
+ * SH_LIST_NEXT_TO_PREV takes the element's next pointer and calculates what
+ * the corresponding prev value should be: the negation of next plus the
+ * offset of the next field within the structure.
+ */
+#define SH_LIST_NEXT_TO_PREV(elm, field) \
+ (((elm)->field.sle_next == -1 ? 0 : -(elm)->field.sle_next) + \
+ SH_PTR_TO_OFF(elm, &(elm)->field.sle_next))
+
+#define SH_LIST_INIT(head) (head)->slh_first = -1
+
+#define SH_LIST_INSERT_BEFORE(head, listelm, elm, field, type) do { \
+ if (listelm == SH_LIST_FIRST(head, type)) { \
+ SH_LIST_INSERT_HEAD(head, elm, field, type); \
+ } else { \
+ (elm)->field.sle_next = SH_PTR_TO_OFF(elm, listelm); \
+ (elm)->field.sle_prev = SH_LIST_NEXT_TO_PREV( \
+ SH_LIST_PREV((listelm), field, type), field) + \
+ (elm)->field.sle_next; \
+ (SH_LIST_PREV(listelm, field, type))->field.sle_next = \
+ (SH_PTR_TO_OFF((SH_LIST_PREV(listelm, field, \
+ type)), elm)); \
+ (listelm)->field.sle_prev = SH_LIST_NEXT_TO_PREV(elm, field); \
+ } \
+} while (0)
+
+#define SH_LIST_INSERT_AFTER(listelm, elm, field, type) do { \
+ if ((listelm)->field.sle_next != -1) { \
+ (elm)->field.sle_next = SH_PTR_TO_OFF(elm, \
+ SH_LIST_NEXTP(listelm, field, type)); \
+ SH_LIST_NEXTP(listelm, field, type)->field.sle_prev = \
+ SH_LIST_NEXT_TO_PREV(elm, field); \
+ } else \
+ (elm)->field.sle_next = -1; \
+ (listelm)->field.sle_next = SH_PTR_TO_OFF(listelm, elm); \
+ (elm)->field.sle_prev = SH_LIST_NEXT_TO_PREV(listelm, field); \
+} while (0)
+
+#define SH_LIST_INSERT_HEAD(head, elm, field, type) do { \
+ if ((head)->slh_first != -1) { \
+ (elm)->field.sle_next = \
+ (head)->slh_first - SH_PTR_TO_OFF(head, elm); \
+ SH_LIST_FIRSTP(head, type)->field.sle_prev = \
+ SH_LIST_NEXT_TO_PREV(elm, field); \
+ } else \
+ (elm)->field.sle_next = -1; \
+ (head)->slh_first = SH_PTR_TO_OFF(head, elm); \
+ (elm)->field.sle_prev = SH_PTR_TO_OFF(elm, &(head)->slh_first); \
+} while (0)
+
+#define SH_LIST_REMOVE(elm, field, type) do { \
+ if ((elm)->field.sle_next != -1) { \
+ SH_LIST_NEXTP(elm, field, type)->field.sle_prev = \
+ (elm)->field.sle_prev - (elm)->field.sle_next; \
+ *__SH_LIST_PREV_OFF(elm, field) += (elm)->field.sle_next;\
+ } else \
+ *__SH_LIST_PREV_OFF(elm, field) = -1; \
+} while (0)
+
+#define SH_LIST_REMOVE_HEAD(head, field, type) do { \
+ if (!SH_LIST_EMPTY(head)) { \
+ SH_LIST_REMOVE(SH_LIST_FIRSTP(head, type), field, type);\
+ } \
+} while (0)
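+
+/*
+ * Illustrative sketch (hypothetical types, not from the original source):
+ * declaring and walking a shared-memory list mirrors <sys/queue.h>, except
+ * that each call also names the element type.  Both the head and the
+ * elements must live in the same shared region for offsets to make sense.
+ *
+ *	struct myelem {
+ *		SH_LIST_ENTRY links;
+ *		int value;
+ *	};
+ *	SH_LIST_HEAD(myhead);
+ *	struct myhead *head;
+ *	struct myelem *ep;
+ *
+ *	SH_LIST_INIT(head);
+ *	SH_LIST_INSERT_HEAD(head, ep, links, myelem);
+ *	SH_LIST_FOREACH(ep, head, links, myelem)
+ *		... use ep->value ...
+ */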
+
+/*
+ * Shared memory tail queue definitions.
+ */
+#define SH_TAILQ_HEAD(name) \
+struct name { \
+ db_ssize_t stqh_first; /* relative offset of first element */ \
+ db_ssize_t stqh_last; /* relative offset of last's next */ \
+}
+
+#define SH_TAILQ_HEAD_INITIALIZER(head) \
+ { -1, 0 }
+
+#define SH_TAILQ_ENTRY \
+struct { \
+ db_ssize_t stqe_next; /* relative offset of next element */ \
+ db_ssize_t stqe_prev; /* relative offset of prev's next */ \
+}
+
+/*
+ * Shared memory tail queue functions.
+ */
+
+#define SH_TAILQ_EMPTY(head) \
+ ((head)->stqh_first == -1)
+
+#define SH_TAILQ_FIRSTP(head, type) \
+ ((struct type *)((u_int8_t *)(head) + (head)->stqh_first))
+
+#define SH_TAILQ_FIRST(head, type) \
+ (SH_TAILQ_EMPTY(head) ? NULL : SH_TAILQ_FIRSTP(head, type))
+
+#define SH_TAILQ_NEXTP(elm, field, type) \
+ ((struct type *)((u_int8_t *)(elm) + (elm)->field.stqe_next))
+
+#define SH_TAILQ_NEXT(elm, field, type) \
+ ((elm)->field.stqe_next == -1 ? NULL : \
+ ((struct type *)((u_int8_t *)(elm) + (elm)->field.stqe_next)))
+
+ /*
+ * __SH_TAILQ_PREV_OFF is private API. It calculates the address of
+ * the elm->field.stqe_next member of a SH_TAILQ structure. All
+ * offsets between elements are relative to that point in SH_TAILQ
+ * structures.
+ */
+#define __SH_TAILQ_PREV_OFF(elm, field) \
+ ((db_ssize_t *)(((u_int8_t *)(elm)) + (elm)->field.stqe_prev))
+
+#define SH_TAILQ_PREVP(elm, field, type) \
+ (struct type *)((db_ssize_t)elm - (*__SH_TAILQ_PREV_OFF(elm, field)))
+
+#define SH_TAILQ_PREV(head, elm, field, type) \
+ (((elm) == SH_TAILQ_FIRST(head, type)) ? NULL : \
+ (struct type *)((db_ssize_t)elm - (*__SH_TAILQ_PREV_OFF(elm, field))))
+
+ /*
+ * __SH_TAILQ_LAST_OFF is private API. It calculates the address of
+ * the stqe_next member of a SH_TAILQ structure in the last element
+ * of this list. All offsets between elements are relative to that
+ * point in SH_TAILQ structures.
+ */
+#define __SH_TAILQ_LAST_OFF(head) \
+ ((db_ssize_t *)(((u_int8_t *)(head)) + (head)->stqh_last))
+
+#define SH_TAILQ_LASTP(head, field, type) \
+ ((struct type *)((db_ssize_t)(head) + \
+ ((db_ssize_t)((head)->stqh_last) - \
+ ((db_ssize_t)SH_PTR_TO_OFF(SH_TAILQ_FIRST(head, type), \
+ &(SH_TAILQ_FIRSTP(head, type)->field.stqe_next))))))
+
+#define SH_TAILQ_LAST(head, field, type) \
+ (SH_TAILQ_EMPTY(head) ? NULL : SH_TAILQ_LASTP(head, field, type))
+
+/*
+ * Given a correct A.next in a list [A, B]: B.prev = SH_TAILQ_NEXT_TO_PREV(A).
+ * The prev value is always the offset from an element to its preceding
+ * element's next location, not to the beginning of the structure. To get
+ * to the beginning of an element structure in memory, given an element,
+ * do the following:
+ *	A = B - (B.prev + (&B.next - B))
+ */
+#define SH_TAILQ_NEXT_TO_PREV(elm, field) \
+ (((elm)->field.stqe_next == -1 ? 0 : \
+ (-(elm)->field.stqe_next) + \
+ SH_PTR_TO_OFF(elm, &(elm)->field.stqe_next)))
+
+#define SH_TAILQ_FOREACH(var, head, field, type) \
+ for ((var) = SH_TAILQ_FIRST((head), type); \
+ (var) != NULL; \
+ (var) = SH_TAILQ_NEXT((var), field, type))
+
+#define SH_TAILQ_FOREACH_REVERSE(var, head, field, type) \
+ for ((var) = SH_TAILQ_LAST((head), field, type); \
+ (var) != NULL; \
+ (var) = SH_TAILQ_PREV((head), (var), field, type))
+
+#define SH_TAILQ_INIT(head) { \
+ (head)->stqh_first = -1; \
+ (head)->stqh_last = SH_PTR_TO_OFF(head, &(head)->stqh_first); \
+}
+
+#define SH_TAILQ_INSERT_HEAD(head, elm, field, type) do { \
+ if ((head)->stqh_first != -1) { \
+ (elm)->field.stqe_next = \
+ (head)->stqh_first - SH_PTR_TO_OFF(head, elm); \
+ SH_TAILQ_FIRSTP(head, type)->field.stqe_prev = \
+ SH_TAILQ_NEXT_TO_PREV(elm, field); \
+ } else { \
+ (head)->stqh_last = \
+ SH_PTR_TO_OFF(head, &(elm)->field.stqe_next); \
+ (elm)->field.stqe_next = -1; \
+ } \
+ (head)->stqh_first = SH_PTR_TO_OFF(head, elm); \
+ (elm)->field.stqe_prev = \
+ SH_PTR_TO_OFF(elm, &(head)->stqh_first); \
+} while (0)
+
+#define SH_TAILQ_INSERT_TAIL(head, elm, field) do { \
+ (elm)->field.stqe_next = -1; \
+ (elm)->field.stqe_prev = \
+ -SH_PTR_TO_OFF(head, elm) + (head)->stqh_last; \
+ if ((head)->stqh_last == \
+ SH_PTR_TO_OFF((head), &(head)->stqh_first)) \
+ (head)->stqh_first = SH_PTR_TO_OFF(head, elm); \
+ else \
+ *__SH_TAILQ_LAST_OFF(head) = -(head)->stqh_last + \
+ SH_PTR_TO_OFF((elm), &(elm)->field.stqe_next) + \
+ SH_PTR_TO_OFF(head, elm); \
+ (head)->stqh_last = \
+ SH_PTR_TO_OFF(head, &((elm)->field.stqe_next)); \
+} while (0)
+
+#define SH_TAILQ_INSERT_BEFORE(head, listelm, elm, field, type) do { \
+ if (listelm == SH_TAILQ_FIRST(head, type)) { \
+ SH_TAILQ_INSERT_HEAD(head, elm, field, type); \
+ } else { \
+ (elm)->field.stqe_next = SH_PTR_TO_OFF(elm, listelm); \
+ (elm)->field.stqe_prev = SH_TAILQ_NEXT_TO_PREV( \
+ SH_TAILQ_PREVP((listelm), field, type), field) + \
+ (elm)->field.stqe_next; \
+ (SH_TAILQ_PREVP(listelm, field, type))->field.stqe_next =\
+ (SH_PTR_TO_OFF((SH_TAILQ_PREVP(listelm, field, type)), \
+ elm)); \
+ (listelm)->field.stqe_prev = \
+ SH_TAILQ_NEXT_TO_PREV(elm, field); \
+ } \
+} while (0)
+
+#define SH_TAILQ_INSERT_AFTER(head, listelm, elm, field, type) do { \
+ if ((listelm)->field.stqe_next != -1) { \
+ (elm)->field.stqe_next = (listelm)->field.stqe_next - \
+ SH_PTR_TO_OFF(listelm, elm); \
+ SH_TAILQ_NEXTP(listelm, field, type)->field.stqe_prev = \
+ SH_TAILQ_NEXT_TO_PREV(elm, field); \
+ } else { \
+ (elm)->field.stqe_next = -1; \
+ (head)->stqh_last = \
+ SH_PTR_TO_OFF(head, &(elm)->field.stqe_next); \
+ } \
+ (listelm)->field.stqe_next = SH_PTR_TO_OFF(listelm, elm); \
+ (elm)->field.stqe_prev = SH_TAILQ_NEXT_TO_PREV(listelm, field); \
+} while (0)
+
+#define SH_TAILQ_REMOVE(head, elm, field, type) do { \
+ if ((elm)->field.stqe_next != -1) { \
+ SH_TAILQ_NEXTP(elm, field, type)->field.stqe_prev = \
+ (elm)->field.stqe_prev + \
+ SH_PTR_TO_OFF(SH_TAILQ_NEXTP(elm, \
+ field, type), elm); \
+ *__SH_TAILQ_PREV_OFF(elm, field) += (elm)->field.stqe_next;\
+ } else { \
+ (head)->stqh_last = (elm)->field.stqe_prev + \
+ SH_PTR_TO_OFF(head, elm); \
+ *__SH_TAILQ_PREV_OFF(elm, field) = -1; \
+ } \
+} while (0)
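+
+/*
+ * Illustrative sketch (hypothetical types, not from the original source):
+ * tail queues additionally support appending at the tail and traversal in
+ * reverse order.
+ *
+ *	struct myelem {
+ *		SH_TAILQ_ENTRY links;
+ *		int value;
+ *	};
+ *	SH_TAILQ_HEAD(myqueue);
+ *	struct myqueue *hp;
+ *	struct myelem *ep;
+ *
+ *	SH_TAILQ_INIT(hp);
+ *	SH_TAILQ_INSERT_TAIL(hp, ep, links);
+ *	SH_TAILQ_FOREACH_REVERSE(ep, hp, links, myelem)
+ *		... use ep->value ...
+ */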
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_DB_SHQUEUE_H_ */
diff --git a/src/dbinc/tcl_db.h b/src/dbinc/tcl_db.h
new file mode 100644
index 00000000..4c56164f
--- /dev/null
+++ b/src/dbinc/tcl_db.h
@@ -0,0 +1,316 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_TCL_DB_H_
+#define _DB_TCL_DB_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#define MSG_SIZE 100 /* Message size */
+
+enum INFOTYPE {
+ I_AUX, I_DB, I_DBC, I_ENV, I_LOCK, I_LOGC, I_MP, I_NDBM, I_PG, I_SEQ, I_TXN};
+
+#define MAX_ID 8 /* Maximum number of sub-id's we need */
+#define DBTCL_PREP 64 /* Size of txn_recover preplist */
+
+#define DBTCL_DBM 1
+#define DBTCL_NDBM 2
+
+#define DBTCL_GETCLOCK 0
+#define DBTCL_GETLIMIT 1
+#define DBTCL_GETREQ 2
+
+#define DBTCL_MUT_ALIGN 0
+#define DBTCL_MUT_INCR 1
+#define DBTCL_MUT_INIT 2
+#define DBTCL_MUT_MAX 3
+#define DBTCL_MUT_TAS 4
+
+/*
+ * Data structure to record information about events that have occurred. Tcl
+ * command "env event_info" can retrieve the information. For now, we record
+ * only one occurrence per event type; "env event_info -clear" can be used to
+ * reset the info.
+ *
+ * Besides the bit flag that records the fact that an event type occurred, some
+ * event types have associated "info" and we record that here too. When new
+ * event types are invented that have associated info, we should add a field
+ * here to record that info as well, so that it can be returned to the script
+ * with the "env event_info" results.
+ */
+typedef struct dbtcl_event_info {
+ u_int32_t events; /* Bit flag on for each event fired. */
+ int panic_error;
+ int newmaster_eid;
+ int added_eid;
+ int removed_eid;
+ pid_t attached_process;
+ int connected_eid;
+ DB_REPMGR_CONN_ERR conn_broken_info;
+ DB_REPMGR_CONN_ERR conn_failed_try_info;
+ DB_LSN sync_point;
+} DBTCL_EVENT_INFO;
+
+/*
+ * Why use a home grown package over the Tcl_Hash functions?
+ *
+ * We could have implemented the stuff below without maintaining our
+ * own list manipulation, efficiently hashing it with the available
+ * Tcl functions (Tcl_CreateHashEntry, Tcl_GetHashValue, etc). I chose
+ * not to do so for these reasons:
+ *
+ * We still need the information below. Using the hashing only relieves
+ * us of needing the next/prev pointers. We still need the structure
+ * itself because we need more than one value associated with a widget.
+ * We need to keep track of parent pointers for sub-widgets (like cursors)
+ * so we can correctly close. We need to keep track of individual widget's
+ * id counters for any sub-widgets they may have. We need to be able to
+ * associate the name/client data outside the scope of the widget.
+ *
+ * So, is it better to use the hashing rather than
+ * the linear list we have now? I decided against it for the simple reason
+ * that to access the structure would require two calls. The first is
+ * Tcl_FindHashEntry(table, key) and then, once we have the entry, we'd
+ * have to do Tcl_GetHashValue(entry) to get the pointer of the structure.
+ *
+ * I believe the number of simultaneous DB widgets in existence at one time
+ * is not going to be that large (no more than several dozen), so linearly
+ * searching the list is not going to impact performance in a noticeable
+ * way. Should performance be impacted by the size of the info list, then
+ * perhaps it is time to revisit this decision.
+ */
+typedef struct dbtcl_info {
+ LIST_ENTRY(dbtcl_info) entries;
+ Tcl_Interp *i_interp;
+ char *i_name;
+ enum INFOTYPE i_type;
+ union infop {
+ DB *dbp;
+ DBC *dbcp;
+ DB_ENV *envp;
+ DB_LOCK *lock;
+ DB_LOGC *logc;
+ DB_MPOOLFILE *mp;
+ DB_TXN *txnp;
+ void *anyp;
+ } un;
+ union data {
+ int anydata;
+ db_pgno_t pgno; /* For I_MP. */
+ u_int32_t lockid; /* For I_LOCK. */
+ DBTCL_EVENT_INFO *event_info; /* For I_ENV. */
+ DB_TXN_TOKEN *commit_token; /* For I_TXN. */
+ } und;
+ union data2 {
+ int anydata;
+ int pagesz; /* For I_MP. */
+ DB_COMPACT *c_data; /* For I_DB. */
+ db_mutex_t mutex; /* Protects event_info (I_ENV). */
+ } und2;
+ DBT i_lockobj;
+ FILE *i_err;
+ char *i_errpfx;
+ FILE *i_msg;
+
+ /* Callbacks--Tcl_Objs containing proc names */
+ Tcl_Obj *i_compare;
+ Tcl_Obj *i_dupcompare;
+ Tcl_Obj *i_foreign_call;
+ Tcl_Obj *i_hashproc;
+ Tcl_Obj *i_isalive;
+ Tcl_Obj *i_part_callback;
+ Tcl_Obj *i_rep_send;
+ Tcl_Obj *i_second_call;
+
+ /* Environment ID for the i_rep_send callback. */
+ Tcl_Obj *i_rep_eid;
+
+ struct dbtcl_info *i_parent;
+ int i_otherid[MAX_ID];
+
+ /* Heap dbs have an associated recno db, and secondary db. */
+ DB *hrdbp;
+ DB *hsdbp;
+} DBTCL_INFO;
+
+#define i_anyp un.anyp
+#define i_dbp un.dbp
+#define i_dbcp un.dbcp
+#define i_envp un.envp
+#define i_lock un.lock
+#define i_logc un.logc
+#define i_mp un.mp
+#define i_pagep un.anyp
+#define i_txnp un.txnp
+
+#define i_data und.anydata
+#define i_pgno und.pgno
+#define i_locker und.lockid
+#define i_event_info und.event_info
+#define i_commit_token und.commit_token
+#define i_data2 und2.anydata
+#define i_pgsz und2.pagesz
+#define i_cdata und2.c_data
+#define i_mutex und2.mutex
+
+#define i_envtxnid i_otherid[0]
+#define i_envmpid i_otherid[1]
+#define i_envlockid i_otherid[2]
+#define i_envlogcid i_otherid[3]
+
+#define i_mppgid i_otherid[0]
+
+#define i_dbdbcid i_otherid[0]
+
+extern int __debug_on, __debug_print, __debug_stop, __debug_test;
+
+typedef struct dbtcl_global {
+ LIST_HEAD(infohead, dbtcl_info) g_infohead;
+} DBTCL_GLOBAL;
+#define __db_infohead __dbtcl_global.g_infohead
+
+extern DBTCL_GLOBAL __dbtcl_global;
+
+/*
+ * Tcl_NewStringObj takes an "int" length argument, when the typical use is to
+ * call it with a size_t length (for example, returned by strlen). Tcl is in
+ * the wrong, but that doesn't help us much -- cast the argument.
+ */
+#define NewStringObj(a, b) \
+ Tcl_NewStringObj((a), (int)(b))
+
+#define NAME_TO_DB(name) (DB *)_NameToPtr((name))
+#define NAME_TO_DBC(name) (DBC *)_NameToPtr((name))
+#define NAME_TO_ENV(name) (DB_ENV *)_NameToPtr((name))
+#define NAME_TO_LOCK(name) (DB_LOCK *)_NameToPtr((name))
+#define NAME_TO_MP(name) (DB_MPOOLFILE *)_NameToPtr((name))
+#define NAME_TO_TXN(name) (DB_TXN *)_NameToPtr((name))
+#define NAME_TO_SEQUENCE(name) (DB_SEQUENCE *)_NameToPtr((name))
+
+/*
+ * MAKE_STAT_LIST appends a {name value} pair to a result list that MUST be
+ * called 'res' and is a Tcl_Obj * in the local function. This macro also
+ * assumes a label "error" to go to in the event of a Tcl error. For stat
+ * functions this will typically go before the "free" function to free the
+ * stat structure returned by DB.
+ */
+#define MAKE_STAT_LIST(s, v) do { \
+ result = _SetListElemInt(interp, res, (s), (long)(v)); \
+ if (result != TCL_OK) \
+ goto error; \
+} while (0)
+
+#define MAKE_WSTAT_LIST(s, v) do { \
+ result = _SetListElemWideInt(interp, res, (s), (int64_t)(v)); \
+ if (result != TCL_OK) \
+ goto error; \
+} while (0)
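+
+/*
+ * Illustrative sketch (not from the original source): a typical stat
+ * function provides the 'res' object, a 'result' variable and an "error"
+ * label that the macros above rely on; 'sp' is a hypothetical stat struct.
+ *
+ *	Tcl_Obj *res;
+ *	int result;
+ *
+ *	res = Tcl_NewObj();
+ *	MAKE_STAT_LIST("Number of aborts", sp->st_naborts);
+ *	Tcl_SetObjResult(interp, res);
+ * error:
+ *	__os_ufree(env, sp);
+ */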
+
+/*
+ * MAKE_STAT_LSN appends a {name {LSNfile LSNoffset}} pair to a result list
+ * that MUST be called 'res' and is a Tcl_Obj * in the local
+ * function. This macro also assumes a label "error" to go to
+ * in the event of a Tcl error. For stat functions this will
+ * typically go before the "free" function to free the stat structure
+ * returned by DB.
+ */
+#define MAKE_STAT_LSN(s, lsn) do { \
+ myobjc = 2; \
+ myobjv[0] = Tcl_NewLongObj((long)(lsn)->file); \
+ myobjv[1] = Tcl_NewLongObj((long)(lsn)->offset); \
+ lsnlist = Tcl_NewListObj(myobjc, myobjv); \
+ myobjc = 2; \
+ myobjv[0] = Tcl_NewStringObj((s), (int)strlen(s)); \
+ myobjv[1] = lsnlist; \
+ thislist = Tcl_NewListObj(myobjc, myobjv); \
+ result = Tcl_ListObjAppendElement(interp, res, thislist); \
+ if (result != TCL_OK) \
+ goto error; \
+} while (0)
+
+/*
+ * MAKE_STAT_STRLIST appends a {name string} pair to a result list
+ * that MUST be called 'res' and is a Tcl_Obj * in the local
+ * function. This macro also assumes a label "error" to go to
+ * in the event of a Tcl error. For stat functions this will
+ * typically go before the "free" function to free the stat structure
+ * returned by DB.
+ */
+#define MAKE_STAT_STRLIST(s,s1) do { \
+ result = _SetListElem(interp, res, (s), (u_int32_t)strlen(s), \
+ (s1), (u_int32_t)strlen(s1)); \
+ if (result != TCL_OK) \
+ goto error; \
+} while (0)
+
+/*
+ * MAKE_SITE_LIST appends an {eid host port status peer} tuple to a result
+ * list that MUST be called 'res' and is a Tcl_Obj * in the local function.
+ * This macro also assumes a label "error" to go to in the event of a Tcl
+ * error.
+ */
+#define MAKE_SITE_LIST(e, h, p, s, pr) do { \
+ myobjc = 5; \
+ myobjv[0] = Tcl_NewIntObj(e); \
+ myobjv[1] = Tcl_NewStringObj((h), (int)strlen(h)); \
+ myobjv[2] = Tcl_NewIntObj((int)p); \
+ myobjv[3] = Tcl_NewStringObj((s), (int)strlen(s)); \
+ myobjv[4] = Tcl_NewStringObj((pr), (int)strlen(pr)); \
+ thislist = Tcl_NewListObj(myobjc, myobjv); \
+ result = Tcl_ListObjAppendElement(interp, res, thislist); \
+ if (result != TCL_OK) \
+ goto error; \
+} while (0)
+
+/*
+ * FLAG_CHECK checks that the given flag is not set yet.
+ * If it is, it sets up an error message.
+ */
+#define FLAG_CHECK(flag) do { \
+ if ((flag) != 0) { \
+ Tcl_SetResult(interp, \
+ " Only 1 policy can be specified.\n", \
+ TCL_STATIC); \
+ result = TCL_ERROR; \
+ break; \
+ } \
+} while (0)
+
+/*
+ * FLAG_CHECK2 checks that the given flag is not set yet or is
+ * only set to the given allowed value.
+ * If any other bits are set, it sets up an error message.
+ */
+#define FLAG_CHECK2(flag, val) do { \
+ if (((flag) & ~(val)) != 0) { \
+ Tcl_SetResult(interp, \
+ " Only 1 policy can be specified.\n", \
+ TCL_STATIC); \
+ result = TCL_ERROR; \
+ break; \
+ } \
+} while (0)
+
+/*
+ * IS_HELP checks whether the arg we bombed on is -?, which is a help option.
+ * If it is, we return TCL_OK (but leave the result set to whatever
+ * Tcl_GetIndexFromObj says, which lists all the valid options). Otherwise
+ * return TCL_ERROR.
+ */
+#define IS_HELP(s) \
+ (strcmp(Tcl_GetStringFromObj(s,NULL), "-?") == 0) ? TCL_OK : TCL_ERROR
+
+#if defined(__cplusplus)
+}
+#endif
+
+#include "dbinc_auto/tcl_ext.h"
+#endif /* !_DB_TCL_DB_H_ */
diff --git a/src/dbinc/txn.h b/src/dbinc/txn.h
new file mode 100644
index 00000000..7cbae263
--- /dev/null
+++ b/src/dbinc/txn.h
@@ -0,0 +1,288 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_TXN_H_
+#define _DB_TXN_H_
+
+#include "dbinc/xa.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* Operation parameters to the delayed commit processing code. */
+typedef enum {
+ TXN_CLOSE, /* Close a DB handle whose close had failed. */
+ TXN_REMOVE, /* Remove a file. */
+ TXN_TRADE, /* Trade lockers. */
+ TXN_TRADED, /* Already traded; downgrade lock. */
+ TXN_XTRADE /* Trade lockers on exclusive db handle. */
+} TXN_EVENT_T;
+
+struct __db_txnregion; typedef struct __db_txnregion DB_TXNREGION;
+struct __db_txn_stat_int;
+typedef struct __db_txn_stat_int DB_TXN_STAT_INT;
+struct __txn_logrec; typedef struct __txn_logrec DB_TXNLOGREC;
+
+/*
+ * !!!
+ * TXN_MINIMUM = (DB_LOCK_MAXID + 1) but this makes compilers complain.
+ */
+#define TXN_MINIMUM 0x80000000
+#define TXN_MAXIMUM 0xffffffff /* Maximum txn ID. */
+#define TXN_INVALID 0 /* Invalid transaction ID. */
+
+#define DEF_MAX_TXNS 100 /* Default max transactions. */
+#define TXN_NSLOTS 4 /* Initial slots to hold DB refs */
+
+#define TXN_PRIORITY_DEFAULT DB_LOCK_DEFPRIORITY
+
+/*
+ * This structure must contain the same fields as the __db_txn_stat struct
+ * except for any pointer fields that are filled in only when the struct is
+ * being populated for output through the API.
+ */
+DB_ALIGN8 struct __db_txn_stat_int { /* SHARED */
+ u_int32_t st_nrestores; /* number of restored transactions
+ after recovery. */
+#ifndef __TEST_DB_NO_STATISTICS
+ DB_LSN st_last_ckp; /* lsn of the last checkpoint */
+ time_t st_time_ckp; /* time of last checkpoint */
+ u_int32_t st_last_txnid; /* last transaction id given out */
+ u_int32_t st_inittxns; /* initial txns allocated */
+ u_int32_t st_maxtxns; /* maximum txns possible */
+ uintmax_t st_naborts; /* number of aborted transactions */
+ uintmax_t st_nbegins; /* number of begun transactions */
+ uintmax_t st_ncommits; /* number of committed transactions */
+ u_int32_t st_nactive; /* number of active transactions */
+ u_int32_t st_nsnapshot; /* number of snapshot transactions */
+ u_int32_t st_maxnactive; /* maximum active transactions */
+ u_int32_t st_maxnsnapshot; /* maximum snapshot transactions */
+ uintmax_t st_region_wait; /* Region lock granted after wait. */
+ uintmax_t st_region_nowait; /* Region lock granted without wait. */
+ roff_t st_regsize; /* Region size. */
+#endif
+};
+
+/*
+ * Internal data maintained in shared memory for each transaction.
+ */
+typedef struct __txn_detail {
+	u_int32_t txnid; /* Current transaction ID;
+	 also used to link the free list. */
+ pid_t pid; /* Process owning txn */
+ db_threadid_t tid; /* Thread owning txn */
+
+ DB_LSN last_lsn; /* Last LSN written for this txn. */
+ DB_LSN begin_lsn; /* LSN of begin record. */
+ roff_t parent; /* Offset of transaction's parent. */
+ roff_t name; /* Offset of txn name. */
+
+ u_int32_t nlog_dbs; /* Number of databases used. */
+ u_int32_t nlog_slots; /* Number of allocated slots. */
+ roff_t log_dbs; /* Databases used. */
+
+ DB_LSN read_lsn; /* Read LSN for MVCC. */
+ DB_LSN visible_lsn; /* LSN at which this transaction's
+ changes are visible. */
+ db_mutex_t mvcc_mtx; /* Version mutex. */
+ u_int32_t mvcc_ref; /* Number of buffers created by this
+ transaction still in cache. */
+
+ u_int32_t priority; /* Deadlock resolution priority. */
+
+ SH_TAILQ_HEAD(__tdkids) kids; /* Linked list of child txn detail. */
+ SH_TAILQ_ENTRY klinks;
+
+	/* TXN_{ABORTED, COMMITTED, PREPARED, RUNNING} */
+ u_int32_t status; /* status of the transaction */
+
+#define TXN_DTL_COLLECTED 0x01 /* collected during txn_recover */
+#define TXN_DTL_RESTORED 0x02 /* prepared txn restored */
+#define TXN_DTL_INMEMORY 0x04 /* uses in memory logs */
+#define TXN_DTL_SNAPSHOT 0x08 /* On the list of snapshot txns. */
+#define TXN_DTL_NOWAIT 0x10 /* Don't block on locks. */
+ u_int32_t flags;
+
+ SH_TAILQ_ENTRY links; /* active/free/snapshot list */
+
+ u_int32_t xa_ref; /* XA: reference count; number
+ of DB_TXNs reffing this struct */
+ /* TXN_XA_{ACTIVE, DEADLOCKED, IDLE, PREPARED, ROLLEDBACK} */
+ u_int32_t xa_br_status; /* status of XA branch */
+ u_int8_t gid[DB_GID_SIZE]; /* global transaction id */
+ u_int32_t bqual; /* bqual_length from XID */
+ u_int32_t gtrid; /* gtrid_length from XID */
+ int32_t format; /* XA format */
+ roff_t slots[TXN_NSLOTS]; /* Initial DB slot allocation. */
+} TXN_DETAIL;
+
+/*
+ * DB_TXNMGR --
+ * The transaction manager encapsulates the transaction system.
+ */
+struct __db_txnmgr {
+ /*
+ * These fields need to be protected for multi-threaded support.
+ *
+ * Lock list of active transactions (including the content of each
+ * TXN_DETAIL structure on the list).
+ */
+ db_mutex_t mutex;
+ /* List of active transactions. */
+ TAILQ_HEAD(_chain, __db_txn) txn_chain;
+
+ u_int32_t n_discards; /* Number of txns discarded. */
+
+ /* These fields are never updated after creation, so not protected. */
+ ENV *env; /* Environment. */
+ REGINFO reginfo; /* Region information. */
+};
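+
+/*
+ * Editorial sketch (not part of the original header): walking the
+ * active-transaction chain is done under the manager mutex; the variable
+ * names below are illustrative.
+ *
+ *	DB_TXN *txn;
+ *
+ *	MUTEX_LOCK(env, mgr->mutex);
+ *	TAILQ_FOREACH(txn, &mgr->txn_chain, links)
+ *		... examine or adjust the transaction ...;
+ *	MUTEX_UNLOCK(env, mgr->mutex);
+ */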
+
+/* Macros to lock/unlock the transaction region as a whole. */
+#define TXN_SYSTEM_LOCK(env) \
+ MUTEX_LOCK(env, ((DB_TXNREGION *) \
+ (env)->tx_handle->reginfo.primary)->mtx_region)
+#define TXN_SYSTEM_UNLOCK(env) \
+ MUTEX_UNLOCK(env, ((DB_TXNREGION *) \
+ (env)->tx_handle->reginfo.primary)->mtx_region)
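+
+/*
+ * Editorial note: an illustrative use of the macros above -- code that
+ * reads or updates shared DB_TXNREGION fields as a group brackets the
+ * access with the region lock, e.g.:
+ *
+ *	TXN_SYSTEM_LOCK(env);
+ *	region->curtxns++;
+ *	TXN_SYSTEM_UNLOCK(env);
+ *
+ * where "region" is the DB_TXNREGION primary pointer named in the macros.
+ */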
+
+/*
+ * DB_TXNREGION --
+ * The primary transaction data structure in the shared memory region.
+ */
+struct __db_txnregion { /* SHARED */
+ db_mutex_t mtx_region; /* Region mutex. */
+
+ u_int32_t inittxns; /* initial number of active TXNs */
+ u_int32_t curtxns; /* current number of active TXNs */
+ u_int32_t maxtxns; /* maximum number of active TXNs */
+ u_int32_t last_txnid; /* last transaction id given out */
+ u_int32_t cur_maxid; /* current max unused id. */
+
+ db_mutex_t mtx_ckp; /* Single thread checkpoints. */
+ DB_LSN last_ckp; /* lsn of the last checkpoint */
+ time_t time_ckp; /* time of last checkpoint */
+
+ DB_TXN_STAT_INT stat; /* Statistics for txns. */
+
+ u_int32_t n_bulk_txn; /* Num. bulk txns in progress. */
+ u_int32_t n_hotbackup; /* Num. of outstanding backup notices. */
+
+#define TXN_IN_RECOVERY 0x01 /* environment is being recovered */
+ u_int32_t flags;
+ /* active TXN list */
+ SH_TAILQ_HEAD(__active) active_txn;
+ SH_TAILQ_HEAD(__mvcc) mvcc_txn;
+};
+
+/*
+ * DB_COMMIT_INFO --
+ * Meta-data uniquely describing a transaction commit across a replication
+ * group.
+ */
+struct __db_commit_info {
+ u_int32_t version; /* Stored format version. */
+ u_int32_t gen; /* Replication master generation. */
+ u_int32_t envid; /* Unique env ID of master. */
+ DB_LSN lsn; /* LSN of commit log record. */
+};
+
+/*
+ * DB_TXNLOGREC --
+ * An in-memory, linked-list copy of a log record.
+ */
+struct __txn_logrec {
+ STAILQ_ENTRY(__txn_logrec) links;/* Linked list. */
+
+ u_int8_t data[1]; /* Log record. */
+};
+
+/*
+ * Log record types. Note that these are *not* alphabetical. This is
+ * intentional so that we don't change the meaning of values between
+ * software upgrades.
+ *
+ * EXPECTED, UNEXPECTED, IGNORE, and OK are used in the txnlist functions.
+ * Here is an explanation of how the statuses are used.
+ *
+ * TXN_OK
+ * BEGIN records for transactions found on the txnlist during
+ * OPENFILES (BEGIN records are those with a prev_lsn of 0,0)
+ *
+ * TXN_COMMIT
+ * Transaction committed and should be rolled forward.
+ *
+ * TXN_ABORT
+ * This transaction's changes must be undone. Either there was
+ * never a prepare or commit record for this transaction OR there
+ * was a commit, but we are recovering to a timestamp or particular
+ * LSN and that point is before this transaction's commit.
+ *
+ * TXN_PREPARE
+ * Prepare record, but no commit record is in the log.
+ *
+ * TXN_IGNORE
+ * Generic meaning is that this transaction should not be
+ * processed during later recovery passes. We use it in a
+ * number of different ways:
+ *
+ * 1. We never saw its BEGIN record. Therefore, the logs have
+ * been reclaimed and we *know* that this transaction doesn't
+ * need to be aborted, because in order for it to be
+ * reclaimed, there must have been a subsequent checkpoint
+ * (and any dirty pages for this transaction made it to
+ * disk).
+ *
+ * 2. This is a child transaction that created a database.
+ * For some reason, we don't want to recreate that database
+ * (i.e., it already exists, or some database created
+ * after it already exists).
+ *
+ * 3. During the recovery open of subdatabases, if the master
+ * check fails, we mark the create of the subdb in the nested
+ * transaction TXN_IGNORE.
+ *
+ * 4. During a remove, the file with the name being removed isn't
+ * the file for which we are recovering a remove.
+ *
+ * TXN_EXPECTED
+ * After a successful open during recovery, we update the
+ * transaction's status to TXN_EXPECTED. The open was done
+ * in the parent, but in the open log record, we record the
+ * child transaction's ID if we also did a create. When there
+ * is a valid ID in that field, we use it and mark the child's
+ * status as TXN_EXPECTED (indicating that we don't need to redo
+ * a create for this file).
+ *
+ * When recovering a remove, if we don't find or can't open
+ * the file, the child (which does the remove) gets marked
+ * EXPECTED (indicating that we don't need to redo the remove).
+ *
+ * TXN_UNEXPECTED
+ * During recovery, we attempted an open that should have succeeded
+ * and we got ENOENT, so, as in the EXPECTED case, we record
+ * the UNEXPECTED status in the child so that we do redo the
+ * creating/deleting operation.
+ *
+ */
+#define TXN_OK 0
+#define TXN_COMMIT 1
+#define TXN_PREPARE 2
+#define TXN_ABORT 3
+#define TXN_IGNORE 4
+#define TXN_EXPECTED 5
+#define TXN_UNEXPECTED 6
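+
+/*
+ * Editorial sketch (not part of the original header): recovery-pass code
+ * dispatches on these statuses along the following lines; the shape is
+ * illustrative only.
+ *
+ *	switch (status) {
+ *	case TXN_COMMIT:
+ *		... roll the record forward ...
+ *		break;
+ *	case TXN_ABORT:
+ *		... undo the record ...
+ *		break;
+ *	case TXN_IGNORE:
+ *	case TXN_EXPECTED:
+ *		... skip the record ...
+ *		break;
+ *	default:
+ *		... e.g., TXN_UNEXPECTED: redo the operation ...
+ *		break;
+ *	}
+ */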
+
+#if defined(__cplusplus)
+}
+#endif
+
+#include "dbinc_auto/txn_auto.h"
+#include "dbinc_auto/txn_ext.h"
+#endif /* !_DB_TXN_H_ */
diff --git a/src/dbinc/win_db.h b/src/dbinc/win_db.h
new file mode 100644
index 00000000..ba57cd1f
--- /dev/null
+++ b/src/dbinc/win_db.h
@@ -0,0 +1,148 @@
+/*-
+ * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * The following provides the information necessary to build Berkeley
+ * DB on native Windows and other Windows environments such as MinGW.
+ */
+
+/*
+ * Berkeley DB requires at least Windows 2000, tell Visual Studio of the
+ * requirement.
+ */
+#ifndef _WIN32_WINNT
+#define _WIN32_WINNT 0x0500
+#endif
+
+#ifndef DB_WINCE
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/timeb.h>
+
+#include <direct.h>
+#include <fcntl.h>
+#include <io.h>
+#include <limits.h>
+#include <memory.h>
+#include <process.h>
+#include <signal.h>
+#endif /* DB_WINCE */
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <tchar.h>
+#include <time.h>
+
+/*
+ * To build Tcl interface libraries, the include path must be configured to
+ * use the directory containing <tcl.h>, usually the include directory in
+ * the Tcl distribution.
+ */
+#ifdef DB_TCL_SUPPORT
+#include <tcl.h>
+#endif
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <winsock2.h>
+#ifndef DB_WINCE
+#include <WinIoCtl.h>
+#endif
+
+#ifdef HAVE_GETADDRINFO
+/*
+ * Need explicit includes for IPv6 support on Windows. Both are necessary to
+ * ensure that pre-WinXP versions have an implementation of the getaddrinfo API.
+ */
+#include <ws2tcpip.h>
+#include <wspiapi.h>
+#endif
+
+/*
+ * Microsoft's C runtime library has fsync, getcwd, getpid, snprintf and
+ * vsnprintf, but under different names.
+ */
+#define fsync _commit
+
+#ifndef DB_WINCE
+#define getcwd(buf, size) _getcwd(buf, size)
+#endif
+#define getpid GetCurrentProcessId
+#define snprintf _snprintf
+#define strcasecmp _stricmp
+#define strncasecmp _strnicmp
+#define vsnprintf _vsnprintf
+
+#define h_errno WSAGetLastError()
+
+/*
+ * Win32 does not have getopt.
+ *
+ * The externs are here, instead of using db_config.h and clib_port.h, because
+ * that approach changes function names to BDB specific names, and the example
+ * programs use getopt and can't use BDB specific names.
+ */
+#if defined(__cplusplus)
+extern "C" {
+#endif
+extern int getopt(int, char * const *, const char *);
+#if defined(__cplusplus)
+}
+#endif
+
+/*
+ * Microsoft's compiler _doesn't_ define __STDC__ unless you invoke it with
+ * arguments turning OFF all vendor extensions. Even more unfortunately, if
+ * we do that, it fails to parse windows.h. So, we define __STDC__ here,
+ * after windows.h comes in. Note: the compiler knows we've defined it, and
+ * starts enforcing strict ANSI compliance from this point on.
+ */
+#ifndef __STDC__
+#define __STDC__ 1
+#endif
+
+#ifdef _UNICODE
+#define TO_TSTRING(dbenv, s, ts, ret) do { \
+ int __len = (int)strlen(s) + 1; \
+ ts = NULL; \
+ if ((ret = __os_malloc((dbenv), \
+ __len * sizeof(_TCHAR), &(ts))) == 0 && \
+ MultiByteToWideChar(CP_UTF8, 0, \
+ (s), -1, (ts), __len) == 0) \
+ ret = __os_posix_err(__os_get_syserr()); \
+ } while (0)
+
+#define FROM_TSTRING(dbenv, ts, s, ret) do { \
+ int __len = WideCharToMultiByte(CP_UTF8, 0, ts, -1, \
+ NULL, 0, NULL, NULL); \
+ s = NULL; \
+ if ((ret = __os_malloc((dbenv), __len, &(s))) == 0 && \
+ WideCharToMultiByte(CP_UTF8, 0, \
+ (ts), -1, (s), __len, NULL, NULL) == 0) \
+ ret = __os_posix_err(__os_get_syserr()); \
+ } while (0)
+
+#define FREE_STRING(dbenv, s) do { \
+ if ((s) != NULL) { \
+ __os_free((dbenv), (s)); \
+ (s) = NULL; \
+ } \
+ } while (0)
+
+#else
+#define TO_TSTRING(dbenv, s, ts, ret) (ret) = 0, (ts) = (_TCHAR *)(s)
+#define FROM_TSTRING(dbenv, ts, s, ret) (ret) = 0, (s) = (char *)(ts)
+#define FREE_STRING(dbenv, ts)
+#endif
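+
+/*
+ * Editorial sketch: typical use of the conversion macros above when
+ * passing a UTF-8 string to a TCHAR-based Win32 API; names other than
+ * the macros themselves are hypothetical.
+ *
+ *	_TCHAR *tpath;
+ *	int ret;
+ *
+ *	TO_TSTRING(dbenv, path, tpath, ret);
+ *	if (ret == 0) {
+ *		hfile = CreateFile(tpath, ...);
+ *		FREE_STRING(dbenv, tpath);
+ *	}
+ */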
+
+#ifndef INVALID_HANDLE_VALUE
+#define INVALID_HANDLE_VALUE ((HANDLE)-1)
+#endif
+
+#ifndef INVALID_FILE_ATTRIBUTES
+#define INVALID_FILE_ATTRIBUTES ((DWORD)-1)
+#endif
+
+#ifndef INVALID_SET_FILE_POINTER
+#define INVALID_SET_FILE_POINTER ((DWORD)-1)
+#endif
diff --git a/src/dbinc/xa.h b/src/dbinc/xa.h
new file mode 100644
index 00000000..7283c1ea
--- /dev/null
+++ b/src/dbinc/xa.h
@@ -0,0 +1,183 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+/*
+ * Start of xa.h header
+ *
+ * Define a symbol to prevent multiple inclusions of this header file
+ */
+#ifndef _DB_XA_H_
+#define _DB_XA_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * Transaction branch identification: XID and NULLXID:
+ */
+#define XIDDATASIZE 128 /* size in bytes */
+#define MAXGTRIDSIZE 64 /* maximum size in bytes of gtrid */
+#define MAXBQUALSIZE 64 /* maximum size in bytes of bqual */
+
+struct xid_t {
+ long formatID; /* format identifier */
+ long gtrid_length; /* value from 1 through 64 */
+ long bqual_length; /* value from 1 through 64 */
+ char data[XIDDATASIZE];
+};
+typedef struct xid_t XID;
+/*
+ * A value of -1 in formatID means that the XID is null.
+ */
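+
+/*
+ * Editorial example (an illustration, not from the XA specification text
+ * above): a null XID can therefore be built as
+ *
+ *	XID xid;
+ *
+ *	memset(&xid, 0, sizeof(xid));
+ *	xid.formatID = -1;
+ */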
+
+/*
+ * Declarations of routines by which RMs call TMs:
+ */
+extern int ax_reg __P((int, XID *, long));
+extern int ax_unreg __P((int, long));
+
+/*
+ * XA Switch Data Structure
+ */
+#define RMNAMESZ 32 /* length of resource manager name, */
+ /* including the null terminator */
+#define MAXINFOSIZE 256 /* maximum size in bytes of xa_info */
+ /* strings, including the null
+ terminator */
+struct xa_switch_t {
+ char name[RMNAMESZ]; /* name of resource manager */
+ long flags; /* resource manager specific options */
+ long version; /* must be 0 */
+ int (*xa_open_entry) /* xa_open function pointer */
+ __P((char *, int, long));
+ int (*xa_close_entry) /* xa_close function pointer */
+ __P((char *, int, long));
+ int (*xa_start_entry) /* xa_start function pointer */
+ __P((XID *, int, long));
+ int (*xa_end_entry) /* xa_end function pointer */
+ __P((XID *, int, long));
+ int (*xa_rollback_entry) /* xa_rollback function pointer */
+ __P((XID *, int, long));
+ int (*xa_prepare_entry) /* xa_prepare function pointer */
+ __P((XID *, int, long));
+ int (*xa_commit_entry) /* xa_commit function pointer */
+ __P((XID *, int, long));
+ int (*xa_recover_entry) /* xa_recover function pointer */
+ __P((XID *, long, int, long));
+ int (*xa_forget_entry) /* xa_forget function pointer */
+ __P((XID *, int, long));
+ int (*xa_complete_entry) /* xa_complete function pointer */
+ __P((int *, int *, int, long));
+};
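+
+/*
+ * Editorial sketch: a resource manager typically exports one statically
+ * initialized switch; the entry-point names here are hypothetical.
+ *
+ *	const struct xa_switch_t my_rm_switch = {
+ *		"MyRM", TMNOFLAGS, 0,
+ *		my_xa_open, my_xa_close, my_xa_start, my_xa_end,
+ *		my_xa_rollback, my_xa_prepare, my_xa_commit,
+ *		my_xa_recover, my_xa_forget, my_xa_complete
+ *	};
+ */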
+
+/*
+ * Flag definitions for the RM switch
+ */
+#define TMNOFLAGS 0x00000000L /* no resource manager features
+ selected */
+#define TMREGISTER 0x00000001L /* resource manager dynamically
+ registers */
+#define TMNOMIGRATE 0x00000002L /* resource manager does not support
+ association migration */
+#define TMUSEASYNC 0x00000004L /* resource manager supports
+ asynchronous operations */
+/*
+ * Flag definitions for xa_ and ax_ routines
+ */
+/* use TMNOFLAGS, defined above, when not specifying other flags */
+#define TMASYNC 0x80000000L /* perform routine asynchronously */
+#define TMONEPHASE 0x40000000L /* caller is using one-phase commit
+ optimisation */
+#define TMFAIL 0x20000000L /* dissociates caller and marks
+ transaction branch rollback-only */
+#define TMNOWAIT 0x10000000L /* return if blocking condition
+ exists */
+#define TMRESUME 0x08000000L /* caller is resuming association with
+ suspended transaction branch */
+#define TMSUCCESS 0x04000000L /* dissociate caller from transaction
+ branch */
+#define TMSUSPEND 0x02000000L /* caller is suspending, not ending,
+ association */
+#define TMSTARTRSCAN 0x01000000L /* start a recovery scan */
+#define TMENDRSCAN 0x00800000L /* end a recovery scan */
+#define TMMULTIPLE 0x00400000L /* wait for any asynchronous
+ operation */
+#define TMJOIN 0x00200000L /* caller is joining existing
+ transaction branch */
+#define TMMIGRATE 0x00100000L /* caller intends to perform
+ migration */
+
+/*
+ * ax_() return codes (transaction manager reports to resource manager)
+ */
+#define TM_JOIN 2 /* caller is joining existing
+ transaction branch */
+#define TM_RESUME 1 /* caller is resuming association with
+ suspended transaction branch */
+#define TM_OK 0 /* normal execution */
+#define TMER_TMERR -1 /* an error occurred in the transaction
+ manager */
+#define TMER_INVAL -2 /* invalid arguments were given */
+#define TMER_PROTO -3 /* routine invoked in an improper
+ context */
+
+/*
+ * xa_() return codes (resource manager reports to transaction manager)
+ */
+#define XA_RBBASE 100 /* The inclusive lower bound of the
+ rollback codes */
+#define XA_RBROLLBACK XA_RBBASE /* The rollback was caused by an
+ unspecified reason */
+#define XA_RBCOMMFAIL XA_RBBASE+1 /* The rollback was caused by a
+ communication failure */
+#define XA_RBDEADLOCK XA_RBBASE+2 /* A deadlock was detected */
+#define XA_RBINTEGRITY XA_RBBASE+3 /* A condition that violates the
+ integrity of the resources was
+ detected */
+#define XA_RBOTHER XA_RBBASE+4 /* The resource manager rolled back the
+ transaction branch for a reason not
+ on this list */
+#define XA_RBPROTO XA_RBBASE+5 /* A protocol error occurred in the
+ resource manager */
+#define XA_RBTIMEOUT XA_RBBASE+6 /* A transaction branch took too long */
+#define XA_RBTRANSIENT XA_RBBASE+7 /* May retry the transaction branch */
+#define XA_RBEND XA_RBTRANSIENT /* The inclusive upper bound of the
+ rollback codes */
+#define XA_NOMIGRATE 9 /* resumption must occur where
+ suspension occurred */
+#define XA_HEURHAZ 8 /* the transaction branch may have
+ been heuristically completed */
+#define XA_HEURCOM 7 /* the transaction branch has been
+ heuristically committed */
+#define XA_HEURRB 6 /* the transaction branch has been
+ heuristically rolled back */
+#define XA_HEURMIX 5 /* the transaction branch has been
+ heuristically committed and rolled
+ back */
+#define XA_RETRY 4 /* routine returned with no effect and
+ may be re-issued */
+#define XA_RDONLY 3 /* the transaction branch was read-only
+ and has been committed */
+#define XA_OK 0 /* normal execution */
+#define XAER_ASYNC -2 /* asynchronous operation already
+ outstanding */
+#define XAER_RMERR -3 /* a resource manager error occurred in
+ the transaction branch */
+#define XAER_NOTA -4 /* the XID is not valid */
+#define XAER_INVAL -5 /* invalid arguments were given */
+#define XAER_PROTO -6 /* routine invoked in an improper
+ context */
+#define XAER_RMFAIL -7 /* resource manager unavailable */
+#define XAER_DUPID -8 /* the XID already exists */
+#define XAER_OUTSIDE -9 /* resource manager doing work outside
+ transaction */
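+
+/*
+ * Editorial note: the rollback codes form a contiguous range, so a return
+ * code can be classified with a simple range check; the macro below is an
+ * illustration, not part of the XA specification.
+ *
+ *	#define XA_IS_ROLLBACK(rc) ((rc) >= XA_RBBASE && (rc) <= XA_RBEND)
+ */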
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_DB_XA_H_ */
diff --git a/src/dbinc_auto/api_flags.in b/src/dbinc_auto/api_flags.in
new file mode 100644
index 00000000..9727ede2
--- /dev/null
+++ b/src/dbinc_auto/api_flags.in
@@ -0,0 +1,228 @@
+/* DO NOT EDIT: automatically built by dist/s_apiflags. */
+#define DB_AGGRESSIVE 0x00000001
+#define DB_ARCH_ABS 0x00000001
+#define DB_ARCH_DATA 0x00000002
+#define DB_ARCH_LOG 0x00000004
+#define DB_ARCH_REMOVE 0x00000008
+#define DB_AUTO_COMMIT 0x00000100
+#define DB_BACKUP_CLEAN 0x00000002
+#define DB_BACKUP_FILES 0x00000008
+#define DB_BACKUP_NO_LOGS 0x00000010
+#define DB_BACKUP_SINGLE_DIR 0x00000020
+#define DB_BACKUP_UPDATE 0x00000040
+#define DB_BOOTSTRAP_HELPER 0x00000001
+#define DB_CDB_ALLDB 0x00000040
+#define DB_CHKSUM 0x00000008
+#define DB_CKP_INTERNAL 0x00000002
+#define DB_CREATE 0x00000001
+#define DB_CURSOR_BULK 0x00000001
+#define DB_CURSOR_TRANSIENT 0x00000008
+#define DB_CXX_NO_EXCEPTIONS 0x00000002
+#define DB_DATABASE_LOCKING 0x00000080
+#define DB_DIRECT 0x00000020
+#define DB_DIRECT_DB 0x00000200
+#define DB_DSYNC_DB 0x00000400
+#define DB_DUP 0x00000010
+#define DB_DUPSORT 0x00000002
+#define DB_DURABLE_UNKNOWN 0x00000040
+#define DB_ENCRYPT 0x00000001
+#define DB_ENCRYPT_AES 0x00000001
+#define DB_EXCL 0x00000004
+#define DB_EXTENT 0x00000100
+#define DB_FAILCHK 0x00000010
+#define DB_FAILCHK_ISALIVE 0x00000040
+#define DB_FAST_STAT 0x00000001
+#define DB_FCNTL_LOCKING 0x00000800
+#define DB_FLUSH 0x00000002
+#define DB_FORCE 0x00000001
+#define DB_FORCESYNC 0x00000001
+#define DB_FOREIGN_ABORT 0x00000001
+#define DB_FOREIGN_CASCADE 0x00000002
+#define DB_FOREIGN_NULLIFY 0x00000004
+#define DB_FREELIST_ONLY 0x00000001
+#define DB_FREE_SPACE 0x00000002
+#define DB_GROUP_CREATOR 0x00000002
+#define DB_HOTBACKUP_IN_PROGRESS 0x00000800
+#define DB_IGNORE_LEASE 0x00001000
+#define DB_IMMUTABLE_KEY 0x00000002
+#define DB_INIT_CDB 0x00000080
+#define DB_INIT_LOCK 0x00000100
+#define DB_INIT_LOG 0x00000200
+#define DB_INIT_MPOOL 0x00000400
+#define DB_INIT_MUTEX 0x00000800
+#define DB_INIT_REP 0x00001000
+#define DB_INIT_TXN 0x00002000
+#define DB_INORDER 0x00000020
+#define DB_INTERNAL_PERSISTENT_DB 0x00001000
+#define DB_INTERNAL_TEMPORARY_DB 0x00002000
+#define DB_JOIN_NOSORT 0x00000001
+#define DB_LEGACY 0x00000004
+#define DB_LOCAL_SITE 0x00000008
+#define DB_LOCKDOWN 0x00004000
+#define DB_LOCK_CHECK 0x00000001
+#define DB_LOCK_IGNORE_REC 0x00000002
+#define DB_LOCK_NOWAIT 0x00000004
+#define DB_LOCK_RECORD 0x00000008
+#define DB_LOCK_SET_TIMEOUT 0x00000010
+#define DB_LOCK_SWITCH 0x00000020
+#define DB_LOCK_UPGRADE 0x00000040
+#define DB_LOG_AUTO_REMOVE 0x00000001
+#define DB_LOG_CHKPNT 0x00000001
+#define DB_LOG_COMMIT 0x00000004
+#define DB_LOG_DIRECT 0x00000002
+#define DB_LOG_DSYNC 0x00000004
+#define DB_LOG_IN_MEMORY 0x00000008
+#define DB_LOG_NOCOPY 0x00000008
+#define DB_LOG_NOT_DURABLE 0x00000010
+#define DB_LOG_NO_DATA 0x00000002
+#define DB_LOG_VERIFY_CAF 0x00000001
+#define DB_LOG_VERIFY_DBFILE 0x00000002
+#define DB_LOG_VERIFY_ERR 0x00000004
+#define DB_LOG_VERIFY_FORWARD 0x00000008
+#define DB_LOG_VERIFY_INTERR 0x00000010
+#define DB_LOG_VERIFY_PARTIAL 0x00000020
+#define DB_LOG_VERIFY_VERBOSE 0x00000040
+#define DB_LOG_VERIFY_WARNING 0x00000080
+#define DB_LOG_WRNOSYNC 0x00000020
+#define DB_LOG_ZERO 0x00000010
+#define DB_MPOOL_CREATE 0x00000001
+#define DB_MPOOL_DIRTY 0x00000002
+#define DB_MPOOL_DISCARD 0x00000001
+#define DB_MPOOL_EDIT 0x00000004
+#define DB_MPOOL_FREE 0x00000008
+#define DB_MPOOL_LAST 0x00000010
+#define DB_MPOOL_NEW 0x00000020
+#define DB_MPOOL_NOFILE 0x00000001
+#define DB_MPOOL_NOLOCK 0x00000004
+#define DB_MPOOL_TRY 0x00000040
+#define DB_MPOOL_UNLINK 0x00000002
+#define DB_MULTIPLE 0x00000800
+#define DB_MULTIPLE_KEY 0x00004000
+#define DB_MULTIVERSION 0x00000008
+#define DB_MUTEX_ALLOCATED 0x00000001
+#define DB_MUTEX_LOCKED 0x00000002
+#define DB_MUTEX_LOGICAL_LOCK 0x00000004
+#define DB_MUTEX_PROCESS_ONLY 0x00000008
+#define DB_MUTEX_SELF_BLOCK 0x00000010
+#define DB_MUTEX_SHARED 0x00000020
+#define DB_NOERROR 0x00004000
+#define DB_NOFLUSH 0x00001000
+#define DB_NOLOCKING 0x00002000
+#define DB_NOMMAP 0x00000010
+#define DB_NOORDERCHK 0x00000002
+#define DB_NOPANIC 0x00004000
+#define DB_NOSYNC 0x00000001
+#define DB_NO_AUTO_COMMIT 0x00008000
+#define DB_NO_CHECKPOINT 0x00008000
+#define DB_ODDFILESIZE 0x00000080
+#define DB_ORDERCHKONLY 0x00000004
+#define DB_OVERWRITE 0x00008000
+#define DB_PANIC_ENVIRONMENT 0x00010000
+#define DB_PRINTABLE 0x00000008
+#define DB_PRIVATE 0x00010000
+#define DB_PR_PAGE 0x00000010
+#define DB_PR_RECOVERYTEST 0x00000020
+#define DB_RDONLY 0x00000400
+#define DB_RDWRMASTER 0x00010000
+#define DB_READ_COMMITTED 0x00000400
+#define DB_READ_UNCOMMITTED 0x00000200
+#define DB_RECNUM 0x00000040
+#define DB_RECOVER 0x00000002
+#define DB_RECOVER_FATAL 0x00020000
+#define DB_REGION_INIT 0x00020000
+#define DB_REGISTER 0x00040000
+#define DB_RENUMBER 0x00000080
+#define DB_REPMGR_CONF_2SITE_STRICT 0x00000001
+#define DB_REPMGR_CONF_ELECTIONS 0x00000002
+#define DB_REPMGR_NEED_RESPONSE 0x00000001
+#define DB_REPMGR_PEER 0x00000010
+#define DB_REP_ANYWHERE 0x00000001
+#define DB_REP_CLIENT 0x00000001
+#define DB_REP_CONF_AUTOINIT 0x00000004
+#define DB_REP_CONF_AUTOROLLBACK 0x00000008
+#define DB_REP_CONF_BULK 0x00000010
+#define DB_REP_CONF_DELAYCLIENT 0x00000020
+#define DB_REP_CONF_INMEM 0x00000040
+#define DB_REP_CONF_LEASE 0x00000080
+#define DB_REP_CONF_NOWAIT 0x00000100
+#define DB_REP_ELECTION 0x00000004
+#define DB_REP_MASTER 0x00000002
+#define DB_REP_NOBUFFER 0x00000002
+#define DB_REP_PERMANENT 0x00000004
+#define DB_REP_REREQUEST 0x00000008
+#define DB_REVSPLITOFF 0x00000100
+#define DB_RMW 0x00002000
+#define DB_SALVAGE 0x00000040
+#define DB_SA_SKIPFIRSTKEY 0x00000080
+#define DB_SA_UNKNOWNKEY 0x00000100
+#define DB_SEQ_DEC 0x00000001
+#define DB_SEQ_INC 0x00000002
+#define DB_SEQ_RANGE_SET 0x00000004
+#define DB_SEQ_WRAP 0x00000008
+#define DB_SEQ_WRAPPED 0x00000010
+#define DB_SET_LOCK_TIMEOUT 0x00000001
+#define DB_SET_REG_TIMEOUT 0x00000004
+#define DB_SET_TXN_NOW 0x00000008
+#define DB_SET_TXN_TIMEOUT 0x00000002
+#define DB_SHALLOW_DUP 0x00000100
+#define DB_SNAPSHOT 0x00000200
+#define DB_STAT_ALL 0x00000004
+#define DB_STAT_ALLOC 0x00000008
+#define DB_STAT_CLEAR 0x00000001
+#define DB_STAT_LOCK_CONF 0x00000010
+#define DB_STAT_LOCK_LOCKERS 0x00000020
+#define DB_STAT_LOCK_OBJECTS 0x00000040
+#define DB_STAT_LOCK_PARAMS 0x00000080
+#define DB_STAT_MEMP_HASH 0x00000010
+#define DB_STAT_MEMP_NOERROR 0x00000020
+#define DB_STAT_SUBSYSTEM 0x00000002
+#define DB_STAT_SUMMARY 0x00000010
+#define DB_ST_DUPOK 0x00000200
+#define DB_ST_DUPSET 0x00000400
+#define DB_ST_DUPSORT 0x00000800
+#define DB_ST_IS_RECNO 0x00001000
+#define DB_ST_OVFL_LEAF 0x00002000
+#define DB_ST_RECNUM 0x00004000
+#define DB_ST_RELEN 0x00008000
+#define DB_ST_TOPLEVEL 0x00010000
+#define DB_SYSTEM_MEM 0x00080000
+#define DB_THREAD 0x00000020
+#define DB_TIME_NOTGRANTED 0x00040000
+#define DB_TRUNCATE 0x00020000
+#define DB_TXN_BULK 0x00000010
+#define DB_TXN_FAMILY 0x00000040
+#define DB_TXN_NOSYNC 0x00000001
+#define DB_TXN_NOT_DURABLE 0x00000004
+#define DB_TXN_NOWAIT 0x00000002
+#define DB_TXN_SNAPSHOT 0x00000004
+#define DB_TXN_SYNC 0x00000008
+#define DB_TXN_WAIT 0x00000080
+#define DB_TXN_WRITE_NOSYNC 0x00000020
+#define DB_UNREF 0x00020000
+#define DB_UPGRADE 0x00000001
+#define DB_USE_ENVIRON 0x00000004
+#define DB_USE_ENVIRON_ROOT 0x00000008
+#define DB_VERB_BACKUP 0x00000001
+#define DB_VERB_DEADLOCK 0x00000002
+#define DB_VERB_FILEOPS 0x00000004
+#define DB_VERB_FILEOPS_ALL 0x00000008
+#define DB_VERB_RECOVERY 0x00000010
+#define DB_VERB_REGISTER 0x00000020
+#define DB_VERB_REPLICATION 0x00000040
+#define DB_VERB_REPMGR_CONNFAIL 0x00000080
+#define DB_VERB_REPMGR_MISC 0x00000100
+#define DB_VERB_REP_ELECT 0x00000200
+#define DB_VERB_REP_LEASE 0x00000400
+#define DB_VERB_REP_MISC 0x00000800
+#define DB_VERB_REP_MSGS 0x00001000
+#define DB_VERB_REP_SYNC 0x00002000
+#define DB_VERB_REP_SYSTEM 0x00004000
+#define DB_VERB_REP_TEST 0x00008000
+#define DB_VERB_WAITSFOR 0x00010000
+#define DB_VERIFY 0x00000002
+#define DB_VERIFY_PARTITION 0x00040000
+#define DB_WRITECURSOR 0x00000010
+#define DB_WRITELOCK 0x00000020
+#define DB_WRITEOPEN 0x00040000
+#define DB_XA_CREATE 0x00000001
+#define DB_YIELDCPU 0x00080000
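+
+/*
+ * Editorial note: these are bit masks, assigned so that the flags accepted
+ * by any one API do not collide; callers OR them together, e.g., an
+ * illustrative DB_ENV->open call:
+ *
+ *	dbenv->open(dbenv, home,
+ *	    DB_CREATE | DB_INIT_LOCK | DB_INIT_LOG |
+ *	    DB_INIT_MPOOL | DB_INIT_TXN | DB_RECOVER, 0);
+ */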
diff --git a/src/dbinc_auto/btree_auto.h b/src/dbinc_auto/btree_auto.h
new file mode 100644
index 00000000..e57551c7
--- /dev/null
+++ b/src/dbinc_auto/btree_auto.h
@@ -0,0 +1,456 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#ifndef __bam_AUTO_H
+#define __bam_AUTO_H
+#include "dbinc/log.h"
+#define DB___bam_split 62
+typedef struct ___bam_split_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ u_int32_t opflags;
+ db_pgno_t left;
+ DB_LSN llsn;
+ db_pgno_t right;
+ DB_LSN rlsn;
+ u_int32_t indx;
+ db_pgno_t npgno;
+ DB_LSN nlsn;
+ db_pgno_t ppgno;
+ DB_LSN plsn;
+ u_int32_t pindx;
+ DBT pg;
+ DBT pentry;
+ DBT rentry;
+} __bam_split_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __bam_split_desc[];
+static inline int
+__bam_split_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, u_int32_t opflags, db_pgno_t left, DB_LSN * llsn, db_pgno_t right,
+ DB_LSN * rlsn, u_int32_t indx, db_pgno_t npgno, DB_LSN * nlsn, db_pgno_t ppgno,
+ DB_LSN * plsn, u_int32_t pindx, const DBT *pg, const DBT *pentry, const DBT *rentry)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___bam_split, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ sizeof(*llsn) + sizeof(u_int32_t) + sizeof(*rlsn) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(*nlsn) +
+ sizeof(u_int32_t) + sizeof(*plsn) + sizeof(u_int32_t) +
+ LOG_DBT_SIZE(pg) + LOG_DBT_SIZE(pentry) + LOG_DBT_SIZE(rentry),
+ __bam_split_desc, opflags, left, llsn, right, rlsn, indx, npgno,
+ nlsn, ppgno, plsn, pindx, pg, pentry, rentry));
+}
+
+static inline int __bam_split_read(ENV *env,
+ DB **dbpp, void *td, void *data, __bam_split_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __bam_split_desc, sizeof(__bam_split_args), (void**)arg));
+}
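+
+/*
+ * Editorial sketch (not part of the generated header): a recovery function
+ * pairs each *_read helper with its args structure, roughly as follows;
+ * error handling and the REC_* macros of the real recovery code are
+ * omitted, and the variable names are illustrative.
+ *
+ *	__bam_split_args *argp;
+ *	int ret;
+ *
+ *	if ((ret = __bam_split_read(env, &dbp, td, dbtp->data, &argp)) != 0)
+ *		return (ret);
+ *	... redo or undo using argp->left, argp->right, argp->npgno ...
+ *	__os_free(env, argp);
+ */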
+#define DB___bam_split_48 62
+typedef struct ___bam_split_48_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t left;
+ DB_LSN llsn;
+ db_pgno_t right;
+ DB_LSN rlsn;
+ u_int32_t indx;
+ db_pgno_t npgno;
+ DB_LSN nlsn;
+ db_pgno_t ppgno;
+ DB_LSN plsn;
+ u_int32_t pindx;
+ DBT pg;
+ DBT pentry;
+ DBT rentry;
+ u_int32_t opflags;
+} __bam_split_48_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __bam_split_48_desc[];
+static inline int __bam_split_48_read(ENV *env,
+ DB **dbpp, void *td, void *data, __bam_split_48_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __bam_split_48_desc, sizeof(__bam_split_48_args), (void**)arg));
+}
+#define DB___bam_split_42 62
+typedef struct ___bam_split_42_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t left;
+ DB_LSN llsn;
+ db_pgno_t right;
+ DB_LSN rlsn;
+ u_int32_t indx;
+ db_pgno_t npgno;
+ DB_LSN nlsn;
+ db_pgno_t root_pgno;
+ DBT pg;
+ u_int32_t opflags;
+} __bam_split_42_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __bam_split_42_desc[];
+static inline int __bam_split_42_read(ENV *env,
+ DB **dbpp, void *td, void *data, __bam_split_42_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __bam_split_42_desc, sizeof(__bam_split_42_args), (void**)arg));
+}
+#define DB___bam_rsplit 63
+typedef struct ___bam_rsplit_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ DBT pgdbt;
+ db_pgno_t root_pgno;
+ db_pgno_t nrec;
+ DBT rootent;
+ DB_LSN rootlsn;
+} __bam_rsplit_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __bam_rsplit_desc[];
+static inline int
+__bam_rsplit_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_pgno_t pgno, const DBT *pgdbt, db_pgno_t root_pgno, db_pgno_t nrec,
+ const DBT *rootent, DB_LSN * rootlsn)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___bam_rsplit, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + LOG_DBT_SIZE(pgdbt) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + LOG_DBT_SIZE(rootent) +
+ sizeof(*rootlsn),
+ __bam_rsplit_desc, pgno, pgdbt, root_pgno, nrec, rootent, rootlsn));
+}
+
+static inline int __bam_rsplit_read(ENV *env,
+ DB **dbpp, void *td, void *data, __bam_rsplit_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __bam_rsplit_desc, sizeof(__bam_rsplit_args), (void**)arg));
+}
+#define DB___bam_adj 55
+typedef struct ___bam_adj_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ DB_LSN lsn;
+ u_int32_t indx;
+ u_int32_t indx_copy;
+ u_int32_t is_insert;
+} __bam_adj_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __bam_adj_desc[];
+static inline int
+__bam_adj_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_pgno_t pgno, DB_LSN * lsn, u_int32_t indx, u_int32_t indx_copy,
+ u_int32_t is_insert)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___bam_adj, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(*lsn) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t),
+ __bam_adj_desc, pgno, lsn, indx, indx_copy, is_insert));
+}
+
+static inline int __bam_adj_read(ENV *env,
+ DB **dbpp, void *td, void *data, __bam_adj_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __bam_adj_desc, sizeof(__bam_adj_args), (void**)arg));
+}
+#define DB___bam_cadjust 56
+typedef struct ___bam_cadjust_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ DB_LSN lsn;
+ u_int32_t indx;
+ int32_t adjust;
+ u_int32_t opflags;
+} __bam_cadjust_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __bam_cadjust_desc[];
+static inline int
+__bam_cadjust_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_pgno_t pgno, DB_LSN * lsn, u_int32_t indx, int32_t adjust,
+ u_int32_t opflags)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___bam_cadjust, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(*lsn) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t),
+ __bam_cadjust_desc, pgno, lsn, indx, adjust, opflags));
+}
+
+static inline int __bam_cadjust_read(ENV *env,
+ DB **dbpp, void *td, void *data, __bam_cadjust_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __bam_cadjust_desc, sizeof(__bam_cadjust_args), (void**)arg));
+}
+#define DB___bam_cdel 57
+typedef struct ___bam_cdel_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ DB_LSN lsn;
+ u_int32_t indx;
+} __bam_cdel_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __bam_cdel_desc[];
+static inline int
+__bam_cdel_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_pgno_t pgno, DB_LSN * lsn, u_int32_t indx)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___bam_cdel, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(*lsn) +
+ sizeof(u_int32_t),
+ __bam_cdel_desc, pgno, lsn, indx));
+}
+
+static inline int __bam_cdel_read(ENV *env,
+ DB **dbpp, void *td, void *data, __bam_cdel_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __bam_cdel_desc, sizeof(__bam_cdel_args), (void**)arg));
+}
+#define DB___bam_repl 58
+typedef struct ___bam_repl_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ DB_LSN lsn;
+ u_int32_t indx;
+ u_int32_t isdeleted;
+ DBT orig;
+ DBT repl;
+ u_int32_t prefix;
+ u_int32_t suffix;
+} __bam_repl_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __bam_repl_desc[];
+static inline int
+__bam_repl_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_pgno_t pgno, DB_LSN * lsn, u_int32_t indx, u_int32_t isdeleted,
+ const DBT *orig, const DBT *repl, u_int32_t prefix, u_int32_t suffix)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___bam_repl, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(*lsn) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + LOG_DBT_SIZE(orig) +
+ LOG_DBT_SIZE(repl) + sizeof(u_int32_t) + sizeof(u_int32_t),
+ __bam_repl_desc, pgno, lsn, indx, isdeleted, orig, repl, prefix,
+ suffix));
+}
+
+static inline int __bam_repl_read(ENV *env,
+ DB **dbpp, void *td, void *data, __bam_repl_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __bam_repl_desc, sizeof(__bam_repl_args), (void**)arg));
+}
+#define DB___bam_irep 67
+typedef struct ___bam_irep_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ DB_LSN lsn;
+ u_int32_t indx;
+ u_int32_t ptype;
+ DBT hdr;
+ DBT data;
+ DBT old;
+} __bam_irep_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __bam_irep_desc[];
+static inline int
+__bam_irep_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_pgno_t pgno, DB_LSN * lsn, u_int32_t indx, u_int32_t ptype,
+ const DBT *hdr, const DBT *data, const DBT *old)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___bam_irep, 1,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(*lsn) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + LOG_DBT_SIZE(hdr) +
+ LOG_DBT_SIZE(data) + LOG_DBT_SIZE(old),
+ __bam_irep_desc, pgno, lsn, indx, ptype, hdr, data, old));
+}
+
+static inline int __bam_irep_read(ENV *env,
+ DB **dbpp, void *td, void *data, __bam_irep_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __bam_irep_desc, sizeof(__bam_irep_args), (void**)arg));
+}
+#define DB___bam_root 59
+typedef struct ___bam_root_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t meta_pgno;
+ db_pgno_t root_pgno;
+ DB_LSN meta_lsn;
+} __bam_root_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __bam_root_desc[];
+static inline int
+__bam_root_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_pgno_t meta_pgno, db_pgno_t root_pgno, DB_LSN * meta_lsn)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___bam_root, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ sizeof(*meta_lsn),
+ __bam_root_desc, meta_pgno, root_pgno, meta_lsn));
+}
+
+static inline int __bam_root_read(ENV *env,
+ DB **dbpp, void *td, void *data, __bam_root_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __bam_root_desc, sizeof(__bam_root_args), (void**)arg));
+}
+#define DB___bam_curadj 64
+typedef struct ___bam_curadj_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_ca_mode mode;
+ db_pgno_t from_pgno;
+ db_pgno_t to_pgno;
+ db_pgno_t left_pgno;
+ u_int32_t first_indx;
+ u_int32_t from_indx;
+ u_int32_t to_indx;
+} __bam_curadj_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __bam_curadj_desc[];
+static inline int
+__bam_curadj_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_ca_mode mode, db_pgno_t from_pgno, db_pgno_t to_pgno, db_pgno_t left_pgno,
+ u_int32_t first_indx, u_int32_t from_indx, u_int32_t to_indx)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___bam_curadj, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ sizeof(u_int32_t) + sizeof(u_int32_t),
+ __bam_curadj_desc, mode, from_pgno, to_pgno, left_pgno, first_indx, from_indx, to_indx));
+}
+
+static inline int __bam_curadj_read(ENV *env,
+ DB **dbpp, void *td, void *data, __bam_curadj_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __bam_curadj_desc, sizeof(__bam_curadj_args), (void**)arg));
+}
+#define DB___bam_rcuradj 65
+typedef struct ___bam_rcuradj_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ ca_recno_arg mode;
+ db_pgno_t root;
+ db_recno_t recno;
+ u_int32_t order;
+} __bam_rcuradj_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __bam_rcuradj_desc[];
+static inline int
+__bam_rcuradj_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, ca_recno_arg mode, db_pgno_t root, db_recno_t recno, u_int32_t order)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___bam_rcuradj, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ sizeof(u_int32_t) + sizeof(u_int32_t),
+ __bam_rcuradj_desc, mode, root, recno, order));
+}
+
+static inline int __bam_rcuradj_read(ENV *env,
+ DB **dbpp, void *td, void *data, __bam_rcuradj_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __bam_rcuradj_desc, sizeof(__bam_rcuradj_args), (void**)arg));
+}
+#define DB___bam_relink_43 147
+typedef struct ___bam_relink_43_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ DB_LSN lsn;
+ db_pgno_t prev;
+ DB_LSN lsn_prev;
+ db_pgno_t next;
+ DB_LSN lsn_next;
+} __bam_relink_43_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __bam_relink_43_desc[];
+static inline int __bam_relink_43_read(ENV *env,
+ DB **dbpp, void *td, void *data, __bam_relink_43_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __bam_relink_43_desc, sizeof(__bam_relink_43_args), (void**)arg));
+}
+#define DB___bam_merge_44 148
+typedef struct ___bam_merge_44_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ DB_LSN lsn;
+ db_pgno_t npgno;
+ DB_LSN nlsn;
+ DBT hdr;
+ DBT data;
+ DBT ind;
+} __bam_merge_44_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __bam_merge_44_desc[];
+static inline int __bam_merge_44_read(ENV *env,
+ DB **dbpp, void *td, void *data, __bam_merge_44_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __bam_merge_44_desc, sizeof(__bam_merge_44_args), (void**)arg));
+}
+#endif
diff --git a/src/dbinc_auto/btree_ext.h b/src/dbinc_auto/btree_ext.h
new file mode 100644
index 00000000..c90f5b80
--- /dev/null
+++ b/src/dbinc_auto/btree_ext.h
@@ -0,0 +1,147 @@
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _btree_ext_h_
+#define _btree_ext_h_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+int __bam_compact_int __P((DBC *, DBT *, DBT *, u_int32_t, int *, DB_COMPACT *, int *));
+int __bam_compact_opd __P((DBC *, db_pgno_t, PAGE **, u_int32_t, DB_COMPACT *, int *));
+int __bam_truncate_ipages __P((DB *, DB_THREAD_INFO *, DB_TXN *, DB_COMPACT *));
+int __bam_cmp __P((DBC *, const DBT *, PAGE *, u_int32_t, int (*)(DB *, const DBT *, const DBT *), int *));
+int __bam_defcmp __P((DB *, const DBT *, const DBT *));
+size_t __bam_defpfx __P((DB *, const DBT *, const DBT *));
+int __bam_compress_dupcmp __P((DB *, const DBT *, const DBT *));
+int __bam_defcompress __P((DB *, const DBT *, const DBT *, const DBT *, const DBT *, DBT *));
+int __bam_defdecompress __P((DB *, const DBT *, const DBT *, DBT *, DBT *, DBT *));
+int __bamc_compress_get __P((DBC *, DBT *, DBT *, u_int32_t));
+int __bamc_compress_put __P((DBC *, DBT *, DBT *, u_int32_t));
+int __bamc_compress_del __P((DBC *, u_int32_t));
+int __bamc_compress_bulk_del __P((DBC *, DBT *, u_int32_t));
+int __bamc_compress_count __P((DBC *, db_recno_t *));
+int __bamc_compress_cmp __P((DBC *, DBC *, int *));
+int __bamc_compress_dup __P((DBC *, DBC *, u_int32_t));
+int __bam_compress_salvage __P((DB *, VRFY_DBINFO *, void *, int (*)(void *, const void *), DBT *, DBT *));
+int __bam_compress_count __P((DBC *, u_int32_t *, u_int32_t *));
+int __bam_pgin __P((DB *, db_pgno_t, void *, DBT *));
+int __bam_pgout __P((DB *, db_pgno_t, void *, DBT *));
+int __bam_mswap __P((ENV *, PAGE *));
+int __bam_ca_delete __P((DB *, db_pgno_t, u_int32_t, int, u_int32_t *));
+int __ram_ca_delete __P((DB *, db_pgno_t, u_int32_t *));
+int __bam_ca_di __P((DBC *, db_pgno_t, u_int32_t, int));
+int __bam_ca_dup __P((DBC *, u_int32_t, db_pgno_t, u_int32_t, db_pgno_t, u_int32_t));
+int __bam_ca_undodup __P((DB *, u_int32_t, db_pgno_t, u_int32_t, u_int32_t));
+int __bam_ca_rsplit __P((DBC *, db_pgno_t, db_pgno_t));
+int __bam_ca_split __P((DBC *, db_pgno_t, db_pgno_t, db_pgno_t, u_int32_t, int));
+int __bam_ca_undosplit __P((DB *, db_pgno_t, db_pgno_t, db_pgno_t, u_int32_t));
+int __bamc_init __P((DBC *, DBTYPE));
+int __bamc_refresh __P((DBC *));
+int __bamc_cmp __P((DBC *, DBC *, int *));
+int __bamc_count __P((DBC *, db_recno_t *));
+int __bamc_dup __P((DBC *, DBC *, u_int32_t));
+int __bam_bulk_overflow __P((DBC *, u_int32_t, db_pgno_t, u_int8_t *));
+int __bam_bulk_duplicates __P((DBC *, db_pgno_t, u_int8_t *, int32_t *, int32_t **, u_int8_t **, u_int32_t *, int));
+int __bamc_rget __P((DBC *, DBT *));
+int __bam_opd_exists __P((DBC *, db_pgno_t));
+int __bam_ditem __P((DBC *, PAGE *, u_int32_t));
+int __bam_adjindx __P((DBC *, PAGE *, u_int32_t, u_int32_t, int));
+int __bam_dpages __P((DBC *, int, int));
+int __bam_pupdate __P((DBC *, PAGE *));
+int __bam_db_create __P((DB *));
+int __bam_db_close __P((DB *));
+void __bam_map_flags __P((DB *, u_int32_t *, u_int32_t *));
+int __bam_set_flags __P((DB *, u_int32_t *flagsp));
+int __bam_set_bt_compare __P((DB *, int (*)(DB *, const DBT *, const DBT *)));
+int __bam_set_bt_compress __P((DB *, int (*)(DB *, const DBT *, const DBT *, const DBT *, const DBT *, DBT *), int (*)(DB *, const DBT *, const DBT *, DBT *, DBT *, DBT *)));
+int __bam_get_bt_minkey __P((DB *, u_int32_t *));
+void __bam_copy_config __P((DB *, DB*, u_int32_t));
+void __ram_map_flags __P((DB *, u_int32_t *, u_int32_t *));
+int __ram_set_flags __P((DB *, u_int32_t *flagsp));
+int __ram_get_re_len __P((DB *, u_int32_t *));
+int __ram_get_re_pad __P((DB *, int *));
+int __bam_open __P((DB *, DB_THREAD_INFO *, DB_TXN *, const char *, db_pgno_t, u_int32_t));
+int __bam_metachk __P((DB *, const char *, BTMETA *));
+int __bam_read_root __P((DB *, DB_THREAD_INFO *, DB_TXN *, db_pgno_t, u_int32_t));
+int __bam_new_file __P((DB *, DB_THREAD_INFO *, DB_TXN *, DB_FH *, const char *));
+int __bam_new_subdb __P((DB *, DB *, DB_THREAD_INFO *, DB_TXN *));
+int __bam_iitem __P((DBC *, DBT *, DBT *, u_int32_t, u_int32_t));
+int __bam_ritem __P((DBC *, PAGE *, u_int32_t, DBT *, u_int32_t));
+int __bam_ritem_nolog __P((DBC *, PAGE *, u_int32_t, DBT *, DBT *, u_int32_t));
+int __bam_irep __P((DBC *, PAGE *, u_int32_t, DBT *, DBT *));
+int __bam_split_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_split_48_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_split_42_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_rsplit_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_adj_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_cadjust_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_cdel_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_repl_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_irep_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_root_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_curadj_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_rcuradj_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_merge_44_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_relink_43_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_reclaim __P((DB *, DB_THREAD_INFO *, DB_TXN *, u_int32_t));
+int __bam_truncate __P((DBC *, u_int32_t *));
+int __ram_open __P((DB *, DB_THREAD_INFO *, DB_TXN *, const char *, db_pgno_t, u_int32_t));
+int __ram_append __P((DBC *, DBT *, DBT *));
+int __ramc_del __P((DBC *, u_int32_t));
+int __ramc_get __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
+int __ramc_put __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
+int __ram_ca __P((DBC *, ca_recno_arg, int *));
+int __ram_getno __P((DBC *, const DBT *, db_recno_t *, int));
+int __ram_writeback __P((DB *));
+int __bam_rsearch __P((DBC *, db_recno_t *, u_int32_t, int, int *));
+int __bam_adjust __P((DBC *, int32_t));
+int __bam_nrecs __P((DBC *, db_recno_t *));
+db_recno_t __bam_total __P((DB *, PAGE *));
+int __bam_get_root __P((DBC *, db_pgno_t, int, u_int32_t, int *));
+int __bam_search __P((DBC *, db_pgno_t, const DBT *, u_int32_t, int, db_recno_t *, int *));
+int __bam_stkrel __P((DBC *, u_int32_t));
+int __bam_stkgrow __P((ENV *, BTREE_CURSOR *));
+int __bam_split __P((DBC *, void *, db_pgno_t *));
+int __bam_broot __P((DBC *, PAGE *, u_int32_t, PAGE *, PAGE *));
+int __ram_root __P((DBC *, PAGE *, PAGE *, PAGE *));
+int __bam_pinsert __P((DBC *, EPG *, u_int32_t, PAGE *, PAGE *, int));
+int __bam_copy __P((DB *, PAGE *, PAGE *, u_int32_t, u_int32_t));
+int __bam_stat __P((DBC *, void *, u_int32_t));
+int __bam_stat_print __P((DBC *, u_int32_t));
+int __bam_stat_callback __P((DBC *, PAGE *, void *, int *));
+void __bam_print_cursor __P((DBC *));
+int __bam_key_range __P((DBC *, DBT *, DB_KEY_RANGE *, u_int32_t));
+int __bam_traverse __P((DBC *, db_lockmode_t, db_pgno_t, int (*)(DBC *, PAGE *, void *, int *), void *));
+int __bam_30_btreemeta __P((DB *, char *, u_int8_t *));
+int __bam_31_btreemeta __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
+int __bam_31_lbtree __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
+int __bam_vrfy_meta __P((DB *, VRFY_DBINFO *, BTMETA *, db_pgno_t, u_int32_t));
+int __ram_vrfy_leaf __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, u_int32_t));
+int __bam_vrfy __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, u_int32_t));
+int __bam_vrfy_itemorder __P((DB *, VRFY_DBINFO *, DB_THREAD_INFO *, PAGE *, db_pgno_t, u_int32_t, int, int, u_int32_t));
+int __bam_vrfy_structure __P((DB *, VRFY_DBINFO *, db_pgno_t, void *, void *, u_int32_t));
+int __bam_vrfy_subtree __P((DB *, VRFY_DBINFO *, db_pgno_t, void *, void *, u_int32_t, u_int32_t *, u_int32_t *, u_int32_t *));
+int __bam_salvage __P((DB *, VRFY_DBINFO *, db_pgno_t, u_int32_t, PAGE *, void *, int (*)(void *, const void *), DBT *, u_int32_t));
+int __bam_salvage_walkdupint __P((DB *, VRFY_DBINFO *, PAGE *, DBT *, void *, int (*)(void *, const void *), u_int32_t));
+int __bam_meta2pgset __P((DB *, VRFY_DBINFO *, BTMETA *, u_int32_t, DB *));
+int __bam_init_recover __P((ENV *, DB_DISTAB *));
+int __bam_split_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_split_48_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_split_42_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_rsplit_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_adj_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_cadjust_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_cdel_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_repl_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_irep_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_root_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_curadj_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_rcuradj_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_relink_43_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_merge_44_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_init_print __P((ENV *, DB_DISTAB *));
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_btree_ext_h_ */
diff --git a/src/dbinc_auto/clib_ext.h b/src/dbinc_auto/clib_ext.h
new file mode 100644
index 00000000..c53be48c
--- /dev/null
+++ b/src/dbinc_auto/clib_ext.h
@@ -0,0 +1,113 @@
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _clib_ext_h_
+#define _clib_ext_h_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#ifndef HAVE_ATOI
+int atoi __P((const char *));
+#endif
+#ifndef HAVE_ATOL
+long atol __P((const char *));
+#endif
+#ifndef HAVE_BSEARCH
+void *bsearch __P((const void *, const void *, size_t, size_t, int (*)(const void *, const void *)));
+#endif
+#ifndef HAVE_GETCWD
+char *getcwd __P((char *, size_t));
+#endif
+#ifndef HAVE_GETOPT
+int getopt __P((int, char * const *, const char *));
+#endif
+#ifndef HAVE_ISALPHA
+int isalpha __P((int));
+#endif
+#ifndef HAVE_ISDIGIT
+int isdigit __P((int));
+#endif
+#ifndef HAVE_ISPRINT
+int isprint __P((int));
+#endif
+#ifndef HAVE_ISSPACE
+int isspace __P((int));
+#endif
+#ifndef HAVE_MEMCMP
+int memcmp __P((const void *, const void *, size_t));
+#endif
+#ifndef HAVE_MEMCPY
+void *memcpy __P((void *, const void *, size_t));
+#endif
+#ifndef HAVE_MEMMOVE
+void *memmove __P((void *, const void *, size_t));
+#endif
+#ifndef HAVE_PRINTF
+int printf __P((const char *, ...));
+#endif
+#ifndef HAVE_PRINTF
+int fprintf __P((FILE *, const char *, ...));
+#endif
+#ifndef HAVE_PRINTF
+int vfprintf __P((FILE *, const char *, va_list));
+#endif
+#ifndef HAVE_QSORT
+void qsort __P((void *, size_t, size_t, int(*)(const void *, const void *)));
+#endif
+#ifndef HAVE_RAISE
+int raise __P((int));
+#endif
+#ifndef HAVE_RAND
+int rand __P((void));
+void srand __P((unsigned int));
+#endif
+#ifndef HAVE_SNPRINTF
+int snprintf __P((char *, size_t, const char *, ...));
+#endif
+#ifndef HAVE_VSNPRINTF
+int vsnprintf __P((char *, size_t, const char *, va_list));
+#endif
+#ifndef HAVE_STRCASECMP
+int strcasecmp __P((const char *, const char *));
+#endif
+#ifndef HAVE_STRCASECMP
+int strncasecmp __P((const char *, const char *, size_t));
+#endif
+#ifndef HAVE_STRCAT
+char *strcat __P((char *, const char *));
+#endif
+#ifndef HAVE_STRCHR
+char *strchr __P((const char *, int));
+#endif
+#ifndef HAVE_STRDUP
+char *strdup __P((const char *));
+#endif
+#ifndef HAVE_STRERROR
+char *strerror __P((int));
+#endif
+#ifndef HAVE_STRNCAT
+char *strncat __P((char *, const char *, size_t));
+#endif
+#ifndef HAVE_STRNCMP
+int strncmp __P((const char *, const char *, size_t));
+#endif
+#ifndef HAVE_STRRCHR
+char *strrchr __P((const char *, int));
+#endif
+#ifndef HAVE_STRSEP
+char *strsep __P((char **, const char *));
+#endif
+#ifndef HAVE_STRTOL
+long strtol __P((const char *, char **, int));
+#endif
+#ifndef HAVE_STRTOUL
+unsigned long strtoul __P((const char *, char **, int));
+#endif
+#ifndef HAVE_TIME
+time_t time __P((time_t *));
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_clib_ext_h_ */
diff --git a/src/dbinc_auto/common_ext.h b/src/dbinc_auto/common_ext.h
new file mode 100644
index 00000000..ac16e9db
--- /dev/null
+++ b/src/dbinc_auto/common_ext.h
@@ -0,0 +1,75 @@
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _common_ext_h_
+#define _common_ext_h_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+void __clock_set_expires __P((ENV *, db_timespec *, db_timeout_t));
+int __clock_expired __P((ENV *, db_timespec *, db_timespec *));
+int __crypto_region_init __P((ENV *));
+int __db_isbigendian __P((void));
+int __db_byteorder __P((ENV *, int));
+u_int32_t __db_compress_count_int __P((u_int64_t));
+int __db_compress_int __P((u_int8_t *, u_int64_t));
+u_int32_t __db_decompress_count_int __P((const u_int8_t *));
+int __db_decompress_int __P((const u_int8_t *, u_int64_t *));
+int __db_decompress_int32 __P((const u_int8_t *, u_int32_t *));
+int __db_fchk __P((ENV *, const char *, u_int32_t, u_int32_t));
+int __db_fcchk __P((ENV *, const char *, u_int32_t, u_int32_t, u_int32_t));
+int __db_ferr __P((const ENV *, const char *, int));
+int __db_fnl __P((const ENV *, const char *));
+int __db_pgerr __P((DB *, db_pgno_t, int));
+int __db_pgfmt __P((ENV *, db_pgno_t));
+#ifdef DIAGNOSTIC
+void __db_assert __P((ENV *, const char *, const char *, int));
+#endif
+int __env_panic_msg __P((ENV *));
+int __env_panic __P((ENV *, int));
+char *__db_unknown_error __P((int));
+void __db_syserr __P((const ENV *, int, const char *, ...)) __attribute__ ((__format__ (__printf__, 3, 4)));
+void __db_err __P((const ENV *, int, const char *, ...)) __attribute__ ((__format__ (__printf__, 3, 4)));
+void __db_errx __P((const ENV *, const char *, ...)) __attribute__ ((__format__ (__printf__, 2, 3)));
+void __db_errcall __P((const DB_ENV *, int, db_error_set_t, const char *, va_list));
+void __db_errfile __P((const DB_ENV *, int, db_error_set_t, const char *, va_list));
+void __db_msgadd __P((ENV *, DB_MSGBUF *, const char *, ...)) __attribute__ ((__format__ (__printf__, 3, 4)));
+void __db_msgadd_ap __P((ENV *, DB_MSGBUF *, const char *, va_list));
+void __db_msg __P((const ENV *, const char *, ...)) __attribute__ ((__format__ (__printf__, 2, 3)));
+void __db_repmsg __P((const ENV *, const char *, ...)) __attribute__ ((__format__ (__printf__, 2, 3)));
+int __db_unknown_flag __P((ENV *, char *, u_int32_t));
+int __db_unknown_type __P((ENV *, char *, DBTYPE));
+int __db_unknown_path __P((ENV *, char *));
+int __db_check_txn __P((DB *, DB_TXN *, DB_LOCKER *, int));
+int __db_txn_deadlock_err __P((ENV *, DB_TXN *));
+int __db_not_txn_env __P((ENV *));
+int __db_rec_toobig __P((ENV *, u_int32_t, u_int32_t));
+int __db_rec_repl __P((ENV *, u_int32_t, u_int32_t));
+int __dbc_logging __P((DBC *));
+int __db_check_lsn __P((ENV *, DB_LSN *, DB_LSN *));
+int __db_rdonly __P((const ENV *, const char *));
+int __db_space_err __P((const DB *));
+int __db_failed __P((const ENV *, const char *, pid_t, db_threadid_t));
+int __db_getlong __P((DB_ENV *, const char *, char *, long, long, long *));
+int __db_getulong __P((DB_ENV *, const char *, char *, u_long, u_long, u_long *));
+void __db_idspace __P((u_int32_t *, int, u_int32_t *, u_int32_t *));
+u_int32_t __db_log2 __P((u_int32_t));
+u_int32_t __db_tablesize __P((u_int32_t));
+void __db_hashinit __P((void *, u_int32_t));
+int __dbt_usercopy __P((ENV *, DBT *));
+void __dbt_userfree __P((ENV *, DBT *, DBT *, DBT *));
+int __db_mkpath __P((ENV *, const char *));
+u_int32_t __db_openflags __P((int));
+int __db_util_arg __P((char *, char *, int *, char ***));
+int __db_util_cache __P((DB *, u_int32_t *, int *));
+int __db_util_logset __P((const char *, char *));
+void __db_util_siginit __P((void));
+int __db_util_interrupted __P((void));
+void __db_util_sigresend __P((void));
+int __db_zero_fill __P((ENV *, DB_FH *));
+int __db_zero_extend __P((ENV *, DB_FH *, db_pgno_t, db_pgno_t, u_int32_t));
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_common_ext_h_ */
diff --git a/src/dbinc_auto/crdel_auto.h b/src/dbinc_auto/crdel_auto.h
new file mode 100644
index 00000000..86a60549
--- /dev/null
+++ b/src/dbinc_auto/crdel_auto.h
@@ -0,0 +1,127 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#ifndef __crdel_AUTO_H
+#define __crdel_AUTO_H
+#include "dbinc/log.h"
+#define DB___crdel_metasub 142
+typedef struct ___crdel_metasub_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ DBT page;
+ DB_LSN lsn;
+} __crdel_metasub_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __crdel_metasub_desc[];
+static inline int
+__crdel_metasub_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_pgno_t pgno, const DBT *page, DB_LSN * lsn)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___crdel_metasub, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + LOG_DBT_SIZE(page) +
+ sizeof(*lsn),
+ __crdel_metasub_desc, pgno, page, lsn));
+}
+
+static inline int __crdel_metasub_read(ENV *env,
+ DB **dbpp, void *td, void *data, __crdel_metasub_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __crdel_metasub_desc, sizeof(__crdel_metasub_args), (void**)arg));
+}
+#define DB___crdel_inmem_create 138
+typedef struct ___crdel_inmem_create_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ DBT name;
+ DBT fid;
+ u_int32_t pgsize;
+} __crdel_inmem_create_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __crdel_inmem_create_desc[];
+static inline int
+__crdel_inmem_create_log(ENV *env, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags,
+ int32_t fileid, const DBT *name, const DBT *fid, u_int32_t pgsize)
+{
+ return (__log_put_record(env, NULL, txnp, ret_lsnp,
+ flags, DB___crdel_inmem_create, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + LOG_DBT_SIZE(name) + LOG_DBT_SIZE(fid) +
+ sizeof(u_int32_t),
+ __crdel_inmem_create_desc,
+ fileid, name, fid, pgsize));
+}
+
+static inline int __crdel_inmem_create_read(ENV *env,
+ void *data, __crdel_inmem_create_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __crdel_inmem_create_desc, sizeof(__crdel_inmem_create_args), (void**)arg));
+}
+#define DB___crdel_inmem_rename 139
+typedef struct ___crdel_inmem_rename_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ DBT oldname;
+ DBT newname;
+ DBT fid;
+} __crdel_inmem_rename_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __crdel_inmem_rename_desc[];
+static inline int
+__crdel_inmem_rename_log(ENV *env, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags,
+ const DBT *oldname, const DBT *newname, const DBT *fid)
+{
+ return (__log_put_record(env, NULL, txnp, ret_lsnp,
+ flags, DB___crdel_inmem_rename, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ LOG_DBT_SIZE(oldname) + LOG_DBT_SIZE(newname) + LOG_DBT_SIZE(fid),
+ __crdel_inmem_rename_desc,
+ oldname, newname, fid));
+}
+
+static inline int __crdel_inmem_rename_read(ENV *env,
+ void *data, __crdel_inmem_rename_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __crdel_inmem_rename_desc, sizeof(__crdel_inmem_rename_args), (void**)arg));
+}
+#define DB___crdel_inmem_remove 140
+typedef struct ___crdel_inmem_remove_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ DBT name;
+ DBT fid;
+} __crdel_inmem_remove_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __crdel_inmem_remove_desc[];
+static inline int
+__crdel_inmem_remove_log(ENV *env, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags,
+ const DBT *name, const DBT *fid)
+{
+ return (__log_put_record(env, NULL, txnp, ret_lsnp,
+ flags, DB___crdel_inmem_remove, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ LOG_DBT_SIZE(name) + LOG_DBT_SIZE(fid),
+ __crdel_inmem_remove_desc,
+ name, fid));
+}
+
+static inline int __crdel_inmem_remove_read(ENV *env,
+ void *data, __crdel_inmem_remove_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __crdel_inmem_remove_desc, sizeof(__crdel_inmem_remove_args), (void**)arg));
+}
+#endif
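Each record type in the header above is generated as a matched pair: a _log writer used at run time and a _read decoder used by recovery and print code. A minimal recovery-side sketch, assuming the usual Berkeley DB convention that the decoder allocates the args struct and the caller releases it with __os_free; the surrounding variables (env, dbtp) and the placement are illustrative, not part of the header:

    __crdel_metasub_args *argp;
    DB *file_dbp;
    int ret;

    /* Decode the raw log record into a freshly allocated struct. */
    if ((ret = __crdel_metasub_read(env,
        &file_dbp, NULL, dbtp->data, &argp)) != 0)
            return (ret);
    /* ... redo or undo the metadata write via argp->pgno / argp->page ... */
    __os_free(env, argp);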
diff --git a/src/dbinc_auto/crypto_ext.h b/src/dbinc_auto/crypto_ext.h
new file mode 100644
index 00000000..cd7113d7
--- /dev/null
+++ b/src/dbinc_auto/crypto_ext.h
@@ -0,0 +1,38 @@
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _crypto_ext_h_
+#define _crypto_ext_h_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+int __aes_setup __P((ENV *, DB_CIPHER *));
+u_int __aes_adj_size __P((size_t));
+int __aes_close __P((ENV *, void *));
+int __aes_decrypt __P((ENV *, void *, void *, u_int8_t *, size_t));
+int __aes_encrypt __P((ENV *, void *, void *, u_int8_t *, size_t));
+int __aes_init __P((ENV *, DB_CIPHER *));
+int __crypto_env_close __P((ENV *));
+int __crypto_env_refresh __P((ENV *));
+int __crypto_algsetup __P((ENV *, DB_CIPHER *, u_int32_t, int));
+int __crypto_decrypt_meta __P((ENV *, DB *, u_int8_t *, int));
+int __crypto_set_passwd __P((ENV *, ENV *));
+int __db_generate_iv __P((ENV *, u_int32_t *));
+int __db_rijndaelKeySetupEnc __P((u32 *, const u8 *, int));
+int __db_rijndaelKeySetupDec __P((u32 *, const u8 *, int));
+void __db_rijndaelEncrypt __P((u32 *, int, const u8 *, u8 *));
+void __db_rijndaelDecrypt __P((u32 *, int, const u8 *, u8 *));
+void __db_rijndaelEncryptRound __P((const u32 *, int, u8 *, int));
+void __db_rijndaelDecryptRound __P((const u32 *, int, u8 *, int));
+int __db_makeKey __P((keyInstance *, int, int, char *));
+int __db_cipherInit __P((cipherInstance *, int, char *));
+int __db_blockEncrypt __P((cipherInstance *, keyInstance *, u_int8_t *, size_t, u_int8_t *));
+int __db_padEncrypt __P((cipherInstance *, keyInstance *, u_int8_t *, int, u_int8_t *));
+int __db_blockDecrypt __P((cipherInstance *, keyInstance *, u_int8_t *, size_t, u_int8_t *));
+int __db_padDecrypt __P((cipherInstance *, keyInstance *, u_int8_t *, int, u_int8_t *));
+int __db_cipherUpdateRounds __P((cipherInstance *, keyInstance *, u_int8_t *, int, u_int8_t *, int));
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_crypto_ext_h_ */
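The prototypes above expose two layers: the __aes_* functions that plug into the DB_CIPHER method table, and the __db_rijndael* / __db_block* functions adapted from the reference Rijndael implementation. A sketch of driving the lower layer directly, assuming the reference API's conventions (key and data lengths in bits, TRUE on success); the constants, buffers, and error choices here are assumptions, and real callers go through the table that __aes_init fills in:

    keyInstance ki;
    cipherInstance ci;

    /* 128-bit key, CBC mode, per the reference rijndael-api-fst. */
    if (__db_makeKey(&ki, DIR_ENCRYPT, 128, keymaterial) != TRUE)
            return (EINVAL);
    if (__db_cipherInit(&ci, MODE_CBC, iv) != TRUE)
            return (EINVAL);
    /* The length argument is in bits in the reference API. */
    (void)__db_blockEncrypt(&ci, &ki, plaintext, nbytes * 8, ciphertext);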
diff --git a/src/dbinc_auto/db_auto.h b/src/dbinc_auto/db_auto.h
new file mode 100644
index 00000000..04e2f465
--- /dev/null
+++ b/src/dbinc_auto/db_auto.h
@@ -0,0 +1,666 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#ifndef __db_AUTO_H
+#define __db_AUTO_H
+#include "dbinc/log.h"
+#define DB___db_addrem 41
+typedef struct ___db_addrem_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ u_int32_t opcode;
+ int32_t fileid;
+ db_pgno_t pgno;
+ u_int32_t indx;
+ u_int32_t nbytes;
+ DBT hdr;
+ DBT dbt;
+ DB_LSN pagelsn;
+} __db_addrem_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __db_addrem_desc[];
+static inline int
+__db_addrem_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags,
+ u_int32_t opcode, db_pgno_t pgno, u_int32_t indx, u_int32_t nbytes,
+ const DBT *hdr, const DBT *dbt, DB_LSN * pagelsn)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___db_addrem, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + LOG_DBT_SIZE(hdr) +
+ LOG_DBT_SIZE(dbt) + sizeof(*pagelsn),
+ __db_addrem_desc,
+ opcode, pgno, indx, nbytes, hdr, dbt, pagelsn));
+}
+
+static inline int __db_addrem_read(ENV *env,
+ DB **dbpp, void *td, void *data, __db_addrem_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __db_addrem_desc, sizeof(__db_addrem_args), (void**)arg));
+}
+#define DB___db_addrem_42 41
+typedef struct ___db_addrem_42_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ u_int32_t opcode;
+ int32_t fileid;
+ db_pgno_t pgno;
+ u_int32_t indx;
+ u_int32_t nbytes;
+ DBT hdr;
+ DBT dbt;
+ DB_LSN pagelsn;
+} __db_addrem_42_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __db_addrem_42_desc[];
+static inline int __db_addrem_42_read(ENV *env,
+ DB **dbpp, void *td, void *data, __db_addrem_42_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __db_addrem_42_desc, sizeof(__db_addrem_42_args), (void**)arg));
+}
+#define DB___db_big 43
+typedef struct ___db_big_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ u_int32_t opcode;
+ int32_t fileid;
+ db_pgno_t pgno;
+ db_pgno_t prev_pgno;
+ db_pgno_t next_pgno;
+ DBT dbt;
+ DB_LSN pagelsn;
+ DB_LSN prevlsn;
+ DB_LSN nextlsn;
+} __db_big_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __db_big_desc[];
+static inline int
+__db_big_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags,
+ u_int32_t opcode, db_pgno_t pgno, db_pgno_t prev_pgno, db_pgno_t next_pgno,
+ const DBT *dbt, DB_LSN * pagelsn, DB_LSN * prevlsn, DB_LSN * nextlsn)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___db_big, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + LOG_DBT_SIZE(dbt) +
+ sizeof(*pagelsn) + sizeof(*prevlsn) + sizeof(*nextlsn),
+ __db_big_desc,
+ opcode, pgno, prev_pgno, next_pgno, dbt, pagelsn, prevlsn,
+ nextlsn));
+}
+
+static inline int __db_big_read(ENV *env,
+ DB **dbpp, void *td, void *data, __db_big_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __db_big_desc, sizeof(__db_big_args), (void**)arg));
+}
+#define DB___db_big_42 43
+typedef struct ___db_big_42_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ u_int32_t opcode;
+ int32_t fileid;
+ db_pgno_t pgno;
+ db_pgno_t prev_pgno;
+ db_pgno_t next_pgno;
+ DBT dbt;
+ DB_LSN pagelsn;
+ DB_LSN prevlsn;
+ DB_LSN nextlsn;
+} __db_big_42_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __db_big_42_desc[];
+static inline int __db_big_42_read(ENV *env,
+ DB **dbpp, void *td, void *data, __db_big_42_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __db_big_42_desc, sizeof(__db_big_42_args), (void**)arg));
+}
+#define DB___db_ovref 44
+typedef struct ___db_ovref_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ int32_t adjust;
+ DB_LSN lsn;
+} __db_ovref_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __db_ovref_desc[];
+static inline int
+__db_ovref_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_pgno_t pgno, int32_t adjust, DB_LSN * lsn)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___db_ovref, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ sizeof(*lsn),
+ __db_ovref_desc, pgno, adjust, lsn));
+}
+
+static inline int __db_ovref_read(ENV *env,
+ DB **dbpp, void *td, void *data, __db_ovref_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __db_ovref_desc, sizeof(__db_ovref_args), (void**)arg));
+}
+#define DB___db_relink_42 45
+typedef struct ___db_relink_42_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ u_int32_t opcode;
+ int32_t fileid;
+ db_pgno_t pgno;
+ DB_LSN lsn;
+ db_pgno_t prev;
+ DB_LSN lsn_prev;
+ db_pgno_t next;
+ DB_LSN lsn_next;
+} __db_relink_42_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __db_relink_42_desc[];
+static inline int __db_relink_42_read(ENV *env,
+ DB **dbpp, void *td, void *data, __db_relink_42_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __db_relink_42_desc, sizeof(__db_relink_42_args), (void**)arg));
+}
+#define DB___db_debug 47
+typedef struct ___db_debug_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ DBT op;
+ int32_t fileid;
+ DBT key;
+ DBT data;
+ u_int32_t arg_flags;
+} __db_debug_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __db_debug_desc[];
+static inline int
+__db_debug_log(ENV *env, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags,
+ const DBT *op, int32_t fileid, const DBT *key, const DBT *data, u_int32_t arg_flags)
+{
+ return (__log_put_record(env, NULL, txnp, ret_lsnp,
+ flags, DB___db_debug, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ LOG_DBT_SIZE(op) + sizeof(u_int32_t) + LOG_DBT_SIZE(key) +
+ LOG_DBT_SIZE(data) + sizeof(u_int32_t),
+ __db_debug_desc,
+ op, fileid, key, data, arg_flags));
+}
+
+static inline int __db_debug_read(ENV *env,
+ void *data, __db_debug_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __db_debug_desc, sizeof(__db_debug_args), (void**)arg));
+}
+#define DB___db_noop 48
+typedef struct ___db_noop_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ DB_LSN prevlsn;
+} __db_noop_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __db_noop_desc[];
+static inline int
+__db_noop_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_pgno_t pgno, DB_LSN * prevlsn)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___db_noop, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(*prevlsn),
+ __db_noop_desc, pgno, prevlsn));
+}
+
+static inline int __db_noop_read(ENV *env,
+ DB **dbpp, void *td, void *data, __db_noop_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __db_noop_desc, sizeof(__db_noop_args), (void**)arg));
+}
+#define DB___db_pg_alloc_42 49
+typedef struct ___db_pg_alloc_42_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ DB_LSN meta_lsn;
+ db_pgno_t meta_pgno;
+ DB_LSN page_lsn;
+ db_pgno_t pgno;
+ u_int32_t ptype;
+ db_pgno_t next;
+} __db_pg_alloc_42_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __db_pg_alloc_42_desc[];
+static inline int __db_pg_alloc_42_read(ENV *env,
+ DB **dbpp, void *td, void *data, __db_pg_alloc_42_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __db_pg_alloc_42_desc, sizeof(__db_pg_alloc_42_args), (void**)arg));
+}
+#define DB___db_pg_alloc 49
+typedef struct ___db_pg_alloc_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ DB_LSN meta_lsn;
+ db_pgno_t meta_pgno;
+ DB_LSN page_lsn;
+ db_pgno_t pgno;
+ u_int32_t ptype;
+ db_pgno_t next;
+ db_pgno_t last_pgno;
+} __db_pg_alloc_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __db_pg_alloc_desc[];
+static inline int
+__db_pg_alloc_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, DB_LSN * meta_lsn, db_pgno_t meta_pgno, DB_LSN * page_lsn, db_pgno_t pgno,
+ u_int32_t ptype, db_pgno_t next, db_pgno_t last_pgno)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___db_pg_alloc, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(*meta_lsn) + sizeof(u_int32_t) +
+ sizeof(*page_lsn) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ sizeof(u_int32_t) + sizeof(u_int32_t),
+ __db_pg_alloc_desc, meta_lsn, meta_pgno, page_lsn, pgno, ptype, next, last_pgno));
+}
+
+static inline int __db_pg_alloc_read(ENV *env,
+ DB **dbpp, void *td, void *data, __db_pg_alloc_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __db_pg_alloc_desc, sizeof(__db_pg_alloc_args), (void**)arg));
+}
+#define DB___db_pg_free_42 50
+typedef struct ___db_pg_free_42_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ DB_LSN meta_lsn;
+ db_pgno_t meta_pgno;
+ DBT header;
+ db_pgno_t next;
+} __db_pg_free_42_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __db_pg_free_42_desc[];
+static inline int __db_pg_free_42_read(ENV *env,
+ DB **dbpp, void *td, void *data, __db_pg_free_42_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __db_pg_free_42_desc, sizeof(__db_pg_free_42_args), (void**)arg));
+}
+#define DB___db_pg_free 50
+typedef struct ___db_pg_free_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ DB_LSN meta_lsn;
+ db_pgno_t meta_pgno;
+ DBT header;
+ db_pgno_t next;
+ db_pgno_t last_pgno;
+} __db_pg_free_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __db_pg_free_desc[];
+static inline int
+__db_pg_free_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_pgno_t pgno, DB_LSN * meta_lsn, db_pgno_t meta_pgno, const DBT *header,
+ db_pgno_t next, db_pgno_t last_pgno)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___db_pg_free, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(*meta_lsn) +
+ sizeof(u_int32_t) + LOG_DBT_SIZE(header) + sizeof(u_int32_t) +
+ sizeof(u_int32_t),
+ __db_pg_free_desc, pgno, meta_lsn, meta_pgno, header, next, last_pgno));
+}
+
+static inline int __db_pg_free_read(ENV *env,
+ DB **dbpp, void *td, void *data, __db_pg_free_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __db_pg_free_desc, sizeof(__db_pg_free_args), (void**)arg));
+}
+#define DB___db_cksum 51
+typedef struct ___db_cksum_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+} __db_cksum_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __db_cksum_desc[];
+static inline int
+__db_cksum_log(ENV *env, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags)
+{
+ return (__log_put_record(env, NULL, txnp, ret_lsnp,
+ flags, DB___db_cksum, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN),
+ __db_cksum_desc));
+}
+
+static inline int __db_cksum_read(ENV *env,
+ void *data, __db_cksum_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __db_cksum_desc, sizeof(__db_cksum_args), (void**)arg));
+}
+#define DB___db_pg_freedata_42 52
+typedef struct ___db_pg_freedata_42_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ DB_LSN meta_lsn;
+ db_pgno_t meta_pgno;
+ DBT header;
+ db_pgno_t next;
+ DBT data;
+} __db_pg_freedata_42_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __db_pg_freedata_42_desc[];
+static inline int __db_pg_freedata_42_read(ENV *env,
+ DB **dbpp, void *td, void *data, __db_pg_freedata_42_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __db_pg_freedata_42_desc, sizeof(__db_pg_freedata_42_args), (void**)arg));
+}
+#define DB___db_pg_freedata 52
+typedef struct ___db_pg_freedata_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ DB_LSN meta_lsn;
+ db_pgno_t meta_pgno;
+ DBT header;
+ db_pgno_t next;
+ db_pgno_t last_pgno;
+ DBT data;
+} __db_pg_freedata_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __db_pg_freedata_desc[];
+static inline int
+__db_pg_freedata_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_pgno_t pgno, DB_LSN * meta_lsn, db_pgno_t meta_pgno, const DBT *header,
+ db_pgno_t next, db_pgno_t last_pgno, const DBT *data)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___db_pg_freedata, 1,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(*meta_lsn) +
+ sizeof(u_int32_t) + LOG_DBT_SIZE(header) + sizeof(u_int32_t) +
+ sizeof(u_int32_t) + LOG_DBT_SIZE(data),
+ __db_pg_freedata_desc, pgno, meta_lsn, meta_pgno, header, next, last_pgno, data));
+}
+
+static inline int __db_pg_freedata_read(ENV *env,
+ DB **dbpp, void *td, void *data, __db_pg_freedata_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __db_pg_freedata_desc, sizeof(__db_pg_freedata_args), (void**)arg));
+}
+#define DB___db_pg_init 60
+typedef struct ___db_pg_init_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ DBT header;
+ DBT data;
+} __db_pg_init_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __db_pg_init_desc[];
+static inline int
+__db_pg_init_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_pgno_t pgno, const DBT *header, const DBT *data)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___db_pg_init, 1,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + LOG_DBT_SIZE(header) +
+ LOG_DBT_SIZE(data),
+ __db_pg_init_desc, pgno, header, data));
+}
+
+static inline int __db_pg_init_read(ENV *env,
+ DB **dbpp, void *td, void *data, __db_pg_init_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __db_pg_init_desc, sizeof(__db_pg_init_args), (void**)arg));
+}
+#define DB___db_pg_sort_44 61
+typedef struct ___db_pg_sort_44_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t meta;
+ DB_LSN meta_lsn;
+ db_pgno_t last_free;
+ DB_LSN last_lsn;
+ db_pgno_t last_pgno;
+ DBT list;
+} __db_pg_sort_44_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __db_pg_sort_44_desc[];
+static inline int __db_pg_sort_44_read(ENV *env,
+ DB **dbpp, void *td, void *data, __db_pg_sort_44_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __db_pg_sort_44_desc, sizeof(__db_pg_sort_44_args), (void**)arg));
+}
+#define DB___db_pg_trunc 66
+typedef struct ___db_pg_trunc_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t meta;
+ DB_LSN meta_lsn;
+ db_pgno_t last_free;
+ DB_LSN last_lsn;
+ db_pgno_t next_free;
+ db_pgno_t last_pgno;
+ DBT list;
+} __db_pg_trunc_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __db_pg_trunc_desc[];
+static inline int
+__db_pg_trunc_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_pgno_t meta, DB_LSN * meta_lsn, db_pgno_t last_free, DB_LSN * last_lsn,
+ db_pgno_t next_free, db_pgno_t last_pgno, const DBT *list)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___db_pg_trunc, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(*meta_lsn) +
+ sizeof(u_int32_t) + sizeof(*last_lsn) + sizeof(u_int32_t) +
+ sizeof(u_int32_t) + LOG_DBT_SIZE(list),
+ __db_pg_trunc_desc, meta, meta_lsn, last_free, last_lsn, next_free, last_pgno, list));
+}
+
+static inline int __db_pg_trunc_read(ENV *env,
+ DB **dbpp, void *td, void *data, __db_pg_trunc_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __db_pg_trunc_desc, sizeof(__db_pg_trunc_args), (void**)arg));
+}
+#define DB___db_realloc 36
+typedef struct ___db_realloc_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t prev_pgno;
+ DB_LSN page_lsn;
+ db_pgno_t next_free;
+ u_int32_t ptype;
+ DBT list;
+} __db_realloc_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __db_realloc_desc[];
+static inline int
+__db_realloc_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_pgno_t prev_pgno, DB_LSN * page_lsn, db_pgno_t next_free, u_int32_t ptype,
+ const DBT *list)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___db_realloc, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(*page_lsn) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + LOG_DBT_SIZE(list),
+ __db_realloc_desc, prev_pgno, page_lsn, next_free, ptype, list));
+}
+
+static inline int __db_realloc_read(ENV *env,
+ DB **dbpp, void *td, void *data, __db_realloc_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __db_realloc_desc, sizeof(__db_realloc_args), (void**)arg));
+}
+#define DB___db_relink 147
+typedef struct ___db_relink_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ db_pgno_t new_pgno;
+ db_pgno_t prev_pgno;
+ DB_LSN lsn_prev;
+ db_pgno_t next_pgno;
+ DB_LSN lsn_next;
+} __db_relink_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __db_relink_desc[];
+static inline int
+__db_relink_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_pgno_t pgno, db_pgno_t new_pgno, db_pgno_t prev_pgno, DB_LSN * lsn_prev,
+ db_pgno_t next_pgno, DB_LSN * lsn_next)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___db_relink, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ sizeof(u_int32_t) + sizeof(*lsn_prev) + sizeof(u_int32_t) +
+ sizeof(*lsn_next),
+ __db_relink_desc, pgno, new_pgno, prev_pgno, lsn_prev, next_pgno, lsn_next));
+}
+
+static inline int __db_relink_read(ENV *env,
+ DB **dbpp, void *td, void *data, __db_relink_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __db_relink_desc, sizeof(__db_relink_args), (void**)arg));
+}
+#define DB___db_merge 148
+typedef struct ___db_merge_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ DB_LSN lsn;
+ db_pgno_t npgno;
+ DB_LSN nlsn;
+ DBT hdr;
+ DBT data;
+ int32_t pg_copy;
+} __db_merge_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __db_merge_desc[];
+static inline int
+__db_merge_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_pgno_t pgno, DB_LSN * lsn, db_pgno_t npgno, DB_LSN * nlsn,
+ const DBT *hdr, const DBT *data, int32_t pg_copy)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___db_merge, 1,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(*lsn) +
+ sizeof(u_int32_t) + sizeof(*nlsn) + LOG_DBT_SIZE(hdr) +
+ LOG_DBT_SIZE(data) + sizeof(u_int32_t),
+ __db_merge_desc, pgno, lsn, npgno, nlsn, hdr, data, pg_copy));
+}
+
+static inline int __db_merge_read(ENV *env,
+ DB **dbpp, void *td, void *data, __db_merge_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __db_merge_desc, sizeof(__db_merge_args), (void**)arg));
+}
+#define DB___db_pgno 149
+typedef struct ___db_pgno_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ DB_LSN lsn;
+ u_int32_t indx;
+ db_pgno_t opgno;
+ db_pgno_t npgno;
+} __db_pgno_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __db_pgno_desc[];
+static inline int
+__db_pgno_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_pgno_t pgno, DB_LSN * lsn, u_int32_t indx, db_pgno_t opgno,
+ db_pgno_t npgno)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___db_pgno, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(*lsn) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t),
+ __db_pgno_desc, pgno, lsn, indx, opgno, npgno));
+}
+
+static inline int __db_pgno_read(ENV *env,
+ DB **dbpp, void *td, void *data, __db_pgno_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __db_pgno_desc, sizeof(__db_pgno_args), (void**)arg));
+}
+#endif
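Two conventions in the header above are worth noting. First, superseded log formats keep their record number but lose their writer: DB___db_addrem_42 and DB___db_addrem are both 41, so recovery can still decode records written by an older release while only the current layout can be logged. Second, the writers implement write-ahead logging: callers log first, then stamp the page with the returned LSN. A sketch of that call pattern, where dbc, h, ret, and err are hypothetical context from the surrounding operation:

    if (DBC_LOGGING(dbc)) {
            /* Log the reference-count change before touching the page.
             * Passing &LSN(h) for both LSN arguments follows the common
             * idiom: the old page LSN is serialized into the record
             * before the new LSN is stored back through ret_lsnp. */
            if ((ret = __db_ovref_log(dbp, dbc->txn, &LSN(h), 0,
                PGNO(h), 1, &LSN(h))) != 0)
                    goto err;
    } else
            LSN_NOT_LOGGED(LSN(h));
    /* The page's LSN now covers this update; safe to modify the page. */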
diff --git a/src/dbinc_auto/db_ext.h b/src/dbinc_auto/db_ext.h
new file mode 100644
index 00000000..de2a6ce4
--- /dev/null
+++ b/src/dbinc_auto/db_ext.h
@@ -0,0 +1,346 @@
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _db_ext_h_
+#define _db_ext_h_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+int __crdel_init_recover __P((ENV *, DB_DISTAB *));
+int __crdel_metasub_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __crdel_inmem_create_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __crdel_inmem_rename_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __crdel_inmem_remove_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __crdel_init_print __P((ENV *, DB_DISTAB *));
+int __crdel_metasub_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __crdel_inmem_create_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __crdel_inmem_rename_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __crdel_inmem_remove_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_master_open __P((DB *, DB_THREAD_INFO *, DB_TXN *, const char *, u_int32_t, int, DB **));
+int __db_master_update __P((DB *, DB *, DB_THREAD_INFO *, DB_TXN *, const char *, DBTYPE, mu_action, const char *, u_int32_t));
+int __env_dbreg_setup __P((DB *, DB_TXN *, const char *, const char *, u_int32_t));
+int __env_setup __P((DB *, DB_TXN *, const char *, const char *, u_int32_t, u_int32_t));
+int __env_mpool __P((DB *, const char *, u_int32_t));
+int __db_close __P((DB *, DB_TXN *, u_int32_t));
+int __db_refresh __P((DB *, DB_TXN *, u_int32_t, int *, int));
+int __db_log_page __P((DB *, DB_TXN *, DB_LSN *, db_pgno_t, PAGE *));
+int __db_walk_cursors __P((DB *, DBC *, int (*) __P((DBC *, DBC *, u_int32_t *, db_pgno_t, u_int32_t, void *)), u_int32_t *, db_pgno_t, u_int32_t, void *));
+int __db_backup_name __P((ENV *, const char *, DB_TXN *, char **));
+#ifdef CONFIG_TEST
+int __db_testcopy __P((ENV *, DB *, const char *));
+#endif
+int __db_testdocopy __P((ENV *, const char *));
+int __db_cursor_int __P((DB *, DB_THREAD_INFO *, DB_TXN *, DBTYPE, db_pgno_t, int, DB_LOCKER *, DBC **));
+int __db_put __P((DB *, DB_THREAD_INFO *, DB_TXN *, DBT *, DBT *, u_int32_t));
+int __db_del __P((DB *, DB_THREAD_INFO *, DB_TXN *, DBT *, u_int32_t));
+int __db_sync __P((DB *));
+int __db_associate __P((DB *, DB_THREAD_INFO *, DB_TXN *, DB *, int (*)(DB *, const DBT *, const DBT *, DBT *), u_int32_t));
+int __db_secondary_close __P((DB *, u_int32_t));
+int __db_associate_foreign __P((DB *, DB *, int (*)(DB *, const DBT *, DBT *, const DBT *, int *), u_int32_t));
+int __db_init_recover __P((ENV *, DB_DISTAB *));
+int __db_addrem_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_addrem_42_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_big_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_big_42_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_ovref_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_relink_42_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_debug_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_noop_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_alloc_42_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_alloc_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_free_42_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_free_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_cksum_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_freedata_42_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_freedata_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_init_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_sort_44_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_trunc_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_realloc_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_relink_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_merge_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pgno_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_init_print __P((ENV *, DB_DISTAB *));
+int __db_dbbackup_pp __P((DB_ENV *, const char *, const char *, u_int32_t));
+int __db_dbbackup __P((DB_ENV *, DB_THREAD_INFO *, const char *, const char *, u_int32_t));
+int __db_backup __P((DB_ENV *, const char *, u_int32_t));
+int __dbc_close __P((DBC *));
+int __dbc_destroy __P((DBC *));
+int __dbc_cmp __P((DBC *, DBC *, int *));
+int __dbc_count __P((DBC *, db_recno_t *));
+int __dbc_del __P((DBC *, u_int32_t));
+int __dbc_idel __P((DBC *, u_int32_t));
+#ifdef HAVE_COMPRESSION
+int __dbc_bulk_del __P((DBC *, DBT *, u_int32_t));
+#endif
+int __dbc_dup __P((DBC *, DBC **, u_int32_t));
+int __dbc_idup __P((DBC *, DBC **, u_int32_t));
+int __dbc_newopd __P((DBC *, db_pgno_t, DBC *, DBC **));
+int __dbc_get __P((DBC *, DBT *, DBT *, u_int32_t));
+int __dbc_iget __P((DBC *, DBT *, DBT *, u_int32_t));
+int __dbc_put __P((DBC *, DBT *, DBT *, u_int32_t));
+int __dbc_iput __P((DBC *, DBT *, DBT *, u_int32_t));
+int __db_duperr __P((DB *, u_int32_t));
+int __dbc_cleanup __P((DBC *, DBC *, int));
+int __dbc_secondary_get_pp __P((DBC *, DBT *, DBT *, u_int32_t));
+int __dbc_pget __P((DBC *, DBT *, DBT *, DBT *, u_int32_t));
+int __dbc_del_primary __P((DBC *));
+int __db_s_first __P((DB *, DB **));
+int __db_s_next __P((DB **, DB_TXN *));
+int __db_s_done __P((DB *, DB_TXN *));
+int __db_buildpartial __P((DB *, DBT *, DBT *, DBT *));
+u_int32_t __db_partsize __P((u_int32_t, DBT *));
+#ifdef DIAGNOSTIC
+void __db_check_skeyset __P((DB *, DBT *));
+#endif
+int __cdsgroup_begin __P((ENV *, DB_TXN **));
+int __cdsgroup_begin_pp __P((DB_ENV *, DB_TXN **));
+int __db_compact_int __P((DB *, DB_THREAD_INFO *, DB_TXN *, DBT *, DBT *, DB_COMPACT *, u_int32_t, DBT *));
+int __db_exchange_page __P((DBC *, PAGE **, PAGE *, db_pgno_t, int));
+int __db_truncate_overflow __P((DBC *, db_pgno_t, PAGE **, DB_COMPACT *));
+int __db_truncate_root __P((DBC *, PAGE *, u_int32_t, db_pgno_t *, u_int32_t));
+int __db_find_free __P((DBC *, u_int32_t, u_int32_t, db_pgno_t, db_pgno_t *));
+int __db_relink __P((DBC *, PAGE *, PAGE *, db_pgno_t));
+int __db_move_metadata __P((DBC *, DBMETA **, DB_COMPACT *));
+int __db_pgin __P((DB_ENV *, db_pgno_t, void *, DBT *));
+int __db_pgout __P((DB_ENV *, db_pgno_t, void *, DBT *));
+int __db_decrypt_pg __P((ENV *, DB *, PAGE *));
+int __db_encrypt_and_checksum_pg __P((ENV *, DB *, PAGE *));
+void __db_metaswap __P((PAGE *));
+int __db_byteswap __P((DB *, db_pgno_t, PAGE *, size_t, int));
+int __db_pageswap __P((ENV *, DB *, void *, size_t, DBT *, int));
+void __db_recordswap __P((u_int32_t, u_int32_t, void *, void *, u_int32_t));
+int __db_dispatch __P((ENV *, DB_DISTAB *, DBT *, DB_LSN *, db_recops, void *));
+int __db_add_recovery __P((DB_ENV *, DB_DISTAB *, int (*)(DB_ENV *, DBT *, DB_LSN *, db_recops), u_int32_t));
+int __db_add_recovery_int __P((ENV *, DB_DISTAB *, int (*)(ENV *, DBT *, DB_LSN *, db_recops, void *), u_int32_t));
+int __db_txnlist_init __P((ENV *, DB_THREAD_INFO *, u_int32_t, u_int32_t, DB_LSN *, DB_TXNHEAD **));
+int __db_txnlist_add __P((ENV *, DB_TXNHEAD *, u_int32_t, u_int32_t, DB_LSN *));
+int __db_txnlist_remove __P((ENV *, DB_TXNHEAD *, u_int32_t));
+void __db_txnlist_ckp __P((ENV *, DB_TXNHEAD *, DB_LSN *));
+void __db_txnlist_end __P((ENV *, DB_TXNHEAD *));
+int __db_txnlist_find __P((ENV *, DB_TXNHEAD *, u_int32_t, u_int32_t *));
+int __db_txnlist_update __P((ENV *, DB_TXNHEAD *, u_int32_t, u_int32_t, DB_LSN *, u_int32_t *, int));
+int __db_txnlist_gen __P((ENV *, DB_TXNHEAD *, int, u_int32_t, u_int32_t));
+int __db_txnlist_lsnadd __P((ENV *, DB_TXNHEAD *, DB_LSN *));
+int __db_txnlist_lsnget __P((ENV *, DB_TXNHEAD *, DB_LSN *, u_int32_t));
+int __db_txnlist_lsninit __P((ENV *, DB_TXNHEAD *, DB_LSN *));
+void __db_txnlist_print __P((DB_TXNHEAD *));
+int __db_ditem_nolog __P((DBC *, PAGE *, u_int32_t, u_int32_t));
+int __db_ditem __P((DBC *, PAGE *, u_int32_t, u_int32_t));
+int __db_pitem_nolog __P((DBC *, PAGE *, u_int32_t, u_int32_t, DBT *, DBT *));
+int __db_pitem __P((DBC *, PAGE *, u_int32_t, u_int32_t, DBT *, DBT *));
+int __db_associate_pp __P((DB *, DB_TXN *, DB *, int (*)(DB *, const DBT *, const DBT *, DBT *), u_int32_t));
+int __db_close_pp __P((DB *, u_int32_t));
+int __db_cursor_pp __P((DB *, DB_TXN *, DBC **, u_int32_t));
+int __db_cursor __P((DB *, DB_THREAD_INFO *, DB_TXN *, DBC **, u_int32_t));
+int __db_del_pp __P((DB *, DB_TXN *, DBT *, u_int32_t));
+int __db_exists __P((DB *, DB_TXN *, DBT *, u_int32_t));
+int __db_fd_pp __P((DB *, int *));
+int __db_get_pp __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
+int __db_get __P((DB *, DB_THREAD_INFO *, DB_TXN *, DBT *, DBT *, u_int32_t));
+int __db_join_pp __P((DB *, DBC **, DBC **, u_int32_t));
+int __db_key_range_pp __P((DB *, DB_TXN *, DBT *, DB_KEY_RANGE *, u_int32_t));
+int __db_open_pp __P((DB *, DB_TXN *, const char *, const char *, DBTYPE, u_int32_t, int));
+int __db_pget_pp __P((DB *, DB_TXN *, DBT *, DBT *, DBT *, u_int32_t));
+int __db_pget __P((DB *, DB_THREAD_INFO *, DB_TXN *, DBT *, DBT *, DBT *, u_int32_t));
+int __db_put_pp __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
+int __db_compact_pp __P((DB *, DB_TXN *, DBT *, DBT *, DB_COMPACT *, u_int32_t, DBT *));
+int __db_associate_foreign_pp __P((DB *, DB *, int (*)(DB *, const DBT *, DBT *, const DBT *, int *), u_int32_t));
+int __db_sync_pp __P((DB *, u_int32_t));
+int __dbc_close_pp __P((DBC *));
+int __dbc_cmp_pp __P((DBC *, DBC *, int*, u_int32_t));
+int __dbc_count_pp __P((DBC *, db_recno_t *, u_int32_t));
+int __dbc_del_pp __P((DBC *, u_int32_t));
+int __dbc_dup_pp __P((DBC *, DBC **, u_int32_t));
+int __dbc_get_pp __P((DBC *, DBT *, DBT *, u_int32_t));
+int __dbc_get_arg __P((DBC *, DBT *, DBT *, u_int32_t));
+int __db_secondary_close_pp __P((DB *, u_int32_t));
+int __dbc_pget_pp __P((DBC *, DBT *, DBT *, DBT *, u_int32_t));
+int __dbc_put_pp __P((DBC *, DBT *, DBT *, u_int32_t));
+int __db_txn_auto_init __P((ENV *, DB_THREAD_INFO *, DB_TXN **));
+int __db_txn_auto_resolve __P((ENV *, DB_TXN *, int, int));
+int __db_join __P((DB *, DBC **, DBC **, u_int32_t));
+int __db_join_close __P((DBC *));
+int __db_secondary_corrupt __P((DB *));
+int __db_new __P((DBC *, u_int32_t, DB_LOCK *, PAGE **));
+int __db_free __P((DBC *, PAGE *, u_int32_t));
+#ifdef HAVE_FTRUNCATE
+void __db_freelist_pos __P((db_pgno_t, db_pgno_t *, u_int32_t, u_int32_t *));
+#endif
+void __db_freelist_sort __P((db_pglist_t *, u_int32_t));
+#ifdef HAVE_FTRUNCATE
+int __db_pg_truncate __P((DBC *, DB_TXN *, db_pglist_t *, DB_COMPACT *, u_int32_t *, db_pgno_t , db_pgno_t *, DB_LSN *, int));
+#endif
+#ifdef HAVE_FTRUNCATE
+int __db_free_truncate __P((DB *, DB_THREAD_INFO *, DB_TXN *, u_int32_t, DB_COMPACT *, db_pglist_t **, u_int32_t *, db_pgno_t *));
+#endif
+int __db_lprint __P((DBC *));
+int __db_lget __P((DBC *, int, db_pgno_t, db_lockmode_t, u_int32_t, DB_LOCK *));
+#ifdef DIAGNOSTIC
+int __db_haslock __P((ENV *, DB_LOCKER *, DB_MPOOLFILE *, db_pgno_t, db_lockmode_t, u_int32_t));
+#endif
+#ifdef DIAGNOSTIC
+int __db_has_pagelock __P((ENV *, DB_LOCKER *, DB_MPOOLFILE *, PAGE *, db_lockmode_t));
+#endif
+int __db_lput __P((DBC *, DB_LOCK *));
+int __db_create_internal __P((DB **, ENV *, u_int32_t));
+int __dbh_am_chk __P((DB *, u_int32_t));
+int __db_get_flags __P((DB *, u_int32_t *));
+int __db_set_flags __P((DB *, u_int32_t));
+int __db_get_lorder __P((DB *, int *));
+int __db_set_lorder __P((DB *, int));
+int __db_set_pagesize __P((DB *, u_int32_t));
+int __db_open __P((DB *, DB_THREAD_INFO *, DB_TXN *, const char *, const char *, DBTYPE, u_int32_t, int, db_pgno_t));
+int __db_get_open_flags __P((DB *, u_int32_t *));
+int __db_new_file __P((DB *, DB_THREAD_INFO *, DB_TXN *, DB_FH *, const char *));
+int __db_init_subdb __P((DB *, DB *, const char *, DB_THREAD_INFO *, DB_TXN *));
+int __db_chk_meta __P((ENV *, DB *, DBMETA *, u_int32_t));
+int __db_meta_setup __P((ENV *, DB *, const char *, DBMETA *, u_int32_t, u_int32_t));
+int __db_reopen __P((DBC *));
+int __db_goff __P((DBC *, DBT *, u_int32_t, db_pgno_t, void **, u_int32_t *));
+int __db_poff __P((DBC *, const DBT *, db_pgno_t *));
+int __db_ovref __P((DBC *, db_pgno_t));
+int __db_doff __P((DBC *, db_pgno_t));
+int __db_moff __P((DBC *, const DBT *, db_pgno_t, u_int32_t, int (*)(DB *, const DBT *, const DBT *), int *));
+int __db_coff __P((DBC *, const DBT *, const DBT *, int (*)(DB *, const DBT *, const DBT *), int *));
+int __db_vrfy_overflow __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, u_int32_t));
+int __db_vrfy_ovfl_structure __P((DB *, VRFY_DBINFO *, db_pgno_t, u_int32_t, u_int32_t));
+int __db_safe_goff __P((DB *, VRFY_DBINFO *, db_pgno_t, DBT *, void *, u_int32_t *, u_int32_t));
+void __db_loadme __P((void));
+int __db_dumptree __P((DB *, DB_TXN *, char *, char *, db_pgno_t, db_pgno_t));
+const FN * __db_get_flags_fn __P((void));
+int __db_prnpage __P((DB *, DB_TXN *, db_pgno_t));
+int __db_prpage __P((DB *, PAGE *, u_int32_t));
+const char * __db_lockmode_to_string __P((db_lockmode_t));
+int __db_dumptree __P((DB *, DB_TXN *, char *, char *, db_pgno_t, db_pgno_t));
+const FN * __db_get_flags_fn __P((void));
+int __db_prpage_int __P((ENV *, DB_MSGBUF *, DB *, char *, PAGE *, u_int32_t, u_int8_t *, u_int32_t));
+void __db_prbytes __P((ENV *, DB_MSGBUF *, u_int8_t *, u_int32_t));
+void __db_prflags __P((ENV *, DB_MSGBUF *, u_int32_t, const FN *, const char *, const char *));
+int __db_name_to_val __P((FN const *, char *));
+const char *__db_pagetype_to_string __P((u_int32_t));
+int __db_dump_pp __P((DB *, const char *, int (*)(void *, const void *), void *, int, int));
+int __db_dump __P((DB *, const char *, int (*)(void *, const void *), void *, int, int));
+int __db_prdbt __P((DBT *, int, const char *, void *, int (*)(void *, const void *), int, int));
+int __db_prheader __P((DB *, const char *, int, int, void *, int (*)(void *, const void *), VRFY_DBINFO *, db_pgno_t));
+int __db_prfooter __P((void *, int (*)(void *, const void *)));
+int __db_pr_callback __P((void *, const void *));
+const char * __db_dbtype_to_string __P((DBTYPE));
+int __db_addrem_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_addrem_42_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_big_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_big_42_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_ovref_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_debug_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_noop_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_alloc_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_free_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_freedata_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_cksum_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_init_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_trunc_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_realloc_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_sort_44_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_alloc_42_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_free_42_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_freedata_42_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_relink_42_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_relink_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_merge_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pgno_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+void __db_pglist_swap __P((u_int32_t, void *));
+void __db_pglist_print __P((ENV *, DB_MSGBUF *, DBT *));
+int __db_traverse_big __P((DBC *, db_pgno_t, int (*)(DBC *, PAGE *, void *, int *), void *));
+int __db_reclaim_callback __P((DBC *, PAGE *, void *, int *));
+int __db_truncate_callback __P((DBC *, PAGE *, void *, int *));
+int __env_dbremove_pp __P((DB_ENV *, DB_TXN *, const char *, const char *, u_int32_t));
+int __db_remove_pp __P((DB *, const char *, const char *, u_int32_t));
+int __db_remove __P((DB *, DB_THREAD_INFO *, DB_TXN *, const char *, const char *, u_int32_t));
+int __db_remove_int __P((DB *, DB_THREAD_INFO *, DB_TXN *, const char *, const char *, u_int32_t));
+int __db_inmem_remove __P((DB *, DB_TXN *, const char *));
+int __env_dbrename_pp __P((DB_ENV *, DB_TXN *, const char *, const char *, const char *, u_int32_t));
+int __db_rename_pp __P((DB *, const char *, const char *, const char *, u_int32_t));
+int __db_rename_int __P((DB *, DB_THREAD_INFO *, DB_TXN *, const char *, const char *, const char *, u_int32_t));
+int __db_ret __P((DBC *, PAGE *, u_int32_t, DBT *, void **, u_int32_t *));
+int __db_retcopy __P((ENV *, DBT *, void *, u_int32_t, void **, u_int32_t *));
+int __env_fileid_reset_pp __P((DB_ENV *, const char *, u_int32_t));
+int __env_fileid_reset __P((ENV *, DB_THREAD_INFO *, const char *, int));
+int __env_lsn_reset_pp __P((DB_ENV *, const char *, u_int32_t));
+int __db_lsn_reset __P((DB_MPOOLFILE *, DB_THREAD_INFO *));
+int __db_compare_both __P((DB *, const DBT *, const DBT *, const DBT *, const DBT *));
+int __db_sort_multiple __P((DB *, DBT *, DBT *, u_int32_t));
+int __db_stat_pp __P((DB *, DB_TXN *, void *, u_int32_t));
+int __db_stat_print_pp __P((DB *, u_int32_t));
+int __db_stat_print __P((DB *, DB_THREAD_INFO *, u_int32_t));
+int __db_truncate_pp __P((DB *, DB_TXN *, u_int32_t *, u_int32_t));
+int __db_truncate __P((DB *, DB_THREAD_INFO *, DB_TXN *, u_int32_t *));
+int __db_upgrade_pp __P((DB *, const char *, u_int32_t));
+int __db_upgrade __P((DB *, const char *, u_int32_t));
+int __db_lastpgno __P((DB *, char *, DB_FH *, db_pgno_t *));
+int __db_31_offdup __P((DB *, char *, DB_FH *, int, db_pgno_t *));
+int __db_verify_pp __P((DB *, const char *, const char *, FILE *, u_int32_t));
+int __db_verify_internal __P((DB *, const char *, const char *, void *, int (*)(void *, const void *), u_int32_t));
+int __db_verify __P((DB *, DB_THREAD_INFO *, const char *, const char *, void *, int (*)(void *, const void *), void *, void *, u_int32_t));
+int __db_vrfy_common __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, u_int32_t));
+int __db_vrfy_datapage __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, u_int32_t));
+int __db_vrfy_meta __P((DB *, VRFY_DBINFO *, DBMETA *, db_pgno_t, u_int32_t));
+void __db_vrfy_struct_feedback __P((DB *, VRFY_DBINFO *));
+int __db_salvage_pg __P((DB *, VRFY_DBINFO *, db_pgno_t, PAGE *, void *, int (*)(void *, const void *), u_int32_t));
+int __db_salvage_leaf __P((DB *, VRFY_DBINFO *, db_pgno_t, PAGE *, void *, int (*)(void *, const void *), u_int32_t));
+int __db_vrfy_inpitem __P((DB *, PAGE *, db_pgno_t, u_int32_t, int, u_int32_t, u_int32_t *, u_int32_t *));
+int __db_vrfy_duptype __P((DB *, VRFY_DBINFO *, db_pgno_t, u_int32_t));
+int __db_salvage_duptree __P((DB *, VRFY_DBINFO *, db_pgno_t, DBT *, void *, int (*)(void *, const void *), u_int32_t));
+int __db_vrfy_dbinfo_create __P((ENV *, DB_THREAD_INFO *, u_int32_t, VRFY_DBINFO **));
+int __db_vrfy_dbinfo_destroy __P((ENV *, VRFY_DBINFO *));
+int __db_vrfy_getpageinfo __P((VRFY_DBINFO *, db_pgno_t, VRFY_PAGEINFO **));
+int __db_vrfy_putpageinfo __P((ENV *, VRFY_DBINFO *, VRFY_PAGEINFO *));
+int __db_vrfy_pgset __P((ENV *, DB_THREAD_INFO *, u_int32_t, DB **));
+int __db_vrfy_pgset_get __P((DB *, DB_THREAD_INFO *, DB_TXN *, db_pgno_t, int *));
+int __db_vrfy_pgset_inc __P((DB *, DB_THREAD_INFO *, DB_TXN *, db_pgno_t));
+int __db_vrfy_pgset_next __P((DBC *, db_pgno_t *));
+int __db_vrfy_childcursor __P((VRFY_DBINFO *, DBC **));
+int __db_vrfy_childput __P((VRFY_DBINFO *, db_pgno_t, VRFY_CHILDINFO *));
+int __db_vrfy_ccset __P((DBC *, db_pgno_t, VRFY_CHILDINFO **));
+int __db_vrfy_ccnext __P((DBC *, VRFY_CHILDINFO **));
+int __db_vrfy_ccclose __P((DBC *));
+int __db_salvage_init __P((VRFY_DBINFO *));
+int __db_salvage_destroy __P((VRFY_DBINFO *));
+int __db_salvage_getnext __P((VRFY_DBINFO *, DBC **, db_pgno_t *, u_int32_t *, int));
+int __db_salvage_isdone __P((VRFY_DBINFO *, db_pgno_t));
+int __db_salvage_markdone __P((VRFY_DBINFO *, db_pgno_t));
+int __db_salvage_markneeded __P((VRFY_DBINFO *, db_pgno_t, u_int32_t));
+int __db_vrfy_prdbt __P((DBT *, int, const char *, void *, int (*)(void *, const void *), int, int, VRFY_DBINFO *));
+int __partition_init __P((DB *, u_int32_t));
+int __partition_set __P((DB *, u_int32_t, DBT *, u_int32_t (*callback)(DB *, DBT *key)));
+int __partition_set_dirs __P((DB *, const char **));
+int __partition_open __P((DB *, DB_THREAD_INFO *, DB_TXN *, const char *, DBTYPE, u_int32_t, int, int));
+int __partition_get_callback __P((DB *, u_int32_t *, u_int32_t (**callback)(DB *, DBT *key)));
+int __partition_get_keys __P((DB *, u_int32_t *, DBT **));
+int __partition_get_dirs __P((DB *, const char ***));
+int __partc_init __P((DBC *));
+int __partc_get __P((DBC*, DBT *, DBT *, u_int32_t));
+int __partition_close __P((DB *, DB_TXN *, u_int32_t));
+int __partition_sync __P((DB *));
+int __partition_stat __P((DBC *, void *, u_int32_t));
+int __part_truncate __P((DBC *, u_int32_t *));
+int __part_compact __P((DB *, DB_THREAD_INFO *, DB_TXN *, DBT *, DBT *, DB_COMPACT *, u_int32_t, DBT *));
+int __part_lsn_reset __P((DB *, DB_THREAD_INFO *));
+int __part_fileid_reset __P((ENV *, DB_THREAD_INFO *, const char *, u_int32_t, int));
+int __part_key_range __P((DBC *, DBT *, DB_KEY_RANGE *, u_int32_t));
+int __part_remove __P((DB *, DB_THREAD_INFO *, DB_TXN *, const char *, const char *, u_int32_t));
+int __part_rename __P((DB *, DB_THREAD_INFO *, DB_TXN *, const char *, const char *, const char *));
+int __part_verify __P((DB *, VRFY_DBINFO *, const char *, void *, int (*)(void *, const void *), u_int32_t));
+int __part_testdocopy __P((DB *, const char *));
+int __db_no_partition __P((ENV *));
+int __partition_set __P((DB *, u_int32_t, DBT *, u_int32_t (*callback)(DB *, DBT *key)));
+int __partition_get_callback __P((DB *, u_int32_t *, u_int32_t (**callback)(DB *, DBT *key)));
+int __partition_get_dirs __P((DB *, const char ***));
+int __partition_get_keys __P((DB *, u_int32_t *, DBT **));
+int __partition_init __P((DB *, u_int32_t));
+int __part_fileid_reset __P((ENV *, DB_THREAD_INFO *, const char *, u_int32_t, int));
+int __partition_set_dirs __P((DB *, const char **));
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_db_ext_h_ */
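All of these prototypes are wrapped in __P so a single header serves both ANSI and pre-ANSI compilers. A sketch of the conventional definition (the real one lives in the generated db.h):

    #if defined(__STDC__) || defined(__cplusplus)
    #define __P(protos)     protos          /* ANSI C: keep the prototype. */
    #else
    #define __P(protos)     ()              /* K&R C: drop the arguments. */
    #endif

So, under ANSI C, int __db_sync __P((DB *)); expands to int __db_sync(DB *);, and under K&R C to int __db_sync();.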
diff --git a/src/dbinc_auto/dbreg_auto.h b/src/dbinc_auto/dbreg_auto.h
new file mode 100644
index 00000000..63ad0cd3
--- /dev/null
+++ b/src/dbinc_auto/dbreg_auto.h
@@ -0,0 +1,43 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#ifndef __dbreg_AUTO_H
+#define __dbreg_AUTO_H
+#include "dbinc/log.h"
+#define DB___dbreg_register 2
+typedef struct ___dbreg_register_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ u_int32_t opcode;
+ DBT name;
+ DBT uid;
+ int32_t fileid;
+ DBTYPE ftype;
+ db_pgno_t meta_pgno;
+ u_int32_t id;
+} __dbreg_register_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __dbreg_register_desc[];
+static inline int
+__dbreg_register_log(ENV *env, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags,
+ u_int32_t opcode, const DBT *name, const DBT *uid, int32_t fileid, DBTYPE ftype,
+ db_pgno_t meta_pgno, u_int32_t id)
+{
+ return (__log_put_record(env, NULL, txnp, ret_lsnp,
+ flags, DB___dbreg_register, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + LOG_DBT_SIZE(name) + LOG_DBT_SIZE(uid) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ sizeof(u_int32_t),
+ __dbreg_register_desc,
+ opcode, name, uid, fileid, ftype, meta_pgno, id));
+}
+
+static inline int __dbreg_register_read(ENV *env,
+ void *data, __dbreg_register_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __dbreg_register_desc, sizeof(__dbreg_register_args), (void**)arg));
+}
+#endif
diff --git a/src/dbinc_auto/dbreg_ext.h b/src/dbinc_auto/dbreg_ext.h
new file mode 100644
index 00000000..0f495c33
--- /dev/null
+++ b/src/dbinc_auto/dbreg_ext.h
@@ -0,0 +1,46 @@
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _dbreg_ext_h_
+#define _dbreg_ext_h_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+int __dbreg_setup __P((DB *, const char *, const char *, u_int32_t));
+int __dbreg_teardown __P((DB *));
+int __dbreg_teardown_int __P((ENV *, FNAME *));
+int __dbreg_new_id __P((DB *, DB_TXN *));
+int __dbreg_get_id __P((DB *, DB_TXN *, int32_t *));
+int __dbreg_assign_id __P((DB *, int32_t, int));
+int __dbreg_revoke_id __P((DB *, int, int32_t));
+int __dbreg_revoke_id_int __P((ENV *, FNAME *, int, int, int32_t));
+int __dbreg_close_id __P((DB *, DB_TXN *, u_int32_t));
+int __dbreg_close_id_int __P((ENV *, FNAME *, u_int32_t, int));
+int __dbreg_failchk __P((ENV *));
+int __dbreg_log_close __P((ENV *, FNAME *, DB_TXN *, u_int32_t));
+int __dbreg_log_id __P((DB *, DB_TXN *, int32_t, int));
+int __dbreg_init_recover __P((ENV *, DB_DISTAB *));
+int __dbreg_register_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __dbreg_init_print __P((ENV *, DB_DISTAB *));
+int __dbreg_register_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __dbreg_stat_print __P((ENV *, u_int32_t));
+void __dbreg_print_fname __P((ENV *, FNAME *));
+int __dbreg_add_dbentry __P((ENV *, DB_LOG *, DB *, int32_t));
+int __dbreg_rem_dbentry __P((DB_LOG *, int32_t));
+int __dbreg_log_files __P((ENV *, u_int32_t));
+int __dbreg_log_nofiles __P((ENV *));
+int __dbreg_close_files __P((ENV *, int));
+int __dbreg_close_file __P((ENV *, FNAME *));
+int __dbreg_mark_restored __P((ENV *));
+int __dbreg_invalidate_files __P((ENV *, int));
+int __dbreg_id_to_db __P((ENV *, DB_TXN *, DB **, int32_t, int));
+int __dbreg_id_to_fname __P((DB_LOG *, int32_t, int, FNAME **));
+int __dbreg_fid_to_fname __P((DB_LOG *, u_int8_t *, int, FNAME **));
+int __dbreg_get_name __P((ENV *, u_int8_t *, char **, char **));
+int __dbreg_do_open __P((ENV *, DB_TXN *, DB_LOG *, u_int8_t *, char *, DBTYPE, int32_t, db_pgno_t, void *, u_int32_t, u_int32_t));
+int __dbreg_lazy_id __P((DB *));
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_dbreg_ext_h_ */
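The dbreg (database registry) layer above is what gives meaning to the int32_t fileid carried by the log records in this patch: a __dbreg_register record binds the id to a physical file, and recovery resolves it back to an open handle. A sketch of that resolution, where env and argp come from a hypothetical surrounding recovery function and the exact meaning of the final flag is an assumption here:

    DB *file_dbp;
    int ret;

    /* Map the log record's file id back to an open DB handle. */
    if ((ret = __dbreg_id_to_db(env,
        argp->txnp, &file_dbp, argp->fileid, 0)) != 0)
            return (ret);   /* The id was never registered. */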
diff --git a/src/dbinc_auto/env_ext.h b/src/dbinc_auto/env_ext.h
new file mode 100644
index 00000000..55dbcba4
--- /dev/null
+++ b/src/dbinc_auto/env_ext.h
@@ -0,0 +1,158 @@
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _env_ext_h_
+#define _env_ext_h_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+void __env_alloc_init __P((REGINFO *, size_t));
+size_t __env_alloc_overhead __P((void));
+size_t __env_alloc_size __P((size_t));
+int __env_alloc __P((REGINFO *, size_t, void *));
+void __env_alloc_free __P((REGINFO *, void *));
+int __env_alloc_extend __P((REGINFO *, void *, size_t *));
+int __env_region_extend __P((ENV *, REGINFO *));
+uintmax_t __env_elem_size __P((ENV *, void *));
+void * __env_get_chunk __P((REGINFO *, void **, uintmax_t *));
+void __env_alloc_print __P((REGINFO *, u_int32_t));
+int __env_get_backup_config __P((DB_ENV *, DB_BACKUP_CONFIG, u_int32_t*));
+int __env_set_backup_config __P((DB_ENV *, DB_BACKUP_CONFIG, u_int32_t));
+int __env_get_backup_callbacks __P((DB_ENV *, int (**)(DB_ENV *, const char *, const char *, void **), int (**)(DB_ENV *, u_int32_t, u_int32_t, u_int32_t, u_int8_t *, void *), int (**)(DB_ENV *, const char *, void *)));
+int __env_set_backup_callbacks __P((DB_ENV *, int (*)(DB_ENV *, const char *, const char *, void **), int (*)(DB_ENV *, u_int32_t, u_int32_t, u_int32_t, u_int8_t *, void *), int (*)(DB_ENV *, const char *, void *)));
+int __env_read_db_config __P((ENV *));
+int __env_failchk_pp __P((DB_ENV *, u_int32_t));
+int __env_failchk_int __P((DB_ENV *));
+size_t __env_thread_size __P((ENV *, size_t));
+size_t __env_thread_max __P((ENV *));
+int __env_thread_init __P((ENV *, int));
+void __env_thread_destroy __P((ENV *));
+int __env_set_state __P((ENV *, DB_THREAD_INFO **, DB_THREAD_STATE));
+char *__env_thread_id_string __P((DB_ENV *, pid_t, db_threadid_t, char *));
+int __db_file_extend __P((ENV *, DB_FH *, size_t));
+int __db_file_multi_write __P((ENV *, const char *));
+int __db_file_write __P((ENV *, DB_FH *, u_int32_t, u_int32_t, int));
+void __db_env_destroy __P((DB_ENV *));
+int __env_get_alloc __P((DB_ENV *, void *(**)(size_t), void *(**)(void *, size_t), void (**)(void *)));
+int __env_set_alloc __P((DB_ENV *, void *(*)(size_t), void *(*)(void *, size_t), void (*)(void *)));
+int __env_get_memory_init __P((DB_ENV *, DB_MEM_CONFIG, u_int32_t *));
+int __env_set_memory_init __P((DB_ENV *, DB_MEM_CONFIG, u_int32_t));
+int __env_get_memory_max __P((DB_ENV *, u_int32_t *, u_int32_t *));
+int __env_set_memory_max __P((DB_ENV *, u_int32_t, u_int32_t));
+int __env_get_encrypt_flags __P((DB_ENV *, u_int32_t *));
+int __env_set_encrypt __P((DB_ENV *, const char *, u_int32_t));
+void __env_map_flags __P((const FLAG_MAP *, u_int, u_int32_t *, u_int32_t *));
+void __env_fetch_flags __P((const FLAG_MAP *, u_int, u_int32_t *, u_int32_t *));
+int __env_set_flags __P((DB_ENV *, u_int32_t, int));
+int __env_set_backup __P((ENV *, int));
+int __env_set_data_dir __P((DB_ENV *, const char *));
+int __env_add_data_dir __P((DB_ENV *, const char *));
+int __env_set_create_dir __P((DB_ENV *, const char *));
+int __env_set_metadata_dir __P((DB_ENV *, const char *));
+int __env_set_data_len __P((DB_ENV *, u_int32_t));
+int __env_set_intermediate_dir_mode __P((DB_ENV *, const char *));
+void __env_get_errcall __P((DB_ENV *, void (**)(const DB_ENV *, const char *, const char *)));
+void __env_set_errcall __P((DB_ENV *, void (*)(const DB_ENV *, const char *, const char *)));
+void __env_get_errfile __P((DB_ENV *, FILE **));
+void __env_set_errfile __P((DB_ENV *, FILE *));
+void __env_get_errpfx __P((DB_ENV *, const char **));
+void __env_set_errpfx __P((DB_ENV *, const char *));
+int __env_set_thread_count __P((DB_ENV *, u_int32_t));
+void __env_get_msgcall __P((DB_ENV *, void (**)(const DB_ENV *, const char *)));
+void __env_set_msgcall __P((DB_ENV *, void (*)(const DB_ENV *, const char *)));
+void __env_get_msgfile __P((DB_ENV *, FILE **));
+void __env_set_msgfile __P((DB_ENV *, FILE *));
+int __env_set_paniccall __P((DB_ENV *, void (*)(DB_ENV *, int)));
+int __env_set_shm_key __P((DB_ENV *, long));
+int __env_set_tmp_dir __P((DB_ENV *, const char *));
+int __env_set_verbose __P((DB_ENV *, u_int32_t, int));
+int __db_mi_env __P((ENV *, const char *));
+int __db_mi_open __P((ENV *, const char *, int));
+int __env_not_config __P((ENV *, char *, u_int32_t));
+int __env_set_timeout __P((DB_ENV *, db_timeout_t, u_int32_t));
+int __db_appname __P((ENV *, APPNAME, const char *, const char **, char **));
+int __db_tmp_open __P((ENV *, u_int32_t, DB_FH **));
+int __env_open_pp __P((DB_ENV *, const char *, u_int32_t, int));
+int __env_open __P((DB_ENV *, const char *, u_int32_t, int));
+int __env_remove __P((DB_ENV *, const char *, u_int32_t));
+int __env_config __P((DB_ENV *, const char *, u_int32_t *, int));
+int __env_close_pp __P((DB_ENV *, u_int32_t));
+int __env_close __P((DB_ENV *, u_int32_t));
+int __env_refresh __P((DB_ENV *, u_int32_t, int));
+int __env_get_open_flags __P((DB_ENV *, u_int32_t *));
+int __env_attach_regions __P((DB_ENV *, u_int32_t, u_int32_t, int));
+int __db_apprec __P((ENV *, DB_THREAD_INFO *, DB_LSN *, DB_LSN *, int, u_int32_t));
+int __env_openfiles __P((ENV *, DB_LOGC *, void *, DBT *, DB_LSN *, DB_LSN *, double, int));
+int __env_init_rec __P((ENV *, u_int32_t));
+int __env_attach __P((ENV *, u_int32_t *, int, int));
+int __env_turn_on __P((ENV *));
+int __env_turn_off __P((ENV *, u_int32_t));
+void __env_panic_set __P((ENV *, int));
+int __env_ref_increment __P((ENV *));
+int __env_ref_decrement __P((ENV *));
+int __env_ref_get __P((DB_ENV *, u_int32_t *));
+int __env_detach __P((ENV *, int));
+int __env_remove_env __P((ENV *));
+int __env_region_attach __P((ENV *, REGINFO *, size_t, size_t));
+int __env_region_share __P((ENV *, REGINFO *));
+int __env_region_detach __P((ENV *, REGINFO *, int));
+int __envreg_register __P((ENV *, int *, u_int32_t));
+int __envreg_unregister __P((ENV *, int));
+int __envreg_xunlock __P((ENV *));
+int __envreg_isalive __P((DB_ENV *, pid_t, db_threadid_t, u_int32_t));
+u_int32_t __env_struct_sig __P((void));
+int __env_stat_print_pp __P((DB_ENV *, u_int32_t));
+void __db_print_fh __P((ENV *, const char *, DB_FH *, u_int32_t));
+void __db_print_fileid __P((ENV *, u_int8_t *, const char *));
+void __db_dl __P((ENV *, const char *, u_long));
+void __db_dl_pct __P((ENV *, const char *, u_long, int, const char *));
+void __db_dlbytes __P((ENV *, const char *, u_long, u_long, u_long));
+void __db_print_reginfo __P((ENV *, REGINFO *, const char *, u_int32_t));
+int __db_stat_not_built __P((ENV *));
+#ifndef HAVE_REPLICATION_THREADS
+int __repmgr_close __P((ENV *));
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+int __repmgr_get_ack_policy __P((DB_ENV *, int *));
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+int __repmgr_set_ack_policy __P((DB_ENV *, int));
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+int __repmgr_site __P((DB_ENV *, const char *, u_int, DB_SITE **, u_int32_t));
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+int __repmgr_site_by_eid __P((DB_ENV *, int, DB_SITE **));
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+int __repmgr_local_site __P((DB_ENV *, DB_SITE **));
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+int __repmgr_site_list __P((DB_ENV *, u_int *, DB_REPMGR_SITE **));
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+int __repmgr_start __P((DB_ENV *, int, u_int32_t));
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+int __repmgr_stat_pp __P((DB_ENV *, DB_REPMGR_STAT **, u_int32_t));
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+int __repmgr_stat_print_pp __P((DB_ENV *, u_int32_t));
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+int __repmgr_handle_event __P((ENV *, u_int32_t, void *));
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+int __repmgr_channel __P((DB_ENV *, int, DB_CHANNEL **, u_int32_t));
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+int __repmgr_set_msg_dispatch __P((DB_ENV *, void (*)(DB_ENV *, DB_CHANNEL *, DBT *, u_int32_t, u_int32_t), u_int32_t));
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+int __repmgr_init_recover __P((ENV *, DB_DISTAB *));
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_env_ext_h_ */
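
[Editor's sketch] The block of __repmgr_* prototypes above is declared only when
HAVE_REPLICATION_THREADS is not defined: in that configuration the replication
manager entry points still have to exist, so the build links stub
implementations that fail gracefully. A minimal sketch of what such a stub
could look like -- the helper name, message text, and file placement are
assumptions here, not taken from this patch:

	/*
	 * Hypothetical stub compiled when the replication manager is
	 * not built in; the real stubs live elsewhere in the tree.
	 */
	static int
	__repmgr_stub_notsup(ENV *env)
	{
		__db_errx(env,
	    "library build did not include replication manager support");
		return (DB_OPNOTSUP);
	}
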
diff --git a/src/dbinc_auto/ext_185_def.in b/src/dbinc_auto/ext_185_def.in
new file mode 100644
index 00000000..8da68a8d
--- /dev/null
+++ b/src/dbinc_auto/ext_185_def.in
@@ -0,0 +1,12 @@
+
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _DB_EXT_185_DEF_IN_
+#define _DB_EXT_185_DEF_IN_
+
+#ifdef _DB185_INT_H_
+#define __db185_open __db185_open@DB_VERSION_UNIQUE_NAME@
+#else
+#define __db185_open __db185_open@DB_VERSION_UNIQUE_NAME@
+#endif
+
+#endif /* !_DB_EXT_185_DEF_IN_ */
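
[Editor's sketch] The .in suffix marks this as a configure template:
@DB_VERSION_UNIQUE_NAME@ is substituted at build time -- normally with nothing,
or with a version suffix when the tree is configured with --with-uniquename,
which lets two library versions coexist in one address space. Both arms of the
#ifdef here produce the same definition; the conditional only mirrors the
structure of the matching prototype file below, where the two arms do differ.
Illustrative only, with an assumed suffix:

	/*
	 * If configure substitutes, say, "_5003" (the exact value is an
	 * assumption) for @DB_VERSION_UNIQUE_NAME@, the definition above
	 * becomes:
	 */
	#define __db185_open __db185_open_5003
	/* ...so callers of __db185_open() bind to the suffixed symbol. */
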
diff --git a/src/dbinc_auto/ext_185_prot.in b/src/dbinc_auto/ext_185_prot.in
new file mode 100644
index 00000000..dfd8d3d4
--- /dev/null
+++ b/src/dbinc_auto/ext_185_prot.in
@@ -0,0 +1,19 @@
+
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _DB_EXT_185_PROT_IN_
+#define _DB_EXT_185_PROT_IN_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#ifdef _DB185_INT_H_
+DB185 *__db185_open __P((const char *, int, int, DBTYPE, const void *));
+#else
+DB *__db185_open __P((const char *, int, int, DBTYPE, const void *));
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_DB_EXT_185_PROT_IN_ */
diff --git a/src/dbinc_auto/ext_def.in b/src/dbinc_auto/ext_def.in
new file mode 100644
index 00000000..1a56f192
--- /dev/null
+++ b/src/dbinc_auto/ext_def.in
@@ -0,0 +1,66 @@
+
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _DB_EXT_DEF_IN_
+#define _DB_EXT_DEF_IN_
+
+#define db_copy db_copy@DB_VERSION_UNIQUE_NAME@
+#define db_create db_create@DB_VERSION_UNIQUE_NAME@
+#define db_strerror db_strerror@DB_VERSION_UNIQUE_NAME@
+#define db_env_set_func_assert db_env_set_func_assert@DB_VERSION_UNIQUE_NAME@
+#define db_env_set_func_close db_env_set_func_close@DB_VERSION_UNIQUE_NAME@
+#define db_env_set_func_dirfree db_env_set_func_dirfree@DB_VERSION_UNIQUE_NAME@
+#define db_env_set_func_dirlist db_env_set_func_dirlist@DB_VERSION_UNIQUE_NAME@
+#define db_env_set_func_exists db_env_set_func_exists@DB_VERSION_UNIQUE_NAME@
+#define db_env_set_func_free db_env_set_func_free@DB_VERSION_UNIQUE_NAME@
+#define db_env_set_func_fsync db_env_set_func_fsync@DB_VERSION_UNIQUE_NAME@
+#define db_env_set_func_ftruncate db_env_set_func_ftruncate@DB_VERSION_UNIQUE_NAME@
+#define db_env_set_func_ioinfo db_env_set_func_ioinfo@DB_VERSION_UNIQUE_NAME@
+#define db_env_set_func_malloc db_env_set_func_malloc@DB_VERSION_UNIQUE_NAME@
+#define db_env_set_func_file_map db_env_set_func_file_map@DB_VERSION_UNIQUE_NAME@
+#define db_env_set_func_region_map db_env_set_func_region_map@DB_VERSION_UNIQUE_NAME@
+#define db_env_set_func_pread db_env_set_func_pread@DB_VERSION_UNIQUE_NAME@
+#define db_env_set_func_pwrite db_env_set_func_pwrite@DB_VERSION_UNIQUE_NAME@
+#define db_env_set_func_open db_env_set_func_open@DB_VERSION_UNIQUE_NAME@
+#define db_env_set_func_read db_env_set_func_read@DB_VERSION_UNIQUE_NAME@
+#define db_env_set_func_realloc db_env_set_func_realloc@DB_VERSION_UNIQUE_NAME@
+#define db_env_set_func_rename db_env_set_func_rename@DB_VERSION_UNIQUE_NAME@
+#define db_env_set_func_seek db_env_set_func_seek@DB_VERSION_UNIQUE_NAME@
+#define db_env_set_func_unlink db_env_set_func_unlink@DB_VERSION_UNIQUE_NAME@
+#define db_env_set_func_write db_env_set_func_write@DB_VERSION_UNIQUE_NAME@
+#define db_env_set_func_yield db_env_set_func_yield@DB_VERSION_UNIQUE_NAME@
+#define db_env_create db_env_create@DB_VERSION_UNIQUE_NAME@
+#define db_version db_version@DB_VERSION_UNIQUE_NAME@
+#define db_full_version db_full_version@DB_VERSION_UNIQUE_NAME@
+#define log_compare log_compare@DB_VERSION_UNIQUE_NAME@
+#if defined(DB_WIN32) && !defined(DB_WINCE)
+#define db_env_set_win_security db_env_set_win_security@DB_VERSION_UNIQUE_NAME@
+#endif
+#define db_sequence_create db_sequence_create@DB_VERSION_UNIQUE_NAME@
+#if DB_DBM_HSEARCH != 0
+#define __db_ndbm_clearerr __db_ndbm_clearerr@DB_VERSION_UNIQUE_NAME@
+#define __db_ndbm_close __db_ndbm_close@DB_VERSION_UNIQUE_NAME@
+#define __db_ndbm_delete __db_ndbm_delete@DB_VERSION_UNIQUE_NAME@
+#define __db_ndbm_dirfno __db_ndbm_dirfno@DB_VERSION_UNIQUE_NAME@
+#define __db_ndbm_error __db_ndbm_error@DB_VERSION_UNIQUE_NAME@
+#define __db_ndbm_fetch __db_ndbm_fetch@DB_VERSION_UNIQUE_NAME@
+#define __db_ndbm_firstkey __db_ndbm_firstkey@DB_VERSION_UNIQUE_NAME@
+#define __db_ndbm_nextkey __db_ndbm_nextkey@DB_VERSION_UNIQUE_NAME@
+#define __db_ndbm_open __db_ndbm_open@DB_VERSION_UNIQUE_NAME@
+#define __db_ndbm_pagfno __db_ndbm_pagfno@DB_VERSION_UNIQUE_NAME@
+#define __db_ndbm_rdonly __db_ndbm_rdonly@DB_VERSION_UNIQUE_NAME@
+#define __db_ndbm_store __db_ndbm_store@DB_VERSION_UNIQUE_NAME@
+#define __db_dbm_close __db_dbm_close@DB_VERSION_UNIQUE_NAME@
+#define __db_dbm_delete __db_dbm_delete@DB_VERSION_UNIQUE_NAME@
+#define __db_dbm_fetch __db_dbm_fetch@DB_VERSION_UNIQUE_NAME@
+#define __db_dbm_firstkey __db_dbm_firstkey@DB_VERSION_UNIQUE_NAME@
+#define __db_dbm_init __db_dbm_init@DB_VERSION_UNIQUE_NAME@
+#define __db_dbm_nextkey __db_dbm_nextkey@DB_VERSION_UNIQUE_NAME@
+#define __db_dbm_store __db_dbm_store@DB_VERSION_UNIQUE_NAME@
+#endif
+#if DB_DBM_HSEARCH != 0
+#define __db_hcreate __db_hcreate@DB_VERSION_UNIQUE_NAME@
+#define __db_hsearch __db_hsearch@DB_VERSION_UNIQUE_NAME@
+#define __db_hdestroy __db_hdestroy@DB_VERSION_UNIQUE_NAME@
+#endif
+
+#endif /* !_DB_EXT_DEF_IN_ */
diff --git a/src/dbinc_auto/ext_prot.in b/src/dbinc_auto/ext_prot.in
new file mode 100644
index 00000000..371e5a3e
--- /dev/null
+++ b/src/dbinc_auto/ext_prot.in
@@ -0,0 +1,73 @@
+
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _DB_EXT_PROT_IN_
+#define _DB_EXT_PROT_IN_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+int db_copy __P((DB_ENV *, const char *, const char *, const char *));
+int db_create __P((DB **, DB_ENV *, u_int32_t));
+char *db_strerror __P((int));
+int db_env_set_func_assert __P((void (*)(const char *, const char *, int)));
+int db_env_set_func_close __P((int (*)(int)));
+int db_env_set_func_dirfree __P((void (*)(char **, int)));
+int db_env_set_func_dirlist __P((int (*)(const char *, char ***, int *)));
+int db_env_set_func_exists __P((int (*)(const char *, int *)));
+int db_env_set_func_free __P((void (*)(void *)));
+int db_env_set_func_fsync __P((int (*)(int)));
+int db_env_set_func_ftruncate __P((int (*)(int, off_t)));
+int db_env_set_func_ioinfo __P((int (*)(const char *, int, u_int32_t *, u_int32_t *, u_int32_t *)));
+int db_env_set_func_malloc __P((void *(*)(size_t)));
+int db_env_set_func_file_map __P((int (*)(DB_ENV *, char *, size_t, int, void **), int (*)(DB_ENV *, void *)));
+int db_env_set_func_region_map __P((int (*)(DB_ENV *, char *, size_t, int *, void **), int (*)(DB_ENV *, void *)));
+int db_env_set_func_pread __P((ssize_t (*)(int, void *, size_t, off_t)));
+int db_env_set_func_pwrite __P((ssize_t (*)(int, const void *, size_t, off_t)));
+int db_env_set_func_open __P((int (*)(const char *, int, ...)));
+int db_env_set_func_read __P((ssize_t (*)(int, void *, size_t)));
+int db_env_set_func_realloc __P((void *(*)(void *, size_t)));
+int db_env_set_func_rename __P((int (*)(const char *, const char *)));
+int db_env_set_func_seek __P((int (*)(int, off_t, int)));
+int db_env_set_func_unlink __P((int (*)(const char *)));
+int db_env_set_func_write __P((ssize_t (*)(int, const void *, size_t)));
+int db_env_set_func_yield __P((int (*)(u_long, u_long)));
+int db_env_create __P((DB_ENV **, u_int32_t));
+char *db_version __P((int *, int *, int *));
+char *db_full_version __P((int *, int *, int *, int *, int *));
+int log_compare __P((const DB_LSN *, const DB_LSN *));
+#if defined(DB_WIN32) && !defined(DB_WINCE)
+int db_env_set_win_security __P((SECURITY_ATTRIBUTES *sa));
+#endif
+int db_sequence_create __P((DB_SEQUENCE **, DB *, u_int32_t));
+#if DB_DBM_HSEARCH != 0
+int __db_ndbm_clearerr __P((DBM *));
+void __db_ndbm_close __P((DBM *));
+int __db_ndbm_delete __P((DBM *, datum));
+int __db_ndbm_dirfno __P((DBM *));
+int __db_ndbm_error __P((DBM *));
+datum __db_ndbm_fetch __P((DBM *, datum));
+datum __db_ndbm_firstkey __P((DBM *));
+datum __db_ndbm_nextkey __P((DBM *));
+DBM *__db_ndbm_open __P((const char *, int, int));
+int __db_ndbm_pagfno __P((DBM *));
+int __db_ndbm_rdonly __P((DBM *));
+int __db_ndbm_store __P((DBM *, datum, datum, int));
+int __db_dbm_close __P((void));
+int __db_dbm_delete __P((datum));
+datum __db_dbm_fetch __P((datum));
+datum __db_dbm_firstkey __P((void));
+int __db_dbm_init __P((char *));
+datum __db_dbm_nextkey __P((datum));
+int __db_dbm_store __P((datum, datum));
+#endif
+#if DB_DBM_HSEARCH != 0
+int __db_hcreate __P((size_t));
+ENTRY *__db_hsearch __P((ENTRY, ACTION));
+void __db_hdestroy __P((void));
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_DB_EXT_PROT_IN_ */
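
[Editor's sketch] The db_env_set_func_* prototypes above form the library's
process-wide OS-call override table; each takes a function pointer matching the
libc call it replaces and is meant to be set before any environment is created.
A minimal sketch using only the prototypes above (my_malloc/my_free and the
wrapper name are illustrative):

	#include <stdlib.h>
	#include "db.h"

	/* Forward all of the library's allocations to a custom allocator. */
	static void *my_malloc(size_t len) { return (malloc(len)); }
	static void my_free(void *p) { free(p); }

	int
	install_custom_allocator(void)
	{
		int ret;

		if ((ret = db_env_set_func_malloc(my_malloc)) != 0)
			return (ret);
		return (db_env_set_func_free(my_free));
	}
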
diff --git a/src/dbinc_auto/fileops_auto.h b/src/dbinc_auto/fileops_auto.h
new file mode 100644
index 00000000..59385c88
--- /dev/null
+++ b/src/dbinc_auto/fileops_auto.h
@@ -0,0 +1,262 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#ifndef __fop_AUTO_H
+#define __fop_AUTO_H
+#include "dbinc/log.h"
+#define DB___fop_create_42 143
+typedef struct ___fop_create_42_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ DBT name;
+ u_int32_t appname;
+ u_int32_t mode;
+} __fop_create_42_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __fop_create_42_desc[];
+static inline int __fop_create_42_read(ENV *env,
+ void *data, __fop_create_42_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __fop_create_42_desc, sizeof(__fop_create_42_args), (void**)arg));
+}
+#define DB___fop_create 143
+typedef struct ___fop_create_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ DBT name;
+ DBT dirname;
+ u_int32_t appname;
+ u_int32_t mode;
+} __fop_create_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __fop_create_desc[];
+static inline int
+__fop_create_log(ENV *env, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags,
+ const DBT *name, const DBT *dirname, u_int32_t appname, u_int32_t mode)
+{
+ return (__log_put_record(env, NULL, txnp, ret_lsnp,
+ flags, DB___fop_create, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ LOG_DBT_SIZE(name) + LOG_DBT_SIZE(dirname) + sizeof(u_int32_t) +
+ sizeof(u_int32_t),
+ __fop_create_desc,
+ name, dirname, appname, mode));
+}
+
+static inline int __fop_create_read(ENV *env,
+ void *data, __fop_create_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __fop_create_desc, sizeof(__fop_create_args), (void**)arg));
+}
+#define DB___fop_remove 144
+typedef struct ___fop_remove_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ DBT name;
+ DBT fid;
+ u_int32_t appname;
+} __fop_remove_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __fop_remove_desc[];
+static inline int
+__fop_remove_log(ENV *env, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags,
+ const DBT *name, const DBT *fid, u_int32_t appname)
+{
+ return (__log_put_record(env, NULL, txnp, ret_lsnp,
+ flags, DB___fop_remove, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ LOG_DBT_SIZE(name) + LOG_DBT_SIZE(fid) + sizeof(u_int32_t),
+ __fop_remove_desc,
+ name, fid, appname));
+}
+
+static inline int __fop_remove_read(ENV *env,
+ void *data, __fop_remove_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __fop_remove_desc, sizeof(__fop_remove_args), (void**)arg));
+}
+#define DB___fop_write_42 145
+typedef struct ___fop_write_42_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ DBT name;
+ u_int32_t appname;
+ u_int32_t pgsize;
+ db_pgno_t pageno;
+ u_int32_t offset;
+ DBT page;
+ u_int32_t flag;
+} __fop_write_42_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __fop_write_42_desc[];
+static inline int __fop_write_42_read(ENV *env,
+ void *data, __fop_write_42_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __fop_write_42_desc, sizeof(__fop_write_42_args), (void**)arg));
+}
+#define DB___fop_write 145
+typedef struct ___fop_write_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ DBT name;
+ DBT dirname;
+ u_int32_t appname;
+ u_int32_t pgsize;
+ db_pgno_t pageno;
+ u_int32_t offset;
+ DBT page;
+ u_int32_t flag;
+} __fop_write_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __fop_write_desc[];
+static inline int
+__fop_write_log(ENV *env, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags,
+ const DBT *name, const DBT *dirname, u_int32_t appname, u_int32_t pgsize, db_pgno_t pageno,
+ u_int32_t offset, const DBT *page, u_int32_t flag)
+{
+ return (__log_put_record(env, NULL, txnp, ret_lsnp,
+ flags, DB___fop_write, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ LOG_DBT_SIZE(name) + LOG_DBT_SIZE(dirname) + sizeof(u_int32_t) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ LOG_DBT_SIZE(page) + sizeof(u_int32_t),
+ __fop_write_desc,
+ name, dirname, appname, pgsize, pageno, offset, page, flag));
+}
+
+static inline int __fop_write_read(ENV *env,
+ void *data, __fop_write_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __fop_write_desc, sizeof(__fop_write_args), (void**)arg));
+}
+#define DB___fop_rename_42 146
+#define DB___fop_rename_noundo_46 150
+typedef struct ___fop_rename_42_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ DBT oldname;
+ DBT newname;
+ DBT fileid;
+ u_int32_t appname;
+} __fop_rename_42_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __fop_rename_42_desc[];
+static inline int __fop_rename_42_read(ENV *env,
+ void *data, __fop_rename_42_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __fop_rename_42_desc, sizeof(__fop_rename_42_args), (void**)arg));
+}
+extern __DB_IMPORT DB_LOG_RECSPEC __fop_rename_noundo_46_desc[];
+static inline int __fop_rename_noundo_46_read(ENV *env,
+ void *data, __fop_rename_42_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __fop_rename_noundo_46_desc, sizeof(__fop_rename_42_args), (void**)arg));
+}
+#define DB___fop_rename 146
+#define DB___fop_rename_noundo 150
+typedef struct ___fop_rename_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ DBT oldname;
+ DBT newname;
+ DBT dirname;
+ DBT fileid;
+ u_int32_t appname;
+} __fop_rename_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __fop_rename_desc[];
+static inline int
+__fop_rename_log(ENV *env, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags,
+ const DBT *oldname, const DBT *newname, const DBT *dirname, const DBT *fileid, u_int32_t appname)
+{
+ return (__log_put_record(env, NULL, txnp, ret_lsnp,
+ flags, DB___fop_rename, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ LOG_DBT_SIZE(oldname) + LOG_DBT_SIZE(newname) + LOG_DBT_SIZE(dirname) +
+ LOG_DBT_SIZE(fileid) + sizeof(u_int32_t),
+ __fop_rename_desc,
+ oldname, newname, dirname, fileid, appname));
+}
+
+static inline int __fop_rename_read(ENV *env,
+ void *data, __fop_rename_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __fop_rename_desc, sizeof(__fop_rename_args), (void**)arg));
+}
+extern __DB_IMPORT DB_LOG_RECSPEC __fop_rename_noundo_desc[];
+static inline int
+__fop_rename_noundo_log(ENV *env, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags,
+ const DBT *oldname, const DBT *newname, const DBT *dirname, const DBT *fileid, u_int32_t appname)
+{
+ return (__log_put_record(env, NULL, txnp, ret_lsnp,
+ flags, DB___fop_rename_noundo, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ LOG_DBT_SIZE(oldname) + LOG_DBT_SIZE(newname) + LOG_DBT_SIZE(dirname) +
+ LOG_DBT_SIZE(fileid) + sizeof(u_int32_t),
+ __fop_rename_noundo_desc,
+ oldname, newname, dirname, fileid, appname));
+}
+
+static inline int __fop_rename_noundo_read(ENV *env,
+ void *data, __fop_rename_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __fop_rename_noundo_desc, sizeof(__fop_rename_args), (void**)arg));
+}
+#define DB___fop_file_remove 141
+typedef struct ___fop_file_remove_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ DBT real_fid;
+ DBT tmp_fid;
+ DBT name;
+ u_int32_t appname;
+ u_int32_t child;
+} __fop_file_remove_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __fop_file_remove_desc[];
+static inline int
+__fop_file_remove_log(ENV *env, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags,
+ const DBT *real_fid, const DBT *tmp_fid, const DBT *name, u_int32_t appname, u_int32_t child)
+{
+ return (__log_put_record(env, NULL, txnp, ret_lsnp,
+ flags, DB___fop_file_remove, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ LOG_DBT_SIZE(real_fid) + LOG_DBT_SIZE(tmp_fid) + LOG_DBT_SIZE(name) +
+ sizeof(u_int32_t) + sizeof(u_int32_t),
+ __fop_file_remove_desc,
+ real_fid, tmp_fid, name, appname, child));
+}
+
+static inline int __fop_file_remove_read(ENV *env,
+ void *data, __fop_file_remove_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __fop_file_remove_desc, sizeof(__fop_file_remove_args), (void**)arg));
+}
+#endif
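
[Editor's sketch] Each generated __*_log above computes its record size inline:
fixed-width fields are counted as sizeof(u_int32_t) (or a DB_LSN), and every
variable-length DBT is counted with LOG_DBT_SIZE. That macro lives in
dbinc/log.h, not in this patch; paraphrased from memory, it amounts to a
4-byte length prefix plus the payload:

	/* Paraphrase of dbinc/log.h, not taken from this patch. */
	#define	LOG_DBT_SIZE(dbt)					\
	    (sizeof(u_int32_t) + ((dbt) == NULL ? 0 : (dbt)->size))
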
diff --git a/src/dbinc_auto/fileops_ext.h b/src/dbinc_auto/fileops_ext.h
new file mode 100644
index 00000000..0aa6c1e1
--- /dev/null
+++ b/src/dbinc_auto/fileops_ext.h
@@ -0,0 +1,44 @@
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _fileops_ext_h_
+#define _fileops_ext_h_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+int __fop_init_recover __P((ENV *, DB_DISTAB *));
+int __fop_create_42_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_create_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_remove_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_write_42_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_write_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_rename_42_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_rename_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_file_remove_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_init_print __P((ENV *, DB_DISTAB *));
+int __fop_create __P((ENV *, DB_TXN *, DB_FH **, const char *, const char **, APPNAME, int, u_int32_t));
+int __fop_remove __P((ENV *, DB_TXN *, u_int8_t *, const char *, const char **, APPNAME, u_int32_t));
+int __fop_write __P((ENV *, DB_TXN *, const char *, const char *, APPNAME, DB_FH *, u_int32_t, db_pgno_t, u_int32_t, void *, u_int32_t, u_int32_t, u_int32_t));
+int __fop_rename __P((ENV *, DB_TXN *, const char *, const char *, const char **, u_int8_t *, APPNAME, int, u_int32_t));
+int __fop_create_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_create_42_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_remove_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_write_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_write_42_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_rename_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_rename_noundo_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_rename_42_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_rename_noundo_46_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_file_remove_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_lock_handle __P((ENV *, DB *, DB_LOCKER *, db_lockmode_t, DB_LOCK *, u_int32_t));
+int __fop_file_setup __P((DB *, DB_THREAD_INFO *ip, DB_TXN *, const char *, int, u_int32_t, u_int32_t *));
+int __fop_subdb_setup __P((DB *, DB_THREAD_INFO *, DB_TXN *, const char *, const char *, int, u_int32_t));
+int __fop_remove_setup __P((DB *, DB_TXN *, const char *, u_int32_t));
+int __fop_read_meta __P((ENV *, const char *, u_int8_t *, size_t, DB_FH *, int, size_t *));
+int __fop_dummy __P((DB *, DB_TXN *, const char *, const char *));
+int __fop_dbrename __P((DB *, const char *, const char *));
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_fileops_ext_h_ */
diff --git a/src/dbinc_auto/hash_auto.h b/src/dbinc_auto/hash_auto.h
new file mode 100644
index 00000000..c1dcae91
--- /dev/null
+++ b/src/dbinc_auto/hash_auto.h
@@ -0,0 +1,484 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#ifndef __ham_AUTO_H
+#define __ham_AUTO_H
+#ifdef HAVE_HASH
+#include "dbinc/log.h"
+#define DB___ham_insdel 21
+typedef struct ___ham_insdel_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ u_int32_t opcode;
+ int32_t fileid;
+ db_pgno_t pgno;
+ u_int32_t ndx;
+ DB_LSN pagelsn;
+ u_int32_t keytype;
+ DBT key;
+ u_int32_t datatype;
+ DBT data;
+} __ham_insdel_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __ham_insdel_desc[];
+static inline int
+__ham_insdel_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags,
+ u_int32_t opcode, db_pgno_t pgno, u_int32_t ndx, DB_LSN * pagelsn,
+ u_int32_t keytype, const DBT *key, u_int32_t datatype, const DBT *data)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___ham_insdel, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ sizeof(u_int32_t) + sizeof(*pagelsn) + sizeof(u_int32_t) +
+ LOG_DBT_SIZE(key) + sizeof(u_int32_t) + LOG_DBT_SIZE(data),
+ __ham_insdel_desc,
+ opcode, pgno, ndx, pagelsn, keytype, key, datatype,
+ data));
+}
+
+static inline int __ham_insdel_read(ENV *env,
+ DB **dbpp, void *td, void *data, __ham_insdel_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __ham_insdel_desc, sizeof(__ham_insdel_args), (void**)arg));
+}
+#define DB___ham_insdel_42 21
+typedef struct ___ham_insdel_42_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ u_int32_t opcode;
+ int32_t fileid;
+ db_pgno_t pgno;
+ u_int32_t ndx;
+ DB_LSN pagelsn;
+ DBT key;
+ DBT data;
+} __ham_insdel_42_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __ham_insdel_42_desc[];
+static inline int __ham_insdel_42_read(ENV *env,
+ DB **dbpp, void *td, void *data, __ham_insdel_42_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __ham_insdel_42_desc, sizeof(__ham_insdel_42_args), (void**)arg));
+}
+#define DB___ham_newpage 22
+typedef struct ___ham_newpage_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ u_int32_t opcode;
+ int32_t fileid;
+ db_pgno_t prev_pgno;
+ DB_LSN prevlsn;
+ db_pgno_t new_pgno;
+ DB_LSN pagelsn;
+ db_pgno_t next_pgno;
+ DB_LSN nextlsn;
+} __ham_newpage_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __ham_newpage_desc[];
+static inline int
+__ham_newpage_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags,
+ u_int32_t opcode, db_pgno_t prev_pgno, DB_LSN * prevlsn, db_pgno_t new_pgno,
+ DB_LSN * pagelsn, db_pgno_t next_pgno, DB_LSN * nextlsn)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___ham_newpage, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ sizeof(*prevlsn) + sizeof(u_int32_t) + sizeof(*pagelsn) +
+ sizeof(u_int32_t) + sizeof(*nextlsn),
+ __ham_newpage_desc,
+ opcode, prev_pgno, prevlsn, new_pgno, pagelsn, next_pgno, nextlsn));
+}
+
+static inline int __ham_newpage_read(ENV *env,
+ DB **dbpp, void *td, void *data, __ham_newpage_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __ham_newpage_desc, sizeof(__ham_newpage_args), (void**)arg));
+}
+#define DB___ham_splitdata 24
+typedef struct ___ham_splitdata_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ u_int32_t opcode;
+ db_pgno_t pgno;
+ DBT pageimage;
+ DB_LSN pagelsn;
+} __ham_splitdata_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __ham_splitdata_desc[];
+static inline int
+__ham_splitdata_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, u_int32_t opcode, db_pgno_t pgno, const DBT *pageimage, DB_LSN * pagelsn)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___ham_splitdata, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ LOG_DBT_SIZE(pageimage) + sizeof(*pagelsn),
+ __ham_splitdata_desc, opcode, pgno, pageimage, pagelsn));
+}
+
+static inline int __ham_splitdata_read(ENV *env,
+ DB **dbpp, void *td, void *data, __ham_splitdata_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __ham_splitdata_desc, sizeof(__ham_splitdata_args), (void**)arg));
+}
+#define DB___ham_replace 25
+typedef struct ___ham_replace_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ u_int32_t ndx;
+ DB_LSN pagelsn;
+ int32_t off;
+ u_int32_t oldtype;
+ DBT olditem;
+ u_int32_t newtype;
+ DBT newitem;
+} __ham_replace_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __ham_replace_desc[];
+static inline int
+__ham_replace_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_pgno_t pgno, u_int32_t ndx, DB_LSN * pagelsn, int32_t off,
+ u_int32_t oldtype, const DBT *olditem, u_int32_t newtype, const DBT *newitem)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___ham_replace, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ sizeof(*pagelsn) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ LOG_DBT_SIZE(olditem) + sizeof(u_int32_t) + LOG_DBT_SIZE(newitem),
+ __ham_replace_desc, pgno, ndx, pagelsn, off, oldtype, olditem, newtype,
+ newitem));
+}
+
+static inline int __ham_replace_read(ENV *env,
+ DB **dbpp, void *td, void *data, __ham_replace_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __ham_replace_desc, sizeof(__ham_replace_args), (void**)arg));
+}
+#define DB___ham_replace_42 25
+typedef struct ___ham_replace_42_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ u_int32_t ndx;
+ DB_LSN pagelsn;
+ int32_t off;
+ DBT olditem;
+ DBT newitem;
+ u_int32_t makedup;
+} __ham_replace_42_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __ham_replace_42_desc[];
+static inline int __ham_replace_42_read(ENV *env,
+ DB **dbpp, void *td, void *data, __ham_replace_42_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __ham_replace_42_desc, sizeof(__ham_replace_42_args), (void**)arg));
+}
+#define DB___ham_copypage 28
+typedef struct ___ham_copypage_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ DB_LSN pagelsn;
+ db_pgno_t next_pgno;
+ DB_LSN nextlsn;
+ db_pgno_t nnext_pgno;
+ DB_LSN nnextlsn;
+ DBT page;
+} __ham_copypage_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __ham_copypage_desc[];
+static inline int
+__ham_copypage_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_pgno_t pgno, DB_LSN * pagelsn, db_pgno_t next_pgno, DB_LSN * nextlsn,
+ db_pgno_t nnext_pgno, DB_LSN * nnextlsn, const DBT *page)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___ham_copypage, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(*pagelsn) +
+ sizeof(u_int32_t) + sizeof(*nextlsn) + sizeof(u_int32_t) +
+ sizeof(*nnextlsn) + LOG_DBT_SIZE(page),
+ __ham_copypage_desc, pgno, pagelsn, next_pgno, nextlsn, nnext_pgno, nnextlsn, page));
+}
+
+static inline int __ham_copypage_read(ENV *env,
+ DB **dbpp, void *td, void *data, __ham_copypage_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __ham_copypage_desc, sizeof(__ham_copypage_args), (void**)arg));
+}
+#define DB___ham_metagroup_42 29
+typedef struct ___ham_metagroup_42_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ u_int32_t bucket;
+ db_pgno_t mmpgno;
+ DB_LSN mmetalsn;
+ db_pgno_t mpgno;
+ DB_LSN metalsn;
+ db_pgno_t pgno;
+ DB_LSN pagelsn;
+ u_int32_t newalloc;
+} __ham_metagroup_42_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __ham_metagroup_42_desc[];
+static inline int __ham_metagroup_42_read(ENV *env,
+ DB **dbpp, void *td, void *data, __ham_metagroup_42_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __ham_metagroup_42_desc, sizeof(__ham_metagroup_42_args), (void**)arg));
+}
+#define DB___ham_metagroup 29
+typedef struct ___ham_metagroup_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ u_int32_t bucket;
+ db_pgno_t mmpgno;
+ DB_LSN mmetalsn;
+ db_pgno_t mpgno;
+ DB_LSN metalsn;
+ db_pgno_t pgno;
+ DB_LSN pagelsn;
+ u_int32_t newalloc;
+ db_pgno_t last_pgno;
+} __ham_metagroup_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __ham_metagroup_desc[];
+static inline int
+__ham_metagroup_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, u_int32_t bucket, db_pgno_t mmpgno, DB_LSN * mmetalsn, db_pgno_t mpgno,
+ DB_LSN * metalsn, db_pgno_t pgno, DB_LSN * pagelsn, u_int32_t newalloc, db_pgno_t last_pgno)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___ham_metagroup, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ sizeof(*mmetalsn) + sizeof(u_int32_t) + sizeof(*metalsn) +
+ sizeof(u_int32_t) + sizeof(*pagelsn) + sizeof(u_int32_t) +
+ sizeof(u_int32_t),
+ __ham_metagroup_desc, bucket, mmpgno, mmetalsn, mpgno, metalsn, pgno, pagelsn,
+ newalloc, last_pgno));
+}
+
+static inline int __ham_metagroup_read(ENV *env,
+ DB **dbpp, void *td, void *data, __ham_metagroup_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __ham_metagroup_desc, sizeof(__ham_metagroup_args), (void**)arg));
+}
+#define DB___ham_groupalloc_42 32
+typedef struct ___ham_groupalloc_42_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ DB_LSN meta_lsn;
+ db_pgno_t start_pgno;
+ u_int32_t num;
+ db_pgno_t free;
+} __ham_groupalloc_42_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __ham_groupalloc_42_desc[];
+static inline int __ham_groupalloc_42_read(ENV *env,
+ DB **dbpp, void *td, void *data, __ham_groupalloc_42_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __ham_groupalloc_42_desc, sizeof(__ham_groupalloc_42_args), (void**)arg));
+}
+#define DB___ham_groupalloc 32
+typedef struct ___ham_groupalloc_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ DB_LSN meta_lsn;
+ db_pgno_t start_pgno;
+ u_int32_t num;
+ db_pgno_t unused;
+ db_pgno_t last_pgno;
+} __ham_groupalloc_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __ham_groupalloc_desc[];
+static inline int
+__ham_groupalloc_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, DB_LSN * meta_lsn, db_pgno_t start_pgno, u_int32_t num, db_pgno_t unused,
+ db_pgno_t last_pgno)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___ham_groupalloc, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(*meta_lsn) + sizeof(u_int32_t) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t),
+ __ham_groupalloc_desc, meta_lsn, start_pgno, num, unused, last_pgno));
+}
+
+static inline int __ham_groupalloc_read(ENV *env,
+ DB **dbpp, void *td, void *data, __ham_groupalloc_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __ham_groupalloc_desc, sizeof(__ham_groupalloc_args), (void**)arg));
+}
+#define DB___ham_changeslot 35
+typedef struct ___ham_changeslot_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ DB_LSN meta_lsn;
+ u_int32_t slot;
+ db_pgno_t old;
+ db_pgno_t new;
+} __ham_changeslot_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __ham_changeslot_desc[];
+static inline int
+__ham_changeslot_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, DB_LSN * meta_lsn, u_int32_t slot, db_pgno_t old, db_pgno_t new)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___ham_changeslot, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(*meta_lsn) + sizeof(u_int32_t) +
+ sizeof(u_int32_t) + sizeof(u_int32_t),
+ __ham_changeslot_desc, meta_lsn, slot, old, new));
+}
+
+static inline int __ham_changeslot_read(ENV *env,
+ DB **dbpp, void *td, void *data, __ham_changeslot_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __ham_changeslot_desc, sizeof(__ham_changeslot_args), (void**)arg));
+}
+#define DB___ham_contract 37
+typedef struct ___ham_contract_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t meta;
+ DB_LSN meta_lsn;
+ u_int32_t bucket;
+ db_pgno_t pgno;
+} __ham_contract_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __ham_contract_desc[];
+static inline int
+__ham_contract_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_pgno_t meta, DB_LSN * meta_lsn, u_int32_t bucket, db_pgno_t pgno)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___ham_contract, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(*meta_lsn) +
+ sizeof(u_int32_t) + sizeof(u_int32_t),
+ __ham_contract_desc, meta, meta_lsn, bucket, pgno));
+}
+
+static inline int __ham_contract_read(ENV *env,
+ DB **dbpp, void *td, void *data, __ham_contract_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __ham_contract_desc, sizeof(__ham_contract_args), (void**)arg));
+}
+#define DB___ham_curadj 33
+typedef struct ___ham_curadj_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ u_int32_t indx;
+ u_int32_t len;
+ u_int32_t dup_off;
+ int add;
+ int is_dup;
+ u_int32_t order;
+} __ham_curadj_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __ham_curadj_desc[];
+static inline int
+__ham_curadj_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_pgno_t pgno, u_int32_t indx, u_int32_t len, u_int32_t dup_off,
+ int add, int is_dup, u_int32_t order)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___ham_curadj, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ sizeof(u_int32_t) + sizeof(u_int32_t),
+ __ham_curadj_desc, pgno, indx, len, dup_off, add, is_dup, order));
+}
+
+static inline int __ham_curadj_read(ENV *env,
+ DB **dbpp, void *td, void *data, __ham_curadj_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __ham_curadj_desc, sizeof(__ham_curadj_args), (void**)arg));
+}
+#define DB___ham_chgpg 34
+typedef struct ___ham_chgpg_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_ham_mode mode;
+ db_pgno_t old_pgno;
+ db_pgno_t new_pgno;
+ u_int32_t old_indx;
+ u_int32_t new_indx;
+} __ham_chgpg_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __ham_chgpg_desc[];
+static inline int
+__ham_chgpg_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_ham_mode mode, db_pgno_t old_pgno, db_pgno_t new_pgno, u_int32_t old_indx,
+ u_int32_t new_indx)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___ham_chgpg, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t),
+ __ham_chgpg_desc, mode, old_pgno, new_pgno, old_indx, new_indx));
+}
+
+static inline int __ham_chgpg_read(ENV *env,
+ DB **dbpp, void *td, void *data, __ham_chgpg_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __ham_chgpg_desc, sizeof(__ham_chgpg_args), (void**)arg));
+}
+#endif /* HAVE_HASH */
+#endif
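
[Editor's sketch] The hash readers differ from the fileops ones above by the
extra DB **dbpp and void *td parameters, which give __log_read_record what it
needs to resolve the logged fileid back to an open DB handle during recovery.
A hypothetical consumer -- the real recovery functions are in
src/hash/hash_rec.c, and the names below are illustrative:

	static int
	example_insdel_recover(ENV *env, DBT *rec, void *td)
	{
		__ham_insdel_args *argp;
		DB *file_dbp;
		int ret;

		/* The reader allocates *argp; the caller must free it. */
		if ((ret = __ham_insdel_read(env,
		    &file_dbp, td, rec->data, &argp)) != 0)
			return (ret);
		/* argp->opcode, argp->pgno and argp->key/data drive redo/undo. */
		__os_free(env, argp);
		return (0);
	}
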
diff --git a/src/dbinc_auto/hash_ext.h b/src/dbinc_auto/hash_ext.h
new file mode 100644
index 00000000..e83fe817
--- /dev/null
+++ b/src/dbinc_auto/hash_ext.h
@@ -0,0 +1,129 @@
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _hash_ext_h_
+#define _hash_ext_h_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+int __ham_quick_delete __P((DBC *));
+int __hamc_init __P((DBC *));
+int __hamc_count __P((DBC *, db_recno_t *));
+int __hamc_cmp __P((DBC *, DBC *, int *));
+int __hamc_dup __P((DBC *, DBC *));
+int __ham_contract_table __P((DBC *, DB_COMPACT *));
+u_int32_t __ham_call_hash __P((DBC *, u_int8_t *, u_int32_t));
+int __ham_overwrite __P((DBC *, DBT *, u_int32_t));
+int __ham_lookup __P((DBC *, const DBT *, u_int32_t, db_lockmode_t, db_pgno_t *));
+int __ham_init_dbt __P((ENV *, DBT *, u_int32_t, void **, u_int32_t *));
+int __hamc_update __P((DBC *, u_int32_t, db_ham_curadj, int));
+int __ham_get_clist __P((DB *, db_pgno_t, u_int32_t, DBC ***));
+int __ham_init_recover __P((ENV *, DB_DISTAB *));
+int __ham_insdel_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_insdel_42_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_newpage_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_splitdata_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_replace_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_replace_42_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_copypage_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_metagroup_42_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_metagroup_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_groupalloc_42_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_groupalloc_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_changeslot_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_contract_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_curadj_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_chgpg_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_init_print __P((ENV *, DB_DISTAB *));
+int __ham_compact_int __P((DBC *, DBT *, DBT *, u_int32_t, DB_COMPACT *, int *, u_int32_t));
+int __ham_compact_bucket __P((DBC *, DB_COMPACT *, int *));
+int __ham_compact_hash __P((DB *, DB_THREAD_INFO *, DB_TXN *, DB_COMPACT *));
+int __ham_pgin __P((DB *, db_pgno_t, void *, DBT *));
+int __ham_pgout __P((DB *, db_pgno_t, void *, DBT *));
+int __ham_mswap __P((ENV *, void *));
+int __ham_add_dup __P((DBC *, DBT *, u_int32_t, db_pgno_t *));
+int __ham_dup_convert __P((DBC *));
+int __ham_make_dup __P((ENV *, const DBT *, DBT *d, void **, u_int32_t *));
+void __ham_dsearch __P((DBC *, DBT *, u_int32_t *, int *, u_int32_t));
+u_int32_t __ham_func2 __P((DB *, const void *, u_int32_t));
+u_int32_t __ham_func3 __P((DB *, const void *, u_int32_t));
+u_int32_t __ham_func4 __P((DB *, const void *, u_int32_t));
+u_int32_t __ham_func5 __P((DB *, const void *, u_int32_t));
+u_int32_t __ham_test __P((DB *, const void *, u_int32_t));
+int __ham_get_meta __P((DBC *));
+int __ham_release_meta __P((DBC *));
+int __ham_dirty_meta __P((DBC *, u_int32_t));
+int __ham_return_meta __P((DBC *, u_int32_t, DBMETA **));
+int __ham_db_create __P((DB *));
+int __ham_db_close __P((DB *));
+int __ham_get_h_ffactor __P((DB *, u_int32_t *));
+int __ham_set_h_compare __P((DB *, int (*)(DB *, const DBT *, const DBT *)));
+int __ham_get_h_nelem __P((DB *, u_int32_t *));
+void __ham_copy_config __P((DB *, DB*, u_int32_t));
+int __ham_open __P((DB *, DB_THREAD_INFO *, DB_TXN *, const char * name, db_pgno_t, u_int32_t));
+int __ham_metachk __P((DB *, const char *, HMETA *));
+int __ham_new_file __P((DB *, DB_THREAD_INFO *, DB_TXN *, DB_FH *, const char *));
+int __ham_new_subdb __P((DB *, DB *, DB_THREAD_INFO *, DB_TXN *));
+int __ham_item __P((DBC *, db_lockmode_t, db_pgno_t *));
+int __ham_item_reset __P((DBC *));
+int __ham_item_init __P((DBC *));
+int __ham_item_last __P((DBC *, db_lockmode_t, db_pgno_t *));
+int __ham_item_first __P((DBC *, db_lockmode_t, db_pgno_t *));
+int __ham_item_prev __P((DBC *, db_lockmode_t, db_pgno_t *));
+int __ham_item_next __P((DBC *, db_lockmode_t, db_pgno_t *));
+int __ham_insertpair __P((DBC *, PAGE *p, db_indx_t *indxp, const DBT *, const DBT *, u_int32_t, u_int32_t));
+int __ham_getindex __P((DBC *, PAGE *, const DBT *, u_int32_t, int *, db_indx_t *));
+int __ham_verify_sorted_page __P((DBC *, PAGE *));
+int __ham_sort_page_cursor __P((DBC *, PAGE *));
+int __ham_sort_page __P((DBC *, PAGE **, PAGE *));
+int __ham_del_pair __P((DBC *, int, PAGE *));
+int __ham_replpair __P((DBC *, DBT *, u_int32_t));
+void __ham_onpage_replace __P((DB *, PAGE *, u_int32_t, int32_t, u_int32_t, int, DBT *));
+int __ham_merge_pages __P((DBC *, u_int32_t, u_int32_t, DB_COMPACT *));
+int __ham_split_page __P((DBC *, u_int32_t, u_int32_t));
+int __ham_add_el __P((DBC *, const DBT *, const DBT *, u_int32_t));
+int __ham_copypair __P((DBC *, PAGE *, u_int32_t, PAGE *, db_indx_t *, int));
+int __ham_add_ovflpage __P((DBC *, PAGE **));
+int __ham_get_cpage __P((DBC *, db_lockmode_t));
+int __ham_next_cpage __P((DBC *, db_pgno_t));
+int __ham_lock_bucket __P((DBC *, db_lockmode_t));
+void __ham_dpair __P((DB *, PAGE *, u_int32_t));
+int __ham_insdel_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_insdel_42_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_newpage_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_replace_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_replace_42_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_splitdata_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_copypage_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_metagroup_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_contract_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_groupalloc_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_changeslot_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_curadj_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_chgpg_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_metagroup_42_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_groupalloc_42_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_reclaim __P((DB *, DB_THREAD_INFO *, DB_TXN *txn, u_int32_t));
+int __ham_truncate __P((DBC *, u_int32_t *));
+int __ham_stat __P((DBC *, void *, u_int32_t));
+int __ham_stat_print __P((DBC *, u_int32_t));
+void __ham_print_cursor __P((DBC *));
+int __ham_traverse __P((DBC *, db_lockmode_t, int (*)(DBC *, PAGE *, void *, int *), void *, int));
+int __db_no_hash_am __P((ENV *));
+int __ham_30_hashmeta __P((DB *, char *, u_int8_t *));
+int __ham_30_sizefix __P((DB *, DB_FH *, char *, u_int8_t *));
+int __ham_31_hashmeta __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
+int __ham_31_hash __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
+int __ham_46_hashmeta __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
+int __ham_46_hash __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
+int __ham_vrfy_meta __P((DB *, VRFY_DBINFO *, HMETA *, db_pgno_t, u_int32_t));
+int __ham_vrfy __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, u_int32_t));
+int __ham_vrfy_structure __P((DB *, VRFY_DBINFO *, db_pgno_t, u_int32_t));
+int __ham_vrfy_hashing __P((DBC *, u_int32_t, HMETA *, u_int32_t, db_pgno_t, u_int32_t, u_int32_t (*) __P((DB *, const void *, u_int32_t))));
+int __ham_salvage __P((DB *, VRFY_DBINFO *, db_pgno_t, PAGE *, void *, int (*)(void *, const void *), u_int32_t));
+int __ham_meta2pgset __P((DB *, VRFY_DBINFO *, HMETA *, u_int32_t, DB *));
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_hash_ext_h_ */
diff --git a/src/dbinc_auto/heap_auto.h b/src/dbinc_auto/heap_auto.h
new file mode 100644
index 00000000..bf288627
--- /dev/null
+++ b/src/dbinc_auto/heap_auto.h
@@ -0,0 +1,146 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#ifndef __heap_AUTO_H
+#define __heap_AUTO_H
+#ifdef HAVE_HEAP
+#include "dbinc/log.h"
+#define DB___heap_addrem 151
+typedef struct ___heap_addrem_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ u_int32_t opcode;
+ int32_t fileid;
+ db_pgno_t pgno;
+ u_int32_t indx;
+ u_int32_t nbytes;
+ DBT hdr;
+ DBT dbt;
+ DB_LSN pagelsn;
+} __heap_addrem_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __heap_addrem_desc[];
+static inline int
+__heap_addrem_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags,
+ u_int32_t opcode, db_pgno_t pgno, u_int32_t indx, u_int32_t nbytes,
+ const DBT *hdr, const DBT *dbt, DB_LSN * pagelsn)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___heap_addrem, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + LOG_DBT_SIZE(hdr) +
+ LOG_DBT_SIZE(dbt) + sizeof(*pagelsn),
+ __heap_addrem_desc,
+ opcode, pgno, indx, nbytes, hdr, dbt, pagelsn));
+}
+
+static inline int __heap_addrem_read(ENV *env,
+ DB **dbpp, void *td, void *data, __heap_addrem_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __heap_addrem_desc, sizeof(__heap_addrem_args), (void**)arg));
+}
+#define DB___heap_pg_alloc 152
+typedef struct ___heap_pg_alloc_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ DB_LSN meta_lsn;
+ db_pgno_t meta_pgno;
+ db_pgno_t pgno;
+ u_int32_t ptype;
+ db_pgno_t last_pgno;
+} __heap_pg_alloc_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __heap_pg_alloc_desc[];
+static inline int
+__heap_pg_alloc_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, DB_LSN * meta_lsn, db_pgno_t meta_pgno, db_pgno_t pgno, u_int32_t ptype,
+ db_pgno_t last_pgno)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___heap_pg_alloc, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(*meta_lsn) + sizeof(u_int32_t) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t),
+ __heap_pg_alloc_desc, meta_lsn, meta_pgno, pgno, ptype, last_pgno));
+}
+
+static inline int __heap_pg_alloc_read(ENV *env,
+ DB **dbpp, void *td, void *data, __heap_pg_alloc_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __heap_pg_alloc_desc, sizeof(__heap_pg_alloc_args), (void**)arg));
+}
+#define DB___heap_trunc_meta 153
+typedef struct ___heap_trunc_meta_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ u_int32_t last_pgno;
+ u_int32_t key_count;
+ u_int32_t record_count;
+ u_int32_t curregion;
+ u_int32_t nregions;
+ DB_LSN pagelsn;
+} __heap_trunc_meta_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __heap_trunc_meta_desc[];
+static inline int
+__heap_trunc_meta_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_pgno_t pgno, u_int32_t last_pgno, u_int32_t key_count, u_int32_t record_count,
+ u_int32_t curregion, u_int32_t nregions, DB_LSN * pagelsn)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___heap_trunc_meta, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ sizeof(u_int32_t) + sizeof(*pagelsn),
+ __heap_trunc_meta_desc, pgno, last_pgno, key_count, record_count, curregion, nregions, pagelsn));
+}
+
+static inline int __heap_trunc_meta_read(ENV *env,
+ DB **dbpp, void *td, void *data, __heap_trunc_meta_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __heap_trunc_meta_desc, sizeof(__heap_trunc_meta_args), (void**)arg));
+}
+#define DB___heap_trunc_page 154
+typedef struct ___heap_trunc_page_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ DBT old_data;
+ u_int32_t is_region;
+ DB_LSN pagelsn;
+} __heap_trunc_page_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __heap_trunc_page_desc[];
+static inline int
+__heap_trunc_page_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_pgno_t pgno, const DBT *old_data, u_int32_t is_region, DB_LSN * pagelsn)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___heap_trunc_page, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + LOG_DBT_SIZE(old_data) +
+ sizeof(u_int32_t) + sizeof(*pagelsn),
+ __heap_trunc_page_desc, pgno, old_data, is_region, pagelsn));
+}
+
+static inline int __heap_trunc_page_read(ENV *env,
+ DB **dbpp, void *td, void *data, __heap_trunc_page_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __heap_trunc_page_desc, sizeof(__heap_trunc_page_args), (void**)arg));
+}
+#endif /* HAVE_HEAP */
+#endif
diff --git a/src/dbinc_auto/heap_ext.h b/src/dbinc_auto/heap_ext.h
new file mode 100644
index 00000000..8bc24b61
--- /dev/null
+++ b/src/dbinc_auto/heap_ext.h
@@ -0,0 +1,58 @@
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _heap_ext_h_
+#define _heap_ext_h_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+int __heapc_init __P((DBC *));
+int __heap_ditem __P((DBC *, PAGE *, u_int32_t, u_int32_t));
+int __heap_append __P((DBC *, DBT *, DBT *));
+int __heap_pitem __P((DBC *, PAGE *, u_int32_t, u_int32_t, DBT *, DBT *));
+int __heapc_dup __P((DBC *, DBC *));
+int __heapc_gsplit __P((DBC *, DBT *, void **, u_int32_t *));
+int __heapc_refresh __P((DBC *));
+int __heap_init_recover __P((ENV *, DB_DISTAB *));
+int __heap_addrem_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __heap_pg_alloc_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __heap_trunc_meta_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __heap_trunc_page_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __heap_init_print __P((ENV *, DB_DISTAB *));
+int __heap_backup __P((DB_ENV *, DB *, DB_THREAD_INFO *, DB_FH *, void *, u_int32_t));
+int __heap_pgin __P((DB *, db_pgno_t, void *, DBT *));
+int __heap_pgout __P((DB *, db_pgno_t, void *, DBT *));
+int __heap_mswap __P((ENV *, PAGE *));
+int __heap_db_create __P((DB *));
+int __heap_db_close __P((DB *));
+int __heap_get_heapsize __P((DB *, u_int32_t *, u_int32_t *));
+int __heap_get_heap_regionsize __P((DB *, u_int32_t *));
+int __heap_set_heapsize __P((DB *, u_int32_t, u_int32_t, u_int32_t));
+int __heap_set_heap_regionsize __P((DB *, u_int32_t));
+int __heap_exist __P((void));
+int __heap_open __P((DB *, DB_THREAD_INFO *, DB_TXN *, const char *, db_pgno_t, u_int32_t));
+int __heap_metachk __P((DB *, const char *, HEAPMETA *));
+int __heap_read_meta __P((DB *, DB_THREAD_INFO *, DB_TXN *, db_pgno_t, u_int32_t));
+int __heap_new_file __P((DB *, DB_THREAD_INFO *, DB_TXN *, DB_FH *, const char *));
+int __heap_create_region __P((DBC *, db_pgno_t));
+int __heap_addrem_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __heap_pg_alloc_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __heap_trunc_meta_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __heap_trunc_page_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __heap_truncate __P((DBC *, u_int32_t *));
+int __heap_stat __P((DBC *, void *, u_int32_t));
+int __heap_stat_print __P((DBC *, u_int32_t));
+void __heap_print_cursor __P((DBC *));
+int __heap_stat_callback __P((DBC *, PAGE *, void *, int *));
+int __heap_traverse __P((DBC *, int (*)(DBC *, PAGE *, void *, int *), void *));
+int __db_no_heap_am __P((ENV *));
+int __heap_vrfy_meta __P((DB *, VRFY_DBINFO *, HEAPMETA *, db_pgno_t, u_int32_t));
+int __heap_vrfy __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, u_int32_t));
+int __heap_vrfy_structure __P((DB *, VRFY_DBINFO *, u_int32_t));
+int __heap_salvage __P((DB *, VRFY_DBINFO *, db_pgno_t, PAGE *, void *, int (*)(void *, const void *), u_int32_t));
+int __heap_meta2pgset __P((DB *, VRFY_DBINFO *, HEAPMETA *, DB *));
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_heap_ext_h_ */
diff --git a/src/dbinc_auto/hmac_ext.h b/src/dbinc_auto/hmac_ext.h
new file mode 100644
index 00000000..c1371014
--- /dev/null
+++ b/src/dbinc_auto/hmac_ext.h
@@ -0,0 +1,20 @@
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _hmac_ext_h_
+#define _hmac_ext_h_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+void __db_chksum __P((void *, u_int8_t *, size_t, u_int8_t *, u_int8_t *));
+void __db_derive_mac __P((u_int8_t *, size_t, u_int8_t *));
+int __db_check_chksum __P((ENV *, void *, DB_CIPHER *, u_int8_t *, void *, size_t, int));
+void __db_SHA1Transform __P((u_int32_t *, unsigned char *));
+void __db_SHA1Init __P((SHA1_CTX *));
+void __db_SHA1Update __P((SHA1_CTX *, unsigned char *, size_t));
+void __db_SHA1Final __P((unsigned char *, SHA1_CTX *));
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_hmac_ext_h_ */
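
[Editor's sketch] The last four prototypes are the library's private SHA-1, and
chaining them follows the usual Init/Update/Final pattern. A minimal sketch
using only the signatures above (SHA1_CTX comes from the internal headers,
presumably dbinc/hmac.h; the 20-byte digest size is standard SHA-1):

	static void
	example_sha1(unsigned char *buf, size_t len, unsigned char digest[20])
	{
		SHA1_CTX ctx;

		__db_SHA1Init(&ctx);
		__db_SHA1Update(&ctx, buf, len);
		__db_SHA1Final(digest, &ctx);
	}
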
diff --git a/src/dbinc_auto/int_def.in b/src/dbinc_auto/int_def.in
new file mode 100644
index 00000000..dce2831c
--- /dev/null
+++ b/src/dbinc_auto/int_def.in
@@ -0,0 +1,2265 @@
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _DB_INT_DEF_IN_
+#define _DB_INT_DEF_IN_
+
+#define __crdel_metasub_desc __crdel_metasub_desc@DB_VERSION_UNIQUE_NAME@
+#define __crdel_inmem_create_desc __crdel_inmem_create_desc@DB_VERSION_UNIQUE_NAME@
+#define __crdel_inmem_rename_desc __crdel_inmem_rename_desc@DB_VERSION_UNIQUE_NAME@
+#define __crdel_inmem_remove_desc __crdel_inmem_remove_desc@DB_VERSION_UNIQUE_NAME@
+#define __crdel_init_recover __crdel_init_recover@DB_VERSION_UNIQUE_NAME@
+#define __crdel_metasub_print __crdel_metasub_print@DB_VERSION_UNIQUE_NAME@
+#define __crdel_inmem_create_print __crdel_inmem_create_print@DB_VERSION_UNIQUE_NAME@
+#define __crdel_inmem_rename_print __crdel_inmem_rename_print@DB_VERSION_UNIQUE_NAME@
+#define __crdel_inmem_remove_print __crdel_inmem_remove_print@DB_VERSION_UNIQUE_NAME@
+#define __crdel_init_print __crdel_init_print@DB_VERSION_UNIQUE_NAME@
+#define __crdel_metasub_recover __crdel_metasub_recover@DB_VERSION_UNIQUE_NAME@
+#define __crdel_inmem_create_recover __crdel_inmem_create_recover@DB_VERSION_UNIQUE_NAME@
+#define __crdel_inmem_rename_recover __crdel_inmem_rename_recover@DB_VERSION_UNIQUE_NAME@
+#define __crdel_inmem_remove_recover __crdel_inmem_remove_recover@DB_VERSION_UNIQUE_NAME@
+#define __db_master_open __db_master_open@DB_VERSION_UNIQUE_NAME@
+#define __db_master_update __db_master_update@DB_VERSION_UNIQUE_NAME@
+#define __env_dbreg_setup __env_dbreg_setup@DB_VERSION_UNIQUE_NAME@
+#define __env_setup __env_setup@DB_VERSION_UNIQUE_NAME@
+#define __env_mpool __env_mpool@DB_VERSION_UNIQUE_NAME@
+#define __db_close __db_close@DB_VERSION_UNIQUE_NAME@
+#define __db_refresh __db_refresh@DB_VERSION_UNIQUE_NAME@
+#define __db_log_page __db_log_page@DB_VERSION_UNIQUE_NAME@
+#define __db_walk_cursors __db_walk_cursors@DB_VERSION_UNIQUE_NAME@
+#define __db_backup_name __db_backup_name@DB_VERSION_UNIQUE_NAME@
+#ifdef CONFIG_TEST
+#define __db_testcopy __db_testcopy@DB_VERSION_UNIQUE_NAME@
+#endif
+#define __db_testdocopy __db_testdocopy@DB_VERSION_UNIQUE_NAME@
+#define __db_cursor_int __db_cursor_int@DB_VERSION_UNIQUE_NAME@
+#define __db_put __db_put@DB_VERSION_UNIQUE_NAME@
+#define __db_del __db_del@DB_VERSION_UNIQUE_NAME@
+#define __db_sync __db_sync@DB_VERSION_UNIQUE_NAME@
+#define __db_associate __db_associate@DB_VERSION_UNIQUE_NAME@
+#define __db_secondary_close __db_secondary_close@DB_VERSION_UNIQUE_NAME@
+#define __db_associate_foreign __db_associate_foreign@DB_VERSION_UNIQUE_NAME@
+#define __db_addrem_desc __db_addrem_desc@DB_VERSION_UNIQUE_NAME@
+#define __db_addrem_42_desc __db_addrem_42_desc@DB_VERSION_UNIQUE_NAME@
+#define __db_big_desc __db_big_desc@DB_VERSION_UNIQUE_NAME@
+#define __db_big_42_desc __db_big_42_desc@DB_VERSION_UNIQUE_NAME@
+#define __db_ovref_desc __db_ovref_desc@DB_VERSION_UNIQUE_NAME@
+#define __db_relink_42_desc __db_relink_42_desc@DB_VERSION_UNIQUE_NAME@
+#define __db_debug_desc __db_debug_desc@DB_VERSION_UNIQUE_NAME@
+#define __db_noop_desc __db_noop_desc@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_alloc_42_desc __db_pg_alloc_42_desc@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_alloc_desc __db_pg_alloc_desc@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_free_42_desc __db_pg_free_42_desc@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_free_desc __db_pg_free_desc@DB_VERSION_UNIQUE_NAME@
+#define __db_cksum_desc __db_cksum_desc@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_freedata_42_desc __db_pg_freedata_42_desc@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_freedata_desc __db_pg_freedata_desc@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_init_desc __db_pg_init_desc@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_sort_44_desc __db_pg_sort_44_desc@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_trunc_desc __db_pg_trunc_desc@DB_VERSION_UNIQUE_NAME@
+#define __db_realloc_desc __db_realloc_desc@DB_VERSION_UNIQUE_NAME@
+#define __db_relink_desc __db_relink_desc@DB_VERSION_UNIQUE_NAME@
+#define __db_merge_desc __db_merge_desc@DB_VERSION_UNIQUE_NAME@
+#define __db_pgno_desc __db_pgno_desc@DB_VERSION_UNIQUE_NAME@
+#define __db_init_recover __db_init_recover@DB_VERSION_UNIQUE_NAME@
+#define __db_addrem_print __db_addrem_print@DB_VERSION_UNIQUE_NAME@
+#define __db_addrem_42_print __db_addrem_42_print@DB_VERSION_UNIQUE_NAME@
+#define __db_big_print __db_big_print@DB_VERSION_UNIQUE_NAME@
+#define __db_big_42_print __db_big_42_print@DB_VERSION_UNIQUE_NAME@
+#define __db_ovref_print __db_ovref_print@DB_VERSION_UNIQUE_NAME@
+#define __db_relink_42_print __db_relink_42_print@DB_VERSION_UNIQUE_NAME@
+#define __db_debug_print __db_debug_print@DB_VERSION_UNIQUE_NAME@
+#define __db_noop_print __db_noop_print@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_alloc_42_print __db_pg_alloc_42_print@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_alloc_print __db_pg_alloc_print@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_free_42_print __db_pg_free_42_print@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_free_print __db_pg_free_print@DB_VERSION_UNIQUE_NAME@
+#define __db_cksum_print __db_cksum_print@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_freedata_42_print __db_pg_freedata_42_print@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_freedata_print __db_pg_freedata_print@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_init_print __db_pg_init_print@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_sort_44_print __db_pg_sort_44_print@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_trunc_print __db_pg_trunc_print@DB_VERSION_UNIQUE_NAME@
+#define __db_realloc_print __db_realloc_print@DB_VERSION_UNIQUE_NAME@
+#define __db_relink_print __db_relink_print@DB_VERSION_UNIQUE_NAME@
+#define __db_merge_print __db_merge_print@DB_VERSION_UNIQUE_NAME@
+#define __db_pgno_print __db_pgno_print@DB_VERSION_UNIQUE_NAME@
+#define __db_init_print __db_init_print@DB_VERSION_UNIQUE_NAME@
+#define __db_dbbackup_pp __db_dbbackup_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_dbbackup __db_dbbackup@DB_VERSION_UNIQUE_NAME@
+#define __db_backup __db_backup@DB_VERSION_UNIQUE_NAME@
+#define __dbc_close __dbc_close@DB_VERSION_UNIQUE_NAME@
+#define __dbc_destroy __dbc_destroy@DB_VERSION_UNIQUE_NAME@
+#define __dbc_cmp __dbc_cmp@DB_VERSION_UNIQUE_NAME@
+#define __dbc_count __dbc_count@DB_VERSION_UNIQUE_NAME@
+#define __dbc_del __dbc_del@DB_VERSION_UNIQUE_NAME@
+#define __dbc_idel __dbc_idel@DB_VERSION_UNIQUE_NAME@
+#ifdef HAVE_COMPRESSION
+#define __dbc_bulk_del __dbc_bulk_del@DB_VERSION_UNIQUE_NAME@
+#endif
+#define __dbc_dup __dbc_dup@DB_VERSION_UNIQUE_NAME@
+#define __dbc_idup __dbc_idup@DB_VERSION_UNIQUE_NAME@
+#define __dbc_newopd __dbc_newopd@DB_VERSION_UNIQUE_NAME@
+#define __dbc_get __dbc_get@DB_VERSION_UNIQUE_NAME@
+#define __dbc_iget __dbc_iget@DB_VERSION_UNIQUE_NAME@
+#define __dbc_put __dbc_put@DB_VERSION_UNIQUE_NAME@
+#define __dbc_iput __dbc_iput@DB_VERSION_UNIQUE_NAME@
+#define __db_duperr __db_duperr@DB_VERSION_UNIQUE_NAME@
+#define __dbc_cleanup __dbc_cleanup@DB_VERSION_UNIQUE_NAME@
+#define __dbc_secondary_get_pp __dbc_secondary_get_pp@DB_VERSION_UNIQUE_NAME@
+#define __dbc_pget __dbc_pget@DB_VERSION_UNIQUE_NAME@
+#define __dbc_del_primary __dbc_del_primary@DB_VERSION_UNIQUE_NAME@
+#define __db_s_first __db_s_first@DB_VERSION_UNIQUE_NAME@
+#define __db_s_next __db_s_next@DB_VERSION_UNIQUE_NAME@
+#define __db_s_done __db_s_done@DB_VERSION_UNIQUE_NAME@
+#define __db_buildpartial __db_buildpartial@DB_VERSION_UNIQUE_NAME@
+#define __db_partsize __db_partsize@DB_VERSION_UNIQUE_NAME@
+#ifdef DIAGNOSTIC
+#define __db_check_skeyset __db_check_skeyset@DB_VERSION_UNIQUE_NAME@
+#endif
+#define __cdsgroup_begin __cdsgroup_begin@DB_VERSION_UNIQUE_NAME@
+#define __cdsgroup_begin_pp __cdsgroup_begin_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_compact_int __db_compact_int@DB_VERSION_UNIQUE_NAME@
+#define __db_exchange_page __db_exchange_page@DB_VERSION_UNIQUE_NAME@
+#define __db_truncate_overflow __db_truncate_overflow@DB_VERSION_UNIQUE_NAME@
+#define __db_truncate_root __db_truncate_root@DB_VERSION_UNIQUE_NAME@
+#define __db_find_free __db_find_free@DB_VERSION_UNIQUE_NAME@
+#define __db_relink __db_relink@DB_VERSION_UNIQUE_NAME@
+#define __db_move_metadata __db_move_metadata@DB_VERSION_UNIQUE_NAME@
+#define __db_pgin __db_pgin@DB_VERSION_UNIQUE_NAME@
+#define __db_pgout __db_pgout@DB_VERSION_UNIQUE_NAME@
+#define __db_decrypt_pg __db_decrypt_pg@DB_VERSION_UNIQUE_NAME@
+#define __db_encrypt_and_checksum_pg __db_encrypt_and_checksum_pg@DB_VERSION_UNIQUE_NAME@
+#define __db_metaswap __db_metaswap@DB_VERSION_UNIQUE_NAME@
+#define __db_byteswap __db_byteswap@DB_VERSION_UNIQUE_NAME@
+#define __db_pageswap __db_pageswap@DB_VERSION_UNIQUE_NAME@
+#define __db_recordswap __db_recordswap@DB_VERSION_UNIQUE_NAME@
+#define __db_dispatch __db_dispatch@DB_VERSION_UNIQUE_NAME@
+#define __db_add_recovery __db_add_recovery@DB_VERSION_UNIQUE_NAME@
+#define __db_add_recovery_int __db_add_recovery_int@DB_VERSION_UNIQUE_NAME@
+#define __db_txnlist_init __db_txnlist_init@DB_VERSION_UNIQUE_NAME@
+#define __db_txnlist_add __db_txnlist_add@DB_VERSION_UNIQUE_NAME@
+#define __db_txnlist_remove __db_txnlist_remove@DB_VERSION_UNIQUE_NAME@
+#define __db_txnlist_ckp __db_txnlist_ckp@DB_VERSION_UNIQUE_NAME@
+#define __db_txnlist_end __db_txnlist_end@DB_VERSION_UNIQUE_NAME@
+#define __db_txnlist_find __db_txnlist_find@DB_VERSION_UNIQUE_NAME@
+#define __db_txnlist_update __db_txnlist_update@DB_VERSION_UNIQUE_NAME@
+#define __db_txnlist_gen __db_txnlist_gen@DB_VERSION_UNIQUE_NAME@
+#define __db_txnlist_lsnadd __db_txnlist_lsnadd@DB_VERSION_UNIQUE_NAME@
+#define __db_txnlist_lsnget __db_txnlist_lsnget@DB_VERSION_UNIQUE_NAME@
+#define __db_txnlist_lsninit __db_txnlist_lsninit@DB_VERSION_UNIQUE_NAME@
+#define __db_txnlist_print __db_txnlist_print@DB_VERSION_UNIQUE_NAME@
+#define __db_ditem_nolog __db_ditem_nolog@DB_VERSION_UNIQUE_NAME@
+#define __db_ditem __db_ditem@DB_VERSION_UNIQUE_NAME@
+#define __db_pitem_nolog __db_pitem_nolog@DB_VERSION_UNIQUE_NAME@
+#define __db_pitem __db_pitem@DB_VERSION_UNIQUE_NAME@
+#define __db_associate_pp __db_associate_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_close_pp __db_close_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_cursor_pp __db_cursor_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_cursor __db_cursor@DB_VERSION_UNIQUE_NAME@
+#define __db_del_pp __db_del_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_exists __db_exists@DB_VERSION_UNIQUE_NAME@
+#define __db_fd_pp __db_fd_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_get_pp __db_get_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_get __db_get@DB_VERSION_UNIQUE_NAME@
+#define __db_join_pp __db_join_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_key_range_pp __db_key_range_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_open_pp __db_open_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_pget_pp __db_pget_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_pget __db_pget@DB_VERSION_UNIQUE_NAME@
+#define __db_put_pp __db_put_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_compact_pp __db_compact_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_associate_foreign_pp __db_associate_foreign_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_sync_pp __db_sync_pp@DB_VERSION_UNIQUE_NAME@
+#define __dbc_close_pp __dbc_close_pp@DB_VERSION_UNIQUE_NAME@
+#define __dbc_cmp_pp __dbc_cmp_pp@DB_VERSION_UNIQUE_NAME@
+#define __dbc_count_pp __dbc_count_pp@DB_VERSION_UNIQUE_NAME@
+#define __dbc_del_pp __dbc_del_pp@DB_VERSION_UNIQUE_NAME@
+#define __dbc_dup_pp __dbc_dup_pp@DB_VERSION_UNIQUE_NAME@
+#define __dbc_get_pp __dbc_get_pp@DB_VERSION_UNIQUE_NAME@
+#define __dbc_get_arg __dbc_get_arg@DB_VERSION_UNIQUE_NAME@
+#define __db_secondary_close_pp __db_secondary_close_pp@DB_VERSION_UNIQUE_NAME@
+#define __dbc_pget_pp __dbc_pget_pp@DB_VERSION_UNIQUE_NAME@
+#define __dbc_put_pp __dbc_put_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_txn_auto_init __db_txn_auto_init@DB_VERSION_UNIQUE_NAME@
+#define __db_txn_auto_resolve __db_txn_auto_resolve@DB_VERSION_UNIQUE_NAME@
+#define __db_join __db_join@DB_VERSION_UNIQUE_NAME@
+#define __db_join_close __db_join_close@DB_VERSION_UNIQUE_NAME@
+#define __db_secondary_corrupt __db_secondary_corrupt@DB_VERSION_UNIQUE_NAME@
+#define __db_new __db_new@DB_VERSION_UNIQUE_NAME@
+#define __db_free __db_free@DB_VERSION_UNIQUE_NAME@
+#ifdef HAVE_FTRUNCATE
+#define __db_freelist_pos __db_freelist_pos@DB_VERSION_UNIQUE_NAME@
+#endif
+#define __db_freelist_sort __db_freelist_sort@DB_VERSION_UNIQUE_NAME@
+#ifdef HAVE_FTRUNCATE
+#define __db_pg_truncate __db_pg_truncate@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifdef HAVE_FTRUNCATE
+#define __db_free_truncate __db_free_truncate@DB_VERSION_UNIQUE_NAME@
+#endif
+#define __db_lprint __db_lprint@DB_VERSION_UNIQUE_NAME@
+#define __db_lget __db_lget@DB_VERSION_UNIQUE_NAME@
+#ifdef DIAGNOSTIC
+#define __db_haslock __db_haslock@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifdef DIAGNOSTIC
+#define __db_has_pagelock __db_has_pagelock@DB_VERSION_UNIQUE_NAME@
+#endif
+#define __db_lput __db_lput@DB_VERSION_UNIQUE_NAME@
+#define __db_create_internal __db_create_internal@DB_VERSION_UNIQUE_NAME@
+#define __dbh_am_chk __dbh_am_chk@DB_VERSION_UNIQUE_NAME@
+#define __db_get_flags __db_get_flags@DB_VERSION_UNIQUE_NAME@
+#define __db_set_flags __db_set_flags@DB_VERSION_UNIQUE_NAME@
+#define __db_get_lorder __db_get_lorder@DB_VERSION_UNIQUE_NAME@
+#define __db_set_lorder __db_set_lorder@DB_VERSION_UNIQUE_NAME@
+#define __db_set_pagesize __db_set_pagesize@DB_VERSION_UNIQUE_NAME@
+#define __db_open __db_open@DB_VERSION_UNIQUE_NAME@
+#define __db_get_open_flags __db_get_open_flags@DB_VERSION_UNIQUE_NAME@
+#define __db_new_file __db_new_file@DB_VERSION_UNIQUE_NAME@
+#define __db_init_subdb __db_init_subdb@DB_VERSION_UNIQUE_NAME@
+#define __db_chk_meta __db_chk_meta@DB_VERSION_UNIQUE_NAME@
+#define __db_meta_setup __db_meta_setup@DB_VERSION_UNIQUE_NAME@
+#define __db_reopen __db_reopen@DB_VERSION_UNIQUE_NAME@
+#define __db_goff __db_goff@DB_VERSION_UNIQUE_NAME@
+#define __db_poff __db_poff@DB_VERSION_UNIQUE_NAME@
+#define __db_ovref __db_ovref@DB_VERSION_UNIQUE_NAME@
+#define __db_doff __db_doff@DB_VERSION_UNIQUE_NAME@
+#define __db_moff __db_moff@DB_VERSION_UNIQUE_NAME@
+#define __db_coff __db_coff@DB_VERSION_UNIQUE_NAME@
+#define __db_vrfy_overflow __db_vrfy_overflow@DB_VERSION_UNIQUE_NAME@
+#define __db_vrfy_ovfl_structure __db_vrfy_ovfl_structure@DB_VERSION_UNIQUE_NAME@
+#define __db_safe_goff __db_safe_goff@DB_VERSION_UNIQUE_NAME@
+#define __db_loadme __db_loadme@DB_VERSION_UNIQUE_NAME@
+#define __db_dumptree __db_dumptree@DB_VERSION_UNIQUE_NAME@
+#define __db_get_flags_fn __db_get_flags_fn@DB_VERSION_UNIQUE_NAME@
+#define __db_prnpage __db_prnpage@DB_VERSION_UNIQUE_NAME@
+#define __db_prpage __db_prpage@DB_VERSION_UNIQUE_NAME@
+#define __db_lockmode_to_string __db_lockmode_to_string@DB_VERSION_UNIQUE_NAME@
+#define __db_prpage_int __db_prpage_int@DB_VERSION_UNIQUE_NAME@
+#define __db_prbytes __db_prbytes@DB_VERSION_UNIQUE_NAME@
+#define __db_prflags __db_prflags@DB_VERSION_UNIQUE_NAME@
+#define __db_name_to_val __db_name_to_val@DB_VERSION_UNIQUE_NAME@
+#define __db_pagetype_to_string __db_pagetype_to_string@DB_VERSION_UNIQUE_NAME@
+#define __db_dump_pp __db_dump_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_dump __db_dump@DB_VERSION_UNIQUE_NAME@
+#define __db_prdbt __db_prdbt@DB_VERSION_UNIQUE_NAME@
+#define __db_prheader __db_prheader@DB_VERSION_UNIQUE_NAME@
+#define __db_prfooter __db_prfooter@DB_VERSION_UNIQUE_NAME@
+#define __db_pr_callback __db_pr_callback@DB_VERSION_UNIQUE_NAME@
+#define __db_dbtype_to_string __db_dbtype_to_string@DB_VERSION_UNIQUE_NAME@
+#define __db_addrem_recover __db_addrem_recover@DB_VERSION_UNIQUE_NAME@
+#define __db_addrem_42_recover __db_addrem_42_recover@DB_VERSION_UNIQUE_NAME@
+#define __db_big_recover __db_big_recover@DB_VERSION_UNIQUE_NAME@
+#define __db_big_42_recover __db_big_42_recover@DB_VERSION_UNIQUE_NAME@
+#define __db_ovref_recover __db_ovref_recover@DB_VERSION_UNIQUE_NAME@
+#define __db_debug_recover __db_debug_recover@DB_VERSION_UNIQUE_NAME@
+#define __db_noop_recover __db_noop_recover@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_alloc_recover __db_pg_alloc_recover@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_free_recover __db_pg_free_recover@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_freedata_recover __db_pg_freedata_recover@DB_VERSION_UNIQUE_NAME@
+#define __db_cksum_recover __db_cksum_recover@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_init_recover __db_pg_init_recover@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_trunc_recover __db_pg_trunc_recover@DB_VERSION_UNIQUE_NAME@
+#define __db_realloc_recover __db_realloc_recover@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_sort_44_recover __db_pg_sort_44_recover@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_alloc_42_recover __db_pg_alloc_42_recover@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_free_42_recover __db_pg_free_42_recover@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_freedata_42_recover __db_pg_freedata_42_recover@DB_VERSION_UNIQUE_NAME@
+#define __db_relink_42_recover __db_relink_42_recover@DB_VERSION_UNIQUE_NAME@
+#define __db_relink_recover __db_relink_recover@DB_VERSION_UNIQUE_NAME@
+#define __db_merge_recover __db_merge_recover@DB_VERSION_UNIQUE_NAME@
+#define __db_pgno_recover __db_pgno_recover@DB_VERSION_UNIQUE_NAME@
+#define __db_pglist_swap __db_pglist_swap@DB_VERSION_UNIQUE_NAME@
+#define __db_pglist_print __db_pglist_print@DB_VERSION_UNIQUE_NAME@
+#define __db_traverse_big __db_traverse_big@DB_VERSION_UNIQUE_NAME@
+#define __db_reclaim_callback __db_reclaim_callback@DB_VERSION_UNIQUE_NAME@
+#define __db_truncate_callback __db_truncate_callback@DB_VERSION_UNIQUE_NAME@
+#define __env_dbremove_pp __env_dbremove_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_remove_pp __db_remove_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_remove __db_remove@DB_VERSION_UNIQUE_NAME@
+#define __db_remove_int __db_remove_int@DB_VERSION_UNIQUE_NAME@
+#define __db_inmem_remove __db_inmem_remove@DB_VERSION_UNIQUE_NAME@
+#define __env_dbrename_pp __env_dbrename_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_rename_pp __db_rename_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_rename_int __db_rename_int@DB_VERSION_UNIQUE_NAME@
+#define __db_ret __db_ret@DB_VERSION_UNIQUE_NAME@
+#define __db_retcopy __db_retcopy@DB_VERSION_UNIQUE_NAME@
+#define __env_fileid_reset_pp __env_fileid_reset_pp@DB_VERSION_UNIQUE_NAME@
+#define __env_fileid_reset __env_fileid_reset@DB_VERSION_UNIQUE_NAME@
+#define __env_lsn_reset_pp __env_lsn_reset_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_lsn_reset __db_lsn_reset@DB_VERSION_UNIQUE_NAME@
+#define __db_compare_both __db_compare_both@DB_VERSION_UNIQUE_NAME@
+#define __db_sort_multiple __db_sort_multiple@DB_VERSION_UNIQUE_NAME@
+#define __db_stat_pp __db_stat_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_stat_print_pp __db_stat_print_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_stat_print __db_stat_print@DB_VERSION_UNIQUE_NAME@
+#define __db_truncate_pp __db_truncate_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_truncate __db_truncate@DB_VERSION_UNIQUE_NAME@
+#define __db_upgrade_pp __db_upgrade_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_upgrade __db_upgrade@DB_VERSION_UNIQUE_NAME@
+#define __db_lastpgno __db_lastpgno@DB_VERSION_UNIQUE_NAME@
+#define __db_31_offdup __db_31_offdup@DB_VERSION_UNIQUE_NAME@
+#define __db_verify_pp __db_verify_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_verify_internal __db_verify_internal@DB_VERSION_UNIQUE_NAME@
+#define __db_verify __db_verify@DB_VERSION_UNIQUE_NAME@
+#define __db_vrfy_common __db_vrfy_common@DB_VERSION_UNIQUE_NAME@
+#define __db_vrfy_datapage __db_vrfy_datapage@DB_VERSION_UNIQUE_NAME@
+#define __db_vrfy_meta __db_vrfy_meta@DB_VERSION_UNIQUE_NAME@
+#define __db_vrfy_struct_feedback __db_vrfy_struct_feedback@DB_VERSION_UNIQUE_NAME@
+#define __db_salvage_pg __db_salvage_pg@DB_VERSION_UNIQUE_NAME@
+#define __db_salvage_leaf __db_salvage_leaf@DB_VERSION_UNIQUE_NAME@
+#define __db_vrfy_inpitem __db_vrfy_inpitem@DB_VERSION_UNIQUE_NAME@
+#define __db_vrfy_duptype __db_vrfy_duptype@DB_VERSION_UNIQUE_NAME@
+#define __db_salvage_duptree __db_salvage_duptree@DB_VERSION_UNIQUE_NAME@
+#define __db_vrfy_dbinfo_create __db_vrfy_dbinfo_create@DB_VERSION_UNIQUE_NAME@
+#define __db_vrfy_dbinfo_destroy __db_vrfy_dbinfo_destroy@DB_VERSION_UNIQUE_NAME@
+#define __db_vrfy_getpageinfo __db_vrfy_getpageinfo@DB_VERSION_UNIQUE_NAME@
+#define __db_vrfy_putpageinfo __db_vrfy_putpageinfo@DB_VERSION_UNIQUE_NAME@
+#define __db_vrfy_pgset __db_vrfy_pgset@DB_VERSION_UNIQUE_NAME@
+#define __db_vrfy_pgset_get __db_vrfy_pgset_get@DB_VERSION_UNIQUE_NAME@
+#define __db_vrfy_pgset_inc __db_vrfy_pgset_inc@DB_VERSION_UNIQUE_NAME@
+#define __db_vrfy_pgset_next __db_vrfy_pgset_next@DB_VERSION_UNIQUE_NAME@
+#define __db_vrfy_childcursor __db_vrfy_childcursor@DB_VERSION_UNIQUE_NAME@
+#define __db_vrfy_childput __db_vrfy_childput@DB_VERSION_UNIQUE_NAME@
+#define __db_vrfy_ccset __db_vrfy_ccset@DB_VERSION_UNIQUE_NAME@
+#define __db_vrfy_ccnext __db_vrfy_ccnext@DB_VERSION_UNIQUE_NAME@
+#define __db_vrfy_ccclose __db_vrfy_ccclose@DB_VERSION_UNIQUE_NAME@
+#define __db_salvage_init __db_salvage_init@DB_VERSION_UNIQUE_NAME@
+#define __db_salvage_destroy __db_salvage_destroy@DB_VERSION_UNIQUE_NAME@
+#define __db_salvage_getnext __db_salvage_getnext@DB_VERSION_UNIQUE_NAME@
+#define __db_salvage_isdone __db_salvage_isdone@DB_VERSION_UNIQUE_NAME@
+#define __db_salvage_markdone __db_salvage_markdone@DB_VERSION_UNIQUE_NAME@
+#define __db_salvage_markneeded __db_salvage_markneeded@DB_VERSION_UNIQUE_NAME@
+#define __db_vrfy_prdbt __db_vrfy_prdbt@DB_VERSION_UNIQUE_NAME@
+#define __partition_init __partition_init@DB_VERSION_UNIQUE_NAME@
+#define __partition_set __partition_set@DB_VERSION_UNIQUE_NAME@
+#define __partition_set_dirs __partition_set_dirs@DB_VERSION_UNIQUE_NAME@
+#define __partition_open __partition_open@DB_VERSION_UNIQUE_NAME@
+#define __partition_get_callback __partition_get_callback@DB_VERSION_UNIQUE_NAME@
+#define __partition_get_keys __partition_get_keys@DB_VERSION_UNIQUE_NAME@
+#define __partition_get_dirs __partition_get_dirs@DB_VERSION_UNIQUE_NAME@
+#define __partc_init __partc_init@DB_VERSION_UNIQUE_NAME@
+#define __partc_get __partc_get@DB_VERSION_UNIQUE_NAME@
+#define __partition_close __partition_close@DB_VERSION_UNIQUE_NAME@
+#define __partition_sync __partition_sync@DB_VERSION_UNIQUE_NAME@
+#define __partition_stat __partition_stat@DB_VERSION_UNIQUE_NAME@
+#define __part_truncate __part_truncate@DB_VERSION_UNIQUE_NAME@
+#define __part_compact __part_compact@DB_VERSION_UNIQUE_NAME@
+#define __part_lsn_reset __part_lsn_reset@DB_VERSION_UNIQUE_NAME@
+#define __part_fileid_reset __part_fileid_reset@DB_VERSION_UNIQUE_NAME@
+#define __part_key_range __part_key_range@DB_VERSION_UNIQUE_NAME@
+#define __part_remove __part_remove@DB_VERSION_UNIQUE_NAME@
+#define __part_rename __part_rename@DB_VERSION_UNIQUE_NAME@
+#define __part_verify __part_verify@DB_VERSION_UNIQUE_NAME@
+#define __part_testdocopy __part_testdocopy@DB_VERSION_UNIQUE_NAME@
+#define __db_no_partition __db_no_partition@DB_VERSION_UNIQUE_NAME@
+#define __bam_compact_int __bam_compact_int@DB_VERSION_UNIQUE_NAME@
+#define __bam_compact_opd __bam_compact_opd@DB_VERSION_UNIQUE_NAME@
+#define __bam_truncate_ipages __bam_truncate_ipages@DB_VERSION_UNIQUE_NAME@
+#define __bam_cmp __bam_cmp@DB_VERSION_UNIQUE_NAME@
+#define __bam_defcmp __bam_defcmp@DB_VERSION_UNIQUE_NAME@
+#define __bam_defpfx __bam_defpfx@DB_VERSION_UNIQUE_NAME@
+#define __bam_compress_dupcmp __bam_compress_dupcmp@DB_VERSION_UNIQUE_NAME@
+#define __bam_defcompress __bam_defcompress@DB_VERSION_UNIQUE_NAME@
+#define __bam_defdecompress __bam_defdecompress@DB_VERSION_UNIQUE_NAME@
+#define __bamc_compress_get __bamc_compress_get@DB_VERSION_UNIQUE_NAME@
+#define __bamc_compress_put __bamc_compress_put@DB_VERSION_UNIQUE_NAME@
+#define __bamc_compress_del __bamc_compress_del@DB_VERSION_UNIQUE_NAME@
+#define __bamc_compress_bulk_del __bamc_compress_bulk_del@DB_VERSION_UNIQUE_NAME@
+#define __bamc_compress_count __bamc_compress_count@DB_VERSION_UNIQUE_NAME@
+#define __bamc_compress_cmp __bamc_compress_cmp@DB_VERSION_UNIQUE_NAME@
+#define __bamc_compress_dup __bamc_compress_dup@DB_VERSION_UNIQUE_NAME@
+#define __bam_compress_salvage __bam_compress_salvage@DB_VERSION_UNIQUE_NAME@
+#define __bam_compress_count __bam_compress_count@DB_VERSION_UNIQUE_NAME@
+#define __bam_pgin __bam_pgin@DB_VERSION_UNIQUE_NAME@
+#define __bam_pgout __bam_pgout@DB_VERSION_UNIQUE_NAME@
+#define __bam_mswap __bam_mswap@DB_VERSION_UNIQUE_NAME@
+#define __bam_ca_delete __bam_ca_delete@DB_VERSION_UNIQUE_NAME@
+#define __ram_ca_delete __ram_ca_delete@DB_VERSION_UNIQUE_NAME@
+#define __bam_ca_di __bam_ca_di@DB_VERSION_UNIQUE_NAME@
+#define __bam_ca_dup __bam_ca_dup@DB_VERSION_UNIQUE_NAME@
+#define __bam_ca_undodup __bam_ca_undodup@DB_VERSION_UNIQUE_NAME@
+#define __bam_ca_rsplit __bam_ca_rsplit@DB_VERSION_UNIQUE_NAME@
+#define __bam_ca_split __bam_ca_split@DB_VERSION_UNIQUE_NAME@
+#define __bam_ca_undosplit __bam_ca_undosplit@DB_VERSION_UNIQUE_NAME@
+#define __bamc_init __bamc_init@DB_VERSION_UNIQUE_NAME@
+#define __bamc_refresh __bamc_refresh@DB_VERSION_UNIQUE_NAME@
+#define __bamc_cmp __bamc_cmp@DB_VERSION_UNIQUE_NAME@
+#define __bamc_count __bamc_count@DB_VERSION_UNIQUE_NAME@
+#define __bamc_dup __bamc_dup@DB_VERSION_UNIQUE_NAME@
+#define __bam_bulk_overflow __bam_bulk_overflow@DB_VERSION_UNIQUE_NAME@
+#define __bam_bulk_duplicates __bam_bulk_duplicates@DB_VERSION_UNIQUE_NAME@
+#define __bamc_rget __bamc_rget@DB_VERSION_UNIQUE_NAME@
+#define __bam_opd_exists __bam_opd_exists@DB_VERSION_UNIQUE_NAME@
+#define __bam_ditem __bam_ditem@DB_VERSION_UNIQUE_NAME@
+#define __bam_adjindx __bam_adjindx@DB_VERSION_UNIQUE_NAME@
+#define __bam_dpages __bam_dpages@DB_VERSION_UNIQUE_NAME@
+#define __bam_pupdate __bam_pupdate@DB_VERSION_UNIQUE_NAME@
+#define __bam_db_create __bam_db_create@DB_VERSION_UNIQUE_NAME@
+#define __bam_db_close __bam_db_close@DB_VERSION_UNIQUE_NAME@
+#define __bam_map_flags __bam_map_flags@DB_VERSION_UNIQUE_NAME@
+#define __bam_set_flags __bam_set_flags@DB_VERSION_UNIQUE_NAME@
+#define __bam_set_bt_compare __bam_set_bt_compare@DB_VERSION_UNIQUE_NAME@
+#define __bam_set_bt_compress __bam_set_bt_compress@DB_VERSION_UNIQUE_NAME@
+#define __bam_get_bt_minkey __bam_get_bt_minkey@DB_VERSION_UNIQUE_NAME@
+#define __bam_copy_config __bam_copy_config@DB_VERSION_UNIQUE_NAME@
+#define __ram_map_flags __ram_map_flags@DB_VERSION_UNIQUE_NAME@
+#define __ram_set_flags __ram_set_flags@DB_VERSION_UNIQUE_NAME@
+#define __ram_get_re_len __ram_get_re_len@DB_VERSION_UNIQUE_NAME@
+#define __ram_get_re_pad __ram_get_re_pad@DB_VERSION_UNIQUE_NAME@
+#define __bam_open __bam_open@DB_VERSION_UNIQUE_NAME@
+#define __bam_metachk __bam_metachk@DB_VERSION_UNIQUE_NAME@
+#define __bam_read_root __bam_read_root@DB_VERSION_UNIQUE_NAME@
+#define __bam_new_file __bam_new_file@DB_VERSION_UNIQUE_NAME@
+#define __bam_new_subdb __bam_new_subdb@DB_VERSION_UNIQUE_NAME@
+#define __bam_iitem __bam_iitem@DB_VERSION_UNIQUE_NAME@
+#define __bam_ritem __bam_ritem@DB_VERSION_UNIQUE_NAME@
+#define __bam_ritem_nolog __bam_ritem_nolog@DB_VERSION_UNIQUE_NAME@
+#define __bam_irep __bam_irep@DB_VERSION_UNIQUE_NAME@
+#define __bam_split_recover __bam_split_recover@DB_VERSION_UNIQUE_NAME@
+#define __bam_split_48_recover __bam_split_48_recover@DB_VERSION_UNIQUE_NAME@
+#define __bam_split_42_recover __bam_split_42_recover@DB_VERSION_UNIQUE_NAME@
+#define __bam_rsplit_recover __bam_rsplit_recover@DB_VERSION_UNIQUE_NAME@
+#define __bam_adj_recover __bam_adj_recover@DB_VERSION_UNIQUE_NAME@
+#define __bam_cadjust_recover __bam_cadjust_recover@DB_VERSION_UNIQUE_NAME@
+#define __bam_cdel_recover __bam_cdel_recover@DB_VERSION_UNIQUE_NAME@
+#define __bam_repl_recover __bam_repl_recover@DB_VERSION_UNIQUE_NAME@
+#define __bam_irep_recover __bam_irep_recover@DB_VERSION_UNIQUE_NAME@
+#define __bam_root_recover __bam_root_recover@DB_VERSION_UNIQUE_NAME@
+#define __bam_curadj_recover __bam_curadj_recover@DB_VERSION_UNIQUE_NAME@
+#define __bam_rcuradj_recover __bam_rcuradj_recover@DB_VERSION_UNIQUE_NAME@
+#define __bam_merge_44_recover __bam_merge_44_recover@DB_VERSION_UNIQUE_NAME@
+#define __bam_relink_43_recover __bam_relink_43_recover@DB_VERSION_UNIQUE_NAME@
+#define __bam_reclaim __bam_reclaim@DB_VERSION_UNIQUE_NAME@
+#define __bam_truncate __bam_truncate@DB_VERSION_UNIQUE_NAME@
+#define __ram_open __ram_open@DB_VERSION_UNIQUE_NAME@
+#define __ram_append __ram_append@DB_VERSION_UNIQUE_NAME@
+#define __ramc_del __ramc_del@DB_VERSION_UNIQUE_NAME@
+#define __ramc_get __ramc_get@DB_VERSION_UNIQUE_NAME@
+#define __ramc_put __ramc_put@DB_VERSION_UNIQUE_NAME@
+#define __ram_ca __ram_ca@DB_VERSION_UNIQUE_NAME@
+#define __ram_getno __ram_getno@DB_VERSION_UNIQUE_NAME@
+#define __ram_writeback __ram_writeback@DB_VERSION_UNIQUE_NAME@
+#define __bam_rsearch __bam_rsearch@DB_VERSION_UNIQUE_NAME@
+#define __bam_adjust __bam_adjust@DB_VERSION_UNIQUE_NAME@
+#define __bam_nrecs __bam_nrecs@DB_VERSION_UNIQUE_NAME@
+#define __bam_total __bam_total@DB_VERSION_UNIQUE_NAME@
+#define __bam_get_root __bam_get_root@DB_VERSION_UNIQUE_NAME@
+#define __bam_search __bam_search@DB_VERSION_UNIQUE_NAME@
+#define __bam_stkrel __bam_stkrel@DB_VERSION_UNIQUE_NAME@
+#define __bam_stkgrow __bam_stkgrow@DB_VERSION_UNIQUE_NAME@
+#define __bam_split __bam_split@DB_VERSION_UNIQUE_NAME@
+#define __bam_broot __bam_broot@DB_VERSION_UNIQUE_NAME@
+#define __ram_root __ram_root@DB_VERSION_UNIQUE_NAME@
+#define __bam_pinsert __bam_pinsert@DB_VERSION_UNIQUE_NAME@
+#define __bam_copy __bam_copy@DB_VERSION_UNIQUE_NAME@
+#define __bam_stat __bam_stat@DB_VERSION_UNIQUE_NAME@
+#define __bam_stat_print __bam_stat_print@DB_VERSION_UNIQUE_NAME@
+#define __bam_stat_callback __bam_stat_callback@DB_VERSION_UNIQUE_NAME@
+#define __bam_print_cursor __bam_print_cursor@DB_VERSION_UNIQUE_NAME@
+#define __bam_key_range __bam_key_range@DB_VERSION_UNIQUE_NAME@
+#define __bam_traverse __bam_traverse@DB_VERSION_UNIQUE_NAME@
+#define __bam_30_btreemeta __bam_30_btreemeta@DB_VERSION_UNIQUE_NAME@
+#define __bam_31_btreemeta __bam_31_btreemeta@DB_VERSION_UNIQUE_NAME@
+#define __bam_31_lbtree __bam_31_lbtree@DB_VERSION_UNIQUE_NAME@
+#define __bam_vrfy_meta __bam_vrfy_meta@DB_VERSION_UNIQUE_NAME@
+#define __ram_vrfy_leaf __ram_vrfy_leaf@DB_VERSION_UNIQUE_NAME@
+#define __bam_vrfy __bam_vrfy@DB_VERSION_UNIQUE_NAME@
+#define __bam_vrfy_itemorder __bam_vrfy_itemorder@DB_VERSION_UNIQUE_NAME@
+#define __bam_vrfy_structure __bam_vrfy_structure@DB_VERSION_UNIQUE_NAME@
+#define __bam_vrfy_subtree __bam_vrfy_subtree@DB_VERSION_UNIQUE_NAME@
+#define __bam_salvage __bam_salvage@DB_VERSION_UNIQUE_NAME@
+#define __bam_salvage_walkdupint __bam_salvage_walkdupint@DB_VERSION_UNIQUE_NAME@
+#define __bam_meta2pgset __bam_meta2pgset@DB_VERSION_UNIQUE_NAME@
+#define __bam_split_desc __bam_split_desc@DB_VERSION_UNIQUE_NAME@
+#define __bam_split_48_desc __bam_split_48_desc@DB_VERSION_UNIQUE_NAME@
+#define __bam_split_42_desc __bam_split_42_desc@DB_VERSION_UNIQUE_NAME@
+#define __bam_rsplit_desc __bam_rsplit_desc@DB_VERSION_UNIQUE_NAME@
+#define __bam_adj_desc __bam_adj_desc@DB_VERSION_UNIQUE_NAME@
+#define __bam_cadjust_desc __bam_cadjust_desc@DB_VERSION_UNIQUE_NAME@
+#define __bam_cdel_desc __bam_cdel_desc@DB_VERSION_UNIQUE_NAME@
+#define __bam_repl_desc __bam_repl_desc@DB_VERSION_UNIQUE_NAME@
+#define __bam_irep_desc __bam_irep_desc@DB_VERSION_UNIQUE_NAME@
+#define __bam_root_desc __bam_root_desc@DB_VERSION_UNIQUE_NAME@
+#define __bam_curadj_desc __bam_curadj_desc@DB_VERSION_UNIQUE_NAME@
+#define __bam_rcuradj_desc __bam_rcuradj_desc@DB_VERSION_UNIQUE_NAME@
+#define __bam_relink_43_desc __bam_relink_43_desc@DB_VERSION_UNIQUE_NAME@
+#define __bam_merge_44_desc __bam_merge_44_desc@DB_VERSION_UNIQUE_NAME@
+#define __bam_init_recover __bam_init_recover@DB_VERSION_UNIQUE_NAME@
+#define __bam_split_print __bam_split_print@DB_VERSION_UNIQUE_NAME@
+#define __bam_split_48_print __bam_split_48_print@DB_VERSION_UNIQUE_NAME@
+#define __bam_split_42_print __bam_split_42_print@DB_VERSION_UNIQUE_NAME@
+#define __bam_rsplit_print __bam_rsplit_print@DB_VERSION_UNIQUE_NAME@
+#define __bam_adj_print __bam_adj_print@DB_VERSION_UNIQUE_NAME@
+#define __bam_cadjust_print __bam_cadjust_print@DB_VERSION_UNIQUE_NAME@
+#define __bam_cdel_print __bam_cdel_print@DB_VERSION_UNIQUE_NAME@
+#define __bam_repl_print __bam_repl_print@DB_VERSION_UNIQUE_NAME@
+#define __bam_irep_print __bam_irep_print@DB_VERSION_UNIQUE_NAME@
+#define __bam_root_print __bam_root_print@DB_VERSION_UNIQUE_NAME@
+#define __bam_curadj_print __bam_curadj_print@DB_VERSION_UNIQUE_NAME@
+#define __bam_rcuradj_print __bam_rcuradj_print@DB_VERSION_UNIQUE_NAME@
+#define __bam_relink_43_print __bam_relink_43_print@DB_VERSION_UNIQUE_NAME@
+#define __bam_merge_44_print __bam_merge_44_print@DB_VERSION_UNIQUE_NAME@
+#define __bam_init_print __bam_init_print@DB_VERSION_UNIQUE_NAME@
+#ifndef HAVE_ATOI
+#define atoi atoi@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_ATOL
+#define atol atol@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_BSEARCH
+#define bsearch bsearch@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_GETCWD
+#define getcwd getcwd@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_GETOPT
+#define getopt getopt@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_ISALPHA
+#define isalpha isalpha@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_ISDIGIT
+#define isdigit isdigit@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_ISPRINT
+#define isprint isprint@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_ISSPACE
+#define isspace isspace@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_MEMCMP
+#define memcmp memcmp@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_MEMCPY
+#define memcpy memcpy@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_MEMMOVE
+#define memmove memmove@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_PRINTF
+#define printf printf@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_PRINTF
+#define fprintf fprintf@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_PRINTF
+#define vfprintf vfprintf@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_QSORT
+#define qsort qsort@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_RAISE
+#define raise raise@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_RAND
+#define rand rand@DB_VERSION_UNIQUE_NAME@
+#define srand srand@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_SNPRINTF
+#define snprintf snprintf@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_VSNPRINTF
+#define vsnprintf vsnprintf@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_STRCASECMP
+#define strcasecmp strcasecmp@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_STRCASECMP
+#define strncasecmp strncasecmp@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_STRCAT
+#define strcat strcat@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_STRCHR
+#define strchr strchr@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_STRDUP
+#define strdup strdup@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_STRERROR
+#define strerror strerror@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_STRNCAT
+#define strncat strncat@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_STRNCMP
+#define strncmp strncmp@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_STRRCHR
+#define strrchr strrchr@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_STRSEP
+#define strsep strsep@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_STRTOL
+#define strtol strtol@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_STRTOUL
+#define strtoul strtoul@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_TIME
+#define time time@DB_VERSION_UNIQUE_NAME@
+#endif
+#define __clock_set_expires __clock_set_expires@DB_VERSION_UNIQUE_NAME@
+#define __clock_expired __clock_expired@DB_VERSION_UNIQUE_NAME@
+#define __crypto_region_init __crypto_region_init@DB_VERSION_UNIQUE_NAME@
+#define __db_isbigendian __db_isbigendian@DB_VERSION_UNIQUE_NAME@
+#define __db_byteorder __db_byteorder@DB_VERSION_UNIQUE_NAME@
+#define __db_compress_count_int __db_compress_count_int@DB_VERSION_UNIQUE_NAME@
+#define __db_compress_int __db_compress_int@DB_VERSION_UNIQUE_NAME@
+#define __db_decompress_count_int __db_decompress_count_int@DB_VERSION_UNIQUE_NAME@
+#define __db_decompress_int __db_decompress_int@DB_VERSION_UNIQUE_NAME@
+#define __db_decompress_int32 __db_decompress_int32@DB_VERSION_UNIQUE_NAME@
+#define __db_fchk __db_fchk@DB_VERSION_UNIQUE_NAME@
+#define __db_fcchk __db_fcchk@DB_VERSION_UNIQUE_NAME@
+#define __db_ferr __db_ferr@DB_VERSION_UNIQUE_NAME@
+#define __db_fnl __db_fnl@DB_VERSION_UNIQUE_NAME@
+#define __db_pgerr __db_pgerr@DB_VERSION_UNIQUE_NAME@
+#define __db_pgfmt __db_pgfmt@DB_VERSION_UNIQUE_NAME@
+#ifdef DIAGNOSTIC
+#define __db_assert __db_assert@DB_VERSION_UNIQUE_NAME@
+#endif
+#define __env_panic_msg __env_panic_msg@DB_VERSION_UNIQUE_NAME@
+#define __env_panic __env_panic@DB_VERSION_UNIQUE_NAME@
+#define __db_unknown_error __db_unknown_error@DB_VERSION_UNIQUE_NAME@
+#define __db_syserr __db_syserr@DB_VERSION_UNIQUE_NAME@
+#define __db_err __db_err@DB_VERSION_UNIQUE_NAME@
+#define __db_errx __db_errx@DB_VERSION_UNIQUE_NAME@
+#define __db_errcall __db_errcall@DB_VERSION_UNIQUE_NAME@
+#define __db_errfile __db_errfile@DB_VERSION_UNIQUE_NAME@
+#define __db_msgadd __db_msgadd@DB_VERSION_UNIQUE_NAME@
+#define __db_msgadd_ap __db_msgadd_ap@DB_VERSION_UNIQUE_NAME@
+#define __db_msg __db_msg@DB_VERSION_UNIQUE_NAME@
+#define __db_repmsg __db_repmsg@DB_VERSION_UNIQUE_NAME@
+#define __db_unknown_flag __db_unknown_flag@DB_VERSION_UNIQUE_NAME@
+#define __db_unknown_type __db_unknown_type@DB_VERSION_UNIQUE_NAME@
+#define __db_unknown_path __db_unknown_path@DB_VERSION_UNIQUE_NAME@
+#define __db_check_txn __db_check_txn@DB_VERSION_UNIQUE_NAME@
+#define __db_txn_deadlock_err __db_txn_deadlock_err@DB_VERSION_UNIQUE_NAME@
+#define __db_not_txn_env __db_not_txn_env@DB_VERSION_UNIQUE_NAME@
+#define __db_rec_toobig __db_rec_toobig@DB_VERSION_UNIQUE_NAME@
+#define __db_rec_repl __db_rec_repl@DB_VERSION_UNIQUE_NAME@
+#define __dbc_logging __dbc_logging@DB_VERSION_UNIQUE_NAME@
+#define __db_check_lsn __db_check_lsn@DB_VERSION_UNIQUE_NAME@
+#define __db_rdonly __db_rdonly@DB_VERSION_UNIQUE_NAME@
+#define __db_space_err __db_space_err@DB_VERSION_UNIQUE_NAME@
+#define __db_failed __db_failed@DB_VERSION_UNIQUE_NAME@
+#define __db_getlong __db_getlong@DB_VERSION_UNIQUE_NAME@
+#define __db_getulong __db_getulong@DB_VERSION_UNIQUE_NAME@
+#define __db_idspace __db_idspace@DB_VERSION_UNIQUE_NAME@
+#define __db_log2 __db_log2@DB_VERSION_UNIQUE_NAME@
+#define __db_tablesize __db_tablesize@DB_VERSION_UNIQUE_NAME@
+#define __db_hashinit __db_hashinit@DB_VERSION_UNIQUE_NAME@
+#define __dbt_usercopy __dbt_usercopy@DB_VERSION_UNIQUE_NAME@
+#define __dbt_userfree __dbt_userfree@DB_VERSION_UNIQUE_NAME@
+#define __db_mkpath __db_mkpath@DB_VERSION_UNIQUE_NAME@
+#define __db_openflags __db_openflags@DB_VERSION_UNIQUE_NAME@
+#define __db_util_arg __db_util_arg@DB_VERSION_UNIQUE_NAME@
+#define __db_util_cache __db_util_cache@DB_VERSION_UNIQUE_NAME@
+#define __db_util_logset __db_util_logset@DB_VERSION_UNIQUE_NAME@
+#define __db_util_siginit __db_util_siginit@DB_VERSION_UNIQUE_NAME@
+#define __db_util_interrupted __db_util_interrupted@DB_VERSION_UNIQUE_NAME@
+#define __db_util_sigresend __db_util_sigresend@DB_VERSION_UNIQUE_NAME@
+#define __db_zero_fill __db_zero_fill@DB_VERSION_UNIQUE_NAME@
+#define __db_zero_extend __db_zero_extend@DB_VERSION_UNIQUE_NAME@
+#define __aes_setup __aes_setup@DB_VERSION_UNIQUE_NAME@
+#define __aes_adj_size __aes_adj_size@DB_VERSION_UNIQUE_NAME@
+#define __aes_close __aes_close@DB_VERSION_UNIQUE_NAME@
+#define __aes_decrypt __aes_decrypt@DB_VERSION_UNIQUE_NAME@
+#define __aes_encrypt __aes_encrypt@DB_VERSION_UNIQUE_NAME@
+#define __aes_init __aes_init@DB_VERSION_UNIQUE_NAME@
+#define __crypto_env_close __crypto_env_close@DB_VERSION_UNIQUE_NAME@
+#define __crypto_env_refresh __crypto_env_refresh@DB_VERSION_UNIQUE_NAME@
+#define __crypto_algsetup __crypto_algsetup@DB_VERSION_UNIQUE_NAME@
+#define __crypto_decrypt_meta __crypto_decrypt_meta@DB_VERSION_UNIQUE_NAME@
+#define __crypto_set_passwd __crypto_set_passwd@DB_VERSION_UNIQUE_NAME@
+#define __db_generate_iv __db_generate_iv@DB_VERSION_UNIQUE_NAME@
+#define __db_rijndaelKeySetupEnc __db_rijndaelKeySetupEnc@DB_VERSION_UNIQUE_NAME@
+#define __db_rijndaelKeySetupDec __db_rijndaelKeySetupDec@DB_VERSION_UNIQUE_NAME@
+#define __db_rijndaelEncrypt __db_rijndaelEncrypt@DB_VERSION_UNIQUE_NAME@
+#define __db_rijndaelDecrypt __db_rijndaelDecrypt@DB_VERSION_UNIQUE_NAME@
+#define __db_rijndaelEncryptRound __db_rijndaelEncryptRound@DB_VERSION_UNIQUE_NAME@
+#define __db_rijndaelDecryptRound __db_rijndaelDecryptRound@DB_VERSION_UNIQUE_NAME@
+#define __db_makeKey __db_makeKey@DB_VERSION_UNIQUE_NAME@
+#define __db_cipherInit __db_cipherInit@DB_VERSION_UNIQUE_NAME@
+#define __db_blockEncrypt __db_blockEncrypt@DB_VERSION_UNIQUE_NAME@
+#define __db_padEncrypt __db_padEncrypt@DB_VERSION_UNIQUE_NAME@
+#define __db_blockDecrypt __db_blockDecrypt@DB_VERSION_UNIQUE_NAME@
+#define __db_padDecrypt __db_padDecrypt@DB_VERSION_UNIQUE_NAME@
+#define __db_cipherUpdateRounds __db_cipherUpdateRounds@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_setup __dbreg_setup@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_teardown __dbreg_teardown@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_teardown_int __dbreg_teardown_int@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_new_id __dbreg_new_id@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_get_id __dbreg_get_id@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_assign_id __dbreg_assign_id@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_revoke_id __dbreg_revoke_id@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_revoke_id_int __dbreg_revoke_id_int@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_close_id __dbreg_close_id@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_close_id_int __dbreg_close_id_int@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_failchk __dbreg_failchk@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_log_close __dbreg_log_close@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_log_id __dbreg_log_id@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_register_desc __dbreg_register_desc@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_init_recover __dbreg_init_recover@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_register_print __dbreg_register_print@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_init_print __dbreg_init_print@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_register_recover __dbreg_register_recover@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_stat_print __dbreg_stat_print@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_print_fname __dbreg_print_fname@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_add_dbentry __dbreg_add_dbentry@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_rem_dbentry __dbreg_rem_dbentry@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_log_files __dbreg_log_files@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_log_nofiles __dbreg_log_nofiles@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_close_files __dbreg_close_files@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_close_file __dbreg_close_file@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_mark_restored __dbreg_mark_restored@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_invalidate_files __dbreg_invalidate_files@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_id_to_db __dbreg_id_to_db@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_id_to_fname __dbreg_id_to_fname@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_fid_to_fname __dbreg_fid_to_fname@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_get_name __dbreg_get_name@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_do_open __dbreg_do_open@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_lazy_id __dbreg_lazy_id@DB_VERSION_UNIQUE_NAME@
+#define __env_alloc_init __env_alloc_init@DB_VERSION_UNIQUE_NAME@
+#define __env_alloc_overhead __env_alloc_overhead@DB_VERSION_UNIQUE_NAME@
+#define __env_alloc_size __env_alloc_size@DB_VERSION_UNIQUE_NAME@
+#define __env_alloc __env_alloc@DB_VERSION_UNIQUE_NAME@
+#define __env_alloc_free __env_alloc_free@DB_VERSION_UNIQUE_NAME@
+#define __env_alloc_extend __env_alloc_extend@DB_VERSION_UNIQUE_NAME@
+#define __env_region_extend __env_region_extend@DB_VERSION_UNIQUE_NAME@
+#define __env_elem_size __env_elem_size@DB_VERSION_UNIQUE_NAME@
+#define __env_get_chunk __env_get_chunk@DB_VERSION_UNIQUE_NAME@
+#define __env_alloc_print __env_alloc_print@DB_VERSION_UNIQUE_NAME@
+#define __env_get_backup_config __env_get_backup_config@DB_VERSION_UNIQUE_NAME@
+#define __env_set_backup_config __env_set_backup_config@DB_VERSION_UNIQUE_NAME@
+#define __env_get_backup_callbacks __env_get_backup_callbacks@DB_VERSION_UNIQUE_NAME@
+#define __env_set_backup_callbacks __env_set_backup_callbacks@DB_VERSION_UNIQUE_NAME@
+#define __env_read_db_config __env_read_db_config@DB_VERSION_UNIQUE_NAME@
+#define __env_failchk_pp __env_failchk_pp@DB_VERSION_UNIQUE_NAME@
+#define __env_failchk_int __env_failchk_int@DB_VERSION_UNIQUE_NAME@
+#define __env_thread_size __env_thread_size@DB_VERSION_UNIQUE_NAME@
+#define __env_thread_max __env_thread_max@DB_VERSION_UNIQUE_NAME@
+#define __env_thread_init __env_thread_init@DB_VERSION_UNIQUE_NAME@
+#define __env_thread_destroy __env_thread_destroy@DB_VERSION_UNIQUE_NAME@
+#define __env_set_state __env_set_state@DB_VERSION_UNIQUE_NAME@
+#define __env_thread_id_string __env_thread_id_string@DB_VERSION_UNIQUE_NAME@
+#define __db_file_extend __db_file_extend@DB_VERSION_UNIQUE_NAME@
+#define __db_file_multi_write __db_file_multi_write@DB_VERSION_UNIQUE_NAME@
+#define __db_file_write __db_file_write@DB_VERSION_UNIQUE_NAME@
+#define __db_env_destroy __db_env_destroy@DB_VERSION_UNIQUE_NAME@
+#define __env_get_alloc __env_get_alloc@DB_VERSION_UNIQUE_NAME@
+#define __env_set_alloc __env_set_alloc@DB_VERSION_UNIQUE_NAME@
+#define __env_get_memory_init __env_get_memory_init@DB_VERSION_UNIQUE_NAME@
+#define __env_set_memory_init __env_set_memory_init@DB_VERSION_UNIQUE_NAME@
+#define __env_get_memory_max __env_get_memory_max@DB_VERSION_UNIQUE_NAME@
+#define __env_set_memory_max __env_set_memory_max@DB_VERSION_UNIQUE_NAME@
+#define __env_get_encrypt_flags __env_get_encrypt_flags@DB_VERSION_UNIQUE_NAME@
+#define __env_set_encrypt __env_set_encrypt@DB_VERSION_UNIQUE_NAME@
+#define __env_map_flags __env_map_flags@DB_VERSION_UNIQUE_NAME@
+#define __env_fetch_flags __env_fetch_flags@DB_VERSION_UNIQUE_NAME@
+#define __env_set_flags __env_set_flags@DB_VERSION_UNIQUE_NAME@
+#define __env_set_backup __env_set_backup@DB_VERSION_UNIQUE_NAME@
+#define __env_set_data_dir __env_set_data_dir@DB_VERSION_UNIQUE_NAME@
+#define __env_add_data_dir __env_add_data_dir@DB_VERSION_UNIQUE_NAME@
+#define __env_set_create_dir __env_set_create_dir@DB_VERSION_UNIQUE_NAME@
+#define __env_set_metadata_dir __env_set_metadata_dir@DB_VERSION_UNIQUE_NAME@
+#define __env_set_data_len __env_set_data_len@DB_VERSION_UNIQUE_NAME@
+#define __env_set_intermediate_dir_mode __env_set_intermediate_dir_mode@DB_VERSION_UNIQUE_NAME@
+#define __env_get_errcall __env_get_errcall@DB_VERSION_UNIQUE_NAME@
+#define __env_set_errcall __env_set_errcall@DB_VERSION_UNIQUE_NAME@
+#define __env_get_errfile __env_get_errfile@DB_VERSION_UNIQUE_NAME@
+#define __env_set_errfile __env_set_errfile@DB_VERSION_UNIQUE_NAME@
+#define __env_get_errpfx __env_get_errpfx@DB_VERSION_UNIQUE_NAME@
+#define __env_set_errpfx __env_set_errpfx@DB_VERSION_UNIQUE_NAME@
+#define __env_set_thread_count __env_set_thread_count@DB_VERSION_UNIQUE_NAME@
+#define __env_get_msgcall __env_get_msgcall@DB_VERSION_UNIQUE_NAME@
+#define __env_set_msgcall __env_set_msgcall@DB_VERSION_UNIQUE_NAME@
+#define __env_get_msgfile __env_get_msgfile@DB_VERSION_UNIQUE_NAME@
+#define __env_set_msgfile __env_set_msgfile@DB_VERSION_UNIQUE_NAME@
+#define __env_set_paniccall __env_set_paniccall@DB_VERSION_UNIQUE_NAME@
+#define __env_set_shm_key __env_set_shm_key@DB_VERSION_UNIQUE_NAME@
+#define __env_set_tmp_dir __env_set_tmp_dir@DB_VERSION_UNIQUE_NAME@
+#define __env_set_verbose __env_set_verbose@DB_VERSION_UNIQUE_NAME@
+#define __db_mi_env __db_mi_env@DB_VERSION_UNIQUE_NAME@
+#define __db_mi_open __db_mi_open@DB_VERSION_UNIQUE_NAME@
+#define __env_not_config __env_not_config@DB_VERSION_UNIQUE_NAME@
+#define __env_set_timeout __env_set_timeout@DB_VERSION_UNIQUE_NAME@
+#define __db_appname __db_appname@DB_VERSION_UNIQUE_NAME@
+#define __db_tmp_open __db_tmp_open@DB_VERSION_UNIQUE_NAME@
+#define __env_open_pp __env_open_pp@DB_VERSION_UNIQUE_NAME@
+#define __env_open __env_open@DB_VERSION_UNIQUE_NAME@
+#define __env_remove __env_remove@DB_VERSION_UNIQUE_NAME@
+#define __env_config __env_config@DB_VERSION_UNIQUE_NAME@
+#define __env_close_pp __env_close_pp@DB_VERSION_UNIQUE_NAME@
+#define __env_close __env_close@DB_VERSION_UNIQUE_NAME@
+#define __env_refresh __env_refresh@DB_VERSION_UNIQUE_NAME@
+#define __env_get_open_flags __env_get_open_flags@DB_VERSION_UNIQUE_NAME@
+#define __env_attach_regions __env_attach_regions@DB_VERSION_UNIQUE_NAME@
+#define __db_apprec __db_apprec@DB_VERSION_UNIQUE_NAME@
+#define __env_openfiles __env_openfiles@DB_VERSION_UNIQUE_NAME@
+#define __env_init_rec __env_init_rec@DB_VERSION_UNIQUE_NAME@
+#define __env_attach __env_attach@DB_VERSION_UNIQUE_NAME@
+#define __env_turn_on __env_turn_on@DB_VERSION_UNIQUE_NAME@
+#define __env_turn_off __env_turn_off@DB_VERSION_UNIQUE_NAME@
+#define __env_panic_set __env_panic_set@DB_VERSION_UNIQUE_NAME@
+#define __env_ref_increment __env_ref_increment@DB_VERSION_UNIQUE_NAME@
+#define __env_ref_decrement __env_ref_decrement@DB_VERSION_UNIQUE_NAME@
+#define __env_ref_get __env_ref_get@DB_VERSION_UNIQUE_NAME@
+#define __env_detach __env_detach@DB_VERSION_UNIQUE_NAME@
+#define __env_remove_env __env_remove_env@DB_VERSION_UNIQUE_NAME@
+#define __env_region_attach __env_region_attach@DB_VERSION_UNIQUE_NAME@
+#define __env_region_share __env_region_share@DB_VERSION_UNIQUE_NAME@
+#define __env_region_detach __env_region_detach@DB_VERSION_UNIQUE_NAME@
+#define __envreg_register __envreg_register@DB_VERSION_UNIQUE_NAME@
+#define __envreg_unregister __envreg_unregister@DB_VERSION_UNIQUE_NAME@
+#define __envreg_xunlock __envreg_xunlock@DB_VERSION_UNIQUE_NAME@
+#define __envreg_isalive __envreg_isalive@DB_VERSION_UNIQUE_NAME@
+#define __env_struct_sig __env_struct_sig@DB_VERSION_UNIQUE_NAME@
+#define __env_stat_print_pp __env_stat_print_pp@DB_VERSION_UNIQUE_NAME@
+#define __db_print_fh __db_print_fh@DB_VERSION_UNIQUE_NAME@
+#define __db_print_fileid __db_print_fileid@DB_VERSION_UNIQUE_NAME@
+#define __db_dl __db_dl@DB_VERSION_UNIQUE_NAME@
+#define __db_dl_pct __db_dl_pct@DB_VERSION_UNIQUE_NAME@
+#define __db_dlbytes __db_dlbytes@DB_VERSION_UNIQUE_NAME@
+#define __db_print_reginfo __db_print_reginfo@DB_VERSION_UNIQUE_NAME@
+#define __db_stat_not_built __db_stat_not_built@DB_VERSION_UNIQUE_NAME@
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_close __repmgr_close@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_get_ack_policy __repmgr_get_ack_policy@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_set_ack_policy __repmgr_set_ack_policy@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_site __repmgr_site@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_site_by_eid __repmgr_site_by_eid@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_local_site __repmgr_local_site@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_site_list __repmgr_site_list@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_start __repmgr_start@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_stat_pp __repmgr_stat_pp@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_stat_print_pp __repmgr_stat_print_pp@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_handle_event __repmgr_handle_event@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_channel __repmgr_channel@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_set_msg_dispatch __repmgr_set_msg_dispatch@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_init_recover __repmgr_init_recover@DB_VERSION_UNIQUE_NAME@
+#endif
+#define __fop_create_42_desc __fop_create_42_desc@DB_VERSION_UNIQUE_NAME@
+#define __fop_create_desc __fop_create_desc@DB_VERSION_UNIQUE_NAME@
+#define __fop_remove_desc __fop_remove_desc@DB_VERSION_UNIQUE_NAME@
+#define __fop_write_42_desc __fop_write_42_desc@DB_VERSION_UNIQUE_NAME@
+#define __fop_write_desc __fop_write_desc@DB_VERSION_UNIQUE_NAME@
+#define __fop_rename_42_desc __fop_rename_42_desc@DB_VERSION_UNIQUE_NAME@
+#define __fop_rename_noundo_46_desc __fop_rename_noundo_46_desc@DB_VERSION_UNIQUE_NAME@
+#define __fop_rename_desc __fop_rename_desc@DB_VERSION_UNIQUE_NAME@
+#define __fop_rename_noundo_desc __fop_rename_noundo_desc@DB_VERSION_UNIQUE_NAME@
+#define __fop_file_remove_desc __fop_file_remove_desc@DB_VERSION_UNIQUE_NAME@
+#define __fop_init_recover __fop_init_recover@DB_VERSION_UNIQUE_NAME@
+#define __fop_create_42_print __fop_create_42_print@DB_VERSION_UNIQUE_NAME@
+#define __fop_create_print __fop_create_print@DB_VERSION_UNIQUE_NAME@
+#define __fop_remove_print __fop_remove_print@DB_VERSION_UNIQUE_NAME@
+#define __fop_write_42_print __fop_write_42_print@DB_VERSION_UNIQUE_NAME@
+#define __fop_write_print __fop_write_print@DB_VERSION_UNIQUE_NAME@
+#define __fop_rename_42_print __fop_rename_42_print@DB_VERSION_UNIQUE_NAME@
+#define __fop_rename_print __fop_rename_print@DB_VERSION_UNIQUE_NAME@
+#define __fop_file_remove_print __fop_file_remove_print@DB_VERSION_UNIQUE_NAME@
+#define __fop_init_print __fop_init_print@DB_VERSION_UNIQUE_NAME@
+#define __fop_create __fop_create@DB_VERSION_UNIQUE_NAME@
+#define __fop_remove __fop_remove@DB_VERSION_UNIQUE_NAME@
+#define __fop_write __fop_write@DB_VERSION_UNIQUE_NAME@
+#define __fop_rename __fop_rename@DB_VERSION_UNIQUE_NAME@
+#define __fop_create_recover __fop_create_recover@DB_VERSION_UNIQUE_NAME@
+#define __fop_create_42_recover __fop_create_42_recover@DB_VERSION_UNIQUE_NAME@
+#define __fop_remove_recover __fop_remove_recover@DB_VERSION_UNIQUE_NAME@
+#define __fop_write_recover __fop_write_recover@DB_VERSION_UNIQUE_NAME@
+#define __fop_write_42_recover __fop_write_42_recover@DB_VERSION_UNIQUE_NAME@
+#define __fop_rename_recover __fop_rename_recover@DB_VERSION_UNIQUE_NAME@
+#define __fop_rename_noundo_recover __fop_rename_noundo_recover@DB_VERSION_UNIQUE_NAME@
+#define __fop_rename_42_recover __fop_rename_42_recover@DB_VERSION_UNIQUE_NAME@
+#define __fop_rename_noundo_46_recover __fop_rename_noundo_46_recover@DB_VERSION_UNIQUE_NAME@
+#define __fop_file_remove_recover __fop_file_remove_recover@DB_VERSION_UNIQUE_NAME@
+#define __fop_lock_handle __fop_lock_handle@DB_VERSION_UNIQUE_NAME@
+#define __fop_file_setup __fop_file_setup@DB_VERSION_UNIQUE_NAME@
+#define __fop_subdb_setup __fop_subdb_setup@DB_VERSION_UNIQUE_NAME@
+#define __fop_remove_setup __fop_remove_setup@DB_VERSION_UNIQUE_NAME@
+#define __fop_read_meta __fop_read_meta@DB_VERSION_UNIQUE_NAME@
+#define __fop_dummy __fop_dummy@DB_VERSION_UNIQUE_NAME@
+#define __fop_dbrename __fop_dbrename@DB_VERSION_UNIQUE_NAME@
+#define __ham_quick_delete __ham_quick_delete@DB_VERSION_UNIQUE_NAME@
+#define __hamc_init __hamc_init@DB_VERSION_UNIQUE_NAME@
+#define __hamc_count __hamc_count@DB_VERSION_UNIQUE_NAME@
+#define __hamc_cmp __hamc_cmp@DB_VERSION_UNIQUE_NAME@
+#define __hamc_dup __hamc_dup@DB_VERSION_UNIQUE_NAME@
+#define __ham_contract_table __ham_contract_table@DB_VERSION_UNIQUE_NAME@
+#define __ham_call_hash __ham_call_hash@DB_VERSION_UNIQUE_NAME@
+#define __ham_overwrite __ham_overwrite@DB_VERSION_UNIQUE_NAME@
+#define __ham_lookup __ham_lookup@DB_VERSION_UNIQUE_NAME@
+#define __ham_init_dbt __ham_init_dbt@DB_VERSION_UNIQUE_NAME@
+#define __hamc_update __hamc_update@DB_VERSION_UNIQUE_NAME@
+#define __ham_get_clist __ham_get_clist@DB_VERSION_UNIQUE_NAME@
+#define __ham_insdel_desc __ham_insdel_desc@DB_VERSION_UNIQUE_NAME@
+#define __ham_insdel_42_desc __ham_insdel_42_desc@DB_VERSION_UNIQUE_NAME@
+#define __ham_newpage_desc __ham_newpage_desc@DB_VERSION_UNIQUE_NAME@
+#define __ham_splitdata_desc __ham_splitdata_desc@DB_VERSION_UNIQUE_NAME@
+#define __ham_replace_desc __ham_replace_desc@DB_VERSION_UNIQUE_NAME@
+#define __ham_replace_42_desc __ham_replace_42_desc@DB_VERSION_UNIQUE_NAME@
+#define __ham_copypage_desc __ham_copypage_desc@DB_VERSION_UNIQUE_NAME@
+#define __ham_metagroup_42_desc __ham_metagroup_42_desc@DB_VERSION_UNIQUE_NAME@
+#define __ham_metagroup_desc __ham_metagroup_desc@DB_VERSION_UNIQUE_NAME@
+#define __ham_groupalloc_42_desc __ham_groupalloc_42_desc@DB_VERSION_UNIQUE_NAME@
+#define __ham_groupalloc_desc __ham_groupalloc_desc@DB_VERSION_UNIQUE_NAME@
+#define __ham_changeslot_desc __ham_changeslot_desc@DB_VERSION_UNIQUE_NAME@
+#define __ham_contract_desc __ham_contract_desc@DB_VERSION_UNIQUE_NAME@
+#define __ham_curadj_desc __ham_curadj_desc@DB_VERSION_UNIQUE_NAME@
+#define __ham_chgpg_desc __ham_chgpg_desc@DB_VERSION_UNIQUE_NAME@
+#define __ham_init_recover __ham_init_recover@DB_VERSION_UNIQUE_NAME@
+#define __ham_insdel_print __ham_insdel_print@DB_VERSION_UNIQUE_NAME@
+#define __ham_insdel_42_print __ham_insdel_42_print@DB_VERSION_UNIQUE_NAME@
+#define __ham_newpage_print __ham_newpage_print@DB_VERSION_UNIQUE_NAME@
+#define __ham_splitdata_print __ham_splitdata_print@DB_VERSION_UNIQUE_NAME@
+#define __ham_replace_print __ham_replace_print@DB_VERSION_UNIQUE_NAME@
+#define __ham_replace_42_print __ham_replace_42_print@DB_VERSION_UNIQUE_NAME@
+#define __ham_copypage_print __ham_copypage_print@DB_VERSION_UNIQUE_NAME@
+#define __ham_metagroup_42_print __ham_metagroup_42_print@DB_VERSION_UNIQUE_NAME@
+#define __ham_metagroup_print __ham_metagroup_print@DB_VERSION_UNIQUE_NAME@
+#define __ham_groupalloc_42_print __ham_groupalloc_42_print@DB_VERSION_UNIQUE_NAME@
+#define __ham_groupalloc_print __ham_groupalloc_print@DB_VERSION_UNIQUE_NAME@
+#define __ham_changeslot_print __ham_changeslot_print@DB_VERSION_UNIQUE_NAME@
+#define __ham_contract_print __ham_contract_print@DB_VERSION_UNIQUE_NAME@
+#define __ham_curadj_print __ham_curadj_print@DB_VERSION_UNIQUE_NAME@
+#define __ham_chgpg_print __ham_chgpg_print@DB_VERSION_UNIQUE_NAME@
+#define __ham_init_print __ham_init_print@DB_VERSION_UNIQUE_NAME@
+#define __ham_compact_int __ham_compact_int@DB_VERSION_UNIQUE_NAME@
+#define __ham_compact_bucket __ham_compact_bucket@DB_VERSION_UNIQUE_NAME@
+#define __ham_compact_hash __ham_compact_hash@DB_VERSION_UNIQUE_NAME@
+#define __ham_pgin __ham_pgin@DB_VERSION_UNIQUE_NAME@
+#define __ham_pgout __ham_pgout@DB_VERSION_UNIQUE_NAME@
+#define __ham_mswap __ham_mswap@DB_VERSION_UNIQUE_NAME@
+#define __ham_add_dup __ham_add_dup@DB_VERSION_UNIQUE_NAME@
+#define __ham_dup_convert __ham_dup_convert@DB_VERSION_UNIQUE_NAME@
+#define __ham_make_dup __ham_make_dup@DB_VERSION_UNIQUE_NAME@
+#define __ham_dsearch __ham_dsearch@DB_VERSION_UNIQUE_NAME@
+#define __ham_func2 __ham_func2@DB_VERSION_UNIQUE_NAME@
+#define __ham_func3 __ham_func3@DB_VERSION_UNIQUE_NAME@
+#define __ham_func4 __ham_func4@DB_VERSION_UNIQUE_NAME@
+#define __ham_func5 __ham_func5@DB_VERSION_UNIQUE_NAME@
+#define __ham_test __ham_test@DB_VERSION_UNIQUE_NAME@
+#define __ham_get_meta __ham_get_meta@DB_VERSION_UNIQUE_NAME@
+#define __ham_release_meta __ham_release_meta@DB_VERSION_UNIQUE_NAME@
+#define __ham_dirty_meta __ham_dirty_meta@DB_VERSION_UNIQUE_NAME@
+#define __ham_return_meta __ham_return_meta@DB_VERSION_UNIQUE_NAME@
+#define __ham_db_create __ham_db_create@DB_VERSION_UNIQUE_NAME@
+#define __ham_db_close __ham_db_close@DB_VERSION_UNIQUE_NAME@
+#define __ham_get_h_ffactor __ham_get_h_ffactor@DB_VERSION_UNIQUE_NAME@
+#define __ham_set_h_compare __ham_set_h_compare@DB_VERSION_UNIQUE_NAME@
+#define __ham_get_h_nelem __ham_get_h_nelem@DB_VERSION_UNIQUE_NAME@
+#define __ham_copy_config __ham_copy_config@DB_VERSION_UNIQUE_NAME@
+#define __ham_open __ham_open@DB_VERSION_UNIQUE_NAME@
+#define __ham_metachk __ham_metachk@DB_VERSION_UNIQUE_NAME@
+#define __ham_new_file __ham_new_file@DB_VERSION_UNIQUE_NAME@
+#define __ham_new_subdb __ham_new_subdb@DB_VERSION_UNIQUE_NAME@
+#define __ham_item __ham_item@DB_VERSION_UNIQUE_NAME@
+#define __ham_item_reset __ham_item_reset@DB_VERSION_UNIQUE_NAME@
+#define __ham_item_init __ham_item_init@DB_VERSION_UNIQUE_NAME@
+#define __ham_item_last __ham_item_last@DB_VERSION_UNIQUE_NAME@
+#define __ham_item_first __ham_item_first@DB_VERSION_UNIQUE_NAME@
+#define __ham_item_prev __ham_item_prev@DB_VERSION_UNIQUE_NAME@
+#define __ham_item_next __ham_item_next@DB_VERSION_UNIQUE_NAME@
+#define __ham_insertpair __ham_insertpair@DB_VERSION_UNIQUE_NAME@
+#define __ham_getindex __ham_getindex@DB_VERSION_UNIQUE_NAME@
+#define __ham_verify_sorted_page __ham_verify_sorted_page@DB_VERSION_UNIQUE_NAME@
+#define __ham_sort_page_cursor __ham_sort_page_cursor@DB_VERSION_UNIQUE_NAME@
+#define __ham_sort_page __ham_sort_page@DB_VERSION_UNIQUE_NAME@
+#define __ham_del_pair __ham_del_pair@DB_VERSION_UNIQUE_NAME@
+#define __ham_replpair __ham_replpair@DB_VERSION_UNIQUE_NAME@
+#define __ham_onpage_replace __ham_onpage_replace@DB_VERSION_UNIQUE_NAME@
+#define __ham_merge_pages __ham_merge_pages@DB_VERSION_UNIQUE_NAME@
+#define __ham_split_page __ham_split_page@DB_VERSION_UNIQUE_NAME@
+#define __ham_add_el __ham_add_el@DB_VERSION_UNIQUE_NAME@
+#define __ham_copypair __ham_copypair@DB_VERSION_UNIQUE_NAME@
+#define __ham_add_ovflpage __ham_add_ovflpage@DB_VERSION_UNIQUE_NAME@
+#define __ham_get_cpage __ham_get_cpage@DB_VERSION_UNIQUE_NAME@
+#define __ham_next_cpage __ham_next_cpage@DB_VERSION_UNIQUE_NAME@
+#define __ham_lock_bucket __ham_lock_bucket@DB_VERSION_UNIQUE_NAME@
+#define __ham_dpair __ham_dpair@DB_VERSION_UNIQUE_NAME@
+#define __ham_insdel_recover __ham_insdel_recover@DB_VERSION_UNIQUE_NAME@
+#define __ham_insdel_42_recover __ham_insdel_42_recover@DB_VERSION_UNIQUE_NAME@
+#define __ham_newpage_recover __ham_newpage_recover@DB_VERSION_UNIQUE_NAME@
+#define __ham_replace_recover __ham_replace_recover@DB_VERSION_UNIQUE_NAME@
+#define __ham_replace_42_recover __ham_replace_42_recover@DB_VERSION_UNIQUE_NAME@
+#define __ham_splitdata_recover __ham_splitdata_recover@DB_VERSION_UNIQUE_NAME@
+#define __ham_copypage_recover __ham_copypage_recover@DB_VERSION_UNIQUE_NAME@
+#define __ham_metagroup_recover __ham_metagroup_recover@DB_VERSION_UNIQUE_NAME@
+#define __ham_contract_recover __ham_contract_recover@DB_VERSION_UNIQUE_NAME@
+#define __ham_groupalloc_recover __ham_groupalloc_recover@DB_VERSION_UNIQUE_NAME@
+#define __ham_changeslot_recover __ham_changeslot_recover@DB_VERSION_UNIQUE_NAME@
+#define __ham_curadj_recover __ham_curadj_recover@DB_VERSION_UNIQUE_NAME@
+#define __ham_chgpg_recover __ham_chgpg_recover@DB_VERSION_UNIQUE_NAME@
+#define __ham_metagroup_42_recover __ham_metagroup_42_recover@DB_VERSION_UNIQUE_NAME@
+#define __ham_groupalloc_42_recover __ham_groupalloc_42_recover@DB_VERSION_UNIQUE_NAME@
+#define __ham_reclaim __ham_reclaim@DB_VERSION_UNIQUE_NAME@
+#define __ham_truncate __ham_truncate@DB_VERSION_UNIQUE_NAME@
+#define __ham_stat __ham_stat@DB_VERSION_UNIQUE_NAME@
+#define __ham_stat_print __ham_stat_print@DB_VERSION_UNIQUE_NAME@
+#define __ham_print_cursor __ham_print_cursor@DB_VERSION_UNIQUE_NAME@
+#define __ham_traverse __ham_traverse@DB_VERSION_UNIQUE_NAME@
+#define __db_no_hash_am __db_no_hash_am@DB_VERSION_UNIQUE_NAME@
+#define __ham_30_hashmeta __ham_30_hashmeta@DB_VERSION_UNIQUE_NAME@
+#define __ham_30_sizefix __ham_30_sizefix@DB_VERSION_UNIQUE_NAME@
+#define __ham_31_hashmeta __ham_31_hashmeta@DB_VERSION_UNIQUE_NAME@
+#define __ham_31_hash __ham_31_hash@DB_VERSION_UNIQUE_NAME@
+#define __ham_46_hashmeta __ham_46_hashmeta@DB_VERSION_UNIQUE_NAME@
+#define __ham_46_hash __ham_46_hash@DB_VERSION_UNIQUE_NAME@
+#define __ham_vrfy_meta __ham_vrfy_meta@DB_VERSION_UNIQUE_NAME@
+#define __ham_vrfy __ham_vrfy@DB_VERSION_UNIQUE_NAME@
+#define __ham_vrfy_structure __ham_vrfy_structure@DB_VERSION_UNIQUE_NAME@
+#define __ham_vrfy_hashing __ham_vrfy_hashing@DB_VERSION_UNIQUE_NAME@
+#define __ham_salvage __ham_salvage@DB_VERSION_UNIQUE_NAME@
+#define __ham_meta2pgset __ham_meta2pgset@DB_VERSION_UNIQUE_NAME@
+#define __heapc_init __heapc_init@DB_VERSION_UNIQUE_NAME@
+#define __heap_ditem __heap_ditem@DB_VERSION_UNIQUE_NAME@
+#define __heap_append __heap_append@DB_VERSION_UNIQUE_NAME@
+#define __heap_pitem __heap_pitem@DB_VERSION_UNIQUE_NAME@
+#define __heapc_dup __heapc_dup@DB_VERSION_UNIQUE_NAME@
+#define __heapc_gsplit __heapc_gsplit@DB_VERSION_UNIQUE_NAME@
+#define __heapc_refresh __heapc_refresh@DB_VERSION_UNIQUE_NAME@
+#define __heap_addrem_desc __heap_addrem_desc@DB_VERSION_UNIQUE_NAME@
+#define __heap_pg_alloc_desc __heap_pg_alloc_desc@DB_VERSION_UNIQUE_NAME@
+#define __heap_trunc_meta_desc __heap_trunc_meta_desc@DB_VERSION_UNIQUE_NAME@
+#define __heap_trunc_page_desc __heap_trunc_page_desc@DB_VERSION_UNIQUE_NAME@
+#define __heap_init_recover __heap_init_recover@DB_VERSION_UNIQUE_NAME@
+#define __heap_addrem_print __heap_addrem_print@DB_VERSION_UNIQUE_NAME@
+#define __heap_pg_alloc_print __heap_pg_alloc_print@DB_VERSION_UNIQUE_NAME@
+#define __heap_trunc_meta_print __heap_trunc_meta_print@DB_VERSION_UNIQUE_NAME@
+#define __heap_trunc_page_print __heap_trunc_page_print@DB_VERSION_UNIQUE_NAME@
+#define __heap_init_print __heap_init_print@DB_VERSION_UNIQUE_NAME@
+#define __heap_backup __heap_backup@DB_VERSION_UNIQUE_NAME@
+#define __heap_pgin __heap_pgin@DB_VERSION_UNIQUE_NAME@
+#define __heap_pgout __heap_pgout@DB_VERSION_UNIQUE_NAME@
+#define __heap_mswap __heap_mswap@DB_VERSION_UNIQUE_NAME@
+#define __heap_db_create __heap_db_create@DB_VERSION_UNIQUE_NAME@
+#define __heap_db_close __heap_db_close@DB_VERSION_UNIQUE_NAME@
+#define __heap_get_heapsize __heap_get_heapsize@DB_VERSION_UNIQUE_NAME@
+#define __heap_get_heap_regionsize __heap_get_heap_regionsize@DB_VERSION_UNIQUE_NAME@
+#define __heap_set_heapsize __heap_set_heapsize@DB_VERSION_UNIQUE_NAME@
+#define __heap_set_heap_regionsize __heap_set_heap_regionsize@DB_VERSION_UNIQUE_NAME@
+#define __heap_exist __heap_exist@DB_VERSION_UNIQUE_NAME@
+#define __heap_open __heap_open@DB_VERSION_UNIQUE_NAME@
+#define __heap_metachk __heap_metachk@DB_VERSION_UNIQUE_NAME@
+#define __heap_read_meta __heap_read_meta@DB_VERSION_UNIQUE_NAME@
+#define __heap_new_file __heap_new_file@DB_VERSION_UNIQUE_NAME@
+#define __heap_create_region __heap_create_region@DB_VERSION_UNIQUE_NAME@
+#define __heap_addrem_recover __heap_addrem_recover@DB_VERSION_UNIQUE_NAME@
+#define __heap_pg_alloc_recover __heap_pg_alloc_recover@DB_VERSION_UNIQUE_NAME@
+#define __heap_trunc_meta_recover __heap_trunc_meta_recover@DB_VERSION_UNIQUE_NAME@
+#define __heap_trunc_page_recover __heap_trunc_page_recover@DB_VERSION_UNIQUE_NAME@
+#define __heap_truncate __heap_truncate@DB_VERSION_UNIQUE_NAME@
+#define __heap_stat __heap_stat@DB_VERSION_UNIQUE_NAME@
+#define __heap_stat_print __heap_stat_print@DB_VERSION_UNIQUE_NAME@
+#define __heap_print_cursor __heap_print_cursor@DB_VERSION_UNIQUE_NAME@
+#define __heap_stat_callback __heap_stat_callback@DB_VERSION_UNIQUE_NAME@
+#define __heap_traverse __heap_traverse@DB_VERSION_UNIQUE_NAME@
+#define __db_no_heap_am __db_no_heap_am@DB_VERSION_UNIQUE_NAME@
+#define __heap_vrfy_meta __heap_vrfy_meta@DB_VERSION_UNIQUE_NAME@
+#define __heap_vrfy __heap_vrfy@DB_VERSION_UNIQUE_NAME@
+#define __heap_vrfy_structure __heap_vrfy_structure@DB_VERSION_UNIQUE_NAME@
+#define __heap_salvage __heap_salvage@DB_VERSION_UNIQUE_NAME@
+#define __heap_meta2pgset __heap_meta2pgset@DB_VERSION_UNIQUE_NAME@
+#define __db_chksum __db_chksum@DB_VERSION_UNIQUE_NAME@
+#define __db_derive_mac __db_derive_mac@DB_VERSION_UNIQUE_NAME@
+#define __db_check_chksum __db_check_chksum@DB_VERSION_UNIQUE_NAME@
+#define __db_SHA1Transform __db_SHA1Transform@DB_VERSION_UNIQUE_NAME@
+#define __db_SHA1Init __db_SHA1Init@DB_VERSION_UNIQUE_NAME@
+#define __db_SHA1Update __db_SHA1Update@DB_VERSION_UNIQUE_NAME@
+#define __db_SHA1Final __db_SHA1Final@DB_VERSION_UNIQUE_NAME@
+#define __lock_vec_pp __lock_vec_pp@DB_VERSION_UNIQUE_NAME@
+#define __lock_vec __lock_vec@DB_VERSION_UNIQUE_NAME@
+#define __lock_get_pp __lock_get_pp@DB_VERSION_UNIQUE_NAME@
+#define __lock_get __lock_get@DB_VERSION_UNIQUE_NAME@
+#define __lock_get_internal __lock_get_internal@DB_VERSION_UNIQUE_NAME@
+#define __lock_put_pp __lock_put_pp@DB_VERSION_UNIQUE_NAME@
+#define __lock_put __lock_put@DB_VERSION_UNIQUE_NAME@
+#define __lock_downgrade __lock_downgrade@DB_VERSION_UNIQUE_NAME@
+#define __lock_locker_same_family __lock_locker_same_family@DB_VERSION_UNIQUE_NAME@
+#define __lock_wakeup __lock_wakeup@DB_VERSION_UNIQUE_NAME@
+#define __lock_promote __lock_promote@DB_VERSION_UNIQUE_NAME@
+#define __lock_change __lock_change@DB_VERSION_UNIQUE_NAME@
+#define __lock_detect_pp __lock_detect_pp@DB_VERSION_UNIQUE_NAME@
+#define __lock_detect __lock_detect@DB_VERSION_UNIQUE_NAME@
+#define __lock_failchk __lock_failchk@DB_VERSION_UNIQUE_NAME@
+#define __lock_id_pp __lock_id_pp@DB_VERSION_UNIQUE_NAME@
+#define __lock_id __lock_id@DB_VERSION_UNIQUE_NAME@
+#define __lock_set_thread_id __lock_set_thread_id@DB_VERSION_UNIQUE_NAME@
+#define __lock_id_free_pp __lock_id_free_pp@DB_VERSION_UNIQUE_NAME@
+#define __lock_id_free __lock_id_free@DB_VERSION_UNIQUE_NAME@
+#define __lock_id_set __lock_id_set@DB_VERSION_UNIQUE_NAME@
+#define __lock_getlocker __lock_getlocker@DB_VERSION_UNIQUE_NAME@
+#define __lock_getlocker_int __lock_getlocker_int@DB_VERSION_UNIQUE_NAME@
+#define __lock_addfamilylocker __lock_addfamilylocker@DB_VERSION_UNIQUE_NAME@
+#define __lock_freelocker __lock_freelocker@DB_VERSION_UNIQUE_NAME@
+#define __lock_familyremove __lock_familyremove@DB_VERSION_UNIQUE_NAME@
+#define __lock_fix_list __lock_fix_list@DB_VERSION_UNIQUE_NAME@
+#define __lock_get_list __lock_get_list@DB_VERSION_UNIQUE_NAME@
+#define __lock_list_print __lock_list_print@DB_VERSION_UNIQUE_NAME@
+#define __lock_env_create __lock_env_create@DB_VERSION_UNIQUE_NAME@
+#define __lock_env_destroy __lock_env_destroy@DB_VERSION_UNIQUE_NAME@
+#define __lock_get_lk_conflicts __lock_get_lk_conflicts@DB_VERSION_UNIQUE_NAME@
+#define __lock_set_lk_conflicts __lock_set_lk_conflicts@DB_VERSION_UNIQUE_NAME@
+#define __lock_get_lk_detect __lock_get_lk_detect@DB_VERSION_UNIQUE_NAME@
+#define __lock_set_lk_detect __lock_set_lk_detect@DB_VERSION_UNIQUE_NAME@
+#define __lock_get_lk_max_locks __lock_get_lk_max_locks@DB_VERSION_UNIQUE_NAME@
+#define __lock_set_lk_max_locks __lock_set_lk_max_locks@DB_VERSION_UNIQUE_NAME@
+#define __lock_get_lk_max_lockers __lock_get_lk_max_lockers@DB_VERSION_UNIQUE_NAME@
+#define __lock_set_lk_max_lockers __lock_set_lk_max_lockers@DB_VERSION_UNIQUE_NAME@
+#define __lock_get_lk_max_objects __lock_get_lk_max_objects@DB_VERSION_UNIQUE_NAME@
+#define __lock_set_lk_max_objects __lock_set_lk_max_objects@DB_VERSION_UNIQUE_NAME@
+#define __lock_get_lk_partitions __lock_get_lk_partitions@DB_VERSION_UNIQUE_NAME@
+#define __lock_set_lk_partitions __lock_set_lk_partitions@DB_VERSION_UNIQUE_NAME@
+#define __lock_get_lk_tablesize __lock_get_lk_tablesize@DB_VERSION_UNIQUE_NAME@
+#define __lock_set_lk_tablesize __lock_set_lk_tablesize@DB_VERSION_UNIQUE_NAME@
+#define __lock_set_lk_priority __lock_set_lk_priority@DB_VERSION_UNIQUE_NAME@
+#define __lock_get_lk_priority __lock_get_lk_priority@DB_VERSION_UNIQUE_NAME@
+#define __lock_get_env_timeout __lock_get_env_timeout@DB_VERSION_UNIQUE_NAME@
+#define __lock_set_env_timeout __lock_set_env_timeout@DB_VERSION_UNIQUE_NAME@
+#define __lock_open __lock_open@DB_VERSION_UNIQUE_NAME@
+#define __lock_env_refresh __lock_env_refresh@DB_VERSION_UNIQUE_NAME@
+#define __lock_region_mutex_count __lock_region_mutex_count@DB_VERSION_UNIQUE_NAME@
+#define __lock_region_mutex_max __lock_region_mutex_max@DB_VERSION_UNIQUE_NAME@
+#define __lock_region_max __lock_region_max@DB_VERSION_UNIQUE_NAME@
+#define __lock_region_size __lock_region_size@DB_VERSION_UNIQUE_NAME@
+#define __lock_stat_pp __lock_stat_pp@DB_VERSION_UNIQUE_NAME@
+#define __lock_stat_print_pp __lock_stat_print_pp@DB_VERSION_UNIQUE_NAME@
+#define __lock_stat_print __lock_stat_print@DB_VERSION_UNIQUE_NAME@
+#define __lock_printlock __lock_printlock@DB_VERSION_UNIQUE_NAME@
+#define __lock_set_timeout __lock_set_timeout@DB_VERSION_UNIQUE_NAME@
+#define __lock_set_timeout_internal __lock_set_timeout_internal@DB_VERSION_UNIQUE_NAME@
+#define __lock_inherit_timeout __lock_inherit_timeout@DB_VERSION_UNIQUE_NAME@
+#define __lock_ohash __lock_ohash@DB_VERSION_UNIQUE_NAME@
+#define __lock_lhash __lock_lhash@DB_VERSION_UNIQUE_NAME@
+#define __lock_nomem __lock_nomem@DB_VERSION_UNIQUE_NAME@
+#define __log_open __log_open@DB_VERSION_UNIQUE_NAME@
+#define __log_find __log_find@DB_VERSION_UNIQUE_NAME@
+#define __log_valid __log_valid@DB_VERSION_UNIQUE_NAME@
+#define __log_env_refresh __log_env_refresh@DB_VERSION_UNIQUE_NAME@
+#define __log_get_cached_ckp_lsn __log_get_cached_ckp_lsn@DB_VERSION_UNIQUE_NAME@
+#define __log_region_mutex_count __log_region_mutex_count@DB_VERSION_UNIQUE_NAME@
+#define __log_region_mutex_max __log_region_mutex_max@DB_VERSION_UNIQUE_NAME@
+#define __log_region_size __log_region_size@DB_VERSION_UNIQUE_NAME@
+#define __log_region_max __log_region_max@DB_VERSION_UNIQUE_NAME@
+#define __log_vtruncate __log_vtruncate@DB_VERSION_UNIQUE_NAME@
+#define __log_is_outdated __log_is_outdated@DB_VERSION_UNIQUE_NAME@
+#define __log_zero __log_zero@DB_VERSION_UNIQUE_NAME@
+#define __log_inmem_lsnoff __log_inmem_lsnoff@DB_VERSION_UNIQUE_NAME@
+#define __log_inmem_newfile __log_inmem_newfile@DB_VERSION_UNIQUE_NAME@
+#define __log_inmem_chkspace __log_inmem_chkspace@DB_VERSION_UNIQUE_NAME@
+#define __log_inmem_copyout __log_inmem_copyout@DB_VERSION_UNIQUE_NAME@
+#define __log_inmem_copyin __log_inmem_copyin@DB_VERSION_UNIQUE_NAME@
+#define __log_set_version __log_set_version@DB_VERSION_UNIQUE_NAME@
+#define __log_get_oldversion __log_get_oldversion@DB_VERSION_UNIQUE_NAME@
+#define __log_archive_pp __log_archive_pp@DB_VERSION_UNIQUE_NAME@
+#define __log_archive __log_archive@DB_VERSION_UNIQUE_NAME@
+#define __log_get_stable_lsn __log_get_stable_lsn@DB_VERSION_UNIQUE_NAME@
+#define __log_autoremove __log_autoremove@DB_VERSION_UNIQUE_NAME@
+#define __log_check_page_lsn __log_check_page_lsn@DB_VERSION_UNIQUE_NAME@
+#define __log_printf_capi __log_printf_capi@DB_VERSION_UNIQUE_NAME@
+#define __log_printf_pp __log_printf_pp@DB_VERSION_UNIQUE_NAME@
+#define __log_printf __log_printf@DB_VERSION_UNIQUE_NAME@
+#define __log_cursor_pp __log_cursor_pp@DB_VERSION_UNIQUE_NAME@
+#define __log_cursor __log_cursor@DB_VERSION_UNIQUE_NAME@
+#define __logc_close __logc_close@DB_VERSION_UNIQUE_NAME@
+#define __logc_version __logc_version@DB_VERSION_UNIQUE_NAME@
+#define __logc_get __logc_get@DB_VERSION_UNIQUE_NAME@
+#define __log_hdrswap __log_hdrswap@DB_VERSION_UNIQUE_NAME@
+#define __log_persistswap __log_persistswap@DB_VERSION_UNIQUE_NAME@
+#define __log_read_record_pp __log_read_record_pp@DB_VERSION_UNIQUE_NAME@
+#define __log_read_record __log_read_record@DB_VERSION_UNIQUE_NAME@
+#define __log_env_create __log_env_create@DB_VERSION_UNIQUE_NAME@
+#define __log_env_destroy __log_env_destroy@DB_VERSION_UNIQUE_NAME@
+#define __log_get_lg_bsize __log_get_lg_bsize@DB_VERSION_UNIQUE_NAME@
+#define __log_set_lg_bsize __log_set_lg_bsize@DB_VERSION_UNIQUE_NAME@
+#define __log_get_lg_filemode __log_get_lg_filemode@DB_VERSION_UNIQUE_NAME@
+#define __log_set_lg_filemode __log_set_lg_filemode@DB_VERSION_UNIQUE_NAME@
+#define __log_get_lg_max __log_get_lg_max@DB_VERSION_UNIQUE_NAME@
+#define __log_set_lg_max __log_set_lg_max@DB_VERSION_UNIQUE_NAME@
+#define __log_get_lg_regionmax __log_get_lg_regionmax@DB_VERSION_UNIQUE_NAME@
+#define __log_set_lg_regionmax __log_set_lg_regionmax@DB_VERSION_UNIQUE_NAME@
+#define __log_get_lg_dir __log_get_lg_dir@DB_VERSION_UNIQUE_NAME@
+#define __log_set_lg_dir __log_set_lg_dir@DB_VERSION_UNIQUE_NAME@
+#define __log_get_flags __log_get_flags@DB_VERSION_UNIQUE_NAME@
+#define __log_set_flags __log_set_flags@DB_VERSION_UNIQUE_NAME@
+#define __log_get_config __log_get_config@DB_VERSION_UNIQUE_NAME@
+#define __log_set_config __log_set_config@DB_VERSION_UNIQUE_NAME@
+#define __log_set_config_int __log_set_config_int@DB_VERSION_UNIQUE_NAME@
+#define __log_check_sizes __log_check_sizes@DB_VERSION_UNIQUE_NAME@
+#define __log_print_record __log_print_record@DB_VERSION_UNIQUE_NAME@
+#define __log_put_pp __log_put_pp@DB_VERSION_UNIQUE_NAME@
+#define __log_put __log_put@DB_VERSION_UNIQUE_NAME@
+#define __log_current_lsn_int __log_current_lsn_int@DB_VERSION_UNIQUE_NAME@
+#define __log_current_lsn __log_current_lsn@DB_VERSION_UNIQUE_NAME@
+#define __log_newfile __log_newfile@DB_VERSION_UNIQUE_NAME@
+#define __log_flush_pp __log_flush_pp@DB_VERSION_UNIQUE_NAME@
+#define __log_flush __log_flush@DB_VERSION_UNIQUE_NAME@
+#define __log_flush_int __log_flush_int@DB_VERSION_UNIQUE_NAME@
+#define __log_file_pp __log_file_pp@DB_VERSION_UNIQUE_NAME@
+#define __log_name __log_name@DB_VERSION_UNIQUE_NAME@
+#define __log_rep_put __log_rep_put@DB_VERSION_UNIQUE_NAME@
+#define __log_put_record_pp __log_put_record_pp@DB_VERSION_UNIQUE_NAME@
+#define __log_put_record __log_put_record@DB_VERSION_UNIQUE_NAME@
+#define __log_stat_pp __log_stat_pp@DB_VERSION_UNIQUE_NAME@
+#define __log_stat_print_pp __log_stat_print_pp@DB_VERSION_UNIQUE_NAME@
+#define __log_stat_print __log_stat_print@DB_VERSION_UNIQUE_NAME@
+#define __log_verify_pp __log_verify_pp@DB_VERSION_UNIQUE_NAME@
+#define __log_verify __log_verify@DB_VERSION_UNIQUE_NAME@
+#define __log_verify_wrap __log_verify_wrap@DB_VERSION_UNIQUE_NAME@
+#define __crdel_init_verify __crdel_init_verify@DB_VERSION_UNIQUE_NAME@
+#define __db_init_verify __db_init_verify@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_init_verify __dbreg_init_verify@DB_VERSION_UNIQUE_NAME@
+#define __bam_init_verify __bam_init_verify@DB_VERSION_UNIQUE_NAME@
+#define __fop_init_verify __fop_init_verify@DB_VERSION_UNIQUE_NAME@
+#define __ham_init_verify __ham_init_verify@DB_VERSION_UNIQUE_NAME@
+#define __heap_init_verify __heap_init_verify@DB_VERSION_UNIQUE_NAME@
+#define __qam_init_verify __qam_init_verify@DB_VERSION_UNIQUE_NAME@
+#define __txn_init_verify __txn_init_verify@DB_VERSION_UNIQUE_NAME@
+#define __db_log_verify_global_report __db_log_verify_global_report@DB_VERSION_UNIQUE_NAME@
+#define __crdel_metasub_verify __crdel_metasub_verify@DB_VERSION_UNIQUE_NAME@
+#define __crdel_inmem_create_verify __crdel_inmem_create_verify@DB_VERSION_UNIQUE_NAME@
+#define __crdel_inmem_rename_verify __crdel_inmem_rename_verify@DB_VERSION_UNIQUE_NAME@
+#define __crdel_inmem_remove_verify __crdel_inmem_remove_verify@DB_VERSION_UNIQUE_NAME@
+#define __db_addrem_verify __db_addrem_verify@DB_VERSION_UNIQUE_NAME@
+#define __db_big_verify __db_big_verify@DB_VERSION_UNIQUE_NAME@
+#define __db_ovref_verify __db_ovref_verify@DB_VERSION_UNIQUE_NAME@
+#define __db_relink_42_verify __db_relink_42_verify@DB_VERSION_UNIQUE_NAME@
+#define __db_debug_verify __db_debug_verify@DB_VERSION_UNIQUE_NAME@
+#define __db_noop_verify __db_noop_verify@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_alloc_42_verify __db_pg_alloc_42_verify@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_alloc_verify __db_pg_alloc_verify@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_free_42_verify __db_pg_free_42_verify@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_free_verify __db_pg_free_verify@DB_VERSION_UNIQUE_NAME@
+#define __db_cksum_verify __db_cksum_verify@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_freedata_42_verify __db_pg_freedata_42_verify@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_freedata_verify __db_pg_freedata_verify@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_init_verify __db_pg_init_verify@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_sort_44_verify __db_pg_sort_44_verify@DB_VERSION_UNIQUE_NAME@
+#define __db_pg_trunc_verify __db_pg_trunc_verify@DB_VERSION_UNIQUE_NAME@
+#define __db_realloc_verify __db_realloc_verify@DB_VERSION_UNIQUE_NAME@
+#define __db_relink_verify __db_relink_verify@DB_VERSION_UNIQUE_NAME@
+#define __db_merge_verify __db_merge_verify@DB_VERSION_UNIQUE_NAME@
+#define __db_pgno_verify __db_pgno_verify@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_register_verify __dbreg_register_verify@DB_VERSION_UNIQUE_NAME@
+#define __bam_split_verify __bam_split_verify@DB_VERSION_UNIQUE_NAME@
+#define __bam_split_42_verify __bam_split_42_verify@DB_VERSION_UNIQUE_NAME@
+#define __bam_rsplit_verify __bam_rsplit_verify@DB_VERSION_UNIQUE_NAME@
+#define __bam_adj_verify __bam_adj_verify@DB_VERSION_UNIQUE_NAME@
+#define __bam_irep_verify __bam_irep_verify@DB_VERSION_UNIQUE_NAME@
+#define __bam_cadjust_verify __bam_cadjust_verify@DB_VERSION_UNIQUE_NAME@
+#define __bam_cdel_verify __bam_cdel_verify@DB_VERSION_UNIQUE_NAME@
+#define __bam_repl_verify __bam_repl_verify@DB_VERSION_UNIQUE_NAME@
+#define __bam_root_verify __bam_root_verify@DB_VERSION_UNIQUE_NAME@
+#define __bam_curadj_verify __bam_curadj_verify@DB_VERSION_UNIQUE_NAME@
+#define __bam_rcuradj_verify __bam_rcuradj_verify@DB_VERSION_UNIQUE_NAME@
+#define __bam_relink_43_verify __bam_relink_43_verify@DB_VERSION_UNIQUE_NAME@
+#define __bam_merge_44_verify __bam_merge_44_verify@DB_VERSION_UNIQUE_NAME@
+#define __fop_create_42_verify __fop_create_42_verify@DB_VERSION_UNIQUE_NAME@
+#define __fop_create_verify __fop_create_verify@DB_VERSION_UNIQUE_NAME@
+#define __fop_remove_verify __fop_remove_verify@DB_VERSION_UNIQUE_NAME@
+#define __fop_write_42_verify __fop_write_42_verify@DB_VERSION_UNIQUE_NAME@
+#define __fop_write_verify __fop_write_verify@DB_VERSION_UNIQUE_NAME@
+#define __fop_rename_42_verify __fop_rename_42_verify@DB_VERSION_UNIQUE_NAME@
+#define __fop_rename_verify __fop_rename_verify@DB_VERSION_UNIQUE_NAME@
+#define __fop_file_remove_verify __fop_file_remove_verify@DB_VERSION_UNIQUE_NAME@
+#define __ham_insdel_verify __ham_insdel_verify@DB_VERSION_UNIQUE_NAME@
+#define __ham_newpage_verify __ham_newpage_verify@DB_VERSION_UNIQUE_NAME@
+#define __ham_splitdata_verify __ham_splitdata_verify@DB_VERSION_UNIQUE_NAME@
+#define __ham_replace_verify __ham_replace_verify@DB_VERSION_UNIQUE_NAME@
+#define __ham_copypage_verify __ham_copypage_verify@DB_VERSION_UNIQUE_NAME@
+#define __ham_metagroup_42_verify __ham_metagroup_42_verify@DB_VERSION_UNIQUE_NAME@
+#define __ham_metagroup_verify __ham_metagroup_verify@DB_VERSION_UNIQUE_NAME@
+#define __ham_groupalloc_42_verify __ham_groupalloc_42_verify@DB_VERSION_UNIQUE_NAME@
+#define __ham_groupalloc_verify __ham_groupalloc_verify@DB_VERSION_UNIQUE_NAME@
+#define __ham_changeslot_verify __ham_changeslot_verify@DB_VERSION_UNIQUE_NAME@
+#define __ham_contract_verify __ham_contract_verify@DB_VERSION_UNIQUE_NAME@
+#define __ham_curadj_verify __ham_curadj_verify@DB_VERSION_UNIQUE_NAME@
+#define __ham_chgpg_verify __ham_chgpg_verify@DB_VERSION_UNIQUE_NAME@
+#define __heap_addrem_verify __heap_addrem_verify@DB_VERSION_UNIQUE_NAME@
+#define __heap_pg_alloc_verify __heap_pg_alloc_verify@DB_VERSION_UNIQUE_NAME@
+#define __heap_trunc_meta_verify __heap_trunc_meta_verify@DB_VERSION_UNIQUE_NAME@
+#define __heap_trunc_page_verify __heap_trunc_page_verify@DB_VERSION_UNIQUE_NAME@
+#define __qam_incfirst_verify __qam_incfirst_verify@DB_VERSION_UNIQUE_NAME@
+#define __qam_mvptr_verify __qam_mvptr_verify@DB_VERSION_UNIQUE_NAME@
+#define __qam_del_verify __qam_del_verify@DB_VERSION_UNIQUE_NAME@
+#define __qam_add_verify __qam_add_verify@DB_VERSION_UNIQUE_NAME@
+#define __qam_delext_verify __qam_delext_verify@DB_VERSION_UNIQUE_NAME@
+#define __txn_regop_42_verify __txn_regop_42_verify@DB_VERSION_UNIQUE_NAME@
+#define __txn_regop_verify __txn_regop_verify@DB_VERSION_UNIQUE_NAME@
+#define __txn_ckp_42_verify __txn_ckp_42_verify@DB_VERSION_UNIQUE_NAME@
+#define __txn_ckp_verify __txn_ckp_verify@DB_VERSION_UNIQUE_NAME@
+#define __txn_child_verify __txn_child_verify@DB_VERSION_UNIQUE_NAME@
+#define __txn_xa_regop_42_verify __txn_xa_regop_42_verify@DB_VERSION_UNIQUE_NAME@
+#define __txn_prepare_verify __txn_prepare_verify@DB_VERSION_UNIQUE_NAME@
+#define __txn_recycle_verify __txn_recycle_verify@DB_VERSION_UNIQUE_NAME@
+#define __create_log_vrfy_info __create_log_vrfy_info@DB_VERSION_UNIQUE_NAME@
+#define __destroy_log_vrfy_info __destroy_log_vrfy_info@DB_VERSION_UNIQUE_NAME@
+#define __put_txn_vrfy_info __put_txn_vrfy_info@DB_VERSION_UNIQUE_NAME@
+#define __get_txn_vrfy_info __get_txn_vrfy_info@DB_VERSION_UNIQUE_NAME@
+#define __add_recycle_lsn_range __add_recycle_lsn_range@DB_VERSION_UNIQUE_NAME@
+#define __iterate_txninfo __iterate_txninfo@DB_VERSION_UNIQUE_NAME@
+#define __rem_last_recycle_lsn __rem_last_recycle_lsn@DB_VERSION_UNIQUE_NAME@
+#define __add_file_updated __add_file_updated@DB_VERSION_UNIQUE_NAME@
+#define __del_file_updated __del_file_updated@DB_VERSION_UNIQUE_NAME@
+#define __clear_fileups __clear_fileups@DB_VERSION_UNIQUE_NAME@
+#define __free_txninfo_stack __free_txninfo_stack@DB_VERSION_UNIQUE_NAME@
+#define __free_txninfo __free_txninfo@DB_VERSION_UNIQUE_NAME@
+#define __put_filereg_info __put_filereg_info@DB_VERSION_UNIQUE_NAME@
+#define __del_filelife __del_filelife@DB_VERSION_UNIQUE_NAME@
+#define __put_filelife __put_filelife@DB_VERSION_UNIQUE_NAME@
+#define __get_filelife __get_filelife@DB_VERSION_UNIQUE_NAME@
+#define __get_filereg_by_dbregid __get_filereg_by_dbregid@DB_VERSION_UNIQUE_NAME@
+#define __add_dbregid __add_dbregid@DB_VERSION_UNIQUE_NAME@
+#define __get_filereg_info __get_filereg_info@DB_VERSION_UNIQUE_NAME@
+#define __free_filereg_info __free_filereg_info@DB_VERSION_UNIQUE_NAME@
+#define __get_ckp_info __get_ckp_info@DB_VERSION_UNIQUE_NAME@
+#define __get_last_ckp_info __get_last_ckp_info@DB_VERSION_UNIQUE_NAME@
+#define __put_ckp_info __put_ckp_info@DB_VERSION_UNIQUE_NAME@
+#define __get_timestamp_info __get_timestamp_info@DB_VERSION_UNIQUE_NAME@
+#define __get_latest_timestamp_info __get_latest_timestamp_info@DB_VERSION_UNIQUE_NAME@
+#define __put_timestamp_info __put_timestamp_info@DB_VERSION_UNIQUE_NAME@
+#define __find_lsnrg_by_timerg __find_lsnrg_by_timerg@DB_VERSION_UNIQUE_NAME@
+#define __add_txnrange __add_txnrange@DB_VERSION_UNIQUE_NAME@
+#define __get_aborttxn __get_aborttxn@DB_VERSION_UNIQUE_NAME@
+#define __txn_started __txn_started@DB_VERSION_UNIQUE_NAME@
+#define __set_logvrfy_dbfuid __set_logvrfy_dbfuid@DB_VERSION_UNIQUE_NAME@
+#define __add_page_to_txn __add_page_to_txn@DB_VERSION_UNIQUE_NAME@
+#define __del_txn_pages __del_txn_pages@DB_VERSION_UNIQUE_NAME@
+#define __is_ancestor_txn __is_ancestor_txn@DB_VERSION_UNIQUE_NAME@
+#define __return_txn_pages __return_txn_pages@DB_VERSION_UNIQUE_NAME@
+#define __memp_alloc __memp_alloc@DB_VERSION_UNIQUE_NAME@
+#define __memp_free __memp_free@DB_VERSION_UNIQUE_NAME@
+#define __memp_backup_open __memp_backup_open@DB_VERSION_UNIQUE_NAME@
+#define __memp_backup_mpf __memp_backup_mpf@DB_VERSION_UNIQUE_NAME@
+#define __memp_backup_close __memp_backup_close@DB_VERSION_UNIQUE_NAME@
+#define __memp_failchk __memp_failchk@DB_VERSION_UNIQUE_NAME@
+#define __memp_bhwrite __memp_bhwrite@DB_VERSION_UNIQUE_NAME@
+#define __memp_pgread __memp_pgread@DB_VERSION_UNIQUE_NAME@
+#define __memp_pg __memp_pg@DB_VERSION_UNIQUE_NAME@
+#define __memp_bhfree __memp_bhfree@DB_VERSION_UNIQUE_NAME@
+#define __memp_fget_pp __memp_fget_pp@DB_VERSION_UNIQUE_NAME@
+#define __memp_fget __memp_fget@DB_VERSION_UNIQUE_NAME@
+#define __memp_fcreate_pp __memp_fcreate_pp@DB_VERSION_UNIQUE_NAME@
+#define __memp_fcreate __memp_fcreate@DB_VERSION_UNIQUE_NAME@
+#define __memp_set_clear_len __memp_set_clear_len@DB_VERSION_UNIQUE_NAME@
+#define __memp_get_fileid __memp_get_fileid@DB_VERSION_UNIQUE_NAME@
+#define __memp_set_fileid __memp_set_fileid@DB_VERSION_UNIQUE_NAME@
+#define __memp_get_flags __memp_get_flags@DB_VERSION_UNIQUE_NAME@
+#define __memp_set_flags __memp_set_flags@DB_VERSION_UNIQUE_NAME@
+#define __memp_get_ftype __memp_get_ftype@DB_VERSION_UNIQUE_NAME@
+#define __memp_set_ftype __memp_set_ftype@DB_VERSION_UNIQUE_NAME@
+#define __memp_set_lsn_offset __memp_set_lsn_offset@DB_VERSION_UNIQUE_NAME@
+#define __memp_get_pgcookie __memp_get_pgcookie@DB_VERSION_UNIQUE_NAME@
+#define __memp_set_pgcookie __memp_set_pgcookie@DB_VERSION_UNIQUE_NAME@
+#define __memp_get_priority __memp_get_priority@DB_VERSION_UNIQUE_NAME@
+#define __memp_get_last_pgno __memp_get_last_pgno@DB_VERSION_UNIQUE_NAME@
+#define __memp_fn __memp_fn@DB_VERSION_UNIQUE_NAME@
+#define __memp_fns __memp_fns@DB_VERSION_UNIQUE_NAME@
+#define __memp_fopen_pp __memp_fopen_pp@DB_VERSION_UNIQUE_NAME@
+#define __memp_fopen __memp_fopen@DB_VERSION_UNIQUE_NAME@
+#define __memp_fclose_pp __memp_fclose_pp@DB_VERSION_UNIQUE_NAME@
+#define __memp_fclose __memp_fclose@DB_VERSION_UNIQUE_NAME@
+#define __memp_mf_discard __memp_mf_discard@DB_VERSION_UNIQUE_NAME@
+#define __memp_inmemlist __memp_inmemlist@DB_VERSION_UNIQUE_NAME@
+#define __memp_fput_pp __memp_fput_pp@DB_VERSION_UNIQUE_NAME@
+#define __memp_fput __memp_fput@DB_VERSION_UNIQUE_NAME@
+#define __memp_unpin_buffers __memp_unpin_buffers@DB_VERSION_UNIQUE_NAME@
+#define __memp_dirty __memp_dirty@DB_VERSION_UNIQUE_NAME@
+#define __memp_shared __memp_shared@DB_VERSION_UNIQUE_NAME@
+#define __memp_env_create __memp_env_create@DB_VERSION_UNIQUE_NAME@
+#define __memp_env_destroy __memp_env_destroy@DB_VERSION_UNIQUE_NAME@
+#define __memp_get_cachesize __memp_get_cachesize@DB_VERSION_UNIQUE_NAME@
+#define __memp_set_cachesize __memp_set_cachesize@DB_VERSION_UNIQUE_NAME@
+#define __memp_set_config __memp_set_config@DB_VERSION_UNIQUE_NAME@
+#define __memp_get_config __memp_get_config@DB_VERSION_UNIQUE_NAME@
+#define __memp_get_mp_max_openfd __memp_get_mp_max_openfd@DB_VERSION_UNIQUE_NAME@
+#define __memp_set_mp_max_openfd __memp_set_mp_max_openfd@DB_VERSION_UNIQUE_NAME@
+#define __memp_get_mp_max_write __memp_get_mp_max_write@DB_VERSION_UNIQUE_NAME@
+#define __memp_set_mp_max_write __memp_set_mp_max_write@DB_VERSION_UNIQUE_NAME@
+#define __memp_get_mp_mmapsize __memp_get_mp_mmapsize@DB_VERSION_UNIQUE_NAME@
+#define __memp_set_mp_mmapsize __memp_set_mp_mmapsize@DB_VERSION_UNIQUE_NAME@
+#define __memp_get_mp_pagesize __memp_get_mp_pagesize@DB_VERSION_UNIQUE_NAME@
+#define __memp_set_mp_pagesize __memp_set_mp_pagesize@DB_VERSION_UNIQUE_NAME@
+#define __memp_get_mp_tablesize __memp_get_mp_tablesize@DB_VERSION_UNIQUE_NAME@
+#define __memp_set_mp_tablesize __memp_set_mp_tablesize@DB_VERSION_UNIQUE_NAME@
+#define __memp_get_mp_mtxcount __memp_get_mp_mtxcount@DB_VERSION_UNIQUE_NAME@
+#define __memp_set_mp_mtxcount __memp_set_mp_mtxcount@DB_VERSION_UNIQUE_NAME@
+#define __memp_nameop __memp_nameop@DB_VERSION_UNIQUE_NAME@
+#define __memp_ftruncate __memp_ftruncate@DB_VERSION_UNIQUE_NAME@
+#define __memp_alloc_freelist __memp_alloc_freelist@DB_VERSION_UNIQUE_NAME@
+#define __memp_free_freelist __memp_free_freelist@DB_VERSION_UNIQUE_NAME@
+#define __memp_get_freelist __memp_get_freelist@DB_VERSION_UNIQUE_NAME@
+#define __memp_extend_freelist __memp_extend_freelist@DB_VERSION_UNIQUE_NAME@
+#define __memp_set_last_pgno __memp_set_last_pgno@DB_VERSION_UNIQUE_NAME@
+#define __memp_bh_settxn __memp_bh_settxn@DB_VERSION_UNIQUE_NAME@
+#define __memp_skip_curadj __memp_skip_curadj@DB_VERSION_UNIQUE_NAME@
+#define __memp_bh_freeze __memp_bh_freeze@DB_VERSION_UNIQUE_NAME@
+#define __memp_bh_thaw __memp_bh_thaw@DB_VERSION_UNIQUE_NAME@
+#define __memp_open __memp_open@DB_VERSION_UNIQUE_NAME@
+#define __memp_init __memp_init@DB_VERSION_UNIQUE_NAME@
+#define __memp_max_regions __memp_max_regions@DB_VERSION_UNIQUE_NAME@
+#define __memp_region_mutex_count __memp_region_mutex_count@DB_VERSION_UNIQUE_NAME@
+#define __memp_env_refresh __memp_env_refresh@DB_VERSION_UNIQUE_NAME@
+#define __memp_register_pp __memp_register_pp@DB_VERSION_UNIQUE_NAME@
+#define __memp_register __memp_register@DB_VERSION_UNIQUE_NAME@
+#define __memp_get_bucket __memp_get_bucket@DB_VERSION_UNIQUE_NAME@
+#define __memp_resize __memp_resize@DB_VERSION_UNIQUE_NAME@
+#define __memp_get_cache_max __memp_get_cache_max@DB_VERSION_UNIQUE_NAME@
+#define __memp_set_cache_max __memp_set_cache_max@DB_VERSION_UNIQUE_NAME@
+#define __memp_stat_pp __memp_stat_pp@DB_VERSION_UNIQUE_NAME@
+#define __memp_stat_print_pp __memp_stat_print_pp@DB_VERSION_UNIQUE_NAME@
+#define __memp_stat_print __memp_stat_print@DB_VERSION_UNIQUE_NAME@
+#define __memp_stat_hash __memp_stat_hash@DB_VERSION_UNIQUE_NAME@
+#define __memp_walk_files __memp_walk_files@DB_VERSION_UNIQUE_NAME@
+#define __memp_discard_all_mpfs __memp_discard_all_mpfs@DB_VERSION_UNIQUE_NAME@
+#define __memp_sync_pp __memp_sync_pp@DB_VERSION_UNIQUE_NAME@
+#define __memp_sync __memp_sync@DB_VERSION_UNIQUE_NAME@
+#define __memp_fsync_pp __memp_fsync_pp@DB_VERSION_UNIQUE_NAME@
+#define __memp_fsync __memp_fsync@DB_VERSION_UNIQUE_NAME@
+#define __mp_xxx_fh __mp_xxx_fh@DB_VERSION_UNIQUE_NAME@
+#define __memp_sync_int __memp_sync_int@DB_VERSION_UNIQUE_NAME@
+#define __memp_mf_sync __memp_mf_sync@DB_VERSION_UNIQUE_NAME@
+#define __memp_trickle_pp __memp_trickle_pp@DB_VERSION_UNIQUE_NAME@
+#define __mutex_alloc __mutex_alloc@DB_VERSION_UNIQUE_NAME@
+#define __mutex_alloc_int __mutex_alloc_int@DB_VERSION_UNIQUE_NAME@
+#define __mutex_free __mutex_free@DB_VERSION_UNIQUE_NAME@
+#define __mutex_free_int __mutex_free_int@DB_VERSION_UNIQUE_NAME@
+#define __mutex_refresh __mutex_refresh@DB_VERSION_UNIQUE_NAME@
+#define __mut_failchk __mut_failchk@DB_VERSION_UNIQUE_NAME@
+#define __db_fcntl_mutex_init __db_fcntl_mutex_init@DB_VERSION_UNIQUE_NAME@
+#define __db_fcntl_mutex_lock __db_fcntl_mutex_lock@DB_VERSION_UNIQUE_NAME@
+#define __db_fcntl_mutex_trylock __db_fcntl_mutex_trylock@DB_VERSION_UNIQUE_NAME@
+#define __db_fcntl_mutex_unlock __db_fcntl_mutex_unlock@DB_VERSION_UNIQUE_NAME@
+#define __db_fcntl_mutex_destroy __db_fcntl_mutex_destroy@DB_VERSION_UNIQUE_NAME@
+#define __mutex_alloc_pp __mutex_alloc_pp@DB_VERSION_UNIQUE_NAME@
+#define __mutex_free_pp __mutex_free_pp@DB_VERSION_UNIQUE_NAME@
+#define __mutex_lock_pp __mutex_lock_pp@DB_VERSION_UNIQUE_NAME@
+#define __mutex_unlock_pp __mutex_unlock_pp@DB_VERSION_UNIQUE_NAME@
+#define __mutex_get_align __mutex_get_align@DB_VERSION_UNIQUE_NAME@
+#define __mutex_set_align __mutex_set_align@DB_VERSION_UNIQUE_NAME@
+#define __mutex_get_increment __mutex_get_increment@DB_VERSION_UNIQUE_NAME@
+#define __mutex_set_increment __mutex_set_increment@DB_VERSION_UNIQUE_NAME@
+#define __mutex_get_init __mutex_get_init@DB_VERSION_UNIQUE_NAME@
+#define __mutex_set_init __mutex_set_init@DB_VERSION_UNIQUE_NAME@
+#define __mutex_get_max __mutex_get_max@DB_VERSION_UNIQUE_NAME@
+#define __mutex_set_max __mutex_set_max@DB_VERSION_UNIQUE_NAME@
+#define __mutex_get_tas_spins __mutex_get_tas_spins@DB_VERSION_UNIQUE_NAME@
+#define __mutex_set_tas_spins __mutex_set_tas_spins@DB_VERSION_UNIQUE_NAME@
+#if !defined(HAVE_ATOMIC_SUPPORT) && defined(HAVE_MUTEX_SUPPORT)
+#define __atomic_inc __atomic_inc@DB_VERSION_UNIQUE_NAME@
+#endif
+#if !defined(HAVE_ATOMIC_SUPPORT) && defined(HAVE_MUTEX_SUPPORT)
+#define __atomic_dec __atomic_dec@DB_VERSION_UNIQUE_NAME@
+#endif
+#if !defined(HAVE_ATOMIC_SUPPORT) && defined(HAVE_MUTEX_SUPPORT)
+#define atomic_compare_exchange atomic_compare_exchange@DB_VERSION_UNIQUE_NAME@
+#endif
+#define __db_pthread_mutex_init __db_pthread_mutex_init@DB_VERSION_UNIQUE_NAME@
+#ifndef HAVE_MUTEX_HYBRID
+#define __db_pthread_mutex_lock __db_pthread_mutex_lock@DB_VERSION_UNIQUE_NAME@
+#endif
+#if defined(HAVE_SHARED_LATCHES)
+#define __db_pthread_mutex_readlock __db_pthread_mutex_readlock@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifdef HAVE_MUTEX_HYBRID
+#define __db_hybrid_mutex_suspend __db_hybrid_mutex_suspend@DB_VERSION_UNIQUE_NAME@
+#endif
+#define __db_pthread_mutex_unlock __db_pthread_mutex_unlock@DB_VERSION_UNIQUE_NAME@
+#define __db_pthread_mutex_destroy __db_pthread_mutex_destroy@DB_VERSION_UNIQUE_NAME@
+#define __mutex_open __mutex_open@DB_VERSION_UNIQUE_NAME@
+#define __mutex_env_refresh __mutex_env_refresh@DB_VERSION_UNIQUE_NAME@
+#define __mutex_resource_return __mutex_resource_return@DB_VERSION_UNIQUE_NAME@
+#define __mutex_stat_pp __mutex_stat_pp@DB_VERSION_UNIQUE_NAME@
+#define __mutex_stat_print_pp __mutex_stat_print_pp@DB_VERSION_UNIQUE_NAME@
+#define __mutex_stat_print __mutex_stat_print@DB_VERSION_UNIQUE_NAME@
+#define __mutex_print_debug_single __mutex_print_debug_single@DB_VERSION_UNIQUE_NAME@
+#define __mutex_print_debug_stats __mutex_print_debug_stats@DB_VERSION_UNIQUE_NAME@
+#define __mutex_set_wait_info __mutex_set_wait_info@DB_VERSION_UNIQUE_NAME@
+#define __mutex_clear __mutex_clear@DB_VERSION_UNIQUE_NAME@
+#define __db_tas_mutex_init __db_tas_mutex_init@DB_VERSION_UNIQUE_NAME@
+#define __db_tas_mutex_lock __db_tas_mutex_lock@DB_VERSION_UNIQUE_NAME@
+#define __db_tas_mutex_trylock __db_tas_mutex_trylock@DB_VERSION_UNIQUE_NAME@
+#if defined(HAVE_SHARED_LATCHES)
+#define __db_tas_mutex_readlock __db_tas_mutex_readlock@DB_VERSION_UNIQUE_NAME@
+#endif
+#if defined(HAVE_SHARED_LATCHES)
+#define __db_tas_mutex_tryreadlock __db_tas_mutex_tryreadlock@DB_VERSION_UNIQUE_NAME@
+#endif
+#define __db_tas_mutex_unlock __db_tas_mutex_unlock@DB_VERSION_UNIQUE_NAME@
+#define __db_tas_mutex_destroy __db_tas_mutex_destroy@DB_VERSION_UNIQUE_NAME@
+#define __db_win32_mutex_init __db_win32_mutex_init@DB_VERSION_UNIQUE_NAME@
+#define __db_win32_mutex_lock __db_win32_mutex_lock@DB_VERSION_UNIQUE_NAME@
+#define __db_win32_mutex_trylock __db_win32_mutex_trylock@DB_VERSION_UNIQUE_NAME@
+#if defined(HAVE_SHARED_LATCHES)
+#define __db_win32_mutex_readlock __db_win32_mutex_readlock@DB_VERSION_UNIQUE_NAME@
+#endif
+#if defined(HAVE_SHARED_LATCHES)
+#define __db_win32_mutex_tryreadlock __db_win32_mutex_tryreadlock@DB_VERSION_UNIQUE_NAME@
+#endif
+#define __db_win32_mutex_unlock __db_win32_mutex_unlock@DB_VERSION_UNIQUE_NAME@
+#define __db_win32_mutex_destroy __db_win32_mutex_destroy@DB_VERSION_UNIQUE_NAME@
+#define __os_abort __os_abort@DB_VERSION_UNIQUE_NAME@
+#define __os_abspath __os_abspath@DB_VERSION_UNIQUE_NAME@
+#if defined(HAVE_REPLICATION_THREADS)
+#define __os_getaddrinfo __os_getaddrinfo@DB_VERSION_UNIQUE_NAME@
+#endif
+#if defined(HAVE_REPLICATION_THREADS)
+#define __os_freeaddrinfo __os_freeaddrinfo@DB_VERSION_UNIQUE_NAME@
+#endif
+#define __os_umalloc __os_umalloc@DB_VERSION_UNIQUE_NAME@
+#define __os_urealloc __os_urealloc@DB_VERSION_UNIQUE_NAME@
+#define __os_ufree __os_ufree@DB_VERSION_UNIQUE_NAME@
+#define __os_strdup __os_strdup@DB_VERSION_UNIQUE_NAME@
+#define __os_calloc __os_calloc@DB_VERSION_UNIQUE_NAME@
+#define __os_malloc __os_malloc@DB_VERSION_UNIQUE_NAME@
+#define __os_realloc __os_realloc@DB_VERSION_UNIQUE_NAME@
+#define __os_free __os_free@DB_VERSION_UNIQUE_NAME@
+#define __ua_memcpy __ua_memcpy@DB_VERSION_UNIQUE_NAME@
+#define __os_gettime __os_gettime@DB_VERSION_UNIQUE_NAME@
+#define __os_fs_notzero __os_fs_notzero@DB_VERSION_UNIQUE_NAME@
+#define __os_support_direct_io __os_support_direct_io@DB_VERSION_UNIQUE_NAME@
+#define __os_support_db_register __os_support_db_register@DB_VERSION_UNIQUE_NAME@
+#define __os_support_replication __os_support_replication@DB_VERSION_UNIQUE_NAME@
+#define __os_cpu_count __os_cpu_count@DB_VERSION_UNIQUE_NAME@
+#define __os_ctime __os_ctime@DB_VERSION_UNIQUE_NAME@
+#define __os_dirlist __os_dirlist@DB_VERSION_UNIQUE_NAME@
+#define __os_dirfree __os_dirfree@DB_VERSION_UNIQUE_NAME@
+#define __os_get_errno_ret_zero __os_get_errno_ret_zero@DB_VERSION_UNIQUE_NAME@
+#define __os_get_errno __os_get_errno@DB_VERSION_UNIQUE_NAME@
+#define __os_get_neterr __os_get_neterr@DB_VERSION_UNIQUE_NAME@
+#define __os_get_syserr __os_get_syserr@DB_VERSION_UNIQUE_NAME@
+#define __os_set_errno __os_set_errno@DB_VERSION_UNIQUE_NAME@
+#define __os_strerror __os_strerror@DB_VERSION_UNIQUE_NAME@
+#define __os_posix_err __os_posix_err@DB_VERSION_UNIQUE_NAME@
+#define __os_fileid __os_fileid@DB_VERSION_UNIQUE_NAME@
+#define __os_fdlock __os_fdlock@DB_VERSION_UNIQUE_NAME@
+#define __os_fsync __os_fsync@DB_VERSION_UNIQUE_NAME@
+#define __os_getenv __os_getenv@DB_VERSION_UNIQUE_NAME@
+#define __os_openhandle __os_openhandle@DB_VERSION_UNIQUE_NAME@
+#define __os_closehandle __os_closehandle@DB_VERSION_UNIQUE_NAME@
+#define __os_attach __os_attach@DB_VERSION_UNIQUE_NAME@
+#define __os_detach __os_detach@DB_VERSION_UNIQUE_NAME@
+#define __os_mapfile __os_mapfile@DB_VERSION_UNIQUE_NAME@
+#define __os_unmapfile __os_unmapfile@DB_VERSION_UNIQUE_NAME@
+#define __os_mkdir __os_mkdir@DB_VERSION_UNIQUE_NAME@
+#define __os_open __os_open@DB_VERSION_UNIQUE_NAME@
+#define __os_concat_path __os_concat_path@DB_VERSION_UNIQUE_NAME@
+#define __os_id __os_id@DB_VERSION_UNIQUE_NAME@
+#define __os_rename __os_rename@DB_VERSION_UNIQUE_NAME@
+#define __os_isroot __os_isroot@DB_VERSION_UNIQUE_NAME@
+#define __db_rpath __db_rpath@DB_VERSION_UNIQUE_NAME@
+#define __os_io __os_io@DB_VERSION_UNIQUE_NAME@
+#define __os_read __os_read@DB_VERSION_UNIQUE_NAME@
+#define __os_write __os_write@DB_VERSION_UNIQUE_NAME@
+#define __os_physwrite __os_physwrite@DB_VERSION_UNIQUE_NAME@
+#define __os_seek __os_seek@DB_VERSION_UNIQUE_NAME@
+#define __os_stack __os_stack@DB_VERSION_UNIQUE_NAME@
+#define __os_exists __os_exists@DB_VERSION_UNIQUE_NAME@
+#define __os_ioinfo __os_ioinfo@DB_VERSION_UNIQUE_NAME@
+#define __os_tmpdir __os_tmpdir@DB_VERSION_UNIQUE_NAME@
+#define __os_truncate __os_truncate@DB_VERSION_UNIQUE_NAME@
+#define __os_unique_id __os_unique_id@DB_VERSION_UNIQUE_NAME@
+#define __os_unlink __os_unlink@DB_VERSION_UNIQUE_NAME@
+#define __os_yield __os_yield@DB_VERSION_UNIQUE_NAME@
+#ifdef HAVE_QNX
+#define __os_qnx_region_open __os_qnx_region_open@DB_VERSION_UNIQUE_NAME@
+#endif
+#define __os_is_winnt __os_is_winnt@DB_VERSION_UNIQUE_NAME@
+#define __os_cpu_count __os_cpu_count@DB_VERSION_UNIQUE_NAME@
+#ifdef HAVE_REPLICATION_THREADS
+#define __os_get_neterr __os_get_neterr@DB_VERSION_UNIQUE_NAME@
+#endif
+#define __qam_position __qam_position@DB_VERSION_UNIQUE_NAME@
+#define __qam_pitem __qam_pitem@DB_VERSION_UNIQUE_NAME@
+#define __qam_append __qam_append@DB_VERSION_UNIQUE_NAME@
+#define __qamc_dup __qamc_dup@DB_VERSION_UNIQUE_NAME@
+#define __qamc_init __qamc_init@DB_VERSION_UNIQUE_NAME@
+#define __qam_truncate __qam_truncate@DB_VERSION_UNIQUE_NAME@
+#define __qam_delete __qam_delete@DB_VERSION_UNIQUE_NAME@
+#define __qam_incfirst_desc __qam_incfirst_desc@DB_VERSION_UNIQUE_NAME@
+#define __qam_mvptr_desc __qam_mvptr_desc@DB_VERSION_UNIQUE_NAME@
+#define __qam_del_desc __qam_del_desc@DB_VERSION_UNIQUE_NAME@
+#define __qam_add_desc __qam_add_desc@DB_VERSION_UNIQUE_NAME@
+#define __qam_delext_desc __qam_delext_desc@DB_VERSION_UNIQUE_NAME@
+#define __qam_init_recover __qam_init_recover@DB_VERSION_UNIQUE_NAME@
+#define __qam_incfirst_print __qam_incfirst_print@DB_VERSION_UNIQUE_NAME@
+#define __qam_mvptr_print __qam_mvptr_print@DB_VERSION_UNIQUE_NAME@
+#define __qam_del_print __qam_del_print@DB_VERSION_UNIQUE_NAME@
+#define __qam_add_print __qam_add_print@DB_VERSION_UNIQUE_NAME@
+#define __qam_delext_print __qam_delext_print@DB_VERSION_UNIQUE_NAME@
+#define __qam_init_print __qam_init_print@DB_VERSION_UNIQUE_NAME@
+#define __qam_mswap __qam_mswap@DB_VERSION_UNIQUE_NAME@
+#define __qam_pgin_out __qam_pgin_out@DB_VERSION_UNIQUE_NAME@
+#define __qam_fprobe __qam_fprobe@DB_VERSION_UNIQUE_NAME@
+#define __qam_fclose __qam_fclose@DB_VERSION_UNIQUE_NAME@
+#define __qam_fremove __qam_fremove@DB_VERSION_UNIQUE_NAME@
+#define __qam_sync __qam_sync@DB_VERSION_UNIQUE_NAME@
+#define __qam_gen_filelist __qam_gen_filelist@DB_VERSION_UNIQUE_NAME@
+#define __qam_extent_names __qam_extent_names@DB_VERSION_UNIQUE_NAME@
+#define __qam_exid __qam_exid@DB_VERSION_UNIQUE_NAME@
+#define __qam_nameop __qam_nameop@DB_VERSION_UNIQUE_NAME@
+#define __qam_lsn_reset __qam_lsn_reset@DB_VERSION_UNIQUE_NAME@
+#define __qam_backup_extents __qam_backup_extents@DB_VERSION_UNIQUE_NAME@
+#define __qam_db_create __qam_db_create@DB_VERSION_UNIQUE_NAME@
+#define __qam_db_close __qam_db_close@DB_VERSION_UNIQUE_NAME@
+#define __qam_get_extentsize __qam_get_extentsize@DB_VERSION_UNIQUE_NAME@
+#define __queue_pageinfo __queue_pageinfo@DB_VERSION_UNIQUE_NAME@
+#define __db_prqueue __db_prqueue@DB_VERSION_UNIQUE_NAME@
+#define __qam_remove __qam_remove@DB_VERSION_UNIQUE_NAME@
+#define __qam_rename __qam_rename@DB_VERSION_UNIQUE_NAME@
+#define __qam_map_flags __qam_map_flags@DB_VERSION_UNIQUE_NAME@
+#define __qam_set_flags __qam_set_flags@DB_VERSION_UNIQUE_NAME@
+#define __qam_open __qam_open@DB_VERSION_UNIQUE_NAME@
+#define __qam_set_ext_data __qam_set_ext_data@DB_VERSION_UNIQUE_NAME@
+#define __qam_metachk __qam_metachk@DB_VERSION_UNIQUE_NAME@
+#define __qam_new_file __qam_new_file@DB_VERSION_UNIQUE_NAME@
+#define __qam_incfirst_recover __qam_incfirst_recover@DB_VERSION_UNIQUE_NAME@
+#define __qam_mvptr_recover __qam_mvptr_recover@DB_VERSION_UNIQUE_NAME@
+#define __qam_del_recover __qam_del_recover@DB_VERSION_UNIQUE_NAME@
+#define __qam_delext_recover __qam_delext_recover@DB_VERSION_UNIQUE_NAME@
+#define __qam_add_recover __qam_add_recover@DB_VERSION_UNIQUE_NAME@
+#define __qam_stat __qam_stat@DB_VERSION_UNIQUE_NAME@
+#define __qam_stat_print __qam_stat_print@DB_VERSION_UNIQUE_NAME@
+#define __db_no_queue_am __db_no_queue_am@DB_VERSION_UNIQUE_NAME@
+#define __qam_31_qammeta __qam_31_qammeta@DB_VERSION_UNIQUE_NAME@
+#define __qam_32_qammeta __qam_32_qammeta@DB_VERSION_UNIQUE_NAME@
+#define __qam_vrfy_meta __qam_vrfy_meta@DB_VERSION_UNIQUE_NAME@
+#define __qam_meta2pgset __qam_meta2pgset@DB_VERSION_UNIQUE_NAME@
+#define __qam_vrfy_data __qam_vrfy_data@DB_VERSION_UNIQUE_NAME@
+#define __qam_vrfy_structure __qam_vrfy_structure@DB_VERSION_UNIQUE_NAME@
+#define __qam_vrfy_walkqueue __qam_vrfy_walkqueue@DB_VERSION_UNIQUE_NAME@
+#define __qam_salvage __qam_salvage@DB_VERSION_UNIQUE_NAME@
+#define __rep_bulk_marshal __rep_bulk_marshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_bulk_unmarshal __rep_bulk_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_control_marshal __rep_control_marshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_control_unmarshal __rep_control_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_egen_marshal __rep_egen_marshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_egen_unmarshal __rep_egen_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_fileinfo_marshal __rep_fileinfo_marshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_fileinfo_unmarshal __rep_fileinfo_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_fileinfo_v6_marshal __rep_fileinfo_v6_marshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_fileinfo_v6_unmarshal __rep_fileinfo_v6_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_grant_info_marshal __rep_grant_info_marshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_grant_info_unmarshal __rep_grant_info_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_logreq_marshal __rep_logreq_marshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_logreq_unmarshal __rep_logreq_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_newfile_marshal __rep_newfile_marshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_newfile_unmarshal __rep_newfile_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_update_marshal __rep_update_marshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_update_unmarshal __rep_update_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_vote_info_marshal __rep_vote_info_marshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_vote_info_unmarshal __rep_vote_info_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_vote_info_v5_marshal __rep_vote_info_v5_marshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_vote_info_v5_unmarshal __rep_vote_info_v5_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_lsn_hist_key_marshal __rep_lsn_hist_key_marshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_lsn_hist_key_unmarshal __rep_lsn_hist_key_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_lsn_hist_data_marshal __rep_lsn_hist_data_marshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_lsn_hist_data_unmarshal __rep_lsn_hist_data_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_update_req __rep_update_req@DB_VERSION_UNIQUE_NAME@
+#define __rep_page_req __rep_page_req@DB_VERSION_UNIQUE_NAME@
+#define __rep_update_setup __rep_update_setup@DB_VERSION_UNIQUE_NAME@
+#define __rep_bulk_page __rep_bulk_page@DB_VERSION_UNIQUE_NAME@
+#define __rep_page __rep_page@DB_VERSION_UNIQUE_NAME@
+#define __rep_init_cleanup __rep_init_cleanup@DB_VERSION_UNIQUE_NAME@
+#define __rep_pggap_req __rep_pggap_req@DB_VERSION_UNIQUE_NAME@
+#define __rep_finfo_alloc __rep_finfo_alloc@DB_VERSION_UNIQUE_NAME@
+#define __rep_remove_init_file __rep_remove_init_file@DB_VERSION_UNIQUE_NAME@
+#define __rep_reset_init __rep_reset_init@DB_VERSION_UNIQUE_NAME@
+#define __rep_elect_pp __rep_elect_pp@DB_VERSION_UNIQUE_NAME@
+#define __rep_elect_int __rep_elect_int@DB_VERSION_UNIQUE_NAME@
+#define __rep_vote1 __rep_vote1@DB_VERSION_UNIQUE_NAME@
+#define __rep_vote2 __rep_vote2@DB_VERSION_UNIQUE_NAME@
+#define __rep_update_grant __rep_update_grant@DB_VERSION_UNIQUE_NAME@
+#define __rep_islease_granted __rep_islease_granted@DB_VERSION_UNIQUE_NAME@
+#define __rep_lease_table_alloc __rep_lease_table_alloc@DB_VERSION_UNIQUE_NAME@
+#define __rep_lease_grant __rep_lease_grant@DB_VERSION_UNIQUE_NAME@
+#define __rep_lease_check __rep_lease_check@DB_VERSION_UNIQUE_NAME@
+#define __rep_lease_refresh __rep_lease_refresh@DB_VERSION_UNIQUE_NAME@
+#define __rep_lease_expire __rep_lease_expire@DB_VERSION_UNIQUE_NAME@
+#define __rep_lease_waittime __rep_lease_waittime@DB_VERSION_UNIQUE_NAME@
+#define __rep_allreq __rep_allreq@DB_VERSION_UNIQUE_NAME@
+#define __rep_log __rep_log@DB_VERSION_UNIQUE_NAME@
+#define __rep_bulk_log __rep_bulk_log@DB_VERSION_UNIQUE_NAME@
+#define __rep_logreq __rep_logreq@DB_VERSION_UNIQUE_NAME@
+#define __rep_loggap_req __rep_loggap_req@DB_VERSION_UNIQUE_NAME@
+#define __rep_logready __rep_logready@DB_VERSION_UNIQUE_NAME@
+#define __rep_env_create __rep_env_create@DB_VERSION_UNIQUE_NAME@
+#define __rep_env_destroy __rep_env_destroy@DB_VERSION_UNIQUE_NAME@
+#define __rep_get_config __rep_get_config@DB_VERSION_UNIQUE_NAME@
+#define __rep_set_config __rep_set_config@DB_VERSION_UNIQUE_NAME@
+#define __rep_start_pp __rep_start_pp@DB_VERSION_UNIQUE_NAME@
+#define __rep_start_int __rep_start_int@DB_VERSION_UNIQUE_NAME@
+#define __rep_open_sysdb __rep_open_sysdb@DB_VERSION_UNIQUE_NAME@
+#define __rep_client_dbinit __rep_client_dbinit@DB_VERSION_UNIQUE_NAME@
+#define __rep_get_limit __rep_get_limit@DB_VERSION_UNIQUE_NAME@
+#define __rep_set_limit __rep_set_limit@DB_VERSION_UNIQUE_NAME@
+#define __rep_set_nsites_pp __rep_set_nsites_pp@DB_VERSION_UNIQUE_NAME@
+#define __rep_set_nsites_int __rep_set_nsites_int@DB_VERSION_UNIQUE_NAME@
+#define __rep_get_nsites __rep_get_nsites@DB_VERSION_UNIQUE_NAME@
+#define __rep_set_priority __rep_set_priority@DB_VERSION_UNIQUE_NAME@
+#define __rep_get_priority __rep_get_priority@DB_VERSION_UNIQUE_NAME@
+#define __rep_set_timeout __rep_set_timeout@DB_VERSION_UNIQUE_NAME@
+#define __rep_get_timeout __rep_get_timeout@DB_VERSION_UNIQUE_NAME@
+#define __rep_get_request __rep_get_request@DB_VERSION_UNIQUE_NAME@
+#define __rep_set_request __rep_set_request@DB_VERSION_UNIQUE_NAME@
+#define __rep_set_transport_pp __rep_set_transport_pp@DB_VERSION_UNIQUE_NAME@
+#define __rep_set_transport_int __rep_set_transport_int@DB_VERSION_UNIQUE_NAME@
+#define __rep_get_clockskew __rep_get_clockskew@DB_VERSION_UNIQUE_NAME@
+#define __rep_set_clockskew __rep_set_clockskew@DB_VERSION_UNIQUE_NAME@
+#define __rep_flush __rep_flush@DB_VERSION_UNIQUE_NAME@
+#define __rep_sync __rep_sync@DB_VERSION_UNIQUE_NAME@
+#define __rep_txn_applied __rep_txn_applied@DB_VERSION_UNIQUE_NAME@
+#define __rep_process_message_pp __rep_process_message_pp@DB_VERSION_UNIQUE_NAME@
+#define __rep_process_message_int __rep_process_message_int@DB_VERSION_UNIQUE_NAME@
+#define __rep_apply __rep_apply@DB_VERSION_UNIQUE_NAME@
+#define __rep_process_txn __rep_process_txn@DB_VERSION_UNIQUE_NAME@
+#define __rep_resend_req __rep_resend_req@DB_VERSION_UNIQUE_NAME@
+#define __rep_check_doreq __rep_check_doreq@DB_VERSION_UNIQUE_NAME@
+#define __rep_check_missing __rep_check_missing@DB_VERSION_UNIQUE_NAME@
+#define __rep_open __rep_open@DB_VERSION_UNIQUE_NAME@
+#define __rep_close_diagfiles __rep_close_diagfiles@DB_VERSION_UNIQUE_NAME@
+#define __rep_env_refresh __rep_env_refresh@DB_VERSION_UNIQUE_NAME@
+#define __rep_env_close __rep_env_close@DB_VERSION_UNIQUE_NAME@
+#define __rep_preclose __rep_preclose@DB_VERSION_UNIQUE_NAME@
+#define __rep_closefiles __rep_closefiles@DB_VERSION_UNIQUE_NAME@
+#define __rep_write_egen __rep_write_egen@DB_VERSION_UNIQUE_NAME@
+#define __rep_write_gen __rep_write_gen@DB_VERSION_UNIQUE_NAME@
+#define __rep_stat_pp __rep_stat_pp@DB_VERSION_UNIQUE_NAME@
+#define __rep_stat_print_pp __rep_stat_print_pp@DB_VERSION_UNIQUE_NAME@
+#define __rep_stat_print __rep_stat_print@DB_VERSION_UNIQUE_NAME@
+#define __rep_bulk_message __rep_bulk_message@DB_VERSION_UNIQUE_NAME@
+#define __rep_send_bulk __rep_send_bulk@DB_VERSION_UNIQUE_NAME@
+#define __rep_bulk_alloc __rep_bulk_alloc@DB_VERSION_UNIQUE_NAME@
+#define __rep_bulk_free __rep_bulk_free@DB_VERSION_UNIQUE_NAME@
+#define __rep_send_message __rep_send_message@DB_VERSION_UNIQUE_NAME@
+#define __rep_new_master __rep_new_master@DB_VERSION_UNIQUE_NAME@
+#define __rep_elect_done __rep_elect_done@DB_VERSION_UNIQUE_NAME@
+#define __env_rep_enter __env_rep_enter@DB_VERSION_UNIQUE_NAME@
+#define __env_db_rep_exit __env_db_rep_exit@DB_VERSION_UNIQUE_NAME@
+#define __db_rep_enter __db_rep_enter@DB_VERSION_UNIQUE_NAME@
+#define __op_handle_enter __op_handle_enter@DB_VERSION_UNIQUE_NAME@
+#define __op_rep_enter __op_rep_enter@DB_VERSION_UNIQUE_NAME@
+#define __op_rep_exit __op_rep_exit@DB_VERSION_UNIQUE_NAME@
+#define __archive_rep_enter __archive_rep_enter@DB_VERSION_UNIQUE_NAME@
+#define __archive_rep_exit __archive_rep_exit@DB_VERSION_UNIQUE_NAME@
+#define __rep_lockout_archive __rep_lockout_archive@DB_VERSION_UNIQUE_NAME@
+#define __rep_lockout_api __rep_lockout_api@DB_VERSION_UNIQUE_NAME@
+#define __rep_take_apilockout __rep_take_apilockout@DB_VERSION_UNIQUE_NAME@
+#define __rep_clear_apilockout __rep_clear_apilockout@DB_VERSION_UNIQUE_NAME@
+#define __rep_lockout_apply __rep_lockout_apply@DB_VERSION_UNIQUE_NAME@
+#define __rep_lockout_msg __rep_lockout_msg@DB_VERSION_UNIQUE_NAME@
+#define __rep_send_throttle __rep_send_throttle@DB_VERSION_UNIQUE_NAME@
+#define __rep_msg_to_old __rep_msg_to_old@DB_VERSION_UNIQUE_NAME@
+#define __rep_msg_from_old __rep_msg_from_old@DB_VERSION_UNIQUE_NAME@
+#define __rep_print_system __rep_print_system@DB_VERSION_UNIQUE_NAME@
+#define __rep_print __rep_print@DB_VERSION_UNIQUE_NAME@
+#define __rep_print_message __rep_print_message@DB_VERSION_UNIQUE_NAME@
+#define __rep_fire_event __rep_fire_event@DB_VERSION_UNIQUE_NAME@
+#define __rep_msg __rep_msg@DB_VERSION_UNIQUE_NAME@
+#define __rep_notify_threads __rep_notify_threads@DB_VERSION_UNIQUE_NAME@
+#define __rep_check_goal __rep_check_goal@DB_VERSION_UNIQUE_NAME@
+#define __rep_log_backup __rep_log_backup@DB_VERSION_UNIQUE_NAME@
+#define __rep_get_maxpermlsn __rep_get_maxpermlsn@DB_VERSION_UNIQUE_NAME@
+#define __rep_is_internal_rep_file __rep_is_internal_rep_file@DB_VERSION_UNIQUE_NAME@
+#define __rep_get_datagen __rep_get_datagen@DB_VERSION_UNIQUE_NAME@
+#define __rep_verify __rep_verify@DB_VERSION_UNIQUE_NAME@
+#define __rep_verify_fail __rep_verify_fail@DB_VERSION_UNIQUE_NAME@
+#define __rep_verify_req __rep_verify_req@DB_VERSION_UNIQUE_NAME@
+#define __rep_dorecovery __rep_dorecovery@DB_VERSION_UNIQUE_NAME@
+#define __rep_verify_match __rep_verify_match@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_member_desc __repmgr_member_desc@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_init_recover __repmgr_init_recover@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_handshake_marshal __repmgr_handshake_marshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_handshake_unmarshal __repmgr_handshake_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_v3handshake_marshal __repmgr_v3handshake_marshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_v3handshake_unmarshal __repmgr_v3handshake_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_v2handshake_marshal __repmgr_v2handshake_marshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_v2handshake_unmarshal __repmgr_v2handshake_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_parm_refresh_marshal __repmgr_parm_refresh_marshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_parm_refresh_unmarshal __repmgr_parm_refresh_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_permlsn_marshal __repmgr_permlsn_marshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_permlsn_unmarshal __repmgr_permlsn_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_version_proposal_marshal __repmgr_version_proposal_marshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_version_proposal_unmarshal __repmgr_version_proposal_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_version_confirmation_marshal __repmgr_version_confirmation_marshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_version_confirmation_unmarshal __repmgr_version_confirmation_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_msg_hdr_marshal __repmgr_msg_hdr_marshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_msg_hdr_unmarshal __repmgr_msg_hdr_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_msg_metadata_marshal __repmgr_msg_metadata_marshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_msg_metadata_unmarshal __repmgr_msg_metadata_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_membership_key_marshal __repmgr_membership_key_marshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_membership_key_unmarshal __repmgr_membership_key_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_membership_data_marshal __repmgr_membership_data_marshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_membership_data_unmarshal __repmgr_membership_data_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_member_metadata_marshal __repmgr_member_metadata_marshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_member_metadata_unmarshal __repmgr_member_metadata_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_gm_fwd_marshal __repmgr_gm_fwd_marshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_gm_fwd_unmarshal __repmgr_gm_fwd_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_membr_vers_marshal __repmgr_membr_vers_marshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_membr_vers_unmarshal __repmgr_membr_vers_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_site_info_marshal __repmgr_site_info_marshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_site_info_unmarshal __repmgr_site_info_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_connect_reject_marshal __repmgr_connect_reject_marshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_connect_reject_unmarshal __repmgr_connect_reject_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_member_print __repmgr_member_print@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_init_print __repmgr_init_print@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_init_election __repmgr_init_election@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_claim_victory __repmgr_claim_victory@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_turn_on_elections __repmgr_turn_on_elections@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_start __repmgr_start@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_valid_config __repmgr_valid_config@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_autostart __repmgr_autostart@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_start_selector __repmgr_start_selector@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_close __repmgr_close@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_stop __repmgr_stop@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_set_ack_policy __repmgr_set_ack_policy@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_get_ack_policy __repmgr_get_ack_policy@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_env_create __repmgr_env_create@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_env_destroy __repmgr_env_destroy@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_stop_threads __repmgr_stop_threads@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_local_site __repmgr_local_site@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_channel __repmgr_channel@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_set_msg_dispatch __repmgr_set_msg_dispatch@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_send_msg __repmgr_send_msg@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_send_request __repmgr_send_request@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_send_response __repmgr_send_response@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_channel_close __repmgr_channel_close@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_channel_timeout __repmgr_channel_timeout@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_send_request_inval __repmgr_send_request_inval@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_channel_close_inval __repmgr_channel_close_inval@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_channel_timeout_inval __repmgr_channel_timeout_inval@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_join_group __repmgr_join_group@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_site __repmgr_site@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_site_by_eid __repmgr_site_by_eid@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_get_site_address __repmgr_get_site_address@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_get_eid __repmgr_get_eid@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_get_config __repmgr_get_config@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_site_config __repmgr_site_config@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_site_close __repmgr_site_close@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_msg_thread __repmgr_msg_thread@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_send_err_resp __repmgr_send_err_resp@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_handle_event __repmgr_handle_event@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_update_membership __repmgr_update_membership@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_set_gm_version __repmgr_set_gm_version@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_setup_gmdb_op __repmgr_setup_gmdb_op@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_cleanup_gmdb_op __repmgr_cleanup_gmdb_op@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_hold_master_role __repmgr_hold_master_role@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_rlse_master_role __repmgr_rlse_master_role@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_set_sites __repmgr_set_sites@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_connect __repmgr_connect@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_send __repmgr_send@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_sync_siteaddr __repmgr_sync_siteaddr@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_send_broadcast __repmgr_send_broadcast@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_send_one __repmgr_send_one@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_send_many __repmgr_send_many@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_send_own_msg __repmgr_send_own_msg@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_write_iovecs __repmgr_write_iovecs@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_bust_connection __repmgr_bust_connection@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_disable_connection __repmgr_disable_connection@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_cleanup_defunct __repmgr_cleanup_defunct@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_close_connection __repmgr_close_connection@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_decr_conn_ref __repmgr_decr_conn_ref@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_destroy_conn __repmgr_destroy_conn@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_pack_netaddr __repmgr_pack_netaddr@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_getaddr __repmgr_getaddr@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_listen __repmgr_listen@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_net_close __repmgr_net_close@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_net_destroy __repmgr_net_destroy@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_thread_start __repmgr_thread_start@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_thread_join __repmgr_thread_join@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_set_nonblock_conn __repmgr_set_nonblock_conn@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_set_nonblocking __repmgr_set_nonblocking@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_wake_waiters __repmgr_wake_waiters@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_await_cond __repmgr_await_cond@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_await_gmdbop __repmgr_await_gmdbop@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_compute_wait_deadline __repmgr_compute_wait_deadline@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_await_drain __repmgr_await_drain@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_alloc_cond __repmgr_alloc_cond@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_free_cond __repmgr_free_cond@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_env_create_pf __repmgr_env_create_pf@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_create_mutex_pf __repmgr_create_mutex_pf@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_destroy_mutex_pf __repmgr_destroy_mutex_pf@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_init __repmgr_init@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_deinit __repmgr_deinit@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_init_waiters __repmgr_init_waiters@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_destroy_waiters __repmgr_destroy_waiters@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_lock_mutex __repmgr_lock_mutex@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_unlock_mutex __repmgr_unlock_mutex@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_signal __repmgr_signal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_wake_msngers __repmgr_wake_msngers@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_wake_main_thread __repmgr_wake_main_thread@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_writev __repmgr_writev@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_readv __repmgr_readv@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_select_loop __repmgr_select_loop@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_queue_destroy __repmgr_queue_destroy@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_queue_get __repmgr_queue_get@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_queue_put __repmgr_queue_put@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_queue_size __repmgr_queue_size@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_member_recover __repmgr_member_recover@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_select_thread __repmgr_select_thread@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_bow_out __repmgr_bow_out@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_accept __repmgr_accept@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_compute_timeout __repmgr_compute_timeout@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_connected_master __repmgr_connected_master@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_check_timeouts __repmgr_check_timeouts@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_first_try_connections __repmgr_first_try_connections@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_send_v1_handshake __repmgr_send_v1_handshake@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_read_from_site __repmgr_read_from_site@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_read_conn __repmgr_read_conn@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_prepare_simple_input __repmgr_prepare_simple_input@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_send_handshake __repmgr_send_handshake@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_find_version_info __repmgr_find_version_info@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_write_some __repmgr_write_some@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_stat_pp __repmgr_stat_pp@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_stat_print_pp __repmgr_stat_print_pp@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_stat_print __repmgr_stat_print@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_site_list __repmgr_site_list@DB_VERSION_UNIQUE_NAME@
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_close __repmgr_close@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_get_ack_policy __repmgr_get_ack_policy@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_set_ack_policy __repmgr_set_ack_policy@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_site __repmgr_site@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_site_by_eid __repmgr_site_by_eid@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_local_site __repmgr_local_site@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_site_list __repmgr_site_list@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_start __repmgr_start@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_stat_pp __repmgr_stat_pp@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_stat_print_pp __repmgr_stat_print_pp@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_handle_event __repmgr_handle_event@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_channel __repmgr_channel@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_set_msg_dispatch __repmgr_set_msg_dispatch@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_init_recover __repmgr_init_recover@DB_VERSION_UNIQUE_NAME@
+#endif
+#define __repmgr_schedule_connection_attempt __repmgr_schedule_connection_attempt@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_is_server __repmgr_is_server@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_reset_for_reading __repmgr_reset_for_reading@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_new_connection __repmgr_new_connection@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_set_keepalive __repmgr_set_keepalive@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_new_site __repmgr_new_site@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_create_mutex __repmgr_create_mutex@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_destroy_mutex __repmgr_destroy_mutex@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_cleanup_netaddr __repmgr_cleanup_netaddr@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_iovec_init __repmgr_iovec_init@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_add_buffer __repmgr_add_buffer@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_add_dbt __repmgr_add_dbt@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_update_consumed __repmgr_update_consumed@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_prepare_my_addr __repmgr_prepare_my_addr@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_get_nsites __repmgr_get_nsites@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_thread_failure __repmgr_thread_failure@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_format_eid_loc __repmgr_format_eid_loc@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_format_site_loc __repmgr_format_site_loc@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_format_addr_loc __repmgr_format_addr_loc@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_repstart __repmgr_repstart@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_become_master __repmgr_become_master@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_each_connection __repmgr_each_connection@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_open __repmgr_open@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_join __repmgr_join@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_env_refresh __repmgr_env_refresh@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_share_netaddrs __repmgr_share_netaddrs@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_copy_in_added_sites __repmgr_copy_in_added_sites@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_init_new_sites __repmgr_init_new_sites@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_failchk __repmgr_failchk@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_master_is_known __repmgr_master_is_known@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_stable_lsn __repmgr_stable_lsn@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_send_sync_msg __repmgr_send_sync_msg@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_marshal_member_list __repmgr_marshal_member_list@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_refresh_membership __repmgr_refresh_membership@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_reload_gmdb __repmgr_reload_gmdb@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_gmdb_version_cmp __repmgr_gmdb_version_cmp@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_init_save __repmgr_init_save@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_init_restore __repmgr_init_restore@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_defer_op __repmgr_defer_op@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_fire_conn_err_event __repmgr_fire_conn_err_event@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_print_conn_err __repmgr_print_conn_err@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_become_client __repmgr_become_client@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_lookup_site __repmgr_lookup_site@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_find_site __repmgr_find_site@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_set_membership __repmgr_set_membership@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_bcast_parm_refresh __repmgr_bcast_parm_refresh@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_chg_prio __repmgr_chg_prio@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_bcast_own_msg __repmgr_bcast_own_msg@DB_VERSION_UNIQUE_NAME@
+#define __seq_stat __seq_stat@DB_VERSION_UNIQUE_NAME@
+#define __seq_stat_print __seq_stat_print@DB_VERSION_UNIQUE_NAME@
+#define __db_get_seq_flags_fn __db_get_seq_flags_fn@DB_VERSION_UNIQUE_NAME@
+#define bdb_HCommand bdb_HCommand@DB_VERSION_UNIQUE_NAME@
+#if DB_DBM_HSEARCH != 0
+#define bdb_NdbmOpen bdb_NdbmOpen@DB_VERSION_UNIQUE_NAME@
+#endif
+#if DB_DBM_HSEARCH != 0
+#define bdb_DbmCommand bdb_DbmCommand@DB_VERSION_UNIQUE_NAME@
+#endif
+#define ndbm_Cmd ndbm_Cmd@DB_VERSION_UNIQUE_NAME@
+#define _DbInfoDelete _DbInfoDelete@DB_VERSION_UNIQUE_NAME@
+#define db_Cmd db_Cmd@DB_VERSION_UNIQUE_NAME@
+#define tcl_CompactStat tcl_CompactStat@DB_VERSION_UNIQUE_NAME@
+#define tcl_rep_send tcl_rep_send@DB_VERSION_UNIQUE_NAME@
+#define dbc_Cmd dbc_Cmd@DB_VERSION_UNIQUE_NAME@
+#define env_Cmd env_Cmd@DB_VERSION_UNIQUE_NAME@
+#define tcl_EnvRemove tcl_EnvRemove@DB_VERSION_UNIQUE_NAME@
+#define tcl_EnvClose tcl_EnvClose@DB_VERSION_UNIQUE_NAME@
+#define tcl_EnvIdReset tcl_EnvIdReset@DB_VERSION_UNIQUE_NAME@
+#define tcl_EnvLsnReset tcl_EnvLsnReset@DB_VERSION_UNIQUE_NAME@
+#define tcl_EnvVerbose tcl_EnvVerbose@DB_VERSION_UNIQUE_NAME@
+#define tcl_EnvAttr tcl_EnvAttr@DB_VERSION_UNIQUE_NAME@
+#define tcl_EnvSetFlags tcl_EnvSetFlags@DB_VERSION_UNIQUE_NAME@
+#define tcl_EnvTest tcl_EnvTest@DB_VERSION_UNIQUE_NAME@
+#define tcl_EnvGetEncryptFlags tcl_EnvGetEncryptFlags@DB_VERSION_UNIQUE_NAME@
+#define tcl_EnvSetErrfile tcl_EnvSetErrfile@DB_VERSION_UNIQUE_NAME@
+#define tcl_EnvSetMsgfile tcl_EnvSetMsgfile@DB_VERSION_UNIQUE_NAME@
+#define tcl_EnvSetErrpfx tcl_EnvSetErrpfx@DB_VERSION_UNIQUE_NAME@
+#define tcl_EnvStatPrint tcl_EnvStatPrint@DB_VERSION_UNIQUE_NAME@
+#define _NewInfo _NewInfo@DB_VERSION_UNIQUE_NAME@
+#define _NameToPtr _NameToPtr@DB_VERSION_UNIQUE_NAME@
+#define _PtrToInfo _PtrToInfo@DB_VERSION_UNIQUE_NAME@
+#define _NameToInfo _NameToInfo@DB_VERSION_UNIQUE_NAME@
+#define _SetInfoData _SetInfoData@DB_VERSION_UNIQUE_NAME@
+#define _DeleteInfo _DeleteInfo@DB_VERSION_UNIQUE_NAME@
+#define _SetListElem _SetListElem@DB_VERSION_UNIQUE_NAME@
+#define _SetListElemInt _SetListElemInt@DB_VERSION_UNIQUE_NAME@
+#define _SetListElemWideInt _SetListElemWideInt@DB_VERSION_UNIQUE_NAME@
+#define _SetListRecnoElem _SetListRecnoElem@DB_VERSION_UNIQUE_NAME@
+#define _SetListHeapElem _SetListHeapElem@DB_VERSION_UNIQUE_NAME@
+#define _Set3DBTList _Set3DBTList@DB_VERSION_UNIQUE_NAME@
+#define _SetMultiList _SetMultiList@DB_VERSION_UNIQUE_NAME@
+#define _GetGlobPrefix _GetGlobPrefix@DB_VERSION_UNIQUE_NAME@
+#define _ReturnSetup _ReturnSetup@DB_VERSION_UNIQUE_NAME@
+#define _ErrorSetup _ErrorSetup@DB_VERSION_UNIQUE_NAME@
+#define _ErrorFunc _ErrorFunc@DB_VERSION_UNIQUE_NAME@
+#ifdef CONFIG_TEST
+#define _EventFunc _EventFunc@DB_VERSION_UNIQUE_NAME@
+#endif
+#define _GetLsn _GetLsn@DB_VERSION_UNIQUE_NAME@
+#define _GetRid _GetRid@DB_VERSION_UNIQUE_NAME@
+#define _GetUInt32 _GetUInt32@DB_VERSION_UNIQUE_NAME@
+#define _GetFlagsList _GetFlagsList@DB_VERSION_UNIQUE_NAME@
+#define _debug_check _debug_check@DB_VERSION_UNIQUE_NAME@
+#define _CopyObjBytes _CopyObjBytes@DB_VERSION_UNIQUE_NAME@
+#define tcl_LockDetect tcl_LockDetect@DB_VERSION_UNIQUE_NAME@
+#define tcl_LockGet tcl_LockGet@DB_VERSION_UNIQUE_NAME@
+#define tcl_LockStat tcl_LockStat@DB_VERSION_UNIQUE_NAME@
+#define tcl_LockStatPrint tcl_LockStatPrint@DB_VERSION_UNIQUE_NAME@
+#define tcl_LockTimeout tcl_LockTimeout@DB_VERSION_UNIQUE_NAME@
+#define tcl_LockVec tcl_LockVec@DB_VERSION_UNIQUE_NAME@
+#define tcl_LogArchive tcl_LogArchive@DB_VERSION_UNIQUE_NAME@
+#define tcl_LogCompare tcl_LogCompare@DB_VERSION_UNIQUE_NAME@
+#define tcl_LogFile tcl_LogFile@DB_VERSION_UNIQUE_NAME@
+#define tcl_LogFlush tcl_LogFlush@DB_VERSION_UNIQUE_NAME@
+#define tcl_LogGet tcl_LogGet@DB_VERSION_UNIQUE_NAME@
+#define tcl_LogPut tcl_LogPut@DB_VERSION_UNIQUE_NAME@
+#define tcl_LogStat tcl_LogStat@DB_VERSION_UNIQUE_NAME@
+#define tcl_LogStatPrint tcl_LogStatPrint@DB_VERSION_UNIQUE_NAME@
+#define logc_Cmd logc_Cmd@DB_VERSION_UNIQUE_NAME@
+#define tcl_LogConfig tcl_LogConfig@DB_VERSION_UNIQUE_NAME@
+#define tcl_LogGetConfig tcl_LogGetConfig@DB_VERSION_UNIQUE_NAME@
+#define _MpInfoDelete _MpInfoDelete@DB_VERSION_UNIQUE_NAME@
+#define tcl_MpSync tcl_MpSync@DB_VERSION_UNIQUE_NAME@
+#define tcl_MpTrickle tcl_MpTrickle@DB_VERSION_UNIQUE_NAME@
+#define tcl_Mp tcl_Mp@DB_VERSION_UNIQUE_NAME@
+#define tcl_MpStat tcl_MpStat@DB_VERSION_UNIQUE_NAME@
+#define tcl_MpStatPrint tcl_MpStatPrint@DB_VERSION_UNIQUE_NAME@
+#define tcl_Mutex tcl_Mutex@DB_VERSION_UNIQUE_NAME@
+#define tcl_MutFree tcl_MutFree@DB_VERSION_UNIQUE_NAME@
+#define tcl_MutGet tcl_MutGet@DB_VERSION_UNIQUE_NAME@
+#define tcl_MutLock tcl_MutLock@DB_VERSION_UNIQUE_NAME@
+#define tcl_MutSet tcl_MutSet@DB_VERSION_UNIQUE_NAME@
+#define tcl_MutStat tcl_MutStat@DB_VERSION_UNIQUE_NAME@
+#define tcl_MutStatPrint tcl_MutStatPrint@DB_VERSION_UNIQUE_NAME@
+#define tcl_MutUnlock tcl_MutUnlock@DB_VERSION_UNIQUE_NAME@
+#define tcl_RepConfig tcl_RepConfig@DB_VERSION_UNIQUE_NAME@
+#define tcl_RepGetTwo tcl_RepGetTwo@DB_VERSION_UNIQUE_NAME@
+#define tcl_RepGetConfig tcl_RepGetConfig@DB_VERSION_UNIQUE_NAME@
+#define tcl_RepGetTimeout tcl_RepGetTimeout@DB_VERSION_UNIQUE_NAME@
+#define tcl_RepGetAckPolicy tcl_RepGetAckPolicy@DB_VERSION_UNIQUE_NAME@
+#define tcl_RepGetLocalSite tcl_RepGetLocalSite@DB_VERSION_UNIQUE_NAME@
+#define tcl_RepElect tcl_RepElect@DB_VERSION_UNIQUE_NAME@
+#define tcl_RepFlush tcl_RepFlush@DB_VERSION_UNIQUE_NAME@
+#define tcl_RepSync tcl_RepSync@DB_VERSION_UNIQUE_NAME@
+#define tcl_RepLease tcl_RepLease@DB_VERSION_UNIQUE_NAME@
+#define tcl_RepInmemFiles tcl_RepInmemFiles@DB_VERSION_UNIQUE_NAME@
+#define tcl_RepLimit tcl_RepLimit@DB_VERSION_UNIQUE_NAME@
+#define tcl_RepNSites tcl_RepNSites@DB_VERSION_UNIQUE_NAME@
+#define tcl_RepRequest tcl_RepRequest@DB_VERSION_UNIQUE_NAME@
+#define tcl_RepNoarchiveTimeout tcl_RepNoarchiveTimeout@DB_VERSION_UNIQUE_NAME@
+#define tcl_RepTransport tcl_RepTransport@DB_VERSION_UNIQUE_NAME@
+#define tcl_RepStart tcl_RepStart@DB_VERSION_UNIQUE_NAME@
+#define tcl_RepProcessMessage tcl_RepProcessMessage@DB_VERSION_UNIQUE_NAME@
+#define tcl_RepStat tcl_RepStat@DB_VERSION_UNIQUE_NAME@
+#define tcl_RepStatPrint tcl_RepStatPrint@DB_VERSION_UNIQUE_NAME@
+#define tcl_RepMgr tcl_RepMgr@DB_VERSION_UNIQUE_NAME@
+#define tcl_RepMgrSiteList tcl_RepMgrSiteList@DB_VERSION_UNIQUE_NAME@
+#define tcl_RepMgrStat tcl_RepMgrStat@DB_VERSION_UNIQUE_NAME@
+#define tcl_RepMgrStatPrint tcl_RepMgrStatPrint@DB_VERSION_UNIQUE_NAME@
+#define tcl_RepApplied tcl_RepApplied@DB_VERSION_UNIQUE_NAME@
+#define seq_Cmd seq_Cmd@DB_VERSION_UNIQUE_NAME@
+#define _TxnInfoDelete _TxnInfoDelete@DB_VERSION_UNIQUE_NAME@
+#define tcl_TxnCheckpoint tcl_TxnCheckpoint@DB_VERSION_UNIQUE_NAME@
+#define tcl_Txn tcl_Txn@DB_VERSION_UNIQUE_NAME@
+#define tcl_CDSGroup tcl_CDSGroup@DB_VERSION_UNIQUE_NAME@
+#define tcl_TxnStat tcl_TxnStat@DB_VERSION_UNIQUE_NAME@
+#define tcl_TxnStatPrint tcl_TxnStatPrint@DB_VERSION_UNIQUE_NAME@
+#define tcl_TxnTimeout tcl_TxnTimeout@DB_VERSION_UNIQUE_NAME@
+#define tcl_TxnRecover tcl_TxnRecover@DB_VERSION_UNIQUE_NAME@
+#define bdb_RandCommand bdb_RandCommand@DB_VERSION_UNIQUE_NAME@
+#define tcl_LockMutex tcl_LockMutex@DB_VERSION_UNIQUE_NAME@
+#define tcl_UnlockMutex tcl_UnlockMutex@DB_VERSION_UNIQUE_NAME@
+#define __txn_begin_pp __txn_begin_pp@DB_VERSION_UNIQUE_NAME@
+#define __txn_begin __txn_begin@DB_VERSION_UNIQUE_NAME@
+#define __txn_recycle_id __txn_recycle_id@DB_VERSION_UNIQUE_NAME@
+#define __txn_continue __txn_continue@DB_VERSION_UNIQUE_NAME@
+#define __txn_commit __txn_commit@DB_VERSION_UNIQUE_NAME@
+#define __txn_abort __txn_abort@DB_VERSION_UNIQUE_NAME@
+#define __txn_discard_int __txn_discard_int@DB_VERSION_UNIQUE_NAME@
+#define __txn_prepare __txn_prepare@DB_VERSION_UNIQUE_NAME@
+#define __txn_id __txn_id@DB_VERSION_UNIQUE_NAME@
+#define __txn_get_name __txn_get_name@DB_VERSION_UNIQUE_NAME@
+#define __txn_set_name __txn_set_name@DB_VERSION_UNIQUE_NAME@
+#define __txn_get_priority __txn_get_priority@DB_VERSION_UNIQUE_NAME@
+#define __txn_set_priority __txn_set_priority@DB_VERSION_UNIQUE_NAME@
+#define __txn_set_timeout __txn_set_timeout@DB_VERSION_UNIQUE_NAME@
+#define __txn_activekids __txn_activekids@DB_VERSION_UNIQUE_NAME@
+#define __txn_force_abort __txn_force_abort@DB_VERSION_UNIQUE_NAME@
+#define __txn_preclose __txn_preclose@DB_VERSION_UNIQUE_NAME@
+#define __txn_reset __txn_reset@DB_VERSION_UNIQUE_NAME@
+#define __txn_applied_pp __txn_applied_pp@DB_VERSION_UNIQUE_NAME@
+#define __txn_regop_42_desc __txn_regop_42_desc@DB_VERSION_UNIQUE_NAME@
+#define __txn_regop_desc __txn_regop_desc@DB_VERSION_UNIQUE_NAME@
+#define __txn_ckp_42_desc __txn_ckp_42_desc@DB_VERSION_UNIQUE_NAME@
+#define __txn_ckp_desc __txn_ckp_desc@DB_VERSION_UNIQUE_NAME@
+#define __txn_child_desc __txn_child_desc@DB_VERSION_UNIQUE_NAME@
+#define __txn_xa_regop_42_desc __txn_xa_regop_42_desc@DB_VERSION_UNIQUE_NAME@
+#define __txn_prepare_desc __txn_prepare_desc@DB_VERSION_UNIQUE_NAME@
+#define __txn_recycle_desc __txn_recycle_desc@DB_VERSION_UNIQUE_NAME@
+#define __txn_init_recover __txn_init_recover@DB_VERSION_UNIQUE_NAME@
+#define __txn_regop_42_print __txn_regop_42_print@DB_VERSION_UNIQUE_NAME@
+#define __txn_regop_print __txn_regop_print@DB_VERSION_UNIQUE_NAME@
+#define __txn_ckp_42_print __txn_ckp_42_print@DB_VERSION_UNIQUE_NAME@
+#define __txn_ckp_print __txn_ckp_print@DB_VERSION_UNIQUE_NAME@
+#define __txn_child_print __txn_child_print@DB_VERSION_UNIQUE_NAME@
+#define __txn_xa_regop_42_print __txn_xa_regop_42_print@DB_VERSION_UNIQUE_NAME@
+#define __txn_prepare_print __txn_prepare_print@DB_VERSION_UNIQUE_NAME@
+#define __txn_recycle_print __txn_recycle_print@DB_VERSION_UNIQUE_NAME@
+#define __txn_init_print __txn_init_print@DB_VERSION_UNIQUE_NAME@
+#define __txn_checkpoint_pp __txn_checkpoint_pp@DB_VERSION_UNIQUE_NAME@
+#define __txn_checkpoint __txn_checkpoint@DB_VERSION_UNIQUE_NAME@
+#define __txn_getactive __txn_getactive@DB_VERSION_UNIQUE_NAME@
+#define __txn_getckp __txn_getckp@DB_VERSION_UNIQUE_NAME@
+#define __txn_updateckp __txn_updateckp@DB_VERSION_UNIQUE_NAME@
+#define __txn_failchk __txn_failchk@DB_VERSION_UNIQUE_NAME@
+#define __txn_env_create __txn_env_create@DB_VERSION_UNIQUE_NAME@
+#define __txn_env_destroy __txn_env_destroy@DB_VERSION_UNIQUE_NAME@
+#define __txn_get_tx_max __txn_get_tx_max@DB_VERSION_UNIQUE_NAME@
+#define __txn_set_tx_max __txn_set_tx_max@DB_VERSION_UNIQUE_NAME@
+#define __txn_get_tx_timestamp __txn_get_tx_timestamp@DB_VERSION_UNIQUE_NAME@
+#define __txn_set_tx_timestamp __txn_set_tx_timestamp@DB_VERSION_UNIQUE_NAME@
+#define __txn_regop_recover __txn_regop_recover@DB_VERSION_UNIQUE_NAME@
+#define __txn_prepare_recover __txn_prepare_recover@DB_VERSION_UNIQUE_NAME@
+#define __txn_ckp_recover __txn_ckp_recover@DB_VERSION_UNIQUE_NAME@
+#define __txn_child_recover __txn_child_recover@DB_VERSION_UNIQUE_NAME@
+#define __txn_restore_txn __txn_restore_txn@DB_VERSION_UNIQUE_NAME@
+#define __txn_recycle_recover __txn_recycle_recover@DB_VERSION_UNIQUE_NAME@
+#define __txn_regop_42_recover __txn_regop_42_recover@DB_VERSION_UNIQUE_NAME@
+#define __txn_ckp_42_recover __txn_ckp_42_recover@DB_VERSION_UNIQUE_NAME@
+#define __txn_recover_pp __txn_recover_pp@DB_VERSION_UNIQUE_NAME@
+#define __txn_recover __txn_recover@DB_VERSION_UNIQUE_NAME@
+#define __txn_get_prepared __txn_get_prepared@DB_VERSION_UNIQUE_NAME@
+#define __txn_openfiles __txn_openfiles@DB_VERSION_UNIQUE_NAME@
+#define __txn_open __txn_open@DB_VERSION_UNIQUE_NAME@
+#define __txn_findlastckp __txn_findlastckp@DB_VERSION_UNIQUE_NAME@
+#define __txn_env_refresh __txn_env_refresh@DB_VERSION_UNIQUE_NAME@
+#define __txn_region_mutex_count __txn_region_mutex_count@DB_VERSION_UNIQUE_NAME@
+#define __txn_region_mutex_max __txn_region_mutex_max@DB_VERSION_UNIQUE_NAME@
+#define __txn_region_size __txn_region_size@DB_VERSION_UNIQUE_NAME@
+#define __txn_region_max __txn_region_max@DB_VERSION_UNIQUE_NAME@
+#define __txn_id_set __txn_id_set@DB_VERSION_UNIQUE_NAME@
+#define __txn_oldest_reader __txn_oldest_reader@DB_VERSION_UNIQUE_NAME@
+#define __txn_add_buffer __txn_add_buffer@DB_VERSION_UNIQUE_NAME@
+#define __txn_remove_buffer __txn_remove_buffer@DB_VERSION_UNIQUE_NAME@
+#define __txn_stat_pp __txn_stat_pp@DB_VERSION_UNIQUE_NAME@
+#define __txn_stat_print_pp __txn_stat_print_pp@DB_VERSION_UNIQUE_NAME@
+#define __txn_stat_print __txn_stat_print@DB_VERSION_UNIQUE_NAME@
+#define __txn_closeevent __txn_closeevent@DB_VERSION_UNIQUE_NAME@
+#define __txn_remevent __txn_remevent@DB_VERSION_UNIQUE_NAME@
+#define __txn_remrem __txn_remrem@DB_VERSION_UNIQUE_NAME@
+#define __txn_lockevent __txn_lockevent@DB_VERSION_UNIQUE_NAME@
+#define __txn_remlock __txn_remlock@DB_VERSION_UNIQUE_NAME@
+#define __txn_doevents __txn_doevents@DB_VERSION_UNIQUE_NAME@
+#define __txn_record_fname __txn_record_fname@DB_VERSION_UNIQUE_NAME@
+#define __txn_dref_fname __txn_dref_fname@DB_VERSION_UNIQUE_NAME@
+#define __txn_reset_fe_watermarks __txn_reset_fe_watermarks@DB_VERSION_UNIQUE_NAME@
+#define __txn_remove_fe_watermark __txn_remove_fe_watermark@DB_VERSION_UNIQUE_NAME@
+#define __txn_add_fe_watermark __txn_add_fe_watermark@DB_VERSION_UNIQUE_NAME@
+#define __txn_flush_fe_files __txn_flush_fe_files@DB_VERSION_UNIQUE_NAME@
+#define __txn_pg_above_fe_watermark __txn_pg_above_fe_watermark@DB_VERSION_UNIQUE_NAME@
+#define __db_rmid_to_env __db_rmid_to_env@DB_VERSION_UNIQUE_NAME@
+#define __db_xid_to_txn __db_xid_to_txn@DB_VERSION_UNIQUE_NAME@
+#define __db_map_rmid __db_map_rmid@DB_VERSION_UNIQUE_NAME@
+#define __db_unmap_rmid __db_unmap_rmid@DB_VERSION_UNIQUE_NAME@
+#define __db_unmap_xid __db_unmap_xid@DB_VERSION_UNIQUE_NAME@
+#define __db_global_values __db_global_values@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_guesstimated_max __repmgr_guesstimated_max@DB_VERSION_UNIQUE_NAME@
+#define db_xa_switch db_xa_switch@DB_VERSION_UNIQUE_NAME@
+
+#endif /* !_DB_INT_DEF_IN_ */
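The long run of defines above implements Berkeley DB's symbol-versioning scheme: every internal symbol is rewritten to carry the @DB_VERSION_UNIQUE_NAME@ autoconf placeholder, which configure replaces with a per-release suffix so that differently versioned copies of the library can be linked into one process without symbol collisions. A minimal sketch of the effect, assuming configure substitutes a hypothetical suffix _5003 (the suffix and the simplified signature below are illustrative, not taken from this diff):

/*
 * Illustrative only: configure turns a line such as
 *     #define __txn_begin __txn_begin@DB_VERSION_UNIQUE_NAME@
 * into the following, so every reference to __txn_begin compiles
 * and links against the suffixed symbol.
 */
#define __txn_begin __txn_begin_5003

int __txn_begin_5003(void) { return 0; }   /* stand-in body, real signature differs */

static int start_txn(void)
{
    return __txn_begin();   /* preprocesses to __txn_begin_5003() */
}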
diff --git a/src/dbinc_auto/lock_ext.h b/src/dbinc_auto/lock_ext.h
new file mode 100644
index 00000000..d5981e18
--- /dev/null
+++ b/src/dbinc_auto/lock_ext.h
@@ -0,0 +1,78 @@
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _lock_ext_h_
+#define _lock_ext_h_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+int __lock_vec_pp __P((DB_ENV *, u_int32_t, u_int32_t, DB_LOCKREQ *, int, DB_LOCKREQ **));
+int __lock_vec __P((ENV *, DB_LOCKER *, u_int32_t, DB_LOCKREQ *, int, DB_LOCKREQ **));
+int __lock_get_pp __P((DB_ENV *, u_int32_t, u_int32_t, DBT *, db_lockmode_t, DB_LOCK *));
+int __lock_get __P((ENV *, DB_LOCKER *, u_int32_t, const DBT *, db_lockmode_t, DB_LOCK *));
+int __lock_get_internal __P((DB_LOCKTAB *, DB_LOCKER *, u_int32_t, const DBT *, db_lockmode_t, db_timeout_t, DB_LOCK *));
+int __lock_put_pp __P((DB_ENV *, DB_LOCK *));
+int __lock_put __P((ENV *, DB_LOCK *));
+int __lock_downgrade __P((ENV *, DB_LOCK *, db_lockmode_t, u_int32_t));
+int __lock_locker_same_family __P((ENV *, DB_LOCKER *, DB_LOCKER *, int *));
+int __lock_wakeup __P((ENV *, const DBT *));
+int __lock_promote __P((DB_LOCKTAB *, DB_LOCKOBJ *, int *, u_int32_t));
+int __lock_change __P((ENV *, DB_LOCK *, DB_LOCK *));
+int __lock_detect_pp __P((DB_ENV *, u_int32_t, u_int32_t, int *));
+int __lock_detect __P((ENV *, u_int32_t, int *));
+int __lock_failchk __P((ENV *));
+int __lock_id_pp __P((DB_ENV *, u_int32_t *));
+int __lock_id __P((ENV *, u_int32_t *, DB_LOCKER **));
+void __lock_set_thread_id __P((void *, pid_t, db_threadid_t));
+int __lock_id_free_pp __P((DB_ENV *, u_int32_t));
+int __lock_id_free __P((ENV *, DB_LOCKER *));
+int __lock_id_set __P((ENV *, u_int32_t, u_int32_t));
+int __lock_getlocker __P((DB_LOCKTAB *, u_int32_t, int, DB_LOCKER **));
+int __lock_getlocker_int __P((DB_LOCKTAB *, u_int32_t, int, DB_LOCKER **));
+int __lock_addfamilylocker __P((ENV *, u_int32_t, u_int32_t, u_int32_t));
+int __lock_freelocker __P((DB_LOCKTAB *, DB_LOCKER *));
+int __lock_familyremove __P((DB_LOCKTAB *, DB_LOCKER *));
+int __lock_fix_list __P((ENV *, DBT *, u_int32_t));
+int __lock_get_list __P((ENV *, DB_LOCKER *, u_int32_t, db_lockmode_t, DBT *));
+void __lock_list_print __P((ENV *, DB_MSGBUF *, DBT *));
+int __lock_env_create __P((DB_ENV *));
+void __lock_env_destroy __P((DB_ENV *));
+int __lock_get_lk_conflicts __P((DB_ENV *, const u_int8_t **, int *));
+int __lock_set_lk_conflicts __P((DB_ENV *, u_int8_t *, int));
+int __lock_get_lk_detect __P((DB_ENV *, u_int32_t *));
+int __lock_set_lk_detect __P((DB_ENV *, u_int32_t));
+int __lock_get_lk_max_locks __P((DB_ENV *, u_int32_t *));
+int __lock_set_lk_max_locks __P((DB_ENV *, u_int32_t));
+int __lock_get_lk_max_lockers __P((DB_ENV *, u_int32_t *));
+int __lock_set_lk_max_lockers __P((DB_ENV *, u_int32_t));
+int __lock_get_lk_max_objects __P((DB_ENV *, u_int32_t *));
+int __lock_set_lk_max_objects __P((DB_ENV *, u_int32_t));
+int __lock_get_lk_partitions __P((DB_ENV *, u_int32_t *));
+int __lock_set_lk_partitions __P((DB_ENV *, u_int32_t));
+int __lock_get_lk_tablesize __P((DB_ENV *, u_int32_t *));
+int __lock_set_lk_tablesize __P((DB_ENV *, u_int32_t));
+int __lock_set_lk_priority __P((DB_ENV *, u_int32_t, u_int32_t));
+int __lock_get_lk_priority __P((DB_ENV *, u_int32_t, u_int32_t *));
+int __lock_get_env_timeout __P((DB_ENV *, db_timeout_t *, u_int32_t));
+int __lock_set_env_timeout __P((DB_ENV *, db_timeout_t, u_int32_t));
+int __lock_open __P((ENV *));
+int __lock_env_refresh __P((ENV *));
+u_int32_t __lock_region_mutex_count __P((ENV *));
+u_int32_t __lock_region_mutex_max __P((ENV *));
+size_t __lock_region_max __P((ENV *));
+size_t __lock_region_size __P((ENV *, size_t));
+int __lock_stat_pp __P((DB_ENV *, DB_LOCK_STAT **, u_int32_t));
+int __lock_stat_print_pp __P((DB_ENV *, u_int32_t));
+int __lock_stat_print __P((ENV *, u_int32_t));
+void __lock_printlock __P((DB_LOCKTAB *, DB_MSGBUF *mbp, struct __db_lock *, int));
+int __lock_set_timeout __P((ENV *, DB_LOCKER *, db_timeout_t, u_int32_t));
+int __lock_set_timeout_internal __P((ENV *, DB_LOCKER *, db_timeout_t, u_int32_t));
+int __lock_inherit_timeout __P((ENV *, DB_LOCKER *, DB_LOCKER *));
+u_int32_t __lock_ohash __P((const DBT *));
+u_int32_t __lock_lhash __P((DB_LOCKOBJ *));
+int __lock_nomem __P((ENV *, const char *));
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_lock_ext_h_ */
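Every prototype in these generated headers is wrapped in __P((...)), the old BSD portability macro that keeps the declarations usable under pre-ANSI compilers. A sketch of the conventional definition follows; the actual definition lives elsewhere in the tree, and example_fn is a hypothetical name used only for illustration:

#if defined(__STDC__) || defined(__cplusplus)
#define __P(protos) protos     /* ANSI C: keep the full prototype */
#else
#define __P(protos) ()         /* K&R C: reduce to an empty parameter list */
#endif

int example_fn __P((int, char *));
/* ANSI C expands to: int example_fn(int, char *);
 * K&R C expands to:  int example_fn(); */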
diff --git a/src/dbinc_auto/log_ext.h b/src/dbinc_auto/log_ext.h
new file mode 100644
index 00000000..dde6742d
--- /dev/null
+++ b/src/dbinc_auto/log_ext.h
@@ -0,0 +1,208 @@
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _log_ext_h_
+#define _log_ext_h_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+int __log_open __P((ENV *));
+int __log_find __P((DB_LOG *, int, u_int32_t *, logfile_validity *));
+int __log_valid __P((DB_LOG *, u_int32_t, int, DB_FH **, u_int32_t, logfile_validity *, u_int32_t *));
+int __log_env_refresh __P((ENV *));
+int __log_get_cached_ckp_lsn __P((ENV *, DB_LSN *));
+u_int32_t __log_region_mutex_count __P((ENV *));
+u_int32_t __log_region_mutex_max __P((ENV *));
+size_t __log_region_size __P((ENV *));
+size_t __log_region_max __P((ENV *));
+int __log_vtruncate __P((ENV *, DB_LSN *, DB_LSN *, DB_LSN *));
+int __log_is_outdated __P((ENV *, u_int32_t, int *));
+int __log_zero __P((ENV *, DB_LSN *));
+int __log_inmem_lsnoff __P((DB_LOG *, DB_LSN *, size_t *));
+int __log_inmem_newfile __P((DB_LOG *, u_int32_t));
+int __log_inmem_chkspace __P((DB_LOG *, size_t));
+void __log_inmem_copyout __P((DB_LOG *, size_t, void *, size_t));
+void __log_inmem_copyin __P((DB_LOG *, size_t, void *, size_t));
+void __log_set_version __P((ENV *, u_int32_t));
+int __log_get_oldversion __P((ENV *, u_int32_t *));
+int __log_archive_pp __P((DB_ENV *, char **[], u_int32_t));
+int __log_archive __P((ENV *, char **[], u_int32_t));
+int __log_get_stable_lsn __P((ENV *, DB_LSN *, int));
+void __log_autoremove __P((ENV *));
+int __log_check_page_lsn __P((ENV *, DB *, DB_LSN *));
+int __log_printf_capi __P((DB_ENV *, DB_TXN *, const char *, ...)) __attribute__ ((__format__ (__printf__, 3, 4)));
+int __log_printf_pp __P((DB_ENV *, DB_TXN *, const char *, va_list));
+int __log_printf __P((ENV *, DB_TXN *, const char *, ...)) __attribute__ ((__format__ (__printf__, 3, 4)));
+int __log_cursor_pp __P((DB_ENV *, DB_LOGC **, u_int32_t));
+int __log_cursor __P((ENV *, DB_LOGC **));
+int __logc_close __P((DB_LOGC *));
+int __logc_version __P((DB_LOGC *, u_int32_t *));
+int __logc_get __P((DB_LOGC *, DB_LSN *, DBT *, u_int32_t));
+void __log_hdrswap __P((HDR *, int));
+void __log_persistswap __P((LOGP *));
+int __log_read_record_pp __P((DB_ENV *, DB **, void *, void *, DB_LOG_RECSPEC *, u_int32_t, void **));
+int __log_read_record __P((ENV *, DB **, void *, void *, DB_LOG_RECSPEC *, u_int32_t, void **));
+int __log_env_create __P((DB_ENV *));
+void __log_env_destroy __P((DB_ENV *));
+int __log_get_lg_bsize __P((DB_ENV *, u_int32_t *));
+int __log_set_lg_bsize __P((DB_ENV *, u_int32_t));
+int __log_get_lg_filemode __P((DB_ENV *, int *));
+int __log_set_lg_filemode __P((DB_ENV *, int));
+int __log_get_lg_max __P((DB_ENV *, u_int32_t *));
+int __log_set_lg_max __P((DB_ENV *, u_int32_t));
+int __log_get_lg_regionmax __P((DB_ENV *, u_int32_t *));
+int __log_set_lg_regionmax __P((DB_ENV *, u_int32_t));
+int __log_get_lg_dir __P((DB_ENV *, const char **));
+int __log_set_lg_dir __P((DB_ENV *, const char *));
+void __log_get_flags __P((DB_ENV *, u_int32_t *));
+void __log_set_flags __P((ENV *, u_int32_t, int));
+int __log_get_config __P((DB_ENV *, u_int32_t, int *));
+int __log_set_config __P((DB_ENV *, u_int32_t, int));
+int __log_set_config_int __P((DB_ENV *, u_int32_t, int, int));
+int __log_check_sizes __P((ENV *, u_int32_t, u_int32_t));
+int __log_print_record __P((ENV *, DBT *, DB_LSN *, char *, DB_LOG_RECSPEC *, void *));
+int __log_put_pp __P((DB_ENV *, DB_LSN *, const DBT *, u_int32_t));
+int __log_put __P((ENV *, DB_LSN *, const DBT *, u_int32_t));
+int __log_current_lsn_int __P((ENV *, DB_LSN *, u_int32_t *, u_int32_t *));
+int __log_current_lsn __P((ENV *, DB_LSN *, u_int32_t *, u_int32_t *));
+int __log_newfile __P((DB_LOG *, DB_LSN *, u_int32_t, u_int32_t));
+int __log_flush_pp __P((DB_ENV *, const DB_LSN *));
+int __log_flush __P((ENV *, const DB_LSN *));
+int __log_flush_int __P((DB_LOG *, const DB_LSN *, int));
+int __log_file_pp __P((DB_ENV *, const DB_LSN *, char *, size_t));
+int __log_name __P((DB_LOG *, u_int32_t, char **, DB_FH **, u_int32_t));
+int __log_rep_put __P((ENV *, DB_LSN *, const DBT *, u_int32_t));
+int __log_put_record_pp __P((DB_ENV *, DB *, DB_TXN *, DB_LSN *, u_int32_t, u_int32_t, u_int32_t, u_int32_t, DB_LOG_RECSPEC *, ...));
+int __log_put_record __P((ENV *, DB *, DB_TXN *, DB_LSN *, u_int32_t, u_int32_t, u_int32_t, u_int32_t, DB_LOG_RECSPEC *, ...));
+int __log_stat_pp __P((DB_ENV *, DB_LOG_STAT **, u_int32_t));
+int __log_stat_print_pp __P((DB_ENV *, u_int32_t));
+int __log_stat_print __P((ENV *, u_int32_t));
+int __log_verify_pp __P((DB_ENV *, const DB_LOG_VERIFY_CONFIG *));
+int __log_verify __P((DB_ENV *, const DB_LOG_VERIFY_CONFIG *, DB_THREAD_INFO *));
+int __log_verify_wrap __P((ENV *, const char *, u_int32_t, const char *, const char *, time_t, time_t, u_int32_t, u_int32_t, u_int32_t, u_int32_t, int, int));
+int __crdel_init_verify __P((ENV *, DB_DISTAB *));
+int __db_init_verify __P((ENV *, DB_DISTAB *));
+int __dbreg_init_verify __P((ENV *, DB_DISTAB *));
+int __bam_init_verify __P((ENV *, DB_DISTAB *));
+int __fop_init_verify __P((ENV *, DB_DISTAB *));
+int __ham_init_verify __P((ENV *, DB_DISTAB *));
+int __heap_init_verify __P((ENV *, DB_DISTAB *));
+int __qam_init_verify __P((ENV *, DB_DISTAB *));
+int __txn_init_verify __P((ENV *, DB_DISTAB *));
+void __db_log_verify_global_report __P((const DB_LOG_VRFY_INFO *));
+int __crdel_metasub_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __crdel_inmem_create_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __crdel_inmem_rename_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __crdel_inmem_remove_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_addrem_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_big_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_ovref_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_relink_42_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_debug_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_noop_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_alloc_42_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_alloc_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_free_42_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_free_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_cksum_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_freedata_42_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_freedata_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_init_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_sort_44_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pg_trunc_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_realloc_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_relink_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_merge_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __db_pgno_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __dbreg_register_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_split_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_split_42_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_rsplit_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_adj_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_irep_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_cadjust_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_cdel_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_repl_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_root_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_curadj_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_rcuradj_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_relink_43_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __bam_merge_44_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_create_42_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_create_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_remove_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_write_42_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_write_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_rename_42_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_rename_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_file_remove_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_insdel_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_newpage_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_splitdata_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_replace_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_copypage_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_metagroup_42_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_metagroup_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_groupalloc_42_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_groupalloc_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_changeslot_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_contract_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_curadj_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __ham_chgpg_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __heap_addrem_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __heap_pg_alloc_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __heap_trunc_meta_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __heap_trunc_page_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __qam_incfirst_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __qam_mvptr_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __qam_del_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __qam_add_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __qam_delext_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __txn_regop_42_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __txn_regop_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __txn_ckp_42_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __txn_ckp_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __txn_child_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __txn_xa_regop_42_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __txn_prepare_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __txn_recycle_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __create_log_vrfy_info __P((const DB_LOG_VERIFY_CONFIG *, DB_LOG_VRFY_INFO **, DB_THREAD_INFO *));
+int __destroy_log_vrfy_info __P((DB_LOG_VRFY_INFO *));
+int __put_txn_vrfy_info __P((const DB_LOG_VRFY_INFO *, const VRFY_TXN_INFO *));
+int __get_txn_vrfy_info __P((const DB_LOG_VRFY_INFO *, u_int32_t, VRFY_TXN_INFO **));
+int __add_recycle_lsn_range __P((DB_LOG_VRFY_INFO *, const DB_LSN *, u_int32_t, u_int32_t));
+int __iterate_txninfo __P((DB_LOG_VRFY_INFO *, u_int32_t, u_int32_t, TXNINFO_HANDLER, void *));
+int __rem_last_recycle_lsn __P((VRFY_TXN_INFO *));
+int __add_file_updated __P((VRFY_TXN_INFO *, const DBT *, int32_t));
+int __del_file_updated __P((VRFY_TXN_INFO *, const DBT *));
+int __clear_fileups __P((VRFY_TXN_INFO *));
+int __free_txninfo_stack __P((VRFY_TXN_INFO *));
+int __free_txninfo __P((VRFY_TXN_INFO *));
+int __put_filereg_info __P((const DB_LOG_VRFY_INFO *, const VRFY_FILEREG_INFO *));
+int __del_filelife __P((const DB_LOG_VRFY_INFO *, int32_t));
+int __put_filelife __P((const DB_LOG_VRFY_INFO *, VRFY_FILELIFE *));
+int __get_filelife __P((const DB_LOG_VRFY_INFO *, int32_t, VRFY_FILELIFE **));
+int __get_filereg_by_dbregid __P((const DB_LOG_VRFY_INFO *, int32_t, VRFY_FILEREG_INFO **));
+int __add_dbregid __P((DB_LOG_VRFY_INFO *, VRFY_FILEREG_INFO *, int32_t, u_int32_t, DB_LSN, DBTYPE, db_pgno_t, int *));
+int __get_filereg_info __P((const DB_LOG_VRFY_INFO *, const DBT *, VRFY_FILEREG_INFO **));
+int __free_filereg_info __P((VRFY_FILEREG_INFO *));
+int __get_ckp_info __P((const DB_LOG_VRFY_INFO *, DB_LSN, VRFY_CKP_INFO **));
+int __get_last_ckp_info __P((const DB_LOG_VRFY_INFO *, VRFY_CKP_INFO **));
+int __put_ckp_info __P((const DB_LOG_VRFY_INFO *, const VRFY_CKP_INFO *));
+int __get_timestamp_info __P((const DB_LOG_VRFY_INFO *, DB_LSN, VRFY_TIMESTAMP_INFO **));
+int __get_latest_timestamp_info __P((const DB_LOG_VRFY_INFO *, DB_LSN, VRFY_TIMESTAMP_INFO **));
+int __put_timestamp_info __P((const DB_LOG_VRFY_INFO *, const VRFY_TIMESTAMP_INFO *));
+int __find_lsnrg_by_timerg __P((DB_LOG_VRFY_INFO *, time_t, time_t, DB_LSN *, DB_LSN *));
+int __add_txnrange __P((DB_LOG_VRFY_INFO *, u_int32_t, DB_LSN, int32_t, int));
+int __get_aborttxn __P((DB_LOG_VRFY_INFO *, DB_LSN));
+int __txn_started __P((DB_LOG_VRFY_INFO *, DB_LSN, u_int32_t, int *));
+int __set_logvrfy_dbfuid __P((DB_LOG_VRFY_INFO *));
+int __add_page_to_txn __P((DB_LOG_VRFY_INFO *, int32_t, db_pgno_t, u_int32_t, u_int32_t *, int *));
+int __del_txn_pages __P((DB_LOG_VRFY_INFO *, u_int32_t));
+int __is_ancestor_txn __P((DB_LOG_VRFY_INFO *, u_int32_t, u_int32_t, DB_LSN, int *));
+int __return_txn_pages __P((DB_LOG_VRFY_INFO *, u_int32_t, u_int32_t));
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_log_ext_h_ */
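Note that __log_printf_capi and __log_printf above are annotated with __attribute__ ((__format__ (__printf__, 3, 4))), telling GCC-compatible compilers that the third parameter is a printf-style format string whose variadic arguments begin at position 4, so mismatched formats are caught at compile time. A self-contained sketch of the same idiom, where log_note is a hypothetical stand-in rather than a Berkeley DB function:

#include <stdarg.h>
#include <stdio.h>

/* Argument 3 is the format, variadic args start at position 4. */
int log_note(void *env, void *txn, const char *fmt, ...)
    __attribute__((__format__(__printf__, 3, 4)));

int log_note(void *env, void *txn, const char *fmt, ...)
{
    va_list ap;
    int n;

    (void)env;
    (void)txn;
    va_start(ap, fmt);
    n = vfprintf(stderr, fmt, ap);  /* stand-in for the real log write */
    va_end(ap);
    return n;
}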
diff --git a/src/dbinc_auto/mp_ext.h b/src/dbinc_auto/mp_ext.h
new file mode 100644
index 00000000..d142b584
--- /dev/null
+++ b/src/dbinc_auto/mp_ext.h
@@ -0,0 +1,106 @@
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _mp_ext_h_
+#define _mp_ext_h_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+int __memp_alloc __P((DB_MPOOL *, REGINFO *, MPOOLFILE *, size_t, roff_t *, void *));
+void __memp_free __P((REGINFO *, void *));
+int __memp_backup_open __P((ENV *, DB_MPOOLFILE *, const char *, const char *, u_int32_t, DB_FH **, void**));
+int __memp_backup_mpf __P((ENV *, DB_MPOOLFILE *, DB_THREAD_INFO *, db_pgno_t, db_pgno_t, DB_FH *, void *, u_int32_t));
+int __memp_backup_close __P((ENV *, DB_MPOOLFILE *, const char *, DB_FH *, void *HANDLE));
+int __memp_failchk __P((ENV *));
+int __memp_bhwrite __P((DB_MPOOL *, DB_MPOOL_HASH *, MPOOLFILE *, BH *, int));
+int __memp_pgread __P((DB_MPOOLFILE *, BH *, int));
+int __memp_pg __P((DB_MPOOLFILE *, db_pgno_t, void *, int));
+int __memp_bhfree __P((DB_MPOOL *, REGINFO *, MPOOLFILE *, DB_MPOOL_HASH *, BH *, u_int32_t));
+int __memp_fget_pp __P((DB_MPOOLFILE *, db_pgno_t *, DB_TXN *, u_int32_t, void *));
+int __memp_fget __P((DB_MPOOLFILE *, db_pgno_t *, DB_THREAD_INFO *, DB_TXN *, u_int32_t, void *));
+int __memp_fcreate_pp __P((DB_ENV *, DB_MPOOLFILE **, u_int32_t));
+int __memp_fcreate __P((ENV *, DB_MPOOLFILE **));
+int __memp_set_clear_len __P((DB_MPOOLFILE *, u_int32_t));
+int __memp_get_fileid __P((DB_MPOOLFILE *, u_int8_t *));
+int __memp_set_fileid __P((DB_MPOOLFILE *, u_int8_t *));
+int __memp_get_flags __P((DB_MPOOLFILE *, u_int32_t *));
+int __memp_set_flags __P((DB_MPOOLFILE *, u_int32_t, int));
+int __memp_get_ftype __P((DB_MPOOLFILE *, int *));
+int __memp_set_ftype __P((DB_MPOOLFILE *, int));
+int __memp_set_lsn_offset __P((DB_MPOOLFILE *, int32_t));
+int __memp_get_pgcookie __P((DB_MPOOLFILE *, DBT *));
+int __memp_set_pgcookie __P((DB_MPOOLFILE *, DBT *));
+int __memp_get_priority __P((DB_MPOOLFILE *, DB_CACHE_PRIORITY *));
+int __memp_get_last_pgno __P((DB_MPOOLFILE *, db_pgno_t *));
+char * __memp_fn __P((DB_MPOOLFILE *));
+char * __memp_fns __P((DB_MPOOL *, MPOOLFILE *));
+int __memp_fopen_pp __P((DB_MPOOLFILE *, const char *, u_int32_t, int, size_t));
+int __memp_fopen __P((DB_MPOOLFILE *, MPOOLFILE *, const char *, const char **, u_int32_t, int, size_t));
+int __memp_fclose_pp __P((DB_MPOOLFILE *, u_int32_t));
+int __memp_fclose __P((DB_MPOOLFILE *, u_int32_t));
+int __memp_mf_discard __P((DB_MPOOL *, MPOOLFILE *, int));
+int __memp_inmemlist __P((ENV *, char ***, int *));
+int __memp_fput_pp __P((DB_MPOOLFILE *, void *, DB_CACHE_PRIORITY, u_int32_t));
+int __memp_fput __P((DB_MPOOLFILE *, DB_THREAD_INFO *, void *, DB_CACHE_PRIORITY));
+int __memp_unpin_buffers __P((ENV *, DB_THREAD_INFO *));
+int __memp_dirty __P((DB_MPOOLFILE *, void *, DB_THREAD_INFO *, DB_TXN *, DB_CACHE_PRIORITY, u_int32_t));
+int __memp_shared __P((DB_MPOOLFILE *, void *));
+int __memp_env_create __P((DB_ENV *));
+void __memp_env_destroy __P((DB_ENV *));
+int __memp_get_cachesize __P((DB_ENV *, u_int32_t *, u_int32_t *, int *));
+int __memp_set_cachesize __P((DB_ENV *, u_int32_t, u_int32_t, int));
+int __memp_set_config __P((DB_ENV *, u_int32_t, int));
+int __memp_get_config __P((DB_ENV *, u_int32_t, int *));
+int __memp_get_mp_max_openfd __P((DB_ENV *, int *));
+int __memp_set_mp_max_openfd __P((DB_ENV *, int));
+int __memp_get_mp_max_write __P((DB_ENV *, int *, db_timeout_t *));
+int __memp_set_mp_max_write __P((DB_ENV *, int, db_timeout_t));
+int __memp_get_mp_mmapsize __P((DB_ENV *, size_t *));
+int __memp_set_mp_mmapsize __P((DB_ENV *, size_t));
+int __memp_get_mp_pagesize __P((DB_ENV *, u_int32_t *));
+int __memp_set_mp_pagesize __P((DB_ENV *, u_int32_t));
+int __memp_get_mp_tablesize __P((DB_ENV *, u_int32_t *));
+int __memp_set_mp_tablesize __P((DB_ENV *, u_int32_t));
+int __memp_get_mp_mtxcount __P((DB_ENV *, u_int32_t *));
+int __memp_set_mp_mtxcount __P((DB_ENV *, u_int32_t));
+int __memp_nameop __P((ENV *, u_int8_t *, const char *, const char *, const char *, int));
+int __memp_ftruncate __P((DB_MPOOLFILE *, DB_TXN *, DB_THREAD_INFO *, db_pgno_t, u_int32_t));
+int __memp_alloc_freelist __P((DB_MPOOLFILE *, u_int32_t, db_pgno_t **));
+int __memp_free_freelist __P((DB_MPOOLFILE *));
+int __memp_get_freelist __P((DB_MPOOLFILE *, u_int32_t *, db_pgno_t **));
+int __memp_extend_freelist __P((DB_MPOOLFILE *, u_int32_t, db_pgno_t **));
+int __memp_set_last_pgno __P((DB_MPOOLFILE *, db_pgno_t));
+int __memp_bh_settxn __P((DB_MPOOL *, MPOOLFILE *mfp, BH *, void *));
+int __memp_skip_curadj __P((DBC *, db_pgno_t));
+int __memp_bh_freeze __P((DB_MPOOL *, REGINFO *, DB_MPOOL_HASH *, BH *, int *));
+int __memp_bh_thaw __P((DB_MPOOL *, REGINFO *, DB_MPOOL_HASH *, BH *, BH *));
+int __memp_open __P((ENV *, int));
+int __memp_init __P((ENV *, DB_MPOOL *, u_int, u_int32_t, u_int));
+u_int32_t __memp_max_regions __P((ENV *));
+u_int32_t __memp_region_mutex_count __P((ENV *));
+int __memp_env_refresh __P((ENV *));
+int __memp_register_pp __P((DB_ENV *, int, int (*)(DB_ENV *, db_pgno_t, void *, DBT *), int (*)(DB_ENV *, db_pgno_t, void *, DBT *)));
+int __memp_register __P((ENV *, int, int (*)(DB_ENV *, db_pgno_t, void *, DBT *), int (*)(DB_ENV *, db_pgno_t, void *, DBT *)));
+int __memp_get_bucket __P((ENV *, MPOOLFILE *, db_pgno_t, REGINFO **, DB_MPOOL_HASH **, u_int32_t *));
+int __memp_resize __P((DB_MPOOL *, u_int32_t, u_int32_t));
+int __memp_get_cache_max __P((DB_ENV *, u_int32_t *, u_int32_t *));
+int __memp_set_cache_max __P((DB_ENV *, u_int32_t, u_int32_t));
+int __memp_stat_pp __P((DB_ENV *, DB_MPOOL_STAT **, DB_MPOOL_FSTAT ***, u_int32_t));
+int __memp_stat_print_pp __P((DB_ENV *, u_int32_t));
+int __memp_stat_print __P((ENV *, u_int32_t));
+void __memp_stat_hash __P((REGINFO *, MPOOL *, u_int32_t *));
+int __memp_walk_files __P((ENV *, MPOOL *, int (*) __P((ENV *, MPOOLFILE *, void *, u_int32_t *, u_int32_t)), void *, u_int32_t *, u_int32_t));
+int __memp_discard_all_mpfs __P((ENV *, MPOOL *));
+int __memp_sync_pp __P((DB_ENV *, DB_LSN *));
+int __memp_sync __P((ENV *, u_int32_t, DB_LSN *));
+int __memp_fsync_pp __P((DB_MPOOLFILE *));
+int __memp_fsync __P((DB_MPOOLFILE *));
+int __mp_xxx_fh __P((DB_MPOOLFILE *, DB_FH **));
+int __memp_sync_int __P((ENV *, DB_MPOOLFILE *, u_int32_t, u_int32_t, u_int32_t *, int *));
+int __memp_mf_sync __P((DB_MPOOL *, MPOOLFILE *, int));
+int __memp_trickle_pp __P((DB_ENV *, int, int *));
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_mp_ext_h_ */
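The central pairing in this header is __memp_fget/__memp_fput: a page fetched from the buffer cache stays pinned until it is explicitly returned. A hedged sketch of that discipline using the public DB_MPOOLFILE get/put methods that front these internal calls (error handling is trimmed, and creating the environment and opening mpf are assumed to happen elsewhere):

#include <db.h>

int touch_page(DB_MPOOLFILE *mpf, db_pgno_t pgno)
{
    void *page;
    int ret;

    /* Pin the page in the cache; pgno selects which page. */
    if ((ret = mpf->get(mpf, &pgno, NULL, 0, &page)) != 0)
        return (ret);

    /* ... examine the pinned page here ... */

    /* Unpin it so the cache may evict or write it back. */
    return (mpf->put(mpf, page, DB_PRIORITY_UNCHANGED, 0));
}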
diff --git a/src/dbinc_auto/mutex_ext.h b/src/dbinc_auto/mutex_ext.h
new file mode 100644
index 00000000..1a2a1b2b
--- /dev/null
+++ b/src/dbinc_auto/mutex_ext.h
@@ -0,0 +1,91 @@
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _mutex_ext_h_
+#define _mutex_ext_h_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+int __mutex_alloc __P((ENV *, int, u_int32_t, db_mutex_t *));
+int __mutex_alloc_int __P((ENV *, int, int, u_int32_t, db_mutex_t *));
+int __mutex_free __P((ENV *, db_mutex_t *));
+int __mutex_free_int __P((ENV *, int, db_mutex_t *));
+int __mutex_refresh __P((ENV *, db_mutex_t));
+int __mut_failchk __P((ENV *));
+int __db_fcntl_mutex_init __P((ENV *, db_mutex_t, u_int32_t));
+int __db_fcntl_mutex_lock __P((ENV *, db_mutex_t, db_timeout_t));
+int __db_fcntl_mutex_trylock __P((ENV *, db_mutex_t));
+int __db_fcntl_mutex_unlock __P((ENV *, db_mutex_t));
+int __db_fcntl_mutex_destroy __P((ENV *, db_mutex_t));
+int __mutex_alloc_pp __P((DB_ENV *, u_int32_t, db_mutex_t *));
+int __mutex_free_pp __P((DB_ENV *, db_mutex_t));
+int __mutex_lock_pp __P((DB_ENV *, db_mutex_t));
+int __mutex_unlock_pp __P((DB_ENV *, db_mutex_t));
+int __mutex_get_align __P((DB_ENV *, u_int32_t *));
+int __mutex_set_align __P((DB_ENV *, u_int32_t));
+int __mutex_get_increment __P((DB_ENV *, u_int32_t *));
+int __mutex_set_increment __P((DB_ENV *, u_int32_t));
+int __mutex_get_init __P((DB_ENV *, u_int32_t *));
+int __mutex_set_init __P((DB_ENV *, u_int32_t));
+int __mutex_get_max __P((DB_ENV *, u_int32_t *));
+int __mutex_set_max __P((DB_ENV *, u_int32_t));
+int __mutex_get_tas_spins __P((DB_ENV *, u_int32_t *));
+int __mutex_set_tas_spins __P((DB_ENV *, u_int32_t));
+#if !defined(HAVE_ATOMIC_SUPPORT) && defined(HAVE_MUTEX_SUPPORT)
+atomic_value_t __atomic_inc __P((ENV *, db_atomic_t *));
+atomic_value_t __atomic_dec __P((ENV *, db_atomic_t *));
+int atomic_compare_exchange __P((ENV *, db_atomic_t *, atomic_value_t, atomic_value_t));
+#endif
+int __db_pthread_mutex_init __P((ENV *, db_mutex_t, u_int32_t));
+#ifndef HAVE_MUTEX_HYBRID
+int __db_pthread_mutex_lock __P((ENV *, db_mutex_t, db_timeout_t));
+#endif
+#if defined(HAVE_SHARED_LATCHES)
+int __db_pthread_mutex_readlock __P((ENV *, db_mutex_t));
+#endif
+#ifdef HAVE_MUTEX_HYBRID
+int __db_hybrid_mutex_suspend __P((ENV *, db_mutex_t, db_timespec *, int));
+#endif
+int __db_pthread_mutex_unlock __P((ENV *, db_mutex_t));
+int __db_pthread_mutex_destroy __P((ENV *, db_mutex_t));
+int __mutex_open __P((ENV *, int));
+int __mutex_env_refresh __P((ENV *));
+void __mutex_resource_return __P((ENV *, REGINFO *));
+int __mutex_stat_pp __P((DB_ENV *, DB_MUTEX_STAT **, u_int32_t));
+int __mutex_stat_print_pp __P((DB_ENV *, u_int32_t));
+int __mutex_stat_print __P((ENV *, u_int32_t));
+void __mutex_print_debug_single __P((ENV *, const char *, db_mutex_t, u_int32_t));
+void __mutex_print_debug_stats __P((ENV *, DB_MSGBUF *, db_mutex_t, u_int32_t));
+void __mutex_set_wait_info __P((ENV *, db_mutex_t, uintmax_t *, uintmax_t *));
+void __mutex_clear __P((ENV *, db_mutex_t));
+int __db_tas_mutex_init __P((ENV *, db_mutex_t, u_int32_t));
+int __db_tas_mutex_lock __P((ENV *, db_mutex_t, db_timeout_t));
+int __db_tas_mutex_trylock __P((ENV *, db_mutex_t));
+#if defined(HAVE_SHARED_LATCHES)
+int __db_tas_mutex_readlock __P((ENV *, db_mutex_t));
+int __db_tas_mutex_tryreadlock __P((ENV *, db_mutex_t));
+#endif
+int __db_tas_mutex_unlock __P((ENV *, db_mutex_t));
+int __db_tas_mutex_destroy __P((ENV *, db_mutex_t));
+int __db_win32_mutex_init __P((ENV *, db_mutex_t, u_int32_t));
+int __db_win32_mutex_lock __P((ENV *, db_mutex_t, db_timeout_t));
+int __db_win32_mutex_trylock __P((ENV *, db_mutex_t));
+#if defined(HAVE_SHARED_LATCHES)
+int __db_win32_mutex_readlock __P((ENV *, db_mutex_t));
+int __db_win32_mutex_tryreadlock __P((ENV *, db_mutex_t));
+#endif
+int __db_win32_mutex_unlock __P((ENV *, db_mutex_t));
+int __db_win32_mutex_destroy __P((ENV *, db_mutex_t));
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_mutex_ext_h_ */
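The _pp ("pre-process") entries above are the argument-checking wrappers
bound to the public DB_ENV method slots; the unsuffixed names are the
internal workers they dispatch to. A hedged usage sketch through the
public methods those wrappers back, assuming an opened environment:

static int
use_one_mutex(DB_ENV *dbenv)
{
	db_mutex_t mutex;
	int ret, t_ret;

	/* Allocate, lock, unlock and free a single mutex. */
	if ((ret = dbenv->mutex_alloc(dbenv, 0, &mutex)) != 0)
		return (ret);
	if ((ret = dbenv->mutex_lock(dbenv, mutex)) == 0) {
		/* ... critical section ... */
		ret = dbenv->mutex_unlock(dbenv, mutex);
	}
	if ((t_ret = dbenv->mutex_free(dbenv, mutex)) != 0 && ret == 0)
		ret = t_ret;
	return (ret);
}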
diff --git a/src/dbinc_auto/os_ext.h b/src/dbinc_auto/os_ext.h
new file mode 100644
index 00000000..a0a7b791
--- /dev/null
+++ b/src/dbinc_auto/os_ext.h
@@ -0,0 +1,78 @@
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _os_ext_h_
+#define _os_ext_h_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+void __os_abort __P((ENV *));
+int __os_abspath __P((const char *));
+#if defined(HAVE_REPLICATION_THREADS)
+int __os_getaddrinfo __P((ENV *, const char *, u_int, const char *, const ADDRINFO *, ADDRINFO **));
+void __os_freeaddrinfo __P((ENV *, ADDRINFO *));
+#endif
+int __os_umalloc __P((ENV *, size_t, void *));
+int __os_urealloc __P((ENV *, size_t, void *));
+void __os_ufree __P((ENV *, void *));
+int __os_strdup __P((ENV *, const char *, void *));
+int __os_calloc __P((ENV *, size_t, size_t, void *));
+int __os_malloc __P((ENV *, size_t, void *));
+int __os_realloc __P((ENV *, size_t, void *));
+void __os_free __P((ENV *, void *));
+void *__ua_memcpy __P((void *, const void *, size_t));
+void __os_gettime __P((ENV *, db_timespec *, int));
+int __os_fs_notzero __P((void));
+int __os_support_direct_io __P((void));
+int __os_support_db_register __P((void));
+int __os_support_replication __P((void));
+u_int32_t __os_cpu_count __P((void));
+char *__os_ctime __P((const time_t *, char *));
+int __os_dirlist __P((ENV *, const char *, int, char ***, int *));
+void __os_dirfree __P((ENV *, char **, int));
+int __os_get_errno_ret_zero __P((void));
+int __os_get_errno __P((void));
+int __os_get_neterr __P((void));
+int __os_get_syserr __P((void));
+void __os_set_errno __P((int));
+char *__os_strerror __P((int, char *, size_t));
+int __os_posix_err __P((int));
+int __os_fileid __P((ENV *, const char *, int, u_int8_t *));
+int __os_fdlock __P((ENV *, DB_FH *, off_t, int, int));
+int __os_fsync __P((ENV *, DB_FH *));
+int __os_getenv __P((ENV *, const char *, char **, size_t));
+int __os_openhandle __P((ENV *, const char *, int, int, DB_FH **));
+int __os_closehandle __P((ENV *, DB_FH *));
+int __os_attach __P((ENV *, REGINFO *, REGION *));
+int __os_detach __P((ENV *, REGINFO *, int));
+int __os_mapfile __P((ENV *, char *, DB_FH *, size_t, int, void **));
+int __os_unmapfile __P((ENV *, void *, size_t));
+int __os_mkdir __P((ENV *, const char *, int));
+int __os_open __P((ENV *, const char *, u_int32_t, u_int32_t, int, DB_FH **));
+int __os_concat_path __P((char *, size_t, const char *, const char *));
+void __os_id __P((DB_ENV *, pid_t *, db_threadid_t *));
+int __os_rename __P((ENV *, const char *, const char *, u_int32_t));
+int __os_isroot __P((void));
+char *__db_rpath __P((const char *));
+int __os_io __P((ENV *, int, DB_FH *, db_pgno_t, u_int32_t, u_int32_t, u_int32_t, u_int8_t *, size_t *));
+int __os_read __P((ENV *, DB_FH *, void *, size_t, size_t *));
+int __os_write __P((ENV *, DB_FH *, void *, size_t, size_t *));
+int __os_physwrite __P((ENV *, DB_FH *, void *, size_t, size_t *));
+int __os_seek __P((ENV *, DB_FH *, db_pgno_t, u_int32_t, off_t));
+void __os_stack __P((ENV *));
+int __os_exists __P((ENV *, const char *, int *));
+int __os_ioinfo __P((ENV *, const char *, DB_FH *, u_int32_t *, u_int32_t *, u_int32_t *));
+int __os_tmpdir __P((ENV *, u_int32_t));
+int __os_truncate __P((ENV *, DB_FH *, db_pgno_t, u_int32_t));
+void __os_unique_id __P((ENV *, u_int32_t *));
+int __os_unlink __P((ENV *, const char *, int));
+void __os_yield __P((ENV *, u_long, u_long));
+#ifdef HAVE_QNX
+int __os_qnx_region_open __P((ENV *, const char *, int, int, DB_FH **));
+#endif
+int __os_is_winnt __P((void));
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_os_ext_h_ */
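The allocation entries above follow the OS layer's uniform convention:
every routine returns zero or an errno-style error code and hands its
result back through an out parameter (the trailing void * in
__os_malloc's signature receives a pointer-to-pointer). A hedged sketch,
assuming an initialized ENV handle:

static int
scratch_buffer_sketch(ENV *env)
{
	char *buf;
	int ret;

	if ((ret = __os_malloc(env, 1024, &buf)) != 0)
		return (ret);		/* Errno-style error code. */
	buf[0] = '\0';			/* Use the allocation... */
	__os_free(env, buf);		/* ...then return it to the env. */
	return (0);
}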
diff --git a/src/dbinc_auto/qam_auto.h b/src/dbinc_auto/qam_auto.h
new file mode 100644
index 00000000..fe7c2437
--- /dev/null
+++ b/src/dbinc_auto/qam_auto.h
@@ -0,0 +1,174 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#ifndef __qam_AUTO_H
+#define __qam_AUTO_H
+#ifdef HAVE_QUEUE
+#include "dbinc/log.h"
+#define DB___qam_incfirst 84
+typedef struct ___qam_incfirst_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ db_recno_t recno;
+ db_pgno_t meta_pgno;
+} __qam_incfirst_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __qam_incfirst_desc[];
+static inline int
+__qam_incfirst_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, db_recno_t recno, db_pgno_t meta_pgno)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___qam_incfirst, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t),
+ __qam_incfirst_desc, recno, meta_pgno));
+}
+
+static inline int __qam_incfirst_read(ENV *env,
+ DB **dbpp, void *td, void *data, __qam_incfirst_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __qam_incfirst_desc, sizeof(__qam_incfirst_args), (void**)arg));
+}
+#define DB___qam_mvptr 85
+typedef struct ___qam_mvptr_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ u_int32_t opcode;
+ int32_t fileid;
+ db_recno_t old_first;
+ db_recno_t new_first;
+ db_recno_t old_cur;
+ db_recno_t new_cur;
+ DB_LSN metalsn;
+ db_pgno_t meta_pgno;
+} __qam_mvptr_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __qam_mvptr_desc[];
+static inline int
+__qam_mvptr_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags,
+ u_int32_t opcode, db_recno_t old_first, db_recno_t new_first, db_recno_t old_cur,
+ db_recno_t new_cur, DB_LSN * metalsn, db_pgno_t meta_pgno)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___qam_mvptr, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ sizeof(*metalsn) + sizeof(u_int32_t),
+ __qam_mvptr_desc,
+ opcode, old_first, new_first, old_cur, new_cur, metalsn, meta_pgno));
+}
+
+static inline int __qam_mvptr_read(ENV *env,
+ DB **dbpp, void *td, void *data, __qam_mvptr_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __qam_mvptr_desc, sizeof(__qam_mvptr_args), (void**)arg));
+}
+#define DB___qam_del 79
+typedef struct ___qam_del_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ DB_LSN lsn;
+ db_pgno_t pgno;
+ u_int32_t indx;
+ db_recno_t recno;
+} __qam_del_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __qam_del_desc[];
+static inline int
+__qam_del_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, DB_LSN * lsn, db_pgno_t pgno, u_int32_t indx, db_recno_t recno)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___qam_del, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(*lsn) + sizeof(u_int32_t) +
+ sizeof(u_int32_t) + sizeof(u_int32_t),
+ __qam_del_desc, lsn, pgno, indx, recno));
+}
+
+static inline int __qam_del_read(ENV *env,
+ DB **dbpp, void *td, void *data, __qam_del_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __qam_del_desc, sizeof(__qam_del_args), (void**)arg));
+}
+#define DB___qam_add 80
+typedef struct ___qam_add_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ DB_LSN lsn;
+ db_pgno_t pgno;
+ u_int32_t indx;
+ db_recno_t recno;
+ DBT data;
+ u_int32_t vflag;
+ DBT olddata;
+} __qam_add_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __qam_add_desc[];
+static inline int
+__qam_add_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, DB_LSN * lsn, db_pgno_t pgno, u_int32_t indx, db_recno_t recno,
+ const DBT *data, u_int32_t vflag, const DBT *olddata)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___qam_add, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(*lsn) + sizeof(u_int32_t) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + LOG_DBT_SIZE(data) +
+ sizeof(u_int32_t) + LOG_DBT_SIZE(olddata),
+ __qam_add_desc, lsn, pgno, indx, recno, data, vflag, olddata));
+}
+
+static inline int __qam_add_read(ENV *env,
+ DB **dbpp, void *td, void *data, __qam_add_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __qam_add_desc, sizeof(__qam_add_args), (void**)arg));
+}
+#define DB___qam_delext 83
+typedef struct ___qam_delext_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ int32_t fileid;
+ DB_LSN lsn;
+ db_pgno_t pgno;
+ u_int32_t indx;
+ db_recno_t recno;
+ DBT data;
+} __qam_delext_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __qam_delext_desc[];
+static inline int
+__qam_delext_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, DB_LSN * lsn, db_pgno_t pgno, u_int32_t indx, db_recno_t recno,
+ const DBT *data)
+{
+ return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
+ flags, DB___qam_delext, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(*lsn) + sizeof(u_int32_t) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + LOG_DBT_SIZE(data),
+ __qam_delext_desc, lsn, pgno, indx, recno, data));
+}
+
+static inline int __qam_delext_read(ENV *env,
+ DB **dbpp, void *td, void *data, __qam_delext_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __qam_delext_desc, sizeof(__qam_delext_args), (void**)arg));
+}
+#endif /* HAVE_QUEUE */
+#endif
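Each queue record above is generated as a triplet: the _args struct, an
inline _log writer whose size expression adds one u_int32_t per fixed
field plus LOG_DBT_SIZE() per DBT payload, and an inline _read parser
that allocates and fills the struct from a raw log record. A hedged
sketch of the read side, shaped like the recovery functions that consume
it (NULL stands in for the recovery thread token; error paths trimmed):

static int
qam_del_read_sketch(ENV *env, DBT *rec)
{
	__qam_del_args *argp;
	DB *file_dbp;
	int ret;

	if ((ret = __qam_del_read(env,
	    &file_dbp, NULL, rec->data, &argp)) != 0)
		return (ret);
	/* argp->pgno, argp->indx and argp->recno locate the record. */
	__os_free(env, argp);		/* The caller owns argp. */
	return (0);
}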
diff --git a/src/dbinc_auto/qam_ext.h b/src/dbinc_auto/qam_ext.h
new file mode 100644
index 00000000..3f143664
--- /dev/null
+++ b/src/dbinc_auto/qam_ext.h
@@ -0,0 +1,68 @@
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _qam_ext_h_
+#define _qam_ext_h_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+int __qam_position __P((DBC *, db_recno_t *, u_int32_t, int *));
+int __qam_pitem __P((DBC *, QPAGE *, u_int32_t, db_recno_t, DBT *));
+int __qam_append __P((DBC *, DBT *, DBT *));
+int __qamc_dup __P((DBC *, DBC *));
+int __qamc_init __P((DBC *));
+int __qam_truncate __P((DBC *, u_int32_t *));
+int __qam_delete __P((DBC *, DBT *, u_int32_t));
+int __qam_init_recover __P((ENV *, DB_DISTAB *));
+int __qam_incfirst_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __qam_mvptr_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __qam_del_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __qam_add_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __qam_delext_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __qam_init_print __P((ENV *, DB_DISTAB *));
+int __qam_mswap __P((ENV *, PAGE *));
+int __qam_pgin_out __P((ENV *, db_pgno_t, void *, DBT *));
+int __qam_fprobe __P((DBC *, db_pgno_t, void *, qam_probe_mode, DB_CACHE_PRIORITY, u_int32_t));
+int __qam_fclose __P((DB *, db_pgno_t));
+int __qam_fremove __P((DB *, db_pgno_t));
+int __qam_sync __P((DB *));
+int __qam_gen_filelist __P((DB *, DB_THREAD_INFO *, QUEUE_FILELIST **));
+int __qam_extent_names __P((ENV *, char *, char ***));
+void __qam_exid __P((DB *, u_int8_t *, u_int32_t));
+int __qam_nameop __P((DB *, DB_TXN *, const char *, qam_name_op));
+int __qam_lsn_reset __P((DB *, DB_THREAD_INFO *));
+int __qam_backup_extents __P((DB *, DB_THREAD_INFO *, const char *, u_int32_t));
+int __qam_db_create __P((DB *));
+int __qam_db_close __P((DB *, u_int32_t));
+int __qam_get_extentsize __P((DB *, u_int32_t *));
+int __queue_pageinfo __P((DB *, db_pgno_t *, db_pgno_t *, int *, int, u_int32_t));
+int __db_prqueue __P((DB *, u_int32_t));
+int __qam_remove __P((DB *, DB_THREAD_INFO *, DB_TXN *, const char *, const char *, u_int32_t));
+int __qam_rename __P((DB *, DB_THREAD_INFO *, DB_TXN *, const char *, const char *, const char *));
+void __qam_map_flags __P((DB *, u_int32_t *, u_int32_t *));
+int __qam_set_flags __P((DB *, u_int32_t *));
+int __qam_open __P((DB *, DB_THREAD_INFO *, DB_TXN *, const char *, db_pgno_t, int, u_int32_t));
+int __qam_set_ext_data __P((DB *, const char *));
+int __qam_metachk __P((DB *, const char *, QMETA *));
+int __qam_new_file __P((DB *, DB_THREAD_INFO *, DB_TXN *, DB_FH *, const char *));
+int __qam_incfirst_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __qam_mvptr_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __qam_del_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __qam_delext_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __qam_add_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __qam_stat __P((DBC *, void *, u_int32_t));
+int __qam_stat_print __P((DBC *, u_int32_t));
+int __db_no_queue_am __P((ENV *));
+int __qam_31_qammeta __P((DB *, char *, u_int8_t *));
+int __qam_32_qammeta __P((DB *, char *, u_int8_t *));
+int __qam_vrfy_meta __P((DB *, VRFY_DBINFO *, QMETA *, db_pgno_t, u_int32_t));
+int __qam_meta2pgset __P((DB *, VRFY_DBINFO *, DB *));
+int __qam_vrfy_data __P((DB *, VRFY_DBINFO *, QPAGE *, db_pgno_t, u_int32_t));
+int __qam_vrfy_structure __P((DB *, VRFY_DBINFO *, u_int32_t));
+int __qam_vrfy_walkqueue __P((DB *, VRFY_DBINFO *, void *, int (*)(void *, const void *), u_int32_t));
+int __qam_salvage __P((DB *, VRFY_DBINFO *, db_pgno_t, PAGE *, void *, int (*)(void *, const void *), u_int32_t));
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_qam_ext_h_ */
diff --git a/src/dbinc_auto/rep_automsg.h b/src/dbinc_auto/rep_automsg.h
new file mode 100644
index 00000000..584040cf
--- /dev/null
+++ b/src/dbinc_auto/rep_automsg.h
@@ -0,0 +1,125 @@
+/* Do not edit: automatically built by gen_msg.awk. */
+
+#ifndef __rep_AUTOMSG_H
+#define __rep_AUTOMSG_H
+
+/*
+ * Message sizes are simply the sum of field sizes (not
+ * counting variable size parts, when DBTs are present),
+ * and may be different from struct sizes due to padding.
+ */
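+/*
+ * For example, __REP_CONTROL_SIZE below is 36: seven u_int32_t fields
+ * (28 bytes) plus one DB_LSN (two u_int32_t, 8 bytes).  A DBT counts
+ * only for its 4-byte length word, so __REP_BULK_SIZE is 4 + 8 + 4.
+ */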
+#define __REP_BULK_SIZE 16
+typedef struct ___rep_bulk_args {
+ u_int32_t len;
+ DB_LSN lsn;
+ DBT bulkdata;
+} __rep_bulk_args;
+
+#define __REP_CONTROL_SIZE 36
+typedef struct ___rep_control_args {
+ u_int32_t rep_version;
+ u_int32_t log_version;
+ DB_LSN lsn;
+ u_int32_t rectype;
+ u_int32_t gen;
+ u_int32_t msg_sec;
+ u_int32_t msg_nsec;
+ u_int32_t flags;
+} __rep_control_args;
+
+#define __REP_EGEN_SIZE 4
+typedef struct ___rep_egen_args {
+ u_int32_t egen;
+} __rep_egen_args;
+
+#define __REP_FILEINFO_SIZE 40
+typedef struct ___rep_fileinfo_args {
+ u_int32_t pgsize;
+ db_pgno_t pgno;
+ db_pgno_t max_pgno;
+ u_int32_t filenum;
+ u_int32_t finfo_flags;
+ u_int32_t type;
+ u_int32_t db_flags;
+ DBT uid;
+ DBT info;
+ DBT dir;
+} __rep_fileinfo_args;
+
+#define __REP_FILEINFO_V6_SIZE 36
+typedef struct ___rep_fileinfo_v6_args {
+ u_int32_t pgsize;
+ db_pgno_t pgno;
+ db_pgno_t max_pgno;
+ u_int32_t filenum;
+ u_int32_t finfo_flags;
+ u_int32_t type;
+ u_int32_t db_flags;
+ DBT uid;
+ DBT info;
+} __rep_fileinfo_v6_args;
+
+#define __REP_GRANT_INFO_SIZE 8
+typedef struct ___rep_grant_info_args {
+ u_int32_t msg_sec;
+ u_int32_t msg_nsec;
+} __rep_grant_info_args;
+
+#define __REP_LOGREQ_SIZE 8
+typedef struct ___rep_logreq_args {
+ DB_LSN endlsn;
+} __rep_logreq_args;
+
+#define __REP_NEWFILE_SIZE 4
+typedef struct ___rep_newfile_args {
+ u_int32_t version;
+} __rep_newfile_args;
+
+#define __REP_UPDATE_SIZE 16
+typedef struct ___rep_update_args {
+ DB_LSN first_lsn;
+ u_int32_t first_vers;
+ u_int32_t num_files;
+} __rep_update_args;
+
+#define __REP_VOTE_INFO_SIZE 28
+typedef struct ___rep_vote_info_args {
+ u_int32_t egen;
+ u_int32_t nsites;
+ u_int32_t nvotes;
+ u_int32_t priority;
+ u_int32_t spare_pri;
+ u_int32_t tiebreaker;
+ u_int32_t data_gen;
+} __rep_vote_info_args;
+
+#define __REP_VOTE_INFO_V5_SIZE 20
+typedef struct ___rep_vote_info_v5_args {
+ u_int32_t egen;
+ u_int32_t nsites;
+ u_int32_t nvotes;
+ u_int32_t priority;
+ u_int32_t tiebreaker;
+} __rep_vote_info_v5_args;
+
+#define __REP_LSN_HIST_KEY_SIZE 8
+typedef struct ___rep_lsn_hist_key_args {
+ u_int32_t version;
+ u_int32_t gen;
+} __rep_lsn_hist_key_args;
+
+#define __REP_LSN_HIST_DATA_SIZE 20
+typedef struct ___rep_lsn_hist_data_args {
+ u_int32_t envid;
+ DB_LSN lsn;
+ u_int32_t hist_sec;
+ u_int32_t hist_nsec;
+} __rep_lsn_hist_data_args;
+
+#define __REP_MAXMSG_SIZE 40
+#endif
diff --git a/src/dbinc_auto/rep_ext.h b/src/dbinc_auto/rep_ext.h
new file mode 100644
index 00000000..89bdc797
--- /dev/null
+++ b/src/dbinc_auto/rep_ext.h
@@ -0,0 +1,151 @@
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _rep_ext_h_
+#define _rep_ext_h_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+int __rep_bulk_marshal __P((ENV *, __rep_bulk_args *, u_int8_t *, size_t, size_t *));
+int __rep_bulk_unmarshal __P((ENV *, __rep_bulk_args *, u_int8_t *, size_t, u_int8_t **));
+int __rep_control_marshal __P((ENV *, __rep_control_args *, u_int8_t *, size_t, size_t *));
+int __rep_control_unmarshal __P((ENV *, __rep_control_args *, u_int8_t *, size_t, u_int8_t **));
+int __rep_egen_marshal __P((ENV *, __rep_egen_args *, u_int8_t *, size_t, size_t *));
+int __rep_egen_unmarshal __P((ENV *, __rep_egen_args *, u_int8_t *, size_t, u_int8_t **));
+int __rep_fileinfo_marshal __P((ENV *, u_int32_t, __rep_fileinfo_args *, u_int8_t *, size_t, size_t *));
+int __rep_fileinfo_unmarshal __P((ENV *, u_int32_t, __rep_fileinfo_args **, u_int8_t *, size_t, u_int8_t **));
+int __rep_fileinfo_v6_marshal __P((ENV *, u_int32_t, __rep_fileinfo_v6_args *, u_int8_t *, size_t, size_t *));
+int __rep_fileinfo_v6_unmarshal __P((ENV *, u_int32_t, __rep_fileinfo_v6_args **, u_int8_t *, size_t, u_int8_t **));
+int __rep_grant_info_marshal __P((ENV *, __rep_grant_info_args *, u_int8_t *, size_t, size_t *));
+int __rep_grant_info_unmarshal __P((ENV *, __rep_grant_info_args *, u_int8_t *, size_t, u_int8_t **));
+int __rep_logreq_marshal __P((ENV *, __rep_logreq_args *, u_int8_t *, size_t, size_t *));
+int __rep_logreq_unmarshal __P((ENV *, __rep_logreq_args *, u_int8_t *, size_t, u_int8_t **));
+int __rep_newfile_marshal __P((ENV *, __rep_newfile_args *, u_int8_t *, size_t, size_t *));
+int __rep_newfile_unmarshal __P((ENV *, __rep_newfile_args *, u_int8_t *, size_t, u_int8_t **));
+int __rep_update_marshal __P((ENV *, u_int32_t, __rep_update_args *, u_int8_t *, size_t, size_t *));
+int __rep_update_unmarshal __P((ENV *, u_int32_t, __rep_update_args **, u_int8_t *, size_t, u_int8_t **));
+int __rep_vote_info_marshal __P((ENV *, __rep_vote_info_args *, u_int8_t *, size_t, size_t *));
+int __rep_vote_info_unmarshal __P((ENV *, __rep_vote_info_args *, u_int8_t *, size_t, u_int8_t **));
+int __rep_vote_info_v5_marshal __P((ENV *, __rep_vote_info_v5_args *, u_int8_t *, size_t, size_t *));
+int __rep_vote_info_v5_unmarshal __P((ENV *, __rep_vote_info_v5_args *, u_int8_t *, size_t, u_int8_t **));
+void __rep_lsn_hist_key_marshal __P((ENV *, __rep_lsn_hist_key_args *, u_int8_t *));
+int __rep_lsn_hist_key_unmarshal __P((ENV *, __rep_lsn_hist_key_args *, u_int8_t *, size_t, u_int8_t **));
+void __rep_lsn_hist_data_marshal __P((ENV *, __rep_lsn_hist_data_args *, u_int8_t *));
+int __rep_lsn_hist_data_unmarshal __P((ENV *, __rep_lsn_hist_data_args *, u_int8_t *, size_t, u_int8_t **));
+int __rep_update_req __P((ENV *, __rep_control_args *));
+int __rep_page_req __P((ENV *, DB_THREAD_INFO *, int, __rep_control_args *, DBT *));
+int __rep_update_setup __P((ENV *, int, __rep_control_args *, DBT *, time_t, DB_LSN *));
+int __rep_bulk_page __P((ENV *, DB_THREAD_INFO *, int, __rep_control_args *, DBT *));
+int __rep_page __P((ENV *, DB_THREAD_INFO *, int, __rep_control_args *, DBT *));
+int __rep_init_cleanup __P((ENV *, REP *, int));
+int __rep_pggap_req __P((ENV *, REP *, __rep_fileinfo_args *, u_int32_t));
+int __rep_finfo_alloc __P((ENV *, __rep_fileinfo_args *, __rep_fileinfo_args **));
+int __rep_remove_init_file __P((ENV *));
+int __rep_reset_init __P((ENV *));
+int __rep_elect_pp __P((DB_ENV *, u_int32_t, u_int32_t, u_int32_t));
+int __rep_elect_int __P((ENV *, u_int32_t, u_int32_t, u_int32_t));
+int __rep_vote1 __P((ENV *, __rep_control_args *, DBT *, int));
+int __rep_vote2 __P((ENV *, __rep_control_args *, DBT *, int));
+int __rep_update_grant __P((ENV *, db_timespec *));
+int __rep_islease_granted __P((ENV *));
+int __rep_lease_table_alloc __P((ENV *, u_int32_t));
+int __rep_lease_grant __P((ENV *, __rep_control_args *, DBT *, int));
+int __rep_lease_check __P((ENV *, int));
+int __rep_lease_refresh __P((ENV *));
+int __rep_lease_expire __P((ENV *));
+db_timeout_t __rep_lease_waittime __P((ENV *));
+int __rep_allreq __P((ENV *, __rep_control_args *, int));
+int __rep_log __P((ENV *, DB_THREAD_INFO *, __rep_control_args *, DBT *, int, time_t, DB_LSN *));
+int __rep_bulk_log __P((ENV *, DB_THREAD_INFO *, __rep_control_args *, DBT *, time_t, DB_LSN *));
+int __rep_logreq __P((ENV *, __rep_control_args *, DBT *, int));
+int __rep_loggap_req __P((ENV *, REP *, DB_LSN *, u_int32_t));
+int __rep_logready __P((ENV *, REP *, time_t, DB_LSN *));
+int __rep_env_create __P((DB_ENV *));
+void __rep_env_destroy __P((DB_ENV *));
+int __rep_get_config __P((DB_ENV *, u_int32_t, int *));
+int __rep_set_config __P((DB_ENV *, u_int32_t, int));
+int __rep_start_pp __P((DB_ENV *, DBT *, u_int32_t));
+int __rep_start_int __P((ENV *, DBT *, u_int32_t));
+int __rep_open_sysdb __P((ENV *, DB_THREAD_INFO *, DB_TXN *, const char *, u_int32_t, DB **));
+int __rep_client_dbinit __P((ENV *, int, repdb_t));
+int __rep_get_limit __P((DB_ENV *, u_int32_t *, u_int32_t *));
+int __rep_set_limit __P((DB_ENV *, u_int32_t, u_int32_t));
+int __rep_set_nsites_pp __P((DB_ENV *, u_int32_t));
+int __rep_set_nsites_int __P((ENV *, u_int32_t));
+int __rep_get_nsites __P((DB_ENV *, u_int32_t *));
+int __rep_set_priority __P((DB_ENV *, u_int32_t));
+int __rep_get_priority __P((DB_ENV *, u_int32_t *));
+int __rep_set_timeout __P((DB_ENV *, int, db_timeout_t));
+int __rep_get_timeout __P((DB_ENV *, int, db_timeout_t *));
+int __rep_get_request __P((DB_ENV *, db_timeout_t *, db_timeout_t *));
+int __rep_set_request __P((DB_ENV *, db_timeout_t, db_timeout_t));
+int __rep_set_transport_pp __P((DB_ENV *, int, int (*)(DB_ENV *, const DBT *, const DBT *, const DB_LSN *, int, u_int32_t)));
+int __rep_set_transport_int __P((ENV *, int, int (*)(DB_ENV *, const DBT *, const DBT *, const DB_LSN *, int, u_int32_t)));
+int __rep_get_clockskew __P((DB_ENV *, u_int32_t *, u_int32_t *));
+int __rep_set_clockskew __P((DB_ENV *, u_int32_t, u_int32_t));
+int __rep_flush __P((DB_ENV *));
+int __rep_sync __P((DB_ENV *, u_int32_t));
+int __rep_txn_applied __P((ENV *, DB_THREAD_INFO *, DB_COMMIT_INFO *, db_timeout_t));
+int __rep_process_message_pp __P((DB_ENV *, DBT *, DBT *, int, DB_LSN *));
+int __rep_process_message_int __P((ENV *, DBT *, DBT *, int, DB_LSN *));
+int __rep_apply __P((ENV *, DB_THREAD_INFO *, __rep_control_args *, DBT *, DB_LSN *, int *, DB_LSN *));
+int __rep_process_txn __P((ENV *, DBT *));
+int __rep_resend_req __P((ENV *, int));
+int __rep_check_doreq __P((ENV *, REP *));
+int __rep_check_missing __P((ENV *, u_int32_t, DB_LSN *));
+int __rep_open __P((ENV *));
+int __rep_close_diagfiles __P((ENV *));
+int __rep_env_refresh __P((ENV *));
+int __rep_env_close __P((ENV *));
+int __rep_preclose __P((ENV *));
+int __rep_closefiles __P((ENV *));
+int __rep_write_egen __P((ENV *, REP *, u_int32_t));
+int __rep_write_gen __P((ENV *, REP *, u_int32_t));
+int __rep_stat_pp __P((DB_ENV *, DB_REP_STAT **, u_int32_t));
+int __rep_stat_print_pp __P((DB_ENV *, u_int32_t));
+int __rep_stat_print __P((ENV *, u_int32_t));
+int __rep_bulk_message __P((ENV *, REP_BULK *, REP_THROTTLE *, DB_LSN *, const DBT *, u_int32_t));
+int __rep_send_bulk __P((ENV *, REP_BULK *, u_int32_t));
+int __rep_bulk_alloc __P((ENV *, REP_BULK *, int, uintptr_t *, u_int32_t *, u_int32_t));
+int __rep_bulk_free __P((ENV *, REP_BULK *, u_int32_t));
+int __rep_send_message __P((ENV *, int, u_int32_t, DB_LSN *, const DBT *, u_int32_t, u_int32_t));
+int __rep_new_master __P((ENV *, __rep_control_args *, int));
+void __rep_elect_done __P((ENV *, REP *));
+int __env_rep_enter __P((ENV *, int));
+int __env_db_rep_exit __P((ENV *));
+int __db_rep_enter __P((DB *, int, int, int));
+int __op_handle_enter __P((ENV *));
+int __op_rep_enter __P((ENV *, int, int));
+int __op_rep_exit __P((ENV *));
+int __archive_rep_enter __P((ENV *));
+int __archive_rep_exit __P((ENV *));
+int __rep_lockout_archive __P((ENV *, REP *));
+int __rep_lockout_api __P((ENV *, REP *));
+int __rep_take_apilockout __P((ENV *));
+int __rep_clear_apilockout __P((ENV *));
+int __rep_lockout_apply __P((ENV *, REP *, u_int32_t));
+int __rep_lockout_msg __P((ENV *, REP *, u_int32_t));
+int __rep_send_throttle __P((ENV *, int, REP_THROTTLE *, u_int32_t, u_int32_t));
+u_int32_t __rep_msg_to_old __P((u_int32_t, u_int32_t));
+u_int32_t __rep_msg_from_old __P((u_int32_t, u_int32_t));
+int __rep_print_system __P((ENV *, u_int32_t, const char *, ...)) __attribute__ ((__format__ (__printf__, 3, 4)));
+int __rep_print __P((ENV *, u_int32_t, const char *, ...)) __attribute__ ((__format__ (__printf__, 3, 4)));
+void __rep_print_message __P((ENV *, int, __rep_control_args *, char *, u_int32_t));
+void __rep_fire_event __P((ENV *, u_int32_t, void *));
+void __rep_msg __P((const ENV *, const char *));
+int __rep_notify_threads __P((ENV *, rep_waitreason_t));
+int __rep_check_goal __P((ENV *, struct rep_waitgoal *));
+int __rep_log_backup __P((ENV *, DB_LOGC *, DB_LSN *, u_int32_t));
+int __rep_get_maxpermlsn __P((ENV *, DB_LSN *));
+int __rep_is_internal_rep_file __P((char *));
+int __rep_get_datagen __P((ENV *, u_int32_t *));
+int __rep_verify __P((ENV *, __rep_control_args *, DBT *, int, time_t));
+int __rep_verify_fail __P((ENV *, __rep_control_args *));
+int __rep_verify_req __P((ENV *, __rep_control_args *, int));
+int __rep_dorecovery __P((ENV *, DB_LSN *, DB_LSN *));
+int __rep_verify_match __P((ENV *, DB_LSN *, time_t));
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_rep_ext_h_ */
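The marshal/unmarshal pairs above produce and consume the fixed-size
wire images sized by the __REP_*_SIZE constants in rep_automsg.h. A
hedged round-trip sketch for the smallest record, assuming a valid ENV:

static int
egen_roundtrip_sketch(ENV *env)
{
	__rep_egen_args out, in;
	u_int8_t buf[__REP_EGEN_SIZE], *next;
	size_t len;
	int ret;

	out.egen = 42;
	if ((ret = __rep_egen_marshal(env, &out, buf, sizeof(buf), &len)) != 0)
		return (ret);
	/* On success in.egen == 42; next points past the bytes consumed. */
	return (__rep_egen_unmarshal(env, &in, buf, len, &next));
}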
diff --git a/src/dbinc_auto/repmgr_auto.h b/src/dbinc_auto/repmgr_auto.h
new file mode 100644
index 00000000..5e9f386d
--- /dev/null
+++ b/src/dbinc_auto/repmgr_auto.h
@@ -0,0 +1,41 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#ifndef __repmgr_AUTO_H
+#define __repmgr_AUTO_H
+#ifdef HAVE_REPLICATION_THREADS
+#include "dbinc/log.h"
+#define DB___repmgr_member 200
+typedef struct ___repmgr_member_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ u_int32_t version;
+ u_int32_t prev_status;
+ u_int32_t status;
+ DBT host;
+ u_int32_t port;
+} __repmgr_member_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __repmgr_member_desc[];
+static inline int
+__repmgr_member_log(ENV *env, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags,
+ u_int32_t version, u_int32_t prev_status, u_int32_t status, const DBT *host, u_int32_t port)
+{
+ return (__log_put_record(env, NULL, txnp, ret_lsnp,
+ flags, DB___repmgr_member, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ LOG_DBT_SIZE(host) + sizeof(u_int32_t),
+ __repmgr_member_desc,
+ version, prev_status, status, host, port));
+}
+
+static inline int __repmgr_member_read(ENV *env,
+ void *data, __repmgr_member_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __repmgr_member_desc, sizeof(__repmgr_member_args), (void**)arg));
+}
+#endif /* HAVE_REPLICATION_THREADS */
+#endif
diff --git a/src/dbinc_auto/repmgr_automsg.h b/src/dbinc_auto/repmgr_automsg.h
new file mode 100644
index 00000000..1b2b928c
--- /dev/null
+++ b/src/dbinc_auto/repmgr_automsg.h
@@ -0,0 +1,118 @@
+/* Do not edit: automatically built by gen_msg.awk. */
+
+#ifndef __repmgr_AUTOMSG_H
+#define __repmgr_AUTOMSG_H
+
+/*
+ * Message sizes are simply the sum of field sizes (not
+ * counting variable size parts, when DBTs are present),
+ * and may be different from struct sizes due to padding.
+ */
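+/*
+ * For example, __REPMGR_MSG_HDR_SIZE below is 9: one u_int8_t type byte
+ * plus two u_int32_t words.  Each DBT field again counts only for its
+ * 4-byte length word, so __REPMGR_MEMBERSHIP_KEY_SIZE is 4 + 2.
+ */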
+#define __REPMGR_HANDSHAKE_SIZE 12
+typedef struct ___repmgr_handshake_args {
+ u_int16_t port;
+ u_int16_t alignment;
+ u_int32_t ack_policy;
+ u_int32_t flags;
+} __repmgr_handshake_args;
+
+#define __REPMGR_V3HANDSHAKE_SIZE 10
+typedef struct ___repmgr_v3handshake_args {
+ u_int16_t port;
+ u_int32_t priority;
+ u_int32_t flags;
+} __repmgr_v3handshake_args;
+
+#define __REPMGR_V2HANDSHAKE_SIZE 6
+typedef struct ___repmgr_v2handshake_args {
+ u_int16_t port;
+ u_int32_t priority;
+} __repmgr_v2handshake_args;
+
+#define __REPMGR_PARM_REFRESH_SIZE 8
+typedef struct ___repmgr_parm_refresh_args {
+ u_int32_t ack_policy;
+ u_int32_t flags;
+} __repmgr_parm_refresh_args;
+
+#define __REPMGR_PERMLSN_SIZE 12
+typedef struct ___repmgr_permlsn_args {
+ u_int32_t generation;
+ DB_LSN lsn;
+} __repmgr_permlsn_args;
+
+#define __REPMGR_VERSION_PROPOSAL_SIZE 8
+typedef struct ___repmgr_version_proposal_args {
+ u_int32_t min;
+ u_int32_t max;
+} __repmgr_version_proposal_args;
+
+#define __REPMGR_VERSION_CONFIRMATION_SIZE 4
+typedef struct ___repmgr_version_confirmation_args {
+ u_int32_t version;
+} __repmgr_version_confirmation_args;
+
+#define __REPMGR_MSG_HDR_SIZE 9
+typedef struct ___repmgr_msg_hdr_args {
+ u_int8_t type;
+ u_int32_t word1;
+ u_int32_t word2;
+} __repmgr_msg_hdr_args;
+
+#define __REPMGR_MSG_METADATA_SIZE 12
+typedef struct ___repmgr_msg_metadata_args {
+ u_int32_t tag;
+ u_int32_t limit;
+ u_int32_t flags;
+} __repmgr_msg_metadata_args;
+
+#define __REPMGR_MEMBERSHIP_KEY_SIZE 6
+typedef struct ___repmgr_membership_key_args {
+ DBT host;
+ u_int16_t port;
+} __repmgr_membership_key_args;
+
+#define __REPMGR_MEMBERSHIP_DATA_SIZE 4
+typedef struct ___repmgr_membership_data_args {
+ u_int32_t flags;
+} __repmgr_membership_data_args;
+
+#define __REPMGR_MEMBER_METADATA_SIZE 8
+typedef struct ___repmgr_member_metadata_args {
+ u_int32_t format;
+ u_int32_t version;
+} __repmgr_member_metadata_args;
+
+#define __REPMGR_GM_FWD_SIZE 10
+typedef struct ___repmgr_gm_fwd_args {
+ DBT host;
+ u_int16_t port;
+ u_int32_t gen;
+} __repmgr_gm_fwd_args;
+
+#define __REPMGR_MEMBR_VERS_SIZE 8
+typedef struct ___repmgr_membr_vers_args {
+ u_int32_t version;
+ u_int32_t gen;
+} __repmgr_membr_vers_args;
+
+#define __REPMGR_SITE_INFO_SIZE 10
+typedef struct ___repmgr_site_info_args {
+ DBT host;
+ u_int16_t port;
+ u_int32_t flags;
+} __repmgr_site_info_args;
+
+#define __REPMGR_CONNECT_REJECT_SIZE 8
+typedef struct ___repmgr_connect_reject_args {
+ u_int32_t version;
+ u_int32_t gen;
+} __repmgr_connect_reject_args;
+
+#define __REPMGR_MAXMSG_SIZE 12
+#endif
diff --git a/src/dbinc_auto/repmgr_ext.h b/src/dbinc_auto/repmgr_ext.h
new file mode 100644
index 00000000..b1237950
--- /dev/null
+++ b/src/dbinc_auto/repmgr_ext.h
@@ -0,0 +1,223 @@
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _repmgr_ext_h_
+#define _repmgr_ext_h_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+int __repmgr_init_recover __P((ENV *, DB_DISTAB *));
+void __repmgr_handshake_marshal __P((ENV *, __repmgr_handshake_args *, u_int8_t *));
+int __repmgr_handshake_unmarshal __P((ENV *, __repmgr_handshake_args *, u_int8_t *, size_t, u_int8_t **));
+void __repmgr_v3handshake_marshal __P((ENV *, __repmgr_v3handshake_args *, u_int8_t *));
+int __repmgr_v3handshake_unmarshal __P((ENV *, __repmgr_v3handshake_args *, u_int8_t *, size_t, u_int8_t **));
+void __repmgr_v2handshake_marshal __P((ENV *, __repmgr_v2handshake_args *, u_int8_t *));
+int __repmgr_v2handshake_unmarshal __P((ENV *, __repmgr_v2handshake_args *, u_int8_t *, size_t, u_int8_t **));
+void __repmgr_parm_refresh_marshal __P((ENV *, __repmgr_parm_refresh_args *, u_int8_t *));
+int __repmgr_parm_refresh_unmarshal __P((ENV *, __repmgr_parm_refresh_args *, u_int8_t *, size_t, u_int8_t **));
+void __repmgr_permlsn_marshal __P((ENV *, __repmgr_permlsn_args *, u_int8_t *));
+int __repmgr_permlsn_unmarshal __P((ENV *, __repmgr_permlsn_args *, u_int8_t *, size_t, u_int8_t **));
+void __repmgr_version_proposal_marshal __P((ENV *, __repmgr_version_proposal_args *, u_int8_t *));
+int __repmgr_version_proposal_unmarshal __P((ENV *, __repmgr_version_proposal_args *, u_int8_t *, size_t, u_int8_t **));
+void __repmgr_version_confirmation_marshal __P((ENV *, __repmgr_version_confirmation_args *, u_int8_t *));
+int __repmgr_version_confirmation_unmarshal __P((ENV *, __repmgr_version_confirmation_args *, u_int8_t *, size_t, u_int8_t **));
+void __repmgr_msg_hdr_marshal __P((ENV *, __repmgr_msg_hdr_args *, u_int8_t *));
+int __repmgr_msg_hdr_unmarshal __P((ENV *, __repmgr_msg_hdr_args *, u_int8_t *, size_t, u_int8_t **));
+void __repmgr_msg_metadata_marshal __P((ENV *, __repmgr_msg_metadata_args *, u_int8_t *));
+int __repmgr_msg_metadata_unmarshal __P((ENV *, __repmgr_msg_metadata_args *, u_int8_t *, size_t, u_int8_t **));
+int __repmgr_membership_key_marshal __P((ENV *, __repmgr_membership_key_args *, u_int8_t *, size_t, size_t *));
+int __repmgr_membership_key_unmarshal __P((ENV *, __repmgr_membership_key_args *, u_int8_t *, size_t, u_int8_t **));
+void __repmgr_membership_data_marshal __P((ENV *, __repmgr_membership_data_args *, u_int8_t *));
+int __repmgr_membership_data_unmarshal __P((ENV *, __repmgr_membership_data_args *, u_int8_t *, size_t, u_int8_t **));
+void __repmgr_member_metadata_marshal __P((ENV *, __repmgr_member_metadata_args *, u_int8_t *));
+int __repmgr_member_metadata_unmarshal __P((ENV *, __repmgr_member_metadata_args *, u_int8_t *, size_t, u_int8_t **));
+int __repmgr_gm_fwd_marshal __P((ENV *, __repmgr_gm_fwd_args *, u_int8_t *, size_t, size_t *));
+int __repmgr_gm_fwd_unmarshal __P((ENV *, __repmgr_gm_fwd_args *, u_int8_t *, size_t, u_int8_t **));
+void __repmgr_membr_vers_marshal __P((ENV *, __repmgr_membr_vers_args *, u_int8_t *));
+int __repmgr_membr_vers_unmarshal __P((ENV *, __repmgr_membr_vers_args *, u_int8_t *, size_t, u_int8_t **));
+int __repmgr_site_info_marshal __P((ENV *, __repmgr_site_info_args *, u_int8_t *, size_t, size_t *));
+int __repmgr_site_info_unmarshal __P((ENV *, __repmgr_site_info_args *, u_int8_t *, size_t, u_int8_t **));
+void __repmgr_connect_reject_marshal __P((ENV *, __repmgr_connect_reject_args *, u_int8_t *));
+int __repmgr_connect_reject_unmarshal __P((ENV *, __repmgr_connect_reject_args *, u_int8_t *, size_t, u_int8_t **));
+int __repmgr_member_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __repmgr_init_print __P((ENV *, DB_DISTAB *));
+int __repmgr_init_election __P((ENV *, u_int32_t));
+int __repmgr_claim_victory __P((ENV *));
+int __repmgr_turn_on_elections __P((ENV *));
+int __repmgr_start __P((DB_ENV *, int, u_int32_t));
+int __repmgr_valid_config __P((ENV *, u_int32_t));
+int __repmgr_autostart __P((ENV *));
+int __repmgr_start_selector __P((ENV *));
+int __repmgr_close __P((ENV *));
+int __repmgr_stop __P((ENV *));
+int __repmgr_set_ack_policy __P((DB_ENV *, int));
+int __repmgr_get_ack_policy __P((DB_ENV *, int *));
+int __repmgr_env_create __P((ENV *, DB_REP *));
+void __repmgr_env_destroy __P((ENV *, DB_REP *));
+int __repmgr_stop_threads __P((ENV *));
+int __repmgr_local_site __P((DB_ENV *, DB_SITE **));
+int __repmgr_channel __P((DB_ENV *, int, DB_CHANNEL **, u_int32_t));
+int __repmgr_set_msg_dispatch __P((DB_ENV *, void (*)(DB_ENV *, DB_CHANNEL *, DBT *, u_int32_t, u_int32_t), u_int32_t));
+int __repmgr_send_msg __P((DB_CHANNEL *, DBT *, u_int32_t, u_int32_t));
+int __repmgr_send_request __P((DB_CHANNEL *, DBT *, u_int32_t, DBT *, db_timeout_t, u_int32_t));
+int __repmgr_send_response __P((DB_CHANNEL *, DBT *, u_int32_t, u_int32_t));
+int __repmgr_channel_close __P((DB_CHANNEL *, u_int32_t));
+int __repmgr_channel_timeout __P((DB_CHANNEL *, db_timeout_t));
+int __repmgr_send_request_inval __P((DB_CHANNEL *, DBT *, u_int32_t, DBT *, db_timeout_t, u_int32_t));
+int __repmgr_channel_close_inval __P((DB_CHANNEL *, u_int32_t));
+int __repmgr_channel_timeout_inval __P((DB_CHANNEL *, db_timeout_t));
+int __repmgr_join_group __P((ENV *));
+int __repmgr_site __P((DB_ENV *, const char *, u_int, DB_SITE **, u_int32_t));
+int __repmgr_site_by_eid __P((DB_ENV *, int, DB_SITE **));
+int __repmgr_get_site_address __P((DB_SITE *, const char **, u_int *));
+int __repmgr_get_eid __P((DB_SITE *, int *));
+int __repmgr_get_config __P((DB_SITE *, u_int32_t, u_int32_t *));
+int __repmgr_site_config __P((DB_SITE *, u_int32_t, u_int32_t));
+int __repmgr_site_close __P((DB_SITE *));
+void *__repmgr_msg_thread __P((void *));
+int __repmgr_send_err_resp __P((ENV *, CHANNEL *, int));
+int __repmgr_handle_event __P((ENV *, u_int32_t, void *));
+int __repmgr_update_membership __P((ENV *, DB_THREAD_INFO *, int, u_int32_t));
+int __repmgr_set_gm_version __P((ENV *, DB_THREAD_INFO *, DB_TXN *, u_int32_t));
+int __repmgr_setup_gmdb_op __P((ENV *, DB_THREAD_INFO *, DB_TXN **, u_int32_t));
+int __repmgr_cleanup_gmdb_op __P((ENV *, int));
+int __repmgr_hold_master_role __P((ENV *, REPMGR_CONNECTION *));
+int __repmgr_rlse_master_role __P((ENV *));
+void __repmgr_set_sites __P((ENV *));
+int __repmgr_connect __P((ENV *, repmgr_netaddr_t *, REPMGR_CONNECTION **, int *));
+int __repmgr_send __P((DB_ENV *, const DBT *, const DBT *, const DB_LSN *, int, u_int32_t));
+int __repmgr_sync_siteaddr __P((ENV *));
+int __repmgr_send_broadcast __P((ENV *, u_int, const DBT *, const DBT *, u_int *, u_int *, int *));
+int __repmgr_send_one __P((ENV *, REPMGR_CONNECTION *, u_int, const DBT *, const DBT *, db_timeout_t));
+int __repmgr_send_many __P((ENV *, REPMGR_CONNECTION *, REPMGR_IOVECS *, db_timeout_t));
+int __repmgr_send_own_msg __P((ENV *, REPMGR_CONNECTION *, u_int32_t, u_int8_t *, u_int32_t));
+int __repmgr_write_iovecs __P((ENV *, REPMGR_CONNECTION *, REPMGR_IOVECS *, size_t *));
+int __repmgr_bust_connection __P((ENV *, REPMGR_CONNECTION *));
+int __repmgr_disable_connection __P((ENV *, REPMGR_CONNECTION *));
+int __repmgr_cleanup_defunct __P((ENV *, REPMGR_CONNECTION *));
+int __repmgr_close_connection __P((ENV *, REPMGR_CONNECTION *));
+int __repmgr_decr_conn_ref __P((ENV *, REPMGR_CONNECTION *));
+int __repmgr_destroy_conn __P((ENV *, REPMGR_CONNECTION *));
+int __repmgr_pack_netaddr __P((ENV *, const char *, u_int, repmgr_netaddr_t *));
+int __repmgr_getaddr __P((ENV *, const char *, u_int, int, ADDRINFO **));
+int __repmgr_listen __P((ENV *));
+int __repmgr_net_close __P((ENV *));
+void __repmgr_net_destroy __P((ENV *, DB_REP *));
+int __repmgr_thread_start __P((ENV *, REPMGR_RUNNABLE *));
+int __repmgr_thread_join __P((REPMGR_RUNNABLE *));
+int __repmgr_set_nonblock_conn __P((REPMGR_CONNECTION *));
+int __repmgr_set_nonblocking __P((socket_t));
+int __repmgr_wake_waiters __P((ENV *, waiter_t *));
+int __repmgr_await_cond __P((ENV *, PREDICATE, void *, db_timeout_t, waiter_t *));
+int __repmgr_await_gmdbop __P((ENV *));
+void __repmgr_compute_wait_deadline __P((ENV *, struct timespec *, db_timeout_t));
+int __repmgr_await_drain __P((ENV *, REPMGR_CONNECTION *, db_timeout_t));
+int __repmgr_alloc_cond __P((cond_var_t *));
+int __repmgr_free_cond __P((cond_var_t *));
+void __repmgr_env_create_pf __P((DB_REP *));
+int __repmgr_create_mutex_pf __P((mgr_mutex_t *));
+int __repmgr_destroy_mutex_pf __P((mgr_mutex_t *));
+int __repmgr_init __P((ENV *));
+int __repmgr_deinit __P((ENV *));
+int __repmgr_init_waiters __P((ENV *, waiter_t *));
+int __repmgr_destroy_waiters __P((ENV *, waiter_t *));
+int __repmgr_lock_mutex __P((mgr_mutex_t *));
+int __repmgr_unlock_mutex __P((mgr_mutex_t *));
+int __repmgr_signal __P((cond_var_t *));
+int __repmgr_wake_msngers __P((ENV *, u_int));
+int __repmgr_wake_main_thread __P((ENV *));
+int __repmgr_writev __P((socket_t, db_iovec_t *, int, size_t *));
+int __repmgr_readv __P((socket_t, db_iovec_t *, int, size_t *));
+int __repmgr_select_loop __P((ENV *));
+int __repmgr_queue_destroy __P((ENV *));
+int __repmgr_queue_get __P((ENV *, REPMGR_MESSAGE **, REPMGR_RUNNABLE *));
+int __repmgr_queue_put __P((ENV *, REPMGR_MESSAGE *));
+int __repmgr_queue_size __P((ENV *));
+int __repmgr_member_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+void *__repmgr_select_thread __P((void *));
+int __repmgr_bow_out __P((ENV *));
+int __repmgr_accept __P((ENV *));
+int __repmgr_compute_timeout __P((ENV *, db_timespec *));
+REPMGR_SITE *__repmgr_connected_master __P((ENV *));
+int __repmgr_check_timeouts __P((ENV *));
+int __repmgr_first_try_connections __P((ENV *));
+int __repmgr_send_v1_handshake __P((ENV *, REPMGR_CONNECTION *, void *, size_t));
+int __repmgr_read_from_site __P((ENV *, REPMGR_CONNECTION *));
+int __repmgr_read_conn __P((REPMGR_CONNECTION *));
+int __repmgr_prepare_simple_input __P((ENV *, REPMGR_CONNECTION *, __repmgr_msg_hdr_args *));
+int __repmgr_send_handshake __P((ENV *, REPMGR_CONNECTION *, void *, size_t, u_int32_t));
+int __repmgr_find_version_info __P((ENV *, REPMGR_CONNECTION *, DBT *));
+int __repmgr_write_some __P((ENV *, REPMGR_CONNECTION *));
+int __repmgr_stat_pp __P((DB_ENV *, DB_REPMGR_STAT **, u_int32_t));
+int __repmgr_stat_print_pp __P((DB_ENV *, u_int32_t));
+int __repmgr_stat_print __P((ENV *, u_int32_t));
+int __repmgr_site_list __P((DB_ENV *, u_int *, DB_REPMGR_SITE **));
+#ifndef HAVE_REPLICATION_THREADS
+int __repmgr_close __P((ENV *));
+int __repmgr_get_ack_policy __P((DB_ENV *, int *));
+int __repmgr_set_ack_policy __P((DB_ENV *, int));
+int __repmgr_site __P((DB_ENV *, const char *, u_int, DB_SITE **, u_int32_t));
+int __repmgr_site_by_eid __P((DB_ENV *, int, DB_SITE **));
+int __repmgr_local_site __P((DB_ENV *, DB_SITE **));
+int __repmgr_site_list __P((DB_ENV *, u_int *, DB_REPMGR_SITE **));
+int __repmgr_start __P((DB_ENV *, int, u_int32_t));
+int __repmgr_stat_pp __P((DB_ENV *, DB_REPMGR_STAT **, u_int32_t));
+int __repmgr_stat_print_pp __P((DB_ENV *, u_int32_t));
+int __repmgr_handle_event __P((ENV *, u_int32_t, void *));
+int __repmgr_channel __P((DB_ENV *, int, DB_CHANNEL **, u_int32_t));
+int __repmgr_set_msg_dispatch __P((DB_ENV *, void (*)(DB_ENV *, DB_CHANNEL *, DBT *, u_int32_t, u_int32_t), u_int32_t));
+int __repmgr_init_recover __P((ENV *, DB_DISTAB *));
+#endif
+int __repmgr_schedule_connection_attempt __P((ENV *, int, int));
+int __repmgr_is_server __P((ENV *, REPMGR_SITE *));
+void __repmgr_reset_for_reading __P((REPMGR_CONNECTION *));
+int __repmgr_new_connection __P((ENV *, REPMGR_CONNECTION **, socket_t, int));
+int __repmgr_set_keepalive __P((ENV *, REPMGR_CONNECTION *));
+int __repmgr_new_site __P((ENV *, REPMGR_SITE **, const char *, u_int));
+int __repmgr_create_mutex __P((ENV *, mgr_mutex_t **));
+int __repmgr_destroy_mutex __P((ENV *, mgr_mutex_t *));
+void __repmgr_cleanup_netaddr __P((ENV *, repmgr_netaddr_t *));
+void __repmgr_iovec_init __P((REPMGR_IOVECS *));
+void __repmgr_add_buffer __P((REPMGR_IOVECS *, void *, size_t));
+void __repmgr_add_dbt __P((REPMGR_IOVECS *, const DBT *));
+int __repmgr_update_consumed __P((REPMGR_IOVECS *, size_t));
+int __repmgr_prepare_my_addr __P((ENV *, DBT *));
+int __repmgr_get_nsites __P((ENV *, u_int32_t *));
+int __repmgr_thread_failure __P((ENV *, int));
+char *__repmgr_format_eid_loc __P((DB_REP *, REPMGR_CONNECTION *, char *));
+char *__repmgr_format_site_loc __P((REPMGR_SITE *, char *));
+char *__repmgr_format_addr_loc __P((repmgr_netaddr_t *, char *));
+int __repmgr_repstart __P((ENV *, u_int32_t));
+int __repmgr_become_master __P((ENV *));
+int __repmgr_each_connection __P((ENV *, CONNECTION_ACTION, void *, int));
+int __repmgr_open __P((ENV *, void *));
+int __repmgr_join __P((ENV *, void *));
+int __repmgr_env_refresh __P((ENV *));
+int __repmgr_share_netaddrs __P((ENV *, void *, u_int, u_int));
+int __repmgr_copy_in_added_sites __P((ENV *));
+int __repmgr_init_new_sites __P((ENV *, int, int));
+int __repmgr_failchk __P((ENV *));
+int __repmgr_master_is_known __P((ENV *));
+int __repmgr_stable_lsn __P((ENV *, DB_LSN *));
+int __repmgr_send_sync_msg __P((ENV *, REPMGR_CONNECTION *, u_int32_t, u_int8_t *, u_int32_t));
+int __repmgr_marshal_member_list __P((ENV *, u_int8_t **, size_t *));
+int __repmgr_refresh_membership __P((ENV *, u_int8_t *, size_t));
+int __repmgr_reload_gmdb __P((ENV *));
+int __repmgr_gmdb_version_cmp __P((ENV *, u_int32_t, u_int32_t));
+int __repmgr_init_save __P((ENV *, DBT *));
+int __repmgr_init_restore __P((ENV *, DBT *));
+int __repmgr_defer_op __P((ENV *, u_int32_t));
+void __repmgr_fire_conn_err_event __P((ENV *, REPMGR_CONNECTION *, int));
+void __repmgr_print_conn_err __P((ENV *, repmgr_netaddr_t *, int));
+int __repmgr_become_client __P((ENV *));
+REPMGR_SITE *__repmgr_lookup_site __P((ENV *, const char *, u_int));
+int __repmgr_find_site __P((ENV *, const char *, u_int, int *));
+int __repmgr_set_membership __P((ENV *, const char *, u_int, u_int32_t));
+int __repmgr_bcast_parm_refresh __P((ENV *));
+int __repmgr_chg_prio __P((ENV *, u_int32_t, u_int32_t));
+int __repmgr_bcast_own_msg __P((ENV *, u_int32_t, u_int8_t *, size_t));
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_repmgr_ext_h_ */
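The #ifndef HAVE_REPLICATION_THREADS block above redeclares a subset of
the same names because builds without the Replication Manager link stub
versions that fail cleanly at run time. A hedged sketch of that stub
pattern (the error text is illustrative, not the library's):

int
__repmgr_start(DB_ENV *dbenv, int nthreads, u_int32_t flags)
{
	COMPQUIET(nthreads, 0);
	COMPQUIET(flags, 0);
	__db_errx(dbenv->env,
	    "library build did not include replication manager support");
	return (DB_OPNOTSUP);
}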
diff --git a/src/dbinc_auto/sequence_ext.h b/src/dbinc_auto/sequence_ext.h
new file mode 100644
index 00000000..a2c114cf
--- /dev/null
+++ b/src/dbinc_auto/sequence_ext.h
@@ -0,0 +1,16 @@
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _sequence_ext_h_
+#define _sequence_ext_h_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+int __seq_stat __P((DB_SEQUENCE *, DB_SEQUENCE_STAT **, u_int32_t));
+int __seq_stat_print __P((DB_SEQUENCE *, u_int32_t));
+const FN * __db_get_seq_flags_fn __P((void));
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_sequence_ext_h_ */
diff --git a/src/dbinc_auto/tcl_ext.h b/src/dbinc_auto/tcl_ext.h
new file mode 100644
index 00000000..8b076c8b
--- /dev/null
+++ b/src/dbinc_auto/tcl_ext.h
@@ -0,0 +1,134 @@
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _tcl_ext_h_
+#define _tcl_ext_h_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+int bdb_HCommand __P((Tcl_Interp *, int, Tcl_Obj * CONST*));
+#if DB_DBM_HSEARCH != 0
+int bdb_NdbmOpen __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DBM **));
+#endif
+#if DB_DBM_HSEARCH != 0
+int bdb_DbmCommand __P((Tcl_Interp *, int, Tcl_Obj * CONST*, int, DBM *));
+#endif
+int ndbm_Cmd __P((ClientData, Tcl_Interp *, int, Tcl_Obj * CONST*));
+void _DbInfoDelete __P((Tcl_Interp *, DBTCL_INFO *));
+int db_Cmd __P((ClientData, Tcl_Interp *, int, Tcl_Obj * CONST*));
+int tcl_CompactStat __P((Tcl_Interp *, DBTCL_INFO *));
+int tcl_rep_send __P((DB_ENV *, const DBT *, const DBT *, const DB_LSN *, int, u_int32_t));
+int dbc_Cmd __P((ClientData, Tcl_Interp *, int, Tcl_Obj * CONST*));
+int env_Cmd __P((ClientData, Tcl_Interp *, int, Tcl_Obj * CONST*));
+int tcl_EnvRemove __P((Tcl_Interp *, int, Tcl_Obj * CONST*));
+int tcl_EnvClose __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *, DBTCL_INFO *));
+int tcl_EnvIdReset __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_EnvLsnReset __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_EnvVerbose __P((Tcl_Interp *, DB_ENV *, Tcl_Obj *, Tcl_Obj *));
+int tcl_EnvAttr __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_EnvSetFlags __P((Tcl_Interp *, DB_ENV *, Tcl_Obj *, Tcl_Obj *));
+int tcl_EnvTest __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_EnvGetEncryptFlags __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+void tcl_EnvSetErrfile __P((Tcl_Interp *, DB_ENV *, DBTCL_INFO *, char *));
+void tcl_EnvSetMsgfile __P((Tcl_Interp *, DB_ENV *, DBTCL_INFO *, char *));
+int tcl_EnvSetErrpfx __P((Tcl_Interp *, DB_ENV *, DBTCL_INFO *, char *));
+int tcl_EnvStatPrint __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+DBTCL_INFO *_NewInfo __P((Tcl_Interp *, void *, char *, enum INFOTYPE));
+void *_NameToPtr __P((CONST char *));
+DBTCL_INFO *_PtrToInfo __P((CONST void *));
+DBTCL_INFO *_NameToInfo __P((CONST char *));
+void _SetInfoData __P((DBTCL_INFO *, void *));
+void _DeleteInfo __P((DBTCL_INFO *));
+int _SetListElem __P((Tcl_Interp *, Tcl_Obj *, void *, u_int32_t, void *, u_int32_t));
+int _SetListElemInt __P((Tcl_Interp *, Tcl_Obj *, void *, long));
+int _SetListElemWideInt __P((Tcl_Interp *, Tcl_Obj *, void *, int64_t));
+int _SetListRecnoElem __P((Tcl_Interp *, Tcl_Obj *, db_recno_t, u_char *, u_int32_t));
+int _SetListHeapElem __P((Tcl_Interp *, Tcl_Obj *, DB_HEAP_RID, u_char *, u_int32_t));
+int _Set3DBTList __P((Tcl_Interp *, Tcl_Obj *, DBT *, int, DBT *, int, DBT *));
+int _SetMultiList __P((Tcl_Interp *, Tcl_Obj *, DBT *, DBT*, DBTYPE, u_int32_t, DBC*));
+int _GetGlobPrefix __P((char *, char **));
+int _ReturnSetup __P((Tcl_Interp *, int, int, char *));
+int _ErrorSetup __P((Tcl_Interp *, int, char *));
+void _ErrorFunc __P((const DB_ENV *, CONST char *, const char *));
+#ifdef CONFIG_TEST
+void _EventFunc __P((DB_ENV *, u_int32_t, void *));
+#endif
+int _GetLsn __P((Tcl_Interp *, Tcl_Obj *, DB_LSN *));
+int _GetRid __P((Tcl_Interp *, Tcl_Obj *, DB_HEAP_RID *));
+int _GetUInt32 __P((Tcl_Interp *, Tcl_Obj *, u_int32_t *));
+Tcl_Obj *_GetFlagsList __P((Tcl_Interp *, u_int32_t, const FN *));
+void _debug_check __P((void));
+int _CopyObjBytes __P((Tcl_Interp *, Tcl_Obj *, void *, u_int32_t *, int *));
+int tcl_LockDetect __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_LockGet __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_LockStat __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_LockStatPrint __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_LockTimeout __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_LockVec __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_LogArchive __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_LogCompare __P((Tcl_Interp *, int, Tcl_Obj * CONST*));
+int tcl_LogFile __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_LogFlush __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_LogGet __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_LogPut __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_LogStat __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_LogStatPrint __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int logc_Cmd __P((ClientData, Tcl_Interp *, int, Tcl_Obj * CONST*));
+int tcl_LogConfig __P((Tcl_Interp *, DB_ENV *, Tcl_Obj *, Tcl_Obj *));
+int tcl_LogGetConfig __P((Tcl_Interp *, DB_ENV *, Tcl_Obj *));
+void _MpInfoDelete __P((Tcl_Interp *, DBTCL_INFO *));
+int tcl_MpSync __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_MpTrickle __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_Mp __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *, DBTCL_INFO *));
+int tcl_MpStat __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_MpStatPrint __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_Mutex __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_MutFree __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_MutGet __P((Tcl_Interp *, DB_ENV *, int));
+int tcl_MutLock __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_MutSet __P((Tcl_Interp *, Tcl_Obj *, DB_ENV *, int));
+int tcl_MutStat __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_MutStatPrint __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_MutUnlock __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_RepConfig __P((Tcl_Interp *, DB_ENV *, Tcl_Obj *));
+int tcl_RepGetTwo __P((Tcl_Interp *, DB_ENV *, int));
+int tcl_RepGetConfig __P((Tcl_Interp *, DB_ENV *, Tcl_Obj *));
+int tcl_RepGetTimeout __P((Tcl_Interp *, DB_ENV *, Tcl_Obj *));
+int tcl_RepGetAckPolicy __P((Tcl_Interp *, int, Tcl_Obj * CONST *, DB_ENV *));
+int tcl_RepGetLocalSite __P((Tcl_Interp *, int, Tcl_Obj * CONST *, DB_ENV *));
+int tcl_RepElect __P((Tcl_Interp *, int, Tcl_Obj * CONST *, DB_ENV *));
+int tcl_RepFlush __P((Tcl_Interp *, int, Tcl_Obj * CONST *, DB_ENV *));
+int tcl_RepSync __P((Tcl_Interp *, int, Tcl_Obj * CONST *, DB_ENV *));
+int tcl_RepLease __P((Tcl_Interp *, int, Tcl_Obj * CONST *, DB_ENV *));
+int tcl_RepInmemFiles __P((Tcl_Interp *, DB_ENV *));
+int tcl_RepLimit __P((Tcl_Interp *, int, Tcl_Obj * CONST *, DB_ENV *));
+int tcl_RepNSites __P((Tcl_Interp *, int, Tcl_Obj * CONST *, DB_ENV *));
+int tcl_RepRequest __P((Tcl_Interp *, int, Tcl_Obj * CONST *, DB_ENV *));
+int tcl_RepNoarchiveTimeout __P((Tcl_Interp *, DB_ENV *));
+int tcl_RepTransport __P((Tcl_Interp *, int, Tcl_Obj * CONST *, DB_ENV *, DBTCL_INFO *));
+int tcl_RepStart __P((Tcl_Interp *, int, Tcl_Obj * CONST *, DB_ENV *));
+int tcl_RepProcessMessage __P((Tcl_Interp *, int, Tcl_Obj * CONST *, DB_ENV *));
+int tcl_RepStat __P((Tcl_Interp *, int, Tcl_Obj * CONST *, DB_ENV *));
+int tcl_RepStatPrint __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_RepMgr __P((Tcl_Interp *, int, Tcl_Obj * CONST *, DB_ENV *));
+int tcl_RepMgrSiteList __P((Tcl_Interp *, int, Tcl_Obj * CONST *, DB_ENV *));
+int tcl_RepMgrStat __P((Tcl_Interp *, int, Tcl_Obj * CONST *, DB_ENV *));
+int tcl_RepMgrStatPrint __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_RepApplied __P((Tcl_Interp *, int, Tcl_Obj * CONST *, DB_ENV *));
+int seq_Cmd __P((ClientData, Tcl_Interp *, int, Tcl_Obj * CONST*));
+void _TxnInfoDelete __P((Tcl_Interp *, DBTCL_INFO *));
+int tcl_TxnCheckpoint __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_Txn __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *, DBTCL_INFO *));
+int tcl_CDSGroup __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *, DBTCL_INFO *));
+int tcl_TxnStat __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_TxnStatPrint __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_TxnTimeout __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_TxnRecover __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *, DBTCL_INFO *));
+int bdb_RandCommand __P((Tcl_Interp *, int, Tcl_Obj * CONST*));
+int tcl_LockMutex __P((DB_ENV *, db_mutex_t));
+int tcl_UnlockMutex __P((DB_ENV *, db_mutex_t));
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_tcl_ext_h_ */
diff --git a/src/dbinc_auto/txn_auto.h b/src/dbinc_auto/txn_auto.h
new file mode 100644
index 00000000..48cb066d
--- /dev/null
+++ b/src/dbinc_auto/txn_auto.h
@@ -0,0 +1,220 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#ifndef __txn_AUTO_H
+#define __txn_AUTO_H
+#include "dbinc/log.h"
+#define DB___txn_regop_42 10
+typedef struct ___txn_regop_42_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ u_int32_t opcode;
+ int32_t timestamp;
+ DBT locks;
+} __txn_regop_42_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __txn_regop_42_desc[];
+static inline int __txn_regop_42_read(ENV *env,
+ void *data, __txn_regop_42_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __txn_regop_42_desc, sizeof(__txn_regop_42_args), (void**)arg));
+}
+#define DB___txn_regop 10
+typedef struct ___txn_regop_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ u_int32_t opcode;
+ int32_t timestamp;
+ u_int32_t envid;
+ DBT locks;
+} __txn_regop_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __txn_regop_desc[];
+static inline int
+__txn_regop_log(ENV *env, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags,
+ u_int32_t opcode, int32_t timestamp, u_int32_t envid, const DBT *locks)
+{
+ return (__log_put_record(env, NULL, txnp, ret_lsnp,
+ flags, DB___txn_regop, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
+ LOG_DBT_SIZE(locks),
+ __txn_regop_desc,
+ opcode, timestamp, envid, locks));
+}
+
+static inline int __txn_regop_read(ENV *env,
+ void *data, __txn_regop_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __txn_regop_desc, sizeof(__txn_regop_args), (void**)arg));
+}
+#define DB___txn_ckp_42 11
+typedef struct ___txn_ckp_42_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ DB_LSN ckp_lsn;
+ DB_LSN last_ckp;
+ int32_t timestamp;
+ u_int32_t rep_gen;
+} __txn_ckp_42_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __txn_ckp_42_desc[];
+static inline int __txn_ckp_42_read(ENV *env,
+ void *data, __txn_ckp_42_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __txn_ckp_42_desc, sizeof(__txn_ckp_42_args), (void**)arg));
+}
+#define DB___txn_ckp 11
+typedef struct ___txn_ckp_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ DB_LSN ckp_lsn;
+ DB_LSN last_ckp;
+ int32_t timestamp;
+ u_int32_t envid;
+ u_int32_t spare;
+} __txn_ckp_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __txn_ckp_desc[];
+static inline int
+__txn_ckp_log(ENV *env, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags,
+ DB_LSN * ckp_lsn, DB_LSN * last_ckp, int32_t timestamp, u_int32_t envid, u_int32_t spare)
+{
+ return (__log_put_record(env, NULL, txnp, ret_lsnp,
+ flags, DB___txn_ckp, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(*ckp_lsn) + sizeof(*last_ckp) + sizeof(u_int32_t) +
+ sizeof(u_int32_t) + sizeof(u_int32_t),
+ __txn_ckp_desc,
+ ckp_lsn, last_ckp, timestamp, envid, spare));
+}
+
+static inline int __txn_ckp_read(ENV *env,
+ void *data, __txn_ckp_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __txn_ckp_desc, sizeof(__txn_ckp_args), (void**)arg));
+}
+#define DB___txn_child 12
+typedef struct ___txn_child_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ u_int32_t child;
+ DB_LSN c_lsn;
+} __txn_child_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __txn_child_desc[];
+static inline int
+__txn_child_log(ENV *env, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags,
+ u_int32_t child, DB_LSN * c_lsn)
+{
+ return (__log_put_record(env, NULL, txnp, ret_lsnp,
+ flags, DB___txn_child, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(*c_lsn),
+ __txn_child_desc,
+ child, c_lsn));
+}
+
+static inline int __txn_child_read(ENV *env,
+ void *data, __txn_child_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __txn_child_desc, sizeof(__txn_child_args), (void**)arg));
+}
+#define DB___txn_xa_regop_42 13
+typedef struct ___txn_xa_regop_42_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ u_int32_t opcode;
+ DBT xid;
+ int32_t formatID;
+ u_int32_t gtrid;
+ u_int32_t bqual;
+ DB_LSN begin_lsn;
+ DBT locks;
+} __txn_xa_regop_42_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __txn_xa_regop_42_desc[];
+static inline int __txn_xa_regop_42_read(ENV *env,
+ void *data, __txn_xa_regop_42_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __txn_xa_regop_42_desc, sizeof(__txn_xa_regop_42_args), (void**)arg));
+}
+#define DB___txn_prepare 13
+typedef struct ___txn_prepare_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ u_int32_t opcode;
+ DBT gid;
+ DB_LSN begin_lsn;
+ DBT locks;
+} __txn_prepare_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __txn_prepare_desc[];
+static inline int
+__txn_prepare_log(ENV *env, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags,
+ u_int32_t opcode, const DBT *gid, DB_LSN * begin_lsn, const DBT *locks)
+{
+ return (__log_put_record(env, NULL, txnp, ret_lsnp,
+ flags, DB___txn_prepare, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + LOG_DBT_SIZE(gid) + sizeof(*begin_lsn) +
+ LOG_DBT_SIZE(locks),
+ __txn_prepare_desc,
+ opcode, gid, begin_lsn, locks));
+}
+
+static inline int __txn_prepare_read(ENV *env,
+ void *data, __txn_prepare_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __txn_prepare_desc, sizeof(__txn_prepare_args), (void**)arg));
+}
+#define DB___txn_recycle 14
+typedef struct ___txn_recycle_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ u_int32_t min;
+ u_int32_t max;
+} __txn_recycle_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __txn_recycle_desc[];
+static inline int
+__txn_recycle_log(ENV *env, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags,
+ u_int32_t min, u_int32_t max)
+{
+ return (__log_put_record(env, NULL, txnp, ret_lsnp,
+ flags, DB___txn_recycle, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ sizeof(u_int32_t) + sizeof(u_int32_t),
+ __txn_recycle_desc,
+ min, max));
+}
+
+static inline int __txn_recycle_read(ENV *env,
+ void *data, __txn_recycle_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __txn_recycle_desc, sizeof(__txn_recycle_args), (void**)arg));
+}
+#endif
diff --git a/src/dbinc_auto/txn_ext.h b/src/dbinc_auto/txn_ext.h
new file mode 100644
index 00000000..7c21455f
--- /dev/null
+++ b/src/dbinc_auto/txn_ext.h
@@ -0,0 +1,93 @@
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _txn_ext_h_
+#define _txn_ext_h_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+int __txn_begin_pp __P((DB_ENV *, DB_TXN *, DB_TXN **, u_int32_t));
+int __txn_begin __P((ENV *, DB_THREAD_INFO *, DB_TXN *, DB_TXN **, u_int32_t));
+int __txn_recycle_id __P((ENV *, int));
+int __txn_continue __P((ENV *, DB_TXN *, TXN_DETAIL *, DB_THREAD_INFO *, int));
+int __txn_commit __P((DB_TXN *, u_int32_t));
+int __txn_abort __P((DB_TXN *));
+int __txn_discard_int __P((DB_TXN *, u_int32_t flags));
+int __txn_prepare __P((DB_TXN *, u_int8_t *));
+u_int32_t __txn_id __P((DB_TXN *));
+int __txn_get_name __P((DB_TXN *, const char **));
+int __txn_set_name __P((DB_TXN *, const char *));
+int __txn_get_priority __P((DB_TXN *, u_int32_t *));
+int __txn_set_priority __P((DB_TXN *, u_int32_t));
+int __txn_set_timeout __P((DB_TXN *, db_timeout_t, u_int32_t));
+int __txn_activekids __P((ENV *, u_int32_t, DB_TXN *));
+int __txn_force_abort __P((ENV *, u_int8_t *));
+int __txn_preclose __P((ENV *));
+int __txn_reset __P((ENV *));
+int __txn_applied_pp __P((DB_ENV *, DB_TXN_TOKEN *, db_timeout_t, u_int32_t));
+int __txn_init_recover __P((ENV *, DB_DISTAB *));
+int __txn_regop_42_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __txn_regop_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __txn_ckp_42_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __txn_ckp_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __txn_child_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __txn_xa_regop_42_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __txn_prepare_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __txn_recycle_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __txn_init_print __P((ENV *, DB_DISTAB *));
+int __txn_checkpoint_pp __P((DB_ENV *, u_int32_t, u_int32_t, u_int32_t));
+int __txn_checkpoint __P((ENV *, u_int32_t, u_int32_t, u_int32_t));
+int __txn_getactive __P((ENV *, DB_LSN *));
+int __txn_getckp __P((ENV *, DB_LSN *));
+int __txn_updateckp __P((ENV *, DB_LSN *));
+int __txn_failchk __P((ENV *));
+int __txn_env_create __P((DB_ENV *));
+void __txn_env_destroy __P((DB_ENV *));
+int __txn_get_tx_max __P((DB_ENV *, u_int32_t *));
+int __txn_set_tx_max __P((DB_ENV *, u_int32_t));
+int __txn_get_tx_timestamp __P((DB_ENV *, time_t *));
+int __txn_set_tx_timestamp __P((DB_ENV *, time_t *));
+int __txn_regop_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __txn_prepare_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __txn_ckp_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __txn_child_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __txn_restore_txn __P((ENV *, DB_LSN *, __txn_prepare_args *));
+int __txn_recycle_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __txn_regop_42_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __txn_ckp_42_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __txn_recover_pp __P((DB_ENV *, DB_PREPLIST *, long, long *, u_int32_t));
+int __txn_recover __P((ENV *, DB_PREPLIST *, long, long *, u_int32_t));
+int __txn_get_prepared __P((ENV *, XID *, DB_PREPLIST *, long, long *, u_int32_t));
+int __txn_openfiles __P((ENV *, DB_THREAD_INFO *, DB_LSN *, int));
+int __txn_open __P((ENV *));
+int __txn_findlastckp __P((ENV *, DB_LSN *, DB_LSN *));
+int __txn_env_refresh __P((ENV *));
+u_int32_t __txn_region_mutex_count __P((ENV *));
+u_int32_t __txn_region_mutex_max __P((ENV *));
+size_t __txn_region_size __P((ENV *));
+size_t __txn_region_max __P((ENV *));
+int __txn_id_set __P((ENV *, u_int32_t, u_int32_t));
+int __txn_oldest_reader __P((ENV *, DB_LSN *));
+int __txn_add_buffer __P((ENV *, TXN_DETAIL *));
+int __txn_remove_buffer __P((ENV *, TXN_DETAIL *, db_mutex_t));
+int __txn_stat_pp __P((DB_ENV *, DB_TXN_STAT **, u_int32_t));
+int __txn_stat_print_pp __P((DB_ENV *, u_int32_t));
+int __txn_stat_print __P((ENV *, u_int32_t));
+int __txn_closeevent __P((ENV *, DB_TXN *, DB *));
+int __txn_remevent __P((ENV *, DB_TXN *, const char *, u_int8_t *, int));
+void __txn_remrem __P((ENV *, DB_TXN *, const char *));
+int __txn_lockevent __P((ENV *, DB_TXN *, DB *, DB_LOCK *, DB_LOCKER *));
+void __txn_remlock __P((ENV *, DB_TXN *, DB_LOCK *, DB_LOCKER *));
+int __txn_doevents __P((ENV *, DB_TXN *, int, int));
+int __txn_record_fname __P((ENV *, DB_TXN *, FNAME *));
+int __txn_dref_fname __P((ENV *, DB_TXN *));
+void __txn_reset_fe_watermarks __P((DB_TXN *));
+void __txn_remove_fe_watermark __P((DB_TXN *,DB *));
+void __txn_add_fe_watermark __P((DB_TXN *, DB *, db_pgno_t));
+int __txn_flush_fe_files __P((DB_TXN *));
+int __txn_pg_above_fe_watermark __P((DB_TXN*, MPOOLFILE*, db_pgno_t));
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_txn_ext_h_ */
diff --git a/src/dbinc_auto/xa_ext.h b/src/dbinc_auto/xa_ext.h
new file mode 100644
index 00000000..47a167f9
--- /dev/null
+++ b/src/dbinc_auto/xa_ext.h
@@ -0,0 +1,18 @@
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _xa_ext_h_
+#define _xa_ext_h_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+int __db_rmid_to_env __P((int, ENV **));
+int __db_xid_to_txn __P((ENV *, XID *, TXN_DETAIL **));
+void __db_map_rmid __P((int, ENV *));
+int __db_unmap_rmid __P((int));
+void __db_unmap_xid __P((ENV *, XID *, size_t));
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_xa_ext_h_ */
diff --git a/src/dbreg/dbreg.c b/src/dbreg/dbreg.c
new file mode 100644
index 00000000..5067edac
--- /dev/null
+++ b/src/dbreg/dbreg.c
@@ -0,0 +1,1012 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/log.h"
+#include "dbinc/txn.h"
+#include "dbinc/db_am.h"
+
+static int __dbreg_push_id __P((ENV *, int32_t));
+static int __dbreg_pop_id __P((ENV *, int32_t *));
+static int __dbreg_pluck_id __P((ENV *, int32_t));
+
+/*
+ * The dbreg subsystem, as its name implies, registers database handles so
+ * that we can associate log messages with them without logging a filename
+ * or a full, unique DB ID. Instead, we assign each dbp an int32_t which is
+ * easy and cheap to log, and use this subsystem to map back and forth.
+ *
+ * Overview of how dbreg ids are managed:
+ *
+ * OPEN
+ * dbreg_setup (Creates FNAME struct.)
+ * dbreg_new_id (Assigns new ID to dbp and logs it. May be postponed
+ * until we attempt to log something else using that dbp, if the dbp
+ * was opened on a replication client.)
+ *
+ * CLOSE
+ * dbreg_close_id (Logs closure of dbp/revocation of ID.)
+ * dbreg_revoke_id (As name implies, revokes ID.)
+ * dbreg_teardown (Destroys FNAME.)
+ *
+ * RECOVERY
+ * dbreg_setup
+ * dbreg_assign_id (Assigns a particular ID we have in the log to a dbp.)
+ *
+ * sometimes: dbreg_revoke_id; dbreg_teardown
+ * other times: normal close path
+ *
+ * A note about locking:
+ *
+ * FNAME structures are referenced only by their corresponding dbp's
+ * until they have a valid id.
+ *
+ * Once they have a valid id, they must get linked into the log
+ * region list so they can get logged on checkpoints.
+ *
+ * An FNAME that may/does have a valid id must be accessed under
+ * protection of the mtx_filelist, with the following exception:
+ *
+ * We don't want to have to grab the mtx_filelist on every log
+ * record, and it should be safe not to do so when we're just
+ * looking at the id, because once allocated, the id should
+ * not change under a handle until the handle is closed.
+ *
+ * If a handle is closed during an attempt by another thread to
+ * log with it, well, the application doing the close deserves to
+ * go down in flames and a lot else is about to fail anyway.
+ *
+ * When in the course of logging we encounter an invalid id
+ * and go to allocate it lazily, we *do* need to check again
+ * after grabbing the mutex, because it's possible to race with
+ * another thread that has also decided that it needs to allocate
+ * an id lazily.
+ *
+ * See SR #5623 for further discussion of the new dbreg design.
+ */
+
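+/*
+ * To make the lifecycle above concrete, here is an illustrative (not
+ * literal) call sequence for a logged handle, error handling omitted:
+ *
+ *    __dbreg_setup(dbp, "a.db", NULL, txnid);    create the FNAME
+ *    __dbreg_new_id(dbp, txn);                   assign and log an id
+ *    ... log records refer to the handle by fnp->id ...
+ *    __dbreg_close_id(dbp, txn, DBREG_CLOSE);    log the close, revoke
+ *                                                the id, tear down FNAME
+ */
+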
+/*
+ * __dbreg_setup --
+ * Allocate and initialize an FNAME structure. The FNAME structures
+ * live in the log shared region and map one-to-one with open database handles.
+ * When the handle needs to be logged, the FNAME should have a valid fid
+ * allocated. If the handle currently isn't logged, it still has an FNAME
+ * entry. If we later discover that the handle needs to be logged, we can
+ * allocate an id for it later. (This happens when the handle is on a
+ * replication client that later becomes a master.)
+ *
+ * PUBLIC: int __dbreg_setup __P((DB *, const char *, const char *, u_int32_t));
+ */
+int
+__dbreg_setup(dbp, fname, dname, create_txnid)
+ DB *dbp;
+ const char *fname, *dname;
+ u_int32_t create_txnid;
+{
+ DB_LOG *dblp;
+ ENV *env;
+ FNAME *fnp;
+#ifdef HAVE_STATISTICS
+ LOG *lp;
+#endif
+ REGINFO *infop;
+ int ret;
+ size_t len;
+ void *p;
+
+ env = dbp->env;
+ dblp = env->lg_handle;
+ infop = &dblp->reginfo;
+
+ fnp = NULL;
+ p = NULL;
+
+ /* Allocate an FNAME and, if necessary, a buffer for the name itself. */
+ LOG_SYSTEM_LOCK(env);
+ if ((ret = __env_alloc(infop, sizeof(FNAME), &fnp)) != 0)
+ goto err;
+
+#ifdef HAVE_STATISTICS
+ lp = dblp->reginfo.primary;
+ if (++lp->stat.st_nfileid > lp->stat.st_maxnfileid)
+ lp->stat.st_maxnfileid = lp->stat.st_nfileid;
+#endif
+
+ memset(fnp, 0, sizeof(FNAME));
+ if (fname == NULL)
+ fnp->fname_off = INVALID_ROFF;
+ else {
+ len = strlen(fname) + 1;
+ if ((ret = __env_alloc(infop, len, &p)) != 0)
+ goto err;
+ fnp->fname_off = R_OFFSET(infop, p);
+ memcpy(p, fname, len);
+ }
+ if (dname == NULL)
+ fnp->dname_off = INVALID_ROFF;
+ else {
+ len = strlen(dname) + 1;
+ if ((ret = __env_alloc(infop, len, &p)) != 0)
+ goto err;
+ fnp->dname_off = R_OFFSET(infop, p);
+ memcpy(p, dname, len);
+ }
+ LOG_SYSTEM_UNLOCK(env);
+
+ /*
+ * Fill in all the remaining info that we'll need later to register
+ * the file, if we use it for logging.
+ */
+ fnp->id = fnp->old_id = DB_LOGFILEID_INVALID;
+ fnp->s_type = dbp->type;
+ memcpy(fnp->ufid, dbp->fileid, DB_FILE_ID_LEN);
+ fnp->meta_pgno = dbp->meta_pgno;
+ fnp->create_txnid = create_txnid;
+ dbp->dbenv->thread_id(dbp->dbenv, &fnp->pid, NULL);
+
+ if (F_ISSET(dbp, DB_AM_INMEM))
+ F_SET(fnp, DB_FNAME_INMEM);
+ if (F_ISSET(dbp, DB_AM_RECOVER))
+ F_SET(fnp, DB_FNAME_RECOVER);
+ /*
+ * The DB is big-endian if its bytes are swapped XOR
+ * the machine is big-endian.
+ */
+ if ((F_ISSET(dbp, DB_AM_SWAP) != 0) ^
+ (F_ISSET(env, ENV_LITTLEENDIAN) == 0))
+ F_SET(fnp, DBREG_BIGEND);
+ if (F_ISSET(dbp, DB_AM_CHKSUM))
+ F_SET(fnp, DBREG_CHKSUM);
+ if (F_ISSET(dbp, DB_AM_ENCRYPT))
+ F_SET(fnp, DBREG_ENCRYPT);
+ if (F2_ISSET(dbp, DB2_AM_EXCL))
+ F_SET(fnp, DBREG_EXCL);
+ fnp->txn_ref = 1;
+ fnp->mutex = dbp->mutex;
+
+ dbp->log_filename = fnp;
+
+ return (0);
+
+err: LOG_SYSTEM_UNLOCK(env);
+ if (ret == ENOMEM)
+ __db_errx(env, DB_STR("1501",
+ "Logging region out of memory; you may need to increase its size"));
+
+ return (ret);
+}
+
+/*
+ * __dbreg_teardown --
+ * Destroy a DB handle's FNAME struct. This is only called when closing
+ * the DB.
+ *
+ * PUBLIC: int __dbreg_teardown __P((DB *));
+ */
+int
+__dbreg_teardown(dbp)
+ DB *dbp;
+{
+ int ret;
+
+ /*
+ * We may not have an FNAME if we were never opened. This is not an
+ * error.
+ */
+ if (dbp->log_filename == NULL)
+ return (0);
+
+ ret = __dbreg_teardown_int(dbp->env, dbp->log_filename);
+
+ /* We freed the copy of the mutex from the FNAME. */
+ dbp->log_filename = NULL;
+ dbp->mutex = MUTEX_INVALID;
+
+ return (ret);
+}
+
+/*
+ * __dbreg_teardown_int --
+ * Destroy an FNAME struct.
+ *
+ * PUBLIC: int __dbreg_teardown_int __P((ENV *, FNAME *));
+ */
+int
+__dbreg_teardown_int(env, fnp)
+ ENV *env;
+ FNAME *fnp;
+{
+ DB_LOG *dblp;
+#ifdef HAVE_STATISTICS
+ LOG *lp;
+#endif
+ REGINFO *infop;
+ int ret;
+
+ if (F_ISSET(fnp, DB_FNAME_NOTLOGGED))
+ return (0);
+ dblp = env->lg_handle;
+ infop = &dblp->reginfo;
+#ifdef HAVE_STATISTICS
+ lp = dblp->reginfo.primary;
+#endif
+
+ DB_ASSERT(env, fnp->id == DB_LOGFILEID_INVALID);
+ ret = __mutex_free(env, &fnp->mutex);
+
+ LOG_SYSTEM_LOCK(env);
+ if (fnp->fname_off != INVALID_ROFF)
+ __env_alloc_free(infop, R_ADDR(infop, fnp->fname_off));
+ if (fnp->dname_off != INVALID_ROFF)
+ __env_alloc_free(infop, R_ADDR(infop, fnp->dname_off));
+ __env_alloc_free(infop, fnp);
+ STAT(lp->stat.st_nfileid--);
+ LOG_SYSTEM_UNLOCK(env);
+
+ return (ret);
+}
+
+/*
+ * __dbreg_new_id --
+ * Get an unused dbreg id for this database handle.
+ * Used as a wrapper to acquire the mutex and
+ * only set the id on success.
+ *
+ * PUBLIC: int __dbreg_new_id __P((DB *, DB_TXN *));
+ */
+int
+__dbreg_new_id(dbp, txn)
+ DB *dbp;
+ DB_TXN *txn;
+{
+ DB_LOG *dblp;
+ ENV *env;
+ FNAME *fnp;
+ LOG *lp;
+ int32_t id;
+ int ret;
+
+ env = dbp->env;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ fnp = dbp->log_filename;
+
+ /* The mtx_filelist protects the FNAME list and id management. */
+ MUTEX_LOCK(env, lp->mtx_filelist);
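+ /*
+ * Re-check under the mutex: another thread may have allocated an
+ * id for this handle while we waited (see the race discussion in
+ * the comment at the top of this file).
+ */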
+ if (fnp->id != DB_LOGFILEID_INVALID) {
+ MUTEX_UNLOCK(env, lp->mtx_filelist);
+ return (0);
+ }
+ if ((ret = __dbreg_get_id(dbp, txn, &id)) == 0)
+ fnp->id = id;
+ MUTEX_UNLOCK(env, lp->mtx_filelist);
+ return (ret);
+}
+
+/*
+ * __dbreg_get_id --
+ * Assign an unused dbreg id to this database handle.
+ * Assume the caller holds the mtx_filelist locked. Assume the
+ * caller will set the fnp->id field with the id we return.
+ *
+ * PUBLIC: int __dbreg_get_id __P((DB *, DB_TXN *, int32_t *));
+ */
+int
+__dbreg_get_id(dbp, txn, idp)
+ DB *dbp;
+ DB_TXN *txn;
+ int32_t *idp;
+{
+ DB_LOG *dblp;
+ ENV *env;
+ FNAME *fnp;
+ LOG *lp;
+ int32_t id;
+ int ret;
+
+ env = dbp->env;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ fnp = dbp->log_filename;
+
+ /*
+ * It's possible that after deciding we needed to call this function,
+ * someone else allocated an ID before we grabbed the lock. Check
+ * to make sure there was no race and we have something useful to do.
+ */
+ /* Get an unused ID from the free list. */
+ if ((ret = __dbreg_pop_id(env, &id)) != 0)
+ goto err;
+
+ /* If no ID was found, allocate a new one. */
+ if (id == DB_LOGFILEID_INVALID)
+ id = lp->fid_max++;
+
+ /* If the file is durable (i.e., not flagged not-durable), mark it as such. */
+ if (!F_ISSET(dbp, DB_AM_NOT_DURABLE))
+ F_SET(fnp, DB_FNAME_DURABLE);
+
+ /* Hook the FNAME into the list of open files. */
+ SH_TAILQ_INSERT_HEAD(&lp->fq, fnp, q, __fname);
+
+ /*
+ * Log the registry. We should only request a new ID in situations
+ * where logging is reasonable.
+ */
+ DB_ASSERT(env, !F_ISSET(dbp, DB_AM_RECOVER));
+
+ if ((ret = __dbreg_log_id(dbp, txn, id, 0)) != 0)
+ goto err;
+
+ /*
+ * Once we log the create_txnid, we need to make sure we never
+ * log it again (as might happen if this is a replication client
+ * that later upgrades to a master).
+ */
+ fnp->create_txnid = TXN_INVALID;
+
+ DB_ASSERT(env, dbp->type == fnp->s_type);
+ DB_ASSERT(env, dbp->meta_pgno == fnp->meta_pgno);
+
+ if ((ret = __dbreg_add_dbentry(env, dblp, dbp, id)) != 0)
+ goto err;
+ /*
+ * If we have a successful call, set the ID. Otherwise
+ * we have to revoke it and remove it from all the lists
+ * it has been added to, and return an invalid id.
+ */
+err:
+ if (ret != 0 && id != DB_LOGFILEID_INVALID) {
+ (void)__dbreg_revoke_id(dbp, 1, id);
+ id = DB_LOGFILEID_INVALID;
+ }
+ *idp = id;
+ return (ret);
+}
+
+/*
+ * __dbreg_assign_id --
+ * Assign a particular dbreg id to this database handle.
+ *
+ * PUBLIC: int __dbreg_assign_id __P((DB *, int32_t, int));
+ */
+int
+__dbreg_assign_id(dbp, id, deleted)
+ DB *dbp;
+ int32_t id;
+ int deleted;
+{
+ DB *close_dbp;
+ DB_LOG *dblp;
+ ENV *env;
+ FNAME *close_fnp, *fnp;
+ LOG *lp;
+ int ret;
+
+ env = dbp->env;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ fnp = dbp->log_filename;
+
+ close_dbp = NULL;
+ close_fnp = NULL;
+
+ /* The mtx_filelist protects the FNAME list and id management. */
+ MUTEX_LOCK(env, lp->mtx_filelist);
+
+ /* We should only call this on DB handles that have no ID. */
+ DB_ASSERT(env, fnp->id == DB_LOGFILEID_INVALID);
+
+ /*
+ * Make sure there isn't already a file open with this ID. There can
+ * be in recovery, if we're recovering across a point where an ID got
+ * reused.
+ */
+ if (__dbreg_id_to_fname(dblp, id, 1, &close_fnp) == 0) {
+ /*
+ * We want to save off any dbp we have open with this id. We
+ * can't safely close it now, because we hold the mtx_filelist,
+ * but we should be able to rely on it being open in this
+ * process, and we're running recovery, so no other thread
+ * should muck with it if we just put off closing it until
+ * we're ready to return.
+ *
+ * Once we have the dbp, revoke its id; we're about to
+ * reuse it.
+ */
+ ret = __dbreg_id_to_db(env, NULL, &close_dbp, id, 0);
+ if (ret == ENOENT) {
+ ret = 0;
+ goto cont;
+ } else if (ret != 0)
+ goto err;
+
+ if ((ret = __dbreg_revoke_id(close_dbp, 1,
+ DB_LOGFILEID_INVALID)) != 0)
+ goto err;
+ }
+
+ /*
+ * Remove this ID from the free list, if it's there, and make sure
+ * we don't allocate it anew.
+ */
+cont: if ((ret = __dbreg_pluck_id(env, id)) != 0)
+ goto err;
+ if (id >= lp->fid_max)
+ lp->fid_max = id + 1;
+
+ /* Now go ahead and assign the id to our dbp. */
+ fnp->id = id;
+ /* If the file is durable (i.e., not flagged not-durable), mark it as such. */
+ if (!F_ISSET(dbp, DB_AM_NOT_DURABLE))
+ F_SET(fnp, DB_FNAME_DURABLE);
+ SH_TAILQ_INSERT_HEAD(&lp->fq, fnp, q, __fname);
+
+ /*
+ * If we get an error adding the dbentry, revoke the id.
+ * We void the return value since we want to retain and
+ * return the original error in ret anyway.
+ */
+ if ((ret = __dbreg_add_dbentry(env, dblp, dbp, id)) != 0)
+ (void)__dbreg_revoke_id(dbp, 1, id);
+ else
+ dblp->dbentry[id].deleted = deleted;
+
+err: MUTEX_UNLOCK(env, lp->mtx_filelist);
+
+ /* There's nothing useful that our caller can do if this close fails. */
+ if (close_dbp != NULL)
+ (void)__db_close(close_dbp, NULL, DB_NOSYNC);
+
+ return (ret);
+}
+
+/*
+ * __dbreg_revoke_id --
+ * Take a log id away from a dbp, in preparation for closing it,
+ * but without logging the close.
+ *
+ * PUBLIC: int __dbreg_revoke_id __P((DB *, int, int32_t));
+ */
+int
+__dbreg_revoke_id(dbp, have_lock, force_id)
+ DB *dbp;
+ int have_lock;
+ int32_t force_id;
+{
+ DB_REP *db_rep;
+ ENV *env;
+ int push;
+
+ env = dbp->env;
+
+ /*
+ * If we are not in recovery but the file was opened for a recovery
+ * operation, then this process aborted a transaction for another
+ * process and the id may still be in use, so don't reuse this id.
+ * If our fid generation in replication has changed, this fid
+ * should not be reused
+ */
+ db_rep = env->rep_handle;
+ push = (!F_ISSET(dbp, DB_AM_RECOVER) || IS_RECOVERING(env)) &&
+ (!REP_ON(env) || ((REP *)db_rep->region)->gen == dbp->fid_gen);
+
+ return (__dbreg_revoke_id_int(dbp->env,
+ dbp->log_filename, have_lock, push, force_id));
+}
+
+/*
+ * __dbreg_revoke_id_int --
+ * Revoke a log, in preparation for closing it, but without logging
+ * the close.
+ *
+ * PUBLIC: int __dbreg_revoke_id_int
+ * PUBLIC: __P((ENV *, FNAME *, int, int, int32_t));
+ */
+int
+__dbreg_revoke_id_int(env, fnp, have_lock, push, force_id)
+ ENV *env;
+ FNAME *fnp;
+ int have_lock, push;
+ int32_t force_id;
+{
+ DB_LOG *dblp;
+ LOG *lp;
+ int32_t id;
+ int ret;
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ ret = 0;
+
+ /* If we lack an ID, this is a no-op. */
+ if (fnp == NULL)
+ return (0);
+
+ /*
+ * If we have a force_id, we had an error after allocating
+ * the id, and putting it on the fq list, but before we
+ * finished setting up fnp. So, if we have a force_id use it.
+ */
+ if (force_id != DB_LOGFILEID_INVALID)
+ id = force_id;
+ else if (fnp->id == DB_LOGFILEID_INVALID) {
+ if (fnp->old_id == DB_LOGFILEID_INVALID)
+ return (0);
+ id = fnp->old_id;
+ } else
+ id = fnp->id;
+ if (!have_lock)
+ MUTEX_LOCK(env, lp->mtx_filelist);
+
+ fnp->id = DB_LOGFILEID_INVALID;
+ fnp->old_id = DB_LOGFILEID_INVALID;
+
+ /* Remove the FNAME from the list of open files. */
+ SH_TAILQ_REMOVE(&lp->fq, fnp, q, __fname);
+
+ /*
+ * This FNAME may be for a DBP which is already closed. Its ID may
+ * still be in use by an aborting transaction. If not,
+ * remove this id from the dbentry table and push it onto the
+ * free list.
+ */
+ if ((ret = __dbreg_rem_dbentry(dblp, id)) == 0 && push)
+ ret = __dbreg_push_id(env, id);
+
+ if (!have_lock)
+ MUTEX_UNLOCK(env, lp->mtx_filelist);
+ return (ret);
+}
+
+/*
+ * __dbreg_close_id --
+ * Take a dbreg id away from a dbp that we're closing, and log
+ * the unregistry if the refcount goes to 0.
+ *
+ * PUBLIC: int __dbreg_close_id __P((DB *, DB_TXN *, u_int32_t));
+ */
+int
+__dbreg_close_id(dbp, txn, op)
+ DB *dbp;
+ DB_TXN *txn;
+ u_int32_t op;
+{
+ DB_LOG *dblp;
+ ENV *env;
+ FNAME *fnp;
+ LOG *lp;
+ int ret, t_ret;
+
+ env = dbp->env;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ fnp = dbp->log_filename;
+
+ /* If we lack an ID, this is a no-op. */
+ if (fnp == NULL)
+ return (0);
+
+ if (fnp->id == DB_LOGFILEID_INVALID) {
+ ret = __dbreg_revoke_id(dbp, 0, DB_LOGFILEID_INVALID);
+ goto done;
+ }
+
+ /*
+ * If we are the last reference to this db then we need to log it
+ * as closed. Otherwise the last transaction will do the logging.
+ * Remove the DBP from the db entry table since it can no longer
+ * be used. If we abort it will have to be reopened.
+ */
+ ret = 0;
+ DB_ASSERT(env, fnp->txn_ref > 0);
+ if (fnp->txn_ref > 1) {
+ MUTEX_LOCK(env, dbp->mutex);
+ if (fnp->txn_ref > 1) {
+ if ((t_ret = __dbreg_rem_dbentry(
+ env->lg_handle, fnp->id)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /*
+ * The DB handle has been closed in the logging system.
+ * Transactions may still have a ref to this name.
+ * Mark it so that if recovery reopens the file id
+ * the transaction will not close the wrong handle.
+ */
+ F_SET(fnp, DB_FNAME_CLOSED);
+ fnp->txn_ref--;
+ MUTEX_UNLOCK(env, dbp->mutex);
+ /* The mutex now lives only in the FNAME. */
+ dbp->mutex = MUTEX_INVALID;
+ dbp->log_filename = NULL;
+ goto no_log;
+ }
+ }
+ MUTEX_LOCK(env, lp->mtx_filelist);
+
+ if ((ret = __dbreg_log_close(env, fnp, txn, op)) != 0)
+ goto err;
+ ret = __dbreg_revoke_id(dbp, 1, DB_LOGFILEID_INVALID);
+
+err: MUTEX_UNLOCK(env, lp->mtx_filelist);
+
+done: if ((t_ret = __dbreg_teardown(dbp)) != 0 && ret == 0)
+ ret = t_ret;
+no_log:
+ return (ret);
+}
+
+/*
+ * __dbreg_close_id_int --
+ * Close down a dbreg id and log the unregistry. This is called only
+ * when a transaction has the last ref to the fname.
+ *
+ * PUBLIC: int __dbreg_close_id_int __P((ENV *, FNAME *, u_int32_t, int));
+ */
+int
+__dbreg_close_id_int(env, fnp, op, locked)
+ ENV *env;
+ FNAME *fnp;
+ u_int32_t op;
+ int locked;
+{
+ DB_LOG *dblp;
+ LOG *lp;
+ int ret, t_ret;
+
+ DB_ASSERT(env, fnp->txn_ref == 1);
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ if (fnp->id == DB_LOGFILEID_INVALID)
+ return (__dbreg_revoke_id_int(env,
+ fnp, locked, 1, DB_LOGFILEID_INVALID));
+
+ if (F_ISSET(fnp, DB_FNAME_RECOVER))
+ return (__dbreg_close_file(env, fnp));
+ /*
+ * If log_close fails then it will mark the name DB_FNAME_NOTLOGGED
+ * and the id must persist.
+ */
+ if (!locked)
+ MUTEX_LOCK(env, lp->mtx_filelist);
+ if ((ret = __dbreg_log_close(env, fnp, NULL, op)) != 0)
+ goto err;
+
+ ret = __dbreg_revoke_id_int(env, fnp, 1, 1, DB_LOGFILEID_INVALID);
+
+err: if (!locked)
+ MUTEX_UNLOCK(env, lp->mtx_filelist);
+
+ if ((t_ret = __dbreg_teardown_int(env, fnp)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __dbreg_failchk --
+ *
+ * Look for entries that belong to dead processes and either close them
+ * out or, if there are pending transactions, just remove the mutex which
+ * will get discarded later.
+ *
+ * PUBLIC: int __dbreg_failchk __P((ENV *));
+ */
+int
+__dbreg_failchk(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+ DB_LOG *dblp;
+ FNAME *fnp, *nnp;
+ LOG *lp;
+ int ret, t_ret;
+ char buf[DB_THREADID_STRLEN];
+ db_threadid_t unused;
+
+ if ((dblp = env->lg_handle) == NULL)
+ return (0);
+
+ DB_THREADID_INIT(unused);
+
+ lp = dblp->reginfo.primary;
+ dbenv = env->dbenv;
+ ret = 0;
+
+ MUTEX_LOCK(env, lp->mtx_filelist);
+ for (fnp = SH_TAILQ_FIRST(&lp->fq, __fname); fnp != NULL; fnp = nnp) {
+ nnp = SH_TAILQ_NEXT(fnp, q, __fname);
+ if (dbenv->is_alive(dbenv,
+ fnp->pid, unused, DB_MUTEX_PROCESS_ONLY))
+ continue;
+ MUTEX_LOCK(env, fnp->mutex);
+ __db_msg(env, DB_STR_A("1502",
+ "Freeing log information for process: %s, (ref %lu)",
+ "%s %lu"),
+ dbenv->thread_id_string(dbenv, fnp->pid, unused, buf),
+ (u_long)fnp->txn_ref);
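+ /*
+ * If transactions still reference the name (or it was already
+ * closed), just detach the dead process; otherwise close the
+ * id out here and now.
+ */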
+ if (fnp->txn_ref > 1 || F_ISSET(fnp, DB_FNAME_CLOSED)) {
+ if (!F_ISSET(fnp, DB_FNAME_CLOSED)) {
+ fnp->txn_ref--;
+ F_SET(fnp, DB_FNAME_CLOSED);
+ }
+ MUTEX_UNLOCK(env, fnp->mutex);
+ fnp->mutex = MUTEX_INVALID;
+ fnp->pid = 0;
+ } else {
+ F_SET(fnp, DB_FNAME_CLOSED);
+ if ((t_ret = __dbreg_close_id_int(env,
+ fnp, DBREG_CLOSE, 1)) && ret == 0)
+ ret = t_ret;
+ }
+ }
+
+ MUTEX_UNLOCK(env, lp->mtx_filelist);
+ return (ret);
+}
+
+/*
+ * __dbreg_log_close --
+ *
+ * Log a close of a database. Called when closing a file or when a
+ * replication client is becoming a master. That closes all the
+ * files it previously had open.
+ *
+ * Assumes the caller already holds the lp->mtx_filelist lock.
+ *
+ * PUBLIC: int __dbreg_log_close __P((ENV *, FNAME *,
+ * PUBLIC: DB_TXN *, u_int32_t));
+ */
+int
+__dbreg_log_close(env, fnp, txn, op)
+ ENV *env;
+ FNAME *fnp;
+ DB_TXN *txn;
+ u_int32_t op;
+{
+ DBT fid_dbt, r_name, *dbtp;
+ DB_LOG *dblp;
+ DB_LSN r_unused;
+ int ret;
+
+ dblp = env->lg_handle;
+ ret = 0;
+
+ if (fnp->fname_off == INVALID_ROFF)
+ dbtp = NULL;
+ else {
+ memset(&r_name, 0, sizeof(r_name));
+ r_name.data = R_ADDR(&dblp->reginfo, fnp->fname_off);
+ r_name.size = (u_int32_t)strlen((char *)r_name.data) + 1;
+ dbtp = &r_name;
+ }
+ memset(&fid_dbt, 0, sizeof(fid_dbt));
+ fid_dbt.data = fnp->ufid;
+ fid_dbt.size = DB_FILE_ID_LEN;
+ if ((ret = __dbreg_register_log(env, txn, &r_unused,
+ F_ISSET(fnp, DB_FNAME_DURABLE) ? 0 : DB_LOG_NOT_DURABLE,
+ op, dbtp, &fid_dbt, fnp->id,
+ fnp->s_type, fnp->meta_pgno, TXN_INVALID)) != 0) {
+ /*
+ * We are trying to close, but the log write failed.
+ * Unfortunately, close needs to plow forward, because
+ * the application can't do anything with the handle.
+ * Make the entry in the shared memory region so that
+ * when we close the environment, we know that this
+ * happened. Also, make sure we remove this from the
+ * per-process table, so that we don't try to close it
+ * later.
+ */
+ F_SET(fnp, DB_FNAME_NOTLOGGED);
+ (void)__dbreg_rem_dbentry(dblp, fnp->id);
+ }
+ return (ret);
+}
+
+/*
+ * __dbreg_push_id and __dbreg_pop_id --
+ * Dbreg ids from closed files are kept on a stack in shared memory
+ * for recycling. (We want to reuse them as much as possible because each
+ * process keeps open files in an array by ID.) Push them to the stack and
+ * pop them from it, managing memory as appropriate.
+ *
+ * The stack is protected by the mtx_filelist, and both functions assume it
+ * is already locked.
+ */
+static int
+__dbreg_push_id(env, id)
+ ENV *env;
+ int32_t id;
+{
+ DB_LOG *dblp;
+ LOG *lp;
+ REGINFO *infop;
+ int32_t *stack, *newstack;
+ int ret;
+
+ dblp = env->lg_handle;
+ infop = &dblp->reginfo;
+ lp = infop->primary;
+
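+ /*
+ * Recycling the highest id is free: just shrink fid_max instead
+ * of using a stack slot.
+ */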
+ if (id == lp->fid_max - 1) {
+ lp->fid_max--;
+ return (0);
+ }
+
+ /* Check if we have room on the stack. */
+ if (lp->free_fid_stack == INVALID_ROFF ||
+ lp->free_fids_alloced <= lp->free_fids + 1) {
+ LOG_SYSTEM_LOCK(env);
+ if ((ret = __env_alloc(infop,
+ (lp->free_fids_alloced + 20) * sizeof(u_int32_t),
+ &newstack)) != 0) {
+ LOG_SYSTEM_UNLOCK(env);
+ return (ret);
+ }
+
+ if (lp->free_fid_stack != INVALID_ROFF) {
+ stack = R_ADDR(infop, lp->free_fid_stack);
+ memcpy(newstack, stack,
+ lp->free_fids_alloced * sizeof(u_int32_t));
+ __env_alloc_free(infop, stack);
+ }
+ lp->free_fid_stack = R_OFFSET(infop, newstack);
+ lp->free_fids_alloced += 20;
+ LOG_SYSTEM_UNLOCK(env);
+ }
+
+ stack = R_ADDR(infop, lp->free_fid_stack);
+ stack[lp->free_fids++] = id;
+ return (0);
+}
+
+static int
+__dbreg_pop_id(env, id)
+ ENV *env;
+ int32_t *id;
+{
+ DB_LOG *dblp;
+ LOG *lp;
+ int32_t *stack;
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ /* Do we have anything to pop? */
+ if (lp->free_fid_stack != INVALID_ROFF && lp->free_fids > 0) {
+ stack = R_ADDR(&dblp->reginfo, lp->free_fid_stack);
+ *id = stack[--lp->free_fids];
+ } else
+ *id = DB_LOGFILEID_INVALID;
+
+ return (0);
+}
+
+/*
+ * __dbreg_pluck_id --
+ * Remove a particular dbreg id from the stack of free ids. This is
+ * used when we open a file, as in recovery, with a specific ID that might
+ * be on the stack.
+ *
+ * Returns success whether or not the particular id was found, and like
+ * push and pop, assumes that the mtx_filelist is locked.
+ */
+static int
+__dbreg_pluck_id(env, id)
+ ENV *env;
+ int32_t id;
+{
+ DB_LOG *dblp;
+ LOG *lp;
+ int32_t *stack;
+ u_int i;
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ if (id >= lp->fid_max)
+ return (0);
+
+ /* Do we have anything to look at? */
+ if (lp->free_fid_stack != INVALID_ROFF) {
+ stack = R_ADDR(&dblp->reginfo, lp->free_fid_stack);
+ for (i = 0; i < lp->free_fids; i++)
+ if (id == stack[i]) {
+ /*
+ * Found it. Overwrite it with the top
+ * id (which may harmlessly be itself),
+ * and shorten the stack by one.
+ */
+ stack[i] = stack[lp->free_fids - 1];
+ lp->free_fids--;
+ return (0);
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * __dbreg_log_id --
+ * Used for in-memory named files. They are created in mpool and
+ * are given ids early in the open process so that we can read and
+ * create pages in the mpool for the files. However, at the time that
+ * the mpf is created, the file may not be fully created and/or its
+ * meta-data may not be fully known, so we can't do a full dbregister.
+ * This is a routine exported that will log a complete dbregister
+ * record that will allow for both recovery and replication.
+ *
+ * PUBLIC: int __dbreg_log_id __P((DB *, DB_TXN *, int32_t, int));
+ */
+int
+__dbreg_log_id(dbp, txn, id, needlock)
+ DB *dbp;
+ DB_TXN *txn;
+ int32_t id;
+ int needlock;
+{
+ DBT fid_dbt, r_name;
+ DB_LOG *dblp;
+ DB_LSN unused;
+ ENV *env;
+ FNAME *fnp;
+ LOG *lp;
+ u_int32_t op;
+ int i, ret;
+
+ env = dbp->env;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ fnp = dbp->log_filename;
+
+ /*
+ * Verify that the fnp has been initialized, by seeing if it
+ * has any non-zero bytes in it.
+ */
+ for (i = 0; i < DB_FILE_ID_LEN; i++)
+ if (fnp->ufid[i] != 0)
+ break;
+ if (i == DB_FILE_ID_LEN)
+ memcpy(fnp->ufid, dbp->fileid, DB_FILE_ID_LEN);
+
+ if (fnp->s_type == DB_UNKNOWN)
+ fnp->s_type = dbp->type;
+
+ /*
+ * Log the registry. We should only request a new ID in situations
+ * where logging is reasonable.
+ */
+ memset(&fid_dbt, 0, sizeof(fid_dbt));
+ memset(&r_name, 0, sizeof(r_name));
+
+ if (needlock)
+ MUTEX_LOCK(env, lp->mtx_filelist);
+
+ if (fnp->fname_off != INVALID_ROFF) {
+ r_name.data = R_ADDR(&dblp->reginfo, fnp->fname_off);
+ r_name.size = (u_int32_t)strlen((char *)r_name.data) + 1;
+ }
+
+ fid_dbt.data = dbp->fileid;
+ fid_dbt.size = DB_FILE_ID_LEN;
+
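+ /*
+ * Choose the opcode: PREOPEN while the handle is not fully open,
+ * REOPEN for in-memory databases, OPEN otherwise; the X variants
+ * mark exclusive (DB2_AM_EXCL) handles.
+ */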
+ op = !F_ISSET(dbp, DB_AM_OPEN_CALLED) ? DBREG_PREOPEN :
+ (F_ISSET(dbp, DB_AM_INMEM) ?
+ (F2_ISSET(dbp, DB2_AM_EXCL) ? DBREG_XREOPEN : DBREG_REOPEN):
+ (F2_ISSET(dbp, DB2_AM_EXCL) ? DBREG_XOPEN : DBREG_OPEN));
+ ret = __dbreg_register_log(env, txn, &unused,
+ F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0,
+ op | F_ISSET(fnp, DB_FNAME_DBREG_MASK),
+ r_name.size == 0 ? NULL : &r_name, &fid_dbt, id,
+ fnp->s_type, fnp->meta_pgno, fnp->create_txnid);
+
+ if (needlock)
+ MUTEX_UNLOCK(env, lp->mtx_filelist);
+
+ return (ret);
+}
diff --git a/src/dbreg/dbreg.src b/src/dbreg/dbreg.src
new file mode 100644
index 00000000..c7740d63
--- /dev/null
+++ b/src/dbreg/dbreg.src
@@ -0,0 +1,37 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+DBPRIVATE
+PREFIX __dbreg
+
+INCLUDE #include "db_int.h"
+INCLUDE #include "dbinc/crypto.h"
+INCLUDE #include "dbinc/db_page.h"
+INCLUDE #include "dbinc/db_dispatch.h"
+INCLUDE #include "dbinc/db_am.h"
+INCLUDE #include "dbinc/txn.h"
+INCLUDE
+
+/*
+ * Used for registering name/id translations at open or close.
+ * opcode: register or unregister
+ * name: file name
+ * uid: unique file id
+ * fileid: dbreg id assigned to the handle
+ * ftype: database type
+ * meta_pgno: meta-data page number
+ * id: transaction id of the subtransaction that created the fs object
+ */
+BEGIN register 42 2
+DBOP opcode u_int32_t lu
+DBT name DBT s
+DBT uid DBT s
+ARG fileid int32_t ld
+ARG ftype DBTYPE lx
+ARG meta_pgno db_pgno_t lu
+ARG id u_int32_t lx
+END
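+
+/*
+ * gen_rec.awk expands the description above into the __dbreg_register
+ * log/read/print plumbing in dbreg_auto.c and dbreg_autop.c (see their
+ * "automatically built" banners) and a matching header under dbinc_auto/.
+ */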
diff --git a/src/dbreg/dbreg_auto.c b/src/dbreg/dbreg_auto.c
new file mode 100644
index 00000000..a26e5527
--- /dev/null
+++ b/src/dbreg/dbreg_auto.c
@@ -0,0 +1,35 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_dispatch.h"
+#include "dbinc/db_am.h"
+#include "dbinc/txn.h"
+
+DB_LOG_RECSPEC __dbreg_register_desc[] = {
+ {LOGREC_DBOP, SSZ(__dbreg_register_args, opcode), "opcode", ""},
+ {LOGREC_DBT, SSZ(__dbreg_register_args, name), "name", ""},
+ {LOGREC_DBT, SSZ(__dbreg_register_args, uid), "uid", ""},
+ {LOGREC_ARG, SSZ(__dbreg_register_args, fileid), "fileid", "%ld"},
+ {LOGREC_ARG, SSZ(__dbreg_register_args, ftype), "ftype", "%lx"},
+ {LOGREC_ARG, SSZ(__dbreg_register_args, meta_pgno), "meta_pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__dbreg_register_args, id), "id", "%lx"},
+ {LOGREC_Done, 0, "", ""}
+};
+/*
+ * PUBLIC: int __dbreg_init_recover __P((ENV *, DB_DISTAB *));
+ */
+int
+__dbreg_init_recover(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __dbreg_register_recover, DB___dbreg_register)) != 0)
+ return (ret);
+ return (0);
+}
diff --git a/src/dbreg/dbreg_autop.c b/src/dbreg/dbreg_autop.c
new file mode 100644
index 00000000..ea43addd
--- /dev/null
+++ b/src/dbreg/dbreg_autop.c
@@ -0,0 +1,43 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_dispatch.h"
+#include "dbinc/db_am.h"
+#include "dbinc/txn.h"
+
+/*
+ * PUBLIC: int __dbreg_register_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__dbreg_register_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__dbreg_register", __dbreg_register_desc, info));
+}
+
+/*
+ * PUBLIC: int __dbreg_init_print __P((ENV *, DB_DISTAB *));
+ */
+int
+__dbreg_init_print(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __dbreg_register_print, DB___dbreg_register)) != 0)
+ return (ret);
+ return (0);
+}
diff --git a/src/dbreg/dbreg_rec.c b/src/dbreg/dbreg_rec.c
new file mode 100644
index 00000000..1b387bb7
--- /dev/null
+++ b/src/dbreg/dbreg_rec.c
@@ -0,0 +1,399 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1995, 1996
+ * The President and Fellows of Harvard University. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/txn.h"
+
+static int __dbreg_open_file __P((ENV *,
+ DB_TXN *, __dbreg_register_args *, void *));
+/*
+ * PUBLIC: int __dbreg_register_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__dbreg_register_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __dbreg_register_args *argp;
+ DB_ENTRY *dbe;
+ DB_LOG *dblp;
+ DB *dbp;
+ u_int32_t opcode, status;
+ int do_close, do_open, do_rem, ret, t_ret;
+
+ dblp = env->lg_handle;
+ dbp = NULL;
+
+#ifdef DEBUG_RECOVER
+ REC_PRINT(__dbreg_register_print);
+#endif
+ do_open = do_close = 0;
+ if ((ret = __dbreg_register_read(env, dbtp->data, &argp)) != 0)
+ goto out;
+
+ opcode = FLD_ISSET(argp->opcode, DBREG_OP_MASK);
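+ /*
+ * A summary of the dispatch below: the *OPEN opcodes open the
+ * file on redo and on the OPENFILES/POPENFILES passes and close
+ * it on undo (except that in-memory reopens are never closed
+ * here); CLOSE and RCLOSE do the reverse; the checkpoint opcodes
+ * only ever open.
+ */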
+ switch (opcode) {
+ case DBREG_OPEN:
+ case DBREG_PREOPEN:
+ case DBREG_REOPEN:
+ case DBREG_XOPEN:
+ case DBREG_XREOPEN:
+ /*
+ * In general, we redo the open on REDO and abort on UNDO.
+ * However, a reopen is a second instance of an open of
+ * in-memory files and we don't want to close them yet
+ * on abort, so just skip that here.
+ */
+ if ((DB_REDO(op) ||
+ op == DB_TXN_OPENFILES || op == DB_TXN_POPENFILES))
+ do_open = 1;
+ else if (opcode != DBREG_REOPEN && opcode != DBREG_XREOPEN)
+ do_close = 1;
+ break;
+ case DBREG_CLOSE:
+ if (DB_UNDO(op))
+ do_open = 1;
+ else
+ do_close = 1;
+ break;
+ case DBREG_RCLOSE:
+ /*
+ * DBREG_RCLOSE was generated by recover because a file was
+ * left open. The POPENFILES pass, which is run to open
+ * files to abort prepared transactions, may not include the
+ * open for this file so we open it here. Note that a normal
+ * CLOSE is not legal before the prepared transaction is
+ * committed or aborted.
+ */
+ if (DB_UNDO(op) || op == DB_TXN_POPENFILES)
+ do_open = 1;
+ else
+ do_close = 1;
+ break;
+ case DBREG_CHKPNT:
+ case DBREG_XCHKPNT:
+ if (DB_UNDO(op) ||
+ op == DB_TXN_OPENFILES || op == DB_TXN_POPENFILES)
+ do_open = 1;
+ break;
+ default:
+ ret = __db_unknown_path(env, "__dbreg_register_recover");
+ goto out;
+ }
+
+ if (do_open) {
+ /*
+ * We must open the db even if the meta page is not
+ * yet written, as we may be creating a subdatabase.
+ */
+ if (op == DB_TXN_OPENFILES && opcode != DBREG_CHKPNT
+ && opcode != DBREG_XCHKPNT)
+ F_SET(dblp, DBLOG_FORCE_OPEN);
+
+ /*
+ * During an abort or an open pass to recover prepared txns,
+ * we need to make sure that we use the same locker id on the
+ * open. We pass the txnid along to ensure this.
+ */
+ ret = __dbreg_open_file(env,
+ op == DB_TXN_ABORT || op == DB_TXN_POPENFILES ?
+ argp->txnp : NULL, argp, info);
+ if (ret == DB_PAGE_NOTFOUND && argp->meta_pgno != PGNO_BASE_MD)
+ ret = ENOENT;
+ if (ret == ENOENT || ret == EINVAL) {
+ /*
+ * If this is an OPEN while rolling forward, it's
+ * possible that the file was recreated since last
+ * time we got here. In that case, we've got deleted
+ * set and probably shouldn't, so we need to check
+ * for that case and possibly retry.
+ */
+ if (DB_REDO(op) && argp->txnp != 0 &&
+ dblp->dbentry[argp->fileid].deleted) {
+ dblp->dbentry[argp->fileid].deleted = 0;
+ ret =
+ __dbreg_open_file(env, NULL, argp, info);
+ if (ret == DB_PAGE_NOTFOUND &&
+ argp->meta_pgno != PGNO_BASE_MD)
+ ret = ENOENT;
+ }
+ /*
+ * We treat ENOENT as OK since it's possible that
+ * the file was renamed or deleted.
+ * All other errors, we return.
+ */
+ if (ret == ENOENT)
+ ret = 0;
+ }
+ F_CLR(dblp, DBLOG_FORCE_OPEN);
+ }
+
+ if (do_close) {
+ /*
+ * If we are undoing an open, or redoing a close,
+ * then we need to close the file. If we are simply
+ * revoking then we just need to grab the DBP and revoke
+ * the log id.
+ *
+ * If the file is deleted, then we can just ignore this close.
+ * Otherwise, we should usually have a valid dbp we should
+ * close or whose reference count should be decremented.
+ * However, if we shut down without closing a file, we may, in
+ * fact, not have the file open, and that's OK.
+ */
+ do_rem = 0;
+ MUTEX_LOCK(env, dblp->mtx_dbreg);
+ if (argp->fileid < dblp->dbentry_cnt) {
+ /*
+ * Typically, closes should match an open which means
+ * that if this is a close, there should be a valid
+ * entry in the dbentry table when we get here,
+ * however there are exceptions. 1. If this is an
+ * OPENFILES pass, then we may have started from
+ * a log file other than the first, and the
+ * corresponding open appears in an earlier file.
+ * 2. If we are undoing an open on an abort or
+ * recovery, it's possible that we failed after
+ * the log record, but before we actually entered
+ * a handle here.
+ * 3. If we aborted an open, then we wrote a non-txnal
+ * RCLOSE into the log. During the forward pass, the
+ * file won't be open, and that's OK.
+ */
+ dbe = &dblp->dbentry[argp->fileid];
+ if (dbe->dbp == NULL && !dbe->deleted) {
+ /* No valid entry here. Nothing to do. */
+ MUTEX_UNLOCK(env, dblp->mtx_dbreg);
+ goto done;
+ }
+
+ /* We have either an open entry or a deleted entry. */
+ if ((dbp = dbe->dbp) != NULL) {
+ /*
+ * If we're a replication client, it's
+ * possible to get here with a dbp that
+ * the user opened, but which we later
+ * assigned a fileid to. Be sure that
+ * we only close dbps that we opened in
+ * the recovery code or that were opened
+ * inside a currently aborting transaction
+ * but not by the recovery code.
+ */
+ do_rem = (F_ISSET(dbp, DB_AM_RECOVER) ||
+ F2_ISSET(dbp, DB2_AM_EXCL)) ?
+ op != DB_TXN_ABORT :
+ op == DB_TXN_ABORT;
+ MUTEX_UNLOCK(env, dblp->mtx_dbreg);
+ } else if (dbe->deleted) {
+ MUTEX_UNLOCK(env, dblp->mtx_dbreg);
+ if ((ret = __dbreg_rem_dbentry(
+ dblp, argp->fileid)) != 0)
+ goto out;
+ }
+ } else
+ MUTEX_UNLOCK(env, dblp->mtx_dbreg);
+
+ /*
+ * During recovery, all files are closed. On an abort, we only
+ * close the file if we opened it during the abort
+ * (DB_AM_RECOVER set), otherwise we simply do a __db_refresh.
+ * For the close case, if remove or rename has closed the file,
+ * don't request a sync, because a NULL mpf would be a problem.
+ *
+ * If we are undoing a create we'd better discard any buffers
+ * from the memory pool. We identify creates because the
+ * argp->id field contains the transaction containing the file
+ * create; if that id is invalid, we are not creating.
+ *
+ * On the backward pass, we need to "undo" opens even if the
+ * transaction in which they appeared committed, because we have
+ * already undone the corresponding close. In that case, the
+ * id will be valid, but we do not want to discard buffers.
+ */
+ if (do_rem && dbp != NULL) {
+ if (argp->id != TXN_INVALID) {
+ if ((ret = __db_txnlist_find(env,
+ info, argp->txnp->txnid, &status))
+ != DB_NOTFOUND && ret != 0)
+ goto out;
+ if (ret == DB_NOTFOUND || status != TXN_COMMIT)
+ F_SET(dbp, DB_AM_DISCARD);
+ ret = 0;
+ }
+
+ if (op == DB_TXN_ABORT) {
+ if ((t_ret = __db_refresh(dbp,
+ NULL, DB_NOSYNC, NULL, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ } else {
+ if ((t_ret = __db_close(
+ dbp, NULL, DB_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ }
+ }
+done: if (ret == 0)
+ *lsnp = argp->prev_lsn;
+out: if (argp != NULL)
+ __os_free(env, argp);
+ return (ret);
+}
+
+/*
+ * __dbreg_open_file --
+ * Called during log_register recovery. Make sure that we have an
+ * entry in the dbentry table for this ndx. Returns 0 on success,
+ * non-zero on error.
+ */
+static int
+__dbreg_open_file(env, txn, argp, info)
+ ENV *env;
+ DB_TXN *txn;
+ __dbreg_register_args *argp;
+ void *info;
+{
+ DB *dbp;
+ DB_ENTRY *dbe;
+ DB_LOG *dblp;
+ u_int32_t id, opcode, status;
+ int ret;
+
+ dblp = env->lg_handle;
+ opcode = FLD_ISSET(argp->opcode, DBREG_OP_MASK);
+
+ /*
+ * When we're opening, we have to check that the name we are opening
+ * is what we expect. If it's not, then we close the old file and
+ * open the new one.
+ */
+ MUTEX_LOCK(env, dblp->mtx_dbreg);
+ if (argp->fileid != DB_LOGFILEID_INVALID &&
+ argp->fileid < dblp->dbentry_cnt)
+ dbe = &dblp->dbentry[argp->fileid];
+ else
+ dbe = NULL;
+
+ if (dbe != NULL) {
+ if (dbe->deleted) {
+ MUTEX_UNLOCK(env, dblp->mtx_dbreg);
+ return (ENOENT);
+ }
+
+ /*
+ * At the end of OPENFILES, we may have a file open. If this
+ * is a reopen, then we will always close and reopen. If the
+ * open was part of a committed transaction, it doesn't
+ * get undone. However, if the fileid was previously used,
+ * we'll see a close that may need to get undone. There are
+ * three ways we can detect this. 1) the meta-pgno in the
+ * current file does not match that of the open file, 2) the
+ * file uid of the current file does not match that of the
+ * previously opened file, 3) the current file is unnamed, in
+ * which case it should never be opened during recovery.
+ * It is also possible that the db open previously failed
+ * because the file was missing. Check the DB_AM_OPEN_CALLED
+ * bit and try to open it again.
+ */
+ if ((dbp = dbe->dbp) != NULL) {
+ if (opcode == DBREG_REOPEN ||
+ opcode == DBREG_XREOPEN ||
+ !F_ISSET(dbp, DB_AM_OPEN_CALLED) ||
+ dbp->meta_pgno != argp->meta_pgno ||
+ argp->name.size == 0 ||
+ memcmp(dbp->fileid, argp->uid.data,
+ DB_FILE_ID_LEN) != 0) {
+ MUTEX_UNLOCK(env, dblp->mtx_dbreg);
+ (void)__dbreg_revoke_id(dbp, 0,
+ DB_LOGFILEID_INVALID);
+ if (F_ISSET(dbp, DB_AM_RECOVER))
+ (void)__db_close(dbp, NULL, DB_NOSYNC);
+ goto reopen;
+ }
+
+ DB_ASSERT(env, dbe->dbp == dbp);
+ MUTEX_UNLOCK(env, dblp->mtx_dbreg);
+
+ /*
+ * This is a successful open. We need to record that
+ * in the txnlist so that we know how to handle the
+ * subtransaction that created the file system object.
+ */
+ if (argp != NULL && argp->id != TXN_INVALID &&
+ (ret = __db_txnlist_update(env, info,
+ argp->id, TXN_EXPECTED, NULL, &status, 1)) != 0)
+ return (ret);
+ return (0);
+ }
+ }
+
+ MUTEX_UNLOCK(env, dblp->mtx_dbreg);
+
+reopen:
+ /*
+ * We never re-open temporary files. Temp files are only useful during
+ * aborts in which case the dbp was entered when the file was
+ * registered. During recovery, we treat temp files as properly deleted
+ * files, allowing the open to fail and not reporting any errors when
+ * recovery fails to get a valid dbp from __dbreg_id_to_db.
+ */
+ if (argp->name.size == 0) {
+ (void)__dbreg_add_dbentry(env, dblp, NULL, argp->fileid);
+ return (ENOENT);
+ }
+
+ /*
+ * We are about to pass a recovery txn pointer into the main library.
+ * We need to make sure that any accessed fields are set appropriately.
+ */
+ if (txn != NULL) {
+ id = txn->txnid;
+ memset(txn, 0, sizeof(DB_TXN));
+ txn->txnid = id;
+ txn->mgrp = env->tx_handle;
+ }
+
+ return (__dbreg_do_open(env,
+ txn, dblp, argp->uid.data, argp->name.data, argp->ftype,
+ argp->fileid, argp->meta_pgno, info, argp->id, opcode));
+}
diff --git a/src/dbreg/dbreg_stat.c b/src/dbreg/dbreg_stat.c
new file mode 100644
index 00000000..6dfb3869
--- /dev/null
+++ b/src/dbreg/dbreg_stat.c
@@ -0,0 +1,140 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/txn.h"
+
+#ifdef HAVE_STATISTICS
+static int __dbreg_print_all __P((ENV *, u_int32_t));
+
+/*
+ * __dbreg_stat_print --
+ * Print the dbreg statistics.
+ *
+ * PUBLIC: int __dbreg_stat_print __P((ENV *, u_int32_t));
+ */
+int
+__dbreg_stat_print(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ int ret;
+
+ if (LF_ISSET(DB_STAT_ALL) &&
+ (ret = __dbreg_print_all(env, flags)) != 0)
+ return (ret);
+
+ return (0);
+}
+
+/*
+ * __dbreg_print_fname --
+ * Display the contents of an FNAME structure.
+ *
+ * PUBLIC: void __dbreg_print_fname __P((ENV *, FNAME *));
+ */
+void
+__dbreg_print_fname(env, fnp)
+ ENV *env;
+ FNAME *fnp;
+{
+ static const FN fn[] = {
+ { DB_FNAME_DURABLE, "DB_FNAME_DURABLE" },
+ { DB_FNAME_NOTLOGGED, "DB_FNAME_NOTLOGGED" },
+ { DB_FNAME_CLOSED, "DB_FNAME_CLOSED" },
+ { DB_FNAME_RECOVER, "DB_FNAME_RECOVER" },
+ { 0, NULL }
+ };
+
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "DB handle FNAME contents:");
+ STAT_LONG("log ID", fnp->id);
+ STAT_ULONG("Meta pgno", fnp->meta_pgno);
+ __db_print_fileid(env, fnp->ufid, "\tFile ID");
+ STAT_ULONG("create txn", fnp->create_txnid);
+ STAT_ULONG("refcount", fnp->txn_ref);
+ __db_prflags(env, NULL, fnp->flags, fn, NULL, "\tFlags");
+}
+
+/*
+ * __dbreg_print_all --
+ * Display the ENV's list of files.
+ */
+static int
+__dbreg_print_all(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_LOG *dblp;
+ FNAME *fnp;
+ LOG *lp;
+ int32_t *stack;
+ int del, first;
+ u_int32_t i;
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ __db_msg(env, "LOG FNAME list:");
+ __mutex_print_debug_single(
+ env, "File name mutex", lp->mtx_filelist, flags);
+
+ STAT_LONG("Fid max", lp->fid_max);
+ STAT_LONG("Log buffer size", lp->buffer_size);
+
+ MUTEX_LOCK(env, lp->mtx_filelist);
+ first = 1;
+ SH_TAILQ_FOREACH(fnp, &lp->fq, q, __fname) {
+ if (first) {
+ first = 0;
+ __db_msg(env,
+ "ID\tName\t\tType\tPgno\tPid\tTxnid\tFlags\tRef\tDBP-info");
+ }
+ dbp = fnp->id >= dblp->dbentry_cnt ? NULL :
+ dblp->dbentry[fnp->id].dbp;
+ del = fnp->id >= dblp->dbentry_cnt ? 0 :
+ dblp->dbentry[fnp->id].deleted;
+ __db_msg(env,
+ "%ld\t%-8s%s%-8s%s\t%lu\t%lu\t%lx\t%lx\t%lx\t%s",
+ (long)fnp->id,
+ fnp->fname_off == INVALID_ROFF ?
+ "" : (char *)R_ADDR(&dblp->reginfo, fnp->fname_off),
+ fnp->dname_off == INVALID_ROFF ? "" : ":",
+ fnp->dname_off == INVALID_ROFF ?
+ "" : (char *)R_ADDR(&dblp->reginfo, fnp->dname_off),
+ __db_dbtype_to_string(fnp->s_type),
+ (u_long)fnp->meta_pgno, (u_long)fnp->pid,
+ (u_long)fnp->create_txnid, (u_long)fnp->flags,
+ (u_long)fnp->txn_ref,
+ dbp == NULL ? "No DBP" : "DBP");
+ if (dbp != NULL)
+ __db_msg(env, " (%d %lx %lx)", del, P_TO_ULONG(dbp),
+ (u_long)(dbp == NULL ? 0 : dbp->flags));
+ }
+ MUTEX_UNLOCK(env, lp->mtx_filelist);
+
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "LOG region list of free IDs.");
+ if (lp->free_fid_stack == INVALID_ROFF)
+ __db_msg(env, "Free id stack is empty.");
+ else {
+ STAT_ULONG("Free id array size", lp->free_fids_alloced);
+ STAT_ULONG("Number of ids on the free stack", lp->free_fids);
+ stack = R_ADDR(&dblp->reginfo, lp->free_fid_stack);
+ for (i = 0; i < lp->free_fids; i++)
+ STAT_LONG("fid", stack[i]);
+ }
+
+ return (0);
+}
+#endif
diff --git a/src/dbreg/dbreg_util.c b/src/dbreg/dbreg_util.c
new file mode 100644
index 00000000..80de4d91
--- /dev/null
+++ b/src/dbreg/dbreg_util.c
@@ -0,0 +1,847 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/fop.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+static int __dbreg_check_master __P((ENV *, u_int8_t *, char *));
+
+/*
+ * __dbreg_add_dbentry --
+ * Adds a DB entry to the dbreg DB entry table.
+ *
+ * PUBLIC: int __dbreg_add_dbentry __P((ENV *, DB_LOG *, DB *, int32_t));
+ */
+int
+__dbreg_add_dbentry(env, dblp, dbp, ndx)
+ ENV *env;
+ DB_LOG *dblp;
+ DB *dbp;
+ int32_t ndx;
+{
+ int32_t i;
+ int ret;
+
+ ret = 0;
+
+ MUTEX_LOCK(env, dblp->mtx_dbreg);
+
+ /*
+ * Check if we need to grow the table. Note, ndx is 0-based (the
+	 * index into the DB entry table) and dbentry_cnt is 1-based, the
+ * number of available slots.
+ */
+ if (dblp->dbentry_cnt <= ndx) {
+ if ((ret = __os_realloc(env,
+ (size_t)(ndx + DB_GROW_SIZE) * sizeof(DB_ENTRY),
+ &dblp->dbentry)) != 0)
+ goto err;
+
+ /* Initialize the new entries. */
+ for (i = dblp->dbentry_cnt; i < ndx + DB_GROW_SIZE; i++) {
+ dblp->dbentry[i].dbp = NULL;
+ dblp->dbentry[i].deleted = 0;
+ }
+ dblp->dbentry_cnt = i;
+ }
+
+ DB_ASSERT(env, dblp->dbentry[ndx].dbp == NULL);
+ dblp->dbentry[ndx].deleted = dbp == NULL;
+ dblp->dbentry[ndx].dbp = dbp;
+
+err: MUTEX_UNLOCK(env, dblp->mtx_dbreg);
+ return (ret);
+}
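+
+/*
+ * Illustrative example of the growth arithmetic above (hypothetical
+ * numbers): if dbentry_cnt is 10 and a log record assigns ndx 12, the
+ * table is reallocated to hold (12 + DB_GROW_SIZE) slots, slots 10
+ * through (12 + DB_GROW_SIZE - 1) are cleared, and dbentry_cnt becomes
+ * 12 + DB_GROW_SIZE. Growing past the requested index amortizes the
+ * reallocation cost over future id assignments.
+ */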
+
+/*
+ * __dbreg_rem_dbentry
+ * Remove an entry from the DB entry table.
+ *
+ * PUBLIC: int __dbreg_rem_dbentry __P((DB_LOG *, int32_t));
+ */
+int
+__dbreg_rem_dbentry(dblp, ndx)
+ DB_LOG *dblp;
+ int32_t ndx;
+{
+ MUTEX_LOCK(dblp->env, dblp->mtx_dbreg);
+ if (dblp->dbentry_cnt > ndx) {
+ dblp->dbentry[ndx].dbp = NULL;
+ dblp->dbentry[ndx].deleted = 0;
+ }
+ MUTEX_UNLOCK(dblp->env, dblp->mtx_dbreg);
+
+ return (0);
+}
+
+/*
+ * __dbreg_log_files --
+ * Put a DBREG_CHKPNT/CLOSE log record for each open database.
+ *
+ * PUBLIC: int __dbreg_log_files __P((ENV *, u_int32_t));
+ */
+int
+__dbreg_log_files(env, opcode)
+ ENV *env;
+ u_int32_t opcode;
+{
+ DBT *dbtp, fid_dbt, t;
+ DB_LOG *dblp;
+ DB_LSN r_unused;
+ FNAME *fnp;
+ LOG *lp;
+ u_int32_t lopcode;
+ int ret;
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ ret = 0;
+
+ MUTEX_LOCK(env, lp->mtx_filelist);
+
+ SH_TAILQ_FOREACH(fnp, &lp->fq, q, __fname) {
+ /* This id was revoked by a switch in replication master. */
+ if (fnp->id == DB_LOGFILEID_INVALID)
+ continue;
+ if (fnp->fname_off == INVALID_ROFF)
+ dbtp = NULL;
+ else {
+ memset(&t, 0, sizeof(t));
+ t.data = R_ADDR(&dblp->reginfo, fnp->fname_off);
+ t.size = (u_int32_t)strlen(t.data) + 1;
+ dbtp = &t;
+ }
+ memset(&fid_dbt, 0, sizeof(fid_dbt));
+ fid_dbt.data = fnp->ufid;
+ fid_dbt.size = DB_FILE_ID_LEN;
+ /*
+ * Output DBREG_CHKPNT records which will be processed during
+ * the OPENFILES pass of recovery. At the end of recovery we
+ * want to output the files that were open so a future recovery
+ * run will have the correct files open during a backward pass.
+ * For this we output DBREG_RCLOSE records so the files will be
+ * closed on the forward pass.
+ */
+ lopcode = opcode;
+		if (opcode == DBREG_CHKPNT && F_ISSET(fnp, DBREG_EXCL))
+ lopcode = DBREG_XCHKPNT;
+ if ((ret = __dbreg_register_log(env, NULL, &r_unused,
+ F_ISSET(fnp, DB_FNAME_DURABLE) ? 0 : DB_LOG_NOT_DURABLE,
+ lopcode | F_ISSET(fnp, DB_FNAME_DBREG_MASK),
+ dbtp, &fid_dbt, fnp->id, fnp->s_type, fnp->meta_pgno,
+ TXN_INVALID)) != 0)
+ break;
+ }
+
+ MUTEX_UNLOCK(env, lp->mtx_filelist);
+
+ return (ret);
+}
+
+/*
+ * __dbreg_log_nofiles --
+ *	Return non-zero if no files are currently registered in the log.
+ *
+ * PUBLIC: int __dbreg_log_nofiles __P((ENV *));
+ */
+int
+__dbreg_log_nofiles(env)
+ ENV *env;
+{
+ DB_LOG *dblp;
+ LOG *lp;
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ return (SH_TAILQ_EMPTY(&lp->fq));
+}
+
+/*
+ * __dbreg_close_files --
+ * Remove the id's of open files and actually close those
+ * files that were opened by the recovery daemon. We sync the
+ * file, unless its mpf pointer has been NULLed by a db_remove or
+ * db_rename. We may not have flushed the log_register record that
+ * closes the file.
+ *
+ * PUBLIC: int __dbreg_close_files __P((ENV *, int));
+ */
+int
+__dbreg_close_files(env, do_restored)
+ ENV *env;
+ int do_restored;
+{
+ DB *dbp;
+ DB_LOG *dblp;
+ int ret, t_ret;
+ int32_t i;
+
+ /* If we haven't initialized logging, we have nothing to do. */
+ if (!LOGGING_ON(env))
+ return (0);
+
+ dblp = env->lg_handle;
+ ret = 0;
+
+ MUTEX_LOCK(env, dblp->mtx_dbreg);
+ for (i = 0; i < dblp->dbentry_cnt; i++) {
+ /*
+ * We only want to close dbps that recovery opened. Any
+ * dbps that weren't opened by recovery but show up here
+ * are about to be unconditionally removed from the table.
+ * Before doing so, we need to revoke their log fileids
+ * so that we don't end up leaving around FNAME entries
+ * for dbps that shouldn't have them.
+ */
+ if ((dbp = dblp->dbentry[i].dbp) != NULL) {
+ /*
+ * It's unsafe to call DB->close or revoke_id
+ * while holding the thread lock, because
+ * we'll call __dbreg_rem_dbentry and grab it again.
+ *
+ * Just drop it. Since dbreg ids go monotonically
+ * upward, concurrent opens should be safe, and the
+ * user should have no business closing files while
+ * we're in this loop anyway--we're in the process of
+ * making all outstanding dbps invalid.
+ */
+ /*
+ * If we only want to close those FNAMES marked
+ * as restored, check now.
+ */
+ if (do_restored &&
+ !F_ISSET(dbp->log_filename, DB_FNAME_RESTORED))
+ continue;
+ MUTEX_UNLOCK(env, dblp->mtx_dbreg);
+ if (F_ISSET(dbp, DB_AM_RECOVER))
+ t_ret = __db_close(dbp,
+ NULL, dbp->mpf == NULL ? DB_NOSYNC : 0);
+ else
+ t_ret = __dbreg_revoke_id(
+ dbp, 0, DB_LOGFILEID_INVALID);
+ if (ret == 0)
+ ret = t_ret;
+ MUTEX_LOCK(env, dblp->mtx_dbreg);
+ }
+
+ dblp->dbentry[i].deleted = 0;
+ dblp->dbentry[i].dbp = NULL;
+ }
+ MUTEX_UNLOCK(env, dblp->mtx_dbreg);
+ return (ret);
+}
+
+/*
+ * __dbreg_close_file --
+ * Close a database file opened by recovery.
+ * PUBLIC: int __dbreg_close_file __P((ENV *, FNAME *));
+ */
+int
+__dbreg_close_file(env, fnp)
+ ENV *env;
+ FNAME *fnp;
+{
+ DB *dbp;
+ DB_LOG *dblp;
+
+ dblp = env->lg_handle;
+
+ dbp = dblp->dbentry[fnp->id].dbp;
+ if (dbp == NULL)
+ return (0);
+ DB_ASSERT(env, dbp->log_filename == fnp);
+ DB_ASSERT(env, F_ISSET(dbp, DB_AM_RECOVER));
+ return (__db_close(dbp, NULL, DB_NOSYNC));
+}
+
+/*
+ * __dbreg_mark_restored --
+ * Mark files when we change replication roles and there are outstanding
+ * prepared txns that may use these files. These will be invalidated later
+ * when all outstanding prepared txns are resolved.
+ *
+ * PUBLIC: int __dbreg_mark_restored __P((ENV *));
+ */
+int
+__dbreg_mark_restored(env)
+ ENV *env;
+{
+ DB_LOG *dblp;
+ FNAME *fnp;
+ LOG *lp;
+
+ /* If we haven't initialized logging, we have nothing to do. */
+ if (!LOGGING_ON(env))
+ return (0);
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ MUTEX_LOCK(env, lp->mtx_filelist);
+ SH_TAILQ_FOREACH(fnp, &lp->fq, q, __fname)
+ if (fnp->id != DB_LOGFILEID_INVALID)
+ F_SET(fnp, DB_FNAME_RESTORED);
+
+ MUTEX_UNLOCK(env, lp->mtx_filelist);
+ return (0);
+}
+
+/*
+ * __dbreg_invalidate_files --
+ * Invalidate files when we change replication roles. Save the
+ * id so that another process will be able to clean up the information
+ * when it notices.
+ *
+ * PUBLIC: int __dbreg_invalidate_files __P((ENV *, int));
+ */
+int
+__dbreg_invalidate_files(env, do_restored)
+ ENV *env;
+ int do_restored;
+{
+ DB_LOG *dblp;
+ FNAME *fnp;
+ LOG *lp;
+ int ret;
+
+ /* If we haven't initialized logging, we have nothing to do. */
+ if (!LOGGING_ON(env))
+ return (0);
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ ret = 0;
+ MUTEX_LOCK(env, lp->mtx_filelist);
+ SH_TAILQ_FOREACH(fnp, &lp->fq, q, __fname) {
+ /*
+ * Normally, skip any file with DB_FNAME_RESTORED
+ * set. If do_restored is set, only invalidate
+ * those files with the flag set and skip all others.
+ */
+ if (F_ISSET(fnp, DB_FNAME_RESTORED) && !do_restored)
+ continue;
+ if (!F_ISSET(fnp, DB_FNAME_RESTORED) && do_restored)
+ continue;
+ if (fnp->id != DB_LOGFILEID_INVALID) {
+ if ((ret = __dbreg_log_close(env,
+ fnp, NULL, DBREG_RCLOSE)) != 0)
+ goto err;
+ fnp->old_id = fnp->id;
+ fnp->id = DB_LOGFILEID_INVALID;
+ }
+ }
+err: MUTEX_UNLOCK(env, lp->mtx_filelist);
+ return (ret);
+}
+
+/*
+ * __dbreg_id_to_db --
+ * Return the DB corresponding to the specified dbreg id.
+ *
+ * PUBLIC: int __dbreg_id_to_db __P((ENV *, DB_TXN *, DB **, int32_t, int));
+ */
+int
+__dbreg_id_to_db(env, txn, dbpp, ndx, tryopen)
+ ENV *env;
+ DB_TXN *txn;
+ DB **dbpp;
+ int32_t ndx;
+ int tryopen;
+{
+ DB_LOG *dblp;
+ FNAME *fname;
+ int ret;
+ char *name;
+
+ dblp = env->lg_handle;
+ ret = 0;
+
+ MUTEX_LOCK(env, dblp->mtx_dbreg);
+
+ /*
+ * We take a final parameter that indicates whether we should attempt
+ * to open the file if no mapping is found. During recovery, the
+ * recovery routines all want to try to open the file (and this is
+ * called from __dbreg_id_to_db), however, if we have a multi-process
+ * environment where some processes may not have the files open,
+ * then we also get called from __dbreg_assign_id and it's OK if
+ * there is no mapping.
+ *
+ * Under failchk, a process different than the one issuing DB
+ * operations may abort a transaction. In this case, the "recovery"
+ * routines are run by a process that does not necessarily have the
+	 * file open, so we must open the file explicitly.
+ */
+ if (ndx >= dblp->dbentry_cnt ||
+ (!dblp->dbentry[ndx].deleted && dblp->dbentry[ndx].dbp == NULL)) {
+ if (!tryopen || F_ISSET(dblp, DBLOG_RECOVER)) {
+ ret = ENOENT;
+ goto err;
+ }
+
+ /*
+ * __dbreg_id_to_fname acquires the mtx_filelist mutex, which
+ * we can't safely acquire while we hold the thread lock. We
+ * no longer need it anyway--the dbentry table didn't have what
+ * we needed.
+ */
+ MUTEX_UNLOCK(env, dblp->mtx_dbreg);
+
+ if (__dbreg_id_to_fname(dblp, ndx, 0, &fname) != 0)
+ /*
+ * With transactional opens, we may actually have
+ * closed this file in the transaction in which
+ * case this will fail too. Then it's up to the
+ * caller to reopen the file.
+ */
+ return (ENOENT);
+
+ /*
+ * Note that we're relying on fname not to change, even though
+ * we released the mutex that protects it (mtx_filelist) inside
+ * __dbreg_id_to_fname. This should be a safe assumption, the
+ * other process that has the file open shouldn't be closing it
+ * while we're trying to abort.
+ */
+ name = fname->fname_off == INVALID_ROFF ?
+ NULL : R_ADDR(&dblp->reginfo, fname->fname_off);
+
+ /*
+ * At this point, we are not holding the thread lock, so exit
+ * directly instead of going through the exit code at the
+ * bottom. If the __dbreg_do_open succeeded, then we don't need
+ * to do any of the remaining error checking at the end of this
+ * routine.
+ * If TXN_INVALID is passed then no txnlist is needed.
+ */
+ if ((ret = __dbreg_do_open(env, txn, dblp,
+ fname->ufid, name, fname->s_type, ndx, fname->meta_pgno,
+ NULL, TXN_INVALID, F_ISSET(fname, DB_FNAME_INMEM) ?
+ DBREG_REOPEN : DBREG_OPEN)) != 0)
+ return (ret);
+
+ *dbpp = dblp->dbentry[ndx].dbp;
+ return (*dbpp == NULL ? DB_DELETED : 0);
+ }
+
+ /*
+ * Return DB_DELETED if the file has been deleted (it's not an error).
+ */
+ if (dblp->dbentry[ndx].deleted) {
+ ret = DB_DELETED;
+ goto err;
+ }
+
+ /* It's an error if we don't have a corresponding writable DB. */
+ if ((*dbpp = dblp->dbentry[ndx].dbp) == NULL)
+ ret = ENOENT;
+ else
+ /*
+ * If we are in recovery, then set that the file has
+ * been written. It is possible to run recovery,
+ * find all the pages in their post update state
+ * in the OS buffer pool, put a checkpoint in the log
+ * and then crash the system without forcing the pages
+ * to disk. If this is an in-memory file, we may not have
+ * an mpf yet.
+ */
+ if ((*dbpp)->mpf != NULL && (*dbpp)->mpf->mfp != NULL)
+ (*dbpp)->mpf->mfp->file_written = 1;
+
+err: MUTEX_UNLOCK(env, dblp->mtx_dbreg);
+ return (ret);
+}
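+
+/*
+ * Illustrative caller sketch (not in the original source; names are
+ * hypothetical): a recovery routine typically resolves the dbreg id
+ * from its log record before touching any pages, e.g.:
+ *
+ *	DB *file_dbp;
+ *
+ *	if ((ret = __dbreg_id_to_db(env,
+ *	    txn, &file_dbp, argp->fileid, 1)) != 0)
+ *		return (ret == DB_DELETED ? 0 : ret);
+ *
+ * DB_DELETED means the record refers to a file that no longer exists,
+ * so the operation can usually be skipped rather than treated as an
+ * error.
+ */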
+
+/*
+ * __dbreg_id_to_fname --
+ * Traverse the shared-memory region looking for the entry that
+ * matches the passed dbreg id. Returns 0 on success; -1 on error.
+ *
+ * PUBLIC: int __dbreg_id_to_fname __P((DB_LOG *, int32_t, int, FNAME **));
+ */
+int
+__dbreg_id_to_fname(dblp, id, have_lock, fnamep)
+ DB_LOG *dblp;
+ int32_t id;
+ int have_lock;
+ FNAME **fnamep;
+{
+ ENV *env;
+ FNAME *fnp;
+ LOG *lp;
+ int ret;
+
+ env = dblp->env;
+ lp = dblp->reginfo.primary;
+
+ ret = -1;
+
+ if (!have_lock)
+ MUTEX_LOCK(env, lp->mtx_filelist);
+ SH_TAILQ_FOREACH(fnp, &lp->fq, q, __fname)
+ if (fnp->id == id) {
+ *fnamep = fnp;
+ ret = 0;
+ break;
+ }
+ if (!have_lock)
+ MUTEX_UNLOCK(env, lp->mtx_filelist);
+
+ return (ret);
+}
+
+/*
+ * __dbreg_fid_to_fname --
+ * Traverse the shared-memory region looking for the entry that
+ * matches the passed file unique id. Returns 0 on success; -1 on error.
+ *
+ * PUBLIC: int __dbreg_fid_to_fname __P((DB_LOG *, u_int8_t *, int, FNAME **));
+ */
+int
+__dbreg_fid_to_fname(dblp, fid, have_lock, fnamep)
+ DB_LOG *dblp;
+ u_int8_t *fid;
+ int have_lock;
+ FNAME **fnamep;
+{
+ ENV *env;
+ FNAME *fnp;
+ LOG *lp;
+ int ret;
+
+ env = dblp->env;
+ lp = dblp->reginfo.primary;
+
+ ret = -1;
+
+ if (!have_lock)
+ MUTEX_LOCK(env, lp->mtx_filelist);
+ SH_TAILQ_FOREACH(fnp, &lp->fq, q, __fname)
+ if (memcmp(fnp->ufid, fid, DB_FILE_ID_LEN) == 0) {
+ *fnamep = fnp;
+ ret = 0;
+ break;
+ }
+ if (!have_lock)
+ MUTEX_UNLOCK(env, lp->mtx_filelist);
+
+ return (ret);
+}
+
+/*
+ * __dbreg_get_name
+ *
+ * Interface to get the name of a registered file. This is mainly for
+ * diagnostics; the name returned could be transient unless something
+ * ensures that the file cannot be closed.
+ *
+ * PUBLIC: int __dbreg_get_name __P((ENV *, u_int8_t *, char **, char **));
+ */
+int
+__dbreg_get_name(env, fid, fnamep, dnamep)
+ ENV *env;
+ u_int8_t *fid;
+ char **fnamep, **dnamep;
+{
+ DB_LOG *dblp;
+ FNAME *fnp;
+
+ dblp = env->lg_handle;
+
+ if (dblp != NULL && __dbreg_fid_to_fname(dblp, fid, 0, &fnp) == 0) {
+ *fnamep = fnp->fname_off == INVALID_ROFF ?
+ NULL : R_ADDR(&dblp->reginfo, fnp->fname_off);
+ *dnamep = fnp->dname_off == INVALID_ROFF ?
+ NULL : R_ADDR(&dblp->reginfo, fnp->dname_off);
+ return (0);
+ }
+
+ *fnamep = *dnamep = NULL;
+ return (-1);
+}
+
+/*
+ * __dbreg_do_open --
+ * Open files referenced in the log. This is the part of the open that
+ * is not protected by the thread mutex.
+ * PUBLIC: int __dbreg_do_open __P((ENV *,
+ * PUBLIC: DB_TXN *, DB_LOG *, u_int8_t *, char *, DBTYPE,
+ * PUBLIC: int32_t, db_pgno_t, void *, u_int32_t, u_int32_t));
+ */
+int
+__dbreg_do_open(env,
+ txn, lp, uid, name, ftype, ndx, meta_pgno, info, id, opcode)
+ ENV *env;
+ DB_TXN *txn;
+ DB_LOG *lp;
+ u_int8_t *uid;
+ char *name;
+ DBTYPE ftype;
+ int32_t ndx;
+ db_pgno_t meta_pgno;
+ void *info;
+ u_int32_t id, opcode;
+{
+ DB *dbp;
+ u_int32_t cstat, ret_stat;
+ int ret, t_ret, try_inmem;
+ char *dname, *fname;
+
+ cstat = TXN_EXPECTED;
+ fname = name;
+ dname = NULL;
+ try_inmem = 0;
+
+retry_inmem:
+ if ((ret = __db_create_internal(&dbp, lp->env, 0)) != 0)
+ return (ret);
+
+ /*
+ * We can open files under a number of different scenarios.
+ * First, we can open a file during a normal txn_abort, if that file
+ * was opened and closed during the transaction (as is the master
+ * database of a sub-database).
+ * Second, we might be aborting a transaction in a process other than
+ * the one that did it (failchk).
+ * Third, we might be in recovery.
+ * In case 3, there is no locking, so there is no issue.
+ * In cases 1 and 2, we are guaranteed to already hold any locks
+ * that we need, since we're still in the same transaction, so by
+ * setting DB_AM_RECOVER, we guarantee that we don't log and that
+ * we don't try to acquire locks on behalf of a different locker id.
+ */
+ F_SET(dbp, DB_AM_RECOVER);
+ if (meta_pgno != PGNO_BASE_MD) {
+ memcpy(dbp->fileid, uid, DB_FILE_ID_LEN);
+ dbp->meta_pgno = meta_pgno;
+ }
+
+ if (opcode == DBREG_PREOPEN) {
+ dbp->type = ftype;
+ if ((ret = __dbreg_setup(dbp, name, NULL, id)) != 0)
+ goto err;
+ MAKE_INMEM(dbp);
+ goto skip_open;
+ }
+
+ if (opcode == DBREG_REOPEN || opcode == DBREG_XREOPEN || try_inmem) {
+ MAKE_INMEM(dbp);
+ fname = NULL;
+ dname = name;
+ }
+
+ if (opcode == DBREG_XOPEN || opcode == DBREG_XCHKPNT ||
+ opcode == DBREG_XREOPEN)
+ F2_SET(dbp, DB2_AM_EXCL|DB2_AM_INTEXCL);
+
+ if ((ret = __db_open(dbp, NULL, txn, fname, dname, ftype,
+ DB_DURABLE_UNKNOWN | DB_ODDFILESIZE,
+ DB_MODE_600, meta_pgno)) == 0) {
+skip_open:
+ /*
+ * Verify that we are opening the same file that we were
+ * referring to when we wrote this log record.
+ */
+ if ((meta_pgno != PGNO_BASE_MD &&
+ __dbreg_check_master(env, uid, name) != 0) ||
+ memcmp(uid, dbp->fileid, DB_FILE_ID_LEN) != 0)
+ cstat = TXN_UNEXPECTED;
+ else
+ cstat = TXN_EXPECTED;
+
+ /* Assign the specific dbreg id to this dbp. */
+ if ((ret = __dbreg_assign_id(dbp, ndx, 0)) != 0)
+ goto err;
+
+ /*
+		 * Record the newly-opened file in the transaction so it is
+		 * closed when the transaction ends. Decrement the reference count
+ * because there will be no explicit close for this handle and
+ * we want it to be closed when the transaction ends.
+ */
+ if (txn != NULL && (ret =
+ __txn_record_fname(env, txn, dbp->log_filename)) != 0)
+ goto err;
+ --dbp->log_filename->txn_ref;
+
+ /*
+ * If we successfully opened this file, then we need to
+ * convey that information to the txnlist so that we
+ * know how to handle the subtransaction that created
+ * the file system object.
+ */
+ if (id != TXN_INVALID)
+ ret = __db_txnlist_update(env,
+ info, id, cstat, NULL, &ret_stat, 1);
+
+err: if (cstat == TXN_UNEXPECTED)
+ goto not_right;
+ return (ret);
+ } else if (ret == ENOENT) {
+ /*
+ * If the open failed with ENOENT, retry it as a named in-mem
+ * database. Some record types do not distinguish between a
+ * named in-memory database and one on-disk. Therefore, an
+ * internal init via replication that is trying to open and
+ * access this as a named in-mem database will not find it
+ * on-disk, and we need to try to open it in-memory too.
+ * But don't do this for [P]REOPEN, since we're already
+ * handling those cases specially, above.
+ */
+ if (try_inmem == 0 &&
+ opcode != DBREG_PREOPEN && opcode != DBREG_REOPEN &&
+ opcode != DBREG_XREOPEN) {
+ if ((ret = __db_close(dbp, NULL, DB_NOSYNC)) != 0)
+ return (ret);
+ try_inmem = 1;
+ goto retry_inmem;
+ } else if (try_inmem != 0)
+ CLR_INMEM(dbp);
+
+ /*
+ * If it exists neither on disk nor in memory
+ * record that the open failed in the txnlist.
+ */
+ if (id != TXN_INVALID && (ret = __db_txnlist_update(env,
+ info, id, TXN_UNEXPECTED, NULL, &ret_stat, 1)) != 0)
+ goto not_right;
+
+ /*
+		 * If this file is missing, then we may have crashed
+		 * without writing the corresponding close; record
+		 * the open so recovery will write a close record
+ * with its checkpoint. If this is a backward pass then
+ * we are closing a non-existent file and need to mark
+ * it as deleted.
+ */
+ if (dbp->log_filename == NULL &&
+ (ret = __dbreg_setup(dbp, name, NULL, id)) != 0)
+ return (ret);
+ ret = __dbreg_assign_id(dbp, ndx, 1);
+ return (ret);
+ }
+not_right:
+ if ((t_ret = __db_close(dbp, NULL, DB_NOSYNC)) != 0)
+ return (ret == 0 ? t_ret : ret);
+
+ /* Add this file as deleted. */
+ if ((t_ret = __dbreg_add_dbentry(env, lp, NULL, ndx)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
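+
+/*
+ * Sketch of the fallback path above (illustrative): when __db_open of
+ * an on-disk name returns ENOENT, the open is retried once with the
+ * same name treated as a named in-memory database (try_inmem set,
+ * fname moved to dname). Only if both attempts fail is the id entered
+ * in the dbentry table as a deleted file.
+ */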
+
+static int
+__dbreg_check_master(env, uid, name)
+ ENV *env;
+ u_int8_t *uid;
+ char *name;
+{
+ DB *dbp;
+ int ret;
+
+ if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+ return (ret);
+ F_SET(dbp, DB_AM_RECOVER);
+ ret = __db_open(dbp, NULL, NULL,
+ name, NULL, DB_BTREE, 0, DB_MODE_600, PGNO_BASE_MD);
+
+ if (ret == 0 && memcmp(uid, dbp->fileid, DB_FILE_ID_LEN) != 0)
+ ret = EINVAL;
+
+ (void)__db_close(dbp, NULL, 0);
+ return (ret);
+}
+
+/*
+ * __dbreg_lazy_id --
+ * When a replication client gets upgraded to being a replication master,
+ * it may have database handles open that have not been assigned an ID, but
+ * which have become legal to use for logging.
+ *
+ * This function lazily allocates a new ID for such a handle, in a
+ * new transaction created for the purpose. We need to do this in a new
+ * transaction because we definitely wish to commit the dbreg_register, but
+ * at this point we have no way of knowing whether the log record that incited
+ * us to call this will be part of a committed transaction.
+ *
+ * We first revoke any old id this handle may have had. That can happen
+ * if a master becomes a client and then becomes a master again and
+ * there are other processes with valid open handles to this env.
+ *
+ * PUBLIC: int __dbreg_lazy_id __P((DB *));
+ */
+int
+__dbreg_lazy_id(dbp)
+ DB *dbp;
+{
+ DB_LOG *dblp;
+ DB_TXN *txn;
+ ENV *env;
+ FNAME *fnp;
+ LOG *lp;
+ int32_t id;
+ int ret;
+
+ env = dbp->env;
+
+ DB_ASSERT(env, IS_REP_MASTER(env) || F_ISSET(dbp, DB_AM_NOT_DURABLE));
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ fnp = dbp->log_filename;
+
+ /* The mtx_filelist protects the FNAME list and id management. */
+ MUTEX_LOCK(env, lp->mtx_filelist);
+ if (fnp->id != DB_LOGFILEID_INVALID) {
+ MUTEX_UNLOCK(env, lp->mtx_filelist);
+ return (0);
+ }
+ id = DB_LOGFILEID_INVALID;
+ /*
+ * When we became master we moved the fnp->id to old_id in
+ * every FNAME structure that was open. If our id was changed,
+ * we need to revoke and give back that id.
+ */
+ if (fnp->old_id != DB_LOGFILEID_INVALID &&
+ (ret = __dbreg_revoke_id(dbp, 1, DB_LOGFILEID_INVALID)) != 0)
+ goto err;
+ if ((ret = __txn_begin(env, NULL, NULL, &txn, DB_IGNORE_LEASE)) != 0)
+ goto err;
+
+ if ((ret = __dbreg_get_id(dbp, txn, &id)) != 0) {
+ (void)__txn_abort(txn);
+ goto err;
+ }
+
+ if ((ret = __txn_commit(txn, DB_TXN_NOSYNC)) != 0)
+ goto err;
+
+ /*
+ * All DB related logging routines check the id value *without*
+ * holding the mtx_filelist to know whether we need to call
+ * dbreg_lazy_id to begin with. We must set the ID after a
+ * *successful* commit so that there is no possibility of a second
+ * modification call finding a valid ID in the dbp before the
+ * dbreg_register and commit records are in the log.
+ * If there was an error, then we call __dbreg_revoke_id to
+ * remove the entry from the lists.
+ */
+ fnp->id = id;
+err:
+ if (ret != 0 && id != DB_LOGFILEID_INVALID)
+ (void)__dbreg_revoke_id(dbp, 1, id);
+ MUTEX_UNLOCK(env, lp->mtx_filelist);
+ return (ret);
+}
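+
+/*
+ * The protocol above, in brief (illustrative): while holding
+ * mtx_filelist, revoke any stale pre-switch id, then log the
+ * dbreg_register in its own transaction:
+ *
+ *	__txn_begin(...); __dbreg_get_id(dbp, txn, &id); __txn_commit(...);
+ *
+ * and publish fnp->id only after the commit succeeds, so a concurrent
+ * writer can never log against an id whose registration might still
+ * be rolled back.
+ */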
diff --git a/src/env/env_alloc.c b/src/env/env_alloc.c
new file mode 100644
index 00000000..700bfb27
--- /dev/null
+++ b/src/env/env_alloc.c
@@ -0,0 +1,759 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * Implement shared memory region allocation. The initial list is a single
+ * memory "chunk" which is carved up as memory is requested. Chunks are
+ * coalesced when free'd. We maintain two types of linked-lists: a list of
+ * all chunks sorted by address, and a set of lists with free chunks sorted
+ * by size.
+ *
+ * The ALLOC_LAYOUT structure is the governing structure for the allocator.
+ *
+ * The ALLOC_ELEMENT structure is the structure that describes any single
+ * chunk of memory, and is immediately followed by the user's memory.
+ *
+ * The internal memory chunks are always aligned to a uintmax_t boundary so
+ * we don't drop core accessing the fields of the ALLOC_ELEMENT structure.
+ *
+ * The memory chunks returned to the user are aligned to a uintmax_t boundary.
+ * This is enforced by terminating the ALLOC_ELEMENT structure with a uintmax_t
+ * field as that immediately precedes the user's memory. Any caller needing
+ * more than uintmax_t alignment is responsible for doing alignment themselves.
+ */
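+
+/*
+ * A sketch of the resulting region layout (illustrative only):
+ *
+ *	[ALLOC_LAYOUT][ALLOC_ELEMENT|user mem][ALLOC_ELEMENT|user mem]...
+ *
+ * Every chunk is linked on the address queue; only chunks with
+ * ulen == 0 (not in use) are additionally linked on a size queue.
+ */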
+
+typedef SH_TAILQ_HEAD(__sizeq) SIZEQ_HEAD;
+
+typedef struct __alloc_layout {
+ SH_TAILQ_HEAD(__addrq) addrq; /* Sorted by address */
+
+ /*
+ * A perfect Berkeley DB application does little allocation because
+ * most things are allocated on startup and never free'd. This is
+ * true even for the cache, because we don't free and re-allocate
+ * the memory associated with a cache buffer when swapping a page
+ * in memory for a page on disk -- unless the page is changing size.
+ * The latter problem is why we have multiple size queues. If the
+ * application's working set fits in cache, it's not a problem. If
+ * the application's working set doesn't fit in cache, but all of
+ * the databases have the same size pages, it's still not a problem.
+ * If the application's working set doesn't fit in cache, and its
+ * databases have different page sizes, we can end up walking a lot
+ * of 512B chunk allocations looking for an available 64KB chunk.
+ *
+ * So, we keep a set of queues, where we expect to find a chunk of
+ * roughly the right size at the front of the list. The first queue
+ * is chunks <= 1024, the second is <= 2048, and so on. With 11
+ * queues, we have separate queues for chunks up to 1MB.
+ */
+#define DB_SIZE_Q_COUNT 11
+ SIZEQ_HEAD sizeq[DB_SIZE_Q_COUNT]; /* Sorted by size */
+#ifdef HAVE_STATISTICS
+ u_int32_t pow2_size[DB_SIZE_Q_COUNT];
+#endif
+
+#ifdef HAVE_STATISTICS
+ u_int32_t success; /* Successful allocations */
+ u_int32_t failure; /* Failed allocations */
+ u_int32_t freed; /* Free calls */
+ u_int32_t longest; /* Longest chain walked */
+#endif
+ uintmax_t unused; /* Guarantee alignment */
+} ALLOC_LAYOUT;
+
+typedef struct __alloc_element {
+ SH_TAILQ_ENTRY addrq; /* List by address */
+ SH_TAILQ_ENTRY sizeq; /* List by size */
+
+ /*
+ * The "len" field is the total length of the chunk, not the size
+ * available to the caller. Use a uintmax_t to guarantee that the
+ * size of this struct will be aligned correctly.
+ */
+ uintmax_t len; /* Chunk length */
+
+ /*
+ * The "ulen" field is the length returned to the caller.
+ *
+ * Set to 0 if the chunk is not currently in use.
+ */
+ uintmax_t ulen; /* User's length */
+} ALLOC_ELEMENT;
+
+/*
+ * If the chunk can be split into two pieces, with the fragment holding at
+ * least 64 bytes of memory, we divide the chunk into two parts.
+ */
+#define SHALLOC_FRAGMENT (sizeof(ALLOC_ELEMENT) + 64)
+
+/* Macro to find the appropriate queue for a specific size chunk. */
+#undef SET_QUEUE_FOR_SIZE
+#define SET_QUEUE_FOR_SIZE(head, q, i, len) do { \
+ for (i = 0; i < DB_SIZE_Q_COUNT; ++i) { \
+ q = &(head)->sizeq[i]; \
+ if ((len) <= (u_int64_t)1024 << i) \
+ break; \
+ } \
+} while (0)
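+
+/*
+ * For example (illustrative): a 700-byte chunk maps to sizeq[0]
+ * (chunks <= 1KB) and a 100,000-byte chunk maps to sizeq[7]
+ * (chunks <= 128KB); anything larger than 1MB leaves q pointing at
+ * the last (largest) queue with i == DB_SIZE_Q_COUNT.
+ */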
+
+static void __env_size_insert __P((ALLOC_LAYOUT *, ALLOC_ELEMENT *));
+
+/*
+ * __env_alloc_init --
+ * Initialize the area as one large chunk.
+ *
+ * PUBLIC: void __env_alloc_init __P((REGINFO *, size_t));
+ */
+void
+__env_alloc_init(infop, size)
+ REGINFO *infop;
+ size_t size;
+{
+ ALLOC_ELEMENT *elp;
+ ALLOC_LAYOUT *head;
+ ENV *env;
+ u_int i;
+
+ env = infop->env;
+
+ /* No initialization needed for heap memory regions. */
+ if (F_ISSET(env, ENV_PRIVATE))
+ return;
+
+ /*
+ * The first chunk of memory is the ALLOC_LAYOUT structure.
+ */
+ head = infop->head;
+ memset(head, 0, sizeof(*head));
+ SH_TAILQ_INIT(&head->addrq);
+ for (i = 0; i < DB_SIZE_Q_COUNT; ++i)
+ SH_TAILQ_INIT(&head->sizeq[i]);
+ COMPQUIET(head->unused, 0);
+
+ /*
+ * The rest of the memory is the first available chunk.
+ */
+ elp = (ALLOC_ELEMENT *)((u_int8_t *)head + sizeof(ALLOC_LAYOUT));
+ elp->len = size - sizeof(ALLOC_LAYOUT);
+ elp->ulen = 0;
+
+ SH_TAILQ_INSERT_HEAD(&head->addrq, elp, addrq, __alloc_element);
+ SH_TAILQ_INSERT_HEAD(
+ &head->sizeq[DB_SIZE_Q_COUNT - 1], elp, sizeq, __alloc_element);
+}
+
+/*
+ * The length, the ALLOC_ELEMENT structure and an optional guard byte,
+ * rounded up to standard alignment.
+ */
+#ifdef DIAGNOSTIC
+#define DB_ALLOC_SIZE(len) \
+ (size_t)DB_ALIGN((len) + sizeof(ALLOC_ELEMENT) + 1, sizeof(uintmax_t))
+#else
+#define DB_ALLOC_SIZE(len) \
+ (size_t)DB_ALIGN((len) + sizeof(ALLOC_ELEMENT), sizeof(uintmax_t))
+#endif
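+
+/*
+ * Worked example (illustrative, assuming an 8-byte uintmax_t and a
+ * 48-byte ALLOC_ELEMENT): a 100-byte request becomes
+ * DB_ALIGN(100 + 48, 8) = 152 bytes, or DB_ALIGN(100 + 48 + 1, 8) =
+ * 152 bytes under DIAGNOSTIC -- the alignment rounding often absorbs
+ * the guard byte.
+ */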
+
+/*
+ * __env_alloc_overhead --
+ * Return the overhead needed for an allocation.
+ *
+ * PUBLIC: size_t __env_alloc_overhead __P((void));
+ */
+size_t
+__env_alloc_overhead()
+{
+ return (sizeof(ALLOC_ELEMENT));
+}
+
+/*
+ * __env_alloc_size --
+ * Return the space needed for an allocation, including alignment.
+ *
+ * PUBLIC: size_t __env_alloc_size __P((size_t));
+ */
+size_t
+__env_alloc_size(len)
+ size_t len;
+{
+ return (DB_ALLOC_SIZE(len));
+}
+
+/*
+ * __env_alloc --
+ * Allocate space from the shared region.
+ *
+ * PUBLIC: int __env_alloc __P((REGINFO *, size_t, void *));
+ */
+int
+__env_alloc(infop, len, retp)
+ REGINFO *infop;
+ size_t len;
+ void *retp;
+{
+ SIZEQ_HEAD *q;
+ ALLOC_ELEMENT *elp, *frag, *elp_tmp;
+ ALLOC_LAYOUT *head;
+ ENV *env;
+ REGION_MEM *mem;
+ REGINFO *envinfop;
+ size_t total_len;
+ u_int8_t *p;
+ u_int i;
+ int ret;
+#ifdef HAVE_STATISTICS
+ u_int32_t st_search;
+#endif
+ env = infop->env;
+ *(void **)retp = NULL;
+#ifdef HAVE_MUTEX_SUPPORT
+ MUTEX_REQUIRED(env, infop->mtx_alloc);
+#endif
+
+ PERFMON3(env, mpool, env_alloc, len, infop->id, infop->type);
+ /*
+ * In a heap-backed environment, we call malloc for additional space.
+ * (Malloc must return memory correctly aligned for our use.)
+ *
+ * In a heap-backed environment, memory is laid out as follows:
+ *
+ * { uintmax_t total-length } { user-memory } { guard-byte }
+ */
+ if (F_ISSET(env, ENV_PRIVATE)) {
+ /*
+ * If we are shared then we must track the allocation
+ * in the main environment region.
+ */
+ if (F_ISSET(infop, REGION_SHARED))
+ envinfop = env->reginfo;
+ else
+ envinfop = infop;
+ /*
+ * We need an additional uintmax_t to hold the length (and
+ * keep the buffer aligned on 32-bit systems).
+ */
+ len += sizeof(uintmax_t);
+ if (F_ISSET(infop, REGION_TRACKED))
+ len += sizeof(REGION_MEM);
+
+#ifdef DIAGNOSTIC
+ /* Plus one byte for the guard byte. */
+ ++len;
+#endif
+ /* Check if we're over the limit. */
+ if (envinfop->max_alloc != 0 &&
+ envinfop->allocated + len > envinfop->max_alloc)
+ return (ENOMEM);
+
+ /* Allocate the space. */
+ if ((ret = __os_malloc(env, len, &p)) != 0)
+ return (ret);
+ infop->allocated += len;
+ if (infop != envinfop)
+ envinfop->allocated += len;
+
+ *(uintmax_t *)p = len;
+#ifdef DIAGNOSTIC
+ p[len - 1] = GUARD_BYTE;
+#endif
+ if (F_ISSET(infop, REGION_TRACKED)) {
+ mem = (REGION_MEM *)(p + sizeof(uintmax_t));
+ mem->next = infop->mem;
+ infop->mem = mem;
+			p += sizeof(*mem);
+ }
+ *(void **)retp = p + sizeof(uintmax_t);
+ return (0);
+ }
+
+ head = infop->head;
+ total_len = DB_ALLOC_SIZE(len);
+
+ /* Find the first size queue that could satisfy the request. */
+ COMPQUIET(q, NULL);
+#ifdef HAVE_MMAP_EXTEND
+retry:
+#endif
+ SET_QUEUE_FOR_SIZE(head, q, i, total_len);
+
+#ifdef HAVE_STATISTICS
+ if (i >= DB_SIZE_Q_COUNT)
+ i = DB_SIZE_Q_COUNT - 1;
+ ++head->pow2_size[i]; /* Note the size of the request. */
+#endif
+
+ /*
+ * Search this queue, and, if necessary, queues larger than this queue,
+ * looking for a chunk we can use.
+ */
+ STAT(st_search = 0);
+ for (elp = NULL;; ++q) {
+ SH_TAILQ_FOREACH(elp_tmp, q, sizeq, __alloc_element) {
+ STAT(++st_search);
+
+ /*
+ * Chunks are sorted from largest to smallest -- if
+ * this chunk is less than what we need, no chunk
+ * further down the list will be large enough.
+ */
+ if (elp_tmp->len < total_len)
+ break;
+
+ /*
+ * This chunk will do... maybe there's a better one,
+ * but this one will do.
+ */
+ elp = elp_tmp;
+
+ /*
+ * We might have many chunks of the same size. Stop
+ * looking if we won't fragment memory by picking the
+ * current one.
+ */
+ if (elp_tmp->len - total_len <= SHALLOC_FRAGMENT)
+ break;
+ }
+ if (elp != NULL || ++i >= DB_SIZE_Q_COUNT)
+ break;
+ }
+
+#ifdef HAVE_STATISTICS
+ if (head->longest < st_search) {
+ head->longest = st_search;
+ STAT_PERFMON3(env,
+ mpool, longest_search, len, infop->id, st_search);
+ }
+#endif
+
+ /*
+ * If we don't find an element of the right size, try to extend
+ * the region, if not then we are done.
+ */
+ if (elp == NULL) {
+ ret = ENOMEM;
+#ifdef HAVE_MMAP_EXTEND
+ if (infop->rp->size < infop->rp->max &&
+ (ret = __env_region_extend(env, infop)) == 0)
+ goto retry;
+#endif
+ STAT_INC_VERB(env, mpool, fail, head->failure, len, infop->id);
+ return (ret);
+ }
+ STAT_INC_VERB(env, mpool, alloc, head->success, len, infop->id);
+
+ /* Pull the chunk off of the size queue. */
+ SH_TAILQ_REMOVE(q, elp, sizeq, __alloc_element);
+
+ if (elp->len - total_len > SHALLOC_FRAGMENT) {
+ frag = (ALLOC_ELEMENT *)((u_int8_t *)elp + total_len);
+ frag->len = elp->len - total_len;
+ frag->ulen = 0;
+
+ elp->len = total_len;
+
+ /* The fragment follows the chunk on the address queue. */
+ SH_TAILQ_INSERT_AFTER(
+ &head->addrq, elp, frag, addrq, __alloc_element);
+
+ /* Insert the frag into the correct size queue. */
+ __env_size_insert(head, frag);
+ }
+
+ p = (u_int8_t *)elp + sizeof(ALLOC_ELEMENT);
+ elp->ulen = len;
+#ifdef DIAGNOSTIC
+ p[len] = GUARD_BYTE;
+#endif
+ *(void **)retp = p;
+
+ return (0);
+}
+
+/*
+ * __env_alloc_free --
+ * Free space into the shared region.
+ *
+ * PUBLIC: void __env_alloc_free __P((REGINFO *, void *));
+ */
+void
+__env_alloc_free(infop, ptr)
+ REGINFO *infop;
+ void *ptr;
+{
+ ALLOC_ELEMENT *elp, *elp_tmp;
+ ALLOC_LAYOUT *head;
+ ENV *env;
+ SIZEQ_HEAD *q;
+ size_t len;
+ u_int8_t i, *p;
+
+ env = infop->env;
+
+ /* In a private region, we call free. */
+ if (F_ISSET(env, ENV_PRIVATE)) {
+ /* Find the start of the memory chunk and its length. */
+ p = (u_int8_t *)((uintmax_t *)ptr - 1);
+ len = (size_t)*(uintmax_t *)p;
+
+ infop->allocated -= len;
+ if (F_ISSET(infop, REGION_SHARED))
+ env->reginfo->allocated -= len;
+
+#ifdef DIAGNOSTIC
+ /* Check the guard byte. */
+ DB_ASSERT(env, p[len - 1] == GUARD_BYTE);
+
+ /* Trash the memory chunk. */
+ memset(p, CLEAR_BYTE, len);
+#endif
+ __os_free(env, p);
+ return;
+ }
+
+#ifdef HAVE_MUTEX_SUPPORT
+ MUTEX_REQUIRED(env, infop->mtx_alloc);
+#endif
+
+ head = infop->head;
+
+ p = ptr;
+ elp = (ALLOC_ELEMENT *)(p - sizeof(ALLOC_ELEMENT));
+
+ STAT_INC_VERB(env, mpool, free, head->freed, elp->ulen, infop->id);
+
+#ifdef DIAGNOSTIC
+ /* Check the guard byte. */
+ DB_ASSERT(env, p[elp->ulen] == GUARD_BYTE);
+
+ /* Trash the memory chunk. */
+ memset(p, CLEAR_BYTE, (size_t)elp->len - sizeof(ALLOC_ELEMENT));
+#endif
+
+ /* Mark the memory as no longer in use. */
+ elp->ulen = 0;
+
+ /*
+ * Try and merge this chunk with chunks on either side of it. Two
+ * chunks can be merged if they're contiguous and not in use.
+ */
+ if ((elp_tmp =
+ SH_TAILQ_PREV(&head->addrq, elp, addrq, __alloc_element)) != NULL &&
+ elp_tmp->ulen == 0 &&
+ (u_int8_t *)elp_tmp + elp_tmp->len == (u_int8_t *)elp) {
+ /*
+ * If we're merging the entry into a previous entry, remove the
+ * current entry from the addr queue and the previous entry from
+ * its size queue, and merge.
+ */
+ SH_TAILQ_REMOVE(&head->addrq, elp, addrq, __alloc_element);
+ SET_QUEUE_FOR_SIZE(head, q, i, elp_tmp->len);
+ SH_TAILQ_REMOVE(q, elp_tmp, sizeq, __alloc_element);
+
+ elp_tmp->len += elp->len;
+ elp = elp_tmp;
+ }
+ if ((elp_tmp = SH_TAILQ_NEXT(elp, addrq, __alloc_element)) != NULL &&
+ elp_tmp->ulen == 0 &&
+ (u_int8_t *)elp + elp->len == (u_int8_t *)elp_tmp) {
+ /*
+ * If we're merging the current entry into a subsequent entry,
+ * remove the subsequent entry from the addr and size queues
+ * and merge.
+ */
+ SH_TAILQ_REMOVE(&head->addrq, elp_tmp, addrq, __alloc_element);
+ SET_QUEUE_FOR_SIZE(head, q, i, elp_tmp->len);
+ SH_TAILQ_REMOVE(q, elp_tmp, sizeq, __alloc_element);
+
+ elp->len += elp_tmp->len;
+ }
+
+ /* Insert in the correct place in the size queues. */
+ __env_size_insert(head, elp);
+}
+
+/*
+ * __env_alloc_extend --
+ * Extend a previously allocated chunk at the end of a region.
+ *
+ * PUBLIC: int __env_alloc_extend __P((REGINFO *, void *, size_t *));
+ */
+int
+__env_alloc_extend(infop, ptr, lenp)
+ REGINFO *infop;
+ void *ptr;
+ size_t *lenp;
+{
+ ALLOC_ELEMENT *elp, *elp_tmp;
+ ALLOC_LAYOUT *head;
+ ENV *env;
+ SIZEQ_HEAD *q;
+ size_t len, tlen;
+ u_int8_t i, *p;
+ int ret;
+
+ env = infop->env;
+
+ DB_ASSERT(env, !F_ISSET(env, ENV_PRIVATE));
+
+#ifdef HAVE_MUTEX_SUPPORT
+ MUTEX_REQUIRED(env, infop->mtx_alloc);
+#endif
+
+ head = infop->head;
+
+ p = ptr;
+ len = *lenp;
+ elp = (ALLOC_ELEMENT *)(p - sizeof(ALLOC_ELEMENT));
+#ifdef DIAGNOSTIC
+ /* Check the guard byte. */
+ DB_ASSERT(env, p[elp->ulen] == GUARD_BYTE);
+#endif
+
+ /* See if there is anything left in the region. */
+again: if ((elp_tmp = SH_TAILQ_NEXT(elp, addrq, __alloc_element)) != NULL &&
+ elp_tmp->ulen == 0 &&
+ (u_int8_t *)elp + elp->len == (u_int8_t *)elp_tmp) {
+ /*
+ * If we're merging the current entry into a subsequent entry,
+ * remove the subsequent entry from the addr and size queues
+ * and merge.
+ */
+ SH_TAILQ_REMOVE(&head->addrq, elp_tmp, addrq, __alloc_element);
+ SET_QUEUE_FOR_SIZE(head, q, i, elp_tmp->len);
+ SH_TAILQ_REMOVE(q, elp_tmp, sizeq, __alloc_element);
+ if (elp_tmp->len < len + SHALLOC_FRAGMENT) {
+ elp->len += elp_tmp->len;
+ if (elp_tmp->len < len)
+ len -= (size_t)elp_tmp->len;
+ else
+ len = 0;
+ } else {
+ tlen = (size_t)elp_tmp->len;
+ elp_tmp = (ALLOC_ELEMENT *) ((u_int8_t *)elp_tmp + len);
+ elp_tmp->len = tlen - len;
+ elp_tmp->ulen = 0;
+ elp->len += len;
+ len = 0;
+
+			/* The fragment follows the chunk on the address queue. */
+ SH_TAILQ_INSERT_AFTER(
+ &head->addrq, elp, elp_tmp, addrq, __alloc_element);
+
+ /* Insert the frag into the correct size queue. */
+ __env_size_insert(head, elp_tmp);
+ }
+ } else if (elp_tmp != NULL) {
+ __db_errx(env, DB_STR("1583", "block not at end of region"));
+ return (__env_panic(env, EINVAL));
+ }
+ if (len == 0)
+ goto done;
+
+ if ((ret = __env_region_extend(env, infop)) != 0) {
+ if (ret != ENOMEM)
+ return (ret);
+ goto done;
+ }
+ goto again;
+
+done: elp->ulen = elp->len - sizeof(ALLOC_ELEMENT);
+#ifdef DIAGNOSTIC
+ elp->ulen -= sizeof(uintmax_t);
+ /* There was room for the guard byte in the chunk that came in. */
+ p[elp->ulen] = GUARD_BYTE;
+#endif
+ *lenp -= len;
+ infop->allocated += *lenp;
+ if (F_ISSET(infop, REGION_SHARED))
+ env->reginfo->allocated += *lenp;
+ return (0);
+}
+
+/*
+ * __env_size_insert --
+ * Insert into the correct place in the size queues.
+ */
+static void
+__env_size_insert(head, elp)
+ ALLOC_LAYOUT *head;
+ ALLOC_ELEMENT *elp;
+{
+ SIZEQ_HEAD *q;
+ ALLOC_ELEMENT *elp_tmp;
+ u_int i;
+
+ /* Find the appropriate queue for the chunk. */
+ SET_QUEUE_FOR_SIZE(head, q, i, elp->len);
+
+ /* Find the correct slot in the size queue. */
+ SH_TAILQ_FOREACH(elp_tmp, q, sizeq, __alloc_element)
+ if (elp->len >= elp_tmp->len)
+ break;
+ if (elp_tmp == NULL)
+ SH_TAILQ_INSERT_TAIL(q, elp, sizeq);
+ else
+ SH_TAILQ_INSERT_BEFORE(q, elp_tmp, elp, sizeq, __alloc_element);
+}
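+
+/*
+ * Illustrative example: if sizeq[2] holds free chunks of lengths
+ * {4096, 3000, 2500} and a 3500-byte chunk is freed, it is inserted
+ * before the 3000-byte entry. Queues stay sorted largest to smallest,
+ * which is what lets the search loop in __env_alloc stop as soon as a
+ * chunk is too small.
+ */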
+
+/*
+ * __env_region_extend --
+ * Extend a region.
+ *
+ * PUBLIC: int __env_region_extend __P((ENV *, REGINFO *));
+ */
+int
+__env_region_extend(env, infop)
+ ENV *env;
+ REGINFO *infop;
+{
+ ALLOC_ELEMENT *elp;
+ REGION *rp;
+ int ret;
+
+ DB_ASSERT(env, !F_ISSET(env, ENV_PRIVATE));
+
+ ret = 0;
+ rp = infop->rp;
+ if (rp->size >= rp->max)
+ return (ENOMEM);
+ elp = (ALLOC_ELEMENT *)((u_int8_t *)infop->addr + rp->size);
+ if (rp->size + rp->alloc > rp->max)
+ rp->alloc = rp->max - rp->size;
+ rp->size += rp->alloc;
+ rp->size = (size_t)ALIGNP_INC(rp->size, sizeof(size_t));
+ if (rp->max - rp->size <= SHALLOC_FRAGMENT)
+ rp->size = rp->max;
+ if (infop->fhp &&
+ (ret = __db_file_extend(env, infop->fhp, rp->size)) != 0)
+ return (ret);
+ elp->len = rp->alloc;
+ elp->ulen = 0;
+#ifdef DIAGNOSTIC
+ *(u_int8_t *)(elp+1) = GUARD_BYTE;
+#endif
+
+ SH_TAILQ_INSERT_TAIL(&((ALLOC_LAYOUT *)infop->head)->addrq, elp, addrq);
+ __env_alloc_free(infop, elp + 1);
+ if (rp->alloc < MEGABYTE)
+ rp->alloc += rp->size;
+ if (rp->alloc > MEGABYTE)
+ rp->alloc = MEGABYTE;
+ return (ret);
+}
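+
+/*
+ * Illustrative growth pattern: while rp->alloc is below MEGABYTE, each
+ * successful extension adds the current region size to the next
+ * increment, and once it exceeds MEGABYTE it is capped there, so a
+ * region grows roughly geometrically at first and then 1MB at a time
+ * until rp->max is reached.
+ */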
+
+/*
+ * __env_elem_size --
+ * Return the size of an allocated element.
+ * PUBLIC: uintmax_t __env_elem_size __P((ENV *, void *));
+ */
+uintmax_t
+__env_elem_size(env, p)
+ ENV *env;
+ void *p;
+{
+ ALLOC_ELEMENT *elp;
+ uintmax_t size;
+
+ if (F_ISSET(env, ENV_PRIVATE)) {
+ size = *((uintmax_t *)p - 1);
+ size -= sizeof(uintmax_t);
+ } else {
+ elp = (ALLOC_ELEMENT *)((u_int8_t *)p - sizeof(ALLOC_ELEMENT));
+ size = elp->ulen;
+ }
+ return (size);
+}
+
+/*
+ * __env_get_chunk --
+ * Return the next chunk allocated in a private region.
+ * PUBLIC: void * __env_get_chunk __P((REGINFO *, void **, uintmax_t *));
+ */
+void *
+__env_get_chunk(infop, nextp, sizep)
+ REGINFO *infop;
+ void **nextp;
+ uintmax_t *sizep;
+{
+ REGION_MEM *mem;
+
+ if (infop->mem == NULL)
+ return (NULL);
+ if (*nextp == NULL)
+ *nextp = infop->mem;
+ mem = *(REGION_MEM **)nextp;
+ *nextp = mem->next;
+
+ *sizep = __env_elem_size(infop->env, mem);
+ *sizep -= sizeof(*mem);
+
+ return ((void *)(mem + 1));
+}
+
+#ifdef HAVE_STATISTICS
+/*
+ * __env_alloc_print --
+ * Display the lists of memory chunks.
+ *
+ * PUBLIC: void __env_alloc_print __P((REGINFO *, u_int32_t));
+ */
+void
+__env_alloc_print(infop, flags)
+ REGINFO *infop;
+ u_int32_t flags;
+{
+ ALLOC_ELEMENT *elp;
+ ALLOC_LAYOUT *head;
+ ENV *env;
+ u_int i;
+
+ env = infop->env;
+ head = infop->head;
+
+ if (F_ISSET(env, ENV_PRIVATE))
+ return;
+
+ __db_msg(env,
+ "Region allocations: %lu allocations, %lu failures, %lu frees, %lu longest",
+ (u_long)head->success, (u_long)head->failure, (u_long)head->freed,
+ (u_long)head->longest);
+
+ if (!LF_ISSET(DB_STAT_ALL))
+ return;
+
+ __db_msg(env, "%s", "Allocations by power-of-two sizes:");
+ for (i = 0; i < DB_SIZE_Q_COUNT; ++i)
+ __db_msg(env, "%3dKB\t%lu",
+ (1024 << i) / 1024, (u_long)head->pow2_size[i]);
+
+ if (!LF_ISSET(DB_STAT_ALLOC))
+ return;
+ /*
+	 * We don't normally display the list of address/chunk pairs; a few
+ * thousand lines of output is too voluminous for even DB_STAT_ALL.
+ */
+ __db_msg(env,
+ "Allocation list by address, offset: {chunk length, user length}");
+ SH_TAILQ_FOREACH(elp, &head->addrq, addrq, __alloc_element)
+ __db_msg(env, "\t%#lx, %lu {%lu, %lu}",
+ P_TO_ULONG(elp), (u_long)R_OFFSET(infop, elp),
+ (u_long)elp->len, (u_long)elp->ulen);
+
+ __db_msg(env, "Allocation free list by size: KB {chunk length}");
+ for (i = 0; i < DB_SIZE_Q_COUNT; ++i) {
+ __db_msg(env, "%3dKB", (1024 << i) / 1024);
+ SH_TAILQ_FOREACH(elp, &head->sizeq[i], sizeq, __alloc_element)
+ __db_msg(env,
+ "\t%#lx {%lu}", P_TO_ULONG(elp), (u_long)elp->len);
+ }
+}
+#endif
diff --git a/src/env/env_backup.c b/src/env/env_backup.c
new file mode 100644
index 00000000..9c79dbb4
--- /dev/null
+++ b/src/env/env_backup.c
@@ -0,0 +1,166 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2011, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+static int __env_backup_alloc __P((DB_ENV *));
+
+static int
+__env_backup_alloc(dbenv)
+ DB_ENV *dbenv;
+{
+ ENV *env;
+
+ env = dbenv->env;
+ if (env->backup_handle != NULL)
+ return (0);
+ return (__os_calloc(env, 1,
+ sizeof(*env->backup_handle), &env->backup_handle));
+}
+
+/*
+ * __env_get_backup_config --
+ *
+ * PUBLIC: int __env_get_backup_config __P((DB_ENV *,
+ * PUBLIC: DB_BACKUP_CONFIG, u_int32_t*));
+ */
+int
+__env_get_backup_config(dbenv, config, valuep)
+ DB_ENV *dbenv;
+ DB_BACKUP_CONFIG config;
+ u_int32_t *valuep;
+{
+ DB_BACKUP *backup;
+
+ backup = dbenv->env->backup_handle;
+ if (backup == NULL)
+ return (EINVAL);
+
+ switch (config) {
+ case DB_BACKUP_WRITE_DIRECT:
+ *valuep = F_ISSET(backup, BACKUP_WRITE_DIRECT);
+ break;
+
+ case DB_BACKUP_READ_COUNT:
+ *valuep = backup->read_count;
+ break;
+
+ case DB_BACKUP_READ_SLEEP:
+ *valuep = backup->read_sleep;
+ break;
+
+ case DB_BACKUP_SIZE:
+ *valuep = backup->size;
+ break;
+ }
+ return (0);
+}
+
+/*
+ * __env_set_backup_config --
+ *
+ * PUBLIC: int __env_set_backup_config __P((DB_ENV *,
+ * PUBLIC: DB_BACKUP_CONFIG, u_int32_t));
+ */
+int
+__env_set_backup_config(dbenv, config, value)
+ DB_ENV *dbenv;
+ DB_BACKUP_CONFIG config;
+ u_int32_t value;
+{
+ DB_BACKUP *backup;
+ int ret;
+
+ if ((ret = __env_backup_alloc(dbenv)) != 0)
+ return (ret);
+
+ backup = dbenv->env->backup_handle;
+ switch (config) {
+ case DB_BACKUP_WRITE_DIRECT:
+ if (value == 0)
+ F_CLR(backup, BACKUP_WRITE_DIRECT);
+ else
+ F_SET(backup, BACKUP_WRITE_DIRECT);
+ break;
+
+ case DB_BACKUP_READ_COUNT:
+ backup->read_count = value;
+ break;
+
+ case DB_BACKUP_READ_SLEEP:
+ backup->read_sleep = value;
+ break;
+
+ case DB_BACKUP_SIZE:
+ backup->size = value;
+ break;
+ }
+
+ return (0);
+}
+
+/*
+ * __env_get_backup_callbacks --
+ *
+ * PUBLIC: int __env_get_backup_callbacks __P((DB_ENV *,
+ * PUBLIC: int (**)(DB_ENV *, const char *, const char *, void **),
+ * PUBLIC: int (**)(DB_ENV *,
+ * PUBLIC: u_int32_t, u_int32_t, u_int32_t, u_int8_t *, void *),
+ * PUBLIC: int (**)(DB_ENV *, const char *, void *)));
+ */
+int
+__env_get_backup_callbacks(dbenv, openp, writep, closep)
+ DB_ENV *dbenv;
+ int (**openp)(DB_ENV *, const char *, const char *, void **);
+ int (**writep)(DB_ENV *,
+ u_int32_t, u_int32_t, u_int32_t, u_int8_t *, void *);
+ int (**closep)(DB_ENV *, const char *, void *);
+{
+ DB_BACKUP *backup;
+
+ backup = dbenv->env->backup_handle;
+ if (backup == NULL)
+ return (EINVAL);
+
+ *openp = backup->open;
+ *writep = backup->write;
+ *closep = backup->close;
+ return (0);
+}
+
+/*
+ * __env_set_backup_callbacks --
+ *
+ * PUBLIC: int __env_set_backup_callbacks __P((DB_ENV *,
+ * PUBLIC: int (*)(DB_ENV *, const char *, const char *, void **),
+ * PUBLIC: int (*)(DB_ENV *,
+ * PUBLIC: u_int32_t, u_int32_t, u_int32_t, u_int8_t *, void *),
+ * PUBLIC: int (*)(DB_ENV *, const char *, void *)));
+ */
+int
+__env_set_backup_callbacks(dbenv, open_func, write_func, close_func)
+ DB_ENV *dbenv;
+ int (*open_func)(DB_ENV *, const char *, const char *, void **);
+ int (*write_func)(DB_ENV *,
+ u_int32_t, u_int32_t, u_int32_t, u_int8_t *, void *);
+ int (*close_func)(DB_ENV *, const char *, void *);
+{
+ DB_BACKUP *backup;
+ int ret;
+
+ if ((ret = __env_backup_alloc(dbenv)) != 0)
+ return (ret);
+
+ backup = dbenv->env->backup_handle;
+ backup->open = open_func;
+ backup->write = write_func;
+ backup->close = close_func;
+ return (0);
+}
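+
+/*
+ * Illustrative application usage (hypothetical callback names): an
+ * application streaming hot-backup data to its own storage might call
+ *
+ *	dbenv->set_backup_callbacks(dbenv,
+ *	    my_backup_open, my_backup_write, my_backup_close);
+ *
+ * where my_backup_open creates a destination handle per file,
+ * my_backup_write receives the file offset (split into two u_int32_t
+ * halves), a byte count and the buffer, and my_backup_close releases
+ * the handle.
+ */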
diff --git a/src/env/env_config.c b/src/env/env_config.c
new file mode 100644
index 00000000..57496909
--- /dev/null
+++ b/src/env/env_config.c
@@ -0,0 +1,737 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+#include "dbinc/db_page.h"
+#include "dbinc_auto/db_ext.h"
+
+/*
+ * DB_CONFIG lines are processed primarily by interpreting the command
+ * description tables initialized below.
+ *
+ * Most DB_CONFIG commands consist of a single token name followed by one or two
+ * integer or string arguments. These commands are described by entries in the
+ * config_descs[] array.
+ *
+ * The remaining, usually more complex, DB_CONFIG commands are handled by small
+ * code blocks in __config_parse(). Many of those commands need to translate
+ * option names to the integer values needed by the API configuration functions.
+ * Below the __config_descs[] initialization there are many FN array
+ * initializations which provide the mapping between user-specifiable strings
+ * and internally-used integer values. Typically there is one of these mappings
+ * defined for each complex DB_CONFIG command. Use __db_name_to_val()
+ * to translate a string to its integer value.
+ */
+typedef enum {
+ CFG_INT, /* The argument is 1 signed integer. */
+ CFG_LONG, /* The argument is 1 signed long int. */
+ CFG_UINT, /* The argument is 1 unsigned integer. */
+ CFG_2INT, /* The arguments are 2 signed integers. */
+ CFG_2UINT, /* The arguments are 2 unsigned integers. */
+ CFG_STRING /* The rest of the line is a string. */
+} __db_config_type;
+
+typedef struct __db_config_desc {
+ char *name; /* The name of a simple DB_CONFIG command. */
+ __db_config_type type; /* The enum describing its argument type(s). */
+ int (*func)(); /* The function to call with the argument(s). */
+} CFG_DESC;
+
+/* These typedefs help eliminate lint warnings where "func" above is used. */
+typedef int (*CFG_FUNC_STRING) __P((DB_ENV *, const char *));
+typedef int (*CFG_FUNC_INT) __P((DB_ENV *, int));
+typedef int (*CFG_FUNC_LONG) __P((DB_ENV *, long));
+typedef int (*CFG_FUNC_UINT) __P((DB_ENV *, u_int32_t));
+typedef int (*CFG_FUNC_2INT) __P((DB_ENV *, int, int));
+typedef int (*CFG_FUNC_2UINT) __P((DB_ENV *, u_int32_t, u_int32_t));
+
+/*
+ * This table lists the simple DB_CONFIG configuration commands. It is sorted by
+ * the command name, so that __config_scan() can bsearch() it. After making an
+ * addition to this table, please be sure that it remains sorted. With vi or
+ * vim, the following command line will do it:
+ * :/^static const CFG_DESC config_descs/+1, /^}/-1 ! sort
+ *
+ * This table can contain aliases. Aliases have different names with identical
+ * types and functions. At this time there are four aliases:
+ * Outdated Name Current Name
+ * db_data_dir set_data_dir
+ * db_log_dir set_lg_dir
+ * db_tmp_dir set_tmp_dir
+ * set_tas_spins mutex_set_tas_spins
+ */
+static const CFG_DESC config_descs[] = {
+ { "add_data_dir", CFG_STRING, __env_add_data_dir },
+ { "db_data_dir", CFG_STRING, __env_set_data_dir },
+ { "db_log_dir", CFG_STRING, __log_set_lg_dir },
+ { "db_tmp_dir", CFG_STRING, __env_set_tmp_dir },
+ { "mutex_set_align", CFG_UINT, __mutex_set_align },
+ { "mutex_set_increment", CFG_UINT, __mutex_set_increment },
+ { "mutex_set_init", CFG_UINT, __mutex_set_init },
+ { "mutex_set_max", CFG_UINT, __mutex_set_max },
+ { "mutex_set_tas_spins", CFG_UINT, __mutex_set_tas_spins },
+ { "rep_set_clockskew", CFG_2UINT, __rep_set_clockskew },
+ { "rep_set_limit", CFG_2UINT, __rep_set_limit },
+ { "rep_set_nsites", CFG_UINT, __rep_set_nsites_pp },
+ { "rep_set_priority", CFG_UINT, __rep_set_priority },
+ { "rep_set_request", CFG_2UINT, __rep_set_request },
+ { "set_cache_max", CFG_2UINT, __memp_set_cache_max },
+ { "set_create_dir", CFG_STRING, __env_set_create_dir },
+ { "set_data_dir", CFG_STRING, __env_set_data_dir },
+ { "set_data_len", CFG_UINT, __env_set_data_len },
+ { "set_intermediate_dir_mode",CFG_STRING, __env_set_intermediate_dir_mode },
+ { "set_lg_bsize", CFG_UINT, __log_set_lg_bsize },
+ { "set_lg_dir", CFG_STRING, __log_set_lg_dir },
+ { "set_lg_filemode", CFG_INT, __log_set_lg_filemode },
+ { "set_lg_max", CFG_UINT, __log_set_lg_max },
+ { "set_lg_regionmax", CFG_UINT, __log_set_lg_regionmax },
+ { "set_lk_max_lockers", CFG_UINT, __lock_set_lk_max_lockers },
+ { "set_lk_max_locks", CFG_UINT, __lock_set_lk_max_locks },
+ { "set_lk_max_objects", CFG_UINT, __lock_set_lk_max_objects },
+ { "set_lk_partitions", CFG_UINT, __lock_set_lk_partitions },
+ { "set_lk_tablesize", CFG_UINT, __lock_set_lk_tablesize },
+ { "set_memory_max", CFG_2UINT, __env_set_memory_max },
+ { "set_metadata_dir", CFG_STRING, __env_set_metadata_dir },
+ { "set_mp_max_openfd", CFG_INT, __memp_set_mp_max_openfd },
+ { "set_mp_max_write", CFG_2INT, __memp_set_mp_max_write },
+ { "set_mp_mmapsize", CFG_UINT, __memp_set_mp_mmapsize },
+ { "set_mp_mtxcount", CFG_UINT, __memp_set_mp_mtxcount },
+ { "set_mp_pagesize", CFG_UINT, __memp_set_mp_pagesize },
+ { "set_shm_key", CFG_LONG, __env_set_shm_key },
+ { "set_tas_spins", CFG_UINT, __mutex_set_tas_spins },
+ { "set_thread_count", CFG_UINT, __env_set_thread_count },
+ { "set_tmp_dir", CFG_STRING, __env_set_tmp_dir },
+ { "set_tx_max", CFG_UINT, __txn_set_tx_max }
+};
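+
+/*
+ * For example (illustrative), these DB_CONFIG lines would be handled
+ * through the table above:
+ *
+ *	set_lg_dir /var/dbenv/logs	CFG_STRING -> __log_set_lg_dir
+ *	set_tx_max 500			CFG_UINT   -> __txn_set_tx_max
+ *	set_cache_max 1 0		CFG_2UINT  -> __memp_set_cache_max
+ */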
+
+/*
+ * Here are the option-name to option-value mappings used by complex commands.
+ */
+static const FN config_mem_init[] = {
+ { (u_int32_t) DB_MEM_LOCK, "DB_MEM_LOCK" },
+ { (u_int32_t) DB_MEM_LOCKER, "DB_MEM_LOCKER" },
+ { (u_int32_t) DB_MEM_LOCKOBJECT, "DB_MEM_LOCKOBJECT" },
+ { (u_int32_t) DB_MEM_TRANSACTION, "DB_MEM_TRANSACTION" },
+ { (u_int32_t) DB_MEM_THREAD, "DB_MEM_THREAD" },
+ { (u_int32_t) DB_MEM_LOGID, "DB_MEM_LOGID" },
+ { 0, NULL }
+};
+
+static const FN config_rep_config[] = {
+ { DB_REP_CONF_AUTOINIT, "db_rep_conf_autoinit" },
+ { DB_REP_CONF_AUTOROLLBACK, "db_rep_conf_autorollback" },
+ { DB_REP_CONF_BULK, "db_rep_conf_bulk" },
+ { DB_REP_CONF_DELAYCLIENT, "db_rep_conf_delayclient" },
+ { DB_REP_CONF_INMEM, "db_rep_conf_inmem" },
+ { DB_REP_CONF_LEASE, "db_rep_conf_lease" },
+ { DB_REP_CONF_NOWAIT, "db_rep_conf_nowait" },
+ { DB_REPMGR_CONF_2SITE_STRICT, "db_repmgr_conf_2site_strict" },
+ { DB_REPMGR_CONF_ELECTIONS, "db_repmgr_conf_elections" },
+ { 0, NULL }
+};
+
+static const FN config_rep_timeout[] = {
+ { DB_REP_ACK_TIMEOUT, "db_rep_ack_timeout" },
+ { DB_REP_CHECKPOINT_DELAY, "db_rep_checkpoint_delay" },
+ { DB_REP_CONNECTION_RETRY, "db_rep_connection_retry" },
+ { DB_REP_ELECTION_TIMEOUT, "db_rep_election_timeout" },
+ { DB_REP_ELECTION_RETRY, "db_rep_election_retry" },
+ { DB_REP_FULL_ELECTION_TIMEOUT, "db_rep_full_election_timeout" },
+ { DB_REP_HEARTBEAT_MONITOR, "db_rep_heartbeat_monitor" },
+ { DB_REP_HEARTBEAT_SEND, "db_rep_heartbeat_send" },
+ { DB_REP_LEASE_TIMEOUT, "db_rep_lease_timeout" },
+ { 0, NULL }
+};
+
+static const FN config_repmgr_ack_policy[] = {
+ { DB_REPMGR_ACKS_ALL, "db_repmgr_acks_all" },
+ { DB_REPMGR_ACKS_ALL_AVAILABLE, "db_repmgr_acks_all_available" },
+ { DB_REPMGR_ACKS_ALL_PEERS, "db_repmgr_acks_all_peers" },
+ { DB_REPMGR_ACKS_NONE, "db_repmgr_acks_none" },
+ { DB_REPMGR_ACKS_ONE, "db_repmgr_acks_one" },
+ { DB_REPMGR_ACKS_ONE_PEER, "db_repmgr_acks_one_peer" },
+ { DB_REPMGR_ACKS_QUORUM, "db_repmgr_acks_quorum" },
+ { 0, NULL }
+};
+
+static const FN config_repmgr_site[] = {
+ { DB_BOOTSTRAP_HELPER, "db_bootstrap_helper" },
+ { DB_GROUP_CREATOR, "db_group_creator" },
+ { DB_LEGACY, "db_legacy" },
+ { DB_LOCAL_SITE, "db_local_site" },
+ { DB_REPMGR_PEER, "db_repmgr_peer" },
+ { 0, NULL }
+};
+
+static const FN config_set_flags[] = {
+ { DB_AUTO_COMMIT, "db_auto_commit" },
+ { DB_CDB_ALLDB, "db_cdb_alldb" },
+ { DB_DIRECT_DB, "db_direct_db" },
+ { DB_DSYNC_DB, "db_dsync_db" },
+ { DB_MULTIVERSION, "db_multiversion" },
+ { DB_NOLOCKING, "db_nolocking" },
+ { DB_NOMMAP, "db_nommap" },
+ { DB_NOPANIC, "db_nopanic" },
+ { DB_OVERWRITE, "db_overwrite" },
+ { DB_REGION_INIT, "db_region_init" },
+ { DB_TIME_NOTGRANTED, "db_time_notgranted" },
+ { DB_TXN_NOSYNC, "db_txn_nosync" },
+ { DB_TXN_NOWAIT, "db_txn_nowait" },
+ { DB_TXN_SNAPSHOT, "db_txn_snapshot" },
+ { DB_TXN_WRITE_NOSYNC, "db_txn_write_nosync" },
+ { DB_YIELDCPU, "db_yieldcpu" },
+ { 0, NULL }
+};
+
+static const FN config_set_flags_forlog[] = {
+ { DB_LOG_DIRECT, "db_direct_log" },
+ { DB_LOG_DSYNC, "db_dsync_log" },
+ { DB_LOG_AUTO_REMOVE, "db_log_autoremove" },
+ { DB_LOG_IN_MEMORY, "db_log_inmemory" },
+ { 0, NULL }
+};
+
+static const FN config_log_set_config[] = {
+ { DB_LOG_DIRECT, "db_log_direct" },
+ { DB_LOG_DSYNC, "db_log_dsync" },
+ { DB_LOG_AUTO_REMOVE, "db_log_auto_remove" },
+ { DB_LOG_IN_MEMORY, "db_log_in_memory" },
+ { DB_LOG_ZERO, "db_log_zero" },
+ { 0, NULL }
+};
+
+static const FN config_set_lk_detect[] = {
+ { DB_LOCK_DEFAULT, "db_lock_default" },
+ { DB_LOCK_EXPIRE, "db_lock_expire" },
+ { DB_LOCK_MAXLOCKS, "db_lock_maxlocks" },
+ { DB_LOCK_MAXWRITE, "db_lock_maxwrite" },
+ { DB_LOCK_MINLOCKS, "db_lock_minlocks" },
+ { DB_LOCK_MINWRITE, "db_lock_minwrite" },
+ { DB_LOCK_OLDEST, "db_lock_oldest" },
+ { DB_LOCK_RANDOM, "db_lock_random" },
+ { DB_LOCK_YOUNGEST, "db_lock_youngest" },
+ { 0, NULL }
+};
+
+static const FN config_set_open_flags[] = {
+ { DB_INIT_REP, "db_init_rep" },
+ { DB_PRIVATE, "db_private" },
+ { DB_REGISTER, "db_register" },
+ { DB_THREAD, "db_thread" },
+ { 0, NULL }
+};
+
+static const FN config_set_verbose[] = {
+ { DB_VERB_BACKUP, "db_verb_backup" },
+ { DB_VERB_DEADLOCK, "db_verb_deadlock" },
+ { DB_VERB_FILEOPS, "db_verb_fileops" },
+ { DB_VERB_FILEOPS_ALL, "db_verb_fileops_all" },
+ { DB_VERB_RECOVERY, "db_verb_recovery" },
+ { DB_VERB_REGISTER, "db_verb_register" },
+ { DB_VERB_REPLICATION, "db_verb_replication" },
+ { DB_VERB_REP_ELECT, "db_verb_rep_elect" },
+ { DB_VERB_REP_LEASE, "db_verb_rep_lease" },
+ { DB_VERB_REP_MISC, "db_verb_rep_misc" },
+ { DB_VERB_REP_MSGS, "db_verb_rep_msgs" },
+ { DB_VERB_REP_SYNC, "db_verb_rep_sync" },
+ { DB_VERB_REP_SYSTEM, "db_verb_rep_system" },
+ { DB_VERB_REP_TEST, "db_verb_rep_test" },
+ { DB_VERB_REPMGR_CONNFAIL, "db_verb_repmgr_connfail" },
+ { DB_VERB_REPMGR_MISC, "db_verb_repmgr_misc" },
+ { DB_VERB_WAITSFOR, "db_verb_waitsfor" },
+ { 0, NULL}
+};
+
+static int __config_parse __P((ENV *, char *, int));
+static int __config_scan __P((char *, char **, const CFG_DESC **));
+static int cmp_cfg_name __P((const void *, const void *element));
+
+/*
+ * __env_read_db_config --
+ * Read the DB_CONFIG file.
+ *
+ * PUBLIC: int __env_read_db_config __P((ENV *));
+ */
+int
+__env_read_db_config(env)
+ ENV *env;
+{
+ FILE *fp;
+ int lc, ret;
+ char *p, buf[256];
+
+ /* Parse the config file. */
+ p = NULL;
+ if ((ret = __db_appname(env,
+ DB_APP_NONE, "DB_CONFIG", NULL, &p)) != 0)
+ return (ret);
+ if (p == NULL)
+ fp = NULL;
+ else {
+ fp = fopen(p, "r");
+ __os_free(env, p);
+ }
+
+ if (fp == NULL)
+ return (0);
+
+ for (lc = 1; fgets(buf, sizeof(buf), fp) != NULL; ++lc) {
+ if ((p = strchr(buf, '\n')) == NULL)
+ p = buf + strlen(buf);
+ if (p > buf && p[-1] == '\r')
+ --p;
+ *p = '\0';
+ for (p = buf; *p != '\0' && isspace((int)*p); ++p)
+ ;
+ if (*p == '\0' || *p == '#')
+ continue;
+
+ if ((ret = __config_parse(env, p, lc)) != 0)
+ break;
+ }
+ (void)fclose(fp);
+
+ return (ret);
+}
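+
+/*
+ * For illustration only (not part of the library source): the DB_CONFIG
+ * file read above is plain text with one name-value pair per line; blank
+ * lines and lines starting with '#' are skipped.  A hypothetical file:
+ *
+ *	# 0GB + 1MB of cache in a single region
+ *	set_cachesize 0 1048576 1
+ *	set_lg_dir logs
+ *	set_flags db_txn_nosync on
+ */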
+
+#undef CFG_GET_INT
+#define CFG_GET_INT(s, vp) do { \
+ int __ret; \
+ if ((__ret = \
+ __db_getlong(env->dbenv, NULL, s, 0, INT_MAX, vp)) != 0) \
+ return (__ret); \
+} while (0)
+#undef CFG_GET_LONG
+#define CFG_GET_LONG(s, vp) do { \
+ int __ret; \
+ if ((__ret = \
+ __db_getlong(env->dbenv, NULL, s, 0, LONG_MAX, vp)) != 0) \
+ return (__ret); \
+} while (0)
+#undef CFG_GET_UINT
+#define CFG_GET_UINT(s, vp) do { \
+ int __ret; \
+ if ((__ret = \
+ __db_getulong(env->dbenv, NULL, s, 0, UINT_MAX, vp)) != 0) \
+ return (__ret); \
+} while (0)
+#undef CFG_GET_UINT32
+#define CFG_GET_UINT32(s, vp) do { \
+ if (__db_getulong(env->dbenv, NULL, s, 0, UINT32_MAX, vp) != 0) \
+ return (EINVAL); \
+} while (0)
+
+/* This is the maximum number of tokens in a DB_CONFIG line. */
+#undef CFG_SLOTS
+#define CFG_SLOTS 10
+
+/*
+ * __config_parse --
+ * Parse a single NAME VALUE pair.
+ */
+static int
+__config_parse(env, s, lc)
+ ENV *env;
+ char *s;
+ int lc;
+{
+ DB_ENV *dbenv;
+ DB_SITE *site;
+ u_long uv1, uv2;
+ long lv1, lv2;
+ u_int port;
+ int i, nf, onoff, bad, ret, t_ret;
+ char *argv[CFG_SLOTS];
+ const CFG_DESC *desc;
+
+ bad = 0;
+ dbenv = env->dbenv;
+
+ /*
+ * Split the input line in 's' into its argv-like components, returning
+ * the number of fields. If the command is one of the "simple" ones in
+ * config_descs, also return its command descriptor.
+ */
+ if ((nf = __config_scan(s, argv, &desc)) < 2) {
+format: __db_errx(env, DB_STR_A("1584",
+ "line %d: %s: incorrect name-value pair", "%d %s"),
+ lc, argv[0]);
+ return (EINVAL);
+ }
+
+ /* Handle simple configuration lines here. */
+ if (desc != NULL) {
+ ret = 0;
+ switch (desc->type) {
+ case CFG_INT: /* <command> <int> */
+ if (nf != 2)
+ goto format;
+ CFG_GET_INT(argv[1], &lv1);
+ ret = ((CFG_FUNC_INT)desc->func)(dbenv, (int) lv1);
+ break;
+
+ case CFG_LONG: /* <command> <long int> */
+ if (nf != 2)
+ goto format;
+ CFG_GET_LONG(argv[1], &lv1);
+ ret = ((CFG_FUNC_LONG)desc->func)(dbenv, lv1);
+ break;
+
+ case CFG_UINT: /* <command> <uint> */
+ if (nf != 2)
+ goto format;
+ CFG_GET_UINT(argv[1], &uv1);
+ ret = ((CFG_FUNC_UINT)desc->func)
+ (dbenv, (u_int32_t) uv1);
+ break;
+
+ case CFG_2INT: /* <command> <int1> <int2> */
+ if (nf != 3)
+ goto format;
+ CFG_GET_INT(argv[1], &lv1);
+ CFG_GET_INT(argv[2], &lv2);
+ ret = ((CFG_FUNC_2INT)desc->func)
+ (dbenv, (int) lv1, (int) lv2);
+ break;
+
+ case CFG_2UINT: /* <command> <uint1> <uint2> */
+ if (nf != 3)
+ goto format;
+ CFG_GET_UINT(argv[1], &uv1);
+ CFG_GET_UINT(argv[2], &uv2);
+ ret = ((CFG_FUNC_2UINT)desc->func)
+ (dbenv, (u_int32_t) uv1, (u_int32_t) uv2);
+ break;
+
+ case CFG_STRING: /* <command> <rest of line as string> */
+ ret = ((CFG_FUNC_STRING) desc->func)(dbenv, argv[1]);
+ break;
+ }
+ return (ret);
+ }
+
+ /*
+ * The commands not covered in config_descs are handled below, each
+	 * with its own command-specific block of code. Most of them are
+	 * fairly similar to each other, but not quite similar enough to
+	 * warrant making them all table-driven too.
+ */
+
+ /* set_memory_init db_mem_XXX <unsigned> */
+ if (strcasecmp(argv[0], "set_memory_init") == 0) {
+ if (nf != 3)
+ goto format;
+ if ((lv1 = __db_name_to_val(config_mem_init, argv[1])) == -1)
+ goto format;
+ CFG_GET_UINT32(argv[2], &uv2);
+ return (__env_set_memory_init(dbenv,
+ (DB_MEM_CONFIG) lv1, (u_int32_t)uv2));
+ }
+
+ /* rep_set_config { db_rep_conf_XXX | db_repmgr_conf_XXX } [on|off] */
+ if (strcasecmp(argv[0], "rep_set_config") == 0) {
+ if (nf != 2 && nf != 3)
+ goto format;
+ onoff = 1;
+ if (nf == 3) {
+ if (strcasecmp(argv[2], "off") == 0)
+ onoff = 0;
+ else if (strcasecmp(argv[2], "on") != 0)
+ goto format;
+ }
+ if ((lv1 = __db_name_to_val(config_rep_config, argv[1])) == -1)
+ goto format;
+ return (__rep_set_config(dbenv, (u_int32_t)lv1, onoff));
+ }
+
+ /* rep_set_timeout db_rep_XXX <unsigned> */
+ if (strcasecmp(argv[0], "rep_set_timeout") == 0) {
+ if (nf != 3)
+ goto format;
+ if ((lv1 = __db_name_to_val(config_rep_timeout, argv[1])) == -1)
+ goto format;
+ CFG_GET_UINT32(argv[2], &uv2);
+ return (__rep_set_timeout(dbenv, lv1, (db_timeout_t)uv2));
+ }
+
+ /* repmgr_set_ack_policy db_repmgr_acks_XXX */
+ if (strcasecmp(argv[0], "repmgr_set_ack_policy") == 0) {
+ if (nf != 2)
+ goto format;
+ if ((lv1 =
+ __db_name_to_val(config_repmgr_ack_policy, argv[1])) == -1)
+ goto format;
+ return (__repmgr_set_ack_policy(dbenv, lv1));
+ }
+
+ /*
+ * Configure name/value pairs of config information for a site (local or
+ * remote).
+ *
+ * repmgr_site host port [which value (on | off | unsigned)] ...
+ */
+ if (strcasecmp(argv[0], "repmgr_site") == 0) {
+ if (nf < 3 || (nf % 2) == 0)
+ goto format;
+ CFG_GET_UINT(argv[2], &uv2);
+ port = (u_int)uv2;
+
+ if ((ret = __repmgr_site(dbenv, argv[1], port, &site, 0)) != 0)
+ return (ret);
+#ifdef HAVE_REPLICATION_THREADS
+ for (i = 3; i < nf; i += 2) {
+ if ((lv1 = __db_name_to_val(
+ config_repmgr_site, argv[i])) == -1) {
+ bad = 1;
+ break;
+ }
+
+ if (strcasecmp(argv[i + 1], "on") == 0)
+ uv2 = 1;
+ else if (strcasecmp(argv[i + 1], "off") == 0)
+ uv2 = 0;
+ else
+ CFG_GET_UINT32(argv[i + 1], &uv2);
+ if ((ret = __repmgr_site_config(site,
+ (u_int32_t)lv1, (u_int32_t)uv2)) != 0)
+ break;
+ }
+ if ((t_ret = __repmgr_site_close(site)) != 0 && ret == 0)
+ ret = t_ret;
+ if (bad)
+ goto format;
+#else
+		/* If repmgr is not built, __repmgr_site() returns DB_OPNOTSUP. */
+ COMPQUIET(i, 0);
+ COMPQUIET(t_ret, 0);
+ DB_ASSERT(env, 0);
+#endif
+ return (ret);
+ }
+
+ /* set_cachesize <unsigned gbytes> <unsigned bytes> <int ncaches> */
+ if (strcasecmp(argv[0], "set_cachesize") == 0) {
+ if (nf != 4)
+ goto format;
+ CFG_GET_UINT32(argv[1], &uv1);
+ CFG_GET_UINT32(argv[2], &uv2);
+ CFG_GET_INT(argv[3], &lv1);
+ return (__memp_set_cachesize(
+ dbenv, (u_int32_t)uv1, (u_int32_t)uv2, (int)lv1));
+ }
+
+ /* set_intermediate_dir <integer dir permission> */
+ if (strcasecmp(argv[0], "set_intermediate_dir") == 0) {
+ if (nf != 2)
+ goto format;
+ CFG_GET_INT(argv[1], &lv1);
+ if (lv1 <= 0)
+ goto format;
+ env->dir_mode = (int)lv1;
+ return (0);
+ }
+
+ /* set_flags <env or log flag name> [on | off] */
+ if (strcasecmp(argv[0], "set_flags") == 0) {
+ if (nf != 2 && nf != 3)
+ goto format;
+ onoff = 1;
+ if (nf == 3) {
+ if (strcasecmp(argv[2], "off") == 0)
+ onoff = 0;
+ else if (strcasecmp(argv[2], "on") != 0)
+ goto format;
+ }
+ /* First see whether it is an env flag, then a log flag. */
+ if ((lv1 = __db_name_to_val(config_set_flags, argv[1])) != -1)
+ return (__env_set_flags(dbenv, (u_int32_t)lv1, onoff));
+ else if ((lv1 =
+ __db_name_to_val(config_set_flags_forlog, argv[1])) != -1)
+ return (__log_set_config(dbenv, (u_int32_t)lv1, onoff));
+ goto format;
+ }
+
+ /* log_set_config <log flag name> [on | off] */
+ if (strcasecmp(argv[0], "log_set_config") == 0) {
+ if (nf != 2 && nf != 3)
+ goto format;
+ onoff = 1;
+ if (nf == 3) {
+ if (strcasecmp(argv[2], "off") == 0)
+ onoff = 0;
+ else if (strcasecmp(argv[2], "on") != 0)
+ goto format;
+ }
+ if ((lv1 =
+ __db_name_to_val(config_log_set_config, argv[1])) == -1)
+ goto format;
+ return (__log_set_config(dbenv, (u_int32_t)lv1, onoff));
+ }
+
+ /* set_lk_detect db_lock_xxx */
+ if (strcasecmp(argv[0], "set_lk_detect") == 0) {
+ if (nf != 2)
+ goto format;
+ if ((lv1 =
+ __db_name_to_val(config_set_lk_detect, argv[1])) == -1)
+ goto format;
+ return (__lock_set_lk_detect(dbenv, (u_int32_t)lv1));
+ }
+
+ /* set_lock_timeout <unsigned lock timeout> */
+ if (strcasecmp(argv[0], "set_lock_timeout") == 0) {
+ if (nf != 2)
+ goto format;
+ CFG_GET_UINT32(argv[1], &uv1);
+ return (__lock_set_env_timeout(
+ dbenv, (u_int32_t)uv1, DB_SET_LOCK_TIMEOUT));
+ }
+
+ /* set_open_flags <env open flag name> [on | off] */
+ if (strcasecmp(argv[0], "set_open_flags") == 0) {
+ if (nf != 2 && nf != 3)
+ goto format;
+ onoff = 1;
+ if (nf == 3) {
+ if (strcasecmp(argv[2], "off") == 0)
+ onoff = 0;
+ else if (strcasecmp(argv[2], "on") != 0)
+ goto format;
+ }
+ if ((lv1 =
+ __db_name_to_val(config_set_open_flags, argv[1])) == -1)
+ goto format;
+ if (onoff == 1)
+ FLD_SET(env->open_flags, (u_int32_t)lv1);
+ else
+ FLD_CLR(env->open_flags, (u_int32_t)lv1);
+ return (0);
+ }
+
+ /* set_region_init <0 or 1> */
+ if (strcasecmp(argv[0], "set_region_init") == 0) {
+ if (nf != 2)
+ goto format;
+ CFG_GET_INT(argv[1], &lv1);
+ if (lv1 != 0 && lv1 != 1)
+ goto format;
+ return (__env_set_flags(
+ dbenv, DB_REGION_INIT, lv1 == 0 ? 0 : 1));
+ }
+
+ /* set_reg_timeout <unsigned timeout> */
+ if (strcasecmp(argv[0], "set_reg_timeout") == 0) {
+ if (nf != 2)
+ goto format;
+ CFG_GET_UINT32(argv[1], &uv1);
+ return (__env_set_timeout(
+ dbenv, (u_int32_t)uv1, DB_SET_REG_TIMEOUT));
+ }
+
+ /* set_txn_timeout <unsigned timeout> */
+ if (strcasecmp(argv[0], "set_txn_timeout") == 0) {
+ if (nf != 2)
+ goto format;
+ CFG_GET_UINT32(argv[1], &uv1);
+ return (__lock_set_env_timeout(
+ dbenv, (u_int32_t)uv1, DB_SET_TXN_TIMEOUT));
+ }
+
+ /* set_verbose db_verb_XXX [on | off] */
+ if (strcasecmp(argv[0], "set_verbose") == 0) {
+ if (nf != 2 && nf != 3)
+ goto format;
+ onoff = 1;
+ if (nf == 3) {
+ if (strcasecmp(argv[2], "off") == 0)
+ onoff = 0;
+ else if (strcasecmp(argv[2], "on") != 0)
+ goto format;
+ }
+ if ((lv1 = __db_name_to_val(config_set_verbose, argv[1])) == -1)
+ goto format;
+ return (__env_set_verbose(dbenv, (u_int32_t)lv1, onoff));
+ }
+
+ __db_errx(env,
+ DB_STR_A("1585", "unrecognized name-value pair: %s", "%s"), s);
+ return (EINVAL);
+}
+
+/*
+ * cmp_cfg_name --
+ *	Bsearch comparison function for CFG_DESC.name, for looking up
+ *	the names of simple commands.
+ */
+static int
+cmp_cfg_name(sought, element)
+ const void *sought;
+ const void *element;
+{
+ return
+ (strcmp((const char *) sought, ((const CFG_DESC *) element)->name));
+}
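+
+/*
+ * Since the lookup uses bsearch(), config_descs must remain sorted by
+ * name.  For illustration only, a hypothetical lookup mirroring the call
+ * in __config_scan():
+ *
+ *	const CFG_DESC *d = bsearch("set_lg_dir", config_descs,
+ *	    sizeof(config_descs) / sizeof(config_descs[0]),
+ *	    sizeof(config_descs[0]), cmp_cfg_name);
+ *
+ * Here d is non-NULL and d->type is CFG_STRING, so __config_scan()
+ * treats the rest of the line as a single string argument.
+ */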
+
+/*
+ * __config_scan --
+ *	Split DB_CONFIG lines into fields. Usually each whitespace-separated
+ *	field is scanned as a distinct argument. However, if the command is
+ *	recognized as one taking a single string value, then the rest of the
+ *	line is returned as that one argument. This supports strings which
+ *	contain whitespace, such as some directory paths.
+ *
+ * This returns the number of fields. It sets *descptr to the command
+ * descriptor (if it is recognized), or NULL.
+ */
+static int
+__config_scan(input, argv, descptr)
+ char *input, *argv[CFG_SLOTS];
+ const CFG_DESC **descptr;
+{
+ size_t tablecount;
+ int count;
+ char **ap;
+
+ tablecount = sizeof(config_descs) / sizeof(config_descs[0]);
+ *descptr = NULL;
+ for (count = 0, ap = argv; (*ap = strsep(&input, " \t\n")) != NULL;) {
+		/* Empty tokens come from adjacent whitespace; skip them. */
+ if (**ap == '\0')
+ continue;
+ /* Accept a non-empty token as the next field. */
+ count++;
+ ap++;
+ /*
+ * If that was the first token, look it up in the simple command
+ * table. If it is there and takes a single string value, then
+ * return the remainder of the line (after skipping over any
+		 * leading whitespace) without splitting it further.
+ */
+ if (count == 1) {
+ *descptr = bsearch(argv[0], config_descs,
+ tablecount, sizeof(config_descs[0]), cmp_cfg_name);
+ if (*descptr != NULL &&
+ (*descptr)->type == CFG_STRING) {
+ count++;
+ while (isspace(*input))
+ input++;
+ *ap++ = input;
+ break;
+ }
+ }
+ /* Stop scanning if the line has too many tokens. */
+ if (count >= CFG_SLOTS)
+ break;
+ }
+ return (count);
+}
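+
+/*
+ * For illustration only, two hypothetical inputs to __config_scan():
+ *
+ *	"set_cachesize 0 1048576 1" returns 4, with argv holding
+ *	{"set_cachesize", "0", "1048576", "1"} and *descptr NULL,
+ *	as set_cachesize is not one of the simple config_descs commands;
+ *
+ *	"set_lg_dir log dir" returns 2, with argv holding
+ *	{"set_lg_dir", "log dir"} and *descptr pointing at the CFG_STRING
+ *	descriptor, the rest of the line kept as one argument.
+ */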
diff --git a/src/env/env_failchk.c b/src/env/env_failchk.c
new file mode 100644
index 00000000..05752f07
--- /dev/null
+++ b/src/env/env_failchk.c
@@ -0,0 +1,558 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#ifndef HAVE_SIMPLE_THREAD_TYPE
+#include "dbinc/db_page.h"
+#include "dbinc/hash.h" /* Needed for call to __ham_func5. */
+#endif
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+static int __env_in_api __P((ENV *));
+static void __env_clear_state __P((ENV *));
+
+/*
+ * __env_failchk_pp --
+ * ENV->failchk pre/post processing.
+ *
+ * PUBLIC: int __env_failchk_pp __P((DB_ENV *, u_int32_t));
+ */
+int
+__env_failchk_pp(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_BEFORE_OPEN(env, "DB_ENV->failchk");
+
+ /*
+ * ENV->failchk requires self and is-alive functions. We
+ * have a default self function, but no is-alive function.
+ */
+ if (!ALIVE_ON(env)) {
+ __db_errx(env, DB_STR("1503",
+ "DB_ENV->failchk requires DB_ENV->is_alive be configured"));
+ return (EINVAL);
+ }
+
+ if (flags != 0)
+ return (__db_ferr(env, "DB_ENV->failchk", 0));
+
+ ENV_ENTER(env, ip);
+ FAILCHK_THREAD(env, ip); /* mark as failchk thread */
+ ret = __env_failchk_int(dbenv);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __env_failchk_int --
+ *	Process the subsystem failchk routines.
+ *
+ * PUBLIC: int __env_failchk_int __P((DB_ENV *));
+ */
+int
+__env_failchk_int(dbenv)
+ DB_ENV *dbenv;
+{
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+ F_SET(dbenv, DB_ENV_FAILCHK);
+
+ /*
+ * We check for dead threads in the API first as this would be likely
+ * to hang other things we try later, like locks and transactions.
+ */
+ if ((ret = __env_in_api(env)) != 0)
+ goto err;
+
+ if (LOCKING_ON(env) && (ret = __lock_failchk(env)) != 0)
+ goto err;
+
+ if (TXN_ON(env) &&
+ ((ret = __txn_failchk(env)) != 0 ||
+ (ret = __dbreg_failchk(env)) != 0))
+ goto err;
+
+ if ((ret = __memp_failchk(env)) != 0)
+ goto err;
+
+#ifdef HAVE_REPLICATION_THREADS
+ if (REP_ON(env) && (ret = __repmgr_failchk(env)) != 0)
+ goto err;
+#endif
+
+ /* Mark any dead blocked threads as dead. */
+ __env_clear_state(env);
+
+#ifdef HAVE_MUTEX_SUPPORT
+ ret = __mut_failchk(env);
+#endif
+
+err: F_CLR(dbenv, DB_ENV_FAILCHK);
+ return (ret);
+}
+
+/*
+ * __env_thread_size --
+ * Initial amount of memory for thread info blocks.
+ * PUBLIC: size_t __env_thread_size __P((ENV *, size_t));
+ */
+size_t
+__env_thread_size(env, other_alloc)
+ ENV *env;
+ size_t other_alloc;
+{
+ DB_ENV *dbenv;
+ size_t size;
+ u_int32_t max;
+
+ dbenv = env->dbenv;
+ size = 0;
+
+ max = dbenv->thr_max;
+ if (dbenv->thr_init != 0) {
+ size =
+ dbenv->thr_init * __env_alloc_size(sizeof(DB_THREAD_INFO));
+ if (max < dbenv->thr_init)
+ max = dbenv->thr_init;
+ } else if (max == 0 && ALIVE_ON(env)) {
+ if ((max = dbenv->tx_init) == 0) {
+ /*
+ * They want thread tracking, but don't say how much.
+ * Arbitrarily assume 1/10 of the remaining memory
+ * or at least 100. We just use this to size
+ * the hash table.
+ */
+ if (dbenv->memory_max != 0)
+ max = (u_int32_t)
+ (((dbenv->memory_max - other_alloc) / 10) /
+ sizeof(DB_THREAD_INFO));
+ if (max < 100)
+ max = 100;
+ }
+ }
+ /*
+ * Set the number of buckets to be 1/8th the number of
+ * thread control blocks. This is rather arbitrary.
+ */
+ dbenv->thr_max = max;
+ if (max != 0)
+ size += __env_alloc_size(sizeof(DB_HASHTAB) *
+ __db_tablesize(max / 8));
+ return (size);
+}
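+
+/*
+ * A worked example of the sizing above, with hypothetical numbers: if
+ * thr_init, thr_max and tx_init are all 0, is_alive is configured,
+ * memory_max is 10MB and other_alloc is 0, then max becomes
+ * (10MB / 10) / sizeof(DB_THREAD_INFO), raised to 100 if smaller, and
+ * the hash table is sized with __db_tablesize(max / 8) buckets.
+ */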
+
+/*
+ * __env_thread_max --
+ * Return the amount of extra memory to hold thread information.
+ * PUBLIC: size_t __env_thread_max __P((ENV *));
+ */
+size_t
+__env_thread_max(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+ size_t size;
+
+ dbenv = env->dbenv;
+
+ /*
+ * Allocate space for thread info blocks. Max is only advisory,
+ * so we allocate 25% more.
+ */
+ if (dbenv->thr_max > dbenv->thr_init) {
+ size = dbenv->thr_max - dbenv->thr_init;
+ size += size / 4;
+ } else {
+ dbenv->thr_max = dbenv->thr_init;
+ size = dbenv->thr_init / 4;
+ }
+
+ size = size * __env_alloc_size(sizeof(DB_THREAD_INFO));
+ return (size);
+}
+
+/*
+ * __env_thread_init --
+ * Initialize the thread control block table.
+ *
+ * PUBLIC: int __env_thread_init __P((ENV *, int));
+ */
+int
+__env_thread_init(env, during_creation)
+ ENV *env;
+ int during_creation;
+{
+ DB_ENV *dbenv;
+ DB_HASHTAB *htab;
+ REGENV *renv;
+ REGINFO *infop;
+ THREAD_INFO *thread;
+ int ret;
+
+ dbenv = env->dbenv;
+ infop = env->reginfo;
+ renv = infop->primary;
+
+ if (renv->thread_off == INVALID_ROFF) {
+ if (dbenv->thr_max == 0) {
+ env->thr_hashtab = NULL;
+ if (ALIVE_ON(env)) {
+ __db_errx(env, DB_STR("1504",
+ "is_alive method specified but no thread region allocated"));
+ return (EINVAL);
+ }
+ return (0);
+ }
+
+ if (!during_creation) {
+ __db_errx(env, DB_STR("1505",
+"thread table must be allocated when the database environment is created"));
+ return (EINVAL);
+ }
+
+ if ((ret =
+ __env_alloc(infop, sizeof(THREAD_INFO), &thread)) != 0) {
+ __db_err(env, ret, DB_STR("1506",
+ "unable to allocate a thread status block"));
+ return (ret);
+ }
+ memset(thread, 0, sizeof(*thread));
+ renv->thread_off = R_OFFSET(infop, thread);
+ thread->thr_nbucket = __db_tablesize(dbenv->thr_max / 8);
+ if ((ret = __env_alloc(infop,
+ thread->thr_nbucket * sizeof(DB_HASHTAB), &htab)) != 0)
+ return (ret);
+ thread->thr_hashoff = R_OFFSET(infop, htab);
+ __db_hashinit(htab, thread->thr_nbucket);
+ thread->thr_max = dbenv->thr_max;
+ thread->thr_init = dbenv->thr_init;
+ } else {
+ thread = R_ADDR(infop, renv->thread_off);
+ htab = R_ADDR(infop, thread->thr_hashoff);
+ }
+
+ env->thr_hashtab = htab;
+ env->thr_nbucket = thread->thr_nbucket;
+ dbenv->thr_max = thread->thr_max;
+ dbenv->thr_init = thread->thr_init;
+ return (0);
+}
+
+/*
+ * __env_thread_destroy --
+ * Destroy the thread control block table.
+ *
+ * PUBLIC: void __env_thread_destroy __P((ENV *));
+ */
+void
+__env_thread_destroy(env)
+ ENV *env;
+{
+ DB_HASHTAB *htab;
+ DB_THREAD_INFO *ip, *np;
+ REGENV *renv;
+ REGINFO *infop;
+ THREAD_INFO *thread;
+ u_int32_t i;
+
+ infop = env->reginfo;
+ renv = infop->primary;
+ if (renv->thread_off == INVALID_ROFF)
+ return;
+
+ thread = R_ADDR(infop, renv->thread_off);
+ if ((htab = env->thr_hashtab) != NULL) {
+ for (i = 0; i < env->thr_nbucket; i++) {
+ ip = SH_TAILQ_FIRST(&htab[i], __db_thread_info);
+ for (; ip != NULL; ip = np) {
+ np = SH_TAILQ_NEXT(ip,
+ dbth_links, __db_thread_info);
+ __env_alloc_free(infop, ip);
+ }
+ }
+ __env_alloc_free(infop, htab);
+ }
+
+ __env_alloc_free(infop, thread);
+ return;
+}
+
+/*
+ * __env_in_api --
+ *	Look for threads which died in the API and complain.
+ *	If no threads died in the API but some died while blocked,
+ *	unpin any buffers they may have locked.
+ */
+static int
+__env_in_api(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+ DB_HASHTAB *htab;
+ DB_THREAD_INFO *ip;
+ REGENV *renv;
+ REGINFO *infop;
+ THREAD_INFO *thread;
+ u_int32_t i;
+ int unpin, ret;
+
+ if ((htab = env->thr_hashtab) == NULL)
+ return (EINVAL);
+
+ dbenv = env->dbenv;
+ infop = env->reginfo;
+ renv = infop->primary;
+ thread = R_ADDR(infop, renv->thread_off);
+ unpin = 0;
+
+ for (i = 0; i < env->thr_nbucket; i++)
+ SH_TAILQ_FOREACH(ip, &htab[i], dbth_links, __db_thread_info) {
+ if (ip->dbth_state == THREAD_SLOT_NOT_IN_USE ||
+ (ip->dbth_state == THREAD_OUT &&
+ thread->thr_count < thread->thr_max))
+ continue;
+ if (dbenv->is_alive(
+ dbenv, ip->dbth_pid, ip->dbth_tid, 0))
+ continue;
+ if (ip->dbth_state == THREAD_BLOCKED) {
+ ip->dbth_state = THREAD_BLOCKED_DEAD;
+ unpin = 1;
+ continue;
+ }
+ if (ip->dbth_state == THREAD_OUT) {
+ ip->dbth_state = THREAD_SLOT_NOT_IN_USE;
+ continue;
+ }
+ return (__db_failed(env, DB_STR("1507",
+ "Thread died in Berkeley DB library"),
+ ip->dbth_pid, ip->dbth_tid));
+ }
+
+ if (unpin == 0)
+ return (0);
+
+ for (i = 0; i < env->thr_nbucket; i++)
+ SH_TAILQ_FOREACH(ip, &htab[i], dbth_links, __db_thread_info)
+ if (ip->dbth_state == THREAD_BLOCKED_DEAD &&
+ (ret = __memp_unpin_buffers(env, ip)) != 0)
+ return (ret);
+
+ return (0);
+}
+
+/*
+ * __env_clear_state --
+ *	Look for threads which died while blocked and clear them.
+ */
+static void
+__env_clear_state(env)
+ ENV *env;
+{
+ DB_HASHTAB *htab;
+ DB_THREAD_INFO *ip;
+ u_int32_t i;
+
+ htab = env->thr_hashtab;
+ for (i = 0; i < env->thr_nbucket; i++)
+ SH_TAILQ_FOREACH(ip, &htab[i], dbth_links, __db_thread_info)
+ if (ip->dbth_state == THREAD_BLOCKED_DEAD)
+ ip->dbth_state = THREAD_SLOT_NOT_IN_USE;
+}
+
+struct __db_threadid {
+ pid_t pid;
+ db_threadid_t tid;
+};
+
+/*
+ * PUBLIC: int __env_set_state __P((ENV *, DB_THREAD_INFO **, DB_THREAD_STATE));
+ */
+int
+__env_set_state(env, ipp, state)
+ ENV *env;
+ DB_THREAD_INFO **ipp;
+ DB_THREAD_STATE state;
+{
+ struct __db_threadid id;
+ DB_ENV *dbenv;
+ DB_HASHTAB *htab;
+ DB_THREAD_INFO *ip;
+ REGENV *renv;
+ REGINFO *infop;
+ THREAD_INFO *thread;
+ u_int32_t indx;
+ int ret;
+
+ dbenv = env->dbenv;
+ htab = env->thr_hashtab;
+
+ if (F_ISSET(dbenv, DB_ENV_NOLOCKING)) {
+ *ipp = NULL;
+ return (0);
+ }
+ dbenv->thread_id(dbenv, &id.pid, &id.tid);
+
+ /*
+ * Hashing of thread ids. This is simple but could be replaced with
+ * something more expensive if needed.
+ */
+#ifdef HAVE_SIMPLE_THREAD_TYPE
+ /*
+	 * A thread ID may be a pointer, so explicitly cast it to an
+	 * integer of the appropriate size before doing the bitwise XOR.
+ */
+ indx = (u_int32_t)((uintptr_t)id.pid ^ (uintptr_t)id.tid);
+#else
+ indx = __ham_func5(NULL, &id.tid, sizeof(id.tid));
+#endif
+ indx %= env->thr_nbucket;
+ SH_TAILQ_FOREACH(ip, &htab[indx], dbth_links, __db_thread_info) {
+#ifdef HAVE_SIMPLE_THREAD_TYPE
+ if (id.pid == ip->dbth_pid && id.tid == ip->dbth_tid)
+ break;
+#else
+ if (memcmp(&id.pid, &ip->dbth_pid, sizeof(id.pid)) != 0)
+ continue;
+#ifdef HAVE_MUTEX_PTHREADS
+ if (pthread_equal(id.tid, ip->dbth_tid) == 0)
+#else
+ if (memcmp(&id.tid, &ip->dbth_tid, sizeof(id.tid)) != 0)
+#endif
+ continue;
+ break;
+#endif
+ }
+
+ /*
+ * If ipp is not null, return the thread control block if found.
+ * Check to ensure the thread of control has been registered.
+ */
+ if (state == THREAD_VERIFY) {
+ DB_ASSERT(env, ip != NULL && ip->dbth_state != THREAD_OUT);
+ if (ipp != NULL) {
+ if (ip == NULL) /* The control block wasn't found */
+ return (EINVAL);
+ *ipp = ip;
+ }
+ return (0);
+ }
+
+ *ipp = NULL;
+ ret = 0;
+ if (ip == NULL) {
+ infop = env->reginfo;
+ renv = infop->primary;
+ thread = R_ADDR(infop, renv->thread_off);
+ MUTEX_LOCK(env, renv->mtx_regenv);
+
+ /*
+		 * If we are past the specified max, try to reclaim one from
+		 * our queue. If failchk has marked the slot not in use, we
+ * can take it, otherwise we must call is_alive before freeing
+ * it.
+ */
+ if (thread->thr_count >= thread->thr_max) {
+ SH_TAILQ_FOREACH(
+ ip, &htab[indx], dbth_links, __db_thread_info)
+ if (ip->dbth_state == THREAD_SLOT_NOT_IN_USE ||
+ (ip->dbth_state == THREAD_OUT &&
+ ALIVE_ON(env) && !dbenv->is_alive(
+ dbenv, ip->dbth_pid, ip->dbth_tid, 0)))
+ break;
+
+ if (ip != NULL) {
+ DB_ASSERT(env, ip->dbth_pincount == 0);
+ goto init;
+ }
+ }
+
+ thread->thr_count++;
+ if ((ret = __env_alloc(infop,
+ sizeof(DB_THREAD_INFO), &ip)) == 0) {
+ memset(ip, 0, sizeof(*ip));
+ /*
+ * This assumes we can link atomically since we do
+ * no locking here. We never use the backpointer
+ * so we only need to be able to write an offset
+ * atomically.
+ */
+ SH_TAILQ_INSERT_HEAD(
+ &htab[indx], ip, dbth_links, __db_thread_info);
+ ip->dbth_pincount = 0;
+ ip->dbth_pinmax = PINMAX;
+ ip->dbth_pinlist = R_OFFSET(infop, ip->dbth_pinarray);
+
+init: ip->dbth_pid = id.pid;
+ ip->dbth_tid = id.tid;
+ ip->dbth_state = state;
+ SH_TAILQ_INIT(&ip->dbth_xatxn);
+ }
+ MUTEX_UNLOCK(env, renv->mtx_regenv);
+ } else
+ ip->dbth_state = state;
+ *ipp = ip;
+
+ DB_ASSERT(env, ret == 0);
+ if (ret != 0)
+ __db_errx(env, DB_STR("1508",
+ "Unable to allocate thread control block"));
+ return (ret);
+}
+
+/*
+ * __env_thread_id_string --
+ * Convert a thread id to a string.
+ *
+ * PUBLIC: char *__env_thread_id_string
+ * PUBLIC: __P((DB_ENV *, pid_t, db_threadid_t, char *));
+ */
+char *
+__env_thread_id_string(dbenv, pid, tid, buf)
+ DB_ENV *dbenv;
+ pid_t pid;
+ db_threadid_t tid;
+ char *buf;
+{
+#ifdef HAVE_SIMPLE_THREAD_TYPE
+#ifdef UINT64_FMT
+ char fmt[20];
+
+ snprintf(fmt, sizeof(fmt), "%s/%s", UINT64_FMT, UINT64_FMT);
+ snprintf(buf,
+ DB_THREADID_STRLEN, fmt, (u_int64_t)pid, (u_int64_t)(uintptr_t)tid);
+#else
+ snprintf(buf, DB_THREADID_STRLEN, "%lu/%lu", (u_long)pid, (u_long)tid);
+#endif
+#else
+#ifdef UINT64_FMT
+ char fmt[20];
+
+ snprintf(fmt, sizeof(fmt), "%s/TID", UINT64_FMT);
+ snprintf(buf, DB_THREADID_STRLEN, fmt, (u_int64_t)pid);
+#else
+ snprintf(buf, DB_THREADID_STRLEN, "%lu/TID", (u_long)pid);
+#endif
+#endif
+ COMPQUIET(dbenv, NULL);
+ COMPQUIET(*(u_int8_t *)&tid, 0);
+
+ return (buf);
+}
diff --git a/src/env/env_file.c b/src/env/env_file.c
new file mode 100644
index 00000000..b102404d
--- /dev/null
+++ b/src/env/env_file.c
@@ -0,0 +1,128 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2002, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __db_file_extend --
+ * Initialize a regular file by writing the last page of the file.
+ *
+ * PUBLIC: int __db_file_extend __P((ENV *, DB_FH *, size_t));
+ */
+int
+__db_file_extend(env, fhp, size)
+ ENV *env;
+ DB_FH *fhp;
+ size_t size;
+{
+ db_pgno_t pages;
+ size_t nw;
+ u_int32_t relative;
+ int ret;
+ char buf;
+
+ buf = '\0';
+ /*
+ * Extend the file by writing the last page. If the region is >4Gb,
+ * increment may be larger than the maximum possible seek "relative"
+ * argument, as it's an unsigned 32-bit value. Break the offset into
+ * pages of 1MB each so we don't overflow -- (2^20 * 2^32 is bigger
+	 * than any memory I expect to see for a while).
+ */
+ pages = (db_pgno_t)((size - sizeof(buf)) / MEGABYTE);
+ relative = (u_int32_t)((size - sizeof(buf)) % MEGABYTE);
+ if ((ret = __os_seek(env, fhp, pages, MEGABYTE, relative)) == 0)
+ ret = __os_write(env, fhp, &buf, sizeof(buf), &nw);
+
+ return (ret);
+}
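+
+/*
+ * A worked example of the offset split above (the size is arbitrary):
+ * for size == 4GB + 512, the last byte written lands at offset
+ * 4GB + 511, so pages == 4096 and relative == 511; the seek goes to
+ * 4096 * MEGABYTE + 511 and the one-byte write extends the file to
+ * exactly "size" bytes, with neither value overflowing 32 bits.
+ */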
+
+/*
+ * __db_file_multi_write --
+ * Overwrite a file with multiple passes to corrupt the data.
+ *
+ * PUBLIC: int __db_file_multi_write __P((ENV *, const char *));
+ */
+int
+__db_file_multi_write(env, path)
+ ENV *env;
+ const char *path;
+{
+ DB_FH *fhp;
+ u_int32_t mbytes, bytes;
+ int ret;
+
+ if ((ret = __os_open(env, path, 0, DB_OSO_REGION, 0, &fhp)) == 0 &&
+ (ret = __os_ioinfo(env, path, fhp, &mbytes, &bytes, NULL)) == 0) {
+ /*
+ * !!!
+ * Overwrite a regular file with alternating 0xff, 0x00 and 0xff
+		 * byte patterns. This implies a fixed-block filesystem;
+		 * journaling or logging filesystems will require operating
+		 * system support.
+ */
+ if ((ret =
+ __db_file_write(env, fhp, mbytes, bytes, 255)) != 0)
+ goto err;
+ if ((ret =
+ __db_file_write(env, fhp, mbytes, bytes, 0)) != 0)
+ goto err;
+ if ((ret =
+ __db_file_write(env, fhp, mbytes, bytes, 255)) != 0)
+ goto err;
+ } else
+ __db_err(env, ret, "%s", path);
+
+err: if (fhp != NULL)
+ (void)__os_closehandle(env, fhp);
+ return (ret);
+}
+
+/*
+ * __db_file_write --
+ * A single pass over the file, writing the specified byte pattern.
+ *
+ * PUBLIC: int __db_file_write __P((ENV *,
+ * PUBLIC: DB_FH *, u_int32_t, u_int32_t, int));
+ */
+int
+__db_file_write(env, fhp, mbytes, bytes, pattern)
+ ENV *env;
+ DB_FH *fhp;
+ int pattern;
+ u_int32_t mbytes, bytes;
+{
+ size_t len, nw;
+ int i, ret;
+ char *buf;
+
+#undef FILE_WRITE_IO_SIZE
+#define FILE_WRITE_IO_SIZE (64 * 1024)
+ if ((ret = __os_malloc(env, FILE_WRITE_IO_SIZE, &buf)) != 0)
+ return (ret);
+ memset(buf, pattern, FILE_WRITE_IO_SIZE);
+
+ if ((ret = __os_seek(env, fhp, 0, 0, 0)) != 0)
+ goto err;
+ for (; mbytes > 0; --mbytes)
+ for (i = MEGABYTE / FILE_WRITE_IO_SIZE; i > 0; --i)
+ if ((ret = __os_write(
+ env, fhp, buf, FILE_WRITE_IO_SIZE, &nw)) != 0)
+ goto err;
+ for (; bytes > 0; bytes -= (u_int32_t)len) {
+ len = bytes < FILE_WRITE_IO_SIZE ? bytes : FILE_WRITE_IO_SIZE;
+ if ((ret = __os_write(env, fhp, buf, len, &nw)) != 0)
+ goto err;
+ }
+
+ ret = __os_fsync(env, fhp);
+
+err: __os_free(env, buf);
+ return (ret);
+}
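+
+/*
+ * For illustration, with hypothetical sizes mbytes == 2 and bytes == 1500:
+ * the loops above issue 2 * (MEGABYTE / FILE_WRITE_IO_SIZE) == 32 full
+ * 64KB writes, then a single 1500-byte write, and finally fsync the
+ * handle.
+ */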
diff --git a/src/env/env_globals.c b/src/env/env_globals.c
new file mode 100644
index 00000000..955e6738
--- /dev/null
+++ b/src/env/env_globals.c
@@ -0,0 +1,66 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * A structure with static initialization values for all of the global fields
+ * used by Berkeley DB.
+ * See dbinc/globals.h for the structure definition.
+ */
+DB_GLOBALS __db_global_values = {
+#ifdef HAVE_VXWORKS
+ 0, /* VxWorks: db_global_init */
+ NULL, /* VxWorks: db_global_lock */
+#endif
+#ifdef DB_WIN32
+#ifndef DB_WINCE
+ { 0 }, /* SECURITY_DESCRIPTOR win_default_sec_desc */
+ { 0 }, /* SECURITY_ATTRIBUTES win_default_sec_attr */
+#endif
+ NULL, /* SECURITY_ATTRIBUTES *win_sec_attr */
+#endif
+ { NULL, NULL }, /* XA env list */
+
+ "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=", /* db_line */
+ { 0 }, /* error_buf */
+ 0, /* uid_init */
+ 0, /* rand_next */
+ 0, /* fid_serial */
+ 0, /* db_errno */
+ 0, /* num_active_pids */
+ 0, /* size_active_pids */
+ NULL, /* active_pids */
+ NULL, /* saved_errstr */
+ NULL, /* j_assert */
+ NULL, /* j_close */
+ NULL, /* j_dirfree */
+ NULL, /* j_dirlist */
+	NULL,		/* j_exists */
+ NULL, /* j_free */
+ NULL, /* j_fsync */
+ NULL, /* j_ftruncate */
+ NULL, /* j_ioinfo */
+ NULL, /* j_malloc */
+ NULL, /* j_file_map */
+ NULL, /* j_file_unmap */
+ NULL, /* j_open */
+ NULL, /* j_pread */
+ NULL, /* j_pwrite */
+ NULL, /* j_read */
+ NULL, /* j_realloc */
+ NULL, /* j_region_map */
+ NULL, /* j_region_unmap */
+ NULL, /* j_rename */
+ NULL, /* j_seek */
+ NULL, /* j_unlink */
+ NULL, /* j_write */
+ NULL /* j_yield */
+};
diff --git a/src/env/env_method.c b/src/env/env_method.c
new file mode 100644
index 00000000..63deacea
--- /dev/null
+++ b/src/env/env_method.c
@@ -0,0 +1,1918 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id: env_method.c,v dabaaeb7d839 2010/08/03 17:28:53 mike $
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/hmac.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+static int __db_env_init __P((DB_ENV *));
+static void __env_err __P((const DB_ENV *, int, const char *, ...));
+static void __env_errx __P((const DB_ENV *, const char *, ...));
+static int __env_get_create_dir __P((DB_ENV *, const char **));
+static int __env_get_data_dirs __P((DB_ENV *, const char ***));
+static int __env_get_data_len __P((DB_ENV *, u_int32_t *));
+static int __env_get_flags __P((DB_ENV *, u_int32_t *));
+static int __env_get_home __P((DB_ENV *, const char **));
+static int __env_get_intermediate_dir_mode __P((DB_ENV *, const char **));
+static int __env_get_metadata_dir __P((DB_ENV *, const char **));
+static int __env_get_shm_key __P((DB_ENV *, long *));
+static int __env_get_thread_count __P((DB_ENV *, u_int32_t *));
+static int __env_get_thread_id_fn __P((DB_ENV *,
+ void (**)(DB_ENV *, pid_t *, db_threadid_t *)));
+static int __env_get_thread_id_string_fn __P((DB_ENV *,
+ char * (**)(DB_ENV *, pid_t, db_threadid_t, char *)));
+static int __env_get_timeout __P((DB_ENV *, db_timeout_t *, u_int32_t));
+static int __env_get_tmp_dir __P((DB_ENV *, const char **));
+static int __env_get_verbose __P((DB_ENV *, u_int32_t, int *));
+static int __env_get_app_dispatch
+ __P((DB_ENV *, int (**)(DB_ENV *, DBT *, DB_LSN *, db_recops)));
+static int __env_set_app_dispatch
+ __P((DB_ENV *, int (*)(DB_ENV *, DBT *, DB_LSN *, db_recops)));
+static int __env_set_event_notify
+ __P((DB_ENV *, void (*)(DB_ENV *, u_int32_t, void *)));
+static int __env_get_feedback __P((DB_ENV *, void (**)(DB_ENV *, int, int)));
+static int __env_set_feedback __P((DB_ENV *, void (*)(DB_ENV *, int, int)));
+static int __env_get_isalive __P((DB_ENV *,
+ int (**)(DB_ENV *, pid_t, db_threadid_t, u_int32_t)));
+static int __env_set_isalive __P((DB_ENV *,
+ int (*)(DB_ENV *, pid_t, db_threadid_t, u_int32_t)));
+static int __env_set_thread_id __P((DB_ENV *, void (*)(DB_ENV *,
+ pid_t *, db_threadid_t *)));
+static int __env_set_thread_id_string __P((DB_ENV *,
+ char * (*)(DB_ENV *, pid_t, db_threadid_t, char *)));
+
+/*
+ * db_env_create --
+ * DB_ENV constructor.
+ *
+ * EXTERN: int db_env_create __P((DB_ENV **, u_int32_t));
+ */
+int
+db_env_create(dbenvpp, flags)
+ DB_ENV **dbenvpp;
+ u_int32_t flags;
+{
+ DB_ENV *dbenv;
+ ENV *env;
+ int ret;
+
+ /*
+ * !!!
+ * Our caller has not yet had the opportunity to reset the panic
+ * state or turn off mutex locking, and so we can neither check
+	 * the panic state nor acquire a mutex in the DB_ENV create path.
+ *
+ * !!!
+	 * We can't call the flags-checking routines, as we don't have an
+ * environment yet.
+ */
+ if (flags != 0)
+ return (EINVAL);
+
+ /* Allocate the DB_ENV and ENV structures -- we always have both. */
+ if ((ret = __os_calloc(NULL, 1, sizeof(DB_ENV), &dbenv)) != 0)
+ return (ret);
+ if ((ret = __os_calloc(NULL, 1, sizeof(ENV), &env)) != 0)
+ goto err;
+ dbenv->env = env;
+ env->dbenv = dbenv;
+
+ if ((ret = __db_env_init(dbenv)) != 0 ||
+ (ret = __lock_env_create(dbenv)) != 0 ||
+ (ret = __log_env_create(dbenv)) != 0 ||
+ (ret = __memp_env_create(dbenv)) != 0 ||
+#ifdef HAVE_REPLICATION
+ (ret = __rep_env_create(dbenv)) != 0 ||
+#endif
+ (ret = __txn_env_create(dbenv)))
+ goto err;
+
+ *dbenvpp = dbenv;
+ return (0);
+
+err: __db_env_destroy(dbenv);
+ return (ret);
+}
+
+/*
+ * __db_env_destroy --
+ * DB_ENV destructor.
+ *
+ * PUBLIC: void __db_env_destroy __P((DB_ENV *));
+ */
+void
+__db_env_destroy(dbenv)
+ DB_ENV *dbenv;
+{
+ __lock_env_destroy(dbenv);
+ __log_env_destroy(dbenv);
+ __memp_env_destroy(dbenv);
+#ifdef HAVE_REPLICATION
+ __rep_env_destroy(dbenv);
+#endif
+ __txn_env_destroy(dbenv);
+
+ /*
+ * Discard the underlying ENV structure.
+ *
+ * XXX
+ * This is wrong, but can't be fixed until we finish the work of
+ * splitting up the DB_ENV and ENV structures so that we don't
+ * touch anything in the ENV as part of the above calls to subsystem
+ * DB_ENV cleanup routines.
+ */
+ memset(dbenv->env, CLEAR_BYTE, sizeof(ENV));
+ __os_free(NULL, dbenv->env);
+
+ memset(dbenv, CLEAR_BYTE, sizeof(DB_ENV));
+ __os_free(NULL, dbenv);
+}
+
+/*
+ * __db_env_init --
+ * Initialize a DB_ENV structure.
+ */
+static int
+__db_env_init(dbenv)
+ DB_ENV *dbenv;
+{
+ ENV *env;
+ /*
+ * !!!
+ * Our caller has not yet had the opportunity to reset the panic
+ * state or turn off mutex locking, and so we can neither check
+	 * the panic state nor acquire a mutex in the DB_ENV create path.
+ *
+ * Initialize the method handles.
+ */
+ /* DB_ENV PUBLIC HANDLE LIST BEGIN */
+ dbenv->add_data_dir = __env_add_data_dir;
+ dbenv->backup = __db_backup;
+ dbenv->dbbackup = __db_dbbackup_pp;
+ dbenv->cdsgroup_begin = __cdsgroup_begin_pp;
+ dbenv->close = __env_close_pp;
+ dbenv->dbremove = __env_dbremove_pp;
+ dbenv->dbrename = __env_dbrename_pp;
+ dbenv->err = __env_err;
+ dbenv->errx = __env_errx;
+ dbenv->failchk = __env_failchk_pp;
+ dbenv->fileid_reset = __env_fileid_reset_pp;
+ dbenv->get_alloc = __env_get_alloc;
+ dbenv->get_app_dispatch = __env_get_app_dispatch;
+ dbenv->get_cache_max = __memp_get_cache_max;
+ dbenv->get_cachesize = __memp_get_cachesize;
+ dbenv->get_backup_callbacks = __env_get_backup_callbacks;
+ dbenv->get_backup_config = __env_get_backup_config;
+ dbenv->get_create_dir = __env_get_create_dir;
+ dbenv->get_data_dirs = __env_get_data_dirs;
+ dbenv->get_data_len = __env_get_data_len;
+ dbenv->get_encrypt_flags = __env_get_encrypt_flags;
+ dbenv->get_errcall = __env_get_errcall;
+ dbenv->get_errfile = __env_get_errfile;
+ dbenv->get_errpfx = __env_get_errpfx;
+ dbenv->get_feedback = __env_get_feedback;
+ dbenv->get_flags = __env_get_flags;
+ dbenv->get_home = __env_get_home;
+ dbenv->get_intermediate_dir_mode = __env_get_intermediate_dir_mode;
+ dbenv->get_isalive = __env_get_isalive;
+ dbenv->get_lg_bsize = __log_get_lg_bsize;
+ dbenv->get_lg_dir = __log_get_lg_dir;
+ dbenv->get_lg_filemode = __log_get_lg_filemode;
+ dbenv->get_lg_max = __log_get_lg_max;
+ dbenv->get_lg_regionmax = __log_get_lg_regionmax;
+ dbenv->get_lk_conflicts = __lock_get_lk_conflicts;
+ dbenv->get_lk_detect = __lock_get_lk_detect;
+ dbenv->get_lk_max_lockers = __lock_get_lk_max_lockers;
+ dbenv->get_lk_max_locks = __lock_get_lk_max_locks;
+ dbenv->get_lk_max_objects = __lock_get_lk_max_objects;
+ dbenv->get_lk_partitions = __lock_get_lk_partitions;
+ dbenv->get_lk_priority = __lock_get_lk_priority;
+ dbenv->get_lk_tablesize = __lock_get_lk_tablesize;
+ dbenv->get_memory_init = __env_get_memory_init;
+ dbenv->get_memory_max = __env_get_memory_max;
+ dbenv->get_metadata_dir = __env_get_metadata_dir;
+ dbenv->get_mp_max_openfd = __memp_get_mp_max_openfd;
+ dbenv->get_mp_max_write = __memp_get_mp_max_write;
+ dbenv->get_mp_mmapsize = __memp_get_mp_mmapsize;
+ dbenv->get_mp_mtxcount = __memp_get_mp_mtxcount;
+ dbenv->get_mp_pagesize = __memp_get_mp_pagesize;
+ dbenv->get_mp_tablesize = __memp_get_mp_tablesize;
+ dbenv->get_msgcall = __env_get_msgcall;
+ dbenv->get_msgfile = __env_get_msgfile;
+ dbenv->get_open_flags = __env_get_open_flags;
+ dbenv->get_shm_key = __env_get_shm_key;
+ dbenv->get_thread_count = __env_get_thread_count;
+ dbenv->get_thread_id_fn = __env_get_thread_id_fn;
+ dbenv->get_thread_id_string_fn = __env_get_thread_id_string_fn;
+ dbenv->get_timeout = __env_get_timeout;
+ dbenv->get_tmp_dir = __env_get_tmp_dir;
+ dbenv->get_tx_max = __txn_get_tx_max;
+ dbenv->get_tx_timestamp = __txn_get_tx_timestamp;
+ dbenv->get_verbose = __env_get_verbose;
+ dbenv->is_bigendian = __db_isbigendian;
+ dbenv->lock_detect = __lock_detect_pp;
+ dbenv->lock_get = __lock_get_pp;
+ dbenv->lock_id = __lock_id_pp;
+ dbenv->lock_id_free = __lock_id_free_pp;
+ dbenv->lock_put = __lock_put_pp;
+ dbenv->lock_stat = __lock_stat_pp;
+ dbenv->lock_stat_print = __lock_stat_print_pp;
+ dbenv->lock_vec = __lock_vec_pp;
+ dbenv->log_archive = __log_archive_pp;
+ dbenv->log_cursor = __log_cursor_pp;
+ dbenv->log_file = __log_file_pp;
+ dbenv->log_flush = __log_flush_pp;
+ dbenv->log_get_config = __log_get_config;
+ dbenv->log_printf = __log_printf_capi;
+ dbenv->log_put = __log_put_pp;
+ dbenv->log_put_record = __log_put_record_pp;
+ dbenv->log_read_record = __log_read_record_pp;
+ dbenv->log_set_config = __log_set_config;
+ dbenv->log_stat = __log_stat_pp;
+ dbenv->log_stat_print = __log_stat_print_pp;
+ dbenv->log_verify = __log_verify_pp;
+ dbenv->lsn_reset = __env_lsn_reset_pp;
+ dbenv->memp_fcreate = __memp_fcreate_pp;
+ dbenv->memp_register = __memp_register_pp;
+ dbenv->memp_stat = __memp_stat_pp;
+ dbenv->memp_stat_print = __memp_stat_print_pp;
+ dbenv->memp_sync = __memp_sync_pp;
+ dbenv->memp_trickle = __memp_trickle_pp;
+ dbenv->mutex_alloc = __mutex_alloc_pp;
+ dbenv->mutex_free = __mutex_free_pp;
+ dbenv->mutex_get_align = __mutex_get_align;
+ dbenv->mutex_get_increment = __mutex_get_increment;
+ dbenv->mutex_get_init = __mutex_get_init;
+ dbenv->mutex_get_max = __mutex_get_max;
+ dbenv->mutex_get_tas_spins = __mutex_get_tas_spins;
+ dbenv->mutex_lock = __mutex_lock_pp;
+ dbenv->mutex_set_align = __mutex_set_align;
+ dbenv->mutex_set_increment = __mutex_set_increment;
+ dbenv->mutex_set_init = __mutex_set_init;
+ dbenv->mutex_set_max = __mutex_set_max;
+ dbenv->mutex_set_tas_spins = __mutex_set_tas_spins;
+ dbenv->mutex_stat = __mutex_stat_pp;
+ dbenv->mutex_stat_print = __mutex_stat_print_pp;
+ dbenv->mutex_unlock = __mutex_unlock_pp;
+ dbenv->open = __env_open_pp;
+ dbenv->remove = __env_remove;
+ dbenv->rep_elect = __rep_elect_pp;
+ dbenv->rep_flush = __rep_flush;
+ dbenv->rep_get_clockskew = __rep_get_clockskew;
+ dbenv->rep_get_config = __rep_get_config;
+ dbenv->rep_get_limit = __rep_get_limit;
+ dbenv->rep_get_nsites = __rep_get_nsites;
+ dbenv->rep_get_priority = __rep_get_priority;
+ dbenv->rep_get_request = __rep_get_request;
+ dbenv->rep_get_timeout = __rep_get_timeout;
+ dbenv->rep_process_message = __rep_process_message_pp;
+ dbenv->rep_set_clockskew = __rep_set_clockskew;
+ dbenv->rep_set_config = __rep_set_config;
+ dbenv->rep_set_limit = __rep_set_limit;
+ dbenv->rep_set_nsites = __rep_set_nsites_pp;
+ dbenv->rep_set_priority = __rep_set_priority;
+ dbenv->rep_set_request = __rep_set_request;
+ dbenv->rep_set_timeout = __rep_set_timeout;
+ dbenv->rep_set_transport = __rep_set_transport_pp;
+ dbenv->rep_start = __rep_start_pp;
+ dbenv->rep_stat = __rep_stat_pp;
+ dbenv->rep_stat_print = __rep_stat_print_pp;
+ dbenv->rep_sync = __rep_sync;
+ dbenv->repmgr_channel = __repmgr_channel;
+ dbenv->repmgr_get_ack_policy = __repmgr_get_ack_policy;
+ dbenv->repmgr_local_site = __repmgr_local_site;
+ dbenv->repmgr_msg_dispatch = __repmgr_set_msg_dispatch;
+ dbenv->repmgr_set_ack_policy = __repmgr_set_ack_policy;
+ dbenv->repmgr_site = __repmgr_site;
+ dbenv->repmgr_site_by_eid = __repmgr_site_by_eid;
+ dbenv->repmgr_site_list = __repmgr_site_list;
+ dbenv->repmgr_start = __repmgr_start;
+ dbenv->repmgr_stat = __repmgr_stat_pp;
+ dbenv->repmgr_stat_print = __repmgr_stat_print_pp;
+ dbenv->set_alloc = __env_set_alloc;
+ dbenv->set_app_dispatch = __env_set_app_dispatch;
+ dbenv->set_backup_callbacks = __env_set_backup_callbacks;
+ dbenv->set_backup_config = __env_set_backup_config;
+ dbenv->set_cache_max = __memp_set_cache_max;
+ dbenv->set_cachesize = __memp_set_cachesize;
+ dbenv->set_create_dir = __env_set_create_dir;
+ dbenv->set_data_dir = __env_set_data_dir;
+ dbenv->set_data_len = __env_set_data_len;
+ dbenv->set_encrypt = __env_set_encrypt;
+ dbenv->set_errcall = __env_set_errcall;
+ dbenv->set_errfile = __env_set_errfile;
+ dbenv->set_errpfx = __env_set_errpfx;
+ dbenv->set_event_notify = __env_set_event_notify;
+ dbenv->set_feedback = __env_set_feedback;
+ dbenv->set_flags = __env_set_flags;
+ dbenv->set_intermediate_dir_mode = __env_set_intermediate_dir_mode;
+ dbenv->set_isalive = __env_set_isalive;
+ dbenv->set_lg_bsize = __log_set_lg_bsize;
+ dbenv->set_lg_dir = __log_set_lg_dir;
+ dbenv->set_lg_filemode = __log_set_lg_filemode;
+ dbenv->set_lg_max = __log_set_lg_max;
+ dbenv->set_lg_regionmax = __log_set_lg_regionmax;
+ dbenv->set_lk_conflicts = __lock_set_lk_conflicts;
+ dbenv->set_lk_detect = __lock_set_lk_detect;
+ dbenv->set_lk_max_lockers = __lock_set_lk_max_lockers;
+ dbenv->set_lk_max_locks = __lock_set_lk_max_locks;
+ dbenv->set_lk_max_objects = __lock_set_lk_max_objects;
+ dbenv->set_lk_partitions = __lock_set_lk_partitions;
+ dbenv->set_lk_priority = __lock_set_lk_priority;
+ dbenv->set_lk_tablesize = __lock_set_lk_tablesize;
+ dbenv->set_memory_init = __env_set_memory_init;
+ dbenv->set_memory_max = __env_set_memory_max;
+ dbenv->set_metadata_dir = __env_set_metadata_dir;
+ dbenv->set_mp_max_openfd = __memp_set_mp_max_openfd;
+ dbenv->set_mp_max_write = __memp_set_mp_max_write;
+ dbenv->set_mp_mmapsize = __memp_set_mp_mmapsize;
+ dbenv->set_mp_mtxcount = __memp_set_mp_mtxcount;
+ dbenv->set_mp_pagesize = __memp_set_mp_pagesize;
+ dbenv->set_mp_tablesize = __memp_set_mp_tablesize;
+ dbenv->set_msgcall = __env_set_msgcall;
+ dbenv->set_msgfile = __env_set_msgfile;
+ dbenv->set_paniccall = __env_set_paniccall;
+ dbenv->set_shm_key = __env_set_shm_key;
+ dbenv->set_thread_count = __env_set_thread_count;
+ dbenv->set_thread_id = __env_set_thread_id;
+ dbenv->set_thread_id_string = __env_set_thread_id_string;
+ dbenv->set_timeout = __env_set_timeout;
+ dbenv->set_tmp_dir = __env_set_tmp_dir;
+ dbenv->set_tx_max = __txn_set_tx_max;
+ dbenv->set_tx_timestamp = __txn_set_tx_timestamp;
+ dbenv->set_verbose = __env_set_verbose;
+ dbenv->stat_print = __env_stat_print_pp;
+ dbenv->txn_applied = __txn_applied_pp;
+ dbenv->txn_begin = __txn_begin_pp;
+ dbenv->txn_checkpoint = __txn_checkpoint_pp;
+ dbenv->txn_recover = __txn_recover_pp;
+ dbenv->txn_stat = __txn_stat_pp;
+ dbenv->txn_stat_print = __txn_stat_print_pp;
+ /* DB_ENV PUBLIC HANDLE LIST END */
+
+ /* DB_ENV PRIVATE HANDLE LIST BEGIN */
+ dbenv->prdbt = __db_prdbt;
+ /* DB_ENV PRIVATE HANDLE LIST END */
+
+ dbenv->shm_key = INVALID_REGION_SEGID;
+ dbenv->thread_id = __os_id;
+ dbenv->thread_id_string = __env_thread_id_string;
+
+ env = dbenv->env;
+ __os_id(NULL, &env->pid_cache, NULL);
+
+ env->db_ref = 0;
+ env->log_verify_wrap = __log_verify_wrap;
+ env->data_len = ENV_DEF_DATA_LEN;
+ TAILQ_INIT(&env->fdlist);
+
+ if (!__db_isbigendian())
+ F_SET(env, ENV_LITTLEENDIAN);
+ F_SET(env, ENV_NO_OUTPUT_SET);
+
+ return (0);
+}
+
+/*
+ * __env_err --
+ * DbEnv.err method.
+ */
+static void
+#ifdef STDC_HEADERS
+__env_err(const DB_ENV *dbenv, int error, const char *fmt, ...)
+#else
+__env_err(dbenv, error, fmt, va_alist)
+ const DB_ENV *dbenv;
+ int error;
+ const char *fmt;
+ va_dcl
+#endif
+{
+ /* Message with error string, to stderr by default. */
+ DB_REAL_ERR(dbenv, error, DB_ERROR_SET, 1, fmt);
+}
+
+/*
+ * __env_errx --
+ * DbEnv.errx method.
+ */
+static void
+#ifdef STDC_HEADERS
+__env_errx(const DB_ENV *dbenv, const char *fmt, ...)
+#else
+__env_errx(dbenv, fmt, va_alist)
+ const DB_ENV *dbenv;
+ const char *fmt;
+ va_dcl
+#endif
+{
+ /* Message without error string, to stderr by default. */
+ DB_REAL_ERR(dbenv, 0, DB_ERROR_NOT_SET, 1, fmt);
+}
+
+static int
+__env_get_home(dbenv, homep)
+ DB_ENV *dbenv;
+ const char **homep;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_BEFORE_OPEN(env, "DB_ENV->get_home");
+ *homep = env->db_home;
+
+ return (0);
+}
+
+/*
+ * __env_get_alloc --
+ * {DB_ENV,DB}->get_alloc.
+ *
+ * PUBLIC: int __env_get_alloc __P((DB_ENV *, void *(**)(size_t),
+ * PUBLIC: void *(**)(void *, size_t), void (**)(void *)));
+ */
+int
+__env_get_alloc(dbenv, mal_funcp, real_funcp, free_funcp)
+ DB_ENV *dbenv;
+ void *(**mal_funcp) __P((size_t));
+ void *(**real_funcp) __P((void *, size_t));
+ void (**free_funcp) __P((void *));
+{
+ if (mal_funcp != NULL)
+ *mal_funcp = dbenv->db_malloc;
+ if (real_funcp != NULL)
+ *real_funcp = dbenv->db_realloc;
+ if (free_funcp != NULL)
+ *free_funcp = dbenv->db_free;
+ return (0);
+}
+
+/*
+ * __env_set_alloc --
+ * {DB_ENV,DB}->set_alloc.
+ *
+ * PUBLIC: int __env_set_alloc __P((DB_ENV *, void *(*)(size_t),
+ * PUBLIC: void *(*)(void *, size_t), void (*)(void *)));
+ */
+int
+__env_set_alloc(dbenv, mal_func, real_func, free_func)
+ DB_ENV *dbenv;
+ void *(*mal_func) __P((size_t));
+ void *(*real_func) __P((void *, size_t));
+ void (*free_func) __P((void *));
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_alloc");
+
+ dbenv->db_malloc = mal_func;
+ dbenv->db_realloc = real_func;
+ dbenv->db_free = free_func;
+ return (0);
+}
+
+/*
+ * __env_get_memory_init --
+ * DB_ENV->get_memory_init.
+ *
+ * PUBLIC: int __env_get_memory_init __P((DB_ENV *,
+ * PUBLIC: DB_MEM_CONFIG, u_int32_t *));
+ */
+int
+__env_get_memory_init(dbenv, type, countp)
+ DB_ENV *dbenv;
+ DB_MEM_CONFIG type;
+ u_int32_t *countp;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ switch (type) {
+ case DB_MEM_LOCK:
+ ENV_NOT_CONFIGURED(env,
+ env->lk_handle, "DB_ENV->get_memory_init", DB_INIT_LOCK);
+ if (LOCKING_ON(env))
+ *countp = ((DB_LOCKREGION *)
+ env->lk_handle->reginfo.primary)->stat.st_initlocks;
+ else
+ *countp = dbenv->lk_init;
+ break;
+ case DB_MEM_LOCKOBJECT:
+ ENV_NOT_CONFIGURED(env,
+ env->lk_handle, "DB_ENV->get_memory_init", DB_INIT_LOCK);
+ if (LOCKING_ON(env))
+ *countp = ((DB_LOCKREGION *) env->
+ lk_handle->reginfo.primary)->stat.st_initobjects;
+ else
+ *countp = dbenv->lk_init_objects;
+ break;
+ case DB_MEM_LOCKER:
+ ENV_NOT_CONFIGURED(env,
+ env->lk_handle, "DB_ENV->get_memory_init", DB_INIT_LOCK);
+ if (LOCKING_ON(env))
+ *countp = ((DB_LOCKREGION *) env->
+ lk_handle->reginfo.primary)->stat.st_initlockers;
+ else
+ *countp = dbenv->lk_init_lockers;
+ break;
+ case DB_MEM_LOGID:
+ ENV_NOT_CONFIGURED(env,
+ env->lg_handle, "DB_ENV->get_memory_init", DB_INIT_LOG);
+
+ if (LOGGING_ON(env))
+ *countp = ((LOG *)env->lg_handle->
+ reginfo.primary)->stat.st_fileid_init;
+ else
+ *countp = dbenv->lg_fileid_init;
+ break;
+ case DB_MEM_TRANSACTION:
+ ENV_NOT_CONFIGURED(env,
+ env->tx_handle, "DB_ENV->memory_init", DB_INIT_TXN);
+
+ if (TXN_ON(env))
+ *countp = ((DB_TXNREGION *)
+ env->tx_handle->reginfo.primary)->inittxns;
+ else
+ *countp = dbenv->tx_init;
+ break;
+ case DB_MEM_THREAD:
+ /* We always update thr_init when joining an env. */
+ *countp = dbenv->thr_init;
+ break;
+ }
+
+ return (0);
+}
+
+/*
+ * __env_set_memory_init --
+ * DB_ENV->set_memory_init.
+ *
+ * PUBLIC: int __env_set_memory_init __P((DB_ENV *, DB_MEM_CONFIG, u_int32_t));
+ */
+int
+__env_set_memory_init(dbenv, type, count)
+ DB_ENV *dbenv;
+ DB_MEM_CONFIG type;
+ u_int32_t count;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_memory_init");
+ switch (type) {
+ case DB_MEM_LOCK:
+ dbenv->lk_init = count;
+ break;
+ case DB_MEM_LOCKOBJECT:
+ dbenv->lk_init_objects = count;
+ break;
+ case DB_MEM_LOCKER:
+ dbenv->lk_init_lockers = count;
+ break;
+ case DB_MEM_LOGID:
+ dbenv->lg_fileid_init = count;
+ break;
+ case DB_MEM_TRANSACTION:
+ dbenv->tx_init = count;
+ break;
+ case DB_MEM_THREAD:
+ dbenv->thr_init = count;
+ break;
+ }
+
+ return (0);
+}
+
+/*
+ * __env_get_memory_max --
+ * DB_ENV->get_memory_max.
+ *
+ * PUBLIC: int __env_get_memory_max __P((DB_ENV *, u_int32_t *, u_int32_t *));
+ */
+int
+__env_get_memory_max(dbenv, gbytes, bytes)
+ DB_ENV *dbenv;
+ u_int32_t *gbytes, *bytes;
+{
+ ENV *env;
+ env = dbenv->env;
+
+ if (F_ISSET(env, ENV_OPEN_CALLED)) {
+ *gbytes = (u_int32_t)(env->reginfo->rp->max / GIGABYTE);
+ *bytes = (u_int32_t)(env->reginfo->rp->max % GIGABYTE);
+ } else {
+ *gbytes = (u_int32_t)(dbenv->memory_max / GIGABYTE);
+ *bytes = (u_int32_t)(dbenv->memory_max % GIGABYTE);
+ }
+ return (0);
+}
+
+/*
+ * __env_set_memory_max --
+ * DB_ENV->set_memory_max.
+ *
+ * PUBLIC: int __env_set_memory_max __P((DB_ENV *, u_int32_t, u_int32_t));
+ */
+int
+__env_set_memory_max(dbenv, gbytes, bytes)
+ DB_ENV *dbenv;
+ u_int32_t gbytes, bytes;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_memory_max");
+
+ /*
+	 * If they are asking for 4GB exactly on a 32-bit platform, they
+ * really meant 4GB - 1. Give it to them.
+ */
+ if (sizeof(roff_t) == 4 && gbytes == 4 && bytes == 0) {
+ --gbytes;
+ bytes = GIGABYTE - 1;
+ }
+ /*
+ * Make sure they wouldn't overflow the memory_max field on a
+ * 32 bit architecture.
+	 * 32-bit architecture.
+ if (sizeof(roff_t) == 4 && gbytes >= 4) {
+ __db_errx(env, DB_STR("1588",
+ "Maximum memory size too large: maximum is 4GB"));
+ return (EINVAL);
+ }
+ dbenv->memory_max = ((roff_t)gbytes * GIGABYTE) + bytes;
+ return (0);
+}
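+
+/*
+ * A worked example of the clamping above: on a 32-bit build, where
+ * sizeof(roff_t) == 4, a request for exactly 4GB (gbytes == 4, bytes == 0)
+ * is stored as 3 * GIGABYTE + (GIGABYTE - 1), i.e. 4GB - 1, while any
+ * other request of 4GB or more fails with EINVAL.
+ */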
+
+/*
+ * __env_get_app_dispatch --
+ * Get the transaction abort recover function.
+ */
+static int
+__env_get_app_dispatch(dbenv, app_dispatchp)
+ DB_ENV *dbenv;
+ int (**app_dispatchp) __P((DB_ENV *, DBT *, DB_LSN *, db_recops));
+{
+
+ if (app_dispatchp != NULL)
+ *app_dispatchp = dbenv->app_dispatch;
+ return (0);
+}
+
+/*
+ * __env_set_app_dispatch --
+ * Set the transaction abort recover function.
+ */
+static int
+__env_set_app_dispatch(dbenv, app_dispatch)
+ DB_ENV *dbenv;
+ int (*app_dispatch) __P((DB_ENV *, DBT *, DB_LSN *, db_recops));
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_app_dispatch");
+
+ dbenv->app_dispatch = app_dispatch;
+ return (0);
+}
+
+/*
+ * __env_get_encrypt_flags --
+ * {DB_ENV,DB}->get_encrypt_flags.
+ *
+ * PUBLIC: int __env_get_encrypt_flags __P((DB_ENV *, u_int32_t *));
+ */
+int
+__env_get_encrypt_flags(dbenv, flagsp)
+ DB_ENV *dbenv;
+ u_int32_t *flagsp;
+{
+#ifdef HAVE_CRYPTO
+ DB_CIPHER *db_cipher;
+#endif
+ ENV *env;
+
+ env = dbenv->env;
+
+#ifdef HAVE_CRYPTO
+ db_cipher = env->crypto_handle;
+ if (db_cipher != NULL && db_cipher->alg == CIPHER_AES)
+ *flagsp = DB_ENCRYPT_AES;
+ else
+ *flagsp = 0;
+ return (0);
+#else
+ COMPQUIET(flagsp, 0);
+ __db_errx(env, DB_STR("1555",
+ "library build did not include support for cryptography"));
+ return (DB_OPNOTSUP);
+#endif
+}
+
+/*
+ * __env_set_encrypt --
+ * DB_ENV->set_encrypt.
+ *
+ * PUBLIC: int __env_set_encrypt __P((DB_ENV *, const char *, u_int32_t));
+ */
+int
+__env_set_encrypt(dbenv, passwd, flags)
+ DB_ENV *dbenv;
+ const char *passwd;
+ u_int32_t flags;
+{
+#ifdef HAVE_CRYPTO
+ DB_THREAD_INFO *ip;
+ DB_CIPHER *db_cipher;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_encrypt");
+#define OK_CRYPTO_FLAGS (DB_ENCRYPT_AES)
+
+ if (flags != 0 && LF_ISSET(~OK_CRYPTO_FLAGS))
+ return (__db_ferr(env, "DB_ENV->set_encrypt", 0));
+
+ if (passwd == NULL || strlen(passwd) == 0) {
+ __db_errx(env, DB_STR("1556",
+ "Empty password specified to set_encrypt"));
+ return (EINVAL);
+ }
+ ENV_ENTER(env, ip);
+ if (!CRYPTO_ON(env)) {
+ if ((ret = __os_calloc(env, 1, sizeof(DB_CIPHER), &db_cipher))
+ != 0)
+ goto err;
+ env->crypto_handle = db_cipher;
+ } else
+ db_cipher = env->crypto_handle;
+
+ if (dbenv->passwd != NULL)
+ __os_free(env, dbenv->passwd);
+ if ((ret = __os_strdup(env, passwd, &dbenv->passwd)) != 0) {
+ __os_free(env, db_cipher);
+ goto err;
+ }
+ /*
+	 * We're going to need this often enough to keep it around.
+ */
+ dbenv->passwd_len = strlen(dbenv->passwd) + 1;
+ /*
+ * The MAC key is for checksumming, and is separate from
+ * the algorithm. So initialize it here, even if they
+ * are using CIPHER_ANY.
+ */
+ __db_derive_mac(
+ (u_int8_t *)dbenv->passwd, dbenv->passwd_len, db_cipher->mac_key);
+ switch (flags) {
+ case 0:
+ F_SET(db_cipher, CIPHER_ANY);
+ break;
+ case DB_ENCRYPT_AES:
+ if ((ret =
+ __crypto_algsetup(env, db_cipher, CIPHER_AES, 0)) != 0)
+ goto err1;
+ break;
+ default: /* Impossible. */
+ break;
+ }
+ ENV_LEAVE(env, ip);
+ return (0);
+
+err1:
+ __os_free(env, dbenv->passwd);
+ __os_free(env, db_cipher);
+ env->crypto_handle = NULL;
+err:
+ ENV_LEAVE(env, ip);
+ return (ret);
+#else
+ COMPQUIET(passwd, NULL);
+ COMPQUIET(flags, 0);
+
+ __db_errx(dbenv->env, DB_STR("1557",
+ "library build did not include support for cryptography"));
+ return (DB_OPNOTSUP);
+#endif
+}
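+
+/*
+ * Usage sketch (illustrative): an application enables AES encryption
+ * before opening the environment:
+ *
+ *	dbenv->set_encrypt(dbenv, "my passphrase", DB_ENCRYPT_AES);
+ *
+ * Passing flags of 0 selects CIPHER_ANY, deferring the algorithm
+ * choice while still deriving the MAC key for checksumming.
+ */
+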
+#ifndef HAVE_BREW
+static
+#endif
+const FLAG_MAP EnvMap[] = {
+ { DB_AUTO_COMMIT, DB_ENV_AUTO_COMMIT },
+ { DB_CDB_ALLDB, DB_ENV_CDB_ALLDB },
+ { DB_DATABASE_LOCKING, DB_ENV_DATABASE_LOCKING },
+ { DB_DIRECT_DB, DB_ENV_DIRECT_DB },
+ { DB_DSYNC_DB, DB_ENV_DSYNC_DB },
+ { DB_HOTBACKUP_IN_PROGRESS, DB_ENV_HOTBACKUP },
+ { DB_MULTIVERSION, DB_ENV_MULTIVERSION },
+ { DB_NOFLUSH, DB_ENV_NOFLUSH },
+ { DB_NOLOCKING, DB_ENV_NOLOCKING },
+ { DB_NOMMAP, DB_ENV_NOMMAP },
+ { DB_NOPANIC, DB_ENV_NOPANIC },
+ { DB_OVERWRITE, DB_ENV_OVERWRITE },
+ { DB_REGION_INIT, DB_ENV_REGION_INIT },
+ { DB_TIME_NOTGRANTED, DB_ENV_TIME_NOTGRANTED },
+ { DB_TXN_NOSYNC, DB_ENV_TXN_NOSYNC },
+ { DB_TXN_NOWAIT, DB_ENV_TXN_NOWAIT },
+ { DB_TXN_SNAPSHOT, DB_ENV_TXN_SNAPSHOT },
+ { DB_TXN_WRITE_NOSYNC, DB_ENV_TXN_WRITE_NOSYNC },
+ { DB_YIELDCPU, DB_ENV_YIELDCPU }
+};
+
+/*
+ * __env_map_flags -- map from external to internal flags.
+ * PUBLIC: void __env_map_flags __P((const FLAG_MAP *,
+ * PUBLIC: u_int, u_int32_t *, u_int32_t *));
+ */
+void
+__env_map_flags(flagmap, mapsize, inflagsp, outflagsp)
+ const FLAG_MAP *flagmap;
+ u_int mapsize;
+ u_int32_t *inflagsp, *outflagsp;
+{
+ const FLAG_MAP *fmp;
+ u_int i;
+
+ for (i = 0, fmp = flagmap;
+ i < mapsize / sizeof(flagmap[0]); ++i, ++fmp)
+ if (FLD_ISSET(*inflagsp, fmp->inflag)) {
+ FLD_SET(*outflagsp, fmp->outflag);
+ FLD_CLR(*inflagsp, fmp->inflag);
+ if (*inflagsp == 0)
+ break;
+ }
+}
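+
+/*
+ * Note: the mapsize argument is the size of the flag table in bytes,
+ * not an entry count, so callers pass sizeof(table); for example, the
+ * call in __env_set_flags below:
+ *
+ *	__env_map_flags(EnvMap, sizeof(EnvMap), &flags, &mapped_flags);
+ */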
+
+/*
+ * __env_fetch_flags -- map from internal to external flags.
+ * PUBLIC: void __env_fetch_flags __P((const FLAG_MAP *,
+ * PUBLIC: u_int, u_int32_t *, u_int32_t *));
+ */
+void
+__env_fetch_flags(flagmap, mapsize, inflagsp, outflagsp)
+ const FLAG_MAP *flagmap;
+ u_int mapsize;
+ u_int32_t *inflagsp, *outflagsp;
+{
+ const FLAG_MAP *fmp;
+	u_int i;
+
+ *outflagsp = 0;
+ for (i = 0, fmp = flagmap;
+ i < mapsize / sizeof(flagmap[0]); ++i, ++fmp)
+ if (FLD_ISSET(*inflagsp, fmp->outflag))
+ FLD_SET(*outflagsp, fmp->inflag);
+}
+
+static int
+__env_get_flags(dbenv, flagsp)
+ DB_ENV *dbenv;
+ u_int32_t *flagsp;
+{
+ ENV *env;
+ DB_THREAD_INFO *ip;
+
+ __env_fetch_flags(EnvMap, sizeof(EnvMap), &dbenv->flags, flagsp);
+
+ env = dbenv->env;
+ /* Some flags are persisted in the regions. */
+ if (env->reginfo != NULL &&
+ ((REGENV *)env->reginfo->primary)->panic != 0)
+ FLD_SET(*flagsp, DB_PANIC_ENVIRONMENT);
+
+ /* If the hotbackup counter is positive, set the flag indicating so. */
+ if (TXN_ON(env)) {
+ ENV_ENTER(env, ip);
+ TXN_SYSTEM_LOCK(env);
+ if (((DB_TXNREGION *)
+ env->tx_handle->reginfo.primary)->n_hotbackup > 0)
+ FLD_SET(*flagsp, DB_HOTBACKUP_IN_PROGRESS);
+ TXN_SYSTEM_UNLOCK(env);
+ ENV_LEAVE(env, ip);
+ }
+
+ return (0);
+}
+
+/*
+ * __env_set_flags --
+ * DB_ENV->set_flags.
+ *
+ * PUBLIC: int __env_set_flags __P((DB_ENV *, u_int32_t, int));
+ */
+int
+__env_set_flags(dbenv, flags, on)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+ int on;
+{
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ u_int32_t mapped_flags;
+ int mem_on, ret;
+
+ env = dbenv->env;
+
+#define OK_FLAGS \
+ (DB_AUTO_COMMIT | DB_CDB_ALLDB | DB_DATABASE_LOCKING | \
+ DB_DIRECT_DB | DB_DSYNC_DB | DB_MULTIVERSION | \
+ DB_NOLOCKING | DB_NOMMAP | DB_NOPANIC | DB_OVERWRITE | \
+ DB_PANIC_ENVIRONMENT | DB_REGION_INIT | \
+ DB_TIME_NOTGRANTED | DB_TXN_NOSYNC | DB_TXN_NOWAIT | \
+ DB_TXN_SNAPSHOT | DB_TXN_WRITE_NOSYNC | DB_YIELDCPU | \
+ DB_HOTBACKUP_IN_PROGRESS | DB_NOFLUSH)
+
+ if (LF_ISSET(~OK_FLAGS))
+ return (__db_ferr(env, "DB_ENV->set_flags", 0));
+ if (on) {
+ if ((ret = __db_fcchk(env, "DB_ENV->set_flags",
+ flags, DB_TXN_NOSYNC, DB_TXN_WRITE_NOSYNC)) != 0)
+ return (ret);
+ if (LF_ISSET(DB_DIRECT_DB) && __os_support_direct_io() == 0) {
+ __db_errx(env,
+ "DB_ENV->set_flags: direct I/O either not configured or not supported");
+ return (EINVAL);
+ }
+ }
+
+ if (LF_ISSET(DB_CDB_ALLDB))
+ ENV_ILLEGAL_AFTER_OPEN(env,
+ "DB_ENV->set_flags: DB_CDB_ALLDB");
+ if (LF_ISSET(DB_PANIC_ENVIRONMENT)) {
+ ENV_ILLEGAL_BEFORE_OPEN(env,
+ "DB_ENV->set_flags: DB_PANIC_ENVIRONMENT");
+ if (on) {
+ __db_errx(env, DB_STR("1558",
+ "Environment panic set"));
+ (void)__env_panic(env, DB_RUNRECOVERY);
+ } else
+ __env_panic_set(env, 0);
+ }
+ if (LF_ISSET(DB_REGION_INIT))
+ ENV_ILLEGAL_AFTER_OPEN(env,
+ "DB_ENV->set_flags: DB_REGION_INIT");
+
+ /*
+ * DB_LOG_IN_MEMORY, DB_TXN_NOSYNC and DB_TXN_WRITE_NOSYNC are
+ * mutually incompatible. If we're setting one of them, clear all
+ * current settings. If the environment is open, check to see that
+ * logging is not in memory.
+ */
+ if (on && LF_ISSET(DB_TXN_NOSYNC | DB_TXN_WRITE_NOSYNC)) {
+ F_CLR(dbenv, DB_ENV_TXN_NOSYNC | DB_ENV_TXN_WRITE_NOSYNC);
+ if (!F_ISSET(env, ENV_OPEN_CALLED)) {
+ if ((ret =
+ __log_set_config(dbenv, DB_LOG_IN_MEMORY, 0)) != 0)
+ return (ret);
+ } else if (LOGGING_ON(env)) {
+ if ((ret = __log_get_config(dbenv,
+ DB_LOG_IN_MEMORY, &mem_on)) != 0)
+ return (ret);
+ if (mem_on == 1) {
+ __db_errx(env, DB_STR("1559",
+ "DB_TXN_NOSYNC and DB_TXN_WRITE_NOSYNC"
+ " may not be used with DB_LOG_IN_MEMORY"));
+ return (EINVAL);
+ }
+ }
+ }
+
+ /*
+ * Settings of DB_HOTBACKUP_IN_PROGRESS are reference-counted
+ * in REGENV.
+ */
+ if (LF_ISSET(DB_HOTBACKUP_IN_PROGRESS)) {
+ /* You can't take a hot backup without transactions. */
+ ENV_REQUIRES_CONFIG(env, env->tx_handle,
+ "DB_ENV->set_flags: DB_HOTBACKUP_IN_PROGRESS", DB_INIT_TXN);
+
+ ENV_ENTER(env, ip);
+ ret = __env_set_backup(env, on);
+ ENV_LEAVE(env, ip);
+ if (ret != 0)
+ return (ret);
+ }
+
+ mapped_flags = 0;
+ __env_map_flags(EnvMap, sizeof(EnvMap), &flags, &mapped_flags);
+ if (on)
+ F_SET(dbenv, mapped_flags);
+ else
+ F_CLR(dbenv, mapped_flags);
+
+ return (0);
+}
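+
+/*
+ * Usage sketch (illustrative): flags are turned on and off through the
+ * same method, e.g.:
+ *
+ *	dbenv->set_flags(dbenv, DB_TXN_NOSYNC, 1);
+ *	dbenv->set_flags(dbenv, DB_TXN_NOSYNC, 0);
+ */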
+
+/*
+ * __env_set_backup --
+ * PUBLIC: int __env_set_backup __P((ENV *, int));
+ */
+int
+__env_set_backup(env, on)
+ ENV *env;
+ int on;
+{
+ DB_TXNREGION *tenv;
+ int needs_checkpoint, ret;
+
+ tenv = (DB_TXNREGION *)env->tx_handle->reginfo.primary;
+ needs_checkpoint = 0;
+
+ TXN_SYSTEM_LOCK(env);
+ if (on) {
+ tenv->n_hotbackup++;
+ if (tenv->n_bulk_txn > 0)
+ needs_checkpoint = 1;
+ } else {
+ if (tenv->n_hotbackup == 0)
+ needs_checkpoint = -1; /* signal count error */
+ else
+ tenv->n_hotbackup--;
+ }
+ TXN_SYSTEM_UNLOCK(env);
+
+ if (needs_checkpoint == -1) {
+ __db_errx(env, DB_STR("1560",
+ "Attempt to decrement hotbackup counter past zero"));
+ return (EINVAL);
+ }
+
+	if (needs_checkpoint && (ret = __txn_checkpoint(env, 0, 0, 0)) != 0)
+ return (ret);
+ return (0);
+}
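+
+/*
+ * Note on the checkpoint above (an inference from the n_bulk_txn test):
+ * bulk transactions reduce the logging done for page writes, so when a
+ * hot backup starts while bulk transactions are active we force a
+ * checkpoint to push their pages to disk where the backup can see them.
+ */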
+
+static int
+__env_get_data_dirs(dbenv, dirpp)
+ DB_ENV *dbenv;
+ const char ***dirpp;
+{
+ *dirpp = (const char **)dbenv->db_data_dir;
+ return (0);
+}
+
+/*
+ * __env_set_data_dir --
+ * DB_ENV->set_data_dir.
+ *
+ * PUBLIC: int __env_set_data_dir __P((DB_ENV *, const char *));
+ */
+int
+__env_set_data_dir(dbenv, dir)
+ DB_ENV *dbenv;
+ const char *dir;
+{
+ int ret;
+
+ if ((ret = __env_add_data_dir(dbenv, dir)) != 0)
+ return (ret);
+
+ if (dbenv->data_next == 1)
+ return (__env_set_create_dir(dbenv, dir));
+
+ return (0);
+}
+
+/*
+ * __env_add_data_dir --
+ * DB_ENV->add_data_dir.
+ *
+ * PUBLIC: int __env_add_data_dir __P((DB_ENV *, const char *));
+ */
+int
+__env_add_data_dir(dbenv, dir)
+ DB_ENV *dbenv;
+ const char *dir;
+{
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->add_data_dir");
+
+ /*
+ * The array is NULL-terminated so it can be returned by get_data_dirs
+ * without a length.
+ */
+
+#define DATA_INIT_CNT 20 /* Start with 20 data slots. */
+ if (dbenv->db_data_dir == NULL) {
+ if ((ret = __os_calloc(env, DATA_INIT_CNT,
+ sizeof(char **), &dbenv->db_data_dir)) != 0)
+ return (ret);
+ dbenv->data_cnt = DATA_INIT_CNT;
+ } else if (dbenv->data_next == dbenv->data_cnt - 2) {
+ dbenv->data_cnt *= 2;
+ if ((ret = __os_realloc(env,
+ (u_int)dbenv->data_cnt * sizeof(char **),
+ &dbenv->db_data_dir)) != 0)
+ return (ret);
+ }
+
+ ret = __os_strdup(env,
+ dir, &dbenv->db_data_dir[dbenv->data_next++]);
+ dbenv->db_data_dir[dbenv->data_next] = NULL;
+ return (ret);
+}
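+
+/*
+ * Usage sketch (illustrative): add two data directories and select the
+ * second as the creation target, before DB_ENV->open:
+ *
+ *	dbenv->add_data_dir(dbenv, "data1");
+ *	dbenv->add_data_dir(dbenv, "data2");
+ *	dbenv->set_create_dir(dbenv, "data2");
+ */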
+
+/*
+ * __env_set_create_dir --
+ * DB_ENV->set_create_dir.
+ * The list of directories cannot change after the environment is opened,
+ * and storing a pointer is atomic, so we do not need a mutex here even if
+ * multiple threads are using the DB_ENV handle.
+ *
+ * PUBLIC: int __env_set_create_dir __P((DB_ENV *, const char *));
+ */
+int
+__env_set_create_dir(dbenv, dir)
+ DB_ENV *dbenv;
+ const char *dir;
+{
+ ENV *env;
+ int i;
+
+ env = dbenv->env;
+
+ for (i = 0; i < dbenv->data_next; i++)
+ if (strcmp(dir, dbenv->db_data_dir[i]) == 0)
+ break;
+
+ if (i == dbenv->data_next) {
+ __db_errx(env, DB_STR_A("1561",
+ "Directory %s not in environment list.", "%s"), dir);
+ return (EINVAL);
+ }
+
+ dbenv->db_create_dir = dbenv->db_data_dir[i];
+ return (0);
+}
+
+static int
+__env_get_create_dir(dbenv, dirp)
+ DB_ENV *dbenv;
+ const char **dirp;
+{
+ *dirp = dbenv->db_create_dir;
+ return (0);
+}
+
+static int
+__env_get_intermediate_dir_mode(dbenv, modep)
+ DB_ENV *dbenv;
+ const char **modep;
+{
+ *modep = dbenv->intermediate_dir_mode;
+ return (0);
+}
+
+/*
+ * __env_set_metadata_dir --
+ * DB_ENV->set_metadata_dir.
+ *
+ * PUBLIC: int __env_set_metadata_dir __P((DB_ENV *, const char *));
+ */
+int
+__env_set_metadata_dir(dbenv, dir)
+ DB_ENV *dbenv;
+ const char *dir;
+{
+ ENV *env;
+ int i, ret;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_metadata_dir");
+
+ /* If metadata_dir is not already on data_dir list, add it. */
+ for (i = 0; i < dbenv->data_next; i++)
+ if (strcmp(dir, dbenv->db_data_dir[i]) == 0)
+ break;
+ if (i == dbenv->data_next &&
+ (ret = __env_add_data_dir(dbenv, dir)) != 0) {
+ __db_errx(env, DB_STR_A("1590",
+ "Could not add %s to environment list.", "%s"), dir);
+ return (ret);
+ }
+
+ if (dbenv->db_md_dir != NULL)
+ __os_free(env, dbenv->db_md_dir);
+ return (__os_strdup(env, dir, &dbenv->db_md_dir));
+}
+
+static int
+__env_get_metadata_dir(dbenv, dirp)
+ DB_ENV *dbenv;
+ const char **dirp;
+{
+ *dirp = dbenv->db_md_dir;
+ return (0);
+}
+
+/*
+ * __env_set_data_len --
+ * DB_ENV->set_data_len.
+ *
+ * PUBLIC: int __env_set_data_len __P((DB_ENV *, u_int32_t));
+ */
+int
+__env_set_data_len(dbenv, data_len)
+ DB_ENV *dbenv;
+ u_int32_t data_len;
+{
+ dbenv->env->data_len = data_len;
+ return (0);
+}
+
+static int
+__env_get_data_len(dbenv, data_lenp)
+ DB_ENV *dbenv;
+ u_int32_t *data_lenp;
+{
+ *data_lenp = dbenv->env->data_len;
+ return (0);
+}
+
+/*
+ * __env_set_intermediate_dir_mode --
+ * DB_ENV->set_intermediate_dir_mode.
+ *
+ * PUBLIC: int __env_set_intermediate_dir_mode __P((DB_ENV *, const char *));
+ */
+int
+__env_set_intermediate_dir_mode(dbenv, mode)
+ DB_ENV *dbenv;
+ const char *mode;
+{
+ ENV *env;
+ u_int t;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_intermediate_dir_mode");
+
+#define __SETMODE(offset, valid_ch, mask) { \
+ if (mode[offset] == (valid_ch)) \
+ t |= (mask); \
+ else if (mode[offset] != '-') \
+ goto format_err; \
+}
+ t = 0;
+ __SETMODE(0, 'r', S_IRUSR);
+ __SETMODE(1, 'w', S_IWUSR);
+ __SETMODE(2, 'x', S_IXUSR);
+ __SETMODE(3, 'r', S_IRGRP);
+ __SETMODE(4, 'w', S_IWGRP);
+ __SETMODE(5, 'x', S_IXGRP);
+ __SETMODE(6, 'r', S_IROTH);
+ __SETMODE(7, 'w', S_IWOTH);
+ __SETMODE(8, 'x', S_IXOTH);
+ if (mode[9] != '\0' || t == 0) {
+ /*
+ * We disallow modes of 0 -- we use 0 to decide the application
+ * never configured intermediate directory permissions, and we
+ * shouldn't create intermediate directories. Besides, setting
+ * the permissions to 0 makes no sense.
+ */
+format_err: __db_errx(env,
+ "DB_ENV->set_intermediate_dir_mode: illegal mode \"%s\"", mode);
+ return (EINVAL);
+ }
+
+ if (dbenv->intermediate_dir_mode != NULL)
+ __os_free(env, dbenv->intermediate_dir_mode);
+ if ((ret = __os_strdup(env, mode, &dbenv->intermediate_dir_mode)) != 0)
+ return (ret);
+
+ env->dir_mode = (int)t;
+ return (0);
+}
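+
+/*
+ * Usage sketch (illustrative): the mode string mirrors ls(1) output,
+ * so intermediate directories created with mode 0750 would be:
+ *
+ *	dbenv->set_intermediate_dir_mode(dbenv, "rwxr-x---");
+ */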
+
+/*
+ * __env_get_errcall --
+ * {DB_ENV,DB}->get_errcall.
+ *
+ * PUBLIC: void __env_get_errcall __P((DB_ENV *,
+ * PUBLIC: void (**)(const DB_ENV *, const char *, const char *)));
+ */
+void
+__env_get_errcall(dbenv, errcallp)
+ DB_ENV *dbenv;
+ void (**errcallp) __P((const DB_ENV *, const char *, const char *));
+{
+ *errcallp = dbenv->db_errcall;
+}
+
+/*
+ * __env_set_errcall --
+ * {DB_ENV,DB}->set_errcall.
+ *
+ * PUBLIC: void __env_set_errcall __P((DB_ENV *,
+ * PUBLIC: void (*)(const DB_ENV *, const char *, const char *)));
+ */
+void
+__env_set_errcall(dbenv, errcall)
+ DB_ENV *dbenv;
+ void (*errcall) __P((const DB_ENV *, const char *, const char *));
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ F_CLR(env, ENV_NO_OUTPUT_SET);
+ dbenv->db_errcall = errcall;
+}
+
+/*
+ * __env_get_errfile --
+ * {DB_ENV,DB}->get_errfile.
+ *
+ * PUBLIC: void __env_get_errfile __P((DB_ENV *, FILE **));
+ */
+void
+__env_get_errfile(dbenv, errfilep)
+ DB_ENV *dbenv;
+ FILE **errfilep;
+{
+ *errfilep = dbenv->db_errfile;
+}
+
+/*
+ * __env_set_errfile --
+ * {DB_ENV,DB}->set_errfile.
+ *
+ * PUBLIC: void __env_set_errfile __P((DB_ENV *, FILE *));
+ */
+void
+__env_set_errfile(dbenv, errfile)
+ DB_ENV *dbenv;
+ FILE *errfile;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ F_CLR(env, ENV_NO_OUTPUT_SET);
+ dbenv->db_errfile = errfile;
+}
+
+/*
+ * __env_get_errpfx --
+ * {DB_ENV,DB}->get_errpfx.
+ *
+ * PUBLIC: void __env_get_errpfx __P((DB_ENV *, const char **));
+ */
+void
+__env_get_errpfx(dbenv, errpfxp)
+ DB_ENV *dbenv;
+ const char **errpfxp;
+{
+ *errpfxp = dbenv->db_errpfx;
+}
+
+/*
+ * __env_set_errpfx --
+ * {DB_ENV,DB}->set_errpfx.
+ *
+ * PUBLIC: void __env_set_errpfx __P((DB_ENV *, const char *));
+ */
+void
+__env_set_errpfx(dbenv, errpfx)
+ DB_ENV *dbenv;
+ const char *errpfx;
+{
+ dbenv->db_errpfx = errpfx;
+}
+
+static int
+__env_get_feedback(dbenv, feedbackp)
+ DB_ENV *dbenv;
+ void (**feedbackp) __P((DB_ENV *, int, int));
+{
+ if (feedbackp != NULL)
+ *feedbackp = dbenv->db_feedback;
+ return (0);
+}
+
+static int
+__env_set_feedback(dbenv, feedback)
+ DB_ENV *dbenv;
+ void (*feedback) __P((DB_ENV *, int, int));
+{
+ dbenv->db_feedback = feedback;
+ return (0);
+}
+
+/*
+ * __env_get_thread_id_fn --
+ * DB_ENV->get_thread_id_fn
+ */
+static int
+__env_get_thread_id_fn(dbenv, idp)
+ DB_ENV *dbenv;
+ void (**idp) __P((DB_ENV *, pid_t *, db_threadid_t *));
+{
+ if (idp != NULL)
+ *idp = dbenv->thread_id;
+ return (0);
+}
+
+/*
+ * __env_set_thread_id --
+ * DB_ENV->set_thread_id
+ */
+static int
+__env_set_thread_id(dbenv, id)
+ DB_ENV *dbenv;
+ void (*id) __P((DB_ENV *, pid_t *, db_threadid_t *));
+{
+ dbenv->thread_id = id;
+ return (0);
+}
+
+/*
+ * __env_get_thread_id_string_fn --
+ *	DB_ENV->get_thread_id_string_fn
+ */
+static int
+__env_get_thread_id_string_fn(dbenv, thread_id_stringp)
+ DB_ENV *dbenv;
+ char *(**thread_id_stringp)
+ __P((DB_ENV *, pid_t, db_threadid_t, char *));
+{
+ if (thread_id_stringp != NULL)
+ *thread_id_stringp = dbenv->thread_id_string;
+ return (0);
+}
+
+/*
+ * __env_set_thread_id_string --
+ *	DB_ENV->set_thread_id_string
+ */
+static int
+__env_set_thread_id_string(dbenv, thread_id_string)
+ DB_ENV *dbenv;
+ char *(*thread_id_string) __P((DB_ENV *, pid_t, db_threadid_t, char *));
+{
+ dbenv->thread_id_string = thread_id_string;
+ return (0);
+}
+
+/*
+ * __env_get_isalive --
+ * DB_ENV->get_isalive
+ */
+static int
+__env_get_isalive(dbenv, is_alivep)
+ DB_ENV *dbenv;
+ int (**is_alivep) __P((DB_ENV *, pid_t, db_threadid_t, u_int32_t));
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ if (F_ISSET(env, ENV_OPEN_CALLED) && env->thr_nbucket == 0) {
+ __db_errx(env, DB_STR("1562",
+ "is_alive method specified but no thread region allocated"));
+ return (EINVAL);
+ }
+ if (is_alivep != NULL)
+ *is_alivep = dbenv->is_alive;
+ return (0);
+}
+
+/*
+ * __env_set_isalive --
+ * DB_ENV->set_isalive
+ */
+static int
+__env_set_isalive(dbenv, is_alive)
+ DB_ENV *dbenv;
+ int (*is_alive) __P((DB_ENV *, pid_t, db_threadid_t, u_int32_t));
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ if (F_ISSET(env, ENV_OPEN_CALLED) && env->thr_nbucket == 0) {
+ __db_errx(env, DB_STR("1563",
+ "is_alive method specified but no thread region allocated"));
+ return (EINVAL);
+ }
+ dbenv->is_alive = is_alive;
+ return (0);
+}
+
+/*
+ * __env_get_thread_count --
+ * DB_ENV->get_thread_count
+ */
+static int
+__env_get_thread_count(dbenv, countp)
+ DB_ENV *dbenv;
+ u_int32_t *countp;
+{
+ *countp = dbenv->thr_max;
+ return (0);
+}
+
+/*
+ * __env_set_thread_count --
+ * DB_ENV->set_thread_count
+ *
+ * PUBLIC: int __env_set_thread_count __P((DB_ENV *, u_int32_t));
+ */
+int
+__env_set_thread_count(dbenv, count)
+ DB_ENV *dbenv;
+ u_int32_t count;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_thread_count");
+ dbenv->thr_max = count;
+
+ return (0);
+}
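+
+/*
+ * Usage sketch (illustrative; my_is_alive is a hypothetical application
+ * callback): opening with DB_FAILCHK requires both a thread count and
+ * an is_alive function, as enforced in __env_open_arg:
+ *
+ *	dbenv->set_thread_count(dbenv, 50);
+ *	dbenv->set_isalive(dbenv, my_is_alive);
+ */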
+
+/*
+ * __env_get_msgcall --
+ * {DB_ENV,DB}->get_msgcall.
+ *
+ * PUBLIC: void __env_get_msgcall
+ * PUBLIC: __P((DB_ENV *, void (**)(const DB_ENV *, const char *)));
+ */
+void
+__env_get_msgcall(dbenv, msgcallp)
+ DB_ENV *dbenv;
+ void (**msgcallp) __P((const DB_ENV *, const char *));
+{
+ if (msgcallp != NULL)
+ *msgcallp = dbenv->db_msgcall;
+}
+
+/*
+ * __env_set_msgcall --
+ * {DB_ENV,DB}->set_msgcall.
+ *
+ * PUBLIC: void __env_set_msgcall
+ * PUBLIC: __P((DB_ENV *, void (*)(const DB_ENV *, const char *)));
+ */
+void
+__env_set_msgcall(dbenv, msgcall)
+ DB_ENV *dbenv;
+ void (*msgcall) __P((const DB_ENV *, const char *));
+{
+ dbenv->db_msgcall = msgcall;
+}
+
+/*
+ * __env_get_msgfile --
+ * {DB_ENV,DB}->get_msgfile.
+ *
+ * PUBLIC: void __env_get_msgfile __P((DB_ENV *, FILE **));
+ */
+void
+__env_get_msgfile(dbenv, msgfilep)
+ DB_ENV *dbenv;
+ FILE **msgfilep;
+{
+ *msgfilep = dbenv->db_msgfile;
+}
+
+/*
+ * __env_set_msgfile --
+ * {DB_ENV,DB}->set_msgfile.
+ *
+ * PUBLIC: void __env_set_msgfile __P((DB_ENV *, FILE *));
+ */
+void
+__env_set_msgfile(dbenv, msgfile)
+ DB_ENV *dbenv;
+ FILE *msgfile;
+{
+ dbenv->db_msgfile = msgfile;
+}
+
+/*
+ * __env_set_paniccall --
+ * {DB_ENV,DB}->set_paniccall.
+ *
+ * PUBLIC: int __env_set_paniccall __P((DB_ENV *, void (*)(DB_ENV *, int)));
+ */
+int
+__env_set_paniccall(dbenv, paniccall)
+ DB_ENV *dbenv;
+ void (*paniccall) __P((DB_ENV *, int));
+{
+ dbenv->db_paniccall = paniccall;
+ return (0);
+}
+
+/*
+ * __env_set_event_notify --
+ * DB_ENV->set_event_notify.
+ */
+static int
+__env_set_event_notify(dbenv, event_func)
+ DB_ENV *dbenv;
+ void (*event_func) __P((DB_ENV *, u_int32_t, void *));
+{
+ dbenv->db_event_func = event_func;
+ return (0);
+}
+
+static int
+__env_get_shm_key(dbenv, shm_keyp)
+ DB_ENV *dbenv;
+ long *shm_keyp; /* !!!: really a key_t *. */
+{
+ *shm_keyp = dbenv->shm_key;
+ return (0);
+}
+
+/*
+ * __env_set_shm_key --
+ * DB_ENV->set_shm_key.
+ *
+ * PUBLIC: int __env_set_shm_key __P((DB_ENV *, long));
+ */
+int
+__env_set_shm_key(dbenv, shm_key)
+ DB_ENV *dbenv;
+ long shm_key; /* !!!: really a key_t. */
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_shm_key");
+
+ dbenv->shm_key = shm_key;
+ return (0);
+}
+
+static int
+__env_get_tmp_dir(dbenv, dirp)
+ DB_ENV *dbenv;
+ const char **dirp;
+{
+ *dirp = dbenv->db_tmp_dir;
+ return (0);
+}
+
+/*
+ * __env_set_tmp_dir --
+ * DB_ENV->set_tmp_dir.
+ *
+ * PUBLIC: int __env_set_tmp_dir __P((DB_ENV *, const char *));
+ */
+int
+__env_set_tmp_dir(dbenv, dir)
+ DB_ENV *dbenv;
+ const char *dir;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ if (dbenv->db_tmp_dir != NULL)
+ __os_free(env, dbenv->db_tmp_dir);
+ return (__os_strdup(env, dir, &dbenv->db_tmp_dir));
+}
+
+static int
+__env_get_verbose(dbenv, which, onoffp)
+ DB_ENV *dbenv;
+ u_int32_t which;
+ int *onoffp;
+{
+ switch (which) {
+ case DB_VERB_BACKUP:
+ case DB_VERB_DEADLOCK:
+ case DB_VERB_FILEOPS:
+ case DB_VERB_FILEOPS_ALL:
+ case DB_VERB_RECOVERY:
+ case DB_VERB_REGISTER:
+ case DB_VERB_REPLICATION:
+ case DB_VERB_REP_ELECT:
+ case DB_VERB_REP_LEASE:
+ case DB_VERB_REP_MISC:
+ case DB_VERB_REP_MSGS:
+ case DB_VERB_REP_SYNC:
+ case DB_VERB_REP_SYSTEM:
+ case DB_VERB_REP_TEST:
+ case DB_VERB_REPMGR_CONNFAIL:
+ case DB_VERB_REPMGR_MISC:
+ case DB_VERB_WAITSFOR:
+ *onoffp = FLD_ISSET(dbenv->verbose, which) ? 1 : 0;
+ break;
+ default:
+ return (EINVAL);
+ }
+ return (0);
+}
+
+/*
+ * __env_set_verbose --
+ * DB_ENV->set_verbose.
+ *
+ * PUBLIC: int __env_set_verbose __P((DB_ENV *, u_int32_t, int));
+ */
+int
+__env_set_verbose(dbenv, which, on)
+ DB_ENV *dbenv;
+ u_int32_t which;
+ int on;
+{
+ switch (which) {
+ case DB_VERB_BACKUP:
+ case DB_VERB_DEADLOCK:
+ case DB_VERB_FILEOPS:
+ case DB_VERB_FILEOPS_ALL:
+ case DB_VERB_RECOVERY:
+ case DB_VERB_REGISTER:
+ case DB_VERB_REPLICATION:
+ case DB_VERB_REP_ELECT:
+ case DB_VERB_REP_LEASE:
+ case DB_VERB_REP_MISC:
+ case DB_VERB_REP_MSGS:
+ case DB_VERB_REP_SYNC:
+ case DB_VERB_REP_SYSTEM:
+ case DB_VERB_REP_TEST:
+ case DB_VERB_REPMGR_CONNFAIL:
+ case DB_VERB_REPMGR_MISC:
+ case DB_VERB_WAITSFOR:
+ if (on)
+ FLD_SET(dbenv->verbose, which);
+ else
+ FLD_CLR(dbenv->verbose, which);
+ break;
+ default:
+ return (EINVAL);
+ }
+ return (0);
+}
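+
+/*
+ * Usage sketch (illustrative): enable recovery-time diagnostics:
+ *
+ *	dbenv->set_verbose(dbenv, DB_VERB_RECOVERY, 1);
+ */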
+
+/*
+ * __db_mi_env --
+ * Method illegally called with public environment.
+ *
+ * PUBLIC: int __db_mi_env __P((ENV *, const char *));
+ */
+int
+__db_mi_env(env, name)
+ ENV *env;
+ const char *name;
+{
+ __db_errx(env, DB_STR_A("1564",
+ "%s: method not permitted when environment specified", "%s"),
+ name);
+ return (EINVAL);
+}
+
+/*
+ * __db_mi_open --
+ * Method illegally called after open.
+ *
+ * PUBLIC: int __db_mi_open __P((ENV *, const char *, int));
+ */
+int
+__db_mi_open(env, name, after)
+ ENV *env;
+ const char *name;
+ int after;
+{
+ __db_errx(env, DB_STR_A("1565",
+ "%s: method not permitted %s handle's open method", "%s %s"),
+ name, after ? DB_STR_P("after") : DB_STR_P("before"));
+ return (EINVAL);
+}
+
+/*
+ * __env_not_config --
+ * Method or function called without required configuration.
+ *
+ * PUBLIC: int __env_not_config __P((ENV *, char *, u_int32_t));
+ */
+int
+__env_not_config(env, i, flags)
+ ENV *env;
+ char *i;
+ u_int32_t flags;
+{
+ char *sub;
+ int is_sub;
+
+ is_sub = 1;
+
+ switch (flags) {
+ case DB_INIT_CDB:
+ sub = "DB_INIT_CDB";
+ is_sub = 0;
+ break;
+ case DB_INIT_LOCK:
+ sub = "locking";
+ break;
+ case DB_INIT_LOG:
+ sub = "logging";
+ break;
+ case DB_INIT_MPOOL:
+ sub = "memory pool";
+ break;
+ case DB_INIT_MUTEX:
+ sub = "mutex";
+ break;
+ case DB_INIT_REP:
+ sub = "replication";
+ break;
+ case DB_INIT_TXN:
+ sub = "transaction";
+ break;
+ default:
+ sub = "<unspecified>";
+ break;
+ }
+
+ if (is_sub) {
+ __db_errx(env, DB_STR_A("1566",
+ "%s interface requires an environment configured for the %s subsystem",
+ "%s %s"), i, sub);
+ } else {
+ __db_errx(env, DB_STR_A("1587",
+ "%s interface requires an environment configured with %s",
+ "%s %s"), i, sub);
+ }
+
+ return (EINVAL);
+}
+
+/*
+ * __env_get_timeout --
+ * DB_ENV->get_timeout
+ */
+static int
+__env_get_timeout(dbenv, timeoutp, flags)
+ DB_ENV *dbenv;
+ db_timeout_t *timeoutp;
+ u_int32_t flags;
+{
+ int ret;
+
+ ret = 0;
+ if (flags == DB_SET_REG_TIMEOUT) {
+ *timeoutp = dbenv->envreg_timeout;
+ } else
+ ret = __lock_get_env_timeout(dbenv, timeoutp, flags);
+ return (ret);
+}
+
+/*
+ * __env_set_timeout --
+ * DB_ENV->set_timeout
+ *
+ * PUBLIC: int __env_set_timeout __P((DB_ENV *, db_timeout_t, u_int32_t));
+ */
+int
+__env_set_timeout(dbenv, timeout, flags)
+ DB_ENV *dbenv;
+ db_timeout_t timeout;
+ u_int32_t flags;
+{
+ int ret;
+
+ ret = 0;
+ if (flags == DB_SET_REG_TIMEOUT)
+ dbenv->envreg_timeout = timeout;
+ else
+ ret = __lock_set_env_timeout(dbenv, timeout, flags);
+ return (ret);
+}
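+
+/*
+ * Usage sketch (illustrative): timeouts are expressed in microseconds;
+ * a one-second lock timeout, delegated to the lock subsystem as above:
+ *
+ *	dbenv->set_timeout(dbenv, 1000000, DB_SET_LOCK_TIMEOUT);
+ */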
diff --git a/src/env/env_name.c b/src/env/env_name.c
new file mode 100644
index 00000000..a3a0b371
--- /dev/null
+++ b/src/env/env_name.c
@@ -0,0 +1,285 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+static int __db_fullpath
+ __P((ENV *, const char *, const char *, int, int, char **));
+
+#define DB_ADDSTR(add) { \
+ /* \
+ * The string might be NULL or zero-length, and the p[-1] \
+	 * The string might be NULL or zero-length; skip it, since \
+	 * p[-1] below could otherwise reference memory before the \
+	 * beginning of our buffer. \
+ if ((add) != NULL && (add)[0] != '\0') { \
+ /* If leading slash, start over. */ \
+ if (__os_abspath(add)) { \
+ p = str; \
+ slash = 0; \
+ } \
+ /* Append to the current string. */ \
+ len = strlen(add); \
+ if (slash) \
+ *p++ = PATH_SEPARATOR[0]; \
+ memcpy(p, add, len); \
+ p += len; \
+ slash = strchr(PATH_SEPARATOR, p[-1]) == NULL; \
+ } \
+}
+
+/*
+ * __db_fullpath --
+ * Constructs a path name relative to the environment home, and optionally
+ *	Construct a path name relative to the environment home, and optionally
+ *	check whether the file or directory exists.
+static int
+__db_fullpath(env, dir, file, check_file, check_dir, namep)
+ ENV *env;
+ const char *dir;
+ const char *file;
+ int check_file;
+ int check_dir;
+ char **namep;
+{
+ size_t len;
+ const char *home;
+ char *p, *str;
+ int isdir, ret, slash;
+
+ /* All paths are relative to the environment home. */
+ home = (env == NULL) ? NULL : env->db_home;
+
+ len =
+ (home == NULL ? 0 : strlen(home) + 1) +
+ (dir == NULL ? 0 : strlen(dir) + 1) +
+ (file == NULL ? 0 : strlen(file) + 1);
+
+ if ((ret = __os_malloc(env, len, &str)) != 0)
+ return (ret);
+
+ slash = 0;
+ p = str;
+ DB_ADDSTR(home);
+ DB_ADDSTR(dir);
+ *p = '\0';
+ if (check_dir && (__os_exists(env, str, &isdir) != 0 || !isdir)) {
+ __os_free(env, str);
+ return (ENOENT);
+ }
+ DB_ADDSTR(file);
+ *p = '\0';
+
+ /*
+ * If we're opening a data file, see if it exists. If not, keep
+ * trying.
+ */
+ if (check_file && __os_exists(env, str, NULL) != 0) {
+ __os_free(env, str);
+ return (ENOENT);
+ }
+
+ if (namep == NULL)
+ __os_free(env, str);
+ else
+ *namep = str;
+ return (0);
+}
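+
+/*
+ * For example (illustrative): with env->db_home "/env", dir "data" and
+ * file "a.db", the constructed path is "/env/data/a.db".  Per DB_ADDSTR,
+ * an absolute dir or file resets the buffer, so an absolute file name
+ * wins outright.
+ */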
+
+#define DB_CHECKFILE(file, dir, check_file, check_dir, namep, ret_dir) do { \
+ ret = __db_fullpath(env, dir, file, \
+ check_file, check_dir, namep); \
+ if (ret == 0 && (ret_dir) != NULL) \
+ *(ret_dir) = (dir); \
+ if (ret != ENOENT) \
+ return (ret); \
+} while (0)
+
+/*
+ * __db_appname --
+ * Given an optional DB environment, directory and file name and type
+ * of call, build a path based on the ENV->open rules, and return
+ *	it in allocated space.  Dirp can be used to specify a data directory
+ *	to use.  If it is not set and a data directory is used, *dirp will
+ *	be set to point to the directory name.
+ *
+ * PUBLIC: int __db_appname __P((ENV *, APPNAME,
+ * PUBLIC: const char *, const char **, char **));
+ */
+int
+__db_appname(env, appname, file, dirp, namep)
+ ENV *env;
+ APPNAME appname;
+ const char *file;
+ const char **dirp;
+ char **namep;
+{
+ DB_ENV *dbenv;
+ char **ddp;
+ const char *dir;
+ int ret;
+
+ dbenv = env->dbenv;
+ dir = NULL;
+
+ if (namep != NULL)
+ *namep = NULL;
+
+ /*
+ * Absolute path names are never modified. If the file is an absolute
+ * path, we're done.
+ */
+ if (file != NULL && __os_abspath(file))
+ return (__os_strdup(env, file, namep));
+
+ /*
+ * DB_APP_NONE:
+ * DB_HOME/file
+ * DB_APP_DATA:
+ * DB_HOME/DB_DATA_DIR/file
+ * DB_APP_LOG:
+	 *	DB_HOME/DB_LOG_DIR/file
+	 * DB_APP_META:
+	 *	DB_HOME/DB_METADATA_DIR/file
+	 * DB_APP_TMP:
+	 *	DB_HOME/DB_TMP_DIR/<create>
+ */
+ switch (appname) {
+ case DB_APP_NONE:
+ break;
+ case DB_APP_RECOVER:
+ case DB_APP_DATA:
+ /*
+ * First, step through the data_dir entries, if any, looking
+ * for the file.
+ */
+ if (dbenv != NULL && dbenv->db_data_dir != NULL)
+ for (ddp = dbenv->db_data_dir; *ddp != NULL; ddp++)
+ DB_CHECKFILE(file, *ddp, 1, 0, namep, dirp);
+
+ /* Second, look in the environment home directory. */
+ DB_CHECKFILE(file, NULL, 1, 0, namep, dirp);
+
+ /*
+ * Otherwise, we're going to create. Use the specified
+ * directory unless we're in recovery and it doesn't exist.
+ */
+ if (dirp != NULL && *dirp != NULL)
+ DB_CHECKFILE(file, *dirp, 0,
+ appname == DB_APP_RECOVER, namep, dirp);
+
+ /* Finally, use the create directory, if set. */
+ if (dbenv != NULL && dbenv->db_create_dir != NULL)
+ dir = dbenv->db_create_dir;
+ break;
+ case DB_APP_LOG:
+ if (dbenv != NULL)
+ dir = dbenv->db_log_dir;
+ break;
+ case DB_APP_TMP:
+ if (dbenv != NULL)
+ dir = dbenv->db_tmp_dir;
+ break;
+ case DB_APP_META:
+ if (dbenv != NULL)
+ dir = dbenv->db_md_dir;
+ break;
+ }
+
+ /*
+ * Construct the full path. For temporary files, it is an error if the
+ * directory does not exist: if it doesn't, checking whether millions
+ * of temporary files exist inside it takes a *very* long time.
+ */
+ DB_CHECKFILE(file, dir, 0, appname == DB_APP_TMP, namep, dirp);
+
+ return (ret);
+}
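+
+/*
+ * Illustrative lookup order for DB_APP_DATA: with db_home "/env" and
+ * data directories {"d1", "d2"}, a request for "x.db" probes
+ * "/env/d1/x.db", then "/env/d2/x.db", then "/env/x.db"; if none
+ * exists, the name is built for creation in the create directory (or
+ * the home when none is set).
+ */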
+
+/*
+ * __db_tmp_open --
+ * Create a temporary file.
+ *
+ * PUBLIC: int __db_tmp_open __P((ENV *, u_int32_t, DB_FH **));
+ */
+int
+__db_tmp_open(env, oflags, fhpp)
+ ENV *env;
+ u_int32_t oflags;
+ DB_FH **fhpp;
+{
+ pid_t pid;
+ int filenum, i, ipid, ret;
+ char *path;
+ char *firstx, *trv;
+
+ DB_ASSERT(env, fhpp != NULL);
+ *fhpp = NULL;
+
+#define DB_TRAIL "BDBXXXXX"
+ if ((ret = __db_appname(env, DB_APP_TMP, DB_TRAIL, NULL, &path)) != 0)
+ goto done;
+
+ /* Replace the X's with the process ID (in decimal). */
+ __os_id(env->dbenv, &pid, NULL);
+ ipid = (int)pid;
+ if (ipid < 0)
+ ipid = -ipid;
+ for (trv = path + strlen(path); *--trv == 'X'; ipid /= 10)
+ *trv = '0' + (u_char)(ipid % 10);
+ firstx = trv + 1;
+
+ /* Loop, trying to open a file. */
+ for (filenum = 1;; filenum++) {
+ if ((ret = __os_open(env, path, 0,
+ oflags | DB_OSO_CREATE | DB_OSO_EXCL | DB_OSO_TEMP,
+ DB_MODE_600, fhpp)) == 0) {
+ ret = 0;
+ goto done;
+ }
+
+ /*
+ * !!!:
+ * If we don't get an EEXIST error, then there's something
+ * seriously wrong. Unfortunately, if the implementation
+ * doesn't return EEXIST for O_CREAT and O_EXCL regardless
+ * of other possible errors, we've lost.
+ */
+ if (ret != EEXIST) {
+ __db_err(env, ret, DB_STR_A("1586",
+ "temporary open: %s", "%s"), path);
+ goto done;
+ }
+
+ /*
+ * Generate temporary file names in a backwards-compatible way.
+ * If pid == 12345, the result is:
+ * <path>/DB12345 (tried above, the first time through).
+ * <path>/DBa2345 ... <path>/DBz2345
+ * <path>/DBaa345 ... <path>/DBaz345
+ * <path>/DBba345, and so on.
+ *
+ * XXX
+ * This algorithm is O(n**2) -- that is, creating 100 temporary
+ * files requires 5,000 opens, creating 1000 files requires
+ * 500,000. If applications open a lot of temporary files, we
+ * could improve performance by switching to timestamp-based
+ * file names.
+ */
+ for (i = filenum, trv = firstx; i > 0; i = (i - 1) / 26)
+ if (*trv++ == '\0') {
+ ret = EINVAL;
+ goto done;
+ }
+
+ for (i = filenum; i > 0; i = (i - 1) / 26)
+ *--trv = 'a' + ((i - 1) % 26);
+ }
+done:
+ __os_free(env, path);
+ return (ret);
+}
diff --git a/src/env/env_open.c b/src/env/env_open.c
new file mode 100644
index 00000000..7eddca3a
--- /dev/null
+++ b/src/env/env_open.c
@@ -0,0 +1,1262 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+static int __env_open_arg __P((DB_ENV *, u_int32_t));
+static int __file_handle_cleanup __P((ENV *));
+
+/*
+ * db_version --
+ * Return legacy version information, including DB Major Version,
+ * DB Minor Version, and DB Patch/Build numbers.
+ *
+ * EXTERN: char *db_version __P((int *, int *, int *));
+ */
+char *
+db_version(majverp, minverp, patchp)
+ int *majverp, *minverp, *patchp;
+{
+ if (majverp != NULL)
+ *majverp = DB_VERSION_MAJOR;
+ if (minverp != NULL)
+ *minverp = DB_VERSION_MINOR;
+ if (patchp != NULL)
+ *patchp = DB_VERSION_PATCH;
+ return ((char *)DB_VERSION_STRING);
+}
+
+/*
+ * db_full_version --
+ * Return complete version information, including Oracle Family,
+ * Oracle Release, DB Major Version, DB Minor Version, and DB
+ * Patch/Build numbers.
+ *
+ * EXTERN: char *db_full_version __P((int *, int *, int *, int *, int *));
+ */
+char *
+db_full_version(familyp, releasep, majverp, minverp, patchp)
+ int *familyp, *releasep, *majverp, *minverp, *patchp;
+{
+ if (familyp != NULL)
+ *familyp = DB_VERSION_FAMILY;
+ if (releasep != NULL)
+ *releasep = DB_VERSION_RELEASE;
+ if (majverp != NULL)
+ *majverp = DB_VERSION_MAJOR;
+ if (minverp != NULL)
+ *minverp = DB_VERSION_MINOR;
+ if (patchp != NULL)
+ *patchp = DB_VERSION_PATCH;
+ return ((char *)DB_VERSION_FULL_STRING);
+}
+
+/*
+ * __env_open_pp --
+ * DB_ENV->open pre/post processing.
+ *
+ * PUBLIC: int __env_open_pp __P((DB_ENV *, const char *, u_int32_t, int));
+ */
+int
+__env_open_pp(dbenv, db_home, flags, mode)
+ DB_ENV *dbenv;
+ const char *db_home;
+ u_int32_t flags;
+ int mode;
+{
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->open");
+
+#undef OKFLAGS
+#define OKFLAGS \
+ (DB_CREATE | DB_FAILCHK | DB_FAILCHK_ISALIVE | DB_INIT_CDB | \
+ DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_MPOOL | DB_INIT_REP | \
+ DB_INIT_TXN | DB_LOCKDOWN | DB_NO_CHECKPOINT | DB_PRIVATE | \
+ DB_RECOVER | DB_RECOVER_FATAL | DB_REGISTER | DB_SYSTEM_MEM | \
+ DB_THREAD | DB_USE_ENVIRON | DB_USE_ENVIRON_ROOT)
+#undef OKFLAGS_CDB
+#define OKFLAGS_CDB \
+ (DB_CREATE | DB_INIT_CDB | DB_INIT_MPOOL | DB_LOCKDOWN | \
+ DB_PRIVATE | DB_SYSTEM_MEM | DB_THREAD | \
+ DB_USE_ENVIRON | DB_USE_ENVIRON_ROOT)
+
+ if ((ret = __db_fchk(env, "DB_ENV->open", flags, OKFLAGS)) != 0)
+ return (ret);
+ if ((ret = __db_fcchk(
+ env, "DB_ENV->open", flags, DB_INIT_CDB, ~OKFLAGS_CDB)) != 0)
+ return (ret);
+
+#if defined(HAVE_MIXED_SIZE_ADDRESSING) && (SIZEOF_CHAR_P == 8)
+	if (LF_ISSET(DB_PRIVATE)) {
+ __db_errx(env, DB_STR("1589", "DB_PRIVATE is not "
+ "supported by 64-bit applications in "
+ "mixed-size-addressing mode"));
+ return (EINVAL);
+ }
+#endif
+
+ return (__env_open(dbenv, db_home, flags, mode));
+}
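+
+/*
+ * Usage sketch (illustrative): a typical transactional open; with
+ * DB_REGISTER, recovery only actually runs when the registration code
+ * decides it is needed:
+ *
+ *	dbenv->open(dbenv, "/path/to/home", DB_CREATE | DB_INIT_LOCK |
+ *	    DB_INIT_LOG | DB_INIT_MPOOL | DB_INIT_TXN | DB_REGISTER |
+ *	    DB_RECOVER, 0);
+ */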
+
+/*
+ * __env_open --
+ * DB_ENV->open.
+ *
+ * PUBLIC: int __env_open __P((DB_ENV *, const char *, u_int32_t, int));
+ */
+int
+__env_open(dbenv, db_home, flags, mode)
+ DB_ENV *dbenv;
+ const char *db_home;
+ u_int32_t flags;
+ int mode;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ u_int32_t orig_flags;
+ int register_recovery, ret, t_ret;
+
+ ip = NULL;
+ env = dbenv->env;
+ register_recovery = 0;
+
+ /* Initial configuration. */
+ if ((ret = __env_config(dbenv, db_home, &flags, mode)) != 0)
+ return (ret);
+
+ /*
+ * Save the DB_ENV handle's configuration flags as set by user-called
+ * configuration methods and the environment directory's DB_CONFIG
+ * file. If we use this DB_ENV structure to recover the existing
+ * environment or to remove an environment we created after failure,
+ * we'll restore the DB_ENV flags to these values.
+ */
+ orig_flags = dbenv->flags;
+
+ /* Check open flags. */
+ if ((ret = __env_open_arg(dbenv, flags)) != 0)
+ return (ret);
+
+ /*
+ * If we're going to register with the environment, that's the first
+ * thing we do.
+ */
+ if (LF_ISSET(DB_REGISTER)) {
+ /*
+ * Through the SQL interface (btree.c) we set
+ * DB_FAILCHK_ISALIVE. When set, we want to run failchk
+ * if a recovery is needed. Set up the infrastructure to run
+ * it. SQL applications have no way to specify the thread
+ * count or an isalive, so force it here. Failchk is run
+ * inside of register code.
+ */
+ if (LF_ISSET(DB_FAILCHK_ISALIVE)) {
+ (void)__env_set_thread_count(dbenv, 50);
+ dbenv->is_alive = __envreg_isalive;
+ }
+
+ if ((ret =
+ __envreg_register(env, &register_recovery, flags)) != 0)
+ goto err;
+ if (register_recovery) {
+ if (!LF_ISSET(DB_RECOVER)) {
+ __db_errx(env, DB_STR("1567",
+ "The DB_RECOVER flag was not specified, and recovery is needed"));
+ ret = DB_RUNRECOVERY;
+ goto err;
+ }
+ } else
+ LF_CLR(DB_RECOVER);
+ }
+
+ /*
+ * If we're doing recovery, destroy the environment so that we create
+ * all the regions from scratch. The major concern I have is if the
+ * application stomps the environment with a rogue pointer. We have
+ * no way of detecting that, and we could be forced into a situation
+ * where we start up and then crash, repeatedly.
+ *
+ * We do not check any flags like DB_PRIVATE before calling remove.
+ * We don't care if the current environment was private or not, we
+ * want to remove files left over for any reason, from any session.
+ */
+retry: if (LF_ISSET(DB_RECOVER | DB_RECOVER_FATAL))
+#ifdef HAVE_REPLICATION
+ if ((ret = __rep_reset_init(env)) != 0 ||
+ (ret = __env_remove_env(env)) != 0 ||
+#else
+ if ((ret = __env_remove_env(env)) != 0 ||
+#endif
+ (ret = __env_refresh(dbenv, orig_flags, 0)) != 0)
+ goto err;
+
+ if ((ret = __env_attach_regions(dbenv, flags, orig_flags, 1)) != 0)
+ goto err;
+
+ /*
+	 * After attaching to the env, run failchk if we are not doing
+	 * register recovery.  This pass is skipped for DB_FAILCHK_ISALIVE,
+	 * whose failchk runs inside the register code above.
+ */
+ if (LF_ISSET(DB_FAILCHK) && !register_recovery) {
+ ENV_ENTER(env, ip);
+ if ((ret = __env_failchk_int(dbenv)) != 0)
+ goto err;
+ ENV_LEAVE(env, ip);
+ }
+
+err: if (ret != 0)
+ (void)__env_refresh(dbenv, orig_flags, 0);
+
+ if (register_recovery) {
+ /*
+ * If recovery succeeded, release our exclusive lock, other
+ * processes can now proceed.
+ *
+ * If recovery failed, unregister now and let another process
+ * clean up.
+ */
+ if (ret == 0 && (t_ret = __envreg_xunlock(env)) != 0)
+ ret = t_ret;
+ if (ret != 0)
+ (void)__envreg_unregister(env, 1);
+ }
+
+ /*
+ * If the open is called with DB_REGISTER we can potentially skip
+ * running recovery on a panicked environment. We can't check the panic
+ * bit earlier since checking requires opening the environment.
+	 * Retry (with DB_RECOVER set) only if DB_RECOVER was not specified
+	 * originally; if it was, the register_recovery flag indicates that
+	 * recovery has already been run.
+ */
+ if (ret == DB_RUNRECOVERY && !register_recovery &&
+ !LF_ISSET(DB_RECOVER) && LF_ISSET(DB_REGISTER)) {
+ LF_SET(DB_RECOVER);
+ goto retry;
+ }
+
+ return (ret);
+}
+
+/*
+ * __env_open_arg --
+ * DB_ENV->open flags checking.
+ */
+static int
+__env_open_arg(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+ ret = 0;
+
+ if (LF_ISSET(DB_REGISTER)) {
+ if (!__os_support_db_register()) {
+ __db_errx(env, DB_STR("1568",
+ "Berkeley DB library does not support DB_REGISTER on this system"));
+ return (EINVAL);
+ }
+ if ((ret = __db_fcchk(env, "DB_ENV->open", flags,
+ DB_PRIVATE, DB_REGISTER | DB_SYSTEM_MEM)) != 0)
+ return (ret);
+ if (LF_ISSET(DB_CREATE) && !LF_ISSET(DB_INIT_TXN)) {
+ __db_errx(env, DB_STR("1569",
+ "registration requires transaction support"));
+ return (EINVAL);
+ }
+ }
+ /*
+ * Only check for flags compatible with DB_INIT_REP when creating
+ * since otherwise it'll be ignored anyway.
+ */
+ if (LF_ISSET(DB_INIT_REP) && LF_ISSET(DB_CREATE)) {
+ if (!__os_support_replication()) {
+ __db_errx(env, DB_STR("1570",
+ "Berkeley DB library does not support replication on this system"));
+ return (EINVAL);
+ }
+ if (!LF_ISSET(DB_INIT_LOCK)) {
+ __db_errx(env, DB_STR("1571",
+ "replication requires locking support"));
+ return (EINVAL);
+ }
+ if (!LF_ISSET(DB_INIT_TXN)) {
+ __db_errx(env, DB_STR("1572",
+ "replication requires transaction support"));
+ return (EINVAL);
+ }
+ }
+ if (LF_ISSET(DB_RECOVER | DB_RECOVER_FATAL)) {
+ if ((ret = __db_fcchk(env,
+ "DB_ENV->open", flags, DB_RECOVER, DB_RECOVER_FATAL)) != 0)
+ return (ret);
+ if ((ret = __db_fcchk(env,
+ "DB_ENV->open", flags, DB_REGISTER, DB_RECOVER_FATAL)) != 0)
+ return (ret);
+ if (!LF_ISSET(DB_CREATE)) {
+ __db_errx(env, DB_STR("1573",
+ "recovery requires the create flag"));
+ return (EINVAL);
+ }
+ if (!LF_ISSET(DB_INIT_TXN)) {
+ __db_errx(env, DB_STR("1574",
+ "recovery requires transaction support"));
+ return (EINVAL);
+ }
+ }
+ if (LF_ISSET(DB_FAILCHK)) {
+ if (!ALIVE_ON(env)) {
+ __db_errx(env, DB_STR("1575",
+ "DB_FAILCHK requires DB_ENV->is_alive be configured"));
+ return (EINVAL);
+ }
+ if (dbenv->thr_max == 0) {
+ __db_errx(env, DB_STR("1576",
+ "DB_FAILCHK requires DB_ENV->set_thread_count be configured"));
+ return (EINVAL);
+ }
+ }
+
+#ifdef HAVE_MUTEX_THREAD_ONLY
+ /*
+ * Currently we support one kind of mutex that is intra-process only,
+ * POSIX 1003.1 pthreads, because a variety of systems don't support
+ * the full pthreads API, and our only alternative is test-and-set.
+ */
+ if (!LF_ISSET(DB_PRIVATE)) {
+ __db_errx(env, DB_STR("1577",
+ "Berkeley DB library configured to support only private environments"));
+ return (EINVAL);
+ }
+#endif
+
+#ifdef HAVE_MUTEX_FCNTL
+ /*
+ * !!!
+ * We need a file descriptor for fcntl(2) locking. We use the file
+ * handle from the REGENV file for this purpose.
+ *
+ * Since we may be using shared memory regions, e.g., shmget(2), and
+ * not a mapped-in regular file, the backing file may be only a few
+ * bytes in length. So, this depends on the ability to call fcntl to
+ * lock file offsets much larger than the actual physical file. I
+ * think that's safe -- besides, very few systems actually need this
+ * kind of support, SunOS is the only one still in wide use of which
+ * I'm aware.
+ *
+ * The error case is if an application lacks spinlocks and wants to be
+ * threaded. That doesn't work because fcntl will lock the underlying
+ * process, including all its threads.
+ */
+ if (F_ISSET(env, ENV_THREAD)) {
+ __db_errx(env, DB_STR("1578",
+ "architecture lacks fast mutexes: applications cannot be threaded"));
+ return (EINVAL);
+ }
+#endif
+ return (ret);
+}
+
+/*
+ * __env_remove --
+ * DB_ENV->remove.
+ *
+ * PUBLIC: int __env_remove __P((DB_ENV *, const char *, u_int32_t));
+ */
+int
+__env_remove(dbenv, db_home, flags)
+ DB_ENV *dbenv;
+ const char *db_home;
+ u_int32_t flags;
+{
+ ENV *env;
+ int ret, t_ret;
+
+ env = dbenv->env;
+
+#undef OKFLAGS
+#define OKFLAGS \
+ (DB_FORCE | DB_USE_ENVIRON | DB_USE_ENVIRON_ROOT)
+
+ /* Validate arguments. */
+ if ((ret = __db_fchk(env, "DB_ENV->remove", flags, OKFLAGS)) != 0)
+ return (ret);
+
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->remove");
+
+ if ((ret = __env_config(dbenv, db_home, &flags, 0)) != 0)
+ return (ret);
+
+ /*
+ * Turn the environment off -- if the environment is corrupted, this
+ * could fail. Ignore any error if we're forcing the question.
+ */
+ if ((ret = __env_turn_off(env, flags)) == 0 || LF_ISSET(DB_FORCE))
+ ret = __env_remove_env(env);
+
+ if ((t_ret = __env_close(dbenv, 0)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __env_config --
+ * Argument-based initialization.
+ *
+ * PUBLIC: int __env_config __P((DB_ENV *, const char *, u_int32_t *, int));
+ */
+int
+__env_config(dbenv, db_home, flagsp, mode)
+ DB_ENV *dbenv;
+ const char *db_home;
+ u_int32_t *flagsp;
+ int mode;
+{
+ ENV *env;
+ int ret;
+ u_int32_t flags;
+ char *home, home_buf[DB_MAXPATHLEN];
+
+ env = dbenv->env;
+ flags = *flagsp;
+
+ /*
+ * Set the database home.
+ *
+ * Use db_home by default, this allows utilities to reasonably
+ * override the environment either explicitly or by using a -h
+ * option. Otherwise, use the environment if it's permitted
+ * and initialized.
+ */
+ home = (char *)db_home;
+ if (home == NULL && (LF_ISSET(DB_USE_ENVIRON) ||
+ (LF_ISSET(DB_USE_ENVIRON_ROOT) && __os_isroot()))) {
+ home = home_buf;
+ if ((ret = __os_getenv(
+ env, "DB_HOME", &home, sizeof(home_buf))) != 0)
+ return (ret);
+ /*
+		 * __os_getenv sets home to NULL if it fails to find DB_HOME.
+ */
+ }
+ if (home != NULL) {
+ if (env->db_home != NULL)
+ __os_free(env, env->db_home);
+ if ((ret = __os_strdup(env, home, &env->db_home)) != 0)
+ return (ret);
+ }
+
+ /* Save a copy of the DB_ENV->open method flags. */
+ env->open_flags = flags;
+
+ /* Default permissions are read-write for both owner and group. */
+ env->db_mode = mode == 0 ? DB_MODE_660 : mode;
+
+ /* Read the DB_CONFIG file. */
+ if ((ret = __env_read_db_config(env)) != 0)
+ return (ret);
+
+ /*
+ * Update the DB_ENV->open method flags. The copy of the flags might
+ * have been changed during reading DB_CONFIG file.
+ */
+ flags = env->open_flags;
+
+ /*
+ * If no temporary directory path was specified in the config file,
+ * choose one.
+ */
+ if (dbenv->db_tmp_dir == NULL && (ret = __os_tmpdir(env, flags)) != 0)
+ return (ret);
+
+ *flagsp = flags;
+ return (0);
+}
+
+/*
+ * __env_close_pp --
+ * DB_ENV->close pre/post processor.
+ *
+ * PUBLIC: int __env_close_pp __P((DB_ENV *, u_int32_t));
+ */
+int
+__env_close_pp(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int rep_check, ret, t_ret;
+ u_int32_t close_flags, flags_orig;
+
+ env = dbenv->env;
+ ret = 0;
+ close_flags = flags_orig = 0;
+
+ /*
+ * Validate arguments, but as a DB_ENV handle destructor, we can't
+ * fail.
+ */
+ if (flags != 0 && flags != DB_FORCESYNC &&
+ (t_ret = __db_ferr(env, "DB_ENV->close", 0)) != 0 && ret == 0)
+ ret = t_ret;
+
+#define DBENV_FORCESYNC 0x00000001
+#define DBENV_CLOSE_REPCHECK 0x00000010
+ if (flags == DB_FORCESYNC)
+ close_flags |= DBENV_FORCESYNC;
+
+ /*
+ * If the environment has panic'd, all we do is try and discard
+ * the important resources.
+ */
+ if (PANIC_ISSET(env)) {
+ /* clean up from registry file */
+ if (dbenv->registry != NULL) {
+ /*
+ * Temporarily set no panic so we do not trigger the
+ * LAST_PANIC_CHECK_BEFORE_IO check in __os_physwr
+ * thus allowing the unregister to happen correctly.
+ */
+ flags_orig = F_ISSET(dbenv, DB_ENV_NOPANIC);
+ F_SET(dbenv, DB_ENV_NOPANIC);
+ (void)__envreg_unregister(env, 0);
+ dbenv->registry = NULL;
+ if (!flags_orig)
+ F_CLR(dbenv, DB_ENV_NOPANIC);
+ }
+
+ /* Close all underlying threads and sockets. */
+ if (IS_ENV_REPLICATED(env))
+ (void)__repmgr_close(env);
+
+ /* Close all underlying file handles. */
+ (void)__file_handle_cleanup(env);
+
+ PANIC_CHECK(env);
+ }
+
+ ENV_ENTER(env, ip);
+
+ rep_check = IS_ENV_REPLICATED(env) ? 1 : 0;
+ if (rep_check) {
+#ifdef HAVE_REPLICATION_THREADS
+ /*
+ * Shut down Replication Manager threads first of all. This
+ * must be done before __env_rep_enter to avoid a deadlock that
+ * could occur if repmgr's background threads try to do a rep
+ * operation that needs __rep_lockout.
+ */
+ if ((t_ret = __repmgr_close(env)) != 0 && ret == 0)
+ ret = t_ret;
+#endif
+ if ((t_ret = __env_rep_enter(env, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+
+ if (rep_check)
+ close_flags |= DBENV_CLOSE_REPCHECK;
+ if ((t_ret = __env_close(dbenv, close_flags)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Don't ENV_LEAVE as we have already detached from the region. */
+ return (ret);
+}
+
+/*
+ * __env_close --
+ * DB_ENV->close.
+ *
+ * PUBLIC: int __env_close __P((DB_ENV *, u_int32_t));
+ */
+int
+__env_close(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ DB *dbp;
+ ENV *env;
+ int ret, rep_check, t_ret;
+ char **p;
+ u_int32_t close_flags;
+
+ env = dbenv->env;
+ ret = 0;
+ close_flags = LF_ISSET(DBENV_FORCESYNC) ? 0 : DB_NOSYNC;
+ rep_check = LF_ISSET(DBENV_CLOSE_REPCHECK);
+
+ /*
+ * Check to see if we were in the middle of restoring transactions and
+ * need to close the open files.
+ */
+ if (TXN_ON(env) && (t_ret = __txn_preclose(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+#ifdef HAVE_REPLICATION
+ if ((t_ret = __rep_env_close(env)) != 0 && ret == 0)
+ ret = t_ret;
+#endif
+
+ /*
+ * Close all databases opened in this environment after the rep region
+ * is closed. Rep region's internal database is already closed now.
+ */
+ while ((dbp = TAILQ_FIRST(&env->dblist)) != NULL) {
+ /*
+ * Do not close the handle on a database partition, since it
+ * will be closed when closing the handle on the main database.
+ */
+ while (dbp != NULL && F_ISSET(dbp, DB_AM_PARTDB))
+ dbp = TAILQ_NEXT(dbp, dblistlinks);
+ DB_ASSERT(env, dbp != NULL);
+ /*
+		 * Record the error code but keep going: we can't do
+		 * anything about the dbp handle if the close operation
+		 * fails, yet we still want to return the first error to
+		 * the caller.  That is how this function handles errors
+		 * from all of the close operations.
+ */
+ if (dbp->alt_close != NULL)
+ t_ret = dbp->alt_close(dbp, close_flags);
+ else
+ t_ret = __db_close(dbp, NULL, close_flags);
+ if (t_ret != 0 && ret == 0)
+ ret = t_ret;
+ }
+
+ /*
+ * Detach from the regions and undo the allocations done by
+ * DB_ENV->open.
+ */
+ if ((t_ret = __env_refresh(dbenv, 0, rep_check)) != 0 && ret == 0)
+ ret = t_ret;
+
+#ifdef HAVE_CRYPTO
+ /*
+ * Crypto comes last, because higher level close functions need
+ * cryptography.
+ */
+ if ((t_ret = __crypto_env_close(env)) != 0 && ret == 0)
+ ret = t_ret;
+#endif
+
+ /* If we're registered, clean up. */
+ if (dbenv->registry != NULL) {
+ (void)__envreg_unregister(env, 0);
+ dbenv->registry = NULL;
+ }
+
+ /* Check we've closed all underlying file handles. */
+ if ((t_ret = __file_handle_cleanup(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Release any string-based configuration parameters we've copied. */
+ if (dbenv->db_log_dir != NULL)
+ __os_free(env, dbenv->db_log_dir);
+ dbenv->db_log_dir = NULL;
+ if (dbenv->db_tmp_dir != NULL)
+ __os_free(env, dbenv->db_tmp_dir);
+ dbenv->db_tmp_dir = NULL;
+ if (dbenv->db_md_dir != NULL)
+ __os_free(env, dbenv->db_md_dir);
+ dbenv->db_md_dir = NULL;
+ if (dbenv->db_data_dir != NULL) {
+ for (p = dbenv->db_data_dir; *p != NULL; ++p)
+ __os_free(env, *p);
+ __os_free(env, dbenv->db_data_dir);
+ dbenv->db_data_dir = NULL;
+ dbenv->data_next = 0;
+ }
+ if (dbenv->intermediate_dir_mode != NULL)
+ __os_free(env, dbenv->intermediate_dir_mode);
+ if (env->db_home != NULL) {
+ __os_free(env, env->db_home);
+ env->db_home = NULL;
+ }
+
+ if (env->backup_handle != NULL) {
+ __os_free(env, env->backup_handle);
+ env->backup_handle = NULL;
+ }
+
+ /* Discard the structure. */
+ __db_env_destroy(dbenv);
+
+ return (ret);
+}
+
+/*
+ * __env_refresh --
+ * Refresh the DB_ENV structure.
+ * PUBLIC: int __env_refresh __P((DB_ENV *, u_int32_t, int));
+ */
+int
+__env_refresh(dbenv, orig_flags, rep_check)
+ DB_ENV *dbenv;
+ u_int32_t orig_flags;
+ int rep_check;
+{
+ DB *ldbp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret, t_ret;
+
+ env = dbenv->env;
+ ret = 0;
+
+ /*
+ * Release resources allocated by DB_ENV->open, and return it to the
+ * state it was in just before __env_open was called. (This means
+ * state set by pre-open configuration functions must be preserved.)
+ *
+ * Refresh subsystems, in the reverse order they were opened (txn
+ * must be first, it may want to discard locks and flush the log).
+ *
+ * !!!
+ * Note that these functions, like all of __env_refresh, only undo
+ * the effects of __env_open. Functions that undo work done by
+ * db_env_create or by a configuration function should go in
+ * __env_close.
+ */
+ if (TXN_ON(env) &&
+ (t_ret = __txn_env_refresh(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (LOGGING_ON(env) &&
+ (t_ret = __log_env_refresh(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /*
+ * Locking should come after logging, because closing log results
+ * in files closing which may require locks being released.
+ */
+ if (LOCKING_ON(env)) {
+ if (!F_ISSET(env, ENV_THREAD) &&
+ env->env_lref != NULL && (t_ret =
+ __lock_id_free(env, env->env_lref)) != 0 && ret == 0)
+ ret = t_ret;
+ env->env_lref = NULL;
+
+ if ((t_ret = __lock_env_refresh(env)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+
+ /* Discard the DB_ENV, ENV handle mutexes. */
+ if ((t_ret = __mutex_free(env, &dbenv->mtx_db_env)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __mutex_free(env, &env->mtx_env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /*
+ * Discard DB list and its mutex.
+ * Discard the MT mutex.
+ *
+ * !!!
+ * This must be done after we close the log region, because we close
+ * database handles and so acquire this mutex when we close log file
+ * handles.
+ */
+ if (env->db_ref != 0) {
+ __db_errx(env, DB_STR("1579",
+ "Database handles still open at environment close"));
+ TAILQ_FOREACH(ldbp, &env->dblist, dblistlinks)
+ __db_errx(env, DB_STR_A("1580",
+ "Open database handle: %s%s%s", "%s %s %s"),
+ ldbp->fname == NULL ? "unnamed" : ldbp->fname,
+ ldbp->dname == NULL ? "" : "/",
+ ldbp->dname == NULL ? "" : ldbp->dname);
+ if (ret == 0)
+ ret = EINVAL;
+ }
+ TAILQ_INIT(&env->dblist);
+ if ((t_ret = __mutex_free(env, &env->mtx_dblist)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __mutex_free(env, &env->mtx_mt)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (env->mt != NULL) {
+ __os_free(env, env->mt);
+ env->mt = NULL;
+ }
+
+ if (MPOOL_ON(env)) {
+ /*
+ * If it's a private environment, flush the contents to disk.
+ * Recovery would have put everything back together, but it's
+ * faster and cleaner to flush instead.
+ *
+ * Ignore application max-write configuration, we're shutting
+ * down.
+ */
+ if (F_ISSET(env, ENV_PRIVATE) &&
+ !F_ISSET(dbenv, DB_ENV_NOFLUSH) &&
+ (t_ret = __memp_sync_int(env, NULL, 0,
+ DB_SYNC_CACHE | DB_SYNC_SUPPRESS_WRITE, NULL, NULL)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+
+ if ((t_ret = __memp_env_refresh(env)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+
+ /*
+ * If we're included in a shared replication handle count, this
+ * is our last chance to decrement that count.
+ *
+ * !!!
+ * We can't afford to do anything dangerous after we decrement the
+ * handle count, of course, as replication may be proceeding with
+ * client recovery. However, since we're discarding the regions
+ * as soon as we drop the handle count, there's little opportunity
+ * to do harm.
+ */
+ if (rep_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /*
+ * Refresh the replication region.
+ *
+ * Must come after we call __env_db_rep_exit above.
+ */
+ if (REP_ON(env) && (t_ret = __rep_env_refresh(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+#ifdef HAVE_CRYPTO
+ /*
+ * Crypto comes last, because higher level close functions need
+ * cryptography.
+ */
+ if (env->reginfo != NULL &&
+ (t_ret = __crypto_env_refresh(env)) != 0 && ret == 0)
+ ret = t_ret;
+#endif
+
+ /*
+ * Mark the thread as out of the env before we get rid of the handles
+ * needed to do so.
+ */
+ if (env->thr_hashtab != NULL &&
+ (t_ret = __env_set_state(env, &ip, THREAD_OUT)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /*
+ * We are about to detach from the mutex region. This is the last
+ * chance we have to acquire/destroy a mutex -- acquire/destroy the
+ * mutex and release our reference.
+ *
+ * !!!
+ * There are two DbEnv methods that care about environment reference
+ * counts: DbEnv.close and DbEnv.remove. The DbEnv.close method is
+ * not a problem because it only decrements the reference count and
+ * no actual resources are discarded -- lots of threads of control
+ * can call DbEnv.close at the same time, and regardless of racing
+ * on the reference count mutex, we wouldn't have a problem. Since
+ * the DbEnv.remove method actually discards resources, we can have
+ * a problem.
+ *
+ * If we decrement the reference count to 0 here, go to sleep, and
+ * the DbEnv.remove method is called, by the time we run again, the
+ * underlying shared regions could have been removed. That's fine,
+ * except we might actually need the regions to resolve outstanding
+ * operations in the various subsystems, and if we don't have hard
+ * OS references to the regions, we could get screwed. Of course,
+ * we should have hard OS references to everything we need, but just
+ * in case, we put off decrementing the reference count as long as
+ * possible.
+ */
+ if ((t_ret = __env_ref_decrement(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+#ifdef HAVE_MUTEX_SUPPORT
+ if (MUTEX_ON(env) &&
+ (t_ret = __mutex_env_refresh(env)) != 0 && ret == 0)
+ ret = t_ret;
+#endif
+ /* Free memory for thread tracking. */
+ if (env->reginfo != NULL) {
+ if (F_ISSET(env, ENV_PRIVATE)) {
+ __env_thread_destroy(env);
+ t_ret = __env_detach(env, 1);
+ } else
+ t_ret = __env_detach(env, 0);
+
+ if (t_ret != 0 && ret == 0)
+ ret = t_ret;
+
+ /*
+ * !!!
+ * Don't free env->reginfo or set the reference to NULL,
+ * that was done by __env_detach().
+ */
+ }
+
+ if (env->recover_dtab.int_dispatch != NULL) {
+ __os_free(env, env->recover_dtab.int_dispatch);
+ env->recover_dtab.int_size = 0;
+ env->recover_dtab.int_dispatch = NULL;
+ }
+ if (env->recover_dtab.ext_dispatch != NULL) {
+ __os_free(env, env->recover_dtab.ext_dispatch);
+ env->recover_dtab.ext_size = 0;
+ env->recover_dtab.ext_dispatch = NULL;
+ }
+
+ dbenv->flags = orig_flags;
+
+ return (ret);
+}
+
+/*
+ * __file_handle_cleanup --
+ * Close any underlying open file handles so we don't leak system
+ * resources.
+ */
+static int
+__file_handle_cleanup(env)
+ ENV *env;
+{
+ DB_FH *fhp;
+
+ if (TAILQ_FIRST(&env->fdlist) == NULL)
+ return (0);
+
+ __db_errx(env, DB_STR("1581",
+ "File handles still open at environment close"));
+ while ((fhp = TAILQ_FIRST(&env->fdlist)) != NULL) {
+ __db_errx(env, DB_STR_A("1582", "Open file handle: %s", "%s"),
+ fhp->name);
+ (void)__os_closehandle(env, fhp);
+ }
+ return (EINVAL);
+}
+
+/*
+ * __env_get_open_flags --
+ * DbEnv.get_open_flags method.
+ *
+ * PUBLIC: int __env_get_open_flags __P((DB_ENV *, u_int32_t *));
+ */
+int
+__env_get_open_flags(dbenv, flagsp)
+ DB_ENV *dbenv;
+ u_int32_t *flagsp;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_BEFORE_OPEN(env, "DB_ENV->get_open_flags");
+
+ *flagsp = env->open_flags;
+ return (0);
+}
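+
+/*
+ * Example (an illustrative sketch, not library code): an application that
+ * joined an existing environment can ask which subsystems it actually got,
+ * e.g. to decide whether a transactional code path is available:
+ *
+ *	u_int32_t oflags;
+ *	int ret;
+ *
+ *	if ((ret = dbenv->get_open_flags(dbenv, &oflags)) == 0 &&
+ *	    (oflags & DB_INIT_TXN) != 0)
+ *		... the environment was opened with transactions ...
+ */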
+/*
+ * __env_attach_regions --
+ *	Perform attaches to env and required regions (subsystems).
+ *
+ * PUBLIC: int __env_attach_regions __P((DB_ENV *, u_int32_t, u_int32_t, int));
+ */
+int
+__env_attach_regions(dbenv, flags, orig_flags, retry_ok)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+ u_int32_t orig_flags;
+ int retry_ok;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ REGINFO *infop;
+ u_int32_t init_flags;
+ int create_ok, rep_check, ret;
+
+ ip = NULL;
+ env = dbenv->env;
+ rep_check = 0;
+
+ /* Convert the DB_ENV->open flags to internal flags. */
+ create_ok = LF_ISSET(DB_CREATE) ? 1 : 0;
+ if (LF_ISSET(DB_LOCKDOWN))
+ F_SET(env, ENV_LOCKDOWN);
+ if (LF_ISSET(DB_PRIVATE))
+ F_SET(env, ENV_PRIVATE);
+ if (LF_ISSET(DB_RECOVER_FATAL))
+ F_SET(env, ENV_RECOVER_FATAL);
+ if (LF_ISSET(DB_SYSTEM_MEM))
+ F_SET(env, ENV_SYSTEM_MEM);
+ if (LF_ISSET(DB_THREAD))
+ F_SET(env, ENV_THREAD);
+
+ /*
+ * Create/join the environment. We pass in the flags of interest to
+ * a thread subsequently joining an environment we create. If we're
+ * not the ones to create the environment, our flags will be updated
+ * to match the existing environment.
+ */
+ init_flags = 0;
+ if (LF_ISSET(DB_INIT_CDB))
+ FLD_SET(init_flags, DB_INITENV_CDB);
+ if (F_ISSET(dbenv, DB_ENV_CDB_ALLDB))
+ FLD_SET(init_flags, DB_INITENV_CDB_ALLDB);
+ if (LF_ISSET(DB_INIT_LOCK))
+ FLD_SET(init_flags, DB_INITENV_LOCK);
+ if (LF_ISSET(DB_INIT_LOG))
+ FLD_SET(init_flags, DB_INITENV_LOG);
+ if (LF_ISSET(DB_INIT_MPOOL))
+ FLD_SET(init_flags, DB_INITENV_MPOOL);
+ if (LF_ISSET(DB_INIT_REP))
+ FLD_SET(init_flags, DB_INITENV_REP);
+ if (LF_ISSET(DB_INIT_TXN))
+ FLD_SET(init_flags, DB_INITENV_TXN);
+ if ((ret = __env_attach(env, &init_flags, create_ok, retry_ok)) != 0)
+ goto err;
+
+ /*
+ * __env_attach will return the saved init_flags field, which contains
+ * the DB_INIT_* flags used when the environment was created.
+ *
+ * We may be joining an environment -- reset our flags to match the
+ * ones in the environment.
+ */
+ if (FLD_ISSET(init_flags, DB_INITENV_CDB))
+ LF_SET(DB_INIT_CDB);
+ if (FLD_ISSET(init_flags, DB_INITENV_LOCK))
+ LF_SET(DB_INIT_LOCK);
+ if (FLD_ISSET(init_flags, DB_INITENV_LOG))
+ LF_SET(DB_INIT_LOG);
+ if (FLD_ISSET(init_flags, DB_INITENV_MPOOL))
+ LF_SET(DB_INIT_MPOOL);
+ if (FLD_ISSET(init_flags, DB_INITENV_REP))
+ LF_SET(DB_INIT_REP);
+ if (FLD_ISSET(init_flags, DB_INITENV_TXN))
+ LF_SET(DB_INIT_TXN);
+ if (FLD_ISSET(init_flags, DB_INITENV_CDB_ALLDB) &&
+ (ret = __env_set_flags(dbenv, DB_CDB_ALLDB, 1)) != 0)
+ goto err;
+
+ /* Initialize for CDB product. */
+ if (LF_ISSET(DB_INIT_CDB)) {
+ LF_SET(DB_INIT_LOCK);
+ F_SET(env, ENV_CDB);
+ }
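+
+	/*
+	 * For example (a sketch): a handle joining with
+	 * dbenv->open(dbenv, home, DB_JOINENV, 0) arrives here with none
+	 * of the DB_INIT_* flags set; __env_attach returns the creator's
+	 * saved DB_INITENV_* bits and the LF_SET calls above rebuild the
+	 * corresponding DB_INIT_* flags in this handle.
+	 */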
+
+ /*
+ * Update the flags to match the database environment. The application
+ * may have specified flags of 0 to join the environment, and this line
+ * replaces that value with the flags corresponding to the existing,
+ * underlying set of subsystems. This means the DbEnv.get_open_flags
+ * method returns the flags to open the existing environment instead of
+ * the specific flags passed to the DbEnv.open method.
+ */
+ env->open_flags = flags;
+
+ /*
+ * The DB_ENV structure has now been initialized. Turn off further
+ * use of the DB_ENV structure and most initialization methods, we're
+ * about to act on the values we currently have.
+ */
+ F_SET(env, ENV_OPEN_CALLED);
+
+ infop = env->reginfo;
+
+#ifdef HAVE_MUTEX_SUPPORT
+ /*
+ * Initialize the mutex regions first before ENV_ENTER().
+ * Mutexes need to be 'on' when attaching to an existing env
+ * in order to safely allocate the thread tracking info.
+ */
+ if ((ret = __mutex_open(env, create_ok)) != 0)
+ goto err;
+ /* The MUTEX_REQUIRED() in __env_alloc() expects this to be set. */
+ infop->mtx_alloc = ((REGENV *)infop->primary)->mtx_regenv;
+#endif
+ /*
+ * Initialize thread tracking and enter the API.
+ */
+ if ((ret =
+ __env_thread_init(env, F_ISSET(infop, REGION_CREATE) ? 1 : 0)) != 0)
+ goto err;
+
+ ENV_ENTER(env, ip);
+
+ /*
+ * Initialize the subsystems.
+ */
+ /*
+ * We can now acquire/create mutexes: increment the region's reference
+ * count.
+ */
+ if ((ret = __env_ref_increment(env)) != 0)
+ goto err;
+
+ /*
+ * Initialize the handle mutexes.
+ */
+ if ((ret = __mutex_alloc(env,
+ MTX_ENV_HANDLE, DB_MUTEX_PROCESS_ONLY, &dbenv->mtx_db_env)) != 0 ||
+ (ret = __mutex_alloc(env,
+ MTX_ENV_HANDLE, DB_MUTEX_PROCESS_ONLY, &env->mtx_env)) != 0)
+ goto err;
+
+ /*
+ * Initialize the replication area next, so that we can lock out this
+ * call if we're currently running recovery for replication.
+ */
+ if (LF_ISSET(DB_INIT_REP) && (ret = __rep_open(env)) != 0)
+ goto err;
+
+ rep_check = IS_ENV_REPLICATED(env) ? 1 : 0;
+ if (rep_check && (ret = __env_rep_enter(env, 0)) != 0)
+ goto err;
+
+ if (LF_ISSET(DB_INIT_MPOOL)) {
+ if ((ret = __memp_open(env, create_ok)) != 0)
+ goto err;
+
+ /*
+ * BDB does do cache I/O during recovery and when starting up
+ * replication. If creating a new environment, then suppress
+ * any application max-write configuration.
+ */
+ if (create_ok)
+ (void)__memp_set_config(
+ dbenv, DB_MEMP_SUPPRESS_WRITE, 1);
+
+ /*
+ * Initialize the DB list and its mutex. If the mpool is
+ * not initialized, we can't ever open a DB handle, which
+ * is why this code lives here.
+ */
+ TAILQ_INIT(&env->dblist);
+ if ((ret = __mutex_alloc(env, MTX_ENV_DBLIST,
+ DB_MUTEX_PROCESS_ONLY, &env->mtx_dblist)) != 0)
+ goto err;
+
+ /* Register DB's pgin/pgout functions. */
+ if ((ret = __memp_register(
+ env, DB_FTYPE_SET, __db_pgin, __db_pgout)) != 0)
+ goto err;
+ }
+
+ /*
+ * Initialize the ciphering area prior to any running of recovery so
+ * that we can initialize the keys, etc. before recovery, including
+ * the MT mutex.
+ *
+ * !!!
+ * This must be after the mpool init, but before the log initialization
+ * because log_open may attempt to run log_recover during its open.
+ */
+ if (LF_ISSET(DB_INIT_MPOOL | DB_INIT_LOG | DB_INIT_TXN) &&
+ (ret = __crypto_region_init(env)) != 0)
+ goto err;
+ if ((ret = __mutex_alloc(
+ env, MTX_TWISTER, DB_MUTEX_PROCESS_ONLY, &env->mtx_mt)) != 0)
+ goto err;
+
+ /*
+ * Transactions imply logging but do not imply locking. While almost
+ * all applications want both locking and logging, it would not be
+ * unreasonable for a single threaded process to want transactions for
+ * atomicity guarantees, but not necessarily need concurrency.
+ */
+ if (LF_ISSET(DB_INIT_LOG | DB_INIT_TXN))
+ if ((ret = __log_open(env)) != 0)
+ goto err;
+ if (LF_ISSET(DB_INIT_LOCK))
+ if ((ret = __lock_open(env)) != 0)
+ goto err;
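+	/*
+	 * As an illustration (a sketch, not library code), a
+	 * single-threaded application wanting atomicity without
+	 * concurrency might open its environment with
+	 *
+	 *	dbenv->open(dbenv, home,
+	 *	    DB_CREATE | DB_INIT_MPOOL | DB_INIT_LOG | DB_INIT_TXN, 0);
+	 *
+	 * in which case __log_open runs above and __txn_open just below,
+	 * while __lock_open is skipped.
+	 */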
+
+ if (LF_ISSET(DB_INIT_TXN)) {
+ if ((ret = __txn_open(env)) != 0)
+ goto err;
+
+ /*
+ * If the application is running with transactions, initialize
+ * the function tables.
+ */
+ if ((ret = __env_init_rec(env,
+ ((LOG *)env->lg_handle->reginfo.primary)->persist.version))
+ != 0)
+ goto err;
+ }
+
+ /* Perform recovery for any previous run. */
+ if (LF_ISSET(DB_RECOVER | DB_RECOVER_FATAL) &&
+ (ret = __db_apprec(env, ip, NULL, NULL, 1,
+ LF_ISSET(DB_RECOVER | DB_RECOVER_FATAL | DB_NO_CHECKPOINT))) != 0)
+ goto err;
+
+ /*
+ * If we've created the regions, are running with transactions, and did
+ * not just run recovery, we need to log the fact that the transaction
+ * IDs got reset.
+ *
+ * If we ran recovery, there may be prepared-but-not-yet-committed
+ * transactions that need to be resolved. Recovery resets the minimum
+ * transaction ID and logs the reset if that's appropriate, so we
+ * don't need to do anything here in the recover case.
+ */
+ if (TXN_ON(env) &&
+ !FLD_ISSET(dbenv->lg_flags, DB_LOG_IN_MEMORY) &&
+ F_ISSET(infop, REGION_CREATE) &&
+ !LF_ISSET(DB_RECOVER | DB_RECOVER_FATAL) &&
+ (ret = __txn_reset(env)) != 0)
+ goto err;
+
+ /* The database environment is ready for business. */
+ if ((ret = __env_turn_on(env)) != 0)
+ goto err;
+
+ if (rep_check)
+ ret = __env_db_rep_exit(env);
+
+ /* Turn any application-specific max-write configuration back on. */
+ if (LF_ISSET(DB_INIT_MPOOL))
+ (void)__memp_set_config(dbenv, DB_MEMP_SUPPRESS_WRITE, 0);
+
+err: if (ret == 0)
+ ENV_LEAVE(env, ip);
+ else {
+ /*
+ * If we fail after creating regions, panic and remove them.
+ *
+ * !!!
+ * No need to call __env_db_rep_exit, that work is done by the
+ * calls to __env_refresh.
+ */
+ infop = env->reginfo;
+ if (infop != NULL && F_ISSET(infop, REGION_CREATE)) {
+ ret = __env_panic(env, ret);
+
+			/* Refresh the DB_ENV so we can use it to call remove. */
+ (void)__env_refresh(dbenv, orig_flags, rep_check);
+ (void)__env_remove_env(env);
+ (void)__env_refresh(dbenv, orig_flags, 0);
+ } else
+ (void)__env_refresh(dbenv, orig_flags, rep_check);
+		/* Clear the fact that the region had been opened. */
+ F_CLR(env, ENV_OPEN_CALLED);
+ }
+
+ return (ret);
+}
diff --git a/src/env/env_recover.c b/src/env/env_recover.c
new file mode 100644
index 00000000..9636554a
--- /dev/null
+++ b/src/env/env_recover.c
@@ -0,0 +1,1093 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/fop.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/heap.h"
+#include "dbinc/mp.h"
+#include "dbinc/qam.h"
+#include "dbinc/txn.h"
+
+#ifndef lint
+static const char copyright[] =
+ "Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.\n";
+#endif
+
+static int __db_log_corrupt __P((ENV *, DB_LSN *));
+static int __env_init_rec_42 __P((ENV *));
+static int __env_init_rec_43 __P((ENV *));
+static int __env_init_rec_46 __P((ENV *));
+static int __env_init_rec_47 __P((ENV *));
+static int __env_init_rec_48 __P((ENV *));
+static int __log_earliest __P((ENV *, DB_LOGC *, int32_t *, DB_LSN *));
+
+static double __lsn_diff __P((DB_LSN *, DB_LSN *, DB_LSN *, u_int32_t, int));
+static int __log_backup __P((ENV *, DB_LOGC *, DB_LSN *, DB_LSN*));
+
+/*
+ * __db_apprec --
+ * Perform recovery. If max_lsn is non-NULL, then we are trying
+ * to synchronize this system up with another system that has a max
+ * LSN of max_lsn, so we need to roll back sufficiently far for that
+ * to work. See __log_backup for details.
+ *
+ * PUBLIC: int __db_apprec __P((ENV *,
+ * PUBLIC: DB_THREAD_INFO *, DB_LSN *, DB_LSN *, int, u_int32_t));
+ */
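+/*
+ * A typical (illustrative) route into this function: the application
+ * passes DB_RECOVER -- or DB_RECOVER_FATAL for catastrophic recovery --
+ * to DB_ENV->open, for example
+ *
+ *	dbenv->open(dbenv, home, DB_CREATE | DB_INIT_LOG | DB_INIT_MPOOL |
+ *	    DB_INIT_TXN | DB_RECOVER, 0);
+ *
+ * and __env_attach_regions calls __db_apprec with max_lsn == NULL.
+ */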
+int
+__db_apprec(env, ip, max_lsn, trunclsn, update, flags)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ DB_LSN *max_lsn, *trunclsn;
+ int update;
+ u_int32_t flags;
+{
+ DBT data;
+ DB_ENV *dbenv;
+ DB_LOGC *logc;
+ DB_LSN ckp_lsn, first_lsn, last_lsn, lowlsn, lsn, stop_lsn, tlsn;
+ DB_LSN *vtrunc_ckp, *vtrunc_lsn;
+ DB_TXNHEAD *txninfo;
+ DB_TXNREGION *region;
+ REGENV *renv;
+ REGINFO *infop;
+ __txn_ckp_args *ckp_args;
+ time_t now, tlow;
+ double nfiles;
+ u_int32_t hi_txn, log_size, txnid;
+ int32_t low;
+ int all_recovered, progress, rectype, ret, t_ret;
+ char *p, *pass;
+ char t1[CTIME_BUFLEN], t2[CTIME_BUFLEN], time_buf[CTIME_BUFLEN];
+
+ COMPQUIET(nfiles, (double)0.001);
+
+ dbenv = env->dbenv;
+ logc = NULL;
+ ckp_args = NULL;
+ hi_txn = TXN_MAXIMUM;
+ txninfo = NULL;
+ pass = DB_STR_P("initial");
+ ZERO_LSN(lsn);
+
+ /*
+ * XXX
+ * Get the log size. No locking required because we're single-threaded
+ * during recovery.
+ */
+ log_size = ((LOG *)env->lg_handle->reginfo.primary)->log_size;
+
+ /*
+ * If we need to, update the env handle timestamp.
+ */
+ if (update && REP_ON(env)) {
+ infop = env->reginfo;
+ renv = infop->primary;
+ (void)time(&renv->rep_timestamp);
+ }
+
+ /* Set in-recovery flags. */
+ F_SET(env->lg_handle, DBLOG_RECOVER);
+ region = env->tx_handle->reginfo.primary;
+ F_SET(region, TXN_IN_RECOVERY);
+
+ /* Allocate a cursor for the log. */
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ goto err;
+
+ /*
+ * If the user is specifying recovery to a particular point in time
+ * or to a particular LSN, find the point to start recovery from.
+ */
+ ZERO_LSN(lowlsn);
+ if (max_lsn != NULL) {
+ if ((ret = __log_backup(env, logc, max_lsn, &lowlsn)) != 0)
+ goto err;
+ } else if (dbenv->tx_timestamp != 0) {
+ if ((ret = __log_earliest(env, logc, &low, &lowlsn)) != 0)
+ goto err;
+ if ((int32_t)dbenv->tx_timestamp < low) {
+ t1[sizeof(t1) - 1] = '\0';
+ (void)strncpy(t1, __os_ctime(
+ &dbenv->tx_timestamp, time_buf), sizeof(t1) - 1);
+ if ((p = strchr(t1, '\n')) != NULL)
+ *p = '\0';
+
+ t2[sizeof(t2) - 1] = '\0';
+ tlow = (time_t)low;
+ (void)strncpy(t2, __os_ctime(
+ &tlow, time_buf), sizeof(t2) - 1);
+ if ((p = strchr(t2, '\n')) != NULL)
+ *p = '\0';
+
+ __db_errx(env, DB_STR_A("1509",
+ "Invalid recovery timestamp %s; earliest time is %s",
+ "%s %s"), t1, t2);
+ ret = EINVAL;
+ goto err;
+ }
+ }
+
+ /*
+ * Recovery is done in three passes:
+ * Pass #0:
+ * We need to find the position from which we will open files.
+ * We need to open files beginning with the earlier of the
+ * most recent checkpoint LSN and a checkpoint LSN before the
+ * recovery timestamp, if specified. We need to be before the
+	 *	most recent checkpoint LSN because we are going to collect
+	 *	information about which transactions were begun before we
+	 *	start rolling forward.  Those that were must never be undone,
+	 *	because the queue access method cannot use LSNs to determine
+	 *	which operations can safely be aborted, and it cannot roll
+	 *	back operations in transactions for which there may be records
+	 *	not processed during recovery.  We need to consider earlier
+	 *	points in time in case we are recovering to a particular
+	 *	timestamp.
+ *
+ * Pass #1:
+ * Read forward through the log from the position found in pass 0
+ * opening and closing files, and recording transactions for which
+ * we've seen their first record (the transaction's prev_lsn is
+ * 0,0). At the end of this pass, we know all transactions for
+ * which we've seen begins and we have the "current" set of files
+ * open.
+ *
+ * Pass #2:
+ * Read backward through the log undoing any uncompleted TXNs.
+ * There are four cases:
+	 *	    1. If doing catastrophic recovery, we read to the
+	 *		beginning of the log.
+	 *	    2. If we are doing normal recovery, then we have to roll
+	 *		back to the most recent checkpoint LSN.
+ * 3. If we are recovering to a point in time, then we have
+ * to roll back to the checkpoint whose ckp_lsn is earlier
+ * than the specified time. __log_earliest will figure
+ * this out for us.
+ * 4. If we are recovering back to a particular LSN, then
+ * we have to roll back to the checkpoint whose ckp_lsn
+ * is earlier than the max_lsn. __log_backup will figure
+ * that out for us.
+	 *	In case 3, "uncompleted TXNs" also include all those that
+	 *	committed after the user's specified timestamp.
+ *
+ * Pass #3:
+ * Read forward through the log from the LSN found in pass #2,
+ * redoing any committed TXNs (which committed after any user-
+ * specified rollback point). During this pass, checkpoint
+ * file information is ignored, and file openings and closings
+ * are redone.
+ *
+ * ckp_lsn -- lsn of the last checkpoint or the first in the log.
+ * first_lsn -- the lsn where the forward passes begin.
+	 *	last_lsn -- the last lsn in the log, used for feedback.
+ * lowlsn -- the lsn we are rolling back to, if we are recovering
+ * to a point in time.
+ * lsn -- temporary use lsn.
+ * stop_lsn -- the point at which forward roll should stop
+ */
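+
+	/*
+	 * A concrete (illustrative) normal-recovery example: if the most
+	 * recent checkpoint record sits at [3][1000] and its ckp_lsn is
+	 * [2][500], pass #0 sets first_lsn to [2][500]; pass #1 rolls
+	 * forward from there opening files and recording transaction
+	 * begins; pass #2 rolls backward from the end of the log to
+	 * [2][500] undoing loser transactions; and pass #3 rolls forward
+	 * again redoing the committed ones.
+	 */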
+
+ /*
+ * Find out the last lsn, so that we can estimate how far along we
+ * are in recovery. This will help us determine how much log there
+ * is between the first LSN that we're going to be working with and
+ * the last one. We assume that each of the three phases takes the
+ * same amount of time (a false assumption) and then use the %-age
+ * of the amount of log traversed to figure out how much of the
+ * pass we've accomplished.
+ *
+ * If we can't find any log records, we're kind of done.
+ */
+#ifdef UMRW
+ ZERO_LSN(last_lsn);
+#endif
+ memset(&data, 0, sizeof(data));
+ /*
+ * Pass #0
+ * Find the LSN from which we begin OPENFILES.
+ *
+ * If this is a catastrophic recovery, or if no checkpoint exists
+ * in the log, the LSN is the first LSN in the log.
+ *
+ * Otherwise, it is the minimum of (1) the LSN in the last checkpoint
+ * and (2) the LSN in the checkpoint before any specified recovery
+ * timestamp or max_lsn.
+ */
+ /*
+ * Get the first LSN in the log; it's an initial default
+ * even if this is not a catastrophic recovery.
+ */
+ if ((ret = __logc_get(logc, &ckp_lsn, &data, DB_FIRST)) != 0) {
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+ else
+ __db_errx(env, DB_STR("1510",
+ "First log record not found"));
+ goto err;
+ }
+ first_lsn = ckp_lsn;
+
+ if (!LF_ISSET(DB_RECOVER_FATAL)) {
+ if ((ret = __txn_getckp(env, &ckp_lsn)) == 0 &&
+ (ret = __logc_get(logc, &ckp_lsn, &data, DB_SET)) == 0) {
+ /* We have a recent checkpoint. This is LSN (1). */
+ if ((ret = __txn_ckp_read(env,
+ data.data, &ckp_args)) != 0) {
+ __db_errx(env, DB_STR_A("1511",
+ "Invalid checkpoint record at [%ld][%ld]",
+ "%ld %ld"), (u_long)ckp_lsn.file,
+ (u_long)ckp_lsn.offset);
+ goto err;
+ }
+ first_lsn = ckp_args->ckp_lsn;
+ __os_free(env, ckp_args);
+ }
+
+ /*
+ * If LSN (2) exists, use it if it's before LSN (1).
+ * (If LSN (1) doesn't exist, first_lsn is the
+ * beginning of the log, so will "win" this check.)
+ *
+ * XXX
+ * In the recovery-to-a-timestamp case, lowlsn is chosen by
+ * __log_earliest, and is the checkpoint LSN of the
+ * *earliest* checkpoint in the unreclaimed log. I
+ * (krinsky) believe that we could optimize this by looking
+ * instead for the LSN of the *latest* checkpoint before
+ * the timestamp of interest, but I'm not sure that this
+ * is worth doing right now. (We have to look for lowlsn
+ * and low anyway, to make sure the requested timestamp is
+ * somewhere in the logs we have, and all that's required
+ * is that we pick *some* checkpoint after the beginning of
+		 * the logs and before the timestamp.)
+ */
+ if ((dbenv->tx_timestamp != 0 || max_lsn != NULL) &&
+ LOG_COMPARE(&lowlsn, &first_lsn) < 0) {
+ first_lsn = lowlsn;
+ }
+ }
+
+ if ((ret = __logc_get(logc, &last_lsn, &data, DB_LAST)) != 0) {
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+ else
+ __db_errx(env, DB_STR("1512",
+ "Last log record not found"));
+ goto err;
+ }
+
+ rectype = 0;
+ txnid = 0;
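+	/*
+	 * Log records share a common header layout: a u_int32_t record
+	 * type followed by a u_int32_t transaction ID (and then the
+	 * previous LSN), which is why the reads below use fixed offsets
+	 * into data.data.
+	 */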
+ do {
+ if (LOG_COMPARE(&lsn, &first_lsn) == 0)
+ break;
+		/* Check whether we have a recycle record. */
+ if (rectype != DB___txn_recycle)
+ LOGCOPY_32(env, &rectype, data.data);
+		/* The txnid follows rectype, which is a u_int32_t. */
+ LOGCOPY_32(env, &txnid,
+ (u_int8_t *)data.data + sizeof(u_int32_t));
+
+ if (txnid != 0)
+ break;
+ } while ((ret = __logc_get(logc, &lsn, &data, DB_PREV)) == 0);
+
+ /*
+ * There are no transactions, so there is nothing to do unless
+ * we're recovering to an LSN. If we are, we need to proceed since
+ * we'll still need to do a vtruncate based on information we haven't
+ * yet collected.
+ */
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+ else if (ret != 0)
+ goto err;
+
+ hi_txn = txnid;
+
+ /* Get the record at first_lsn. */
+ if ((ret = __logc_get(logc, &first_lsn, &data, DB_SET)) != 0) {
+ __db_errx(env, DB_STR_A("1513",
+ "Checkpoint LSN record [%ld][%ld] not found", "%ld %ld"),
+ (u_long)first_lsn.file, (u_long)first_lsn.offset);
+ goto err;
+ }
+
+ if (dbenv->db_feedback != NULL) {
+ if (last_lsn.file == first_lsn.file)
+ nfiles = (double)
+ (last_lsn.offset - first_lsn.offset) / log_size;
+ else
+ nfiles = (double)(last_lsn.file - first_lsn.file) +
+ (double)((log_size - first_lsn.offset) +
+ last_lsn.offset) / log_size;
+ /* We are going to divide by nfiles; make sure it isn't 0. */
+ if (nfiles < 0.001)
+ nfiles = 0.001;
+ }
+
+ /* Find a low txnid. */
+ ret = 0;
+ if (hi_txn != 0) do {
+		/* The txnid follows rectype, which is a u_int32_t. */
+ LOGCOPY_32(env, &txnid,
+ (u_int8_t *)data.data + sizeof(u_int32_t));
+
+ if (txnid != 0)
+ break;
+ } while ((ret = __logc_get(logc, &lsn, &data, DB_NEXT)) == 0);
+
+ /*
+ * There are no transactions and we're not recovering to an LSN (see
+ * above), so there is nothing to do.
+ */
+ if (ret == DB_NOTFOUND) {
+ if (LOG_COMPARE(&lsn, &last_lsn) != 0)
+ ret = __db_log_corrupt(env, &lsn);
+ else
+ ret = 0;
+ }
+
+ /* Reset to the first lsn. */
+ if (ret != 0 ||
+ (ret = __logc_get(logc, &first_lsn, &data, DB_SET)) != 0)
+ goto err;
+
+ /* Initialize the transaction list. */
+ if ((ret = __db_txnlist_init(env, ip,
+ txnid, hi_txn, max_lsn, &txninfo)) != 0)
+ goto err;
+
+ /*
+ * Pass #1
+ * Run forward through the log starting at the first relevant lsn.
+ */
+ if ((ret = __env_openfiles(env, logc,
+ txninfo, &data, &first_lsn, &last_lsn, nfiles, 1)) != 0)
+ goto err;
+
+ /* If there were no transactions, then we can bail out early. */
+ if (hi_txn == 0 && max_lsn == NULL) {
+ lsn = last_lsn;
+ goto done;
+ }
+
+ /*
+ * Pass #2.
+ *
+	 * We used first_lsn to tell us how far back we need to recover;
+ * use it here.
+ */
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_RECOVERY))
+ __db_msg(env, DB_STR_A("1514",
+ "Recovery starting from [%lu][%lu]", "%lu %lu"),
+ (u_long)first_lsn.file, (u_long)first_lsn.offset);
+
+ pass = DB_STR_P("backward");
+ for (ret = __logc_get(logc, &lsn, &data, DB_LAST);
+ ret == 0 && LOG_COMPARE(&lsn, &first_lsn) >= 0;
+ ret = __logc_get(logc, &lsn, &data, DB_PREV)) {
+ if (dbenv->db_feedback != NULL) {
+ progress = 34 + (int)(33 * (__lsn_diff(&first_lsn,
+ &last_lsn, &lsn, log_size, 0) / nfiles));
+ dbenv->db_feedback(dbenv, DB_RECOVER, progress);
+ }
+
+ tlsn = lsn;
+ ret = __db_dispatch(env, &env->recover_dtab,
+ &data, &tlsn, DB_TXN_BACKWARD_ROLL, txninfo);
+ if (ret != 0) {
+ if (ret != DB_TXN_CKP)
+ goto msgerr;
+ else
+ ret = 0;
+ }
+ }
+ if (ret == DB_NOTFOUND) {
+ if (LOG_COMPARE(&lsn, &first_lsn) > 0)
+ ret = __db_log_corrupt(env, &lsn);
+ else
+ ret = 0;
+ }
+ if (ret != 0)
+ goto err;
+
+ /*
+	 * Pass #3.  If we are recovering to a timestamp or to an LSN, we
+	 * need to make sure that we don't roll forward beyond that point,
+	 * because there may be non-transactional operations (e.g., closes
+	 * that would fail).  The last_lsn variable is used for feedback
+	 * calculations; use it as the initial stopping point for the
+	 * forward pass, then reset stop_lsn appropriately to determine how
+	 * far the forward pass should really go.
+ */
+ pass = DB_STR_P("forward");
+ stop_lsn = last_lsn;
+ if (max_lsn != NULL || dbenv->tx_timestamp != 0)
+ stop_lsn = ((DB_TXNHEAD *)txninfo)->maxlsn;
+
+ for (ret = __logc_get(logc, &lsn, &data, DB_NEXT);
+ ret == 0; ret = __logc_get(logc, &lsn, &data, DB_NEXT)) {
+ if (dbenv->db_feedback != NULL) {
+ progress = 67 + (int)(33 * (__lsn_diff(&first_lsn,
+ &last_lsn, &lsn, log_size, 1) / nfiles));
+ dbenv->db_feedback(dbenv, DB_RECOVER, progress);
+ }
+
+ tlsn = lsn;
+ ret = __db_dispatch(env, &env->recover_dtab,
+ &data, &tlsn, DB_TXN_FORWARD_ROLL, txninfo);
+ if (ret != 0) {
+ if (ret != DB_TXN_CKP)
+ goto msgerr;
+ else
+ ret = 0;
+ }
+ /*
+ * If we are recovering to a timestamp or an LSN,
+ * we need to make sure that we don't try to roll
+ * forward beyond the soon-to-be end of log.
+ */
+ if (LOG_COMPARE(&lsn, &stop_lsn) >= 0)
+			break;
+	}
+ if (ret == DB_NOTFOUND)
+ ret = __db_log_corrupt(env, &lsn);
+ if (ret != 0)
+ goto err;
+
+ if (max_lsn == NULL)
+ region->last_txnid = ((DB_TXNHEAD *)txninfo)->maxid;
+
+done:
+ /* We are going to truncate, so we'd best close the cursor. */
+ if (logc != NULL) {
+ if ((ret = __logc_close(logc)) != 0)
+ goto err;
+ logc = NULL;
+ }
+ /*
+	 * Also flush the cache before truncating the log. It's recovery;
+ * ignore any application max-write configuration.
+ */
+ if ((ret = __memp_sync_int(env,
+ NULL, 0, DB_SYNC_CACHE | DB_SYNC_SUPPRESS_WRITE, NULL, NULL)) != 0)
+ goto err;
+ if (dbenv->tx_timestamp != 0) {
+ /* Run recovery up to this timestamp. */
+ region->last_ckp = ((DB_TXNHEAD *)txninfo)->ckplsn;
+ vtrunc_lsn = &((DB_TXNHEAD *)txninfo)->maxlsn;
+ vtrunc_ckp = &((DB_TXNHEAD *)txninfo)->ckplsn;
+ } else if (max_lsn != NULL) {
+ /* This is a HA client syncing to the master. */
+ if (!IS_ZERO_LSN(((DB_TXNHEAD *)txninfo)->ckplsn))
+ region->last_ckp = ((DB_TXNHEAD *)txninfo)->ckplsn;
+ else if ((ret =
+ __txn_findlastckp(env, &region->last_ckp, max_lsn)) != 0)
+ goto err;
+ vtrunc_lsn = max_lsn;
+ vtrunc_ckp = &((DB_TXNHEAD *)txninfo)->ckplsn;
+ } else {
+ /*
+ * The usual case: we recovered the whole (valid) log; clear
+ * out any partial record after the recovery point.
+ */
+ vtrunc_lsn = &lsn;
+ vtrunc_ckp = &region->last_ckp;
+ }
+ if ((ret = __log_vtruncate(env, vtrunc_lsn, vtrunc_ckp, trunclsn)) != 0)
+ goto err;
+
+ /* If we had no txns, figure out if we need a checkpoint. */
+ if (hi_txn == 0 && __dbreg_log_nofiles(env))
+ LF_SET(DB_NO_CHECKPOINT);
+ /*
+ * Usually we close all files at the end of recovery, unless there are
+ * prepared transactions or errors in the checkpoint.
+ */
+ all_recovered = region->stat.st_nrestores == 0;
+ /*
+ * Log a checkpoint here so subsequent recoveries can skip what's been
+ * done; this is unnecessary for HA rep clients, as they do not write
+ * log records.
+ */
+ if (max_lsn == NULL && !LF_ISSET(DB_NO_CHECKPOINT) &&
+ (ret = __txn_checkpoint(env,
+ 0, 0, DB_CKP_INTERNAL | DB_FORCE)) != 0) {
+ /*
+ * If there was no space for the checkpoint or flushing db
+ * pages we can still bring the environment up, if only for
+ * read-only access. We must not close the open files because a
+ * subsequent recovery might still need to redo this portion
+ * of the log [#18590].
+ */
+ if (max_lsn == NULL && ret == ENOSPC) {
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_RECOVERY))
+ __db_msg(env, DB_STR_A("1515",
+ "Recovery continuing after non-fatal checkpoint error: %s",
+ "%s"), db_strerror(ret));
+ all_recovered = 0;
+		} else
+			goto err;
+ }
+
+	if (all_recovered) {
+ /* Close all the db files that are open. */
+ if ((ret = __dbreg_close_files(env, 0)) != 0)
+ goto err;
+ } else {
+ if ((ret = __dbreg_mark_restored(env)) != 0)
+ goto err;
+ F_SET(env->lg_handle, DBLOG_OPENFILES);
+ }
+
+ if (max_lsn != NULL) {
+ /*
+ * Now we need to open files that should be open in order for
+ * client processing to continue. However, since we've
+ * truncated the log, we need to recompute from where the
+ * openfiles pass should begin.
+ */
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ goto err;
+ if ((ret =
+ __logc_get(logc, &first_lsn, &data, DB_FIRST)) != 0) {
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+ else
+ __db_errx(env, DB_STR("1516",
+ "First log record not found"));
+ goto err;
+ }
+ if ((ret = __txn_getckp(env, &first_lsn)) == 0 &&
+ (ret = __logc_get(logc, &first_lsn, &data, DB_SET)) == 0) {
+ /* We have a recent checkpoint. This is LSN (1). */
+ if ((ret = __txn_ckp_read(env,
+ data.data, &ckp_args)) != 0) {
+ __db_errx(env, DB_STR_A("1517",
+ "Invalid checkpoint record at [%ld][%ld]",
+ "%ld %ld"), (u_long)first_lsn.file,
+ (u_long)first_lsn.offset);
+ goto err;
+ }
+ first_lsn = ckp_args->ckp_lsn;
+ __os_free(env, ckp_args);
+ }
+ if ((ret = __logc_get(logc, &first_lsn, &data, DB_SET)) != 0)
+ goto err;
+ if ((ret = __env_openfiles(env, logc,
+ txninfo, &data, &first_lsn, max_lsn, nfiles, 1)) != 0)
+ goto err;
+ } else if (all_recovered) {
+ /*
+ * If there are no transactions that need resolution, whether
+ * because they are prepared or because recovery will need to
+ * process them, we need to reset the transaction ID space and
+ * log this fact.
+ */
+ if ((rectype != DB___txn_recycle || hi_txn != 0) &&
+ (ret = __txn_reset(env)) != 0)
+ goto err;
+ } else {
+ if ((ret = __txn_recycle_id(env, 0)) != 0)
+ goto err;
+ }
+
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_RECOVERY)) {
+ (void)time(&now);
+ __db_msg(env, DB_STR_A("1518",
+ "Recovery complete at %.24s", "%.24s"),
+ __os_ctime(&now, time_buf));
+ __db_msg(env, DB_STR_A("1519",
+ "Maximum transaction ID %lx recovery checkpoint [%lu][%lu]",
+ "%lx %lu %lu"), (u_long)(txninfo == NULL ?
+ TXN_MINIMUM : ((DB_TXNHEAD *)txninfo)->maxid),
+ (u_long)region->last_ckp.file,
+ (u_long)region->last_ckp.offset);
+ }
+
+ if (0) {
+msgerr: __db_errx(env, DB_STR_A("1520",
+ "Recovery function for LSN %lu %lu failed on %s pass",
+ "%lu %lu %s"), (u_long)lsn.file, (u_long)lsn.offset, pass);
+ }
+
+err: if (logc != NULL && (t_ret = __logc_close(logc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (txninfo != NULL)
+ __db_txnlist_end(env, txninfo);
+
+ dbenv->tx_timestamp = 0;
+
+ F_CLR(env->lg_handle, DBLOG_RECOVER);
+ F_CLR(region, TXN_IN_RECOVERY);
+
+ return (ret);
+}
+
+/*
+ * Figure out how many logfiles we have processed. If we are moving
+ * forward (is_forward != 0), then we're computing current - low. If
+ * we are moving backward, we are computing high - current. max is
+ * the number of bytes per logfile.
+ */
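+/*
+ * A worked example (illustrative): with max = 10485760 (10MB logfiles),
+ * low = [1][2097152] and current = [3][1048576], moving forward the
+ * offsets wrap, so nf = (3 - 1 - 1) + ((10485760 - 2097152) + 1048576) /
+ * 10485760 = 1.9 logfiles traversed.
+ */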
+static double
+__lsn_diff(low, high, current, max, is_forward)
+ DB_LSN *low, *high, *current;
+ u_int32_t max;
+ int is_forward;
+{
+ double nf;
+
+ /*
+	 * There are three cases in each direction.  If you are in the
+	 * same file, then all you need worry about is the difference in
+	 * offsets.  If you are in different files, then your offsets put
+	 * you either more or less than the integral difference in the
+	 * number of files -- we need to handle both of these cases.
+ */
+ if (is_forward) {
+ if (current->file == low->file)
+ nf = (double)(current->offset - low->offset) / max;
+ else if (current->offset < low->offset)
+ nf = (double)((current->file - low->file) - 1) +
+ (double)((max - low->offset) + current->offset) /
+ max;
+ else
+ nf = (double)(current->file - low->file) +
+ (double)(current->offset - low->offset) / max;
+ } else {
+ if (current->file == high->file)
+ nf = (double)(high->offset - current->offset) / max;
+ else if (current->offset > high->offset)
+ nf = (double)((high->file - current->file) - 1) +
+ (double)
+ ((max - current->offset) + high->offset) / max;
+ else
+ nf = (double)(high->file - current->file) +
+ (double)(high->offset - current->offset) / max;
+ }
+ return (nf);
+}
+
+/*
+ * __log_backup --
+ *
+ * This is used to find the earliest log record to process when a client
+ * is trying to sync up with a master whose max LSN is less than this
+ * client's max lsn; we want to roll back everything after that.
+ *
+ * Find the latest checkpoint whose ckp_lsn is less than the max lsn.
+ */
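+/*
+ * Checkpoint records form a backward-linked chain: each __txn_ckp record
+ * carries both ckp_lsn (the start of its active-transaction window) and
+ * last_ckp (the LSN of the previous checkpoint record), so the loop below
+ * can hop from checkpoint to checkpoint without scanning the whole log.
+ */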
+static int
+__log_backup(env, logc, max_lsn, start_lsn)
+ ENV *env;
+ DB_LOGC *logc;
+ DB_LSN *max_lsn, *start_lsn;
+{
+ DBT data;
+ DB_LSN lsn;
+ __txn_ckp_args *ckp_args;
+ int ret;
+
+ memset(&data, 0, sizeof(data));
+ ckp_args = NULL;
+
+ if ((ret = __txn_getckp(env, &lsn)) != 0)
+ goto err;
+ while ((ret = __logc_get(logc, &lsn, &data, DB_SET)) == 0) {
+ if ((ret = __txn_ckp_read(env, data.data, &ckp_args)) != 0)
+ return (ret);
+ /*
+ * Follow checkpoints through the log until
+ * we find one with a ckp_lsn less than
+		 * or equal to max_lsn.
+ */
+ if (LOG_COMPARE(&ckp_args->ckp_lsn, max_lsn) <= 0) {
+ *start_lsn = ckp_args->ckp_lsn;
+ break;
+ }
+
+ lsn = ckp_args->last_ckp;
+ /*
+ * If there are no more checkpoints behind us, we're
+ * done. Break with DB_NOTFOUND.
+ */
+ if (IS_ZERO_LSN(lsn)) {
+ ret = DB_NOTFOUND;
+ break;
+ }
+ __os_free(env, ckp_args);
+ ckp_args = NULL;
+ }
+
+ if (ckp_args != NULL)
+ __os_free(env, ckp_args);
+ /*
+ * If we walked back through all the checkpoints,
+ * set the cursor on the first log record.
+ */
+err: if (IS_ZERO_LSN(*start_lsn) && (ret == 0 || ret == DB_NOTFOUND))
+ ret = __logc_get(logc, start_lsn, &data, DB_FIRST);
+ return (ret);
+}
+
+/*
+ * __log_earliest --
+ *
+ * Return the earliest recovery point for the log files present. The
+ * earliest recovery time is the timestamp of the first checkpoint record
+ * whose checkpoint LSN is greater than the first LSN we process.
+ */
+static int
+__log_earliest(env, logc, lowtime, lowlsn)
+ ENV *env;
+ DB_LOGC *logc;
+ int32_t *lowtime;
+ DB_LSN *lowlsn;
+{
+ __txn_ckp_args *ckpargs;
+ DB_LSN first_lsn, lsn;
+ DBT data;
+ u_int32_t rectype;
+ int cmp, ret;
+
+ memset(&data, 0, sizeof(data));
+
+ /*
+ * Read forward through the log looking for the first checkpoint
+ * record whose ckp_lsn is greater than first_lsn.
+ */
+ for (ret = __logc_get(logc, &first_lsn, &data, DB_FIRST);
+ ret == 0; ret = __logc_get(logc, &lsn, &data, DB_NEXT)) {
+ LOGCOPY_32(env, &rectype, data.data);
+ if (rectype != DB___txn_ckp)
+ continue;
+ if ((ret =
+ __txn_ckp_read(env, data.data, &ckpargs)) == 0) {
+ cmp = LOG_COMPARE(&ckpargs->ckp_lsn, &first_lsn);
+ *lowlsn = ckpargs->ckp_lsn;
+ *lowtime = ckpargs->timestamp;
+
+ __os_free(env, ckpargs);
+ if (cmp >= 0)
+ break;
+ }
+ }
+
+ return (ret);
+}
+
+/*
+ * __env_openfiles --
+ * Perform the pass of recovery that opens files. This is used
+ * both during regular recovery and an initial call to txn_recover (since
+ * we need files open in order to abort prepared, but not yet committed
+ * transactions).
+ *
+ * See the comments in db_apprec for a detailed description of the
+ * various recovery passes.
+ *
+ * If we are not doing feedback processing (i.e., we are doing txn_recover
+ * processing and in_recovery is zero), then last_lsn can be NULL.
+ *
+ * PUBLIC: int __env_openfiles __P((ENV *,
+ * PUBLIC: DB_LOGC *, void *, DBT *, DB_LSN *, DB_LSN *, double, int));
+ */
+int
+__env_openfiles(env, logc, txninfo,
+ data, open_lsn, last_lsn, nfiles, in_recovery)
+ ENV *env;
+ DB_LOGC *logc;
+ void *txninfo;
+ DBT *data;
+ DB_LSN *open_lsn, *last_lsn;
+ double nfiles;
+ int in_recovery;
+{
+ DB_ENV *dbenv;
+ DB_LSN lsn, tlsn;
+ u_int32_t log_size;
+ int progress, ret;
+
+ dbenv = env->dbenv;
+
+ /*
+ * XXX
+ * Get the log size. No locking required because we're single-threaded
+ * during recovery.
+ */
+ log_size = ((LOG *)env->lg_handle->reginfo.primary)->log_size;
+
+ lsn = *open_lsn;
+ for (;;) {
+ if (in_recovery && dbenv->db_feedback != NULL) {
+ DB_ASSERT(env, last_lsn != NULL);
+ progress = (int)(33 * (__lsn_diff(open_lsn,
+ last_lsn, &lsn, log_size, 1) / nfiles));
+ dbenv->db_feedback(dbenv, DB_RECOVER, progress);
+ }
+
+ tlsn = lsn;
+ ret = __db_dispatch(env, &env->recover_dtab, data, &tlsn,
+ in_recovery ? DB_TXN_OPENFILES : DB_TXN_POPENFILES,
+ txninfo);
+ if (ret != 0 && ret != DB_TXN_CKP) {
+ __db_errx(env, DB_STR_A("1521",
+ "Recovery function for LSN %lu %lu failed",
+ "%lu %lu"), (u_long)lsn.file, (u_long)lsn.offset);
+ break;
+ }
+ if ((ret = __logc_get(logc, &lsn, data, DB_NEXT)) != 0) {
+ if (ret == DB_NOTFOUND) {
+ if (last_lsn != NULL &&
+ LOG_COMPARE(&lsn, last_lsn) != 0)
+ ret = __db_log_corrupt(env, &lsn);
+ else
+ ret = 0;
+ }
+ break;
+ }
+ }
+
+ return (ret);
+}
+
+static int
+__db_log_corrupt(env, lsnp)
+ ENV *env;
+ DB_LSN *lsnp;
+{
+ __db_errx(env, DB_STR_A("1522",
+ "Log file corrupt at LSN: [%lu][%lu]", "%lu %lu"),
+ (u_long)lsnp->file, (u_long)lsnp->offset);
+ return (EINVAL);
+}
+
+/*
+ * __env_init_rec --
+ *
+ * PUBLIC: int __env_init_rec __P((ENV *, u_int32_t));
+ */
+int
+__env_init_rec(env, version)
+ ENV *env;
+ u_int32_t version;
+{
+ int ret;
+
+ /*
+ * We need to prime the recovery table with the current recovery
+ * functions. Then we overwrite only specific entries based on
+ * each previous version we support.
+ */
+ if ((ret = __bam_init_recover(env, &env->recover_dtab)) != 0)
+ goto err;
+ if ((ret = __crdel_init_recover(env, &env->recover_dtab)) != 0)
+ goto err;
+ if ((ret = __db_init_recover(env, &env->recover_dtab)) != 0)
+ goto err;
+ if ((ret = __dbreg_init_recover(env, &env->recover_dtab)) != 0)
+ goto err;
+ if ((ret = __fop_init_recover(env, &env->recover_dtab)) != 0)
+ goto err;
+ if ((ret = __ham_init_recover(env, &env->recover_dtab)) != 0)
+ goto err;
+ if ((ret = __heap_init_recover(env, &env->recover_dtab)) != 0)
+ goto err;
+ if ((ret = __qam_init_recover(env, &env->recover_dtab)) != 0)
+ goto err;
+ if ((ret = __repmgr_init_recover(env, &env->recover_dtab)) != 0)
+ goto err;
+ if ((ret = __txn_init_recover(env, &env->recover_dtab)) != 0)
+ goto err;
+
+ /*
+ * After installing all the current recovery routines, we want to
+	 * override them with older versions if we are reading a downrev
+ * log (from a downrev replication master). If a log record is
+ * changed then we must use the previous version for all older
+ * logs. If a record is changed in multiple revisions then the
+ * oldest revision that applies must be used. Therefore we override
+ * the recovery functions in reverse log version order.
+ */
+ /*
+ * DB_LOGVERSION_53 is a strict superset of DB_LOGVERSION_50.
+ * So, only check > DB_LOGVERSION_48p2. If/When log records are
+ * altered, the condition below will need to change.
+ */
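+	/*
+	 * For example, reading a DB_LOGVERSION_46 log from a downrev
+	 * master installs the current functions above, then overrides
+	 * them with the 4.8, 4.7 and finally the 4.6 tables below, so
+	 * the oldest applicable revision wins for each record type.
+	 */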
+ if (version > DB_LOGVERSION_48p2)
+ goto done;
+ if ((ret = __env_init_rec_48(env)) != 0)
+ goto err;
+ /*
+	 * Patch 2 added __db_pg_trunc but did not replace any log records,
+	 * so we want to override the same functions as in the original
+	 * release.
+ */
+ if (version >= DB_LOGVERSION_48)
+ goto done;
+ if ((ret = __env_init_rec_47(env)) != 0)
+ goto err;
+ if (version == DB_LOGVERSION_47)
+ goto done;
+ if ((ret = __env_init_rec_46(env)) != 0)
+ goto err;
+ /*
+ * There are no log record/recovery differences between 4.4 and 4.5.
+ * The log version changed due to checksum. There are no log recovery
+ * differences between 4.5 and 4.6. The name of the rep_gen in
+ * txn_checkpoint changed (to spare, since we don't use it anymore).
+ */
+ if (version >= DB_LOGVERSION_44)
+ goto done;
+ if ((ret = __env_init_rec_43(env)) != 0)
+ goto err;
+ if (version == DB_LOGVERSION_43)
+ goto done;
+ if (version != DB_LOGVERSION_42) {
+ __db_errx(env, DB_STR_A("1523", "Unknown version %lu",
+ "%lu"), (u_long)version);
+ ret = EINVAL;
+ goto err;
+ }
+ ret = __env_init_rec_42(env);
+
+done:
+err: return (ret);
+}
+
+static int
+__env_init_rec_42(env)
+ ENV *env;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
+ __db_relink_42_recover, DB___db_relink_42)) != 0)
+ goto err;
+ if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
+ __db_pg_alloc_42_recover, DB___db_pg_alloc_42)) != 0)
+ goto err;
+ if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
+ __db_pg_free_42_recover, DB___db_pg_free_42)) != 0)
+ goto err;
+ if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
+ __db_pg_freedata_42_recover, DB___db_pg_freedata_42)) != 0)
+ goto err;
+#ifdef HAVE_HASH
+ if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
+ __ham_metagroup_42_recover, DB___ham_metagroup_42)) != 0)
+ goto err;
+ if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
+ __ham_groupalloc_42_recover, DB___ham_groupalloc_42)) != 0)
+ goto err;
+#endif
+ if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
+ __txn_ckp_42_recover, DB___txn_ckp_42)) != 0)
+ goto err;
+err:
+ return (ret);
+}
+
+static int
+__env_init_rec_43(env)
+ ENV *env;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
+ __bam_relink_43_recover, DB___bam_relink_43)) != 0)
+ goto err;
+ /*
+ * We want to use the 4.2-based txn_regop record.
+ */
+ if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
+ __txn_regop_42_recover, DB___txn_regop_42)) != 0)
+ goto err;
+err:
+ return (ret);
+}
+
+static int
+__env_init_rec_46(env)
+ ENV *env;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
+ __bam_merge_44_recover, DB___bam_merge_44)) != 0)
+ goto err;
+
+err: return (ret);
+}
+
+static int
+__env_init_rec_47(env)
+ ENV *env;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
+ __bam_split_42_recover, DB___bam_split_42)) != 0)
+ goto err;
+ if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
+ __db_pg_sort_44_recover, DB___db_pg_sort_44)) != 0)
+ goto err;
+ if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
+ __fop_create_42_recover, DB___fop_create_42)) != 0)
+ goto err;
+ if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
+ __fop_write_42_recover, DB___fop_write_42)) != 0)
+ goto err;
+ if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
+ __fop_rename_42_recover, DB___fop_rename_42)) != 0)
+ goto err;
+ if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
+ __fop_rename_noundo_46_recover, DB___fop_rename_noundo_46)) != 0)
+ goto err;
+
+err:
+ return (ret);
+}
+
+static int
+__env_init_rec_48(env)
+ ENV *env;
+{
+	int ret;
+
+ if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
+ __db_pg_sort_44_recover, DB___db_pg_sort_44)) != 0)
+ goto err;
+ if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
+ __db_addrem_42_recover, DB___db_addrem_42)) != 0)
+ goto err;
+ if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
+ __db_big_42_recover, DB___db_big_42)) != 0)
+ goto err;
+ if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
+ __bam_split_48_recover, DB___bam_split_48)) != 0)
+ goto err;
+#ifdef HAVE_HASH
+ if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
+ __ham_insdel_42_recover, DB___ham_insdel_42)) != 0)
+ goto err;
+ if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
+ __ham_replace_42_recover, DB___ham_replace_42)) != 0)
+ goto err;
+#endif
+err:
+ return (ret);
+}
diff --git a/src/env/env_region.c b/src/env/env_region.c
new file mode 100644
index 00000000..113bea21
--- /dev/null
+++ b/src/env/env_region.c
@@ -0,0 +1,1497 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/mp.h"
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+#include "dbinc/txn.h"
+
+static int __env_des_get __P((ENV *, REGINFO *, REGINFO *, REGION **));
+static int __env_faultmem __P((ENV *, void *, size_t, int));
+static int __env_sys_attach __P((ENV *, REGINFO *, REGION *));
+static int __env_sys_detach __P((ENV *, REGINFO *, int));
+static void __env_des_destroy __P((ENV *, REGION *));
+static void __env_remove_file __P((ENV *));
+
+/*
+ * __env_attach
+ * Join/create the environment
+ *
+ * PUBLIC: int __env_attach __P((ENV *, u_int32_t *, int, int));
+ */
+int
+__env_attach(env, init_flagsp, create_ok, retry_ok)
+ ENV *env;
+ u_int32_t *init_flagsp;
+ int create_ok, retry_ok;
+{
+ DB_ENV *dbenv;
+ REGENV rbuf, *renv;
+ REGENV_REF ref;
+ REGINFO *infop;
+ REGION *rp, tregion;
+ size_t max, nrw, size;
+ long segid;
+ u_int32_t bytes, i, mbytes, nregions, signature;
+ u_int retry_cnt;
+ int majver, minver, patchver, ret;
+ char buf[sizeof(DB_REGION_FMT) + 20];
+
+ /* Initialization */
+ dbenv = env->dbenv;
+ retry_cnt = 0;
+ signature = __env_struct_sig();
+
+ /* Repeated initialization. */
+loop: renv = NULL;
+ rp = NULL;
+
+	/* Set up the ENV's REGINFO structure. */
+ if ((ret = __os_calloc(env, 1, sizeof(REGINFO), &infop)) != 0)
+ return (ret);
+ infop->env = env;
+ infop->type = REGION_TYPE_ENV;
+ infop->id = REGION_ID_ENV;
+ infop->flags = REGION_JOIN_OK;
+ if (create_ok)
+ F_SET(infop, REGION_CREATE_OK);
+
+ /* Build the region name. */
+ if (F_ISSET(env, ENV_PRIVATE))
+ ret = __os_strdup(env, "process-private", &infop->name);
+ else {
+ (void)snprintf(buf, sizeof(buf), "%s", DB_REGION_ENV);
+ ret = __db_appname(env, DB_APP_NONE, buf, NULL, &infop->name);
+ }
+ if (ret != 0)
+ goto err;
+
+ /*
+ * We have to single-thread the creation of the REGENV region. Once
+ * it exists, we can serialize using region mutexes, but until then
+ * we have to be the only player in the game.
+ *
+ * If this is a private environment, we are only called once and there
+ * are no possible race conditions.
+ *
+ * If this is a public environment, we use the filesystem to ensure
+ * the creation of the environment file is single-threaded.
+ *
+	 * If the application has specified its own mapping functions, try
+	 * to create the region.  The application will have to let us know
+	 * whether it's actually a creation or not, and we'll have to fall
+	 * back to a join if it's not a create.
+ */
+ if (F_ISSET(env, ENV_PRIVATE) || DB_GLOBAL(j_region_map) != NULL)
+ goto creation;
+
+ /*
+ * Try to create the file, if we have the authority. We have to ensure
+ * that multiple threads/processes attempting to simultaneously create
+ * the file are properly ordered. Open using the O_CREAT and O_EXCL
+ * flags so that multiple attempts to create the region will return
+ * failure in all but one. POSIX 1003.1 requires that EEXIST be the
+ * errno return value -- I sure hope they're right.
+ */
+ if (create_ok) {
+ if ((ret = __os_open(env, infop->name, 0,
+ DB_OSO_CREATE | DB_OSO_EXCL | DB_OSO_REGION,
+ env->db_mode, &env->lockfhp)) == 0)
+ goto creation;
+ if (ret != EEXIST) {
+ __db_err(env, ret, "%s", infop->name);
+ goto err;
+ }
+ }
+
+	/* The region must exist; it's not okay to recreate it. */
+ F_CLR(infop, REGION_CREATE_OK);
+
+ /*
+	 * If we couldn't create the file, try to open it. (If that fails,
+ * we're done.)
+ */
+ if ((ret = __os_open(
+ env, infop->name, 0, DB_OSO_REGION, 0, &env->lockfhp)) != 0)
+ goto err;
+
+ /*
+ * !!!
+ * The region may be in system memory not backed by the filesystem
+ * (more specifically, not backed by this file), and we're joining
+ * it. In that case, the process that created it will have written
+ * out a REGENV_REF structure as its only contents. We read that
+ * structure before we do anything further, e.g., we can't just map
+ * that file in and then figure out what's going on.
+ *
+ * All of this noise is because some systems don't have a coherent VM
+ * and buffer cache, and what's worse, when you mix operations on the
+ * VM and buffer cache, half the time you hang the system.
+ *
+	 * If the file is the size of a REGENV_REF structure, then we know
+ * the real region is in some other memory. (The only way you get a
+ * file that size is to deliberately write it, as it's smaller than
+ * any possible disk sector created by writing a file or mapping the
+ * file into memory.) In which case, retrieve the structure from the
+ * file and use it to acquire the referenced memory.
+ *
+ * If the structure is larger than a REGENV_REF structure, then this
+ * file is backing the shared memory region, and we just map it into
+ * memory.
+ *
+ * And yes, this makes me want to take somebody and kill them. (I
+ * digress -- but you have no freakin' idea. This is unbelievably
+ * stupid and gross, and I've probably spent six months of my life,
+ * now, trying to make different versions of it work.)
+ */
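+	/*
+	 * Concretely (illustrative): for DB_SYSTEM_MEM environments the
+	 * DB_REGION_ENV file holds only a REGENV_REF { size, segid, max }
+	 * describing the shared memory segment, while for filesystem-backed
+	 * environments that same file is itself the mapped region.
+	 */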
+ if ((ret = __os_ioinfo(env, infop->name,
+ env->lockfhp, &mbytes, &bytes, NULL)) != 0) {
+ __db_err(env, ret, "%s", infop->name);
+ goto err;
+ }
+
+ /*
+ * !!!
+ * A size_t is OK -- regions get mapped into memory, and so can't
+ * be larger than a size_t.
+ */
+ size = mbytes * MEGABYTE + bytes;
+
+ /*
+ * If the size is less than the size of a REGENV_REF structure, the
+ * region (or, possibly, the REGENV_REF structure) has not yet been
+ * completely written. Shouldn't be possible, but there's no reason
+ * not to wait awhile and try again.
+ *
+ * If the region is precisely the size of a ref, then we don't
+	 * have the region here, just the meta-data, which implies that
+	 * we are using System V shared memory (SYSTEM_MEM). However,
+ * if the flags say that we are using SYSTEM_MEM and the region is
+ * bigger than the ref, something bad has happened -- we are storing
+ * something in the region file other than meta-data and that
+ * shouldn't happen.
+ */
+ if (size < sizeof(ref))
+ goto retry;
+	else {
+ if (size == sizeof(ref))
+ F_SET(env, ENV_SYSTEM_MEM);
+ else if (F_ISSET(env, ENV_SYSTEM_MEM)) {
+ ret = EINVAL;
+ __db_err(env, ret, DB_STR_A("1535",
+ "%s: existing environment not created in system memory",
+ "%s"), infop->name);
+ goto err;
+ } else {
+ if ((ret = __os_read(env, env->lockfhp, &rbuf,
+ sizeof(rbuf), &nrw)) != 0 ||
+ nrw < (size_t)sizeof(rbuf) ||
+ (ret = __os_seek(env,
+ env->lockfhp, 0, 0, rbuf.region_off)) != 0) {
+ __db_err(env, ret, DB_STR_A("1536",
+ "%s: unable to read region info", "%s"),
+ infop->name);
+ goto err;
+ }
+ }
+
+ if ((ret = __os_read(env, env->lockfhp, &ref,
+ sizeof(ref), &nrw)) != 0 || nrw < (size_t)sizeof(ref)) {
+ if (ret == 0)
+ ret = EIO;
+ __db_err(env, ret, DB_STR_A("1537",
+ "%s: unable to read system-memory information",
+ "%s"), infop->name);
+ goto err;
+ }
+ size = ref.size;
+ max = ref.max;
+ segid = ref.segid;
+ }
+
+#ifndef HAVE_MUTEX_FCNTL
+ /*
+ * If we're not doing fcntl locking, we can close the file handle. We
+ * no longer need it and the less contact between the buffer cache and
+ * the VM, the better.
+ */
+ (void)__os_closehandle(env, env->lockfhp);
+ env->lockfhp = NULL;
+#endif
+
+ /* Call the region join routine to acquire the region. */
+ memset(&tregion, 0, sizeof(tregion));
+ tregion.size = (roff_t)size;
+ tregion.max = (roff_t)max;
+ tregion.segid = segid;
+ if ((ret = __env_sys_attach(env, infop, &tregion)) != 0)
+ goto err;
+
+user_map_functions:
+ /*
+ * The environment's REGENV structure has to live at offset 0 instead
+ * of the usual alloc information. Set the primary reference and
+ * correct the "head" value to reference the alloc region.
+ */
+ infop->primary = infop->addr;
+ infop->head = (u_int8_t *)infop->addr + sizeof(REGENV);
+ renv = infop->primary;
+
+ /*
+	 * Make sure the region matches our build.  Special-case a region
+	 * that's all nul bytes: just treat it like any other corruption.
+ */
+ if (renv->majver != DB_VERSION_MAJOR ||
+ renv->minver != DB_VERSION_MINOR) {
+ if (renv->majver != 0 || renv->minver != 0) {
+ __db_errx(env, DB_STR_A("1538",
+ "Program version %d.%d doesn't match environment version %d.%d",
+ "%d %d %d %d"), DB_VERSION_MAJOR, DB_VERSION_MINOR,
+ renv->majver, renv->minver);
+ ret = DB_VERSION_MISMATCH;
+ } else
+ ret = EINVAL;
+ goto err;
+ }
+ if (renv->signature != signature) {
+ __db_errx(env, DB_STR("1539",
+ "Build signature doesn't match environment"));
+ ret = DB_VERSION_MISMATCH;
+ goto err;
+ }
+
+ /*
+ * Check if the environment has had a catastrophic failure.
+ *
+ * Check the magic number to ensure the region is initialized. If the
+ * magic number isn't set, the lock may not have been initialized, and
+ * an attempt to use it could lead to random behavior.
+ *
+ * The panic and magic values aren't protected by any lock, so we never
+ * use them in any check that's more complex than set/not-set.
+ *
+ * !!!
+ * I'd rather play permissions games using the underlying file, but I
+ * can't because Windows/NT filesystems won't open files mode 0.
+ */
+ if (renv->panic && !F_ISSET(dbenv, DB_ENV_NOPANIC)) {
+ ret = __env_panic_msg(env);
+ goto err;
+ }
+ if (renv->magic != DB_REGION_MAGIC)
+ goto retry;
+
+ /*
+ * Get a reference to the underlying REGION information for this
+ * environment.
+ */
+ if ((ret = __env_des_get(env, infop, infop, &rp)) != 0 || rp == NULL)
+ goto find_err;
+ infop->rp = rp;
+
+ /*
+ * There's still a possibility for inconsistent data. When we acquired
+ * the size of the region and attached to it, it might have still been
+ * growing as part of its creation. We can detect this by checking the
+ * size we originally found against the region's current size. (The
+ * region's current size has to be final, the creator finished growing
+ * it before setting the magic number in the region.)
+ *
+ * !!!
+ * Skip this test when the application specified its own map functions.
+ * The size of the region is essentially unknown in that case: some
+ * other process asked the application's map function for some bytes,
+ * but we were never told the final size of the region. We could get
+ * a size back from the map function, but for all we know, our process'
+ * map function only knows how to join regions, it has no clue how big
+ * those regions are.
+ */
+ if (DB_GLOBAL(j_region_map) == NULL && rp->size != size)
+ goto retry;
+
+ /*
+	 * Check our caller's configuration flags; it's an error to configure
+ * incompatible or additional subsystems in an existing environment.
+ * Return the total set of flags to the caller so they initialize the
+ * correct set of subsystems.
+ */
+ if (init_flagsp != NULL) {
+ FLD_CLR(*init_flagsp, renv->init_flags);
+ if (*init_flagsp != 0) {
+ __db_errx(env, DB_STR("1540",
+ "configured environment flags incompatible with existing environment"));
+ ret = EINVAL;
+ goto err;
+ }
+ *init_flagsp = renv->init_flags;
+ }
+
+ /*
+ * Fault the pages into memory. Note, do this AFTER releasing the
+ * lock, because we're only reading the pages, not writing them.
+ */
+ (void)__env_faultmem(env, infop->primary, rp->size, 0);
+
+ /* Everything looks good, we're done. */
+ env->reginfo = infop;
+ return (0);
+
+creation:
+ /* Create the environment region. */
+ F_SET(infop, REGION_CREATE);
+
+ /*
+ * Allocate room for REGION structures plus overhead.
+ */
+ memset(&tregion, 0, sizeof(tregion));
+ nregions = __memp_max_regions(env) + 5;
+ size = nregions * sizeof(REGION);
+ size += dbenv->passwd_len;
+ size += (dbenv->thr_max + dbenv->thr_max / 4) *
+ __env_alloc_size(sizeof(DB_THREAD_INFO));
+ /* Space for replication buffer. */
+ if (init_flagsp != NULL && FLD_ISSET(*init_flagsp, DB_INITENV_REP))
+ size += MEGABYTE;
+ size += __txn_region_size(env);
+ size += __log_region_size(env);
+ size += __env_thread_size(env, size);
+ size += __lock_region_size(env, size);
+
+ tregion.size = (roff_t)size;
+ tregion.segid = INVALID_REGION_SEGID;
+
+ if ((tregion.max = dbenv->memory_max) == 0) {
+ /* Add some slop. */
+ size += 16 * 1024;
+ tregion.max = (roff_t)size;
+
+ tregion.max += (roff_t)__lock_region_max(env);
+ tregion.max += (roff_t)__txn_region_max(env);
+ tregion.max += (roff_t)__log_region_max(env);
+ tregion.max += (roff_t)__env_thread_max(env);
+ } else if (tregion.size > tregion.max) {
+ __db_errx(env, DB_STR_A("1542",
+		    "Minimum environment memory size %ld is bigger than specified max %ld.",
+ "%ld %ld"), (u_long)tregion.size, (u_long)tregion.max);
+ ret = EINVAL;
+ goto err;
+ } else if (F_ISSET(env, ENV_PRIVATE))
+ infop->max_alloc = dbenv->memory_max;
+
+ if ((ret = __env_sys_attach(env, infop, &tregion)) != 0)
+ goto err;
+
+ /*
+ * If the application has specified its own mapping functions, we don't
+	 * know until we get here whether we are creating the region or not.
+	 * We find out because the underlying functions clear the
+	 * REGION_CREATE flag if they joined an existing region.
+ */
+ if (!F_ISSET(infop, REGION_CREATE))
+ goto user_map_functions;
+
+ /*
+ * Fault the pages into memory. Note, do this BEFORE we initialize
+ * anything, because we're writing the pages, not just reading them.
+ */
+ (void)__env_faultmem(env, infop->addr, tregion.size, 1);
+
+ /*
+ * The first object in the region is the REGENV structure. This is
+ * different from the other regions, and, from everything else in
+ * this region, where all objects are allocated from the pool, i.e.,
+ * there aren't any fixed locations. The remaining space is made
+ * available for later allocation.
+ *
+ * The allocation space must be size_t aligned, because that's what
+ * the initialization routine is going to store there. To make sure
+ * that happens, the REGENV structure was padded with a final size_t.
+ * No other region needs to worry about it because all of them treat
+ * the entire region as allocation space.
+ *
+ * Set the primary reference and correct the "head" value to reference
+ * the alloc region.
+ */
+ infop->primary = infop->addr;
+ infop->head = (u_int8_t *)infop->addr + sizeof(REGENV);
+ __env_alloc_init(infop, tregion.size - sizeof(REGENV));
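+
+	/*
+	 * Illustrative layout of the environment region from this point:
+	 *
+	 *	offset 0:		REGENV (fixed location)
+	 *	offset sizeof(REGENV):	allocation area -- the REGION
+	 *				array, thread table and other
+	 *				per-subsystem structures are
+	 *				carved out of this space.
+	 */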
+
+ /*
+	 * Initialize the rest of the REGENV structure.  (Don't set the magic
+	 * number to the correct value yet; that would validate the
+	 * environment.)
+ */
+ renv = infop->primary;
+ renv->magic = 0;
+ renv->panic = 0;
+
+ (void)db_version(&majver, &minver, &patchver);
+ renv->majver = (u_int32_t)majver;
+ renv->minver = (u_int32_t)minver;
+ renv->patchver = (u_int32_t)patchver;
+ renv->signature = signature;
+
+ (void)time(&renv->timestamp);
+ __os_unique_id(env, &renv->envid);
+
+ /*
+ * Initialize init_flags to store the flags that any other environment
+ * handle that uses DB_JOINENV to join this environment will need.
+ */
+ renv->init_flags = (init_flagsp == NULL) ? 0 : *init_flagsp;
+
+ /*
+ * Set up the region array. We use an array rather than a linked list
+	 * as we have to traverse the array after failure in some cases, and
+	 * we don't want to loop infinitely should the application fail while
+	 * we're manipulating it.
+ */
+ renv->region_cnt = nregions;
+ if ((ret = __env_alloc(infop, nregions * sizeof(REGION), &rp)) != 0) {
+ __db_err(env, ret, DB_STR("1543",
+ "unable to create new master region array"));
+ goto err;
+ }
+ renv->region_off = R_OFFSET(infop, rp);
+ for (i = 0; i < nregions; ++i, ++rp)
+ rp->id = INVALID_REGION_ID;
+
+ renv->cipher_off = renv->thread_off = renv->rep_off = INVALID_ROFF;
+ renv->flags = 0;
+ renv->op_timestamp = renv->rep_timestamp = 0;
+ renv->mtx_regenv = MUTEX_INVALID;
+ renv->reg_panic = 0;
+
+ /*
+ * Get the underlying REGION structure for this environment. Note,
+ * we created the underlying OS region before we acquired the REGION
+ * structure, which is backwards from the normal procedure. Update
+ * the REGION structure.
+ */
+ if ((ret = __env_des_get(env, infop, infop, &rp)) != 0) {
+find_err: __db_errx(env, DB_STR_A("1544",
+ "%s: unable to find environment", "%s"), infop->name);
+ if (ret == 0)
+ ret = EINVAL;
+ goto err;
+ }
+ infop->rp = rp;
+ rp->alloc = rp->size = tregion.size;
+ rp->max = tregion.max;
+ rp->segid = tregion.segid;
+
+ /*
+ * !!!
+ * If we create an environment where regions are public and in system
+ * memory, we have to inform processes joining the environment how to
+ * attach to the shared memory segment. So, we write the shared memory
+ * identifier into the file, to be read by those other processes.
+ *
+ * XXX
+ * This is really OS-layer information, but I can't see any easy way
+ * to move it down there without passing down information that it has
+ * no right to know, e.g., that this is the one-and-only REGENV region
+ * and not some other random region.
+ */
+ if (tregion.segid != INVALID_REGION_SEGID) {
+ ref.size = tregion.size;
+ ref.segid = tregion.segid;
+ ref.max = tregion.max;
+ if ((ret = __os_write(
+ env, env->lockfhp, &ref, sizeof(ref), &nrw)) != 0) {
+ __db_err(env, ret, DB_STR_A("1545",
+ "%s: unable to write out public environment ID",
+ "%s"), infop->name);
+ goto err;
+ }
+ }
+
+#ifndef HAVE_MUTEX_FCNTL
+ /*
+ * If we're not doing fcntl locking, we can close the file handle. We
+ * no longer need it and the less contact between the buffer cache and
+ * the VM, the better.
+ */
+ if (env->lockfhp != NULL) {
+ (void)__os_closehandle(env, env->lockfhp);
+ env->lockfhp = NULL;
+ }
+#endif
+
+ /* Everything looks good, we're done. */
+ env->reginfo = infop;
+ return (0);
+
+err:
+retry: /* Close any open file handle. */
+ if (env->lockfhp != NULL) {
+ (void)__os_closehandle(env, env->lockfhp);
+ env->lockfhp = NULL;
+ }
+
+ /*
+ * If we joined or created the region, detach from it. If we created
+ * it, destroy it. Note, there's a path in the above code where we're
+ * using a temporary REGION structure because we haven't yet allocated
+ * the real one. In that case the region address (addr) will be filled
+ * in, but the REGION pointer (rp) won't. Fix it.
+ */
+ if (infop->addr != NULL) {
+ if (infop->rp == NULL)
+ infop->rp = &tregion;
+
+ (void)__env_sys_detach(env,
+ infop, F_ISSET(infop, REGION_CREATE));
+
+ if (rp != NULL && F_ISSET(env, DB_PRIVATE))
+ __env_alloc_free(infop, rp);
+ }
+
+ /* Free the allocated name and/or REGINFO structure. */
+ if (infop->name != NULL)
+ __os_free(env, infop->name);
+ __os_free(env, infop);
+
+ /* If we had a temporary error, wait awhile and try again. */
+ if (ret == 0) {
+ if (!retry_ok || ++retry_cnt > 3) {
+ __db_errx(env, DB_STR("1546",
+ "unable to join the environment"));
+ ret = EAGAIN;
+ } else {
+ __os_yield(env, retry_cnt * 3, 0);
+ goto loop;
+ }
+ }
+
+ return (ret);
+}
+
+/*
+ * __env_turn_on --
+ * Turn on the created environment.
+ *
+ * PUBLIC: int __env_turn_on __P((ENV *));
+ */
+int
+__env_turn_on(env)
+ ENV *env;
+{
+ REGENV *renv;
+ REGINFO *infop;
+
+ infop = env->reginfo;
+ renv = infop->primary;
+
+ /* If we didn't create the region, there's no need for further work. */
+ if (!F_ISSET(infop, REGION_CREATE))
+ return (0);
+
+ /*
+ * Validate the file. All other threads of control are waiting
+ * on this value to be written -- "Let slip the hounds of war!"
+ */
+ renv->magic = DB_REGION_MAGIC;
+
+ return (0);
+}
+
+/*
+ * __env_turn_off --
+ * Turn off the environment.
+ *
+ * PUBLIC: int __env_turn_off __P((ENV *, u_int32_t));
+ */
+int
+__env_turn_off(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ REGENV *renv;
+ REGINFO *infop;
+ int ret, t_ret;
+
+ ret = 0;
+
+ /*
+ * Connect to the environment: If we can't join the environment, we
+ * guess it's because it doesn't exist and we're done.
+ *
+ * If the environment exists, attach and lock the environment.
+ */
+ if (__env_attach(env, NULL, 0, 1) != 0)
+ return (0);
+
+ infop = env->reginfo;
+ renv = infop->primary;
+
+ MUTEX_LOCK(env, renv->mtx_regenv);
+
+ /*
+ * If the environment is in use, we're done unless we're forcing the
+ * issue or the environment has panic'd. (If the environment panic'd,
+ * the thread holding the reference count may not have cleaned up, so
+ * we clean up. It's possible the application didn't plan on removing
+ * the environment in this particular call, but panic'd environments
+ * aren't useful to anyone.)
+ *
+ * Otherwise, panic the environment and overwrite the magic number so
+ * any thread of control attempting to connect (or racing with us) will
+ * back off and retry, or just die.
+ */
+ if (renv->refcnt > 0 && !LF_ISSET(DB_FORCE) && !renv->panic)
+ ret = EBUSY;
+ else
+ renv->panic = 1;
+
+ /*
+ * Unlock the environment (nobody should need this lock because
+ * we've poisoned the pool) and detach from the environment.
+ */
+ MUTEX_UNLOCK(env, renv->mtx_regenv);
+
+ if ((t_ret = __env_detach(env, 0)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __env_panic_set --
+ * Set/clear unrecoverable error.
+ *
+ * PUBLIC: void __env_panic_set __P((ENV *, int));
+ */
+void
+__env_panic_set(env, on)
+ ENV *env;
+ int on;
+{
+ if (env != NULL && env->reginfo != NULL)
+ ((REGENV *)env->reginfo->primary)->panic = on ? 1 : 0;
+}
+
+/*
+ * __env_ref_increment --
+ * Increment the environment's reference count.
+ *
+ * PUBLIC: int __env_ref_increment __P((ENV *));
+ */
+int
+__env_ref_increment(env)
+ ENV *env;
+{
+ REGENV *renv;
+ REGINFO *infop;
+ int ret;
+
+ infop = env->reginfo;
+ renv = infop->primary;
+
+ /* If we're creating the primary region, allocate a mutex. */
+ if (F_ISSET(infop, REGION_CREATE)) {
+ if ((ret = __mutex_alloc(
+ env, MTX_ENV_REGION, 0, &renv->mtx_regenv)) != 0)
+ return (ret);
+ renv->refcnt = 1;
+ } else {
+ /* Lock the environment, increment the reference, unlock. */
+ MUTEX_LOCK(env, renv->mtx_regenv);
+ ++renv->refcnt;
+ MUTEX_UNLOCK(env, renv->mtx_regenv);
+ }
+
+ F_SET(env, ENV_REF_COUNTED);
+ return (0);
+}
+
+/*
+ * __env_ref_decrement --
+ * Decrement the environment's reference count.
+ *
+ * PUBLIC: int __env_ref_decrement __P((ENV *));
+ */
+int
+__env_ref_decrement(env)
+ ENV *env;
+{
+ REGENV *renv;
+ REGINFO *infop;
+
+ /* Be cautious -- we may not have an environment. */
+ if ((infop = env->reginfo) == NULL)
+ return (0);
+
+ renv = infop->primary;
+
+	/* Even if we have an environment, we may not have reference counted it. */
+ if (F_ISSET(env, ENV_REF_COUNTED)) {
+ /* Lock the environment, decrement the reference, unlock. */
+ MUTEX_LOCK(env, renv->mtx_regenv);
+ if (renv->refcnt == 0)
+ __db_errx(env, DB_STR("1547",
+ "environment reference count went negative"));
+ else
+ --renv->refcnt;
+ MUTEX_UNLOCK(env, renv->mtx_regenv);
+
+ F_CLR(env, ENV_REF_COUNTED);
+ }
+
+	/* If a private environment, we're done with the mutex; destroy it. */
+ return (F_ISSET(env, ENV_PRIVATE) ?
+ __mutex_free(env, &renv->mtx_regenv) : 0);
+}
+
+/*
+ * __env_ref_get --
+ * Get the number of environment references. This is an unprotected
+ * read of refcnt to simply provide a spot check of the value. It
+ * is only intended for use as an internal utility routine.
+ *
+ * PUBLIC: int __env_ref_get __P((DB_ENV *, u_int32_t *));
+ */
+int
+__env_ref_get(dbenv, countp)
+ DB_ENV *dbenv;
+ u_int32_t *countp;
+{
+ ENV *env;
+ REGENV *renv;
+ REGINFO *infop;
+
+ env = dbenv->env;
+ infop = env->reginfo;
+ renv = infop->primary;
+ *countp = renv->refcnt;
+ return (0);
+}
+
+/*
+ * __env_detach --
+ * Detach from the environment.
+ *
+ * PUBLIC: int __env_detach __P((ENV *, int));
+ */
+int
+__env_detach(env, destroy)
+ ENV *env;
+ int destroy;
+{
+ REGENV *renv;
+ REGINFO *infop;
+ REGION rp;
+ int ret, t_ret;
+
+ infop = env->reginfo;
+ renv = infop->primary;
+ ret = 0;
+
+ /* Close the locking file handle. */
+ if (env->lockfhp != NULL) {
+ if ((t_ret =
+ __os_closehandle(env, env->lockfhp)) != 0 && ret == 0)
+ ret = t_ret;
+ env->lockfhp = NULL;
+ }
+
+ /*
+ * If a private region, return the memory to the heap. Not needed for
+	 * filesystem-backed or system shared memory regions; that memory isn't
+ * owned by any particular process.
+ */
+ if (destroy) {
+ /*
+ * Free the REGION array.
+ *
+ * The actual underlying region structure is allocated from the
+ * primary shared region, and we're about to free it. Save a
+ * copy on our stack for the REGINFO to reference when it calls
+ * down into the OS layer to release the shared memory segment.
+ */
+ rp = *infop->rp;
+ infop->rp = &rp;
+
+ if (renv->region_off != INVALID_ROFF)
+ __env_alloc_free(
+ infop, R_ADDR(infop, renv->region_off));
+ }
+
+ /*
+ * Set the ENV->reginfo field to NULL. BDB uses the ENV->reginfo
+ * field to decide if the underlying region can be accessed or needs
+ * cleanup. We're about to destroy what it references, so it needs to
+ * be cleared.
+ */
+ env->reginfo = NULL;
+ env->thr_hashtab = NULL;
+
+ if ((t_ret = __env_sys_detach(env, infop, destroy)) != 0 && ret == 0)
+ ret = t_ret;
+ if (infop->name != NULL)
+ __os_free(env, infop->name);
+
+ /* Discard the ENV->reginfo field's memory. */
+ __os_free(env, infop);
+
+ return (ret);
+}
+
+/*
+ * __env_remove_env --
+ * Remove an environment.
+ *
+ * PUBLIC: int __env_remove_env __P((ENV *));
+ */
+int
+__env_remove_env(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+ REGENV *renv;
+ REGINFO *infop, reginfo;
+ REGION *rp;
+ u_int32_t flags_orig, i;
+
+ dbenv = env->dbenv;
+
+ /*
+ * We do not want to hang on a mutex request, nor do we care about
+ * panics.
+ */
+ flags_orig = F_ISSET(dbenv, DB_ENV_NOLOCKING | DB_ENV_NOPANIC);
+ F_SET(dbenv, DB_ENV_NOLOCKING | DB_ENV_NOPANIC);
+
+ /*
+ * This routine has to walk a nasty line between not looking into the
+ * environment (which may be corrupted after an app or system crash),
+ * and removing everything that needs removing.
+ *
+ * Connect to the environment: If we can't join the environment, we
+ * guess it's because it doesn't exist. Remove the underlying files,
+ * at least.
+ */
+ if (__env_attach(env, NULL, 0, 0) != 0)
+ goto remfiles;
+
+ infop = env->reginfo;
+ renv = infop->primary;
+
+ /*
+ * Kill the environment, if it's not already dead.
+ */
+ renv->panic = 1;
+
+ /*
+ * Walk the array of regions. Connect to each region and disconnect
+ * with the destroy flag set. This shouldn't cause any problems, even
+ * if the region is corrupted, because we never look inside the region
+ * (with the single exception of mutex regions on systems where we have
+ * to return resources to the underlying system).
+ */
+ for (rp = R_ADDR(infop, renv->region_off),
+ i = 0; i < renv->region_cnt; ++i, ++rp) {
+ if (rp->id == INVALID_REGION_ID || rp->type == REGION_TYPE_ENV)
+ continue;
+ /*
+ * !!!
+ * The REGION_CREATE_OK flag is set for Windows/95 -- regions
+ * are zero'd out when the last reference to the region goes
+ * away, in which case the underlying OS region code requires
+ * callers be prepared to create the region in order to join it.
+ */
+ memset(&reginfo, 0, sizeof(reginfo));
+ reginfo.id = rp->id;
+ reginfo.flags = REGION_CREATE_OK;
+
+ /*
+ * If we get here and can't attach and/or detach to the
+ * region, it's a mess. Ignore errors, there's nothing
+ * we can do about them.
+ */
+ if (__env_region_attach(env, &reginfo, 0, 0) != 0)
+ continue;
+
+#ifdef HAVE_MUTEX_SYSTEM_RESOURCES
+ /*
+ * If destroying the mutex region, return any system
+ * resources to the system.
+ */
+ if (reginfo.type == REGION_TYPE_MUTEX)
+ __mutex_resource_return(env, &reginfo);
+#endif
+ (void)__env_region_detach(env, &reginfo, 1);
+ }
+
+ /* Detach from the environment's primary region. */
+ (void)__env_detach(env, 1);
+
+remfiles:
+ /*
+ * Walk the list of files in the directory, unlinking files in the
+ * Berkeley DB name space.
+ */
+ __env_remove_file(env);
+
+ F_CLR(dbenv, DB_ENV_NOLOCKING | DB_ENV_NOPANIC);
+ F_SET(dbenv, flags_orig);
+
+ return (0);
+}
+
+/*
+ * __env_remove_file --
+ * Discard any region files in the filesystem.
+ */
+static void
+__env_remove_file(env)
+ ENV *env;
+{
+ int cnt, fcnt, lastrm, ret;
+ const char *dir;
+ char saved_char, *p, **names, *path, buf[sizeof(DB_REGION_FMT) + 20];
+
+ /* Get the full path of a file in the environment. */
+ (void)snprintf(buf, sizeof(buf), "%s", DB_REGION_ENV);
+ if ((ret = __db_appname(env,
+ DB_APP_NONE, buf, NULL, &path)) != 0)
+ return;
+
+ /* Get the parent directory for the environment. */
+ if ((p = __db_rpath(path)) == NULL) {
+ p = path;
+ saved_char = *p;
+
+ dir = PATH_DOT;
+ } else {
+ saved_char = *p;
+ *p = '\0';
+
+ dir = path;
+ }
+
+ /* Get the list of file names. */
+ if ((ret = __os_dirlist(env, dir, 0, &names, &fcnt)) != 0)
+ __db_err(env, ret, "%s", dir);
+
+ /* Restore the path, and free it. */
+ *p = saved_char;
+ __os_free(env, path);
+
+ if (ret != 0)
+ return;
+
+ /*
+ * Remove files from the region directory.
+ */
+ for (lastrm = -1, cnt = fcnt; --cnt >= 0;) {
+ /* Skip anything outside our name space. */
+ if (!IS_DB_FILE(names[cnt]))
+ continue;
+
+ /* Skip queue extent files. */
+ if (strncmp(names[cnt], "__dbq.", 6) == 0)
+ continue;
+ if (strncmp(names[cnt], "__dbp.", 6) == 0)
+ continue;
+
+ /* Skip registry files. */
+ if (strncmp(names[cnt], "__db.register", 13) == 0)
+ continue;
+
+ /* Skip replication files. */
+ if (strncmp(names[cnt], "__db.rep", 8) == 0)
+ continue;
+
+ /*
+ * Remove the primary environment region last, because it's
+ * the key to this whole mess.
+ */
+ if (strcmp(names[cnt], DB_REGION_ENV) == 0) {
+ lastrm = cnt;
+ continue;
+ }
+
+ /* Remove the file. */
+ if (__db_appname(env,
+ DB_APP_NONE, names[cnt], NULL, &path) == 0) {
+ /*
+ * Overwrite region files. Temporary files would have
+ * been maintained in encrypted format, so there's no
+ * reason to overwrite them. This is not an exact
+ * check on the file being a region file, but it's
+ * not likely to be wrong, and the worst thing that can
+ * happen is we overwrite a file that didn't need to be
+ * overwritten.
+ */
+ (void)__os_unlink(env, path, 1);
+ __os_free(env, path);
+ }
+ }
+
+ if (lastrm != -1)
+ if (__db_appname(env,
+ DB_APP_NONE, names[lastrm], NULL, &path) == 0) {
+ (void)__os_unlink(env, path, 1);
+ __os_free(env, path);
+ }
+ __os_dirfree(env, names, fcnt);
+}
+
+/*
+ * __env_region_attach
+ * Join/create a region.
+ *
+ * PUBLIC: int __env_region_attach __P((ENV *, REGINFO *, size_t, size_t));
+ */
+int
+__env_region_attach(env, infop, init, max)
+ ENV *env;
+ REGINFO *infop;
+ size_t init, max;
+{
+ REGION *rp;
+ int ret;
+ char buf[sizeof(DB_REGION_FMT) + 20];
+
+ /*
+ * Find or create a REGION structure for this region. If we create
+ * it, the REGION_CREATE flag will be set in the infop structure.
+ */
+ F_CLR(infop, REGION_CREATE);
+ if ((ret = __env_des_get(env, env->reginfo, infop, &rp)) != 0)
+ return (ret);
+ infop->env = env;
+ infop->rp = rp;
+ infop->type = rp->type;
+ infop->id = rp->id;
+
+ /*
+	 * __env_des_get may have created the region and set the create
+	 * flag.  If we're creating the region, set the desired size.
+ */
+ if (F_ISSET(infop, REGION_CREATE)) {
+ rp->alloc = rp->size = (roff_t)init;
+ rp->max = (roff_t)max;
+ }
+
+ /* Join/create the underlying region. */
+ (void)snprintf(buf, sizeof(buf), DB_REGION_FMT, infop->id);
+ if ((ret = __db_appname(env,
+ DB_APP_NONE, buf, NULL, &infop->name)) != 0)
+ goto err;
+ if ((ret = __env_sys_attach(env, infop, rp)) != 0)
+ goto err;
+
+ /*
+ * Fault the pages into memory. Note, do this BEFORE we initialize
+ * anything because we're writing pages in created regions, not just
+ * reading them.
+ */
+ (void)__env_faultmem(env,
+ infop->addr, rp->size, F_ISSET(infop, REGION_CREATE));
+
+ /*
+ * !!!
+ * The underlying layer may have just decided that we are going
+ * to create the region. There are various system issues that
+ * can result in a useless region that requires re-initialization.
+ *
+ * If we created the region, initialize it for allocation.
+ */
+ if (F_ISSET(infop, REGION_CREATE))
+ __env_alloc_init(infop, rp->size);
+
+ return (0);
+
+err: /* Discard the underlying region. */
+ if (infop->addr != NULL)
+ (void)__env_sys_detach(env,
+ infop, F_ISSET(infop, REGION_CREATE));
+ else if (infop->name != NULL) {
+ __os_free(env, infop->name);
+ infop->name = NULL;
+ }
+ infop->rp = NULL;
+ infop->id = INVALID_REGION_ID;
+
+ /* Discard the REGION structure if we created it. */
+ if (F_ISSET(infop, REGION_CREATE)) {
+ __env_des_destroy(env, rp);
+ F_CLR(infop, REGION_CREATE);
+ }
+
+ return (ret);
+}
+
+/*
+ * __env_region_share
+ * Share the primary region.
+ *
+ * PUBLIC: int __env_region_share __P((ENV *, REGINFO *));
+ */
+int
+__env_region_share(env, infop)
+ ENV *env;
+ REGINFO *infop;
+{
+ REGINFO *envinfo;
+ REGION *rp;
+
+ envinfo = env->reginfo;
+ rp = envinfo->rp;
+ F_SET(infop, F_ISSET(envinfo, REGION_CREATE) | REGION_SHARED);
+ infop->addr = envinfo->addr;
+ infop->head = envinfo->head;
+
+ infop->env = env;
+ infop->rp = rp;
+ infop->name = envinfo->name;
+ infop->fhp = envinfo->fhp;
+ infop->type = rp->type;
+ infop->id = rp->id;
+
+ return (0);
+}
+
+/*
+ * __env_region_detach --
+ * Detach from a region.
+ *
+ * PUBLIC: int __env_region_detach __P((ENV *, REGINFO *, int));
+ */
+int
+__env_region_detach(env, infop, destroy)
+ ENV *env;
+ REGINFO *infop;
+ int destroy;
+{
+ REGION *rp;
+ REGION_MEM *mem, *next;
+ int ret;
+
+ if (F_ISSET(env, ENV_PRIVATE))
+ destroy = 1;
+ else if (F_ISSET(infop, REGION_SHARED))
+ return (0);
+
+ rp = infop->rp;
+
+ /*
+ * When discarding the regions as we shut down a database environment,
+ * discard any allocated shared memory segments. This is the last time
+ * we use them, and db_region_destroy is the last region-specific call
+ * we make.
+ */
+ if (F_ISSET(env, ENV_PRIVATE) && infop->primary != NULL) {
+ for (mem = infop->mem; mem != NULL; mem = next) {
+ next = mem->next;
+ __env_alloc_free(infop, mem);
+ }
+ __env_alloc_free(infop, infop->primary);
+ }
+
+ if (F_ISSET(infop, REGION_SHARED))
+ return (0);
+
+ /* Detach from the underlying OS region. */
+ ret = __env_sys_detach(env, infop, destroy);
+
+ /* If we destroyed the region, discard the REGION structure. */
+ if (destroy)
+ __env_des_destroy(env, rp);
+
+	/* Free the region's name. */
+ if (infop->name != NULL)
+ __os_free(env, infop->name);
+
+ return (ret);
+}
+
+/*
+ * __env_sys_attach --
+ * Prep and call the underlying OS attach function.
+ */
+static int
+__env_sys_attach(env, infop, rp)
+ ENV *env;
+ REGINFO *infop;
+ REGION *rp;
+{
+ int ret;
+
+ /*
+ * All regions are created on 8K boundaries out of sheer paranoia,
+ * so we don't make some underlying VM unhappy. Make sure we don't
+ * overflow or underflow.
+ */
+#define OS_VMPAGESIZE (8 * 1024)
+#define OS_VMROUNDOFF(i) { \
+ if ((i) + OS_VMPAGESIZE - 1 > (i)) \
+ (i) += OS_VMPAGESIZE - 1; \
+ (i) -= (i) % OS_VMPAGESIZE; \
+}
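+/*
+ * For example, a 10000-byte request becomes 10000 + 8191 = 18191, then
+ * 18191 - (18191 % 8192) = 16384, i.e., two full 8KB pages; a size that
+ * is already a multiple of 8KB is left unchanged.
+ */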
+ if (F_ISSET(infop, REGION_CREATE)) {
+ OS_VMROUNDOFF(rp->size);
+ OS_VMROUNDOFF(rp->max);
+ }
+
+#ifdef DB_REGIONSIZE_MAX
+ /* Some architectures have hard limits on the maximum region size. */
+ if (rp->size > DB_REGIONSIZE_MAX) {
+ __db_errx(env, DB_STR_A("1548",
+ "region size %lu is too large; maximum is %lu", "%lu %lu"),
+ (u_long)rp->size, (u_long)DB_REGIONSIZE_MAX);
+ return (EINVAL);
+ }
+ if (rp->max > DB_REGIONSIZE_MAX) {
+ __db_errx(env, DB_STR_A("1549",
+ "region max %lu is too large; maximum is %lu", "%lu %lu"),
+ (u_long)rp->max, (u_long)DB_REGIONSIZE_MAX);
+ return (EINVAL);
+ }
+#endif
+
+ /*
+ * If a region is private, malloc the memory.
+ *
+ * !!!
+ * If this fails because the region is too large to malloc, mmap(2)
+ * using the MAP_ANON or MAP_ANONYMOUS flags would be an alternative.
+ * I don't know of any architectures (yet!) where malloc is a problem.
+ */
+ if (F_ISSET(env, ENV_PRIVATE)) {
+#if defined(HAVE_MUTEX_HPPA_MSEM_INIT)
+ /*
+ * !!!
+ * There exist spinlocks that don't work in malloc memory, e.g.,
+ * the HP/UX msemaphore interface. If we don't have locks that
+ * will work in malloc memory, we better not be private or not
+ * be threaded.
+ */
+ if (F_ISSET(env, ENV_THREAD)) {
+ __db_errx(env, DB_STR("1550",
+"architecture does not support locks inside process-local (malloc) memory"));
+ __db_errx(env, DB_STR("1551",
+ "application may not specify both DB_PRIVATE and DB_THREAD"));
+ return (EINVAL);
+ }
+#endif
+ if ((ret = __os_malloc(
+ env, sizeof(REGENV), &infop->addr)) != 0)
+ return (ret);
+
+ } else {
+#if !defined(HAVE_MMAP_EXTEND)
+ /* Extend any disk file to its full size before mapping it. */
+ rp->size = rp->max;
+#endif
+ if ((ret = __os_attach(env, infop, rp)) != 0)
+ return (ret);
+ }
+
+ /* Set the start of the allocation region. */
+ infop->head = infop->addr;
+
+ /*
+	 * We require that the memory be aligned to the largest integral
+ * type. Otherwise, multiple processes mapping the same shared region
+ * would have to memcpy every value before reading it.
+ */
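+	/*
+	 * For illustration, on a machine where sizeof(uintmax_t) is 8,
+	 * ALIGNP_INC rounds the address 0x1004 up to 0x1008, so the test
+	 * below fails for 0x1004, while an address such as 0x1008 rounds
+	 * to itself and passes.
+	 */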
+ if (infop->addr != ALIGNP_INC(infop->addr, sizeof(uintmax_t))) {
+ __db_errx(env, DB_STR("1552",
+ "region memory was not correctly aligned"));
+ (void)__env_sys_detach(env, infop,
+ F_ISSET(infop, REGION_CREATE));
+ return (EINVAL);
+ }
+
+ return (0);
+}
+
+/*
+ * __env_sys_detach --
+ * Prep and call the underlying OS detach function.
+ */
+static int
+__env_sys_detach(env, infop, destroy)
+ ENV *env;
+ REGINFO *infop;
+ int destroy;
+{
+ /* If a region is private, free the memory. */
+ if (F_ISSET(env, ENV_PRIVATE)) {
+ __os_free(env, infop->addr);
+ return (0);
+ }
+
+ return (__os_detach(env, infop, destroy));
+}
+
+/*
+ * __env_des_get --
+ * Return a reference to the shared information for a REGION,
+ * optionally creating a new entry.
+ */
+static int
+__env_des_get(env, env_infop, infop, rpp)
+ ENV *env;
+ REGINFO *env_infop, *infop;
+ REGION **rpp;
+{
+ REGENV *renv;
+ REGION *rp, *empty_slot, *first_type;
+ u_int32_t i, maxid;
+
+ *rpp = NULL;
+ renv = env_infop->primary;
+
+ /*
+ * If the caller wants to join a region, walk through the existing
+ * regions looking for a matching ID (if ID specified) or matching
+	 * type (if type specified).  If we match on type, we return the
+	 * "primary" region, that is, the first region that was
+ * created of this type.
+ *
+ * Track the first empty slot and maximum region ID for new region
+ * allocation.
+ *
+ * MaxID starts at REGION_ID_ENV, the ID of the primary environment.
+ */
+ maxid = REGION_ID_ENV;
+ empty_slot = first_type = NULL;
+ for (rp = R_ADDR(env_infop, renv->region_off),
+ i = 0; i < renv->region_cnt; ++i, ++rp) {
+ if (rp->id == INVALID_REGION_ID) {
+ if (empty_slot == NULL)
+ empty_slot = rp;
+ continue;
+ }
+ if (infop->id != INVALID_REGION_ID) {
+ if (infop->id == rp->id)
+ break;
+ continue;
+ }
+ if (infop->type == rp->type &&
+ F_ISSET(infop, REGION_JOIN_OK) &&
+ (first_type == NULL || first_type->id > rp->id))
+ first_type = rp;
+
+ if (rp->id > maxid)
+ maxid = rp->id;
+ }
+
+ /* If we found a matching ID (or a matching type), return it. */
+ if (i >= renv->region_cnt)
+ rp = first_type;
+ if (rp != NULL) {
+ *rpp = rp;
+ return (0);
+ }
+
+ /*
+ * If we didn't find a region and we don't have permission to create
+ * the region, fail. The caller generates any error message.
+ */
+ if (!F_ISSET(infop, REGION_CREATE_OK))
+ return (ENOENT);
+
+ /*
+	 * If we didn't find a region and don't have room to create one,
+	 * fail with an error message; there's a sizing problem.
+ */
+ if (empty_slot == NULL) {
+ __db_errx(env, DB_STR("1553",
+ "no room remaining for additional REGIONs"));
+ return (ENOENT);
+ }
+
+ /*
+ * Initialize a REGION structure for the caller. If id was set, use
+	 * that value; otherwise, we use the next available ID.
+ */
+ memset(empty_slot, 0, sizeof(REGION));
+ empty_slot->segid = INVALID_REGION_SEGID;
+
+ /*
+ * Set the type and ID; if no region ID was specified,
+ * allocate one.
+ */
+ empty_slot->type = infop->type;
+ empty_slot->id = infop->id == INVALID_REGION_ID ? maxid + 1 : infop->id;
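+	/*
+	 * For example, if regions with IDs 2, 3, and 5 are already allocated,
+	 * maxid is 5 and a caller that did not request a specific ID gets
+	 * ID 6.
+	 */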
+
+ F_SET(infop, REGION_CREATE);
+
+ *rpp = empty_slot;
+ return (0);
+}
+
+/*
+ * __env_des_destroy --
+ * Destroy a reference to a REGION.
+ */
+static void
+__env_des_destroy(env, rp)
+ ENV *env;
+ REGION *rp;
+{
+ COMPQUIET(env, NULL);
+
+ rp->id = INVALID_REGION_ID;
+}
+
+/*
+ * __env_faultmem --
+ * Fault the region into memory.
+ */
+static int
+__env_faultmem(env, addr, size, created)
+ ENV *env;
+ void *addr;
+ size_t size;
+ int created;
+{
+ int ret;
+ u_int8_t *p, *t;
+
+ /* Ignore heap regions. */
+ if (F_ISSET(env, ENV_PRIVATE))
+ return (0);
+
+ /*
+ * It's sometimes significantly faster to page-fault in all of the
+ * region's pages before we run the application, as we see nasty
+ * side-effects when we page-fault while holding various locks, i.e.,
+ * the lock takes a long time to acquire because of the underlying
+ * page fault, and the other threads convoy behind the lock holder.
+ *
+ * If we created the region, we write a non-zero value so that the
+ * system can't cheat. If we're just joining the region, we can
+ * only read the value and try to confuse the compiler sufficiently
+ * that it doesn't figure out that we're never really using it.
+ *
+ * Touch every page (assuming pages are 512B, the smallest VM page
+	 * size used in any general-purpose processor).
+ */
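+	/*
+	 * As a concrete example, a 16MB region walked at this 512-byte
+	 * stride is touched 32768 times, once per smallest possible page.
+	 */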
+ ret = 0;
+ if (F_ISSET(env->dbenv, DB_ENV_REGION_INIT)) {
+ if (created)
+ for (p = addr,
+ t = (u_int8_t *)addr + size; p < t; p += 512)
+ p[0] = 0xdb;
+ else
+ for (p = addr,
+ t = (u_int8_t *)addr + size; p < t; p += 512)
+ ret |= p[0];
+ }
+
+ return (ret);
+}
diff --git a/src/env/env_register.c b/src/env/env_register.c
new file mode 100644
index 00000000..7475444d
--- /dev/null
+++ b/src/env/env_register.c
@@ -0,0 +1,730 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2004, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+#define REGISTER_FILE "__db.register"
+
+#define PID_EMPTY "X 0\n" /* Unused PID entry */
+#define PID_FMT "%24lu\n" /* PID entry format */
+ /* Unused PID test */
+#define PID_ISEMPTY(p) (memcmp(p, PID_EMPTY, PID_LEN) == 0)
+#define PID_LEN (25) /* PID entry length */
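+/*
+ * For illustration: with PID_FMT, process ID 12345 is stored as 19 spaces,
+ * the digits "12345" and a newline (25 bytes in all), so slot n always
+ * starts at byte offset n * PID_LEN.
+ */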
+
+#define REGISTRY_LOCK(env, pos, nowait) \
+ __os_fdlock(env, (env)->dbenv->registry, (off_t)(pos), 1, nowait)
+#define REGISTRY_UNLOCK(env, pos) \
+ __os_fdlock(env, (env)->dbenv->registry, (off_t)(pos), 0, 0)
+#define REGISTRY_EXCL_LOCK(env, nowait) \
+ REGISTRY_LOCK(env, 1, nowait)
+#define REGISTRY_EXCL_UNLOCK(env) \
+ REGISTRY_UNLOCK(env, 1)
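+/*
+ * The exclusive lock uses byte offset 1, while the per-slot locks use
+ * offsets that are multiples of PID_LEN (0, 25, 50, ...), so the two kinds
+ * of locks never collide.
+ */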
+
+static int __envreg_add __P((ENV *, int *, u_int32_t));
+static int __envreg_pid_compare __P((const void *, const void *));
+static int __envreg_create_active_pid __P((ENV *, char *));
+
+/*
+ * Support for portable, multi-process database environment locking, based on
+ * the Subversion SR (#11511).
+ *
+ * The registry feature is configured by specifying the DB_REGISTER flag to the
+ * DbEnv.open method. If DB_REGISTER is specified, DB opens the registry file
+ * in the database environment home directory. The registry file is formatted
+ * as follows:
+ *
+ * 12345 # process ID slot 1
+ * X # empty slot
+ * 12346 # process ID slot 2
+ * X # empty slot
+ * 12347 # process ID slot 3
+ * 12348 # process ID slot 4
+ * X 12349 # empty slot
+ * X # empty slot
+ *
+ * All lines are fixed-length. All lines are process ID slots. Empty slots
+ * are marked with leading non-digit characters.
+ *
+ * To modify the file, you get an exclusive lock on the first byte of the file.
+ *
+ * While holding any DbEnv handle, each process has an exclusive lock on the
+ * first byte of a process ID slot.  Because Berkeley DB uses per-process
+ * locking to implement this feature, a process may never have more than a
+ * single slot locked, which restricts each process to one open DbEnv handle
+ * at a time.
+ *
+ * This work requires that if a process dies or the system crashes, locks held
+ * by the dying processes will be dropped. (We can't use system shared
+ * memory-backed or filesystem-backed locks because they're persistent when a
+ * process dies.) On POSIX systems, we use fcntl(2) locks; on Win32 we have
+ * LockFileEx/UnlockFile, except for Win/9X and Win/ME which have to loop on
+ * LockFile/UnlockFile.
+ *
+ * We could implement the same solution with flock locking instead of fcntl,
+ * but flock would require a separate file for each process (and
+ * probably each DbEnv handle) in the database environment, which is fairly
+ * ugly.
+ *
+ * Whenever a process opens a new DbEnv handle, it walks the registry file and
+ * verifies it CANNOT acquire the lock for any non-empty slot. If a lock for
+ * a non-empty slot is available, we know a process died holding an open handle,
+ * and recovery needs to be run.
+ *
+ * It's possible to get corruption in the registry file. If a write system
+ * call fails after partially completing, there can be corrupted entries in
+ * the registry file, or a partial entry at the end of the file. This is OK.
+ * A corrupted entry will be flagged as a non-empty line during the registry
+ * file walk. Since the line was corrupted by process failure, no process will
+ * hold a lock on the slot, which will lead to recovery being run.
+ *
+ * There can still be processes running in the environment when we recover it,
+ * and, in fact, there can still be processes running in the old environment
+ * after we're up and running in a new one. This is safe because performing
+ * recovery panics (and removes) the existing environment, so the window of
+ * vulnerability is small.  Further, we check the panic flag in the DB API
+ * methods, when waking from spinning on a mutex, and whenever we're about
+ * to write to disk.  The only window of corruption is if the check of the
+ * panic flag before a write were to complete, the region subsequently be
+ * recovered, and the write then continue.  That's very, very unlikely to
+ * happen.  This vulnerability already exists in Berkeley DB anyway; the
+ * registry code doesn't make it any worse than it already is.
+ *
+ * The only way to avoid that window entirely is to ensure that all processes
+ * in the Berkeley DB environment exit before we run recovery. Applications
+ * can do that if they maintain their own process registry outside of Berkeley
+ * DB, but it's a little more difficult to do here. The obvious approach is
+ * to send signals to any process using the database environment as soon as we
+ * decide to run recovery, but there are problems with that approach: we might
+ * not have permission to send signals to the process, the process might have
+ * signal handlers installed, the cookie stored might not be the same as kill's
+ * argument, we may not be able to reliably tell if the process died, and there
+ * are probably other problems. However, if we can send a signal, it reduces
+ * the window, and so we include the code here. To configure it, turn on the
+ * DB_ENVREG_KILL_ALL #define.
+ */
+#define DB_ENVREG_KILL_ALL 0
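+
+/*
+ * As a minimal sketch of the locking primitive (assuming a POSIX system;
+ * the real work is done by the OS layer's __os_fdlock), acquiring or
+ * dropping the lock on one byte of the registry file amounts to:
+ *
+ *	struct flock fl;
+ *	memset(&fl, 0, sizeof(fl));
+ *	fl.l_type = acquire ? F_WRLCK : F_UNLCK;
+ *	fl.l_whence = SEEK_SET;
+ *	fl.l_start = offset;
+ *	fl.l_len = 1;
+ *	ret = fcntl(fd, nowait ? F_SETLK : F_SETLKW, &fl);
+ *
+ * fcntl(2) locks are dropped automatically when the owning process exits,
+ * which is exactly the property the registry depends on.
+ */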
+
+/*
+ * __envreg_register --
+ *	Register an ENV handle.
+ *
+ * PUBLIC: int __envreg_register __P((ENV *, int *, u_int32_t));
+ */
+int
+__envreg_register(env, need_recoveryp, flags)
+ ENV *env;
+ int *need_recoveryp;
+ u_int32_t flags;
+{
+ DB_ENV *dbenv;
+ pid_t pid;
+ u_int32_t bytes, mbytes;
+ int ret;
+ char *pp;
+
+ *need_recoveryp = 0;
+
+ dbenv = env->dbenv;
+ dbenv->thread_id(dbenv, &pid, NULL);
+ pp = NULL;
+
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
+ __db_msg(env, DB_STR_A("1524",
+ "%lu: register environment", "%lu"), (u_long)pid);
+
+ /* Build the path name and open the registry file. */
+ if ((ret = __db_appname(env,
+ DB_APP_NONE, REGISTER_FILE, NULL, &pp)) != 0)
+ goto err;
+ if ((ret = __os_open(env, pp, 0,
+ DB_OSO_CREATE, DB_MODE_660, &dbenv->registry)) != 0)
+ goto err;
+
+ /*
+ * Wait for an exclusive lock on the file.
+ *
+ * !!!
+ * We're locking bytes that don't yet exist, but that's OK as far as
+ * I know.
+ */
+ if ((ret = REGISTRY_EXCL_LOCK(env, 0)) != 0)
+ goto err;
+
+ /*
+ * If the file size is 0, initialize the file.
+ *
+	 * Run recovery if we created the file; that way, the system can be
+	 * cleaned up by removing the registry file and restarting the
+	 * application.
+ */
+ if ((ret = __os_ioinfo(
+ env, pp, dbenv->registry, &mbytes, &bytes, NULL)) != 0)
+ goto err;
+ if (mbytes == 0 && bytes == 0) {
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
+ __db_msg(env, DB_STR_A("1525",
+ "%lu: creating %s", "%lu %s"), (u_long)pid, pp);
+ *need_recoveryp = 1;
+ }
+
+ /* Register this process. */
+ if ((ret = __envreg_add(env, need_recoveryp, flags)) != 0)
+ goto err;
+
+ /*
+ * Release our exclusive lock if we don't need to run recovery. If
+ * we need to run recovery, ENV->open will call back into register
+ * code once recovery has completed.
+ */
+ if (*need_recoveryp == 0 && (ret = REGISTRY_EXCL_UNLOCK(env)) != 0)
+ goto err;
+
+ if (0) {
+err: *need_recoveryp = 0;
+
+ /*
+ * !!!
+ * Closing the file handle must release all of our locks.
+ */
+ if (dbenv->registry != NULL)
+ (void)__os_closehandle(env, dbenv->registry);
+ dbenv->registry = NULL;
+ }
+
+ if (pp != NULL)
+ __os_free(env, pp);
+
+ return (ret);
+}
+
+/*
+ * __envreg_add --
+ * Add the process' pid to the register.
+ */
+static int
+__envreg_add(env, need_recoveryp, flags)
+ ENV *env;
+ int *need_recoveryp;
+ u_int32_t flags;
+{
+ DB_ENV *dbenv;
+ DB_THREAD_INFO *ip;
+	REGENV *renv;
+ REGINFO *infop;
+ pid_t pid;
+ off_t end, pos, dead;
+ size_t nr, nw;
+ u_int lcnt;
+ u_int32_t bytes, mbytes, orig_flags;
+ int need_recovery, ret, t_ret;
+ char *p, buf[PID_LEN + 10], pid_buf[PID_LEN + 10];
+
+ dbenv = env->dbenv;
+ need_recovery = 0;
+ COMPQUIET(dead, 0);
+ COMPQUIET(p, NULL);
+ ip = NULL;
+
+ /* Get a copy of our process ID. */
+ dbenv->thread_id(dbenv, &pid, NULL);
+ snprintf(pid_buf, sizeof(pid_buf), PID_FMT, (u_long)pid);
+
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
+ __db_msg(env, DB_STR_A("1526",
+ "%lu: adding self to registry", "%lu"), (u_long)pid);
+
+#if DB_ENVREG_KILL_ALL
+ if (0) {
+kill_all: /*
+ * A second pass through the file, this time killing any
+ * processes still running.
+ */
+ if ((ret = __os_seek(env, dbenv->registry, 0, 0, 0)) != 0)
+ return (ret);
+ }
+#endif
+
+ /*
+ * Read the file. Skip empty slots, and check that a lock is held
+ * for any allocated slots. An allocated slot which we can lock
+ * indicates a process died holding a handle and recovery needs to
+ * be run.
+ */
+ for (lcnt = 0;; ++lcnt) {
+ if ((ret = __os_read(
+ env, dbenv->registry, buf, PID_LEN, &nr)) != 0)
+ return (ret);
+ if (nr == 0)
+ break;
+
+ /*
+ * A partial record at the end of the file is possible if a
+ * previously un-registered process was interrupted while
+ * registering.
+ */
+ if (nr != PID_LEN) {
+ need_recovery = 1;
+ break;
+ }
+
+ if (PID_ISEMPTY(buf)) {
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
+ __db_msg(env, DB_STR_A("1527",
+ "%02u: EMPTY", "%02u"), lcnt);
+ continue;
+ }
+
+ /*
+ * !!!
+		 * DB_REGISTER is implemented using per-process locking; only
+ * a single ENV handle may be open per process. Enforce
+ * that restriction.
+ */
+ if (memcmp(buf, pid_buf, PID_LEN) == 0) {
+ __db_errx(env, DB_STR("1528",
+"DB_REGISTER limits processes to one open DB_ENV handle per environment"));
+ return (EINVAL);
+ }
+
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER)) {
+ for (p = buf; *p == ' ';)
+ ++p;
+ buf[nr - 1] = '\0';
+ }
+
+#if DB_ENVREG_KILL_ALL
+ if (need_recovery) {
+ pid = (pid_t)strtoul(buf, NULL, 10);
+ (void)kill(pid, SIGKILL);
+
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
+ __db_msg(env, DB_STR_A("1529",
+ "%02u: %s: KILLED", "%02u %s"), lcnt, p);
+ continue;
+ }
+#endif
+ pos = (off_t)lcnt * PID_LEN;
+ if (REGISTRY_LOCK(env, pos, 1) == 0) {
+ if ((ret = REGISTRY_UNLOCK(env, pos)) != 0)
+ return (ret);
+
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
+ __db_msg(env, DB_STR_A("1530",
+ "%02u: %s: FAILED", "%02u %s"), lcnt, p);
+
+ need_recovery = 1;
+ dead = pos;
+#if DB_ENVREG_KILL_ALL
+ goto kill_all;
+#else
+ break;
+#endif
+ } else
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
+ __db_msg(env, DB_STR_A("1531",
+ "%02u: %s: LOCKED", "%02u %s"), lcnt, p);
+ }
+
+ /*
+ * If we have to perform recovery...
+ *
+ * Mark all slots empty. Registry ignores empty slots we can't lock,
+ * so it doesn't matter if any of the processes are in the middle of
+ * exiting Berkeley DB -- they'll discard their lock when they exit.
+ */
+ if (need_recovery) {
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
+ __db_msg(env, "%lu: recovery required", (u_long)pid);
+
+ if (LF_ISSET(DB_FAILCHK) || LF_ISSET(DB_FAILCHK_ISALIVE)) {
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
+ __db_msg(env,
+ "%lu: performing failchk", (u_long)pid);
+
+ if (LF_ISSET(DB_FAILCHK_ISALIVE))
+ if ((ret = __envreg_create_active_pid(
+ env, pid_buf)) != 0)
+ goto sig_proc;
+
+			/*
+			 * The environment will already exist, so we do not
+			 * want DB_CREATE set, nor do we want any recovery at
+			 * this point.  There is no need to put values back,
+			 * as flags is passed in by value.  Save the original
+			 * dbenv flags in case we need to recover/remove the
+			 * existing environment.  Set DB_ENV_FAILCHK before
+			 * attaching to help ensure we don't block on a mutex
+			 * held by the dead process.
+			 */
+ LF_CLR(DB_CREATE | DB_RECOVER | DB_RECOVER_FATAL);
+ orig_flags = dbenv->flags;
+ F_SET(dbenv, DB_ENV_FAILCHK);
+ /* Attach to environment and subsystems. */
+ if ((ret = __env_attach_regions(
+ dbenv, flags, orig_flags, 0)) != 0)
+ goto sig_proc;
+ if ((t_ret =
+ __env_set_state(env, &ip, THREAD_FAILCHK)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ if ((t_ret =
+ __env_failchk_int(dbenv)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Free active pid array if used. */
+ if (LF_ISSET(DB_FAILCHK_ISALIVE)) {
+ DB_GLOBAL(num_active_pids) = 0;
+ DB_GLOBAL(size_active_pids) = 0;
+				__os_free(env, DB_GLOBAL(active_pids));
+ }
+
+ /* Detach from environment and deregister thread. */
+ if ((t_ret =
+ __env_refresh(dbenv, orig_flags, 0)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ if (ret == 0) {
+ if ((ret = __os_seek(env, dbenv->registry,
+ 0, 0,(u_int32_t)dead)) != 0 ||
+ (ret = __os_write(env, dbenv->registry,
+ PID_EMPTY, PID_LEN, &nw)) != 0)
+ return (ret);
+ need_recovery = 0;
+ goto add;
+ }
+
+ }
+ /* If we can't attach, then we cannot set DB_REGISTER panic. */
+sig_proc: if (__env_attach(env, NULL, 0, 0) == 0) {
+ infop = env->reginfo;
+ renv = infop->primary;
+			/*
+			 * Indicate a DB_REGISTER panic.  Also set the
+			 * environment panic flag, as that is the panic
+			 * trigger mechanism everything in the code looks for.
+			 */
+ renv->reg_panic = 1;
+ renv->panic = 1;
+ (void)__env_detach(env, 0);
+ }
+
+ /* Wait for processes to see the panic and leave. */
+ __os_yield(env, 0, dbenv->envreg_timeout);
+
+		/* Figure out how big the file is. */
+ if ((ret = __os_ioinfo(
+ env, NULL, dbenv->registry, &mbytes, &bytes, NULL)) != 0)
+ return (ret);
+ end = (off_t)mbytes * MEGABYTE + bytes;
+
+ /*
+ * Seek to the beginning of the file and overwrite slots to
+ * the end of the file.
+ *
+ * It's possible for there to be a partial entry at the end of
+ * the file if a process died when trying to register. If so,
+ * correct for it and overwrite it as well.
+ */
+ if ((ret = __os_seek(env, dbenv->registry, 0, 0, 0)) != 0)
+ return (ret);
+ for (lcnt = 0; lcnt < ((u_int)end / PID_LEN +
+ ((u_int)end % PID_LEN == 0 ? 0 : 1)); ++lcnt) {
+
+ if ((ret = __os_read(
+ env, dbenv->registry, buf, PID_LEN, &nr)) != 0)
+ return (ret);
+
+ pos = (off_t)lcnt * PID_LEN;
+			/* Do not notify on the dead process. */
+ if (pos != dead) {
+ pid = (pid_t)strtoul(buf, NULL, 10);
+ DB_EVENT(env, DB_EVENT_REG_ALIVE, &pid);
+ }
+
+ if ((ret = __os_seek(env,
+ dbenv->registry, 0, 0, (u_int32_t)pos)) != 0 ||
+ (ret = __os_write(env,
+ dbenv->registry, PID_EMPTY, PID_LEN, &nw)) != 0)
+ return (ret);
+ }
+		/* Wait one last time to get everyone out. */
+ __os_yield(env, 0, dbenv->envreg_timeout);
+ }
+
+ /*
+ * Seek to the first process slot and add ourselves to the first empty
+ * slot we can lock.
+ */
+add: if ((ret = __os_seek(env, dbenv->registry, 0, 0, 0)) != 0)
+ return (ret);
+ for (lcnt = 0;; ++lcnt) {
+ if ((ret = __os_read(
+ env, dbenv->registry, buf, PID_LEN, &nr)) != 0)
+ return (ret);
+ if (nr == PID_LEN && !PID_ISEMPTY(buf))
+ continue;
+ pos = (off_t)lcnt * PID_LEN;
+ if (REGISTRY_LOCK(env, pos, 1) == 0) {
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
+ __db_msg(env, DB_STR_A("1532",
+ "%lu: locking slot %02u at offset %lu",
+ "%lu %02u %lu"), (u_long)pid, lcnt,
+ (u_long)pos);
+
+ if ((ret = __os_seek(env,
+ dbenv->registry, 0, 0, (u_int32_t)pos)) != 0 ||
+ (ret = __os_write(env,
+ dbenv->registry, pid_buf, PID_LEN, &nw)) != 0)
+ return (ret);
+ dbenv->registry_off = (u_int32_t)pos;
+ break;
+ }
+ }
+
+ if (need_recovery)
+ *need_recoveryp = 1;
+
+ return (ret);
+}
+
+/*
+ * __envreg_unregister --
+ *	Unregister an ENV handle.
+ *
+ * PUBLIC: int __envreg_unregister __P((ENV *, int));
+ */
+int
+__envreg_unregister(env, recovery_failed)
+ ENV *env;
+ int recovery_failed;
+{
+ DB_ENV *dbenv;
+ size_t nw;
+ int ret, t_ret;
+
+ dbenv = env->dbenv;
+ ret = 0;
+
+ /*
+ * If recovery failed, we want to drop our locks and return, but still
+ * make sure any subsequent process doesn't decide everything is just
+ * fine and try to get into the database environment. In the case of
+ * an error, discard our locks, but leave our slot filled-in.
+ */
+ if (recovery_failed)
+ goto err;
+
+ /*
+	 * Why isn't an exclusive lock necessary to discard an ENV handle?
+ *
+ * We mark our process ID slot empty before we discard the process slot
+ * lock, and threads of control reviewing the register file ignore any
+ * slots which they can't lock.
+ */
+ if ((ret = __os_seek(env,
+ dbenv->registry, 0, 0, dbenv->registry_off)) != 0 ||
+ (ret = __os_write(
+ env, dbenv->registry, PID_EMPTY, PID_LEN, &nw)) != 0)
+ goto err;
+
+ /*
+ * !!!
+ * This code assumes that closing the file descriptor discards all
+ * held locks.
+ *
+ * !!!
+ * There is an ordering problem here -- in the case of a process that
+ * failed in recovery, we're unlocking both the exclusive lock and our
+ * slot lock. If the OS unlocked the exclusive lock and then allowed
+ * another thread of control to acquire the exclusive lock before also
+ * releasing our slot lock, we could race.  That can't happen, I
+ * don't think.
+ */
+err: if ((t_ret =
+ __os_closehandle(env, dbenv->registry)) != 0 && ret == 0)
+ ret = t_ret;
+
+ dbenv->registry = NULL;
+ return (ret);
+}
+
+/*
+ * __envreg_xunlock --
+ * Discard the exclusive lock held by the ENV handle.
+ *
+ * PUBLIC: int __envreg_xunlock __P((ENV *));
+ */
+int
+__envreg_xunlock(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+ pid_t pid;
+ int ret;
+
+ dbenv = env->dbenv;
+ dbenv->thread_id(dbenv, &pid, NULL);
+
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
+ __db_msg(env, DB_STR_A("1533",
+ "%lu: recovery completed, unlocking", "%lu"), (u_long)pid);
+
+ if ((ret = REGISTRY_EXCL_UNLOCK(env)) == 0)
+ return (ret);
+
+ __db_err(env, ret, DB_STR_A("1534",
+ "%s: exclusive file unlock", "%s"), REGISTER_FILE);
+ return (__env_panic(env, ret));
+}
+
+/*
+ * __envreg_pid_compare --
+ * Compare routine for qsort and bsearch calls.
+ *	Returns a negative value if key is less than membr, 0 if they are
+ *	equal, and a positive value if key is greater than membr.
+ */
+static int
+__envreg_pid_compare(key, membr)
+ const void *key;
+ const void *membr;
+{
+	return (*(pid_t *)key - *(pid_t *)membr);
+}
+
+/*
+ * __envreg_isalive --
+ *	Default isalive function.  Uses the contents of an array of active
+ *	pids taken from the db_register file to determine whether a process
+ *	is still alive.
+ *
+ * PUBLIC: int __envreg_isalive
+ * PUBLIC: __P((DB_ENV *, pid_t, db_threadid_t, u_int32_t));
+ */
+int
+__envreg_isalive(dbenv, pid, tid, flags)
+ DB_ENV *dbenv;
+ pid_t pid;
+ db_threadid_t tid;
+ u_int32_t flags;
+{
+	/* We do not use tid; initialize it simply to quiet lint. */
+ DB_THREADID_INIT(tid);
+
+	/* If flags is not an expected value, return early. */
+ if (!((flags == 0) || (flags == DB_MUTEX_PROCESS_ONLY)))
+ return (EINVAL);
+
+ if (DB_GLOBAL(active_pids) == NULL ||
+ DB_GLOBAL(num_active_pids) == 0 || dbenv == NULL)
+ return (0);
+ /*
+ * bsearch returns a pointer to an entry in active_pids if a match
+	 * is found on pid; otherwise, it returns NULL.  This routine
+	 * returns 1 if a match is found, else 0.
+ */
+ if (bsearch(&pid, DB_GLOBAL(active_pids), DB_GLOBAL(num_active_pids),
+ sizeof(pid_t), __envreg_pid_compare))
+		return (1);
+
+ return (0);
+}
+
+/*
+ * __envreg_create_active_pid --
+ *	Create the array of pids, doubling its size whenever more room is
+ *	needed.  Only active pids from the DB_REGISTER file are added.
+ */
+static int
+__envreg_create_active_pid(env, my_pid)
+ ENV *env;
+ char *my_pid;
+{
+ DB_ENV *dbenv;
+ char buf[PID_LEN + 10];
+ int ret;
+ off_t pos;
+ pid_t pid, *tmparray;
+ size_t tmpsize, nr;
+ u_int lcnt;
+
+ dbenv = env->dbenv;
+ pos = 0;
+ ret = 0;
+
+ /*
+	 * Walk through the DB_REGISTER file, grabbing the pid entries that
+	 * are locked, as those represent processes that are still alive.
+	 * Ignore empty slots and those that are unlocked.
+ */
+ if ((ret = __os_seek(env, dbenv->registry, 0, 0, 0)) != 0)
+ return (ret);
+ for (lcnt = 0;; ++lcnt) {
+ if ((ret = __os_read(
+ env, dbenv->registry, buf, PID_LEN, &nr)) != 0)
+ return (ret);
+
+		/* All done if we read nothing or get a partial record. */
+ if (nr == 0 || nr != PID_LEN)
+ break;
+ if (PID_ISEMPTY(buf))
+ continue;
+
+ pos = (off_t)lcnt * PID_LEN;
+ if (REGISTRY_LOCK(env, pos, 1) == 0) {
+			/* We got the lock, so the process died; do not add it. */
+ if ((ret = REGISTRY_UNLOCK(env, pos)) != 0)
+ return (ret);
+ } else {
+			/* First, check to make sure we have room in the array. */
+ if (DB_GLOBAL(num_active_pids) + 1 >
+ DB_GLOBAL(size_active_pids)) {
+ tmpsize =
+ DB_GLOBAL(size_active_pids) * sizeof(pid_t);
+
+				/* Start with 512 bytes; double each time we must grow. */
+				tmpsize = tmpsize > 0 ? tmpsize * 2 : 512;
+				if ((ret = __os_malloc(
+				    env, tmpsize, &tmparray)) != 0)
+ return (ret);
+
+				/* If an old array exists, copy its contents over and free it. */
+ if (DB_GLOBAL(active_pids)) {
+					memcpy(tmparray,
+ DB_GLOBAL(active_pids),
+ DB_GLOBAL(num_active_pids) *
+ sizeof(pid_t));
+					__os_free(env, DB_GLOBAL(active_pids));
+ }
+
+ DB_GLOBAL(active_pids) = tmparray;
+				/* The capacity is counted in entries, not bytes. */
+				DB_GLOBAL(size_active_pids) =
+				    tmpsize / sizeof(pid_t);
+
+ /*
+ * The process getting here has not been added
+ * to the DB_REGISTER file yet, so include it
+				 * as the first item in the array.
+ */
+ if (DB_GLOBAL(num_active_pids) == 0) {
+ pid = (pid_t)strtoul(my_pid, NULL, 10);
+ DB_GLOBAL(active_pids)
+ [DB_GLOBAL(num_active_pids)++] = pid;
+ }
+ }
+
+ /* insert into array */
+ pid = (pid_t)strtoul(buf, NULL, 10);
+ DB_GLOBAL(active_pids)
+ [DB_GLOBAL(num_active_pids)++] = pid;
+		}
+	}
+
+	/* Sort the array to allow binary search in the isalive function. */
+ qsort(DB_GLOBAL(active_pids), DB_GLOBAL(num_active_pids),
+ sizeof(pid_t), __envreg_pid_compare);
+ return (ret);
+}
diff --git a/src/env/env_sig.c b/src/env/env_sig.c
new file mode 100644
index 00000000..6d127f85
--- /dev/null
+++ b/src/env/env_sig.c
@@ -0,0 +1,201 @@
+/*-
+ * DO NOT EDIT: automatically built by dist/s_sig.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_join.h"
+#include "dbinc/db_verify.h"
+#include "dbinc/hash.h"
+#include "dbinc/heap.h"
+#include "dbinc/lock.h"
+#include "dbinc/log_verify.h"
+#include "dbinc/mp.h"
+#include "dbinc/partition.h"
+#include "dbinc/qam.h"
+#include "dbinc/txn.h"
+
+/*
+ * For a pure 32-bit/64-bit environment, we check all structures and
+ * calculate a signature.  For a compatible (mixed-size-addressing)
+ * environment, we only check the structures in shared memory.
+ */
+#ifdef HAVE_MIXED_SIZE_ADDRESSING
+#define __STRUCTURE_COUNT 41
+#else
+#define __STRUCTURE_COUNT (41 + 104)
+#endif
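+
+/*
+ * The signature is simply a hash over the array of structure sizes built
+ * below: if any listed structure changes size between builds, the two
+ * builds compute different signatures, so a process built against a
+ * different layout can detect the mismatch.
+ */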
+
+/*
+ * __env_struct_sig --
+ * Compute signature of structures.
+ *
+ * PUBLIC: u_int32_t __env_struct_sig __P((void));
+ */
+u_int32_t
+__env_struct_sig()
+{
+ u_short t[__STRUCTURE_COUNT + 5];
+ u_int i;
+
+ i = 0;
+#define __ADD(s) (t[i++] = sizeof(struct s))
+
+#ifdef HAVE_MUTEX_SUPPORT
+ __ADD(__db_mutex_stat);
+#endif
+ __ADD(__db_lock_stat);
+ __ADD(__db_lock_hstat);
+ __ADD(__db_lock_pstat);
+ __ADD(__db_ilock);
+ __ADD(__db_lock_u);
+ __ADD(__db_lsn);
+ __ADD(__db_log_stat);
+ __ADD(__db_mpool_stat);
+ __ADD(__db_rep_stat);
+ __ADD(__db_repmgr_stat);
+ __ADD(__db_seq_stat);
+ __ADD(__db_bt_stat);
+ __ADD(__db_h_stat);
+ __ADD(__db_heap_stat);
+ __ADD(__db_qam_stat);
+ __ADD(__db_thread_info);
+ __ADD(__db_lockregion);
+ __ADD(__sh_dbt);
+ __ADD(__db_lockobj);
+ __ADD(__db_locker);
+ __ADD(__db_lockpart);
+ __ADD(__db_lock);
+ __ADD(__log);
+ __ADD(__mpool);
+ __ADD(__db_mpool_fstat_int);
+ __ADD(__mpoolfile);
+ __ADD(__bh);
+#ifdef HAVE_MUTEX_SUPPORT
+ __ADD(__db_mutexregion);
+#endif
+#ifdef HAVE_MUTEX_SUPPORT
+ __ADD(__db_mutex_t);
+#endif
+ __ADD(__db_reg_env);
+ __ADD(__db_region);
+ __ADD(__rep);
+ __ADD(__db_txn_stat_int);
+ __ADD(__db_txnregion);
+
+#ifndef HAVE_MIXED_SIZE_ADDRESSING
+ __ADD(__db_dbt);
+ __ADD(__db_lockreq);
+ __ADD(__db_log_cursor);
+ __ADD(__log_rec_spec);
+ __ADD(__db_mpoolfile);
+ __ADD(__db_mpool_fstat);
+ __ADD(__db_txn);
+ __ADD(__kids);
+ __ADD(__my_cursors);
+ __ADD(__femfs);
+ __ADD(__db_preplist);
+ __ADD(__db_txn_active);
+ __ADD(__db_txn_stat);
+ __ADD(__db_txn_token);
+ __ADD(__db_repmgr_site);
+ __ADD(__db_repmgr_conn_err);
+ __ADD(__db_seq_record);
+ __ADD(__db_sequence);
+ __ADD(__db);
+ __ADD(__cq_fq);
+ __ADD(__cq_aq);
+ __ADD(__cq_jq);
+ __ADD(__db_heap_rid);
+ __ADD(__dbc);
+ __ADD(__key_range);
+ __ADD(__db_compact);
+ __ADD(__db_env);
+ __ADD(__db_distab);
+ __ADD(__db_logvrfy_config);
+ __ADD(__db_channel);
+ __ADD(__db_site);
+ __ADD(__fn);
+ __ADD(__db_msgbuf);
+ __ADD(__pin_list);
+ __ADD(__env_thread_info);
+ __ADD(__flag_map);
+ __ADD(__db_backup_handle);
+ __ADD(__env);
+ __ADD(__dbc_internal);
+ __ADD(__dbpginfo);
+ __ADD(__epg);
+ __ADD(__cursor);
+ __ADD(__btree);
+ __ADD(__db_cipher);
+ __ADD(__db_foreign_info);
+ __ADD(__db_txnhead);
+ __ADD(__db_txnlist);
+ __ADD(__join_cursor);
+ __ADD(__pg_chksum);
+ __ADD(__pg_crypto);
+ __ADD(__heaphdr);
+ __ADD(__heaphdrsplt);
+ __ADD(__pglist);
+ __ADD(__vrfy_dbinfo);
+ __ADD(__vrfy_pageinfo);
+ __ADD(__vrfy_childinfo);
+ __ADD(__db_globals);
+ __ADD(__envq);
+ __ADD(__heap);
+ __ADD(__heap_cursor);
+ __ADD(__db_locktab);
+ __ADD(__db_entry);
+ __ADD(__fname);
+ __ADD(__db_log);
+ __ADD(__hdr);
+ __ADD(__log_persist);
+ __ADD(__db_commit);
+ __ADD(__db_filestart);
+ __ADD(__log_rec_hdr);
+ __ADD(__db_log_verify_info);
+ __ADD(__txn_verify_info);
+ __ADD(__lv_filereg_info);
+ __ADD(__lv_filelife);
+ __ADD(__lv_ckp_info);
+ __ADD(__lv_timestamp_info);
+ __ADD(__lv_txnrange);
+ __ADD(__add_recycle_params);
+ __ADD(__ckp_verify_params);
+ __ADD(__db_mpool);
+ __ADD(__db_mpreg);
+ __ADD(__db_mpool_hash);
+ __ADD(__bh_frozen_p);
+ __ADD(__bh_frozen_a);
+#ifdef HAVE_MUTEX_SUPPORT
+ __ADD(__db_mutexmgr);
+#endif
+ __ADD(__fh_t);
+ __ADD(__db_partition);
+ __ADD(__part_internal);
+ __ADD(__qcursor);
+ __ADD(__mpfarray);
+ __ADD(__qmpf);
+ __ADD(__queue);
+ __ADD(__qam_filelist);
+ __ADD(__db_reg_env_ref);
+ __ADD(__db_region_mem_t);
+ __ADD(__db_reginfo_t);
+ __ADD(__rep_waiter);
+ __ADD(__db_rep);
+ __ADD(__rep_lease_entry);
+ __ADD(__txn_detail);
+ __ADD(__db_txnmgr);
+ __ADD(__db_commit_info);
+ __ADD(__txn_logrec);
+#endif
+
+ return (__ham_func5(NULL, t, i * sizeof(t[0])));
+}
diff --git a/src/env/env_stat.c b/src/env/env_stat.c
new file mode 100644
index 00000000..9bc3fe7e
--- /dev/null
+++ b/src/env/env_stat.c
@@ -0,0 +1,879 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+#ifdef HAVE_STATISTICS
+static int __env_print_all __P((ENV *, u_int32_t));
+static int __env_print_dbenv_all __P((ENV *, u_int32_t));
+static int __env_print_env_all __P((ENV *, u_int32_t));
+static int __env_print_fh __P((ENV *));
+static int __env_print_stats __P((ENV *, u_int32_t));
+static int __env_print_thread __P((ENV *));
+static int __env_stat_print __P((ENV *, u_int32_t));
+static char *__env_thread_state_print __P((DB_THREAD_STATE));
+static const char *
+ __reg_type __P((reg_type_t));
+
+/*
+ * __env_stat_print_pp --
+ * ENV->stat_print pre/post processor.
+ *
+ * PUBLIC: int __env_stat_print_pp __P((DB_ENV *, u_int32_t));
+ */
+int
+__env_stat_print_pp(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_BEFORE_OPEN(env, "DB_ENV->stat_print");
+
+ if ((ret = __db_fchk(env, "DB_ENV->stat_print",
+ flags, DB_STAT_ALL | DB_STAT_ALLOC |
+ DB_STAT_CLEAR | DB_STAT_SUBSYSTEM)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__env_stat_print(env, flags)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __env_stat_print --
+ * ENV->stat_print method.
+ */
+static int
+__env_stat_print(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ time_t now;
+ int ret;
+ char time_buf[CTIME_BUFLEN];
+
+ (void)time(&now);
+ __db_msg(env, "%.24s\tLocal time", __os_ctime(&now, time_buf));
+
+ if ((ret = __env_print_stats(env, flags)) != 0)
+ return (ret);
+
+ if (LF_ISSET(DB_STAT_ALL) &&
+ (ret = __env_print_all(env, flags)) != 0)
+ return (ret);
+
+ if ((ret = __env_print_thread(env)) != 0)
+ return (ret);
+
+ if ((ret = __env_print_fh(env)) != 0)
+ return (ret);
+
+ if (!LF_ISSET(DB_STAT_SUBSYSTEM))
+ return (0);
+
+ if (LOGGING_ON(env)) {
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ if ((ret = __log_stat_print(env, flags)) != 0)
+ return (ret);
+
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ if ((ret = __dbreg_stat_print(env, flags)) != 0)
+ return (ret);
+ }
+
+ if (LOCKING_ON(env)) {
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ if ((ret = __lock_stat_print(env, flags)) != 0)
+ return (ret);
+ }
+
+ if (MPOOL_ON(env)) {
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ if ((ret = __memp_stat_print(env, flags)) != 0)
+ return (ret);
+ }
+
+ if (REP_ON(env)) {
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ if ((ret = __rep_stat_print(env, flags)) != 0)
+ return (ret);
+#ifdef HAVE_REPLICATION_THREADS
+ if ((ret = __repmgr_stat_print(env, flags)) != 0)
+ return (ret);
+#endif
+ }
+
+ if (TXN_ON(env)) {
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ if ((ret = __txn_stat_print(env, flags)) != 0)
+ return (ret);
+ }
+
+#ifdef HAVE_MUTEX_SUPPORT
+ /*
+ * Dump the mutexes last. If DB_STAT_CLEAR is set this will
+ * clear out the mutex counters and we want to see them in
+ * the context of the other subsystems first.
+ */
+ if (MUTEX_ON(env)) {
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ if ((ret = __mutex_stat_print(env, flags)) != 0)
+ return (ret);
+ }
+#endif
+
+ return (0);
+}
+
+/*
+ * __env_print_stats --
+ * Display the default environment statistics.
+ */
+static int
+__env_print_stats(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ REGENV *renv;
+ REGINFO *infop;
+ char time_buf[CTIME_BUFLEN];
+
+ infop = env->reginfo;
+ renv = infop->primary;
+
+ if (LF_ISSET(DB_STAT_ALL)) {
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "Default database environment information:");
+ }
+ STAT_HEX("Magic number", renv->magic);
+ STAT_LONG("Panic value", renv->panic);
+ __db_msg(env, "%d.%d.%d\tEnvironment version",
+ renv->majver, renv->minver, renv->patchver);
+ STAT_LONG("Btree version", DB_BTREEVERSION);
+ STAT_LONG("Hash version", DB_HASHVERSION);
+ STAT_LONG("Lock version", DB_LOCKVERSION);
+ STAT_LONG("Log version", DB_LOGVERSION);
+ STAT_LONG("Queue version", DB_QAMVERSION);
+ STAT_LONG("Sequence version", DB_SEQUENCE_VERSION);
+ STAT_LONG("Txn version", DB_TXNVERSION);
+ __db_msg(env,
+ "%.24s\tCreation time", __os_ctime(&renv->timestamp, time_buf));
+ STAT_HEX("Environment ID", renv->envid);
+ __mutex_print_debug_single(env,
+ "Primary region allocation and reference count mutex",
+ renv->mtx_regenv, flags);
+ STAT_LONG("References", renv->refcnt);
+ __db_dlbytes(env, "Current region size",
+ (u_long)0, (u_long)0, (u_long)infop->rp->size);
+ __db_dlbytes(env, "Maximum region size",
+ (u_long)0, (u_long)0, (u_long)infop->rp->max);
+
+ return (0);
+}
+
+/*
+ * __env_print_all --
+ * Display the debugging environment statistics.
+ */
+static int
+__env_print_all(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ int ret, t_ret;
+
+ /*
+ * There are two structures -- DB_ENV and ENV.
+ */
+ ret = __env_print_dbenv_all(env, flags);
+ if ((t_ret = __env_print_env_all(env, flags)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __env_print_dbenv_all --
+ * Display the debugging environment statistics.
+ */
+static int
+__env_print_dbenv_all(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ static const FN db_env_fn[] = {
+ { DB_ENV_AUTO_COMMIT, "DB_ENV_AUTO_COMMIT" },
+ { DB_ENV_CDB_ALLDB, "DB_ENV_CDB_ALLDB" },
+ { DB_ENV_DIRECT_DB, "DB_ENV_DIRECT_DB" },
+ { DB_ENV_DSYNC_DB, "DB_ENV_DSYNC_DB" },
+ { DB_ENV_MULTIVERSION, "DB_ENV_MULTIVERSION" },
+ { DB_ENV_NOLOCKING, "DB_ENV_NOLOCKING" },
+ { DB_ENV_NOMMAP, "DB_ENV_NOMMAP" },
+ { DB_ENV_NOPANIC, "DB_ENV_NOPANIC" },
+ { DB_ENV_OVERWRITE, "DB_ENV_OVERWRITE" },
+ { DB_ENV_REGION_INIT, "DB_ENV_REGION_INIT" },
+ { DB_ENV_TIME_NOTGRANTED, "DB_ENV_TIME_NOTGRANTED" },
+ { DB_ENV_TXN_NOSYNC, "DB_ENV_TXN_NOSYNC" },
+ { DB_ENV_TXN_NOWAIT, "DB_ENV_TXN_NOWAIT" },
+ { DB_ENV_TXN_SNAPSHOT, "DB_ENV_TXN_SNAPSHOT" },
+ { DB_ENV_TXN_WRITE_NOSYNC, "DB_ENV_TXN_WRITE_NOSYNC" },
+ { DB_ENV_YIELDCPU, "DB_ENV_YIELDCPU" },
+ { 0, NULL }
+ };
+ static const FN vfn[] = {
+ { DB_VERB_DEADLOCK, "DB_VERB_DEADLOCK" },
+ { DB_VERB_FILEOPS, "DB_VERB_FILEOPS" },
+ { DB_VERB_FILEOPS_ALL, "DB_VERB_FILEOPS_ALL" },
+ { DB_VERB_RECOVERY, "DB_VERB_RECOVERY" },
+ { DB_VERB_REGISTER, "DB_VERB_REGISTER" },
+ { DB_VERB_REPLICATION, "DB_VERB_REPLICATION" },
+ { DB_VERB_REP_ELECT, "DB_VERB_REP_ELECT" },
+ { DB_VERB_REP_LEASE, "DB_VERB_REP_LEASE" },
+ { DB_VERB_REP_MISC, "DB_VERB_REP_MISC" },
+ { DB_VERB_REP_MSGS, "DB_VERB_REP_MSGS" },
+ { DB_VERB_REP_SYNC, "DB_VERB_REP_SYNC" },
+ { DB_VERB_REP_SYSTEM, "DB_VERB_REP_SYSTEM" },
+ { DB_VERB_REP_TEST, "DB_VERB_REP_TEST" },
+ { DB_VERB_REPMGR_CONNFAIL, "DB_VERB_REPMGR_CONNFAIL" },
+ { DB_VERB_REPMGR_MISC, "DB_VERB_REPMGR_MISC" },
+ { DB_VERB_WAITSFOR, "DB_VERB_WAITSFOR" },
+ { 0, NULL }
+ };
+ DB_ENV *dbenv;
+ DB_MSGBUF mb;
+ char **p;
+
+ dbenv = env->dbenv;
+ DB_MSGBUF_INIT(&mb);
+
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ STAT_POINTER("ENV", dbenv->env);
+ __mutex_print_debug_single(
+ env, "DB_ENV handle mutex", dbenv->mtx_db_env, flags);
+ STAT_ISSET("Errcall", dbenv->db_errcall);
+ STAT_ISSET("Errfile", dbenv->db_errfile);
+ STAT_STRING("Errpfx", dbenv->db_errpfx);
+ STAT_ISSET("Msgfile", dbenv->db_msgfile);
+ STAT_ISSET("Msgcall", dbenv->db_msgcall);
+
+ STAT_ISSET("AppDispatch", dbenv->app_dispatch);
+ STAT_ISSET("Event", dbenv->db_event_func);
+ STAT_ISSET("Feedback", dbenv->db_feedback);
+ STAT_ISSET("Free", dbenv->db_free);
+ STAT_ISSET("Panic", dbenv->db_paniccall);
+ STAT_ISSET("Malloc", dbenv->db_malloc);
+ STAT_ISSET("Realloc", dbenv->db_realloc);
+ STAT_ISSET("IsAlive", dbenv->is_alive);
+ STAT_ISSET("ThreadId", dbenv->thread_id);
+ STAT_ISSET("ThreadIdString", dbenv->thread_id_string);
+
+ STAT_STRING("Log dir", dbenv->db_log_dir);
+ STAT_STRING("Metadata dir", dbenv->db_md_dir);
+ STAT_STRING("Tmp dir", dbenv->db_tmp_dir);
+ if (dbenv->db_data_dir == NULL)
+ STAT_ISSET("Data dir", dbenv->db_data_dir);
+ else {
+ for (p = dbenv->db_data_dir; *p != NULL; ++p)
+ __db_msgadd(env, &mb, "%s\tData dir", *p);
+ DB_MSGBUF_FLUSH(env, &mb);
+ }
+
+ STAT_STRING(
+ "Intermediate directory mode", dbenv->intermediate_dir_mode);
+
+ STAT_LONG("Shared memory key", dbenv->shm_key);
+
+ STAT_ISSET("Password", dbenv->passwd);
+
+ STAT_ISSET("App private", dbenv->app_private);
+ STAT_ISSET("Api1 internal", dbenv->api1_internal);
+ STAT_ISSET("Api2 internal", dbenv->api2_internal);
+
+ __db_prflags(env, NULL, dbenv->verbose, vfn, NULL, "\tVerbose flags");
+
+ STAT_ULONG("Mutex align", dbenv->mutex_align);
+ STAT_ULONG("Mutex cnt", dbenv->mutex_cnt);
+ STAT_ULONG("Mutex inc", dbenv->mutex_inc);
+ STAT_ULONG("Mutex tas spins", dbenv->mutex_tas_spins);
+
+ STAT_ISSET("Lock conflicts", dbenv->lk_conflicts);
+ STAT_LONG("Lock modes", dbenv->lk_modes);
+ STAT_ULONG("Lock detect", dbenv->lk_detect);
+ STAT_ULONG("Lock init", dbenv->lk_init);
+ STAT_ULONG("Lock init lockers", dbenv->lk_init_lockers);
+ STAT_ULONG("Lock init objects", dbenv->lk_init_objects);
+ STAT_ULONG("Lock max", dbenv->lk_max);
+ STAT_ULONG("Lock max lockers", dbenv->lk_max_lockers);
+ STAT_ULONG("Lock max objects", dbenv->lk_max_objects);
+ STAT_ULONG("Lock partitions", dbenv->lk_partitions);
+ STAT_ULONG("Lock object hash table size", dbenv->object_t_size);
+ STAT_ULONG("Lock timeout", dbenv->lk_timeout);
+
+ STAT_ULONG("Log bsize", dbenv->lg_bsize);
+ STAT_FMT("Log file mode", "%#o", int, dbenv->lg_filemode);
+ STAT_ULONG("Log region max", dbenv->lg_regionmax);
+ STAT_ULONG("Log size", dbenv->lg_size);
+
+ STAT_ULONG("Cache GB", dbenv->mp_gbytes);
+ STAT_ULONG("Cache B", dbenv->mp_bytes);
+ STAT_ULONG("Cache max GB", dbenv->mp_max_gbytes);
+ STAT_ULONG("Cache max B", dbenv->mp_max_bytes);
+ STAT_ULONG("Cache mmap size", dbenv->mp_mmapsize);
+ STAT_ULONG("Cache max open fd", dbenv->mp_maxopenfd);
+ STAT_ULONG("Cache max write", dbenv->mp_maxwrite);
+ STAT_ULONG("Cache number", dbenv->mp_ncache);
+ STAT_ULONG("Cache max write sleep", dbenv->mp_maxwrite_sleep);
+
+ STAT_ULONG("Txn init", dbenv->tx_init);
+ STAT_ULONG("Txn max", dbenv->tx_max);
+ STAT_ULONG("Txn timestamp", dbenv->tx_timestamp);
+ STAT_ULONG("Txn timeout", dbenv->tx_timeout);
+
+ STAT_ULONG("Thread count", dbenv->thr_max);
+
+ STAT_ISSET("Registry", dbenv->registry);
+ STAT_ULONG("Registry offset", dbenv->registry_off);
+ STAT_ULONG("Registry timeout", dbenv->envreg_timeout);
+
+ __db_prflags(env,
+ NULL, dbenv->flags, db_env_fn, NULL, "\tPublic environment flags");
+
+ return (0);
+}
+
+/*
+ * __env_print_env_all --
+ * Display the debugging environment statistics.
+ */
+static int
+__env_print_env_all(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ static const FN env_fn[] = {
+ { ENV_CDB, "ENV_CDB" },
+ { ENV_DBLOCAL, "ENV_DBLOCAL" },
+ { ENV_LOCKDOWN, "ENV_LOCKDOWN" },
+ { ENV_NO_OUTPUT_SET, "ENV_NO_OUTPUT_SET" },
+ { ENV_OPEN_CALLED, "ENV_OPEN_CALLED" },
+ { ENV_PRIVATE, "ENV_PRIVATE" },
+ { ENV_RECOVER_FATAL, "ENV_RECOVER_FATAL" },
+ { ENV_REF_COUNTED, "ENV_REF_COUNTED" },
+ { ENV_SYSTEM_MEM, "ENV_SYSTEM_MEM" },
+ { ENV_THREAD, "ENV_THREAD" },
+ { 0, NULL }
+ };
+ static const FN ofn[] = {
+ { DB_CREATE, "DB_CREATE" },
+ { DB_FORCE, "DB_FORCE" },
+ { DB_INIT_CDB, "DB_INIT_CDB" },
+ { DB_INIT_LOCK, "DB_INIT_LOCK" },
+ { DB_INIT_LOG, "DB_INIT_LOG" },
+ { DB_INIT_MPOOL, "DB_INIT_MPOOL" },
+ { DB_INIT_REP, "DB_INIT_REP" },
+ { DB_INIT_TXN, "DB_INIT_TXN" },
+ { DB_LOCKDOWN, "DB_LOCKDOWN" },
+ { DB_NOMMAP, "DB_NOMMAP" },
+ { DB_PRIVATE, "DB_PRIVATE" },
+ { DB_RDONLY, "DB_RDONLY" },
+ { DB_RECOVER, "DB_RECOVER" },
+ { DB_RECOVER_FATAL, "DB_RECOVER_FATAL" },
+ { DB_SYSTEM_MEM, "DB_SYSTEM_MEM" },
+ { DB_THREAD, "DB_THREAD" },
+ { DB_TRUNCATE, "DB_TRUNCATE" },
+ { DB_TXN_NOSYNC, "DB_TXN_NOSYNC" },
+ { DB_USE_ENVIRON, "DB_USE_ENVIRON" },
+ { DB_USE_ENVIRON_ROOT, "DB_USE_ENVIRON_ROOT" },
+ { 0, NULL }
+ };
+ static const FN regenvfn[] = {
+ { DB_REGENV_REPLOCKED, "DB_REGENV_REPLOCKED" },
+ { 0, NULL }
+ };
+ REGENV *renv;
+ REGINFO *infop;
+ REGION *rp;
+ u_int32_t i;
+ char time_buf[CTIME_BUFLEN];
+
+ infop = env->reginfo;
+ renv = infop->primary;
+
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ STAT_POINTER("DB_ENV", env->dbenv);
+ __mutex_print_debug_single(
+ env, "ENV handle mutex", env->mtx_env, flags);
+
+ STAT_STRING("Home", env->db_home);
+ __db_prflags(env, NULL, env->open_flags, ofn, NULL, "\tOpen flags");
+ STAT_FMT("Mode", "%#o", int, env->db_mode);
+
+ STAT_ULONG("Pid cache", env->pid_cache);
+
+ STAT_ISSET("Lockfhp", env->lockfhp);
+
+ STAT_ISSET("Locker", env->env_lref);
+
+ STAT_ISSET("Internal recovery table", env->recover_dtab.int_dispatch);
+ STAT_ULONG("Number of recovery table slots",
+ env->recover_dtab.int_size);
+ STAT_ISSET("External recovery table", env->recover_dtab.ext_dispatch);
+ STAT_ULONG("Number of recovery table slots",
+ env->recover_dtab.ext_size);
+
+ STAT_ULONG("Thread hash buckets", env->thr_nbucket);
+ STAT_ISSET("Thread hash table", env->thr_hashtab);
+
+ __mutex_print_debug_single(
+ env, "ENV list of DB handles mutex", env->mtx_dblist, flags);
+ STAT_LONG("DB reference count", env->db_ref);
+
+ __mutex_print_debug_single(env, "MT mutex", env->mtx_mt, flags);
+
+ STAT_ISSET("Crypto handle", env->crypto_handle);
+ STAT_ISSET("Lock handle", env->lk_handle);
+ STAT_ISSET("Log handle", env->lg_handle);
+ STAT_ISSET("Cache handle", env->mp_handle);
+ STAT_ISSET("Mutex handle", env->mutex_handle);
+ STAT_ISSET("Replication handle", env->rep_handle);
+ STAT_ISSET("Txn handle", env->tx_handle);
+
+ STAT_ISSET("User copy", env->dbt_usercopy);
+
+ STAT_LONG("Test abort", env->test_abort);
+ STAT_LONG("Test check", env->test_check);
+ STAT_LONG("Test copy", env->test_copy);
+
+ __db_prflags(env,
+ NULL, env->flags, env_fn, NULL, "\tPrivate environment flags");
+
+ __db_print_reginfo(env, infop, "Primary", flags);
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "Per region database environment information:");
+ for (rp = R_ADDR(infop, renv->region_off),
+ i = 0; i < renv->region_cnt; ++i, ++rp) {
+ if (rp->id == INVALID_REGION_ID)
+ continue;
+ __db_msg(env, "%s Region:", __reg_type(rp->type));
+ STAT_LONG("Region ID", rp->id);
+ STAT_LONG("Segment ID", rp->segid);
+ __db_dlbytes(env,
+ "Size", (u_long)0, (u_long)0, (u_long)rp->size);
+ }
+ __db_prflags(env,
+ NULL, renv->init_flags, ofn, NULL, "\tInitialization flags");
+ STAT_ULONG("Region slots", renv->region_cnt);
+ __db_prflags(env,
+ NULL, renv->flags, regenvfn, NULL, "\tReplication flags");
+ __db_msg(env, "%.24s\tOperation timestamp",
+ renv->op_timestamp == 0 ?
+ "!Set" : __os_ctime(&renv->op_timestamp, time_buf));
+ __db_msg(env, "%.24s\tReplication timestamp",
+ renv->rep_timestamp == 0 ?
+ "!Set" : __os_ctime(&renv->rep_timestamp, time_buf));
+
+ return (0);
+}
+
+static char *
+__env_thread_state_print(state)
+ DB_THREAD_STATE state;
+{
+ switch (state) {
+ case THREAD_ACTIVE:
+ return ("active");
+ case THREAD_BLOCKED:
+ return ("blocked");
+ case THREAD_BLOCKED_DEAD:
+ return ("blocked and dead");
+ case THREAD_OUT:
+ return ("out");
+ default:
+ return ("unknown");
+ }
+ /* NOTREACHED */
+}
+
+/*
+ * __env_print_thread --
+ * Display the thread block state.
+ */
+static int
+__env_print_thread(env)
+ ENV *env;
+{
+ BH *bhp;
+ DB_ENV *dbenv;
+ DB_HASHTAB *htab;
+ DB_MPOOL *dbmp;
+ DB_THREAD_INFO *ip;
+ PIN_LIST *list, *lp;
+ REGENV *renv;
+ REGINFO *infop;
+ THREAD_INFO *thread;
+ u_int32_t i;
+ char buf[DB_THREADID_STRLEN];
+
+ dbenv = env->dbenv;
+
+ /* The thread table may not be configured. */
+ if ((htab = env->thr_hashtab) == NULL)
+ return (0);
+
+ dbmp = env->mp_handle;
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "Thread tracking information");
+
+ /* Dump out the info we have on thread tracking. */
+ infop = env->reginfo;
+ renv = infop->primary;
+ thread = R_ADDR(infop, renv->thread_off);
+ STAT_ULONG("Thread blocks allocated", thread->thr_count);
+ STAT_ULONG("Thread allocation threshold", thread->thr_max);
+ STAT_ULONG("Thread hash buckets", thread->thr_nbucket);
+
+ /* Dump out the info we have on active threads. */
+ __db_msg(env, "Thread status blocks:");
+ for (i = 0; i < env->thr_nbucket; i++)
+ SH_TAILQ_FOREACH(ip, &htab[i], dbth_links, __db_thread_info) {
+ if (ip->dbth_state == THREAD_SLOT_NOT_IN_USE)
+ continue;
+ __db_msg(env, "\tprocess/thread %s: %s",
+ dbenv->thread_id_string(
+ dbenv, ip->dbth_pid, ip->dbth_tid, buf),
+ __env_thread_state_print(ip->dbth_state));
+ list = R_ADDR(env->reginfo, ip->dbth_pinlist);
+ for (lp = list; lp < &list[ip->dbth_pinmax]; lp++) {
+ if (lp->b_ref == INVALID_ROFF)
+ continue;
+ bhp = R_ADDR(
+ &dbmp->reginfo[lp->region], lp->b_ref);
+ __db_msg(env,
+ "\t\tpins: %lu", (u_long)bhp->pgno);
+ }
+ }
+ return (0);
+}
+
+/*
+ * __env_print_fh --
+ * Display statistics for all handles open in this environment.
+ */
+static int
+__env_print_fh(env)
+ ENV *env;
+{
+ DB_FH *fhp;
+
+ if (TAILQ_FIRST(&env->fdlist) == NULL)
+ return (0);
+
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "Environment file handle information");
+
+ MUTEX_LOCK(env, env->mtx_env);
+
+ TAILQ_FOREACH(fhp, &env->fdlist, q)
+ __db_print_fh(env, NULL, fhp, 0);
+
+ MUTEX_UNLOCK(env, env->mtx_env);
+
+ return (0);
+}
+
+/*
+ * __db_print_fh --
+ * Print out a file handle.
+ *
+ * PUBLIC: void __db_print_fh __P((ENV *, const char *, DB_FH *, u_int32_t));
+ */
+void
+__db_print_fh(env, tag, fh, flags)
+ ENV *env;
+ const char *tag;
+ DB_FH *fh;
+ u_int32_t flags;
+{
+ static const FN fn[] = {
+ { DB_FH_NOSYNC, "DB_FH_NOSYNC" },
+ { DB_FH_OPENED, "DB_FH_OPENED" },
+ { DB_FH_UNLINK, "DB_FH_UNLINK" },
+ { 0, NULL }
+ };
+
+ if (fh == NULL) {
+ STAT_ISSET(tag, fh);
+ return;
+ }
+
+ STAT_STRING("file-handle.file name", fh->name);
+
+ __mutex_print_debug_single(
+ env, "file-handle.mutex", fh->mtx_fh, flags);
+
+ STAT_LONG("file-handle.reference count", fh->ref);
+ STAT_LONG("file-handle.file descriptor", fh->fd);
+
+ STAT_ULONG("file-handle.page number", fh->pgno);
+ STAT_ULONG("file-handle.page size", fh->pgsize);
+ STAT_ULONG("file-handle.page offset", fh->offset);
+
+ STAT_ULONG("file-handle.seek count", fh->seek_count);
+ STAT_ULONG("file-handle.read count", fh->read_count);
+ STAT_ULONG("file-handle.write count", fh->write_count);
+
+ __db_prflags(env, NULL, fh->flags, fn, NULL, "\tfile-handle.flags");
+}
+
+/*
+ * __db_print_fileid --
+ * Print out a file ID.
+ *
+ * PUBLIC: void __db_print_fileid __P((ENV *, u_int8_t *, const char *));
+ */
+void
+__db_print_fileid(env, id, suffix)
+ ENV *env;
+ u_int8_t *id;
+ const char *suffix;
+{
+ DB_MSGBUF mb;
+ int i;
+
+ if (id == NULL) {
+ STAT_ISSET("ID", id);
+ return;
+ }
+
+ DB_MSGBUF_INIT(&mb);
+ for (i = 0; i < DB_FILE_ID_LEN; ++i, ++id) {
+ __db_msgadd(env, &mb, "%x", (u_int)*id);
+ if (i < DB_FILE_ID_LEN - 1)
+ __db_msgadd(env, &mb, " ");
+ }
+ if (suffix != NULL)
+ __db_msgadd(env, &mb, "%s", suffix);
+ DB_MSGBUF_FLUSH(env, &mb);
+}
+
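+/*
+ * For example (illustrative values): an ID whose first bytes are
+ * 0x01 0x23 0xab is emitted as "1 23 ab ..." -- each byte in unpadded
+ * hex, space-separated, with any suffix appended to the same line.
+ */
+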
+/*
+ * __db_dl --
+ * Display a big value.
+ *
+ * PUBLIC: void __db_dl __P((ENV *, const char *, u_long));
+ */
+void
+__db_dl(env, msg, value)
+ ENV *env;
+ const char *msg;
+ u_long value;
+{
+ /*
+ * Two formats: if less than 10 million, display as the number, if
+ * greater than 10 million display as ###M.
+ */
+ if (value < 10000000)
+ __db_msg(env, "%lu\t%s", value, msg);
+ else
+ __db_msg(env, "%luM\t%s (%lu)", value / 1000000, msg, value);
+}
+
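+/*
+ * Worked example (assumed values): __db_dl(env, "Pages requested",
+ * 12345678UL) prints "12M\tPages requested (12345678)", while a value
+ * of 9999999 or less prints the exact number.
+ */
+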
+/*
+ * __db_dl_pct --
+ * Display a big value, and related percentage.
+ *
+ * PUBLIC: void __db_dl_pct
+ * PUBLIC: __P((ENV *, const char *, u_long, int, const char *));
+ */
+void
+__db_dl_pct(env, msg, value, pct, tag)
+ ENV *env;
+ const char *msg, *tag;
+ u_long value;
+ int pct;
+{
+ DB_MSGBUF mb;
+
+ DB_MSGBUF_INIT(&mb);
+
+ /*
+ * Two formats: if less than 10 million, display as the number, if
+ * greater than 10 million, round it off and display as ###M.
+ */
+ if (value < 10000000)
+ __db_msgadd(env, &mb, "%lu\t%s", value, msg);
+ else
+ __db_msgadd(env,
+ &mb, "%luM\t%s", (value + 500000) / 1000000, msg);
+ if (tag == NULL)
+ __db_msgadd(env, &mb, " (%d%%)", pct);
+ else
+ __db_msgadd(env, &mb, " (%d%% %s)", pct, tag);
+
+ DB_MSGBUF_FLUSH(env, &mb);
+}
+
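+/*
+ * Worked example (assumed values): __db_dl_pct(env, "Pages found in
+ * cache", 12600000UL, 87, NULL) rounds to (12600000 + 500000) / 1000000
+ * = 13 and prints "13M\tPages found in cache (87%)"; a non-NULL tag is
+ * appended inside the parentheses.
+ */
+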
+/*
+ * __db_dlbytes --
+ * Display a big number of bytes.
+ *
+ * PUBLIC: void __db_dlbytes
+ * PUBLIC: __P((ENV *, const char *, u_long, u_long, u_long));
+ */
+void
+__db_dlbytes(env, msg, gbytes, mbytes, bytes)
+ ENV *env;
+ const char *msg;
+ u_long gbytes, mbytes, bytes;
+{
+ DB_MSGBUF mb;
+ const char *sep;
+
+ DB_MSGBUF_INIT(&mb);
+
+ /* Normalize the values. */
+ while (bytes >= MEGABYTE) {
+ ++mbytes;
+ bytes -= MEGABYTE;
+ }
+ while (mbytes >= GIGABYTE / MEGABYTE) {
+ ++gbytes;
+ mbytes -= GIGABYTE / MEGABYTE;
+ }
+
+ if (gbytes == 0 && mbytes == 0 && bytes == 0)
+ __db_msgadd(env, &mb, "0");
+ else {
+ sep = "";
+ if (gbytes > 0) {
+ __db_msgadd(env, &mb, "%luGB", gbytes);
+ sep = " ";
+ }
+ if (mbytes > 0) {
+ __db_msgadd(env, &mb, "%s%luMB", sep, mbytes);
+ sep = " ";
+ }
+ if (bytes >= 1024) {
+ __db_msgadd(env, &mb, "%s%luKB", sep, bytes / 1024);
+ bytes %= 1024;
+ sep = " ";
+ }
+ if (bytes > 0)
+ __db_msgadd(env, &mb, "%s%luB", sep, bytes);
+ }
+
+ __db_msgadd(env, &mb, "\t%s", msg);
+
+ DB_MSGBUF_FLUSH(env, &mb);
+}
+
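+/*
+ * Worked example (assumed values): a call with gbytes = 0, mbytes =
+ * 1536 and bytes = 0 normalizes to 1GB 512MB (GIGABYTE / MEGABYTE is
+ * 1024) and prints "1GB 512MB\t<msg>"; all-zero input prints "0".
+ */
+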
+/*
+ * __db_print_reginfo --
+ * Print out underlying shared region information.
+ *
+ * PUBLIC: void __db_print_reginfo
+ * PUBLIC: __P((ENV *, REGINFO *, const char *, u_int32_t));
+ */
+void
+__db_print_reginfo(env, infop, s, flags)
+ ENV *env;
+ REGINFO *infop;
+ const char *s;
+ u_int32_t flags;
+{
+ static const FN fn[] = {
+ { REGION_CREATE, "REGION_CREATE" },
+ { REGION_CREATE_OK, "REGION_CREATE_OK" },
+ { REGION_JOIN_OK, "REGION_JOIN_OK" },
+ { REGION_SHARED, "REGION_SHARED" },
+ { 0, NULL }
+ };
+
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "%s REGINFO information:", s);
+ STAT_STRING("Region type", __reg_type(infop->type));
+ STAT_ULONG("Region ID", infop->id);
+ STAT_STRING("Region name", infop->name);
+ STAT_POINTER("Region address", infop->addr);
+ STAT_POINTER("Region allocation head", infop->head);
+ STAT_POINTER("Region primary address", infop->primary);
+ STAT_ULONG("Region maximum allocation", infop->max_alloc);
+ STAT_ULONG("Region allocated", infop->allocated);
+ __env_alloc_print(infop, flags);
+
+ __db_prflags(env, NULL, infop->flags, fn, NULL, "\tRegion flags");
+}
+
+/*
+ * __reg_type --
+ * Return the region type string.
+ */
+static const char *
+__reg_type(t)
+ reg_type_t t;
+{
+ switch (t) {
+ case REGION_TYPE_ENV:
+ return ("Environment");
+ case REGION_TYPE_LOCK:
+ return ("Lock");
+ case REGION_TYPE_LOG:
+ return ("Log");
+ case REGION_TYPE_MPOOL:
+ return ("Mpool");
+ case REGION_TYPE_MUTEX:
+ return ("Mutex");
+ case REGION_TYPE_TXN:
+ return ("Transaction");
+ case INVALID_REGION_TYPE:
+ return ("Invalid");
+ }
+ return ("Unknown");
+}
+
+#else /* !HAVE_STATISTICS */
+
+/*
+ * __db_stat_not_built --
+ * Common error routine when library not built with statistics.
+ *
+ * PUBLIC: int __db_stat_not_built __P((ENV *));
+ */
+int
+__db_stat_not_built(env)
+ ENV *env;
+{
+ __db_errx(env, DB_STR("1554",
+ "Library build did not include statistics support"));
+ return (DB_OPNOTSUP);
+}
+
+int
+__env_stat_print_pp(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(dbenv->env));
+}
+#endif
diff --git a/src/fileops/fileops.src b/src/fileops/fileops.src
new file mode 100644
index 00000000..cdb6af27
--- /dev/null
+++ b/src/fileops/fileops.src
@@ -0,0 +1,137 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+DBPRIVATE
+PREFIX __fop
+
+INCLUDE #include "db_int.h"
+INCLUDE #include "dbinc/crypto.h"
+INCLUDE #include "dbinc/db_page.h"
+INCLUDE #include "dbinc/db_am.h"
+INCLUDE #include "dbinc/txn.h"
+INCLUDE #include "dbinc/fop.h"
+INCLUDE
+
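+/*
+ * A note on the record description format (inferred from the generated
+ * fileops_auto.c/fileops_autop.c): BEGIN <name> <version> <record-id>
+ * opens a record type, where the version is the release that introduced
+ * the layout (42 = 4.2, 48 = 4.8) and the id becomes DB___fop_<name>.
+ * DBT fields hold variable-length byte strings; ARG fields hold scalar
+ * values whose trailing token (lu, o, lx) is the printf conversion used
+ * by the generated print routine. BEGIN_COMPAT describes an older
+ * layout kept readable for recovering logs written by earlier releases,
+ * and DUPLICATE emits a second record type sharing the same structure.
+ */
+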
+/*
+ * create -- create a file system object.
+ *
+ * name: name in the file system
+ * appname: indicates if the name needs to go through __db_appname
+ * mode: file system mode
+ */
+BEGIN_COMPAT create 42 143
+DBT name DBT s
+ARG appname u_int32_t lu
+ARG mode u_int32_t o
+END
+
+BEGIN create 48 143
+DBT name DBT s
+DBT dirname DBT s
+ARG appname u_int32_t lu
+ARG mode u_int32_t o
+END
+
+/*
+ * remove -- remove a file system object.
+ *
+ * name: name in the file system
+ * appname: indicates if the name needs to go through __db_appname
+ */
+BEGIN remove 42 144
+DBT name DBT s
+DBT fid DBT s
+ARG appname u_int32_t lu
+END
+
+/*
+ * write: log the writing of data into an object.
+ *
+ * name: file containing the page.
+ * appname: indicates if the name needs to go through __db_appname
+ * pgsize: page size.
+ * pageno: page number in the file.
+ * offset: offset on the page.
+ * page: the actual meta-data page.
+ * flag: non-0 indicates that this is a tempfile, so we needn't undo
+ * these modifications (we'll toss the file).
+ */
+BEGIN_COMPAT write 42 145
+DBT name DBT s
+ARG appname u_int32_t lu
+ARG pgsize u_int32_t lu
+ARG pageno db_pgno_t lu
+ARG offset u_int32_t lu
+DBT page DBT s
+ARG flag u_int32_t lu
+END
+
+BEGIN write 48 145
+DBT name DBT s
+DBT dirname DBT s
+ARG appname u_int32_t lu
+ARG pgsize u_int32_t lu
+ARG pageno db_pgno_t lu
+ARG offset u_int32_t lu
+DBT page DBT s
+ARG flag u_int32_t lu
+END
+
+/*
+ * rename: move a file from one name to another.
+ * The appname value indicates if this is a path name that should be used
+ * directly (i.e., no interpretation) or if it is a pathname that should
+ * be interpreted via calls to __db_appname. The fileid is the 20-byte
+ * DB fileid of the file being renamed. We need to check it on recovery
+ * so that we don't inadvertently overwrite good files.
+ *
+ * There are two variants of this log record: one that must be both done
+ * and undone and one that is not undone (used for renaming tmp files, see
+ * SR #15119)
+ *
+ * These two record types use the same structure, read, and print functions,
+ * but have different recovery functions.
+ */
+BEGIN_COMPAT rename 42 146
+DUPLICATE rename_noundo 46 150
+DBT oldname DBT s
+DBT newname DBT s
+DBT fileid DBT s
+ARG appname u_int32_t lu
+END
+
+BEGIN rename 48 146
+DUPLICATE rename_noundo 46 150
+DBT oldname DBT s
+DBT newname DBT s
+DBT dirname DBT s
+DBT fileid DBT s
+ARG appname u_int32_t lu
+END
+
+/*
+ * File removal record. This is a DB-level log record that indicates
+ * we've just completed some form of file removal. The purpose of this
+ * log record is to logically identify the particular instance of the
+ * named file so that during recovery, in deciding if we should roll-forward
+ * a remove or a rename, we can make sure that we don't roll one forward and
+ * delete or overwrite the wrong file.
+ * real_fid: The 20-byte unique file identifier of the original file being
+ * removed.
+ * tmp_fid: The unique fid of the tmp file that is removed.
+ * name: The pre-__db_appname name of the file.
+ * child: The transaction that removed or renamed the file.
+ */
+BEGIN file_remove 42 141
+DBT real_fid DBT s
+DBT tmp_fid DBT s
+DBT name DBT s
+ARG appname u_int32_t lu
+ARG child u_int32_t lx
+END
diff --git a/src/fileops/fileops_auto.c b/src/fileops/fileops_auto.c
new file mode 100644
index 00000000..0db619a5
--- /dev/null
+++ b/src/fileops/fileops_auto.c
@@ -0,0 +1,118 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/txn.h"
+#include "dbinc/fop.h"
+
+DB_LOG_RECSPEC __fop_create_42_desc[] = {
+ {LOGREC_DBT, SSZ(__fop_create_42_args, name), "name", ""},
+ {LOGREC_ARG, SSZ(__fop_create_42_args, appname), "appname", "%lu"},
+ {LOGREC_ARG, SSZ(__fop_create_42_args, mode), "mode", "%o"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __fop_create_desc[] = {
+ {LOGREC_DBT, SSZ(__fop_create_args, name), "name", ""},
+ {LOGREC_DBT, SSZ(__fop_create_args, dirname), "dirname", ""},
+ {LOGREC_ARG, SSZ(__fop_create_args, appname), "appname", "%lu"},
+ {LOGREC_ARG, SSZ(__fop_create_args, mode), "mode", "%o"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __fop_remove_desc[] = {
+ {LOGREC_DBT, SSZ(__fop_remove_args, name), "name", ""},
+ {LOGREC_DBT, SSZ(__fop_remove_args, fid), "fid", ""},
+ {LOGREC_ARG, SSZ(__fop_remove_args, appname), "appname", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __fop_write_42_desc[] = {
+ {LOGREC_DBT, SSZ(__fop_write_42_args, name), "name", ""},
+ {LOGREC_ARG, SSZ(__fop_write_42_args, appname), "appname", "%lu"},
+ {LOGREC_ARG, SSZ(__fop_write_42_args, pgsize), "pgsize", "%lu"},
+ {LOGREC_ARG, SSZ(__fop_write_42_args, pageno), "pageno", "%lu"},
+ {LOGREC_ARG, SSZ(__fop_write_42_args, offset), "offset", "%lu"},
+ {LOGREC_DBT, SSZ(__fop_write_42_args, page), "page", ""},
+ {LOGREC_ARG, SSZ(__fop_write_42_args, flag), "flag", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __fop_write_desc[] = {
+ {LOGREC_DBT, SSZ(__fop_write_args, name), "name", ""},
+ {LOGREC_DBT, SSZ(__fop_write_args, dirname), "dirname", ""},
+ {LOGREC_ARG, SSZ(__fop_write_args, appname), "appname", "%lu"},
+ {LOGREC_ARG, SSZ(__fop_write_args, pgsize), "pgsize", "%lu"},
+ {LOGREC_ARG, SSZ(__fop_write_args, pageno), "pageno", "%lu"},
+ {LOGREC_ARG, SSZ(__fop_write_args, offset), "offset", "%lu"},
+ {LOGREC_DBT, SSZ(__fop_write_args, page), "page", ""},
+ {LOGREC_ARG, SSZ(__fop_write_args, flag), "flag", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __fop_rename_42_desc[] = {
+ {LOGREC_DBT, SSZ(__fop_rename_42_args, oldname), "oldname", ""},
+ {LOGREC_DBT, SSZ(__fop_rename_42_args, newname), "newname", ""},
+ {LOGREC_DBT, SSZ(__fop_rename_42_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__fop_rename_42_args, appname), "appname", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __fop_rename_noundo_46_desc[] = {
+ {LOGREC_DBT, SSZ(__fop_rename_42_args, oldname), "oldname", ""},
+ {LOGREC_DBT, SSZ(__fop_rename_42_args, newname), "newname", ""},
+ {LOGREC_DBT, SSZ(__fop_rename_42_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__fop_rename_42_args, appname), "appname", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __fop_rename_desc[] = {
+ {LOGREC_DBT, SSZ(__fop_rename_args, oldname), "oldname", ""},
+ {LOGREC_DBT, SSZ(__fop_rename_args, newname), "newname", ""},
+ {LOGREC_DBT, SSZ(__fop_rename_args, dirname), "dirname", ""},
+ {LOGREC_DBT, SSZ(__fop_rename_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__fop_rename_args, appname), "appname", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __fop_rename_noundo_desc[] = {
+ {LOGREC_DBT, SSZ(__fop_rename_args, oldname), "oldname", ""},
+ {LOGREC_DBT, SSZ(__fop_rename_args, newname), "newname", ""},
+ {LOGREC_DBT, SSZ(__fop_rename_args, dirname), "dirname", ""},
+ {LOGREC_DBT, SSZ(__fop_rename_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__fop_rename_args, appname), "appname", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __fop_file_remove_desc[] = {
+ {LOGREC_DBT, SSZ(__fop_file_remove_args, real_fid), "real_fid", ""},
+ {LOGREC_DBT, SSZ(__fop_file_remove_args, tmp_fid), "tmp_fid", ""},
+ {LOGREC_DBT, SSZ(__fop_file_remove_args, name), "name", ""},
+ {LOGREC_ARG, SSZ(__fop_file_remove_args, appname), "appname", "%lu"},
+ {LOGREC_ARG, SSZ(__fop_file_remove_args, child), "child", "%lx"},
+ {LOGREC_Done, 0, "", ""}
+};
+/*
+ * PUBLIC: int __fop_init_recover __P((ENV *, DB_DISTAB *));
+ */
+int
+__fop_init_recover(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __fop_create_recover, DB___fop_create)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __fop_remove_recover, DB___fop_remove)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __fop_write_recover, DB___fop_write)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __fop_rename_recover, DB___fop_rename)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __fop_rename_noundo_recover, DB___fop_rename_noundo)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __fop_file_remove_recover, DB___fop_file_remove)) != 0)
+ return (ret);
+ return (0);
+}
diff --git a/src/fileops/fileops_autop.c b/src/fileops/fileops_autop.c
new file mode 100644
index 00000000..6e271a17
--- /dev/null
+++ b/src/fileops/fileops_autop.c
@@ -0,0 +1,177 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/txn.h"
+#include "dbinc/fop.h"
+
+/*
+ * PUBLIC: int __fop_create_42_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__fop_create_42_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__fop_create_42", __fop_create_42_desc, info));
+}
+
+/*
+ * PUBLIC: int __fop_create_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__fop_create_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__fop_create", __fop_create_desc, info));
+}
+
+/*
+ * PUBLIC: int __fop_remove_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__fop_remove_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__fop_remove", __fop_remove_desc, info));
+}
+
+/*
+ * PUBLIC: int __fop_write_42_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__fop_write_42_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__fop_write_42", __fop_write_42_desc, info));
+}
+
+/*
+ * PUBLIC: int __fop_write_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__fop_write_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__fop_write", __fop_write_desc, info));
+}
+
+/*
+ * PUBLIC: int __fop_rename_42_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__fop_rename_42_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__fop_rename_42", __fop_rename_42_desc, info));
+}
+
+/*
+ * PUBLIC: int __fop_rename_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__fop_rename_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__fop_rename", __fop_rename_desc, info));
+}
+
+/*
+ * PUBLIC: int __fop_file_remove_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__fop_file_remove_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__fop_file_remove", __fop_file_remove_desc, info));
+}
+
+/*
+ * PUBLIC: int __fop_init_print __P((ENV *, DB_DISTAB *));
+ */
+int
+__fop_init_print(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __fop_create_print, DB___fop_create)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __fop_remove_print, DB___fop_remove)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __fop_write_print, DB___fop_write)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __fop_rename_print, DB___fop_rename)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __fop_rename_print, DB___fop_rename_noundo)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __fop_file_remove_print, DB___fop_file_remove)) != 0)
+ return (ret);
+ return (0);
+}
diff --git a/src/fileops/fop_basic.c b/src/fileops/fop_basic.c
new file mode 100644
index 00000000..d6c707f2
--- /dev/null
+++ b/src/fileops/fop_basic.c
@@ -0,0 +1,318 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/fop.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+#include "dbinc/db_am.h"
+
+/*
+ * The transactional guarantees Berkeley DB provides for file
+ * system level operations (database physical file create, delete,
+ * rename) are based on our understanding of current file system
+ * semantics; a system that does not provide these semantics and
+ * guarantees could be in danger.
+ *
+ * First, as in standard database changes, fsync and fdatasync must
+ * work: when applied to the log file, the records written into the
+ * log must be transferred to stable storage.
+ *
+ * Second, it must not be possible for the log file to be removed
+ * without previous file system level operations being flushed to
+ * stable storage. Berkeley DB applications write log records
+ * describing file system operations into the log, then perform the
+ * file system operation, then commit the enclosing transaction
+ * (which flushes the log file to stable storage). Subsequently,
+ * a database environment checkpoint may make it possible for the
+ * application to remove the log file containing the record of the
+ * file system operation. DB's transactional guarantees for file
+ * system operations require the log file removal not succeed until
+ * all previous filesystem operations have been flushed to stable
+ * storage. In other words, the flush of the log file, or the
+ * removal of the log file, must block until all previous
+ * filesystem operations have been flushed to stable storage. This
+ * semantic is not, as far as we know, required by any existing
+ * standards document, but we have never seen a filesystem where
+ * it does not apply.
+ */
+
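+/*
+ * In sketch form, the routines below all follow the same ordering
+ * (simplified; see each function for the real sequence):
+ *
+ *	1. __fop_xxx_log(env, txn, ...)   write, and often flush, the record
+ *	2. __os_open/__os_write/rename    perform the file system operation
+ *	3. transaction commit             flush the log to stable storage
+ *
+ * Recovery undoes or redoes step 2 using the record from step 1.
+ */
+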
+/*
+ * __fop_create --
+ * Create a (transactionally protected) file system object. This is used
+ * to create DB files now, potentially blobs, queue extents and anything
+ * else you wish to store in a file system object.
+ *
+ * PUBLIC: int __fop_create __P((ENV *, DB_TXN *,
+ * PUBLIC: DB_FH **, const char *, const char **, APPNAME, int, u_int32_t));
+ */
+int
+__fop_create(env, txn, fhpp, name, dirp, appname, mode, flags)
+ ENV *env;
+ DB_TXN *txn;
+ DB_FH **fhpp;
+ const char *name, **dirp;
+ APPNAME appname;
+ int mode;
+ u_int32_t flags;
+{
+ DBT data, dirdata;
+ DB_FH *fhp;
+ DB_LSN lsn;
+ int ret;
+ char *real_name;
+
+ real_name = NULL;
+ fhp = NULL;
+
+ if ((ret = __db_appname(env, appname, name, dirp, &real_name)) != 0)
+ return (ret);
+
+ if (mode == 0)
+ mode = DB_MODE_600;
+
+ if (DBENV_LOGGING(env)
+#if !defined(DEBUG_WOP)
+ && txn != NULL
+#endif
+ ) {
+ DB_INIT_DBT(data, name, strlen(name) + 1);
+ if (dirp != NULL && *dirp != NULL)
+ DB_INIT_DBT(dirdata, *dirp, strlen(*dirp) + 1);
+ else
+ memset(&dirdata, 0, sizeof(dirdata));
+ if ((ret = __fop_create_log(env, txn, &lsn,
+ flags | DB_FLUSH,
+ &data, &dirdata, (u_int32_t)appname, (u_int32_t)mode)) != 0)
+ goto err;
+ }
+
+ DB_ENV_TEST_RECOVERY(env, DB_TEST_POSTLOG, ret, name);
+
+ if (fhpp == NULL)
+ fhpp = &fhp;
+ ret = __os_open(
+ env, real_name, 0, DB_OSO_CREATE | DB_OSO_EXCL, mode, fhpp);
+
+err:
+DB_TEST_RECOVERY_LABEL
+ if (fhpp == &fhp && fhp != NULL)
+ (void)__os_closehandle(env, fhp);
+ if (real_name != NULL)
+ __os_free(env, real_name);
+ return (ret);
+}
+
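+/*
+ * Illustrative call (a sketch with assumed arguments, not code from
+ * this tree):
+ *
+ *	DB_FH *fhp = NULL;
+ *	if ((ret = __fop_create(env, txn, &fhp, "a.db",
+ *	    NULL, DB_APP_DATA, DB_MODE_600, 0)) == 0)
+ *		(void)__os_closehandle(env, fhp);
+ *
+ * The create log record is written (with DB_FLUSH) before __os_open is
+ * attempted, so a file whose create was never logged cannot survive a
+ * crash unexplained.
+ */
+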
+/*
+ * __fop_remove --
+ * Remove a file system object.
+ *
+ * PUBLIC: int __fop_remove __P((ENV *, DB_TXN *,
+ * PUBLIC: u_int8_t *, const char *, const char **, APPNAME, u_int32_t));
+ */
+int
+__fop_remove(env, txn, fileid, name, dirp, appname, flags)
+ ENV *env;
+ DB_TXN *txn;
+ u_int8_t *fileid;
+ const char *name, **dirp;
+ APPNAME appname;
+ u_int32_t flags;
+{
+ DBT fdbt, ndbt;
+ DB_LSN lsn;
+ char *real_name;
+ int ret;
+
+ real_name = NULL;
+
+ if ((ret = __db_appname(env, appname, name, dirp, &real_name)) != 0)
+ goto err;
+
+ if (!IS_REAL_TXN(txn)) {
+ if (fileid != NULL && (ret = __memp_nameop(
+ env, fileid, NULL, real_name, NULL, 0)) != 0)
+ goto err;
+ } else {
+ if (DBENV_LOGGING(env)
+#if !defined(DEBUG_WOP)
+ && txn != NULL
+#endif
+ ) {
+ memset(&fdbt, 0, sizeof(fdbt));
+ fdbt.data = fileid;
+ fdbt.size = fileid == NULL ? 0 : DB_FILE_ID_LEN;
+ DB_INIT_DBT(ndbt, name, strlen(name) + 1);
+ if ((ret = __fop_remove_log(env, txn, &lsn,
+ flags, &ndbt, &fdbt, (u_int32_t)appname)) != 0)
+ goto err;
+ }
+ ret = __txn_remevent(env, txn, real_name, fileid, 0);
+ }
+
+err: if (real_name != NULL)
+ __os_free(env, real_name);
+ return (ret);
+}
+
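+/*
+ * Note the split above (a reading aid): without a real transaction the
+ * name is removed immediately through __memp_nameop, while inside a
+ * transaction the unlink is deferred to commit by registering a remove
+ * event with __txn_remevent, so an abort simply never performs it.
+ */
+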
+/*
+ * __fop_write
+ *
+ * Write "size" bytes from "buf" to file "name" beginning at offset "off."
+ * If the file is open, supply a handle in fhp. Istmp indicates whether this is
+ * an operation that needs to be undone in the face of failure (i.e., if
+ * this is a write to a temporary file, we're simply going to remove the
+ * file, so don't worry about undoing the write).
+ *
+ * Currently, we *only* use this with istmp true. If we need more general
+ * handling, then we'll have to zero out regions on abort (and possibly
+ * log the before image of the data in the log record).
+ *
+ * PUBLIC: int __fop_write __P((ENV *, DB_TXN *,
+ * PUBLIC: const char *, const char *, APPNAME, DB_FH *, u_int32_t,
+ * PUBLIC: db_pgno_t, u_int32_t, void *, u_int32_t, u_int32_t, u_int32_t));
+ */
+int
+__fop_write(env, txn,
+ name, dirname, appname, fhp, pgsize, pageno, off, buf, size, istmp, flags)
+ ENV *env;
+ DB_TXN *txn;
+ const char *name, *dirname;
+ APPNAME appname;
+ DB_FH *fhp;
+ u_int32_t pgsize;
+ db_pgno_t pageno;
+ u_int32_t off;
+ void *buf;
+ u_int32_t size, istmp, flags;
+{
+ DBT data, namedbt, dirdbt;
+ DB_LSN lsn;
+ size_t nbytes;
+ int local_open, ret, t_ret;
+ char *real_name;
+
+ DB_ASSERT(env, istmp != 0);
+
+ ret = local_open = 0;
+ real_name = NULL;
+
+ if (DBENV_LOGGING(env)
+#if !defined(DEBUG_WOP)
+ && txn != NULL
+#endif
+ ) {
+ memset(&data, 0, sizeof(data));
+ data.data = buf;
+ data.size = size;
+ DB_INIT_DBT(namedbt, name, strlen(name) + 1);
+ if (dirname != NULL)
+ DB_INIT_DBT(dirdbt, dirname, strlen(dirname) + 1);
+ else
+ memset(&dirdbt, 0, sizeof(dirdbt));
+ if ((ret = __fop_write_log(env, txn,
+ &lsn, flags, &namedbt, &dirdbt, (u_int32_t)appname,
+ pgsize, pageno, off, &data, istmp)) != 0)
+ goto err;
+ }
+
+ if (fhp == NULL) {
+ /* File isn't open; we need to reopen it. */
+ if ((ret = __db_appname(env,
+ appname, name, &dirname, &real_name)) != 0)
+ return (ret);
+
+ if ((ret = __os_open(env, real_name, 0, 0, 0, &fhp)) != 0)
+ goto err;
+ local_open = 1;
+ }
+
+ /* Seek to offset. */
+ if ((ret = __os_seek(env, fhp, pageno, pgsize, off)) != 0)
+ goto err;
+
+ /* Now do the write. */
+ if ((ret = __os_write(env, fhp, buf, size, &nbytes)) != 0)
+ goto err;
+
+err: if (local_open &&
+ (t_ret = __os_closehandle(env, fhp)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (real_name != NULL)
+ __os_free(env, real_name);
+ return (ret);
+}
+
+/*
+ * __fop_rename --
+ * Change a file's name.
+ *
+ * PUBLIC: int __fop_rename __P((ENV *, DB_TXN *, const char *, const char *,
+ * PUBLIC: const char **, u_int8_t *, APPNAME, int, u_int32_t));
+ */
+int
+__fop_rename(env, txn, oldname, newname, dirp, fid, appname, with_undo, flags)
+ ENV *env;
+ DB_TXN *txn;
+ const char *oldname;
+ const char *newname;
+ const char **dirp;
+ u_int8_t *fid;
+ APPNAME appname;
+ int with_undo;
+ u_int32_t flags;
+{
+ DBT fiddbt, dir, new, old;
+ DB_LSN lsn;
+ int ret;
+ char *n, *o;
+
+ o = n = NULL;
+ if ((ret = __db_appname(env, appname, oldname, dirp, &o)) != 0)
+ goto err;
+ if ((ret = __db_appname(env, appname, newname, dirp, &n)) != 0)
+ goto err;
+
+ if (DBENV_LOGGING(env)
+#if !defined(DEBUG_WOP)
+ && txn != NULL
+#endif
+ ) {
+ DB_INIT_DBT(old, oldname, strlen(oldname) + 1);
+ DB_INIT_DBT(new, newname, strlen(newname) + 1);
+ if (dirp != NULL && *dirp != NULL)
+ DB_INIT_DBT(dir, *dirp, strlen(*dirp) + 1);
+ else
+ memset(&dir, 0, sizeof(dir));
+ memset(&fiddbt, 0, sizeof(fiddbt));
+ fiddbt.data = fid;
+ fiddbt.size = DB_FILE_ID_LEN;
+ if (with_undo)
+ ret = __fop_rename_log(env,
+ txn, &lsn, flags | DB_FLUSH,
+ &old, &new, &dir, &fiddbt, (u_int32_t)appname);
+ else
+ ret = __fop_rename_noundo_log(env,
+ txn, &lsn, flags | DB_FLUSH,
+ &old, &new, &dir, &fiddbt, (u_int32_t)appname);
+ if (ret != 0)
+ goto err;
+ }
+
+ ret = __memp_nameop(env, fid, newname, o, n, 0);
+
+err: if (o != NULL)
+ __os_free(env, o);
+ if (n != NULL)
+ __os_free(env, n);
+ return (ret);
+}
diff --git a/src/fileops/fop_rec.c b/src/fileops/fop_rec.c
new file mode 100644
index 00000000..52d6175d
--- /dev/null
+++ b/src/fileops/fop_rec.c
@@ -0,0 +1,697 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/fop.h"
+#include "dbinc/db_am.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+static int __fop_rename_recover_int
+ __P((ENV *, DBT *, DB_LSN *, db_recops, void *, int));
+static int __fop_rename_42_recover_int
+ __P((ENV *, DBT *, DB_LSN *, db_recops, void *, int));
+
+/*
+ * The transactional guarantees Berkeley DB provides for file
+ * system level operations (database physical file create, delete,
+ * rename) are based on our understanding of current file system
+ * semantics; a system that does not provide these semantics and
+ * guarantees could be in danger.
+ *
+ * First, as in standard database changes, fsync and fdatasync must
+ * work: when applied to the log file, the records written into the
+ * log must be transferred to stable storage.
+ *
+ * Second, it must not be possible for the log file to be removed
+ * without previous file system level operations being flushed to
+ * stable storage. Berkeley DB applications write log records
+ * describing file system operations into the log, then perform the
+ * file system operation, then commit the enclosing transaction
+ * (which flushes the log file to stable storage). Subsequently,
+ * a database environment checkpoint may make it possible for the
+ * application to remove the log file containing the record of the
+ * file system operation. DB's transactional guarantees for file
+ * system operations require the log file removal not succeed until
+ * all previous filesystem operations have been flushed to stable
+ * storage. In other words, the flush of the log file, or the
+ * removal of the log file, must block until all previous
+ * filesystem operations have been flushed to stable storage. This
+ * semantic is not, as far as we know, required by any existing
+ * standards document, but we have never seen a filesystem where
+ * it does not apply.
+ */
+
+/*
+ * __fop_create_recover --
+ * Recovery function for create.
+ *
+ * PUBLIC: int __fop_create_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__fop_create_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __fop_create_args *argp;
+ DB_FH *fhp;
+ DBMETA *meta;
+ u_int8_t mbuf[DBMETASIZE];
+ int ret;
+ char *real_name;
+ const char *dirname;
+
+ COMPQUIET(info, NULL);
+
+ real_name = NULL;
+ REC_PRINT(__fop_create_print);
+ REC_NOOP_INTRO(__fop_create_read);
+ meta = (DBMETA *)mbuf;
+
+ if (argp->dirname.size == 0)
+ dirname = NULL;
+ else
+ dirname = (const char *)argp->dirname.data;
+
+ if ((ret = __db_appname(env, (APPNAME)argp->appname == DB_APP_DATA ?
+ DB_APP_RECOVER : (APPNAME)argp->appname,
+ (const char *)argp->name.data, &dirname, &real_name)) != 0)
+ goto out;
+
+ if (DB_UNDO(op)) {
+ /*
+ * If the file was opened in mpool, we must mark it as
+ * dead via nameop, which will also unlink the file.
+ */
+ if (__os_open(env, real_name, 0, 0, 0, &fhp) == 0) {
+ if (__fop_read_meta(env,
+ real_name, mbuf, DBMETASIZE, fhp, 1, NULL) == 0 &&
+ __db_chk_meta(env, NULL, meta, 1) == 0) {
+ if ((ret = __memp_nameop(env,
+ meta->uid, NULL, real_name, NULL, 0)) != 0)
+ goto out;
+ } else {
+ (void)__os_closehandle(env, fhp);
+ goto do_unlink;
+ }
+ (void)__os_closehandle(env, fhp);
+ } else
+do_unlink: (void)__os_unlink(env, real_name, 0);
+ } else if (DB_REDO(op)) {
+ if ((ret = __os_open(env, real_name, 0,
+ DB_OSO_CREATE, (int)argp->mode, &fhp)) == 0)
+ (void)__os_closehandle(env, fhp);
+ else
+ goto out;
+ }
+
+ *lsnp = argp->prev_lsn;
+
+out: if (real_name != NULL)
+ __os_free(env, real_name);
+
+ REC_NOOP_CLOSE;
+}
+
+/*
+ * __fop_create_42_recover --
+ * Recovery function for create.
+ *
+ * PUBLIC: int __fop_create_42_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__fop_create_42_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __fop_create_args *argp;
+ DB_FH *fhp;
+ DBMETA *meta;
+ u_int8_t mbuf[DBMETASIZE];
+ int ret;
+ char *real_name;
+
+ COMPQUIET(info, NULL);
+
+ real_name = NULL;
+ REC_PRINT(__fop_create_print);
+ REC_NOOP_INTRO(__fop_create_read);
+ meta = (DBMETA *)mbuf;
+
+ if ((ret = __db_appname(env, (APPNAME)argp->appname,
+ (const char *)argp->name.data, NULL, &real_name)) != 0)
+ goto out;
+
+ if (DB_UNDO(op)) {
+ /*
+ * If the file was opened in mpool, we must mark it as
+ * dead via nameop, which will also unlink the file.
+ */
+ if (__os_open(env, real_name, 0, 0, 0, &fhp) == 0) {
+ if (__fop_read_meta(env,
+ real_name, mbuf, DBMETASIZE, fhp, 1, NULL) == 0 &&
+ __db_chk_meta(env, NULL, meta, 1) == 0) {
+ if ((ret = __memp_nameop(env,
+ meta->uid, NULL, real_name, NULL, 0)) != 0)
+ goto out;
+ } else
+ goto do_unlink;
+ (void)__os_closehandle(env, fhp);
+ } else
+do_unlink: (void)__os_unlink(env, real_name, 0);
+ } else if (DB_REDO(op)) {
+ if ((ret = __os_open(env, real_name, 0,
+ DB_OSO_CREATE, (int)argp->mode, &fhp)) == 0)
+ (void)__os_closehandle(env, fhp);
+ else
+ goto out;
+ }
+
+ *lsnp = argp->prev_lsn;
+
+out: if (real_name != NULL)
+ __os_free(env, real_name);
+
+ REC_NOOP_CLOSE;
+}
+
+/*
+ * __fop_remove_recover --
+ * Recovery function for remove.
+ *
+ * PUBLIC: int __fop_remove_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__fop_remove_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __fop_remove_args *argp;
+ int ret;
+ char *real_name;
+
+ COMPQUIET(info, NULL);
+
+ real_name = NULL;
+ REC_PRINT(__fop_remove_print);
+ REC_NOOP_INTRO(__fop_remove_read);
+
+ if ((ret = __db_appname(env, (APPNAME)argp->appname,
+ (const char *)argp->name.data, NULL, &real_name)) != 0)
+ goto out;
+
+ /* It's OK if the file is not there. */
+ if (DB_REDO(op))
+ (void)__memp_nameop(env,
+ (u_int8_t *)argp->fid.data, NULL, real_name, NULL, 0);
+
+ *lsnp = argp->prev_lsn;
+out: if (real_name != NULL)
+ __os_free(env, real_name);
+ REC_NOOP_CLOSE;
+}
+
+/*
+ * __fop_write_recover --
+ * Recovery function for writechunk.
+ *
+ * PUBLIC: int __fop_write_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__fop_write_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __fop_write_args *argp;
+ int ret;
+
+ COMPQUIET(info, NULL);
+
+ REC_PRINT(__fop_write_print);
+ REC_NOOP_INTRO(__fop_write_read);
+
+ ret = 0;
+ if (DB_UNDO(op))
+ DB_ASSERT(env, argp->flag != 0);
+ else if (DB_REDO(op))
+ ret = __fop_write(env,
+ argp->txnp, argp->name.data,
+ argp->dirname.size == 0 ? NULL : argp->dirname.data,
+ (APPNAME)argp->appname == DB_APP_DATA ? DB_APP_RECOVER :
+ (APPNAME)argp->appname,
+ NULL, argp->pgsize, argp->pageno, argp->offset,
+ argp->page.data, argp->page.size, argp->flag, 0);
+
+ if (ret == 0)
+ *lsnp = argp->prev_lsn;
+ REC_NOOP_CLOSE;
+}
+
+/*
+ * __fop_write_42_recover --
+ * Recovery function for writechunk.
+ *
+ * PUBLIC: int __fop_write_42_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__fop_write_42_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __fop_write_args *argp;
+ int ret;
+
+ COMPQUIET(info, NULL);
+
+ REC_PRINT(__fop_write_print);
+ REC_NOOP_INTRO(__fop_write_read);
+
+ ret = 0;
+ if (DB_UNDO(op))
+ DB_ASSERT(env, argp->flag != 0);
+ else if (DB_REDO(op))
+ ret = __fop_write(env,
+ argp->txnp, argp->name.data, NULL, (APPNAME)argp->appname,
+ NULL, argp->pgsize, argp->pageno, argp->offset,
+ argp->page.data, argp->page.size, argp->flag, 0);
+
+ if (ret == 0)
+ *lsnp = argp->prev_lsn;
+ REC_NOOP_CLOSE;
+}
+
+/*
+ * __fop_rename_recover --
+ * Recovery functions for rename. There are two variants that
+ * both use the same utility function. Had we known about this on day
+ * one, we would have simply added a parameter. However, since we need
+ * to retain old records for backward compatibility (online-upgrade)
+ * wrapping the two seems like the right solution.
+ *
+ * PUBLIC: int __fop_rename_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ *
+ * PUBLIC: int __fop_rename_noundo_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__fop_rename_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ return (__fop_rename_recover_int(env, dbtp, lsnp, op, info, 1));
+}
+
+int
+__fop_rename_noundo_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ return (__fop_rename_recover_int(env, dbtp, lsnp, op, info, 0));
+}
+
+static int
+__fop_rename_recover_int(env, dbtp, lsnp, op, info, undo)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+ int undo;
+{
+ __fop_rename_args *argp;
+ APPNAME appname;
+ DB_FH *fhp;
+ DBMETA *meta;
+ u_int8_t *fileid, mbuf[DBMETASIZE];
+ int ret;
+ char *real_new, *real_old, *src;
+ const char *dirname;
+
+ COMPQUIET(info, NULL);
+
+ fhp = NULL;
+ meta = (DBMETA *)&mbuf[0];
+ ret = 0;
+ real_new = real_old = NULL;
+
+ REC_PRINT(__fop_rename_print);
+ REC_NOOP_INTRO(__fop_rename_read);
+ fileid = argp->fileid.data;
+
+ if (argp->dirname.size == 0)
+ dirname = NULL;
+ else
+ dirname = (const char *)argp->dirname.data;
+
+ if ((APPNAME)argp->appname == DB_APP_DATA)
+ appname = DB_APP_RECOVER;
+ else
+ appname = (APPNAME)argp->appname;
+
+ if ((ret = __db_appname(env, appname, (const char *)argp->newname.data,
+ &dirname, &real_new)) != 0)
+ goto out;
+ if ((ret = __db_appname(env, appname, (const char *)argp->oldname.data,
+ &dirname, &real_old)) != 0)
+ goto out;
+
+ /*
+ * Verify that we are manipulating the correct file. We should always
+ * be OK on an ABORT or an APPLY, but during recovery, we have to
+ * check.
+ */
+ if (op != DB_TXN_ABORT && op != DB_TXN_APPLY) {
+ src = DB_UNDO(op) ? real_new : real_old;
+ /*
+ * Interpret any error as meaning that the file either doesn't
+ * exist, doesn't have a meta-data page, or is in some other
+ * way, shape or form, incorrect, so that we should not restore
+ * it.
+ */
+ if (__os_open(env, src, 0, 0, 0, &fhp) != 0)
+ goto done;
+ if (__fop_read_meta(env,
+ src, mbuf, DBMETASIZE, fhp, 1, NULL) != 0)
+ goto done;
+ if (__db_chk_meta(env, NULL, meta, 1) != 0)
+ goto done;
+ if (memcmp(argp->fileid.data, meta->uid, DB_FILE_ID_LEN) != 0)
+ goto done;
+ (void)__os_closehandle(env, fhp);
+ fhp = NULL;
+ if (DB_REDO(op)) {
+ /*
+ * Check to see if the target file exists. If it
+ * does and it does not have the proper id then
+ * it is a later version. We just remove the source
+ * file since the state of the world is beyond this
+ * point.
+ */
+ if (__os_open(env, real_new, 0, 0, 0, &fhp) == 0 &&
+ __fop_read_meta(env, src, mbuf,
+ DBMETASIZE, fhp, 1, NULL) == 0 &&
+ __db_chk_meta(env, NULL, meta, 1) == 0 &&
+ memcmp(argp->fileid.data,
+ meta->uid, DB_FILE_ID_LEN) != 0) {
+ (void)__memp_nameop(env,
+ fileid, NULL, real_old, NULL, 0);
+ goto done;
+ }
+ }
+ }
+
+ if (undo && DB_UNDO(op))
+ (void)__memp_nameop(env, fileid,
+ (const char *)argp->oldname.data, real_new, real_old, 0);
+ if (DB_REDO(op))
+ (void)__memp_nameop(env, fileid,
+ (const char *)argp->newname.data, real_old, real_new, 0);
+
+done: *lsnp = argp->prev_lsn;
+out: if (real_new != NULL)
+ __os_free(env, real_new);
+ if (real_old != NULL)
+ __os_free(env, real_old);
+ if (fhp != NULL)
+ (void)__os_closehandle(env, fhp);
+
+ REC_NOOP_CLOSE;
+}
+/*
+ * __fop_rename_42_recover --
+ * Recovery functions for rename. There are two variants that
+ * both use the same utility function. Had we known about this on day
+ * one, we would have simply added a parameter. However, since we need
+ * to retain old records for backward compatibility (online-upgrade),
+ * wrapping the two seems like the right solution.
+ *
+ * PUBLIC: int __fop_rename_42_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ *
+ * PUBLIC: int __fop_rename_noundo_46_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__fop_rename_42_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ return (__fop_rename_42_recover_int(env, dbtp, lsnp, op, info, 1));
+}
+
+int
+__fop_rename_noundo_46_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ return (__fop_rename_42_recover_int(env, dbtp, lsnp, op, info, 0));
+}
+
+static int
+__fop_rename_42_recover_int(env, dbtp, lsnp, op, info, undo)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+ int undo;
+{
+ __fop_rename_args *argp;
+ DB_FH *fhp;
+ DBMETA *meta;
+ u_int8_t *fileid, mbuf[DBMETASIZE];
+ int ret;
+ char *real_new, *real_old, *src;
+
+ COMPQUIET(info, NULL);
+
+ fhp = NULL;
+ meta = (DBMETA *)&mbuf[0];
+ ret = 0;
+ real_new = real_old = NULL;
+
+ REC_PRINT(__fop_rename_print);
+ REC_NOOP_INTRO(__fop_rename_read);
+ fileid = argp->fileid.data;
+
+ if ((ret = __db_appname(env, (APPNAME)argp->appname,
+ (const char *)argp->newname.data, NULL, &real_new)) != 0)
+ goto out;
+ if ((ret = __db_appname(env, (APPNAME)argp->appname,
+ (const char *)argp->oldname.data, NULL, &real_old)) != 0)
+ goto out;
+
+ /*
+ * Verify that we are manipulating the correct file. We should always
+ * be OK on an ABORT or an APPLY, but during recovery, we have to
+ * check.
+ */
+ if (op != DB_TXN_ABORT && op != DB_TXN_APPLY) {
+ src = DB_UNDO(op) ? real_new : real_old;
+ /*
+ * Interpret any error as meaning that the file either doesn't
+ * exist, doesn't have a meta-data page, or is in some other
+ * way, shape or form, incorrect, so that we should not restore
+ * it.
+ */
+ if (__os_open(env, src, 0, 0, 0, &fhp) != 0)
+ goto done;
+ if (__fop_read_meta(env,
+ src, mbuf, DBMETASIZE, fhp, 1, NULL) != 0)
+ goto done;
+ if (__db_chk_meta(env, NULL, meta, 1) != 0)
+ goto done;
+ if (memcmp(argp->fileid.data, meta->uid, DB_FILE_ID_LEN) != 0)
+ goto done;
+ (void)__os_closehandle(env, fhp);
+ fhp = NULL;
+ if (DB_REDO(op)) {
+ /*
+ * Check to see if the target file exists. If it
+ * does and it does not have the proper id then
+ * it is a later version. We just remove the source
+ * file since the state of the world is beyond this
+ * point.
+ */
+ if (__os_open(env, real_new, 0, 0, 0, &fhp) == 0 &&
+ __fop_read_meta(env, src, mbuf,
+ DBMETASIZE, fhp, 1, NULL) == 0 &&
+ __db_chk_meta(env, NULL, meta, 1) == 0 &&
+ memcmp(argp->fileid.data,
+ meta->uid, DB_FILE_ID_LEN) != 0) {
+ (void)__memp_nameop(env,
+ fileid, NULL, real_old, NULL, 0);
+ goto done;
+ }
+ }
+ }
+
+ if (undo && DB_UNDO(op))
+ (void)__memp_nameop(env, fileid,
+ (const char *)argp->oldname.data, real_new, real_old, 0);
+ if (DB_REDO(op))
+ (void)__memp_nameop(env, fileid,
+ (const char *)argp->newname.data, real_old, real_new, 0);
+
+done: *lsnp = argp->prev_lsn;
+out: if (real_new != NULL)
+ __os_free(env, real_new);
+ if (real_old != NULL)
+ __os_free(env, real_old);
+ if (fhp != NULL)
+ (void)__os_closehandle(env, fhp);
+
+ REC_NOOP_CLOSE;
+}
+
+/*
+ * __fop_file_remove_recover --
+ * Recovery function for file_remove. On the REDO pass, we need to
+ * make sure no one recreated the file while we weren't looking.  On the
+ * UNDO pass, we must check whether the file we are interested in is the
+ * one that exists, and then set the status of the child transaction
+ * depending on what we find out.
+ *
+ * PUBLIC: int __fop_file_remove_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__fop_file_remove_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __fop_file_remove_args *argp;
+ DBMETA *meta;
+ DB_FH *fhp;
+ size_t len;
+ u_int8_t mbuf[DBMETASIZE];
+ u_int32_t cstat, ret_stat;
+ int is_real, is_tmp, ret;
+ char *real_name;
+
+ fhp = NULL;
+ meta = (DBMETA *)&mbuf[0];
+ is_real = is_tmp = 0;
+ real_name = NULL;
+ REC_PRINT(__fop_file_remove_print);
+ REC_NOOP_INTRO(__fop_file_remove_read);
+
+ /*
+ * This record is only interesting on the backward, forward, and
+ * apply phases.
+ */
+ if (op != DB_TXN_BACKWARD_ROLL &&
+ op != DB_TXN_FORWARD_ROLL && op != DB_TXN_APPLY)
+ goto done;
+
+ if ((ret = __db_appname(env, (APPNAME)argp->appname,
+ argp->name.data, NULL, &real_name)) != 0)
+ goto out;
+
+ /* Verify that we are manipulating the correct file. */
+ len = 0;
+ if (__os_open(env, real_name, 0, 0, 0, &fhp) != 0 ||
+ (ret = __fop_read_meta(env, real_name,
+ mbuf, DBMETASIZE, fhp, 1, &len)) != 0) {
+ /*
+ * If len is non-zero, then the file exists and has something
+ * in it, but that something isn't a full meta-data page, so
+ * this is very bad. Bail out!
+ */
+ if (len != 0)
+ goto out;
+
+ /* File does not exist. */
+ cstat = TXN_EXPECTED;
+ } else {
+ /*
+ * We can ignore errors here since we'll simply fail the
+ * checks below and assume this is the wrong file.
+ */
+ (void)__db_chk_meta(env, NULL, meta, 1);
+ is_real =
+ memcmp(argp->real_fid.data, meta->uid, DB_FILE_ID_LEN) == 0;
+ is_tmp =
+ memcmp(argp->tmp_fid.data, meta->uid, DB_FILE_ID_LEN) == 0;
+
+ if (!is_real && !is_tmp)
+ /* File exists, but isn't what we were removing. */
+ cstat = TXN_IGNORE;
+ else
+ /* File exists and is the one that we were removing. */
+ cstat = TXN_COMMIT;
+ }
+ if (fhp != NULL) {
+ (void)__os_closehandle(env, fhp);
+ fhp = NULL;
+ }
+
+ if (DB_UNDO(op)) {
+ /* On the backward pass, we leave a note for the child txn. */
+ if ((ret = __db_txnlist_update(env,
+ info, argp->child, cstat, NULL, &ret_stat, 1)) != 0)
+ goto out;
+ } else if (DB_REDO(op)) {
+ /*
+ * On the forward pass, check if someone recreated the
+ * file while we weren't looking.
+ */
+ if (cstat == TXN_COMMIT)
+ (void)__memp_nameop(env,
+ is_real ? argp->real_fid.data : argp->tmp_fid.data,
+ NULL, real_name, NULL, 0);
+ }
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (real_name != NULL)
+ __os_free(env, real_name);
+ if (fhp != NULL)
+ (void)__os_closehandle(env, fhp);
+ REC_NOOP_CLOSE;
+}
diff --git a/src/fileops/fop_util.c b/src/fileops/fop_util.c
new file mode 100644
index 00000000..1925ffd1
--- /dev/null
+++ b/src/fileops/fop_util.c
@@ -0,0 +1,1841 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/hash.h"
+#include "dbinc/fop.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+static int __fop_set_pgsize __P((DB *, DB_FH *, const char *));
+static int __fop_inmem_create __P((DB *, const char *, DB_TXN *, u_int32_t));
+static int __fop_inmem_dummy __P((DB *, DB_TXN *, const char *, u_int8_t *));
+static int __fop_inmem_read_meta __P((DB *, DB_TXN *, const char *, u_int32_t,
+ u_int32_t));
+static int __fop_inmem_swap __P((DB *, DB *, DB_TXN *,
+ const char *, const char *, const char *, DB_LOCKER *));
+static int __fop_ondisk_dummy __P((DB *, DB_TXN *, const char *, u_int8_t *));
+static int __fop_ondisk_swap __P((DB *, DB *, DB_TXN *,
+ const char *, const char *, const char *, DB_LOCKER *));
+
+/*
+ * Acquire the environment meta-data lock. The parameters are the
+ * environment (ENV), the locker id to use in acquiring the lock (ID)
+ * and a pointer to a DB_LOCK.
+ *
+ * !!!
+ * Turn off locking for Critical Path. The application must do its own
+ * synchronization of open/create. Two threads creating and opening a
+ * file at the same time may have unpredictable results.
+ */
+#ifdef CRITICALPATH_10266
+#define GET_ENVLOCK(ENV, ID, L) (0)
+#else
+#define GET_ENVLOCK(ENV, ID, L) do { \
+ DBT __dbt; \
+ u_int32_t __lockval; \
+ \
+ if (LOCKING_ON((ENV))) { \
+ __lockval = 1; \
+ __dbt.data = &__lockval; \
+ __dbt.size = sizeof(__lockval); \
+ if ((ret = __lock_get((ENV), (ID), \
+ 0, &__dbt, DB_LOCK_WRITE, (L))) != 0) \
+ goto err; \
+ } \
+} while (0)
+#endif
+
+#define RESET_MPF(D, F) do { \
+ (void)__memp_fclose((D)->mpf, (F)); \
+ (D)->mpf = NULL; \
+ F_CLR((D), DB_AM_OPEN_CALLED); \
+ if ((ret = __memp_fcreate((D)->env, &(D)->mpf)) != 0) \
+ goto err; \
+} while (0)
+
+/*
+ * If we open a file handle and our caller is doing fcntl(2) locking,
+ * we can't close the handle because that would discard the caller's
+ * lock. Save it until we close or refresh the DB handle.
+ */
+#define CLOSE_HANDLE(D, F) { \
+ if ((F) != NULL) { \
+ if (LF_ISSET(DB_FCNTL_LOCKING)) \
+ (D)->saved_open_fhp = (F); \
+ else if ((t_ret = \
+ __os_closehandle((D)->env, (F))) != 0) { \
+ if (ret == 0) \
+ ret = t_ret; \
+ goto err; \
+ } \
+ (F) = NULL; \
+ } \
+}
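+
+/*
+ * A minimal usage sketch for the two macros above (illustrative only):
+ * both expand into code that depends on surrounding context -- a local
+ * "ret" (and "t_ret" for CLOSE_HANDLE), an "err" label to jump to on
+ * failure and, for CLOSE_HANDLE, a local "flags" variable tested by
+ * LF_ISSET.  A caller therefore looks roughly like:
+ *
+ *	int ret, t_ret;
+ *	DB_LOCK elock;
+ *
+ *	LOCK_INIT(elock);
+ *	GET_ENVLOCK(env, locker, &elock);	(jumps to err on failure)
+ *	...
+ *	CLOSE_HANDLE(dbp, fhp);
+ *
+ *err:	(void)__ENV_LPUT(env, elock);
+ */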
+
+/*
+ * __fop_lock_handle --
+ *
+ * Get the handle lock for a database. If the envlock is specified, do this
+ * as a lock_vec call that releases the environment lock before acquiring the
+ * handle lock.
+ *
+ * PUBLIC: int __fop_lock_handle __P((ENV *,
+ * PUBLIC: DB *, DB_LOCKER *, db_lockmode_t, DB_LOCK *, u_int32_t));
+ *
+ */
+int
+__fop_lock_handle(env, dbp, locker, mode, elockp, flags)
+ ENV *env;
+ DB *dbp;
+ DB_LOCKER *locker;
+ db_lockmode_t mode;
+ DB_LOCK *elockp;
+ u_int32_t flags;
+{
+ DBT fileobj;
+ DB_LOCKREQ reqs[2], *ereq;
+ DB_LOCK_ILOCK lock_desc;
+ int ret;
+
+ if (!LOCKING_ON(env) ||
+ F_ISSET(dbp, DB_AM_COMPENSATE | DB_AM_RECOVER))
+ return (0);
+
+ /*
+ * If we are in recovery, the only locking we should be
+ * doing is on the global environment. The one exception
+ * is if we are opening an exclusive database on a client
+ * syncing with the master.
+ */
+ if (IS_RECOVERING(env) && !F2_ISSET(dbp, DB2_AM_INTEXCL))
+ return (elockp == NULL ? 0 : __ENV_LPUT(env, *elockp));
+
+ memcpy(lock_desc.fileid, dbp->fileid, DB_FILE_ID_LEN);
+ lock_desc.pgno = dbp->meta_pgno;
+ lock_desc.type = DB_HANDLE_LOCK;
+
+ memset(&fileobj, 0, sizeof(fileobj));
+ fileobj.data = &lock_desc;
+ fileobj.size = sizeof(lock_desc);
+ DB_TEST_SUBLOCKS(env, flags);
+ if (F2_ISSET(dbp, DB2_AM_INTEXCL))
+ flags |= DB_LOCK_IGNORE_REC;
+ if (elockp == NULL)
+ ret = __lock_get(env, locker,
+ flags, &fileobj, mode, &dbp->handle_lock);
+ else {
+ reqs[0].op = DB_LOCK_PUT;
+ reqs[0].lock = *elockp;
+ reqs[1].op = DB_LOCK_GET;
+ reqs[1].mode = mode;
+ reqs[1].obj = &fileobj;
+ reqs[1].timeout = 0;
+ if ((ret = __lock_vec(env,
+ locker, flags, reqs, 2, &ereq)) == 0) {
+ dbp->handle_lock = reqs[1].lock;
+ if (elockp != &dbp->handle_lock)
+ LOCK_INIT(*elockp);
+ } else if (ereq != reqs)
+ LOCK_INIT(*elockp);
+ }
+
+ dbp->cur_locker = locker;
+ return (ret);
+}
+
+/*
+ * __fop_file_setup --
+ *
+ * Perform all the needed checking and locking to open up or create a
+ * file.
+ *
+ * There's a reason we don't push this code down into the buffer cache.
+ * The problem is that there's no information external to the file that
+ * we can use as a unique ID. UNIX has dev/inode pairs, but they are
+ * not necessarily unique after reboot, if the file was mounted via NFS.
+ * Windows has similar problems, as the FAT filesystem doesn't maintain
+ * dev/inode numbers across reboot. So, we must get something from the
+ * file we can use to ensure that, even after a reboot, the file we're
+ * joining in the cache is the right file for us to join. The solution
+ * we use is to maintain a file ID that's stored in the database, and
+ * that's why we have to open and read the file before calling into the
+ * buffer cache or obtaining a lock (we use this unique fileid to lock
+ * as well as to identify like files in the cache).
+ *
+ * There are a couple of idiosyncrasies that this code must support, in
+ * particular, DB_TRUNCATE and DB_FCNTL_LOCKING. First, we disallow
+ * DB_TRUNCATE in the presence of transactions, since opening a file with
+ * O_TRUNC will result in data being lost in an unrecoverable fashion.
+ * We also disallow DB_TRUNCATE if locking is enabled, because even in
+ * the presence of locking, we cannot avoid race conditions, so allowing
+ * DB_TRUNCATE with locking would be misleading. See SR [#7345] for more
+ * details.
+ *
+ * However, if you are running with neither locking nor transactions, then
+ * you can specify DB_TRUNCATE, and if you do so, we will truncate the file
+ * regardless of its contents.
+ *
+ * FCNTL locking introduces another set of complications. First, the only
+ * reason we support the DB_FCNTL_LOCKING flag is for historical compatibility
+ * with programs like Sendmail and Postfix. In these cases, the caller may
+ * already have a lock on the file; we need to make sure that any file handles
+ * we open remain open, because if we were to close them, the lock held by the
+ * caller would go away. Furthermore, Sendmail and/or Postfix need the ability
+ * to create databases in empty files. So, when you're doing FCNTL locking,
+ * it's reasonable that you are trying to create a database into a 0-length
+ * file and we allow it, while under normal conditions, we do not create
+ * databases if the files already exist and are not Berkeley DB files.
+ *
+ * PUBLIC: int __fop_file_setup __P((DB *, DB_THREAD_INFO *ip,
+ * PUBLIC: DB_TXN *, const char *, int, u_int32_t, u_int32_t *));
+ */
+int
+__fop_file_setup(dbp, ip, txn, name, mode, flags, retidp)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name;
+ int mode;
+ u_int32_t flags, *retidp;
+{
+ DBTYPE save_type;
+ DB_FH *fhp;
+ DB_LOCK elock;
+ DB_LOCKER *locker;
+ DB_TXN *stxn;
+ ENV *env;
+ size_t len;
+ APPNAME aflags;
+ u_int32_t dflags, oflags;
+ u_int8_t mbuf[DBMETASIZE];
+ int created_locker, create_ok, ret, retries, t_ret, tmp_created;
+ int truncating, was_inval;
+ char *real_name, *real_tmpname, *tmpname;
+ db_lockmode_t lockmode;
+
+ *retidp = TXN_INVALID;
+
+ env = dbp->env;
+ fhp = NULL;
+ LOCK_INIT(elock);
+ stxn = NULL;
+ created_locker = tmp_created = truncating = was_inval = 0;
+ real_name = real_tmpname = tmpname = NULL;
+ dflags = F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0;
+ aflags = LF_ISSET(DB_INTERNAL_PERSISTENT_DB) ? DB_APP_META :
+ (LF_ISSET(DB_INTERNAL_TEMPORARY_DB) ? DB_APP_NONE : DB_APP_DATA);
+ LF_CLR(DB_INTERNAL_PERSISTENT_DB | DB_INTERNAL_TEMPORARY_DB);
+
+ ret = 0;
+ retries = 0;
+ save_type = dbp->type;
+ if (F2_ISSET(dbp, DB2_AM_EXCL))
+ lockmode = DB_LOCK_WRITE;
+ else
+ lockmode = DB_LOCK_READ;
+
+ /*
+ * Get a lockerid for this handle. There are paths through queue
+ * rename and remove where this dbp already has a locker, so make
+ * sure we don't clobber it and conflict.
+ */
+ if (LOCKING_ON(env) &&
+ !F_ISSET(dbp, DB_AM_COMPENSATE) &&
+ !F_ISSET(dbp, DB_AM_RECOVER) &&
+ dbp->locker == DB_LOCK_INVALIDID) {
+ if ((ret = __lock_id(env, NULL, &dbp->locker)) != 0)
+ goto err;
+ created_locker = 1;
+ }
+ LOCK_INIT(dbp->handle_lock);
+
+ if (txn != NULL && dbp->locker != NULL && F_ISSET(txn, TXN_INFAMILY)) {
+ if ((ret = __lock_addfamilylocker(env,
+ txn->txnid, dbp->locker->id, 1)) != 0)
+ goto err;
+ txn = NULL;
+ }
+
+ locker = txn == NULL ? dbp->locker : txn->locker;
+
+ oflags = 0;
+ if (F_ISSET(dbp, DB_AM_INMEM))
+ real_name = (char *)name;
+ else {
+ /* Get the real backing file name. */
+ if ((ret = __db_appname(env,
+ aflags, name, &dbp->dirname, &real_name)) != 0)
+ goto err;
+
+ /* Fill in the default file mode. */
+ if (mode == 0)
+ mode = DB_MODE_660;
+
+ if (LF_ISSET(DB_RDONLY))
+ oflags |= DB_OSO_RDONLY;
+ if (LF_ISSET(DB_TRUNCATE))
+ oflags |= DB_OSO_TRUNC;
+ }
+
+ retries = 0;
+ create_ok = LF_ISSET(DB_CREATE);
+ LF_CLR(DB_CREATE);
+
+retry:
+ /*
+ * If we cannot create the file, only retry a few times. We
+ * think we might be in a race with another create, but it could
+ * be that the backup filename exists (that is, is left over from
+	 * be that the backup filename exists (that is, was left over from
+ * page while it is being written and fail the checksum.
+ */
+ if (++retries > DB_RETRY) {
+ __db_errx(env, DB_STR_A("0002",
+ "__fop_file_setup: Retry limit (%d) exceeded", "%d"),
+ DB_RETRY);
+ goto err;
+ }
+ if (!F_ISSET(dbp, DB_AM_COMPENSATE) && !F_ISSET(dbp, DB_AM_RECOVER))
+ GET_ENVLOCK(env, locker, &elock);
+ if (name == NULL)
+ ret = ENOENT;
+ else if (F_ISSET(dbp, DB_AM_INMEM)) {
+ ret = __env_mpool(dbp, name, flags);
+ /*
+		 * We are using __env_mpool as a check for existence.
+ * However, __env_mpool does an actual open and there
+ * are scenarios where the object exists, but cannot be
+ * opened, because our settings don't match those internally.
+ * We need to check for that explicitly. We'll need the
+ * mpool open to read the meta-data page, so we're going to
+ * have to temporarily turn this dbp into an UNKNOWN one.
+ */
+ if (ret == EINVAL) {
+ was_inval = 1;
+ save_type = dbp->type;
+ dbp->type = DB_UNKNOWN;
+ ret = __env_mpool(dbp, name, flags);
+ dbp->type = save_type;
+ }
+ } else
+ ret = __os_exists(env, real_name, NULL);
+
+ if (ret == 0) {
+ /*
+ * If the file exists, there are 5 possible cases:
+ * 1. DB_EXCL was specified so this is an error, unless
+ * this is a file left around after a rename and we
+ * are in the same transaction. This gets decomposed
+ * into several subcases, because we check for various
+ * errors before we know we're in rename.
+ * 2. We are truncating, and it doesn't matter what kind
+ * of file it is, we should open/create it.
+		 * 3. It is 0-length and we are not doing transactions (i.e.,
+		 *    we are sendmail); we should open/create into it.
+		 *    -- on-disk files only!
+		 * 4. It is a Berkeley DB file and we should simply open it.
+ * 5. It is not a BDB file and we should return an error.
+ */
+
+ /* Open file (if there is one). */
+reopen: if (!F_ISSET(dbp, DB_AM_INMEM) && (ret =
+ __os_open(env, real_name, 0, oflags, 0, &fhp)) != 0)
+ goto err;
+
+ /* Case 2: DB_TRUNCATE: we must do the creation in place. */
+ if (LF_ISSET(DB_TRUNCATE)) {
+ if (LF_ISSET(DB_EXCL)) {
+ /* Case 1a: DB_EXCL and DB_TRUNCATE. */
+ ret = EEXIST;
+ goto err;
+ }
+ tmpname = (char *)name;
+ goto creat2;
+ }
+
+ /* Cases 1,3-5: we need to read the meta-data page. */
+ if (F_ISSET(dbp, DB_AM_INMEM)) {
+ if (LOGGING_ON(env) && (ret = __env_dbreg_setup(dbp,
+ txn, NULL, name, TXN_INVALID)) != 0)
+ return (ret);
+ ret = __fop_inmem_read_meta(
+ dbp, txn, name, flags, DB_CHK_META|DB_CHK_ONLY);
+ } else {
+ ret = __fop_read_meta(env, real_name, mbuf,
+ sizeof(mbuf), fhp,
+ LF_ISSET(DB_NOERROR) ||
+ (LF_ISSET(DB_FCNTL_LOCKING) && txn == NULL) ? 1 : 0,
+ &len);
+
+ /* Case 3: 0-length, no txns. */
+ if (ret != 0 && len == 0 && txn == NULL) {
+ if (LF_ISSET(DB_EXCL)) {
+ /*
+ * Case 1b: DB_EXCL and
+ * 0-length file exists.
+ */
+ ret = EEXIST;
+ goto err;
+ }
+ tmpname = (char *)name;
+ if (create_ok)
+ goto creat2;
+ goto done;
+ }
+
+ /*
+ * Case 4: This is a valid file. Now check the
+ * checksum and decrypt the file so the file
+ * id can be obtained for the handle lock. Note that
+ * the checksum can fail if the database is being
+ * written (possible because the handle lock has
+			 * not been obtained yet).  So on a checksum failure,
+			 * retry until the checksum succeeds or the number of
+			 * retries is exhausted, and then return an error.
+ */
+ if (ret == 0 && (ret = __db_chk_meta(env, dbp,
+ (DBMETA *)mbuf, DB_CHK_META)) == DB_CHKSUM_FAIL) {
+ if ((t_ret = __ENV_LPUT(env, elock)) != 0) {
+ ret = t_ret;
+ goto err;
+ }
+ /*
+ * Retry unless the number of retries is
+ * exhausted.
+ */
+				if (retries >= DB_RETRY) {
+ __db_errx(env, DB_STR_A("0210",
+ "%s: metadata page checksum error", "%s"), real_name);
+ if (F_ISSET(dbp, DB_AM_RECOVER))
+ ret = ENOENT;
+ else
+ ret = EINVAL;
+ goto err;
+ }
+ if ((ret = __os_closehandle(env, fhp)) != 0)
+ goto err;
+ goto retry;
+ }
+ /* Get the file id for the handle lock. */
+ if (ret == 0)
+ memcpy(dbp->fileid,
+ ((DBMETA *)mbuf)->uid, DB_FILE_ID_LEN);
+ }
+
+ /* Case 5: Invalid file. */
+ if (ret != 0)
+ goto err;
+
+ /* Now, get our handle lock. */
+ if ((ret = __fop_lock_handle(env,
+ dbp, locker, lockmode, NULL, DB_LOCK_NOWAIT)) == 0) {
+ if ((ret = __ENV_LPUT(env, elock)) != 0)
+ goto err;
+ } else if (ret != DB_LOCK_NOTGRANTED ||
+ ((txn != NULL && (F_ISSET(txn, TXN_NOWAIT))) ||
+ F2_ISSET(dbp, DB2_AM_NOWAIT)))
+ goto err;
+ else {
+ PERFMON3(env,
+ race, fop_file_setup, (char *) name, ret, flags);
+ /*
+ * We were unable to acquire the handle lock without
+ * blocking. The fact that we are blocking might mean
+ * that someone else is trying to delete the file.
+ * Since some platforms cannot delete files while they
+ * are open (Windows), we are going to have to close
+ * the file. This would be a problem if we were doing
+ * FCNTL locking, because our closing the handle would
+ * release the FCNTL locks. Fortunately, if we are
+ * doing FCNTL locking, then we should never fail to
+ * acquire our handle lock, so we should never get here.
+ * We assert it here to make sure we aren't destroying
+ * any application level FCNTL semantics.
+ */
+ DB_ASSERT(env, !LF_ISSET(DB_FCNTL_LOCKING));
+ if (!F_ISSET(dbp, DB_AM_INMEM)) {
+ if ((ret = __os_closehandle(env, fhp)) != 0)
+ goto err;
+ fhp = NULL;
+ }
+ if ((ret = __fop_lock_handle(env,
+ dbp, locker, lockmode, &elock, 0)) != 0) {
+ if (F_ISSET(dbp, DB_AM_INMEM))
+ RESET_MPF(dbp, 0);
+ goto err;
+ }
+
+ /*
+ * If we had to wait, we might be waiting on a
+ * dummy file used in create/destroy of a database.
+ * To be sure we have the correct information we
+			 * To be sure we have the correct information, we
+ */
+ if (F_ISSET(dbp, DB_AM_INMEM)) {
+ RESET_MPF(dbp, 0);
+ MAKE_INMEM(dbp);
+ }
+ if ((ret =
+ __ENV_LPUT(env, dbp->handle_lock)) != 0) {
+ LOCK_INIT(dbp->handle_lock);
+ goto err;
+ }
+ goto retry;
+
+ }
+
+ /*
+		 * If we got here, then we have the handle lock; it is now
+		 * safe to check the rest of the meta-data, since the file
+ * will not be deleted out from under the handle.
+ */
+ if (F_ISSET(dbp, DB_AM_INMEM)) {
+ if ((ret = __fop_inmem_read_meta(
+ dbp, txn, name, flags, DB_SKIP_CHK)) != 0)
+ goto err;
+ } else {
+ if ((ret = __db_meta_setup(env, dbp, real_name,
+ (DBMETA *)mbuf, flags, DB_SKIP_CHK)) != 0)
+ goto err;
+ }
+
+ /*
+ * Check for a file in the midst of a rename. If we find that
+ * the file is in the midst of a rename, it must be the case
+ * that it is in our current transaction (else we would still
+ * be blocking), so we can continue along and create a new file
+ * with the same name. In that case, we have to close the file
+ * handle because we reuse it below. This is a case where
+ * a 'was_inval' above is OK.
+ */
+ if (F_ISSET(dbp, DB_AM_IN_RENAME)) {
+ was_inval = 0;
+ if (create_ok) {
+ if (F_ISSET(dbp, DB_AM_INMEM)) {
+ RESET_MPF(dbp, DB_MPOOL_DISCARD);
+ } else if ((ret =
+ __os_closehandle(env, fhp)) != 0)
+ goto err;
+ LF_SET(DB_CREATE);
+ goto create;
+ } else {
+ ret = ENOENT;
+ goto err;
+ }
+ }
+
+ /* If we get here, a was_inval is bad. */
+ if (was_inval) {
+ ret = EINVAL;
+ goto err;
+ }
+
+ /*
+ * Now, case 1: check for DB_EXCL, because the file that exists
+ * is not in the middle of a rename, so we have an error. This
+ * is a weird case, but we need to make sure that we don't
+ * continue to hold the handle lock, since technically, we
+ * should not have been allowed to open it.
+ */
+ if (LF_ISSET(DB_EXCL)) {
+ ret = __ENV_LPUT(env, dbp->handle_lock);
+ LOCK_INIT(dbp->handle_lock);
+ if (ret == 0)
+ ret = EEXIST;
+ goto err;
+ }
+ goto done;
+ }
+
+ /* File does not exist. */
+#ifdef HAVE_VXWORKS
+ /*
+	 * VxWorks can return file-system-specific error codes, rather
+	 * than ENOENT, if the file does not exist.
+ */
+ if (!create_ok)
+#else
+ if (!create_ok || ret != ENOENT)
+#endif
+ goto err;
+ LF_SET(DB_CREATE);
+ /*
+ * If we were trying to open a non-existent master database
+	 * read-only, clear that here.
+ */
+ LF_CLR(DB_RDONLY);
+ F_CLR(dbp, DB_AM_RDONLY);
+ ret = 0;
+
+ /*
+	 * We need to create the file, which means that we need to set up the file,
+ * the fileid and the locks. Then we need to call the appropriate
+ * routines to create meta-data pages. For in-memory files, we retain
+ * the environment lock, while for on-disk files, we drop the env lock
+ * and create into a temporary.
+ */
+ if (!F_ISSET(dbp, DB_AM_INMEM) &&
+ (ret = __ENV_LPUT(env, elock)) != 0)
+ goto err;
+
+create: if (txn != NULL && IS_REP_CLIENT(env) &&
+ !F_ISSET(dbp, DB_AM_NOT_DURABLE)) {
+ __db_errx(env, DB_STR("0003",
+ "Transactional create on replication client disallowed"));
+ ret = EINVAL;
+ goto err;
+ }
+
+ if (F_ISSET(dbp, DB_AM_INMEM)) {
+ if (LOGGING_ON(env) && (ret =
+ __env_dbreg_setup(dbp, txn, NULL, name, TXN_INVALID)) != 0)
+ return (ret);
+ if ((ret = __fop_inmem_create(dbp, name, txn, flags)) != 0)
+ return (ret);
+ } else {
+ if ((ret = __db_backup_name(env, name, txn, &tmpname)) != 0)
+ goto err;
+ if (TXN_ON(env) && txn != NULL &&
+ (ret = __txn_begin(env, NULL, txn, &stxn, 0)) != 0)
+ goto err;
+ if ((ret = __fop_create(env, stxn, &fhp,
+ tmpname, &dbp->dirname, aflags, mode, dflags)) != 0) {
+ /*
+ * If no transactions, there is a race on creating the
+ * backup file, as the backup file name is the same for
+ * all processes. Wait for the other process to finish
+ * with the name.
+ */
+ if (!TXN_ON(env) && ret == EEXIST) {
+ PERFMON3(env,
+ race, fop_file_setup, tmpname, ret, flags);
+ __os_free(env, tmpname);
+ tmpname = NULL;
+ __os_yield(env, 1, 0);
+ goto retry;
+ }
+ goto err;
+ }
+ tmp_created = 1;
+ }
+
+creat2: if (!F_ISSET(dbp, DB_AM_INMEM)) {
+ if ((ret = __db_appname(env,
+ aflags, tmpname, &dbp->dirname, &real_tmpname)) != 0)
+ goto err;
+
+ /* Set the pagesize if it isn't yet set. */
+ if (dbp->pgsize == 0 &&
+ (ret = __fop_set_pgsize(dbp, fhp, real_tmpname)) != 0)
+ goto errmsg;
+
+ /* Construct a file_id. */
+ if ((ret =
+ __os_fileid(env, real_tmpname, 1, dbp->fileid)) != 0)
+ goto errmsg;
+ }
+
+ if ((ret = __db_new_file(dbp, ip,
+ F_ISSET(dbp, DB_AM_INMEM) ? txn : stxn, fhp, tmpname)) != 0)
+ goto err;
+
+ /* Output the REOPEN record after we create. */
+ if (F_ISSET(dbp, DB_AM_INMEM) && dbp->log_filename != NULL && (ret =
+ __dbreg_log_id(dbp, txn, dbp->log_filename->id, 0)) != 0)
+ return (ret);
+
+ /*
+ * We need to close the handle here on platforms where remove and
+ * rename fail if a handle is open (including Windows).
+ */
+ CLOSE_HANDLE(dbp, fhp);
+
+ /*
+ * Now move the file into place unless we are creating in place (because
+ * we created a database in a file that started out 0-length). If
+ * this is an in-memory file, we may or may not hold the environment
+ * lock depending on how we got here.
+ */
+ if (!F_ISSET(dbp, DB_AM_COMPENSATE) &&
+ !F_ISSET(dbp, DB_AM_RECOVER) && !LOCK_ISSET(elock))
+ GET_ENVLOCK(env, locker, &elock);
+
+ if (F_ISSET(dbp, DB_AM_IN_RENAME)) {
+ F_CLR(dbp, DB_AM_IN_RENAME);
+ __txn_remrem(env, txn, real_name);
+ } else if (name == tmpname) {
+ /* We created it in place. */
+ } else if (!F_ISSET(dbp, DB_AM_INMEM) &&
+ __os_exists(env, real_name, NULL) == 0) {
+ /*
+ * Someone managed to create the file; remove our temp
+ * and try to open the file that now exists.
+ */
+ (void)__fop_remove(env, NULL,
+ dbp->fileid, tmpname, &dbp->dirname, aflags, dflags);
+ (void)__ENV_LPUT(env, dbp->handle_lock);
+ LOCK_INIT(dbp->handle_lock);
+
+ if (stxn != NULL) {
+ ret = __txn_abort(stxn);
+ stxn = NULL;
+ }
+ if (ret != 0)
+ goto err;
+ goto reopen;
+ }
+
+ if (name != NULL && (ret = __fop_lock_handle(env,
+ dbp, locker, DB_LOCK_WRITE, NULL, NOWAIT_FLAG(txn)|
+ (F2_ISSET(dbp,DB2_AM_NOWAIT) ? DB_LOCK_NOWAIT : 0))) != 0)
+ goto err;
+ if (tmpname != NULL &&
+ tmpname != name && (ret = __fop_rename(env, stxn, tmpname,
+ name, &dbp->dirname, dbp->fileid, aflags, 1, dflags)) != 0)
+ goto err;
+ if ((ret = __ENV_LPUT(env, elock)) != 0)
+ goto err;
+
+ if (stxn != NULL) {
+ *retidp = stxn->txnid;
+ ret = __txn_commit(stxn, 0);
+ stxn = NULL;
+ } else
+ *retidp = TXN_INVALID;
+
+ if (ret != 0)
+ goto err;
+
+ F_SET(dbp, DB_AM_CREATED);
+
+ if (0) {
+errmsg: __db_err(env, ret, "%s", name);
+
+err: CLOSE_HANDLE(dbp, fhp);
+ if (stxn != NULL)
+ (void)__txn_abort(stxn);
+ if (tmp_created && txn == NULL)
+ (void)__fop_remove(env,
+ NULL, NULL, tmpname, NULL, aflags, dflags);
+ if (txn == NULL)
+ (void)__ENV_LPUT(env, dbp->handle_lock);
+ (void)__ENV_LPUT(env, elock);
+ if (created_locker) {
+ (void)__lock_id_free(env, dbp->locker);
+ dbp->locker = NULL;
+ }
+ }
+
+done: /*
+	 * There are cases where real_name and tmpname refer to the
+	 * exact same string, so we need to make sure that we do not
+	 * free it twice.
+ */
+ if (!truncating && tmpname != NULL && tmpname != name)
+ __os_free(env, tmpname);
+ if (real_name != name && real_name != NULL)
+ __os_free(env, real_name);
+ if (real_tmpname != NULL)
+ __os_free(env, real_tmpname);
+ CLOSE_HANDLE(dbp, fhp);
+
+ return (ret);
+}
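+
+/*
+ * Illustrative sketch (not part of the original source): how the cases
+ * above surface through the public DB->open interface.  The environment
+ * handle and file name here are hypothetical.
+ *
+ *	DB *dbp;
+ *	int ret;
+ *
+ *	if ((ret = db_create(&dbp, dbenv, 0)) != 0)
+ *		return (ret);
+ *	ret = dbp->open(dbp, NULL,
+ *	    "example.db", NULL, DB_BTREE, DB_CREATE | DB_TRUNCATE, 0);
+ *
+ * As the comment above __fop_file_setup notes, DB_TRUNCATE is disallowed
+ * when locking or transactions are enabled; with neither subsystem
+ * configured, the open truncates the file in place (case 2 above).
+ */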
+
+/*
+ * __fop_set_pgsize --
+ * Set the page size based on file information.
+ */
+static int
+__fop_set_pgsize(dbp, fhp, name)
+ DB *dbp;
+ DB_FH *fhp;
+ const char *name;
+{
+ ENV *env;
+ u_int32_t iopsize;
+ int ret;
+
+ env = dbp->env;
+
+ /*
+	 * Use the filesystem's optimum I/O size as the pagesize if a pagesize
+	 * was not specified.  Some filesystems have 64K as their optimum I/O size,
+ * but as that results in fairly large default caches, we limit the
+ * default pagesize to 16K.
+ */
+ if ((ret = __os_ioinfo(env, name, fhp, NULL, NULL, &iopsize)) != 0) {
+ __db_err(env, ret, "%s", name);
+ return (ret);
+ }
+ if (iopsize < 512)
+ iopsize = 512;
+ if (iopsize > 16 * 1024)
+ iopsize = 16 * 1024;
+
+ /*
+ * Sheer paranoia, but we don't want anything that's not a power-of-2
+ * (we rely on that for alignment of various types on the pages), and
+ * we want a multiple of the sector size as well. If the value
+ * we got out of __os_ioinfo looks bad, use a default instead.
+ */
+ if (!IS_VALID_PAGESIZE(iopsize))
+ iopsize = DB_DEF_IOSIZE;
+
+ dbp->pgsize = iopsize;
+ F_SET(dbp, DB_AM_PGDEF);
+
+ return (0);
+}
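+
+/*
+ * Worked example of the clamping above (illustrative only): a reported
+ * optimum I/O size of 4K is used as-is; 64K is clamped down to 16K;
+ * anything under 512 bytes is raised to 512; and a value that is not a
+ * valid pagesize (e.g., not a power of two) falls back to DB_DEF_IOSIZE.
+ */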
+
+/*
+ * __fop_subdb_setup --
+ *
+ * Subdb setup is significantly simpler than file setup. In terms of
+ * locking, for the duration of the operation/transaction, the locks on
+ * the meta-data page will suffice to protect us from simultaneous operations
+ * on the sub-database. Before we complete the operation though, we'll get a
+ * handle lock on the subdatabase so that no one else can try to remove it
+ * while we've got it open.  We use an object that looks like the meta-data
+ * page lock with a different type (DB_HANDLE_LOCK) for the long-term
+ * handle locks.
+ *
+ * PUBLIC: int __fop_subdb_setup __P((DB *, DB_THREAD_INFO *, DB_TXN *,
+ * PUBLIC: const char *, const char *, int, u_int32_t));
+ */
+int
+__fop_subdb_setup(dbp, ip, txn, mname, name, mode, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *mname, *name;
+ int mode;
+ u_int32_t flags;
+{
+ DB *mdbp;
+ ENV *env;
+ db_lockmode_t lkmode;
+ u_int32_t mflags;
+ int ret, t_ret;
+
+ mdbp = NULL;
+ env = dbp->env;
+
+ mflags = flags | DB_RDONLY;
+retry: if ((ret = __db_master_open(dbp,
+ ip, txn, mname, mflags, mode, &mdbp)) != 0)
+ return (ret);
+ /*
+ * If we created this file, then we need to set the DISCARD flag so
+ * that if we fail in the middle of this routine, we discard from the
+ * mpool any pages that we just created.
+ */
+ if (F_ISSET(mdbp, DB_AM_CREATED))
+ F_SET(mdbp, DB_AM_DISCARD);
+
+ /*
+ * We are going to close this instance of the master, so we can
+ * steal its handle instead of reopening a handle on the database.
+ */
+ if (LF_ISSET(DB_FCNTL_LOCKING)) {
+ dbp->saved_open_fhp = mdbp->saved_open_fhp;
+ mdbp->saved_open_fhp = NULL;
+ }
+
+ /* Copy the pagesize and set the sub-database flag. */
+ dbp->pgsize = mdbp->pgsize;
+ F_SET(dbp, DB_AM_SUBDB);
+
+ if (name != NULL && (ret = __db_master_update(mdbp, dbp,
+ ip, txn, name, dbp->type, MU_OPEN, NULL, flags)) != 0) {
+ if (ret == EBADF && F_ISSET(mdbp, DB_AM_RDONLY)) {
+ /* We need to reopen the master R/W to do the create. */
+ if ((ret = __db_close(mdbp, txn, 0)) != 0)
+ goto err;
+ FLD_CLR(mflags, DB_RDONLY);
+ goto retry;
+ }
+ goto err;
+ }
+
+ /*
+ * Hijack the master's locker ID as well, so that our locks don't
+ * conflict with the master's. Since we're closing the master,
+ * that locker would just have been freed anyway. Once we've gotten
+ * the locker id, we need to acquire the handle lock for this
+ * subdatabase.
+ */
+ dbp->locker = mdbp->locker;
+ mdbp->locker = NULL;
+
+ DB_TEST_RECOVERY(dbp, DB_TEST_POSTLOG, ret, mname);
+
+ /*
+ * We copy our fileid from our master so that we all open
+ * the same file in mpool. We'll use the meta-pgno to lock
+ * so that we end up with different handle locks.
+ */
+
+ memcpy(dbp->fileid, mdbp->fileid, DB_FILE_ID_LEN);
+ lkmode = F_ISSET(dbp, DB_AM_CREATED) || LF_ISSET(DB_WRITEOPEN) ||
+ F2_ISSET(dbp, DB2_AM_EXCL) ? DB_LOCK_WRITE : DB_LOCK_READ;
+ if ((ret = __fop_lock_handle(env, dbp,
+ txn == NULL ? dbp->locker : txn->locker, lkmode, NULL,
+ NOWAIT_FLAG(txn) |
+ (F2_ISSET(dbp, DB2_AM_NOWAIT) ? DB_LOCK_NOWAIT : 0))) != 0)
+ goto err;
+
+ if ((ret = __db_init_subdb(mdbp, dbp, name, ip, txn)) != 0) {
+ /*
+ * If there was no transaction and we created this database,
+ * then we need to undo the update of the master database.
+ */
+ if (F_ISSET(dbp, DB_AM_CREATED) && txn == NULL)
+ (void)__db_master_update(mdbp, dbp,
+ ip, txn, name, dbp->type, MU_REMOVE, NULL, 0);
+ F_CLR(dbp, DB_AM_CREATED);
+ goto err;
+ }
+
+ /*
+ * XXX
+ * This should have been done at the top of this routine. The problem
+ * is that __db_init_subdb() uses "standard" routines to process the
+ * meta-data page and set information in the DB handle based on it.
+ * Those routines have to deal with swapped pages and will normally set
+ * the DB_AM_SWAP flag. However, we use the master's metadata page and
+ * that has already been swapped, so they get the is-swapped test wrong.
+ */
+ F_CLR(dbp, DB_AM_SWAP);
+ F_SET(dbp, F_ISSET(mdbp, DB_AM_SWAP));
+
+ /*
+ * In the file create case, these happen in separate places so we have
+ * two different tests. They end up in the same place for subdbs, but
+ * for compatibility with file testing, we put them both here anyway.
+ */
+ DB_TEST_RECOVERY(dbp, DB_TEST_POSTLOGMETA, ret, mname);
+ DB_TEST_RECOVERY(dbp, DB_TEST_POSTSYNC, ret, mname);
+
+ /*
+ * File exists and we have the appropriate locks; we should now
+ * process a normal open.
+ */
+ if (F_ISSET(mdbp, DB_AM_CREATED)) {
+ F_SET(dbp, DB_AM_CREATED_MSTR);
+ F_CLR(mdbp, DB_AM_DISCARD);
+ }
+
+ if (0) {
+err:
+DB_TEST_RECOVERY_LABEL
+ if (txn == NULL)
+ (void)__ENV_LPUT(env, dbp->handle_lock);
+ }
+
+ /*
+ * The master's handle lock is under the control of the
+ * subdb (it acquired the master's locker). We want to
+ * keep the master's handle lock so that no one can remove
+ * the file while the subdb is open. If we register the
+ * trade event and then invalidate the copy of the lock
+ * in the master's handle, that will accomplish this. However,
+ * before we register this event, we'd better remove any
+ * events that we've already registered for the master.
+ */
+ if (!F_ISSET(dbp, DB_AM_RECOVER) && IS_REAL_TXN(txn)) {
+ /* Unregister old master events. */
+ __txn_remlock(env,
+ txn, &mdbp->handle_lock, DB_LOCK_INVALIDID);
+
+ /* Now register the new event. */
+ if ((t_ret = __txn_lockevent(env, txn, dbp,
+ &mdbp->handle_lock, dbp->locker == NULL ?
+ mdbp->locker : dbp->locker)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ LOCK_INIT(mdbp->handle_lock);
+
+ /*
+ * If the master was created, we need to sync so that the metadata
+ * page is correct on disk for recovery, since it isn't read through
+ * mpool. If we're opening a subdb in an existing file, we can skip
+ * the sync.
+ */
+ if ((t_ret = __db_close(mdbp, txn,
+ F_ISSET(dbp, DB_AM_CREATED_MSTR) ? 0 : DB_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
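+
+/*
+ * Illustrative sketch (not part of the original source): the kind of
+ * public sub-database open that drives __fop_subdb_setup.  The file and
+ * database names are hypothetical.
+ *
+ *	DB *dbp;
+ *	int ret;
+ *
+ *	if ((ret = db_create(&dbp, dbenv, 0)) != 0)
+ *		return (ret);
+ *	ret = dbp->open(dbp,
+ *	    txn, "master.db", "subdb", DB_BTREE, DB_CREATE, 0);
+ */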
+
+/*
+ * __fop_remove_setup --
+ * Open handle appropriately and lock for removal of a database file.
+ *
+ * PUBLIC: int __fop_remove_setup __P((DB *,
+ * PUBLIC: DB_TXN *, const char *, u_int32_t));
+ */
+int
+__fop_remove_setup(dbp, txn, name, flags)
+ DB *dbp;
+ DB_TXN *txn;
+ const char *name;
+ u_int32_t flags;
+{
+ DB_FH *fhp;
+ DB_LOCK elock;
+ ENV *env;
+ u_int8_t mbuf[DBMETASIZE];
+ int ret;
+
+ COMPQUIET(flags, 0);
+
+ env = dbp->env;
+
+ LOCK_INIT(elock);
+ fhp = NULL;
+ ret = 0;
+
+ /* Create locker if necessary. */
+retry: if (LOCKING_ON(env)) {
+ if (IS_REAL_TXN(txn))
+ dbp->locker = txn->locker;
+ else if (dbp->locker == DB_LOCK_INVALIDID) {
+ if ((ret = __lock_id(env, NULL, &dbp->locker)) != 0)
+ goto err;
+ if (txn != NULL && F_ISSET(txn, TXN_INFAMILY) &&
+ (ret = __lock_addfamilylocker(env,
+ txn->txnid, dbp->locker->id, 1)) != 0)
+ goto err;
+ }
+ }
+
+ /*
+ * We are about to open a file handle and then possibly close it.
+ * We cannot close handles if we are doing FCNTL locking. However,
+ * there is no way to pass the FCNTL flag into this routine via the
+ * user API. The only way we can get in here and be doing FCNTL
+ * locking is if we are trying to clean up an open that was called
+	 * with FCNTL locking.  In that case, the saved_open_fhp should already
+	 * be set.  So, we use that field to tell us whether we must avoid
+	 * closing the handle.
+ */
+ fhp = dbp->saved_open_fhp;
+ DB_ASSERT(env, LF_ISSET(DB_FCNTL_LOCKING) || fhp == NULL);
+
+ /*
+ * Lock environment to protect file open. That will enable us to
+ * read the meta-data page and get the fileid so that we can lock
+ * the handle.
+ */
+ GET_ENVLOCK(env, dbp->locker, &elock);
+
+ /* Open database. */
+ if (F_ISSET(dbp, DB_AM_INMEM)) {
+ if ((ret = __env_mpool(dbp, name, flags)) == 0)
+ ret = __os_strdup(env, name, &dbp->dname);
+ } else if (fhp == NULL)
+ ret = __os_open(env, name, 0, DB_OSO_RDONLY, 0, &fhp);
+ if (ret != 0)
+ goto err;
+
+ /* Get meta-data */
+ if (F_ISSET(dbp, DB_AM_INMEM))
+ ret = __fop_inmem_read_meta(
+ dbp, txn, name, flags, DB_CHK_META);
+ else if ((ret = __fop_read_meta(env,
+ name, mbuf, sizeof(mbuf), fhp, 0, NULL)) == 0)
+ ret = __db_meta_setup(env, dbp,
+ name, (DBMETA *)mbuf, flags, DB_CHK_META | DB_CHK_NOLSN);
+ if (ret != 0)
+ goto err;
+
+ /*
+ * Now, get the handle lock. We first try with NOWAIT, because if
+ * we have to wait, we're going to have to close the file and reopen
+ * it, so that if there is someone else removing it, our open doesn't
+ * prevent that.
+ */
+ if ((ret = __fop_lock_handle(env,
+ dbp, dbp->locker, DB_LOCK_WRITE, NULL, DB_LOCK_NOWAIT)) != 0) {
+ /*
+ * Close the file, block on the lock, clean up the dbp, and
+ * then start all over again.
+ */
+ if (!F_ISSET(dbp, DB_AM_INMEM) && !LF_ISSET(DB_FCNTL_LOCKING)) {
+ (void)__os_closehandle(env, fhp);
+ fhp = NULL;
+ }
+ if (ret != DB_LOCK_NOTGRANTED ||
+ (txn != NULL && F_ISSET(txn, TXN_NOWAIT)))
+ goto err;
+ else if ((ret = __fop_lock_handle(env,
+ dbp, dbp->locker, DB_LOCK_WRITE, &elock, 0)) != 0)
+ goto err;
+
+ if (F_ISSET(dbp, DB_AM_INMEM)) {
+ (void)__lock_put(env, &dbp->handle_lock);
+ (void)__db_refresh(dbp, txn, DB_NOSYNC, NULL, 1);
+ } else {
+ if (txn != NULL)
+ dbp->locker = NULL;
+ (void)__db_refresh(dbp, txn, DB_NOSYNC, NULL, 0);
+ }
+ goto retry;
+ } else if ((ret = __ENV_LPUT(env, elock)) != 0)
+ goto err;
+ else if (F_ISSET(dbp, DB_AM_IN_RENAME))
+ ret = ENOENT;
+
+ if (0) {
+err: (void)__ENV_LPUT(env, elock);
+ }
+ if (fhp != NULL && !LF_ISSET(DB_FCNTL_LOCKING))
+ (void)__os_closehandle(env, fhp);
+ /*
+ * If this is a real file and we are going to proceed with the removal,
+ * then we need to make sure that we don't leave any pages around in the
+	 * mpool, since the file is closed and will be reopened before
+ * access. However, this might be an in-memory file, in which case
+ * we will handle the discard from the mpool later as it's the "real"
+ * removal of the database.
+ */
+ if (ret == 0 && !F_ISSET(dbp, DB_AM_INMEM))
+ F_SET(dbp, DB_AM_DISCARD);
+ return (ret);
+}
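+
+/*
+ * Illustrative sketch (not part of the original source): the public
+ * removal call that runs above this routine.  The names are
+ * hypothetical.
+ *
+ *	if ((ret = dbenv->dbremove(dbenv, txn, "example.db", NULL, 0)) != 0)
+ *		goto err;
+ */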
+
+/*
+ * __fop_read_meta --
+ * Read the meta-data page from a file and return it in buf.
+ *
+ * PUBLIC: int __fop_read_meta __P((ENV *, const char *,
+ * PUBLIC: u_int8_t *, size_t, DB_FH *, int, size_t *));
+ */
+int
+__fop_read_meta(env, name, buf, size, fhp, errok, nbytesp)
+ ENV *env;
+ const char *name;
+ u_int8_t *buf;
+ size_t size;
+ DB_FH *fhp;
+ int errok;
+ size_t *nbytesp;
+{
+ size_t nr;
+ int ret;
+
+ /*
+ * Our caller wants to know the number of bytes read, even if we
+ * return an error.
+ */
+ if (nbytesp != NULL)
+ *nbytesp = 0;
+
+ nr = 0;
+ ret = __os_read(env, fhp, buf, size, &nr);
+ if (nbytesp != NULL)
+ *nbytesp = nr;
+
+ if (ret != 0) {
+ if (!errok)
+ __db_err(env, ret, "%s", name);
+ goto err;
+ }
+
+ if (nr != size) {
+ if (!errok)
+ __db_errx(env, DB_STR_A("0004",
+ "fop_read_meta: %s: unexpected file type or format",
+ "%s"), name);
+ ret = EINVAL;
+ }
+
+err:
+ return (ret);
+}
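+
+/*
+ * A minimal caller sketch (illustrative only), mirroring the recovery
+ * code earlier in this diff: errok suppresses the error message when a
+ * missing or short file is an expected outcome, and nbytesp lets the
+ * caller distinguish an empty file from a damaged meta-data page.
+ *
+ *	size_t len;
+ *
+ *	len = 0;
+ *	if (__fop_read_meta(env,
+ *	    real_name, mbuf, DBMETASIZE, fhp, 1, &len) != 0) {
+ *		if (len != 0)
+ *			goto out;	(partial meta page: hard failure)
+ *		cstat = TXN_EXPECTED;	(no file at all: may be benign)
+ *	}
+ */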
+
+/*
+ * __fop_dummy --
+ * This implements the creation and name swapping of dummy files that
+ * we use for remove and rename (remove is simply a rename with a delayed
+ * remove).
+ *
+ * PUBLIC: int __fop_dummy __P((DB *,
+ * PUBLIC: DB_TXN *, const char *, const char *));
+ */
+int
+__fop_dummy(dbp, txn, old, new)
+ DB *dbp;
+ DB_TXN *txn;
+ const char *old, *new;
+{
+ DB *tmpdbp;
+ DB_TXN *stxn;
+ ENV *env;
+ char *back;
+ int ret, t_ret;
+ u_int8_t mbuf[DBMETASIZE];
+
+ env = dbp->env;
+ back = NULL;
+ stxn = NULL;
+ tmpdbp = NULL;
+
+ DB_ASSERT(env, txn != NULL);
+
+ /*
+	 * Begin a subtransaction to encapsulate the rename.  Note that we
+ * expect the inmem_swap calls to complete the sub-transaction,
+ * aborting on error and committing on success.
+ */
+ if (TXN_ON(env) &&
+ (ret = __txn_begin(env, NULL, txn, &stxn, 0)) != 0)
+ goto err;
+
+	/* We need to create a dummy file as a placeholder. */
+ if ((ret = __db_backup_name(env, new, stxn, &back)) != 0)
+ goto err;
+ /* Create a dummy dbp handle. */
+ if ((ret = __db_create_internal(&tmpdbp, env, 0)) != 0)
+ goto err;
+ if (F_ISSET(dbp, DB_AM_NOT_DURABLE) &&
+ (ret = __db_set_flags(tmpdbp, DB_TXN_NOT_DURABLE)) != 0)
+ goto err;
+ memset(mbuf, 0, sizeof(mbuf));
+ ret = F_ISSET(dbp, DB_AM_INMEM) ?
+ __fop_inmem_dummy(tmpdbp, stxn, back, mbuf) :
+ __fop_ondisk_dummy(tmpdbp, stxn, back, mbuf);
+
+ if (ret != 0)
+ goto err;
+
+ ret = F_ISSET(dbp, DB_AM_INMEM) ?
+ __fop_inmem_swap(dbp, tmpdbp, stxn, old, new, back, txn->locker) :
+ __fop_ondisk_swap(dbp, tmpdbp, stxn, old, new, back, txn->locker);
+ stxn = NULL;
+ if (ret != 0)
+ goto err;
+
+err: if (stxn != NULL)
+ (void)__txn_abort(stxn);
+ if (tmpdbp != NULL &&
+ (t_ret = __db_close(tmpdbp, NULL, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ if (back != NULL)
+ __os_free(env, back);
+ return (ret);
+}
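+
+/*
+ * A sketch of the resulting name dance (derived from the swap routines
+ * below), for a rename of OLD to NEW with backup name BACK:
+ *
+ *	1. create a dummy placeholder at BACK (magic DB_RENAMEMAGIC);
+ *	2. rename OLD to NEW, moving the real database into place;
+ *	3. rename BACK to OLD, so the dummy now holds the old name;
+ *	4. schedule a delayed remove of OLD at parent-txn commit.
+ *
+ * If the enclosing transaction aborts, the logged renames are undone
+ * and the dummy placeholder is discarded.
+ */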
+
+/*
+ * __fop_dbrename --
+ * Do the appropriate file locking and file system operations
+ * to effect a dbrename in the absence of transactions (__fop_dummy
+ * and the subsequent calls in __db_rename do the work for the
+ * transactional case).
+ *
+ * PUBLIC: int __fop_dbrename __P((DB *, const char *, const char *));
+ */
+int
+__fop_dbrename(dbp, old, new)
+ DB *dbp;
+ const char *old, *new;
+{
+ DB_LOCK elock;
+ ENV *env;
+ char *real_new, *real_old;
+ int ret, t_ret;
+
+ env = dbp->env;
+ real_new = NULL;
+ real_old = NULL;
+ LOCK_INIT(elock);
+
+ if (F_ISSET(dbp, DB_AM_INMEM)) {
+ real_new = (char *)new;
+ real_old = (char *)old;
+ } else {
+ /* Get full names. */
+ if ((ret = __db_appname(env,
+ DB_APP_DATA, old, &dbp->dirname, &real_old)) != 0)
+ goto err;
+
+ if ((ret = __db_appname(env,
+ DB_APP_DATA, new, &dbp->dirname, &real_new)) != 0)
+ goto err;
+ }
+
+ /*
+ * It is an error to rename a file over one that already exists,
+ * as that wouldn't be transaction-safe. We check explicitly
+	 * for on-disk files, but it's done by __memp_nameop for in-memory ones.
+ */
+ GET_ENVLOCK(env, dbp->locker, &elock);
+ ret = F_ISSET(dbp, DB_AM_INMEM) ? ENOENT :
+ __os_exists(env, real_new, NULL);
+
+ if (ret == 0) {
+ ret = EEXIST;
+ __db_errx(env, DB_STR_A("0005",
+ "rename: file %s exists", "%s"), real_new);
+ goto err;
+ }
+
+ ret = __memp_nameop(env,
+ dbp->fileid, new, real_old, real_new, F_ISSET(dbp, DB_AM_INMEM));
+
+err: if ((t_ret = __ENV_LPUT(env, elock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (!F_ISSET(dbp, DB_AM_INMEM) && real_old != NULL)
+ __os_free(env, real_old);
+ if (!F_ISSET(dbp, DB_AM_INMEM) && real_new != NULL)
+ __os_free(env, real_new);
+ return (ret);
+}
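+
+/*
+ * Illustrative sketch (not part of the original source): the
+ * non-transactional public rename that reaches this routine.  The names
+ * are hypothetical.
+ *
+ *	if ((ret =
+ *	    dbenv->dbrename(dbenv, NULL, "old.db", NULL, "new.db", 0)) != 0)
+ *		goto err;
+ */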
+
+static int
+__fop_inmem_create(dbp, name, txn, flags)
+ DB *dbp;
+ const char *name;
+ DB_TXN *txn;
+ u_int32_t flags;
+{
+ DBT fid_dbt, name_dbt;
+ DB_LSN lsn;
+ ENV *env;
+ int ret;
+ int32_t lfid;
+ u_int32_t dflags, *p32;
+
+ env = dbp->env;
+ dflags = F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0;
+
+ MAKE_INMEM(dbp);
+
+ /* Set the pagesize if it isn't yet set. */
+ if (dbp->pgsize == 0)
+ dbp->pgsize = DB_DEF_IOSIZE;
+
+ /*
+ * Construct a file_id.
+ *
+ * If this file has no name, then we only need a fileid for locking.
+ * If this file has a name, we need the fileid both for locking and
+ * matching in the memory pool. So, with unnamed in-memory databases,
+ * use a lock_id. For named in-memory files, we need to find a value
+ * that we can use to uniquely identify a name/fid pair. We use a
+	 * combination of a unique id (__os_unique_id) and a prefix of the
+	 * original name.
+ */
+ if (name == NULL) {
+ if (LOCKING_ON(env) && (ret =
+ __lock_id(env, (u_int32_t *)dbp->fileid, NULL)) != 0)
+ goto err;
+ } else {
+ p32 = (u_int32_t *)(&dbp->fileid[0]);
+ __os_unique_id(env, p32);
+ p32++;
+ (void)strncpy(
+ (char *)p32, name, DB_FILE_ID_LEN - sizeof(u_int32_t));
+ dbp->preserve_fid = 1;
+
+ if (DBENV_LOGGING(env) &&
+#if !defined(DEBUG_WOP) && !defined(DIAGNOSTIC)
+ txn != NULL &&
+#endif
+ dbp->log_filename != NULL)
+ memcpy(dbp->log_filename->ufid,
+ dbp->fileid, DB_FILE_ID_LEN);
+ }
+
+ /* Now, set the fileid. */
+ if ((ret = __memp_set_fileid(dbp->mpf, dbp->fileid)) != 0)
+ goto err;
+
+ if ((ret = __env_mpool(dbp, name, flags)) != 0)
+ goto err;
+
+ if (DBENV_LOGGING(env) &&
+#if !defined(DEBUG_WOP)
+ txn != NULL &&
+#endif
+ name != NULL) {
+ DB_INIT_DBT(name_dbt, name, strlen(name) + 1);
+ memset(&fid_dbt, 0, sizeof(fid_dbt));
+ fid_dbt.data = dbp->fileid;
+ fid_dbt.size = DB_FILE_ID_LEN;
+ lfid = dbp->log_filename == NULL ?
+ DB_LOGFILEID_INVALID : dbp->log_filename->id;
+ if ((ret = __crdel_inmem_create_log(env, txn,
+ &lsn, dflags, lfid, &name_dbt, &fid_dbt, dbp->pgsize)) != 0)
+ goto err;
+ }
+
+ F_SET(dbp, DB_AM_CREATED);
+
+err:
+ return (ret);
+}
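+
+/*
+ * Layout of the fileid constructed above for named in-memory databases
+ * (a descriptive sketch of the code above, assuming DB_FILE_ID_LEN is
+ * 20 bytes as in this source):
+ *
+ *	bytes 0-3  : result of __os_unique_id()
+ *	bytes 4-19 : leading bytes of the database name, NUL-padded by
+ *		     strncpy when the name is shorter than the field
+ */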
+
+static int
+__fop_inmem_read_meta(dbp, txn, name, flags, chkflags)
+ DB *dbp;
+ DB_TXN *txn;
+ const char *name;
+ u_int32_t flags;
+ u_int32_t chkflags;
+{
+ DBMETA *metap;
+ DB_THREAD_INFO *ip;
+ db_pgno_t pgno;
+ int ret, t_ret;
+
+ if (txn == NULL)
+ ENV_GET_THREAD_INFO(dbp->env, ip);
+ else
+ ip = txn->thread_info;
+
+ pgno = PGNO_BASE_MD;
+ if ((ret = __memp_fget(dbp->mpf, &pgno, ip, txn, 0, &metap)) != 0)
+ return (ret);
+ if (FLD_ISSET(chkflags, DB_CHK_ONLY)) {
+ if ((ret = __db_chk_meta(dbp->env, dbp, metap, chkflags)) == 0)
+ memcpy(dbp->fileid,
+ ((DBMETA *)metap)->uid, DB_FILE_ID_LEN);
+ } else
+ ret = __db_meta_setup(
+ dbp->env, dbp, name, metap, flags, chkflags);
+
+ if ((t_ret =
+ __memp_fput(dbp->mpf, ip, metap, dbp->priority)) && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+static int
+__fop_ondisk_dummy(dbp, txn, name, mbuf)
+ DB *dbp;
+ DB_TXN *txn;
+ const char *name;
+ u_int8_t *mbuf;
+{
+ ENV *env;
+ int ret;
+ char *realname;
+ u_int32_t dflags;
+
+ realname = NULL;
+ env = dbp->env;
+ dflags = F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0;
+
+ if ((ret = __db_appname(env,
+ DB_APP_DATA, name, &dbp->dirname, &realname)) != 0)
+ goto err;
+
+ if ((ret = __fop_create(env,
+ txn, NULL, name, &dbp->dirname, DB_APP_DATA, 0, dflags)) != 0)
+ goto err;
+
+ if ((ret =
+ __os_fileid(env, realname, 1, ((DBMETA *)mbuf)->uid)) != 0)
+ goto err;
+
+ ((DBMETA *)mbuf)->magic = DB_RENAMEMAGIC;
+ if ((ret = __fop_write(env, txn, name, dbp->dirname,
+ DB_APP_DATA, NULL, 0, 0, 0, mbuf, DBMETASIZE, 1, dflags)) != 0)
+ goto err;
+
+ memcpy(dbp->fileid, ((DBMETA *)mbuf)->uid, DB_FILE_ID_LEN);
+
+err: if (realname != NULL)
+ __os_free(env, realname);
+
+ return (ret);
+}
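+
+/*
+ * Descriptive note (based on the surrounding code): the DB_RENAMEMAGIC
+ * written into the dummy's meta-data page is what later lets an open
+ * recognize the file as a rename placeholder and set DB_AM_IN_RENAME on
+ * the handle, which __fop_file_setup and the swap routines test for
+ * explicitly.
+ */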
+
+static int
+__fop_inmem_dummy(dbp, txn, name, mbuf)
+ DB *dbp;
+ DB_TXN *txn;
+ const char *name;
+ u_int8_t *mbuf;
+{
+ DBMETA *metap;
+ DB_THREAD_INFO *ip;
+ db_pgno_t pgno;
+ int ret, t_ret;
+
+ if ((ret = __fop_inmem_create(dbp, name, txn, DB_CREATE)) != 0)
+ return (ret);
+ if (txn == NULL)
+ ENV_GET_THREAD_INFO(dbp->env, ip);
+ else
+ ip = txn->thread_info;
+
+ pgno = PGNO_BASE_MD;
+ if ((ret = __memp_fget(dbp->mpf, &pgno, ip, txn,
+ DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &metap)) != 0)
+ return (ret);
+	/* Check whether the file already existed. */
+ if (metap->magic != 0)
+ ret = EEXIST;
+ else
+ metap->magic = DB_RENAMEMAGIC;
+
+ /* Copy the fileid onto the meta-data page. */
+ memcpy(metap->uid, dbp->fileid, DB_FILE_ID_LEN);
+
+ if ((t_ret = __memp_fput(dbp->mpf, ip, metap,
+ ret == 0 ? dbp->priority : DB_PRIORITY_VERY_LOW)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (ret != 0)
+ goto err;
+
+ ((DBMETA *)mbuf)->magic = DB_RENAMEMAGIC;
+
+err: return (ret);
+}
+
+static int
+__fop_ondisk_swap(dbp, tmpdbp, txn, old, new, back, locker)
+ DB *dbp, *tmpdbp;
+ DB_TXN *txn;
+ const char *old, *new, *back;
+ DB_LOCKER *locker;
+{
+ DBT fiddbt, namedbt, tmpdbt;
+ DB_FH *fhp;
+ DB_LOCK elock;
+ DB_LSN lsn;
+ DB_TXN *parent;
+ ENV *env;
+ u_int8_t mbuf[DBMETASIZE];
+ u_int32_t child_txnid, dflags;
+ int ret, t_ret;
+ char *realold, *realnew;
+
+ env = dbp->env;
+ DB_ASSERT(env, txn != NULL);
+ DB_ASSERT(env, old != NULL);
+
+ realold = realnew = NULL;
+ LOCK_INIT(elock);
+ fhp = NULL;
+ dflags = F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0;
+
+ if ((ret = __db_appname(env,
+ DB_APP_DATA, new, &dbp->dirname, &realnew)) != 0)
+ goto err;
+
+ /* Now, lock the name space while we initialize this file. */
+retry: GET_ENVLOCK(env, locker, &elock);
+ if (__os_exists(env, realnew, NULL) == 0) {
+ /*
+ * It is possible that the only reason this file exists is
+ * because we've done a previous rename of it and we have
+ * left a placeholder here. We need to check for that case
+ * and allow this rename to succeed if that's the case.
+ */
+ if ((ret = __os_open(env, realnew, 0, 0, 0, &fhp)) != 0)
+ goto err;
+ if ((ret = __fop_read_meta(env,
+ realnew, mbuf, sizeof(mbuf), fhp, 0, NULL)) != 0 ||
+ (ret = __db_meta_setup(env,
+ tmpdbp, realnew, (DBMETA *)mbuf, 0, DB_CHK_META)) != 0) {
+ ret = EEXIST;
+ goto err;
+ }
+ ret = __os_closehandle(env, fhp);
+ fhp = NULL;
+ if (ret != 0)
+ goto err;
+
+ /*
+ * Now, try to acquire the handle lock. If the handle is locked
+	 * by our current transaction, then we'll get it and life is
+ * good.
+ *
+	 * Alternatively, if it's not locked at all, we'll get the lock,
+	 * but we will then realize the file exists and consider this an error.
+ *
+ * However, if it's held by another transaction, then there
+ * could be two different scenarios: 1) the file is in the
+ * midst of being created or deleted and when that transaction
+ * is over, we might be able to proceed. 2) the file is open
+ * and exists and we should report an error. In order to
+ * distinguish these two cases, we do the following. First, we
+ * try to acquire a READLOCK. If the handle is in the midst of
+ * being created, then we'll block because a writelock is held.
+ * In that case, we should request a blocking write, and when we
+ * get the lock, we should then go back and check to see if the
+ * object exists and start all over again.
+ *
+ * If we got the READLOCK, then either no one is holding the
+ * lock or someone has an open handle and the fact that the file
+ * exists is problematic. So, in this case, we request the
+ * WRITELOCK non-blocking -- if it succeeds, we're golden. If
+ * it fails, then the file exists and we return EEXIST.
+ */
+ if ((ret = __fop_lock_handle(env,
+ tmpdbp, locker, DB_LOCK_READ, NULL, DB_LOCK_NOWAIT)) != 0) {
+ /*
+ * Someone holds a write-lock. Wait for the write-lock
+ * and after we get it, release it and start over.
+ */
+ if ((ret = __fop_lock_handle(env, tmpdbp,
+ locker, DB_LOCK_WRITE, &elock, 0)) != 0)
+ goto err;
+ if ((ret =
+ __lock_put(env, &tmpdbp->handle_lock)) != 0)
+ goto err;
+ if ((ret = __db_refresh(tmpdbp, NULL, 0, NULL, 0)) != 0)
+ goto err;
+ goto retry;
+ }
+
+ /* We got the read lock; try to upgrade it. */
+ ret = __fop_lock_handle(env,
+ tmpdbp, locker, DB_LOCK_WRITE,
+ NULL, DB_LOCK_UPGRADE | DB_LOCK_NOWAIT);
+ if (ret != 0) {
+ /*
+ * We did not get the writelock, so someone
+ * has the handle open. This is an error.
+ */
+ (void)__lock_put(env, &tmpdbp->handle_lock);
+ ret = EEXIST;
+ } else if (F_ISSET(tmpdbp, DB_AM_IN_RENAME))
+ /* We got the lock and are renaming it. */
+ ret = 0;
+ else { /* We got the lock, but the file exists. */
+ (void)__lock_put(env, &tmpdbp->handle_lock);
+ ret = EEXIST;
+ }
+ if (ret != 0)
+ goto err;
+ }
+
+ /*
+ * While we have the namespace locked, do the renames and then
+ * swap for the handle lock.
+ */
+ if ((ret = __fop_rename(env, txn,
+ old, new, &dbp->dirname, dbp->fileid, DB_APP_DATA, 1, dflags)) != 0)
+ goto err;
+ if ((ret = __fop_rename(env, txn, back, old,
+ &dbp->dirname, tmpdbp->fileid, DB_APP_DATA, 0, dflags)) != 0)
+ goto err;
+ if ((ret = __fop_lock_handle(env,
+ tmpdbp, locker, DB_LOCK_WRITE, &elock, NOWAIT_FLAG(txn))) != 0)
+ goto err;
+
+ /*
+ * We just acquired a transactional lock on the tmp handle.
+ * We need to null out the tmp handle's lock so that it
+ * doesn't create problems for us in the close path.
+ */
+ LOCK_INIT(tmpdbp->handle_lock);
+
+ /* Commit the child. */
+ child_txnid = txn->txnid;
+ parent = txn->parent;
+ ret = __txn_commit(txn, 0);
+ txn = NULL;
+
+ /*
+	 * If the new name is available because it was previously renamed,
+ * remove it from the remove list.
+ */
+ if (F_ISSET(tmpdbp, DB_AM_IN_RENAME))
+ __txn_remrem(env, parent, realnew);
+
+ /* Now log the child information in the parent. */
+ memset(&fiddbt, 0, sizeof(fiddbt));
+ fiddbt.data = dbp->fileid;
+ fiddbt.size = DB_FILE_ID_LEN;
+	memset(&tmpdbt, 0, sizeof(tmpdbt));
+ tmpdbt.data = tmpdbp->fileid;
+ tmpdbt.size = DB_FILE_ID_LEN;
+ DB_INIT_DBT(namedbt, old, strlen(old) + 1);
+ if ((t_ret = __fop_file_remove_log(env,
+ parent, &lsn, dflags, &fiddbt, &tmpdbt, &namedbt,
+ (u_int32_t)DB_APP_DATA, child_txnid)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* This is a delayed delete of the dummy file. */
+ if ((ret = __db_appname(env,
+ DB_APP_DATA, old, &dbp->dirname, &realold)) != 0)
+ goto err;
+
+ if ((ret = __txn_remevent(env, parent, realold, NULL, 0)) != 0)
+ goto err;
+
+err: if (txn != NULL) /* Ret must already be set, so void abort. */
+ (void)__txn_abort(txn);
+
+ (void)__ENV_LPUT(env, elock);
+
+ if (fhp != NULL &&
+ (t_ret = __os_closehandle(env, fhp)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (realnew != NULL)
+ __os_free(env, realnew);
+ if (realold != NULL)
+ __os_free(env, realold);
+ return (ret);
+}
+
+static int
+__fop_inmem_swap(olddbp, backdbp, txn, old, new, back, locker)
+ DB *olddbp, *backdbp;
+ DB_TXN *txn;
+ const char *old, *new, *back;
+ DB_LOCKER *locker;
+{
+ DB *tmpdbp;
+ DBT fid_dbt, n1_dbt, n2_dbt;
+ DB_LOCK elock;
+ DB_LSN lsn;
+ DB_TXN *parent;
+ ENV *env;
+ int ret, t_ret;
+
+ env = olddbp->env;
+ parent = txn->parent;
+retry: LOCK_INIT(elock);
+ if ((ret = __db_create_internal(&tmpdbp, env, 0)) != 0)
+ return (ret);
+ MAKE_INMEM(tmpdbp);
+
+ GET_ENVLOCK(env, locker, &elock);
+ if ((ret = __env_mpool(tmpdbp, new, 0)) == 0) {
+ /*
+ * It is possible that the only reason this database exists is
+ * because we've done a previous rename of it and we have
+ * left a placeholder here. We need to check for that case
+ * and allow this rename to succeed if that's the case.
+ */
+
+ if ((ret = __fop_inmem_read_meta(
+ tmpdbp, txn, new, 0, DB_CHK_META)) != 0) {
+ ret = EEXIST;
+ goto err;
+ }
+
+ /*
+ * Now, try to acquire the handle lock. If it's from our txn,
+ * then we'll get the lock. If it's not, then someone else has
+ * it locked. See the comments in __fop_ondisk_swap for
+ * details.
+ */
+ if ((ret = __fop_lock_handle(env,
+ tmpdbp, locker, DB_LOCK_READ, NULL, DB_LOCK_NOWAIT)) != 0) {
+ /*
+			 * Someone holds a write lock.  Wait for the write
+			 * lock and, after we get it, retry.
+ */
+ if ((ret = __fop_lock_handle(env, tmpdbp,
+ locker, DB_LOCK_WRITE, &elock, 0)) != 0)
+ goto err;
+
+ /* We have the write lock; release it and start over. */
+ (void)__lock_put(env, &tmpdbp->handle_lock);
+ (void)__db_close(tmpdbp, NULL, DB_NOSYNC);
+ (void)__ENV_LPUT(env, elock);
+ goto retry;
+ } else {
+ (void)__lock_put(env, &tmpdbp->handle_lock);
+ if (!F_ISSET(tmpdbp, DB_AM_IN_RENAME))
+ ret = EEXIST;
+ }
+ if (ret != 0)
+ goto err;
+ }
+
+ /* Log the renames. */
+ if (LOGGING_ON(env)
+#ifndef DEBUG_WOP
+ && txn != NULL
+#endif
+ ) {
+ /* Rename old to new. */
+ DB_INIT_DBT(fid_dbt, olddbp->fileid, DB_FILE_ID_LEN);
+ DB_INIT_DBT(n1_dbt, old, strlen(old) + 1);
+ DB_INIT_DBT(n2_dbt, new, strlen(new) + 1);
+ if ((ret = __crdel_inmem_rename_log(
+ env, txn, &lsn, 0, &n1_dbt, &n2_dbt, &fid_dbt)) != 0)
+ goto err;
+
+ /* Rename back to old */
+ fid_dbt.data = backdbp->fileid;
+ DB_SET_DBT(n2_dbt, back, strlen(back) + 1);
+ if ((ret = __crdel_inmem_rename_log(
+ env, txn, &lsn, 0, &n2_dbt, &n1_dbt, &fid_dbt)) != 0)
+ goto err;
+ }
+
+ /*
+ * While we have the namespace locked, do the renames and then
+ * swap for the handle lock. If we ran into a file in the midst
+ * of rename, then we need to delete it first, else nameop is
+ * going to consider it an error.
+ */
+ if (F_ISSET(tmpdbp, DB_AM_IN_RENAME)) {
+ if ((ret = __memp_nameop(env,
+ tmpdbp->fileid, NULL, new, NULL, 1)) != 0)
+ goto err;
+ __txn_remrem(env, parent, new);
+ }
+
+ if ((ret = __memp_nameop(
+ env, olddbp->fileid, new, old, new, 1)) != 0)
+ goto err;
+ if ((ret = __memp_nameop(
+ env, backdbp->fileid, old, back, old, 1)) != 0)
+ goto err;
+
+ if ((ret = __fop_lock_handle(env,
+ tmpdbp, locker, DB_LOCK_WRITE, &elock, 0)) != 0)
+ goto err;
+
+ /*
+ * We just acquired a transactional lock on the tmp handle.
+ * We need to null out the tmp handle's lock so that it
+ * doesn't create problems for us in the close path.
+ */
+ LOCK_INIT(tmpdbp->handle_lock);
+
+ DB_ASSERT(env, txn != NULL);
+
+ /* Commit the child. */
+ ret = __txn_commit(txn, 0);
+ txn = NULL;
+
+ if ((ret = __db_inmem_remove(backdbp, parent, old)) != 0)
+ goto err;
+
+err: (void)__ENV_LPUT(env, elock);
+
+ if (txn != NULL)
+ (void)__txn_abort(txn);
+
+ if ((t_ret = __db_close(tmpdbp, NULL, 0)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
diff --git a/src/hash/hash.c b/src/hash/hash.c
new file mode 100644
index 00000000..ae5736e7
--- /dev/null
+++ b/src/hash/hash.c
@@ -0,0 +1,2340 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994
+ * Margo Seltzer. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Margo Seltzer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/partition.h"
+
+static int __ham_bulk __P((DBC *, DBT *, u_int32_t));
+static int __hamc_close __P((DBC *, db_pgno_t, int *));
+static int __hamc_del __P((DBC *, u_int32_t));
+static int __hamc_destroy __P((DBC *));
+static int __hamc_get __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
+static int __hamc_put __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
+static int __hamc_writelock __P((DBC *));
+static int __ham_dup_return __P((DBC *, DBT *, u_int32_t));
+static int __ham_expand_table __P((DBC *));
+static int __hamc_update_getorder
+ __P((DBC *, DBC *, u_int32_t *, db_pgno_t, u_int32_t, void *));
+static int __hamc_update_setorder
+ __P((DBC *, DBC *, u_int32_t *, db_pgno_t, u_int32_t, void *));
+static int __ham_get_clist_func
+ __P((DBC *, DBC *, u_int32_t *, db_pgno_t, u_int32_t, void *));
+
+/*
+ * __ham_quick_delete --
+ * This function is called by __db_del when the appropriate conditions
+ * are met, and it performs the delete in the optimized way.
+ *
+ * PUBLIC: int __ham_quick_delete __P((DBC *));
+ */
+int
+__ham_quick_delete(dbc)
+ DBC *dbc;
+{
+ DB_MPOOLFILE *mpf;
+ HASH_CURSOR *hcp;
+ int ret, t_ret;
+
+ /*
+ * When performing a DB->del operation not involving secondary indices
+ * and not removing an off-page duplicate tree, we can speed things up
+ * substantially by removing the entire duplicate set, if any is
+ * present, in one operation, rather than by conjuring up and deleting
+ * each of the items individually. (All are stored in one big HKEYDATA
+ * structure.) We don't bother to distinguish on-page duplicate sets
+ * from single, non-dup items; they're deleted in exactly the same way.
+ *
+ * The cursor should be set to the first item in the duplicate set, or
+ * to the sole key/data pair when the key does not have a duplicate set,
+ * before the function is called.
+ *
+	 * We do not need to call CDB_LOCKING_INIT; __db_del calls here with
+ * a write cursor.
+ *
+ * Assert we're initialized, but not to an off-page duplicate.
+ * Assert we're not using secondary indices.
+ */
+ DB_ASSERT(dbc->env, IS_INITIALIZED(dbc));
+ DB_ASSERT(dbc->env, dbc->internal->opd == NULL);
+ DB_ASSERT(dbc->env, !F_ISSET(dbc->dbp, DB_AM_SECONDARY));
+ DB_ASSERT(dbc->env, !DB_IS_PRIMARY(dbc->dbp));
+
+ hcp = (HASH_CURSOR *)dbc->internal;
+ mpf = dbc->dbp->mpf;
+ if ((ret = __ham_get_meta(dbc)) != 0)
+ return (ret);
+
+ if ((ret = __hamc_writelock(dbc)) == 0) {
+ ret = __ham_del_pair(dbc, 0, NULL);
+ /*
+ * If a page was retrieved during the delete, put it now. We
+		 * can't rely on the caller's cursor close to do that, since bulk
+ * delete operations keep the cursor open across deletes.
+ */
+ if (hcp->page != NULL) {
+ if ((t_ret = __memp_fput(mpf, dbc->thread_info,
+ hcp->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ hcp->page = NULL;
+ }
+ }
+
+ if ((t_ret = __ham_release_meta(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
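+
+/*
+ * A minimal usage sketch, illustrative only and not part of the original
+ * call path: __db_del positions a write cursor on the key's first item
+ * (the sole pair, or the head of the on-page duplicate set) and then
+ * calls in, roughly:
+ *
+ *	if ((ret = __dbc_get(dbc, &key, &data, DB_SET)) == 0)
+ *		ret = __ham_quick_delete(dbc);
+ *
+ * The __dbc_get call here stands in for however the caller actually
+ * positions the cursor.
+ */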
+
+/* ****************** CURSORS ********************************** */
+/*
+ * __hamc_init --
+ * Initialize the hash-specific portion of a cursor.
+ *
+ * PUBLIC: int __hamc_init __P((DBC *));
+ */
+int
+__hamc_init(dbc)
+ DBC *dbc;
+{
+ ENV *env;
+ HASH_CURSOR *new_curs;
+ int ret;
+
+ env = dbc->env;
+ if ((ret = __os_calloc(env,
+ 1, sizeof(struct cursor_t), &new_curs)) != 0)
+ return (ret);
+ if ((ret = __os_malloc(env,
+ dbc->dbp->pgsize, &new_curs->split_buf)) != 0) {
+ __os_free(env, new_curs);
+ return (ret);
+ }
+
+ dbc->internal = (DBC_INTERNAL *) new_curs;
+ dbc->close = dbc->c_close = __dbc_close_pp;
+ dbc->cmp = __dbc_cmp_pp;
+ dbc->count = dbc->c_count = __dbc_count_pp;
+ dbc->del = dbc->c_del = __dbc_del_pp;
+ dbc->dup = dbc->c_dup = __dbc_dup_pp;
+ dbc->get = dbc->c_get = __dbc_get_pp;
+ dbc->pget = dbc->c_pget = __dbc_pget_pp;
+ dbc->put = dbc->c_put = __dbc_put_pp;
+ dbc->am_bulk = __ham_bulk;
+ dbc->am_close = __hamc_close;
+ dbc->am_del = __hamc_del;
+ dbc->am_destroy = __hamc_destroy;
+ dbc->am_get = __hamc_get;
+ dbc->am_put = __hamc_put;
+ dbc->am_writelock = __hamc_writelock;
+
+ return (__ham_item_init(dbc));
+}
+
+/*
+ * __hamc_close --
+ * Close down the cursor from a single use.
+ */
+static int
+__hamc_close(dbc, root_pgno, rmroot)
+ DBC *dbc;
+ db_pgno_t root_pgno;
+ int *rmroot;
+{
+ DB_MPOOLFILE *mpf;
+ HASH_CURSOR *hcp;
+ HKEYDATA *dp;
+ db_lockmode_t lock_mode;
+ int doroot, gotmeta, ret, t_ret;
+
+ COMPQUIET(rmroot, 0);
+ mpf = dbc->dbp->mpf;
+ doroot = gotmeta = ret = 0;
+ hcp = (HASH_CURSOR *) dbc->internal;
+
+ /* Check for off page dups. */
+ if (dbc->internal->opd != NULL) {
+ if ((ret = __ham_get_meta(dbc)) != 0)
+ goto done;
+ gotmeta = 1;
+ lock_mode = DB_LOCK_READ;
+
+ /* To support dirty reads we must reget the write lock. */
+ if (F_ISSET(dbc->dbp, DB_AM_READ_UNCOMMITTED) &&
+ F_ISSET((BTREE_CURSOR *)
+ dbc->internal->opd->internal, C_DELETED))
+ lock_mode = DB_LOCK_WRITE;
+
+ if ((ret = __ham_get_cpage(dbc, lock_mode)) != 0)
+ goto out;
+ dp = (HKEYDATA *)H_PAIRDATA(dbc->dbp, hcp->page, hcp->indx);
+
+ /* If it's not a dup we aborted before we changed it. */
+ if (HPAGE_PTYPE(dp) == H_OFFDUP)
+ memcpy(&root_pgno,
+ HOFFPAGE_PGNO(dp), sizeof(db_pgno_t));
+ else
+ root_pgno = PGNO_INVALID;
+
+ if ((ret =
+ hcp->opd->am_close(hcp->opd, root_pgno, &doroot)) != 0)
+ goto out;
+ if (doroot != 0) {
+ if ((ret = __memp_dirty(mpf, &hcp->page,
+ dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ goto out;
+ if ((ret = __ham_del_pair(dbc, 0, NULL)) != 0)
+ goto out;
+ }
+ }
+
+out: if (ret != 0)
+ F_SET(dbc, DBC_ERROR);
+ if (hcp->page != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, hcp->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (gotmeta != 0 && (t_ret = __ham_release_meta(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+done: if ((t_ret = __ham_item_init(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __hamc_destroy --
+ * Cleanup the access method private part of a cursor.
+ */
+static int
+__hamc_destroy(dbc)
+ DBC *dbc;
+{
+ HASH_CURSOR *hcp;
+
+ hcp = (HASH_CURSOR *)dbc->internal;
+ if (hcp->split_buf != NULL)
+ __os_free(dbc->env, hcp->split_buf);
+ __os_free(dbc->env, hcp);
+
+ return (0);
+}
+
+/*
+ * __hamc_count --
+ * Return a count of on-page duplicates.
+ *
+ * PUBLIC: int __hamc_count __P((DBC *, db_recno_t *));
+ */
+int
+__hamc_count(dbc, recnop)
+ DBC *dbc;
+ db_recno_t *recnop;
+{
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ HASH_CURSOR *hcp;
+ db_indx_t len;
+ db_recno_t recno;
+ int ret, t_ret;
+ u_int8_t *p, *pend;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ hcp = (HASH_CURSOR *)dbc->internal;
+
+ recno = 0;
+
+ if ((ret = __ham_get_cpage(dbc, DB_LOCK_READ)) != 0)
+ return (ret);
+ if (hcp->indx >= NUM_ENT(hcp->page)) {
+ *recnop = 0;
+ goto err;
+ }
+
+ switch (HPAGE_PTYPE(H_PAIRDATA(dbp, hcp->page, hcp->indx))) {
+ case H_KEYDATA:
+ case H_OFFPAGE:
+ recno = 1;
+ break;
+ case H_DUPLICATE:
+ p = HKEYDATA_DATA(H_PAIRDATA(dbp, hcp->page, hcp->indx));
+ pend = p +
+ LEN_HDATA(dbp, hcp->page, dbp->pgsize, hcp->indx);
+ for (; p < pend; recno++) {
+			/* p may be odd, so copy rather than just dereferencing. */
+ memcpy(&len, p, sizeof(db_indx_t));
+ p += 2 * sizeof(db_indx_t) + len;
+ }
+
+ break;
+ default:
+ ret = __db_pgfmt(dbp->env, hcp->pgno);
+ goto err;
+ }
+
+ *recnop = recno;
+
+err: if ((t_ret = __memp_fput(mpf,
+ dbc->thread_info, hcp->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ hcp->page = NULL;
+ return (ret);
+}
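+
+/*
+ * Layout assumed by the H_DUPLICATE walk above: each element of an on-page
+ * duplicate set stores its length both before and after the data, so one
+ * element occupies 2 * sizeof(db_indx_t) + len bytes:
+ *
+ *	[len1][data1][len1][len2][data2][len2]...
+ */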
+
+/*
+ * __hamc_cmp --
+ * Compare two hash cursors for equality.
+ *
+ * This function is only called with two cursors that point to the same item.
+ * It distinguishes two cases:
+ * * Cursors pointing to different items in the same on-page duplicate set.
+ * * Cursors pointing to the same item, with different DELETED flags.
+ *
+ * PUBLIC: int __hamc_cmp __P((DBC *, DBC *, int *));
+ */
+int
+__hamc_cmp(dbc, other_dbc, result)
+ DBC *dbc, *other_dbc;
+ int *result;
+{
+ ENV *env;
+ HASH_CURSOR *hcp, *ohcp;
+
+ env = dbc->env;
+ hcp = (HASH_CURSOR *)dbc->internal;
+ ohcp = (HASH_CURSOR *)other_dbc->internal;
+
+ DB_ASSERT (env, hcp->pgno == ohcp->pgno);
+ DB_ASSERT (env, hcp->indx == ohcp->indx);
+
+ /* Only compare the duplicate offsets if this is a duplicate item. */
+ if ((F_ISSET(hcp, H_ISDUP) && hcp->dup_off != ohcp->dup_off) ||
+ F_ISSET(hcp, H_DELETED) != F_ISSET(ohcp, H_DELETED))
+ *result = 1;
+ else
+ *result = 0;
+ return (0);
+}
+
+static int
+__hamc_del(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DBT repldbt;
+ DB_MPOOLFILE *mpf;
+ HASH_CURSOR *hcp;
+ int ret, t_ret;
+
+ COMPQUIET(flags, 0);
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ hcp = (HASH_CURSOR *)dbc->internal;
+
+ if (F_ISSET(hcp, H_DELETED))
+ return (DB_NOTFOUND);
+
+ if ((ret = __ham_get_meta(dbc)) != 0)
+ goto out;
+
+ if ((ret = __ham_get_cpage(dbc, DB_LOCK_WRITE)) != 0)
+ goto out;
+
+ /* Off-page duplicates. */
+ if (HPAGE_TYPE(dbp, hcp->page, H_DATAINDEX(hcp->indx)) == H_OFFDUP)
+ goto out;
+
+ DB_ASSERT(dbp->env, IS_DIRTY(hcp->page));
+
+ if (F_ISSET(hcp, H_ISDUP)) { /* On-page duplicate. */
+ if (hcp->dup_off == 0 &&
+ DUP_SIZE(hcp->dup_len) == LEN_HDATA(dbp, hcp->page,
+ hcp->hdr->dbmeta.pagesize, hcp->indx))
+ ret = __ham_del_pair(dbc, 0, NULL);
+ else {
+ repldbt.flags = 0;
+ F_SET(&repldbt, DB_DBT_PARTIAL);
+ repldbt.doff = hcp->dup_off;
+ repldbt.dlen = DUP_SIZE(hcp->dup_len);
+ repldbt.size = 0;
+ repldbt.data = HKEYDATA_DATA(H_PAIRDATA(dbp, hcp->page,
+ hcp->indx));
+ if ((ret =
+ __ham_replpair(dbc, &repldbt, H_DUPLICATE)) == 0) {
+ hcp->dup_tlen -= DUP_SIZE(hcp->dup_len);
+ F_SET(hcp, H_DELETED);
+ /*
+ * Clear any cached streaming information.
+ */
+ hcp->stream_start_pgno = PGNO_INVALID;
+ ret = __hamc_update(dbc, DUP_SIZE(hcp->dup_len),
+ DB_HAM_CURADJ_DEL, 1);
+ }
+ }
+ } else /* Not a duplicate */
+ ret = __ham_del_pair(dbc, 0, NULL);
+
+out: if (hcp->page != NULL) {
+ if ((t_ret = __memp_fput(mpf, dbc->thread_info,
+ hcp->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ hcp->page = NULL;
+ }
+ if ((t_ret = __ham_release_meta(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __hamc_dup --
+ * Duplicate a hash cursor, such that the new one holds appropriate
+ * locks for the position of the original.
+ *
+ * PUBLIC: int __hamc_dup __P((DBC *, DBC *));
+ */
+int
+__hamc_dup(orig_dbc, new_dbc)
+ DBC *orig_dbc, *new_dbc;
+{
+ HASH_CURSOR *orig, *new;
+
+ orig = (HASH_CURSOR *)orig_dbc->internal;
+ new = (HASH_CURSOR *)new_dbc->internal;
+
+ new->bucket = orig->bucket;
+ new->lbucket = orig->lbucket;
+ new->dup_off = orig->dup_off;
+ new->dup_len = orig->dup_len;
+ new->dup_tlen = orig->dup_tlen;
+
+ if (F_ISSET(orig, H_DELETED))
+ F_SET(new, H_DELETED);
+ if (F_ISSET(orig, H_ISDUP))
+ F_SET(new, H_ISDUP);
+
+ return (0);
+}
+
+static int
+__hamc_get(dbc, key, data, flags, pgnop)
+ DBC *dbc;
+ DBT *key;
+ DBT *data;
+ u_int32_t flags;
+ db_pgno_t *pgnop;
+{
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ HASH_CURSOR *hcp;
+ db_lockmode_t lock_type;
+ int ret, t_ret;
+
+ hcp = (HASH_CURSOR *)dbc->internal;
+ dbp = dbc->dbp;
+ env = dbp->env;
+ mpf = dbp->mpf;
+
+ /* Clear OR'd in additional bits so we can check for flag equality. */
+ if (F_ISSET(dbc, DBC_RMW))
+ lock_type = DB_LOCK_WRITE;
+ else
+ lock_type = DB_LOCK_READ;
+
+ if ((ret = __ham_get_meta(dbc)) != 0)
+ return (ret);
+ hcp->seek_size = 0;
+
+ ret = 0;
+ switch (flags) {
+ case DB_PREV_DUP:
+ F_SET(hcp, H_DUPONLY);
+ goto prev;
+ case DB_PREV_NODUP:
+ F_SET(hcp, H_NEXT_NODUP);
+ /* FALLTHROUGH */
+ case DB_PREV:
+ if (IS_INITIALIZED(dbc)) {
+prev: ret = __ham_item_prev(dbc, lock_type, pgnop);
+ break;
+ }
+ /* FALLTHROUGH */
+ case DB_LAST:
+ ret = __ham_item_last(dbc, lock_type, pgnop);
+ break;
+ case DB_NEXT_DUP:
+ case DB_GET_BOTHC:
+ /* cgetchk has already determined that the cursor is set. */
+ F_SET(hcp, H_DUPONLY);
+ goto next;
+ case DB_NEXT_NODUP:
+ F_SET(hcp, H_NEXT_NODUP);
+ /* FALLTHROUGH */
+ case DB_NEXT:
+ if (IS_INITIALIZED(dbc)) {
+next: ret = __ham_item_next(dbc, lock_type, pgnop);
+ break;
+ }
+ /* FALLTHROUGH */
+ case DB_FIRST:
+ ret = __ham_item_first(dbc, lock_type, pgnop);
+ break;
+ case DB_SET:
+ case DB_SET_RANGE:
+ case DB_GET_BOTH:
+ case DB_GET_BOTH_RANGE:
+ ret = __ham_lookup(dbc, key, 0, lock_type, pgnop);
+ break;
+ case DB_CURRENT:
+ /* cgetchk has already determined that the cursor is set. */
+ if (F_ISSET(hcp, H_DELETED)) {
+ ret = DB_KEYEMPTY;
+ goto err;
+ }
+
+ ret = __ham_item(dbc, lock_type, pgnop);
+ break;
+ default:
+ ret = __db_unknown_flag(env, "__hamc_get", flags);
+ break;
+ }
+
+ /*
+ * Must always enter this loop to do error handling and
+	 * check for a big key/data pair.
+ */
+ for (;;) {
+ if (ret != 0 && ret != DB_NOTFOUND)
+ goto err;
+ else if (F_ISSET(hcp, H_OK)) {
+ if (*pgnop == PGNO_INVALID)
+ ret = __ham_dup_return(dbc, data, flags);
+ break;
+ } else if (!F_ISSET(hcp, H_NOMORE)) {
+ __db_errx(env, DB_STR("1130",
+ "H_NOMORE returned to __hamc_get"));
+ ret = EINVAL;
+ break;
+ }
+
+ /*
+ * Ran out of entries in a bucket; change buckets.
+ */
+ switch (flags) {
+ case DB_LAST:
+ case DB_PREV:
+ case DB_PREV_DUP:
+ case DB_PREV_NODUP:
+ ret = __memp_fput(mpf,
+ dbc->thread_info, hcp->page, dbc->priority);
+ hcp->page = NULL;
+ if (hcp->bucket == 0) {
+ ret = DB_NOTFOUND;
+ hcp->pgno = PGNO_INVALID;
+ goto err;
+ }
+ F_CLR(hcp, H_ISDUP);
+ hcp->bucket--;
+ hcp->indx = NDX_INVALID;
+ hcp->pgno = BUCKET_TO_PAGE(hcp, hcp->bucket);
+ if (ret == 0)
+ ret = __ham_item_prev(dbc, lock_type, pgnop);
+ break;
+ case DB_FIRST:
+ case DB_NEXT:
+ case DB_NEXT_NODUP:
+ ret = __memp_fput(mpf,
+ dbc->thread_info, hcp->page, dbc->priority);
+ hcp->page = NULL;
+ hcp->indx = NDX_INVALID;
+ hcp->bucket++;
+ F_CLR(hcp, H_ISDUP);
+ hcp->pgno = BUCKET_TO_PAGE(hcp, hcp->bucket);
+ if (hcp->bucket > hcp->hdr->max_bucket) {
+ ret = DB_NOTFOUND;
+ hcp->pgno = PGNO_INVALID;
+ goto err;
+ }
+ if (ret == 0)
+ ret = __ham_item_next(dbc, lock_type, pgnop);
+ break;
+ case DB_GET_BOTH:
+ case DB_GET_BOTHC:
+ case DB_GET_BOTH_RANGE:
+ case DB_NEXT_DUP:
+ case DB_SET:
+ case DB_SET_RANGE:
+ /* Key not found. */
+ ret = DB_NOTFOUND;
+ goto err;
+ case DB_CURRENT:
+ /*
+ * This should only happen if you are doing deletes and
+ * reading with concurrent threads and not doing proper
+ * locking. We return the same error code as we would
+ * if the cursor were deleted.
+ */
+ ret = DB_KEYEMPTY;
+ goto err;
+ default:
+ DB_ASSERT(env, 0);
+ }
+ }
+
+err: if ((t_ret = __ham_release_meta(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ F_CLR(hcp, H_DUPONLY);
+ F_CLR(hcp, H_NEXT_NODUP);
+
+ return (ret);
+}
+
+/*
+ * __ham_bulk -- Return bulk data from a hash table.
+ */
+static int
+__ham_bulk(dbc, data, flags)
+ DBC *dbc;
+ DBT *data;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ HASH_CURSOR *cp;
+ PAGE *pg;
+ db_indx_t dup_len, dup_off, dup_tlen, indx, *inp;
+ db_lockmode_t lock_mode;
+ db_pgno_t pgno;
+ int32_t *endp, *offp, *saveoff;
+ u_int32_t key_off, key_size, pagesize, size, space;
+ u_int8_t *dbuf, *dp, *hk, *np, *tmp;
+ int is_dup, is_key;
+ int need_pg, next_key, no_dup, ret, t_ret;
+
+ ret = 0;
+ key_off = 0;
+ dup_len = dup_off = dup_tlen = 0;
+ size = 0;
+ dbp = dbc->dbp;
+ pagesize = dbp->pgsize;
+ mpf = dbp->mpf;
+ cp = (HASH_CURSOR *)dbc->internal;
+ is_key = LF_ISSET(DB_MULTIPLE_KEY) ? 1 : 0;
+ next_key = is_key && LF_ISSET(DB_OPFLAGS_MASK) != DB_NEXT_DUP;
+ no_dup = LF_ISSET(DB_OPFLAGS_MASK) == DB_NEXT_NODUP;
+ dbuf = data->data;
+ np = dp = dbuf;
+
+	/* Keep track of space that is left. There is a termination entry. */
+ space = data->ulen;
+ space -= sizeof(*offp);
+
+ /* Build the offset/size table from the end up. */
+ endp = (int32_t *) ((u_int8_t *)dbuf + data->ulen);
+ endp--;
+ offp = endp;
+
+ key_size = 0;
+ lock_mode = F_ISSET(dbc, DBC_RMW) ? DB_LOCK_WRITE: DB_LOCK_READ;
+
+next_pg:
+ need_pg = 1;
+ indx = cp->indx;
+ pg = cp->page;
+ inp = P_INP(dbp, pg);
+
+ do {
+ if (is_key) {
+ hk = H_PAIRKEY(dbp, pg, indx);
+ if (HPAGE_PTYPE(hk) == H_OFFPAGE) {
+ memcpy(&key_size,
+ HOFFPAGE_TLEN(hk), sizeof(u_int32_t));
+ memcpy(&pgno,
+ HOFFPAGE_PGNO(hk), sizeof(db_pgno_t));
+ size = key_size;
+ if (key_size > space)
+ goto get_key_space;
+ if ((ret = __bam_bulk_overflow(
+ dbc, key_size, pgno, np)) != 0)
+ return (ret);
+ space -= key_size;
+ key_off = (u_int32_t)(np - dbuf);
+ np += key_size;
+ } else {
+ if (need_pg) {
+ dp = np;
+ size = pagesize - HOFFSET(pg);
+ if (space < size) {
+get_key_space:
+ if (offp == endp) {
+ data->size = (u_int32_t)
+ DB_ALIGN(size +
+ pagesize, 1024);
+ return
+ (DB_BUFFER_SMALL);
+ }
+ goto back_up;
+ }
+ memcpy(dp,
+ (u_int8_t *)pg + HOFFSET(pg), size);
+ need_pg = 0;
+ space -= size;
+ np += size;
+ }
+ key_size = LEN_HKEY(dbp, pg, pagesize, indx);
+ key_off = ((inp[indx] - HOFFSET(pg)) +
+ (u_int32_t)(dp - dbuf)) +
+ SSZA(HKEYDATA, data);
+ }
+ }
+
+ hk = H_PAIRDATA(dbp, pg, indx);
+ switch (HPAGE_PTYPE(hk)) {
+ case H_DUPLICATE:
+ case H_KEYDATA:
+ if (need_pg) {
+ dp = np;
+ size = pagesize - HOFFSET(pg);
+ if (space < size) {
+back_up:
+ if (indx != 0) {
+ indx -= 2;
+ /* XXX
+ * It's not clear that this is
+ * the right way to fix this,
+ * but here goes.
+ * If we are backing up onto a
+ * duplicate, then we need to
+ * position ourselves at the
+ * end of the duplicate set.
+ * We probably need to make
+ * this work for H_OFFDUP too.
+ * It might be worth making a
+ * dummy cursor and calling
+ * __ham_item_prev.
+ */
+ tmp = H_PAIRDATA(dbp, pg, indx);
+ if (HPAGE_PTYPE(tmp) ==
+ H_DUPLICATE) {
+ dup_off = dup_tlen =
+ LEN_HDATA(dbp, pg,
+ pagesize, indx + 1);
+ memcpy(&dup_len,
+ HKEYDATA_DATA(tmp),
+ sizeof(db_indx_t));
+ } else {
+ is_dup = 0;
+ dup_len = 0;
+ dup_off = 0;
+ dup_tlen = 0;
+ F_CLR(cp, H_ISDUP);
+ }
+ goto get_space;
+ }
+ /* indx == 0 */
+ cp->dup_len = dup_len;
+ cp->dup_off = dup_off;
+ cp->dup_tlen = dup_tlen;
+ if ((ret = __ham_item_prev(dbc,
+ lock_mode, &pgno)) != 0) {
+ if (ret != DB_NOTFOUND)
+ return (ret);
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, cp->page,
+ dbc->priority)) != 0)
+ return (ret);
+ cp->page = NULL;
+ if (cp->bucket == 0) {
+ cp->indx = indx =
+ NDX_INVALID;
+ goto get_space;
+ }
+ if ((ret =
+ __ham_get_meta(dbc)) != 0)
+ return (ret);
+
+ cp->bucket--;
+ cp->pgno = BUCKET_TO_PAGE(cp,
+ cp->bucket);
+ cp->indx = NDX_INVALID;
+ if ((ret = __ham_release_meta(
+ dbc)) != 0)
+ return (ret);
+ /*
+					 * It is not an error to get
+					 * DB_NOTFOUND; we're just at
+					 * the beginning of the db.
+ */
+ if ((ret = __ham_item_prev(dbc,
+ lock_mode, &pgno)) != 0) {
+ if (ret != DB_NOTFOUND)
+ return (ret);
+ else
+ ret = 0;
+ }
+ }
+ indx = cp->indx;
+get_space:
+ /*
+ * See if we put any data in the buffer.
+ */
+ if (offp >= endp ||
+ F_ISSET(dbc, DBC_TRANSIENT)) {
+ data->size = (u_int32_t)
+ DB_ALIGN(size +
+ data->ulen - space, 1024);
+ return (DB_BUFFER_SMALL);
+ }
+ /*
+ * Don't continue; we're all out
+ * of space, even though we're
+ * returning success.
+ */
+ next_key = 0;
+ break;
+ }
+ memcpy(dp, (u_int8_t *)pg + HOFFSET(pg), size);
+ need_pg = 0;
+ space -= size;
+ np += size;
+ }
+
+ /*
+ * We're about to crack the offset(s) and length(s)
+ * out of an H_KEYDATA or H_DUPLICATE item.
+ * There are three cases:
+ * 1. We were moved into a duplicate set by
+ * the standard hash cursor code. Respect
+ * the dup_off and dup_tlen we were given.
+ * 2. We stumbled upon a duplicate set while
+ * walking the page on our own. We need to
+ * recognize it as a dup and set dup_off and
+ * dup_tlen.
+ * 3. The current item is not a dup.
+ */
+ if (F_ISSET(cp, H_ISDUP)) {
+ /* Case 1 */
+ is_dup = 1;
+ dup_len = cp->dup_len;
+ dup_off = cp->dup_off;
+ dup_tlen = cp->dup_tlen;
+ } else if (HPAGE_PTYPE(hk) == H_DUPLICATE) {
+ /* Case 2 */
+ is_dup = 1;
+ /*
+ * If we run out of memory and bail,
+ * make sure the fact we're in a dup set
+ * isn't ignored later.
+ */
+ F_SET(cp, H_ISDUP);
+ dup_off = 0;
+ memcpy(&dup_len,
+ HKEYDATA_DATA(hk), sizeof(db_indx_t));
+ dup_tlen = LEN_HDATA(dbp, pg, pagesize, indx);
+ } else {
+ /* Case 3 */
+ is_dup = 0;
+ dup_len = 0;
+ dup_off = 0;
+ dup_tlen = 0;
+ }
+
+ do {
+ space -= (is_key ? 4 : 2) * sizeof(*offp);
+ size += (is_key ? 4 : 2) * sizeof(*offp);
+ /*
+			 * Since space is unsigned, if we happen to wrap
+			 * then this comparison will turn out to be true.
+			 * XXX Wouldn't it be better to simply check above
+			 * that space is greater than the value we're about
+			 * to subtract?
+ */
+ if (space > data->ulen) {
+ if (!is_dup || dup_off == 0)
+ goto back_up;
+ dup_off -= (db_indx_t)
+ DUP_SIZE((u_int32_t)offp[1]);
+ goto get_space;
+ }
+ if (is_key) {
+ *offp-- = (int32_t)key_off;
+ *offp-- = (int32_t)key_size;
+ }
+ if (is_dup) {
+ *offp-- = (int32_t)(
+ ((inp[indx + 1] - HOFFSET(pg)) +
+ dp - dbuf) + SSZA(HKEYDATA, data) +
+ dup_off + sizeof(db_indx_t));
+ memcpy(&dup_len,
+ HKEYDATA_DATA(hk) + dup_off,
+ sizeof(db_indx_t));
+ dup_off += DUP_SIZE(dup_len);
+ *offp-- = dup_len;
+ } else {
+ *offp-- = (int32_t)(
+ ((inp[indx + 1] - HOFFSET(pg)) +
+ dp - dbuf) + SSZA(HKEYDATA, data));
+ *offp-- = LEN_HDATA(dbp, pg,
+ pagesize, indx);
+ }
+ } while (is_dup && dup_off < dup_tlen && no_dup == 0);
+ F_CLR(cp, H_ISDUP);
+ break;
+ case H_OFFDUP:
+ memcpy(&pgno, HOFFPAGE_PGNO(hk), sizeof(db_pgno_t));
+ space -= 2 * sizeof(*offp);
+ if (space > data->ulen)
+ goto back_up;
+
+ if (is_key) {
+ space -= 2 * sizeof(*offp);
+ if (space > data->ulen)
+ goto back_up;
+ *offp-- = (int32_t)key_off;
+ *offp-- = (int32_t)key_size;
+ }
+ saveoff = offp;
+ if ((ret = __bam_bulk_duplicates(dbc,
+ pgno, dbuf, is_key ? offp + 2 : NULL,
+ &offp, &np, &space, no_dup)) != 0) {
+ if (ret == DB_BUFFER_SMALL) {
+ size = space;
+ space = 0;
+ if (is_key && saveoff == offp) {
+ offp += 2;
+ goto back_up;
+ }
+ goto get_space;
+ }
+ return (ret);
+ }
+ break;
+ case H_OFFPAGE:
+ space -= (is_key ? 4 : 2) * sizeof(*offp);
+ if (space > data->ulen)
+ goto back_up;
+
+ memcpy(&size, HOFFPAGE_TLEN(hk), sizeof(u_int32_t));
+ memcpy(&pgno, HOFFPAGE_PGNO(hk), sizeof(db_pgno_t));
+ if (size > space)
+ goto back_up;
+
+ if ((ret =
+ __bam_bulk_overflow(dbc, size, pgno, np)) != 0)
+ return (ret);
+
+ if (is_key) {
+ *offp-- = (int32_t)key_off;
+ *offp-- = (int32_t)key_size;
+ }
+
+ *offp-- = (int32_t)(np - dbuf);
+ *offp-- = (int32_t)size;
+
+ np += size;
+ space -= size;
+ break;
+ default:
+ /* Do nothing. */
+ break;
+ }
+ } while (next_key && (indx += 2) < NUM_ENT(pg));
+
+ cp->indx = indx;
+ cp->dup_len = dup_len;
+ cp->dup_off = dup_off;
+ cp->dup_tlen = dup_tlen;
+
+	/* If we are off the page then try the next page. */
+ if (ret == 0 && next_key && indx >= NUM_ENT(pg)) {
+ if ((ret = __ham_item_next(dbc, lock_mode, &pgno)) == 0)
+ goto next_pg;
+ if (ret != DB_NOTFOUND)
+ return (ret);
+ if ((ret = __memp_fput(dbc->dbp->mpf,
+ dbc->thread_info, cp->page, dbc->priority)) != 0)
+ return (ret);
+ cp->page = NULL;
+ if ((ret = __ham_get_meta(dbc)) != 0)
+ return (ret);
+
+ cp->bucket++;
+ if (cp->bucket > cp->hdr->max_bucket) {
+ /*
+ * Restore cursor to its previous state. We're past
+ * the last item in the last bucket, so the next
+ * DBC->get(DB_NEXT) will return DB_NOTFOUND.
+ */
+ cp->bucket--;
+ ret = DB_NOTFOUND;
+ } else {
+ /*
+ * Start on the next bucket.
+ *
+ * Note that if this new bucket happens to be empty,
+ * but there's another non-empty bucket after it,
+ * we'll return early. This is a rare case, and we
+ * don't guarantee any particular number of keys
+ * returned on each call, so just let the next call
+ * to bulk get move forward by yet another bucket.
+ */
+ cp->pgno = BUCKET_TO_PAGE(cp, cp->bucket);
+ cp->indx = NDX_INVALID;
+ F_CLR(cp, H_ISDUP);
+ ret = __ham_item_next(dbc, lock_mode, &pgno);
+ }
+
+ if ((t_ret = __ham_release_meta(dbc)) != 0)
+ return (t_ret);
+ if (ret == 0)
+ goto next_pg;
+ if (ret != DB_NOTFOUND)
+ return (ret);
+ }
+ *offp = -1;
+ return (0);
+}
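+
+/*
+ * Shape of the bulk buffer filled in above: page images and overflow data
+ * grow up from the start of the user's buffer while the int32_t
+ * offset/length table grows down from the end, terminated by a -1 offset.
+ * A caller would typically walk the result with the documented bulk
+ * macros; a hedged sketch (use() is hypothetical):
+ *
+ *	void *p, *kp, *dp;
+ *	u_int32_t klen, dlen;
+ *
+ *	for (DB_MULTIPLE_INIT(p, &data);;) {
+ *		DB_MULTIPLE_KEY_NEXT(p, &data, kp, klen, dp, dlen);
+ *		if (p == NULL)
+ *			break;
+ *		use(kp, klen, dp, dlen);
+ *	}
+ */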
+
+static int
+__hamc_put(dbc, key, data, flags, pgnop)
+ DBC *dbc;
+ DBT *key;
+ DBT *data;
+ u_int32_t flags;
+ db_pgno_t *pgnop;
+{
+ DB *dbp;
+ DBT tmp_val, *myval;
+ DB_MPOOLFILE *mpf;
+ HASH_CURSOR *hcp;
+ u_int32_t nbytes;
+ int ret, t_ret;
+
+ /*
+	 * The compiler doesn't realize that we only use myval when ret is
+	 * equal to 0, and that in that case we must have set it. So, we
+	 * initialize it here to quiet the compiler.
+ */
+ COMPQUIET(myval, NULL);
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ hcp = (HASH_CURSOR *)dbc->internal;
+
+ if (F_ISSET(hcp, H_DELETED) && flags != DB_KEYFIRST &&
+ flags != DB_KEYLAST && flags != DB_OVERWRITE_DUP)
+ return (DB_NOTFOUND);
+
+ if ((ret = __ham_get_meta(dbc)) != 0)
+ goto err1;
+
+ switch (flags) {
+ case DB_KEYLAST:
+ case DB_KEYFIRST:
+ case DB_NODUPDATA:
+ case DB_NOOVERWRITE:
+ case DB_OVERWRITE_DUP:
+ nbytes = (ISBIG(hcp, key->size) ? HOFFPAGE_PSIZE :
+ HKEYDATA_PSIZE(key->size)) +
+ (ISBIG(hcp, data->size) ? HOFFPAGE_PSIZE :
+ HKEYDATA_PSIZE(data->size));
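+		/*
+		 * Illustration (assumed sizes only): a small key paired with
+		 * a data item big enough to be pushed off-page contributes
+		 * HKEYDATA_PSIZE(key->size) + HOFFPAGE_PSIZE -- the pair's
+		 * on-page footprint, not its raw byte count.
+		 */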
+ if ((ret = __ham_lookup(dbc,
+ key, nbytes, DB_LOCK_WRITE, pgnop)) == DB_NOTFOUND) {
+ if (hcp->seek_found_page != PGNO_INVALID &&
+ hcp->seek_found_page != hcp->pgno) {
+ if ((ret = __memp_fput(mpf, dbc->thread_info,
+ hcp->page, dbc->priority)) != 0)
+ goto err2;
+ hcp->page = NULL;
+ hcp->pgno = hcp->seek_found_page;
+ hcp->indx = NDX_INVALID;
+ }
+
+ if (F_ISSET(data, DB_DBT_PARTIAL) && data->doff != 0) {
+ /*
+ * A partial put, but the key does not exist
+ * and we are not beginning the write at 0.
+ * We must create a data item padded up to doff
+ * and then write the new bytes represented by
+ * val.
+ */
+ if ((ret = __ham_init_dbt(dbp->env, &tmp_val,
+ data->size + data->doff,
+ &dbc->my_rdata.data,
+ &dbc->my_rdata.ulen)) != 0)
+ goto err2;
+
+ memset(tmp_val.data, 0, data->doff);
+ memcpy((u_int8_t *)tmp_val.data +
+ data->doff, data->data, data->size);
+ myval = &tmp_val;
+ } else
+ myval = (DBT *)data;
+
+ ret = __ham_add_el(dbc, key, myval, H_KEYDATA);
+ goto done;
+ } else if (ret == 0 && flags == DB_NOOVERWRITE &&
+ !F_ISSET(hcp, H_DELETED)) {
+ if (*pgnop == PGNO_INVALID)
+ ret = DB_KEYEXIST;
+ else
+ ret = __bam_opd_exists(dbc, *pgnop);
+ if (ret != 0)
+ goto done;
+ }
+ break;
+ case DB_BEFORE:
+ case DB_AFTER:
+ case DB_CURRENT:
+ ret = __ham_item(dbc, DB_LOCK_WRITE, pgnop);
+ break;
+ default:
+ ret = __db_unknown_flag(dbp->env, "__hamc_put", flags);
+ break;
+ }
+
+ /*
+	 * Invalidate any insert index found so it is not reused
+	 * by future inserts.
+ */
+ hcp->seek_found_page = PGNO_INVALID;
+ hcp->seek_found_indx = NDX_INVALID;
+
+ if (*pgnop == PGNO_INVALID && ret == 0) {
+ if ((ret = __memp_dirty(mpf, &hcp->page,
+ dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ goto done;
+ if (flags == DB_CURRENT ||
+ (!(F_ISSET(dbp, DB_AM_DUP) || F_ISSET(key, DB_DBT_DUPOK)) &&
+ (flags == DB_KEYFIRST || flags == DB_KEYLAST ||
+ flags == DB_NODUPDATA || flags == DB_OVERWRITE_DUP)))
+ ret = __ham_overwrite(dbc, data, flags);
+ else
+ ret = __ham_add_dup(dbc, data, flags, pgnop);
+ }
+
+done: if (hcp->page != NULL) {
+ if ((t_ret = __memp_fput(mpf, dbc->thread_info,
+ hcp->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (t_ret == 0)
+ hcp->page = NULL;
+ }
+
+ if (ret == 0 && F_ISSET(hcp, H_EXPAND)) {
+ ret = __ham_expand_table(dbc);
+ F_CLR(hcp, H_EXPAND);
+ /* If we are out of space, ignore the error. */
+ if (ret == ENOSPC && dbc->txn == NULL)
+ ret = 0;
+ } else if (ret == 0 && F_ISSET(hcp, H_CONTRACT)) {
+ if (!F_ISSET(dbp, DB_AM_REVSPLITOFF))
+ ret = __ham_contract_table(dbc, NULL);
+ F_CLR(hcp, H_CONTRACT);
+ }
+
+err2: if ((t_ret = __ham_release_meta(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+err1: return (ret);
+}
+
+/********************************* UTILITIES ************************/
+
+/*
+ * __ham_contract_table -- remove the last bucket.
+ * PUBLIC: int __ham_contract_table __P((DBC *, DB_COMPACT *));
+ */
+int
+__ham_contract_table(dbc, c_data)
+ DBC *dbc;
+ DB_COMPACT *c_data;
+{
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ HASH_CURSOR *hcp;
+ HMETA *hdr;
+ PAGE *h;
+ db_pgno_t maxpgno, stoppgno;
+ int drop_segment, ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ h = NULL;
+ if ((ret = __ham_dirty_meta(dbc, 0)) != 0)
+ return (ret);
+ hcp = (HASH_CURSOR *)dbc->internal;
+ hdr = hcp->hdr;
+
+ if ((ret = __ham_merge_pages(dbc,
+ hdr->max_bucket & hdr->low_mask, hdr->max_bucket, c_data)) != 0)
+ return (ret);
+
+ maxpgno = BUCKET_TO_PAGE(hcp, hdr->max_bucket);
+ drop_segment = hdr->max_bucket == (hdr->low_mask + 1);
+
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __ham_contract_log(dbp, dbc->txn, &LSN(hdr),
+ 0, PGNO(hdr), &LSN(hdr), hdr->max_bucket, maxpgno)) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(hdr));
+
+ hdr->max_bucket--;
+ /*
+ * If we are dropping a segment then adjust the spares table and masks
+ * and free the pages in that segment.
+ */
+ if (drop_segment) {
+ LOCK_CHECK_OFF(dbc->thread_info);
+ hdr->spares[__db_log2(hdr->max_bucket + 1) + 1] = PGNO_INVALID;
+ hdr->high_mask = hdr->low_mask;
+ hdr->low_mask >>= 1;
+ stoppgno = maxpgno + hdr->max_bucket + 1;
+ do {
+ if ((ret = __memp_fget(mpf, &maxpgno,
+ dbc->thread_info, dbc->txn,
+ DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &h)) != 0)
+ break;
+ if ((ret = __db_free(dbc, h, 0)) != 0)
+ break;
+ ret = 0;
+ } while (++maxpgno < stoppgno);
+ LOCK_CHECK_ON(dbc->thread_info);
+ }
+
+err: return (ret);
+}
+
+/*
+ * __ham_expand_table --
+ */
+static int
+__ham_expand_table(dbc)
+ DBC *dbc;
+{
+ DB *dbp;
+ DBMETA *mmeta;
+ DB_LOCK metalock;
+ DB_LSN lsn;
+ DB_MPOOLFILE *mpf;
+ HASH_CURSOR *hcp;
+ PAGE *h;
+ db_pgno_t pgno, mpgno;
+ u_int32_t logn, newalloc, new_bucket, old_bucket;
+ int got_meta, new_double, ret, t_ret;
+
+ LOCK_CHECK_OFF(dbc->thread_info);
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ hcp = (HASH_CURSOR *)dbc->internal;
+ if ((ret = __ham_dirty_meta(dbc, 0)) != 0)
+ return (ret);
+
+ LOCK_INIT(metalock);
+ mmeta = (DBMETA *) hcp->hdr;
+ mpgno = mmeta->pgno;
+ h = NULL;
+ newalloc = 0;
+ got_meta = 0;
+
+ /*
+ * If the split point is about to increase, make sure that we
+ * have enough extra pages. The calculation here is weird.
+ * We'd like to do this after we've upped max_bucket, but it's
+ * too late then because we've logged the meta-data split. What
+ * we'll do between then and now is increment max bucket and then
+ * see what the log of one greater than that is; here we have to
+ * look at the log of max + 2. VERY NASTY STUFF.
+ *
+ * We figure out what we need to do, then we log it, then request
+ * the pages from mpool. We don't want to fail after extending
+ * the file.
+ *
+ * If the page we are about to split into has already been allocated,
+ * then we simply need to get it to get its LSN. If it hasn't yet
+	 * been allocated, then we know its LSN is (0,0).
+ */
+
+ new_bucket = hcp->hdr->max_bucket + 1;
+ old_bucket = new_bucket & hcp->hdr->low_mask;
+
+ new_double = hcp->hdr->max_bucket == hcp->hdr->high_mask;
+ logn = __db_log2(new_bucket);
+
+ if (!new_double || hcp->hdr->spares[logn + 1] != PGNO_INVALID) {
+ /* Page exists; get it so we can get its LSN */
+ pgno = BUCKET_TO_PAGE(hcp, new_bucket);
+ if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
+ DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &h)) != 0)
+ goto err;
+ lsn = h->lsn;
+ } else {
+ /* Get the master meta-data page to do allocation. */
+ if (F_ISSET(dbp, DB_AM_SUBDB)) {
+ mpgno = PGNO_BASE_MD;
+ if ((ret = __db_lget(dbc,
+ 0, mpgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &mpgno, dbc->thread_info,
+ dbc->txn, DB_MPOOL_DIRTY, &mmeta)) != 0)
+ goto err;
+ got_meta = 1;
+ }
+ pgno = mmeta->last_pgno + 1;
+ ZERO_LSN(lsn);
+ newalloc = 1;
+ }
+
+ /* Log the meta-data split first. */
+ if (DBC_LOGGING(dbc)) {
+ /*
+ * We always log the page number of the first page of
+ * the allocation group. However, the LSN that we log
+ * is either the LSN on the first page (if we did not
+ * do the actual allocation here) or the LSN on the last
+ * page of the unit (if we did do the allocation here).
+ */
+ if ((ret = __ham_metagroup_log(dbp, dbc->txn,
+ &lsn, 0, hcp->hdr->max_bucket, mpgno, &mmeta->lsn,
+ hcp->hdr->dbmeta.pgno, &hcp->hdr->dbmeta.lsn,
+ pgno, &lsn, newalloc, mmeta->last_pgno)) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(lsn);
+
+ hcp->hdr->dbmeta.lsn = lsn;
+
+ if (new_double && hcp->hdr->spares[logn + 1] == PGNO_INVALID) {
+ /*
+ * We need to begin a new doubling and we have not allocated
+ * any pages yet. Read the last page in and initialize it to
+ * make the allocation contiguous. The pgno we calculated
+ * above is the first page allocated. The entry in spares is
+ * that page number minus any buckets already allocated (it
+	 * simplifies bucket-to-page translation). After we've set
+ * that, we calculate the last pgno.
+ */
+
+ pgno += hcp->hdr->max_bucket;
+
+ if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
+ DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &h)) != 0)
+ goto err;
+
+ hcp->hdr->spares[logn + 1] =
+ (pgno - new_bucket) - hcp->hdr->max_bucket;
+ mmeta->last_pgno = pgno;
+ mmeta->lsn = lsn;
+
+ P_INIT(h, dbp->pgsize,
+ pgno, PGNO_INVALID, PGNO_INVALID, 0, P_HASH);
+ }
+
+ /* Write out whatever page we ended up modifying. */
+ h->lsn = lsn;
+ if ((ret = __memp_fput(mpf, dbc->thread_info, h, dbc->priority)) != 0)
+ goto err;
+ h = NULL;
+
+ /*
+ * Update the meta-data page of this hash database.
+ */
+ hcp->hdr->max_bucket = new_bucket;
+ if (new_double) {
+ hcp->hdr->low_mask = hcp->hdr->high_mask;
+ hcp->hdr->high_mask = new_bucket | hcp->hdr->low_mask;
+ }
+
+err: if (got_meta)
+ if ((t_ret = __memp_fput(mpf,
+ dbc->thread_info, mmeta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __TLPUT(dbc, metalock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (h != NULL)
+ if ((t_ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Relocate records to the new bucket -- after releasing metapage. */
+ if (ret == 0)
+ ret = __ham_split_page(dbc, old_bucket, new_bucket);
+ LOCK_CHECK_ON(dbc->thread_info);
+
+ return (ret);
+}
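+
+/*
+ * Illustration of the doubling bookkeeping above (hypothetical numbers):
+ * with max_bucket == 3, low_mask == 0x1 and high_mask == 0x3, adding
+ * bucket 4 starts a new doubling, so afterwards low_mask == 0x3 and
+ * high_mask == (new_bucket | low_mask) == 0x7.
+ */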
+
+/*
+ * PUBLIC: u_int32_t __ham_call_hash __P((DBC *, u_int8_t *, u_int32_t));
+ */
+u_int32_t
+__ham_call_hash(dbc, k, len)
+ DBC *dbc;
+ u_int8_t *k;
+ u_int32_t len;
+{
+ DB *dbp;
+ HASH *hashp;
+ HASH_CURSOR *hcp;
+ u_int32_t n, bucket;
+
+ dbp = dbc->dbp;
+ hcp = (HASH_CURSOR *)dbc->internal;
+ hashp = dbp->h_internal;
+
+ n = (u_int32_t)(hashp->h_hash(dbp, k, len));
+
+ bucket = n & hcp->hdr->high_mask;
+ if (bucket > hcp->hdr->max_bucket)
+ bucket = bucket & hcp->hdr->low_mask;
+ return (bucket);
+}
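+
+/*
+ * Worked example of the masking above (hypothetical values): with
+ * max_bucket == 5, high_mask == 0x7 and low_mask == 0x3, a hash of 6
+ * masks to bucket 6, which does not exist yet, so it is re-masked with
+ * low_mask to bucket 2 -- the bucket that has not yet split in this
+ * doubling. A hash of 4 masks to bucket 4 and is used directly.
+ */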
+
+/*
+ * Check for duplicates, and call __db_ret appropriately. Release
+ * everything held by the cursor.
+ */
+static int
+__ham_dup_return(dbc, val, flags)
+ DBC *dbc;
+ DBT *val;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DBT *myval, tmp_val;
+ HASH_CURSOR *hcp;
+ PAGE *pp;
+ db_indx_t ndx;
+ db_pgno_t pgno;
+ u_int32_t off, tlen;
+ u_int8_t *hk, type;
+ int cmp, ret;
+ db_indx_t len;
+
+ /* Check for duplicate and return the first one. */
+ dbp = dbc->dbp;
+ hcp = (HASH_CURSOR *)dbc->internal;
+ ndx = H_DATAINDEX(hcp->indx);
+ type = HPAGE_TYPE(dbp, hcp->page, ndx);
+ pp = hcp->page;
+ myval = val;
+
+ /*
+ * There are 4 cases:
+	 * 1. We are not in a duplicate set: simply return; the upper layer
+ * will do the right thing.
+ * 2. We are looking at keys and stumbled onto a duplicate.
+ * 3. We are in the middle of a duplicate set. (ISDUP set)
+ * 4. We need to check for particular data match.
+ */
+
+ /* We should never get here with off-page dups. */
+ DB_ASSERT(dbp->env, type != H_OFFDUP);
+
+ /* Case 1 */
+ if (type != H_DUPLICATE && flags != DB_GET_BOTH &&
+ flags != DB_GET_BOTHC && flags != DB_GET_BOTH_RANGE)
+ return (0);
+
+ /*
+ * Here we check for the case where we just stumbled onto a
+ * duplicate. In this case, we do initialization and then
+ * let the normal duplicate code handle it. (Case 2)
+ */
+ if (!F_ISSET(hcp, H_ISDUP) && type == H_DUPLICATE) {
+ F_SET(hcp, H_ISDUP);
+ hcp->dup_tlen = LEN_HDATA(dbp, hcp->page,
+ hcp->hdr->dbmeta.pagesize, hcp->indx);
+ hk = H_PAIRDATA(dbp, hcp->page, hcp->indx);
+ if (flags == DB_LAST ||
+ flags == DB_PREV || flags == DB_PREV_NODUP) {
+ hcp->dup_off = 0;
+ do {
+ memcpy(&len,
+ HKEYDATA_DATA(hk) + hcp->dup_off,
+ sizeof(db_indx_t));
+ hcp->dup_off += DUP_SIZE(len);
+ } while (hcp->dup_off < hcp->dup_tlen);
+ hcp->dup_off -= DUP_SIZE(len);
+ } else {
+ memcpy(&len,
+ HKEYDATA_DATA(hk), sizeof(db_indx_t));
+ hcp->dup_off = 0;
+ }
+ hcp->dup_len = len;
+ }
+
+ /*
+ * If we are retrieving a specific key/data pair, then we
+ * may need to adjust the cursor before returning data.
+ * Case 4
+ */
+ if (flags == DB_GET_BOTH ||
+ flags == DB_GET_BOTHC || flags == DB_GET_BOTH_RANGE) {
+ if (F_ISSET(hcp, H_ISDUP)) {
+ /*
+ * If we're doing a join, search forward from the
+ * current position, not the beginning of the dup set.
+ */
+ if (flags == DB_GET_BOTHC)
+ F_SET(hcp, H_CONTINUE);
+
+ __ham_dsearch(dbc, val, &off, &cmp, flags);
+
+ /*
+ * This flag is set nowhere else and is safe to
+ * clear unconditionally.
+ */
+ F_CLR(hcp, H_CONTINUE);
+ hcp->dup_off = off;
+ } else {
+ hk = H_PAIRDATA(dbp, hcp->page, hcp->indx);
+ if (((HKEYDATA *)hk)->type == H_OFFPAGE) {
+ memcpy(&tlen,
+ HOFFPAGE_TLEN(hk), sizeof(u_int32_t));
+ memcpy(&pgno,
+ HOFFPAGE_PGNO(hk), sizeof(db_pgno_t));
+ if ((ret = __db_moff(dbc, val, pgno, tlen,
+ dbp->dup_compare, &cmp)) != 0)
+ return (ret);
+ cmp = -cmp;
+ } else {
+ /*
+ * We do not zero tmp_val since the comparison
+ * routines may only look at data and size.
+ */
+ tmp_val.data = HKEYDATA_DATA(hk);
+ tmp_val.size = LEN_HDATA(dbp, hcp->page,
+ dbp->pgsize, hcp->indx);
+ cmp = dbp->dup_compare == NULL ?
+ __bam_defcmp(dbp, &tmp_val, val) :
+ dbp->dup_compare(dbp, &tmp_val, val);
+ }
+
+ if (cmp > 0 && flags == DB_GET_BOTH_RANGE &&
+ F_ISSET(dbp, DB_AM_DUPSORT))
+ cmp = 0;
+ }
+
+ if (cmp != 0)
+ return (DB_NOTFOUND);
+ }
+
+ /*
+ * If we've already got the data for this value, or we're doing a bulk
+ * get, we don't want to return the data.
+ */
+ if (F_ISSET(dbc, DBC_MULTIPLE | DBC_MULTIPLE_KEY) ||
+ F_ISSET(val, DB_DBT_ISSET))
+ return (0);
+
+ /*
+	 * Now that everything is initialized, grab a duplicate if
+ * necessary.
+ */
+ if (F_ISSET(hcp, H_ISDUP)) { /* Case 3 */
+ /*
+ * Copy the DBT in case we are retrieving into user
+ * memory and we need the parameters for it. If the
+ * user requested a partial, then we need to adjust
+ * the user's parameters to get the partial of the
+ * duplicate which is itself a partial.
+ */
+ memcpy(&tmp_val, val, sizeof(*val));
+
+ if (F_ISSET(&tmp_val, DB_DBT_PARTIAL)) {
+ /*
+ * Take the user's length unless it would go
+ * beyond the end of the duplicate.
+ */
+ if (tmp_val.doff > hcp->dup_len)
+ tmp_val.dlen = 0;
+ else if (tmp_val.dlen + tmp_val.doff > hcp->dup_len)
+ tmp_val.dlen = hcp->dup_len - tmp_val.doff;
+
+ } else {
+ F_SET(&tmp_val, DB_DBT_PARTIAL);
+ tmp_val.dlen = hcp->dup_len;
+ tmp_val.doff = 0;
+ }
+
+ /*
+ * Set offset to the appropriate place within the
+ * current duplicate -- need to take into account
+ * both the dup_off and the current duplicate's
+ * length.
+ */
+ tmp_val.doff += hcp->dup_off + sizeof(db_indx_t);
+
+ myval = &tmp_val;
+ }
+
+ /*
+ * Finally, if we had a duplicate, pp, ndx, and myval should be
+ * set appropriately.
+ */
+ if ((ret = __db_ret(dbc, pp, ndx, myval,
+ &dbc->rdata->data, &dbc->rdata->ulen)) != 0) {
+ if (ret == DB_BUFFER_SMALL)
+ val->size = myval->size;
+ return (ret);
+ }
+
+ /*
+	 * In case we sent a temporary DBT off to __db_ret, set the real
+ * return values.
+ */
+ val->data = myval->data;
+ val->size = myval->size;
+
+ F_SET(val, DB_DBT_ISSET);
+
+ return (0);
+}
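+
+/*
+ * Example of the partial-read arithmetic above (illustrative numbers): for
+ * a duplicate at dup_off 20 with dup_len 7 and a caller's doff of 3, the
+ * adjusted doff is 20 + sizeof(db_indx_t) + 3 -- skip to the element, skip
+ * its leading length word, then seek 3 bytes into the duplicate itself.
+ */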
+
+/*
+ * Overwrite a record.
+ *
+ * PUBLIC: int __ham_overwrite __P((DBC *, DBT *, u_int32_t));
+ */
+int
+__ham_overwrite(dbc, nval, flags)
+ DBC *dbc;
+ DBT *nval;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DBT *myval, tmp_val, tmp_val2;
+ ENV *env;
+ HASH_CURSOR *hcp;
+ void *newrec;
+ u_int8_t *hk, *p;
+ u_int32_t len, nondup_size;
+ db_indx_t newsize;
+ int ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ hcp = (HASH_CURSOR *)dbc->internal;
+ if (F_ISSET(hcp, H_ISDUP)) {
+ /*
+ * This is an overwrite of a duplicate. We should never
+ * be off-page at this point.
+ */
+ DB_ASSERT(env, hcp->opd == NULL);
+ /* On page dups */
+ if (F_ISSET(nval, DB_DBT_PARTIAL)) {
+ /*
+ * We're going to have to get the current item, then
+ * construct the record, do any padding and do a
+ * replace.
+ */
+ memset(&tmp_val, 0, sizeof(tmp_val));
+ if ((ret =
+ __ham_dup_return(dbc, &tmp_val, DB_CURRENT)) != 0)
+ return (ret);
+
+ /* Figure out new size. */
+ nondup_size = tmp_val.size;
+ newsize = nondup_size;
+
+ /*
+ * Three cases:
+ * 1. strictly append (may need to allocate space
+ * for pad bytes; really gross).
+ * 2. overwrite some and append.
+ * 3. strictly overwrite.
+ */
+ if (nval->doff > nondup_size)
+ newsize +=
+ ((nval->doff - nondup_size) + nval->size);
+ else if (nval->doff + nval->dlen > nondup_size)
+ newsize += nval->size -
+ (nondup_size - nval->doff);
+ else
+ newsize += nval->size - nval->dlen;
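+
+			/*
+			 * Worked example (illustrative numbers): with
+			 * nondup_size == 10, doff 12/size 4 is a strict
+			 * append and yields newsize 16 (two pad bytes);
+			 * doff 8/dlen 4/size 6 overwrites the tail and
+			 * appends, yielding 14; doff 2/dlen 4/size 4 is a
+			 * strict overwrite and leaves newsize at 10.
+			 */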
+
+ /*
+ * Make sure that the new size doesn't put us over
+ * the onpage duplicate size in which case we need
+ * to convert to off-page duplicates.
+ */
+ if (ISBIG(hcp,
+ (hcp->dup_tlen - nondup_size) + newsize)) {
+ if ((ret = __ham_dup_convert(dbc)) != 0)
+ return (ret);
+ return (hcp->opd->am_put(hcp->opd,
+ NULL, nval, flags, NULL));
+ }
+
+ if ((ret = __os_malloc(dbp->env,
+ DUP_SIZE(newsize), &newrec)) != 0)
+ return (ret);
+ memset(&tmp_val2, 0, sizeof(tmp_val2));
+ F_SET(&tmp_val2, DB_DBT_PARTIAL);
+
+ /* Construct the record. */
+ p = newrec;
+ /* Initial size. */
+ memcpy(p, &newsize, sizeof(db_indx_t));
+ p += sizeof(db_indx_t);
+
+ /* First part of original record. */
+ len = nval->doff > tmp_val.size
+ ? tmp_val.size : nval->doff;
+ memcpy(p, tmp_val.data, len);
+ p += len;
+
+ if (nval->doff > tmp_val.size) {
+ /* Padding */
+ memset(p, 0, nval->doff - tmp_val.size);
+ p += nval->doff - tmp_val.size;
+ }
+
+ /* New bytes */
+ memcpy(p, nval->data, nval->size);
+ p += nval->size;
+
+ /* End of original record (if there is any) */
+ if (nval->doff + nval->dlen < tmp_val.size) {
+ len = (tmp_val.size - nval->doff) - nval->dlen;
+ memcpy(p, (u_int8_t *)tmp_val.data +
+ nval->doff + nval->dlen, len);
+ p += len;
+ }
+
+ /* Final size. */
+ memcpy(p, &newsize, sizeof(db_indx_t));
+
+ /*
+ * Make sure that the caller isn't corrupting
+ * the sort order.
+ */
+ if (dbp->dup_compare != NULL) {
+ tmp_val2.data =
+ (u_int8_t *)newrec + sizeof(db_indx_t);
+ tmp_val2.size = newsize;
+ if (dbp->dup_compare(
+ dbp, &tmp_val, &tmp_val2) != 0) {
+ __os_free(env, newrec);
+ return (__db_duperr(dbp, flags));
+ }
+ }
+
+ tmp_val2.data = newrec;
+ tmp_val2.size = DUP_SIZE(newsize);
+ tmp_val2.doff = hcp->dup_off;
+ tmp_val2.dlen = DUP_SIZE(hcp->dup_len);
+
+ ret = __ham_replpair(dbc, &tmp_val2, H_DUPLICATE);
+ __os_free(env, newrec);
+
+ /* Update cursor */
+ if (ret != 0)
+ return (ret);
+
+ if (newsize > nondup_size) {
+ if ((ret = __hamc_update(dbc,
+ (newsize - nondup_size),
+ DB_HAM_CURADJ_ADDMOD, 1)) != 0)
+ return (ret);
+ hcp->dup_tlen += (newsize - nondup_size);
+ } else {
+ if ((ret = __hamc_update(dbc,
+ (nondup_size - newsize),
+ DB_HAM_CURADJ_DELMOD, 1)) != 0)
+ return (ret);
+ hcp->dup_tlen -= (nondup_size - newsize);
+ }
+ hcp->dup_len = newsize;
+ return (0);
+ } else {
+ /* Check whether we need to convert to off page. */
+ if (ISBIG(hcp,
+ (hcp->dup_tlen - hcp->dup_len) + nval->size)) {
+ if ((ret = __ham_dup_convert(dbc)) != 0)
+ return (ret);
+ return (hcp->opd->am_put(hcp->opd,
+ NULL, nval, flags, NULL));
+ }
+
+ /* Make sure we maintain sort order. */
+ if (dbp->dup_compare != NULL) {
+ tmp_val2.data =
+ HKEYDATA_DATA(H_PAIRDATA(dbp, hcp->page,
+ hcp->indx)) + hcp->dup_off +
+ sizeof(db_indx_t);
+ tmp_val2.size = hcp->dup_len;
+ if (dbp->dup_compare(
+ dbp, nval, &tmp_val2) != 0) {
+ __db_errx(env, DB_STR("1131",
+ "Existing data sorts differently from put data"));
+ return (EINVAL);
+ }
+ }
+ /* Overwriting a complete duplicate. */
+ if ((ret =
+ __ham_make_dup(dbp->env, nval, &tmp_val,
+ &dbc->my_rdata.data, &dbc->my_rdata.ulen)) != 0)
+ return (ret);
+ /* Now fix what we are replacing. */
+ tmp_val.doff = hcp->dup_off;
+ tmp_val.dlen = DUP_SIZE(hcp->dup_len);
+
+ /* Update cursor */
+ if (nval->size > hcp->dup_len) {
+ if ((ret = __hamc_update(dbc,
+ (nval->size - hcp->dup_len),
+ DB_HAM_CURADJ_ADDMOD, 1)) != 0)
+ return (ret);
+ hcp->dup_tlen += (nval->size - hcp->dup_len);
+ } else {
+ if ((ret = __hamc_update(dbc,
+ (hcp->dup_len - nval->size),
+ DB_HAM_CURADJ_DELMOD, 1)) != 0)
+ return (ret);
+ hcp->dup_tlen -= (hcp->dup_len - nval->size);
+ }
+ hcp->dup_len = (db_indx_t)nval->size;
+ }
+ myval = &tmp_val;
+ } else if (!F_ISSET(nval, DB_DBT_PARTIAL)) {
+ /* Put/overwrite */
+ memcpy(&tmp_val, nval, sizeof(*nval));
+ F_SET(&tmp_val, DB_DBT_PARTIAL);
+ tmp_val.doff = 0;
+ hk = H_PAIRDATA(dbp, hcp->page, hcp->indx);
+ if (HPAGE_PTYPE(hk) == H_OFFPAGE)
+ memcpy(&tmp_val.dlen,
+ HOFFPAGE_TLEN(hk), sizeof(u_int32_t));
+ else
+ tmp_val.dlen = LEN_HDATA(dbp, hcp->page,
+ hcp->hdr->dbmeta.pagesize, hcp->indx);
+ myval = &tmp_val;
+ } else
+ /* Regular partial put */
+ myval = nval;
+
+ return (__ham_replpair(dbc, myval,
+ F_ISSET(hcp, H_ISDUP) ? H_DUPLICATE : H_KEYDATA));
+}
+
+/*
+ * Given a key and a cursor, sets the cursor to the page/ndx on which
+ * the key resides. If the key is found, the cursor H_OK flag is set
+ * and the pagep, bndx, pgno (dpagep, dndx, dpgno) fields are set.
+ * If the key is not found, the H_OK flag is not set. If the sought
+ * field is non-0, the pagep, bndx, pgno (dpagep, dndx, dpgno) fields
+ * are set indicating where an add might take place. If it is 0,
+ * none of the cursor pointer fields are valid.
+ * PUBLIC: int __ham_lookup __P((DBC *,
+ * PUBLIC: const DBT *, u_int32_t, db_lockmode_t, db_pgno_t *));
+ */
+int
+__ham_lookup(dbc, key, sought, mode, pgnop)
+ DBC *dbc;
+ const DBT *key;
+ u_int32_t sought;
+ db_lockmode_t mode;
+ db_pgno_t *pgnop;
+{
+ DB *dbp;
+ HASH_CURSOR *hcp;
+ db_pgno_t next_pgno;
+ int match, ret;
+ u_int8_t *dk;
+
+ dbp = dbc->dbp;
+ hcp = (HASH_CURSOR *)dbc->internal;
+
+ /*
+ * Set up cursor so that we're looking for space to add an item
+ * as we cycle through the pages looking for the key.
+ */
+ if ((ret = __ham_item_reset(dbc)) != 0)
+ return (ret);
+ hcp->seek_size = sought;
+
+ hcp->bucket = __ham_call_hash(dbc, (u_int8_t *)key->data, key->size);
+ hcp->pgno = BUCKET_TO_PAGE(hcp, hcp->bucket);
+	/* Look through all pages in the bucket for the key. */
+ if ((ret = __ham_get_cpage(dbc, mode)) != 0)
+ return (ret);
+
+ *pgnop = PGNO_INVALID;
+ if (hcp->indx == NDX_INVALID) {
+ hcp->indx = 0;
+ F_CLR(hcp, H_ISDUP);
+ }
+ while (hcp->pgno != PGNO_INVALID) {
+		/* Are we looking for space to insert an item? */
+ if (hcp->seek_size != 0 &&
+ hcp->seek_found_page == PGNO_INVALID &&
+ hcp->seek_size < P_FREESPACE(dbp, hcp->page)) {
+ hcp->seek_found_page = hcp->pgno;
+ hcp->seek_found_indx = NDX_INVALID;
+ }
+
+ if ((ret = __ham_getindex(dbc, hcp->page, key,
+ H_KEYDATA, &match, &hcp->indx)) != 0)
+ return (ret);
+
+ /*
+		 * If this is the first page in the bucket with space for
+		 * inserting the requested item, store the insert index to
+		 * save having to look it up again later.
+ */
+ if (hcp->seek_found_page == hcp->pgno)
+ hcp->seek_found_indx = hcp->indx;
+
+ if (match == 0) {
+ F_SET(hcp, H_OK);
+ dk = H_PAIRDATA(dbp, hcp->page, hcp->indx);
+ if (HPAGE_PTYPE(dk) == H_OFFDUP)
+ memcpy(pgnop, HOFFDUP_PGNO(dk),
+ sizeof(db_pgno_t));
+ return (0);
+ }
+
+		/* Move the cursor to the next page. */
+ if (NEXT_PGNO(hcp->page) == PGNO_INVALID)
+ break;
+ next_pgno = NEXT_PGNO(hcp->page);
+ hcp->indx = 0;
+ if ((ret = __ham_next_cpage(dbc, next_pgno)) != 0)
+ return (ret);
+ }
+ F_SET(hcp, H_NOMORE);
+ return (DB_NOTFOUND);
+}
+
+/*
+ * __ham_init_dbt --
+ * Initialize a dbt using some possibly already allocated storage
+ * for items.
+ *
+ * PUBLIC: int __ham_init_dbt __P((ENV *,
+ * PUBLIC: DBT *, u_int32_t, void **, u_int32_t *));
+ */
+int
+__ham_init_dbt(env, dbt, size, bufp, sizep)
+ ENV *env;
+ DBT *dbt;
+ u_int32_t size;
+ void **bufp;
+ u_int32_t *sizep;
+{
+ int ret;
+
+ memset(dbt, 0, sizeof(*dbt));
+ if (*sizep < size) {
+ if ((ret = __os_realloc(env, size, bufp)) != 0) {
+ *sizep = 0;
+ return (ret);
+ }
+ *sizep = size;
+ }
+ dbt->data = *bufp;
+ dbt->size = size;
+ return (0);
+}
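+
+/*
+ * Typical call pattern (sketch only; "needed" is a hypothetical size):
+ * grow and reuse a cursor-owned buffer, as the partial-put path above
+ * does with dbc->my_rdata:
+ *
+ *	if ((ret = __ham_init_dbt(env, &tmp_val, needed,
+ *	    &dbc->my_rdata.data, &dbc->my_rdata.ulen)) != 0)
+ *		return (ret);
+ */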
+
+/*
+ * Adjust the cursor after an insert or delete. The cursor passed is
+ * the one that was operated upon; we just need to check any of the
+ * others.
+ *
+ * len indicates the length of the item added/deleted
+ * add indicates if the item indicated by the cursor has just been
+ * added (add == 1) or deleted (add == 0).
+ * dup indicates if the addition occurred into a duplicate set.
+ *
+ * PUBLIC: int __hamc_update
+ * PUBLIC: __P((DBC *, u_int32_t, db_ham_curadj, int));
+ */
+static int
+__hamc_update_getorder(cp, dbc, orderp, pgno, is_dup, args)
+ DBC *dbc, *cp;
+ u_int32_t *orderp;
+ db_pgno_t pgno;
+ u_int32_t is_dup;
+ void *args;
+{
+ HASH_CURSOR *hcp, *lcp;
+
+ COMPQUIET(args, NULL);
+ COMPQUIET(pgno, 0);
+
+ hcp = (HASH_CURSOR *)dbc->internal;
+ if (cp == dbc || cp->dbtype != DB_HASH)
+ return (0);
+ lcp = (HASH_CURSOR *)cp->internal;
+ if (F_ISSET(lcp, H_DELETED) &&
+ hcp->pgno == lcp->pgno &&
+ hcp->indx == lcp->indx &&
+ *orderp < lcp->order &&
+ (!is_dup || hcp->dup_off == lcp->dup_off) &&
+ !MVCC_SKIP_CURADJ(cp, lcp->pgno))
+ *orderp = lcp->order;
+ return (0);
+}
+struct __hamc_update_setorder_args {
+ int was_mod, was_add;
+ u_int32_t len, order;
+ DB_TXN *my_txn;
+};
+
+static int
+__hamc_update_setorder(cp, dbc, foundp, pgno, is_dup, vargs)
+ DBC *dbc, *cp;
+ u_int32_t *foundp;
+ db_pgno_t pgno;
+ u_int32_t is_dup;
+ void *vargs;
+{
+ HASH_CURSOR *hcp, *lcp;
+ struct __hamc_update_setorder_args *args;
+
+ COMPQUIET(pgno, 0);
+
+ if (cp == dbc || cp->dbtype != DB_HASH)
+ return (0);
+
+ hcp = (HASH_CURSOR *)dbc->internal;
+ lcp = (HASH_CURSOR *)cp->internal;
+
+ if (lcp->pgno != hcp->pgno ||
+ lcp->indx == NDX_INVALID ||
+ MVCC_SKIP_CURADJ(cp, lcp->pgno))
+ return (0);
+
+ args = vargs;
+ /*
+ * We're about to move things out from under this
+ * cursor. Clear any cached streaming information.
+ */
+ lcp->stream_start_pgno = PGNO_INVALID;
+
+ if (args->my_txn != NULL && cp->txn != args->my_txn)
+ *foundp = 1;
+
+ if (!is_dup) {
+ if (args->was_add == 1) {
+ /*
+ * This routine is not called to add
+ * non-dup records which are always put
+ * at the end. It is only called from
+ * recovery in this case and the
+ * cursor will be marked deleted.
+ * We are "undeleting" so unmark all
+ * cursors with the same order.
+ */
+ if (lcp->indx == hcp->indx &&
+ F_ISSET(lcp, H_DELETED)) {
+ if (lcp->order == hcp->order)
+ F_CLR(lcp, H_DELETED);
+ else if (lcp->order >
+ hcp->order) {
+
+ /*
+ * If we've moved this cursor's
+ * index, split its order
+ * number--i.e., decrement it by
+ * enough so that the lowest
+ * cursor moved has order 1.
+					 * hcp->order is the split
+					 * point, so decrement by it.
+ */
+ lcp->order -=
+ hcp->order;
+ lcp->indx += 2;
+ }
+ } else if (lcp->indx >= hcp->indx)
+ lcp->indx += 2;
+ } else {
+ if (lcp->indx > hcp->indx) {
+ lcp->indx -= 2;
+ if (lcp->indx == hcp->indx &&
+ F_ISSET(lcp, H_DELETED))
+ lcp->order += args->order;
+ } else if (lcp->indx == hcp->indx &&
+ !F_ISSET(lcp, H_DELETED)) {
+ F_SET(lcp, H_DELETED);
+ F_CLR(lcp, H_ISDUP);
+ lcp->order = args->order;
+ }
+ }
+ } else if (lcp->indx == hcp->indx) {
+ /*
+ * Handle duplicates. This routine is only
+ * called for on page dups. Off page dups are
+ * handled by btree/rtree code.
+ */
+ if (args->was_add == 1) {
+ lcp->dup_tlen += args->len;
+ if (lcp->dup_off == hcp->dup_off &&
+ F_ISSET(hcp, H_DELETED) &&
+ F_ISSET(lcp, H_DELETED)) {
+ /* Abort of a delete. */
+ if (lcp->order == hcp->order)
+ F_CLR(lcp, H_DELETED);
+ else if (lcp->order >
+ hcp->order) {
+ lcp->order -=
+						    (hcp->order - 1);
+ lcp->dup_off += args->len;
+ }
+ } else if (lcp->dup_off >
+ hcp->dup_off || (!args->was_mod &&
+ lcp->dup_off == hcp->dup_off))
+ lcp->dup_off += args->len;
+ } else {
+ lcp->dup_tlen -= args->len;
+ if (lcp->dup_off > hcp->dup_off) {
+ lcp->dup_off -= args->len;
+ if (lcp->dup_off ==
+ hcp->dup_off &&
+ F_ISSET(lcp, H_DELETED))
+ lcp->order += args->order;
+ } else if (!args->was_mod &&
+ lcp->dup_off == hcp->dup_off &&
+ !F_ISSET(lcp, H_DELETED)) {
+ F_SET(lcp, H_DELETED);
+ lcp->order = args->order;
+ }
+ }
+ }
+ return (0);
+}
+
+int
+__hamc_update(dbc, len, operation, is_dup)
+ DBC *dbc;
+ u_int32_t len;
+ db_ham_curadj operation;
+ int is_dup;
+{
+ DB *dbp;
+ DB_LSN lsn;
+ HASH_CURSOR *hcp;
+ int ret;
+ u_int32_t found;
+ struct __hamc_update_setorder_args args;
+
+ dbp = dbc->dbp;
+ hcp = (HASH_CURSOR *)dbc->internal;
+
+ /*
+ * Adjustment will only be logged if this is a subtransaction.
+	 * Only subtransactions can abort and affect their parent
+	 * transaction's cursors.
+ */
+
+ args.my_txn = IS_SUBTRANSACTION(dbc->txn) ? dbc->txn : NULL;
+ args.len = len;
+
+ switch (operation) {
+ case DB_HAM_CURADJ_DEL:
+ args.was_mod = 0;
+ args.was_add = 0;
+ break;
+ case DB_HAM_CURADJ_ADD:
+ args.was_mod = 0;
+ args.was_add = 1;
+ break;
+ case DB_HAM_CURADJ_DELMOD:
+ args.was_mod = 1;
+ args.was_add = 0;
+ break;
+ case DB_HAM_CURADJ_ADDMOD:
+ args.was_mod = 1;
+ args.was_add = 1;
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ /*
+ * Calculate the order of this deleted record.
+	 * This will be one greater than the order of any cursor that is
+	 * pointing at this record and is already marked as deleted.
+ */
+ if (args.was_add == 0) {
+ if ((ret = __db_walk_cursors(dbp, dbc, __hamc_update_getorder,
+ &args.order, 0, (u_int32_t)is_dup, NULL)) != 0)
+ return (ret);
+ args.order++;
+ hcp->order = args.order;
+ }
+
+ if ((ret = __db_walk_cursors(dbp, dbc,
+ __hamc_update_setorder, &found, 0, (u_int32_t)is_dup, &args)) != 0)
+ return (ret);
+
+ if (found != 0 && DBC_LOGGING(dbc)) {
+ if ((ret = __ham_curadj_log(dbp, args.my_txn, &lsn, 0,
+ hcp->pgno, hcp->indx, len, hcp->dup_off,
+ (int)operation, is_dup, args.order)) != 0)
+ return (ret);
+ }
+
+ return (0);
+}
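+
+/*
+ * Example of the order calculation above (hypothetical): if two cursors
+ * already sit deleted on the same (pgno, indx) with orders 1 and 2, the
+ * walk in __hamc_update_getorder reports 2, so the cursor doing the new
+ * delete records order 3, keeping multiple deleted cursors at the same
+ * position distinguishable.
+ */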
+
+struct __ham_get_clist_args {
+ u_int nalloc, nused;
+ DBC **listp;
+};
+
+static int
+__ham_get_clist_func(dbc, my_dbc, countp, pgno, indx, vargs)
+ DBC *dbc, *my_dbc;
+ u_int32_t *countp;
+ db_pgno_t pgno;
+ u_int32_t indx;
+ void *vargs;
+{
+ int ret;
+ struct __ham_get_clist_args *args;
+
+ COMPQUIET(my_dbc, NULL);
+ COMPQUIET(countp, NULL);
+ args = vargs;
+ /*
+ * We match if dbc->pgno matches the specified
+ * pgno, and if either the dbc->indx matches
+ * or we weren't given an index.
+ */
+ if (dbc->internal->pgno == pgno &&
+ (indx == NDX_INVALID ||
+ dbc->internal->indx == indx) &&
+ !MVCC_SKIP_CURADJ(dbc, pgno)) {
+ if (args->nused >= args->nalloc) {
+ args->nalloc += 10;
+ if ((ret = __os_realloc(dbc->dbp->env,
+ args->nalloc * sizeof(HASH_CURSOR *),
+ &args->listp)) != 0)
+ return (ret);
+ }
+ args->listp[args->nused++] = dbc;
+ }
+ return (0);
+}
+/*
+ * __ham_get_clist --
+ *
+ * Get a list of cursors either on a particular bucket or on a particular
+ * page and index combination. The former is so that we can update
+ * cursors on a split. The latter is so we can update cursors when we
+ * move items off page.
+ *
+ * PUBLIC: int __ham_get_clist __P((DB *, db_pgno_t, u_int32_t, DBC ***));
+ */
+int
+__ham_get_clist(dbp, pgno, indx, listp)
+ DB *dbp;
+ db_pgno_t pgno;
+ u_int32_t indx;
+ DBC ***listp;
+{
+ ENV *env;
+ int ret;
+ u_int32_t count;
+ struct __ham_get_clist_args args;
+
+ env = dbp->env;
+ args.listp = NULL;
+ args.nalloc = args.nused = 0;
+
+ if ((ret = __db_walk_cursors(dbp, NULL,
+ __ham_get_clist_func, &count, pgno, indx, &args)) != 0)
+ return (ret);
+ if (args.listp != NULL) {
+ if (args.nused >= args.nalloc) {
+ args.nalloc++;
+ if ((ret = __os_realloc(env,
+ args.nalloc * sizeof(HASH_CURSOR *),
+ &args.listp)) != 0)
+ return (ret);
+ }
+ args.listp[args.nused] = NULL;
+ }
+ *listp = args.listp;
+ return (0);
+}
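+
+/*
+ * For illustration, a typical use of the returned list (a sketch that
+ * follows the pattern in __ham_dup_convert() in hash_dup.c): the array
+ * is NULL-terminated and the caller must free it.
+ *
+ *	DBC **hcs;
+ *	int c, ret;
+ *
+ *	if ((ret = __ham_get_clist(dbp, pgno, indx, &hcs)) != 0)
+ *		return (ret);
+ *	for (c = 0; hcs != NULL && hcs[c] != NULL; c++)
+ *		adjust(hcs[c]);
+ *	if (hcs != NULL)
+ *		__os_free(dbp->env, hcs);
+ *
+ * where adjust() stands in for whatever per-cursor fixup is needed.
+ */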
+
+static int
+__hamc_writelock(dbc)
+ DBC *dbc;
+{
+ DB_LOCK tmp_lock;
+ HASH_CURSOR *hcp;
+ int ret;
+
+ /*
+ * All we need do is acquire the lock and let the off-page
+ * dup tree do its thing.
+ */
+ if (!STD_LOCKING(dbc))
+ return (0);
+
+ hcp = (HASH_CURSOR *)dbc->internal;
+ ret = 0;
+	if (!LOCK_ISSET(hcp->lock) || hcp->lock_mode != DB_LOCK_WRITE) {
+ tmp_lock = hcp->lock;
+ if ((ret = __ham_lock_bucket(dbc, DB_LOCK_WRITE)) == 0 &&
+ tmp_lock.mode != DB_LOCK_WWRITE)
+ ret = __LPUT(dbc, tmp_lock);
+ }
+ return (ret);
+}
diff --git a/src/hash/hash.src b/src/hash/hash.src
new file mode 100644
index 00000000..e544c6f3
--- /dev/null
+++ b/src/hash/hash.src
@@ -0,0 +1,328 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+/*
+ * Copyright (c) 1995, 1996
+ * Margo Seltzer. All rights reserved.
+ */
+/*
+ * Copyright (c) 1995, 1996
+ * The President and Fellows of Harvard University. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Margo Seltzer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+DBPRIVATE
+PREFIX __ham
+
+INCLUDE #include "db_int.h"
+INCLUDE #include "dbinc/crypto.h"
+INCLUDE #include "dbinc/db_page.h"
+INCLUDE #include "dbinc/db_dispatch.h"
+INCLUDE #include "dbinc/db_am.h"
+INCLUDE #include "dbinc/hash.h"
+INCLUDE #include "dbinc/txn.h"
+INCLUDE
+
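+/*
+ * NOTE: this description language is processed by gen_rec.awk; the
+ * semantics sketched here are an assumption based on the generated
+ * hash_auto.c and hash_autop.c. Each BEGIN name version id ... END
+ * block is expanded into a __ham_<name>_args structure, a
+ * __ham_<name>_log() routine and a __ham_<name>_desc[] table. ARG
+ * lines are plain arguments with a printf format, DB is the database's
+ * log file id, POINTER logs the value a pointer refers to (here, page
+ * LSNs), DBT/HDR/PGDBT lines are byte strings, OP is an operation/type
+ * code, and BEGIN_COMPAT blocks describe older record versions kept so
+ * that old log files remain readable.
+ */
+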
+/*
+ * HASH-insdel: used for hash to insert/delete a pair of entries on a master
+ * page. The pair might be a regular key/data pair, or it might be the
+ * structures that refer to off-page items, duplicates or off-page duplicates.
+ * opcode - PUTPAIR/DELPAIR + big masks
+ * fileid - identifies the file referenced
+ * pgno - page within file
+ * ndx - index on the page of the item being added (item index)
+ * pagelsn - lsn on the page before the update
+ * key - the key being inserted
+ * data - the data being inserted
+ */
+BEGIN insdel 50 21
+ARG opcode u_int32_t lu
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+ARG ndx u_int32_t lu
+POINTER pagelsn DB_LSN * lu
+OP keytype u_int32_t lu
+HDR key DBT s
+OP datatype u_int32_t lu
+HDR data DBT s
+END
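+
+/*
+ * For illustration (a sketch assuming the usual gen_rec.awk expansion;
+ * the field list follows __ham_insdel_desc[] in hash_auto.c): after the
+ * common log-record header fields, the argument structure for the
+ * record above looks roughly like
+ *
+ *	typedef struct ___ham_insdel_args {
+ *		...header fields...
+ *		u_int32_t opcode;
+ *		int32_t fileid;
+ *		db_pgno_t pgno;
+ *		u_int32_t ndx;
+ *		DB_LSN pagelsn;
+ *		u_int32_t keytype;
+ *		DBT key;
+ *		u_int32_t datatype;
+ *		DBT data;
+ *	} __ham_insdel_args;
+ */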
+
+BEGIN_COMPAT insdel 42 21
+ARG opcode u_int32_t lu
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+ARG ndx u_int32_t lu
+POINTER pagelsn DB_LSN * lu
+DBT key DBT s
+DBT data DBT s
+END
+
+/*
+ * Used to add and remove overflow pages.
+ * prev_pgno is the previous page that is going to get modified to
+ * point to this one. If this is the first page in a chain
+ * then prev_pgno should be PGNO_INVALID.
+ * new_pgno is the page being allocated.
+ * next_pgno is the page that follows this one. On allocation,
+ * this should be PGNO_INVALID. For deletes, it may exist.
+ * pagelsn is the old lsn on the page.
+ */
+BEGIN newpage 42 22
+ARG opcode u_int32_t lu
+DB fileid int32_t ld
+ARG prev_pgno db_pgno_t lu
+POINTER prevlsn DB_LSN * lu
+ARG new_pgno db_pgno_t lu
+POINTER pagelsn DB_LSN * lu
+ARG next_pgno db_pgno_t lu
+POINTER nextlsn DB_LSN * lu
+END
+
+/*
+ * Splitting requires two types of log messages; this one logs the
+ * data on the original page. To redo the split, we have to visit the
+ * new page (or pages) and add the items back on the page if they are not
+ * yet there.
+ */
+BEGIN splitdata 42 24
+DB fileid int32_t ld
+ARG opcode u_int32_t lu
+ARG pgno db_pgno_t lu
+PGDBT pageimage DBT s
+POINTER pagelsn DB_LSN * lu
+END
+
+/*
+ * HASH-replace: is used for hash to handle partial puts that only
+ * affect a single master page.
+ * fileid - identifies the file referenced
+ * pgno - page within file
+ * ndx - index on the page of the item being modified (item index)
+ * pagelsn - lsn on the page before the update
+ * off - offset in the old item where the new item is going.
+ * olditem - DBT that describes the part of the item being replaced.
+ * newitem - DBT of the new item.
+ * makedup - this was a replacement that made an item a duplicate.
+ */
+BEGIN replace 50 25
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+ARG ndx u_int32_t lu
+POINTER pagelsn DB_LSN * lu
+ARG off int32_t ld
+OP oldtype u_int32_t lu
+HDR olditem DBT s
+OP newtype u_int32_t lu
+HDR newitem DBT s
+END
+
+BEGIN_COMPAT replace 42 25
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+ARG ndx u_int32_t lu
+POINTER pagelsn DB_LSN * lu
+ARG off int32_t ld
+DBT olditem DBT s
+DBT newitem DBT s
+ARG makedup u_int32_t lu
+END
+
+/*
+ * Used when we empty the first page in a bucket and there are pages after
+ * it. The page after it gets copied into the bucket page (since bucket
+ * pages have to be in fixed locations).
+ * pgno: the bucket page
+ * pagelsn: the old LSN on the bucket page
+ * next_pgno: the page number of the next page
+ * nnext_pgno: page after next_pgno (may need to change its prev)
+ * nnextlsn: the LSN of nnext_pgno.
+ */
+BEGIN copypage 42 28
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER pagelsn DB_LSN * lu
+ARG next_pgno db_pgno_t lu
+POINTER nextlsn DB_LSN * lu
+ARG nnext_pgno db_pgno_t lu
+POINTER nnextlsn DB_LSN * lu
+PGDBT page DBT s
+END
+
+/*
+ * This record logs the meta-data aspects of a split operation. It has enough
+ * information to record both an individual page allocation and a group
+ * allocation, which we do because in sub databases the pages in a hash
+ * doubling must be contiguous. If we do a group allocation, the number of
+ * pages allocated is bucket + 1, and pgno is the page number of the first
+ * newly allocated bucket.
+ *
+ * bucket: Old maximum bucket number.
+ * mmpgno: Master meta-data page number (0 if same as mpgno).
+ * mmetalsn: Lsn of the master meta-data page.
+ * mpgno: Meta-data page number.
+ * metalsn: Lsn of the meta-data page.
+ * pgno: Page allocated to bucket + 1 (first newly allocated page)
+ * pagelsn: Lsn of either the first page allocated (if newalloc == 0) or
+ * the last page allocated (if newalloc == 1).
+ * newalloc: 1 indicates that this record did the actual allocation;
+ * 0 indicates that the pages were already allocated from a
+ * previous (failed) allocation.
+ * last_pgno: the last page in the file before this op (4.3+).
+ */
+BEGIN_COMPAT metagroup 42 29
+DB fileid int32_t ld
+ARG bucket u_int32_t lu
+ARG mmpgno db_pgno_t lu
+POINTER mmetalsn DB_LSN * lu
+ARG mpgno db_pgno_t lu
+POINTER metalsn DB_LSN * lu
+ARG pgno db_pgno_t lu
+POINTER pagelsn DB_LSN * lu
+ARG newalloc u_int32_t lu
+END
+
+BEGIN metagroup 43 29
+DB fileid int32_t ld
+ARG bucket u_int32_t lu
+ARG mmpgno db_pgno_t lu
+POINTER mmetalsn DB_LSN * lu
+ARG mpgno db_pgno_t lu
+POINTER metalsn DB_LSN * lu
+ARG pgno db_pgno_t lu
+POINTER pagelsn DB_LSN * lu
+ARG newalloc u_int32_t lu
+ARG last_pgno db_pgno_t lu
+END
+
+/*
+ * groupalloc
+ *
+ * This is used in conjunction with MPOOL_NEW_GROUP when we are creating
+ * a new database to make sure that we recreate or reclaim free pages
+ * when we allocate a chunk of contiguous ones during database creation.
+ *
+ * meta_lsn: meta-data lsn
+ * start_pgno: starting page number
+ * num: number of allocated pages
+ * unused: unused, historically the meta-data free list page number
+ * last_pgno: the last page in the file before this op (4.3+).
+ */
+BEGIN_COMPAT groupalloc 42 32
+DB fileid int32_t ld
+POINTER meta_lsn DB_LSN * lu
+ARG start_pgno db_pgno_t lu
+ARG num u_int32_t lu
+ARG free db_pgno_t lu
+END
+
+BEGIN groupalloc 43 32
+DB fileid int32_t ld
+POINTER meta_lsn DB_LSN * lu
+ARG start_pgno db_pgno_t lu
+ARG num u_int32_t lu
+ARG unused db_pgno_t lu
+ARG last_pgno db_pgno_t lu
+END
+
+/*
+ * Changeslot
+ * Change the entry in a spares table slot from the "old" page to the "new"
+ * page.
+ */
+BEGIN changeslot 50 35
+DB fileid int32_t ld
+POINTER meta_lsn DB_LSN * lu
+ARG slot u_int32_t lu
+ARG old db_pgno_t lu
+ARG new db_pgno_t lu
+END
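+
+/*
+ * An illustrative note (an assumption, inferred from __ham_compact_hash()
+ * in hash_compact.c): slot i of the spares table is the base for one
+ * doubling of the table; slot 0 covers bucket 0 and slot i > 0 covers
+ * buckets [2^(i-1), 2^i), so a bucket's page number is found roughly as
+ *
+ *	pgno = bucket + spares[__db_log2(bucket + 1)];
+ */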
+
+/*
+ * Contract
+ * Contract the hash table by removing the last "bucket". "pgno" is the
+ * page number for that bucket.
+ */
+BEGIN contract 50 37
+DB fileid int32_t ld
+ARG meta db_pgno_t lu
+POINTER meta_lsn DB_LSN * lu
+ARG bucket u_int32_t lu
+ARG pgno db_pgno_t lu
+END
+
+/*
+ * Records for backing out cursor adjustment.
+ * curadj - added or deleted a record or a dup
+ * within a record.
+ * pgno - page that was affected
+ * indx - index of the record affected.
+ * len - if a dup, its length.
+ * dup_off - if a dup, its offset.
+ * add - 1 if add, 0 if delete.
+ * is_dup - 1 if dup, 0 otherwise.
+ * order - order assigned to this deleted record or dup.
+ *
+ * chgpg - removed a page; the records moved to a new page.
+ * mode - CHGPG page was deleted or records moved to a new page.
+ *	- SPLIT we split a bucket.
+ *	- DUP we converted to off-page duplicates.
+ * old_pgno, new_pgno - old and new page numbers.
+ * old_indx, new_indx - old and new index numbers, NDX_INVALID if
+ *	it affects all records on the page.
+ * For three opcodes new in 3.3 (DB_HAM_DELFIRSTPG, DELMIDPG,
+ * and DELLASTPG), we overload old_indx and new_indx to avoid
+ * needing a new log record type: old_indx stores the only
+ * indx of interest to these records, and new_indx stores the
+ * order that's assigned to the lowest deleted record we're
+ * moving.
+ */
+BEGIN curadj 42 33
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+ARG indx u_int32_t lu
+ARG len u_int32_t lu
+ARG dup_off u_int32_t lu
+ARG add int ld
+ARG is_dup int ld
+ARG order u_int32_t lu
+END
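+
+/*
+ * For illustration, the logging routine generated for this record is
+ * invoked from __hamc_update() roughly as (a sketch of the call that
+ * appears there; my_txn, operation and order are that function's
+ * locals):
+ *
+ *	ret = __ham_curadj_log(dbp, my_txn, &lsn, 0, hcp->pgno,
+ *	    hcp->indx, len, hcp->dup_off, (int)operation, is_dup, order);
+ */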
+
+BEGIN chgpg 42 34
+DB fileid int32_t ld
+ARG mode db_ham_mode ld
+ARG old_pgno db_pgno_t lu
+ARG new_pgno db_pgno_t lu
+ARG old_indx u_int32_t lu
+ARG new_indx u_int32_t lu
+END
+
diff --git a/src/hash/hash_auto.c b/src/hash/hash_auto.c
new file mode 100644
index 00000000..4adb6cd9
--- /dev/null
+++ b/src/hash/hash_auto.c
@@ -0,0 +1,209 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_dispatch.h"
+#include "dbinc/db_am.h"
+#include "dbinc/hash.h"
+#include "dbinc/txn.h"
+
+DB_LOG_RECSPEC __ham_insdel_desc[] = {
+ {LOGREC_ARG, SSZ(__ham_insdel_args, opcode), "opcode", "%lu"},
+ {LOGREC_DB, SSZ(__ham_insdel_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__ham_insdel_args, pgno), "pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__ham_insdel_args, ndx), "ndx", "%lu"},
+ {LOGREC_POINTER, SSZ(__ham_insdel_args, pagelsn), "pagelsn", ""},
+ {LOGREC_OP, SSZ(__ham_insdel_args, keytype), "keytype", "%lu"},
+ {LOGREC_HDR, SSZ(__ham_insdel_args, key), "key", ""},
+ {LOGREC_OP, SSZ(__ham_insdel_args, datatype), "datatype", "%lu"},
+ {LOGREC_HDR, SSZ(__ham_insdel_args, data), "data", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __ham_insdel_42_desc[] = {
+ {LOGREC_ARG, SSZ(__ham_insdel_42_args, opcode), "opcode", "%lu"},
+ {LOGREC_DB, SSZ(__ham_insdel_42_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__ham_insdel_42_args, pgno), "pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__ham_insdel_42_args, ndx), "ndx", "%lu"},
+ {LOGREC_POINTER, SSZ(__ham_insdel_42_args, pagelsn), "pagelsn", ""},
+ {LOGREC_DBT, SSZ(__ham_insdel_42_args, key), "key", ""},
+ {LOGREC_DBT, SSZ(__ham_insdel_42_args, data), "data", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __ham_newpage_desc[] = {
+ {LOGREC_ARG, SSZ(__ham_newpage_args, opcode), "opcode", "%lu"},
+ {LOGREC_DB, SSZ(__ham_newpage_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__ham_newpage_args, prev_pgno), "prev_pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__ham_newpage_args, prevlsn), "prevlsn", ""},
+ {LOGREC_ARG, SSZ(__ham_newpage_args, new_pgno), "new_pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__ham_newpage_args, pagelsn), "pagelsn", ""},
+ {LOGREC_ARG, SSZ(__ham_newpage_args, next_pgno), "next_pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__ham_newpage_args, nextlsn), "nextlsn", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __ham_splitdata_desc[] = {
+ {LOGREC_DB, SSZ(__ham_splitdata_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__ham_splitdata_args, opcode), "opcode", "%lu"},
+ {LOGREC_ARG, SSZ(__ham_splitdata_args, pgno), "pgno", "%lu"},
+ {LOGREC_PGDBT, SSZ(__ham_splitdata_args, pageimage), "pageimage", ""},
+ {LOGREC_POINTER, SSZ(__ham_splitdata_args, pagelsn), "pagelsn", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __ham_replace_desc[] = {
+ {LOGREC_DB, SSZ(__ham_replace_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__ham_replace_args, pgno), "pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__ham_replace_args, ndx), "ndx", "%lu"},
+ {LOGREC_POINTER, SSZ(__ham_replace_args, pagelsn), "pagelsn", ""},
+ {LOGREC_ARG, SSZ(__ham_replace_args, off), "off", "%ld"},
+ {LOGREC_OP, SSZ(__ham_replace_args, oldtype), "oldtype", "%lu"},
+ {LOGREC_HDR, SSZ(__ham_replace_args, olditem), "olditem", ""},
+ {LOGREC_OP, SSZ(__ham_replace_args, newtype), "newtype", "%lu"},
+ {LOGREC_HDR, SSZ(__ham_replace_args, newitem), "newitem", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __ham_replace_42_desc[] = {
+ {LOGREC_DB, SSZ(__ham_replace_42_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__ham_replace_42_args, pgno), "pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__ham_replace_42_args, ndx), "ndx", "%lu"},
+ {LOGREC_POINTER, SSZ(__ham_replace_42_args, pagelsn), "pagelsn", ""},
+ {LOGREC_ARG, SSZ(__ham_replace_42_args, off), "off", "%ld"},
+ {LOGREC_DBT, SSZ(__ham_replace_42_args, olditem), "olditem", ""},
+ {LOGREC_DBT, SSZ(__ham_replace_42_args, newitem), "newitem", ""},
+ {LOGREC_ARG, SSZ(__ham_replace_42_args, makedup), "makedup", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __ham_copypage_desc[] = {
+ {LOGREC_DB, SSZ(__ham_copypage_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__ham_copypage_args, pgno), "pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__ham_copypage_args, pagelsn), "pagelsn", ""},
+ {LOGREC_ARG, SSZ(__ham_copypage_args, next_pgno), "next_pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__ham_copypage_args, nextlsn), "nextlsn", ""},
+ {LOGREC_ARG, SSZ(__ham_copypage_args, nnext_pgno), "nnext_pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__ham_copypage_args, nnextlsn), "nnextlsn", ""},
+ {LOGREC_PGDBT, SSZ(__ham_copypage_args, page), "page", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __ham_metagroup_42_desc[] = {
+ {LOGREC_DB, SSZ(__ham_metagroup_42_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__ham_metagroup_42_args, bucket), "bucket", "%lu"},
+ {LOGREC_ARG, SSZ(__ham_metagroup_42_args, mmpgno), "mmpgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__ham_metagroup_42_args, mmetalsn), "mmetalsn", ""},
+ {LOGREC_ARG, SSZ(__ham_metagroup_42_args, mpgno), "mpgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__ham_metagroup_42_args, metalsn), "metalsn", ""},
+ {LOGREC_ARG, SSZ(__ham_metagroup_42_args, pgno), "pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__ham_metagroup_42_args, pagelsn), "pagelsn", ""},
+ {LOGREC_ARG, SSZ(__ham_metagroup_42_args, newalloc), "newalloc", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __ham_metagroup_desc[] = {
+ {LOGREC_DB, SSZ(__ham_metagroup_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__ham_metagroup_args, bucket), "bucket", "%lu"},
+ {LOGREC_ARG, SSZ(__ham_metagroup_args, mmpgno), "mmpgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__ham_metagroup_args, mmetalsn), "mmetalsn", ""},
+ {LOGREC_ARG, SSZ(__ham_metagroup_args, mpgno), "mpgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__ham_metagroup_args, metalsn), "metalsn", ""},
+ {LOGREC_ARG, SSZ(__ham_metagroup_args, pgno), "pgno", "%lu"},
+ {LOGREC_POINTER, SSZ(__ham_metagroup_args, pagelsn), "pagelsn", ""},
+ {LOGREC_ARG, SSZ(__ham_metagroup_args, newalloc), "newalloc", "%lu"},
+ {LOGREC_ARG, SSZ(__ham_metagroup_args, last_pgno), "last_pgno", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __ham_groupalloc_42_desc[] = {
+ {LOGREC_DB, SSZ(__ham_groupalloc_42_args, fileid), "fileid", ""},
+ {LOGREC_POINTER, SSZ(__ham_groupalloc_42_args, meta_lsn), "meta_lsn", ""},
+ {LOGREC_ARG, SSZ(__ham_groupalloc_42_args, start_pgno), "start_pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__ham_groupalloc_42_args, num), "num", "%lu"},
+ {LOGREC_ARG, SSZ(__ham_groupalloc_42_args, free), "free", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __ham_groupalloc_desc[] = {
+ {LOGREC_DB, SSZ(__ham_groupalloc_args, fileid), "fileid", ""},
+ {LOGREC_POINTER, SSZ(__ham_groupalloc_args, meta_lsn), "meta_lsn", ""},
+ {LOGREC_ARG, SSZ(__ham_groupalloc_args, start_pgno), "start_pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__ham_groupalloc_args, num), "num", "%lu"},
+ {LOGREC_ARG, SSZ(__ham_groupalloc_args, unused), "unused", "%lu"},
+ {LOGREC_ARG, SSZ(__ham_groupalloc_args, last_pgno), "last_pgno", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __ham_changeslot_desc[] = {
+ {LOGREC_DB, SSZ(__ham_changeslot_args, fileid), "fileid", ""},
+ {LOGREC_POINTER, SSZ(__ham_changeslot_args, meta_lsn), "meta_lsn", ""},
+ {LOGREC_ARG, SSZ(__ham_changeslot_args, slot), "slot", "%lu"},
+ {LOGREC_ARG, SSZ(__ham_changeslot_args, old), "old", "%lu"},
+ {LOGREC_ARG, SSZ(__ham_changeslot_args, new), "new", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __ham_contract_desc[] = {
+ {LOGREC_DB, SSZ(__ham_contract_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__ham_contract_args, meta), "meta", "%lu"},
+ {LOGREC_POINTER, SSZ(__ham_contract_args, meta_lsn), "meta_lsn", ""},
+ {LOGREC_ARG, SSZ(__ham_contract_args, bucket), "bucket", "%lu"},
+ {LOGREC_ARG, SSZ(__ham_contract_args, pgno), "pgno", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __ham_curadj_desc[] = {
+ {LOGREC_DB, SSZ(__ham_curadj_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__ham_curadj_args, pgno), "pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__ham_curadj_args, indx), "indx", "%lu"},
+ {LOGREC_ARG, SSZ(__ham_curadj_args, len), "len", "%lu"},
+ {LOGREC_ARG, SSZ(__ham_curadj_args, dup_off), "dup_off", "%lu"},
+ {LOGREC_ARG, SSZ(__ham_curadj_args, add), "add", "%ld"},
+ {LOGREC_ARG, SSZ(__ham_curadj_args, is_dup), "is_dup", "%ld"},
+ {LOGREC_ARG, SSZ(__ham_curadj_args, order), "order", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __ham_chgpg_desc[] = {
+ {LOGREC_DB, SSZ(__ham_chgpg_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__ham_chgpg_args, mode), "mode", "%ld"},
+ {LOGREC_ARG, SSZ(__ham_chgpg_args, old_pgno), "old_pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__ham_chgpg_args, new_pgno), "new_pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__ham_chgpg_args, old_indx), "old_indx", "%lu"},
+ {LOGREC_ARG, SSZ(__ham_chgpg_args, new_indx), "new_indx", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+/*
+ * PUBLIC: int __ham_init_recover __P((ENV *, DB_DISTAB *));
+ */
+int
+__ham_init_recover(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_insdel_recover, DB___ham_insdel)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_newpage_recover, DB___ham_newpage)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_splitdata_recover, DB___ham_splitdata)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_replace_recover, DB___ham_replace)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_copypage_recover, DB___ham_copypage)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_metagroup_recover, DB___ham_metagroup)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_groupalloc_recover, DB___ham_groupalloc)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_changeslot_recover, DB___ham_changeslot)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_contract_recover, DB___ham_contract)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_curadj_recover, DB___ham_curadj)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_chgpg_recover, DB___ham_chgpg)) != 0)
+ return (ret);
+ return (0);
+}
diff --git a/src/hash/hash_autop.c b/src/hash/hash_autop.c
new file mode 100644
index 00000000..f1ef0042
--- /dev/null
+++ b/src/hash/hash_autop.c
@@ -0,0 +1,314 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+
+#ifdef HAVE_HASH
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_dispatch.h"
+#include "dbinc/db_am.h"
+#include "dbinc/hash.h"
+#include "dbinc/txn.h"
+
+/*
+ * PUBLIC: int __ham_insdel_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_insdel_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__ham_insdel", __ham_insdel_desc, info));
+}
+
+/*
+ * PUBLIC: int __ham_insdel_42_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_insdel_42_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__ham_insdel_42", __ham_insdel_42_desc, info));
+}
+
+/*
+ * PUBLIC: int __ham_newpage_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_newpage_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__ham_newpage", __ham_newpage_desc, info));
+}
+
+/*
+ * PUBLIC: int __ham_splitdata_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_splitdata_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__ham_splitdata", __ham_splitdata_desc, info));
+}
+
+/*
+ * PUBLIC: int __ham_replace_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_replace_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__ham_replace", __ham_replace_desc, info));
+}
+
+/*
+ * PUBLIC: int __ham_replace_42_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_replace_42_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__ham_replace_42", __ham_replace_42_desc, info));
+}
+
+/*
+ * PUBLIC: int __ham_copypage_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_copypage_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__ham_copypage", __ham_copypage_desc, info));
+}
+
+/*
+ * PUBLIC: int __ham_metagroup_42_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_metagroup_42_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__ham_metagroup_42", __ham_metagroup_42_desc, info));
+}
+
+/*
+ * PUBLIC: int __ham_metagroup_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_metagroup_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__ham_metagroup", __ham_metagroup_desc, info));
+}
+
+/*
+ * PUBLIC: int __ham_groupalloc_42_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_groupalloc_42_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__ham_groupalloc_42", __ham_groupalloc_42_desc, info));
+}
+
+/*
+ * PUBLIC: int __ham_groupalloc_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_groupalloc_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__ham_groupalloc", __ham_groupalloc_desc, info));
+}
+
+/*
+ * PUBLIC: int __ham_changeslot_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_changeslot_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__ham_changeslot", __ham_changeslot_desc, info));
+}
+
+/*
+ * PUBLIC: int __ham_contract_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_contract_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__ham_contract", __ham_contract_desc, info));
+}
+
+/*
+ * PUBLIC: int __ham_curadj_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_curadj_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__ham_curadj", __ham_curadj_desc, info));
+}
+
+/*
+ * PUBLIC: int __ham_chgpg_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_chgpg_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__ham_chgpg", __ham_chgpg_desc, info));
+}
+
+/*
+ * PUBLIC: int __ham_init_print __P((ENV *, DB_DISTAB *));
+ */
+int
+__ham_init_print(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_insdel_print, DB___ham_insdel)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_newpage_print, DB___ham_newpage)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_splitdata_print, DB___ham_splitdata)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_replace_print, DB___ham_replace)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_copypage_print, DB___ham_copypage)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_metagroup_print, DB___ham_metagroup)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_groupalloc_print, DB___ham_groupalloc)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_changeslot_print, DB___ham_changeslot)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_contract_print, DB___ham_contract)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_curadj_print, DB___ham_curadj)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_chgpg_print, DB___ham_chgpg)) != 0)
+ return (ret);
+ return (0);
+}
+#endif /* HAVE_HASH */
diff --git a/src/hash/hash_compact.c b/src/hash/hash_compact.c
new file mode 100644
index 00000000..83b5ffb1
--- /dev/null
+++ b/src/hash/hash_compact.c
@@ -0,0 +1,549 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/lock.h"
+#include "dbinc/txn.h"
+#include "dbinc/mp.h"
+
+static int __ham_copy_data __P((DBC *, PAGE *, DB_COMPACT *, int *));
+static int __ham_truncate_overflow __P((DBC *, u_int32_t, DB_COMPACT *, int *));
+
+/*
+ * __ham_compact_int -- internal HASH compaction routine.
+ *
+ * PUBLIC: int __ham_compact_int __P((DBC *,
+ * PUBLIC: DBT *, DBT *, u_int32_t, DB_COMPACT *, int *, u_int32_t));
+ */
+int
+__ham_compact_int(dbc, start, stop, factor, c_data, donep, flags)
+ DBC *dbc;
+ DBT *start, *stop;
+ u_int32_t factor;
+ DB_COMPACT *c_data;
+ int *donep;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ HASH_CURSOR *hcp;
+ db_pgno_t origpgno, pgno;
+ int check_trunc, pgs_done, ret, t_ret;
+ u_int32_t empty_buckets, i, stop_bucket;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ hcp = (HASH_CURSOR *)dbc->internal;
+ pgs_done = 0;
+ empty_buckets = 0;
+ check_trunc = c_data->compact_truncate != PGNO_INVALID;
+ if ((ret = __ham_get_meta(dbc)) != 0)
+ return (ret);
+
+ if (stop != NULL && stop->size != 0)
+ stop_bucket = *(u_int32_t *)stop->data;
+ else
+ stop_bucket = hcp->hdr->max_bucket;
+
+ if (start != NULL && start->size != 0)
+ hcp->bucket = *(u_int32_t *)start->data;
+ else
+ hcp->bucket = 0;
+
+ for (; hcp->bucket <= stop_bucket && ret == 0; hcp->bucket++) {
+ /*
+ * For each bucket first move records toward the head of
+ * the bucket.
+ */
+ hcp->indx = NDX_INVALID;
+ F_CLR(hcp, H_ISDUP);
+ hcp->pgno = BUCKET_TO_PAGE(hcp, hcp->bucket);
+ pgno = PGNO_INVALID;
+ ret = __ham_item_next(dbc, DB_LOCK_WRITE, &pgno);
+
+ /*
+ * If the bucket is empty, just note it, otherwise process it.
+ * If there are any records there must be some in the head
+ * of the bucket.
+ */
+		if (ret == DB_NOTFOUND) {
+ empty_buckets++;
+ c_data->compact_pages_examine++;
+ DB_ASSERT(dbp->env,
+ PREV_PGNO(hcp->page) == PGNO_INVALID &&
+ NEXT_PGNO(hcp->page) == PGNO_INVALID);
+ goto err;
+ } else if (ret != 0)
+ break;
+ c_data->compact_pages_examine++;
+
+ if (NEXT_PGNO(hcp->page) != PGNO_INVALID) {
+ if ((ret =
+ __ham_compact_bucket(dbc, c_data, &pgs_done)) != 0)
+ goto err;
+ pgno = PGNO_INVALID;
+ if ((ret = __ham_item(dbc, DB_LOCK_WRITE, &pgno)) != 0)
+ goto err;
+ }
+
+ /*
+		 * Loop through the items on this page of the bucket and
+		 * process overflow records and off-page duplicate sets.
+ */
+ while (ret == 0) {
+ /* Handle off page duplicate trees. */
+ if (pgno == PGNO_INVALID)
+ goto no_opd;
+ if (check_trunc &&
+ pgno > c_data->compact_truncate) {
+ c_data->compact_pages_examine++;
+ /*
+ * Truncate this page if possible.
+ * We must update the parent here
+ * because the page number is
+ * not aligned.
+ */
+ if ((ret = __memp_dirty(mpf, &hcp->page,
+ dbc->thread_info,
+ dbc->txn, dbc->priority, 0)) != 0)
+ break;
+ origpgno = pgno;
+ if ((ret = __db_truncate_root(dbc, hcp->page,
+ H_DATAINDEX(hcp->indx), &pgno, 0)) != 0)
+ break;
+ if (pgno != origpgno) {
+ memcpy(HOFFDUP_PGNO(H_PAIRDATA(dbp,
+ hcp->page, hcp->indx)),
+ &pgno, sizeof(db_pgno_t));
+ pgs_done++;
+ c_data->compact_pages--;
+ }
+ }
+ /*
+ * Compact the off page duplicate tree.
+ */
+ if ((ret = __bam_compact_opd(dbc,
+ pgno, NULL, factor, c_data, &pgs_done)) != 0)
+ break;
+
+no_opd: if (check_trunc && HPAGE_PTYPE(H_PAIRDATA(
+ dbp, hcp->page, hcp->indx)) == H_OFFPAGE) {
+ /* This is an overflow chain. */
+ if ((ret = __ham_truncate_overflow(dbc,
+ H_DATAINDEX(hcp->indx),
+ c_data, &pgs_done)) != 0)
+ break;
+ }
+
+ /* Check for an overflow key. */
+ if (check_trunc && HPAGE_PTYPE(H_PAIRKEY(
+ dbp, hcp->page, hcp->indx)) == H_OFFPAGE) {
+ /* This is an overflow chain. */
+ if ((ret = __ham_truncate_overflow(dbc,
+ H_KEYINDEX(hcp->indx),
+ c_data, &pgs_done)) != 0)
+ break;
+ }
+
+ pgno = PGNO_INVALID;
+ ret = __ham_item_next(dbc, DB_LOCK_WRITE, &pgno);
+ }
+
+err: if (hcp->page != NULL &&
+ (t_ret = __memp_fput(mpf, dbc->thread_info,
+ hcp->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+ hcp->page = NULL;
+ hcp->pgno = pgno = PGNO_INVALID;
+ /*
+		 * If we are in an auto-transaction and we updated something,
+		 * return to the caller to commit this transaction and
+		 * avoid holding locks. Otherwise process the next bucket.
+		 * We can drop the lock if we did not do anything.
+		 * We must always commit the txn if we are in MVCC,
+		 * as we have dirtied the hash buckets.
+ */
+ if (ret == 0 &&
+ atomic_read(&dbp->mpf->mfp->multiversion) == 0 &&
+ (pgs_done == 0 || dbc->txn == NULL))
+ ret = __LPUT(dbc, hcp->lock);
+ else if (LF_ISSET(DB_AUTO_COMMIT)) {
+ if (ret == 0)
+ hcp->bucket++;
+ break;
+ }
+ }
+ /*
+	 * If we saw any empty buckets and we are freeing space, we
+	 * want to contract the table before dropping the metadata
+	 * page. Wait until we are done with everything else, as we
+	 * need an exclusive lock on the metadata page.
+ */
+ if (ret == 0 && empty_buckets != 0 && LF_ISSET(DB_FREE_SPACE)) {
+ for (i = 0; i < empty_buckets && hcp->hdr->max_bucket > 2; i++)
+ if ((ret = __ham_contract_table(dbc, c_data)) != 0)
+ break;
+ }
+
+ if (ret == 0)
+ ret = __db_retcopy(dbp->env, start, &hcp->bucket,
+ sizeof(hcp->bucket), &start->data, &start->ulen);
+ (void)__ham_release_meta(dbc);
+ c_data->compact_empty_buckets += empty_buckets;
+ if (hcp->bucket > stop_bucket)
+ *donep = 1;
+ return (ret);
+}
+
+/*
+ * __ham_compact_bucket -- move data to as few pages as possible.
+ *
+ * PUBLIC: int __ham_compact_bucket __P((DBC *, DB_COMPACT *, int *));
+ */
+int
+__ham_compact_bucket(dbc, c_data, pgs_donep)
+ DBC *dbc;
+ DB_COMPACT *c_data;
+ int *pgs_donep;
+{
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ HASH_CURSOR *hcp;
+ PAGE *pg;
+ db_pgno_t pgno;
+ int check_trunc, ret, t_ret;
+
+ hcp = (HASH_CURSOR *)dbc->internal;
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ pg = hcp->page;
+ check_trunc = c_data->compact_truncate != PGNO_INVALID;
+ ret = 0;
+
+ pgno = hcp->pgno;
+ do {
+ if (pg == NULL && (ret = __memp_fget(mpf, &pgno,
+ dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &pg)) != 0)
+ break;
+		/* Sort any unsorted pages before adding records to the page. */
+ if (TYPE(pg) == P_HASH_UNSORTED) {
+ if ((ret = __ham_sort_page_cursor(dbc, pg)) != 0)
+ break;
+ (*pgs_donep)++;
+ }
+
+ /* If this is not the head try to move it to a lower page. */
+ if (check_trunc && PREV_PGNO(pg) != PGNO_INVALID &&
+ PGNO(pg) > c_data->compact_truncate &&
+ (ret = __db_exchange_page(dbc, &pg,
+ hcp->page, PGNO_INVALID, DB_EXCH_FREE)) != 0)
+ break;
+ if (pgno != PGNO(pg))
+ (*pgs_donep)++;
+
+ if (NEXT_PGNO(pg) == PGNO_INVALID)
+ break;
+ if ((ret = __ham_copy_data(dbc, pg, c_data, pgs_donep)) != 0)
+ break;
+ pgno = NEXT_PGNO(pg);
+ if (pg != hcp->page && (ret = __memp_fput(mpf,
+ dbc->thread_info, pg, dbc->priority)) != 0)
+ break;
+ pg = NULL;
+ } while (pgno != PGNO_INVALID);
+
+ if (pg != NULL && pg != hcp->page &&
+ (t_ret = __memp_fput(mpf, dbc->thread_info, pg, dbc->priority)) &&
+ ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __ham_copy_data -- copy as many records as possible from next page
+ */
+static int
+__ham_copy_data(dbc, pg, c_data, pgs_donep)
+ DBC *dbc;
+ PAGE *pg;
+ DB_COMPACT *c_data;
+ int *pgs_donep;
+{
+ DB *dbp;
+ DBC *newdbc;
+ DBT data, key;
+ DB_MPOOLFILE *mpf;
+ HASH_CURSOR *hcp, *ncp;
+ PAGE *nextpage;
+ db_pgno_t origpgno;
+ int i, nument, records, ret, t_ret;
+ u_int32_t len;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ hcp = (HASH_CURSOR *)dbc->internal;
+ records = 0;
+
+ if ((ret = __dbc_dup(dbc, &newdbc, 0)) != 0)
+ return (ret);
+ ncp = (HASH_CURSOR *)newdbc->internal;
+ ncp->hdr = hcp->hdr;
+
+ /*
+	 * Copy data to the front of the bucket. Loop until either the
+	 * next page is left in place or there is no next page.
+	 * If the next page was not removed, then it still has data
+	 * on it.
+ */
+ origpgno = PGNO_INVALID;
+ while (origpgno != NEXT_PGNO(pg) &&
+ (origpgno = NEXT_PGNO(pg)) != PGNO_INVALID) {
+
+ if ((ret = __memp_fget(mpf, &NEXT_PGNO(pg), dbc->thread_info,
+ dbc->txn, DB_MPOOL_DIRTY, &nextpage)) != 0)
+ break;
+
+ c_data->compact_pages_examine++;
+ ncp->page = nextpage;
+ ncp->pgno = PGNO(nextpage);
+ ncp->indx = 0;
+ memset(&key, 0, sizeof(key));
+ memset(&data, 0, sizeof(data));
+ nument = NUM_ENT(nextpage);
+ DB_ASSERT(dbp->env, nument != 0);
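+		/*
+		 * Each successful copy below deletes the pair at index 0
+		 * of nextpage, shifting the remaining entries down, so the
+		 * pair at indices 0/1 is the one examined on every pass.
+		 */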
+ for (i = 0; i < nument; i += 2) {
+ len = LEN_HITEM(dbp, nextpage, dbp->pgsize, 0) +
+ LEN_HITEM(dbp, nextpage, dbp->pgsize, 1) +
+ 2 * sizeof(db_indx_t);
+ if (P_FREESPACE(dbp, pg) < len)
+ continue;
+
+ if ((ret =
+ __ham_copypair(dbc, nextpage, 0, pg, NULL, 1)) != 0)
+ break;
+
+ records++;
+ if ((ret = __ham_del_pair(newdbc,
+ HAM_DEL_IGNORE_OFFPAGE, pg)) != 0)
+ break;
+ if (!STD_LOCKING(dbc)) {
+ if ((ret = __ham_dirty_meta(dbc, 0)) != 0)
+ return (ret);
+ ++hcp->hdr->nelem;
+ }
+ }
+ /*
+ * If we moved all the records then __ham_del_pair will
+ * have deleted the nextpage.
+ */
+ if (records >= nument/2) {
+ c_data->compact_pages_examine++;
+ c_data->compact_pages_free++;
+ COMPACT_TRUNCATE(c_data);
+ }
+ if (ncp->page != NULL &&
+ (t_ret = __memp_fput(mpf, dbc->thread_info,
+ ncp->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ ncp->page = NULL;
+ ncp->pgno = PGNO_INVALID;
+ }
+
+ /*
+	 * If __ham_del_pair freed a page, then we needed to dirty the
+	 * metapage; it could have changed, so copy it back to hcp.
+ */
+ hcp->hdr = ncp->hdr;
+ ncp->hdr = NULL;
+ if ((t_ret = __ham_release_meta(newdbc)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __dbc_close(newdbc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (records != 0)
+ (*pgs_donep)++;
+ return (ret);
+}
+
+/*
+ * __ham_truncate_overflow -- try to truncate pages from an overflow chain.
+ */
+static int
+__ham_truncate_overflow(dbc, indx, c_data, pgs_done)
+ DBC *dbc;
+ u_int32_t indx;
+ DB_COMPACT *c_data;
+ int *pgs_done;
+{
+ DB *dbp;
+ HASH_CURSOR *hcp;
+ db_pgno_t origpgno, pgno;
+ int ret;
+
+ hcp = (HASH_CURSOR *)dbc->internal;
+ dbp = dbc->dbp;
+ memcpy(&pgno,
+ HOFFPAGE_PGNO(P_ENTRY(dbp, hcp->page, indx)), sizeof(db_pgno_t));
+ if (pgno > c_data->compact_truncate) {
+ c_data->compact_pages_examine++;
+ origpgno = pgno;
+ if ((ret = __memp_dirty(dbp->mpf, &hcp->page,
+ dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ return (ret);
+ if ((ret =
+ __db_truncate_root(dbc, hcp->page, indx, &pgno, 0)) != 0)
+ return (ret);
+ if (pgno != origpgno) {
+ memcpy(HOFFPAGE_PGNO(P_ENTRY(dbp, hcp->page, indx)),
+ &pgno, sizeof(db_pgno_t));
+ (*pgs_done)++;
+ c_data->compact_pages--;
+ }
+ }
+ if ((ret = __db_truncate_overflow(dbc, pgno, NULL, c_data)) != 0)
+ return (ret);
+ return (0);
+}
+
+#ifdef HAVE_FTRUNCATE
+/*
+ * __ham_compact_hash -- compact the hash table.
+ * PUBLIC: int __ham_compact_hash __P((DB *,
+ * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DB_COMPACT *));
+ */
+int
+__ham_compact_hash(dbp, ip, txn, c_data)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ DB_COMPACT *c_data;
+{
+ DBC *dbc;
+ DB_LOCK lock;
+ HASH_CURSOR *hcp;
+ HMETA *meta;
+ PAGE *oldpage;
+ db_pgno_t free_pgno, last_pgno, pgno, start_pgno;
+ int flags, local_txn, ret, t_ret;
+ u_int32_t bucket, i, size;
+
+ local_txn = IS_DB_AUTO_COMMIT(dbp, txn);
+ oldpage = NULL;
+ dbc = NULL;
+ LOCK_INIT(lock);
+
+ if (local_txn &&
+ (ret = __txn_begin(dbp->env, ip, txn, &txn, 0)) != 0)
+ return (ret);
+
+ if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0)
+ goto err1;
+ hcp = (HASH_CURSOR *)dbc->internal;
+
+ if ((ret = __ham_get_meta(dbc)) != 0 ||
+ (ret = __ham_dirty_meta(dbc, 0)) != 0)
+ goto err1;
+
+ meta = hcp->hdr;
+
+ LOCK_CHECK_OFF(ip);
+
+ /*
+ * Find contiguous lower numbered pages for each hash table segment.
+ */
+ for (i = 0; i < NCACHED && meta->spares[i] != PGNO_INVALID; i++) {
+ if (i == 0) {
+ bucket = 0;
+ size = 1;
+ } else {
+ bucket = 1 << (i - 1);
+ size = bucket;
+ }
+ start_pgno = meta->spares[i] + bucket;
+ if ((ret = __db_find_free(dbc, P_HASH,
+ size, start_pgno, &free_pgno)) != 0) {
+ if (ret != DB_NOTFOUND)
+ break;
+ ret = 0;
+ continue;
+ }
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __ham_changeslot_log(dbp,
+ dbc->txn, &LSN(meta),
+ 0, &LSN(meta), i, start_pgno, free_pgno)) != 0)
+ break;
+ } else
+ LSN_NOT_LOGGED(LSN(meta));
+ last_pgno = free_pgno + bucket;
+ /*
+		 * March through the list swapping pages. If the page is
+		 * empty, we just need to free it. If we are just sliding
+		 * things down, don't free the pages that will be reused.
+		 * Note that __db_exchange_page returns the new page, so
+		 * we must put it.
+ */
+ for (pgno = start_pgno;
+ pgno < start_pgno + size; pgno++, free_pgno++) {
+ if ((ret = __db_lget(dbc,
+ LCK_COUPLE, pgno, DB_LOCK_WRITE, 0, &lock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(dbp->mpf, &pgno,
+ dbc->thread_info, dbc->txn,
+ DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &oldpage)) != 0)
+ goto err;
+ if (NUM_ENT(oldpage) != 0) {
+ if (pgno < last_pgno)
+ flags = 0;
+ else
+ flags = DB_EXCH_FREE;
+ if ((ret = __db_exchange_page(dbc,
+ &oldpage, NULL, free_pgno, flags)) != 0)
+ goto err;
+ } else if (pgno >= last_pgno) {
+ if ((ret = __db_free(dbc, oldpage, 0)) != 0)
+ goto err;
+ COMPACT_TRUNCATE(c_data);
+ oldpage = NULL;
+ }
+ if (oldpage != NULL && (ret = __memp_fput(dbp->mpf,
+ dbc->thread_info, oldpage, dbc->priority)) != 0)
+ goto err;
+ ret = 0;
+ oldpage = NULL;
+ c_data->compact_pages_examine++;
+ }
+ meta->spares[i] = free_pgno - (size + bucket);
+ }
+ if (ret == 0 && F_ISSET(dbp, DB_AM_SUBDB) &&
+ PGNO(hcp->hdr) > c_data->compact_truncate)
+ ret = __db_move_metadata(dbc, (DBMETA**)&hcp->hdr, c_data);
+
+err: if (oldpage != NULL && (t_ret = __memp_fput(dbp->mpf,
+ dbc->thread_info, oldpage, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __TLPUT(dbc, lock)) != 0 && ret == 0)
+ ret = t_ret;
+ LOCK_CHECK_ON(ip);
+err1: if (dbc != NULL) {
+ if ((t_ret = __ham_release_meta(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ if (local_txn && (t_ret = (ret == 0 ?
+ __txn_commit(txn, 0) : __txn_abort(txn))) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+#endif
diff --git a/src/hash/hash_conv.c b/src/hash/hash_conv.c
new file mode 100644
index 00000000..fa084f2a
--- /dev/null
+++ b/src/hash/hash_conv.c
@@ -0,0 +1,110 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_swap.h"
+#include "dbinc/hash.h"
+
+/*
+ * __ham_pgin --
+ * Convert host-specific page layout from the host-independent format
+ * stored on disk.
+ *
+ * PUBLIC: int __ham_pgin __P((DB *, db_pgno_t, void *, DBT *));
+ */
+int
+__ham_pgin(dbp, pg, pp, cookie)
+ DB *dbp;
+ db_pgno_t pg;
+ void *pp;
+ DBT *cookie;
+{
+ DB_PGINFO *pginfo;
+ PAGE *h;
+
+ h = pp;
+ pginfo = (DB_PGINFO *)cookie->data;
+
+ /*
+ * The hash access method does blind reads of pages, causing them
+	 * to be created. If the type field isn't set, it's one of them;
+	 * initialize the rest of the page and return.
+ */
+ if (h->type != P_HASHMETA && h->pgno == PGNO_INVALID) {
+ P_INIT(pp, (db_indx_t)pginfo->db_pagesize,
+ pg, PGNO_INVALID, PGNO_INVALID, 0, P_HASH);
+ return (0);
+ }
+
+ if (!F_ISSET(pginfo, DB_AM_SWAP))
+ return (0);
+
+ return (h->type == P_HASHMETA ? __ham_mswap(dbp->env, pp) :
+ __db_byteswap(dbp, pg, pp, pginfo->db_pagesize, 1));
+}
+
+/*
+ * __ham_pgout --
+ * Convert host-specific page layout to the host-independent format
+ * stored on disk.
+ *
+ * PUBLIC: int __ham_pgout __P((DB *, db_pgno_t, void *, DBT *));
+ */
+int
+__ham_pgout(dbp, pg, pp, cookie)
+ DB *dbp;
+ db_pgno_t pg;
+ void *pp;
+ DBT *cookie;
+{
+ DB_PGINFO *pginfo;
+ PAGE *h;
+
+ pginfo = (DB_PGINFO *)cookie->data;
+ if (!F_ISSET(pginfo, DB_AM_SWAP))
+ return (0);
+
+ h = pp;
+ return (h->type == P_HASHMETA ? __ham_mswap(dbp->env, pp) :
+ __db_byteswap(dbp, pg, pp, pginfo->db_pagesize, 0));
+}
+
+/*
+ * __ham_mswap --
+ * Swap the bytes on the hash metadata page.
+ *
+ * PUBLIC: int __ham_mswap __P((ENV *, void *));
+ */
+int
+__ham_mswap(env, pg)
+ ENV *env;
+ void *pg;
+{
+ u_int8_t *p;
+ int i;
+
+ COMPQUIET(env, NULL);
+
+ __db_metaswap(pg);
+ p = (u_int8_t *)pg + sizeof(DBMETA);
+
+ SWAP32(p); /* max_bucket */
+ SWAP32(p); /* high_mask */
+ SWAP32(p); /* low_mask */
+ SWAP32(p); /* ffactor */
+ SWAP32(p); /* nelem */
+ SWAP32(p); /* h_charkey */
+ for (i = 0; i < NCACHED; ++i)
+ SWAP32(p); /* spares */
+ p += 59 * sizeof(u_int32_t); /* unused */
+ SWAP32(p); /* crypto_magic */
+ return (0);
+}
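+
+/*
+ * For illustration (a sketch of the assumed behavior of SWAP32 from
+ * db_swap.h, not its actual definition): it byte-reverses the word at p
+ * in place and advances p, roughly
+ *
+ *	u_int8_t t;
+ *	t = p[0]; p[0] = p[3]; p[3] = t;
+ *	t = p[1]; p[1] = p[2]; p[2] = t;
+ *	p += sizeof(u_int32_t);
+ *
+ * which is why the loop above can walk the metadata fields in order and
+ * then skip the 59 unused words with an explicit increment.
+ */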
diff --git a/src/hash/hash_dup.c b/src/hash/hash_dup.c
new file mode 100644
index 00000000..879c33d7
--- /dev/null
+++ b/src/hash/hash_dup.c
@@ -0,0 +1,943 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Margo Seltzer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+/*
+ * PACKAGE: hashing
+ *
+ * DESCRIPTION:
+ * Manipulation of duplicates for the hash package.
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/hash.h"
+#include "dbinc/btree.h"
+#include "dbinc/mp.h"
+
+static int __hamc_chgpg __P((DBC *,
+ db_pgno_t, u_int32_t, db_pgno_t, u_int32_t));
+static int __ham_check_move __P((DBC *, u_int32_t));
+static int __ham_dcursor __P((DBC *, db_pgno_t, u_int32_t));
+static int __ham_move_offpage __P((DBC *, PAGE *, u_int32_t, db_pgno_t));
+static int __hamc_chgpg_func
+ __P((DBC *, DBC *, u_int32_t *, db_pgno_t, u_int32_t, void *));
+
+/*
+ * Called from hash_access to add a duplicate key. nval is the new
+ * value that we want to add. The flags correspond to the flag values
+ * to cursor_put indicating where to add the new element.
+ * There are 4 cases.
+ * Case 1: The existing duplicate set already resides on a separate page.
+ * We return and let the common code handle this.
+ * Case 2: The element is small enough to just be added to the existing set.
+ * Case 3: The element is large enough to be a big item, so we're going to
+ * have to push the set onto a new page.
+ * Case 4: The element is large enough to push the duplicate set onto a
+ * separate page.
+ *
+ * PUBLIC: int __ham_add_dup __P((DBC *, DBT *, u_int32_t, db_pgno_t *));
+ */
+int
+__ham_add_dup(dbc, nval, flags, pgnop)
+ DBC *dbc;
+ DBT *nval;
+ u_int32_t flags;
+ db_pgno_t *pgnop;
+{
+ DB *dbp;
+ DBT pval, tmp_val;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ HASH_CURSOR *hcp;
+ u_int32_t add_bytes, new_size;
+ int cmp, ret;
+ u_int8_t *hk;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ mpf = dbp->mpf;
+ hcp = (HASH_CURSOR *)dbc->internal;
+
+ DB_ASSERT(env, flags != DB_CURRENT);
+
+ add_bytes = nval->size +
+ (F_ISSET(nval, DB_DBT_PARTIAL) ? nval->doff : 0);
+ add_bytes = DUP_SIZE(add_bytes);
+
+ if ((ret = __ham_check_move(dbc, add_bytes)) != 0)
+ return (ret);
+
+ /*
+	 * Check if the resulting duplicate set is going to need to go
+ * onto a separate duplicate page. If so, convert the
+ * duplicate set and add the new one. After conversion,
+ * hcp->dndx is the first free ndx or the index of the
+ * current pointer into the duplicate set.
+ */
+ hk = H_PAIRDATA(dbp, hcp->page, hcp->indx);
+ /* Add the len bytes to the current singleton. */
+ if (HPAGE_PTYPE(hk) != H_DUPLICATE)
+ add_bytes += DUP_SIZE(0);
+ new_size =
+ LEN_HKEYDATA(dbp, hcp->page, dbp->pgsize, H_DATAINDEX(hcp->indx)) +
+ add_bytes;
+
+ /*
+	 * We convert to off-page duplicates if the item is a big item,
+	 * if the addition of the new item will make the set large, or
+	 * if there isn't enough room on this page to add the new item.
+ */
+ if (HPAGE_PTYPE(hk) != H_OFFDUP &&
+ (HPAGE_PTYPE(hk) == H_OFFPAGE || ISBIG(hcp, new_size) ||
+ add_bytes > P_FREESPACE(dbp, hcp->page))) {
+
+ if ((ret = __ham_dup_convert(dbc)) != 0)
+ return (ret);
+ return (hcp->opd->am_put(hcp->opd,
+ NULL, nval, flags, NULL));
+ }
+
+ /* There are two separate cases here: on page and off page. */
+ if (HPAGE_PTYPE(hk) != H_OFFDUP) {
+ if (HPAGE_PTYPE(hk) != H_DUPLICATE) {
+ pval.flags = 0;
+ pval.data = HKEYDATA_DATA(hk);
+ pval.size = LEN_HDATA(dbp, hcp->page, dbp->pgsize,
+ hcp->indx);
+ if ((ret = __ham_make_dup(env,
+ &pval, &tmp_val, &dbc->my_rdata.data,
+ &dbc->my_rdata.ulen)) != 0 || (ret =
+ __ham_replpair(dbc, &tmp_val, H_DUPLICATE)) != 0)
+ return (ret);
+ hk = H_PAIRDATA(dbp, hcp->page, hcp->indx);
+ HPAGE_PTYPE(hk) = H_DUPLICATE;
+
+ /*
+ * Update the cursor position since we now are in
+ * duplicates.
+ */
+ F_SET(hcp, H_ISDUP);
+ hcp->dup_off = 0;
+ hcp->dup_len = pval.size;
+ hcp->dup_tlen = DUP_SIZE(hcp->dup_len);
+ }
+
+ /* Now make the new entry a duplicate. */
+ if ((ret = __ham_make_dup(env, nval,
+ &tmp_val, &dbc->my_rdata.data, &dbc->my_rdata.ulen)) != 0)
+ return (ret);
+
+ tmp_val.dlen = 0;
+ switch (flags) { /* On page. */
+ case DB_KEYFIRST:
+ case DB_KEYLAST:
+ case DB_NODUPDATA:
+ case DB_OVERWRITE_DUP:
+ if (dbp->dup_compare != NULL) {
+ __ham_dsearch(dbc,
+ nval, &tmp_val.doff, &cmp, flags);
+
+ /*
+ * Duplicate duplicates are not supported w/
+ * sorted dups. We can either overwrite or
+ * return DB_KEYEXIST.
+ */
+ if (cmp == 0) {
+ if (flags == DB_OVERWRITE_DUP)
+ return (__ham_overwrite(dbc,
+ nval, flags));
+ return (__db_duperr(dbp, flags));
+ }
+ } else {
+ hcp->dup_tlen = LEN_HDATA(dbp, hcp->page,
+ dbp->pgsize, hcp->indx);
+ hcp->dup_len = nval->size;
+ F_SET(hcp, H_ISDUP);
+ if (flags == DB_KEYFIRST)
+ hcp->dup_off = tmp_val.doff = 0;
+ else
+ hcp->dup_off =
+ tmp_val.doff = hcp->dup_tlen;
+ }
+ break;
+ case DB_BEFORE:
+ tmp_val.doff = hcp->dup_off;
+ break;
+ case DB_AFTER:
+ tmp_val.doff = hcp->dup_off + DUP_SIZE(hcp->dup_len);
+ break;
+ default:
+ return (__db_unknown_path(env, "__ham_add_dup"));
+ }
+
+ /* Add the duplicate. */
+ if ((ret = __memp_dirty(mpf, &hcp->page,
+ dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0 ||
+ (ret = __ham_replpair(dbc, &tmp_val, H_DUPLICATE)) != 0)
+ return (ret);
+
+ /* Now, update the cursor if necessary. */
+ switch (flags) {
+ case DB_AFTER:
+ hcp->dup_off += DUP_SIZE(hcp->dup_len);
+ hcp->dup_len = nval->size;
+ hcp->dup_tlen += (db_indx_t)DUP_SIZE(nval->size);
+ break;
+ case DB_BEFORE:
+ case DB_KEYFIRST:
+ case DB_KEYLAST:
+ case DB_NODUPDATA:
+ case DB_OVERWRITE_DUP:
+ hcp->dup_tlen += (db_indx_t)DUP_SIZE(nval->size);
+ hcp->dup_len = nval->size;
+ break;
+ default:
+ return (__db_unknown_path(env, "__ham_add_dup"));
+ }
+ ret = __hamc_update(dbc, tmp_val.size, DB_HAM_CURADJ_ADD, 1);
+ return (ret);
+ }
+
+ /*
+ * If we get here, then we're on duplicate pages; set pgnop and
+ * return so the common code can handle it.
+ */
+ memcpy(pgnop, HOFFDUP_PGNO(H_PAIRDATA(dbp, hcp->page, hcp->indx)),
+ sizeof(db_pgno_t));
+
+ return (ret);
+}
+
+/*
+ * Convert an on-page set of duplicates to an offpage set of duplicates.
+ *
+ * PUBLIC: int __ham_dup_convert __P((DBC *));
+ */
+int
+__ham_dup_convert(dbc)
+ DBC *dbc;
+{
+ BOVERFLOW bo;
+ DB *dbp;
+ DBC **hcs;
+ DBT dbt;
+ DB_LSN lsn;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ HASH_CURSOR *hcp;
+ HOFFPAGE ho;
+ PAGE *dp;
+ db_indx_t i, len, off;
+ int c, ret, t_ret;
+ u_int8_t *p, *pend;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ mpf = dbp->mpf;
+ hcp = (HASH_CURSOR *)dbc->internal;
+
+ /*
+ * Create a new page for the duplicates.
+ */
+ if ((ret = __db_new(dbc,
+ dbp->dup_compare == NULL ? P_LRECNO : P_LDUP, NULL, &dp)) != 0)
+ return (ret);
+ P_INIT(dp, dbp->pgsize,
+ dp->pgno, PGNO_INVALID, PGNO_INVALID, LEAFLEVEL, TYPE(dp));
+
+ /*
+ * Get the list of cursors that may need to be updated.
+ */
+ if ((ret = __ham_get_clist(dbp,
+ PGNO(hcp->page), (u_int32_t)hcp->indx, &hcs)) != 0)
+ goto err;
+
+ /*
+ * Now put the duplicates onto the new page.
+ */
+ dbt.flags = 0;
+ switch (HPAGE_PTYPE(H_PAIRDATA(dbp, hcp->page, hcp->indx))) {
+ case H_KEYDATA:
+ /* Simple case, one key on page; move it to dup page. */
+ dbt.size = LEN_HDATA(dbp, hcp->page, dbp->pgsize, hcp->indx);
+ dbt.data = HKEYDATA_DATA(H_PAIRDATA(dbp, hcp->page, hcp->indx));
+ ret = __db_pitem(dbc,
+ dp, 0, BKEYDATA_SIZE(dbt.size), NULL, &dbt);
+ goto finish;
+ case H_OFFPAGE:
+ /* Simple case, one key on page; move it to dup page. */
+ memcpy(&ho, P_ENTRY(dbp, hcp->page, H_DATAINDEX(hcp->indx)),
+ HOFFPAGE_SIZE);
+ UMRW_SET(bo.unused1);
+ B_TSET(bo.type, ho.type);
+ UMRW_SET(bo.unused2);
+ bo.pgno = ho.pgno;
+ bo.tlen = ho.tlen;
+ dbt.size = BOVERFLOW_SIZE;
+ dbt.data = &bo;
+
+ ret = __db_pitem(dbc, dp, 0, dbt.size, &dbt, NULL);
+finish: if (ret == 0) {
+ /* Update any other cursors. */
+ if (hcs != NULL && DBC_LOGGING(dbc) &&
+ IS_SUBTRANSACTION(dbc->txn)) {
+ if ((ret = __ham_chgpg_log(dbp, dbc->txn,
+ &lsn, 0, DB_HAM_DUP, PGNO(hcp->page),
+ PGNO(dp), hcp->indx, 0)) != 0)
+ break;
+ }
+ for (c = 0; hcs != NULL && hcs[c] != NULL; c++)
+ if ((ret = __ham_dcursor(hcs[c],
+ PGNO(dp), 0)) != 0)
+ break;
+ }
+ break;
+ case H_DUPLICATE:
+ p = HKEYDATA_DATA(H_PAIRDATA(dbp, hcp->page, hcp->indx));
+ pend = p +
+ LEN_HDATA(dbp, hcp->page, dbp->pgsize, hcp->indx);
+
+ /*
+ * We need to maintain the duplicate cursor position.
+ * Keep track of where we are in the duplicate set via
+ * the offset, and when it matches the one in the cursor,
+ * set the off-page duplicate cursor index to the current
+ * index.
+ */
+ for (off = 0, i = 0; p < pend; i++) {
+ memcpy(&len, p, sizeof(db_indx_t));
+ dbt.size = len;
+ p += sizeof(db_indx_t);
+ dbt.data = p;
+ p += len + sizeof(db_indx_t);
+ if ((ret = __db_pitem(dbc, dp,
+ i, BKEYDATA_SIZE(dbt.size), NULL, &dbt)) != 0)
+ break;
+
+ /* Update any other cursors */
+ if (hcs != NULL && DBC_LOGGING(dbc) &&
+ IS_SUBTRANSACTION(dbc->txn)) {
+ if ((ret = __ham_chgpg_log(dbp, dbc->txn,
+ &lsn, 0, DB_HAM_DUP, PGNO(hcp->page),
+ PGNO(dp), hcp->indx, i)) != 0)
+ break;
+ }
+ for (c = 0; hcs != NULL && hcs[c] != NULL; c++)
+ if (((HASH_CURSOR *)(hcs[c]->internal))->dup_off
+ == off && (ret = __ham_dcursor(hcs[c],
+ PGNO(dp), i)) != 0)
+ goto err;
+ off += len + 2 * sizeof(db_indx_t);
+ }
+ break;
+ default:
+ ret = __db_pgfmt(env, hcp->pgno);
+ break;
+ }
+
+ /*
+ * Now attach this to the source page in place of the old duplicate
+ * item.
+ */
+ if (ret == 0)
+ ret = __memp_dirty(mpf,
+ &hcp->page, dbc->thread_info, dbc->txn, dbc->priority, 0);
+
+ if (ret == 0)
+ ret = __ham_move_offpage(dbc, hcp->page,
+ (u_int32_t)H_DATAINDEX(hcp->indx), PGNO(dp));
+
+err: if ((t_ret = __memp_fput(mpf,
+ dbc->thread_info, dp, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (ret == 0)
+ hcp->dup_tlen = hcp->dup_off = hcp->dup_len = 0;
+
+ if (hcs != NULL)
+ __os_free(env, hcs);
+
+ return (ret);
+}
+
+/*
+ * __ham_make_dup
+ *
+ * Take a regular dbt and make it into a duplicate item with all the partial
+ * information set appropriately. If the incoming dbt is a partial, assume
+ * we are creating a new entry and make sure that we do any initial padding.
+ *
+ * PUBLIC: int __ham_make_dup __P((ENV *,
+ * PUBLIC: const DBT *, DBT *d, void **, u_int32_t *));
+ */
+int
+__ham_make_dup(env, notdup, duplicate, bufp, sizep)
+ ENV *env;
+ const DBT *notdup;
+ DBT *duplicate;
+ void **bufp;
+ u_int32_t *sizep;
+{
+ db_indx_t tsize, item_size;
+ int ret;
+ u_int8_t *p;
+
+ item_size = (db_indx_t)notdup->size;
+ if (F_ISSET(notdup, DB_DBT_PARTIAL))
+ item_size += notdup->doff;
+
+ tsize = DUP_SIZE(item_size);
+ if ((ret = __ham_init_dbt(env, duplicate, tsize, bufp, sizep)) != 0)
+ return (ret);
+
+ duplicate->dlen = 0;
+ duplicate->flags = notdup->flags;
+ F_SET(duplicate, DB_DBT_PARTIAL);
+
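+ /*
+ * On-page duplicate elements are length-prefixed and length-
+ * suffixed: [len][data][len]. The trailing copy of the length
+ * is what lets __ham_item_prev walk a duplicate set backward.
+ */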
+ p = duplicate->data;
+ memcpy(p, &item_size, sizeof(db_indx_t));
+ p += sizeof(db_indx_t);
+ if (F_ISSET(notdup, DB_DBT_PARTIAL)) {
+ memset(p, 0, notdup->doff);
+ p += notdup->doff;
+ }
+ memcpy(p, notdup->data, notdup->size);
+ p += notdup->size;
+ memcpy(p, &item_size, sizeof(db_indx_t));
+
+ duplicate->doff = 0;
+ duplicate->dlen = notdup->size;
+
+ return (0);
+}
+
+/*
+ * __ham_check_move --
+ *
+ * Check if we can do whatever we need to on this page. If not,
+ * then we'll have to move the current element to a new page.
+ */
+static int
+__ham_check_move(dbc, add_len)
+ DBC *dbc;
+ u_int32_t add_len;
+{
+ DB *dbp;
+ DBT k, d;
+ DB_LSN new_lsn;
+ DB_MPOOLFILE *mpf;
+ HASH_CURSOR *hcp;
+ PAGE *new_pagep, *next_pagep;
+ db_pgno_t next_pgno;
+ u_int32_t data_type, key_type, new_datalen, old_len;
+ db_indx_t new_indx;
+ u_int8_t *hk;
+ int found, match, ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ hcp = (HASH_CURSOR *)dbc->internal;
+
+ hk = H_PAIRDATA(dbp, hcp->page, hcp->indx);
+ found = 0;
+
+ /*
+ * If the item is already a set of off-page duplicates or an
+ * off-page item, then we know we can do whatever we need to
+ * do in place.
+ */
+ if (HPAGE_PTYPE(hk) == H_OFFDUP || HPAGE_PTYPE(hk) == H_OFFPAGE)
+ return (0);
+
+ old_len =
+ LEN_HITEM(dbp, hcp->page, dbp->pgsize, H_DATAINDEX(hcp->indx));
+ new_datalen = (old_len - HKEYDATA_SIZE(0)) + add_len;
+ if (HPAGE_PTYPE(hk) != H_DUPLICATE)
+ new_datalen += DUP_SIZE(0);
+
+ /*
+ * We need to add a new page under two conditions:
+ * 1. The addition makes the total data length cross the BIG
+ * threshold and the OFFDUP structure won't fit on this page.
+ * 2. The addition does not make the total data cross the
+ * threshold, but the new data won't fit on the page.
+ * If neither of these is true, then we can return.
+ */
+ if (ISBIG(hcp, new_datalen) && (old_len > HOFFDUP_SIZE ||
+ HOFFDUP_SIZE - old_len <= P_FREESPACE(dbp, hcp->page)))
+ return (0);
+
+ if (!ISBIG(hcp, new_datalen) &&
+ (new_datalen - old_len) <= P_FREESPACE(dbp, hcp->page))
+ return (0);
+
+ /*
+ * If we get here, then we need to move the item to a new page.
+ * Check if there are more pages in the chain. We now need to
+ * update new_datalen to include the size of both the key and
+ * the data that we need to move.
+ */
+
+ new_datalen = ISBIG(hcp, new_datalen) ?
+ HOFFDUP_SIZE : HKEYDATA_SIZE(new_datalen);
+ new_datalen +=
+ LEN_HITEM(dbp, hcp->page, dbp->pgsize, H_KEYINDEX(hcp->indx));
+
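+ /*
+ * Walk the bucket's overflow chain looking for the first page
+ * with enough free space for the relocated pair, releasing each
+ * visited page before fetching the next.
+ */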
+ new_pagep = NULL;
+ next_pagep = hcp->page;
+ for (next_pgno = NEXT_PGNO(hcp->page); next_pgno != PGNO_INVALID;
+ next_pgno = NEXT_PGNO(next_pagep)) {
+ if (next_pagep != hcp->page && (ret = __memp_fput(mpf,
+ dbc->thread_info, next_pagep, dbc->priority)) != 0)
+ return (ret);
+
+ if ((ret = __memp_fget(mpf,
+ &next_pgno, dbc->thread_info, dbc->txn,
+ DB_MPOOL_CREATE, &next_pagep)) != 0)
+ return (ret);
+
+ if (P_FREESPACE(dbp, next_pagep) >= new_datalen) {
+ found = 1;
+ break;
+ }
+ }
+
+ if (found != 0) {
+ /* Found a page with space, dirty it and the original. */
+ new_pagep = next_pagep;
+ if ((ret = __memp_dirty(mpf, &hcp->page,
+ dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ goto err;
+ if ((ret = __memp_dirty(mpf, &new_pagep,
+ dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ goto err;
+ } else {
+ if ((ret = __memp_dirty(mpf, &next_pagep,
+ dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ goto err;
+
+ /* Add new page at the end of the chain. */
+ new_pagep = next_pagep;
+ if ((ret = __ham_add_ovflpage(dbc, &new_pagep)) != 0)
+ goto err;
+
+ if (next_pagep != hcp->page) {
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, next_pagep, dbc->priority)) != 0)
+ goto err;
+ next_pagep = NULL;
+ /* Dirty the original page to update it. */
+ if ((ret = __memp_dirty(mpf, &hcp->page,
+ dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ goto err;
+ }
+ }
+
+ /* Copy the item to the new page. */
+ if (DBC_LOGGING(dbc)) {
+ memset(&k, 0, sizeof(DBT));
+ d.flags = 0;
+ if (HPAGE_PTYPE(
+ H_PAIRKEY(dbp, hcp->page, hcp->indx)) == H_OFFPAGE) {
+ k.data = H_PAIRKEY(dbp, hcp->page, hcp->indx);
+ k.size = HOFFPAGE_SIZE;
+ key_type = H_OFFPAGE;
+ } else {
+ k.data =
+ HKEYDATA_DATA(H_PAIRKEY(dbp, hcp->page, hcp->indx));
+ k.size =
+ LEN_HKEY(dbp, hcp->page, dbp->pgsize, hcp->indx);
+ key_type = H_KEYDATA;
+ }
+
+ /* Resolve the insert index so it can be written to the log. */
+ if ((ret = __ham_getindex(dbc, new_pagep, &k,
+ key_type, &match, &new_indx)) != 0)
+ return (ret);
+
+ if ((data_type = HPAGE_PTYPE(hk)) == H_OFFPAGE) {
+ d.data = hk;
+ d.size = HOFFPAGE_SIZE;
+ } else if (data_type == H_OFFDUP) {
+ d.data = hk;
+ d.size = HOFFDUP_SIZE;
+ } else {
+ d.data = HKEYDATA_DATA(hk);
+ d.size = LEN_HDATA(dbp,
+ hcp->page, dbp->pgsize, hcp->indx);
+ }
+
+ if ((ret = __ham_insdel_log(dbp, dbc->txn, &new_lsn,
+ 0, PUTPAIR, PGNO(new_pagep), (u_int32_t)new_indx,
+ &LSN(new_pagep), OP_SET(key_type, new_pagep), &k,
+ OP_SET(data_type, new_pagep), &d)) != 0) {
+ (void)__memp_fput(mpf,
+ dbc->thread_info, new_pagep, dbc->priority);
+ return (ret);
+ }
+ } else {
+ LSN_NOT_LOGGED(new_lsn);
+ /*
+ * Ensure that an invalid index is passed to __ham_copypair, so
+ * it knows to resolve the index. Resolving the insert index
+ * here would require creating a temporary DBT with the key,
+ * and calling __ham_getindex. Let __ham_copypair do the
+ * resolution using the final key DBT.
+ */
+ new_indx = NDX_INVALID;
+ }
+
+ /* Move lsn onto page. */
+ LSN(new_pagep) = new_lsn; /* Structure assignment. */
+
+ if ((ret = __ham_copypair(dbc, hcp->page,
+ H_KEYINDEX(hcp->indx), new_pagep, &new_indx, 0)) != 0)
+ goto err;
+
+ /* Update all cursors that used to point to this item. */
+ if ((ret = __hamc_chgpg(dbc, PGNO(hcp->page), H_KEYINDEX(hcp->indx),
+ PGNO(new_pagep), new_indx)) != 0)
+ goto err;
+
+ /* Now delete the pair from the current page. */
+ if ((ret = __ham_del_pair(dbc, HAM_DEL_NO_RECLAIM, NULL)) != 0)
+ goto err;
+
+ /*
+ * __ham_del_pair decremented nelem. This is incorrect; we
+ * manually copied the element elsewhere, so the total number
+ * of elements hasn't changed. Increment it again.
+ *
+ * !!!
+ * Note that we still have the metadata page pinned, and
+ * __ham_del_pair dirtied it, so we don't need to set the dirty
+ * flag again.
+ */
+ if (!STD_LOCKING(dbc))
+ hcp->hdr->nelem++;
+
+ ret = __memp_fput(mpf, dbc->thread_info, hcp->page, dbc->priority);
+ hcp->page = new_pagep;
+ hcp->pgno = PGNO(hcp->page);
+ hcp->indx = new_indx;
+ F_SET(hcp, H_EXPAND);
+ F_CLR(hcp, H_DELETED);
+
+ return (ret);
+
+err: if (new_pagep != NULL)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, new_pagep, dbc->priority);
+ if (next_pagep != NULL &&
+ next_pagep != hcp->page && next_pagep != new_pagep)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, next_pagep, dbc->priority);
+ return (ret);
+}
+
+/*
+ * __ham_move_offpage --
+ * Replace an onpage set of duplicates with the OFFDUP structure
+ * that references the duplicate page.
+ *
+ * XXX
+ * This is really just a special case of __onpage_replace; we should
+ * probably combine them.
+ *
+ */
+static int
+__ham_move_offpage(dbc, pagep, ndx, pgno)
+ DBC *dbc;
+ PAGE *pagep;
+ u_int32_t ndx;
+ db_pgno_t pgno;
+{
+ DB *dbp;
+ DBT new_dbt;
+ DBT old_dbt;
+ HOFFDUP od;
+ db_indx_t i, *inp;
+ int32_t difflen;
+ u_int8_t *src;
+ int ret;
+
+ dbp = dbc->dbp;
+ od.type = H_OFFDUP;
+ UMRW_SET(od.unused[0]);
+ UMRW_SET(od.unused[1]);
+ UMRW_SET(od.unused[2]);
+ od.pgno = pgno;
+ ret = 0;
+
+ if (DBC_LOGGING(dbc)) {
+ HKEYDATA *hk;
+ new_dbt.data = &od;
+ new_dbt.size = HOFFDUP_SIZE;
+ hk = (HKEYDATA *)P_ENTRY(dbp, pagep, ndx);
+ if (hk->type == H_KEYDATA || hk->type == H_DUPLICATE) {
+ old_dbt.data = hk->data;
+ old_dbt.size = LEN_HITEM(dbp, pagep, dbp->pgsize, ndx) -
+ SSZA(HKEYDATA, data);
+ } else {
+ old_dbt.data = hk;
+ old_dbt.size = LEN_HITEM(dbp, pagep, dbp->pgsize, ndx);
+ }
+ if ((ret = __ham_replace_log(dbp, dbc->txn, &LSN(pagep), 0,
+ PGNO(pagep), (u_int32_t)ndx, &LSN(pagep), -1,
+ OP_SET(hk->type, pagep), &old_dbt,
+ OP_SET(H_OFFDUP, pagep), &new_dbt)) != 0)
+ return (ret);
+ } else
+ LSN_NOT_LOGGED(LSN(pagep));
+
+ /*
+ * difflen is the difference in the lengths, and so may be negative.
+ * We know that the difference between two unsigned lengths from a
+ * database page will fit into an int32_t.
+ */
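+ /*
+ * Items are packed at the high end of the page, growing down
+ * toward the index table; HOFFSET marks the low edge of the
+ * packed region. Shift the region by difflen and adjust the
+ * affected index entries to keep the packing dense.
+ */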
+ difflen =
+ (int32_t)LEN_HITEM(dbp, pagep, dbp->pgsize, ndx) -
+ (int32_t)HOFFDUP_SIZE;
+ if (difflen != 0) {
+ /* Copy data. */
+ inp = P_INP(dbp, pagep);
+ src = (u_int8_t *)(pagep) + HOFFSET(pagep);
+ memmove(src + difflen, src, inp[ndx] - HOFFSET(pagep));
+ HOFFSET(pagep) += difflen;
+
+ /* Update index table. */
+ for (i = ndx; i < NUM_ENT(pagep); i++)
+ inp[i] += difflen;
+ }
+
+ /* Now copy the offdup entry onto the page. */
+ memcpy(P_ENTRY(dbp, pagep, ndx), &od, HOFFDUP_SIZE);
+ return (ret);
+}
+
+/*
+ * __ham_dsearch:
+ * Locate a particular duplicate in a duplicate set. Make sure that
+ * we exit with the cursor set appropriately.
+ *
+ * PUBLIC: void __ham_dsearch
+ * PUBLIC: __P((DBC *, DBT *, u_int32_t *, int *, u_int32_t));
+ */
+void
+__ham_dsearch(dbc, dbt, offp, cmpp, flags)
+ DBC *dbc;
+ DBT *dbt;
+ u_int32_t *offp, flags;
+ int *cmpp;
+{
+ DB *dbp;
+ DBT cur;
+ HASH_CURSOR *hcp;
+ db_indx_t i, len;
+ int (*func) __P((DB *, const DBT *, const DBT *));
+ u_int8_t *data;
+
+ dbp = dbc->dbp;
+ hcp = (HASH_CURSOR *)dbc->internal;
+ func = dbp->dup_compare == NULL ? __bam_defcmp : dbp->dup_compare;
+
+ i = F_ISSET(hcp, H_CONTINUE) ? hcp->dup_off: 0;
+ data = HKEYDATA_DATA(H_PAIRDATA(dbp, hcp->page, hcp->indx)) + i;
+ hcp->dup_tlen = LEN_HDATA(dbp, hcp->page, dbp->pgsize, hcp->indx);
+ len = hcp->dup_len;
+ while (i < hcp->dup_tlen) {
+ memcpy(&len, data, sizeof(db_indx_t));
+ data += sizeof(db_indx_t);
+ DB_SET_DBT(cur, data, len);
+
+ /*
+ * If we find an exact match, we're done. If in a sorted
+ * duplicate set and the item is larger than our test item,
+ * we're done. In the latter case, if permitting partial
+ * matches, it's not a failure.
+ */
+ *cmpp = func(dbp, dbt, &cur);
+ if (*cmpp == 0)
+ break;
+ if (*cmpp < 0 && dbp->dup_compare != NULL) {
+ if (flags == DB_GET_BOTH_RANGE)
+ *cmpp = 0;
+ break;
+ }
+
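+ /*
+ * i is the offset of the next element's leading length field,
+ * so it advances by the full DUP_SIZE(len); data has already
+ * been moved past the leading length above, so it skips only
+ * the data bytes and the trailing length.
+ */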
+ i += len + 2 * sizeof(db_indx_t);
+ data += len + sizeof(db_indx_t);
+ }
+
+ *offp = i;
+ hcp->dup_off = i;
+ hcp->dup_len = len;
+ F_SET(hcp, H_ISDUP);
+}
+
+/*
+ * __ham_dcursor --
+ *
+ * Create an off page duplicate cursor for this cursor.
+ */
+static int
+__ham_dcursor(dbc, pgno, indx)
+ DBC *dbc;
+ db_pgno_t pgno;
+ u_int32_t indx;
+{
+ BTREE_CURSOR *dcp;
+ DB *dbp;
+ HASH_CURSOR *hcp;
+ int ret;
+
+ dbp = dbc->dbp;
+ hcp = (HASH_CURSOR *)dbc->internal;
+
+ if ((ret = __dbc_newopd(dbc, pgno, hcp->opd, &hcp->opd)) != 0)
+ return (ret);
+
+ dcp = (BTREE_CURSOR *)hcp->opd->internal;
+ dcp->pgno = pgno;
+ dcp->indx = indx;
+
+ if (dbp->dup_compare == NULL) {
+ /*
+ * Converting to off-page Recno trees is tricky. The
+ * record number for the cursor is the index + 1 (to
+ * convert to 1-based record numbers).
+ */
+ dcp->recno = indx + 1;
+ }
+
+ /*
+ * Transfer the deleted flag from the top-level cursor to the
+ * created one.
+ */
+ if (F_ISSET(hcp, H_DELETED)) {
+ F_SET(dcp, C_DELETED);
+ F_CLR(hcp, H_DELETED);
+ }
+
+ return (0);
+}
+
+struct __hamc_chgpg_args {
+ db_pgno_t new_pgno;
+ db_indx_t new_index;
+ DB_TXN *my_txn;
+};
+
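+/*
+ * Callback for __db_walk_cursors: invoked once per open cursor on the
+ * database, with vargs pointing at a __hamc_chgpg_args carrying the
+ * item's new location.
+ */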
+static int
+__hamc_chgpg_func(cp, my_dbc, foundp, old_pgno, old_index, vargs)
+ DBC *cp, *my_dbc;
+ u_int32_t *foundp;
+ db_pgno_t old_pgno;
+ u_int32_t old_index;
+ void *vargs;
+{
+ HASH_CURSOR *hcp;
+ struct __hamc_chgpg_args *args;
+
+ if (cp == my_dbc || cp->dbtype != DB_HASH)
+ return (0);
+
+ hcp = (HASH_CURSOR *)cp->internal;
+
+ /*
+ * If a cursor has been marked deleted, it doesn't refer to this
+ * item--it just happens to have the same indx, but it points to
+ * a former neighbor. Don't move it.
+ */
+ if (F_ISSET(hcp, H_DELETED))
+ return (0);
+
+ args = vargs;
+
+ if (hcp->pgno == old_pgno &&
+ hcp->indx == old_index &&
+ !MVCC_SKIP_CURADJ(cp, old_pgno)) {
+ hcp->pgno = args->new_pgno;
+ hcp->indx = args->new_index;
+ if (args->my_txn != NULL && cp->txn != args->my_txn)
+ *foundp = 1;
+ }
+ return (0);
+}
+
+/*
+ * __hamc_chgpg --
+ * Adjust the cursors after moving an item to a new page. We only
+ * move cursors that are pointing at this one item and are not
+ * deleted; since we only touch non-deleted cursors, and since
+ * (by definition) no item existed at the pgno/indx we're moving the
+ * item to, we're guaranteed that all the cursors we affect here or
+ * on abort really do refer to this one item.
+ */
+static int
+__hamc_chgpg(dbc, old_pgno, old_index, new_pgno, new_index)
+ DBC *dbc;
+ db_pgno_t old_pgno, new_pgno;
+ u_int32_t old_index, new_index;
+{
+ DB *dbp;
+ DB_LSN lsn;
+ int ret;
+ u_int32_t found;
+ struct __hamc_chgpg_args args;
+
+ dbp = dbc->dbp;
+
+ args.my_txn = IS_SUBTRANSACTION(dbc->txn) ? dbc->txn : NULL;
+ args.new_pgno = new_pgno;
+ args.new_index = new_index;
+
+ if ((ret = __db_walk_cursors(dbp, dbc,
+ __hamc_chgpg_func, &found, old_pgno, old_index, &args)) != 0)
+ return (ret);
+ if (found != 0 && DBC_LOGGING(dbc)) {
+ if ((ret = __ham_chgpg_log(dbp,
+ args.my_txn, &lsn, 0, DB_HAM_CHGPG,
+ old_pgno, new_pgno, old_index, new_index)) != 0)
+ return (ret);
+ }
+ return (0);
+}
diff --git a/src/hash/hash_func.c b/src/hash/hash_func.c
new file mode 100644
index 00000000..baf6061c
--- /dev/null
+++ b/src/hash/hash_func.c
@@ -0,0 +1,240 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993
+ * Margo Seltzer. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Margo Seltzer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/hash.h"
+
+/*
+ * __ham_func2 --
+ * Phong Vo's linear congruential hash.
+ *
+ * PUBLIC: u_int32_t __ham_func2 __P((DB *, const void *, u_int32_t));
+ */
+#define DCHARHASH(h, c) ((h) = 0x63c63cd9*(h) + 0x9c39c33d + (c))
+
+u_int32_t
+__ham_func2(dbp, key, len)
+ DB *dbp;
+ const void *key;
+ u_int32_t len;
+{
+ const u_int8_t *e, *k;
+ u_int32_t h;
+ u_int8_t c;
+
+ if (dbp != NULL)
+ COMPQUIET(dbp, NULL);
+
+ k = key;
+ e = k + len;
+ for (h = 0; k != e;) {
+ c = *k++;
+ if (!c && k > e)
+ break;
+ DCHARHASH(h, c);
+ }
+ return (h);
+}
+
+/*
+ * __ham_func3 --
+ * Ozan Yigit's original sdbm hash.
+ *
+ * Ugly, but fast. Break the string up into 8 byte units. On the first time
+ * through the loop get the "leftover bytes" (strlen % 8). On every other
+ * iteration, perform 8 HASHC's so we handle all 8 bytes. Essentially, this
+ * saves us 7 cmp & branch instructions.
+ *
+ * PUBLIC: u_int32_t __ham_func3 __P((DB *, const void *, u_int32_t));
+ */
+u_int32_t
+__ham_func3(dbp, key, len)
+ DB *dbp;
+ const void *key;
+ u_int32_t len;
+{
+ const u_int8_t *k;
+ u_int32_t n, loop;
+
+ if (dbp != NULL)
+ COMPQUIET(dbp, NULL);
+
+ if (len == 0)
+ return (0);
+
+#define HASHC n = *k++ + 65599 * n
+ n = 0;
+ k = key;
+
+ loop = (len + 8 - 1) >> 3;
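+ /*
+ * The switch that falls into the middle of the do/while loop is
+ * a Duff's device: the first pass consumes the len % 8 leftover
+ * bytes, every later pass consumes a full stride of 8.
+ */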
+ switch (len & (8 - 1)) {
+ case 0:
+ do {
+ HASHC;
+ case 7:
+ HASHC;
+ case 6:
+ HASHC;
+ case 5:
+ HASHC;
+ case 4:
+ HASHC;
+ case 3:
+ HASHC;
+ case 2:
+ HASHC;
+ case 1:
+ HASHC;
+ } while (--loop);
+ }
+ return (n);
+}
+
+/*
+ * __ham_func4 --
+ * Chris Torek's hash function. Although this function performs only
+ * slightly worse than __ham_func5 on strings, it performs horribly on
+ * numbers.
+ *
+ * PUBLIC: u_int32_t __ham_func4 __P((DB *, const void *, u_int32_t));
+ */
+u_int32_t
+__ham_func4(dbp, key, len)
+ DB *dbp;
+ const void *key;
+ u_int32_t len;
+{
+ const u_int8_t *k;
+ u_int32_t h, loop;
+
+ if (dbp != NULL)
+ COMPQUIET(dbp, NULL);
+
+ if (len == 0)
+ return (0);
+
+#define HASH4a h = (h << 5) - h + *k++;
+#define HASH4b h = (h << 5) + h + *k++;
+#define HASH4 HASH4b
+ h = 0;
+ k = key;
+
+ loop = (len + 8 - 1) >> 3;
+ switch (len & (8 - 1)) {
+ case 0:
+ do {
+ HASH4;
+ case 7:
+ HASH4;
+ case 6:
+ HASH4;
+ case 5:
+ HASH4;
+ case 4:
+ HASH4;
+ case 3:
+ HASH4;
+ case 2:
+ HASH4;
+ case 1:
+ HASH4;
+ } while (--loop);
+ }
+ return (h);
+}
+
+/*
+ * Fowler/Noll/Vo hash
+ *
+ * The basis of the hash algorithm was taken from an idea sent by email to the
+ * IEEE Posix P1003.2 mailing list from Phong Vo (kpv@research.att.com) and
+ * Glenn Fowler (gsf@research.att.com). Landon Curt Noll (chongo@toad.com)
+ * later improved on their algorithm.
+ *
+ * The magic is in the interesting relationship between the special prime
+ * 16777619 (2^24 + 403) and 2^32 and 2^8.
+ *
+ * This hash produces the fewest collisions of any function that we've seen so
+ * far, and works well on both numbers and strings.
+ *
+ * PUBLIC: u_int32_t __ham_func5 __P((DB *, const void *, u_int32_t));
+ */
+u_int32_t
+__ham_func5(dbp, key, len)
+ DB *dbp;
+ const void *key;
+ u_int32_t len;
+{
+ const u_int8_t *k, *e;
+ u_int32_t h;
+
+ if (dbp != NULL)
+ COMPQUIET(dbp, NULL);
+
+ k = key;
+ e = k + len;
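+ /*
+ * Multiply-then-xor is the FNV-1 ordering; note the hash is
+ * seeded with 0 here rather than the standard FNV offset basis.
+ */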
+ for (h = 0; k < e; ++k) {
+ h *= 16777619;
+ h ^= *k;
+ }
+ return (h);
+}
+
+/*
+ * __ham_test --
+ *
+ * PUBLIC: u_int32_t __ham_test __P((DB *, const void *, u_int32_t));
+ */
+u_int32_t
+__ham_test(dbp, key, len)
+ DB *dbp;
+ const void *key;
+ u_int32_t len;
+{
+ COMPQUIET(dbp, NULL);
+ COMPQUIET(len, 0);
+ return ((u_int32_t)*(char *)key);
+}
diff --git a/src/hash/hash_meta.c b/src/hash/hash_meta.c
new file mode 100644
index 00000000..d9a35cb4
--- /dev/null
+++ b/src/hash/hash_meta.c
@@ -0,0 +1,170 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/hash.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+
+/*
+ * Acquire the meta-data page.
+ *
+ * PUBLIC: int __ham_get_meta __P((DBC *));
+ */
+int
+__ham_get_meta(dbc)
+ DBC *dbc;
+{
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ HASH *hashp;
+ HASH_CURSOR *hcp;
+ u_int32_t revision;
+ int ret, t_ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ hashp = dbp->h_internal;
+ hcp = (HASH_CURSOR *)dbc->internal;
+
+again:
+ revision = hashp->revision;
+ if ((ret = __db_lget(dbc, 0,
+ hashp->meta_pgno, DB_LOCK_READ, 0, &hcp->hlock)) != 0)
+ return (ret);
+
+ if ((ret = __memp_fget(mpf, &hashp->meta_pgno,
+ dbc->thread_info, dbc->txn, DB_MPOOL_CREATE, &hcp->hdr)) != 0) {
+ (void)__LPUT(dbc, hcp->hlock);
+ return (ret);
+ }
+
+ if (F_ISSET(dbp, DB_AM_SUBDB) &&
+ (revision != dbp->mpf->mfp->revision ||
+ (TYPE(hcp->hdr) != P_HASHMETA &&
+ !IS_RECOVERING(dbp->env) && !F_ISSET(dbp, DB_AM_RECOVER)))) {
+ ret = __LPUT(dbc, hcp->hlock);
+ t_ret =
+ __memp_fput(mpf, dbc->thread_info, hcp->hdr, dbc->priority);
+ hcp->hdr = NULL;
+ if (ret != 0)
+ return (ret);
+ if (t_ret != 0)
+ return (t_ret);
+ if ((ret = __db_reopen(dbc)) != 0)
+ return (ret);
+ goto again;
+ }
+
+ return (ret);
+}
+
+/*
+ * Release the meta-data page.
+ *
+ * PUBLIC: int __ham_release_meta __P((DBC *));
+ */
+int
+__ham_release_meta(dbc)
+ DBC *dbc;
+{
+ DB_MPOOLFILE *mpf;
+ HASH_CURSOR *hcp;
+ int ret;
+
+ mpf = dbc->dbp->mpf;
+ hcp = (HASH_CURSOR *)dbc->internal;
+
+ if (hcp->hdr != NULL) {
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, hcp->hdr, dbc->priority)) != 0)
+ return (ret);
+ hcp->hdr = NULL;
+ }
+
+ ret = __TLPUT(dbc, hcp->hlock);
+ hcp->hlock.mode = DB_LOCK_NG;
+ return (ret);
+}
+
+/*
+ * Mark the meta-data page dirty.
+ *
+ * PUBLIC: int __ham_dirty_meta __P((DBC *, u_int32_t));
+ */
+int
+__ham_dirty_meta(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ DB_MPOOLFILE *mpf;
+ HASH *hashp;
+ HASH_CURSOR *hcp;
+ int ret;
+
+ if (F_ISSET(dbc, DBC_OPD))
+ dbc = dbc->internal->pdbc;
+ hashp = dbc->dbp->h_internal;
+ hcp = (HASH_CURSOR *)dbc->internal;
+ if (hcp->hlock.mode == DB_LOCK_WRITE)
+ return (0);
+
+ mpf = dbc->dbp->mpf;
+
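+ /*
+ * Try to upgrade to the write lock without blocking while we
+ * still hold the meta page pinned. If the lock isn't
+ * immediately available, drop the page first, wait for the
+ * write lock, and then re-fetch the page dirty.
+ */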
+ if ((ret = __db_lget(dbc, LCK_COUPLE, hashp->meta_pgno,
+ DB_LOCK_WRITE, DB_LOCK_NOWAIT, &hcp->hlock)) != 0) {
+ if (ret != DB_LOCK_NOTGRANTED && ret != DB_LOCK_DEADLOCK)
+ return (ret);
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, hcp->hdr, dbc->priority)) != 0)
+ return (ret);
+ hcp->hdr = NULL;
+ if ((ret = __db_lget(dbc, LCK_COUPLE, hashp->meta_pgno,
+ DB_LOCK_WRITE, 0, &hcp->hlock)) != 0)
+ return (ret);
+ ret = __memp_fget(mpf, &hashp->meta_pgno,
+ dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &hcp->hdr);
+ return (ret);
+ }
+
+ return (__memp_dirty(mpf,
+ &hcp->hdr, dbc->thread_info, dbc->txn, dbc->priority, flags));
+}
+
+/*
+ * Return the meta data page if it is saved in the cursor.
+ *
+ * PUBLIC: int __ham_return_meta __P((DBC *, u_int32_t, DBMETA **));
+ */
+int
+__ham_return_meta(dbc, flags, metap)
+ DBC *dbc;
+ u_int32_t flags;
+ DBMETA **metap;
+{
+ HASH_CURSOR *hcp;
+ int ret;
+
+ *metap = NULL;
+ if (F_ISSET(dbc, DBC_OPD))
+ dbc = dbc->internal->pdbc;
+
+ hcp = (HASH_CURSOR *)dbc->internal;
+ if (hcp->hdr == NULL || PGNO(hcp->hdr) != PGNO_BASE_MD)
+ return (0);
+
+ if (LF_ISSET(DB_MPOOL_DIRTY) &&
+ (ret = __ham_dirty_meta(dbc, flags)) != 0)
+ return (ret);
+
+ *metap = (DBMETA *)hcp->hdr;
+ return (0);
+}
diff --git a/src/hash/hash_method.c b/src/hash/hash_method.c
new file mode 100644
index 00000000..1da81e70
--- /dev/null
+++ b/src/hash/hash_method.c
@@ -0,0 +1,250 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/hash.h"
+
+static int __ham_set_h_ffactor __P((DB *, u_int32_t));
+static int __ham_get_h_hash
+ __P((DB *, u_int32_t(**)(DB *, const void *, u_int32_t)));
+static int __ham_set_h_hash
+ __P((DB *, u_int32_t(*)(DB *, const void *, u_int32_t)));
+static int __ham_set_h_nelem __P((DB *, u_int32_t));
+
+static int __ham_get_h_compare
+ __P((DB *, int (**)(DB *, const DBT *, const DBT *)));
+
+/*
+ * __ham_db_create --
+ * Hash specific initialization of the DB structure.
+ *
+ * PUBLIC: int __ham_db_create __P((DB *));
+ */
+int
+__ham_db_create(dbp)
+ DB *dbp;
+{
+ HASH *hashp;
+ int ret;
+
+ if ((ret = __os_malloc(dbp->env,
+ sizeof(HASH), &dbp->h_internal)) != 0)
+ return (ret);
+
+ hashp = dbp->h_internal;
+
+ hashp->h_nelem = 0; /* Defaults. */
+ hashp->h_ffactor = 0;
+ hashp->h_hash = NULL;
+ hashp->h_compare = NULL;
+
+ dbp->get_h_ffactor = __ham_get_h_ffactor;
+ dbp->set_h_ffactor = __ham_set_h_ffactor;
+ dbp->get_h_hash = __ham_get_h_hash;
+ dbp->set_h_hash = __ham_set_h_hash;
+ dbp->get_h_compare = __ham_get_h_compare;
+ dbp->set_h_compare = __ham_set_h_compare;
+ dbp->get_h_nelem = __ham_get_h_nelem;
+ dbp->set_h_nelem = __ham_set_h_nelem;
+
+ return (0);
+}
+
+/*
+ * PUBLIC: int __ham_db_close __P((DB *));
+ */
+int
+__ham_db_close(dbp)
+ DB *dbp;
+{
+ if (dbp->h_internal == NULL)
+ return (0);
+ __os_free(dbp->env, dbp->h_internal);
+ dbp->h_internal = NULL;
+ return (0);
+}
+
+/*
+ * __ham_get_h_ffactor --
+ *
+ * PUBLIC: int __ham_get_h_ffactor __P((DB *, u_int32_t *));
+ */
+int
+__ham_get_h_ffactor(dbp, h_ffactorp)
+ DB *dbp;
+ u_int32_t *h_ffactorp;
+{
+ HASH *hashp;
+
+ hashp = dbp->h_internal;
+ *h_ffactorp = hashp->h_ffactor;
+ return (0);
+}
+
+/*
+ * __ham_set_h_ffactor --
+ * Set the fill factor.
+ */
+static int
+__ham_set_h_ffactor(dbp, h_ffactor)
+ DB *dbp;
+ u_int32_t h_ffactor;
+{
+ HASH *hashp;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_h_ffactor");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_HASH);
+
+ hashp = dbp->h_internal;
+ hashp->h_ffactor = h_ffactor;
+ return (0);
+}
+
+/*
+ * __ham_get_h_hash --
+ * Get the hash function.
+ */
+static int
+__ham_get_h_hash(dbp, funcp)
+ DB *dbp;
+ u_int32_t (**funcp) __P((DB *, const void *, u_int32_t));
+{
+ HASH *hashp;
+
+ DB_ILLEGAL_METHOD(dbp, DB_OK_HASH);
+
+ hashp = dbp->h_internal;
+ if (funcp != NULL)
+ *funcp = hashp->h_hash;
+ return (0);
+}
+
+/*
+ * __ham_set_h_hash --
+ * Set the hash function.
+ */
+static int
+__ham_set_h_hash(dbp, func)
+ DB *dbp;
+ u_int32_t (*func) __P((DB *, const void *, u_int32_t));
+{
+ HASH *hashp;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_h_hash");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_HASH);
+
+ hashp = dbp->h_internal;
+ hashp->h_hash = func;
+ return (0);
+}
+
+/*
+ * __ham_get_h_compare --
+ * Get the comparison function.
+ */
+static int
+__ham_get_h_compare(dbp, funcp)
+ DB *dbp;
+ int (**funcp) __P((DB *, const DBT *, const DBT *));
+{
+ HASH *t;
+
+ DB_ILLEGAL_METHOD(dbp, DB_OK_HASH);
+
+ t = dbp->h_internal;
+ if (funcp != NULL)
+ *funcp = t->h_compare;
+
+ return (0);
+}
+
+/*
+ * __ham_set_h_compare --
+ * Set the comparison function.
+ *
+ * PUBLIC: int __ham_set_h_compare
+ * PUBLIC: __P((DB *, int (*)(DB *, const DBT *, const DBT *)));
+ */
+int
+__ham_set_h_compare(dbp, func)
+ DB *dbp;
+ int (*func) __P((DB *, const DBT *, const DBT *));
+{
+ HASH *t;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_h_compare");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_HASH);
+
+ t = dbp->h_internal;
+
+ t->h_compare = func;
+
+ return (0);
+}
+
+/*
+ * __db_get_h_nelem --
+ *
+ * PUBLIC: int __ham_get_h_nelem __P((DB *, u_int32_t *));
+ */
+int
+__ham_get_h_nelem(dbp, h_nelemp)
+ DB *dbp;
+ u_int32_t *h_nelemp;
+{
+ HASH *hashp;
+
+ DB_ILLEGAL_METHOD(dbp, DB_OK_HASH);
+
+ hashp = dbp->h_internal;
+ *h_nelemp = hashp->h_nelem;
+ return (0);
+}
+
+/*
+ * __ham_set_h_nelem --
+ * Set the table size.
+ */
+static int
+__ham_set_h_nelem(dbp, h_nelem)
+ DB *dbp;
+ u_int32_t h_nelem;
+{
+ HASH *hashp;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_h_nelem");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_HASH);
+
+ hashp = dbp->h_internal;
+ hashp->h_nelem = h_nelem;
+ return (0);
+}
+
+/*
+ * __ham_copy_config
+ * Copy the configuration of one DB handle to another.
+ * PUBLIC: void __ham_copy_config __P((DB *, DB*, u_int32_t));
+ */
+void
+__ham_copy_config(src, dst, nparts)
+ DB *src, *dst;
+ u_int32_t nparts;
+{
+ HASH *s, *d;
+
+ s = src->h_internal;
+ d = dst->h_internal;
+
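+ /* Each partition expects an equal share of the element estimate. */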
+ d->h_ffactor = s->h_ffactor;
+ d->h_nelem = s->h_nelem / nparts;
+ d->h_hash = s->h_hash;
+ d->h_compare = s->h_compare;
+}
diff --git a/src/hash/hash_open.c b/src/hash/hash_open.c
new file mode 100644
index 00000000..3d0bb220
--- /dev/null
+++ b/src/hash/hash_open.c
@@ -0,0 +1,584 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994
+ * Margo Seltzer. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Margo Seltzer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/hash.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/partition.h"
+#include "dbinc/btree.h"
+#include "dbinc/fop.h"
+
+static db_pgno_t __ham_init_meta __P((DB *, HMETA *, db_pgno_t, DB_LSN *));
+
+/*
+ * __ham_open --
+ *
+ * PUBLIC: int __ham_open __P((DB *, DB_THREAD_INFO *,
+ * PUBLIC: DB_TXN *, const char * name, db_pgno_t, u_int32_t));
+ */
+int
+__ham_open(dbp, ip, txn, name, base_pgno, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name;
+ db_pgno_t base_pgno;
+ u_int32_t flags;
+{
+ DBC *dbc;
+ DBMETA *dbmeta;
+ ENV *env;
+ HASH *hashp;
+ HASH_CURSOR *hcp;
+ int ret, t_ret;
+
+ env = dbp->env;
+ dbc = NULL;
+
+ /*
+ * Get a cursor. If DB_CREATE is specified, we may be creating
+ * pages, and to do that safely in CDB we need a write cursor.
+ * In STD_LOCKING mode, we'll synchronize using the meta page
+ * lock instead.
+ */
+ if ((ret = __db_cursor(dbp, ip, txn, &dbc,
+ (LF_ISSET(DB_CREATE) && CDB_LOCKING(env) ? DB_WRITECURSOR : 0) |
+ (F_ISSET(dbp, DB_AM_RECOVER) ? DB_RECOVER : 0))) != 0)
+ return (ret);
+
+ hcp = (HASH_CURSOR *)dbc->internal;
+ hashp = dbp->h_internal;
+ hashp->meta_pgno = base_pgno;
+ hashp->revision = dbp->mpf->mfp->revision;
+ if ((ret = __ham_get_meta(dbc)) != 0)
+ goto err;
+
+ /* Initialize the hdr structure. */
+ dbmeta = &hcp->hdr->dbmeta;
+ if (dbmeta->magic == DB_HASHMAGIC) {
+ /* File exists, verify the data in the header. */
+ if (hashp->h_hash == NULL)
+ hashp->h_hash = dbmeta->version < 5
+ ? __ham_func4 : __ham_func5;
+ hashp->h_nelem = hcp->hdr->nelem;
+ if (F_ISSET(dbmeta, DB_HASH_DUP))
+ F_SET(dbp, DB_AM_DUP);
+ if (F_ISSET(dbmeta, DB_HASH_DUPSORT))
+ F_SET(dbp, DB_AM_DUPSORT);
+ if (F_ISSET(dbmeta, DB_HASH_SUBDB))
+ F_SET(dbp, DB_AM_SUBDB);
+ if (PGNO(hcp->hdr) == PGNO_BASE_MD &&
+ !F_ISSET(dbp, DB_AM_RECOVER) &&
+ (txn == NULL || !F_ISSET(txn, TXN_SNAPSHOT)) && (ret =
+ __memp_set_last_pgno(dbp->mpf, dbmeta->last_pgno)) != 0)
+ goto err;
+ } else if (!IS_RECOVERING(env) && !F_ISSET(dbp, DB_AM_RECOVER)) {
+ __db_errx(env, DB_STR_A("1124",
+ "%s: Invalid hash meta page %lu", "%s %lu"),
+ name, (u_long)base_pgno);
+ ret = EINVAL;
+ }
+
+ /* Release the meta data page */
+ if ((t_ret = __ham_release_meta(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+err: if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __ham_metachk --
+ *
+ * PUBLIC: int __ham_metachk __P((DB *, const char *, HMETA *));
+ */
+int
+__ham_metachk(dbp, name, hashm)
+ DB *dbp;
+ const char *name;
+ HMETA *hashm;
+{
+ ENV *env;
+ u_int32_t vers;
+ int ret;
+
+ env = dbp->env;
+
+ /*
+ * At this point, all we know is that the magic number is for a Hash.
+ * Check the version; the database may be out of date.
+ */
+ vers = hashm->dbmeta.version;
+ if (F_ISSET(dbp, DB_AM_SWAP))
+ M_32_SWAP(vers);
+ switch (vers) {
+ case 4:
+ case 5:
+ case 6:
+ __db_errx(env, DB_STR_A("1125",
+ "%s: hash version %lu requires a version upgrade",
+ "%s %lu"), name, (u_long)vers);
+ return (DB_OLD_VERSION);
+ case 7:
+ case 8:
+ case 9:
+ break;
+ default:
+ __db_errx(env, DB_STR_A("1126",
+ "%s: unsupported hash version: %lu", "%s %lu"),
+ name, (u_long)vers);
+ return (EINVAL);
+ }
+
+ /* Swap the page if we need to. */
+ if (F_ISSET(dbp, DB_AM_SWAP) &&
+ (ret = __ham_mswap(env, (PAGE *)hashm)) != 0)
+ return (ret);
+
+ /* Check the type. */
+ if (dbp->type != DB_HASH && dbp->type != DB_UNKNOWN)
+ return (EINVAL);
+ dbp->type = DB_HASH;
+ DB_ILLEGAL_METHOD(dbp, DB_OK_HASH);
+
+ /*
+ * Check application info against metadata info, and set info, flags,
+ * and type based on metadata info.
+ */
+ if ((ret = __db_fchk(env,
+ "DB->open", hashm->dbmeta.flags,
+ DB_HASH_DUP | DB_HASH_SUBDB | DB_HASH_DUPSORT)) != 0)
+ return (ret);
+
+ if (F_ISSET(&hashm->dbmeta, DB_HASH_DUP))
+ F_SET(dbp, DB_AM_DUP);
+ else
+ if (F_ISSET(dbp, DB_AM_DUP)) {
+ __db_errx(env, DB_STR_A("1127",
+ "%s: DB_DUP specified to open method but not set in database",
+ "%s"), name);
+ return (EINVAL);
+ }
+
+ if (F_ISSET(&hashm->dbmeta, DB_HASH_SUBDB))
+ F_SET(dbp, DB_AM_SUBDB);
+ else
+ if (F_ISSET(dbp, DB_AM_SUBDB)) {
+ __db_errx(env, DB_STR_A("1128",
+ "%s: multiple databases specified but not supported in file",
+ "%s"), name);
+ return (EINVAL);
+ }
+
+ if (F_ISSET(&hashm->dbmeta, DB_HASH_DUPSORT)) {
+ if (dbp->dup_compare == NULL)
+ dbp->dup_compare = __bam_defcmp;
+ } else
+ if (dbp->dup_compare != NULL) {
+ __db_errx(env, DB_STR_A("1129",
+ "%s: duplicate sort function specified but not set in database",
+ "%s"), name);
+ return (EINVAL);
+ }
+
+ /* Set the page size. */
+ dbp->pgsize = hashm->dbmeta.pagesize;
+
+ /* Copy the file's ID. */
+ memcpy(dbp->fileid, hashm->dbmeta.uid, DB_FILE_ID_LEN);
+
+ return (0);
+}
+
+/*
+ * __ham_init_meta --
+ *
+ * Initialize a hash meta-data page. We assume that the meta-data page is
+ * contiguous with the initial buckets that we create. If that turns out
+ * to be false, we'll fix it up later. Return the initial number of buckets
+ * allocated.
+ */
+static db_pgno_t
+__ham_init_meta(dbp, meta, pgno, lsnp)
+ DB *dbp;
+ HMETA *meta;
+ db_pgno_t pgno;
+ DB_LSN *lsnp;
+{
+#ifdef HAVE_PARTITION
+ DB_PARTITION *part;
+#endif
+ ENV *env;
+ HASH *hashp;
+ db_pgno_t nbuckets;
+ u_int i, l2;
+
+ env = dbp->env;
+ hashp = dbp->h_internal;
+
+ if (hashp->h_hash == NULL)
+ hashp->h_hash = DB_HASHVERSION < 5 ? __ham_func4 : __ham_func5;
+
+ if (hashp->h_nelem != 0 && hashp->h_ffactor != 0) {
+ nbuckets = (hashp->h_nelem - 1) / hashp->h_ffactor + 1;
+ l2 = __db_log2(nbuckets > 2 ? nbuckets : 2);
+ } else
+ l2 = 1;
+
+ /* Now make number of buckets a power of two. */
+ nbuckets = (db_pgno_t)(1 << l2);
+
+ memset(meta, 0, sizeof(HMETA));
+ meta->dbmeta.lsn = *lsnp;
+ meta->dbmeta.pgno = pgno;
+ meta->dbmeta.magic = DB_HASHMAGIC;
+ meta->dbmeta.version = DB_HASHVERSION;
+ meta->dbmeta.pagesize = dbp->pgsize;
+ if (F_ISSET(dbp, DB_AM_CHKSUM))
+ FLD_SET(meta->dbmeta.metaflags, DBMETA_CHKSUM);
+ if (F_ISSET(dbp, DB_AM_ENCRYPT)) {
+ meta->dbmeta.encrypt_alg = env->crypto_handle->alg;
+ DB_ASSERT(env, meta->dbmeta.encrypt_alg != 0);
+ meta->crypto_magic = meta->dbmeta.magic;
+ }
+ meta->dbmeta.type = P_HASHMETA;
+ meta->dbmeta.free = PGNO_INVALID;
+ meta->dbmeta.last_pgno = pgno;
+ meta->max_bucket = nbuckets - 1;
+ meta->high_mask = nbuckets - 1;
+ meta->low_mask = (nbuckets >> 1) - 1;
+ meta->ffactor = hashp->h_ffactor;
+ meta->nelem = hashp->h_nelem;
+ meta->h_charkey = hashp->h_hash(dbp, CHARKEY, sizeof(CHARKEY));
+ memcpy(meta->dbmeta.uid, dbp->fileid, DB_FILE_ID_LEN);
+
+ if (F_ISSET(dbp, DB_AM_DUP))
+ F_SET(&meta->dbmeta, DB_HASH_DUP);
+ if (F_ISSET(dbp, DB_AM_SUBDB))
+ F_SET(&meta->dbmeta, DB_HASH_SUBDB);
+ if (dbp->dup_compare != NULL)
+ F_SET(&meta->dbmeta, DB_HASH_DUPSORT);
+
+#ifdef HAVE_PARTITION
+ if ((part = dbp->p_internal) != NULL) {
+ meta->dbmeta.nparts = part->nparts;
+ if (F_ISSET(part, PART_CALLBACK))
+ FLD_SET(meta->dbmeta.metaflags, DBMETA_PART_CALLBACK);
+ if (F_ISSET(part, PART_RANGE))
+ FLD_SET(meta->dbmeta.metaflags, DBMETA_PART_RANGE);
+ }
+#endif
+
+ /*
+ * Create the first and second bucket pages so that we have their
+ * page numbers and can store the starting page number in the
+ * meta-data header (spares[0]).
+ */
+ meta->spares[0] = pgno + 1;
+
+ /* Fill in the last fields of the meta data page. */
+ for (i = 1; i <= l2; i++)
+ meta->spares[i] = meta->spares[0];
+ for (; i < NCACHED; i++)
+ meta->spares[i] = PGNO_INVALID;
+
+ return (nbuckets);
+}
+
+/*
+ * __ham_new_file --
+ * Create the necessary pages to begin a new database file. If name
+ * is NULL, then this is an unnamed file, the mpf has been set in the dbp
+ * and we simply create the pages using mpool. In this case, we don't log
+ * because we never have to redo an unnamed create and the undo simply
+ * frees resources.
+ *
+ * This code appears more complex than it is because of the two cases (named
+ * and unnamed). The way to read the code is that for each page being created,
+ * there are three parts: 1) a "get page" chunk (which either uses malloc'd
+ * memory or calls __memp_fget), 2) the initialization, and 3) the "put page"
+ * chunk which either does a fop write or an __memp_fput.
+ *
+ * PUBLIC: int __ham_new_file __P((DB *,
+ * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DB_FH *, const char *));
+ */
+int
+__ham_new_file(dbp, ip, txn, fhp, name)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ DB_FH *fhp;
+ const char *name;
+{
+ DBT pdbt;
+ DB_LSN lsn;
+ DB_MPOOLFILE *mpf;
+ DB_PGINFO pginfo;
+ ENV *env;
+ HMETA *meta;
+ PAGE *page;
+ int ret;
+ db_pgno_t lpgno;
+ void *buf;
+
+ env = dbp->env;
+ mpf = dbp->mpf;
+ meta = NULL;
+ page = NULL;
+ buf = NULL;
+
+ if (F_ISSET(dbp, DB_AM_INMEM)) {
+ /* Build meta-data page. */
+ lpgno = PGNO_BASE_MD;
+ if ((ret = __memp_fget(mpf, &lpgno,
+ ip, txn, DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &meta)) != 0)
+ return (ret);
+ LSN_NOT_LOGGED(lsn);
+ lpgno = __ham_init_meta(dbp, meta, PGNO_BASE_MD, &lsn);
+ meta->dbmeta.last_pgno = lpgno;
+ if ((ret = __db_log_page(dbp,
+ txn, &lsn, meta->dbmeta.pgno, (PAGE *)meta)) != 0)
+ goto err;
+ ret = __memp_fput(mpf, ip, meta, dbp->priority);
+ meta = NULL;
+ if (ret != 0)
+ goto err;
+
+ /* Allocate the final hash bucket. */
+ if ((ret = __memp_fget(mpf, &lpgno,
+ ip, txn, DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &page)) != 0)
+ goto err;
+ P_INIT(page,
+ dbp->pgsize, lpgno, PGNO_INVALID, PGNO_INVALID, 0, P_HASH);
+ LSN_NOT_LOGGED(page->lsn);
+ if ((ret =
+ __db_log_page(dbp, txn, &page->lsn, lpgno, page)) != 0)
+ goto err;
+ ret = __memp_fput(mpf, ip, page, dbp->priority);
+ page = NULL;
+ if (ret != 0)
+ goto err;
+ } else {
+ memset(&pdbt, 0, sizeof(pdbt));
+
+ /* Build meta-data page. */
+ pginfo.db_pagesize = dbp->pgsize;
+ pginfo.type = dbp->type;
+ pginfo.flags =
+ F_ISSET(dbp, (DB_AM_CHKSUM | DB_AM_ENCRYPT | DB_AM_SWAP));
+ pdbt.data = &pginfo;
+ pdbt.size = sizeof(pginfo);
+ if ((ret = __os_calloc(dbp->env, 1, dbp->pgsize, &buf)) != 0)
+ return (ret);
+ meta = (HMETA *)buf;
+ LSN_NOT_LOGGED(lsn);
+ lpgno = __ham_init_meta(dbp, meta, PGNO_BASE_MD, &lsn);
+ meta->dbmeta.last_pgno = lpgno;
+ if ((ret =
+ __db_pgout(env->dbenv, PGNO_BASE_MD, meta, &pdbt)) != 0)
+ goto err;
+ if ((ret = __fop_write(env, txn, name, dbp->dirname,
+ DB_APP_DATA, fhp,
+ dbp->pgsize, 0, 0, buf, dbp->pgsize, 1, F_ISSET(
+ dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0)) != 0)
+ goto err;
+ meta = NULL;
+
+ /* Allocate the final hash bucket. */
+#ifdef DIAGNOSTIC
+ memset(buf, 0, dbp->pgsize);
+#endif
+ page = (PAGE *)buf;
+ P_INIT(page,
+ dbp->pgsize, lpgno, PGNO_INVALID, PGNO_INVALID, 0, P_HASH);
+ LSN_NOT_LOGGED(page->lsn);
+ if ((ret = __db_pgout(env->dbenv, lpgno, buf, &pdbt)) != 0)
+ goto err;
+ if ((ret = __fop_write(env, txn, name, dbp->dirname,
+ DB_APP_DATA, fhp,
+ dbp->pgsize, lpgno, 0, buf, dbp->pgsize, 1, F_ISSET(
+ dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0)) != 0)
+ goto err;
+ page = NULL;
+ }
+
+err: if (buf != NULL)
+ __os_free(env, buf);
+ else {
+ if (meta != NULL)
+ (void)__memp_fput(mpf, ip, meta, dbp->priority);
+ if (page != NULL)
+ (void)__memp_fput(mpf, ip, page, dbp->priority);
+ }
+ return (ret);
+}
+
+/*
+ * __ham_new_subdb --
+ * Create the necessary pages to begin a new subdatabase.
+ *
+ * PUBLIC: int __ham_new_subdb __P((DB *, DB *, DB_THREAD_INFO *, DB_TXN *));
+ */
+int
+__ham_new_subdb(mdbp, dbp, ip, txn)
+ DB *mdbp, *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+{
+ DBC *dbc;
+ DBMETA *mmeta;
+ DB_LOCK lock, metalock, mmlock;
+ DB_LSN lsn;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ HMETA *meta;
+ PAGE *h;
+ int i, ret, t_ret;
+ db_pgno_t lpgno, mpgno;
+
+ env = mdbp->env;
+ mpf = mdbp->mpf;
+ dbc = NULL;
+ meta = NULL;
+ mmeta = NULL;
+ LOCK_INIT(lock);
+ LOCK_INIT(metalock);
+ LOCK_INIT(mmlock);
+
+ if ((ret = __db_cursor(mdbp, ip, txn,
+ &dbc, CDB_LOCKING(env) ? DB_WRITECURSOR : 0)) != 0)
+ return (ret);
+
+ /* Get and lock the new meta data page. */
+ if ((ret = __db_lget(dbc,
+ 0, dbp->meta_pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &dbp->meta_pgno, ip, dbc->txn,
+ DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &meta)) != 0)
+ goto err;
+
+ /* Initialize the new meta-data page. */
+ lsn = meta->dbmeta.lsn;
+ lpgno = __ham_init_meta(dbp, meta, dbp->meta_pgno, &lsn);
+
+ /*
+ * We are about to allocate a set of contiguous buckets (lpgno
+ * worth). We need to get the master meta-data page to figure
+ * out where these pages are and to allocate them. So, lock and
+ * get the master meta data page.
+ */
+ mpgno = PGNO_BASE_MD;
+ if ((ret = __db_lget(dbc, 0, mpgno, DB_LOCK_WRITE, 0, &mmlock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &mpgno, ip, dbc->txn,
+ DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &mmeta)) != 0)
+ goto err;
+
+ /*
+ * Now update the hash meta-data page to reflect where the first
+ * set of buckets are actually located.
+ */
+ meta->spares[0] = mmeta->last_pgno + 1;
+ for (i = 0; i < NCACHED && meta->spares[i] != PGNO_INVALID; i++)
+ meta->spares[i] = meta->spares[0];
+
+ /* The new meta data page is now complete; log it. */
+ if ((ret = __db_log_page(mdbp,
+ txn, &meta->dbmeta.lsn, dbp->meta_pgno, (PAGE *)meta)) != 0)
+ goto err;
+
+ /* Reflect the group allocation. */
+ if (DBENV_LOGGING(env)
+#if !defined(DEBUG_WOP)
+ && txn != NULL
+#endif
+ )
+ if ((ret = __ham_groupalloc_log(mdbp, txn,
+ &LSN(mmeta), 0, &LSN(mmeta), meta->spares[0],
+ meta->max_bucket + 1, 0, mmeta->last_pgno)) != 0)
+ goto err;
+
+ /* Release the new meta-data page. */
+ if ((ret = __memp_fput(mpf, ip, meta, dbc->priority)) != 0)
+ goto err;
+ meta = NULL;
+
+ lpgno += mmeta->last_pgno;
+
+ /* Now allocate the final hash bucket. */
+ if ((ret = __db_lget(dbc, 0, lpgno, DB_LOCK_WRITE, 0, &lock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &lpgno, ip, dbc->txn,
+ DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &h)) != 0)
+ goto err;
+
+ mmeta->last_pgno = lpgno;
+ P_INIT(h, dbp->pgsize, lpgno, PGNO_INVALID, PGNO_INVALID, 0, P_HASH);
+ LSN(h) = LSN(mmeta);
+ if ((ret = __memp_fput(mpf, ip, h, dbc->priority)) != 0)
+ goto err;
+
+err: /* Now put the master-metadata page back. */
+ if (mmeta != NULL && (t_ret = __memp_fput(mpf,
+ ip, mmeta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __LPUT(dbc, mmlock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (meta != NULL && (t_ret = __memp_fput(mpf,
+ ip, meta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __LPUT(dbc, lock)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __LPUT(dbc, metalock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (dbc != NULL)
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
diff --git a/src/hash/hash_page.c b/src/hash/hash_page.c
new file mode 100644
index 00000000..7576fe61
--- /dev/null
+++ b/src/hash/hash_page.c
@@ -0,0 +1,3182 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994
+ * Margo Seltzer. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Margo Seltzer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+/*
+ * PACKAGE: hashing
+ *
+ * DESCRIPTION:
+ * Page manipulation for hashing package.
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/hash.h"
+#include "dbinc/btree.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+
+static int __hamc_delpg
+ __P((DBC *, db_pgno_t, db_pgno_t, u_int32_t, db_ham_mode, u_int32_t *));
+static int __ham_getindex_sorted
+ __P((DBC *, PAGE *, const DBT *, u_int32_t, int *, db_indx_t *));
+static int __ham_getindex_unsorted
+ __P((DBC *, PAGE *, const DBT *, int *, db_indx_t *));
+static int __hamc_delpg_getorder
+ __P((DBC *, DBC *, u_int32_t *, db_pgno_t, u_int32_t, void *));
+static int __hamc_delpg_setorder
+ __P((DBC *, DBC *, u_int32_t *, db_pgno_t, u_int32_t, void *));
+
+/*
+ * PUBLIC: int __ham_item __P((DBC *, db_lockmode_t, db_pgno_t *));
+ */
+int
+__ham_item(dbc, mode, pgnop)
+ DBC *dbc;
+ db_lockmode_t mode;
+ db_pgno_t *pgnop;
+{
+ DB *dbp;
+ HASH_CURSOR *hcp;
+ db_pgno_t next_pgno;
+ int ret;
+
+ dbp = dbc->dbp;
+ hcp = (HASH_CURSOR *)dbc->internal;
+
+ if (F_ISSET(hcp, H_DELETED)) {
+ __db_errx(dbp->env, DB_STR("1132",
+ "Attempt to return a deleted item"));
+ return (EINVAL);
+ }
+ F_CLR(hcp, H_OK | H_NOMORE);
+
+ /* Check if we need to get a page for this cursor. */
+ if ((ret = __ham_get_cpage(dbc, mode)) != 0)
+ return (ret);
+
+recheck:
+ /* Check if we are looking for space in which to insert an item. */
+ if (hcp->seek_size != 0 && hcp->seek_found_page == PGNO_INVALID &&
+ hcp->seek_size < P_FREESPACE(dbp, hcp->page)) {
+ hcp->seek_found_page = hcp->pgno;
+ hcp->seek_found_indx = NDX_INVALID;
+ }
+
+ /* Check for off-page duplicates. */
+ if (hcp->indx < NUM_ENT(hcp->page) &&
+ HPAGE_TYPE(dbp, hcp->page, H_DATAINDEX(hcp->indx)) == H_OFFDUP) {
+ memcpy(pgnop,
+ HOFFDUP_PGNO(H_PAIRDATA(dbp, hcp->page, hcp->indx)),
+ sizeof(db_pgno_t));
+ F_SET(hcp, H_OK);
+ return (0);
+ }
+
+ /* Check if we need to go on to the next page. */
+ if (F_ISSET(hcp, H_ISDUP))
+ /*
+ * ISDUP is set and dup_off is at the beginning of the current
+ * element, so grab the element's length from its leading
+ * length field.
+ */
+ memcpy(&hcp->dup_len,
+ HKEYDATA_DATA(H_PAIRDATA(dbp, hcp->page, hcp->indx)) +
+ hcp->dup_off, sizeof(db_indx_t));
+
+ if (hcp->indx >= (db_indx_t)NUM_ENT(hcp->page)) {
+ /* Fetch next page. */
+ if (NEXT_PGNO(hcp->page) == PGNO_INVALID) {
+ F_SET(hcp, H_NOMORE);
+ return (DB_NOTFOUND);
+ }
+ next_pgno = NEXT_PGNO(hcp->page);
+ hcp->indx = 0;
+ if ((ret = __ham_next_cpage(dbc, next_pgno)) != 0)
+ return (ret);
+ goto recheck;
+ }
+
+ F_SET(hcp, H_OK);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __ham_item_reset __P((DBC *));
+ */
+int
+__ham_item_reset(dbc)
+ DBC *dbc;
+{
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ HASH_CURSOR *hcp;
+ int ret, t_ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ hcp = (HASH_CURSOR *)dbc->internal;
+
+ ret = 0;
+ if (hcp->page != NULL) {
+ ret = __memp_fput(mpf,
+ dbc->thread_info, hcp->page, dbc->priority);
+ hcp->page = NULL;
+ }
+
+ if ((t_ret = __ham_item_init(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __ham_item_init __P((DBC *));
+ */
+int
+__ham_item_init(dbc)
+ DBC *dbc;
+{
+ HASH_CURSOR *hcp;
+ int ret;
+
+ hcp = (HASH_CURSOR *)dbc->internal;
+
+ /*
+ * If this cursor still holds any locks, we must release them if
+ * we are not running with transactions.
+ */
+ ret = __TLPUT(dbc, hcp->lock);
+
+ /*
+ * The following fields must *not* be initialized here because they
+ * may have meaning across inits.
+ * hlock, hdr, split_buf, stats
+ */
+ hcp->bucket = BUCKET_INVALID;
+ hcp->lbucket = BUCKET_INVALID;
+ LOCK_INIT(hcp->lock);
+ hcp->lock_mode = DB_LOCK_NG;
+ hcp->dup_off = 0;
+ hcp->dup_len = 0;
+ hcp->dup_tlen = 0;
+ hcp->seek_size = 0;
+ hcp->seek_found_page = PGNO_INVALID;
+ hcp->seek_found_indx = NDX_INVALID;
+ hcp->flags = 0;
+
+ hcp->pgno = PGNO_INVALID;
+ hcp->indx = NDX_INVALID;
+ hcp->page = NULL;
+
+ return (ret);
+}
+
+/*
+ * Returns the last item in a bucket.
+ *
+ * PUBLIC: int __ham_item_last __P((DBC *, db_lockmode_t, db_pgno_t *));
+ */
+int
+__ham_item_last(dbc, mode, pgnop)
+ DBC *dbc;
+ db_lockmode_t mode;
+ db_pgno_t *pgnop;
+{
+ HASH_CURSOR *hcp;
+ int ret;
+
+ hcp = (HASH_CURSOR *)dbc->internal;
+ if ((ret = __ham_item_reset(dbc)) != 0)
+ return (ret);
+
+ hcp->bucket = hcp->hdr->max_bucket;
+ hcp->pgno = BUCKET_TO_PAGE(hcp, hcp->bucket);
+ F_SET(hcp, H_OK);
+ return (__ham_item_prev(dbc, mode, pgnop));
+}
+
+/*
+ * PUBLIC: int __ham_item_first __P((DBC *, db_lockmode_t, db_pgno_t *));
+ */
+int
+__ham_item_first(dbc, mode, pgnop)
+ DBC *dbc;
+ db_lockmode_t mode;
+ db_pgno_t *pgnop;
+{
+ HASH_CURSOR *hcp;
+ int ret;
+
+ hcp = (HASH_CURSOR *)dbc->internal;
+ if ((ret = __ham_item_reset(dbc)) != 0)
+ return (ret);
+ F_SET(hcp, H_OK);
+ hcp->bucket = 0;
+ hcp->pgno = BUCKET_TO_PAGE(hcp, hcp->bucket);
+ hcp->dup_off = 0;
+ return (__ham_item_next(dbc, mode, pgnop));
+}
+
+/*
+ * __ham_item_prev --
+ * Returns a pointer to key/data pair on a page. In the case of
+ * bigkeys, just returns the page number and index of the bigkey
+ * pointer pair.
+ *
+ * PUBLIC: int __ham_item_prev __P((DBC *, db_lockmode_t, db_pgno_t *));
+ */
+int
+__ham_item_prev(dbc, mode, pgnop)
+ DBC *dbc;
+ db_lockmode_t mode;
+ db_pgno_t *pgnop;
+{
+ DB *dbp;
+ HASH_CURSOR *hcp;
+ db_pgno_t next_pgno;
+ int ret;
+
+ hcp = (HASH_CURSOR *)dbc->internal;
+ dbp = dbc->dbp;
+
+ /*
+ * There are 5 cases for backing up in a hash file.
+ * Case 1: In the middle of a page, no duplicates, just dec the index.
+ * Case 2: In the middle of a duplicate set, back up one.
+ * Case 3: At the beginning of a duplicate set, get out of set and
+ * back up to next key.
+ * Case 4: At the beginning of a page; go to previous page.
+ * Case 5: At the beginning of a bucket; go to prev bucket.
+ */
+ F_CLR(hcp, H_OK | H_NOMORE | H_DELETED);
+
+ if ((ret = __ham_get_cpage(dbc, mode)) != 0)
+ return (ret);
+
+ /*
+ * First handle the duplicates. Either you'll get the key here
+ * or you'll exit the duplicate set and drop into the code below
+ * to handle backing up through keys.
+ */
+ if (!F_ISSET(hcp, H_NEXT_NODUP) && F_ISSET(hcp, H_ISDUP)) {
+ if (HPAGE_TYPE(dbp, hcp->page, H_DATAINDEX(hcp->indx)) ==
+ H_OFFDUP) {
+ memcpy(pgnop,
+ HOFFDUP_PGNO(H_PAIRDATA(dbp, hcp->page, hcp->indx)),
+ sizeof(db_pgno_t));
+ F_SET(hcp, H_OK);
+ return (0);
+ }
+
+ /* Duplicates are on-page. */
+ if (hcp->dup_off != 0) {
+ memcpy(&hcp->dup_len, HKEYDATA_DATA(
+ H_PAIRDATA(dbp, hcp->page, hcp->indx))
+ + hcp->dup_off - sizeof(db_indx_t),
+ sizeof(db_indx_t));
+ hcp->dup_off -=
+ DUP_SIZE(hcp->dup_len);
+ return (__ham_item(dbc, mode, pgnop));
+ }
+ }
+
+ /*
+ * If we get here, we are not in a duplicate set, and just need
+ * to back up the cursor. There are still three cases:
+ * midpage, beginning of page, beginning of bucket.
+ */
+
+ if (F_ISSET(hcp, H_DUPONLY)) {
+ F_CLR(hcp, H_OK);
+ F_SET(hcp, H_NOMORE);
+ return (0);
+ } else
+ /*
+ * We are no longer in a dup set; flag this so the dup code
+ * will reinitialize should we stumble upon another one.
+ */
+ F_CLR(hcp, H_ISDUP);
+
+ if (hcp->indx == 0) { /* Beginning of page. */
+ hcp->pgno = PREV_PGNO(hcp->page);
+ if (hcp->pgno == PGNO_INVALID) {
+ /* Beginning of bucket. */
+ F_SET(hcp, H_NOMORE);
+ return (DB_NOTFOUND);
+ } else if ((ret =
+ __ham_next_cpage(dbc, hcp->pgno)) != 0)
+ return (ret);
+ else
+ hcp->indx = NUM_ENT(hcp->page);
+ }
+
+ /*
+ * Either we've got the cursor set up to be decremented, or we
+ * have to find the end of a bucket.
+ */
+ if (hcp->indx == NDX_INVALID) {
+ DB_ASSERT(dbp->env, hcp->page != NULL);
+
+ hcp->indx = NUM_ENT(hcp->page);
+ for (next_pgno = NEXT_PGNO(hcp->page);
+ next_pgno != PGNO_INVALID;
+ next_pgno = NEXT_PGNO(hcp->page)) {
+ if ((ret = __ham_next_cpage(dbc, next_pgno)) != 0)
+ return (ret);
+ hcp->indx = NUM_ENT(hcp->page);
+ }
+
+ if (hcp->indx == 0) {
+ /* Bucket was empty. */
+ F_SET(hcp, H_NOMORE);
+ return (DB_NOTFOUND);
+ }
+ }
+
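+ /* Entries are key/data pairs, so one step back is two index slots. */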
+ hcp->indx -= 2;
+
+ return (__ham_item(dbc, mode, pgnop));
+}
+
+/*
+ * Sets the cursor to the next key/data pair on a page.
+ *
+ * PUBLIC: int __ham_item_next __P((DBC *, db_lockmode_t, db_pgno_t *));
+ */
+int
+__ham_item_next(dbc, mode, pgnop)
+ DBC *dbc;
+ db_lockmode_t mode;
+ db_pgno_t *pgnop;
+{
+ HASH_CURSOR *hcp;
+ int ret;
+
+ hcp = (HASH_CURSOR *)dbc->internal;
+
+ if ((ret = __ham_get_cpage(dbc, mode)) != 0)
+ return (ret);
+
+ /*
+ * Deleted on-page duplicates are a weird case. If we delete the last
+ * one, then our cursor is at the very end of a duplicate set and
+ * we actually need to go on to the next key.
+ */
+ if (F_ISSET(hcp, H_DELETED)) {
+ if (hcp->indx != NDX_INVALID &&
+ F_ISSET(hcp, H_ISDUP) &&
+ HPAGE_TYPE(dbc->dbp, hcp->page, H_DATAINDEX(hcp->indx))
+ == H_DUPLICATE && hcp->dup_tlen == hcp->dup_off) {
+ if (F_ISSET(hcp, H_DUPONLY)) {
+ F_CLR(hcp, H_OK);
+ F_SET(hcp, H_NOMORE);
+ return (0);
+ } else {
+ F_CLR(hcp, H_ISDUP);
+ hcp->indx += 2;
+ }
+ } else if (!F_ISSET(hcp, H_ISDUP) && F_ISSET(hcp, H_DUPONLY)) {
+ F_CLR(hcp, H_OK);
+ F_SET(hcp, H_NOMORE);
+ return (0);
+ } else if (F_ISSET(hcp, H_ISDUP) &&
+ F_ISSET(hcp, H_NEXT_NODUP)) {
+ F_CLR(hcp, H_ISDUP);
+ hcp->indx += 2;
+ }
+ F_CLR(hcp, H_DELETED);
+ } else if (hcp->indx == NDX_INVALID) {
+ hcp->indx = 0;
+ F_CLR(hcp, H_ISDUP);
+ } else if (F_ISSET(hcp, H_NEXT_NODUP)) {
+ hcp->indx += 2;
+ F_CLR(hcp, H_ISDUP);
+ } else if (F_ISSET(hcp, H_ISDUP) && hcp->dup_tlen != 0) {
+ if (hcp->dup_off + DUP_SIZE(hcp->dup_len) >=
+ hcp->dup_tlen && F_ISSET(hcp, H_DUPONLY)) {
+ F_CLR(hcp, H_OK);
+ F_SET(hcp, H_NOMORE);
+ return (0);
+ }
+ hcp->dup_off += DUP_SIZE(hcp->dup_len);
+ if (hcp->dup_off >= hcp->dup_tlen) {
+ F_CLR(hcp, H_ISDUP);
+ hcp->indx += 2;
+ }
+ } else if (F_ISSET(hcp, H_DUPONLY)) {
+ F_CLR(hcp, H_OK);
+ F_SET(hcp, H_NOMORE);
+ return (0);
+ } else {
+ hcp->indx += 2;
+ F_CLR(hcp, H_ISDUP);
+ }
+
+ ret = __ham_item(dbc, mode, pgnop);
+ return (ret);
+}
+
+/*
+ * __ham_insertpair --
+ *
+ * Used for adding a pair of elements to a sorted page. We are guaranteed that
+ * the pair will fit on this page.
+ *
+ * indexp will return the point at which we inserted the pair.
+ *
+ * We're overloading the meaning of the H_OFFPAGE type here, which is a little
+ * bit sleazy. When we recover deletes, we have the entire entry instead of
+ * having only the DBT, so we'll pass type H_OFFPAGE to mean "copy the whole
+ * entry" as opposed to constructing an H_KEYDATA around it. In the recovery
+ * case it is assumed that a valid index is passed in, since a lookup using
+ * the overloaded H_OFFPAGE key will be incorrect.
+ *
+ * PUBLIC: int __ham_insertpair __P((DBC *, PAGE *p,
+ * PUBLIC: db_indx_t *indxp, const DBT *,
+ * PUBLIC: const DBT *, u_int32_t, u_int32_t));
+ */
+int
+__ham_insertpair(dbc, p, indxp, key_dbt, data_dbt, key_type, data_type)
+ DBC *dbc;
+ PAGE *p;
+ db_indx_t *indxp;
+ const DBT *key_dbt, *data_dbt;
+ u_int32_t key_type, data_type;
+{
+ DB *dbp;
+ u_int16_t n, indx;
+ db_indx_t *inp;
+ u_int32_t ksize, dsize, increase, distance;
+ u_int8_t *offset;
+ int i;
+
+ dbp = dbc->dbp;
+ n = NUM_ENT(p);
+ inp = P_INP(dbp, p);
+ ksize = (key_type == H_OFFPAGE) ?
+ key_dbt->size : HKEYDATA_SIZE(key_dbt->size);
+ dsize = (data_type == H_OFFPAGE || data_type == H_OFFDUP) ?
+ data_dbt->size : HKEYDATA_SIZE(data_dbt->size);
+ increase = ksize + dsize;
+
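+	/* The pair needs its item bytes plus two index-table slots. */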
+ DB_ASSERT(dbp->env, indxp != NULL && *indxp != NDX_INVALID);
+ DB_ASSERT(dbp->env,
+ P_FREESPACE(dbp, p) >= dsize + ksize + 2 * sizeof(db_indx_t));
+ indx = *indxp;
+
+	/* Special case if the page is empty or inserting at end of page. */
+ if (n == 0 || indx == n) {
+ inp[indx] = HOFFSET(p) - ksize;
+ inp[indx+1] = HOFFSET(p) - increase;
+ } else {
+ /*
+ * Shuffle the data elements.
+ *
+ * For example, inserting an element that sorts between items
+ * 2 and 3 on a page:
+ * The copy starts from the beginning of the second item.
+ *
+ * ---------------------------
+ * |pgheader..
+ * |__________________________
+ * ||1|2|3|4|...
+ * |--------------------------
+ * |
+ * |__________________________
+ * | ...|4|3|2|1|
+ * |--------------------------
+ * ---------------------------
+ *
+ * Becomes:
+ *
+ * ---------------------------
+ * |pgheader..
+ * |__________________________
+ * ||1|2|2a|3|4|...
+ * |--------------------------
+ * |
+ * |__________________________
+ * | ...|4|3|2a|2|1|
+ * |--------------------------
+ * ---------------------------
+ *
+		 * Indexes 3, 4, etc. move down the page.
+		 * The data for 3, 4, etc. moves up the page by sizeof(2a).
+		 * The index pointers in 3, 4, etc. are updated to point at
+		 * the relocated data.
+ * It is necessary to move the data (not just adjust the index)
+ * since the hash format uses consecutive data items to
+ * dynamically calculate the item size.
+ * An item in this example is a key/data pair.
+ */
+ offset = (u_int8_t *)p + HOFFSET(p);
+ if (indx == 0)
+ distance = dbp->pgsize - HOFFSET(p);
+ else
+ distance = (u_int32_t)
+ (P_ENTRY(dbp, p, indx - 1) - offset);
+ memmove(offset - increase, offset, distance);
+
+ /* Shuffle the index array */
+ memmove(&inp[indx + 2], &inp[indx],
+ (n - indx) * sizeof(db_indx_t));
+
+		/* Update the index array. */
+ for (i = indx + 2; i < n + 2; i++)
+ inp[i] -= increase;
+
+		/* Set the new index elements. */
+ inp[indx] = (HOFFSET(p) - increase) + distance + dsize;
+ inp[indx + 1] = (HOFFSET(p) - increase) + distance;
+ }
+
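+	/* Claim the space for the new pair from the free region. */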
+ HOFFSET(p) -= increase;
+	/* Insert the new elements. */
+ if (key_type == H_OFFPAGE)
+ memcpy(P_ENTRY(dbp, p, indx), key_dbt->data, key_dbt->size);
+ else
+ PUT_HKEYDATA(P_ENTRY(dbp, p, indx), key_dbt->data,
+ key_dbt->size, key_type);
+ if (data_type == H_OFFPAGE || data_type == H_OFFDUP)
+ memcpy(P_ENTRY(dbp, p, indx+1), data_dbt->data,
+ data_dbt->size);
+ else
+ PUT_HKEYDATA(P_ENTRY(dbp, p, indx+1), data_dbt->data,
+ data_dbt->size, data_type);
+ NUM_ENT(p) += 2;
+
+ /*
+ * If debugging a sorted hash page problem, this is a good place to
+ * insert a call to __ham_verify_sorted_page.
+ * It used to be called when diagnostic mode was enabled, but that
+ * causes problems in recovery if a custom comparator was used.
+ */
+ return (0);
+}
+
+/*
+ * __ham_getindex --
+ *
+ * The key_type parameter overloads the entry type to allow for comparison of
+ * a key DBT that contains off-page data. A key that is not of type H_OFFPAGE
+ * might contain data larger than the page size, since this routine can be
+ * called with user-provided DBTs.
+ *
+ * PUBLIC: int __ham_getindex __P((DBC *,
+ * PUBLIC: PAGE *, const DBT *, u_int32_t, int *, db_indx_t *));
+ */
+int
+__ham_getindex(dbc, p, key, key_type, match, indx)
+ DBC *dbc;
+ PAGE *p;
+ const DBT *key;
+ u_int32_t key_type;
+ int *match;
+ db_indx_t *indx;
+{
+	/* All entries are key/data pairs, so the entry count must be even. */
+	DB_ASSERT(dbc->env, NUM_ENT(p) % 2 == 0);
+
+ /* Support pre 4.6 unsorted hash pages. */
+ if (p->type == P_HASH_UNSORTED)
+ return (__ham_getindex_unsorted(dbc, p, key, match, indx));
+ else
+ return (__ham_getindex_sorted(dbc,
+ p, key, key_type, match, indx));
+}
+
+#undef min
+#define min(a, b) (((a) < (b)) ? (a) : (b))
+
+/*
+ * Perform a linear search of an unsorted (pre 4.6 format) hash page.
+ *
+ * This routine is never used to generate an index for insertion, because any
+ * unsorted page is sorted before we insert.
+ *
+ * Sets *match to 0 if an exact match is found, with indx set to the
+ * matching element.  Sets *match to 1 if the item does not exist; indx is
+ * then set just past the last pair on the page.  The return value is 0
+ * unless an error occurs.
+ */
+static int
+__ham_getindex_unsorted(dbc, p, key, match, indx)
+ DBC *dbc;
+ PAGE *p;
+ const DBT *key;
+ int *match;
+ db_indx_t *indx;
+{
+ DB *dbp;
+ DBT pg_dbt;
+ HASH *t;
+ db_pgno_t pgno;
+ int i, n, res, ret;
+ u_int32_t tlen;
+ u_int8_t *hk;
+
+ dbp = dbc->dbp;
+ n = NUM_ENT(p);
+ t = dbp->h_internal;
+ res = 1;
+
+ /* Do a linear search over the page looking for an exact match */
+	for (i = 0; i < n; i += 2) {
+ hk = H_PAIRKEY(dbp, p, i);
+ switch (HPAGE_PTYPE(hk)) {
+ case H_OFFPAGE:
+ /* extract item length from possibly unaligned DBT */
+ memcpy(&tlen, HOFFPAGE_TLEN(hk), sizeof(u_int32_t));
+ if (tlen == key->size) {
+ memcpy(&pgno,
+ HOFFPAGE_PGNO(hk), sizeof(db_pgno_t));
+ if ((ret = __db_moff(dbc, key, pgno, tlen,
+ t->h_compare, &res)) != 0)
+ return (ret);
+ }
+ break;
+ case H_KEYDATA:
+ if (t->h_compare != NULL) {
+ DB_INIT_DBT(pg_dbt,
+ HKEYDATA_DATA(hk), key->size);
+ if (t->h_compare(
+ dbp, key, &pg_dbt) != 0)
+ break;
+ } else if (key->size ==
+ LEN_HKEY(dbp, p, dbp->pgsize, i))
+ res = memcmp(key->data, HKEYDATA_DATA(hk),
+ key->size);
+ break;
+ case H_DUPLICATE:
+ case H_OFFDUP:
+ /*
+ * These are errors because keys are never duplicated.
+ */
+ /* FALLTHROUGH */
+ default:
+ return (__db_pgfmt(dbp->env, PGNO(p)));
+ }
+ if (res == 0)
+ break;
+ }
+ *indx = i;
+ *match = (res == 0 ? 0 : 1);
+ return (0);
+}
+
+/*
+ * Perform a binary search of a sorted hash page for a key.
+ * Sets *match to 0 if an exact match is found, with indx set to the
+ * matching element.  Sets *match to 1 if the item does not exist; indx is
+ * then set to the first element greater than the requested item.  The
+ * return value is 0 unless an error occurs.
+ */
+static int
+__ham_getindex_sorted(dbc, p, key, key_type, match, indxp)
+ DBC *dbc;
+ PAGE *p;
+ const DBT *key;
+ u_int32_t key_type;
+ int *match;
+ db_indx_t *indxp;
+{
+ DB *dbp;
+ DBT tmp_dbt;
+ HASH *t;
+ HOFFPAGE *offp;
+ db_indx_t indx;
+ db_pgno_t off_pgno, koff_pgno;
+ u_int32_t base, itemlen, lim, off_len;
+ u_int8_t *entry;
+ int res, ret;
+ void *data;
+
+ dbp = dbc->dbp;
+	DB_ASSERT(dbp->env, p->type == P_HASH);
+
+ t = dbp->h_internal;
+ /* Initialize so the return params are correct for empty pages. */
+ res = indx = 0;
+
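+	/*
+	 * The binary search macros step by 2 because each key/data pair
+	 * occupies two index entries; indx always lands on the key half
+	 * of a pair.
+	 */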
+ /* Do a binary search for the element. */
+ DB_BINARY_SEARCH_FOR(base, lim, NUM_ENT(p), 2) {
+ DB_BINARY_SEARCH_INCR(indx, base, lim, 2);
+ data = HKEYDATA_DATA(H_PAIRKEY(dbp, p, indx));
+ /*
+ * There are 4 cases here:
+ * 1) Off page key, off page match
+ * 2) Off page key, on page match
+ * 3) On page key, off page match
+ * 4) On page key, on page match
+ */
+ entry = P_ENTRY(dbp, p, indx);
+ if (*entry == H_OFFPAGE) {
+ offp = (HOFFPAGE*)P_ENTRY(dbp, p, indx);
+ (void)__ua_memcpy(&itemlen, HOFFPAGE_TLEN(offp),
+ sizeof(u_int32_t));
+ if (key_type == H_OFFPAGE) {
+ /*
+ * Case 1.
+ *
+ * If both key and cmp DBTs refer to different
+ * offpage items, it is necessary to compare
+ * the content of the entries, in order to be
+ * able to maintain a valid lexicographic sort
+ * order.
+ */
+ (void)__ua_memcpy(&koff_pgno,
+ HOFFPAGE_PGNO(key->data),
+ sizeof(db_pgno_t));
+ (void)__ua_memcpy(&off_pgno,
+ HOFFPAGE_PGNO(offp), sizeof(db_pgno_t));
+ if (koff_pgno == off_pgno)
+ res = 0;
+ else {
+ memset(&tmp_dbt, 0, sizeof(tmp_dbt));
+ tmp_dbt.size = HOFFPAGE_SIZE;
+ tmp_dbt.data = offp;
+ if ((ret = __db_coff(dbc, key, &tmp_dbt,
+ t->h_compare, &res)) != 0)
+ return (ret);
+ }
+ } else {
+ /* Case 2 */
+ (void)__ua_memcpy(&off_pgno,
+ HOFFPAGE_PGNO(offp), sizeof(db_pgno_t));
+ if ((ret = __db_moff(dbc, key, off_pgno,
+ itemlen, t->h_compare, &res)) != 0)
+ return (ret);
+ }
+ } else {
+ itemlen = LEN_HKEYDATA(dbp, p, dbp->pgsize, indx);
+ if (key_type == H_OFFPAGE) {
+ /* Case 3 */
+ tmp_dbt.data = data;
+ tmp_dbt.size = itemlen;
+ offp = (HOFFPAGE *)key->data;
+ (void)__ua_memcpy(&off_pgno,
+ HOFFPAGE_PGNO(offp), sizeof(db_pgno_t));
+ (void)__ua_memcpy(&off_len, HOFFPAGE_TLEN(offp),
+ sizeof(u_int32_t));
+ if ((ret = __db_moff(dbc, &tmp_dbt, off_pgno,
+ off_len, t->h_compare, &res)) != 0)
+ return (ret);
+ /*
+ * Since we switched the key/match parameters
+ * in the __db_moff call, the result needs to
+ * be inverted.
+ */
+ res = -res;
+ } else if (t->h_compare != NULL) {
+ /* Case 4, with a user comparison func */
+ DB_INIT_DBT(tmp_dbt, data, itemlen);
+ res = t->h_compare(dbp, key, &tmp_dbt);
+ } else {
+ /* Case 4, without a user comparison func */
+ if ((res = memcmp(key->data, data,
+ min(key->size, itemlen))) == 0)
+ res = itemlen > key->size ? 1 :
+ (itemlen < key->size ? -1 : 0);
+ }
+ }
+ if (res == 0) {
+ /* Found a match */
+ *indxp = indx;
+ *match = 0;
+ return (0);
+ } else if (res > 0)
+ DB_BINARY_SEARCH_SHIFT_BASE(indx, base, lim, 2);
+ }
+	/*
+	 * If no match was found and the comparison indicates that the
+	 * closest match was lexicographically less than the input key,
+	 * adjust the insertion index to be after the index of the
+	 * closest match.
+	 */
+ if (res > 0)
+ indx += 2;
+ *indxp = indx;
+ *match = 1;
+ return (0);
+}
+
+/*
+ * PUBLIC: int __ham_verify_sorted_page __P((DBC *, PAGE *));
+ *
+ * The __ham_verify_sorted_page function is used to determine the correctness
+ * of sorted hash pages. The checks are used by verification; they are
+ * implemented in the hash code because they are also useful debugging aids.
+ */
+int
+__ham_verify_sorted_page(dbc, p)
+ DBC *dbc;
+ PAGE *p;
+{
+ DB *dbp;
+ DBT prev_dbt, curr_dbt;
+ ENV *env;
+ HASH *t;
+ db_pgno_t tpgno;
+ u_int32_t curr_len, prev_len, tlen;
+ u_int16_t *indxp;
+ db_indx_t i, n;
+ int res, ret;
+ char *prev, *curr;
+
+	/* All entries are key/data pairs, so the entry count must be even. */
+	n = NUM_ENT(p);
+	dbp = dbc->dbp;
+	DB_ASSERT(dbp->env, n % 2 == 0);
+
+ env = dbp->env;
+ t = dbp->h_internal;
+
+ /* Disable verification if a custom comparator is supplied */
+ if (t->h_compare != NULL)
+ return (0);
+
+ /* Iterate through page, ensuring order */
+ prev = (char *)HKEYDATA_DATA(H_PAIRKEY(dbp, p, 0));
+ prev_len = LEN_HKEYDATA(dbp, p, dbp->pgsize, 0);
+	for (i = 2; i < n; i += 2) {
+ curr = (char *)HKEYDATA_DATA(H_PAIRKEY(dbp, p, i));
+ curr_len = LEN_HKEYDATA(dbp, p, dbp->pgsize, i);
+
+ if (HPAGE_TYPE(dbp, p, i-2) == H_OFFPAGE &&
+ HPAGE_TYPE(dbp, p, i) == H_OFFPAGE) {
+ memset(&prev_dbt, 0, sizeof(prev_dbt));
+ memset(&curr_dbt, 0, sizeof(curr_dbt));
+ prev_dbt.size = curr_dbt.size = HOFFPAGE_SIZE;
+ prev_dbt.data = H_PAIRKEY(dbp, p, i-2);
+ curr_dbt.data = H_PAIRKEY(dbp, p, i);
+ if ((ret = __db_coff(dbc,
+ &prev_dbt, &curr_dbt, t->h_compare, &res)) != 0)
+ return (ret);
+ } else if (HPAGE_TYPE(dbp, p, i-2) == H_OFFPAGE) {
+			memset(&curr_dbt, 0, sizeof(curr_dbt));
+			curr_dbt.size = curr_len;
+			curr_dbt.data = curr;
+			memcpy(&tlen, HOFFPAGE_TLEN(H_PAIRKEY(dbp, p, i-2)),
+			    sizeof(u_int32_t));
+			memcpy(&tpgno, HOFFPAGE_PGNO(H_PAIRKEY(dbp, p, i-2)),
+			    sizeof(db_pgno_t));
+			if ((ret = __db_moff(dbc,
+			    &curr_dbt, tpgno, tlen, t->h_compare, &res)) != 0)
+				return (ret);
+			/*
+			 * __db_moff compared curr against the off-page prev
+			 * item; invert the result so that res remains
+			 * cmp(prev, curr).
+			 */
+			res = -res;
+ } else if (HPAGE_TYPE(dbp, p, i) == H_OFFPAGE) {
+ memset(&prev_dbt, 0, sizeof(prev_dbt));
+ prev_dbt.size = prev_len;
+			prev_dbt.data = prev;
+ memcpy(&tlen, HOFFPAGE_TLEN(H_PAIRKEY(dbp, p, i)),
+ sizeof(u_int32_t));
+ memcpy(&tpgno, HOFFPAGE_PGNO(H_PAIRKEY(dbp, p, i)),
+ sizeof(db_pgno_t));
+ if ((ret = __db_moff(dbc,
+ &prev_dbt, tpgno, tlen, t->h_compare, &res)) != 0)
+ return (ret);
+ } else
+ res = memcmp(prev, curr, min(curr_len, prev_len));
+
+		/* Equal prefixes: the shorter key sorts first. */
+		if (res == 0 && curr_len > prev_len)
+			res = -1;
+		else if (res == 0 && curr_len < prev_len)
+			res = 1;
+
+ if (res >= 0) {
+ __db_msg(env, "key1: %s, key2: %s, len: %lu\n",
+ (char *)prev, (char *)curr,
+ (u_long)min(curr_len, prev_len));
+ __db_msg(env, "curroffset %lu\n", (u_long)i);
+ __db_msg(env, "indexes: ");
+ for (i = 0; i < n; i++) {
+ indxp = P_INP(dbp, p) + i;
+ __db_msg(env, "%04X, ", *indxp);
+ }
+ __db_msg(env, "\n");
+#ifdef HAVE_STATISTICS
+ if ((ret = __db_prpage(dbp, p, DB_PR_PAGE)) != 0)
+ return (ret);
+#endif
+ DB_ASSERT(dbp->env, res < 0);
+ }
+
+ prev = curr;
+ prev_len = curr_len;
+ }
+ return (0);
+}
+
+/*
+ * A wrapper for the __ham_sort_page function. Implements logging and cursor
+ * adjustments associated with sorting a page outside of recovery/upgrade.
+ * PUBLIC: int __ham_sort_page_cursor __P((DBC *, PAGE *));
+ */
+int
+__ham_sort_page_cursor(dbc, page)
+ DBC *dbc;
+ PAGE *page;
+{
+ DB *dbp;
+ DBT page_dbt;
+ DB_LSN new_lsn;
+ HASH_CURSOR *hcp;
+ int ret;
+
+ dbp = dbc->dbp;
+ hcp = (HASH_CURSOR *)dbc->internal;
+
+ if (DBC_LOGGING(dbc)) {
+ page_dbt.size = dbp->pgsize;
+ page_dbt.data = page;
+ if ((ret = __ham_splitdata_log(dbp, dbc->txn,
+ &new_lsn, 0, SORTPAGE, PGNO(page),
+ &page_dbt, &LSN(page))) != 0)
+ return (ret);
+ } else
+ LSN_NOT_LOGGED(new_lsn);
+ /* Move lsn onto page. */
+ LSN(page) = new_lsn; /* Structure assignment. */
+
+ /*
+ * Invalidate the saved index, it needs to be retrieved
+ * again once the page is sorted.
+ */
+ hcp->seek_found_indx = NDX_INVALID;
+ hcp->seek_found_page = PGNO_INVALID;
+
+ return (__ham_sort_page(dbc, &hcp->split_buf, page));
+}
+
+/*
+ * PUBLIC: int __ham_sort_page __P((DBC *, PAGE **, PAGE *));
+ *
+ * Convert a page from P_HASH_UNSORTED into the sorted format P_HASH.
+ *
+ * All locking and logging is carried out by the caller. A user buffer can
+ * optionally be passed in to save allocating a page-size buffer for sorting.
+ * This allows callers to re-use the buffer pre-allocated for page splits
+ * in the hash cursor. The buffer is optional since no cursor exists when in
+ * the recovery or upgrade code paths.
+ */
+int
+__ham_sort_page(dbc, tmp_buf, page)
+ DBC *dbc;
+ PAGE **tmp_buf;
+ PAGE *page;
+{
+ DB *dbp;
+ PAGE *temp_pagep;
+ db_indx_t i;
+ int ret;
+
+ dbp = dbc->dbp;
+ DB_ASSERT(dbp->env, page->type == P_HASH_UNSORTED);
+
+ ret = 0;
+ if (tmp_buf != NULL)
+ temp_pagep = *tmp_buf;
+ else if ((ret = __os_malloc(dbp->env, dbp->pgsize, &temp_pagep)) != 0)
+ return (ret);
+
+ memcpy(temp_pagep, page, dbp->pgsize);
+
+ /* Re-initialize the page. */
+ P_INIT(page, dbp->pgsize,
+ page->pgno, page->prev_pgno, page->next_pgno, 0, P_HASH);
+
+ for (i = 0; i < NUM_ENT(temp_pagep); i += 2)
+ if ((ret =
+ __ham_copypair(dbc, temp_pagep, i, page, NULL, 0)) != 0)
+ break;
+
+ if (tmp_buf == NULL)
+ __os_free(dbp->env, temp_pagep);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __ham_del_pair __P((DBC *, int, PAGE *));
+ */
+int
+__ham_del_pair(dbc, flags, ppg)
+ DBC *dbc;
+ int flags;
+ PAGE *ppg;
+{
+ DB *dbp;
+ DBT data_dbt, key_dbt;
+ DB_LSN new_lsn, *n_lsn, tmp_lsn;
+ DB_MPOOLFILE *mpf;
+ HASH_CURSOR *hcp;
+ PAGE *n_pagep, *nn_pagep, *p, *p_pagep;
+ db_ham_mode op;
+ db_indx_t ndx;
+ db_pgno_t chg_pgno, pgno, tmp_pgno;
+ u_int32_t data_type, key_type, order;
+ int ret, t_ret;
+ u_int8_t *hk;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ hcp = (HASH_CURSOR *)dbc->internal;
+ n_pagep = p_pagep = nn_pagep = NULL;
+ ndx = hcp->indx;
+
+ if (hcp->page == NULL &&
+ (ret = __memp_fget(mpf, &hcp->pgno, dbc->thread_info, dbc->txn,
+ DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &hcp->page)) != 0)
+ return (ret);
+ p = hcp->page;
+
+ /*
+	 * We optimize for the normal case, which is when neither the key
+	 * nor the data is large.  In this case, we write a single log record
+	 * and do the delete.  If either is large, we call __db_doff
+ * to remove the big item and then update the page to remove the
+ * entry referring to the big item.
+ */
+ if (!LF_ISSET(HAM_DEL_IGNORE_OFFPAGE) &&
+ HPAGE_PTYPE(H_PAIRKEY(dbp, p, ndx)) == H_OFFPAGE) {
+ memcpy(&pgno, HOFFPAGE_PGNO(P_ENTRY(dbp, p, H_KEYINDEX(ndx))),
+ sizeof(db_pgno_t));
+ ret = __db_doff(dbc, pgno);
+ } else
+ ret = 0;
+
+ if (!LF_ISSET(HAM_DEL_IGNORE_OFFPAGE) && ret == 0)
+ switch (HPAGE_PTYPE(H_PAIRDATA(dbp, p, ndx))) {
+ case H_OFFPAGE:
+ memcpy(&pgno,
+ HOFFPAGE_PGNO(P_ENTRY(dbp, p, H_DATAINDEX(ndx))),
+ sizeof(db_pgno_t));
+ ret = __db_doff(dbc, pgno);
+ break;
+ case H_OFFDUP:
+ case H_DUPLICATE:
+ /*
+ * If we delete a pair that is/was a duplicate, then
+ * we had better clear the flag so that we update the
+ * cursor appropriately.
+ */
+ F_CLR(hcp, H_ISDUP);
+ break;
+ default:
+ /* No-op */
+ break;
+ }
+
+ if (ret)
+ return (ret);
+
+ /* Now log the delete off this page. */
+ if (DBC_LOGGING(dbc)) {
+ hk = H_PAIRKEY(dbp, hcp->page, ndx);
+ if ((key_type = HPAGE_PTYPE(hk)) == H_OFFPAGE) {
+ key_dbt.data = hk;
+ key_dbt.size = HOFFPAGE_SIZE;
+ } else {
+ key_dbt.data = HKEYDATA_DATA(hk);
+ key_dbt.size =
+ LEN_HKEY(dbp, hcp->page, dbp->pgsize, ndx);
+ }
+ hk = H_PAIRDATA(dbp, hcp->page, ndx);
+ if ((data_type = HPAGE_PTYPE(hk)) == H_OFFPAGE) {
+ data_dbt.data = hk;
+ data_dbt.size = HOFFPAGE_SIZE;
+ } else if (data_type == H_OFFDUP) {
+ data_dbt.data = hk;
+ data_dbt.size = HOFFDUP_SIZE;
+ } else {
+ data_dbt.data = HKEYDATA_DATA(hk);
+ data_dbt.size =
+ LEN_HDATA(dbp, hcp->page, dbp->pgsize, ndx);
+ }
+
+ if ((ret = __ham_insdel_log(dbp, dbc->txn, &new_lsn, 0,
+ DELPAIR, PGNO(p), (u_int32_t)ndx, &LSN(p),
+ OP_SET(key_type, p), &key_dbt,
+ OP_SET(data_type, p), &data_dbt)) != 0)
+ return (ret);
+ } else
+ LSN_NOT_LOGGED(new_lsn);
+
+ /* Move lsn onto page. */
+ LSN(p) = new_lsn;
+ /* Do the delete. */
+ __ham_dpair(dbp, p, ndx);
+
+ /*
+ * Mark item deleted so that we don't try to return it, and
+ * so that we update the cursor correctly on the next call
+ * to next.
+ */
+ F_SET(hcp, H_DELETED);
+ F_CLR(hcp, H_OK);
+
+ /* Clear any cache streaming information. */
+ hcp->stream_start_pgno = PGNO_INVALID;
+
+ /*
+ * If we are locking, we will not maintain this, because it is
+ * a hot spot.
+ *
+ * XXX
+ * Perhaps we can retain incremental numbers and apply them later.
+ */
+ if (!STD_LOCKING(dbc)) {
+ if ((ret = __ham_dirty_meta(dbc, 0)) != 0)
+ return (ret);
+ --hcp->hdr->nelem;
+ }
+
+ /* The HAM_DEL_NO_CURSOR flag implies HAM_DEL_NO_RECLAIM. */
+ if (LF_ISSET(HAM_DEL_NO_CURSOR))
+ return (0);
+ /*
+ * Update cursors that are on the page where the delete happened.
+ */
+ if ((ret = __hamc_update(dbc, 0, DB_HAM_CURADJ_DEL, 0)) != 0)
+ return (ret);
+
+ /*
+ * If we need to reclaim the page, then check if the page is empty.
+	 * There are three cases.  If it's empty and it's not the first page
+	 * in the bucket (i.e., the bucket page), then we can simply remove
+	 * it.  If it is the first page in the bucket, then we need to copy
+	 * the second page into it and remove the second page.
+	 * If it's the only page in the bucket, we leave it alone.
+ */
+ if (LF_ISSET(HAM_DEL_NO_RECLAIM) ||
+ NUM_ENT(p) != 0 ||
+ (PREV_PGNO(p) == PGNO_INVALID && NEXT_PGNO(p) == PGNO_INVALID)) {
+ if (NUM_ENT(p) == 0)
+ F_SET(hcp, H_CONTRACT);
+ return (0);
+ }
+
+ if (PREV_PGNO(p) == PGNO_INVALID) {
+ /*
+ * First page in chain is empty and we know that there
+ * are more pages in the chain.
+ */
+ if ((ret = __memp_fget(mpf,
+ &NEXT_PGNO(p), dbc->thread_info, dbc->txn,
+ DB_MPOOL_DIRTY, &n_pagep)) != 0)
+ return (ret);
+
+ if (NEXT_PGNO(n_pagep) != PGNO_INVALID &&
+ (ret = __memp_fget(mpf, &NEXT_PGNO(n_pagep),
+ dbc->thread_info, dbc->txn,
+ DB_MPOOL_DIRTY, &nn_pagep)) != 0)
+ goto err;
+
+ if (DBC_LOGGING(dbc)) {
+ key_dbt.data = n_pagep;
+ key_dbt.size = dbp->pgsize;
+ if ((ret = __ham_copypage_log(dbp,
+ dbc->txn, &new_lsn, 0, PGNO(p),
+ &LSN(p), PGNO(n_pagep), &LSN(n_pagep),
+ NEXT_PGNO(n_pagep),
+ nn_pagep == NULL ? NULL : &LSN(nn_pagep),
+ &key_dbt)) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(new_lsn);
+
+ /* Move lsn onto page. */
+ LSN(p) = new_lsn; /* Structure assignment. */
+ LSN(n_pagep) = new_lsn;
+ if (NEXT_PGNO(n_pagep) != PGNO_INVALID)
+ LSN(nn_pagep) = new_lsn;
+
+ if (nn_pagep != NULL) {
+ PREV_PGNO(nn_pagep) = PGNO(p);
+ ret = __memp_fput(mpf,
+ dbc->thread_info, nn_pagep, dbc->priority);
+ nn_pagep = NULL;
+ if (ret != 0)
+ goto err;
+ }
+
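+		/*
+		 * Copy the second page over the now-empty bucket page,
+		 * keeping the bucket page's number and LSN.
+		 */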
+ tmp_pgno = PGNO(p);
+ tmp_lsn = LSN(p);
+ memcpy(p, n_pagep, dbp->pgsize);
+ PGNO(p) = tmp_pgno;
+ LSN(p) = tmp_lsn;
+ PREV_PGNO(p) = PGNO_INVALID;
+
+ /*
+ * Update cursors to reflect the fact that records
+ * on the second page have moved to the first page.
+ */
+ if ((ret = __hamc_delpg(dbc, PGNO(n_pagep),
+ PGNO(p), 0, DB_HAM_DELFIRSTPG, &order)) != 0)
+ goto err;
+
+ /*
+ * Update the cursor to reflect its new position.
+ */
+ hcp->indx = 0;
+ hcp->pgno = PGNO(p);
+ hcp->order += order;
+
+ if ((ret = __db_free(dbc, n_pagep, 0)) != 0) {
+ n_pagep = NULL;
+ goto err;
+ }
+ } else {
+ if ((p_pagep = ppg) == NULL && (ret = __memp_fget(mpf,
+ &PREV_PGNO(p), dbc->thread_info, dbc->txn,
+ DB_MPOOL_DIRTY, &p_pagep)) != 0)
+ goto err;
+
+ if (NEXT_PGNO(p) != PGNO_INVALID) {
+ if ((ret = __memp_fget(mpf, &NEXT_PGNO(p),
+ dbc->thread_info, dbc->txn,
+ DB_MPOOL_DIRTY, &n_pagep)) != 0)
+ goto err;
+ n_lsn = &LSN(n_pagep);
+ } else {
+ n_pagep = NULL;
+ n_lsn = NULL;
+ }
+
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __ham_newpage_log(dbp, dbc->txn,
+ &new_lsn, 0, DELOVFL, PREV_PGNO(p), &LSN(p_pagep),
+ PGNO(p), &LSN(p), NEXT_PGNO(p), n_lsn)) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(new_lsn);
+
+ /* Move lsn onto page. */
+ LSN(p_pagep) = new_lsn; /* Structure assignment. */
+ if (n_pagep)
+ LSN(n_pagep) = new_lsn;
+ LSN(p) = new_lsn;
+
+ NEXT_PGNO(p_pagep) = NEXT_PGNO(p);
+ if (n_pagep != NULL)
+ PREV_PGNO(n_pagep) = PGNO(p_pagep);
+
+ if (NEXT_PGNO(p) == PGNO_INVALID) {
+ /*
+ * There is no next page; put the cursor on the
+ * previous page as if we'd deleted the last item
+ * on that page, with index after the last valid
+ * entry.
+ *
+ * The deleted flag was set up above.
+ */
+ hcp->pgno = PGNO(p_pagep);
+ hcp->indx = NUM_ENT(p_pagep);
+ op = DB_HAM_DELLASTPG;
+ } else {
+ /*
+ * There is a next page, so put the cursor at
+ * the beginning of it.
+ */
+ hcp->pgno = NEXT_PGNO(p);
+ hcp->indx = 0;
+ op = DB_HAM_DELMIDPG;
+ }
+
+ /*
+ * Since we are about to delete the cursor page and we have
+ * just moved the cursor, we need to make sure that the
+ * old page pointer isn't left hanging around in the cursor.
+ */
+ hcp->page = NULL;
+ chg_pgno = PGNO(p);
+ ret = __db_free(dbc, p, 0);
+ if (ppg == NULL && (t_ret = __memp_fput(mpf, dbc->thread_info,
+ p_pagep, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (n_pagep != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, n_pagep, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ return (ret);
+ if ((ret = __hamc_delpg(dbc,
+ chg_pgno, hcp->pgno, hcp->indx, op, &order)) != 0)
+ return (ret);
+ hcp->order += order;
+ }
+ return (ret);
+
+err: /* Clean up any pages. */
+ if (n_pagep != NULL)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, n_pagep, dbc->priority);
+ if (nn_pagep != NULL)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, nn_pagep, dbc->priority);
+ if (ppg == NULL && p_pagep != NULL)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, p_pagep, dbc->priority);
+ return (ret);
+}
+
+/*
+ * __ham_replpair --
+ * Given the key data indicated by the cursor, replace part/all of it
+ * according to the fields in the dbt.
+ *
+ * PUBLIC: int __ham_replpair __P((DBC *, DBT *, u_int32_t));
+ */
+int
+__ham_replpair(dbc, dbt, newtype)
+ DBC *dbc;
+ DBT *dbt;
+ u_int32_t newtype;
+{
+ DB *dbp;
+ DBC **carray, *dbc_n;
+ DBT old_dbt, tdata, tmp, *new_dbt;
+ DB_LSN new_lsn;
+ ENV *env;
+ HASH_CURSOR *hcp, *cp;
+ db_indx_t orig_indx;
+ db_pgno_t off_pgno, orig_pgno;
+ u_int32_t change;
+ u_int32_t dup_flag, len, memsize, newlen, oldtype, type;
+ char tmp_ch;
+ int beyond_eor, is_big, is_plus, ret, i, found, t_ret;
+ u_int8_t *beg, *dest, *end, *hk, *src;
+ void *memp;
+
+ /*
+ * Most items that were already offpage (ISBIG) were handled before
+ * we get in here. So, we need only handle cases where the old
+	 * key is on a regular page.  That leaves us 7 cases:
+ * 1. Original data onpage; new data is smaller
+ * 2. Original data onpage; new data is the same size
+ * 3. Original data onpage; new data is bigger, but not ISBIG,
+ * fits on page
+ * 4. Original data onpage; new data is bigger, but not ISBIG,
+ * does not fit on page
+	 * 5. Original data onpage; new data is an off-page item.
+ * 6. Original data was offpage; new item is smaller.
+ * 7. Original data was offpage; new item is supplied as a partial.
+ *
+ * Cases 1-3 are essentially the same (and should be the common case).
+	 * We handle 4-6 as a delete and an add.  7 is generally a delete
+	 * and an add as well, unless it is an append, in which case we
+	 * extend the offpage item and update the HOFFPAGE entry on the
+	 * current page to reflect the new size via a delete/add.
+ */
+ dbp = dbc->dbp;
+ env = dbp->env;
+ hcp = (HASH_CURSOR *)dbc->internal;
+ carray = NULL;
+ dbc_n = memp = NULL;
+ found = 0;
+ new_dbt = NULL;
+ off_pgno = PGNO_INVALID;
+ type = 0;
+
+ /*
+ * We need to compute the number of bytes that we are adding or
+ * removing from the entry. Normally, we can simply subtract
+ * the number of bytes we are replacing (dbt->dlen) from the
+ * number of bytes we are inserting (dbt->size). However, if
+ * we are doing a partial put off the end of a record, then this
+ * formula doesn't work, because we are essentially adding
+ * new bytes.
+ */
+ if (dbt->size > dbt->dlen) {
+ change = dbt->size - dbt->dlen;
+ is_plus = 1;
+ } else {
+ change = dbt->dlen - dbt->size;
+ is_plus = 0;
+ }
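+	/*
+	 * For example, replacing dlen == 2 bytes with size == 5 bytes of
+	 * new data grows the item: change == 3 and is_plus == 1.
+	 */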
+
+ hk = H_PAIRDATA(dbp, hcp->page, hcp->indx);
+ oldtype = HPAGE_PTYPE(hk);
+ is_big = oldtype == H_OFFPAGE;
+
+ if (is_big) {
+ memcpy(&len, HOFFPAGE_TLEN(hk), sizeof(u_int32_t));
+ memcpy(&off_pgno, HOFFPAGE_PGNO(hk), sizeof(db_pgno_t));
+ } else
+ len = LEN_HKEYDATA(dbp, hcp->page,
+ dbp->pgsize, H_DATAINDEX(hcp->indx));
+
+ beyond_eor = dbt->doff + dbt->dlen > len;
+ if (beyond_eor) {
+ /*
+ * The change is beyond the end of record. If change
+ * is a positive number, we can simply add the extension
+ * to it. However, if change is negative, then we need
+ * to figure out if the extension is larger than the
+ * negative change.
+ */
+ if (is_plus)
+ change += dbt->doff + dbt->dlen - len;
+ else if (dbt->doff + dbt->dlen - len > change) {
+ /* Extension bigger than change */
+ is_plus = 1;
+ change = (dbt->doff + dbt->dlen - len) - change;
+ } else /* Extension is smaller than change. */
+ change -= (dbt->doff + dbt->dlen - len);
+ }
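+	/*
+	 * For example: len == 10, doff == 8, dlen == 4, size == 1.  Only
+	 * 2 of the 4 bytes to be replaced really exist, so the record
+	 * shrinks from 10 bytes to doff + size == 9: change ends up as 1
+	 * with is_plus == 0.
+	 */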
+
+ newlen = (is_plus ? len + change : len - change);
+ if (is_big || beyond_eor || ISBIG(hcp, newlen) ||
+ (is_plus && change > P_FREESPACE(dbp, hcp->page))) {
+ /*
+ * If we are in cases 4 or 5 then is_plus will be true.
+ * If we don't have a transaction then we cannot roll back,
+ * make sure there is enough room for the new page.
+ */
+ if (is_plus && dbc->txn == NULL &&
+ dbp->mpf->mfp->maxpgno != 0 &&
+ dbp->mpf->mfp->maxpgno == dbp->mpf->mfp->last_pgno)
+ return (__db_space_err(dbp));
+ /*
+ * Cases 4-6 -- two subcases.
+ * A. This is not really a partial operation, but an overwrite.
+ * Simple del and add works.
+ * B. This is a partial and we need to construct the data that
+ * we are really inserting (yuck).
+ * In both cases, we need to grab the key off the page (in
+ * some cases we could do this outside of this routine; for
+ * cleanliness we do it here. If you happen to be on a big
+ * key, this could be a performance hit).
+ */
+ memset(&tmp, 0, sizeof(tmp));
+ if ((ret = __db_ret(dbc, hcp->page, H_KEYINDEX(hcp->indx),
+ &tmp, &dbc->my_rkey.data, &dbc->my_rkey.ulen)) != 0)
+ return (ret);
+
+ /* Preserve duplicate info. */
+ dup_flag = F_ISSET(hcp, H_ISDUP);
+ /* Streaming insert. */
+ if (is_big && !dup_flag && !DB_IS_PRIMARY(dbp) &&
+ F_ISSET(dbt, DB_DBT_PARTIAL) && dbt->doff == len) {
+ /*
+ * If the cursor has not already cached the last page
+ * in the offpage chain, we need to walk the chain to
+ * be sure that the page has been read.
+ */
+ if (hcp->stream_start_pgno != off_pgno ||
+ hcp->stream_off > dbt->doff || dbt->doff >
+ hcp->stream_off + P_MAXSPACE(dbp, dbp->pgsize)) {
+ memset(&tdata, 0, sizeof(DBT));
+ tdata.doff = dbt->doff - 1;
+ /*
+ * Set the length to 1, to force __db_goff
+ * to do the traversal.
+ */
+ tdata.dlen = tdata.ulen = 1;
+ tdata.data = &tmp_ch;
+ tdata.flags = DB_DBT_PARTIAL | DB_DBT_USERMEM;
+
+ /*
+ * Read to the last page. It will be cached
+ * in the cursor.
+ */
+ if ((ret = __db_goff(dbc, &tdata, len,
+ off_pgno, NULL, NULL)) != 0)
+ return (ret);
+ }
+ /*
+ * Since this is an append, dlen is irrelevant (there
+ * are no bytes to overwrite). We need the caller's
+ * DBT size to end up with the total size of the item.
+ * From now on, use dlen as the length of the user's
+ * data that we are going to append.
+ * Don't futz with the caller's DBT any more than we
+ * have to in order to send back the size.
+ */
+ tdata = *dbt;
+ tdata.dlen = dbt->size;
+ tdata.size = newlen;
+ new_dbt = &tdata;
+ F_SET(new_dbt, DB_DBT_STREAMING);
+ type = H_KEYDATA;
+ }
+
+ /*
+ * In cases 4-6, a delete and insert works, but we need to
+ * track and update any cursors pointing to the item being
+ * moved.
+ */
+ orig_pgno = PGNO(hcp->page);
+ orig_indx = hcp->indx;
+ if ((ret = __ham_get_clist(dbp,
+ orig_pgno, orig_indx, &carray)) != 0)
+ goto err;
+
+ if (dbt->doff == 0 && dbt->dlen == len) {
+ type = (dup_flag ? H_DUPLICATE : H_KEYDATA);
+ new_dbt = dbt;
+ } else if (!F_ISSET(dbt, DB_DBT_STREAMING)) { /* Case B */
+ type = HPAGE_PTYPE(hk) != H_OFFPAGE ?
+ HPAGE_PTYPE(hk) : H_KEYDATA;
+ memset(&tdata, 0, sizeof(tdata));
+ memsize = 0;
+ if ((ret = __db_ret(dbc, hcp->page,
+ H_DATAINDEX(hcp->indx), &tdata,
+ &memp, &memsize)) != 0)
+ goto err;
+
+ /* Now shift old data around to make room for new. */
+ if (is_plus) {
+ if ((ret = __os_realloc(env,
+ tdata.size + change, &tdata.data)) != 0)
+					goto err;
+ memp = tdata.data;
+ memsize = tdata.size + change;
+ memset((u_int8_t *)tdata.data + tdata.size,
+ 0, change);
+ }
+ end = (u_int8_t *)tdata.data + tdata.size;
+
+ src = (u_int8_t *)tdata.data + dbt->doff + dbt->dlen;
+ if (src < end && tdata.size > dbt->doff + dbt->dlen) {
+ len = tdata.size - (dbt->doff + dbt->dlen);
+ if (is_plus)
+ dest = src + change;
+ else
+ dest = src - change;
+ memmove(dest, src, len);
+ }
+ memcpy((u_int8_t *)tdata.data + dbt->doff,
+ dbt->data, dbt->size);
+ if (is_plus)
+ tdata.size += change;
+ else
+ tdata.size -= change;
+ new_dbt = &tdata;
+ }
+ if ((ret = __ham_del_pair(dbc, HAM_DEL_NO_CURSOR |
+ (F_ISSET(dbt, DB_DBT_STREAMING) ? HAM_DEL_IGNORE_OFFPAGE :
+ 0), NULL)) != 0)
+ goto err;
+ /*
+ * Save the state of the cursor after the delete, so that we
+ * can adjust any cursors impacted by the delete. Don't just
+ * update the cursors now, to avoid ambiguity in reversing the
+ * adjustments during abort.
+ */
+ if ((ret = __dbc_dup(dbc, &dbc_n, DB_POSITION)) != 0)
+ goto err;
+ if ((ret = __ham_add_el(dbc, &tmp, new_dbt, type)) != 0)
+ goto err;
+ F_SET(hcp, dup_flag);
+
+ /*
+ * If the delete/insert pair caused the item to be moved
+ * to another location (which is possible for duplicate sets
+ * that are moved onto another page in the bucket), then update
+ * any impacted cursors.
+ */
+ if (((HASH_CURSOR*)dbc_n->internal)->pgno != hcp->pgno ||
+ ((HASH_CURSOR*)dbc_n->internal)->indx != hcp->indx) {
+ /*
+ * Set any cursors pointing to items in the moved
+ * duplicate set to the destination location and reset
+ * the deleted flag. This can't be done earlier, since
+ * the insert location is not computed until the actual
+ * __ham_add_el call is made.
+ */
+ if (carray != NULL) {
+ for (i = 0; carray[i] != NULL; i++) {
+ cp = (HASH_CURSOR*)carray[i]->internal;
+ cp->pgno = hcp->pgno;
+ cp->indx = hcp->indx;
+ F_CLR(cp, H_DELETED);
+ found = 1;
+ }
+ /*
+ * Only log the update once, since the recovery
+ * code iterates through all open cursors and
+ * applies the change to all matching cursors.
+ */
+ if (found && DBC_LOGGING(dbc) &&
+ IS_SUBTRANSACTION(dbc->txn)) {
+ if ((ret =
+ __ham_chgpg_log(dbp,
+ dbc->txn, &new_lsn, 0,
+ DB_HAM_CHGPG, orig_pgno, hcp->pgno,
+ orig_indx, hcp->indx)) != 0)
+ goto err;
+ }
+ }
+ /*
+ * Update any cursors impacted by the delete. Do this
+ * after chgpg log so that recovery does not re-bump
+ * cursors pointing to the deleted item.
+ */
+ ret = __hamc_update(dbc_n, 0, DB_HAM_CURADJ_DEL, 0);
+ }
+
+err: if (dbc_n != NULL && (t_ret = __dbc_close(dbc_n)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ if (carray != NULL)
+ __os_free(env, carray);
+ if (memp != NULL)
+ __os_free(env, memp);
+ return (ret);
+ }
+
+ /*
+ * Set up pointer into existing data. Do it before the log
+ * message so we can use it inside of the log setup.
+ */
+ beg = HKEYDATA_DATA(H_PAIRDATA(dbp, hcp->page, hcp->indx));
+ beg += dbt->doff;
+
+ /*
+ * If we are going to have to move bytes at all, figure out
+ * all the parameters here. Then log the call before moving
+ * anything around.
+ */
+ if (DBC_LOGGING(dbc)) {
+ old_dbt.data = beg;
+ old_dbt.size = dbt->dlen;
+ if ((ret = __ham_replace_log(dbp, dbc->txn, &new_lsn,
+ 0, PGNO(hcp->page),
+ (u_int32_t)H_DATAINDEX(hcp->indx), &LSN(hcp->page),
+ (int32_t)dbt->doff, OP_SET(oldtype, hcp->page),
+ &old_dbt, OP_SET(newtype, hcp->page), dbt)) != 0)
+ return (ret);
+ } else
+ LSN_NOT_LOGGED(new_lsn);
+
+ LSN(hcp->page) = new_lsn; /* Structure assignment. */
+
+ __ham_onpage_replace(dbp, hcp->page, (u_int32_t)H_DATAINDEX(hcp->indx),
+ (int32_t)dbt->doff, change, is_plus, dbt);
+
+ return (0);
+}
+
+/*
+ * __ham_onpage_replace --
+ *	Replace data on a page with new data, possibly growing or shrinking
+ * what's there.  This is called on two different occasions.  On one (from
+ * replpair) we are interested in changing only the data.  On the other (from
+ * recovery) we are replacing the entire data (header and all) with a new
+ * element.  In the latter case, the off argument is negative.
+ * pagep: the page that we're changing.
+ * ndx: page index of the element that is growing/shrinking.
+ * off: offset at which we are beginning the replacement.
+ * change: the number of bytes that the element is growing/shrinking.
+ * is_plus: non-zero if the element is growing, zero if it is shrinking.
+ * dbt: the new data that gets written at the replacement offset.
+ *
+ * PUBLIC: void __ham_onpage_replace __P((DB *, PAGE *, u_int32_t,
+ * PUBLIC: int32_t, u_int32_t, int, DBT *));
+ */
+void
+__ham_onpage_replace(dbp, pagep, ndx, off, change, is_plus, dbt)
+ DB *dbp;
+ PAGE *pagep;
+ u_int32_t ndx;
+ int32_t off;
+ u_int32_t change;
+ int is_plus;
+ DBT *dbt;
+{
+ db_indx_t i, *inp;
+ int32_t len;
+ size_t pgsize;
+ u_int8_t *src, *dest;
+ int zero_me;
+
+ pgsize = dbp->pgsize;
+ inp = P_INP(dbp, pagep);
+ if (change != 0) {
+ zero_me = 0;
+ src = (u_int8_t *)(pagep) + HOFFSET(pagep);
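+		/*
+		 * len measures the bytes between the free-space boundary
+		 * (HOFFSET) and the point where the replacement region
+		 * begins; that whole span slides by change bytes.
+		 */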
+ if (off < 0)
+ len = inp[ndx] - HOFFSET(pagep);
+ else if ((u_int32_t)off >=
+ LEN_HKEYDATA(dbp, pagep, pgsize, ndx)) {
+ len = (int32_t)(HKEYDATA_DATA(P_ENTRY(dbp, pagep, ndx))
+ + LEN_HKEYDATA(dbp, pagep, pgsize, ndx) - src);
+ zero_me = 1;
+ } else
+ len = (int32_t)(
+ (HKEYDATA_DATA(P_ENTRY(dbp, pagep, ndx)) + off) -
+ src);
+ if (is_plus)
+ dest = src - change;
+ else
+ dest = src + change;
+ memmove(dest, src, (size_t)len);
+ if (zero_me)
+ memset(dest + len, 0, change);
+
+ /* Now update the indices. */
+ for (i = ndx; i < NUM_ENT(pagep); i++) {
+ if (is_plus)
+ inp[i] -= change;
+ else
+ inp[i] += change;
+ }
+ if (is_plus)
+ HOFFSET(pagep) -= change;
+ else
+ HOFFSET(pagep) += change;
+ }
+ if (off >= 0)
+ memcpy(HKEYDATA_DATA(P_ENTRY(dbp, pagep, ndx)) + off,
+ dbt->data, dbt->size);
+ else
+ memcpy(P_ENTRY(dbp, pagep, ndx), dbt->data, dbt->size);
+}
+
+/*
+ * __ham_merge_pages --
+ * Merge pages from one bucket to another.
+ * PUBLIC: int __ham_merge_pages __P((DBC *,
+ * PUBLIC: u_int32_t, u_int32_t, DB_COMPACT *));
+ */
+int
+__ham_merge_pages(dbc, tobucket, frombucket, c_data)
+ DBC *dbc;
+ u_int32_t tobucket, frombucket;
+ DB_COMPACT *c_data;
+{
+ DB *dbp;
+ DBC **carray;
+ DB_LOCK tlock, firstlock;
+ DB_LSN from_lsn;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ HASH_CURSOR *hcp, *cp;
+ PAGE *to_pagep, *first_pagep,
+ *from_pagep, *last_pagep, *next_pagep, *prev_pagep;
+ db_pgno_t to_pgno, first_pgno, from_pgno;
+ u_int32_t len;
+ db_indx_t dest_indx, n, num_ent;
+ int check_trunc, found, i, ret;
+
+ dbp = dbc->dbp;
+ carray = NULL;
+ env = dbp->env;
+ mpf = dbp->mpf;
+ hcp = (HASH_CURSOR *)dbc->internal;
+ hcp->pgno = PGNO_INVALID;
+ to_pagep = first_pagep = NULL;
+ from_pagep = last_pagep = next_pagep = prev_pagep = NULL;
+ from_pgno = PGNO_INVALID;
+ LOCK_INIT(tlock);
+ LOCK_INIT(firstlock);
+
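+	/*
+	 * When called from DB->compact, compact_truncate is the page
+	 * number above which we try to move records to lower-numbered
+	 * pages, so that the end of the file can be truncated.
+	 */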
+ check_trunc =
+ c_data == NULL ? 0 : c_data->compact_truncate != PGNO_INVALID;
+ to_pgno = BUCKET_TO_PAGE(hcp, tobucket);
+ if ((ret = __db_lget(dbc,
+ 0, to_pgno, DB_LOCK_WRITE, 0, &tlock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &to_pgno, dbc->thread_info, dbc->txn,
+ DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &to_pagep)) != 0)
+ goto err;
+
+ /* Sort any unsorted pages before adding to the page. */
+ if (to_pagep->type == P_HASH_UNSORTED)
+ if ((ret = __ham_sort_page_cursor(dbc, to_pagep)) != 0)
+			goto err;
+
+ /* Fetch the first page of the bucket we are getting rid of. */
+ from_pgno = BUCKET_TO_PAGE(hcp, frombucket);
+ if ((ret = __db_lget(dbc,
+ 0, from_pgno, DB_LOCK_WRITE, 0, &firstlock)) != 0)
+ goto err;
+next_page:
+ /*
+ * from_pagep is the starting point in the bucket at which records
+ * are moved to the new bucket.
+ */
+ if (from_pagep == NULL &&
+ (ret = __memp_fget(mpf, &from_pgno, dbc->thread_info,
+ dbc->txn, DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &from_pagep)) != 0)
+ goto err;
+ if ((ret = __ham_get_clist(dbp, from_pgno, NDX_INVALID, &carray)) != 0)
+ goto err;
+
+ hcp->indx = 0;
+ hcp->pgno = from_pgno;
+ hcp->page = from_pagep;
+ num_ent = NUM_ENT(from_pagep);
+ for (n = 0; n < num_ent; n += 2) {
+ /*
+ * Figure out how many bytes we need on the from
+ * page to store the key/data pair.
+ */
+ len = LEN_HITEM(dbp, from_pagep,
+ dbp->pgsize, H_DATAINDEX(hcp->indx)) +
+ LEN_HITEM(dbp, from_pagep,
+ dbp->pgsize, H_KEYINDEX(hcp->indx)) +
+ 2 * sizeof(db_indx_t);
+
+ /*
+ * Find a page that will fit this data. We don't go back
+ * to a page, so we may leave some space if there is a big
+ * variation in record size.
+ */
+ while (P_FREESPACE(dbp, to_pagep) < len) {
+ to_pgno = NEXT_PGNO(to_pagep);
+ if (to_pgno == PGNO_INVALID) {
+ next_pagep = to_pagep;
+ if ((ret =
+ __ham_add_ovflpage(dbc, &next_pagep)) != 0)
+ goto err;
+ if ((ret = __memp_fput(mpf, dbc->thread_info,
+ to_pagep, dbc->priority)) != 0)
+ goto err;
+ to_pagep = next_pagep;
+ next_pagep = NULL;
+ if (c_data != NULL &&
+ c_data->compact_pages_free > 0)
+ c_data->compact_pages_free--;
+ to_pgno = PGNO(to_pagep);
+ } else {
+ if ((ret = __memp_fput(mpf, dbc->thread_info,
+ to_pagep, dbc->priority)) != 0)
+ goto err;
+ to_pagep = NULL;
+ if ((ret = __memp_fget(mpf,
+ &to_pgno, dbc->thread_info, dbc->txn,
+ DB_MPOOL_CREATE | DB_MPOOL_DIRTY,
+ &to_pagep)) != 0)
+ goto err;
+
+ /*
+ * Sort any unsorted pages before adding
+ * to the page.
+ */
+ if (to_pagep->type == P_HASH_UNSORTED)
+ if ((ret = __ham_sort_page_cursor(dbc,
+ to_pagep)) != 0)
+ goto err;
+ }
+ }
+ dest_indx = NDX_INVALID;
+ if ((ret = __ham_copypair(dbc,
+ from_pagep, hcp->indx, to_pagep, &dest_indx, 1)) != 0)
+ goto err;
+
+ /* Update any cursors pointing at the moved item. */
+ if (carray != NULL) {
+ found = 0;
+ for (i = 0; carray[i] != NULL; i++) {
+ cp =
+ (HASH_CURSOR *)carray[i]->internal;
+ if (cp->pgno == from_pgno &&
+ cp->indx == n) {
+ cp->pgno = PGNO(to_pagep);
+ cp->indx = dest_indx;
+ cp->bucket = tobucket;
+ found = 1;
+ }
+ }
+ /*
+ * Only log the update once, since the recovery
+ * code iterates through all open cursors and
+ * applies the change to all matching cursors.
+ */
+ if (found && DBC_LOGGING(dbc) &&
+ IS_SUBTRANSACTION(dbc->txn)) {
+ if ((ret =
+ __ham_chgpg_log(dbp,
+ dbc->txn, &from_lsn, 0,
+ DB_HAM_SPLIT, from_pgno,
+ PGNO(to_pagep), n, dest_indx)) != 0)
+ goto err;
+ }
+ }
+ /*
+ * If this is the head of the bucket, delete the record.
+ * Otherwise we will just free the page after the loop.
+ */
+ if (PREV_PGNO(from_pagep) == PGNO_INVALID) {
+ if ((ret = __ham_del_pair(dbc,
+ HAM_DEL_IGNORE_OFFPAGE | HAM_DEL_NO_CURSOR,
+ from_pagep)) != 0)
+ goto err;
+ if (!STD_LOCKING(dbc)) {
+ if ((ret = __ham_dirty_meta(dbc, 0)) != 0)
+					goto err;
+ ++hcp->hdr->nelem;
+ }
+ } else
+ hcp->indx += 2;
+ }
+ /*
+ * If there are more pages in the bucket then we need to process them.
+ * First we may remove a page that is empty. If there is a next
+ * page then save the previous one for relinking.
+ */
+ from_pgno = NEXT_PGNO(from_pagep);
+ if (PREV_PGNO(from_pagep) != PGNO_INVALID) {
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __db_relink_log(dbp, dbc->txn,
+ &LSN(prev_pagep), 0, PGNO(from_pagep),
+ PGNO_INVALID, PGNO(prev_pagep),
+ &LSN(prev_pagep), PGNO_INVALID, NULL)) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(prev_pagep));
+
+ NEXT_PGNO(prev_pagep) = PGNO_INVALID;
+
+ if ((ret = __db_free(dbc, from_pagep, 0)) != 0) {
+ from_pagep = NULL;
+ goto err;
+ }
+ if (c_data != NULL)
+ c_data->compact_pages_free++;
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, prev_pagep, dbc->priority)) != 0)
+ goto err;
+ prev_pagep = NULL;
+ } else if (from_pgno != PGNO_INVALID)
+ prev_pagep = from_pagep;
+ else if ((ret = __memp_fput(mpf,
+ dbc->thread_info, from_pagep, dbc->priority)) != 0)
+ goto err;
+
+ from_pagep = NULL;
+ hcp->page = NULL;
+ if (carray != NULL)
+ __os_free(env, carray);
+ carray = NULL;
+
+ /*
+ * The head of the bucket has been copied. Try to figure out
+ * if we should just relink the following pages or try to merge
+ * them into existing pages. This is quick and dirty: if it
+ * looks like the data will fit on the current "to" page then
+ * merge it, otherwise just do the linking.
+ * If this was called from DB->compact it will be better to copy
+ * the data to lower numbered pages.
+ */
+ if (check_trunc && from_pgno > c_data->compact_truncate)
+ goto next_page;
+
+ /*
+ * first_pgno will be the first page of a list that gets
+ * relinked to the new bucket. last_pagep will point at the
+ * last page of the linked list.
+ */
+ first_pgno = from_pgno;
+ last_pagep = NULL;
+ while (from_pgno != PGNO_INVALID) {
+ if ((ret = __memp_fget(mpf,
+ &from_pgno, dbc->thread_info, dbc->txn,
+ DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &from_pagep)) != 0)
+ goto err;
+ if (P_FREESPACE(dbp, to_pagep) >
+ (dbp->pgsize - HOFFSET(from_pagep)) +
+ (NUM_ENT(from_pagep) * sizeof(db_indx_t)))
+ break;
+ if (check_trunc && from_pgno > c_data->compact_truncate)
+ break;
+ from_pgno = NEXT_PGNO(from_pagep);
+ if (last_pagep != NULL && last_pagep != first_pagep &&
+ (ret = __memp_fput(mpf,
+ dbc->thread_info, last_pagep, dbc->priority)) != 0)
+ goto err;
+ last_pagep = from_pagep;
+ if (first_pagep == NULL)
+ first_pagep = from_pagep;
+ from_pagep = NULL;
+ }
+
+ /* Link the chain of "full" pages into the "to" bucket. */
+ if (first_pgno != PGNO_INVALID && first_pgno != from_pgno) {
+ DB_ASSERT(dbp->env, first_pagep != NULL);
+ next_pagep = NULL;
+ if (NEXT_PGNO(to_pagep) != PGNO_INVALID && (ret =
+ __memp_fget(mpf, &NEXT_PGNO(to_pagep), dbc->thread_info,
+ dbc->txn, DB_MPOOL_DIRTY, &next_pagep)) != 0)
+ goto err;
+
+ if (last_pagep == NULL)
+ last_pagep = first_pagep;
+ DB_ASSERT(dbp->env, last_pagep != NULL);
+ /*
+ * At the point we have:
+ * to_pagep -- the page that we are linking to.
+ * first_pagep -- the page that is first in the list.
+ * last_pagep -- the page that is the last in the list.
+ * prev_pagep -- the page that points at first_pagep.
+ * next_pagep -- the next page after the list.
+ */
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __db_relink_log(dbp, dbc->txn,
+ &LSN(to_pagep), 0, NEXT_PGNO(to_pagep),
+ first_pgno, to_pgno, &LSN(to_pagep),
+ PGNO_INVALID, NULL)) != 0)
+ goto err;
+ if ((ret = __db_relink_log(dbp, dbc->txn,
+ &LSN(first_pagep), 0, PREV_PGNO(first_pagep),
+ to_pgno, PGNO_INVALID, NULL, first_pgno,
+ &LSN(first_pagep))) != 0)
+ goto err;
+ if (next_pagep != NULL) {
+ if ((ret = __db_relink_log(dbp, dbc->txn,
+ &LSN(next_pagep), 0, PREV_PGNO(next_pagep),
+ PGNO(last_pagep), PGNO_INVALID, NULL,
+ PGNO(next_pagep), &LSN(next_pagep))) != 0)
+ goto err;
+ if ((ret = __db_relink_log(dbp, dbc->txn,
+ &LSN(last_pagep), 0, NEXT_PGNO(last_pagep),
+ PGNO(next_pagep), PGNO(last_pagep),
+ &LSN(last_pagep), PGNO_INVALID, NULL)) != 0)
+ goto err;
+ } else if (NEXT_PGNO(last_pagep) != PGNO_INVALID &&
+ (ret = __db_relink_log(dbp, dbc->txn,
+ &LSN(last_pagep), 0, NEXT_PGNO(last_pagep),
+ PGNO_INVALID, PGNO(last_pagep),
+ &LSN(last_pagep), PGNO_INVALID, NULL)) != 0)
+ goto err;
+ if (prev_pagep != NULL &&
+ (ret = __db_relink_log(dbp, dbc->txn,
+ &LSN(prev_pagep), 0, NEXT_PGNO(prev_pagep),
+ NEXT_PGNO(last_pagep), PGNO(prev_pagep),
+ &LSN(prev_pagep), PGNO_INVALID, NULL)) != 0)
+ goto err;
+ } else {
+ LSN_NOT_LOGGED(LSN(to_pagep));
+ LSN_NOT_LOGGED(LSN(first_pagep));
+ LSN_NOT_LOGGED(LSN(last_pagep));
+ if (next_pagep != NULL)
+				LSN_NOT_LOGGED(LSN(next_pagep));
+ }
+ if (prev_pagep != NULL)
+ NEXT_PGNO(prev_pagep) = NEXT_PGNO(last_pagep);
+ NEXT_PGNO(last_pagep) = NEXT_PGNO(to_pagep);
+ NEXT_PGNO(to_pagep) = first_pgno;
+ PREV_PGNO(first_pagep) = to_pgno;
+ if (next_pagep != NULL) {
+ PREV_PGNO(next_pagep) = PGNO(last_pagep);
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, next_pagep, dbc->priority)) != 0)
+ goto err;
+ next_pagep = NULL;
+ }
+ if (last_pagep != first_pagep && (ret = __memp_fput(mpf,
+ dbc->thread_info, last_pagep, dbc->priority)) != 0)
+ goto err;
+ last_pagep = NULL;
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, first_pagep, dbc->priority)) != 0)
+ goto err;
+ first_pagep = NULL;
+ } else if (last_pagep != NULL && (ret = __memp_fput(mpf,
+ dbc->thread_info, last_pagep, dbc->priority)) != 0)
+ goto err;
+
+ if (from_pagep == NULL) {
+ from_pagep = first_pagep;
+ first_pagep = NULL;
+ }
+ if (from_pgno != PGNO_INVALID)
+ goto next_page;
+
+ if (prev_pagep != NULL && (ret = __memp_fput(mpf,
+ dbc->thread_info, prev_pagep, dbc->priority)) != 0)
+ goto err;
+ ret = __memp_fput(mpf, dbc->thread_info, to_pagep, dbc->priority);
+ return (ret);
+
+err: if (last_pagep != NULL && last_pagep != first_pagep)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, last_pagep, dbc->priority);
+ if (first_pagep != NULL && first_pagep != from_pagep)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, first_pagep, dbc->priority);
+ if (next_pagep != NULL)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, next_pagep, dbc->priority);
+ if (from_pagep != NULL)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, from_pagep, dbc->priority);
+ if (to_pagep != NULL)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, to_pagep, dbc->priority);
+ if (prev_pagep != NULL)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, prev_pagep, dbc->priority);
+ hcp->page = NULL;
+ (void)__TLPUT(dbc, tlock);
+ (void)__TLPUT(dbc, firstlock);
+ if (carray != NULL)
+ __os_free(env, carray);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __ham_split_page __P((DBC *, u_int32_t, u_int32_t));
+ */
+int
+__ham_split_page(dbc, obucket, nbucket)
+ DBC *dbc;
+ u_int32_t obucket, nbucket;
+{
+ DB *dbp;
+ DBC **carray, *tmp_dbc;
+ DBT key, page_dbt;
+ DB_LOCK block;
+ DB_LSN new_lsn;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ HASH_CURSOR *hcp, *cp;
+ PAGE **pp, *old_pagep, *temp_pagep, *new_pagep, *next_pagep;
+ db_indx_t n, dest_indx;
+ db_pgno_t bucket_pgno, npgno, next_pgno;
+ u_int32_t big_len, len;
+ int found, i, ret, t_ret;
+ void *big_buf;
+
+ dbp = dbc->dbp;
+ carray = NULL;
+ env = dbp->env;
+ mpf = dbp->mpf;
+ hcp = (HASH_CURSOR *)dbc->internal;
+ temp_pagep = old_pagep = new_pagep = NULL;
+ npgno = PGNO_INVALID;
+ LOCK_INIT(block);
+
+ bucket_pgno = BUCKET_TO_PAGE(hcp, obucket);
+ if ((ret = __db_lget(dbc,
+ 0, bucket_pgno, DB_LOCK_WRITE, 0, &block)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &bucket_pgno, dbc->thread_info, dbc->txn,
+ DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &old_pagep)) != 0)
+ goto err;
+
+ /* Sort any unsorted pages before doing a hash split. */
+ if (old_pagep->type == P_HASH_UNSORTED)
+ if ((ret = __ham_sort_page_cursor(dbc, old_pagep)) != 0)
+			goto err;
+
+ /* Properly initialize the new bucket page. */
+ npgno = BUCKET_TO_PAGE(hcp, nbucket);
+ if ((ret = __memp_fget(mpf, &npgno, dbc->thread_info, dbc->txn,
+ DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &new_pagep)) != 0)
+ goto err;
+ P_INIT(new_pagep,
+ dbp->pgsize, npgno, PGNO_INVALID, PGNO_INVALID, 0, P_HASH);
+
+ temp_pagep = hcp->split_buf;
+ memcpy(temp_pagep, old_pagep, dbp->pgsize);
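+	/*
+	 * Work from a copy of the old bucket page: the original is
+	 * re-initialized below, and both it and the new bucket page are
+	 * refilled by re-hashing each key.
+	 */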
+
+ if (DBC_LOGGING(dbc)) {
+ page_dbt.size = dbp->pgsize;
+ page_dbt.data = old_pagep;
+ if ((ret = __ham_splitdata_log(dbp,
+ dbc->txn, &new_lsn, 0, SPLITOLD,
+ PGNO(old_pagep), &page_dbt, &LSN(old_pagep))) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(new_lsn);
+
+ LSN(old_pagep) = new_lsn; /* Structure assignment. */
+
+ P_INIT(old_pagep, dbp->pgsize, PGNO(old_pagep), PGNO_INVALID,
+ PGNO_INVALID, 0, P_HASH);
+
+ big_len = 0;
+ big_buf = NULL;
+ memset(&key, 0, sizeof(key));
+ while (temp_pagep != NULL) {
+ if ((ret = __ham_get_clist(dbp,
+ PGNO(temp_pagep), NDX_INVALID, &carray)) != 0)
+ goto err;
+
+ for (n = 0; n < (db_indx_t)NUM_ENT(temp_pagep); n += 2) {
+ if ((ret = __db_ret(dbc, temp_pagep, H_KEYINDEX(n),
+ &key, &big_buf, &big_len)) != 0)
+ goto err;
+
+ if (__ham_call_hash(dbc, key.data, key.size) == obucket)
+ pp = &old_pagep;
+ else
+ pp = &new_pagep;
+
+ /*
+ * Figure out how many bytes we need on the new
+ * page to store the key/data pair.
+ */
+ len = LEN_HITEM(dbp, temp_pagep, dbp->pgsize,
+ H_DATAINDEX(n)) +
+ LEN_HITEM(dbp, temp_pagep, dbp->pgsize,
+ H_KEYINDEX(n)) +
+ 2 * sizeof(db_indx_t);
+
+ if (P_FREESPACE(dbp, *pp) < len) {
+ if (DBC_LOGGING(dbc)) {
+ page_dbt.size = dbp->pgsize;
+ page_dbt.data = *pp;
+ if ((ret = __ham_splitdata_log(dbp,
+ dbc->txn, &new_lsn, 0,
+ SPLITNEW, PGNO(*pp), &page_dbt,
+ &LSN(*pp))) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(new_lsn);
+ LSN(*pp) = new_lsn;
+ next_pagep = *pp;
+ if ((ret =
+ __ham_add_ovflpage(dbc, &next_pagep)) != 0)
+ goto err;
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, *pp, dbc->priority)) != 0)
+ goto err;
+ *pp = next_pagep;
+ }
+
+ dest_indx = NDX_INVALID;
+ if ((ret = __ham_copypair(dbc, temp_pagep,
+ H_KEYINDEX(n), *pp, &dest_indx, 0)) != 0)
+ goto err;
+
+ /*
+ * Update any cursors that were pointing to items
+ * shuffled because of this insert.
+ * Use __hamc_update, since the cursor adjustments are
+ * the same as those required for an insert. The
+ * overhead of creating a cursor is worthwhile to save
+ * replicating the adjustment functionality.
+ * Adjusting shuffled cursors needs to be done prior to
+ * adjusting any cursors that were pointing to the
+ * moved item.
+ * All pages in a bucket are sorted, but the items are
+ * not sorted across pages within a bucket. This means
+ * that splitting the first page in a bucket into two
+ * new buckets won't require any cursor shuffling,
+ * since all inserts will be appends. Splitting of the
+ * second etc page from the initial bucket could
+ * cause an item to be inserted at any location on a
+ * page (since items already inserted from page 1 of
+ * the initial bucket may overlap), so only adjust
+ * cursors for the second etc pages within a bucket.
+ */
+ if (PGNO(temp_pagep) != bucket_pgno) {
+ if ((ret = __db_cursor_int(dbp,
+ dbc->thread_info, dbc->txn, dbp->type,
+ PGNO_INVALID, 0, DB_LOCK_INVALIDID,
+ &tmp_dbc)) != 0)
+ goto err;
+ hcp = (HASH_CURSOR*)tmp_dbc->internal;
+ hcp->pgno = PGNO(*pp);
+ hcp->indx = dest_indx;
+ hcp->dup_off = 0;
+ hcp->order = 0;
+ if ((ret = __hamc_update(
+ tmp_dbc, len, DB_HAM_CURADJ_ADD, 0)) != 0)
+ goto err;
+ if ((ret = __dbc_close(tmp_dbc)) != 0)
+ goto err;
+ }
+ /* Update any cursors pointing at the moved item. */
+ if (carray != NULL) {
+ found = 0;
+ for (i = 0; carray[i] != NULL; i++) {
+ cp =
+ (HASH_CURSOR *)carray[i]->internal;
+ if (cp->pgno == PGNO(temp_pagep) &&
+ cp->indx == n) {
+ cp->pgno = PGNO(*pp);
+ cp->indx = dest_indx;
+ if (cp->pgno == PGNO(old_pagep))
+ cp->bucket = obucket;
+ else
+ cp->bucket = nbucket;
+ found = 1;
+ }
+ }
+ /*
+ * Only log the update once, since the recovery
+ * code iterates through all open cursors and
+ * applies the change to all matching cursors.
+ */
+ if (found && DBC_LOGGING(dbc) &&
+ IS_SUBTRANSACTION(dbc->txn)) {
+ if ((ret =
+ __ham_chgpg_log(dbp,
+ dbc->txn, &new_lsn, 0,
+ DB_HAM_SPLIT, PGNO(temp_pagep),
+ PGNO(*pp), n, dest_indx)) != 0)
+ goto err;
+ }
+ }
+ }
+ next_pgno = NEXT_PGNO(temp_pagep);
+
+		/* Done with temp_pagep; free it if it's a chained overflow page. */
+ if (PGNO(temp_pagep) != bucket_pgno && (ret =
+ __db_free(dbc, temp_pagep, 0)) != 0) {
+ temp_pagep = NULL;
+ goto err;
+ }
+
+ if (next_pgno == PGNO_INVALID)
+ temp_pagep = NULL;
+ else if ((ret = __memp_fget(mpf,
+ &next_pgno, dbc->thread_info, dbc->txn,
+ DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &temp_pagep)) != 0)
+ goto err;
+
+ if (temp_pagep != NULL) {
+ if (DBC_LOGGING(dbc)) {
+ page_dbt.size = dbp->pgsize;
+ page_dbt.data = temp_pagep;
+ if ((ret = __ham_splitdata_log(dbp,
+ dbc->txn, &new_lsn, 0,
+ SPLITOLD, PGNO(temp_pagep),
+ &page_dbt, &LSN(temp_pagep))) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(new_lsn);
+ LSN(temp_pagep) = new_lsn;
+ }
+
+ if (carray != NULL) /* We never knew its size. */
+ __os_free(env, carray);
+ carray = NULL;
+ }
+ if (big_buf != NULL)
+ __os_free(env, big_buf);
+
+ /*
+ * If the original bucket spanned multiple pages, then we've got
+ * a pointer to a page that used to be on the bucket chain. It
+ * should be deleted.
+ */
+ if (temp_pagep != NULL && PGNO(temp_pagep) != bucket_pgno &&
+ (ret = __db_free(dbc, temp_pagep, 0)) != 0) {
+ temp_pagep = NULL;
+ goto err;
+ }
+
+ /*
+ * Write new buckets out.
+ */
+ if (DBC_LOGGING(dbc)) {
+ page_dbt.size = dbp->pgsize;
+ page_dbt.data = old_pagep;
+ if ((ret = __ham_splitdata_log(dbp, dbc->txn,
+ &new_lsn, 0, SPLITNEW, PGNO(old_pagep), &page_dbt,
+ &LSN(old_pagep))) != 0)
+ goto err;
+ LSN(old_pagep) = new_lsn;
+
+ page_dbt.data = new_pagep;
+ if ((ret = __ham_splitdata_log(dbp, dbc->txn, &new_lsn, 0,
+ SPLITNEW, PGNO(new_pagep), &page_dbt,
+ &LSN(new_pagep))) != 0)
+ goto err;
+ LSN(new_pagep) = new_lsn;
+ } else {
+ LSN_NOT_LOGGED(LSN(old_pagep));
+ LSN_NOT_LOGGED(LSN(new_pagep));
+ }
+
+ ret = __memp_fput(mpf, dbc->thread_info, old_pagep, dbc->priority);
+ if ((t_ret = __memp_fput(mpf,
+ dbc->thread_info, new_pagep, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (0) {
+err: if (old_pagep != NULL)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, old_pagep, dbc->priority);
+ if (new_pagep != NULL) {
+ P_INIT(new_pagep, dbp->pgsize,
+ npgno, PGNO_INVALID, PGNO_INVALID, 0, P_HASH);
+ (void)__memp_fput(mpf,
+ dbc->thread_info, new_pagep, dbc->priority);
+ }
+ if (temp_pagep != NULL && PGNO(temp_pagep) != bucket_pgno)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, temp_pagep, dbc->priority);
+ }
+ if ((t_ret = __TLPUT(dbc, block)) != 0 && ret == 0)
+ ret = t_ret;
+ if (carray != NULL) /* We never knew its size. */
+ __os_free(env, carray);
+ return (ret);
+}
+
+/*
+ * Add the given pair to the page. The page in question may already be
+ * held (i.e. it was already gotten). If it is, then the page is passed
+ * in via the pagep parameter. On return, pagep will contain the page
+ * to which we just added something. This allows us to link overflow
+ * pages and return the new page having correctly put the last page.
+ *
+ * PUBLIC: int __ham_add_el __P((DBC *, const DBT *, const DBT *, u_int32_t));
+ */
+int
+__ham_add_el(dbc, key, val, type)
+ DBC *dbc;
+ const DBT *key, *val;
+ u_int32_t type;
+{
+ const DBT *pkey, *pdata;
+ DB *dbp;
+ DBT key_dbt, data_dbt;
+ DB_LSN new_lsn;
+ DB_MPOOLFILE *mpf;
+ HASH_CURSOR *hcp;
+ HOFFPAGE doff, koff;
+ PAGE *new_pagep;
+ db_pgno_t next_pgno, pgno;
+ u_int32_t data_size, data_type, key_size, key_type;
+ u_int32_t pages, pagespace, pairsize;
+ int do_expand, is_keybig, is_databig, match, ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ hcp = (HASH_CURSOR *)dbc->internal;
+ do_expand = 0;
+
+ pgno = hcp->seek_found_page != PGNO_INVALID ?
+ hcp->seek_found_page : hcp->pgno;
+ if (hcp->page == NULL && (ret = __memp_fget(mpf, &pgno,
+ dbc->thread_info, dbc->txn, DB_MPOOL_CREATE, &hcp->page)) != 0)
+ return (ret);
+
+ key_size = HKEYDATA_PSIZE(key->size);
+ data_size = HKEYDATA_PSIZE(val->size);
+ is_keybig = ISBIG(hcp, key->size);
+ is_databig = ISBIG(hcp, val->size);
+ if (is_keybig)
+ key_size = HOFFPAGE_PSIZE;
+ if (is_databig)
+ data_size = HOFFPAGE_PSIZE;
+
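+	/*
+	 * pairsize is the pair's on-page footprint: HKEYDATA_PSIZE bytes
+	 * for an item stored inline, HOFFPAGE_PSIZE bytes for one that
+	 * will live on overflow pages.
+	 */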
+ pairsize = key_size + data_size;
+
+ /* Advance to first page in chain with room for item. */
+ while (H_NUMPAIRS(hcp->page) && NEXT_PGNO(hcp->page) != PGNO_INVALID) {
+ /*
+ * This may not be the end of the chain, but the pair may fit
+ * anyway. Check if it's a bigpair that fits or a regular
+ * pair that fits.
+ */
+ if (P_FREESPACE(dbp, hcp->page) >= pairsize)
+ break;
+ next_pgno = NEXT_PGNO(hcp->page);
+ if ((ret = __ham_next_cpage(dbc, next_pgno)) != 0)
+ return (ret);
+ }
+
+ /*
+ * Check if we need to allocate a new page.
+ */
+ if (P_FREESPACE(dbp, hcp->page) < pairsize) {
+ do_expand = 1;
+ if ((ret = __memp_dirty(mpf, &hcp->page,
+ dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ return (ret);
+ new_pagep = hcp->page;
+ if ((ret = __ham_add_ovflpage(dbc, &new_pagep)) != 0)
+ return (ret);
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, hcp->page, dbc->priority)) != 0) {
+ (void)__memp_fput(mpf,
+ dbc->thread_info, new_pagep, dbc->priority);
+ return (ret);
+ }
+ hcp->page = new_pagep;
+ hcp->pgno = PGNO(hcp->page);
+ }
+
+ /*
+ * If we don't have a transaction then make sure we will not
+ * run out of file space before updating the key or data.
+ */
+ if (dbc->txn == NULL &&
+ dbp->mpf->mfp->maxpgno != 0 && (is_keybig || is_databig)) {
+ pagespace = P_MAXSPACE(dbp, dbp->pgsize);
+ pages = 0;
+ if (is_databig)
+ pages = ((data_size - 1) / pagespace) + 1;
+ if (is_keybig) {
+ pages += ((key->size - 1) / pagespace) + 1;
+ if (pages >
+ (dbp->mpf->mfp->maxpgno - dbp->mpf->mfp->last_pgno))
+ return (__db_space_err(dbp));
+ }
+ }
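+	/*
+	 * Illustrative arithmetic (the numbers are examples, not from the
+	 * original source): with a 4096-byte page and pagespace close to
+	 * that, a 100KB data item needs ((102400 - 1) / 4096) + 1 = 25
+	 * overflow pages, and the check above rejects the insert when that
+	 * many pages would push last_pgno past maxpgno.
+	 */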
+
+ if ((ret = __memp_dirty(mpf,
+ &hcp->page, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ return (ret);
+
+ /*
+ * Update cursor.
+ */
+ hcp->indx = hcp->seek_found_indx;
+ F_CLR(hcp, H_DELETED);
+ if (is_keybig) {
+ koff.type = H_OFFPAGE;
+ UMRW_SET(koff.unused[0]);
+ UMRW_SET(koff.unused[1]);
+ UMRW_SET(koff.unused[2]);
+ if ((ret = __db_poff(dbc, key, &koff.pgno)) != 0)
+ return (ret);
+ koff.tlen = key->size;
+ key_dbt.data = &koff;
+ key_dbt.size = sizeof(koff);
+ pkey = &key_dbt;
+ key_type = H_OFFPAGE;
+ } else {
+ pkey = key;
+ key_type = H_KEYDATA;
+ }
+
+ if (is_databig) {
+ doff.type = H_OFFPAGE;
+ UMRW_SET(doff.unused[0]);
+ UMRW_SET(doff.unused[1]);
+ UMRW_SET(doff.unused[2]);
+ if ((ret = __db_poff(dbc, val, &doff.pgno)) != 0)
+ return (ret);
+ doff.tlen = val->size;
+ data_dbt.data = &doff;
+ data_dbt.size = sizeof(doff);
+ pdata = &data_dbt;
+ data_type = H_OFFPAGE;
+ } else {
+ pdata = val;
+ data_type = type;
+ }
+
+ /* Sort any unsorted pages before doing the insert. */
+ if (((PAGE *)hcp->page)->type == P_HASH_UNSORTED)
+ if ((ret = __ham_sort_page_cursor(dbc, hcp->page)) != 0)
+ return (ret);
+
+ /*
+ * If inserting on the page found initially, then use the saved index.
+	 * If inserting on a different page, resolve the index now so it can
+	 * be logged.
+	 * The page might be different if the P_FREESPACE constraint failed
+	 * (due to a partial put that increases the data size).
+ */
+ if (PGNO(hcp->page) != hcp->seek_found_page) {
+ if ((ret = __ham_getindex(dbc, hcp->page, pkey,
+ key_type, &match, &hcp->seek_found_indx)) != 0)
+ return (ret);
+ hcp->seek_found_page = PGNO(hcp->page);
+
+ DB_ASSERT(dbp->env, hcp->seek_found_indx <= NUM_ENT(hcp->page));
+ }
+
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __ham_insdel_log(dbp, dbc->txn, &new_lsn, 0,
+ PUTPAIR, PGNO(hcp->page), (u_int32_t)hcp->seek_found_indx,
+ &LSN(hcp->page), OP_SET(key_type, hcp->page), pkey,
+ OP_SET(data_type, hcp->page), pdata)) != 0)
+ return (ret);
+ } else
+ LSN_NOT_LOGGED(new_lsn);
+
+ /* Move lsn onto page. */
+ LSN(hcp->page) = new_lsn; /* Structure assignment. */
+
+ if ((ret = __ham_insertpair(dbc, hcp->page,
+ &hcp->seek_found_indx, pkey, pdata, key_type, data_type)) != 0)
+ return (ret);
+
+ /*
+ * Adjust any cursors that were pointing at items whose indices were
+ * shuffled due to the insert.
+ */
+ if ((ret = __hamc_update(dbc, pairsize, DB_HAM_CURADJ_ADD, 0)) != 0)
+ return (ret);
+
+ /*
+ * For splits, we are going to update item_info's page number
+ * field, so that we can easily return to the same page the
+ * next time we come in here. For other operations, this doesn't
+ * matter, since this is the last thing that happens before we return
+ * to the user program.
+ */
+ hcp->pgno = PGNO(hcp->page);
+ /*
+	 * When an item moves from one page in a bucket to another (due to an
+	 * expanding on-page duplicate set, or a partial put that increases the
+	 * size of an item), the destination index needs to be saved so that the
+	 * __ham_replpair code can update any cursors impacted by the move. For
+ * other operations, this does not matter, since this is the last thing
+ * that happens before we return to the user program.
+ */
+ hcp->indx = hcp->seek_found_indx;
+
+ /*
+ * XXX
+ * Maybe keep incremental numbers here.
+ */
+ if (!STD_LOCKING(dbc)) {
+ if ((ret = __ham_dirty_meta(dbc, 0)) != 0)
+ return (ret);
+ hcp->hdr->nelem++;
+ }
+
+ if (do_expand || (hcp->hdr->ffactor != 0 &&
+ (u_int32_t)H_NUMPAIRS(hcp->page) > hcp->hdr->ffactor))
+ F_SET(hcp, H_EXPAND);
+ return (0);
+}
+
+/*
+ * Special insert pair call -- copies a key/data pair from one page to
+ * another. Works for all types of hash entries (H_OFFPAGE, H_KEYDATA,
+ * H_DUPLICATE, H_OFFDUP). Splits are logged at a high level, so they
+ * need no logging here; callers that do need the insert logged pass a
+ * nonzero log argument.
+ *
+ * dest_indx is an optional parameter; it serves several purposes:
+ * * Ignored if NULL.
+ * * Used as the insert index if non-NULL and not NDX_INVALID.
+ * * Populated with the insert index if non-NULL and NDX_INVALID.
+ *
+ * PUBLIC: int __ham_copypair __P((DBC *, PAGE *, u_int32_t,
+ * PUBLIC: PAGE *, db_indx_t *, int));
+ */
+int
+__ham_copypair(dbc, src_page, src_ndx, dest_page, dest_indx, log)
+ DBC *dbc;
+ PAGE *src_page;
+ u_int32_t src_ndx;
+ PAGE *dest_page;
+ db_indx_t *dest_indx;
+ int log;
+{
+ DB *dbp;
+ DBT tkey, tdata;
+ db_indx_t kindx, dindx, dest;
+ u_int32_t ktype, dtype;
+ int match, ret;
+
+ dbp = dbc->dbp;
+ ret = 0;
+ memset(&tkey, 0, sizeof(tkey));
+ memset(&tdata, 0, sizeof(tdata));
+
+ ktype = HPAGE_TYPE(dbp, src_page, H_KEYINDEX(src_ndx));
+ dtype = HPAGE_TYPE(dbp, src_page, H_DATAINDEX(src_ndx));
+ kindx = H_KEYINDEX(src_ndx);
+ dindx = H_DATAINDEX(src_ndx);
+ if (ktype == H_OFFPAGE) {
+ tkey.data = P_ENTRY(dbp, src_page, kindx);
+ tkey.size = LEN_HITEM(dbp, src_page, dbp->pgsize, kindx);
+ } else {
+ tkey.data = HKEYDATA_DATA(P_ENTRY(dbp, src_page, kindx));
+ tkey.size = LEN_HKEYDATA(dbp, src_page, dbp->pgsize, kindx);
+ }
+ if (dtype == H_OFFPAGE || dtype == H_OFFDUP) {
+ tdata.data = P_ENTRY(dbp, src_page, dindx);
+ tdata.size = LEN_HITEM(dbp, src_page, dbp->pgsize, dindx);
+ } else {
+ tdata.data = HKEYDATA_DATA(P_ENTRY(dbp, src_page, dindx));
+ tdata.size = LEN_HKEYDATA(dbp, src_page, dbp->pgsize, dindx);
+ }
+ if (dest_indx != NULL)
+ dest = *dest_indx;
+ else
+ dest = NDX_INVALID;
+ if (dest == NDX_INVALID) {
+ if ((ret = __ham_getindex(dbc,
+ dest_page, &tkey, ktype, &match, &dest)) != 0)
+ return (ret);
+ /* It is an error to insert a duplicate key */
+ DB_ASSERT(dbp->env, match != 0);
+ }
+
+ if (log == 1) {
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __ham_insdel_log(dbp, dbc->txn,
+ &LSN(dest_page), 0, PUTPAIR,
+ PGNO(dest_page), (u_int32_t)dest, &LSN(dest_page),
+ OP_SET(ktype, dest_page), &tkey,
+ OP_SET(dtype, dest_page), &tdata)) != 0)
+ return (ret);
+ } else
+ LSN_NOT_LOGGED(LSN(dest_page));
+ }
+
+ if ((ret = __ham_insertpair(dbc, dest_page, &dest,
+ &tkey, &tdata, ktype, dtype)) != 0)
+ return (ret);
+
+ DB_ASSERT(dbp->env, dtype != H_DUPLICATE ||
+ HPAGE_TYPE(dbp, dest_page, H_DATAINDEX(dest)) == dtype);
+
+ if (dest_indx != NULL)
+ *dest_indx = dest;
+
+ return (ret);
+}
+
+/*
+ * __ham_add_ovflpage --
+ *
+ * Returns:
+ *	0 on success: *pp points to the new page; non-zero on error: *pp is
+ *	not valid.
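+ *
+ *	The new page is linked into the chain immediately after the page
+ *	passed in via pp; the caller must already have marked that page
+ *	dirty (see the DB_ASSERT below).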
+ *
+ * PUBLIC: int __ham_add_ovflpage __P((DBC *, PAGE **));
+ */
+int
+__ham_add_ovflpage(dbc, pp)
+ DBC *dbc;
+ PAGE **pp;
+{
+ DB *dbp;
+ DB_LSN new_lsn;
+ DB_MPOOLFILE *mpf;
+ PAGE *new_pagep, *pagep;
+ int ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ pagep = *pp;
+ *pp = NULL;
+
+ DB_ASSERT(dbp->env, IS_DIRTY(pagep));
+
+ if ((ret = __db_new(dbc, P_HASH, NULL, &new_pagep)) != 0)
+ return (ret);
+
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __ham_newpage_log(dbp, dbc->txn, &new_lsn, 0,
+ PUTOVFL, PGNO(pagep), &LSN(pagep), PGNO(new_pagep),
+ &LSN(new_pagep), PGNO_INVALID, NULL)) != 0) {
+ (void)__memp_fput(mpf,
+ dbc->thread_info, new_pagep, dbc->priority);
+ return (ret);
+ }
+ } else
+ LSN_NOT_LOGGED(new_lsn);
+
+ /* Move lsn onto page. */
+ LSN(pagep) = LSN(new_pagep) = new_lsn;
+ NEXT_PGNO(pagep) = PGNO(new_pagep);
+
+ PREV_PGNO(new_pagep) = PGNO(pagep);
+
+ *pp = new_pagep;
+ return (0);
+}
+
+/*
+ * PUBLIC: int __ham_get_cpage __P((DBC *, db_lockmode_t));
+ */
+int
+__ham_get_cpage(dbc, mode)
+ DBC *dbc;
+ db_lockmode_t mode;
+{
+ DB *dbp;
+ DB_LOCK tmp_lock;
+ DB_MPOOLFILE *mpf;
+ HASH_CURSOR *hcp;
+ int ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ hcp = (HASH_CURSOR *)dbc->internal;
+ ret = 0;
+
+ /*
+ * There are four cases with respect to buckets and locks.
+ * 1. If there is no lock held, then if we are locking, we should
+ * get the lock.
+ * 2. If there is a lock held, it's for the current bucket, and it's
+ * for the right mode, we don't need to do anything.
+ * 3. If there is a lock held for the current bucket but it's not
+ * strong enough, we need to upgrade.
+ * 4. If there is a lock, but it's for a different bucket, then we need
+ * to release the existing lock and get a new lock.
+ */
+ LOCK_INIT(tmp_lock);
+ if (STD_LOCKING(dbc)) {
+ if (hcp->lbucket != hcp->bucket) { /* Case 4 */
+ if ((ret = __TLPUT(dbc, hcp->lock)) != 0)
+ return (ret);
+ LOCK_INIT(hcp->lock);
+ hcp->stream_start_pgno = PGNO_INVALID;
+ }
+
+ /*
+ * See if we have the right lock. If we are doing
+ * dirty reads we assume the write lock has been downgraded.
+ */
+ if ((LOCK_ISSET(hcp->lock) &&
+ ((hcp->lock_mode == DB_LOCK_READ ||
+ F_ISSET(dbp, DB_AM_READ_UNCOMMITTED)) &&
+ mode == DB_LOCK_WRITE))) {
+ /* Case 3. */
+ tmp_lock = hcp->lock;
+ LOCK_INIT(hcp->lock);
+ }
+
+ /* Acquire the lock. */
+		if (!LOCK_ISSET(hcp->lock))
+			/* Cases 1, 3, and 4. */
+			ret = __ham_lock_bucket(dbc, mode);
+
+ if (ret == 0) {
+ hcp->lock_mode = mode;
+ hcp->lbucket = hcp->bucket;
+ /* Case 3: release the original lock. */
+ if ((ret = __ENV_LPUT(dbp->env, tmp_lock)) != 0)
+ return (ret);
+ } else if (LOCK_ISSET(tmp_lock))
+ hcp->lock = tmp_lock;
+ }
+
+ if (ret == 0 && hcp->page == NULL) {
+ if (hcp->pgno == PGNO_INVALID)
+ hcp->pgno = BUCKET_TO_PAGE(hcp, hcp->bucket);
+ if ((ret = __memp_fget(mpf,
+ &hcp->pgno, dbc->thread_info, dbc->txn,
+ (mode == DB_LOCK_WRITE ? DB_MPOOL_DIRTY : 0) |
+ DB_MPOOL_CREATE, &hcp->page)) != 0)
+ return (ret);
+ }
+ return (0);
+}
+
+/*
+ * Get a new page at the cursor, putting the last page if necessary.
+ *
+ * PUBLIC: int __ham_next_cpage __P((DBC *, db_pgno_t));
+ */
+int
+__ham_next_cpage(dbc, pgno)
+ DBC *dbc;
+ db_pgno_t pgno;
+{
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ HASH_CURSOR *hcp;
+ PAGE *p;
+ int ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ hcp = (HASH_CURSOR *)dbc->internal;
+
+ if (hcp->page != NULL && (ret = __memp_fput(mpf,
+ dbc->thread_info, hcp->page, dbc->priority)) != 0)
+ return (ret);
+ hcp->stream_start_pgno = PGNO_INVALID;
+ hcp->page = NULL;
+
+ if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
+ DB_MPOOL_CREATE, &p)) != 0)
+ return (ret);
+
+ hcp->page = p;
+ hcp->pgno = pgno;
+ hcp->indx = 0;
+
+ return (0);
+}
+
+/*
+ * __ham_lock_bucket --
+ * Get the lock on a particular bucket.
+ *
+ * PUBLIC: int __ham_lock_bucket __P((DBC *, db_lockmode_t));
+ */
+int
+__ham_lock_bucket(dbc, mode)
+ DBC *dbc;
+ db_lockmode_t mode;
+{
+ HASH_CURSOR *hcp;
+ db_pgno_t pgno;
+ int gotmeta, ret;
+
+ hcp = (HASH_CURSOR *)dbc->internal;
+ gotmeta = hcp->hdr == NULL ? 1 : 0;
+ if (gotmeta)
+ if ((ret = __ham_get_meta(dbc)) != 0)
+ return (ret);
+ pgno = BUCKET_TO_PAGE(hcp, hcp->bucket);
+ if (gotmeta)
+ if ((ret = __ham_release_meta(dbc)) != 0)
+ return (ret);
+
+ ret = __db_lget(dbc, 0, pgno, mode, 0, &hcp->lock);
+
+ hcp->lock_mode = mode;
+ return (ret);
+}
+
+/*
+ * __ham_dpair --
+ * Delete a pair on a page, paying no attention to what the pair
+ * represents. The caller is responsible for freeing up duplicates
+ * or offpage entries that might be referenced by this pair.
+ *
+ * Recovery assumes that this may be called without the metadata
+ * page pinned.
+ *
+ * PUBLIC: void __ham_dpair __P((DB *, PAGE *, u_int32_t));
+ */
+void
+__ham_dpair(dbp, p, indx)
+ DB *dbp;
+ PAGE *p;
+ u_int32_t indx;
+{
+ db_indx_t delta, n, *inp;
+ u_int8_t *dest, *src;
+
+ inp = P_INP(dbp, p);
+ /*
+ * Compute "delta", the amount we have to shift all of the
+ * offsets. To find the delta, we just need to calculate
+ * the size of the pair of elements we are removing.
+ */
+ delta = H_PAIRSIZE(dbp, p, dbp->pgsize, indx);
+
+ /*
+ * The hard case: we want to remove something other than
+ * the last item on the page. We need to shift data and
+ * offsets down.
+ */
+ if ((db_indx_t)indx != NUM_ENT(p) - 2) {
+		/*
+		 * Move the data: src is the first occupied byte on the
+		 * page, and everything from there up to the deleted pair's
+		 * data shifts delta bytes toward the end of the page.
+		 */
+ src = (u_int8_t *)p + HOFFSET(p);
+
+ /*
+ * Destination is delta bytes beyond src. This might
+ * be an overlapping copy, so we have to use memmove.
+ */
+ dest = src + delta;
+ memmove(dest, src, inp[H_DATAINDEX(indx)] - HOFFSET(p));
+ }
+
+ /* Adjust page metadata. */
+ HOFFSET(p) = HOFFSET(p) + delta;
+ NUM_ENT(p) = NUM_ENT(p) - 2;
+
+ /* Adjust the offsets. */
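+	/*
+	 * The entry now at slot n formerly lived at slot n + 2; its
+	 * offset grows by delta because its data just moved delta bytes
+	 * toward the end of the page.
+	 */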
+ for (n = (db_indx_t)indx; n < (db_indx_t)(NUM_ENT(p)); n++)
+ inp[n] = inp[n + 2] + delta;
+}
+
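+/*
+ * __hamc_delpg_getorder --
+ *	__db_walk_cursors callback: find the highest order among deleted
+ *	hash cursors already positioned at (new_pgno, indx), so that
+ *	cursors moved onto that page can be assigned orders above it
+ *	(see __hamc_delpg below).
+ */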
+static int
+__hamc_delpg_getorder(cp, my_dbc, orderp, new_pgno, indx, args)
+ DBC *cp, *my_dbc;
+ u_int32_t *orderp;
+ db_pgno_t new_pgno;
+ u_int32_t indx;
+ void *args;
+{
+ HASH_CURSOR *hcp;
+
+ COMPQUIET(args, NULL);
+
+ if (cp == my_dbc || cp->dbtype != DB_HASH)
+ return (0);
+ hcp = (HASH_CURSOR *)cp->internal;
+ if (hcp->pgno == new_pgno &&
+ !MVCC_SKIP_CURADJ(cp, new_pgno)) {
+ if (hcp->indx == indx &&
+ F_ISSET(hcp, H_DELETED) &&
+ hcp->order > *orderp)
+ *orderp = hcp->order;
+ }
+ return (0);
+}
+
+struct __hamc_delpg_setorder_args {
+ db_pgno_t new_pgno;
+ u_int32_t order;
+ db_ham_mode op;
+ DB_TXN *my_txn;
+};
+
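+/*
+ * __hamc_delpg_setorder --
+ *	__db_walk_cursors callback: move cursors off the emptied page as
+ *	directed by args->op, bumping the order of moved deleted cursors
+ *	by the amount computed in the getorder pass; *foundp is set when
+ *	an adjusted cursor belongs to another transaction, so the caller
+ *	knows the adjustment must be logged.
+ */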
+static int
+__hamc_delpg_setorder(cp, my_dbc, foundp, old_pgno, indx, vargs)
+ DBC *cp, *my_dbc;
+ u_int32_t *foundp;
+ db_pgno_t old_pgno;
+ u_int32_t indx;
+ void *vargs;
+{
+ HASH_CURSOR *hcp;
+ struct __hamc_delpg_setorder_args *args;
+
+ if (cp == my_dbc || cp->dbtype != DB_HASH)
+ return (0);
+
+ hcp = (HASH_CURSOR *)cp->internal;
+ args = vargs;
+
+ if (hcp->pgno == old_pgno &&
+ !MVCC_SKIP_CURADJ(cp, old_pgno)) {
+ switch (args->op) {
+ case DB_HAM_DELFIRSTPG:
+ /*
+ * We're moving all items,
+ * regardless of index.
+ */
+ hcp->pgno = args->new_pgno;
+
+ /*
+ * But we have to be careful of
+ * the order values.
+ */
+ if (hcp->indx == indx)
+ hcp->order += args->order;
+ break;
+ case DB_HAM_DELMIDPG:
+ hcp->pgno = args->new_pgno;
+ DB_ASSERT(cp->dbp->env, hcp->indx == 0 &&
+ F_ISSET(hcp, H_DELETED));
+ hcp->order += args->order;
+ break;
+ case DB_HAM_DELLASTPG:
+ hcp->pgno = args->new_pgno;
+ DB_ASSERT(cp->dbp->env, hcp->indx == 0 &&
+ F_ISSET(hcp, H_DELETED));
+ hcp->indx = indx;
+ hcp->order += args->order;
+ break;
+ default:
+ return (__db_unknown_path(
+ cp->dbp->env, "__hamc_delpg"));
+ }
+ if (args->my_txn != NULL && cp->txn != args->my_txn)
+ *foundp = 1;
+ }
+ return (0);
+}
+
+/*
+ * __hamc_delpg --
+ *
+ * Adjust the cursors after we've emptied a page in a bucket, taking
+ * care that when we move cursors pointing to deleted items, their
+ * orders don't collide with the orders of cursors on the page we move
+ * them to (since after this function is called, cursors with the same
+ * index on the two pages will be otherwise indistinguishable--they'll
+ * all have pgno new_pgno). There are three cases:
+ *
+ * 1) The emptied page is the first page in the bucket. In this
+ * case, we've copied all the items from the second page into the
+ * first page, so the first page is new_pgno and the second page is
+ * old_pgno. new_pgno is empty, but can have deleted cursors
+ * pointing at indx 0, so we need to be careful of the orders
+ * there. This is DB_HAM_DELFIRSTPG.
+ *
+ * 2) The page is somewhere in the middle of a bucket. Our caller
+ * can just delete such a page, so it's old_pgno. old_pgno is
+ * empty, but may have deleted cursors pointing at indx 0, so we
+ * need to be careful of indx 0 when we move those cursors to
+ * new_pgno. This is DB_HAM_DELMIDPG.
+ *
+ * 3) The page is the last in a bucket. Again the empty page is
+ * old_pgno, and again it should only have cursors that are deleted
+ * and at indx == 0. This time, though, there's no next page to
+ * move them to, so we set them to indx == num_ent on the previous
+ * page--and indx == num_ent is the index whose cursors we need to
+ * be careful of. This is DB_HAM_DELLASTPG.
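+ *
+ * For example (illustrative): under DB_HAM_DELLASTPG with num_ent == 4,
+ * a deleted cursor at indx 0 on the emptied last page moves to indx 4 on
+ * new_pgno (the previous page), with its order raised above any order
+ * already in use by a deleted cursor at that index.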
+ */
+static int
+__hamc_delpg(dbc, old_pgno, new_pgno, num_ent, op, orderp)
+ DBC *dbc;
+ db_pgno_t old_pgno, new_pgno;
+ u_int32_t num_ent;
+ db_ham_mode op;
+ u_int32_t *orderp;
+{
+ DB *dbp;
+ DB_LSN lsn;
+ db_indx_t indx;
+ int ret;
+ u_int32_t found;
+ struct __hamc_delpg_setorder_args args;
+
+ /* Which is the worrisome index? */
+ indx = (op == DB_HAM_DELLASTPG) ? num_ent : 0;
+
+ dbp = dbc->dbp;
+
+ /*
+ * Find the highest order of any cursor our movement
+ * may collide with.
+ */
+ if ((ret = __db_walk_cursors(dbp, dbc,
+ __hamc_delpg_getorder, &args.order, new_pgno, indx, NULL)) != 0)
+ return (ret);
+ args.order++;
+
+ args.my_txn = IS_SUBTRANSACTION(dbc->txn) ? dbc->txn : NULL;
+ args.op = op;
+ args.new_pgno = new_pgno;
+ if ((ret = __db_walk_cursors(dbp, dbc,
+ __hamc_delpg_setorder, &found, old_pgno, indx, &args)) != 0)
+ return (ret);
+
+ if (found != 0 && DBC_LOGGING(dbc)) {
+ if ((ret = __ham_chgpg_log(dbp, args.my_txn, &lsn, 0, op,
+ old_pgno, new_pgno, indx, args.order)) != 0)
+ return (ret);
+ }
+ *orderp = args.order;
+ return (0);
+}
diff --git a/src/hash/hash_rec.c b/src/hash/hash_rec.c
new file mode 100644
index 00000000..58965569
--- /dev/null
+++ b/src/hash/hash_rec.c
@@ -0,0 +1,1896 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1995, 1996
+ * Margo Seltzer. All rights reserved.
+ */
+/*
+ * Copyright (c) 1995, 1996
+ * The President and Fellows of Harvard University. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Margo Seltzer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/mp.h"
+
+static int __ham_alloc_pages __P((DBC *, __ham_groupalloc_args *, DB_LSN *));
+static int __ham_alloc_pages_42
+ __P((DBC *, __ham_groupalloc_42_args *, DB_LSN *));
+static int __ham_chgpg_recover_func
+ __P((DBC *, DBC *, u_int32_t *, db_pgno_t, u_int32_t, void *));
+
+/*
+ * __ham_insdel_recover --
+ *
+ * PUBLIC: int __ham_insdel_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__ham_insdel_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __ham_insdel_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ db_indx_t dindx;
+ int cmp_n, cmp_p, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__ham_insdel_print);
+ REC_INTRO(__ham_insdel_read, ip, 1);
+
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL,
+ 0, &pagep)) != 0) {
+ if (DB_UNDO(op)) {
+ if (ret == DB_PAGE_NOTFOUND)
+ goto done;
+ else {
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ }
+ }
+ /* If the page is not here then it was later truncated. */
+ if (!IS_ZERO_LSN(argp->pagelsn))
+ goto done;
+ /*
+ * This page was created by a group allocation and
+		 * the file may not have been extended yet.
+ * Create the page if necessary.
+ */
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL,
+ DB_MPOOL_CREATE, &pagep)) != 0) {
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ }
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn);
+
+ /*
+ * Two possible things going on:
+ * redo a delete/undo a put: delete the item from the page.
+ * redo a put/undo a delete: add the item to the page.
+ * If we are undoing a delete, then the information logged is the
+ * entire entry off the page, not just the data of a dbt. In
+ * this case, we want to copy it back onto the page verbatim.
+	 * We do this by calling __ham_insertpair with the type H_OFFPAGE
+	 * instead
+ * of H_KEYDATA.
+ */
+ if ((argp->opcode == DELPAIR && cmp_n == 0 && DB_UNDO(op)) ||
+ (argp->opcode == PUTPAIR && cmp_p == 0 && DB_REDO(op))) {
+ /*
+ * Need to redo a PUT or undo a delete.
+ */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ dindx = (db_indx_t)argp->ndx;
+ if ((ret = __ham_insertpair(dbc, pagep, &dindx, &argp->key,
+ &argp->data, OP_MODE_GET(argp->keytype),
+ OP_MODE_GET(argp->datatype))) != 0)
+ goto out;
+ LSN(pagep) = DB_REDO(op) ? *lsnp : argp->pagelsn;
+ } else if ((argp->opcode == DELPAIR && cmp_p == 0 && DB_REDO(op)) ||
+ (argp->opcode == PUTPAIR && cmp_n == 0 && DB_UNDO(op))) {
+ /* Need to undo a put or redo a delete. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ __ham_dpair(file_dbp, pagep, argp->ndx);
+ LSN(pagep) = DB_REDO(op) ? *lsnp : argp->pagelsn;
+ }
+
+ if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+ /* Return the previous LSN. */
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __ham_insdel_42_recover --
+ *
+ * PUBLIC: int __ham_insdel_42_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__ham_insdel_42_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __ham_insdel_42_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ db_indx_t dindx;
+ u_int32_t dtype, ktype, opcode;
+ int cmp_n, cmp_p, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__ham_insdel_print);
+ REC_INTRO(__ham_insdel_42_read, ip, 1);
+
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL,
+ 0, &pagep)) != 0) {
+ if (DB_UNDO(op)) {
+ if (ret == DB_PAGE_NOTFOUND)
+ goto done;
+ else {
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ }
+ }
+ /* If the page is not here then it was later truncated. */
+ if (!IS_ZERO_LSN(argp->pagelsn))
+ goto done;
+ /*
+ * This page was created by a group allocation and
+		 * the file may not have been extended yet.
+ * Create the page if necessary.
+ */
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL,
+ DB_MPOOL_CREATE, &pagep)) != 0) {
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ }
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn);
+
+ /*
+ * Two possible things going on:
+ * redo a delete/undo a put: delete the item from the page.
+ * redo a put/undo a delete: add the item to the page.
+ * If we are undoing a delete, then the information logged is the
+ * entire entry off the page, not just the data of a dbt. In
+ * this case, we want to copy it back onto the page verbatim.
+	 * We do this by calling __ham_insertpair with the type H_OFFPAGE
+	 * instead
+ * of H_KEYDATA.
+ */
+ opcode = OPCODE_OF(argp->opcode);
+ if ((opcode == DELPAIR && cmp_n == 0 && DB_UNDO(op)) ||
+ (opcode == PUTPAIR && cmp_p == 0 && DB_REDO(op))) {
+ /*
+ * Need to redo a PUT or undo a delete.
+ */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ ktype = DB_UNDO(op) || PAIR_ISKEYBIG(argp->opcode) ?
+ H_OFFPAGE : H_KEYDATA;
+ if (PAIR_ISDATADUP(argp->opcode))
+ dtype = H_DUPLICATE;
+ else if (DB_UNDO(op) || PAIR_ISDATABIG(argp->opcode))
+ dtype = H_OFFPAGE;
+ else
+ dtype = H_KEYDATA;
+ dindx = (db_indx_t)argp->ndx;
+ if ((ret = __ham_insertpair(dbc, pagep, &dindx,
+ &argp->key, &argp->data, ktype, dtype)) != 0)
+ goto out;
+ LSN(pagep) = DB_REDO(op) ? *lsnp : argp->pagelsn;
+ } else if ((opcode == DELPAIR && cmp_p == 0 && DB_REDO(op)) ||
+ (opcode == PUTPAIR && cmp_n == 0 && DB_UNDO(op))) {
+ /* Need to undo a put or redo a delete. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ __ham_dpair(file_dbp, pagep, argp->ndx);
+ LSN(pagep) = DB_REDO(op) ? *lsnp : argp->pagelsn;
+ }
+
+ if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+ /* Return the previous LSN. */
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __ham_newpage_recover --
+ * This log message is used when we add/remove overflow pages. This
+ * message takes care of the pointer chains, not the data on the pages.
+ *
+ * PUBLIC: int __ham_newpage_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__ham_newpage_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __ham_newpage_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ int change, cmp_n, cmp_p, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__ham_newpage_print);
+ REC_INTRO(__ham_newpage_read, ip, 0);
+
+ REC_FGET(mpf, ip, argp->new_pgno, &pagep, ppage);
+ change = 0;
+
+ /*
+ * There are potentially three pages we need to check: the one
+ * that we created/deleted, the one before it and the one after
+ * it.
+ */
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+
+ if ((cmp_p == 0 && DB_REDO(op) && argp->opcode == PUTOVFL) ||
+ (cmp_n == 0 && DB_UNDO(op) && argp->opcode == DELOVFL)) {
+ /* Redo a create new page or undo a delete new page. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ P_INIT(pagep, file_dbp->pgsize, argp->new_pgno,
+ argp->prev_pgno, argp->next_pgno, 0, P_HASH);
+ change = 1;
+ } else if ((cmp_p == 0 && DB_REDO(op) && argp->opcode == DELOVFL) ||
+ (cmp_n == 0 && DB_UNDO(op) && argp->opcode == PUTOVFL)) {
+ /*
+ * Redo a delete or undo a create new page. All we
+ * really need to do is change the LSN.
+ */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ change = 1;
+ }
+
+ if (change)
+ LSN(pagep) = DB_REDO(op) ? *lsnp : argp->pagelsn;
+
+ if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+ /* Now do the prev page. */
+ppage: if (argp->prev_pgno != PGNO_INVALID) {
+ REC_FGET(mpf, ip, argp->prev_pgno, &pagep, npage);
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->prevlsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->prevlsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+ change = 0;
+
+ if ((cmp_p == 0 && DB_REDO(op) && argp->opcode == PUTOVFL) ||
+ (cmp_n == 0 && DB_UNDO(op) && argp->opcode == DELOVFL)) {
+ /* Redo a create new page or undo a delete new page. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ pagep->next_pgno = argp->new_pgno;
+ change = 1;
+ } else if ((cmp_p == 0 &&
+ DB_REDO(op) && argp->opcode == DELOVFL) ||
+ (cmp_n == 0 && DB_UNDO(op) && argp->opcode == PUTOVFL)) {
+ /* Redo a delete or undo a create new page. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ pagep->next_pgno = argp->next_pgno;
+ change = 1;
+ }
+
+ if (change)
+ LSN(pagep) = DB_REDO(op) ? *lsnp : argp->prevlsn;
+
+ if ((ret = __memp_fput(mpf,
+ ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+ }
+
+	/* Now do the next page. */
+npage: if (argp->next_pgno != PGNO_INVALID) {
+ REC_FGET(mpf, ip, argp->next_pgno, &pagep, done);
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->nextlsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->nextlsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+ change = 0;
+
+ if ((cmp_p == 0 && DB_REDO(op) && argp->opcode == PUTOVFL) ||
+ (cmp_n == 0 && DB_UNDO(op) && argp->opcode == DELOVFL)) {
+ /* Redo a create new page or undo a delete new page. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ pagep->prev_pgno = argp->new_pgno;
+ change = 1;
+ } else if ((cmp_p == 0 &&
+ DB_REDO(op) && argp->opcode == DELOVFL) ||
+ (cmp_n == 0 && DB_UNDO(op) && argp->opcode == PUTOVFL)) {
+ /* Redo a delete or undo a create new page. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ pagep->prev_pgno = argp->prev_pgno;
+ change = 1;
+ }
+
+ if (change)
+ LSN(pagep) = DB_REDO(op) ? *lsnp : argp->nextlsn;
+
+ if ((ret = __memp_fput(mpf,
+ ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+ }
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __ham_replace_recover --
+ * This log message refers to partial puts that are local to a single
+ * page. You can think of them as special cases of the more general
+ * insdel log message.
+ *
+ * PUBLIC: int __ham_replace_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__ham_replace_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __ham_replace_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ DBT dbt;
+ PAGE *pagep;
+ u_int32_t change;
+ int cmp_n, cmp_p, is_plus, modified, off, ret;
+ u_int8_t *hk;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__ham_replace_print);
+ REC_INTRO(__ham_replace_read, ip, 0);
+
+ REC_FGET(mpf, ip, argp->pgno, &pagep, done);
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+
+ memset(&dbt, 0, sizeof(dbt));
+ modified = 0;
+
+ /*
+ * Before we know the direction of the transformation we will
+ * determine the size differential; then once we know if we are
+ * redoing or undoing, we'll adjust the sign (is_plus) appropriately.
+ */
+ if (argp->newitem.size > argp->olditem.size) {
+ change = argp->newitem.size - argp->olditem.size;
+ is_plus = 1;
+ } else {
+ change = argp->olditem.size - argp->newitem.size;
+ is_plus = 0;
+ }
+ /*
+ * When chaining from a "regular" record to an off page record
+ * the old record does not contain a header while the new record
+ * does and is at an offset of -1 relative to the data part of
+ * the record. We add this to the amount of the change (which is
+ * an absolute value). If we are undoing then the offset is not
+ * used in the placement of the data.
+ */
+ off = argp->off;
+ if (off < 0 &&
+ (OP_MODE_GET(argp->oldtype) == H_DUPLICATE ||
+ OP_MODE_GET(argp->oldtype) == H_KEYDATA)) {
+ change -= (u_int32_t)off;
+ if (DB_UNDO(op))
+ off = 0;
+ }
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Reapply the change as specified. */
+ dbt.data = argp->newitem.data;
+ dbt.size = argp->newitem.size;
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ LSN(pagep) = *lsnp;
+ /*
+ * The is_plus flag is set properly to reflect
+ * newitem.size - olditem.size.
+ */
+ modified = 1;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Undo the already applied change. */
+ dbt.data = argp->olditem.data;
+ dbt.size = argp->olditem.size;
+ /*
+ * Invert is_plus to reflect sign of
+ * olditem.size - newitem.size.
+ */
+ is_plus = !is_plus;
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ LSN(pagep) = argp->pagelsn;
+ modified = 1;
+ }
+
+ if (modified) {
+ __ham_onpage_replace(file_dbp, pagep,
+ argp->ndx, off, change, is_plus, &dbt);
+ if (argp->oldtype != argp->newtype) {
+ hk = P_ENTRY(file_dbp, pagep, argp->ndx);
+ if (DB_REDO(op))
+ HPAGE_PTYPE(hk) = OP_MODE_GET(argp->newtype);
+ else
+ HPAGE_PTYPE(hk) = OP_MODE_GET(argp->oldtype);
+ }
+ }
+
+ if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __ham_replace_42_recover --
+ * This log message refers to partial puts that are local to a single
+ * page. You can think of them as special cases of the more general
+ * insdel log message.
+ *
+ * PUBLIC: int __ham_replace_42_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__ham_replace_42_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __ham_replace_42_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ DBT dbt;
+ PAGE *pagep;
+ u_int32_t change;
+ int cmp_n, cmp_p, is_plus, modified, ret;
+ u_int8_t *hk;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__ham_replace_print);
+ REC_INTRO(__ham_replace_42_read, ip, 0);
+
+ REC_FGET(mpf, ip, argp->pgno, &pagep, done);
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+
+ memset(&dbt, 0, sizeof(dbt));
+ modified = 0;
+
+ /*
+ * Before we know the direction of the transformation we will
+ * determine the size differential; then once we know if we are
+ * redoing or undoing, we'll adjust the sign (is_plus) appropriately.
+ */
+ if (argp->newitem.size > argp->olditem.size) {
+ change = argp->newitem.size - argp->olditem.size;
+ is_plus = 1;
+ } else {
+ change = argp->olditem.size - argp->newitem.size;
+ is_plus = 0;
+ }
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Reapply the change as specified. */
+ dbt.data = argp->newitem.data;
+ dbt.size = argp->newitem.size;
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ LSN(pagep) = *lsnp;
+ /*
+ * The is_plus flag is set properly to reflect
+ * newitem.size - olditem.size.
+ */
+ modified = 1;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Undo the already applied change. */
+ dbt.data = argp->olditem.data;
+ dbt.size = argp->olditem.size;
+ /*
+ * Invert is_plus to reflect sign of
+ * olditem.size - newitem.size.
+ */
+ is_plus = !is_plus;
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ LSN(pagep) = argp->pagelsn;
+ modified = 1;
+ }
+
+ if (modified) {
+ __ham_onpage_replace(file_dbp, pagep,
+ argp->ndx, argp->off, change, is_plus, &dbt);
+ if (argp->makedup) {
+ hk = P_ENTRY(file_dbp, pagep, argp->ndx);
+ if (DB_REDO(op))
+ HPAGE_PTYPE(hk) = H_DUPLICATE;
+ else
+ HPAGE_PTYPE(hk) = H_KEYDATA;
+ }
+ }
+
+ if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __ham_splitdata_recover --
+ *
+ * PUBLIC: int __ham_splitdata_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__ham_splitdata_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __ham_splitdata_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ int cmp_n, cmp_p, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__ham_splitdata_print);
+ REC_INTRO(__ham_splitdata_read, ip, 1);
+
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
+ if (DB_UNDO(op)) {
+ if (ret == DB_PAGE_NOTFOUND)
+ goto done;
+ else {
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ }
+ }
+ /* If the page is not here then it was later truncated. */
+ if (!IS_ZERO_LSN(argp->pagelsn))
+ goto done;
+ /*
+ * This page was created by a group allocation and
+		 * the file may not have been extended yet.
+ * Create the page if necessary.
+ */
+ if ((ret = __memp_fget(mpf, &argp->pgno,
+ ip, NULL, DB_MPOOL_CREATE, &pagep)) != 0) {
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ }
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+
+ /*
+ * There are three types of log messages here. Two are related
+ * to an actual page split operation, one for the old page
+ * and one for the new pages created. The original image in the
+ * SPLITOLD record is used for undo. The image in the SPLITNEW
+ * is used for redo. We should never have a case where there is
+ * a redo operation and the SPLITOLD record is on disk, but not
+ * the SPLITNEW record. Therefore, we only have work to do when
+ * redo NEW messages and undo OLD messages, but we have to update
+ * LSNs in both cases.
+ *
+ * The third message is generated when a page is sorted (SORTPAGE). In
+ * an undo the original image in the SORTPAGE is used. In a redo we
+ * recreate the sort operation by calling __ham_sort_page.
+ */
+ if (cmp_p == 0 && DB_REDO(op)) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ if (argp->opcode == SPLITNEW)
+ /* Need to redo the split described. */
+ memcpy(pagep, argp->pageimage.data,
+ argp->pageimage.size);
+ else if (argp->opcode == SORTPAGE) {
+ if ((ret = __ham_sort_page(dbc, NULL, pagep)) != 0)
+ goto out;
+ }
+ LSN(pagep) = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ if (argp->opcode == SPLITOLD || argp->opcode == SORTPAGE) {
+ /* Put back the old image. */
+ memcpy(pagep, argp->pageimage.data,
+ argp->pageimage.size);
+ } else
+ P_INIT(pagep, file_dbp->pgsize, argp->pgno,
+ PGNO_INVALID, PGNO_INVALID, 0, P_HASH);
+ LSN(pagep) = argp->pagelsn;
+ }
+ if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __ham_copypage_recover --
+ * Recovery function for copypage.
+ *
+ * PUBLIC: int __ham_copypage_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__ham_copypage_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __ham_copypage_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ int cmp_n, cmp_p, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__ham_copypage_print);
+ REC_INTRO(__ham_copypage_read, ip, 0);
+
+ /* This is the bucket page. */
+ REC_FGET(mpf, ip, argp->pgno, &pagep, donext);
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn);
+
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Need to redo update described. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ memcpy(pagep, argp->page.data, argp->page.size);
+ PGNO(pagep) = argp->pgno;
+ PREV_PGNO(pagep) = PGNO_INVALID;
+ LSN(pagep) = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Need to undo update described. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ P_INIT(pagep, file_dbp->pgsize, argp->pgno, PGNO_INVALID,
+ argp->next_pgno, 0, P_HASH);
+ LSN(pagep) = argp->pagelsn;
+ }
+ if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+donext: /* Now fix up the "next" page. */
+ REC_FGET(mpf, ip, argp->next_pgno, &pagep, do_nn);
+
+ /* For REDO just update the LSN. For UNDO copy page back. */
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->nextlsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->nextlsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ LSN(pagep) = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Need to undo update described. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ memcpy(pagep, argp->page.data, argp->page.size);
+ }
+ if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+ /* Now fix up the next's next page. */
+do_nn: if (argp->nnext_pgno == PGNO_INVALID)
+ goto done;
+
+ REC_FGET(mpf, ip, argp->nnext_pgno, &pagep, done);
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->nnextlsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->nnextlsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Need to redo update described. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ PREV_PGNO(pagep) = argp->pgno;
+ LSN(pagep) = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Need to undo update described. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ PREV_PGNO(pagep) = argp->next_pgno;
+ LSN(pagep) = argp->nnextlsn;
+ }
+ if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __ham_metagroup_recover --
+ * Recovery function for metagroup.
+ *
+ * PUBLIC: int __ham_metagroup_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__ham_metagroup_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __ham_metagroup_args *argp;
+ DB_THREAD_INFO *ip;
+ HASH_CURSOR *hcp;
+ DB *file_dbp;
+ DBMETA *mmeta;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ db_pgno_t pgno;
+ int cmp_n, cmp_p, did_alloc, groupgrow, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ mmeta = NULL;
+ did_alloc = 0;
+ REC_PRINT(__ham_metagroup_print);
+ REC_INTRO(__ham_metagroup_read, ip, 1);
+
+ /*
+ * This logs the virtual create of pages pgno to pgno + bucket.
+ * The log record contains:
+ * bucket: old maximum bucket
+ * pgno: page number of the new bucket.
+	 * Since __db_log2 rounds up, we can tell that we are about to double
+	 * the hash table by checking whether argp->bucket + 1 is a power of 2.
+	 * If it is, then we are allocating an entire doubling of pages;
+	 * otherwise, we are simply allocating one new page.
+ */
+ groupgrow =
+ (u_int32_t)(1 << __db_log2(argp->bucket + 1)) == argp->bucket + 1;
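+	/*
+	 * For example (illustrative): if argp->bucket == 7, bucket + 1 == 8
+	 * is a power of two, so this record covered an entire doubling
+	 * (buckets 8..15); if argp->bucket == 8, only the single page for
+	 * bucket 9 was allocated.
+	 */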
+ pgno = argp->pgno;
+ if (argp->newalloc)
+ pgno += argp->bucket;
+
+ pagep = NULL;
+ ret = __memp_fget(mpf, &pgno, ip, NULL, 0, &pagep);
+
+ /* If we are undoing, then we don't want to create the page. */
+ if (ret != 0 && DB_REDO(op))
+ ret = __memp_fget(mpf,
+ &pgno, ip, NULL, DB_MPOOL_CREATE, &pagep);
+ else if (ret == DB_PAGE_NOTFOUND)
+ goto do_meta;
+ if (ret != 0) {
+ if (ret != ENOSPC)
+ goto out;
+ pgno = 0;
+ goto do_meta;
+ }
+
+ /*
+ * When we get here then either we did not grow the file
+ * (groupgrow == 0) or we did grow the file and the allocation
+ * of those new pages succeeded.
+ */
+ did_alloc = groupgrow;
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn);
+
+ if (cmp_p == 0 && DB_REDO(op)) {
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ pagep->lsn = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* If this record allocated the pages give them back. */
+ if (argp->newalloc) {
+ if (pagep != NULL && (ret = __memp_fput(mpf,
+ ip, pagep, DB_PRIORITY_VERY_LOW)) != 0)
+ goto out;
+ pagep = NULL;
+ if ((ret = __memp_ftruncate(mpf, NULL, ip,
+ argp->pgno, 0)) != 0)
+ goto out;
+ } else {
+ /*
+ * Otherwise just roll the page back to its
+ * previous state.
+ */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ pagep->lsn = argp->pagelsn;
+ }
+ }
+ if (pagep != NULL &&
+ (ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
+ goto out;
+
+ /*
+	 * If an earlier aborted allocation used one of our pages, it may
+	 * be in the wrong state; read all the pages in the group and
+	 * initialize them to be empty.
+ */
+ if (DB_REDO(op) && argp->newalloc) {
+ for (pgno = argp->pgno;
+ pgno < argp->pgno + argp->bucket; pgno++) {
+ if ((ret = __memp_fget(mpf,
+ &pgno, ip, NULL, DB_MPOOL_CREATE, &pagep)) != 0)
+ goto out;
+
+ if (IS_ZERO_LSN(LSN(pagep))) {
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ P_INIT(pagep, file_dbp->pgsize,
+ PGNO_INVALID, PGNO_INVALID, PGNO_INVALID,
+ 0, P_HASH);
+ }
+ if ((ret =
+ __memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
+ goto out;
+ }
+ }
+
+do_meta:
+ /* Now we have to update the meta-data page. */
+ hcp = (HASH_CURSOR *)dbc->internal;
+ if ((ret = __ham_get_meta(dbc)) != 0)
+ goto out;
+ cmp_n = LOG_COMPARE(lsnp, &hcp->hdr->dbmeta.lsn);
+ cmp_p = LOG_COMPARE(&hcp->hdr->dbmeta.lsn, &argp->metalsn);
+ CHECK_LSN(env, op, cmp_p, &hcp->hdr->dbmeta.lsn, &argp->metalsn);
+ CHECK_ABORT(env, op, cmp_n, &hcp->hdr->dbmeta.lsn, lsnp);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Redo the actual updating of bucket counts. */
+ REC_DIRTY(mpf, ip, dbc->priority, &hcp->hdr);
+ ++hcp->hdr->max_bucket;
+ if (groupgrow) {
+ hcp->hdr->low_mask = hcp->hdr->high_mask;
+ hcp->hdr->high_mask =
+ (argp->bucket + 1) | hcp->hdr->low_mask;
+ }
+ hcp->hdr->dbmeta.lsn = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Undo the actual updating of bucket counts. */
+ REC_DIRTY(mpf, ip, dbc->priority, &hcp->hdr);
+ hcp->hdr->max_bucket = argp->bucket;
+ if (groupgrow) {
+ hcp->hdr->high_mask = argp->bucket;
+ hcp->hdr->low_mask = hcp->hdr->high_mask >> 1;
+ }
+ hcp->hdr->dbmeta.lsn = argp->metalsn;
+ }
+
+ /*
+ * Now we need to fix up the spares array. Each entry in the
+ * spares array indicates the beginning page number for the
+ * indicated doubling.
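+	 * (A bucket's page number is the bucket number plus the spares
+	 * entry for its doubling; the doubling recorded here starts at
+	 * bucket argp->bucket + 1 on page argp->pgno, so the stored offset
+	 * is argp->pgno - (argp->bucket + 1), i.e. (argp->pgno -
+	 * argp->bucket) - 1, as assigned below.)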
+ */
+ if (cmp_p == 0 && did_alloc && !DB_UNDO(op)) {
+ REC_DIRTY(mpf, ip, dbc->priority, &hcp->hdr);
+ hcp->hdr->spares[__db_log2(argp->bucket + 1) + 1] =
+ (argp->pgno - argp->bucket) - 1;
+ }
+ if (cmp_n == 0 && groupgrow && DB_UNDO(op)) {
+ REC_DIRTY(mpf, ip, dbc->priority, &hcp->hdr);
+ hcp->hdr->spares[
+ __db_log2(argp->bucket + 1) + 1] = PGNO_INVALID;
+ }
+
+ /*
+ * Finally, we need to potentially fix up the last_pgno field
+ * in the master meta-data page (which may or may not be the
+ * same as the hash header page).
+ */
+ if (argp->mmpgno != argp->mpgno) {
+ if ((ret = __memp_fget(mpf,
+ &argp->mmpgno, ip, NULL, DB_MPOOL_EDIT, &mmeta)) != 0) {
+ if (DB_UNDO(op) && ret == DB_PAGE_NOTFOUND)
+ ret = 0;
+ goto out;
+ }
+ cmp_n = LOG_COMPARE(lsnp, &mmeta->lsn);
+ cmp_p = LOG_COMPARE(&mmeta->lsn, &argp->mmetalsn);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ REC_DIRTY(mpf, ip, dbc->priority, &mmeta);
+ mmeta->lsn = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ REC_DIRTY(mpf, ip, dbc->priority, &mmeta);
+ mmeta->lsn = argp->mmetalsn;
+ }
+ } else {
+ mmeta = (DBMETA *)hcp->hdr;
+ REC_DIRTY(mpf, ip, dbc->priority, &mmeta);
+ }
+
+ if (cmp_n == 0 && DB_UNDO(op))
+ mmeta->last_pgno = argp->last_pgno;
+ else if (cmp_p == 0 && DB_REDO(op) && mmeta->last_pgno < pgno)
+ mmeta->last_pgno = pgno;
+
+ if (argp->mmpgno != argp->mpgno &&
+ (ret = __memp_fput(mpf, ip, mmeta, dbc->priority)) != 0)
+ goto out;
+ mmeta = NULL;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (mmeta != NULL)
+ (void)__memp_fput(mpf, ip, mmeta, dbc->priority);
+ if (dbc != NULL)
+ (void)__ham_release_meta(dbc);
+
+ REC_CLOSE;
+}
+
+/*
+ * __ham_contract_recover --
+ * Recovery function for contracting a hash table
+ *
+ * PUBLIC: int __ham_contract_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__ham_contract_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __ham_contract_args *argp;
+ DB_THREAD_INFO *ip;
+ DB_MPOOLFILE *mpf;
+ DB *file_dbp;
+ DBC *dbc;
+ HASH_CURSOR *hcp;
+ HMETA *meta;
+ int cmp_n, cmp_p, ret, t_ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ REC_PRINT(__ham_contract_print);
+ REC_INTRO(__ham_contract_read, ip, 1);
+
+ hcp = (HASH_CURSOR *)dbc->internal;
+ if ((ret = __ham_get_meta(dbc)) != 0)
+ goto done;
+ meta = hcp->hdr;
+ cmp_n = LOG_COMPARE(lsnp, &meta->dbmeta.lsn);
+ cmp_p = LOG_COMPARE(&meta->dbmeta.lsn, &argp->meta_lsn);
+ CHECK_LSN(env, op, cmp_p, &meta->dbmeta.lsn, &argp->meta_lsn);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ REC_DIRTY(mpf, ip, dbc->priority, &hcp->hdr);
+ meta = hcp->hdr;
+ meta->max_bucket = argp->bucket - 1;
+ if (argp->bucket == meta->low_mask + 1) {
+ meta->spares[
+ __db_log2(argp->bucket) + 1] = PGNO_INVALID;
+ meta->high_mask = meta->low_mask;
+ meta->low_mask >>= 1;
+ }
+ meta->dbmeta.lsn = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ REC_DIRTY(mpf, ip, dbc->priority, &hcp->hdr);
+ meta = hcp->hdr;
+ meta->max_bucket = argp->bucket;
+ if (argp->bucket == meta->high_mask + 1) {
+ meta->spares[__db_log2(argp->bucket) + 1] =
+ argp->pgno - argp->bucket;
+ meta->low_mask = meta->high_mask;
+ meta->high_mask = meta->max_bucket | meta->low_mask;
+ }
+ meta->dbmeta.lsn = argp->meta_lsn;
+ }
+ *lsnp = argp->prev_lsn;
+
+out: if ((t_ret = __ham_release_meta(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+done: REC_CLOSE;
+}
+
+/*
+ * __ham_groupalloc_recover --
+ * Recover the batch creation of a set of pages for a new database.
+ *
+ * PUBLIC: int __ham_groupalloc_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__ham_groupalloc_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __ham_groupalloc_args *argp;
+ DB_THREAD_INFO *ip;
+ DBMETA *mmeta;
+ DB_MPOOLFILE *mpf;
+ DB *file_dbp;
+ DBC *dbc;
+ PAGE *pagep;
+ db_pgno_t pgno;
+ int cmp_n, cmp_p, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ mmeta = NULL;
+ REC_PRINT(__ham_groupalloc_print);
+ REC_INTRO(__ham_groupalloc_read, ip, 1);
+
+ pgno = PGNO_BASE_MD;
+ if ((ret = __memp_fget(mpf, &pgno, ip, NULL, 0, &mmeta)) != 0) {
+ if (DB_REDO(op)) {
+ ret = __db_pgerr(file_dbp, pgno, ret);
+ goto out;
+ } else
+ goto done;
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(mmeta));
+ cmp_p = LOG_COMPARE(&LSN(mmeta), &argp->meta_lsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(mmeta), &argp->meta_lsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(mmeta), lsnp);
+
+ /*
+ * Basically, we used mpool to allocate a chunk of pages.
+ * We need to either add those to a free list (in the undo
+ * case) or initialize them (in the redo case).
+ *
+ * If we are redoing and this is a hash subdatabase, it's possible
+ * that the pages were never allocated, so we'd better check for
+ * that and handle it here.
+ */
+ pgno = argp->start_pgno + argp->num - 1;
+ if (DB_REDO(op)) {
+ if ((ret = __ham_alloc_pages(dbc, argp, lsnp)) != 0)
+ goto out;
+ if (cmp_p == 0) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &mmeta);
+ LSN(mmeta) = *lsnp;
+ }
+ } else if (DB_UNDO(op)) {
+ /*
+ * Fetch the last page and determine if it is in
+ * the post allocation state.
+ */
+ pagep = NULL;
+ if ((ret = __memp_fget(mpf, &pgno,
+ ip, NULL, DB_MPOOL_EDIT, &pagep)) == 0) {
+ if (LOG_COMPARE(&pagep->lsn, lsnp) != 0) {
+ if ((ret = __memp_fput(mpf, ip,
+ pagep, DB_PRIORITY_VERY_LOW)) != 0)
+ goto out;
+ pagep = NULL;
+ }
+ } else if (ret != DB_PAGE_NOTFOUND)
+ goto out;
+ /*
+ * If the last page was allocated then truncate back
+ * to the first page.
+ */
+ if (pagep != NULL) {
+ if ((ret = __memp_fput(mpf, ip,
+ pagep, DB_PRIORITY_VERY_LOW)) != 0)
+ goto out;
+ if ((ret = __memp_ftruncate(mpf, NULL,
+ ip, argp->start_pgno, 0)) != 0)
+ goto out;
+ }
+
+ /*
+ * If we are rolling back the metapage, then make
+		 * sure it reflects the correct last_pgno.
+ */
+ if (cmp_n == 0) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &mmeta);
+ mmeta->last_pgno = argp->last_pgno;
+ }
+ pgno = 0;
+ if (cmp_n == 0) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &mmeta);
+ LSN(mmeta) = argp->meta_lsn;
+ }
+ }
+
+ /*
+ * Set the last page number to the current value.
+ */
+ if (pgno > mmeta->last_pgno) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &mmeta);
+ mmeta->last_pgno = pgno;
+ }
+
+done: if (ret == 0)
+ *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (mmeta != NULL)
+ (void)__memp_fput(mpf, ip, mmeta, file_dbp->priority);
+
+ REC_CLOSE;
+}
+
+/*
+ * __ham_alloc_pages --
+ *
+ * Called during redo of a file create. We create new pages in the file
+ * using the MPOOL_NEW_GROUP flag. We then log the meta-data page with a
+ * __crdel_metasub message. If we manage to crash without the newly written
+ * pages getting to disk (I'm not sure this can happen anywhere except our
+ * test suite?!), then we need to go through and recreate the final pages.
+ * Hash normally has holes in its files and handles them appropriately.
+ */
+static int
+__ham_alloc_pages(dbc, argp, lsnp)
+ DBC *dbc;
+ __ham_groupalloc_args *argp;
+ DB_LSN *lsnp;
+{
+ DB *file_dbp;
+ DB_MPOOLFILE *mpf;
+ DB_THREAD_INFO *ip;
+ PAGE *pagep;
+ db_pgno_t pgno;
+ int ret;
+
+ file_dbp = dbc->dbp;
+ mpf = file_dbp->mpf;
+ ip = dbc->thread_info;
+
+ /* Read the last page of the allocation. */
+ pgno = argp->start_pgno + argp->num - 1;
+
+ /* If the page exists, and it has been initialized, then we're done. */
+ if ((ret =
+ __memp_fget(mpf, &pgno, ip, NULL, 0, &pagep)) == 0) {
+ if (NUM_ENT(pagep) == 0 && IS_ZERO_LSN(pagep->lsn))
+ goto reinit_page;
+ return (__memp_fput(mpf, ip, pagep, dbc->priority));
+ }
+
+ /* Had to create the page. */
+ if ((ret = __memp_fget(mpf, &pgno,
+ ip, NULL, DB_MPOOL_CREATE, &pagep)) != 0)
+ return (__db_pgerr(dbc->dbp, pgno, ret));
+
+reinit_page:
+ /* Initialize the newly allocated page. */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ P_INIT(pagep, dbc->dbp->pgsize,
+ pgno, PGNO_INVALID, PGNO_INVALID, 0, P_HASH);
+ pagep->lsn = *lsnp;
+
+out: return (__memp_fput(mpf, ip, pagep, dbc->priority));
+}
+
+/*
+ * __ham_changeslot_recover --
+ * Recovery function for changeslot.
+ * When we compact a hash database we may change one of the spares slots
+ *	to point at a new block of pages.
+ *
+ * PUBLIC: int __ham_changeslot_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__ham_changeslot_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __ham_changeslot_args *argp;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ DB_THREAD_INFO *ip;
+ HASH_CURSOR *hcp;
+ HMETA *meta;
+ u_int32_t bucket;
+ int cmp_n, cmp_p, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+
+ REC_PRINT(__ham_changeslot_print);
+ REC_INTRO(__ham_changeslot_read, ip, 1);
+
+ hcp = (HASH_CURSOR *)dbc->internal;
+ if ((ret = __ham_get_meta(dbc)) != 0)
+ goto out;
+ meta = hcp->hdr;
+ cmp_n = log_compare(lsnp, &LSN(meta));
+ cmp_p = log_compare(&LSN(meta), &argp->meta_lsn);
+
+ bucket = argp->slot == 0 ? 0 : 1 << (argp->slot - 1);
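+	/*
+	 * For illustration (hypothetical numbers): spares[slot] stores the
+	 * first page of a doubling minus the first bucket of that doubling.
+	 * Slot 3 covers buckets 4..7, so its first bucket is
+	 * 1 << (3 - 1) == 4; if the new block starts at page 100, redo
+	 * stores 100 - 4 == 96 in spares[3].
+	 */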
+ if (cmp_p == 0 && DB_REDO(op)) {
+ REC_DIRTY(mpf, ip, dbc->priority, &hcp->hdr);
+ meta = hcp->hdr;
+ meta->spares[argp->slot] = argp->new - bucket;
+ LSN(meta) = *lsnp;
+ } else if (cmp_n == 0 && !DB_REDO(op)) {
+ REC_DIRTY(mpf, ip, dbc->priority, &hcp->hdr);
+ meta = hcp->hdr;
+ meta->spares[argp->slot] = argp->old - bucket;
+ LSN(meta) = argp->meta_lsn;
+ }
+ *lsnp = argp->prev_lsn;
+ ret = __ham_release_meta(dbc);
+
+done:
+out: REC_CLOSE;
+}
+
+/*
+ * __ham_curadj_recover --
+ * Undo cursor adjustments if a subtransaction fails.
+ *
+ * PUBLIC: int __ham_curadj_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__ham_curadj_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __ham_curadj_args *argp;
+ db_ham_curadj mode, hamc_mode;
+ DB_THREAD_INFO *ip;
+ DB_MPOOLFILE *mpf;
+ DB *file_dbp;
+ DBC *dbc;
+ HASH_CURSOR *hcp;
+ int ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ REC_PRINT(__ham_curadj_print);
+ REC_INTRO(__ham_curadj_read, ip, 1);
+
+ if (op != DB_TXN_ABORT)
+ goto done;
+
+ mode = (db_ham_curadj)argp->add;
+
+ /*
+ * Reverse the logged operation, so that the consequences are reversed
+ * by the __hamc_update code.
+ */
+ switch (mode) {
+ case DB_HAM_CURADJ_DEL:
+ hamc_mode = DB_HAM_CURADJ_ADD;
+ break;
+ case DB_HAM_CURADJ_ADD:
+ hamc_mode = DB_HAM_CURADJ_DEL;
+ break;
+ case DB_HAM_CURADJ_ADDMOD:
+ hamc_mode = DB_HAM_CURADJ_DELMOD;
+ break;
+ case DB_HAM_CURADJ_DELMOD:
+ hamc_mode = DB_HAM_CURADJ_ADDMOD;
+ break;
+ default:
+ __db_errx(env, DB_STR("1122",
+ "Invalid flag in __ham_curadj_recover"));
+ ret = EINVAL;
+ goto out;
+ }
+
+ /*
+	 * Undo the adjustment by reinitializing the cursor to look like
+	 * the one that was used to do the adjustment, then invert the
+	 * add mode so that __hamc_update undoes the adjustment.
+ */
+ hcp = (HASH_CURSOR *)dbc->internal;
+ hcp->pgno = argp->pgno;
+ hcp->indx = argp->indx;
+ hcp->dup_off = argp->dup_off;
+ hcp->order = argp->order;
+ if (mode == DB_HAM_CURADJ_DEL)
+ F_SET(hcp, H_DELETED);
+ (void)__hamc_update(dbc, argp->len, hamc_mode, argp->is_dup);
+
+done: *lsnp = argp->prev_lsn;
+out: REC_CLOSE;
+}
+
+static int
+__ham_chgpg_recover_func(cp, my_dbc, countp, pgno, indx, vargs)
+ DBC *cp, *my_dbc;
+ u_int32_t *countp;
+ db_pgno_t pgno;
+ u_int32_t indx;
+ void *vargs;
+{
+ BTREE_CURSOR *opdcp;
+ HASH_CURSOR *lcp;
+ u_int32_t order;
+ int ret;
+ __ham_chgpg_args *argp;
+
+ COMPQUIET(my_dbc, NULL);
+ COMPQUIET(countp, NULL);
+ COMPQUIET(pgno, 0);
+ lcp = (HASH_CURSOR *)cp->internal;
+ argp = vargs;
+
+ /* Overloaded field for DB_HAM_DEL*PG */
+ order = argp->new_indx;
+
+ switch (argp->mode) {
+ case DB_HAM_DELFIRSTPG:
+ if (lcp->pgno != argp->new_pgno ||
+ MVCC_SKIP_CURADJ(cp, lcp->pgno))
+ break;
+ if (lcp->indx != indx ||
+ !F_ISSET(lcp, H_DELETED) ||
+ lcp->order >= order) {
+ lcp->pgno = argp->old_pgno;
+ if (lcp->indx == indx)
+ lcp->order -= order;
+ }
+ break;
+ case DB_HAM_DELMIDPG:
+ case DB_HAM_DELLASTPG:
+ if (lcp->pgno == argp->new_pgno &&
+ lcp->indx == indx &&
+ F_ISSET(lcp, H_DELETED) &&
+ lcp->order >= order &&
+ !MVCC_SKIP_CURADJ(cp, lcp->pgno)) {
+ lcp->pgno = argp->old_pgno;
+ lcp->order -= order;
+ lcp->indx = 0;
+ }
+ break;
+ case DB_HAM_CHGPG:
+ /*
+ * If we're doing a CHGPG, we're undoing
+ * the move of a non-deleted item to a
+ * new page. Any cursors with the deleted
+ * flag set do not belong to this item;
+ * don't touch them.
+ */
+ if (F_ISSET(lcp, H_DELETED))
+ break;
+ /* FALLTHROUGH */
+ case DB_HAM_SPLIT:
+ if (lcp->pgno == argp->new_pgno &&
+ lcp->indx == argp->new_indx &&
+ !MVCC_SKIP_CURADJ(cp, lcp->pgno)) {
+ lcp->indx = argp->old_indx;
+ lcp->pgno = argp->old_pgno;
+ }
+ break;
+ case DB_HAM_DUP:
+ if (lcp->opd == NULL)
+ break;
+ opdcp = (BTREE_CURSOR *)lcp->opd->internal;
+ if (opdcp->pgno != argp->new_pgno ||
+ opdcp->indx != argp->new_indx ||
+ MVCC_SKIP_CURADJ(lcp->opd, opdcp->pgno))
+ break;
+
+ if (F_ISSET(opdcp, C_DELETED))
+ F_SET(lcp, H_DELETED);
+ /*
+ * We can't close a cursor while we have the
+ * dbp mutex locked, since c_close reacquires
+ * it. It should be safe to drop the mutex
+ * here, though, since newly opened cursors
+ * are put only at the end of the tailq and
+ * the cursor we're adjusting can't be closed
+ * under us.
+ */
+ MUTEX_UNLOCK(cp->dbp->env, cp->dbp->mutex);
+ ret = __dbc_close(lcp->opd);
+ MUTEX_LOCK(cp->dbp->env, cp->dbp->mutex);
+ if (ret != 0)
+ return (ret);
+ lcp->opd = NULL;
+ break;
+ }
+ return (0);
+}
+
+/*
+ * __ham_chgpg_recover --
+ * Undo cursor adjustments if a subtransaction fails.
+ *
+ * PUBLIC: int __ham_chgpg_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__ham_chgpg_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __ham_chgpg_args *argp;
+ DB_THREAD_INFO *ip;
+ DB_MPOOLFILE *mpf;
+ DB *file_dbp;
+ DBC *dbc;
+ int ret;
+ u_int32_t count;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ REC_PRINT(__ham_chgpg_print);
+ REC_INTRO(__ham_chgpg_read, ip, 0);
+
+ if (op != DB_TXN_ABORT)
+ goto done;
+
+ ret = __db_walk_cursors(file_dbp, dbc,
+ __ham_chgpg_recover_func, &count, 0, argp->old_indx, argp);
+
+done: *lsnp = argp->prev_lsn;
+out: REC_CLOSE;
+}
+
+/*
+ * __ham_metagroup_42_recover --
+ * Recovery function for metagroup.
+ *
+ * PUBLIC: int __ham_metagroup_42_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__ham_metagroup_42_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __ham_metagroup_42_args *argp;
+ DB_THREAD_INFO *ip;
+ HASH_CURSOR *hcp;
+ DB *file_dbp;
+ DBMETA *mmeta;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ db_pgno_t pgno;
+ u_int32_t flags;
+ int cmp_n, cmp_p, did_alloc, groupgrow, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ mmeta = NULL;
+ did_alloc = 0;
+ REC_PRINT(__ham_metagroup_42_print);
+ REC_INTRO(__ham_metagroup_42_read, ip, 1);
+
+ /*
+	 * This logs the virtual creation of pages pgno through pgno + bucket.
+	 * If HAVE_FTRUNCATE is not supported, the mpool page allocation is
+	 * not transaction protected, so we can never undo it.  Even in an
+	 * abort, we have to allocate these pages to the hash table if they
+ * were actually created. In particular, during disaster
+ * recovery the metapage may be before this point if we
+ * are rolling backward. If the file has not been extended
+ * then the metapage could not have been updated.
+ * The log record contains:
+ * bucket: old maximum bucket
+ * pgno: page number of the new bucket.
+	 * We round up on log calculations, so we can figure out whether we
+	 * are about to double the hash table by checking whether
+	 * argp->bucket+1 is a power of 2.  If it is, we are allocating an
+	 * entire doubling of pages; otherwise, we are simply allocating one
+	 * new page.
+ */
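+	/*
+	 * For illustration (hypothetical numbers): if argp->bucket == 3,
+	 * then bucket + 1 == 4 == 1 << __db_log2(4), a power of 2, so
+	 * groupgrow is set and a whole doubling (buckets 4..7) is being
+	 * allocated.  If argp->bucket == 5, bucket + 1 == 6 is not a power
+	 * of 2 and only one new page is being added.
+	 */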
+ groupgrow =
+ (u_int32_t)(1 << __db_log2(argp->bucket + 1)) == argp->bucket + 1;
+ pgno = argp->pgno;
+ if (argp->newalloc)
+ pgno += argp->bucket;
+
+ flags = 0;
+ pagep = NULL;
+ LF_SET(DB_MPOOL_CREATE);
+ ret = __memp_fget(mpf, &pgno, ip, NULL, flags, &pagep);
+
+ if (ret != 0) {
+ if (ret != ENOSPC)
+ goto out;
+ pgno = 0;
+ goto do_meta;
+ }
+
+ /*
+ * When we get here then either we did not grow the file
+ * (groupgrow == 0) or we did grow the file and the allocation
+ * of those new pages succeeded.
+ */
+ did_alloc = groupgrow;
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn);
+
+ if (cmp_p == 0 && DB_REDO(op)) {
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ pagep->lsn = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /*
+ * Otherwise just roll the page back to its
+ * previous state.
+ */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ pagep->lsn = argp->pagelsn;
+ }
+ if (pagep != NULL &&
+ (ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
+ goto out;
+
+do_meta:
+ /* Now we have to update the meta-data page. */
+ hcp = (HASH_CURSOR *)dbc->internal;
+ if ((ret = __ham_get_meta(dbc)) != 0)
+ goto out;
+ cmp_n = LOG_COMPARE(lsnp, &hcp->hdr->dbmeta.lsn);
+ cmp_p = LOG_COMPARE(&hcp->hdr->dbmeta.lsn, &argp->metalsn);
+ CHECK_LSN(env, op, cmp_p, &hcp->hdr->dbmeta.lsn, &argp->metalsn);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Redo the actual updating of bucket counts. */
+ REC_DIRTY(mpf, ip, dbc->priority, &hcp->hdr);
+ ++hcp->hdr->max_bucket;
+ if (groupgrow) {
+ hcp->hdr->low_mask = hcp->hdr->high_mask;
+ hcp->hdr->high_mask =
+ (argp->bucket + 1) | hcp->hdr->low_mask;
+ }
+ hcp->hdr->dbmeta.lsn = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Undo the actual updating of bucket counts. */
+ REC_DIRTY(mpf, ip, dbc->priority, &hcp->hdr);
+ hcp->hdr->max_bucket = argp->bucket;
+ if (groupgrow) {
+ hcp->hdr->high_mask = argp->bucket;
+ hcp->hdr->low_mask = hcp->hdr->high_mask >> 1;
+ }
+ hcp->hdr->dbmeta.lsn = argp->metalsn;
+ }
+
+ /*
+ * Now we need to fix up the spares array. Each entry in the
+ * spares array indicates the beginning page number for the
+ * indicated doubling. We need to fill this in whenever the
+	 * spares array is invalid; since we never reclaim pages, we
+	 * have to allocate the pages to the spares array in both
+ * the redo and undo cases.
+ */
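+	/*
+	 * For illustration (hypothetical numbers): with argp->bucket == 3
+	 * the new doubling is buckets 4..7, whose spares entry is
+	 * __db_log2(3 + 1) + 1 == 3.  If the new bucket's page is
+	 * argp->pgno == 14, the entry stored is (14 - 3) - 1 == 10.
+	 */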
+ if (did_alloc &&
+ hcp->hdr->spares[__db_log2(argp->bucket + 1) + 1] == PGNO_INVALID) {
+ REC_DIRTY(mpf, ip, dbc->priority, &hcp->hdr);
+ hcp->hdr->spares[__db_log2(argp->bucket + 1) + 1] =
+ (argp->pgno - argp->bucket) - 1;
+ }
+
+ /*
+ * Finally, we need to potentially fix up the last_pgno field
+ * in the master meta-data page (which may or may not be the
+ * same as the hash header page).
+ */
+ if (argp->mmpgno != argp->mpgno) {
+ if ((ret = __memp_fget(mpf, &argp->mmpgno, ip, NULL,
+ DB_MPOOL_EDIT, &mmeta)) != 0) {
+ if (DB_UNDO(op) && ret == DB_PAGE_NOTFOUND)
+ ret = 0;
+ goto out;
+ }
+ cmp_n = LOG_COMPARE(lsnp, &mmeta->lsn);
+ cmp_p = LOG_COMPARE(&mmeta->lsn, &argp->mmetalsn);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ REC_DIRTY(mpf, ip, dbc->priority, &mmeta);
+ mmeta->lsn = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ REC_DIRTY(mpf, ip, dbc->priority, &mmeta);
+ mmeta->lsn = argp->mmetalsn;
+ }
+ } else {
+ mmeta = (DBMETA *)hcp->hdr;
+ REC_DIRTY(mpf, ip, dbc->priority, &mmeta);
+ }
+
+ if (mmeta->last_pgno < pgno)
+ mmeta->last_pgno = pgno;
+
+ if (argp->mmpgno != argp->mpgno &&
+ (ret = __memp_fput(mpf, ip, mmeta, dbc->priority)) != 0)
+ goto out;
+ mmeta = NULL;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (mmeta != NULL)
+ (void)__memp_fput(mpf, ip, mmeta, dbc->priority);
+ if (dbc != NULL)
+ (void)__ham_release_meta(dbc);
+
+ REC_CLOSE;
+}
+
+/*
+ * __ham_groupalloc_42_recover --
+ * Recover the batch creation of a set of pages for a new database.
+ *
+ * PUBLIC: int __ham_groupalloc_42_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__ham_groupalloc_42_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __ham_groupalloc_42_args *argp;
+ DB_THREAD_INFO *ip;
+ DBMETA *mmeta;
+ DB_MPOOLFILE *mpf;
+ DB *file_dbp;
+ DBC *dbc;
+ db_pgno_t pgno;
+ int cmp_p, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ mmeta = NULL;
+ REC_PRINT(__ham_groupalloc_42_print);
+ REC_INTRO(__ham_groupalloc_42_read, ip, 1);
+
+ pgno = PGNO_BASE_MD;
+ if ((ret = __memp_fget(mpf, &pgno, ip, NULL, 0, &mmeta)) != 0) {
+ if (DB_REDO(op)) {
+ ret = __db_pgerr(file_dbp, pgno, ret);
+ goto out;
+ } else
+ goto done;
+ }
+
+ cmp_p = LOG_COMPARE(&LSN(mmeta), &argp->meta_lsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(mmeta), &argp->meta_lsn);
+
+ /*
+ * Basically, we used mpool to allocate a chunk of pages.
+ * We need to either add those to a free list (in the undo
+ * case) or initialize them (in the redo case).
+ *
+ * If we are redoing and this is a hash subdatabase, it's possible
+ * that the pages were never allocated, so we'd better check for
+ * that and handle it here.
+ */
+ pgno = argp->start_pgno + argp->num - 1;
+ if (DB_REDO(op)) {
+ if ((ret = __ham_alloc_pages_42(dbc, argp, lsnp)) != 0)
+ goto out;
+ if (cmp_p == 0) {
+ REC_DIRTY(mpf, ip, dbc->priority, &mmeta);
+ LSN(mmeta) = *lsnp;
+ }
+ } else if (DB_UNDO(op)) {
+ /*
+ * We cannot roll back 4.2 style allocations.
+ */
+ __db_errx(env, DB_STR("1123",
+"Cannot replicate prepared transactions from master running release 4.2."));
+ ret = __env_panic(env, EINVAL);
+ goto out;
+ }
+
+ /*
+ * In both REDO and UNDO, we have grown the file and need to make
+	 * sure that last_pgno is correct.  With HAVE_FTRUNCATE, pgno
+	 * will only be valid on REDO.
+ */
+ if (pgno > mmeta->last_pgno) {
+ REC_DIRTY(mpf, ip, dbc->priority, &mmeta);
+ mmeta->last_pgno = pgno;
+ }
+
+done: if (ret == 0)
+ *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (mmeta != NULL)
+ (void)__memp_fput(mpf, ip, mmeta, dbc->priority);
+
+ REC_CLOSE;
+}
+
+/*
+ * __ham_alloc_pages_42 --
+ *
+ * Called during redo of a file create. We create new pages in the file
+ * using the MPOOL_NEW_GROUP flag. We then log the meta-data page with a
+ * __crdel_metasub message. If we manage to crash without the newly written
+ * pages getting to disk (I'm not sure this can happen anywhere except our
+ * test suite?!), then we need to go through and re-create the final pages.
+ * Hash normally has holes in its files and handles them appropriately.
+ */
+static int
+__ham_alloc_pages_42(dbc, argp, lsnp)
+ DBC *dbc;
+ __ham_groupalloc_42_args *argp;
+ DB_LSN *lsnp;
+{
+ DB_MPOOLFILE *mpf;
+ DB_THREAD_INFO *ip;
+ PAGE *pagep;
+ db_pgno_t pgno;
+ int ret;
+
+ mpf = dbc->dbp->mpf;
+ ip = dbc->thread_info;
+
+ /* Read the last page of the allocation. */
+ pgno = argp->start_pgno + argp->num - 1;
+
+ /* If the page exists, and it has been initialized, then we're done. */
+ if ((ret = __memp_fget(mpf,
+ &pgno, ip, NULL, 0, &pagep)) == 0) {
+ if (NUM_ENT(pagep) == 0 && IS_ZERO_LSN(pagep->lsn))
+ goto reinit_page;
+ if ((ret = __memp_fput(mpf,
+ ip, pagep, dbc->priority)) != 0)
+ return (ret);
+ return (0);
+ }
+
+ /* Had to create the page. */
+ if ((ret = __memp_fget(mpf, &pgno, ip, NULL,
+ DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &pagep)) != 0)
+ return (__db_pgerr(dbc->dbp, pgno, ret));
+
+reinit_page:
+ /* Initialize the newly allocated page. */
+ P_INIT(pagep,
+ dbc->dbp->pgsize, pgno, PGNO_INVALID, PGNO_INVALID, 0, P_HASH);
+ pagep->lsn = *lsnp;
+
+ if ((ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
+ return (ret);
+
+ return (0);
+}
diff --git a/src/hash/hash_reclaim.c b/src/hash/hash_reclaim.c
new file mode 100644
index 00000000..ce3f6d9e
--- /dev/null
+++ b/src/hash/hash_reclaim.c
@@ -0,0 +1,98 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/hash.h"
+
+/*
+ * __ham_reclaim --
+ * Reclaim the pages from a subdatabase and return them to the
+ * parent free list. For now, we link each freed page on the list
+ * separately. If people really store hash databases in subdatabases
+ * and do a lot of creates and deletes, this is going to be a problem,
+ * because hash needs chunks of contiguous storage. We may eventually
+ * need to go to a model where we maintain the free list with chunks of
+ * contiguous pages as well.
+ *
+ * PUBLIC: int __ham_reclaim __P((DB *,
+ * PUBLIC: DB_THREAD_INFO *, DB_TXN *txn, u_int32_t));
+ */
+int
+__ham_reclaim(dbp, ip, txn, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ u_int32_t flags;
+{
+ DBC *dbc;
+ HASH_CURSOR *hcp;
+ int ret;
+
+ /* Open up a cursor that we'll use for traversing. */
+ if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0)
+ return (ret);
+ hcp = (HASH_CURSOR *)dbc->internal;
+
+ if ((ret = __ham_get_meta(dbc)) != 0)
+ goto err;
+
+ /* Write lock the metapage for deallocations. */
+ if ((ret = __ham_dirty_meta(dbc, 0)) != 0)
+ goto err;
+
+ /* Avoid locking every page, we have the handle locked exclusive. */
+ F_SET(dbc, DBC_DONTLOCK);
+
+ if ((ret = __ham_traverse(dbc, DB_LOCK_WRITE,
+ __db_reclaim_callback, &flags, 1)) != 0)
+ goto err;
+ if ((ret = __dbc_close(dbc)) != 0)
+ goto err;
+ if ((ret = __ham_release_meta(dbc)) != 0)
+ goto err;
+ return (0);
+
+err: if (hcp->hdr != NULL)
+ (void)__ham_release_meta(dbc);
+ (void)__dbc_close(dbc);
+ return (ret);
+}
+
+/*
+ * __ham_truncate --
+ * Reclaim the pages from a subdatabase and return them to the
+ * parent free list.
+ *
+ * PUBLIC: int __ham_truncate __P((DBC *, u_int32_t *));
+ */
+int
+__ham_truncate(dbc, countp)
+ DBC *dbc;
+ u_int32_t *countp;
+{
+ u_int32_t count;
+ int ret, t_ret;
+
+ if ((ret = __ham_get_meta(dbc)) != 0)
+ return (ret);
+
+ count = 0;
+
+ ret = __ham_traverse(dbc,
+ DB_LOCK_WRITE, __db_truncate_callback, &count, 1);
+
+ if ((t_ret = __ham_release_meta(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (countp != NULL)
+ *countp = count;
+ return (ret);
+}
diff --git a/src/hash/hash_stat.c b/src/hash/hash_stat.c
new file mode 100644
index 00000000..683ce5a6
--- /dev/null
+++ b/src/hash/hash_stat.c
@@ -0,0 +1,518 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/mp.h"
+
+#ifdef HAVE_STATISTICS
+static int __ham_stat_callback __P((DBC *, PAGE *, void *, int *));
+
+/*
+ * __ham_stat --
+ * Gather/print the hash statistics
+ *
+ * PUBLIC: int __ham_stat __P((DBC *, void *, u_int32_t));
+ */
+int
+__ham_stat(dbc, spp, flags)
+ DBC *dbc;
+ void *spp;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_HASH_STAT *sp;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ HASH_CURSOR *hcp;
+ PAGE *h;
+ db_pgno_t pgno;
+ int ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ mpf = dbp->mpf;
+ sp = NULL;
+
+ hcp = (HASH_CURSOR *)dbc->internal;
+
+ if ((ret = __ham_get_meta(dbc)) != 0)
+ goto err;
+
+ /* Allocate and clear the structure. */
+ if ((ret = __os_umalloc(env, sizeof(*sp), &sp)) != 0)
+ goto err;
+ memset(sp, 0, sizeof(*sp));
+ /* Copy the fields that we have. */
+ sp->hash_nkeys = hcp->hdr->dbmeta.key_count;
+ sp->hash_ndata = hcp->hdr->dbmeta.record_count;
+ /*
+ * Don't take the page number from the meta-data page -- that value is
+	 * only maintained in the primary database, and we may have been
+	 * called on a subdatabase.
+ */
+ if ((ret = __memp_get_last_pgno(dbp->mpf, &pgno)) != 0)
+ goto err;
+ sp->hash_pagecnt = pgno + 1;
+ sp->hash_pagesize = dbp->pgsize;
+ sp->hash_buckets = hcp->hdr->max_bucket + 1;
+ sp->hash_magic = hcp->hdr->dbmeta.magic;
+ sp->hash_version = hcp->hdr->dbmeta.version;
+ sp->hash_metaflags = hcp->hdr->dbmeta.flags;
+ sp->hash_ffactor = hcp->hdr->ffactor;
+
+ if (flags == DB_FAST_STAT)
+ goto done;
+
+ /* Walk the free list, counting pages. */
+ for (sp->hash_free = 0, pgno = hcp->hdr->dbmeta.free;
+ pgno != PGNO_INVALID;) {
+ ++sp->hash_free;
+
+ if ((ret = __memp_fget(mpf,
+ &pgno, dbc->thread_info, dbc->txn, 0, &h)) != 0)
+ goto err;
+
+ pgno = h->next_pgno;
+ (void)__memp_fput(mpf, dbc->thread_info, h, dbc->priority);
+ }
+
+ /* Now traverse the rest of the table. */
+ sp->hash_nkeys = 0;
+ sp->hash_ndata = 0;
+ if ((ret = __ham_traverse(dbc,
+ DB_LOCK_READ, __ham_stat_callback, sp, 0)) != 0)
+ goto err;
+
+ if (!F_ISSET(dbp, DB_AM_RDONLY)) {
+ /*
+ * A transaction is not required for DB->stat, so this update
+ * can't safely make a copy of the meta page. We have to
+ * update in place.
+ */
+ if ((ret = __ham_dirty_meta(dbc,
+ (dbc->txn == NULL) ? DB_MPOOL_EDIT : 0)) != 0)
+ goto err;
+ hcp->hdr->dbmeta.key_count = sp->hash_nkeys;
+ hcp->hdr->dbmeta.record_count = sp->hash_ndata;
+ }
+
+done: if ((ret = __ham_release_meta(dbc)) != 0)
+ goto err;
+
+ *(DB_HASH_STAT **)spp = sp;
+ return (0);
+
+err: if (sp != NULL)
+ __os_ufree(env, sp);
+
+ if (hcp->hdr != NULL)
+ (void)__ham_release_meta(dbc);
+
+ return (ret);
+}
+
+/*
+ * __ham_stat_print --
+ * Display hash statistics.
+ *
+ * PUBLIC: int __ham_stat_print __P((DBC *, u_int32_t));
+ */
+int
+__ham_stat_print(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ static const FN fn[] = {
+ { DB_HASH_DUP, "duplicates" },
+ { DB_HASH_SUBDB, "multiple-databases" },
+ { DB_HASH_DUPSORT, "sorted duplicates" },
+ { 0, NULL }
+ };
+ DB *dbp;
+ ENV *env;
+ DB_HASH_STAT *sp;
+ int lorder, ret;
+ const char *s;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ if ((ret = __ham_stat(dbc, &sp, LF_ISSET(DB_FAST_STAT))) != 0)
+ return (ret);
+
+ if (LF_ISSET(DB_STAT_ALL)) {
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "Default Hash database information:");
+ }
+ __db_msg(env, "%lx\tHash magic number", (u_long)sp->hash_magic);
+ __db_msg(env,
+ "%lu\tHash version number", (u_long)sp->hash_version);
+ (void)__db_get_lorder(dbp, &lorder);
+ switch (lorder) {
+ case 1234:
+ s = "Little-endian";
+ break;
+ case 4321:
+ s = "Big-endian";
+ break;
+ default:
+ s = "Unrecognized byte order";
+ break;
+ }
+ __db_msg(env, "%s\tByte order", s);
+ __db_prflags(env, NULL, sp->hash_metaflags, fn, NULL, "\tFlags");
+ __db_dl(env,
+ "Number of pages in the database", (u_long)sp->hash_pagecnt);
+ __db_dl(env,
+ "Underlying database page size", (u_long)sp->hash_pagesize);
+ __db_dl(env, "Specified fill factor", (u_long)sp->hash_ffactor);
+ __db_dl(env,
+ "Number of keys in the database", (u_long)sp->hash_nkeys);
+ __db_dl(env,
+ "Number of data items in the database", (u_long)sp->hash_ndata);
+
+ __db_dl(env, "Number of hash buckets", (u_long)sp->hash_buckets);
+ __db_dl_pct(env, "Number of bytes free on bucket pages",
+ (u_long)sp->hash_bfree, DB_PCT_PG(
+ sp->hash_bfree, sp->hash_buckets, sp->hash_pagesize), "ff");
+
+ __db_dl(env,
+ "Number of overflow pages", (u_long)sp->hash_bigpages);
+ __db_dl_pct(env, "Number of bytes free in overflow pages",
+ (u_long)sp->hash_big_bfree, DB_PCT_PG(
+ sp->hash_big_bfree, sp->hash_bigpages, sp->hash_pagesize), "ff");
+
+ __db_dl(env,
+ "Number of bucket overflow pages", (u_long)sp->hash_overflows);
+ __db_dl_pct(env,
+ "Number of bytes free in bucket overflow pages",
+ (u_long)sp->hash_ovfl_free, DB_PCT_PG(
+ sp->hash_ovfl_free, sp->hash_overflows, sp->hash_pagesize), "ff");
+
+ __db_dl(env, "Number of duplicate pages", (u_long)sp->hash_dup);
+ __db_dl_pct(env, "Number of bytes free in duplicate pages",
+ (u_long)sp->hash_dup_free, DB_PCT_PG(
+ sp->hash_dup_free, sp->hash_dup, sp->hash_pagesize), "ff");
+
+ __db_dl(env,
+ "Number of pages on the free list", (u_long)sp->hash_free);
+
+ __os_ufree(env, sp);
+
+ return (0);
+}
+
+static int
+__ham_stat_callback(dbc, pagep, cookie, putp)
+ DBC *dbc;
+ PAGE *pagep;
+ void *cookie;
+ int *putp;
+{
+ DB *dbp;
+ DB_BTREE_STAT bstat;
+ DB_HASH_STAT *sp;
+ db_indx_t indx, len, off, tlen, top;
+ u_int8_t *hk;
+ int ret;
+
+ *putp = 0;
+ sp = cookie;
+ dbp = dbc->dbp;
+
+ switch (pagep->type) {
+ case P_INVALID:
+ /*
+ * Hash pages may be wholly zeroed; this is not a bug.
+ * Obviously such pages have no data, so we can just proceed.
+ */
+ break;
+ case P_HASH_UNSORTED:
+ case P_HASH:
+ /*
+ * We count the buckets and the overflow pages
+ * separately and tally their bytes separately
+ * as well. We need to figure out if this page
+ * is a bucket.
+ */
+ if (PREV_PGNO(pagep) == PGNO_INVALID)
+ sp->hash_bfree += P_FREESPACE(dbp, pagep);
+ else {
+ sp->hash_overflows++;
+ sp->hash_ovfl_free += P_FREESPACE(dbp, pagep);
+ }
+ top = NUM_ENT(pagep);
+ /* Correct for on-page duplicates and deleted items. */
+ for (indx = 0; indx < top; indx += P_INDX) {
+ switch (*H_PAIRDATA(dbp, pagep, indx)) {
+ case H_OFFDUP:
+ break;
+ case H_OFFPAGE:
+ case H_KEYDATA:
+ sp->hash_ndata++;
+ break;
+ case H_DUPLICATE:
+ tlen = LEN_HDATA(dbp, pagep, 0, indx);
+ hk = H_PAIRDATA(dbp, pagep, indx);
+ for (off = 0; off < tlen;
+ off += len + 2 * sizeof(db_indx_t)) {
+ sp->hash_ndata++;
+ memcpy(&len,
+ HKEYDATA_DATA(hk)
+ + off, sizeof(db_indx_t));
+ }
+ break;
+ default:
+ return (__db_pgfmt(dbp->env, PGNO(pagep)));
+ }
+ }
+ sp->hash_nkeys += H_NUMPAIRS(pagep);
+ break;
+ case P_IBTREE:
+ case P_IRECNO:
+ case P_LBTREE:
+ case P_LRECNO:
+ case P_LDUP:
+ /*
+ * These are all btree pages; get a correct
+ * cookie and call them. Then add appropriate
+ * fields into our stat structure.
+ */
+ memset(&bstat, 0, sizeof(bstat));
+ if ((ret = __bam_stat_callback(dbc, pagep, &bstat, putp)) != 0)
+ return (ret);
+ sp->hash_dup++;
+ sp->hash_dup_free += bstat.bt_leaf_pgfree +
+ bstat.bt_dup_pgfree + bstat.bt_int_pgfree;
+ sp->hash_ndata += bstat.bt_ndata;
+ break;
+ case P_OVERFLOW:
+ sp->hash_bigpages++;
+ sp->hash_big_bfree += P_OVFLSPACE(dbp, dbp->pgsize, pagep);
+ break;
+ default:
+ return (__db_pgfmt(dbp->env, PGNO(pagep)));
+ }
+
+ return (0);
+}
+
+/*
+ * __ham_print_cursor --
+ * Display the current cursor.
+ *
+ * PUBLIC: void __ham_print_cursor __P((DBC *));
+ */
+void
+__ham_print_cursor(dbc)
+ DBC *dbc;
+{
+ static const FN fn[] = {
+ { H_CONTINUE, "H_CONTINUE" },
+ { H_DELETED, "H_DELETED" },
+ { H_DUPONLY, "H_DUPONLY" },
+ { H_EXPAND, "H_EXPAND" },
+ { H_ISDUP, "H_ISDUP" },
+ { H_NEXT_NODUP, "H_NEXT_NODUP" },
+ { H_NOMORE, "H_NOMORE" },
+ { H_OK, "H_OK" },
+ { 0, NULL }
+ };
+ ENV *env;
+ HASH_CURSOR *cp;
+
+ env = dbc->env;
+ cp = (HASH_CURSOR *)dbc->internal;
+
+ STAT_ULONG("Bucket traversing", cp->bucket);
+ STAT_ULONG("Bucket locked", cp->lbucket);
+ STAT_ULONG("Duplicate set offset", cp->dup_off);
+ STAT_ULONG("Current duplicate length", cp->dup_len);
+ STAT_ULONG("Total duplicate set length", cp->dup_tlen);
+ STAT_ULONG("Bytes needed for add", cp->seek_size);
+ STAT_ULONG("Page on which we can insert", cp->seek_found_page);
+ STAT_ULONG("Order", cp->order);
+ __db_prflags(env, NULL, cp->flags, fn, NULL, "\tInternal Flags");
+}
+
+#else /* !HAVE_STATISTICS */
+
+int
+__ham_stat(dbc, spp, flags)
+ DBC *dbc;
+ void *spp;
+ u_int32_t flags;
+{
+ COMPQUIET(spp, NULL);
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(dbc->env));
+}
+#endif
+
+/*
+ * __ham_traverse
+ * Traverse an entire hash table. We use the callback so that we
+ * can use this both for stat collection and for deallocation.
+ *
+ * PUBLIC: int __ham_traverse __P((DBC *, db_lockmode_t,
+ * PUBLIC: int (*)(DBC *, PAGE *, void *, int *), void *, int));
+ */
+int
+__ham_traverse(dbc, mode, callback, cookie, look_past_max)
+ DBC *dbc;
+ db_lockmode_t mode;
+ int (*callback) __P((DBC *, PAGE *, void *, int *));
+ void *cookie;
+ int look_past_max;
+{
+ DB *dbp;
+ DBC *opd;
+ DB_MPOOLFILE *mpf;
+ HASH_CURSOR *hcp;
+ HKEYDATA *hk;
+ db_pgno_t pgno, opgno;
+ int did_put, i, ret, t_ret;
+ u_int32_t bucket, spares_entry;
+
+ dbp = dbc->dbp;
+ opd = NULL;
+ mpf = dbp->mpf;
+ hcp = (HASH_CURSOR *)dbc->internal;
+ ret = 0;
+
+ /*
+ * In a perfect world, we could simply read each page in the file
+ * and look at its page type to tally the information necessary.
+	 * Unfortunately, the bucket locking that hash tables do to make
+	 * locking easy makes this a pain in the butt.  We have to traverse
+	 * duplicate, overflow and big pages from the bucket so that we
+	 * don't access anything that isn't properly locked.
+	 */
+ for (bucket = 0;; bucket++) {
+ /*
+ * We put the loop exit condition check here, because
+ * it made for a really vile extended ?: that made SCO's
+ * compiler drop core.
+ *
+ * If look_past_max is not set, we can stop at max_bucket;
+ * if it is set, we need to include pages that are part of
+ * the current doubling but beyond the highest bucket we've
+ * split into, as well as pages from a "future" doubling
+ * that may have been created within an aborted
+ * transaction. To do this, keep looping (and incrementing
+ * bucket) until the corresponding spares array entries
+ * cease to be defined.
+ */
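+		/*
+		 * For illustration: __db_log2(bucket + 1) maps buckets
+		 * 0, 1, 2, 3, 4, ... to spares entries 0, 1, 2, 2, 3, ...,
+		 * so the loop keeps going while the doubling that would
+		 * contain the bucket still has a spares entry allocated.
+		 */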
+ if (look_past_max) {
+ spares_entry = __db_log2(bucket + 1);
+ if (spares_entry >= NCACHED ||
+ hcp->hdr->spares[spares_entry] == 0)
+ break;
+ } else {
+ if (bucket > hcp->hdr->max_bucket)
+ break;
+ }
+
+ hcp->bucket = bucket;
+ hcp->pgno = pgno = BUCKET_TO_PAGE(hcp, bucket);
+ for (ret = __ham_get_cpage(dbc, mode); ret == 0;
+ ret = __ham_next_cpage(dbc, pgno)) {
+
+ /*
+ * If we are cleaning up pages past the max_bucket,
+ * then they may be on the free list and have their
+ * next pointers set, but they should be ignored. In
+ * fact, we really ought to just skip anybody who is
+ * not a valid page.
+ */
+ if (TYPE(hcp->page) == P_INVALID)
+ break;
+ pgno = NEXT_PGNO(hcp->page);
+
+ /*
+ * Go through each item on the page checking for
+ * duplicates (in which case we have to count the
+ * duplicate pages) or big key/data items (in which
+ * case we have to count those pages).
+ */
+ for (i = 0; i < NUM_ENT(hcp->page); i++) {
+ hk = (HKEYDATA *)P_ENTRY(dbp, hcp->page, i);
+ switch (HPAGE_PTYPE(hk)) {
+ case H_OFFDUP:
+ memcpy(&opgno, HOFFDUP_PGNO(hk),
+ sizeof(db_pgno_t));
+ if ((ret = __dbc_newopd(dbc,
+ opgno, NULL, &opd)) != 0)
+ return (ret);
+ if ((ret = __bam_traverse(opd,
+ DB_LOCK_READ, opgno,
+ callback, cookie))
+ != 0)
+ goto err;
+ if ((ret = __dbc_close(opd)) != 0)
+ return (ret);
+ opd = NULL;
+ break;
+ case H_OFFPAGE:
+ /*
+ * We are about to get a big page
+ * which will use the same spot that
+ * the current page uses, so we need
+ * to restore the current page before
+ * looking at it again.
+ */
+ memcpy(&opgno, HOFFPAGE_PGNO(hk),
+ sizeof(db_pgno_t));
+ if ((ret = __db_traverse_big(dbc,
+ opgno, callback, cookie)) != 0)
+ goto err;
+ break;
+ case H_KEYDATA:
+ case H_DUPLICATE:
+ break;
+ default:
+ ret = __db_unknown_path(
+ dbp->env, "__ham_traverse");
+ goto err;
+ }
+ }
+
+ /* Call the callback on main pages. */
+ if ((ret = callback(dbc,
+ hcp->page, cookie, &did_put)) != 0)
+ goto err;
+
+ if (did_put)
+ hcp->page = NULL;
+ if (pgno == PGNO_INVALID)
+ break;
+ }
+ if (ret != 0)
+ goto err;
+
+ if (hcp->page != NULL) {
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, hcp->page, dbc->priority)) != 0)
+ return (ret);
+ hcp->page = NULL;
+ }
+
+ }
+err: if (opd != NULL &&
+ (t_ret = __dbc_close(opd)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
diff --git a/src/hash/hash_stub.c b/src/hash/hash_stub.c
new file mode 100644
index 00000000..57337ea9
--- /dev/null
+++ b/src/hash/hash_stub.c
@@ -0,0 +1,470 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef HAVE_HASH
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/hash.h"
+
+/*
+ * If the library wasn't compiled with the Hash access method, various
+ * routines aren't available. Stub them here, returning an appropriate
+ * error.
+ */
+
+/*
+ * __db_no_hash_am --
+ * Error when a Berkeley DB build doesn't include the access method.
+ *
+ * PUBLIC: int __db_no_hash_am __P((ENV *));
+ */
+int
+__db_no_hash_am(env)
+ ENV *env;
+{
+ __db_errx(env, DB_STR("1133",
+ "library build did not include support for the Hash access method"));
+ return (DB_OPNOTSUP);
+}
+
+int
+__ham_30_hashmeta(dbp, real_name, obuf)
+ DB *dbp;
+ char *real_name;
+ u_int8_t *obuf;
+{
+ COMPQUIET(real_name, NULL);
+ COMPQUIET(obuf, NULL);
+ return (__db_no_hash_am(dbp->env));
+}
+
+int
+__ham_30_sizefix(dbp, fhp, realname, metabuf)
+ DB *dbp;
+ DB_FH *fhp;
+ char *realname;
+ u_int8_t *metabuf;
+{
+ COMPQUIET(fhp, NULL);
+ COMPQUIET(realname, NULL);
+ COMPQUIET(metabuf, NULL);
+ return (__db_no_hash_am(dbp->env));
+}
+
+int
+__ham_31_hash(dbp, real_name, flags, fhp, h, dirtyp)
+ DB *dbp;
+ char *real_name;
+ u_int32_t flags;
+ DB_FH *fhp;
+ PAGE *h;
+ int *dirtyp;
+{
+ COMPQUIET(real_name, NULL);
+ COMPQUIET(flags, 0);
+ COMPQUIET(fhp, NULL);
+ COMPQUIET(h, NULL);
+ COMPQUIET(dirtyp, NULL);
+ return (__db_no_hash_am(dbp->env));
+}
+
+int
+__ham_31_hashmeta(dbp, real_name, flags, fhp, h, dirtyp)
+ DB *dbp;
+ char *real_name;
+ u_int32_t flags;
+ DB_FH *fhp;
+ PAGE *h;
+ int *dirtyp;
+{
+ COMPQUIET(real_name, NULL);
+ COMPQUIET(flags, 0);
+ COMPQUIET(fhp, NULL);
+ COMPQUIET(h, NULL);
+ COMPQUIET(dirtyp, NULL);
+ return (__db_no_hash_am(dbp->env));
+}
+
+int
+__ham_46_hash(dbp, real_name, flags, fhp, h, dirtyp)
+ DB *dbp;
+ char *real_name;
+ u_int32_t flags;
+ DB_FH *fhp;
+ PAGE *h;
+ int *dirtyp;
+{
+ COMPQUIET(real_name, NULL);
+ COMPQUIET(flags, 0);
+ COMPQUIET(fhp, NULL);
+ COMPQUIET(h, NULL);
+ COMPQUIET(dirtyp, NULL);
+ return (__db_no_hash_am(dbp->env));
+}
+
+int
+__ham_46_hashmeta(dbp, real_name, flags, fhp, h, dirtyp)
+ DB *dbp;
+ char *real_name;
+ u_int32_t flags;
+ DB_FH *fhp;
+ PAGE *h;
+ int *dirtyp;
+{
+ COMPQUIET(real_name, NULL);
+ COMPQUIET(flags, 0);
+ COMPQUIET(fhp, NULL);
+ COMPQUIET(h, NULL);
+ COMPQUIET(dirtyp, NULL);
+ return (__db_no_hash_am(dbp->env));
+}
+
+int
+__hamc_cmp(dbc, other_dbc, result)
+ DBC *dbc, *other_dbc;
+ int *result;
+{
+ COMPQUIET(other_dbc, NULL);
+ COMPQUIET(result, NULL);
+ return (__db_no_hash_am(dbc->env));
+}
+
+int
+__hamc_count(dbc, recnop)
+ DBC *dbc;
+ db_recno_t *recnop;
+{
+ COMPQUIET(recnop, NULL);
+ return (__db_no_hash_am(dbc->env));
+}
+
+int
+__hamc_dup(orig_dbc, new_dbc)
+ DBC *orig_dbc, *new_dbc;
+{
+ COMPQUIET(new_dbc, NULL);
+ return (__db_no_hash_am(orig_dbc->env));
+}
+
+int
+__hamc_init(dbc)
+ DBC *dbc;
+{
+ return (__db_no_hash_am(dbc->env));
+}
+
+int
+__ham_db_close(dbp)
+ DB *dbp;
+{
+ COMPQUIET(dbp, NULL);
+ return (0);
+}
+
+int
+__ham_db_create(dbp)
+ DB *dbp;
+{
+ COMPQUIET(dbp, NULL);
+ return (0);
+}
+
+int
+__ham_init_print(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(dtabp, NULL);
+ return (0);
+}
+
+int
+__ham_init_recover(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(dtabp, NULL);
+ return (0);
+}
+
+int
+__ham_meta2pgset(dbp, vdp, hmeta, flags, pgset)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ HMETA *hmeta;
+ u_int32_t flags;
+ DB *pgset;
+{
+ COMPQUIET(vdp, NULL);
+ COMPQUIET(hmeta, NULL);
+ COMPQUIET(flags, 0);
+ COMPQUIET(pgset, NULL);
+ return (__db_no_hash_am(dbp->env));
+}
+
+int
+__ham_metachk(dbp, name, hashm)
+ DB *dbp;
+ const char *name;
+ HMETA *hashm;
+{
+ COMPQUIET(name, NULL);
+ COMPQUIET(hashm, NULL);
+ return (__db_no_hash_am(dbp->env));
+}
+
+int
+__ham_metagroup_42_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ COMPQUIET(dbtp, NULL);
+ COMPQUIET(lsnp, NULL);
+ COMPQUIET(op, (db_recops)0);
+ COMPQUIET(info, NULL);
+ return (__db_no_hash_am(env));
+}
+
+int
+__ham_mswap(env, pg)
+ ENV *env;
+ void *pg;
+{
+ COMPQUIET(pg, NULL);
+ return (__db_no_hash_am(env));
+}
+
+int
+__ham_groupalloc_42_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ COMPQUIET(dbtp, NULL);
+ COMPQUIET(lsnp, NULL);
+ COMPQUIET(op, (db_recops)0);
+ COMPQUIET(info, NULL);
+ return (__db_no_hash_am(env));
+}
+
+int
+__ham_new_file(dbp, ip, txn, fhp, name)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ DB_FH *fhp;
+ const char *name;
+{
+ COMPQUIET(ip, NULL);
+ COMPQUIET(txn, NULL);
+ COMPQUIET(fhp, NULL);
+ COMPQUIET(name, NULL);
+ return (__db_no_hash_am(dbp->env));
+}
+
+int
+__ham_new_subdb(mdbp, dbp, ip, txn)
+ DB *mdbp, *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+{
+ COMPQUIET(dbp, NULL);
+ COMPQUIET(txn, NULL);
+ COMPQUIET(ip, NULL);
+ return (__db_no_hash_am(mdbp->env));
+}
+
+int
+__ham_open(dbp, ip, txn, name, base_pgno, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name;
+ db_pgno_t base_pgno;
+ u_int32_t flags;
+{
+ COMPQUIET(ip, NULL);
+ COMPQUIET(txn, NULL);
+ COMPQUIET(name, NULL);
+ COMPQUIET(base_pgno, 0);
+ COMPQUIET(flags, 0);
+ return (__db_no_hash_am(dbp->env));
+}
+
+int
+__ham_pgin(dbp, pg, pp, cookie)
+ DB *dbp;
+ db_pgno_t pg;
+ void *pp;
+ DBT *cookie;
+{
+ COMPQUIET(pg, 0);
+ COMPQUIET(pp, NULL);
+ COMPQUIET(cookie, NULL);
+ return (__db_no_hash_am(dbp->env));
+}
+
+int
+__ham_pgout(dbp, pg, pp, cookie)
+ DB *dbp;
+ db_pgno_t pg;
+ void *pp;
+ DBT *cookie;
+{
+ COMPQUIET(pg, 0);
+ COMPQUIET(pp, NULL);
+ COMPQUIET(cookie, NULL);
+ return (__db_no_hash_am(dbp->env));
+}
+
+void
+__ham_print_cursor(dbc)
+ DBC *dbc;
+{
+ (void)__db_no_hash_am(dbc->env);
+}
+
+int
+__ham_quick_delete(dbc)
+ DBC *dbc;
+{
+ return (__db_no_hash_am(dbc->env));
+}
+
+int
+__ham_reclaim(dbp, ip, txn, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ u_int32_t flags;
+{
+ COMPQUIET(txn, NULL);
+ COMPQUIET(ip, NULL);
+ COMPQUIET(flags, 0);
+ return (__db_no_hash_am(dbp->env));
+}
+
+int
+__ham_salvage(dbp, vdp, pgno, h, handle, callback, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ db_pgno_t pgno;
+ PAGE *h;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ u_int32_t flags;
+{
+ COMPQUIET(vdp, NULL);
+ COMPQUIET(pgno, 0);
+ COMPQUIET(h, NULL);
+ COMPQUIET(handle, NULL);
+ COMPQUIET(callback, NULL);
+ COMPQUIET(flags, 0);
+ return (__db_no_hash_am(dbp->env));
+}
+
+int
+__ham_stat(dbc, spp, flags)
+ DBC *dbc;
+ void *spp;
+ u_int32_t flags;
+{
+ COMPQUIET(spp, NULL);
+ COMPQUIET(flags, 0);
+ return (__db_no_hash_am(dbc->env));
+}
+
+int
+__ham_stat_print(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+ return (__db_no_hash_am(dbc->env));
+}
+
+int
+__ham_truncate(dbc, countp)
+ DBC *dbc;
+ u_int32_t *countp;
+{
+ COMPQUIET(countp, NULL);
+ return (__db_no_hash_am(dbc->env));
+}
+
+int
+__ham_vrfy(dbp, vdp, h, pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ PAGE *h;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ COMPQUIET(vdp, NULL);
+ COMPQUIET(h, NULL);
+ COMPQUIET(pgno, 0);
+ COMPQUIET(flags, 0);
+ return (__db_no_hash_am(dbp->env));
+}
+
+int
+__ham_vrfy_hashing(dbc, nentries, m, thisbucket, pgno, flags, hfunc)
+ DBC *dbc;
+ u_int32_t nentries;
+ HMETA *m;
+ u_int32_t thisbucket;
+ db_pgno_t pgno;
+ u_int32_t flags;
+ u_int32_t (*hfunc) __P((DB *, const void *, u_int32_t));
+{
+ COMPQUIET(nentries, 0);
+ COMPQUIET(m, NULL);
+ COMPQUIET(thisbucket, 0);
+ COMPQUIET(pgno, 0);
+ COMPQUIET(flags, 0);
+ COMPQUIET(hfunc, NULL);
+ return (__db_no_hash_am(dbc->dbp->env));
+}
+
+int
+__ham_vrfy_meta(dbp, vdp, m, pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ HMETA *m;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ COMPQUIET(vdp, NULL);
+ COMPQUIET(m, NULL);
+ COMPQUIET(pgno, 0);
+ COMPQUIET(flags, 0);
+ return (__db_no_hash_am(dbp->env));
+}
+
+int
+__ham_vrfy_structure(dbp, vdp, meta_pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ db_pgno_t meta_pgno;
+ u_int32_t flags;
+{
+ COMPQUIET(vdp, NULL);
+ COMPQUIET(meta_pgno, 0);
+ COMPQUIET(flags, 0);
+ return (__db_no_hash_am(dbp->env));
+}
+#endif /* !HAVE_HASH */
diff --git a/src/hash/hash_upgrade.c b/src/hash/hash_upgrade.c
new file mode 100644
index 00000000..f66a7a58
--- /dev/null
+++ b/src/hash/hash_upgrade.c
@@ -0,0 +1,323 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/hash.h"
+#include "dbinc/db_upgrade.h"
+
+/*
+ * __ham_30_hashmeta --
+ * Upgrade the database from version 4/5 to version 6.
+ *
+ * PUBLIC: int __ham_30_hashmeta __P((DB *, char *, u_int8_t *));
+ */
+int
+__ham_30_hashmeta(dbp, real_name, obuf)
+ DB *dbp;
+ char *real_name;
+ u_int8_t *obuf;
+{
+ ENV *env;
+ HASHHDR *oldmeta;
+ HMETA30 newmeta;
+ u_int32_t *o_spares, *n_spares;
+ u_int32_t fillf, i, maxb, max_entry, nelem;
+ int ret;
+
+ env = dbp->env;
+ memset(&newmeta, 0, sizeof(newmeta));
+
+ oldmeta = (HASHHDR *)obuf;
+
+ /*
+	 * The first 32 bytes are similar.  The only changes are the new
+	 * version number, the removal of ovfl_point and the addition of
+	 * the page type field.
+ */
+
+ newmeta.dbmeta.lsn = oldmeta->lsn;
+ newmeta.dbmeta.pgno = oldmeta->pgno;
+ newmeta.dbmeta.magic = oldmeta->magic;
+ newmeta.dbmeta.version = 6;
+ newmeta.dbmeta.pagesize = oldmeta->pagesize;
+ newmeta.dbmeta.type = P_HASHMETA;
+
+ /* Move flags */
+ newmeta.dbmeta.flags = oldmeta->flags;
+
+ /* Copy the free list, which has changed its name but works the same. */
+ newmeta.dbmeta.free = oldmeta->last_freed;
+
+	/* Copy: max_bucket, high_mask, low_mask, ffactor, nelem, h_charkey */
+ newmeta.max_bucket = oldmeta->max_bucket;
+ newmeta.high_mask = oldmeta->high_mask;
+ newmeta.low_mask = oldmeta->low_mask;
+ newmeta.ffactor = oldmeta->ffactor;
+ newmeta.nelem = oldmeta->nelem;
+ newmeta.h_charkey = oldmeta->h_charkey;
+
+ /*
+ * There was a bug in 2.X versions where the nelem could go negative.
+ * In general, this is considered "bad." If it does go negative
+ * (that is, very large and positive), we'll die trying to dump and
+ * load this database. So, let's see if we can fix it here.
+ */
+ nelem = newmeta.nelem;
+ fillf = newmeta.ffactor;
+ maxb = newmeta.max_bucket;
+
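+	/*
+	 * For illustration (hypothetical numbers): with ffactor == 10 and
+	 * max_bucket == 100, any nelem above (10 * 100) / 2 == 500 is taken
+	 * as evidence of an overflowed counter and nelem is reset to 0.
+	 */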
+ if ((fillf != 0 && fillf * maxb < 2 * nelem) ||
+ (fillf == 0 && nelem > 0x8000000))
+ newmeta.nelem = 0;
+
+ /*
+ * We now have to convert the spares array. The old spares array
+ * contained the total number of extra pages allocated prior to
+ * the bucket that begins the next doubling. The new spares array
+ * contains the page number of the first bucket in the next doubling
+ * MINUS the bucket number of that bucket.
+ */
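+	/*
+	 * For illustration (hypothetical numbers): with no extra pages,
+	 * bucket b lives on page b + 1 (page 0 is the meta page), so a new
+	 * entry is 1.  If o_spares[i - 1] == 5 extra pages were allocated
+	 * before the doubling, its first bucket is pushed 5 pages further
+	 * out and n_spares[i] == 1 + 5 == 6.
+	 */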
+ o_spares = oldmeta->spares;
+ n_spares = newmeta.spares;
+ max_entry = __db_log2(maxb + 1); /* highest spares entry in use */
+ n_spares[0] = 1;
+ for (i = 1; i < NCACHED && i <= max_entry; i++)
+ n_spares[i] = 1 + o_spares[i - 1];
+
+ /* Replace the unique ID. */
+ if ((ret = __os_fileid(env, real_name, 1, newmeta.dbmeta.uid)) != 0)
+ return (ret);
+
+ /* Overwrite the original. */
+ memcpy(oldmeta, &newmeta, sizeof(newmeta));
+
+ return (0);
+}
+
+/*
+ * __ham_30_sizefix --
+ * Make sure that all hash pages belonging to the current
+ * hash doubling are within the bounds of the file.
+ *
+ * PUBLIC: int __ham_30_sizefix __P((DB *, DB_FH *, char *, u_int8_t *));
+ */
+int
+__ham_30_sizefix(dbp, fhp, realname, metabuf)
+ DB *dbp;
+ DB_FH *fhp;
+ char *realname;
+ u_int8_t *metabuf;
+{
+ u_int8_t buf[DB_MAX_PGSIZE];
+ ENV *env;
+ HMETA30 *meta;
+ db_pgno_t last_actual, last_desired;
+ int ret;
+ size_t nw;
+ u_int32_t pagesize;
+
+ env = dbp->env;
+ memset(buf, 0, DB_MAX_PGSIZE);
+
+ meta = (HMETA30 *)metabuf;
+ pagesize = meta->dbmeta.pagesize;
+
+ /*
+ * Get the last page number. To do this, we'll need dbp->pgsize
+ * to be set right, so slam it into place.
+ */
+ dbp->pgsize = pagesize;
+ if ((ret = __db_lastpgno(dbp, realname, fhp, &last_actual)) != 0)
+ return (ret);
+
+ /*
+ * The last bucket in the doubling is equal to high_mask; calculate
+ * the page number that implies.
+ */
+ last_desired = BS_TO_PAGE(meta->high_mask, meta->spares);
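+	/*
+	 * For illustration (hypothetical numbers): BS_TO_PAGE adds the
+	 * spares entry for a bucket's doubling to the bucket number, so
+	 * with high_mask == 7 and spares[__db_log2(8)] == spares[3] == 1,
+	 * the last bucket of the doubling lives on page 7 + 1 == 8.
+	 */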
+
+ /*
+ * If last_desired > last_actual, we need to grow the file. Write
+ * a zeroed page where last_desired would go.
+ */
+ if (last_desired > last_actual) {
+ if ((ret = __os_seek(
+ env, fhp, last_desired, pagesize, 0)) != 0)
+ return (ret);
+ if ((ret = __os_write(env, fhp, buf, pagesize, &nw)) != 0)
+ return (ret);
+ }
+
+ return (0);
+}
+
+/*
+ * __ham_31_hashmeta --
+ * Upgrade the database from version 6 to version 7.
+ *
+ * PUBLIC: int __ham_31_hashmeta
+ * PUBLIC: __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
+ */
+int
+__ham_31_hashmeta(dbp, real_name, flags, fhp, h, dirtyp)
+ DB *dbp;
+ char *real_name;
+ u_int32_t flags;
+ DB_FH *fhp;
+ PAGE *h;
+ int *dirtyp;
+{
+ HMETA30 *oldmeta;
+ HMETA31 *newmeta;
+
+ COMPQUIET(dbp, NULL);
+ COMPQUIET(real_name, NULL);
+ COMPQUIET(fhp, NULL);
+
+ newmeta = (HMETA31 *)h;
+ oldmeta = (HMETA30 *)h;
+
+ /*
+ * Copy the fields down the page.
+ * The fields may overlap so start at the bottom and use memmove().
+ */
+ memmove(newmeta->spares, oldmeta->spares, sizeof(oldmeta->spares));
+ newmeta->h_charkey = oldmeta->h_charkey;
+ newmeta->nelem = oldmeta->nelem;
+ newmeta->ffactor = oldmeta->ffactor;
+ newmeta->low_mask = oldmeta->low_mask;
+ newmeta->high_mask = oldmeta->high_mask;
+ newmeta->max_bucket = oldmeta->max_bucket;
+ memmove(newmeta->dbmeta.uid,
+ oldmeta->dbmeta.uid, sizeof(oldmeta->dbmeta.uid));
+ newmeta->dbmeta.flags = oldmeta->dbmeta.flags;
+ newmeta->dbmeta.record_count = 0;
+ newmeta->dbmeta.key_count = 0;
+ ZERO_LSN(newmeta->dbmeta.unused3);
+
+ /* Update the version. */
+ newmeta->dbmeta.version = 7;
+
+ /* Upgrade the flags. */
+ if (LF_ISSET(DB_DUPSORT))
+ F_SET(&newmeta->dbmeta, DB_HASH_DUPSORT);
+
+ *dirtyp = 1;
+ return (0);
+}
+
+/*
+ * __ham_31_hash --
+ * Upgrade the database hash leaf pages.
+ *
+ * PUBLIC: int __ham_31_hash
+ * PUBLIC: __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
+ */
+int
+__ham_31_hash(dbp, real_name, flags, fhp, h, dirtyp)
+ DB *dbp;
+ char *real_name;
+ u_int32_t flags;
+ DB_FH *fhp;
+ PAGE *h;
+ int *dirtyp;
+{
+ HKEYDATA *hk;
+ db_pgno_t pgno, tpgno;
+ db_indx_t indx;
+ int ret;
+
+ COMPQUIET(flags, 0);
+
+ ret = 0;
+ for (indx = 0; indx < NUM_ENT(h); indx += 2) {
+ hk = (HKEYDATA *)H_PAIRDATA(dbp, h, indx);
+ if (HPAGE_PTYPE(hk) == H_OFFDUP) {
+ memcpy(&pgno, HOFFDUP_PGNO(hk), sizeof(db_pgno_t));
+ tpgno = pgno;
+ if ((ret = __db_31_offdup(dbp, real_name, fhp,
+ LF_ISSET(DB_DUPSORT) ? 1 : 0, &tpgno)) != 0)
+ break;
+ if (pgno != tpgno) {
+ *dirtyp = 1;
+ memcpy(HOFFDUP_PGNO(hk),
+ &tpgno, sizeof(db_pgno_t));
+ }
+ }
+ }
+
+ return (ret);
+}
+
+/*
+ * __ham_46_hashmeta --
+ * Upgrade the database from version 8 to version 9.
+ *
+ * PUBLIC: int __ham_46_hashmeta
+ * PUBLIC: __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
+ */
+int
+__ham_46_hashmeta(dbp, real_name, flags, fhp, h, dirtyp)
+ DB *dbp;
+ char *real_name;
+ u_int32_t flags;
+ DB_FH *fhp;
+ PAGE *h;
+ int *dirtyp;
+{
+ HMETA33 *newmeta;
+
+ COMPQUIET(dbp, NULL);
+ COMPQUIET(real_name, NULL);
+ COMPQUIET(flags, 0);
+ COMPQUIET(fhp, NULL);
+
+ newmeta = (HMETA33 *)h;
+ /* Update the version. */
+ newmeta->dbmeta.version = 9;
+ *dirtyp = 1;
+
+ return (0);
+}
+
+/*
+ * __ham_46_hash --
+ * Upgrade the database hash leaf pages.
+ * From version 8 databases to version 9.
+ * Involves sorting leaf pages, no format change.
+ *
+ * PUBLIC: int __ham_46_hash
+ * PUBLIC: __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
+ */
+int
+__ham_46_hash(dbp, real_name, flags, fhp, h, dirtyp)
+ DB *dbp;
+ char *real_name;
+ u_int32_t flags;
+ DB_FH *fhp;
+ PAGE *h;
+ int *dirtyp;
+{
+ DBC *dbc;
+ int ret, t_ret;
+
+ COMPQUIET(real_name, NULL);
+ COMPQUIET(flags, 0);
+ COMPQUIET(fhp, NULL);
+
+ if ((ret = __db_cursor(dbp, NULL, NULL, &dbc, 0)) != 0)
+ return (ret);
+ *dirtyp = 1;
+ ret = __ham_sort_page(dbc, NULL, h);
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
diff --git a/src/hash/hash_verify.c b/src/hash/hash_verify.c
new file mode 100644
index 00000000..662e7ac8
--- /dev/null
+++ b/src/hash/hash_verify.c
@@ -0,0 +1,1157 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_verify.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+
+static int __ham_dups_unsorted __P((DB *, u_int8_t *, u_int32_t));
+static int __ham_vrfy_bucket __P((DB *, VRFY_DBINFO *, HMETA *, u_int32_t,
+ u_int32_t));
+static int __ham_vrfy_item __P((DB *,
+ VRFY_DBINFO *, db_pgno_t, PAGE *, u_int32_t, u_int32_t));
+
+/*
+ * __ham_vrfy_meta --
+ * Verify the hash-specific part of a metadata page.
+ *
+ * Note that unlike btree, we don't save things off, because we
+ * will need almost everything again to verify each page, and the
+ * amount of state here is significant.
+ *
+ * PUBLIC: int __ham_vrfy_meta __P((DB *, VRFY_DBINFO *, HMETA *,
+ * PUBLIC: db_pgno_t, u_int32_t));
+ */
+int
+__ham_vrfy_meta(dbp, vdp, m, pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ HMETA *m;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ ENV *env;
+ HASH *hashp;
+ VRFY_PAGEINFO *pip;
+ int i, ret, t_ret, isbad;
+ u_int32_t pwr, mbucket;
+ u_int32_t (*hfunc) __P((DB *, const void *, u_int32_t));
+
+ env = dbp->env;
+ isbad = 0;
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+
+ hashp = dbp->h_internal;
+
+ if (hashp != NULL && hashp->h_hash != NULL)
+ hfunc = hashp->h_hash;
+ else
+ hfunc = __ham_func5;
+
+ /*
+ * If we came through __db_vrfy_pagezero, we have already checked the
+	 * common fields.  However, we used the on-disk metadata page, which
+	 * may have been stale.  We now have the page from mpool, so check
+	 * that.
+ */
+ if ((ret = __db_vrfy_meta(dbp, vdp, &m->dbmeta, pgno, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+
+ /* h_charkey */
+ if (!LF_ISSET(DB_NOORDERCHK))
+ if (m->h_charkey != hfunc(dbp, CHARKEY, sizeof(CHARKEY))) {
+ EPRINT((env, DB_STR_A("1096",
+"Page %lu: database has custom hash function; reverify with DB_NOORDERCHK set",
+ "%lu"), (u_long)pgno));
+ /*
+ * Return immediately; this is probably a sign of user
+ * error rather than database corruption, so we want to
+ * avoid extraneous errors.
+ */
+ isbad = 1;
+ goto err;
+ }
+
+	/* max_bucket must not be greater than the last pgno. */
+ if (m->max_bucket > vdp->last_pgno) {
+ EPRINT((env, DB_STR_A("1097",
+ "Page %lu: Impossible max_bucket %lu on meta page",
+ "%lu %lu"), (u_long)pgno, (u_long)m->max_bucket));
+ /*
+ * Most other fields depend somehow on max_bucket, so
+ * we just return--there will be lots of extraneous
+ * errors.
+ */
+ isbad = 1;
+ goto err;
+ }
+
+ /*
+ * max_bucket, high_mask and low_mask: high_mask must be one
+ * less than the next power of two above max_bucket, and
+ * low_mask must be one less than the power of two below it.
+ */
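+	/*
+	 * For illustration (hypothetical numbers): with max_bucket == 5,
+	 * __db_log2(6) == 3 gives pwr == 8, so high_mask must be 7 and
+	 * low_mask must be 3.
+	 */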
+ pwr = (m->max_bucket == 0) ? 1 : 1 << __db_log2(m->max_bucket + 1);
+ if (m->high_mask != pwr - 1) {
+ EPRINT((env, DB_STR_A("1098",
+ "Page %lu: incorrect high_mask %lu, should be %lu",
+ "%lu %lu %lu"), (u_long)pgno, (u_long)m->high_mask,
+ (u_long)pwr - 1));
+ isbad = 1;
+ }
+ pwr >>= 1;
+ if (m->low_mask != pwr - 1) {
+ EPRINT((env, DB_STR_A("1099",
+ "Page %lu: incorrect low_mask %lu, should be %lu",
+ "%lu %lu %lu"), (u_long)pgno, (u_long)m->low_mask,
+ (u_long)pwr - 1));
+ isbad = 1;
+ }
+
+ /* ffactor: no check possible. */
+ pip->h_ffactor = m->ffactor;
+
+ /*
+ * nelem: just make sure it's not astronomical for now. This is the
+ * same check that hash_upgrade does, since there was a bug in 2.X
+ * which could make nelem go "negative".
+ */
+ if (m->nelem > 0x80000000) {
+ EPRINT((env, DB_STR_A("1100",
+ "Page %lu: suspiciously high nelem of %lu", "%lu %lu"),
+ (u_long)pgno, (u_long)m->nelem));
+ isbad = 1;
+ pip->h_nelem = 0;
+ } else
+ pip->h_nelem = m->nelem;
+
+ /* flags */
+ if (F_ISSET(&m->dbmeta, DB_HASH_DUP))
+ F_SET(pip, VRFY_HAS_DUPS);
+ if (F_ISSET(&m->dbmeta, DB_HASH_DUPSORT))
+ F_SET(pip, VRFY_HAS_DUPSORT);
+ /* XXX: Why is the DB_HASH_SUBDB flag necessary? */
+
+ /* spares array */
+ for (i = 0; i < NCACHED && m->spares[i] != 0; i++) {
+ /*
+		 * We set mbucket to the maximum bucket that would use a given
+		 * spares entry; the page it maps to must never be greater
+		 * than last_pgno.
+ */
+ mbucket = (1 << i) - 1;
+ if (BS_TO_PAGE(mbucket, m->spares) > vdp->last_pgno) {
+ EPRINT((env, DB_STR_A("1101",
+ "Page %lu: spares array entry %d is invalid",
+ "%lu %d"), (u_long)pgno, i));
+ isbad = 1;
+ }
+ }
+
+err: if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+ if (LF_ISSET(DB_SALVAGE) &&
+ (t_ret = __db_salvage_markdone(vdp, pgno)) != 0 && ret == 0)
+ ret = t_ret;
+ return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __ham_vrfy --
+ * Verify hash page.
+ *
+ * PUBLIC: int __ham_vrfy __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t,
+ * PUBLIC: u_int32_t));
+ */
+int
+__ham_vrfy(dbp, vdp, h, pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ PAGE *h;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ DBC *dbc;
+ ENV *env;
+ VRFY_PAGEINFO *pip;
+ u_int32_t ent, himark, inpend;
+ db_indx_t *inp;
+ int isbad, ret, t_ret;
+
+ env = dbp->env;
+ isbad = 0;
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+
+ if (TYPE(h) != P_HASH && TYPE(h) != P_HASH_UNSORTED) {
+ ret = __db_unknown_path(env, "__ham_vrfy");
+ goto err;
+ }
+
+ /* Verify and save off fields common to all PAGEs. */
+ if ((ret = __db_vrfy_datapage(dbp, vdp, h, pgno, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+
+ /*
+	 * Verify inp[].  The offset of each entry from 0 to NUM_ENT(h)
+	 * must be lower than the previous one, higher than the current end
+	 * of the inp array, and lower than the page size.
+ *
+ * In any case, we return immediately if things are bad, as it would
+ * be unsafe to proceed.
+ */
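+	/*
+	 * For illustration (hypothetical numbers): on a 4096-byte page,
+	 * himark starts at 4096; if inp[0] == 4000 and inp[1] == 3900,
+	 * each offset is below the previous himark and above inpend, which
+	 * grows by sizeof(db_indx_t) per entry, so the items and the
+	 * entries array never overlap.
+	 */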
+ inp = P_INP(dbp, h);
+ for (ent = 0, himark = dbp->pgsize,
+ inpend = (u_int32_t)((u_int8_t *)inp - (u_int8_t *)h);
+ ent < NUM_ENT(h); ent++)
+ if (inp[ent] >= himark) {
+ EPRINT((env, DB_STR_A("1102",
+ "Page %lu: item %lu is out of order or nonsensical",
+ "%lu %lu"), (u_long)pgno, (u_long)ent));
+ isbad = 1;
+ goto err;
+ } else if (inpend >= himark) {
+ EPRINT((env, DB_STR_A("1103",
+ "Page %lu: entries array collided with data",
+ "%lu"), (u_long)pgno));
+ isbad = 1;
+ goto err;
+
+ } else {
+ himark = inp[ent];
+ inpend += sizeof(db_indx_t);
+ if ((ret = __ham_vrfy_item(
+ dbp, vdp, pgno, h, ent, flags)) != 0)
+ goto err;
+ }
+
+ if ((ret = __db_cursor_int(dbp, vdp->thread_info, NULL, DB_HASH,
+ PGNO_INVALID, 0, DB_LOCK_INVALIDID, &dbc)) != 0)
+ return (ret);
+ if (!LF_ISSET(DB_NOORDERCHK) && TYPE(h) == P_HASH &&
+ (ret = __ham_verify_sorted_page(dbc, h)) != 0)
+ isbad = 1;
+
+err: if ((t_ret =
+ __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret == 0 && isbad == 1 ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __ham_vrfy_item --
+ * Given a hash page and an offset, sanity-check the item itself,
+ * and save off any overflow items or off-page dup children as necessary.
+ */
+static int
+__ham_vrfy_item(dbp, vdp, pgno, h, i, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ db_pgno_t pgno;
+ PAGE *h;
+ u_int32_t i, flags;
+{
+ HOFFDUP hod;
+ HOFFPAGE hop;
+ VRFY_CHILDINFO child;
+ VRFY_PAGEINFO *pip;
+ db_indx_t offset, len, dlen, elen;
+ int ret, t_ret;
+ u_int8_t *databuf;
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+
+ switch (HPAGE_TYPE(dbp, h, i)) {
+ case H_KEYDATA:
+ /* Nothing to do here--everything but the type field is data */
+ break;
+ case H_DUPLICATE:
+ /* Are we a datum or a key? Better be the former. */
+ if (i % 2 == 0) {
+ EPRINT((dbp->env, DB_STR_A("1104",
+ "Page %lu: hash key stored as duplicate item %lu",
+ "%lu %lu"), (u_long)pip->pgno, (u_long)i));
+ }
+ /*
+ * Dups are encoded as a series within a single HKEYDATA,
+ * in which each dup is surrounded by a copy of its length
+		 * on either side (so that the series can be walked in either
+		 * direction).  We loop through this series and make sure
+ * each dup is reasonable.
+ *
+ * Note that at this point, we've verified item i-1, so
+ * it's safe to use LEN_HKEYDATA (which looks at inp[i-1]).
+ */
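+		/*
+		 * A sketch of one element in the series (each length copy
+		 * is a db_indx_t; layout only, not to scale):
+		 *
+		 *   [dlen][ ...dlen bytes of data... ][dlen]
+		 *
+		 * DUP_SIZE(dlen) spans the datum plus both length copies;
+		 * the trailing copy is what makes a backwards walk possible.
+		 */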
+ len = LEN_HKEYDATA(dbp, h, dbp->pgsize, i);
+ databuf = HKEYDATA_DATA(P_ENTRY(dbp, h, i));
+ for (offset = 0; offset < len; offset += DUP_SIZE(dlen)) {
+ memcpy(&dlen, databuf + offset, sizeof(db_indx_t));
+
+ /* Make sure the length is plausible. */
+ if (offset + DUP_SIZE(dlen) > len) {
+ EPRINT((dbp->env, DB_STR_A("1105",
+ "Page %lu: duplicate item %lu has bad length",
+ "%lu %lu"), (u_long)pip->pgno, (u_long)i));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+
+ /*
+ * Make sure the second copy of the length is the
+ * same as the first.
+ */
+ memcpy(&elen,
+ databuf + offset + dlen + sizeof(db_indx_t),
+ sizeof(db_indx_t));
+ if (elen != dlen) {
+ EPRINT((dbp->env, DB_STR_A("1106",
+ "Page %lu: duplicate item %lu has two different lengths",
+ "%lu %lu"), (u_long)pip->pgno, (u_long)i));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+ }
+ F_SET(pip, VRFY_HAS_DUPS);
+ if (!LF_ISSET(DB_NOORDERCHK) &&
+ __ham_dups_unsorted(dbp, databuf, len))
+ F_SET(pip, VRFY_DUPS_UNSORTED);
+ break;
+ case H_OFFPAGE:
+ /* Offpage item. Make sure pgno is sane, save off. */
+ memcpy(&hop, P_ENTRY(dbp, h, i), HOFFPAGE_SIZE);
+ if (!IS_VALID_PGNO(hop.pgno) || hop.pgno == pip->pgno ||
+ hop.pgno == PGNO_INVALID) {
+ EPRINT((dbp->env, DB_STR_A("1107",
+ "Page %lu: offpage item %lu has bad pgno %lu",
+ "%lu %lu %lu"), (u_long)pip->pgno, (u_long)i,
+ (u_long)hop.pgno));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+ memset(&child, 0, sizeof(VRFY_CHILDINFO));
+ child.pgno = hop.pgno;
+ child.type = V_OVERFLOW;
+ child.tlen = hop.tlen; /* This will get checked later. */
+ if ((ret = __db_vrfy_childput(vdp, pip->pgno, &child)) != 0)
+ goto err;
+ break;
+ case H_OFFDUP:
+ /* Offpage duplicate item. Same drill. */
+ memcpy(&hod, P_ENTRY(dbp, h, i), HOFFDUP_SIZE);
+ if (!IS_VALID_PGNO(hod.pgno) || hod.pgno == pip->pgno ||
+ hod.pgno == PGNO_INVALID) {
+ EPRINT((dbp->env, DB_STR_A("1108",
+ "Page %lu: offpage item %lu has bad page number",
+ "%lu %lu"), (u_long)pip->pgno, (u_long)i));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+ memset(&child, 0, sizeof(VRFY_CHILDINFO));
+ child.pgno = hod.pgno;
+ child.type = V_DUPLICATE;
+ if ((ret = __db_vrfy_childput(vdp, pip->pgno, &child)) != 0)
+ goto err;
+ F_SET(pip, VRFY_HAS_DUPS);
+ break;
+ default:
+ EPRINT((dbp->env, DB_STR_A("1109",
+ "Page %lu: item %lu has bad type", "%lu %lu"),
+ (u_long)pip->pgno, (u_long)i));
+ ret = DB_VERIFY_BAD;
+ break;
+ }
+
+err: if ((t_ret =
+ __db_vrfy_putpageinfo(dbp->env, vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __ham_vrfy_structure --
+ * Verify the structure of a hash database.
+ *
+ * PUBLIC: int __ham_vrfy_structure __P((DB *, VRFY_DBINFO *, db_pgno_t,
+ * PUBLIC: u_int32_t));
+ */
+int
+__ham_vrfy_structure(dbp, vdp, meta_pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ db_pgno_t meta_pgno;
+ u_int32_t flags;
+{
+ DB *pgset;
+ DB_MPOOLFILE *mpf;
+ HMETA *m;
+ PAGE *h;
+ VRFY_PAGEINFO *pip;
+ int isbad, p, ret, t_ret;
+ db_pgno_t pgno;
+ u_int32_t bucket, spares_entry;
+
+ mpf = dbp->mpf;
+ pgset = vdp->pgset;
+ h = NULL;
+ ret = isbad = 0;
+
+ if ((ret = __db_vrfy_pgset_get(pgset,
+ vdp->thread_info, vdp->txn, meta_pgno, &p)) != 0)
+ return (ret);
+ if (p != 0) {
+ EPRINT((dbp->env, DB_STR_A("1110",
+ "Page %lu: Hash meta page referenced twice", "%lu"),
+ (u_long)meta_pgno));
+ return (DB_VERIFY_BAD);
+ }
+ if ((ret = __db_vrfy_pgset_inc(pgset,
+ vdp->thread_info, vdp->txn, meta_pgno)) != 0)
+ return (ret);
+
+ /* Get the meta page; we'll need it frequently. */
+ if ((ret = __memp_fget(mpf,
+ &meta_pgno, vdp->thread_info, NULL, 0, &m)) != 0)
+ return (ret);
+
+ /* Loop through bucket by bucket. */
+ for (bucket = 0; bucket <= m->max_bucket; bucket++)
+ if ((ret =
+ __ham_vrfy_bucket(dbp, vdp, m, bucket, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+
+ /*
+ * There may be unused hash pages corresponding to buckets
+ * that have been allocated but not yet used. These may be
+ * part of the current doubling above max_bucket, or they may
+ * correspond to buckets that were used in a transaction
+ * that then aborted.
+ *
+ * Loop through them, as far as the spares array defines them,
+ * and make sure they're all empty.
+ *
+ * Note that this should be safe, since we've already verified
+ * that the spares array is sane.
+ */
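+	/*
+	 * Worked example of the mapping below, assuming the usual
+	 * BS_TO_PAGE definition, bucket + spares[__db_log2(bucket + 1)]:
+	 * bucket 5 selects spares entry __db_log2(6) == 3, so its first
+	 * page is page 5 + m->spares[3].
+	 */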
+ for (bucket = m->max_bucket + 1; spares_entry = __db_log2(bucket + 1),
+ spares_entry < NCACHED && m->spares[spares_entry] != 0; bucket++) {
+ pgno = BS_TO_PAGE(bucket, m->spares);
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ goto err;
+
+		/* It's okay if such a page is totally zeroed; unmark it. */
+ F_CLR(pip, VRFY_IS_ALLZEROES);
+
+ /* It's also OK if this page is simply invalid. */
+ if (pip->type == P_INVALID) {
+ if ((ret = __db_vrfy_putpageinfo(dbp->env,
+ vdp, pip)) != 0)
+ goto err;
+ continue;
+ }
+
+ if (pip->type != P_HASH && pip->type != P_HASH_UNSORTED) {
+ EPRINT((dbp->env, DB_STR_A("1111",
+ "Page %lu: hash bucket %lu maps to non-hash page",
+ "%lu %lu"), (u_long)pgno, (u_long)bucket));
+ isbad = 1;
+ } else if (pip->entries != 0) {
+ EPRINT((dbp->env, DB_STR_A("1112",
+ "Page %lu: non-empty page in unused hash bucket %lu",
+ "%lu %lu"), (u_long)pgno, (u_long)bucket));
+ isbad = 1;
+ } else {
+ if ((ret = __db_vrfy_pgset_get(pgset,
+ vdp->thread_info, vdp->txn, pgno, &p)) != 0)
+ goto err;
+ if (p != 0) {
+ EPRINT((dbp->env, DB_STR_A("1113",
+ "Page %lu: above max_bucket referenced",
+ "%lu"), (u_long)pgno));
+ isbad = 1;
+ } else {
+ if ((ret = __db_vrfy_pgset_inc(pgset,
+ vdp->thread_info, vdp->txn, pgno)) != 0)
+ goto err;
+ if ((ret = __db_vrfy_putpageinfo(dbp->env,
+ vdp, pip)) != 0)
+ goto err;
+ continue;
+ }
+ }
+
+ /* If we got here, it's an error. */
+ (void)__db_vrfy_putpageinfo(dbp->env, vdp, pip);
+ goto err;
+ }
+
+err: if ((t_ret = __memp_fput(mpf, vdp->thread_info, m, dbp->priority)) != 0)
+ return (t_ret);
+ if (h != NULL &&
+ (t_ret = __memp_fput(mpf, vdp->thread_info, h, dbp->priority)) != 0)
+ return (t_ret);
+	return ((isbad == 1 && ret == 0) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __ham_vrfy_bucket --
+ * Verify a given bucket.
+ */
+static int
+__ham_vrfy_bucket(dbp, vdp, m, bucket, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ HMETA *m;
+ u_int32_t bucket, flags;
+{
+ ENV *env;
+ HASH *hashp;
+ VRFY_CHILDINFO *child;
+ VRFY_PAGEINFO *mip, *pip;
+ int ret, t_ret, isbad, p;
+ db_pgno_t pgno, next_pgno;
+ DBC *cc;
+ u_int32_t (*hfunc) __P((DB *, const void *, u_int32_t));
+
+ env = dbp->env;
+ isbad = 0;
+ pip = NULL;
+ cc = NULL;
+
+ hashp = dbp->h_internal;
+ if (hashp != NULL && hashp->h_hash != NULL)
+ hfunc = hashp->h_hash;
+ else
+ hfunc = __ham_func5;
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, PGNO(m), &mip)) != 0)
+ return (ret);
+
+ /* Calculate the first pgno for this bucket. */
+ pgno = BS_TO_PAGE(bucket, m->spares);
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ goto err;
+
+ /*
+	 * Hash pages that nothing has ever hashed to may never have actually
+	 * come into existence; it is possible, and legal, for the first page
+	 * in a bucket not to exist.  In that case the VRFY_NONEXISTENT flag
+	 * would have been set in __db_vrfy_walkpages.
+ */
+ if (F_ISSET(pip, VRFY_NONEXISTENT))
+ goto err;
+
+ /* Make sure we got a plausible page number. */
+ if (pgno > vdp->last_pgno ||
+ (pip->type != P_HASH && pip->type != P_HASH_UNSORTED)) {
+ EPRINT((env, DB_STR_A("1114",
+ "Page %lu: impossible first page in bucket %lu", "%lu %lu"),
+ (u_long)pgno, (u_long)bucket));
+ /* Unsafe to continue. */
+ isbad = 1;
+ goto err;
+ }
+
+ if (pip->prev_pgno != PGNO_INVALID) {
+ EPRINT((env, DB_STR_A("1115",
+ "Page %lu: first page in hash bucket %lu has a prev_pgno",
+ "%lu %lu"), (u_long)pgno, (u_long)bucket));
+ isbad = 1;
+ }
+
+ /*
+ * Set flags for dups and sorted dups.
+ */
+ flags |= F_ISSET(mip, VRFY_HAS_DUPS) ? DB_ST_DUPOK : 0;
+ flags |= F_ISSET(mip, VRFY_HAS_DUPSORT) ? DB_ST_DUPSORT : 0;
+
+ /* Loop until we find a fatal bug, or until we run out of pages. */
+ for (;;) {
+ /* Provide feedback on our progress to the application. */
+ if (!LF_ISSET(DB_SALVAGE))
+ __db_vrfy_struct_feedback(dbp, vdp);
+
+ if ((ret = __db_vrfy_pgset_get(vdp->pgset,
+ vdp->thread_info, vdp->txn, pgno, &p)) != 0)
+ goto err;
+ if (p != 0) {
+ EPRINT((env, DB_STR_A("1116",
+ "Page %lu: hash page referenced twice", "%lu"),
+ (u_long)pgno));
+ isbad = 1;
+ /* Unsafe to continue. */
+ goto err;
+ } else if ((ret = __db_vrfy_pgset_inc(vdp->pgset,
+ vdp->thread_info, vdp->txn, pgno)) != 0)
+ goto err;
+
+ /*
+ * Hash pages that nothing has ever hashed to may never
+ * have actually come into existence, and may appear to be
+ * entirely zeroed. This is acceptable, and since there's
+ * no real way for us to know whether this has actually
+ * occurred, we clear the "wholly zeroed" flag on every
+ * hash page. A wholly zeroed page, by nature, will appear
+ * to have no flags set and zero entries, so should
+ * otherwise verify correctly.
+ */
+ F_CLR(pip, VRFY_IS_ALLZEROES);
+
+ /* If we have dups, our meta page had better know about it. */
+ if (F_ISSET(pip, VRFY_HAS_DUPS) &&
+ !F_ISSET(mip, VRFY_HAS_DUPS)) {
+ EPRINT((env, DB_STR_A("1117",
+ "Page %lu: duplicates present in non-duplicate database",
+ "%lu"), (u_long)pgno));
+ isbad = 1;
+ }
+
+ /*
+ * If the database has sorted dups, this page had better
+ * not have unsorted ones.
+ */
+ if (F_ISSET(mip, VRFY_HAS_DUPSORT) &&
+ F_ISSET(pip, VRFY_DUPS_UNSORTED)) {
+ EPRINT((env, DB_STR_A("1118",
+ "Page %lu: unsorted dups in sorted-dup database",
+ "%lu"), (u_long)pgno));
+ isbad = 1;
+ }
+
+ /* Walk overflow chains and offpage dup trees. */
+ if ((ret = __db_vrfy_childcursor(vdp, &cc)) != 0)
+ goto err;
+ for (ret = __db_vrfy_ccset(cc, pip->pgno, &child); ret == 0;
+ ret = __db_vrfy_ccnext(cc, &child))
+ if (child->type == V_OVERFLOW) {
+ if ((ret = __db_vrfy_ovfl_structure(dbp, vdp,
+ child->pgno, child->tlen,
+ flags | DB_ST_OVFL_LEAF)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+ } else if (child->type == V_DUPLICATE) {
+ if ((ret = __db_vrfy_duptype(dbp,
+ vdp, child->pgno, flags)) != 0) {
+ isbad = 1;
+ continue;
+ }
+ if ((ret = __bam_vrfy_subtree(dbp, vdp,
+ child->pgno, NULL, NULL,
+ flags | DB_ST_RECNUM | DB_ST_DUPSET | DB_ST_TOPLEVEL,
+ NULL, NULL, NULL)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+ }
+ /* Close the cursor on vdp, open one on dbp */
+ if ((ret = __db_vrfy_ccclose(cc)) != 0)
+ goto err;
+ if ((ret = __db_cursor_int(dbp, vdp->thread_info, NULL,
+ DB_HASH, PGNO_INVALID, 0, DB_LOCK_INVALIDID, &cc)) != 0)
+ goto err;
+ /* If it's safe to check that things hash properly, do so. */
+ if (isbad == 0 && !LF_ISSET(DB_NOORDERCHK) &&
+ (ret = __ham_vrfy_hashing(cc, pip->entries,
+ m, bucket, pgno, flags, hfunc)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+
+ next_pgno = pip->next_pgno;
+ ret = __db_vrfy_putpageinfo(env, vdp, pip);
+
+ pip = NULL;
+ if (ret != 0)
+ goto err;
+
+ if (next_pgno == PGNO_INVALID)
+ break; /* End of the bucket. */
+
+ /* We already checked this, but just in case... */
+ if (!IS_VALID_PGNO(next_pgno)) {
+ EPRINT((env, DB_STR_A("1119",
+ "Page %lu: hash page has bad next_pgno", "%lu"),
+ (u_long)pgno));
+ isbad = 1;
+ goto err;
+ }
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, next_pgno, &pip)) != 0)
+ goto err;
+
+ if (pip->prev_pgno != pgno) {
+ EPRINT((env, DB_STR_A("1120",
+ "Page %lu: hash page has bad prev_pgno", "%lu"),
+ (u_long)next_pgno));
+ isbad = 1;
+ }
+ pgno = next_pgno;
+ }
+
+err: if (cc != NULL && ((t_ret = __db_vrfy_ccclose(cc)) != 0) && ret == 0)
+ ret = t_ret;
+ if (mip != NULL && ((t_ret =
+ __db_vrfy_putpageinfo(env, vdp, mip)) != 0) && ret == 0)
+ ret = t_ret;
+ if (pip != NULL && ((t_ret =
+ __db_vrfy_putpageinfo(env, vdp, pip)) != 0) && ret == 0)
+ ret = t_ret;
+ return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __ham_vrfy_hashing --
+ * Verify that all items on a given hash page hash correctly.
+ *
+ * PUBLIC: int __ham_vrfy_hashing __P((DBC *,
+ * PUBLIC: u_int32_t, HMETA *, u_int32_t, db_pgno_t, u_int32_t,
+ * PUBLIC: u_int32_t (*) __P((DB *, const void *, u_int32_t))));
+ */
+int
+__ham_vrfy_hashing(dbc, nentries, m, thisbucket, pgno, flags, hfunc)
+ DBC *dbc;
+ u_int32_t nentries;
+ HMETA *m;
+ u_int32_t thisbucket;
+ db_pgno_t pgno;
+ u_int32_t flags;
+ u_int32_t (*hfunc) __P((DB *, const void *, u_int32_t));
+{
+ DB *dbp;
+ DBT dbt;
+ DB_MPOOLFILE *mpf;
+ DB_THREAD_INFO *ip;
+ PAGE *h;
+ db_indx_t i;
+ int ret, t_ret, isbad;
+ u_int32_t hval, bucket;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ ret = isbad = 0;
+
+ memset(&dbt, 0, sizeof(DBT));
+ F_SET(&dbt, DB_DBT_REALLOC);
+ ENV_GET_THREAD_INFO(dbp->env, ip);
+
+ if ((ret = __memp_fget(mpf, &pgno, ip, NULL, 0, &h)) != 0)
+ return (ret);
+
+ for (i = 0; i < nentries; i += 2) {
+ /*
+ * We've already verified the page integrity and that of any
+ * overflow chains linked off it; it is therefore safe to use
+ * __db_ret. It's also not all that much slower, since we have
+ * to copy every hash item to deal with alignment anyway; we
+ * can tweak this a bit if this proves to be a bottleneck,
+ * but for now, take the easy route.
+ */
+ if ((ret = __db_ret(dbc, h, i, &dbt, NULL, NULL)) != 0)
+ goto err;
+ hval = hfunc(dbp, dbt.data, dbt.size);
+
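+		/*
+		 * Linear-hashing bucket selection; e.g. (values assumed for
+		 * illustration), with max_bucket == 5, high_mask == 0x7 and
+		 * low_mask == 0x3, a hash value of 14 gives 14 & 0x7 == 6,
+		 * which exceeds max_bucket, so the item belongs in
+		 * 14 & 0x3 == 2.
+		 */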
+ bucket = hval & m->high_mask;
+ if (bucket > m->max_bucket)
+ bucket = bucket & m->low_mask;
+
+ if (bucket != thisbucket) {
+ EPRINT((dbp->env, DB_STR_A("1121",
+ "Page %lu: item %lu hashes incorrectly", "%lu %lu"),
+ (u_long)pgno, (u_long)i));
+ isbad = 1;
+ }
+ }
+
+err: if (dbt.data != NULL)
+ __os_ufree(dbp->env, dbt.data);
+ if ((t_ret = __memp_fput(mpf, ip, h, dbp->priority)) != 0)
+ return (t_ret);
+
+ return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __ham_salvage --
+ * Safely dump out anything that looks like a key on an alleged
+ * hash page.
+ *
+ * PUBLIC: int __ham_salvage __P((DB *, VRFY_DBINFO *, db_pgno_t, PAGE *,
+ * PUBLIC: void *, int (*)(void *, const void *), u_int32_t));
+ */
+int
+__ham_salvage(dbp, vdp, pgno, h, handle, callback, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ db_pgno_t pgno;
+ PAGE *h;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ u_int32_t flags;
+{
+ DBT dbt, key_dbt, unkdbt;
+ db_pgno_t dpgno;
+ int ret, err_ret, t_ret;
+ u_int32_t himark, i, ovfl_bufsz;
+ u_int8_t *hk, *p;
+ void *buf, *key_buf;
+ db_indx_t dlen, len, tlen;
+
+ memset(&dbt, 0, sizeof(DBT));
+ dbt.flags = DB_DBT_REALLOC;
+
+ DB_INIT_DBT(unkdbt, "UNKNOWN", sizeof("UNKNOWN") - 1);
+
+ err_ret = 0;
+
+ /*
+ * Allocate a buffer for overflow items. Start at one page;
+ * __db_safe_goff will realloc as needed.
+ */
+ if ((ret = __os_malloc(dbp->env, dbp->pgsize, &buf)) != 0)
+ return (ret);
+ ovfl_bufsz = dbp->pgsize;
+
+ himark = dbp->pgsize;
+ for (i = 0;; i++) {
+ /* If we're not aggressive, break when we hit NUM_ENT(h). */
+ if (!LF_ISSET(DB_AGGRESSIVE) && i >= NUM_ENT(h))
+ break;
+
+ /*
+		 * Verify the current item.  If we're beyond NUM_ENT, errors
+		 * are expected and ignored.
+ */
+ ret = __db_vrfy_inpitem(dbp,
+ h, pgno, i, 0, flags, &himark, NULL);
+ /* If this returned a fatality, it's time to break. */
+ if (ret == DB_VERIFY_FATAL) {
+ if (i >= NUM_ENT(h))
+ ret = 0;
+ break;
+ } else if (ret != 0 && i >= NUM_ENT(h)) {
+ /* Not a reportable error, but don't salvage item. */
+ ret = 0;
+ } else if (ret == 0) {
+ /* Set len to total entry length. */
+ len = LEN_HITEM(dbp, h, dbp->pgsize, i);
+ hk = P_ENTRY(dbp, h, i);
+ if (len == 0 || len > dbp->pgsize ||
+ (u_int32_t)(hk + len - (u_int8_t *)h) >
+ dbp->pgsize) {
+ /* Item is unsafely large; skip it. */
+ err_ret = DB_VERIFY_BAD;
+ continue;
+ }
+ switch (HPAGE_PTYPE(hk)) {
+ case H_KEYDATA:
+ /* Update len to size of item. */
+ len = LEN_HKEYDATA(dbp, h, dbp->pgsize, i);
+keydata: memcpy(buf, HKEYDATA_DATA(hk), len);
+ dbt.size = len;
+ dbt.data = buf;
+ if ((ret = __db_vrfy_prdbt(&dbt,
+ 0, " ", handle, callback, 0, 0, vdp)) != 0)
+ err_ret = ret;
+ break;
+ case H_OFFPAGE:
+ if (len < HOFFPAGE_SIZE) {
+ err_ret = DB_VERIFY_BAD;
+ continue;
+ }
+ memcpy(&dpgno,
+ HOFFPAGE_PGNO(hk), sizeof(dpgno));
+ if ((ret = __db_safe_goff(dbp,
+ vdp, dpgno, &dbt, &buf,
+ &ovfl_bufsz, flags)) != 0) {
+ err_ret = ret;
+ (void)__db_vrfy_prdbt(&unkdbt, 0, " ",
+ handle, callback, 0, 0, vdp);
+ /* fallthrough to end of case */
+ } else if ((ret = __db_vrfy_prdbt(&dbt,
+ 0, " ", handle, callback, 0, 0, vdp)) != 0)
+ err_ret = ret;
+ break;
+ case H_OFFDUP:
+ if (len < HOFFDUP_SIZE) {
+ err_ret = DB_VERIFY_BAD;
+ continue;
+ }
+ memcpy(&dpgno,
+ HOFFDUP_PGNO(hk), sizeof(dpgno));
+ /* UNKNOWN iff pgno is bad or we're a key. */
+ if (!IS_VALID_PGNO(dpgno) || (i % 2 == 0)) {
+ if ((ret =
+ __db_vrfy_prdbt(&unkdbt, 0, " ",
+ handle, callback, 0, 0, vdp)) != 0)
+ err_ret = ret;
+ } else if ((ret = __db_salvage_duptree(dbp,
+ vdp, dpgno, &dbt, handle, callback,
+ flags | DB_SA_SKIPFIRSTKEY)) != 0)
+ err_ret = ret;
+ break;
+ case H_DUPLICATE:
+ /*
+				 * This is an on-page duplicate item; iterate
+ * over the duplicate set, printing out
+ * key/data pairs.
+ */
+ len = LEN_HKEYDATA(dbp, h, dbp->pgsize, i);
+ /*
+ * If this item is at an even index it must be
+ * a key item and it should never be of type
+ * H_DUPLICATE. If we are in aggressive mode,
+ * print the item out as a normal key, and let
+ * the user resolve the discrepancy.
+ */
+ if (i % 2 == 0) {
+ err_ret = ret;
+ if (LF_ISSET(DB_AGGRESSIVE))
+ goto keydata;
+ break;
+ }
+
+ /*
+ * Check to ensure that the item size is
+ * greater than the smallest possible on page
+ * duplicate.
+ */
+ if (len <
+ HKEYDATA_SIZE(2 * sizeof(db_indx_t))) {
+ err_ret = DB_VERIFY_BAD;
+ continue;
+ }
+
+ /*
+ * Copy out the key from the dbt, it is still
+ * present from the previous pass.
+ */
+ memset(&key_dbt, 0, sizeof(key_dbt));
+ if ((ret = __os_malloc(
+ dbp->env, dbt.size, &key_buf)) != 0)
+ return (ret);
+ memcpy(key_buf, buf, dbt.size);
+ key_dbt.data = key_buf;
+ key_dbt.size = dbt.size;
+ key_dbt.flags = DB_DBT_USERMEM;
+
+ /* Loop until we hit the total length. */
+ for (tlen = 0; tlen + sizeof(db_indx_t) < len;
+ tlen += dlen + 2 * sizeof(db_indx_t)) {
+ /*
+				 * Print the key for every duplicate
+				 * item except the first, since the key
+				 * was already output once by the
+				 * previous iteration.
+ */
+ if (tlen != 0) {
+ if ((ret = __db_vrfy_prdbt(
+ &key_dbt, 0, " ", handle,
+ callback, 0, 0, vdp)) != 0)
+ err_ret = ret;
+ }
+ p = HKEYDATA_DATA(hk) + tlen;
+ memcpy(&dlen, p, sizeof(db_indx_t));
+ p += sizeof(db_indx_t);
+ /*
+ * If dlen is too long, print all the
+ * rest of the dup set in a chunk.
+ */
+ if (dlen + tlen + sizeof(db_indx_t) >
+ len) {
+ dlen = len -
+ (tlen + sizeof(db_indx_t));
+ err_ret = DB_VERIFY_BAD;
+ }
+ memcpy(buf, p, dlen);
+ dbt.size = dlen;
+ dbt.data = buf;
+ if ((ret = __db_vrfy_prdbt(&dbt, 0, " ",
+ handle, callback, 0, 0, vdp)) != 0)
+ err_ret = ret;
+ }
+ __os_free(dbp->env, key_buf);
+ break;
+ default:
+ if (!LF_ISSET(DB_AGGRESSIVE))
+ break;
+ err_ret = DB_VERIFY_BAD;
+ break;
+ }
+ }
+ }
+
+ __os_free(dbp->env, buf);
+ if ((t_ret = __db_salvage_markdone(vdp, pgno)) != 0)
+ return (t_ret);
+ return ((ret == 0 && err_ret != 0) ? err_ret : ret);
+}
+
+/*
+ * __ham_meta2pgset --
+ * Return the set of hash pages corresponding to the given
+ * known-good meta page.
+ *
+ * PUBLIC: int __ham_meta2pgset __P((DB *, VRFY_DBINFO *, HMETA *, u_int32_t,
+ * PUBLIC: DB *));
+ */
+int
+__ham_meta2pgset(dbp, vdp, hmeta, flags, pgset)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ HMETA *hmeta;
+ u_int32_t flags;
+ DB *pgset;
+{
+ DB_MPOOLFILE *mpf;
+ DB_THREAD_INFO *ip;
+ PAGE *h;
+ db_pgno_t pgno;
+ u_int32_t bucket, totpgs;
+ int ret, val;
+
+ /*
+ * We don't really need flags, but leave them for consistency with
+ * __bam_meta2pgset.
+ */
+ COMPQUIET(flags, 0);
+ ip = vdp->thread_info;
+
+ DB_ASSERT(dbp->env, pgset != NULL);
+
+ mpf = dbp->mpf;
+ totpgs = 0;
+
+ /*
+ * Loop through all the buckets, pushing onto pgset the corresponding
+ * page(s) for each one.
+ */
+ for (bucket = 0; bucket <= hmeta->max_bucket; bucket++) {
+ pgno = BS_TO_PAGE(bucket, hmeta->spares);
+
+ /*
+ * We know the initial pgno is safe because the spares array has
+ * been verified.
+ *
+ * Safely walk the list of pages in this bucket.
+ */
+ for (;;) {
+ if ((ret =
+ __memp_fget(mpf, &pgno, ip, NULL, 0, &h)) != 0)
+ return (ret);
+ if (TYPE(h) == P_HASH || TYPE(h) == P_HASH_UNSORTED) {
+
+ /*
+ * Make sure we don't go past the end of
+ * pgset.
+ */
+ if (++totpgs > vdp->last_pgno) {
+ (void)__memp_fput(mpf,
+ ip, h, dbp->priority);
+ return (DB_VERIFY_BAD);
+ }
+ if ((ret = __db_vrfy_pgset_inc(pgset,
+ vdp->thread_info, vdp->txn, pgno)) != 0) {
+ (void)__memp_fput(mpf,
+ ip, h, dbp->priority);
+ return (ret);
+ }
+
+ pgno = NEXT_PGNO(h);
+ } else
+ pgno = PGNO_INVALID;
+
+ if ((ret = __memp_fput(mpf, ip, h, dbp->priority)) != 0)
+ return (ret);
+
+ /* If the new pgno is wonky, go onto the next bucket. */
+ if (!IS_VALID_PGNO(pgno) ||
+ pgno == PGNO_INVALID)
+ break;
+
+ /*
+ * If we've touched this page before, we have a cycle;
+ * go on to the next bucket.
+ */
+ if ((ret = __db_vrfy_pgset_get(pgset,
+ vdp->thread_info, vdp->txn, pgno, &val)) != 0)
+ return (ret);
+ if (val != 0)
+ break;
+ }
+ }
+ return (0);
+}
+
+/*
+ * __ham_dups_unsorted --
+ * Takes a known-safe hash duplicate set and its total length.
+ * Returns 1 if there are out-of-order duplicates in this set,
+ * 0 if there are not.
+ */
+static int
+__ham_dups_unsorted(dbp, buf, len)
+ DB *dbp;
+ u_int8_t *buf;
+ u_int32_t len;
+{
+ DBT a, b;
+ db_indx_t offset, dlen;
+ int (*func) __P((DB *, const DBT *, const DBT *));
+
+ memset(&a, 0, sizeof(DBT));
+ memset(&b, 0, sizeof(DBT));
+
+ func = (dbp->dup_compare == NULL) ? __bam_defcmp : dbp->dup_compare;
+
+ /*
+ * Loop through the dup set until we hit the end or we find
+ * a pair of dups that's out of order. b is always the current
+ * dup, a the one before it.
+ */
+ for (offset = 0; offset < len; offset += DUP_SIZE(dlen)) {
+ memcpy(&dlen, buf + offset, sizeof(db_indx_t));
+ b.data = buf + offset + sizeof(db_indx_t);
+ b.size = dlen;
+
+ if (a.data != NULL && func(dbp, &a, &b) > 0)
+ return (1);
+
+ a.data = b.data;
+ a.size = b.size;
+ }
+
+ return (0);
+}
diff --git a/src/heap/heap.c b/src/heap/heap.c
new file mode 100644
index 00000000..ab404658
--- /dev/null
+++ b/src/heap/heap.c
@@ -0,0 +1,2812 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/heap.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+
+static int __heap_bulk __P((DBC *, DBT *, u_int32_t));
+static int __heap_getpage __P((DBC *, u_int32_t, u_int8_t *));
+static int __heapc_close __P((DBC *, db_pgno_t, int *));
+static int __heapc_del __P((DBC *, u_int32_t));
+static int __heapc_destroy __P((DBC *));
+static int __heapc_get __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
+static int __heapc_put __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
+static int __heapc_reloc __P((DBC *, DBT *, DBT *));
+static int __heapc_reloc_partial __P((DBC *, DBT *, DBT *));
+static int __heapc_split __P((DBC *, DBT *, DBT *, int));
+
+/*
+ * Acquire a new page/lock. If we are already holding a page and a lock
+ * we discard those and get the new ones. In this case we can use
+ * LCK_COUPLE to save a trip to the lock manager.  If we are not holding a
+ * page or lock, we simply get a new lock and page.  The lock is released
+ * with a transactional lock put.
+ */
+#undef ACQUIRE
+#define ACQUIRE(dbc, mode, lpgno, lock, fpgno, pagep, flags, mflags, ret) do { \
+ DB_MPOOLFILE *__mpf = (dbc)->dbp->mpf; \
+ if ((pagep) != NULL) { \
+ ret = __memp_fput(__mpf, \
+ (dbc)->thread_info, pagep, dbc->priority); \
+ pagep = NULL; \
+ } \
+ if ((ret) == 0 && STD_LOCKING(dbc)) \
+ ret = __db_lget(dbc, \
+ LOCK_ISSET(lock) ? LCK_COUPLE : 0, \
+ lpgno, mode, flags, &(lock)); \
+ if ((ret) == 0) \
+ ret = __memp_fget(__mpf, &(fpgno), \
+ (dbc)->thread_info, (dbc)->txn, mflags, &(pagep)); \
+} while (0)
+
+/* Acquire a new page/lock for a heap cursor */
+#undef ACQUIRE_CUR
+#define ACQUIRE_CUR(dbc, mode, p, flags, mflags, ret) do { \
+ HEAP_CURSOR *__cp = (HEAP_CURSOR *)(dbc)->internal; \
+ if (p != __cp->pgno) \
+ __cp->pgno = PGNO_INVALID; \
+ ACQUIRE(dbc, mode, p, __cp->lock, p, __cp->page, flags, mflags, ret); \
+ if ((ret) == 0) { \
+ __cp->pgno = p; \
+ __cp->lock_mode = (mode); \
+ } \
+} while (0)
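+/*
+ * Typical usage, as in __heapc_get below (a sketch): step the cursor to
+ * page pgno with a read lock, mapping "no such page" to not-found:
+ *
+ *	ACQUIRE_CUR(dbc, DB_LOCK_READ, pgno, 0, 0, ret);
+ *	if (ret == DB_PAGE_NOTFOUND)
+ *		ret = DB_NOTFOUND;
+ */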
+
+/* Discard the current page/lock for a cursor, indicate txn lock release */
+#undef DISCARD
+#define DISCARD(dbc, pagep, lock, tlock, ret) do { \
+ DB_MPOOLFILE *__mpf = (dbc)->dbp->mpf; \
+ int __t_ret; \
+ __t_ret = 0; \
+ if ((pagep) != NULL) { \
+ __t_ret = __memp_fput(__mpf, \
+ (dbc)->thread_info, pagep, dbc->priority); \
+ pagep = NULL; \
+ } \
+ if (__t_ret != 0 && (ret) == 0) \
+ ret = __t_ret; \
+ if (tlock == 1) \
+ __t_ret = __TLPUT((dbc), lock); \
+ else \
+ __t_ret = __LPUT((dbc), lock); \
+ if (__t_ret != 0 && (ret) == 0) \
+ ret = __t_ret; \
+} while (0)
+
+/*
+ * __heapc_init --
+ * Initialize the access private portion of a cursor
+ *
+ * PUBLIC: int __heapc_init __P((DBC *));
+ */
+int
+__heapc_init(dbc)
+ DBC *dbc;
+{
+ ENV *env;
+ int ret;
+
+ env = dbc->env;
+
+ if (dbc->internal == NULL)
+ if ((ret = __os_calloc(
+ env, 1, sizeof(HEAP_CURSOR), &dbc->internal)) != 0)
+ return (ret);
+
+ /* Initialize methods. */
+ dbc->close = dbc->c_close = __dbc_close_pp;
+ dbc->cmp = __dbc_cmp_pp;
+ dbc->count = dbc->c_count = __dbc_count_pp;
+ dbc->del = dbc->c_del = __dbc_del_pp;
+ dbc->dup = dbc->c_dup = __dbc_dup_pp;
+ dbc->get = dbc->c_get = __dbc_get_pp;
+ dbc->pget = dbc->c_pget = __dbc_pget_pp;
+ dbc->put = dbc->c_put = __dbc_put_pp;
+ dbc->am_bulk = __heap_bulk;
+ dbc->am_close = __heapc_close;
+ dbc->am_del = __heapc_del;
+ dbc->am_destroy = __heapc_destroy;
+ dbc->am_get = __heapc_get;
+ dbc->am_put = __heapc_put;
+ dbc->am_writelock = NULL;
+
+ return (0);
+}
+
+static int
+__heap_bulk(dbc, data, flags)
+ DBC *dbc;
+ DBT *data;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_HEAP_RID prev_rid, rid;
+ DBT sdata;
+ HEAP_CURSOR *cp;
+ HEAPHDR *hdr;
+ HEAPSPLITHDR *shdr;
+ PAGE *pg;
+ db_lockmode_t lock_type;
+ int is_key, ret;
+ int32_t *offp;
+ u_int32_t data_size, key_size, needed, space;
+ u_int8_t *dbuf, *np;
+
+ ret = 0;
+ dbp = dbc->dbp;
+ cp = (HEAP_CURSOR *)dbc->internal;
+ hdr = NULL;
+ shdr = NULL;
+
+ /* Check for additional bits for locking */
+ if (F_ISSET(dbc, DBC_RMW))
+ lock_type = DB_LOCK_WRITE;
+ else
+ lock_type = DB_LOCK_READ;
+
+ /*
+ * np is the next place to copy things into the buffer.
+ * dbuf always stays at the beginning of the buffer.
+ */
+ dbuf = data->data;
+ np = dbuf;
+
+	/* Track the space left; reserve room for the termination entry. */
+ space = data->ulen;
+ space -= sizeof(*offp);
+
+ /* Build the offset/size table from the end up. */
+ offp = (int32_t *)((u_int8_t *)dbuf + data->ulen);
+ offp--;
+
+ /*
+ * key_size and data_size hold the 32-bit aligned size of the key and
+ * data values written to the buffer.
+ */
+ key_size = DB_ALIGN(DB_HEAP_RID_SZ, sizeof(u_int32_t));
+ data_size = 0;
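+	/*
+	 * (For example, assuming a 4-byte db_pgno_t and a 2-byte db_indx_t,
+	 * DB_HEAP_RID_SZ is 6 and key_size rounds up to 8.)
+	 */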
+
+ /* is_key indicates whether keys are returned. */
+ is_key = LF_ISSET(DB_MULTIPLE_KEY) ? 1 : 0;
+
+next_pg:
+ rid.indx = cp->indx;
+ rid.pgno = cp->pgno;
+ pg = cp->page;
+
+ /*
+ * Write records to the buffer, in the format needed by the DB_MULTIPLE
+ * macros. For a description of the data layout, see db.h.
+ */
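+	/*
+	 * Roughly (a sketch, not to scale): records are copied in from the
+	 * front of the buffer while the offset/size table grows backwards
+	 * from the end, with a -1 terminator written last:
+	 *
+	 *   [key][data][key][data] ...  ... [-1][size][off] ... [size][off]
+	 *   dbuf/np -->                      <-- offp
+	 */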
+ do {
+ if (HEAP_OFFSETTBL(dbp, pg)[rid.indx] == 0)
+ continue;
+ hdr = (HEAPHDR *)P_ENTRY(dbp, pg, rid.indx);
+ /*
+ * If this is a split record and not the first piece of the
+ * record, skip it.
+ */
+ if (F_ISSET(hdr, HEAP_RECSPLIT) &&
+ !F_ISSET(hdr, HEAP_RECFIRST))
+ continue;
+
+ /*
+ * Calculate how much space is needed to add this record. If
+ * there's not enough, we're done. If we haven't written any
+ * data to the buffer, or if we are doing a DBP->get, return
+ * DB_BUFFER_SMALL.
+ */
+ needed = 0;
+ if (is_key)
+ needed = 2 * sizeof(*offp) + key_size;
+ if (F_ISSET(hdr, HEAP_RECSPLIT)) {
+ shdr = (HEAPSPLITHDR *)hdr;
+ data_size = DB_ALIGN(shdr->tsize, sizeof(u_int32_t));
+ } else
+ data_size = DB_ALIGN(hdr->size, sizeof(u_int32_t));
+ needed += 2 * sizeof(*offp) + data_size;
+
+ if (needed > space) {
+ if (np == dbuf || F_ISSET(dbc, DBC_FROM_DB_GET)) {
+ data->size = (u_int32_t)DB_ALIGN(
+ needed + data->ulen - space, 1024);
+ return (DB_BUFFER_SMALL);
+ }
+ break;
+ }
+
+ if (is_key) {
+ memcpy(np, &rid, key_size);
+ *offp-- = (int32_t)(np - dbuf);
+ *offp-- = (int32_t)DB_HEAP_RID_SZ;
+ np += key_size;
+ }
+
+ if (F_ISSET(hdr, HEAP_RECSPLIT)) {
+ /*
+ * Use __heapc_gsplit to write a split record to the
+ * return buffer. gsplit will return any fetched pages
+ * to the cache, but will leave the cursor's current
+ * page alone.
+ */
+ memset(&sdata, 0, sizeof(DBT));
+ sdata.data = np;
+ sdata.size = sdata.ulen = shdr->tsize;
+ sdata.flags = DB_DBT_USERMEM;
+ /* gsplit expects the cursor to be positioned. */
+ cp->pgno = rid.pgno;
+ cp->indx = rid.indx;
+ if ((ret = __heapc_gsplit(
+ dbc, &sdata, NULL, NULL)) != 0)
+ return (ret);
+ } else {
+ memcpy(np,
+ (u_int8_t *)hdr + sizeof(HEAPHDR), hdr->size);
+ }
+ *offp-- = (int32_t)(np - dbuf);
+ if (F_ISSET(hdr, HEAP_RECSPLIT))
+ *offp-- = (int32_t)shdr->tsize;
+ else
+ *offp-- = (int32_t)hdr->size;
+ np += data_size;
+ space -= needed;
+ prev_rid = rid;
+
+ /*
+ * The data and "metadata" ends of the buffer should never
+ * overlap.
+ */
+ DB_ASSERT(dbp->env, (void *)np <= (void *)offp);
+ } while (++rid.indx < NUM_ENT(pg));
+
+ /* If we are off the page then try the next page. */
+ if (rid.indx >= NUM_ENT(pg)) {
+ rid.pgno++;
+ ACQUIRE_CUR(dbc, lock_type, rid.pgno, 0, 0, ret);
+ if (ret == 0) {
+ cp->indx = 0;
+ goto next_pg;
+ } else if (ret != DB_PAGE_NOTFOUND)
+ return (ret);
+ }
+
+ DB_ASSERT(dbp->env, (ret == 0 || ret == DB_PAGE_NOTFOUND));
+ cp->indx = prev_rid.indx;
+ cp->pgno = prev_rid.pgno;
+
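+	/* Terminate the offset table; -1 ends DB_MULTIPLE iteration. */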
+ *offp = -1;
+
+ return (0);
+}
+
+static int
+__heapc_close(dbc, root_pgno, rmroot)
+ DBC *dbc;
+ db_pgno_t root_pgno;
+ int *rmroot;
+{
+ DB_MPOOLFILE *mpf;
+ HEAP_CURSOR *cp;
+ int ret;
+
+ COMPQUIET(root_pgno, 0);
+ COMPQUIET(rmroot, 0);
+
+ cp = (HEAP_CURSOR *)dbc->internal;
+ mpf = dbc->dbp->mpf;
+ ret = 0;
+
+ /* Release the page/lock held by the cursor. */
+ DISCARD(dbc, cp->page, cp->lock, 1, ret);
+ if (ret == 0 && !LOCK_ISSET(cp->lock))
+ cp->lock_mode = DB_LOCK_NG;
+
+ return (ret);
+}
+
+static int
+__heapc_del(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_HEAP_RID next_rid, orig_rid;
+ DB_MPOOLFILE *mpf;
+ DBT hdr_dbt, log_dbt;
+ HEAP *h;
+ HEAPHDR *hdr;
+ HEAPPG *rpage;
+ HEAP_CURSOR *cp;
+ db_pgno_t region_pgno;
+ int oldspacebits, ret, spacebits, t_ret;
+ u_int16_t data_size, size;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ h = dbp->heap_internal;
+ cp = (HEAP_CURSOR *)dbc->internal;
+ rpage = NULL;
+ COMPQUIET(flags, 0);
+
+ /*
+ * We need to be able to reset the cursor after deleting a record split
+ * across multiple pages.
+ */
+ orig_rid.pgno = cp->pgno;
+ orig_rid.indx = cp->indx;
+
+ /*
+ * This code is always called with a page lock but no page.
+ */
+ DB_ASSERT(dbp->env, cp->page == NULL);
+
+ /* We have a read lock, but need a write lock. */
+start: if (STD_LOCKING(dbc) && (ret = __db_lget(dbc,
+ LCK_COUPLE, cp->pgno, DB_LOCK_WRITE, 0, &cp->lock)) != 0)
+ return (ret);
+
+ if ((ret = __memp_fget(mpf, &cp->pgno,
+ dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &cp->page)) != 0)
+ return (ret);
+
+ HEAP_CALCSPACEBITS(dbp, HEAP_FREESPACE(dbp, cp->page), oldspacebits);
+
+ hdr = (HEAPHDR *)P_ENTRY(dbp, cp->page, cp->indx);
+ data_size = DB_ALIGN(hdr->size, sizeof(u_int32_t));
+ size = data_size + HEAP_HDRSIZE(hdr);
+ if (size < sizeof(HEAPSPLITHDR))
+ size = sizeof(HEAPSPLITHDR);
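+	/*
+	 * (Every slot is charged at least sizeof(HEAPSPLITHDR) bytes,
+	 * presumably so any freed slot can later hold a split-record
+	 * piece; __heap_ditem asserts this minimum.)
+	 */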
+ if (F_ISSET(hdr, HEAP_RECSPLIT) && !F_ISSET(hdr, HEAP_RECLAST)) {
+ next_rid.pgno = F_ISSET(hdr, HEAP_RECLAST) ?
+ PGNO_INVALID : ((HEAPSPLITHDR *)hdr)->nextpg;
+ next_rid.indx = F_ISSET(hdr, HEAP_RECLAST) ?
+ PGNO_INVALID : ((HEAPSPLITHDR *)hdr)->nextindx;
+ } else {
+ next_rid.pgno = PGNO_INVALID;
+ next_rid.indx = 0;
+ }
+
+ /* Log the deletion. */
+ if (DBC_LOGGING(dbc)) {
+ hdr_dbt.data = hdr;
+ hdr_dbt.size = HEAP_HDRSIZE(hdr);
+ log_dbt.data = (u_int8_t *)hdr + hdr_dbt.size;
+ log_dbt.size = data_size;
+ if ((ret = __heap_addrem_log(dbp, dbc->txn, &LSN(cp->page),
+ 0, DB_REM_HEAP, cp->pgno, (u_int32_t)cp->indx,
+ size, &hdr_dbt, &log_dbt, &LSN(cp->page))) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(cp->page));
+
+ if ((ret = __heap_ditem(dbc, cp->page, cp->indx, size)) != 0)
+ goto err;
+
+ /*
+ * If the deleted item lived in a region prior to our current, back up
+ * the current region, giving us a chance to reuse the newly available
+ * space on the next insert.
+ */
+ region_pgno = HEAP_REGION_PGNO(dbp, cp->pgno);
+ if (region_pgno < h->curregion)
+ h->curregion = region_pgno;
+
+ HEAP_CALCSPACEBITS(dbp, HEAP_FREESPACE(dbp, cp->page), spacebits);
+
+ if (spacebits != oldspacebits) {
+ /*
+ * Get the region page. We never lock the region page, the data
+ * page lock locks the corresponding bits in the bitmap and
+ * latching serializes access.
+ */
+ if ((ret = __memp_fget(mpf, &region_pgno,
+ dbc->thread_info, NULL, DB_MPOOL_DIRTY, &rpage)) != 0)
+ goto err;
+ HEAP_SETSPACE(dbp, rpage,
+ cp->pgno - region_pgno - 1, spacebits);
+ }
+
+err: DB_ASSERT(dbp->env, ret != DB_PAGE_NOTFOUND);
+ if (rpage != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, rpage, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ rpage = NULL;
+
+ if ((t_ret = __memp_fput(mpf,
+ dbc->thread_info, cp->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ cp->page = NULL;
+
+ if (ret == 0 && next_rid.pgno != PGNO_INVALID) {
+ cp->pgno = next_rid.pgno;
+ cp->indx = next_rid.indx;
+ goto start;
+ }
+
+ cp->pgno = orig_rid.pgno;
+ cp->indx = orig_rid.indx;
+
+ return (ret);
+}
+
+/*
+ * __heap_ditem --
+ * Remove an item from a page.
+ *
+ * PUBLIC: int __heap_ditem
+ * PUBLIC: __P((DBC *, PAGE *, u_int32_t, u_int32_t));
+ */
+int
+__heap_ditem(dbc, pagep, indx, nbytes)
+ DBC *dbc;
+ PAGE *pagep;
+ u_int32_t indx, nbytes;
+{
+ DB *dbp;
+ db_indx_t first, i, max, off, *offtbl, span;
+ u_int8_t *src, *dest;
+
+ dbp = dbc->dbp;
+
+ DB_ASSERT(dbp->env, TYPE(pagep) == P_HEAP);
+ DB_ASSERT(dbp->env, nbytes == DB_ALIGN(nbytes, sizeof(u_int32_t)));
+ DB_ASSERT(dbp->env, nbytes >= sizeof(HEAPSPLITHDR));
+
+ offtbl = (db_indx_t *)HEAP_OFFSETTBL(dbp, pagep);
+ off = offtbl[indx];
+ /*
+	 * Find the lowest offset on the page and adjust any offsets that are
+	 * about to be moved.  If the deleted item is the lowest offset on the
+	 * page, everything still works; that is not a special case.
+	 */
+ max = HEAP_HIGHINDX(pagep);
+ first = HOFFSET(pagep);
+ for (i = 0; i <= max; i++) {
+ if (offtbl[i] < off && offtbl[i] != 0)
+ offtbl[i] += nbytes;
+ }
+ offtbl[indx] = 0;
+
+ /*
+ * Coalesce free space at the beginning of the page. Shift all the data
+ * preceding the deleted entry down, overwriting the deleted entry.
+ */
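+	/*
+	 * A sketch, with the free region ending at HOFFSET ("first") and the
+	 * deleted item occupying [off, off + nbytes):
+	 *
+	 *   before:  [.. free ..][ A ][deleted][ B ]
+	 *   after:   [..... free .....][ A ][ B ]
+	 *
+	 * Everything in [first, off) slides up by nbytes, and the offsets of
+	 * those moved entries were already bumped by nbytes above.
+	 */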
+ src = (u_int8_t *)(pagep) + first;
+ dest = src + nbytes;
+ span = off - first;
+ memmove(dest, src, span);
+#ifdef DIAGNOSTIC
+ memset(src, CLEAR_BYTE, nbytes);
+#endif
+
+ /* Update the page's metadata. */
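+	/*
+	 * (HEAP_HIGHINDX tracks the highest occupied slot and HEAP_FREEINDX
+	 * the lowest slot an insert should try; the adjustments below walk
+	 * HIGHINDX down past empty slots and keep FREEINDX <= HIGHINDX + 1.)
+	 */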
+ NUM_ENT(pagep)--;
+ HOFFSET(pagep) += nbytes;
+ if (indx < HEAP_FREEINDX(pagep))
+ HEAP_FREEINDX(pagep) = indx;
+ while (HEAP_HIGHINDX(pagep) > 0 && offtbl[HEAP_HIGHINDX(pagep)] == 0)
+ HEAP_HIGHINDX(pagep)--;
+ if (NUM_ENT(pagep) == 0)
+ HEAP_FREEINDX(pagep) = 0;
+ else if (HEAP_FREEINDX(pagep) > HEAP_HIGHINDX(pagep) + 1)
+ HEAP_FREEINDX(pagep) = HEAP_HIGHINDX(pagep) + 1;
+
+ return (0);
+}
+
+static int
+__heapc_destroy(dbc)
+ DBC *dbc;
+{
+ HEAP_CURSOR *cp;
+
+ cp = (HEAP_CURSOR *)dbc->internal;
+ __os_free(dbc->env, cp);
+ dbc->internal = NULL;
+
+ return (0);
+}
+
+/*
+ * __heapc_get --
+ * Get using a cursor (heap).
+ */
+static int
+__heapc_get(dbc, key, data, flags, pgnop)
+ DBC *dbc;
+ DBT *key;
+ DBT *data;
+ u_int32_t flags;
+ db_pgno_t *pgnop;
+{
+ DB *dbp;
+ DB_HEAP_RID rid;
+ DB_MPOOLFILE *mpf;
+ DB_LOCK meta_lock;
+ DBT tmp_val;
+ HEAP *h;
+ HEAPHDR *hdr;
+ HEAPMETA *meta;
+ HEAPPG *dpage;
+ HEAP_CURSOR *cp;
+ db_lockmode_t lock_type;
+ db_pgno_t pgno;
+ int cmp, f_indx, found, getpage, indx, ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ h = dbp->heap_internal;
+ cp = (HEAP_CURSOR *)dbc->internal;
+ LOCK_INIT(meta_lock);
+ COMPQUIET(pgnop, NULL);
+
+ if (F_ISSET(key, DB_DBT_USERMEM) && key->ulen < DB_HEAP_RID_SZ) {
+ key->size = DB_HEAP_RID_SZ;
+ return (DB_BUFFER_SMALL);
+ }
+
+ /* Check for additional bits for locking */
+ if (F_ISSET(dbc, DBC_RMW))
+ lock_type = DB_LOCK_WRITE;
+ else
+ lock_type = DB_LOCK_READ;
+
+ ret = 0;
+ found = getpage = FALSE;
+ meta = NULL;
+ dpage = NULL;
+ switch (flags) {
+	case DB_CURRENT:
+ /*
+		 * Acquire the current page with a read lock unless the
+		 * user has asked for a write lock.  Make sure the page
+		 * and record still exist.
+ */
+ ACQUIRE_CUR(dbc, lock_type, cp->pgno, 0, 0, ret);
+ if (ret != 0) {
+ if (ret == DB_PAGE_NOTFOUND)
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+
+ if (HEAP_OFFSETTBL(dbp, cp->page)[cp->indx] == 0) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ dpage = (HEAPPG *)cp->page;
+ hdr = (HEAPHDR *)P_ENTRY(dbp, dpage, cp->indx);
+ if (F_ISSET(hdr, HEAP_RECSPLIT) &&
+ !F_ISSET(hdr, HEAP_RECFIRST)) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+
+ break;
+ case DB_FIRST:
+ /*
+		 * The region pages do not distinguish between an empty
+		 * page and a page with something on it.  So we grab the
+		 * first possible data page and look for the lowest index
+		 * with data.  If the page is empty, we go on to the next
+		 * page and look.  If there is no page, there are no records.
+ */
+first: pgno = FIRST_HEAP_DPAGE;
+ while (!found) {
+ /* Put old lock/page and get the new lock/page */
+ ACQUIRE_CUR(dbc, lock_type, pgno, 0, 0, ret);
+			if (ret != 0) {
+ if (ret == DB_PAGE_NOTFOUND)
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ dpage = (HEAPPG *)cp->page;
+ /*
+ * The page needs to be a data page with entries on
+ * it. If page is good, loop through the offset table
+ * finding first non-split record or first piece of a
+ * split record, then set up cursor.
+ */
+ if (TYPE(dpage) == P_HEAP && NUM_ENT(dpage) != 0) {
+ for (indx = 0;
+ indx <= HEAP_HIGHINDX(dpage); indx++) {
+ if (HEAP_OFFSETTBL(
+ dbp, dpage)[indx] == 0)
+ continue;
+ hdr = (HEAPHDR *)P_ENTRY(
+ dbp, dpage, indx);
+ if (!F_ISSET(hdr, HEAP_RECSPLIT) ||
+ F_ISSET(hdr, HEAP_RECFIRST)) {
+ found = TRUE;
+ cp->pgno = pgno;
+ cp->indx = indx;
+ break;
+ }
+ }
+ if (!found)
+ pgno++;
+ } else
+ pgno++;
+ }
+ break;
+ case DB_LAST:
+ /*
+ * Grab the metadata page to find the last page, and start
+ * there looking backwards for the record with the highest
+ * index and return that one.
+ */
+last: pgno = PGNO_BASE_MD;
+ ACQUIRE(dbc, DB_LOCK_READ,
+ pgno, meta_lock, pgno, meta, 0, 0, ret);
+ if (ret != 0)
+ goto err;
+
+ pgno = meta->dbmeta.last_pgno;
+
+ /*
+ * It is possible to have another page added while we are
+		 * searching backwards for the last record.  There is no need
+		 * to block that case by holding the meta page lock.
+ */
+ DISCARD(dbc, meta, meta_lock, 1, ret);
+ if (ret != 0)
+ goto err;
+
+ while (!found) {
+ /* Don't look earlier than the first data page. */
+ if (pgno < FIRST_HEAP_DPAGE) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+
+ /* Put old lock/page and get the new lock/page. */
+ ACQUIRE_CUR(dbc, lock_type, pgno, 0, 0, ret);
+ if (ret != 0)
+ goto err;
+ dpage = (HEAPPG *)cp->page;
+ /*
+ * The page needs to be a data page with entries on
+			 * it.  If the page is good, search backwards until a
+			 * non-split record or the first piece of a split record
+ * is found.
+ */
+ if (TYPE(dpage) == P_HEAP && NUM_ENT(dpage) != 0) {
+ for (indx = HEAP_HIGHINDX(dpage);
+ indx >= 0; indx--) {
+ if (HEAP_OFFSETTBL(
+ dbp, dpage)[indx] == 0)
+ continue;
+ hdr = (HEAPHDR *)P_ENTRY(
+ dbp, dpage, indx);
+ if (!F_ISSET(hdr, HEAP_RECSPLIT) ||
+ F_ISSET(hdr, HEAP_RECFIRST)) {
+ found = TRUE;
+ cp->pgno = pgno;
+ cp->indx = indx;
+ break;
+ }
+ }
+ if (!found)
+ pgno--;
+ } else
+ pgno--;
+ }
+ break;
+ case DB_NEXT_NODUP:
+ case DB_NEXT:
+		/* If the cursor is not initialized, behave as DB_FIRST. */
+ if (dbc->internal->pgno == PGNO_INVALID)
+ goto first;
+
+ /*
+ * Acquire the current page with the lock we have already,
+ * unless user has asked for a write lock.
+ */
+ ACQUIRE_CUR(dbc, lock_type, cp->pgno, 0, 0, ret);
+ if (ret != 0)
+ goto err;
+ dpage = (HEAPPG *)cp->page;
+
+ /* At end of current page, must get next page */
+ if (cp->indx >= HEAP_HIGHINDX(dpage))
+ getpage = TRUE;
+
+ while (!found) {
+ if (getpage) {
+ pgno = cp->pgno + 1;
+
+ /* Put current page/lock and get next one */
+ ACQUIRE_CUR(dbc, lock_type, pgno, 0, 0, ret);
+ if (ret != 0) {
+ /* Beyond last page? */
+ if (ret == DB_PAGE_NOTFOUND)
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ dpage = (HEAPPG *)cp->page;
+
+ /*
+				 * If the page is not a data page (e.g.
+				 * it's a region page), or it's a data
+				 * page without entries, try again.
+ */
+ if (TYPE(dpage) != P_HEAP ||
+ (TYPE(dpage) == P_HEAP &&
+ NUM_ENT(dpage) == 0))
+ continue;
+
+ /* When searching, indx gets bumped to 0 */
+ cp->indx = -1;
+ getpage = FALSE;
+ }
+
+ /*
+ * Bump index and loop through the offset table finding
+ * first nonzero entry. If the offset is for a split
+ * record, make sure it's the first piece of the split
+ * record. HEAP_HIGHINDX always points to highest filled
+ * entry on page.
+ */
+ cp->indx++;
+			for (indx = cp->indx;
+			    indx <= HEAP_HIGHINDX(dpage); indx++) {
+ if (HEAP_OFFSETTBL(dbp, dpage)[indx] == 0)
+ continue;
+ hdr = (HEAPHDR *)P_ENTRY(dbp, dpage, indx);
+ if (!F_ISSET(hdr, HEAP_RECSPLIT) ||
+ F_ISSET(hdr, HEAP_RECFIRST)) {
+ found = TRUE;
+ cp->indx = indx;
+ break;
+ }
+ }
+
+ /* Nothing of interest on page, so try next */
+ if (!found)
+ getpage = TRUE;
+ }
+ break;
+ case DB_PREV_NODUP:
+ case DB_PREV:
+		/* If the cursor is not initialized, behave as DB_LAST. */
+ if (dbc->internal->pgno == PGNO_INVALID)
+ goto last;
+
+ /*
+ * Acquire the current page with the lock we have already,
+ * unless user has asked for a write lock.
+ */
+ ACQUIRE_CUR(dbc, lock_type, cp->pgno, 0, 0, ret);
+ if (ret != 0)
+ goto err;
+ dpage = (HEAPPG *)cp->page;
+
+ /*
+ * Loop through indexes and find first used slot. Check if
+ * already at the first slot.
+ */
+		for (f_indx = 0; (f_indx <= HEAP_HIGHINDX(dpage)) &&
+		    (HEAP_OFFSETTBL(dbp, dpage)[f_indx] == 0); f_indx++)
+			;
+
+ /* At the beginning of current page, must get new page */
+ if (cp->indx == 0 || cp->indx <= f_indx) {
+ if (cp->pgno == FIRST_HEAP_DPAGE) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ getpage = TRUE;
+ }
+
+ while (!found) {
+ if (getpage) {
+ pgno = cp->pgno - 1;
+ /* Do not go past first page */
+ if (pgno < FIRST_HEAP_DPAGE) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ /* Put current page/lock and get prev page. */
+ ACQUIRE_CUR(dbc, lock_type, pgno, 0, 0, ret);
+ if (ret != 0)
+ goto err;
+
+ dpage = (HEAPPG *)cp->page;
+
+ /*
+				 * If the page is not a data page (e.g.
+				 * it's a region page), or it's a data
+				 * page without entries, try again.
+ */
+ if (TYPE(dpage) != P_HEAP ||
+ (TYPE(dpage) == P_HEAP &&
+ NUM_ENT(dpage) == 0))
+ continue;
+
+				/* The decrement below lands on the high index. */
+ cp->indx = HEAP_HIGHINDX(dpage) + 1;
+ getpage = FALSE;
+ }
+
+ /*
+ * Decrement index and loop through the offset table
+ * finding previous nonzero entry.
+ */
+ cp->indx--;
+			for (indx = cp->indx; indx >= 0; indx--) {
+ if (HEAP_OFFSETTBL(dbp, dpage)[indx] == 0)
+ continue;
+ hdr = (HEAPHDR *)P_ENTRY(dbp, dpage, indx);
+ if (!F_ISSET(hdr, HEAP_RECSPLIT) ||
+ F_ISSET(hdr, HEAP_RECFIRST)) {
+ found = TRUE;
+ cp->indx = indx;
+ break;
+ }
+ }
+
+ /* Nothing of interest on page, so try previous */
+ if (!found)
+ getpage = TRUE;
+ }
+ break;
+ case DB_GET_BOTH_RANGE:
+ case DB_GET_BOTH:
+ case DB_SET_RANGE:
+ case DB_SET:
+ pgno = ((DB_HEAP_RID *)key->data)->pgno;
+ indx = ((DB_HEAP_RID *)key->data)->indx;
+
+ /* First make sure we're trying to get a data page. */
+ if (pgno == PGNO_BASE_MD ||
+ pgno == HEAP_REGION_PGNO(dbp, pgno)) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+
+ /* Lock the data page and get it. */
+ ACQUIRE_CUR(dbc, lock_type, pgno, 0, 0, ret);
+
+ if (ret != 0) {
+ if (ret == DB_PAGE_NOTFOUND)
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ dpage = (HEAPPG *)cp->page;
+
+		/* Validate the requested index; fail if out of range. */
+ if ((indx > HEAP_HIGHINDX(dpage)) ||
+ (HEAP_OFFSETTBL(dbp, dpage)[indx] == 0)) {
+ DISCARD(dbc, cp->page, cp->lock, 0, ret);
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ hdr = (HEAPHDR *)P_ENTRY(dbp, dpage, indx);
+ if (F_ISSET(hdr, HEAP_RECSPLIT) &&
+ !F_ISSET(hdr, HEAP_RECFIRST)) {
+ DISCARD(dbc, cp->page, cp->lock, 0, ret);
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+
+ cp->pgno = pgno;
+ cp->indx = indx;
+
+ if (flags == DB_GET_BOTH || flags == DB_GET_BOTH_RANGE) {
+ memset(&tmp_val, 0, sizeof(DBT));
+			/* Does the data match? */
+ if (F_ISSET(hdr, HEAP_RECSPLIT)) {
+ tmp_val.flags = DB_DBT_MALLOC;
+				if ((ret = __heapc_gsplit(
+				    dbc, &tmp_val, NULL, NULL)) != 0)
+ goto err;
+ } else {
+ tmp_val.data =
+ (void *)((u_int8_t *)hdr + sizeof(HEAPHDR));
+ tmp_val.size = hdr->size;
+ }
+ cmp = __bam_defcmp(dbp, &tmp_val, data);
+ if (F_ISSET(&tmp_val, DB_DBT_MALLOC))
+ __os_ufree(dbp->env, tmp_val.data);
+ if (cmp != 0) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ }
+
+ break;
+ case DB_NEXT_DUP:
+ case DB_PREV_DUP:
+ ret = DB_NOTFOUND;
+ goto err;
+ default:
+ /* DB_GET_RECNO, DB_JOIN_ITEM, DB_SET_RECNO are invalid */
+ ret = __db_unknown_flag(dbp->env, "__heap_get", flags);
+		goto err;
+	}
+
+err:	if (ret == 0) {
+ if (key != NULL) {
+ rid.pgno = cp->pgno;
+ rid.indx = cp->indx;
+ ret = __db_retcopy(dbp->env, key, &rid,
+ DB_HEAP_RID_SZ, &dbc->rkey->data, &dbc->rkey->ulen);
+ F_SET(key, DB_DBT_ISSET);
+		}
+	} else {
+ if (meta != NULL)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, meta, dbc->priority);
+ if (LOCK_ISSET(meta_lock))
+ (void)__LPUT(dbc, meta_lock);
+ if (LOCK_ISSET(cp->lock))
+ (void)__LPUT(dbc, cp->lock);
+ }
+ DB_ASSERT(dbp->env, ret != DB_PAGE_NOTFOUND);
+ return (ret);
+}
+
+#undef IS_FIRST
+#define IS_FIRST (last_rid.pgno == PGNO_INVALID)
+/*
+ * __heapc_reloc_partial --
+ * Move data from a too-full page to a new page. The old data page must
+ * be write locked before calling this method.
+ */
+static int
+__heapc_reloc_partial(dbc, key, data)
+ DBC *dbc;
+ DBT *key;
+ DBT *data;
+{
+ DB *dbp;
+ DBT hdr_dbt, log_dbt, t_data, t_key;
+ DB_HEAP_RID last_rid, next_rid;
+ HEAPHDR *old_hdr;
+ HEAPSPLITHDR new_hdr;
+ HEAP_CURSOR *cp;
+ int add_bytes, ret;
+ u_int32_t buflen, data_size, dlen, doff, left, old_size;
+ u_int32_t remaining, size;
+ u_int8_t *buf, *olddata;
+
+ dbp = dbc->dbp;
+ cp = (HEAP_CURSOR *)dbc->internal;
+ old_hdr = (HEAPHDR *)(P_ENTRY(dbp, cp->page, cp->indx));
+ memset(&hdr_dbt, 0, sizeof(DBT));
+ memset(&log_dbt, 0, sizeof(DBT));
+ buf = NULL;
+ COMPQUIET(key, NULL);
+
+ /* We only work on partial puts. */
+ DB_ASSERT(dbp->env, F_ISSET(data, DB_DBT_PARTIAL));
+
+ /*
+ * Start by calculating the data_size, total size of the new record, and
+ * dlen, the number of bytes we will actually overwrite. Keep a local
+ * copy of doff, we'll adjust it as we see pieces of the record so that
+ * it's always relative to the current piece of data.
+ */
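+	/*
+	 * A worked example, numbers assumed for illustration: old_size == 10,
+	 * doff == 4, data->dlen == 3 and data->size == 5 overwrite bytes
+	 * 4..6 and give data_size == 10 - 3 + 5 == 12.  If instead
+	 * doff == 12, the put is post-pending: bytes 10 and 11 are
+	 * zero-filled and data_size == doff + data->size == 17.
+	 */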
+ if (F_ISSET(old_hdr, HEAP_RECSPLIT))
+ old_size = ((HEAPSPLITHDR *)old_hdr)->tsize;
+ else
+ old_size = old_hdr->size;
+ doff = data->doff;
+ if (old_size < doff) {
+ /* Post-pending */
+ dlen = data->dlen;
+ data_size = doff + data->size;
+ } else {
+ if (old_size - doff < data->dlen)
+ dlen = old_size - doff;
+ else
+ dlen = data->dlen;
+ data_size = old_size - dlen + data->size;
+ }
+
+ /*
+ * We don't need a buffer large enough to hold the data_size
+ * bytes, just one large enough to hold the bytes that will be
+ * written to an individual page. We'll realloc to the necessary size
+ * as needed.
+ */
+ buflen = 0;
+ buf = NULL;
+
+ /*
+ * We are updating an existing record, which will grow into a split
+ * record. The strategy is to overwrite the existing record (or each
+ * piece of the record if the record is already split.) If the new
+ * record is shorter than the old, delete any extra pieces. If the new
+ * record is longer than the old, use heapc_split() to write the extra
+ * data.
+ *
+ * We start each loop with old_hdr pointed at the header for the old
+ * record and the necessary page write locked in cp->page.
+ */
+ last_rid.pgno = PGNO_INVALID;
+ last_rid.indx = 0;
+ add_bytes = 1;
+ left = data_size;
+ memset(&t_data, 0, sizeof(DBT));
+ remaining = 0;
+ for (;;) {
+ /* Figure out if we have a next piece. */
+ if (F_ISSET(old_hdr, HEAP_RECSPLIT)) {
+ next_rid.pgno = ((HEAPSPLITHDR *)old_hdr)->nextpg;
+ next_rid.indx = ((HEAPSPLITHDR *)old_hdr)->nextindx;
+ } else {
+ next_rid.pgno = PGNO_INVALID;
+ next_rid.indx = 0;
+ }
+
+ /*
+ * Before we delete the old data, use it to construct the new
+ * data. First figure out the size of the new piece, including
+ * any remaining data from the last piece.
+ */
+ if (doff >= old_hdr->size)
+ if (F_ISSET(old_hdr, HEAP_RECLAST) ||
+ !F_ISSET(old_hdr, HEAP_RECSPLIT)) {
+ /* Post-pending. */
+ data_size = doff + data->size;
+ } else {
+ /* The new piece is just the old piece. */
+ data_size = old_hdr->size;
+ }
+ else if (doff + dlen > old_hdr->size)
+ /*
+ * Some of the to-be-overwritten bytes are on the next
+ * piece, but we'll append all the new bytes to this
+ * piece if we haven't already written them.
+ */
+ data_size = doff + (add_bytes ? data->size : 0);
+ else
+ data_size = old_hdr->size -
+ dlen + (add_bytes ? data->size : 0);
+ data_size += remaining;
+
+ if (data_size > buflen) {
+			if ((ret = __os_realloc(dbp->env,
+			    data_size, &buf)) != 0)
+				goto err;
+ buflen = data_size;
+ }
+ t_data.data = buf;
+
+ /*
+		 * Adjust past any remaining bytes; they've already been moved
+ * to the beginning of the buffer.
+ */
+ buf += remaining;
+ remaining = 0;
+
+ olddata = (u_int8_t *)old_hdr + HEAP_HDRSIZE(old_hdr);
+ if (doff >= old_hdr->size) {
+ memcpy(buf, olddata, old_hdr->size);
+ doff -= old_hdr->size;
+ if (F_ISSET(old_hdr, HEAP_RECLAST) ||
+ !F_ISSET(old_hdr, HEAP_RECSPLIT)) {
+ /* Post-pending. */
+ buf += old_hdr->size;
+ memset(buf, '\0', doff);
+ buf += doff;
+ memcpy(buf, data->data, data->size);
+ }
+ } else {
+ /* Preserve the first doff bytes. */
+ memcpy(buf, olddata, doff);
+ buf += doff;
+ olddata += doff;
+ /* Copy in the new bytes, if needed. */
+ if (add_bytes) {
+ memcpy(buf, data->data, data->size);
+ buf += data->size;
+ add_bytes = 0;
+ }
+ /* Skip dlen bytes. */
+ if (doff + dlen < old_hdr->size) {
+ olddata += dlen;
+ memcpy(buf,
+ olddata, old_hdr->size - doff - dlen);
+ dlen = 0;
+ } else
+ /*
+ * The data to be removed spills over onto the
+ * following page(s). Adjust dlen to account
+ * for the bytes removed from this page.
+ */
+ dlen = doff + dlen - old_hdr->size;
+ doff = 0;
+ }
+ buf = t_data.data;
+
+ /* Delete the old data, after logging it. */
+ old_size = DB_ALIGN(
+ old_hdr->size + HEAP_HDRSIZE(old_hdr), sizeof(u_int32_t));
+ if (old_size < sizeof(HEAPSPLITHDR))
+ old_size = sizeof(HEAPSPLITHDR);
+ if (DBC_LOGGING(dbc)) {
+ hdr_dbt.data = old_hdr;
+ hdr_dbt.size = HEAP_HDRSIZE(old_hdr);
+ log_dbt.data = (u_int8_t *)old_hdr + hdr_dbt.size;
+ log_dbt.size = DB_ALIGN(
+ old_hdr->size, sizeof(u_int32_t));
+ if ((ret = __heap_addrem_log(dbp, dbc->txn,
+ &LSN(cp->page), 0, DB_REM_HEAP, cp->pgno,
+ (u_int32_t)cp->indx, old_size,
+ &hdr_dbt, &log_dbt, &LSN(cp->page))) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(cp->page));
+ if ((ret = __heap_ditem(
+ dbc, cp->page, cp->indx, old_size)) != 0)
+ goto err;
+
+ if (left == 0)
+ /*
+			 * We've finished writing the new record; we're just
+ * cleaning up the old record now.
+ */
+ goto next_pg;
+
+ if (data_size == 0 && !IS_FIRST) {
+ /*
+ * This piece is being completely removed. We need to
+ * adjust the header of the previous piece now.
+ */
+ ACQUIRE_CUR(dbc, DB_LOCK_WRITE,
+ last_rid.pgno, 0, DB_MPOOL_DIRTY, ret);
+ if (ret != 0)
+ goto err;
+
+ cp->indx = last_rid.indx;
+ old_hdr = (HEAPHDR *)(P_ENTRY(dbp, cp->page, cp->indx));
+
+ if (DBC_LOGGING(dbc)) {
+ old_size = DB_ALIGN(old_hdr->size +
+ HEAP_HDRSIZE(old_hdr), sizeof(u_int32_t));
+ hdr_dbt.data = old_hdr;
+ hdr_dbt.size = HEAP_HDRSIZE(old_hdr);
+ log_dbt.data =
+ (u_int8_t *)old_hdr + hdr_dbt.size;
+ log_dbt.size = DB_ALIGN(
+ old_hdr->size, sizeof(u_int32_t));
+ if ((ret = __heap_addrem_log(dbp, dbc->txn,
+ &LSN(cp->page), 0, DB_REM_HEAP, cp->pgno,
+ (u_int32_t)cp->indx, old_size,
+ &hdr_dbt, &log_dbt, &LSN(cp->page))) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(cp->page));
+
+ ((HEAPSPLITHDR *)old_hdr)->nextpg = next_rid.pgno;
+ ((HEAPSPLITHDR *)old_hdr)->nextindx = next_rid.indx;
+
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __heap_addrem_log(dbp, dbc->txn,
+ &LSN(cp->page), 0, DB_ADD_HEAP, cp->pgno,
+ (u_int32_t)cp->indx, old_size,
+ &hdr_dbt, &log_dbt, &LSN(cp->page))) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(cp->page));
+
+ DISCARD(dbc, cp->page, cp->lock, 1, ret);
+
+ goto next_pg;
+ }
+
+ /* Set up the header for the new record. */
+ memset(&new_hdr, 0, sizeof(HEAPSPLITHDR));
+ new_hdr.std_hdr.flags = HEAP_RECSPLIT;
+ /*
+ * If next_rid.pgno == PGNO_INVALID and there's still more data,
+ * we'll come back and correct the header once we know where the
+ * next piece lives.
+ */
+ new_hdr.nextpg = next_rid.pgno;
+ new_hdr.nextindx = next_rid.indx;
+ /*
+ * Figure out how much we can fit on the page, rounding down to
+ * a multiple of 4. If we will have to expand the offset table,
+ * account for that. It needs to be enough to at least fit the
+ * split header.
+ */
+ size = HEAP_FREESPACE(dbp, cp->page);
+ if (NUM_ENT(cp->page) == 0 ||
+ cp->indx > HEAP_HIGHINDX(cp->page))
+ size -= sizeof(db_indx_t);
+ /* Round down to a multiple of 4. */
+ size = DB_ALIGN(
+ size - sizeof(u_int32_t) + 1, sizeof(u_int32_t));
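+		/*
+		 * (E.g. size == 15 becomes DB_ALIGN(12, 4) == 12, while an
+		 * already-aligned size == 16 becomes DB_ALIGN(13, 4) == 16.)
+		 */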
+ DB_ASSERT(dbp->env, size >= sizeof(HEAPSPLITHDR));
+
+ /*
+ * We try to fill the page, but cannot write more than
+		 * t_data.size bytes; that's all we have in memory.
+ */
+ new_hdr.std_hdr.size = (u_int16_t)
+ (size - sizeof(HEAPSPLITHDR));
+ if (new_hdr.std_hdr.size > data_size)
+ new_hdr.std_hdr.size = data_size;
+ if (new_hdr.std_hdr.size >= left) {
+ new_hdr.std_hdr.size = left;
+ new_hdr.std_hdr.flags |= HEAP_RECLAST;
+ new_hdr.nextpg = PGNO_INVALID;
+ new_hdr.nextindx = 0;
+ }
+ if (IS_FIRST) {
+ new_hdr.std_hdr.flags |= HEAP_RECFIRST;
+ new_hdr.tsize = left;
+ }
+
+ /* Now write the new data to the page. */
+ t_data.size = new_hdr.std_hdr.size;
+ hdr_dbt.data = &new_hdr;
+ hdr_dbt.size = sizeof(HEAPSPLITHDR);
+ /* Log the write. */
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __heap_addrem_log(dbp,
+ dbc->txn, &LSN(cp->page), 0,
+ DB_ADD_HEAP, cp->pgno, (u_int32_t)cp->indx,
+ size, &hdr_dbt, &t_data, &LSN(cp->page))) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(cp->page));
+ if ((ret = __heap_pitem(dbc,
+ (PAGE *)cp->page, cp->indx, size, &hdr_dbt, &t_data)) != 0)
+ goto err;
+
+ left -= new_hdr.std_hdr.size;
+ /*
+ * If any data couldn't fit on this page, it has to go onto the
+ * next. Copy it to the front of the buffer and it will be
+ * preserved in the next loop.
+ */
+ if (new_hdr.std_hdr.size < data_size) {
+ remaining = data_size - new_hdr.std_hdr.size;
+ memmove(buf, buf + new_hdr.std_hdr.size, remaining);
+ }
+
+ /*
+		 * Remember this piece's RID; we may need to update the header
+ * if the next data piece is removed, or if this is the final
+ * piece and we add data to the end of the record.
+ */
+next_pg: last_rid.pgno = cp->pgno;
+ last_rid.indx = cp->indx;
+ /* Get the next page, if any. */
+ if (next_rid.pgno != PGNO_INVALID) {
+ ACQUIRE_CUR(dbc, DB_LOCK_WRITE,
+ next_rid.pgno, 0, DB_MPOOL_DIRTY, ret);
+ if (ret != 0)
+ goto err;
+ cp->indx = next_rid.indx;
+ old_hdr = (HEAPHDR *)(P_ENTRY(dbp, cp->page, cp->indx));
+ DB_ASSERT(dbp->env,
+ HEAP_HIGHINDX(cp->page) <= cp->indx);
+ DB_ASSERT(dbp->env, F_ISSET(old_hdr, HEAP_RECSPLIT));
+ } else {
+ /* Discard the page and drop the lock, txn-ally. */
+ DISCARD(dbc, cp->page, cp->lock, 1, ret);
+ if (ret != 0)
+ goto err;
+ break;
+ }
+ }
+
+ /*
+ * If there is more work to do, let heapc_split do it. After
+ * heapc_split returns we need to update nextpg and nextindx in the
+ * header of the last piece we wrote above.
+ *
+ * For logging purposes, we "delete" the old record and then "add" the
+ * record. This makes redo/undo work as-is, but we won't actually
+ * delete and re-add the record.
+ */
+ if (left > 0) {
+ memset(&t_key, 0, sizeof(DBT));
+ t_key.size = t_key.ulen = sizeof(DB_HEAP_RID);
+ t_key.data = &next_rid;
+ t_key.flags = DB_DBT_USERMEM;
+ t_data.size = left;
+ if ((ret = __heapc_split(dbc, &t_key, &t_data, 0)) != 0)
+ goto err;
+
+ ACQUIRE_CUR(dbc,
+ DB_LOCK_WRITE, last_rid.pgno, 0, DB_MPOOL_DIRTY, ret);
+ if (ret != 0)
+ goto err;
+
+ cp->indx = last_rid.indx;
+ old_hdr = (HEAPHDR *)(P_ENTRY(dbp, cp->page, cp->indx));
+
+ if (DBC_LOGGING(dbc)) {
+ old_size = DB_ALIGN(old_hdr->size +
+ HEAP_HDRSIZE(old_hdr), sizeof(u_int32_t));
+ hdr_dbt.data = old_hdr;
+ hdr_dbt.size = HEAP_HDRSIZE(old_hdr);
+ log_dbt.data = (u_int8_t *)old_hdr + hdr_dbt.size;
+ log_dbt.size = DB_ALIGN(
+ old_hdr->size, sizeof(u_int32_t));
+ if ((ret = __heap_addrem_log(dbp, dbc->txn,
+ &LSN(cp->page), 0, DB_REM_HEAP, cp->pgno,
+ (u_int32_t)cp->indx, old_size,
+ &hdr_dbt, &log_dbt, &LSN(cp->page))) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(cp->page));
+
+ ((HEAPSPLITHDR *)old_hdr)->nextpg = next_rid.pgno;
+ ((HEAPSPLITHDR *)old_hdr)->nextindx = next_rid.indx;
+
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __heap_addrem_log(dbp, dbc->txn,
+ &LSN(cp->page), 0, DB_ADD_HEAP, cp->pgno,
+ (u_int32_t)cp->indx, old_size,
+ &hdr_dbt, &log_dbt, &LSN(cp->page))) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(cp->page));
+
+ DISCARD(dbc, cp->page, cp->lock, 1, ret);
+ }
+
+err: DB_ASSERT(dbp->env, ret != DB_PAGE_NOTFOUND);
+ if (buf != NULL)
+ __os_free(dbp->env, buf);
+ return (ret);
+}
+
+/*
+ * __heapc_reloc --
+ * Move data from a too-full page to a new page. The old data page must
+ * be write locked before calling this method.
+ */
+static int
+__heapc_reloc(dbc, key, data)
+ DBC *dbc;
+ DBT *key;
+ DBT *data;
+{
+ DB *dbp;
+ DBT hdr_dbt, log_dbt, t_data, t_key;
+ DB_HEAP_RID last_rid, next_rid;
+ HEAPHDR *old_hdr;
+ HEAPSPLITHDR new_hdr;
+ HEAP_CURSOR *cp;
+ int is_first, ret;
+ u_int32_t left, old_size, size;
+
+ dbp = dbc->dbp;
+ cp = (HEAP_CURSOR *)dbc->internal;
+ old_hdr = (HEAPHDR *)(P_ENTRY(dbp, cp->page, cp->indx));
+ memset(&hdr_dbt, 0, sizeof(DBT));
+ memset(&log_dbt, 0, sizeof(DBT));
+ COMPQUIET(key, NULL);
+
+ /*
+ * We are updating an existing record, which will grow into a split
+ * record. The strategy is to overwrite the existing record (or each
+ * piece of the record if the record is already split.) If the new
+ * record is shorter than the old, delete any extra pieces. If the new
+ * record is longer than the old, use heapc_split() to write the extra
+ * data.
+ *
+	 * We start each loop with t_data.data positioned at the next byte to be
+ * written, old_hdr pointed at the header for the old record and the
+ * necessary page write locked in cp->page.
+ */
+ is_first = 1;
+ left = data->size;
+ memset(&t_data, 0, sizeof(DBT));
+ t_data.data = data->data;
+ for (;;) {
+ /* Figure out if we have a next piece. */
+ if (F_ISSET(old_hdr, HEAP_RECSPLIT)) {
+ next_rid.pgno = ((HEAPSPLITHDR *)old_hdr)->nextpg;
+ next_rid.indx = ((HEAPSPLITHDR *)old_hdr)->nextindx;
+ } else {
+ next_rid.pgno = PGNO_INVALID;
+ next_rid.indx = 0;
+ }
+
+ /* Delete the old data, after logging it. */
+ old_size = DB_ALIGN(
+ old_hdr->size + HEAP_HDRSIZE(old_hdr), sizeof(u_int32_t));
+ if (old_size < sizeof(HEAPSPLITHDR))
+ old_size = sizeof(HEAPSPLITHDR);
+ if (DBC_LOGGING(dbc)) {
+ hdr_dbt.data = old_hdr;
+ hdr_dbt.size = HEAP_HDRSIZE(old_hdr);
+ log_dbt.data = (u_int8_t *)old_hdr + hdr_dbt.size;
+ log_dbt.size = DB_ALIGN(
+ old_hdr->size, sizeof(u_int32_t));
+ if ((ret = __heap_addrem_log(dbp, dbc->txn,
+ &LSN(cp->page), 0, DB_REM_HEAP, cp->pgno,
+ (u_int32_t)cp->indx, old_size,
+ &hdr_dbt, &log_dbt, &LSN(cp->page))) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(cp->page));
+ if ((ret = __heap_ditem(
+ dbc, cp->page, cp->indx, old_size)) != 0)
+ goto err;
+
+ if (left == 0)
+ /*
+ * We've finished writing the new record, we're just
+ * cleaning up the old record now.
+ */
+ goto next_pg;
+
+ /* Set up the header for the new record. */
+ memset(&new_hdr, 0, sizeof(HEAPSPLITHDR));
+ new_hdr.std_hdr.flags = HEAP_RECSPLIT;
+ /* We'll set this later if next_rid.pgno == PGNO_INVALID. */
+ new_hdr.nextpg = next_rid.pgno;
+ new_hdr.nextindx = next_rid.indx;
+ /*
+ * Figure out how much we can fit on the page, rounding down to
+ * a multiple of 4. If we will have to expand the offset table,
+		 * account for that. It needs to be enough to at least fit the
+ * split header.
+ */
+ size = HEAP_FREESPACE(dbp, cp->page);
+ if (NUM_ENT(cp->page) == 0 ||
+ cp->indx > HEAP_HIGHINDX(cp->page))
+ size -= sizeof(db_indx_t);
+ /* Round down to a multiple of 4. */
+ size = DB_ALIGN(
+ size - sizeof(u_int32_t) + 1, sizeof(u_int32_t));
+ DB_ASSERT(dbp->env, size >= sizeof(HEAPSPLITHDR));
+ new_hdr.std_hdr.size =
+ (u_int16_t)(size - sizeof(HEAPSPLITHDR));
+ if (new_hdr.std_hdr.size >= left) {
+ new_hdr.std_hdr.size = left;
+ new_hdr.std_hdr.flags |= HEAP_RECLAST;
+ new_hdr.nextpg = PGNO_INVALID;
+ new_hdr.nextindx = 0;
+ }
+ if (is_first) {
+ new_hdr.std_hdr.flags |= HEAP_RECFIRST;
+ new_hdr.tsize = left;
+ is_first = 0;
+ }
+
+ /* Now write the new data to the page. */
+ t_data.size = new_hdr.std_hdr.size;
+ hdr_dbt.data = &new_hdr;
+ hdr_dbt.size = sizeof(HEAPSPLITHDR);
+ /* Log the write. */
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __heap_addrem_log(dbp,
+ dbc->txn, &LSN(cp->page), 0,
+ DB_ADD_HEAP, cp->pgno, (u_int32_t)cp->indx,
+ size, &hdr_dbt, &t_data, &LSN(cp->page))) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(cp->page));
+ if ((ret = __heap_pitem(dbc,
+ (PAGE *)cp->page, cp->indx, size, &hdr_dbt, &t_data)) != 0)
+ goto err;
+
+ left -= new_hdr.std_hdr.size;
+ t_data.data = (u_int8_t *)(t_data.data) + new_hdr.std_hdr.size;
+
+ /* Get the next page, if any. */
+next_pg: if (next_rid.pgno != PGNO_INVALID) {
+ ACQUIRE_CUR(dbc, DB_LOCK_WRITE,
+ next_rid.pgno, 0, DB_MPOOL_DIRTY, ret);
+ if (ret != 0)
+ goto err;
+ cp->indx = next_rid.indx;
+ old_hdr = (HEAPHDR *)(P_ENTRY(dbp, cp->page, cp->indx));
+ } else {
+ /*
+ * Remember the final piece's RID, we may need to update
+ * the header after writing the rest of the record.
+ */
+ last_rid.pgno = cp->pgno;
+ last_rid.indx = cp->indx;
+ /* Discard the page and drop the lock, txn-ally. */
+ DISCARD(dbc, cp->page, cp->lock, 1, ret);
+ if (ret != 0)
+ goto err;
+ break;
+ }
+ }
+
+ /*
+ * If there is more work to do, let heapc_split do it. After
+ * heapc_split returns we need to update nextpg and nextindx in the
+ * header of the last piece we wrote above.
+ *
+ * For logging purposes, we "delete" the old record and then "add" the
+ * record. This makes redo/undo work as-is, but we won't actually
+ * delete and re-add the record.
+ */
+ if (left > 0) {
+ memset(&t_key, 0, sizeof(DBT));
+ t_key.size = t_key.ulen = sizeof(DB_HEAP_RID);
+ t_key.data = &next_rid;
+ t_key.flags = DB_DBT_USERMEM;
+ t_data.size = left;
+ if ((ret = __heapc_split(dbc, &t_key, &t_data, 0)) != 0)
+ goto err;
+
+ ACQUIRE_CUR(dbc,
+ DB_LOCK_WRITE, last_rid.pgno, 0, DB_MPOOL_DIRTY, ret);
+ if (ret != 0)
+ goto err;
+
+ cp->indx = last_rid.indx;
+ old_hdr = (HEAPHDR *)(P_ENTRY(dbp, cp->page, cp->indx));
+
+ if (DBC_LOGGING(dbc)) {
+ old_size = DB_ALIGN(old_hdr->size +
+ HEAP_HDRSIZE(old_hdr), sizeof(u_int32_t));
+ hdr_dbt.data = old_hdr;
+ hdr_dbt.size = HEAP_HDRSIZE(old_hdr);
+ log_dbt.data = (u_int8_t *)old_hdr + hdr_dbt.size;
+ log_dbt.size = DB_ALIGN(
+ old_hdr->size, sizeof(u_int32_t));
+ if ((ret = __heap_addrem_log(dbp, dbc->txn,
+ &LSN(cp->page), 0, DB_REM_HEAP, cp->pgno,
+ (u_int32_t)cp->indx, old_size,
+ &hdr_dbt, &log_dbt, &LSN(cp->page))) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(cp->page));
+
+ ((HEAPSPLITHDR *)old_hdr)->nextpg = next_rid.pgno;
+ ((HEAPSPLITHDR *)old_hdr)->nextindx = next_rid.indx;
+
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __heap_addrem_log(dbp, dbc->txn,
+ &LSN(cp->page), 0, DB_ADD_HEAP, cp->pgno,
+			    (u_int32_t)cp->indx, old_size,
+ &hdr_dbt, &log_dbt, &LSN(cp->page))) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(cp->page));
+
+ DISCARD(dbc, cp->page, cp->lock, 1, ret);
+ }
+
+err: DB_ASSERT(dbp->env, ret != DB_PAGE_NOTFOUND);
+ return (ret);
+}
+
+/*
+ * __heapc_put --
+ *
+ * Put using a cursor. If the given key exists, update the associated data. If
+ * the given key does not exist, return an error.
+ */
+static int
+__heapc_put(dbc, key, data, flags, pgnop)
+ DBC *dbc;
+ DBT *key;
+ DBT *data;
+ u_int32_t flags;
+ db_pgno_t *pgnop;
+{
+ DB *dbp;
+ DBT hdr_dbt, log_dbt, new_data;
+ DB_MPOOLFILE *mpf;
+ HEAPHDR hdr, *old_hdr;
+ HEAP_CURSOR *cp;
+ PAGE *rpage;
+ db_pgno_t region_pgno;
+ int oldspace, ret, space, t_ret;
+ u_int32_t data_size, dlen, new_size, old_flags, old_size, tot_size;
+ u_int8_t *buf, *olddata, *src, *dest;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ cp = (HEAP_CURSOR *)dbc->internal;
+ rpage = NULL;
+ buf = dest = src = NULL;
+ dlen = 0;
+
+ if (flags != DB_CURRENT) {
+ /* We're going to write following the get, so use RMW. */
+ old_flags = dbc->flags;
+ F_SET(dbc, DBC_RMW);
+ ret = __heapc_get(dbc, key, data, DB_SET, pgnop);
+ F_CLR(key, DB_DBT_ISSET);
+ dbc->flags = old_flags;
+ DB_ASSERT(dbp->env, ret != DB_PAGE_NOTFOUND);
+ if (ret != 0)
+ return (ret);
+ else if (flags == DB_NOOVERWRITE)
+ return (DB_KEYEXIST);
+ if ((ret = __memp_dirty(mpf, &cp->page,
+ dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ return (ret);
+ } else {
+ /* We have a read lock, but need a write lock. */
+ if (STD_LOCKING(dbc) && cp->lock_mode != DB_LOCK_WRITE &&
+ (ret = __db_lget(dbc,
+ LCK_COUPLE, cp->pgno, DB_LOCK_WRITE, 0, &cp->lock)) != 0)
+ return (ret);
+
+ if ((ret = __memp_fget(mpf, &cp->pgno, dbc->thread_info,
+ dbc->txn, DB_MPOOL_DIRTY, &cp->page)) != 0)
+ return (ret);
+ }
+
+ /* We've got the page locked and stored in cp->page. */
+ HEAP_CALCSPACEBITS(dbp, HEAP_FREESPACE(dbp, cp->page), oldspace);
+
+ /*
+ * Figure out the spacing issue. There is a very rare corner case where
+ * we don't have enough space on the page to expand the data. Splitting
+	 * the record results in a larger header; if the page is jam-packed
+	 * there might not be room for the larger header.
+ *
+ * hdr->size is the size of the stored data, it doesn't include any
+ * padding.
+ */
+ old_hdr = (HEAPHDR *)(P_ENTRY(dbp, cp->page, cp->indx));
+ /* Need data.size + header size, 4-byte aligned. */
+ old_size =
+ DB_ALIGN(old_hdr->size + HEAP_HDRSIZE(old_hdr), sizeof(u_int32_t));
+ if (old_size < sizeof(HEAPSPLITHDR))
+ old_size = sizeof(HEAPSPLITHDR);
+ if (F_ISSET(data, DB_DBT_PARTIAL)) {
+ if (F_ISSET(old_hdr, HEAP_RECSPLIT))
+ tot_size = ((HEAPSPLITHDR *)old_hdr)->tsize;
+ else
+ tot_size = old_hdr->size;
+ if (tot_size < data->doff) {
+ /* Post-pending */
+ dlen = data->dlen;
+ data_size = data->doff + data->size;
+ } else {
+ if (tot_size - data->doff < data->dlen)
+ dlen = tot_size - data->doff;
+ else
+ dlen = data->dlen;
+ data_size = tot_size - dlen + data->size;
+ }
+ } else
+ data_size = data->size;
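+	/*
+	 * A sketch of the arithmetic above with made-up numbers: for
+	 * tot_size 20, doff 10, dlen 5 and data->size 8, dlen stays 5
+	 * and data_size = 20 - 5 + 8 = 23. With doff 30, past the end
+	 * of the record, the write is post-pending: dlen is data->dlen
+	 * and data_size = 30 + 8 = 38, the gap being zero-filled below.
+	 */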
+ new_size = DB_ALIGN(data_size + sizeof(HEAPHDR), sizeof(u_int32_t));
+ if (new_size < sizeof(HEAPSPLITHDR))
+ new_size = sizeof(HEAPSPLITHDR);
+
+ /* Check whether we actually have enough space on this page. */
+ if (F_ISSET(old_hdr, HEAP_RECSPLIT) ||
+ (new_size > old_size &&
+ new_size - old_size > HEAP_FREESPACE(dbp, cp->page))) {
+ /*
+		 * We've got to split the record; there's not enough room on
+		 * the page. Splitting the record will remove old_size bytes and
+ * introduce at least sizeof(HEAPSPLITHDR).
+ */
+ if (F_ISSET(data, DB_DBT_PARTIAL))
+ return (__heapc_reloc_partial(dbc, key, data));
+ else
+ return (__heapc_reloc(dbc, key, data));
+ }
+
+ memset(&new_data, 0, sizeof(DBT));
+ new_data.size = data_size;
+ if (F_ISSET(data, DB_DBT_PARTIAL)) {
+ /*
+ * Before replacing the old data, we need to use it to build the
+ * new data.
+ */
+ if ((ret = __os_malloc(dbp->env, data_size, &buf)) != 0)
+ goto err;
+ new_data.data = buf;
+
+ /*
+ * Preserve data->doff bytes at the start, or all of the old
+ * record plus padding, if post-pending.
+ */
+ olddata = (u_int8_t *)old_hdr + sizeof(HEAPHDR);
+ if (data->doff > old_hdr->size) {
+ memcpy(buf, olddata, old_hdr->size);
+ buf += old_hdr->size;
+ memset(buf, '\0', data->doff - old_hdr->size);
+ buf += data->doff - old_hdr->size;
+ } else {
+ memcpy(buf, olddata, data->doff);
+ buf += data->doff;
+ }
+
+ /* Now copy in the user's data. */
+ memcpy(buf, data->data, data->size);
+ buf += data->size;
+
+ /* Fill in remaining data from the old record, skipping dlen. */
+ if (data->doff < old_hdr->size) {
+ olddata += data->doff + data->dlen;
+ memcpy(buf,
+ olddata, old_hdr->size - data->doff - data->dlen);
+ }
+ } else {
+ new_data.data = data->data;
+ }
+
+ /*
+ * Do the update by deleting the old record and writing the new
+ * record. Start by logging the entire operation.
+ */
+ memset(&hdr, 0, sizeof(HEAPHDR));
+ hdr.size = data_size;
+ if (DBC_LOGGING(dbc)) {
+ hdr_dbt.data = old_hdr;
+ hdr_dbt.size = HEAP_HDRSIZE(old_hdr);
+ log_dbt.data = (u_int8_t *)old_hdr + hdr_dbt.size;
+ log_dbt.size = DB_ALIGN(old_hdr->size, sizeof(u_int32_t));
+ if ((ret = __heap_addrem_log(dbp, dbc->txn, &LSN(cp->page),
+ 0, DB_REM_HEAP, cp->pgno, (u_int32_t)cp->indx,
+ old_size, &hdr_dbt, &log_dbt, &LSN(cp->page))) != 0)
+ goto err;
+ hdr_dbt.data = &hdr;
+ hdr_dbt.size = HEAP_HDRSIZE(&hdr);
+ if ((ret = __heap_addrem_log(dbp, dbc->txn, &LSN(cp->page),
+ 0, DB_ADD_HEAP, cp->pgno, (u_int32_t)cp->indx,
+ new_size, &hdr_dbt, &new_data, &LSN(cp->page))) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(cp->page));
+
+ if ((ret = __heap_ditem(dbc, cp->page, cp->indx, old_size)) != 0)
+ goto err;
+ hdr_dbt.data = &hdr;
+ hdr_dbt.size = HEAP_HDRSIZE(&hdr);
+ if ((ret = __heap_pitem(dbc,
+ (PAGE *)cp->page, cp->indx, new_size, &hdr_dbt, &new_data)) != 0)
+ goto err;
+
+ /* Check whether we need to update the space bitmap. */
+ HEAP_CALCSPACEBITS(dbp, HEAP_FREESPACE(dbp, cp->page), space);
+
+ if (space != oldspace) {
+ /* Get the region page with an exclusive latch. */
+ region_pgno = HEAP_REGION_PGNO(dbp, cp->pgno);
+
+ if ((ret = __memp_fget(mpf, &region_pgno,
+ dbc->thread_info, NULL, DB_MPOOL_DIRTY, &rpage)) != 0)
+ goto err;
+
+ HEAP_SETSPACE(dbp, rpage, cp->pgno - region_pgno - 1, space);
+ }
+
+err: DB_ASSERT(dbp->env, ret != DB_PAGE_NOTFOUND);
+ if (rpage != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, rpage, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (F_ISSET(data, DB_DBT_PARTIAL))
+ __os_free(dbp->env, new_data.data);
+
+ if (ret != 0 && LOCK_ISSET(cp->lock))
+ (void)__TLPUT(dbc, cp->lock);
+
+ return (ret);
+}
+
+/*
+ * __heap_getpage --
+ * Return a page with sufficient free space. The page will be write locked
+ * and marked dirty.
+ */
+static int
+__heap_getpage(dbc, size, avail)
+ DBC *dbc;
+ u_int32_t size;
+ u_int8_t *avail;
+{
+ DB *dbp;
+ DBMETA *meta;
+ DB_LOCK meta_lock;
+ DB_LSN meta_lsn;
+ DB_MPOOLFILE *mpf;
+ HEAP *h;
+ HEAPPG *rpage;
+ HEAP_CURSOR *cp;
+ db_pgno_t data_pgno, *lkd_pgs, meta_pgno, region_pgno, start_region;
+ int i, lk_mode, max, p, ret, space, start, t_ret;
+
+ LOCK_INIT(meta_lock);
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ cp = (HEAP_CURSOR *)dbc->internal;
+ h = dbp->heap_internal;
+ start_region = region_pgno = h->curregion;
+ max = HEAP_REGION_SIZE(dbp);
+ i = ret = t_ret = 0;
+ lkd_pgs = NULL;
+
+ /*
+ * The algorithm for finding a page:
+ *
+ * Look in the space bitmap of the current region page for a data page
+ * with at least size bytes free. Once we find a page, try to lock it
+ * and if we get the lock we're done.
+ *
+ * Don't wait for a locked region page, just move on to the next region
+ * page, creating it if it doesn't exist. If the size of the heap
+ * database is not constrained, just keep creating regions and extending
+ * the database until we find a page with space. If the database size
+ * is constrained, loop back to the first region page from the final
+ * region page. If we wind up making it all the way back to where our
+ * search began, we need to start waiting for locked region pages. If
+ * we finish another loop through the database waiting for every region
+ * page, we know there's no room.
+ */
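+
+	/*
+	 * Layout sketch (assuming the first region page directly follows
+	 * the metadata page): each region page is followed by
+	 * HEAP_REGION_SIZE(dbp) data pages, so the data page for bitmap
+	 * slot p is pgno region_pgno + p + 1 and the next region page is
+	 * at region_pgno + HEAP_REGION_SIZE(dbp) + 1, as computed below.
+	 */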
+
+ /*
+ * Figure out the % of the page the data will occupy and translate that
+ * to the relevant bit-map value we need to look for.
+ */
+ HEAP_CALCSPACEBITS(dbp, size, space);
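+	/*
+	 * Note that the comparison in the search below (skip a page when
+	 * its HEAP_SPACE() value is greater than space) implies that
+	 * larger bitmap values denote less free space; either way the
+	 * 2-bit value is only a coarse estimate.
+	 */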
+
+ /*
+ * Get the current region page, with a shared latch. On the first loop
+ * through a fixed size database, we move on to the next region if the
+ * page is locked. On the second loop, we wait for locked region
+ * pages. If the database isn't fixed size, we never wait, we'll
+ * eventually get to use one of the region pages we create.
+ */
+ lk_mode = DB_MPOOL_TRY;
+find: while ((ret = __memp_fget(mpf, &region_pgno,
+ dbc->thread_info, NULL, lk_mode, &rpage)) != 0 ||
+ TYPE(rpage) != P_IHEAP) {
+ if (ret == DB_LOCK_NOTGRANTED)
+ goto next_region;
+ if (ret != 0 && ret != DB_PAGE_NOTFOUND)
+ return (ret);
+ /*
+ * The region page doesn't exist, or hasn't been initialized,
+ * create it, then try again. If the page exists, we have to
+ * drop it before initializing the region.
+ */
+ if (ret == 0 && (ret = __memp_fput(
+ mpf, dbc->thread_info, rpage, dbc->priority)) != 0)
+ return (ret);
+
+ if ((ret = __heap_create_region(dbc, region_pgno)) != 0)
+ return (ret);
+ }
+
+ start = h->curpgindx;
+ /*
+ * If this is the last region page in a fixed size db, figure out the
+ * maximum pgno in the bitmap.
+ */
+ if (region_pgno + max > h->maxpgno)
+ max = h->maxpgno - region_pgno;
+ /*
+ * Look in the bitmap for a page with sufficient free space. We use i
+ * in a slightly strange way. Because the 2-bits in the bitmap are only
+ * an estimate, there is a chance the data won't fit on the page we
+ * choose. In that case, we re-start the process and want to be able to
+ * resume this loop where we left off.
+ */
+ for (; i < max; i++) {
+ p = start + i;
+ if (p >= max)
+ p -= max;
+ if ((*avail = HEAP_SPACE(dbp, rpage, p)) > space)
+ continue;
+ data_pgno = region_pgno + p + 1;
+ ACQUIRE_CUR(dbc,
+ DB_LOCK_WRITE, data_pgno, DB_LOCK_NOWAIT, 0, ret);
+ /*
+ * If we have the lock and the page or have the lock and need to
+		 * create the page, we're good. If we don't have the lock, try
+		 * to find a different page.
+ */
+ if (ret == 0 || ret == DB_PAGE_NOTFOUND)
+ break;
+ else if (ret == DB_LOCK_NOTGRANTED || ret == DB_LOCK_DEADLOCK) {
+ ret = 0;
+ continue;
+ } else
+ goto err;
+ }
+
+ /*
+	 * Keep a worst-case upper bound on the highest used page in the region.
+ */
+ if (i < max && data_pgno > rpage->high_pgno) {
+ if ((ret = __memp_dirty(mpf,
+ &rpage, dbc->thread_info, NULL, dbc->priority, 0)) != 0)
+ goto err;
+ /* We might have blocked, check again */
+ if (data_pgno > rpage->high_pgno)
+ rpage->high_pgno = data_pgno;
+ }
+
+ /* Done with the region page, even if we didn't find a page. */
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, rpage, dbc->priority)) != 0) {
+ /* Did not read the data page, so we can release its lock. */
+ DISCARD(dbc, cp->page, cp->lock, 0, t_ret);
+ goto err;
+ }
+ rpage = NULL;
+
+ if (i >= max) {
+ /*
+ * No free pages on this region page, advance to the next region
+ * page. If we're at the end of a fixed size heap db, loop
+ * around to the first region page. There is not currently a
+ * data page locked.
+ */
+next_region: region_pgno += HEAP_REGION_SIZE(dbp) + 1;
+
+ if (region_pgno > h->maxpgno)
+ region_pgno = FIRST_HEAP_RPAGE;
+
+ if (region_pgno == start_region) {
+ /*
+ * We're in a fixed size db and we've looped through all
+ * region pages.
+ */
+
+ if (lk_mode == DB_MPOOL_TRY) {
+ /*
+ * We may have missed a region page with room,
+ * because we didn't wait for locked pages. Try
+ * another loop, waiting for all pages.
+ */
+ lk_mode = 0;
+ } else {
+ /*
+ * We've seen every region page, because we
+ * waited for all pages. No room.
+ */
+ ret = DB_HEAP_FULL;
+ goto err;
+ }
+ }
+
+ h->curregion = region_pgno;
+ h->curpgindx = 0;
+ i = 0;
+ goto find;
+ }
+
+ /*
+ * At this point we have the page locked. If we have the page, we need
+ * to mark it dirty. If we don't have the page (or if the page is
+ * empty) we need to create and initialize it.
+ */
+ if (cp->pgno == PGNO_INVALID || PGNO(cp->page) == PGNO_INVALID) {
+ /*
+ * The data page needs to be created and the metadata page needs
+ * to be updated. Once we get the metadata page, we must not
+		 * jump to err; the metadata page and lock are put back here.
+ *
+ * It is possible that the page was created by an aborted txn,
+ * in which case the page exists but is all zeros. We still
+ * need to "create" it and log the creation.
+		 */
+
+ meta_pgno = PGNO_BASE_MD;
+ if ((ret = __db_lget(dbc, LCK_ALWAYS, meta_pgno,
+ DB_LOCK_WRITE, DB_LOCK_NOWAIT, &meta_lock)) != 0) {
+ /*
+			 * We don't want to block while holding a latch on
+			 * a page off the end of the file. The page could
+			 * get truncated by another thread and we would
+			 * deadlock.
+ */
+ p = cp->page != NULL;
+ DISCARD(dbc, cp->page, cp->lock, 0, t_ret);
+ if (t_ret != 0 ||
+ (ret != DB_LOCK_NOTGRANTED &&
+ ret != DB_LOCK_DEADLOCK))
+ goto pg_err;
+ if ((ret = __db_lget(dbc, LCK_ALWAYS, meta_pgno,
+ DB_LOCK_WRITE, 0, &meta_lock)) != 0)
+ goto pg_err;
+ ACQUIRE_CUR(dbc, DB_LOCK_WRITE,
+ data_pgno, 0, DB_MPOOL_CREATE, ret);
+ /*
+ * We can race, having read this page when it was
+ * less than last_pgno but now an aborted
+ * allocation can make this page beyond last_pgno
+ * so we must free it. If we can't get the
+ * lock on the page again, then some other
+ * thread will handle the issue.
+ */
+ if (ret != 0) {
+pg_err: if (p != 0) {
+ ACQUIRE_CUR(dbc, DB_LOCK_WRITE,
+ data_pgno, 0, 0, t_ret);
+ if (t_ret == 0 &&
+ PGNO(cp->page) == PGNO_INVALID) {
+ (void)__memp_fput(mpf,
+ dbc->thread_info,
+ cp->page, dbc->priority);
+ (void)__memp_fget(mpf,
+ &data_pgno,
+ dbc->thread_info, dbc->txn,
+ DB_MPOOL_FREE, &cp->page);
+ }
+ (void)__LPUT(dbc, cp->lock);
+ }
+ (void)__LPUT(dbc, meta_lock);
+ goto err;
+ }
+ /* Check if we lost a race. */
+ if (PGNO(cp->page) != PGNO_INVALID) {
+ if ((ret = __LPUT(dbc, meta_lock)) != 0)
+ goto err;
+ goto check;
+ }
+ }
+
+ /*
+ * Before creating a new page in this region, check that the
+ * region page still exists. By this point, the transaction
+ * that created the region must have aborted or committed,
+ * because we now hold the metadata lock. If we can't get the
+ * latch, the page must exist.
+ */
+ ret = __memp_fget(mpf, &region_pgno,
+ dbc->thread_info, NULL, DB_MPOOL_TRY, &rpage);
+ if (ret == DB_LOCK_NOTGRANTED)
+ ret = 0;
+ else if (ret != 0) {
+ /*
+ * Free up the metadata lock. If this was an error
+ * other than a missing region page, bail.
+ */
+ if ((t_ret = __LPUT(dbc, meta_lock)) != 0)
+ ret = t_ret;
+ if (ret != DB_PAGE_NOTFOUND)
+ goto err;
+ /*
+ * The region no longer exists. Release the page's lock
+ * (we haven't created the page yet) and find a new page
+ * on a different region.
+ */
+ DISCARD(dbc, cp->page, cp->lock, 0, t_ret);
+ goto find;
+ } else
+ ret = __memp_fput(mpf,
+ dbc->thread_info, rpage, dbc->priority);
+ rpage = NULL;
+ if (ret != 0)
+ goto meta_unlock;
+
+ if ((ret = __memp_fget(mpf, &meta_pgno,
+ dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &meta)) != 0)
+ goto err;
+
+ /* Log the page creation. Can't jump to err if it fails. */
+ if (DBC_LOGGING(dbc))
+ ret = __heap_pg_alloc_log(dbp,
+ dbc->txn, &LSN(meta), 0, &LSN(meta), meta_pgno,
+ data_pgno, (u_int32_t)P_HEAP, meta->last_pgno);
+ else
+ LSN_NOT_LOGGED(LSN(meta));
+
+ /*
+		 * We may have created a page earlier with a larger page number;
+		 * check before updating the metadata page.
+ */
+ if (ret == 0 && data_pgno > meta->last_pgno)
+ meta->last_pgno = data_pgno;
+ meta_lsn = LSN(meta);
+
+ if ((t_ret = __memp_fput(mpf,
+ dbc->thread_info, meta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ meta = NULL;
+ if (ret != 0)
+ goto meta_unlock;
+
+ /* If the page doesn't actually exist we need to create it. */
+ if (cp->pgno == PGNO_INVALID) {
+ cp->pgno = data_pgno;
+ if ((ret = __memp_fget(mpf, &cp->pgno,
+ dbc->thread_info, dbc->txn,
+ DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &cp->page)) != 0)
+ goto meta_unlock;
+ DB_ASSERT(dbp->env, cp->pgno == data_pgno);
+ } else if ((ret = __memp_dirty(mpf, &cp->page,
+ dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0) {
+ /* Did not read the page, so we can release the lock. */
+ DISCARD(dbc, cp->page, cp->lock, 0, t_ret);
+ goto meta_unlock;
+ }
+
+ /* Now that we have the page we initialize it and we're done. */
+ P_INIT(cp->page,
+ dbp->pgsize, cp->pgno, P_INVALID, P_INVALID, 0, P_HEAP);
+ LSN(cp->page) = meta_lsn;
+
+meta_unlock: if ((t_ret = __TLPUT(dbc, meta_lock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err;
+ } else {
+ /* Check whether we actually have enough space on this page. */
+check: if (size + sizeof(db_indx_t) > HEAP_FREESPACE(dbp, cp->page)) {
+ /* Put back the page and lock, they were never used. */
+ DISCARD(dbc, cp->page, cp->lock, 0, ret);
+ if (ret != 0)
+ goto err;
+
+ /* Re-start the bitmap check on the next page. */
+ i++;
+ goto find;
+ }
+
+ if ((ret = __memp_dirty(mpf, &cp->page,
+ dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0) {
+ /* Did not read the page, so we can release the lock. */
+ DISCARD(dbc, cp->page, cp->lock, 0, t_ret);
+ goto err;
+ }
+ }
+
+ h->curpgindx = data_pgno - region_pgno - 1;
+err: DB_ASSERT(dbp->env, ret != DB_PAGE_NOTFOUND);
+ if (rpage != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, rpage, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __heap_append --
+ * Add an item to a heap database.
+ *
+ * PUBLIC: int __heap_append
+ * PUBLIC: __P((DBC *, DBT *, DBT *));
+ */
+int
+__heap_append(dbc, key, data)
+ DBC *dbc;
+ DBT *data, *key;
+{
+ DB *dbp;
+ DBT tmp_dbt;
+ DB_HEAP_RID rid;
+ DB_MPOOLFILE *mpf;
+ HEAPPG *rpage;
+ HEAPHDR hdr;
+ HEAP_CURSOR *cp;
+ db_indx_t indx;
+ db_pgno_t region_pgno;
+ int ret, space, t_ret;
+ u_int8_t avail;
+ u_int32_t data_size;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ ret = t_ret = 0;
+ rpage = NULL;
+ cp = (HEAP_CURSOR *)dbc->internal;
+
+ /* Need data.size + header size, 4-byte aligned. */
+ if (F_ISSET(data, DB_DBT_PARTIAL))
+ data_size = DB_ALIGN(data->doff +
+ data->size + sizeof(HEAPHDR), sizeof(u_int32_t));
+ else
+ data_size = DB_ALIGN(
+ data->size + sizeof(HEAPHDR), sizeof(u_int32_t));
+
+ if (data_size >= HEAP_MAXDATASIZE(dbp))
+ return (__heapc_split(dbc, key, data, 1));
+ else if (data_size < sizeof(HEAPSPLITHDR))
+ data_size = sizeof(HEAPSPLITHDR);
+
+ if ((ret = __heap_getpage(dbc, data_size, &avail)) != 0)
+ goto err;
+
+ indx = HEAP_FREEINDX(cp->page);
+ memset(&hdr, 0, sizeof(HEAPHDR));
+ hdr.size = data->size;
+ if (F_ISSET(data, DB_DBT_PARTIAL))
+ hdr.size += data->doff;
+ tmp_dbt.data = &hdr;
+ tmp_dbt.size = sizeof(HEAPHDR);
+
+ /* Log the write. */
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __heap_addrem_log(dbp, dbc->txn, &LSN(cp->page),
+ 0, DB_ADD_HEAP, cp->pgno, (u_int32_t)indx,
+ data_size, &tmp_dbt, data, &LSN(cp->page))) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(cp->page));
+
+ if ((ret = __heap_pitem(
+ dbc, (PAGE *)cp->page, indx, data_size, &tmp_dbt, data)) != 0)
+ goto err;
+
+ rid.pgno = cp->pgno;
+ rid.indx = indx;
+ cp->indx = indx;
+
+ /* Check whether we need to update the space bitmap. */
+ HEAP_CALCSPACEBITS(dbp, HEAP_FREESPACE(dbp, cp->page), space);
+
+ if (space != avail) {
+ /* Get the region page with an exclusive latch. */
+ region_pgno = HEAP_REGION_PGNO(dbp, cp->pgno);
+ if ((ret = __memp_fget(mpf, &region_pgno,
+ dbc->thread_info, NULL, DB_MPOOL_DIRTY, &rpage)) != 0)
+ goto err;
+
+ HEAP_SETSPACE(dbp, rpage, cp->pgno - region_pgno - 1, space);
+ }
+
+err: DB_ASSERT(dbp->env, ret != DB_PAGE_NOTFOUND);
+ if (rpage != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, rpage, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (cp->page != NULL) {
+ DISCARD(dbc, cp->page, cp->lock, 1, t_ret);
+ if (ret == 0)
+ ret = t_ret;
+ }
+
+ if (ret == 0 && key != NULL)
+ ret = __db_retcopy(dbp->env, key,
+ &rid, DB_HEAP_RID_SZ, &dbc->rkey->data, &dbc->rkey->ulen);
+
+ return (ret);
+}
+
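+/*
+ * __heapc_split --
+ *	Write a record in split pieces, working from the end of the record
+ * back to the front so each piece can point at the piece after it. If
+ * is_first is set, the first piece is marked HEAP_RECFIRST and carries
+ * the record's total size.
+ */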
+static int
+__heapc_split(dbc, key, data, is_first)
+ DBC *dbc;
+ DBT *key, *data;
+ int is_first;
+{
+ DB *dbp;
+ DBT hdr_dbt, t_data;
+ DB_HEAP_RID rid;
+ DB_MPOOLFILE *mpf;
+ HEAPPG *rpage;
+ HEAPSPLITHDR hdrs;
+ HEAP_CURSOR *cp;
+ db_indx_t indx;
+ db_pgno_t region_pgno;
+ int ret, spacebits, t_ret;
+ u_int32_t buflen, doff, left, size;
+ u_int8_t availbits, *buf;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ cp = (HEAP_CURSOR *)dbc->internal;
+ memset(&hdrs, 0, sizeof(HEAPSPLITHDR));
+ memset(&t_data, 0, sizeof(DBT));
+ hdrs.std_hdr.flags = HEAP_RECSPLIT | HEAP_RECLAST;
+
+ doff = data->doff;
+ rpage = NULL;
+ ret = t_ret = 0;
+ indx = 0;
+ buf = NULL;
+ buflen = 0;
+
+ /*
+ * Write the record to multiple pages, in chunks starting from the end.
+	 * To reconstruct during a get we need the RID of the next chunk, so if
+	 * we work our way from back to front during writing we always know the
+	 * RID of the "next" chunk: it's the chunk we just wrote.
+ */
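+	/*
+	 * For example (illustrative): a record split into pieces A, B
+	 * and C is written C first, then B (whose header points at C),
+	 * then A (whose header points at B); A's RID is what is copied
+	 * back to the caller as the key.
+	 */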
+ t_data.data = (u_int8_t *)data->data + data->size;
+ left = data->size;
+ if (F_ISSET(data, DB_DBT_PARTIAL)) {
+ left += data->doff;
+ }
+ hdrs.tsize = left;
+ while (left > 0) {
+ size = DB_ALIGN(left + sizeof(HEAPSPLITHDR), sizeof(u_int32_t));
+ if (size < sizeof(HEAPSPLITHDR))
+ size = sizeof(HEAPSPLITHDR);
+
+ if (size > HEAP_MAXDATASIZE(dbp))
+ /*
+ * Data won't fit on a single page, find one at least
+ * 33% free.
+ */
+ size = DB_ALIGN(dbp->pgsize / 3, sizeof(u_int32_t));
+ else
+ hdrs.std_hdr.flags |= HEAP_RECFIRST;
+
+ if ((ret = __heap_getpage(dbc, size, &availbits)) != 0)
+ return (ret);
+
+ /*
+ * size is the total number of bytes being written to the page.
+ * The header holds the size of the data being written.
+ */
+ if (F_ISSET(&(hdrs.std_hdr), HEAP_RECFIRST)) {
+ hdrs.std_hdr.size = left;
+ /*
+ * If we're called from heapc_reloc, we are only writing
+ * a piece of the full record and shouldn't set
+ * HEAP_RECFIRST.
+ */
+ if (!is_first)
+ F_CLR(&(hdrs.std_hdr), HEAP_RECFIRST);
+ } else {
+ /*
+ * Figure out how much room is on the page. If we will
+ * have to expand the offset table, account for that.
+ */
+ size = HEAP_FREESPACE(dbp, cp->page);
+ if (NUM_ENT(cp->page) == 0 ||
+ HEAP_FREEINDX(cp->page) > HEAP_HIGHINDX(cp->page))
+ size -= sizeof(db_indx_t);
+ /* Round down to a multiple of 4. */
+ size = DB_ALIGN(
+ size - sizeof(u_int32_t) + 1, sizeof(u_int32_t));
+ DB_ASSERT(dbp->env, size >= sizeof(HEAPSPLITHDR));
+ hdrs.std_hdr.size =
+ (u_int16_t)(size - sizeof(HEAPSPLITHDR));
+ }
+
+ /*
+ * t_data.data points at the end of the data left to write. Now
+ * that we know how much we're going to write to this page, we
+ * can adjust the pointer to point at the start of the data to
+ * be written.
+ *
+ * If DB_DBT_PARTIAL is set, once data->data is exhausted, we
+ * have to pad with data->doff bytes (or as much as can fit on
+ * this page.) left - doff gives the number of bytes to use
+ * from data->data. Once that can't fill t_data, we have to
+ * start padding.
+ */
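+		/*
+		 * Worked example with made-up numbers: with left 10, doff 6
+		 * and room for 8 bytes on the page, 8 - (10 - 6) == 4 bytes
+		 * of zero padding are written, followed by the 4 remaining
+		 * data bytes, and doff drops to 2 for the next (earlier)
+		 * chunk.
+		 */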
+ t_data.data = (u_int8_t *)(t_data.data) - hdrs.std_hdr.size;
+ DB_ASSERT(dbp->env, (F_ISSET(data, DB_DBT_PARTIAL) ||
+ t_data.data >= data->data));
+ t_data.size = hdrs.std_hdr.size;
+ if (F_ISSET(data, DB_DBT_PARTIAL) &&
+ t_data.size > left - doff) {
+ if (buflen < t_data.size) {
+ if (__os_realloc(
+ dbp->env, t_data.size, &buf) != 0)
+ return (ENOMEM);
+ buflen = t_data.size;
+ }
+ /*
+ * We have to figure out how much data remains. left
+ * includes doff, so we need (left - doff) bytes from
+ * data. We also need the amount of padding that can
+ * fit on the page. That's the amount we can fit on the
+ * page minus the bytes we're taking from data.
+ */
+ t_data.data = buf;
+ memset(buf, '\0', t_data.size - left + doff);
+ buf += t_data.size - left + doff;
+ memcpy(buf, data->data, left - doff);
+ doff -= t_data.size - left + doff;
+ buf = t_data.data;
+ }
+ hdr_dbt.data = &hdrs;
+ hdr_dbt.size = sizeof(HEAPSPLITHDR);
+ indx = HEAP_FREEINDX(cp->page);
+
+ /* Log the write. */
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __heap_addrem_log(dbp,
+ dbc->txn, &LSN(cp->page), 0,
+ DB_ADD_HEAP, cp->pgno, (u_int32_t)indx,
+ size, &hdr_dbt, &t_data, &LSN(cp->page))) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(cp->page));
+
+ if ((ret = __heap_pitem(dbc,
+ (PAGE *)cp->page, indx, size, &hdr_dbt, &t_data)) != 0)
+ goto err;
+ F_CLR(&(hdrs.std_hdr), HEAP_RECLAST);
+ left -= hdrs.std_hdr.size;
+
+ /*
+ * Save the rid where we just wrote, this is the "next"
+ * chunk.
+ */
+ hdrs.nextpg = cp->pgno;
+ hdrs.nextindx = indx;
+
+ /* Check whether we need to update the space bitmap. */
+ HEAP_CALCSPACEBITS(dbp,
+ HEAP_FREESPACE(dbp, cp->page), spacebits);
+
+ if (spacebits != availbits) {
+ /* Get the region page with an exclusive latch. */
+ region_pgno = HEAP_REGION_PGNO(dbp, cp->pgno);
+ if ((ret = __memp_fget(mpf, &region_pgno,
+ dbc->thread_info,
+ NULL, DB_MPOOL_DIRTY, &rpage)) != 0)
+ goto err;
+
+ HEAP_SETSPACE(dbp,
+ rpage, cp->pgno - region_pgno - 1, spacebits);
+ ret = __memp_fput(mpf,
+ dbc->thread_info, rpage, dbc->priority);
+ rpage = NULL;
+ if (ret != 0)
+ goto err;
+ }
+
+ }
+
+ rid.pgno = cp->pgno;
+ rid.indx = indx;
+ cp->indx = indx;
+
+err: if (rpage != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, rpage, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (cp->page != NULL) {
+ DISCARD(dbc, cp->page, cp->lock, 1, t_ret);
+ if (ret == 0)
+ ret = t_ret;
+ }
+ if (buf != NULL)
+ __os_free(dbp->env, buf);
+
+ if (ret == 0 && key != NULL)
+ ret = __db_retcopy(dbp->env, key,
+ &rid, DB_HEAP_RID_SZ, &dbc->rkey->data, &dbc->rkey->ulen);
+ DB_ASSERT(dbp->env, ret != DB_PAGE_NOTFOUND);
+ return (ret);
+}
+
+/*
+ * __heap_pitem --
+ * Put an item on a heap page. Copy all bytes from the header (if any)
+ * first and then copy from data.
+ *
+ * PUBLIC: int __heap_pitem __P((DBC *,
+ * PUBLIC: PAGE *, u_int32_t, u_int32_t, DBT *, DBT *));
+ */
+int
+__heap_pitem(dbc, pagep, indx, nbytes, hdr, data)
+ DBC *dbc;
+ PAGE *pagep;
+ u_int32_t indx;
+ u_int32_t nbytes;
+ DBT *hdr, *data;
+{
+ DB *dbp;
+ u_int8_t *buf;
+
+ dbp = dbc->dbp;
+
+ DB_ASSERT(dbp->env, TYPE(pagep) == P_HEAP);
+ DB_ASSERT(dbp->env, IS_DIRTY(pagep));
+ DB_ASSERT(dbp->env, nbytes == DB_ALIGN(nbytes, sizeof(u_int32_t)));
+ DB_ASSERT(dbp->env, DB_ALIGN(((HEAPHDR *)hdr->data)->size,
+ sizeof(u_int32_t)) >= data->size);
+ DB_ASSERT(dbp->env, nbytes >= hdr->size + data->size);
+
+ /*
+ * We're writing data either as a result of DB->put or as a result of
+ * undo-ing a delete. If we're undo-ing a delete we just need to write
+ * the bytes from hdr to the page. Otherwise, we need to construct a
+ * heap header, etc.
+ */
+ HEAP_OFFSETTBL(dbp, pagep)[indx] = HOFFSET(pagep) - nbytes;
+ buf = P_ENTRY(dbp, pagep, indx);
+	DB_ASSERT(dbp->env, buf > (u_int8_t *)&HEAP_OFFSETTBL(dbp, pagep)[indx]);
+
+ if (hdr != NULL) {
+ memcpy(buf, hdr->data, hdr->size);
+ buf += hdr->size;
+ }
+ if (F_ISSET(data, DB_DBT_PARTIAL)) {
+ memset(buf, 0, data->doff);
+ buf += data->doff;
+ }
+ memcpy(buf, data->data, data->size);
+
+ /*
+ * Update data page header. If DEBUG/DIAGNOSTIC is set, the page might
+ * be filled with 0xdb, so we can't just look for a 0 in the offset
+ * table. We used the first available index, so start there and scan
+ * forward. If the table is full, the first available index is the
+ * highest index plus one.
+ */
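+	/*
+	 * Illustrative case: entries at indx 0 and 2 and the new item
+	 * just placed at indx 1. The scan below starts at 1, finds no
+	 * zero offset through HEAP_HIGHINDX (2), and so leaves
+	 * HEAP_FREEINDX at HEAP_HIGHINDX + 1, i.e. 3.
+	 */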
+ if (indx > HEAP_HIGHINDX(pagep)) {
+ if (NUM_ENT(pagep) == 0)
+ HEAP_FREEINDX(pagep) = 0;
+ else if (HEAP_FREEINDX(pagep) >= indx) {
+ if (indx > (u_int32_t)HEAP_HIGHINDX(pagep) + 1)
+ HEAP_FREEINDX(pagep) = HEAP_HIGHINDX(pagep) + 1;
+ else
+ HEAP_FREEINDX(pagep) = indx + 1;
+ }
+ while (++HEAP_HIGHINDX(pagep) < indx)
+			HEAP_OFFSETTBL(dbp, pagep)[HEAP_HIGHINDX(pagep)] = 0;
+ } else {
+ for (; indx <= HEAP_HIGHINDX(pagep); indx++)
+ if (HEAP_OFFSETTBL(dbp, pagep)[indx] == 0)
+ break;
+ HEAP_FREEINDX(pagep) = indx;
+ }
+ HOFFSET(pagep) -= nbytes;
+ NUM_ENT(pagep)++;
+
+ return (0);
+}
+
+/*
+ * __heapc_dup --
+ * Duplicate a heap cursor, such that the new one holds appropriate
+ * locks for the position of the original.
+ *
+ * PUBLIC: int __heapc_dup __P((DBC *, DBC *));
+ */
+int
+__heapc_dup(orig_dbc, new_dbc)
+ DBC *orig_dbc, *new_dbc;
+{
+ HEAP_CURSOR *orig, *new;
+
+ orig = (HEAP_CURSOR *)orig_dbc->internal;
+ new = (HEAP_CURSOR *)new_dbc->internal;
+ new->flags = orig->flags;
+ return (0);
+}
+
+/*
+ * __heapc_gsplit --
+ * Get a heap split record. The page pointed to by the cursor must
+ * be the first segment of this record.
+ *
+ * PUBLIC: int __heapc_gsplit __P((DBC *,
+ * PUBLIC: DBT *, void **, u_int32_t *));
+ */
+int
+__heapc_gsplit(dbc, dbt, bpp, bpsz)
+ DBC *dbc;
+ DBT *dbt;
+ void **bpp;
+ u_int32_t *bpsz;
+{
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ DB_HEAP_RID rid;
+ DB_LOCK data_lock;
+ HEAP_CURSOR *cp;
+ ENV *env;
+ HEAPPG *dpage;
+ HEAPSPLITHDR *hdr;
+ db_indx_t bytes;
+ u_int32_t curoff, needed, start, tlen;
+ u_int8_t *p, *src;
+ int putpage, ret, t_ret;
+
+ LOCK_INIT(data_lock);
+ dbp = dbc->dbp;
+ env = dbp->env;
+ mpf = dbp->mpf;
+ cp = (HEAP_CURSOR *)dbc->internal;
+ putpage = FALSE;
+ ret = 0;
+
+ /*
+ * We should have first page, locked already in cursor. Get the
+ * record id out of the cursor and set up local variables.
+ */
+ DB_ASSERT(env, cp->page != NULL);
+ rid.pgno = cp->pgno;
+ rid.indx = cp->indx;
+ dpage = cp->page;
+ hdr = (HEAPSPLITHDR *)P_ENTRY(dbp, dpage, rid.indx);
+ DB_ASSERT(env, hdr->tsize != 0);
+ tlen = hdr->tsize;
+
+ /*
+	 * If we are doing a partial retrieval, figure out how much we are
+ * actually going to get.
+ */
+ if (F_ISSET(dbt, DB_DBT_PARTIAL)) {
+ start = dbt->doff;
+ if (start > tlen)
+ needed = 0;
+ else if (dbt->dlen > tlen - start)
+ needed = tlen - start;
+ else
+ needed = dbt->dlen;
+ } else {
+ start = 0;
+ needed = tlen;
+ }
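+	/*
+	 * E.g., with made-up numbers: for a 100-byte record, doff 90
+	 * and dlen 20 yield needed = 100 - 90 = 10; doff 120 yields
+	 * needed = 0 and we return an empty dbt below.
+	 */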
+
+ /*
+ * If the caller has not requested any data, return success. This
+ * "early-out" also avoids setting up the streaming optimization when
+ * no page would be retrieved. If it were removed, the streaming code
+ * should only initialize when needed is not 0.
+ */
+ if (needed == 0) {
+ dbt->size = 0;
+ return (0);
+ }
+
+ /*
+ * Check if the buffer is big enough; if it is not and we are
+ * allowed to malloc space, then we'll malloc it. If we are
+ * not (DB_DBT_USERMEM), then we'll set the dbt and return
+ * appropriately.
+ */
+ if (F_ISSET(dbt, DB_DBT_USERCOPY))
+ goto skip_alloc;
+
+ /* Allocate any necessary memory. */
+ if (F_ISSET(dbt, DB_DBT_USERMEM)) {
+ if (needed > dbt->ulen) {
+ dbt->size = needed;
+ return (DB_BUFFER_SMALL);
+ }
+ } else if (F_ISSET(dbt, DB_DBT_MALLOC)) {
+ if ((ret = __os_umalloc(env, needed, &dbt->data)) != 0)
+ return (ret);
+ } else if (F_ISSET(dbt, DB_DBT_REALLOC)) {
+ if ((ret = __os_urealloc(env, needed, &dbt->data)) != 0)
+ return (ret);
+ } else if (bpsz != NULL && (*bpsz == 0 || *bpsz < needed)) {
+ if ((ret = __os_realloc(env, needed, bpp)) != 0)
+ return (ret);
+ *bpsz = needed;
+ dbt->data = *bpp;
+ } else if (bpp != NULL)
+ dbt->data = *bpp;
+ else {
+ DB_ASSERT(env,
+ F_ISSET(dbt,
+ DB_DBT_USERMEM | DB_DBT_MALLOC | DB_DBT_REALLOC) ||
+ bpsz != NULL || bpp != NULL);
+ return (DB_BUFFER_SMALL);
+ }
+
+skip_alloc:
+ /*
+ * Go through each of the pieces, copying the data on each one
+ * into the buffer. Never copy more than the total data length.
+	 * We start with the page that is currently pointed to by
+	 * the cursor.
+ */
+ curoff = 0;
+ dbt->size = needed;
+ for (p = dbt->data; needed > 0;) {
+ /* Check if we need any bytes from this page */
+ if (curoff + hdr->std_hdr.size >= start) {
+ bytes = hdr->std_hdr.size;
+ src = (u_int8_t *)hdr +
+ P_TO_UINT16(sizeof(HEAPSPLITHDR));
+ if (start > curoff) {
+ src += start - curoff;
+ bytes -= start - curoff;
+ }
+ if (bytes > needed)
+ bytes = needed;
+ if (F_ISSET(dbt, DB_DBT_USERCOPY)) {
+ /*
+ * The offset into the DBT is the total size
+ * less the amount of data still needed. Care
+ * needs to be taken if doing a partial copy
+ * beginning at an offset other than 0.
+ */
+ if ((ret = env->dbt_usercopy(
+ dbt, dbt->size - needed,
+ src, bytes, DB_USERCOPY_SETDATA)) != 0) {
+ if (putpage)
+ (void)__memp_fput(
+ mpf, dbc->thread_info,
+ dpage, dbp->priority);
+
+ return (ret);
+ }
+ } else
+ memcpy(p, src, bytes);
+ p += bytes;
+ needed -= bytes;
+ }
+ curoff += hdr->std_hdr.size;
+
+ /* Find next record piece as long as it exists */
+ if (!F_ISSET((HEAPHDR *)hdr, HEAP_RECLAST)) {
+ rid.pgno = hdr->nextpg;
+ rid.indx = hdr->nextindx;
+
+ /*
+ * First pass through here, we are using the
+ * page pointed to by the cursor, and this page
+			 * will get put when the cursor is closed.
+ * Only pages specifically gotten in this loop
+ * need to be put back.
+ */
+ if (putpage) {
+ if ((ret = __memp_fput(mpf, dbc->thread_info,
+				    dpage, dbp->priority)) != 0)
+ goto err;
+ dpage = NULL;
+ if ((ret = __TLPUT(dbc, data_lock)) != 0)
+ goto err;
+ }
+
+ if ((ret = __db_lget(dbc, 0, rid.pgno,
+ DB_LOCK_READ, 0, &data_lock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &rid.pgno,
+ dbc->thread_info, dbc->txn, 0, &dpage)) != 0)
+ goto err;
+ hdr = (HEAPSPLITHDR *)P_ENTRY(dbp, dpage, rid.indx);
+ putpage = TRUE;
+
+ /*
+ * If we have the last piece of this record and we're
+ * reading the entire record, then what we need should
+ * equal what is remaining.
+ */
+ if (F_ISSET((HEAPHDR *)hdr, HEAP_RECLAST) &&
+ !F_ISSET(dbt, DB_DBT_PARTIAL) &&
+ (hdr->std_hdr.size != needed)) {
+ __db_errx(env, DB_STR_A("1167",
+ "Incorrect record size in header: %s: rid %lu.%lu",
+ "%s %lu %lu"), dbc->dbp->fname,
+ (u_long)(cp->pgno), (u_long)(cp->indx));
+ ret = __env_panic(env, DB_RUNRECOVERY);
+ goto err;
+ }
+ }
+ }
+
+err: DB_ASSERT(dbp->env, ret != DB_PAGE_NOTFOUND);
+ if (putpage && dpage != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, dpage, dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __TLPUT(dbc, data_lock)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __heapc_refresh --
+ *	Do the proper setup for cursor reuse.
+ *
+ * PUBLIC: int __heapc_refresh __P((DBC *));
+ */
+int
+__heapc_refresh(dbc)
+ DBC *dbc;
+{
+ HEAP_CURSOR *cp;
+
+ cp = (HEAP_CURSOR *)dbc->internal;
+
+ LOCK_INIT(cp->lock);
+ cp->lock_mode = DB_LOCK_NG;
+ cp->flags = 0;
+
+ return (0);
+}
diff --git a/src/heap/heap.src b/src/heap/heap.src
new file mode 100644
index 00000000..47bd4bb0
--- /dev/null
+++ b/src/heap/heap.src
@@ -0,0 +1,101 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+DBPRIVATE
+PREFIX __heap
+
+INCLUDE #include "db_int.h"
+INCLUDE #include "dbinc/crypto.h"
+INCLUDE #include "dbinc/db_page.h"
+INCLUDE #include "dbinc/db_am.h"
+INCLUDE #include "dbinc/heap.h"
+INCLUDE #include "dbinc/txn.h"
+INCLUDE
+
+/*
+ * addrem -- Add or remove an entry from a heap db.
+ *
+ * opcode: identifies if this is an add or delete.
+ * fileid: file identifier of the file being modified.
+ * pgno: page number.
+ * indx: location at which to insert or delete.
+ * nbytes: number of bytes added/removed to/from the page.
+ * hdr: header for the data item.
+ * dbt: data that is to be added or deleted.
+ * pagelsn: former lsn of the page.
+ */
+BEGIN addrem 49 151
+ARG opcode u_int32_t lu
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+ARG indx u_int32_t lu
+ARG nbytes u_int32_t lu
+DBT hdr DBT s
+DBT dbt DBT s
+POINTER pagelsn DB_LSN * lu
+END
+
+/*
+ * pg_alloc: used to record allocating a new page in a heap database.
+ *
+ * meta_lsn: the lsn of the metadata page.
+ * meta_pgno: the metadata page.
+ * pgno: the page allocated.
+ * ptype: the type of the page allocated.
+ * last_pgno: the last page in the file after this op.
+ */
+BEGIN pg_alloc 49 152
+DB fileid int32_t ld
+POINTER meta_lsn DB_LSN * lu
+ARG meta_pgno db_pgno_t lu
+ARG pgno db_pgno_t lu
+ARG ptype u_int32_t lu
+ARG last_pgno db_pgno_t lu
+END
+
+/*
+ * trunc_meta -- Used to record truncation of a heap database's meta page
+ *
+ * fileid: file identifier of the file being modified.
+ * pgno: page number.
+ * last_pgno: value of last_pgno on meta page
+ * key_count: value of key_count on meta page
+ * record_count: value of record_count on meta page
+ * curregion: value of curregion on meta page
+ * nregions: value of nregions on meta page
+ * pagelsn: former lsn of the page.
+ */
+BEGIN trunc_meta 49 153
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+ARG last_pgno u_int32_t lu
+ARG key_count u_int32_t lu
+ARG record_count u_int32_t lu
+ARG curregion u_int32_t lu
+ARG nregions u_int32_t lu
+POINTER pagelsn DB_LSN * lu
+END
+
+/*
+ * trunc_page -- Used to record truncation of a heap database's region page
+ *
+ * fileid: file identifier of the file being modified.
+ * pgno: page number.
+ * old_data: the contents of the page before truncation
+ * is_region: whether the truncated page is a region page
+ * pagelsn: former lsn of the page.
+ */
+BEGIN trunc_page 49 154
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+DBT old_data DBT s
+ARG is_region u_int32_t lu
+POINTER pagelsn DB_LSN * lu
+END
+
+
diff --git a/src/heap/heap_auto.c b/src/heap/heap_auto.c
new file mode 100644
index 00000000..1cb705f4
--- /dev/null
+++ b/src/heap/heap_auto.c
@@ -0,0 +1,73 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/heap.h"
+#include "dbinc/txn.h"
+
+DB_LOG_RECSPEC __heap_addrem_desc[] = {
+ {LOGREC_ARG, SSZ(__heap_addrem_args, opcode), "opcode", "%lu"},
+ {LOGREC_DB, SSZ(__heap_addrem_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__heap_addrem_args, pgno), "pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__heap_addrem_args, indx), "indx", "%lu"},
+ {LOGREC_ARG, SSZ(__heap_addrem_args, nbytes), "nbytes", "%lu"},
+ {LOGREC_DBT, SSZ(__heap_addrem_args, hdr), "hdr", ""},
+ {LOGREC_DBT, SSZ(__heap_addrem_args, dbt), "dbt", ""},
+ {LOGREC_POINTER, SSZ(__heap_addrem_args, pagelsn), "pagelsn", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __heap_pg_alloc_desc[] = {
+ {LOGREC_DB, SSZ(__heap_pg_alloc_args, fileid), "fileid", ""},
+ {LOGREC_POINTER, SSZ(__heap_pg_alloc_args, meta_lsn), "meta_lsn", ""},
+ {LOGREC_ARG, SSZ(__heap_pg_alloc_args, meta_pgno), "meta_pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__heap_pg_alloc_args, pgno), "pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__heap_pg_alloc_args, ptype), "ptype", "%lu"},
+ {LOGREC_ARG, SSZ(__heap_pg_alloc_args, last_pgno), "last_pgno", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __heap_trunc_meta_desc[] = {
+ {LOGREC_DB, SSZ(__heap_trunc_meta_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__heap_trunc_meta_args, pgno), "pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__heap_trunc_meta_args, last_pgno), "last_pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__heap_trunc_meta_args, key_count), "key_count", "%lu"},
+ {LOGREC_ARG, SSZ(__heap_trunc_meta_args, record_count), "record_count", "%lu"},
+ {LOGREC_ARG, SSZ(__heap_trunc_meta_args, curregion), "curregion", "%lu"},
+ {LOGREC_ARG, SSZ(__heap_trunc_meta_args, nregions), "nregions", "%lu"},
+ {LOGREC_POINTER, SSZ(__heap_trunc_meta_args, pagelsn), "pagelsn", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __heap_trunc_page_desc[] = {
+ {LOGREC_DB, SSZ(__heap_trunc_page_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__heap_trunc_page_args, pgno), "pgno", "%lu"},
+ {LOGREC_DBT, SSZ(__heap_trunc_page_args, old_data), "old_data", ""},
+ {LOGREC_ARG, SSZ(__heap_trunc_page_args, is_region), "is_region", "%lu"},
+ {LOGREC_POINTER, SSZ(__heap_trunc_page_args, pagelsn), "pagelsn", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+/*
+ * PUBLIC: int __heap_init_recover __P((ENV *, DB_DISTAB *));
+ */
+int
+__heap_init_recover(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __heap_addrem_recover, DB___heap_addrem)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __heap_pg_alloc_recover, DB___heap_pg_alloc)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __heap_trunc_meta_recover, DB___heap_trunc_meta)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __heap_trunc_page_recover, DB___heap_trunc_page)) != 0)
+ return (ret);
+ return (0);
+}
diff --git a/src/heap/heap_autop.c b/src/heap/heap_autop.c
new file mode 100644
index 00000000..b767203b
--- /dev/null
+++ b/src/heap/heap_autop.c
@@ -0,0 +1,105 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+
+#ifdef HAVE_HEAP
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/heap.h"
+#include "dbinc/txn.h"
+
+/*
+ * PUBLIC: int __heap_addrem_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__heap_addrem_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__heap_addrem", __heap_addrem_desc, info));
+}
+
+/*
+ * PUBLIC: int __heap_pg_alloc_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__heap_pg_alloc_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__heap_pg_alloc", __heap_pg_alloc_desc, info));
+}
+
+/*
+ * PUBLIC: int __heap_trunc_meta_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__heap_trunc_meta_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__heap_trunc_meta", __heap_trunc_meta_desc, info));
+}
+
+/*
+ * PUBLIC: int __heap_trunc_page_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__heap_trunc_page_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__heap_trunc_page", __heap_trunc_page_desc, info));
+}
+
+/*
+ * PUBLIC: int __heap_init_print __P((ENV *, DB_DISTAB *));
+ */
+int
+__heap_init_print(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __heap_addrem_print, DB___heap_addrem)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __heap_pg_alloc_print, DB___heap_pg_alloc)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __heap_trunc_meta_print, DB___heap_trunc_meta)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __heap_trunc_page_print, DB___heap_trunc_page)) != 0)
+ return (ret);
+ return (0);
+}
+#endif /* HAVE_HEAP */
diff --git a/src/heap/heap_backup.c b/src/heap/heap_backup.c
new file mode 100644
index 00000000..4588b0ba
--- /dev/null
+++ b/src/heap/heap_backup.c
@@ -0,0 +1,66 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2011, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/heap.h"
+#include "dbinc/mp.h"
+
+/*
+ * __heap_backup --
+ * Copy a heap database file coordinated with mpool.
+ *
+ * PUBLIC: int __heap_backup __P((DB_ENV *, DB *,
+ * PUBLIC: DB_THREAD_INFO *, DB_FH *, void *, u_int32_t));
+ */
+int
+__heap_backup(dbenv, dbp, ip, fp, handle, flags)
+ DB_ENV *dbenv;
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_FH *fp;
+ void *handle;
+ u_int32_t flags;
+{
+ HEAPPG *p;
+ db_pgno_t chunk_pgno, high_pgno, max_pgno;
+ int ret;
+
+ max_pgno = dbp->mpf->mfp->last_pgno;
+ chunk_pgno = FIRST_HEAP_RPAGE;
+
+ for (;;) {
+ /*
+ * Get the chunk page and the chunk's highest used page.
+		 * Immediately return the page; it makes error handling easier.
+ */
+ if ((ret = __memp_fget(dbp->mpf,
+ &chunk_pgno, ip, NULL, 0, &p)) != 0)
+ break;
+ high_pgno = p->high_pgno;
+ if ((ret = __memp_fput(dbp->mpf,
+ ip, p, DB_PRIORITY_UNCHANGED)) != 0)
+ break;
+
+ /*
+		 * Back up all the used pages in this chunk, starting at the
+		 * chunk page. If this is the very first chunk, be sure to
+		 * back up the db meta page, too.
+ */
+ if ((ret = __memp_backup_mpf(dbenv->env, dbp->mpf, ip,
+ chunk_pgno == FIRST_HEAP_RPAGE ? 0 : chunk_pgno,
+ high_pgno, fp, handle, flags)) != 0)
+ break;
+ chunk_pgno += HEAP_REGION_SIZE(dbp) + 1;
+ if (chunk_pgno > max_pgno)
+ break;
+ }
+
+ return (ret);
+}
diff --git a/src/heap/heap_conv.c b/src/heap/heap_conv.c
new file mode 100644
index 00000000..9f432d13
--- /dev/null
+++ b/src/heap/heap_conv.c
@@ -0,0 +1,93 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_swap.h"
+#include "dbinc/heap.h"
+
+/*
+ * __heap_pgin --
+ * Convert host-specific page layout from the host-independent format
+ * stored on disk.
+ *
+ * PUBLIC: int __heap_pgin __P((DB *, db_pgno_t, void *, DBT *));
+ */
+int
+__heap_pgin(dbp, pg, pp, cookie)
+ DB *dbp;
+ db_pgno_t pg;
+ void *pp;
+ DBT *cookie;
+{
+ DB_PGINFO *pginfo;
+ PAGE *h;
+
+ pginfo = (DB_PGINFO *)cookie->data;
+ if (!F_ISSET(pginfo, DB_AM_SWAP))
+ return (0);
+
+ h = pp;
+ return (TYPE(h) == P_HEAPMETA ? __heap_mswap(dbp->env, pp) :
+ __db_byteswap(dbp, pg, pp, pginfo->db_pagesize, 1));
+}
+
+/*
+ * __heap_pgout --
+ * Convert host-specific page layout to the host-independent format
+ * stored on disk.
+ *
+ * PUBLIC: int __heap_pgout __P((DB *, db_pgno_t, void *, DBT *));
+ */
+int
+__heap_pgout(dbp, pg, pp, cookie)
+ DB *dbp;
+ db_pgno_t pg;
+ void *pp;
+ DBT *cookie;
+{
+ DB_PGINFO *pginfo;
+ PAGE *h;
+
+ pginfo = (DB_PGINFO *)cookie->data;
+ if (!F_ISSET(pginfo, DB_AM_SWAP))
+ return (0);
+
+ h = pp;
+ return (TYPE(h) == P_HEAPMETA ? __heap_mswap(dbp->env, pp) :
+ __db_byteswap(dbp, pg, pp, pginfo->db_pagesize, 0));
+}
+
+/*
+ * __heap_mswap --
+ * Swap the bytes on the heap metadata page.
+ *
+ * PUBLIC: int __heap_mswap __P((ENV *, PAGE *));
+ */
+int
+__heap_mswap(env, pg)
+ ENV *env;
+ PAGE *pg;
+{
+ u_int8_t *p;
+
+ COMPQUIET(env, NULL);
+
+ __db_metaswap(pg);
+ p = (u_int8_t *)pg + sizeof(DBMETA);
+
+ SWAP32(p); /* curregion */
+ SWAP32(p); /* nregions */
+ SWAP32(p); /* gbytes */
+ SWAP32(p); /* bytes */
+ SWAP32(p); /* region_size */
+ p += 92 * sizeof(u_int32_t); /* unused */
+ SWAP32(p); /* crypto_magic */
+
+ return (0);
+}
diff --git a/src/heap/heap_method.c b/src/heap/heap_method.c
new file mode 100644
index 00000000..f938b5e7
--- /dev/null
+++ b/src/heap/heap_method.c
@@ -0,0 +1,168 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/heap.h"
+
+/*
+ * __heap_db_create --
+ * Heap specific initialization of the DB structure.
+ *
+ * PUBLIC: int __heap_db_create __P((DB *));
+ */
+int
+__heap_db_create(dbp)
+ DB *dbp;
+{
+ HEAP *h;
+ int ret;
+
+ if ((ret = __os_calloc(dbp->env, 1, sizeof(HEAP), &h)) != 0)
+ return (ret);
+ dbp->heap_internal = h;
+ h->region_size = 0;
+
+ dbp->get_heapsize = __heap_get_heapsize;
+ dbp->get_heap_regionsize = __heap_get_heap_regionsize;
+ dbp->set_heapsize = __heap_set_heapsize;
+ dbp->set_heap_regionsize = __heap_set_heap_regionsize;
+
+ return (0);
+}
+
+/*
+ * __heap_db_close --
+ * Heap specific discard of the DB structure.
+ *
+ * PUBLIC: int __heap_db_close __P((DB *));
+ */
+int
+__heap_db_close(dbp)
+ DB *dbp;
+{
+ HEAP *h;
+ int ret;
+
+ ret = 0;
+ if ((h = dbp->heap_internal) == NULL)
+ return (0);
+
+ __os_free(dbp->env, h);
+ dbp->heap_internal = NULL;
+
+ return (0);
+}
+
+/*
+ * __heap_get_heapsize --
+ * Get the initial size of the heap.
+ *
+ * PUBLIC: int __heap_get_heapsize __P((DB *, u_int32_t *, u_int32_t *));
+ */
+int
+__heap_get_heapsize(dbp, gbytes, bytes)
+ DB *dbp;
+ u_int32_t *gbytes, *bytes;
+{
+ HEAP *h;
+
+ DB_ILLEGAL_METHOD(dbp, DB_OK_HEAP);
+
+ h = dbp->heap_internal;
+ *gbytes = h->gbytes;
+ *bytes = h->bytes;
+
+ return (0);
+}
+
+/*
+ * __heap_get_heap_regionsize --
+ * Get the region size of the heap.
+ *
+ * PUBLIC: int __heap_get_heap_regionsize __P((DB *, u_int32_t *));
+ */
+int
+__heap_get_heap_regionsize(dbp, npages)
+ DB *dbp;
+ u_int32_t *npages;
+{
+ HEAP *h;
+
+ DB_ILLEGAL_METHOD(dbp, DB_OK_HEAP);
+
+ h = dbp->heap_internal;
+ *npages = h->region_size;
+
+ return (0);
+}
+
+/*
+ * __heap_set_heapsize --
+ * Set the initial size of the heap.
+ *
+ * PUBLIC: int __heap_set_heapsize __P((DB *, u_int32_t, u_int32_t, u_int32_t));
+ */
+int
+__heap_set_heapsize(dbp, gbytes, bytes, flags)
+ DB *dbp;
+ u_int32_t gbytes, bytes, flags;
+{
+ HEAP *h;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_heapsize");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_HEAP);
+
+ COMPQUIET(flags, 0);
+ h = dbp->heap_internal;
+ h->gbytes = gbytes;
+ h->bytes = bytes;
+
+ return (0);
+}
+
+/*
+ * __heap_set_heap_regionsize --
+ * Set the region size of the heap.
+ *
+ * PUBLIC: int __heap_set_heap_regionsize __P((DB *, u_int32_t));
+ */
+int
+__heap_set_heap_regionsize(dbp, npages)
+ DB *dbp;
+ u_int32_t npages;
+{
+ HEAP *h;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_heap_regionsize");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_HEAP);
+
+ if (npages == 0) {
+ __db_errx(dbp->env, DB_STR("1168", "region size may not be 0"));
+ return (EINVAL);
+ }
+
+ h = dbp->heap_internal;
+ h->region_size = npages;
+
+ return (0);
+}
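+
+/*
+ * Example (a sketch): both sizing methods must be called before DB->open.
+ * A fixed 2GB heap with 100-page regions might be configured as:
+ *
+ *	if ((ret = dbp->set_heapsize(dbp, 2, 0, 0)) != 0 ||
+ *	    (ret = dbp->set_heap_regionsize(dbp, 100)) != 0)
+ *		goto err;
+ */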
+
+/*
+ * __heap_exist --
+ *	Test whether Heap support is compiled into the library; used by the
+ *	Perl interface.
+ *
+ * PUBLIC: int __heap_exist __P((void));
+ */
+int
+__heap_exist()
+{
+ return (1);
+}
diff --git a/src/heap/heap_open.c b/src/heap/heap_open.c
new file mode 100644
index 00000000..6827450d
--- /dev/null
+++ b/src/heap/heap_open.c
@@ -0,0 +1,439 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_swap.h"
+#include "dbinc/fop.h"
+#include "dbinc/heap.h"
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+
+static void __heap_init_meta __P((DB *, HEAPMETA *, db_pgno_t, DB_LSN*));
+
+/*
+ * __heap_open --
+ * Open a heap.
+ *
+ * PUBLIC: int __heap_open __P((DB *, DB_THREAD_INFO *,
+ * PUBLIC: DB_TXN *, const char *, db_pgno_t, u_int32_t));
+ */
+int
+__heap_open(dbp, ip, txn, name, base_pgno, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name;
+ db_pgno_t base_pgno;
+ u_int32_t flags;
+{
+ HEAP *h;
+ db_pgno_t npgs;
+ int ret;
+
+ h = (HEAP *)dbp->heap_internal;
+ COMPQUIET(name, NULL);
+
+ ret = __heap_read_meta(dbp, ip, txn, base_pgno, flags);
+
+ if (h->gbytes != 0 || h->bytes != 0) {
+ /*
+ * We don't have to worry about rounding with gbytes, as pgsize
+		 * We don't have to worry about rounding with gbytes, as pgsize
+		 * is always a power of 2, but we round up if bytes isn't
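+		 * For example (a sketch): gbytes = 1, bytes = 600 with a
+		 * 4096-byte page gives npgs = 262144 + 1 = 262145, so
+		 * maxpgno is 262144.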
+ */
+ npgs = (db_pgno_t)(h->gbytes * (GIGABYTE / dbp->pgsize));
+		npgs += (db_pgno_t)((h->bytes + dbp->pgsize - 1) / dbp->pgsize);
+ h->maxpgno = npgs - 1;
+ if (h->maxpgno < FIRST_HEAP_DPAGE) {
+ __db_errx(dbp->env,
+ "requested database size is too small");
+ return (EINVAL);
+ }
+ } else
+		/* If not a fixed size heap, set maxpgno to its maximum value. */
+ h->maxpgno = UINT32_MAX;
+
+ return (ret);
+}
+
+/*
+ * __heap_metachk --
+ *	Check the heap metadata page against the database handle.
+ *
+ * PUBLIC: int __heap_metachk __P((DB *, const char *, HEAPMETA *));
+ */
+int
+__heap_metachk(dbp, name, hm)
+ DB *dbp;
+ const char *name;
+ HEAPMETA *hm;
+{
+ ENV *env;
+ HEAP *h;
+ int ret;
+ u_int32_t vers;
+
+ env = dbp->env;
+ h = (HEAP *)dbp->heap_internal;
+
+ /*
+ * At this point, all we know is that the magic number is for a Heap.
+	 * Check the version; the database may be out of date.
+ */
+ vers = hm->dbmeta.version;
+ if (F_ISSET(dbp, DB_AM_SWAP))
+ M_32_SWAP(vers);
+ switch (vers) {
+ case 1:
+ break;
+ default:
+ __db_errx(env,
+ "%s: unsupported heap version: %lu", name, (u_long)vers);
+ return (EINVAL);
+ }
+
+ /* Swap the page if needed. */
+ if (F_ISSET(dbp, DB_AM_SWAP) &&
+ (ret = __heap_mswap(env, (PAGE *)hm)) != 0)
+ return (ret);
+
+ /* Check application info against metadata info. */
+ if (h->gbytes != 0 || h->bytes != 0)
+ if (h->gbytes != hm->gbytes || h->bytes != hm->bytes) {
+ __db_errx(env, DB_STR_A("1155",
+ "%s: specified heap size does not match size set in database",
+ "%s"), name);
+ return (EINVAL);
+ }
+
+ /* Set the page size. */
+ dbp->pgsize = hm->dbmeta.pagesize;
+
+ /* Copy the file's ID. */
+ memcpy(dbp->fileid, hm->dbmeta.uid, DB_FILE_ID_LEN);
+
+ return (0);
+}
+
+/*
+ * __heap_read_meta --
+ * Read the meta page and set up the internal structure.
+ *
+ * PUBLIC: int __heap_read_meta __P((DB *,
+ * PUBLIC: DB_THREAD_INFO *, DB_TXN *, db_pgno_t, u_int32_t));
+ */
+int
+__heap_read_meta(dbp, ip, txn, meta_pgno, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ db_pgno_t meta_pgno;
+ u_int32_t flags;
+{
+ DBC *dbc;
+ DB_LOCK metalock;
+ DB_MPOOLFILE *mpf;
+ HEAPMETA *meta;
+ HEAP *h;
+ int ret, t_ret;
+
+ COMPQUIET(flags, 0);
+
+ meta = NULL;
+ h = dbp->heap_internal;
+ LOCK_INIT(metalock);
+ mpf = dbp->mpf;
+ ret = 0;
+
+ /* Get a cursor. */
+ if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0)
+ return (ret);
+
+ /* Get the metadata page. */
+ if ((ret =
+ __db_lget(dbc, 0, meta_pgno, DB_LOCK_READ, 0, &metalock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &meta_pgno, ip, dbc->txn, 0, &meta)) != 0)
+ goto err;
+
+ /*
+ * If the magic number is set, the heap has been created. Correct
+ * any fields that may not be right. Note, all of the local flags
+ * were set by DB->open.
+ *
+ * Otherwise, we'd better be in recovery or abort, in which case the
+ * metadata page will be created/initialized elsewhere.
+ */
+ if (meta->dbmeta.magic == DB_HEAPMAGIC) {
+ h->curregion = meta->curregion;
+ h->curpgindx = 0;
+ h->gbytes = meta->gbytes;
+ h->bytes = meta->bytes;
+ h->region_size = meta->region_size;
+
+ if (PGNO(meta) == PGNO_BASE_MD && !F_ISSET(dbp, DB_AM_RECOVER))
+ __memp_set_last_pgno(mpf, meta->dbmeta.last_pgno);
+ } else {
+ DB_ASSERT(dbp->env,
+ IS_RECOVERING(dbp->env) || F_ISSET(dbp, DB_AM_RECOVER));
+ }
+
+err: /* Put the metadata page back. */
+ if (meta != NULL && (t_ret = __memp_fput(mpf,
+ ip, meta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __LPUT(dbc, metalock)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __heap_new_file --
+ * Create the necessary pages to begin a new database file.
+ *
+ * PUBLIC: int __heap_new_file __P((DB *,
+ * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DB_FH *, const char *));
+ */
+int
+__heap_new_file(dbp, ip, txn, fhp, name)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ DB_FH *fhp;
+ const char *name;
+{
+ DBT pdbt;
+ DB_LSN lsn;
+ DB_MPOOLFILE *mpf;
+ DB_PGINFO pginfo;
+ ENV *env;
+ HEAP *h;
+ HEAPMETA *meta;
+ HEAPPG *region;
+ db_pgno_t pgno;
+ int ret, t_ret;
+ u_int32_t max_size;
+ void *buf;
+
+ env = dbp->env;
+ mpf = dbp->mpf;
+ buf = NULL;
+ h = (HEAP *)dbp->heap_internal;
+ max_size = HEAP_REGION_COUNT(dbp, dbp->pgsize);
+
+ if (h->region_size == 0)
+ h->region_size = HEAP_DEFAULT_REGION_MAX(dbp) > max_size ?
+ max_size : HEAP_DEFAULT_REGION_MAX(dbp);
+ else if (h->region_size > max_size) {
+ __db_errx(dbp->env, DB_STR_A("1169",
+ "region size may not be larger than %lu",
+ "%lu"), (u_long)max_size);
+ return (EINVAL);
+ }
+
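+	/*
+	 * Two creation paths follow: a purely in-memory database builds its
+	 * meta-data and first region pages directly in the buffer pool,
+	 * while an on-disk database writes the initial pages through
+	 * __fop_write so that the file creation itself is logged and
+	 * recoverable.
+	 */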
+ if (F_ISSET(dbp, DB_AM_INMEM)) {
+ /* Build the meta-data page. */
+ pgno = PGNO_BASE_MD;
+ if ((ret = __memp_fget(mpf, &pgno,
+ ip, txn, DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &meta)) != 0)
+ return (ret);
+ LSN_NOT_LOGGED(lsn);
+ __heap_init_meta(dbp, meta, PGNO_BASE_MD, &lsn);
+ ret = __db_log_page(dbp, txn, &lsn, pgno, (PAGE *)meta);
+ if ((t_ret =
+ __memp_fput(mpf, ip, meta, dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ meta = NULL;
+ if (ret != 0)
+ goto err;
+
+ /* Build the first region page. */
+ pgno = 1;
+ if ((ret = __memp_fget(mpf, &pgno,
+ ip, txn, DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &region)) != 0)
+ goto err;
+ memset(region, 0, dbp->pgsize);
+
+ P_INIT(region,
+ dbp->pgsize, 1, PGNO_INVALID, PGNO_INVALID, 0, P_IHEAP);
+ LSN_NOT_LOGGED(region->lsn);
+ ret = __db_log_page(
+ dbp, txn, &region->lsn, pgno, (PAGE *)region);
+ if ((t_ret = __memp_fput(
+ mpf, ip, region, dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ region = NULL;
+ if (ret != 0)
+ goto err;
+ } else {
+ memset(&pdbt, 0, sizeof(pdbt));
+
+ /* Build the meta-data page. */
+ pginfo.db_pagesize = dbp->pgsize;
+ pginfo.flags =
+ F_ISSET(dbp, (DB_AM_CHKSUM | DB_AM_ENCRYPT | DB_AM_SWAP));
+ pginfo.type = dbp->type;
+ pdbt.data = &pginfo;
+ pdbt.size = sizeof(pginfo);
+ if ((ret = __os_calloc(env, 1, dbp->pgsize, &buf)) != 0)
+ return (ret);
+ meta = (HEAPMETA *)buf;
+ LSN_NOT_LOGGED(lsn);
+ __heap_init_meta(dbp, meta, PGNO_BASE_MD, &lsn);
+ if ((ret =
+ __db_pgout(dbp->dbenv, PGNO_BASE_MD, meta, &pdbt)) != 0)
+ goto err;
+ if ((ret = __fop_write(env, txn, name, dbp->dirname,
+ DB_APP_DATA, fhp,
+ dbp->pgsize, 0, 0, buf, dbp->pgsize, 1, F_ISSET(
+ dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0)) != 0)
+ goto err;
+ meta = NULL;
+
+ /* Build the first region page */
+ memset(buf, 0, dbp->pgsize);
+ region = (HEAPPG *)buf;
+ P_INIT(region,
+ dbp->pgsize, 1, PGNO_INVALID, PGNO_INVALID, 0, P_IHEAP);
+ LSN_NOT_LOGGED(region->lsn);
+ if ((ret =
+ __db_pgout(dbp->dbenv, region->pgno, region, &pdbt)) != 0)
+ goto err;
+ if ((ret =
+ __fop_write(env, txn, name, dbp->dirname, DB_APP_DATA,
+ fhp, dbp->pgsize, 1, 0, buf, dbp->pgsize, 1, F_ISSET(
+ dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0)) != 0)
+ goto err;
+ region = NULL;
+ }
+
+err: if (buf != NULL)
+ __os_free(env, buf);
+ return (ret);
+}
+
+/*
+ * __heap_create_region --
+ *	Create a region page.
+ *
+ * PUBLIC: int __heap_create_region __P((DBC *, db_pgno_t));
+ */
+int
+__heap_create_region(dbc, pgno)
+ DBC *dbc;
+ db_pgno_t pgno;
+{
+ DB *dbp;
+ DB_LOCK meta_lock;
+ DB_MPOOLFILE *mpf;
+ HEAPMETA *meta;
+ HEAPPG *region;
+ db_pgno_t meta_pgno;
+ int ret, t_ret;
+
+ LOCK_INIT(meta_lock);
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ region = NULL;
+
+ /* We may need to update the last page number on the metadata page. */
+ meta_pgno = PGNO_BASE_MD;
+ if ((ret = __db_lget(dbc,
+ LCK_ALWAYS, meta_pgno, DB_LOCK_WRITE, 0, &meta_lock)) != 0)
+ return (ret);
+ if ((ret = __memp_fget(mpf, &meta_pgno,
+ dbc->thread_info, NULL, DB_MPOOL_DIRTY, &meta)) != 0) {
+ (void)__LPUT(dbc, meta_lock);
+ return (ret);
+ }
+
+ ret = __memp_fget(mpf, &pgno, dbc->thread_info,
+ NULL, DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &region);
+
+ if (ret != 0 || region->pgno != 0)
+ /*
+ * There's been an error or someone got here before us and
+ * created the page. Either way, our work here is done.
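+	 * (A page freshly created in mpool is expected to come back
+	 * zero-filled, so a nonzero pgno marks it as already initialized.)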
+ */
+ goto done;
+
+ /* Log the page creation. */
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __heap_pg_alloc_log(dbp,
+ dbc->txn, &LSN(meta), 0, &LSN(meta), meta_pgno,
+ pgno, (u_int32_t)P_IHEAP, meta->dbmeta.last_pgno)) != 0)
+ goto done;
+ } else
+ LSN_NOT_LOGGED(LSN(&meta->dbmeta));
+
+ memset((void *)region, 0, dbp->pgsize);
+ P_INIT(region,
+ dbp->pgsize, pgno, PGNO_INVALID, PGNO_INVALID, 0, P_IHEAP);
+ LSN(region) = LSN(&meta->dbmeta);
+
+ /*
+	 * We may have created a page earlier with a larger page number;
+	 * check before updating the metadata page.
+ */
+ if (pgno > meta->dbmeta.last_pgno)
+ meta->dbmeta.last_pgno = pgno;
+ if (HEAP_REGION_NUM(dbp, pgno) > meta->nregions)
+ meta->nregions = HEAP_REGION_NUM(dbp, pgno);
+
+done: if (region != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, region, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+	if ((t_ret = __memp_fput(
+	    mpf, dbc->thread_info, meta, dbc->priority)) != 0 && ret == 0)
+		ret = t_ret;
+ if ((t_ret = __TLPUT(dbc, meta_lock)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+static void
+__heap_init_meta(dbp, meta, pgno, lsnp)
+ DB *dbp;
+ HEAPMETA *meta;
+ db_pgno_t pgno;
+ DB_LSN *lsnp;
+{
+ HEAP *h;
+ ENV *env;
+
+ env = dbp->env;
+ h = dbp->heap_internal;
+
+ memset(meta, 0, sizeof(HEAPMETA));
+ meta->dbmeta.lsn = *lsnp;
+ meta->dbmeta.pgno = pgno;
+ meta->dbmeta.magic = DB_HEAPMAGIC;
+ meta->dbmeta.version = DB_HEAPVERSION;
+ meta->dbmeta.pagesize = dbp->pgsize;
+ if (F_ISSET(dbp, DB_AM_CHKSUM))
+ FLD_SET(meta->dbmeta.metaflags, DBMETA_CHKSUM);
+ if (F_ISSET(dbp, DB_AM_ENCRYPT)) {
+ meta->dbmeta.encrypt_alg = env->crypto_handle->alg;
+ DB_ASSERT(env, meta->dbmeta.encrypt_alg != 0);
+ meta->crypto_magic = meta->dbmeta.magic;
+ }
+ meta->dbmeta.type = P_HEAPMETA;
+ meta->dbmeta.free = PGNO_INVALID;
+ meta->dbmeta.last_pgno = FIRST_HEAP_RPAGE;
+ memcpy(meta->dbmeta.uid, dbp->fileid, DB_FILE_ID_LEN);
+ meta->gbytes = h->gbytes;
+ meta->bytes = h->bytes;
+ meta->region_size = h->region_size;
+ meta->nregions = 1;
+ meta->curregion = 1;
+}
diff --git a/src/heap/heap_rec.c b/src/heap/heap_rec.c
new file mode 100644
index 00000000..578a61c4
--- /dev/null
+++ b/src/heap/heap_rec.c
@@ -0,0 +1,386 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/heap.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+
+/*
+ * __heap_addrem_recover --
+ * Recovery function for addrem.
+ *
+ * PUBLIC: int __heap_addrem_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__heap_addrem_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __heap_addrem_args *argp;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ DB_THREAD_INFO *ip;
+ PAGE *pagep, *regionp;
+ db_pgno_t region_pgno;
+ int cmp_n, cmp_p, modified, oldspace, ret, space;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__heap_addrem_print);
+ REC_INTRO(__heap_addrem_read, ip, 1);
+ region_pgno = HEAP_REGION_PGNO(file_dbp, argp->pgno);
+
+ REC_FGET(mpf, ip, argp->pgno, &pagep, done);
+ modified = 0;
+ cmp_n = log_compare(lsnp, &LSN(pagep));
+ cmp_p = log_compare(&LSN(pagep), &argp->pagelsn);
+
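+	/*
+	 * Standard LSN-based recovery test: if cmp_p == 0 the page still
+	 * carries its pre-operation LSN, so the redo applies; if cmp_n == 0
+	 * the page carries this record's LSN, so the undo applies.
+	 */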
+ if ((cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_ADD_HEAP) ||
+ (cmp_n == 0 && DB_UNDO(op) && argp->opcode == DB_REM_HEAP)) {
+ /* We are either redo-ing an add or undoing a delete. */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ if ((ret = __heap_pitem(dbc, pagep,
+ argp->indx, argp->nbytes, &argp->hdr, &argp->dbt)) != 0)
+ goto out;
+ modified = 1;
+ } else if ((cmp_n == 0 && DB_UNDO(op) && argp->opcode == DB_ADD_HEAP) ||
+ (cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_REM_HEAP)) {
+ /* We are either undoing an add or redo-ing a delete. */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ if ((ret = __heap_ditem(
+ dbc, pagep, argp->indx, argp->nbytes)) != 0)
+ goto out;
+ modified = 1;
+ }
+
+ if (modified) {
+ REC_FGET(mpf, ip, region_pgno, &regionp, done);
+ if (DB_REDO(op))
+ LSN(pagep) = *lsnp;
+ else
+ LSN(pagep) = argp->pagelsn;
+
+ /* Update the available space bitmap, if necessary. */
+ HEAP_CALCSPACEBITS(
+ file_dbp, HEAP_FREESPACE(file_dbp, pagep), space);
+ oldspace = HEAP_SPACE(file_dbp, regionp,
+ argp->pgno - region_pgno - 1);
+ if (space != oldspace) {
+ REC_DIRTY(mpf, ip, dbc->priority, &regionp);
+ HEAP_SETSPACE(file_dbp,
+ regionp, argp->pgno - region_pgno - 1, space);
+ }
+ if ((ret = __memp_fput(mpf, ip, regionp, dbc->priority)) != 0)
+ goto out;
+ }
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, dbc->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __heap_pg_alloc_recover --
+ * Recovery function for pg_alloc.
+ *
+ * PUBLIC: int __heap_pg_alloc_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__heap_pg_alloc_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __heap_pg_alloc_args *argp;
+ DB *file_dbp;
+ DBC *dbc;
+ HEAPMETA *meta;
+ DB_MPOOLFILE *mpf;
+ DB_THREAD_INFO *ip;
+ HEAPPG *pagep;
+ db_pgno_t pgno;
+ int cmp_n, cmp_p, ret, trunc;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ meta = NULL;
+ pagep = NULL;
+
+ REC_PRINT(__heap_pg_alloc_print);
+ REC_INTRO(__heap_pg_alloc_read, ip, 0);
+
+ trunc = 0;
+ pgno = PGNO_BASE_MD;
+ if ((ret = __memp_fget(mpf, &pgno, ip, NULL, 0, &meta)) != 0) {
+ /* The metadata page must always exist on redo. */
+ if (DB_REDO(op)) {
+ ret = __db_pgerr(file_dbp, pgno, ret);
+ goto out;
+ } else {
+ ret = 0;
+ goto done;
+ }
+ }
+ cmp_n = log_compare(lsnp, &LSN(meta));
+ cmp_p = log_compare(&LSN(meta), &argp->meta_lsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(meta), &argp->meta_lsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(meta), lsnp);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Need to redo update described. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &meta);
+ LSN(meta) = *lsnp;
+ if (argp->pgno > meta->dbmeta.last_pgno)
+ meta->dbmeta.last_pgno = argp->pgno;
+ if (argp->ptype == P_IHEAP &&
+ HEAP_REGION_NUM(file_dbp, argp->pgno) > meta->nregions)
+ meta->nregions = HEAP_REGION_NUM(file_dbp, argp->pgno);
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Need to undo update described. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &meta);
+ LSN(meta) = argp->meta_lsn;
+ if (meta->dbmeta.last_pgno != argp->last_pgno) {
+ if (file_dbp->mpf->mfp->last_pgno ==
+ meta->dbmeta.last_pgno)
+ trunc = 1;
+ meta->dbmeta.last_pgno = argp->last_pgno;
+ }
+ if (argp->ptype == P_IHEAP &&
+ HEAP_REGION_NUM(file_dbp, argp->pgno) == meta->nregions) {
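+			/*
+			 * Walk nregions back until the restored last_pgno
+			 * again falls inside the highest remaining region.
+			 */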
+ do
+ meta->nregions--;
+ while (argp->last_pgno <
+ (meta->nregions - 1) * HEAP_REGION_SIZE(file_dbp));
+ }
+ }
+	/*
+	 * Fix up the allocated page.  If we're undoing and the page doesn't
+	 * exist, there's nothing to do; if the page does exist, we simply
+	 * zero it out.  Otherwise, if we're redoing the operation, we have
+	 * to get the page (creating it if it doesn't exist) and update its
+	 * LSN.
+	 */
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
+ if (DB_UNDO(op)) {
+ ret = 0;
+ goto do_meta;
+ }
+ if ((ret = __memp_fget(mpf,
+ &argp->pgno, ip, NULL, DB_MPOOL_CREATE, &pagep)) != 0) {
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ }
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ if (DB_REDO(op) && IS_ZERO_LSN(LSN(pagep))) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ P_INIT(pagep, file_dbp->pgsize,
+ argp->pgno, PGNO_INVALID, PGNO_INVALID, 0, argp->ptype);
+ LSN(pagep) = *lsnp;
+ } else if ((cmp_n == 0 || IS_ZERO_LSN(LSN(pagep))) && DB_UNDO(op)) {
+ if (argp->pgno == file_dbp->mpf->mfp->last_pgno)
+ trunc = 1;
+ else if (!IS_ZERO_LSN(LSN(pagep))) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ memset(pagep, 0, file_dbp->pgsize);
+ }
+ }
+ /* If the page is newly allocated and aborted, give it back. */
+ if (pagep != NULL && (trunc == 1 ||
+ (IS_ZERO_LSN(LSN(pagep)) && TYPE(pagep) != P_IHEAP))) {
+ if ((ret = __memp_fput(mpf,
+ ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+ if ((ret = __memp_fget(mpf,
+ &argp->pgno, ip, NULL, DB_MPOOL_FREE, &pagep)) != 0)
+ goto out;
+ if (trunc == 0 && argp->pgno <= mpf->mfp->last_flushed_pgno) {
+ /*
+ * If this page is on disk we need to zero it.
+ * This is safe since we never free pages other
+ * than backing out an allocation, so there can
+ * not be a previous allocate and free of this
+ * page that is reflected on disk.
+ */
+ if ((ret = __db_zero_extend(env, mpf->fhp,
+ argp->pgno, argp->pgno, file_dbp->pgsize)) != 0)
+ goto out;
+ }
+ }
+	/*
+	 * Keep the region page's high_pgno up to date.  This is not logged,
+	 * so we always need to check it.
+	 */
+ if (DB_REDO(op)) {
+ if ((ret = __memp_fput(mpf,
+ ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+ pgno = HEAP_REGION_PGNO(file_dbp, argp->pgno);
+ if ((ret = __memp_fget(mpf, &pgno, ip, NULL, 0, &pagep)) != 0)
+ goto out;
+ if (pagep->high_pgno >= argp->pgno)
+ goto done;
+ if ((ret = __memp_dirty(mpf, &pagep, ip, NULL,
+ DB_PRIORITY_UNCHANGED, 0)) != 0)
+ goto done;
+ pagep->high_pgno = argp->pgno;
+ }
+
+do_meta:
+ if (trunc == 1 &&
+ (ret = __memp_ftruncate(mpf, NULL, ip, meta->dbmeta.last_pgno + 1,
+ MP_TRUNC_RECOVER | MP_TRUNC_NOCACHE)) != 0)
+ goto out;
+
+done: *lsnp = argp->prev_lsn;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+ if (meta != NULL)
+ (void)__memp_fput(mpf, ip, meta, file_dbp->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __heap_trunc_meta_recover --
+ * Recovery function for trunc_meta.
+ *
+ * PUBLIC: int __heap_trunc_meta_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__heap_trunc_meta_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __heap_trunc_meta_args *argp;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ DB_THREAD_INFO *ip;
+ HEAPMETA *meta;
+ PAGE *pagep;
+ int cmp_n, cmp_p, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__heap_trunc_meta_print);
+ REC_INTRO(__heap_trunc_meta_read, ip, 1);
+
+ REC_FGET(mpf, ip, argp->pgno, &pagep, done);
+ cmp_n = log_compare(lsnp, &LSN(pagep));
+ cmp_p = log_compare(&LSN(pagep), &argp->pagelsn);
+ meta = (HEAPMETA *)pagep;
+
+ if (cmp_n == 0 && DB_UNDO(op)) {
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ meta->dbmeta.last_pgno = argp->last_pgno;
+ meta->dbmeta.key_count = argp->key_count;
+ meta->dbmeta.record_count = argp->record_count;
+ meta->curregion = argp->curregion;
+ meta->nregions = argp->nregions;
+ LSN(meta) = argp->pagelsn;
+ } else if (cmp_p == 0 && DB_REDO(op)) {
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+		/* Set last_pgno to 1 to account for the region page. */
+ meta->dbmeta.last_pgno = 1;
+ meta->dbmeta.key_count = 0;
+ meta->dbmeta.record_count = 0;
+ meta->curregion = FIRST_HEAP_RPAGE;
+ meta->nregions = 1;
+ LSN(meta) = *lsnp;
+ if ((ret = __memp_ftruncate(mpf, dbc->txn,
+ ip, PGNO_BASE_MD + 1, MP_TRUNC_NOCACHE)) != 0)
+ goto out;
+ }
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, dbc->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __heap_trunc_page_recover --
+ * Recovery function for trunc_page.
+ *
+ * PUBLIC: int __heap_trunc_page_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__heap_trunc_page_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __heap_trunc_page_args *argp;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ DB_THREAD_INFO *ip;
+ PAGE *pagep;
+ int cmp_p, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__heap_trunc_page_print);
+ REC_INTRO(__heap_trunc_page_read, ip, 1);
+
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
+ if (DB_REDO(op))
+ goto done;
+ if ((ret = __memp_fget(mpf,
+ &argp->pgno, ip, NULL, DB_MPOOL_CREATE, &pagep)) != 0) {
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ }
+ }
+ cmp_p = log_compare(&LSN(pagep), &argp->pagelsn);
+
+ if (DB_UNDO(op) && IS_ZERO_LSN(LSN(pagep))) {
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ memcpy(pagep, argp->old_data.data, argp->old_data.size);
+ LSN(pagep) = argp->pagelsn;
+ } else if (cmp_p == 0 && DB_REDO(op)) {
+ if ((ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
+ goto out;
+ pagep = NULL;
+ if ((ret = __memp_fget(mpf, &argp->pgno,
+ dbc->thread_info, dbc->txn, DB_MPOOL_FREE, &pagep)) != 0)
+ goto out;
+ }
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, dbc->priority);
+ REC_CLOSE;
+}
diff --git a/src/heap/heap_reclaim.c b/src/heap/heap_reclaim.c
new file mode 100644
index 00000000..8cedb223
--- /dev/null
+++ b/src/heap/heap_reclaim.c
@@ -0,0 +1,152 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/heap.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+
+/*
+ * __heap_truncate --
+ * Truncate a database.
+ *
+ * PUBLIC: int __heap_truncate __P((DBC *, u_int32_t *));
+ */
+int
+__heap_truncate(dbc, countp)
+ DBC *dbc;
+ u_int32_t *countp;
+{
+ DB *dbp;
+ DB_LOCK lock, meta_lock;
+ DB_MPOOLFILE *mpf;
+ DBT log_dbt;
+ HEAPHDR *hdr;
+ HEAPMETA *meta;
+ HEAPPG *pg;
+ db_pgno_t pgno;
+ int i, ret, t_ret;
+ u_int32_t count, next_region, region_size;
+
+ LOCK_INIT(lock);
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ count = 0;
+ next_region = FIRST_HEAP_RPAGE;
+ region_size = HEAP_REGION_SIZE(dbp);
+
+	/* Traverse the entire database, starting with the metadata page. */
+ pgno = PGNO_BASE_MD;
+ if ((ret = __db_lget(dbc,
+ LCK_ALWAYS, pgno, DB_LOCK_WRITE, 0, &meta_lock)) != 0)
+ return (ret);
+ if ((ret = __memp_fget(mpf, &pgno,
+ dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &meta)) != 0) {
+		/* We never got the meta page; nothing to put at err. */
+		(void)__TLPUT(dbc, meta_lock);
+		return (ret);
+ }
+
+ for (;;) {
+ pgno++;
+ if ((ret = __db_lget(dbc,
+ LCK_COUPLE, pgno, DB_LOCK_WRITE, 0, &lock)) != 0)
+ break;
+ if ((ret = __memp_fget(mpf, &pgno,
+ dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &pg)) != 0) {
+ if (ret == DB_PAGE_NOTFOUND)
+ ret = 0;
+ break;
+ }
+ if (DBC_LOGGING(dbc)) {
+ memset(&log_dbt, 0, sizeof(DBT));
+ log_dbt.data = pg;
+ log_dbt.size = dbp->pgsize;
+ if ((ret = __heap_trunc_page_log(dbp, dbc->txn,
+ &LSN(pg), 0, pgno,
+ &log_dbt, (pgno == next_region), &LSN(pg))) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(pg));
+
+ if (pgno == next_region) {
+ DB_ASSERT(dbp->env, TYPE(pg) == P_IHEAP);
+ next_region += region_size + 1;
+ } else {
+ /*
+ * We can't use pg->entries to calculate the record
+ * count, because it can include split records. So we
+ * check the header for each entry and only count
+ * non-split records and the first piece of split
+ * records. But if the page is empty, there's no work to
+ * do.
+ */
+ if (NUM_ENT(pg) != 0)
+ for (i = 0; i <= HEAP_HIGHINDX(pg); i++) {
+ if (HEAP_OFFSETTBL(dbp, pg)[i] == 0)
+ continue;
+ hdr = (HEAPHDR *)P_ENTRY(dbp, pg, i);
+ if (!F_ISSET(hdr, HEAP_RECSPLIT) ||
+ F_ISSET(hdr, HEAP_RECFIRST))
+ count++;
+ }
+ }
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, pg, dbc->priority)) != 0)
+ break;
+ if ((ret = __memp_fget(mpf, &pgno,
+ dbc->thread_info, dbc->txn, DB_MPOOL_FREE, &pg)) != 0)
+ break;
+ }
+ if ((t_ret = __TLPUT(dbc, lock)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (countp != NULL && ret == 0)
+ *countp = count;
+
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __heap_trunc_meta_log(dbp, dbc->txn, &LSN(meta), 0,
+ meta->dbmeta.pgno, meta->dbmeta.last_pgno,
+ meta->dbmeta.key_count, meta->dbmeta.record_count,
+ meta->curregion, meta->nregions, &LSN(meta))) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(meta));
+ meta->dbmeta.key_count = 0;
+ meta->dbmeta.record_count = 0;
+ meta->dbmeta.last_pgno = PGNO_BASE_MD + 1;
+ meta->curregion = 1;
+ meta->nregions = 1;
+
+ if ((ret = __memp_ftruncate(mpf, dbc->txn,
+ dbc->thread_info, PGNO_BASE_MD + 1, MP_TRUNC_NOCACHE)) != 0)
+ goto err;
+
+ /* Create the first region. */
+ pgno = PGNO_BASE_MD + 1;
+ if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info,
+ dbc->txn, DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &pg)) != 0)
+ goto err;
+
+ memset(pg, 0, dbp->pgsize);
+ P_INIT(pg,
+ dbp->pgsize, 1, PGNO_INVALID, PGNO_INVALID, 0, P_IHEAP);
+ ret = __db_log_page(dbp, dbc->txn, &pg->lsn, pgno, (PAGE *)pg);
+ if ((t_ret = __memp_fput(
+ mpf, dbc->thread_info, pg, dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+err: if ((t_ret = __memp_fput(mpf,
+ dbc->thread_info, meta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+	if ((t_ret = __TLPUT(dbc, meta_lock)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
diff --git a/src/heap/heap_stat.c b/src/heap/heap_stat.c
new file mode 100644
index 00000000..9f4361a7
--- /dev/null
+++ b/src/heap/heap_stat.c
@@ -0,0 +1,289 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/heap.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+
+#ifdef HAVE_STATISTICS
+/*
+ * __heap_stat --
+ * Gather/print the heap statistics
+ *
+ * PUBLIC: int __heap_stat __P((DBC *, void *, u_int32_t));
+ */
+int
+__heap_stat(dbc, spp, flags)
+ DBC *dbc;
+ void *spp;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_HEAP_STAT *sp;
+ DB_LOCK lock, metalock;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ HEAPMETA *meta;
+ db_pgno_t metapgno;
+ int ret, t_ret, write_meta;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ meta = NULL;
+ LOCK_INIT(metalock);
+ LOCK_INIT(lock);
+ mpf = dbp->mpf;
+ sp = NULL;
+ ret = t_ret = write_meta = 0;
+
+ /* Allocate and clear the structure. */
+ if ((ret = __os_umalloc(env, sizeof(*sp), &sp)) != 0)
+ goto err;
+ memset(sp, 0, sizeof(*sp));
+
+ /* Get the metadata page for the entire database. */
+ metapgno = PGNO_BASE_MD;
+ if ((ret = __db_lget(dbc,
+ 0, metapgno, DB_LOCK_READ, 0, &metalock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &metapgno,
+ dbc->thread_info, dbc->txn, 0, &meta)) != 0)
+ goto err;
+
+ sp->heap_metaflags = meta->dbmeta.flags;
+ sp->heap_pagecnt = meta->dbmeta.last_pgno + 1;
+ sp->heap_pagesize = meta->dbmeta.pagesize;
+ sp->heap_magic = meta->dbmeta.magic;
+ sp->heap_version = meta->dbmeta.version;
+ sp->heap_nregions = meta->nregions;
+ sp->heap_regionsize = meta->region_size;
+
+ if (LF_ISSET(DB_FAST_STAT)) {
+ sp->heap_nrecs = meta->dbmeta.record_count;
+ } else {
+ /* Count the entries in the database. */
+ if ((ret = __heap_traverse(dbc, __heap_stat_callback, sp)) != 0)
+ goto err;
+
+ write_meta = !F_ISSET(dbp, DB_AM_RDONLY) &&
+ (!MULTIVERSION(dbp) || dbc->txn != NULL);
+ if (write_meta) {
+ ret = __memp_fput(mpf,
+ dbc->thread_info, meta, dbc->priority);
+ meta = NULL;
+ if ((t_ret = __LPUT(dbc, metalock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err;
+
+ if ((ret = __db_lget(dbc,
+ 0, metapgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &metapgno, dbc->thread_info,
+ dbc->txn, DB_MPOOL_DIRTY, &meta)) != 0)
+ goto err;
+
+ meta->dbmeta.key_count = sp->heap_nrecs;
+ meta->dbmeta.record_count = sp->heap_nrecs;
+ }
+ }
+
+ *(DB_HEAP_STAT **)spp = sp;
+
+err: /* Discard metadata page. */
+ if ((t_ret = __LPUT(dbc, metalock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (meta != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, meta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (ret != 0 && sp != NULL) {
+ __os_ufree(env, sp);
+		*(DB_HEAP_STAT **)spp = NULL;
+ }
+
+ return (ret);
+}
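+
+/*
+ * Example (a sketch): applications reach this through DB->stat; DB_FAST_STAT
+ * returns the record count cached on the meta page without a full
+ * traversal:
+ *
+ *	DB_HEAP_STAT *sp;
+ *
+ *	if ((ret = dbp->stat(dbp, NULL, &sp, DB_FAST_STAT)) == 0) {
+ *		printf("records: %lu\n", (u_long)sp->heap_nrecs);
+ *		free(sp);
+ *	}
+ */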
+
+/*
+ * __heap_stat_print --
+ * Display heap statistics.
+ *
+ * PUBLIC: int __heap_stat_print __P((DBC *, u_int32_t));
+ */
+int
+__heap_stat_print(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_HEAP_STAT *sp;
+ ENV *env;
+ int ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ if ((ret = __heap_stat(dbc, &sp, LF_ISSET(DB_FAST_STAT))) != 0)
+ return (ret);
+
+ if (LF_ISSET(DB_STAT_ALL)) {
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "Default Heap database information:");
+ }
+ __db_msg(env, "%lx\tHeap magic number", (u_long)sp->heap_magic);
+ __db_msg(env, "%lu\tHeap version number", (u_long)sp->heap_version);
+ __db_dl(env,
+ "Underlying database page size", (u_long)sp->heap_pagesize);
+ __db_dl(env,
+ "Number of records in the database", (u_long)sp->heap_nrecs);
+ __db_dl(env, "Number of database pages", (u_long)sp->heap_pagecnt);
+ __db_dl(env, "Number of database regions", (u_long)sp->heap_nregions);
+ __db_dl(env,
+ "Number of pages in a region", (u_long)sp->heap_regionsize);
+
+ __os_ufree(env, sp);
+
+ return (0);
+}
+
+/*
+ * __heap_print_cursor --
+ * Display the current cursor.
+ *
+ * PUBLIC: void __heap_print_cursor __P((DBC *));
+ */
+void
+__heap_print_cursor(dbc)
+ DBC *dbc;
+{
+ COMPQUIET(dbc, NULL);
+
+ return;
+}
+
+/*
+ * __heap_stat_callback --
+ * Statistics callback.
+ *
+ * PUBLIC: int __heap_stat_callback __P((DBC *, PAGE *, void *, int *));
+ */
+int
+__heap_stat_callback(dbc, h, cookie, putp)
+ DBC *dbc;
+ PAGE *h;
+ void *cookie;
+ int *putp;
+{
+ DB *dbp;
+ DB_HEAP_STAT *sp;
+ HEAPHDR *hdr;
+ int i;
+
+ dbp = dbc->dbp;
+ sp = cookie;
+ *putp = 0;
+
+ switch (TYPE(h)) {
+ case P_HEAP:
+ /*
+ * We can't just use NUM_ENT, otherwise we'd mis-count split
+ * records.
+ */
+ for (i = 0; i < NUM_ENT(h); i++) {
+ hdr = (HEAPHDR *)P_ENTRY(dbp, h, i);
+ if (!F_ISSET(hdr, HEAP_RECSPLIT) ||
+ F_ISSET(hdr, HEAP_RECFIRST))
+ sp->heap_nrecs++;
+ }
+ break;
+ case P_HEAPMETA: /* Fallthrough */
+ case P_IHEAP: /* Fallthrough */
+ default:
+ break;
+ }
+
+ return (0);
+}
+
+#else /* !HAVE_STATISTICS */
+
+int
+__heap_stat(dbc, spp, flags)
+ DBC *dbc;
+ void *spp;
+ u_int32_t flags;
+{
+ COMPQUIET(spp, NULL);
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(dbc->env));
+}
+#endif
+
+/*
+ * __heap_traverse --
+ * Walk a Heap database.
+ *
+ * PUBLIC: int __heap_traverse __P((DBC *,
+ * PUBLIC: int (*)(DBC *, PAGE *, void *, int *), void *));
+ */
+int
+__heap_traverse(dbc, callback, cookie)
+ DBC *dbc;
+ int (*callback)__P((DBC *, PAGE *, void *, int *));
+ void *cookie;
+{
+ DB *dbp;
+ DB_LOCK lock;
+ DB_MPOOLFILE *mpf;
+ PAGE *h;
+ db_pgno_t pgno;
+ int already_put, ret, t_ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ LOCK_INIT(lock);
+ pgno = FIRST_HEAP_DPAGE;
+
+ for (;;) {
+ already_put = 0;
+ h = NULL;
+
+ if ((ret = __db_lget(dbc,
+ 0, pgno, DB_LOCK_READ, 0, &lock)) != 0)
+ break;
+ if ((ret = __memp_fget(mpf,
+ &pgno, dbc->thread_info, dbc->txn, 0, &h)) != 0) {
+ if (ret == DB_PAGE_NOTFOUND)
+ ret = 0;
+ if ((t_ret = __TLPUT(dbc, lock)) != 0 && ret == 0)
+ ret = t_ret;
+ break;
+ }
+
+ ret = callback(dbc, h, cookie, &already_put);
+
+ if (!already_put && (t_ret = __memp_fput(
+ mpf, dbc->thread_info, h, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __TLPUT(dbc, lock)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (ret != 0)
+ break;
+ pgno++;
+ }
+
+ return (ret);
+}
diff --git a/src/heap/heap_stub.c b/src/heap/heap_stub.c
new file mode 100644
index 00000000..b4feb2f3
--- /dev/null
+++ b/src/heap/heap_stub.c
@@ -0,0 +1,328 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id:
+ */
+
+#ifndef HAVE_HEAP
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/heap.h"
+
+/*
+ * If the library wasn't compiled with the Heap access method, various
+ * routines aren't available. Stub them here, returning an appropriate
+ * error.
+ */
+
+/*
+ * __db_no_heap_am --
+ * Error when a Berkeley DB build doesn't include the access method.
+ *
+ * PUBLIC: int __db_no_heap_am __P((ENV *));
+ */
+int
+__db_no_heap_am(env)
+ ENV *env;
+{
+ __db_errx(env,
+ "library build did not include support for the Heap access method");
+ return (DB_OPNOTSUP);
+}
+
+int
+__heap_db_create(dbp)
+ DB *dbp;
+{
+ COMPQUIET(dbp, NULL);
+ return (0);
+}
+
+int
+__heap_db_close(dbp)
+ DB *dbp;
+{
+ COMPQUIET(dbp, NULL);
+ return (0);
+}
+
+int
+__heap_get_heapsize(dbp, gbytes, bytes)
+ DB *dbp;
+ u_int32_t *gbytes, *bytes;
+{
+ COMPQUIET(gbytes, NULL);
+ COMPQUIET(bytes, NULL);
+ return (__db_no_heap_am(dbp->env));
+}
+
+int
+__heapc_dup(orig_dbc, new_dbc)
+ DBC *orig_dbc, *new_dbc;
+{
+ COMPQUIET(new_dbc, NULL);
+ return (__db_no_heap_am(orig_dbc->env));
+}
+
+int
+__heapc_gsplit(dbc, dbt, bpp, bpsz)
+ DBC *dbc;
+ DBT *dbt;
+ void **bpp;
+ u_int32_t *bpsz;
+{
+ COMPQUIET(dbt, NULL);
+ COMPQUIET(bpp, NULL);
+ COMPQUIET(bpsz, NULL);
+ return (__db_no_heap_am(dbc->env));
+}
+
+int
+__heap_append(dbc, key, data)
+ DBC *dbc;
+ DBT *key, *data;
+{
+ COMPQUIET(key, NULL);
+ COMPQUIET(data, NULL);
+ return (__db_no_heap_am(dbc->env));
+}
+
+int
+__heap_backup(dbenv, dbp, ip, fp, handle, flags)
+ DB_ENV *dbenv;
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_FH *fp;
+ void *handle;
+ u_int32_t flags;
+{
+ COMPQUIET(dbp, NULL);
+ COMPQUIET(ip, NULL);
+ COMPQUIET(fp, NULL);
+ COMPQUIET(handle, NULL);
+ COMPQUIET(flags, 0);
+ return (__db_no_heap_am(dbenv->env));
+}
+
+int
+__heapc_init(dbc)
+ DBC *dbc;
+{
+ return (__db_no_heap_am(dbc->env));
+}
+
+int
+__heap_init_print(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(dtabp, NULL);
+ return (0);
+}
+
+int
+__heap_init_recover(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(dtabp, NULL);
+ return (0);
+}
+
+int
+__heap_meta2pgset(dbp, vdp, heapmeta, pgset)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ HEAPMETA *heapmeta;
+ DB *pgset;
+{
+ COMPQUIET(vdp, NULL);
+ COMPQUIET(heapmeta, NULL);
+ COMPQUIET(pgset, NULL);
+ return (__db_no_heap_am(dbp->env));
+}
+
+int
+__heap_metachk(dbp, name, hm)
+ DB *dbp;
+ const char *name;
+ HEAPMETA *hm;
+{
+ COMPQUIET(name, NULL);
+ COMPQUIET(hm, NULL);
+ return (__db_no_heap_am(dbp->env));
+}
+
+int
+__heap_new_file(dbp, ip, txn, fhp, name)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ DB_FH *fhp;
+ const char *name;
+{
+ COMPQUIET(ip, NULL);
+ COMPQUIET(txn, NULL);
+ COMPQUIET(fhp, NULL);
+ COMPQUIET(name, NULL);
+ return (__db_no_heap_am(dbp->env));
+}
+
+int
+__heap_open(dbp, ip, txn, name, base_pgno, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name;
+ db_pgno_t base_pgno;
+ u_int32_t flags;
+{
+ COMPQUIET(ip, NULL);
+ COMPQUIET(txn, NULL);
+ COMPQUIET(name, NULL);
+ COMPQUIET(base_pgno, 0);
+ COMPQUIET(flags, 0);
+ return (__db_no_heap_am(dbp->env));
+}
+
+int
+__heap_pgin(dbp, pg, pp, cookie)
+ DB *dbp;
+ db_pgno_t pg;
+ void *pp;
+ DBT *cookie;
+{
+ COMPQUIET(pg, 0);
+ COMPQUIET(pp, NULL);
+ COMPQUIET(cookie, NULL);
+ return (__db_no_heap_am(dbp->env));
+}
+
+int
+__heap_pgout(dbp, pg, pp, cookie)
+ DB *dbp;
+ db_pgno_t pg;
+ void *pp;
+ DBT *cookie;
+{
+ COMPQUIET(pg, 0);
+ COMPQUIET(pp, NULL);
+ COMPQUIET(cookie, NULL);
+ return (__db_no_heap_am(dbp->env));
+}
+
+void
+__heap_print_cursor(dbc)
+ DBC *dbc;
+{
+ (void)__db_no_heap_am(dbc->env);
+}
+
+int
+__heapc_refresh(dbc)
+ DBC *dbc;
+{
+ return (__db_no_heap_am(dbc->env));
+}
+
+int
+__heap_salvage(dbp, vdp, pgno, h, handle, callback, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ db_pgno_t pgno;
+ PAGE *h;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ u_int32_t flags;
+{
+ COMPQUIET(vdp, NULL);
+ COMPQUIET(pgno, 0);
+ COMPQUIET(handle, NULL);
+ COMPQUIET(h, NULL);
+ COMPQUIET(callback, NULL);
+ COMPQUIET(flags, 0);
+ return (__db_no_heap_am(dbp->env));
+}
+
+int
+__heap_stat(dbc, spp, flags)
+ DBC *dbc;
+ void *spp;
+ u_int32_t flags;
+{
+ COMPQUIET(spp, NULL);
+ COMPQUIET(flags, 0);
+ return (__db_no_heap_am(dbc->env));
+}
+
+int
+__heap_stat_print(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+ return (__db_no_heap_am(dbc->env));
+}
+
+int
+__heap_truncate(dbc, countp)
+ DBC *dbc;
+ u_int32_t *countp;
+{
+ COMPQUIET(countp, NULL);
+ return (__db_no_heap_am(dbc->env));
+}
+
+int
+__heap_vrfy(dbp, vdbp, h, pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdbp;
+ PAGE *h;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ COMPQUIET(h, NULL);
+ COMPQUIET(vdbp, NULL);
+ COMPQUIET(pgno, 0);
+ COMPQUIET(flags, 0);
+ return (__db_no_heap_am(dbp->env));
+}
+
+int
+__heap_vrfy_meta(dbp, vdp, meta, pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ HEAPMETA *meta;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ COMPQUIET(vdp, NULL);
+ COMPQUIET(meta, NULL);
+ COMPQUIET(pgno, 0);
+ COMPQUIET(flags, 0);
+ return (__db_no_heap_am(dbp->env));
+}
+
+int
+__heap_vrfy_structure(dbp, vdp, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ u_int32_t flags;
+{
+ COMPQUIET(vdp, NULL);
+ COMPQUIET(flags, 0);
+ return (__db_no_heap_am(dbp->env));
+}
+
+int
+__heap_exist()
+{
+ return (0);
+}
+#endif /* !HAVE_HEAP */
diff --git a/src/heap/heap_verify.c b/src/heap/heap_verify.c
new file mode 100644
index 00000000..ea15c28b
--- /dev/null
+++ b/src/heap/heap_verify.c
@@ -0,0 +1,468 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_verify.h"
+#include "dbinc/heap.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+
+static int __heap_safe_gsplit __P((DB *, VRFY_DBINFO *, PAGE *, db_indx_t,
+ DBT *));
+static int __heap_verify_offset_cmp __P((const void *, const void *));
+
+/*
+ * __heap_vrfy_meta --
+ * Verify the heap-specific part of a metadata page.
+ *
+ * PUBLIC: int __heap_vrfy_meta __P((DB *, VRFY_DBINFO *, HEAPMETA *,
+ * PUBLIC: db_pgno_t, u_int32_t));
+ */
+int
+__heap_vrfy_meta(dbp, vdp, meta, pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ HEAPMETA *meta;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ HEAP *h;
+ VRFY_PAGEINFO *pip;
+ db_pgno_t last_pgno, max_pgno, npgs;
+ int isbad, ret;
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+
+ isbad = 0;
+ /*
+ * Heap can't be used in subdatabases, so if this isn't set
+ * something very odd is going on.
+ */
+ if (!F_ISSET(pip, VRFY_INCOMPLETE))
+ EPRINT((dbp->env, DB_STR_A("1156",
+ "Page %lu: Heap databases must be one-per-file",
+ "%lu"), (u_long)pgno));
+
+ /*
+ * We have already checked the common fields in __db_vrfy_pagezero.
+	 * However, we used the on-disk metadata page, which may have been
+	 * stale.  We now have the page from mpool, so check that.
+ */
+ if ((ret = __db_vrfy_meta(dbp, vdp, &meta->dbmeta, pgno, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+
+ /*
+	 * Check that nregions is correct.  The last page in the database
+	 * must belong to the nregions-th region.
+ */
+ h = (HEAP *)dbp->heap_internal;
+ h->region_size = meta->region_size;
+ last_pgno = meta->dbmeta.last_pgno;
+ if (meta->nregions != HEAP_REGION_NUM(dbp, last_pgno)) {
+ EPRINT((dbp->env, DB_STR_A("1157",
+ "Page %lu: Number of heap regions incorrect",
+ "%lu"), (u_long)pgno));
+ isbad = 1;
+ }
+
+ /*
+ * Check that last_pgno doesn't surpass the end of a fixed size
+ * database.
+ */
+ if (meta->gbytes != 0 || meta->bytes != 0) {
+ /*
+ * We don't have to worry about rounding with gbytes, as pgsize
+		 * is always a power of 2, but we round down if bytes isn't
+ * a multiple of the page size.
+ */
+ npgs = (db_pgno_t)(meta->gbytes * (GIGABYTE / dbp->pgsize));
+ npgs += (db_pgno_t)(meta->bytes / dbp->pgsize);
+ max_pgno = npgs - 1;
+ if (last_pgno > max_pgno) {
+ EPRINT((dbp->env, DB_STR_A("1158",
+ "Page %lu: last_pgno beyond end of fixed size heap",
+ "%lu"), (u_long)pgno));
+ isbad = 1;
+ }
+ }
+
+err: if (LF_ISSET(DB_SALVAGE))
+ ret = __db_salvage_markdone(vdp, pgno);
+
+ return (ret == 0 && isbad == 1 ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __heap_vrfy --
+ * Verify a heap data or internal page.
+ *
+ * PUBLIC: int __heap_vrfy __P((DB *,
+ * PUBLIC: VRFY_DBINFO *, PAGE *, db_pgno_t, u_int32_t));
+ */
+int
+__heap_vrfy(dbp, vdp, h, pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ PAGE *h;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ HEAPHDR *hdr;
+ int cnt, i, j, ret;
+ db_indx_t *offsets, *offtbl, end;
+
+ if ((ret = __db_vrfy_datapage(dbp, vdp, h, pgno, flags)) != 0)
+		return (ret);
+
+ if (TYPE(h) == P_IHEAP)
+ /* Nothing to verify on a region page. */
+ return (0);
+
+ offtbl = HEAP_OFFSETTBL(dbp, h);
+
+ if ((ret = __os_malloc(dbp->env,
+ NUM_ENT(h) * sizeof(db_indx_t), &offsets)) != 0)
+		return (ret);
+
+ /*
+ * Build a sorted list of all the offsets in the table. Entries in the
+ * offset table are not always sorted. While we're here, check that
+ * flags are sane.
+ */
+ cnt = 0;
+ for (i = 0; i <= HEAP_HIGHINDX(h); i++) {
+ if (offtbl[i] == 0)
+ /* Unused index. */
+ continue;
+ if (cnt >= NUM_ENT(h)) {
+ /* Unexpected entry in the offset table. */
+ EPRINT((dbp->env, DB_STR_A("1159",
+ "Page %lu: incorrect number of entries in page's offset table",
+ "%lu"), (u_long)pgno));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+ hdr = (HEAPHDR *)P_ENTRY(dbp, h, i);
+ if (!F_ISSET(hdr, HEAP_RECSPLIT) &&
+ F_ISSET(hdr, HEAP_RECFIRST | HEAP_RECLAST)) {
+ EPRINT((dbp->env, DB_STR_A("1165",
+ "Page %lu: record %lu has invalid flags",
+ "%lu %lu"), (u_long)pgno, (u_long)i));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+
+ offsets[cnt] = offtbl[i];
+ cnt++;
+ }
+ if (cnt == 0) {
+ /* Empty page. */
+ ret = 0;
+ goto err;
+ }
+ qsort(offsets, cnt, sizeof(db_indx_t), __heap_verify_offset_cmp);
+
+ /*
+ * Now check that the record at each offset does not overlap the next
+ * record. We can't use the P_ENTRY macro because we've kept track of
+ * the offsets, not the indexes.
+ */
+ for (i = 0; i < cnt - 1; i++) {
+ hdr = (HEAPHDR *)((u_int8_t *)h + offsets[i]);
+ end = offsets[i] + HEAP_HDRSIZE(hdr) + hdr->size;
+ if (end > offsets[i+1]) {
+ /*
+ * Find the record number for this offset, for the error
+ * msg.
+ */
+ for (j = 0; j < HEAP_HIGHINDX(h); j++)
+ if (offtbl[j] == offsets[i])
+ break;
+ EPRINT((dbp->env, DB_STR_A("1160",
+ "Page %lu: record %lu (length %lu) overlaps next record",
+ "%lu %lu %lu"),
+ (u_long)pgno, (u_long)j, (u_long)hdr->size));
+ ret = DB_VERIFY_BAD;
+ }
+ }
+
+	/* Finally, check that the last record doesn't overflow the page. */
+ hdr = (HEAPHDR *)((u_int8_t *)h + offsets[i]);
+ end = offsets[i] + HEAP_HDRSIZE(hdr) + hdr->size;
+ if (end > dbp->pgsize) {
+ /* Find the record number for this offset, for the error msg. */
+ for (j = 0; j < HEAP_HIGHINDX(h); j++)
+ if (offtbl[j] == offsets[i])
+ break;
+ EPRINT((dbp->env, DB_STR_A("1161",
+ "Page %lu: record %lu (length %lu) beyond end of page",
+ "%lu %lu %lu"),
+ (u_long)pgno, (u_long)j, (u_long)hdr->size));
+ ret = DB_VERIFY_BAD;
+ }
+
+ err: __os_free(dbp->env, offsets);
+ return (ret);
+}
+
+static int
+__heap_verify_offset_cmp(off1, off2)
+ const void *off1;
+ const void *off2;
+{
+ return (*(db_indx_t *)off1 - *(db_indx_t *)off2);
+}
+
+/*
+ * __heap_vrfy_structure --
+ * Verify the structure of a heap database.
+ *
+ * PUBLIC: int __heap_vrfy_structure __P((DB *, VRFY_DBINFO *, u_int32_t));
+ */
+int
+__heap_vrfy_structure(dbp, vdp, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ u_int32_t flags;
+{
+ VRFY_PAGEINFO *pip;
+ db_pgno_t i, next_region, high_pgno;
+	int isbad, ret, t_ret;
+
+ isbad = 0;
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, PGNO_BASE_MD, &pip)) != 0)
+ return (ret);
+
+ if (pip->type != P_HEAPMETA) {
+ EPRINT((dbp->env, DB_STR_A("1162",
+ "Page %lu: heap database has no meta page", "%lu"),
+ (u_long)PGNO_BASE_MD));
+ isbad = 1;
+ goto err;
+ }
+
+ if ((ret = __db_vrfy_pgset_inc(
+ vdp->pgset, vdp->thread_info, vdp->txn, 0)) != 0)
+ goto err;
+
+ /*
+ * Not much structure to verify. Just make sure region pages are where
+ * they're supposed to be. If we don't have FTRUNCATE, there could be
+ * a zero'd out page where the region page is supposed to be.
+ */
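+	/*
+	 * With region size R, region pages land at pgno 1, R + 2, 2R + 3,
+	 * and so on (assuming FIRST_HEAP_RPAGE is 1): each region page is
+	 * followed by R data pages.
+	 */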
+ next_region = FIRST_HEAP_RPAGE;
+ high_pgno = 0;
+ for (i = 1; i <= vdp->last_pgno; i++) {
+ /* Send feedback to the application about our progress. */
+ if (!LF_ISSET(DB_SALVAGE))
+ __db_vrfy_struct_feedback(dbp, vdp);
+
+ if ((ret = __db_vrfy_putpageinfo(dbp->env, vdp, pip)) != 0 ||
+ (ret = __db_vrfy_getpageinfo(vdp, i, &pip)) != 0)
+ return (ret);
+ if (i != next_region &&
+ pip->type != P_HEAP && pip->type != P_INVALID) {
+ EPRINT((dbp->env, DB_STR_A("1163",
+ "Page %lu: heap database page of incorrect type %lu",
+ "%lu %lu"), (u_long)i, (u_long)pip->type));
+ isbad = 1;
+ } else if (i == next_region && pip->type != P_IHEAP
+#ifndef HAVE_FTRUNCATE
+ && pip->type != P_INVALID
+#endif
+ ) {
+ EPRINT((dbp->env, DB_STR_A("1164",
+ "Page %lu: heap database missing region page (page type %lu)",
+ "%lu %lu"), (u_long)i, (u_long)pip->type));
+ isbad = 1;
+ } else if ((ret = __db_vrfy_pgset_inc(vdp->pgset,
+ vdp->thread_info, vdp->txn, i)) != 0)
+ goto err;
+
+ if (i == next_region) {
+ high_pgno = pip->prev_pgno;
+ next_region += HEAP_REGION_SIZE(dbp) + 1;
+ } else if (pip->type != P_INVALID && i > high_pgno) {
+ EPRINT((dbp->env, DB_STR_A("1166",
+			    "Page %lu: heap database page beyond high page in region",
+ "%lu"), (u_long) i));
+ isbad = 1;
+ }
+ }
+
+err:	if ((t_ret = __db_vrfy_putpageinfo(dbp->env, vdp, pip)) != 0 && ret == 0)
+		ret = t_ret;
+	return (ret == 0 && isbad == 1 ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __heap_salvage --
+ * Safely dump out anything that looks like a record on an alleged heap
+ * data page.
+ *
+ * PUBLIC: int __heap_salvage __P((DB *, VRFY_DBINFO *, db_pgno_t,
+ * PUBLIC: PAGE *, void *, int (*)(void *, const void *), u_int32_t));
+ */
+int
+__heap_salvage(dbp, vdp, pgno, h, handle, callback, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ db_pgno_t pgno;
+ PAGE *h;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ u_int32_t flags;
+{
+ DBT dbt;
+ HEAPHDR *hdr;
+ db_indx_t i, *offtbl;
+ int err_ret, ret, t_ret;
+
+ COMPQUIET(flags, 0);
+ memset(&dbt, 0, sizeof(DBT));
+
+ offtbl = (db_indx_t *)HEAP_OFFSETTBL(dbp, h);
+ err_ret = ret = t_ret = 0;
+
+ /*
+	 * Walk the page, dumping non-split records and retrieving split
+	 * records when the first piece is encountered.
+ */
+ for (i = 0; i <= HEAP_HIGHINDX(h); i++) {
+ if (offtbl[i] == 0)
+ continue;
+ hdr = (HEAPHDR *)P_ENTRY(dbp, h, i);
+ if (F_ISSET(hdr, HEAP_RECSPLIT)) {
+ if (!F_ISSET(hdr, HEAP_RECFIRST))
+ continue;
+ /*
+			 * We don't completely trust hdr->tsize if it's huge;
+			 * gsplit() is able to realloc as needed.
+ */
+ dbt.size = ((HEAPSPLITHDR *)hdr)->tsize;
+ if (dbt.size > dbp->pgsize * 4)
+ dbt.size = dbp->pgsize * 4;
+ if ((ret =
+ __os_malloc(dbp->env, dbt.size, &dbt.data)) != 0)
+ goto err;
+			if ((t_ret =
+			    __heap_safe_gsplit(dbp, vdp, h, i, &dbt)) != 0)
+				err_ret = t_ret;
+ } else {
+ dbt.data = (u_int8_t *)hdr + HEAP_HDRSIZE(hdr);
+ dbt.size = hdr->size;
+ }
+
+ if ((ret = __db_vrfy_prdbt(&dbt,
+ 0, " ", handle, callback, 0, 0, vdp)) != 0)
+ err_ret = ret;
+ if (F_ISSET(hdr, HEAP_RECSPLIT))
+ __os_free(dbp->env, dbt.data);
+ }
+
+err: if ((t_ret = __db_salvage_markdone(vdp, pgno)) != 0)
+ return (t_ret);
+ return ((ret == 0 && err_ret != 0) ? err_ret : ret);
+}
+
+/*
+ * __heap_safe_gsplit --
+ *	Given a page and an index, retrieve a split record.
+ */
+static int
+__heap_safe_gsplit(dbp, vdp, h, i, dbt)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ PAGE *h;
+ db_indx_t i;
+ DBT *dbt;
+{
+ DB_MPOOLFILE *mpf;
+ HEAPSPLITHDR *hdr;
+ int gotpg, ret, t_ret;
+ u_int32_t bufsz, reclen;
+ u_int8_t *buf;
+
+ mpf = dbp->mpf;
+ buf = dbt->data;
+ bufsz = dbt->size;
+ dbt->size = 0;
+ ret = 0;
+
+ gotpg = 0;
+ for (;;) {
+ hdr = (HEAPSPLITHDR *)P_ENTRY(dbp, h, i);
+ reclen = hdr->std_hdr.size;
+ /* First copy the data from this page */
+ if (dbt->size + reclen > bufsz) {
+ bufsz = dbt->size + reclen;
+ if ((ret = __os_realloc(
+ dbp->env, bufsz, &dbt->data)) != 0)
+ goto err;
+ buf = (u_int8_t *)dbt->data + dbt->size;
+ }
+ memcpy(buf, (u_int8_t *)hdr + sizeof(HEAPSPLITHDR), reclen);
+ buf += reclen;
+ dbt->size += reclen;
+
+ /* If we're not at the end of the record, grab the next page. */
+ if (F_ISSET(&hdr->std_hdr, HEAP_RECLAST))
+ break;
+ if (gotpg && (ret = __memp_fput(mpf,
+ vdp->thread_info, h, DB_PRIORITY_UNCHANGED)) != 0)
+ return (ret);
+ gotpg = 0;
+ if ((ret = __memp_fget(mpf,
+ &hdr->nextpg, vdp->thread_info, NULL, 0, &h)) != 0)
+ goto err;
+ gotpg = 1;
+ i = hdr->nextindx;
+ }
+
+err: if (gotpg && (t_ret = __memp_fput(
+ mpf, vdp->thread_info, h, DB_PRIORITY_UNCHANGED)) != 0 && ret == 0)
+		ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __heap_meta2pgset --
+ *	Given a known-good meta page, populate pgset with the db_pgno_t's
+ * corresponding to the pages in the heap. This is just all pages in the
+ * database.
+ *
+ * PUBLIC: int __heap_meta2pgset __P((DB *, VRFY_DBINFO *, HEAPMETA *, DB *));
+ */
+int
+__heap_meta2pgset(dbp, vdp, heapmeta, pgset)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ HEAPMETA *heapmeta;
+ DB *pgset;
+{
+ db_pgno_t pgno, last;
+ int ret;
+
+ COMPQUIET(dbp, NULL);
+
+ last = heapmeta->dbmeta.last_pgno;
+ ret = 0;
+
+ for (pgno = 1; pgno <= last; pgno++)
+ if ((ret = __db_vrfy_pgset_inc(
+ pgset, vdp->thread_info, vdp->txn, pgno)) != 0)
+ break;
+ return (ret);
+}
diff --git a/src/hmac/hmac.c b/src/hmac/hmac.c
new file mode 100644
index 00000000..4febfc60
--- /dev/null
+++ b/src/hmac/hmac.c
@@ -0,0 +1,223 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * Some parts of this code originally written by Adam Stubblefield,
+ * -- astubble@rice.edu.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h" /* for hash.h only */
+#include "dbinc/hash.h"
+#include "dbinc/hmac.h"
+#include "dbinc/log.h"
+
+#define HMAC_OUTPUT_SIZE 20
+#define HMAC_BLOCK_SIZE 64
+
+static void __db_hmac __P((u_int8_t *, u_int8_t *, size_t, u_int8_t *));
+
+/*
+ * !!!
+ * All of these functions use a ctx structure on the stack. The __db_SHA1Init
+ * call does not initialize the 64-byte buffer portion of it. The
+ * underlying SHA1 functions will properly pad the buffer if the data length
+ * is less than 64-bytes, so there isn't a chance of reading uninitialized
+ * memory.  Although it would be cleaner to do a memset(ctx.buffer, 0, 64),
+ * we do not want to incur that performance penalty if we don't have to.
+ */
+
+/*
+ * __db_hmac --
+ * Do a hashed MAC.
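+ *	This is the standard HMAC construction (RFC 2104) instantiated
+ *	with SHA-1: H(K ^ opad, H(K ^ ipad, data)) over a 64-byte block.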
+ */
+static void
+__db_hmac(k, data, data_len, mac)
+ u_int8_t *k, *data, *mac;
+ size_t data_len;
+{
+ SHA1_CTX ctx;
+ u_int8_t key[HMAC_BLOCK_SIZE];
+ u_int8_t ipad[HMAC_BLOCK_SIZE];
+ u_int8_t opad[HMAC_BLOCK_SIZE];
+ u_int8_t tmp[HMAC_OUTPUT_SIZE];
+ int i;
+
+ memset(key, 0x00, HMAC_BLOCK_SIZE);
+ memset(ipad, 0x36, HMAC_BLOCK_SIZE);
+ memset(opad, 0x5C, HMAC_BLOCK_SIZE);
+
+ memcpy(key, k, HMAC_OUTPUT_SIZE);
+
+ for (i = 0; i < HMAC_BLOCK_SIZE; i++) {
+ ipad[i] ^= key[i];
+ opad[i] ^= key[i];
+ }
+
+ __db_SHA1Init(&ctx);
+ __db_SHA1Update(&ctx, ipad, HMAC_BLOCK_SIZE);
+ __db_SHA1Update(&ctx, data, data_len);
+ __db_SHA1Final(tmp, &ctx);
+ __db_SHA1Init(&ctx);
+ __db_SHA1Update(&ctx, opad, HMAC_BLOCK_SIZE);
+ __db_SHA1Update(&ctx, tmp, HMAC_OUTPUT_SIZE);
+ __db_SHA1Final(mac, &ctx);
+ return;
+}
+
+/*
+ * __db_chksum --
+ * Create a MAC/SHA1 checksum.
+ *
+ * PUBLIC: void __db_chksum __P((void *,
+ * PUBLIC: u_int8_t *, size_t, u_int8_t *, u_int8_t *));
+ */
+void
+__db_chksum(hdr, data, data_len, mac_key, store)
+ void *hdr;
+ u_int8_t *data;
+ size_t data_len;
+ u_int8_t *mac_key;
+ u_int8_t *store;
+{
+ int sumlen;
+ u_int32_t hash4;
+
+	/*
+	 * Since the checksum may sit on the very page of data we are
+	 * checksumming (and so be overwritten by the result), we zero out
+	 * the checksum value first so that a known value is there when we
+	 * later verify the checksum.
+	 * If we are passed a log header, XOR in the prev and len fields so
+	 * we have some redundancy on these fields.  Mostly we need to be
+	 * sure that we detect a race when doing hot backups and reading a
+	 * live log file.
+	 */
+ if (mac_key == NULL)
+ sumlen = sizeof(u_int32_t);
+ else
+ sumlen = DB_MAC_KEY;
+ if (hdr == NULL)
+ memset(store, 0, sumlen);
+ else
+		store = ((HDR *)hdr)->chksum;
+ if (mac_key == NULL) {
+ /* Just a hash, no MAC */
+ hash4 = __ham_func4(NULL, data, (u_int32_t)data_len);
+ if (hdr != NULL)
+ hash4 ^= ((HDR *)hdr)->prev ^ ((HDR *)hdr)->len;
+ memcpy(store, &hash4, sumlen);
+ } else {
+ __db_hmac(mac_key, data, data_len, store);
+		if (hdr != NULL) {
+ ((int *)store)[0] ^= ((HDR *)hdr)->prev;
+ ((int *)store)[1] ^= ((HDR *)hdr)->len;
+ }
+ }
+ return;
+}
+/*
+ * __db_derive_mac --
+ * Create a MAC/SHA1 key.
+ *
+ * PUBLIC: void __db_derive_mac __P((u_int8_t *, size_t, u_int8_t *));
+ */
+void
+__db_derive_mac(passwd, plen, mac_key)
+ u_int8_t *passwd;
+ size_t plen;
+ u_int8_t *mac_key;
+{
+ SHA1_CTX ctx;
+
+ /* Compute the MAC key. mac_key must be 20 bytes. */
+ __db_SHA1Init(&ctx);
+ __db_SHA1Update(&ctx, passwd, plen);
+ __db_SHA1Update(&ctx, (u_int8_t *)DB_MAC_MAGIC, strlen(DB_MAC_MAGIC));
+ __db_SHA1Update(&ctx, passwd, plen);
+ __db_SHA1Final(mac_key, &ctx);
+
+ return;
+}
+
+/*
+ * __db_check_chksum --
+ * Verify a checksum.
+ *
+ * Return 0 on success, >0 (errno) on error, -1 on checksum mismatch.
+ *
+ * PUBLIC: int __db_check_chksum __P((ENV *,
+ * PUBLIC: void *, DB_CIPHER *, u_int8_t *, void *, size_t, int));
+ */
+int
+__db_check_chksum(env, hdr, db_cipher, chksum, data, data_len, is_hmac)
+ ENV *env;
+ void *hdr;
+ DB_CIPHER *db_cipher;
+ u_int8_t *chksum;
+ void *data;
+ size_t data_len;
+ int is_hmac;
+{
+ int ret;
+ size_t sum_len;
+ u_int32_t hash4;
+ u_int8_t *mac_key, old[DB_MAC_KEY], new[DB_MAC_KEY];
+
+ /*
+ * If we are just doing checksumming and not encryption, then checksum
+ * is 4 bytes. Otherwise, it is DB_MAC_KEY size. Check for illegal
+ * combinations of crypto/non-crypto checksums.
+ */
+ if (is_hmac == 0) {
+ if (db_cipher != NULL) {
+ __db_errx(env, DB_STR("0195",
+ "Unencrypted checksum with a supplied encryption key"));
+ return (EINVAL);
+ }
+ sum_len = sizeof(u_int32_t);
+ mac_key = NULL;
+ } else {
+ if (db_cipher == NULL) {
+ __db_errx(env, DB_STR("0196",
+ "Encrypted checksum: no encryption key specified"));
+ return (EINVAL);
+ }
+ sum_len = DB_MAC_KEY;
+ mac_key = db_cipher->mac_key;
+ }
+
+ /*
+ * !!!
+ * Since the checksum might be on the page, we need to have known data
+ * there so that we can generate the same original checksum. We zero
+ * it out, just like we do in __db_chksum above.
+ * If there is a log header, XOR the prev and len fields.
+ */
+ if (hdr == NULL) {
+ memcpy(old, chksum, sum_len);
+ memset(chksum, 0, sum_len);
+ chksum = old;
+ }
+
+ if (mac_key == NULL) {
+ /* Just a hash, no MAC */
+ hash4 = __ham_func4(NULL, data, (u_int32_t)data_len);
+ if (hdr != NULL)
+ LOG_HDR_SUM(0, hdr, &hash4);
+ ret = memcmp((u_int32_t *)chksum, &hash4, sum_len) ? -1 : 0;
+ } else {
+ __db_hmac(mac_key, data, data_len, new);
+ if (hdr != NULL)
+ LOG_HDR_SUM(1, hdr, new);
+ ret = memcmp(chksum, new, sum_len) ? -1 : 0;
+ }
+
+ return (ret);
+}
diff --git a/src/hmac/sha1.c b/src/hmac/sha1.c
new file mode 100644
index 00000000..76069694
--- /dev/null
+++ b/src/hmac/sha1.c
@@ -0,0 +1,289 @@
+/*
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/hmac.h"
+
+/*
+SHA-1 in C
+By Steve Reid <sreid@sea-to-sky.net>
+100% Public Domain
+
+-----------------
+Modified 7/98
+By James H. Brown <jbrown@burgoyne.com>
+Still 100% Public Domain
+
+Corrected a problem which generated improper hash values on 16 bit machines.
+Routine SHA1Update changed from
+	void SHA1Update(SHA1_CTX* context, unsigned char* data, unsigned int len)
+to
+	void SHA1Update(SHA1_CTX* context, unsigned char* data, unsigned long len)
+
+The 'len' parameter was declared an int which works fine on 32 bit machines.
+However, on 16 bit machines an int is too small for the shifts being done
+against it.  This caused the hash function to generate incorrect values if
+len was greater than 8191 (8K - 1) due to the 'len << 3' on line 3 of
+SHA1Update().
+
+Since the file IO in main() reads 16K at a time, any file 8K or larger would
+be guaranteed to generate the wrong hash (e.g. Test Vector #3, a million
+"a"s).
+
+I also changed the declaration of variables i & j in SHA1Update to
+unsigned long from unsigned int for the same reason.
+
+These changes should make no difference to any 32 bit implementations since
+an int and a long are the same size in those environments.
+
+--
+I also corrected a few compiler warnings generated by Borland C.
+1. Added #include <process.h> for exit() prototype
+2. Removed unused variable 'j' in SHA1Final
+3. Changed exit(0) to return (0) at end of main.
+
+ALL changes I made can be located by searching for comments containing 'JHB'
+-----------------
+Modified 8/98
+By Steve Reid <sreid@sea-to-sky.net>
+Still 100% public domain
+
+1- Removed #include <process.h> and used return () instead of exit()
+2- Fixed overwriting of finalcount in SHA1Final() (discovered by Chris Hall)
+3- Changed email address from steve@edmweb.com to sreid@sea-to-sky.net
+
+-----------------
+Modified 4/01
+By Saul Kravitz <Saul.Kravitz@celera.com>
+Still 100% PD
+Modified to run on Compaq Alpha hardware.
+
+*/
+
+/*
+Test Vectors (from FIPS PUB 180-1)
+"abc"
+ A9993E36 4706816A BA3E2571 7850C26C 9CD0D89D
+"abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq"
+ 84983E44 1C3BD26E BAAE4AA1 F95129E5 E54670F1
+A million repetitions of "a"
+ 34AA973C D4C4DAA4 F61EEB2B DBAD2731 6534016F
+*/
+
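+/*
+ * A quick caller-side self-check against the first vector above might look
+ * like this (a sketch, not part of this file):
+ *
+ *	SHA1_CTX ctx;
+ *	unsigned char digest[20];
+ *
+ *	__db_SHA1Init(&ctx);
+ *	__db_SHA1Update(&ctx, (unsigned char *)"abc", 3);
+ *	__db_SHA1Final(digest, &ctx);
+ *
+ * After which digest holds A9993E36 4706816A BA3E2571 7850C26C 9CD0D89D.
+ */
+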
+#define SHA1HANDSOFF
+
+/* #include <process.h> */ /* prototype for exit() - JHB */
+/* Using return () instead of exit() - SWR */
+
+#define rol(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits))))
+
+/* blk0() and blk() perform the initial expand. */
+/* I got the idea of expanding during the round function from SSLeay */
+#define blk0(i) is_bigendian ? block->l[i] : \
+ (block->l[i] = (rol(block->l[i],24)&0xFF00FF00) \
+ |(rol(block->l[i],8)&0x00FF00FF))
+#define blk(i) (block->l[i&15] = rol(block->l[(i+13)&15]^block->l[(i+8)&15] \
+ ^block->l[(i+2)&15]^block->l[i&15],1))
+
+/* (R0+R1), R2, R3, R4 are the different operations used in SHA1 */
+#define R0(v,w,x,y,z,i) z+=((w&(x^y))^y)+blk0(i)+0x5A827999+rol(v,5); \
+ w=rol(w,30);
+#define R1(v,w,x,y,z,i) z+=((w&(x^y))^y)+blk(i)+0x5A827999+rol(v,5); \
+ w=rol(w,30);
+#define R2(v,w,x,y,z,i) z+=(w^x^y)+blk(i)+0x6ED9EBA1+rol(v,5);w=rol(w,30);
+#define R3(v,w,x,y,z,i) z+=(((w|x)&y)|(w&x))+blk(i)+0x8F1BBCDC+rol(v,5); \
+ w=rol(w,30);
+#define R4(v,w,x,y,z,i) z+=(w^x^y)+blk(i)+0xCA62C1D6+rol(v,5);w=rol(w,30);
+
+#ifdef VERBOSE /* SAK */
+static void __db_SHAPrintContext __P((SHA1_CTX *, char *));
+static void
+__db_SHAPrintContext(context, msg)
+ SHA1_CTX *context;
+ char *msg;
+{
+ printf("%s (%d,%d) %x %x %x %x %x\n",
+ msg,
+ context->count[0], context->count[1],
+ context->state[0],
+ context->state[1],
+ context->state[2],
+ context->state[3],
+ context->state[4]);
+}
+#endif
+
+/* Hash a single 512-bit block. This is the core of the algorithm. */
+
+/*
+ * __db_SHA1Transform --
+ *
+ * PUBLIC: void __db_SHA1Transform __P((u_int32_t *, unsigned char *));
+ */
+void
+__db_SHA1Transform(state, buffer)
+ u_int32_t *state;
+ unsigned char *buffer;
+{
+	u_int32_t a, b, c, d, e;
+	typedef union {
+		unsigned char c[64];
+		u_int32_t l[16];
+	} CHAR64LONG16;
+	CHAR64LONG16 *block;
+	int is_bigendian;
+#ifdef SHA1HANDSOFF
+	unsigned char workspace[64];
+
+	block = (CHAR64LONG16 *)workspace;
+	memcpy(block, buffer, 64);
+#else
+	block = (CHAR64LONG16 *)buffer;
+#endif
+ is_bigendian = __db_isbigendian();
+
+ /* Copy context->state[] to working vars */
+ a = state[0];
+ b = state[1];
+ c = state[2];
+ d = state[3];
+ e = state[4];
+ /* 4 rounds of 20 operations each. Loop unrolled. */
+ R0(a,b,c,d,e, 0); R0(e,a,b,c,d, 1); R0(d,e,a,b,c, 2); R0(c,d,e,a,b, 3);
+ R0(b,c,d,e,a, 4); R0(a,b,c,d,e, 5); R0(e,a,b,c,d, 6); R0(d,e,a,b,c, 7);
+ R0(c,d,e,a,b, 8); R0(b,c,d,e,a, 9); R0(a,b,c,d,e,10); R0(e,a,b,c,d,11);
+ R0(d,e,a,b,c,12); R0(c,d,e,a,b,13); R0(b,c,d,e,a,14); R0(a,b,c,d,e,15);
+ R1(e,a,b,c,d,16); R1(d,e,a,b,c,17); R1(c,d,e,a,b,18); R1(b,c,d,e,a,19);
+ R2(a,b,c,d,e,20); R2(e,a,b,c,d,21); R2(d,e,a,b,c,22); R2(c,d,e,a,b,23);
+ R2(b,c,d,e,a,24); R2(a,b,c,d,e,25); R2(e,a,b,c,d,26); R2(d,e,a,b,c,27);
+ R2(c,d,e,a,b,28); R2(b,c,d,e,a,29); R2(a,b,c,d,e,30); R2(e,a,b,c,d,31);
+ R2(d,e,a,b,c,32); R2(c,d,e,a,b,33); R2(b,c,d,e,a,34); R2(a,b,c,d,e,35);
+ R2(e,a,b,c,d,36); R2(d,e,a,b,c,37); R2(c,d,e,a,b,38); R2(b,c,d,e,a,39);
+ R3(a,b,c,d,e,40); R3(e,a,b,c,d,41); R3(d,e,a,b,c,42); R3(c,d,e,a,b,43);
+ R3(b,c,d,e,a,44); R3(a,b,c,d,e,45); R3(e,a,b,c,d,46); R3(d,e,a,b,c,47);
+ R3(c,d,e,a,b,48); R3(b,c,d,e,a,49); R3(a,b,c,d,e,50); R3(e,a,b,c,d,51);
+ R3(d,e,a,b,c,52); R3(c,d,e,a,b,53); R3(b,c,d,e,a,54); R3(a,b,c,d,e,55);
+ R3(e,a,b,c,d,56); R3(d,e,a,b,c,57); R3(c,d,e,a,b,58); R3(b,c,d,e,a,59);
+ R4(a,b,c,d,e,60); R4(e,a,b,c,d,61); R4(d,e,a,b,c,62); R4(c,d,e,a,b,63);
+ R4(b,c,d,e,a,64); R4(a,b,c,d,e,65); R4(e,a,b,c,d,66); R4(d,e,a,b,c,67);
+ R4(c,d,e,a,b,68); R4(b,c,d,e,a,69); R4(a,b,c,d,e,70); R4(e,a,b,c,d,71);
+ R4(d,e,a,b,c,72); R4(c,d,e,a,b,73); R4(b,c,d,e,a,74); R4(a,b,c,d,e,75);
+ R4(e,a,b,c,d,76); R4(d,e,a,b,c,77); R4(c,d,e,a,b,78); R4(b,c,d,e,a,79);
+ /* Add the working vars back into context.state[] */
+ state[0] += a;
+ state[1] += b;
+ state[2] += c;
+ state[3] += d;
+ state[4] += e;
+ /* Wipe variables */
+ a = b = c = d = e = 0;
+}
+
+/* SHA1Init - Initialize new context */
+
+/*
+ * __db_SHA1Init --
+ * Initialize new context
+ *
+ * PUBLIC: void __db_SHA1Init __P((SHA1_CTX *));
+ */
+void
+__db_SHA1Init(context)
+ SHA1_CTX *context;
+{
+ /* SHA1 initialization constants */
+ context->state[0] = 0x67452301;
+ context->state[1] = 0xEFCDAB89;
+ context->state[2] = 0x98BADCFE;
+ context->state[3] = 0x10325476;
+ context->state[4] = 0xC3D2E1F0;
+ context->count[0] = context->count[1] = 0;
+}
+
+/* Run your data through this. */
+
+/*
+ * __db_SHA1Update --
+ * Run your data through this.
+ *
+ * PUBLIC: void __db_SHA1Update __P((SHA1_CTX *, unsigned char *,
+ * PUBLIC: size_t));
+ */
+void
+__db_SHA1Update(context, data, len)
+ SHA1_CTX *context;
+ unsigned char *data;
+ size_t len;
+{
+	u_int32_t i, j;	/* JHB */
+
+#ifdef VERBOSE
+ __db_SHAPrintContext(context, DB_STR_P("before"));
+#endif
+ j = (context->count[0] >> 3) & 63;
+ if ((context->count[0] += (u_int32_t)len << 3) < (len << 3))
+ context->count[1]++;
+ context->count[1] += (u_int32_t)(len >> 29);
+ if ((j + len) > 63) {
+ memcpy(&context->buffer[j], data, (i = 64-j));
+ __db_SHA1Transform(context->state, context->buffer);
+ for ( ; i + 63 < len; i += 64) {
+ __db_SHA1Transform(context->state, &data[i]);
+ }
+ j = 0;
+	} else
+		i = 0;
+ memcpy(&context->buffer[j], &data[i], len - i);
+#ifdef VERBOSE
+ __db_SHAPrintContext(context, DB_STR_P("after "));
+#endif
+}
+
+/* Add padding and return the message digest. */
+
+/*
+ * __db_SHA1Final --
+ * Add padding and return the message digest.
+ *
+ * PUBLIC: void __db_SHA1Final __P((unsigned char *, SHA1_CTX *));
+ */
+void
+__db_SHA1Final(digest, context)
+ unsigned char *digest;
+ SHA1_CTX *context;
+{
+	u_int32_t i;	/* JHB */
+	unsigned char finalcount[8];
+
+ for (i = 0; i < 8; i++) {
+ finalcount[i] = (unsigned char)((context->count[(i >= 4 ? 0 : 1)]
+ >> ((3-(i & 3)) * 8) ) & 255); /* Endian independent */
+ }
+ __db_SHA1Update(context, (unsigned char *)"\200", 1);
+ while ((context->count[0] & 504) != 448) {
+ __db_SHA1Update(context, (unsigned char *)"\0", 1);
+ }
+	/* Should cause a SHA1Transform(). */
+	__db_SHA1Update(context, finalcount, 8);
+ for (i = 0; i < 20; i++) {
+ digest[i] = (unsigned char)
+ ((context->state[i>>2] >> ((3-(i & 3)) * 8) ) & 255);
+ }
+ /* Wipe variables */
+ i = 0; /* JHB */
+ memset(context->buffer, 0, 64);
+ memset(context->state, 0, 20);
+ memset(context->count, 0, 8);
+ memset(finalcount, 0, 8); /* SWR */
+#ifdef SHA1HANDSOFF /* make SHA1Transform overwrite its own static vars */
+ __db_SHA1Transform(context->state, context->buffer);
+#endif
+}
+
+/*************************************************************/
diff --git a/src/lock/Design b/src/lock/Design
new file mode 100644
index 00000000..f82bc7e8
--- /dev/null
+++ b/src/lock/Design
@@ -0,0 +1,301 @@
+Synchronization in the Locking Subsystem
+
+This document describes how we implemented fine-grain locking
+in the lock manager (that is, locking on a hash bucket level instead of
+locking the entire region). We found that the increase in concurrency
+was not sufficient to warrant the increase in complexity or the additional
+cost of performing each lock operation. Therefore, we don't use this
+any more. Should we have to do fine-grain locking in a future release,
+this would be a reasonable starting point.
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+1. Data structures
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+
+The lock manager maintains three different structures:
+
+Objects (__db_lockobj):
+ Describes an object that is locked. When used with DB, this consists
+ of a __db_ilock (a file identifier and a page number).
+
+Lockers (__db_locker):
+ Identifies a specific locker ID and maintains the head of a list of
+	locks held by a locker (for use during transaction commit/abort).
+
+Locks (__db_lock):
+ Describes a particular object lock held on behalf of a particular
+	locker ID.
+
+Objects and Lockers reference Locks.
+
+These structures are organized via two synchronized hash tables. Each
+hash table consists of two physical arrays: the array of actual hash
+buckets and an array of mutexes so we can lock individual buckets, rather
+than the whole table.
+
+One hash table contains Objects and the other hash table contains Lockers.
+Objects contain two lists of Locks, waiters and holders: holders currently
+hold a lock on the Object, waiters are Locks waiting to be granted.
+Each Locker heads a singly linked list connecting the Locks held on behalf
+of that specific locker ID.
+
+In the diagram below:
+
+Locker ID #1 holds a lock on Object #1 (L1) and Object #2 (L5), and is
+waiting on a lock on Object #1 (L3).
+
+Locker ID #2 holds a lock on Object #1 (L2) and is waiting on a lock for
+Object #2 (L7).
+
+Locker ID #3 is waiting for a lock on Object #2 (L6).
+
+ OBJECT -----------------------
+ HASH | |
+ ----|------------- |
+ ________ _______ | | ________ | |
+ | |-->| O1 |--|---|-->| O2 | | |
+ |_______| |_____| | | |______| V |
+ | | W H--->L1->L2 W H--->L5 | holders
+ |_______| | | | | V
+ | | ------->L3 \ ------->L6------>L7 waiters
+ |_______| / \ \
+ . . / \ \
+ . . | \ \
+ . . | \ -----------
+ |_______| | -------------- |
+ | | ____|____ ___|_____ _|______
+ |_______| | | | | | |
+ | | | LID1 | | LID2 | | LID3 |
+ |_______| |_______| |_______| |______|
+ ^ ^ ^
+ | | |
+ ___|________________________|________|___
+ LOCKER | | | | | | | | |
+ HASH | | | | | | | | |
+ | | | | | | | | |
+ |____|____|____|____|____|____|____|____|
+
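+In rough C terms, the structures above look something like the following
+(an abbreviated sketch for this document, not the shipped declarations;
+only the fields discussed here are shown):
+
+	typedef struct __db_lockobj {		/* A locked object. */
+		SH_DBT	lockobj;		/* Identifying data. */
+		SH_TAILQ_HEAD(...) holders;	/* Granted Locks. */
+		SH_TAILQ_HEAD(...) waiters;	/* Blocked Locks. */
+	} DB_LOCKOBJ;
+
+	typedef struct __db_locker {		/* A locker ID. */
+		u_int32_t id;
+		SH_LIST_HEAD(...) heldby;	/* Locks held by this ID. */
+	} DB_LOCKER;
+
+	struct __db_lock {			/* A single lock. */
+		roff_t	holder;			/* Owning Locker. */
+		roff_t	obj;			/* Locked Object. */
+		db_lockmode_t mode;		/* Requested/granted mode. */
+		SH_TAILQ_ENTRY links;		/* Object holder/waiter list. */
+		SH_LIST_ENTRY locker_links;	/* Locker's list of Locks. */
+	};
+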
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+2. Synchronization
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+
+There are four types of mutexes in the subsystem.
+
+Object mutexes:
+ These map one-to-one to each bucket in the Object hash table.
+ Holding a mutex on an Object bucket secures all the Objects in
+ that bucket as well as the Lock structures linked from those
+ Objects. All fields in the Locks EXCEPT the Locker links (the
+ links that attach Locks by Locker ID) are protected by these
+ mutexes.
+
+Locker mutexes:
+ These map one-to-one to each bucket in the Locker hash table.
+ Holding a mutex on a Locker bucket secures the Locker structures
+ and the Locker links in the Locks.
+
+Memory mutex:
+	This mutex allows calls to allocate/free memory, i.e., calls to
+ __db_shalloc and __db_shalloc_free, as well as manipulation of
+ the Object, Locker and Lock free lists.
+
+Region mutex:
+ This mutex is currently only used to protect the locker ids.
+ It may also be needed later to provide exclusive access to
+ the region for deadlock detection.
+
+Creating or removing a Lock requires locking both the Object lock and the
+Locker lock (and eventually the shalloc lock to return the item to the
+free list).
+
+The locking hierarchy is as follows:
+
+ The Region mutex may never be acquired after any other mutex.
+
+ The Object mutex may be acquired after the Region mutex.
+
+ The Locker mutex may be acquired after the Region and Object
+ mutexes.
+
+ The Memory mutex may be acquired after any mutex.
+
+So, if both an Object mutex and a Locker mutex are going to be acquired,
+the Object mutex must be acquired first.
+
+The Memory mutex may be acquired after any other mutex, but no other mutexes
+can be acquired once the Memory mutex is held.
+
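+For example, the create-lock path obeys the hierarchy as follows (pseudo-C;
+the mutex names are illustrative, not shipped identifiers):
+
+	MUTEX_LOCK(obj_bucket_mtx);	/* Object before Locker. */
+	MUTEX_LOCK(locker_bucket_mtx);
+	MUTEX_LOCK(memory_mtx);		/* Memory may be acquired last... */
+	lock = take_lock_off_free_list();
+	MUTEX_UNLOCK(memory_mtx);	/* ...but nothing follows it. */
+	link_lock_into_object_and_locker(lock);
+	MUTEX_UNLOCK(locker_bucket_mtx);
+	MUTEX_UNLOCK(obj_bucket_mtx);
+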
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+3. The algorithms:
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+The locking subsystem supports four basic operations:
+ Get a Lock (lock_get)
+
+ Release a Lock (lock_put)
+
+ Release all the Locks on a specific Object (lock_vec)
+
+ Release all the Locks for a specific Locker (lock_vec)
+
+Get a lock:
+ Acquire Object bucket mutex.
+ Acquire Locker bucket mutex.
+
+ Acquire Memory mutex.
+ If the Object does not exist
+ Take an Object off the freelist.
+ If the Locker doesn't exist
+ Take a Locker off the freelist.
+ Take a Lock off the free list.
+ Release Memory mutex.
+
+ Add Lock to the Object list.
+ Add Lock to the Locker list.
+ Release Locker bucket mutex
+
+ If the lock cannot be granted
+ Release Object bucket mutex
+ Acquire lock mutex (blocks)
+
+ Acquire Object bucket mutex
+	If lock acquisition did not succeed (e.g., deadlock)
+ Acquire Locker bucket mutex
+ If locker should be destroyed
+ Remove locker from hash table
+ Acquire Memory mutex
+ Return locker to free list
+ Release Memory mutex
+ Release Locker bucket mutex
+
+ If object should be released
+ Acquire Memory mutex
+ Return object to free list
+ Release Memory mutex
+
+ Release Object bucket mutex
+
+Release a lock:
+ Acquire Object bucket mutex.
+ (Requires that we be able to find the Object hash bucket
+ without looking inside the Lock itself.)
+
+	If releasing a single lock and the user-provided generation number
+ doesn't match the Lock's generation number, the Lock has been reused
+ and we return failure.
+
+ Enter lock_put_internal:
+ if the Lock is still on the Object's lists:
+ Increment Lock's generation number.
+ Remove Lock from the Object's list (NULL link fields).
+ Promote locks for the Object.
+
+ Enter locker_list_removal
+ Acquire Locker bucket mutex.
+ If Locker doesn't exist:
+ Release Locker bucket mutex
+ Release Object bucket mutex
+ Return error.
+ Else if Locker marked as deleted:
+ dont_release = TRUE
+ Else
+ Remove Lock from Locker list.
+ If Locker has no more locks
+ Remove Locker from table.
+ Acquire Memory mutex.
+ Return Locker to free list
+ Release Memory mutex
+ Release Locker bucket mutex.
+ Exit locker_list_removal
+
+ If (!dont_release)
+ Acquire Memory mutex
+ Return Lock to free list
+ Release Memory mutex
+
+ Exit lock_put_internal
+
+ Release Object bucket mutex
+
+Release all the Locks on a specific Object (lock_vec, DB_PUT_ALL_OBJ):
+
+ Acquire Object bucket mutex.
+
+ For each lock on the waiter list:
+ lock_put_internal
+ For each lock on the holder list:
+ lock_put_internal
+
+ Release Object bucket mutex.
+
+Release all the Locks for a specific Locker (lock_vec, DB_PUT_ALL):
+
+ Acquire Locker bucket mutex.
+ Mark Locker deleted.
+ Release Locker mutex.
+
+ For each lock on the Locker's list:
+ Remove from locker's list
+ (The lock could get put back on the free list in
+ lock_put and then could get reallocated and the
+ act of setting its locker links could clobber us.)
+ Perform "Release a Lock" above: skip locker_list_removal.
+
+ Acquire Locker bucket mutex.
+ Remove Locker
+ Release Locker mutex.
+
+ Acquire Memory mutex
+ Return Locker to free list
+ Release Memory mutex
+
+Deadlock detection (lock_detect):
+
+ For each bucket in Object table
+ Acquire the Object bucket mutex.
+ create waitsfor
+
+ For each bucket in Object table
+ Release the Object mutex.
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+FAQ:
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+Q: Why do you need generation numbers?
+A: If a lock has been released due to a transaction abort (potentially in a
+   different process), and then the lock is released by a thread of control
+   unaware of the abort, the lock might have been re-allocated
+ to a different object. The generation numbers detect this problem.
+
+   Note that we assume reads/writes of lock generation numbers are atomic;
+   if they are not, it is theoretically possible that a re-allocated lock
+ could be mistaken for another lock.
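+
+   The guard itself is a simple comparison; a sketch of the check as it
+   appears on the lock_put path in lock.c:
+
+	lockp = R_ADDR(&lt->reginfo, lock->off);
+	if (lock->gen != lockp->gen)
+		return (EINVAL);	/* The lock was reused; fail the put. */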
+
+Q: Why is it safe to walk the Locker list without holding any mutexes at
+ all?
+A: Locks are created with both the Object and Locker bucket mutexes held.
+   Once created, they are removed in two ways:
+
+ a) when a specific Lock is released, in which case, the Object and
+ Locker bucket mutexes are again held, and
+
+   b) when all Locks for a specific Locker ID are released.
+
+ In case b), the Locker bucket mutex is held while the Locker chain is
+ marked as "destroyed", which blocks any further access to the Locker
+ chain. Then, each individual Object bucket mutex is acquired when each
+ individual Lock is removed.
+
+Q: What are the implications of doing fine grain locking?
+
+A: Since we no longer globally lock the entire region, lock_vec will no
+ longer be atomic. We still execute the items in a lock_vec in order,
+ so things like lock-coupling still work, but you can't make any
+ guarantees about atomicity.
+
+Q: How do I configure for FINE_GRAIN locking?
+
+A: We currently do not support any automatic configuration for FINE_GRAIN
+   locking.  When we do, we will need to document the atomicity discussion
+   above (it is bug-report #553).
+
+Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved.
diff --git a/src/lock/lock.c b/src/lock/lock.c
new file mode 100644
index 00000000..e4627734
--- /dev/null
+++ b/src/lock/lock.c
@@ -0,0 +1,2020 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+
+static int __lock_allocobj __P((DB_LOCKTAB *, u_int32_t));
+static int __lock_alloclock __P((DB_LOCKTAB *, u_int32_t));
+static int __lock_freelock __P((DB_LOCKTAB *,
+ struct __db_lock *, DB_LOCKER *, u_int32_t));
+static int __lock_getobj
+ __P((DB_LOCKTAB *, const DBT *, u_int32_t, int, DB_LOCKOBJ **));
+static int __lock_get_api __P((ENV *,
+ u_int32_t, u_int32_t, const DBT *, db_lockmode_t, DB_LOCK *));
+static int __lock_inherit_locks __P((DB_LOCKTAB *, DB_LOCKER *, u_int32_t));
+static int __lock_same_family __P((DB_LOCKTAB *, DB_LOCKER *, DB_LOCKER *));
+static int __lock_put_internal __P((DB_LOCKTAB *,
+ struct __db_lock *, u_int32_t, u_int32_t));
+static int __lock_put_nolock __P((ENV *, DB_LOCK *, int *, u_int32_t));
+static int __lock_remove_waiter __P((DB_LOCKTAB *,
+ DB_LOCKOBJ *, struct __db_lock *, db_status_t));
+static int __lock_trade __P((ENV *, DB_LOCK *, DB_LOCKER *));
+static int __lock_vec_api __P((ENV *,
+ u_int32_t, u_int32_t, DB_LOCKREQ *, int, DB_LOCKREQ **));
+
+static const char __db_lock_invalid[] = "%s: Lock is no longer valid";
+static const char __db_locker_invalid[] = "Locker is not valid";
+
+#ifdef DEBUG
+extern void __db_loadme (void);
+#endif
+
+/*
+ * __lock_vec_pp --
+ * ENV->lock_vec pre/post processing.
+ *
+ * PUBLIC: int __lock_vec_pp __P((DB_ENV *,
+ * PUBLIC: u_int32_t, u_int32_t, DB_LOCKREQ *, int, DB_LOCKREQ **));
+ */
+int
+__lock_vec_pp(dbenv, lid, flags, list, nlist, elistp)
+ DB_ENV *dbenv;
+ u_int32_t lid, flags;
+ int nlist;
+ DB_LOCKREQ *list, **elistp;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->lk_handle, "DB_ENV->lock_vec", DB_INIT_LOCK);
+
+ /* Validate arguments. */
+ if ((ret = __db_fchk(env,
+ "DB_ENV->lock_vec", flags, DB_LOCK_NOWAIT)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env,
+ (__lock_vec_api(env, lid, flags, list, nlist, elistp)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+static int
+__lock_vec_api(env, lid, flags, list, nlist, elistp)
+ ENV *env;
+ u_int32_t lid, flags;
+ int nlist;
+ DB_LOCKREQ *list, **elistp;
+{
+ DB_LOCKER *sh_locker;
+ int ret;
+
+ if ((ret =
+ __lock_getlocker(env->lk_handle, lid, 0, &sh_locker)) == 0)
+ ret = __lock_vec(env, sh_locker, flags, list, nlist, elistp);
+ return (ret);
+}
+
+/*
+ * __lock_vec --
+ * ENV->lock_vec.
+ *
+ * Vector lock routine. This function takes a set of operations
+ * and performs them all at once. In addition, lock_vec provides
+ * functionality for lock inheritance, releasing all locks for a
+ * given locker (used during transaction commit/abort), releasing
+ * all locks on a given object, and generating debugging information.
+ *
+ * PUBLIC: int __lock_vec __P((ENV *,
+ * PUBLIC: DB_LOCKER *, u_int32_t, DB_LOCKREQ *, int, DB_LOCKREQ **));
+ */
+int
+__lock_vec(env, sh_locker, flags, list, nlist, elistp)
+ ENV *env;
+ DB_LOCKER *sh_locker;
+ u_int32_t flags;
+ int nlist;
+ DB_LOCKREQ *list, **elistp;
+{
+ struct __db_lock *lp, *next_lock;
+	DB_LOCK lock;
+	DB_LOCKOBJ *sh_obj;
+ DB_LOCKREGION *region;
+ DB_LOCKTAB *lt;
+ DBT *objlist, *np;
+ u_int32_t ndx;
+ int did_abort, i, ret, run_dd, upgrade, writes;
+
+ /* Check if locks have been globally turned off. */
+ if (F_ISSET(env->dbenv, DB_ENV_NOLOCKING))
+ return (0);
+
+ lt = env->lk_handle;
+ region = lt->reginfo.primary;
+
+ run_dd = 0;
+ LOCK_SYSTEM_LOCK(lt, region);
+ for (i = 0, ret = 0; i < nlist && ret == 0; i++)
+ switch (list[i].op) {
+ case DB_LOCK_GET_TIMEOUT:
+ LF_SET(DB_LOCK_SET_TIMEOUT);
+ /* FALLTHROUGH */
+ case DB_LOCK_GET:
+ if (IS_RECOVERING(env)) {
+ LOCK_INIT(list[i].lock);
+ break;
+ }
+ ret = __lock_get_internal(lt,
+ sh_locker, flags, list[i].obj,
+ list[i].mode, list[i].timeout, &list[i].lock);
+ break;
+ case DB_LOCK_INHERIT:
+ ret = __lock_inherit_locks(lt, sh_locker, flags);
+ break;
+ case DB_LOCK_PUT:
+ ret = __lock_put_nolock(env,
+ &list[i].lock, &run_dd, flags);
+ break;
+ case DB_LOCK_PUT_ALL: /* Put all locks. */
+ case DB_LOCK_PUT_READ: /* Put read locks. */
+ case DB_LOCK_UPGRADE_WRITE:
+ /* Upgrade was_write and put read locks. */
+ /*
+ * Since the locker may hold no
+ * locks (i.e., you could call abort before you've
+ * done any work), it's perfectly reasonable for there
+ * to be no locker; this is not an error.
+ */
+ if (sh_locker == NULL)
+ /*
+ * If ret is set, then we'll generate an
+ * error. If it's not set, we have nothing
+ * to do.
+ */
+ break;
+ upgrade = 0;
+ writes = 1;
+ if (list[i].op == DB_LOCK_PUT_READ)
+ writes = 0;
+ else if (list[i].op == DB_LOCK_UPGRADE_WRITE) {
+ if (F_ISSET(sh_locker, DB_LOCKER_DIRTY))
+ upgrade = 1;
+ writes = 0;
+ }
+ objlist = list[i].obj;
+ if (objlist != NULL) {
+ /*
+ * We know these should be ilocks,
+ * but they could be something else,
+ * so allocate room for the size too.
+ */
+ objlist->size =
+ sh_locker->nwrites * sizeof(DBT);
+ if ((ret = __os_malloc(env,
+ objlist->size, &objlist->data)) != 0)
+ goto up_done;
+ memset(objlist->data, 0, objlist->size);
+ np = (DBT *) objlist->data;
+ } else
+ np = NULL;
+
+ /* Now traverse the locks, releasing each one. */
+ for (lp = SH_LIST_FIRST(&sh_locker->heldby, __db_lock);
+ lp != NULL; lp = next_lock) {
+ sh_obj = SH_OFF_TO_PTR(lp, lp->obj, DB_LOCKOBJ);
+ next_lock = SH_LIST_NEXT(lp,
+ locker_links, __db_lock);
+ if (writes == 1 ||
+ lp->mode == DB_LOCK_READ ||
+ lp->mode == DB_LOCK_READ_UNCOMMITTED) {
+ SH_LIST_REMOVE(lp,
+ locker_links, __db_lock);
+ sh_obj = SH_OFF_TO_PTR(lp,
+ lp->obj, DB_LOCKOBJ);
+ ndx = sh_obj->indx;
+ OBJECT_LOCK_NDX(lt, region, ndx);
+ /*
+ * We are not letting lock_put_internal
+ * unlink the lock, so we'll have to
+ * update counts here.
+ */
+ if (lp->status == DB_LSTAT_HELD) {
+ DB_ASSERT(env,
+ sh_locker->nlocks != 0);
+ sh_locker->nlocks--;
+ if (IS_WRITELOCK(lp->mode))
+ sh_locker->nwrites--;
+ }
+ ret = __lock_put_internal(lt, lp,
+ sh_obj->indx,
+ DB_LOCK_FREE | DB_LOCK_DOALL);
+ OBJECT_UNLOCK(lt, region, ndx);
+ if (ret != 0)
+ break;
+ continue;
+ }
+ if (objlist != NULL) {
+ DB_ASSERT(env, (u_int8_t *)np <
+ (u_int8_t *)objlist->data +
+ objlist->size);
+ np->data = SH_DBT_PTR(&sh_obj->lockobj);
+ np->size = sh_obj->lockobj.size;
+ np++;
+ }
+ }
+ if (ret != 0)
+ goto up_done;
+
+ if (objlist != NULL)
+ if ((ret = __lock_fix_list(env,
+ objlist, sh_locker->nwrites)) != 0)
+ goto up_done;
+ switch (list[i].op) {
+ case DB_LOCK_UPGRADE_WRITE:
+ /*
+ * Upgrade all WWRITE locks to WRITE so
+ * that we can abort a transaction which
+ * was supporting dirty readers.
+ */
+ if (upgrade != 1)
+ goto up_done;
+ SH_LIST_FOREACH(lp, &sh_locker->heldby,
+ locker_links, __db_lock) {
+ if (lp->mode != DB_LOCK_WWRITE)
+ continue;
+ lock.off = R_OFFSET(&lt->reginfo, lp);
+ lock.gen = lp->gen;
+ F_SET(sh_locker, DB_LOCKER_INABORT);
+ if ((ret = __lock_get_internal(lt,
+ sh_locker, flags | DB_LOCK_UPGRADE,
+ NULL, DB_LOCK_WRITE, 0, &lock)) !=0)
+ break;
+ }
+ up_done:
+ /* FALLTHROUGH */
+ case DB_LOCK_PUT_READ:
+ case DB_LOCK_PUT_ALL:
+ break;
+ default:
+ break;
+ }
+ break;
+ case DB_LOCK_PUT_OBJ:
+ /* Remove all the locks associated with an object. */
+ OBJECT_LOCK(lt, region, list[i].obj, ndx);
+ if ((ret = __lock_getobj(lt, list[i].obj,
+ ndx, 0, &sh_obj)) != 0 || sh_obj == NULL) {
+ if (ret == 0)
+ ret = EINVAL;
+ OBJECT_UNLOCK(lt, region, ndx);
+ break;
+ }
+
+ /*
+ * Go through both waiters and holders. Don't bother
+ * to run promotion, because everyone is getting
+ * released. The processes waiting will still get
+ * awakened as their waiters are released.
+ */
+ for (lp = SH_TAILQ_FIRST(&sh_obj->waiters, __db_lock);
+ ret == 0 && lp != NULL;
+ lp = SH_TAILQ_FIRST(&sh_obj->waiters, __db_lock))
+ ret = __lock_put_internal(lt, lp, ndx,
+ DB_LOCK_UNLINK |
+ DB_LOCK_NOPROMOTE | DB_LOCK_DOALL);
+
+ /*
+ * On the last time around, the object will get
+			 * reclaimed by __lock_put_internal; structure the
+ * loop carefully so we do not get bitten.
+ */
+ for (lp = SH_TAILQ_FIRST(&sh_obj->holders, __db_lock);
+ ret == 0 && lp != NULL;
+ lp = next_lock) {
+ next_lock = SH_TAILQ_NEXT(lp, links, __db_lock);
+ ret = __lock_put_internal(lt, lp, ndx,
+ DB_LOCK_UNLINK |
+ DB_LOCK_NOPROMOTE | DB_LOCK_DOALL);
+ }
+ OBJECT_UNLOCK(lt, region, ndx);
+ break;
+
+ case DB_LOCK_TIMEOUT:
+ ret = __lock_set_timeout_internal(env,
+ sh_locker, 0, DB_SET_TXN_NOW);
+ break;
+
+ case DB_LOCK_TRADE:
+ /*
+ * INTERNAL USE ONLY.
+ * Change the holder of the lock described in
+ * list[i].lock to the locker-id specified by
+ * the locker parameter.
+ */
+ /*
+ * You had better know what you're doing here.
+ * We are trading locker-id's on a lock to
+ * facilitate file locking on open DB handles.
+ * We do not do any conflict checking on this,
+ * so heaven help you if you use this flag under
+ * any other circumstances.
+ */
+ ret = __lock_trade(env, &list[i].lock, sh_locker);
+ break;
+#if defined(DEBUG) && defined(HAVE_STATISTICS)
+ case DB_LOCK_DUMP:
+ if (sh_locker == NULL)
+ break;
+
+ SH_LIST_FOREACH(
+ lp, &sh_locker->heldby, locker_links, __db_lock)
+ __lock_printlock(lt, NULL, lp, 1);
+ break;
+#endif
+ default:
+ __db_errx(env, DB_STR_A("2035",
+ "Invalid lock operation: %d", "%d"), list[i].op);
+ ret = EINVAL;
+ break;
+ }
+
+ if (ret == 0 && region->detect != DB_LOCK_NORUN &&
+ (region->need_dd || timespecisset(&region->next_timeout)))
+ run_dd = 1;
+ LOCK_SYSTEM_UNLOCK(lt, region);
+
+ if (run_dd)
+ (void)__lock_detect(env, region->detect, &did_abort);
+
+ if (ret != 0 && elistp != NULL)
+ *elistp = &list[i - 1];
+
+ return (ret);
+}
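+
+/*
+ * Example (a sketch, not library code): a caller might release one specific
+ * lock and then all of the locker's remaining locks in a single vector
+ * call.  The locker id "lid" and the lock "lk" are assumed to have been
+ * obtained earlier:
+ *
+ *	DB_LOCKREQ req[2];
+ *
+ *	memset(req, 0, sizeof(req));
+ *	req[0].op = DB_LOCK_PUT;
+ *	req[0].lock = lk;
+ *	req[1].op = DB_LOCK_PUT_ALL;
+ *	ret = dbenv->lock_vec(dbenv, lid, 0, req, 2, NULL);
+ */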
+
+/*
+ * __lock_get_pp --
+ * ENV->lock_get pre/post processing.
+ *
+ * PUBLIC: int __lock_get_pp __P((DB_ENV *,
+ * PUBLIC: u_int32_t, u_int32_t, DBT *, db_lockmode_t, DB_LOCK *));
+ */
+int
+__lock_get_pp(dbenv, locker, flags, obj, lock_mode, lock)
+ DB_ENV *dbenv;
+ u_int32_t locker, flags;
+ DBT *obj;
+ db_lockmode_t lock_mode;
+ DB_LOCK *lock;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->lk_handle, "DB_ENV->lock_get", DB_INIT_LOCK);
+
+ /* Validate arguments. */
+ if ((ret = __db_fchk(env, "DB_ENV->lock_get", flags,
+ DB_LOCK_NOWAIT | DB_LOCK_UPGRADE | DB_LOCK_SWITCH)) != 0)
+ return (ret);
+
+ if ((ret = __dbt_usercopy(env, obj)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env,
+ (__lock_get_api(env, locker, flags, obj, lock_mode, lock)),
+ 0, ret);
+ ENV_LEAVE(env, ip);
+ __dbt_userfree(env, obj, NULL, NULL);
+ return (ret);
+}
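+
+/*
+ * Example (a sketch): acquiring and releasing a write lock through the
+ * public API.  Here "lid" is a locker id previously returned by
+ * DB_ENV->lock_id(), and "MyObject" names the thing being locked:
+ *
+ *	DBT obj;
+ *	DB_LOCK lock;
+ *
+ *	memset(&obj, 0, sizeof(obj));
+ *	obj.data = "MyObject";
+ *	obj.size = (u_int32_t)strlen("MyObject");
+ *	ret = dbenv->lock_get(dbenv, lid, 0, &obj, DB_LOCK_WRITE, &lock);
+ *	...
+ *	ret = dbenv->lock_put(dbenv, &lock);
+ */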
+
+static int
+__lock_get_api(env, locker, flags, obj, lock_mode, lock)
+ ENV *env;
+ u_int32_t locker, flags;
+ const DBT *obj;
+ db_lockmode_t lock_mode;
+ DB_LOCK *lock;
+{
+ DB_LOCKER *sh_locker;
+ DB_LOCKREGION *region;
+ int ret;
+
+ COMPQUIET(region, NULL);
+
+ region = env->lk_handle->reginfo.primary;
+
+ LOCK_LOCKERS(env, region);
+ ret = __lock_getlocker_int(env->lk_handle, locker, 0, &sh_locker);
+ UNLOCK_LOCKERS(env, region);
+ LOCK_SYSTEM_LOCK(env->lk_handle, region);
+ if (ret == 0)
+ ret = __lock_get_internal(env->lk_handle,
+ sh_locker, flags, obj, lock_mode, 0, lock);
+ LOCK_SYSTEM_UNLOCK(env->lk_handle, region);
+ return (ret);
+}
+
+/*
+ * __lock_get --
+ * ENV->lock_get.
+ *
+ * PUBLIC: int __lock_get __P((ENV *,
+ * PUBLIC: DB_LOCKER *, u_int32_t, const DBT *, db_lockmode_t, DB_LOCK *));
+ */
+int
+__lock_get(env, locker, flags, obj, lock_mode, lock)
+ ENV *env;
+ DB_LOCKER *locker;
+ u_int32_t flags;
+ const DBT *obj;
+ db_lockmode_t lock_mode;
+ DB_LOCK *lock;
+{
+ DB_LOCKTAB *lt;
+ int ret;
+
+ lt = env->lk_handle;
+
+ if (IS_RECOVERING(env) && !LF_ISSET(DB_LOCK_IGNORE_REC)) {
+ LOCK_INIT(*lock);
+ return (0);
+ }
+
+ LOCK_SYSTEM_LOCK(lt, (DB_LOCKREGION *)lt->reginfo.primary);
+ ret = __lock_get_internal(lt, locker, flags, obj, lock_mode, 0, lock);
+ LOCK_SYSTEM_UNLOCK(lt, (DB_LOCKREGION *)lt->reginfo.primary);
+ return (ret);
+}
+/*
+ * __lock_alloclock -- allocate a lock from another partition.
+ * We assume we have the partition locked on entry and leave
+ * it unlocked on success since we will have to retry the lock operation.
+ * The mutex will still be locked if we are out of space.
+ */
+static int
+__lock_alloclock(lt, part_id)
+ DB_LOCKTAB *lt;
+ u_int32_t part_id;
+{
+#define FREE_LIST_HEAD free_locks
+#define STRUCT_NAME __db_lock
+#define CURRENT_COUNT st_locks
+#define MAX_COUNT st_maxlocks
+#define STEAL_NAME st_locksteals
+#define STEAL_EVENT steal
+
+#ifdef DEBUG
+ __db_loadme();
+#endif
+
+#include "lock_alloc.incl"
+}
+
+/*
+ * __lock_get_internal --
+ * All the work for lock_get (and for the GET option of lock_vec) is done
+ * inside of lock_get_internal.
+ *
+ * PUBLIC: int __lock_get_internal __P((DB_LOCKTAB *, DB_LOCKER *, u_int32_t,
+ * PUBLIC: const DBT *, db_lockmode_t, db_timeout_t, DB_LOCK *));
+ */
+int
+__lock_get_internal(lt, sh_locker, flags, obj, lock_mode, timeout, lock)
+ DB_LOCKTAB *lt;
+ DB_LOCKER *sh_locker;
+ u_int32_t flags;
+ const DBT *obj;
+ db_lockmode_t lock_mode;
+ db_timeout_t timeout;
+ DB_LOCK *lock;
+{
+ struct __db_lock *newl, *lp;
+ ENV *env;
+ DB_LOCKOBJ *sh_obj;
+ DB_LOCKREGION *region;
+ DB_THREAD_INFO *ip;
+ u_int32_t ndx, part_id;
+ int did_abort, ihold, grant_dirty, no_dd, ret, t_ret;
+ roff_t holder, sh_off;
+
+ /*
+ * We decide what action to take based on what locks are already held
+ * and what locks are in the wait queue.
+ */
+ enum {
+ GRANT, /* Grant the lock. */
+ UPGRADE, /* Upgrade the lock. */
+ HEAD, /* Wait at head of wait queue. */
+ SECOND, /* Wait as the second waiter. */
+ TAIL /* Wait at tail of the wait queue. */
+ } action;
+
+ env = lt->env;
+ region = lt->reginfo.primary;
+
+ /* Check if locks have been globally turned off. */
+ if (F_ISSET(env->dbenv, DB_ENV_NOLOCKING))
+ return (0);
+
+ if (sh_locker == NULL) {
+ __db_errx(env, DB_STR("2036", "Locker does not exist"));
+ return (EINVAL);
+ }
+
+ DB_ASSERT(env, lock_mode == DB_LOCK_WAIT || !LF_ISSET(DB_LOCK_SWITCH));
+
+ no_dd = ret = 0;
+ newl = NULL;
+ sh_obj = NULL;
+
+ /* Check that the lock mode is valid. */
+ if (lock_mode >= (db_lockmode_t)region->nmodes) {
+ __db_errx(env, DB_STR_A("2037",
+ "DB_ENV->lock_get: invalid lock mode %lu", "%lu"),
+ (u_long)lock_mode);
+ return (EINVAL);
+ }
+
+again: if (obj == NULL) {
+ DB_ASSERT(env, LOCK_ISSET(*lock));
+ lp = R_ADDR(&lt->reginfo, lock->off);
+ DB_ASSERT(env, lock->gen == lp->gen);
+ sh_obj = SH_OFF_TO_PTR(lp, lp->obj, DB_LOCKOBJ);
+ ndx = sh_obj->indx;
+ OBJECT_LOCK_NDX(lt, region, ndx);
+ } else {
+		/* Allocate a new shared-memory object. */
+ OBJECT_LOCK(lt, region, obj, lock->ndx);
+ ndx = lock->ndx;
+ if ((ret = __lock_getobj(lt,
+ obj, lock->ndx, !LF_ISSET(DB_LOCK_CHECK), &sh_obj)) != 0)
+ goto err;
+#ifdef DIAGNOSTIC
+ if (sh_obj == NULL) {
+ ret = ENOENT;
+ goto err;
+ }
+ if (LF_ISSET(DB_LOCK_UPGRADE)) {
+ DB_ASSERT(env, LOCK_ISSET(*lock));
+ lp = R_ADDR(&lt->reginfo, lock->off);
+ DB_ASSERT(env, lock->gen == lp->gen);
+ DB_ASSERT(env,
+ SH_OFF_TO_PTR(lp, lp->obj, DB_LOCKOBJ) == sh_obj);
+ }
+#endif
+ }
+
+#ifdef HAVE_STATISTICS
+ if (LF_ISSET(DB_LOCK_UPGRADE))
+ STAT_INC_VERB(env, lock, upgrade,
+ lt->obj_stat[ndx].st_nupgrade,
+ (DBT *) obj, sh_locker->id);
+ else if (!LF_ISSET(DB_LOCK_SWITCH | DB_LOCK_CHECK))
+ STAT_INC_VERB(env, lock, request,
+ lt->obj_stat[ndx].st_nrequests,
+ (DBT *) obj, sh_locker->id);
+#endif
+
+ /*
+ * Figure out if we can grant this lock or if it should wait.
+ * By default, we can grant the new lock if it does not conflict with
+ * anyone on the holders list OR anyone on the waiters list.
+ * The reason that we don't grant if there's a conflict is that
+ * this can lead to starvation (a writer waiting on a popularly
+ * read item will never be granted). The downside of this is that
+ * a waiting reader can prevent an upgrade from reader to writer,
+ * which is not uncommon.
+ *
+ * There are two exceptions to the no-conflict rule. First, if
+ * a lock is held by the requesting locker AND the new lock does
+ * not conflict with any other holders, then we grant the lock.
+ * The most common place this happens is when the holder has a
+ * WRITE lock and a READ lock request comes in for the same locker.
+ * If we do not grant the read lock, then we guarantee deadlock.
+ * Second, dirty readers are granted if at all possible while
+ * avoiding starvation, see below.
+ *
+ * In case of conflict, we put the new lock on the end of the waiters
+ * list, unless we are upgrading or this is a dirty reader in which
+ * case the locker goes at or near the front of the list.
+ */
+ ihold = 0;
+ grant_dirty = 0;
+ holder = 0;
+
+ /*
+	 * DB_LOCK_WAIT is a special case used by the queue
+ * access method when we want to get an entry which is past
+ * the end of the queue. With CDB we have a DB_READ_LOCK and
+ * need to switch it to DB_LOCK_WAIT. Otherwise we insert a
+	 * DB_LOCK_WAIT and then, after releasing the metadata
+	 * page, wait on it and join the waiters queue.  This must be
+ * done as a single operation so that another locker cannot
+ * get in and fail to wake us up.
+ */
+ if (lock_mode == DB_LOCK_WAIT)
+ lp = NULL;
+ else
+ lp = SH_TAILQ_FIRST(&sh_obj->holders, __db_lock);
+
+ sh_off = R_OFFSET(&lt->reginfo, sh_locker);
+ for (; lp != NULL; lp = SH_TAILQ_NEXT(lp, links, __db_lock)) {
+ if (sh_off == lp->holder) {
+ if (lp->mode == lock_mode &&
+ lp->status == DB_LSTAT_HELD) {
+ if (LF_ISSET(DB_LOCK_UPGRADE))
+ goto upgrade;
+
+#ifdef DIAGNOSTIC
+ if (LF_ISSET(DB_LOCK_CHECK))
+ goto done;
+#endif
+
+ /*
+ * Lock is held, so we can increment the
+ * reference count and return this lock
+ * to the caller. We do not count reference
+ * increments towards the locks held by
+ * the locker.
+ */
+ lp->refcount++;
+ lock->off = R_OFFSET(&lt->reginfo, lp);
+ lock->gen = lp->gen;
+ lock->mode = lp->mode;
+ goto done;
+ } else {
+ ihold = 1;
+ }
+ } else if (__lock_same_family(lt,
+ R_ADDR(&lt->reginfo, lp->holder), sh_locker))
+ ihold = 1;
+ else if (CONFLICTS(lt, region, lp->mode, lock_mode))
+ break;
+ else if (lp->mode == DB_LOCK_READ ||
+ lp->mode == DB_LOCK_WWRITE) {
+ grant_dirty = 1;
+ holder = lp->holder;
+ }
+ }
+
+#ifdef DIAGNOSTIC
+ if (LF_ISSET(DB_LOCK_CHECK)) {
+ ret = ENOENT;
+ goto err;
+ }
+#endif
+
+ /*
+ * If there are conflicting holders we will have to wait. If we
+ * already hold a lock on this object or are doing an upgrade or
+ * this is a dirty reader it goes to the head of the queue, everyone
+ * else to the back.
+ */
+ if (lp != NULL) {
+ if (ihold || LF_ISSET(DB_LOCK_UPGRADE) ||
+ lock_mode == DB_LOCK_READ_UNCOMMITTED)
+ action = HEAD;
+ else
+ action = TAIL;
+ } else {
+ if (LF_ISSET(DB_LOCK_UPGRADE))
+ action = UPGRADE;
+ else if (lock_mode == DB_LOCK_WAIT)
+ action = TAIL;
+ else if (ihold)
+ action = GRANT;
+ else {
+ /*
+ * Look for conflicting waiters.
+ */
+ SH_TAILQ_FOREACH(lp, &sh_obj->waiters, links, __db_lock)
+ if (lp->holder != sh_off &&
+ CONFLICTS(lt, region, lp->mode, lock_mode))
+ break;
+
+ /*
+ * If there are no conflicting holders or waiters,
+ * then we grant. Normally when we wait, we
+ * wait at the end (TAIL). However, the goal of
+			 * DIRTY_READ locks is to allow forward progress in the
+ * face of updating transactions, so we try to allow
+ * all DIRTY_READ requests to proceed as rapidly
+ * as possible, so long as we can prevent starvation.
+ *
+ * When determining how to queue a DIRTY_READ
+ * request:
+ *
+ * 1. If there is a waiting upgrading writer,
+ * then we enqueue the dirty reader BEHIND it
+ * (second in the queue).
+ * 2. Else, if the current holders are either
+			 *	    READ or WWRITE, we grant.
+			 *	 3. Else, queue SECOND, i.e., behind the first
+ * waiter.
+ *
+ * The end result is that dirty_readers get to run
+ * so long as other lockers are blocked. Once
+ * there is a locker which is only waiting on
+ * dirty readers then they queue up behind that
+ * locker so that it gets to run. In general
+ * this locker will be a WRITE which will shortly
+ * get downgraded to a WWRITE, permitting the
+ * DIRTY locks to be granted.
+ */
+ if (lp == NULL)
+ action = GRANT;
+ else if (grant_dirty &&
+ lock_mode == DB_LOCK_READ_UNCOMMITTED) {
+ /*
+ * An upgrade will be at the head of the
+ * queue.
+ */
+ lp = SH_TAILQ_FIRST(
+ &sh_obj->waiters, __db_lock);
+ if (lp->mode == DB_LOCK_WRITE &&
+ lp->holder == holder)
+ action = SECOND;
+ else
+ action = GRANT;
+ } else if (lock_mode == DB_LOCK_READ_UNCOMMITTED)
+ action = SECOND;
+ else
+ action = TAIL;
+ }
+ }
+
+ switch (action) {
+ case HEAD:
+ case TAIL:
+ case SECOND:
+ if (LF_ISSET(DB_LOCK_NOWAIT) && lock_mode != DB_LOCK_WAIT) {
+ ret = DB_LOCK_NOTGRANTED;
+ STAT_INC_VERB(env, lock, nowait_notgranted,
+ region->stat.st_lock_nowait,
+ (DBT *) obj, sh_locker->id);
+ goto err;
+ }
+ /* FALLTHROUGH */
+ case GRANT:
+ part_id = LOCK_PART(region, ndx);
+ /* Allocate a new lock. */
+ if ((newl = SH_TAILQ_FIRST(
+ &FREE_LOCKS(lt, part_id), __db_lock)) == NULL) {
+ if ((ret = __lock_alloclock(lt, part_id)) != 0)
+ goto err;
+ /* Allocation dropped the mutex, start over. */
+ OBJECT_UNLOCK(lt, region, ndx);
+ sh_obj = NULL;
+ goto again;
+ }
+ SH_TAILQ_REMOVE(
+ &FREE_LOCKS(lt, part_id), newl, links, __db_lock);
+
+#ifdef HAVE_STATISTICS
+ /*
+ * Keep track of the maximum number of locks allocated
+ * in each partition and the maximum number of locks
+ * used by any one bucket.
+ */
+ if (++lt->obj_stat[ndx].st_nlocks >
+ lt->obj_stat[ndx].st_maxnlocks)
+ lt->obj_stat[ndx].st_maxnlocks =
+ lt->obj_stat[ndx].st_nlocks;
+ if (++lt->part_array[part_id].part_stat.st_nlocks >
+ lt->part_array[part_id].part_stat.st_maxnlocks)
+ lt->part_array[part_id].part_stat.st_maxnlocks =
+ lt->part_array[part_id].part_stat.st_nlocks;
+#endif
+
+ newl->holder = R_OFFSET(&lt->reginfo, sh_locker);
+ newl->refcount = 1;
+ newl->mode = lock_mode;
+ newl->obj = (roff_t)SH_PTR_TO_OFF(newl, sh_obj);
+ newl->indx = sh_obj->indx;
+ newl->mtx_lock = MUTEX_INVALID;
+ /*
+ * Now, insert the lock onto its locker's list.
+ * If the locker does not currently hold any locks,
+ * there's no reason to run a deadlock
+ * detector, save that information.
+ */
+ no_dd = sh_locker->master_locker == INVALID_ROFF &&
+ SH_LIST_FIRST(
+ &sh_locker->child_locker, __db_locker) == NULL &&
+ SH_LIST_FIRST(&sh_locker->heldby, __db_lock) == NULL;
+
+ SH_LIST_INSERT_HEAD(
+ &sh_locker->heldby, newl, locker_links, __db_lock);
+
+ break;
+
+ case UPGRADE:
+upgrade: lp = R_ADDR(&lt->reginfo, lock->off);
+ DB_ASSERT(env, lock->gen == lp->gen);
+ if (IS_WRITELOCK(lock_mode) && !IS_WRITELOCK(lp->mode))
+ sh_locker->nwrites++;
+ lp->mode = lock_mode;
+ /* If we are upgrading to a WAIT we must wait. */
+ if (lock_mode != DB_LOCK_WAIT)
+ goto done;
+ if (lp->status != DB_LSTAT_WAITING) {
+ /* We have already been granted. */
+ MUTEX_LOCK(env, lp->mtx_lock);
+ newl = lp;
+ if (lp->status == DB_LSTAT_EXPIRED)
+ goto expired;
+ DB_ASSERT(env, lp->status == DB_LSTAT_PENDING);
+ SH_TAILQ_REMOVE(
+ &sh_obj->holders, newl, links, __db_lock);
+ newl->links.stqe_prev = -1;
+ goto done;
+ }
+ COMPQUIET(action, UPGRADE);
+ }
+
+ switch (action) {
+ case GRANT:
+ newl->status = DB_LSTAT_HELD;
+ SH_TAILQ_INSERT_TAIL(&sh_obj->holders, newl, links);
+ break;
+ case UPGRADE:
+ DB_ASSERT(env, lock_mode == DB_LOCK_WAIT);
+ /* FALLTHROUGH */
+ case HEAD:
+ case TAIL:
+ case SECOND:
+ if ((lp =
+ SH_TAILQ_FIRST(&sh_obj->waiters, __db_lock)) == NULL) {
+ LOCK_DD(env, region);
+ SH_TAILQ_INSERT_HEAD(&region->dd_objs,
+ sh_obj, dd_links, __db_lockobj);
+ UNLOCK_DD(env, region);
+ }
+ switch (action) {
+ case HEAD:
+ SH_TAILQ_INSERT_HEAD(
+ &sh_obj->waiters, newl, links, __db_lock);
+ break;
+ case SECOND:
+ SH_TAILQ_INSERT_AFTER(
+ &sh_obj->waiters, lp, newl, links, __db_lock);
+ break;
+ case TAIL:
+ SH_TAILQ_INSERT_TAIL(&sh_obj->waiters, newl, links);
+ break;
+ case UPGRADE:
+ /* The lock is already in the queue. */
+ newl = R_ADDR(&lt->reginfo, lock->off);
+ break;
+ default:
+ DB_ASSERT(env, 0);
+ }
+
+ /*
+ * First check to see if this txn has expired.
+ * If not then see if the lock timeout is past
+ * the expiration of the txn, if it is, use
+ * the txn expiration time. lk_expire is passed
+ * to avoid an extra call to get the time.
+ */
+ timespecclear(&sh_locker->lk_expire);
+ if (__clock_expired(env,
+ &sh_locker->lk_expire, &sh_locker->tx_expire)) {
+ newl->status = DB_LSTAT_EXPIRED;
+ sh_locker->lk_expire = sh_locker->tx_expire;
+
+ /* We are done. */
+ goto expired;
+ }
+
+ /*
+ * If a timeout was specified in this call then it
+ * takes priority. If a lock timeout has been specified
+ * for this transaction then use that, otherwise use
+ * the global timeout value.
+ */
+ if (!LF_ISSET(DB_LOCK_SET_TIMEOUT)) {
+ if (F_ISSET(sh_locker, DB_LOCKER_TIMEOUT))
+ timeout = sh_locker->lk_timeout;
+ else
+ timeout = region->lk_timeout;
+ }
+
+ /*
+		 * For the queue access method we insert the WAIT lock and
+		 * don't wait on it.  That way we can unpin the metadata
+		 * page first and then block.
+ */
+ if (lock_mode == DB_LOCK_WAIT && LF_ISSET(DB_LOCK_NOWAIT)) {
+ newl->mtx_lock = sh_locker->mtx_locker;
+ newl->status = DB_LSTAT_WAITING;
+ goto out;
+ }
+
+ if (timeout != 0)
+ __clock_set_expires(env,
+ &sh_locker->lk_expire, timeout);
+ else
+ timespecclear(&sh_locker->lk_expire);
+
+ if (timespecisset(&sh_locker->tx_expire) &&
+ (timeout == 0 || __clock_expired(env,
+ &sh_locker->lk_expire, &sh_locker->tx_expire)))
+ sh_locker->lk_expire = sh_locker->tx_expire;
+ if (timespecisset(&sh_locker->lk_expire) &&
+ (!timespecisset(&region->next_timeout) ||
+ timespeccmp(
+ &region->next_timeout, &sh_locker->lk_expire, >)))
+ region->next_timeout = sh_locker->lk_expire;
+
+in_abort: newl->status = DB_LSTAT_WAITING;
+ newl->mtx_lock = sh_locker->mtx_locker;
+ STAT(lt->obj_stat[ndx].st_lock_wait++);
+ /* We are about to block, deadlock detector must run. */
+ region->need_dd = 1;
+
+ OBJECT_UNLOCK(lt, region, sh_obj->indx);
+
+ /* If we are switching drop the lock we had. */
+ if (LF_ISSET(DB_LOCK_SWITCH) &&
+ (ret = __lock_put_nolock(env, lock, &ihold, 0)) != 0) {
+ OBJECT_LOCK_NDX(lt, region, sh_obj->indx);
+ (void)__lock_remove_waiter(
+ lt, sh_obj, newl, DB_LSTAT_FREE);
+ goto err;
+ }
+
+ LOCK_SYSTEM_UNLOCK(lt, region);
+
+ /*
+ * Before waiting, see if the deadlock detector should run.
+ */
+ if (region->detect != DB_LOCK_NORUN && !no_dd)
+ (void)__lock_detect(env, region->detect, &did_abort);
+
+ ip = NULL;
+ if (env->thr_hashtab != NULL &&
+ (ret = __env_set_state(env, &ip, THREAD_BLOCKED)) != 0) {
+ LOCK_SYSTEM_LOCK(lt, region);
+ OBJECT_LOCK_NDX(lt, region, ndx);
+ goto err;
+ }
+
+ PERFMON2(env, lock, suspend, (DBT *) obj, lock_mode);
+ MUTEX_LOCK(env, newl->mtx_lock);
+ PERFMON2(env, lock, resume, (DBT *) obj, lock_mode);
+
+ if (ip != NULL)
+ ip->dbth_state = THREAD_ACTIVE;
+
+ LOCK_SYSTEM_LOCK(lt, region);
+ OBJECT_LOCK_NDX(lt, region, ndx);
+
+ /* Turn off lock timeout. */
+ if (newl->status != DB_LSTAT_EXPIRED)
+ timespecclear(&sh_locker->lk_expire);
+
+ switch (newl->status) {
+ case DB_LSTAT_ABORTED:
+ /*
+ * If we raced with the deadlock detector and it
+			 * mistakenly picked this transaction to abort again,
+ * ignore the abort and request the lock again.
+ */
+ if (F_ISSET(sh_locker, DB_LOCKER_INABORT))
+ goto in_abort;
+ ret = DB_LOCK_DEADLOCK;
+ goto err;
+ case DB_LSTAT_EXPIRED:
+expired: ret = __lock_put_internal(lt, newl,
+ ndx, DB_LOCK_UNLINK | DB_LOCK_FREE);
+ newl = NULL;
+ if (ret != 0)
+ goto err;
+#ifdef HAVE_STATISTICS
+ if (timespeccmp(
+ &sh_locker->lk_expire, &sh_locker->tx_expire, ==))
+ STAT_INC(env, lock, txntimeout,
+ lt->obj_stat[ndx].st_ntxntimeouts,
+ (DBT *) obj);
+ else
+ STAT_INC(env, lock, locktimeout,
+ lt->obj_stat[ndx].st_nlocktimeouts,
+ (DBT *) obj);
+#endif
+ ret = DB_LOCK_NOTGRANTED;
+ timespecclear(&sh_locker->lk_expire);
+ goto err;
+ case DB_LSTAT_PENDING:
+ if (LF_ISSET(DB_LOCK_UPGRADE)) {
+ /*
+ * The lock just granted got put on the holders
+ * list. Since we're upgrading some other lock,
+ * we've got to remove it here.
+ */
+ SH_TAILQ_REMOVE(
+ &sh_obj->holders, newl, links, __db_lock);
+ /*
+ * Ensure the object is not believed to be on
+ * the object's lists, if we're traversing by
+ * locker.
+ */
+ newl->links.stqe_prev = -1;
+ if (newl->mode == DB_LOCK_WAIT)
+ goto done;
+ goto upgrade;
+ } else
+ newl->status = DB_LSTAT_HELD;
+ break;
+ case DB_LSTAT_FREE:
+ case DB_LSTAT_HELD:
+ case DB_LSTAT_WAITING:
+ default:
+ __db_errx(env, DB_STR_A("2038",
+ "Unexpected lock status: %d", "%d"),
+ (int)newl->status);
+ ret = __env_panic(env, EINVAL);
+ goto err;
+ }
+ }
+
+out: lock->off = R_OFFSET(&lt->reginfo, newl);
+ lock->gen = newl->gen;
+ lock->mode = newl->mode;
+ sh_locker->nlocks++;
+ if (IS_WRITELOCK(newl->mode)) {
+ sh_locker->nwrites++;
+ if (newl->mode == DB_LOCK_WWRITE)
+ F_SET(sh_locker, DB_LOCKER_DIRTY);
+ }
+
+ OBJECT_UNLOCK(lt, region, ndx);
+ return (0);
+
+err: if (!LF_ISSET(DB_LOCK_UPGRADE | DB_LOCK_SWITCH))
+ LOCK_INIT(*lock);
+
+done: if (newl != NULL &&
+ (t_ret = __lock_freelock(lt, newl, sh_locker,
+ DB_LOCK_FREE | DB_LOCK_UNLINK)) != 0 && ret == 0)
+ ret = t_ret;
+ OBJECT_UNLOCK(lt, region, ndx);
+
+ return (ret);
+}
+
+/*
+ * __lock_put_pp --
+ * ENV->lock_put pre/post processing.
+ *
+ * PUBLIC: int __lock_put_pp __P((DB_ENV *, DB_LOCK *));
+ */
+int
+__lock_put_pp(dbenv, lock)
+ DB_ENV *dbenv;
+ DB_LOCK *lock;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->lk_handle, "DB_LOCK->lock_put", DB_INIT_LOCK);
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__lock_put(env, lock)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __lock_put --
+ *
+ * PUBLIC: int __lock_put __P((ENV *, DB_LOCK *));
+ * Internal lock_put interface.
+ */
+int
+__lock_put(env, lock)
+ ENV *env;
+ DB_LOCK *lock;
+{
+ DB_LOCKTAB *lt;
+ int ret, run_dd;
+
+ if (IS_RECOVERING(env))
+ return (0);
+
+ lt = env->lk_handle;
+
+ LOCK_SYSTEM_LOCK(lt, (DB_LOCKREGION *)lt->reginfo.primary);
+ ret = __lock_put_nolock(env, lock, &run_dd, 0);
+ LOCK_SYSTEM_UNLOCK(lt, (DB_LOCKREGION *)lt->reginfo.primary);
+
+ /*
+ * Only run the lock detector if put told us to AND we are running
+ * in auto-detect mode. If we are not running in auto-detect, then
+ * a call to lock_detect here will 0 the need_dd bit, but will not
+ * actually abort anything.
+ */
+ if (ret == 0 && run_dd)
+ (void)__lock_detect(env,
+ ((DB_LOCKREGION *)lt->reginfo.primary)->detect, NULL);
+ return (ret);
+}
+
+static int
+__lock_put_nolock(env, lock, runp, flags)
+ ENV *env;
+ DB_LOCK *lock;
+ int *runp;
+ u_int32_t flags;
+{
+ struct __db_lock *lockp;
+ DB_LOCKREGION *region;
+ DB_LOCKTAB *lt;
+ int ret;
+
+ /* Check if locks have been globally turned off. */
+ if (F_ISSET(env->dbenv, DB_ENV_NOLOCKING))
+ return (0);
+
+ lt = env->lk_handle;
+ region = lt->reginfo.primary;
+
+ lockp = R_ADDR(&lt->reginfo, lock->off);
+ DB_ASSERT(env, lock->gen == lockp->gen);
+ if (lock->gen != lockp->gen) {
+ __db_errx(env, __db_lock_invalid, "DB_LOCK->lock_put");
+ LOCK_INIT(*lock);
+ return (EINVAL);
+ }
+
+ OBJECT_LOCK_NDX(lt, region, lock->ndx);
+ ret = __lock_put_internal(lt,
+ lockp, lock->ndx, flags | DB_LOCK_UNLINK | DB_LOCK_FREE);
+ OBJECT_UNLOCK(lt, region, lock->ndx);
+
+ LOCK_INIT(*lock);
+
+ *runp = 0;
+ if (ret == 0 && region->detect != DB_LOCK_NORUN &&
+ (region->need_dd || timespecisset(&region->next_timeout)))
+ *runp = 1;
+
+ return (ret);
+}
+
+/*
+ * __lock_downgrade --
+ *
+ * Used to downgrade locks.  Currently this is used in three places: 1) by
+ * the Concurrent Data Store product to downgrade write locks back to iwrite
+ * locks, 2) to downgrade write-handle locks to read-handle locks at the end
+ * of an open/create, and 3) to downgrade write locks to was_write to support
+ * dirty reads.
+ *
+ * PUBLIC: int __lock_downgrade __P((ENV *,
+ * PUBLIC: DB_LOCK *, db_lockmode_t, u_int32_t));
+ */
+int
+__lock_downgrade(env, lock, new_mode, flags)
+ ENV *env;
+ DB_LOCK *lock;
+ db_lockmode_t new_mode;
+ u_int32_t flags;
+{
+ struct __db_lock *lockp;
+ DB_LOCKER *sh_locker;
+ DB_LOCKOBJ *obj;
+ DB_LOCKREGION *region;
+ DB_LOCKTAB *lt;
+ int ret;
+
+ ret = 0;
+
+ /* Check if locks have been globally turned off. */
+ if (F_ISSET(env->dbenv, DB_ENV_NOLOCKING))
+ return (0);
+
+ lt = env->lk_handle;
+ region = lt->reginfo.primary;
+
+ LOCK_SYSTEM_LOCK(lt, region);
+
+ lockp = R_ADDR(&lt->reginfo, lock->off);
+ if (lock->gen != lockp->gen) {
+ __db_errx(env, __db_lock_invalid, "lock_downgrade");
+ ret = EINVAL;
+ goto out;
+ }
+
+ sh_locker = R_ADDR(&lt->reginfo, lockp->holder);
+
+ if (IS_WRITELOCK(lockp->mode) && !IS_WRITELOCK(new_mode))
+ sh_locker->nwrites--;
+
+ lockp->mode = new_mode;
+ lock->mode = new_mode;
+
+ /* Get the object associated with this lock. */
+ obj = SH_OFF_TO_PTR(lockp, lockp->obj, DB_LOCKOBJ);
+ OBJECT_LOCK_NDX(lt, region, obj->indx);
+ STAT(lt->obj_stat[obj->indx].st_ndowngrade++);
+ ret = __lock_promote(lt, obj, NULL, flags);
+ OBJECT_UNLOCK(lt, region, obj->indx);
+
+out: LOCK_SYSTEM_UNLOCK(lt, region);
+ return (ret);
+}
+
+/*
+ * __lock_put_internal -- put a lock structure
+ * We assume that we are called with the proper object locked.
+ */
+static int
+__lock_put_internal(lt, lockp, obj_ndx, flags)
+ DB_LOCKTAB *lt;
+ struct __db_lock *lockp;
+ u_int32_t obj_ndx, flags;
+{
+ DB_LOCKOBJ *sh_obj;
+ DB_LOCKREGION *region;
+ ENV *env;
+ u_int32_t part_id;
+ int ret, state_changed;
+
+ COMPQUIET(env, NULL);
+ env = lt->env;
+ region = lt->reginfo.primary;
+ ret = state_changed = 0;
+
+ if (!OBJ_LINKS_VALID(lockp)) {
+ /*
+ * Someone removed this lock while we were doing a release
+ * by locker id. We are trying to free this lock, but it's
+ * already been done; all we need to do is return it to the
+ * free list.
+ */
+ (void)__lock_freelock(lt, lockp, NULL, DB_LOCK_FREE);
+ return (0);
+ }
+
+#ifdef HAVE_STATISTICS
+ if (LF_ISSET(DB_LOCK_DOALL))
+ lt->obj_stat[obj_ndx].st_nreleases += lockp->refcount;
+ else
+ lt->obj_stat[obj_ndx].st_nreleases++;
+#endif
+
+ if (!LF_ISSET(DB_LOCK_DOALL) && lockp->refcount > 1) {
+ lockp->refcount--;
+ PERFMON2(env, lock, put_reduce_count,
+ &(SH_OFF_TO_PTR(lockp, lockp->obj, DB_LOCKOBJ))->lockobj,
+ flags);
+ return (0);
+ }
+
+ /* Increment generation number. */
+ lockp->gen++;
+
+ /* Get the object associated with this lock. */
+ sh_obj = SH_OFF_TO_PTR(lockp, lockp->obj, DB_LOCKOBJ);
+
+ PERFMON2(env, lock, put, &sh_obj->lockobj, flags);
+ /*
+ * Remove this lock from its holders/waitlist. Set its status
+ * to ABORTED. It may get freed below, but if not then the
+ * waiter has been aborted (it will panic if the lock is
+ * free).
+ */
+ if (lockp->status != DB_LSTAT_HELD &&
+ lockp->status != DB_LSTAT_PENDING) {
+ DB_ASSERT(env, lockp !=
+ SH_TAILQ_FIRST(&sh_obj->holders, __db_lock));
+ if ((ret = __lock_remove_waiter(
+ lt, sh_obj, lockp, DB_LSTAT_ABORTED)) != 0)
+ return (ret);
+ } else {
+ DB_ASSERT(env, lockp !=
+ SH_TAILQ_FIRST(&sh_obj->waiters, __db_lock));
+ SH_TAILQ_REMOVE(&sh_obj->holders, lockp, links, __db_lock);
+ lockp->links.stqe_prev = -1;
+ }
+
+ if (LF_ISSET(DB_LOCK_NOPROMOTE))
+ state_changed = 0;
+ else if ((ret = __lock_promote(lt,
+ sh_obj, &state_changed, flags)) != 0)
+ return (ret);
+
+ /* Check if object should be reclaimed. */
+ if (SH_TAILQ_FIRST(&sh_obj->holders, __db_lock) == NULL &&
+ SH_TAILQ_FIRST(&sh_obj->waiters, __db_lock) == NULL) {
+ part_id = LOCK_PART(region, obj_ndx);
+ SH_TAILQ_REMOVE(
+ &lt->obj_tab[obj_ndx], sh_obj, links, __db_lockobj);
+ if (sh_obj->lockobj.size > sizeof(sh_obj->objdata)) {
+ if (region->part_t_size != 1)
+ LOCK_REGION_LOCK(env);
+ __env_alloc_free(&lt->reginfo,
+ SH_DBT_PTR(&sh_obj->lockobj));
+ if (region->part_t_size != 1)
+ LOCK_REGION_UNLOCK(env);
+ }
+ SH_TAILQ_INSERT_HEAD(
+ &FREE_OBJS(lt, part_id), sh_obj, links, __db_lockobj);
+ sh_obj->generation++;
+ STAT(lt->part_array[part_id].part_stat.st_nobjects--);
+ STAT(lt->obj_stat[obj_ndx].st_nobjects--);
+ state_changed = 1;
+ }
+
+ /* Free lock. */
+ if (LF_ISSET(DB_LOCK_UNLINK | DB_LOCK_FREE))
+ ret = __lock_freelock(lt, lockp,
+ R_ADDR(&lt->reginfo, lockp->holder), flags);
+
+ /*
+	 * If we did not promote anyone, we need to run the deadlock
+ * detector again.
+ */
+ if (state_changed == 0)
+ region->need_dd = 1;
+
+ return (ret);
+}
+
+/*
+ * __lock_freelock --
+ * Free a lock. Unlink it from its locker if necessary.
+ * We must hold the object lock.
+ *
+ */
+static int
+__lock_freelock(lt, lockp, sh_locker, flags)
+ DB_LOCKTAB *lt;
+ struct __db_lock *lockp;
+ DB_LOCKER *sh_locker;
+ u_int32_t flags;
+{
+ DB_LOCKREGION *region;
+ ENV *env;
+ u_int32_t part_id;
+ int ret;
+
+ env = lt->env;
+ region = lt->reginfo.primary;
+
+ if (LF_ISSET(DB_LOCK_UNLINK)) {
+ SH_LIST_REMOVE(lockp, locker_links, __db_lock);
+ if (lockp->status == DB_LSTAT_HELD) {
+ sh_locker->nlocks--;
+ if (IS_WRITELOCK(lockp->mode))
+ sh_locker->nwrites--;
+ }
+ }
+
+ if (LF_ISSET(DB_LOCK_FREE)) {
+ /*
+ * If the lock is not held we cannot be sure of its mutex
+ * state so we refresh it.
+ */
+ part_id = LOCK_PART(region, lockp->indx);
+ if (lockp->mtx_lock != MUTEX_INVALID &&
+ lockp->status != DB_LSTAT_HELD &&
+ lockp->status != DB_LSTAT_EXPIRED) {
+ if ((ret = __mutex_refresh(env, lockp->mtx_lock)) != 0)
+ return (ret);
+ MUTEX_LOCK(env, lockp->mtx_lock);
+ }
+
+ lockp->status = DB_LSTAT_FREE;
+ SH_TAILQ_INSERT_HEAD(&FREE_LOCKS(lt, part_id),
+ lockp, links, __db_lock);
+ STAT(lt->part_array[part_id].part_stat.st_nlocks--);
+ STAT(lt->obj_stat[lockp->indx].st_nlocks--);
+ }
+
+ return (0);
+}
+
+#undef FREE_LIST_HEAD
+#undef STRUCT_NAME
+#undef CURRENT_COUNT
+#undef MAX_COUNT
+#undef STEAL_NAME
+#undef STEAL_EVENT
+/*
+ * __lock_allocobj -- allocate an object from another partition.
+ * We assume we have the partition locked on entry and leave
+ * with the same partition locked on exit.
+ */
+static int
+__lock_allocobj(lt, part_id)
+ DB_LOCKTAB *lt;
+ u_int32_t part_id;
+{
+#define FREE_LIST_HEAD free_objs
+#define STRUCT_NAME __db_lockobj
+#define CURRENT_COUNT st_objects
+#define MAX_COUNT st_maxobjects
+#define STEAL_NAME st_objectsteals
+#define STEAL_EVENT object_steal
+
+#ifdef DEBUG
+ __db_loadme();
+#endif
+
+#include "lock_alloc.incl"
+
+}
+
+/*
+ * __lock_getobj --
+ * Get an object in the object hash table. The create parameter
+ * indicates if the object should be created if it doesn't exist in
+ * the table.
+ *
+ * This must be called with the object bucket locked.
+ */
+static int
+__lock_getobj(lt, obj, ndx, create, retp)
+ DB_LOCKTAB *lt;
+ const DBT *obj;
+ u_int32_t ndx;
+ int create;
+ DB_LOCKOBJ **retp;
+{
+ DB_LOCKOBJ *sh_obj;
+ DB_LOCKREGION *region;
+ ENV *env;
+ int ret;
+ void *p;
+ u_int32_t len, part_id;
+
+ env = lt->env;
+ region = lt->reginfo.primary;
+ len = 0;
+
+ /* Look up the object in the hash table. */
+retry: SH_TAILQ_FOREACH(sh_obj, &lt->obj_tab[ndx], links, __db_lockobj) {
+ len++;
+ if (obj->size == sh_obj->lockobj.size &&
+ memcmp(obj->data,
+ SH_DBT_PTR(&sh_obj->lockobj), obj->size) == 0)
+ break;
+ }
+
+ /*
+ * If we found the object, then we can just return it. If
+ * we didn't find the object, then we need to create it.
+ */
+ if (sh_obj == NULL && create) {
+ /* Create new object and then insert it into hash table. */
+ part_id = LOCK_PART(region, ndx);
+ if ((sh_obj = SH_TAILQ_FIRST(&FREE_OBJS(
+ lt, part_id), __db_lockobj)) == NULL) {
+ if ((ret = __lock_allocobj(lt, part_id)) == 0)
+ goto retry;
+ goto err;
+ }
+
+ /*
+ * If we can fit this object in the structure, do so instead
+ * of alloc-ing space for it.
+ */
+ if (obj->size <= sizeof(sh_obj->objdata))
+ p = sh_obj->objdata;
+ else {
+ /*
+ * If we have only one partition, the region is locked.
+ */
+ if (region->part_t_size != 1)
+ LOCK_REGION_LOCK(env);
+ ret = __env_alloc(&lt->reginfo, obj->size, &p);
+ if (region->part_t_size != 1)
+ LOCK_REGION_UNLOCK(env);
+ if (ret != 0) {
+ __db_errx(env,
+ "No space for lock object storage");
+ goto err;
+ }
+ }
+
+ memcpy(p, obj->data, obj->size);
+
+ SH_TAILQ_REMOVE(&FREE_OBJS(
+ lt, part_id), sh_obj, links, __db_lockobj);
+#ifdef HAVE_STATISTICS
+ /*
+ * Keep track of both the max number of objects allocated
+ * per partition and the max number of objects used by
+ * this bucket.
+ */
+ len++;
+ if (++lt->obj_stat[ndx].st_nobjects >
+ lt->obj_stat[ndx].st_maxnobjects)
+ lt->obj_stat[ndx].st_maxnobjects =
+ lt->obj_stat[ndx].st_nobjects;
+ if (++lt->part_array[part_id].part_stat.st_nobjects >
+ lt->part_array[part_id].part_stat.st_maxnobjects)
+ lt->part_array[part_id].part_stat.st_maxnobjects =
+ lt->part_array[part_id].part_stat.st_nobjects;
+#endif
+
+ sh_obj->indx = ndx;
+ SH_TAILQ_INIT(&sh_obj->waiters);
+ SH_TAILQ_INIT(&sh_obj->holders);
+ sh_obj->lockobj.size = obj->size;
+ sh_obj->lockobj.off =
+ (roff_t)SH_PTR_TO_OFF(&sh_obj->lockobj, p);
+ SH_TAILQ_INSERT_HEAD(
+ &lt->obj_tab[ndx], sh_obj, links, __db_lockobj);
+ }
+
+#ifdef HAVE_STATISTICS
+ if (len > lt->obj_stat[ndx].st_hash_len)
+ lt->obj_stat[ndx].st_hash_len = len;
+#endif
+
+ *retp = sh_obj;
+ return (0);
+
+err: return (ret);
+}
+
+/*
+ * __lock_same_family --
+ * Looks for compatible lockers. There are two modes:
+ *	1) If locker 2 belongs to a family transaction, then the locks are
+ * compatible if the lockers share the same last ancestor.
+ * 2) Otherwise the lockers are compatible if locker 1 is a parent of
+ * locker 2.
+ * Return 1 if the lockers are compatible.
+ *
+ * This is used to determine if we should grant locks that appear to conflict,
+ * but don't because the lock is already held by a compatible locker.
+ */
+static int
+__lock_same_family(lt, sh_locker1, sh_locker2)
+ DB_LOCKTAB *lt;
+ DB_LOCKER *sh_locker1;
+ DB_LOCKER *sh_locker2;
+{
+ while (sh_locker2->parent_locker != INVALID_ROFF) {
+ sh_locker2 = R_ADDR(&lt->reginfo, sh_locker2->parent_locker);
+ if (sh_locker2 == sh_locker1)
+ return (1);
+ }
+
+ if (!F_ISSET(sh_locker2, DB_LOCKER_FAMILY_LOCKER))
+ return (0);
+
+ /*
+ * If checking for a family locker situation, compare the last ancestor
+ * of each locker.
+ */
+ while (sh_locker1->parent_locker != INVALID_ROFF)
+ sh_locker1 =
+ R_ADDR(&lt->reginfo, sh_locker1->parent_locker);
+
+ return (sh_locker1 == sh_locker2);
+}
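+
+/*
+ * Illustrative sketch -- not part of this patch and never compiled (the
+ * guard is not defined anywhere).  It restates the two modes above over
+ * a stand-in locker struct that uses plain pointers instead of region
+ * offsets: first walk up from locker 2 looking for locker 1, then, for
+ * family lockers only, compare the two lockers' last ancestors.
+ */
+#ifdef LOCK_EXAMPLE_FAMILY
+struct example_locker {
+	struct example_locker *parent;		/* NULL at the root. */
+	int is_family_locker;
+};
+
+static int
+example_same_family(l1, l2)
+	struct example_locker *l1, *l2;
+{
+	/* Mode 2: compatible if l1 is an ancestor of l2. */
+	for (; l2->parent != NULL; l2 = l2->parent)
+		if (l2->parent == l1)
+			return (1);
+	/* l2 is now its own last ancestor. */
+	if (!l2->is_family_locker)
+		return (0);
+	/* Mode 1: family lockers are compatible if the roots match. */
+	while (l1->parent != NULL)
+		l1 = l1->parent;
+	return (l1 == l2);
+}
+#endif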
+
+/*
+ * __lock_locker_same_family --
+ * Determine if "locker" is an ancestor of "child".
+ * *retp == 1 if so, 0 otherwise.
+ *
+ * PUBLIC: int __lock_locker_same_family
+ * PUBLIC: __P((ENV *, DB_LOCKER *, DB_LOCKER *, int *));
+ */
+int
+__lock_locker_same_family(env, locker1, locker2, retp)
+ ENV *env;
+ DB_LOCKER *locker1;
+ DB_LOCKER *locker2;
+ int *retp;
+{
+ DB_LOCKTAB *lt;
+
+ lt = env->lk_handle;
+
+ /*
+ * The locker may not exist for this transaction, if not then it has
+ * no parents.
+ */
+ if (locker1 == NULL)
+ *retp = 0;
+ else
+ *retp = __lock_same_family(lt, locker1, locker2);
+ return (0);
+}
+
+/*
+ * __lock_inherit_locks --
+ * Called on child commit to merge child's locks with parent's.
+ */
+static int
+__lock_inherit_locks(lt, sh_locker, flags)
+ DB_LOCKTAB *lt;
+ DB_LOCKER *sh_locker;
+ u_int32_t flags;
+{
+ DB_LOCKER *sh_parent;
+ DB_LOCKOBJ *obj;
+ DB_LOCKREGION *region;
+ ENV *env;
+ int ret;
+ struct __db_lock *hlp, *lp;
+ roff_t poff;
+
+ env = lt->env;
+ region = lt->reginfo.primary;
+
+ /*
+ * Get the committing locker and mark it as deleted.
+ * This allows us to traverse the locker links without
+ * worrying that someone else is deleting locks out
+ * from under us. However, if the locker doesn't
+ * exist, that just means that the child holds no
+ * locks, so inheritance is easy!
+ */
+ if (sh_locker == NULL) {
+ __db_errx(env, __db_locker_invalid);
+ return (EINVAL);
+ }
+
+ /* Make sure we are a child transaction. */
+ if (sh_locker->parent_locker == INVALID_ROFF) {
+ __db_errx(env, DB_STR("2039", "Not a child transaction"));
+ return (EINVAL);
+ }
+ sh_parent = R_ADDR(&lt->reginfo, sh_locker->parent_locker);
+
+ /*
+ * In order to make it possible for a parent to have
+ * many, many children who lock the same objects, and
+ * not require an inordinate number of locks, we try
+ * to merge the child's locks with its parent's.
+ */
+ poff = R_OFFSET(&lt->reginfo, sh_parent);
+ for (lp = SH_LIST_FIRST(&sh_locker->heldby, __db_lock);
+ lp != NULL;
+ lp = SH_LIST_FIRST(&sh_locker->heldby, __db_lock)) {
+ SH_LIST_REMOVE(lp, locker_links, __db_lock);
+
+ /* See if the parent already has a lock. */
+ obj = SH_OFF_TO_PTR(lp, lp->obj, DB_LOCKOBJ);
+ OBJECT_LOCK_NDX(lt, region, obj->indx);
+ SH_TAILQ_FOREACH(hlp, &obj->holders, links, __db_lock)
+ if (hlp->holder == poff && lp->mode == hlp->mode)
+ break;
+
+ if (hlp != NULL) {
+ /* Parent already holds lock. */
+ hlp->refcount += lp->refcount;
+
+ /* Remove lock from object list and free it. */
+ DB_ASSERT(env, lp->status == DB_LSTAT_HELD);
+ SH_TAILQ_REMOVE(&obj->holders, lp, links, __db_lock);
+ (void)__lock_freelock(lt, lp, sh_locker, DB_LOCK_FREE);
+ } else {
+ /* Just move lock to parent chains. */
+ SH_LIST_INSERT_HEAD(&sh_parent->heldby,
+ lp, locker_links, __db_lock);
+ lp->holder = poff;
+ sh_parent->nlocks++;
+ if (IS_WRITELOCK(lp->mode))
+ sh_parent->nwrites++;
+ }
+
+ /*
+ * We may need to promote regardless of whether we simply
+ * moved the lock to the parent or changed the parent's
+ * reference count, because there might be a sibling waiting,
+ * who will now be allowed to make forward progress.
+ */
+ ret = __lock_promote(lt, obj, NULL, flags);
+ OBJECT_UNLOCK(lt, region, obj->indx);
+ if (ret != 0)
+ return (ret);
+ }
+
+ return (0);
+}
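+
+/*
+ * Illustrative sketch -- not part of this patch and never compiled (the
+ * guard is not defined anywhere).  Lock inheritance is reached through
+ * nested transactions in the public API: when the child commits, its
+ * locks are merged into the parent's locker as above.  The handles and
+ * key/data pair are assumptions for the example.
+ */
+#ifdef LOCK_EXAMPLE_INHERIT
+#include <db.h>
+
+static int
+inherit_example(dbenv, dbp, key, data)
+	DB_ENV *dbenv;
+	DB *dbp;
+	DBT *key, *data;
+{
+	DB_TXN *parent, *child;
+	int ret;
+
+	if ((ret = dbenv->txn_begin(dbenv, NULL, &parent, 0)) != 0)
+		return (ret);
+	if ((ret = dbenv->txn_begin(dbenv, parent, &child, 0)) != 0)
+		goto err;
+	/* The child's write locks the page... */
+	if ((ret = dbp->put(dbp, child, key, data, 0)) != 0)
+		goto err;
+	/* ...and committing merges those locks into the parent. */
+	if ((ret = child->commit(child, 0)) != 0)
+		goto err;
+	return (parent->commit(parent, 0));
+
+err:	(void)parent->abort(parent);	/* Also aborts any open child. */
+	return (ret);
+}
+#endif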
+
+/*
+ * __lock_wakeup --
+ *
+ *	Wake up any waiters on a lock object.
+ *
+ * PUBLIC: int __lock_wakeup __P((ENV *, const DBT *));
+ */
+int
+__lock_wakeup(env, obj)
+ ENV *env;
+ const DBT *obj;
+{
+ DB_LOCKOBJ *sh_obj;
+ DB_LOCKREGION *region;
+ DB_LOCKTAB *lt;
+ u_int32_t ndx;
+ int ret;
+
+ /* Check if locks have been globally turned off. */
+ if (F_ISSET(env->dbenv, DB_ENV_NOLOCKING))
+ return (0);
+
+ lt = env->lk_handle;
+ region = lt->reginfo.primary;
+
+ OBJECT_LOCK(lt, region, obj, ndx);
+ if ((ret = __lock_getobj(lt, obj, ndx, 0, &sh_obj)) == 0 &&
+ sh_obj != NULL)
+ ret = __lock_promote(lt, sh_obj, NULL, DB_LOCK_ONEWAITER);
+
+ OBJECT_UNLOCK(lt, region, ndx);
+ return (ret);
+}
+
+/*
+ * __lock_promote --
+ *
+ * Look through the waiters and holders lists and decide which (if any)
+ * locks can be promoted. Promote any that are eligible.
+ *
+ * PUBLIC: int __lock_promote
+ * PUBLIC: __P((DB_LOCKTAB *, DB_LOCKOBJ *, int *, u_int32_t));
+ */
+int
+__lock_promote(lt, obj, state_changedp, flags)
+ DB_LOCKTAB *lt;
+ DB_LOCKOBJ *obj;
+ int *state_changedp;
+ u_int32_t flags;
+{
+ struct __db_lock *lp_w, *lp_h, *next_waiter;
+ DB_LOCKREGION *region;
+ int had_waiters, state_changed;
+
+ region = lt->reginfo.primary;
+ had_waiters = 0;
+
+ /*
+ * We need to do lock promotion. We also need to determine if we're
+ * going to need to run the deadlock detector again. If we release
+ * locks, and there are waiters, but no one gets promoted, then we
+ * haven't fundamentally changed the lockmgr state, so we may still
+ * have a deadlock and we have to run again. However, if there were
+ * no waiters, or we actually promoted someone, then we are OK and we
+ * don't have to run it immediately.
+ *
+ * During promotion, we look for state changes so we can return this
+ * information to the caller.
+ */
+
+ for (lp_w = SH_TAILQ_FIRST(&obj->waiters, __db_lock),
+ state_changed = lp_w == NULL;
+ lp_w != NULL;
+ lp_w = next_waiter) {
+ had_waiters = 1;
+ next_waiter = SH_TAILQ_NEXT(lp_w, links, __db_lock);
+
+ /* Waiter may have aborted or expired. */
+ if (lp_w->status != DB_LSTAT_WAITING)
+ continue;
+
+ SH_TAILQ_FOREACH(lp_h, &obj->holders, links, __db_lock) {
+ if (lp_h->holder != lp_w->holder &&
+ CONFLICTS(lt, region, lp_h->mode, lp_w->mode)) {
+ if (!__lock_same_family(lt,
+ R_ADDR(&lt->reginfo, lp_h->holder),
+ R_ADDR(&lt->reginfo, lp_w->holder)))
+ break;
+ }
+ }
+ if (lp_h != NULL) /* Found a conflict. */
+ break;
+
+ /* No conflict, promote the waiting lock. */
+ SH_TAILQ_REMOVE(&obj->waiters, lp_w, links, __db_lock);
+ lp_w->status = DB_LSTAT_PENDING;
+ SH_TAILQ_INSERT_TAIL(&obj->holders, lp_w, links);
+
+ /* Wake up waiter. */
+ MUTEX_UNLOCK(lt->env, lp_w->mtx_lock);
+ state_changed = 1;
+ if (LF_ISSET(DB_LOCK_ONEWAITER))
+ break;
+ }
+
+ /*
+ * If this object had waiters and doesn't any more, then we need
+ * to remove it from the dd_obj list.
+ */
+ if (had_waiters && SH_TAILQ_FIRST(&obj->waiters, __db_lock) == NULL) {
+ LOCK_DD(lt->env, region);
+ /*
+ * Bump the generation when removing an object from the
+ * queue so that the deadlock detector will retry.
+ */
+ obj->generation++;
+ SH_TAILQ_REMOVE(&region->dd_objs, obj, dd_links, __db_lockobj);
+ UNLOCK_DD(lt->env, region);
+ }
+
+ if (state_changedp != NULL)
+ *state_changedp = state_changed;
+
+ return (0);
+}
+
+/*
+ * __lock_remove_waiter --
+ * Any lock on the waitlist has a process waiting for it. Therefore,
+ * we can't return the lock to the freelist immediately. Instead, we can
+ * remove the lock from the list of waiters, set the status field of the
+ * lock, and then let the process waking up return the lock to the
+ * free list.
+ *
+ * This must be called with the object bucket locked.
+ */
+static int
+__lock_remove_waiter(lt, sh_obj, lockp, status)
+ DB_LOCKTAB *lt;
+ DB_LOCKOBJ *sh_obj;
+ struct __db_lock *lockp;
+ db_status_t status;
+{
+ DB_LOCKREGION *region;
+ int do_wakeup;
+
+ region = lt->reginfo.primary;
+
+ do_wakeup = lockp->status == DB_LSTAT_WAITING;
+
+ SH_TAILQ_REMOVE(&sh_obj->waiters, lockp, links, __db_lock);
+ lockp->links.stqe_prev = -1;
+ lockp->status = status;
+ if (SH_TAILQ_FIRST(&sh_obj->waiters, __db_lock) == NULL) {
+ LOCK_DD(lt->env, region);
+ sh_obj->generation++;
+ SH_TAILQ_REMOVE(
+ &region->dd_objs,
+ sh_obj, dd_links, __db_lockobj);
+ UNLOCK_DD(lt->env, region);
+ }
+
+ /*
+ * Wake whoever is waiting on this lock.
+ */
+ if (do_wakeup)
+ MUTEX_UNLOCK(lt->env, lockp->mtx_lock);
+
+ return (0);
+}
+
+/*
+ * __lock_trade --
+ *
+ * Trade locker ids on a lock. This is used to reassign file locks from
+ * a transactional locker id to a long-lived locker id. This should be
+ * called with the region mutex held.
+ */
+static int
+__lock_trade(env, lock, new_locker)
+ ENV *env;
+ DB_LOCK *lock;
+ DB_LOCKER *new_locker;
+{
+ struct __db_lock *lp;
+ DB_LOCKTAB *lt;
+ int ret;
+
+ lt = env->lk_handle;
+ lp = R_ADDR(&lt->reginfo, lock->off);
+
+ /* If the lock is already released, simply return. */
+ if (lp->gen != lock->gen)
+ return (DB_NOTFOUND);
+
+ if (new_locker == NULL) {
+ __db_errx(env, DB_STR("2040", "Locker does not exist"));
+ return (EINVAL);
+ }
+
+ /* Remove the lock from its current locker. */
+ if ((ret = __lock_freelock(lt,
+ lp, R_ADDR(&lt->reginfo, lp->holder), DB_LOCK_UNLINK)) != 0)
+ return (ret);
+
+ /* Add lock to its new locker. */
+ SH_LIST_INSERT_HEAD(&new_locker->heldby, lp, locker_links, __db_lock);
+ new_locker->nlocks++;
+ if (IS_WRITELOCK(lp->mode))
+ new_locker->nwrites++;
+ lp->holder = R_OFFSET(&lt->reginfo, new_locker);
+
+ return (0);
+}
+
+/*
+ * __lock_change --
+ *
+ * PUBLIC: int __lock_change __P((ENV *, DB_LOCK *, DB_LOCK *));
+ *
+ * Change a lock to a different object. This is used when we move a
+ * metadata page to change the handle lock. We know that the new lock
+ * has replaced the old lock, so we just delete the old lock.
+ */
+int
+__lock_change(env, old_lock, new_lock)
+ ENV *env;
+ DB_LOCK *old_lock, *new_lock;
+{
+ struct __db_lock *lp, *old_lp;
+ DB_LOCKOBJ *old_obj, *new_obj;
+ DB_LOCKTAB *lt;
+ DB_LOCKREGION *region;
+ u_int32_t old_part, new_part;
+ int ret;
+
+ lt = env->lk_handle;
+ region = lt->reginfo.primary;
+
+ old_lp = R_ADDR(&lt->reginfo, old_lock->off);
+ DB_ASSERT(env, old_lp->gen == old_lock->gen);
+ old_obj = SH_OFF_TO_PTR(old_lp, old_lp->obj, DB_LOCKOBJ);
+
+ lp = R_ADDR(&lt->reginfo, new_lock->off);
+ DB_ASSERT(env, lp->gen == new_lock->gen);
+ new_obj = SH_OFF_TO_PTR(lp, lp->obj, DB_LOCKOBJ);
+
+ /* Don't deadlock on partition mutexes, order the latches. */
+ LOCK_SYSTEM_LOCK(lt, region);
+ old_part = LOCK_PART(region, old_obj->indx);
+ new_part = LOCK_PART(region, new_obj->indx);
+
+ if (old_part == new_part)
+ MUTEX_LOCK_PARTITION(lt, region, old_part);
+ else if (new_obj->indx < old_obj->indx) {
+ MUTEX_LOCK_PARTITION(lt, region, new_part);
+ MUTEX_LOCK_PARTITION(lt, region, old_part);
+ } else {
+ MUTEX_LOCK_PARTITION(lt, region, old_part);
+ MUTEX_LOCK_PARTITION(lt, region, new_part);
+ }
+
+ for (lp = SH_TAILQ_FIRST(&old_obj->waiters, __db_lock);
+ lp != NULL;
+ lp = SH_TAILQ_FIRST(&old_obj->waiters, __db_lock)) {
+ SH_TAILQ_REMOVE(&old_obj->waiters, lp, links, __db_lock);
+ SH_TAILQ_INSERT_TAIL(&new_obj->waiters, lp, links);
+ lp->indx = new_obj->indx;
+ lp->obj = (roff_t)SH_PTR_TO_OFF(lp, new_obj);
+ }
+
+ for (lp = SH_TAILQ_FIRST(&old_obj->holders, __db_lock);
+ lp != NULL;
+ lp = SH_TAILQ_FIRST(&old_obj->holders, __db_lock)) {
+ SH_TAILQ_REMOVE(&old_obj->holders, lp, links, __db_lock);
+ if (lp == old_lp)
+ continue;
+ SH_TAILQ_INSERT_TAIL(&new_obj->holders, lp, links);
+ lp->indx = new_obj->indx;
+ lp->obj = (roff_t)SH_PTR_TO_OFF(lp, new_obj);
+ }
+
+ /* Put the lock back in and call put so the object goes away too. */
+ SH_TAILQ_INSERT_TAIL(&old_obj->holders, old_lp, links);
+ ret = __lock_put_internal(lt, old_lp, old_obj->indx,
+ DB_LOCK_UNLINK | DB_LOCK_FREE | DB_LOCK_NOPROMOTE);
+
+ MUTEX_UNLOCK_PARTITION(lt, region, new_part);
+ if (new_part != old_part)
+ MUTEX_UNLOCK_PARTITION(lt, region, old_part);
+ LOCK_SYSTEM_UNLOCK(lt, region);
+
+ return (ret);
+}
diff --git a/src/lock/lock_alloc.incl b/src/lock/lock_alloc.incl
new file mode 100644
index 00000000..edea07d2
--- /dev/null
+++ b/src/lock/lock_alloc.incl
@@ -0,0 +1,138 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2011, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+/*
+ * This is a template for allocation in the lock region. The following
+ * macros must be defined:
+ *
+ * FREE_LIST_HEAD -- the name of the head of the free list.
+ * STRUCT_NAME -- the name of the structure in the free list.
+ * CURRENT_COUNT -- structure element for count of current objects.
+ * MAX_COUNT -- structure element for max of current objects.
+ * STEAL_NAME -- name of stat to track steals.
+ * STEAL_EVENT -- name of event to track steals.
+ */
+#define __lock_alloc() /* for ctags */
+{
+ struct STRUCT_NAME *sh_thing;
+ DB_LOCKPART *end_p, *cur_p, *orig_p;
+ DB_LOCKREGION *region;
+ int begin, locked;
+ u_int32_t i, nobjs;
+
+ region = lt->reginfo.primary;
+
+ orig_p = &lt->part_array[part_id];
+ if (region->part_t_size == 1)
+ goto alloc;
+retry: MUTEX_UNLOCK(lt->env, orig_p->mtx_part);
+ locked = 0;
+ sh_thing = NULL;
+ end_p = &lt->part_array[region->part_t_size];
+ /*
+ * Start looking at the next partition and wrap around. If
+ * we get back to our partition then raise an error.
+ */
+ begin = 0;
+ nobjs = 0;
+ cur_p = orig_p + 1;
+again: for (; sh_thing == NULL && cur_p < end_p; cur_p++) {
+ MUTEX_LOCK(lt->env, cur_p->mtx_part);
+ if ((sh_thing = SH_TAILQ_FIRST(
+ &cur_p->FREE_LIST_HEAD, STRUCT_NAME)) != NULL)
+ SH_TAILQ_REMOVE(&cur_p->FREE_LIST_HEAD,
+ sh_thing, links, STRUCT_NAME);
+ MUTEX_UNLOCK(lt->env, cur_p->mtx_part);
+ }
+ if (sh_thing != NULL) {
+ MUTEX_LOCK(lt->env, orig_p->mtx_part);
+ SH_TAILQ_INSERT_HEAD(&orig_p->FREE_LIST_HEAD,
+ sh_thing, links, STRUCT_NAME);
+ STAT_INC_VERB(env,
+ lock, STEAL_EVENT, orig_p->part_stat.STEAL_NAME,
+ cur_p - lt->part_array, part_id);
+ return (0);
+ }
+ if (!begin) {
+ begin = 1;
+ cur_p = lt->part_array;
+ end_p = orig_p;
+ goto again;
+ }
+ /*
+ * Try to get some more space in the region.
+ */
+ LOCK_REGION_LOCK(lt->env);
+ MUTEX_LOCK(lt->env, orig_p->mtx_part);
+ locked = 1;
+ nobjs = 0;
+	/* Check to see if we raced with someone. */
+ if ((region->stat.MAX_COUNT == 0 ||
+ region->stat.CURRENT_COUNT < region->stat.MAX_COUNT) &&
+ SH_TAILQ_FIRST(&orig_p->FREE_LIST_HEAD, STRUCT_NAME) == NULL) {
+ MUTEX_UNLOCK(lt->env, orig_p->mtx_part);
+alloc: locked = 0;
+ sh_thing = NULL;
+ cur_p = orig_p;
+ end_p = &lt->part_array[region->part_t_size];
+ nobjs = region->stat.CURRENT_COUNT >> 2;
+ /* Just in case. */
+ if (nobjs == 0)
+ nobjs = 1;
+ if (region->stat.MAX_COUNT != 0 &&
+ region->stat.MAX_COUNT <
+ region->stat.CURRENT_COUNT + nobjs)
+ nobjs = region->stat.MAX_COUNT -
+ region->stat.CURRENT_COUNT;
+ /*
+ * If the max memory is not sized for max objects,
+ * allocate as much as possible.
+ */
+ F_SET(&lt->reginfo, REGION_TRACKED);
+ while (__env_alloc(&lt->reginfo,
+ nobjs * sizeof(struct STRUCT_NAME), &sh_thing) != 0)
+ if ((nobjs >>= 1) == 0)
+ break;
+ F_CLR(&lt->reginfo, REGION_TRACKED);
+ region->stat.CURRENT_COUNT += nobjs;
+ if (region->part_t_size != 1)
+ LOCK_REGION_UNLOCK(lt->env);
+
+ if (nobjs == 0)
+ goto err;
+
+ for (i = 0; i < nobjs; i++) {
+ memset(sh_thing, 0, sizeof (struct STRUCT_NAME));
+ if (&cur_p->free_locks ==
+ (struct __flock *)&cur_p->FREE_LIST_HEAD)
+ ((struct __db_lock *)
+ sh_thing)->status = DB_LSTAT_FREE;
+ MUTEX_LOCK(lt->env, cur_p->mtx_part);
+ SH_TAILQ_INSERT_HEAD(&cur_p->FREE_LIST_HEAD,
+ sh_thing, links, STRUCT_NAME);
+ MUTEX_UNLOCK(lt->env, cur_p->mtx_part);
+ if (region->part_t_size != 1 && ++cur_p == end_p)
+ cur_p = lt->part_array;
+ sh_thing++;
+ }
+ if (region->part_t_size != 1)
+ MUTEX_LOCK(lt->env, orig_p->mtx_part);
+ locked = 1;
+ } else
+ LOCK_REGION_UNLOCK(lt->env);
+
+ if (SH_TAILQ_FIRST(&orig_p->FREE_LIST_HEAD, STRUCT_NAME) != NULL)
+ return (0);
+	/* Someone stole all the locks! */
+ if (nobjs > 0)
+ goto retry;
+
+err: if (region->part_t_size != 1 && locked == 0)
+ MUTEX_LOCK(lt->env, orig_p->mtx_part);
+ return (__lock_nomem(lt->env, "lock entries"));
+}
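+
+/*
+ * Illustrative sketch -- not part of this patch and never compiled (the
+ * guard is not defined anywhere).  A minimal standalone version of the
+ * include-template trick used above, with hypothetical names: the
+ * includer binds the macros, then pulls in a shared block as the
+ * function body.
+ */
+#ifdef LOCK_EXAMPLE_TEMPLATE
+#include <stdlib.h>
+
+struct widget { struct widget *next; };
+
+#define STRUCT_NAME	widget			/* Bound by the includer. */
+#define FREE_LIST	widget_free_list
+
+static struct widget *widget_free_list;
+
+static int
+widget_alloc(retp)
+	struct STRUCT_NAME **retp;
+/* In the real code the block below would live in the .incl file. */
+{
+	struct STRUCT_NAME *p;
+
+	if ((p = FREE_LIST) != NULL)
+		FREE_LIST = p->next;
+	else if ((p = malloc(sizeof(struct STRUCT_NAME))) == NULL)
+		return (-1);
+	*retp = p;
+	return (0);
+}
+#undef STRUCT_NAME
+#undef FREE_LIST
+#endif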
diff --git a/src/lock/lock_deadlock.c b/src/lock/lock_deadlock.c
new file mode 100644
index 00000000..3c00d7f1
--- /dev/null
+++ b/src/lock/lock_deadlock.c
@@ -0,0 +1,1063 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+#include "dbinc/txn.h"
+
+#define ISSET_MAP(M, N) ((M)[(N) / 32] & (1 << ((N) % 32)))
+
+#define CLEAR_MAP(M, N) { \
+ u_int32_t __i; \
+ for (__i = 0; __i < (N); __i++) \
+ (M)[__i] = 0; \
+}
+
+#define SET_MAP(M, B) ((M)[(B) / 32] |= (1 << ((B) % 32)))
+#define CLR_MAP(M, B) ((M)[(B) / 32] &= ~((u_int)1 << ((B) % 32)))
+
+#define OR_MAP(D, S, N) { \
+ u_int32_t __i; \
+ for (__i = 0; __i < (N); __i++) \
+ D[__i] |= S[__i]; \
+}
+#define BAD_KILLID 0xffffffff
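+
+/*
+ * Illustrative sketch -- not part of this patch and never compiled (the
+ * guard is not defined anywhere).  The waits-for graph is a bit matrix:
+ * row i is the set of lockers that locker i waits on, packed 32 bits to
+ * a word, so with three lockers each row is a single word.
+ */
+#ifdef LOCK_EXAMPLE_BITMAP
+static void
+bitmap_example()
+{
+	u_int32_t row0[1], row1[1], *d, *s;
+
+	row0[0] = row1[0] = 0;
+	SET_MAP(row0, 1);		/* Locker 0 waits on locker 1. */
+	SET_MAP(row1, 2);		/* Locker 1 waits on locker 2. */
+
+	/* Fold row 1 into row 0: locker 0 transitively waits on 2. */
+	d = row0;
+	s = row1;
+	OR_MAP(d, s, 1);
+	/*
+	 * ISSET_MAP(d, 2) is now nonzero; ISSET_MAP(d, 0) becoming
+	 * nonzero would mean a cycle through locker 0.
+	 */
+}
+#endif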
+
+typedef struct {
+ int valid;
+ int self_wait;
+ int in_abort;
+ u_int32_t count;
+ u_int32_t id;
+ roff_t last_lock;
+ roff_t last_obj;
+ u_int32_t last_ndx;
+ u_int32_t last_locker_id;
+ db_pgno_t pgno;
+ u_int32_t priority;
+} locker_info;
+
+static int __dd_abort __P((ENV *, locker_info *, int *));
+static int __dd_build __P((ENV *, u_int32_t, u_int32_t **,
+ u_int32_t *, u_int32_t *, locker_info **, int*, int*));
+static int __dd_find __P((ENV *,
+ u_int32_t *, locker_info *, u_int32_t, u_int32_t, u_int32_t ***));
+static int __dd_isolder __P((u_int32_t, u_int32_t, u_int32_t, u_int32_t));
+static int __dd_verify __P((locker_info *, u_int32_t *, u_int32_t *,
+ u_int32_t *, u_int32_t, u_int32_t, u_int32_t));
+
+#ifdef DIAGNOSTIC
+static void __dd_debug
+ __P((ENV *, locker_info *, u_int32_t *, u_int32_t, u_int32_t));
+#endif
+
+/*
+ * __lock_detect_pp --
+ * ENV->lock_detect pre/post processing.
+ *
+ * PUBLIC: int __lock_detect_pp __P((DB_ENV *, u_int32_t, u_int32_t, int *));
+ */
+int
+__lock_detect_pp(dbenv, flags, atype, rejectp)
+ DB_ENV *dbenv;
+ u_int32_t flags, atype;
+ int *rejectp;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->lk_handle, "DB_ENV->lock_detect", DB_INIT_LOCK);
+
+ /* Validate arguments. */
+ if ((ret = __db_fchk(env, "DB_ENV->lock_detect", flags, 0)) != 0)
+ return (ret);
+ switch (atype) {
+ case DB_LOCK_DEFAULT:
+ case DB_LOCK_EXPIRE:
+ case DB_LOCK_MAXLOCKS:
+ case DB_LOCK_MAXWRITE:
+ case DB_LOCK_MINLOCKS:
+ case DB_LOCK_MINWRITE:
+ case DB_LOCK_OLDEST:
+ case DB_LOCK_RANDOM:
+ case DB_LOCK_YOUNGEST:
+ break;
+ default:
+ __db_errx(env, DB_STR("2048",
+ "DB_ENV->lock_detect: unknown deadlock detection mode specified"));
+ return (EINVAL);
+ }
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__lock_detect(env, atype, rejectp)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
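+
+/*
+ * Illustrative sketch -- not part of this patch and never compiled (the
+ * guard is not defined anywhere).  This entry point is usually driven
+ * either by a dedicated thread calling DB_ENV->lock_detect periodically
+ * or by DB_ENV->set_lk_detect, which runs the detector whenever a lock
+ * request would block.
+ */
+#ifdef LOCK_EXAMPLE_DETECT
+#include <db.h>
+
+static int
+detect_example(dbenv)
+	DB_ENV *dbenv;
+{
+	int rejected, ret;
+
+	/* One explicit pass, aborting the youngest locker in a cycle. */
+	if ((ret = dbenv->lock_detect(dbenv,
+	    0, DB_LOCK_YOUNGEST, &rejected)) != 0)
+		return (ret);
+	/* Or run it automatically whenever a lock request must wait. */
+	return (dbenv->set_lk_detect(dbenv, DB_LOCK_DEFAULT));
+}
+#endif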
+
+/*
+ * __lock_detect --
+ * ENV->lock_detect.
+ *
+ * PUBLIC: int __lock_detect __P((ENV *, u_int32_t, int *));
+ */
+int
+__lock_detect(env, atype, rejectp)
+ ENV *env;
+ u_int32_t atype;
+ int *rejectp;
+{
+ DB_LOCKREGION *region;
+ DB_LOCKTAB *lt;
+ db_timespec now;
+ locker_info *idmap;
+ u_int32_t *bitmap, *copymap, **deadp, **deadlist, *tmpmap;
+ u_int32_t i, cid, keeper, killid, limit, nalloc, nlockers;
+ u_int32_t lock_max, txn_max;
+ int pri_set, ret, status;
+
+ /*
+ * If this environment is a replication client, then we must use the
+ * MINWRITE detection discipline.
+ */
+ if (IS_REP_CLIENT(env))
+ atype = DB_LOCK_MINWRITE;
+
+ copymap = tmpmap = NULL;
+ deadlist = NULL;
+
+ lt = env->lk_handle;
+ if (rejectp != NULL)
+ *rejectp = 0;
+
+ /* Check if a detector run is necessary. */
+
+ /* Make a pass only if auto-detect would run. */
+ region = lt->reginfo.primary;
+
+ timespecclear(&now);
+ if (region->need_dd == 0 &&
+ (!timespecisset(&region->next_timeout) ||
+ !__clock_expired(env, &now, &region->next_timeout))) {
+ return (0);
+ }
+ if (region->need_dd == 0)
+ atype = DB_LOCK_EXPIRE;
+
+ /* Reset need_dd, so we know we've run the detector. */
+ region->need_dd = 0;
+
+ /* Build the waits-for bitmap. */
+ ret = __dd_build(env,
+ atype, &bitmap, &nlockers, &nalloc, &idmap, rejectp, &pri_set);
+ lock_max = region->stat.st_cur_maxid;
+ if (ret != 0 || atype == DB_LOCK_EXPIRE)
+ return (ret);
+
+ /* If there are no lockers, there are no deadlocks. */
+ if (nlockers == 0)
+ return (0);
+
+#ifdef DIAGNOSTIC
+ if (FLD_ISSET(env->dbenv->verbose, DB_VERB_WAITSFOR))
+ __dd_debug(env, idmap, bitmap, nlockers, nalloc);
+#endif
+
+ /* Now duplicate the bitmaps so we can verify deadlock participants. */
+ if ((ret = __os_calloc(env, (size_t)nlockers,
+ sizeof(u_int32_t) * nalloc, &copymap)) != 0)
+ goto err;
+ memcpy(copymap, bitmap, nlockers * sizeof(u_int32_t) * nalloc);
+
+ if ((ret = __os_calloc(env, sizeof(u_int32_t), nalloc, &tmpmap)) != 0)
+ goto err;
+
+ /* Find a deadlock. */
+ if ((ret =
+ __dd_find(env, bitmap, idmap, nlockers, nalloc, &deadlist)) != 0)
+		goto err;
+
+ /*
+ * We need the cur_maxid from the txn region as well. In order
+ * to avoid tricky synchronization between the lock and txn
+ * regions, we simply unlock the lock region and then lock the
+ * txn region. This introduces a small window during which the
+ * transaction system could then wrap. We're willing to return
+ * the wrong answer for "oldest" or "youngest" in those rare
+ * circumstances.
+ */
+ if (TXN_ON(env)) {
+ TXN_SYSTEM_LOCK(env);
+ txn_max = ((DB_TXNREGION *)
+ env->tx_handle->reginfo.primary)->cur_maxid;
+ TXN_SYSTEM_UNLOCK(env);
+ } else
+ txn_max = TXN_MAXIMUM;
+
+ killid = BAD_KILLID;
+ for (deadp = deadlist; *deadp != NULL; deadp++) {
+ if (rejectp != NULL)
+ ++*rejectp;
+ killid = (u_int32_t)(*deadp - bitmap) / nalloc;
+ limit = killid;
+
+ /*
+ * There are cases in which our general algorithm will
+ * fail. Returning 1 from verify indicates that the
+ * particular locker is not only involved in a deadlock,
+ * but that killing him will allow others to make forward
+ * progress. Unfortunately, there are cases where we need
+ * to abort someone, but killing them will not necessarily
+ * ensure forward progress (imagine N readers all trying to
+ * acquire a write lock).
+ * killid is only set to lockers that pass the db_verify test.
+ * keeper will hold the best candidate even if it does
+ * not pass db_verify. Once we fill in killid then we do
+ * not need a keeper, but we keep updating it anyway.
+ */
+
+ keeper = idmap[killid].in_abort == 0 ? killid : BAD_KILLID;
+ if (keeper == BAD_KILLID ||
+ __dd_verify(idmap, *deadp,
+ tmpmap, copymap, nlockers, nalloc, keeper) == 0)
+ killid = BAD_KILLID;
+
+ if (!pri_set && killid != BAD_KILLID &&
+ (atype == DB_LOCK_DEFAULT || atype == DB_LOCK_RANDOM))
+ goto dokill;
+
+ /*
+ * Start with the id that we know is deadlocked, then examine
+ * all other set bits and see if any are a better candidate
+ * for abortion and they are genuinely part of the deadlock.
+ * The definition of "best":
+ * MAXLOCKS: maximum count
+ * MAXWRITE: maximum write count
+ * MINLOCKS: minimum count
+ * MINWRITE: minimum write count
+ * OLDEST: smallest id
+ * YOUNGEST: largest id
+ */
+ for (i = (limit + 1) % nlockers;
+ i != limit;
+ i = (i + 1) % nlockers) {
+ if (!ISSET_MAP(*deadp, i) || idmap[i].in_abort)
+ continue;
+
+ /*
+ * Determine if we have a verified candidate
+ * in killid, if not then compare with the
+ * non-verified candidate in keeper.
+ */
+ if (killid == BAD_KILLID) {
+ if (keeper == BAD_KILLID)
+ goto use_next;
+ else
+ cid = keeper;
+ } else
+ cid = killid;
+
+ if (idmap[i].priority > idmap[cid].priority)
+ continue;
+ if (idmap[i].priority < idmap[cid].priority)
+ goto use_next;
+
+ /* Equal priorities, break ties using atype. */
+ switch (atype) {
+ case DB_LOCK_OLDEST:
+ if (__dd_isolder(idmap[cid].id,
+ idmap[i].id, lock_max, txn_max))
+ continue;
+ break;
+ case DB_LOCK_YOUNGEST:
+ if (__dd_isolder(idmap[i].id,
+ idmap[cid].id, lock_max, txn_max))
+ continue;
+ break;
+ case DB_LOCK_MAXLOCKS:
+ if (idmap[i].count < idmap[cid].count)
+ continue;
+ break;
+ case DB_LOCK_MAXWRITE:
+ if (idmap[i].count < idmap[cid].count)
+ continue;
+ break;
+ case DB_LOCK_MINLOCKS:
+ case DB_LOCK_MINWRITE:
+ if (idmap[i].count > idmap[cid].count)
+ continue;
+ break;
+ case DB_LOCK_DEFAULT:
+ case DB_LOCK_RANDOM:
+ continue;
+
+ default:
+ killid = BAD_KILLID;
+ ret = EINVAL;
+ goto dokill;
+ }
+
+use_next: keeper = i;
+ if (__dd_verify(idmap, *deadp,
+ tmpmap, copymap, nlockers, nalloc, i))
+ killid = i;
+ }
+
+dokill: if (killid == BAD_KILLID) {
+ if (keeper == BAD_KILLID)
+ continue;
+ else {
+ /*
+ * Removing a single locker will not
+ * break the deadlock, signal to run
+ * detection again.
+ */
+ region->need_dd = 1;
+ killid = keeper;
+ }
+ }
+
+ /* Kill the locker with lockid idmap[killid]. */
+ if ((ret = __dd_abort(env, &idmap[killid], &status)) != 0)
+ break;
+
+ /*
+ * It's possible that the lock was already aborted; this isn't
+ * necessarily a problem, so do not treat it as an error. If
+ * the txn was aborting and deadlocked trying to upgrade
+ * a was_write lock, the detector should be run again or
+ * the deadlock might persist.
+ */
+ if (status != 0) {
+ if (status != DB_ALREADY_ABORTED)
+ __db_errx(env, DB_STR_A("2049",
+ "warning: unable to abort locker %lx",
+ "%lx"), (u_long)idmap[killid].id);
+ else
+ region->need_dd = 1;
+ } else if (FLD_ISSET(env->dbenv->verbose, DB_VERB_DEADLOCK))
+ __db_msg(env, DB_STR_A("2050", "Aborting locker %lx",
+ "%lx"), (u_long)idmap[killid].id);
+ }
+err: if (copymap != NULL)
+ __os_free(env, copymap);
+ if (deadlist != NULL)
+ __os_free(env, deadlist);
+ if (tmpmap != NULL)
+ __os_free(env, tmpmap);
+ __os_free(env, bitmap);
+ __os_free(env, idmap);
+
+ return (ret);
+}
+
+/*
+ * ========================================================================
+ * Utilities
+ */
+
+#define DD_INVALID_ID ((u_int32_t) -1)
+
+/*
+ * __dd_build --
+ * Build the lock dependency bit maps.
+ * Notes on synchronization:
+ * LOCK_SYSTEM_LOCK is used to hold objects locked when we have
+ * a single partition.
+ * LOCK_LOCKERS is held while we are walking the lockers list and
+ * to single thread the use of lockerp->dd_id.
+ * LOCK_DD protects the DD list of objects.
+ */
+
+static int
+__dd_build(env, atype, bmp, nlockers, allocp, idmap, rejectp, pri_set)
+ ENV *env;
+ u_int32_t atype, **bmp, *nlockers, *allocp;
+ locker_info **idmap;
+ int *pri_set, *rejectp;
+{
+ struct __db_lock *lp;
+ DB_LOCKER *lip, *lockerp, *child;
+ DB_LOCKOBJ *op, *lo, *np;
+ DB_LOCKREGION *region;
+ DB_LOCKTAB *lt;
+ locker_info *id_array;
+ db_timespec now, min_timeout;
+ u_int32_t *bitmap, count, dd;
+ u_int32_t *entryp, gen, id, indx, ndx, nentries, *tmpmap;
+ u_int8_t *pptr;
+ int is_first, ret;
+
+ COMPQUIET(indx, 0);
+ lt = env->lk_handle;
+ region = lt->reginfo.primary;
+ timespecclear(&now);
+ timespecclear(&min_timeout);
+
+ /*
+ * While we always check for expired timeouts, if we are called with
+ * DB_LOCK_EXPIRE, then we are only checking for timeouts (i.e., not
+ * doing deadlock detection at all). If we aren't doing real deadlock
+ * detection, then we can skip a significant amount of the processing.
+ * In particular we do not build the conflict array and our caller
+ * needs to expect this.
+ */
+ LOCK_SYSTEM_LOCK(lt, region);
+ if (atype == DB_LOCK_EXPIRE) {
+skip: LOCK_DD(env, region);
+ op = SH_TAILQ_FIRST(&region->dd_objs, __db_lockobj);
+ for (; op != NULL; op = np) {
+ indx = op->indx;
+ gen = op->generation;
+ UNLOCK_DD(env, region);
+ OBJECT_LOCK_NDX(lt, region, indx);
+ if (op->generation != gen) {
+ OBJECT_UNLOCK(lt, region, indx);
+ goto skip;
+ }
+ SH_TAILQ_FOREACH(lp, &op->waiters, links, __db_lock) {
+ lockerp = (DB_LOCKER *)
+ R_ADDR(&lt->reginfo, lp->holder);
+ if (lp->status == DB_LSTAT_WAITING) {
+ if (__clock_expired(env,
+ &now, &lockerp->lk_expire)) {
+ lp->status = DB_LSTAT_EXPIRED;
+ MUTEX_UNLOCK(
+ env, lp->mtx_lock);
+ if (rejectp != NULL)
+ ++*rejectp;
+ continue;
+ }
+ if (timespecisset(
+ &lockerp->lk_expire) &&
+ (!timespecisset(&min_timeout) ||
+ timespeccmp(&min_timeout,
+ &lockerp->lk_expire, >)))
+ min_timeout =
+ lockerp->lk_expire;
+ }
+ }
+ LOCK_DD(env, region);
+ np = SH_TAILQ_NEXT(op, dd_links, __db_lockobj);
+ OBJECT_UNLOCK(lt, region, indx);
+ }
+ UNLOCK_DD(env, region);
+ LOCK_SYSTEM_UNLOCK(lt, region);
+ goto done;
+ }
+
+ /*
+ * Allocate after locking the region
+ * to make sure the structures are large enough.
+ */
+ LOCK_LOCKERS(env, region);
+ count = region->nlockers;
+ if (count == 0) {
+ UNLOCK_LOCKERS(env, region);
+ LOCK_SYSTEM_UNLOCK(lt, region);
+ *nlockers = 0;
+ return (0);
+ }
+
+ if (FLD_ISSET(env->dbenv->verbose, DB_VERB_DEADLOCK))
+ __db_msg(env, DB_STR_A("2051", "%lu lockers",
+ "%lu"), (u_long)count);
+
+ nentries = (u_int32_t)DB_ALIGN(count, 32) / 32;
+
+ /* Allocate enough space for a count by count bitmap matrix. */
+ if ((ret = __os_calloc(env, (size_t)count,
+ sizeof(u_int32_t) * nentries, &bitmap)) != 0) {
+ UNLOCK_LOCKERS(env, region);
+ LOCK_SYSTEM_UNLOCK(lt, region);
+ return (ret);
+ }
+
+ if ((ret = __os_calloc(env,
+ sizeof(u_int32_t), nentries, &tmpmap)) != 0) {
+ UNLOCK_LOCKERS(env, region);
+ LOCK_SYSTEM_UNLOCK(lt, region);
+ __os_free(env, bitmap);
+ return (ret);
+ }
+
+ if ((ret = __os_calloc(env,
+ (size_t)count, sizeof(locker_info), &id_array)) != 0) {
+ UNLOCK_LOCKERS(env, region);
+ LOCK_SYSTEM_UNLOCK(lt, region);
+ __os_free(env, bitmap);
+ __os_free(env, tmpmap);
+ return (ret);
+ }
+
+ /*
+ * First we go through and assign each locker a deadlock detector id.
+ */
+ id = 0;
+ *pri_set = 0;
+ SH_TAILQ_FOREACH(lip, &region->lockers, ulinks, __db_locker) {
+ if (lip->master_locker == INVALID_ROFF) {
+ DB_ASSERT(env, id < count);
+ lip->dd_id = id++;
+ id_array[lip->dd_id].id = lip->id;
+ id_array[lip->dd_id].priority = lip->priority;
+ if (lip->dd_id > 0 &&
+ id_array[lip->dd_id-1].priority != lip->priority)
+ *pri_set = 1;
+
+ switch (atype) {
+ case DB_LOCK_MINLOCKS:
+ case DB_LOCK_MAXLOCKS:
+ id_array[lip->dd_id].count = lip->nlocks;
+ break;
+ case DB_LOCK_MINWRITE:
+ case DB_LOCK_MAXWRITE:
+ id_array[lip->dd_id].count = lip->nwrites;
+ break;
+ default:
+ break;
+ }
+ } else
+ lip->dd_id = DD_INVALID_ID;
+
+ }
+
+ /*
+ * We only need consider objects that have waiters, so we use
+ * the list of objects with waiters (dd_objs) instead of traversing
+ * the entire hash table. For each object, we traverse the waiters
+ * list and add an entry in the waitsfor matrix for each waiter/holder
+ * combination. We don't want to lock from the DD mutex to the
+ * hash mutex, so we drop the deadlock mutex and get the hash mutex. Then
+ * we check to see if the object has changed. Once we have the object
+ * locked, locks cannot be removed and lockers cannot go away.
+ */
+ if (0) {
+ /* If an object has changed state, start over. */
+again: memset(bitmap, 0, count * sizeof(u_int32_t) * nentries);
+ }
+ LOCK_DD(env, region);
+ op = SH_TAILQ_FIRST(&region->dd_objs, __db_lockobj);
+ for (; op != NULL; op = np) {
+ indx = op->indx;
+ gen = op->generation;
+ UNLOCK_DD(env, region);
+
+ OBJECT_LOCK_NDX(lt, region, indx);
+ if (gen != op->generation) {
+ OBJECT_UNLOCK(lt, region, indx);
+ goto again;
+ }
+
+ /*
+ * First we go through and create a bit map that
+ * represents all the holders of this object.
+ */
+
+ CLEAR_MAP(tmpmap, nentries);
+ SH_TAILQ_FOREACH(lp, &op->holders, links, __db_lock) {
+ lockerp = (DB_LOCKER *)R_ADDR(&lt->reginfo, lp->holder);
+
+ if (lockerp->dd_id == DD_INVALID_ID) {
+ /*
+ * If the locker was not here when we started,
+ * then it was not deadlocked at that time.
+ */
+ if (lockerp->master_locker == INVALID_ROFF)
+ continue;
+ dd = ((DB_LOCKER *)R_ADDR(&lt->reginfo,
+ lockerp->master_locker))->dd_id;
+ if (dd == DD_INVALID_ID)
+ continue;
+ lockerp->dd_id = dd;
+ switch (atype) {
+ case DB_LOCK_MINLOCKS:
+ case DB_LOCK_MAXLOCKS:
+ id_array[dd].count += lockerp->nlocks;
+ break;
+ case DB_LOCK_MINWRITE:
+ case DB_LOCK_MAXWRITE:
+ id_array[dd].count += lockerp->nwrites;
+ break;
+ default:
+ break;
+ }
+
+ } else
+ dd = lockerp->dd_id;
+ id_array[dd].valid = 1;
+
+ /*
+ * If the holder has already been aborted, then
+ * we should ignore it for now.
+ */
+ if (lp->status == DB_LSTAT_HELD)
+ SET_MAP(tmpmap, dd);
+ }
+
+ /*
+ * Next, for each waiter, we set its row in the matrix
+ * equal to the map of holders we set up above.
+ */
+ for (is_first = 1,
+ lp = SH_TAILQ_FIRST(&op->waiters, __db_lock);
+ lp != NULL;
+ is_first = 0,
+ lp = SH_TAILQ_NEXT(lp, links, __db_lock)) {
+ lockerp = (DB_LOCKER *)R_ADDR(&lt->reginfo, lp->holder);
+ if (lp->status == DB_LSTAT_WAITING) {
+ if (__clock_expired(env,
+ &now, &lockerp->lk_expire)) {
+ lp->status = DB_LSTAT_EXPIRED;
+ MUTEX_UNLOCK(env, lp->mtx_lock);
+ if (rejectp != NULL)
+ ++*rejectp;
+ continue;
+ }
+ if (timespecisset(&lockerp->lk_expire) &&
+ (!timespecisset(&min_timeout) ||
+ timespeccmp(
+ &min_timeout, &lockerp->lk_expire, >)))
+ min_timeout = lockerp->lk_expire;
+ }
+
+ if (lockerp->dd_id == DD_INVALID_ID) {
+ dd = ((DB_LOCKER *)R_ADDR(&lt->reginfo,
+ lockerp->master_locker))->dd_id;
+ lockerp->dd_id = dd;
+ switch (atype) {
+ case DB_LOCK_MINLOCKS:
+ case DB_LOCK_MAXLOCKS:
+ id_array[dd].count += lockerp->nlocks;
+ break;
+ case DB_LOCK_MINWRITE:
+ case DB_LOCK_MAXWRITE:
+ id_array[dd].count += lockerp->nwrites;
+ break;
+ default:
+ break;
+ }
+ } else
+ dd = lockerp->dd_id;
+ id_array[dd].valid = 1;
+
+ /*
+ * If the transaction is pending abortion, then
+ * ignore it on this iteration.
+ */
+ if (lp->status != DB_LSTAT_WAITING)
+ continue;
+
+ entryp = bitmap + (nentries * dd);
+ OR_MAP(entryp, tmpmap, nentries);
+ /*
+ * If this is the first waiter on the queue,
+ * then we remove the waitsfor relationship
+ * with oneself. However, if it's anywhere
+ * else on the queue, then we have to keep
+ * it and we have an automatic deadlock.
+ */
+ if (is_first) {
+ if (ISSET_MAP(entryp, dd))
+ id_array[dd].self_wait = 1;
+ CLR_MAP(entryp, dd);
+ }
+ }
+ LOCK_DD(env, region);
+ np = SH_TAILQ_NEXT(op, dd_links, __db_lockobj);
+ OBJECT_UNLOCK(lt, region, indx);
+ }
+ UNLOCK_DD(env, region);
+
+ /*
+ * Now for each locker, record its last lock and set abort status.
+ * We need to look at the heldby list carefully. We have the LOCKERS
+ * locked so they cannot go away. The lock at the head of the
+ * list can be removed by locking the object it points at.
+ * Since lock memory is not freed if we get a lock we can look
+ * at it safely but SH_LIST_FIRST is not atomic, so we check that
+ * the list has not gone empty during that macro. We check abort
+ * status after building the bit maps so that we will not detect
+ * a blocked transaction without noting that it is already aborting.
+ */
+ for (id = 0; id < count; id++) {
+ if (!id_array[id].valid)
+ continue;
+ if ((ret = __lock_getlocker_int(lt,
+ id_array[id].id, 0, &lockerp)) != 0 || lockerp == NULL)
+ continue;
+
+ /*
+ * If this is a master transaction, try to
+ * find one of its children's locks first,
+ * as they are probably more recent.
+ */
+ child = SH_LIST_FIRST(&lockerp->child_locker, __db_locker);
+ if (child != NULL) {
+ do {
+c_retry: lp = SH_LIST_FIRST(&child->heldby, __db_lock);
+ if (SH_LIST_EMPTY(&child->heldby) || lp == NULL)
+ goto c_next;
+
+ if (F_ISSET(child, DB_LOCKER_INABORT))
+ id_array[id].in_abort = 1;
+ ndx = lp->indx;
+ OBJECT_LOCK_NDX(lt, region, ndx);
+ if (lp != SH_LIST_FIRST(
+ &child->heldby, __db_lock) ||
+ ndx != lp->indx) {
+ OBJECT_UNLOCK(lt, region, ndx);
+ goto c_retry;
+ }
+
+ if (lp != NULL &&
+ lp->status == DB_LSTAT_WAITING) {
+ id_array[id].last_locker_id = child->id;
+ goto get_lock;
+ } else {
+ OBJECT_UNLOCK(lt, region, ndx);
+ }
+c_next: child = SH_LIST_NEXT(
+ child, child_link, __db_locker);
+ } while (child != NULL);
+ }
+
+l_retry: lp = SH_LIST_FIRST(&lockerp->heldby, __db_lock);
+ if (!SH_LIST_EMPTY(&lockerp->heldby) && lp != NULL) {
+ ndx = lp->indx;
+ OBJECT_LOCK_NDX(lt, region, ndx);
+ if (lp != SH_LIST_FIRST(&lockerp->heldby, __db_lock) ||
+ lp->indx != ndx) {
+ OBJECT_UNLOCK(lt, region, ndx);
+ goto l_retry;
+ }
+ id_array[id].last_locker_id = lockerp->id;
+get_lock: id_array[id].last_lock = R_OFFSET(&lt->reginfo, lp);
+ id_array[id].last_obj = lp->obj;
+ lo = SH_OFF_TO_PTR(lp, lp->obj, DB_LOCKOBJ);
+ id_array[id].last_ndx = lo->indx;
+ pptr = SH_DBT_PTR(&lo->lockobj);
+ if (lo->lockobj.size >= sizeof(db_pgno_t))
+ memcpy(&id_array[id].pgno,
+ pptr, sizeof(db_pgno_t));
+ else
+ id_array[id].pgno = 0;
+ OBJECT_UNLOCK(lt, region, ndx);
+ }
+ if (F_ISSET(lockerp, DB_LOCKER_INABORT))
+ id_array[id].in_abort = 1;
+ }
+ UNLOCK_LOCKERS(env, region);
+ LOCK_SYSTEM_UNLOCK(lt, region);
+
+ /*
+ * Now we can release everything except the bitmap matrix that we
+ * created.
+ */
+ *nlockers = id;
+ *idmap = id_array;
+ *bmp = bitmap;
+ *allocp = nentries;
+ __os_free(env, tmpmap);
+done: if (timespecisset(&region->next_timeout))
+ region->next_timeout = min_timeout;
+ return (0);
+}
+
+static int
+__dd_find(env, bmp, idmap, nlockers, nalloc, deadp)
+ ENV *env;
+ u_int32_t *bmp, nlockers, nalloc;
+ locker_info *idmap;
+ u_int32_t ***deadp;
+{
+ u_int32_t i, j, k, *mymap, *tmpmap, **retp;
+ u_int ndead, ndeadalloc;
+ int ret;
+
+#undef INITIAL_DEAD_ALLOC
+#define INITIAL_DEAD_ALLOC 8
+
+ ndeadalloc = INITIAL_DEAD_ALLOC;
+ ndead = 0;
+ if ((ret = __os_malloc(env,
+ ndeadalloc * sizeof(u_int32_t *), &retp)) != 0)
+ return (ret);
+
+ /*
+ * For each locker, OR in the bits from the lockers on which that
+ * locker is waiting.
+ */
+ for (mymap = bmp, i = 0; i < nlockers; i++, mymap += nalloc) {
+ if (!idmap[i].valid)
+ continue;
+ for (j = 0; j < nlockers; j++) {
+ if (!ISSET_MAP(mymap, j))
+ continue;
+
+ /* Find the map for this bit. */
+ tmpmap = bmp + (nalloc * j);
+ OR_MAP(mymap, tmpmap, nalloc);
+ if (!ISSET_MAP(mymap, i))
+ continue;
+
+ /* Make sure we leave room for NULL. */
+ if (ndead + 2 >= ndeadalloc) {
+ ndeadalloc <<= 1;
+ /*
+ * If the alloc fails, then simply return the
+ * deadlocks that we already have.
+ */
+ if (__os_realloc(env,
+ ndeadalloc * sizeof(u_int32_t *),
+ &retp) != 0) {
+ retp[ndead] = NULL;
+ *deadp = retp;
+ return (0);
+ }
+ }
+ retp[ndead++] = mymap;
+
+ /* Mark all participants in this deadlock invalid. */
+ for (k = 0; k < nlockers; k++)
+ if (ISSET_MAP(mymap, k))
+ idmap[k].valid = 0;
+ break;
+ }
+ }
+ retp[ndead] = NULL;
+ *deadp = retp;
+ return (0);
+}
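+
+/*
+ * Illustrative sketch -- not part of this patch and never compiled (the
+ * guard is not defined anywhere).  The closure computation above on a
+ * three-locker cycle, one 32-bit word per row: OR each waited-on row
+ * into the waiter's row; a locker whose own bit shows up in its row is
+ * part of a deadlock.
+ */
+#ifdef LOCK_EXAMPLE_FIND
+static int
+find_example()
+{
+	u_int32_t map[3];
+	u_int32_t i, j;
+
+	map[0] = 1 << 1;	/* Locker 0 waits on locker 1. */
+	map[1] = 1 << 2;	/* Locker 1 waits on locker 2. */
+	map[2] = 1 << 0;	/* Locker 2 waits on locker 0. */
+
+	for (i = 0; i < 3; i++)
+		for (j = 0; j < 3; j++) {
+			if (!(map[i] & (1 << j)))
+				continue;
+			map[i] |= map[j];
+			if (map[i] & (1 << i))
+				return ((int)i);	/* Deadlocked. */
+		}
+	return (-1);		/* No cycle. */
+}
+#endif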
+
+static int
+__dd_abort(env, info, statusp)
+ ENV *env;
+ locker_info *info;
+ int *statusp;
+{
+ struct __db_lock *lockp;
+ DB_LOCKER *lockerp;
+ DB_LOCKOBJ *sh_obj;
+ DB_LOCKREGION *region;
+ DB_LOCKTAB *lt;
+ int ret;
+
+ *statusp = 0;
+
+ lt = env->lk_handle;
+ region = lt->reginfo.primary;
+ ret = 0;
+
+ /* We must lock so this locker cannot go away while we abort it. */
+ LOCK_SYSTEM_LOCK(lt, region);
+ LOCK_LOCKERS(env, region);
+
+ /*
+ * Get the locker. If it's gone or was aborted while we were
+ * detecting, return that.
+ */
+ if ((ret = __lock_getlocker_int(lt,
+ info->last_locker_id, 0, &lockerp)) != 0)
+ goto err;
+ if (lockerp == NULL || F_ISSET(lockerp, DB_LOCKER_INABORT)) {
+ *statusp = DB_ALREADY_ABORTED;
+ goto err;
+ }
+
+ /*
+ * Find the locker's last lock. It is possible for this lock to have
+ * been freed, either though a timeout or another detector run.
+ * First lock the lock object so it is stable.
+ */
+
+ OBJECT_LOCK_NDX(lt, region, info->last_ndx);
+ if ((lockp = SH_LIST_FIRST(&lockerp->heldby, __db_lock)) == NULL) {
+ *statusp = DB_ALREADY_ABORTED;
+ goto done;
+ }
+ if (R_OFFSET(&lt->reginfo, lockp) != info->last_lock ||
+ lockp->holder != R_OFFSET(&lt->reginfo, lockerp) ||
+ F_ISSET(lockerp, DB_LOCKER_INABORT) ||
+ lockp->obj != info->last_obj || lockp->status != DB_LSTAT_WAITING) {
+ *statusp = DB_ALREADY_ABORTED;
+ goto done;
+ }
+
+ sh_obj = SH_OFF_TO_PTR(lockp, lockp->obj, DB_LOCKOBJ);
+
+ STAT_INC_VERB(env, lock, deadlock,
+ region->stat.st_ndeadlocks, lockerp->id, &sh_obj->lockobj);
+ /* Abort lock, take it off list, and wake up this lock. */
+ lockp->status = DB_LSTAT_ABORTED;
+ SH_TAILQ_REMOVE(&sh_obj->waiters, lockp, links, __db_lock);
+
+ /*
+ * Either the waiters list is now empty, in which case we remove
+ * it from dd_objs, or it is not empty, in which case we need to
+ * do promotion.
+ */
+ if (SH_TAILQ_FIRST(&sh_obj->waiters, __db_lock) == NULL) {
+ LOCK_DD(env, region);
+ SH_TAILQ_REMOVE(&region->dd_objs,
+ sh_obj, dd_links, __db_lockobj);
+ UNLOCK_DD(env, region);
+ } else
+ ret = __lock_promote(lt, sh_obj, NULL, 0);
+ MUTEX_UNLOCK(env, lockp->mtx_lock);
+
+done: OBJECT_UNLOCK(lt, region, info->last_ndx);
+err: UNLOCK_LOCKERS(env, region);
+ LOCK_SYSTEM_UNLOCK(lt, region);
+ return (ret);
+}
+
+#ifdef DIAGNOSTIC
+static void
+__dd_debug(env, idmap, bitmap, nlockers, nalloc)
+ ENV *env;
+ locker_info *idmap;
+ u_int32_t *bitmap, nlockers, nalloc;
+{
+ DB_MSGBUF mb;
+ u_int32_t i, j, *mymap;
+
+ DB_MSGBUF_INIT(&mb);
+
+ __db_msg(env, "Waitsfor array\nWaiter:\tWaiting on:");
+ for (mymap = bitmap, i = 0; i < nlockers; i++, mymap += nalloc) {
+ if (!idmap[i].valid)
+ continue;
+
+ __db_msgadd(env, &mb, /* Waiter. */
+ "%lx/%lu:\t", (u_long)idmap[i].id, (u_long)idmap[i].pgno);
+ for (j = 0; j < nlockers; j++)
+ if (ISSET_MAP(mymap, j))
+ __db_msgadd(env,
+ &mb, " %lx", (u_long)idmap[j].id);
+ __db_msgadd(env, &mb, " %lu", (u_long)idmap[i].last_lock);
+ DB_MSGBUF_FLUSH(env, &mb);
+ }
+}
+#endif
+
+/*
+ * Given a bitmap that contains a deadlock, verify that the bit
+ * specified in the which parameter indicates a transaction that
+ * is actually deadlocked. Return 1 if really deadlocked, 0 otherwise.
+ * deadmap -- the array that identified the deadlock.
+ * tmpmap -- a temporary bit map into which we can OR things.
+ * origmap -- a copy of the initial bitmaps from the dd_build phase.
+ * nlockers -- the number of actual lockers under consideration.
+ * nalloc -- the number of words allocated for the bitmap.
+ * which -- the locker in question.
+ */
+static int
+__dd_verify(idmap, deadmap, tmpmap, origmap, nlockers, nalloc, which)
+ locker_info *idmap;
+ u_int32_t *deadmap, *tmpmap, *origmap;
+ u_int32_t nlockers, nalloc, which;
+{
+ u_int32_t *tmap;
+ u_int32_t j;
+ int count;
+
+ memset(tmpmap, 0, sizeof(u_int32_t) * nalloc);
+
+ /*
+ * In order for "which" to be actively involved in
+ * the deadlock, removing him from the evaluation
+ * must remove the deadlock. So, we OR together everyone
+ * except which; if all the participants still have their
+ * bits set, then the deadlock persists and which does
+ * not participate. If the deadlock does not persist
+ * then "which" does participate.
+ */
+ count = 0;
+ for (j = 0; j < nlockers; j++) {
+ if (!ISSET_MAP(deadmap, j) || j == which)
+ continue;
+
+ /* Find the map for this bit. */
+ tmap = origmap + (nalloc * j);
+
+ /*
+ * We special case the first waiter who is also a holder, so
+ * we don't automatically call that a deadlock. However, if
+ * it really is a deadlock, we need the bit set now so that
+ * we treat the first waiter like other waiters.
+ */
+ if (idmap[j].self_wait)
+ SET_MAP(tmap, j);
+ OR_MAP(tmpmap, tmap, nalloc);
+ count++;
+ }
+
+ if (count == 1)
+ return (1);
+
+ /*
+ * Now check the resulting map and see whether
+ * all participants still have their bit set.
+ */
+ for (j = 0; j < nlockers; j++) {
+ if (!ISSET_MAP(deadmap, j) || j == which)
+ continue;
+ if (!ISSET_MAP(tmpmap, j))
+ return (1);
+ }
+ return (0);
+}
+
+/*
+ * __dd_isolder --
+ *
+ * Figure out the relative age of two lockers. We make all lockers
+ * older than all transactions, because that's how it's worked
+ * historically (because lockers are lower ids).
+ */
+static int
+__dd_isolder(a, b, lock_max, txn_max)
+ u_int32_t a, b;
+ u_int32_t lock_max, txn_max;
+{
+ u_int32_t max;
+
+ /* Check for comparing lock-id and txnid. */
+ if (a <= DB_LOCK_MAXID && b > DB_LOCK_MAXID)
+ return (1);
+ if (b <= DB_LOCK_MAXID && a > DB_LOCK_MAXID)
+ return (0);
+
+ /* In the same space; figure out which one. */
+ max = txn_max;
+ if (a <= DB_LOCK_MAXID)
+ max = lock_max;
+
+ /*
+ * We can't get a 100% correct ordering, because we don't know
+ * where the current interval started and if there were older
+ * lockers outside the interval. We do the best we can.
+ */
+
+ /*
+ * Check for a wrapped case with ids above max.
+ */
+ if (a > max && b < max)
+ return (1);
+ if (b > max && a < max)
+ return (0);
+
+ return (a < b);
+}
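+
+/*
+ * Worked examples for __dd_isolder (illustration only), assuming lock
+ * ids are allocated at or below DB_LOCK_MAXID and txn ids above it:
+ *	a lock id vs. a txn id:		the lock id is always older;
+ *	a = 5,  b = 9,  max = 100:	a is older (plain comparison);
+ *	a = 99, b = 3,  max = 90:	a is older -- a was allocated
+ *	before the id space wrapped, b after.
+ */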
diff --git a/src/lock/lock_failchk.c b/src/lock/lock_failchk.c
new file mode 100644
index 00000000..59fb010f
--- /dev/null
+++ b/src/lock/lock_failchk.c
@@ -0,0 +1,114 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/lock.h"
+#include "dbinc/txn.h"
+
+/*
+ * __lock_failchk --
+ * Check for locks held by dead threads of control and release
+ * read locks. If any write locks were held by dead non-transactional
+ * lockers then we must abort and run recovery. Otherwise we release
+ * read locks for lockers owned by dead threads. Write locks for
+ * dead transactional lockers will be freed when we abort the transaction.
+ *
+ * PUBLIC: int __lock_failchk __P((ENV *));
+ */
+int
+__lock_failchk(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+ DB_LOCKER *lip;
+ DB_LOCKREGION *lrp;
+ DB_LOCKREQ request;
+ DB_LOCKTAB *lt;
+ u_int32_t i;
+ int ret;
+ char buf[DB_THREADID_STRLEN];
+
+ dbenv = env->dbenv;
+ lt = env->lk_handle;
+ lrp = lt->reginfo.primary;
+
+retry: LOCK_LOCKERS(env, lrp);
+
+ ret = 0;
+ for (i = 0; i < lrp->locker_t_size; i++)
+ SH_TAILQ_FOREACH(lip, &lt->locker_tab[i], links, __db_locker) {
+ /*
+ * If the locker is transactional, we can ignore it if
+ * it has no read locks or has no locks at all. Check
+			 * the heldby list rather than nlocks since a lock may
+			 * be PENDING.  __txn_failchk aborts any transactional
+			 * lockers.  Non-transactional lockers proceed to the
+			 * is_alive test.
+ */
+ if ((lip->id >= TXN_MINIMUM) &&
+ (SH_LIST_EMPTY(&lip->heldby) ||
+ lip->nlocks == lip->nwrites))
+ continue;
+
+ /* If the locker is still alive, it's not a problem. */
+ if (dbenv->is_alive(dbenv, lip->pid, lip->tid,
+ F_ISSET(lip, DB_LOCKER_HANDLE_LOCKER) ?
+ DB_MUTEX_PROCESS_ONLY : 0))
+ continue;
+
+ /*
+ * We can only deal with read locks. If a
+ * non-transactional locker holds write locks we
+ * have to assume a Berkeley DB operation was
+ * interrupted with only 1-of-N pages modified.
+ */
+ if (lip->id < TXN_MINIMUM && lip->nwrites != 0) {
+ ret = __db_failed(env, DB_STR_A("2052",
+ "locker has write locks", ""),
+ lip->pid, lip->tid);
+ break;
+ }
+
+ /*
+ * Discard the locker and its read locks.
+ */
+ if (!SH_LIST_EMPTY(&lip->heldby)) {
+ __db_msg(env, DB_STR_A("2053",
+ "Freeing read locks for locker %#lx: %s",
+ "%#lx %s"), (u_long)lip->id,
+ dbenv->thread_id_string(
+ dbenv, lip->pid, lip->tid, buf));
+ UNLOCK_LOCKERS(env, lrp);
+ memset(&request, 0, sizeof(request));
+ request.op = DB_LOCK_PUT_READ;
+ if ((ret = __lock_vec(env,
+ lip, 0, &request, 1, NULL)) != 0)
+ return (ret);
+			} else
+				UNLOCK_LOCKERS(env, lrp);
+
+ /*
+ * This locker is most likely referenced by a cursor
+ * which is owned by a dead thread. Normally the
+ * cursor would be available for other threads
+ * but we assume the dead thread will never release
+ * it.
+ */
+ if (lip->id < TXN_MINIMUM &&
+ (ret = __lock_freelocker(lt, lip)) != 0)
+ return (ret);
+ goto retry;
+ }
+
+ UNLOCK_LOCKERS(env, lrp);
+
+ return (ret);
+}
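+
+/*
+ * Illustrative sketch -- not part of this patch and never compiled (the
+ * guard is not defined anywhere).  __lock_failchk runs under
+ * DB_ENV->failchk, which requires an is_alive callback; the callback
+ * below is a stand-in that treats process existence as liveness.
+ */
+#ifdef LOCK_EXAMPLE_FAILCHK
+#include <signal.h>
+#include <db.h>
+
+static int
+example_is_alive(dbenv, pid, tid, flags)
+	DB_ENV *dbenv;
+	pid_t pid;
+	db_threadid_t tid;
+	u_int32_t flags;
+{
+	(void)dbenv;
+	(void)tid;
+	(void)flags;
+	/* Signal 0 probes for existence without delivering anything. */
+	return (kill(pid, 0) == 0);
+}
+
+static int
+failchk_example(dbenv)
+	DB_ENV *dbenv;
+{
+	int ret;
+
+	if ((ret = dbenv->set_isalive(dbenv, example_is_alive)) != 0)
+		return (ret);
+	return (dbenv->failchk(dbenv, 0));
+}
+#endif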
diff --git a/src/lock/lock_id.c b/src/lock/lock_id.c
new file mode 100644
index 00000000..24b545d1
--- /dev/null
+++ b/src/lock/lock_id.c
@@ -0,0 +1,572 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+
+static int __lock_freelocker_int
+ __P((DB_LOCKTAB *, DB_LOCKREGION *, DB_LOCKER *, int));
+
+/*
+ * __lock_id_pp --
+ * ENV->lock_id pre/post processing.
+ *
+ * PUBLIC: int __lock_id_pp __P((DB_ENV *, u_int32_t *));
+ */
+int
+__lock_id_pp(dbenv, idp)
+ DB_ENV *dbenv;
+ u_int32_t *idp;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->lk_handle, "DB_ENV->lock_id", DB_INIT_LOCK);
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__lock_id(env, idp, NULL)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __lock_id --
+ * ENV->lock_id.
+ *
+ * PUBLIC: int __lock_id __P((ENV *, u_int32_t *, DB_LOCKER **));
+ */
+int
+__lock_id(env, idp, lkp)
+ ENV *env;
+ u_int32_t *idp;
+ DB_LOCKER **lkp;
+{
+ DB_LOCKER *lk;
+ DB_LOCKREGION *region;
+ DB_LOCKTAB *lt;
+ u_int32_t id, *ids;
+ int nids, ret;
+
+ lk = NULL;
+ lt = env->lk_handle;
+ region = lt->reginfo.primary;
+ id = DB_LOCK_INVALIDID;
+ ret = 0;
+
+ LOCK_LOCKERS(env, region);
+
+ /*
+ * Allocate a new lock id. If we wrap around then we find the minimum
+ * currently in use and make sure we can stay below that. This code is
+ * similar to code in __txn_begin_int for recovering txn ids.
+ *
+ * Our current valid range can span the maximum valid value, so check
+ * for it and wrap manually.
+ */
+ if (region->lock_id == DB_LOCK_MAXID &&
+ region->cur_maxid != DB_LOCK_MAXID)
+ region->lock_id = DB_LOCK_INVALIDID;
+ if (region->lock_id == region->cur_maxid) {
+ if ((ret = __os_malloc(env,
+ sizeof(u_int32_t) * region->nlockers, &ids)) != 0)
+ goto err;
+ nids = 0;
+ SH_TAILQ_FOREACH(lk, &region->lockers, ulinks, __db_locker)
+ ids[nids++] = lk->id;
+ region->lock_id = DB_LOCK_INVALIDID;
+ region->cur_maxid = DB_LOCK_MAXID;
+ if (nids != 0)
+ __db_idspace(ids, nids,
+ &region->lock_id, &region->cur_maxid);
+ __os_free(env, ids);
+ }
+ id = ++region->lock_id;
+
+ /* Allocate a locker for this id. */
+ ret = __lock_getlocker_int(lt, id, 1, &lk);
+
+err: UNLOCK_LOCKERS(env, region);
+
+ if (idp != NULL)
+ *idp = id;
+ if (lkp != NULL)
+ *lkp = lk;
+
+ return (ret);
+}
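+
+/*
+ * Illustrative sketch -- not part of this patch and never compiled (the
+ * guard is not defined anywhere).  Applications reach this through
+ * DB_ENV->lock_id and give the id back with DB_ENV->lock_id_free once
+ * all locks held under it have been released.
+ */
+#ifdef LOCK_EXAMPLE_ID
+#include <db.h>
+
+static int
+locker_id_example(dbenv)
+	DB_ENV *dbenv;
+{
+	u_int32_t id;
+	int ret;
+
+	if ((ret = dbenv->lock_id(dbenv, &id)) != 0)
+		return (ret);
+	/* ... acquire and release locks under "id" ... */
+	return (dbenv->lock_id_free(dbenv, id));
+}
+#endif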
+
+/*
+ * __lock_set_thread_id --
+ * Set the thread_id in an existing locker.
+ * PUBLIC: void __lock_set_thread_id __P((void *, pid_t, db_threadid_t));
+ */
+void
+__lock_set_thread_id(lref_arg, pid, tid)
+ void *lref_arg;
+ pid_t pid;
+ db_threadid_t tid;
+{
+ DB_LOCKER *lref;
+
+ lref = lref_arg;
+ lref->pid = pid;
+ lref->tid = tid;
+}
+
+/*
+ * __lock_id_free_pp --
+ * ENV->lock_id_free pre/post processing.
+ *
+ * PUBLIC: int __lock_id_free_pp __P((DB_ENV *, u_int32_t));
+ */
+int
+__lock_id_free_pp(dbenv, id)
+ DB_ENV *dbenv;
+ u_int32_t id;
+{
+ DB_LOCKER *sh_locker;
+ DB_LOCKREGION *region;
+ DB_LOCKTAB *lt;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ret, t_ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->lk_handle, "DB_ENV->lock_id_free", DB_INIT_LOCK);
+
+ ENV_ENTER(env, ip);
+
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check && (ret = __env_rep_enter(env, 0)) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ lt = env->lk_handle;
+ region = lt->reginfo.primary;
+
+ LOCK_LOCKERS(env, region);
+ if ((ret =
+ __lock_getlocker_int(env->lk_handle, id, 0, &sh_locker)) == 0) {
+ if (sh_locker != NULL)
+ ret = __lock_freelocker_int(lt, region, sh_locker, 1);
+ else {
+ __db_errx(env, DB_STR_A("2045",
+ "Unknown locker id: %lx", "%lx"), (u_long)id);
+ ret = EINVAL;
+ }
+ }
+ UNLOCK_LOCKERS(env, region);
+
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+err: ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __lock_id_free --
+ * Free a locker id.
+ *
+ * PUBLIC: int __lock_id_free __P((ENV *, DB_LOCKER *));
+ */
+int
+__lock_id_free(env, sh_locker)
+ ENV *env;
+ DB_LOCKER *sh_locker;
+{
+ DB_LOCKREGION *region;
+ DB_LOCKTAB *lt;
+ int ret;
+
+ lt = env->lk_handle;
+ region = lt->reginfo.primary;
+ ret = 0;
+
+ if (sh_locker->nlocks != 0) {
+ __db_errx(env, DB_STR("2046",
+ "Locker still has locks"));
+ ret = EINVAL;
+ goto err;
+ }
+
+ LOCK_LOCKERS(env, region);
+ ret = __lock_freelocker_int(lt, region, sh_locker, 1);
+ UNLOCK_LOCKERS(env, region);
+
+err: return (ret);
+}
+
+/*
+ * __lock_id_set --
+ * Set the current locker ID and current maximum unused ID (for
+ * testing purposes only).
+ *
+ * PUBLIC: int __lock_id_set __P((ENV *, u_int32_t, u_int32_t));
+ */
+int
+__lock_id_set(env, cur_id, max_id)
+ ENV *env;
+ u_int32_t cur_id, max_id;
+{
+ DB_LOCKREGION *region;
+ DB_LOCKTAB *lt;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->lk_handle, "lock_id_set", DB_INIT_LOCK);
+
+ lt = env->lk_handle;
+ region = lt->reginfo.primary;
+ region->lock_id = cur_id;
+ region->cur_maxid = max_id;
+
+ return (0);
+}
+
+/*
+ * __lock_getlocker --
+ * Get a locker in the locker hash table. The create parameter
+ * indicates if the locker should be created if it doesn't exist in
+ * the table.
+ *
+ * This must be called with the locker mutex held if create == 1.
+ *
+ * PUBLIC: int __lock_getlocker __P((DB_LOCKTAB *,
+ * PUBLIC: u_int32_t, int, DB_LOCKER **));
+ * PUBLIC: int __lock_getlocker_int __P((DB_LOCKTAB *,
+ * PUBLIC: u_int32_t, int, DB_LOCKER **));
+ */
+int
+__lock_getlocker(lt, locker, create, retp)
+ DB_LOCKTAB *lt;
+ u_int32_t locker;
+ int create;
+ DB_LOCKER **retp;
+{
+ DB_LOCKREGION *region;
+ ENV *env;
+ int ret;
+
+ COMPQUIET(region, NULL);
+ env = lt->env;
+ region = lt->reginfo.primary;
+
+ LOCK_LOCKERS(env, region);
+ ret = __lock_getlocker_int(lt, locker, create, retp);
+ UNLOCK_LOCKERS(env, region);
+
+ return (ret);
+}
+
+int
+__lock_getlocker_int(lt, locker, create, retp)
+ DB_LOCKTAB *lt;
+ u_int32_t locker;
+ int create;
+ DB_LOCKER **retp;
+{
+ DB_LOCKER *sh_locker;
+ DB_LOCKREGION *region;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ db_mutex_t mutex;
+ u_int32_t i, indx, nlockers;
+ int ret;
+
+ env = lt->env;
+ region = lt->reginfo.primary;
+
+ LOCKER_HASH(lt, region, locker, indx);
+
+ /*
+ * If we find the locker, then we can just return it. If we don't find
+ * the locker, then we need to create it.
+ */
+ SH_TAILQ_FOREACH(sh_locker, &lt->locker_tab[indx], links, __db_locker)
+ if (sh_locker->id == locker)
+ break;
+ if (sh_locker == NULL && create) {
+ nlockers = 0;
+ /* Create new locker and then insert it into hash table. */
+ if ((ret = __mutex_alloc(env, MTX_LOGICAL_LOCK,
+ DB_MUTEX_LOGICAL_LOCK | DB_MUTEX_SELF_BLOCK,
+ &mutex)) != 0)
+ return (ret);
+ else
+ MUTEX_LOCK(env, mutex);
+ if ((sh_locker = SH_TAILQ_FIRST(
+ &region->free_lockers, __db_locker)) == NULL) {
+ nlockers = region->stat.st_lockers >> 2;
+ /* Just in case. */
+ if (nlockers == 0)
+ nlockers = 1;
+ if (region->stat.st_maxlockers != 0 &&
+ region->stat.st_maxlockers <
+ region->stat.st_lockers + nlockers)
+ nlockers = region->stat.st_maxlockers -
+ region->stat.st_lockers;
+ /*
+ * Don't hold the lockers mutex while allocating from the
+ * region; we could deadlock. There is no race when creating
+ * a locker, since the id allocation is synchronized.
+ */
+ UNLOCK_LOCKERS(env, region);
+ LOCK_REGION_LOCK(env);
+ /*
+ * If the max memory is not sized for max objects,
+ * allocate as much as possible.
+ */
+ F_SET(&lt->reginfo, REGION_TRACKED);
+ while (__env_alloc(&lt->reginfo, nlockers *
+ sizeof(struct __db_locker), &sh_locker) != 0)
+ if ((nlockers >>= 1) == 0)
+ break;
+ F_CLR(&lt->reginfo, REGION_TRACKED);
+ LOCK_REGION_UNLOCK(lt->env);
+ LOCK_LOCKERS(env, region);
+ for (i = 0; i < nlockers; i++) {
+ SH_TAILQ_INSERT_HEAD(&region->free_lockers,
+ sh_locker, links, __db_locker);
+ sh_locker++;
+ }
+ if (nlockers == 0)
+ return (__lock_nomem(env, "locker entries"));
+ region->stat.st_lockers += nlockers;
+ sh_locker = SH_TAILQ_FIRST(
+ &region->free_lockers, __db_locker);
+ }
+ SH_TAILQ_REMOVE(
+ &region->free_lockers, sh_locker, links, __db_locker);
+ ++region->nlockers;
+#ifdef HAVE_STATISTICS
+ STAT_PERFMON2(env, lock, nlockers, region->nlockers, locker);
+ if (region->nlockers > region->stat.st_maxnlockers)
+ STAT_SET(env, lock, maxnlockers,
+ region->stat.st_maxnlockers,
+ region->nlockers, locker);
+#endif
+ sh_locker->id = locker;
+ env->dbenv->thread_id(
+ env->dbenv, &sh_locker->pid, &sh_locker->tid);
+ sh_locker->mtx_locker = mutex;
+ sh_locker->dd_id = 0;
+ sh_locker->master_locker = INVALID_ROFF;
+ sh_locker->parent_locker = INVALID_ROFF;
+ SH_LIST_INIT(&sh_locker->child_locker);
+ sh_locker->flags = 0;
+ SH_LIST_INIT(&sh_locker->heldby);
+ sh_locker->nlocks = 0;
+ sh_locker->nwrites = 0;
+ sh_locker->priority = DB_LOCK_DEFPRIORITY;
+ sh_locker->lk_timeout = 0;
+ timespecclear(&sh_locker->tx_expire);
+ timespecclear(&sh_locker->lk_expire);
+
+ SH_TAILQ_INSERT_HEAD(
+ &lt->locker_tab[indx], sh_locker, links, __db_locker);
+ SH_TAILQ_INSERT_HEAD(&region->lockers,
+ sh_locker, ulinks, __db_locker);
+ ENV_GET_THREAD_INFO(env, ip);
+#ifdef DIAGNOSTIC
+ if (ip != NULL)
+ ip->dbth_locker = R_OFFSET(&lt->reginfo, sh_locker);
+#endif
+ }
+
+ *retp = sh_locker;
+ return (0);
+}
+
+/*
+ * __lock_addfamilylocker
+ * Put a locker entry in for a child transaction.
+ *
+ * PUBLIC: int __lock_addfamilylocker __P((ENV *,
+ * PUBLIC: u_int32_t, u_int32_t, u_int32_t));
+ */
+int
+__lock_addfamilylocker(env, pid, id, is_family)
+ ENV *env;
+ u_int32_t pid, id, is_family;
+{
+ DB_LOCKER *lockerp, *mlockerp;
+ DB_LOCKREGION *region;
+ DB_LOCKTAB *lt;
+ int ret;
+
+ COMPQUIET(region, NULL);
+ lt = env->lk_handle;
+ region = lt->reginfo.primary;
+ LOCK_LOCKERS(env, region);
+
+ /* get/create the parent locker info */
+ if ((ret = __lock_getlocker_int(lt, pid, 1, &mlockerp)) != 0)
+ goto err;
+
+ /*
+ * We assume that only one thread can manipulate
+ * a single transaction family.
+ * Therefore the master locker cannot go away while
+ * we manipulate it, nor can another child in the
+ * family be created at the same time.
+ */
+ if ((ret = __lock_getlocker_int(lt, id, 1, &lockerp)) != 0)
+ goto err;
+
+ /* Point to our parent. */
+ lockerp->parent_locker = R_OFFSET(&lt->reginfo, mlockerp);
+
+ /* See if this locker is the family master. */
+ if (mlockerp->master_locker == INVALID_ROFF)
+ lockerp->master_locker = R_OFFSET(&lt->reginfo, mlockerp);
+ else {
+ lockerp->master_locker = mlockerp->master_locker;
+ mlockerp = R_ADDR(&lt->reginfo, mlockerp->master_locker);
+ }
+
+ /*
+ * Set the family locker flag, so it is possible to distinguish
+ * between locks held by subtransactions and those with compatible
+ * lockers.
+ */
+ if (is_family)
+ F_SET(mlockerp, DB_LOCKER_FAMILY_LOCKER);
+
+ /*
+ * Link the child at the head of the master's list.
+ * The guess is when looking for deadlock that
+ * the most recent child is the one that's blocked.
+ */
+ SH_LIST_INSERT_HEAD(
+ &mlockerp->child_locker, lockerp, child_link, __db_locker);
+
+err: UNLOCK_LOCKERS(env, region);
+
+ return (ret);
+}
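+
+/*
+ * __lock_family_example --
+ * Illustrative sketch only, not part of this change: the family linkage
+ * built above is what makes a nested transaction's locks compatible with
+ * its parent's. An application reaches this path through the public
+ * transaction API; assumes DB_INIT_TXN and DB_INIT_LOCK.
+ */
+static int
+__lock_family_example(dbenv)
+ DB_ENV *dbenv;
+{
+ DB_TXN *parent, *child;
+ int ret;
+
+ if ((ret = dbenv->txn_begin(dbenv, NULL, &parent, 0)) != 0)
+ return (ret);
+ /* The child's locker is linked into the parent's family. */
+ if ((ret = dbenv->txn_begin(dbenv, parent, &child, 0)) != 0) {
+ (void)parent->abort(parent);
+ return (ret);
+ }
+ if ((ret = child->commit(child, 0)) != 0) {
+ (void)parent->abort(parent);
+ return (ret);
+ }
+ return (parent->commit(parent, 0));
+}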
+
+/*
+ * __lock_freelocker_int
+ * Common code for deleting a locker; must be called with the
+ * locker bucket locked.
+ */
+static int
+__lock_freelocker_int(lt, region, sh_locker, reallyfree)
+ DB_LOCKTAB *lt;
+ DB_LOCKREGION *region;
+ DB_LOCKER *sh_locker;
+ int reallyfree;
+{
+ ENV *env;
+ u_int32_t indx;
+ int ret;
+
+ env = lt->env;
+
+ if (SH_LIST_FIRST(&sh_locker->heldby, __db_lock) != NULL) {
+ __db_errx(env, DB_STR("2047",
+ "Freeing locker with locks"));
+ return (EINVAL);
+ }
+
+ /* If this is part of a family, we must fix up its links. */
+ if (sh_locker->master_locker != INVALID_ROFF) {
+ SH_LIST_REMOVE(sh_locker, child_link, __db_locker);
+ sh_locker->master_locker = INVALID_ROFF;
+ }
+
+ if (reallyfree) {
+ LOCKER_HASH(lt, region, sh_locker->id, indx);
+ SH_TAILQ_REMOVE(&lt->locker_tab[indx], sh_locker,
+ links, __db_locker);
+ if (sh_locker->mtx_locker != MUTEX_INVALID &&
+ (ret = __mutex_free(env, &sh_locker->mtx_locker)) != 0)
+ return (ret);
+ SH_TAILQ_INSERT_HEAD(&region->free_lockers, sh_locker,
+ links, __db_locker);
+ SH_TAILQ_REMOVE(&region->lockers, sh_locker,
+ ulinks, __db_locker);
+ region->nlockers--;
+ STAT_PERFMON2(env,
+ lock, nlockers, region->nlockers, sh_locker->id);
+ }
+
+ return (0);
+}
+
+/*
+ * __lock_freelocker
+ * Remove a locker and its family links from the hash table.
+ *
+ * This must be called without the locker bucket locked.
+ *
+ * PUBLIC: int __lock_freelocker __P((DB_LOCKTAB *, DB_LOCKER *));
+ */
+int
+__lock_freelocker(lt, sh_locker)
+ DB_LOCKTAB *lt;
+ DB_LOCKER *sh_locker;
+{
+ DB_LOCKREGION *region;
+ ENV *env;
+ int ret;
+
+ region = lt->reginfo.primary;
+ env = lt->env;
+
+ if (sh_locker == NULL)
+ return (0);
+
+ LOCK_LOCKERS(env, region);
+ ret = __lock_freelocker_int(lt, region, sh_locker, 1);
+ UNLOCK_LOCKERS(env, region);
+
+ return (ret);
+}
+
+/*
+ * __lock_familyremove
+ * Remove a locker from its family.
+ *
+ * This must be called without the locker bucket locked.
+ *
+ * PUBLIC: int __lock_familyremove __P((DB_LOCKTAB *, DB_LOCKER *));
+ */
+int
+__lock_familyremove(lt, sh_locker)
+ DB_LOCKTAB *lt;
+ DB_LOCKER *sh_locker;
+{
+ DB_LOCKREGION *region;
+ ENV *env;
+ int ret;
+
+ region = lt->reginfo.primary;
+ env = lt->env;
+
+ LOCK_LOCKERS(env, region);
+ ret = __lock_freelocker_int(lt, region, sh_locker, 0);
+ UNLOCK_LOCKERS(env, region);
+
+ return (ret);
+}
diff --git a/src/lock/lock_list.c b/src/lock/lock_list.c
new file mode 100644
index 00000000..1e3d2a55
--- /dev/null
+++ b/src/lock/lock_list.c
@@ -0,0 +1,365 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+
+static int __lock_sort_cmp __P((const void *, const void *));
+
+/*
+ * Lock list routines.
+ * The list is composed of a 32-bit count of locks followed by
+ * each lock. A lock is represented by a 16-bit page-count, a lock
+ * object and a page list. A lock object consists of a 16-bit size
+ * and the object itself. In a pseudo BNF notation, you get:
+ *
+ * LIST = COUNT32 LOCK*
+ * LOCK = COUNT16 LOCKOBJ PAGELIST
+ * LOCKOBJ = COUNT16 OBJ
+ * PAGELIST = COUNT32*
+ *
+ * (Recall that X* means "0 or more X's")
+ *
+ * In most cases, the OBJ is a struct __db_ilock and the page list is
+ * a series of (32-bit) page numbers that should get written into the
+ * pgno field of the __db_ilock. So, the actual number of pages locked
+ * is the number of items in the PAGELIST plus 1. If this is an application-
+ * specific lock, then we cannot interpret obj and the pagelist must
+ * be empty.
+ *
+ * Consider a lock list for: File A, pages 1&2, File B pages 3-5, Applock
+ * This would be represented as:
+ * 3 1 [fid=A;page=1] 2 2 [fid=B;page=3] 4 5 0 APPLOCK
+ * ------------------ -------------------- ---------
+ * LOCK for file A LOCK for file B application-specific lock
+ */
+
+#define MAX_PGNOS 0xffff
+
+/*
+ * These macros are bigger than one might expect because some compilers say a
+ * cast does not return an lvalue, so constructs like *(u_int32_t*)dp = count;
+ * generate warnings.
+ */
+#define RET_SIZE(size, count) ((size) + \
+ sizeof(u_int32_t) + (count) * 2 * sizeof(u_int16_t))
+
+#define PUT_COUNT(dp, count) do { u_int32_t __c = (count); \
+ LOGCOPY_32(env, dp, &__c); \
+ dp = (u_int8_t *)dp + \
+ sizeof(u_int32_t); \
+ } while (0)
+#define PUT_PCOUNT(dp, count) do { u_int16_t __c = (count); \
+ LOGCOPY_16(env, dp, &__c); \
+ dp = (u_int8_t *)dp + \
+ sizeof(u_int16_t); \
+ } while (0)
+#define PUT_SIZE(dp, size) do { u_int16_t __s = (size); \
+ LOGCOPY_16(env, dp, &__s); \
+ dp = (u_int8_t *)dp + \
+ sizeof(u_int16_t); \
+ } while (0)
+#define PUT_PGNO(dp, pgno) do { db_pgno_t __pg = (pgno); \
+ LOGCOPY_32(env, dp, &__pg); \
+ dp = (u_int8_t *)dp + \
+ sizeof(db_pgno_t); \
+ } while (0)
+#define COPY_OBJ(dp, obj) do { \
+ memcpy(dp, \
+ (obj)->data, (obj)->size); \
+ dp = (u_int8_t *)dp + \
+ DB_ALIGN((obj)->size, \
+ sizeof(u_int32_t)); \
+ } while (0)
+#define GET_COUNT(dp, count) do { LOGCOPY_32(env, &count, dp); \
+ dp = (u_int8_t *)dp + \
+ sizeof(u_int32_t); \
+ } while (0)
+#define GET_PCOUNT(dp, count) do { LOGCOPY_16(env, &count, dp); \
+ dp = (u_int8_t *)dp + \
+ sizeof(u_int16_t); \
+ } while (0)
+#define GET_SIZE(dp, size) do { LOGCOPY_16(env, &size, dp); \
+ dp = (u_int8_t *)dp + \
+ sizeof(u_int16_t); \
+ } while (0)
+#define GET_PGNO(dp, pgno) do { LOGCOPY_32(env, &pgno, dp); \
+ dp = (u_int8_t *)dp + \
+ sizeof(db_pgno_t); \
+ } while (0)
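+
+/*
+ * __lock_list_npages --
+ * Illustrative sketch only, not part of this change: walk a serialized
+ * lock list with the GET_* macros above and count the pages it covers,
+ * one per PAGELIST entry plus one for the page number embedded in each
+ * lock object (see the layout comment above). Application-specific
+ * locks are counted as a single entry.
+ */
+static u_int32_t
+__lock_list_npages(env, list)
+ ENV *env;
+ DBT *list;
+{
+ u_int16_t npgno, size;
+ u_int32_t i, nlocks, total;
+ void *dp;
+
+ if (list->size == 0)
+ return (0);
+ dp = list->data;
+
+ GET_COUNT(dp, nlocks);
+ for (total = 0, i = 0; i < nlocks; i++) {
+ GET_PCOUNT(dp, npgno);
+ GET_SIZE(dp, size);
+ total += (u_int32_t)npgno + 1;
+ /* Skip the lock object and its page list. */
+ dp = (u_int8_t *)dp + DB_ALIGN(size, sizeof(u_int32_t)) +
+ npgno * sizeof(db_pgno_t);
+ }
+ return (total);
+}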
+
+/*
+ * __lock_fix_list --
+ *
+ * PUBLIC: int __lock_fix_list __P((ENV *, DBT *, u_int32_t));
+ */
+int
+__lock_fix_list(env, list_dbt, nlocks)
+ ENV *env;
+ DBT *list_dbt;
+ u_int32_t nlocks;
+{
+ DBT *obj;
+ DB_LOCK_ILOCK *lock, *plock;
+ u_int32_t i, j, nfid, npgno, size;
+ u_int8_t *data, *dp;
+ int ret;
+
+ if ((size = list_dbt->size) == 0)
+ return (0);
+
+ obj = (DBT *)list_dbt->data;
+
+ /*
+ * If necessary sort the list of locks so that locks on the same fileid
+ * are together. We do not sort 1 or 2 locks because by definition if
+ * there are locks on the same fileid they will be together. The sort
+ * will also move any locks that do not look like page locks to the end
+ * of the list so we can stop looking for locks we can combine when we
+ * hit one.
+ */
+ switch (nlocks) {
+ case 1:
+ size = RET_SIZE(obj->size, 1);
+ if ((ret = __os_malloc(env, size, &data)) != 0)
+ return (ret);
+
+ dp = data;
+ PUT_COUNT(dp, 1);
+ PUT_PCOUNT(dp, 0);
+ PUT_SIZE(dp, obj->size);
+ COPY_OBJ(dp, obj);
+ break;
+ default:
+ /* Sort so that all locks with same fileid are together. */
+ qsort(list_dbt->data, nlocks, sizeof(DBT), __lock_sort_cmp);
+ /* FALLTHROUGH */
+ case 2:
+ nfid = npgno = 0;
+ i = 0;
+ if (obj->size != sizeof(DB_LOCK_ILOCK))
+ goto not_ilock;
+
+ nfid = 1;
+ plock = (DB_LOCK_ILOCK *)obj->data;
+
+ /* We use ulen to keep track of the number of pages. */
+ j = 0;
+ obj[0].ulen = 0;
+ for (i = 1; i < nlocks; i++) {
+ if (obj[i].size != sizeof(DB_LOCK_ILOCK))
+ break;
+ lock = (DB_LOCK_ILOCK *)obj[i].data;
+ if (obj[j].ulen < MAX_PGNOS &&
+ lock->type == plock->type &&
+ memcmp(lock->fileid,
+ plock->fileid, DB_FILE_ID_LEN) == 0) {
+ obj[j].ulen++;
+ npgno++;
+ } else {
+ nfid++;
+ plock = lock;
+ j = i;
+ obj[j].ulen = 0;
+ }
+ }
+
+not_ilock: size = nfid * sizeof(DB_LOCK_ILOCK);
+ size += npgno * sizeof(db_pgno_t);
+ /* Add the number of nonstandard locks and get their size. */
+ nfid += nlocks - i;
+ for (; i < nlocks; i++) {
+ size += obj[i].size;
+ obj[i].ulen = 0;
+ }
+
+ size = RET_SIZE(size, nfid);
+ if ((ret = __os_malloc(env, size, &data)) != 0)
+ return (ret);
+
+ dp = data;
+ PUT_COUNT(dp, nfid);
+
+ for (i = 0; i < nlocks; i = j) {
+ PUT_PCOUNT(dp, obj[i].ulen);
+ PUT_SIZE(dp, obj[i].size);
+ COPY_OBJ(dp, &obj[i]);
+ lock = (DB_LOCK_ILOCK *)obj[i].data;
+ for (j = i + 1; j <= i + obj[i].ulen; j++) {
+ lock = (DB_LOCK_ILOCK *)obj[j].data;
+ PUT_PGNO(dp, lock->pgno);
+ }
+ }
+ }
+
+ __os_free(env, list_dbt->data);
+
+ list_dbt->data = data;
+ list_dbt->size = size;
+
+ return (0);
+}
+
+/*
+ * PUBLIC: int __lock_get_list __P((ENV *, DB_LOCKER *, u_int32_t,
+ * PUBLIC: db_lockmode_t, DBT *));
+ */
+int
+__lock_get_list(env, locker, flags, lock_mode, list)
+ ENV *env;
+ DB_LOCKER *locker;
+ u_int32_t flags;
+ db_lockmode_t lock_mode;
+ DBT *list;
+{
+ DBT obj_dbt;
+ DB_LOCK ret_lock;
+ DB_LOCKREGION *region;
+ DB_LOCKTAB *lt;
+ DB_LOCK_ILOCK *lock;
+ db_pgno_t save_pgno;
+ u_int16_t npgno, size;
+ u_int32_t i, nlocks;
+ int ret;
+ void *data, *dp;
+
+ if (list->size == 0)
+ return (0);
+ ret = 0;
+ data = NULL;
+
+ lt = env->lk_handle;
+ dp = list->data;
+
+ /*
+ * There is no assurance log records will be aligned. If not, then
+ * copy the data to an aligned region so the rest of the code does
+ * not have to worry about it.
+ */
+ if ((uintptr_t)dp != DB_ALIGN((uintptr_t)dp, sizeof(u_int32_t))) {
+ if ((ret = __os_malloc(env, list->size, &data)) != 0)
+ return (ret);
+ memcpy(data, list->data, list->size);
+ dp = data;
+ }
+
+ region = lt->reginfo.primary;
+ LOCK_SYSTEM_LOCK(lt, region);
+ GET_COUNT(dp, nlocks);
+
+ for (i = 0; i < nlocks; i++) {
+ GET_PCOUNT(dp, npgno);
+ GET_SIZE(dp, size);
+ lock = (DB_LOCK_ILOCK *) dp;
+ save_pgno = lock->pgno;
+ obj_dbt.data = dp;
+ obj_dbt.size = size;
+ dp = ((u_int8_t *)dp) + DB_ALIGN(size, sizeof(u_int32_t));
+ do {
+ if ((ret = __lock_get_internal(lt, locker,
+ flags, &obj_dbt, lock_mode, 0, &ret_lock)) != 0) {
+ lock->pgno = save_pgno;
+ goto err;
+ }
+ if (npgno != 0)
+ GET_PGNO(dp, lock->pgno);
+ } while (npgno-- != 0);
+ lock->pgno = save_pgno;
+ }
+
+err: LOCK_SYSTEM_UNLOCK(lt, region);
+ if (data != NULL)
+ __os_free(env, data);
+ return (ret);
+}
+
+#define UINT32_CMP(A, B) ((A) == (B) ? 0 : ((A) > (B) ? 1 : -1))
+static int
+__lock_sort_cmp(a, b)
+ const void *a, *b;
+{
+ const DBT *d1, *d2;
+ DB_LOCK_ILOCK *l1, *l2;
+
+ d1 = a;
+ d2 = b;
+
+ /* Force all non-standard locks to sort at end. */
+ if (d1->size != sizeof(DB_LOCK_ILOCK)) {
+ if (d2->size != sizeof(DB_LOCK_ILOCK))
+ return (UINT32_CMP(d1->size, d2->size));
+ else
+ return (1);
+ } else if (d2->size != sizeof(DB_LOCK_ILOCK))
+ return (-1);
+
+ l1 = d1->data;
+ l2 = d2->data;
+ if (l1->type != l2->type)
+ return (UINT32_CMP(l1->type, l2->type));
+ return (memcmp(l1->fileid, l2->fileid, DB_FILE_ID_LEN));
+}
+
+/*
+ * PUBLIC: void __lock_list_print __P((ENV *, DB_MSGBUF *, DBT *));
+ */
+void
+__lock_list_print(env, mbp, list)
+ ENV *env;
+ DB_MSGBUF *mbp;
+ DBT *list;
+{
+ DB_LOCK_ILOCK *lock;
+ db_pgno_t pgno;
+ u_int16_t npgno, size;
+ u_int32_t i, nlocks;
+ u_int8_t *fidp;
+ char *fname, *dname, *p, namebuf[26];
+ void *dp;
+
+ if (list->size == 0)
+ return;
+ dp = list->data;
+
+ GET_COUNT(dp, nlocks);
+
+ for (i = 0; i < nlocks; i++) {
+ GET_PCOUNT(dp, npgno);
+ GET_SIZE(dp, size);
+ lock = (DB_LOCK_ILOCK *) dp;
+ fidp = lock->fileid;
+ (void)__dbreg_get_name(env, fidp, &fname, &dname);
+ __db_msgadd(env, mbp, "\t");
+ if (fname == NULL && dname == NULL)
+ __db_msgadd(env, mbp, "(%lx %lx %lx %lx %lx)",
+ (u_long)fidp[0], (u_long)fidp[1], (u_long)fidp[2],
+ (u_long)fidp[3], (u_long)fidp[4]);
+ else {
+ if (fname != NULL && dname != NULL) {
+ (void)snprintf(namebuf, sizeof(namebuf),
+ "%14s.%-10s", fname, dname);
+ p = namebuf;
+ } else if (fname != NULL)
+ p = fname;
+ else
+ p = dname;
+ __db_msgadd(env, mbp, "%-25s", p);
+ }
+ dp = ((u_int8_t *)dp) + DB_ALIGN(size, sizeof(u_int32_t));
+ LOGCOPY_32(env, &pgno, &lock->pgno);
+ do {
+ __db_msgadd(env, mbp, " %d", pgno);
+ if (npgno != 0)
+ GET_PGNO(dp, pgno);
+ } while (npgno-- != 0);
+ __db_msgadd(env, mbp, "\n");
+ }
+}
diff --git a/src/lock/lock_method.c b/src/lock/lock_method.c
new file mode 100644
index 00000000..0cc2e19d
--- /dev/null
+++ b/src/lock/lock_method.c
@@ -0,0 +1,630 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/lock.h"
+
+/*
+ * __lock_env_create --
+ * Lock specific creation of the DB_ENV structure.
+ *
+ * PUBLIC: int __lock_env_create __P((DB_ENV *));
+ */
+int
+__lock_env_create(dbenv)
+ DB_ENV *dbenv;
+{
+ u_int32_t cpu;
+ /*
+ * !!!
+ * Our caller has not yet had the opportunity to reset the panic
+ * state or turn off mutex locking, and so we can neither check
+ * the panic state nor acquire a mutex in the DB_ENV create path.
+ */
+ dbenv->lk_init = 0;
+ dbenv->lk_init_lockers = 0;
+ dbenv->lk_init_objects = 0;
+
+ /*
+ * Default to 10 partitions per CPU. This seems to be near
+ * the point of diminishing returns on Xeon-type processors.
+ * The CPU count often includes hyperthreads, and if there is
+ * only one CPU you probably do not want partitions at all.
+ */
+ cpu = __os_cpu_count();
+ dbenv->lk_partitions = cpu > 1 ? 10 * cpu : 1;
+
+ return (0);
+}
+
+/*
+ * __lock_env_destroy --
+ * Lock specific destruction of the DB_ENV structure.
+ *
+ * PUBLIC: void __lock_env_destroy __P((DB_ENV *));
+ */
+void
+__lock_env_destroy(dbenv)
+ DB_ENV *dbenv;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ if (dbenv->lk_conflicts != NULL) {
+ __os_free(env, dbenv->lk_conflicts);
+ dbenv->lk_conflicts = NULL;
+ }
+}
+
+/*
+ * __lock_get_lk_conflicts
+ * Get the conflicts matrix.
+ *
+ * PUBLIC: int __lock_get_lk_conflicts
+ * PUBLIC: __P((DB_ENV *, const u_int8_t **, int *));
+ */
+int
+__lock_get_lk_conflicts(dbenv, lk_conflictsp, lk_modesp)
+ DB_ENV *dbenv;
+ const u_int8_t **lk_conflictsp;
+ int *lk_modesp;
+{
+ DB_LOCKTAB *lt;
+ ENV *env;
+
+ env = dbenv->env;
+ lt = env->lk_handle;
+
+ ENV_NOT_CONFIGURED(env,
+ env->lk_handle, "DB_ENV->get_lk_conflicts", DB_INIT_LOCK);
+
+ if (LOCKING_ON(env)) {
+ /* Cannot be set after open, no lock required to read. */
+ if (lk_conflictsp != NULL)
+ *lk_conflictsp = lt->conflicts;
+ if (lk_modesp != NULL)
+ *lk_modesp = ((DB_LOCKREGION *)
+ (lt->reginfo.primary))->nmodes;
+ } else {
+ if (lk_conflictsp != NULL)
+ *lk_conflictsp = dbenv->lk_conflicts;
+ if (lk_modesp != NULL)
+ *lk_modesp = dbenv->lk_modes;
+ }
+ return (0);
+}
+
+/*
+ * __lock_set_lk_conflicts
+ * Set the conflicts matrix.
+ *
+ * PUBLIC: int __lock_set_lk_conflicts __P((DB_ENV *, u_int8_t *, int));
+ */
+int
+__lock_set_lk_conflicts(dbenv, lk_conflicts, lk_modes)
+ DB_ENV *dbenv;
+ u_int8_t *lk_conflicts;
+ int lk_modes;
+{
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_lk_conflicts");
+
+ if (dbenv->lk_conflicts != NULL) {
+ __os_free(env, dbenv->lk_conflicts);
+ dbenv->lk_conflicts = NULL;
+ }
+ if ((ret = __os_malloc(env,
+ (size_t)(lk_modes * lk_modes), &dbenv->lk_conflicts)) != 0)
+ return (ret);
+ memcpy(
+ dbenv->lk_conflicts, lk_conflicts, (size_t)(lk_modes * lk_modes));
+ dbenv->lk_modes = lk_modes;
+
+ return (0);
+}
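+
+/*
+ * __lock_conflicts_example --
+ * Illustrative sketch only, not part of this change: an application
+ * supplying its own conflict matrix before DB_ENV->open. The matrix is
+ * flattened row-major with the held mode as the row and the requested
+ * mode as the column; this 3-mode example mimics N/R/W.
+ */
+static int
+__lock_conflicts_example(dbenv)
+ DB_ENV *dbenv;
+{
+ /* N R W */
+ static u_int8_t conflicts[] = {
+ /* N */ 0, 0, 0,
+ /* R */ 0, 0, 1,
+ /* W */ 0, 1, 1
+ };
+
+ /* Must be called before the environment is opened. */
+ return (dbenv->set_lk_conflicts(dbenv, conflicts, 3));
+}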
+
+/*
+ * PUBLIC: int __lock_get_lk_detect __P((DB_ENV *, u_int32_t *));
+ */
+int
+__lock_get_lk_detect(dbenv, lk_detectp)
+ DB_ENV *dbenv;
+ u_int32_t *lk_detectp;
+{
+ DB_LOCKTAB *lt;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->lk_handle, "DB_ENV->get_lk_detect", DB_INIT_LOCK);
+
+ if (LOCKING_ON(env)) {
+ lt = env->lk_handle;
+ ENV_ENTER(env, ip);
+ LOCK_REGION_LOCK(env);
+ *lk_detectp = ((DB_LOCKREGION *)lt->reginfo.primary)->detect;
+ LOCK_REGION_UNLOCK(env);
+ ENV_LEAVE(env, ip);
+ } else
+ *lk_detectp = dbenv->lk_detect;
+ return (0);
+}
+
+/*
+ * __lock_set_lk_detect
+ * DB_ENV->set_lk_detect.
+ *
+ * PUBLIC: int __lock_set_lk_detect __P((DB_ENV *, u_int32_t));
+ */
+int
+__lock_set_lk_detect(dbenv, lk_detect)
+ DB_ENV *dbenv;
+ u_int32_t lk_detect;
+{
+ DB_LOCKREGION *region;
+ DB_LOCKTAB *lt;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->lk_handle, "DB_ENV->set_lk_detect", DB_INIT_LOCK);
+
+ switch (lk_detect) {
+ case DB_LOCK_DEFAULT:
+ case DB_LOCK_EXPIRE:
+ case DB_LOCK_MAXLOCKS:
+ case DB_LOCK_MAXWRITE:
+ case DB_LOCK_MINLOCKS:
+ case DB_LOCK_MINWRITE:
+ case DB_LOCK_OLDEST:
+ case DB_LOCK_RANDOM:
+ case DB_LOCK_YOUNGEST:
+ break;
+ default:
+ __db_errx(env, DB_STR("2043",
+ "DB_ENV->set_lk_detect: unknown deadlock detection mode specified"));
+ return (EINVAL);
+ }
+
+ ret = 0;
+ if (LOCKING_ON(env)) {
+ ENV_ENTER(env, ip);
+
+ lt = env->lk_handle;
+ region = lt->reginfo.primary;
+ LOCK_REGION_LOCK(env);
+ /*
+ * Check for incompatible automatic deadlock detection requests.
+ * There are scenarios where changing the detector configuration
+ * is reasonable, but we disallow them, guessing that such a
+ * change is more likely to be an application error.
+ *
+ * We allow applications to turn on the lock detector, and we
+ * ignore attempts to set it to the default or current value.
+ */
+ if (region->detect != DB_LOCK_NORUN &&
+ lk_detect != DB_LOCK_DEFAULT &&
+ region->detect != lk_detect) {
+ __db_errx(env, DB_STR("2044",
+ "DB_ENV->set_lk_detect: incompatible deadlock detector mode"));
+ ret = EINVAL;
+ } else
+ if (region->detect == DB_LOCK_NORUN)
+ region->detect = lk_detect;
+ LOCK_REGION_UNLOCK(env);
+ ENV_LEAVE(env, ip);
+ } else
+ dbenv->lk_detect = lk_detect;
+
+ return (ret);
+}
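+
+/*
+ * __lock_detect_example --
+ * Illustrative sketch only, not part of this change: turning on automatic
+ * deadlock detection from an application. DB_LOCK_DEFAULT lets the
+ * library pick the victim-selection policy.
+ */
+static int
+__lock_detect_example(dbenv)
+ DB_ENV *dbenv;
+{
+ /* May be called before or after DB_ENV->open, per the code above. */
+ return (dbenv->set_lk_detect(dbenv, DB_LOCK_DEFAULT));
+}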
+
+/*
+ * PUBLIC: int __lock_get_lk_max_locks __P((DB_ENV *, u_int32_t *));
+ */
+int
+__lock_get_lk_max_locks(dbenv, lk_maxp)
+ DB_ENV *dbenv;
+ u_int32_t *lk_maxp;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->lk_handle, "DB_ENV->get_lk_maxlocks", DB_INIT_LOCK);
+
+ if (LOCKING_ON(env)) {
+ /* Cannot be set after open, no lock required to read. */
+ *lk_maxp = ((DB_LOCKREGION *)
+ env->lk_handle->reginfo.primary)->stat.st_maxlocks;
+ } else
+ *lk_maxp = dbenv->lk_max;
+ return (0);
+}
+
+/*
+ * __lock_set_lk_max_locks
+ * DB_ENV->set_lk_max_locks.
+ *
+ * PUBLIC: int __lock_set_lk_max_locks __P((DB_ENV *, u_int32_t));
+ */
+int
+__lock_set_lk_max_locks(dbenv, lk_max)
+ DB_ENV *dbenv;
+ u_int32_t lk_max;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_lk_max_locks");
+
+ dbenv->lk_max = lk_max;
+ return (0);
+}
+
+/*
+ * PUBLIC: int __lock_get_lk_max_lockers __P((DB_ENV *, u_int32_t *));
+ */
+int
+__lock_get_lk_max_lockers(dbenv, lk_maxp)
+ DB_ENV *dbenv;
+ u_int32_t *lk_maxp;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->lk_handle, "DB_ENV->get_lk_max_lockers", DB_INIT_LOCK);
+
+ if (LOCKING_ON(env)) {
+ /* Cannot be set after open, no lock required to read. */
+ *lk_maxp = ((DB_LOCKREGION *)
+ env->lk_handle->reginfo.primary)->stat.st_maxlockers;
+ } else
+ *lk_maxp = dbenv->lk_max_lockers;
+ return (0);
+}
+
+/*
+ * __lock_set_lk_max_lockers
+ * DB_ENV->set_lk_max_lockers.
+ *
+ * PUBLIC: int __lock_set_lk_max_lockers __P((DB_ENV *, u_int32_t));
+ */
+int
+__lock_set_lk_max_lockers(dbenv, lk_max)
+ DB_ENV *dbenv;
+ u_int32_t lk_max;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_lk_max_lockers");
+
+ dbenv->lk_max_lockers = lk_max;
+ return (0);
+}
+
+/*
+ * PUBLIC: int __lock_get_lk_max_objects __P((DB_ENV *, u_int32_t *));
+ */
+int
+__lock_get_lk_max_objects(dbenv, lk_maxp)
+ DB_ENV *dbenv;
+ u_int32_t *lk_maxp;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->lk_handle, "DB_ENV->get_lk_max_objects", DB_INIT_LOCK);
+
+ if (LOCKING_ON(env)) {
+ /* Cannot be set after open, no lock required to read. */
+ *lk_maxp = ((DB_LOCKREGION *)
+ env->lk_handle->reginfo.primary)->stat.st_maxobjects;
+ } else
+ *lk_maxp = dbenv->lk_max_objects;
+ return (0);
+}
+
+/*
+ * __lock_set_lk_max_objects
+ * DB_ENV->set_lk_max_objects.
+ *
+ * PUBLIC: int __lock_set_lk_max_objects __P((DB_ENV *, u_int32_t));
+ */
+int
+__lock_set_lk_max_objects(dbenv, lk_max)
+ DB_ENV *dbenv;
+ u_int32_t lk_max;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_lk_max_objects");
+
+ dbenv->lk_max_objects = lk_max;
+ return (0);
+}
+
+/*
+ * PUBLIC: int __lock_get_lk_partitions __P((DB_ENV *, u_int32_t *));
+ */
+int
+__lock_get_lk_partitions(dbenv, lk_partitionp)
+ DB_ENV *dbenv;
+ u_int32_t *lk_partitionp;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->lk_handle, "DB_ENV->get_lk_partitions", DB_INIT_LOCK);
+
+ if (LOCKING_ON(env)) {
+ /* Cannot be set after open, no lock required to read. */
+ *lk_partitionp = ((DB_LOCKREGION *)
+ env->lk_handle->reginfo.primary)->stat.st_partitions;
+ } else
+ *lk_partitionp = dbenv->lk_partitions;
+ return (0);
+}
+
+/*
+ * __lock_set_lk_partitions
+ * DB_ENV->set_lk_partitions.
+ *
+ * PUBLIC: int __lock_set_lk_partitions __P((DB_ENV *, u_int32_t));
+ */
+int
+__lock_set_lk_partitions(dbenv, lk_partitions)
+ DB_ENV *dbenv;
+ u_int32_t lk_partitions;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_lk_partitions");
+
+ dbenv->lk_partitions = lk_partitions;
+ return (0);
+}
+
+/*
+ * PUBLIC: int __lock_get_lk_tablesize __P((DB_ENV *, u_int32_t *));
+ */
+int
+__lock_get_lk_tablesize(dbenv, lk_tablesizep)
+ DB_ENV *dbenv;
+ u_int32_t *lk_tablesizep;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->lk_handle, "DB_ENV->get_lk_tablesize", DB_INIT_LOCK);
+
+ if (LOCKING_ON(env)) {
+ /* Cannot be set after open, no lock required to read. */
+ *lk_tablesizep = ((DB_LOCKREGION *)
+ env->lk_handle->reginfo.primary)->stat.st_tablesize;
+ } else
+ *lk_tablesizep = dbenv->object_t_size;
+ return (0);
+}
+
+/*
+ * __lock_set_lk_tablesize
+ * DB_ENV->set_lk_tablesize.
+ *
+ * PUBLIC: int __lock_set_lk_tablesize __P((DB_ENV *, u_int32_t));
+ */
+int
+__lock_set_lk_tablesize(dbenv, lk_tablesize)
+ DB_ENV *dbenv;
+ u_int32_t lk_tablesize;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_lk_tablesize");
+
+ dbenv->object_t_size = lk_tablesize;
+ return (0);
+}
+
+/*
+ * __lock_set_lk_priority --
+ * Set a locker's priority.
+ *
+ * PUBLIC: int __lock_set_lk_priority __P((DB_ENV *, u_int32_t, u_int32_t));
+ */
+int
+__lock_set_lk_priority(dbenv, lockid, priority)
+ DB_ENV *dbenv;
+ u_int32_t lockid, priority;
+{
+ DB_LOCKER *locker;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ if (!LOCKING_ON(env))
+ return (EINVAL);
+
+ if ((ret = __lock_getlocker(env->lk_handle, lockid, 0, &locker)) == 0)
+ locker->priority = priority;
+ return (ret);
+}
+
+/*
+ * __lock_get_lk_priority --
+ * Get a locker's priority.
+ *
+ * PUBLIC: int __lock_get_lk_priority __P((DB_ENV *, u_int32_t, u_int32_t *));
+ */
+int
+__lock_get_lk_priority(dbenv, lockid, priorityp)
+ DB_ENV *dbenv;
+ u_int32_t lockid, *priorityp;
+{
+ DB_LOCKER *locker;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ if (!LOCKING_ON(env))
+ return (EINVAL);
+
+ if ((ret = __lock_getlocker(env->lk_handle, lockid, 0, &locker)) == 0)
+ *priorityp = locker->priority;
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __lock_get_env_timeout
+ * PUBLIC: __P((DB_ENV *, db_timeout_t *, u_int32_t));
+ */
+int
+__lock_get_env_timeout(dbenv, timeoutp, flag)
+ DB_ENV *dbenv;
+ db_timeout_t *timeoutp;
+ u_int32_t flag;
+{
+ DB_LOCKREGION *region;
+ DB_LOCKTAB *lt;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->lk_handle, "DB_ENV->get_env_timeout", DB_INIT_LOCK);
+
+ ret = 0;
+ if (LOCKING_ON(env)) {
+ lt = env->lk_handle;
+ region = lt->reginfo.primary;
+ ENV_ENTER(env, ip);
+ LOCK_REGION_LOCK(env);
+ switch (flag) {
+ case DB_SET_LOCK_TIMEOUT:
+ *timeoutp = region->lk_timeout;
+ break;
+ case DB_SET_TXN_TIMEOUT:
+ *timeoutp = region->tx_timeout;
+ break;
+ default:
+ ret = 1;
+ break;
+ }
+ LOCK_REGION_UNLOCK(env);
+ ENV_LEAVE(env, ip);
+ } else
+ switch (flag) {
+ case DB_SET_LOCK_TIMEOUT:
+ *timeoutp = dbenv->lk_timeout;
+ break;
+ case DB_SET_TXN_TIMEOUT:
+ *timeoutp = dbenv->tx_timeout;
+ break;
+ default:
+ ret = 1;
+ break;
+ }
+
+ if (ret)
+ ret = __db_ferr(env, "DB_ENV->get_timeout", 0);
+
+ return (ret);
+}
+
+/*
+ * __lock_set_env_timeout
+ * DB_ENV->set_lock_timeout.
+ *
+ * PUBLIC: int __lock_set_env_timeout __P((DB_ENV *, db_timeout_t, u_int32_t));
+ */
+int
+__lock_set_env_timeout(dbenv, timeout, flags)
+ DB_ENV *dbenv;
+ db_timeout_t timeout;
+ u_int32_t flags;
+{
+ DB_LOCKREGION *region;
+ DB_LOCKTAB *lt;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->lk_handle, "DB_ENV->set_env_timeout", DB_INIT_LOCK);
+
+ ret = 0;
+ if (LOCKING_ON(env)) {
+ lt = env->lk_handle;
+ region = lt->reginfo.primary;
+ ENV_ENTER(env, ip);
+ LOCK_REGION_LOCK(env);
+ switch (flags) {
+ case DB_SET_LOCK_TIMEOUT:
+ region->lk_timeout = timeout;
+ break;
+ case DB_SET_TXN_TIMEOUT:
+ region->tx_timeout = timeout;
+ break;
+ default:
+ ret = 1;
+ break;
+ }
+ LOCK_REGION_UNLOCK(env);
+ ENV_LEAVE(env, ip);
+ } else
+ switch (flags) {
+ case DB_SET_LOCK_TIMEOUT:
+ dbenv->lk_timeout = timeout;
+ break;
+ case DB_SET_TXN_TIMEOUT:
+ dbenv->tx_timeout = timeout;
+ break;
+ default:
+ ret = 1;
+ break;
+ }
+
+ if (ret)
+ ret = __db_ferr(env, "DB_ENV->set_timeout", 0);
+
+ return (ret);
+}
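+
+/*
+ * __lock_timeout_example --
+ * Illustrative sketch only, not part of this change: setting the lock and
+ * transaction timeouts through the public DB_ENV->set_timeout method,
+ * which ultimately reaches __lock_set_env_timeout above. Timeouts are
+ * expressed in microseconds.
+ */
+static int
+__lock_timeout_example(dbenv)
+ DB_ENV *dbenv;
+{
+ int ret;
+
+ /* Time out individual lock requests after one second. */
+ if ((ret = dbenv->set_timeout(dbenv, 1000000, DB_SET_LOCK_TIMEOUT)) != 0)
+ return (ret);
+ /* Time out transactions after five seconds. */
+ return (dbenv->set_timeout(dbenv, 5000000, DB_SET_TXN_TIMEOUT));
+}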
diff --git a/src/lock/lock_region.c b/src/lock/lock_region.c
new file mode 100644
index 00000000..1aae1815
--- /dev/null
+++ b/src/lock/lock_region.c
@@ -0,0 +1,578 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/lock.h"
+
+static int __lock_region_init __P((ENV *, DB_LOCKTAB *));
+
+/*
+ * The conflict arrays are set up such that the row is the lock you are
+ * holding and the column is the lock that is desired.
+ */
+#define DB_LOCK_RIW_N 9
+static const u_int8_t db_riw_conflicts[] = {
+/* N R W WT IW IR RIW DR WW */
+/* N */ 0, 0, 0, 0, 0, 0, 0, 0, 0,
+/* R */ 0, 0, 1, 0, 1, 0, 1, 0, 1,
+/* W */ 0, 1, 1, 1, 1, 1, 1, 1, 1,
+/* WT */ 0, 0, 0, 0, 0, 0, 0, 0, 0,
+/* IW */ 0, 1, 1, 0, 0, 0, 0, 1, 1,
+/* IR */ 0, 0, 1, 0, 0, 0, 0, 0, 1,
+/* RIW */ 0, 1, 1, 0, 0, 0, 0, 1, 1,
+/* DR */ 0, 0, 1, 0, 1, 0, 1, 0, 0,
+/* WW */ 0, 1, 1, 0, 1, 1, 1, 0, 1
+};
+
+/*
+ * This conflict array is used for concurrent db access (CDB). It uses
+ * the first five lock modes of the db_riw_conflicts array; the IW mode
+ * is used for write cursors.
+ */
+#define DB_LOCK_CDB_N 5
+static const u_int8_t db_cdb_conflicts[] = {
+ /* N R W WT IW */
+ /* N */ 0, 0, 0, 0, 0,
+ /* R */ 0, 0, 1, 0, 0,
+ /* W */ 0, 1, 1, 1, 1,
+ /* WT */ 0, 0, 0, 0, 0,
+ /* IW */ 0, 0, 1, 0, 1
+};
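+
+/*
+ * __lock_conflicts_check --
+ * Illustrative sketch only, not part of this change: the arrays above are
+ * flattened row-major, so deciding whether a held mode blocks a request
+ * is a single table lookup. For example, in db_riw_conflicts a held
+ * write (row W) conflicts with a read request (column R).
+ */
+static int
+__lock_conflicts_check(conflicts, nmodes, held, requested)
+ const u_int8_t *conflicts;
+ int nmodes;
+ db_lockmode_t held, requested;
+{
+ /* The row is the mode held, the column is the mode desired. */
+ return (conflicts[(int)held * nmodes + (int)requested] != 0);
+}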
+
+/*
+ * __lock_open --
+ * Internal version of lock_open: only called from ENV->open.
+ *
+ * PUBLIC: int __lock_open __P((ENV *));
+ */
+int
+__lock_open(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+ DB_LOCKREGION *region;
+ DB_LOCKTAB *lt;
+ int region_locked, ret;
+
+ dbenv = env->dbenv;
+ region_locked = 0;
+
+ /* Create the lock table structure. */
+ if ((ret = __os_calloc(env, 1, sizeof(DB_LOCKTAB), &lt)) != 0)
+ return (ret);
+ lt->env = env;
+
+ /* Join/create the lock region. */
+ if ((ret = __env_region_share(env, &lt->reginfo)) != 0)
+ goto err;
+
+ /* If we created the region, initialize it. */
+ if (F_ISSET(&lt->reginfo, REGION_CREATE))
+ if ((ret = __lock_region_init(env, lt)) != 0)
+ goto err;
+
+ /* Set the local addresses. */
+ region = lt->reginfo.primary =
+ R_ADDR(&lt->reginfo, ((REGENV *)env->reginfo->primary)->lt_primary);
+
+ /* Set remaining pointers into region. */
+ lt->conflicts = R_ADDR(&lt->reginfo, region->conf_off);
+ lt->obj_tab = R_ADDR(&lt->reginfo, region->obj_off);
+#ifdef HAVE_STATISTICS
+ lt->obj_stat = R_ADDR(&lt->reginfo, region->stat_off);
+#endif
+ lt->part_array = R_ADDR(&lt->reginfo, region->part_off);
+ lt->locker_tab = R_ADDR(&lt->reginfo, region->locker_off);
+
+ env->lk_handle = lt;
+ lt->reginfo.mtx_alloc = region->mtx_region;
+
+ LOCK_REGION_LOCK(env);
+ region_locked = 1;
+
+ if (dbenv->lk_detect != DB_LOCK_NORUN) {
+ /*
+ * Check for incompatible automatic deadlock detection requests.
+ * There are scenarios where changing the detector configuration
+ * is reasonable, but we disallow them, guessing that such a
+ * change is more likely to be an application error.
+ *
+ * We allow applications to turn on the lock detector, and we
+ * ignore attempts to set it to the default or current value.
+ */
+ if (region->detect != DB_LOCK_NORUN &&
+ dbenv->lk_detect != DB_LOCK_DEFAULT &&
+ region->detect != dbenv->lk_detect) {
+ __db_errx(env, DB_STR("2041",
+ "lock_open: incompatible deadlock detector mode"));
+ ret = EINVAL;
+ goto err;
+ }
+ if (region->detect == DB_LOCK_NORUN)
+ region->detect = dbenv->lk_detect;
+ }
+
+ /*
+ * A process joining the region may have reset the lock and transaction
+ * timeouts.
+ */
+ if (dbenv->lk_timeout != 0)
+ region->lk_timeout = dbenv->lk_timeout;
+ if (dbenv->tx_timeout != 0)
+ region->tx_timeout = dbenv->tx_timeout;
+
+ LOCK_REGION_UNLOCK(env);
+ region_locked = 0;
+
+ return (0);
+
+err: if (lt->reginfo.addr != NULL) {
+ if (region_locked)
+ LOCK_REGION_UNLOCK(env);
+ (void)__env_region_detach(env, &lt->reginfo, 0);
+ }
+ env->lk_handle = NULL;
+
+ __os_free(env, lt);
+ return (ret);
+}
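+
+/*
+ * Illustrative sketch only, not part of this change: __lock_open runs as
+ * part of DB_ENV->open when DB_INIT_LOCK is requested. A minimal
+ * application setup, with error handling trimmed and a placeholder home
+ * directory:
+ *
+ * DB_ENV *dbenv;
+ *
+ * (void)db_env_create(&dbenv, 0);
+ * (void)dbenv->set_lk_partitions(dbenv, 10);
+ * (void)dbenv->open(dbenv, "/path/to/env",
+ *     DB_CREATE | DB_INIT_LOCK | DB_INIT_MPOOL | DB_THREAD, 0);
+ */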
+
+/*
+ * __lock_region_init --
+ * Initialize the lock region.
+ */
+static int
+__lock_region_init(env, lt)
+ ENV *env;
+ DB_LOCKTAB *lt;
+{
+ const u_int8_t *lk_conflicts;
+ struct __db_lock *lp;
+ DB_ENV *dbenv;
+ DB_LOCKER *lidp;
+ DB_LOCKOBJ *op;
+ DB_LOCKREGION *region;
+ DB_LOCKPART *part;
+ u_int32_t extra_locks, extra_objects, i, j, max;
+ u_int8_t *addr;
+ int lk_modes, ret;
+
+ dbenv = env->dbenv;
+
+ if ((ret = __env_alloc(&lt->reginfo,
+ sizeof(DB_LOCKREGION), &lt->reginfo.primary)) != 0)
+ goto mem_err;
+ ((REGENV *)env->reginfo->primary)->lt_primary =
+ R_OFFSET(&lt->reginfo, lt->reginfo.primary);
+ region = lt->reginfo.primary;
+ memset(region, 0, sizeof(*region));
+
+ /* We share the region so we need the same mutex. */
+ region->mtx_region = ((REGENV *)env->reginfo->primary)->mtx_regenv;
+
+ /* Select a conflict matrix if none specified. */
+ if (dbenv->lk_modes == 0)
+ if (CDB_LOCKING(env)) {
+ lk_modes = DB_LOCK_CDB_N;
+ lk_conflicts = db_cdb_conflicts;
+ } else {
+ lk_modes = DB_LOCK_RIW_N;
+ lk_conflicts = db_riw_conflicts;
+ }
+ else {
+ lk_modes = dbenv->lk_modes;
+ lk_conflicts = dbenv->lk_conflicts;
+ }
+
+ region->need_dd = 0;
+ timespecclear(&region->next_timeout);
+ region->detect = DB_LOCK_NORUN;
+ region->lk_timeout = dbenv->lk_timeout;
+ region->tx_timeout = dbenv->tx_timeout;
+ region->locker_t_size = dbenv->locker_t_size;
+ region->object_t_size = dbenv->object_t_size;
+ region->part_t_size = dbenv->lk_partitions;
+ region->lock_id = 0;
+ region->cur_maxid = DB_LOCK_MAXID;
+ region->nmodes = lk_modes;
+ memset(&region->stat, 0, sizeof(region->stat));
+ region->stat.st_maxlocks = dbenv->lk_max;
+ region->stat.st_maxlockers = dbenv->lk_max_lockers;
+ region->stat.st_maxobjects = dbenv->lk_max_objects;
+ region->stat.st_initlocks = region->stat.st_locks = dbenv->lk_init;
+ region->stat.st_initlockers =
+ region->stat.st_lockers = dbenv->lk_init_lockers;
+ region->stat.st_initobjects =
+ region->stat.st_objects = dbenv->lk_init_objects;
+ region->stat.st_partitions = dbenv->lk_partitions;
+ region->stat.st_tablesize = dbenv->object_t_size;
+
+ /* Allocate room for the conflict matrix and initialize it. */
+ if ((ret = __env_alloc(
+ &lt->reginfo, (size_t)(lk_modes * lk_modes), &addr)) != 0)
+ goto mem_err;
+ memcpy(addr, lk_conflicts, (size_t)(lk_modes * lk_modes));
+ region->conf_off = R_OFFSET(&lt->reginfo, addr);
+
+ /* Allocate room for the object hash table and initialize it. */
+ if ((ret = __env_alloc(&lt->reginfo,
+ region->object_t_size * sizeof(DB_HASHTAB), &addr)) != 0)
+ goto mem_err;
+ __db_hashinit(addr, region->object_t_size);
+ region->obj_off = R_OFFSET(&lt->reginfo, addr);
+
+#ifdef HAVE_STATISTICS
+ /* Allocate room for the object hash stats table and initialize it. */
+ if ((ret = __env_alloc(&lt->reginfo,
+ region->object_t_size * sizeof(DB_LOCK_HSTAT), &addr)) != 0)
+ goto mem_err;
+ memset(addr, 0, region->object_t_size * sizeof(DB_LOCK_HSTAT));
+ region->stat_off = R_OFFSET(&lt->reginfo, addr);
+#endif
+
+ /* Allocate room for the partition table and initialize its mutexes. */
+ if ((ret = __env_alloc(&lt->reginfo,
+ region->part_t_size * sizeof(DB_LOCKPART), &part)) != 0)
+ goto mem_err;
+ memset(part, 0, region->part_t_size * sizeof(DB_LOCKPART));
+ region->part_off = R_OFFSET(&lt->reginfo, part);
+ for (i = 0; i < region->part_t_size; i++) {
+ if ((ret = __mutex_alloc(
+ env, MTX_LOCK_REGION, 0, &part[i].mtx_part)) != 0)
+ return (ret);
+ }
+ if ((ret = __mutex_alloc(
+ env, MTX_LOCK_REGION, 0, &region->mtx_dd)) != 0)
+ return (ret);
+
+ if ((ret = __mutex_alloc(
+ env, MTX_LOCK_REGION, 0, &region->mtx_lockers)) != 0)
+ return (ret);
+
+ /* Allocate room for the locker hash table and initialize it. */
+ if ((ret = __env_alloc(&lt->reginfo,
+ region->locker_t_size * sizeof(DB_HASHTAB), &addr)) != 0)
+ goto mem_err;
+ __db_hashinit(addr, region->locker_t_size);
+ region->locker_off = R_OFFSET(&lt->reginfo, addr);
+
+ SH_TAILQ_INIT(&region->dd_objs);
+
+ /*
+ * If the locks and objects don't divide evenly, spread them around.
+ */
+ extra_locks = region->stat.st_locks -
+ ((region->stat.st_locks / region->part_t_size) *
+ region->part_t_size);
+ extra_objects = region->stat.st_objects -
+ ((region->stat.st_objects / region->part_t_size) *
+ region->part_t_size);
+ for (j = 0; j < region->part_t_size; j++) {
+ /* Initialize locks onto a free list. */
+ SH_TAILQ_INIT(&part[j].free_locks);
+ max = region->stat.st_locks / region->part_t_size;
+ if (extra_locks > 0) {
+ max++;
+ extra_locks--;
+ }
+
+ if ((ret =
+ __env_alloc(&lt->reginfo,
+ sizeof(struct __db_lock) * max,
+ &lp)) != 0)
+ goto mem_err;
+ part[j].lock_mem_off = R_OFFSET(&lt->reginfo, lp);
+ for (i = 0; i < max; ++i) {
+ memset(lp, 0, sizeof(*lp));
+ lp->status = DB_LSTAT_FREE;
+ SH_TAILQ_INSERT_HEAD(
+ &part[j].free_locks, lp, links, __db_lock);
+ ++lp;
+ }
+
+ /* Initialize objects onto a free list. */
+ max = region->stat.st_objects / region->part_t_size;
+ if (extra_objects > 0) {
+ max++;
+ extra_objects--;
+ }
+ SH_TAILQ_INIT(&part[j].free_objs);
+
+ if ((ret =
+ __env_alloc(&lt->reginfo,
+ sizeof(DB_LOCKOBJ) * max,
+ &op)) != 0)
+ goto mem_err;
+ part[j].lockobj_mem_off = R_OFFSET(&lt->reginfo, op);
+ for (i = 0; i < max; ++i) {
+ memset(op, 0, sizeof(*op));
+ SH_TAILQ_INSERT_HEAD(
+ &part[j].free_objs, op, links, __db_lockobj);
+ ++op;
+ }
+ }
+
+ /* Initialize lockers onto a free list. */
+ SH_TAILQ_INIT(&region->lockers);
+ SH_TAILQ_INIT(&region->free_lockers);
+ if ((ret =
+ __env_alloc(&lt->reginfo,
+ sizeof(DB_LOCKER) * region->stat.st_lockers,
+ &lidp)) != 0)
+ goto mem_err;
+
+ region->locker_mem_off = R_OFFSET(&lt->reginfo, lidp);
+ for (i = 0; i < region->stat.st_lockers; ++i) {
+ SH_TAILQ_INSERT_HEAD(
+ &region->free_lockers, lidp, links, __db_locker);
+ ++lidp;
+ }
+ return (0);
+mem_err: __db_errx(env, DB_STR("2042",
+ "unable to allocate memory for the lock table"));
+ return (ret);
+}
+
+/*
+ * __lock_env_refresh --
+ * Clean up after the lock system on a close or failed open.
+ *
+ * PUBLIC: int __lock_env_refresh __P((ENV *));
+ */
+int
+__lock_env_refresh(env)
+ ENV *env;
+{
+ DB_LOCKREGION *lr;
+ DB_LOCKTAB *lt;
+ REGINFO *reginfo;
+ u_int32_t j;
+ int ret;
+
+ lt = env->lk_handle;
+ reginfo = &lt->reginfo;
+ lr = reginfo->primary;
+
+ /*
+ * If a private region, return the memory to the heap. Not needed for
+ * filesystem-backed or system shared memory regions; that memory isn't
+ * owned by any particular process.
+ */
+ if (F_ISSET(env, ENV_PRIVATE)) {
+ reginfo->mtx_alloc = MUTEX_INVALID;
+ /* Discard the conflict matrix. */
+ __env_alloc_free(reginfo, R_ADDR(reginfo, lr->conf_off));
+
+ /* Discard the object hash table. */
+ __env_alloc_free(reginfo, R_ADDR(reginfo, lr->obj_off));
+
+ /* Discard the locker hash table. */
+ __env_alloc_free(reginfo, R_ADDR(reginfo, lr->locker_off));
+
+ /* Discard the object hash stat table. */
+ __env_alloc_free(reginfo, R_ADDR(reginfo, lr->stat_off));
+ for (j = 0; j < lr->part_t_size; j++) {
+ SH_TAILQ_INIT(&FREE_OBJS(lt, j));
+ SH_TAILQ_INIT(&FREE_LOCKS(lt, j));
+ __env_alloc_free(reginfo,
+ R_ADDR(reginfo,
+ lt->part_array[j].lock_mem_off));
+ __env_alloc_free(reginfo,
+ R_ADDR(reginfo,
+ lt->part_array[j].lockobj_mem_off));
+ }
+
+ /* Discard the object partition array. */
+ __env_alloc_free(reginfo, R_ADDR(reginfo, lr->part_off));
+ SH_TAILQ_INIT(&lr->free_lockers);
+ __env_alloc_free(reginfo,
+ R_ADDR(reginfo, lr->locker_mem_off));
+ }
+
+ /* Detach from the region. */
+ ret = __env_region_detach(env, reginfo, 0);
+
+ /* Discard DB_LOCKTAB. */
+ __os_free(env, lt);
+ env->lk_handle = NULL;
+
+ return (ret);
+}
+
+/*
+ * __lock_region_mutex_count --
+ * Return the number of mutexes the lock region will need.
+ *
+ * PUBLIC: u_int32_t __lock_region_mutex_count __P((ENV *));
+ */
+u_int32_t
+__lock_region_mutex_count(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+
+ dbenv = env->dbenv;
+
+ /*
+ * We need one mutex per locker for it to block on, one per lock
+ * partition, and a few for the lock region itself.
+ */
+ return (dbenv->lk_init_lockers + dbenv->lk_partitions + 3);
+}
+
+/*
+ * __lock_region_mutex_max --
+ * Return the number of additional mutexes the lock region will need.
+ *
+ * PUBLIC: u_int32_t __lock_region_mutex_max __P((ENV *));
+ */
+u_int32_t
+__lock_region_mutex_max(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+ u_int32_t count;
+
+ dbenv = env->dbenv;
+
+ /*
+ * For backward compatibility, ensure enough mutexes.
+ * These might actually get used by other things.
+ */
+ if ((count = dbenv->lk_max_lockers) == 0)
+ count = DB_LOCK_DEFAULT_N;
+ if (count > dbenv->lk_init_lockers)
+ return (count - dbenv->lk_init_lockers);
+ else
+ return (0);
+}
+
+/*
+ * __lock_region_max --
+ * Return the amount of extra memory to allocate for locking information.
+ * PUBLIC: size_t __lock_region_max __P((ENV *));
+ */
+size_t
+__lock_region_max(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+ size_t retval;
+ u_int32_t count;
+
+ dbenv = env->dbenv;
+
+ retval = 0;
+ if ((count = dbenv->lk_max) == 0)
+ count = DB_LOCK_DEFAULT_N;
+ if (count > dbenv->lk_init)
+ retval += __env_alloc_size(sizeof(struct __db_lock)) *
+ (count - dbenv->lk_init);
+ if ((count = dbenv->lk_max_objects) == 0)
+ count = DB_LOCK_DEFAULT_N;
+ if (count > dbenv->lk_init_objects)
+ retval += __env_alloc_size(sizeof(DB_LOCKOBJ)) *
+ (count - dbenv->lk_init_objects);
+ if ((count = dbenv->lk_max_lockers) == 0)
+ count = DB_LOCK_DEFAULT_N;
+ if (count > dbenv->lk_init_lockers)
+ retval += __env_alloc_size(sizeof(DB_LOCKER)) *
+ (count - dbenv->lk_init_lockers);
+
+ /* And we keep getting this wrong, let's be generous. */
+ retval += retval / 4;
+
+ return (retval);
+}
+
+/*
+ * __lock_region_size --
+ * Return the initial region size.
+ * PUBLIC: size_t __lock_region_size __P((ENV *, size_t));
+ */
+size_t
+__lock_region_size(env, other_alloc)
+ ENV *env;
+ size_t other_alloc;
+{
+ DB_ENV *dbenv;
+ size_t retval;
+ u_int32_t count;
+
+ dbenv = env->dbenv;
+
+ /* Make sure there are at least 5 objects and locks per partition. */
+ if (dbenv->lk_init_objects < dbenv->lk_partitions * 5)
+ dbenv->lk_init_objects = dbenv->lk_partitions * 5;
+ if (dbenv->lk_init < dbenv->lk_partitions * 5)
+ dbenv->lk_init = dbenv->lk_partitions * 5;
+ /*
+ * Figure out how much space we're going to need. This list should
+ * map one-to-one with the __env_alloc calls in __lock_region_init.
+ */
+ retval = 0;
+ retval += __env_alloc_size(sizeof(DB_LOCKREGION));
+ retval += __env_alloc_size((size_t)(dbenv->lk_modes * dbenv->lk_modes));
+ /*
+ * Try to figure out the size of the locker hash table.
+ */
+ if (dbenv->lk_max_lockers != 0)
+ dbenv->locker_t_size = __db_tablesize(dbenv->lk_max_lockers);
+ else if (dbenv->tx_max != 0)
+ dbenv->locker_t_size = __db_tablesize(dbenv->tx_max);
+ else {
+ if (dbenv->memory_max != 0)
+ count = (u_int32_t)
+ (((dbenv->memory_max - other_alloc) / 10) /
+ sizeof(DB_LOCKER));
+ else
+ count = DB_LOCK_DEFAULT_N / 10;
+ if (count < dbenv->lk_init_lockers)
+ count = dbenv->lk_init_lockers;
+ dbenv->locker_t_size = __db_tablesize(count);
+ }
+ retval += __env_alloc_size(dbenv->locker_t_size * (sizeof(DB_HASHTAB)));
+ retval += __env_alloc_size(sizeof(DB_LOCKER)) * dbenv->lk_init_lockers;
+ retval += __env_alloc_size(sizeof(struct __db_lock) * dbenv->lk_init);
+ other_alloc += retval;
+ /*
+ * We want to allocate an object hash table that is big enough to
+ * avoid many collisions, but not too big for starters. Arbitrarily
+ * pick the point two-thirds of the way to the max size. If the max
+ * is not stated, then guess that objects will fill half the memory.
+ * If we don't know how much memory there might be, we just wind up
+ * using the default value. If this winds up being less than the
+ * init value, then we just make the table fit the init value.
+ */
+ if ((count = dbenv->lk_max_objects) == 0) {
+ if (dbenv->memory_max != 0)
+ count = (u_int32_t)(
+ ((dbenv->memory_max - other_alloc) / 2)
+ / sizeof(DB_LOCKOBJ));
+ else
+ count = DB_LOCK_DEFAULT_N;
+ if (count < dbenv->lk_init_objects)
+ count = dbenv->lk_init_objects;
+ }
+ count *= 2;
+ count += dbenv->lk_init_objects;
+ count /= 3;
+ if (dbenv->object_t_size == 0)
+ dbenv->object_t_size = __db_tablesize(count);
+ retval += __env_alloc_size(
+ __db_tablesize(dbenv->object_t_size) * (sizeof(DB_HASHTAB)));
+#ifdef HAVE_STATISTICS
+ retval += __env_alloc_size(
+ __db_tablesize(dbenv->object_t_size) * (sizeof(DB_LOCK_HSTAT)));
+#endif
+ retval +=
+ __env_alloc_size(dbenv->lk_partitions * (sizeof(DB_LOCKPART)));
+ retval += __env_alloc_size(sizeof(DB_LOCKOBJ) * dbenv->lk_init_objects);
+
+ return (retval);
+}
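+
+/*
+ * Worked example, illustrative only and not part of this change: with
+ * lk_max_objects == 10000 and lk_init_objects == 1000, the sizing above
+ * computes (2 * 10000 + 1000) / 3 == 7000, i.e. the point two-thirds of
+ * the way from the initial to the maximum object count, which is then
+ * rounded to a hash-friendly size by __db_tablesize().
+ */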
diff --git a/src/lock/lock_stat.c b/src/lock/lock_stat.c
new file mode 100644
index 00000000..11b934aa
--- /dev/null
+++ b/src/lock/lock_stat.c
@@ -0,0 +1,770 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+#include "dbinc/db_am.h"
+
+#ifdef HAVE_STATISTICS
+static int __lock_dump_locker
+ __P((ENV *, DB_MSGBUF *, DB_LOCKTAB *, DB_LOCKER *));
+static int __lock_dump_object __P((DB_LOCKTAB *, DB_MSGBUF *, DB_LOCKOBJ *));
+static int __lock_print_all __P((ENV *, u_int32_t));
+static int __lock_print_stats __P((ENV *, u_int32_t));
+static void __lock_print_header __P((ENV *));
+static int __lock_stat __P((ENV *, DB_LOCK_STAT **, u_int32_t));
+
+/*
+ * __lock_stat_pp --
+ * ENV->lock_stat pre/post processing.
+ *
+ * PUBLIC: int __lock_stat_pp __P((DB_ENV *, DB_LOCK_STAT **, u_int32_t));
+ */
+int
+__lock_stat_pp(dbenv, statp, flags)
+ DB_ENV *dbenv;
+ DB_LOCK_STAT **statp;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->lk_handle, "DB_ENV->lock_stat", DB_INIT_LOCK);
+
+ if ((ret = __db_fchk(env,
+ "DB_ENV->lock_stat", flags, DB_STAT_CLEAR)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__lock_stat(env, statp, flags)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
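+
+/*
+ * __lock_stat_example --
+ * Illustrative sketch only, not part of this change: fetching and
+ * releasing lock statistics from an application. The stats block is
+ * allocated by the library and, with the default allocator, is freed by
+ * the caller with free(3).
+ */
+static int
+__lock_stat_example(dbenv)
+ DB_ENV *dbenv;
+{
+ DB_LOCK_STAT *sp;
+ int ret;
+
+ if ((ret = dbenv->lock_stat(dbenv, &sp, 0)) != 0)
+ return (ret);
+ printf("current lockers: %lu, current locks: %lu\n",
+ (u_long)sp->st_nlockers, (u_long)sp->st_nlocks);
+ free(sp);
+ return (0);
+}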
+
+/*
+ * __lock_stat --
+ * ENV->lock_stat.
+ */
+static int
+__lock_stat(env, statp, flags)
+ ENV *env;
+ DB_LOCK_STAT **statp;
+ u_int32_t flags;
+{
+ DB_LOCKREGION *region;
+ DB_LOCKTAB *lt;
+ DB_LOCK_STAT *stats, tmp;
+ DB_LOCK_HSTAT htmp;
+ DB_LOCK_PSTAT ptmp;
+ int ret;
+ u_int32_t i;
+ uintmax_t tmp_wait, tmp_nowait;
+
+ *statp = NULL;
+ lt = env->lk_handle;
+
+ if ((ret = __os_umalloc(env, sizeof(*stats), &stats)) != 0)
+ return (ret);
+
+ /* Copy out the global statistics. */
+ LOCK_REGION_LOCK(env);
+
+ region = lt->reginfo.primary;
+ memcpy(stats, &region->stat, sizeof(*stats));
+ stats->st_locktimeout = region->lk_timeout;
+ stats->st_txntimeout = region->tx_timeout;
+ stats->st_id = region->lock_id;
+ stats->st_cur_maxid = region->cur_maxid;
+ stats->st_nlockers = region->nlockers;
+ stats->st_nmodes = region->nmodes;
+
+ for (i = 0; i < region->object_t_size; i++) {
+ stats->st_nrequests += lt->obj_stat[i].st_nrequests;
+ stats->st_nreleases += lt->obj_stat[i].st_nreleases;
+ stats->st_nupgrade += lt->obj_stat[i].st_nupgrade;
+ stats->st_ndowngrade += lt->obj_stat[i].st_ndowngrade;
+ stats->st_lock_wait += lt->obj_stat[i].st_lock_wait;
+ stats->st_lock_nowait += lt->obj_stat[i].st_lock_nowait;
+ stats->st_nlocktimeouts += lt->obj_stat[i].st_nlocktimeouts;
+ stats->st_ntxntimeouts += lt->obj_stat[i].st_ntxntimeouts;
+ if (stats->st_maxhlocks < lt->obj_stat[i].st_maxnlocks)
+ stats->st_maxhlocks = lt->obj_stat[i].st_maxnlocks;
+ if (stats->st_maxhobjects < lt->obj_stat[i].st_maxnobjects)
+ stats->st_maxhobjects = lt->obj_stat[i].st_maxnobjects;
+ if (stats->st_hash_len < lt->obj_stat[i].st_hash_len)
+ stats->st_hash_len = lt->obj_stat[i].st_hash_len;
+ if (LF_ISSET(DB_STAT_CLEAR)) {
+ htmp = lt->obj_stat[i];
+ memset(&lt->obj_stat[i], 0, sizeof(lt->obj_stat[i]));
+ lt->obj_stat[i].st_nlocks = htmp.st_nlocks;
+ lt->obj_stat[i].st_maxnlocks = htmp.st_nlocks;
+ lt->obj_stat[i].st_nobjects = htmp.st_nobjects;
+ lt->obj_stat[i].st_maxnobjects = htmp.st_nobjects;
+ }
+ }
+
+ for (i = 0; i < region->part_t_size; i++) {
+ stats->st_nlocks += lt->part_array[i].part_stat.st_nlocks;
+ stats->st_maxnlocks +=
+ lt->part_array[i].part_stat.st_maxnlocks;
+ stats->st_nobjects += lt->part_array[i].part_stat.st_nobjects;
+ stats->st_maxnobjects +=
+ lt->part_array[i].part_stat.st_maxnobjects;
+ stats->st_locksteals +=
+ lt->part_array[i].part_stat.st_locksteals;
+ if (stats->st_maxlsteals <
+ lt->part_array[i].part_stat.st_locksteals)
+ stats->st_maxlsteals =
+ lt->part_array[i].part_stat.st_locksteals;
+ stats->st_objectsteals +=
+ lt->part_array[i].part_stat.st_objectsteals;
+ if (stats->st_maxosteals <
+ lt->part_array[i].part_stat.st_objectsteals)
+ stats->st_maxosteals =
+ lt->part_array[i].part_stat.st_objectsteals;
+ __mutex_set_wait_info(env,
+ lt->part_array[i].mtx_part, &tmp_wait, &tmp_nowait);
+ stats->st_part_nowait += tmp_nowait;
+ stats->st_part_wait += tmp_wait;
+ if (tmp_wait > stats->st_part_max_wait) {
+ stats->st_part_max_nowait = tmp_nowait;
+ stats->st_part_max_wait = tmp_wait;
+ }
+
+ if (LF_ISSET(DB_STAT_CLEAR)) {
+ ptmp = lt->part_array[i].part_stat;
+ memset(&lt->part_array[i].part_stat,
+ 0, sizeof(lt->part_array[i].part_stat));
+ lt->part_array[i].part_stat.st_nlocks =
+ ptmp.st_nlocks;
+ lt->part_array[i].part_stat.st_maxnlocks =
+ ptmp.st_nlocks;
+ lt->part_array[i].part_stat.st_nobjects =
+ ptmp.st_nobjects;
+ lt->part_array[i].part_stat.st_maxnobjects =
+ ptmp.st_nobjects;
+ }
+ }
+
+ __mutex_set_wait_info(env, region->mtx_region,
+ &stats->st_region_wait, &stats->st_region_nowait);
+ __mutex_set_wait_info(env, region->mtx_dd,
+ &stats->st_objs_wait, &stats->st_objs_nowait);
+ __mutex_set_wait_info(env, region->mtx_lockers,
+ &stats->st_lockers_wait, &stats->st_lockers_nowait);
+ stats->st_regsize = lt->reginfo.rp->size;
+ if (LF_ISSET(DB_STAT_CLEAR)) {
+ tmp = region->stat;
+ memset(&region->stat, 0, sizeof(region->stat));
+ if (!LF_ISSET(DB_STAT_SUBSYSTEM)) {
+ __mutex_clear(env, region->mtx_region);
+ __mutex_clear(env, region->mtx_dd);
+ __mutex_clear(env, region->mtx_lockers);
+ for (i = 0; i < region->part_t_size; i++)
+ __mutex_clear(env, lt->part_array[i].mtx_part);
+ }
+
+ region->stat.st_maxlocks = tmp.st_maxlocks;
+ region->stat.st_maxlockers = tmp.st_maxlockers;
+ region->stat.st_maxobjects = tmp.st_maxobjects;
+ region->stat.st_nlocks =
+ region->stat.st_maxnlocks = tmp.st_nlocks;
+ region->stat.st_maxnlockers = region->nlockers;
+ region->stat.st_nobjects =
+ region->stat.st_maxnobjects = tmp.st_nobjects;
+ region->stat.st_partitions = tmp.st_partitions;
+ region->stat.st_tablesize = tmp.st_tablesize;
+ }
+
+ LOCK_REGION_UNLOCK(env);
+
+ *statp = stats;
+ return (0);
+}
+
+/*
+ * __lock_stat_print_pp --
+ * ENV->lock_stat_print pre/post processing.
+ *
+ * PUBLIC: int __lock_stat_print_pp __P((DB_ENV *, u_int32_t));
+ */
+int
+__lock_stat_print_pp(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->lk_handle, "DB_ENV->lock_stat_print", DB_INIT_LOCK);
+
+#define DB_STAT_LOCK_FLAGS \
+ (DB_STAT_ALL | DB_STAT_ALLOC | DB_STAT_CLEAR | DB_STAT_LOCK_CONF |\
+ DB_STAT_LOCK_LOCKERS | DB_STAT_LOCK_OBJECTS | DB_STAT_LOCK_PARAMS)
+ if ((ret = __db_fchk(env, "DB_ENV->lock_stat_print",
+ flags, DB_STAT_CLEAR | DB_STAT_LOCK_FLAGS)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__lock_stat_print(env, flags)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __lock_stat_print --
+ * ENV->lock_stat_print method.
+ *
+ * PUBLIC: int __lock_stat_print __P((ENV *, u_int32_t));
+ */
+int
+__lock_stat_print(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ u_int32_t orig_flags;
+ int ret;
+
+ orig_flags = flags;
+ LF_CLR(DB_STAT_CLEAR | DB_STAT_SUBSYSTEM);
+ if (flags == 0 || LF_ISSET(DB_STAT_ALL)) {
+ ret = __lock_print_stats(env, orig_flags);
+ if (flags == 0 || ret != 0)
+ return (ret);
+ }
+
+ if (LF_ISSET(DB_STAT_ALL | DB_STAT_LOCK_CONF | DB_STAT_LOCK_LOCKERS |
+ DB_STAT_LOCK_OBJECTS | DB_STAT_LOCK_PARAMS) &&
+ (ret = __lock_print_all(env, orig_flags)) != 0)
+ return (ret);
+
+ return (0);
+}
+
+/*
+ * __lock_print_stats --
+ * Display default lock region statistics.
+ */
+static int
+__lock_print_stats(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ DB_LOCK_STAT *sp;
+ int ret;
+
+#ifdef LOCK_DIAGNOSTIC
+ DB_LOCKTAB *lt;
+ DB_LOCKREGION *region;
+ u_int32_t i;
+ uintmax_t wait, nowait;
+
+ lt = env->lk_handle;
+ region = lt->reginfo.primary;
+
+ for (i = 0; i < region->object_t_size; i++) {
+ if (lt->obj_stat[i].st_hash_len == 0)
+ continue;
+ __db_dl(env,
+ "Hash bucket", (u_long)i);
+ __db_dl(env, "Partition", (u_long)LOCK_PART(region, i));
+ __mutex_set_wait_info(env,
+ lt->part_array[LOCK_PART(region, i)].mtx_part,
+ &wait, &nowait);
+ __db_dl_pct(env,
+ "The number of partition mutex requests that required waiting",
+ (u_long)wait, DB_PCT(wait, wait + nowait), NULL);
+ __db_dl(env,
+ "Maximum hash bucket length",
+ (u_long)lt->obj_stat[i].st_hash_len);
+ __db_dl(env,
+ "Total number of locks requested",
+ (u_long)lt->obj_stat[i].st_nrequests);
+ __db_dl(env,
+ "Total number of locks released",
+ (u_long)lt->obj_stat[i].st_nreleases);
+ __db_dl(env,
+ "Total number of locks upgraded",
+ (u_long)lt->obj_stat[i].st_nupgrade);
+ __db_dl(env,
+ "Total number of locks downgraded",
+ (u_long)lt->obj_stat[i].st_ndowngrade);
+ __db_dl(env,
+ "Lock requests not available due to conflicts, for which we waited",
+ (u_long)lt->obj_stat[i].st_lock_wait);
+ __db_dl(env,
+ "Lock requests not available due to conflicts, for which we did not wait",
+ (u_long)lt->obj_stat[i].st_lock_nowait);
+ __db_dl(env, "Number of locks that have timed out",
+ (u_long)lt->obj_stat[i].st_nlocktimeouts);
+ __db_dl(env, "Number of transactions that have timed out",
+ (u_long)lt->obj_stat[i].st_ntxntimeouts);
+ }
+#endif
+ if ((ret = __lock_stat(env, &sp, flags)) != 0)
+ return (ret);
+
+ if (LF_ISSET(DB_STAT_ALL))
+ __db_msg(env, "Default locking region information:");
+ __db_dl(env, "Last allocated locker ID", (u_long)sp->st_id);
+ __db_msg(env, "%#lx\tCurrent maximum unused locker ID",
+ (u_long)sp->st_cur_maxid);
+ __db_dl(env, "Number of lock modes", (u_long)sp->st_nmodes);
+ __db_dl(env,
+ "Initial number of locks allocated", (u_long)sp->st_initlocks);
+ __db_dl(env,
+ "Initial number of lockers allocated", (u_long)sp->st_initlockers);
+ __db_dl(env, "Initial number of lock objects allocated",
+ (u_long)sp->st_initobjects);
+ __db_dl(env,
+ "Maximum number of locks possible", (u_long)sp->st_maxlocks);
+ __db_dl(env,
+ "Maximum number of lockers possible", (u_long)sp->st_maxlockers);
+ __db_dl(env, "Maximum number of lock objects possible",
+ (u_long)sp->st_maxobjects);
+ __db_dl(env,
+ "Current number of locks allocated", (u_long)sp->st_locks);
+ __db_dl(env,
+ "Current number of lockers allocated", (u_long)sp->st_lockers);
+ __db_dl(env, "Current number of lock objects allocated",
+ (u_long)sp->st_objects);
+ __db_dl(env, "Number of lock object partitions",
+ (u_long)sp->st_partitions);
+ __db_dl(env, "Size of object hash table",
+ (u_long)sp->st_tablesize);
+ __db_dl(env, "Number of current locks", (u_long)sp->st_nlocks);
+ __db_dl(env, "Maximum number of locks at any one time",
+ (u_long)sp->st_maxnlocks);
+ __db_dl(env, "Maximum number of locks in any one bucket",
+ (u_long)sp->st_maxhlocks);
+	__db_dl(env, "Maximum number of locks stolen for an empty partition",
+ (u_long)sp->st_locksteals);
+ __db_dl(env, "Maximum number of locks stolen for any one partition",
+ (u_long)sp->st_maxlsteals);
+ __db_dl(env, "Number of current lockers", (u_long)sp->st_nlockers);
+ __db_dl(env, "Maximum number of lockers at any one time",
+ (u_long)sp->st_maxnlockers);
+ __db_dl(env,
+ "Number of current lock objects", (u_long)sp->st_nobjects);
+ __db_dl(env, "Maximum number of lock objects at any one time",
+ (u_long)sp->st_maxnobjects);
+ __db_dl(env, "Maximum number of lock objects in any one bucket",
+ (u_long)sp->st_maxhobjects);
+ __db_dl(env,
+	    "Maximum number of objects stolen for an empty partition",
+ (u_long)sp->st_objectsteals);
+ __db_dl(env, "Maximum number of objects stolen for any one partition",
+ (u_long)sp->st_maxosteals);
+ __db_dl(env,
+ "Total number of locks requested", (u_long)sp->st_nrequests);
+ __db_dl(env,
+ "Total number of locks released", (u_long)sp->st_nreleases);
+ __db_dl(env,
+ "Total number of locks upgraded", (u_long)sp->st_nupgrade);
+ __db_dl(env,
+ "Total number of locks downgraded", (u_long)sp->st_ndowngrade);
+ __db_dl(env,
+ "Lock requests not available due to conflicts, for which we waited",
+ (u_long)sp->st_lock_wait);
+ __db_dl(env,
+ "Lock requests not available due to conflicts, for which we did not wait",
+ (u_long)sp->st_lock_nowait);
+ __db_dl(env, "Number of deadlocks", (u_long)sp->st_ndeadlocks);
+ __db_dl(env, "Lock timeout value", (u_long)sp->st_locktimeout);
+ __db_dl(env, "Number of locks that have timed out",
+ (u_long)sp->st_nlocktimeouts);
+ __db_dl(env,
+ "Transaction timeout value", (u_long)sp->st_txntimeout);
+ __db_dl(env, "Number of transactions that have timed out",
+ (u_long)sp->st_ntxntimeouts);
+
+ __db_dlbytes(env, "Region size",
+ (u_long)0, (u_long)0, (u_long)sp->st_regsize);
+ __db_dl_pct(env,
+ "The number of partition locks that required waiting",
+ (u_long)sp->st_part_wait, DB_PCT(
+ sp->st_part_wait, sp->st_part_wait + sp->st_part_nowait), NULL);
+ __db_dl_pct(env,
+ "The maximum number of times any partition lock was waited for",
+ (u_long)sp->st_part_max_wait, DB_PCT(sp->st_part_max_wait,
+ sp->st_part_max_wait + sp->st_part_max_nowait), NULL);
+ __db_dl_pct(env,
+ "The number of object queue operations that required waiting",
+ (u_long)sp->st_objs_wait, DB_PCT(sp->st_objs_wait,
+ sp->st_objs_wait + sp->st_objs_nowait), NULL);
+ __db_dl_pct(env,
+ "The number of locker allocations that required waiting",
+ (u_long)sp->st_lockers_wait, DB_PCT(sp->st_lockers_wait,
+ sp->st_lockers_wait + sp->st_lockers_nowait), NULL);
+ __db_dl_pct(env,
+ "The number of region locks that required waiting",
+ (u_long)sp->st_region_wait, DB_PCT(sp->st_region_wait,
+ sp->st_region_wait + sp->st_region_nowait), NULL);
+ __db_dl(env, "Maximum hash bucket length",
+ (u_long)sp->st_hash_len);
+
+ __os_ufree(env, sp);
+
+ return (0);
+}
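+
+/*
+ * Illustrative sketch, not part of this file: the numbers printed above
+ * are also available programmatically through DB_ENV->lock_stat, which
+ * returns an allocated DB_LOCK_STAT the application is expected to free:
+ *
+ *	DB_LOCK_STAT *sp;
+ *	int ret;
+ *
+ *	if ((ret = dbenv->lock_stat(dbenv, &sp, 0)) == 0) {
+ *		printf("locks now/max: %lu/%lu\n",
+ *		    (u_long)sp->st_nlocks, (u_long)sp->st_maxnlocks);
+ *		free(sp);
+ *	}
+ */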
+
+/*
+ * __lock_print_all --
+ * Display debugging lock region statistics.
+ */
+static int
+__lock_print_all(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ DB_LOCKER *lip;
+ DB_LOCKOBJ *op;
+ DB_LOCKREGION *lrp;
+ DB_LOCKTAB *lt;
+ DB_MSGBUF mb;
+ int i, j;
+ u_int32_t k;
+
+ lt = env->lk_handle;
+ lrp = lt->reginfo.primary;
+ DB_MSGBUF_INIT(&mb);
+
+ LOCK_REGION_LOCK(env);
+ __db_print_reginfo(env, &lt->reginfo, "Lock", flags);
+
+ if (LF_ISSET(DB_STAT_ALL | DB_STAT_LOCK_PARAMS)) {
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "Lock region parameters:");
+ __mutex_print_debug_single(env,
+ "Lock region region mutex", lrp->mtx_region, flags);
+ STAT_ULONG("locker table size", lrp->locker_t_size);
+ STAT_ULONG("object table size", lrp->object_t_size);
+ STAT_ULONG("obj_off", lrp->obj_off);
+ STAT_ULONG("locker_off", lrp->locker_off);
+ STAT_ULONG("need_dd", lrp->need_dd);
+ if (timespecisset(&lrp->next_timeout)) {
+#ifdef HAVE_STRFTIME
+ time_t t = (time_t)lrp->next_timeout.tv_sec;
+ char tbuf[64];
+ if (strftime(tbuf, sizeof(tbuf),
+ "%m-%d-%H:%M:%S", localtime(&t)) != 0)
+ __db_msg(env, "next_timeout: %s.%09lu",
+ tbuf, (u_long)lrp->next_timeout.tv_nsec);
+ else
+#endif
+ __db_msg(env, "next_timeout: %lu.%09lu",
+ (u_long)lrp->next_timeout.tv_sec,
+ (u_long)lrp->next_timeout.tv_nsec);
+ }
+ }
+
+ if (LF_ISSET(DB_STAT_ALL | DB_STAT_LOCK_CONF)) {
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "Lock conflict matrix:");
+ for (i = 0; i < lrp->stat.st_nmodes; i++) {
+ for (j = 0; j < lrp->stat.st_nmodes; j++)
+ __db_msgadd(env, &mb, "%lu\t", (u_long)
+ lt->conflicts[i * lrp->stat.st_nmodes + j]);
+ DB_MSGBUF_FLUSH(env, &mb);
+ }
+ }
+ LOCK_REGION_UNLOCK(env);
+
+ if (LF_ISSET(DB_STAT_ALL | DB_STAT_LOCK_LOCKERS)) {
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "Locks grouped by lockers:");
+ __lock_print_header(env);
+ LOCK_LOCKERS(env, lrp);
+ for (k = 0; k < lrp->locker_t_size; k++)
+ SH_TAILQ_FOREACH(
+ lip, &lt->locker_tab[k], links, __db_locker)
+ (void)__lock_dump_locker(env, &mb, lt, lip);
+ UNLOCK_LOCKERS(env, lrp);
+ }
+
+ if (LF_ISSET(DB_STAT_ALL | DB_STAT_LOCK_OBJECTS)) {
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "Locks grouped by object:");
+ __lock_print_header(env);
+ for (k = 0; k < lrp->object_t_size; k++) {
+ OBJECT_LOCK_NDX(lt, lrp, k);
+ SH_TAILQ_FOREACH(
+ op, &lt->obj_tab[k], links, __db_lockobj) {
+ (void)__lock_dump_object(lt, &mb, op);
+ __db_msg(env, "%s", "");
+ }
+ OBJECT_UNLOCK(lt, lrp, k);
+ }
+ }
+
+ return (0);
+}
+
+static int
+__lock_dump_locker(env, mbp, lt, lip)
+ ENV *env;
+ DB_MSGBUF *mbp;
+ DB_LOCKTAB *lt;
+ DB_LOCKER *lip;
+{
+ DB_LOCKREGION *lrp;
+ struct __db_lock *lp;
+ char buf[DB_THREADID_STRLEN];
+ u_int32_t ndx;
+
+ lrp = lt->reginfo.primary;
+
+ __db_msgadd(env,
+ mbp, "%8lx dd=%2ld locks held %-4d write locks %-4d pid/thread %s",
+ (u_long)lip->id, (long)lip->dd_id, lip->nlocks, lip->nwrites,
+ env->dbenv->thread_id_string(env->dbenv, lip->pid, lip->tid, buf));
+ __db_msgadd(env, mbp,
+ " flags %-4x priority %-10u", lip->flags, lip->priority);
+
+ if (timespecisset(&lip->tx_expire)) {
+#ifdef HAVE_STRFTIME
+ time_t t = (time_t)lip->tx_expire.tv_sec;
+ char tbuf[64];
+ if (strftime(tbuf, sizeof(tbuf),
+ "%m-%d-%H:%M:%S", localtime(&t)) != 0)
+ __db_msgadd(env, mbp, "expires %s.%09lu",
+ tbuf, (u_long)lip->tx_expire.tv_nsec);
+ else
+#endif
+ __db_msgadd(env, mbp, "expires %lu.%09lu",
+ (u_long)lip->tx_expire.tv_sec,
+ (u_long)lip->tx_expire.tv_nsec);
+ }
+ if (F_ISSET(lip, DB_LOCKER_TIMEOUT))
+ __db_msgadd(
+ env, mbp, " lk timeout %lu", (u_long)lip->lk_timeout);
+ if (timespecisset(&lip->lk_expire)) {
+#ifdef HAVE_STRFTIME
+ time_t t = (time_t)lip->lk_expire.tv_sec;
+ char tbuf[64];
+ if (strftime(tbuf,
+ sizeof(tbuf), "%m-%d-%H:%M:%S", localtime(&t)) != 0)
+ __db_msgadd(env, mbp, " lk expires %s.%09lu",
+ tbuf, (u_long)lip->lk_expire.tv_nsec);
+ else
+#endif
+ __db_msgadd(env, mbp, " lk expires %lu.%09lu",
+ (u_long)lip->lk_expire.tv_sec,
+ (u_long)lip->lk_expire.tv_nsec);
+ }
+ DB_MSGBUF_FLUSH(env, mbp);
+
+ /*
+ * We need some care here since the list may change while we
+ * look.
+ */
+retry: SH_LIST_FOREACH(lp, &lip->heldby, locker_links, __db_lock) {
+ if (!SH_LIST_EMPTY(&lip->heldby) && lp != NULL) {
+ ndx = lp->indx;
+ OBJECT_LOCK_NDX(lt, lrp, ndx);
+ if (lp->indx == ndx)
+ __lock_printlock(lt, mbp, lp, 1);
+ else {
+ OBJECT_UNLOCK(lt, lrp, ndx);
+ goto retry;
+ }
+ OBJECT_UNLOCK(lt, lrp, ndx);
+ }
+ }
+ return (0);
+}
+
+static int
+__lock_dump_object(lt, mbp, op)
+ DB_LOCKTAB *lt;
+ DB_MSGBUF *mbp;
+ DB_LOCKOBJ *op;
+{
+ struct __db_lock *lp;
+
+ SH_TAILQ_FOREACH(lp, &op->holders, links, __db_lock)
+ __lock_printlock(lt, mbp, lp, 1);
+ SH_TAILQ_FOREACH(lp, &op->waiters, links, __db_lock)
+ __lock_printlock(lt, mbp, lp, 1);
+ return (0);
+}
+
+/*
+ * __lock_print_header --
+ */
+static void
+__lock_print_header(env)
+ ENV *env;
+{
+ __db_msg(env, "%-8s %-10s%-4s %-7s %s",
+ "Locker", "Mode",
+ "Count", "Status", "----------------- Object ---------------");
+}
+
+/*
+ * __lock_printlock --
+ *
+ * PUBLIC: void __lock_printlock
+ * PUBLIC: __P((DB_LOCKTAB *, DB_MSGBUF *mbp, struct __db_lock *, int));
+ */
+void
+__lock_printlock(lt, mbp, lp, ispgno)
+ DB_LOCKTAB *lt;
+ DB_MSGBUF *mbp;
+ struct __db_lock *lp;
+ int ispgno;
+{
+ DB_LOCKOBJ *lockobj;
+ DB_MSGBUF mb;
+ ENV *env;
+ db_pgno_t pgno;
+ u_int32_t *fidp, type;
+ u_int8_t *ptr;
+ char *fname, *dname, *p, namebuf[26];
+ const char *mode, *status;
+
+ env = lt->env;
+
+ if (mbp == NULL) {
+ DB_MSGBUF_INIT(&mb);
+ mbp = &mb;
+ }
+
+ switch (lp->mode) {
+ case DB_LOCK_IREAD:
+ mode = "IREAD";
+ break;
+ case DB_LOCK_IWR:
+ mode = "IWR";
+ break;
+ case DB_LOCK_IWRITE:
+ mode = "IWRITE";
+ break;
+ case DB_LOCK_NG:
+ mode = "NG";
+ break;
+ case DB_LOCK_READ:
+ mode = "READ";
+ break;
+ case DB_LOCK_READ_UNCOMMITTED:
+ mode = "READ_UNCOMMITTED";
+ break;
+ case DB_LOCK_WRITE:
+ mode = "WRITE";
+ break;
+ case DB_LOCK_WWRITE:
+ mode = "WAS_WRITE";
+ break;
+ case DB_LOCK_WAIT:
+ mode = "WAIT";
+ break;
+ default:
+ mode = "UNKNOWN";
+ break;
+ }
+ switch (lp->status) {
+ case DB_LSTAT_ABORTED:
+ status = "ABORT";
+ break;
+ case DB_LSTAT_EXPIRED:
+ status = "EXPIRED";
+ break;
+ case DB_LSTAT_FREE:
+ status = "FREE";
+ break;
+ case DB_LSTAT_HELD:
+ status = "HELD";
+ break;
+ case DB_LSTAT_PENDING:
+ status = "PENDING";
+ break;
+ case DB_LSTAT_WAITING:
+ status = "WAIT";
+ break;
+ default:
+ status = "UNKNOWN";
+ break;
+ }
+ __db_msgadd(env, mbp, "%8lx %-10s %4lu %-7s ",
+ (u_long)((DB_LOCKER *)R_ADDR(&lt->reginfo, lp->holder))->id,
+ mode, (u_long)lp->refcount, status);
+
+ lockobj = SH_OFF_TO_PTR(lp, lp->obj, DB_LOCKOBJ);
+ ptr = SH_DBT_PTR(&lockobj->lockobj);
+ if (ispgno && lockobj->lockobj.size == sizeof(struct __db_ilock)) {
+		/* Assume this is a DB page/record lock (an ILOCK). */
+ memcpy(&pgno, ptr, sizeof(db_pgno_t));
+ fidp = (u_int32_t *)(ptr + sizeof(db_pgno_t));
+ type = *(u_int32_t *)(ptr + sizeof(db_pgno_t) + DB_FILE_ID_LEN);
+ (void)__dbreg_get_name(
+ lt->env, (u_int8_t *)fidp, &fname, &dname);
+ if (fname == NULL && dname == NULL)
+ __db_msgadd(env, mbp, "(%lx %lx %lx %lx %lx) ",
+ (u_long)fidp[0], (u_long)fidp[1], (u_long)fidp[2],
+ (u_long)fidp[3], (u_long)fidp[4]);
+ else {
+ if (fname != NULL && dname != NULL) {
+ (void)snprintf(namebuf, sizeof(namebuf),
+ "%14s:%-10s", fname, dname);
+ p = namebuf;
+ } else if (fname != NULL)
+ p = fname;
+ else
+ p = dname;
+ __db_msgadd(env, mbp, "%-25s ", p);
+ }
+ __db_msgadd(env, mbp, "%-7s %7lu",
+ type == DB_PAGE_LOCK ? "page" :
+ type == DB_RECORD_LOCK ? "record" :
+ type == DB_DATABASE_LOCK ? "database" : "handle",
+ (u_long)pgno);
+ } else {
+ __db_msgadd(env, mbp, "0x%lx ",
+ (u_long)R_OFFSET(&lt->reginfo, lockobj));
+ __db_prbytes(env, mbp, ptr, lockobj->lockobj.size);
+ }
+ DB_MSGBUF_FLUSH(env, mbp);
+}
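+
+/*
+ * For reference, the ispgno branch above is decoding a DB_LOCK_ILOCK,
+ * which is laid out roughly as:
+ *
+ *	struct __db_ilock {
+ *		db_pgno_t pgno;				(page number)
+ *		u_int8_t  fileid[DB_FILE_ID_LEN];	(unique file ID)
+ *		u_int32_t type;				(DB_PAGE_LOCK, ...)
+ *	};
+ *
+ * hence the page number is read from the front of the object, the file
+ * ID immediately after it, and the lock type last.
+ */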
+
+#else /* !HAVE_STATISTICS */
+
+int
+__lock_stat_pp(dbenv, statp, flags)
+ DB_ENV *dbenv;
+ DB_LOCK_STAT **statp;
+ u_int32_t flags;
+{
+ COMPQUIET(statp, NULL);
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(dbenv->env));
+}
+
+int
+__lock_stat_print_pp(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(dbenv->env));
+}
+#endif
diff --git a/src/lock/lock_stub.c b/src/lock/lock_stub.c
new file mode 100644
index 00000000..3875af55
--- /dev/null
+++ b/src/lock/lock_stub.c
@@ -0,0 +1,631 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/lock.h"
+
+/*
+ * If the library wasn't compiled with locking support, various routines
+ * aren't available. Stub them here, returning an appropriate error.
+ */
+static int __db_nolocking __P((ENV *));
+
+/*
+ * __db_nolocking --
+ * Error when a Berkeley DB build doesn't include the locking subsystem.
+ */
+static int
+__db_nolocking(env)
+ ENV *env;
+{
+ __db_errx(env, DB_STR("2054",
+ "library build did not include support for locking"));
+ return (DB_OPNOTSUP);
+}
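+
+/*
+ * Illustrative sketch, not part of this file: in a build without the
+ * locking subsystem, every stubbed DB_ENV locking method below fails in
+ * the same way.  For example:
+ *
+ *	u_int32_t id;
+ *
+ *	ret = dbenv->lock_id(dbenv, &id);
+ *
+ * returns DB_OPNOTSUP after reporting the message above through the
+ * environment's error stream.
+ */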
+
+int
+__lock_env_create(dbenv)
+ DB_ENV *dbenv;
+{
+ COMPQUIET(dbenv, 0);
+ return (0);
+}
+
+void
+__lock_env_destroy(dbenv)
+ DB_ENV *dbenv;
+{
+ COMPQUIET(dbenv, 0);
+}
+
+int
+__lock_get_lk_conflicts(dbenv, lk_conflictsp, lk_modesp)
+ DB_ENV *dbenv;
+ const u_int8_t **lk_conflictsp;
+ int *lk_modesp;
+{
+ COMPQUIET(lk_conflictsp, NULL);
+ COMPQUIET(lk_modesp, NULL);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_get_lk_detect(dbenv, lk_detectp)
+ DB_ENV *dbenv;
+ u_int32_t *lk_detectp;
+{
+ COMPQUIET(lk_detectp, NULL);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_get_lk_init_lockers(dbenv, lk_initp)
+ DB_ENV *dbenv;
+ u_int32_t *lk_initp;
+{
+ COMPQUIET(lk_initp, NULL);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_get_lk_init_locks(dbenv, lk_initp)
+ DB_ENV *dbenv;
+ u_int32_t *lk_initp;
+{
+ COMPQUIET(lk_initp, NULL);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_get_lk_init_objects(dbenv, lk_initp)
+ DB_ENV *dbenv;
+ u_int32_t *lk_initp;
+{
+ COMPQUIET(lk_initp, NULL);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_get_lk_max_lockers(dbenv, lk_maxp)
+ DB_ENV *dbenv;
+ u_int32_t *lk_maxp;
+{
+ COMPQUIET(lk_maxp, NULL);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_get_lk_max_locks(dbenv, lk_maxp)
+ DB_ENV *dbenv;
+ u_int32_t *lk_maxp;
+{
+ COMPQUIET(lk_maxp, NULL);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_get_lk_max_objects(dbenv, lk_maxp)
+ DB_ENV *dbenv;
+ u_int32_t *lk_maxp;
+{
+ COMPQUIET(lk_maxp, NULL);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_get_lk_partitions(dbenv, lk_maxp)
+ DB_ENV *dbenv;
+ u_int32_t *lk_maxp;
+{
+ COMPQUIET(lk_maxp, NULL);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_get_lk_tablesize(dbenv, lk_tablesizep)
+ DB_ENV *dbenv;
+ u_int32_t *lk_tablesizep;
+{
+ COMPQUIET(lk_tablesizep, NULL);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_set_lk_tablesize(dbenv, lk_tablesize)
+ DB_ENV *dbenv;
+ u_int32_t lk_tablesize;
+{
+ COMPQUIET(lk_tablesize, 0);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_get_lk_priority(dbenv, lockid, priorityp)
+ DB_ENV *dbenv;
+ u_int32_t lockid, *priorityp;
+{
+ COMPQUIET(lockid, 0);
+ COMPQUIET(priorityp, NULL);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_set_lk_priority(dbenv, lockid, priority)
+ DB_ENV *dbenv;
+ u_int32_t lockid, priority;
+{
+ COMPQUIET(lockid, 0);
+ COMPQUIET(priority, 0);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_get_env_timeout(dbenv, timeoutp, flag)
+ DB_ENV *dbenv;
+ db_timeout_t *timeoutp;
+ u_int32_t flag;
+{
+ COMPQUIET(timeoutp, NULL);
+ COMPQUIET(flag, 0);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_detect_pp(dbenv, flags, atype, abortp)
+ DB_ENV *dbenv;
+ u_int32_t flags, atype;
+ int *abortp;
+{
+ COMPQUIET(flags, 0);
+ COMPQUIET(atype, 0);
+ COMPQUIET(abortp, NULL);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_get_pp(dbenv, locker, flags, obj, lock_mode, lock)
+ DB_ENV *dbenv;
+ u_int32_t locker, flags;
+ DBT *obj;
+ db_lockmode_t lock_mode;
+ DB_LOCK *lock;
+{
+ COMPQUIET(locker, 0);
+ COMPQUIET(flags, 0);
+ COMPQUIET(obj, NULL);
+ COMPQUIET(lock_mode, 0);
+ COMPQUIET(lock, NULL);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_id_pp(dbenv, idp)
+ DB_ENV *dbenv;
+ u_int32_t *idp;
+{
+ COMPQUIET(idp, NULL);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_id_free_pp(dbenv, id)
+ DB_ENV *dbenv;
+ u_int32_t id;
+{
+ COMPQUIET(id, 0);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_put_pp(dbenv, lock)
+ DB_ENV *dbenv;
+ DB_LOCK *lock;
+{
+ COMPQUIET(lock, NULL);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_stat_pp(dbenv, statp, flags)
+ DB_ENV *dbenv;
+ DB_LOCK_STAT **statp;
+ u_int32_t flags;
+{
+ COMPQUIET(statp, NULL);
+ COMPQUIET(flags, 0);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_stat_print_pp(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_vec_pp(dbenv, locker, flags, list, nlist, elistp)
+ DB_ENV *dbenv;
+ u_int32_t locker, flags;
+ int nlist;
+ DB_LOCKREQ *list, **elistp;
+{
+ COMPQUIET(locker, 0);
+ COMPQUIET(flags, 0);
+ COMPQUIET(list, NULL);
+ COMPQUIET(nlist, 0);
+ COMPQUIET(elistp, NULL);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_set_lk_conflicts(dbenv, lk_conflicts, lk_modes)
+ DB_ENV *dbenv;
+ u_int8_t *lk_conflicts;
+ int lk_modes;
+{
+ COMPQUIET(lk_conflicts, NULL);
+ COMPQUIET(lk_modes, 0);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_set_lk_detect(dbenv, lk_detect)
+ DB_ENV *dbenv;
+ u_int32_t lk_detect;
+{
+ COMPQUIET(lk_detect, 0);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_set_lk_max_locks(dbenv, lk_max)
+ DB_ENV *dbenv;
+ u_int32_t lk_max;
+{
+ COMPQUIET(lk_max, 0);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_set_lk_max_lockers(dbenv, lk_max)
+ DB_ENV *dbenv;
+ u_int32_t lk_max;
+{
+ COMPQUIET(lk_max, 0);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_set_lk_max_objects(dbenv, lk_max)
+ DB_ENV *dbenv;
+ u_int32_t lk_max;
+{
+ COMPQUIET(lk_max, 0);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_set_lk_partitions(dbenv, lk_max)
+ DB_ENV *dbenv;
+ u_int32_t lk_max;
+{
+ COMPQUIET(lk_max, 0);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_set_env_timeout(dbenv, timeout, flags)
+ DB_ENV *dbenv;
+ db_timeout_t timeout;
+ u_int32_t flags;
+{
+ COMPQUIET(timeout, 0);
+ COMPQUIET(flags, 0);
+ return (__db_nolocking(dbenv->env));
+}
+
+int
+__lock_open(env)
+ ENV *env;
+{
+ return (__db_nolocking(env));
+}
+
+u_int32_t
+__lock_region_mutex_count(env)
+	ENV *env;
+{
+	COMPQUIET(env, NULL);
+	return (0);
+}
+
+u_int32_t
+__lock_region_mutex_max(env)
+	ENV *env;
+{
+	COMPQUIET(env, NULL);
+	return (0);
+}
+
+size_t
+__lock_region_max(env)
+	ENV *env;
+{
+	COMPQUIET(env, NULL);
+	return (0);
+}
+
+size_t
+__lock_region_size(env, other_alloc)
+	ENV *env;
+	size_t other_alloc;
+{
+	COMPQUIET(env, NULL);
+	COMPQUIET(other_alloc, 0);
+	return (0);
+}
+
+int
+__lock_id_free(env, sh_locker)
+ ENV *env;
+ DB_LOCKER *sh_locker;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(sh_locker, 0);
+ return (0);
+}
+
+int
+__lock_env_refresh(env)
+ ENV *env;
+{
+ COMPQUIET(env, NULL);
+ return (0);
+}
+
+int
+__lock_stat_print(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(flags, 0);
+ return (0);
+}
+
+int
+__lock_put(env, lock)
+ ENV *env;
+ DB_LOCK *lock;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(lock, NULL);
+ return (0);
+}
+
+int
+__lock_vec(env, sh_locker, flags, list, nlist, elistp)
+ ENV *env;
+ DB_LOCKER *sh_locker;
+ u_int32_t flags;
+ int nlist;
+ DB_LOCKREQ *list, **elistp;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(sh_locker, 0);
+ COMPQUIET(flags, 0);
+ COMPQUIET(list, NULL);
+ COMPQUIET(nlist, 0);
+ COMPQUIET(elistp, NULL);
+ return (0);
+}
+
+int
+__lock_get(env, locker, flags, obj, lock_mode, lock)
+ ENV *env;
+ DB_LOCKER *locker;
+ u_int32_t flags;
+ const DBT *obj;
+ db_lockmode_t lock_mode;
+ DB_LOCK *lock;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(locker, NULL);
+ COMPQUIET(flags, 0);
+ COMPQUIET(obj, NULL);
+ COMPQUIET(lock_mode, 0);
+ COMPQUIET(lock, NULL);
+ return (0);
+}
+
+int
+__lock_id(env, idp, lkp)
+ ENV *env;
+ u_int32_t *idp;
+ DB_LOCKER **lkp;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(idp, NULL);
+ COMPQUIET(lkp, NULL);
+ return (0);
+}
+
+int
+__lock_inherit_timeout(env, parent, locker)
+ ENV *env;
+ DB_LOCKER *parent, *locker;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(parent, NULL);
+ COMPQUIET(locker, NULL);
+ return (0);
+}
+
+int
+__lock_set_timeout(env, locker, timeout, op)
+ ENV *env;
+ DB_LOCKER *locker;
+ db_timeout_t timeout;
+ u_int32_t op;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(locker, NULL);
+ COMPQUIET(timeout, 0);
+ COMPQUIET(op, 0);
+ return (0);
+}
+
+int
+__lock_addfamilylocker(env, pid, id, is_family)
+ ENV *env;
+ u_int32_t pid, id, is_family;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(pid, 0);
+ COMPQUIET(id, 0);
+ COMPQUIET(is_family, 0);
+ return (0);
+}
+
+int
+__lock_freelocker(lt, sh_locker)
+ DB_LOCKTAB *lt;
+ DB_LOCKER *sh_locker;
+{
+ COMPQUIET(lt, NULL);
+ COMPQUIET(sh_locker, NULL);
+ return (0);
+}
+
+int
+__lock_familyremove(lt, sh_locker)
+ DB_LOCKTAB *lt;
+ DB_LOCKER *sh_locker;
+{
+ COMPQUIET(lt, NULL);
+ COMPQUIET(sh_locker, NULL);
+ return (0);
+}
+
+int
+__lock_downgrade(env, lock, new_mode, flags)
+ ENV *env;
+ DB_LOCK *lock;
+ db_lockmode_t new_mode;
+ u_int32_t flags;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(lock, NULL);
+ COMPQUIET(new_mode, 0);
+ COMPQUIET(flags, 0);
+ return (0);
+}
+
+int
+__lock_locker_same_family(env, locker1, locker2, retp)
+ ENV *env;
+ DB_LOCKER *locker1;
+ DB_LOCKER *locker2;
+ int *retp;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(locker1, NULL);
+ COMPQUIET(locker2, NULL);
+
+ *retp = 1;
+ return (0);
+}
+
+void
+__lock_set_thread_id(lref, pid, tid)
+ void *lref;
+ pid_t pid;
+ db_threadid_t tid;
+{
+ COMPQUIET(lref, NULL);
+ COMPQUIET(pid, 0);
+ COMPQUIET(tid, 0);
+}
+
+int
+__lock_failchk(env)
+ ENV *env;
+{
+ COMPQUIET(env, NULL);
+ return (0);
+}
+
+int
+__lock_get_list(env, locker, flags, lock_mode, list)
+ ENV *env;
+ DB_LOCKER *locker;
+ u_int32_t flags;
+ db_lockmode_t lock_mode;
+ DBT *list;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(locker, NULL);
+ COMPQUIET(flags, 0);
+ COMPQUIET(lock_mode, 0);
+ COMPQUIET(list, NULL);
+ return (0);
+}
+
+void
+__lock_list_print(env, mbp, list)
+ ENV *env;
+ DB_MSGBUF *mbp;
+ DBT *list;
+{
+	COMPQUIET(env, NULL);
+	COMPQUIET(mbp, NULL);
+	COMPQUIET(list, NULL);
+}
+
+int
+__lock_getlocker(lt, locker, create, retp)
+ DB_LOCKTAB *lt;
+ u_int32_t locker;
+ int create;
+ DB_LOCKER **retp;
+{
+ COMPQUIET(locker, 0);
+ COMPQUIET(create, 0);
+ COMPQUIET(retp, NULL);
+ return (__db_nolocking(lt->env));
+}
+
+int
+__lock_id_set(env, cur_id, max_id)
+ ENV *env;
+ u_int32_t cur_id, max_id;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(cur_id, 0);
+ COMPQUIET(max_id, 0);
+ return (0);
+}
+
+int
+__lock_wakeup(env, obj)
+ ENV *env;
+ const DBT *obj;
+{
+ COMPQUIET(obj, NULL);
+ return (__db_nolocking(env));
+}
+
+int
+__lock_change(env, old_lock, new_lock)
+ ENV *env;
+ DB_LOCK *old_lock, *new_lock;
+{
+	COMPQUIET(env, NULL);
+	COMPQUIET(old_lock, NULL);
+	COMPQUIET(new_lock, NULL);
+	return (0);
+}
diff --git a/src/lock/lock_timer.c b/src/lock/lock_timer.c
new file mode 100644
index 00000000..943047f0
--- /dev/null
+++ b/src/lock/lock_timer.c
@@ -0,0 +1,128 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/lock.h"
+
+/*
+ * __lock_set_timeout --
+ * Set timeout values in shared memory.
+ *
+ * This is called from the transaction system. We either set the time that
+ * this transaction expires or the amount of time a lock for this transaction
+ * is permitted to wait.
+ *
+ * PUBLIC: int __lock_set_timeout __P((ENV *,
+ * PUBLIC: DB_LOCKER *, db_timeout_t, u_int32_t));
+ */
+int
+__lock_set_timeout(env, locker, timeout, op)
+ ENV *env;
+ DB_LOCKER *locker;
+ db_timeout_t timeout;
+ u_int32_t op;
+{
+ int ret;
+
+ if (locker == NULL)
+ return (0);
+ LOCK_REGION_LOCK(env);
+ ret = __lock_set_timeout_internal(env, locker, timeout, op);
+ LOCK_REGION_UNLOCK(env);
+ return (ret);
+}
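+
+/*
+ * Illustrative sketch, not part of this file: applications normally reach
+ * this code through DB_ENV->set_timeout or DB_TXN->set_timeout, with the
+ * timeout expressed in microseconds:
+ *
+ *	ret = dbenv->set_timeout(dbenv, 1000000, DB_SET_LOCK_TIMEOUT);
+ *	ret = dbenv->set_timeout(dbenv, 5000000, DB_SET_TXN_TIMEOUT);
+ */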
+
+/*
+ * __lock_set_timeout_internal
+ * -- set timeout values in shared memory.
+ *
+ * This is the internal version called from the lock system. We either set
+ * the time that this transaction expires or the amount of time that a lock
+ * for this transaction is permitted to wait.
+ *
+ * PUBLIC: int __lock_set_timeout_internal
+ * PUBLIC: __P((ENV *, DB_LOCKER *, db_timeout_t, u_int32_t));
+ */
+int
+__lock_set_timeout_internal(env, sh_locker, timeout, op)
+ ENV *env;
+ DB_LOCKER *sh_locker;
+ db_timeout_t timeout;
+ u_int32_t op;
+{
+ DB_LOCKREGION *region;
+ region = env->lk_handle->reginfo.primary;
+
+ if (op == DB_SET_TXN_TIMEOUT) {
+ if (timeout == 0)
+ timespecclear(&sh_locker->tx_expire);
+ else
+ __clock_set_expires(env,
+ &sh_locker->tx_expire, timeout);
+ } else if (op == DB_SET_LOCK_TIMEOUT) {
+ sh_locker->lk_timeout = timeout;
+ F_SET(sh_locker, DB_LOCKER_TIMEOUT);
+ } else if (op == DB_SET_TXN_NOW) {
+ timespecclear(&sh_locker->tx_expire);
+ __clock_set_expires(env, &sh_locker->tx_expire, 0);
+ sh_locker->lk_expire = sh_locker->tx_expire;
+ if (!timespecisset(&region->next_timeout) ||
+ timespeccmp(
+ &region->next_timeout, &sh_locker->lk_expire, >))
+ region->next_timeout = sh_locker->lk_expire;
+ } else
+ return (EINVAL);
+
+ return (0);
+}
+
+/*
+ * __lock_inherit_timeout
+ * -- inherit timeout values from parent locker.
+ * This is called from the transaction system. This will
+ * return EINVAL if the parent does not exist or did not
+ * have a current txn timeout set.
+ *
+ * PUBLIC: int __lock_inherit_timeout __P((ENV *, DB_LOCKER *, DB_LOCKER *));
+ */
+int
+__lock_inherit_timeout(env, parent, locker)
+ ENV *env;
+ DB_LOCKER *parent, *locker;
+{
+ int ret;
+
+ ret = 0;
+ LOCK_REGION_LOCK(env);
+
+	/*
+	 * If the parent is not there yet, that's ok.  If it
+	 * does not have any timeouts set, then avoid creating
+	 * the child locker at this point.
+	 */
+	if (parent == NULL ||
+	    (!timespecisset(&parent->tx_expire) &&
+	    !F_ISSET(parent, DB_LOCKER_TIMEOUT))) {
+ ret = EINVAL;
+ goto err;
+ }
+
+ locker->tx_expire = parent->tx_expire;
+
+ if (F_ISSET(parent, DB_LOCKER_TIMEOUT)) {
+ locker->lk_timeout = parent->lk_timeout;
+ F_SET(locker, DB_LOCKER_TIMEOUT);
+ if (!timespecisset(&parent->tx_expire))
+ ret = EINVAL;
+ }
+
+err: LOCK_REGION_UNLOCK(env);
+ return (ret);
+}
diff --git a/src/lock/lock_util.c b/src/lock/lock_util.c
new file mode 100644
index 00000000..f7029cd7
--- /dev/null
+++ b/src/lock/lock_util.c
@@ -0,0 +1,98 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/hash.h"
+#include "dbinc/lock.h"
+
+/*
+ * The next two functions are the hash functions used to store objects in the
+ * lock hash tables. They are hashing the same items, but one (__lock_ohash)
+ * takes a DBT (used for hashing a parameter passed from the user) and the
+ * other (__lock_lhash) takes a DB_LOCKOBJ (used for hashing something that is
+ * already in the lock manager). In both cases, we have a special check to
+ * fast path the case where we think we are doing a hash on a DB page/fileid
+ * pair. If the size is right, then we do the fast hash.
+ *
+ * We know that DB uses DB_LOCK_ILOCK types for its lock objects. The first
+ * four bytes are the 4-byte page number and the next DB_FILE_ID_LEN bytes
+ * are a unique file id, where the first 4 bytes on UNIX systems are the file
+ * inode number, and the first 4 bytes on Windows systems are the FileIndexLow
+ * bytes. This is followed by a random number. The inode values tend
+ * to increment fairly slowly and are not good for hashing. So, we use
+ * the XOR of the page number and the four bytes of the file id random
+ * number to produce a 32-bit hash value.
+ *
+ * We have no particular reason to believe that this algorithm will produce
+ * a good hash, but we want a fast hash more than we want a good one, when
+ * we're coming through this code path.
+ */
+#define FAST_HASH(P) { \
+ u_int32_t __h; \
+ u_int8_t *__cp, *__hp; \
+ __hp = (u_int8_t *)&__h; \
+ __cp = (u_int8_t *)(P); \
+ __hp[0] = __cp[0] ^ __cp[12]; \
+ __hp[1] = __cp[1] ^ __cp[13]; \
+ __hp[2] = __cp[2] ^ __cp[14]; \
+ __hp[3] = __cp[3] ^ __cp[15]; \
+ return (__h); \
+}
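+
+/*
+ * A worked illustration of FAST_HASH: for a DB_LOCK_ILOCK, bytes 0-3 of
+ * the object hold the page number and bytes 12-15 fall inside the file
+ * ID, so the macro computes, byte by byte,
+ *
+ *	hash[0] = pgno[0] ^ fileid[8]
+ *	hash[1] = pgno[1] ^ fileid[9]
+ *	hash[2] = pgno[2] ^ fileid[10]
+ *	hash[3] = pgno[3] ^ fileid[11]
+ *
+ * and returns the four bytes reassembled as a u_int32_t, so locks on
+ * different pages of the same file differ in the page-number bytes alone.
+ */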
+
+/*
+ * __lock_ohash --
+ *
+ * PUBLIC: u_int32_t __lock_ohash __P((const DBT *));
+ */
+u_int32_t
+__lock_ohash(dbt)
+ const DBT *dbt;
+{
+ if (dbt->size == sizeof(DB_LOCK_ILOCK))
+ FAST_HASH(dbt->data);
+
+ return (__ham_func5(NULL, dbt->data, dbt->size));
+}
+
+/*
+ * __lock_lhash --
+ *
+ * PUBLIC: u_int32_t __lock_lhash __P((DB_LOCKOBJ *));
+ */
+u_int32_t
+__lock_lhash(lock_obj)
+ DB_LOCKOBJ *lock_obj;
+{
+ void *obj_data;
+
+ obj_data = SH_DBT_PTR(&lock_obj->lockobj);
+
+ if (lock_obj->lockobj.size == sizeof(DB_LOCK_ILOCK))
+ FAST_HASH(obj_data);
+
+ return (__ham_func5(NULL, obj_data, lock_obj->lockobj.size));
+}
+
+/*
+ * __lock_nomem --
+ * Report a lack of some resource.
+ *
+ * PUBLIC: int __lock_nomem __P((ENV *, const char *));
+ */
+int
+__lock_nomem(env, res)
+ ENV *env;
+ const char *res;
+{
+ __db_errx(env, DB_STR_A("2055", "Lock table is out of available %s",
+ "%s"), res);
+ return (ENOMEM);
+}
diff --git a/src/log/log.c b/src/log/log.c
new file mode 100644
index 00000000..5808145f
--- /dev/null
+++ b/src/log/log.c
@@ -0,0 +1,1727 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/hmac.h"
+#include "dbinc/log.h"
+#include "dbinc/txn.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+
+static int __log_init __P((ENV *, DB_LOG *));
+static int __log_recover __P((DB_LOG *));
+
+/*
+ * __log_open --
+ * Internal version of log_open: only called from ENV->open.
+ *
+ * PUBLIC: int __log_open __P((ENV *));
+ */
+int
+__log_open(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+ DB_LOG *dblp;
+ LOG *lp;
+ u_int8_t *bulk;
+ int region_locked, ret;
+
+ dbenv = env->dbenv;
+ region_locked = 0;
+
+ /* Create/initialize the DB_LOG structure. */
+ if ((ret = __os_calloc(env, 1, sizeof(DB_LOG), &dblp)) != 0)
+ return (ret);
+ dblp->env = env;
+
+ /* Join/create the log region. */
+ if ((ret = __env_region_share(env, &dblp->reginfo)) != 0)
+ goto err;
+
+ /* If we created the region, initialize it. */
+ if (F_ISSET(&dblp->reginfo, REGION_CREATE))
+ if ((ret = __log_init(env, dblp)) != 0)
+ goto err;
+
+ /* Set the local addresses. */
+ lp = dblp->reginfo.primary = R_ADDR(&dblp->reginfo,
+ ((REGENV *)env->reginfo->primary)->lg_primary);
+ dblp->bufp = R_ADDR(&dblp->reginfo, lp->buffer_off);
+
+ /*
+ * If the region is threaded, we have to lock the DBREG list, and we
+ * need to allocate a mutex for that purpose.
+ */
+ if ((ret = __mutex_alloc(env,
+ MTX_LOG_REGION, DB_MUTEX_PROCESS_ONLY, &dblp->mtx_dbreg)) != 0)
+ goto err;
+
+ /*
+ * Set the handle -- we may be about to run recovery, which allocates
+ * log cursors. Log cursors require logging be already configured,
+ * and the handle being set is what demonstrates that.
+ *
+ * If we created the region, run recovery. If that fails, make sure
+ * we reset the log handle before cleaning up, otherwise we will try
+ * and clean up again in the mainline ENV initialization code.
+ */
+ env->lg_handle = dblp;
+
+ if (F_ISSET(&dblp->reginfo, REGION_CREATE)) {
+ /*
+ * We first take the log file size from the environment, if
+ * specified. If that wasn't set, default it. Regardless,
+ * recovery may set it from the persistent information in a
+ * log file header.
+ */
+ if (lp->log_size == 0)
+ lp->log_size =
+ FLD_ISSET(dbenv->lg_flags, DB_LOG_IN_MEMORY) ?
+ LG_MAX_INMEM : LG_MAX_DEFAULT;
+
+ if ((ret = __log_recover(dblp)) != 0)
+ goto err;
+
+ /*
+ * If the next log file size hasn't been set yet, default it
+ * to the current log file size.
+ */
+ if (lp->log_nsize == 0)
+ lp->log_nsize = lp->log_size;
+
+ /*
+ * If we haven't written any log files, write the first one
+ * so that checkpoint gets a valid ckp_lsn value.
+ */
+ if (IS_INIT_LSN(lp->lsn) &&
+ (ret = __log_newfile(dblp, NULL, 0, 0)) != 0)
+ goto err;
+
+ /*
+ * Initialize replication's next-expected LSN value
+ * and replication's bulk buffer. In __env_open, we
+ * always create/open the replication region before
+ * the log region so we're assured that our rep_handle
+ * is valid at this point, if replication is being used.
+ */
+ lp->ready_lsn = lp->lsn;
+ if (IS_ENV_REPLICATED(env)) {
+ if ((ret =
+ __env_alloc(&dblp->reginfo, MEGABYTE, &bulk)) != 0)
+ goto err;
+ lp->bulk_buf = R_OFFSET(&dblp->reginfo, bulk);
+ lp->bulk_len = MEGABYTE;
+ lp->bulk_off = 0;
+ lp->wait_ts = env->rep_handle->request_gap;
+ __os_gettime(env, &lp->rcvd_ts, 1);
+ } else {
+ lp->bulk_buf = INVALID_ROFF;
+ lp->bulk_len = 0;
+ lp->bulk_off = 0;
+ }
+ } else {
+ /*
+ * A process joining the region may have reset the log file
+ * size, too. If so, it only affects the next log file we
+ * create. We need to check that the size is reasonable given
+ * the buffer size in the region.
+ */
+ LOG_SYSTEM_LOCK(env);
+ region_locked = 1;
+
+ if (dbenv->lg_size != 0) {
+ if ((ret =
+ __log_check_sizes(env, dbenv->lg_size, 0)) != 0)
+ goto err;
+
+ lp->log_nsize = dbenv->lg_size;
+ }
+
+ LOG_SYSTEM_UNLOCK(env);
+ region_locked = 0;
+
+ if (dbenv->lg_flags != 0 && (ret =
+ __log_set_config_int(dbenv, dbenv->lg_flags, 1, 0)) != 0)
+ return (ret);
+ }
+ dblp->reginfo.mtx_alloc = lp->mtx_region;
+
+ return (0);
+
+err: if (dblp->reginfo.addr != NULL) {
+ if (region_locked)
+ LOG_SYSTEM_UNLOCK(env);
+ (void)__env_region_detach(env, &dblp->reginfo, 0);
+ }
+ env->lg_handle = NULL;
+
+ (void)__mutex_free(env, &dblp->mtx_dbreg);
+ __os_free(env, dblp);
+
+ return (ret);
+}
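+
+/*
+ * Illustrative sketch, not part of this file: the sizes and flags
+ * consulted above come from the DB_ENV configuration methods, which must
+ * be called before DB_ENV->open.  For example:
+ *
+ *	ret = dbenv->set_lg_bsize(dbenv, 64 * 1024);
+ *	ret = dbenv->set_lg_max(dbenv, 10 * 1024 * 1024);
+ *	ret = dbenv->log_set_config(dbenv, DB_LOG_IN_MEMORY, 1);
+ */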
+
+/*
+ * __log_init --
+ * Initialize a log region in shared memory.
+ */
+static int
+__log_init(env, dblp)
+ ENV *env;
+ DB_LOG *dblp;
+{
+ DB_ENV *dbenv;
+ LOG *lp;
+ int ret;
+ void *p;
+
+ dbenv = env->dbenv;
+
+ /*
+ * This is the first point where we can validate the buffer size,
+ * because we know all three settings have been configured (file size,
+ * buffer size and the in-memory flag).
+ */
+ if ((ret =
+ __log_check_sizes(env, dbenv->lg_size, dbenv->lg_bsize)) != 0)
+ return (ret);
+
+ if ((ret = __env_alloc(&dblp->reginfo,
+ sizeof(*lp), &dblp->reginfo.primary)) != 0)
+ goto mem_err;
+
+ ((REGENV *)env->reginfo->primary)->lg_primary =
+ R_OFFSET(&dblp->reginfo, dblp->reginfo.primary);
+
+ lp = dblp->reginfo.primary;
+ memset(lp, 0, sizeof(*lp));
+
+ /* We share the region so we need the same mutex. */
+ lp->mtx_region = ((REGENV *)env->reginfo->primary)->mtx_regenv;
+
+ lp->fid_max = 0;
+ SH_TAILQ_INIT(&lp->fq);
+ lp->free_fid_stack = INVALID_ROFF;
+ lp->free_fids = lp->free_fids_alloced = 0;
+
+ /* Initialize LOG LSNs. */
+ INIT_LSN(lp->lsn);
+ INIT_LSN(lp->t_lsn);
+
+ /*
+ * It's possible to be waiting for an LSN of [1][0], if a replication
+ * client gets the first log record out of order. An LSN of [0][0]
+ * signifies that we're not waiting.
+ */
+ ZERO_LSN(lp->waiting_lsn);
+
+ /*
+	 * As a recovery optimization, the log notes whether it ran into a
+	 * checkpoint on startup.  A zero LSN signifies that it hasn't
+	 * found one [yet].
+ */
+ ZERO_LSN(lp->cached_ckp_lsn);
+
+ if ((ret =
+ __mutex_alloc(env, MTX_LOG_FILENAME, 0, &lp->mtx_filelist)) != 0)
+ return (ret);
+ if ((ret = __mutex_alloc(env, MTX_LOG_FLUSH, 0, &lp->mtx_flush)) != 0)
+ return (ret);
+
+ /* Initialize the buffer. */
+ if ((ret = __env_alloc(&dblp->reginfo, dbenv->lg_bsize, &p)) != 0) {
+mem_err:	__db_errx(env, DB_STR("2524",
+ "unable to allocate log region memory"));
+ return (ret);
+ }
+ lp->regionmax = dbenv->lg_regionmax;
+ lp->buffer_off = R_OFFSET(&dblp->reginfo, p);
+ lp->buffer_size = dbenv->lg_bsize;
+ lp->filemode = dbenv->lg_filemode;
+ lp->log_size = lp->log_nsize = dbenv->lg_size;
+ lp->stat.st_fileid_init = dbenv->lg_fileid_init;
+
+ /* Initialize the commit Queue. */
+ SH_TAILQ_INIT(&lp->free_commits);
+ SH_TAILQ_INIT(&lp->commits);
+ lp->ncommit = 0;
+
+ /* Initialize the logfiles list for in-memory logs. */
+ SH_TAILQ_INIT(&lp->logfiles);
+ SH_TAILQ_INIT(&lp->free_logfiles);
+
+ /*
+ * Fill in the log's persistent header. Don't fill in the log file
+ * sizes, as they may change at any time and so have to be filled in
+ * as each log file is created.
+ */
+ lp->persist.magic = DB_LOGMAGIC;
+ /*
+ * Don't use __log_set_version because env->dblp isn't set up yet.
+ */
+ lp->persist.version = DB_LOGVERSION;
+ lp->persist.notused = 0;
+ env->lg_handle = dblp;
+
+ /* Migrate persistent flags from the ENV into the region. */
+ if (dbenv->lg_flags != 0 &&
+ (ret = __log_set_config_int(dbenv, dbenv->lg_flags, 1, 1)) != 0)
+ return (ret);
+
+ (void)time(&lp->timestamp);
+ return (0);
+}
+
+/*
+ * __log_recover --
+ * Recover a log.
+ */
+static int
+__log_recover(dblp)
+ DB_LOG *dblp;
+{
+ DBT dbt;
+ DB_ENV *dbenv;
+ DB_LOGC *logc;
+ DB_LSN lsn;
+ ENV *env;
+ LOG *lp;
+ u_int32_t cnt, rectype;
+ int ret;
+ logfile_validity status;
+
+ env = dblp->env;
+ dbenv = env->dbenv;
+ logc = NULL;
+ lp = dblp->reginfo.primary;
+
+ /*
+ * Find a log file. If none exist, we simply return, leaving
+ * everything initialized to a new log.
+ */
+ if ((ret = __log_find(dblp, 0, &cnt, &status)) != 0)
+ return (ret);
+ if (cnt == 0) {
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_RECOVERY))
+ __db_msg(env, DB_STR("2525", "No log files found"));
+ return (0);
+ }
+
+ /*
+ * If the last file is an old, unreadable version, start a new
+ * file. Don't bother finding the end of the last log file;
+ * we assume that it's valid in its entirety, since the user
+ * should have shut down cleanly or run recovery before upgrading.
+ */
+ if (status == DB_LV_OLD_UNREADABLE) {
+ lp->lsn.file = lp->s_lsn.file = cnt + 1;
+ lp->lsn.offset = lp->s_lsn.offset = 0;
+ goto skipsearch;
+ }
+ DB_ASSERT(env,
+ (status == DB_LV_NORMAL || status == DB_LV_OLD_READABLE));
+
+ /*
+ * We have the last useful log file and we've loaded any persistent
+ * information. Set the end point of the log past the end of the last
+ * file. Read the last file, looking for the last checkpoint and
+ * the log's end.
+ */
+ lp->lsn.file = cnt + 1;
+ lp->lsn.offset = 0;
+ lsn.file = cnt;
+ lsn.offset = 0;
+
+ /*
+ * Allocate a cursor and set it to the first record. This shouldn't
+	 * fail; leave error messages on.
+ */
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ return (ret);
+ F_SET(logc, DB_LOG_LOCKED);
+ memset(&dbt, 0, sizeof(dbt));
+ if ((ret = __logc_get(logc, &lsn, &dbt, DB_SET)) != 0)
+ goto err;
+
+ /*
+ * Read to the end of the file. This may fail at some point, so
+ * turn off error messages.
+ */
+ F_SET(logc, DB_LOG_SILENT_ERR);
+ while (__logc_get(logc, &lsn, &dbt, DB_NEXT) == 0) {
+ if (dbt.size < sizeof(u_int32_t))
+ continue;
+ LOGCOPY_32(env, &rectype, dbt.data);
+ if (rectype == DB___txn_ckp)
+ /*
+ * If we happen to run into a checkpoint, cache its
+ * LSN so that the transaction system doesn't have
+ * to walk this log file again looking for it.
+ */
+ lp->cached_ckp_lsn = lsn;
+ }
+ F_CLR(logc, DB_LOG_SILENT_ERR);
+
+ /*
+ * We now know where the end of the log is. Set the first LSN that
+ * we want to return to an application and the LSN of the last known
+ * record on disk.
+ */
+ lp->lsn = lsn;
+ lp->s_lsn = lsn;
+ lp->lsn.offset += logc->len;
+ lp->s_lsn.offset += logc->len;
+
+ /* Set up the current buffer information, too. */
+ lp->len = logc->len;
+ lp->a_off = 0;
+ lp->b_off = 0;
+ lp->w_off = lp->lsn.offset;
+
+skipsearch:
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_RECOVERY))
+ __db_msg(env, DB_STR_A("2526",
+ "Finding last valid log LSN: file: %lu offset %lu",
+ "%lu %lu"), (u_long)lp->lsn.file, (u_long)lp->lsn.offset);
+
+err: if (logc != NULL)
+ (void)__logc_close(logc);
+
+ return (ret);
+}
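+
+/*
+ * Illustrative sketch, not part of this file: the scan-to-end loop above
+ * follows the same pattern an application would use with the public log
+ * cursor interface:
+ *
+ *	DB_LOGC *logc;
+ *	DB_LSN lsn;
+ *	DBT dbt;
+ *
+ *	memset(&dbt, 0, sizeof(dbt));
+ *	if ((ret = dbenv->log_cursor(dbenv, &logc, 0)) == 0) {
+ *		while (logc->get(logc, &lsn, &dbt, DB_NEXT) == 0)
+ *			;	(each record's type is in its first 4 bytes)
+ *		(void)logc->close(logc, 0);
+ *	}
+ */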
+
+/*
+ * __log_find --
+ * Try to find a log file. If find_first is set, valp will contain
+ * the number of the first readable log file, else it will contain the number
+ * of the last log file (which may be too old to read).
+ *
+ * PUBLIC: int __log_find __P((DB_LOG *, int, u_int32_t *, logfile_validity *));
+ */
+int
+__log_find(dblp, find_first, valp, statusp)
+ DB_LOG *dblp;
+ int find_first;
+ u_int32_t *valp;
+ logfile_validity *statusp;
+{
+ ENV *env;
+ LOG *lp;
+ logfile_validity logval_status, status;
+ struct __db_filestart *filestart;
+ u_int32_t clv, logval;
+ int cnt, fcnt, ret;
+ const char *dir;
+ char *c, **names, *p, *q;
+
+ env = dblp->env;
+ lp = dblp->reginfo.primary;
+ logval_status = status = DB_LV_NONEXISTENT;
+
+ /* Return a value of 0 as the log file number on failure. */
+ *valp = 0;
+
+ if (lp->db_log_inmemory) {
+ filestart = find_first ?
+ SH_TAILQ_FIRST(&lp->logfiles, __db_filestart) :
+ SH_TAILQ_LAST(&lp->logfiles, links, __db_filestart);
+ if (filestart != NULL) {
+ *valp = filestart->file;
+ logval_status = DB_LV_NORMAL;
+ }
+ *statusp = logval_status;
+ return (0);
+ }
+
+ /* Find the directory name. */
+ if ((ret = __log_name(dblp, 1, &p, NULL, 0)) != 0) {
+ __os_free(env, p);
+ return (ret);
+ }
+ if ((q = __db_rpath(p)) == NULL)
+ dir = PATH_DOT;
+ else {
+ *q = '\0';
+ dir = p;
+ }
+
+ /* Get the list of file names. */
+retry: if ((ret = __os_dirlist(env, dir, 0, &names, &fcnt)) != 0) {
+ __db_err(env, ret, "%s", dir);
+ __os_free(env, p);
+ return (ret);
+ }
+
+ /* Search for a valid log file name. */
+ for (cnt = fcnt, clv = logval = 0; --cnt >= 0;) {
+ if (!IS_LOG_FILE(names[cnt]))
+ continue;
+
+ /*
+ * Names of the form log\.[0-9]* are reserved for DB. Other
+ * names sharing LFPREFIX, such as "log.db", are legal.
+ */
+ for (c = names[cnt] + sizeof(LFPREFIX) - 1; *c != '\0'; c++)
+ if (!isdigit((int)*c))
+ break;
+ if (*c != '\0')
+ continue;
+
+ /*
+ * Use atol, not atoi; if an "int" is 16-bits, the largest
+ * log file name won't fit.
+ */
+ clv = (u_int32_t)atol(names[cnt] + (sizeof(LFPREFIX) - 1));
+
+ /*
+ * If searching for the first log file, we want to return the
+ * oldest log file we can read, or, if no readable log files
+ * exist, the newest log file we can't read (the crossover
+ * point between the old and new versions of the log file).
+ *
+ * If we're searching for the last log file, we want to return
+ * the newest log file, period.
+ *
+		 * files; that would mean the admin seriously screwed up.
+ * files, that would mean the admin seriously screwed up.
+ */
+ if (find_first) {
+ if (logval != 0 &&
+ status != DB_LV_OLD_UNREADABLE && clv > logval)
+ continue;
+ } else
+ if (logval != 0 && clv < logval)
+ continue;
+
+ if ((ret = __log_valid(dblp, clv, 1, NULL, 0,
+ &status, NULL)) != 0) {
+ /*
+ * If we have raced with removal of a log file since
+ * the call to __os_dirlist, it may no longer exist.
+ * In that case, just go on to the next one. If we're
+ * at the end of the list, all of the log files we saw
+ * initially are gone and we need to get the list again.
+ */
+ if (ret == ENOENT) {
+ ret = 0;
+ if (cnt == 0) {
+ __os_dirfree(env, names, fcnt);
+ goto retry;
+ }
+ continue;
+ }
+ __db_err(env, ret, DB_STR_A("2527",
+ "Invalid log file: %s", "%s"), names[cnt]);
+ goto err;
+ }
+ switch (status) {
+ case DB_LV_NONEXISTENT:
+ /* __log_valid never returns DB_LV_NONEXISTENT. */
+ DB_ASSERT(env, 0);
+ break;
+ case DB_LV_INCOMPLETE:
+ /*
+ * The last log file may not have been initialized --
+ * it's possible to create a log file but not write
+ * anything to it. If performing recovery (that is,
+			 * if find_first isn't set), ignore the file; it's
+ * not interesting. If we're searching for the first
+ * log record, return the file (assuming we don't find
+ * something better), as the "real" first log record
+ * is likely to be in the log buffer, and we want to
+ * set the file LSN for our return.
+ */
+ if (find_first)
+ goto found;
+ break;
+ case DB_LV_OLD_UNREADABLE:
+ /*
+ * If we're searching for the first log file, then we
+ * only want this file if we don't yet have a file or
+ * already have an unreadable file and this one is
+ * newer than that one. If we're searching for the
+ * last log file, we always want this file because we
+ * wouldn't be here if it wasn't newer than our current
+ * choice.
+ */
+ if (!find_first || logval == 0 ||
+ (status == DB_LV_OLD_UNREADABLE && clv > logval))
+ goto found;
+ break;
+ case DB_LV_NORMAL:
+ case DB_LV_OLD_READABLE:
+found: logval = clv;
+ logval_status = status;
+ break;
+ }
+ }
+
+ *valp = logval;
+
+err: __os_dirfree(env, names, fcnt);
+ __os_free(env, p);
+ *statusp = logval_status;
+
+ return (ret);
+}
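+
+/*
+ * For reference: on-disk log files are named with a ten-digit file number
+ * ("log.0000000001", "log.0000000002", ...), so the loop above accepts
+ * those names but skips a name such as "log.db", which shares the prefix
+ * without being all digits.
+ */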
+
+/*
+ * __log_valid --
+ *	Validate a log file.  Returns an error code in the event of
+ *	a fatal flaw in the specified log file; returns success with
+ * a code indicating the currentness and completeness of the specified
+ * log file if it is not unexpectedly flawed (that is, if it's perfectly
+ * normal, if it's zero-length, or if it's an old version).
+ *
+ * PUBLIC: int __log_valid __P((DB_LOG *, u_int32_t, int,
+ * PUBLIC: DB_FH **, u_int32_t, logfile_validity *, u_int32_t *));
+ */
+int
+__log_valid(dblp, number, set_persist, fhpp, flags, statusp, versionp)
+ DB_LOG *dblp;
+ u_int32_t number;
+ int set_persist;
+ DB_FH **fhpp;
+ u_int32_t flags;
+ logfile_validity *statusp;
+ u_int32_t *versionp;
+{
+ DB_CIPHER *db_cipher;
+ DB_FH *fhp;
+ ENV *env;
+ HDR *hdr;
+ LOG *lp;
+ LOGP *persist;
+ logfile_validity status;
+ size_t hdrsize, nr, recsize;
+ int chksum_includes_hdr, is_hmac, ret;
+ u_int32_t logversion;
+ u_int8_t *tmp;
+ char *fname;
+
+ env = dblp->env;
+ db_cipher = env->crypto_handle;
+ fhp = NULL;
+ persist = NULL;
+ status = DB_LV_NORMAL;
+ tmp = NULL;
+#if defined(HAVE_LOG_CHECKSUM)
+ /* Most log versions include the hdr in the checksum. */
+ chksum_includes_hdr = 1;
+#else
+ COMPQUIET(chksum_includes_hdr, 0);
+#endif
+
+ /* Return the file handle to our caller, on request */
+ if (fhpp != NULL)
+ *fhpp = NULL;
+
+ if (flags == 0)
+ flags = DB_OSO_RDONLY | DB_OSO_SEQ;
+ /* Try to open the log file. */
+ if ((ret = __log_name(dblp, number, &fname, &fhp, flags)) != 0) {
+ __os_free(env, fname);
+ return (ret);
+ }
+
+ hdrsize = HDR_NORMAL_SZ;
+ is_hmac = 0;
+ recsize = sizeof(LOGP);
+ if (CRYPTO_ON(env)) {
+ hdrsize = HDR_CRYPTO_SZ;
+ recsize = sizeof(LOGP);
+ recsize += db_cipher->adj_size(recsize);
+ is_hmac = 1;
+ }
+ if ((ret = __os_calloc(env, 1, recsize + hdrsize, &tmp)) != 0)
+ goto err;
+
+ hdr = (HDR *)tmp;
+ persist = (LOGP *)(tmp + hdrsize);
+
+ /*
+ * Try to read the header. This can fail if the log is truncated, or
+ * if we find a preallocated log file where the header has not yet been
+ * written, so we need to check whether the header is zero-filled.
+ */
+ if ((ret = __os_read(env, fhp, tmp, recsize + hdrsize, &nr)) != 0 ||
+ nr != recsize + hdrsize ||
+ (hdr->len == 0 && persist->magic == 0 && persist->log_size == 0)) {
+ if (ret == 0)
+ status = DB_LV_INCOMPLETE;
+ else
+ /*
+ * The error was a fatal read error, not just an
+ * incompletely initialized log file.
+ */
+ __db_err(env, ret, DB_STR_A("2528",
+ "ignoring log file: %s", "%s"), fname);
+ goto err;
+ }
+
+ if (LOG_SWAPPED(env))
+ __log_hdrswap(hdr, CRYPTO_ON(env));
+
+ /*
+ * Now we have to validate the persistent record. We have
+ * several scenarios we have to deal with:
+ *
+ * 1. User has crypto turned on:
+ * - They're reading an old, unencrypted log file
+ * . We will fail the record size match check below.
+ * - They're reading a current, unencrypted log file
+ * . We will fail the record size match check below.
+ * - They're reading an old, encrypted log file [NOT YET]
+ * . After decryption we'll fail the version check. [NOT YET]
+ * - They're reading a current, encrypted log file
+ * . We should proceed as usual.
+ * 2. User has crypto turned off:
+ * - They're reading an old, unencrypted log file
+ * . We will fail the version check.
+ * - They're reading a current, unencrypted log file
+ * . We should proceed as usual.
+ * - They're reading an old, encrypted log file [NOT YET]
+ * . We'll fail the magic number check (it is encrypted).
+ * - They're reading a current, encrypted log file
+ * . We'll fail the magic number check (it is encrypted).
+ */
+ if (CRYPTO_ON(env)) {
+ /*
+ * If we are trying to decrypt an unencrypted log
+ * we can only detect that by having an unreasonable
+ * data length for our persistent data.
+ */
+		if ((hdr->len - hdrsize) != sizeof(LOGP)) {
+			__db_errx(env, "log record size mismatch");
+			ret = EINVAL;
+			goto err;
+		}
+ /*
+ * The checksum is calculated from the encrypted data, and,
+ * for recent logs, the fields hdr->{prev,len}.
+ */
+#ifdef HAVE_LOG_CHECKSUM
+ if ((ret = __db_check_chksum(env, hdr, db_cipher,
+ &hdr->chksum[0], (u_int8_t *)persist,
+ hdr->len - hdrsize, is_hmac)) != 0) {
+ /*
+ * The checksum doesn't verify when the header fields
+ * are included; try without the header.
+ */
+
+ if ((ret = __db_check_chksum(env, NULL, db_cipher,
+ &hdr->chksum[0], (u_int8_t *)persist,
+ hdr->len - hdrsize, is_hmac)) != 0)
+ goto bad_checksum;
+ /*
+ * The checksum verifies without the header. Make note
+ * of that, because it is only acceptable when the log
+ * version < DB_LOGCHKSUM. Later, when we determine log
+ * version, we will confirm this.
+ */
+ chksum_includes_hdr = 0;
+ }
+#endif
+
+ if ((ret = db_cipher->decrypt(env, db_cipher->data,
+ &hdr->iv[0], (u_int8_t *)persist, hdr->len - hdrsize)) != 0)
+ goto err;
+ }
+
+ /* Swap the header, if necessary. */
+ if (LOG_SWAPPED(env)) {
+ /*
+ * If the magic number is not byte-swapped, we're looking at an
+ * old log that we can no longer read.
+ */
+ if (persist->magic == DB_LOGMAGIC) {
+ __db_errx(env, DB_STR_A("2529",
+ "Ignoring log file: %s historic byte order",
+ "%s"), fname);
+ status = DB_LV_OLD_UNREADABLE;
+ goto err;
+ }
+
+ __log_persistswap(persist);
+ }
+
+ /* Validate the header. */
+ if (persist->magic != DB_LOGMAGIC) {
+ __db_errx(env, DB_STR_A("2530",
+ "Ignoring log file: %s: magic number %lx, not %lx",
+ "%s %lx %lx"), fname,
+ (u_long)persist->magic, (u_long)DB_LOGMAGIC);
+ ret = EINVAL;
+ goto err;
+ }
+
+ logversion = persist->version;
+ /*
+ * Set our status code to indicate whether the log file belongs to an
+ * unreadable or readable old version; leave it alone if and only if
+ * the log file version is the current one.
+ */
+ if (logversion > DB_LOGVERSION) {
+ /* This is a fatal error--the log file is newer than DB. */
+ __db_errx(env, DB_STR_A("2531",
+ "Unacceptable log file %s: unsupported log version %lu",
+ "%s %lu"), fname, (u_long)logversion);
+ ret = EINVAL;
+ goto err;
+ } else if (logversion < DB_LOGOLDVER) {
+ status = DB_LV_OLD_UNREADABLE;
+ /* This is a non-fatal error, but give some feedback. */
+ __db_errx(env, DB_STR_A("2532",
+ "Skipping log file %s: historic log version %lu", "%s %lu"),
+ fname, (u_long)logversion);
+ /*
+ * We don't want to set persistent info based on an unreadable
+ * region, so jump to "err".
+ */
+ goto err;
+ } else if (logversion < DB_LOGVERSION)
+ status = DB_LV_OLD_READABLE;
+
+ /*
+ * We could not check the checksum before checking the magic and version
+ * because old log headers put the length and checksum in a different
+ * location.
+ */
+#ifdef HAVE_LOG_CHECKSUM
+ if (CRYPTO_ON(env)) {
+ /*
+ * We might have to declare a checksum failure here, if:
+ * - the checksum verified only by ignoring the header, and
+ * - the log version indicates that the header should have
+ * been included.
+ */
+		if (!chksum_includes_hdr && logversion >= DB_LOGCHKSUM) {
+			ret = EINVAL;
+			goto bad_checksum;
+		}
+ } else {
+ /*
+ * The checksum was calculated with the swapped byte order. We
+ * might need to swap them back; the check needs the same bytes.
+ */
+ if (LOG_SWAPPED(env))
+ __log_persistswap(persist);
+ /*
+ * We have the logversion here, so we know whether to include
+ * the hdr or not.
+ */
+ if ((ret = __db_check_chksum(env,
+ logversion >= DB_LOGCHKSUM ? hdr : NULL, db_cipher,
+ &hdr->chksum[0], (u_int8_t *)persist,
+ hdr->len - hdrsize, is_hmac)) != 0) {
+bad_checksum:
+ __db_errx(env, DB_STR("2533",
+ "log record checksum mismatch"));
+ goto err;
+ }
+
+ if (LOG_SWAPPED(env))
+ __log_persistswap(persist);
+ }
+#endif
+
+ /*
+ * If the log is readable so far and we're doing system initialization,
+ * set the region's persistent information based on the headers.
+ *
+ * Override the current log file size.
+ */
+ if (set_persist) {
+ lp = dblp->reginfo.primary;
+ lp->log_size = persist->log_size;
+ lp->persist.version = logversion;
+ }
+ if (versionp != NULL)
+ *versionp = logversion;
+
+err: if (fname != NULL)
+ __os_free(env, fname);
+ if (ret == 0 && fhpp != NULL)
+ *fhpp = fhp;
+ else
+ /* Must close on error or if we only used it locally. */
+ (void)__os_closehandle(env, fhp);
+ if (tmp != NULL)
+ __os_free(env, tmp);
+
+ if (statusp != NULL)
+ *statusp = status;
+
+ return (ret);
+}
+
+/*
+ * __log_env_refresh --
+ * Clean up after the log system on a close or failed open.
+ *
+ * PUBLIC: int __log_env_refresh __P((ENV *));
+ */
+int
+__log_env_refresh(env)
+ ENV *env;
+{
+ DB_LOG *dblp;
+ LOG *lp;
+ REGINFO *reginfo;
+ struct __fname *fnp;
+ struct __db_commit *commit;
+ struct __db_filestart *filestart;
+ int ret, t_ret;
+
+ dblp = env->lg_handle;
+ reginfo = &dblp->reginfo;
+ lp = reginfo->primary;
+ ret = 0;
+
+ /*
+ * Flush the log if it's private -- there's no Berkeley DB guarantee
+ * that this gets done, but in case the application has forgotten to
+ * flush for durability, it's the polite thing to do.
+ */
+ if (F_ISSET(env, ENV_PRIVATE) &&
+ (t_ret = __log_flush(env, NULL)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if ((t_ret = __dbreg_close_files(env, 0)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /*
+ * After we close the files, check for any unlogged closes left in
+ * the shared memory queue. If we find any, try to log it, otherwise
+ * return the error. We cannot say the environment was closed
+ * cleanly.
+ */
+ MUTEX_LOCK(env, lp->mtx_filelist);
+ SH_TAILQ_FOREACH(fnp, &lp->fq, q, __fname)
+ if (F_ISSET(fnp, DB_FNAME_NOTLOGGED) &&
+ (t_ret = __dbreg_close_id_int(
+ env, fnp, DBREG_CLOSE, 1)) != 0)
+ ret = t_ret;
+ MUTEX_UNLOCK(env, lp->mtx_filelist);
+
+ /*
+ * If a private region, return the memory to the heap. Not needed for
+	 * filesystem-backed or system shared memory regions; that memory isn't
+ * owned by any particular process.
+ */
+ if (F_ISSET(env, ENV_PRIVATE)) {
+ reginfo->mtx_alloc = MUTEX_INVALID;
+ /* Discard the flush mutex. */
+ if ((t_ret =
+ __mutex_free(env, &lp->mtx_flush)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Discard the buffer. */
+ __env_alloc_free(reginfo, R_ADDR(reginfo, lp->buffer_off));
+
+ /* Discard stack of free file IDs. */
+ if (lp->free_fid_stack != INVALID_ROFF)
+ __env_alloc_free(reginfo,
+ R_ADDR(reginfo, lp->free_fid_stack));
+
+ /* Discard the list of in-memory log file markers. */
+ while ((filestart = SH_TAILQ_FIRST(&lp->logfiles,
+ __db_filestart)) != NULL) {
+ SH_TAILQ_REMOVE(&lp->logfiles, filestart, links,
+ __db_filestart);
+ __env_alloc_free(reginfo, filestart);
+ }
+
+ while ((filestart = SH_TAILQ_FIRST(&lp->free_logfiles,
+ __db_filestart)) != NULL) {
+ SH_TAILQ_REMOVE(&lp->free_logfiles, filestart, links,
+ __db_filestart);
+ __env_alloc_free(reginfo, filestart);
+ }
+
+ /* Discard commit queue elements. */
+ while ((commit = SH_TAILQ_FIRST(&lp->free_commits,
+ __db_commit)) != NULL) {
+ SH_TAILQ_REMOVE(&lp->free_commits, commit, links,
+ __db_commit);
+ __env_alloc_free(reginfo, commit);
+ }
+
+ /* Discard replication bulk buffer. */
+ if (lp->bulk_buf != INVALID_ROFF) {
+ __env_alloc_free(reginfo,
+ R_ADDR(reginfo, lp->bulk_buf));
+ lp->bulk_buf = INVALID_ROFF;
+ }
+ }
+
+ /* Discard the per-thread DBREG mutex. */
+ if ((t_ret = __mutex_free(env, &dblp->mtx_dbreg)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Detach from the region. */
+ if ((t_ret = __env_region_detach(env, reginfo, 0)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Close open files, release allocated memory. */
+ if (dblp->lfhp != NULL) {
+ if ((t_ret =
+ __os_closehandle(env, dblp->lfhp)) != 0 && ret == 0)
+ ret = t_ret;
+ dblp->lfhp = NULL;
+ }
+ if (dblp->dbentry != NULL)
+ __os_free(env, dblp->dbentry);
+
+ __os_free(env, dblp);
+
+ env->lg_handle = NULL;
+ return (ret);
+}
+
+/*
+ * __log_get_cached_ckp_lsn --
+ * Retrieve any last checkpoint LSN that we may have found on startup.
+ *
+ * PUBLIC: int __log_get_cached_ckp_lsn __P((ENV *, DB_LSN *));
+ */
+int
+__log_get_cached_ckp_lsn(env, ckp_lsnp)
+ ENV *env;
+ DB_LSN *ckp_lsnp;
+{
+ DB_LOG *dblp;
+ LOG *lp;
+
+ dblp = env->lg_handle;
+ lp = (LOG *)dblp->reginfo.primary;
+
+ LOG_SYSTEM_LOCK(env);
+ *ckp_lsnp = lp->cached_ckp_lsn;
+ LOG_SYSTEM_UNLOCK(env);
+
+ return (0);
+}
+
+/*
+ * __log_region_mutex_count --
+ * Return the number of mutexes the log region will need.
+ *
+ * PUBLIC: u_int32_t __log_region_mutex_count __P((ENV *));
+ */
+u_int32_t
+__log_region_mutex_count(env)
+ ENV *env;
+{
+ /*
+ * We need a few assorted mutexes, and one per transaction waiting
+ * on the group commit list. We can't know how many that will be,
+ * but it should be bounded by the maximum active transactions.
+ */
+ return (env->dbenv->tx_init + 5);
+}
+
+/*
+ * __log_region_mutex_max --
+ * Return the number of additional mutexes the log region will need.
+ *
+ * PUBLIC: u_int32_t __log_region_mutex_max __P((ENV *));
+ */
+u_int32_t
+__log_region_mutex_max(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+ u_int32_t count;
+
+ dbenv = env->dbenv;
+
+ if ((count = dbenv->tx_max) == 0)
+ count = DEF_MAX_TXNS;
+ if (count < dbenv->tx_init)
+ return (0);
+ return (count - dbenv->tx_init);
+}
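+
+/*
+ * A worked example (editor's sketch, numbers hypothetical): with tx_init
+ * set to 50 and tx_max set to 100, __log_region_mutex_count reserves 55
+ * mutexes up front and __log_region_mutex_max allows 50 more to be
+ * allocated on demand.  If tx_max is unset, DEF_MAX_TXNS stands in for it.
+ */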
+
+/*
+ * __log_region_size --
+ *	Return the amount of space needed for the log region.
+ *	Make the region large enough to hold the log buffer, the
+ *	initial file-name (FNAME) allocation and anything we need
+ *	for mutex system resource recording.
+ *
+ * PUBLIC: size_t __log_region_size __P((ENV *));
+ */
+size_t
+__log_region_size(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+ size_t s;
+
+ dbenv = env->dbenv;
+
+ /* Set the default buffer size, if not otherwise configured. */
+ if (dbenv->lg_bsize == 0)
+ dbenv->lg_bsize = FLD_ISSET(dbenv->lg_flags, DB_LOG_IN_MEMORY) ?
+ LG_BSIZE_INMEM : LG_BSIZE_DEFAULT;
+
+ s = dbenv->lg_bsize;
+ /* Allocate the initial fileid allocation, plus some path name space. */
+ s += dbenv->lg_fileid_init * __env_alloc_size((sizeof(FNAME)) + 16);
+
+ return (s);
+}
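+
+/*
+ * A worked example (editor's sketch, numbers hypothetical): with a 64KB
+ * lg_bsize and lg_fileid_init of 100, the region must hold the 64KB
+ * buffer plus 100 allocations of sizeof(FNAME) + 16 bytes of path-name
+ * space each, as rounded up by __env_alloc_size.
+ */
+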
+/*
+ * __log_region_max --
+ *	Return the amount of extra memory to allocate for logging information.
+ *
+ * PUBLIC: size_t __log_region_max __P((ENV *));
+ */
+size_t
+__log_region_max(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+ size_t s;
+
+ dbenv = env->dbenv;
+ if (dbenv->lg_fileid_init == 0) {
+ if ((s = dbenv->lg_regionmax) == 0)
+ s = LG_BASE_REGION_SIZE;
+ } else if ((s = dbenv->lg_regionmax) != 0 &&
+ s < dbenv->lg_fileid_init * (__env_alloc_size(sizeof(FNAME)) + 16))
+ s = 0;
+ else if (s != 0)
+ s -= dbenv->lg_fileid_init *
+ (__env_alloc_size(sizeof(FNAME)) + 16);
+
+ return (s);
+}
+
+/*
+ * __log_vtruncate --
+ * This is a virtual truncate. We set up the log indicators to
+ * make everyone believe that the given record is the last one in the
+ * log. Returns with the next valid LSN (i.e., the LSN of the next
+ * record to be written). This is used in replication to discard records
+ * in the log file that do not agree with the master.
+ *
+ * PUBLIC: int __log_vtruncate __P((ENV *, DB_LSN *, DB_LSN *, DB_LSN *));
+ */
+int
+__log_vtruncate(env, lsn, ckplsn, trunclsn)
+ ENV *env;
+ DB_LSN *lsn, *ckplsn, *trunclsn;
+{
+ DBT log_dbt;
+ DB_LOG *dblp;
+ DB_LOGC *logc;
+ LOG *lp;
+ u_int32_t bytes, len;
+ size_t offset;
+ int ret, t_ret;
+
+ /* Need to find out the length of this soon-to-be-last record. */
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ return (ret);
+ memset(&log_dbt, 0, sizeof(log_dbt));
+ ret = __logc_get(logc, lsn, &log_dbt, DB_SET);
+ len = logc->len;
+ if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ return (ret);
+
+ /* Now do the truncate. */
+ dblp = env->lg_handle;
+ lp = (LOG *)dblp->reginfo.primary;
+
+ LOG_SYSTEM_LOCK(env);
+
+ /*
+ * Flush the log so we can simply initialize the in-memory buffer
+ * after the truncate.
+ */
+ if ((ret = __log_flush_int(dblp, NULL, 0)) != 0)
+ goto err;
+
+ lp->lsn = *lsn;
+ lp->len = len;
+ lp->lsn.offset += lp->len;
+
+ offset = lp->b_off;
+ if (lp->db_log_inmemory && (ret =
+	    __log_inmem_lsnoff(dblp, &lp->lsn, &offset)) != 0)
+		goto err;
+	lp->b_off = (db_size_t)offset;
+
+ /*
+ * I am going to assume that the number of bytes written since
+ * the last checkpoint doesn't exceed a 32-bit number.
+ */
+ DB_ASSERT(env, lp->lsn.file >= ckplsn->file);
+ bytes = 0;
+ if (ckplsn->file != lp->lsn.file) {
+ bytes = lp->log_size - ckplsn->offset;
+ if (lp->lsn.file > ckplsn->file + 1)
+ bytes += lp->log_size *
+ ((lp->lsn.file - ckplsn->file) - 1);
+ bytes += lp->lsn.offset;
+ } else
+ bytes = lp->lsn.offset - ckplsn->offset;
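+	/*
+	 * A worked example (editor's sketch): with 10MB log files
+	 * (log_size == 10000000), ckplsn 3/800000 and a new lsn of
+	 * 5/200000, bytes is (10000000 - 800000) for the rest of file 3,
+	 * plus 10000000 for all of file 4, plus 200000 into file 5.
+	 */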
+
+ lp->stat.st_wc_mbytes += bytes / MEGABYTE;
+ lp->stat.st_wc_bytes += bytes % MEGABYTE;
+
+ /*
+ * If the synced lsn is greater than our new end of log, reset it
+ * to our current end of log.
+ */
+ MUTEX_LOCK(env, lp->mtx_flush);
+ if (LOG_COMPARE(&lp->s_lsn, lsn) > 0)
+ lp->s_lsn = lp->lsn;
+ MUTEX_UNLOCK(env, lp->mtx_flush);
+
+ /* Initialize the in-region buffer to a pristine state. */
+ ZERO_LSN(lp->f_lsn);
+ lp->w_off = lp->lsn.offset;
+
+ if (trunclsn != NULL)
+ *trunclsn = lp->lsn;
+
+ /* Truncate the log to the new point. */
+ if ((ret = __log_zero(env, &lp->lsn)) != 0)
+ goto err;
+
+err: LOG_SYSTEM_UNLOCK(env);
+ return (ret);
+}
+
+/*
+ * __log_is_outdated --
+ * Used by the replication system to identify if a client's logs are too
+ * old.
+ *
+ * PUBLIC: int __log_is_outdated __P((ENV *, u_int32_t, int *));
+ */
+int
+__log_is_outdated(env, fnum, outdatedp)
+ ENV *env;
+ u_int32_t fnum;
+ int *outdatedp;
+{
+ DB_LOG *dblp;
+ LOG *lp;
+ char *name;
+ int ret;
+ u_int32_t cfile;
+ struct __db_filestart *filestart;
+
+ dblp = env->lg_handle;
+
+ /*
+ * The log represented by env is compared to the file number passed
+ * in fnum. If the log file fnum does not exist and is lower-numbered
+ * than the current logs, return *outdatedp non-zero, else we return 0.
+ */
+ if (FLD_ISSET(env->dbenv->lg_flags, DB_LOG_IN_MEMORY)) {
+ LOG_SYSTEM_LOCK(env);
+ lp = (LOG *)dblp->reginfo.primary;
+ filestart = SH_TAILQ_FIRST(&lp->logfiles, __db_filestart);
+ *outdatedp = filestart == NULL ? 0 : (fnum < filestart->file);
+ LOG_SYSTEM_UNLOCK(env);
+ return (0);
+ }
+
+ *outdatedp = 0;
+ if ((ret = __log_name(dblp, fnum, &name, NULL, 0)) != 0) {
+ __os_free(env, name);
+ return (ret);
+ }
+
+ /* If the file exists, we're just fine. */
+ if (__os_exists(env, name, NULL) == 0)
+ goto out;
+
+ /*
+	 * It didn't exist; decide if the file number is too big or
+	 * too small.  If it's too small, we need to indicate that
+	 * the LSN is outdated.
+ */
+ LOG_SYSTEM_LOCK(env);
+ lp = (LOG *)dblp->reginfo.primary;
+ cfile = lp->lsn.file;
+ LOG_SYSTEM_UNLOCK(env);
+
+ if (cfile > fnum)
+ *outdatedp = 1;
+out: __os_free(env, name);
+ return (ret);
+}
+
+/*
+ * __log_zero --
+ * Zero out the tail of a log after a truncate.
+ *
+ * PUBLIC: int __log_zero __P((ENV *, DB_LSN *));
+ */
+int
+__log_zero(env, from_lsn)
+ ENV *env;
+ DB_LSN *from_lsn;
+{
+ DB_FH *fhp;
+ DB_LOG *dblp;
+ LOG *lp;
+ struct __db_filestart *filestart, *nextstart;
+ size_t nbytes, len, nw;
+ u_int32_t fn, mbytes, bytes;
+ u_int8_t buf[4096];
+ int ret;
+ char *fname;
+
+ dblp = env->lg_handle;
+ lp = (LOG *)dblp->reginfo.primary;
+ DB_ASSERT(env, LOG_COMPARE(from_lsn, &lp->lsn) <= 0);
+ if (LOG_COMPARE(from_lsn, &lp->lsn) > 0) {
+ __db_errx(env, DB_STR("2534",
+ "Warning: truncating to point beyond end of log"));
+ return (0);
+ }
+
+ if (lp->db_log_inmemory) {
+ /*
+ * Remove the files that are invalidated by this truncate.
+ */
+ for (filestart = SH_TAILQ_FIRST(&lp->logfiles, __db_filestart);
+ filestart != NULL; filestart = nextstart) {
+ nextstart = SH_TAILQ_NEXT(filestart,
+ links, __db_filestart);
+ if (filestart->file > from_lsn->file) {
+ SH_TAILQ_REMOVE(&lp->logfiles,
+ filestart, links, __db_filestart);
+ SH_TAILQ_INSERT_HEAD(&lp->free_logfiles,
+ filestart, links, __db_filestart);
+ }
+ }
+
+ return (0);
+ }
+
+ /* Close any open file handles so unlinks don't fail. */
+ if (dblp->lfhp != NULL) {
+ (void)__os_closehandle(env, dblp->lfhp);
+ dblp->lfhp = NULL;
+ }
+
+ /* Throw away any extra log files that we have around. */
+ for (fn = from_lsn->file + 1;; fn++) {
+ if (__log_name(dblp, fn, &fname, &fhp, DB_OSO_RDONLY) != 0) {
+ __os_free(env, fname);
+ break;
+ }
+ (void)__os_closehandle(env, fhp);
+ (void)time(&lp->timestamp);
+ ret = __os_unlink(env, fname, 0);
+ __os_free(env, fname);
+ if (ret != 0)
+ return (ret);
+ }
+
+	/* We removed some log files; now zero to the end of the file. */
+ if ((ret =
+ __log_name(dblp, from_lsn->file, &fname, &dblp->lfhp, 0)) != 0) {
+ __os_free(env, fname);
+ return (ret);
+ }
+ __os_free(env, fname);
+ if ((ret = __os_ioinfo(env,
+ NULL, dblp->lfhp, &mbytes, &bytes, NULL)) != 0)
+ goto err;
+ DB_ASSERT(env, (mbytes * MEGABYTE + bytes) >= from_lsn->offset);
+ len = (mbytes * MEGABYTE + bytes) - from_lsn->offset;
+
+ memset(buf, 0, sizeof(buf));
+
+ /* Initialize the write position. */
+ if ((ret = __os_seek(env, dblp->lfhp, 0, 0, from_lsn->offset)) != 0)
+ goto err;
+
+ while (len > 0) {
+ nbytes = len > sizeof(buf) ? sizeof(buf) : len;
+ if ((ret =
+ __os_write(env, dblp->lfhp, buf, nbytes, &nw)) != 0)
+ goto err;
+ len -= nbytes;
+ }
+
+err: (void)__os_closehandle(env, dblp->lfhp);
+ dblp->lfhp = NULL;
+
+ return (ret);
+}
+
+/*
+ * __log_inmem_lsnoff --
+ * Find the offset in the buffer of a given LSN.
+ *
+ * PUBLIC: int __log_inmem_lsnoff __P((DB_LOG *, DB_LSN *, size_t *));
+ */
+int
+__log_inmem_lsnoff(dblp, lsnp, offsetp)
+ DB_LOG *dblp;
+ DB_LSN *lsnp;
+ size_t *offsetp;
+{
+ LOG *lp;
+ struct __db_filestart *filestart;
+
+ lp = (LOG *)dblp->reginfo.primary;
+
+ SH_TAILQ_FOREACH(filestart, &lp->logfiles, links, __db_filestart)
+ if (filestart->file == lsnp->file) {
+ *offsetp = (u_int32_t)
+ (filestart->b_off + lsnp->offset) % lp->buffer_size;
+ return (0);
+ }
+
+ return (DB_NOTFOUND);
+}
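+
+/*
+ * A worked example (editor's sketch, numbers hypothetical): with a 1MB
+ * buffer (buffer_size == 1048576), a file whose first record lives at
+ * b_off 900000 and an LSN offset of 200000, the record starts at
+ * (900000 + 200000) % 1048576 == 51424, i.e., the ring buffer wrapped.
+ */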
+
+/*
+ * __log_inmem_newfile --
+ * Records the offset of the beginning of a new file in the in-memory
+ * buffer.
+ *
+ * PUBLIC: int __log_inmem_newfile __P((DB_LOG *, u_int32_t));
+ */
+int
+__log_inmem_newfile(dblp, file)
+ DB_LOG *dblp;
+ u_int32_t file;
+{
+ HDR hdr;
+ LOG *lp;
+ struct __db_filestart *filestart;
+ int ret;
+#ifdef DIAGNOSTIC
+ struct __db_filestart *first, *last;
+#endif
+
+ lp = (LOG *)dblp->reginfo.primary;
+
+ /*
+ * If the log buffer is empty, reuse the filestart entry.
+ */
+ filestart = SH_TAILQ_FIRST(&lp->logfiles, __db_filestart);
+ if (filestart != NULL &&
+ RINGBUF_LEN(lp, filestart->b_off, lp->b_off) <=
+ sizeof(HDR) + sizeof(LOGP)) {
+ filestart->file = file;
+ filestart->b_off = lp->b_off;
+ return (0);
+ }
+
+ /*
+ * We write an empty header at the end of every in-memory log file.
+ * This is used during cursor traversal to indicate when to switch the
+ * LSN to the next file.
+ */
+ if (file > 1) {
+ memset(&hdr, 0, sizeof(HDR));
+ __log_inmem_copyin(dblp, lp->b_off, &hdr, sizeof(HDR));
+ lp->b_off = (lp->b_off + sizeof(HDR)) % lp->buffer_size;
+ }
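+	/*
+	 * (Editor's note) A cursor that later reads this zeroed HDR sees
+	 * hdr.len == 0 and treats it as end-of-file, advancing to the
+	 * next file's LSN rather than returning an empty record.
+	 */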
+
+ filestart = SH_TAILQ_FIRST(&lp->free_logfiles, __db_filestart);
+ if (filestart == NULL) {
+ if ((ret = __env_alloc(&dblp->reginfo,
+ sizeof(struct __db_filestart), &filestart)) != 0)
+ return (ret);
+ memset(filestart, 0, sizeof(*filestart));
+ } else
+ SH_TAILQ_REMOVE(&lp->free_logfiles, filestart,
+ links, __db_filestart);
+
+ filestart->file = file;
+ filestart->b_off = lp->b_off;
+
+#ifdef DIAGNOSTIC
+ first = SH_TAILQ_FIRST(&lp->logfiles, __db_filestart);
+ last = SH_TAILQ_LAST(&(lp)->logfiles, links, __db_filestart);
+
+ /* Check that we don't wrap. */
+ DB_ASSERT(dblp->env, !first || first == last ||
+ RINGBUF_LEN(lp, first->b_off, lp->b_off) ==
+ RINGBUF_LEN(lp, first->b_off, last->b_off) +
+ RINGBUF_LEN(lp, last->b_off, lp->b_off));
+#endif
+
+ SH_TAILQ_INSERT_TAIL(&lp->logfiles, filestart, links);
+ return (0);
+}
+
+/*
+ * __log_inmem_chkspace --
+ * Ensure that the requested amount of space is available in the buffer,
+ * and invalidate the region.
+ * Note: assumes that the region lock is held on entry.
+ *
+ * PUBLIC: int __log_inmem_chkspace __P((DB_LOG *, size_t));
+ */
+int
+__log_inmem_chkspace(dblp, len)
+ DB_LOG *dblp;
+ size_t len;
+{
+ DB_LSN active_lsn, old_active_lsn;
+ ENV *env;
+ LOG *lp;
+ struct __db_filestart *filestart;
+ size_t offset;
+ int ret;
+
+ env = dblp->env;
+ lp = dblp->reginfo.primary;
+
+ DB_ASSERT(env, lp->db_log_inmemory);
+
+ /*
+ * Allow room for an extra header so that we don't need to check for
+ * space when switching files.
+ */
+ len += sizeof(HDR);
+
+ /*
+ * If transactions are enabled and we're about to fill available space,
+ * update the active LSN and recheck. If transactions aren't enabled,
+ * don't even bother checking: in that case we can always overwrite old
+ * log records, because we're never going to abort.
+ */
+ while (TXN_ON(env) &&
+ RINGBUF_LEN(lp, lp->b_off, lp->a_off) <= len) {
+ old_active_lsn = lp->active_lsn;
+ active_lsn = lp->lsn;
+
+ /*
+ * Drop the log region lock so we don't hold it while
+ * taking the transaction region lock.
+ */
+ LOG_SYSTEM_UNLOCK(env);
+ ret = __txn_getactive(env, &active_lsn);
+ LOG_SYSTEM_LOCK(env);
+ if (ret != 0)
+ return (ret);
+ active_lsn.offset = 0;
+
+ /* If we didn't make any progress, give up. */
+ if (LOG_COMPARE(&active_lsn, &old_active_lsn) == 0) {
+ __db_errx(env, DB_STR("2535",
+"In-memory log buffer is full (an active transaction spans the buffer)"));
+ return (DB_LOG_BUFFER_FULL);
+ }
+
+ /* Make sure we're moving the region LSN forwards. */
+ if (LOG_COMPARE(&active_lsn, &lp->active_lsn) > 0) {
+ lp->active_lsn = active_lsn;
+ offset = lp->a_off;
+ (void)__log_inmem_lsnoff(dblp, &active_lsn, &offset);
+ lp->a_off = (db_size_t)offset;
+ }
+ }
+
+ /*
+ * Remove the first file if it is invalidated by this write.
+ * Log records can't be bigger than a file, so we only need to
+ * check the first file.
+ */
+ filestart = SH_TAILQ_FIRST(&lp->logfiles, __db_filestart);
+ if (filestart != NULL &&
+ RINGBUF_LEN(lp, lp->b_off, filestart->b_off) <= len) {
+ SH_TAILQ_REMOVE(&lp->logfiles, filestart,
+ links, __db_filestart);
+ SH_TAILQ_INSERT_HEAD(&lp->free_logfiles, filestart,
+ links, __db_filestart);
+ lp->f_lsn.file = filestart->file + 1;
+ }
+
+ return (0);
+}
+
+/*
+ * __log_inmem_copyout --
+ * Copies the given number of bytes from the buffer -- no checking.
+ * Note: assumes that the region lock is held on entry.
+ *
+ * PUBLIC: void __log_inmem_copyout __P((DB_LOG *, size_t, void *, size_t));
+ */
+void
+__log_inmem_copyout(dblp, offset, buf, size)
+ DB_LOG *dblp;
+ size_t offset;
+ void *buf;
+ size_t size;
+{
+ LOG *lp;
+ size_t nbytes;
+
+ lp = (LOG *)dblp->reginfo.primary;
+ nbytes = (offset + size < lp->buffer_size) ?
+ size : lp->buffer_size - offset;
+ memcpy(buf, dblp->bufp + offset, nbytes);
+ if (nbytes < size)
+ memcpy((u_int8_t *)buf + nbytes, dblp->bufp, size - nbytes);
+}
+
+/*
+ * __log_inmem_copyin --
+ * Copies the given number of bytes into the buffer -- no checking.
+ * Note: assumes that the region lock is held on entry.
+ *
+ * PUBLIC: void __log_inmem_copyin __P((DB_LOG *, size_t, void *, size_t));
+ */
+void
+__log_inmem_copyin(dblp, offset, buf, size)
+ DB_LOG *dblp;
+ size_t offset;
+ void *buf;
+ size_t size;
+{
+ LOG *lp;
+ size_t nbytes;
+
+ lp = (LOG *)dblp->reginfo.primary;
+ nbytes = (offset + size < lp->buffer_size) ?
+ size : lp->buffer_size - offset;
+ memcpy(dblp->bufp + offset, buf, nbytes);
+ if (nbytes < size)
+ memcpy(dblp->bufp, (u_int8_t *)buf + nbytes, size - nbytes);
+}
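+
+/*
+ * A worked example of the wraparound (editor's sketch): with
+ * buffer_size == 100, offset == 90 and size == 25, both routines move
+ * 10 bytes at offsets 90..99, then the remaining 15 bytes starting at
+ * offset 0.
+ */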
+
+/*
+ * __log_set_version --
+ * Sets the current version of the log subsystem to the given version.
+ * Essentially this modifies the lp->persist.version field in the
+ * shared memory region. Called when region is initially created
+ * and when replication is starting up or finds a new master.
+ *
+ * PUBLIC: void __log_set_version __P((ENV *, u_int32_t));
+ */
+void
+__log_set_version(env, newver)
+ ENV *env;
+ u_int32_t newver;
+{
+ DB_LOG *dblp;
+ LOG *lp;
+
+ dblp = env->lg_handle;
+ lp = (LOG *)dblp->reginfo.primary;
+ /*
+ * We should be able to update this atomically without locking.
+ */
+ lp->persist.version = newver;
+}
+
+/*
+ * __log_get_oldversion --
+ *	Returns the last log version this environment was working with.
+ *	Since there could be several versions of log files if the user
+ *	upgraded without archiving the logs, we check the version of the
+ *	first log file and compare it to that of the last log file.  If
+ *	they differ, an older log exists, and we walk backward through
+ *	the log files looking for the version of the most recent older
+ *	log file.
+ *
+ * PUBLIC: int __log_get_oldversion __P((ENV *, u_int32_t *));
+ */
+int
+__log_get_oldversion(env, ver)
+ ENV *env;
+ u_int32_t *ver;
+{
+ DBT rec;
+ DB_LOG *dblp;
+ DB_LOGC *logc;
+ DB_LSN lsn;
+ LOG *lp;
+ u_int32_t firstfnum, fnum, lastver, oldver;
+ int ret, t_ret;
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ logc = NULL;
+ ret = 0;
+ oldver = DB_LOGVERSION;
+ /*
+	 * If we're using in-memory logs, we're always at the current version.
+ */
+ if (lp->db_log_inmemory) {
+ *ver = oldver;
+ return (0);
+ }
+ memset(&rec, 0, sizeof(rec));
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ goto err;
+ /*
+ * Get the version numbers of the first and last log files.
+ */
+ if ((ret = __logc_get(logc, &lsn, &rec, DB_FIRST)) != 0) {
+ /*
+ * If there is no log file, we'll get DB_NOTFOUND.
+		 * If we get that, return the current version.
+ */
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+ goto err;
+ }
+ firstfnum = lsn.file;
+ if ((ret = __logc_get(logc, &lsn, &rec, DB_LAST)) != 0)
+ goto err;
+ if ((ret = __log_valid(dblp, firstfnum, 0, NULL, 0,
+ NULL, &oldver)) != 0)
+ goto err;
+ /*
+ * If the first and last LSN are in the same file, then we
+ * already have the version in oldver. Return it.
+ */
+ if (firstfnum == lsn.file)
+ goto err;
+
+ /*
+ * Otherwise they're in different files and we call __log_valid
+ * to get the version numbers in both files.
+ */
+ if ((ret = __log_valid(dblp, lsn.file, 0, NULL, 0,
+ NULL, &lastver)) != 0)
+ goto err;
+ /*
+ * If the version numbers are different, walk backward getting
+ * the version of each log file until we find one that is
+ * different than the last.
+ */
+ if (oldver != lastver) {
+ for (fnum = lsn.file - 1; fnum >= firstfnum; fnum--) {
+ if ((ret = __log_valid(dblp, fnum, 0, NULL, 0,
+ NULL, &oldver)) != 0)
+ goto err;
+ if (oldver != lastver)
+ break;
+ }
+ }
+err: if (logc != NULL && ((t_ret = __logc_close(logc)) != 0) && ret == 0)
+ ret = t_ret;
+ if (ret == 0 && ver != NULL)
+ *ver = oldver;
+ return (ret);
+}
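+
+/*
+ * A worked example (editor's sketch, versions hypothetical): with log
+ * files 1..5 where files 1-2 are version 14 and files 3-5 are version
+ * 17, oldver starts as 14 and lastver is 17; the loop walks back from
+ * file 4 and stops at file 2, returning 14 as the most recent older
+ * version.
+ */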
diff --git a/src/log/log_archive.c b/src/log/log_archive.c
new file mode 100644
index 00000000..280a2071
--- /dev/null
+++ b/src/log/log_archive.c
@@ -0,0 +1,643 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/log.h"
+#include "dbinc/qam.h"
+#include "dbinc/txn.h"
+
+static int __absname __P((ENV *, char *, char *, char **));
+static int __build_data __P((ENV *, char *, char ***));
+static int __cmpfunc __P((const void *, const void *));
+static int __usermem __P((ENV *, char ***));
+
+/*
+ * __log_archive_pp --
+ * ENV->log_archive pre/post processing.
+ *
+ * PUBLIC: int __log_archive_pp __P((DB_ENV *, char **[], u_int32_t));
+ */
+int
+__log_archive_pp(dbenv, listp, flags)
+ DB_ENV *dbenv;
+ char ***listp;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->lg_handle, "DB_ENV->log_archive", DB_INIT_LOG);
+
+#undef OKFLAGS
+#define OKFLAGS (DB_ARCH_ABS | DB_ARCH_DATA | DB_ARCH_LOG | DB_ARCH_REMOVE)
+ if (flags != 0) {
+ if ((ret = __db_fchk(
+ env, "DB_ENV->log_archive", flags, OKFLAGS)) != 0)
+ return (ret);
+ if ((ret = __db_fcchk(env, "DB_ENV->log_archive",
+ flags, DB_ARCH_DATA, DB_ARCH_LOG)) != 0)
+ return (ret);
+ if ((ret = __db_fcchk(env, "DB_ENV->log_archive",
+ flags, DB_ARCH_REMOVE,
+ DB_ARCH_ABS | DB_ARCH_DATA | DB_ARCH_LOG)) != 0)
+ return (ret);
+ }
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__log_archive(env, listp, flags)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
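+
+/*
+ * A minimal usage sketch (editor's example; error handling omitted):
+ * list the absolute pathnames of the log files that are no longer
+ * needed, then release the list, which was handed back as a single
+ * chunk of user-allocated memory.
+ *
+ *	char **list, **p;
+ *
+ *	if (dbenv->log_archive(dbenv, &list, DB_ARCH_ABS) == 0 &&
+ *	    list != NULL) {
+ *		for (p = list; *p != NULL; ++p)
+ *			printf("%s\n", *p);
+ *		free(list);
+ *	}
+ */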
+
+/*
+ * __log_archive --
+ * ENV->log_archive. Internal.
+ * PUBLIC: int __log_archive __P((ENV *, char **[], u_int32_t));
+ */
+int
+__log_archive(env, listp, flags)
+ ENV *env;
+ char ***listp;
+ u_int32_t flags;
+{
+ DBT rec;
+ DB_LOG *dblp;
+ DB_LOGC *logc;
+ DB_LSN stable_lsn;
+ LOG *lp;
+ u_int array_size, n;
+ u_int32_t fnum;
+ int handle_check, ret, t_ret;
+ char **array, **arrayp, *name, *p, *pref;
+#ifdef HAVE_GETCWD
+ char path[DB_MAXPATHLEN];
+#endif
+
+ dblp = env->lg_handle;
+ lp = (LOG *)dblp->reginfo.primary;
+ array = NULL;
+ name = NULL;
+ ret = 0;
+ COMPQUIET(fnum, 0);
+
+ if (flags != DB_ARCH_REMOVE)
+ *listp = NULL;
+
+ /* There are no log files if logs are in memory. */
+ if (lp->db_log_inmemory) {
+ LF_CLR(~DB_ARCH_DATA);
+ if (flags == 0)
+ return (0);
+ }
+
+ /*
+ * Check if the user wants the list of log files to remove and we're
+ * at a bad time in replication initialization.
+ */
+ handle_check = 0;
+ if (!LF_ISSET(DB_ARCH_DATA) &&
+ !LF_ISSET(DB_ARCH_LOG)) {
+ /*
+		 * If we're locked out, just return success; no files
+		 * can be archived right now.  Pass any other error
+		 * back to the caller.
+ */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check && (ret = __archive_rep_enter(env)) != 0) {
+ if (ret == DB_REP_LOCKOUT)
+ ret = 0;
+ return (ret);
+ }
+ }
+
+ /*
+ * Prepend the original absolute pathname if the user wants an
+ * absolute path to the database environment directory.
+ */
+#ifdef HAVE_GETCWD
+ if (LF_ISSET(DB_ARCH_ABS)) {
+ /*
+ * XXX
+ * Can't trust getcwd(3) to set a valid errno, so don't display
+ * one unless we know it's good. It's likely a permissions
+ * problem: use something bland and useless in the default
+ * return value, so we don't send somebody off in the wrong
+ * direction.
+ */
+ __os_set_errno(0);
+ if (getcwd(path, sizeof(path)) == NULL) {
+ ret = __os_get_errno();
+ __db_err(env, ret, DB_STR("2570",
+ "no absolute path for the current directory"));
+ goto err;
+ }
+ pref = path;
+ } else
+#endif
+ pref = NULL;
+
+ LF_CLR(DB_ARCH_ABS);
+ switch (flags) {
+ case DB_ARCH_DATA:
+ ret = __build_data(env, pref, listp);
+ goto err;
+ case DB_ARCH_LOG:
+ memset(&rec, 0, sizeof(rec));
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ goto err;
+#ifdef UMRW
+ ZERO_LSN(stable_lsn);
+#endif
+ ret = __logc_get(logc, &stable_lsn, &rec, DB_LAST);
+ if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err;
+ fnum = stable_lsn.file;
+ break;
+ case DB_ARCH_REMOVE:
+ __log_autoremove(env);
+ goto err;
+ case 0:
+ ret = __log_get_stable_lsn(env, &stable_lsn, 1);
+ /*
+ * A return of DB_NOTFOUND means the checkpoint LSN
+ * is before the beginning of the log files we have.
+ * This is not an error; it just means we're done.
+ */
+ if (ret != 0) {
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+ goto err;
+ }
+ /* Remove any log files before the last stable LSN. */
+ fnum = stable_lsn.file - 1;
+ break;
+ default:
+ ret = __db_unknown_path(env, "__log_archive");
+ goto err;
+ }
+
+#define LIST_INCREMENT 64
+ /* Get some initial space. */
+	array_size = LIST_INCREMENT;
+ if ((ret = __os_malloc(env,
+ sizeof(char *) * array_size, &array)) != 0)
+ goto err;
+ array[0] = NULL;
+
+ /* Build an array of the file names. */
+ for (n = 0; fnum > 0; --fnum) {
+ if ((ret = __log_name(dblp, fnum, &name, NULL, 0)) != 0) {
+ __os_free(env, name);
+ goto err;
+ }
+ if (__os_exists(env, name, NULL) != 0) {
+ __os_free(env, name);
+ name = NULL;
+ if (LF_ISSET(DB_ARCH_LOG) && fnum == stable_lsn.file)
+ continue;
+ break;
+ }
+
+ if (n >= array_size - 2) {
+ array_size += LIST_INCREMENT;
+ if ((ret = __os_realloc(env,
+ sizeof(char *) * array_size, &array)) != 0)
+ goto err;
+ }
+
+ if (pref != NULL) {
+ if ((ret =
+ __absname(env, pref, name, &array[n])) != 0)
+ goto err;
+ __os_free(env, name);
+ } else if ((p = __db_rpath(name)) != NULL) {
+ if ((ret = __os_strdup(env, p + 1, &array[n])) != 0)
+ goto err;
+ __os_free(env, name);
+ } else
+ array[n] = name;
+
+ name = NULL;
+ array[++n] = NULL;
+ }
+
+ /* If there's nothing to return, we're done. */
+ if (n == 0)
+ goto err;
+
+ /* Sort the list. */
+ qsort(array, (size_t)n, sizeof(char *), __cmpfunc);
+
+ /* Rework the memory. */
+ if ((ret = __usermem(env, &array)) != 0)
+ goto err;
+
+ if (listp != NULL)
+ *listp = array;
+
+ if (0) {
+err: if (array != NULL) {
+ for (arrayp = array; *arrayp != NULL; ++arrayp)
+ __os_free(env, *arrayp);
+ __os_free(env, array);
+ }
+ if (name != NULL)
+ __os_free(env, name);
+ }
+ if (handle_check && (t_ret = __archive_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __log_get_stable_lsn --
+ * Get the stable lsn based on where checkpoints are.
+ *
+ * PUBLIC: int __log_get_stable_lsn __P((ENV *, DB_LSN *, int));
+ */
+int
+__log_get_stable_lsn(env, stable_lsn, group_wide)
+ ENV *env;
+ DB_LSN *stable_lsn;
+ int group_wide;
+{
+ DBT rec;
+ DB_LOGC *logc;
+ LOG *lp;
+ __txn_ckp_args *ckp_args;
+ int ret, t_ret;
+
+ lp = env->lg_handle->reginfo.primary;
+
+ ret = 0;
+ memset(&rec, 0, sizeof(rec));
+ if (!TXN_ON(env)) {
+ if ((ret = __log_get_cached_ckp_lsn(env, stable_lsn)) != 0)
+ goto err;
+ /*
+ * No need to check for a return value of DB_NOTFOUND;
+ * __txn_findlastckp returns 0 if no checkpoint record
+ * is found. Instead of checking the return value, we
+ * check to see if the return LSN has been filled in.
+ */
+ if (IS_ZERO_LSN(*stable_lsn) && (ret =
+ __txn_findlastckp(env, stable_lsn, NULL)) != 0)
+ goto err;
+ /*
+		 * If the LSN has not been filled in, return DB_NOTFOUND
+ * so that the caller knows it may be done.
+ */
+ if (IS_ZERO_LSN(*stable_lsn)) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ } else if ((ret = __txn_getckp(env, stable_lsn)) != 0)
+ goto err;
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ goto err;
+ /*
+ * Read checkpoint records until we find one that is on disk,
+	 * then copy the ckp_lsn to the stable_lsn.
+ */
+ while ((ret = __logc_get(logc, stable_lsn, &rec, DB_SET)) == 0 &&
+ (ret = __txn_ckp_read(env, rec.data, &ckp_args)) == 0) {
+ if (stable_lsn->file < lp->s_lsn.file ||
+ (stable_lsn->file == lp->s_lsn.file &&
+ stable_lsn->offset < lp->s_lsn.offset)) {
+ *stable_lsn = ckp_args->ckp_lsn;
+ __os_free(env, ckp_args);
+ break;
+ }
+ *stable_lsn = ckp_args->last_ckp;
+ __os_free(env, ckp_args);
+ }
+ if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
+ ret = t_ret;
+#ifdef HAVE_REPLICATION_THREADS
+ /*
+ * If we have RepMgr, get the minimum group-aware LSN.
+ */
+ if (group_wide && ret == 0 && REP_ON(env) && APP_IS_REPMGR(env) &&
+ (t_ret = __repmgr_stable_lsn(env, stable_lsn)) != 0)
+ ret = t_ret;
+#else
+ COMPQUIET(group_wide, 0);
+#endif
+err:
+ return (ret);
+}
+
+/*
+ * __log_autoremove --
+ * Delete any non-essential log files.
+ *
+ * PUBLIC: void __log_autoremove __P((ENV *));
+ */
+void
+__log_autoremove(env)
+ ENV *env;
+{
+ int ret;
+ char **begin, **list;
+
+ /*
+ * Complain if there's an error, but don't return the error to our
+ * caller. Auto-remove is done when writing a log record, and we
+ * don't want to fail a write, which could fail the corresponding
+ * committing transaction, for a permissions error.
+ */
+ if ((ret = __log_archive(env, &list, DB_ARCH_ABS)) != 0) {
+ if (ret != DB_NOTFOUND)
+ __db_err(env, ret, DB_STR("2571",
+ "log file auto-remove"));
+ return;
+ }
+
+ /* Remove the files. */
+ if (list != NULL) {
+ for (begin = list; *list != NULL; ++list)
+ (void)__os_unlink(env, *list, 0);
+ __os_ufree(env, begin);
+ }
+}
+
+/*
+ * __build_data --
+ * Build a list of datafiles for return.
+ */
+static int
+__build_data(env, pref, listp)
+ ENV *env;
+ char *pref, ***listp;
+{
+ DBT rec;
+ DB_LOGC *logc;
+ DB_LSN lsn;
+ __dbreg_register_args *argp;
+ u_int array_size, last, n, nxt;
+ u_int32_t rectype;
+ int ret, t_ret;
+ char **array, **arrayp, **list, **lp, *p, *real_name;
+
+ /* Get some initial space. */
+	array_size = LIST_INCREMENT;
+ if ((ret = __os_malloc(env,
+ sizeof(char *) * array_size, &array)) != 0)
+ return (ret);
+ array[0] = NULL;
+
+ memset(&rec, 0, sizeof(rec));
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ return (ret);
+ for (n = 0; (ret = __logc_get(logc, &lsn, &rec, DB_PREV)) == 0;) {
+ if (rec.size < sizeof(rectype)) {
+ ret = EINVAL;
+ __db_errx(env, DB_STR("2572",
+ "DB_ENV->log_archive: bad log record"));
+ break;
+ }
+
+ LOGCOPY_32(env, &rectype, rec.data);
+ if (rectype != DB___dbreg_register)
+ continue;
+ if ((ret =
+ __dbreg_register_read(env, rec.data, &argp)) != 0) {
+ ret = EINVAL;
+ __db_errx(env, DB_STR("2573",
+ "DB_ENV->log_archive: unable to read log record"));
+ break;
+ }
+
+ if (n >= array_size - 2) {
+ array_size += LIST_INCREMENT;
+ if ((ret = __os_realloc(env,
+ sizeof(char *) * array_size, &array)) != 0)
+ goto free_continue;
+ }
+
+ if ((ret = __os_strdup(env,
+ argp->name.data, &array[n++])) != 0)
+ goto free_continue;
+ array[n] = NULL;
+
+ if (argp->ftype == DB_QUEUE) {
+ if ((ret = __qam_extent_names(env,
+ argp->name.data, &list)) != 0)
+ goto q_err;
+ for (lp = list;
+ lp != NULL && *lp != NULL; lp++) {
+ if (n >= array_size - 2) {
+ array_size += LIST_INCREMENT;
+ if ((ret = __os_realloc(env,
+ sizeof(char *) *
+ array_size, &array)) != 0)
+ goto q_err;
+ }
+ if ((ret =
+ __os_strdup(env, *lp, &array[n++])) != 0)
+ goto q_err;
+ array[n] = NULL;
+ }
+q_err: if (list != NULL)
+ __os_free(env, list);
+ }
+free_continue: __os_free(env, argp);
+ if (ret != 0)
+ break;
+ }
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+ if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err1;
+
+ /* If there's nothing to return, we're done. */
+ if (n == 0) {
+ ret = 0;
+ *listp = NULL;
+ goto err1;
+ }
+
+ /* Sort the list. */
+ qsort(array, (size_t)n, sizeof(char *), __cmpfunc);
+
+ /*
+ * Build the real pathnames, discarding nonexistent files and
+ * duplicates.
+ */
+ for (last = nxt = 0; nxt < n;) {
+ /*
+ * Discard duplicates. Last is the next slot we're going
+ * to return to the user, nxt is the next slot that we're
+ * going to consider.
+ */
+ if (last != nxt) {
+ array[last] = array[nxt];
+ array[nxt] = NULL;
+ }
+ for (++nxt; nxt < n &&
+ strcmp(array[last], array[nxt]) == 0; ++nxt) {
+ __os_free(env, array[nxt]);
+ array[nxt] = NULL;
+ }
+
+ /* Get the real name. */
+ if ((ret = __db_appname(env,
+ DB_APP_DATA, array[last], NULL, &real_name)) != 0)
+ goto err2;
+
+ /* If the file doesn't exist, ignore it. */
+ if (__os_exists(env, real_name, NULL) != 0) {
+ __os_free(env, real_name);
+ __os_free(env, array[last]);
+ array[last] = NULL;
+ continue;
+ }
+
+ /* Rework the name as requested by the user. */
+ __os_free(env, array[last]);
+ array[last] = NULL;
+ if (pref != NULL) {
+ ret = __absname(env, pref, real_name, &array[last]);
+ __os_free(env, real_name);
+ if (ret != 0)
+ goto err2;
+ } else if ((p = __db_rpath(real_name)) != NULL) {
+ ret = __os_strdup(env, p + 1, &array[last]);
+ __os_free(env, real_name);
+ if (ret != 0)
+ goto err2;
+ } else
+ array[last] = real_name;
+ ++last;
+ }
+
+ /* NULL-terminate the list. */
+ array[last] = NULL;
+
+ /* Rework the memory. */
+ if ((ret = __usermem(env, &array)) != 0)
+ goto err1;
+
+ *listp = array;
+ return (0);
+
+err2: /*
+ * XXX
+ * We've possibly inserted NULLs into the array list, so clean up a
+ * bit so that the other error processing works.
+ */
+ if (array != NULL)
+ for (; nxt < n; ++nxt)
+ __os_free(env, array[nxt]);
+ /* FALLTHROUGH */
+
+err1: if (array != NULL) {
+ for (arrayp = array; *arrayp != NULL; ++arrayp)
+ __os_free(env, *arrayp);
+ __os_free(env, array);
+ }
+ return (ret);
+}
+
+/*
+ * __absname --
+ * Return an absolute path name for the file.
+ */
+static int
+__absname(env, pref, name, newnamep)
+ ENV *env;
+ char *pref, *name, **newnamep;
+{
+ size_t l_pref, l_name;
+ int isabspath, ret;
+ char *newname;
+
+ l_name = strlen(name);
+ isabspath = __os_abspath(name);
+ l_pref = isabspath ? 0 : strlen(pref);
+
+ /* Malloc space for concatenating the two. */
+ if ((ret = __os_malloc(env,
+ l_pref + l_name + 2, &newname)) != 0)
+ return (ret);
+ *newnamep = newname;
+
+ /* Build the name. If `name' is an absolute path, ignore any prefix. */
+ if (!isabspath) {
+ memcpy(newname, pref, l_pref);
+ if (strchr(PATH_SEPARATOR, newname[l_pref - 1]) == NULL)
+ newname[l_pref++] = PATH_SEPARATOR[0];
+ }
+ memcpy(newname + l_pref, name, l_name + 1);
+
+ return (0);
+}
+
+/*
+ * __usermem --
+ * Create a single chunk of memory that holds the returned information.
+ * If the user has their own malloc routine, use it.
+ */
+static int
+__usermem(env, listp)
+ ENV *env;
+ char ***listp;
+{
+ size_t len;
+ int ret;
+ char **array, **arrayp, **orig, *strp;
+
+ /* Find out how much space we need. */
+ for (len = 0, orig = *listp; *orig != NULL; ++orig)
+ len += sizeof(char *) + strlen(*orig) + 1;
+ len += sizeof(char *);
+
+ /* Allocate it and set up the pointers. */
+ if ((ret = __os_umalloc(env, len, &array)) != 0)
+ return (ret);
+
+ strp = (char *)(array + (orig - *listp) + 1);
+
+ /* Copy the original information into the new memory. */
+ for (orig = *listp, arrayp = array; *orig != NULL; ++orig, ++arrayp) {
+ len = strlen(*orig);
+ memcpy(strp, *orig, len + 1);
+ *arrayp = strp;
+ strp += len + 1;
+
+ __os_free(env, *orig);
+ }
+
+ /* NULL-terminate the list. */
+ *arrayp = NULL;
+
+ __os_free(env, *listp);
+ *listp = array;
+
+ return (0);
+}
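+
+/*
+ * Layout sketch (editor's note): for the input list {"aa", "b"}, the
+ * single allocation returned to the user holds, in order,
+ *
+ *	[ptr to "aa"] [ptr to "b"] [NULL] "aa\0" "b\0"
+ *
+ * so the caller can release the pointer array and every string with a
+ * single call to free() (or the allocator set via DB_ENV->set_alloc).
+ */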
+
+static int
+__cmpfunc(p1, p2)
+ const void *p1, *p2;
+{
+ return (strcmp(*((char * const *)p1), *((char * const *)p2)));
+}
diff --git a/src/log/log_compare.c b/src/log/log_compare.c
new file mode 100644
index 00000000..97b59338
--- /dev/null
+++ b/src/log/log_compare.c
@@ -0,0 +1,66 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/log.h"
+
+/*
+ * log_compare --
+ *	Compare two LSNs; return 1, 0 or -1 if the first is >, == or < the second.
+ *
+ * EXTERN: int log_compare __P((const DB_LSN *, const DB_LSN *));
+ */
+int
+log_compare(lsn0, lsn1)
+ const DB_LSN *lsn0, *lsn1;
+{
+ return (LOG_COMPARE(lsn0, lsn1));
+}
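+
+/*
+ * A minimal sketch (editor's example): LOG_COMPARE orders LSNs first by
+ * file, then by offset within the file, so:
+ *
+ *	DB_LSN a, b;
+ *	a.file = 2; a.offset = 900;
+ *	b.file = 3; b.offset = 0;
+ *	log_compare(&a, &b);	returns -1, file 2 precedes file 3.
+ */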
+
+/*
+ * __log_check_page_lsn --
+ *	Panic if the page's LSN is past the end of the current log.
+ *
+ * PUBLIC: int __log_check_page_lsn __P((ENV *, DB *, DB_LSN *));
+ */
+int
+__log_check_page_lsn(env, dbp, lsnp)
+ ENV *env;
+ DB *dbp;
+ DB_LSN *lsnp;
+{
+ LOG *lp;
+ int ret;
+
+ lp = env->lg_handle->reginfo.primary;
+ LOG_SYSTEM_LOCK(env);
+
+ ret = LOG_COMPARE(lsnp, &lp->lsn);
+
+ LOG_SYSTEM_UNLOCK(env);
+
+ if (ret < 0)
+ return (0);
+
+ __db_errx(env, DB_STR_A("2506",
+ "file %s has LSN %lu/%lu, past end of log at %lu/%lu",
+ "%s %lu %lu %lu %lu"),
+ dbp == NULL ||
+ dbp->fname == NULL ? DB_STR_P("unknown") : dbp->fname,
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (u_long)lp->lsn.file, (u_long)lp->lsn.offset);
+ __db_errx(env, DB_STR("2507",
+ "Commonly caused by moving a database from one database environment"));
+ __db_errx(env, DB_STR("2508",
+ "to another without clearing the database LSNs, or by removing all of"));
+ __db_errx(env, DB_STR("2509",
+ "the log files from a database environment"));
+ return (EINVAL);
+}
diff --git a/src/log/log_debug.c b/src/log/log_debug.c
new file mode 100644
index 00000000..32fb2542
--- /dev/null
+++ b/src/log/log_debug.c
@@ -0,0 +1,146 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+
+static int __log_printf_int __P((ENV *, DB_TXN *, const char *, va_list));
+
+/*
+ * __log_printf_capi --
+ * Write a printf-style format string into the DB log.
+ *
+ * PUBLIC: int __log_printf_capi __P((DB_ENV *, DB_TXN *, const char *, ...))
+ * PUBLIC: __attribute__ ((__format__ (__printf__, 3, 4)));
+ */
+int
+#ifdef STDC_HEADERS
+__log_printf_capi(DB_ENV *dbenv, DB_TXN *txnid, const char *fmt, ...)
+#else
+__log_printf_capi(dbenv, txnid, fmt, va_alist)
+ DB_ENV *dbenv;
+ DB_TXN *txnid;
+ const char *fmt;
+ va_dcl
+#endif
+{
+ va_list ap;
+ int ret;
+
+#ifdef STDC_HEADERS
+ va_start(ap, fmt);
+#else
+ va_start(ap);
+#endif
+ ret = __log_printf_pp(dbenv, txnid, fmt, ap);
+ va_end(ap);
+
+ return (ret);
+}
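+
+/*
+ * A minimal usage sketch (editor's example; "dbname" and "now" are
+ * hypothetical variables): write a diagnostic record into the log
+ * outside any transaction; the string is formatted into a DIAGNOSTIC
+ * debug record by __log_printf_int below.
+ *
+ *	(void)dbenv->log_printf(dbenv, NULL,
+ *	    "backup of %s finished at %lu", dbname, (u_long)now);
+ */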
+
+/*
+ * __log_printf_pp --
+ * Handle the arguments and call an internal routine to do the work.
+ *
+ * The reason this routine isn't just folded into __log_printf_capi
+ * is that the C++ API has to call a C API routine, and you can
+ * only pass variadic arguments to a single routine.
+ *
+ * PUBLIC: int __log_printf_pp
+ * PUBLIC: __P((DB_ENV *, DB_TXN *, const char *, va_list));
+ */
+int
+__log_printf_pp(dbenv, txnid, fmt, ap)
+ DB_ENV *dbenv;
+ DB_TXN *txnid;
+ const char *fmt;
+ va_list ap;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->lg_handle, "DB_ENV->log_printf", DB_INIT_LOG);
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__log_printf_int(env, txnid, fmt, ap)), 0, ret);
+ va_end(ap);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __log_printf --
+ * Write a printf-style format string into the DB log.
+ *
+ * PUBLIC: int __log_printf __P((ENV *, DB_TXN *, const char *, ...))
+ * PUBLIC: __attribute__ ((__format__ (__printf__, 3, 4)));
+ */
+int
+#ifdef STDC_HEADERS
+__log_printf(ENV *env, DB_TXN *txnid, const char *fmt, ...)
+#else
+__log_printf(env, txnid, fmt, va_alist)
+ ENV *env;
+ DB_TXN *txnid;
+ const char *fmt;
+ va_dcl
+#endif
+{
+ va_list ap;
+ int ret;
+
+#ifdef STDC_HEADERS
+ va_start(ap, fmt);
+#else
+ va_start(ap);
+#endif
+ ret = __log_printf_int(env, txnid, fmt, ap);
+ va_end(ap);
+
+ return (ret);
+}
+
+/*
+ * __log_printf_int --
+ * Write a printf-style format string into the DB log (internal).
+ */
+static int
+__log_printf_int(env, txnid, fmt, ap)
+ ENV *env;
+ DB_TXN *txnid;
+ const char *fmt;
+ va_list ap;
+{
+ DBT opdbt, msgdbt;
+ DB_LSN lsn;
+ char __logbuf[2048]; /* !!!: END OF THE STACK DON'T TRUST SPRINTF. */
+
+ if (!DBENV_LOGGING(env)) {
+ __db_errx(env, DB_STR("2510",
+ "Logging not currently permitted"));
+ return (EAGAIN);
+ }
+
+ memset(&opdbt, 0, sizeof(opdbt));
+ opdbt.data = "DIAGNOSTIC";
+ opdbt.size = sizeof("DIAGNOSTIC") - 1;
+
+ memset(&msgdbt, 0, sizeof(msgdbt));
+ msgdbt.data = __logbuf;
+ msgdbt.size = (u_int32_t)vsnprintf(__logbuf, sizeof(__logbuf), fmt, ap);
+
+ return (__db_debug_log(
+ env, txnid, &lsn, 0, &opdbt, -1, &msgdbt, NULL, 0));
+}
diff --git a/src/log/log_get.c b/src/log/log_get.c
new file mode 100644
index 00000000..db30c969
--- /dev/null
+++ b/src/log/log_get.c
@@ -0,0 +1,1626 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/hmac.h"
+#include "dbinc/log.h"
+#include "dbinc/hash.h"
+
+typedef enum { L_ALREADY, L_ACQUIRED, L_NONE } RLOCK;
+
+static int __logc_close_pp __P((DB_LOGC *, u_int32_t));
+static int __logc_get_pp __P((DB_LOGC *, DB_LSN *, DBT *, u_int32_t));
+static int __logc_get_int __P((DB_LOGC *, DB_LSN *, DBT *, u_int32_t));
+static int __logc_hdrchk __P((DB_LOGC *, DB_LSN *, HDR *, int *));
+static int __logc_incursor __P((DB_LOGC *, DB_LSN *, HDR *, u_int8_t **));
+static int __logc_inregion __P((DB_LOGC *,
+ DB_LSN *, RLOCK *, DB_LSN *, HDR *, u_int8_t **, int *));
+static int __logc_io __P((DB_LOGC *,
+ u_int32_t, u_int32_t, void *, size_t *, int *));
+static int __logc_ondisk __P((DB_LOGC *,
+ DB_LSN *, DB_LSN *, u_int32_t, HDR *, u_int8_t **, int *));
+static int __logc_set_maxrec __P((DB_LOGC *, char *));
+static int __logc_shortread __P((DB_LOGC *, DB_LSN *, int));
+static int __logc_version_pp __P((DB_LOGC *, u_int32_t *, u_int32_t));
+
+/*
+ * __log_cursor_pp --
+ * ENV->log_cursor
+ *
+ * PUBLIC: int __log_cursor_pp __P((DB_ENV *, DB_LOGC **, u_int32_t));
+ */
+int
+__log_cursor_pp(dbenv, logcp, flags)
+ DB_ENV *dbenv;
+ DB_LOGC **logcp;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->lg_handle, "DB_ENV->log_cursor", DB_INIT_LOG);
+
+ /* Validate arguments. */
+ if ((ret = __db_fchk(env, "DB_ENV->log_cursor", flags, 0)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__log_cursor(env, logcp)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __log_cursor --
+ * Create a log cursor.
+ *
+ * PUBLIC: int __log_cursor __P((ENV *, DB_LOGC **));
+ */
+int
+__log_cursor(env, logcp)
+ ENV *env;
+ DB_LOGC **logcp;
+{
+ DB_LOGC *logc;
+ int ret;
+
+ *logcp = NULL;
+
+ /* Allocate memory for the cursor. */
+ if ((ret = __os_calloc(env, 1, sizeof(DB_LOGC), &logc)) != 0)
+ return (ret);
+
+ logc->bp_size = LG_CURSOR_BUF_SIZE;
+ /*
+	 * Start with a positive maximum record size; __logc_set_maxrec
+	 * will refine it once a log file has been examined.
+ */
+ logc->bp_maxrec = MEGABYTE;
+ if ((ret = __os_malloc(env, logc->bp_size, &logc->bp)) != 0) {
+ __os_free(env, logc);
+ return (ret);
+ }
+
+ logc->env = env;
+ logc->close = __logc_close_pp;
+ logc->get = __logc_get_pp;
+ logc->version = __logc_version_pp;
+
+ *logcp = logc;
+ return (0);
+}
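+
+/*
+ * A minimal usage sketch (editor's example; error handling omitted):
+ * walk every record in the log from oldest to newest.  On DB_NOTFOUND
+ * the LSN still holds the last record successfully returned, as the
+ * comment in __logc_get below explains.
+ *
+ *	DB_LOGC *logc;
+ *	DB_LSN lsn;
+ *	DBT rec;
+ *
+ *	memset(&rec, 0, sizeof(rec));
+ *	if (dbenv->log_cursor(dbenv, &logc, 0) == 0) {
+ *		while (logc->get(logc, &lsn, &rec, DB_NEXT) == 0)
+ *			printf("[%lu][%lu] %lu bytes\n", (u_long)lsn.file,
+ *			    (u_long)lsn.offset, (u_long)rec.size);
+ *		(void)logc->close(logc, 0);
+ *	}
+ */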
+
+/*
+ * __logc_close_pp --
+ * DB_LOGC->close pre/post processing.
+ */
+static int
+__logc_close_pp(logc, flags)
+ DB_LOGC *logc;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = logc->env;
+
+ if ((ret = __db_fchk(env, "DB_LOGC->close", flags, 0)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__logc_close(logc)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __logc_close --
+ * DB_LOGC->close.
+ *
+ * PUBLIC: int __logc_close __P((DB_LOGC *));
+ */
+int
+__logc_close(logc)
+ DB_LOGC *logc;
+{
+ ENV *env;
+
+ env = logc->env;
+
+ if (logc->fhp != NULL) {
+ (void)__os_closehandle(env, logc->fhp);
+ logc->fhp = NULL;
+ }
+
+ if (logc->dbt.data != NULL)
+ __os_free(env, logc->dbt.data);
+
+ __os_free(env, logc->bp);
+ __os_free(env, logc);
+
+ return (0);
+}
+
+/*
+ * __logc_version_pp --
+ * DB_LOGC->version.
+ */
+static int
+__logc_version_pp(logc, versionp, flags)
+ DB_LOGC *logc;
+ u_int32_t *versionp;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = logc->env;
+
+ if ((ret = __db_fchk(env, "DB_LOGC->version", flags, 0)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__logc_version(logc, versionp)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __logc_version --
+ * DB_LOGC->version.
+ *
+ * PUBLIC: int __logc_version __P((DB_LOGC *, u_int32_t *));
+ */
+int
+__logc_version(logc, versionp)
+ DB_LOGC *logc;
+ u_int32_t *versionp;
+{
+ DBT hdrdbt;
+ DB_LOGC *plogc;
+ DB_LSN plsn;
+ ENV *env;
+ LOGP *persist;
+ int ret, t_ret;
+
+ env = logc->env;
+ if (IS_ZERO_LSN(logc->lsn)) {
+ __db_errx(env, DB_STR("2574", "DB_LOGC->get: unset cursor"));
+ return (EINVAL);
+ }
+ ret = 0;
+ /*
+ * Check if the persist info we have is for the same file
+ * as the current cursor position. If we already have the
+ * information, then we're done. If not, we open a new
+ * log cursor and get the header.
+ *
+ * Since most users walk forward through the log when
+	 * using this feature (e.g., printlog), we're likely to
+ * have the information we need.
+ */
+ if (logc->lsn.file != logc->p_lsn.file) {
+ if ((ret = __log_cursor(env, &plogc)) != 0)
+ return (ret);
+ plsn.file = logc->lsn.file;
+ plsn.offset = 0;
+ plogc->lsn = plsn;
+ memset(&hdrdbt, 0, sizeof(DBT));
+ if ((ret = __logc_get_int(plogc,
+ &plsn, &hdrdbt, DB_SET)) == 0) {
+ persist = (LOGP *)hdrdbt.data;
+ if (LOG_SWAPPED(env))
+ __log_persistswap(persist);
+ logc->p_lsn = logc->lsn;
+ logc->p_version = persist->version;
+ }
+ if ((t_ret = __logc_close(plogc)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ /* Return the version. */
+ if (ret == 0)
+ *versionp = logc->p_version;
+ return (ret);
+}
+
+/*
+ * __logc_get_pp --
+ * DB_LOGC->get pre/post processing.
+ */
+static int
+__logc_get_pp(logc, alsn, dbt, flags)
+ DB_LOGC *logc;
+ DB_LSN *alsn;
+ DBT *dbt;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = logc->env;
+
+ /* Validate arguments. */
+ switch (flags) {
+ case DB_CURRENT:
+ case DB_FIRST:
+ case DB_LAST:
+ case DB_NEXT:
+ case DB_PREV:
+ break;
+ case DB_SET:
+ if (IS_ZERO_LSN(*alsn)) {
+ __db_errx(env, DB_STR_A("2575",
+ "DB_LOGC->get: invalid LSN: %lu/%lu", "%lu %lu"),
+ (u_long)alsn->file, (u_long)alsn->offset);
+ return (EINVAL);
+ }
+ break;
+ default:
+ return (__db_ferr(env, "DB_LOGC->get", 1));
+ }
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__logc_get(logc, alsn, dbt, flags)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __logc_get --
+ * DB_LOGC->get.
+ *
+ * PUBLIC: int __logc_get __P((DB_LOGC *, DB_LSN *, DBT *, u_int32_t));
+ */
+int
+__logc_get(logc, alsn, dbt, flags)
+ DB_LOGC *logc;
+ DB_LSN *alsn;
+ DBT *dbt;
+ u_int32_t flags;
+{
+ DB_LSN saved_lsn;
+ ENV *env;
+ LOGP *persist;
+ int ret;
+
+ env = logc->env;
+
+ /*
+ * On error, we take care not to overwrite the caller's LSN. This
+ * is because callers looking for the end of the log loop using the
+ * DB_NEXT flag, and expect to take the last successful lsn out of
+ * the passed-in structure after DB_LOGC->get fails with DB_NOTFOUND.
+ *
+ * !!!
+ * This line is often flagged an uninitialized memory read during a
+ * Purify or similar tool run, as the application didn't initialize
+ * *alsn. If the application isn't setting the DB_SET flag, there is
+ * no reason it should have initialized *alsn, but we can't know that
+ * and we want to make sure we never overwrite whatever the application
+ * put in there.
+ */
+ saved_lsn = *alsn;
+ /*
+ * If we get one of the log's header records as a result of doing a
+	 * DB_FIRST, DB_NEXT, DB_LAST or DB_PREV, repeat the operation; log
+ * file header records aren't useful to applications.
+ */
+ if ((ret = __logc_get_int(logc, alsn, dbt, flags)) != 0) {
+ *alsn = saved_lsn;
+ return (ret);
+ }
+ /*
+	 * The DBT was populated by the call to __logc_get_int; copy the data
+ * out of DB_DBT_USERMEM space if it is there.
+ */
+ if ((ret = __dbt_usercopy(env, dbt)) != 0)
+ return (ret);
+
+ if (alsn->offset == 0 && (flags == DB_FIRST ||
+ flags == DB_NEXT || flags == DB_LAST || flags == DB_PREV)) {
+ switch (flags) {
+ case DB_FIRST:
+ flags = DB_NEXT;
+ break;
+ case DB_LAST:
+ flags = DB_PREV;
+ break;
+ case DB_NEXT:
+ case DB_PREV:
+ default:
+ break;
+ }
+ /*
+		 * If we're walking the log and we find a persist header,
+		 * store it so that we may use it later if needed.
+ */
+ persist = (LOGP *)dbt->data;
+ if (LOG_SWAPPED(env))
+ __log_persistswap(persist);
+ logc->p_lsn = *alsn;
+ logc->p_version = persist->version;
+ if (F_ISSET(dbt, DB_DBT_MALLOC)) {
+ __os_free(env, dbt->data);
+ dbt->data = NULL;
+ }
+ if ((ret = __logc_get_int(logc, alsn, dbt, flags)) != 0) {
+ *alsn = saved_lsn;
+ goto err;
+ }
+ }
+
+err: __dbt_userfree(env, dbt, NULL, NULL);
+ return (ret);
+}
+
+/*
+ * __logc_get_int --
+ * Get a log record; internal version.
+ */
+static int
+__logc_get_int(logc, alsn, dbt, flags)
+ DB_LOGC *logc;
+ DB_LSN *alsn;
+ DBT *dbt;
+ u_int32_t flags;
+{
+ DB_CIPHER *db_cipher;
+ DB_LOG *dblp;
+ DB_LSN last_lsn, nlsn;
+ ENV *env;
+ HDR hdr;
+ LOG *lp;
+ RLOCK rlock;
+ logfile_validity status;
+ u_int32_t cnt, logfsz, orig_flags;
+ u_int8_t *rp;
+ int eof, is_hmac, need_cksum, ret;
+ size_t blen;
+#ifdef HAVE_LOG_CHECKSUM
+ u_int32_t i, logtype, version;
+ char chksumbuf[256];
+ u_int8_t ch;
+#endif
+
+ env = logc->env;
+ db_cipher = env->crypto_handle;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ eof = is_hmac = 0;
+ orig_flags = flags; /* flags may be altered later. */
+ blen = 0;
+ logfsz = lp->persist.log_size;
+
+ /*
+ * We don't acquire the log region lock until we need it, and we
+ * release it as soon as we're done.
+ */
+ rlock = F_ISSET(logc, DB_LOG_LOCKED) ? L_ALREADY : L_NONE;
+
+#ifdef HAVE_LOG_CHECKSUM
+nextrec:
+#endif
+ nlsn = logc->lsn;
+ switch (flags) {
+ case DB_NEXT: /* Next log record. */
+ if (!IS_ZERO_LSN(nlsn)) {
+ /* Increment the cursor by the cursor record size. */
+ nlsn.offset += logc->len;
+ break;
+ }
+ flags = DB_FIRST;
+ /* FALLTHROUGH */
+ case DB_FIRST: /* First log record. */
+ /* Find the first log file. */
+ if ((ret = __log_find(dblp, 1, &cnt, &status)) != 0)
+ goto err;
+
+ /*
+ * DB_LV_INCOMPLETE:
+ * Theoretically, the log file we want could be created
+ * but not yet written, the "first" log record must be
+ * in the log buffer.
+ * DB_LV_NORMAL:
+ * DB_LV_OLD_READABLE:
+ * We found a log file we can read.
+ * DB_LV_NONEXISTENT:
+ * No log files exist, the "first" log record must be in
+ * the log buffer.
+ * DB_LV_OLD_UNREADABLE:
+ * No readable log files exist, we're at the cross-over
+ * point between two versions. The "first" log record
+ * must be in the log buffer.
+ */
+ switch (status) {
+ case DB_LV_INCOMPLETE:
+ DB_ASSERT(env, lp->lsn.file == cnt);
+ /* FALLTHROUGH */
+ case DB_LV_NORMAL:
+ case DB_LV_OLD_READABLE:
+ nlsn.file = cnt;
+ break;
+ case DB_LV_NONEXISTENT:
+ nlsn.file = 1;
+ DB_ASSERT(env, lp->lsn.file == nlsn.file);
+ break;
+ case DB_LV_OLD_UNREADABLE:
+ nlsn.file = cnt + 1;
+ DB_ASSERT(env, lp->lsn.file == nlsn.file);
+ break;
+ }
+ nlsn.offset = 0;
+ break;
+ case DB_CURRENT: /* Current log record. */
+ break;
+ case DB_PREV: /* Previous log record. */
+ if (!IS_ZERO_LSN(nlsn)) {
+ /* If at start-of-file, move to the previous file. */
+ if (nlsn.offset == 0) {
+ if (nlsn.file == 1) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ if ((!lp->db_log_inmemory &&
+ (__log_valid(dblp, nlsn.file - 1, 0, NULL,
+ 0, &status, NULL) != 0 ||
+ (status != DB_LV_NORMAL &&
+ status != DB_LV_OLD_READABLE)))) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+
+ --nlsn.file;
+ }
+ nlsn.offset = logc->prev;
+ break;
+ }
+ /* FALLTHROUGH */
+ case DB_LAST: /* Last log record. */
+ if (rlock == L_NONE) {
+ rlock = L_ACQUIRED;
+ LOG_SYSTEM_LOCK(env);
+ }
+ nlsn.file = lp->lsn.file;
+ nlsn.offset = lp->lsn.offset - lp->len;
+ break;
+ case DB_SET: /* Set log record. */
+ nlsn = *alsn;
+ break;
+ default:
+ ret = __db_unknown_path(env, "__logc_get_int");
+ goto err;
+ }
+
+ if (0) { /* Move to the next file. */
+next_file: ++nlsn.file;
+ nlsn.offset = 0;
+ }
+
+ /*
+ * The above switch statement should have set nlsn to the lsn of
+ * the requested record.
+ */
+
+ if (CRYPTO_ON(env)) {
+ hdr.size = HDR_CRYPTO_SZ;
+ is_hmac = 1;
+ } else {
+ hdr.size = HDR_NORMAL_SZ;
+ is_hmac = 0;
+ }
+
+ /*
+ * Check to see if the record is in the cursor's buffer -- if so,
+ * we'll need to checksum it.
+ */
+ if ((ret = __logc_incursor(logc, &nlsn, &hdr, &rp)) != 0)
+ goto err;
+ if (rp != NULL)
+ goto cksum;
+
+ /*
+ * Look to see if we're moving backward in the log with the last record
+ * coming from the disk -- it means the record can't be in the region's
+ * buffer. Else, check the region's buffer.
+ *
+ * If the record isn't in the region's buffer, then either logs are
+ * in-memory, and we're done, or we're going to have to read the
+ * record from disk. We want to make a point of not reading past the
+ * end of the logical log (after recovery, there may be data after the
+ * end of the logical log, not to mention the log file may have been
+ * pre-allocated). So, zero out last_lsn, and initialize it inside
+ * __logc_inregion -- if it's still zero when we check it in
+ * __logc_ondisk, that's OK, it just means the logical end of the log
+ * isn't an issue for this request.
+ */
+ ZERO_LSN(last_lsn);
+ if (!F_ISSET(logc, DB_LOG_DISK) ||
+ LOG_COMPARE(&nlsn, &logc->lsn) > 0) {
+ F_CLR(logc, DB_LOG_DISK);
+
+ if ((ret = __logc_inregion(logc,
+ &nlsn, &rlock, &last_lsn, &hdr, &rp, &need_cksum)) != 0)
+ goto err;
+ if (rp != NULL) {
+ /*
+ * If we read the entire record from the in-memory log
+ * buffer, we don't need to checksum it, nor do we need
+ * to worry about vtruncate issues.
+ */
+ if (need_cksum)
+ goto cksum;
+ goto from_memory;
+ }
+ if (lp->db_log_inmemory)
+ goto nohdr;
+ }
+
+ /*
+ * We have to read from an on-disk file to retrieve the record.
+ * If we ever can't retrieve the record at offset 0, we're done,
+ * return EOF/DB_NOTFOUND.
+ *
+ * Discard the region lock if we're still holding it, the on-disk
+ * reading routines don't need it.
+ */
+ if (rlock == L_ACQUIRED) {
+ rlock = L_NONE;
+ LOG_SYSTEM_UNLOCK(env);
+ }
+ if ((ret = __logc_ondisk(
+ logc, &nlsn, &last_lsn, flags, &hdr, &rp, &eof)) != 0)
+ goto err;
+
+ /*
+ * If we got a 0-length record, that means we're in the midst of some
+	 * bytes that got 0'd as the result of a vtruncate.  In that case, or
+	 * at the end of a file with DB_NEXT, we're going to have to retry.
+ */
+ if (eof || hdr.len == 0) {
+nohdr: switch (flags) {
+ case DB_LAST:
+ case DB_PREV:
+ /*
+ * We should never get here. If we recover a log
+ * file with 0's at the end, we'll treat the 0'd
+ * headers as the end of log and ignore them. If
+ * we're reading backwards from another file, then
+ * the first record in that new file should have its
+ * prev field set correctly.
+ * First check that the file exists.
+ */
+ if (eof && logc->bp_lsn.file != nlsn.file)
+ __db_errx(env, DB_STR_A("2583",
+ "Log file %d not found, check log directory configuration", "%d"),
+ nlsn.file);
+ else
+ __db_errx(env, DB_STR("2576",
+ "Encountered zero length records while traversing backwards"));
+ ret = __env_panic(env, DB_RUNRECOVERY);
+ goto err;
+ case DB_FIRST:
+ case DB_NEXT:
+ /*
+ * Zero'd records always indicate the end of a file,
+ * but only go to the next file once.
+ */
+ if (nlsn.offset != 0)
+ goto next_file;
+ /* FALLTHROUGH */
+ case DB_SET:
+ default:
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ }
+
+ F_SET(logc, DB_LOG_DISK);
+
+cksum: /*
+ * Discard the region lock if we're still holding it. (The path to
+ * get here is we acquired the region lock because of the caller's
+ * flag argument, but we found the record in the in-memory or cursor
+ * buffers. Improbable, but it's easy to avoid.)
+ */
+ if (rlock == L_ACQUIRED) {
+ rlock = L_NONE;
+ LOG_SYSTEM_UNLOCK(env);
+ }
+#ifdef HAVE_LOG_CHECKSUM
+ /*
+ * Checksum: there are two types of errors -- a configuration error
+ * or a checksum mismatch. The former is always bad. The latter is
+ * OK if we're searching for the end of the log, and very, very bad
+ * if we're reading random log records.
+ */
+ if ((ret = __db_check_chksum(env, &hdr, db_cipher,
+ hdr.chksum, rp + hdr.size, hdr.len - hdr.size, is_hmac)) != 0) {
+ /*
+ * This might be a log whose checksum does not include the hdr.
+ * Try again without the header, either for logs whose version
+ * is pre-DB_LOGCHKSUM, or for the persist record which contains
+ * the log version. Check for the zero offset first to avoid
+ * unwanted recursion in __logc_version().
+ *
+ * Set the cursor to the LSN we are trying to look at.
+ */
+ last_lsn = logc->lsn;
+ logc->lsn = nlsn;
+ if ((logc->lsn.offset == 0 ||
+ (__logc_version(logc, &version) == 0 &&
+ version < DB_LOGCHKSUM)) &&
+ __db_check_chksum(env, NULL, db_cipher, hdr.chksum,
+ rp + hdr.size, hdr.len - hdr.size, is_hmac) == 0) {
+ logc->lsn = last_lsn;
+ goto from_memory;
+ }
+
+ /*
+ * If we are iterating logs during log verification and basic
+		 * header info is correct, we can skip the failed log record
+		 * and go on to the next one.
+ */
+ if (F_ISSET(logc->env->lg_handle, DBLOG_VERIFYING) &&
+ (orig_flags == DB_FIRST || orig_flags == DB_LAST ||
+ orig_flags == DB_PREV || orig_flags == DB_NEXT) &&
+ hdr.size > 0 && hdr.len > hdr.size && hdr.len < logfsz &&
+ (((flags == DB_FIRST || flags == DB_NEXT) &&
+ hdr.prev == last_lsn.offset) ||
+ ((flags == DB_PREV || flags == DB_LAST) &&
+ last_lsn.offset - hdr.len == nlsn.offset))) {
+
+ flags = orig_flags;
+
+ logc->lsn = nlsn;
+ logc->len = hdr.len;
+ logc->prev = hdr.prev;
+
+ if (flags == DB_LAST)
+ flags = DB_PREV;
+ else if (flags == DB_FIRST)
+ flags = DB_NEXT;
+
+ memset(chksumbuf, 0, 256);
+ blen = 0;
+ for (i = 0; i < DB_MAC_KEY && blen < 256; i++) {
+ ch = hdr.chksum[i];
+ blen = strlen(chksumbuf);
+ snprintf(chksumbuf + blen, 255 - blen,
+ isprint(ch) ||
+ ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ /* Type field is always the first one in the record. */
+ memcpy(&logtype, rp + hdr.size, sizeof(logtype));
+ __db_errx(env, DB_STR_A("2577",
+ "DB_LOGC->get: log record LSN %lu/%lu: "
+ "checksum mismatch, hdr.chksum: %s, hdr.prev: %u, "
+ "hdr.len: %u, log type: %u. Skipping it and "
+ "continuing with the %s one",
+ "%lu %lu %s %u %u %u %s"),
+ (u_long)nlsn.file, (u_long)nlsn.offset, chksumbuf,
+ hdr.prev, hdr.len, logtype, flags == DB_NEXT ?
+ DB_STR_P("next") : DB_STR_P("previous"));
+ goto nextrec;
+ }
+
+ if (F_ISSET(logc, DB_LOG_SILENT_ERR)) {
+ if (ret == -1)
+ ret = EIO;
+ } else if (ret == -1) {
+ __db_errx(env, DB_STR_A("2578",
+ "DB_LOGC->get: log record LSN %lu/%lu: checksum mismatch",
+ "%lu %lu"), (u_long)nlsn.file, (u_long)nlsn.offset);
+ __db_errx(env, DB_STR("2579",
+ "DB_LOGC->get: catastrophic recovery may be required"));
+ ret = __env_panic(env, DB_RUNRECOVERY);
+ }
+ logc->lsn = last_lsn;
+ goto err;
+ }
+#endif
+
+from_memory:
+ /*
+ * Discard the region lock if we're still holding it. (The path to
+ * get here is we acquired the region lock because of the caller's
+ * flag argument, but we found the record in the in-memory or cursor
+ * buffers. Improbable, but it's easy to avoid.)
+ */
+ if (rlock == L_ACQUIRED) {
+ rlock = L_NONE;
+ LOG_SYSTEM_UNLOCK(env);
+ }
+
+ /* Copy the record into the user's DBT. */
+ if ((ret = __db_retcopy(env, dbt, rp + hdr.size,
+ (u_int32_t)(hdr.len - hdr.size),
+ &logc->dbt.data, &logc->dbt.ulen)) != 0)
+ goto err;
+
+ if (CRYPTO_ON(env)) {
+ if ((ret = db_cipher->decrypt(env, db_cipher->data,
+ hdr.iv, dbt->data, hdr.len - hdr.size)) != 0) {
+ ret = EAGAIN;
+ goto err;
+ }
+ /*
+ * Return the original log record size to the user,
+ * even though we've allocated more than that, possibly.
+ * The log record is decrypted in the user dbt, not in
+ * the buffer, so we must do this here after decryption,
+ * not adjust the len passed to the __db_retcopy call.
+ */
+ dbt->size = hdr.orig_size;
+ }
+
+ /* Update the cursor and the returned LSN. */
+ *alsn = nlsn;
+ logc->lsn = nlsn;
+ logc->len = hdr.len;
+ logc->prev = hdr.prev;
+
+err: if (rlock == L_ACQUIRED)
+ LOG_SYSTEM_UNLOCK(env);
+
+ return (ret);
+}
+
+/*
+ * __logc_incursor --
+ * Check to see if the requested record is in the cursor's buffer.
+ */
+static int
+__logc_incursor(logc, lsn, hdr, pp)
+ DB_LOGC *logc;
+ DB_LSN *lsn;
+ HDR *hdr;
+ u_int8_t **pp;
+{
+ ENV *env;
+ u_int8_t *p;
+ int eof;
+
+ env = logc->env;
+ *pp = NULL;
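+	/*
+	 * On a buffer hit, *pp is set to the record; a 0 return with *pp
+	 * still NULL means the record is not entirely in the cursor's
+	 * buffer and the caller must look elsewhere.
+	 */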
+
+ /*
+ * Test to see if the requested LSN could be part of the cursor's
+ * buffer.
+ *
+ * The record must be part of the same file as the cursor's buffer.
+ * The record must start at a byte offset equal to or greater than
+ * the cursor buffer.
+ * The record must not start at a byte offset after the cursor
+ * buffer's end.
+ */
+ if (logc->bp_lsn.file != lsn->file)
+ return (0);
+ if (logc->bp_lsn.offset > lsn->offset)
+ return (0);
+ if (logc->bp_lsn.offset + logc->bp_rlen <= lsn->offset + hdr->size)
+ return (0);
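+	/*
+	 * In interval terms (an editorial sketch, not in the original
+	 * comments): the buffer covers [bp_lsn.offset, bp_lsn.offset +
+	 * bp_rlen), and [lsn->offset, lsn->offset + hdr->size) must lie
+	 * inside it before we trust the header bytes read below.
+	 */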
+
+ /*
+ * Read the record's header and check if the record is entirely held
+ * in the buffer. If the record is not entirely held, get it again.
+ * (The only advantage in having part of the record locally is that
+ * we might avoid a system call because we already have the HDR in
+ * memory.)
+ *
+ * If the header check fails for any reason, it must be because the
+ * LSN is bogus. Fail hard.
+ */
+ p = logc->bp + (lsn->offset - logc->bp_lsn.offset);
+ memcpy(hdr, p, hdr->size);
+ if (LOG_SWAPPED(env))
+ __log_hdrswap(hdr, CRYPTO_ON(env));
+ if (__logc_hdrchk(logc, lsn, hdr, &eof))
+ return (DB_NOTFOUND);
+ if (eof || logc->bp_lsn.offset + logc->bp_rlen < lsn->offset + hdr->len)
+ return (0);
+
+ *pp = p; /* Success. */
+
+ return (0);
+}
+
+/*
+ * __logc_inregion --
+ * Check to see if the requested record is in the region's buffer.
+ */
+static int
+__logc_inregion(logc, lsn, rlockp, last_lsn, hdr, pp, need_cksump)
+ DB_LOGC *logc;
+ DB_LSN *lsn, *last_lsn;
+ RLOCK *rlockp;
+ HDR *hdr;
+ u_int8_t **pp;
+ int *need_cksump;
+{
+ DB_LOG *dblp;
+ ENV *env;
+ LOG *lp;
+ size_t b_region, len, nr;
+ u_int32_t b_disk;
+ int eof, ret;
+ u_int8_t *p;
+
+ env = logc->env;
+ dblp = env->lg_handle;
+ lp = env->lg_handle->reginfo.primary;
+
+ ret = 0;
+ b_region = 0;
+ *pp = NULL;
+ *need_cksump = 0;
+
+ /* If we haven't yet acquired the log region lock, do so. */
+ if (*rlockp == L_NONE) {
+ *rlockp = L_ACQUIRED;
+ LOG_SYSTEM_LOCK(env);
+ }
+
+ /*
+	 * The routines that read from disk must avoid reading past the
+	 * logical end of the log, so pass that information back to them.
+ *
+ * Since they're reading directly from the disk, they must also avoid
+ * reading past the offset we've written out. If the log was
+ * truncated, it's possible that there are zeroes or garbage on
+ * disk after this offset, and the logical end of the log can
+ * come later than this point if the log buffer isn't empty.
+ */
+ *last_lsn = lp->lsn;
+ if (!lp->db_log_inmemory && last_lsn->offset > lp->w_off)
+ last_lsn->offset = lp->w_off;
+
+ /*
+ * Test to see if the requested LSN could be part of the region's
+ * buffer.
+ *
+	 * During recovery, we read the log files to get the information
+	 * needed to initialize the region.  In that case, the region's lsn
+	 * field will not yet have been filled in, so use only the disk.
+ *
+ * The record must not start at a byte offset after the region buffer's
+ * end, since that means the request is for a record after the end of
+ * the log. Do this test even if the region's buffer is empty -- after
+ * recovery, the log files may continue past the declared end-of-log,
+ * and the disk reading routine will incorrectly attempt to read the
+ * remainder of the log.
+ *
+ * Otherwise, test to see if the region's buffer actually has what we
+ * want:
+ *
+ * The buffer must have some useful content.
+ * The record must be in the same file as the region's buffer and must
+ * start at a byte offset equal to or greater than the region's buffer.
+ */
+ if (IS_ZERO_LSN(lp->lsn))
+ return (0);
+ if (LOG_COMPARE(lsn, &lp->lsn) >= 0)
+ return (DB_NOTFOUND);
+ else if (lp->db_log_inmemory) {
+ if ((ret = __log_inmem_lsnoff(dblp, lsn, &b_region)) != 0)
+ return (ret);
+ } else if (lp->b_off == 0 || LOG_COMPARE(lsn, &lp->f_lsn) < 0)
+ return (0);
+
+ /*
+	 * The current contents of the cursor's buffer will be useless for a
+	 * future call: we're about to overwrite it.  Trash it rather than
+	 * trying to make it look correct.
+ */
+ logc->bp_rlen = 0;
+
+ /*
+	 * If the requested LSN is greater than the region buffer's first
+	 * byte, then for a good LSN we know the entire record is in the
+	 * buffer.
+	 *
+	 * If we're given a bad LSN, the "entire" record might not be in
+	 * the buffer, in which case the checksum check will fail.
+	 * __logc_hdrchk made sure our destination buffer is big enough,
+	 * via bp_maxrec, but we also need to make sure we don't run off
+	 * the end of this buffer, the source.
+ *
+ * There is one case where the header check can fail: on a scan through
+ * in-memory logs, when we reach the end of a file we can read an empty
+ * header. In that case, it's safe to return zero, here: it will be
+ * caught in our caller. Otherwise, the LSN is bogus. Fail hard.
+ */
+ if (lp->db_log_inmemory || LOG_COMPARE(lsn, &lp->f_lsn) > 0) {
+ if (!lp->db_log_inmemory)
+ b_region = lsn->offset - lp->w_off;
+ __log_inmem_copyout(dblp, b_region, hdr, hdr->size);
+ if (LOG_SWAPPED(env))
+ __log_hdrswap(hdr, CRYPTO_ON(env));
+ if (__logc_hdrchk(logc, lsn, hdr, &eof) != 0)
+ return (DB_NOTFOUND);
+ if (eof)
+ return (0);
+ if (lp->db_log_inmemory) {
+ if (RINGBUF_LEN(lp, b_region, lp->b_off) < hdr->len)
+ return (DB_NOTFOUND);
+ } else if (lsn->offset + hdr->len > lp->w_off + lp->buffer_size)
+ return (DB_NOTFOUND);
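+		/*
+		 * Grow the cursor buffer to twice the record length,
+		 * rounded up to a 128-byte boundary.
+		 */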
+ if (logc->bp_size <= hdr->len) {
+ len = (size_t)DB_ALIGN((uintmax_t)hdr->len * 2, 128);
+ if ((ret =
+ __os_realloc(logc->env, len, &logc->bp)) != 0)
+ return (ret);
+ logc->bp_size = (u_int32_t)len;
+ }
+ __log_inmem_copyout(dblp, b_region, logc->bp, hdr->len);
+ *pp = logc->bp;
+ return (0);
+ }
+
+ DB_ASSERT(env, !lp->db_log_inmemory);
+
+ /*
+ * There's a partial record, that is, the requested record starts
+ * in a log file and finishes in the region buffer. We have to
+ * find out how many bytes of the record are in the region buffer
+ * so we can copy them out into the cursor buffer. First, check
+ * to see if the requested record is the only record in the region
+ * buffer, in which case we should copy the entire region buffer.
+ *
+ * Else, walk back through the region's buffer to find the first LSN
+ * after the record that crosses the buffer boundary -- we can detect
+ * that LSN, because its "prev" field will reference the record we
+ * want. The bytes we need to copy from the region buffer are the
+ * bytes up to the record we find. The bytes we'll need to allocate
+ * to hold the log record are the bytes between the two offsets.
+ */
+ b_disk = lp->w_off - lsn->offset;
+ if (lp->b_off <= lp->len)
+ b_region = (u_int32_t)lp->b_off;
+ else
+ for (p = dblp->bufp + (lp->b_off - lp->len);;) {
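+			/*
+			 * p starts at the header of the last record in the
+			 * buffer; each hdr.prev link steps one record back
+			 * until we find the record following our target.
+			 */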
+ memcpy(hdr, p, hdr->size);
+ if (LOG_SWAPPED(env))
+ __log_hdrswap(hdr, CRYPTO_ON(env));
+ if (hdr->prev == lsn->offset) {
+ b_region = (u_int32_t)(p - dblp->bufp);
+ break;
+ }
+ p = dblp->bufp + (hdr->prev - lp->w_off);
+ }
+
+ /*
+ * If we don't have enough room for the record, we have to allocate
+ * space. We have to do it while holding the region lock, which is
+ * truly annoying, but there's no way around it. This call is why
+ * we allocate cursor buffer space when allocating the cursor instead
+ * of waiting.
+ */
+ if (logc->bp_size <= b_region + b_disk) {
+ len = (size_t)DB_ALIGN((uintmax_t)(b_region + b_disk) * 2, 128);
+ if ((ret = __os_realloc(logc->env, len, &logc->bp)) != 0)
+ return (ret);
+ logc->bp_size = (u_int32_t)len;
+ }
+
+ /* Copy the region's bytes to the end of the cursor's buffer. */
+ p = (logc->bp + logc->bp_size) - b_region;
+ memcpy(p, dblp->bufp, b_region);
+
+ /* Release the region lock. */
+ if (*rlockp == L_ACQUIRED) {
+ *rlockp = L_NONE;
+ LOG_SYSTEM_UNLOCK(env);
+ }
+
+ /*
+	 * Read the rest of the information from disk.  Neither a short read
+	 * nor EOF is acceptable; the bytes we want had better be there.
+ */
+ if (b_disk != 0) {
+ p -= b_disk;
+ nr = b_disk;
+ if ((ret = __logc_io(
+ logc, lsn->file, lsn->offset, p, &nr, NULL)) != 0)
+ return (ret);
+ if (nr < b_disk)
+ return (__logc_shortread(logc, lsn, 0));
+
+ /* We read bytes from the disk, we'll need to checksum them. */
+ *need_cksump = 1;
+ }
+
+ /* Copy the header information into the caller's structure. */
+ memcpy(hdr, p, hdr->size);
+ if (LOG_SWAPPED(env))
+ __log_hdrswap(hdr, CRYPTO_ON(env));
+
+ *pp = p;
+ return (0);
+}
+
+/*
+ * __log_hdrswap --
+ * Swap the bytes in a log header from machines with different endianness.
+ *
+ * PUBLIC: void __log_hdrswap __P((HDR *, int));
+ */
+void
+__log_hdrswap(hdr, is_hmac)
+ HDR *hdr;
+ int is_hmac;
+{
+ M_32_SWAP(hdr->prev);
+ M_32_SWAP(hdr->len);
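+	/*
+	 * An HMAC checksum is an opaque byte string and is never swapped;
+	 * only the plain 4-byte checksum is byte-order dependent.
+	 */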
+ if (!is_hmac)
+ P_32_SWAP(hdr->chksum);
+}
+
+/*
+ * __log_persistswap --
+ * Swap the bytes in a log file persistent header from machines with
+ * different endianness.
+ *
+ * PUBLIC: void __log_persistswap __P((LOGP *));
+ */
+void
+__log_persistswap(persist)
+ LOGP *persist;
+{
+ M_32_SWAP(persist->magic);
+ M_32_SWAP(persist->version);
+ M_32_SWAP(persist->log_size);
+ M_32_SWAP(persist->notused);
+}
+
+/*
+ * __logc_ondisk --
+ * Read a record off disk.
+ */
+static int
+__logc_ondisk(logc, lsn, last_lsn, flags, hdr, pp, eofp)
+ DB_LOGC *logc;
+ DB_LSN *lsn, *last_lsn;
+ u_int32_t flags;
+ int *eofp;
+ HDR *hdr;
+ u_int8_t **pp;
+{
+ ENV *env;
+ size_t len, nr;
+ u_int32_t offset;
+ int ret;
+
+ env = logc->env;
+ *eofp = 0;
+
+ nr = hdr->size;
+ if ((ret =
+ __logc_io(logc, lsn->file, lsn->offset, hdr, &nr, eofp)) != 0)
+ return (ret);
+ if (*eofp)
+ return (0);
+
+ if (LOG_SWAPPED(env))
+ __log_hdrswap(hdr, CRYPTO_ON(env));
+
+ /*
+	 * If the read was successful but we can't read a full header, assume
+	 * we've hit EOF.  We can't verify that the header was partially
+	 * zeroed out, but a write failure is an unlikely cause: the header
+	 * is written with a single write call and is smaller than a sector.
+ */
+ if (nr < hdr->size) {
+ *eofp = 1;
+ return (0);
+ }
+
+ /* Check the HDR. */
+ if ((ret = __logc_hdrchk(logc, lsn, hdr, eofp)) != 0)
+ return (ret);
+ if (*eofp)
+ return (0);
+
+ /*
+ * Regardless of how we return, the previous contents of the cursor's
+ * buffer are useless -- trash it.
+ */
+ logc->bp_rlen = 0;
+
+ /*
+ * Otherwise, we now (finally!) know how big the record is. (Maybe
+ * we should have just stuck the length of the record into the LSN!?)
+ * Make sure we have enough space.
+ */
+ if (logc->bp_size <= hdr->len) {
+ len = (size_t)DB_ALIGN((uintmax_t)hdr->len * 2, 128);
+ if ((ret = __os_realloc(env, len, &logc->bp)) != 0)
+ return (ret);
+ logc->bp_size = (u_int32_t)len;
+ }
+
+ /*
+ * If we're moving forward in the log file, read this record in at the
+ * beginning of the buffer. Otherwise, read this record in at the end
+ * of the buffer, making sure we don't try and read before the start
+ * of the file. (We prefer positioning at the end because transaction
+ * aborts use DB_SET to move backward through the log and we might get
+ * lucky.)
+ *
+ * Read a buffer's worth, without reading past the logical EOF. The
+ * last_lsn may be a zero LSN, but that's OK, the test works anyway.
+ */
+ if (flags == DB_FIRST || flags == DB_NEXT)
+ offset = lsn->offset;
+ else if (lsn->offset + hdr->len < logc->bp_size)
+ offset = 0;
+ else
+ offset = (lsn->offset + hdr->len) - logc->bp_size;
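+	/*
+	 * A worked example (editorial, assuming a 64KB bp_size): reading
+	 * backward to a record at offset 100000 with hdr->len 200 gives
+	 * offset = (100000 + 200) - 65536 = 34664, so the buffer covers
+	 * [34664, 100200) and ends exactly at the end of the record.
+	 */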
+
+ nr = logc->bp_size;
+ if (lsn->file == last_lsn->file && offset + nr >= last_lsn->offset)
+ nr = last_lsn->offset - offset;
+
+ if ((ret =
+ __logc_io(logc, lsn->file, offset, logc->bp, &nr, eofp)) != 0)
+ return (ret);
+
+ /*
+ * We should have at least gotten the bytes up-to-and-including the
+ * record we're reading.
+ */
+ if (nr < (lsn->offset + hdr->len) - offset)
+ return (__logc_shortread(logc, lsn, 1));
+
+ /*
+ * Set up the return information.
+ *
+ * !!!
+ * No need to set the bp_lsn.file field, __logc_io set it for us.
+ */
+ logc->bp_rlen = (u_int32_t)nr;
+ logc->bp_lsn.offset = offset;
+
+ *pp = logc->bp + (lsn->offset - offset);
+
+ return (0);
+}
+
+/*
+ * __logc_hdrchk --
+ *
+ * Check for corrupted HDRs before we use them to allocate memory or find
+ * records.
+ *
+ * If the log files were pre-allocated, a zero-filled HDR structure is the
+ * logical file end. However, we can see buffers filled with 0's during
+ * recovery, too (because multiple log buffers were written asynchronously,
+ * and one made it to disk before a different one that logically precedes
+ * it in the log file).
+ *
+ * Check for impossibly large records. The malloc should fail later, but we
+ * have customers that run mallocs that treat all allocation failures as fatal
+ * errors.
+ *
+ * Note that none of this is necessarily something awful happening. We let
+ * the application hand us any LSN they want, and it could be a pointer into
+ * the middle of a log record, there's no way to tell.
+ */
+static int
+__logc_hdrchk(logc, lsn, hdr, eofp)
+ DB_LOGC *logc;
+ DB_LSN *lsn;
+ HDR *hdr;
+ int *eofp;
+{
+ ENV *env;
+ int ret;
+
+ env = logc->env;
+
+ /*
+ * Check EOF before we do any other processing.
+ */
+ if (eofp != NULL) {
+ if (hdr->prev == 0 && hdr->chksum[0] == 0 && hdr->len == 0) {
+ *eofp = 1;
+ return (0);
+ }
+ *eofp = 0;
+ }
+
+ /*
+ * Sanity check the log record's size.
+ * We must check it after "virtual" EOF above.
+ */
+ if (hdr->len <= hdr->size)
+ goto err;
+
+ /*
+ * If the cursor's max-record value isn't yet set, it means we aren't
+ * reading these records from a log file and no check is necessary.
+ */
+ if (logc->bp_maxrec != 0 && hdr->len > logc->bp_maxrec) {
+ /*
+ * If we fail the check, there's the pathological case that
+ * we're reading the last file, it's growing, and our initial
+ * check information was wrong. Get it again, to be sure.
+ */
+ if ((ret = __logc_set_maxrec(logc, NULL)) != 0) {
+ __db_err(env, ret, "DB_LOGC->get");
+ return (ret);
+ }
+ if (logc->bp_maxrec != 0 && hdr->len > logc->bp_maxrec)
+ goto err;
+ }
+ return (0);
+
+err: if (!F_ISSET(logc, DB_LOG_SILENT_ERR))
+ __db_errx(env, DB_STR_A("2580",
+ "DB_LOGC->get: LSN %lu/%lu: invalid log record header",
+ "%lu %lu"), (u_long)lsn->file, (u_long)lsn->offset);
+ return (EIO);
+}
+
+/*
+ * __logc_io --
+ * Read records from a log file.
+ */
+static int
+__logc_io(logc, fnum, offset, p, nrp, eofp)
+ DB_LOGC *logc;
+ u_int32_t fnum, offset;
+ void *p;
+ size_t *nrp;
+ int *eofp;
+{
+ DB_LOG *dblp;
+ ENV *env;
+ LOG *lp;
+ int ret;
+ char *np;
+
+ env = logc->env;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ /*
+ * If we've switched files, discard the current file handle and acquire
+ * a new one.
+ */
+ if (logc->fhp != NULL && logc->bp_lsn.file != fnum) {
+ ret = __os_closehandle(env, logc->fhp);
+ logc->fhp = NULL;
+ logc->bp_lsn.file = 0;
+
+ if (ret != 0)
+ return (ret);
+ }
+ if (logc->fhp == NULL) {
+ if ((ret = __log_name(dblp, fnum,
+ &np, &logc->fhp, DB_OSO_RDONLY | DB_OSO_SEQ)) != 0) {
+ /*
+ * If we're allowed to return EOF, assume that's the
+ * problem, set the EOF status flag and return 0.
+ */
+ if (eofp != NULL) {
+ *eofp = 1;
+ ret = 0;
+ } else if (!F_ISSET(logc, DB_LOG_SILENT_ERR))
+ __db_err(env, ret, "DB_LOGC->get: %s",
+ np == NULL ? "__log_name failed" : np);
+ __os_free(env, np);
+ return (ret);
+ }
+
+ if ((ret = __logc_set_maxrec(logc, np)) != 0) {
+ __db_err(env, ret, "DB_LOGC->get: %s", np);
+ __os_free(env, np);
+ return (ret);
+ }
+ __os_free(env, np);
+
+ logc->bp_lsn.file = fnum;
+ }
+
+ STAT_INC(env, log, read, lp->stat.st_rcount, fnum);
+ /* Seek to the record's offset and read the data. */
+ if ((ret = __os_io(env, DB_IO_READ,
+ logc->fhp, 0, 0, offset, (u_int32_t)*nrp, p, nrp)) != 0) {
+ if (!F_ISSET(logc, DB_LOG_SILENT_ERR))
+ __db_err(env, ret, DB_STR_A("2581",
+ "DB_LOGC->get: LSN: %lu/%lu: read", "%lu %lu"),
+ (u_long)fnum, (u_long)offset);
+ return (ret);
+ }
+
+ return (0);
+}
+
+/*
+ * __logc_shortread --
+ * Read was short -- return a consistent error message and error.
+ */
+static int
+__logc_shortread(logc, lsn, check_silent)
+ DB_LOGC *logc;
+ DB_LSN *lsn;
+ int check_silent;
+{
+ if (!check_silent || !F_ISSET(logc, DB_LOG_SILENT_ERR))
+ __db_errx(logc->env, DB_STR_A("2582",
+ "DB_LOGC->get: LSN: %lu/%lu: short read", "%lu %lu"),
+ (u_long)lsn->file, (u_long)lsn->offset);
+ return (EIO);
+}
+
+/*
+ * __logc_set_maxrec --
+ * Bound the maximum log record size in a log file.
+ */
+static int
+__logc_set_maxrec(logc, np)
+ DB_LOGC *logc;
+ char *np;
+{
+ DB_LOG *dblp;
+ ENV *env;
+ LOG *lp;
+ u_int32_t mbytes, bytes;
+ int ret;
+
+ env = logc->env;
+ dblp = env->lg_handle;
+
+ /*
+	 * We don't want to try to allocate huge chunks of memory because
+	 * applications with error-checking mallocs often consider that a
+ * hard failure. If we're about to look at a corrupted record with
+ * a bizarre size, we need to know before trying to allocate space
+ * to hold it. We could read the persistent data at the beginning
+ * of the file but that's hard -- we may have to decrypt it, checksum
+ * it and so on. Stat the file instead.
+ */
+ if (logc->fhp != NULL) {
+ if ((ret = __os_ioinfo(env, np, logc->fhp,
+ &mbytes, &bytes, NULL)) != 0)
+ return (ret);
+ if (logc->bp_maxrec < (mbytes * MEGABYTE + bytes))
+ logc->bp_maxrec = mbytes * MEGABYTE + bytes;
+ }
+
+ /*
+ * If reading from the log file currently being written, we could get
+ * an incorrect size, that is, if the cursor was opened on the file
+ * when it had only a few hundred bytes, and then the cursor used to
+ * move forward in the file, after more log records were written, the
+ * original stat value would be wrong. Use the maximum of the current
+ * log file size and the size of the buffer -- that should represent
+ * the max of any log record currently in the file.
+ *
+ * The log buffer size is set when the environment is opened and never
+ * changed, we don't need a lock on it.
+ */
+ lp = dblp->reginfo.primary;
+ if (logc->bp_maxrec < lp->buffer_size)
+ logc->bp_maxrec = lp->buffer_size;
+
+ return (0);
+}
+
+/*
+ * PUBLIC: int __log_read_record_pp __P((DB_ENV *, DB **, void *, void *,
+ * PUBLIC: DB_LOG_RECSPEC *, u_int32_t, void **));
+ */
+int
+__log_read_record_pp(dbenv, dbpp, td, recbuf, spec, size, argpp)
+ DB_ENV *dbenv;
+ DB **dbpp;
+ void *td;
+ void *recbuf;
+ DB_LOG_RECSPEC *spec;
+ u_int32_t size;
+ void **argpp;
+{
+ DB_THREAD_INFO *ip;
+ int ret;
+
+ ENV_REQUIRES_CONFIG(dbenv->env,
+ dbenv->env->lg_handle, "DB_ENV->log_read_record", DB_INIT_LOG);
+
+ *argpp = NULL;
+ ENV_ENTER(dbenv->env, ip);
+ if ((ret = __os_umalloc(dbenv->env, size + sizeof(DB_TXN), argpp)) != 0)
+ goto done;
+ REPLICATION_WRAP(dbenv->env, (__log_read_record(dbenv->env, dbpp,
+ td, recbuf, spec, size, argpp)), 0, ret);
+ if (ret != 0) {
+ __os_ufree(dbenv->env, *argpp);
+ *argpp = NULL;
+ }
+done: ENV_LEAVE(dbenv->env, ip);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __log_read_record __P((ENV *, DB **, void *, void *,
+ * PUBLIC: DB_LOG_RECSPEC *, u_int32_t, void **));
+ */
+int
+__log_read_record(env, dbpp, td, recbuf, spec, size, argpp)
+ ENV *env;
+ DB **dbpp;
+ void *td;
+ void *recbuf;
+ DB_LOG_RECSPEC *spec;
+ u_int32_t size;
+ void **argpp;
+{
+ DB_LOG_RECSPEC *sp, *np;
+ DB_TXN *txnp;
+ LOG *lp;
+ PAGE *hdrstart;
+ u_int32_t hdrsize, op, uinttmp;
+ u_int8_t *ap, *bp;
+ int has_data, ret, downrev;
+
+ COMPQUIET(has_data, 0);
+ COMPQUIET(hdrsize, 0);
+ COMPQUIET(hdrstart, NULL);
+ COMPQUIET(op, 0);
+ ap = *argpp;
+ /*
+	 * Allocate space for the arg structure and for a transaction
+	 * structure, which will immediately follow it.
+ */
+ if (ap == NULL &&
+ (ret = __os_malloc(env, size + sizeof(DB_TXN), &ap)) != 0)
+ return (ret);
+ txnp = (DB_TXN *)(ap + size);
+ memset(txnp, 0, sizeof(DB_TXN));
+ txnp->td = td;
+ lp = env->lg_handle->reginfo.primary;
+ downrev = lp->persist.version < DB_LOGVERSION_50;
+
+ bp = recbuf;
+
+ /*
+ * The first three fields are always the same in every arg
+ * struct so we know their offsets.
+ */
+ /* type */
+ LOGCOPY_32(env, ap + SSZ(LOG_REC_HEADER, type), bp);
+ bp += sizeof(u_int32_t);
+
+ /* txnp */
+ LOGCOPY_32(env, &txnp->txnid, bp);
+ *(DB_TXN **)(ap + SSZ(LOG_REC_HEADER, txnp)) = txnp;
+ bp += sizeof(txnp->txnid);
+
+ /* Previous LSN */
+ LOGCOPY_TOLSN(env,
+ (DB_LSN *)(ap + SSZ(LOG_REC_HEADER, prev_lsn)), bp);
+ bp += sizeof(DB_LSN);
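+	/*
+	 * Layout so far (a sketch): a 4-byte type, a 4-byte txnid and an
+	 * 8-byte previous LSN; the spec array drives the decoding of all
+	 * fields that follow.
+	 */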
+
+ ret = 0;
+ for (sp = spec; sp->type != LOGREC_Done; sp++) {
+ switch (sp->type) {
+ case LOGREC_DB:
+ LOGCOPY_32(env, &uinttmp, bp);
+ *(u_int32_t*)(ap + sp->offset) = uinttmp;
+ bp += sizeof(uinttmp);
+ if (dbpp != NULL) {
+ *dbpp = NULL;
+ ret = __dbreg_id_to_db(env,
+ txnp, dbpp, (int32_t)uinttmp, 1);
+ }
+ break;
+
+ case LOGREC_ARG:
+ case LOGREC_TIME:
+ case LOGREC_DBOP:
+ LOGCOPY_32(env, ap + sp->offset, bp);
+ bp += sizeof(uinttmp);
+ break;
+ case LOGREC_OP:
+ LOGCOPY_32(env, &op, bp);
+ *(u_int32_t *)(ap + sp->offset) = op;
+ bp += sizeof(uinttmp);
+ break;
+ case LOGREC_DBT:
+ case LOGREC_PGLIST:
+ case LOGREC_LOCKS:
+ case LOGREC_HDR:
+ case LOGREC_DATA:
+ case LOGREC_PGDBT:
+ case LOGREC_PGDDBT:
+ memset(ap + sp->offset, 0, sizeof(DBT));
+ LOGCOPY_32(env, &uinttmp, bp);
+ *(u_int32_t*)
+ (ap + sp->offset + SSZ(DBT, size)) = uinttmp;
+ bp += sizeof(u_int32_t);
+ *(void **)(ap + sp->offset + SSZ(DBT, data)) = bp;
+
+ /* Process fields that need to be byte swapped. */
+ switch (sp->type) {
+ case LOGREC_DBT:
+ case LOGREC_PGLIST:
+ case LOGREC_LOCKS:
+ break;
+ case LOGREC_HDR:
+ if (uinttmp == 0)
+ break;
+ has_data = 0;
+ for (np = sp + 1; np->type != LOGREC_Done; np++)
+ if (np->type == LOGREC_DATA) {
+ has_data = 1;
+ break;
+ }
+ hdrstart = (PAGE *)bp;
+ hdrsize = uinttmp;
+ if (has_data == 1)
+ break;
+ /* FALLTHROUGH */
+ case LOGREC_DATA:
+ if (downrev ? LOG_SWAPPED(env) :
+ (dbpp != NULL && *dbpp != NULL &&
+ F_ISSET(*dbpp, DB_AM_SWAP)))
+ __db_recordswap(op, hdrsize,
+ hdrstart, has_data ?
+ ap + sp->offset : NULL, 1);
+ break;
+ case LOGREC_PGDBT:
+ has_data = 0;
+ for (np = sp + 1; np->type != LOGREC_Done; np++)
+ if (np->type == LOGREC_PGDDBT) {
+ has_data = 1;
+ break;
+ }
+
+ hdrstart = (PAGE *)bp;
+ hdrsize = uinttmp;
+ if (has_data == 1)
+ break;
+ /* FALLTHROUGH */
+ case LOGREC_PGDDBT:
+ if (dbpp != NULL && *dbpp != NULL &&
+ (downrev ? LOG_SWAPPED(env) :
+ F_ISSET(*dbpp, DB_AM_SWAP)) &&
+ (ret = __db_pageswap(env, *dbpp, hdrstart,
+ hdrsize, has_data == 0 ? NULL :
+ (DBT *)(ap + sp->offset), 1)) != 0)
+ return (ret);
+ break;
+ default:
+ DB_ASSERT(env, sp->type != sp->type);
+ }
+
+ bp += uinttmp;
+ break;
+
+ case LOGREC_POINTER:
+ LOGCOPY_TOLSN(env, (DB_LSN *)(ap + sp->offset), bp);
+ bp += sizeof(DB_LSN);
+ break;
+
+ default:
+ DB_ASSERT(env, sp->type != sp->type);
+ }
+ }
+
+ *argpp = ap;
+ return (ret);
+}
diff --git a/src/log/log_method.c b/src/log/log_method.c
new file mode 100644
index 00000000..d5aec116
--- /dev/null
+++ b/src/log/log_method.c
@@ -0,0 +1,533 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/log.h"
+
+/*
+ * __log_env_create --
+ * Log specific initialization of the DB_ENV structure.
+ *
+ * PUBLIC: int __log_env_create __P((DB_ENV *));
+ */
+int
+__log_env_create(dbenv)
+ DB_ENV *dbenv;
+{
+ /*
+ * !!!
+ * Our caller has not yet had the opportunity to reset the panic
+ * state or turn off mutex locking, and so we can neither check
+	 * the panic state nor acquire a mutex in the DB_ENV create path.
+ */
+ dbenv->lg_bsize = 0;
+ dbenv->lg_regionmax = 0;
+
+ return (0);
+}
+
+/*
+ * __log_env_destroy --
+ * Log specific destruction of the DB_ENV structure.
+ *
+ * PUBLIC: void __log_env_destroy __P((DB_ENV *));
+ */
+void
+__log_env_destroy(dbenv)
+ DB_ENV *dbenv;
+{
+ COMPQUIET(dbenv, NULL);
+}
+
+/*
+ * PUBLIC: int __log_get_lg_bsize __P((DB_ENV *, u_int32_t *));
+ */
+int
+__log_get_lg_bsize(dbenv, lg_bsizep)
+ DB_ENV *dbenv;
+ u_int32_t *lg_bsizep;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->lg_handle, "DB_ENV->get_lg_bsize", DB_INIT_LOG);
+
+ if (LOGGING_ON(env)) {
+ /* Cannot be set after open, no lock required to read. */
+ *lg_bsizep =
+ ((LOG *)env->lg_handle->reginfo.primary)->buffer_size;
+ } else
+ *lg_bsizep = dbenv->lg_bsize;
+ return (0);
+}
+
+/*
+ * __log_set_lg_bsize --
+ * DB_ENV->set_lg_bsize.
+ *
+ * PUBLIC: int __log_set_lg_bsize __P((DB_ENV *, u_int32_t));
+ */
+int
+__log_set_lg_bsize(dbenv, lg_bsize)
+ DB_ENV *dbenv;
+ u_int32_t lg_bsize;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_lg_bsize");
+
+ dbenv->lg_bsize = lg_bsize;
+ return (0);
+}
+
+/*
+ * PUBLIC: int __log_get_lg_filemode __P((DB_ENV *, int *));
+ */
+int
+__log_get_lg_filemode(dbenv, lg_modep)
+ DB_ENV *dbenv;
+ int *lg_modep;
+{
+ DB_LOG *dblp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->lg_handle, "DB_ENV->get_lg_filemode", DB_INIT_LOG);
+
+ if (LOGGING_ON(env)) {
+ dblp = env->lg_handle;
+ ENV_ENTER(env, ip);
+ LOG_SYSTEM_LOCK(env);
+ *lg_modep = ((LOG *)dblp->reginfo.primary)->filemode;
+ LOG_SYSTEM_UNLOCK(env);
+ ENV_LEAVE(env, ip);
+ } else
+ *lg_modep = dbenv->lg_filemode;
+
+ return (0);
+}
+
+/*
+ * __log_set_lg_filemode --
+ * DB_ENV->set_lg_filemode.
+ *
+ * PUBLIC: int __log_set_lg_filemode __P((DB_ENV *, int));
+ */
+int
+__log_set_lg_filemode(dbenv, lg_mode)
+ DB_ENV *dbenv;
+ int lg_mode;
+{
+ DB_LOG *dblp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ LOG *lp;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->lg_handle, "DB_ENV->set_lg_filemode", DB_INIT_LOG);
+
+ if (LOGGING_ON(env)) {
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ ENV_ENTER(env, ip);
+ LOG_SYSTEM_LOCK(env);
+ lp->filemode = lg_mode;
+ LOG_SYSTEM_UNLOCK(env);
+ ENV_LEAVE(env, ip);
+ } else
+ dbenv->lg_filemode = lg_mode;
+
+ return (0);
+}
+
+/*
+ * PUBLIC: int __log_get_lg_max __P((DB_ENV *, u_int32_t *));
+ */
+int
+__log_get_lg_max(dbenv, lg_maxp)
+ DB_ENV *dbenv;
+ u_int32_t *lg_maxp;
+{
+ DB_LOG *dblp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->lg_handle, "DB_ENV->get_lg_max", DB_INIT_LOG);
+
+ if (LOGGING_ON(env)) {
+ dblp = env->lg_handle;
+ ENV_ENTER(env, ip);
+ LOG_SYSTEM_LOCK(env);
+ *lg_maxp = ((LOG *)dblp->reginfo.primary)->log_nsize;
+ LOG_SYSTEM_UNLOCK(env);
+ ENV_LEAVE(env, ip);
+ } else
+ *lg_maxp = dbenv->lg_size;
+
+ return (0);
+}
+
+/*
+ * __log_set_lg_max --
+ * DB_ENV->set_lg_max.
+ *
+ * PUBLIC: int __log_set_lg_max __P((DB_ENV *, u_int32_t));
+ */
+int
+__log_set_lg_max(dbenv, lg_max)
+ DB_ENV *dbenv;
+ u_int32_t lg_max;
+{
+ DB_LOG *dblp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ LOG *lp;
+ int ret;
+
+ env = dbenv->env;
+ ret = 0;
+
+ ENV_NOT_CONFIGURED(env,
+ env->lg_handle, "DB_ENV->set_lg_max", DB_INIT_LOG);
+
+ if (LOGGING_ON(env)) {
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ ENV_ENTER(env, ip);
+ if ((ret = __log_check_sizes(env, lg_max, 0)) == 0) {
+ LOG_SYSTEM_LOCK(env);
+ lp->log_nsize = lg_max;
+ LOG_SYSTEM_UNLOCK(env);
+ }
+ ENV_LEAVE(env, ip);
+ } else
+ dbenv->lg_size = lg_max;
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __log_get_lg_regionmax __P((DB_ENV *, u_int32_t *));
+ */
+int
+__log_get_lg_regionmax(dbenv, lg_regionmaxp)
+ DB_ENV *dbenv;
+ u_int32_t *lg_regionmaxp;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->lg_handle, "DB_ENV->get_lg_regionmax", DB_INIT_LOG);
+
+ if (LOGGING_ON(env)) {
+ /* Cannot be set after open, no lock required to read. */
+ *lg_regionmaxp =
+ ((LOG *)env->lg_handle->reginfo.primary)->regionmax;
+ } else
+ *lg_regionmaxp = dbenv->lg_regionmax;
+ return (0);
+}
+
+/*
+ * __log_set_lg_regionmax --
+ * DB_ENV->set_lg_regionmax.
+ *
+ * PUBLIC: int __log_set_lg_regionmax __P((DB_ENV *, u_int32_t));
+ */
+int
+__log_set_lg_regionmax(dbenv, lg_regionmax)
+ DB_ENV *dbenv;
+ u_int32_t lg_regionmax;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_lg_regionmax");
+
+ /* Let's not be silly. */
+ if (lg_regionmax != 0 && lg_regionmax < LG_BASE_REGION_SIZE) {
+ __db_errx(env, DB_STR_A("2569",
+ "log region size must be >= %d",
+ "%d"), LG_BASE_REGION_SIZE);
+ return (EINVAL);
+ }
+
+ dbenv->lg_regionmax = lg_regionmax;
+ return (0);
+}
+
+/*
+ * PUBLIC: int __log_get_lg_dir __P((DB_ENV *, const char **));
+ */
+int
+__log_get_lg_dir(dbenv, dirp)
+ DB_ENV *dbenv;
+ const char **dirp;
+{
+ *dirp = dbenv->db_log_dir;
+ return (0);
+}
+
+/*
+ * __log_set_lg_dir --
+ * DB_ENV->set_lg_dir.
+ *
+ * PUBLIC: int __log_set_lg_dir __P((DB_ENV *, const char *));
+ */
+int
+__log_set_lg_dir(dbenv, dir)
+ DB_ENV *dbenv;
+ const char *dir;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ if (dbenv->db_log_dir != NULL)
+ __os_free(env, dbenv->db_log_dir);
+ return (__os_strdup(env, dir, &dbenv->db_log_dir));
+}
+
+/*
+ * __log_get_flags --
+ * DB_ENV->get_flags.
+ *
+ * PUBLIC: void __log_get_flags __P((DB_ENV *, u_int32_t *));
+ */
+void
+__log_get_flags(dbenv, flagsp)
+ DB_ENV *dbenv;
+ u_int32_t *flagsp;
+{
+ DB_LOG *dblp;
+ ENV *env;
+ LOG *lp;
+ u_int32_t flags;
+
+ env = dbenv->env;
+
+ if ((dblp = env->lg_handle) == NULL)
+ return;
+
+ lp = dblp->reginfo.primary;
+
+ flags = *flagsp;
+ if (lp->db_log_autoremove)
+ LF_SET(DB_LOG_AUTO_REMOVE);
+ else
+ LF_CLR(DB_LOG_AUTO_REMOVE);
+ if (lp->db_log_inmemory)
+ LF_SET(DB_LOG_IN_MEMORY);
+ else
+ LF_CLR(DB_LOG_IN_MEMORY);
+ *flagsp = flags;
+}
+
+/*
+ * __log_set_flags --
+ * DB_ENV->set_flags.
+ *
+ * PUBLIC: void __log_set_flags __P((ENV *, u_int32_t, int));
+ */
+void
+__log_set_flags(env, flags, on)
+ ENV *env;
+ u_int32_t flags;
+ int on;
+{
+ DB_LOG *dblp;
+ LOG *lp;
+
+ if ((dblp = env->lg_handle) == NULL)
+ return;
+
+ lp = dblp->reginfo.primary;
+
+ if (LF_ISSET(DB_LOG_AUTO_REMOVE))
+ lp->db_log_autoremove = on ? 1 : 0;
+ if (LF_ISSET(DB_LOG_IN_MEMORY))
+ lp->db_log_inmemory = on ? 1 : 0;
+}
+
+/*
+ * List of flags we can handle here.  DB_LOG_IN_MEMORY must be
+ * processed before the region is created, so it gets special
+ * treatment in __log_set_config_int below.
+ */
+#undef OK_FLAGS
+#define OK_FLAGS \
+ (DB_LOG_AUTO_REMOVE | DB_LOG_DIRECT | \
+ DB_LOG_DSYNC | DB_LOG_IN_MEMORY | DB_LOG_ZERO)
+static const FLAG_MAP LogMap[] = {
+ { DB_LOG_AUTO_REMOVE, DBLOG_AUTOREMOVE},
+ { DB_LOG_DIRECT, DBLOG_DIRECT},
+ { DB_LOG_DSYNC, DBLOG_DSYNC},
+ { DB_LOG_IN_MEMORY, DBLOG_INMEMORY},
+ { DB_LOG_ZERO, DBLOG_ZERO}
+};
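+/*
+ * Each LogMap entry pairs a public DB_LOG_* flag with the internal
+ * DBLOG_* bit kept in the DB_LOG handle, letting __env_map_flags and
+ * __env_fetch_flags translate between the two name spaces.
+ */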
+/*
+ * __log_get_config --
+ * Configure the logging subsystem.
+ *
+ * PUBLIC: int __log_get_config __P((DB_ENV *, u_int32_t, int *));
+ */
+int
+__log_get_config(dbenv, which, onp)
+ DB_ENV *dbenv;
+ u_int32_t which;
+ int *onp;
+{
+ ENV *env;
+ DB_LOG *dblp;
+ u_int32_t flags;
+
+ env = dbenv->env;
+ if (FLD_ISSET(which, ~OK_FLAGS))
+ return (__db_ferr(env, "DB_ENV->log_get_config", 0));
+ dblp = env->lg_handle;
+ ENV_REQUIRES_CONFIG(env, dblp, "DB_ENV->log_get_config", DB_INIT_LOG);
+
+ __env_fetch_flags(LogMap, sizeof(LogMap), &dblp->flags, &flags);
+ __log_get_flags(dbenv, &flags);
+ if (LF_ISSET(which))
+ *onp = 1;
+ else
+ *onp = 0;
+
+ return (0);
+}
+
+/*
+ * __log_set_config --
+ * Configure the logging subsystem.
+ *
+ * PUBLIC: int __log_set_config __P((DB_ENV *, u_int32_t, int));
+ */
+int
+__log_set_config(dbenv, flags, on)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+ int on;
+{
+ return (__log_set_config_int(dbenv, flags, on, 0));
+}
+/*
+ * __log_set_config_int --
+ * Configure the logging subsystem.
+ *
+ * PUBLIC: int __log_set_config_int __P((DB_ENV *, u_int32_t, int, int));
+ */
+int
+__log_set_config_int(dbenv, flags, on, in_open)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+ int on;
+ int in_open;
+{
+ ENV *env;
+ DB_LOG *dblp;
+ u_int32_t mapped_flags;
+
+ env = dbenv->env;
+ dblp = env->lg_handle;
+ if (FLD_ISSET(flags, ~OK_FLAGS))
+ return (__db_ferr(env, "DB_ENV->log_set_config", 0));
+ ENV_NOT_CONFIGURED(env, dblp, "DB_ENV->log_set_config", DB_INIT_LOG);
+ if (LF_ISSET(DB_LOG_DIRECT) && __os_support_direct_io() == 0) {
+ __db_errx(env,
+"DB_ENV->log_set_config: direct I/O either not configured or not supported");
+ return (EINVAL);
+ }
+
+ if (LOGGING_ON(env)) {
+ if (!in_open && LF_ISSET(DB_LOG_IN_MEMORY) &&
+ ((LOG *)dblp->reginfo.primary)->db_log_inmemory == 0)
+ ENV_ILLEGAL_AFTER_OPEN(env,
+ "DB_ENV->log_set_config: DB_LOG_IN_MEMORY");
+ __log_set_flags(env, flags, on);
+ mapped_flags = 0;
+ __env_map_flags(LogMap, sizeof(LogMap), &flags, &mapped_flags);
+ if (on)
+ F_SET(dblp, mapped_flags);
+ else
+ F_CLR(dblp, mapped_flags);
+ } else {
+ /*
+ * DB_LOG_IN_MEMORY, DB_TXN_NOSYNC and DB_TXN_WRITE_NOSYNC
+ * are mutually incompatible. If we're setting one of them,
+ * clear all current settings.
+ */
+ if (on && LF_ISSET(DB_LOG_IN_MEMORY))
+ F_CLR(dbenv,
+ DB_ENV_TXN_NOSYNC | DB_ENV_TXN_WRITE_NOSYNC);
+
+ if (on)
+ FLD_SET(dbenv->lg_flags, flags);
+ else
+ FLD_CLR(dbenv->lg_flags, flags);
+ }
+
+ return (0);
+}
+
+/*
+ * __log_check_sizes --
+ * Makes sure that the log file size and log buffer size are compatible.
+ *
+ * PUBLIC: int __log_check_sizes __P((ENV *, u_int32_t, u_int32_t));
+ */
+int
+__log_check_sizes(env, lg_max, lg_bsize)
+ ENV *env;
+ u_int32_t lg_max;
+ u_int32_t lg_bsize;
+{
+ DB_ENV *dbenv;
+ LOG *lp;
+ int inmem;
+
+ dbenv = env->dbenv;
+
+ if (LOGGING_ON(env)) {
+ lp = env->lg_handle->reginfo.primary;
+ inmem = lp->db_log_inmemory;
+ lg_bsize = lp->buffer_size;
+ } else
+ inmem = (FLD_ISSET(dbenv->lg_flags, DB_LOG_IN_MEMORY) != 0);
+
+ if (inmem) {
+ if (lg_bsize == 0)
+ lg_bsize = LG_BSIZE_INMEM;
+ if (lg_max == 0)
+ lg_max = LG_MAX_INMEM;
+
+ if (lg_bsize <= lg_max) {
+ __db_errx(env,
+ "in-memory log buffer must be larger than the log file size");
+ return (EINVAL);
+ }
+ }
+
+ return (0);
+}
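+
+/*
+ * A minimal configuration sketch (editorial, not part of the original
+ * source): for an in-memory log the buffer must be larger than the log
+ * file size, so an application might set, for example:
+ *
+ *	dbenv->log_set_config(dbenv, DB_LOG_IN_MEMORY, 1);
+ *	dbenv->set_lg_bsize(dbenv, 10 * 1024 * 1024);	(10MB buffer)
+ *	dbenv->set_lg_max(dbenv, 1024 * 1024);		(1MB "file" size)
+ */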
diff --git a/src/log/log_print.c b/src/log/log_print.c
new file mode 100644
index 00000000..d2cda519
--- /dev/null
+++ b/src/log/log_print.c
@@ -0,0 +1,380 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/lock.h"
+
+static int __log_print_dbregister __P((ENV *, DBT *, DB_LOG *));
+
+/*
+ * PUBLIC: int __log_print_record __P((ENV *,
+ * PUBLIC: DBT *, DB_LSN *, char *, DB_LOG_RECSPEC *, void *));
+ */
+int
+__log_print_record(env, recbuf, lsnp, name, spec, info)
+ ENV *env;
+ DBT *recbuf;
+ DB_LSN *lsnp;
+ char *name;
+ DB_LOG_RECSPEC *spec;
+ void *info;
+{
+ DB *dbp;
+ DBT dbt;
+ DB_LOG_RECSPEC *sp, *np;
+ DB_LOG *dblp;
+ DB_LSN prev_lsn;
+ DB_MSGBUF msgbuf;
+ LOG *lp;
+ PAGE *hdrstart, *hdrtmp;
+ int32_t inttmp;
+ u_int32_t hdrsize, op, uinttmp;
+ u_int32_t type, txnid;
+ u_int8_t *bp, *datatmp;
+ int has_data, ret, downrev;
+ struct tm *lt;
+ time_t timeval;
+ char time_buf[CTIME_BUFLEN], *s;
+ const char *hdrname;
+
+ COMPQUIET(hdrstart, NULL);
+ COMPQUIET(hdrname, NULL);
+ COMPQUIET(hdrsize, 0);
+ COMPQUIET(has_data, 0);
+ COMPQUIET(op, 0);
+
+ bp = recbuf->data;
+ dblp = info;
+ dbp = NULL;
+ lp = env->lg_handle->reginfo.primary;
+ downrev = lp->persist.version < DB_LOGVERSION_50;
+ DB_MSGBUF_INIT(&msgbuf);
+
+ /*
+ * The first three fields are always the same in every arg
+ * struct so we know their offsets.
+ */
+ /* type */
+ LOGCOPY_32(env, &type, bp);
+ bp += sizeof(u_int32_t);
+
+ /* txnp */
+ LOGCOPY_32(env, &txnid, bp);
+ bp += sizeof(txnid);
+
+ /* Previous LSN */
+ LOGCOPY_TOLSN(env,&prev_lsn, bp);
+ bp += sizeof(DB_LSN);
+ __db_msgadd(env, &msgbuf,
+ "[%lu][%lu]%s%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ name, (type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)type,
+ (u_long)txnid,
+ (u_long)prev_lsn.file, (u_long)prev_lsn.offset);
+
+ for (sp = spec; sp->type != LOGREC_Done; sp++) {
+ switch (sp->type) {
+ case LOGREC_OP:
+ LOGCOPY_32(env, &op, bp);
+ __db_msgadd(env, &msgbuf, "\t%s: ", sp->name);
+ __db_msgadd(env, &msgbuf, sp->fmt, OP_MODE_GET(op));
+ __db_msgadd(env, &msgbuf, " ptype: %s\n",
+ __db_pagetype_to_string(OP_PAGE_GET(op)));
+ bp += sizeof(uinttmp);
+ break;
+ case LOGREC_DB:
+ LOGCOPY_32(env, &inttmp, bp);
+ __db_msgadd(env, &msgbuf, "\t%s: %lu\n",
+ sp->name, (unsigned long)inttmp);
+ bp += sizeof(inttmp);
+ if (dblp != NULL && inttmp < dblp->dbentry_cnt)
+ dbp = dblp->dbentry[inttmp].dbp;
+ break;
+
+ case LOGREC_DBOP:
+ /* Special op for dbreg_register records. */
+ if (dblp != NULL && (ret =
+ __log_print_dbregister(env, recbuf, dblp)) != 0)
+ return (ret);
+ LOGCOPY_32(env, &uinttmp, bp);
+ switch (FLD_ISSET(uinttmp, DBREG_OP_MASK)) {
+ case DBREG_CHKPNT:
+ s = "CHKPNT";
+ break;
+ case DBREG_CLOSE:
+ s = "CLOSE";
+ break;
+ case DBREG_OPEN:
+ s = "OPEN";
+ break;
+ case DBREG_PREOPEN:
+ s = "PREOPEN";
+ break;
+ case DBREG_RCLOSE:
+ s = "RCLOSE";
+ break;
+ case DBREG_REOPEN:
+ s = "REOPEN";
+ break;
+ case DBREG_XCHKPNT:
+ s = "XCHKPNT";
+ break;
+ case DBREG_XOPEN:
+ s = "XOPEN";
+ break;
+ case DBREG_XREOPEN:
+ s = "XREOPEN";
+ break;
+ default:
+ s = "UNKNOWN";
+ break;
+ }
+ __db_msgadd(env, &msgbuf, "\t%s: %s %lx\n", sp->name,
+ s, (unsigned long)(uinttmp & ~DBREG_OP_MASK));
+ bp += sizeof(uinttmp);
+ break;
+ case LOGREC_ARG:
+ LOGCOPY_32(env, &uinttmp, bp);
+ __db_msgadd(env, &msgbuf, "\t%s: ", sp->name);
+ __db_msgadd(env, &msgbuf, sp->fmt, uinttmp);
+ __db_msgadd(env, &msgbuf, "\n");
+ bp += sizeof(uinttmp);
+ break;
+ case LOGREC_TIME:
+ /* time_t is long but we only store 32 bits. */
+ LOGCOPY_32(env, &uinttmp, bp);
+ timeval = uinttmp;
+ lt = localtime(&timeval);
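+			/*
+			 * Note (editorial): the "20%02lu" format below
+			 * assumes a year of 2000 or later, hence the
+			 * tm_year - 100 arithmetic.
+			 */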
+ __db_msgadd(env, &msgbuf,
+ "\t%s: %ld (%.24s, 20%02lu%02lu%02lu%02lu%02lu.%02lu)\n",
+ sp->name, (long)timeval,
+ __os_ctime(&timeval, time_buf),
+ (u_long)lt->tm_year - 100, (u_long)lt->tm_mon+1,
+ (u_long)lt->tm_mday, (u_long)lt->tm_hour,
+ (u_long)lt->tm_min, (u_long)lt->tm_sec);
+ bp += sizeof(uinttmp);
+ break;
+ case LOGREC_PGDBT:
+ case LOGREC_PGDDBT:
+ case LOGREC_PGLIST:
+ case LOGREC_LOCKS:
+ case LOGREC_HDR:
+ case LOGREC_DATA:
+ case LOGREC_DBT:
+ LOGCOPY_32(env, &uinttmp, bp);
+ bp += sizeof(u_int32_t);
+ switch (sp->type) {
+ case LOGREC_HDR:
+ if (uinttmp == 0)
+ break;
+ has_data = 0;
+ for (np = sp + 1; np->type != LOGREC_Done; np++)
+ if (np->type == LOGREC_DATA) {
+ has_data = 1;
+ break;
+ }
+
+ hdrstart = (PAGE*)bp;
+ hdrsize = uinttmp;
+ hdrname = sp->name;
+ if (has_data == 1)
+ break;
+ /* FALLTHROUGH */
+ case LOGREC_DATA:
+ if (downrev ? LOG_SWAPPED(env) :
+ (dbp != NULL && F_ISSET(dbp, DB_AM_SWAP)))
+ __db_recordswap(op, hdrsize, hdrstart,
+ (has_data && uinttmp != 0) ?
+ bp : NULL, 1);
+ __db_msgadd(env, &msgbuf, "\t%s: ", hdrname);
+ __db_prbytes(env, &msgbuf,
+ (u_int8_t *)hdrstart, hdrsize);
+ if (has_data == 0 || uinttmp == 0)
+ break;
+ /* FALLTHROUGH */
+ default:
+ __db_msgadd(env, &msgbuf, "\t%s: ", sp->name);
+ pr_data:
+ __db_prbytes(env, &msgbuf, bp, uinttmp);
+ has_data = 0;
+ break;
+ case LOGREC_PGDBT:
+ has_data = 0;
+ for (np = sp + 1; np->type != LOGREC_Done; np++)
+ if (np->type == LOGREC_PGDDBT) {
+ has_data = 1;
+ break;
+ }
+
+ hdrstart = (PAGE*)bp;
+ hdrsize = uinttmp;
+ if (has_data == 1)
+ break;
+ /* FALLTHROUGH */
+ case LOGREC_PGDDBT:
+ DB_ASSERT(env, hdrstart != NULL);
+ if (dbp != NULL && (downrev ? LOG_SWAPPED(env) :
+ F_ISSET(dbp, DB_AM_SWAP))) {
+ dbt.data = bp;
+ dbt.size = uinttmp;
+ if ((ret = __db_pageswap(env, dbp,
+ hdrstart, hdrsize, has_data == 0 ?
+ NULL : &dbt, 1)) != 0)
+ return (ret);
+ }
+ if (downrev)
+ goto pr_data;
+ if (ALIGNP_INC(hdrstart,
+ sizeof(u_int32_t)) != hdrstart) {
+ if ((ret = __os_malloc(env,
+ hdrsize, &hdrtmp)) != 0)
+ return (ret);
+ memcpy(hdrtmp, hdrstart, hdrsize);
+ } else
+ hdrtmp = hdrstart;
+ if (has_data == 1 && ALIGNP_INC(bp,
+ sizeof(u_int32_t)) != bp) {
+ if ((ret = __os_malloc(env,
+ uinttmp, &datatmp)) != 0)
+ return (ret);
+ memcpy(datatmp, bp, uinttmp);
+ } else if (has_data == 1)
+ datatmp = bp;
+ else
+ datatmp = NULL;
+ if ((ret = __db_prpage_int(env, &msgbuf,
+ dbp, "\t", hdrtmp,
+ uinttmp, datatmp, DB_PR_PAGE)) != 0)
+ return (ret);
+ has_data = 0;
+ if (hdrtmp != hdrstart)
+ __os_free(env, hdrtmp);
+ if (datatmp != bp && datatmp != NULL)
+ __os_free(env, datatmp);
+ break;
+ case LOGREC_PGLIST:
+ dbt.data = bp;
+ dbt.size = uinttmp;
+ __db_pglist_print(env, &msgbuf, &dbt);
+ break;
+ case LOGREC_LOCKS:
+ dbt.data = bp;
+ dbt.size = uinttmp;
+ __lock_list_print(env, &msgbuf, &dbt);
+ break;
+ }
+ bp += uinttmp;
+ break;
+
+ case LOGREC_POINTER:
+ LOGCOPY_TOLSN(env, &prev_lsn, bp);
+ __db_msgadd(env, &msgbuf,
+ "\t%s: [%lu][%lu]\n", sp->name,
+ (u_long)prev_lsn.file, (u_long)prev_lsn.offset);
+ bp += sizeof(DB_LSN);
+ break;
+ case LOGREC_Done:
+ DB_ASSERT(env, sp->type != LOGREC_Done);
+ }
+ }
+ if (msgbuf.buf != NULL)
+ DB_MSGBUF_FLUSH(env, &msgbuf);
+ else
+ __db_msg(env, "%s", "");
+ return (0);
+}
+
+/*
+ * __log_print_dbregister --
+ * So that we can properly swap and print information from databases
+ * we generate dummy DB handles here. These are real handles that are never
+ * opened but their fileid, meta_pgno and some flags are set properly.
+ * This code uses parallel structures to those in the dbregister code.
+ * The DB_LOG handle passed in must NOT be the real environment handle
+ * since this would confuse actual running transactions if printing is
+ * done while the environment is active.
+ */
+static int
+__log_print_dbregister(env, recbuf, dblp)
+ ENV *env;
+ DBT *recbuf;
+ DB_LOG *dblp;
+{
+ __dbreg_register_args *argp;
+ DB *dbp;
+ DB_ENTRY *dbe;
+ int ret;
+
+ if ((ret = __dbreg_register_read(env, recbuf->data, &argp)) != 0)
+ return (ret);
+
+ if (dblp->dbentry_cnt <= argp->fileid &&
+ (ret = __dbreg_add_dbentry(env, dblp, NULL, argp->fileid)) != 0)
+ goto err;
+ dbe = &dblp->dbentry[argp->fileid];
+ dbp = dbe->dbp;
+
+ switch (FLD_ISSET(argp->opcode, DBREG_OP_MASK)) {
+ case DBREG_CHKPNT:
+ case DBREG_OPEN:
+ case DBREG_REOPEN:
+ case DBREG_XCHKPNT:
+ case DBREG_XOPEN:
+ case DBREG_XREOPEN:
+ if (dbp != NULL) {
+ if (memcmp(dbp->fileid,
+ argp->uid.data, DB_FILE_ID_LEN) == 0 &&
+ dbp->meta_pgno == argp->meta_pgno)
+ goto done;
+ if ((__db_close(dbp, NULL, DB_NOSYNC)) != 0)
+ goto err;
+ dbe->dbp = dbp = NULL;
+ }
+ if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+ goto err;
+ memcpy(dbp->fileid, argp->uid.data, DB_FILE_ID_LEN);
+ dbp->meta_pgno = argp->meta_pgno;
+ F_SET(dbp, DB_AM_RECOVER);
+ /*
+ * We need to swap bytes if we are on a BIGEND machine XOR
+ * we have a BIGEND database.
+ */
+ if ((F_ISSET(env, ENV_LITTLEENDIAN) == 0) ^
+ (FLD_ISSET(argp->opcode, DBREG_BIGEND) != 0))
+ F_SET(dbp, DB_AM_SWAP);
+ if (FLD_ISSET(argp->opcode, DBREG_CHKSUM))
+ F_SET(dbp, DB_AM_CHKSUM);
+ if (FLD_ISSET(argp->opcode, DBREG_ENCRYPT))
+ F_SET(dbp, DB_AM_ENCRYPT);
+ if (FLD_ISSET(argp->opcode, DBREG_EXCL))
+ F2_SET(dbp, DB2_AM_EXCL);
+ dbe->dbp = dbp;
+ break;
+ case DBREG_CLOSE:
+ case DBREG_RCLOSE:
+ if (dbp == NULL)
+ goto err;
+ if ((__db_close(dbp, NULL, DB_NOSYNC)) != 0)
+ goto err;
+ dbe->dbp = dbp = NULL;
+ break;
+ case DBREG_PREOPEN:
+ break;
+ default:
+ DB_ASSERT(env, argp->opcode != argp->opcode);
+ }
+done:
+err:
+ __os_free(env, argp);
+ return (ret);
+}
diff --git a/src/log/log_put.c b/src/log/log_put.c
new file mode 100644
index 00000000..8f7e23d8
--- /dev/null
+++ b/src/log/log_put.c
@@ -0,0 +1,2041 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/hmac.h"
+#include "dbinc/log.h"
+#include "dbinc/txn.h"
+#include "dbinc/db_page.h"
+#include "dbinc_auto/db_ext.h"
+
+static int __log_encrypt_record __P((ENV *, DBT *, HDR *, u_int32_t));
+static int __log_file __P((ENV *, const DB_LSN *, char *, size_t));
+static int __log_fill __P((DB_LOG *, DB_LSN *, void *, u_int32_t));
+static int __log_flush_commit __P((ENV *, const DB_LSN *, u_int32_t));
+static int __log_newfh __P((DB_LOG *, int));
+static int __log_put_next __P((ENV *,
+ DB_LSN *, const DBT *, HDR *, DB_LSN *));
+static int __log_put_record_int __P((ENV *, DB *, DB_TXN *, DB_LSN *,
+ u_int32_t, u_int32_t, u_int32_t, u_int32_t, DB_LOG_RECSPEC *, va_list));
+static int __log_putr __P((DB_LOG *,
+ DB_LSN *, const DBT *, u_int32_t, HDR *));
+static int __log_write __P((DB_LOG *, void *, u_int32_t));
+
+/*
+ * __log_put_pp --
+ * ENV->log_put pre/post processing.
+ *
+ * PUBLIC: int __log_put_pp __P((DB_ENV *, DB_LSN *, const DBT *, u_int32_t));
+ */
+int
+__log_put_pp(dbenv, lsnp, udbt, flags)
+ DB_ENV *dbenv;
+ DB_LSN *lsnp;
+ const DBT *udbt;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->lg_handle, "DB_ENV->log_put", DB_INIT_LOG);
+
+ /* Validate arguments: check for allowed flags. */
+ if ((ret = __db_fchk(env, "DB_ENV->log_put", flags,
+ DB_LOG_CHKPNT | DB_LOG_COMMIT |
+ DB_FLUSH | DB_LOG_NOCOPY | DB_LOG_WRNOSYNC)) != 0)
+ return (ret);
+
+ /* DB_LOG_WRNOSYNC and DB_FLUSH are mutually exclusive. */
+ if (LF_ISSET(DB_LOG_WRNOSYNC) && LF_ISSET(DB_FLUSH))
+ return (__db_ferr(env, "DB_ENV->log_put", 1));
+
+ /* Replication clients should never write log records. */
+ if (IS_REP_CLIENT(env)) {
+ __db_errx(env, DB_STR("2511",
+ "DB_ENV->log_put is illegal on replication clients"));
+ return (EINVAL);
+ }
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__log_put(env, lsnp, udbt, flags)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __log_put --
+ * ENV->log_put.
+ *
+ * PUBLIC: int __log_put __P((ENV *, DB_LSN *, const DBT *, u_int32_t));
+ */
+int
+__log_put(env, lsnp, udbt, flags)
+ ENV *env;
+ DB_LSN *lsnp;
+ const DBT *udbt;
+ u_int32_t flags;
+{
+ DBT *dbt, t;
+ DB_CIPHER *db_cipher;
+ DB_LOG *dblp;
+ DB_LSN lsn, old_lsn;
+ DB_REP *db_rep;
+ HDR hdr;
+ LOG *lp;
+ REP *rep;
+ int lock_held, need_free, ret;
+ u_int8_t *key;
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ db_cipher = env->crypto_handle;
+ db_rep = env->rep_handle;
+ if (db_rep != NULL)
+ rep = db_rep->region;
+ else
+ rep = NULL;
+
+ dbt = &t;
+ t = *udbt;
+ lock_held = need_free = 0;
+ ZERO_LSN(old_lsn);
+ hdr.len = hdr.prev = 0;
+
+ /*
+ * In general, if we are not a rep application, but are sharing a master
+ * rep env, we should not be writing log records. However, we can allow
+ * a non-replication-aware process to join a pre-existing repmgr
+	 * environment, if the env handle meets repmgr's DB_THREAD requirement.
+ */
+
+ if (IS_REP_MASTER(env) && db_rep->send == NULL) {
+#ifdef HAVE_REPLICATION_THREADS
+ if (F_ISSET(env, ENV_THREAD) && APP_IS_REPMGR(env)) {
+ if ((ret = __repmgr_autostart(env)) != 0)
+ return (ret);
+ } else
+#endif
+ {
+#if !defined(DEBUG_ROP) && !defined(DEBUG_WOP)
+ __db_errx(env, DB_STR("2512",
+ "Non-replication DB_ENV handle attempting "
+ "to modify a replicated environment"));
+ return (EINVAL);
+#endif
+ }
+ }
+ DB_ASSERT(env, !IS_REP_CLIENT(env));
+
+ /*
+ * If we are coming from the logging code, we use an internal flag,
+ * DB_LOG_NOCOPY, because we know we can overwrite/encrypt the log
+ * record in place. Otherwise, if a user called log_put then we
+ * must copy it to new memory so that we know we can write it.
+ *
+ * We also must copy it to new memory if we are a replication master
+ * so that we retain an unencrypted copy of the log record to send
+ * to clients.
+ */
+ if (!LF_ISSET(DB_LOG_NOCOPY) || IS_REP_MASTER(env)) {
+ if (CRYPTO_ON(env))
+ t.size += db_cipher->adj_size(udbt->size);
+ if ((ret = __os_calloc(env, 1, t.size, &t.data)) != 0)
+ goto err;
+ need_free = 1;
+ memcpy(t.data, udbt->data, udbt->size);
+ }
+ if ((ret = __log_encrypt_record(env, dbt, &hdr, udbt->size)) != 0)
+ goto err;
+ if (CRYPTO_ON(env))
+ key = db_cipher->mac_key;
+ else
+ key = NULL;
+#ifdef HAVE_LOG_CHECKSUM
+ __db_chksum(&hdr, dbt->data, dbt->size, key, hdr.chksum);
+#endif
+
+ LOG_SYSTEM_LOCK(env);
+ lock_held = 1;
+
+ if ((ret = __log_put_next(env, &lsn, dbt, &hdr, &old_lsn)) != 0)
+ goto panic_check;
+
+ /*
+ * Assign the return LSN before dropping the region lock. Necessary
+ * in case the lsn is a begin_lsn from a TXN_DETAIL structure passed in
+ * by the logging routines. We use atomic 32-bit operations because
+ * during commit this will be a TXN_DETAIL visible_lsn field, and MVCC
+ * relies on reading the fields atomically.
+ */
+ lsnp->file = lsn.file;
+ lsnp->offset = lsn.offset;
+
+#ifdef HAVE_REPLICATION
+ if (IS_REP_MASTER(env)) {
+ __rep_newfile_args nf_args;
+ DBT newfiledbt;
+ REP_BULK bulk;
+ size_t len;
+ u_int32_t ctlflags;
+ u_int8_t buf[__REP_NEWFILE_SIZE];
+
+ /*
+ * Replication masters need to drop the lock to send messages,
+ * but want to drop and reacquire it a minimal number of times.
+ */
+ ctlflags = LF_ISSET(DB_LOG_COMMIT | DB_LOG_CHKPNT) ?
+ REPCTL_PERM : 0;
+ LOG_SYSTEM_UNLOCK(env);
+ lock_held = 0;
+ if (LF_ISSET(DB_FLUSH))
+ ctlflags |= REPCTL_FLUSH;
+
+ /*
+ * If we changed files and we're in a replicated environment,
+ * we need to inform our clients now that we've dropped the
+ * region lock.
+ *
+ * Note that a failed NEWFILE send is a dropped message that
+ * our client can handle, so we can ignore it. It's possible
+ * that the record we already put is a commit, so we don't just
+ * want to return failure.
+ */
+ if (!IS_ZERO_LSN(old_lsn)) {
+ memset(&newfiledbt, 0, sizeof(newfiledbt));
+ nf_args.version = lp->persist.version;
+ (void)__rep_newfile_marshal(env, &nf_args,
+ buf, __REP_NEWFILE_SIZE, &len);
+ DB_INIT_DBT(newfiledbt, buf, len);
+ (void)__rep_send_message(env, DB_EID_BROADCAST,
+ REP_NEWFILE, &old_lsn, &newfiledbt, 0, 0);
+ }
+
+ /*
+ * If we're doing bulk processing put it in the bulk buffer.
+ */
+ ret = 0;
+ if (FLD_ISSET(rep->config, REP_C_BULK)) {
+ /*
+ * Bulk could have been turned on by another process.
+ * If so, set the address into the bulk region now.
+ */
+ if (db_rep->bulk == NULL)
+ db_rep->bulk = R_ADDR(&dblp->reginfo,
+ lp->bulk_buf);
+ memset(&bulk, 0, sizeof(bulk));
+ bulk.addr = db_rep->bulk;
+ bulk.offp = &lp->bulk_off;
+ bulk.len = lp->bulk_len;
+ bulk.lsn = lsn;
+ bulk.type = REP_BULK_LOG;
+ bulk.eid = DB_EID_BROADCAST;
+ bulk.flagsp = &lp->bulk_flags;
+ ret = __rep_bulk_message(env, &bulk, NULL,
+ &lsn, udbt, ctlflags);
+ }
+ if (!FLD_ISSET(rep->config, REP_C_BULK) ||
+ ret == DB_REP_BULKOVF) {
+ /*
+ * Then send the log record itself on to our clients.
+			 *
+ * !!!
+ * In the crypto case, we MUST send the udbt, not the
+ * now-encrypted dbt. Clients have no way to decrypt
+ * without the header.
+ */
+ ret = __rep_send_message(env, DB_EID_BROADCAST,
+ REP_LOG, &lsn, udbt, ctlflags, 0);
+ }
+ if (FLD_ISSET(ctlflags, REPCTL_PERM)) {
+ LOG_SYSTEM_LOCK(env);
+#ifdef HAVE_STATISTICS
+ if (IS_USING_LEASES(env))
+ rep->stat.st_lease_sends++;
+#endif
+ /*
+ * Keep track of our last PERM lsn. Set this on a
+ * master under the log lock. When using leases, if
+ * we set max_perm_lsn too early (before the send)
+ * then we hit a lot of false invalid lease checks
+ * which all try to refresh and hurt performance.
+ */
+ if (LOG_COMPARE(&lp->max_perm_lsn, &lsn) < 0)
+ lp->max_perm_lsn = lsn;
+ LOG_SYSTEM_UNLOCK(env);
+ }
+ /*
+ * If the send fails and we're a commit or checkpoint,
+ * there's nothing we can do; the record's in the log.
+ * Flush it, even if we're running with TXN_NOSYNC,
+ * on the grounds that it should be in durable
+ * form somewhere.
+ */
+ if (ret != 0 && FLD_ISSET(ctlflags, REPCTL_PERM))
+ LF_SET(DB_FLUSH);
+ /*
+		 * We ignore send failures, so reset 'ret' to 0 here.  We
+		 * had to check the special return values from bulk transfer,
+		 * and errors from either bulk or normal message sending
+		 * require a flush on perm records; otherwise the error is
+		 * ignored, so reset 'ret' now.
+ */
+ ret = 0;
+ }
+#endif
+
+ /*
+ * If needed, do a flush. Note that failures at this point
+ * are only permissible if we know we haven't written a commit
+ * record; __log_flush_commit is responsible for enforcing this.
+ *
+ * If a flush is not needed, see if WRITE_NOSYNC was set and we
+ * need to write out the log buffer.
+ */
+ if (LF_ISSET(DB_FLUSH | DB_LOG_WRNOSYNC)) {
+ if (!lock_held) {
+ LOG_SYSTEM_LOCK(env);
+ lock_held = 1;
+ }
+ if ((ret = __log_flush_commit(env, &lsn, flags)) != 0)
+ goto panic_check;
+ }
+
+ /*
+	 * If we flushed a checkpoint record, reset the "bytes since the last
+ * checkpoint" counters.
+ */
+ if (LF_ISSET(DB_LOG_CHKPNT))
+ lp->stat.st_wc_bytes = lp->stat.st_wc_mbytes = 0;
+
+ /* Increment count of records added to the log. */
+ STAT(++lp->stat.st_record);
+
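+	/*
+	 * The "if (0)" block below is never entered in the normal flow;
+	 * the panic_check label inside it is reached only via goto when
+	 * a log write fails.
+	 */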
+ if (0) {
+panic_check: /*
+ * Writing log records cannot fail if we're a replication
+ * master. The reason is that once we send the record to
+ * replication clients, the transaction can no longer
+ * abort, otherwise the master would be out of sync with
+ * the rest of the replication group. Panic the system.
+ */
+ if (ret != 0 && IS_REP_MASTER(env))
+ ret = __env_panic(env, ret);
+ }
+
+err: if (lock_held)
+ LOG_SYSTEM_UNLOCK(env);
+ if (need_free)
+ __os_free(env, dbt->data);
+
+ /*
+ * If auto-remove is set and we switched files, remove unnecessary
+ * log files.
+ */
+ if (ret == 0 && !IS_ZERO_LSN(old_lsn) && lp->db_log_autoremove)
+ __log_autoremove(env);
+
+ return (ret);
+}
+
+/*
+ * __log_current_lsn_int --
+ * internal operations of __log_current_lsn
+ *
+ * PUBLIC: int __log_current_lsn_int
+ * PUBLIC: __P((ENV *, DB_LSN *, u_int32_t *, u_int32_t *));
+ */
+int
+__log_current_lsn_int(env, lsnp, mbytesp, bytesp)
+ ENV *env;
+ DB_LSN *lsnp;
+ u_int32_t *mbytesp, *bytesp;
+{
+ DB_LOG *dblp;
+ LOG *lp;
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ LOG_SYSTEM_LOCK(env);
+
+	/*
+	 * We need the LSN of the last entry in the log.
+	 *
+	 * Typically, it's easy to get the last written LSN: look at the
+	 * current log pointer and back up the length of the last log
+	 * record.  However, if the last thing we did was write the log
+	 * header of a new log file, that doesn't work, so we return the
+	 * LSN of the first log record that will be written in this new
+	 * file.
+	 */
+ *lsnp = lp->lsn;
+ if (lp->lsn.offset > lp->len)
+ lsnp->offset -= lp->len;
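+	/*
+	 * A worked example (illustrative numbers only, not part of the
+	 * original code): if the last record written was 100 bytes long,
+	 * lp->len is 100; with lp->lsn at, say, [3][4900], the LSN
+	 * returned is [3][4800], the start of that last record.
+	 */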
+
+	/*
+	 * Since we're holding the log region lock, return the bytes put
+	 * into the log since the last checkpoint; the transaction
+	 * checkpoint code needs that information.
+	 *
+	 * We add the current buffer offset so as to count bytes that
+	 * have not yet been written, but are sitting in the log buffer.
+	 */
+ if (mbytesp != NULL) {
+ *mbytesp = lp->stat.st_wc_mbytes;
+ *bytesp = (u_int32_t)(lp->stat.st_wc_bytes + lp->b_off);
+ }
+
+ LOG_SYSTEM_UNLOCK(env);
+
+ return (0);
+}
+
+/*
+ * __log_current_lsn --
+ * Return the current LSN.
+ *
+ * PUBLIC: int __log_current_lsn
+ * PUBLIC: __P((ENV *, DB_LSN *, u_int32_t *, u_int32_t *));
+ */
+int
+__log_current_lsn(env, lsnp, mbytesp, bytesp)
+ ENV *env;
+ DB_LSN *lsnp;
+ u_int32_t *mbytesp, *bytesp;
+{
+ DB_THREAD_INFO *ip;
+ int ret;
+
+	ENV_ENTER(env, ip);
+	ret = __log_current_lsn_int(env, lsnp, mbytesp, bytesp);
+	ENV_LEAVE(env, ip);
+
+	return (ret);
+}
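+
+/*
+ * A minimal caller sketch (assumed usage, not part of this change):
+ *
+ *	DB_LSN lsn;
+ *	u_int32_t mbytes, bytes;
+ *
+ *	if ((ret = __log_current_lsn(env, &lsn, &mbytes, &bytes)) == 0)
+ *		printf("end of log: [%lu][%lu], %lu MB + %lu bytes "
+ *		    "since last checkpoint\n", (u_long)lsn.file,
+ *		    (u_long)lsn.offset, (u_long)mbytes, (u_long)bytes);
+ */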
+
+/*
+ * __log_put_next --
+ * Put the given record as the next in the log, wherever that may
+ * turn out to be.
+ */
+static int
+__log_put_next(env, lsn, dbt, hdr, old_lsnp)
+ ENV *env;
+ DB_LSN *lsn;
+ const DBT *dbt;
+ HDR *hdr;
+ DB_LSN *old_lsnp;
+{
+ DB_LOG *dblp;
+ DB_LSN old_lsn;
+ LOG *lp;
+ int adv_file, newfile, ret;
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ /*
+ * Save a copy of lp->lsn before we might decide to switch log
+ * files and change it. If we do switch log files, and we're
+ * doing replication, we'll need to tell our clients about the
+ * switch, and they need to receive a NEWFILE message
+ * with this "would-be" LSN in order to know they're not
+ * missing any log records.
+ */
+ old_lsn = lp->lsn;
+ newfile = 0;
+ adv_file = 0;
+ /*
+ * If our current log is at an older version and we want to write
+ * a record then we need to advance the log.
+ */
+ if (lp->persist.version != DB_LOGVERSION) {
+ __log_set_version(env, DB_LOGVERSION);
+ adv_file = 1;
+ }
+
+	/*
+	 * If this record won't fit in the current file, if we had to
+	 * advance the log version, or if we're at the beginning of a
+	 * file, swap files.
+	 */
+ if (adv_file || lp->lsn.offset == 0 ||
+ lp->lsn.offset + hdr->size + dbt->size > lp->log_size) {
+ if (hdr->size + sizeof(LOGP) + dbt->size > lp->log_size) {
+ __db_errx(env, DB_STR_A("2513",
+ "DB_ENV->log_put: record larger than maximum file size (%lu > %lu)",
+ "%lu %lu"),
+ (u_long)hdr->size + sizeof(LOGP) + dbt->size,
+ (u_long)lp->log_size);
+ return (EINVAL);
+ }
+
+ if ((ret = __log_newfile(dblp, NULL, 0, 0)) != 0)
+ return (ret);
+
+ /*
+ * Flag that we switched files, in case we're a master
+ * and need to send this information to our clients.
+ * We postpone doing the actual send until we can
+ * safely release the log region lock and are doing so
+ * anyway.
+ */
+ newfile = 1;
+ }
+
+ /* If we switched log files, let our caller know where. */
+ if (newfile)
+ *old_lsnp = old_lsn;
+
+ /* Actually put the record. */
+ return (__log_putr(dblp, lsn, dbt, lp->lsn.offset - lp->len, hdr));
+}
+
+/*
+ * __log_flush_commit --
+ * Flush a record.
+ */
+static int
+__log_flush_commit(env, lsnp, flags)
+ ENV *env;
+ const DB_LSN *lsnp;
+ u_int32_t flags;
+{
+ DB_LOG *dblp;
+ DB_LSN flush_lsn;
+ HDR hdr;
+ LOG *lp;
+ int ret, t_ret;
+ size_t nr, nw;
+ u_int8_t *buffer;
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ flush_lsn = *lsnp;
+
+ ret = 0;
+
+ /*
+ * DB_FLUSH:
+ * Flush a record for which the DB_FLUSH flag to log_put was set.
+ *
+ * DB_LOG_WRNOSYNC:
+ * If there's anything in the current log buffer, write it out.
+ */
+ if (LF_ISSET(DB_FLUSH))
+ ret = __log_flush_int(dblp, &flush_lsn, 1);
+ else if (!lp->db_log_inmemory && lp->b_off != 0)
+ if ((ret = __log_write(dblp,
+ dblp->bufp, (u_int32_t)lp->b_off)) == 0)
+ lp->b_off = 0;
+
+	/*
+	 * If a flush supporting a transaction commit fails, we must
+	 * abort the transaction.  (If we aren't doing a commit, return
+	 * the failure; if the commit we care about made it to disk
+	 * successfully, we just ignore the failure, because there's no
+	 * way to undo the commit.)
+	 */
+ if (ret == 0 || !LF_ISSET(DB_LOG_COMMIT))
+ return (ret);
+
+ if (LF_ISSET(DB_FLUSH) ?
+ flush_lsn.file != lp->s_lsn.file ||
+ flush_lsn.offset < lp->s_lsn.offset :
+ flush_lsn.file != lp->lsn.file || flush_lsn.offset < lp->w_off)
+ return (0);
+
+ if (IS_REP_MASTER(env)) {
+ __db_err(env, ret, DB_STR("2514",
+ "Write failed on MASTER commit."));
+ return (__env_panic(env, ret));
+ }
+
+	/*
+	 * Else, make sure that the commit record does not get out after
+	 * we abort the transaction.  Do this by overwriting the commit
+	 * record in the buffer.  (Note that other commits in this buffer
+	 * will wait until a successful write happens; we do not wake
+	 * them.)  We point at the right part of the buffer and write an
+	 * abort record over the commit.  We must then try to flush the
+	 * buffer again, since the interesting part of the buffer may
+	 * have actually made it out to disk before the failure; we
+	 * can't know for sure.
+	 */
+ if (flush_lsn.offset > lp->w_off) {
+ if ((t_ret = __txn_force_abort(env,
+ dblp->bufp + flush_lsn.offset - lp->w_off)) != 0)
+ return (__env_panic(env, t_ret));
+ } else {
+		/*
+		 * The buffer was written, but it's not on disk; we must
+		 * read it back and force things from a commit state to
+		 * an abort state.  Lots of things could fail here, and
+		 * if they do, we will be left with a commit record on
+		 * disk but return a panic.
+		 */
+		if ((t_ret = __os_seek(env,
+		    dblp->lfhp, 0, 0, flush_lsn.offset)) != 0 ||
+		    (t_ret = __os_read(env, dblp->lfhp, &hdr,
+		    HDR_NORMAL_SZ, &nr)) != 0 || nr != HDR_NORMAL_SZ)
+			return (__env_panic(env, t_ret == 0 ? EIO : t_ret));
+ if (LOG_SWAPPED(env))
+ __log_hdrswap(&hdr, CRYPTO_ON(env));
+ if ((t_ret = __os_malloc(env, hdr.len, &buffer)) != 0 ||
+ (t_ret = __os_seek(env,
+ dblp->lfhp, 0, 0, flush_lsn.offset)) != 0 ||
+ (t_ret = __os_read(env, dblp->lfhp, buffer,
+ hdr.len, &nr)) != 0 || nr != hdr.len ||
+ (t_ret = __txn_force_abort(env, buffer)) != 0 ||
+ (t_ret = __os_seek(env,
+ dblp->lfhp, 0, 0, flush_lsn.offset)) != 0 ||
+ (t_ret = __os_write(env, dblp->lfhp, buffer,
+ nr, &nw)) != 0 || nw != nr)
+ return (__env_panic(env, t_ret == 0 ? EIO : t_ret));
+ __os_free(env, buffer);
+ }
+	/*
+	 * Try to flush the log again; if the disk just bounced, we want
+	 * to be sure it does not go away again before we write the
+	 * abort record.
+	 */
+ (void)__log_flush_int(dblp, &flush_lsn, 0);
+
+ return (ret);
+}
+
+/*
+ * __log_newfile --
+ * Initialize and switch to a new log file. (Note that this is
+ * called both when no log yet exists and when we fill a log file.)
+ *
+ * PUBLIC: int __log_newfile __P((DB_LOG *, DB_LSN *, u_int32_t, u_int32_t));
+ */
+int
+__log_newfile(dblp, lsnp, logfile, version)
+ DB_LOG *dblp;
+ DB_LSN *lsnp;
+ u_int32_t logfile;
+ u_int32_t version;
+{
+ DBT t;
+ DB_CIPHER *db_cipher;
+ DB_LSN lsn;
+ ENV *env;
+ HDR hdr;
+ LOG *lp;
+ LOGP *tpersist;
+ int need_free, ret;
+ u_int32_t lastoff;
+ size_t tsize;
+
+ env = dblp->env;
+ lp = dblp->reginfo.primary;
+
+ /*
+ * If we're not specifying a specific log file number and we're
+ * not at the beginning of a file already, start a new one.
+ */
+ if (logfile == 0 && lp->lsn.offset != 0) {
+ /*
+ * Flush the log so this file is out and can be closed. We
+ * cannot release the region lock here because we need to
+ * protect the end of the file while we switch. In
+ * particular, a thread with a smaller record than ours
+ * could detect that there is space in the log. Even
+ * blocking that event by declaring the file full would
+ * require all threads to wait here so that the lsn.file
+ * can be moved ahead after the flush completes. This
+ * probably can be changed if we had an lsn for the
+ * previous file and one for the current, but it does not
+ * seem like this would get much more throughput, if any.
+ */
+ if ((ret = __log_flush_int(dblp, NULL, 0)) != 0)
+ return (ret);
+
+ /*
+ * Save the last known offset from the previous file, we'll
+ * need it to initialize the persistent header information.
+ */
+ lastoff = lp->lsn.offset;
+
+ /* Point the current LSN to the new file. */
+ ++lp->lsn.file;
+ lp->lsn.offset = 0;
+
+ /* Reset the file write offset. */
+ lp->w_off = 0;
+ } else
+ lastoff = 0;
+
+ /*
+ * Replication may require we reset the log file name space entirely.
+ * In that case we also force a file switch so that replication can
+ * clean up old files.
+ */
+ if (logfile != 0) {
+ lp->lsn.file = logfile;
+ lp->lsn.offset = 0;
+ lp->w_off = 0;
+ if (lp->db_log_inmemory) {
+ lsn = lp->lsn;
+ (void)__log_zero(env, &lsn);
+ } else {
+ lp->s_lsn = lp->lsn;
+ if ((ret = __log_newfh(dblp, 1)) != 0)
+ return (ret);
+ }
+ }
+
+ DB_ASSERT(env, lp->db_log_inmemory || lp->b_off == 0);
+ if (lp->db_log_inmemory &&
+ (ret = __log_inmem_newfile(dblp, lp->lsn.file)) != 0)
+ return (ret);
+
+	/*
+	 * Insert persistent information as the first record in every
+	 * file.  Note that the previous length is wrong for the very
+	 * first record of the log, but that's okay; we check for it
+	 * during retrieval.
+	 */
+ memset(&t, 0, sizeof(t));
+ memset(&hdr, 0, sizeof(HDR));
+
+ need_free = 0;
+ tsize = sizeof(LOGP);
+ db_cipher = env->crypto_handle;
+ if (CRYPTO_ON(env))
+ tsize += db_cipher->adj_size(tsize);
+ if ((ret = __os_calloc(env, 1, tsize, &tpersist)) != 0)
+ return (ret);
+ need_free = 1;
+ /*
+ * If we're told what version to make this file, then we
+ * need to be at that version. Update here.
+ */
+ if (version != 0) {
+ __log_set_version(env, version);
+ if ((ret = __env_init_rec(env, version)) != 0)
+ goto err;
+ }
+ lp->persist.log_size = lp->log_size = lp->log_nsize;
+ memcpy(tpersist, &lp->persist, sizeof(LOGP));
+ DB_SET_DBT(t, tpersist, tsize);
+ if (LOG_SWAPPED(env))
+ __log_persistswap(tpersist);
+
+ if ((ret =
+ __log_encrypt_record(env, &t, &hdr, (u_int32_t)tsize)) != 0)
+ goto err;
+
+ if ((ret = __log_putr(dblp, &lsn,
+ &t, lastoff == 0 ? 0 : lastoff - lp->len, &hdr)) != 0)
+ goto err;
+
+ /* Update the LSN information returned to the caller. */
+ if (lsnp != NULL)
+ *lsnp = lp->lsn;
+
+err: if (need_free)
+ __os_free(env, tpersist);
+ return (ret);
+}
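+
+/*
+ * A sketch of the resulting file layout (descriptive aid, not new
+ * code): every log file begins with the LOGP persist record written
+ * above, so the first "real" record in file N starts just past
+ * sizeof(LOGP) plus the record header, and the persist record's prev
+ * offset (lastoff - lp->len) points back at the start of the last
+ * record in file N - 1.
+ */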
+
+/*
+ * __log_putr --
+ * Actually put a record into the log.
+ */
+static int
+__log_putr(dblp, lsn, dbt, prev, h)
+ DB_LOG *dblp;
+ DB_LSN *lsn;
+ const DBT *dbt;
+ u_int32_t prev;
+ HDR *h;
+{
+ DB_CIPHER *db_cipher;
+ DB_LSN f_lsn;
+ ENV *env;
+ HDR tmp, *hdr;
+ LOG *lp;
+ int ret, t_ret;
+ db_size_t b_off;
+ size_t nr;
+ u_int32_t w_off;
+
+ env = dblp->env;
+ lp = dblp->reginfo.primary;
+
+ /*
+ * If we weren't given a header, use a local one.
+ */
+ db_cipher = env->crypto_handle;
+ if (h == NULL) {
+ hdr = &tmp;
+ memset(hdr, 0, sizeof(HDR));
+ if (CRYPTO_ON(env))
+ hdr->size = HDR_CRYPTO_SZ;
+ else
+ hdr->size = HDR_NORMAL_SZ;
+ } else
+ hdr = h;
+
+ /* Save our position in case we fail. */
+ b_off = lp->b_off;
+ w_off = lp->w_off;
+ f_lsn = lp->f_lsn;
+
+ /*
+ * Initialize the header. If we just switched files, lsn.offset will
+ * be 0, and what we really want is the offset of the previous record
+ * in the previous file. Fortunately, prev holds the value we want.
+ */
+ hdr->prev = prev;
+ hdr->len = (u_int32_t)hdr->size + dbt->size;
+
+#ifdef HAVE_LOG_CHECKSUM
+ /*
+ * If we were passed in a nonzero checksum, our caller calculated
+ * the checksum before acquiring the log mutex, as an optimization.
+ *
+ * If our caller calculated a real checksum of 0, we'll needlessly
+ * recalculate it. C'est la vie; there's no out-of-bounds value
+ * here.
+ */
+ if (hdr->chksum[0] == 0) {
+ if (lp->persist.version < DB_LOGCHKSUM)
+ __db_chksum(NULL, dbt->data, dbt->size,
+ (CRYPTO_ON(env)) ? db_cipher->mac_key : NULL,
+ hdr->chksum);
+ else
+ __db_chksum(hdr, dbt->data, dbt->size,
+ (CRYPTO_ON(env)) ? db_cipher->mac_key : NULL,
+ hdr->chksum);
+ } else if (lp->persist.version >= DB_LOGCHKSUM)
+ /*
+ * We need to include hdr->prev and len here, since they were
+ * still zero at the time of the caller's __db_chksum() call.
+ */
+ LOG_HDR_SUM(CRYPTO_ON(env), hdr, hdr->chksum);
+#endif
+
+ if (lp->db_log_inmemory && (ret = __log_inmem_chkspace(dblp,
+ (u_int32_t)hdr->size + dbt->size)) != 0)
+ goto err;
+
+ /*
+ * The offset into the log file at this point is the LSN where
+ * we're about to put this record, and is the LSN the caller wants.
+ */
+ *lsn = lp->lsn;
+
+ nr = hdr->size;
+ if (LOG_SWAPPED(env))
+ __log_hdrswap(hdr, CRYPTO_ON(env));
+
+	/* nr can't overflow a 32-bit value -- the header size is internal. */
+ ret = __log_fill(dblp, lsn, hdr, (u_int32_t)nr);
+
+ if (LOG_SWAPPED(env))
+ __log_hdrswap(hdr, CRYPTO_ON(env));
+
+ if (ret != 0)
+ goto err;
+
+ if ((ret = __log_fill(dblp, lsn, dbt->data, dbt->size)) != 0)
+ goto err;
+
+ lp->len = (u_int32_t)(hdr->size + dbt->size);
+ lp->lsn.offset += lp->len;
+ return (0);
+err:
+ /*
+ * If we wrote more than one buffer before failing, get the
+ * first one back. The extra buffers will fail the checksums
+ * and be ignored.
+ */
+ if (w_off + lp->buffer_size < lp->w_off) {
+ DB_ASSERT(env, !lp->db_log_inmemory);
+ if ((t_ret = __os_seek(env, dblp->lfhp, 0, 0, w_off)) != 0 ||
+ (t_ret = __os_read(env, dblp->lfhp, dblp->bufp,
+ b_off, &nr)) != 0)
+ return (__env_panic(env, t_ret));
+ if (nr != b_off) {
+ __db_errx(env, DB_STR("2515",
+ "Short read while restoring log"));
+ return (__env_panic(env, EIO));
+ }
+ }
+
+ /* Reset to where we started. */
+ lp->w_off = w_off;
+ lp->b_off = b_off;
+ lp->f_lsn = f_lsn;
+
+ return (ret);
+}
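+
+/*
+ * For reference, a sketch of the record layout __log_putr produces in
+ * the common (non-crypto) case; this is a reading aid, not new code:
+ *
+ *	+----------------------+------------------------+
+ *	| HDR (HDR_NORMAL_SZ)  | dbt->data (dbt->size)  |
+ *	+----------------------+------------------------+
+ *
+ * hdr->len covers both pieces, and hdr->prev holds the offset of the
+ * previous record, which is what makes backward traversal possible.
+ */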
+
+/*
+ * __log_flush_pp --
+ * ENV->log_flush pre/post processing.
+ *
+ * PUBLIC: int __log_flush_pp __P((DB_ENV *, const DB_LSN *));
+ */
+int
+__log_flush_pp(dbenv, lsn)
+ DB_ENV *dbenv;
+ const DB_LSN *lsn;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->lg_handle, "DB_ENV->log_flush", DB_INIT_LOG);
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__log_flush(env, lsn)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * See if we need to wait.  s_lsn is not locked, so some care is needed.
+ * The sync point can only move forward.  The lsnp->file cannot be
+ * greater than the s_lsn.file.  If the file we want is in the past,
+ * we are done.  If the file numbers are the same, check the offset.
+ * This all assumes we can read a 32-bit quantity in one state or
+ * the other, never in transition.
+ */
+#define ALREADY_FLUSHED(lp, lsnp) \
+ (((lp)->s_lsn.file > (lsnp)->file) || \
+ ((lp)->s_lsn.file == (lsnp)->file && \
+ (lp)->s_lsn.offset > (lsnp)->offset))
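+
+/*
+ * For example (illustrative values): with s_lsn == [5][2000], a request
+ * for [5][1999] or [4][9000] is already flushed, but [5][2000] itself is
+ * not -- s_lsn is the first byte not known to be on disk, hence the
+ * strict comparison on the offset.
+ */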
+
+/*
+ * __log_flush --
+ * ENV->log_flush
+ *
+ * PUBLIC: int __log_flush __P((ENV *, const DB_LSN *));
+ */
+int
+__log_flush(env, lsn)
+ ENV *env;
+ const DB_LSN *lsn;
+{
+ DB_LOG *dblp;
+ LOG *lp;
+ int ret;
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ if (lsn != NULL && ALREADY_FLUSHED(lp, lsn))
+ return (0);
+ LOG_SYSTEM_LOCK(env);
+ ret = __log_flush_int(dblp, lsn, 1);
+ LOG_SYSTEM_UNLOCK(env);
+ return (ret);
+}
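+
+/*
+ * A minimal caller sketch (assumed usage, mirroring the public
+ * DB_ENV->log_flush contract): write a record without DB_FLUSH, then
+ * force it to disk later:
+ *
+ *	DB_LSN lsn;
+ *
+ *	if ((ret = __log_put(env, &lsn, &rec, 0)) == 0)
+ *		ret = __log_flush(env, &lsn);
+ */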
+
+/*
+ * __log_flush_int --
+ * Write all records less than or equal to the specified LSN; internal
+ * version.
+ *
+ * PUBLIC: int __log_flush_int __P((DB_LOG *, const DB_LSN *, int));
+ */
+int
+__log_flush_int(dblp, lsnp, release)
+ DB_LOG *dblp;
+ const DB_LSN *lsnp;
+ int release;
+{
+ struct __db_commit *commit;
+ ENV *env;
+ DB_LSN flush_lsn, f_lsn;
+ LOG *lp;
+ size_t b_off;
+ u_int32_t ncommit, w_off;
+ int do_flush, first, ret;
+
+ env = dblp->env;
+ lp = dblp->reginfo.primary;
+ ncommit = 0;
+ ret = 0;
+
+ if (lp->db_log_inmemory) {
+ lp->s_lsn = lp->lsn;
+ STAT(++lp->stat.st_scount);
+ return (0);
+ }
+
+	/*
+	 * If no LSN is specified, flush the entire log by setting the
+	 * flush LSN to the last LSN written in the log.  Otherwise,
+	 * check that the LSN isn't a non-existent record for the log.
+	 */
+ if (lsnp == NULL) {
+ flush_lsn.file = lp->lsn.file;
+ flush_lsn.offset = lp->lsn.offset - lp->len;
+ } else if (lsnp->file > lp->lsn.file ||
+ (lsnp->file == lp->lsn.file &&
+ lsnp->offset > lp->lsn.offset - lp->len)) {
+ __db_errx(env, DB_STR_A("2516",
+ "DB_ENV->log_flush: LSN of %lu/%lu past current end-of-log of %lu/%lu",
+ "%lu %lu %lu %lu"), (u_long)lsnp->file,
+ (u_long)lsnp->offset, (u_long)lp->lsn.file,
+ (u_long)lp->lsn.offset);
+ __db_errx(env, DB_STR("2517",
+ "Database environment corrupt; the wrong log files may "
+ "have been removed or incompatible database files "
+ "imported from another environment"));
+ return (__env_panic(env, DB_RUNRECOVERY));
+ } else {
+ if (ALREADY_FLUSHED(lp, lsnp))
+ return (0);
+ flush_lsn = *lsnp;
+ }
+
+ /*
+ * If a flush is in progress and we're allowed to do so, drop
+ * the region lock and block waiting for the next flush.
+ */
+ if (release && lp->in_flush != 0) {
+ if ((commit = SH_TAILQ_FIRST(
+ &lp->free_commits, __db_commit)) == NULL) {
+ if ((ret = __env_alloc(&dblp->reginfo,
+ sizeof(struct __db_commit), &commit)) != 0)
+ goto flush;
+ memset(commit, 0, sizeof(*commit));
+ if ((ret = __mutex_alloc(env, MTX_TXN_COMMIT,
+ DB_MUTEX_SELF_BLOCK, &commit->mtx_txnwait)) != 0) {
+ __env_alloc_free(&dblp->reginfo, commit);
+ return (ret);
+ }
+ MUTEX_LOCK(env, commit->mtx_txnwait);
+ } else
+ SH_TAILQ_REMOVE(
+ &lp->free_commits, commit, links, __db_commit);
+
+ lp->ncommit++;
+
+ /*
+ * Flushes may be requested out of LSN order; be
+ * sure we only move lp->t_lsn forward.
+ */
+ if (LOG_COMPARE(&lp->t_lsn, &flush_lsn) < 0)
+ lp->t_lsn = flush_lsn;
+
+ commit->lsn = flush_lsn;
+ SH_TAILQ_INSERT_HEAD(
+ &lp->commits, commit, links, __db_commit);
+ LOG_SYSTEM_UNLOCK(env);
+ /* Wait here for the in-progress flush to finish. */
+ MUTEX_LOCK(env, commit->mtx_txnwait);
+ LOG_SYSTEM_LOCK(env);
+
+ lp->ncommit--;
+ /*
+ * Grab the flag before freeing the struct to see if
+ * we need to flush the log to commit. If so,
+ * use the maximal lsn for any committing thread.
+ */
+ do_flush = F_ISSET(commit, DB_COMMIT_FLUSH);
+ F_CLR(commit, DB_COMMIT_FLUSH);
+ SH_TAILQ_INSERT_HEAD(
+ &lp->free_commits, commit, links, __db_commit);
+ if (do_flush) {
+ lp->in_flush--;
+ flush_lsn = lp->t_lsn;
+ } else
+ return (0);
+ }
+
+ /*
+ * Protect flushing with its own mutex so we can release
+ * the region lock except during file switches.
+ */
+flush: MUTEX_LOCK(env, lp->mtx_flush);
+
+ /*
+ * If the LSN is less than or equal to the last-sync'd LSN, we're done.
+ * Note, the last-sync LSN saved in s_lsn is the LSN of the first byte
+ * after the byte we absolutely know was written to disk, so the test
+ * is <, not <=.
+ */
+ if (flush_lsn.file < lp->s_lsn.file ||
+ (flush_lsn.file == lp->s_lsn.file &&
+ flush_lsn.offset < lp->s_lsn.offset)) {
+ MUTEX_UNLOCK(env, lp->mtx_flush);
+ goto done;
+ }
+
+ /*
+ * We may need to write the current buffer. We have to write the
+ * current buffer if the flush LSN is greater than or equal to the
+ * buffer's starting LSN.
+ *
+ * Otherwise, it's still possible that this thread may never have
+ * written to this log file. Acquire a file descriptor if we don't
+ * already have one.
+ */
+ if (lp->b_off != 0 && LOG_COMPARE(&flush_lsn, &lp->f_lsn) >= 0) {
+ if ((ret = __log_write(dblp,
+ dblp->bufp, (u_int32_t)lp->b_off)) != 0) {
+ MUTEX_UNLOCK(env, lp->mtx_flush);
+ goto done;
+ }
+
+ lp->b_off = 0;
+ } else if (dblp->lfhp == NULL || dblp->lfname != lp->lsn.file)
+ if ((ret = __log_newfh(dblp, 0)) != 0) {
+ MUTEX_UNLOCK(env, lp->mtx_flush);
+ goto done;
+ }
+
+	/*
+	 * We are going to flush; release the region.  First record the
+	 * current state of the buffer, since another write may come in
+	 * while we flush, but we must not flush that new data.
+	 */
+ b_off = lp->b_off;
+ w_off = lp->w_off;
+ f_lsn = lp->f_lsn;
+ lp->in_flush++;
+ if (release)
+ LOG_SYSTEM_UNLOCK(env);
+
+ /* Sync all writes to disk. */
+ if ((ret = __os_fsync(env, dblp->lfhp)) != 0) {
+ MUTEX_UNLOCK(env, lp->mtx_flush);
+ if (release)
+ LOG_SYSTEM_LOCK(env);
+ lp->in_flush--;
+ goto done;
+ }
+
+	/*
+	 * Set the last-synced LSN.
+	 * This value must be set to the LSN past the last complete
+	 * record that has been flushed.  This is at least the first
+	 * lsn, f_lsn.  If the buffer is empty (b_off == 0), then we
+	 * can move it up to the write point, since the first lsn is
+	 * not yet set for the new buffer.
+	 */
+ lp->s_lsn = f_lsn;
+ if (b_off == 0)
+ lp->s_lsn.offset = w_off;
+
+ MUTEX_UNLOCK(env, lp->mtx_flush);
+ if (release)
+ LOG_SYSTEM_LOCK(env);
+
+ lp->in_flush--;
+ STAT(++lp->stat.st_scount);
+
+ /*
+ * How many flush calls (usually commits) did this call actually sync?
+ * At least one, if it got here.
+ */
+ ncommit = 1;
+done:
+ if (lp->ncommit != 0) {
+ first = 1;
+ SH_TAILQ_FOREACH(commit, &lp->commits, links, __db_commit)
+ if (LOG_COMPARE(&lp->s_lsn, &commit->lsn) > 0) {
+ MUTEX_UNLOCK(env, commit->mtx_txnwait);
+ SH_TAILQ_REMOVE(
+ &lp->commits, commit, links, __db_commit);
+ ncommit++;
+ } else if (first == 1) {
+ F_SET(commit, DB_COMMIT_FLUSH);
+ MUTEX_UNLOCK(env, commit->mtx_txnwait);
+ SH_TAILQ_REMOVE(
+ &lp->commits, commit, links, __db_commit);
+				/*
+				 * This thread will wake and flush.
+				 * If another thread commits and flushes
+				 * first, we will waste a trip through
+				 * the mutex.
+				 */
+ lp->in_flush++;
+ first = 0;
+ }
+ }
+#ifdef HAVE_STATISTICS
+ if (lp->stat.st_maxcommitperflush < ncommit)
+ lp->stat.st_maxcommitperflush = ncommit;
+ if (lp->stat.st_mincommitperflush > ncommit ||
+ lp->stat.st_mincommitperflush == 0)
+ lp->stat.st_mincommitperflush = ncommit;
+#endif
+
+ return (ret);
+}
+
+/*
+ * __log_fill --
+ * Write information into the log.
+ */
+static int
+__log_fill(dblp, lsn, addr, len)
+ DB_LOG *dblp;
+ DB_LSN *lsn;
+ void *addr;
+ u_int32_t len;
+{
+ LOG *lp;
+ u_int32_t bsize, nrec;
+ size_t nw, remain;
+ int ret;
+
+ lp = dblp->reginfo.primary;
+ bsize = lp->buffer_size;
+
+ if (lp->db_log_inmemory) {
+ __log_inmem_copyin(dblp, lp->b_off, addr, len);
+ lp->b_off = (lp->b_off + len) % lp->buffer_size;
+ return (0);
+ }
+
+ while (len > 0) { /* Copy out the data. */
+ /*
+ * If we're beginning a new buffer, note the user LSN to which
+ * the first byte of the buffer belongs. We have to know this
+ * when flushing the buffer so that we know if the in-memory
+ * buffer needs to be flushed.
+ */
+ if (lp->b_off == 0)
+ lp->f_lsn = *lsn;
+
+ /*
+ * If we're on a buffer boundary and the data is big enough,
+ * copy as many records as we can directly from the data.
+ */
+ if (lp->b_off == 0 && len >= bsize) {
+ nrec = len / bsize;
+ if ((ret = __log_write(dblp, addr, nrec * bsize)) != 0)
+ return (ret);
+ addr = (u_int8_t *)addr + nrec * bsize;
+ len -= nrec * bsize;
+ STAT(++lp->stat.st_wcount_fill);
+ continue;
+ }
+
+ /* Figure out how many bytes we can copy this time. */
+ remain = bsize - lp->b_off;
+ nw = remain > len ? len : remain;
+ memcpy(dblp->bufp + lp->b_off, addr, nw);
+ addr = (u_int8_t *)addr + nw;
+ len -= (u_int32_t)nw;
+ lp->b_off += (u_int32_t)nw;
+
+ /* If we fill the buffer, flush it. */
+ if (lp->b_off == bsize) {
+ if ((ret = __log_write(dblp, dblp->bufp, bsize)) != 0)
+ return (ret);
+ lp->b_off = 0;
+ STAT(++lp->stat.st_wcount_fill);
+ }
+ }
+ return (0);
+}
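+
+/*
+ * A worked example of the loop above (illustrative numbers): with a
+ * 32KB buffer (bsize == 32768), writing a 100-byte record at
+ * b_off == 32668 copies 100 bytes and leaves b_off == bsize, so the
+ * buffer is written out and b_off resets to 0; a subsequent 64KB
+ * record arriving at b_off == 0 bypasses the buffer entirely and is
+ * written directly with __log_write().
+ */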
+
+/*
+ * __log_write --
+ * Write the log buffer to disk.
+ */
+static int
+__log_write(dblp, addr, len)
+ DB_LOG *dblp;
+ void *addr;
+ u_int32_t len;
+{
+ ENV *env;
+ LOG *lp;
+ size_t nw;
+ int ret;
+
+ env = dblp->env;
+ lp = dblp->reginfo.primary;
+
+ DB_ASSERT(env, !lp->db_log_inmemory);
+
+ /*
+ * If we haven't opened the log file yet or the current one has
+ * changed, acquire a new log file. We are creating the file if we're
+ * about to write to the start of it, in other words, if the write
+ * offset is zero.
+ */
+ if (dblp->lfhp == NULL || dblp->lfname != lp->lsn.file ||
+ dblp->lf_timestamp != lp->timestamp)
+ if ((ret = __log_newfh(dblp, lp->w_off == 0)) != 0)
+ return (ret);
+
+ /*
+ * If we're writing the first block in a log file on a filesystem that
+ * guarantees unwritten blocks are zero-filled, we set the size of the
+ * file in advance. This increases sync performance on some systems,
+ * because they don't need to update metadata on every sync.
+ *
+ * Ignore any error -- we may have run out of disk space, but that's no
+ * reason to quit.
+ */
+#ifdef HAVE_FILESYSTEM_NOTZERO
+ if (lp->w_off == 0 && !__os_fs_notzero()) {
+#else
+ if (lp->w_off == 0) {
+#endif
+ (void)__db_file_extend(env, dblp->lfhp, lp->log_size);
+ if (F_ISSET(dblp, DBLOG_ZERO))
+ (void)__db_zero_extend(env, dblp->lfhp,
+ 0, lp->log_size/lp->buffer_size, lp->buffer_size);
+
+ }
+
+ /*
+ * Seek to the offset in the file (someone may have written it
+ * since we last did).
+ */
+ if ((ret = __os_io(env, DB_IO_WRITE,
+ dblp->lfhp, 0, 0, lp->w_off, len, addr, &nw)) != 0)
+ return (ret);
+
+ /* Reset the buffer offset and update the seek offset. */
+ lp->w_off += len;
+
+ /* Update written statistics. */
+ if ((lp->stat.st_wc_bytes += len) >= MEGABYTE) {
+ lp->stat.st_wc_bytes -= MEGABYTE;
+ ++lp->stat.st_wc_mbytes;
+ }
+#ifdef HAVE_STATISTICS
+ if ((lp->stat.st_w_bytes += len) >= MEGABYTE) {
+ lp->stat.st_w_bytes -= MEGABYTE;
+ ++lp->stat.st_w_mbytes;
+ }
+ ++lp->stat.st_wcount;
+#endif
+
+ return (0);
+}
+
+/*
+ * __log_file_pp --
+ * ENV->log_file pre/post processing.
+ *
+ * PUBLIC: int __log_file_pp __P((DB_ENV *, const DB_LSN *, char *, size_t));
+ */
+int
+__log_file_pp(dbenv, lsn, namep, len)
+ DB_ENV *dbenv;
+ const DB_LSN *lsn;
+ char *namep;
+ size_t len;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret, set;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->lg_handle, "DB_ENV->log_file", DB_INIT_LOG);
+
+ if ((ret = __log_get_config(dbenv, DB_LOG_IN_MEMORY, &set)) != 0)
+ return (ret);
+ if (set) {
+ __db_errx(env, DB_STR("2518",
+ "DB_ENV->log_file is illegal with in-memory logs"));
+ return (EINVAL);
+ }
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__log_file(env, lsn, namep, len)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __log_file --
+ * ENV->log_file.
+ */
+static int
+__log_file(env, lsn, namep, len)
+ ENV *env;
+ const DB_LSN *lsn;
+ char *namep;
+ size_t len;
+{
+ DB_LOG *dblp;
+ int ret;
+ char *name;
+
+ dblp = env->lg_handle;
+ LOG_SYSTEM_LOCK(env);
+ ret = __log_name(dblp, lsn->file, &name, NULL, 0);
+ LOG_SYSTEM_UNLOCK(env);
+ if (ret != 0)
+ return (ret);
+
+ /* Check to make sure there's enough room and copy the name. */
+ if (len < strlen(name) + 1) {
+ *namep = '\0';
+ __db_errx(env, DB_STR("2519",
+ "DB_ENV->log_file: name buffer is too short"));
+ return (EINVAL);
+ }
+ (void)strcpy(namep, name);
+ __os_free(env, name);
+
+ return (0);
+}
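+
+/*
+ * A minimal caller sketch (assumed usage of the public method backed
+ * by this function; the buffer size is arbitrary):
+ *
+ *	char name[1024];
+ *
+ *	if ((ret = dbenv->log_file(dbenv, &lsn, name, sizeof(name))) == 0)
+ *		printf("LSN [%lu][%lu] is in %s\n",
+ *		    (u_long)lsn.file, (u_long)lsn.offset, name);
+ */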
+
+/*
+ * __log_newfh --
+ * Acquire a file handle for the current log file.
+ */
+static int
+__log_newfh(dblp, create)
+ DB_LOG *dblp;
+ int create;
+{
+ ENV *env;
+ LOG *lp;
+ u_int32_t flags;
+ int ret;
+ logfile_validity status;
+
+ env = dblp->env;
+ lp = dblp->reginfo.primary;
+
+ /* Close any previous file descriptor. */
+ if (dblp->lfhp != NULL) {
+ (void)__os_closehandle(env, dblp->lfhp);
+ dblp->lfhp = NULL;
+ }
+
+ flags = DB_OSO_SEQ |
+ (create ? DB_OSO_CREATE : 0) |
+ (F_ISSET(dblp, DBLOG_DIRECT) ? DB_OSO_DIRECT : 0) |
+ (F_ISSET(dblp, DBLOG_DSYNC) ? DB_OSO_DSYNC : 0);
+
+ /* Get the path of the new file and open it. */
+ dblp->lfname = lp->lsn.file;
+ if ((ret = __log_valid(dblp, dblp->lfname, 0, &dblp->lfhp,
+ flags, &status, NULL)) != 0)
+ __db_err(env, ret,
+ "DB_ENV->log_newfh: %lu", (u_long)lp->lsn.file);
+ else if (status != DB_LV_NORMAL && status != DB_LV_INCOMPLETE &&
+ status != DB_LV_OLD_READABLE)
+ ret = DB_NOTFOUND;
+
+ return (ret);
+}
+
+/*
+ * __log_name --
+ * Return the log name for a particular file, and optionally open it.
+ *
+ * PUBLIC: int __log_name __P((DB_LOG *,
+ * PUBLIC: u_int32_t, char **, DB_FH **, u_int32_t));
+ */
+int
+__log_name(dblp, filenumber, namep, fhpp, flags)
+ DB_LOG *dblp;
+ u_int32_t filenumber, flags;
+ char **namep;
+ DB_FH **fhpp;
+{
+ ENV *env;
+ LOG *lp;
+ int mode, ret;
+ char *oname;
+ char old[sizeof(LFPREFIX) + 5 + 20], new[sizeof(LFPREFIX) + 10 + 20];
+
+ env = dblp->env;
+ lp = dblp->reginfo.primary;
+
+ DB_ASSERT(env, !lp->db_log_inmemory);
+
+ /*
+ * !!!
+ * The semantics of this routine are bizarre.
+ *
+ * The reason for all of this is that we need a place where we can
+ * intercept requests for log files, and, if appropriate, check for
+ * both the old-style and new-style log file names. The trick is
+ * that all callers of this routine that are opening the log file
+ * read-only want to use an old-style file name if they can't find
+ * a match using a new-style name. The only down-side is that some
+ * callers may check for the old-style when they really don't need
+ * to, but that shouldn't mess up anything, and we only check for
+ * the old-style name when we've already failed to find a new-style
+ * one.
+ *
+ * Create a new-style file name, and if we're not going to open the
+ * file, return regardless.
+ */
+ (void)snprintf(new, sizeof(new), LFNAME, filenumber);
+ if ((ret = __db_appname(env,
+ DB_APP_LOG, new, NULL, namep)) != 0 || fhpp == NULL)
+ return (ret);
+
+ /* The application may have specified an absolute file mode. */
+ if (lp->filemode == 0)
+ mode = env->db_mode;
+ else {
+ LF_SET(DB_OSO_ABSMODE);
+ mode = lp->filemode;
+ }
+
+ /* Open the new-style file -- if we succeed, we're done. */
+ dblp->lf_timestamp = lp->timestamp;
+ if ((ret = __os_open(env, *namep, 0, flags, mode, fhpp)) == 0)
+ return (0);
+
+	/*
+	 * If the open failed for a reason other than the file not being
+	 * there, complain loudly; the wrong user probably started up
+	 * the application.
+	 */
+ if (ret != ENOENT) {
+ __db_err(env, ret, DB_STR_A("2520",
+ "%s: log file unreadable", "%s"), *namep);
+ return (__env_panic(env, ret));
+ }
+
+	/*
+	 * The open failed... if the DB_OSO_RDONLY flag isn't set, we're
+	 * done; the caller isn't interested in old-style files.
+	 */
+ if (!LF_ISSET(DB_OSO_RDONLY)) {
+ __db_err(env, ret, DB_STR_A("2521",
+ "%s: log file open failed", "%s"), *namep);
+ return (__env_panic(env, ret));
+ }
+
+ /* Create an old-style file name. */
+ (void)snprintf(old, sizeof(old), LFNAME_V1, filenumber);
+ if ((ret = __db_appname(env,
+ DB_APP_LOG, old, NULL, &oname)) != 0)
+ goto err;
+
+ /*
+ * Open the old-style file -- if we succeed, we're done. Free the
+ * space allocated for the new-style name and return the old-style
+ * name to the caller.
+ */
+ if ((ret = __os_open(env, oname, 0, flags, mode, fhpp)) == 0) {
+ __os_free(env, *namep);
+ *namep = oname;
+ return (0);
+ }
+
+ /*
+ * Couldn't find either style of name -- return the new-style name
+ * for the caller's error message. If it's an old-style name that's
+ * actually missing we're going to confuse the user with the error
+ * message, but that implies that not only were we looking for an
+ * old-style name, but we expected it to exist and we weren't just
+ * looking for any log file. That's not a likely error.
+ */
+err: __os_free(env, oname);
+ return (ret);
+}
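+
+/*
+ * For example (file number 17, per the LFNAME/LFNAME_V1 formats): the
+ * new-style name is "log.0000000017" and the old-style fallback is
+ * "log.00017"; the old-style name is only tried for read-only opens.
+ */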
+
+/*
+ * __log_rep_put --
+ * Short-circuit way for replication clients to put records into the
+ * log. Replication clients' logs need to be laid out exactly as their masters'
+ * are, so we let replication take responsibility for when the log gets
+ * flushed, when the log switches files, etc.  This is just a thin PUBLIC
+ * for __log_putr with a slightly prettier interface.
+ *
+ * Note that the REP->mtx_clientdb should be held when this is called.
+ * Note that we acquire the log region mutex while holding mtx_clientdb.
+ *
+ * PUBLIC: int __log_rep_put __P((ENV *, DB_LSN *, const DBT *, u_int32_t));
+ */
+int
+__log_rep_put(env, lsnp, rec, flags)
+ ENV *env;
+ DB_LSN *lsnp;
+ const DBT *rec;
+ u_int32_t flags;
+{
+ DBT *dbt, t;
+ DB_CIPHER *db_cipher;
+ DB_LOG *dblp;
+ HDR hdr;
+ LOG *lp;
+ int need_free, ret;
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ LOG_SYSTEM_LOCK(env);
+ memset(&hdr, 0, sizeof(HDR));
+ t = *rec;
+ dbt = &t;
+ need_free = 0;
+ db_cipher = env->crypto_handle;
+ if (CRYPTO_ON(env))
+ t.size += db_cipher->adj_size(rec->size);
+ if ((ret = __os_calloc(env, 1, t.size, &t.data)) != 0)
+ goto err;
+ need_free = 1;
+ memcpy(t.data, rec->data, rec->size);
+
+ if ((ret = __log_encrypt_record(env, dbt, &hdr, rec->size)) != 0)
+ goto err;
+
+ DB_ASSERT(env, LOG_COMPARE(lsnp, &lp->lsn) == 0);
+ ret = __log_putr(dblp, lsnp, dbt, lp->lsn.offset - lp->len, &hdr);
+err:
+ /*
+ * !!! Assume caller holds REP->mtx_clientdb to modify ready_lsn.
+ */
+ lp->ready_lsn = lp->lsn;
+
+ if (LF_ISSET(DB_LOG_CHKPNT))
+ lp->stat.st_wc_bytes = lp->stat.st_wc_mbytes = 0;
+
+ /* Increment count of records added to the log. */
+ STAT(++lp->stat.st_record);
+ LOG_SYSTEM_UNLOCK(env);
+ if (need_free)
+ __os_free(env, t.data);
+ return (ret);
+}
+
+static int
+__log_encrypt_record(env, dbt, hdr, orig)
+ ENV *env;
+ DBT *dbt;
+ HDR *hdr;
+ u_int32_t orig;
+{
+ DB_CIPHER *db_cipher;
+ int ret;
+
+ if (CRYPTO_ON(env)) {
+ db_cipher = env->crypto_handle;
+ hdr->size = HDR_CRYPTO_SZ;
+ hdr->orig_size = orig;
+ if ((ret = db_cipher->encrypt(env, db_cipher->data,
+ hdr->iv, dbt->data, dbt->size)) != 0)
+ return (ret);
+ } else {
+ hdr->size = HDR_NORMAL_SZ;
+ }
+ return (0);
+}
+
+/*
+ * __log_put_record_pp --
+ * DB_ENV->log_put_record pre/post processing.
+ *
+ * PUBLIC: int __log_put_record_pp __P((DB_ENV *, DB *, DB_TXN *, DB_LSN *,
+ * PUBLIC: u_int32_t, u_int32_t, u_int32_t, u_int32_t,
+ * PUBLIC: DB_LOG_RECSPEC *, ...));
+ */
+#ifdef STDC_HEADERS
+int
+__log_put_record_pp(DB_ENV *dbenv, DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp,
+ u_int32_t flags, u_int32_t rectype, u_int32_t has_data, u_int32_t size,
+ DB_LOG_RECSPEC *spec, ...)
+#else
+int
+__log_put_record_pp(dbenv, dbp, txnp, ret_lsnp,
+ flags, rectype, has_data, size,
+ spec, va_alist)
+ DB_ENV *dbenv;
+ DB *dbp;
+ DB_TXN *txnp;
+ DB_LSN *ret_lsnp;
+ u_int32_t flags;
+ u_int32_t rectype;
+ u_int32_t has_data;
+ u_int32_t size;
+ DB_LOG_RECSPEC *spec;
+ va_dcl
+#endif
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ va_list argp;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->lg_handle, "DB_ENV->log_put_record", DB_INIT_LOG);
+
+ /* Validate arguments: check for allowed flags. */
+ if ((ret = __db_fchk(env, "DB_ENV->log_put_record", flags,
+ DB_LOG_CHKPNT | DB_LOG_COMMIT |
+ DB_FLUSH | DB_LOG_NOCOPY | DB_LOG_WRNOSYNC)) != 0)
+ return (ret);
+
+ /* DB_LOG_WRNOSYNC and DB_FLUSH are mutually exclusive. */
+ if (LF_ISSET(DB_LOG_WRNOSYNC) && LF_ISSET(DB_FLUSH))
+ return (__db_ferr(env, "DB_ENV->log_put_record", 1));
+
+ /* Replication clients should never write log records. */
+ if (IS_REP_CLIENT(env)) {
+ __db_errx(env, DB_STR("2522",
+ "DB_ENV->log_put is illegal on replication clients"));
+ return (EINVAL);
+ }
+
+ ENV_ENTER(env, ip);
+ va_start(argp, spec);
+ REPLICATION_WRAP(env, (__log_put_record_int(env, dbp,
+ txnp, ret_lsnp, flags, rectype, has_data, size, spec, argp)),
+ 0, ret);
+ va_end(argp);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __log_put_record __P((ENV *, DB *, DB_TXN *, DB_LSN *,
+ * PUBLIC: u_int32_t, u_int32_t, u_int32_t, u_int32_t,
+ * PUBLIC: DB_LOG_RECSPEC *, ...));
+ */
+#ifdef STDC_HEADERS
+int
+__log_put_record(ENV *env, DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp,
+ u_int32_t flags, u_int32_t rectype, u_int32_t has_data, u_int32_t size,
+ DB_LOG_RECSPEC *spec, ...)
+#else
+int
+__log_put_record(env, dbp, txnp, ret_lsnp,
+    flags, rectype, has_data, size, spec, va_alist)
+ ENV *env;
+ DB *dbp;
+ DB_TXN *txnp;
+ DB_LSN *ret_lsnp;
+ u_int32_t flags;
+ u_int32_t rectype;
+ u_int32_t has_data;
+ u_int32_t size;
+ DB_LOG_RECSPEC *spec;
+ va_dcl
+#endif
+{
+ va_list argp;
+ int ret;
+
+ va_start(argp, spec);
+ ret = __log_put_record_int(env, dbp, txnp, ret_lsnp, flags,
+ rectype, has_data, size, spec, argp);
+ va_end(argp);
+ return (ret);
+}
+
+#ifdef STDC_HEADERS
+static int
+__log_put_record_int(ENV *env, DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp,
+ u_int32_t flags, u_int32_t rectype, u_int32_t has_data, u_int32_t size,
+ DB_LOG_RECSPEC *spec, va_list argp)
+#else
+static int
+__log_put_record_int(env, dbp, txnp, ret_lsnp,
+    flags, rectype, has_data, size, spec, argp)
+	ENV *env;
+	DB *dbp;
+	DB_TXN *txnp;
+	DB_LSN *ret_lsnp;
+	u_int32_t flags;
+	u_int32_t rectype;
+	u_int32_t has_data;
+	u_int32_t size;
+	DB_LOG_RECSPEC *spec;
+	va_list argp;
+#endif
+{
+ DBT *data, *dbt, *header, logrec;
+ DB_LOG_RECSPEC *sp;
+ DB_LSN *lsnp, lsn, null_lsn, *pagelsn, *rlsnp;
+ DB_TXNLOGREC *lr;
+ LOG *lp;
+ PAGE *pghdrstart;
+ u_int32_t hdrsize, op, zero, uinttmp, txn_num;
+ u_int npad;
+ u_int8_t *bp;
+ int is_durable, ret;
+ void *hdrstart;
+
+ COMPQUIET(lr, NULL);
+ COMPQUIET(hdrsize, 0);
+ COMPQUIET(op, 0);
+ COMPQUIET(hdrstart, NULL);
+ COMPQUIET(pghdrstart, NULL);
+ COMPQUIET(header, NULL);
+
+	/*
+	 * rlsnp will be stored into while holding the log system lock.
+	 * If this is a commit record, then ret_lsnp will be the address
+	 * of the transaction detail's visible_lsn field.  If not, then
+	 * this may be the lsn of a page, and we do not want to set it
+	 * if the log_put fails after writing the record (due to an I/O
+	 * error).
+	 */
+ if (LF_ISSET(DB_LOG_COMMIT))
+ rlsnp = ret_lsnp;
+ else
+ rlsnp = &lsn;
+ npad = 0;
+ ret = 0;
+ data = NULL;
+
+ if (LF_ISSET(DB_LOG_NOT_DURABLE) ||
+ (dbp != NULL && F_ISSET(dbp, DB_AM_NOT_DURABLE))) {
+ if (txnp == NULL)
+ return (0);
+ is_durable = 0;
+ } else
+ is_durable = 1;
+
+ if (txnp == NULL) {
+ txn_num = 0;
+ lsnp = &null_lsn;
+ null_lsn.file = null_lsn.offset = 0;
+ } else {
+ if (TAILQ_FIRST(&txnp->kids) != NULL &&
+ (ret = __txn_activekids(env, rectype, txnp)) != 0)
+ return (ret);
+ /*
+ * We need to assign begin_lsn while holding region mutex.
+ * That assignment is done inside the DbEnv->log_put call,
+ * so pass in the appropriate memory location to be filled
+ * in by the log_put code.
+ */
+ DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp);
+ txn_num = txnp->txnid;
+ }
+
+ if (dbp != NULL) {
+ DB_ASSERT(env, dbp->log_filename != NULL);
+ if (dbp->log_filename->id == DB_LOGFILEID_INVALID &&
+ (ret = __dbreg_lazy_id(dbp)) != 0)
+ return (ret);
+ }
+
+ logrec.size = size;
+
+ if (CRYPTO_ON(env)) {
+ npad = env->crypto_handle->adj_size(logrec.size);
+ logrec.size += npad;
+ }
+
+ if (is_durable || txnp == NULL) {
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0)
+ return (ret);
+ } else {
+ if ((ret = __os_malloc(env,
+ logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0)
+ return (ret);
+#ifdef DIAGNOSTIC
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0) {
+ __os_free(env, lr);
+ return (ret);
+ }
+#else
+ logrec.data = lr->data;
+#endif
+ }
+ if (npad > 0)
+ memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad);
+
+ bp = logrec.data;
+
+ LOGCOPY_32(env, bp, &rectype);
+ bp += sizeof(rectype);
+
+ LOGCOPY_32(env, bp, &txn_num);
+ bp += sizeof(txn_num);
+
+ LOGCOPY_FROMLSN(env, bp, lsnp);
+ bp += sizeof(DB_LSN);
+
+ zero = 0;
+ lp = env->lg_handle->reginfo.primary;
+ for (sp = spec; sp->type != LOGREC_Done; sp++) {
+ switch (sp->type) {
+ case LOGREC_DB:
+ /* This is not in the varargs. */
+ uinttmp = (u_int32_t)dbp->log_filename->id;
+ LOGCOPY_32(env, bp, &uinttmp);
+ bp += sizeof(uinttmp);
+ break;
+
+ case LOGREC_ARG:
+ case LOGREC_TIME:
+ case LOGREC_DBOP:
+ uinttmp = va_arg(argp, u_int32_t);
+ LOGCOPY_32(env, bp, &uinttmp);
+ bp += sizeof(uinttmp);
+ break;
+ case LOGREC_OP:
+ op = va_arg(argp, u_int32_t);
+ LOGCOPY_32(env, bp, &op);
+ bp += sizeof(uinttmp);
+ break;
+ case LOGREC_DBT:
+ case LOGREC_PGLIST:
+ case LOGREC_LOCKS:
+ case LOGREC_HDR:
+ case LOGREC_DATA:
+ dbt = va_arg(argp, DBT *);
+ if (dbt == NULL) {
+ LOGCOPY_32(env, bp, &zero);
+ bp += sizeof(u_int32_t);
+ } else {
+ LOGCOPY_32(env, bp, &dbt->size);
+ bp += sizeof(dbt->size);
+ memcpy(bp, dbt->data, dbt->size);
+ }
+ /* Process fields that need to be byte swapped. */
+ if (dbp != NULL && F_ISSET(dbp, DB_AM_SWAP)) {
+ if (sp->type == LOGREC_HDR &&
+ dbt != NULL && has_data == 0)
+ __db_recordswap(op,
+ dbt->size, bp, NULL, 0);
+ else if (sp->type == LOGREC_HDR) {
+ hdrstart = bp;
+ hdrsize = dbt == NULL ? 0 : dbt->size;
+ } else if (sp->type == LOGREC_DATA) {
+ __db_recordswap(op,
+ hdrsize, hdrstart, bp, 0);
+ has_data = 0;
+ }
+ }
+ if (dbt != NULL)
+ bp += dbt->size;
+
+ break;
+ /*
+ * Page header and data -- we assume that the header
+ * is listed first and the data follows sometime later.
+ * There should be only one header/data pair per record.
+ */
+ case LOGREC_PGDBT:
+ header = va_arg(argp, DBT *);
+ if (header == NULL) {
+ LOGCOPY_32(env, bp, &zero);
+ bp += sizeof(u_int32_t);
+ } else {
+ LOGCOPY_32(env, bp, &header->size);
+ bp += sizeof(header->size);
+ pghdrstart = (PAGE *)bp;
+ memcpy(bp, header->data, header->size);
+ if (has_data == 0 &&
+ F_ISSET(dbp, DB_AM_SWAP) &&
+ (ret = __db_pageswap(
+ env, dbp, pghdrstart, (size_t)header->size,
+ NULL, 0)) != 0)
+ return (ret);
+ bp += header->size;
+ }
+ break;
+
+ case LOGREC_PGDDBT:
+ data = va_arg(argp, DBT *);
+ if (data == NULL) {
+ zero = 0;
+ LOGCOPY_32(env, bp, &zero);
+ bp += sizeof(u_int32_t);
+ } else {
+ if (F_ISSET(dbp, DB_AM_SWAP) &&
+ (ret = __db_pageswap(env, dbp, pghdrstart,
+ (size_t)header->size, (DBT *)data, 0)) != 0)
+ return (ret);
+ LOGCOPY_32(env, bp, &data->size);
+ bp += sizeof(data->size);
+ memcpy(bp, data->data, data->size);
+ if (F_ISSET(dbp, DB_AM_SWAP) &&
+ F_ISSET(data, DB_DBT_APPMALLOC))
+ __os_free(env, data->data);
+ bp += data->size;
+ }
+ break;
+ case LOGREC_POINTER:
+ pagelsn = va_arg(argp, DB_LSN *);
+ if (pagelsn != NULL) {
+ if (txnp != NULL) {
+ if (LOG_COMPARE(pagelsn,
+ &lp->lsn) >= 0 && (ret =
+ __log_check_page_lsn(env,
+ dbp, pagelsn)) != 0)
+ return (ret);
+ }
+ LOGCOPY_FROMLSN(env, bp, pagelsn);
+ } else
+ memset(bp, 0, sizeof(*pagelsn));
+ bp += sizeof(*pagelsn);
+ break;
+
+ default:
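+			/* Unknown field type: force the assert to fail. */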
+ DB_ASSERT(env, sp->type != sp->type);
+ }
+ }
+
+ DB_ASSERT(env,
+ (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size);
+
+ if (is_durable || txnp == NULL) {
+		if ((ret = __log_put(env, rlsnp, (DBT *)&logrec,
+		    flags | DB_LOG_NOCOPY)) == 0) {
+ if (txnp != NULL)
+ *lsnp = *rlsnp;
+ *ret_lsnp = *rlsnp;
+ }
+ } else {
+ ret = 0;
+#ifdef DIAGNOSTIC
+ /*
+ * Set the debug bit if we are going to log non-durable
+ * transactions so they will be ignored by recovery.
+ */
+ memcpy(lr->data, logrec.data, logrec.size);
+ rectype |= DB_debug_FLAG;
+ LOGCOPY_32(env, logrec.data, &rectype);
+
+ if (!IS_REP_CLIENT(env) && !lp->db_log_inmemory)
+ ret = __log_put(env,
+ rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY);
+#endif
+ STAILQ_INSERT_HEAD(&txnp->logs, lr, links);
+ F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY);
+ LSN_NOT_LOGGED(*ret_lsnp);
+ }
+
+#ifdef LOG_DIAGNOSTIC
+ if (ret != 0)
+ (void)__db_addrem_print(env,
+ (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL);
+#endif
+
+#ifdef DIAGNOSTIC
+ __os_free(env, logrec.data);
+#else
+ if (is_durable || txnp == NULL)
+ __os_free(env, logrec.data);
+#endif
+ return (ret);
+}
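+
+/*
+ * A usage sketch (hypothetical record type and spec; the real specs
+ * are generated from the *.src files): a spec such as
+ *
+ *	DB_LOG_RECSPEC my_spec[] = {
+ *		{ LOGREC_ARG, ... },
+ *		{ LOGREC_DBT, ... },
+ *		{ LOGREC_Done, ... }
+ *	};
+ *
+ * would be matched by a call like
+ *
+ *	ret = __log_put_record(env, dbp, txn, &lsn, 0, MY_RECTYPE,
+ *	    0, size, my_spec, my_uint32, &my_dbt);
+ *
+ * where the varargs line up one-to-one with the non-Done spec entries.
+ */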
diff --git a/src/log/log_stat.c b/src/log/log_stat.c
new file mode 100644
index 00000000..37b74c74
--- /dev/null
+++ b/src/log/log_stat.c
@@ -0,0 +1,336 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+
+#ifdef HAVE_STATISTICS
+static int __log_print_all __P((ENV *, u_int32_t));
+static int __log_print_stats __P((ENV *, u_int32_t));
+static int __log_stat __P((ENV *, DB_LOG_STAT **, u_int32_t));
+
+/*
+ * __log_stat_pp --
+ * DB_ENV->log_stat pre/post processing.
+ *
+ * PUBLIC: int __log_stat_pp __P((DB_ENV *, DB_LOG_STAT **, u_int32_t));
+ */
+int
+__log_stat_pp(dbenv, statp, flags)
+ DB_ENV *dbenv;
+ DB_LOG_STAT **statp;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->lg_handle, "DB_ENV->log_stat", DB_INIT_LOG);
+
+ if ((ret = __db_fchk(env,
+ "DB_ENV->log_stat", flags, DB_STAT_CLEAR)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__log_stat(env, statp, flags)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __log_stat --
+ * DB_ENV->log_stat.
+ */
+static int
+__log_stat(env, statp, flags)
+ ENV *env;
+ DB_LOG_STAT **statp;
+ u_int32_t flags;
+{
+ DB_LOG *dblp;
+ DB_LOG_STAT *stats;
+ LOG *lp;
+ int ret;
+
+ *statp = NULL;
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ if ((ret = __os_umalloc(env, sizeof(DB_LOG_STAT), &stats)) != 0)
+ return (ret);
+
+ /* Copy out the global statistics. */
+ LOG_SYSTEM_LOCK(env);
+ *stats = lp->stat;
+ if (LF_ISSET(DB_STAT_CLEAR))
+ memset(&lp->stat, 0, sizeof(lp->stat));
+
+ stats->st_magic = lp->persist.magic;
+ stats->st_version = lp->persist.version;
+ stats->st_mode = lp->filemode;
+ stats->st_lg_bsize = lp->buffer_size;
+ stats->st_lg_size = lp->log_nsize;
+
+ __mutex_set_wait_info(env, lp->mtx_region,
+ &stats->st_region_wait, &stats->st_region_nowait);
+ if (LF_ISSET(DB_STAT_CLEAR | DB_STAT_SUBSYSTEM) == DB_STAT_CLEAR)
+ __mutex_clear(env, lp->mtx_region);
+ stats->st_regsize = dblp->reginfo.rp->size;
+
+ stats->st_cur_file = lp->lsn.file;
+ stats->st_cur_offset = lp->lsn.offset;
+ stats->st_disk_file = lp->s_lsn.file;
+ stats->st_disk_offset = lp->s_lsn.offset;
+
+ LOG_SYSTEM_UNLOCK(env);
+
+ *statp = stats;
+ return (0);
+}
+
+/*
+ * __log_stat_print_pp --
+ * DB_ENV->log_stat_print pre/post processing.
+ *
+ * PUBLIC: int __log_stat_print_pp __P((DB_ENV *, u_int32_t));
+ */
+int
+__log_stat_print_pp(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->lg_handle, "DB_ENV->log_stat_print", DB_INIT_LOG);
+
+ if ((ret = __db_fchk(env, "DB_ENV->log_stat_print",
+ flags, DB_STAT_ALL | DB_STAT_ALLOC | DB_STAT_CLEAR)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__log_stat_print(env, flags)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __log_stat_print --
+ * DB_ENV->log_stat_print method.
+ *
+ * PUBLIC: int __log_stat_print __P((ENV *, u_int32_t));
+ */
+int
+__log_stat_print(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ u_int32_t orig_flags;
+ int ret;
+
+ orig_flags = flags;
+ LF_CLR(DB_STAT_CLEAR | DB_STAT_SUBSYSTEM);
+ if (flags == 0 || LF_ISSET(DB_STAT_ALL)) {
+ ret = __log_print_stats(env, orig_flags);
+ if (flags == 0 || ret != 0)
+ return (ret);
+ }
+
+ if (LF_ISSET(DB_STAT_ALL) &&
+ (ret = __log_print_all(env, orig_flags)) != 0)
+ return (ret);
+
+ return (0);
+}
+
+/*
+ * __log_print_stats --
+ * Display default log region statistics.
+ */
+static int
+__log_print_stats(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ DB_LOG_STAT *sp;
+ int ret;
+
+ if ((ret = __log_stat(env, &sp, flags)) != 0)
+ return (ret);
+
+ if (LF_ISSET(DB_STAT_ALL))
+ __db_msg(env, "Default logging region information:");
+ STAT_HEX("Log magic number", sp->st_magic);
+ STAT_ULONG("Log version number", sp->st_version);
+ __db_dlbytes(env, "Log record cache size",
+ (u_long)0, (u_long)0, (u_long)sp->st_lg_bsize);
+ __db_msg(env, "%#o\tLog file mode", sp->st_mode);
+ if (sp->st_lg_size % MEGABYTE == 0)
+ __db_msg(env, "%luMb\tCurrent log file size",
+ (u_long)sp->st_lg_size / MEGABYTE);
+ else if (sp->st_lg_size % 1024 == 0)
+ __db_msg(env, "%luKb\tCurrent log file size",
+ (u_long)sp->st_lg_size / 1024);
+ else
+ __db_msg(env, "%lu\tCurrent log file size",
+ (u_long)sp->st_lg_size);
+ __db_dl(env, "Initial fileid allocation", (u_long)sp->st_fileid_init);
+ __db_dl(env, "Current fileids in use", (u_long)sp->st_nfileid);
+ __db_dl(env, "Maximum fileids used", (u_long)sp->st_maxnfileid);
+ __db_dl(env, "Records entered into the log", (u_long)sp->st_record);
+ __db_dlbytes(env, "Log bytes written",
+ (u_long)0, (u_long)sp->st_w_mbytes, (u_long)sp->st_w_bytes);
+ __db_dlbytes(env, "Log bytes written since last checkpoint",
+ (u_long)0, (u_long)sp->st_wc_mbytes, (u_long)sp->st_wc_bytes);
+ __db_dl(env, "Total log file I/O writes", (u_long)sp->st_wcount);
+ __db_dl(env, "Total log file I/O writes due to overflow",
+ (u_long)sp->st_wcount_fill);
+ __db_dl(env, "Total log file flushes", (u_long)sp->st_scount);
+ __db_dl(env, "Total log file I/O reads", (u_long)sp->st_rcount);
+ STAT_ULONG("Current log file number", sp->st_cur_file);
+ STAT_ULONG("Current log file offset", sp->st_cur_offset);
+ STAT_ULONG("On-disk log file number", sp->st_disk_file);
+ STAT_ULONG("On-disk log file offset", sp->st_disk_offset);
+
+ __db_dl(env,
+ "Maximum commits in a log flush", (u_long)sp->st_maxcommitperflush);
+ __db_dl(env,
+ "Minimum commits in a log flush", (u_long)sp->st_mincommitperflush);
+
+ __db_dlbytes(env, "Region size",
+ (u_long)0, (u_long)0, (u_long)sp->st_regsize);
+ __db_dl_pct(env,
+ "The number of region locks that required waiting",
+ (u_long)sp->st_region_wait, DB_PCT(sp->st_region_wait,
+ sp->st_region_wait + sp->st_region_nowait), NULL);
+
+ __os_ufree(env, sp);
+
+ return (0);
+}
+
+/*
+ * __log_print_all --
+ * Display debugging log region statistics.
+ */
+static int
+__log_print_all(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+	static const FN fn[] = {
+		{ DBLOG_AUTOREMOVE,	"DBLOG_AUTOREMOVE" },
+		{ DBLOG_DIRECT,		"DBLOG_DIRECT" },
+		{ DBLOG_DSYNC,		"DBLOG_DSYNC" },
+		{ DBLOG_FORCE_OPEN,	"DBLOG_FORCE_OPEN" },
+		{ DBLOG_INMEMORY,	"DBLOG_INMEMORY" },
+		{ DBLOG_OPENFILES,	"DBLOG_OPENFILES" },
+		{ DBLOG_RECOVER,	"DBLOG_RECOVER" },
+		{ DBLOG_ZERO,		"DBLOG_ZERO" },
+		{ 0,			NULL }
+	};
+ DB_LOG *dblp;
+ LOG *lp;
+
+ dblp = env->lg_handle;
+ lp = (LOG *)dblp->reginfo.primary;
+
+ LOG_SYSTEM_LOCK(env);
+
+ __db_print_reginfo(env, &dblp->reginfo, "Log", flags);
+
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "DB_LOG handle information:");
+ __mutex_print_debug_single(
+ env, "DB_LOG handle mutex", dblp->mtx_dbreg, flags);
+ STAT_ULONG("Log file name", dblp->lfname);
+ __db_print_fh(env, "Log file handle", dblp->lfhp, flags);
+ __db_prflags(env, NULL, dblp->flags, fn, NULL, "\tFlags");
+
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "LOG handle information:");
+ __mutex_print_debug_single(
+ env, "LOG region mutex", lp->mtx_region, flags);
+ __mutex_print_debug_single(
+ env, "File name list mutex", lp->mtx_filelist, flags);
+
+ STAT_HEX("persist.magic", lp->persist.magic);
+ STAT_ULONG("persist.version", lp->persist.version);
+ __db_dlbytes(env,
+ "persist.log_size", (u_long)0, (u_long)0, lp->persist.log_size);
+ STAT_FMT("log file permissions mode", "%#lo", u_long, lp->filemode);
+ STAT_LSN("current file offset LSN", &lp->lsn);
+ STAT_LSN("first buffer byte LSN", &lp->lsn);
+ STAT_ULONG("current buffer offset", lp->b_off);
+ STAT_ULONG("current file write offset", lp->w_off);
+ STAT_ULONG("length of last record", lp->len);
+ STAT_LONG("log flush in progress", lp->in_flush);
+ __mutex_print_debug_single(
+ env, "Log flush mutex", lp->mtx_flush, flags);
+
+ STAT_LSN("last sync LSN", &lp->s_lsn);
+
+	/*
+	 * Don't display the replication fields here; they're displayed
+	 * as part of the replication statistics.
+	 */
+
+ STAT_LSN("cached checkpoint LSN", &lp->cached_ckp_lsn);
+
+ __db_dlbytes(env,
+ "log buffer size", (u_long)0, (u_long)0, lp->buffer_size);
+ __db_dlbytes(env,
+ "log file size", (u_long)0, (u_long)0, lp->log_size);
+ __db_dlbytes(env,
+ "next log file size", (u_long)0, (u_long)0, lp->log_nsize);
+
+ STAT_ULONG("transactions waiting to commit", lp->ncommit);
+ STAT_LSN("LSN of first commit", &lp->t_lsn);
+
+ LOG_SYSTEM_UNLOCK(env);
+
+ return (0);
+}
+
+#else /* !HAVE_STATISTICS */
+
+int
+__log_stat_pp(dbenv, statp, flags)
+ DB_ENV *dbenv;
+ DB_LOG_STAT **statp;
+ u_int32_t flags;
+{
+ COMPQUIET(statp, NULL);
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(dbenv->env));
+}
+
+int
+__log_stat_print_pp(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(dbenv->env));
+}
+#endif
diff --git a/src/log/log_verify.c b/src/log/log_verify.c
new file mode 100644
index 00000000..e7f8f688
--- /dev/null
+++ b/src/log/log_verify.c
@@ -0,0 +1,437 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/fop.h"
+#include "dbinc/hash.h"
+#include "dbinc/qam.h"
+#include "dbinc/txn.h"
+
+#include "dbinc/log_verify.h"
+
+#define FIRST_OFFSET(env) \
+ (sizeof(LOGP) + (CRYPTO_ON(env) ? HDR_CRYPTO_SZ : HDR_NORMAL_SZ))
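+
+/*
+ * For example, when the scan below repositions to the start of log
+ * file N, the first record of interest is at [N][FIRST_OFFSET(env)],
+ * just past the persist header that __log_newfile() writes at the
+ * head of every file.
+ */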
+
+static int __env_init_verify __P((ENV *, u_int32_t, DB_DISTAB *));
+
+/*
+ * PUBLIC: int __log_verify_pp __P((DB_ENV *, const DB_LOG_VERIFY_CONFIG *));
+ */
+int
+__log_verify_pp(dbenv, lvconfig)
+ DB_ENV *dbenv;
+ const DB_LOG_VERIFY_CONFIG *lvconfig;
+{
+ int lsnrg, ret, timerg;
+ DB_THREAD_INFO *ip;
+ const char *phome;
+
+ lsnrg = ret = timerg = 0;
+ phome = NULL;
+
+ if (!IS_ZERO_LSN(lvconfig->start_lsn) ||
+ !IS_ZERO_LSN(lvconfig->end_lsn))
+ lsnrg = 1;
+ if (lvconfig->start_time != 0 || lvconfig->end_time != 0)
+ timerg = 1;
+
+ if ((!IS_ZERO_LSN(lvconfig->start_lsn) && lvconfig->start_time != 0) ||
+ (!IS_ZERO_LSN(lvconfig->end_lsn) && lvconfig->end_time != 0) ||
+ (lsnrg && timerg)) {
+ __db_errx(dbenv->env, DB_STR("2501",
+ "Set either an lsn range or a time range to verify logs "
+ "in the range, don't mix time and lsn."));
+ ret = EINVAL;
+ goto err;
+ }
+ phome = dbenv->env->db_home;
+ if (phome != NULL && lvconfig->temp_envhome != NULL &&
+ strcmp(phome, lvconfig->temp_envhome) == 0) {
+ __db_errx(dbenv->env,
+ "Environment home for log verification internal use "
+ "overlaps with that of the environment to verify.");
+ ret = EINVAL;
+ goto err;
+ }
+
+ ENV_ENTER(dbenv->env, ip);
+ ret = __log_verify(dbenv, lvconfig, ip);
+ ENV_LEAVE(dbenv->env, ip);
+err: return (ret);
+}
+
+/*
+ * PUBLIC: int __log_verify __P((DB_ENV *, const DB_LOG_VERIFY_CONFIG *,
+ * PUBLIC: DB_THREAD_INFO *));
+ */
+int
+__log_verify(dbenv, lvconfig, ip)
+ DB_ENV *dbenv;
+ const DB_LOG_VERIFY_CONFIG *lvconfig;
+ DB_THREAD_INFO *ip;
+{
+
+ u_int32_t logcflag, max_fileno;
+ DB_LOGC *logc;
+ ENV *env;
+ DBT data;
+ DB_DISTAB dtab;
+ DB_LSN key, start, start2, stop, stop2, verslsn;
+ u_int32_t newversion, version;
+ int cmp, fwdscroll, goprev, ret, tret;
+ time_t starttime, endtime;
+ const char *okmsg;
+ DB_LOG_VRFY_INFO *logvrfy_hdl;
+
+ okmsg = NULL;
+ fwdscroll = 1;
+ max_fileno = (u_int32_t)-1;
+ goprev = 0;
+ env = dbenv->env;
+ logc = NULL;
+ memset(&dtab, 0, sizeof(dtab));
+ memset(&data, 0, sizeof(data));
+ version = newversion = 0;
+ ZERO_LSN(verslsn);
+	memset(&start, 0, sizeof(DB_LSN));
+	memset(&start2, 0, sizeof(DB_LSN));
+	memset(&stop, 0, sizeof(DB_LSN));
+	memset(&stop2, 0, sizeof(DB_LSN));
+	memset(&key, 0, sizeof(DB_LSN));
+
+ start = lvconfig->start_lsn;
+ stop = lvconfig->end_lsn;
+ starttime = lvconfig->start_time;
+ endtime = lvconfig->end_time;
+
+ if ((ret = __create_log_vrfy_info(lvconfig, &logvrfy_hdl, ip)) != 0)
+ goto err;
+ logvrfy_hdl->lv_config = lvconfig;
+ if (lvconfig->continue_after_fail)
+ F_SET(logvrfy_hdl, DB_LOG_VERIFY_CAF);
+ if (lvconfig->verbose)
+ F_SET(logvrfy_hdl, DB_LOG_VERIFY_VERBOSE);
+
+ /* Allocate a log cursor. */
+ if ((ret = __log_cursor(dbenv->env, &logc)) != 0) {
+ __db_err(dbenv->env, ret, "DB_ENV->log_cursor");
+ goto err;
+ }
+	/* Ignore a failed checksum and go on to the next record. */
+ F_SET(logc->env->lg_handle, DBLOG_VERIFYING);
+
+ /* Only scan the range that we want to verify. */
+ if (fwdscroll) {
+ if (IS_ZERO_LSN(stop)) {
+ logcflag = DB_LAST;
+ key.file = key.offset = 0;
+ } else {
+ key = stop;
+ logcflag = DB_SET;
+ }
+ logvrfy_hdl->flags |= DB_LOG_VERIFY_FORWARD;
+ goto startscroll;
+ }
+
+vrfyscroll:
+
+ /*
+ * Initialize version to 0 so that we get the
+ * correct version right away.
+ */
+ version = 0;
+ ZERO_LSN(verslsn);
+
+ /*
+ * In the log verification config struct, start_lsn and end_lsn have
+ * higher priority than start_time and end_time, and you can specify
+ * either lsn or time to start/stop verification.
+ */
+ if (starttime != 0 || endtime != 0) {
+ if ((ret = __find_lsnrg_by_timerg(logvrfy_hdl,
+ starttime, endtime, &start2, &stop2)) != 0)
+ goto err;
+ ((DB_LOG_VERIFY_CONFIG *)lvconfig)->start_lsn = start = start2;
+ ((DB_LOG_VERIFY_CONFIG *)lvconfig)->end_lsn = stop = stop2;
+ }
+
+ if (IS_ZERO_LSN(start)) {
+ logcflag = DB_FIRST;
+ key.file = key.offset = 0;
+ } else {
+ key = start;
+ logcflag = DB_SET;
+ F_SET(logvrfy_hdl, DB_LOG_VERIFY_PARTIAL);
+ }
+ goprev = 0;
+
+ /*
+ * So far we only support verifying a specific db file. The config's
+ * dbfile must be prefixed with the data directory if it's not in
+ * the environment home directory.
+ */
+ if (lvconfig->dbfile != NULL) {
+ F_SET(logvrfy_hdl,
+ DB_LOG_VERIFY_DBFILE | DB_LOG_VERIFY_PARTIAL);
+ if ((ret = __set_logvrfy_dbfuid(logvrfy_hdl)) != 0)
+ goto err;
+ }
+
+startscroll:
+
+ memset(&data, 0, sizeof(data));
+
+ for (;;) {
+
+ /*
+ * We may have reached beyond the range we're verifying.
+ */
+ if (!fwdscroll && !IS_ZERO_LSN(stop)) {
+ cmp = LOG_COMPARE(&key, &stop);
+ if (cmp > 0)
+ break;
+ }
+ if (fwdscroll && !IS_ZERO_LSN(start)) {
+ cmp = LOG_COMPARE(&key, &start);
+ if (cmp < 0)
+ break;
+ }
+
+ ret = __logc_get(logc, &key, &data, logcflag);
+ if (ret != 0) {
+ if (ret == DB_NOTFOUND) {
+ /* We may not start from the first log file. */
+ if (logcflag == DB_PREV && key.file > 1)
+ F_SET(logvrfy_hdl,
+ DB_LOG_VERIFY_PARTIAL);
+ break;
+ }
+ __db_err(dbenv->env, ret, "DB_LOGC->get");
+ /*
+ * When we go beyond the valid lsn range, we may get
+ * error values other than DB_NOTFOUND.
+ */
+ goto out;
+ }
+
+ if (logcflag == DB_SET) {
+ if (goprev)
+ logcflag = DB_PREV;
+ else
+ logcflag = DB_NEXT;
+ } else if (logcflag == DB_LAST) {
+ logcflag = DB_PREV;
+ max_fileno = key.file;
+ } else if (logcflag == DB_FIRST)
+ logcflag = DB_NEXT;
+
+ if (key.file != verslsn.file) {
+ /*
+ * If our log file changed, we need to see if the
+ * version of the log file changed as well.
+ * If it changed, reset the print table.
+ */
+ if ((ret = __logc_version(logc, &newversion)) != 0) {
+ __db_err(dbenv->env, ret, "DB_LOGC->version");
+ goto err;
+ }
+ if (version != newversion) {
+ version = newversion;
+ if (!IS_LOG_VRFY_SUPPORTED(version)) {
+ __db_msg(dbenv->env, DB_STR_A("2502",
+ "[%lu][%lu] Unsupported version of log file, "
+ "log file number: %u, log file version: %u, "
+ "supported log version: %u.",
+ "%lu %lu %u %u %u"),
+ (u_long)key.file,
+ (u_long)key.offset,
+ key.file, version, DB_LOGVERSION);
+ if (logcflag == DB_NEXT) {
+ key.file += 1;
+ if (key.file > max_fileno)
+ break;
+ /*
+ * Txns don't span log versions, so there is
+ * no need to set DB_LOG_VERIFY_PARTIAL here.
+ */
+ } else {
+ goprev = 1;
+ key.file -= 1;
+ if (key.file == 0)
+ break;
+ }
+ key.offset = FIRST_OFFSET(env);
+ logcflag = DB_SET;
+ continue;
+ }
+ if ((ret = __env_init_verify(env, version,
+ &dtab)) != 0) {
+ __db_err(dbenv->env, ret,
+ DB_STR("2503",
+ "callback: initialization"));
+ goto err;
+ }
+ }
+ verslsn = key;
+ }
+
+ ret = __db_dispatch(dbenv->env, &dtab, &data, &key,
+ DB_TXN_LOG_VERIFY, logvrfy_hdl);
+
+ if (!fwdscroll && ret != 0) {
+ if (!F_ISSET(logvrfy_hdl, DB_LOG_VERIFY_CAF)) {
+ __db_err(dbenv->env, ret,
+ "[%lu][%lu] __db_dispatch",
+ (u_long)key.file, (u_long)key.offset);
+ goto err;
+ } else
+ F_SET(logvrfy_hdl, DB_LOG_VERIFY_ERR);
+ }
+ }
+
+ if (fwdscroll) {
+ fwdscroll = 0;
+ F_CLR(logvrfy_hdl, DB_LOG_VERIFY_FORWARD);
+ goto vrfyscroll;
+ }
+out:
+ /*
+ * When we arrive here, ret can be 0 or an error returned by
+ * DB_LOGC->get, all of which we have already handled, so we clear ret.
+ */
+ ret = 0;
+
+ /*
+ * When continuing after failures we scan the entire log, but we must
+ * still report any errors found.
+ */
+ if (F_ISSET(logvrfy_hdl, DB_LOG_VERIFY_ERR) ||
+ F_ISSET(logvrfy_hdl, DB_LOG_VERIFY_INTERR))
+ ret = DB_LOG_VERIFY_BAD;
+ /*
+ * This function can be called when the environment is alive, so
+ * there can be active transactions.
+ */
+ __db_log_verify_global_report(logvrfy_hdl);
+ if (ret == DB_LOG_VERIFY_BAD)
+ okmsg = DB_STR_P("FAILED");
+ else {
+ DB_ASSERT(dbenv->env, ret == 0);
+ okmsg = DB_STR_P("SUCCEEDED");
+ }
+
+ __db_msg(dbenv->env, DB_STR_A("2504",
+ "Log verification ended and %s.", "%s"), okmsg);
+
+err:
+ if (logc != NULL)
+ (void)__logc_close(logc);
+ if ((tret = __destroy_log_vrfy_info(logvrfy_hdl)) != 0 && ret == 0)
+ ret = tret;
+ if (dtab.int_dispatch)
+ __os_free(dbenv->env, dtab.int_dispatch);
+ if (dtab.ext_dispatch)
+ __os_free(dbenv->env, dtab.ext_dispatch);
+
+ return (ret);
+}
+
+/*
+ * __env_init_verify --
+ *	Initialize the log-verify dispatch table.
+ */
+static int
+__env_init_verify(env, version, dtabp)
+ ENV *env;
+ u_int32_t version;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ /*
+ * We need to prime the verify dispatch table with the current verify
+ * functions. Then we overwrite only specific entries based on
+ * each previous version we support.
+ */
+ if ((ret = __bam_init_verify(env, dtabp)) != 0)
+ goto err;
+ if ((ret = __crdel_init_verify(env, dtabp)) != 0)
+ goto err;
+ if ((ret = __db_init_verify(env, dtabp)) != 0)
+ goto err;
+ if ((ret = __dbreg_init_verify(env, dtabp)) != 0)
+ goto err;
+ if ((ret = __fop_init_verify(env, dtabp)) != 0)
+ goto err;
+#ifdef HAVE_HASH
+ if ((ret = __ham_init_verify(env, dtabp)) != 0)
+ goto err;
+#endif
+#ifdef HAVE_HEAP
+ if ((ret = __heap_init_verify(env, dtabp)) != 0)
+ goto err;
+#endif
+#ifdef HAVE_QUEUE
+ if ((ret = __qam_init_verify(env, dtabp)) != 0)
+ goto err;
+#endif
+ if ((ret = __txn_init_verify(env, dtabp)) != 0)
+ goto err;
+
+ switch (version) {
+ case DB_LOGVERSION:
+ ret = 0;
+ break;
+
+ default:
+ __db_errx(env, DB_STR_A("2505", "Not supported version %lu",
+ "%lu"), (u_long)version);
+ ret = EINVAL;
+ break;
+ }
+err: return (ret);
+}
+
+/*
+ * __log_verify_wrap --
+ *	Wrapper function for APIs in other languages, like Java/C# and
+ *	scripting languages. It's much easier to implement the SWIG layer
+ *	when we split up the C structure.
+ *
+ * PUBLIC: int __log_verify_wrap __P((ENV *, const char *, u_int32_t,
+ * PUBLIC: const char *, const char *, time_t, time_t, u_int32_t,
+ * PUBLIC: u_int32_t, u_int32_t, u_int32_t, int, int));
+ */
+int
+__log_verify_wrap(env, envhome, cachesize, dbfile, dbname,
+ stime, etime, stfile, stoffset, efile, eoffset, caf, verbose)
+ ENV *env;
+ const char *envhome, *dbfile, *dbname;
+ time_t stime, etime;
+ u_int32_t cachesize, stfile, stoffset, efile, eoffset;
+ int caf, verbose;
+{
+ DB_LOG_VERIFY_CONFIG cfg;
+
+ memset(&cfg, 0, sizeof(cfg));
+ cfg.cachesize = cachesize;
+ cfg.temp_envhome = envhome;
+ cfg.dbfile = dbfile;
+ cfg.dbname = dbname;
+ cfg.start_time = stime;
+ cfg.end_time = etime;
+ cfg.start_lsn.file = stfile;
+ cfg.start_lsn.offset = stoffset;
+ cfg.end_lsn.file = efile;
+ cfg.end_lsn.offset = eoffset;
+ cfg.continue_after_fail = caf;
+ cfg.verbose = verbose;
+
+ return (__log_verify_pp(env->dbenv, &cfg));
+}
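+
+/*
+ * Usage sketch (illustrative): applications reach the verification code
+ * through the public DB_ENV->log_verify() method, which calls
+ * __log_verify_pp(). The environment home path and the range values
+ * below are hypothetical.
+ *
+ *	DB_ENV *dbenv;
+ *	DB_LOG_VERIFY_CONFIG cfg;
+ *	int ret;
+ *
+ *	(void)db_env_create(&dbenv, 0);
+ *	(void)dbenv->open(dbenv, "/path/to/env",
+ *	    DB_CREATE | DB_INIT_LOG | DB_INIT_MPOOL, 0);
+ *	memset(&cfg, 0, sizeof(cfg));
+ *	cfg.start_lsn.file = 1;		(verify from LSN [1][28] onward)
+ *	cfg.start_lsn.offset = 28;
+ *	cfg.continue_after_fail = 1;	(report errors but keep going)
+ *	ret = dbenv->log_verify(dbenv, &cfg);
+ */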
diff --git a/src/log/log_verify_auto.c b/src/log/log_verify_auto.c
new file mode 100644
index 00000000..08bc5d64
--- /dev/null
+++ b/src/log/log_verify_auto.c
@@ -0,0 +1,318 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/btree.h"
+#include "dbinc/txn.h"
+#include "dbinc/hash.h"
+#include "dbinc/heap.h"
+#include "dbinc/qam.h"
+#include "dbinc/fop.h"
+
+/*
+ * PUBLIC: int __crdel_init_verify __P((ENV *, DB_DISTAB *));
+ */
+int
+__crdel_init_verify(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __crdel_metasub_verify, DB___crdel_metasub)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __crdel_inmem_create_verify, DB___crdel_inmem_create)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __crdel_inmem_rename_verify, DB___crdel_inmem_rename)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __crdel_inmem_remove_verify, DB___crdel_inmem_remove)) != 0)
+ return (ret);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __db_init_verify __P((ENV *, DB_DISTAB *));
+ */
+int
+__db_init_verify(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_addrem_verify, DB___db_addrem)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_big_verify, DB___db_big)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_ovref_verify, DB___db_ovref)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_debug_verify, DB___db_debug)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_noop_verify, DB___db_noop)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pg_alloc_verify, DB___db_pg_alloc)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pg_free_verify, DB___db_pg_free)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_cksum_verify, DB___db_cksum)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pg_freedata_verify, DB___db_pg_freedata)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pg_init_verify, DB___db_pg_init)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pg_trunc_verify, DB___db_pg_trunc)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_realloc_verify, DB___db_realloc)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_relink_verify, DB___db_relink)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_merge_verify, DB___db_merge)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pgno_verify, DB___db_pgno)) != 0)
+ return (ret);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __dbreg_init_verify __P((ENV *, DB_DISTAB *));
+ */
+int
+__dbreg_init_verify(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __dbreg_register_verify, DB___dbreg_register)) != 0)
+ return (ret);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __bam_init_verify __P((ENV *, DB_DISTAB *));
+ */
+int
+__bam_init_verify(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_split_verify, DB___bam_split)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_rsplit_verify, DB___bam_rsplit)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_adj_verify, DB___bam_adj)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_cadjust_verify, DB___bam_cadjust)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_cdel_verify, DB___bam_cdel)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_repl_verify, DB___bam_repl)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_root_verify, DB___bam_root)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_curadj_verify, DB___bam_curadj)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_rcuradj_verify, DB___bam_rcuradj)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_irep_verify, DB___bam_irep)) != 0)
+ return (ret);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __fop_init_verify __P((ENV *, DB_DISTAB *));
+ */
+int
+__fop_init_verify(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __fop_create_verify, DB___fop_create)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __fop_remove_verify, DB___fop_remove)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __fop_write_verify, DB___fop_write)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __fop_rename_verify, DB___fop_rename)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __fop_rename_verify, DB___fop_rename_noundo)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __fop_file_remove_verify, DB___fop_file_remove)) != 0)
+ return (ret);
+ return (0);
+}
+
+#ifdef HAVE_HASH
+/*
+ * PUBLIC: int __ham_init_verify __P((ENV *, DB_DISTAB *));
+ */
+int
+__ham_init_verify(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_insdel_verify, DB___ham_insdel)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_newpage_verify, DB___ham_newpage)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_splitdata_verify, DB___ham_splitdata)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_replace_verify, DB___ham_replace)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_copypage_verify, DB___ham_copypage)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_metagroup_verify, DB___ham_metagroup)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_groupalloc_verify, DB___ham_groupalloc)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_changeslot_verify, DB___ham_changeslot)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_contract_verify, DB___ham_contract)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_curadj_verify, DB___ham_curadj)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __ham_chgpg_verify, DB___ham_chgpg)) != 0)
+ return (ret);
+ return (0);
+}
+
+#endif /* HAVE_HASH */
+#ifdef HAVE_HEAP
+/*
+ * PUBLIC: int __heap_init_verify __P((ENV *, DB_DISTAB *));
+ */
+int
+__heap_init_verify(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __heap_addrem_verify, DB___heap_addrem)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __heap_pg_alloc_verify, DB___heap_pg_alloc)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __heap_trunc_meta_verify, DB___heap_trunc_meta)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __heap_trunc_page_verify, DB___heap_trunc_page)) != 0)
+ return (ret);
+ return (0);
+}
+#endif /* HAVE_HEAP */
+#ifdef HAVE_QUEUE
+/*
+ * PUBLIC: int __qam_init_verify __P((ENV *, DB_DISTAB *));
+ */
+int
+__qam_init_verify(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __qam_incfirst_verify, DB___qam_incfirst)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __qam_mvptr_verify, DB___qam_mvptr)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __qam_del_verify, DB___qam_del)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __qam_add_verify, DB___qam_add)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __qam_delext_verify, DB___qam_delext)) != 0)
+ return (ret);
+ return (0);
+}
+
+#endif /* HAVE_QUEUE */
+/*
+ * PUBLIC: int __txn_init_verify __P((ENV *, DB_DISTAB *));
+ */
+int
+__txn_init_verify(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __txn_regop_verify, DB___txn_regop)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __txn_ckp_verify, DB___txn_ckp)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __txn_child_verify, DB___txn_child)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __txn_prepare_verify, DB___txn_prepare)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __txn_recycle_verify, DB___txn_recycle)) != 0)
+ return (ret);
+ return (0);
+}
diff --git a/src/log/log_verify_int.c b/src/log/log_verify_int.c
new file mode 100644
index 00000000..abe564c6
--- /dev/null
+++ b/src/log/log_verify_int.c
@@ -0,0 +1,4353 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+/*
+ * This file contains verification functions for all types of log records,
+ * one for each type. We can't automate this like the log_type_print/read
+ * functions because there is no consistent handling: each type of log record
+ * has unique ways to verify, and unique information to extract.
+ *
+ * In each verification function, we first call the log_type_read function
+ * to get the log_type_args structure, then extract information according to
+ * the type of log. The log types can be grouped into different categories,
+ * each of which has similar types of information.
+ *
+ * For example, the txn_regop and txn_ckp types both have timestamps, and we
+ * want to maintain a (timestamp, lsn) mapping, so we have an on_timestamp
+ * function and call it in the txn_regop_verify and txn_ckp_verify functions;
+ * in those two functions we may call other on_*** functions to extract and
+ * verify other information.
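+ *
+ * As a hedged illustration of this pattern (using a hypothetical record
+ * type "foo"; the real instances follow below), a verify function looks
+ * like:
+ *
+ *	int
+ *	__foo_verify(env, dbtp, lsnp, notused2, lvhp)
+ *	{
+ *		__foo_args *argp;
+ *		...
+ *		if ((ret = __foo_read(env, dbtp->data, &argp)) != 0)
+ *			return (ret);
+ *		LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ *		ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+ *	out:
+ *	err:	__os_free(env, argp);
+ *		return (ret);
+ *	}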
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/fop.h"
+#include "dbinc/hash.h"
+#include "dbinc/heap.h"
+#include "dbinc/qam.h"
+#include "dbinc/txn.h"
+
+#include "dbinc/log_verify.h"
+
+static int __log_vrfy_proc __P((DB_LOG_VRFY_INFO *, DB_LSN, DB_LSN,
+ u_int32_t, DB_TXN *, int32_t, int *));
+static int __lv_ckp_vrfy_handler __P((DB_LOG_VRFY_INFO *,
+ VRFY_TXN_INFO *, void *));
+static const char *__lv_dbreg_str __P((u_int32_t));
+static int __lv_dbregid_to_dbtype __P((DB_LOG_VRFY_INFO *, int32_t, DBTYPE *));
+static int __lv_dbt_str __P((const DBT *, char **));
+static const char *__lv_dbtype_str __P((DBTYPE));
+static u_int32_t __lv_first_offset __P((ENV *));
+static int __lv_new_logfile_vrfy __P((DB_LOG_VRFY_INFO *, const DB_LSN *));
+static int __lv_log_fwdscr_oncmt __P((DB_LOG_VRFY_INFO *, DB_LSN,
+ u_int32_t, u_int32_t, int32_t));
+static int __lv_log_fwdscr_onrec __P((DB_LOG_VRFY_INFO *,
+ u_int32_t, u_int32_t, DB_LSN, DB_LSN));
+static int __lv_log_mismatch __P((DB_LOG_VRFY_INFO *, DB_LSN, DBTYPE, DBTYPE));
+static int __lv_on_bam_log __P((DB_LOG_VRFY_INFO *, DB_LSN, int32_t));
+static int __lv_on_ham_log __P((DB_LOG_VRFY_INFO *, DB_LSN, int32_t));
+static int __lv_on_heap_log __P((DB_LOG_VRFY_INFO *, DB_LSN, int32_t));
+static int __lv_on_new_txn __P((DB_LOG_VRFY_INFO *, const DB_LSN *,
+ const DB_TXN *, u_int32_t, int32_t, const DBT *));
+static int __lv_on_nontxn_update __P((DB_LOG_VRFY_INFO *, const DB_LSN *,
+ u_int32_t, u_int32_t, int32_t));
+static int __lv_on_page_update __P((DB_LOG_VRFY_INFO *, DB_LSN, int32_t,
+ db_pgno_t, DB_TXN *, int *));
+static int __lv_on_qam_log __P((DB_LOG_VRFY_INFO *, DB_LSN, int32_t));
+static int __lv_on_timestamp __P((DB_LOG_VRFY_INFO *, const DB_LSN *,
+ int32_t, u_int32_t));
+static int __lv_on_txn_aborted __P((DB_LOG_VRFY_INFO *));
+static int __lv_on_txn_logrec __P((DB_LOG_VRFY_INFO *, const DB_LSN *,
+ const DB_LSN *, const DB_TXN *, u_int32_t, int32_t));
+static int __lv_vrfy_for_dbfile __P((DB_LOG_VRFY_INFO *, int32_t, int *));
+
+/* General error handlers, called when a check fails. */
+#define ON_ERROR(lvh, errv) do { \
+ (lvh)->flags |= (errv); \
+ if (F_ISSET((lvh), DB_LOG_VERIFY_CAF)) \
+ ret = 0;/* Ignore the error and continue. */ \
+ goto err; \
+} while (0)
+
+/* Used by logs of unsupported types. */
+#define ON_NOT_SUPPORTED(env, lvh, lsn, ltype) do { \
+ __db_errx((env), DB_STR_A("2536", \
+ "[%lu][%lu] Not supported type of log record %u.", \
+ "%lu %lu %u"), (u_long)((lsn).file), (u_long)((lsn).offset),\
+ (ltype)); \
+ (lvh)->unknown_logrec_cnt++; \
+ goto err; \
+} while (0)
+
+#define SKIP_FORWARD_CHK(type) ((type) != DB___txn_regop && \
+ (type) != DB___txn_ckp && (type) != DB___fop_rename && \
+ (type) != DB___txn_child)
+
+#define NOTCOMMIT(type) ((type) != DB___txn_regop && \
+ (type) != DB___txn_child)
+
+#define LOG_VRFY_PROC(lvh, lsn, argp, fileid) do { \
+ int __lv_log_vrfy_proc_step = 0; \
+ if ((ret = __log_vrfy_proc((lvh), (lsn), (argp)->prev_lsn, \
+ (argp)->type, (argp)->txnp, (fileid), \
+ &__lv_log_vrfy_proc_step)) != 0) \
+ goto err; \
+ if (__lv_log_vrfy_proc_step == 1) \
+ goto out; \
+ else if (__lv_log_vrfy_proc_step == -1) \
+ goto err; \
+ else \
+ DB_ASSERT(lvh->dbenv->env, \
+ __lv_log_vrfy_proc_step == 0); \
+} while (0)
+
+/* Log record handlers used by log types involving page updates. */
+#define ON_PAGE_UPDATE(lvh, lsn, argp, pgno) do { \
+ int __lv_onpgupdate_res; \
+ if ((ret = __lv_on_page_update((lvh), (lsn), (argp)->fileid, \
+ (pgno), (argp)->txnp, &__lv_onpgupdate_res)) != 0) \
+ goto err; \
+ if (__lv_onpgupdate_res == 1) \
+ goto out; \
+ else if (__lv_onpgupdate_res == -1) \
+ goto err; \
+ else \
+ DB_ASSERT(lvh->dbenv->env, __lv_onpgupdate_res == 0); \
+} while (0)
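+
+/*
+ * Both LOG_VRFY_PROC and ON_PAGE_UPDATE expand to code that may
+ * "goto out" or "goto err", so every verify function using them must
+ * define both labels, even when there is nothing to do at them.
+ */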
+
+static int
+__lv_on_page_update(lvh, lsn, fileid, pgno, txnp, step)
+ DB_LOG_VRFY_INFO *lvh;
+ DB_LSN lsn;
+ int32_t fileid;
+ db_pgno_t pgno;
+ DB_TXN *txnp;
+ int *step;
+{
+ u_int32_t otxn, txnid;
+ int res, ret;
+
+ txnid = txnp->txnid;
+ res = ret = 0;
+
+ if ((ret = __add_page_to_txn(lvh, fileid, pgno,
+ txnid, &otxn, &res)) != 0)
+ ON_ERROR(lvh, DB_LOG_VERIFY_INTERR);
+ if (res != -1) {/* No access violation, we are done. */
+ *step = 0;
+ goto out;
+ }
+ /*
+ * It's OK for a child txn to update its parent's page, but not OK
+ * for a parent txn to update its active child's pages. We can't
+ * detect the child's abort, so we may falsely report that a parent
+ * txn is updating its child's pages.
+ */
+ if ((ret = __is_ancestor_txn(lvh, otxn, txnid, lsn, &res)) != 0)
+ ON_ERROR(lvh, DB_LOG_VERIFY_INTERR);
+ if (res) {/* The txnid is updating its parent otxn's pages. */
+ *step = 0;
+ goto out;
+ }
+ if ((ret = __is_ancestor_txn(lvh, txnid, otxn, lsn, &res)) != 0)
+ ON_ERROR(lvh, DB_LOG_VERIFY_INTERR);
+ if (res) {/* The txnid is updating its active child otxn's pages. */
+ __db_errx(lvh->dbenv->env, DB_STR_A("2537",
+ "[%lu][%lu] [WARNING] Parent txn %lx is updating its "
+ "active child txn %lx's pages, or %lx aborted.",
+ "%lu %lu %lx %lx %lx"), (u_long)lsn.file,
+ (u_long)lsn.offset, (u_long)txnid,
+ (u_long)otxn, (u_long)otxn);
+ *step = 0;
+ goto out;
+ }
+ /*
+ * It's likely that the two txns are parent-child and the child
+ * aborted, but we can't figure out this fact from the log.
+ */
+ __db_errx(lvh->dbenv->env, DB_STR_A("2538",
+ "[%lu][%lu] [WARNING] Txn %lx is updating txn %lx's pages.",
+ "%lu %lu %lx %lx"), (u_long)lsn.file, (u_long)lsn.offset,
+ (u_long)txnid, (u_long)otxn);
+ *step = 0;
+out:
+err:
+ return (ret);
+}
+
+/*
+ * This macro is put in all types of verify functions where a db file is
+ * updated, but no page number/lock is involved.
+ */
+#define ON_PAGE_UPDATE4
+
+/*
+ * General log record handler used by all log verify functions.
+ */
+static int
+__log_vrfy_proc(lvh, lsn, prev_lsn, type, txnp, fileid, step)
+ DB_LOG_VRFY_INFO *lvh;
+ DB_LSN lsn, prev_lsn;
+ u_int32_t type; /* Log record type. */
+ DB_TXN *txnp;
+ int32_t fileid;
+ int *step;
+{
+ int dovrfy, ret;
+
+ dovrfy = 1;
+ ret = 0;
+ /*
+ * step tells the caller whether to go on with the rest of its body,
+ * or to goto its err/out label.
+ * 0: go on after this function; 1: goto out; -1: goto err.
+ */
+ *step = 0;
+
+ if (F_ISSET(lvh, DB_LOG_VERIFY_FORWARD)) {
+ /* Commits are not abort/beginnings. */
+ if (NOTCOMMIT(type) && ((ret = __lv_log_fwdscr_onrec(
+ lvh, txnp->txnid, type, prev_lsn, lsn)) != 0))
+ goto err;
+ if (SKIP_FORWARD_CHK(type))
+ goto out;
+ } else {/* Verifying */
+ if (F_ISSET(lvh, DB_LOG_VERIFY_VERBOSE))
+ __db_errx(lvh->dbenv->env, DB_STR_A("2539",
+ "[%lu][%lu] Verifying log record of type %s",
+ "%lu %lu %s"), (u_long)lsn.file,
+ (u_long)lsn.offset, LOGTYPE_NAME(lvh, type));
+ /*
+ * If verifying a log range and we've passed the initial part
+ * which may have partial txns, remove the PARTIAL bit.
+ */
+ if (F_ISSET(lvh, DB_LOG_VERIFY_PARTIAL) &&
+ LOG_COMPARE(&lsn, &(lvh->valid_lsn)) >= 0) {
+ lvh->valid_lsn.offset = lvh->valid_lsn.file = 0;
+ F_CLR(lvh, DB_LOG_VERIFY_PARTIAL);
+ }
+
+ if ((ret = __lv_new_logfile_vrfy(lvh, &lsn)) != 0)
+ ON_ERROR(lvh, DB_LOG_VERIFY_ERR);
+ /* If only verify a db file, ignore logs about other dbs. */
+ if (F_ISSET(lvh, DB_LOG_VERIFY_DBFILE) && fileid !=
+ INVAL_DBREGID && (ret = __lv_vrfy_for_dbfile(lvh,
+ fileid, &dovrfy)) != 0)
+ goto err;
+ if (!dovrfy)
+ goto out;
+ if (lvh->aborted_txnid != 0 &&
+ ((ret = __lv_on_txn_aborted(lvh)) != 0))
+ goto err;
+ if ((ret = __get_aborttxn(lvh, lsn)) != 0)
+ goto err;
+ if (txnp->txnid >= TXN_MINIMUM) {
+ if ((ret = __lv_on_txn_logrec(lvh, &lsn, &(prev_lsn),
+ txnp, type, fileid)) != 0)
+ ON_ERROR(lvh, DB_LOG_VERIFY_ERR);
+ } else {/* Non-txnal updates. */
+ if ((ret = __lv_on_nontxn_update(lvh, &lsn,
+ txnp->txnid, type, fileid)) != 0)
+ ON_ERROR(lvh, DB_LOG_VERIFY_ERR);
+ }
+ }
+ if (0) {
+out:
+ *step = 1;
+ }
+ if (0) {
+err:
+ *step = -1;
+ }
+ return (ret);
+}
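+
+/*
+ * The "if (0)" blocks above are entered only via goto: the main path falls
+ * past them, while "goto out" and "goto err" set the step value that the
+ * LOG_VRFY_PROC macro inspects in the calling verify function.
+ */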
+
+/* Log record handlers used by log types for each access method. */
+static int
+__lv_on_bam_log(lvh, lsn, fileid)
+ DB_LOG_VRFY_INFO *lvh;
+ DB_LSN lsn;
+ int32_t fileid;
+{
+ int ret;
+ DBTYPE dbtype;
+ if ((ret = __lv_dbregid_to_dbtype(lvh, fileid, &dbtype)) == 0 &&
+ dbtype != DB_BTREE && dbtype != DB_RECNO && dbtype != DB_HASH)
+ ret = __lv_log_mismatch(lvh, lsn, dbtype, DB_BTREE);
+ if (ret == DB_NOTFOUND && F_ISSET(lvh, DB_LOG_VERIFY_PARTIAL))
+ ret = 0;
+ return (ret);
+}
+
+static int
+__lv_on_ham_log(lvh, lsn, fileid)
+ DB_LOG_VRFY_INFO *lvh;
+ DB_LSN lsn;
+ int32_t fileid;
+{
+ int ret;
+ DBTYPE dbtype;
+ if ((ret = __lv_dbregid_to_dbtype(lvh, fileid, &dbtype)) == 0 &&
+ dbtype != DB_HASH)
+ ret = __lv_log_mismatch(lvh, lsn, dbtype, DB_HASH);
+ if (ret == DB_NOTFOUND && F_ISSET(lvh, DB_LOG_VERIFY_PARTIAL))
+ ret = 0;
+ return (ret);
+}
+
+static int
+__lv_on_heap_log(lvh, lsn, fileid)
+ DB_LOG_VRFY_INFO *lvh;
+ DB_LSN lsn;
+ int32_t fileid;
+{
+ int ret;
+ DBTYPE dbtype;
+ if ((ret = __lv_dbregid_to_dbtype(lvh, fileid, &dbtype)) == 0 &&
+ dbtype != DB_HEAP)
+ ret = __lv_log_mismatch(lvh, lsn, dbtype, DB_HEAP);
+ if (ret == DB_NOTFOUND && F_ISSET(lvh, DB_LOG_VERIFY_PARTIAL))
+ ret = 0;
+ return (ret);
+}
+
+static int
+__lv_on_qam_log(lvh, lsn, fileid)
+ DB_LOG_VRFY_INFO *lvh;
+ DB_LSN lsn;
+ int32_t fileid;
+{
+ int ret;
+ DBTYPE dbtype;
+ if ((ret = __lv_dbregid_to_dbtype(lvh, fileid, &dbtype)) == 0 &&
+ dbtype != DB_QUEUE)
+ ret = __lv_log_mismatch(lvh, lsn, dbtype, DB_QUEUE);
+ if (ret == DB_NOTFOUND && F_ISSET(lvh, DB_LOG_VERIFY_PARTIAL))
+ ret = 0;
+ return (ret);
+}
+
+/* Catch commits and store into lvinfo->txnrngs database. */
+static int
+__lv_log_fwdscr_oncmt(lvinfo, lsn, txnid, ptxnid, timestamp)
+ DB_LOG_VRFY_INFO *lvinfo;
+ DB_LSN lsn;
+ u_int32_t txnid, ptxnid;
+ int32_t timestamp;
+{
+ int ret;
+ struct __lv_txnrange tr;
+ DBT key, data;
+
+ memset(&tr, 0, sizeof(tr));
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+ tr.txnid = txnid;
+ tr.end = lsn;
+ tr.when_commit = timestamp;
+ tr.ptxnid = ptxnid;
+ key.data = &(txnid);
+ key.size = sizeof(txnid);
+ data.data = &tr;
+ data.size = sizeof(tr);
+ if ((ret = __db_put(lvinfo->txnrngs, lvinfo->ip, NULL,
+ &key, &data, 0)) != 0)
+ goto err;
+err:
+ return (ret);
+}
+
+/* Catch aborts and txn beginnings and store into lvinfo->txnrngs database. */
+static int
+__lv_log_fwdscr_onrec(lvinfo, txnid, lrtype, prevlsn, lsn)
+ DB_LOG_VRFY_INFO *lvinfo;
+ u_int32_t txnid, lrtype;
+ DB_LSN prevlsn, lsn;
+{
+ int doput, ret, ret2, tret;
+ u_int32_t putflag;
+ struct __lv_txnrange tr, *ptr;
+ DBC *csr;
+ DBT key, key2, data, data2;
+
+ /* Ignore non-txnal log records. */
+ if (txnid < TXN_MINIMUM)
+ return (0);
+
+ /* Not used for now, but may be used later; pass lint checks. */
+ COMPQUIET(lrtype, 0);
+ putflag = 0;
+ doput = ret = ret2 = 0;
+ csr = NULL;
+ memset(&tr, 0, sizeof(tr));
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+ memset(&key2, 0, sizeof(DBT));
+ memset(&data2, 0, sizeof(DBT));
+ key.data = &txnid;
+ key.size = sizeof(txnid);
+ tr.txnid = txnid;
+ tr.when_commit = 0;/* This is not a __txn_regop record. */
+
+ if ((ret = __db_cursor(lvinfo->txnrngs, lvinfo->ip,
+ NULL, &csr, 0)) != 0)
+ goto err;
+ /*
+ * Since we scan the log backwards during this pass, if the txnid is
+ * first seen here or is reused later, the txn was aborted after this
+ * log record; if this log record is the first one of a txn, we have
+ * the beginning of the txn; otherwise the log record is one of the
+ * actions taken within the txn, and we don't do anything.
+ */
+ if ((ret = __dbc_get(csr, &key, &data, DB_SET)) != 0 &&
+ ret != DB_NOTFOUND)
+ goto err;
+
+ ptr = (struct __lv_txnrange *)data.data;
+ if (ret == DB_NOTFOUND || !IS_ZERO_LSN(ptr->begin)) {
+ tr.end = lsn;
+ data.data = &tr;
+ data.size = sizeof(tr);
+ doput = 1;
+ key2.data = &lsn;
+ key2.size = sizeof(lsn);
+ data2.data = &(tr.txnid);
+ data2.size = sizeof(tr.txnid);
+ putflag = DB_KEYFIRST;
+ if ((ret2 = __db_put(lvinfo->txnaborts, lvinfo->ip, NULL,
+ &key2, &data2, 0)) != 0) {
+ ret = ret2;
+ goto err;
+ }
+ } else if (ret == 0 && IS_ZERO_LSN(prevlsn)) {/* The beginning of txn.*/
+ /* The begin field must be [0, 0]. */
+ DB_ASSERT(lvinfo->dbenv->env, IS_ZERO_LSN(ptr->begin));
+ ptr->begin = lsn;
+ putflag = DB_CURRENT;
+ doput = 1;
+ }
+
+ if (doput && (ret = __dbc_put(csr, &key, &data, putflag)) != 0)
+ goto err;
+err:
+ if (csr != NULL && (tret = __dbc_close(csr)) != 0 && ret == 0)
+ ret = tret;
+
+ return (ret);
+}
+
+/*
+ * Set *dovrfy to 0 if we are verifying logs for a specified db file and
+ * fileid is not the one we want; otherwise set *dovrfy to 1. If a DB
+ * operation fails, its error is returned.
+ */
+static int
+__lv_vrfy_for_dbfile(lvh, fileid, dovrfy)
+ DB_LOG_VRFY_INFO *lvh;
+ int32_t fileid;
+ int *dovrfy;
+{
+ VRFY_FILEREG_INFO *fregp;
+ u_int32_t i;
+ int ret, tret;
+ DBT tgtkey;
+
+ ret = tret = 0;
+ *dovrfy = 0;
+ fregp = NULL;
+ memset(&tgtkey, 0, sizeof(tgtkey));
+ tgtkey.data = lvh->target_dbid;
+ tgtkey.size = DB_FILE_ID_LEN;
+ ret = __get_filereg_info(lvh, &tgtkey, &fregp);
+
+ /*
+ * If the target db file is not seen yet, we don't verify any file,
+ * and it does not mean anything wrong.
+ */
+ if (ret == DB_NOTFOUND) {
+ ret = 0;
+ goto out;
+ }
+ if (ret != 0)
+ goto err;
+
+ for (i = 0; i < fregp->regcnt; i++)
+ if (fregp->dbregids[i] == fileid) {
+ *dovrfy = 1;
+ goto out;
+ }
+out:
+err:
+ if (fregp != NULL &&
+ (tret = __free_filereg_info(fregp)) != 0 && ret == 0)
+ ret = tret;
+
+ return (ret);
+}
+
+static int
+__lv_log_mismatch(lvh, lsn, dbtype, exp_dbtype)
+ DB_LOG_VRFY_INFO *lvh;
+ DB_LSN lsn;
+ DBTYPE dbtype, exp_dbtype;
+{
+ int ret;
+
+ __db_errx(lvh->dbenv->env, DB_STR_A("2540",
+ "[%lu][%lu] Log record type does not match related database type, "
+ "current database type: %s, expected database type according to "
+ "the log record type: %s.", "%lu %lu %s %s"),
+ (u_long)lsn.file, (u_long)lsn.offset, __lv_dbtype_str(dbtype),
+ __lv_dbtype_str(exp_dbtype));
+ ret = DB_LOG_VERIFY_BAD;
+ ON_ERROR(lvh, DB_LOG_VERIFY_ERR);
+err:
+ return (ret);
+}
+
+static int
+__lv_dbregid_to_dbtype(lvh, id, ptype)
+ DB_LOG_VRFY_INFO *lvh;
+ int32_t id;
+ DBTYPE *ptype;
+{
+ int ret;
+ VRFY_FILELIFE *pflife;
+
+ ret = 0;
+ pflife = NULL;
+
+ if ((ret = __get_filelife(lvh, id, &pflife)) != 0)
+ goto err;
+ *ptype = pflife->dbtype;
+err:
+ if (pflife != NULL)
+ __os_free(lvh->dbenv->env, pflife);
+
+ return (ret);
+}
+
+/*
+ * __db_log_verify_global_report --
+ * Report statistics data in DB_LOG_VRFY_INFO handle.
+ *	Report the statistics stored in the DB_LOG_VRFY_INFO handle.
+ * PUBLIC: void __db_log_verify_global_report __P((const DB_LOG_VRFY_INFO *));
+ */
+void
+__db_log_verify_global_report(lvinfo)
+ const DB_LOG_VRFY_INFO *lvinfo;
+{
+ u_int32_t i, nltype;
+
+ __db_msg(lvinfo->dbenv->env,
+ "Number of active transactions: %u;", lvinfo->ntxn_active);
+ __db_msg(lvinfo->dbenv->env,
+ "Number of committed transactions: %u;", lvinfo->ntxn_commit);
+ __db_msg(lvinfo->dbenv->env,
+ "Number of aborted transactions: %u;", lvinfo->ntxn_abort);
+ __db_msg(lvinfo->dbenv->env,
+ "Number of prepared transactions: %u;", lvinfo->ntxn_prep);
+ __db_msg(lvinfo->dbenv->env,
+ "Total number of checkpoint: %u;", lvinfo->nckp);
+ __db_msg(lvinfo->dbenv->env,
+ "Total number of non-transactional updates: %u;",
+ lvinfo->non_txnup_cnt);
+ __db_msg(lvinfo->dbenv->env,
+ "Total number of unknown log records: %u;",
+ lvinfo->unknown_logrec_cnt);
+ __db_msg(lvinfo->dbenv->env,
+ "Total number of app-specific log record: %u;",
+ lvinfo->external_logrec_cnt);
+ __db_msg(lvinfo->dbenv->env,
+ "The number of each type of log record:");
+
+ for (i = 0; i < 256; i++) {
+ nltype = lvinfo->lrtypes[i];
+ if (LOGTYPE_NAME(lvinfo, i) != NULL)
+ __db_msg(lvinfo->dbenv->env, "\n\t%s : %u;",
+ LOGTYPE_NAME(lvinfo, i), nltype);
+ }
+}
+
+/*
+ * PUBLIC: int __crdel_metasub_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__crdel_metasub_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __crdel_metasub_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __crdel_metasub_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __crdel_inmem_create_verify __P((ENV *, DBT *,
+ * PUBLIC: DB_LSN *, db_recops, void *));
+ */
+int
+__crdel_inmem_create_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __crdel_inmem_create_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret = __crdel_inmem_create_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __crdel_inmem_rename_verify __P((ENV *, DBT *,
+ * PUBLIC: DB_LSN *, db_recops, void *));
+ */
+int
+__crdel_inmem_rename_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __crdel_inmem_rename_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret = __crdel_inmem_rename_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID);
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __crdel_inmem_remove_verify __P((ENV *, DBT *,
+ * PUBLIC: DB_LSN *, db_recops, void *));
+ */
+int
+__crdel_inmem_remove_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __crdel_inmem_remove_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret = __crdel_inmem_remove_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID);
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __db_addrem_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_addrem_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __db_addrem_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __db_addrem_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __db_big_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_big_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __db_big_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __db_big_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __db_ovref_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_ovref_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __db_ovref_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __db_ovref_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __db_relink_42_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_relink_42_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __db_relink_42_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __db_relink_42_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ ON_NOT_SUPPORTED(env, lvh, *lsnp, argp->type);
+ /* LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid); */
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __db_debug_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_debug_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __db_debug_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret = __db_debug_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __db_noop_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_noop_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __db_noop_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __db_noop_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __db_pg_alloc_42_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_pg_alloc_42_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __db_pg_alloc_42_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __db_pg_alloc_42_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ ON_NOT_SUPPORTED(env, lvh, *lsnp, argp->type);
+ /* LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid); */
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __db_pg_alloc_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_pg_alloc_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __db_pg_alloc_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __db_pg_alloc_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __db_pg_free_42_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_pg_free_42_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __db_pg_free_42_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __db_pg_free_42_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ ON_NOT_SUPPORTED(env, lvh, *lsnp, argp->type);
+ /* LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid); */
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __db_pg_free_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_pg_free_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __db_pg_free_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __db_pg_free_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __db_cksum_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_cksum_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __db_cksum_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret = __db_cksum_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID);
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __db_pg_freedata_42_verify __P((ENV *, DBT *,
+ * PUBLIC: DB_LSN *, db_recops, void *));
+ */
+int
+__db_pg_freedata_42_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __db_pg_freedata_42_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __db_pg_freedata_42_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ ON_NOT_SUPPORTED(env, lvh, *lsnp, argp->type);
+ /* LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid); */
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __db_pg_freedata_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_pg_freedata_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __db_pg_freedata_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __db_pg_freedata_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __db_pg_init_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_pg_init_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __db_pg_init_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __db_pg_init_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __db_pg_sort_44_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_pg_sort_44_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __db_pg_sort_44_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __db_pg_sort_44_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ ON_NOT_SUPPORTED(env, lvh, *lsnp, argp->type);
+ /* LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid); */
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __db_pg_trunc_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_pg_trunc_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __db_pg_trunc_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __db_pg_trunc_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE4 /* No pages are locked by txns. */
+out:
+err:
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __db_realloc_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_realloc_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __db_realloc_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __db_realloc_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE4 /* No pages are locked by txns. */
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __db_relink_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_relink_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __db_relink_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __db_relink_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __db_merge_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_merge_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __db_merge_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __db_merge_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __db_pgno_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_pgno_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __db_pgno_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __db_pgno_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+
+out:
+
+err:
+
+ __os_free(env, argp);
+ return (ret);
+}
+
+static const char *
+__lv_dbreg_str(op)
+ u_int32_t op;
+{
+ const char *p;
+
+ switch (op) {
+ case DBREG_CHKPNT:
+ p = "DBREG_CHKPNT";
+ break;
+ case DBREG_RCLOSE:
+ p = "DBREG_RCLOSE";
+ break;
+ case DBREG_CLOSE:
+ p = "DBREG_CLOSE";
+ break;
+ case DBREG_OPEN:
+ p = "DBREG_OPEN";
+ break;
+ case DBREG_PREOPEN:
+ p = "DBREG_PREOPEN";
+ break;
+ case DBREG_REOPEN:
+ p = "DBREG_REOPEN";
+ break;
+ case DBREG_XCHKPNT:
+ p = "DBREG_XCHKPNT";
+ break;
+ case DBREG_XOPEN:
+ p = "DBREG_XOPEN";
+ break;
+ case DBREG_XREOPEN:
+ p = "DBREG_XREOPEN";
+ break;
+ default:
+ p = DB_STR_P("Unknown dbreg op code");
+ break;
+ }
+
+ return (p);
+}
+
+static int
+__lv_dbt_str(dbt, str)
+ const DBT *dbt;
+ char **str;
+{
+ char *p, *q;
+ u_int32_t buflen, bufsz, i;
+ int ret;
+
+ ret = 0;
+ p = q = NULL;
+ buflen = bufsz = i = 0;
+ bufsz = sizeof(char) * dbt->size * 2;
+
+ if ((ret = __os_malloc(NULL, bufsz, &p)) != 0)
+ goto err;
+ q = (char *)dbt->data;
+
+ memset(p, 0, bufsz);
+ /*
+ * Each unprintable character takes up several bytes, so beware of
+ * memory access violations.
+ */
+ for (i = 0; i < dbt->size && buflen < bufsz; i++) {
+ buflen = (u_int32_t)strlen(p);
+ snprintf(p + buflen, bufsz - (buflen + 1),
+ isprint(q[i]) || q[i] == 0x0a ? "%c" : "%x", q[i]);
+ }
+ *str = p;
+err:
+ return (ret);
+}
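+
+/*
+ * For example (illustrative): a DBT holding the three bytes {0x01, 'a', 'b'}
+ * is rendered by __lv_dbt_str as the string "1ab".
+ */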
+
+static const char *
+__lv_dbtype_str(dbtype)
+ DBTYPE dbtype;
+{
+ const char *p;
+
+ switch (dbtype) {
+ case DB_BTREE:
+ p = "DB_BTREE";
+ break;
+ case DB_HASH:
+ p = "DB_HASH";
+ break;
+ case DB_RECNO:
+ p = "DB_RECNO";
+ break;
+ case DB_QUEUE:
+ p = "DB_QUEUE";
+ break;
+ default:
+ p = DB_STR_P("Unknown db type");
+ break;
+ }
+
+ return (p);
+}
+
+/*
+ * PUBLIC: int __dbreg_register_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__dbreg_register_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __dbreg_register_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ VRFY_FILEREG_INFO *fregp, freg;
+ VRFY_FILELIFE *pflife, flife;
+ int checklife, rmv_dblife, ret, ret2;
+ u_int32_t opcode;
+ char *puid;
+ const char *dbfname;
+
+ dbfname = NULL;
+ checklife = 1;
+ opcode = 0;
+ ret = ret2 = rmv_dblife = 0;
+ puid = NULL;
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+ fregp = NULL;
+ pflife = NULL;
+ memset(&flife, 0, sizeof(flife));
+ memset(&freg, 0, sizeof(freg));
+
+ if ((ret = __dbreg_register_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ opcode = FLD_ISSET(argp->opcode, DBREG_OP_MASK);
+ dbfname = argp->name.size == 0 ? "(null)" : (char *)(argp->name.data);
+ /*
+ * We don't call LOG_VRFY_PROC macro here, so we have to copy the code
+ * snippet in __log_vrfy_proc here.
+ */
+ if (F_ISSET(lvh, DB_LOG_VERIFY_FORWARD)) {
+ if ((ret = __lv_log_fwdscr_onrec(lvh, argp->txnp->txnid,
+ argp->type, argp->prev_lsn, *lsnp)) != 0)
+ goto err;
+ goto out;
+ }
+ if (lvh->aborted_txnid != 0 && (ret = __lv_on_txn_aborted(lvh)) != 0)
+ goto err;
+
+ if ((ret = __get_filereg_info(lvh, &(argp->uid), &fregp)) != 0 &&
+ ret != DB_NOTFOUND)
+ goto err;
+
+ /*
+ * On DBREG_CLOSE, we should remove the fileuid-filename mapping
+ * from filereg because the file can be opened again with a
+ * different fileuid after it is closed.
+ */
+ if (ret == 0 && IS_DBREG_CLOSE(opcode)) {
+ if ((ret = __db_del(lvh->fileregs, lvh->ip, NULL,
+ &(argp->uid), 0)) != 0)
+ goto err;
+ }
+
+ /*
+ * If this db file is seen for the first time, store filereg and
+ * filelife info. Since we do an end-to-begin scan before the
+ * verification, we will be able to get the record, but its regcnt
+ * is 0 because we didn't know any dbregid yet.
+ */
+ if (ret == DB_NOTFOUND || fregp->regcnt == 0) {
+ /* Store filereg info unless it's a CLOSE. */
+ freg.fileid = argp->uid;
+ if (!IS_DBREG_CLOSE(opcode)) {
+ freg.regcnt = 1;
+ freg.dbregids = &(argp->fileid);
+ } else {
+ freg.regcnt = 0;
+ freg.dbregids = NULL;
+ }
+ if (ret == DB_NOTFOUND) {
+ /*
+ * If the db file is an in-memory db file, we can arrive
+ * here because there is no __fop_rename log for it;
+ * if the __fop_rename log record is out of the log range we
+ * verify, we will also arrive here.
+ */
+ if ((ret = __os_malloc(env, argp->name.size + 1,
+ &(freg.fname))) != 0)
+ goto err;
+ memset(freg.fname, 0,
+ sizeof(char) * (argp->name.size + 1));
+ (void)strncpy(freg.fname,
+ (const char *)(argp->name.data), argp->name.size);
+ } else /* We already have the name. */
+ if ((ret = __os_strdup(env,
+ fregp->fname, &(freg.fname))) != 0)
+ goto err;
+
+ if (!IS_DBREG_OPEN(opcode) &&
+ !F_ISSET(lvh, DB_LOG_VERIFY_PARTIAL)) {
+ /* It's likely that the DBREG_OPEN is not seen. */
+ __db_msg(env, DB_STR_A("2541",
+ "[%lu][%lu] Suspicious dbreg operation: %s, the "
+ "database file %s's register in log region does "
+ "not begin with an open operation.",
+ "%lu %lu %s %s"), (u_long)lsnp->file,
+ (u_long)lsnp->offset,
+ __lv_dbreg_str(opcode), dbfname);
+ }
+
+ /*
+ * PREOPEN is only generated when opening an in-memory db.
+ * Because we need to log the fileid we're allocating, but we
+ * don't have all the details yet, we are preopening the
+ * database and will actually complete the open later. So
+ * PREOPEN is not a real open, and the log should be ignored
+ * in log_verify.
+ * If fileuid is in a CLOSE operation there is no need to
+ * record it.
+ */
+ if ((opcode != DBREG_PREOPEN) && !IS_DBREG_CLOSE(opcode) &&
+ (ret = __put_filereg_info(lvh, &freg)) != 0)
+ goto err;
+
+ /* Store filelife info unless it's a CLOSE dbreg operation. */
+ if (!IS_DBREG_CLOSE(opcode)) {
+ flife.lifetime = opcode;
+ flife.dbregid = argp->fileid;
+ flife.lsn = *lsnp;
+ flife.dbtype = argp->ftype;
+ flife.meta_pgno = argp->meta_pgno;
+ memcpy(flife.fileid, argp->uid.data, argp->uid.size);
+ if ((ret = __put_filelife(lvh, &flife)) != 0)
+ goto err;
+ }
+ /* on_txn_logrec relies on the freg info in db first. */
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ goto out;
+ }
+
+ /*
+ * Add dbregid if it's new, and store the file register info; or
+ * remove dbregid from fregp if we are closing the file.
+ */
+ if ((ret = __add_dbregid(lvh, fregp, argp->fileid,
+ opcode, *lsnp, argp->ftype, argp->meta_pgno, &ret2)) != 0)
+ goto err;
+ ret = ret2;
+ if (ret != 0 && ret != 1 && ret != 2 && ret != -1)
+ goto err;/* DB operation error. */
+ if (ret != 0) {
+ /* Newly seen dbregid does not need to check life. */
+ if (ret == 1)
+ checklife = 0;
+ else if (ret == -1)
+ rmv_dblife = 1;/* The dbreg file id is closed. */
+ else if (ret == 2) {
+ __db_errx(env, DB_STR_A("2542",
+ "[%lu][%lu] Wrong dbreg operation "
+ "sequence, opening %s for id %d which is already "
+ "open.", "%lu %lu %s %d"),
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ dbfname, argp->fileid);
+ ret = DB_LOG_VERIFY_BAD;
+ ON_ERROR(lvh, DB_LOG_VERIFY_ERR);
+ }
+ if (!rmv_dblife && (ret = __put_filereg_info(lvh, fregp)) != 0)
+ goto err;
+ }
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+
+ if (!checklife)
+ goto out;
+
+ /*
+ * Verify that the database type does not change, and that the lifetime
+ * of a db file follows an open/chkpnt->[chkpnt]->close order.
+ * A VRFY_FILELIFE record is removed from db on DBREG_CLOSE,
+ * and inserted into db on DBREG_OPEN.
+ */
+ if (!IS_DBREG_OPEN(opcode) &&
+ (ret = __get_filelife(lvh, argp->fileid, &pflife)) != 0) {
+ if (ret == DB_NOTFOUND) {
+ if (!F_ISSET(lvh, DB_LOG_VERIFY_PARTIAL)) {
+ __db_errx(env, DB_STR_A("2543",
+ "[%lu][%lu] Wrong dbreg operation sequence,"
+ "file %s with id %d is first seen of "
+ "status: %s", "%lu %lu %s %d"),
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ dbfname, argp->fileid,
+ __lv_dbreg_str(opcode));
+ ret = DB_LOG_VERIFY_BAD;
+ ON_ERROR(lvh, DB_LOG_VERIFY_ERR);
+ } else
+ ret = 0;
+ }
+ goto err;
+ }
+
+ /* Can't go on verifying without pflife. */
+ if (pflife == NULL)
+ goto out;
+ if (argp->ftype != pflife->dbtype) {
+ if ((ret = __lv_dbt_str(&(argp->uid), &puid)) != 0)
+ goto err;
+ __db_errx(env, DB_STR_A("2544",
+ "[%lu][%lu] The dbtype of database file %s with uid %s "
+ " and id %d has changed from %s to %s.",
+ "%lu %lu %s %s %d %s %s"), (u_long)lsnp->file,
+ (u_long)lsnp->offset, dbfname, puid,
+ pflife->dbregid, __lv_dbtype_str(pflife->dbtype),
+ __lv_dbtype_str(argp->ftype));
+
+ __os_free(env, puid);
+ ret = DB_LOG_VERIFY_BAD;
+ ON_ERROR(lvh, DB_LOG_VERIFY_ERR);
+ }
+
+ if ((IS_DBREG_CLOSE(opcode) &&
+ (pflife->lifetime != DBREG_CHKPNT &&
+ pflife->lifetime != DBREG_XCHKPNT) &&
+ !IS_DBREG_OPEN(pflife->lifetime))) {
+ __db_errx(env, DB_STR_A("2545",
+ "[%lu][%lu] Wrong dbreg operation sequence for file %s "
+ "with id %d, current status: %s, new status: %s",
+ "%lu %lu %s %d %s %s"), (u_long)lsnp->file,
+ (u_long)lsnp->offset, dbfname, pflife->dbregid,
+ __lv_dbreg_str(pflife->lifetime),
+ __lv_dbreg_str(opcode));
+ ret = DB_LOG_VERIFY_BAD;
+ ON_ERROR(lvh, DB_LOG_VERIFY_ERR);
+ }
+
+ pflife->lifetime = opcode;
+ pflife->lsn = *lsnp;
+ if ((!rmv_dblife && (ret = __put_filelife(lvh, pflife)) != 0) ||
+ ((rmv_dblife || IS_DBREG_CLOSE(opcode)) &&
+ ((ret = __del_filelife(lvh, argp->fileid)) != 0)))
+ goto err;
+
+out:
+ /* There may be something to do here in future. */
+err:
+ __os_free(env, argp);
+ if (fregp != NULL &&
+ (ret2 = __free_filereg_info(fregp)) != 0 && ret == 0)
+ ret = ret2;
+ if (freg.fname != NULL)
+ __os_free(env, freg.fname);
+ if (pflife != NULL)
+ __os_free(env, pflife);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_split_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_split_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __bam_split_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __bam_split_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->left);
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->right);
+ /* Parent page lock is always released before __bam_page returns. */
+
+ if ((ret = __lv_on_bam_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+
+out:
+
+err:
+
+ __os_free(env, argp);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_split_42_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_split_42_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __bam_split_42_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __bam_split_42_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ ON_NOT_SUPPORTED(env, lvh, *lsnp, argp->type);
+ /* LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid); */
+
+err:
+
+ __os_free(env, argp);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_rsplit_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_rsplit_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __bam_rsplit_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __bam_rsplit_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+ if ((ret = __lv_on_bam_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+
+out:
+
+err:
+
+ __os_free(env, argp);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_adj_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_adj_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __bam_adj_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __bam_adj_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+ if ((ret = __lv_on_bam_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+
+out:
+
+err:
+
+ __os_free(env, argp);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_irep_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_irep_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __bam_irep_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __bam_irep_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+ if ((ret = __lv_on_bam_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+
+out:
+
+err:
+
+ __os_free(env, argp);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_cadjust_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_cadjust_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __bam_cadjust_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __bam_cadjust_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+ if ((ret = __lv_on_bam_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+
+out:
+
+err:
+
+ __os_free(env, argp);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_cdel_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_cdel_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __bam_cdel_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __bam_cdel_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+ if ((ret = __lv_on_bam_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+
+out:
+
+err:
+
+ __os_free(env, argp);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_repl_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_repl_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __bam_repl_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __bam_repl_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+ if ((ret = __lv_on_bam_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+
+out:
+
+err:
+
+ __os_free(env, argp);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_root_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_root_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __bam_root_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __bam_root_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ if ((ret = __lv_on_bam_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+
+out:
+
+err:
+
+ __os_free(env, argp);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_curadj_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_curadj_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __bam_curadj_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __bam_curadj_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ if ((ret = __lv_on_bam_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_rcuradj_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_rcuradj_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __bam_rcuradj_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __bam_rcuradj_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ if ((ret = __lv_on_bam_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_relink_43_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_relink_43_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __bam_relink_43_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __bam_relink_43_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ ON_NOT_SUPPORTED(env, lvh, *lsnp, argp->type);
+ /* LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid); */
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_merge_44_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_merge_44_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __bam_merge_44_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __bam_merge_44_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ ON_NOT_SUPPORTED(env, lvh, *lsnp, argp->type);
+ /* LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid); */
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __fop_create_42_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__fop_create_42_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __fop_create_42_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret = __fop_create_42_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ ON_NOT_SUPPORTED(env, lvh, *lsnp, argp->type);
+ /* LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID); */
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __fop_create_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__fop_create_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __fop_create_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret = __fop_create_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID);
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __fop_remove_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__fop_remove_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __fop_remove_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret = __fop_remove_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID);
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __fop_write_42_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__fop_write_42_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __fop_write_42_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret = __fop_write_42_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ ON_NOT_SUPPORTED(env, lvh, *lsnp, argp->type);
+ /* LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID); */
+err:
+
+ __os_free(env, argp);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __fop_write_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__fop_write_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __fop_write_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret = __fop_write_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID);
+ ON_PAGE_UPDATE4 /* No pages are locked by txns. */
+out:
+
+err:
+
+ __os_free(env, argp);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __fop_rename_42_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__fop_rename_42_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __fop_rename_42_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret = __fop_rename_42_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ ON_NOT_SUPPORTED(env, lvh, *lsnp, argp->type);
+ /* LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID); */
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __fop_rename_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__fop_rename_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __fop_rename_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ char *buf;
+ int ret;
+ size_t buflen;
+ VRFY_FILEREG_INFO freg, *fregp;
+
+ memset(&freg, 0, sizeof(freg));
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+ buf = NULL;
+
+ if ((ret = __fop_rename_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID);
+ if (F_ISSET(lvh, DB_LOG_VERIFY_FORWARD)) {
+ /*
+ * Since we build the fname-fuid map while iterating from end to
+ * beginning, we only store the latest file name, which is the
+ * name currently in use. So if the fileid is already stored and
+ * we see it again here, the db file was renamed and we already
+ * have its latest name.
+ *
+ * Store the dbfile path (dir/fname) in case there are db files
+ * with the same name in different data directories.
+ */
+ if (__get_filereg_info(lvh, &(argp->fileid), &fregp) == 0) {
+ if (fregp != NULL &&
+ (ret = __free_filereg_info(fregp)) != 0)
+ goto err;
+ goto out;
+ }
+ freg.fileid = argp->fileid;
+ if ((ret = __os_malloc(env, buflen = argp->dirname.size +
+ argp->newname.size + 2, &buf)) != 0)
+ goto err;
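+ /*
+ * The two extra bytes hold the '/' separator and the
+ * terminating NUL; this assumes dirname.data and newname.data
+ * are NUL-terminated strings, as the %s formats below require.
+ */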
+ snprintf(buf, buflen, "%s/%s", (char *)argp->dirname.data,
+ (char *)argp->newname.data);
+ freg.fname = buf;
+ /* Store the dbfilename<-->dbfileid map. */
+ if ((ret = __put_filereg_info(lvh, &freg)) != 0)
+ goto err;
+ }
+out:
+
+err:
+ if (buf != NULL)
+ __os_free(lvh->dbenv->env, buf);
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __fop_file_remove_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__fop_file_remove_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __fop_file_remove_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret = __fop_file_remove_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID);
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+#ifdef HAVE_HASH
+/*
+ * PUBLIC: int __ham_insdel_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_insdel_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __ham_insdel_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __ham_insdel_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+ if ((ret = __lv_on_ham_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+
+out:
+
+err:
+
+ __os_free(env, argp);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __ham_newpage_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_newpage_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __ham_newpage_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __ham_newpage_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+
+ ON_PAGE_UPDATE4 /* No pages are locked by txns. */
+ if ((ret = __lv_on_ham_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+
+out:
+
+err:
+
+ __os_free(env, argp);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __ham_splitdata_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_splitdata_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __ham_splitdata_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __ham_splitdata_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+ if ((ret = __lv_on_ham_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __ham_replace_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_replace_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __ham_replace_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __ham_replace_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+ if ((ret = __lv_on_ham_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __ham_copypage_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_copypage_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __ham_copypage_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __ham_copypage_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+ if ((ret = __lv_on_ham_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+
+out:
+
+err:
+
+ __os_free(env, argp);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __ham_metagroup_42_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_metagroup_42_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __ham_metagroup_42_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __ham_metagroup_42_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ ON_NOT_SUPPORTED(env, lvh, *lsnp, argp->type);
+ /* LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid); */
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __ham_metagroup_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_metagroup_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __ham_metagroup_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __ham_metagroup_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+ if ((ret = __lv_on_ham_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+
+out:
+
+err:
+
+ __os_free(env, argp);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __ham_groupalloc_42_verify __P((ENV *, DBT *,
+ * PUBLIC: DB_LSN *, db_recops, void *));
+ */
+int
+__ham_groupalloc_42_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __ham_groupalloc_42_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __ham_groupalloc_42_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ ON_NOT_SUPPORTED(env, lvh, *lsnp, argp->type);
+ /* LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid); */
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __ham_groupalloc_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_groupalloc_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __ham_groupalloc_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ VRFY_FILELIFE *pflife;
+ int ret;
+
+ ret = 0;
+ pflife = NULL;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __ham_groupalloc_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE4 /* No pages are locked by txns. */
+
+ /*
+ * The __ham_groupalloc record is only generated when creating
+ * the hash subdatabase, so it will always be on the master
+ * database's fileid.
+ */
+
+ if ((ret = __get_filelife(lvh, argp->fileid, &pflife)) != 0)
+ goto err;
+
+ if (pflife->meta_pgno != PGNO_BASE_MD) {
+ __db_errx(lvh->dbenv->env, DB_STR_A("2546",
+ "[%lu][%lu] __ham_groupalloc should apply only to the "
+ "master database with meta page number 0, current meta "
+ "page number is %d.", "%lu %lu %d"),
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ pflife->meta_pgno);
+ ret = DB_LOG_VERIFY_BAD;
+ ON_ERROR(lvh, DB_LOG_VERIFY_ERR);
+ }
+
+out:
+
+err:
+ if (pflife != NULL)
+ __os_free(lvh->dbenv->env, pflife);
+
+ __os_free(env, argp);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __ham_changeslot_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_changeslot_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __ham_changeslot_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __ham_changeslot_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE4 /* No pages are locked by txns. */
+ if ((ret = __lv_on_ham_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __ham_contract_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_contract_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __ham_contract_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __ham_contract_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+ if ((ret = __lv_on_ham_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+
+out:
+
+err:
+
+ __os_free(env, argp);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __ham_curadj_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_curadj_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __ham_curadj_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __ham_curadj_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+ if ((ret = __lv_on_ham_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __ham_chgpg_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__ham_chgpg_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __ham_chgpg_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __ham_chgpg_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE4 /* No pages are locked by txns. */
+ if ((ret = __lv_on_ham_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+
+out:
+
+err:
+
+ __os_free(env, argp);
+ return (ret);
+}
+#endif
+
+#ifdef HAVE_HEAP
+/*
+ * PUBLIC: int __heap_addrem_verify
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__heap_addrem_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __heap_addrem_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __heap_addrem_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+ if ((ret = __lv_on_heap_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+out:
+
+err:
+ __os_free(env, argp);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __heap_pg_alloc_verify
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__heap_pg_alloc_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __heap_pg_alloc_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __heap_pg_alloc_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+ if ((ret = __lv_on_heap_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+out:
+
+err:
+ __os_free(env, argp);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __heap_trunc_meta_verify
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__heap_trunc_meta_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __heap_trunc_meta_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __heap_trunc_meta_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+ if ((ret = __lv_on_heap_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+out:
+
+err:
+
+ __os_free(env, argp);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __heap_trunc_page_verify
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__heap_trunc_page_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __heap_trunc_page_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __heap_trunc_page_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+ if ((ret = __lv_on_heap_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+out:
+
+err:
+ __os_free(env, argp);
+ return (ret);
+}
+#endif
+
+#ifdef HAVE_QUEUE
+/*
+ * PUBLIC: int __qam_incfirst_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__qam_incfirst_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __qam_incfirst_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __qam_incfirst_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ if ((ret = __lv_on_qam_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __qam_mvptr_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__qam_mvptr_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __qam_mvptr_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __qam_mvptr_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ if ((ret = __lv_on_qam_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __qam_del_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__qam_del_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __qam_del_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __qam_del_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ if ((ret = __lv_on_qam_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __qam_add_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__qam_add_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __qam_add_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __qam_add_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ if ((ret = __lv_on_qam_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __qam_delext_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__qam_delext_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __qam_delext_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __qam_delext_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID);
+ if ((ret = __lv_on_qam_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+
+out:
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+#endif
+
+/*
+ * PUBLIC: int __txn_regop_42_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__txn_regop_42_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __txn_regop_42_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret = __txn_regop_42_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ ON_NOT_SUPPORTED(env, lvh, *lsnp, argp->type);
+ /* LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID); */
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __txn_regop_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__txn_regop_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __txn_regop_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret, ret2, started;
+ VRFY_TXN_INFO *ptvi, *pptvi;
+ VRFY_TIMESTAMP_INFO tsinfo;
+
+ ptvi = pptvi = NULL;
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+ ret = ret2 = started = 0;
+
+ if ((ret = __txn_regop_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ /*
+ * The __lv_log_fwdscr_oncmt call must precede LOG_VRFY_PROC,
+ * otherwise this txn would be taken as an aborted txn.
+ */
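+ /*
+ * When scanning with DB_LOG_VERIFY_FORWARD set we only record
+ * this commit and its timestamp for later use; the detailed
+ * checks below run in the main verification pass.
+ */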
+ if (F_ISSET(lvh, DB_LOG_VERIFY_FORWARD)) {
+ if ((ret = __lv_log_fwdscr_oncmt(lvh, *lsnp,
+ argp->txnp->txnid, 0, argp->timestamp)) != 0)
+ goto err;
+
+ tsinfo.lsn = *lsnp;
+ tsinfo.timestamp = argp->timestamp;
+ tsinfo.logtype = argp->type;
+ if ((ret = __put_timestamp_info(lvh, &tsinfo)) != 0)
+ goto err;
+ goto out; /* We are done. */
+ }
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID);
+ if ((ret = __del_txn_pages(lvh, argp->txnp->txnid)) != 0 &&
+ ret != DB_NOTFOUND)
+ goto err;/* Some txns may have updated no pages. */
+ if ((ret = __lv_on_timestamp(lvh, lsnp, argp->timestamp,
+ DB___txn_regop)) != 0)
+ goto err;
+ if ((ret = __get_txn_vrfy_info(lvh, argp->txnp->txnid, &ptvi)) != 0 &&
+ ret != DB_NOTFOUND)
+ goto err;
+ if (ret == DB_NOTFOUND && !F_ISSET(lvh, DB_LOG_VERIFY_PARTIAL)) {
+ if (!IS_ZERO_LSN(lvh->lv_config->start_lsn) &&
+ (ret2 = __txn_started(lvh, lvh->lv_config->start_lsn,
+ argp->txnp->txnid, &started)) == 0 && started != 0) {
+ ret = 0;
+ goto err;
+ }
+ if (ret2 != 0)
+ ret = ret2;
+ __db_errx(lvh->dbenv->env, DB_STR_A("2547",
+ "[%lu][%lu] Can not find an active transaction's "
+ "information, txnid: %lx.", "%lu %lu %lx"),
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (u_long)argp->txnp->txnid);
+ ON_ERROR(lvh, DB_LOG_VERIFY_INTERR);
+
+ }
+
+ if (ptvi == NULL) {
+ if (ret == DB_NOTFOUND &&
+ F_ISSET(lvh, DB_LOG_VERIFY_PARTIAL))
+ ret = 0;
+ goto out;
+
+ }
+ DB_ASSERT(env, ptvi->ptxnid == 0);
+
+ /*
+ * This log record is only logged when committing an outermost
+ * txn; child txn commits are logged by __txn_child_log.
+ */
+ if (ptvi->ptxnid == 0) {
+ if (ptvi->status == TXN_STAT_PREPARE)
+ lvh->ntxn_prep--;
+ else if (ptvi->status == TXN_STAT_ACTIVE)
+ lvh->ntxn_active--;
+ lvh->ntxn_commit++;
+ }
+ ptvi->status = TXN_STAT_COMMIT;
+ DB_ASSERT(env, IS_ZERO_LSN(ptvi->last_lsn));
+ ptvi->last_lsn = *lsnp;
+ if ((ret = __put_txn_vrfy_info(lvh, ptvi)) != 0)
+ goto err;
+
+ /* Report txn stats. */
+ if (F_ISSET(lvh, DB_LOG_VERIFY_VERBOSE))
+ __db_msg(env, DB_STR_A("2548",
+ "[%lu][%lu] The number of active, committed and aborted "
+ "child txns of txn %lx: %u, %u, %u.",
+ "%lu %lu %lx %u %u %u"), (u_long)lsnp->file,
+ (u_long)lsnp->offset, (u_long)ptvi->txnid,
+ ptvi->nchild_active, ptvi->nchild_commit,
+ ptvi->nchild_abort);
+out:
+err:
+
+ if (pptvi != NULL && (ret2 = __free_txninfo(pptvi)) != 0 && ret == 0)
+ ret = ret2;
+ if (ptvi != NULL && (ret2 = __free_txninfo(ptvi)) != 0 && ret == 0)
+ ret = ret2;
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __txn_ckp_42_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__txn_ckp_42_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __txn_ckp_42_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret = __txn_ckp_42_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ ON_NOT_SUPPORTED(env, lvh, *lsnp, argp->type);
+ /* LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID); */
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __txn_ckp_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__txn_ckp_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __txn_ckp_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ VRFY_CKP_INFO *lastckp, ckpinfo;
+ int ret;
+ struct __ckp_verify_params cvp;
+ VRFY_TIMESTAMP_INFO tsinfo;
+ char timebuf[CTIME_BUFLEN];
+ time_t ckp_time, lastckp_time;
+
+ lastckp = NULL;
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+ memset(&ckpinfo, 0, sizeof(ckpinfo));
+ memset(&cvp, 0, sizeof(cvp));
+
+ if ((ret = __txn_ckp_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID);
+
+ if (F_ISSET(lvh, DB_LOG_VERIFY_FORWARD)) {
+ tsinfo.lsn = *lsnp;
+ tsinfo.timestamp = argp->timestamp;
+ tsinfo.logtype = argp->type;
+ /*
+ * Store the first ckp_lsn, or the smallest one greater than
+ * the starting point; skip the update if a start LSN was given
+ * and is greater than this ckp_lsn. There will be no partial
+ * txns after valid_lsn.
+ */
+ if (!(!IS_ZERO_LSN(lvh->lv_config->start_lsn) &&
+ LOG_COMPARE(&(lvh->lv_config->start_lsn),
+ &(argp->ckp_lsn)) > 0))
+ lvh->valid_lsn = argp->ckp_lsn;
+ if ((ret = __put_timestamp_info(lvh, &tsinfo)) != 0)
+ goto err;
+ goto out;/* We are done, exit. */
+ }
+ lvh->nckp++;
+ ckp_time = (time_t)argp->timestamp;
+ __db_msg(env, DB_STR_A("2549",
+ "[%lu][%lu] Checkpoint record, ckp_lsn: [%lu][%lu], "
+ "timestamp: %s. Total checkpoint: %u",
+ "%lu %lu %lu %lu %s %u"), (u_long)lsnp->file,
+ (u_long)lsnp->offset, (u_long)argp->ckp_lsn.file,
+ (u_long)argp->ckp_lsn.offset,
+ __os_ctime(&ckp_time, timebuf), lvh->nckp);
+
+ if ((ret = __lv_on_timestamp(lvh, lsnp,
+ argp->timestamp, DB___txn_ckp)) != 0)
+ goto err;
+ if (((ret = __get_last_ckp_info(lvh, &lastckp)) != 0) &&
+ ret != DB_NOTFOUND)
+ goto err;
+ if (ret == DB_NOTFOUND)
+ goto cont;
+
+ if (LOG_COMPARE(&(argp->last_ckp), &(lastckp->lsn)) != 0) {
+ __db_errx(env, DB_STR_A("2550",
+ "[%lu][%lu] Last known checkpoint [%lu][%lu] not equal "
+ "to last_ckp :[%lu][%lu]. Some checkpoint log records "
+ "may be missing.", "%lu %lu %lu %lu %lu %lu"),
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (u_long)lastckp->lsn.file, (u_long)lastckp->lsn.offset,
+ (u_long)argp->last_ckp.file, (u_long)argp->last_ckp.offset);
+ ret = DB_LOG_VERIFY_BAD;
+ ON_ERROR(lvh, DB_LOG_VERIFY_ERR);
+ }
+
+ /*
+ * Checkpoints are generally not performed very often, so we
+ * treat this as an error, whereas for txn commits we treat it
+ * as a warning.
+ */
+ lastckp_time = (time_t)lastckp->timestamp;
+ if (argp->timestamp < lastckp->timestamp) {
+ __db_errx(env, DB_STR_A("2551",
+ "[%lu][%lu] Last known checkpoint [%lu, %lu] has a "
+ "timestamp %s smaller than this checkpoint timestamp %s.",
+ "%lu %lu %lu %lu %s %s"), (u_long)lsnp->file,
+ (u_long)lsnp->offset, (u_long)lastckp->lsn.file,
+ (u_long)lastckp->lsn.offset,
+ __os_ctime(&lastckp_time, timebuf),
+ __os_ctime(&ckp_time, timebuf));
+ ret = DB_LOG_VERIFY_BAD;
+ ON_ERROR(lvh, DB_LOG_VERIFY_ERR);
+ }
+
+cont:
+ cvp.env = env;
+ cvp.lsn = *lsnp;
+ cvp.ckp_lsn = argp->ckp_lsn;
+
+ /*
+ * Verify that each active txn's first lsn is greater than
+ * argp->ckp_lsn.
+ */
+ if ((ret = __iterate_txninfo(lvh, 0, 0,
+ __lv_ckp_vrfy_handler, &cvp)) != 0)
+ ON_ERROR(lvh, DB_LOG_VERIFY_ERR);
+ ckpinfo.timestamp = argp->timestamp;
+ ckpinfo.lsn = *lsnp;
+ ckpinfo.ckplsn = argp->ckp_lsn;
+
+ if ((ret = __put_ckp_info(lvh, &ckpinfo)) != 0)
+ goto err;
+out:
+err:
+ if (argp)
+ __os_free(env, argp);
+ if (lastckp)
+ __os_free(env, lastckp);
+ return (ret);
+}
+
+static int
+__lv_ckp_vrfy_handler(lvinfo, txninfop, param)
+ DB_LOG_VRFY_INFO *lvinfo;
+ VRFY_TXN_INFO *txninfop;
+ void *param;
+{
+ struct __ckp_verify_params *cvp;
+ int ret;
+
+ ret = 0;
+ cvp = (struct __ckp_verify_params *)param;
+ /* ckp_lsn should be less than any active txn's first lsn. */
+ if (txninfop->status == TXN_STAT_ACTIVE && LOG_COMPARE(&(cvp->ckp_lsn),
+ &(txninfop->first_lsn)) >= 0) {
+ __db_errx(cvp->env, DB_STR_A("2552",
+ "[%lu][%lu] ckp log's ckp_lsn [%lu][%lu] greater than "
+ "active txn %lx 's first lsn [%lu][%lu]",
+ "%lu %lu %lu %lu %lx %lu %lu"),
+ (u_long)cvp->lsn.file, (u_long)cvp->lsn.offset,
+ (u_long)cvp->ckp_lsn.file, (u_long)cvp->ckp_lsn.offset,
+ (u_long)txninfop->txnid,
+ (u_long)txninfop->first_lsn.file,
+ (u_long)txninfop->first_lsn.offset);
+ lvinfo->flags |= DB_LOG_VERIFY_ERR;
+ if (!F_ISSET(lvinfo, DB_LOG_VERIFY_CAF))
+ /* Stop the iteration. */
+ ret = DB_LOG_VERIFY_BAD;
+ }
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __txn_child_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__txn_child_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __txn_child_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ VRFY_TXN_INFO *ptvi, *ptvi2;
+ int ret, ret2, started;
+
+ /*
+ * This function is called when a txn T0's child txn T1 commits.
+ * Before this log record we don't know T0 and T1's relationship,
+ * which means we never see T0 with an active child txn T1; all
+ * child txns we know of are committed.
+ */
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+ ptvi = ptvi2 = NULL;
+ ret = ret2 = started = 0;
+
+ if ((ret = __txn_child_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ /*
+ * The __lv_log_fwdscr_oncmt call must precede LOG_VRFY_PROC,
+ * otherwise this txn would be taken as an aborted txn.
+ */
+ if (F_ISSET(lvh, DB_LOG_VERIFY_FORWARD)) {
+ if ((ret = __lv_log_fwdscr_oncmt(lvh, argp->c_lsn, argp->child,
+ argp->txnp->txnid, 0)) != 0)
+ goto err;
+ if ((ret = __lv_log_fwdscr_onrec(lvh, argp->txnp->txnid,
+ argp->type, argp->prev_lsn, *lsnp)) != 0)
+ goto err;
+ goto out;/* We are done. */
+ }
+ LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID);
+ if ((ret = __return_txn_pages(lvh, argp->child,
+ argp->txnp->txnid)) != 0 && ret != DB_NOTFOUND)
+ goto err;/* Some txns may have updated no pages. */
+
+ /* Update parent txn info. */
+ if ((ret = __get_txn_vrfy_info(lvh, argp->txnp->txnid, &ptvi)) != 0 &&
+ ret != DB_NOTFOUND)
+ goto err;
+ if (ret == DB_NOTFOUND && !F_ISSET(lvh, DB_LOG_VERIFY_PARTIAL)) {
+ if (!IS_ZERO_LSN(lvh->lv_config->start_lsn) &&
+ ((ret2 = __txn_started(lvh, lvh->lv_config->start_lsn,
+ argp->txnp->txnid, &started)) == 0) && started != 0) {
+ ret = 0;
+ goto err;
+ }
+ if (ret2 != 0)
+ ret = ret2;
+ __db_errx(lvh->dbenv->env, DB_STR_A("2553",
+ "[%lu][%lu] Can not find an active transaction's "
+ "information, txnid: %lx.", "%lu %lu %lx"),
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (u_long)argp->txnp->txnid);
+ ON_ERROR(lvh, DB_LOG_VERIFY_INTERR);
+
+ }
+ if (ptvi == NULL) {
+ if (ret == DB_NOTFOUND &&
+ F_ISSET(lvh, DB_LOG_VERIFY_PARTIAL))
+ ret = 0;
+ goto out;
+
+ }
+ ptvi->nchild_commit++;
+ /*
+ * The start of this child txn caused lvh->ntxn_active to be
+ * incremented unnecessarily, so decrement it.
+ */
+ lvh->ntxn_active--;
+ if (ptvi->status != TXN_STAT_ACTIVE) {
+ __db_errx(lvh->dbenv->env, DB_STR_A("2554",
+ "[%lu][%lu] Parent txn %lx ended "
+ "before child txn %lx ends.", "%lu %lu %lx %lx"),
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (u_long)argp->txnp->txnid, (u_long)argp->child);
+ ON_ERROR(lvh, DB_LOG_VERIFY_ERR);
+ }
+ if ((ret = __put_txn_vrfy_info(lvh, ptvi)) != 0)
+ goto err;
+
+ /* Update child txn info. */
+ if ((ret = __get_txn_vrfy_info(lvh, argp->child, &ptvi2)) != 0 &&
+ ret != DB_NOTFOUND)
+ goto err;
+ if (ret == DB_NOTFOUND && !F_ISSET(lvh, DB_LOG_VERIFY_PARTIAL)) {
+ if (!IS_ZERO_LSN(lvh->lv_config->start_lsn) &&
+ ((ret2 = __txn_started(lvh, lvh->lv_config->start_lsn,
+ argp->child, &started)) == 0) && started != 0) {
+ ret = 0;
+ goto err;
+ }
+ if (ret2 != 0)
+ ret = ret2;
+ __db_errx(lvh->dbenv->env, DB_STR_A("2555",
+ "[%lu][%lu] Can not find an active "
+ "transaction's information, txnid: %lx.",
+ "%lu %lu %lx"), (u_long)lsnp->file,
+ (u_long)lsnp->offset, (u_long)argp->child);
+ ON_ERROR(lvh, DB_LOG_VERIFY_INTERR);
+
+ }
+ if (ptvi2 == NULL) {
+ if (ret == DB_NOTFOUND &&
+ F_ISSET(lvh, DB_LOG_VERIFY_PARTIAL))
+ ret = 0;
+ goto out;
+
+ }
+ if (ptvi2->status != TXN_STAT_ACTIVE) {
+ __db_errx(lvh->dbenv->env, DB_STR_A("2556",
+ "[%lu][%lu] Txn %lx ended before it commits.",
+ "%lu %lu %lx"), (u_long)lsnp->file,
+ (u_long)lsnp->offset, (u_long)argp->child);
+ ON_ERROR(lvh, DB_LOG_VERIFY_ERR);
+ }
+ ptvi2->status = TXN_STAT_COMMIT;
+ if ((ret = __put_txn_vrfy_info(lvh, ptvi2)) != 0)
+ goto err;
+out:
+err:
+ __os_free(env, argp);
+ if (ptvi != NULL && (ret2 = __free_txninfo(ptvi)) != 0 && ret == 0)
+ ret = ret2;
+ if (ptvi2 != NULL && (ret2 = __free_txninfo(ptvi2)) != 0 && ret == 0)
+ ret = ret2;
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __txn_xa_regop_42_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__txn_xa_regop_42_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __txn_xa_regop_42_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret = __txn_xa_regop_42_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ ON_NOT_SUPPORTED(env, lvh, *lsnp, argp->type);
+ /* LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID); */
+err:
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __txn_prepare_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__txn_prepare_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __txn_prepare_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ VRFY_TXN_INFO *ptvi;
+ int ret, ret2, started;
+
+ ret = ret2 = started = 0;
+ ptvi = NULL;
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret = __txn_prepare_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID);
+
+ if ((ret = __get_txn_vrfy_info(lvh, argp->txnp->txnid, &ptvi)) != 0 &&
+ ret != DB_NOTFOUND)
+ goto err;
+
+ if (ret == DB_NOTFOUND && !F_ISSET(lvh, DB_LOG_VERIFY_PARTIAL)) {
+ if (!IS_ZERO_LSN(lvh->lv_config->start_lsn) &&
+ ((ret2 = __txn_started(lvh, lvh->lv_config->start_lsn,
+ argp->txnp->txnid, &started)) == 0) && started != 0) {
+ ret = 0;
+ goto err;
+ }
+ if (ret2 != 0)
+ ret = ret2;
+ __db_errx(lvh->dbenv->env, DB_STR_A("2557",
+ "[%lu][%lu] Can not find an active transaction's "
+ "information, txnid: %lx.", "%lu %lu %lx"),
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (u_long)argp->txnp->txnid);
+ ON_ERROR(lvh, DB_LOG_VERIFY_INTERR);
+
+ }
+ if (ptvi == NULL) {
+ if (ret == DB_NOTFOUND &&
+ F_ISSET(lvh, DB_LOG_VERIFY_PARTIAL))
+ ret = 0;
+ goto out;
+
+ }
+ DB_ASSERT(env,
+ (IS_ZERO_LSN(ptvi->prep_lsn) && ptvi->status != TXN_STAT_PREPARE) ||
+ (!IS_ZERO_LSN(ptvi->prep_lsn) && ptvi->status == TXN_STAT_PREPARE));
+
+ lvh->ntxn_prep++;
+ lvh->ntxn_active--;
+
+ if (!IS_ZERO_LSN(ptvi->prep_lsn)) {/* Prepared more than once. */
+
+ __db_errx(lvh->dbenv->env, DB_STR_A("2558",
+ "[%lu][%lu] Multiple txn_prepare log record for "
+ "transaction %lx, previous prepare lsn: [%lu, %lu].",
+ "%lu %lu %lx %lu %lu"), (u_long)lsnp->file,
+ (u_long)lsnp->offset, (u_long)argp->txnp->txnid,
+ (u_long)ptvi->prep_lsn.file, (u_long)ptvi->prep_lsn.offset);
+ } else {
+ ptvi->prep_lsn = *lsnp;
+ ptvi->status = TXN_STAT_PREPARE;
+ }
+ ret = __put_txn_vrfy_info(lvh, ptvi);
+out:
+err:
+ __os_free(env, argp);
+ if (ptvi != NULL && (ret2 = __free_txninfo(ptvi)) != 0 && ret == 0)
+ ret = ret2;
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __txn_recycle_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__txn_recycle_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __txn_recycle_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ notused2 = DB_TXN_LOG_VERIFY;
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+ ret = 0;
+
+ if ((ret = __txn_recycle_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID);
+
+ /* Add recycle info for all txns whose ID is in the [min, max] range. */
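+ /*
+ * A txn id in [min, max] may legally reappear later; recording
+ * this recycle LSN lets __lv_on_new_txn treat such a reuse as
+ * valid rather than as an illegal duplicate txn id.
+ */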
+ ret = __add_recycle_lsn_range(lvh, lsnp, argp->min, argp->max);
+
+out:
+
+err:
+
+ __os_free(env, argp);
+ return (ret);
+}
+
+/* Handle log types having timestamps; so far only __txn_ckp and __txn_regop. */
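+/*
+ * Timestamps are expected to be non-decreasing in LSN order; a
+ * violation is only reported as a warning, and only in verbose mode,
+ * presumably because wall-clock time can legitimately move backwards.
+ */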
+static int
+__lv_on_timestamp(lvh, lsn, timestamp, logtype)
+ DB_LOG_VRFY_INFO *lvh;
+ const DB_LSN *lsn;
+ int32_t timestamp;
+ u_int32_t logtype;
+{
+ VRFY_TIMESTAMP_INFO *ltsinfo;
+ int ret;
+
+ ltsinfo = NULL;
+ ret = 0;
+ if ((ret = __get_latest_timestamp_info(lvh, *lsn, &ltsinfo)) == 0) {
+ DB_ASSERT(lvh->dbenv->env, ltsinfo != NULL);
+ if (ltsinfo->timestamp >= timestamp &&
+ F_ISSET(lvh, DB_LOG_VERIFY_VERBOSE)) {
+ __db_errx(lvh->dbenv->env, DB_STR_A("2559",
+ "[%lu][%lu] [WARNING] This log record of type %s "
+ "does not have a greater time stamp than "
+ "[%lu, %lu] of type %s", "%lu %lu %s %lu %lu %s"),
+ (u_long)lsn->file, (u_long)lsn->offset,
+ LOGTYPE_NAME(lvh, logtype),
+ (u_long)ltsinfo->lsn.file,
+ (u_long)ltsinfo->lsn.offset,
+ LOGTYPE_NAME(lvh, ltsinfo->logtype));
+ lvh->flags |= DB_LOG_VERIFY_WARNING;
+ }
+ }
+ if (ltsinfo != NULL)
+ __os_free(lvh->dbenv->env, ltsinfo);
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+
+ return (ret);
+}
+
+/*
+ * Called whenever the log record belongs to a transaction.
+ */
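+/*
+ * It bumps the per-type record count, begins tracking a txn on its
+ * first record (zero prev_lsn), verifies the prev_lsn back-chain,
+ * and records which db files the txn has updated.
+ */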
+static int
+__lv_on_txn_logrec(lvh, lsnp, prev_lsnp, txnp, type, dbregid)
+ DB_LOG_VRFY_INFO *lvh;
+ const DB_LSN *lsnp;
+ const DB_LSN *prev_lsnp;
+ const DB_TXN *txnp;
+ u_int32_t type;
+ int32_t dbregid;
+{
+ DBT fid;
+ VRFY_TXN_INFO *pvti;
+ u_int32_t txnid;
+ VRFY_FILEREG_INFO *fregp;
+ int ret, ret2, started;
+
+ ret = ret2 = started = 0;
+ pvti = NULL;
+ fregp = NULL;
+ lvh->lrtypes[type]++;/* Increment per-type log record count. */
+ txnid = txnp->txnid;
+ memset(&fid, 0, sizeof(fid));
+
+ if (dbregid == INVAL_DBREGID)
+ goto cont;
+ if ((ret = __get_filereg_by_dbregid(lvh, dbregid, &fregp)) != 0) {
+ if (ret == DB_NOTFOUND) {
+ /*
+ * It's likely that we are verifying a subset of logs
+ * and the DBREG_OPEN is outside the range.
+ */
+ if (!F_ISSET(lvh, DB_LOG_VERIFY_PARTIAL))
+ __db_msg(lvh->dbenv->env, DB_STR_A("2560",
+ "[%lu][%lu] Transaction %lx is updating a "
+ "db file %d not registered.",
+ "%lu %lu %lx %d"),
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (u_long)txnp->txnid, dbregid);
+ goto cont;
+ } else
+ goto err;
+ }
+
+ fid = fregp->fileid;
+cont:
+ if (IS_ZERO_LSN(*prev_lsnp) &&
+ (ret = __lv_on_new_txn(lvh, lsnp, txnp, type, dbregid, &fid)) != 0)
+ goto err;
+
+ if ((ret = __get_txn_vrfy_info(lvh, txnid, &pvti)) != 0 &&
+ ret != DB_NOTFOUND)
+ goto err;
+
+ /* If we can't find the txn, there is an internal error. */
+ if (ret == DB_NOTFOUND && !F_ISSET(lvh, DB_LOG_VERIFY_PARTIAL)) {
+ /*
+ * If verifying from the middle, it's expected that txns begun
+ * before the start LSN are not found.
+ */
+ if (!IS_ZERO_LSN(lvh->lv_config->start_lsn) && ((ret2 =
+ __txn_started(lvh, lvh->lv_config->start_lsn, txnid,
+ &started)) == 0) && started != 0) {
+ ret = 0;
+ goto out;/* We are done. */
+ }
+ if (ret2 != 0)
+ ret = ret2;
+
+ __db_errx(lvh->dbenv->env, DB_STR_A("2561",
+ "[%lu][%lu] Can not find an active transaction's "
+ "information, txnid: %lx.", "%lu %lu %lx"),
+ (u_long)lsnp->file, (u_long)lsnp->offset, (u_long)txnid);
+ ON_ERROR(lvh, DB_LOG_VERIFY_INTERR);
+ }
+
+ /* Can't proceed without the txn info. */
+ if (pvti == NULL) {
+ if (ret == DB_NOTFOUND && F_ISSET(lvh, DB_LOG_VERIFY_PARTIAL))
+ ret = 0;
+ goto out;
+ }
+
+ /* If the prev lsn is wrong, some log records may be missing. */
+ if (!IS_ZERO_LSN(*prev_lsnp) &&
+ LOG_COMPARE(prev_lsnp, &(pvti->cur_lsn)) != 0) {
+ __db_errx(lvh->dbenv->env, DB_STR_A("2562",
+ "[%lu][%lu] Previous record for transaction %lx is "
+ "[%lu][%lu] and prev_lsn is [%lu][%lu].",
+ "%lu %lu %lx %lu %lu %lu %lu"), (u_long)lsnp->file,
+ (u_long)lsnp->offset, (u_long)pvti->txnid,
+ (u_long)pvti->cur_lsn.file, (u_long)pvti->cur_lsn.offset,
+ (u_long)prev_lsnp->file, (u_long)prev_lsnp->offset);
+ ret = DB_LOG_VERIFY_BAD;
+ ON_ERROR(lvh, DB_LOG_VERIFY_ERR);
+ }
+
+ /*
+ * After the txn is prepared, the only valid log record for this txn
+ * is the commit record.
+ */
+ if (pvti->status == TXN_STAT_PREPARE && type != DB___txn_regop) {
+ __db_errx(lvh->dbenv->env, DB_STR_A("2563",
+ "[%lu][%lu] Update action is performed in a "
+ "prepared transaction %lx.", "%lu %lu %lx"),
+ (u_long)lsnp->file, (u_long)lsnp->offset, (u_long)txnid);
+ ret = DB_LOG_VERIFY_BAD;
+ ON_ERROR(lvh, DB_LOG_VERIFY_ERR);
+ }
+ pvti->cur_lsn = *lsnp;
+ pvti->flags = txnp->flags;
+ if (dbregid != INVAL_DBREGID && fid.size > 0 &&
+ (ret = __add_file_updated(pvti, &fid, dbregid)) != 0)
+ goto err;
+ if ((ret = __put_txn_vrfy_info(lvh, pvti)) != 0)
+ goto err;
+out:
+err:
+ if (pvti != NULL && (ret2 = __free_txninfo(pvti)) != 0 && ret == 0)
+ ret = ret2;
+ if (fregp != NULL &&
+ (ret2 = __free_filereg_info(fregp)) != 0 && ret == 0)
+ ret = ret2;
+ return (ret);
+}
+
+/*
+ * Called whenever a new transaction is started, including child transactions.
+ */
+static int
+__lv_on_new_txn(lvh, lsnp, txnp, type, dbregid, fid)
+ DB_LOG_VRFY_INFO *lvh;
+ const DB_LSN *lsnp;
+ const DB_TXN *txnp;
+ u_int32_t type;
+ int32_t dbregid;
+ const DBT *fid;
+{
+ VRFY_TXN_INFO vti, *pvti, *vtip;
+ int ret, tret;
+ u_int32_t txnid;
+ ENV *env;
+
+ ret = tret = 0;
+ txnid = txnp->txnid;
+ pvti = NULL;
+ memset(&vti, 0, sizeof(vti));
+ vti.txnid = txnid;
+ env = lvh->dbenv->env;
+ /* Log record type, may be used later. Pass lint checks. */
+ COMPQUIET(type, 0);
+
+ /*
+ * It's possible that the new txn is a child txn, we will decrement
+ * this value in __txn_child_verify when we realize this, because
+ * this value only records the number of outermost active txns.
+ */
+ lvh->ntxn_active++;
+
+ if ((ret = __get_txn_vrfy_info(lvh, txnid, &pvti)) != 0 &&
+ ret != DB_NOTFOUND)
+ goto err;
+ if (ret == DB_NOTFOUND)
+ vtip = &vti;
+ else {/* The txnid is reused, which may be illegal. */
+ vtip = pvti;
+ /*
+ * If this txn id was recycled, this use is legal. A
+ * recycled txnid immediately stops being recyclable once
+ * it's reused here. And it's impossible for vtip->status
+ * to be TXN_STAT_ACTIVE, since we set it to TXN_STAT_ABORT
+ * when we detected this txn id recycle just now.
+ */
+ if (vtip->num_recycle > 0 && LOG_COMPARE(&(vtip->recycle_lsns
+ [vtip->num_recycle - 1]), lsnp) < 0) {
+ DB_ASSERT(env, vtip->status != TXN_STAT_ACTIVE);
+ if ((ret = __rem_last_recycle_lsn(vtip)) != 0)
+ goto err;
+ if ((ret = __clear_fileups(vtip)) != 0)
+ goto err;
+
+ vtip->status = 0;
+ ZERO_LSN(vtip->prep_lsn);
+ ZERO_LSN(vtip->last_lsn);
+
+ vtip->nchild_active = 0;
+ vtip->nchild_commit = 0;
+ vtip->nchild_abort = 0;
+ /*
+ * We may reach the else branch if this txn has child txns
+ * before any updates are done on its behalf, so we must
+ * exclude this possibility before concluding that the
+ * verification failed.
+ */
+ } else if (vtip->nchild_active + vtip->nchild_commit +
+ vtip->nchild_abort == 0) {
+ __db_errx(lvh->dbenv->env, DB_STR_A("2564",
+ "[%lu][%lu] Transaction id %lx reused without "
+ "being recycled with a __txn_recycle.",
+ "%lu %lu %lx"),
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (u_long)txnid);
+ ret = DB_LOG_VERIFY_BAD;
+ ON_ERROR(lvh, DB_LOG_VERIFY_ERR);
+ }
+ }
+
+ vtip->first_lsn = *lsnp;
+ vtip->cur_lsn = *lsnp;
+ vtip->flags = txnp->flags;
+
+ /*
+ * It's possible that the first log rec does not update any file,
+ * like the __txn_child type of record.
+ */
+ if (fid->size > 0 && (ret =
+ __add_file_updated(vtip, fid, dbregid)) != 0)
+ goto err;
+ if ((ret = __put_txn_vrfy_info(lvh, vtip)) != 0)
+ goto err;
+
+err:
+ if (pvti != NULL && (tret = __free_txninfo(pvti)) != 0 && ret == 0)
+ ret = tret;
+ if ((tret = __free_txninfo_stack(&vti)) != 0 && ret == 0)
+ ret = tret;
+
+ return (ret);
+}
+
+/* Called when we detect that a new log file is used. */
+static int
+__lv_new_logfile_vrfy(lvh, lsnp)
+ DB_LOG_VRFY_INFO *lvh;
+ const DB_LSN *lsnp;
+{
+ int ret;
+
+ ret = 0;
+ if (IS_ZERO_LSN(lvh->last_lsn) || lvh->last_lsn.file == lsnp->file) {
+ lvh->last_lsn = *lsnp;
+ return (0);
+ }
+
+ /*
+ * If the file number changed, it must have been incremented by
+ * one, and the offset must be the first offset of a log file.
+ */
+ if (lsnp->file - lvh->last_lsn.file != 1 || lsnp->offset !=
+ __lv_first_offset(lvh->dbenv->env)) {
+ __db_errx(lvh->dbenv->env,
+ "[%lu][%lu] Last log record verified ([%lu][%lu]) is not "
+ "immediately before the current log record.",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (u_long)lvh->last_lsn.file, (u_long)lvh->last_lsn.offset);
+ ret = DB_LOG_VERIFY_BAD;
+ ON_ERROR(lvh, DB_LOG_VERIFY_ERR);
+ }
+
+ lvh->last_lsn = *lsnp;
+err:
+ return (ret);
+}
+
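+/*
+ * Return the offset of the first log record in a log file: the log
+ * record header size (crypto or normal) plus the size of the LOGP
+ * persist metadata that begins every log file.
+ */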
+static u_int32_t
+__lv_first_offset(env)
+ ENV *env;
+{
+ u_int32_t sz;
+
+ if (CRYPTO_ON(env))
+ sz = HDR_CRYPTO_SZ;
+ else
+ sz = HDR_NORMAL_SZ;
+
+ sz += sizeof(LOGP);
+
+ return (sz);
+}
+
+/* Called when we see a non-transactional update log record. */
+static int
+__lv_on_nontxn_update(lvh, lsnp, txnid, logtype, fileid)
+ DB_LOG_VRFY_INFO *lvh;
+ const DB_LSN *lsnp;
+ u_int32_t txnid, logtype;
+ int32_t fileid;
+{
+ lvh->lrtypes[logtype]++;
+ COMPQUIET(txnid, 0);
+ if (fileid != INVAL_DBREGID) {
+ lvh->non_txnup_cnt++;
+ __db_msg(lvh->dbenv->env, DB_STR_A("2565",
+ "[%lu][%lu] Non-transactional update, "
+ "log type: %u, fileid: %d.", "%lu %lu %u %d"),
+ (u_long)lsnp->file, (u_long)lsnp->offset, logtype, fileid);
+ }
+
+ return (0);
+}
+
+static int
+__lv_on_txn_aborted(lvinfo)
+ DB_LOG_VRFY_INFO *lvinfo;
+{
+ int ret, ret2, sres;
+ VRFY_TXN_INFO *ptvi;
+ u_int32_t abtid;
+ DB_LSN lsn, slsn;
+
+ ret = ret2 = sres = 0;
+ abtid = lvinfo->aborted_txnid;
+ lsn = lvinfo->aborted_txnlsn;
+ slsn = lvinfo->lv_config->start_lsn;
+ ptvi = NULL;
+
+	if ((ret = __del_txn_pages(lvinfo, lvinfo->aborted_txnid)) != 0 &&
+	    ret != DB_NOTFOUND)
+		goto err;/* Tolerate DB_NOTFOUND: some txns update no pages. */
+ ret = __get_txn_vrfy_info(lvinfo, lvinfo->aborted_txnid, &ptvi);
+ if (ret == DB_NOTFOUND && !F_ISSET(lvinfo, DB_LOG_VERIFY_PARTIAL)) {
+ /*
+ * If verifying from slsn and the txn abtid started before
+ * slsn, it's expected that we can't find the txn.
+ */
+ if (!IS_ZERO_LSN(slsn) && (ret2 = __txn_started(lvinfo, slsn,
+ abtid, &sres)) == 0 && sres != 0) {
+ ret = 0;
+ goto err;
+ }
+ if (ret2 != 0)
+ ret = ret2;/* Use the same error msg below. */
+ __db_errx(lvinfo->dbenv->env, DB_STR_A("2566",
+ "[%lu][%lu] Can not find an active transaction's "
+ "information, txnid: %lx.", "%lu %lu %lx"),
+ (u_long)lsn.file, (u_long)lsn.offset,
+ (u_long)lvinfo->aborted_txnid);
+ ON_ERROR(lvinfo, DB_LOG_VERIFY_INTERR);
+ }
+ if (ptvi == NULL) {
+ if (ret == DB_NOTFOUND &&
+ F_ISSET(lvinfo, DB_LOG_VERIFY_PARTIAL))
+ ret = 0;
+ goto out;
+ }
+ ptvi->status = TXN_STAT_ABORT;
+ lvinfo->ntxn_abort++;
+ lvinfo->ntxn_active--;
+ /* Report txn stats. */
+ if (F_ISSET(lvinfo, DB_LOG_VERIFY_VERBOSE)) {
+ __db_msg(lvinfo->dbenv->env, DB_STR_A("2567",
+ "[%lu][%lu] Txn %lx aborted after this log record.",
+ "%lu %lu %lx"), (u_long)lvinfo->aborted_txnlsn.file,
+ (u_long)lvinfo->aborted_txnlsn.offset, (u_long)ptvi->txnid);
+ __db_msg(lvinfo->dbenv->env, DB_STR_A("2568",
+ "\tThe number of active, committed and aborted child txns "
+ "of txn %lx: %u, %u, %u.", "%lx %u %u %u"),
+ (u_long)ptvi->txnid, ptvi->nchild_active,
+ ptvi->nchild_commit, ptvi->nchild_abort);
+ }
+ lvinfo->aborted_txnid = 0;
+ lvinfo->aborted_txnlsn.file = lvinfo->aborted_txnlsn.offset = 0;
+ if ((ret = __put_txn_vrfy_info(lvinfo, ptvi)) != 0)
+ goto err;
+ if ((ret = __free_txninfo(ptvi)) != 0)
+ goto err;
+out:
+err:
+ return (ret);
+}
diff --git a/src/log/log_verify_stub.c b/src/log/log_verify_stub.c
new file mode 100644
index 00000000..e6589a50
--- /dev/null
+++ b/src/log/log_verify_stub.c
@@ -0,0 +1,79 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef HAVE_VERIFY
+
+#include "db_config.h"
+#include "db_int.h"
+
+static int __db_log_novrfy __P((ENV *));
+int __log_verify_pp __P((DB_ENV *, const DB_LOG_VERIFY_CONFIG *));
+int __log_verify __P((DB_ENV *, const DB_LOG_VERIFY_CONFIG *));
+int __log_verify_wrap __P((ENV *env, const char *, u_int32_t, const char *,
+ const char *, time_t, time_t, u_int32_t, u_int32_t, u_int32_t, u_int32_t,
+ int, int));
+
+/*
+ * __db_log_novrfy --
+ *	Error when a Berkeley DB build doesn't include log verification.
+ */
+static int
+__db_log_novrfy(env)
+ ENV *env;
+{
+ __db_errx(env, DB_STR("2523",
+ "library build did not include support for log verification"));
+ return (DB_OPNOTSUP);
+}
+
+int
+__log_verify_pp(dbenv, lvconfig)
+ DB_ENV *dbenv;
+ const DB_LOG_VERIFY_CONFIG *lvconfig;
+{
+ COMPQUIET(lvconfig, NULL);
+
+	/* The dbenv handle remains intact; callers take care of it. */
+ return (__db_log_novrfy(dbenv->env));
+}
+
+int
+__log_verify(dbenv, lvconfig)
+ DB_ENV *dbenv;
+ const DB_LOG_VERIFY_CONFIG *lvconfig;
+{
+ COMPQUIET(lvconfig, NULL);
+
+ return (__db_log_novrfy(dbenv->env));
+}
+
+int
+__log_verify_wrap(env, envhome, cachesize, dbfile, dbname,
+ stime, etime, stfile, stoffset, efile, eoffset, caf, verbose)
+ ENV *env;
+ const char *envhome, *dbfile, *dbname;
+ time_t stime, etime;
+ u_int32_t cachesize, stfile, stoffset, efile, eoffset;
+ int caf, verbose;
+{
+ COMPQUIET(envhome, NULL);
+ COMPQUIET(dbfile, NULL);
+ COMPQUIET(dbname, NULL);
+ COMPQUIET(stime, 0);
+ COMPQUIET(etime, 0);
+ COMPQUIET(cachesize, 0);
+ COMPQUIET(stfile, 0);
+ COMPQUIET(stoffset, 0);
+ COMPQUIET(efile, 0);
+ COMPQUIET(eoffset, 0);
+ COMPQUIET(caf, 0);
+ COMPQUIET(verbose, 0);
+ return (__db_log_novrfy(env));
+}
+
+#endif /* !HAVE_VERIFY */
diff --git a/src/log/log_verify_util.c b/src/log/log_verify_util.c
new file mode 100644
index 00000000..88682921
--- /dev/null
+++ b/src/log/log_verify_util.c
@@ -0,0 +1,2234 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+/*
+ * This file contains helper functions for the data structures and
+ * in-memory databases used to store log verification information.
+ */
+#include "db_config.h"
+#include "db_int.h"
+
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/qam.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+#include "dbinc/fop.h"
+
+#include "dbinc/log_verify.h"
+
+#define BDBOP(op) do { \
+ ret = (op); \
+ if (ret != 0) { \
+ __lv_on_bdbop_err(ret); \
+ goto err; \
+ } \
+} while (0)
+
+#define BDBOP2(dbenv, op, funct) do { \
+ ret = (op); \
+ if (ret != 0) { \
+ __lv_on_bdbop_err(ret); \
+ __db_err(dbenv->env, ret, "\n%s", funct); \
+ return (ret); \
+ } \
+} while (0)
+
+#define BDBOP3(dbenv, op, excpt, funct) do { \
+ ret = (op); \
+ if (ret != 0) { \
+ __lv_on_bdbop_err(ret); \
+ if (ret != excpt) { \
+ __db_err(dbenv->env, ret, "\n%s", funct); \
+ return (ret); \
+ } \
+ } \
+} while (0)
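+
+/*
+ * BDBOP wraps an internal Berkeley DB call whose failure should jump to
+ * the enclosing function's err label, e.g.
+ *	BDBOP(__db_put(dbp, ip, NULL, &key, &data, 0));
+ * BDBOP2 also reports the failing function and returns the error, and
+ * BDBOP3 does the same but tolerates one expected error code (excpt),
+ * typically DB_NOTFOUND.
+ */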
+
+typedef int (*btcmp_funct)(DB *, const DBT *, const DBT *);
+typedef int (*dupcmp_funct)(DB *, const DBT *, const DBT *);
+
+static int __lv_add_recycle_handler __P((
+ DB_LOG_VRFY_INFO *, VRFY_TXN_INFO *, void *));
+static int __lv_add_recycle_lsn __P((VRFY_TXN_INFO *, const DB_LSN *));
+static size_t __lv_dbt_arrsz __P((const DBT *, u_int32_t));
+static int __lv_fidpgno_cmp __P((DB *, const DBT *, const DBT *));
+static int __lv_i32_cmp __P((DB *, const DBT *, const DBT *));
+static int __lv_lsn_cmp __P((DB *, const DBT *, const DBT *));
+static void __lv_on_bdbop_err __P((int));
+static int __lv_open_db __P((DB_ENV *, DB **, DB_THREAD_INFO *,
+ const char *, int, btcmp_funct, u_int32_t, dupcmp_funct));
+static int __lv_pack_filereg __P((const VRFY_FILEREG_INFO *, DBT *));
+static int __lv_pack_txn_vrfy_info __P((
+ const VRFY_TXN_INFO *, DBT *, DBT *data));
+static int __lv_seccbk_fname __P((DB *, const DBT *, const DBT *, DBT *));
+static int __lv_seccbk_lsn __P((DB *, const DBT *, const DBT *, DBT *));
+static int __lv_seccbk_txnpg __P((DB *, const DBT *, const DBT *, DBT *));
+static void __lv_setup_logtype_names __P((DB_LOG_VRFY_INFO *lvinfo));
+static int __lv_txnrgns_lsn_cmp __P((DB *, const DBT *, const DBT *));
+static int __lv_ui32_cmp __P((DB *, const DBT *, const DBT *));
+static int __lv_unpack_txn_vrfy_info __P((VRFY_TXN_INFO **, const DBT *));
+static int __lv_unpack_filereg __P((const DBT *, VRFY_FILEREG_INFO **));
+
+static void
+__lv_on_bdbop_err(ret)
+	int ret;
+{
+	/* Keep lint quiet; ret and this function exist as a debugging hook. */
+ COMPQUIET(ret, 0);
+}
+
+/*
+ * __create_log_vrfy_info --
+ * Initialize and return a log verification handle to be used throughout
+ * a verification process.
+ *
+ * PUBLIC: int __create_log_vrfy_info __P((const DB_LOG_VERIFY_CONFIG *,
+ * PUBLIC: DB_LOG_VRFY_INFO **, DB_THREAD_INFO *));
+ */
+int
+__create_log_vrfy_info(cfg, lvinfopp, ip)
+ const DB_LOG_VERIFY_CONFIG *cfg;
+ DB_LOG_VRFY_INFO **lvinfopp;
+ DB_THREAD_INFO *ip;
+{
+ const char *envhome;
+ int inmem, ret;
+ u_int32_t cachesz, envflags;
+ const char *dbf1, *dbf2, *dbf3, *dbf4, *dbf5, *dbf6, *dbf7, *dbf8,
+ *dbf9, *dbf10, *dbf11;
+ DB_LOG_VRFY_INFO *lvinfop;
+
+ dbf1 = "__db_log_vrfy_txninfo.db";
+ dbf2 = "__db_log_vrfy_fileregs.db";
+ dbf3 = "__db_log_vrfy_pgtxn.db";
+ dbf4 = "__db_log_vrfy_lsntime.db";
+ dbf5 = "__db_log_vrfy_timelsn.db";
+ dbf6 = "__db_log_vrfy_ckps.db";
+ dbf7 = "__db_log_vrfy_dbregids.db";
+ dbf8 = "__db_log_vrfy_fnameuid.db";
+ dbf9 = "__db_log_vrfy_timerange.db";
+ dbf10 = "__db_log_vrfy_txnaborts.db";
+ dbf11 = "__db_log_vrfy_txnpg.db";
+
+ envhome = cfg->temp_envhome;
+ lvinfop = NULL;
+ cachesz = cfg->cachesize;
+	if (cachesz == 0)
+ cachesz = 1024 * 1024 * 256;
+
+ BDBOP(__os_malloc(NULL, sizeof(DB_LOG_VRFY_INFO), &lvinfop));
+ memset(lvinfop, 0, sizeof(DB_LOG_VRFY_INFO));
+ lvinfop->ip = ip;
+ __lv_setup_logtype_names(lvinfop);
+ /* Avoid the VERIFY_PARTIAL bit being cleared if no ckp_lsn exists. */
+ lvinfop->valid_lsn.file = lvinfop->valid_lsn.offset = (u_int32_t)-1;
+
+ /*
+ * The envhome parameter determines if we will use an in-memory
+ * environment and databases.
+ */
+ if (envhome == NULL) {
+ envflags = DB_PRIVATE;
+ inmem = 1;
+ } else {
+ envflags = 0;
+ inmem = 0;
+ }
+
+ /* Create log verify internal database environment. */
+ BDBOP(db_env_create(&lvinfop->dbenv, 0));
+ BDBOP(__memp_set_cachesize(lvinfop->dbenv, 0, cachesz, 1));
+ /*
+ * Log verification internal db environment should be accessed
+ * single-threaded. No transaction semantics needed.
+ */
+ BDBOP(__env_open(lvinfop->dbenv, envhome,
+ envflags | DB_CREATE | DB_INIT_MPOOL, 0666));
+
+ BDBOP(__lv_open_db(lvinfop->dbenv, &lvinfop->txninfo, ip, dbf1,
+ inmem, __lv_ui32_cmp, 0, NULL));
+ BDBOP(__lv_open_db(lvinfop->dbenv, &lvinfop->fileregs, ip, dbf2,
+ inmem, NULL, 0, NULL));
+
+	/* No dups allowed; data with the same key is always overwritten. */
+ BDBOP(__lv_open_db(lvinfop->dbenv, &lvinfop->dbregids, ip, dbf7,
+ inmem, __lv_i32_cmp, 0, NULL));
+ BDBOP(__lv_open_db(lvinfop->dbenv, &lvinfop->pgtxn, ip, dbf3,
+ inmem, __lv_fidpgno_cmp, 0, NULL));
+ BDBOP(__lv_open_db(lvinfop->dbenv, &lvinfop->txnpg, ip, dbf11,
+ inmem, __lv_ui32_cmp, DB_DUP | DB_DUPSORT, __lv_fidpgno_cmp));
+ BDBOP(__lv_open_db(lvinfop->dbenv, &lvinfop->lsntime, ip, dbf4,
+ inmem, __lv_lsn_cmp, 0, NULL));
+ BDBOP(__lv_open_db(lvinfop->dbenv, &lvinfop->timelsn, ip, dbf5,
+ inmem, __lv_i32_cmp, DB_DUP | DB_DUPSORT, __lv_lsn_cmp));
+ BDBOP(__lv_open_db(lvinfop->dbenv, &lvinfop->txnaborts, ip, dbf10,
+ inmem, __lv_lsn_cmp, 0, NULL));
+ BDBOP(__lv_open_db(lvinfop->dbenv, &lvinfop->ckps, ip, dbf6,
+ inmem, __lv_lsn_cmp, 0, NULL));
+ BDBOP(__lv_open_db(lvinfop->dbenv, &lvinfop->fnameuid, ip, dbf8,
+ inmem, NULL, 0, NULL));
+ BDBOP(__lv_open_db(lvinfop->dbenv, &lvinfop->txnrngs, ip, dbf9,
+ inmem, __lv_ui32_cmp, DB_DUP | DB_DUPSORT, __lv_txnrgns_lsn_cmp));
+
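+	/*
+	 * timelsn, fnameuid and txnpg are secondary indices of lsntime,
+	 * fileregs and pgtxn respectively; DB maintains them through the
+	 * callbacks passed to __db_associate below.
+	 */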
+ BDBOP(__db_associate(lvinfop->lsntime, ip, NULL,
+ lvinfop->timelsn, __lv_seccbk_lsn, DB_CREATE));
+ BDBOP(__db_associate(lvinfop->fileregs, ip, NULL,
+ lvinfop->fnameuid, __lv_seccbk_fname, DB_CREATE));
+ BDBOP(__db_associate(lvinfop->pgtxn, ip, NULL,
+ lvinfop->txnpg, __lv_seccbk_txnpg, DB_CREATE));
+
+ *lvinfopp = lvinfop;
+
+ return (0);
+err:
+ if (lvinfop->dbenv && ret != 0)
+ __db_err(lvinfop->dbenv->env, ret, "__create_log_vrfy_info");
+ (void)__destroy_log_vrfy_info(lvinfop);
+
+ return (ret);
+}
+
+/*
+ * __destroy_log_vrfy_info --
+ * Destroy and free a log verification handle.
+ *
+ * PUBLIC: int __destroy_log_vrfy_info __P((DB_LOG_VRFY_INFO *));
+ */
+int
+__destroy_log_vrfy_info(lvinfop)
+ DB_LOG_VRFY_INFO *lvinfop;
+{
+ int ret;
+
+ ret = 0;
+ if (lvinfop == NULL)
+ return (0);
+
+ if (lvinfop->txnaborts != NULL &&
+ (ret = __db_close(lvinfop->txnaborts, NULL, 0)) != 0)
+ goto err;
+ if (lvinfop->txninfo != NULL &&
+ (ret = __db_close(lvinfop->txninfo, NULL, 0)) != 0)
+ goto err;
+ if (lvinfop->dbregids != NULL &&
+ (ret = __db_close(lvinfop->dbregids, NULL, 0)) != 0)
+ goto err;
+ if (lvinfop->fileregs != NULL &&
+ (ret = __db_close(lvinfop->fileregs, NULL, 0)) != 0)
+ goto err;
+ if (lvinfop->pgtxn != NULL &&
+ (ret = __db_close(lvinfop->pgtxn, NULL, 0)) != 0)
+ goto err;
+ if (lvinfop->lsntime != NULL &&
+ (ret = __db_close(lvinfop->lsntime, NULL, 0)) != 0)
+ goto err;
+ if (lvinfop->ckps != NULL &&
+ (ret = __db_close(lvinfop->ckps, NULL, 0)) != 0)
+ goto err;
+ if (lvinfop->txnrngs != NULL &&
+ (ret = __db_close(lvinfop->txnrngs, NULL, 0)) != 0)
+ goto err;
+ if (lvinfop->fnameuid != NULL &&
+ (ret = __db_close(lvinfop->fnameuid, NULL, 0)) != 0)
+ goto err;
+ if (lvinfop->timelsn != NULL &&
+ (ret = __db_close(lvinfop->timelsn, NULL, 0)) != 0)
+ goto err;
+ if (lvinfop->txnpg != NULL &&
+ (ret = __db_close(lvinfop->txnpg, NULL, 0)) != 0)
+ goto err;
+ if (lvinfop->dbenv != NULL &&
+ (ret = __env_close(lvinfop->dbenv, 0)) != 0)
+ goto err;
+err:
+ __os_free(NULL, lvinfop);
+
+ return (ret);
+}
+
+/* Secondary index callback function for DB_LOG_VRFY_INFO->fnameuid. */
+static int
+__lv_seccbk_fname(secdb, key, data, result)
+ DB *secdb;
+ const DBT *key;
+ const DBT *data;
+ DBT *result;
+{
+ int ret, tret;
+ VRFY_FILEREG_INFO *freg;
+ char *buf;
+ size_t buflen, slen;
+
+ ret = tret = 0;
+ COMPQUIET(key, NULL);
+ if ((ret = __lv_unpack_filereg(data, &freg)) != 0)
+ goto out;
+ if (freg->fname == NULL || (slen = strlen(freg->fname)) == 0) {
+ ret = DB_DONOTINDEX;
+ goto out;
+ }
+
+ buflen = (slen + 1) * sizeof(char);
+ if ((ret = __os_umalloc(secdb->dbenv->env, buflen, &buf)) != 0)
+ goto out;
+ (void)strcpy(buf, freg->fname);
+ result->size = (u_int32_t)buflen;
+ result->flags |= DB_DBT_APPMALLOC;
+ result->data = buf;
+out:
+ if (freg != NULL && (tret = __free_filereg_info(freg)) != 0 && ret == 0)
+ ret = tret;
+ return (ret);
+}
+
+/* Secondary index callback function for DB_LOG_VRFY_INFO->txnpg. */
+static int
+__lv_seccbk_txnpg(secdb, key, data, result)
+ DB *secdb;
+ const DBT *key;
+ const DBT *data;
+ DBT *result;
+{
+ COMPQUIET(key, NULL);
+ COMPQUIET(secdb, NULL);
+ /* Txnid is the secondary key, and it's all the data dbt has. */
+ result->data = data->data;
+ result->size = data->size;
+
+ return (0);
+}
+
+/* Secondary index callback function for DB_LOG_VRFY_INFO->timelsn. */
+static int
+__lv_seccbk_lsn(secdb, key, data, result)
+ DB *secdb;
+ const DBT *key;
+ const DBT *data;
+ DBT *result;
+{
+ VRFY_TIMESTAMP_INFO *lvti;
+
+ COMPQUIET(key, NULL);
+ COMPQUIET(secdb, NULL);
+
+ lvti = (VRFY_TIMESTAMP_INFO *)data->data;
+ result->data = &(lvti->timestamp);
+ result->size = sizeof(lvti->timestamp);
+
+ return (0);
+}
+
+/*
+ * Open a BTREE database handle, optionally set the btree compare function
+ * and flags if any.
+ */
+static int
+__lv_open_db(dbenv, dbpp, ip, name, inmem, cmpf, sflags, dupcmpf)
+	DB_ENV *dbenv;
+	DB **dbpp;
+	DB_THREAD_INFO *ip;
+	const char *name;
+	int inmem;
+	btcmp_funct cmpf;
+	u_int32_t sflags;
+	dupcmp_funct dupcmpf;
+{
+ int ret;
+ const char *dbfname, *dbname;
+ DB *dbp;
+
+ dbp = NULL;
+ ret = 0;
+ if (inmem) {
+ dbfname = NULL;
+ dbname = name;
+ } else {
+ dbfname = name;
+ dbname = NULL;
+ }
+
+ BDBOP(db_create(&dbp, dbenv, 0));
+
+ if (cmpf != NULL)
+ BDBOP(__bam_set_bt_compare(dbp, cmpf));
+ if (dupcmpf != NULL)
+ dbp->dup_compare = dupcmpf;
+ if (sflags != 0)
+ BDBOP(__db_set_flags(dbp, sflags));
+	/* No concurrency is needed; a big page size reduces overflow pages. */
+ BDBOP(__db_set_pagesize(dbp, 16 * 1024));
+
+ BDBOP(__db_open(dbp, ip, NULL, dbfname, dbname, DB_BTREE, DB_CREATE,
+ 0666, PGNO_BASE_MD));
+
+ *dbpp = dbp;
+
+ return (0);
+err:
+ if (dbenv != NULL && ret != 0)
+ __db_err(dbenv->env, ret, "__lv_open_db");
+ if (dbp != NULL)
+ (void)__db_close(dbp, NULL, 0);
+
+ return (ret);
+}
+
+/* Btree compare function for a [fileid, pgno] key. */
+static int
+__lv_fidpgno_cmp(db, dbt1, dbt2)
+ DB *db;
+ const DBT *dbt1;
+ const DBT *dbt2;
+{
+ db_pgno_t pgno1, pgno2;
+ int ret;
+ size_t len;
+
+ COMPQUIET(db, NULL);
+ len = DB_FILE_ID_LEN;
+ ret = memcmp(dbt1->data, dbt2->data, len);
+ if (ret == 0) {
+ memcpy(&pgno1, (u_int8_t *)dbt1->data + len,
+ sizeof(pgno1));
+ memcpy(&pgno2, (u_int8_t *)dbt2->data + len,
+ sizeof(pgno2));
+ ret = NUMCMP(pgno1, pgno2);
+ }
+
+ return (ret);
+}
+
+/* Btree compare function for an int32_t type of key. */
+static int
+__lv_i32_cmp(db, dbt1, dbt2)
+ DB *db;
+ const DBT *dbt1;
+ const DBT *dbt2;
+{
+ int32_t k1, k2;
+
+ COMPQUIET(db, NULL);
+ memcpy(&k1, dbt1->data, sizeof(k1));
+ memcpy(&k2, dbt2->data, sizeof(k2));
+
+ return (NUMCMP(k1, k2));
+}
+
+/* Btree compare function for a u_int32_t type of key. */
+static int
+__lv_ui32_cmp(db, dbt1, dbt2)
+ DB *db;
+ const DBT *dbt1;
+ const DBT *dbt2;
+{
+ u_int32_t k1, k2;
+
+ COMPQUIET(db, NULL);
+ memcpy(&k1, dbt1->data, sizeof(k1));
+ memcpy(&k2, dbt2->data, sizeof(k2));
+
+ return (NUMCMP(k1, k2));
+}
+
+/* Btree compare function for a DB_LSN type of key. */
+static int
+__lv_lsn_cmp(db, dbt1, dbt2)
+ DB *db;
+ const DBT *dbt1;
+ const DBT *dbt2;
+{
+ DB_LSN lsn1, lsn2;
+
+ DB_ASSERT(db->env, dbt1->size == sizeof(DB_LSN));
+ DB_ASSERT(db->env, dbt2->size == sizeof(DB_LSN));
+ memcpy(&lsn1, dbt1->data, sizeof(DB_LSN));
+ memcpy(&lsn2, dbt2->data, sizeof(DB_LSN));
+
+ return (LOG_COMPARE(&lsn1, &lsn2));
+}
+
+/*
+ * Structure management routines. We keep each structure on a
+ * consecutive memory chunk.
+ *
+ * The get functions will allocate memory via __os_malloc, and callers
+ * should free the memory after use. The update functions for VRFY_TXN_INFO
+ * and VRFY_FILEREG_INFO may realloc the structure.
+ */
+
+/*
+ * PUBLIC: int __put_txn_vrfy_info __P((const DB_LOG_VRFY_INFO *,
+ * PUBLIC: const VRFY_TXN_INFO *));
+ */
+int
+__put_txn_vrfy_info(lvinfo, txninfop)
+ const DB_LOG_VRFY_INFO *lvinfo;
+ const VRFY_TXN_INFO *txninfop;
+{
+ int ret;
+ DBT key, data;
+
+ ret = __lv_pack_txn_vrfy_info(txninfop, &key, &data);
+ DB_ASSERT(lvinfo->dbenv->env, ret == 0);
+
+ BDBOP2(lvinfo->dbenv, __db_put(lvinfo->txninfo, lvinfo->ip, NULL,
+ &key, &data, 0), "__put_txn_vrfy_info");
+ __os_free(lvinfo->dbenv->env, data.data);
+
+ return (0);
+}
+
+/* Construct a key and data DBT from the structure. */
+static int
+__lv_pack_txn_vrfy_info(txninfop, key, data)
+ const VRFY_TXN_INFO *txninfop;
+ DBT *key, *data;
+{
+ int ret;
+ char *buf, *p;
+ size_t bufsz, len;
+ u_int32_t i;
+ DBT *pdbt;
+
+ memset(key, 0, sizeof(DBT));
+ memset(data, 0, sizeof(DBT));
+ ret = 0;
+ bufsz = TXN_VERIFY_INFO_TOTSIZE(*txninfop);
+
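+	/*
+	 * Serialized layout: the fixed-size prefix of VRFY_TXN_INFO, then
+	 * num_recycle DB_LSNs, then filenum (size, bytes) pairs, one pair
+	 * per updated file's uid DBT.
+	 */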
+ if ((ret = __os_malloc(NULL, bufsz, &buf)) != 0)
+ goto err;
+ memset(buf, 0, bufsz);
+ memcpy(buf, txninfop, TXN_VERIFY_INFO_FIXSIZE);
+ p = buf + TXN_VERIFY_INFO_FIXSIZE;
+ memcpy(p, txninfop->recycle_lsns, len = sizeof(DB_LSN) *
+ txninfop->num_recycle);
+ p += len;
+
+	for (i = 0; i < txninfop->filenum; i++) {
+ pdbt = &(txninfop->fileups[i]);
+ memcpy(p, &(pdbt->size), sizeof(pdbt->size));
+ p += sizeof(pdbt->size);
+ memcpy(p, pdbt->data, pdbt->size);
+ p += pdbt->size;
+ }
+
+ key->data = (void *)&txninfop->txnid;
+ key->size = sizeof(txninfop->txnid);
+ data->data = buf;
+ data->size = (u_int32_t)bufsz;
+ data->flags |= DB_DBT_MALLOC;
+err:
+ return (ret);
+}
+
+/* Calculate a DBT array's total number of bytes to store. */
+static size_t
+__lv_dbt_arrsz(arr, arrlen)
+ const DBT *arr;
+ u_int32_t arrlen;
+{
+ u_int32_t i;
+ size_t sz;
+
+ sz = 0;
+
+ /* For each DBT object, store its size and its data bytes. */
+ for (i = 0; i < arrlen; i++)
+ sz += arr[i].size + sizeof(arr[i].size);
+
+	return (sz);
+}
+
+/*
+ * __get_txn_vrfy_info --
+ * Get a VRFY_TXN_INFO object from db by txnid. Callers should free the
+ * object by calling __free_txninfo.
+ *
+ * PUBLIC: int __get_txn_vrfy_info __P((const DB_LOG_VRFY_INFO *, u_int32_t,
+ * PUBLIC: VRFY_TXN_INFO **));
+ */
+int
+__get_txn_vrfy_info(lvinfo, txnid, txninfopp)
+ const DB_LOG_VRFY_INFO *lvinfo;
+ u_int32_t txnid;
+ VRFY_TXN_INFO **txninfopp;
+{
+ int ret;
+ DBT key, data;
+
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+ key.data = &txnid;
+ key.size = sizeof(txnid);
+
+ BDBOP3(lvinfo->dbenv, __db_get(lvinfo->txninfo, lvinfo->ip, NULL,
+ &key, &data, 0), DB_NOTFOUND, "__get_txn_vrfy_info");
+
+ if (ret != DB_NOTFOUND)
+ ret = __lv_unpack_txn_vrfy_info(txninfopp, &data);
+
+ return (ret);
+}
+
+/* Construct a structure from a DBT. */
+static int
+__lv_unpack_txn_vrfy_info(txninfopp, data)
+ VRFY_TXN_INFO **txninfopp;
+ const DBT *data;
+{
+ size_t bufsz;
+ VRFY_TXN_INFO *buf, *txninfop;
+ DB_LSN *lsns, *p;
+ u_int32_t i, sz;
+ char *pb, *q;
+ int ret;
+
+ ret = 0;
+ i = sz = 0;
+ lsns = p = NULL;
+ pb = q = NULL;
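+	/*
+	 * The buffer mirrors the layout written by __lv_pack_txn_vrfy_info:
+	 * fixed-size prefix, recycle-lsn array, then (size, bytes) pairs.
+	 */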
+ txninfop = (VRFY_TXN_INFO *)data->data;
+ lsns = (DB_LSN *)((char *)data->data + TXN_VERIFY_INFO_FIXSIZE);
+ pb = (char *)lsns + txninfop->num_recycle * sizeof(DB_LSN);
+
+ if ((ret = __os_malloc(NULL, bufsz = sizeof(VRFY_TXN_INFO), &buf)) != 0)
+ goto err;
+ memset(buf, 0, bufsz);
+ memcpy(buf, data->data, TXN_VERIFY_INFO_FIXSIZE);
+
+ if (txninfop->num_recycle != 0) {
+ if ((ret = __os_malloc(NULL,
+ txninfop->num_recycle * sizeof(DB_LSN), &p)) != 0)
+ goto err;
+ memcpy(p, lsns, txninfop->num_recycle * sizeof(DB_LSN));
+ buf->recycle_lsns = p;
+ }
+
+ if (txninfop->filenum != 0) {
+ if ((ret = __os_malloc(NULL,
+ txninfop->filenum * sizeof(DBT), &q)) != 0)
+ goto err;
+ memset(q, 0, txninfop->filenum * sizeof(DBT));
+ buf->fileups = (DBT *)q;
+ for (i = 0; i < txninfop->filenum; i++) {
+ memcpy(&sz, pb, sizeof(sz));
+ pb += sizeof(sz);
+ if ((ret = __os_malloc(NULL, sz, &q)) != 0)
+ goto err;
+ memcpy(q, pb, sz);
+ pb += sz;
+
+ buf->fileups[i].data = q;
+ buf->fileups[i].size = sz;
+ }
+ }
+
+ *txninfopp = buf;
+err:
+ return (ret);
+}
+
+static int
+__lv_add_recycle_lsn(txninfop, lsn)
+ VRFY_TXN_INFO *txninfop;
+ const DB_LSN *lsn;
+{
+ int ret;
+
+ ret = 0;
+ txninfop->num_recycle++;
+ if ((ret = __os_realloc(NULL, txninfop->num_recycle * sizeof(DB_LSN),
+ &(txninfop->recycle_lsns))) != 0)
+ goto err;
+ txninfop->recycle_lsns[txninfop->num_recycle - 1] = *lsn;
+err:
+ return (ret);
+}
+
+/*
+ * __add_recycle_lsn_range --
+ * Add recycle info for each txn within the recycled txnid range.
+ *
+ * PUBLIC: int __add_recycle_lsn_range __P((DB_LOG_VRFY_INFO *,
+ * PUBLIC: const DB_LSN *, u_int32_t, u_int32_t));
+ */
+int
+__add_recycle_lsn_range(lvinfo, lsn, min, max)
+ DB_LOG_VRFY_INFO *lvinfo;
+ const DB_LSN *lsn;
+ u_int32_t min, max;
+{
+ DBC *csr;
+ int ret, tret;
+ u_int32_t i;
+ DBT key2, data2;
+ struct __add_recycle_params param;
+
+ csr = NULL;
+ ret = tret = 0;
+ memset(&key2, 0, sizeof(DBT));
+ memset(&data2, 0, sizeof(DBT));
+ memset(&param, 0, sizeof(param));
+
+ if ((ret = __os_malloc(lvinfo->dbenv->env, sizeof(VRFY_TXN_INFO *) *
+ (param.ti2ul = 1024), &(param.ti2u))) != 0)
+ goto err;
+ param.ti2ui = 0;
+ param.recycle_lsn = *lsn;
+ param.min = min;
+ param.max = max;
+
+ /* Iterate the specified range and process each transaction. */
+ if ((ret = __iterate_txninfo(lvinfo, min, max, __lv_add_recycle_handler,
+ &param)) != 0)
+ goto err;
+
+ /*
+ * Save updated txninfo structures. We can't do so in the above
+ * iteration, so we have to save them here.
+ */
+ BDBOP(__db_cursor(lvinfo->txninfo, lvinfo->ip, NULL, &csr, DBC_BULK));
+
+ for (i = 0; i < param.ti2ui; i++) {
+ ret = __lv_pack_txn_vrfy_info(param.ti2u[i], &key2, &data2);
+ DB_ASSERT(lvinfo->dbenv->env, ret == 0);
+ BDBOP(__dbc_put(csr, &key2, &data2, DB_KEYLAST));
+ /*
+ * key2.data refers to param.ti2u[i]'s memory, data2.data is
+ * freed by DB since we set DB_DBT_MALLOC.
+ */
+ if ((ret = __free_txninfo(param.ti2u[i])) != 0)
+ goto err;
+ }
+
+err:
+ if (csr != NULL && (tret = __dbc_close(csr)) != 0 && ret == 0)
+ ret = tret;
+ __os_free(lvinfo->dbenv->env, param.ti2u);
+ if (ret != 0)
+ __db_err(lvinfo->dbenv->env, ret,
+ "__add_recycle_lsn_range");
+
+ return (ret);
+}
+
+/*
+ * __iterate_txninfo --
+ *	Iterate through the transaction info database as fast as possible,
+ *	processing each key/data pair with a callback handler. The
+ *	iteration stops if the handler returns a non-zero value.
+ *
+ * PUBLIC: int __iterate_txninfo __P((DB_LOG_VRFY_INFO *, u_int32_t,
+ * PUBLIC: u_int32_t, TXNINFO_HANDLER, void *));
+ */
+int
+__iterate_txninfo(lvinfo, min, max, handler, param)
+ DB_LOG_VRFY_INFO *lvinfo;
+ u_int32_t min, max;
+ TXNINFO_HANDLER handler;
+ void *param;
+{
+ ENV *env;
+ VRFY_TXN_INFO *txninfop;
+ int ret, tret;
+ u_int32_t bufsz, pgsz, txnid;
+ size_t retkl, retdl;
+ char *btbuf;
+ u_int8_t *retk, *retd;
+ DBT key, data, data2;
+ DBC *csr;
+ void *p;
+
+ csr = NULL;
+ env = lvinfo->dbenv->env;
+ txninfop = NULL;
+ ret = tret = 0;
+ txnid = 0;
+ retkl = retdl = 0;
+ bufsz = 64 * 1024;
+ btbuf = NULL;
+ retk = retd = NULL;
+
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+ memset(&data2, 0, sizeof(DBT));
+
+ pgsz = lvinfo->txninfo->pgsize;
+ DB_ASSERT(env, ret == 0);
+
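+	/*
+	 * Round the buffer size down to a multiple of the page size;
+	 * bulk retrieval buffers must hold at least one page.
+	 */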
+ if (bufsz % pgsz != 0)
+ bufsz = pgsz * (bufsz / pgsz);
+
+ if ((ret = __os_malloc(env, bufsz, &btbuf)) != 0)
+ goto err;
+
+ BDBOP(__db_cursor(lvinfo->txninfo, lvinfo->ip, NULL, &csr, DBC_BULK));
+
+ /*
+ * Use bulk retrieval to scan the database as fast as possible.
+ */
+ data.data = btbuf;
+ data.ulen = bufsz;
+ data.flags |= DB_DBT_USERMEM;
+
+	for (ret = __dbc_get(csr, &key, &data, DB_FIRST | DB_MULTIPLE_KEY);;
+ ret = __dbc_get(csr, &key, &data, DB_NEXT | DB_MULTIPLE_KEY)) {
+ switch (ret) {
+ case 0:
+ break;
+ case DB_NOTFOUND:
+ goto out;
+ /* No break statement allowed by lint here. */
+ case DB_BUFFER_SMALL:
+ if ((ret = __os_realloc(lvinfo->dbenv->env,
+ bufsz *= 2, &btbuf)) != 0)
+ goto out;
+ data.ulen = bufsz;
+ data.data = btbuf;
+ continue;/* Continue the for-loop. */
+ /* No break statement allowed by lint here. */
+ default:
+ goto err;
+ }
+
+		/*
+		 * Do the bulk get. The handler may update some txninfo
+		 * objects, but we can't write them back inside this loop:
+		 * doing so through this same cursor would break the bulk
+		 * get, and using another cursor could self-block. So the
+		 * handler notes the updated objects and we store them into
+		 * the db after we leave this loop.
+		 */
+ DB_MULTIPLE_INIT(p, &data);
+ while (1) {
+ DB_MULTIPLE_KEY_NEXT(p, &data,
+ retk, retkl, retd, retdl);
+ if (p == NULL)
+ break;
+ DB_ASSERT(env, retkl == sizeof(txnid) && retk != NULL);
+ memcpy(&txnid, retk, retkl);
+ /*
+ * Process it if txnid in range or no range specified.
+ * The range must be a closed one.
+ */
+ if ((min != 0 && txnid >= min && max != 0 &&
+ txnid <= max) || (min == 0 && max == 0)) {
+ data2.data = retd;
+ data2.size = (u_int32_t)retdl;
+
+ if ((ret = __lv_unpack_txn_vrfy_info(
+ &txninfop, &data2)) != 0)
+ goto out;
+ if ((ret = handler(lvinfo, txninfop,
+ param)) != 0)
+ /* Stop the iteration on error. */
+ goto out;
+ }
+ }
+
+ }
+out:
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+err:
+ if (csr != NULL && (tret = __dbc_close(csr)) != 0 && ret == 0)
+ ret = tret;
+ __os_free(lvinfo->dbenv->env, btbuf);
+ return (ret);
+}
+
+/* Txninfo iteration handler to add recycle info for affected txns. */
+static int
+__lv_add_recycle_handler(lvinfo, txninfop, params)
+ DB_LOG_VRFY_INFO *lvinfo;
+ VRFY_TXN_INFO *txninfop;
+ void *params;
+{
+ int ret;
+ struct __add_recycle_params *param;
+
+ ret = 0;
+ param = (struct __add_recycle_params *)params;
+
+ /*
+ * If the txnid is reused, update its recycle info and note it for
+ * later update, otherwise free the txninfop structure.
+ */
+	if (txninfop->txnid < param->min || txninfop->txnid > param->max) {
+ ret = __free_txninfo(txninfop);
+ return (ret);
+ }
+
+ ret = __lv_add_recycle_lsn(txninfop, &(param->recycle_lsn));
+
+ if (ret != 0)
+ goto err;
+ /*
+ * Below is one way to tell if a txn is aborted without doing another
+ * backward pass of the log. However if the txn id is not in the
+ * chosen recycled txn id range, we can't tell, until all the log
+ * records are passed --- the remaining active txns are the aborted
+ * txns.
+ * No longer needed since we did another backward pass of the log
+ * and have all the txn lifetimes.
+ if (txninfop->status == TXN_STAT_ACTIVE)
+ __on_txn_abort(lvinfo, txninfop);
+ */
+ if (txninfop->status == TXN_STAT_PREPARE) {
+ __db_errx(lvinfo->dbenv->env,
+ "[ERROR] Transaction with ID %u is prepared and not "
+ "committed, but its ID is recycled by log record [%u, %u].",
+ txninfop->txnid, param->recycle_lsn.file,
+ param->recycle_lsn.offset);
+ }
+ /* Note down to store later. */
+ param->ti2u[(param->ti2ui)++] = txninfop;
+ if (param->ti2ui == param->ti2ul)
+ BDBOP(__os_realloc(lvinfo->dbenv->env,
+ sizeof(VRFY_TXN_INFO *) * (param->ti2ul *= 2),
+ &(param->ti2u)));
+err:
+	return (ret);
+}
+
+/*
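+ * __rem_last_recycle_lsn --
+ *	Remove the most recently added recycle lsn from the txn's
+ *	recycle_lsns array, shrinking or freeing the array.
+ *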
+ * PUBLIC: int __rem_last_recycle_lsn __P((VRFY_TXN_INFO *));
+ */
+int
+__rem_last_recycle_lsn(txninfop)
+ VRFY_TXN_INFO *txninfop;
+{
+ int ret;
+
+ ret = 0;
+ if (txninfop->num_recycle == 0)
+ return (0);
+ txninfop->num_recycle--;
+ if (txninfop->num_recycle > 0)
+ BDBOP(__os_realloc(NULL, txninfop->num_recycle * sizeof(DB_LSN),
+ &(txninfop->recycle_lsns)));
+ else {
+ __os_free(NULL, txninfop->recycle_lsns);
+ txninfop->recycle_lsns = NULL;
+ }
+err:
+	return (ret);
+}
+
+/*
+ * __add_file_updated --
+ * Add a file's dbregid and uid to the updating txn if it's not yet
+ * recorded.
+ *
+ * PUBLIC: int __add_file_updated __P((VRFY_TXN_INFO *, const DBT *, int32_t));
+ */
+int
+__add_file_updated(txninfop, fileid, dbregid)
+ VRFY_TXN_INFO *txninfop;
+ const DBT *fileid;
+ int32_t dbregid;
+{
+ int ret;
+ DBT *pdbt, *p;
+ u_int32_t found, i;
+
+ ret = 0;
+ p = pdbt = NULL;
+
+ for (found = 0, i = 0; i < txninfop->filenum; i++) {
+ p = &(txninfop->fileups[i]);
+ if (p->size == fileid->size &&
+ memcmp(p->data, fileid->data, p->size) == 0) {
+ found = 1;
+ break;
+ }
+ }
+
+ if (found)
+ return (0);
+
+ /* Add file's uid into the array, deep copy from fileid. */
+ txninfop->filenum++;
+ if ((ret = __os_realloc(NULL, txninfop->filenum *
+ sizeof(DBT), &(txninfop->fileups))) != 0)
+ goto err;
+
+ pdbt = &(txninfop->fileups[txninfop->filenum - 1]);
+ memset(pdbt, 0, sizeof(DBT));
+ if ((ret = __os_malloc(NULL,
+ pdbt->size = fileid->size, &(pdbt->data))) != 0)
+ goto err;
+ memcpy(pdbt->data, fileid->data, fileid->size);
+
+ /* Add file dbregid into the array. */
+ BDBOP(__os_realloc(NULL, txninfop->filenum *
+ sizeof(int32_t), &(txninfop->dbregid)));
+ txninfop->dbregid[txninfop->filenum - 1] = dbregid;
+err:
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __del_file_updated __P((VRFY_TXN_INFO *, const DBT *));
+ */
+int
+__del_file_updated(txninfop, fileid)
+ VRFY_TXN_INFO *txninfop;
+ const DBT *fileid;
+{
+ u_int32_t found, i;
+ int ret;
+ DBT *p;
+ void *pdbtdata;
+
+ ret = 0;
+
+ if (txninfop->filenum == 0)
+ return (0);
+
+ /*
+ * If the array has an element identical to fileid, remove it. fileid
+ * itself is intact after this function call.
+ */
+ for (found = 0, i = 0, pdbtdata = NULL; i < txninfop->filenum; i++) {
+ p = &(txninfop->fileups[i]);
+ if (p->size == fileid->size &&
+ memcmp(p->data, fileid->data, p->size) == 0) {
+ pdbtdata = p->data;
+ if (txninfop->filenum > 1) {
+ memmove(txninfop->fileups + i, txninfop->
+ fileups + i + 1, sizeof(DBT) * (txninfop->
+ filenum - (i + 1)));
+ memmove(txninfop->dbregid + i, txninfop->
+ dbregid + i + 1, sizeof(int32_t) *
+ (txninfop->filenum - (i + 1)));
+ } else {
+ __os_free(NULL, txninfop->fileups);
+ __os_free(NULL, txninfop->dbregid);
+ txninfop->fileups = NULL;
+ txninfop->dbregid = NULL;
+ }
+ found = 1;
+ break;
+ }
+ }
+
+ if (found) {
+ txninfop->filenum--;
+ if (txninfop->filenum) {
+ BDBOP(__os_realloc(NULL, sizeof(DBT) *
+ txninfop->filenum, &(txninfop->fileups)));
+ BDBOP(__os_realloc(NULL, sizeof(int32_t) *
+ txninfop->filenum, &(txninfop->dbregid)));
+ }
+ __os_free(NULL, pdbtdata);
+ }
+err:
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __clear_fileups __P((VRFY_TXN_INFO *));
+ */
+int
+__clear_fileups(txninfop)
+ VRFY_TXN_INFO *txninfop;
+{
+ u_int32_t i;
+
+ for (i = 0; i < txninfop->filenum; i++)
+ __os_free(NULL, txninfop->fileups[i].data);
+
+ __os_free(NULL, txninfop->fileups);
+ __os_free(NULL, txninfop->dbregid);
+ txninfop->fileups = NULL;
+ txninfop->dbregid = NULL;
+ txninfop->filenum = 0;
+
+ return (0);
+}
+
+/*
+ * __free_txninfo_stack --
+ * The object is on stack, only free its internal memory, not itself.
+ * PUBLIC: int __free_txninfo_stack __P((VRFY_TXN_INFO *));
+ */
+int
+__free_txninfo_stack(p)
+ VRFY_TXN_INFO *p;
+{
+ u_int32_t i;
+
+ if (p == NULL)
+ return (0);
+
+ if (p->fileups != NULL) {
+ for (i = 0; i < p->filenum; i++)
+ __os_free(NULL, p->fileups[i].data);
+ __os_free(NULL, p->fileups);
+ }
+
+ if (p->dbregid != NULL)
+ __os_free(NULL, p->dbregid);
+
+ if (p->recycle_lsns != NULL)
+ __os_free(NULL, p->recycle_lsns);
+
+ return (0);
+}
+
+/*
+ * PUBLIC: int __free_txninfo __P((VRFY_TXN_INFO *));
+ */
+int
+__free_txninfo(p)
+ VRFY_TXN_INFO *p;
+{
+ (void)__free_txninfo_stack(p);
+ __os_free(NULL, p);
+
+ return (0);
+}
+
+/* Construct a key and data DBT from the structure. */
+static int
+__lv_pack_filereg(freginfo, data)
+ const VRFY_FILEREG_INFO *freginfo;
+ DBT *data;
+{
+ char *buf, *p;
+ size_t bufsz, offset;
+ int ret;
+
+ ret = 0;
+ if ((ret = __os_malloc(NULL,
+ bufsz = FILE_REG_INFO_TOTSIZE(*freginfo), &buf)) != 0)
+ goto err;
+ memset(buf, 0, bufsz);
+
+ memcpy(buf, freginfo, FILE_REG_INFO_FIXSIZE);
+ p = buf + FILE_REG_INFO_FIXSIZE;
+
+ offset = sizeof(int32_t) * freginfo->regcnt;
+ memcpy(p, freginfo->dbregids, offset);
+ p += offset;
+
+ memcpy(p, &(freginfo->fileid.size), sizeof(freginfo->fileid.size));
+ p += sizeof(freginfo->fileid.size);
+ memcpy(p, freginfo->fileid.data, freginfo->fileid.size);
+ p += freginfo->fileid.size;
+ (void)strcpy(p, freginfo->fname);
+
+ data->data = buf;
+ data->size = (u_int32_t)bufsz;
+err:
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __put_filereg_info __P((const DB_LOG_VRFY_INFO *,
+ * PUBLIC: const VRFY_FILEREG_INFO *));
+ */
+int
+__put_filereg_info(lvinfo, freginfo)
+	const DB_LOG_VRFY_INFO *lvinfo;
+	const VRFY_FILEREG_INFO *freginfo;
+{
+ int ret;
+ DBT data;
+
+ memset(&data, 0, sizeof(DBT));
+
+ if ((ret = __lv_pack_filereg(freginfo, &data)) != 0)
+ goto err;
+
+	/*
+	 * We store the dbregid-filereg map in dbregids.db, but we can't
+	 * make dbregids.db a secondary db of fileregs.db: a dbregid is
+	 * only valid while a db file is open, so we want to delete data
+	 * with the same key from dbregids.db, yet keep all filereg_info
+	 * data in fileregs.db to track each db file's lifetime and status.
+	 *
+	 * Consequently we store dbregid-file_uid pairs in dbregids.db, so
+	 * that we can delete a dbregid when the db handle is closed, and
+	 * use the dbregid to look up the currently open db file's uid.
+	 */
+
+ BDBOP2(lvinfo->dbenv, __db_put(lvinfo->fileregs, lvinfo->ip, NULL,
+ (DBT *)&(freginfo->fileid), &data, 0), "__put_filereg_info");
+
+err:
+ if (data.data != NULL)
+ __os_free(lvinfo->dbenv->env, data.data);
+
+ return (ret);
+}
+
+/*
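+ * __del_filelife --
+ *	Delete a dbregid's filelife info, e.g. when the db handle is
+ *	closed and the dbregid is no longer valid.
+ *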
+ * PUBLIC: int __del_filelife __P((const DB_LOG_VRFY_INFO *, int32_t));
+ */
+int
+__del_filelife(lvinfo, dbregid)
+ const DB_LOG_VRFY_INFO *lvinfo;
+ int32_t dbregid;
+{
+ int ret;
+ DBT key;
+
+ memset(&key, 0, sizeof(DBT));
+ key.data = &(dbregid);
+ key.size = sizeof(dbregid);
+
+ if ((ret = __db_del(lvinfo->dbregids, lvinfo->ip, NULL,
+ &key, 0)) != 0)
+ goto err;
+
+err:
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __put_filelife __P((const DB_LOG_VRFY_INFO *, VRFY_FILELIFE *));
+ */
+int
+__put_filelife(lvinfo, pflife)
+ const DB_LOG_VRFY_INFO *lvinfo;
+ VRFY_FILELIFE *pflife;
+{
+ int ret;
+ DBT key, data;
+
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+ key.data = &(pflife->dbregid);
+ key.size = sizeof(pflife->dbregid);
+ data.data = pflife;
+ data.size = sizeof(VRFY_FILELIFE);
+
+ if ((ret = __db_put(lvinfo->dbregids, lvinfo->ip, NULL,
+ &key, &data, 0)) != 0)
+ goto err;
+
+err:
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __get_filelife __P((const DB_LOG_VRFY_INFO *,
+ * PUBLIC: int32_t, VRFY_FILELIFE **));
+ */
+int
+__get_filelife(lvinfo, dbregid, flifepp)
+ const DB_LOG_VRFY_INFO *lvinfo;
+ int32_t dbregid;
+ VRFY_FILELIFE **flifepp;
+{
+ int ret;
+ DBT key, data;
+ VRFY_FILELIFE *flifep;
+
+ ret = 0;
+ flifep = NULL;
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+
+ key.data = &dbregid;
+ key.size = sizeof(dbregid);
+ if ((ret = __db_get(lvinfo->dbregids, lvinfo->ip, NULL,
+ &key, &data, 0)) != 0)
+ goto err;
+ if ((ret = __os_malloc(lvinfo->dbenv->env,
+ sizeof(VRFY_FILELIFE), &flifep)) != 0)
+ goto err;
+ DB_ASSERT(lvinfo->dbenv->env, flifep != NULL);
+ memcpy(flifep, data.data, sizeof(VRFY_FILELIFE));
+ *flifepp = flifep;
+err:
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __get_filereg_by_dbregid __P((const DB_LOG_VRFY_INFO *,
+ * PUBLIC: int32_t, VRFY_FILEREG_INFO **));
+ */
+int
+__get_filereg_by_dbregid(lvinfo, dbregid, freginfopp)
+ const DB_LOG_VRFY_INFO *lvinfo;
+ int32_t dbregid;
+ VRFY_FILEREG_INFO **freginfopp;
+{
+ int ret;
+ DBT key, data;
+ char uid[DB_FILE_ID_LEN];
+ VRFY_FILELIFE *pflife;
+
+ memset(&data, 0, sizeof(DBT));
+ memset(&key, 0, sizeof(DBT));
+ key.data = &dbregid;
+ key.size = sizeof(dbregid);
+
+ BDBOP3(lvinfo->dbenv, __db_get(lvinfo->dbregids, lvinfo->ip, NULL,
+ &key, &data, 0), DB_NOTFOUND, "__get_filereg_by_dbregid");
+ if (ret == DB_NOTFOUND)
+ goto err;
+
+ /* Use the file-uid as key to retrieve from fileregs.db. */
+ pflife = (VRFY_FILELIFE *)data.data;
+ memcpy((void *)uid, (void *)pflife->fileid, key.size = DB_FILE_ID_LEN);
+
+ key.data = (void *)uid;
+ memset(&data, 0, sizeof(DBT));
+
+ BDBOP3(lvinfo->dbenv, __db_get(lvinfo->fileregs, lvinfo->ip, NULL,
+ &key, &data, 0), DB_NOTFOUND, "__get_filereg_by_dbregid");
+ if (ret == DB_NOTFOUND)
+ goto err;
+ if ((ret = __lv_unpack_filereg(&data, freginfopp)) != 0)
+ goto err;
+
+err:
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __add_dbregid __P((DB_LOG_VRFY_INFO *, VRFY_FILEREG_INFO *,
+ * PUBLIC: int32_t, u_int32_t, DB_LSN, DBTYPE, db_pgno_t, int *));
+ */
+int
+__add_dbregid(lvh, freg, dbregid, opcode, lsn, dbtype, meta_pgno, addp)
+ DB_LOG_VRFY_INFO *lvh;
+ VRFY_FILEREG_INFO *freg;
+ int32_t dbregid;
+ u_int32_t opcode;
+ DB_LSN lsn;
+ DBTYPE dbtype;
+ db_pgno_t meta_pgno;
+ int *addp;
+{
+ int inarray, ret, tret;
+ u_int32_t i, j;
+ VRFY_FILELIFE flife;
+
+ inarray = ret = tret = 0;
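+	/*
+	 * tret encodes the action taken and is returned via addp: 1 means
+	 * the dbregid is added, -1 means it is removed, 0 means no change,
+	 * and 2 flags an attempt to open an already-open dbregid.
+	 */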
+ for (i = 0; i < freg->regcnt; i++) {
+ if (freg->dbregids[i] == dbregid) {
+ if (!IS_DBREG_CLOSE(opcode)) {
+ /* Opening an open dbreg id. */
+ if (IS_DBREG_OPEN(opcode) &&
+ (opcode != DBREG_CHKPNT &&
+ opcode != DBREG_XCHKPNT)) {
+ tret = 2;
+ goto err;
+ }
+ tret = 0;
+ inarray = 1;
+ } else
+				/* Found the dbregid; remove it below. */
+ tret = -1;
+ break;
+ }
+ }
+
+	if (IS_DBREG_OPEN(opcode))
+		tret = 1;/* An open operation: add the dbregid below. */
+
+ /*
+ * Remove closed dbregid. dbregid can be recycled, not unique to a db
+ * file, it's dynamically allocated for each db handle.
+ */
+ if (tret == -1) {
+ for (j = i; j < freg->regcnt - 1; j++)
+ freg->dbregids[j] = freg->dbregids[j + 1];
+ freg->regcnt--;
+ BDBOP(__os_realloc(lvh->dbenv->env,
+ sizeof(int32_t) * freg->regcnt, &(freg->dbregids)));
+ /* Don't remove dbregid life info from dbregids db. */
+ } else if (tret == 1) {
+ if (!inarray) {
+ freg->regcnt++;
+ BDBOP(__os_realloc(lvh->dbenv->env,
+ sizeof(int32_t) * freg->regcnt, &(freg->dbregids)));
+ freg->dbregids[freg->regcnt - 1] = dbregid;
+ }
+ flife.dbregid = dbregid;
+ memcpy(flife.fileid, freg->fileid.data, freg->fileid.size);
+ flife.lifetime = opcode;
+ flife.dbtype = dbtype;
+ flife.lsn = lsn;
+ flife.meta_pgno = meta_pgno;
+ if ((ret = __put_filelife(lvh, &flife)) != 0)
+ goto err;
+ }
+
+err:
+ *addp = tret;
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __get_filereg_info __P((const DB_LOG_VRFY_INFO *, const DBT *,
+ * PUBLIC: VRFY_FILEREG_INFO **));
+ */
+int
+__get_filereg_info(lvinfo, fuid, freginfopp)
+ const DB_LOG_VRFY_INFO *lvinfo;
+ const DBT *fuid;
+ VRFY_FILEREG_INFO **freginfopp;
+{
+ int ret;
+ DBT data;
+
+ memset(&data, 0, sizeof(DBT));
+
+ BDBOP3(lvinfo->dbenv, __db_get(lvinfo->fileregs, lvinfo->ip, NULL,
+ (DBT *)fuid, &data, 0), DB_NOTFOUND, "__get_filereg_info");
+ if (ret == DB_NOTFOUND)
+ goto err;
+ if ((ret = __lv_unpack_filereg(&data, freginfopp)) != 0)
+ goto err;
+
+err:
+ return (ret);
+}
+
+static int
+__lv_unpack_filereg(data, freginfopp)
+ const DBT *data;
+ VRFY_FILEREG_INFO **freginfopp;
+{
+ char *p, *q;
+ u_int32_t fidsz, arrsz;
+ VRFY_FILEREG_INFO *buf;
+ int ret;
+
+ ret = 0;
+ p = q = NULL;
+ fidsz = arrsz = 0;
+ buf = NULL;
+
+ if ((ret = __os_malloc(NULL, sizeof(VRFY_FILEREG_INFO), &buf)) != 0)
+ goto err;
+ memset(buf, 0, sizeof(VRFY_FILEREG_INFO));
+
+ memcpy(buf, data->data, FILE_REG_INFO_FIXSIZE);
+ *freginfopp = (VRFY_FILEREG_INFO *)buf;
+ p = ((char *)(data->data)) + FILE_REG_INFO_FIXSIZE;
+
+ if ((ret = __os_malloc(NULL, arrsz = (*freginfopp)->regcnt *
+ sizeof(int32_t), &((*freginfopp)->dbregids))) != 0)
+ goto err;
+ memcpy((*freginfopp)->dbregids, p, arrsz);
+ p += arrsz;
+
+ memcpy(&fidsz, p, sizeof(fidsz));
+ p += sizeof(fidsz);
+ if ((ret = __os_malloc(NULL, fidsz, &q)) != 0)
+ goto err;
+ memcpy(q, p, fidsz);
+ (*freginfopp)->fileid.data = q;
+ (*freginfopp)->fileid.size = fidsz;
+ p += fidsz;
+
+ if ((ret = __os_malloc(NULL, sizeof(char) * (strlen(p) + 1), &q)) != 0)
+ goto err;
+ (void)strcpy(q, p);
+
+ (*freginfopp)->fname = q;
+err:
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __free_filereg_info __P((VRFY_FILEREG_INFO *));
+ */
+int
+__free_filereg_info(p)
+ VRFY_FILEREG_INFO *p;
+{
+ if (p == NULL)
+ return (0);
+ if (p ->fname != NULL)
+ __os_free(NULL, (void *)(p->fname));
+ if (p->fileid.data != NULL)
+ __os_free(NULL, p->fileid.data);
+ if (p->dbregids != NULL)
+ __os_free(NULL, p->dbregids);
+ __os_free(NULL, p);
+
+ return (0);
+}
+
+/*
+ * PUBLIC: int __get_ckp_info __P((const DB_LOG_VRFY_INFO *, DB_LSN,
+ * PUBLIC: VRFY_CKP_INFO **));
+ */
+int
+__get_ckp_info(lvinfo, lsn, ckpinfopp)
+ const DB_LOG_VRFY_INFO *lvinfo;
+ DB_LSN lsn;
+ VRFY_CKP_INFO **ckpinfopp;
+{
+ int ret;
+ DBT key, data;
+ VRFY_CKP_INFO *ckpinfo;
+
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+ key.data = &lsn;
+ key.size = sizeof(DB_LSN);
+ BDBOP3(lvinfo->dbenv, __db_get(lvinfo->ckps, lvinfo->ip, NULL,
+ &key, &data, 0), DB_NOTFOUND, "__get_ckp_info");
+
+ if (ret == DB_NOTFOUND)
+ goto err;
+
+ if ((ret = __os_malloc(lvinfo->dbenv->env,
+ sizeof(VRFY_CKP_INFO), &ckpinfo)) != 0)
+ goto err;
+ memcpy(ckpinfo, data.data, sizeof(VRFY_CKP_INFO));
+ *ckpinfopp = ckpinfo;
+err:
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __get_last_ckp_info __P((const DB_LOG_VRFY_INFO *,
+ * PUBLIC: VRFY_CKP_INFO **));
+ */
+int
+__get_last_ckp_info(lvinfo, ckpinfopp)
+ const DB_LOG_VRFY_INFO *lvinfo;
+ VRFY_CKP_INFO **ckpinfopp;
+{
+ int ret, tret;
+ DBT key, data;
+ VRFY_CKP_INFO *ckpinfo;
+ DBC *csr;
+
+ csr = NULL;
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+
+ BDBOP(__db_cursor(lvinfo->ckps, lvinfo->ip, NULL, &csr, 0));
+ if ((ret = __dbc_get(csr, &key, &data, DB_LAST)) != 0)
+ goto err;
+
+ if ((ret = __os_malloc(lvinfo->dbenv->env,
+ sizeof(VRFY_CKP_INFO), &ckpinfo)) != 0)
+ goto err;
+ DB_ASSERT(lvinfo->dbenv->env, sizeof(VRFY_CKP_INFO) == data.size);
+ memcpy(ckpinfo, data.data, sizeof(VRFY_CKP_INFO));
+ *ckpinfopp = ckpinfo;
+err:
+ if (csr != NULL && (tret = __dbc_close(csr)) != 0 && ret == 0)
+ ret = tret;
+ if (ret != 0 && ret != DB_NOTFOUND)
+ __db_err(lvinfo->dbenv->env, ret, "__get_last_ckp_info");
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __put_ckp_info __P((const DB_LOG_VRFY_INFO *,
+ * PUBLIC: const VRFY_CKP_INFO *));
+ */
+int
+__put_ckp_info(lvinfo, ckpinfo)
+ const DB_LOG_VRFY_INFO *lvinfo;
+ const VRFY_CKP_INFO *ckpinfo;
+{
+ int ret;
+ DBT key, data;
+
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+ key.data = (void *)&ckpinfo->lsn;
+ key.size = sizeof(DB_LSN);
+ data.data = (void *)ckpinfo;
+ data.size = sizeof(VRFY_CKP_INFO);
+
+ BDBOP2(lvinfo->dbenv, __db_put(lvinfo->ckps, lvinfo->ip,
+ NULL, &key, &data, 0), "__put_ckp_info");
+ return (0);
+}
+
+/*
+ * PUBLIC: int __get_timestamp_info __P((const DB_LOG_VRFY_INFO *,
+ * PUBLIC: DB_LSN, VRFY_TIMESTAMP_INFO **));
+ */
+int
+__get_timestamp_info(lvinfo, lsn, tsinfopp)
+ const DB_LOG_VRFY_INFO *lvinfo;
+ DB_LSN lsn;
+ VRFY_TIMESTAMP_INFO **tsinfopp;
+{
+ int ret;
+ DBT key, data;
+ VRFY_TIMESTAMP_INFO *tsinfo;
+
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+ key.data = &lsn;
+ key.size = sizeof(DB_LSN);
+ BDBOP3(lvinfo->dbenv, __db_get(lvinfo->lsntime, lvinfo->ip, NULL,
+ &key, &data, 0), DB_NOTFOUND, "__get_timestamp_info");
+
+ if (ret == DB_NOTFOUND)
+ goto err;
+
+ if ((ret = __os_malloc(lvinfo->dbenv->env,
+ sizeof(VRFY_TIMESTAMP_INFO), &tsinfo)) != 0)
+ goto err;
+
+ memcpy(tsinfo, data.data, sizeof(VRFY_TIMESTAMP_INFO));
+ *tsinfopp = tsinfo;
+err:
+ return (ret);
+}
+
+/*
+ * __get_latest_timestamp_info --
+ * Get latest timestamp info before lsn.
+ * PUBLIC: int __get_latest_timestamp_info __P((const DB_LOG_VRFY_INFO *,
+ * PUBLIC: DB_LSN, VRFY_TIMESTAMP_INFO **));
+ */
+int
+__get_latest_timestamp_info(lvinfo, lsn, tsinfopp)
+ const DB_LOG_VRFY_INFO *lvinfo;
+ DB_LSN lsn;
+ VRFY_TIMESTAMP_INFO **tsinfopp;
+{
+ int ret, tret;
+ DBT key, data;
+ VRFY_TIMESTAMP_INFO *tsinfo;
+ DBC *csr;
+
+ csr = NULL;
+ ret = tret = 0;
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+
+ key.data = &lsn;
+ key.size = sizeof(lsn);
+ BDBOP(__db_cursor(lvinfo->lsntime, lvinfo->ip, NULL, &csr, 0));
+
+ BDBOP(__dbc_get(csr, &key, &data, DB_SET));
+ BDBOP(__dbc_get(csr, &key, &data, DB_PREV));
+
+ if ((ret = __os_malloc(lvinfo->dbenv->env, sizeof(VRFY_TIMESTAMP_INFO),
+ &tsinfo)) != 0)
+ goto err;
+
+ memcpy(tsinfo, data.data, sizeof(VRFY_TIMESTAMP_INFO));
+ *tsinfopp = tsinfo;
+
+err:
+ if (ret != 0 && ret != DB_NOTFOUND)
+ __db_err(lvinfo->dbenv->env,
+ ret, "__get_latest_timestamp_info");
+ if (csr != NULL && (tret = __dbc_close(csr)) != 0 && ret == 0)
+ ret = tret;
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __put_timestamp_info __P((const DB_LOG_VRFY_INFO *,
+ * PUBLIC: const VRFY_TIMESTAMP_INFO *));
+ */
+int
+__put_timestamp_info(lvinfo, tsinfo)
+ const DB_LOG_VRFY_INFO *lvinfo;
+ const VRFY_TIMESTAMP_INFO *tsinfo;
+{
+ int ret;
+ DBT key, data;
+
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+ key.data = (void *)&(tsinfo->lsn);
+ key.size = sizeof(DB_LSN);
+ data.data = (void *)tsinfo;
+ data.size = sizeof(VRFY_TIMESTAMP_INFO);
+ BDBOP2(lvinfo->dbenv, __db_put(lvinfo->lsntime, lvinfo->ip, NULL,
+ &key, &data, 0), "__put_timestamp_info");
+
+ return (0);
+}
+
+static int
+__lv_txnrgns_lsn_cmp(db, d1, d2)
+ DB *db;
+ const DBT *d1, *d2;
+{
+ struct __lv_txnrange r1, r2;
+
+ DB_ASSERT(db->env, d1->size == sizeof(r1));
+ DB_ASSERT(db->env, d2->size == sizeof(r2));
+ memcpy(&r1, d1->data, d1->size);
+ memcpy(&r2, d2->data, d2->size);
+
+ return (LOG_COMPARE(&(r1.end), &(r2.end)));
+}
+
+/*
+ * __find_lsnrg_by_timerg --
+ *	Find the closed lsn interval [startlsn, endlsn] whose
+ *	corresponding timestamp interval fully contains [begin, end].
+ * PUBLIC: int __find_lsnrg_by_timerg __P((DB_LOG_VRFY_INFO *,
+ * PUBLIC: time_t, time_t, DB_LSN *, DB_LSN *));
+ */
+int
+__find_lsnrg_by_timerg(lvinfo, begin, end, startlsn, endlsn)
+ DB_LOG_VRFY_INFO *lvinfo;
+ time_t begin, end;
+ DB_LSN *startlsn, *endlsn;
+{
+ int ret, tret;
+ DBC *csr;
+ struct __lv_timestamp_info *t1, *t2;
+ DBT key, data;
+
+ ret = tret = 0;
+ csr = NULL;
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+
+ BDBOP(__db_cursor(lvinfo->timelsn, lvinfo->ip, NULL, &csr, 0));
+
+	/*
+	 * We want an lsn range that completely contains [begin, end], so
+	 * we move one record back when getting the startlsn.
+	 */
+ key.data = &begin;
+ key.size = sizeof(begin);
+ BDBOP(__dbc_get(csr, &key, &data, DB_SET_RANGE));
+ if ((ret = __dbc_get(csr, &key, &data, DB_PREV)) != 0 &&
+ ret != DB_NOTFOUND)
+ goto err;
+ if (ret == DB_NOTFOUND)/* begin is smaller than the smallest key. */
+ startlsn->file = startlsn->offset = 0;/* beginning. */
+ else {
+ t1 = (struct __lv_timestamp_info *)data.data;
+ *startlsn = t1->lsn;
+ }
+
+ /*
+ * Move to the last key/data pair of the duplicate set to get the
+ * biggest lsn having end as timestamp.
+ */
+ key.data = &end;
+ key.size = sizeof(end);
+ if ((ret = __dbc_get(csr, &key, &data, DB_SET_RANGE)) != 0 &&
+ ret != DB_NOTFOUND)
+ goto err;
+ if (ret == DB_NOTFOUND) {
+ endlsn->file = endlsn->offset = (u_int32_t)-1;/* Biggest lsn. */
+ ret = 0;
+ goto err; /* We are done. */
+ }
+
+	/*
+	 * Go to the biggest lsn of the duplicate set: step to the next
+	 * non-duplicate key and back, or to the last record if this key
+	 * is the last one in the database.
+	 */
+ if ((ret = __dbc_get(csr, &key, &data, DB_NEXT_NODUP)) != 0 &&
+ ret != DB_NOTFOUND)
+ goto err;
+
+ if (ret == DB_NOTFOUND)
+ BDBOP(__dbc_get(csr, &key, &data, DB_LAST));
+ else
+ BDBOP(__dbc_get(csr, &key, &data, DB_PREV));
+
+ t2 = (struct __lv_timestamp_info *)data.data;
+ *endlsn = t2->lsn;
+err:
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+ if (csr != NULL && (tret = __dbc_close(csr)) != 0 && ret == 0)
+ ret = tret;
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __add_txnrange __P((DB_LOG_VRFY_INFO *, u_int32_t,
+ * PUBLIC: DB_LSN, int32_t, int));
+ */
+int
+__add_txnrange(lvinfo, txnid, lsn, when, ishead)
+ DB_LOG_VRFY_INFO *lvinfo;
+ u_int32_t txnid;
+ DB_LSN lsn;
+ int32_t when;
+	int ishead; /* Whether it's the first log record of the txn. */
+{
+ int ret, tret;
+ DBC *csr;
+ struct __lv_txnrange tr, *ptr;
+ DBT key, data;
+
+ csr = NULL;
+ ret = 0;
+ ptr = NULL;
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+ memset(&tr, 0, sizeof(tr));
+
+ key.data = &txnid;
+ key.size = sizeof(txnid);
+ tr.txnid = txnid;
+ BDBOP(__db_cursor(lvinfo->txnrngs, lvinfo->ip, NULL, &csr, 0));
+	/*
+	 * Note that we play the logs backward to gather such information.
+	 */
+ if (!ishead) {
+ tr.end = lsn;
+ tr.when_commit = when;
+ data.data = &tr;
+ data.size = sizeof(tr);
+ BDBOP(__dbc_put(csr, &key, &data, DB_KEYFIRST));
+ } else {
+		/*
+		 * Duplicate data items are sorted by lsn, and we are playing
+		 * the logs backward, so the first record is the one we want.
+		 */
+ BDBOP(__dbc_get(csr, &key, &data, DB_SET));
+ ptr = (struct __lv_txnrange *)data.data;
+ DB_ASSERT(lvinfo->dbenv->env, IS_ZERO_LSN(ptr->begin));
+ ptr->begin = lsn;
+ BDBOP(__dbc_put(csr, &key, &data, DB_CURRENT));
+ }
+
+err:
+ if (csr != NULL && (tret = __dbc_close(csr)) != 0 && ret == 0)
+ ret = tret;
+ return (ret);
+}
+
+/*
+ * __get_aborttxn --
+ * If lsn is the last log of an aborted txn T, T's txnid is
+ * returned via the log verify handle.
+ *
+ * PUBLIC: int __get_aborttxn __P((DB_LOG_VRFY_INFO *, DB_LSN));
+ */
+int
+__get_aborttxn(lvinfo, lsn)
+ DB_LOG_VRFY_INFO *lvinfo;
+ DB_LSN lsn;
+{
+ int ret, tret;
+ u_int32_t txnid;
+ DBC *csr;
+ DBT key, data;
+
+ csr = NULL;
+ txnid = 0;
+ ret = tret = 0;
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+
+ key.data = &lsn;
+ key.size = sizeof(lsn);
+ BDBOP(__db_cursor(lvinfo->txnaborts, lvinfo->ip, NULL, &csr, 0));
+ BDBOP(__dbc_get(csr, &key, &data, DB_SET));
+ memcpy(&txnid, data.data, data.size);
+ /*
+ * The lsn is the last op of an aborted txn, call __on_txnabort
+ * before processing next log record.
+ */
+ lvinfo->aborted_txnid = txnid;
+ lvinfo->aborted_txnlsn = lsn;
+
+err:
+	/* It's OK if we can't find it. */
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+ if (csr != NULL && (tret = __dbc_close(csr)) != 0 && ret == 0)
+ ret = tret;
+ return (ret);
+}
+
+/*
+ * __txn_started --
+ * Whether txnid is started before lsn and ended after lsn.
+ *
+ * PUBLIC: int __txn_started __P((DB_LOG_VRFY_INFO *,
+ * PUBLIC: DB_LSN, u_int32_t, int *));
+ */
+int
+__txn_started(lvinfo, lsn, txnid, res)
+ DB_LOG_VRFY_INFO *lvinfo;
+ DB_LSN lsn;
+ u_int32_t txnid;
+ int *res;
+{
+ int ret, tret;
+ DBC *csr;
+ DBT key, data;
+ struct __lv_txnrange *ptr, tr;
+
+ ret = *res = 0;
+ csr = NULL;
+ memset(&tr, 0, sizeof(tr));
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+ key.data = &txnid;
+ key.size = sizeof(txnid);
+
+ BDBOP(__db_cursor(lvinfo->txnrngs, lvinfo->ip, NULL, &csr, 0));
+ BDBOP(__dbc_get(csr, &key, &data, DB_SET));
+ for (;ret == 0; ret = __dbc_get(csr, &key, &data, DB_NEXT_DUP)) {
+ ptr = (struct __lv_txnrange *)data.data;
+ if (LOG_COMPARE(&lsn, &(ptr->begin)) > 0 &&
+ LOG_COMPARE(&lsn, &(ptr->end)) <= 0) {
+ *res = 1;
+ break;
+ }
+ }
+err:
+ if (ret == DB_NOTFOUND)
+		ret = 0;/* It's OK if we can't find it. */
+ if (csr != NULL && (tret = __dbc_close(csr)) != 0 && ret == 0)
+ ret = tret;
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __set_logvrfy_dbfuid __P((DB_LOG_VRFY_INFO *));
+ */
+int
+__set_logvrfy_dbfuid(lvinfo)
+ DB_LOG_VRFY_INFO *lvinfo;
+{
+ int ret;
+ const char *p;
+ DBT key, data;
+ size_t buflen;
+
+ p = NULL;
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+
+ /* So far we only support verifying a specific db file. */
+ p = lvinfo->lv_config->dbfile;
+ buflen = sizeof(char) * (strlen(p) + 1);
+ key.data = (char *)p;
+ key.size = (u_int32_t)buflen;
+
+ BDBOP2(lvinfo->dbenv, __db_get(lvinfo->fnameuid, lvinfo->ip, NULL,
+ &key, &data, 0), "__set_logvrfy_dbfuid");
+
+ memcpy(lvinfo->target_dbid, data.data, DB_FILE_ID_LEN);
+
+ return (ret);
+}
+
+/*
+ * __add_page_to_txn --
+ *	Try to add a page to a txn; via result return 1 if it was added,
+ *	0 if the txn already had it, or -1 on an access violation.
+ * PUBLIC: int __add_page_to_txn __P((DB_LOG_VRFY_INFO *,
+ * PUBLIC: int32_t, db_pgno_t, u_int32_t, u_int32_t *, int *));
+ */
+int
+__add_page_to_txn(lvinfo, dbregid, pgno, txnid, otxn, result)
+ DB_LOG_VRFY_INFO *lvinfo;
+ int32_t dbregid;
+ db_pgno_t pgno;
+ u_int32_t txnid, *otxn;
+ int *result;
+{
+ int ret;
+ u_int8_t *buf;
+ DBT key, data;
+ size_t buflen;
+ u_int32_t txnid2;
+ VRFY_FILELIFE *pff;
+
+ if (txnid < TXN_MINIMUM) {
+ *result = 0;
+ return (0);
+ }
+ buf = NULL;
+ ret = 0;
+ txnid2 = 0;
+ pff = NULL;
+ buflen = sizeof(u_int8_t) * DB_FILE_ID_LEN + sizeof(db_pgno_t);
+ BDBOP(__os_malloc(lvinfo->dbenv->env, buflen, &buf));
+ memset(buf, 0, buflen);
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+
+	/*
+	 * We use the file uid as the key because a single db file can have
+	 * multiple dbregids at the same time; keying by dbregid could miss
+	 * the fact that the same db file is being updated by multiple txns.
+	 */
+ key.data = &dbregid;
+ key.size = sizeof(dbregid);
+ if ((ret = __db_get(lvinfo->dbregids, lvinfo->ip, NULL,
+ &key, &data, 0)) != 0) {
+ if (ret == DB_NOTFOUND) {
+ if (F_ISSET(lvinfo, DB_LOG_VERIFY_PARTIAL)) {
+ ret = 0;
+ goto out;
+ } else
+ F_SET(lvinfo, DB_LOG_VERIFY_INTERR);
+ }
+ goto err;
+ }
+ pff = (VRFY_FILELIFE *)data.data;
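+	/* The pgtxn key is the file uid followed by the page number. */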
+ memcpy(buf, pff->fileid, DB_FILE_ID_LEN);
+ memcpy(buf + DB_FILE_ID_LEN, (u_int8_t *)&pgno, sizeof(pgno));
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+ key.data = buf;
+ key.size = (u_int32_t)buflen;
+ if ((ret = __db_get(lvinfo->pgtxn, lvinfo->ip, NULL,
+ &key, &data, 0)) != 0) {
+ if (ret == DB_NOTFOUND) {
+ data.data = &txnid;
+ data.size = sizeof(txnid);
+ BDBOP(__db_put(lvinfo->pgtxn, lvinfo->ip, NULL, &key,
+ &data, 0));
+ *result = 1;
+ ret = 0;/* This is not an error. */
+ }
+ goto err;
+ }
+ DB_ASSERT(lvinfo->dbenv->env, data.size == sizeof(txnid2));
+ memcpy(&txnid2, data.data, data.size);
+ if (txnid == txnid2)/* The same txn already has the page. */
+ *result = 0;
+ else {/* Txn txnid is updating pages still held by txnid2. */
+ *result = -1;
+ *otxn = txnid2;
+ }
+out:
+ /* result is set to -1 on violation, 0 if already has it, 1 if added. */
+err:
+ if (buf != NULL)
+ __os_free(lvinfo->dbenv->env, buf);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __del_txn_pages __P((DB_LOG_VRFY_INFO *, u_int32_t));
+ */
+int
+__del_txn_pages(lvinfo, txnid)
+ DB_LOG_VRFY_INFO *lvinfo;
+ u_int32_t txnid;
+{
+ int ret;
+ DBT key;
+
+ ret = 0;
+ memset(&key, 0, sizeof(DBT));
+ key.data = &txnid;
+ key.size = sizeof(txnid);
+
+ BDBOP(__db_del(lvinfo->txnpg, lvinfo->ip, NULL, &key, 0));
+
+err:
+ return (ret);
+}
+
+/*
+ * __is_ancestor_txn --
+ *	Tells via res whether ptxnid is an ancestor of txnid at the
+ *	moment of lsn.
+ *
+ * PUBLIC: int __is_ancestor_txn __P((DB_LOG_VRFY_INFO *,
+ * PUBLIC: u_int32_t, u_int32_t, DB_LSN, int *));
+ */
+int
+__is_ancestor_txn(lvinfo, ptxnid, txnid, lsn, res)
+ DB_LOG_VRFY_INFO *lvinfo;
+ u_int32_t ptxnid, txnid;
+ DB_LSN lsn;
+ int *res;
+{
+ u_int32_t ptid;
+ int ret, tret;
+ DBC *csr;
+ DB *pdb;
+ DBT key, data;
+ struct __lv_txnrange tr;
+
+ ret = 0;
+ ptid = txnid;
+ csr = NULL;
+ pdb = lvinfo->txnrngs;
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+ *res = 0;
+ BDBOP(__db_cursor(pdb, lvinfo->ip, NULL, &csr, 0));
+
+ /* See if ptxnid is an ancestor of txnid. */
+ do {
+ key.data = &ptid;
+ key.size = sizeof(ptid);
+ BDBOP(__dbc_get(csr, &key, &data, DB_SET));
+		/* A txnid may be reused; we want the range containing lsn. */
+		for (; ret == 0;
+ ret = __dbc_get(csr, &key, &data, DB_NEXT_DUP)) {
+ DB_ASSERT(pdb->env, sizeof(tr) == data.size);
+ memcpy(&tr, data.data, data.size);
+ if (tr.ptxnid > 0 &&
+ LOG_COMPARE(&lsn, &(tr.begin)) >= 0 &&
+ LOG_COMPARE(&lsn, &(tr.end)) <= 0)
+ break;
+ }
+
+ if (tr.ptxnid == ptxnid) {
+ *res = 1;
+ goto out;
+ } else
+ ptid = tr.ptxnid;
+
+ } while (ptid != 0);
+out:
+
+err:
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+ if (csr != NULL && (tret = __dbc_close(csr)) != 0 && ret == 0)
+ ret = tret;
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __return_txn_pages __P((DB_LOG_VRFY_INFO *,
+ * PUBLIC: u_int32_t, u_int32_t));
+ */
+int
+__return_txn_pages(lvh, ctxn, ptxn)
+ DB_LOG_VRFY_INFO *lvh;
+ u_int32_t ctxn, ptxn;
+{
+ int ret, tret;
+ DBC *csr;
+ DB *pdb, *sdb;
+ DBT key, key2, data, data2;
+ char buf[DB_FILE_ID_LEN + sizeof(db_pgno_t)];
+
+ ret = tret = 0;
+ csr = NULL;
+ sdb = lvh->txnpg;
+ pdb = lvh->pgtxn;
+ memset(&key, 0, sizeof(DBT));
+ memset(&key2, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+ memset(&data2, 0, sizeof(DBT));
+
+ BDBOP(__db_cursor(sdb, lvh->ip, NULL, &csr, 0));
+ key.data = &ctxn;
+ key.size = sizeof(ctxn);
+ key2.data = &ptxn;
+ key2.size = sizeof(ptxn);
+ data2.data = buf;
+ data2.ulen = DB_FILE_ID_LEN + sizeof(db_pgno_t);
+ data2.flags = DB_DBT_USERMEM;
+
+ for (ret = __dbc_pget(csr, &key, &data2, &data, DB_SET); ret == 0;
+ ret = __dbc_pget(csr, &key, &data2, &data, DB_NEXT_DUP))
+ BDBOP(__db_put(pdb, lvh->ip, NULL, &data2, &key2, 0));
+ if ((ret = __del_txn_pages(lvh, ctxn)) != 0 && ret != DB_NOTFOUND)
+ goto err;
+err:
+ if (csr != NULL && (tret = __dbc_close(csr)) != 0 && ret == 0)
+ ret = tret;
+ return (ret);
+}
+
+#define ADD_ITEM(lvh, logtype) ((lvh)->logtype_names[(logtype)] = (#logtype))
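+/*
+ * For example, ADD_ITEM(lvinfo, DB___bam_split) expands to
+ * lvinfo->logtype_names[DB___bam_split] = "DB___bam_split", so the
+ * table below maps each log record type to its printable name.
+ */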
+static void
+__lv_setup_logtype_names(lvinfo)
+ DB_LOG_VRFY_INFO *lvinfo;
+{
+ ADD_ITEM(lvinfo, DB___bam_irep);
+ ADD_ITEM(lvinfo, DB___bam_split_42);
+ ADD_ITEM(lvinfo, DB___bam_split);
+ ADD_ITEM(lvinfo, DB___bam_rsplit);
+ ADD_ITEM(lvinfo, DB___bam_adj);
+ ADD_ITEM(lvinfo, DB___bam_cadjust);
+ ADD_ITEM(lvinfo, DB___bam_cdel);
+ ADD_ITEM(lvinfo, DB___bam_repl);
+ ADD_ITEM(lvinfo, DB___bam_root);
+ ADD_ITEM(lvinfo, DB___bam_curadj);
+ ADD_ITEM(lvinfo, DB___bam_rcuradj);
+ ADD_ITEM(lvinfo, DB___bam_relink_43);
+ ADD_ITEM(lvinfo, DB___bam_merge_44);
+ ADD_ITEM(lvinfo, DB___crdel_metasub);
+ ADD_ITEM(lvinfo, DB___crdel_inmem_create);
+ ADD_ITEM(lvinfo, DB___crdel_inmem_rename);
+ ADD_ITEM(lvinfo, DB___crdel_inmem_remove);
+ ADD_ITEM(lvinfo, DB___dbreg_register);
+ ADD_ITEM(lvinfo, DB___db_addrem);
+ ADD_ITEM(lvinfo, DB___db_big);
+ ADD_ITEM(lvinfo, DB___db_ovref);
+ ADD_ITEM(lvinfo, DB___db_relink_42);
+ ADD_ITEM(lvinfo, DB___db_debug);
+ ADD_ITEM(lvinfo, DB___db_noop);
+ ADD_ITEM(lvinfo, DB___db_pg_alloc_42);
+ ADD_ITEM(lvinfo, DB___db_pg_alloc);
+ ADD_ITEM(lvinfo, DB___db_pg_free_42);
+ ADD_ITEM(lvinfo, DB___db_pg_free);
+ ADD_ITEM(lvinfo, DB___db_cksum);
+ ADD_ITEM(lvinfo, DB___db_pg_freedata_42);
+ ADD_ITEM(lvinfo, DB___db_pg_freedata);
+ ADD_ITEM(lvinfo, DB___db_pg_init);
+ ADD_ITEM(lvinfo, DB___db_pg_sort_44);
+ ADD_ITEM(lvinfo, DB___db_pg_trunc);
+ ADD_ITEM(lvinfo, DB___db_realloc);
+ ADD_ITEM(lvinfo, DB___db_relink);
+ ADD_ITEM(lvinfo, DB___db_merge);
+ ADD_ITEM(lvinfo, DB___db_pgno);
+#ifdef HAVE_HASH
+ ADD_ITEM(lvinfo, DB___ham_insdel);
+ ADD_ITEM(lvinfo, DB___ham_newpage);
+ ADD_ITEM(lvinfo, DB___ham_splitdata);
+ ADD_ITEM(lvinfo, DB___ham_replace);
+ ADD_ITEM(lvinfo, DB___ham_copypage);
+ ADD_ITEM(lvinfo, DB___ham_metagroup_42);
+ ADD_ITEM(lvinfo, DB___ham_metagroup);
+ ADD_ITEM(lvinfo, DB___ham_groupalloc_42);
+ ADD_ITEM(lvinfo, DB___ham_groupalloc);
+ ADD_ITEM(lvinfo, DB___ham_changeslot);
+ ADD_ITEM(lvinfo, DB___ham_contract);
+ ADD_ITEM(lvinfo, DB___ham_curadj);
+ ADD_ITEM(lvinfo, DB___ham_chgpg);
+#endif
+#ifdef HAVE_QUEUE
+ ADD_ITEM(lvinfo, DB___qam_incfirst);
+ ADD_ITEM(lvinfo, DB___qam_mvptr);
+ ADD_ITEM(lvinfo, DB___qam_del);
+ ADD_ITEM(lvinfo, DB___qam_add);
+ ADD_ITEM(lvinfo, DB___qam_delext);
+#endif
+ ADD_ITEM(lvinfo, DB___txn_regop_42);
+ ADD_ITEM(lvinfo, DB___txn_regop);
+ ADD_ITEM(lvinfo, DB___txn_ckp_42);
+ ADD_ITEM(lvinfo, DB___txn_ckp);
+ ADD_ITEM(lvinfo, DB___txn_child);
+ ADD_ITEM(lvinfo, DB___txn_xa_regop_42);
+ ADD_ITEM(lvinfo, DB___txn_prepare);
+ ADD_ITEM(lvinfo, DB___txn_recycle);
+ ADD_ITEM(lvinfo, DB___fop_create_42);
+ ADD_ITEM(lvinfo, DB___fop_create);
+ ADD_ITEM(lvinfo, DB___fop_remove);
+ ADD_ITEM(lvinfo, DB___fop_write_42);
+ ADD_ITEM(lvinfo, DB___fop_write);
+ ADD_ITEM(lvinfo, DB___fop_rename_42);
+ ADD_ITEM(lvinfo, DB___fop_rename_noundo_46);
+ ADD_ITEM(lvinfo, DB___fop_rename);
+ ADD_ITEM(lvinfo, DB___fop_rename_noundo);
+ ADD_ITEM(lvinfo, DB___fop_file_remove);
+}
diff --git a/src/mp/mp_alloc.c b/src/mp/mp_alloc.c
new file mode 100644
index 00000000..dc331215
--- /dev/null
+++ b/src/mp/mp_alloc.c
@@ -0,0 +1,724 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+/*
+ * This configuration parameter limits the number of hash buckets which
+ * __memp_alloc() searches through while excluding buffers with a 'high'
+ * priority.
+ */
+#if !defined(MPOOL_ALLOC_SEARCH_LIMIT)
+#define MPOOL_ALLOC_SEARCH_LIMIT 500
+#endif
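+/*
+ * The limit can be overridden at build time, for example (an
+ * illustrative compiler invocation, not part of this tree's build):
+ *
+ *	cc -DMPOOL_ALLOC_SEARCH_LIMIT=1000 ...
+ */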
+
+/*
+ * __memp_alloc --
+ * Allocate some space from a cache region.
+ *
+ * PUBLIC: int __memp_alloc __P((DB_MPOOL *,
+ * PUBLIC: REGINFO *, MPOOLFILE *, size_t, roff_t *, void *));
+ */
+int
+__memp_alloc(dbmp, infop, mfp, len, offsetp, retp)
+ DB_MPOOL *dbmp;
+ REGINFO *infop;
+ MPOOLFILE *mfp;
+ size_t len;
+ roff_t *offsetp;
+ void *retp;
+{
+ BH *bhp, *current_bhp, *mvcc_bhp, *oldest_bhp;
+ BH_FROZEN_PAGE *frozen_bhp;
+ DB_LSN oldest_reader, vlsn;
+ DB_MPOOL_HASH *dbht, *hp, *hp_end, *hp_saved, *hp_tmp;
+ ENV *env;
+ MPOOL *c_mp;
+ MPOOLFILE *bh_mfp;
+ size_t freed_space;
+ u_int32_t buckets, bucket_priority, buffers, cache_reduction;
+ u_int32_t dirty_eviction, high_priority, priority, versions;
+ u_int32_t priority_saved, put_counter, lru_generation, total_buckets;
+ int aggressive, alloc_freeze, b_lock, giveup;
+ int h_locked, need_free, obsolete, ret, write_error;
+ u_int8_t *endp;
+ void *p;
+
+ env = dbmp->env;
+ c_mp = infop->primary;
+ dbht = R_ADDR(infop, c_mp->htab);
+ hp_end = &dbht[c_mp->htab_buckets];
+ hp_saved = NULL;
+ priority_saved = 0;
+ write_error = 0;
+
+ buckets = buffers = put_counter = total_buckets = versions = 0;
+ aggressive = alloc_freeze = giveup = h_locked = 0;
+
+ /*
+ * If we're allocating a buffer, and the one we're discarding is the
+ * same size, we don't want to waste the time to re-integrate it into
+ * the shared memory free list. If the DB_MPOOLFILE argument isn't
+ * NULL, we'll compare the underlying page sizes of the two buffers
+ * before free-ing and re-allocating buffers.
+ */
+ if (mfp != NULL) {
+ len = SSZA(BH, buf) + mfp->pagesize;
+ /* Add space for alignment padding for MVCC diagnostics. */
+ MVCC_BHSIZE(mfp, len);
+ }
+
+ STAT_INC(env, mpool, nallocs, c_mp->stat.st_alloc, len);
+
+ MPOOL_REGION_LOCK(env, infop);
+
+ /*
+ * First we try to allocate from free memory. If that fails, scan the
+ * buffer pool to find buffers with low priorities. We consider small
+ * sets of hash buckets each time to limit the amount of work needing
+ * to be done. This approximates LRU, but not very well. We either
+ * find a buffer of the same size to use, or we will free 3 times what
+ * we need in the hopes it will coalesce into a contiguous chunk of the
+ * right size. In the latter case we branch back here and try again.
+ */
+alloc: if ((ret = __env_alloc(infop, len, &p)) == 0) {
+ if (mfp != NULL) {
+ /*
+ * For MVCC diagnostics, align the pointer so that the
+ * buffer starts on a page boundary.
+ */
+ MVCC_BHALIGN(p);
+ bhp = (BH *)p;
+
+ if ((ret = __mutex_alloc(env, MTX_MPOOL_BH,
+ DB_MUTEX_SHARED, &bhp->mtx_buf)) != 0) {
+ MVCC_BHUNALIGN(bhp);
+ __env_alloc_free(infop, bhp);
+ goto search;
+ }
+ c_mp->pages++;
+ }
+ MPOOL_REGION_UNLOCK(env, infop);
+found: if (offsetp != NULL)
+ *offsetp = R_OFFSET(infop, p);
+ *(void **)retp = p;
+
+ /*
+ * Update the search statistics.
+ *
+		 * We're not holding the region locked here, so these
+		 * statistics can't be trusted.
+ */
+#ifdef HAVE_STATISTICS
+ total_buckets += buckets;
+ if (total_buckets != 0) {
+ if (total_buckets > c_mp->stat.st_alloc_max_buckets)
+ STAT_SET(env, mpool, alloc_max_buckets,
+ c_mp->stat.st_alloc_max_buckets,
+ total_buckets, infop->id);
+ STAT_ADJUST(env, mpool, alloc_buckets,
+ c_mp->stat.st_alloc_buckets,
+ total_buckets, infop->id);
+ }
+ if (buffers != 0) {
+ if (buffers > c_mp->stat.st_alloc_max_pages)
+ STAT_SET(env, mpool, alloc_max_pages,
+ c_mp->stat.st_alloc_max_pages,
+ buffers, infop->id);
+ STAT_ADJUST(env, mpool, alloc_pages,
+ c_mp->stat.st_alloc_pages, buffers, infop->id);
+ }
+#endif
+ return (0);
+ } else if (giveup || c_mp->pages == 0) {
+ MPOOL_REGION_UNLOCK(env, infop);
+
+ __db_errx(env, DB_STR("3017",
+ "unable to allocate space from the buffer cache"));
+ return ((ret == ENOMEM && write_error != 0) ? EIO : ret);
+ }
+
+search:
+ /*
+	 * Anything newer than 1/10th of the buffer pool is ignored during the
+	 * first MPOOL_ALLOC_SEARCH_LIMIT buckets worth of allocation.
+ */
+ cache_reduction = c_mp->pages / 10;
+ high_priority = aggressive ? MPOOL_LRU_MAX :
+ c_mp->lru_priority - cache_reduction;
+ lru_generation = c_mp->lru_generation;
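+	/*
+	 * Worked example (illustrative numbers): with c_mp->pages == 1000,
+	 * cache_reduction is 100, so in the non-aggressive case a buffer
+	 * is skipped when its priority is within 100 of the current
+	 * lru_priority counter, i.e., it was referenced recently.
+	 */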
+
+ ret = 0;
+ MAX_LSN(oldest_reader);
+
+ /*
+ * We re-attempt the allocation every time we've freed 3 times what
+ * we need. Reset our free-space counter.
+ */
+ freed_space = 0;
+ total_buckets += buckets;
+ buckets = 0;
+
+ /*
+ * Walk the hash buckets and find the next two with potentially useful
+ * buffers. Free the buffer with the lowest priority from the buckets'
+ * chains.
+ */
+ for (;;) {
+ /* All pages have been freed, make one last try */
+ if (c_mp->pages == 0)
+ goto alloc;
+
+ /* Check for wrap around. */
+ hp = &dbht[c_mp->last_checked++];
+ if (hp >= hp_end) {
+ c_mp->last_checked = 0;
+ hp = &dbht[c_mp->last_checked++];
+ }
+
+ /*
+ * The failure mode is when there are too many buffers we can't
+ * write or there's not enough memory in the system to support
+ * the number of pinned buffers.
+ *
+ * Get aggressive if we've reviewed the entire cache without
+ * freeing the needed space. (The code resets "aggressive"
+ * when we free any space.) Aggressive means:
+ *
+ * a: set a flag to attempt to flush high priority buffers as
+ * well as other buffers.
+ * b: look at a buffer in every hash bucket rather than choose
+ * the more preferable of two.
+ * c: start to think about giving up.
+ *
+ * If we get here three or more times, sync the mpool to force
+ * out queue extent pages. While we might not have enough
+ * space for what we want and flushing is expensive, why not?
+ * Then sleep for a second, hopefully someone else will run and
+ * free up some memory.
+ *
+ * Always try to allocate memory too, in case some other thread
+ * returns its memory to the region.
+ *
+		 * We have no way to know whether an allocation can ever
+		 * succeed.  Fail if no pages are returned to the cache
+		 * after we've been trying for a relatively long time.
+ *
+ * !!!
+ * This test ignores pathological cases like no buffers in the
+ * system -- we check for that early on, so it isn't possible.
+ */
+ if (buckets++ == c_mp->htab_buckets) {
+ if (freed_space > 0)
+ goto alloc;
+ MPOOL_REGION_UNLOCK(env, infop);
+
+ aggressive++;
+ /*
+ * Once aggressive, we consider all buffers. By setting
+ * this to MPOOL_LRU_MAX, we'll still select a victim
+ * even if all buffers have the highest normal priority.
+ */
+ high_priority = MPOOL_LRU_MAX;
+ PERFMON4(env, mpool, alloc_wrap,
+ len, infop->id, aggressive, c_mp->put_counter);
+ switch (aggressive) {
+ case 1:
+ break;
+ case 2:
+ put_counter = c_mp->put_counter;
+ break;
+ case 3:
+ case 4:
+ case 5:
+ case 6:
+ (void)__memp_sync_int(
+ env, NULL, 0, DB_SYNC_ALLOC, NULL, NULL);
+
+ __os_yield(env, 1, 0);
+ break;
+ default:
+ aggressive = 1;
+ if (put_counter == c_mp->put_counter)
+ giveup = 1;
+ break;
+ }
+
+ MPOOL_REGION_LOCK(env, infop);
+ goto alloc;
+ }
+
+ /*
+ * Skip empty buckets.
+ *
+ * We can check for empty buckets before locking the hash
+ * bucket as we only care if the pointer is zero or non-zero.
+ */
+ if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
+ continue;
+
+ /* Set aggressive if we have already searched for too long. */
+ if (aggressive == 0 && buckets >= MPOOL_ALLOC_SEARCH_LIMIT) {
+ aggressive = 1;
+ /* Once aggressive, we consider all buffers. */
+ high_priority = MPOOL_LRU_MAX;
+ }
+
+ /* Unlock the region and lock the hash bucket. */
+ MPOOL_REGION_UNLOCK(env, infop);
+ MUTEX_READLOCK(env, hp->mtx_hash);
+ h_locked = 1;
+ b_lock = 0;
+
+ /*
+ * Find a buffer we can use.
+ *
+ * We use the lowest-LRU singleton buffer if we find one and
+ * it's better than the result of another hash bucket we've
+ * reviewed. We do not use a buffer which has a priority
+ * greater than high_priority unless we are being aggressive.
+ *
+ * With MVCC buffers, the situation is more complicated: we
+ * don't want to free a buffer out of the middle of an MVCC
+ * chain, since that requires I/O. So, walk the buffers,
+ * looking for an obsolete buffer at the end of an MVCC chain.
+ * Once a buffer becomes obsolete, its LRU priority is
+ * irrelevant because that version can never be accessed again.
+ *
+ * If we don't find any obsolete MVCC buffers, we will get
+ * aggressive, and in that case consider the lowest priority
+ * buffer within a chain.
+ *
+ * Ignore referenced buffers, we can't get rid of them.
+ */
+retry_search: bhp = NULL;
+ bucket_priority = high_priority;
+ obsolete = 0;
+ SH_TAILQ_FOREACH(current_bhp, &hp->hash_bucket, hq, __bh) {
+ /*
+ * First, do the standard LRU check for singletons.
+ * We can use the buffer if it is unreferenced, has a
+ * priority that isn't too high (unless we are
+ * aggressive), and is better than the best candidate
+ * we have found so far in this bucket.
+ */
+#ifdef MPOOL_ALLOC_SEARCH_DYN
+ if (aggressive == 0 &&
+ ++high_priority >= c_mp->lru_priority)
+ aggressive = 1;
+#endif
+
+ if (SH_CHAIN_SINGLETON(current_bhp, vc)) {
+ if (BH_REFCOUNT(current_bhp) != 0)
+ continue;
+ buffers++;
+ if (bucket_priority > current_bhp->priority) {
+ bucket_priority = current_bhp->priority;
+ if (bhp != NULL)
+ atomic_dec(env, &bhp->ref);
+ bhp = current_bhp;
+ atomic_inc(env, &bhp->ref);
+ }
+ continue;
+ }
+
+ /*
+ * For MVCC buffers, walk through the chain. If we are
+ * aggressive, choose the best candidate from within
+ * the chain for freezing.
+ */
+ for (mvcc_bhp = oldest_bhp = current_bhp;
+ mvcc_bhp != NULL;
+ oldest_bhp = mvcc_bhp,
+ mvcc_bhp = SH_CHAIN_PREV(mvcc_bhp, vc, __bh)) {
+#ifdef MPOOL_ALLOC_SEARCH_DYN
+ if (aggressive == 0 &&
+ ++high_priority >= c_mp->lru_priority)
+ aggressive = 1;
+#endif
+ DB_ASSERT(env, mvcc_bhp !=
+ SH_CHAIN_PREV(mvcc_bhp, vc, __bh));
+ if ((aggressive < 2 &&
+ ++versions < (buffers >> 2)) ||
+ BH_REFCOUNT(mvcc_bhp) != 0)
+ continue;
+ buffers++;
+ if (!F_ISSET(mvcc_bhp, BH_FROZEN) &&
+ (bhp == NULL ||
+ bhp->priority > mvcc_bhp->priority)) {
+ if (bhp != NULL)
+ atomic_dec(env, &bhp->ref);
+ bhp = mvcc_bhp;
+ atomic_inc(env, &bhp->ref);
+ }
+ }
+
+ /*
+ * oldest_bhp is the last buffer on the MVCC chain, and
+ * an obsolete buffer at the end of the MVCC chain gets
+ * used without further search. Before checking for
+ * obsolescence, update the cached oldest reader LSN in
+			 * the bucket if it is older than this call's
+			 * oldest_reader.
+ */
+ if (BH_REFCOUNT(oldest_bhp) != 0)
+ continue;
+
+ if (LOG_COMPARE(&oldest_reader, &hp->old_reader) > 0) {
+ if (IS_MAX_LSN(oldest_reader) &&
+ (ret = __txn_oldest_reader(
+ env, &oldest_reader)) != 0) {
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ if (bhp != NULL)
+ atomic_dec(env, &bhp->ref);
+ return (ret);
+ }
+ if (LOG_COMPARE(&oldest_reader,
+ &hp->old_reader) > 0)
+ hp->old_reader = oldest_reader;
+ }
+
+ if (BH_OBSOLETE(oldest_bhp, hp->old_reader, vlsn)) {
+ if (aggressive < 2)
+ buffers++;
+ obsolete = 1;
+ if (bhp != NULL)
+ atomic_dec(env, &bhp->ref);
+ bhp = oldest_bhp;
+ atomic_inc(env, &bhp->ref);
+ goto this_buffer;
+ }
+ }
+
+ /*
+ * bhp is either NULL or the best candidate buffer.
+ * We'll use the chosen buffer only if we have compared its
+ * priority against one chosen from another hash bucket.
+ */
+ if (bhp == NULL)
+ goto next_hb;
+
+ priority = bhp->priority;
+
+ /*
+ * Compare two hash buckets and select the one with the lower
+		 * priority. Performance testing showed that looking at two
+		 * buckets improves the LRU approximation, while looking at
+		 * more helps only marginally.
+ */
+ if (hp_saved == NULL) {
+ hp_saved = hp;
+ priority_saved = priority;
+ goto next_hb;
+ }
+
+ /*
+ * If the buffer we just found is a better choice than our
+ * previous choice, use it.
+ *
+ * If the previous choice was better, pretend we're moving
+ * from this hash bucket to the previous one and re-do the
+ * search.
+ *
+ * We don't worry about simply swapping between two buckets
+ * because that could only happen if a buffer was removed
+ * from the chain, or its priority updated. If a buffer
+ * is removed from the chain, some other thread has managed
+ * to discard a buffer, so we're moving forward. Updating
+ * a buffer's priority will make it a high-priority buffer,
+ * so we'll ignore it when we search again, and so we will
+ * eventually zero in on a buffer to use, or we'll decide
+ * there are no buffers we can use.
+ *
+ * If there's only a single hash bucket with buffers, we'll
+ * search the bucket once, choose a buffer, walk the entire
+ * list of buckets and search it again. In the case of a
+ * system that's busy, it's possible to imagine a case where
+ * we'd loop for a long while. For that reason, and because
+ * the test is easy, we special case and test for it.
+ */
+ if (priority > priority_saved && hp != hp_saved) {
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ hp_tmp = hp_saved;
+ hp_saved = hp;
+ hp = hp_tmp;
+ priority_saved = priority;
+ MUTEX_READLOCK(env, hp->mtx_hash);
+ h_locked = 1;
+ DB_ASSERT(env, BH_REFCOUNT(bhp) > 0);
+ atomic_dec(env, &bhp->ref);
+ goto retry_search;
+ }
+
+ /*
+ * If another thread has called __memp_reset_lru() while we were
+ * looking for this buffer, it is possible that we've picked a
+ * poor choice for a victim. If so toss it and start over.
+ */
+ if (lru_generation != c_mp->lru_generation) {
+ DB_ASSERT(env, BH_REFCOUNT(bhp) > 0);
+ atomic_dec(env, &bhp->ref);
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ MPOOL_REGION_LOCK(env, infop);
+ hp_saved = NULL;
+ goto search;
+ }
+
+this_buffer: /*
+ * Discard any previously remembered hash bucket, we've got
+ * a winner.
+ */
+ hp_saved = NULL;
+
+ /* Drop the hash mutex and lock the buffer exclusively. */
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ h_locked = 0;
+
+ /* Don't bother trying to latch a busy buffer. */
+ if (BH_REFCOUNT(bhp) > 1)
+ goto next_hb;
+
+ /* We cannot block as the caller is probably holding locks. */
+ if ((ret = MUTEX_TRYLOCK(env, bhp->mtx_buf)) != 0) {
+ if (ret != DB_LOCK_NOTGRANTED)
+ return (ret);
+ goto next_hb;
+ }
+ F_SET(bhp, BH_EXCLUSIVE);
+ b_lock = 1;
+
+ /* Someone may have grabbed it while we got the lock. */
+ if (BH_REFCOUNT(bhp) != 1)
+ goto next_hb;
+
+ /* Find the associated MPOOLFILE. */
+ bh_mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
+
+ /* If the page is dirty, write it. */
+ ret = 0;
+ dirty_eviction = 0;
+ if (F_ISSET(bhp, BH_DIRTY)) {
+ DB_ASSERT(env, atomic_read(&hp->hash_page_dirty) > 0);
+ ret = __memp_bhwrite(dbmp, hp, bh_mfp, bhp, 0);
+ DB_ASSERT(env, atomic_read(&bhp->ref) > 0);
+
+ /*
+ * If a write fails for any reason, we can't proceed.
+ *
+ * If there's a write error and we're having problems
+ * finding something to allocate, avoid selecting this
+ * buffer again by maximizing its priority.
+ */
+ if (ret != 0) {
+ if (ret != EPERM && ret != EAGAIN) {
+ write_error++;
+ __db_errx(env, DB_STR_A("3018",
+ "%s: unwritable page %d remaining in the cache after error %d",
+ "%s %d %d"),
+ __memp_fns(dbmp, bh_mfp),
+ bhp->pgno, ret);
+ }
+ bhp->priority = MPOOL_LRU_REDZONE;
+
+ goto next_hb;
+ }
+
+ dirty_eviction = 1;
+ }
+
+ /*
+ * Freeze this buffer, if necessary. That is, if the buffer is
+ * part of an MVCC chain and could be required by a reader.
+ */
+ if (SH_CHAIN_HASPREV(bhp, vc) ||
+ (SH_CHAIN_HASNEXT(bhp, vc) && !obsolete)) {
+ if (!aggressive ||
+ F_ISSET(bhp, BH_DIRTY | BH_FROZEN))
+ goto next_hb;
+ ret = __memp_bh_freeze(
+ dbmp, infop, hp, bhp, &alloc_freeze);
+ if (ret == EIO)
+ write_error++;
+ if (ret == EBUSY || ret == EIO ||
+ ret == ENOMEM || ret == ENOSPC) {
+ ret = 0;
+ goto next_hb;
+ } else if (ret != 0) {
+ DB_ASSERT(env, BH_REFCOUNT(bhp) > 0);
+ atomic_dec(env, &bhp->ref);
+ DB_ASSERT(env, b_lock);
+ F_CLR(bhp, BH_EXCLUSIVE);
+ MUTEX_UNLOCK(env, bhp->mtx_buf);
+ DB_ASSERT(env, !h_locked);
+ return (ret);
+ }
+ }
+
+ MUTEX_LOCK(env, hp->mtx_hash);
+ h_locked = 1;
+
+ /*
+ * We released the hash bucket lock while doing I/O, so another
+ * thread may have acquired this buffer and incremented the ref
+ * count or dirtied the buffer or installed a new version after
+ * we wrote it, in which case we can't have it.
+ */
+ if (BH_REFCOUNT(bhp) != 1 || F_ISSET(bhp, BH_DIRTY) ||
+ (SH_CHAIN_HASNEXT(bhp, vc) &&
+ SH_CHAIN_NEXTP(bhp, vc, __bh)->td_off != bhp->td_off &&
+ !BH_OBSOLETE(bhp, hp->old_reader, vlsn)))
+ goto next_hb;
+
+ /*
+ * If the buffer is frozen, thaw it and look for another one
+ * we can use. (Calling __memp_bh_freeze above will not
+ * mark bhp BH_FROZEN.)
+ */
+ if (F_ISSET(bhp, BH_FROZEN)) {
+ DB_ASSERT(env, obsolete || SH_CHAIN_SINGLETON(bhp, vc));
+ DB_ASSERT(env, BH_REFCOUNT(bhp) > 0);
+ if (!F_ISSET(bhp, BH_THAWED)) {
+ /*
+ * This call releases the hash bucket mutex.
+ * We're going to retry the search, so we need
+ * to re-lock it.
+ */
+ if ((ret = __memp_bh_thaw(dbmp,
+ infop, hp, bhp, NULL)) != 0)
+ return (ret);
+ MUTEX_READLOCK(env, hp->mtx_hash);
+ } else {
+ need_free = (atomic_dec(env, &bhp->ref) == 0);
+ F_CLR(bhp, BH_EXCLUSIVE);
+ MUTEX_UNLOCK(env, bhp->mtx_buf);
+ if (need_free) {
+ MPOOL_REGION_LOCK(env, infop);
+ SH_TAILQ_INSERT_TAIL(&c_mp->free_frozen,
+ bhp, hq);
+ MPOOL_REGION_UNLOCK(env, infop);
+ }
+ }
+ bhp = NULL;
+ b_lock = alloc_freeze = 0;
+ goto retry_search;
+ }
+
+		/* We are certainly freeing this buf; update the statistics. */
+ if (dirty_eviction)
+ STAT_INC(env, mpool,
+ dirty_eviction, c_mp->stat.st_rw_evict, infop->id);
+ else
+ STAT_INC(env, mpool,
+ clean_eviction, c_mp->stat.st_ro_evict, infop->id);
+ /*
+ * If we need some empty buffer headers for freezing, turn the
+ * buffer we've found into frozen headers and put them on the
+ * free list. Only reset alloc_freeze if we've actually
+ * allocated some frozen buffer headers.
+ */
+ if (alloc_freeze) {
+ if ((ret = __memp_bhfree(dbmp,
+ infop, bh_mfp, hp, bhp, 0)) != 0)
+ return (ret);
+ b_lock = 0;
+ h_locked = 0;
+
+ MVCC_MPROTECT(bhp->buf, bh_mfp->pagesize,
+ PROT_READ | PROT_WRITE | PROT_EXEC);
+
+ MPOOL_REGION_LOCK(env, infop);
+ SH_TAILQ_INSERT_TAIL(&c_mp->alloc_frozen,
+ (BH_FROZEN_ALLOC *)bhp, links);
+ frozen_bhp = (BH_FROZEN_PAGE *)
+ ((BH_FROZEN_ALLOC *)bhp + 1);
+ endp = (u_int8_t *)bhp->buf + bh_mfp->pagesize;
+ while ((u_int8_t *)(frozen_bhp + 1) < endp) {
+ frozen_bhp->header.mtx_buf = MUTEX_INVALID;
+ SH_TAILQ_INSERT_TAIL(&c_mp->free_frozen,
+ (BH *)frozen_bhp, hq);
+ frozen_bhp++;
+ }
+ MPOOL_REGION_UNLOCK(env, infop);
+
+ alloc_freeze = 0;
+ MUTEX_READLOCK(env, hp->mtx_hash);
+ h_locked = 1;
+ goto retry_search;
+ }
+
+ /*
+ * Check to see if the buffer is the size we're looking for.
+ * If so, we can simply reuse it. Otherwise, free the buffer
+ * and its space and keep looking.
+ */
+ if (mfp != NULL && mfp->pagesize == bh_mfp->pagesize) {
+ if ((ret = __memp_bhfree(dbmp,
+ infop, bh_mfp, hp, bhp, 0)) != 0)
+ return (ret);
+ p = bhp;
+ goto found;
+ }
+
+ freed_space += sizeof(*bhp) + bh_mfp->pagesize;
+ if ((ret =
+ __memp_bhfree(dbmp, infop,
+ bh_mfp, hp, bhp, BH_FREE_FREEMEM)) != 0)
+ return (ret);
+
+ /* Reset "aggressive" and "write_error" if we free any space. */
+ if (aggressive > 1)
+ aggressive = 1;
+ write_error = 0;
+
+ /*
+ * Unlock this buffer and re-acquire the region lock. If
+ * we're reaching here as a result of calling memp_bhfree, the
+ * buffer lock has already been discarded.
+ */
+ if (0) {
+next_hb: if (bhp != NULL) {
+ DB_ASSERT(env, BH_REFCOUNT(bhp) > 0);
+ atomic_dec(env, &bhp->ref);
+ if (b_lock) {
+ F_CLR(bhp, BH_EXCLUSIVE);
+ MUTEX_UNLOCK(env, bhp->mtx_buf);
+ }
+ }
+ if (h_locked)
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ h_locked = 0;
+ }
+ MPOOL_REGION_LOCK(env, infop);
+
+ /*
+ * Retry the allocation as soon as we've freed up sufficient
+		 * space. We're likely to have to coalesce memory to satisfy
+		 * the request, so don't retry until it's at least possible
+		 * we'll succeed.
+ * we'll succeed.
+ */
+ if (freed_space >= 3 * len)
+ goto alloc;
+ }
+ /* NOTREACHED */
+}
+
+/*
+ * __memp_free --
+ * Free some space from a cache region.
+ *
+ * PUBLIC: void __memp_free __P((REGINFO *, void *));
+ */
+void
+__memp_free(infop, buf)
+ REGINFO *infop;
+ void *buf;
+{
+ __env_alloc_free(infop, buf);
+}
diff --git a/src/mp/mp_backup.c b/src/mp/mp_backup.c
new file mode 100644
index 00000000..f376cda7
--- /dev/null
+++ b/src/mp/mp_backup.c
@@ -0,0 +1,333 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/mp.h"
+#ifndef HAVE_ATOMICFILEREAD
+#include "dbinc/db_page.h"
+#endif
+
+#ifndef HAVE_ATOMICFILEREAD
+static int __memp_check_backup __P((ENV *,
+ MPOOLFILE *, void *, u_int32_t *, u_int32_t));
+#endif
+
+/*
+ * __memp_backup_open --
+ * Setup to backup a database file.
+ *
+ * PUBLIC: int __memp_backup_open __P((ENV *, DB_MPOOLFILE *,
+ * PUBLIC: const char *, const char *, u_int32_t, DB_FH **, void**));
+ */
+int
+__memp_backup_open(env, mpf, dbfile, target, flags, fpp, handlep)
+ ENV *env;
+ DB_MPOOLFILE *mpf;
+ const char *dbfile;
+ const char *target;
+ u_int32_t flags;
+ DB_FH **fpp;
+ void **handlep;
+{
+ DB_BACKUP *backup;
+#ifndef HAVE_ATOMICFILEREAD
+ MPOOLFILE *mfp;
+#endif
+ u_int32_t oflags;
+ size_t len;
+ int ret;
+ char *path;
+
+ path = NULL;
+ *fpp = NULL;
+ backup = env->backup_handle;
+ *handlep = NULL;
+
+ if (backup != NULL && backup->open != NULL)
+ ret = backup->open(env->dbenv, dbfile, target, handlep);
+ else {
+ len = strlen(target) + strlen(dbfile) + 2;
+ if ((ret = __os_malloc(env, len, &path)) != 0) {
+ __db_err(env, ret, DB_STR_A("0703",
+ "Cannot allocate space for path: %s", "%s"),
+ target);
+ goto err;
+ }
+
+ if ((ret = __os_concat_path(path, len, target, dbfile)) != 0)
+ goto err;
+
+ oflags = DB_OSO_CREATE | DB_OSO_TRUNC;
+ if (LF_ISSET(DB_EXCL))
+ FLD_SET(oflags, DB_OSO_EXCL);
+ if (backup != NULL && F_ISSET(backup, BACKUP_WRITE_DIRECT))
+ FLD_SET(oflags, DB_OSO_DIRECT);
+ ret = __os_open(env, path, 0, oflags, DB_MODE_600, fpp);
+ }
+ if (ret != 0) {
+ __db_err(env, ret, DB_STR_A("0704",
+ "Cannot open target file: %s", "%s"), path);
+ goto err;
+ }
+
+#ifndef HAVE_ATOMICFILEREAD
+ mfp = mpf->mfp;
+
+ /*
+ * Need to register thread with fail check.
+ */
+ MUTEX_LOCK(env, mfp->mtx_write);
+ if (mfp->backup_in_progress) {
+		__db_errx(env, DB_STR_A("0712",
+		    "%s is already in a backup", "%s"), dbfile);
+		ret = EINVAL;
+		MUTEX_UNLOCK(env, mfp->mtx_write);
+ goto err;
+ }
+ mfp->backup_in_progress = 1;
+ env->dbenv->thread_id(env->dbenv, &mfp->pid, &mfp->tid);
+ MUTEX_UNLOCK(env, mfp->mtx_write);
+#else
+ COMPQUIET(mpf, NULL);
+#endif
+err: if (path != NULL)
+ __os_free(env, path);
+ if (ret != 0) {
+ if (*fpp != NULL)
+ (void)__os_closehandle(env, *fpp);
+ if (backup != NULL && backup->close != NULL)
+ (void)backup->close(env->dbenv, dbfile, *handlep);
+ }
+ return (ret);
+}
+
+/*
+ * __memp_backup_mpf --
+ * Copy a database file while maintaining synchronization with
+ * mpool write activity.
+ *
+ * PUBLIC: int __memp_backup_mpf __P((ENV *, DB_MPOOLFILE *, DB_THREAD_INFO *,
+ * PUBLIC: db_pgno_t, db_pgno_t, DB_FH *, void *, u_int32_t));
+ */
+int
+__memp_backup_mpf(env, mpf, ip, first_pgno, last_pgno, fp, handle, flags)
+ ENV *env;
+ DB_MPOOLFILE *mpf;
+ DB_THREAD_INFO *ip;
+ db_pgno_t first_pgno, last_pgno;
+ DB_FH *fp;
+ void *handle;
+ u_int32_t flags;
+{
+ DB_BACKUP *backup;
+ MPOOLFILE *mfp;
+ db_pgno_t high_pgno, pgno;
+ off_t t_off;
+ u_int32_t read_count, write_size;
+ u_int32_t gigs, off;
+ size_t len, nr, nw;
+ u_int8_t *buf;
+ int ret;
+
+ COMPQUIET(flags, 0);
+ backup = env->backup_handle;
+ read_count = 0;
+ buf = NULL;
+ mfp = mpf->mfp;
+ gigs = 0;
+ off = 0;
+
+ if (backup == NULL || (len = backup->size) == 0)
+ len = MEGABYTE;
+ if ((ret = __os_malloc(env, len, &buf)) != 0)
+ return (ret);
+ write_size = (u_int32_t)(len / mfp->pagesize);
+
+ if (first_pgno > 0) {
+ t_off = (off_t)first_pgno * mfp->pagesize;
+ gigs = (u_int32_t)(t_off / GIGABYTE);
+ off = (u_int32_t)(t_off - (off_t)gigs * GIGABYTE);
+ }
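+	/*
+	 * Worked example (illustrative numbers): with a 4096-byte pagesize
+	 * and first_pgno == 1310720, t_off is exactly 5GB, so gigs == 5 and
+	 * off == 0; first_pgno == 1310721 would give gigs == 5, off == 4096.
+	 */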
+
+ for (pgno = first_pgno; pgno <= last_pgno; pgno = high_pgno + 1) {
+ high_pgno = pgno + write_size - 1;
+ if (high_pgno > last_pgno)
+ high_pgno = last_pgno;
+ len = ((high_pgno - pgno) + 1) * mfp->pagesize;
+#ifndef HAVE_ATOMICFILEREAD
+ if (ip != NULL)
+ ip->dbth_state = THREAD_ACTIVE;
+ MUTEX_LOCK(env, mfp->mtx_write);
+
+ /* Eventually the writers will drain and block on the mutex. */
+ while (atomic_read(&mfp->writers) != 0) {
+ STAT_INC_VERB(env, mpool, backup_spins,
+ mfp->stat.st_backup_spins, __memp_fn(mpf), pgno);
+ __os_yield(env, 0, 1000);
+ }
+
+ mfp->low_pgno = pgno;
+ mfp->high_pgno = high_pgno;
+ MUTEX_UNLOCK(env, mfp->mtx_write);
+ if (ip != NULL)
+ ip->dbth_state = THREAD_OUT;
+#endif
+
+ if ((ret = __os_io(env, DB_IO_READ, mpf->fhp, pgno,
+ mfp->pagesize, 0, (u_int32_t)len, buf, &nr)) != 0)
+ break;
+
+ if (nr == 0)
+ break;
+
+ if (backup != NULL && backup->write != NULL) {
+ if ((ret = backup->write(
+ env->dbenv, gigs, off, (u_int32_t)nr,
+ buf, handle)) != 0)
+ break;
+ } else {
+ if ((ret = __os_io(env, DB_IO_WRITE, fp, pgno,
+ mfp->pagesize, 0, (u_int32_t)nr, buf, &nw)) != 0)
+ break;
+ if (nr != nw) {
+ ret = EIO;
+ break;
+ }
+ }
+
+ off += (u_int32_t)nr;
+ if (off >= GIGABYTE) {
+ gigs++;
+ off -= GIGABYTE;
+ }
+
+ if (backup != NULL && backup->read_count != 0) {
+ if ((read_count += write_size) >= backup->read_count)
+ __os_yield(env, 0, backup->read_sleep);
+ }
+
+ /*
+ * There may be pages not written to the file yet. The
+ * next read will probably see the end of file.
+ */
+ if (nr != len)
+ high_pgno = pgno + (db_pgno_t)(nr / mfp->pagesize);
+ }
+ __os_free(env, buf);
+
+#ifndef HAVE_ATOMICFILEREAD
+ if (ip != NULL)
+ ip->dbth_state = THREAD_ACTIVE;
+ MUTEX_LOCK(env, mfp->mtx_write);
+ mfp->low_pgno = PGNO_INVALID;
+ mfp->high_pgno = PGNO_INVALID;
+ MUTEX_UNLOCK(env, mfp->mtx_write);
+#else
+ COMPQUIET(ip, NULL);
+#endif
+
+ return (ret);
+}
+
+/*
+ * __memp_backup_close --
+ * Close backup file.
+ *
+ * PUBLIC: int __memp_backup_close __P((ENV *, DB_MPOOLFILE *,
+ * PUBLIC: const char *, DB_FH *, void *));
+ */
+int
+__memp_backup_close(env, mpf, dbfile, fp, handle)
+ ENV *env;
+ DB_MPOOLFILE *mpf;
+ const char *dbfile;
+ DB_FH *fp;
+ void *handle;
+{
+ DB_BACKUP *backup;
+#ifndef HAVE_ATOMICFILEREAD
+ MPOOLFILE *mfp;
+#endif
+ int ret, t_ret;
+
+ backup = env->backup_handle;
+ ret = t_ret = 0;
+
+#ifndef HAVE_ATOMICFILEREAD
+ mfp = mpf->mfp;
+ MUTEX_LOCK(env, mfp->mtx_write);
+ mfp->backup_in_progress = 0;
+ MUTEX_UNLOCK(env, mfp->mtx_write);
+#else
+ COMPQUIET(mpf, NULL);
+#endif
+ if (fp != NULL)
+ ret = __os_closehandle(env, fp);
+ if (backup != NULL && backup->close != NULL)
+ t_ret = backup->close(env->dbenv, dbfile, handle);
+ return (ret == 0 ? t_ret : ret);
+}
+
+#ifndef HAVE_ATOMICFILEREAD
+/*
+ * __memp_check_backup --
+ * check for a dead thread backing up a mp file.
+ *	Check for a dead thread backing up an mpool file.
+static int
+__memp_check_backup(env, mfp, arg, countp, flags)
+ ENV *env;
+ MPOOLFILE *mfp;
+ void *arg;
+ u_int32_t *countp;
+ u_int32_t flags;
+{
+ DB_ENV *dbenv;
+ char buf[DB_THREADID_STRLEN];
+
+ COMPQUIET(arg, NULL);
+ COMPQUIET(countp, NULL);
+ COMPQUIET(flags, 0);
+
+ dbenv = env->dbenv;
+
+ if (mfp->backup_in_progress == 0 ||
+ dbenv->is_alive(dbenv, mfp->pid, mfp->tid, 0))
+ return (0);
+
+ __db_msg(env, DB_STR_A("3042", "Releasing backup of %s for %s.",
+ "%s %s"), (char *)R_ADDR(env->mp_handle->reginfo, mfp->path_off),
+ dbenv->thread_id_string(dbenv, mfp->pid, mfp->tid, buf));
+ mfp->backup_in_progress = 0;
+ return (0);
+}
+#endif
+
+/*
+ * __memp_failchk --
+ *	Remove in-process database backups.
+ * PUBLIC: int __memp_failchk __P((ENV *));
+ */
+int
+__memp_failchk(env)
+ ENV *env;
+{
+#ifdef HAVE_ATOMICFILEREAD
+ COMPQUIET(env, NULL);
+ return (0);
+#else
+ DB_MPOOL *dbmp;
+ MPOOL *mp;
+
+ dbmp = env->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+
+ return (__memp_walk_files(env, mp, __memp_check_backup, NULL, NULL, 0));
+#endif
+}
diff --git a/src/mp/mp_bh.c b/src/mp/mp_bh.c
new file mode 100644
index 00000000..1df8e206
--- /dev/null
+++ b/src/mp/mp_bh.c
@@ -0,0 +1,690 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h" /* Required for diagnostic code. */
+#include "dbinc/mp.h"
+#include "dbinc/log.h"
+#include "dbinc/txn.h"
+
+static int __memp_pgwrite
+ __P((ENV *, DB_MPOOLFILE *, DB_MPOOL_HASH *, BH *));
+
+/*
+ * __memp_bhwrite --
+ * Write the page associated with a given buffer header.
+ *
+ * PUBLIC: int __memp_bhwrite __P((DB_MPOOL *,
+ * PUBLIC: DB_MPOOL_HASH *, MPOOLFILE *, BH *, int));
+ */
+int
+__memp_bhwrite(dbmp, hp, mfp, bhp, open_extents)
+ DB_MPOOL *dbmp;
+ DB_MPOOL_HASH *hp;
+ MPOOLFILE *mfp;
+ BH *bhp;
+ int open_extents;
+{
+ DB_MPOOLFILE *dbmfp;
+ DB_MPREG *mpreg;
+ ENV *env;
+ int opened, ret;
+
+ env = dbmp->env;
+ opened = 0;
+
+ /*
+ * If the file has been removed or is a closed temporary file, we're
+ * done -- the page-write function knows how to handle the fact that
+ * we don't have (or need!) any real file descriptor information.
+ */
+ if (mfp->deadfile)
+ return (__memp_pgwrite(env, NULL, hp, bhp));
+
+ /*
+ * Walk the process' DB_MPOOLFILE list and find a file descriptor for
+ * the file. We also check that the descriptor is open for writing.
+ */
+ MUTEX_LOCK(env, dbmp->mutex);
+ TAILQ_FOREACH(dbmfp, &dbmp->dbmfq, q)
+ if (dbmfp->mfp == mfp && !F_ISSET(dbmfp, MP_READONLY)) {
+ ++dbmfp->ref;
+ break;
+ }
+ MUTEX_UNLOCK(env, dbmp->mutex);
+
+ if (dbmfp != NULL) {
+ /*
+ * Temporary files may not have been created. We only handle
+ * temporary files in this path, because only the process that
+ * created a temporary file will ever flush buffers to it.
+ */
+ if (dbmfp->fhp == NULL) {
+ /* We may not be allowed to create backing files. */
+ if (mfp->no_backing_file) {
+ --dbmfp->ref;
+ return (EPERM);
+ }
+
+ MUTEX_LOCK(env, dbmp->mutex);
+ if (dbmfp->fhp == NULL) {
+ ret = __db_tmp_open(env,
+ F_ISSET(env->dbenv, DB_ENV_DIRECT_DB) ?
+ DB_OSO_DIRECT : 0, &dbmfp->fhp);
+ } else
+ ret = 0;
+ MUTEX_UNLOCK(env, dbmp->mutex);
+ if (ret != 0) {
+ __db_errx(env, DB_STR("3014",
+ "unable to create temporary backing file"));
+ --dbmfp->ref;
+ return (ret);
+ }
+ }
+
+ goto pgwrite;
+ }
+
+ /*
+ * There's no file handle for this file in our process.
+ *
+ * !!!
+ * It's the caller's choice if we're going to open extent files.
+ */
+ if (!open_extents && F_ISSET(mfp, MP_EXTENT))
+ return (EPERM);
+
+ /*
+ * !!!
+ * Don't try to attach to temporary files. There are two problems in
+ * trying to do that. First, if we have different privileges than the
+ * process that "owns" the temporary file, we might create the backing
+ * disk file such that the owning process couldn't read/write its own
+ * buffers, e.g., memp_trickle running as root creating a file owned
+ * as root, mode 600. Second, if the temporary file has already been
+ * created, we don't have any way of finding out what its real name is,
+ * and, even if we did, it was already unlinked (so that it won't be
+ * left if the process dies horribly). This decision causes a problem,
+ * however: if the temporary file consumes the entire buffer cache,
+ * and the owner doesn't flush the buffers to disk, we could end up
+ * with resource starvation, and the memp_trickle thread couldn't do
+ * anything about it. That's a pretty unlikely scenario, though.
+ *
+ * Note we should never get here when the temporary file in question
+ * has already been closed in another process, in which case it should
+ * be marked dead.
+ */
+ if (F_ISSET(mfp, MP_TEMP) || mfp->no_backing_file)
+ return (EPERM);
+
+ /*
+ * It's not a page from a file we've opened. If the file requires
+ * application-specific input/output processing, see if this process
+ * has ever registered information as to how to write this type of
+ * file. If not, there's nothing we can do.
+ */
+ if (mfp->ftype != 0 && mfp->ftype != DB_FTYPE_SET) {
+ MUTEX_LOCK(env, dbmp->mutex);
+ LIST_FOREACH(mpreg, &dbmp->dbregq, q)
+ if (mpreg->ftype == mfp->ftype)
+ break;
+ MUTEX_UNLOCK(env, dbmp->mutex);
+ if (mpreg == NULL)
+ return (EPERM);
+ }
+
+ /*
+ * Try and open the file, specifying the known underlying shared area.
+ *
+ * !!!
+ * There's no negative cache, so we may repeatedly try and open files
+ * that we have previously tried (and failed) to open.
+ */
+ if ((ret = __memp_fcreate(env, &dbmfp)) != 0)
+ return (ret);
+ /*
+ * The open will set MP_FLUSH and so we need to keep
+ * a checkpoint from closing this before we finish with it.
+ */
+ dbmfp->ref++;
+ opened = 1;
+ if ((ret = __memp_fopen(dbmfp, mfp, NULL,
+ NULL, DB_FLUSH | DB_DURABLE_UNKNOWN, 0, mfp->pagesize)) != 0) {
+ dbmfp->ref--;
+ (void)__memp_fclose(dbmfp, 0);
+
+ /*
+ * Ignore any error if the file is marked dead, assume the file
+ * was removed from under us.
+ */
+ if (!mfp->deadfile)
+ return (ret);
+
+ dbmfp = NULL;
+ }
+
+pgwrite:
+ MVCC_MPROTECT(bhp->buf, mfp->pagesize,
+ PROT_READ | PROT_WRITE | PROT_EXEC);
+ ret = __memp_pgwrite(env, dbmfp, hp, bhp);
+ if (dbmfp == NULL)
+ return (ret);
+
+ /*
+ * Discard our reference, and, if we're the last reference, make sure
+ * the file eventually gets closed.
+ */
+ MUTEX_LOCK(env, dbmp->mutex);
+ if (!opened && dbmfp->ref == 1) {
+ /*
+		 * If we are the last reference, then we need to mark this
+		 * handle as having been used to flush. If this dbmfp has
+		 * not been counted as a neutral reference, do that now.
+		 *
+		 * Getting the mfp mutex while holding the dbmp mutex is
+		 * ok; we never acquire them in the reverse order.
+ */
+ if (!F_ISSET(dbmfp, MP_FLUSH)) {
+ F_SET(dbmfp, MP_FLUSH);
+			MUTEX_LOCK(env, dbmfp->mfp->mutex);
+ if (!F_ISSET(dbmfp, MP_FOR_FLUSH)) {
+ mfp->neutral_cnt++;
+ F_SET(dbmfp, MP_FOR_FLUSH);
+ }
+ MUTEX_UNLOCK(env, dbmfp->mfp->mutex);
+ }
+ } else
+ --dbmfp->ref;
+ MUTEX_UNLOCK(env, dbmp->mutex);
+
+ return (ret);
+}
+
+/*
+ * __memp_pgread --
+ * Read a page from a file.
+ *
+ * PUBLIC: int __memp_pgread __P((DB_MPOOLFILE *, BH *, int));
+ */
+int
+__memp_pgread(dbmfp, bhp, can_create)
+ DB_MPOOLFILE *dbmfp;
+ BH *bhp;
+ int can_create;
+{
+ ENV *env;
+ MPOOLFILE *mfp;
+ size_t len, nr;
+ u_int32_t pagesize;
+ int ret;
+
+ env = dbmfp->env;
+ mfp = dbmfp->mfp;
+ pagesize = mfp->pagesize;
+
+ /* We should never be called with a dirty or unlocked buffer. */
+ DB_ASSERT(env, !F_ISSET(bhp, BH_DIRTY_CREATE | BH_FROZEN));
+ DB_ASSERT(env, can_create ||
+ F_ISSET(bhp, BH_TRASH) || !F_ISSET(bhp, BH_DIRTY));
+ DB_ASSERT(env, F_ISSET(bhp, BH_EXCLUSIVE));
+
+ /* Mark the buffer as in transition. */
+ F_SET(bhp, BH_TRASH);
+
+ /*
+ * Temporary files may not yet have been created. We don't create
+ * them now, we create them when the pages have to be flushed.
+ */
+ nr = 0;
+ if (dbmfp->fhp != NULL) {
+ PERFMON3(env, mpool, read, __memp_fn(dbmfp), bhp->pgno, bhp);
+ if ((ret = __os_io(env, DB_IO_READ, dbmfp->fhp,
+ bhp->pgno, pagesize, 0, pagesize, bhp->buf, &nr)) != 0)
+ goto err;
+ }
+
+ /*
+ * The page may not exist; if it doesn't, nr may well be 0, but we
+ * expect the underlying OS calls not to return an error code in
+ * this case.
+ */
+ if (nr < pagesize) {
+ /*
+ * Don't output error messages for short reads. In particular,
+ * DB recovery processing may request pages never written to
+ * disk or for which only some part have been written to disk,
+		 * disk or for which only part has been written to disk,
+ * how to handle the error.
+ */
+ if (!can_create) {
+ ret = DB_PAGE_NOTFOUND;
+ goto err;
+ }
+
+ /* Clear any bytes that need to be cleared. */
+ len = mfp->clear_len == DB_CLEARLEN_NOTSET ?
+ pagesize : mfp->clear_len;
+ memset(bhp->buf, 0, len);
+
+#if defined(DIAGNOSTIC) || defined(UMRW)
+ /*
+ * If we're running in diagnostic mode, corrupt any bytes on
+ * the page that are unknown quantities for the caller.
+ */
+ if (len < pagesize)
+ memset(bhp->buf + len, CLEAR_BYTE, pagesize - len);
+#endif
+ STAT_INC_VERB(env, mpool, page_create,
+ mfp->stat.st_page_create, __memp_fn(dbmfp), bhp->pgno);
+ } else
+ STAT_INC_VERB(env, mpool, page_in,
+ mfp->stat.st_page_in, __memp_fn(dbmfp), bhp->pgno);
+
+ /* Call any pgin function. */
+ ret = mfp->ftype == 0 ? 0 : __memp_pg(dbmfp, bhp->pgno, bhp->buf, 1);
+
+ /*
+ * If no errors occurred, the data is now valid, clear the BH_TRASH
+ * flag.
+ */
+ if (ret == 0)
+ F_CLR(bhp, BH_TRASH);
+err: return (ret);
+}
+
+/*
+ * __memp_pgwrite --
+ * Write a page to a file.
+ */
+static int
+__memp_pgwrite(env, dbmfp, hp, bhp)
+ ENV *env;
+ DB_MPOOLFILE *dbmfp;
+ DB_MPOOL_HASH *hp;
+ BH *bhp;
+{
+ DB_LSN lsn;
+ MPOOLFILE *mfp;
+ size_t nw;
+ int ret;
+	void *buf;
+
+ /*
+ * Since writing does not require exclusive access, another thread
+ * could have already written this buffer.
+ */
+ if (!F_ISSET(bhp, BH_DIRTY))
+ return (0);
+
+ mfp = dbmfp == NULL ? NULL : dbmfp->mfp;
+ ret = 0;
+ buf = NULL;
+
+ /* We should never be called with a frozen or trashed buffer. */
+ DB_ASSERT(env, !F_ISSET(bhp, BH_FROZEN | BH_TRASH));
+
+ /*
+ * It's possible that the underlying file doesn't exist, either
+ * because of an outright removal or because it was a temporary
+ * file that's been closed.
+ *
+ * !!!
+ * Once we pass this point, we know that dbmfp and mfp aren't NULL,
+ * and that we have a valid file reference.
+ */
+ if (mfp == NULL || mfp->deadfile)
+ goto file_dead;
+
+ /*
+ * If the page is in a file for which we have LSN information, we have
+ * to ensure the appropriate log records are on disk.
+ */
+ if (LOGGING_ON(env) && mfp->lsn_off != DB_LSN_OFF_NOTSET &&
+ !IS_CLIENT_PGRECOVER(env)) {
+ memcpy(&lsn, bhp->buf + mfp->lsn_off, sizeof(DB_LSN));
+ if (!IS_NOT_LOGGED_LSN(lsn) &&
+ (ret = __log_flush(env, &lsn)) != 0)
+ goto err;
+ }
+
+#ifdef DIAGNOSTIC
+ /*
+ * Verify write-ahead logging semantics.
+ *
+ * !!!
+ * Two special cases. There is a single field on the meta-data page,
+ * the last-page-number-in-the-file field, for which we do not log
+ * changes. If the page was originally created in a database that
+ * didn't have logging turned on, we can see a page marked dirty but
+ * for which no corresponding log record has been written. However,
+ * the only way that a page can be created for which there isn't a
+ * previous log record and valid LSN is when the page was created
+ * without logging turned on, and so we check for that special-case
+ * LSN value.
+ *
+ * Second, when a client is reading database pages from a master
+ * during an internal backup, we may get pages modified after
+ * the current end-of-log.
+ */
+ if (LOGGING_ON(env) && !IS_NOT_LOGGED_LSN(LSN(bhp->buf)) &&
+ !IS_CLIENT_PGRECOVER(env)) {
+ /*
+ * There is a potential race here. If we are in the midst of
+ * switching log files, it's possible we could test against the
+ * old file and the new offset in the log region's LSN. If we
+ * fail the first test, acquire the log mutex and check again.
+ */
+ DB_LOG *dblp;
+ LOG *lp;
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ if (!lp->db_log_inmemory &&
+ LOG_COMPARE(&lp->s_lsn, &LSN(bhp->buf)) <= 0) {
+ MUTEX_LOCK(env, lp->mtx_flush);
+ DB_ASSERT(env, F_ISSET(env->dbenv, DB_ENV_NOLOCKING) ||
+ LOG_COMPARE(&lp->s_lsn, &LSN(bhp->buf)) > 0);
+ MUTEX_UNLOCK(env, lp->mtx_flush);
+ }
+ }
+#endif
+
+#ifndef HAVE_ATOMICFILEREAD
+ if (mfp->backup_in_progress != 0) {
+ MUTEX_READLOCK(env, mfp->mtx_write);
+ if (bhp->pgno >= mfp->low_pgno && bhp->pgno <= mfp->high_pgno) {
+ MUTEX_UNLOCK(env, mfp->mtx_write);
+ ret = EAGAIN;
+ goto err;
+ }
+ atomic_inc(env, &mfp->writers);
+ MUTEX_UNLOCK(env, mfp->mtx_write);
+ } else
+ atomic_inc(env, &mfp->writers);
+#endif
+
+ /*
+	 * Call any pgout function. If we have the page exclusive, we are
+	 * going to reuse it; otherwise, make a copy of the page so that
+	 * others can continue looking at the page while we write it.
+ */
+ buf = bhp->buf;
+ if (mfp->ftype != 0) {
+ if (F_ISSET(bhp, BH_EXCLUSIVE))
+ F_SET(bhp, BH_TRASH);
+ else {
+ if ((ret = __os_malloc(env, mfp->pagesize, &buf)) != 0)
+ goto err;
+ memcpy(buf, bhp->buf, mfp->pagesize);
+ }
+ if ((ret = __memp_pg(dbmfp, bhp->pgno, buf, 0)) != 0)
+ goto err;
+ }
+
+ PERFMON3(env, mpool, write, __memp_fn(dbmfp), bhp->pgno, bhp);
+ /* Write the page. */
+ if ((ret = __os_io(env, DB_IO_WRITE, dbmfp->fhp, bhp->pgno,
+ mfp->pagesize, 0, mfp->pagesize, buf, &nw)) != 0) {
+#ifndef HAVE_ATOMICFILEREAD
+ atomic_dec(env, &mfp->writers);
+#endif
+ __db_errx(env, DB_STR_A("3015",
+ "%s: write failed for page %lu", "%s %lu"),
+ __memp_fn(dbmfp), (u_long)bhp->pgno);
+ goto err;
+ }
+#ifndef HAVE_ATOMICFILEREAD
+ atomic_dec(env, &mfp->writers);
+#endif
+ STAT_INC_VERB(env, mpool, page_out,
+ mfp->stat.st_page_out, __memp_fn(dbmfp), bhp->pgno);
+ if (bhp->pgno > mfp->last_flushed_pgno) {
+ MUTEX_LOCK(env, mfp->mutex);
+ if (bhp->pgno > mfp->last_flushed_pgno)
+ mfp->last_flushed_pgno = bhp->pgno;
+ MUTEX_UNLOCK(env, mfp->mutex);
+ }
+
+err:
+file_dead:
+ if (buf != NULL && buf != bhp->buf)
+ __os_free(env, buf);
+ /*
+ * !!!
+	 * Once we pass this point, dbmfp and mfp may be NULL; we may not
+	 * have a valid file reference.
+ */
+
+ /*
+	 * Update the hash bucket statistics and reset the flags. If we
+	 * were successful, the page is no longer dirty. Someone else may
+	 * also have written the page, so we need to latch the hash bucket
+	 * here to get the accounting correct. Since we hold the buffer
+	 * shared, it cannot be marked dirty again until we release it.
+	 * This is the only place we update the flags field while holding
+	 * only a shared latch.
+ */
+ if (F_ISSET(bhp, BH_DIRTY | BH_TRASH)) {
+ MUTEX_LOCK(env, hp->mtx_hash);
+ DB_ASSERT(env, !SH_CHAIN_HASNEXT(bhp, vc));
+ if (ret == 0 && F_ISSET(bhp, BH_DIRTY)) {
+ F_CLR(bhp, BH_DIRTY | BH_DIRTY_CREATE);
+ DB_ASSERT(env, atomic_read(&hp->hash_page_dirty) > 0);
+ atomic_dec(env, &hp->hash_page_dirty);
+ }
+
+		/* Convert the page back with pgin if necessary. */
+ if ((ret != 0 || BH_REFCOUNT(bhp) > 1) &&
+ F_ISSET(bhp, BH_TRASH)) {
+ ret = __memp_pg(dbmfp, bhp->pgno, bhp->buf, 1);
+ F_CLR(bhp, BH_TRASH);
+ }
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ }
+
+ return (ret);
+}
+
+/*
+ * __memp_pg --
+ * Call the pgin/pgout routine.
+ *
+ * PUBLIC: int __memp_pg __P((DB_MPOOLFILE *, db_pgno_t, void *, int));
+ */
+int
+__memp_pg(dbmfp, pgno, buf, is_pgin)
+ DB_MPOOLFILE *dbmfp;
+ db_pgno_t pgno;
+ void *buf;
+ int is_pgin;
+{
+ DBT dbt, *dbtp;
+ DB_MPOOL *dbmp;
+ DB_MPREG *mpreg;
+ ENV *env;
+ MPOOLFILE *mfp;
+ int ftype, ret;
+
+ env = dbmfp->env;
+ dbmp = env->mp_handle;
+ mfp = dbmfp->mfp;
+
+ if ((ftype = mfp->ftype) == DB_FTYPE_SET)
+ mpreg = dbmp->pg_inout;
+ else {
+ MUTEX_LOCK(env, dbmp->mutex);
+ LIST_FOREACH(mpreg, &dbmp->dbregq, q)
+ if (ftype == mpreg->ftype)
+ break;
+ MUTEX_UNLOCK(env, dbmp->mutex);
+ }
+ if (mpreg == NULL)
+ return (0);
+
+ if (mfp->pgcookie_len == 0)
+ dbtp = NULL;
+ else {
+ DB_SET_DBT(dbt, R_ADDR(
+ dbmp->reginfo, mfp->pgcookie_off), mfp->pgcookie_len);
+ dbtp = &dbt;
+ }
+
+ if (is_pgin) {
+ if (mpreg->pgin != NULL && (ret =
+ mpreg->pgin(env->dbenv, pgno, buf, dbtp)) != 0)
+ goto err;
+ } else
+ if (mpreg->pgout != NULL && (ret =
+ mpreg->pgout(env->dbenv, pgno, buf, dbtp)) != 0)
+ goto err;
+
+ return (0);
+
+err: __db_errx(env, DB_STR_A("3016",
+ "%s: %s failed for page %lu", "%s %s %lu"), __memp_fn(dbmfp),
+ is_pgin ? DB_STR_P("pgin") : DB_STR_P("pgout"), (u_long)pgno);
+ return (ret);
+}
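+
+/*
+ * A minimal sketch of how an application registers the pgin/pgout
+ * callbacks dispatched above (the ftype value 1 and the callback names
+ * are illustrative; DB_ENV->memp_register is the public entry point):
+ *
+ *	static int my_pgin(DB_ENV *dbenv,
+ *	    db_pgno_t pgno, void *buf, DBT *cookie);
+ *	static int my_pgout(DB_ENV *dbenv,
+ *	    db_pgno_t pgno, void *buf, DBT *cookie);
+ *
+ *	if ((ret = dbenv->memp_register(dbenv, 1, my_pgin, my_pgout)) != 0)
+ *		goto err;
+ */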
+
+/*
+ * __memp_bhfree --
+ * Free a bucket header and its referenced data.
+ *
+ * PUBLIC: int __memp_bhfree __P((DB_MPOOL *,
+ * PUBLIC: REGINFO *, MPOOLFILE *, DB_MPOOL_HASH *, BH *, u_int32_t));
+ */
+int
+__memp_bhfree(dbmp, infop, mfp, hp, bhp, flags)
+ DB_MPOOL *dbmp;
+ REGINFO *infop;
+ MPOOLFILE *mfp;
+ DB_MPOOL_HASH *hp;
+ BH *bhp;
+ u_int32_t flags;
+{
+ ENV *env;
+#ifdef DIAGNOSTIC
+ DB_LSN vlsn;
+#endif
+ BH *prev_bhp;
+ MPOOL *c_mp;
+ int ret, t_ret;
+#ifdef DIAG_MVCC
+ size_t pagesize;
+#endif
+
+ ret = 0;
+
+ /*
+ * Assumes the hash bucket is locked and the MPOOL is not.
+ */
+ env = dbmp->env;
+#ifdef DIAG_MVCC
+ if (mfp != NULL)
+ pagesize = mfp->pagesize;
+#endif
+
+ DB_ASSERT(env, LF_ISSET(BH_FREE_UNLOCKED) ||
+ (hp != NULL && MUTEX_IS_OWNED(env, hp->mtx_hash)));
+ DB_ASSERT(env, BH_REFCOUNT(bhp) == 1 &&
+ !F_ISSET(bhp, BH_DIRTY | BH_FROZEN));
+ DB_ASSERT(env, LF_ISSET(BH_FREE_UNLOCKED) ||
+ SH_CHAIN_SINGLETON(bhp, vc) || (SH_CHAIN_HASNEXT(bhp, vc) &&
+ (SH_CHAIN_NEXTP(bhp, vc, __bh)->td_off == bhp->td_off ||
+ bhp->td_off == INVALID_ROFF ||
+ IS_MAX_LSN(*VISIBLE_LSN(env, bhp)) ||
+ BH_OBSOLETE(bhp, hp->old_reader, vlsn))));
+
+ PERFMON3(env, mpool, evict, __memp_fns(dbmp, mfp), bhp->pgno, bhp);
+
+ /*
+ * Delete the buffer header from the hash bucket queue or the
+ * version chain.
+ */
+ if (hp == NULL)
+ goto no_hp;
+ prev_bhp = SH_CHAIN_PREV(bhp, vc, __bh);
+ if (!SH_CHAIN_HASNEXT(bhp, vc)) {
+ if (prev_bhp != NULL)
+ SH_TAILQ_INSERT_AFTER(&hp->hash_bucket,
+ bhp, prev_bhp, hq, __bh);
+ SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh);
+ }
+ SH_CHAIN_REMOVE(bhp, vc, __bh);
+
+ /*
+ * Remove the reference to this buffer from the transaction that
+ * created it, if any. When the BH_FREE_UNLOCKED flag is set, we're
+ * discarding the environment, so the transaction region is already
+ * gone.
+ */
+ if (bhp->td_off != INVALID_ROFF && !LF_ISSET(BH_FREE_UNLOCKED)) {
+ ret = __txn_remove_buffer(
+ env, BH_OWNER(env, bhp), hp->mtx_hash);
+ bhp->td_off = INVALID_ROFF;
+ }
+
+ /*
+ * We're going to use the memory for something else -- it had better be
+ * accessible.
+ */
+no_hp: if (mfp != NULL)
+ MVCC_MPROTECT(bhp->buf,
+ pagesize, PROT_READ | PROT_WRITE | PROT_EXEC);
+
+ /*
+ * Discard the hash bucket's mutex, it's no longer needed, and
+ * we don't want to be holding it when acquiring other locks.
+ */
+ if (!LF_ISSET(BH_FREE_UNLOCKED))
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+
+ /*
+ * If we're only removing this header from the chain for reuse, we're
+ * done.
+ */
+ if (LF_ISSET(BH_FREE_REUSE))
+ return (ret);
+
+ /*
+ * If we're not reusing the buffer immediately, free the buffer for
+ * real.
+ */
+ if (!LF_ISSET(BH_FREE_UNLOCKED))
+ MUTEX_UNLOCK(env, bhp->mtx_buf);
+ if (LF_ISSET(BH_FREE_FREEMEM)) {
+ if ((ret = __mutex_free(env, &bhp->mtx_buf)) != 0)
+ return (ret);
+ MPOOL_REGION_LOCK(env, infop);
+
+ MVCC_BHUNALIGN(bhp);
+ __memp_free(infop, bhp);
+ c_mp = infop->primary;
+ c_mp->pages--;
+
+ MPOOL_REGION_UNLOCK(env, infop);
+ }
+
+ if (mfp == NULL)
+ return (ret);
+
+ /*
+ * Decrement the reference count of the underlying MPOOLFILE.
+ * If this is its last reference, remove it.
+ */
+ MUTEX_LOCK(env, mfp->mutex);
+ if (--mfp->block_cnt == 0 && mfp->mpf_cnt == 0) {
+ if ((t_ret = __memp_mf_discard(dbmp, mfp, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ } else
+ MUTEX_UNLOCK(env, mfp->mutex);
+
+ return (ret);
+}
diff --git a/src/mp/mp_fget.c b/src/mp/mp_fget.c
new file mode 100644
index 00000000..5f9a4bf9
--- /dev/null
+++ b/src/mp/mp_fget.c
@@ -0,0 +1,1230 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+#ifdef DIAGNOSTIC
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/lock.h"
+#endif
+
+/*
+ * __memp_fget_pp --
+ * DB_MPOOLFILE->get pre/post processing.
+ *
+ * PUBLIC: int __memp_fget_pp
+ * PUBLIC: __P((DB_MPOOLFILE *, db_pgno_t *, DB_TXN *, u_int32_t, void *));
+ */
+int
+__memp_fget_pp(dbmfp, pgnoaddr, txnp, flags, addrp)
+ DB_MPOOLFILE *dbmfp;
+ db_pgno_t *pgnoaddr;
+ DB_TXN *txnp;
+ u_int32_t flags;
+ void *addrp;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int rep_blocked, ret;
+
+ env = dbmfp->env;
+
+ MPF_ILLEGAL_BEFORE_OPEN(dbmfp, "DB_MPOOLFILE->get");
+
+ /*
+ * Validate arguments.
+ *
+ * !!!
+ * Don't test for DB_MPOOL_CREATE and DB_MPOOL_NEW flags for readonly
+ * files here, and create non-existent pages in readonly files if the
+ * flags are set, later. The reason is that the hash access method
+ * wants to get empty pages that don't really exist in readonly files.
+ * The only alternative is for hash to write the last "bucket" all the
+ * time, which we don't want to do because one of our big goals in life
+ * is to keep database files small. It's sleazy as hell, but we catch
+ * any attempt to actually write the file in memp_fput().
+ */
+#undef OKFLAGS
+#define OKFLAGS (DB_MPOOL_CREATE | DB_MPOOL_DIRTY | \
+ DB_MPOOL_EDIT | DB_MPOOL_LAST | DB_MPOOL_NEW)
+ if (flags != 0) {
+ if ((ret = __db_fchk(env, "memp_fget", flags, OKFLAGS)) != 0)
+ return (ret);
+
+ switch (FLD_CLR(flags, DB_MPOOL_DIRTY | DB_MPOOL_EDIT)) {
+ case DB_MPOOL_CREATE:
+ case DB_MPOOL_LAST:
+ case DB_MPOOL_NEW:
+ case 0:
+ break;
+ default:
+ return (__db_ferr(env, "memp_fget", 1));
+ }
+ }
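+	/*
+	 * For example (derived from the checks above): DB_MPOOL_CREATE |
+	 * DB_MPOOL_DIRTY is accepted, since after clearing the DIRTY/EDIT
+	 * bits a single base flag remains, while DB_MPOOL_NEW |
+	 * DB_MPOOL_LAST falls into the default case and returns an error.
+	 */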
+
+ ENV_ENTER(env, ip);
+
+ rep_blocked = 0;
+ if (txnp == NULL && IS_ENV_REPLICATED(env)) {
+ if ((ret = __op_rep_enter(env, 0, 1)) != 0)
+ goto err;
+ rep_blocked = 1;
+ }
+ ret = __memp_fget(dbmfp, pgnoaddr, ip, txnp, flags, addrp);
+ /*
+ * We only decrement the count in op_rep_exit if the operation fails.
+ * Otherwise the count will be decremented when the page is no longer
+ * pinned in memp_fput.
+ */
+ if (ret != 0 && rep_blocked)
+ (void)__op_rep_exit(env);
+
+ /* Similarly if an app has a page pinned it is ACTIVE. */
+err: if (ret != 0)
+ ENV_LEAVE(env, ip);
+
+ return (ret);
+}
+
+/*
+ * __memp_fget --
+ * Get a page from the file.
+ *
+ * PUBLIC: int __memp_fget __P((DB_MPOOLFILE *,
+ * PUBLIC: db_pgno_t *, DB_THREAD_INFO *, DB_TXN *, u_int32_t, void *));
+ */
+int
+__memp_fget(dbmfp, pgnoaddr, ip, txn, flags, addrp)
+ DB_MPOOLFILE *dbmfp;
+ db_pgno_t *pgnoaddr;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ u_int32_t flags;
+ void *addrp;
+{
+ enum { FIRST_FOUND, FIRST_MISS, SECOND_FOUND, SECOND_MISS } state;
+ BH *alloc_bhp, *bhp, *oldest_bhp;
+ ENV *env;
+ DB_LSN *read_lsnp, vlsn;
+ DB_MPOOL *dbmp;
+ DB_MPOOL_HASH *hp;
+ MPOOL *c_mp;
+ MPOOLFILE *mfp;
+ PIN_LIST *list, *lp;
+ REGENV *renv;
+ REGINFO *infop, *t_infop, *reginfo;
+ TXN_DETAIL *td;
+ roff_t list_off, mf_offset;
+ u_int32_t bucket, pinmax, st_hsearch;
+ int b_incr, b_lock, h_locked, dirty, extending;
+ int makecopy, mvcc, need_free, ret;
+#ifdef DIAGNOSTIC
+ DB_LOCKTAB *lt;
+ DB_LOCKER *locker;
+#endif
+
+ *(void **)addrp = NULL;
+ COMPQUIET(c_mp, NULL);
+ COMPQUIET(infop, NULL);
+
+ env = dbmfp->env;
+ dbmp = env->mp_handle;
+
+ mfp = dbmfp->mfp;
+ mvcc = atomic_read(&mfp->multiversion) && (txn != NULL);
+ mf_offset = R_OFFSET(dbmp->reginfo, mfp);
+ alloc_bhp = bhp = oldest_bhp = NULL;
+ read_lsnp = NULL;
+ td = NULL;
+ hp = NULL;
+ b_incr = b_lock = h_locked = extending = makecopy = ret = 0;
+
+ if (LF_ISSET(DB_MPOOL_DIRTY)) {
+ if (F_ISSET(dbmfp, MP_READONLY)) {
+ __db_errx(env, DB_STR_A("3021",
+ "%s: dirty flag set for readonly file page",
+ "%s"), __memp_fn(dbmfp));
+ return (EINVAL);
+ }
+ if ((ret = __db_fcchk(env, "DB_MPOOLFILE->get",
+ flags, DB_MPOOL_DIRTY, DB_MPOOL_EDIT)) != 0)
+ return (ret);
+ }
+
+ dirty = LF_ISSET(DB_MPOOL_DIRTY | DB_MPOOL_EDIT | DB_MPOOL_FREE);
+ LF_CLR(DB_MPOOL_DIRTY | DB_MPOOL_EDIT);
+
+ /*
+ * If the transaction is being used to update a multiversion database
+ * for the first time, set the read LSN. In addition, if this is an
+ * update, allocate a mutex. If no transaction has been supplied, that
+ * will be caught later, when we know whether one is required.
+ */
+ if (mvcc && txn != NULL && txn->td != NULL) {
+ /* We're only interested in the ultimate parent transaction. */
+ while (txn->parent != NULL)
+ txn = txn->parent;
+ td = (TXN_DETAIL *)txn->td;
+ if (F_ISSET(txn, TXN_SNAPSHOT)) {
+ read_lsnp = &td->read_lsn;
+ if (IS_MAX_LSN(*read_lsnp) &&
+ (ret = __log_current_lsn_int(env, read_lsnp,
+ NULL, NULL)) != 0)
+ return (ret);
+ }
+ if ((dirty || LF_ISSET(DB_MPOOL_CREATE | DB_MPOOL_NEW)) &&
+ td->mvcc_mtx == MUTEX_INVALID && (ret =
+ __mutex_alloc(env, MTX_TXN_MVCC, 0, &td->mvcc_mtx)) != 0)
+ return (ret);
+ }
+
+ switch (flags) {
+ case DB_MPOOL_LAST:
+ /* Get the last page number in the file. */
+ MUTEX_LOCK(env, mfp->mutex);
+ *pgnoaddr = mfp->last_pgno;
+ MUTEX_UNLOCK(env, mfp->mutex);
+ break;
+ case DB_MPOOL_NEW:
+ /*
+ * If always creating a page, skip the first search
+ * of the hash bucket.
+ */
+ goto newpg;
+ case DB_MPOOL_CREATE:
+ default:
+ break;
+ }
+
+ /*
+ * If mmap'ing the file and the page is not past the end of the file,
+ * just return a pointer. We can't use R_ADDR here: this is an offset
+ * into an mmap'd file, not a shared region, and doesn't change for
+ * private environments.
+ *
+ * The page may be past the end of the file, so check the page number
+ * argument against the original length of the file. If we previously
+ * returned pages past the original end of the file, last_pgno will
+ * have been updated to match the "new" end of the file, and checking
+ * against it would return pointers past the end of the mmap'd region.
+ *
+ * If another process has opened the file for writing since we mmap'd
+ * it, we will start playing the game by their rules, i.e. everything
+ * goes through the cache. All pages previously returned will be safe,
+ * as long as the correct locking protocol was observed.
+ *
+ * We don't discard the map because we don't know when all of the
+ * pages will have been discarded from the process' address space.
+ * It would be possible to do so by reference counting the open
+ * pages from the mmap, but it's unclear to me that it's worth it.
+ */
+ if (dbmfp->addr != NULL &&
+ F_ISSET(mfp, MP_CAN_MMAP) && *pgnoaddr <= mfp->orig_last_pgno) {
+ *(void **)addrp = (u_int8_t *)dbmfp->addr +
+ (*pgnoaddr * mfp->pagesize);
+ STAT_INC_VERB(env,
+ mpool, map, mfp->stat.st_map, __memp_fn(dbmfp), *pgnoaddr);
+ return (0);
+ }
+
+ /*
+ * Determine the cache and hash bucket where this page lives and get
+ * local pointers to them. These are reset on each pass through this
+ * code because the page number can change.
+ */
+ MP_GET_BUCKET(env, mfp, *pgnoaddr, &infop, hp, bucket, ret);
+ if (ret != 0)
+ return (ret);
+ c_mp = infop->primary;
+
+ if (0) {
+ /* If we search again, get an exclusive lock. */
+retry: MUTEX_LOCK(env, hp->mtx_hash);
+ }
+
+ /* Search the hash chain for the page. */
+ st_hsearch = 0;
+ h_locked = 1;
+ SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh) {
+ ++st_hsearch;
+ if (bhp->pgno != *pgnoaddr || bhp->mf_offset != mf_offset)
+ continue;
+
+ /* Snapshot reads -- get the version visible at read_lsn. */
+ if (read_lsnp != NULL) {
+ while (bhp != NULL &&
+ !BH_OWNED_BY(env, bhp, txn) &&
+ !BH_VISIBLE(env, bhp, read_lsnp, vlsn))
+ bhp = SH_CHAIN_PREV(bhp, vc, __bh);
+
+ /*
+ * We can get a null bhp if we are looking for a
+ * page that was created after the transaction
+ * started, so it's not visible (i.e., a page added
+ * to the BTREE in a subsequent txn).
+ */
+ if (bhp == NULL) {
+ ret = DB_PAGE_NOTFOUND;
+ goto err;
+ }
+ }
+
+ makecopy = mvcc && dirty && !BH_OWNED_BY(env, bhp, txn);
+
+ /*
+ * Increment the reference count. This signals that the
+ * buffer may not be discarded. We must drop the hash
+ * mutex before we lock the buffer mutex.
+ */
+ if (BH_REFCOUNT(bhp) == UINT16_MAX) {
+ __db_errx(env, DB_STR_A("3022",
+ "%s: page %lu: reference count overflow",
+ "%s %lu"), __memp_fn(dbmfp), (u_long)bhp->pgno);
+ ret = __env_panic(env, EINVAL);
+ goto err;
+ }
+ atomic_inc(env, &bhp->ref);
+ b_incr = 1;
+
+ /*
+ * Lock the buffer. If the page is being read in or modified it
+ * will be exclusively locked and we will block.
+ */
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ h_locked = 0;
+ if (dirty || extending || makecopy || F_ISSET(bhp, BH_FROZEN)) {
+xlatch: if (LF_ISSET(DB_MPOOL_TRY)) {
+ if ((ret =
+ MUTEX_TRYLOCK(env, bhp->mtx_buf)) != 0)
+ goto err;
+ } else
+ MUTEX_LOCK(env, bhp->mtx_buf);
+ F_SET(bhp, BH_EXCLUSIVE);
+ } else if (LF_ISSET(DB_MPOOL_TRY)) {
+ if ((ret = MUTEX_TRY_READLOCK(env, bhp->mtx_buf)) != 0)
+ goto err;
+ } else
+ MUTEX_READLOCK(env, bhp->mtx_buf);
+
+#ifdef HAVE_SHARED_LATCHES
+ /*
+ * If buffer is still in transit once we have a shared latch,
+ * upgrade to an exclusive latch.
+ */
+ if (F_ISSET(bhp, BH_FREED | BH_TRASH) &&
+ !F_ISSET(bhp, BH_EXCLUSIVE)) {
+ MUTEX_UNLOCK(env, bhp->mtx_buf);
+ goto xlatch;
+ }
+#else
+ F_SET(bhp, BH_EXCLUSIVE);
+#endif
+ b_lock = 1;
+
+ /*
+ * If the buffer was frozen before we waited for any I/O to
+ * complete and is still frozen, we will need to thaw it.
+ * Otherwise, it was thawed while we waited, and we need to
+ * search again.
+ */
+ if (F_ISSET(bhp, BH_THAWED)) {
+thawed: need_free = (atomic_dec(env, &bhp->ref) == 0);
+ b_incr = 0;
+ MUTEX_UNLOCK(env, bhp->mtx_buf);
+ b_lock = 0;
+ if (need_free) {
+ MPOOL_REGION_LOCK(env, infop);
+ SH_TAILQ_INSERT_TAIL(&c_mp->free_frozen,
+ bhp, hq);
+ MPOOL_REGION_UNLOCK(env, infop);
+ }
+ bhp = NULL;
+ goto retry;
+ }
+
+ /*
+ * If the buffer we wanted was frozen or thawed while we
+ * waited, we need to start again. That is indicated by
+ * a new buffer header in the version chain owned by the same
+ * transaction as the one we pinned.
+ *
+ * Also, if we're doing an unversioned read on a multiversion
+ * file, another thread may have dirtied this buffer while we
+ * swapped from the hash bucket lock to the buffer lock.
+ */
+ if (SH_CHAIN_HASNEXT(bhp, vc) &&
+ (SH_CHAIN_NEXTP(bhp, vc, __bh)->td_off == bhp->td_off ||
+ (!dirty && read_lsnp == NULL))) {
+ DB_ASSERT(env, b_incr && BH_REFCOUNT(bhp) != 0);
+ atomic_dec(env, &bhp->ref);
+ b_incr = 0;
+ MUTEX_UNLOCK(env, bhp->mtx_buf);
+ b_lock = 0;
+ bhp = NULL;
+ goto retry;
+ } else if (dirty && SH_CHAIN_HASNEXT(bhp, vc)) {
+ ret = DB_LOCK_DEADLOCK;
+ goto err;
+ } else if (F_ISSET(bhp, BH_FREED) && flags != DB_MPOOL_CREATE &&
+ flags != DB_MPOOL_NEW && flags != DB_MPOOL_FREE) {
+ ret = DB_PAGE_NOTFOUND;
+ goto err;
+ }
+
+ /* Is it worthwhile to publish oh-so-frequent cache hits? */
+ STAT_INC_VERB(env, mpool, hit,
+ mfp->stat.st_cache_hit, __memp_fn(dbmfp), *pgnoaddr);
+ break;
+ }
+
+#ifdef HAVE_STATISTICS
+ /*
+ * Update the hash bucket search statistics -- do now because our next
+ * search may be for a different bucket. Are these too frequent also?
+ */
+ STAT_INC_VERB(env, mpool, hash_search,
+ c_mp->stat.st_hash_searches, __memp_fn(dbmfp), *pgnoaddr);
+ if (st_hsearch > c_mp->stat.st_hash_longest)
+ STAT_SET_VERB(env, mpool, hash_longest,
+ c_mp->stat.st_hash_longest,
+ st_hsearch, __memp_fn(dbmfp), *pgnoaddr);
+ STAT_ADJUST_VERB(env, mpool, hash_examined,
+ c_mp->stat.st_hash_examined,
+ st_hsearch, __memp_fn(dbmfp), *pgnoaddr);
+#endif
+
+ /*
+ * There are 4 possible paths to this location:
+ *
+ * FIRST_MISS:
+ * Didn't find the page in the hash bucket on our first pass:
+ * bhp == NULL, alloc_bhp == NULL
+ *
+ * FIRST_FOUND:
+ * Found the page in the hash bucket on our first pass:
+ * bhp != NULL, alloc_bhp == NULL
+ *
+ * SECOND_FOUND:
+ * Didn't find the page in the hash bucket on the first pass,
+ * allocated space, and found the page in the hash bucket on
+ * our second pass:
+ * bhp != NULL, alloc_bhp != NULL
+ *
+ * SECOND_MISS:
+ * Didn't find the page in the hash bucket on the first pass,
+ * allocated space, and didn't find the page in the hash bucket
+ * on our second pass:
+ * bhp == NULL, alloc_bhp != NULL
+ */
+ state = bhp == NULL ?
+ (alloc_bhp == NULL ? FIRST_MISS : SECOND_MISS) :
+ (alloc_bhp == NULL ? FIRST_FOUND : SECOND_FOUND);
+
+ switch (state) {
+ case FIRST_FOUND:
+ /*
+ * If we are to free the buffer, then this had better be the
+ * only reference. If so, just free the buffer. If not,
+ * complain and get out.
+ */
+ if (flags == DB_MPOOL_FREE) {
+freebuf: MUTEX_LOCK(env, hp->mtx_hash);
+ h_locked = 1;
+ if (F_ISSET(bhp, BH_DIRTY)) {
+ F_CLR(bhp, BH_DIRTY | BH_DIRTY_CREATE);
+ DB_ASSERT(env,
+ atomic_read(&hp->hash_page_dirty) > 0);
+ atomic_dec(env, &hp->hash_page_dirty);
+ }
+
+ /*
+ * If the buffer we found is already freed, we're done.
+ * If the ref count is not 1 then someone may be
+ * peeking at the buffer. We cannot free it until they
+ * determine that it is not what they want. Clear the
+ * buffer so that waiting threads get an empty page.
+ */
+ if (F_ISSET(bhp, BH_FREED))
+ goto done;
+ else if (BH_REFCOUNT(bhp) != 1 ||
+ !SH_CHAIN_SINGLETON(bhp, vc)) {
+ /*
+ * Create an empty page in the chain for
+ * subsequent gets. Otherwise, a thread that
+ * re-creates this page while it is still in
+ * cache will see stale data.
+ */
+ F_SET(bhp, BH_FREED);
+ F_CLR(bhp, BH_TRASH);
+ } else if (F_ISSET(bhp, BH_FROZEN)) {
+ /*
+ * Freeing a singleton frozen buffer: just free
+ * it. This call will release the hash bucket
+ * mutex.
+ */
+ ret =
+ __memp_bh_thaw(dbmp, infop, hp, bhp, NULL);
+ bhp = NULL;
+ b_incr = b_lock = h_locked = 0;
+ } else {
+ ret = __memp_bhfree(dbmp, infop, mfp,
+ hp, bhp, BH_FREE_FREEMEM);
+ bhp = NULL;
+ b_incr = b_lock = h_locked = 0;
+ }
+ goto done;
+ } else if (F_ISSET(bhp, BH_FREED | BH_TRASH)) {
+revive: if (F_ISSET(bhp, BH_FREED))
+ makecopy = makecopy ||
+ (mvcc && !BH_OWNED_BY(env, bhp, txn)) ||
+ F_ISSET(bhp, BH_FROZEN);
+ if (flags == DB_MPOOL_CREATE) {
+ MUTEX_LOCK(env, mfp->mutex);
+ if (*pgnoaddr > mfp->last_pgno)
+ mfp->last_pgno = *pgnoaddr;
+ MUTEX_UNLOCK(env, mfp->mutex);
+ }
+ /* We can race with a thread trying to free this. */
+ if (F_ISSET(bhp, BH_TRASH) &&
+ *pgnoaddr <= mfp->last_pgno)
+ break;
+
+ /* Otherwise this page does not currently exist. */
+ if (flags != DB_MPOOL_CREATE && flags != DB_MPOOL_NEW) {
+ ret = DB_PAGE_NOTFOUND;
+ goto done;
+ }
+ }
+ if (mvcc) {
+ /*
+ * With multiversion databases, we might need to
+ * allocate a new buffer into which we can copy the one
+ * that we found. In that case, check the last buffer
+ * in the chain to see whether we can reuse an obsolete
+ * buffer.
+ *
+ * To provide snapshot isolation, we need to make sure
+ * that we've seen a buffer older than the oldest
+ * snapshot read LSN.
+ */
+reuse: if ((makecopy || F_ISSET(bhp, BH_FROZEN)) &&
+ !h_locked) {
+ MUTEX_LOCK(env, hp->mtx_hash);
+ h_locked = 1;
+ }
+ if ((makecopy || F_ISSET(bhp, BH_FROZEN)) &&
+ SH_CHAIN_HASPREV(bhp, vc)) {
+ oldest_bhp = SH_CHAIN_PREVP(bhp, vc, __bh);
+ while (SH_CHAIN_HASPREV(oldest_bhp, vc))
+ oldest_bhp = SH_CHAIN_PREVP(
+ oldest_bhp, vc, __bh);
+
+ if (BH_REFCOUNT(oldest_bhp) == 0 &&
+ !BH_OBSOLETE(
+ oldest_bhp, hp->old_reader, vlsn) &&
+ (ret = __txn_oldest_reader(env,
+ &hp->old_reader)) != 0)
+ goto err;
+
+ if (BH_OBSOLETE(
+ oldest_bhp, hp->old_reader, vlsn) &&
+ BH_REFCOUNT(oldest_bhp) == 0) {
+ DB_ASSERT(env,
+ !F_ISSET(oldest_bhp, BH_DIRTY));
+ atomic_inc(env, &oldest_bhp->ref);
+ if (F_ISSET(oldest_bhp, BH_FROZEN)) {
+ /*
+ * This call will release the
+ * hash bucket mutex.
+ */
+ ret = __memp_bh_thaw(dbmp,
+ infop, hp, oldest_bhp,
+ NULL);
+ h_locked = 0;
+ if (ret != 0)
+ goto err;
+ goto reuse;
+ }
+ if ((ret = __memp_bhfree(dbmp,
+ infop, mfp, hp, oldest_bhp,
+ BH_FREE_REUSE)) != 0)
+ goto err;
+ alloc_bhp = oldest_bhp;
+ h_locked = 0;
+ }
+
+ DB_ASSERT(env, alloc_bhp == NULL ||
+ !F_ISSET(alloc_bhp, BH_FROZEN));
+ }
+ }
+
+ /* We found the buffer or we're ready to copy -- we're done. */
+ if (!(makecopy || F_ISSET(bhp, BH_FROZEN)) || alloc_bhp != NULL)
+ break;
+
+ /* FALLTHROUGH */
+ case FIRST_MISS:
+ /*
+ * We didn't find the buffer in our first check. Figure out
+ * if the page exists, and allocate structures so we can add
+ * the page to the buffer pool.
+ */
+ if (h_locked)
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ h_locked = 0;
+
+ /*
+ * The buffer is not in the pool, so we don't need to free it.
+ */
+ if (LF_ISSET(DB_MPOOL_FREE) &&
+ (bhp == NULL || F_ISSET(bhp, BH_FREED) || !makecopy))
+ goto done;
+
+ if (bhp != NULL)
+ goto alloc;
+
+newpg: /*
+ * If DB_MPOOL_NEW is set, we have to allocate a page number.
+ * If neither DB_MPOOL_CREATE nor DB_MPOOL_NEW is set, then
+ * it's an error to try to get a page past the end of file.
+ */
+ DB_ASSERT(env, !h_locked);
+ MUTEX_LOCK(env, mfp->mutex);
+ switch (flags) {
+ case DB_MPOOL_NEW:
+ extending = 1;
+ if (mfp->maxpgno != 0 &&
+ mfp->last_pgno >= mfp->maxpgno) {
+ __db_errx(env, DB_STR_A("3023",
+ "%s: file limited to %lu pages", "%s %lu"),
+ __memp_fn(dbmfp), (u_long)mfp->maxpgno);
+ ret = ENOSPC;
+ } else
+ *pgnoaddr = mfp->last_pgno + 1;
+ break;
+ case DB_MPOOL_CREATE:
+ if (mfp->maxpgno != 0 && *pgnoaddr > mfp->maxpgno) {
+ __db_errx(env, DB_STR_A("3024",
+ "%s: file limited to %lu pages", "%s %lu"),
+ __memp_fn(dbmfp), (u_long)mfp->maxpgno);
+ ret = ENOSPC;
+ } else if (!extending)
+ extending = *pgnoaddr > mfp->last_pgno;
+ break;
+ default:
+ ret = *pgnoaddr > mfp->last_pgno ? DB_PAGE_NOTFOUND : 0;
+ break;
+ }
+ MUTEX_UNLOCK(env, mfp->mutex);
+ if (ret != 0)
+ goto err;
+
+ /*
+ * !!!
+ * In the DB_MPOOL_NEW code path, hp, infop and c_mp have
+ * not yet been initialized.
+ */
+ if (hp == NULL) {
+ MP_GET_BUCKET(env,
+ mfp, *pgnoaddr, &infop, hp, bucket, ret);
+ if (ret != 0)
+ goto err;
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ c_mp = infop->primary;
+ }
+
+alloc: /* Allocate a new buffer header and data space. */
+ if (alloc_bhp == NULL && (ret =
+ __memp_alloc(dbmp, infop, mfp, 0, NULL, &alloc_bhp)) != 0)
+ goto err;
+
+ /* Initialize enough so we can call __memp_bhfree. */
+ alloc_bhp->flags = 0;
+ atomic_init(&alloc_bhp->ref, 1);
+#ifdef DIAGNOSTIC
+ if ((uintptr_t)alloc_bhp->buf & (sizeof(size_t) - 1)) {
+ __db_errx(env, DB_STR("3025",
+ "DB_MPOOLFILE->get: buffer data is NOT size_t aligned"));
+ ret = __env_panic(env, EINVAL);
+ goto err;
+ }
+#endif
+
+ /*
+ * If we're doing copy-on-write, we will already have the
+ * buffer header. In that case, we don't need to search again.
+ */
+ if (bhp != NULL)
+ break;
+
+ /*
+ * If we are extending the file, we'll need the mfp lock
+ * again.
+ */
+ if (extending)
+ MUTEX_LOCK(env, mfp->mutex);
+
+ /*
+ * DB_MPOOL_NEW does not guarantee you a page unreferenced by
+ * any other thread of control. (That guarantee is interesting
+ * for DB_MPOOL_NEW, unlike DB_MPOOL_CREATE, because the caller
+ * did not specify the page number, and so may reasonably not
+ * have any way to lock the page outside of mpool.) Regardless,
+ * if we allocate the page, and some other thread of control
+ * requests the page by number, we will not detect that and the
+ * thread of control that allocated using DB_MPOOL_NEW may not
+ * have a chance to initialize the page. (Note: we *could*
+ * detect this case if we set a flag in the buffer header which
+ * guaranteed that no gets of the page would succeed until the
+ * reference count went to 0, that is, until the creating thread
+ * put the page.) What we do guarantee is that if two threads
+ * of control are both doing DB_MPOOL_NEW calls, they won't
+ * collide, that is, they won't both get the same page.
+ *
+ * There's a possibility that another thread allocated the page
+ * we were planning to allocate while we were off doing buffer
+ * allocation. We can detect that by making sure the page number
+ * we were going to use is still available. If it's not, then
+ * we check to see if the next available page number hashes to
+ * the same mpool region as the old one -- if it does, we can
+ * continue, otherwise, we have to start over.
+ */
+ if (flags == DB_MPOOL_NEW && *pgnoaddr != mfp->last_pgno + 1) {
+ *pgnoaddr = mfp->last_pgno + 1;
+ MP_GET_REGION(dbmfp, *pgnoaddr, &t_infop, ret);
+ if (ret != 0)
+ goto err;
+ if (t_infop != infop) {
+ /*
+ * flags == DB_MPOOL_NEW, so extending is set
+ * and we're holding the mfp locked.
+ */
+ MUTEX_UNLOCK(env, mfp->mutex);
+ hp = NULL;
+ goto newpg;
+ }
+ }
+
+ /*
+ * We released the mfp lock, so another thread might have
+ * extended the file. Update the last_pgno and initialize
+ * the file, as necessary, if we extended the file.
+ */
+ if (extending) {
+ if (*pgnoaddr > mfp->last_pgno)
+ mfp->last_pgno = *pgnoaddr;
+ else
+ extending = 0;
+ MUTEX_UNLOCK(env, mfp->mutex);
+ if (ret != 0)
+ goto err;
+ }
+ goto retry;
+ case SECOND_FOUND:
+ /*
+ * We allocated buffer space for the requested page, but then
+ * found the page in the buffer cache on our second check.
+ * That's OK -- we can use the page we found in the pool,
+ * unless DB_MPOOL_NEW is set. If we're about to copy-on-write,
+ * this is exactly the situation we want.
+ *
+ * For multiversion files, we may have left some pages in cache
+ * beyond the end of a file after truncating. In that case, we
+ * would get to here with extending set. If so, we need to
+ * insert the new page in the version chain similar to when
+ * we copy on write.
+ */
+ if (F_ISSET(bhp, BH_FREED) &&
+ (flags == DB_MPOOL_NEW || flags == DB_MPOOL_CREATE))
+ goto revive;
+ else if (flags == DB_MPOOL_FREE)
+ goto freebuf;
+ else if (makecopy || F_ISSET(bhp, BH_FROZEN))
+ break;
+
+ /*
+ * We can't use the page we found in the pool if DB_MPOOL_NEW
+ * was set. (For details, see the above comment beginning
+ * "DB_MPOOL_NEW does not guarantee you a page unreferenced by
+ * any other thread of control".) If DB_MPOOL_NEW is set, we
+ * release our pin on this particular buffer, and try to get
+ * another one.
+ */
+ if (flags == DB_MPOOL_NEW) {
+ DB_ASSERT(env, b_incr && BH_REFCOUNT(bhp) != 0);
+ atomic_dec(env, &bhp->ref);
+ b_incr = 0;
+ if (F_ISSET(bhp, BH_EXCLUSIVE))
+ F_CLR(bhp, BH_EXCLUSIVE);
+ MUTEX_UNLOCK(env, bhp->mtx_buf);
+ b_lock = 0;
+ bhp = NULL;
+ hp = NULL;
+ goto newpg;
+ }
+
+ break;
+ case SECOND_MISS:
+ /*
+ * We allocated buffer space for the requested page, and found
+ * the page still missing on our second pass through the buffer
+ * cache. Instantiate the page.
+ */
+ DB_ASSERT(env, alloc_bhp != NULL);
+ bhp = alloc_bhp;
+ alloc_bhp = NULL;
+
+ /*
+ * Initialize all the BH and hash bucket fields so we can call
+ * __memp_bhfree if an error occurs.
+ *
+ * Append the buffer to the tail of the bucket list.
+ */
+ bhp->priority = MPOOL_LRU_REDZONE;
+ bhp->pgno = *pgnoaddr;
+ bhp->mf_offset = mf_offset;
+ bhp->bucket = bucket;
+ bhp->region = (int)(infop - dbmp->reginfo);
+ bhp->td_off = INVALID_ROFF;
+ SH_CHAIN_INIT(bhp, vc);
+ bhp->flags = 0;
+
+ /*
+ * Reference the buffer and lock exclusive. We either
+ * need to read the buffer or create it from scratch
+ * and don't want anyone looking at it till we do.
+ */
+ MUTEX_LOCK(env, bhp->mtx_buf);
+ b_lock = 1;
+ F_SET(bhp, BH_EXCLUSIVE);
+ b_incr = 1;
+
+ /* We created a new page, it starts dirty. */
+ if (extending) {
+ atomic_inc(env, &hp->hash_page_dirty);
+ F_SET(bhp, BH_DIRTY | BH_DIRTY_CREATE);
+ }
+
+ MUTEX_REQUIRED(env, hp->mtx_hash);
+ SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq);
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ h_locked = 0;
+
+ /*
+ * If we created the page, zero it out. If we didn't create
+ * the page, read from the backing file.
+ *
+ * !!!
+ * DB_MPOOL_NEW doesn't call the pgin function.
+ *
+ * If DB_MPOOL_CREATE is used, then the application's pgin
+ * function has to be able to handle pages of 0's -- if it
+ * uses DB_MPOOL_NEW, it can detect all of its page creates,
+ * and not bother.
+ *
+ * If we're running in diagnostic mode, smash any bytes on the
+ * page that are unknown quantities for the caller.
+ *
+ * Otherwise, read the page into memory, optionally creating it
+ * if DB_MPOOL_CREATE is set.
+ */
+ if (extending) {
+ MVCC_MPROTECT(bhp->buf, mfp->pagesize,
+ PROT_READ | PROT_WRITE);
+ memset(bhp->buf, 0,
+ (mfp->clear_len == DB_CLEARLEN_NOTSET) ?
+ mfp->pagesize : mfp->clear_len);
+#if defined(DIAGNOSTIC) || defined(UMRW)
+ if (mfp->clear_len != DB_CLEARLEN_NOTSET)
+ memset(bhp->buf + mfp->clear_len, CLEAR_BYTE,
+ mfp->pagesize - mfp->clear_len);
+#endif
+
+ if (flags == DB_MPOOL_CREATE && mfp->ftype != 0 &&
+ (ret = __memp_pg(dbmfp,
+ bhp->pgno, bhp->buf, 1)) != 0)
+ goto err;
+
+ STAT_INC_VERB(env, mpool, page_create,
+ mfp->stat.st_page_create,
+ __memp_fn(dbmfp), *pgnoaddr);
+ } else {
+ F_SET(bhp, BH_TRASH);
+ STAT_INC_VERB(env, mpool, miss, mfp->stat.st_cache_miss,
+ __memp_fn(dbmfp), *pgnoaddr);
+ }
+
+ makecopy = mvcc && dirty && !extending;
+
+ /* Increment buffer count referenced by MPOOLFILE. */
+ MUTEX_LOCK(env, mfp->mutex);
+ ++mfp->block_cnt;
+ MUTEX_UNLOCK(env, mfp->mutex);
+ }
+
+ DB_ASSERT(env, bhp != NULL && BH_REFCOUNT(bhp) != 0 && b_lock);
+ DB_ASSERT(env, !F_ISSET(bhp, BH_FROZEN) || !F_ISSET(bhp, BH_FREED) ||
+ makecopy);
+
+ /* We've got a buffer header we're re-instantiating. */
+ if (F_ISSET(bhp, BH_FROZEN) && !F_ISSET(bhp, BH_FREED)) {
+ if (alloc_bhp == NULL)
+ goto reuse;
+
+ /*
+ * To thaw the buffer, we must hold the hash bucket mutex,
+ * and the call to __memp_bh_thaw will release it.
+ */
+ if (h_locked == 0)
+ MUTEX_LOCK(env, hp->mtx_hash);
+ h_locked = 1;
+
+ /*
+ * If the empty buffer has been filled in the meantime, don't
+ * overwrite it.
+ */
+ if (F_ISSET(bhp, BH_THAWED)) {
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ h_locked = 0;
+ goto thawed;
+ }
+
+ ret = __memp_bh_thaw(dbmp, infop, hp, bhp, alloc_bhp);
+ bhp = NULL;
+ b_lock = h_locked = 0;
+ if (ret != 0)
+ goto err;
+ bhp = alloc_bhp;
+ alloc_bhp = NULL;
+ MUTEX_REQUIRED(env, bhp->mtx_buf);
+ b_incr = b_lock = 1;
+ }
+
+ /*
+ * BH_TRASH --
+ * The buffer we found may need to be filled from the disk.
+ *
+ * It's possible for the read function to fail, which means we fail
+ * as well. Discard the buffer on failure unless another thread
+ * is waiting on our I/O to complete. It's OK to leave the buffer
+ * around, as the waiting thread will see the BH_TRASH flag set,
+ * and will also attempt to discard it. If there's a waiter,
+ * we need to decrement our reference count.
+ */
+ if (F_ISSET(bhp, BH_TRASH) &&
+ flags != DB_MPOOL_FREE && !F_ISSET(bhp, BH_FREED)) {
+ MVCC_MPROTECT(bhp->buf, mfp->pagesize,
+ PROT_READ | PROT_WRITE);
+ if ((ret = __memp_pgread(dbmfp,
+ bhp, LF_ISSET(DB_MPOOL_CREATE) ? 1 : 0)) != 0)
+ goto err;
+ DB_ASSERT(env, read_lsnp != NULL || !SH_CHAIN_HASNEXT(bhp, vc));
+ }
+
+ /* Copy-on-write. */
+ if (makecopy) {
+ /*
+ * If we read a page from disk that we want to modify, we now
+ * need to make a copy, so we need to allocate another buffer
+ * to hold the new copy.
+ */
+ if (alloc_bhp == NULL)
+ goto reuse;
+
+ DB_ASSERT(env, bhp != NULL && alloc_bhp != bhp);
+ DB_ASSERT(env, bhp->td_off == INVALID_ROFF ||
+ !IS_MAX_LSN(*VISIBLE_LSN(env, bhp)) ||
+ (F_ISSET(bhp, BH_FREED) && F_ISSET(bhp, BH_FROZEN)));
+ DB_ASSERT(env, txn != NULL ||
+ (F_ISSET(bhp, BH_FROZEN) && F_ISSET(bhp, BH_FREED)));
+ DB_ASSERT(env, (extending || flags == DB_MPOOL_FREE ||
+ F_ISSET(bhp, BH_FREED)) ||
+ !F_ISSET(bhp, BH_FROZEN | BH_TRASH));
+ MUTEX_REQUIRED(env, bhp->mtx_buf);
+
+ if (BH_REFCOUNT(bhp) == 1)
+ MVCC_MPROTECT(bhp->buf, mfp->pagesize,
+ PROT_READ);
+
+ atomic_init(&alloc_bhp->ref, 1);
+ MUTEX_LOCK(env, alloc_bhp->mtx_buf);
+ alloc_bhp->priority = bhp->priority;
+ alloc_bhp->pgno = bhp->pgno;
+ alloc_bhp->bucket = bhp->bucket;
+ alloc_bhp->region = bhp->region;
+ alloc_bhp->mf_offset = bhp->mf_offset;
+ alloc_bhp->td_off = INVALID_ROFF;
+ if (txn == NULL) {
+ DB_ASSERT(env,
+ F_ISSET(bhp, BH_FROZEN) && F_ISSET(bhp, BH_FREED));
+ if (bhp->td_off != INVALID_ROFF && (ret =
+ __memp_bh_settxn(dbmp, mfp, alloc_bhp,
+ BH_OWNER(env, bhp))) != 0)
+ goto err;
+ } else if ((ret =
+ __memp_bh_settxn(dbmp, mfp, alloc_bhp, td)) != 0)
+ goto err;
+ MVCC_MPROTECT(alloc_bhp->buf, mfp->pagesize,
+ PROT_READ | PROT_WRITE);
+ if (extending ||
+ F_ISSET(bhp, BH_FREED) || flags == DB_MPOOL_FREE) {
+ memset(alloc_bhp->buf, 0,
+ (mfp->clear_len == DB_CLEARLEN_NOTSET) ?
+ mfp->pagesize : mfp->clear_len);
+#if defined(DIAGNOSTIC) || defined(UMRW)
+ if (mfp->clear_len != DB_CLEARLEN_NOTSET)
+ memset(alloc_bhp->buf + mfp->clear_len,
+ CLEAR_BYTE,
+ mfp->pagesize - mfp->clear_len);
+#endif
+ if (mfp->ftype != 0 && (ret = __memp_pg(dbmfp,
+ alloc_bhp->pgno, alloc_bhp->buf, 1)) != 0)
+ goto err;
+ } else
+ memcpy(alloc_bhp->buf, bhp->buf, mfp->pagesize);
+ MVCC_MPROTECT(alloc_bhp->buf, mfp->pagesize, 0);
+
+ if (h_locked == 0)
+ MUTEX_LOCK(env, hp->mtx_hash);
+ MUTEX_REQUIRED(env, hp->mtx_hash);
+ h_locked = 1;
+
+ alloc_bhp->flags = BH_EXCLUSIVE |
+ ((flags == DB_MPOOL_FREE) ? BH_FREED :
+ F_ISSET(bhp, BH_DIRTY | BH_DIRTY_CREATE));
+ DB_ASSERT(env, flags != DB_MPOOL_FREE ||
+ !F_ISSET(bhp, BH_DIRTY));
+ F_CLR(bhp, BH_DIRTY | BH_DIRTY_CREATE);
+ DB_ASSERT(env, !SH_CHAIN_HASNEXT(bhp, vc));
+ SH_CHAIN_INSERT_AFTER(bhp, alloc_bhp, vc, __bh);
+ SH_TAILQ_INSERT_BEFORE(&hp->hash_bucket,
+ bhp, alloc_bhp, hq, __bh);
+ SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh);
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ h_locked = 0;
+ DB_ASSERT(env, b_incr && BH_REFCOUNT(bhp) > 0);
+ if (atomic_dec(env, &bhp->ref) == 0) {
+ bhp->priority = c_mp->lru_priority;
+ MVCC_MPROTECT(bhp->buf, mfp->pagesize, 0);
+ }
+ F_CLR(bhp, BH_EXCLUSIVE);
+ MUTEX_UNLOCK(env, bhp->mtx_buf);
+
+ bhp = alloc_bhp;
+ DB_ASSERT(env, BH_REFCOUNT(bhp) > 0);
+ b_incr = 1;
+ MUTEX_REQUIRED(env, bhp->mtx_buf);
+ b_lock = 1;
+
+ if (alloc_bhp != oldest_bhp) {
+ MUTEX_LOCK(env, mfp->mutex);
+ ++mfp->block_cnt;
+ MUTEX_UNLOCK(env, mfp->mutex);
+ }
+
+ alloc_bhp = NULL;
+ } else if (mvcc && extending &&
+ (ret = __memp_bh_settxn(dbmp, mfp, bhp, td)) != 0)
+ goto err;
+
+ if (flags == DB_MPOOL_FREE) {
+ DB_ASSERT(env, !SH_CHAIN_HASNEXT(bhp, vc));
+ /* If we have created an empty buffer, it is not returned. */
+ if (!F_ISSET(bhp, BH_FREED))
+ goto freebuf;
+ goto done;
+ }
+
+ /*
+ * Free the allocated memory; we no longer need it.
+ */
+ if (alloc_bhp != NULL) {
+ if ((ret = __memp_bhfree(dbmp, infop, NULL,
+ NULL, alloc_bhp, BH_FREE_FREEMEM | BH_FREE_UNLOCKED)) != 0)
+ goto err;
+ alloc_bhp = NULL;
+ }
+
+ if (dirty || extending ||
+ (F_ISSET(bhp, BH_FREED) &&
+ (flags == DB_MPOOL_CREATE || flags == DB_MPOOL_NEW))) {
+ MUTEX_REQUIRED(env, bhp->mtx_buf);
+ if (F_ISSET(bhp, BH_FREED)) {
+ DB_ASSERT(env, bhp->pgno <= mfp->last_pgno);
+ memset(bhp->buf, 0,
+ (mfp->clear_len == DB_CLEARLEN_NOTSET) ?
+ mfp->pagesize : mfp->clear_len);
+ F_CLR(bhp, BH_FREED);
+ if (mfp->ftype != 0 && (ret =
+ __memp_pg(dbmfp, bhp->pgno, bhp->buf, 1)) != 0)
+ goto err;
+ }
+ if (!F_ISSET(bhp, BH_DIRTY)) {
+#ifdef DIAGNOSTIC
+ MUTEX_LOCK(env, hp->mtx_hash);
+#endif
+ DB_ASSERT(env, !SH_CHAIN_HASNEXT(bhp, vc));
+ atomic_inc(env, &hp->hash_page_dirty);
+ F_SET(bhp, BH_DIRTY);
+#ifdef DIAGNOSTIC
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+#endif
+ }
+ } else if (F_ISSET(bhp, BH_EXCLUSIVE)) {
+ F_CLR(bhp, BH_EXCLUSIVE);
+#ifdef HAVE_SHARED_LATCHES
+ MUTEX_UNLOCK(env, bhp->mtx_buf);
+ MUTEX_READLOCK(env, bhp->mtx_buf);
+ /*
+ * If another thread has dirtied the page while we
+ * switched locks, we have to go through it all again.
+ */
+ if (SH_CHAIN_HASNEXT(bhp, vc) && read_lsnp == NULL) {
+ atomic_dec(env, &bhp->ref);
+ b_incr = 0;
+ MUTEX_UNLOCK(env, bhp->mtx_buf);
+ b_lock = 0;
+ bhp = NULL;
+ goto retry;
+ }
+#endif
+ }
+
+ MVCC_MPROTECT(bhp->buf, mfp->pagesize, PROT_READ |
+ (dirty || extending || F_ISSET(bhp, BH_DIRTY) ?
+ PROT_WRITE : 0));
+
+#ifdef DIAGNOSTIC
+ MUTEX_LOCK(env, hp->mtx_hash);
+ {
+ BH *next_bhp = SH_CHAIN_NEXT(bhp, vc, __bh);
+
+ DB_ASSERT(env, !atomic_read(&mfp->multiversion) || read_lsnp != NULL ||
+ next_bhp == NULL);
+ DB_ASSERT(env, !mvcc || read_lsnp == NULL ||
+ bhp->td_off == INVALID_ROFF || BH_OWNED_BY(env, bhp, txn) ||
+ (BH_VISIBLE(env, bhp, read_lsnp, vlsn) &&
+ (next_bhp == NULL || F_ISSET(next_bhp, BH_FROZEN) ||
+ (next_bhp->td_off != INVALID_ROFF &&
+ (BH_OWNER(env, next_bhp)->status != TXN_COMMITTED ||
+ IS_ZERO_LSN(BH_OWNER(env, next_bhp)->last_lsn) ||
+ !BH_VISIBLE(env, next_bhp, read_lsnp, vlsn))))));
+ }
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+#endif
+
+ /*
+ * Record this pin for this thread. Holding the page pinned
+ * without recording the pin is ok since we do not recover from
+ * a death from within the library itself.
+ */
+ if (ip != NULL) {
+ reginfo = env->reginfo;
+ if (ip->dbth_pincount == ip->dbth_pinmax) {
+ pinmax = ip->dbth_pinmax;
+ renv = reginfo->primary;
+ MUTEX_LOCK(env, renv->mtx_regenv);
+ if ((ret = __env_alloc(reginfo,
+ 2 * pinmax * sizeof(PIN_LIST), &list)) != 0) {
+ MUTEX_UNLOCK(env, renv->mtx_regenv);
+ goto err;
+ }
+
+ memcpy(list, R_ADDR(reginfo, ip->dbth_pinlist),
+ pinmax * sizeof(PIN_LIST));
+ memset(&list[pinmax], 0, pinmax * sizeof(PIN_LIST));
+ list_off = R_OFFSET(reginfo, list);
+ list = R_ADDR(reginfo, ip->dbth_pinlist);
+ ip->dbth_pinmax = 2 * pinmax;
+ ip->dbth_pinlist = list_off;
+ if (list != ip->dbth_pinarray)
+ __env_alloc_free(reginfo, list);
+ MUTEX_UNLOCK(env, renv->mtx_regenv);
+ }
+ list = R_ADDR(reginfo, ip->dbth_pinlist);
+ for (lp = list; lp < &list[ip->dbth_pinmax]; lp++)
+ if (lp->b_ref == INVALID_ROFF)
+ break;
+
+ ip->dbth_pincount++;
+ lp->b_ref = R_OFFSET(infop, bhp);
+ lp->region = (int)(infop - dbmp->reginfo);
+#ifdef DIAGNOSTIC
+ if (dirty && ip->dbth_locker != INVALID_ROFF &&
+ ip->dbth_check_off == 0) {
+ lt = env->lk_handle;
+ locker = (DB_LOCKER *)
+ (R_ADDR(&lt->reginfo, ip->dbth_locker));
+ DB_ASSERT(env, __db_has_pagelock(env, locker, dbmfp,
+ (PAGE*)bhp->buf, DB_LOCK_WRITE) == 0);
+ }
+#endif
+
+ }
+ /*
+ * During recovery we can read past the end of the file. Also
+ * last_pgno is not versioned, so if this is an older version
+ * that is ok as well.
+ */
+ DB_ASSERT(env, IS_RECOVERING(env) ||
+ bhp->pgno <= mfp->last_pgno || !SH_CHAIN_SINGLETON(bhp, vc));
+
+#ifdef DIAGNOSTIC
+ /* Update the file's pinned reference count. */
+ MPOOL_SYSTEM_LOCK(env);
+ ++dbmfp->pinref;
+ MPOOL_SYSTEM_UNLOCK(env);
+
+ /*
+ * We want to switch threads as often as possible, and at awkward
+ * times. Yield every time we get a new page to ensure contention.
+ */
+ if (F_ISSET(env->dbenv, DB_ENV_YIELDCPU))
+ __os_yield(env, 0, 0);
+#endif
+
+ DB_ASSERT(env, alloc_bhp == NULL);
+ DB_ASSERT(env, !(dirty || extending) ||
+ atomic_read(&hp->hash_page_dirty) > 0);
+ DB_ASSERT(env, BH_REFCOUNT(bhp) > 0 &&
+ !F_ISSET(bhp, BH_FREED | BH_FROZEN | BH_TRASH));
+
+ *(void **)addrp = bhp->buf;
+ return (0);
+
+done:
+err: /*
+ * We should only get to here with ret == 0 if freeing a buffer.
+ * In that case, check that it has in fact been freed.
+ */
+ DB_ASSERT(env, ret != 0 || flags != DB_MPOOL_FREE || bhp == NULL ||
+ (F_ISSET(bhp, BH_FREED) && !SH_CHAIN_HASNEXT(bhp, vc)));
+
+ if (bhp != NULL) {
+ if (b_incr)
+ atomic_dec(env, &bhp->ref);
+ if (b_lock) {
+ F_CLR(bhp, BH_EXCLUSIVE);
+ MUTEX_UNLOCK(env, bhp->mtx_buf);
+ }
+ }
+
+ if (h_locked)
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+
+ /* If alloc_bhp is set, free the memory. */
+ if (alloc_bhp != NULL)
+ (void)__memp_bhfree(dbmp, infop, NULL,
+ NULL, alloc_bhp, BH_FREE_FREEMEM | BH_FREE_UNLOCKED);
+
+ return (ret);
+}
diff --git a/src/mp/mp_fmethod.c b/src/mp/mp_fmethod.c
new file mode 100644
index 00000000..41bd638c
--- /dev/null
+++ b/src/mp/mp_fmethod.c
@@ -0,0 +1,589 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+
+static int __memp_get_clear_len __P((DB_MPOOLFILE *, u_int32_t *));
+static int __memp_get_lsn_offset __P((DB_MPOOLFILE *, int32_t *));
+static int __memp_get_maxsize __P((DB_MPOOLFILE *, u_int32_t *, u_int32_t *));
+static int __memp_set_maxsize __P((DB_MPOOLFILE *, u_int32_t, u_int32_t));
+static int __memp_set_priority __P((DB_MPOOLFILE *, DB_CACHE_PRIORITY));
+static int __memp_get_last_pgno_pp __P((DB_MPOOLFILE *, db_pgno_t *));
+
+/*
+ * __memp_fcreate_pp --
+ * ENV->memp_fcreate pre/post processing.
+ *
+ * PUBLIC: int __memp_fcreate_pp __P((DB_ENV *, DB_MPOOLFILE **, u_int32_t));
+ */
+int
+__memp_fcreate_pp(dbenv, retp, flags)
+ DB_ENV *dbenv;
+ DB_MPOOLFILE **retp;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ /* Validate arguments. */
+ if ((ret =
+ __db_fchk(env, "DB_ENV->memp_fcreate", flags, DB_VERIFY)) != 0)
+ return (ret);
+
+ /* We look the other way on mpool operations if we're verifying. */
+ if (REP_ON(env) && !LF_ISSET(DB_VERIFY)) {
+ __db_errx(env, DB_STR("3029",
+"DB_ENV->memp_fcreate: method not permitted when replication is configured"));
+ return (EINVAL);
+ }
+
+ ENV_ENTER(env, ip);
+ ret = __memp_fcreate(env, retp);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __memp_fcreate --
+ * ENV->memp_fcreate.
+ *
+ * PUBLIC: int __memp_fcreate __P((ENV *, DB_MPOOLFILE **));
+ */
+int
+__memp_fcreate(env, retp)
+ ENV *env;
+ DB_MPOOLFILE **retp;
+{
+ DB_MPOOLFILE *dbmfp;
+ int ret;
+
+ /* Allocate and initialize the per-process structure. */
+ if ((ret = __os_calloc(env, 1, sizeof(DB_MPOOLFILE), &dbmfp)) != 0)
+ return (ret);
+
+ dbmfp->ref = 1;
+ dbmfp->lsn_offset = DB_LSN_OFF_NOTSET;
+ dbmfp->env = env;
+ dbmfp->mfp = INVALID_ROFF;
+
+ dbmfp->close = __memp_fclose_pp;
+ dbmfp->get = __memp_fget_pp;
+ dbmfp->get_clear_len = __memp_get_clear_len;
+ dbmfp->get_fileid = __memp_get_fileid;
+ dbmfp->get_flags = __memp_get_flags;
+ dbmfp->get_ftype = __memp_get_ftype;
+ dbmfp->get_last_pgno = __memp_get_last_pgno_pp;
+ dbmfp->get_lsn_offset = __memp_get_lsn_offset;
+ dbmfp->get_maxsize = __memp_get_maxsize;
+ dbmfp->get_pgcookie = __memp_get_pgcookie;
+ dbmfp->get_priority = __memp_get_priority;
+ dbmfp->open = __memp_fopen_pp;
+ dbmfp->put = __memp_fput_pp;
+ dbmfp->set_clear_len = __memp_set_clear_len;
+ dbmfp->set_fileid = __memp_set_fileid;
+ dbmfp->set_flags = __memp_set_flags;
+ dbmfp->set_ftype = __memp_set_ftype;
+ dbmfp->set_lsn_offset = __memp_set_lsn_offset;
+ dbmfp->set_maxsize = __memp_set_maxsize;
+ dbmfp->set_pgcookie = __memp_set_pgcookie;
+ dbmfp->set_priority = __memp_set_priority;
+ dbmfp->sync = __memp_fsync_pp;
+
+ *retp = dbmfp;
+ return (0);
+}
+
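+/*
+ * A minimal usage sketch (hypothetical application code, not part of this
+ * file; "dbenv" is an open DB_ENV and "ex.db" an illustrative path):
+ * create a handle, then open a file with a 64KB pagesize.
+ *
+ *    DB_MPOOLFILE *mpf;
+ *    int ret;
+ *
+ *    if ((ret = dbenv->memp_fcreate(dbenv, &mpf, 0)) != 0)
+ *        return (ret);
+ *    if ((ret = mpf->open(mpf, "ex.db", DB_CREATE, 0600, 64 * 1024)) != 0)
+ *        return (ret);
+ */
+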
+/*
+ * __memp_get_clear_len --
+ * Get the clear length.
+ */
+static int
+__memp_get_clear_len(dbmfp, clear_lenp)
+ DB_MPOOLFILE *dbmfp;
+ u_int32_t *clear_lenp;
+{
+ *clear_lenp = dbmfp->clear_len;
+ return (0);
+}
+
+/*
+ * __memp_set_clear_len --
+ * DB_MPOOLFILE->set_clear_len.
+ *
+ * PUBLIC: int __memp_set_clear_len __P((DB_MPOOLFILE *, u_int32_t));
+ */
+int
+__memp_set_clear_len(dbmfp, clear_len)
+ DB_MPOOLFILE *dbmfp;
+ u_int32_t clear_len;
+{
+ MPF_ILLEGAL_AFTER_OPEN(dbmfp, "DB_MPOOLFILE->set_clear_len");
+
+ dbmfp->clear_len = clear_len;
+ return (0);
+}
+
+/*
+ * __memp_get_fileid --
+ * DB_MPOOLFILE->get_fileid.
+ *
+ * PUBLIC: int __memp_get_fileid __P((DB_MPOOLFILE *, u_int8_t *));
+ */
+int
+__memp_get_fileid(dbmfp, fileid)
+ DB_MPOOLFILE *dbmfp;
+ u_int8_t *fileid;
+{
+ if (!F_ISSET(dbmfp, MP_FILEID_SET)) {
+ __db_errx(dbmfp->env, DB_STR("3030",
+ "get_fileid: file ID not set"));
+ return (EINVAL);
+ }
+
+ memcpy(fileid, dbmfp->fileid, DB_FILE_ID_LEN);
+ return (0);
+}
+
+/*
+ * __memp_set_fileid --
+ * DB_MPOOLFILE->set_fileid.
+ *
+ * PUBLIC: int __memp_set_fileid __P((DB_MPOOLFILE *, u_int8_t *));
+ */
+int
+__memp_set_fileid(dbmfp, fileid)
+ DB_MPOOLFILE *dbmfp;
+ u_int8_t *fileid;
+{
+ MPF_ILLEGAL_AFTER_OPEN(dbmfp, "DB_MPOOLFILE->set_fileid");
+
+ memcpy(dbmfp->fileid, fileid, DB_FILE_ID_LEN);
+ F_SET(dbmfp, MP_FILEID_SET);
+
+ return (0);
+}
+
+/*
+ * __memp_get_flags --
+ * Get the DB_MPOOLFILE flags.
+ *
+ * PUBLIC: int __memp_get_flags __P((DB_MPOOLFILE *, u_int32_t *));
+ */
+int
+__memp_get_flags(dbmfp, flagsp)
+ DB_MPOOLFILE *dbmfp;
+ u_int32_t *flagsp;
+{
+ MPOOLFILE *mfp;
+
+ mfp = dbmfp->mfp;
+
+ *flagsp = 0;
+
+ if (mfp == NULL)
+ *flagsp = FLD_ISSET(dbmfp->config_flags,
+ DB_MPOOL_NOFILE | DB_MPOOL_UNLINK);
+ else {
+ if (mfp->no_backing_file)
+ FLD_SET(*flagsp, DB_MPOOL_NOFILE);
+ if (mfp->unlink_on_close)
+ FLD_SET(*flagsp, DB_MPOOL_UNLINK);
+ }
+ return (0);
+}
+
+/*
+ * __memp_set_flags --
+ * Set the DB_MPOOLFILE flags.
+ *
+ * PUBLIC: int __memp_set_flags __P((DB_MPOOLFILE *, u_int32_t, int));
+ */
+int
+__memp_set_flags(dbmfp, flags, onoff)
+ DB_MPOOLFILE *dbmfp;
+ u_int32_t flags;
+ int onoff;
+{
+ ENV *env;
+ MPOOLFILE *mfp;
+ int ret;
+
+ env = dbmfp->env;
+ mfp = dbmfp->mfp;
+
+ switch (flags) {
+ case DB_MPOOL_NOFILE:
+ if (mfp == NULL)
+ if (onoff)
+ FLD_SET(dbmfp->config_flags, DB_MPOOL_NOFILE);
+ else
+ FLD_CLR(dbmfp->config_flags, DB_MPOOL_NOFILE);
+ else
+ mfp->no_backing_file = onoff;
+ break;
+ case DB_MPOOL_UNLINK:
+ if (mfp == NULL)
+ if (onoff)
+ FLD_SET(dbmfp->config_flags, DB_MPOOL_UNLINK);
+ else
+ FLD_CLR(dbmfp->config_flags, DB_MPOOL_UNLINK);
+ else
+ mfp->unlink_on_close = onoff;
+ break;
+ default:
+ if ((ret = __db_fchk(env, "DB_MPOOLFILE->set_flags",
+ flags, DB_MPOOL_NOFILE | DB_MPOOL_UNLINK)) != 0)
+ return (ret);
+ break;
+ }
+ return (0);
+}
+
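+/*
+ * A usage sketch (hypothetical application code): configure a named
+ * in-memory file before opening it, so no backing file is created.
+ *
+ *    if ((ret = mpf->set_flags(mpf, DB_MPOOL_NOFILE, 1)) != 0)
+ *        return (ret);
+ */
+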
+/*
+ * __memp_get_ftype --
+ * Get the file type (as registered).
+ *
+ * PUBLIC: int __memp_get_ftype __P((DB_MPOOLFILE *, int *));
+ */
+int
+__memp_get_ftype(dbmfp, ftypep)
+ DB_MPOOLFILE *dbmfp;
+ int *ftypep;
+{
+ *ftypep = dbmfp->ftype;
+ return (0);
+}
+
+/*
+ * __memp_set_ftype --
+ * DB_MPOOLFILE->set_ftype.
+ *
+ * PUBLIC: int __memp_set_ftype __P((DB_MPOOLFILE *, int));
+ */
+int
+__memp_set_ftype(dbmfp, ftype)
+ DB_MPOOLFILE *dbmfp;
+ int ftype;
+{
+ MPF_ILLEGAL_AFTER_OPEN(dbmfp, "DB_MPOOLFILE->set_ftype");
+
+ dbmfp->ftype = ftype;
+ return (0);
+}
+
+/*
+ * __memp_get_lsn_offset --
+ * Get the page's LSN offset.
+ */
+static int
+__memp_get_lsn_offset(dbmfp, lsn_offsetp)
+ DB_MPOOLFILE *dbmfp;
+ int32_t *lsn_offsetp;
+{
+ *lsn_offsetp = dbmfp->lsn_offset;
+ return (0);
+}
+
+/*
+ * __memp_set_lsn_offset --
+ * Set the page's LSN offset.
+ *
+ * PUBLIC: int __memp_set_lsn_offset __P((DB_MPOOLFILE *, int32_t));
+ */
+int
+__memp_set_lsn_offset(dbmfp, lsn_offset)
+ DB_MPOOLFILE *dbmfp;
+ int32_t lsn_offset;
+{
+ MPF_ILLEGAL_AFTER_OPEN(dbmfp, "DB_MPOOLFILE->set_lsn_offset");
+
+ dbmfp->lsn_offset = lsn_offset;
+ return (0);
+}
+
+/*
+ * __memp_get_maxsize --
+ * Get the file's maximum size.
+ */
+static int
+__memp_get_maxsize(dbmfp, gbytesp, bytesp)
+ DB_MPOOLFILE *dbmfp;
+ u_int32_t *gbytesp, *bytesp;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ MPOOLFILE *mfp;
+
+ if ((mfp = dbmfp->mfp) == NULL) {
+ *gbytesp = dbmfp->gbytes;
+ *bytesp = dbmfp->bytes;
+ } else {
+ env = dbmfp->env;
+ ENV_ENTER(env, ip);
+
+ MUTEX_LOCK(env, mfp->mutex);
+ *gbytesp = (u_int32_t)
+ (mfp->maxpgno / (GIGABYTE / mfp->pagesize));
+ *bytesp = (u_int32_t)
+ ((mfp->maxpgno % (GIGABYTE / mfp->pagesize)) *
+ mfp->pagesize);
+ MUTEX_UNLOCK(env, mfp->mutex);
+
+ ENV_LEAVE(env, ip);
+ }
+
+ return (0);
+}
+
+/*
+ * __memp_set_maxsize --
+ * Set the file's maximum size.
+ */
+static int
+__memp_set_maxsize(dbmfp, gbytes, bytes)
+ DB_MPOOLFILE *dbmfp;
+ u_int32_t gbytes, bytes;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ MPOOLFILE *mfp;
+
+ if ((mfp = dbmfp->mfp) == NULL) {
+ dbmfp->gbytes = gbytes;
+ dbmfp->bytes = bytes;
+ } else {
+ env = dbmfp->env;
+ ENV_ENTER(env, ip);
+
+ MUTEX_LOCK(env, mfp->mutex);
+ mfp->maxpgno = (db_pgno_t)
+ (gbytes * (GIGABYTE / mfp->pagesize));
+ mfp->maxpgno += (db_pgno_t)
+ ((bytes + mfp->pagesize - 1) / mfp->pagesize);
+ MUTEX_UNLOCK(env, mfp->mutex);
+
+ ENV_LEAVE(env, ip);
+ }
+
+ return (0);
+}
+
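+/*
+ * A worked example of the conversion above, with illustrative numbers:
+ * for a 4096-byte pagesize, GIGABYTE / pagesize is 262144 pages per GB,
+ * so gbytes == 1 and bytes == 8192 yield
+ * maxpgno = 262144 + (8192 + 4095) / 4096 = 262146 pages; a trailing
+ * partial page rounds up to a whole page.
+ */
+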
+/*
+ * __memp_get_pgcookie --
+ * Get the pgin/pgout cookie.
+ *
+ * PUBLIC: int __memp_get_pgcookie __P((DB_MPOOLFILE *, DBT *));
+ */
+int
+__memp_get_pgcookie(dbmfp, pgcookie)
+ DB_MPOOLFILE *dbmfp;
+ DBT *pgcookie;
+{
+ if (dbmfp->pgcookie == NULL) {
+ pgcookie->size = 0;
+ pgcookie->data = "";
+ } else
+ memcpy(pgcookie, dbmfp->pgcookie, sizeof(DBT));
+ return (0);
+}
+
+/*
+ * __memp_set_pgcookie --
+ * Set the pgin/pgout cookie.
+ *
+ * PUBLIC: int __memp_set_pgcookie __P((DB_MPOOLFILE *, DBT *));
+ */
+int
+__memp_set_pgcookie(dbmfp, pgcookie)
+ DB_MPOOLFILE *dbmfp;
+ DBT *pgcookie;
+{
+ DBT *cookie;
+ ENV *env;
+ int ret;
+
+ MPF_ILLEGAL_AFTER_OPEN(dbmfp, "DB_MPOOLFILE->set_pgcookie");
+ env = dbmfp->env;
+
+ if ((ret = __os_calloc(env, 1, sizeof(*cookie), &cookie)) != 0)
+ return (ret);
+ if ((ret = __os_malloc(env, pgcookie->size, &cookie->data)) != 0) {
+ __os_free(env, cookie);
+ return (ret);
+ }
+
+ memcpy(cookie->data, pgcookie->data, pgcookie->size);
+ cookie->size = pgcookie->size;
+
+ dbmfp->pgcookie = cookie;
+ return (0);
+}
+
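+/*
+ * A usage sketch (hypothetical application code; app_info and app_info_len
+ * are placeholders): the cookie is opaque to mpool and is simply handed
+ * back to the registered pgin/pgout functions.
+ *
+ *    DBT cookie;
+ *
+ *    memset(&cookie, 0, sizeof(cookie));
+ *    cookie.data = app_info;
+ *    cookie.size = app_info_len;
+ *    if ((ret = mpf->set_pgcookie(mpf, &cookie)) != 0)
+ *        return (ret);
+ */
+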
+/*
+ * __memp_get_priority --
+ * Get the cache priority for pages from this file.
+ *
+ * PUBLIC: int __memp_get_priority __P((DB_MPOOLFILE *, DB_CACHE_PRIORITY *));
+ */
+int
+__memp_get_priority(dbmfp, priorityp)
+ DB_MPOOLFILE *dbmfp;
+ DB_CACHE_PRIORITY *priorityp;
+{
+ switch (dbmfp->priority) {
+ case MPOOL_PRI_VERY_LOW:
+ *priorityp = DB_PRIORITY_VERY_LOW;
+ break;
+ case MPOOL_PRI_LOW:
+ *priorityp = DB_PRIORITY_LOW;
+ break;
+ case MPOOL_PRI_DEFAULT:
+ *priorityp = DB_PRIORITY_DEFAULT;
+ break;
+ case MPOOL_PRI_HIGH:
+ *priorityp = DB_PRIORITY_HIGH;
+ break;
+ case MPOOL_PRI_VERY_HIGH:
+ *priorityp = DB_PRIORITY_VERY_HIGH;
+ break;
+ default:
+ __db_errx(dbmfp->env, DB_STR_A("3031",
+ "DB_MPOOLFILE->get_priority: unknown priority value: %d",
+ "%d"), dbmfp->priority);
+ return (EINVAL);
+ }
+
+ return (0);
+}
+
+/*
+ * __memp_set_priority --
+ * Set the cache priority for pages from this file.
+ */
+static int
+__memp_set_priority(dbmfp, priority)
+ DB_MPOOLFILE *dbmfp;
+ DB_CACHE_PRIORITY priority;
+{
+ switch (priority) {
+ case DB_PRIORITY_VERY_LOW:
+ dbmfp->priority = MPOOL_PRI_VERY_LOW;
+ break;
+ case DB_PRIORITY_LOW:
+ dbmfp->priority = MPOOL_PRI_LOW;
+ break;
+ case DB_PRIORITY_DEFAULT:
+ dbmfp->priority = MPOOL_PRI_DEFAULT;
+ break;
+ case DB_PRIORITY_HIGH:
+ dbmfp->priority = MPOOL_PRI_HIGH;
+ break;
+ case DB_PRIORITY_VERY_HIGH:
+ dbmfp->priority = MPOOL_PRI_VERY_HIGH;
+ break;
+ default:
+ __db_errx(dbmfp->env, DB_STR_A("3032",
+ "DB_MPOOLFILE->set_priority: unknown priority value: %d",
+ "%d"), priority);
+ return (EINVAL);
+ }
+
+ /* Update the underlying file if we've already opened it. */
+ if (dbmfp->mfp != NULL)
+ dbmfp->mfp->priority = dbmfp->priority;
+
+ return (0);
+}
+
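+/*
+ * A usage sketch (hypothetical application code): bias the cache toward
+ * keeping this file's pages. Works before or after open; once the file
+ * is open, the shared MPOOLFILE priority is updated as well.
+ *
+ *    if ((ret = mpf->set_priority(mpf, DB_PRIORITY_HIGH)) != 0)
+ *        return (ret);
+ */
+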
+/*
+ * __memp_get_last_pgno --
+ * Return the page number of the last page in the file.
+ *
+ * !!!
+ * The method is undocumented, but the handle is exported, and users
+ * occasionally ask for it.
+ *
+ * PUBLIC: int __memp_get_last_pgno __P((DB_MPOOLFILE *, db_pgno_t *));
+ */
+int
+__memp_get_last_pgno(dbmfp, pgnoaddr)
+ DB_MPOOLFILE *dbmfp;
+ db_pgno_t *pgnoaddr;
+{
+ ENV *env;
+ MPOOLFILE *mfp;
+
+ env = dbmfp->env;
+ mfp = dbmfp->mfp;
+
+ MUTEX_LOCK(env, mfp->mutex);
+ *pgnoaddr = mfp->last_pgno;
+ MUTEX_UNLOCK(env, mfp->mutex);
+
+ return (0);
+}
+
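+/*
+ * A usage sketch (hypothetical application code): read the current last
+ * page number; note it can change as soon as the mutex is dropped.
+ *
+ *    db_pgno_t last;
+ *
+ *    if ((ret = mpf->get_last_pgno(mpf, &last)) != 0)
+ *        return (ret);
+ */
+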
+/*
+ * __memp_get_last_pgno_pp --
+ *	Pre/post processing for __memp_get_last_pgno.
+ */
+static int
+__memp_get_last_pgno_pp(dbmfp, pgnoaddr)
+ DB_MPOOLFILE *dbmfp;
+ db_pgno_t *pgnoaddr;
+{
+ DB_THREAD_INFO *ip;
+ int ret;
+
+ ret = 0;
+ ENV_ENTER(dbmfp->env, ip);
+
+ ret = __memp_get_last_pgno(dbmfp, pgnoaddr);
+
+ ENV_LEAVE(dbmfp->env, ip);
+ return (ret);
+}
+
+/*
+ * __memp_fn --
+ * On errors we print whatever is available as the file name.
+ *
+ * PUBLIC: char * __memp_fn __P((DB_MPOOLFILE *));
+ */
+char *
+__memp_fn(dbmfp)
+ DB_MPOOLFILE *dbmfp;
+{
+ return (__memp_fns(dbmfp->env->mp_handle, dbmfp->mfp));
+}
+
+/*
+ * __memp_fns --
+ * On errors we print whatever is available as the file name.
+ *
+ * PUBLIC: char * __memp_fns __P((DB_MPOOL *, MPOOLFILE *));
+ */
+char *
+__memp_fns(dbmp, mfp)
+ DB_MPOOL *dbmp;
+ MPOOLFILE *mfp;
+{
+ if (mfp == NULL || mfp->path_off == 0)
+ return ((char *)"unknown");
+
+ return ((char *)R_ADDR(dbmp->reginfo, mfp->path_off));
+}
diff --git a/src/mp/mp_fopen.c b/src/mp/mp_fopen.c
new file mode 100644
index 00000000..ef7f886a
--- /dev/null
+++ b/src/mp/mp_fopen.c
@@ -0,0 +1,1220 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/db_page.h"
+#include "dbinc/hash.h"
+
+static int __memp_mpf_alloc __P((DB_MPOOL *,
+ DB_MPOOLFILE *, const char *, u_int32_t, u_int32_t, MPOOLFILE **));
+static int __memp_mpf_find __P((ENV *,
+ DB_MPOOLFILE *, DB_MPOOL_HASH *, const char *, u_int32_t, MPOOLFILE **));
+
+/*
+ * __memp_fopen_pp --
+ * DB_MPOOLFILE->open pre/post processing.
+ *
+ * PUBLIC: int __memp_fopen_pp
+ * PUBLIC: __P((DB_MPOOLFILE *, const char *, u_int32_t, int, size_t));
+ */
+int
+__memp_fopen_pp(dbmfp, path, flags, mode, pagesize)
+ DB_MPOOLFILE *dbmfp;
+ const char *path;
+ u_int32_t flags;
+ int mode;
+ size_t pagesize;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbmfp->env;
+
+ /* Validate arguments. */
+ if ((ret = __db_fchk(env, "DB_MPOOLFILE->open", flags,
+ DB_CREATE | DB_DIRECT | DB_EXTENT | DB_MULTIVERSION |
+ DB_NOMMAP | DB_ODDFILESIZE | DB_RDONLY | DB_TRUNCATE)) != 0)
+ return (ret);
+
+ /*
+ * Require a power-of-two pagesize no smaller than the clear length.
+ * A zero page size is only allowed when opening an existing, in-memory
+ * db.
+ */
+ if (!POWER_OF_TWO(pagesize) ||
+ (pagesize == 0 && (LF_ISSET(DB_CREATE) ||
+ !FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE)))) {
+ __db_errx(env, DB_STR("3033",
+ "DB_MPOOLFILE->open: page sizes must be a power-of-2"));
+ return (EINVAL);
+ }
+ if (pagesize != 0 && dbmfp->clear_len > pagesize) {
+ __db_errx(env, DB_STR("3034",
+ "DB_MPOOLFILE->open: clear length larger than page size"));
+ return (EINVAL);
+ }
+
+ /* Read-only checks, and local flag. */
+ if (LF_ISSET(DB_RDONLY) && path == NULL) {
+ __db_errx(env, DB_STR("3035",
+ "DB_MPOOLFILE->open: temporary files can't be readonly"));
+ return (EINVAL);
+ }
+
+ if (LF_ISSET(DB_MULTIVERSION) && !TXN_ON(env)) {
+ __db_errx(env, DB_STR("3036",
+ "DB_MPOOLFILE->open: DB_MULTIVERSION requires transactions"));
+ return (EINVAL);
+ }
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env,
+ (__memp_fopen(dbmfp, NULL,
+ path, NULL, flags, mode, pagesize)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * Generate the number of user opens. If there is no backing file
+ * there is an extra open count to keep the in-memory db around.
+ */
+#define MFP_OPEN_CNT(mfp) ((mfp)->mpf_cnt - ((mfp)->neutral_cnt + \
+ (u_int32_t)(mfp)->no_backing_file))
+/*
+ * __memp_fopen --
+ * DB_MPOOLFILE->open.
+ *
+ * PUBLIC: int __memp_fopen __P((DB_MPOOLFILE *, MPOOLFILE *,
+ * PUBLIC: const char *, const char **, u_int32_t, int, size_t));
+ */
+int
+__memp_fopen(dbmfp, mfp, path, dirp, flags, mode, pgsize)
+ DB_MPOOLFILE *dbmfp;
+ MPOOLFILE *mfp;
+ const char *path;
+ const char **dirp;
+ u_int32_t flags;
+ int mode;
+ size_t pgsize;
+{
+ DB_ENV *dbenv;
+ DB_MPOOL *dbmp;
+ DB_MPOOLFILE *tmp_dbmfp;
+ DB_MPOOL_HASH *hp;
+ ENV *env;
+ MPOOL *mp;
+ MPOOLFILE *alloc_mfp;
+ size_t maxmap;
+ db_pgno_t last_pgno;
+ u_int32_t bucket, mbytes, bytes, oflags, pagesize;
+ int refinc, ret, isdir;
+ char *rpath;
+
+ /* If this handle is already open, return. */
+ if (F_ISSET(dbmfp, MP_OPEN_CALLED))
+ return (0);
+
+ env = dbmfp->env;
+ dbmp = env->mp_handle;
+ dbenv = env->dbenv;
+ mp = dbmp->reginfo[0].primary;
+ alloc_mfp = NULL;
+ mbytes = bytes = 0;
+ refinc = ret = isdir = 0;
+ rpath = NULL;
+
+ /*
+ * We're keeping the page size as a size_t in the public API, but
+ * it's a u_int32_t everywhere internally.
+ */
+ pagesize = (u_int32_t)pgsize;
+
+ /*
+ * We're called internally with a specified mfp, in which case the
+ * path is NULL, but we'll get the path from the underlying region
+ * information. Otherwise, if the path is NULL, it's a temporary
+ * file -- we know we can't join any existing files, and we'll delay
+ * the open until we actually need to write the file. All temporary
+ * files will go into the first hash bucket.
+ */
+ DB_ASSERT(env, mfp == NULL || path == NULL);
+
+ bucket = 0;
+ hp = R_ADDR(dbmp->reginfo, mp->ftab);
+ if (mfp == NULL) {
+ if (path == NULL)
+ goto alloc;
+
+ /*
+ * If fileid is not set but the file exists on the disk,
+ * we try to use __os_fileid to set it. We do this
+ * because we want to use the fileid to check if we have
+ * opened the mpoolfile as early as possible.
+ *
+ * Note: DB layer always calls __memp_fopen with fileid set,
+ * so this is only for using mpool api to open a file.
+ */
+
+ if (!FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE) &&
+ !F_ISSET(dbmfp, MP_FILEID_SET)) {
+ if ((ret = __db_appname(env,
+ DB_APP_DATA, path, dirp, &rpath)) != 0)
+ goto err;
+ ret = __os_exists(env, rpath, &isdir);
+ if (ret == 0 && isdir) {
+ ret = EINVAL;
+ goto err;
+ } else if (ret == 0) {
+ if ((ret = __os_fileid(env,
+ rpath, 0, dbmfp->fileid)) != 0)
+ goto err;
+ F_SET(dbmfp, MP_FILEID_SET);
+ }
+ }
+
+ /*
+ * Hash to the proper file table entry and walk it.
+ *
+ * The fileID is a filesystem unique number (e.g., a
+ * UNIX dev/inode pair) plus a timestamp. If files are
+ * removed and created in less than a second, the fileID
+ * can be repeated. The problem with repetition happens
+ * when the file that previously had the fileID value still
+ * has pages in the pool, since we don't want to use them
+ * to satisfy requests for the new file. Because the
+ * DB_TRUNCATE flag reuses the dev/inode pair, repeated
+ * opens with that flag set guarantees matching fileIDs
+ * when the machine can open a file and then re-open
+ * with truncate within a second. For this reason, we
+ * pass that flag down, and, if we find a matching entry,
+ * we ensure that it's never found again, and we create
+ * a new entry for the current request.
+ */
+ if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE) ||
+ F_ISSET(dbmfp, MP_FILEID_SET)) {
+ if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE))
+ bucket = FNBUCKET(path, strlen(path));
+ else
+ bucket = FNBUCKET(dbmfp->fileid,
+ DB_FILE_ID_LEN);
+
+ hp += bucket;
+ /*
+ * If we find the MPOOLFILE, increment its ref count
+ * so that it cannot go away while we open it.
+ */
+ MUTEX_LOCK(env, hp->mtx_hash);
+ ret =
+ __memp_mpf_find(env, dbmfp, hp, path, flags, &mfp);
+ if (ret == 0 && mfp != NULL) {
+ refinc = 1;
+
+ if (LF_ISSET(DB_MULTIVERSION)) {
+ if (MFP_OPEN_CNT(mfp) > (u_int32_t)
+ (LF_ISSET(DB_RDONLY) ? 0 : 1) &&
+ atomic_read(
+ &mfp->multiversion) == 0) {
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ goto mvcc_err;
+ }
+ atomic_inc(env, &mfp->multiversion);
+ F_SET(dbmfp, MP_MULTIVERSION);
+ }
+ }
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ if (ret != 0)
+ goto err;
+ }
+ } else {
+ /*
+ * Deadfile can only be set if mpf_cnt goes to zero (or if we
+ * failed creating a file marked DB_AM_DISCARD). Increment the ref
+ * count so the file cannot become dead and be unlinked.
+ */
+ MUTEX_LOCK(env, mfp->mutex);
+ if (!mfp->deadfile) {
+ if (LF_ISSET(DB_MULTIVERSION)) {
+ MUTEX_UNLOCK(env, mfp->mutex);
+ if (MFP_OPEN_CNT(mfp) > 0 &&
+ atomic_read(&mfp->multiversion) == 0) {
+mvcc_err: __db_errx(env, DB_STR("3041",
+"DB_MULTIVERSION cannot be specified on a database file which is already open"));
+ ret = EINVAL;
+ goto err;
+ }
+
+ atomic_inc(env, &mfp->multiversion);
+ F_SET(dbmfp, MP_MULTIVERSION);
+ }
+ /*
+ * Increment the reference count. We also track
+ * those references that don't affect the ability
+ * to convert the handle to either NOT_DURABLE or
+ * MVCC. These are readonly opens or threads that
+ * are using the handle just to flush a buffer.
+ */
+ ++mfp->mpf_cnt;
+ if (LF_ISSET(DB_FLUSH | DB_RDONLY))
+ ++mfp->neutral_cnt;
+ if (LF_ISSET(DB_FLUSH))
+ F_SET(dbmfp, MP_FOR_FLUSH);
+ refinc = 1;
+ }
+ MUTEX_UNLOCK(env, mfp->mutex);
+
+ /*
+ * Test one last time to see if the file is dead -- it may have
+ * been removed. This happens when a checkpoint trying to open
+ * the file to flush a buffer races with the Db::remove method.
+ * The error will be ignored, so don't output an error message.
+ */
+ if (mfp->deadfile) {
+ ret = EINVAL;
+ goto err;
+ }
+ }
+
+ if (LF_ISSET(DB_RDONLY))
+ F_SET(dbmfp, MP_READONLY);
+ if (LF_ISSET(DB_FLUSH))
+ F_SET(dbmfp, MP_FLUSH);
+ /*
+ * Share the underlying file descriptor if that's possible.
+ */
+ if (mfp != NULL && !FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE)) {
+ MUTEX_LOCK(env, dbmp->mutex);
+ TAILQ_FOREACH(tmp_dbmfp, &dbmp->dbmfq, q)
+ if (mfp == tmp_dbmfp->mfp &&
+ (F_ISSET(dbmfp, MP_READONLY) ||
+ !F_ISSET(tmp_dbmfp, MP_READONLY))) {
+ ++tmp_dbmfp->fhp->ref;
+ dbmfp->fhp = tmp_dbmfp->fhp;
+ dbmfp->addr = tmp_dbmfp->addr;
+ dbmfp->len = tmp_dbmfp->len;
+ break;
+ }
+ MUTEX_UNLOCK(env, dbmp->mutex);
+ if (dbmfp->fhp != NULL)
+ goto have_mfp;
+ }
+
+ /*
+ * If there's no backing file, we can join existing files in the cache,
+ * but there's nothing to read from disk.
+ */
+ if (!FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE)) {
+ /* Convert MP open flags to DB OS-layer open flags. */
+ oflags = 0;
+ if (LF_ISSET(DB_CREATE))
+ oflags |= DB_OSO_CREATE;
+ if (LF_ISSET(DB_DIRECT))
+ oflags |= DB_OSO_DIRECT;
+ if (LF_ISSET(DB_RDONLY))
+ oflags |= DB_OSO_RDONLY;
+
+ /*
+ * XXX
+ * A grievous layering violation, the DB_DSYNC_DB flag
+ * was left in the ENV structure and not driven through
+ * the cache API. This needs to be fixed when the general
+ * API configuration is fixed.
+ */
+ if (F_ISSET(env->dbenv, DB_ENV_DSYNC_DB))
+ oflags |= DB_OSO_DSYNC;
+
+ /*
+ * Get the real name for this file and open it.
+ *
+ * Supply a page size so os_open can decide whether to
+ * turn buffering off if the DB_DIRECT_DB flag is set.
+ *
+ * Acquire the region lock if we're using a path from
+ * an underlying MPOOLFILE -- there's a race in accessing
+ * the path name stored in the region, __memp_nameop may
+ * be simultaneously renaming the file.
+ */
+
+ ret = 0;
+ if (mfp != NULL) {
+ MPOOL_SYSTEM_LOCK(env);
+ path = R_ADDR(dbmp->reginfo, mfp->path_off);
+ if (rpath != NULL) {
+ __os_free(env, rpath);
+ rpath = NULL;
+ }
+ }
+ if (rpath == NULL)
+ ret = __db_appname(env,
+ DB_APP_DATA, path, dirp, &rpath);
+ if (ret == 0)
+ ret = __os_open(env, rpath,
+ (u_int32_t)pagesize, oflags, mode, &dbmfp->fhp);
+ if (mfp != NULL)
+ MPOOL_SYSTEM_UNLOCK(env);
+ if (ret != 0)
+ goto err;
+
+ /*
+ * Cache file handles are shared, and have mutexes to
+ * protect the underlying file handle across seek and
+ * read/write calls.
+ */
+ dbmfp->fhp->ref = 1;
+ if ((ret = __mutex_alloc(env, MTX_MPOOL_FH,
+ DB_MUTEX_PROCESS_ONLY, &dbmfp->fhp->mtx_fh)) != 0)
+ goto err;
+
+ /* Figure out the file's size. */
+ if ((ret = __os_ioinfo(
+ env, rpath, dbmfp->fhp, &mbytes, &bytes, NULL)) != 0) {
+ __db_err(env, ret, "%s", rpath);
+ goto err;
+ }
+
+ /*
+ * Don't permit files that aren't a multiple of the pagesize,
+ * and find the number of the last page in the file, all the
+ * time being careful not to overflow 32 bits.
+ *
+ * During verify or recovery, we might have to cope with a
+ * truncated file; if the file size is not a multiple of the
+ * page size, round down to a page boundary; we'll take care
+ * of the partial page outside the mpool system.
+ *
+ * Pagesize of 0 is only allowed for in-mem dbs.
+ */
+ DB_ASSERT(env, pagesize != 0);
+ if (bytes % pagesize != 0) {
+ if (LF_ISSET(DB_ODDFILESIZE))
+ bytes -= (u_int32_t)(bytes % pagesize);
+ else {
+ __db_errx(env, DB_STR_A("3037",
+ "%s: file size not a multiple of the pagesize", "%s"),
+ rpath);
+ ret = EINVAL;
+ goto err;
+ }
+ }
+
+ /*
+ * Get the file id if we weren't given one. Generated file
+ * ids don't use timestamps, otherwise there'd be no chance of
+ * any other process joining the party. Don't bother looking
+ * for this id in the hash table, it's new.
+ */
+ if (mfp == NULL && !F_ISSET(dbmfp, MP_FILEID_SET)) {
+ if ((ret =
+ __os_fileid(env, rpath, 0, dbmfp->fileid)) != 0)
+ goto err;
+ F_SET(dbmfp, MP_FILEID_SET);
+ bucket = FNBUCKET(dbmfp->fileid, DB_FILE_ID_LEN);
+ hp += bucket;
+ goto alloc;
+ }
+ }
+
+ if (mfp != NULL)
+ goto have_mfp;
+
+ /*
+ * We can race with another process opening the same file when
+ * we allocate the mpoolfile structure. We will come back
+ * here and check the hash table again to see if it has appeared.
+ * For most files this is not a problem, since the name is locked
+ * at a higher layer, but QUEUE extent files are not locked.
+ */
+check: MUTEX_LOCK(env, hp->mtx_hash);
+ if ((ret = __memp_mpf_find(env, dbmfp, hp, path, flags, &mfp)) != 0)
+ goto err;
+
+ if (alloc_mfp != NULL && mfp == NULL) {
+ mfp = alloc_mfp;
+ alloc_mfp = NULL;
+ SH_TAILQ_INSERT_HEAD(&hp->hash_bucket, mfp, q, __mpoolfile);
+ } else if (mfp != NULL) {
+ refinc = 1;
+ /*
+ * Some things about a file cannot be changed: the clear length,
+ * page size, or LSN location. However, if this is an attempt
+ * to open a named in-memory file, we may not yet have that
+ * information, so accept uninitialized entries.
+ *
+ * The file type can change if the application's pre- and post-
+ * processing needs change. For example, an application may add
+ * a hash subdatabase to a database that previously contained
+ * only btree subdatabases.
+ *
+ * !!!
+ * We do not check to see if the pgcookie information changed,
+ * nor update it if it has.
+ */
+ if ((dbmfp->clear_len != DB_CLEARLEN_NOTSET &&
+ mfp->clear_len != DB_CLEARLEN_NOTSET &&
+ dbmfp->clear_len != mfp->clear_len) ||
+ (pagesize != 0 && pagesize != mfp->pagesize) ||
+ (dbmfp->lsn_offset != DB_LSN_OFF_NOTSET &&
+ mfp->lsn_off != DB_LSN_OFF_NOTSET &&
+ dbmfp->lsn_offset != mfp->lsn_off)) {
+ __db_errx(env, DB_STR_A("3038",
+ "%s: clear length, page size or LSN location changed",
+ "%s"), path);
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ ret = EINVAL;
+ goto err;
+ }
+ }
+ if (mfp != NULL && LF_ISSET(DB_MULTIVERSION)) {
+ if (MFP_OPEN_CNT(mfp) > 1 &&
+ atomic_read(&mfp->multiversion) == 0) {
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ goto mvcc_err;
+ }
+ atomic_inc(env, &mfp->multiversion);
+ F_SET(dbmfp, MP_MULTIVERSION);
+ }
+
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ if (alloc_mfp != NULL) {
+ MUTEX_LOCK(env, alloc_mfp->mutex);
+ if ((ret = __memp_mf_discard(dbmp, alloc_mfp, 0)) != 0)
+ goto err;
+ }
+
+ if (mfp == NULL) {
+ /*
+ * If we didn't find the file and this is an in-memory file,
+ * then the create flag should be set.
+ */
+ if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE) &&
+ !LF_ISSET(DB_CREATE)) {
+ ret = ENOENT;
+ goto err;
+ }
+
+alloc: if ((ret = __memp_mpf_alloc(dbmp,
+ dbmfp, path, pagesize, flags, &alloc_mfp)) != 0)
+ goto err;
+
+ /*
+ * If the user specifies DB_MPOOL_LAST or DB_MPOOL_NEW on a
+ * page get, we have to increment the last page in the file.
+ * Figure it out and save it away.
+ *
+ * Note correction: page numbers are zero-based, not 1-based.
+ */
+ DB_ASSERT(env, pagesize != 0);
+ last_pgno = (db_pgno_t)(mbytes * (MEGABYTE / pagesize));
+ last_pgno += (db_pgno_t)(bytes / pagesize);
+ if (last_pgno != 0)
+ --last_pgno;
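+ /*
+ * For example (illustrative numbers only): a file of 3MB plus 8KB
+ * with a 4KB pagesize has mbytes == 3 and bytes == 8192, giving
+ * 3 * (MEGABYTE / 4096) + 8192 / 4096 == 770 pages, so last_pgno
+ * ends up as 769.
+ */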
+
+ alloc_mfp->last_flushed_pgno = alloc_mfp->orig_last_pgno =
+ alloc_mfp->last_pgno = last_pgno;
+
+ alloc_mfp->bucket = bucket;
+
+ /* Go back and see if someone else has opened the file. */
+ if (path != NULL)
+ goto check;
+
+ mfp = alloc_mfp;
+
+ if (LF_ISSET(DB_MULTIVERSION)) {
+ atomic_inc(env, &mfp->multiversion);
+ F_SET(dbmfp, MP_MULTIVERSION);
+ }
+
+ /* This is a temp; no one else can see it, so put it at the end. */
+ MUTEX_LOCK(env, hp->mtx_hash);
+ SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, mfp, q);
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ }
+have_mfp:
+ /*
+ * We need to verify that all handles open a file either durable or not
+ * durable. This needs to be cross process and cross sub-databases, so
+ * mpool is the place to do it.
+ */
+ if (!LF_ISSET(DB_DURABLE_UNKNOWN | DB_RDONLY)) {
+ if (F_ISSET(mfp, MP_DURABLE_UNKNOWN)) {
+ if (LF_ISSET(DB_TXN_NOT_DURABLE))
+ F_SET(mfp, MP_NOT_DURABLE);
+ F_CLR(mfp, MP_DURABLE_UNKNOWN);
+ } else if (!LF_ISSET(DB_TXN_NOT_DURABLE) !=
+ !F_ISSET(mfp, MP_NOT_DURABLE)) {
+ __db_errx(env, DB_STR("3039",
+ "Cannot open DURABLE and NOT DURABLE handles in the same file"));
+ ret = EINVAL;
+ goto err;
+ }
+ }
+
+ /*
+ * All paths to here have initialized the mfp variable to reference
+ * the selected (or allocated) MPOOLFILE.
+ */
+ dbmfp->mfp = mfp;
+
+ /*
+ * Check to see if we can mmap the file. If a file:
+ * + isn't temporary
+ * + is read-only
+ * + doesn't require any pgin/pgout support
+ * + the DB_NOMMAP flag wasn't set (in either the file open or
+ * the environment in which it was opened)
+ * + and is less than mp_mmapsize bytes in size
+ *
+ * we can mmap it instead of reading/writing buffers. Don't do error
+ * checking based on the mmap call failure. We want to do normal I/O
+ * on the file if the reason we failed was because the file was on an
+ * NFS mounted partition, and we can fail in buffer I/O just as easily
+ * as here.
+ *
+ * We'd like to test to see if the file is too big to mmap. Since we
+ * don't know what size or type off_t and size_t are, what the largest
+ * unsigned integral type is, or what random insanity the local C
+ * compiler will perpetrate, doing the comparison in a portable way is
+ * flatly impossible. Hope that mmap fails if the file is too large.
+ */
+#define DB_MAXMMAPSIZE (10 * 1024 * 1024) /* 10 MB. */
+ if (F_ISSET(mfp, MP_CAN_MMAP) && dbmfp->addr == NULL) {
+ maxmap = dbenv->mp_mmapsize == 0 ?
+ DB_MAXMMAPSIZE : dbenv->mp_mmapsize;
+ if (path == NULL ||
+ FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE))
+ F_CLR(mfp, MP_CAN_MMAP);
+ else if (!F_ISSET(dbmfp, MP_READONLY))
+ F_CLR(mfp, MP_CAN_MMAP);
+ else if (dbmfp->ftype != 0)
+ F_CLR(mfp, MP_CAN_MMAP);
+ else if (LF_ISSET(DB_NOMMAP) || F_ISSET(dbenv, DB_ENV_NOMMAP))
+ F_CLR(mfp, MP_CAN_MMAP);
+ else {
+ MPOOL_SYSTEM_LOCK(env);
+ maxmap = mp->mp_mmapsize == 0 ?
+ DB_MAXMMAPSIZE : mp->mp_mmapsize;
+ MPOOL_SYSTEM_UNLOCK(env);
+ if (mbytes > maxmap / MEGABYTE ||
+ (mbytes == maxmap / MEGABYTE &&
+ bytes >= maxmap % MEGABYTE))
+ F_CLR(mfp, MP_CAN_MMAP);
+ }
+
+ dbmfp->addr = NULL;
+ if (F_ISSET(mfp, MP_CAN_MMAP)) {
+ dbmfp->len = (size_t)mbytes * MEGABYTE + bytes;
+ if (__os_mapfile(env, rpath,
+ dbmfp->fhp, dbmfp->len, 1, &dbmfp->addr) != 0) {
+ dbmfp->addr = NULL;
+ F_CLR(mfp, MP_CAN_MMAP);
+ }
+ }
+ }
+
+ F_SET(dbmfp, MP_OPEN_CALLED);
+
+ /*
+ * Add the file to the process' list of DB_MPOOLFILEs.
+ */
+ MUTEX_LOCK(env, dbmp->mutex);
+ TAILQ_INSERT_TAIL(&dbmp->dbmfq, dbmfp, q);
+ MUTEX_UNLOCK(env, dbmp->mutex);
+
+ if (0) {
+err: if (refinc) {
+ /*
+ * If mpf_cnt goes to zero here and unlink_on_close is
+ * set, then we missed the last close, but there was an
+ * error trying to open the file, so we probably cannot
+ * unlink it anyway.
+ */
+ MUTEX_LOCK(env, mfp->mutex);
+ --mfp->mpf_cnt;
+ if (LF_ISSET(DB_FLUSH | DB_RDONLY)) {
+ DB_ASSERT(env, mfp->neutral_cnt != 0);
+ --mfp->neutral_cnt;
+ }
+ MUTEX_UNLOCK(env, mfp->mutex);
+ }
+
+ }
+ if (rpath != NULL)
+ __os_free(env, rpath);
+ return (ret);
+}
+
+/*
+ * __memp_mpf_find --
+ * Search a hash bucket for a MPOOLFILE.
+ */
+static int
+__memp_mpf_find(env, dbmfp, hp, path, flags, mfpp)
+ ENV *env;
+ DB_MPOOLFILE *dbmfp;
+ DB_MPOOL_HASH *hp;
+ const char *path;
+ u_int32_t flags;
+ MPOOLFILE **mfpp;
+{
+ DB_MPOOL *dbmp;
+ MPOOLFILE *mfp;
+
+ dbmp = env->mp_handle;
+
+ SH_TAILQ_FOREACH(mfp, &hp->hash_bucket, q, __mpoolfile) {
+ /* Skip dead files and temporary files. */
+ if (mfp->deadfile || F_ISSET(mfp, MP_TEMP))
+ continue;
+
+ /*
+ * Any remaining DB_MPOOL_NOFILE databases are in-memory
+ * named databases and need only match other in-memory
+ * databases with the same name.
+ */
+ if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE)) {
+ if (!mfp->no_backing_file)
+ continue;
+
+ if (strcmp(path, R_ADDR(dbmp->reginfo, mfp->path_off)))
+ continue;
+
+ /*
+ * We matched an in-memory file; grab the fileid if
+ * it is set in the region, but not in the dbmfp.
+ */
+ if (!F_ISSET(dbmfp, MP_FILEID_SET))
+ (void)__memp_set_fileid(dbmfp,
+ R_ADDR(dbmp->reginfo, mfp->fileid_off));
+ } else
+ if (memcmp(dbmfp->fileid, R_ADDR(dbmp->reginfo,
+ mfp->fileid_off), DB_FILE_ID_LEN) != 0)
+ continue;
+
+ /*
+ * If the file is being truncated, remove it from the system
+ * and create a new entry.
+ *
+ * !!!
+ * We should be able to set mfp to NULL and break out of the
+ * loop, but I like the idea of checking all the entries.
+ */
+ if (LF_ISSET(DB_TRUNCATE)) {
+ MUTEX_LOCK(env, mfp->mutex);
+ mfp->deadfile = 1;
+ MUTEX_UNLOCK(env, mfp->mutex);
+ continue;
+ }
+
+ /*
+ * Check to see if this file has died while we waited.
+ *
+ * We normally don't lock the deadfile field when we read it, as
+ * we only care whether the field is zero or non-zero. We do lock
+ * on read when searching for a matching MPOOLFILE, so that two
+ * threads of control don't race between setting the deadfile
+ * bit and incrementing the reference count; that is, so a thread
+ * of control that decrements the reference count to 0 and then
+ * sets deadfile can't do so between our finding the file and
+ * taking our own reference on it.
+ */
+ MUTEX_LOCK(env, mfp->mutex);
+ if (mfp->deadfile) {
+ MUTEX_UNLOCK(env, mfp->mutex);
+ continue;
+ }
+ ++mfp->mpf_cnt;
+ if (LF_ISSET(DB_FLUSH | DB_RDONLY))
+ ++mfp->neutral_cnt;
+ if (LF_ISSET(DB_FLUSH))
+ F_SET(dbmfp, MP_FOR_FLUSH);
+ MUTEX_UNLOCK(env, mfp->mutex);
+
+ /* Initialize any fields that are not yet set. */
+ if (dbmfp->ftype != 0)
+ mfp->ftype = dbmfp->ftype;
+ if (dbmfp->clear_len != DB_CLEARLEN_NOTSET)
+ mfp->clear_len = dbmfp->clear_len;
+ if (dbmfp->lsn_offset != -1)
+ mfp->lsn_off = dbmfp->lsn_offset;
+
+ break;
+ }
+
+ *mfpp = mfp;
+ return (0);
+}
+
+static int
+__memp_mpf_alloc(dbmp, dbmfp, path, pagesize, flags, retmfp)
+ DB_MPOOL *dbmp;
+ DB_MPOOLFILE *dbmfp;
+ const char *path;
+ u_int32_t pagesize;
+ u_int32_t flags;
+ MPOOLFILE **retmfp;
+{
+ ENV *env;
+ MPOOLFILE *mfp;
+ int ret;
+ void *p;
+
+ env = dbmp->env;
+ ret = 0;
+ /* Allocate and initialize a new MPOOLFILE. */
+ if ((ret = __memp_alloc(dbmp,
+ dbmp->reginfo, NULL, sizeof(MPOOLFILE), NULL, &mfp)) != 0)
+ goto err;
+ memset(mfp, 0, sizeof(MPOOLFILE));
+ mfp->mpf_cnt = 1;
+ if (LF_ISSET(DB_FLUSH | DB_RDONLY))
+ mfp->neutral_cnt = 1;
+ if (LF_ISSET(DB_FLUSH))
+ F_SET(dbmfp, MP_FOR_FLUSH);
+ mfp->ftype = dbmfp->ftype;
+ mfp->pagesize = pagesize;
+ mfp->lsn_off = dbmfp->lsn_offset;
+ mfp->clear_len = dbmfp->clear_len;
+ mfp->priority = dbmfp->priority;
+ if (dbmfp->gbytes != 0 || dbmfp->bytes != 0) {
+ mfp->maxpgno = (db_pgno_t)
+ (dbmfp->gbytes * (GIGABYTE / mfp->pagesize));
+ mfp->maxpgno += (db_pgno_t)
+ ((dbmfp->bytes + mfp->pagesize - 1) /
+ mfp->pagesize);
+ }
+ if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE))
+ mfp->no_backing_file = 1;
+ if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_UNLINK))
+ mfp->unlink_on_close = 1;
+
+ F_SET(mfp, MP_CAN_MMAP);
+ if (F_ISSET(env->dbenv, DB_ENV_DATABASE_LOCKING))
+ F_SET(mfp, MP_DATABASE_LOCKING);
+ if (LF_ISSET(DB_DIRECT))
+ F_SET(mfp, MP_DIRECT);
+ if (LF_ISSET(DB_DURABLE_UNKNOWN | DB_RDONLY))
+ F_SET(mfp, MP_DURABLE_UNKNOWN);
+ if (LF_ISSET(DB_EXTENT))
+ F_SET(mfp, MP_EXTENT);
+ if (LF_ISSET(DB_TXN_NOT_DURABLE))
+ F_SET(mfp, MP_NOT_DURABLE);
+
+ /*
+ * An in-memory database with no name is a temp file. Named
+ * in-memory databases get an artificially bumped reference
+ * count so they don't disappear on close; they need a remove
+ * to make them disappear.
+ */
+ if (path == NULL)
+ F_SET(mfp, MP_TEMP);
+ else if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE))
+ mfp->mpf_cnt++;
+
+ /* Copy the file identification string into shared memory. */
+ if (F_ISSET(dbmfp, MP_FILEID_SET)) {
+ if ((ret = __memp_alloc(dbmp, dbmp->reginfo,
+ NULL, DB_FILE_ID_LEN, &mfp->fileid_off, &p)) != 0)
+ goto err;
+ memcpy(p, dbmfp->fileid, DB_FILE_ID_LEN);
+ }
+
+ /* Copy the file path into shared memory. */
+ if (path != NULL) {
+ if ((ret = __memp_alloc(dbmp, dbmp->reginfo,
+ NULL, strlen(path) + 1, &mfp->path_off, &p)) != 0)
+ goto err;
+ memcpy(p, path, strlen(path) + 1);
+ }
+
+ /* Copy the page cookie into shared memory. */
+ if (dbmfp->pgcookie == NULL || dbmfp->pgcookie->size == 0) {
+ mfp->pgcookie_len = 0;
+ mfp->pgcookie_off = 0;
+ } else {
+ if ((ret = __memp_alloc(dbmp, dbmp->reginfo,
+ NULL, dbmfp->pgcookie->size,
+ &mfp->pgcookie_off, &p)) != 0)
+ goto err;
+ memcpy(p,
+ dbmfp->pgcookie->data, dbmfp->pgcookie->size);
+ mfp->pgcookie_len = dbmfp->pgcookie->size;
+ }
+
+ if ((ret = __mutex_alloc(env,
+ MTX_MPOOLFILE_HANDLE, 0, &mfp->mutex)) != 0)
+ goto err;
+#ifndef HAVE_ATOMICFILEREAD
+ if ((ret = __mutex_alloc(env,
+ MTX_MPOOLFILE_HANDLE, DB_MUTEX_SHARED, &mfp->mtx_write)) != 0)
+ goto err;
+#endif
+ *retmfp = mfp;
+
+err: return (ret);
+}
+
+/*
+ * memp_fclose_pp --
+ * DB_MPOOLFILE->close pre/post processing.
+ *
+ * PUBLIC: int __memp_fclose_pp __P((DB_MPOOLFILE *, u_int32_t));
+ */
+int
+__memp_fclose_pp(dbmfp, flags)
+ DB_MPOOLFILE *dbmfp;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbmfp->env;
+
+ /*
+ * Validate arguments, but as a handle destructor, we can't fail.
+ */
+ if (flags != 0)
+ (void)__db_ferr(env, "DB_MPOOLFILE->close", 0);
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__memp_fclose(dbmfp, 0)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
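+
+/*
+ * A minimal usage sketch (not part of this change): the public
+ * DB_MPOOLFILE lifecycle that reaches __memp_fopen and __memp_fclose_pp
+ * above. The file name and page size are illustrative assumptions.
+ */
+#if 0
+#include <db.h>
+
+static int
+mpf_lifecycle(dbenv)
+ DB_ENV *dbenv;
+{
+ DB_MPOOLFILE *mpf;
+ int ret;
+
+ /* Create the handle, open a backing file, then close it. */
+ if ((ret = dbenv->memp_fcreate(dbenv, &mpf, 0)) != 0)
+ return (ret);
+ if ((ret = mpf->open(mpf, "a.db", DB_CREATE, 0, 8 * 1024)) != 0) {
+ (void)mpf->close(mpf, 0);
+ return (ret);
+ }
+ return (mpf->close(mpf, 0));
+}
+#endif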
+
+/*
+ * __memp_fclose --
+ * DB_MPOOLFILE->close.
+ *
+ * PUBLIC: int __memp_fclose __P((DB_MPOOLFILE *, u_int32_t));
+ */
+int
+__memp_fclose(dbmfp, flags)
+ DB_MPOOLFILE *dbmfp;
+ u_int32_t flags;
+{
+ DB_MPOOL *dbmp;
+ ENV *env;
+ MPOOLFILE *mfp;
+ char *rpath;
+ u_int32_t ref;
+ int deleted, ret, t_ret;
+
+ env = dbmfp->env;
+ dbmp = env->mp_handle;
+ ret = 0;
+
+ /*
+ * Remove the DB_MPOOLFILE from the process' list.
+ *
+ * It's possible the underlying mpool cache may never have been created.
+ * In that case, all we have is a structure; discard it.
+ *
+ * It's possible the DB_MPOOLFILE was never added to the DB_MPOOLFILE
+ * file list, check the MP_OPEN_CALLED flag to be sure.
+ */
+ if (dbmp == NULL)
+ goto done;
+
+ MUTEX_LOCK(env, dbmp->mutex);
+
+ DB_ASSERT(env, dbmfp->ref >= 1);
+ if ((ref = --dbmfp->ref) == 0 && F_ISSET(dbmfp, MP_OPEN_CALLED))
+ TAILQ_REMOVE(&dbmp->dbmfq, dbmfp, q);
+
+ /*
+ * Decrement the file descriptor's ref count -- if we're the last ref,
+ * we'll discard the file descriptor.
+ */
+ if (ref == 0 && dbmfp->fhp != NULL && --dbmfp->fhp->ref > 0)
+ dbmfp->fhp = NULL;
+ MUTEX_UNLOCK(env, dbmp->mutex);
+ if (ref != 0)
+ return (0);
+
+ /* Complain if pinned blocks never returned. */
+ if (dbmfp->pinref != 0) {
+ __db_errx(env, DB_STR_A("3040",
+ "%s: close: %lu blocks left pinned", "%s %lu"),
+ __memp_fn(dbmfp), (u_long)dbmfp->pinref);
+ ret = __env_panic(env, DB_RUNRECOVERY);
+ }
+
+ /* Discard any mmap information. */
+ if (dbmfp->addr != NULL && dbmfp->fhp != NULL &&
+ (ret = __os_unmapfile(env, dbmfp->addr, dbmfp->len)) != 0)
+ __db_err(env, ret, "%s", __memp_fn(dbmfp));
+
+ /*
+ * Close the file and discard the descriptor structure; temporary
+ * files may not yet have been created.
+ */
+ if (dbmfp->fhp != NULL) {
+ if ((t_ret =
+ __mutex_free(env, &dbmfp->fhp->mtx_fh)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __os_closehandle(env, dbmfp->fhp)) != 0) {
+ __db_err(env, t_ret, "%s", __memp_fn(dbmfp));
+ if (ret == 0)
+ ret = t_ret;
+ }
+ dbmfp->fhp = NULL;
+ }
+
+ /*
+ * Discard our reference on the underlying MPOOLFILE, and close it
+ * if it's no longer useful to anyone. It's possible the open of the
+ * file never happened or wasn't successful, in which case mfp will
+ * be NULL and MP_OPEN_CALLED will not be set.
+ */
+ mfp = dbmfp->mfp;
+ DB_ASSERT(env,
+ (F_ISSET(dbmfp, MP_OPEN_CALLED) && mfp != NULL) ||
+ (!F_ISSET(dbmfp, MP_OPEN_CALLED) && mfp == NULL));
+ if (!F_ISSET(dbmfp, MP_OPEN_CALLED))
+ goto done;
+
+ /*
+ * If it's a temp file, all outstanding references belong to unflushed
+ * buffers. (A temp file can only be referenced by one DB_MPOOLFILE).
+ * We don't care about preserving any of those buffers, so mark the
+ * MPOOLFILE as dead so that even the dirty ones just get discarded
+ * when we try to flush them.
+ */
+ deleted = 0;
+ if (!LF_ISSET(DB_MPOOL_NOLOCK))
+ MUTEX_LOCK(env, mfp->mutex);
+ if (F_ISSET(dbmfp, MP_MULTIVERSION))
+ atomic_dec(env, &mfp->multiversion);
+ if (F_ISSET(dbmfp, MP_READONLY) ||
+ (LF_ISSET(DB_FLUSH) && F_ISSET(dbmfp, MP_FOR_FLUSH))) {
+ DB_ASSERT(env, mfp->neutral_cnt != 0);
+ --mfp->neutral_cnt;
+ }
+ DB_ASSERT(env, mfp->neutral_cnt < mfp->mpf_cnt);
+ if (--mfp->mpf_cnt == 0 || LF_ISSET(DB_MPOOL_DISCARD)) {
+ if (LF_ISSET(DB_MPOOL_DISCARD) ||
+ F_ISSET(mfp, MP_TEMP) || mfp->unlink_on_close) {
+ mfp->deadfile = 1;
+ }
+ if (mfp->unlink_on_close) {
+ if ((t_ret = __db_appname(dbmp->env, DB_APP_DATA,
+ R_ADDR(dbmp->reginfo, mfp->path_off), NULL,
+ &rpath)) != 0 && ret == 0)
+ ret = t_ret;
+ if (t_ret == 0) {
+ if ((t_ret = __os_unlink(
+ dbmp->env, rpath, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ __os_free(env, rpath);
+ }
+ }
+ if (MFP_OPEN_CNT(mfp) == 0) {
+ F_CLR(mfp, MP_NOT_DURABLE);
+ F_SET(mfp, MP_DURABLE_UNKNOWN);
+ }
+ if (mfp->block_cnt == 0) {
+ /*
+ * We should never discard this mp file if our caller
+ * is holding the lock on it. See comment in
+ * __memp_sync_file.
+ */
+ DB_ASSERT(env, !LF_ISSET(DB_MPOOL_NOLOCK));
+ if ((t_ret =
+ __memp_mf_discard(dbmp, mfp, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ deleted = 1;
+ }
+ }
+ if (!deleted && !LF_ISSET(DB_MPOOL_NOLOCK))
+ MUTEX_UNLOCK(env, mfp->mutex);
+
+done: /* Discard the DB_MPOOLFILE structure. */
+ if (dbmfp->pgcookie != NULL) {
+ __os_free(env, dbmfp->pgcookie->data);
+ __os_free(env, dbmfp->pgcookie);
+ }
+ __os_free(env, dbmfp);
+
+ return (ret);
+}
+
+/*
+ * __memp_mf_discard --
+ * Discard an MPOOLFILE.
+ *
+ * PUBLIC: int __memp_mf_discard __P((DB_MPOOL *, MPOOLFILE *, int));
+ */
+int
+__memp_mf_discard(dbmp, mfp, hp_locked)
+ DB_MPOOL *dbmp;
+ MPOOLFILE *mfp;
+ int hp_locked;
+{
+ DB_MPOOL_HASH *hp;
+ ENV *env;
+#ifdef HAVE_STATISTICS
+ DB_MPOOL_STAT *sp;
+#endif
+ MPOOL *mp;
+ int need_sync, ret, t_ret;
+
+ env = dbmp->env;
+ mp = dbmp->reginfo[0].primary;
+ hp = R_ADDR(dbmp->reginfo, mp->ftab);
+ hp += mfp->bucket;
+ ret = 0;
+
+ /*
+ * Expects caller to be holding the MPOOLFILE mutex.
+ *
+ * When discarding a file, we have to flush writes from it to disk.
+ * The scenario is that dirty buffers from this file need to be
+ * flushed to satisfy a future checkpoint, but when the checkpoint
+ * calls mpool sync, the sync code won't know anything about them.
+ * Ignore files not written, discarded, or only temporary.
+ */
+ need_sync = mfp->file_written && !mfp->deadfile &&
+ !F_ISSET(mfp, MP_TEMP) && !mfp->no_backing_file;
+
+ /*
+ * We have to release the MPOOLFILE mutex before acquiring the region
+ * mutex so we don't deadlock. Make sure nobody ever looks at this
+ * structure again.
+ */
+ mfp->deadfile = 1;
+
+ /* Discard the mutex we're holding and return it to the pool. */
+ MUTEX_UNLOCK(env, mfp->mutex);
+ if ((t_ret = __mutex_free(env, &mfp->mutex)) != 0 && ret == 0)
+ ret = t_ret;
+#ifndef HAVE_ATOMICFILEREAD
+ if ((t_ret = __mutex_free(env, &mfp->mtx_write)) != 0 && ret == 0)
+ ret = t_ret;
+#endif
+
+ /*
+ * Lock the bucket and delete from the list of MPOOLFILEs.
+ * If this function is called by __memp_discard_all_mpfs,
+ * the MPOOLFILE hash bucket is already locked.
+ */
+ if (!hp_locked)
+ MUTEX_LOCK(env, hp->mtx_hash);
+ SH_TAILQ_REMOVE(&hp->hash_bucket, mfp, q, __mpoolfile);
+ if (!hp_locked)
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+
+ /* Lock the region and collect stats and free the space. */
+ MPOOL_SYSTEM_LOCK(env);
+ if (need_sync &&
+ (t_ret = __memp_mf_sync(dbmp, mfp, 0)) != 0 && ret == 0)
+ ret = t_ret;
+
+#ifdef HAVE_STATISTICS
+ /* Copy the statistics into the region. */
+ sp = &mp->stat;
+ sp->st_cache_hit += mfp->stat.st_cache_hit;
+ sp->st_cache_miss += mfp->stat.st_cache_miss;
+ sp->st_map += mfp->stat.st_map;
+ sp->st_page_create += mfp->stat.st_page_create;
+ sp->st_page_in += mfp->stat.st_page_in;
+ sp->st_page_out += mfp->stat.st_page_out;
+#endif
+
+ /* Free the space. */
+ if (mfp->path_off != 0)
+ __memp_free(&dbmp->reginfo[0],
+ R_ADDR(dbmp->reginfo, mfp->path_off));
+ if (mfp->fileid_off != 0)
+ __memp_free(&dbmp->reginfo[0],
+ R_ADDR(dbmp->reginfo, mfp->fileid_off));
+ if (mfp->pgcookie_off != 0)
+ __memp_free(&dbmp->reginfo[0],
+ R_ADDR(dbmp->reginfo, mfp->pgcookie_off));
+ __memp_free(&dbmp->reginfo[0], mfp);
+
+ MPOOL_SYSTEM_UNLOCK(env);
+
+ return (ret);
+}
+
+/*
+ * __memp_inmemlist --
+ * Return a list of the named in-memory databases.
+ *
+ * PUBLIC: int __memp_inmemlist __P((ENV *, char ***, int *));
+ */
+int
+__memp_inmemlist(env, namesp, cntp)
+ ENV *env;
+ char ***namesp;
+ int *cntp;
+{
+ DB_MPOOL *dbmp;
+ DB_MPOOL_HASH *hp;
+ MPOOL *mp;
+ MPOOLFILE *mfp;
+ int arraysz, cnt, i, ret;
+ char **names;
+
+ names = NULL;
+ dbmp = env->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+ hp = R_ADDR(dbmp->reginfo, mp->ftab);
+
+ arraysz = cnt = 0;
+ for (i = 0; i < MPOOL_FILE_BUCKETS; i++, hp++) {
+ MUTEX_LOCK(env, hp->mtx_hash);
+ SH_TAILQ_FOREACH(mfp, &hp->hash_bucket, q, __mpoolfile) {
+ /* Skip dead files and temporary files. */
+ if (mfp->deadfile || F_ISSET(mfp, MP_TEMP))
+ continue;
+
+ /* Skip entries that have a backing file. */
+ if (!mfp->no_backing_file)
+ continue;
+
+ /* We found one. */
+ if (cnt >= arraysz) {
+ arraysz += 100;
+ if ((ret = __os_realloc(env,
+ (u_int)arraysz * sizeof(names[0]),
+ &names)) != 0)
+ goto nomem;
+ }
+ if ((ret = __os_strdup(env,
+ R_ADDR(dbmp->reginfo, mfp->path_off),
+ &names[cnt])) != 0)
+ goto nomem;
+
+ cnt++;
+ }
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ }
+ *namesp = names;
+ *cntp = cnt;
+ return (0);
+
+nomem: MUTEX_UNLOCK(env, hp->mtx_hash);
+ if (names != NULL) {
+ while (--cnt >= 0)
+ __os_free(env, names[cnt]);
+ __os_free(env, names);
+ }
+
+ /* Make sure we don't return any garbage. */
+ *cntp = 0;
+ *namesp = NULL;
+ return (ret);
+}
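+
+/*
+ * Note for callers (a sketch, not a new contract): the array and each
+ * name in it are allocated above, so the caller is expected to release
+ * them, roughly:
+ *
+ * for (i = 0; i < cnt; i++)
+ * __os_free(env, names[i]);
+ * __os_free(env, names);
+ */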
diff --git a/src/mp/mp_fput.c b/src/mp/mp_fput.c
new file mode 100644
index 00000000..7a900fd0
--- /dev/null
+++ b/src/mp/mp_fput.c
@@ -0,0 +1,374 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+
+static int __memp_reset_lru __P((ENV *, REGINFO *));
+
+/*
+ * __memp_fput_pp --
+ * DB_MPOOLFILE->put pre/post processing.
+ *
+ * PUBLIC: int __memp_fput_pp
+ * PUBLIC: __P((DB_MPOOLFILE *, void *, DB_CACHE_PRIORITY, u_int32_t));
+ */
+int
+__memp_fput_pp(dbmfp, pgaddr, priority, flags)
+ DB_MPOOLFILE *dbmfp;
+ void *pgaddr;
+ DB_CACHE_PRIORITY priority;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret, t_ret;
+
+ env = dbmfp->env;
+
+ if (flags != 0)
+ return (__db_ferr(env, "DB_MPOOLFILE->put", 0));
+
+ MPF_ILLEGAL_BEFORE_OPEN(dbmfp, "DB_MPOOLFILE->put");
+
+ ENV_ENTER(env, ip);
+
+ ret = __memp_fput(dbmfp, ip, pgaddr, priority);
+ if (IS_ENV_REPLICATED(env) &&
+ (t_ret = __op_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __memp_fput --
+ * DB_MPOOLFILE->put.
+ *
+ * PUBLIC: int __memp_fput __P((DB_MPOOLFILE *,
+ * PUBLIC: DB_THREAD_INFO *, void *, DB_CACHE_PRIORITY));
+ */
+int
+__memp_fput(dbmfp, ip, pgaddr, priority)
+ DB_MPOOLFILE *dbmfp;
+ DB_THREAD_INFO *ip;
+ void *pgaddr;
+ DB_CACHE_PRIORITY priority;
+{
+ BH *bhp;
+ DB_ENV *dbenv;
+ DB_MPOOL *dbmp;
+ DB_MPOOL_HASH *hp;
+ ENV *env;
+ MPOOL *c_mp;
+ MPOOLFILE *mfp;
+ PIN_LIST *list, *lp;
+ REGINFO *infop, *reginfo;
+ roff_t b_ref;
+ int region;
+ int adjust, pfactor, ret, t_ret;
+ char buf[DB_THREADID_STRLEN];
+
+ env = dbmfp->env;
+ dbenv = env->dbenv;
+ dbmp = env->mp_handle;
+ mfp = dbmfp->mfp;
+ bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf));
+ ret = 0;
+
+ /*
+ * If this is marked dummy, we are using it to unpin a buffer for
+ * another thread.
+ */
+ if (F_ISSET(dbmfp, MP_DUMMY))
+ goto unpin;
+
+ /*
+ * If we're mapping the file, there's nothing to do. Because we can
+ * stop mapping the file at any time, we have to check on each buffer
+ * to see if the address we gave the application was part of the map
+ * region.
+ */
+ if (dbmfp->addr != NULL && pgaddr >= dbmfp->addr &&
+ (u_int8_t *)pgaddr <= (u_int8_t *)dbmfp->addr + dbmfp->len)
+ return (0);
+
+ DB_ASSERT(env, IS_RECOVERING(env) || bhp->pgno <= mfp->last_pgno ||
+ F_ISSET(bhp, BH_FREED) || !SH_CHAIN_SINGLETON(bhp, vc));
+#ifdef DIAGNOSTIC
+ /*
+ * Decrement the per-file pinned buffer count (mapped pages aren't
+ * counted).
+ */
+ MPOOL_SYSTEM_LOCK(env);
+ if (dbmfp->pinref == 0) {
+ MPOOL_SYSTEM_UNLOCK(env);
+ __db_errx(env, DB_STR_A("3011",
+ "%s: more pages returned than retrieved", "%s"),
+ __memp_fn(dbmfp));
+ return (__env_panic(env, EACCES));
+ }
+ --dbmfp->pinref;
+ MPOOL_SYSTEM_UNLOCK(env);
+#endif
+
+unpin:
+ infop = &dbmp->reginfo[bhp->region];
+ c_mp = infop->primary;
+ hp = R_ADDR(infop, c_mp->htab);
+ hp = &hp[bhp->bucket];
+
+ /*
+ * Check for a reference count going to zero. This can happen if the
+ * application returns a page twice.
+ */
+ if (atomic_read(&bhp->ref) == 0) {
+ __db_errx(env, DB_STR_A("3012",
+ "%s: page %lu: unpinned page returned", "%s %lu"),
+ __memp_fn(dbmfp), (u_long)bhp->pgno);
+ DB_ASSERT(env, atomic_read(&bhp->ref) != 0);
+ return (__env_panic(env, EACCES));
+ }
+
+ /* Note the activity so allocation won't decide to quit. */
+ ++c_mp->put_counter;
+
+ if (ip != NULL) {
+ reginfo = env->reginfo;
+ list = R_ADDR(reginfo, ip->dbth_pinlist);
+ region = (int)(infop - dbmp->reginfo);
+ b_ref = R_OFFSET(infop, bhp);
+ for (lp = list; lp < &list[ip->dbth_pinmax]; lp++)
+ if (lp->b_ref == b_ref && lp->region == region)
+ break;
+
+ if (lp == &list[ip->dbth_pinmax]) {
+ __db_errx(env, DB_STR_A("3013",
+ "__memp_fput: pinned buffer not found for thread %s",
+ "%s"), dbenv->thread_id_string(dbenv,
+ ip->dbth_pid, ip->dbth_tid, buf));
+ return (__env_panic(env, EINVAL));
+ }
+
+ lp->b_ref = INVALID_ROFF;
+ ip->dbth_pincount--;
+ }
+
+ /*
+ * Mark the file dirty.
+ */
+ if (F_ISSET(bhp, BH_EXCLUSIVE) && F_ISSET(bhp, BH_DIRTY)) {
+ DB_ASSERT(env, atomic_read(&hp->hash_page_dirty) > 0);
+ mfp->file_written = 1;
+ }
+
+ /*
+ * If there is more than one reference to the page, we're done. Ignore
+ * the discard flags (for now) and leave the buffer's priority alone.
+ * We are doing this a little early, as the remaining reference may or
+ * may not belong to a write-behind thread. If it does, we set the
+ * priority here; if not, it will get set again later. We might race
+ * and miss setting the priority, which would leave it wrong
+ * for a while.
+ */
+ DB_ASSERT(env, atomic_read(&bhp->ref) != 0);
+ if (atomic_dec(env, &bhp->ref) > 1 || (atomic_read(&bhp->ref) == 1 &&
+ !F_ISSET(bhp, BH_DIRTY))) {
+ /*
+ * __memp_pgwrite only has a shared lock while it clears
+ * the BH_DIRTY bit. If we only have a shared latch then
+ * we can't touch the flags bits.
+ */
+ if (F_ISSET(bhp, BH_EXCLUSIVE))
+ F_CLR(bhp, BH_EXCLUSIVE);
+ MUTEX_UNLOCK(env, bhp->mtx_buf);
+ return (0);
+ }
+
+ /* The buffer should not be accessed again. */
+#ifdef DIAG_MVCC
+ MUTEX_LOCK(env, hp->mtx_hash);
+ if (BH_REFCOUNT(bhp) == 0)
+ MVCC_MPROTECT(bhp->buf, mfp->pagesize, 0);
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+#endif
+
+ /* Update priority values. */
+ if (priority == DB_PRIORITY_VERY_LOW ||
+ mfp->priority == MPOOL_PRI_VERY_LOW)
+ bhp->priority = 0;
+ else {
+ /*
+ * We don't lock the LRU priority or the pages field; if
+ * we get garbage (which won't happen on a 32-bit machine), it
+ * only means a buffer has the wrong priority.
+ */
+ bhp->priority = c_mp->lru_priority;
+
+ switch (priority) {
+ default:
+ case DB_PRIORITY_UNCHANGED:
+ pfactor = mfp->priority;
+ break;
+ case DB_PRIORITY_VERY_LOW:
+ pfactor = MPOOL_PRI_VERY_LOW;
+ break;
+ case DB_PRIORITY_LOW:
+ pfactor = MPOOL_PRI_LOW;
+ break;
+ case DB_PRIORITY_DEFAULT:
+ pfactor = MPOOL_PRI_DEFAULT;
+ break;
+ case DB_PRIORITY_HIGH:
+ pfactor = MPOOL_PRI_HIGH;
+ break;
+ case DB_PRIORITY_VERY_HIGH:
+ pfactor = MPOOL_PRI_VERY_HIGH;
+ break;
+ }
+
+ adjust = 0;
+ if (pfactor != 0)
+ adjust = (int)c_mp->pages / pfactor;
+
+ if (F_ISSET(bhp, BH_DIRTY))
+ adjust += (int)c_mp->pages / MPOOL_PRI_DIRTY;
+
+ if (adjust > 0) {
+ if (MPOOL_LRU_REDZONE - bhp->priority >=
+ (u_int32_t)adjust)
+ bhp->priority += adjust;
+ } else if (adjust < 0)
+ if (bhp->priority > (u_int32_t)-adjust)
+ bhp->priority += adjust;
+ }
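+
+ /*
+ * In effect (an illustrative summary, not new behavior): a positive
+ * pfactor raises the buffer's priority above the current LRU clock in
+ * proportion to the cache size, so the buffer survives longer; a
+ * negative pfactor lowers it, making the buffer a better eviction
+ * candidate; and dirty buffers get an extra MPOOL_PRI_DIRTY boost, so
+ * clean buffers are preferred victims.
+ */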
+
+ /*
+ * __memp_pgwrite only has a shared lock while it clears the
+ * BH_DIRTY bit. If we only have a shared latch then we can't
+ * touch the flags bits.
+ */
+ if (F_ISSET(bhp, BH_EXCLUSIVE))
+ F_CLR(bhp, BH_EXCLUSIVE);
+ MUTEX_UNLOCK(env, bhp->mtx_buf);
+
+ /*
+ * On every buffer put we update the cache lru priority and check
+ * for wraparound. The increment doesn't need to be atomic: occasional
+ * lost increments are okay; __memp_reset_lru handles race conditions.
+ */
+ if (++c_mp->lru_priority >= MPOOL_LRU_REDZONE &&
+ (t_ret = __memp_reset_lru(env, infop)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
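+
+/*
+ * A minimal sketch (not part of this change) of the public get/put
+ * pairing that ends in __memp_fput above: every page returned by
+ * DB_MPOOLFILE->get must be handed back with DB_MPOOLFILE->put. The
+ * page number is an illustrative assumption.
+ */
+#if 0
+#include <db.h>
+
+static int
+touch_page(mpf)
+ DB_MPOOLFILE *mpf;
+{
+ db_pgno_t pgno;
+ void *addr;
+ int ret;
+
+ pgno = 1;
+ if ((ret = mpf->get(mpf, &pgno, NULL, 0, &addr)) != 0)
+ return (ret);
+ /* ... read the page through addr ... */
+ return (mpf->put(mpf, addr, DB_PRIORITY_UNCHANGED, 0));
+}
+#endif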
+
+/*
+ * __memp_reset_lru --
+ * Reset the cache LRU priority when it reaches the upper limit.
+ */
+static int
+__memp_reset_lru(env, infop)
+ ENV *env;
+ REGINFO *infop;
+{
+ BH *bhp, *tbhp;
+ DB_MPOOL_HASH *hp;
+ MPOOL *c_mp;
+ u_int32_t bucket;
+ int reset;
+
+ /*
+ * Update the priority so all future allocations will start at the
+ * bottom. Lock this cache region to ensure that exactly one thread
+ * will reset this cache's buffers.
+ */
+ c_mp = infop->primary;
+ MPOOL_REGION_LOCK(env, infop);
+ reset = c_mp->lru_priority >= MPOOL_LRU_DECREMENT;
+ if (reset) {
+ c_mp->lru_priority -= MPOOL_LRU_DECREMENT;
+ c_mp->lru_generation++;
+ }
+ MPOOL_REGION_UNLOCK(env, infop);
+
+ if (!reset)
+ return (0);
+
+ /* Reduce the priority of every buffer in this cache region. */
+ for (hp = R_ADDR(infop, c_mp->htab),
+ bucket = 0; bucket < c_mp->htab_buckets; ++hp, ++bucket) {
+ /*
+ * Skip empty buckets.
+ *
+ * We can check for empty buckets before locking as we
+ * only care if the pointer is zero or non-zero.
+ */
+ if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
+ continue;
+
+ MUTEX_LOCK(env, hp->mtx_hash);
+ SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh) {
+ for (tbhp = bhp; tbhp != NULL;
+ tbhp = SH_CHAIN_PREV(tbhp, vc, __bh)) {
+ if (tbhp->priority > MPOOL_LRU_DECREMENT)
+ tbhp->priority -= MPOOL_LRU_DECREMENT;
+ else
+ tbhp->priority = 0;
+ }
+ }
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ }
+
+ COMPQUIET(env, NULL);
+ return (0);
+}
+
+/*
+ * __memp_unpin_buffers --
+ * Unpin buffers pinned by a thread.
+ *
+ * PUBLIC: int __memp_unpin_buffers __P((ENV *, DB_THREAD_INFO *));
+ */
+int
+__memp_unpin_buffers(env, ip)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+{
+ BH *bhp;
+ DB_MPOOL *dbmp;
+ DB_MPOOLFILE dbmf;
+ PIN_LIST *list, *lp;
+ REGINFO *rinfop, *reginfo;
+ int ret;
+
+ memset(&dbmf, 0, sizeof(dbmf));
+ dbmf.env = env;
+ dbmf.flags = MP_DUMMY;
+ dbmp = env->mp_handle;
+ reginfo = env->reginfo;
+
+ list = R_ADDR(reginfo, ip->dbth_pinlist);
+ for (lp = list; lp < &list[ip->dbth_pinmax]; lp++) {
+ if (lp->b_ref == INVALID_ROFF)
+ continue;
+ rinfop = &dbmp->reginfo[lp->region];
+ bhp = R_ADDR(rinfop, lp->b_ref);
+ dbmf.mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
+ if ((ret = __memp_fput(&dbmf, ip,
+ (u_int8_t *)bhp + SSZA(BH, buf),
+ DB_PRIORITY_UNCHANGED)) != 0)
+ return (ret);
+ }
+ return (0);
+}
diff --git a/src/mp/mp_fset.c b/src/mp/mp_fset.c
new file mode 100644
index 00000000..1129853f
--- /dev/null
+++ b/src/mp/mp_fset.c
@@ -0,0 +1,170 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+/*
+ * __memp_dirty --
+ * Upgrade a page from a read-only to a writable pointer.
+ *
+ * PUBLIC: int __memp_dirty __P((DB_MPOOLFILE *, void *,
+ * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DB_CACHE_PRIORITY, u_int32_t));
+ */
+int
+__memp_dirty(dbmfp, addrp, ip, txn, priority, flags)
+ DB_MPOOLFILE *dbmfp;
+ void *addrp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ DB_CACHE_PRIORITY priority;
+ u_int32_t flags;
+{
+ BH *bhp;
+ DB_MPOOL *dbmp;
+ DB_MPOOL_HASH *hp;
+ DB_TXN *ancestor;
+ ENV *env;
+ MPOOL *c_mp;
+#ifdef DIAG_MVCC
+ MPOOLFILE *mfp;
+#endif
+ REGINFO *infop;
+ int mvcc, ret;
+ db_pgno_t pgno;
+ void *pgaddr;
+
+ env = dbmfp->env;
+ dbmp = env->mp_handle;
+ mvcc = atomic_read(&dbmfp->mfp->multiversion);
+
+ /* Convert the page address to a buffer header. */
+ pgaddr = *(void **)addrp;
+ bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf));
+ pgno = bhp->pgno;
+
+ /* If we have it exclusively, then it's already dirty. */
+ if (F_ISSET(bhp, BH_EXCLUSIVE)) {
+ DB_ASSERT(env, F_ISSET(bhp, BH_DIRTY));
+ return (0);
+ }
+
+ if (flags == 0)
+ flags = DB_MPOOL_DIRTY;
+ DB_ASSERT(env, flags == DB_MPOOL_DIRTY || flags == DB_MPOOL_EDIT);
+
+ if (F_ISSET(dbmfp, MP_READONLY)) {
+ __db_errx(env, DB_STR_A("3008",
+ "%s: dirty flag set for readonly file page", "%s"),
+ __memp_fn(dbmfp));
+ return (EACCES);
+ }
+
+ for (ancestor = txn;
+ ancestor != NULL && ancestor->parent != NULL;
+ ancestor = ancestor->parent)
+ ;
+
+ if (mvcc && txn != NULL && flags == DB_MPOOL_DIRTY &&
+ (!BH_OWNED_BY(env, bhp, ancestor) || SH_CHAIN_HASNEXT(bhp, vc))) {
+ atomic_inc(env, &bhp->ref);
+ *(void **)addrp = NULL;
+ if ((ret = __memp_fput(dbmfp, ip, pgaddr, priority)) != 0) {
+ __db_errx(env, DB_STR_A("3009",
+ "%s: error releasing a read-only page", "%s"),
+ __memp_fn(dbmfp));
+ atomic_dec(env, &bhp->ref);
+ return (ret);
+ }
+ if ((ret = __memp_fget(dbmfp,
+ &pgno, ip, txn, flags, addrp)) != 0) {
+ if (ret != DB_LOCK_DEADLOCK)
+ __db_errx(env, DB_STR_A("3010",
+ "%s: error getting a page for writing",
+ "%s"), __memp_fn(dbmfp));
+ atomic_dec(env, &bhp->ref);
+ return (ret);
+ }
+ atomic_dec(env, &bhp->ref);
+
+ /*
+ * If the MVCC handle count hasn't changed, we should get a
+ * different version of the page.
+ */
+ DB_ASSERT(env, *(void **)addrp != pgaddr ||
+ mvcc != atomic_read(&dbmfp->mfp->multiversion));
+
+ pgaddr = *(void **)addrp;
+ bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf));
+ DB_ASSERT(env, pgno == bhp->pgno);
+ return (0);
+ }
+
+ infop = &dbmp->reginfo[bhp->region];
+ c_mp = infop->primary;
+ hp = R_ADDR(infop, c_mp->htab);
+ hp = &hp[bhp->bucket];
+
+ /* Drop the shared latch and get an exclusive. We have the buf ref'ed.*/
+ MUTEX_UNLOCK(env, bhp->mtx_buf);
+ MUTEX_LOCK(env, bhp->mtx_buf);
+ DB_ASSERT(env, !F_ISSET(bhp, BH_EXCLUSIVE));
+ F_SET(bhp, BH_EXCLUSIVE);
+
+ /* Set/clear the page bits. */
+ if (!F_ISSET(bhp, BH_DIRTY)) {
+#ifdef DIAGNOSTIC
+ MUTEX_LOCK(env, hp->mtx_hash);
+#endif
+ atomic_inc(env, &hp->hash_page_dirty);
+ F_SET(bhp, BH_DIRTY);
+#ifdef DIAGNOSTIC
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+#endif
+ }
+
+#ifdef DIAG_MVCC
+ mfp = R_ADDR(env->mp_handle->reginfo, bhp->mf_offset);
+ MVCC_MPROTECT(bhp->buf, mfp->pagesize, PROT_READ | PROT_WRITE);
+#endif
+ DB_ASSERT(env, !F_ISSET(bhp, BH_DIRTY) ||
+ atomic_read(&hp->hash_page_dirty) != 0);
+ return (0);
+}
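+
+/*
+ * A minimal sketch (not part of this change): asking for a writable
+ * version of a page up front with DB_MPOOL_DIRTY, the flag
+ * __memp_dirty defaults to above. The page number is an illustrative
+ * assumption.
+ */
+#if 0
+#include <db.h>
+
+static int
+write_page(mpf, txn)
+ DB_MPOOLFILE *mpf;
+ DB_TXN *txn;
+{
+ db_pgno_t pgno;
+ void *addr;
+ int ret;
+
+ pgno = 1;
+ /* DB_MPOOL_DIRTY returns a version of the page safe to modify. */
+ if ((ret = mpf->get(mpf, &pgno, txn, DB_MPOOL_DIRTY, &addr)) != 0)
+ return (ret);
+ /* ... modify the page through addr ... */
+ return (mpf->put(mpf, addr, DB_PRIORITY_UNCHANGED, 0));
+}
+#endif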
+
+/*
+ * __memp_shared --
+ * Downgrade a page from exclusively held to shared.
+ *
+ * PUBLIC: int __memp_shared __P((DB_MPOOLFILE *, void *));
+ */
+int
+__memp_shared(dbmfp, pgaddr)
+ DB_MPOOLFILE *dbmfp;
+ void *pgaddr;
+{
+ BH *bhp;
+ ENV *env;
+
+ env = dbmfp->env;
+ /* Convert the page address to a buffer header. */
+ bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf));
+
+ if (F_ISSET(bhp, BH_DIRTY))
+ dbmfp->mfp->file_written = 1;
+ DB_ASSERT(env, F_ISSET(bhp, BH_EXCLUSIVE));
+ F_CLR(bhp, BH_EXCLUSIVE);
+ MUTEX_UNLOCK(env, bhp->mtx_buf);
+ MUTEX_READLOCK(env, bhp->mtx_buf);
+
+ return (0);
+}
diff --git a/src/mp/mp_method.c b/src/mp/mp_method.c
new file mode 100644
index 00000000..7afae248
--- /dev/null
+++ b/src/mp/mp_method.c
@@ -0,0 +1,1091 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/mp.h"
+#include "dbinc/db_page.h"
+#include "dbinc/hash.h"
+
+/*
+ * __memp_env_create --
+ * Mpool specific creation of the DB_ENV structure.
+ *
+ * PUBLIC: int __memp_env_create __P((DB_ENV *));
+ */
+int
+__memp_env_create(dbenv)
+ DB_ENV *dbenv;
+{
+ /*
+ * !!!
+ * Our caller has not yet had the opportunity to reset the panic
+ * state or turn off mutex locking, and so we can neither check
+ * the panic state nor acquire a mutex in the DB_ENV create path.
+ *
+ * We default to 32 8K pages. We don't default to a flat 256K, because
+ * we want to include the size of the buffer header which can vary
+ * from system to system.
+ */
+ dbenv->mp_bytes =
+ 32 * ((8 * 1024) + sizeof(BH)) + 37 * sizeof(DB_MPOOL_HASH);
+ dbenv->mp_ncache = 1;
+
+ return (0);
+}
+
+/*
+ * __memp_env_destroy --
+ * Mpool specific destruction of the DB_ENV structure.
+ *
+ * PUBLIC: void __memp_env_destroy __P((DB_ENV *));
+ */
+void
+__memp_env_destroy(dbenv)
+ DB_ENV *dbenv;
+{
+ COMPQUIET(dbenv, NULL);
+}
+
+/*
+ * __memp_get_cachesize --
+ * {DB_ENV,DB}->get_cachesize.
+ *
+ * PUBLIC: int __memp_get_cachesize
+ * PUBLIC: __P((DB_ENV *, u_int32_t *, u_int32_t *, int *));
+ */
+int
+__memp_get_cachesize(dbenv, gbytesp, bytesp, ncachep)
+ DB_ENV *dbenv;
+ u_int32_t *gbytesp, *bytesp;
+ int *ncachep;
+{
+ DB_MPOOL *dbmp;
+ ENV *env;
+ MPOOL *mp;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->mp_handle, "DB_ENV->get_cachesize", DB_INIT_MPOOL);
+
+ if (MPOOL_ON(env)) {
+ dbmp = env->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+ if (gbytesp != NULL)
+ *gbytesp = mp->gbytes;
+ if (bytesp != NULL)
+ *bytesp = mp->bytes;
+ if (ncachep != NULL)
+ *ncachep = (int)mp->nreg;
+ } else {
+ if (gbytesp != NULL)
+ *gbytesp = dbenv->mp_gbytes;
+ if (bytesp != NULL)
+ *bytesp = dbenv->mp_bytes;
+ if (ncachep != NULL)
+ *ncachep = (int)dbenv->mp_ncache;
+ }
+
+ return (0);
+}
+
+/*
+ * __memp_set_cachesize --
+ * {DB_ENV,DB}->set_cachesize.
+ *
+ * PUBLIC: int __memp_set_cachesize __P((DB_ENV *, u_int32_t, u_int32_t, int));
+ */
+int
+__memp_set_cachesize(dbenv, gbytes, bytes, arg_ncache)
+ DB_ENV *dbenv;
+ u_int32_t gbytes, bytes;
+ int arg_ncache;
+{
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ u_int ncache;
+ int ret;
+
+ env = dbenv->env;
+ ret = 0;
+ ENV_NOT_CONFIGURED(env,
+ env->mp_handle, "DB_ENV->set_cachesize", DB_INIT_MPOOL);
+
+ /* Normalize the cache count. */
+ ncache = arg_ncache <= 0 ? 1 : (u_int)arg_ncache;
+
+ /*
+ * You can only store 4GB-1 in an unsigned 32-bit value, so correct for
+ * applications that specify 4GB cache sizes -- we know what they meant.
+ */
+ if (sizeof(roff_t) == 4 && gbytes / ncache == 4 && bytes == 0) {
+ --gbytes;
+ bytes = GIGABYTE - 1;
+ } else {
+ gbytes += bytes / GIGABYTE;
+ bytes %= GIGABYTE;
+ }
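+
+ /*
+ * For example (illustrative): gbytes == 4, bytes == 0, ncache == 1 on
+ * a build with 4-byte region offsets becomes gbytes == 3,
+ * bytes == GIGABYTE - 1, while gbytes == 1, bytes == 3 * GIGABYTE
+ * normalizes to gbytes == 4, bytes == 0.
+ */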
+
+ /*
+ * !!!
+ * With 32-bit region offsets, individual cache regions must be smaller
+ * than 4GB. Also, cache sizes larger than 10TB would cause 32-bit
+ * wrapping in the calculation of the number of hash buckets. See
+ * __memp_open for details.
+ */
+ if (!F_ISSET(env, ENV_OPEN_CALLED)) {
+ if (sizeof(roff_t) <= 4 && gbytes / ncache >= 4) {
+ __db_errx(env, DB_STR("3003",
+ "individual cache size too large: maximum is 4GB"));
+ return (EINVAL);
+ }
+ if (gbytes / ncache > 10000) {
+ __db_errx(env, DB_STR("3004",
+ "individual cache size too large: maximum is 10TB"));
+ return (EINVAL);
+ }
+ }
+
+ /*
+ * If the application requested less than 500MB, increase the cache size
+ * by 25% and factor in the size of the hash buckets to account for our
+ * overhead. (I'm guessing caches over 500MB are specifically sized,
+ * that is, it's a large server and the application actually knows how
+ * much memory is available. We only document the 25% overhead number,
+ * not the hash buckets, but I don't see a reason to confuse the issue,
+ * it shouldn't matter to an application.)
+ *
+ * There is a minimum cache size, regardless.
+ */
+ if (gbytes == 0) {
+ if (bytes < 500 * MEGABYTE)
+ bytes += (bytes / 4) + 37 * sizeof(DB_MPOOL_HASH);
+ if (bytes / ncache < DB_CACHESIZE_MIN)
+ bytes = ncache * DB_CACHESIZE_MIN;
+ }
+
+ if (F_ISSET(env, ENV_OPEN_CALLED)) {
+ ENV_ENTER(env, ip);
+ ret = __memp_resize(env->mp_handle, gbytes, bytes);
+ ENV_LEAVE(env, ip);
+ return (ret);
+ }
+
+ dbenv->mp_gbytes = gbytes;
+ dbenv->mp_bytes = bytes;
+ dbenv->mp_ncache = ncache;
+
+ return (0);
+}
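+
+/*
+ * A minimal sketch (not part of this change): configuring a single
+ * 64MB cache region before DB_ENV->open, the path that stores into the
+ * dbenv->mp_* fields above. The size is an illustrative assumption.
+ */
+#if 0
+#include <db.h>
+
+static int
+size_cache(dbenv)
+ DB_ENV *dbenv;
+{
+ /* 0GB + 64MB in one cache region. */
+ return (dbenv->set_cachesize(dbenv, 0, 64 * 1024 * 1024, 1));
+}
+#endif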
+
+/*
+ * __memp_set_config --
+ * Set the cache subsystem configuration.
+ *
+ * PUBLIC: int __memp_set_config __P((DB_ENV *, u_int32_t, int));
+ */
+int
+__memp_set_config(dbenv, which, on)
+ DB_ENV *dbenv;
+ u_int32_t which;
+ int on;
+{
+ DB_MPOOL *dbmp;
+ ENV *env;
+ MPOOL *mp;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->mp_handle, "DB_ENV->memp_set_config", DB_INIT_MPOOL);
+
+ switch (which) {
+ case DB_MEMP_SUPPRESS_WRITE:
+ case DB_MEMP_SYNC_INTERRUPT:
+ if (MPOOL_ON(env)) {
+ dbmp = env->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+ if (on)
+ FLD_SET(mp->config_flags, which);
+ else
+ FLD_CLR(mp->config_flags, which);
+ }
+ break;
+ default:
+ return (EINVAL);
+ }
+ return (0);
+}
+
+/*
+ * __memp_get_config --
+ * Return the cache subsystem configuration.
+ *
+ * PUBLIC: int __memp_get_config __P((DB_ENV *, u_int32_t, int *));
+ */
+int
+__memp_get_config(dbenv, which, onp)
+ DB_ENV *dbenv;
+ u_int32_t which;
+ int *onp;
+{
+ DB_MPOOL *dbmp;
+ ENV *env;
+ MPOOL *mp;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->mp_handle, "DB_ENV->memp_get_config", DB_INIT_MPOOL);
+
+ switch (which) {
+ case DB_MEMP_SUPPRESS_WRITE:
+ case DB_MEMP_SYNC_INTERRUPT:
+ if (MPOOL_ON(env)) {
+ dbmp = env->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+ *onp = FLD_ISSET(mp->config_flags, which) ? 1 : 0;
+ } else
+ *onp = 0;
+ break;
+ default:
+ return (EINVAL);
+ }
+ return (0);
+}
+
+/*
+ * PUBLIC: int __memp_get_mp_max_openfd __P((DB_ENV *, int *));
+ */
+int
+__memp_get_mp_max_openfd(dbenv, maxopenfdp)
+ DB_ENV *dbenv;
+ int *maxopenfdp;
+{
+ DB_MPOOL *dbmp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ MPOOL *mp;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->mp_handle, "DB_ENV->get_mp_max_openfd", DB_INIT_MPOOL);
+
+ if (MPOOL_ON(env)) {
+ dbmp = env->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+ ENV_ENTER(env, ip);
+ MPOOL_SYSTEM_LOCK(env);
+ *maxopenfdp = mp->mp_maxopenfd;
+ MPOOL_SYSTEM_UNLOCK(env);
+ ENV_LEAVE(env, ip);
+ } else
+ *maxopenfdp = dbenv->mp_maxopenfd;
+ return (0);
+}
+
+/*
+ * __memp_set_mp_max_openfd --
+ * Set the maximum number of open fd's when flushing the cache.
+ * PUBLIC: int __memp_set_mp_max_openfd __P((DB_ENV *, int));
+ */
+int
+__memp_set_mp_max_openfd(dbenv, maxopenfd)
+ DB_ENV *dbenv;
+ int maxopenfd;
+{
+ DB_MPOOL *dbmp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ MPOOL *mp;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->mp_handle, "DB_ENV->set_mp_max_openfd", DB_INIT_MPOOL);
+
+ if (MPOOL_ON(env)) {
+ dbmp = env->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+ ENV_ENTER(env, ip);
+ MPOOL_SYSTEM_LOCK(env);
+ mp->mp_maxopenfd = maxopenfd;
+ MPOOL_SYSTEM_UNLOCK(env);
+ ENV_LEAVE(env, ip);
+ } else
+ dbenv->mp_maxopenfd = maxopenfd;
+ return (0);
+}
+
+/*
+ * PUBLIC: int __memp_get_mp_max_write __P((DB_ENV *, int *, db_timeout_t *));
+ */
+int
+__memp_get_mp_max_write(dbenv, maxwritep, maxwrite_sleepp)
+ DB_ENV *dbenv;
+ int *maxwritep;
+ db_timeout_t *maxwrite_sleepp;
+{
+ DB_MPOOL *dbmp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ MPOOL *mp;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->mp_handle, "DB_ENV->get_mp_max_write", DB_INIT_MPOOL);
+
+ if (MPOOL_ON(env)) {
+ dbmp = env->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+ ENV_ENTER(env, ip);
+ MPOOL_SYSTEM_LOCK(env);
+ *maxwritep = mp->mp_maxwrite;
+ *maxwrite_sleepp = mp->mp_maxwrite_sleep;
+ MPOOL_SYSTEM_UNLOCK(env);
+ ENV_LEAVE(env, ip);
+ } else {
+ *maxwritep = dbenv->mp_maxwrite;
+ *maxwrite_sleepp = dbenv->mp_maxwrite_sleep;
+ }
+ return (0);
+}
+
+/*
+ * __memp_set_mp_max_write --
+ * Set the maximum continuous I/O count.
+ *
+ * PUBLIC: int __memp_set_mp_max_write __P((DB_ENV *, int, db_timeout_t));
+ */
+int
+__memp_set_mp_max_write(dbenv, maxwrite, maxwrite_sleep)
+ DB_ENV *dbenv;
+ int maxwrite;
+ db_timeout_t maxwrite_sleep;
+{
+ DB_MPOOL *dbmp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ MPOOL *mp;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->mp_handle, "DB_ENV->get_mp_max_write", DB_INIT_MPOOL);
+
+ if (MPOOL_ON(env)) {
+ dbmp = env->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+ ENV_ENTER(env, ip);
+ MPOOL_SYSTEM_LOCK(env);
+ mp->mp_maxwrite = maxwrite;
+ mp->mp_maxwrite_sleep = maxwrite_sleep;
+ MPOOL_SYSTEM_UNLOCK(env);
+ ENV_LEAVE(env, ip);
+ } else {
+ dbenv->mp_maxwrite = maxwrite;
+ dbenv->mp_maxwrite_sleep = maxwrite_sleep;
+ }
+ return (0);
+}
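+
+/*
+ * A minimal sketch (not part of this change): limiting background
+ * flushes to 32 pages per burst with a 1000-microsecond pause between
+ * bursts. The values are illustrative assumptions.
+ */
+#if 0
+#include <db.h>
+
+static int
+throttle_writes(dbenv)
+ DB_ENV *dbenv;
+{
+ return (dbenv->set_mp_max_write(dbenv, 32, 1000));
+}
+#endif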
+
+/*
+ * PUBLIC: int __memp_get_mp_mmapsize __P((DB_ENV *, size_t *));
+ */
+int
+__memp_get_mp_mmapsize(dbenv, mp_mmapsizep)
+ DB_ENV *dbenv;
+ size_t *mp_mmapsizep;
+{
+ DB_MPOOL *dbmp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ MPOOL *mp;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->mp_handle, "DB_ENV->get_mp_max_mmapsize", DB_INIT_MPOOL);
+
+ if (MPOOL_ON(env)) {
+ dbmp = env->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+ ENV_ENTER(env, ip);
+ MPOOL_SYSTEM_LOCK(env);
+ *mp_mmapsizep = mp->mp_mmapsize;
+ MPOOL_SYSTEM_UNLOCK(env);
+ ENV_LEAVE(env, ip);
+ } else
+ *mp_mmapsizep = dbenv->mp_mmapsize;
+ return (0);
+}
+
+/*
+ * __memp_set_mp_mmapsize --
+ * DB_ENV->set_mp_mmapsize.
+ *
+ * PUBLIC: int __memp_set_mp_mmapsize __P((DB_ENV *, size_t));
+ */
+int
+__memp_set_mp_mmapsize(dbenv, mp_mmapsize)
+ DB_ENV *dbenv;
+ size_t mp_mmapsize;
+{
+ DB_MPOOL *dbmp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ MPOOL *mp;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->mp_handle, "DB_ENV->set_mp_max_mmapsize", DB_INIT_MPOOL);
+
+ if (MPOOL_ON(env)) {
+ dbmp = env->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+ ENV_ENTER(env, ip);
+ MPOOL_SYSTEM_LOCK(env);
+ /*
+ * We need to cast here because size_t and db_size_t can be
+ * different on a 64 bit build, when building in 32 bit
+ * compatibility mode. The cast is safe, because we check for
+ * overflow when the fields are assigned.
+ */
+ mp->mp_mmapsize = (db_size_t)mp_mmapsize;
+ MPOOL_SYSTEM_UNLOCK(env);
+ ENV_LEAVE(env, ip);
+ } else
+ dbenv->mp_mmapsize = (db_size_t)mp_mmapsize;
+ return (0);
+}
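+
+/*
+ * A minimal sketch (not part of this change): capping the size of
+ * read-only files the pool will map into memory, the limit consulted
+ * by __memp_fopen. The 16MB value is an illustrative assumption.
+ */
+#if 0
+#include <db.h>
+
+static int
+cap_mmap(dbenv)
+ DB_ENV *dbenv;
+{
+ return (dbenv->set_mp_mmapsize(dbenv, 16 * 1024 * 1024));
+}
+#endif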
+
+/*
+ * PUBLIC: int __memp_get_mp_pagesize __P((DB_ENV *, u_int32_t *));
+ */
+int
+__memp_get_mp_pagesize(dbenv, mp_pagesizep)
+ DB_ENV *dbenv;
+ u_int32_t *mp_pagesizep;
+{
+ DB_MPOOL *dbmp;
+ ENV *env;
+ MPOOL *mp;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->mp_handle, "DB_ENV->get_mp_max_pagesize", DB_INIT_MPOOL);
+
+ if (MPOOL_ON(env)) {
+ dbmp = env->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+ *mp_pagesizep = mp->pagesize;
+ } else {
+ *mp_pagesizep = dbenv->mp_pagesize;
+ }
+ return (0);
+}
+
+/*
+ * __memp_set_mp_pagesize --
+ * DB_ENV->set_mp_pagesize.
+ *
+ * PUBLIC: int __memp_set_mp_pagesize __P((DB_ENV *, u_int32_t));
+ */
+int
+__memp_set_mp_pagesize(dbenv, mp_pagesize)
+ DB_ENV *dbenv;
+ u_int32_t mp_pagesize;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->mp_handle, "DB_ENV->get_mp_max_mmapsize", DB_INIT_MPOOL);
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_mp_pagesize");
+
+ dbenv->mp_pagesize = mp_pagesize;
+ return (0);
+}
+
+/*
+ * PUBLIC: int __memp_get_mp_tablesize __P((DB_ENV *, u_int32_t *));
+ */
+int
+__memp_get_mp_tablesize(dbenv, mp_tablesizep)
+ DB_ENV *dbenv;
+ u_int32_t *mp_tablesizep;
+{
+ DB_MPOOL *dbmp;
+ ENV *env;
+ MPOOL *mp;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->mp_handle, "DB_ENV->get_mp_max_tablesize", DB_INIT_MPOOL);
+
+ if (MPOOL_ON(env)) {
+ dbmp = env->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+ *mp_tablesizep = mp->htab_buckets;
+ } else
+ *mp_tablesizep = dbenv->mp_tablesize;
+ return (0);
+}
+
+/*
+ * __memp_set_mp_tablesize --
+ * DB_ENV->set_mp_tablesize.
+ *
+ * PUBLIC: int __memp_set_mp_tablesize __P((DB_ENV *, u_int32_t));
+ */
+int
+__memp_set_mp_tablesize(dbenv, mp_tablesize)
+ DB_ENV *dbenv;
+ u_int32_t mp_tablesize;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->mp_handle, "DB_ENV->get_mp_max_mmapsize", DB_INIT_MPOOL);
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_mp_tablesize");
+
+ dbenv->mp_tablesize = mp_tablesize;
+ return (0);
+}
+
+/*
+ * PUBLIC: int __memp_get_mp_mtxcount __P((DB_ENV *, u_int32_t *));
+ */
+int
+__memp_get_mp_mtxcount(dbenv, mp_mtxcountp)
+ DB_ENV *dbenv;
+ u_int32_t *mp_mtxcountp;
+{
+ DB_MPOOL *dbmp;
+ ENV *env;
+ MPOOL *mp;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->mp_handle, "DB_ENV->get_mp_max_mtxcount", DB_INIT_MPOOL);
+
+ if (MPOOL_ON(env)) {
+ dbmp = env->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+ *mp_mtxcountp = mp->htab_mutexes;
+ } else
+ *mp_mtxcountp = dbenv->mp_mtxcount;
+ return (0);
+}
+
+/*
+ * __memp_set_mp_mtxcount --
+ * DB_ENV->set_mp_mtxcount.
+ *
+ * PUBLIC: int __memp_set_mp_mtxcount __P((DB_ENV *, u_int32_t));
+ */
+int
+__memp_set_mp_mtxcount(dbenv, mp_mtxcount)
+ DB_ENV *dbenv;
+ u_int32_t mp_mtxcount;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->mp_handle, "DB_ENV->get_mp_max_mmapsize", DB_INIT_MPOOL);
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_mp_mtxcount");
+
+ dbenv->mp_mtxcount = mp_mtxcount;
+ return (0);
+}
+
+/*
+ * __memp_nameop
+ * Remove or rename a file in the pool.
+ *
+ * PUBLIC: int __memp_nameop __P((ENV *,
+ * PUBLIC: u_int8_t *, const char *, const char *, const char *, int));
+ *
+ * XXX
+ * Undocumented interface: DB private.
+ */
+int
+__memp_nameop(env, fileid, newname, fullold, fullnew, inmem)
+ ENV *env;
+ u_int8_t *fileid;
+ const char *newname, *fullold, *fullnew;
+ int inmem;
+{
+ DB_MPOOL *dbmp;
+ DB_MPOOL_HASH *hp, *nhp;
+ MPOOL *mp;
+ MPOOLFILE *mfp;
+ roff_t newname_off;
+ u_int32_t bucket;
+ int locked, ret;
+ size_t nlen;
+ void *p;
+
+#undef op_is_remove
+#define op_is_remove (newname == NULL)
+
+ COMPQUIET(bucket, 0);
+ COMPQUIET(hp, NULL);
+ COMPQUIET(newname_off, 0);
+ COMPQUIET(nlen, 0);
+
+ dbmp = NULL;
+ mfp = NULL;
+ nhp = NULL;
+ p = NULL;
+ locked = ret = 0;
+
+ if (!MPOOL_ON(env))
+ goto fsop;
+
+ dbmp = env->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+ hp = R_ADDR(dbmp->reginfo, mp->ftab);
+
+ if (!op_is_remove) {
+ nlen = strlen(newname);
+ if ((ret = __memp_alloc(dbmp, dbmp->reginfo,
+ NULL, nlen + 1, &newname_off, &p)) != 0)
+ return (ret);
+ memcpy(p, newname, nlen + 1);
+ }
+
+ /*
+ * Remove or rename a file that the mpool might know about. We assume
+ * that the fop layer has the file locked for exclusive access, so we
+ * don't worry about locking except for the mpool mutexes. Checkpoint
+ * can happen at any time, independent of file locking, so we have to
+ * do the actual unlink or rename system call while holding
+ * all affected buckets locked.
+ *
+ * If this is a rename and this is an in-memory file, then we need
+ * to make sure that the new name does not exist. Since we
+ * are locking two buckets, lock them in ascending order.
+ */
+ if (inmem) {
+ DB_ASSERT(env, fullold != NULL);
+ hp += FNBUCKET(fullold, strlen(fullold));
+ if (!op_is_remove) {
+ bucket = FNBUCKET(newname, nlen);
+ nhp = R_ADDR(dbmp->reginfo, mp->ftab);
+ nhp += bucket;
+ }
+ } else
+ hp += FNBUCKET(fileid, DB_FILE_ID_LEN);
+
+ if (nhp != NULL && nhp < hp)
+ MUTEX_LOCK(env, nhp->mtx_hash);
+ MUTEX_LOCK(env, hp->mtx_hash);
+ if (nhp != NULL && nhp > hp)
+ MUTEX_LOCK(env, nhp->mtx_hash);
+ locked = 1;
+
+ if (!op_is_remove && inmem) {
+ SH_TAILQ_FOREACH(mfp, &nhp->hash_bucket, q, __mpoolfile)
+ if (!mfp->deadfile &&
+ mfp->no_backing_file && strcmp(newname,
+ R_ADDR(dbmp->reginfo, mfp->path_off)) == 0)
+ break;
+ if (mfp != NULL) {
+ ret = EEXIST;
+ goto err;
+ }
+ }
+
+ /*
+ * Find the file -- if mpool doesn't know about this file, that may
+ * not be an error.
+ */
+ SH_TAILQ_FOREACH(mfp, &hp->hash_bucket, q, __mpoolfile) {
+ /* Ignore non-active files. */
+ if (mfp->deadfile || F_ISSET(mfp, MP_TEMP))
+ continue;
+
+ /* Try to match on fileid. */
+ if (memcmp(fileid, R_ADDR(
+ dbmp->reginfo, mfp->fileid_off), DB_FILE_ID_LEN) != 0)
+ continue;
+
+ break;
+ }
+
+ if (mfp == NULL) {
+ if (inmem) {
+ ret = ENOENT;
+ goto err;
+ }
+ goto fsop;
+ }
+
+ if (op_is_remove) {
+ MUTEX_LOCK(env, mfp->mutex);
+ /*
+ * In-memory dbs have an artificially incremented ref count so
+ * they do not get reclaimed as long as they exist. Since we
+ * are now deleting the database, we need to dec that count.
+ */
+ if (mfp->no_backing_file)
+ mfp->mpf_cnt--;
+ mfp->deadfile = 1;
+ MUTEX_UNLOCK(env, mfp->mutex);
+ } else {
+ /*
+ * Else, it's a rename. We've allocated memory for the new
+ * name. Swap it with the old one. If it's in memory we
+ * need to move it to the right bucket.
+ */
+ p = R_ADDR(dbmp->reginfo, mfp->path_off);
+ mfp->path_off = newname_off;
+
+ if (inmem && hp != nhp) {
+ DB_ASSERT(env, nhp != NULL);
+ SH_TAILQ_REMOVE(&hp->hash_bucket, mfp, q, __mpoolfile);
+ mfp->bucket = bucket;
+ SH_TAILQ_INSERT_TAIL(&nhp->hash_bucket, mfp, q);
+ }
+ }
+
+fsop: /*
+ * If this is a real file, then mfp could be NULL because
+ * mpool isn't turned on; we still need to do the file ops.
+ */
+ if (mfp == NULL || !mfp->no_backing_file) {
+ if (op_is_remove) {
+ /*
+ * !!!
+ * Replication may ask us to unlink a file that's been
+ * renamed. Don't complain if it doesn't exist.
+ */
+ if ((ret = __os_unlink(env, fullold, 0)) == ENOENT)
+ ret = 0;
+ } else {
+ /*
+ * Defensive only; fullnew should never be
+ * NULL.
+ */
+ DB_ASSERT(env, fullnew != NULL);
+ if (fullnew == NULL) {
+ ret = EINVAL;
+ goto err;
+ }
+ ret = __os_rename(env, fullold, fullnew, 1);
+ }
+ }
+
+ /* Delete the memory we no longer need. */
+err: if (p != NULL) {
+ MPOOL_REGION_LOCK(env, &dbmp->reginfo[0]);
+ __memp_free(&dbmp->reginfo[0], p);
+ MPOOL_REGION_UNLOCK(env, &dbmp->reginfo[0]);
+ }
+
+ /* If we have buckets locked, unlock them when done moving files. */
+ if (locked == 1) {
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ if (nhp != NULL && nhp != hp)
+ MUTEX_UNLOCK(env, nhp->mtx_hash);
+ }
+ return (ret);
+}
+
+/*
+ * __memp_ftruncate --
+ * Truncate the file.
+ *
+ * PUBLIC: int __memp_ftruncate __P((DB_MPOOLFILE *, DB_TXN *,
+ * PUBLIC: DB_THREAD_INFO *, db_pgno_t, u_int32_t));
+ */
+int
+__memp_ftruncate(dbmfp, txn, ip, pgno, flags)
+ DB_MPOOLFILE *dbmfp;
+ DB_TXN *txn;
+ DB_THREAD_INFO *ip;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ ENV *env;
+ MPOOLFILE *mfp;
+ void *pagep;
+ db_pgno_t last_pgno, pg;
+ int ret;
+
+ env = dbmfp->env;
+ mfp = dbmfp->mfp;
+ ret = 0;
+
+ MUTEX_LOCK(env, mfp->mutex);
+ last_pgno = mfp->last_pgno;
+ MUTEX_UNLOCK(env, mfp->mutex);
+
+ if (pgno > last_pgno) {
+ if (LF_ISSET(MP_TRUNC_RECOVER))
+ return (0);
+ __db_errx(env, DB_STR("3005",
+ "Truncate beyond the end of file"));
+ return (EINVAL);
+ }
+
+ pg = pgno;
+ if (!LF_ISSET(MP_TRUNC_NOCACHE))
+ do {
+ if (mfp->block_cnt == 0)
+ break;
+ if ((ret = __memp_fget(dbmfp, &pg,
+ ip, txn, DB_MPOOL_FREE, &pagep)) != 0)
+ return (ret);
+ } while (pg++ < last_pgno);
+
+ /*
+ * If we are aborting an extend of a file, the call to __os_truncate
+ * could extend the file if the new page(s) had not yet been
+ * written to disk. We do not want to extend the file to pages
+ * whose log records are not yet flushed [#14031]. In addition if
+ * we are out of disk space we can generate an error [#12743].
+ */
+ MUTEX_LOCK(env, mfp->mutex);
+ if (!F_ISSET(mfp, MP_TEMP) &&
+ !mfp->no_backing_file && pgno <= mfp->last_flushed_pgno)
+#ifdef HAVE_FTRUNCATE
+ ret = __os_truncate(env,
+ dbmfp->fhp, pgno, mfp->pagesize);
+#else
+ ret = __db_zero_extend(env,
+ dbmfp->fhp, pgno, mfp->last_pgno, mfp->pagesize);
+#endif
+
+ /*
+ * This set could race with another thread of control that is extending
+ * the file. It's not a problem because we should have the page
+ * locked at a higher level of the system.
+ */
+ if (ret == 0) {
+ mfp->last_pgno = pgno - 1;
+ if (mfp->last_flushed_pgno > mfp->last_pgno)
+ mfp->last_flushed_pgno = mfp->last_pgno;
+ }
+ MUTEX_UNLOCK(env, mfp->mutex);
+
+ return (ret);
+}
+
+#ifdef HAVE_FTRUNCATE
+/*
+ * Support routines for maintaining a sorted freelist while we try to rearrange
+ * and truncate the file.
+ */
+
+/*
+ * __memp_alloc_freelist --
+ * Allocate mpool space for the freelist.
+ *
+ * PUBLIC: int __memp_alloc_freelist __P((DB_MPOOLFILE *,
+ * PUBLIC: u_int32_t, db_pgno_t **));
+ */
+int
+__memp_alloc_freelist(dbmfp, nelems, listp)
+ DB_MPOOLFILE *dbmfp;
+ u_int32_t nelems;
+ db_pgno_t **listp;
+{
+ DB_MPOOL *dbmp;
+ ENV *env;
+ MPOOLFILE *mfp;
+ void *retp;
+ int ret;
+
+ env = dbmfp->env;
+ dbmp = env->mp_handle;
+ mfp = dbmfp->mfp;
+
+ *listp = NULL;
+
+ /*
+ * These fields are protected because the database layer
+ * has the metapage locked while manipulating them.
+ */
+ mfp->free_ref++;
+ if (mfp->free_size != 0)
+ return (EBUSY);
+
+ /* Allocate at least a few slots. */
+ mfp->free_cnt = nelems;
+ if (nelems == 0)
+ nelems = 50;
+
+ if ((ret = __memp_alloc(dbmp, dbmp->reginfo,
+ NULL, nelems * sizeof(db_pgno_t), &mfp->free_list, &retp)) != 0)
+ return (ret);
+
+ mfp->free_size = nelems * sizeof(db_pgno_t);
+ *listp = retp;
+ return (0);
+}
+
+/*
+ * __memp_free_freelist --
+ * Free the list.
+ *
+ * PUBLIC: int __memp_free_freelist __P((DB_MPOOLFILE *));
+ */
+int
+__memp_free_freelist(dbmfp)
+ DB_MPOOLFILE *dbmfp;
+{
+ DB_MPOOL *dbmp;
+ ENV *env;
+ MPOOLFILE *mfp;
+
+ env = dbmfp->env;
+ dbmp = env->mp_handle;
+ mfp = dbmfp->mfp;
+
+ DB_ASSERT(env, mfp->free_ref > 0);
+ if (--mfp->free_ref > 0)
+ return (0);
+
+ DB_ASSERT(env, mfp->free_size != 0);
+
+ MPOOL_SYSTEM_LOCK(env);
+ __memp_free(dbmp->reginfo, R_ADDR(dbmp->reginfo, mfp->free_list));
+ MPOOL_SYSTEM_UNLOCK(env);
+
+ mfp->free_cnt = 0;
+ mfp->free_list = 0;
+ mfp->free_size = 0;
+ return (0);
+}
+
+/*
+ * __memp_get_freelist --
+ *	Return the current free list.
+ *
+ * PUBLIC: int __memp_get_freelist __P((
+ * PUBLIC: DB_MPOOLFILE *, u_int32_t *, db_pgno_t **));
+ */
+int
+__memp_get_freelist(dbmfp, nelemp, listp)
+ DB_MPOOLFILE *dbmfp;
+ u_int32_t *nelemp;
+ db_pgno_t **listp;
+{
+ DB_MPOOL *dbmp;
+ ENV *env;
+ MPOOLFILE *mfp;
+
+ env = dbmfp->env;
+ dbmp = env->mp_handle;
+ mfp = dbmfp->mfp;
+
+ if (mfp->free_size == 0) {
+ *nelemp = 0;
+ *listp = NULL;
+ } else {
+ *nelemp = mfp->free_cnt;
+ *listp = R_ADDR(dbmp->reginfo, mfp->free_list);
+ }
+
+ return (0);
+}
+
+/*
+ * __memp_extend_freelist --
+ * Extend the list.
+ *
+ * PUBLIC: int __memp_extend_freelist __P((
+ * PUBLIC: DB_MPOOLFILE *, u_int32_t , db_pgno_t **));
+ */
+int
+__memp_extend_freelist(dbmfp, count, listp)
+ DB_MPOOLFILE *dbmfp;
+ u_int32_t count;
+ db_pgno_t **listp;
+{
+ DB_MPOOL *dbmp;
+ ENV *env;
+ MPOOLFILE *mfp;
+ int ret;
+ size_t size;
+ void *retp;
+
+ env = dbmfp->env;
+ dbmp = env->mp_handle;
+ mfp = dbmfp->mfp;
+
+ if (mfp->free_size == 0)
+ return (EINVAL);
+
+ if (count * sizeof(db_pgno_t) > mfp->free_size) {
+ size = (size_t)DB_ALIGN(count * sizeof(db_pgno_t), 512);
+#ifdef HAVE_MIXED_SIZE_ADDRESSING
+ if (size >= 0xFFFFFFFF) {
+ __db_errx(env, DB_STR("3006",
+ "Can't get the required free size while"
+ "operating in mixed-size-addressing mode"));
+ return EINVAL;
+ }
+#endif
+ *listp = R_ADDR(dbmp->reginfo, mfp->free_list);
+ if ((ret = __memp_alloc(dbmp, dbmp->reginfo,
+ NULL, size, &mfp->free_list, &retp)) != 0)
+ return (ret);
+ mfp->free_size = (db_size_t)size;
+
+ memcpy(retp, *listp, mfp->free_cnt * sizeof(db_pgno_t));
+
+ MPOOL_SYSTEM_LOCK(env);
+ __memp_free(dbmp->reginfo, *listp);
+ MPOOL_SYSTEM_UNLOCK(env);
+ }
+
+ mfp->free_cnt = count;
+ *listp = R_ADDR(dbmp->reginfo, mfp->free_list);
+
+ return (0);
+}
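+
+/*
+ * Editor's sketch (not part of the library): the rough order in which a
+ * caller such as the btree compaction code might drive these helpers.
+ * All names are illustrative.
+ *
+ *	db_pgno_t *list;
+ *	if ((ret = __memp_alloc_freelist(dbmfp, nelems, &list)) != 0)
+ *		return (ret);			reserve shared-memory space
+ *	... record freed pages in list[] ...
+ *	if ((ret = __memp_extend_freelist(dbmfp, count, &list)) != 0)
+ *		return (ret);			grow the array, set the count
+ *	...
+ *	ret = __memp_free_freelist(dbmfp);	drop the reference
+ */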
+#endif
+
+/*
+ * __memp_set_last_pgno -- set the last page of the file
+ *
+ * PUBLIC: int __memp_set_last_pgno __P((DB_MPOOLFILE *, db_pgno_t));
+ */
+int
+__memp_set_last_pgno(dbmfp, pgno)
+ DB_MPOOLFILE *dbmfp;
+ db_pgno_t pgno;
+{
+ MPOOLFILE *mfp;
+
+ mfp = dbmfp->mfp;
+
+ if (mfp->mpf_cnt == 1) {
+ MUTEX_LOCK(dbmfp->env, mfp->mutex);
+ if (mfp->mpf_cnt == 1)
+ dbmfp->mfp->last_pgno = pgno;
+ MUTEX_UNLOCK(dbmfp->env, mfp->mutex);
+ }
+ return (0);
+}
diff --git a/src/mp/mp_mvcc.c b/src/mp/mp_mvcc.c
new file mode 100644
index 00000000..47531528
--- /dev/null
+++ b/src/mp/mp_mvcc.c
@@ -0,0 +1,636 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2006, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+static int __pgno_cmp __P((const void *, const void *));
+
+/*
+ * __memp_bh_settxn --
+ * Set the transaction that owns the given buffer.
+ *
+ * PUBLIC: int __memp_bh_settxn __P((DB_MPOOL *, MPOOLFILE *, BH *, void *));
+ */
+int
+__memp_bh_settxn(dbmp, mfp, bhp, vtd)
+ DB_MPOOL *dbmp;
+ MPOOLFILE *mfp;
+ BH *bhp;
+ void *vtd;
+{
+ ENV *env;
+ TXN_DETAIL *td;
+
+ env = dbmp->env;
+ td = (TXN_DETAIL *)vtd;
+
+ if (td == NULL) {
+ __db_errx(env, DB_STR_A("3002",
+ "%s: non-transactional update to a multiversion file",
+ "%s"), __memp_fns(dbmp, mfp));
+ return (EINVAL);
+ }
+
+ if (bhp->td_off != INVALID_ROFF) {
+ DB_ASSERT(env, BH_OWNER(env, bhp) == td);
+ return (0);
+ }
+
+ bhp->td_off = R_OFFSET(&env->tx_handle->reginfo, td);
+ return (__txn_add_buffer(env, td));
+}
+
+/*
+ * __memp_skip_curadj --
+ * Indicate whether a cursor adjustment can be skipped for a snapshot
+ * cursor.
+ *
+ * PUBLIC: int __memp_skip_curadj __P((DBC *, db_pgno_t));
+ */
+int
+__memp_skip_curadj(dbc, pgno)
+ DBC *dbc;
+ db_pgno_t pgno;
+{
+ BH *bhp;
+ DB_MPOOL *dbmp;
+ DB_MPOOLFILE *dbmfp;
+ DB_MPOOL_HASH *hp;
+ DB_TXN *txn;
+ ENV *env;
+ MPOOLFILE *mfp;
+ REGINFO *infop;
+ roff_t mf_offset;
+ int ret, skip;
+ u_int32_t bucket;
+
+ env = dbc->env;
+ dbmp = env->mp_handle;
+ dbmfp = dbc->dbp->mpf;
+ mfp = dbmfp->mfp;
+ mf_offset = R_OFFSET(dbmp->reginfo, mfp);
+ skip = 0;
+
+ for (txn = dbc->txn; txn->parent != NULL; txn = txn->parent)
+ ;
+
+ /*
+ * Determine the cache and hash bucket where this page lives and get
+ * local pointers to them.  They are reset on each pass through this
+ * code, since the page number can change.
+ */
+ MP_GET_BUCKET(env, mfp, pgno, &infop, hp, bucket, ret);
+ if (ret != 0) {
+ /* Panic: there is no way to return the error. */
+ (void)__env_panic(env, ret);
+ return (0);
+ }
+
+ SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh) {
+ if (bhp->pgno != pgno || bhp->mf_offset != mf_offset)
+ continue;
+
+ if (!BH_OWNED_BY(env, bhp, txn))
+ skip = 1;
+ break;
+ }
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+
+ return (skip);
+}
+
+#define DB_FREEZER_MAGIC 0x06102002
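+
+/*
+ * Editor's note: the reads and writes below imply the following freezer
+ * file layout.  Each file begins with a small header, followed by frozen
+ * page images:
+ *
+ *	u_int32_t magic		DB_FREEZER_MAGIC
+ *	db_pgno_t free		head of the free-page list (0 if empty)
+ *	db_pgno_t maxpgno	highest page number allocated so far
+ *
+ * Each free page stores the number of the next free page in its first
+ * db_pgno_t bytes, so the free list is threaded through the file itself.
+ */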
+
+/*
+ * __memp_bh_freeze --
+ * Save a buffer to temporary storage in case it is needed later by
+ * a snapshot transaction. This function should be called with the buffer
+ * locked and will exit with it locked. A BH_FROZEN buffer header is
+ * allocated to represent the frozen data in mpool.
+ *
+ * PUBLIC: int __memp_bh_freeze __P((DB_MPOOL *, REGINFO *, DB_MPOOL_HASH *,
+ * PUBLIC: BH *, int *));
+ */
+int
+__memp_bh_freeze(dbmp, infop, hp, bhp, need_frozenp)
+ DB_MPOOL *dbmp;
+ REGINFO *infop;
+ DB_MPOOL_HASH *hp;
+ BH *bhp;
+ int *need_frozenp;
+{
+ BH *frozen_bhp;
+ BH_FROZEN_ALLOC *frozen_alloc;
+ DB_FH *fhp;
+ ENV *env;
+ MPOOL *c_mp;
+ MPOOLFILE *mfp;
+ db_mutex_t mutex;
+ db_pgno_t maxpgno, newpgno, nextfree;
+ size_t nio;
+ int created, h_locked, ret, t_ret;
+ u_int32_t magic, nbucket, ncache, pagesize;
+ char filename[100], *real_name;
+
+ env = dbmp->env;
+ c_mp = infop->primary;
+ created = h_locked = ret = 0;
+ /* Find the associated MPOOLFILE. */
+ mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
+ pagesize = mfp->pagesize;
+ real_name = NULL;
+ fhp = NULL;
+
+ MVCC_MPROTECT(bhp->buf, pagesize, PROT_READ | PROT_WRITE);
+
+ MPOOL_REGION_LOCK(env, infop);
+ frozen_bhp = SH_TAILQ_FIRST(&c_mp->free_frozen, __bh);
+ if (frozen_bhp != NULL) {
+ SH_TAILQ_REMOVE(&c_mp->free_frozen, frozen_bhp, hq, __bh);
+ *need_frozenp = SH_TAILQ_EMPTY(&c_mp->free_frozen);
+ } else {
+ *need_frozenp = 1;
+
+ /* There might be a small amount of unallocated space. */
+ if (__env_alloc(infop,
+ sizeof(BH_FROZEN_ALLOC) + sizeof(BH_FROZEN_PAGE),
+ &frozen_alloc) == 0) {
+ frozen_bhp = (BH *)(frozen_alloc + 1);
+ frozen_bhp->mtx_buf = MUTEX_INVALID;
+ SH_TAILQ_INSERT_TAIL(&c_mp->alloc_frozen,
+ frozen_alloc, links);
+ }
+ }
+ MPOOL_REGION_UNLOCK(env, infop);
+
+ /*
+ * If we can't get a frozen buffer header, return ENOMEM immediately:
+ * we don't want to call __memp_alloc recursively. __memp_alloc will
+ * turn the next free page it finds into frozen buffer headers.
+ */
+ if (frozen_bhp == NULL) {
+ ret = ENOMEM;
+ goto err;
+ }
+
+ /*
+ * For now, keep things simple and have one file per page size per
+ * hash bucket. This improves concurrency but can mean lots of files
+ * if there is lots of freezing.
+ */
+ ncache = (u_int32_t)(infop - dbmp->reginfo);
+ nbucket = (u_int32_t)(hp - (DB_MPOOL_HASH *)R_ADDR(infop, c_mp->htab));
+ snprintf(filename, sizeof(filename), "__db.freezer.%lu.%lu.%luK",
+ (u_long)ncache, (u_long)nbucket, (u_long)pagesize / 1024);
+
+ if ((ret = __db_appname(env,
+ DB_APP_NONE, filename, NULL, &real_name)) != 0)
+ goto err;
+
+ MUTEX_LOCK(env, hp->mtx_hash);
+ h_locked = 1;
+ DB_ASSERT(env, F_ISSET(bhp, BH_EXCLUSIVE) && !F_ISSET(bhp, BH_FROZEN));
+
+ if (BH_REFCOUNT(bhp) > 1 || F_ISSET(bhp, BH_DIRTY)) {
+ ret = EBUSY;
+ goto err;
+ }
+
+ if ((ret = __os_open(env, real_name, pagesize,
+ DB_OSO_CREATE | DB_OSO_EXCL, env->db_mode, &fhp)) == 0) {
+ /* We're creating the file -- initialize the metadata page. */
+ created = 1;
+ magic = DB_FREEZER_MAGIC;
+ maxpgno = newpgno = 0;
+ if ((ret = __os_write(env, fhp,
+ &magic, sizeof(u_int32_t), &nio)) != 0 ||
+ (ret = __os_write(env, fhp,
+ &newpgno, sizeof(db_pgno_t), &nio)) != 0 ||
+ (ret = __os_write(env, fhp,
+ &maxpgno, sizeof(db_pgno_t), &nio)) != 0 ||
+ (ret = __os_seek(env, fhp, 0, 0, 0)) != 0)
+ goto err;
+ } else if (ret == EEXIST)
+ ret = __os_open(env,
+ real_name, pagesize, 0, env->db_mode, &fhp);
+ if (ret != 0)
+ goto err;
+ if ((ret = __os_read(env, fhp,
+ &magic, sizeof(u_int32_t), &nio)) != 0 ||
+ (ret = __os_read(env, fhp,
+ &newpgno, sizeof(db_pgno_t), &nio)) != 0 ||
+ (ret = __os_read(env, fhp,
+ &maxpgno, sizeof(db_pgno_t), &nio)) != 0)
+ goto err;
+ if (magic != DB_FREEZER_MAGIC) {
+ ret = EINVAL;
+ goto err;
+ }
+ if (newpgno == 0) {
+ newpgno = ++maxpgno;
+ if ((ret = __os_seek(env,
+ fhp, 0, 0, sizeof(u_int32_t) + sizeof(db_pgno_t))) != 0 ||
+ (ret = __os_write(env, fhp, &maxpgno, sizeof(db_pgno_t),
+ &nio)) != 0)
+ goto err;
+ } else {
+ if ((ret = __os_seek(env, fhp, newpgno, pagesize, 0)) != 0 ||
+ (ret = __os_read(env, fhp, &nextfree, sizeof(db_pgno_t),
+ &nio)) != 0)
+ goto err;
+ if ((ret =
+ __os_seek(env, fhp, 0, 0, sizeof(u_int32_t))) != 0 ||
+ (ret = __os_write(env, fhp, &nextfree, sizeof(db_pgno_t),
+ &nio)) != 0)
+ goto err;
+ }
+
+ /* Write the buffer to the allocated page. */
+ if ((ret = __os_io(env, DB_IO_WRITE, fhp, newpgno, pagesize, 0,
+ pagesize, bhp->buf, &nio)) != 0)
+ goto err;
+
+ ret = __os_closehandle(env, fhp);
+ fhp = NULL;
+ if (ret != 0)
+ goto err;
+
+ /*
+ * Set up the frozen_bhp with the freezer page number. The original
+ * buffer header is about to be freed, so transfer resources to the
+ * frozen header here.
+ */
+ mutex = frozen_bhp->mtx_buf;
+#ifdef DIAG_MVCC
+ memcpy(frozen_bhp, bhp, SSZ(BH, align_off));
+#else
+ memcpy(frozen_bhp, bhp, SSZA(BH, buf));
+#endif
+ atomic_init(&frozen_bhp->ref, 0);
+ if (mutex != MUTEX_INVALID)
+ frozen_bhp->mtx_buf = mutex;
+ else if ((ret = __mutex_alloc(env, MTX_MPOOL_BH,
+ DB_MUTEX_SHARED, &frozen_bhp->mtx_buf)) != 0)
+ goto err;
+ F_SET(frozen_bhp, BH_FROZEN);
+ F_CLR(frozen_bhp, BH_EXCLUSIVE);
+ ((BH_FROZEN_PAGE *)frozen_bhp)->spgno = newpgno;
+
+ /*
+ * We're about to add the frozen buffer header to the version chain, so
+ * we have temporarily created another buffer for the owning
+ * transaction.
+ */
+ if (frozen_bhp->td_off != INVALID_ROFF &&
+ (ret = __txn_add_buffer(env, BH_OWNER(env, frozen_bhp))) != 0) {
+ (void)__env_panic(env, ret);
+ goto err;
+ }
+
+ STAT_INC(env, mpool, freeze, hp->hash_frozen, bhp->pgno);
+
+ /*
+ * Add the frozen buffer to the version chain and update the hash
+ * bucket if this is the head revision. The original buffer will be
+ * freed by __memp_alloc calling __memp_bhfree (assuming no other
+ * thread has blocked waiting for it while we were freezing).
+ */
+ SH_CHAIN_INSERT_AFTER(bhp, frozen_bhp, vc, __bh);
+ if (!SH_CHAIN_HASNEXT(frozen_bhp, vc)) {
+ SH_TAILQ_INSERT_BEFORE(&hp->hash_bucket,
+ bhp, frozen_bhp, hq, __bh);
+ SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh);
+ }
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ h_locked = 0;
+
+ /*
+ * Increment the file's block count -- freeing the original buffer will
+ * decrement it.
+ */
+ MUTEX_LOCK(env, mfp->mutex);
+ ++mfp->block_cnt;
+ MUTEX_UNLOCK(env, mfp->mutex);
+
+ if (0) {
+err: if (fhp != NULL &&
+ (t_ret = __os_closehandle(env, fhp)) != 0 && ret == 0)
+ ret = t_ret;
+ if (created) {
+ DB_ASSERT(env, h_locked);
+ if ((t_ret = __os_unlink(env, real_name, 0)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ }
+ if (h_locked)
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ if (ret == 0)
+ ret = EIO;
+ if (frozen_bhp != NULL) {
+ MPOOL_REGION_LOCK(env, infop);
+ SH_TAILQ_INSERT_TAIL(&c_mp->free_frozen,
+ frozen_bhp, hq);
+ MPOOL_REGION_UNLOCK(env, infop);
+ }
+ }
+ if (real_name != NULL)
+ __os_free(env, real_name);
+ if (ret != 0 && ret != EBUSY && ret != ENOMEM)
+ __db_err(env, ret, "__memp_bh_freeze");
+
+ return (ret);
+}
+
+static int
+__pgno_cmp(a, b)
+ const void *a, *b;
+{
+ db_pgno_t *ap, *bp;
+
+ ap = (db_pgno_t *)a;
+ bp = (db_pgno_t *)b;
+
+ /*
+ * Compare explicitly: subtracting unsigned page numbers can wrap and
+ * give the wrong sign for large differences.
+ */
+ return (*ap < *bp ? -1 : (*ap > *bp ? 1 : 0));
+}
+
+/*
+ * __memp_bh_thaw --
+ * Free a buffer header in temporary storage. Optionally restore the
+ * buffer (if alloc_bhp != NULL). This function should be
+ * called with the hash bucket locked and will return with it unlocked.
+ *
+ * PUBLIC: int __memp_bh_thaw __P((DB_MPOOL *, REGINFO *,
+ * PUBLIC: DB_MPOOL_HASH *, BH *, BH *));
+ */
+int
+__memp_bh_thaw(dbmp, infop, hp, frozen_bhp, alloc_bhp)
+ DB_MPOOL *dbmp;
+ REGINFO *infop;
+ DB_MPOOL_HASH *hp;
+ BH *frozen_bhp, *alloc_bhp;
+{
+ DB_FH *fhp;
+ ENV *env;
+#ifdef DIAGNOSTIC
+ DB_LSN vlsn;
+#endif
+ MPOOL *c_mp;
+ MPOOLFILE *mfp;
+ db_mutex_t mutex;
+ db_pgno_t *freelist, *ppgno, freepgno, maxpgno, spgno;
+ size_t nio;
+ u_int32_t listsize, magic, nbucket, ncache, ntrunc, nfree, pagesize;
+#ifdef HAVE_FTRUNCATE
+ int i;
+#endif
+ int h_locked, needfree, ret, t_ret;
+ char filename[100], *real_name;
+
+ env = dbmp->env;
+ fhp = NULL;
+ c_mp = infop->primary;
+ mfp = R_ADDR(dbmp->reginfo, frozen_bhp->mf_offset);
+ freelist = NULL;
+ pagesize = mfp->pagesize;
+ ret = 0;
+ real_name = NULL;
+
+ MUTEX_REQUIRED(env, hp->mtx_hash);
+ DB_ASSERT(env, F_ISSET(frozen_bhp, BH_EXCLUSIVE) || alloc_bhp == NULL);
+ h_locked = 1;
+
+ DB_ASSERT(env, F_ISSET(frozen_bhp, BH_FROZEN) &&
+ !F_ISSET(frozen_bhp, BH_THAWED));
+ DB_ASSERT(env, alloc_bhp != NULL ||
+ SH_CHAIN_SINGLETON(frozen_bhp, vc) ||
+ (SH_CHAIN_HASNEXT(frozen_bhp, vc) &&
+ BH_OBSOLETE(frozen_bhp, hp->old_reader, vlsn)));
+ DB_ASSERT(env, alloc_bhp == NULL || !F_ISSET(alloc_bhp, BH_FROZEN));
+
+ spgno = ((BH_FROZEN_PAGE *)frozen_bhp)->spgno;
+
+ if (alloc_bhp != NULL) {
+ mutex = alloc_bhp->mtx_buf;
+#ifdef DIAG_MVCC
+ memcpy(alloc_bhp, frozen_bhp, SSZ(BH, align_off));
+#else
+ memcpy(alloc_bhp, frozen_bhp, SSZA(BH, buf));
+#endif
+ alloc_bhp->mtx_buf = mutex;
+ MUTEX_LOCK(env, alloc_bhp->mtx_buf);
+ atomic_init(&alloc_bhp->ref, 1);
+ F_CLR(alloc_bhp, BH_FROZEN);
+ }
+
+ /*
+ * For now, keep things simple and have one file per page size per
+ * hash bucket. This improves concurrency but can mean lots of files
+ * if there is lots of freezing.
+ */
+ ncache = (u_int32_t)(infop - dbmp->reginfo);
+ nbucket = (u_int32_t)(hp - (DB_MPOOL_HASH *)R_ADDR(infop, c_mp->htab));
+ snprintf(filename, sizeof(filename), "__db.freezer.%lu.%lu.%luK",
+ (u_long)ncache, (u_long)nbucket, (u_long)pagesize / 1024);
+
+ if ((ret = __db_appname(env,
+ DB_APP_NONE, filename, NULL, &real_name)) != 0)
+ goto err;
+ if ((ret = __os_open(env,
+ real_name, pagesize, 0, env->db_mode, &fhp)) != 0)
+ goto err;
+
+ /*
+ * Read the first free page number -- we're about to free the page
+ * after we read it.
+ */
+ if ((ret = __os_read(env, fhp, &magic, sizeof(u_int32_t), &nio)) != 0 ||
+ (ret =
+ __os_read(env, fhp, &freepgno, sizeof(db_pgno_t), &nio)) != 0 ||
+ (ret = __os_read(env, fhp, &maxpgno, sizeof(db_pgno_t), &nio)) != 0)
+ goto err;
+
+ if (magic != DB_FREEZER_MAGIC) {
+ ret = EINVAL;
+ goto err;
+ }
+
+ /* Read the buffer from the frozen page. */
+ if (alloc_bhp != NULL) {
+ DB_ASSERT(env, !F_ISSET(frozen_bhp, BH_FREED));
+ if ((ret = __os_io(env, DB_IO_READ, fhp,
+ spgno, pagesize, 0, pagesize, alloc_bhp->buf, &nio)) != 0)
+ goto err;
+ }
+
+ /*
+ * Free the page from the file. If it's the last page, truncate.
+ * Otherwise, update the free-page linked list.
+ */
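+	/*
+	 * Editor's example of the truncation scan below: with sorted free
+	 * pages {2, 3, 5, 6, 7} and maxpgno 7, the scan stops at 5 (since
+	 * 3 != 5 - 1), so ntrunc is 3: the file is truncated after page 4
+	 * and the remaining free list is {2, 3}.
+	 */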
+ needfree = 1;
+ if (spgno == maxpgno) {
+ listsize = 100;
+ if ((ret = __os_malloc(env,
+ listsize * sizeof(db_pgno_t), &freelist)) != 0)
+ goto err;
+ nfree = 0;
+ while (freepgno != 0) {
+ if (nfree == listsize - 1) {
+ listsize *= 2;
+ if ((ret = __os_realloc(env,
+ listsize * sizeof(db_pgno_t),
+ &freelist)) != 0)
+ goto err;
+ }
+ freelist[nfree++] = freepgno;
+ if ((ret = __os_seek(env, fhp,
+ freepgno, pagesize, 0)) != 0 ||
+ (ret = __os_read(env, fhp, &freepgno,
+ sizeof(db_pgno_t), &nio)) != 0)
+ goto err;
+ }
+ freelist[nfree++] = spgno;
+ qsort(freelist, nfree, sizeof(db_pgno_t), __pgno_cmp);
+ for (ppgno = &freelist[nfree - 1]; ppgno > freelist; ppgno--)
+ if (*(ppgno - 1) != *ppgno - 1)
+ break;
+ ntrunc = (u_int32_t)(&freelist[nfree] - ppgno);
+ if (ntrunc == (u_int32_t)maxpgno) {
+ needfree = 0;
+ ret = __os_closehandle(env, fhp);
+ fhp = NULL;
+ if (ret != 0 ||
+ (ret = __os_unlink(env, real_name, 0)) != 0)
+ goto err;
+ }
+#ifdef HAVE_FTRUNCATE
+ else {
+ maxpgno -= (db_pgno_t)ntrunc;
+ if ((ret = __os_truncate(env, fhp,
+ maxpgno + 1, pagesize)) != 0)
+ goto err;
+
+ /* Fix up the linked list */
+ freelist[nfree - ntrunc] = 0;
+ if ((ret = __os_seek(env, fhp,
+ 0, 0, sizeof(u_int32_t))) != 0 ||
+ (ret = __os_write(env, fhp, &freelist[0],
+ sizeof(db_pgno_t), &nio)) != 0 ||
+ (ret = __os_write(env, fhp, &maxpgno,
+ sizeof(db_pgno_t), &nio)) != 0)
+ goto err;
+
+ for (i = 0; i < (int)(nfree - ntrunc); i++)
+ if ((ret = __os_seek(env,
+ fhp, freelist[i], pagesize, 0)) != 0 ||
+ (ret = __os_write(env, fhp,
+ &freelist[i + 1], sizeof(db_pgno_t),
+ &nio)) != 0)
+ goto err;
+ needfree = 0;
+ }
+#endif
+ }
+ if (needfree) {
+ if ((ret = __os_seek(env, fhp, spgno, pagesize, 0)) != 0 ||
+ (ret = __os_write(env, fhp,
+ &freepgno, sizeof(db_pgno_t), &nio)) != 0 ||
+ (ret = __os_seek(env, fhp, 0, 0, sizeof(u_int32_t))) != 0 ||
+ (ret = __os_write(env, fhp,
+ &spgno, sizeof(db_pgno_t), &nio)) != 0)
+ goto err;
+
+ ret = __os_closehandle(env, fhp);
+ fhp = NULL;
+ if (ret != 0)
+ goto err;
+ }
+
+ /*
+ * Add the thawed buffer (if any) to the version chain. We can't
+ * do this any earlier, because we can't guarantee that another thread
+ * won't be waiting for it, which means we can't clean up if there are
+ * errors reading from the freezer. We can't do it any later, because
+ * we're about to free frozen_bhp, and without it we would need to do
+ * another cache lookup to find out where the new page should live.
+ */
+ MUTEX_REQUIRED(env, hp->mtx_hash);
+ if (alloc_bhp != NULL) {
+ alloc_bhp->priority = c_mp->lru_priority;
+
+ SH_CHAIN_INSERT_AFTER(frozen_bhp, alloc_bhp, vc, __bh);
+ if (!SH_CHAIN_HASNEXT(alloc_bhp, vc)) {
+ SH_TAILQ_INSERT_BEFORE(&hp->hash_bucket, frozen_bhp,
+ alloc_bhp, hq, __bh);
+ SH_TAILQ_REMOVE(&hp->hash_bucket, frozen_bhp, hq, __bh);
+ }
+ } else if (!SH_CHAIN_HASNEXT(frozen_bhp, vc)) {
+ if (SH_CHAIN_HASPREV(frozen_bhp, vc))
+ SH_TAILQ_INSERT_BEFORE(&hp->hash_bucket, frozen_bhp,
+ SH_CHAIN_PREV(frozen_bhp, vc, __bh), hq, __bh);
+ SH_TAILQ_REMOVE(&hp->hash_bucket, frozen_bhp, hq, __bh);
+ }
+ SH_CHAIN_REMOVE(frozen_bhp, vc, __bh);
+
+ if (alloc_bhp == NULL && frozen_bhp->td_off != INVALID_ROFF &&
+ (ret = __txn_remove_buffer(env,
+ BH_OWNER(env, frozen_bhp), MUTEX_INVALID)) != 0) {
+ (void)__env_panic(env, ret);
+ goto err;
+ }
+ frozen_bhp->td_off = INVALID_ROFF;
+
+ /*
+ * If other threads are waiting for this buffer as well, they will have
+ * incremented the reference count and will be waiting on the mutex.
+ * For that reason, we can't unconditionally free the memory here.
+ */
+ needfree = (atomic_dec(env, &frozen_bhp->ref) == 0);
+ if (!needfree)
+ F_SET(frozen_bhp, BH_THAWED);
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ if (F_ISSET(frozen_bhp, BH_EXCLUSIVE))
+ MUTEX_UNLOCK(env, frozen_bhp->mtx_buf);
+ h_locked = 0;
+ if (needfree) {
+ MPOOL_REGION_LOCK(env, infop);
+ SH_TAILQ_INSERT_TAIL(&c_mp->free_frozen, frozen_bhp, hq);
+ MPOOL_REGION_UNLOCK(env, infop);
+ }
+
+#ifdef HAVE_STATISTICS
+ if (alloc_bhp != NULL)
+ STAT_INC_VERB(env, mpool, thaw,
+ hp->hash_thawed, __memp_fns(dbmp, mfp), frozen_bhp->pgno);
+ else
+ STAT_INC_VERB(env, mpool, free_frozen, hp->hash_frozen_freed,
+ __memp_fns(dbmp, mfp), frozen_bhp->pgno);
+#endif
+
+ if (0) {
+err: if (h_locked)
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ if (ret == 0)
+ ret = EIO;
+ }
+ if (real_name != NULL)
+ __os_free(env, real_name);
+ if (freelist != NULL)
+ __os_free(env, freelist);
+ if (fhp != NULL &&
+ (t_ret = __os_closehandle(env, fhp)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ __db_err(env, ret, "__memp_bh_thaw");
+
+ return (ret);
+}
diff --git a/src/mp/mp_region.c b/src/mp/mp_region.c
new file mode 100644
index 00000000..07134de7
--- /dev/null
+++ b/src/mp/mp_region.c
@@ -0,0 +1,620 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/mp.h"
+
+static int __memp_init_config __P((ENV *, MPOOL *));
+static void __memp_region_size __P((ENV *, roff_t *, u_int32_t *));
+
+#define MPOOL_DEFAULT_PAGESIZE (4 * 1024)
+
+/*
+ * __memp_open --
+ * Internal version of memp_open: only called from ENV->open.
+ *
+ * PUBLIC: int __memp_open __P((ENV *, int));
+ */
+int
+__memp_open(env, create_ok)
+ ENV *env;
+ int create_ok;
+{
+ DB_ENV *dbenv;
+ DB_MPOOL *dbmp;
+ MPOOL *mp, *mp_i;
+ REGINFO reginfo;
+ roff_t cache_size, max_size, reg_size;
+ u_int i, max_nreg;
+ u_int32_t htab_buckets, *regids;
+ int ret;
+
+ dbenv = env->dbenv;
+ cache_size = 0;
+
+ /* Calculate the region size and hash bucket count. */
+ __memp_region_size(env, &max_size, &htab_buckets);
+
+ /* Create and initialize the DB_MPOOL structure. */
+ if ((ret = __os_calloc(env, 1, sizeof(*dbmp), &dbmp)) != 0)
+ return (ret);
+ LIST_INIT(&dbmp->dbregq);
+ TAILQ_INIT(&dbmp->dbmfq);
+ dbmp->env = env;
+
+ /* Join/create the first mpool region. */
+ memset(&reginfo, 0, sizeof(REGINFO));
+ reginfo.env = env;
+ reginfo.type = REGION_TYPE_MPOOL;
+ reginfo.id = INVALID_REGION_ID;
+ reginfo.flags = REGION_JOIN_OK;
+
+ /* Calculate the minimum allocation. */
+ reg_size = sizeof(MPOOL);
+ reg_size += MPOOL_FILE_BUCKETS * sizeof(DB_MPOOL_HASH);
+ reg_size += htab_buckets * sizeof(DB_MPOOL_HASH);
+ reg_size += (dbenv->mp_pagesize == 0 ?
+ MPOOL_DEFAULT_PAGESIZE : dbenv->mp_pagesize) * 10;
+ if (reg_size > max_size)
+ reg_size = max_size;
+
+ if (create_ok)
+ F_SET(&reginfo, REGION_CREATE_OK);
+ if ((ret = __env_region_attach(env, &reginfo, reg_size, max_size)) != 0)
+ goto err;
+ cache_size = reginfo.rp->max;
+ if (F_ISSET(env, ENV_PRIVATE))
+ reginfo.max_alloc = reginfo.rp->max;
+
+ /*
+ * If we created the region, initialize it. Create or join any
+ * additional regions.
+ */
+ if (F_ISSET(&reginfo, REGION_CREATE)) {
+ /*
+ * We define how many regions there are going to be, allocate
+ * the REGINFO structures and create them. Make sure we don't
+ * clear the wrong entries on error.
+ */
+ max_nreg = __memp_max_regions(env);
+ if ((ret = __os_calloc(env,
+ max_nreg, sizeof(REGINFO), &dbmp->reginfo)) != 0)
+ goto err;
+ /* Make sure we don't clear the wrong entries on error. */
+ dbmp->reginfo[0] = reginfo;
+ for (i = 1; i < max_nreg; ++i)
+ dbmp->reginfo[i].id = INVALID_REGION_ID;
+
+ /* Initialize the first region. */
+ if ((ret = __memp_init(env, dbmp,
+ 0, htab_buckets, max_nreg)) != 0)
+ goto err;
+
+ /*
+ * Create/initialize remaining regions and copy their IDs into
+ * the first region.
+ */
+ mp = R_ADDR(dbmp->reginfo, dbmp->reginfo[0].rp->primary);
+ regids = R_ADDR(dbmp->reginfo, mp->regids);
+ regids[0] = dbmp->reginfo[0].id;
+ for (i = 1; i < dbenv->mp_ncache; ++i) {
+ dbmp->reginfo[i].env = env;
+ dbmp->reginfo[i].type = REGION_TYPE_MPOOL;
+ dbmp->reginfo[i].id = INVALID_REGION_ID;
+ dbmp->reginfo[i].flags = REGION_CREATE_OK;
+ if ((ret = __env_region_attach(
+ env, &dbmp->reginfo[i], reg_size, max_size)) != 0)
+ goto err;
+ if (F_ISSET(env, ENV_PRIVATE))
+ dbmp->reginfo[i].max_alloc = max_size;
+ cache_size += dbmp->reginfo[i].rp->max;
+ if ((ret = __memp_init(env, dbmp,
+ i, htab_buckets, max_nreg)) != 0)
+ goto err;
+
+ regids[i] = dbmp->reginfo[i].id;
+ }
+ mp->gbytes = (u_int32_t) (cache_size / GIGABYTE);
+ mp->bytes = (u_int32_t) (cache_size % GIGABYTE);
+ } else {
+ /*
+ * Determine how many regions there are going to be, allocate
+ * the REGINFO structures and fill in local copies of that
+ * information.
+ */
+ mp = R_ADDR(&reginfo, reginfo.rp->primary);
+ dbenv->mp_ncache = mp->nreg;
+ if ((ret = __os_calloc(env,
+ mp->max_nreg, sizeof(REGINFO), &dbmp->reginfo)) != 0)
+ goto err;
+ /* Make sure we don't clear the wrong entries on error. */
+ for (i = 0; i < dbenv->mp_ncache; ++i)
+ dbmp->reginfo[i].id = INVALID_REGION_ID;
+ dbmp->reginfo[0] = reginfo;
+
+ /* Join remaining regions. */
+ regids = R_ADDR(dbmp->reginfo, mp->regids);
+ for (i = 1; i < dbenv->mp_ncache; ++i) {
+ dbmp->reginfo[i].env = env;
+ dbmp->reginfo[i].type = REGION_TYPE_MPOOL;
+ dbmp->reginfo[i].id = regids[i];
+ dbmp->reginfo[i].flags = REGION_JOIN_OK;
+ if ((ret = __env_region_attach(
+ env, &dbmp->reginfo[i], 0, 0)) != 0)
+ goto err;
+ }
+ }
+
+ /* Set the local addresses for the regions. */
+ for (i = 0; i < dbenv->mp_ncache; ++i) {
+ mp_i = dbmp->reginfo[i].primary =
+ R_ADDR(&dbmp->reginfo[i], dbmp->reginfo[i].rp->primary);
+ dbmp->reginfo[i].mtx_alloc = mp_i->mtx_region;
+ }
+
+ /* If the region is threaded, allocate a mutex to lock the handles. */
+ if ((ret = __mutex_alloc(env,
+ MTX_MPOOL_HANDLE, DB_MUTEX_PROCESS_ONLY, &dbmp->mutex)) != 0)
+ goto err;
+
+ env->mp_handle = dbmp;
+
+ /* A process joining the region may reset the mpool configuration. */
+ if ((ret = __memp_init_config(env, mp)) != 0)
+ return (ret);
+
+ return (0);
+
+err: env->mp_handle = NULL;
+ if (dbmp->reginfo != NULL && dbmp->reginfo[0].addr != NULL) {
+ for (i = 0; i < dbenv->mp_ncache; ++i)
+ if (dbmp->reginfo[i].id != INVALID_REGION_ID)
+ (void)__env_region_detach(
+ env, &dbmp->reginfo[i], 0);
+ __os_free(env, dbmp->reginfo);
+ }
+
+ (void)__mutex_free(env, &dbmp->mutex);
+ __os_free(env, dbmp);
+ return (ret);
+}
+
+/*
+ * __memp_init --
+ * Initialize a MPOOL structure in shared memory.
+ *
+ * PUBLIC: int __memp_init
+ * PUBLIC: __P((ENV *, DB_MPOOL *, u_int, u_int32_t, u_int));
+ */
+int
+__memp_init(env, dbmp, reginfo_off, htab_buckets, max_nreg)
+ ENV *env;
+ DB_MPOOL *dbmp;
+ u_int reginfo_off, max_nreg;
+ u_int32_t htab_buckets;
+{
+ BH *frozen_bhp;
+ BH_FROZEN_ALLOC *frozen;
+ DB_ENV *dbenv;
+ DB_MPOOL_HASH *htab, *hp;
+ MPOOL *mp, *main_mp;
+ REGINFO *infop;
+ db_mutex_t mtx_base, mtx_discard, mtx_prev;
+ u_int32_t i;
+ int ret;
+ void *p;
+
+ dbenv = env->dbenv;
+
+ infop = &dbmp->reginfo[reginfo_off];
+ if ((ret = __env_alloc(infop, sizeof(MPOOL), &infop->primary)) != 0)
+ goto mem_err;
+ infop->rp->primary = R_OFFSET(infop, infop->primary);
+ mp = infop->primary;
+ memset(mp, 0, sizeof(*mp));
+
+ if ((ret =
+ __mutex_alloc(env, MTX_MPOOL_REGION, 0, &mp->mtx_region)) != 0)
+ return (ret);
+
+ if (reginfo_off == 0) {
+ ZERO_LSN(mp->lsn);
+
+ mp->nreg = dbenv->mp_ncache;
+ mp->max_nreg = max_nreg;
+ if ((ret = __env_alloc(&dbmp->reginfo[0],
+ max_nreg * sizeof(u_int32_t), &p)) != 0)
+ goto mem_err;
+ mp->regids = R_OFFSET(dbmp->reginfo, p);
+ mp->nbuckets = dbenv->mp_ncache * htab_buckets;
+
+ /* Allocate file table space and initialize it. */
+ if ((ret = __env_alloc(infop,
+ MPOOL_FILE_BUCKETS * sizeof(DB_MPOOL_HASH), &htab)) != 0)
+ goto mem_err;
+ mp->ftab = R_OFFSET(infop, htab);
+ for (i = 0; i < MPOOL_FILE_BUCKETS; i++) {
+ if ((ret = __mutex_alloc(env,
+ MTX_MPOOL_FILE_BUCKET, 0, &htab[i].mtx_hash)) != 0)
+ return (ret);
+ SH_TAILQ_INIT(&htab[i].hash_bucket);
+ atomic_init(&htab[i].hash_page_dirty, 0);
+ }
+
+ /*
+ * Allocate all of the hash bucket mutexes up front. We do
+ * this so that we don't need to free and reallocate mutexes as
+ * the cache is resized.
+ */
+ mtx_base = mtx_prev = MUTEX_INVALID;
+ if (!MUTEX_ON(env) || F_ISSET(env, ENV_PRIVATE))
+ goto no_prealloc;
+ for (i = 0; i < mp->max_nreg * dbenv->mp_mtxcount; i++) {
+ if ((ret = __mutex_alloc(env, MTX_MPOOL_HASH_BUCKET,
+ DB_MUTEX_SHARED, &mtx_discard)) != 0)
+ return (ret);
+ if (i == 0)
+ mtx_base = mtx_discard;
+ else
+ DB_ASSERT(env, mtx_base == MUTEX_INVALID ||
+ mtx_discard == mtx_prev + 1);
+ mtx_prev = mtx_discard;
+ }
+ } else {
+ main_mp = dbmp->reginfo[0].primary;
+ htab = R_ADDR(&dbmp->reginfo[0], main_mp->htab);
+ mtx_base = htab[0].mtx_hash;
+ }
+
+ /*
+ * We preallocated all of the mutexes in a block, so for regions after
+ * the first, we skip mutexes in use in earlier regions. Each region
+ * has the same number of buckets.
+ */
+no_prealloc:
+ if (MUTEX_ON(env))
+ mtx_base += reginfo_off * dbenv->mp_mtxcount;
+
+ /* Allocate hash table space and initialize it. */
+ if ((ret = __env_alloc(infop,
+ htab_buckets * sizeof(DB_MPOOL_HASH), &htab)) != 0)
+ goto mem_err;
+ mp->htab = R_OFFSET(infop, htab);
+ for (i = 0; i < htab_buckets; i++) {
+ hp = &htab[i];
+ if (!MUTEX_ON(env) || dbenv->mp_mtxcount == 0)
+ hp->mtx_hash = MUTEX_INVALID;
+ else if (F_ISSET(env, ENV_PRIVATE)) {
+ if (i >= dbenv->mp_mtxcount)
+ hp->mtx_hash =
+ htab[i % dbenv->mp_mtxcount].mtx_hash;
+ else if ((ret = __mutex_alloc(env, MTX_MPOOL_HASH_BUCKET,
+ DB_MUTEX_SHARED, &hp->mtx_hash)) != 0)
+ return (ret);
+ } else
+ hp->mtx_hash = mtx_base + (i % dbenv->mp_mtxcount);
+ SH_TAILQ_INIT(&hp->hash_bucket);
+ atomic_init(&hp->hash_page_dirty, 0);
+#ifdef HAVE_STATISTICS
+ hp->hash_io_wait = 0;
+ hp->hash_frozen = hp->hash_thawed = hp->hash_frozen_freed = 0;
+#endif
+ hp->flags = 0;
+ ZERO_LSN(hp->old_reader);
+ }
+ mp->htab_buckets = htab_buckets;
+ mp->htab_mutexes = dbenv->mp_mtxcount;
+ mp->pagesize = dbenv->mp_pagesize == 0 ?
+ MPOOL_DEFAULT_PAGESIZE : dbenv->mp_pagesize;
+
+ SH_TAILQ_INIT(&mp->free_frozen);
+ SH_TAILQ_INIT(&mp->alloc_frozen);
+
+ /*
+ * Pre-allocate one frozen buffer header. This avoids situations where
+ * the cache becomes full of pages and we don't even have the 28 bytes
+ * (or so) available to allocate a frozen buffer header.
+ */
+ if ((ret = __env_alloc(infop,
+ sizeof(BH_FROZEN_ALLOC) + sizeof(BH_FROZEN_PAGE), &frozen)) != 0)
+ goto mem_err;
+ SH_TAILQ_INSERT_TAIL(&mp->alloc_frozen, frozen, links);
+ frozen_bhp = (BH *)(frozen + 1);
+ frozen_bhp->mtx_buf = MUTEX_INVALID;
+ SH_TAILQ_INSERT_TAIL(&mp->free_frozen, frozen_bhp, hq);
+
+ /*
+ * Only the environment creator knows the total cache size, so fill
+ * in those fields now.
+ */
+ mp->gbytes = dbenv->mp_gbytes;
+ mp->bytes = dbenv->mp_bytes;
+ infop->mtx_alloc = mp->mtx_region;
+ return (0);
+
+mem_err:__db_errx(env, DB_STR("3026",
+ "Unable to allocate memory for mpool region"));
+ return (ret);
+}
+
+/*
+ * PUBLIC: u_int32_t __memp_max_regions __P((ENV *));
+ */
+u_int32_t
+__memp_max_regions(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+ roff_t reg_size, max_size;
+ size_t max_nreg;
+
+ dbenv = env->dbenv;
+
+ if (dbenv->mp_max_gbytes == 0 && dbenv->mp_max_bytes == 0)
+ return (dbenv->mp_ncache);
+ __memp_region_size(env, &reg_size, NULL);
+ max_size =
+ (roff_t)dbenv->mp_max_gbytes * GIGABYTE + dbenv->mp_max_bytes;
+ max_nreg = (max_size + reg_size / 2) / reg_size;
+
+ /* Sanity check that the number of regions fits in 32 bits. */
+ DB_ASSERT(env, max_nreg == (u_int32_t)max_nreg);
+
+ if (max_nreg <= dbenv->mp_ncache)
+ max_nreg = dbenv->mp_ncache;
+ return ((u_int32_t)max_nreg);
+}
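+
+/*
+ * Editor's example of the rounding above: with a 500MB region size and a
+ * 2GB cache maximum, max_nreg = (2GB + 250MB) / 500MB = 4 regions.
+ */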
+
+/*
+ * __memp_region_size --
+ * Size the region and figure out how many hash buckets we'll have.
+ */
+static void
+__memp_region_size(env, reg_sizep, htab_bucketsp)
+ ENV *env;
+ roff_t *reg_sizep;
+ u_int32_t *htab_bucketsp;
+{
+ DB_ENV *dbenv;
+ roff_t reg_size, cache_size;
+ u_int32_t pgsize;
+
+ dbenv = env->dbenv;
+
+ /*
+ * Figure out how big each cache region is. Cast an operand to roff_t
+ * so we do 64-bit arithmetic as appropriate.
+ */
+ cache_size = (roff_t)dbenv->mp_gbytes * GIGABYTE + dbenv->mp_bytes;
+ reg_size = cache_size / dbenv->mp_ncache;
+ if (reg_sizep != NULL)
+ *reg_sizep = reg_size;
+
+ /*
+ * Figure out how many hash buckets each region will have.  Assume we
+ * want to keep each hash chain under 3 pages long.  We don't know the
+ * pagesize in advance, and it may differ for different files.  Use a
+ * pagesize of 4K for the calculation -- we walk these chains a lot, so
+ * they must be kept short.  We use 2.5 as this maintains compatibility
+ * with previous releases.
+ *
+ * XXX
+ * Cache sizes larger than 10TB would cause 32-bit wrapping in the
+ * calculation of the number of hash buckets. This probably isn't
+ * something we need to worry about right now, but is checked when the
+ * cache size is set.
+ */
+ if (htab_bucketsp != NULL) {
+ if (dbenv->mp_tablesize != 0)
+ *htab_bucketsp = __db_tablesize(dbenv->mp_tablesize);
+ else {
+ if ((pgsize = dbenv->mp_pagesize) == 0)
+ pgsize = MPOOL_DEFAULT_PAGESIZE;
+ *htab_bucketsp = __db_tablesize(
+ (u_int32_t)(reg_size / (2.5 * pgsize)));
+ }
+ }
+}
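+
+/*
+ * Editor's example: a single 32MB region with the default 4K pagesize gives
+ * (u_int32_t)(32MB / (2.5 * 4096)) = 3276 chains, which __db_tablesize then
+ * rounds for the hash table (to a nearby power of two, on this sketch's
+ * assumption).
+ */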
+
+/*
+ * __memp_region_mutex_count --
+ * Return the number of mutexes the mpool region will need.
+ *
+ * PUBLIC: u_int32_t __memp_region_mutex_count __P((ENV *));
+ */
+u_int32_t
+__memp_region_mutex_count(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+ u_int32_t htab_buckets;
+ roff_t reg_size;
+ u_int32_t max_region, num_per_cache, pgsize;
+
+ dbenv = env->dbenv;
+
+ __memp_region_size(env, &reg_size, &htab_buckets);
+ /* MVCC's frozen buffer headers are the smallest per-buffer allocation. */
+ if (F_ISSET(env->dbenv, DB_ENV_MULTIVERSION))
+ pgsize = sizeof(BH_FROZEN_ALLOC) + sizeof(BH_FROZEN_PAGE);
+ else if ((pgsize = dbenv->mp_pagesize) == 0)
+ pgsize = MPOOL_DEFAULT_PAGESIZE;
+ max_region = __memp_max_regions(env);
+
+ /*
+ * We need a couple of mutexes for the region itself, one for each
+ * file handle (MPOOLFILE) the application allocates, one for each
+ * of the MPOOL_FILE_BUCKETS, and each cache has one mutex per
+ * hash bucket. We then need one mutex per page in the cache,
+ * the worst case is really big if the pages are 512 bytes.
+ */
+ if (dbenv->mp_mtxcount != 0)
+ htab_buckets = dbenv->mp_mtxcount;
+ else
+ dbenv->mp_mtxcount = htab_buckets;
+ num_per_cache = htab_buckets + (u_int32_t)(reg_size / pgsize);
+ return ((max_region * num_per_cache) + 50 + MPOOL_FILE_BUCKETS);
+}
+
+/*
+ * __memp_init_config --
+ * Initialize shared configuration information.
+ */
+static int
+__memp_init_config(env, mp)
+ ENV *env;
+ MPOOL *mp;
+{
+ DB_ENV *dbenv;
+
+ dbenv = env->dbenv;
+
+ MPOOL_SYSTEM_LOCK(env);
+ if (dbenv->mp_mmapsize != 0)
+ mp->mp_mmapsize = (db_size_t)dbenv->mp_mmapsize;
+ if (dbenv->mp_maxopenfd != 0)
+ mp->mp_maxopenfd = dbenv->mp_maxopenfd;
+ if (dbenv->mp_maxwrite != 0)
+ mp->mp_maxwrite = dbenv->mp_maxwrite;
+ if (dbenv->mp_maxwrite_sleep != 0)
+ mp->mp_maxwrite_sleep = dbenv->mp_maxwrite_sleep;
+ MPOOL_SYSTEM_UNLOCK(env);
+
+ return (0);
+}
+
+/*
+ * __memp_env_refresh --
+ * Clean up after the mpool system on a close or failed open.
+ *
+ * PUBLIC: int __memp_env_refresh __P((ENV *));
+ */
+int
+__memp_env_refresh(env)
+ ENV *env;
+{
+ BH *bhp;
+ BH_FROZEN_ALLOC *frozen_alloc;
+ DB_MPOOL *dbmp;
+ DB_MPOOLFILE *dbmfp;
+ DB_MPOOL_HASH *hp;
+ DB_MPREG *mpreg;
+ MPOOL *mp, *c_mp;
+ REGINFO *infop;
+ u_int32_t bucket, i, nreg;
+ int ret, t_ret;
+
+ ret = 0;
+ dbmp = env->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+ nreg = mp->nreg;
+ hp = R_ADDR(&dbmp->reginfo[0], mp->htab);
+
+ /*
+ * If a private region, return the memory to the heap. Not needed for
+ * filesystem-backed or system shared memory regions, that memory isn't
+ * owned by any particular process.
+ */
+ if (!F_ISSET(env, ENV_PRIVATE))
+ goto not_priv;
+
+ /* Discard buffers. */
+ for (i = 0; i < nreg; ++i) {
+ infop = &dbmp->reginfo[i];
+ c_mp = infop->primary;
+ for (hp = R_ADDR(infop, c_mp->htab), bucket = 0;
+ bucket < c_mp->htab_buckets; ++hp, ++bucket) {
+ while ((bhp = SH_TAILQ_FIRST(
+ &hp->hash_bucket, __bh)) != NULL)
+ if (F_ISSET(bhp, BH_FROZEN))
+ SH_TAILQ_REMOVE(
+ &hp->hash_bucket, bhp,
+ hq, __bh);
+ else {
+ if (F_ISSET(bhp, BH_DIRTY)) {
+ atomic_dec(env,
+ &hp->hash_page_dirty);
+ F_CLR(bhp,
+ BH_DIRTY | BH_DIRTY_CREATE);
+ }
+ atomic_inc(env, &bhp->ref);
+ if ((t_ret = __memp_bhfree(dbmp, infop,
+ R_ADDR(dbmp->reginfo,
+ bhp->mf_offset), hp, bhp,
+ BH_FREE_FREEMEM |
+ BH_FREE_UNLOCKED)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ }
+ MPOOL_REGION_LOCK(env, infop);
+ while ((frozen_alloc = SH_TAILQ_FIRST(
+ &c_mp->alloc_frozen, __bh_frozen_a)) != NULL) {
+ SH_TAILQ_REMOVE(&c_mp->alloc_frozen, frozen_alloc,
+ links, __bh_frozen_a);
+ __env_alloc_free(infop, frozen_alloc);
+ }
+ MPOOL_REGION_UNLOCK(env, infop);
+ }
+
+not_priv:
+ /* Discard DB_MPOOLFILEs. */
+ while ((dbmfp = TAILQ_FIRST(&dbmp->dbmfq)) != NULL)
+ if ((t_ret = __memp_fclose(dbmfp, DB_FLUSH)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Discard DB_MPREGs. */
+ if (dbmp->pg_inout != NULL)
+ __os_free(env, dbmp->pg_inout);
+ while ((mpreg = LIST_FIRST(&dbmp->dbregq)) != NULL) {
+ LIST_REMOVE(mpreg, q);
+ __os_free(env, mpreg);
+ }
+
+ /* Discard the DB_MPOOL thread mutex. */
+ if ((t_ret = __mutex_free(env, &dbmp->mutex)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (F_ISSET(env, ENV_PRIVATE)) {
+ /* Discard REGION IDs. */
+ infop = &dbmp->reginfo[0];
+ infop->mtx_alloc = MUTEX_INVALID;
+ __memp_free(infop, R_ADDR(infop, mp->regids));
+
+ /* Discard all the MPOOLFILEs. */
+ if ((t_ret = __memp_discard_all_mpfs(env, mp)) != 0 && ret == 0)
+ ret = t_ret;
+ /* Discard the File table. */
+ __memp_free(infop, R_ADDR(infop, mp->ftab));
+
+ /* Discard Hash tables. */
+ for (i = 0; i < nreg; ++i) {
+ infop = &dbmp->reginfo[i];
+ c_mp = infop->primary;
+ infop->mtx_alloc = MUTEX_INVALID;
+ __memp_free(infop, R_ADDR(infop, c_mp->htab));
+ }
+ }
+
+ /* Detach from the region. */
+ for (i = 0; i < nreg; ++i) {
+ infop = &dbmp->reginfo[i];
+ if ((t_ret =
+ __env_region_detach(env, infop, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+
+ /* Discard DB_MPOOL. */
+ __os_free(env, dbmp->reginfo);
+ __os_free(env, dbmp);
+
+ env->mp_handle = NULL;
+ return (ret);
+}
diff --git a/src/mp/mp_register.c b/src/mp/mp_register.c
new file mode 100644
index 00000000..dc7015a7
--- /dev/null
+++ b/src/mp/mp_register.c
@@ -0,0 +1,116 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+
+/*
+ * memp_register_pp --
+ * ENV->memp_register pre/post processing.
+ *
+ * PUBLIC: int __memp_register_pp __P((DB_ENV *, int,
+ * PUBLIC: int (*)(DB_ENV *, db_pgno_t, void *, DBT *),
+ * PUBLIC: int (*)(DB_ENV *, db_pgno_t, void *, DBT *)));
+ */
+int
+__memp_register_pp(dbenv, ftype, pgin, pgout)
+ DB_ENV *dbenv;
+ int ftype;
+ int (*pgin) __P((DB_ENV *, db_pgno_t, void *, DBT *));
+ int (*pgout) __P((DB_ENV *, db_pgno_t, void *, DBT *));
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->mp_handle, "DB_ENV->memp_register", DB_INIT_MPOOL);
+
+ if (REP_ON(env)) {
+ __db_errx(env, DB_STR_A("3001",
+ "%smethod not permitted when replication is configured",
+ "%s"), "DB_ENV->memp_register: ");
+ return (EINVAL);
+ }
+
+ ENV_ENTER(env, ip);
+ ret = __memp_register(env, ftype, pgin, pgout);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * memp_register --
+ * ENV->memp_register.
+ *
+ * PUBLIC: int __memp_register __P((ENV *, int,
+ * PUBLIC: int (*)(DB_ENV *, db_pgno_t, void *, DBT *),
+ * PUBLIC: int (*)(DB_ENV *, db_pgno_t, void *, DBT *)));
+ */
+int
+__memp_register(env, ftype, pgin, pgout)
+ ENV *env;
+ int ftype;
+ int (*pgin) __P((DB_ENV *, db_pgno_t, void *, DBT *));
+ int (*pgout) __P((DB_ENV *, db_pgno_t, void *, DBT *));
+{
+ DB_MPOOL *dbmp;
+ DB_MPREG *mpreg;
+ int ret;
+
+ dbmp = env->mp_handle;
+
+ /*
+ * We keep the DB pgin/pgout functions outside of the linked list
+ * to avoid locking/unlocking the linked list on every page I/O.
+ *
+ * The Berkeley DB I/O conversion functions are registered when the
+ * environment is first created, so there's no need for locking here.
+ */
+ if (ftype == DB_FTYPE_SET) {
+ if (dbmp->pg_inout != NULL)
+ return (0);
+ if ((ret =
+ __os_malloc(env, sizeof(DB_MPREG), &dbmp->pg_inout)) != 0)
+ return (ret);
+ dbmp->pg_inout->ftype = ftype;
+ dbmp->pg_inout->pgin = pgin;
+ dbmp->pg_inout->pgout = pgout;
+ return (0);
+ }
+
+ /*
+ * The item may already have been registered.  If so, just update the
+ * entry, although it's probably unchanged.
+ */
+ MUTEX_LOCK(env, dbmp->mutex);
+ LIST_FOREACH(mpreg, &dbmp->dbregq, q)
+ if (mpreg->ftype == ftype) {
+ mpreg->pgin = pgin;
+ mpreg->pgout = pgout;
+ break;
+ }
+
+ if (mpreg == NULL) { /* New entry. */
+ if ((ret = __os_malloc(env, sizeof(DB_MPREG), &mpreg)) != 0) {
+ MUTEX_UNLOCK(env, dbmp->mutex);
+ return (ret);
+ }
+ mpreg->ftype = ftype;
+ mpreg->pgin = pgin;
+ mpreg->pgout = pgout;
+
+ LIST_INSERT_HEAD(&dbmp->dbregq, mpreg, q);
+ }
+ MUTEX_UNLOCK(env, dbmp->mutex);
+
+ return (0);
+}
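+
+/*
+ * Editor's sketch (not part of the library): registering byte-swap
+ * conversion callbacks through the public DB_ENV->memp_register method.
+ * The ftype value and the callback bodies are illustrative placeholders.
+ */
+#if 0
+static int
+example_pgin(dbenv, pgno, pgaddr, pgcookie)
+	DB_ENV *dbenv;
+	db_pgno_t pgno;
+	void *pgaddr;
+	DBT *pgcookie;
+{
+	/* Convert the page at pgaddr from on-disk to in-memory byte order. */
+	return (0);
+}
+
+static int
+example_pgout(dbenv, pgno, pgaddr, pgcookie)
+	DB_ENV *dbenv;
+	db_pgno_t pgno;
+	void *pgaddr;
+	DBT *pgcookie;
+{
+	/* Convert the page at pgaddr back to on-disk byte order. */
+	return (0);
+}
+
+static int
+example_register(dbenv)
+	DB_ENV *dbenv;
+{
+	/* 42 is an arbitrary application-chosen file type code. */
+	return (dbenv->memp_register(dbenv, 42, example_pgin, example_pgout));
+}
+#endif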
diff --git a/src/mp/mp_resize.c b/src/mp/mp_resize.c
new file mode 100644
index 00000000..97719554
--- /dev/null
+++ b/src/mp/mp_resize.c
@@ -0,0 +1,605 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2006, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+static int __memp_add_bucket __P((DB_MPOOL *));
+static int __memp_add_region __P((DB_MPOOL *));
+static int __memp_map_regions __P((DB_MPOOL *));
+static int __memp_merge_buckets
+ __P((DB_MPOOL *, u_int32_t, u_int32_t, u_int32_t));
+static int __memp_remove_bucket __P((DB_MPOOL *));
+static int __memp_remove_region __P((DB_MPOOL *));
+
+/*
+ * PUBLIC: int __memp_get_bucket __P((ENV *, MPOOLFILE *,
+ * PUBLIC: db_pgno_t, REGINFO **, DB_MPOOL_HASH **, u_int32_t *));
+ */
+int
+__memp_get_bucket(env, mfp, pgno, infopp, hpp, bucketp)
+ ENV *env;
+ MPOOLFILE *mfp;
+ db_pgno_t pgno;
+ REGINFO **infopp;
+ DB_MPOOL_HASH **hpp;
+ u_int32_t *bucketp;
+{
+ DB_MPOOL *dbmp;
+ DB_MPOOL_HASH *hp;
+ MPOOL *c_mp, *mp;
+ REGINFO *infop;
+ roff_t mf_offset;
+ u_int32_t bucket, nbuckets, new_bucket, new_nbuckets, region;
+ u_int32_t *regids;
+ int ret;
+
+ dbmp = env->mp_handle;
+ mf_offset = R_OFFSET(dbmp->reginfo, mfp);
+ mp = dbmp->reginfo[0].primary;
+ ret = 0;
+
+ for (;;) {
+ nbuckets = mp->nbuckets;
+ MP_BUCKET(mf_offset, pgno, nbuckets, bucket);
+
+ /*
+ * Once we work out which region we are looking in, we have to
+ * check that we have that region mapped, and that the version
+ * we have matches the ID in the main mpool region. Otherwise
+ * we have to go and map in any regions that don't match and
+ * retry.
+ */
+ region = NREGION(mp, bucket);
+ regids = R_ADDR(dbmp->reginfo, mp->regids);
+
+ for (;;) {
+ infop = *infopp = &dbmp->reginfo[region];
+ c_mp = infop->primary;
+
+ /* If we have the correct region mapped, we're done. */
+ if (c_mp != NULL && regids[region] == infop->id)
+ break;
+ if ((ret = __memp_map_regions(dbmp)) != 0)
+ return (ret);
+ }
+
+ /* If our caller wants the hash bucket, lock it here. */
+ if (hpp != NULL) {
+ hp = R_ADDR(infop, c_mp->htab);
+ hp = &hp[bucket - region * mp->htab_buckets];
+
+ MUTEX_READLOCK(env, hp->mtx_hash);
+
+ /*
+ * Check that we still have the correct region mapped.
+ */
+ if (regids[region] != infop->id) {
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ continue;
+ }
+
+ /*
+ * Now that the bucket is locked, we need to check that
+ * the cache has not been resized while we waited.
+ */
+ new_nbuckets = mp->nbuckets;
+ if (nbuckets != new_nbuckets) {
+ MP_BUCKET(mf_offset, pgno, new_nbuckets,
+ new_bucket);
+
+ if (new_bucket != bucket) {
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ continue;
+ }
+ }
+
+ *hpp = hp;
+ }
+
+ break;
+ }
+
+ if (bucketp != NULL)
+ *bucketp = bucket - region * mp->htab_buckets;
+ return (ret);
+}
+
+static int
+__memp_merge_buckets(dbmp, new_nbuckets, old_bucket, new_bucket)
+ DB_MPOOL *dbmp;
+ u_int32_t new_nbuckets, old_bucket, new_bucket;
+{
+ BH *alloc_bhp, *bhp, *current_bhp, *new_bhp, *next_bhp;
+ DB_LSN vlsn;
+ DB_MPOOL_HASH *new_hp, *old_hp;
+ ENV *env;
+ MPOOL *mp, *new_mp, *old_mp;
+ MPOOLFILE *mfp;
+ REGINFO *new_infop, *old_infop;
+ u_int32_t bucket, high_mask, new_region, old_region;
+ int ret;
+
+ env = dbmp->env;
+ mp = dbmp->reginfo[0].primary;
+ new_bhp = NULL;
+ ret = 0;
+
+ MP_MASK(new_nbuckets, high_mask);
+
+ old_region = NREGION(mp, old_bucket);
+ old_infop = &dbmp->reginfo[old_region];
+ old_mp = old_infop->primary;
+ old_hp = R_ADDR(old_infop, old_mp->htab);
+ old_hp = &old_hp[old_bucket - old_region * mp->htab_buckets];
+
+ new_region = NREGION(mp, new_bucket);
+ new_infop = &dbmp->reginfo[new_region];
+ new_mp = new_infop->primary;
+ new_hp = R_ADDR(new_infop, new_mp->htab);
+ new_hp = &new_hp[new_bucket - new_region * mp->htab_buckets];
+
+ /*
+ * Before merging, we need to check that there are no old buffers left
+ * in the target hash bucket after a previous split.
+ */
+free_old:
+ MUTEX_LOCK(env, new_hp->mtx_hash);
+ SH_TAILQ_FOREACH(bhp, &new_hp->hash_bucket, hq, __bh) {
+ MP_BUCKET(bhp->mf_offset, bhp->pgno, mp->nbuckets, bucket);
+
+ if (bucket != new_bucket) {
+ /*
+ * There is no way that an old buffer can be locked
+ * after a split, since everyone will look for it in
+ * the new hash bucket.
+ */
+ DB_ASSERT(env, !F_ISSET(bhp, BH_DIRTY) &&
+ atomic_read(&bhp->ref) == 0);
+ atomic_inc(env, &bhp->ref);
+ mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
+ if ((ret = __memp_bhfree(dbmp, new_infop,
+ mfp, new_hp, bhp, BH_FREE_FREEMEM)) != 0) {
+ MUTEX_UNLOCK(env, new_hp->mtx_hash);
+ return (ret);
+ }
+
+ /*
+ * The free has modified the list of buffers and
+ * dropped the mutex. We need to start again.
+ */
+ goto free_old;
+ }
+ }
+ MUTEX_UNLOCK(env, new_hp->mtx_hash);
+
+ /*
+ * Before we begin, make sure that all of the buffers we care about are
+ * not in use and not frozen. We do this because we can't drop the old
+ * hash bucket mutex once we start moving buffers around.
+ */
+retry: MUTEX_LOCK(env, old_hp->mtx_hash);
+ SH_TAILQ_FOREACH(bhp, &old_hp->hash_bucket, hq, __bh) {
+ MP_HASH_BUCKET(MP_HASH(bhp->mf_offset, bhp->pgno),
+ new_nbuckets, high_mask, bucket);
+
+ if (bucket == new_bucket && atomic_read(&bhp->ref) != 0) {
+ MUTEX_UNLOCK(env, old_hp->mtx_hash);
+ __os_yield(env, 0, 0);
+ goto retry;
+ } else if (bucket == new_bucket && F_ISSET(bhp, BH_FROZEN)) {
+ atomic_inc(env, &bhp->ref);
+ /*
+ * We need to drop the hash bucket mutex to avoid
+ * self-blocking when we allocate a new buffer.
+ */
+ MUTEX_UNLOCK(env, old_hp->mtx_hash);
+ MUTEX_LOCK(env, bhp->mtx_buf);
+ F_SET(bhp, BH_EXCLUSIVE);
+ if (BH_OBSOLETE(bhp, old_hp->old_reader, vlsn))
+ alloc_bhp = NULL;
+ else {
+ mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
+ if ((ret = __memp_alloc(dbmp,
+ old_infop, mfp, 0, NULL, &alloc_bhp)) != 0)
+ goto err;
+ }
+ /*
+ * But we need to lock the hash bucket again before
+ * thawing the buffer. The call to __memp_bh_thaw
+ * will unlock the hash bucket mutex.
+ */
+ MUTEX_LOCK(env, old_hp->mtx_hash);
+ if (F_ISSET(bhp, BH_THAWED)) {
+ ret = __memp_bhfree(dbmp, old_infop, NULL, NULL,
+ alloc_bhp,
+ BH_FREE_FREEMEM | BH_FREE_UNLOCKED);
+ } else
+ ret = __memp_bh_thaw(dbmp,
+ old_infop, old_hp, bhp, alloc_bhp);
+
+ /*
+ * We've dropped the mutex in order to thaw, so we need
+ * to go back to the beginning and check that all of
+ * the buffers we care about are still unlocked and
+ * unreferenced.
+ */
+err: atomic_dec(env, &bhp->ref);
+ F_CLR(bhp, BH_EXCLUSIVE);
+ MUTEX_UNLOCK(env, bhp->mtx_buf);
+ if (ret != 0)
+ return (ret);
+ goto retry;
+ }
+ }
+
+ /*
+ * We now know that all of the buffers we care about are unlocked and
+ * unreferenced. Go ahead and copy them.
+ */
+ SH_TAILQ_FOREACH(bhp, &old_hp->hash_bucket, hq, __bh) {
+ MP_HASH_BUCKET(MP_HASH(bhp->mf_offset, bhp->pgno),
+ new_nbuckets, high_mask, bucket);
+ mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
+
+ /*
+ * We ignore buffers that don't hash to the new bucket. We
+ * could also ignore clean buffers which are not part of a
+ * multiversion chain as long as they have a backing file.
+ */
+ if (bucket != new_bucket || (!F_ISSET(bhp, BH_DIRTY) &&
+ SH_CHAIN_SINGLETON(bhp, vc) && !mfp->no_backing_file))
+ continue;
+
+ for (current_bhp = bhp, next_bhp = NULL;
+ current_bhp != NULL;
+ current_bhp = SH_CHAIN_PREV(current_bhp, vc, __bh),
+ next_bhp = alloc_bhp) {
+ /* Allocate in the new region. */
+ if ((ret = __memp_alloc(dbmp,
+ new_infop, mfp, 0, NULL, &alloc_bhp)) != 0)
+ break;
+
+ alloc_bhp->ref = current_bhp->ref;
+ alloc_bhp->priority = current_bhp->priority;
+ alloc_bhp->pgno = current_bhp->pgno;
+ alloc_bhp->mf_offset = current_bhp->mf_offset;
+ alloc_bhp->flags = current_bhp->flags;
+ alloc_bhp->td_off = current_bhp->td_off;
+
+ /*
+ * We've duplicated the buffer, so now we need to
+ * update reference counts, including the counts in the
+ * per-MPOOLFILE and the transaction detail (for MVCC
+ * buffers).
+ */
+ MUTEX_LOCK(env, mfp->mutex);
+ ++mfp->block_cnt;
+ MUTEX_UNLOCK(env, mfp->mutex);
+
+ if (alloc_bhp->td_off != INVALID_ROFF &&
+ (ret = __txn_add_buffer(env,
+ R_ADDR(&env->tx_handle->reginfo,
+ alloc_bhp->td_off))) != 0)
+ break;
+
+ memcpy(alloc_bhp->buf, bhp->buf, mfp->pagesize);
+
+ /*
+ * We build up the MVCC chain first, then insert the
+ * head (stored in new_bhp) once.
+ */
+ if (next_bhp == NULL) {
+ SH_CHAIN_INIT(alloc_bhp, vc);
+ new_bhp = alloc_bhp;
+ } else
+ SH_CHAIN_INSERT_BEFORE(
+ next_bhp, alloc_bhp, vc, __bh);
+ }
+
+ DB_ASSERT(env, new_hp->mtx_hash != old_hp->mtx_hash);
+ MUTEX_LOCK(env, new_hp->mtx_hash);
+ SH_TAILQ_INSERT_TAIL(&new_hp->hash_bucket, new_bhp, hq);
+ if (F_ISSET(new_bhp, BH_DIRTY))
+ atomic_inc(env, &new_hp->hash_page_dirty);
+
+ if (F_ISSET(bhp, BH_DIRTY)) {
+ F_CLR(bhp, BH_DIRTY);
+ atomic_dec(env, &old_hp->hash_page_dirty);
+ }
+ MUTEX_UNLOCK(env, new_hp->mtx_hash);
+ }
+
+ if (ret == 0)
+ mp->nbuckets = new_nbuckets;
+ MUTEX_UNLOCK(env, old_hp->mtx_hash);
+
+ return (ret);
+}
+
+static int
+__memp_add_bucket(dbmp)
+ DB_MPOOL *dbmp;
+{
+ ENV *env;
+ MPOOL *mp;
+ u_int32_t high_mask, new_bucket, old_bucket;
+
+ env = dbmp->env;
+ mp = dbmp->reginfo[0].primary;
+
+ new_bucket = mp->nbuckets;
+ /* We should always be adding buckets to the last region. */
+ DB_ASSERT(env, NREGION(mp, new_bucket) == mp->nreg - 1);
+ MP_MASK(mp->nbuckets, high_mask);
+ old_bucket = new_bucket & (high_mask >> 1);
+
+ /*
+ * With fixed-sized regions, the new region is always smaller than the
+ * existing total cache size, so buffers always need to be copied. If
+ * we implement variable region sizes, it's possible that we will be
+ * splitting a hash bucket in the new region. Catch that here.
+ */
+ DB_ASSERT(env, NREGION(mp, old_bucket) != NREGION(mp, new_bucket));
+
+ return (__memp_merge_buckets(dbmp, mp->nbuckets + 1,
+ old_bucket, new_bucket));
+}
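+
+/*
+ * Editor's example: growing from 8 buckets to 9 adds bucket 8.  Assuming
+ * MP_MASK yields the next power-of-two-minus-one (15 here), old_bucket =
+ * 8 & (15 >> 1) = 0, so bucket 0's buffers are redistributed between
+ * buckets 0 and 8 -- a classic linear-hashing split.
+ */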
+
+static int
+__memp_add_region(dbmp)
+ DB_MPOOL *dbmp;
+{
+ ENV *env;
+ MPOOL *mp;
+ REGINFO *infop;
+ int ret;
+ roff_t cache_size, reg_size;
+ u_int i;
+ u_int32_t *regids;
+
+ env = dbmp->env;
+ mp = dbmp->reginfo[0].primary;
+ cache_size = (roff_t)mp->gbytes * GIGABYTE + mp->bytes;
+
+ /* All cache regions are the same size. */
+ reg_size = dbmp->reginfo[0].rp->size;
+ ret = 0;
+
+ infop = &dbmp->reginfo[mp->nreg];
+ infop->env = env;
+ infop->type = REGION_TYPE_MPOOL;
+ infop->id = INVALID_REGION_ID;
+ infop->flags = REGION_CREATE_OK;
+ if ((ret = __env_region_attach(env, infop, reg_size, reg_size)) != 0)
+ return (ret);
+ if ((ret = __memp_init(env,
+ dbmp, mp->nreg, mp->htab_buckets, mp->max_nreg)) != 0)
+ return (ret);
+ cache_size += reg_size;
+ mp->gbytes = (u_int32_t)(cache_size / GIGABYTE);
+ mp->bytes = (u_int32_t)(cache_size % GIGABYTE);
+ regids = R_ADDR(dbmp->reginfo, mp->regids);
+ regids[mp->nreg++] = infop->id;
+
+ for (i = 0; i < mp->htab_buckets; i++)
+ if ((ret = __memp_add_bucket(dbmp)) != 0)
+ break;
+
+ return (ret);
+}
+
+static int
+__memp_remove_bucket(dbmp)
+ DB_MPOOL *dbmp;
+{
+ ENV *env;
+ MPOOL *mp;
+ u_int32_t high_mask, new_bucket, old_bucket;
+
+ env = dbmp->env;
+ mp = dbmp->reginfo[0].primary;
+
+ old_bucket = mp->nbuckets - 1;
+
+ /* We should always be removing buckets from the last region. */
+ DB_ASSERT(env, NREGION(mp, old_bucket) == mp->nreg - 1);
+ MP_MASK(mp->nbuckets - 1, high_mask);
+ new_bucket = old_bucket & (high_mask >> 1);
+
+ return (__memp_merge_buckets(dbmp, mp->nbuckets - 1,
+ old_bucket, new_bucket));
+}
+
+static int
+__memp_remove_region(dbmp)
+ DB_MPOOL *dbmp;
+{
+ DB_MPOOL_HASH *hp;
+ ENV *env;
+ MPOOL *mp;
+ REGINFO *infop;
+ int ret;
+ roff_t cache_size, reg_size;
+ u_int i;
+
+ env = dbmp->env;
+ mp = dbmp->reginfo[0].primary;
+ reg_size = dbmp->reginfo[0].rp->size;
+ cache_size = (roff_t)mp->gbytes * GIGABYTE + mp->bytes;
+ ret = 0;
+
+ if (mp->nreg == 1) {
+ __db_errx(env, DB_STR("3019",
+ "cannot remove the last cache"));
+ return (EINVAL);
+ }
+
+ for (i = 0; i < mp->htab_buckets; i++)
+ if ((ret = __memp_remove_bucket(dbmp)) != 0)
+ return (ret);
+
+ /* Detach from the region then destroy it. */
+ infop = &dbmp->reginfo[mp->nreg];
+ if (F_ISSET(env, ENV_PRIVATE)) {
+ hp = R_ADDR(infop, ((MPOOL*)infop->primary)->htab);
+ for (i = 0; i < env->dbenv->mp_mtxcount; i++)
+ if ((ret = __mutex_free(env, &hp[i].mtx_hash)) != 0)
+ return (ret);
+ }
+
+ ret = __env_region_detach(env, infop, 1);
+ if (ret == 0) {
+ mp->nreg--;
+ cache_size -= reg_size;
+ mp->gbytes = (u_int32_t)(cache_size / GIGABYTE);
+ mp->bytes = (u_int32_t)(cache_size % GIGABYTE);
+ }
+
+ return (ret);
+}
+
+static int
+__memp_map_regions(dbmp)
+ DB_MPOOL *dbmp;
+{
+ ENV *env;
+ MPOOL *mp;
+ int ret;
+ u_int i;
+ u_int32_t *regids;
+
+ env = dbmp->env;
+ mp = dbmp->reginfo[0].primary;
+ regids = R_ADDR(dbmp->reginfo, mp->regids);
+ ret = 0;
+
+ for (i = 1; i < mp->nreg; ++i) {
+ if (dbmp->reginfo[i].primary != NULL &&
+ dbmp->reginfo[i].id == regids[i])
+ continue;
+
+ if (dbmp->reginfo[i].primary != NULL)
+ ret = __env_region_detach(env, &dbmp->reginfo[i], 0);
+
+ dbmp->reginfo[i].env = env;
+ dbmp->reginfo[i].type = REGION_TYPE_MPOOL;
+ dbmp->reginfo[i].id = regids[i];
+ dbmp->reginfo[i].flags = REGION_JOIN_OK;
+ if ((ret =
+ __env_region_attach(env, &dbmp->reginfo[i], 0, 0)) != 0)
+ return (ret);
+ dbmp->reginfo[i].primary = R_ADDR(&dbmp->reginfo[i],
+ dbmp->reginfo[i].rp->primary);
+ }
+
+ for (; i < mp->max_nreg; i++)
+ if (dbmp->reginfo[i].primary != NULL &&
+ (ret = __env_region_detach(env,
+ &dbmp->reginfo[i], 0)) != 0)
+ break;
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __memp_resize __P((DB_MPOOL *, u_int32_t, u_int32_t));
+ */
+int
+__memp_resize(dbmp, gbytes, bytes)
+ DB_MPOOL *dbmp;
+ u_int32_t gbytes, bytes;
+{
+ ENV *env;
+ MPOOL *mp;
+ int ret;
+ u_int32_t ncache;
+ roff_t reg_size, total_size;
+
+ env = dbmp->env;
+ mp = dbmp->reginfo[0].primary;
+ reg_size = dbmp->reginfo[0].rp->size;
+ total_size = (roff_t)gbytes * GIGABYTE + bytes;
+ ncache = (u_int32_t)((total_size + reg_size / 2) / reg_size);
+
+ if (ncache < 1)
+ ncache = 1;
+ else if (ncache > mp->max_nreg) {
+ __db_errx(env, DB_STR_A("3020",
+ "cannot resize to %lu cache regions: maximum is %lu",
+ "%lu %lu"), (u_long)ncache, (u_long)mp->max_nreg);
+ return (EINVAL);
+ }
+
+ ret = 0;
+ MUTEX_LOCK(env, mp->mtx_resize);
+ while (mp->nreg != ncache)
+ if ((ret = (mp->nreg < ncache ?
+ __memp_add_region(dbmp) :
+ __memp_remove_region(dbmp))) != 0)
+ break;
+ MUTEX_UNLOCK(env, mp->mtx_resize);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __memp_get_cache_max __P((DB_ENV *, u_int32_t *, u_int32_t *));
+ */
+int
+__memp_get_cache_max(dbenv, max_gbytesp, max_bytesp)
+ DB_ENV *dbenv;
+ u_int32_t *max_gbytesp, *max_bytesp;
+{
+ DB_MPOOL *dbmp;
+ ENV *env;
+ MPOOL *mp;
+ roff_t reg_size, max_size;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->mp_handle, "DB_ENV->get_mp_max_ncache", DB_INIT_MPOOL);
+
+ if (MPOOL_ON(env)) {
+ /* Cannot be set after open, no lock required to read. */
+ dbmp = env->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+ reg_size = dbmp->reginfo[0].rp->size;
+ max_size = mp->max_nreg * reg_size;
+ *max_gbytesp = (u_int32_t)(max_size / GIGABYTE);
+ *max_bytesp = (u_int32_t)(max_size % GIGABYTE);
+ } else {
+ *max_gbytesp = dbenv->mp_max_gbytes;
+ *max_bytesp = dbenv->mp_max_bytes;
+ }
+
+ return (0);
+}
+
+/*
+ * PUBLIC: int __memp_set_cache_max __P((DB_ENV *, u_int32_t, u_int32_t));
+ */
+int
+__memp_set_cache_max(dbenv, max_gbytes, max_bytes)
+ DB_ENV *dbenv;
+ u_int32_t max_gbytes, max_bytes;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_cache_max");
+ dbenv->mp_max_gbytes = max_gbytes;
+ dbenv->mp_max_bytes = max_bytes;
+
+ return (0);
+}
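+
+/*
+ * Editor's sketch (not part of the library): an application that wants an
+ * online-resizable cache sets a starting size and a maximum before opening
+ * the environment, then grows it through DB_ENV->set_cachesize.  The "home"
+ * variable and the surrounding error handling are assumptions.
+ */
+#if 0
+	DB_ENV *dbenv;
+	int ret;
+
+	if ((ret = db_env_create(&dbenv, 0)) != 0)
+		return (ret);
+	/* Start at 32MB in one region; allow growth up to 1GB. */
+	if ((ret = dbenv->set_cachesize(dbenv, 0, 32 * 1024 * 1024, 1)) != 0 ||
+	    (ret = dbenv->set_cache_max(dbenv, 1, 0)) != 0 ||
+	    (ret = dbenv->open(dbenv, home, DB_CREATE | DB_INIT_MPOOL, 0)) != 0)
+		return (ret);
+	/* Later: grow to 64MB -- mpool adds regions via __memp_resize. */
+	if ((ret = dbenv->set_cachesize(dbenv, 0, 64 * 1024 * 1024, 0)) != 0)
+		return (ret);
+#endif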
diff --git a/src/mp/mp_stat.c b/src/mp/mp_stat.c
new file mode 100644
index 00000000..246b44d7
--- /dev/null
+++ b/src/mp/mp_stat.c
@@ -0,0 +1,905 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+#ifdef HAVE_STATISTICS
+static void __memp_print_bh __P((ENV *,
+ DB_MPOOL *, const char *, BH *, roff_t *));
+static int __memp_print_all __P((ENV *, u_int32_t));
+static int __memp_print_stats __P((ENV *, u_int32_t));
+static int __memp_print_hash __P((ENV *,
+ DB_MPOOL *, REGINFO *, roff_t *, u_int32_t));
+static int __memp_stat __P((ENV *,
+ DB_MPOOL_STAT **, DB_MPOOL_FSTAT ***, u_int32_t));
+static void __memp_stat_wait
+ __P((ENV *, REGINFO *, MPOOL *, DB_MPOOL_STAT *, u_int32_t));
+static int __memp_file_stats __P((ENV *,
+ MPOOLFILE *, void *, u_int32_t *, u_int32_t));
+static int __memp_count_files __P((ENV *,
+ MPOOLFILE *, void *, u_int32_t *, u_int32_t));
+static int __memp_get_files __P((ENV *,
+ MPOOLFILE *, void *, u_int32_t *, u_int32_t));
+static int __memp_print_files __P((ENV *,
+ MPOOLFILE *, void *, u_int32_t *, u_int32_t));
+
+/*
+ * __memp_stat_pp --
+ * DB_ENV->memp_stat pre/post processing.
+ *
+ * PUBLIC: int __memp_stat_pp
+ * PUBLIC: __P((DB_ENV *, DB_MPOOL_STAT **, DB_MPOOL_FSTAT ***, u_int32_t));
+ */
+int
+__memp_stat_pp(dbenv, gspp, fspp, flags)
+ DB_ENV *dbenv;
+ DB_MPOOL_STAT **gspp;
+ DB_MPOOL_FSTAT ***fspp;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->mp_handle, "DB_ENV->memp_stat", DB_INIT_MPOOL);
+
+ if ((ret = __db_fchk(env,
+ "DB_ENV->memp_stat", flags, DB_STAT_CLEAR)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__memp_stat(env, gspp, fspp, flags)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __memp_stat --
+ * ENV->memp_stat
+ */
+static int
+__memp_stat(env, gspp, fspp, flags)
+ ENV *env;
+ DB_MPOOL_STAT **gspp;
+ DB_MPOOL_FSTAT ***fspp;
+ u_int32_t flags;
+{
+ DB_MPOOL *dbmp;
+ DB_MPOOL_FSTAT **tfsp;
+ DB_MPOOL_STAT *sp;
+ MPOOL *c_mp, *mp;
+ size_t len;
+ int ret;
+ u_int32_t i;
+ uintmax_t tmp_wait, tmp_nowait;
+
+ dbmp = env->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+
+ /* Global statistics. */
+ if (gspp != NULL) {
+ *gspp = NULL;
+
+ if ((ret = __os_umalloc(env, sizeof(**gspp), gspp)) != 0)
+ return (ret);
+ memset(*gspp, 0, sizeof(**gspp));
+ sp = *gspp;
+
+ /*
+ * Initialization and information that is not maintained on
+ * a per-cache basis. Note that configuration information
+ * may be modified at any time, and so we have to lock.
+ */
+ sp->st_gbytes = mp->gbytes;
+ sp->st_bytes = mp->bytes;
+ sp->st_pagesize = mp->pagesize;
+ sp->st_ncache = mp->nreg;
+ sp->st_max_ncache = mp->max_nreg;
+ sp->st_regsize = dbmp->reginfo[0].rp->size;
+ sp->st_regmax = dbmp->reginfo[0].rp->max;
+ sp->st_sync_interrupted = mp->stat.st_sync_interrupted;
+
+ MPOOL_SYSTEM_LOCK(env);
+ sp->st_mmapsize = mp->mp_mmapsize;
+ sp->st_maxopenfd = mp->mp_maxopenfd;
+ sp->st_maxwrite = mp->mp_maxwrite;
+ sp->st_maxwrite_sleep = mp->mp_maxwrite_sleep;
+ MPOOL_SYSTEM_UNLOCK(env);
+
+ /* Walk the cache list and accumulate the global information. */
+ for (i = 0; i < mp->nreg; ++i) {
+ c_mp = dbmp->reginfo[i].primary;
+
+ sp->st_map += c_mp->stat.st_map;
+ sp->st_cache_hit += c_mp->stat.st_cache_hit;
+ sp->st_cache_miss += c_mp->stat.st_cache_miss;
+ sp->st_page_create += c_mp->stat.st_page_create;
+ sp->st_page_in += c_mp->stat.st_page_in;
+ sp->st_page_out += c_mp->stat.st_page_out;
+ sp->st_ro_evict += c_mp->stat.st_ro_evict;
+ sp->st_rw_evict += c_mp->stat.st_rw_evict;
+ sp->st_page_trickle += c_mp->stat.st_page_trickle;
+ sp->st_pages += c_mp->pages;
+ /*
+ * st_page_dirty calculated by __memp_stat_hash
+ * st_page_clean calculated here
+ */
+ __memp_stat_hash(
+ &dbmp->reginfo[i], c_mp, &sp->st_page_dirty);
+ sp->st_page_clean = sp->st_pages - sp->st_page_dirty;
+ sp->st_hash_buckets += c_mp->htab_buckets;
+ sp->st_hash_mutexes += c_mp->htab_mutexes;
+ sp->st_hash_searches += c_mp->stat.st_hash_searches;
+ sp->st_hash_longest += c_mp->stat.st_hash_longest;
+ sp->st_hash_examined += c_mp->stat.st_hash_examined;
+ /*
+ * st_hash_nowait and st_hash_wait are
+ * calculated by __memp_stat_wait
+ */
+ __memp_stat_wait(
+ env, &dbmp->reginfo[i], c_mp, sp, flags);
+ __mutex_set_wait_info(env,
+ c_mp->mtx_region, &tmp_wait, &tmp_nowait);
+ sp->st_region_nowait += tmp_nowait;
+ sp->st_region_wait += tmp_wait;
+ sp->st_alloc += c_mp->stat.st_alloc;
+ sp->st_alloc_buckets += c_mp->stat.st_alloc_buckets;
+ if (sp->st_alloc_max_buckets <
+ c_mp->stat.st_alloc_max_buckets)
+ sp->st_alloc_max_buckets =
+ c_mp->stat.st_alloc_max_buckets;
+ sp->st_alloc_pages += c_mp->stat.st_alloc_pages;
+ if (sp->st_alloc_max_pages <
+ c_mp->stat.st_alloc_max_pages)
+ sp->st_alloc_max_pages =
+ c_mp->stat.st_alloc_max_pages;
+
+ if (LF_ISSET(DB_STAT_CLEAR)) {
+ if (!LF_ISSET(DB_STAT_SUBSYSTEM))
+ __mutex_clear(env, c_mp->mtx_region);
+
+ memset(&c_mp->stat, 0, sizeof(c_mp->stat));
+ }
+ }
+
+ /*
+ * We have duplicate statistics fields in per-file structures
+ * and the cache. The counters are only incremented in the
+ * per-file structures, except if a file is flushed from the
+ * mpool, at which time we copy its information into the cache
+ * statistics. We added the cache information above, now we
+ * add the per-file information.
+ */
+ if ((ret = __memp_walk_files(env, mp, __memp_file_stats,
+ sp, NULL, fspp == NULL ? LF_ISSET(DB_STAT_CLEAR) : 0)) != 0)
+ return (ret);
+ }
+
+ /* Per-file statistics. */
+ if (fspp != NULL) {
+ *fspp = NULL;
+
+ /* Count the MPOOLFILE structures. */
+ i = 0;
+ len = 0;
+ if ((ret = __memp_walk_files(env,
+ mp, __memp_count_files, &len, &i, flags)) != 0)
+ return (ret);
+
+ if (i == 0)
+ return (0);
+ len += sizeof(DB_MPOOL_FSTAT *); /* Trailing NULL */
+
+ /* Allocate space */
+ if ((ret = __os_umalloc(env, len, fspp)) != 0)
+ return (ret);
+
+ tfsp = *fspp;
+ *tfsp = NULL;
+
+ /*
+ * Files may have been opened since we counted; don't walk
+ * off the end of the allocated space.
+ */
+ if ((ret = __memp_walk_files(env,
+ mp, __memp_get_files, &tfsp, &i, flags)) != 0)
+ return (ret);
+
+ *++tfsp = NULL;
+ }
+
+ return (0);
+}
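+
+/*
+ * Illustrative sketch (not part of this build): a caller of the documented
+ * DB_ENV->memp_stat method. The returned structures are allocated on the
+ * caller's behalf; assuming no application allocator was configured with
+ * DB_ENV->set_alloc, they are released with free().
+ *
+ *	DB_MPOOL_STAT *gsp;
+ *	DB_MPOOL_FSTAT **fsp, **p;
+ *
+ *	if (dbenv->memp_stat(dbenv, &gsp, &fsp, 0) == 0) {
+ *		printf("cache hits: %lu\n", (u_long)gsp->st_cache_hit);
+ *		for (p = fsp; p != NULL && *p != NULL; ++p)
+ *			printf("%s: %lu pages read\n",
+ *			    (*p)->file_name, (u_long)(*p)->st_page_in);
+ *		free(gsp);
+ *		free(fsp);
+ *	}
+ */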
+
+static int
+__memp_file_stats(env, mfp, argp, countp, flags)
+ ENV *env;
+ MPOOLFILE *mfp;
+ void *argp;
+ u_int32_t *countp;
+ u_int32_t flags;
+{
+ DB_MPOOL_STAT *sp;
+
+ COMPQUIET(env, NULL);
+ COMPQUIET(countp, NULL);
+
+ sp = argp;
+
+ sp->st_map += mfp->stat.st_map;
+ sp->st_cache_hit += mfp->stat.st_cache_hit;
+ sp->st_cache_miss += mfp->stat.st_cache_miss;
+ sp->st_page_create += mfp->stat.st_page_create;
+ sp->st_page_in += mfp->stat.st_page_in;
+ sp->st_page_out += mfp->stat.st_page_out;
+ if (LF_ISSET(DB_STAT_CLEAR))
+ memset(&mfp->stat, 0, sizeof(mfp->stat));
+
+ return (0);
+}
+
+static int
+__memp_count_files(env, mfp, argp, countp, flags)
+ ENV *env;
+ MPOOLFILE *mfp;
+ void *argp;
+ u_int32_t *countp;
+ u_int32_t flags;
+{
+ DB_MPOOL *dbmp;
+ size_t len;
+
+ COMPQUIET(flags, 0);
+ dbmp = env->mp_handle;
+ len = *(size_t *)argp;
+
+ (*countp)++;
+ len += sizeof(DB_MPOOL_FSTAT *) +
+ sizeof(DB_MPOOL_FSTAT) + strlen(__memp_fns(dbmp, mfp)) + 1;
+
+ *(size_t *)argp = len;
+ return (0);
+}
+
+/*
+ * __memp_get_files --
+ * get file specific statistics
+ *
+ * Build each individual entry. We assume that an array of pointers is
+ * aligned correctly to be followed by an array of structures, which should
+ * be safe (in this particular case, the first element of the structure
+ * is a pointer, so we're doubly safe). The array is followed by space
+ * for the text file names.
+ */
+static int
+__memp_get_files(env, mfp, argp, countp, flags)
+ ENV *env;
+ MPOOLFILE *mfp;
+ void *argp;
+ u_int32_t *countp;
+ u_int32_t flags;
+{
+ DB_MPOOL *dbmp;
+ DB_MPOOL_FSTAT **tfsp, *tstruct;
+ char *name, *tname;
+ size_t nlen;
+
+ if (*countp == 0)
+ return (0);
+
+ dbmp = env->mp_handle;
+ tfsp = *(DB_MPOOL_FSTAT ***)argp;
+
+ if (*tfsp == NULL) {
+ /* Add 1 to count because we need to skip over the NULL. */
+ tstruct = (DB_MPOOL_FSTAT *)(tfsp + *countp + 1);
+ tname = (char *)(tstruct + *countp);
+ *tfsp = tstruct;
+ } else {
+ tstruct = *tfsp + 1;
+ tname = (*tfsp)->file_name + strlen((*tfsp)->file_name) + 1;
+ *++tfsp = tstruct;
+ }
+
+ name = __memp_fns(dbmp, mfp);
+ nlen = strlen(name) + 1;
+ memcpy(tname, name, nlen);
+ memcpy(tstruct, &mfp->stat, sizeof(mfp->stat));
+ tstruct->file_name = tname;
+
+ /* Grab the pagesize from the mfp. */
+ tstruct->st_pagesize = mfp->pagesize;
+
+ *(DB_MPOOL_FSTAT ***)argp = tfsp;
+ (*countp)--;
+
+ if (LF_ISSET(DB_STAT_CLEAR))
+ memset(&mfp->stat, 0, sizeof(mfp->stat));
+
+ return (0);
+}
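+
+/*
+ * The single allocation filled in above, sketched for two files (sizes
+ * illustrative, not to scale):
+ *
+ *	*fspp --> [ptr 0][ptr 1][NULL][FSTAT 0][FSTAT 1]["name0\0name1\0"]
+ *
+ * Each leading pointer addresses the corresponding DB_MPOOL_FSTAT, and
+ * each structure's file_name field points into the trailing name space.
+ */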
+
+/*
+ * __memp_stat_print_pp --
+ * ENV->memp_stat_print pre/post processing.
+ *
+ * PUBLIC: int __memp_stat_print_pp __P((DB_ENV *, u_int32_t));
+ */
+int
+__memp_stat_print_pp(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->mp_handle, "DB_ENV->memp_stat_print", DB_INIT_MPOOL);
+
+#define DB_STAT_MEMP_FLAGS \
+ (DB_STAT_ALL | DB_STAT_ALLOC | DB_STAT_CLEAR | DB_STAT_MEMP_HASH)
+ if ((ret = __db_fchk(env,
+ "DB_ENV->memp_stat_print", flags, DB_STAT_MEMP_FLAGS)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__memp_stat_print(env, flags)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
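+
+/*
+ * Note: this path backs the documented DB_ENV->memp_stat_print method,
+ * which is also how the db_stat utility's -m option displays the cache,
+ * e.g. (the home directory is hypothetical):
+ *
+ *	db_stat -h /env/home -m
+ */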
+
+#define FMAP_ENTRIES 200 /* Files we map. */
+
+/*
+ * __memp_stat_print --
+ * ENV->memp_stat_print method.
+ *
+ * PUBLIC: int __memp_stat_print __P((ENV *, u_int32_t));
+ */
+int
+__memp_stat_print(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ u_int32_t orig_flags;
+ int ret;
+
+ orig_flags = flags;
+ LF_CLR(DB_STAT_CLEAR | DB_STAT_SUBSYSTEM);
+ if (flags == 0 || LF_ISSET(DB_STAT_ALL)) {
+ ret = __memp_print_stats(env,
+ LF_ISSET(DB_STAT_ALL) ? flags : orig_flags);
+ if (flags == 0 || ret != 0)
+ return (ret);
+ }
+
+ if (LF_ISSET(DB_STAT_ALL | DB_STAT_MEMP_HASH) &&
+ (ret = __memp_print_all(env, orig_flags)) != 0)
+ return (ret);
+
+ return (0);
+}
+
+/*
+ * __memp_print_stats --
+ * Display default mpool region statistics.
+ */
+static int
+__memp_print_stats(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ DB_MPOOL_FSTAT **fsp, **tfsp;
+ DB_MPOOL_STAT *gsp;
+ int ret;
+
+ if ((ret = __memp_stat(env, &gsp, &fsp, flags)) != 0)
+ return (ret);
+
+ if (LF_ISSET(DB_STAT_ALL))
+ __db_msg(env, "Default cache region information:");
+ __db_dlbytes(env, "Total cache size",
+ (u_long)gsp->st_gbytes, (u_long)0, (u_long)gsp->st_bytes);
+ __db_dl(env, "Number of caches", (u_long)gsp->st_ncache);
+ __db_dl(env, "Maximum number of caches", (u_long)gsp->st_max_ncache);
+ __db_dlbytes(env, "Pool individual cache size",
+ (u_long)0, (u_long)0, (u_long)gsp->st_regsize);
+ __db_dlbytes(env, "Pool individual cache max",
+ (u_long)0, (u_long)0, (u_long)gsp->st_regmax);
+ __db_dlbytes(env, "Maximum memory-mapped file size",
+ (u_long)0, (u_long)0, (u_long)gsp->st_mmapsize);
+ STAT_LONG("Maximum open file descriptors", gsp->st_maxopenfd);
+ STAT_LONG("Maximum sequential buffer writes", gsp->st_maxwrite);
+ STAT_LONG("Sleep after writing maximum sequential buffers",
+ gsp->st_maxwrite_sleep);
+ __db_dl(env,
+ "Requested pages mapped into the process' address space",
+ (u_long)gsp->st_map);
+ __db_dl_pct(env, "Requested pages found in the cache",
+ (u_long)gsp->st_cache_hit, DB_PCT(
+ gsp->st_cache_hit, gsp->st_cache_hit + gsp->st_cache_miss), NULL);
+ __db_dl(env, "Requested pages not found in the cache",
+ (u_long)gsp->st_cache_miss);
+ __db_dl(env,
+ "Pages created in the cache", (u_long)gsp->st_page_create);
+ __db_dl(env, "Pages read into the cache", (u_long)gsp->st_page_in);
+ __db_dl(env, "Pages written from the cache to the backing file",
+ (u_long)gsp->st_page_out);
+ __db_dl(env, "Clean pages forced from the cache",
+ (u_long)gsp->st_ro_evict);
+ __db_dl(env, "Dirty pages forced from the cache",
+ (u_long)gsp->st_rw_evict);
+ __db_dl(env, "Dirty pages written by trickle-sync thread",
+ (u_long)gsp->st_page_trickle);
+ __db_dl(env, "Current total page count",
+ (u_long)gsp->st_pages);
+ __db_dl(env, "Current clean page count",
+ (u_long)gsp->st_page_clean);
+ __db_dl(env, "Current dirty page count",
+ (u_long)gsp->st_page_dirty);
+ __db_dl(env, "Number of hash buckets used for page location",
+ (u_long)gsp->st_hash_buckets);
+ __db_dl(env, "Number of mutexes for the hash buckets",
+ (u_long)gsp->st_hash_mutexes);
+ __db_dl(env, "Assumed page size used",
+ (u_long)gsp->st_pagesize);
+ __db_dl(env,
+ "Total number of times hash chains searched for a page",
+ (u_long)gsp->st_hash_searches);
+ __db_dl(env, "The longest hash chain searched for a page",
+ (u_long)gsp->st_hash_longest);
+ __db_dl(env,
+ "Total number of hash chain entries checked for page",
+ (u_long)gsp->st_hash_examined);
+ __db_dl_pct(env,
+ "The number of hash bucket locks that required waiting",
+ (u_long)gsp->st_hash_wait, DB_PCT(
+ gsp->st_hash_wait, gsp->st_hash_wait + gsp->st_hash_nowait), NULL);
+ __db_dl_pct(env,
+ "The maximum number of times any hash bucket lock was waited for",
+ (u_long)gsp->st_hash_max_wait, DB_PCT(gsp->st_hash_max_wait,
+ gsp->st_hash_max_wait + gsp->st_hash_max_nowait), NULL);
+ __db_dl_pct(env,
+ "The number of region locks that required waiting",
+ (u_long)gsp->st_region_wait, DB_PCT(gsp->st_region_wait,
+ gsp->st_region_wait + gsp->st_region_nowait), NULL);
+ __db_dl(env, "The number of buffers frozen",
+ (u_long)gsp->st_mvcc_frozen);
+ __db_dl(env, "The number of buffers thawed",
+ (u_long)gsp->st_mvcc_thawed);
+ __db_dl(env, "The number of frozen buffers freed",
+ (u_long)gsp->st_mvcc_freed);
+ __db_dl(env, "The number of page allocations", (u_long)gsp->st_alloc);
+ __db_dl(env,
+ "The number of hash buckets examined during allocations",
+ (u_long)gsp->st_alloc_buckets);
+ __db_dl(env,
+ "The maximum number of hash buckets examined for an allocation",
+ (u_long)gsp->st_alloc_max_buckets);
+ __db_dl(env, "The number of pages examined during allocations",
+ (u_long)gsp->st_alloc_pages);
+ __db_dl(env, "The max number of pages examined for an allocation",
+ (u_long)gsp->st_alloc_max_pages);
+ __db_dl(env, "Threads waited on page I/O", (u_long)gsp->st_io_wait);
+ __db_dl(env, "The number of times a sync is interrupted",
+ (u_long)gsp->st_sync_interrupted);
+
+ for (tfsp = fsp; fsp != NULL && *tfsp != NULL; ++tfsp) {
+ if (LF_ISSET(DB_STAT_ALL))
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "Pool File: %s", (*tfsp)->file_name);
+ __db_dl(env, "Page size", (u_long)(*tfsp)->st_pagesize);
+ __db_dl(env,
+ "Requested pages mapped into the process' address space",
+ (u_long)(*tfsp)->st_map);
+ __db_dl_pct(env, "Requested pages found in the cache",
+ (u_long)(*tfsp)->st_cache_hit, DB_PCT((*tfsp)->st_cache_hit,
+ (*tfsp)->st_cache_hit + (*tfsp)->st_cache_miss), NULL);
+ __db_dl(env, "Requested pages not found in the cache",
+ (u_long)(*tfsp)->st_cache_miss);
+ __db_dl(env, "Pages created in the cache",
+ (u_long)(*tfsp)->st_page_create);
+ __db_dl(env, "Pages read into the cache",
+ (u_long)(*tfsp)->st_page_in);
+ __db_dl(env,
+ "Pages written from the cache to the backing file",
+ (u_long)(*tfsp)->st_page_out);
+ if ((*tfsp)->st_backup_spins != 0)
+ __db_dl(env,
+ "Spins while trying to backup the file",
+ (u_long)(*tfsp)->st_backup_spins);
+ }
+
+ __os_ufree(env, fsp);
+ __os_ufree(env, gsp);
+ return (0);
+}
+
+/*
+ * __memp_print_all --
+ * Display debugging mpool region statistics.
+ */
+static int
+__memp_print_all(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ static const FN cfn[] = {
+ { DB_MPOOL_NOFILE, "DB_MPOOL_NOFILE" },
+ { DB_MPOOL_UNLINK, "DB_MPOOL_UNLINK" },
+ { 0, NULL }
+ };
+ DB_MPOOL *dbmp;
+ DB_MPOOLFILE *dbmfp;
+ MPOOL *mp;
+ roff_t fmap[FMAP_ENTRIES + 1];
+ u_int32_t i, cnt;
+ int ret;
+
+ dbmp = env->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+ ret = 0;
+
+ MPOOL_SYSTEM_LOCK(env);
+
+ __db_print_reginfo(env, dbmp->reginfo, "Mpool", flags);
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+
+ __db_msg(env, "MPOOL structure:");
+ __mutex_print_debug_single(
+ env, "MPOOL region mutex", mp->mtx_region, flags);
+ STAT_LSN("Maximum checkpoint LSN", &mp->lsn);
+ STAT_ULONG("Hash table entries", mp->htab_buckets);
+ STAT_ULONG("Hash table mutexes", mp->htab_mutexes);
+
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "DB_MPOOL handle information:");
+ __mutex_print_debug_single(
+ env, "DB_MPOOL handle mutex", dbmp->mutex, flags);
+ STAT_ULONG("Underlying cache regions", mp->nreg);
+
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "DB_MPOOLFILE structures:");
+ for (cnt = 0, dbmfp = TAILQ_FIRST(&dbmp->dbmfq);
+ dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q), ++cnt) {
+ __db_msg(env, "File #%lu: %s: per-process, %s",
+ (u_long)cnt + 1, __memp_fn(dbmfp),
+ F_ISSET(dbmfp, MP_READONLY) ? "readonly" : "read/write");
+ STAT_ULONG("Reference count", dbmfp->ref);
+ STAT_ULONG("Pinned block reference count", dbmfp->ref);
+ STAT_ULONG("Clear length", dbmfp->clear_len);
+ __db_print_fileid(env, dbmfp->fileid, "\tID");
+ STAT_ULONG("File type", dbmfp->ftype);
+ STAT_ULONG("LSN offset", dbmfp->lsn_offset);
+ STAT_ULONG("Max gbytes", dbmfp->gbytes);
+ STAT_ULONG("Max bytes", dbmfp->bytes);
+ STAT_ULONG("Cache priority", dbmfp->priority);
+ STAT_POINTER("mmap address", dbmfp->addr);
+ STAT_ULONG("mmap length", dbmfp->len);
+ __db_prflags(env, NULL, dbmfp->flags, cfn, NULL, "\tFlags");
+ __db_print_fh(env, "File handle", dbmfp->fhp, flags);
+ }
+
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "MPOOLFILE structures:");
+ cnt = 0;
+ ret = __memp_walk_files(env, mp, __memp_print_files, fmap, &cnt, flags);
+ MPOOL_SYSTEM_UNLOCK(env);
+ if (ret != 0)
+ return (ret);
+
+ if (cnt < FMAP_ENTRIES)
+ fmap[cnt] = INVALID_ROFF;
+ else
+ fmap[FMAP_ENTRIES] = INVALID_ROFF;
+
+ /* Dump the individual caches. */
+ for (i = 0; i < mp->nreg; ++i) {
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "Cache #%d:", i + 1);
+ if (i > 0)
+ __env_alloc_print(&dbmp->reginfo[i], flags);
+ if ((ret = __memp_print_hash(
+ env, dbmp, &dbmp->reginfo[i], fmap, flags)) != 0)
+ break;
+ }
+
+ return (ret);
+}
+
+static int
+__memp_print_files(env, mfp, argp, countp, flags)
+ ENV *env;
+ MPOOLFILE *mfp;
+ void *argp;
+ u_int32_t *countp;
+ u_int32_t flags;
+{
+ roff_t *fmap;
+ DB_MPOOL *dbmp;
+ u_int32_t mfp_flags;
+ static const FN fn[] = {
+ { MP_CAN_MMAP, "MP_CAN_MMAP" },
+ { MP_DIRECT, "MP_DIRECT" },
+ { MP_EXTENT, "MP_EXTENT" },
+ { MP_FAKE_DEADFILE, "deadfile" },
+ { MP_FAKE_FILEWRITTEN, "file written" },
+ { MP_FAKE_NB, "no backing file" },
+ { MP_FAKE_UOC, "unlink on close" },
+ { MP_NOT_DURABLE, "not durable" },
+ { MP_TEMP, "MP_TEMP" },
+ { 0, NULL }
+ };
+
+ dbmp = env->mp_handle;
+ fmap = argp;
+
+ __db_msg(env, "File #%d: %s", *countp + 1, __memp_fns(dbmp, mfp));
+ __mutex_print_debug_single(env, "Mutex", mfp->mutex, flags);
+
+ MUTEX_LOCK(env, mfp->mutex);
+ STAT_ULONG("Revision count", mfp->revision);
+ STAT_ULONG("Reference count", mfp->mpf_cnt);
+ STAT_ULONG("Sync/read only open count", mfp->neutral_cnt);
+ STAT_ULONG("Block count", mfp->block_cnt);
+ STAT_ULONG("Last page number", mfp->last_pgno);
+ STAT_ULONG("Original last page number", mfp->orig_last_pgno);
+ STAT_ULONG("Maximum page number", mfp->maxpgno);
+ STAT_LONG("Type", mfp->ftype);
+ STAT_LONG("Priority", mfp->priority);
+ STAT_LONG("Page's LSN offset", mfp->lsn_off);
+ STAT_LONG("Page's clear length", mfp->clear_len);
+
+ __db_print_fileid(env,
+ R_ADDR(dbmp->reginfo, mfp->fileid_off), "\tID");
+
+ mfp_flags = 0;
+ if (mfp->deadfile)
+ FLD_SET(mfp_flags, MP_FAKE_DEADFILE);
+ if (mfp->file_written)
+ FLD_SET(mfp_flags, MP_FAKE_FILEWRITTEN);
+ if (mfp->no_backing_file)
+ FLD_SET(mfp_flags, MP_FAKE_NB);
+ if (mfp->unlink_on_close)
+ FLD_SET(mfp_flags, MP_FAKE_UOC);
+ __db_prflags(env, NULL, mfp_flags, fn, NULL, "\tFlags");
+
+ if (*countp < FMAP_ENTRIES)
+ fmap[*countp] = R_OFFSET(dbmp->reginfo, mfp);
+ (*countp)++;
+ MUTEX_UNLOCK(env, mfp->mutex);
+ return (0);
+}
+
+/*
+ * __memp_print_hash --
+ * Display hash bucket statistics for a cache.
+ */
+static int
+__memp_print_hash(env, dbmp, reginfo, fmap, flags)
+ ENV *env;
+ DB_MPOOL *dbmp;
+ REGINFO *reginfo;
+ roff_t *fmap;
+ u_int32_t flags;
+{
+ BH *bhp, *vbhp;
+ DB_MPOOL_HASH *hp;
+ DB_MSGBUF mb;
+ MPOOL *c_mp;
+ u_int32_t bucket;
+
+ c_mp = reginfo->primary;
+ DB_MSGBUF_INIT(&mb);
+ STAT_ULONG("Hash table last-checked", c_mp->last_checked);
+ STAT_ULONG("Hash table LRU priority", c_mp->lru_priority);
+ STAT_ULONG("Hash table LRU generation", c_mp->lru_generation);
+ STAT_ULONG("Put counter", c_mp->put_counter);
+
+ /* Display the hash table list of BH's. */
+ __db_msg(env,
+ "BH hash table (%lu hash slots)", (u_long)c_mp->htab_buckets);
+ __db_msg(env, "bucket #: priority, I/O wait, [mutex]");
+ __db_msg(env, "\tpageno, file, ref, LSN, address, priority, flags");
+
+ for (hp = R_ADDR(reginfo, c_mp->htab),
+ bucket = 0; bucket < c_mp->htab_buckets; ++hp, ++bucket) {
+ MUTEX_READLOCK(env, hp->mtx_hash);
+ if ((bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) != NULL) {
+ __db_msgadd(env, &mb,
+ "bucket %lu: %lu (%lu dirty)",
+ (u_long)bucket, (u_long)hp->hash_io_wait,
+ (u_long)atomic_read(&hp->hash_page_dirty));
+ if (hp->hash_frozen != 0)
+ __db_msgadd(env, &mb, "(MVCC %lu/%lu/%lu) ",
+ (u_long)hp->hash_frozen,
+ (u_long)hp->hash_thawed,
+ (u_long)hp->hash_frozen_freed);
+ __mutex_print_debug_stats(
+ env, &mb, hp->mtx_hash, flags);
+ DB_MSGBUF_FLUSH(env, &mb);
+ }
+ for (; bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) {
+ __memp_print_bh(env, dbmp, NULL, bhp, fmap);
+
+ /* Print the version chain, if it exists. */
+ for (vbhp = SH_CHAIN_PREV(bhp, vc, __bh);
+ vbhp != NULL;
+ vbhp = SH_CHAIN_PREV(vbhp, vc, __bh)) {
+ __memp_print_bh(env, dbmp,
+ " next:\t", vbhp, fmap);
+ }
+ }
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ }
+
+ return (0);
+}
+
+/*
+ * __memp_print_bh --
+ * Display a BH structure.
+ */
+static void
+__memp_print_bh(env, dbmp, prefix, bhp, fmap)
+ ENV *env;
+ DB_MPOOL *dbmp;
+ const char *prefix;
+ BH *bhp;
+ roff_t *fmap;
+{
+ static const FN fn[] = {
+ { BH_CALLPGIN, "callpgin" },
+ { BH_DIRTY, "dirty" },
+ { BH_DIRTY_CREATE, "created" },
+ { BH_DISCARD, "discard" },
+ { BH_EXCLUSIVE, "exclusive" },
+ { BH_FREED, "freed" },
+ { BH_FROZEN, "frozen" },
+ { BH_TRASH, "trash" },
+ { BH_THAWED, "thawed" },
+ { 0, NULL }
+ };
+ DB_MSGBUF mb;
+ int i;
+
+ DB_MSGBUF_INIT(&mb);
+
+ if (prefix != NULL)
+ __db_msgadd(env, &mb, "%s", prefix);
+ else
+ __db_msgadd(env, &mb, "\t");
+
+ for (i = 0; i < FMAP_ENTRIES; ++i)
+ if (fmap[i] == INVALID_ROFF || fmap[i] == bhp->mf_offset)
+ break;
+
+ if (fmap[i] == INVALID_ROFF)
+ __db_msgadd(env, &mb, "%5lu, %lu, ",
+ (u_long)bhp->pgno, (u_long)bhp->mf_offset);
+ else
+ __db_msgadd(
+ env, &mb, "%5lu, #%d, ", (u_long)bhp->pgno, i + 1);
+
+ __db_msgadd(env, &mb, "%2lu, %lu/%lu", (u_long)atomic_read(&bhp->ref),
+ F_ISSET(bhp, BH_FROZEN) ? 0 : (u_long)LSN(bhp->buf).file,
+ F_ISSET(bhp, BH_FROZEN) ? 0 : (u_long)LSN(bhp->buf).offset);
+ if (bhp->td_off != INVALID_ROFF)
+ __db_msgadd(env, &mb, " (@%lu/%lu 0x%x)",
+ (u_long)VISIBLE_LSN(env, bhp)->file,
+ (u_long)VISIBLE_LSN(env, bhp)->offset,
+ BH_OWNER(env, bhp)->txnid);
+ __db_msgadd(env, &mb, ", %#08lx, %lu",
+ (u_long)R_OFFSET(dbmp->reginfo, bhp), (u_long)bhp->priority);
+ __db_prflags(env, &mb, bhp->flags, fn, " (", ")");
+ DB_MSGBUF_FLUSH(env, &mb);
+}
+
+/*
+ * __memp_stat_wait --
+ * Total hash bucket wait stats into the region.
+ */
+static void
+__memp_stat_wait(env, reginfo, mp, mstat, flags)
+ ENV *env;
+ REGINFO *reginfo;
+ MPOOL *mp;
+ DB_MPOOL_STAT *mstat;
+ u_int32_t flags;
+{
+ DB_MPOOL_HASH *hp;
+ u_int32_t i;
+ uintmax_t tmp_nowait, tmp_wait;
+
+ mstat->st_hash_max_wait = 0;
+ hp = R_ADDR(reginfo, mp->htab);
+ for (i = 0; i < mp->htab_buckets; i++, hp++) {
+ __mutex_set_wait_info(
+ env, hp->mtx_hash, &tmp_wait, &tmp_nowait);
+ mstat->st_hash_nowait += tmp_nowait;
+ mstat->st_hash_wait += tmp_wait;
+ if (tmp_wait > mstat->st_hash_max_wait) {
+ mstat->st_hash_max_wait = tmp_wait;
+ mstat->st_hash_max_nowait = tmp_nowait;
+ }
+ if (LF_ISSET(DB_STAT_CLEAR |
+ DB_STAT_SUBSYSTEM) == DB_STAT_CLEAR)
+ __mutex_clear(env, hp->mtx_hash);
+
+ mstat->st_io_wait += hp->hash_io_wait;
+ mstat->st_mvcc_frozen += hp->hash_frozen;
+ mstat->st_mvcc_thawed += hp->hash_thawed;
+ mstat->st_mvcc_freed += hp->hash_frozen_freed;
+ if (LF_ISSET(DB_STAT_CLEAR)) {
+ hp->hash_io_wait = 0;
+ hp->hash_frozen = 0;
+ hp->hash_thawed = 0;
+ hp->hash_frozen_freed = 0;
+ }
+ }
+}
+
+#else /* !HAVE_STATISTICS */
+
+int
+__memp_stat_pp(dbenv, gspp, fspp, flags)
+ DB_ENV *dbenv;
+ DB_MPOOL_STAT **gspp;
+ DB_MPOOL_FSTAT ***fspp;
+ u_int32_t flags;
+{
+ COMPQUIET(gspp, NULL);
+ COMPQUIET(fspp, NULL);
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(dbenv->env));
+}
+
+int
+__memp_stat_print_pp(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(dbenv->env));
+}
+#endif
+
+/*
+ * __memp_stat_hash --
+ * Total hash bucket stats (other than mutex wait) into the region.
+ *
+ * PUBLIC: void __memp_stat_hash __P((REGINFO *, MPOOL *, u_int32_t *));
+ */
+void
+__memp_stat_hash(reginfo, mp, dirtyp)
+ REGINFO *reginfo;
+ MPOOL *mp;
+ u_int32_t *dirtyp;
+{
+ DB_MPOOL_HASH *hp;
+ u_int32_t dirty, i;
+
+ hp = R_ADDR(reginfo, mp->htab);
+ for (i = 0, dirty = 0; i < mp->htab_buckets; i++, hp++)
+ dirty += (u_int32_t)atomic_read(&hp->hash_page_dirty);
+ *dirtyp = dirty;
+}
diff --git a/src/mp/mp_sync.c b/src/mp/mp_sync.c
new file mode 100644
index 00000000..fa06b1d4
--- /dev/null
+++ b/src/mp/mp_sync.c
@@ -0,0 +1,965 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/db_page.h"
+#include "dbinc/hash.h"
+
+typedef struct {
+ DB_MPOOL_HASH *track_hp; /* Hash bucket. */
+
+ roff_t track_off; /* Page file offset. */
+ db_pgno_t track_pgno; /* Page number. */
+} BH_TRACK;
+
+static int __bhcmp __P((const void *, const void *));
+static int __memp_close_flush_files __P((ENV *, int));
+static int __memp_sync_files __P((ENV *));
+static int __memp_sync_file __P((ENV *,
+ MPOOLFILE *, void *, u_int32_t *, u_int32_t));
+
+/*
+ * __memp_walk_files --
+ * PUBLIC: int __memp_walk_files __P((ENV *, MPOOL *,
+ * PUBLIC: int (*) __P((ENV *, MPOOLFILE *, void *,
+ * PUBLIC: u_int32_t *, u_int32_t)), void *, u_int32_t *, u_int32_t));
+ */
+int
+__memp_walk_files(env, mp, func, arg, countp, flags)
+ ENV *env;
+ MPOOL *mp;
+ int (*func)__P((ENV *, MPOOLFILE *, void *, u_int32_t *, u_int32_t));
+ void *arg;
+ u_int32_t *countp;
+ u_int32_t flags;
+{
+ DB_MPOOL *dbmp;
+ DB_MPOOL_HASH *hp;
+ MPOOLFILE *mfp;
+ int i, ret, t_ret;
+
+ dbmp = env->mp_handle;
+ ret = 0;
+
+ hp = R_ADDR(dbmp->reginfo, mp->ftab);
+ for (i = 0; i < MPOOL_FILE_BUCKETS; i++, hp++) {
+ MUTEX_LOCK(env, hp->mtx_hash);
+ SH_TAILQ_FOREACH(mfp, &hp->hash_bucket, q, __mpoolfile) {
+ if ((t_ret = func(env,
+ mfp, arg, countp, flags)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0 && !LF_ISSET(DB_STAT_MEMP_NOERROR))
+ break;
+ }
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ if (ret != 0 && !LF_ISSET(DB_STAT_MEMP_NOERROR))
+ break;
+ }
+ return (ret);
+}
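+
+/*
+ * Illustrative sketch (not part of this build): a walker callback matching
+ * the signature above. This hypothetical one counts files with written
+ * buffers, assuming the caller supplies countp; a non-zero return stops
+ * the walk unless DB_STAT_MEMP_NOERROR was passed.
+ *
+ *	static int
+ *	count_written(env, mfp, argp, countp, flags)
+ *		ENV *env;
+ *		MPOOLFILE *mfp;
+ *		void *argp;
+ *		u_int32_t *countp;
+ *		u_int32_t flags;
+ *	{
+ *		COMPQUIET(env, NULL);
+ *		COMPQUIET(argp, NULL);
+ *		COMPQUIET(flags, 0);
+ *
+ *		if (mfp->file_written)
+ *			++*countp;
+ *		return (0);
+ *	}
+ */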
+
+/*
+ * __memp_discard_all_mpfs --
+ * Force-discard all mpoolfiles. When closing a private environment, we
+ * always want to discard all mpoolfiles to avoid a memory leak.
+ *
+ * PUBLIC: int __memp_discard_all_mpfs __P((ENV *, MPOOL *));
+ */
+int
+__memp_discard_all_mpfs(env, mp)
+ ENV *env;
+ MPOOL *mp;
+{
+ DB_MPOOL *dbmp;
+ DB_MPOOL_HASH *hp;
+ MPOOLFILE *mfp;
+ int i, ret, t_ret;
+
+ ret = t_ret = 0;
+ mfp = NULL;
+ hp = NULL;
+ dbmp = env->mp_handle;
+
+ hp = R_ADDR(dbmp->reginfo, mp->ftab);
+ for (i = 0; i < MPOOL_FILE_BUCKETS; i++, hp++) {
+ MUTEX_LOCK(env, hp->mtx_hash);
+ while ((mfp = SH_TAILQ_FIRST(
+ &hp->hash_bucket, __mpoolfile)) != NULL) {
+ MUTEX_LOCK(env, mfp->mutex);
+ if ((t_ret = __memp_mf_discard(dbmp, mfp, 1)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ }
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ }
+ return (ret);
+}
+
+/*
+ * __memp_sync_pp --
+ * ENV->memp_sync pre/post processing.
+ *
+ * PUBLIC: int __memp_sync_pp __P((DB_ENV *, DB_LSN *));
+ */
+int
+__memp_sync_pp(dbenv, lsnp)
+ DB_ENV *dbenv;
+ DB_LSN *lsnp;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->mp_handle, "memp_sync", DB_INIT_MPOOL);
+
+ /*
+ * If no LSN is provided, flush the entire cache (reasonable usage
+ * even if there's no log subsystem configured).
+ */
+ if (lsnp != NULL)
+ ENV_REQUIRES_CONFIG(env,
+ env->lg_handle, "memp_sync", DB_INIT_LOG);
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__memp_sync(env, DB_SYNC_CACHE, lsnp)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __memp_sync --
+ * ENV->memp_sync.
+ *
+ * PUBLIC: int __memp_sync __P((ENV *, u_int32_t, DB_LSN *));
+ */
+int
+__memp_sync(env, flags, lsnp)
+ ENV *env;
+ u_int32_t flags;
+ DB_LSN *lsnp;
+{
+ DB_MPOOL *dbmp;
+ MPOOL *mp;
+ int interrupted, ret;
+
+ dbmp = env->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+
+ /* If we've flushed to the requested LSN, return that information. */
+ if (lsnp != NULL) {
+ MPOOL_SYSTEM_LOCK(env);
+ if (LOG_COMPARE(lsnp, &mp->lsn) <= 0) {
+ *lsnp = mp->lsn;
+
+ MPOOL_SYSTEM_UNLOCK(env);
+ return (0);
+ }
+ MPOOL_SYSTEM_UNLOCK(env);
+ }
+
+ if ((ret =
+ __memp_sync_int(env, NULL, 0, flags, NULL, &interrupted)) != 0)
+ return (ret);
+
+ if (!interrupted && lsnp != NULL) {
+ MPOOL_SYSTEM_LOCK(env);
+ if (LOG_COMPARE(lsnp, &mp->lsn) > 0)
+ mp->lsn = *lsnp;
+ MPOOL_SYSTEM_UNLOCK(env);
+ }
+
+ return (0);
+}
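+
+/*
+ * Illustrative sketch (not part of this build): the documented
+ * DB_ENV->memp_sync method reaches this function. Passing a NULL LSN
+ * flushes every dirty buffer, rather than only those needed to bring
+ * the pool up to a given log record:
+ *
+ *	if ((ret = dbenv->memp_sync(dbenv, NULL)) != 0)
+ *		dbenv->err(dbenv, ret, "DB_ENV->memp_sync");
+ */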
+
+/*
+ * __memp_fsync_pp --
+ * DB_MPOOLFILE->sync pre/post processing.
+ *
+ * PUBLIC: int __memp_fsync_pp __P((DB_MPOOLFILE *));
+ */
+int
+__memp_fsync_pp(dbmfp)
+ DB_MPOOLFILE *dbmfp;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbmfp->env;
+
+ MPF_ILLEGAL_BEFORE_OPEN(dbmfp, "DB_MPOOLFILE->sync");
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__memp_fsync(dbmfp)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __memp_fsync --
+ * DB_MPOOLFILE->sync.
+ *
+ * PUBLIC: int __memp_fsync __P((DB_MPOOLFILE *));
+ */
+int
+__memp_fsync(dbmfp)
+ DB_MPOOLFILE *dbmfp;
+{
+ MPOOLFILE *mfp;
+
+ mfp = dbmfp->mfp;
+
+ /*
+ * If this handle doesn't have a file descriptor that's open for
+ * writing, or if the file is a temporary, or if the file hasn't
+ * been written since it was flushed, there's no reason to proceed
+ * further.
+ */
+ if (F_ISSET(dbmfp, MP_READONLY))
+ return (0);
+
+ if (F_ISSET(dbmfp->mfp, MP_TEMP) || dbmfp->mfp->no_backing_file)
+ return (0);
+
+ if (mfp->file_written == 0)
+ return (0);
+
+ return (__memp_sync_int(
+ dbmfp->env, dbmfp, 0, DB_SYNC_FILE, NULL, NULL));
+}
+
+/*
+ * __mp_xxx_fh --
+ * Return a file descriptor for DB 1.85 compatibility locking.
+ *
+ * PUBLIC: int __mp_xxx_fh __P((DB_MPOOLFILE *, DB_FH **));
+ */
+int
+__mp_xxx_fh(dbmfp, fhp)
+ DB_MPOOLFILE *dbmfp;
+ DB_FH **fhp;
+{
+ int ret;
+
+ /*
+ * This is a truly spectacular layering violation, intended ONLY to
+ * support compatibility for the DB 1.85 DB->fd call.
+ *
+ * Sync the database file to disk, creating the file as necessary.
+ *
+ * We skip the MP_READONLY and MP_TEMP tests done by memp_fsync(3).
+ * The MP_READONLY test isn't interesting because we will either
+ * already have a file descriptor (we opened the database file for
+ * reading) or we aren't readonly (we created the database which
+ * requires write privileges). The MP_TEMP test isn't interesting
+ * because we want to write to the backing file regardless so that
+ * we get a file descriptor to return.
+ */
+ if ((*fhp = dbmfp->fhp) != NULL)
+ return (0);
+
+ if ((ret = __memp_sync_int(
+ dbmfp->env, dbmfp, 0, DB_SYNC_FILE, NULL, NULL)) == 0)
+ *fhp = dbmfp->fhp;
+ return (ret);
+}
+
+/*
+ * __memp_sync_int --
+ * Mpool sync internal function.
+ *
+ * PUBLIC: int __memp_sync_int __P((ENV *,
+ * PUBLIC: DB_MPOOLFILE *, u_int32_t, u_int32_t, u_int32_t *, int *));
+ */
+int
+__memp_sync_int(env, dbmfp, trickle_max, flags, wrote_totalp, interruptedp)
+ ENV *env;
+ DB_MPOOLFILE *dbmfp;
+ u_int32_t trickle_max, flags, *wrote_totalp;
+ int *interruptedp;
+{
+ BH *bhp;
+ BH_TRACK *bharray;
+ DB_MPOOL *dbmp;
+ DB_MPOOL_HASH *hp;
+ MPOOL *c_mp, *mp;
+ MPOOLFILE *mfp;
+ db_mutex_t mutex;
+ roff_t last_mf_offset;
+ u_int32_t ar_cnt, ar_max, i, n_cache, remaining, wrote_total;
+ int32_t wrote_cnt;
+ int dirty, filecnt, maxopenfd, required_write, ret, t_ret;
+
+ dbmp = env->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+ last_mf_offset = INVALID_ROFF;
+ filecnt = wrote_total = 0;
+
+ if (wrote_totalp != NULL)
+ *wrote_totalp = 0;
+ if (interruptedp != NULL)
+ *interruptedp = 0;
+
+ /*
+ * If we're flushing the entire cache, taking a checkpoint or flushing
+ * a specific file, we really have to write the blocks and we have to
+ * confirm they made it to disk. Otherwise, we can skip a block if
+ * it's hard to get.
+ */
+ required_write = LF_ISSET(DB_SYNC_CACHE |
+ DB_SYNC_CHECKPOINT | DB_SYNC_FILE | DB_SYNC_QUEUE_EXTENT);
+
+ /* Get shared configuration information. */
+ MPOOL_SYSTEM_LOCK(env);
+ maxopenfd = mp->mp_maxopenfd;
+ MPOOL_SYSTEM_UNLOCK(env);
+
+ /* Assume one dirty page per bucket. */
+ ar_max = mp->nreg * mp->htab_buckets;
+ if ((ret =
+ __os_malloc(env, ar_max * sizeof(BH_TRACK), &bharray)) != 0)
+ return (ret);
+
+ /*
+ * Walk each cache's list of buffers and mark every dirty buffer to be
+ * written, or to be potentially written, depending on our flags.
+ */
+ for (ar_cnt = 0, n_cache = 0; n_cache < mp->nreg; ++n_cache) {
+ c_mp = dbmp->reginfo[n_cache].primary;
+
+ hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab);
+ for (i = 0; i < c_mp->htab_buckets; i++, hp++) {
+ /*
+ * We can check for empty buckets before locking as
+ * we only care if the pointer is zero or non-zero.
+ * We can ignore empty or clean buckets because we
+ * only need write buffers that were dirty before
+ * we started.
+ */
+#ifdef DIAGNOSTIC
+ if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
+#else
+ if (atomic_read(&hp->hash_page_dirty) == 0)
+#endif
+ continue;
+
+ dirty = 0;
+ MUTEX_LOCK(env, hp->mtx_hash);
+ SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh) {
+ /* Always ignore clean pages. */
+ if (!F_ISSET(bhp, BH_DIRTY))
+ continue;
+
+ dirty++;
+ mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
+
+ /*
+ * Ignore in-memory files, unless the file is
+ * specifically being flushed.
+ */
+ if (mfp->no_backing_file)
+ continue;
+ if (!LF_ISSET(DB_SYNC_FILE) &&
+ F_ISSET(mfp, MP_TEMP))
+ continue;
+
+ /*
+ * Ignore files that aren't involved in DB's
+ * transactional operations during checkpoints.
+ */
+ if (LF_ISSET(DB_SYNC_CHECKPOINT) &&
+ mfp->lsn_off == DB_LSN_OFF_NOTSET)
+ continue;
+
+ /*
+ * Ignore files that aren't Queue extent files
+ * if we're flushing a Queue file with extents.
+ */
+ if (LF_ISSET(DB_SYNC_QUEUE_EXTENT) &&
+ !F_ISSET(mfp, MP_EXTENT))
+ continue;
+
+ /*
+ * If we're flushing a specific file, see if
+ * this page is from that file.
+ */
+ if (dbmfp != NULL && mfp != dbmfp->mfp)
+ continue;
+
+ /* Track the buffer, we want it. */
+ bharray[ar_cnt].track_hp = hp;
+ bharray[ar_cnt].track_pgno = bhp->pgno;
+ bharray[ar_cnt].track_off = bhp->mf_offset;
+ ar_cnt++;
+
+ /*
+ * If we run out of space, double and continue.
+ * Don't stop at trickle_max, we want to sort
+ * as large a sample set as possible in order
+ * to minimize disk seeks.
+ */
+ if (ar_cnt >= ar_max) {
+ if ((ret = __os_realloc(env,
+ (ar_max * 2) * sizeof(BH_TRACK),
+ &bharray)) != 0)
+ break;
+ ar_max *= 2;
+ }
+ }
+
+ if (ret != 0)
+ goto err;
+ /*
+ * We are only checking this in diagnostic mode
+ * since it requires extra latching to keep the count
+ * in sync with the number of bits counted.
+ */
+ DB_ASSERT(env,
+ dirty == (int)atomic_read(&hp->hash_page_dirty));
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+
+ /* Check if the call has been interrupted. */
+ if (LF_ISSET(DB_SYNC_INTERRUPT_OK) && FLD_ISSET(
+ mp->config_flags, DB_MEMP_SYNC_INTERRUPT)) {
+ STAT(++mp->stat.st_sync_interrupted);
+ if (interruptedp != NULL)
+ *interruptedp = 1;
+ goto err;
+ }
+ }
+ }
+
+ /* If there are no buffers to write, we're done. */
+ if (ar_cnt == 0)
+ goto done;
+
+ /*
+ * Write the buffers in file/page order, trying to reduce seeks by the
+ * filesystem and, when pages are smaller than filesystem block sizes,
+ * reduce the actual number of writes.
+ */
+ if (ar_cnt > 1)
+ qsort(bharray, ar_cnt, sizeof(BH_TRACK), __bhcmp);
+
+ /*
+ * If we're trickling buffers, only write enough to reach the correct
+ * percentage.
+ */
+ if (LF_ISSET(DB_SYNC_TRICKLE) && ar_cnt > trickle_max)
+ ar_cnt = trickle_max;
+
+ /*
+ * Flush the log. We have to ensure the log records reflecting the
+ * changes on the database pages we're writing have already made it
+ * to disk. We still have to check the log each time we write a page
+ * (because pages we are about to write may be modified after we have
+ * flushed the log), but in general this will at least avoid any I/O
+ * on the log's part.
+ */
+ if (LOGGING_ON(env) && (ret = __log_flush(env, NULL)) != 0)
+ goto err;
+
+ /*
+ * Walk the array, writing buffers. When we write a buffer, we NULL
+ * out its hash bucket pointer so we don't process a slot more than
+ * once.
+ */
+ for (i = wrote_cnt = 0, remaining = ar_cnt; remaining > 0; ++i) {
+ if (i >= ar_cnt) {
+ i = 0;
+ __os_yield(env, 1, 0);
+ }
+ if ((hp = bharray[i].track_hp) == NULL)
+ continue;
+
+ /* Lock the hash bucket and find the buffer. */
+ mutex = hp->mtx_hash;
+ MUTEX_READLOCK(env, mutex);
+ SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh)
+ if (bhp->pgno == bharray[i].track_pgno &&
+ bhp->mf_offset == bharray[i].track_off)
+ break;
+
+ /*
+ * If we can't find the buffer, we're done; somebody else must
+ * have written it.
+ *
+ * If the buffer isn't dirty, we're done, there's no work
+ * needed.
+ */
+ if (bhp == NULL || !F_ISSET(bhp, BH_DIRTY)) {
+ MUTEX_UNLOCK(env, mutex);
+ --remaining;
+ bharray[i].track_hp = NULL;
+ continue;
+ }
+
+ /*
+ * If the buffer is locked by another thread, ignore it, we'll
+ * come back to it.
+ */
+ if (F_ISSET(bhp, BH_EXCLUSIVE)) {
+ MUTEX_UNLOCK(env, mutex);
+ if (!required_write) {
+ --remaining;
+ bharray[i].track_hp = NULL;
+ }
+ continue;
+ }
+
+ /* Pin the buffer into memory. */
+ atomic_inc(env, &bhp->ref);
+ MUTEX_UNLOCK(env, mutex);
+ MUTEX_READLOCK(env, bhp->mtx_buf);
+ DB_ASSERT(env, !F_ISSET(bhp, BH_EXCLUSIVE));
+
+ /*
+ * When swapping the hash bucket mutex for the buffer mutex,
+ * we may have raced with an MVCC update. In that case, we
+ * no longer have the most recent version, and need to retry
+ * (the buffer header we have pinned will no longer be marked
+ * dirty, so we can't just write it).
+ */
+ if (SH_CHAIN_HASNEXT(bhp, vc)) {
+ atomic_dec(env, &bhp->ref);
+ MUTEX_UNLOCK(env, bhp->mtx_buf);
+ continue;
+ }
+
+ /*
+ * If we've switched files, check to see if we're configured
+ * to close file descriptors.
+ */
+ if (maxopenfd != 0 && bhp->mf_offset != last_mf_offset) {
+ if (++filecnt >= maxopenfd) {
+ filecnt = 0;
+ if ((t_ret = __memp_close_flush_files(
+ env, 1)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ last_mf_offset = bhp->mf_offset;
+ }
+
+ /*
+ * If the buffer is dirty, we write it. We only try to
+ * write the buffer once.
+ */
+ if (F_ISSET(bhp, BH_DIRTY)) {
+ mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
+ if ((t_ret =
+ __memp_bhwrite(dbmp, hp, mfp, bhp, 1)) == 0) {
+ ++wrote_cnt;
+ ++wrote_total;
+ } else {
+ /* The buffer is being backed up, try again. */
+ if (t_ret == EAGAIN) {
+ atomic_dec(env, &bhp->ref);
+ MUTEX_UNLOCK(env, bhp->mtx_buf);
+ continue;
+ }
+ if (ret == 0)
+ ret = t_ret;
+ __db_errx(env, DB_STR_A("3027",
+ "%s: unable to flush page: %lu", "%s %lu"),
+ __memp_fns(dbmp, mfp), (u_long)bhp->pgno);
+
+ }
+ }
+
+ /* We disposed of this buffer. */
+ --remaining;
+ bharray[i].track_hp = NULL;
+
+ /* Discard our buffer reference. */
+ DB_ASSERT(env, atomic_read(&bhp->ref) > 0);
+ atomic_dec(env, &bhp->ref);
+ MUTEX_UNLOCK(env, bhp->mtx_buf);
+
+ /* Check if the call has been interrupted. */
+ if (LF_ISSET(DB_SYNC_INTERRUPT_OK) &&
+ FLD_ISSET(mp->config_flags, DB_MEMP_SYNC_INTERRUPT)) {
+ STAT(++mp->stat.st_sync_interrupted);
+ if (interruptedp != NULL)
+ *interruptedp = 1;
+ goto err;
+ }
+
+ /*
+ * Sleep after some number of writes to avoid disk saturation.
+ * Don't cache the max writes value, an application shutting
+ * down might reset the value in order to do a fast flush or
+ * checkpoint.
+ */
+ if (!LF_ISSET(DB_SYNC_SUPPRESS_WRITE) &&
+ !FLD_ISSET(mp->config_flags, DB_MEMP_SUPPRESS_WRITE) &&
+ mp->mp_maxwrite != 0 && wrote_cnt >= mp->mp_maxwrite) {
+ wrote_cnt = 0;
+ __os_yield(env, 0, (u_long)mp->mp_maxwrite_sleep);
+ }
+ }
+
+done: /*
+ * If a write is required, we have to force the pages to disk. We
+ * don't do this as we go along because we want to give the OS as
+ * much time as possible to lazily flush, and because we have to flush
+ * files that might not even have had dirty buffers in the cache, so
+ * we have to walk the files list.
+ */
+ if (ret == 0 && required_write) {
+ if (dbmfp == NULL)
+ ret = __memp_sync_files(env);
+ else
+ ret = __os_fsync(env, dbmfp->fhp);
+ }
+
+ /* If we've opened files to flush pages, close them. */
+ if ((t_ret = __memp_close_flush_files(env, 0)) != 0 && ret == 0)
+ ret = t_ret;
+
+err: __os_free(env, bharray);
+ if (wrote_totalp != NULL)
+ *wrote_totalp = wrote_total;
+
+ return (ret);
+}
+
+static int
+__memp_sync_file(env, mfp, argp, countp, flags)
+ ENV *env;
+ MPOOLFILE *mfp;
+ void *argp;
+ u_int32_t *countp;
+ u_int32_t flags;
+{
+ DB_MPOOL *dbmp;
+ DB_MPOOLFILE *dbmfp;
+ int ret, t_ret;
+
+ COMPQUIET(countp, NULL);
+ COMPQUIET(flags, 0);
+
+ if (!mfp->file_written || mfp->no_backing_file ||
+ mfp->deadfile || F_ISSET(mfp, MP_TEMP))
+ return (0);
+ /*
+ * Pin the MPOOLFILE structure into memory, and release the
+ * region mutex allowing us to walk the linked list. We'll
+ * re-acquire that mutex to move to the next entry in the list.
+ *
+ * This works because we only need to flush current entries: we
+ * don't care about new entries being added, and since the linked
+ * list is never re-ordered, a single pass is sufficient. It
+ * requires that MPOOLFILE structures removed before we get to
+ * them be flushed to disk, but that's nothing new; they could
+ * have been removed while the checkpoint was running, too.
+ *
+ * Once we have the MPOOLFILE lock, re-check that the MPOOLFILE is
+ * not being discarded. (A thread removing the MPOOLFILE will
+ * hold the MPOOLFILE mutex, set deadfile, drop the MPOOLFILE
+ * mutex and then acquire the region mutex to walk the linked
+ * list and remove the MPOOLFILE structure.) Make sure the
+ * MPOOLFILE wasn't marked dead while we waited for the mutex.
+ */
+ MUTEX_LOCK(env, mfp->mutex);
+ if (!mfp->file_written || mfp->deadfile) {
+ MUTEX_UNLOCK(env, mfp->mutex);
+ return (0);
+ }
+ ++mfp->mpf_cnt;
+ ++mfp->neutral_cnt;
+ MUTEX_UNLOCK(env, mfp->mutex);
+
+ /*
+ * Look for an already open, writable handle (fsync doesn't
+ * work on read-only Windows handles).
+ */
+ dbmp = env->mp_handle;
+ MUTEX_LOCK(env, dbmp->mutex);
+ TAILQ_FOREACH(dbmfp, &dbmp->dbmfq, q) {
+ if (dbmfp->mfp != mfp || F_ISSET(dbmfp, MP_READONLY))
+ continue;
+ /*
+ * We don't want to hold the mutex while calling sync.
+ * Increment the DB_MPOOLFILE handle ref count to pin
+ * it into memory.
+ */
+ ++dbmfp->ref;
+ break;
+ }
+ MUTEX_UNLOCK(env, dbmp->mutex);
+
+ /* If we don't find a handle we can use, open one. */
+ if (dbmfp == NULL) {
+ if ((ret = __memp_mf_sync(dbmp, mfp, 1)) != 0) {
+ __db_err(env, ret, DB_STR_A("3028",
+ "%s: unable to flush", "%s"), (char *)
+ R_ADDR(dbmp->reginfo, mfp->path_off));
+ }
+ } else
+ ret = __os_fsync(env, dbmfp->fhp);
+
+ /*
+ * Re-acquire the MPOOLFILE mutex, we need it to modify the
+ * reference count.
+ */
+ MUTEX_LOCK(env, mfp->mutex);
+
+ /*
+ * If we wrote the file and there are no other references (or there
+ * is a single reference, and it's the one we opened to write
+ * buffers during checkpoint), clear the file_written flag. We
+ * do this so that applications opening thousands of files don't
+ * loop here opening and flushing those files during checkpoint.
+ *
+ * The danger here is if a buffer were to be written as part of
+ * a checkpoint, and then not be flushed to disk. This cannot
+ * happen because we only clear file_written when there are no
+ * other users of the MPOOLFILE in the system, and, as we hold
+ * the region lock, no possibility of another thread of control
+ * racing with us to open a MPOOLFILE.
+ */
+ if (mfp->mpf_cnt == 1 || (mfp->mpf_cnt == 2 &&
+ dbmfp != NULL && F_ISSET(dbmfp, MP_FLUSH))) {
+ mfp->file_written = 0;
+
+ /*
+ * We may be the last reference to an MPOOLFILE, as we
+ * weren't holding the MPOOLFILE mutex when flushing
+ * its buffers to disk. If we can discard it, set
+ * a flag to schedule a clean-out pass. (Not likely,
+ * I mean, what are the chances that there aren't any
+ * buffers in the pool? Regardless, it might happen.)
+ */
+ if (mfp->mpf_cnt == 1 && mfp->block_cnt == 0)
+ *(int *)argp = 1;
+ }
+
+ /*
+ * If we found the file, we must close it in case we are the last
+ * reference to the dbmfp. NOTE: since we have incremented
+ * mfp->mpf_cnt this cannot be the last reference to the mfp.
+ * This is important since we are called with the hash bucket
+ * locked. The mfp will get freed via the cleanup pass.
+ */
+ if (dbmfp != NULL &&
+ (t_ret = __memp_fclose(dbmfp, DB_MPOOL_NOLOCK)) != 0 && ret == 0)
+ ret = t_ret;
+
+ --mfp->mpf_cnt;
+ DB_ASSERT(env, mfp->neutral_cnt != 0);
+ --mfp->neutral_cnt;
+
+ /* Unlock the MPOOLFILE. */
+ MUTEX_UNLOCK(env, mfp->mutex);
+ return (ret);
+}
+
+/*
+ * __memp_sync_files --
+ * Sync all the files in the environment, open or not.
+ */
+static int
+__memp_sync_files(env)
+ ENV *env;
+{
+ DB_MPOOL *dbmp;
+ DB_MPOOL_HASH *hp;
+ MPOOL *mp;
+ MPOOLFILE *mfp, *next_mfp;
+ int i, need_discard_pass, ret;
+
+ dbmp = env->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+ need_discard_pass = ret = 0;
+
+ ret = __memp_walk_files(env,
+ mp, __memp_sync_file, &need_discard_pass, 0, DB_STAT_MEMP_NOERROR);
+
+ /*
+ * We may need to do a last pass through the MPOOLFILE list -- if we
+ * were the last reference to an MPOOLFILE, we need to clean it out.
+ */
+ if (!need_discard_pass)
+ return (ret);
+
+ hp = R_ADDR(dbmp->reginfo, mp->ftab);
+ for (i = 0; i < MPOOL_FILE_BUCKETS; i++, hp++) {
+retry: MUTEX_LOCK(env, hp->mtx_hash);
+ for (mfp = SH_TAILQ_FIRST(&hp->hash_bucket,
+ __mpoolfile); mfp != NULL; mfp = next_mfp) {
+ next_mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile);
+ /*
+ * Do a fast check -- we can check for zero/non-zero
+ * without a mutex on the MPOOLFILE. If likely to
+ * succeed, lock the MPOOLFILE down and look for real.
+ */
+ if (mfp->deadfile ||
+ mfp->block_cnt != 0 || mfp->mpf_cnt != 0)
+ continue;
+
+ MUTEX_LOCK(env, mfp->mutex);
+ if (!mfp->deadfile &&
+ mfp->block_cnt == 0 && mfp->mpf_cnt == 0) {
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ (void)__memp_mf_discard(dbmp, mfp, 0);
+ goto retry;
+ } else
+ MUTEX_UNLOCK(env, mfp->mutex);
+ }
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ }
+ return (ret);
+}
+
+/*
+ * __memp_mf_sync --
+ * Flush an MPOOLFILE, when no currently open handle is available.
+ *
+ * PUBLIC: int __memp_mf_sync __P((DB_MPOOL *, MPOOLFILE *, int));
+ */
+int
+__memp_mf_sync(dbmp, mfp, locked)
+ DB_MPOOL *dbmp;
+ MPOOLFILE *mfp;
+ int locked;
+{
+ DB_FH *fhp;
+ DB_MPOOL_HASH *hp;
+ ENV *env;
+ MPOOL *mp;
+ int ret, t_ret;
+ char *rpath;
+
+ COMPQUIET(hp, NULL);
+ env = dbmp->env;
+
+ /*
+ * We need to be holding the hash lock: we're using the path name
+ * and __memp_nameop might try and rename the file.
+ */
+ if (!locked) {
+ mp = dbmp->reginfo[0].primary;
+ hp = R_ADDR(dbmp->reginfo, mp->ftab);
+ hp += FNBUCKET(
+ R_ADDR(dbmp->reginfo, mfp->fileid_off), DB_FILE_ID_LEN);
+ MUTEX_LOCK(env, hp->mtx_hash);
+ }
+
+ if ((ret = __db_appname(env, DB_APP_DATA,
+ R_ADDR(dbmp->reginfo, mfp->path_off), NULL, &rpath)) == 0) {
+ if ((ret = __os_open(env, rpath, 0, 0, 0, &fhp)) == 0) {
+ ret = __os_fsync(env, fhp);
+ if ((t_ret =
+ __os_closehandle(env, fhp)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ __os_free(env, rpath);
+ }
+
+ if (!locked)
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+
+ return (ret);
+}
+
+/*
+ * __memp_close_flush_files --
+ * Close files opened only to flush buffers.
+ */
+static int
+__memp_close_flush_files(env, dosync)
+ ENV *env;
+ int dosync;
+{
+ DB_MPOOL *dbmp;
+ DB_MPOOLFILE *dbmfp;
+ MPOOLFILE *mfp;
+ int ret;
+
+ dbmp = env->mp_handle;
+
+ /*
+ * The routine exists because we must close files opened by sync to
+ * flush buffers. There are two cases: first, extent files have to
+ * be closed so they may be removed when empty. Second, regular
+ * files have to be closed so we don't run out of descriptors (for
+ * example, an application partitioning its data into databases
+ * based on timestamps, so there's a continually increasing set of
+ * files).
+ *
+ * We mark files opened in the __memp_bhwrite() function with the
+ * MP_FLUSH flag. Here we walk through our file descriptor list,
+ * and, if a file was opened by __memp_bhwrite(), we close it.
+ */
+retry: MUTEX_LOCK(env, dbmp->mutex);
+ TAILQ_FOREACH(dbmfp, &dbmp->dbmfq, q)
+ if (F_ISSET(dbmfp, MP_FLUSH)) {
+ F_CLR(dbmfp, MP_FLUSH);
+ MUTEX_UNLOCK(env, dbmp->mutex);
+ if (dosync) {
+ /*
+ * If we have the only open handle on the file,
+ * clear the dirty flag so we don't re-open and
+ * sync it again when discarding the MPOOLFILE
+ * structure. Clear the flag before the sync
+ * so we can't race with a thread writing the file.
+ */
+ mfp = dbmfp->mfp;
+ if (mfp->mpf_cnt == 1) {
+ MUTEX_LOCK(env, mfp->mutex);
+ if (mfp->mpf_cnt == 1)
+ mfp->file_written = 0;
+ MUTEX_UNLOCK(env, mfp->mutex);
+ }
+ if ((ret = __os_fsync(env, dbmfp->fhp)) != 0)
+ return (ret);
+ }
+ if ((ret = __memp_fclose(dbmfp, DB_FLUSH)) != 0)
+ return (ret);
+ goto retry;
+ }
+ MUTEX_UNLOCK(env, dbmp->mutex);
+
+ return (0);
+}
+
+static int
+__bhcmp(p1, p2)
+ const void *p1, *p2;
+{
+ BH_TRACK *bhp1, *bhp2;
+
+ bhp1 = (BH_TRACK *)p1;
+ bhp2 = (BH_TRACK *)p2;
+
+ /* Sort by file (shared memory pool offset). */
+ if (bhp1->track_off < bhp2->track_off)
+ return (-1);
+ if (bhp1->track_off > bhp2->track_off)
+ return (1);
+
+ /*
+ * !!!
+ * Defend against badly written quicksort code calling the comparison
+ * function with two identical pointers (e.g., WATCOM C++ (Power++)).
+ */
+ if (bhp1->track_pgno < bhp2->track_pgno)
+ return (-1);
+ if (bhp1->track_pgno > bhp2->track_pgno)
+ return (1);
+ return (0);
+}
diff --git a/src/mp/mp_trickle.c b/src/mp/mp_trickle.c
new file mode 100644
index 00000000..fba528b3
--- /dev/null
+++ b/src/mp/mp_trickle.c
@@ -0,0 +1,112 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+
+static int __memp_trickle __P((ENV *, int, int *));
+
+/*
+ * __memp_trickle_pp --
+ * ENV->memp_trickle pre/post processing.
+ *
+ * PUBLIC: int __memp_trickle_pp __P((DB_ENV *, int, int *));
+ */
+int
+__memp_trickle_pp(dbenv, pct, nwrotep)
+ DB_ENV *dbenv;
+ int pct, *nwrotep;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->mp_handle, "memp_trickle", DB_INIT_MPOOL);
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__memp_trickle(env, pct, nwrotep)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __memp_trickle --
+ * ENV->memp_trickle.
+ */
+static int
+__memp_trickle(env, pct, nwrotep)
+ ENV *env;
+ int pct, *nwrotep;
+{
+ DB_MPOOL *dbmp;
+ MPOOL *c_mp, *mp;
+ u_int32_t clean, dirty, i, need_clean, total, dtmp, wrote;
+ int ret;
+
+ dbmp = env->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+
+ if (nwrotep != NULL)
+ *nwrotep = 0;
+
+ if (pct < 1 || pct > 100) {
+ __db_errx(env, DB_STR_A("3007",
+ "DB_ENV->memp_trickle: %d: percent must be between 1 and 100",
+ "%d"), pct);
+ return (EINVAL);
+ }
+
+ /*
+ * Loop through the caches counting total/dirty buffers.
+ *
+ * XXX
+ * Using hash_page_dirty is our only choice at the moment, but it's not
+ * as correct as we might like in the presence of pools having more
+ * than one page size, as a free 512B buffer may not be equivalent to
+ * having a free 8KB buffer.
+ */
+ for (ret = 0, i = dirty = total = 0; i < mp->nreg; ++i) {
+ c_mp = dbmp->reginfo[i].primary;
+ total += c_mp->pages;
+ __memp_stat_hash(&dbmp->reginfo[i], c_mp, &dtmp);
+ dirty += dtmp;
+ }
+
+ /*
+ * If there are sufficient clean buffers, or no buffers at all, or no
+ * dirty buffers, we're done.
+ */
+ if (total == 0 || dirty == 0)
+ return (0);
+
+ /*
+ * The total number of pages is an exact number, but the dirty page
+ * count can change while we're walking the hash buckets, and it's
+ * even possible the dirty page count ends up larger than the total
+ * number of pages.
+ */
+ clean = total > dirty ? total - dirty : 0;
+ need_clean = (total * (u_int)pct) / 100;
+ if (clean >= need_clean)
+ return (0);
+
+ need_clean -= clean;
+ ret = __memp_sync_int(env, NULL,
+ need_clean, DB_SYNC_TRICKLE | DB_SYNC_INTERRUPT_OK, &wrote, NULL);
+ STAT((mp->stat.st_page_trickle += wrote));
+ if (nwrotep != NULL)
+ *nwrotep = (int)wrote;
+
+ return (ret);
+}
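+
+/*
+ * Illustrative sketch (not part of this build): a background thread using
+ * the documented DB_ENV->memp_trickle method to keep roughly 20% of the
+ * cache clean. The sleep interval is arbitrary and shutdown handling is
+ * omitted.
+ *
+ *	int nwrote;
+ *
+ *	for (;;) {
+ *		if (dbenv->memp_trickle(dbenv, 20, &nwrote) != 0)
+ *			break;
+ *		if (nwrote == 0)
+ *			sleep(1);
+ *	}
+ */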
diff --git a/src/mutex/README b/src/mutex/README
new file mode 100644
index 00000000..6e95c5fd
--- /dev/null
+++ b/src/mutex/README
@@ -0,0 +1,110 @@
+# $Id$
+
+Note: this only applies to locking using test-and-set and fcntl calls,
+pthreads were added after this was written.
+
+Resource locking routines: lock based on a DB_MUTEX. All this gunk
+(including trying to make assembly code portable) is necessary because
+System V semaphores require system calls for uncontested locks and we
+don't want to make two system calls per resource lock.
+
+First, this is how it works. The DB_MUTEX structure contains a resource
+test-and-set lock (tsl), a file offset, a pid for debugging, and statistics
+information.
+
+If HAVE_MUTEX_FCNTL is NOT defined (that is, we know how to do
+test-and-sets for this compiler/architecture combination), we try and
+lock the resource tsl some number of times (based on the number of
+processors). If we can't acquire the mutex that way, we use a system
+call to sleep for 1ms, 2ms, 4ms, etc. (The time is bounded at 10ms for
+mutexes backing logical locks and 25ms for data structures, just in
+case.) Using the timer backoff means making two assumptions: that
+mutexes are held for brief periods (never over system calls or I/O)
+and that mutexes are not hotly contested.
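+
+A minimal sketch of that acquire path, under the assumptions above (the
+name tsl_try_set and the spin/backoff constants are illustrative, not
+the real implementation's):
+
+	for (ms = 1;;) {
+		/* Spin: retries are cheap, scale them by CPU count. */
+		for (nspins = ncpus * 50; nspins > 0; --nspins)
+			if (tsl_try_set(&mutexp->tsl))
+				return (0);
+		/* Back off: sleep 1ms, 2ms, 4ms, ..., capped at 25ms. */
+		usleep(ms * 1000);
+		if ((ms <<= 1) > 25)
+			ms = 25;
+	}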
+
+If HAVE_MUTEX_FCNTL is defined, we use a file descriptor to do byte
+locking on a file at a specified offset. In this case, ALL of the
+locking is done in the kernel. Because file descriptors are allocated
+per process, we have to provide the file descriptor as part of the lock
+call. We still have to do timer backoff because we need to be able to
+block ourselves, that is, the lock manager causes processes to wait by
+having the process acquire a mutex and then attempting to re-acquire the
+mutex. There's no way to use kernel locking to block yourself, that is,
+if you hold a lock and attempt to re-acquire it, the attempt will
+succeed.
+
+Next, let's talk about why it doesn't work the way a reasonable person
+would think it should work.
+
+Ideally, we'd have the ability to try to lock the resource tsl, and if
+that fails, increment a counter of waiting processes, then block in the
+kernel until the tsl is released. The process holding the resource tsl
+would see the wait counter when it went to release the resource tsl, and
+would wake any waiting processes up after releasing the lock. This would
+actually require both another tsl (call it the mutex tsl) and
+synchronization between the call that blocks in the kernel and the actual
+resource tsl. The mutex tsl would be used to protect accesses to the
+DB_MUTEX itself. Locking the mutex tsl would be done by a busy loop,
+which is safe because processes would never block holding that tsl (all
+they would do is try to obtain the resource tsl and set/check the wait
+count). The problem in this model is that the blocking call into the
+kernel requires a blocking semaphore, i.e. one whose normal state is
+locked.
+
+The only portable forms of locking under UNIX are fcntl(2) on a file
+descriptor/offset, and System V semaphores. Neither of these locking
+methods is sufficient to solve the problem.
+
+The problem with fcntl locking is that only the process that obtained the
+lock can release it. Remember, we want the normal state of the kernel
+semaphore to be locked. So, if the creator of the DB_MUTEX were to
+initialize the lock to "locked", then a second process locks the resource
+tsl, and then a third process needs to block, waiting for the resource
+tsl, when the second process wants to wake up the third process, it can't
+because it's not the holder of the lock! For the second process to be
+the holder of the lock, we would have to make a system call per
+uncontested lock, which is what we were trying to get away from in the
+first place.
+
+There are some hybrid schemes, such as signaling the holder of the lock,
+or using a different blocking offset depending on which process is
+holding the lock, but it gets complicated fairly quickly. I'm open to
+suggestions, but I'm not holding my breath.
+
+Regardless, we use this form of locking when we don't have any other
+choice, because it doesn't have the limitations found in System V
+semaphores, and because the normal state of the kernel object in that
+case is unlocked, so the process releasing the lock is also the holder
+of the lock.
+
+The System V semaphore design has a number of other limitations that make
+it inappropriate for this task. Namely:
+
+First, the semaphore key name space is separate from the file system name
+space (although there exist methods for using file names to create
+semaphore keys). If we use a well-known key, there's no reason to believe
+that any particular key will not already be in use, either by another
+instance of the DB application or some other application, in which case
+the DB application will fail. If we create a key, then we have to use a
+file system name to rendezvous and pass around the key.
+
+Second, System V semaphores traditionally have compile-time, system-wide
+limits on the number of semaphore keys that you can have. Typically, that
+number is far too low for any practical purpose. Since the semaphores
+permit more than a single slot per semaphore key, we could try and get
+around that limit by using multiple slots, but that means that the file
+that we're using for rendezvous is going to have to contain slot
+information as well as semaphore key information, and we're going to be
+reading/writing it on every db_mutex_t init or destroy operation. Anyhow,
+similar compile-time, system-wide limits on the number of slots per
+semaphore key kick in, and you're right back where you started.
+
+My fantasy is that once POSIX.1 standard mutexes are in widespread use,
+we can switch to them. My guess is that it won't happen, because the
+POSIX semaphores are only required to work for threads within a process,
+and not independent processes.
+
+Note: there are races in the statistics code, but since it's just
+statistics, I didn't bother fixing them. (The fix requires a mutex tsl,
+so, when/if this code is fixed to do rational locking (see above), then
+change the statistics update code to acquire/release the mutex tsl.)
diff --git a/src/mutex/mut_alloc.c b/src/mutex/mut_alloc.c
new file mode 100644
index 00000000..5df3de53
--- /dev/null
+++ b/src/mutex/mut_alloc.c
@@ -0,0 +1,291 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __mutex_alloc --
+ * Allocate a mutex from the mutex region.
+ *
+ * PUBLIC: int __mutex_alloc __P((ENV *, int, u_int32_t, db_mutex_t *));
+ */
+int
+__mutex_alloc(env, alloc_id, flags, indxp)
+ ENV *env;
+ int alloc_id;
+ u_int32_t flags;
+ db_mutex_t *indxp;
+{
+ /* The caller may depend on us to initialize. */
+ *indxp = MUTEX_INVALID;
+
+	/*
+	 * If this is not an application lock, and either we've turned off
+	 * locking, or the ENV handle isn't thread-safe and this is a
+	 * thread lock or the environment isn't multi-process by
+	 * definition, there's no need for a mutex at all.
+	 */
+ if (alloc_id != MTX_APPLICATION && alloc_id != MTX_MUTEX_TEST &&
+ (F_ISSET(env->dbenv, DB_ENV_NOLOCKING) ||
+ (!F_ISSET(env, ENV_THREAD) &&
+ (LF_ISSET(DB_MUTEX_PROCESS_ONLY) ||
+ F_ISSET(env, ENV_PRIVATE)))))
+ return (0);
+
+ /* Private environments never share mutexes. */
+ if (F_ISSET(env, ENV_PRIVATE))
+ LF_SET(DB_MUTEX_PROCESS_ONLY);
+
+ /*
+ * If we have a region in which to allocate the mutexes, lock it and
+ * do the allocation.
+ */
+ if (!MUTEX_ON(env)) {
+ __db_errx(env, DB_STR("2033",
+ "Mutex allocated before mutex region."));
+ return (__env_panic(env, EINVAL));
+ }
+ return (__mutex_alloc_int(env, 1, alloc_id, flags, indxp));
+}
+
+/*
+ * __mutex_alloc_int --
+ * Internal routine to allocate a mutex.
+ *
+ * PUBLIC: int __mutex_alloc_int
+ * PUBLIC: __P((ENV *, int, int, u_int32_t, db_mutex_t *));
+ */
+int
+__mutex_alloc_int(env, locksys, alloc_id, flags, indxp)
+ ENV *env;
+ int locksys, alloc_id;
+ u_int32_t flags;
+ db_mutex_t *indxp;
+{
+ DB_ENV *dbenv;
+ DB_MUTEX *mutexp;
+ DB_MUTEXMGR *mtxmgr;
+ DB_MUTEXREGION *mtxregion;
+ db_mutex_t i;
+ size_t len;
+ u_int32_t cnt;
+ int ret;
+
+ dbenv = env->dbenv;
+ mtxmgr = env->mutex_handle;
+ mtxregion = mtxmgr->reginfo.primary;
+ ret = 0;
+
+ /*
+ * If we're not initializing the mutex region, then lock the region to
+	 * allocate new mutexes. Drop the lock before initializing the
+	 * mutex; mutex initialization may require a system call.
+ */
+ if (locksys)
+ MUTEX_SYSTEM_LOCK(env);
+
+ if (mtxregion->mutex_next == MUTEX_INVALID) {
+ if (mtxregion->stat.st_mutex_max != 0 &&
+ mtxregion->stat.st_mutex_cnt >=
+ mtxregion->stat.st_mutex_max) {
+nomem: __db_errx(env, DB_STR("2034",
+ "unable to allocate memory for mutex; resize mutex region"));
+ if (locksys)
+ MUTEX_SYSTEM_UNLOCK(env);
+ return (ret == 0 ? ENOMEM : ret);
+ }
+ cnt = mtxregion->stat.st_mutex_cnt / 2;
+ if (cnt < 8)
+ cnt = 8;
+ if (mtxregion->stat.st_mutex_max != 0 &&
+ mtxregion->stat.st_mutex_cnt + cnt >
+ mtxregion->stat.st_mutex_max)
+ cnt = mtxregion->stat.st_mutex_max -
+ mtxregion->stat.st_mutex_cnt;
+ if (F_ISSET(env, ENV_PRIVATE)) {
+ F_SET(&mtxmgr->reginfo, REGION_TRACKED);
+ while (__env_alloc(&mtxmgr->reginfo,
+ (cnt * mtxregion->mutex_size) +
+ mtxregion->stat.st_mutex_align, &i) != 0)
+ if ((cnt >> 1) == 0)
+ break;
+ F_CLR(&mtxmgr->reginfo, REGION_TRACKED);
+ i = (db_mutex_t)ALIGNP_INC(i,
+ mtxregion->stat.st_mutex_align);
+ } else {
+ len = cnt * mtxregion->mutex_size;
+ if ((ret = __env_alloc_extend(&mtxmgr->reginfo,
+ R_ADDR(&mtxmgr->reginfo,
+ mtxregion->mutex_off_alloc), &len)) != 0)
+ goto nomem;
+ cnt = (u_int32_t)(len / mtxregion->mutex_size);
+ i = mtxregion->stat.st_mutex_cnt + 1;
+ }
+ if (cnt == 0)
+ goto nomem;
+ mutexp = MUTEXP_SET(env, i);
+ mtxregion->stat.st_mutex_free = cnt;
+ mtxregion->mutex_next = i;
+ mtxregion->stat.st_mutex_cnt += cnt;
+ while (--cnt > 0) {
+ mutexp->flags = 0;
+ if (F_ISSET(env, ENV_PRIVATE))
+ mutexp->mutex_next_link =
+ (uintptr_t)(mutexp + 1);
+ else
+ mutexp->mutex_next_link = ++i;
+ mutexp++;
+ }
+ mutexp->flags = 0;
+ mutexp->mutex_next_link = MUTEX_INVALID;
+ }
+
+ *indxp = mtxregion->mutex_next;
+ mutexp = MUTEXP_SET(env, *indxp);
+ DB_ASSERT(env,
+ ((uintptr_t)mutexp & (dbenv->mutex_align - 1)) == 0);
+ mtxregion->mutex_next = mutexp->mutex_next_link;
+
+ --mtxregion->stat.st_mutex_free;
+ ++mtxregion->stat.st_mutex_inuse;
+ if (mtxregion->stat.st_mutex_inuse > mtxregion->stat.st_mutex_inuse_max)
+ mtxregion->stat.st_mutex_inuse_max =
+ mtxregion->stat.st_mutex_inuse;
+ if (locksys)
+ MUTEX_SYSTEM_UNLOCK(env);
+
+ /* Initialize the mutex. */
+ memset(mutexp, 0, sizeof(*mutexp));
+ F_SET(mutexp, DB_MUTEX_ALLOCATED |
+ LF_ISSET(DB_MUTEX_LOGICAL_LOCK |
+ DB_MUTEX_PROCESS_ONLY | DB_MUTEX_SHARED));
+
+ /*
+ * If the mutex is associated with a single process, set the process
+ * ID. If the application ever calls DbEnv::failchk, we'll need the
+ * process ID to know if the mutex is still in use.
+ */
+ if (LF_ISSET(DB_MUTEX_PROCESS_ONLY))
+ dbenv->thread_id(dbenv, &mutexp->pid, NULL);
+
+#ifdef HAVE_STATISTICS
+ mutexp->alloc_id = alloc_id;
+#else
+ COMPQUIET(alloc_id, 0);
+#endif
+
+ if ((ret = __mutex_init(env, *indxp, flags)) != 0)
+ (void)__mutex_free_int(env, locksys, indxp);
+
+ return (ret);
+}
+
+/*
+ * __mutex_free --
+ * Free a mutex.
+ *
+ * PUBLIC: int __mutex_free __P((ENV *, db_mutex_t *));
+ */
+int
+__mutex_free(env, indxp)
+ ENV *env;
+ db_mutex_t *indxp;
+{
+ /*
+	 * There is no explicit ordering in how the regions are cleaned
+ * up and/or discarded when an environment is destroyed (either a
+ * private environment is closed or a public environment is removed).
+ * The way we deal with mutexes is to clean up all remaining mutexes
+ * when we close the mutex environment (because we have to be able to
+ * do that anyway, after a crash), which means we don't have to deal
+ * with region cleanup ordering on normal environment destruction.
+	 * All that said, what it really means is we can get here without an
+ * mpool region. It's OK, the mutex has been, or will be, destroyed.
+ *
+ * If the mutex has never been configured, we're done.
+ */
+ if (!MUTEX_ON(env) || *indxp == MUTEX_INVALID)
+ return (0);
+
+ return (__mutex_free_int(env, 1, indxp));
+}
+
+/*
+ * __mutex_free_int --
+ * Internal routine to free a mutex.
+ *
+ * PUBLIC: int __mutex_free_int __P((ENV *, int, db_mutex_t *));
+ */
+int
+__mutex_free_int(env, locksys, indxp)
+ ENV *env;
+ int locksys;
+ db_mutex_t *indxp;
+{
+ DB_MUTEX *mutexp;
+ DB_MUTEXMGR *mtxmgr;
+ DB_MUTEXREGION *mtxregion;
+ db_mutex_t mutex;
+ int ret;
+
+ mutex = *indxp;
+ *indxp = MUTEX_INVALID;
+
+ mtxmgr = env->mutex_handle;
+ mtxregion = mtxmgr->reginfo.primary;
+ mutexp = MUTEXP_SET(env, mutex);
+
+ DB_ASSERT(env, F_ISSET(mutexp, DB_MUTEX_ALLOCATED));
+ F_CLR(mutexp, DB_MUTEX_ALLOCATED);
+
+ ret = __mutex_destroy(env, mutex);
+
+ if (locksys)
+ MUTEX_SYSTEM_LOCK(env);
+
+ /* Link the mutex on the head of the free list. */
+ mutexp->mutex_next_link = mtxregion->mutex_next;
+ mtxregion->mutex_next = mutex;
+ ++mtxregion->stat.st_mutex_free;
+ --mtxregion->stat.st_mutex_inuse;
+
+ if (locksys)
+ MUTEX_SYSTEM_UNLOCK(env);
+
+ return (ret);
+}
+
+/*
+ * __mutex_refresh --
+ * Reinitialize a mutex, if we are not sure of its state.
+ *
+ * PUBLIC: int __mutex_refresh __P((ENV *, db_mutex_t));
+ */
+int
+__mutex_refresh(env, mutex)
+ ENV *env;
+ db_mutex_t mutex;
+{
+ DB_MUTEX *mutexp;
+ u_int32_t flags;
+ int ret;
+
+ mutexp = MUTEXP_SET(env, mutex);
+ flags = mutexp->flags;
+ if ((ret = __mutex_destroy(env, mutex)) == 0) {
+ memset(mutexp, 0, sizeof(*mutexp));
+ F_SET(mutexp, DB_MUTEX_ALLOCATED |
+ LF_ISSET(DB_MUTEX_LOGICAL_LOCK |
+ DB_MUTEX_PROCESS_ONLY | DB_MUTEX_SHARED));
+ LF_CLR(DB_MUTEX_LOCKED);
+ ret = __mutex_init(env, mutex, flags);
+ }
+ return (ret);
+}
diff --git a/src/mutex/mut_failchk.c b/src/mutex/mut_failchk.c
new file mode 100644
index 00000000..1425389f
--- /dev/null
+++ b/src/mutex/mut_failchk.c
@@ -0,0 +1,76 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __mut_failchk --
+ * Check for mutexes held by dead processes.
+ *
+ * PUBLIC: int __mut_failchk __P((ENV *));
+ */
+int
+__mut_failchk(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+ DB_MUTEX *mutexp;
+ DB_MUTEXMGR *mtxmgr;
+ DB_MUTEXREGION *mtxregion;
+ db_mutex_t i;
+ int ret;
+ char buf[DB_THREADID_STRLEN];
+ db_threadid_t unused;
+
+ if (F_ISSET(env, ENV_PRIVATE))
+ return (0);
+
+ DB_THREADID_INIT(unused);
+
+ dbenv = env->dbenv;
+ mtxmgr = env->mutex_handle;
+ mtxregion = mtxmgr->reginfo.primary;
+ ret = 0;
+
+ MUTEX_SYSTEM_LOCK(env);
+	for (i = 1; i <= mtxregion->stat.st_mutex_cnt; ++i) {
+ mutexp = MUTEXP_SET(env, i);
+
+ /*
+ * We're looking for per-process mutexes where the process
+ * has died.
+ */
+ if (!F_ISSET(mutexp, DB_MUTEX_ALLOCATED) ||
+ !F_ISSET(mutexp, DB_MUTEX_PROCESS_ONLY))
+ continue;
+
+ /*
+ * The thread that allocated the mutex may have exited, but
+ * we cannot reclaim the mutex if the process is still alive.
+ */
+ if (dbenv->is_alive(
+ dbenv, mutexp->pid, unused, DB_MUTEX_PROCESS_ONLY))
+ continue;
+
+ __db_msg(env, DB_STR_A("2017",
+ "Freeing mutex for process: %s", "%s"),
+ dbenv->thread_id_string(dbenv, mutexp->pid, unused, buf));
+
+ /* Unlock and free the mutex. */
+ if (F_ISSET(mutexp, DB_MUTEX_LOCKED))
+ MUTEX_UNLOCK(env, i);
+
+ if ((ret = __mutex_free_int(env, 0, &i)) != 0)
+ break;
+ }
+ MUTEX_SYSTEM_UNLOCK(env);
+
+ return (ret);
+}
diff --git a/src/mutex/mut_fcntl.c b/src/mutex/mut_fcntl.c
new file mode 100644
index 00000000..0694aa59
--- /dev/null
+++ b/src/mutex/mut_fcntl.c
@@ -0,0 +1,248 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+static inline int __db_fcntl_mutex_lock_int
+ __P((ENV *, db_mutex_t, db_timeout_t, int));
+
+/*
+ * __db_fcntl_mutex_init --
+ * Initialize a fcntl mutex.
+ *
+ * PUBLIC: int __db_fcntl_mutex_init __P((ENV *, db_mutex_t, u_int32_t));
+ */
+int
+__db_fcntl_mutex_init(env, mutex, flags)
+ ENV *env;
+ db_mutex_t mutex;
+ u_int32_t flags;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(mutex, MUTEX_INVALID);
+ COMPQUIET(flags, 0);
+
+ return (0);
+}
+
+/*
+ * __db_fcntl_mutex_lock_int
+ * Internal function to lock a mutex, blocking only when requested
+ */
+inline int
+__db_fcntl_mutex_lock_int(env, mutex, timeout, wait)
+ ENV *env;
+ db_mutex_t mutex;
+ db_timeout_t timeout;
+ int wait;
+{
+ DB_ENV *dbenv;
+ DB_MUTEX *mutexp;
+ DB_THREAD_INFO *ip;
+ struct flock k_lock;
+ int locked, ms, ret;
+ db_timespec now, timespec;
+ db_timeout_t time_left;
+
+ dbenv = env->dbenv;
+
+ if (!MUTEX_ON(env) || F_ISSET(dbenv, DB_ENV_NOLOCKING))
+ return (0);
+
+ mutexp = MUTEXP_SET(env, mutex);
+
+ CHECK_MTX_THREAD(env, mutexp);
+
+#ifdef HAVE_STATISTICS
+ if (F_ISSET(mutexp, DB_MUTEX_LOCKED))
+ ++mutexp->mutex_set_wait;
+ else
+ ++mutexp->mutex_set_nowait;
+#endif
+
+ /* Initialize the lock. */
+ k_lock.l_whence = SEEK_SET;
+ k_lock.l_start = mutex;
+ k_lock.l_len = 1;
+
+ if (timeout != 0) {
+ timespecclear(&timespec);
+ __clock_set_expires(env, &timespec, timeout);
+ }
+
+ /*
+ * Only check the thread state once, by initializing the thread
+ * control block pointer to null. If it is not the failchk
+ * thread, then ip will have a valid value subsequent times
+ * in the loop.
+ */
+ ip = NULL;
+
+ for (locked = 0;;) {
+ /*
+ * Wait for the lock to become available; wait 1ms initially,
+ * up to 1 second.
+ */
+ for (ms = 1; F_ISSET(mutexp, DB_MUTEX_LOCKED);) {
+ if (F_ISSET(dbenv, DB_ENV_FAILCHK) &&
+ ip == NULL && dbenv->is_alive(dbenv,
+ mutexp->pid, mutexp->tid, 0) == 0) {
+ ret = __env_set_state(env, &ip, THREAD_VERIFY);
+ if (ret != 0 ||
+ ip->dbth_state == THREAD_FAILCHK)
+ return (DB_RUNRECOVERY);
+ }
+ if (!wait)
+ return (DB_LOCK_NOTGRANTED);
+ if (timeout != 0) {
+ timespecclear(&now);
+ if (__clock_expired(env, &now, &timespec))
+ return (DB_TIMEOUT);
+ DB_TIMESPEC_TO_TIMEOUT(time_left, &now, 0);
+ time_left = timeout - time_left;
+ if (ms * US_PER_MS > time_left)
+ ms = time_left / US_PER_MS;
+ }
+ __os_yield(NULL, 0, ms * US_PER_MS);
+ if ((ms <<= 1) > MS_PER_SEC)
+ ms = MS_PER_SEC;
+ }
+
+ /* Acquire an exclusive kernel lock on the byte. */
+ k_lock.l_type = F_WRLCK;
+ if (fcntl(env->lockfhp->fd, F_SETLKW, &k_lock))
+ goto err;
+
+ /* If the resource is still available, it's ours. */
+ if (!F_ISSET(mutexp, DB_MUTEX_LOCKED)) {
+ locked = 1;
+
+ F_SET(mutexp, DB_MUTEX_LOCKED);
+ dbenv->thread_id(dbenv, &mutexp->pid, &mutexp->tid);
+ }
+
+ /* Release the kernel lock. */
+ k_lock.l_type = F_UNLCK;
+ if (fcntl(env->lockfhp->fd, F_SETLK, &k_lock))
+ goto err;
+
+ /*
+ * If we got the resource lock we're done.
+ *
+ * !!!
+ * We can't check to see if the lock is ours, because we may
+ * be trying to block ourselves in the lock manager, and so
+ * the holder of the lock that's preventing us from getting
+ * the lock may be us! (Seriously.)
+ */
+ if (locked)
+ break;
+ }
+
+#ifdef DIAGNOSTIC
+ /*
+ * We want to switch threads as often as possible. Yield every time
+ * we get a mutex to ensure contention.
+ */
+ if (F_ISSET(dbenv, DB_ENV_YIELDCPU))
+ __os_yield(env, 0, 0);
+#endif
+ return (0);
+
+err: ret = __os_get_syserr();
+ __db_syserr(env, ret, DB_STR("2019", "fcntl lock failed"));
+ return (__env_panic(env, __os_posix_err(ret)));
+}
+
+/*
+ * __db_fcntl_mutex_lock
+ * Lock a mutex, blocking if necessary.
+ *
+ * PUBLIC: int __db_fcntl_mutex_lock __P((ENV *, db_mutex_t, db_timeout_t));
+ */
+int
+__db_fcntl_mutex_lock(env, mutex, timeout)
+ ENV *env;
+ db_mutex_t mutex;
+ db_timeout_t timeout;
+{
+ return (__db_fcntl_mutex_lock_int(env, mutex, timeout, 1));
+}
+
+/*
+ * __db_fcntl_mutex_trylock
+ * Try to lock a mutex, without blocking when it is busy.
+ *
+ * PUBLIC: int __db_fcntl_mutex_trylock __P((ENV *, db_mutex_t));
+ */
+int
+__db_fcntl_mutex_trylock(env, mutex)
+ ENV *env;
+ db_mutex_t mutex;
+{
+ return (__db_fcntl_mutex_lock_int(env, mutex, 0, 0));
+}
+
+/*
+ * __db_fcntl_mutex_unlock --
+ * Release a mutex.
+ *
+ * PUBLIC: int __db_fcntl_mutex_unlock __P((ENV *, db_mutex_t));
+ */
+int
+__db_fcntl_mutex_unlock(env, mutex)
+ ENV *env;
+ db_mutex_t mutex;
+{
+ DB_ENV *dbenv;
+ DB_MUTEX *mutexp;
+
+ dbenv = env->dbenv;
+
+ if (!MUTEX_ON(env) || F_ISSET(dbenv, DB_ENV_NOLOCKING))
+ return (0);
+
+ mutexp = MUTEXP_SET(env, mutex);
+
+#ifdef DIAGNOSTIC
+ if (!F_ISSET(mutexp, DB_MUTEX_LOCKED)) {
+ __db_errx(env, DB_STR("2020",
+ "fcntl unlock failed: lock already unlocked"));
+ return (__env_panic(env, EACCES));
+ }
+#endif
+
+ /*
+ * Release the resource. We don't have to acquire any locks because
+ * processes trying to acquire the lock are waiting for the flag to
+ * go to 0. Once that happens the waiters will serialize acquiring
+ * an exclusive kernel lock before locking the mutex.
+ */
+ F_CLR(mutexp, DB_MUTEX_LOCKED);
+
+ return (0);
+}
+
+/*
+ * __db_fcntl_mutex_destroy --
+ * Destroy a mutex.
+ *
+ * PUBLIC: int __db_fcntl_mutex_destroy __P((ENV *, db_mutex_t));
+ */
+int
+__db_fcntl_mutex_destroy(env, mutex)
+ ENV *env;
+ db_mutex_t mutex;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(mutex, MUTEX_INVALID);
+
+ return (0);
+}
diff --git a/src/mutex/mut_method.c b/src/mutex/mut_method.c
new file mode 100644
index 00000000..cb666082
--- /dev/null
+++ b/src/mutex/mut_method.c
@@ -0,0 +1,482 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __mutex_alloc_pp --
+ * Allocate a mutex, application method.
+ *
+ * PUBLIC: int __mutex_alloc_pp __P((DB_ENV *, u_int32_t, db_mutex_t *));
+ */
+int
+__mutex_alloc_pp(dbenv, flags, indxp)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+ db_mutex_t *indxp;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ if ((ret = __db_fchk(env, "DB_ENV->mutex_alloc",
+ flags, DB_MUTEX_PROCESS_ONLY | DB_MUTEX_SELF_BLOCK)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+ ret = __mutex_alloc(env, MTX_APPLICATION, flags, indxp);
+ ENV_LEAVE(env, ip);
+
+ return (ret);
+}
+
+/*
+ * __mutex_free_pp --
+ * Destroy a mutex, application method.
+ *
+ * PUBLIC: int __mutex_free_pp __P((DB_ENV *, db_mutex_t));
+ */
+int
+__mutex_free_pp(dbenv, indx)
+ DB_ENV *dbenv;
+ db_mutex_t indx;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ if (indx == MUTEX_INVALID)
+ return (EINVAL);
+
+ /*
+ * Internally Berkeley DB passes around the db_mutex_t address on
+ * free, because we want to make absolutely sure the slot gets
+ * overwritten with MUTEX_INVALID. We don't export MUTEX_INVALID,
+ * so we don't export that part of the API, either.
+ */
+ ENV_ENTER(env, ip);
+ ret = __mutex_free(env, &indx);
+ ENV_LEAVE(env, ip);
+
+ return (ret);
+}
+
+/*
+ * __mutex_lock_pp --
+ * Lock a mutex, application method.
+ *
+ * PUBLIC: int __mutex_lock_pp __P((DB_ENV *, db_mutex_t));
+ */
+int
+__mutex_lock_pp(dbenv, indx)
+ DB_ENV *dbenv;
+ db_mutex_t indx;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ if (indx == MUTEX_INVALID)
+ return (EINVAL);
+
+ ENV_ENTER(env, ip);
+ ret = __mutex_lock(env, indx);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __mutex_unlock_pp --
+ * Unlock a mutex, application method.
+ *
+ * PUBLIC: int __mutex_unlock_pp __P((DB_ENV *, db_mutex_t));
+ */
+int
+__mutex_unlock_pp(dbenv, indx)
+ DB_ENV *dbenv;
+ db_mutex_t indx;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ if (indx == MUTEX_INVALID)
+ return (EINVAL);
+
+ ENV_ENTER(env, ip);
+ ret = __mutex_unlock(env, indx);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __mutex_get_align --
+ * DB_ENV->mutex_get_align.
+ *
+ * PUBLIC: int __mutex_get_align __P((DB_ENV *, u_int32_t *));
+ */
+int
+__mutex_get_align(dbenv, alignp)
+ DB_ENV *dbenv;
+ u_int32_t *alignp;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ if (MUTEX_ON(env)) {
+ /* Cannot be set after open, no lock required to read. */
+ *alignp = ((DB_MUTEXREGION *)
+ env->mutex_handle->reginfo.primary)->stat.st_mutex_align;
+ } else
+ *alignp = dbenv->mutex_align;
+ return (0);
+}
+
+/*
+ * __mutex_set_align --
+ * DB_ENV->mutex_set_align.
+ *
+ * PUBLIC: int __mutex_set_align __P((DB_ENV *, u_int32_t));
+ */
+int
+__mutex_set_align(dbenv, align)
+ DB_ENV *dbenv;
+ u_int32_t align;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+	ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->mutex_set_align");
+
+ if (align == 0 || !POWER_OF_TWO(align)) {
+ __db_errx(env, DB_STR("2018",
+"DB_ENV->mutex_set_align: alignment value must be a non-zero power-of-two"));
+ return (EINVAL);
+ }
+
+ dbenv->mutex_align = align;
+ return (0);
+}
+
+/*
+ * __mutex_get_increment --
+ * DB_ENV->mutex_get_increment.
+ *
+ * PUBLIC: int __mutex_get_increment __P((DB_ENV *, u_int32_t *));
+ */
+int
+__mutex_get_increment(dbenv, incrementp)
+ DB_ENV *dbenv;
+ u_int32_t *incrementp;
+{
+ /*
+ * We don't maintain the increment in the region (it just makes
+ * no sense). Return whatever we have configured on this handle,
+ * nobody is ever going to notice.
+ */
+ *incrementp = dbenv->mutex_inc;
+ return (0);
+}
+
+/*
+ * __mutex_set_increment --
+ * DB_ENV->mutex_set_increment.
+ *
+ * PUBLIC: int __mutex_set_increment __P((DB_ENV *, u_int32_t));
+ */
+int
+__mutex_set_increment(dbenv, increment)
+ DB_ENV *dbenv;
+ u_int32_t increment;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+	ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->mutex_set_increment");
+
+ dbenv->mutex_cnt = 0;
+ dbenv->mutex_inc = increment;
+ return (0);
+}
+
+/*
+ * __mutex_get_init --
+ * DB_ENV->mutex_get_init.
+ *
+ * PUBLIC: int __mutex_get_init __P((DB_ENV *, u_int32_t *));
+ */
+int
+__mutex_get_init(dbenv, initp)
+ DB_ENV *dbenv;
+ u_int32_t *initp;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ if (MUTEX_ON(env)) {
+ /* Cannot be set after open, no lock required to read. */
+ *initp = ((DB_MUTEXREGION *)
+ env->mutex_handle->reginfo.primary)->stat.st_mutex_init;
+ } else
+ *initp = dbenv->mutex_cnt;
+ return (0);
+}
+
+/*
+ * __mutex_set_init --
+ * DB_ENV->mutex_set_init.
+ *
+ * PUBLIC: int __mutex_set_init __P((DB_ENV *, u_int32_t));
+ */
+int
+__mutex_set_init(dbenv, init)
+ DB_ENV *dbenv;
+ u_int32_t init;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+	ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->mutex_set_init");
+
+ dbenv->mutex_cnt = init;
+ dbenv->mutex_inc = 0;
+ return (0);
+}
+
+/*
+ * __mutex_get_max --
+ * DB_ENV->mutex_get_max.
+ *
+ * PUBLIC: int __mutex_get_max __P((DB_ENV *, u_int32_t *));
+ */
+int
+__mutex_get_max(dbenv, maxp)
+ DB_ENV *dbenv;
+ u_int32_t *maxp;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ if (MUTEX_ON(env)) {
+ /* Cannot be set after open, no lock required to read. */
+ *maxp = ((DB_MUTEXREGION *)
+ env->mutex_handle->reginfo.primary)->stat.st_mutex_max;
+ } else
+ *maxp = dbenv->mutex_max;
+ return (0);
+}
+
+/*
+ * __mutex_set_max --
+ * DB_ENV->mutex_set_max.
+ *
+ * PUBLIC: int __mutex_set_max __P((DB_ENV *, u_int32_t));
+ */
+int
+__mutex_set_max(dbenv, max)
+ DB_ENV *dbenv;
+ u_int32_t max;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+	ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->mutex_set_max");
+
+ dbenv->mutex_max = max;
+ dbenv->mutex_inc = 0;
+ return (0);
+}
+
+/*
+ * __mutex_get_tas_spins --
+ * DB_ENV->mutex_get_tas_spins.
+ *
+ * PUBLIC: int __mutex_get_tas_spins __P((DB_ENV *, u_int32_t *));
+ */
+int
+__mutex_get_tas_spins(dbenv, tas_spinsp)
+ DB_ENV *dbenv;
+ u_int32_t *tas_spinsp;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ if (MUTEX_ON(env)) {
+ /* Cannot be set after open, no lock required to read. */
+ *tas_spinsp = ((DB_MUTEXREGION *)env->
+ mutex_handle->reginfo.primary)->stat.st_mutex_tas_spins;
+ } else
+ *tas_spinsp = dbenv->mutex_tas_spins;
+ return (0);
+}
+
+/*
+ * __mutex_set_tas_spins --
+ * DB_ENV->mutex_set_tas_spins.
+ *
+ * PUBLIC: int __mutex_set_tas_spins __P((DB_ENV *, u_int32_t));
+ */
+int
+__mutex_set_tas_spins(dbenv, tas_spins)
+ DB_ENV *dbenv;
+ u_int32_t tas_spins;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ /*
+ * Bound the value -- less than 1 makes no sense, greater than 1M
+ * makes no sense.
+ */
+ if (tas_spins == 0)
+ tas_spins = 1;
+ else if (tas_spins > 1000000)
+ tas_spins = 1000000;
+
+ /*
+ * There's a theoretical race here, but I'm not interested in locking
+ * the test-and-set spin count. The worst possibility is a thread
+ * reads out a bad spin count and spins until it gets the lock, but
+ * that's awfully unlikely.
+ */
+ if (MUTEX_ON(env))
+ ((DB_MUTEXREGION *)env->mutex_handle
+ ->reginfo.primary)->stat.st_mutex_tas_spins = tas_spins;
+ else
+ dbenv->mutex_tas_spins = tas_spins;
+ return (0);
+}
+
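+/*
+ * Taken together, the methods above tune the mutex region before the
+ * environment is opened. A sketch of a typical configuration sequence
+ * (the values here are hypothetical, not recommendations):
+ *
+ *	(void)dbenv->mutex_set_align(dbenv, 64);
+ *	(void)dbenv->mutex_set_init(dbenv, 2000);
+ *	(void)dbenv->mutex_set_max(dbenv, 20000);
+ *	(void)dbenv->mutex_set_tas_spins(dbenv, 50);
+ */
+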
+#if !defined(HAVE_ATOMIC_SUPPORT) && defined(HAVE_MUTEX_SUPPORT)
+/*
+ * Provide atomic operations for platforms which have mutexes yet do not have
+ * native atomic operations configured. They are emulated by protecting the
+ * operation with a mutex. The address of the atomic value selects which
+ * mutex to use.
+ */
+/*
+ * atomic_get_mutex -
+ * Map an address to the mutex to use to atomically modify it
+ */
+static inline db_mutex_t
+atomic_get_mutex(env, v)
+ ENV *env;
+ db_atomic_t *v;
+{
+ u_int index;
+ DB_MUTEXREGION *mtxreg;
+
+ if (!MUTEX_ON(env))
+ return (MUTEX_INVALID);
+ index = (u_int)(((uintptr_t) (v)) >> 6) % MAX_ATOMIC_MUTEXES;
+ mtxreg = (DB_MUTEXREGION *)env->mutex_handle->reginfo.primary;
+ return (mtxreg->mtx_atomic[index]);
+}
+
+/*
+ * __atomic_inc
+ * Use a mutex to provide an atomic increment function
+ *
+ * PUBLIC: #if !defined(HAVE_ATOMIC_SUPPORT) && defined(HAVE_MUTEX_SUPPORT)
+ * PUBLIC: atomic_value_t __atomic_inc __P((ENV *, db_atomic_t *));
+ * PUBLIC: #endif
+ */
+atomic_value_t
+__atomic_inc(env, v)
+ ENV *env;
+ db_atomic_t *v;
+{
+ db_mutex_t mtx;
+ int ret;
+
+ mtx = atomic_get_mutex(env, v);
+ MUTEX_LOCK(env, mtx);
+ ret = ++v->value;
+ MUTEX_UNLOCK(env, mtx);
+
+ return (ret);
+}
+
+/*
+ * __atomic_dec
+ * Use a mutex to provide an atomic decrement function
+ *
+ * PUBLIC: #if !defined(HAVE_ATOMIC_SUPPORT) && defined(HAVE_MUTEX_SUPPORT)
+ * PUBLIC: atomic_value_t __atomic_dec __P((ENV *, db_atomic_t *));
+ * PUBLIC: #endif
+ */
+atomic_value_t
+__atomic_dec(env, v)
+ ENV *env;
+ db_atomic_t *v;
+{
+ db_mutex_t mtx;
+ int ret;
+
+ mtx = atomic_get_mutex(env, v);
+ MUTEX_LOCK(env, mtx);
+ ret = --v->value;
+ MUTEX_UNLOCK(env, mtx);
+
+ return (ret);
+}
+
+/*
+ * atomic_compare_exchange
+ *	Use a mutex to provide an atomic compare-and-exchange function
+ *
+ * PUBLIC: #if !defined(HAVE_ATOMIC_SUPPORT) && defined(HAVE_MUTEX_SUPPORT)
+ * PUBLIC: int atomic_compare_exchange
+ * PUBLIC: __P((ENV *, db_atomic_t *, atomic_value_t, atomic_value_t));
+ * PUBLIC: #endif
+ * Returns 1 if the *v was equal to oldval, else 0
+ *
+ * Side Effect:
+ * Sets the value to newval if and only if returning 1
+ */
+int
+atomic_compare_exchange(env, v, oldval, newval)
+ ENV *env;
+ db_atomic_t *v;
+ atomic_value_t oldval;
+ atomic_value_t newval;
+{
+ db_mutex_t mtx;
+ int ret;
+
+ if (atomic_read(v) != oldval)
+ return (0);
+
+ mtx = atomic_get_mutex(env, v);
+ MUTEX_LOCK(env, mtx);
+ ret = atomic_read(v) == oldval;
+ if (ret)
+ atomic_init(v, newval);
+ MUTEX_UNLOCK(env, mtx);
+
+ return (ret);
+}
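+
+/*
+ * A sketch of the expected usage pattern (the caller is hypothetical):
+ * retry an atomic increment until no other thread has raced us.
+ *
+ *	do
+ *		oldval = atomic_read(v);
+ *	while (!atomic_compare_exchange(env, v, oldval, oldval + 1));
+ */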
+#endif
diff --git a/src/mutex/mut_pthread.c b/src/mutex/mut_pthread.c
new file mode 100644
index 00000000..1ec4fb9c
--- /dev/null
+++ b/src/mutex/mut_pthread.c
@@ -0,0 +1,770 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/lock.h"
+
+/*
+ * This is where we load in architecture/compiler specific mutex code.
+ */
+#define LOAD_ACTUAL_MUTEX_CODE
+
+#ifdef HAVE_MUTEX_SOLARIS_LWP
+#define pthread_cond_destroy(x) 0
+#define pthread_cond_signal _lwp_cond_signal
+#define pthread_cond_broadcast _lwp_cond_broadcast
+#define pthread_cond_wait _lwp_cond_wait
+#define pthread_mutex_destroy(x) 0
+#define pthread_mutex_lock _lwp_mutex_lock
+#define pthread_mutex_trylock _lwp_mutex_trylock
+#define pthread_mutex_unlock _lwp_mutex_unlock
+#endif
+#ifdef HAVE_MUTEX_UI_THREADS
+#define	pthread_cond_destroy(x)	cond_destroy(x)
+#define	pthread_cond_signal	cond_signal
+#define pthread_cond_broadcast cond_broadcast
+#define pthread_cond_wait cond_wait
+#define pthread_mutex_destroy mutex_destroy
+#define pthread_mutex_lock mutex_lock
+#define pthread_mutex_trylock mutex_trylock
+#define pthread_mutex_unlock mutex_unlock
+#endif
+
+/*
+ * According to HP-UX engineers contacted by Netscape,
+ * pthread_mutex_unlock() will occasionally return EFAULT for no good reason
+ * on mutexes in shared memory regions, and the correct caller behavior
+ * is to try again. Do so, up to EFAULT_RETRY_ATTEMPTS consecutive times.
+ * Note that we don't bother to restrict this to HP-UX;
+ * it should be harmless elsewhere. [#2471]
+ */
+#define EFAULT_RETRY_ATTEMPTS 5
+#define RETRY_ON_EFAULT(func_invocation, ret) do { \
+ int i; \
+ i = EFAULT_RETRY_ATTEMPTS; \
+ do { \
+ RET_SET((func_invocation), ret); \
+ } while (ret == EFAULT && --i > 0); \
+} while (0)
+
+/*
+ * IBM's MVS pthread mutex implementation returns -1 and sets errno rather than
+ * returning errno itself. As -1 is not a valid errno value, assume functions
+ * returning -1 have set errno. If they haven't, fall back on an
+ * arbitrary but valid error value, EAGAIN.
+ */
+#define RET_SET(f, ret) do { \
+ if (((ret) = (f)) == -1 && ((ret) = errno) == 0) \
+ (ret) = EAGAIN; \
+} while (0)
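+
+/*
+ * For example, RET_SET((pthread_mutex_lock(&m)), ret) leaves the error
+ * number in ret whether pthread_mutex_lock returned it directly,
+ * returned -1 and set errno, or returned -1 without setting errno at
+ * all (in which case ret becomes EAGAIN).
+ */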
+
+/*
+ * __db_pthread_mutex_init --
+ * Initialize a pthread mutex: either a native one or
+ * just the mutex for block/wakeup of a hybrid test-and-set mutex
+ *
+ * PUBLIC: int __db_pthread_mutex_init __P((ENV *, db_mutex_t, u_int32_t));
+ */
+int
+__db_pthread_mutex_init(env, mutex, flags)
+ ENV *env;
+ db_mutex_t mutex;
+ u_int32_t flags;
+{
+ DB_MUTEX *mutexp;
+ int ret;
+
+ mutexp = MUTEXP_SET(env, mutex);
+ ret = 0;
+
+#ifndef HAVE_MUTEX_HYBRID
+ /* Can't have self-blocking shared latches. */
+ DB_ASSERT(env, !LF_ISSET(DB_MUTEX_SELF_BLOCK) ||
+ !LF_ISSET(DB_MUTEX_SHARED));
+#endif
+
+#ifdef HAVE_MUTEX_PTHREADS
+ {
+#ifndef HAVE_MUTEX_THREAD_ONLY
+ pthread_condattr_t condattr;
+ pthread_mutexattr_t mutexattr;
+#endif
+ pthread_condattr_t *condattrp = NULL;
+ pthread_mutexattr_t *mutexattrp = NULL;
+
+#ifndef HAVE_MUTEX_HYBRID
+ if (LF_ISSET(DB_MUTEX_SHARED)) {
+#if defined(HAVE_SHARED_LATCHES)
+ pthread_rwlockattr_t rwlockattr, *rwlockattrp = NULL;
+#ifndef HAVE_MUTEX_THREAD_ONLY
+ if (!LF_ISSET(DB_MUTEX_PROCESS_ONLY)) {
+ RET_SET((pthread_rwlockattr_init(&rwlockattr)), ret);
+ if (ret != 0)
+ goto err;
+ RET_SET((pthread_rwlockattr_setpshared(
+ &rwlockattr, PTHREAD_PROCESS_SHARED)), ret);
+ rwlockattrp = &rwlockattr;
+ }
+#endif
+
+ if (ret == 0)
+ RET_SET((pthread_rwlock_init(&mutexp->u.rwlock,
+ rwlockattrp)), ret);
+ if (rwlockattrp != NULL)
+ (void)pthread_rwlockattr_destroy(rwlockattrp);
+
+ F_SET(mutexp, DB_MUTEX_SHARED);
+ /* For rwlocks, we're done - cannot use the mutex or cond */
+ goto err;
+#endif
+ }
+#endif
+#ifndef HAVE_MUTEX_THREAD_ONLY
+ if (!LF_ISSET(DB_MUTEX_PROCESS_ONLY)) {
+ RET_SET((pthread_mutexattr_init(&mutexattr)), ret);
+ if (ret != 0)
+ goto err;
+ RET_SET((pthread_mutexattr_setpshared(
+ &mutexattr, PTHREAD_PROCESS_SHARED)), ret);
+ mutexattrp = &mutexattr;
+ }
+#endif
+
+ if (ret == 0)
+ RET_SET(
+ (pthread_mutex_init(&mutexp->u.m.mutex, mutexattrp)), ret);
+
+ if (mutexattrp != NULL)
+ (void)pthread_mutexattr_destroy(mutexattrp);
+ if (ret != 0)
+ goto err;
+ if (LF_ISSET(DB_MUTEX_SELF_BLOCK)) {
+#ifndef HAVE_MUTEX_THREAD_ONLY
+ if (!LF_ISSET(DB_MUTEX_PROCESS_ONLY)) {
+ RET_SET((pthread_condattr_init(&condattr)), ret);
+ if (ret != 0)
+ goto err;
+
+ condattrp = &condattr;
+ RET_SET((pthread_condattr_setpshared(
+ &condattr, PTHREAD_PROCESS_SHARED)), ret);
+ }
+#endif
+
+ if (ret == 0)
+ RET_SET((pthread_cond_init(
+ &mutexp->u.m.cond, condattrp)), ret);
+
+ F_SET(mutexp, DB_MUTEX_SELF_BLOCK);
+ if (condattrp != NULL)
+ (void)pthread_condattr_destroy(condattrp);
+ }
+
+ }
+#endif
+#ifdef HAVE_MUTEX_SOLARIS_LWP
+ /*
+ * XXX
+ * Gcc complains about missing braces in the static initializations of
+ * lwp_cond_t and lwp_mutex_t structures because the structures contain
+ * sub-structures/unions and the Solaris include file that defines the
+ * initialization values doesn't have surrounding braces. There's not
+ * much we can do.
+ */
+ if (LF_ISSET(DB_MUTEX_PROCESS_ONLY)) {
+ static lwp_mutex_t mi = DEFAULTMUTEX;
+
+ mutexp->mutex = mi;
+ } else {
+ static lwp_mutex_t mi = SHAREDMUTEX;
+
+ mutexp->mutex = mi;
+ }
+ if (LF_ISSET(DB_MUTEX_SELF_BLOCK)) {
+ if (LF_ISSET(DB_MUTEX_PROCESS_ONLY)) {
+ static lwp_cond_t ci = DEFAULTCV;
+
+ mutexp->cond = ci;
+ } else {
+ static lwp_cond_t ci = SHAREDCV;
+
+ mutexp->cond = ci;
+ }
+ F_SET(mutexp, DB_MUTEX_SELF_BLOCK);
+ }
+#endif
+#ifdef HAVE_MUTEX_UI_THREADS
+ {
+ int type;
+
+ type = LF_ISSET(DB_MUTEX_PROCESS_ONLY) ? USYNC_THREAD : USYNC_PROCESS;
+
+ ret = mutex_init(&mutexp->mutex, type, NULL);
+ if (ret == 0 && LF_ISSET(DB_MUTEX_SELF_BLOCK)) {
+ ret = cond_init(&mutexp->cond, type, NULL);
+
+ F_SET(mutexp, DB_MUTEX_SELF_BLOCK);
+ }}
+#endif
+
+err: if (ret != 0) {
+ __db_err(env, ret, DB_STR("2021",
+ "unable to initialize mutex"));
+ }
+ return (ret);
+}
+
+/*
+ * __db_pthread_mutex_prep
+ * Prepare to use a pthread-based DB_MUTEX.
+ *
+ * This exclusively locks a DB_MUTEX's pthread_mutex_t or pthread_rwlock_t,
+ * before locking, unlocking, or waiting for the DB mutex to be become
+ * available in the requested mode (exclusive == 1, shared == 0).
+ *
+ * Test for failchk concerns here too, to avoid hanging on a dead pid/tid.
+ */
+inline static int
+__db_pthread_mutex_prep(env, mutex, mutexp, exclusive)
+ ENV *env;
+ db_mutex_t mutex;
+ DB_MUTEX *mutexp;
+ int exclusive;
+{
+ DB_ENV *dbenv;
+ DB_THREAD_INFO *ip;
+ int ret;
+
+ dbenv = env->dbenv;
+ PERFMON4(env,
+ mutex, suspend, mutex, exclusive, mutexp->alloc_id, mutexp);
+ if (F_ISSET(dbenv, DB_ENV_FAILCHK)) {
+ for (;;) {
+ RET_SET_PTHREAD_TRYLOCK(mutexp, ret);
+ if (ret != EBUSY)
+ break;
+ if (dbenv->is_alive(dbenv,
+ mutexp->pid, mutexp->tid, 0) == 0) {
+ ret = __env_set_state(env, &ip, THREAD_VERIFY);
+ if (ret != 0 ||
+ ip->dbth_state == THREAD_FAILCHK) {
+ ret = DB_RUNRECOVERY;
+ } else {
+ /*
+ * Some thread other than the true
+ * FAILCHK thread in this process is
+ * asking for the mutex held by the
+ * dead process/thread. We will block
+ * here until someone else does the
+ * cleanup. Same behavior as if we
+ * hadn't gone down the 'if
+ * DB_ENV_FAILCHK' path to start with.
+ */
+ RET_SET_PTHREAD_LOCK(mutexp, ret);
+ break;
+ }
+ }
+ }
+ } else
+ RET_SET_PTHREAD_LOCK(mutexp, ret);
+
+ PERFMON4(env,
+ mutex, resume, mutex, exclusive, mutexp->alloc_id, mutexp);
+ COMPQUIET(mutex, 0);
+ COMPQUIET(exclusive, 0);
+ return (ret);
+}
+
+/*
+ * __db_pthread_mutex_condwait
+ * Perform a pthread condition wait for a DB_MUTEX.
+ *
+ * This will be a timed wait when a timespec has been specified. EINTR and
+ * spurious ETIME* values are mapped to 0, and hence success. The
+ * mutexp->u.m.mutex must be locked upon entry. When returning a success
+ * or timeout status it will have been locked again.
+ *
+ * Returns:
+ * 0 if it is safe to retry to get the mutex
+ * DB_TIMEOUT if the timeout exceeded
+ * <other> a fatal error. The mutexp->u.m.mutex has been unlocked.
+ */
+inline static int
+__db_pthread_mutex_condwait(env, mutex, mutexp, timespec)
+ ENV *env;
+ db_mutex_t mutex;
+ DB_MUTEX *mutexp;
+ db_timespec *timespec;
+{
+ int ret;
+
+#ifdef MUTEX_DIAG
+ printf("condwait %ld %x wait busy %x count %d\n",
+ mutex, pthread_self(), MUTEXP_BUSY_FIELD(mutexp), mutexp->wait);
+#endif
+ PERFMON4(env, mutex, suspend, mutex, TRUE, mutexp->alloc_id, mutexp);
+
+ if (timespec != NULL) {
+ RET_SET((pthread_cond_timedwait(&mutexp->u.m.cond,
+ &mutexp->u.m.mutex, (struct timespec *) timespec)), ret);
+ if (ret == ETIMEDOUT) {
+ ret = DB_TIMEOUT;
+ goto ret;
+ }
+ } else
+ RET_SET((pthread_cond_wait(&mutexp->u.m.cond,
+ &mutexp->u.m.mutex)), ret);
+#ifdef MUTEX_DIAG
+ printf("condwait %ld %x wait returns %d busy %x\n",
+ mutex, pthread_self(), ret, MUTEXP_BUSY_FIELD(mutexp));
+#endif
+ /*
+ * !!!
+ * Solaris bug workaround: pthread_cond_wait() sometimes returns ETIME
+ * -- out of sheer paranoia, check both ETIME and ETIMEDOUT. We
+ * believe this happens when the application uses SIGALRM for some
+ * purpose, e.g., the C library sleep call, and Solaris delivers the
+ * signal to the wrong LWP.
+ */
+ if (ret != 0) {
+ if (ret == ETIMEDOUT ||
+#ifdef ETIME
+ ret == ETIME ||
+#endif
+ ret == EINTR)
+ ret = 0;
+ else
+ /* Failure, caller shouldn't condwait again. */
+ (void)pthread_mutex_unlock(&mutexp->u.m.mutex);
+ }
+
+ret:
+ PERFMON4(env, mutex, resume, mutex, TRUE, mutexp->alloc_id, mutexp);
+
+ COMPQUIET(mutex, 0);
+ COMPQUIET(env, 0);
+ return (ret);
+}
+
+#ifndef HAVE_MUTEX_HYBRID
+/*
+ * __db_pthread_mutex_lock
+ * Lock on a mutex, blocking if necessary.
+ * Timeouts are supported only for self-blocking mutexes.
+ *
+ * Self-blocking shared latches are not supported.
+ *
+ * PUBLIC: #ifndef HAVE_MUTEX_HYBRID
+ * PUBLIC: int __db_pthread_mutex_lock __P((ENV *, db_mutex_t, db_timeout_t));
+ * PUBLIC: #endif
+ */
+int
+__db_pthread_mutex_lock(env, mutex, timeout)
+ ENV *env;
+ db_mutex_t mutex;
+ db_timeout_t timeout;
+{
+ DB_ENV *dbenv;
+ DB_MUTEX *mutexp;
+ db_timespec timespec;
+ int ret, t_ret;
+
+ dbenv = env->dbenv;
+
+ if (!MUTEX_ON(env) || F_ISSET(dbenv, DB_ENV_NOLOCKING))
+ return (0);
+
+ t_ret = 0;
+ mutexp = MUTEXP_SET(env, mutex);
+
+ CHECK_MTX_THREAD(env, mutexp);
+
+#if defined(HAVE_STATISTICS)
+ /*
+ * We want to know which mutexes are contentious, but don't want to
+ * do an interlocked test here -- that's slower when the underlying
+ * system has adaptive mutexes and can perform optimizations like
+ * spinning only if the thread holding the mutex is actually running
+ * on a CPU. Make a guess, using a normal load instruction.
+ */
+ if (F_ISSET(mutexp, DB_MUTEX_LOCKED))
+ STAT_INC(env, mutex, set_wait, mutexp->mutex_set_wait, mutex);
+ else
+ STAT_INC(env,
+ mutex, set_nowait, mutexp->mutex_set_nowait, mutex);
+#endif
+
+ /* Single-thread the next block, except during the possible condwait. */
+ if ((ret = __db_pthread_mutex_prep(env, mutex, mutexp, TRUE)) != 0)
+ goto err;
+
+ if (F_ISSET(mutexp, DB_MUTEX_SELF_BLOCK)) {
+ if (timeout != 0)
+ timespecclear(&timespec);
+ while (MUTEXP_IS_BUSY(mutexp)) {
+ /* Set expiration timer upon first need. */
+ if (timeout != 0 && !timespecisset(&timespec)) {
+ timespecclear(&timespec);
+ __clock_set_expires(env, &timespec, timeout);
+ }
+ t_ret = __db_pthread_mutex_condwait(env,
+ mutex, mutexp, timeout == 0 ? NULL : &timespec);
+ if (t_ret != 0) {
+ if (t_ret == DB_TIMEOUT)
+ goto out;
+ ret = t_ret;
+ goto err;
+ }
+ }
+
+ F_SET(mutexp, DB_MUTEX_LOCKED);
+ dbenv->thread_id(dbenv, &mutexp->pid, &mutexp->tid);
+out:
+ /* #2471: HP-UX can sporadically return EFAULT. See above */
+ RETRY_ON_EFAULT(pthread_mutex_unlock(&mutexp->u.m.mutex), ret);
+ if (ret != 0)
+ goto err;
+ } else {
+#ifdef DIAGNOSTIC
+ if (F_ISSET(mutexp, DB_MUTEX_LOCKED)) {
+ char buf[DB_THREADID_STRLEN];
+ (void)dbenv->thread_id_string(dbenv,
+ mutexp->pid, mutexp->tid, buf);
+ __db_errx(env, DB_STR_A("2022",
+ "pthread lock failed: lock currently in use: pid/tid: %s",
+ "%s"), buf);
+ ret = EINVAL;
+ goto err;
+ }
+#endif
+ F_SET(mutexp, DB_MUTEX_LOCKED);
+ dbenv->thread_id(dbenv, &mutexp->pid, &mutexp->tid);
+ }
+
+#ifdef DIAGNOSTIC
+ /*
+ * We want to switch threads as often as possible. Yield every time
+ * we get a mutex to ensure contention.
+ */
+ if (F_ISSET(dbenv, DB_ENV_YIELDCPU))
+ __os_yield(env, 0, 0);
+#endif
+ return (t_ret);
+
+err:
+ __db_err(env, ret, DB_STR("2023", "pthread lock failed"));
+ return (__env_panic(env, ret));
+}
+#endif
+
+#if defined(HAVE_SHARED_LATCHES) && !defined(HAVE_MUTEX_HYBRID)
+/*
+ * __db_pthread_mutex_readlock
+ * Take a shared lock on a mutex, blocking if necessary.
+ *
+ * PUBLIC: #if defined(HAVE_SHARED_LATCHES)
+ * PUBLIC: int __db_pthread_mutex_readlock __P((ENV *, db_mutex_t));
+ * PUBLIC: #endif
+ */
+int
+__db_pthread_mutex_readlock(env, mutex)
+ ENV *env;
+ db_mutex_t mutex;
+{
+ DB_ENV *dbenv;
+ DB_MUTEX *mutexp;
+ int ret;
+
+ dbenv = env->dbenv;
+
+ if (!MUTEX_ON(env) || F_ISSET(dbenv, DB_ENV_NOLOCKING))
+ return (0);
+
+ mutexp = MUTEXP_SET(env, mutex);
+ DB_ASSERT(env, F_ISSET(mutexp, DB_MUTEX_SHARED));
+
+ CHECK_MTX_THREAD(env, mutexp);
+
+#if defined(HAVE_STATISTICS)
+ /*
+ * We want to know which mutexes are contentious, but don't want to
+ * do an interlocked test here -- that's slower when the underlying
+ * system has adaptive mutexes and can perform optimizations like
+ * spinning only if the thread holding the mutex is actually running
+ * on a CPU. Make a guess, using a normal load instruction.
+ */
+ if (F_ISSET(mutexp, DB_MUTEX_LOCKED))
+ STAT_INC(env,
+ mutex, set_rd_wait, mutexp->mutex_set_rd_wait, mutex);
+ else
+ STAT_INC(env,
+ mutex, set_rd_nowait, mutexp->mutex_set_rd_nowait, mutex);
+#endif
+
+ PERFMON4(env, mutex, suspend, mutex, FALSE, mutexp->alloc_id, mutexp);
+ RET_SET((pthread_rwlock_rdlock(&mutexp->u.rwlock)), ret);
+ PERFMON4(env, mutex, resume, mutex, FALSE, mutexp->alloc_id, mutexp);
+ DB_ASSERT(env, !F_ISSET(mutexp, DB_MUTEX_LOCKED));
+ if (ret != 0)
+ goto err;
+
+#ifdef DIAGNOSTIC
+ /*
+ * We want to switch threads as often as possible. Yield every time
+ * we get a mutex to ensure contention.
+ */
+ if (F_ISSET(dbenv, DB_ENV_YIELDCPU))
+ __os_yield(env, 0, 0);
+#endif
+ return (0);
+
+err: __db_err(env, ret, DB_STR("2024", "pthread readlock failed"));
+ return (__env_panic(env, ret));
+}
+#endif
+
+#ifdef HAVE_MUTEX_HYBRID
+/*
+ * __db_hybrid_mutex_suspend
+ * Suspend this thread until the mutex is free enough to give the caller a
+ * good chance of getting the mutex in the requested exclusivity mode.
+ *
+ * The major difference between this and the old __db_pthread_mutex_lock()
+ * is the additional 'exclusive' parameter.
+ *
+ * PUBLIC: #ifdef HAVE_MUTEX_HYBRID
+ * PUBLIC: int __db_hybrid_mutex_suspend
+ * PUBLIC: __P((ENV *, db_mutex_t, db_timespec *, int));
+ * PUBLIC: #endif
+ */
+int
+__db_hybrid_mutex_suspend(env, mutex, timespec, exclusive)
+ ENV *env;
+ db_mutex_t mutex;
+ db_timespec *timespec;
+ int exclusive;
+{
+ DB_MUTEX *mutexp;
+ int ret, t_ret;
+
+ t_ret = 0;
+ mutexp = MUTEXP_SET(env, mutex);
+
+ if (!exclusive)
+ DB_ASSERT(env, F_ISSET(mutexp, DB_MUTEX_SHARED));
+ DB_ASSERT(env, F_ISSET(mutexp, DB_MUTEX_SELF_BLOCK));
+
+ if ((ret = __db_pthread_mutex_prep(env, mutex, mutexp, exclusive)) != 0)
+ goto err;
+
+ /*
+ * Since this is only for hybrid mutexes the pthread mutex
+ * is only used to wait after spinning on the TAS mutex.
+ * Set the wait flag before checking to see if the mutex
+ * is still locked. The holder will clear DB_MUTEX_LOCKED
+ * before checking the wait counter.
+ */
+ mutexp->wait++;
+ MUTEX_MEMBAR(mutexp->wait);
+ while (exclusive ? MUTEXP_IS_BUSY(mutexp) :
+ atomic_read(&mutexp->sharecount) == MUTEX_SHARE_ISEXCLUSIVE) {
+ t_ret = __db_pthread_mutex_condwait(env,
+ mutex, mutexp, timespec);
+ if (t_ret != 0) {
+ if (t_ret == DB_TIMEOUT)
+ break;
+ ret = t_ret;
+ goto err;
+ }
+ MUTEX_MEMBAR(mutexp->flags);
+ }
+
+ mutexp->wait--;
+
+ /* #2471: HP-UX can sporadically return EFAULT. See above */
+ RETRY_ON_EFAULT(pthread_mutex_unlock(&mutexp->u.m.mutex), ret);
+ if (ret != 0)
+ goto err;
+
+ PERFMON4(env,
+ mutex, resume, mutex, exclusive, mutexp->alloc_id, mutexp);
+
+#ifdef DIAGNOSTIC
+ /*
+ * We want to switch threads as often as possible. Yield every time
+ * we get a mutex to ensure contention.
+ */
+ if (F_ISSET(env->dbenv, DB_ENV_YIELDCPU))
+ __os_yield(env, 0, 0);
+#endif
+ return (t_ret);
+
+err:
+ PERFMON4(env,
+ mutex, resume, mutex, exclusive, mutexp->alloc_id, mutexp);
+ __db_err(env, ret, "pthread suspend failed");
+ return (__env_panic(env, ret));
+}
+#endif
+
+/*
+ * __db_pthread_mutex_unlock --
+ * Release a mutex, or, if hybrid, wake a thread up from a suspend.
+ *
+ * PUBLIC: int __db_pthread_mutex_unlock __P((ENV *, db_mutex_t));
+ */
+int
+__db_pthread_mutex_unlock(env, mutex)
+ ENV *env;
+ db_mutex_t mutex;
+{
+ DB_ENV *dbenv;
+ DB_MUTEX *mutexp;
+ int ret;
+#if defined(MUTEX_DIAG) && defined(HAVE_MUTEX_HYBRID)
+ int waiters;
+#endif
+
+ dbenv = env->dbenv;
+
+ if (!MUTEX_ON(env) || F_ISSET(dbenv, DB_ENV_NOLOCKING))
+ return (0);
+
+ mutexp = MUTEXP_SET(env, mutex);
+#if defined(MUTEX_DIAG) && defined(HAVE_MUTEX_HYBRID)
+ waiters = mutexp->wait;
+#endif
+
+#if !defined(HAVE_MUTEX_HYBRID) && defined(DIAGNOSTIC)
+ if (!F_ISSET(mutexp, DB_MUTEX_LOCKED | DB_MUTEX_SHARED)) {
+ __db_errx(env, DB_STR("2025",
+ "pthread unlock failed: lock already unlocked"));
+ return (__env_panic(env, EACCES));
+ }
+#endif
+ if (F_ISSET(mutexp, DB_MUTEX_SELF_BLOCK)) {
+ ret = __db_pthread_mutex_prep(env, mutex, mutexp, TRUE);
+ if (ret != 0)
+ goto err;
+
+#ifdef HAVE_MUTEX_HYBRID
+ STAT_INC(env,
+ mutex, hybrid_wakeup, mutexp->hybrid_wakeup, mutex);
+#else
+ F_CLR(mutexp, DB_MUTEX_LOCKED); /* nop if DB_MUTEX_SHARED */
+#endif
+
+ if (F_ISSET(mutexp, DB_MUTEX_SHARED))
+ RET_SET(
+ (pthread_cond_broadcast(&mutexp->u.m.cond)), ret);
+ else
+ RET_SET((pthread_cond_signal(&mutexp->u.m.cond)), ret);
+ if (ret != 0)
+ goto err;
+ } else {
+#ifndef HAVE_MUTEX_HYBRID
+ F_CLR(mutexp, DB_MUTEX_LOCKED);
+#endif
+ }
+
+ /* See comment above; workaround for [#2471]. */
+#if defined(HAVE_SHARED_LATCHES) && !defined(HAVE_MUTEX_HYBRID)
+ if (F_ISSET(mutexp, DB_MUTEX_SHARED))
+ RETRY_ON_EFAULT(pthread_rwlock_unlock(&mutexp->u.rwlock), ret);
+ else
+#endif
+ RETRY_ON_EFAULT(pthread_mutex_unlock(&mutexp->u.m.mutex), ret);
+
+err: if (ret != 0) {
+ __db_err(env, ret, "pthread unlock failed");
+ return (__env_panic(env, ret));
+ }
+#if defined(MUTEX_DIAG) && defined(HAVE_MUTEX_HYBRID)
+ if (!MUTEXP_IS_BUSY(mutexp) && mutexp->wait != 0)
+ printf("unlock %ld %x busy %x waiters %d/%d\n",
+ mutex, pthread_self(), ret,
+ MUTEXP_BUSY_FIELD(mutexp), waiters, mutexp->wait);
+#endif
+ return (ret);
+}
+
+/*
+ * __db_pthread_mutex_destroy --
+ * Destroy a mutex.
+ * If it is a native shared latch (not hybrid) then
+ * destroy only one half of the rwlock/mutex&cond union,
+ * depending whether it was allocated as shared
+ *
+ * PUBLIC: int __db_pthread_mutex_destroy __P((ENV *, db_mutex_t));
+ */
+int
+__db_pthread_mutex_destroy(env, mutex)
+ ENV *env;
+ db_mutex_t mutex;
+{
+ DB_MUTEX *mutexp;
+ DB_THREAD_INFO *ip;
+ int ret, t_ret, failchk_thread;
+
+ if (!MUTEX_ON(env))
+ return (0);
+
+ mutexp = MUTEXP_SET(env, mutex);
+
+ ret = 0;
+ failchk_thread = FALSE;
+ /* Get information to determine if we are really the failchk thread. */
+ if (F_ISSET(env->dbenv, DB_ENV_FAILCHK)) {
+ ret = __env_set_state(env, &ip, THREAD_VERIFY);
+ if (ip != NULL && ip->dbth_state == THREAD_FAILCHK)
+ failchk_thread = TRUE;
+ }
+
+#ifndef HAVE_MUTEX_HYBRID
+ if (F_ISSET(mutexp, DB_MUTEX_SHARED)) {
+#if defined(HAVE_SHARED_LATCHES)
+ /*
+ * If there were dead processes waiting on the condition
+	 * we may not be able to destroy it. Let the failchk thread skip
+ * this, unless destroy is required.
+ * XXX What operating system resources might this leak?
+ */
+#ifdef HAVE_PTHREAD_RWLOCK_REINIT_OKAY
+ if (!failchk_thread)
+#endif
+ RET_SET(
+ (pthread_rwlock_destroy(&mutexp->u.rwlock)), ret);
+ /* For rwlocks, we're done - must not destroy rest of union */
+ return (ret);
+#endif
+ }
+#endif
+ if (F_ISSET(mutexp, DB_MUTEX_SELF_BLOCK)) {
+ /*
+ * If there were dead processes waiting on the condition
+	 * we may not be able to destroy it. Let the failchk thread
+ * skip this, unless destroy is required.
+ */
+#ifdef HAVE_PTHREAD_COND_REINIT_OKAY
+ if (!failchk_thread)
+#endif
+ RET_SET((pthread_cond_destroy(&mutexp->u.m.cond)), ret);
+ if (ret != 0)
+ __db_err(env, ret, DB_STR("2026",
+ "unable to destroy cond"));
+ }
+ RET_SET((pthread_mutex_destroy(&mutexp->u.m.mutex)), t_ret);
+ if (t_ret != 0 && !failchk_thread) {
+ __db_err(env, t_ret, DB_STR("2027",
+ "unable to destroy mutex"));
+ if (ret == 0)
+ ret = t_ret;
+ }
+ return (ret);
+}
diff --git a/src/mutex/mut_region.c b/src/mutex/mut_region.c
new file mode 100644
index 00000000..26ae0a03
--- /dev/null
+++ b/src/mutex/mut_region.c
@@ -0,0 +1,468 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/log.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+static db_size_t __mutex_align_size __P((ENV *));
+static int __mutex_region_init __P((ENV *, DB_MUTEXMGR *));
+static size_t __mutex_region_size __P((ENV *));
+static size_t __mutex_region_max __P((ENV *));
+
+/*
+ * __mutex_open --
+ * Open a mutex region.
+ *
+ * PUBLIC: int __mutex_open __P((ENV *, int));
+ */
+int
+__mutex_open(env, create_ok)
+ ENV *env;
+ int create_ok;
+{
+ DB_ENV *dbenv;
+ DB_MUTEXMGR *mtxmgr;
+ DB_MUTEXREGION *mtxregion;
+ size_t size;
+ u_int32_t cpu_count;
+ int ret;
+#ifndef HAVE_ATOMIC_SUPPORT
+ u_int i;
+#endif
+
+ dbenv = env->dbenv;
+ if (dbenv->mutex_max == 0 &&
+ dbenv->mutex_cnt == 0 && dbenv->mutex_inc == 0 &&
+ F_ISSET(env, ENV_PRIVATE | ENV_THREAD) == ENV_PRIVATE)
+ return (0);
+
+ /*
+ * Initialize the ENV handle information if not already initialized.
+ *
+ * Align mutexes on the byte boundaries specified by the application.
+ */
+ if (dbenv->mutex_align == 0)
+ dbenv->mutex_align = MUTEX_ALIGN;
+ if (dbenv->mutex_tas_spins == 0) {
+ cpu_count = __os_cpu_count();
+ if ((ret = __mutex_set_tas_spins(dbenv, cpu_count == 1 ?
+ cpu_count : cpu_count * MUTEX_SPINS_PER_PROCESSOR)) != 0)
+ return (ret);
+ }
+
+ /*
+ * If the user didn't set an absolute value on the number of mutexes
+ * we'll need, figure it out. We're conservative in our allocation,
+ * we need mutexes for DB handles, group-commit queues and other things
+ * applications allocate at run-time. The application may have kicked
+ * up our count to allocate its own mutexes, add that in.
+ */
+ if (dbenv->mutex_cnt == 0 &&
+ F_ISSET(env, ENV_PRIVATE | ENV_THREAD) != ENV_PRIVATE)
+ dbenv->mutex_cnt =
+ __lock_region_mutex_count(env) +
+ __log_region_mutex_count(env) +
+ __memp_region_mutex_count(env) +
+ __txn_region_mutex_count(env);
+
+ if (dbenv->mutex_max != 0 && dbenv->mutex_cnt > dbenv->mutex_max)
+ dbenv->mutex_cnt = dbenv->mutex_max;
+
+ /* Create/initialize the mutex manager structure. */
+ if ((ret = __os_calloc(env, 1, sizeof(DB_MUTEXMGR), &mtxmgr)) != 0)
+ return (ret);
+
+ /* Join/create the mutex region. */
+ mtxmgr->reginfo.env = env;
+ mtxmgr->reginfo.type = REGION_TYPE_MUTEX;
+ mtxmgr->reginfo.id = INVALID_REGION_ID;
+ mtxmgr->reginfo.flags = REGION_JOIN_OK;
+ size = __mutex_region_size(env);
+ if (create_ok)
+ F_SET(&mtxmgr->reginfo, REGION_CREATE_OK);
+ if ((ret = __env_region_attach(env,
+ &mtxmgr->reginfo, size, size + __mutex_region_max(env))) != 0)
+ goto err;
+
+ /* If we created the region, initialize it. */
+ if (F_ISSET(&mtxmgr->reginfo, REGION_CREATE))
+ if ((ret = __mutex_region_init(env, mtxmgr)) != 0)
+ goto err;
+
+ /* Set the local addresses. */
+ mtxregion = mtxmgr->reginfo.primary =
+ R_ADDR(&mtxmgr->reginfo, mtxmgr->reginfo.rp->primary);
+ mtxmgr->mutex_array = R_ADDR(&mtxmgr->reginfo, mtxregion->mutex_off);
+
+ env->mutex_handle = mtxmgr;
+
+#ifndef HAVE_ATOMIC_SUPPORT
+ /* If necessary allocate the atomic emulation mutexes. */
+ if (F_ISSET(&mtxmgr->reginfo, REGION_CREATE))
+ for (i = 0; i != MAX_ATOMIC_MUTEXES; i++)
+ if ((ret = __mutex_alloc_int(
+ env, 0, MTX_ATOMIC_EMULATION,
+ 0, &mtxregion->mtx_atomic[i])) != 0)
+ return (ret);
+#endif
+
+ return (0);
+
+err: env->mutex_handle = NULL;
+ if (mtxmgr->reginfo.addr != NULL)
+ (void)__env_region_detach(env, &mtxmgr->reginfo, 0);
+
+ __os_free(env, mtxmgr);
+ return (ret);
+}
+
+/*
+ * __mutex_region_init --
+ * Initialize a mutex region in shared memory.
+ */
+static int
+__mutex_region_init(env, mtxmgr)
+ ENV *env;
+ DB_MUTEXMGR *mtxmgr;
+{
+ DB_ENV *dbenv;
+ DB_MUTEX *mutexp;
+ DB_MUTEXREGION *mtxregion;
+ db_mutex_t mutex;
+ int ret;
+ void *mutex_array;
+
+ dbenv = env->dbenv;
+
+ COMPQUIET(mutexp, NULL);
+
+ if ((ret = __env_alloc(&mtxmgr->reginfo,
+ sizeof(DB_MUTEXREGION), &mtxmgr->reginfo.primary)) != 0) {
+ __db_errx(env, DB_STR("2013",
+ "Unable to allocate memory for the mutex region"));
+ return (ret);
+ }
+ mtxmgr->reginfo.rp->primary =
+ R_OFFSET(&mtxmgr->reginfo, mtxmgr->reginfo.primary);
+ mtxregion = mtxmgr->reginfo.primary;
+ memset(mtxregion, 0, sizeof(*mtxregion));
+
+ mtxregion->mutex_size = __mutex_align_size(env);
+
+ mtxregion->stat.st_mutex_align = dbenv->mutex_align;
+ if (dbenv->mutex_cnt == 0)
+ dbenv->mutex_cnt = 1;
+ mtxregion->stat.st_mutex_init =
+ mtxregion->stat.st_mutex_cnt = dbenv->mutex_cnt;
+ mtxregion->stat.st_mutex_max = dbenv->mutex_max;
+ if (mtxregion->stat.st_mutex_max != 0)
+ mtxregion->stat.st_mutex_max += dbenv->mutex_inc;
+ mtxregion->stat.st_mutex_tas_spins = dbenv->mutex_tas_spins;
+
+ /*
+ * Get a chunk of memory to be used for the mutexes themselves. Each
+ * piece of the memory must be properly aligned, and that alignment
+ * may be more restrictive than the memory alignment returned by the
+ * underlying allocation code. We already know how much memory each
+ * mutex in the array will take up, but we need to offset the first
+ * mutex in the array so the array begins properly aligned.
+ *
+ * The OOB mutex (MUTEX_INVALID) is 0. To make this work, we ignore
+ * the first allocated slot when we build the free list. We have to
+ * correct the count by 1 here, though, otherwise our counter will be
+ * off by 1.
+ */
+ if ((ret = __env_alloc(&mtxmgr->reginfo,
+ mtxregion->stat.st_mutex_align +
+ (mtxregion->stat.st_mutex_cnt + 1) * mtxregion->mutex_size,
+ &mutex_array)) != 0) {
+ __db_errx(env, DB_STR("2014",
+ "Unable to allocate memory for mutexes from the region"));
+ return (ret);
+ }
+
+ mtxregion->mutex_off_alloc = R_OFFSET(&mtxmgr->reginfo, mutex_array);
+ mutex_array = ALIGNP_INC(mutex_array, mtxregion->stat.st_mutex_align);
+ mtxregion->mutex_off = R_OFFSET(&mtxmgr->reginfo, mutex_array);
+ mtxmgr->mutex_array = mutex_array;
+
+ /*
+ * Put the mutexes on a free list and clear the allocated flag.
+ *
+ * The OOB mutex (MUTEX_INVALID) is 0, skip it.
+ *
+ * The comparison is <, not <=, because we're looking ahead one
+ * in each link.
+ */
+ env->mutex_handle = mtxmgr;
+ if (F_ISSET(env, ENV_PRIVATE)) {
+ mutexp = (DB_MUTEX *)mutex_array;
+ mutexp++;
+ mutexp = ALIGNP_INC(mutexp, mtxregion->stat.st_mutex_align);
+ mtxregion->mutex_next = (db_mutex_t)mutexp;
+ } else {
+ mtxregion->mutex_next = 1;
+ mutexp = MUTEXP_SET(env, 1);
+ }
+ for (mutex = 1; mutex < mtxregion->stat.st_mutex_cnt; ++mutex) {
+ mutexp->flags = 0;
+ if (F_ISSET(env, ENV_PRIVATE))
+ mutexp->mutex_next_link = (db_mutex_t)(mutexp + 1);
+ else
+ mutexp->mutex_next_link = mutex + 1;
+ mutexp++;
+ mutexp = ALIGNP_INC(mutexp, mtxregion->stat.st_mutex_align);
+ }
+ mutexp->flags = 0;
+ mutexp->mutex_next_link = MUTEX_INVALID;
+ mtxregion->stat.st_mutex_free = mtxregion->stat.st_mutex_cnt;
+ mtxregion->stat.st_mutex_inuse = mtxregion->stat.st_mutex_inuse_max = 0;
+ if ((ret = __mutex_alloc(env, MTX_MUTEX_REGION, 0, &mutex)) != 0)
+ return (ret);
+ mtxmgr->reginfo.mtx_alloc = mtxregion->mtx_region = mutex;
+
+ /*
+ * This is the first place we can test mutexes and we need to
+ * know if they're working. (They CAN fail, for example on
+ * SunOS, when using fcntl(2) for locking and using an
+ * in-memory filesystem as the database environment directory.
+ * But you knew that, I'm sure -- it probably wasn't worth
+ * mentioning.)
+ */
+ mutex = MUTEX_INVALID;
+	if ((ret =
+	    __mutex_alloc(env, MTX_MUTEX_TEST, 0, &mutex)) != 0 ||
+ (ret = __mutex_lock(env, mutex)) != 0 ||
+ (ret = __mutex_unlock(env, mutex)) != 0 ||
+ (ret = __mutex_trylock(env, mutex)) != 0 ||
+ (ret = __mutex_unlock(env, mutex)) != 0 ||
+ (ret = __mutex_free(env, &mutex)) != 0) {
+ __db_errx(env, DB_STR("2015",
+ "Unable to acquire/release a mutex; check configuration"));
+ return (ret);
+ }
+#ifdef HAVE_SHARED_LATCHES
+	if ((ret = __mutex_alloc(env,
+	    MTX_MUTEX_TEST, DB_MUTEX_SHARED, &mutex)) != 0 ||
+ (ret = __mutex_lock(env, mutex)) != 0 ||
+ (ret = __mutex_tryrdlock(env, mutex)) != DB_LOCK_NOTGRANTED ||
+ (ret = __mutex_unlock(env, mutex)) != 0 ||
+ (ret = __mutex_rdlock(env, mutex)) != 0 ||
+ (ret = __mutex_rdlock(env, mutex)) != 0 ||
+ (ret = __mutex_unlock(env, mutex)) != 0 ||
+ (ret = __mutex_unlock(env, mutex)) != 0 ||
+ (ret = __mutex_free(env, &mutex)) != 0) {
+ __db_errx(env, DB_STR("2016",
+ "Unable to acquire/release a shared latch; check configuration"));
+ return (ret);
+ }
+#endif
+
+ return (0);
+}
+
+/*
+ * __mutex_env_refresh --
+ * Clean up after the mutex region on a close or failed open.
+ *
+ * PUBLIC: int __mutex_env_refresh __P((ENV *));
+ */
+int
+__mutex_env_refresh(env)
+ ENV *env;
+{
+ DB_MUTEXMGR *mtxmgr;
+ DB_MUTEXREGION *mtxregion;
+ REGINFO *reginfo;
+ int ret;
+
+ mtxmgr = env->mutex_handle;
+ reginfo = &mtxmgr->reginfo;
+ mtxregion = mtxmgr->reginfo.primary;
+
+ /*
+	 * If a private region, return the memory to the heap. This isn't
+	 * needed for filesystem-backed or system shared memory regions;
+	 * that memory isn't owned by any particular process.
+ */
+ if (F_ISSET(env, ENV_PRIVATE)) {
+ reginfo->mtx_alloc = MUTEX_INVALID;
+
+#ifdef HAVE_MUTEX_SYSTEM_RESOURCES
+ /*
+ * If destroying the mutex region, return any system resources
+ * to the system.
+ */
+ __mutex_resource_return(env, reginfo);
+#endif
+ /* Discard the mutex array. */
+ __env_alloc_free(
+ reginfo, R_ADDR(reginfo, mtxregion->mutex_off_alloc));
+ }
+
+ /* Detach from the region. */
+ ret = __env_region_detach(env, reginfo, 0);
+
+ __os_free(env, mtxmgr);
+
+ env->mutex_handle = NULL;
+
+ return (ret);
+}
+
+/*
+ * __mutex_align_size --
+ *	Return how much memory each mutex will take up when each element of
+ *	an array of mutexes is individually aligned within the array.
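+ *
+ *	For example (hypothetical sizes): a 40-byte DB_MUTEX with a
+ *	16-byte mutex_align rounds up to 48 bytes per array slot.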
+ */
+static db_size_t
+__mutex_align_size(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+
+ dbenv = env->dbenv;
+
+ return ((db_size_t)DB_ALIGN(sizeof(DB_MUTEX), dbenv->mutex_align));
+}
+
+/*
+ * __mutex_region_size --
+ * Return the amount of space needed for the mutex region.
+ */
+static size_t
+__mutex_region_size(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+ size_t s;
+
+ dbenv = env->dbenv;
+
+ s = sizeof(DB_MUTEXMGR) + 1024;
+
+	/* One extra mutex is allocated for the never-used OOB slot. */
+ s += __env_alloc_size(
+	    (dbenv->mutex_cnt + 1) * __mutex_align_size(env));
+
+ return (s);
+}
+
+/*
+ * __mutex_region_max --
+ * Return the amount of space needed to reach the maximum size.
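+ *
+ *	For example (hypothetical settings): with mutex_max == 5000 and
+ *	mutex_cnt == 2000, the region may still grow by room for another
+ *	3000 aligned mutexes; with mutex_max == 0, the bound is instead
+ *	derived from the lock, txn and log subsystems' own estimates.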
+ */
+static size_t
+__mutex_region_max(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+ u_int32_t max;
+
+ dbenv = env->dbenv;
+
+ if ((max = dbenv->mutex_max) == 0) {
+ if (F_ISSET(env, ENV_PRIVATE | ENV_THREAD) == ENV_PRIVATE)
+ max = dbenv->mutex_inc + 1;
+ else
+ max = __lock_region_mutex_max(env) +
+ __txn_region_mutex_max(env) +
+ __log_region_mutex_max(env) +
+ dbenv->mutex_inc + 100;
+ } else if (max <= dbenv->mutex_cnt)
+ return (0);
+ else
+ max -= dbenv->mutex_cnt;
+
+	return (__env_alloc_size(max * __mutex_align_size(env)));
+}
+
+#ifdef HAVE_MUTEX_SYSTEM_RESOURCES
+/*
+ * __mutex_resource_return --
+ * Return any system-allocated mutex resources to the system.
+ *
+ * PUBLIC: void __mutex_resource_return __P((ENV *, REGINFO *));
+ */
+void
+__mutex_resource_return(env, infop)
+ ENV *env;
+ REGINFO *infop;
+{
+ DB_MUTEX *mutexp;
+ DB_MUTEXMGR *mtxmgr, mtxmgr_st;
+ DB_MUTEXREGION *mtxregion;
+ db_mutex_t i, indx;
+ void *orig_handle, *chunk;
+ uintmax_t size;
+
+ /*
+	 * This routine is called in two cases: first, during recovery, when
+	 * discarding the regions from a previous Berkeley DB run; and second,
+	 * when discarding regions as we shut down the database environment.
+ *
+ * Walk the list of mutexes and destroy any live ones.
+ *
+ * This is just like joining a region -- the REGINFO we're handed is
+	 * the same as the one returned by __env_region_attach(); all we have
+ * to do is fill in the links.
+ *
+ * !!!
+ * The region may be corrupted, of course. We're safe because the
+ * only things we look at are things that are initialized when the
+ * region is created, and never modified after that.
+ */
+ memset(&mtxmgr_st, 0, sizeof(mtxmgr_st));
+ mtxmgr = &mtxmgr_st;
+ mtxmgr->reginfo = *infop;
+ mtxregion = mtxmgr->reginfo.primary =
+ R_ADDR(&mtxmgr->reginfo, mtxmgr->reginfo.rp->primary);
+ mtxmgr->mutex_array = R_ADDR(&mtxmgr->reginfo, mtxregion->mutex_off);
+
+ /*
+ * This is a little strange, but the mutex_handle is what all of the
+ * underlying mutex routines will use to determine if they should do
+ * any work and to find their information. Save/restore the handle
+ * around the work loop.
+ *
+ * The OOB mutex (MUTEX_INVALID) is 0, skip it.
+ */
+ orig_handle = env->mutex_handle;
+ env->mutex_handle = mtxmgr;
+ if (F_ISSET(env, ENV_PRIVATE)) {
+ mutexp = (DB_MUTEX *)mtxmgr->mutex_array + 1;
+ chunk = NULL;
+ size = __env_elem_size(env,
+		    ROFF_TO_P(mtxregion->mutex_off_alloc));
+ size -= sizeof(*mutexp);
+ } else
+ mutexp = MUTEXP_SET(env, 1);
+ for (i = 1; i <= mtxregion->stat.st_mutex_cnt; ++i) {
+ if (F_ISSET(env, ENV_PRIVATE))
+ indx = (db_mutex_t)mutexp;
+ else
+ indx = i;
+ if (F_ISSET(mutexp, DB_MUTEX_ALLOCATED))
+ (void)__mutex_destroy(env, indx);
+ mutexp++;
+ if (F_ISSET(env, ENV_PRIVATE) &&
+ (size -= sizeof(*mutexp)) < sizeof(*mutexp)) {
+ mutexp = __env_get_chunk(&mtxmgr->reginfo,
+ &chunk, &size);
+ }
+ mutexp = ALIGNP_INC(mutexp, mtxregion->stat.st_mutex_align);
+ }
+ env->mutex_handle = orig_handle;
+}
+#endif
diff --git a/src/mutex/mut_stat.c b/src/mutex/mut_stat.c
new file mode 100644
index 00000000..b64207fa
--- /dev/null
+++ b/src/mutex/mut_stat.c
@@ -0,0 +1,579 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+
+#ifdef HAVE_STATISTICS
+static int __mutex_print_all __P((ENV *, u_int32_t));
+static const char *__mutex_print_id __P((int));
+static int __mutex_print_stats __P((ENV *, u_int32_t));
+static void __mutex_print_summary __P((ENV *));
+static int __mutex_stat __P((ENV *, DB_MUTEX_STAT **, u_int32_t));
+
+/*
+ * __mutex_stat_pp --
+ * ENV->mutex_stat pre/post processing.
+ *
+ * PUBLIC: int __mutex_stat_pp __P((DB_ENV *, DB_MUTEX_STAT **, u_int32_t));
+ */
+int
+__mutex_stat_pp(dbenv, statp, flags)
+ DB_ENV *dbenv;
+ DB_MUTEX_STAT **statp;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->mutex_handle, "DB_ENV->mutex_stat", DB_INIT_MUTEX);
+
+ if ((ret = __db_fchk(env,
+ "DB_ENV->mutex_stat", flags, DB_STAT_CLEAR)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__mutex_stat(env, statp, flags)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __mutex_stat --
+ * ENV->mutex_stat.
+ */
+static int
+__mutex_stat(env, statp, flags)
+ ENV *env;
+ DB_MUTEX_STAT **statp;
+ u_int32_t flags;
+{
+ DB_MUTEXMGR *mtxmgr;
+ DB_MUTEXREGION *mtxregion;
+ DB_MUTEX_STAT *stats;
+ int ret;
+
+ *statp = NULL;
+ mtxmgr = env->mutex_handle;
+ mtxregion = mtxmgr->reginfo.primary;
+
+ if ((ret = __os_umalloc(env, sizeof(DB_MUTEX_STAT), &stats)) != 0)
+ return (ret);
+
+ MUTEX_SYSTEM_LOCK(env);
+
+ /*
+ * Most fields are maintained in the underlying region structure.
+ * Region size and region mutex are not.
+ */
+ *stats = mtxregion->stat;
+ stats->st_regsize = mtxmgr->reginfo.rp->size;
+ stats->st_regmax = mtxmgr->reginfo.rp->max;
+ __mutex_set_wait_info(env, mtxregion->mtx_region,
+ &stats->st_region_wait, &stats->st_region_nowait);
+ if (LF_ISSET(DB_STAT_CLEAR))
+ __mutex_clear(env, mtxregion->mtx_region);
+
+ MUTEX_SYSTEM_UNLOCK(env);
+
+ *statp = stats;
+ return (0);
+}
+
+/*
+ * __mutex_stat_print_pp --
+ * ENV->mutex_stat_print pre/post processing.
+ *
+ * PUBLIC: int __mutex_stat_print_pp __P((DB_ENV *, u_int32_t));
+ */
+int
+__mutex_stat_print_pp(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->mutex_handle, "DB_ENV->mutex_stat_print", DB_INIT_MUTEX);
+
+ if ((ret = __db_fchk(env, "DB_ENV->mutex_stat_print",
+ flags, DB_STAT_ALL | DB_STAT_ALLOC | DB_STAT_CLEAR)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__mutex_stat_print(env, flags)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __mutex_stat_print --
+ * ENV->mutex_stat_print method.
+ *
+ * PUBLIC: int __mutex_stat_print __P((ENV *, u_int32_t));
+ */
+int
+__mutex_stat_print(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ u_int32_t orig_flags;
+ int ret;
+
+ orig_flags = flags;
+ LF_CLR(DB_STAT_CLEAR | DB_STAT_SUBSYSTEM);
+ if (flags == 0 || LF_ISSET(DB_STAT_ALL)) {
+ ret = __mutex_print_stats(env, orig_flags);
+ __mutex_print_summary(env);
+ if (flags == 0 || ret != 0)
+ return (ret);
+ }
+
+	if (LF_ISSET(DB_STAT_ALL) &&
+	    (ret = __mutex_print_all(env, orig_flags)) != 0)
+		return (ret);
+
+	return (0);
+}
+
+static void
+__mutex_print_summary(env)
+ ENV *env;
+{
+ DB_MUTEX *mutexp;
+ DB_MUTEXMGR *mtxmgr;
+ DB_MUTEXREGION *mtxregion;
+ void *chunk;
+ db_mutex_t i;
+ u_int32_t counts[MTX_MAX_ENTRY + 2];
+ uintmax_t size;
+ int alloc_id;
+
+ mtxmgr = env->mutex_handle;
+ mtxregion = mtxmgr->reginfo.primary;
+ memset(counts, 0, sizeof(counts));
+ size = 0;
+
+ if (F_ISSET(env, ENV_PRIVATE)) {
+ mutexp = (DB_MUTEX *)mtxmgr->mutex_array + 1;
+ chunk = NULL;
+ size = __env_elem_size(env,
+ ROFF_TO_P(mtxregion->mutex_off_alloc));
+ size -= sizeof(*mutexp);
+ } else
+ mutexp = MUTEXP_SET(env, 1);
+ for (i = 1; i <= mtxregion->stat.st_mutex_cnt; ++i) {
+ if (!F_ISSET(mutexp, DB_MUTEX_ALLOCATED))
+ counts[0]++;
+ else if (mutexp->alloc_id > MTX_MAX_ENTRY)
+ counts[MTX_MAX_ENTRY + 1]++;
+ else
+ counts[mutexp->alloc_id]++;
+
+ mutexp++;
+ if (F_ISSET(env, ENV_PRIVATE) &&
+ (size -= sizeof(*mutexp)) < sizeof(*mutexp)) {
+ mutexp =
+ __env_get_chunk(&mtxmgr->reginfo, &chunk, &size);
+ }
+ mutexp = ALIGNP_INC(mutexp, mtxregion->stat.st_mutex_align);
+ }
+ __db_msg(env, "Mutex counts");
+	__db_msg(env, "%lu\tUnallocated", (u_long)counts[0]);
+	for (alloc_id = 1; alloc_id <= MTX_MAX_ENTRY + 1; alloc_id++)
+		if (counts[alloc_id] != 0)
+			__db_msg(env, "%lu\t%s",
+			    (u_long)counts[alloc_id],
+			    __mutex_print_id(alloc_id));
+}
+
+/*
+ * __mutex_print_stats --
+ * Display default mutex region statistics.
+ */
+static int
+__mutex_print_stats(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ DB_MUTEX_STAT *sp;
+ int ret;
+
+ if ((ret = __mutex_stat(env, &sp, LF_ISSET(DB_STAT_CLEAR))) != 0)
+ return (ret);
+
+ if (LF_ISSET(DB_STAT_ALL))
+ __db_msg(env, "Default mutex region information:");
+
+ __db_dlbytes(env, "Mutex region size",
+ (u_long)0, (u_long)0, (u_long)sp->st_regsize);
+ __db_dlbytes(env, "Mutex region max size",
+ (u_long)0, (u_long)0, (u_long)sp->st_regmax);
+ __db_dl_pct(env,
+ "The number of region locks that required waiting",
+ (u_long)sp->st_region_wait, DB_PCT(sp->st_region_wait,
+ sp->st_region_wait + sp->st_region_nowait), NULL);
+ STAT_ULONG("Mutex alignment", sp->st_mutex_align);
+ STAT_ULONG("Mutex test-and-set spins", sp->st_mutex_tas_spins);
+ STAT_ULONG("Mutex initial count", sp->st_mutex_init);
+ STAT_ULONG("Mutex total count", sp->st_mutex_cnt);
+ STAT_ULONG("Mutex max count", sp->st_mutex_max);
+ STAT_ULONG("Mutex free count", sp->st_mutex_free);
+ STAT_ULONG("Mutex in-use count", sp->st_mutex_inuse);
+ STAT_ULONG("Mutex maximum in-use count", sp->st_mutex_inuse_max);
+
+ __os_ufree(env, sp);
+
+ return (0);
+}
+
+/*
+ * __mutex_print_all --
+ * Display debugging mutex region statistics.
+ */
+static int
+__mutex_print_all(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ static const FN fn[] = {
+ { DB_MUTEX_ALLOCATED, "alloc" },
+ { DB_MUTEX_LOCKED, "locked" },
+ { DB_MUTEX_LOGICAL_LOCK, "logical" },
+ { DB_MUTEX_PROCESS_ONLY, "process-private" },
+ { DB_MUTEX_SELF_BLOCK, "self-block" },
+ { 0, NULL }
+ };
+ DB_MSGBUF mb, *mbp;
+ DB_MUTEX *mutexp;
+ DB_MUTEXMGR *mtxmgr;
+ DB_MUTEXREGION *mtxregion;
+ db_mutex_t i;
+ uintmax_t size;
+ void *chunk;
+
+ DB_MSGBUF_INIT(&mb);
+ mbp = &mb;
+
+ mtxmgr = env->mutex_handle;
+ mtxregion = mtxmgr->reginfo.primary;
+
+ __db_print_reginfo(env, &mtxmgr->reginfo, "Mutex", flags);
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+
+ __db_msg(env, "DB_MUTEXREGION structure:");
+ __mutex_print_debug_single(env,
+ "DB_MUTEXREGION region mutex", mtxregion->mtx_region, flags);
+ STAT_ULONG("Size of the aligned mutex", mtxregion->mutex_size);
+ STAT_ULONG("Next free mutex", mtxregion->mutex_next);
+
+ /*
+ * The OOB mutex (MUTEX_INVALID) is 0, skip it.
+ *
+ * We're not holding the mutex region lock, so we're racing threads of
+	 * control allocating mutexes. That's OK; it just means we display or
+ * clear statistics while mutexes are moving.
+ */
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "mutex\twait/nowait, pct wait, holder, flags");
+ size = 0;
+ if (F_ISSET(env, ENV_PRIVATE)) {
+ mutexp = (DB_MUTEX *)mtxmgr->mutex_array + 1;
+ chunk = NULL;
+ size = __env_elem_size(env,
+ ROFF_TO_P(mtxregion->mutex_off_alloc));
+ size -= sizeof(*mutexp);
+ } else
+ mutexp = MUTEXP_SET(env, 1);
+ for (i = 1; i <= mtxregion->stat.st_mutex_cnt; ++i) {
+ if (F_ISSET(mutexp, DB_MUTEX_ALLOCATED)) {
+ __db_msgadd(env, mbp, "%5lu\t", (u_long)i);
+
+ __mutex_print_debug_stats(env, mbp,
+ F_ISSET(env, ENV_PRIVATE) ?
+ (db_mutex_t)mutexp : i, flags);
+
+ if (mutexp->alloc_id != 0)
+ __db_msgadd(env, mbp,
+ ", %s", __mutex_print_id(mutexp->alloc_id));
+
+ __db_prflags(env, mbp, mutexp->flags, fn, " (", ")");
+
+ DB_MSGBUF_FLUSH(env, mbp);
+ }
+
+ mutexp++;
+ if (F_ISSET(env, ENV_PRIVATE) &&
+ (size -= sizeof(*mutexp)) < sizeof(*mutexp)) {
+ mutexp =
+ __env_get_chunk(&mtxmgr->reginfo, &chunk, &size);
+ }
+ mutexp = ALIGNP_INC(mutexp, mtxregion->stat.st_mutex_align);
+ }
+
+ return (0);
+}
+
+/*
+ * __mutex_print_debug_single --
+ * Print mutex internal debugging statistics for a single mutex on a
+ * single output line.
+ *
+ * PUBLIC: void __mutex_print_debug_single
+ * PUBLIC: __P((ENV *, const char *, db_mutex_t, u_int32_t));
+ */
+void
+__mutex_print_debug_single(env, tag, mutex, flags)
+ ENV *env;
+ const char *tag;
+ db_mutex_t mutex;
+ u_int32_t flags;
+{
+ DB_MSGBUF mb, *mbp;
+
+ DB_MSGBUF_INIT(&mb);
+ mbp = &mb;
+
+ if (LF_ISSET(DB_STAT_SUBSYSTEM))
+ LF_CLR(DB_STAT_CLEAR);
+ __db_msgadd(env, mbp, "%lu\t%s ", (u_long)mutex, tag);
+ __mutex_print_debug_stats(env, mbp, mutex, flags);
+ DB_MSGBUF_FLUSH(env, mbp);
+}
+
+/*
+ * __mutex_print_debug_stats --
+ * Print mutex internal debugging statistics, that is, the statistics
+ * in the [] square brackets.
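+ *
+ *	A line might render as, e.g. (illustrative values):
+ *	[12/3456 0% !Own], i.e. wait/nowait acquisition counts, the
+ *	percentage of acquisitions that waited, and the holder's thread
+ *	ID or ownership state.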
+ *
+ * PUBLIC: void __mutex_print_debug_stats
+ * PUBLIC: __P((ENV *, DB_MSGBUF *, db_mutex_t, u_int32_t));
+ */
+void
+__mutex_print_debug_stats(env, mbp, mutex, flags)
+ ENV *env;
+ DB_MSGBUF *mbp;
+ db_mutex_t mutex;
+ u_int32_t flags;
+{
+ DB_ENV *dbenv;
+ DB_MUTEX *mutexp;
+ u_long value;
+ char buf[DB_THREADID_STRLEN];
+#if defined(HAVE_SHARED_LATCHES) && (defined(HAVE_MUTEX_HYBRID) || \
+ !defined(HAVE_MUTEX_PTHREADS))
+ int sharecount;
+#endif
+
+ if (mutex == MUTEX_INVALID) {
+ __db_msgadd(env, mbp, "[!Set]");
+ return;
+ }
+
+ dbenv = env->dbenv;
+ mutexp = MUTEXP_SET(env, mutex);
+
+ __db_msgadd(env, mbp, "[");
+ if ((value = mutexp->mutex_set_wait) < 10000000)
+ __db_msgadd(env, mbp, "%lu", value);
+ else
+ __db_msgadd(env, mbp, "%luM", value / 1000000);
+ if ((value = mutexp->mutex_set_nowait) < 10000000)
+ __db_msgadd(env, mbp, "/%lu", value);
+ else
+ __db_msgadd(env, mbp, "/%luM", value / 1000000);
+
+ __db_msgadd(env, mbp, " %d%% ",
+ DB_PCT(mutexp->mutex_set_wait,
+ mutexp->mutex_set_wait + mutexp->mutex_set_nowait));
+
+#if defined(HAVE_SHARED_LATCHES)
+ if (F_ISSET(mutexp, DB_MUTEX_SHARED)) {
+ __db_msgadd(env, mbp, " rd ");
+ if ((value = mutexp->mutex_set_rd_wait) < 10000000)
+ __db_msgadd(env, mbp, "%lu", value);
+ else
+ __db_msgadd(env, mbp, "%luM", value / 1000000);
+ if ((value = mutexp->mutex_set_rd_nowait) < 10000000)
+ __db_msgadd(env, mbp, "/%lu", value);
+ else
+ __db_msgadd(env, mbp, "/%luM", value / 1000000);
+ __db_msgadd(env, mbp, " %d%% ",
+ DB_PCT(mutexp->mutex_set_rd_wait,
+ mutexp->mutex_set_rd_wait + mutexp->mutex_set_rd_nowait));
+ }
+#endif
+
+ if (F_ISSET(mutexp, DB_MUTEX_LOCKED))
+ __db_msgadd(env, mbp, "%s]",
+ dbenv->thread_id_string(dbenv,
+ mutexp->pid, mutexp->tid, buf));
+ /* Pthreads-based shared latches do not expose the share count. */
+#if defined(HAVE_SHARED_LATCHES) && (defined(HAVE_MUTEX_HYBRID) || \
+ !defined(HAVE_MUTEX_PTHREADS))
+ else if (F_ISSET(mutexp, DB_MUTEX_SHARED) &&
+ (sharecount = atomic_read(&mutexp->sharecount)) != 0) {
+ if (sharecount == 1)
+ __db_msgadd(env, mbp, "1 reader");
+ else
+ __db_msgadd(env, mbp, "%d readers", sharecount);
+ /* Show the thread which last acquired the latch. */
+ __db_msgadd(env, mbp, " %s]",
+ dbenv->thread_id_string(dbenv,
+ mutexp->pid, mutexp->tid, buf));
+ }
+#endif
+ else
+ __db_msgadd(env, mbp, "!Own]");
+
+#ifdef HAVE_MUTEX_HYBRID
+ if (mutexp->hybrid_wait != 0 || mutexp->hybrid_wakeup != 0)
+ __db_msgadd(env, mbp, " <wakeups %d/%d>",
+ mutexp->hybrid_wait, mutexp->hybrid_wakeup);
+#endif
+
+ if (LF_ISSET(DB_STAT_CLEAR))
+ __mutex_clear(env, mutex);
+}
+
+static const char *
+__mutex_print_id(alloc_id)
+ int alloc_id;
+{
+ switch (alloc_id) {
+ case MTX_APPLICATION: return ("application allocated");
+ case MTX_ATOMIC_EMULATION: return ("atomic emulation");
+ case MTX_DB_HANDLE: return ("db handle");
+ case MTX_ENV_DBLIST: return ("env dblist");
+ case MTX_ENV_EXCLDBLIST: return ("env exclusive dblist");
+ case MTX_ENV_HANDLE: return ("env handle");
+ case MTX_ENV_REGION: return ("env region");
+ case MTX_LOCK_REGION: return ("lock region");
+ case MTX_LOGICAL_LOCK: return ("logical lock");
+ case MTX_LOG_FILENAME: return ("log filename");
+ case MTX_LOG_FLUSH: return ("log flush");
+ case MTX_LOG_HANDLE: return ("log handle");
+ case MTX_LOG_REGION: return ("log region");
+ case MTX_MPOOLFILE_HANDLE: return ("mpoolfile handle");
+ case MTX_MPOOL_BH: return ("mpool buffer");
+ case MTX_MPOOL_FH: return ("mpool filehandle");
+ case MTX_MPOOL_FILE_BUCKET: return ("mpool file bucket");
+ case MTX_MPOOL_HANDLE: return ("mpool handle");
+ case MTX_MPOOL_HASH_BUCKET: return ("mpool hash bucket");
+ case MTX_MPOOL_REGION: return ("mpool region");
+ case MTX_MUTEX_REGION: return ("mutex region");
+ case MTX_MUTEX_TEST: return ("mutex test");
+ case MTX_REPMGR: return ("replication manager");
+ case MTX_REP_CHKPT: return ("replication checkpoint");
+ case MTX_REP_DATABASE: return ("replication database");
+ case MTX_REP_DIAG: return ("replication diagnostics");
+ case MTX_REP_EVENT: return ("replication event");
+ case MTX_REP_REGION: return ("replication region");
+ case MTX_REP_START: return ("replication role config");
+ case MTX_REP_WAITER: return ("replication txn apply");
+ case MTX_SEQUENCE: return ("sequence");
+ case MTX_TWISTER: return ("twister");
+ case MTX_TCL_EVENTS: return ("Tcl events");
+ case MTX_TXN_ACTIVE: return ("txn active list");
+ case MTX_TXN_CHKPT: return ("transaction checkpoint");
+ case MTX_TXN_COMMIT: return ("txn commit");
+ case MTX_TXN_MVCC: return ("txn mvcc");
+ case MTX_TXN_REGION: return ("txn region");
+ default: return ("unknown mutex type");
+ /* NOTREACHED */
+ }
+}
+
+/*
+ * __mutex_set_wait_info --
+ * Return mutex statistics.
+ *
+ * PUBLIC: void __mutex_set_wait_info
+ * PUBLIC: __P((ENV *, db_mutex_t, uintmax_t *, uintmax_t *));
+ */
+void
+__mutex_set_wait_info(env, mutex, waitp, nowaitp)
+ ENV *env;
+ db_mutex_t mutex;
+ uintmax_t *waitp, *nowaitp;
+{
+ DB_MUTEX *mutexp;
+
+ if (mutex == MUTEX_INVALID) {
+ *waitp = 0;
+ *nowaitp = 0;
+ return;
+ }
+ mutexp = MUTEXP_SET(env, mutex);
+
+ *waitp = mutexp->mutex_set_wait;
+ *nowaitp = mutexp->mutex_set_nowait;
+}
+
+/*
+ * __mutex_clear --
+ * Clear mutex statistics.
+ *
+ * PUBLIC: void __mutex_clear __P((ENV *, db_mutex_t));
+ */
+void
+__mutex_clear(env, mutex)
+ ENV *env;
+ db_mutex_t mutex;
+{
+ DB_MUTEX *mutexp;
+
+ if (!MUTEX_ON(env))
+ return;
+
+ mutexp = MUTEXP_SET(env, mutex);
+
+ mutexp->mutex_set_wait = mutexp->mutex_set_nowait = 0;
+#ifdef HAVE_SHARED_LATCHES
+ mutexp->mutex_set_rd_wait = mutexp->mutex_set_rd_nowait = 0;
+#endif
+#ifdef HAVE_MUTEX_HYBRID
+ mutexp->hybrid_wait = mutexp->hybrid_wakeup = 0;
+#endif
+}
+
+#else /* !HAVE_STATISTICS */
+
+int
+__mutex_stat_pp(dbenv, statp, flags)
+ DB_ENV *dbenv;
+ DB_MUTEX_STAT **statp;
+ u_int32_t flags;
+{
+ COMPQUIET(statp, NULL);
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(dbenv->env));
+}
+
+int
+__mutex_stat_print_pp(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(dbenv->env));
+}
+#endif
diff --git a/src/mutex/mut_stub.c b/src/mutex/mut_stub.c
new file mode 100644
index 00000000..61ecc80c
--- /dev/null
+++ b/src/mutex/mut_stub.c
@@ -0,0 +1,252 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#ifndef HAVE_MUTEX_SUPPORT
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+
+/*
+ * If the library wasn't compiled with mutex support, various routines
+ * aren't available. Stub them here, returning an appropriate error.
+ */
+static int __db_nomutex __P((ENV *));
+
+/*
+ * __db_nomutex --
+ * Error when a Berkeley DB build doesn't include mutexes.
+ */
+static int
+__db_nomutex(env)
+ ENV *env;
+{
+ __db_errx(env, DB_STR("2001",
+ "library build did not include support for mutexes"));
+ return (DB_OPNOTSUP);
+}
+
+int
+__mutex_alloc_pp(dbenv, flags, indxp)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+ db_mutex_t *indxp;
+{
+ COMPQUIET(flags, 0);
+ COMPQUIET(indxp, NULL);
+ return (__db_nomutex(dbenv->env));
+}
+
+int
+__mutex_alloc(env, alloc_id, flags, indxp)
+ ENV *env;
+ int alloc_id;
+ u_int32_t flags;
+ db_mutex_t *indxp;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(alloc_id, 0);
+ COMPQUIET(flags, 0);
+ *indxp = MUTEX_INVALID;
+ return (0);
+}
+
+void
+__mutex_clear(env, mutex)
+ ENV *env;
+ db_mutex_t mutex;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(mutex, MUTEX_INVALID);
+}
+
+int
+__mutex_free_pp(dbenv, indx)
+ DB_ENV *dbenv;
+ db_mutex_t indx;
+{
+ COMPQUIET(indx, 0);
+ return (__db_nomutex(dbenv->env));
+}
+
+int
+__mutex_free(env, indxp)
+ ENV *env;
+ db_mutex_t *indxp;
+{
+ COMPQUIET(env, NULL);
+ *indxp = MUTEX_INVALID;
+ return (0);
+}
+
+int
+__mutex_get_align(dbenv, alignp)
+ DB_ENV *dbenv;
+ u_int32_t *alignp;
+{
+ COMPQUIET(alignp, NULL);
+ return (__db_nomutex(dbenv->env));
+}
+
+int
+__mutex_get_increment(dbenv, incrementp)
+ DB_ENV *dbenv;
+ u_int32_t *incrementp;
+{
+ COMPQUIET(incrementp, NULL);
+ return (__db_nomutex(dbenv->env));
+}
+
+int
+__mutex_get_max(dbenv, maxp)
+ DB_ENV *dbenv;
+ u_int32_t *maxp;
+{
+ COMPQUIET(maxp, NULL);
+ return (__db_nomutex(dbenv->env));
+}
+
+int
+__mutex_get_tas_spins(dbenv, tas_spinsp)
+ DB_ENV *dbenv;
+ u_int32_t *tas_spinsp;
+{
+ COMPQUIET(tas_spinsp, NULL);
+ return (__db_nomutex(dbenv->env));
+}
+
+int
+__mutex_lock_pp(dbenv, indx)
+ DB_ENV *dbenv;
+ db_mutex_t indx;
+{
+ COMPQUIET(indx, 0);
+ return (__db_nomutex(dbenv->env));
+}
+
+void
+__mutex_print_debug_single(env, tag, mutex, flags)
+ ENV *env;
+ const char *tag;
+ db_mutex_t mutex;
+ u_int32_t flags;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(tag, NULL);
+ COMPQUIET(mutex, MUTEX_INVALID);
+ COMPQUIET(flags, 0);
+}
+
+void
+__mutex_print_debug_stats(env, mbp, mutex, flags)
+ ENV *env;
+ DB_MSGBUF *mbp;
+ db_mutex_t mutex;
+ u_int32_t flags;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(mbp, NULL);
+ COMPQUIET(mutex, MUTEX_INVALID);
+ COMPQUIET(flags, 0);
+}
+
+int
+__mutex_set_align(dbenv, align)
+ DB_ENV *dbenv;
+ u_int32_t align;
+{
+ COMPQUIET(align, 0);
+ return (__db_nomutex(dbenv->env));
+}
+
+int
+__mutex_set_increment(dbenv, increment)
+ DB_ENV *dbenv;
+ u_int32_t increment;
+{
+ COMPQUIET(increment, 0);
+ return (__db_nomutex(dbenv->env));
+}
+
+int
+__mutex_get_init(dbenv, initp)
+ DB_ENV *dbenv;
+ u_int32_t *initp;
+{
+ COMPQUIET(initp, 0);
+ return (__db_nomutex(dbenv->env));
+}
+
+int
+__mutex_set_init(dbenv, init)
+ DB_ENV *dbenv;
+ u_int32_t init;
+{
+ COMPQUIET(init, 0);
+ return (__db_nomutex(dbenv->env));
+}
+
+int
+__mutex_set_max(dbenv, max)
+ DB_ENV *dbenv;
+ u_int32_t max;
+{
+ COMPQUIET(max, 0);
+ return (__db_nomutex(dbenv->env));
+}
+
+int
+__mutex_set_tas_spins(dbenv, tas_spins)
+ DB_ENV *dbenv;
+ u_int32_t tas_spins;
+{
+ COMPQUIET(tas_spins, 0);
+ return (__db_nomutex(dbenv->env));
+}
+
+void
+__mutex_set_wait_info(env, mutex, waitp, nowaitp)
+ ENV *env;
+ db_mutex_t mutex;
+ uintmax_t *waitp, *nowaitp;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(mutex, MUTEX_INVALID);
+ *waitp = *nowaitp = 0;
+}
+
+int
+__mutex_stat_pp(dbenv, statp, flags)
+ DB_ENV *dbenv;
+ DB_MUTEX_STAT **statp;
+ u_int32_t flags;
+{
+ COMPQUIET(statp, NULL);
+ COMPQUIET(flags, 0);
+ return (__db_nomutex(dbenv->env));
+}
+
+int
+__mutex_stat_print_pp(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+ return (__db_nomutex(dbenv->env));
+}
+
+int
+__mutex_unlock_pp(dbenv, indx)
+ DB_ENV *dbenv;
+ db_mutex_t indx;
+{
+ COMPQUIET(indx, 0);
+ return (__db_nomutex(dbenv->env));
+}
+#endif /* !HAVE_MUTEX_SUPPORT */
diff --git a/src/mutex/mut_tas.c b/src/mutex/mut_tas.c
new file mode 100644
index 00000000..0899d237
--- /dev/null
+++ b/src/mutex/mut_tas.c
@@ -0,0 +1,608 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/lock.h"
+
+static inline int __db_tas_mutex_lock_int
+ __P((ENV *, db_mutex_t, db_timeout_t, int));
+static inline int __db_tas_mutex_readlock_int __P((ENV *, db_mutex_t, int));
+
+/*
+ * __db_tas_mutex_init --
+ * Initialize a test-and-set mutex.
+ *
+ * PUBLIC: int __db_tas_mutex_init __P((ENV *, db_mutex_t, u_int32_t));
+ */
+int
+__db_tas_mutex_init(env, mutex, flags)
+ ENV *env;
+ db_mutex_t mutex;
+ u_int32_t flags;
+{
+ DB_ENV *dbenv;
+ DB_MUTEX *mutexp;
+ int ret;
+
+#ifndef HAVE_MUTEX_HYBRID
+ COMPQUIET(flags, 0);
+#endif
+
+ dbenv = env->dbenv;
+ mutexp = MUTEXP_SET(env, mutex);
+
+ /* Check alignment. */
+ if (((uintptr_t)mutexp & (dbenv->mutex_align - 1)) != 0) {
+ __db_errx(env, DB_STR("2028",
+ "TAS: mutex not appropriately aligned"));
+ return (EINVAL);
+ }
+
+#ifdef HAVE_SHARED_LATCHES
+ if (F_ISSET(mutexp, DB_MUTEX_SHARED))
+ atomic_init(&mutexp->sharecount, 0);
+ else
+#endif
+ if (MUTEX_INIT(&mutexp->tas)) {
+ ret = __os_get_syserr();
+ __db_syserr(env, ret, DB_STR("2029",
+ "TAS: mutex initialize"));
+ return (__os_posix_err(ret));
+ }
+#ifdef HAVE_MUTEX_HYBRID
+ if ((ret = __db_pthread_mutex_init(env,
+ mutex, flags | DB_MUTEX_SELF_BLOCK)) != 0)
+ return (ret);
+#endif
+ return (0);
+}
+
+/*
+ * __db_tas_mutex_lock_int --
+ *	Internal function to lock a mutex, or just try to lock it without
+ *	waiting.
+ */
+static inline int
+__db_tas_mutex_lock_int(env, mutex, timeout, nowait)
+ ENV *env;
+ db_mutex_t mutex;
+ db_timeout_t timeout;
+ int nowait;
+{
+ DB_ENV *dbenv;
+ DB_MUTEX *mutexp;
+ DB_MUTEXMGR *mtxmgr;
+ DB_MUTEXREGION *mtxregion;
+ DB_THREAD_INFO *ip;
+ db_timespec now, timespec;
+ u_int32_t nspins;
+ int ret;
+#ifdef HAVE_MUTEX_HYBRID
+ const u_long micros = 0;
+#else
+ u_long micros, max_micros;
+ db_timeout_t time_left;
+#endif
+
+ dbenv = env->dbenv;
+
+ if (!MUTEX_ON(env) || F_ISSET(dbenv, DB_ENV_NOLOCKING))
+ return (0);
+
+ mtxmgr = env->mutex_handle;
+ mtxregion = mtxmgr->reginfo.primary;
+ mutexp = MUTEXP_SET(env, mutex);
+
+ CHECK_MTX_THREAD(env, mutexp);
+
+#ifdef HAVE_STATISTICS
+ if (F_ISSET(mutexp, DB_MUTEX_LOCKED))
+ STAT_INC(env, mutex, set_wait, mutexp->mutex_set_wait, mutex);
+ else
+ STAT_INC(env,
+ mutex, set_nowait, mutexp->mutex_set_nowait, mutex);
+#endif
+
+#ifndef HAVE_MUTEX_HYBRID
+ /*
+ * Wait 1ms initially, up to 10ms for mutexes backing logical database
+ * locks, and up to 25 ms for mutual exclusion data structure mutexes.
+ * SR: #7675
+ */
+ micros = 1000;
+ max_micros = F_ISSET(mutexp, DB_MUTEX_LOGICAL_LOCK) ? 10000 : 25000;
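+	/*
+	 * The yield below then doubles the wait on each pass, so the
+	 * backoff runs 1, 2, 4, ... ms until it reaches the cap above.
+	 */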
+#endif
+
+	/* Clear the ending timespec so it'll be initialized upon first need. */
+ if (timeout != 0)
+ timespecclear(&timespec);
+
+ /*
+ * Only check the thread state once, by initializing the thread
+ * control block pointer to null. If it is not the failchk
+ * thread, then ip will have a valid value subsequent times
+ * in the loop.
+ */
+ ip = NULL;
+
+loop: /* Attempt to acquire the resource for N spins. */
+ for (nspins =
+ mtxregion->stat.st_mutex_tas_spins; nspins > 0; --nspins) {
+#ifdef HAVE_MUTEX_S390_CC_ASSEMBLY
+ tsl_t zero;
+
+ zero = 0;
+#endif
+
+#ifdef HAVE_MUTEX_HPPA_MSEM_INIT
+ relock:
+#endif
+ /*
+ * Avoid interlocked instructions until they're likely to
+		 * succeed: first check whether the mutex appears to be
+		 * held (the classic test-and-test-and-set pattern).
+ */
+ if (MUTEXP_IS_BUSY(mutexp) || !MUTEXP_ACQUIRE(mutexp)) {
+ if (F_ISSET(dbenv, DB_ENV_FAILCHK) &&
+ ip == NULL && dbenv->is_alive(dbenv,
+ mutexp->pid, mutexp->tid, 0) == 0) {
+ ret = __env_set_state(env, &ip, THREAD_VERIFY);
+ if (ret != 0 ||
+ ip->dbth_state == THREAD_FAILCHK)
+ return (DB_RUNRECOVERY);
+ }
+ if (nowait)
+ return (DB_LOCK_NOTGRANTED);
+ /*
+ * Some systems (notably those with newer Intel CPUs)
+ * need a small pause here. [#6975]
+ */
+ MUTEX_PAUSE
+ continue;
+ }
+
+ MEMBAR_ENTER();
+
+#ifdef HAVE_MUTEX_HPPA_MSEM_INIT
+ /*
+ * HP semaphores are unlocked automatically when a holding
+ * process exits. If the mutex appears to be locked
+ * (F_ISSET(DB_MUTEX_LOCKED)) but we got here, assume this
+ * has happened. Set the pid and tid into the mutex and
+ * lock again. (The default state of the mutexes used to
+ * block in __lock_get_internal is locked, so exiting with
+ * a locked mutex is reasonable behavior for a process that
+ * happened to initialize or use one of them.)
+ */
+ if (F_ISSET(mutexp, DB_MUTEX_LOCKED)) {
+ dbenv->thread_id(dbenv, &mutexp->pid, &mutexp->tid);
+ goto relock;
+ }
+ /*
+ * If we make it here, the mutex isn't locked, the diagnostic
+ * won't fire, and we were really unlocked by someone calling
+ * the DB mutex unlock function.
+ */
+#endif
+#ifdef DIAGNOSTIC
+ if (F_ISSET(mutexp, DB_MUTEX_LOCKED)) {
+ char buf[DB_THREADID_STRLEN];
+ __db_errx(env, DB_STR_A("2030",
+ "TAS lock failed: lock %ld currently in use: ID: %s",
+ "%ld %s"), (long)mutex,
+ dbenv->thread_id_string(dbenv,
+ mutexp->pid, mutexp->tid, buf));
+ return (__env_panic(env, EACCES));
+ }
+#endif
+ F_SET(mutexp, DB_MUTEX_LOCKED);
+ dbenv->thread_id(dbenv, &mutexp->pid, &mutexp->tid);
+
+#ifdef DIAGNOSTIC
+ /*
+ * We want to switch threads as often as possible. Yield
+ * every time we get a mutex to ensure contention.
+ */
+ if (F_ISSET(dbenv, DB_ENV_YIELDCPU))
+ __os_yield(env, 0, 0);
+#endif
+ return (0);
+ }
+
+ /*
+ * We need to wait for the lock to become available.
+	 * Possibly set up timeouts if this is the first wait, or
+ * check expiration times for the second and subsequent waits.
+ */
+ if (timeout != 0) {
+		/* Set the expiration time if this is the first sleep. */
+ if (!timespecisset(&timespec))
+ __clock_set_expires(env, &timespec, timeout);
+ else {
+ timespecclear(&now);
+ if (__clock_expired(env, &now, &timespec))
+ return (DB_TIMEOUT);
+#ifndef HAVE_MUTEX_HYBRID
+ timespecsub(&now, &timespec);
+ DB_TIMESPEC_TO_TIMEOUT(time_left, &now, 0);
+ time_left = timeout - time_left;
+ if (micros > time_left)
+ micros = time_left;
+#endif
+ }
+ }
+
+ /*
+	 * This yields for a while for tas mutexes, and just gives up the
+	 * processor for hybrid mutexes. By yielding here we can often get
+	 * the other thread to release the mutex before we make the more
+	 * expensive library mutex call. Tests have shown this to be a big
+	 * win when there is contention.
+ */
+ PERFMON4(env, mutex, suspend, mutex, TRUE, mutexp->alloc_id, mutexp);
+ __os_yield(env, 0, micros);
+ PERFMON4(env, mutex, resume, mutex, TRUE, mutexp->alloc_id, mutexp);
+
+#if defined(HAVE_MUTEX_HYBRID)
+ if (!MUTEXP_IS_BUSY(mutexp))
+ goto loop;
+ /* Wait until the mutex can be obtained exclusively or it times out. */
+ if ((ret = __db_hybrid_mutex_suspend(env,
+ mutex, timeout == 0 ? NULL : &timespec, TRUE)) != 0)
+ return (ret);
+#else
+ if ((micros <<= 1) > max_micros)
+ micros = max_micros;
+#endif
+
+ /*
+ * We're spinning. The environment might be hung, and somebody else
+ * has already recovered it. The first thing recovery does is panic
+ * the environment. Check to see if we're never going to get this
+ * mutex.
+ */
+ PANIC_CHECK(env);
+
+ goto loop;
+}
+
+/*
+ * __db_tas_mutex_lock
+ * Lock on a mutex, blocking if necessary.
+ *
+ * PUBLIC: int __db_tas_mutex_lock __P((ENV *, db_mutex_t, db_timeout_t));
+ */
+int
+__db_tas_mutex_lock(env, mutex, timeout)
+ ENV *env;
+ db_mutex_t mutex;
+ db_timeout_t timeout;
+{
+ return (__db_tas_mutex_lock_int(env, mutex, timeout, 0));
+}
+
+/*
+ * __db_tas_mutex_trylock
+ * Try to exclusively lock a mutex without ever blocking - ever!
+ *
+ * Returns 0 on success,
+ * DB_LOCK_NOTGRANTED on timeout
+ * Possibly DB_RUNRECOVERY if DB_ENV_FAILCHK or panic.
+ *
+ * This will work for DB_MUTEX_SHARED, though it always tries
+ * for exclusive access.
+ *
+ * PUBLIC: int __db_tas_mutex_trylock __P((ENV *, db_mutex_t));
+ */
+int
+__db_tas_mutex_trylock(env, mutex)
+ ENV *env;
+ db_mutex_t mutex;
+{
+ return (__db_tas_mutex_lock_int(env, mutex, 0, 1));
+}
+
+#if defined(HAVE_SHARED_LATCHES)
+/*
+ * __db_tas_mutex_readlock_int
+ * Internal function to get a shared lock on a latch, blocking if necessary.
+ *
+ */
+static inline int
+__db_tas_mutex_readlock_int(env, mutex, nowait)
+ ENV *env;
+ db_mutex_t mutex;
+ int nowait;
+{
+ DB_ENV *dbenv;
+ DB_MUTEX *mutexp;
+ DB_MUTEXMGR *mtxmgr;
+ DB_MUTEXREGION *mtxregion;
+ DB_THREAD_INFO *ip;
+ int lock;
+ u_int32_t nspins;
+ int ret;
+#ifndef HAVE_MUTEX_HYBRID
+ u_long micros, max_micros;
+#endif
+ dbenv = env->dbenv;
+
+ if (!MUTEX_ON(env) || F_ISSET(dbenv, DB_ENV_NOLOCKING))
+ return (0);
+
+ mtxmgr = env->mutex_handle;
+ mtxregion = mtxmgr->reginfo.primary;
+ mutexp = MUTEXP_SET(env, mutex);
+
+ CHECK_MTX_THREAD(env, mutexp);
+
+ DB_ASSERT(env, F_ISSET(mutexp, DB_MUTEX_SHARED));
+#ifdef HAVE_STATISTICS
+ if (F_ISSET(mutexp, DB_MUTEX_LOCKED))
+ STAT_INC(env,
+ mutex, set_rd_wait, mutexp->mutex_set_rd_wait, mutex);
+ else
+ STAT_INC(env,
+ mutex, set_rd_nowait, mutexp->mutex_set_rd_nowait, mutex);
+#endif
+
+#ifndef HAVE_MUTEX_HYBRID
+ /*
+ * Wait 1ms initially, up to 10ms for mutexes backing logical database
+ * locks, and up to 25 ms for mutual exclusion data structure mutexes.
+ * SR: #7675
+ */
+ micros = 1000;
+ max_micros = F_ISSET(mutexp, DB_MUTEX_LOGICAL_LOCK) ? 10000 : 25000;
+#endif
+
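+	/*
+	 * The share count doubles as the lock word: MUTEX_SHARE_ISEXCLUSIVE
+	 * marks a write lock; otherwise it holds the current reader count,
+	 * and a reader enters by atomically incrementing it.
+	 */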
+loop: /* Attempt to acquire the resource for N spins. */
+ for (nspins =
+ mtxregion->stat.st_mutex_tas_spins; nspins > 0; --nspins) {
+ lock = atomic_read(&mutexp->sharecount);
+ if (lock == MUTEX_SHARE_ISEXCLUSIVE ||
+ !atomic_compare_exchange(env,
+ &mutexp->sharecount, lock, lock + 1)) {
+ /*
+ * Some systems (notably those with newer Intel CPUs)
+ * need a small pause here. [#6975]
+ */
+ MUTEX_PAUSE
+ continue;
+ }
+
+ MEMBAR_ENTER();
+		/* For shared latches the threadid is the last requestor's. */
+ dbenv->thread_id(dbenv, &mutexp->pid, &mutexp->tid);
+
+ return (0);
+ }
+
+ /*
+	 * Waiting for the latch must be avoided when it could allow a
+ * 'failchk'ing thread to hang.
+ */
+ if (F_ISSET(dbenv, DB_ENV_FAILCHK) &&
+ dbenv->is_alive(dbenv, mutexp->pid, mutexp->tid, 0) == 0) {
+ ret = __env_set_state(env, &ip, THREAD_VERIFY);
+ if (ret != 0 || ip->dbth_state == THREAD_FAILCHK)
+ return (DB_RUNRECOVERY);
+ }
+
+ /*
+ * It is possible to spin out when the latch is just shared, due to
+ * many threads or interrupts interfering with the compare&exchange.
+ * Avoid spurious DB_LOCK_NOTGRANTED returns by retrying.
+ */
+ if (nowait) {
+ if (atomic_read(&mutexp->sharecount) != MUTEX_SHARE_ISEXCLUSIVE)
+ goto loop;
+ return (DB_LOCK_NOTGRANTED);
+ }
+
+ /* Wait for the lock to become available. */
+#ifdef HAVE_MUTEX_HYBRID
+ /*
+ * By yielding here we can get the other thread to give up the
+ * mutex before calling the more expensive library mutex call.
+ * Tests have shown this to be a big win when there is contention.
+ */
+ PERFMON4(env, mutex, suspend, mutex, FALSE, mutexp->alloc_id, mutexp);
+ __os_yield(env, 0, 0);
+ PERFMON4(env, mutex, resume, mutex, FALSE, mutexp->alloc_id, mutexp);
+ if (atomic_read(&mutexp->sharecount) != MUTEX_SHARE_ISEXCLUSIVE)
+ goto loop;
+ /* Wait until the mutex is no longer exclusively locked. */
+ if ((ret = __db_hybrid_mutex_suspend(env, mutex, NULL, FALSE)) != 0)
+ return (ret);
+#else
+ PERFMON4(env, mutex, suspend, mutex, FALSE, mutexp->alloc_id, mutexp);
+ __os_yield(env, 0, micros);
+ PERFMON4(env, mutex, resume, mutex, FALSE, mutexp->alloc_id, mutexp);
+ if ((micros <<= 1) > max_micros)
+ micros = max_micros;
+#endif
+
+ /*
+ * We're spinning. The environment might be hung, and somebody else
+ * has already recovered it. The first thing recovery does is panic
+ * the environment. Check to see if we're never going to get this
+ * mutex.
+ */
+ PANIC_CHECK(env);
+
+ goto loop;
+}
+
+/*
+ * __db_tas_mutex_readlock
+ * Get a shared lock on a latch, waiting if necessary.
+ *
+ * PUBLIC: #if defined(HAVE_SHARED_LATCHES)
+ * PUBLIC: int __db_tas_mutex_readlock __P((ENV *, db_mutex_t));
+ * PUBLIC: #endif
+ */
+int
+__db_tas_mutex_readlock(env, mutex)
+ ENV *env;
+ db_mutex_t mutex;
+{
+ return (__db_tas_mutex_readlock_int(env, mutex, 0));
+}
+
+/*
+ * __db_tas_mutex_tryreadlock
+ * Try to get a shared lock on a latch; don't wait when busy.
+ *
+ * PUBLIC: #if defined(HAVE_SHARED_LATCHES)
+ * PUBLIC: int __db_tas_mutex_tryreadlock __P((ENV *, db_mutex_t));
+ * PUBLIC: #endif
+ */
+int
+__db_tas_mutex_tryreadlock(env, mutex)
+ ENV *env;
+ db_mutex_t mutex;
+{
+ return (__db_tas_mutex_readlock_int(env, mutex, 1));
+}
+#endif
+
+/*
+ * __db_tas_mutex_unlock --
+ * Release a mutex.
+ *
+ * PUBLIC: int __db_tas_mutex_unlock __P((ENV *, db_mutex_t));
+ *
+ * Hybrid shared latch wakeup
+ * When an exclusive requester waits for the last shared holder to
+ * release, it increments mutexp->wait and pthread_cond_wait()'s. The
+ * last shared unlock calls __db_pthread_mutex_unlock() to wake it.
+ */
+int
+__db_tas_mutex_unlock(env, mutex)
+ ENV *env;
+ db_mutex_t mutex;
+{
+ DB_ENV *dbenv;
+ DB_MUTEX *mutexp;
+#ifdef HAVE_MUTEX_HYBRID
+ int ret;
+#ifdef MUTEX_DIAG
+ int waiters;
+#endif
+#endif
+#ifdef HAVE_SHARED_LATCHES
+ int sharecount;
+#endif
+ dbenv = env->dbenv;
+
+ if (!MUTEX_ON(env) || F_ISSET(dbenv, DB_ENV_NOLOCKING))
+ return (0);
+
+ mutexp = MUTEXP_SET(env, mutex);
+#if defined(HAVE_MUTEX_HYBRID) && defined(MUTEX_DIAG)
+ waiters = mutexp->wait;
+#endif
+
+#if defined(DIAGNOSTIC)
+#if defined(HAVE_SHARED_LATCHES)
+ if (F_ISSET(mutexp, DB_MUTEX_SHARED)) {
+ if (atomic_read(&mutexp->sharecount) == 0) {
+ __db_errx(env, DB_STR_A("2031",
+ "shared unlock %ld already unlocked", "%ld"),
+ (long)mutex);
+ return (__env_panic(env, EACCES));
+ }
+ } else
+#endif
+ if (!F_ISSET(mutexp, DB_MUTEX_LOCKED)) {
+ __db_errx(env, DB_STR_A("2032",
+ "unlock %ld already unlocked", "%ld"), (long)mutex);
+ return (__env_panic(env, EACCES));
+ }
+#endif
+
+#ifdef HAVE_SHARED_LATCHES
+ if (F_ISSET(mutexp, DB_MUTEX_SHARED)) {
+ sharecount = atomic_read(&mutexp->sharecount);
+ /*MUTEX_MEMBAR(mutexp->sharecount);*/ /* XXX why? */
+ if (sharecount == MUTEX_SHARE_ISEXCLUSIVE) {
+ F_CLR(mutexp, DB_MUTEX_LOCKED);
+ /* Flush flag update before zeroing count */
+ MEMBAR_EXIT();
+ atomic_init(&mutexp->sharecount, 0);
+ } else {
+ DB_ASSERT(env, sharecount > 0);
+ MEMBAR_EXIT();
+ sharecount = atomic_dec(env, &mutexp->sharecount);
+ DB_ASSERT(env, sharecount >= 0);
+ if (sharecount > 0)
+ return (0);
+ }
+ } else
+#endif
+ {
+ F_CLR(mutexp, DB_MUTEX_LOCKED);
+ MUTEX_UNSET(&mutexp->tas);
+ }
+
+#ifdef HAVE_MUTEX_HYBRID
+#ifdef DIAGNOSTIC
+ if (F_ISSET(dbenv, DB_ENV_YIELDCPU))
+ __os_yield(env, 0, 0);
+#endif
+
+ /* Prevent the load of wait from being hoisted before MUTEX_UNSET */
+ MUTEX_MEMBAR(mutexp->flags);
+ if (mutexp->wait &&
+ (ret = __db_pthread_mutex_unlock(env, mutex)) != 0)
+ return (ret);
+
+#ifdef MUTEX_DIAG
+ if (mutexp->wait)
+ printf("tas_unlock %ld %x waiters! busy %x waiters %d/%d\n",
+ mutex, pthread_self(),
+ MUTEXP_BUSY_FIELD(mutexp), waiters, mutexp->wait);
+#endif
+#endif
+
+ return (0);
+}
+
+/*
+ * __db_tas_mutex_destroy --
+ * Destroy a mutex.
+ *
+ * PUBLIC: int __db_tas_mutex_destroy __P((ENV *, db_mutex_t));
+ */
+int
+__db_tas_mutex_destroy(env, mutex)
+ ENV *env;
+ db_mutex_t mutex;
+{
+ DB_MUTEX *mutexp;
+#ifdef HAVE_MUTEX_HYBRID
+ int ret;
+#endif
+
+ if (!MUTEX_ON(env))
+ return (0);
+
+ mutexp = MUTEXP_SET(env, mutex);
+
+ MUTEX_DESTROY(&mutexp->tas);
+
+#ifdef HAVE_MUTEX_HYBRID
+ if ((ret = __db_pthread_mutex_destroy(env, mutex)) != 0)
+ return (ret);
+#endif
+
+ COMPQUIET(mutexp, NULL); /* MUTEX_DESTROY may not be defined. */
+ return (0);
+}
diff --git a/src/mutex/mut_win32.c b/src/mutex/mut_win32.c
new file mode 100644
index 00000000..07d5a8dd
--- /dev/null
+++ b/src/mutex/mut_win32.c
@@ -0,0 +1,589 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2002, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#define LOAD_ACTUAL_MUTEX_CODE
+#include "db_int.h"
+
+#include "dbinc/atomic.h"
+/*
+ * This is where we load in the actual mutex declarations.
+ */
+#include "dbinc/mutex_int.h"
+
+/*
+ * Common code to get an event handle. This is executed whenever a mutex
+ * blocks, or when unlocking a mutex that a thread is waiting on. We can't
+ * keep these handles around, since the mutex structure is in shared memory,
+ * and each process gets its own handle value.
+ *
+ * We pass security attributes so that the created event is accessible by all
+ * users, in case a Windows service is sharing an environment with a local
+ * process run as a different user.
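+ *
+ * For example (illustrative): a mutex whose id is 0x1a2b yields the
+ * event name "db.m00001a2b"; every process waiting on that mutex
+ * recreates the same named event on demand.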
+ */
+static _TCHAR hex_digits[] = _T("0123456789abcdef");
+
+static __inline int get_handle(env, mutexp, eventp)
+ ENV *env;
+ DB_MUTEX *mutexp;
+ HANDLE *eventp;
+{
+ _TCHAR idbuf[] = _T("db.m00000000");
+ _TCHAR *p = idbuf + 12;
+ int ret = 0;
+ u_int32_t id;
+
+ for (id = (mutexp)->id; id != 0; id >>= 4)
+ *--p = hex_digits[id & 0xf];
+
+#ifndef DB_WINCE
+ if (DB_GLOBAL(win_sec_attr) == NULL) {
+ InitializeSecurityDescriptor(&DB_GLOBAL(win_default_sec_desc),
+ SECURITY_DESCRIPTOR_REVISION);
+ SetSecurityDescriptorDacl(&DB_GLOBAL(win_default_sec_desc),
+ TRUE, 0, FALSE);
+ DB_GLOBAL(win_default_sec_attr).nLength =
+ sizeof(SECURITY_ATTRIBUTES);
+ DB_GLOBAL(win_default_sec_attr).bInheritHandle = FALSE;
+ DB_GLOBAL(win_default_sec_attr).lpSecurityDescriptor =
+ &DB_GLOBAL(win_default_sec_desc);
+ DB_GLOBAL(win_sec_attr) = &DB_GLOBAL(win_default_sec_attr);
+ }
+#endif
+
+ if ((*eventp = CreateEvent(DB_GLOBAL(win_sec_attr),
+ FALSE, FALSE, idbuf)) == NULL) {
+ ret = __os_get_syserr();
+ __db_syserr(env, ret, DB_STR("2002",
+ "Win32 create event failed"));
+ }
+
+ return (ret);
+}
+
+/*
+ * __db_win32_mutex_lock_int
+ * Internal function to lock a win32 mutex
+ *
+ * If the wait parameter is 0, this function will return DB_LOCK_NOTGRANTED
+ * rather than wait.
+ *
+ */
+static __inline int
+__db_win32_mutex_lock_int(env, mutex, timeout, wait)
+ ENV *env;
+ db_mutex_t mutex;
+ db_timeout_t timeout;
+ int wait;
+{
+ DB_ENV *dbenv;
+ DB_MUTEX *mutexp;
+ DB_MUTEXMGR *mtxmgr;
+ DB_MUTEXREGION *mtxregion;
+ DB_THREAD_INFO *ip;
+ HANDLE event;
+ u_int32_t ms, nspins;
+ db_timespec now, tempspec, timeoutspec;
+ db_timeout_t time_left;
+ int ret;
+#ifdef MUTEX_DIAG
+	LARGE_INTEGER diag_now;
+#endif
+ dbenv = env->dbenv;
+
+ if (!MUTEX_ON(env) || F_ISSET(dbenv, DB_ENV_NOLOCKING))
+ return (0);
+
+ mtxmgr = env->mutex_handle;
+ mtxregion = mtxmgr->reginfo.primary;
+ mutexp = MUTEXP_SET(env, mutex);
+
+ CHECK_MTX_THREAD(env, mutexp);
+
+ if (timeout != 0) {
+ timespecclear(&timeoutspec);
+ __clock_set_expires(env, &timeoutspec, timeout);
+ }
+
+ /*
+ * See WINCE_ATOMIC_MAGIC definition for details.
+ * Use sharecount, because the value just needs to be a db_atomic_t
+ * memory mapped onto the same page as those being Interlocked*.
+ */
+ WINCE_ATOMIC_MAGIC(&mutexp->sharecount);
+
+ event = NULL;
+ ms = 50;
+ ret = 0;
+
+ /*
+ * Only check the thread state once, by initializing the thread
+ * control block pointer to null. If it is not the failchk
+ * thread, then ip will have a valid value subsequent times
+ * in the loop.
+ */
+ ip = NULL;
+
+loop: /* Attempt to acquire the mutex mutex_tas_spins times, if waiting. */
+ for (nspins =
+ mtxregion->stat.st_mutex_tas_spins; nspins > 0; --nspins) {
+ /*
+ * We can avoid the (expensive) interlocked instructions if
+ * the mutex is already busy.
+ */
+ if (MUTEXP_IS_BUSY(mutexp) || !MUTEXP_ACQUIRE(mutexp)) {
+ if (F_ISSET(dbenv, DB_ENV_FAILCHK) &&
+ ip == NULL && dbenv->is_alive(dbenv,
+ mutexp->pid, mutexp->tid, 0) == 0) {
+ ret = __env_set_state(env, &ip, THREAD_VERIFY);
+ if (ret != 0 ||
+ ip->dbth_state == THREAD_FAILCHK)
+ return (DB_RUNRECOVERY);
+ }
+ if (!wait)
+ return (DB_LOCK_NOTGRANTED);
+ /*
+ * Some systems (notably those with newer Intel CPUs)
+ * need a small pause before retrying. [#6975]
+ */
+ MUTEX_PAUSE
+ continue;
+ }
+
+#ifdef DIAGNOSTIC
+ if (F_ISSET(mutexp, DB_MUTEX_LOCKED)) {
+ char buf[DB_THREADID_STRLEN];
+ __db_errx(env, DB_STR_A("2003",
+ "Win32 lock failed: mutex already locked by %s",
+ "%s"), dbenv->thread_id_string(dbenv,
+ mutexp->pid, mutexp->tid, buf));
+ return (__env_panic(env, EACCES));
+ }
+#endif
+ F_SET(mutexp, DB_MUTEX_LOCKED);
+ dbenv->thread_id(dbenv, &mutexp->pid, &mutexp->tid);
+
+#ifdef HAVE_STATISTICS
+ if (event == NULL)
+ ++mutexp->mutex_set_nowait;
+ else
+ ++mutexp->mutex_set_wait;
+#endif
+ if (event != NULL) {
+ CloseHandle(event);
+ InterlockedDecrement(&mutexp->nwaiters);
+#ifdef MUTEX_DIAG
+ if (ret != WAIT_OBJECT_0) {
+ QueryPerformanceCounter(&diag_now);
+ printf(DB_STR_A("2004",
+ "[%I64d]: Lost signal on mutex %p, "
+ "id %d, ms %d\n", "%I64d %p %d %d"),
+ diag_now.QuadPart, mutexp, mutexp->id, ms);
+ }
+#endif
+ }
+
+#ifdef DIAGNOSTIC
+ /*
+ * We want to switch threads as often as possible. Yield
+ * every time we get a mutex to ensure contention.
+ */
+ if (F_ISSET(dbenv, DB_ENV_YIELDCPU))
+ __os_yield(env, 0, 0);
+#endif
+
+ return (0);
+ }
+
+ /*
+ * Yield the processor; wait 50 ms initially, up to 1 second. This
+ * loop is needed to work around a race where the signal from the
+ * unlocking thread gets lost. We start at 50 ms because it's unlikely
+ * to happen often and we want to avoid wasting CPU.
+ */
+ if (timeout != 0) {
+ timespecclear(&now);
+ if (__clock_expired(env, &now, &timeoutspec)) {
+ if (event != NULL) {
+ CloseHandle(event);
+ InterlockedDecrement(&mutexp->nwaiters);
+ }
+ return (DB_TIMEOUT);
+ }
+ /* Reduce the event wait if the timeout would happen first. */
+ tempspec = timeoutspec;
+ timespecsub(&tempspec, &now);
+ DB_TIMESPEC_TO_TIMEOUT(time_left, &tempspec, 0);
+ time_left /= US_PER_MS;
+ if (ms > time_left)
+ ms = time_left;
+ }
+ if (event == NULL) {
+#ifdef MUTEX_DIAG
+ QueryPerformanceCounter(&diag_now);
+ printf(DB_STR_A("2005",
+ "[%I64d]: Waiting on mutex %p, id %d\n",
+ "%I64d %p %d"), diag_now.QuadPart, mutexp, mutexp->id);
+#endif
+ InterlockedIncrement(&mutexp->nwaiters);
+ if ((ret = get_handle(env, mutexp, &event)) != 0)
+ goto err;
+ }
+ if ((ret = WaitForSingleObject(event, ms)) == WAIT_FAILED) {
+ ret = __os_get_syserr();
+ goto err;
+ }
+ if ((ms <<= 1) > MS_PER_SEC)
+ ms = MS_PER_SEC;
+
+ PANIC_CHECK(env);
+ goto loop;
+
+err: __db_syserr(env, ret, DB_STR("2006", "Win32 lock failed"));
+ return (__env_panic(env, __os_posix_err(ret)));
+}
+
+/*
+ * __db_win32_mutex_init --
+ * Initialize a Win32 mutex.
+ *
+ * PUBLIC: int __db_win32_mutex_init __P((ENV *, db_mutex_t, u_int32_t));
+ */
+int
+__db_win32_mutex_init(env, mutex, flags)
+ ENV *env;
+ db_mutex_t mutex;
+ u_int32_t flags;
+{
+ DB_MUTEX *mutexp;
+
+ mutexp = MUTEXP_SET(env, mutex);
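+	/*
+	 * Build a per-mutex identifier from the low 16 bits of the pid
+	 * mixed with the mutex's address; get_handle() uses it to name
+	 * the event that waiters block on.
+	 */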
+ mutexp->id = ((getpid() & 0xffff) << 16) ^ P_TO_UINT32(mutexp);
+ F_SET(mutexp, flags);
+
+ return (0);
+}
+
+/*
+ * __db_win32_mutex_lock
+ * Lock on a mutex, blocking if necessary.
+ *
+ * PUBLIC: int __db_win32_mutex_lock __P((ENV *, db_mutex_t, db_timeout_t));
+ */
+int
+__db_win32_mutex_lock(env, mutex, timeout)
+ ENV *env;
+ db_mutex_t mutex;
+ db_timeout_t timeout;
+{
+ return (__db_win32_mutex_lock_int(env, mutex, timeout, 1));
+}
+
+/*
+ * __db_win32_mutex_trylock
+ *	Try to lock a mutex, returning without waiting if it is busy.
+ *
+ * PUBLIC: int __db_win32_mutex_trylock __P((ENV *, db_mutex_t));
+ */
+int
+__db_win32_mutex_trylock(env, mutex)
+ ENV *env;
+ db_mutex_t mutex;
+{
+	return (__db_win32_mutex_lock_int(env, mutex, 0, 0));
+}
+
+#if defined(HAVE_SHARED_LATCHES)
+/*
+ * __db_win32_mutex_readlock_int
+ *	Try to get a shared lock on a latch, possibly waiting if requested
+ *	and necessary.
+ */
+int
+__db_win32_mutex_readlock_int(env, mutex, nowait)
+ ENV *env;
+ db_mutex_t mutex;
+ int nowait;
+{
+ DB_ENV *dbenv;
+ DB_MUTEX *mutexp;
+ DB_MUTEXMGR *mtxmgr;
+ DB_MUTEXREGION *mtxregion;
+ HANDLE event;
+ u_int32_t nspins;
+ int ms, ret;
+	long mtx_val;
+#ifdef MUTEX_DIAG
+ LARGE_INTEGER diag_now;
+#endif
+ dbenv = env->dbenv;
+
+ if (!MUTEX_ON(env) || F_ISSET(dbenv, DB_ENV_NOLOCKING))
+ return (0);
+
+ mtxmgr = env->mutex_handle;
+ mtxregion = mtxmgr->reginfo.primary;
+ mutexp = MUTEXP_SET(env, mutex);
+
+ CHECK_MTX_THREAD(env, mutexp);
+
+ /*
+ * See WINCE_ATOMIC_MAGIC definition for details.
+ * Use sharecount, because the value just needs to be a db_atomic_t
+ * memory mapped onto the same page as those being Interlocked*.
+ */
+ WINCE_ATOMIC_MAGIC(&mutexp->sharecount);
+
+ event = NULL;
+ ms = 50;
+ ret = 0;
+
+loop: /* Attempt to acquire the resource for N spins. */
+ for (nspins =
+ mtxregion->stat.st_mutex_tas_spins; nspins > 0; --nspins) {
+ /*
+ * We can avoid the (expensive) interlocked instructions if
+ * the mutex is already "set".
+ */
+retry: mtx_val = atomic_read(&mutexp->sharecount);
+ if (mtx_val == MUTEX_SHARE_ISEXCLUSIVE) {
+ if (nowait)
+ return (DB_LOCK_NOTGRANTED);
+
+ continue;
+ } else if (!atomic_compare_exchange(env, &mutexp->sharecount,
+ mtx_val, mtx_val + 1)) {
+ /*
+ * Some systems (notably those with newer Intel CPUs)
+ * need a small pause here. [#6975]
+ */
+ MUTEX_PAUSE
+ goto retry;
+ }
+
+#ifdef HAVE_STATISTICS
+ if (event == NULL)
+ ++mutexp->mutex_set_rd_nowait;
+ else
+ ++mutexp->mutex_set_rd_wait;
+#endif
+ if (event != NULL) {
+ CloseHandle(event);
+ InterlockedDecrement(&mutexp->nwaiters);
+#ifdef MUTEX_DIAG
+ if (ret != WAIT_OBJECT_0) {
+ QueryPerformanceCounter(&diag_now);
+ printf(DB_STR_A("2007",
+ "[%I64d]: Lost signal on mutex %p, "
+ "id %d, ms %d\n", "%I64d %p %d %d"),
+ diag_now.QuadPart, mutexp, mutexp->id, ms);
+ }
+#endif
+ }
+
+#ifdef DIAGNOSTIC
+ /*
+ * We want to switch threads as often as possible. Yield
+ * every time we get a mutex to ensure contention.
+ */
+ if (F_ISSET(dbenv, DB_ENV_YIELDCPU))
+ __os_yield(env, 0, 0);
+#endif
+
+ return (0);
+ }
+
+ /*
+ * Yield the processor; wait 50 ms initially, up to 1 second. This
+ * loop is needed to work around a race where the signal from the
+ * unlocking thread gets lost. We start at 50 ms because it's unlikely
+ * to happen often and we want to avoid wasting CPU.
+ */
+ if (event == NULL) {
+#ifdef MUTEX_DIAG
+ QueryPerformanceCounter(&diag_now);
+ printf(DB_STR_A("2008",
+ "[%I64d]: Waiting on mutex %p, id %d\n",
+ "%I64d %p %d"), diag_now.QuadPart, mutexp, mutexp->id);
+#endif
+ InterlockedIncrement(&mutexp->nwaiters);
+ if ((ret = get_handle(env, mutexp, &event)) != 0)
+ goto err;
+ }
+ if ((ret = WaitForSingleObject(event, ms)) == WAIT_FAILED) {
+ ret = __os_get_syserr();
+ goto err;
+ }
+ if ((ms <<= 1) > MS_PER_SEC)
+ ms = MS_PER_SEC;
+
+ PANIC_CHECK(env);
+ goto loop;
+
+err: __db_syserr(env, ret, DB_STR("2009",
+ "Win32 read lock failed"));
+ return (__env_panic(env, __os_posix_err(ret)));
+}
+
+/*
+ * __db_win32_mutex_readlock
+ *	Get a shared lock on a latch.
+ *
+ * PUBLIC: #if defined(HAVE_SHARED_LATCHES)
+ * PUBLIC: int __db_win32_mutex_readlock __P((ENV *, db_mutex_t));
+ * PUBLIC: #endif
+ */
+int
+__db_win32_mutex_readlock(env, mutex)
+ ENV *env;
+ db_mutex_t mutex;
+{
+ return (__db_win32_mutex_readlock_int(env, mutex, 0));
+}
+
+/*
+ * __db_win32_mutex_tryreadlock
+ *	Try to get a shared lock on a latch.
+ *
+ * PUBLIC: #if defined(HAVE_SHARED_LATCHES)
+ * PUBLIC: int __db_win32_mutex_tryreadlock __P((ENV *, db_mutex_t));
+ * PUBLIC: #endif
+ */
+int
+__db_win32_mutex_tryreadlock(env, mutex)
+ ENV *env;
+ db_mutex_t mutex;
+{
+ return (__db_win32_mutex_readlock_int(env, mutex, 1));
+}
+#endif
+
+/*
+ * __db_win32_mutex_unlock --
+ * Release a mutex.
+ *
+ * PUBLIC: int __db_win32_mutex_unlock __P((ENV *, db_mutex_t));
+ */
+int
+__db_win32_mutex_unlock(env, mutex)
+ ENV *env;
+ db_mutex_t mutex;
+{
+ DB_ENV *dbenv;
+ DB_MUTEX *mutexp;
+ HANDLE event;
+ int ret;
+#ifdef MUTEX_DIAG
+ LARGE_INTEGER diag_now;
+#endif
+ dbenv = env->dbenv;
+
+ if (!MUTEX_ON(env) || F_ISSET(dbenv, DB_ENV_NOLOCKING))
+ return (0);
+
+ mutexp = MUTEXP_SET(env, mutex);
+
+#ifdef DIAGNOSTIC
+ if (!MUTEXP_IS_BUSY(mutexp) || !(F_ISSET(mutexp, DB_MUTEX_SHARED) ||
+ F_ISSET(mutexp, DB_MUTEX_LOCKED))) {
+ __db_errx(env, DB_STR_A("2010",
+ "Win32 unlock failed: lock already unlocked: mutex %d busy %d",
+ "%d %d"), mutex, MUTEXP_BUSY_FIELD(mutexp));
+ return (__env_panic(env, EACCES));
+ }
+#endif
+ /*
+ * If we have a shared latch, and a read lock (DB_MUTEX_LOCKED is only
+ * set for write locks), then decrement the latch. If the readlock is
+ * still held by other threads, just return. Otherwise go ahead and
+ * notify any waiting threads.
+ */
+#ifdef HAVE_SHARED_LATCHES
+ if (F_ISSET(mutexp, DB_MUTEX_SHARED)) {
+ if (F_ISSET(mutexp, DB_MUTEX_LOCKED)) {
+ F_CLR(mutexp, DB_MUTEX_LOCKED);
+ if ((ret = InterlockedExchange(
+ (interlocked_val)(&atomic_read(
+ &mutexp->sharecount)), 0)) !=
+ MUTEX_SHARE_ISEXCLUSIVE) {
+ ret = DB_RUNRECOVERY;
+ goto err;
+ }
+ } else if (InterlockedDecrement(
+ (interlocked_val)(&atomic_read(&mutexp->sharecount))) > 0)
+ return (0);
+ } else
+#endif
+ {
+ F_CLR(mutexp, DB_MUTEX_LOCKED);
+ MUTEX_UNSET(&mutexp->tas);
+ }
+
+ if (mutexp->nwaiters > 0) {
+ if ((ret = get_handle(env, mutexp, &event)) != 0)
+ goto err;
+
+#ifdef MUTEX_DIAG
+ QueryPerformanceCounter(&diag_now);
+ printf(DB_STR_A("2011",
+ "[%I64d]: Signalling mutex %p, id %d\n",
+ "%I64d %p %d"), diag_now.QuadPart, mutexp, mutexp->id);
+#endif
+ if (!PulseEvent(event)) {
+ ret = __os_get_syserr();
+ CloseHandle(event);
+ goto err;
+ }
+
+ CloseHandle(event);
+ }
+
+ return (0);
+
+err: __db_syserr(env, ret, DB_STR("2012", "Win32 unlock failed"));
+ return (__env_panic(env, __os_posix_err(ret)));
+}
+
+/*
+ * __db_win32_mutex_destroy --
+ * Destroy a mutex.
+ *
+ * PUBLIC: int __db_win32_mutex_destroy __P((ENV *, db_mutex_t));
+ */
+int
+__db_win32_mutex_destroy(env, mutex)
+ ENV *env;
+ db_mutex_t mutex;
+{
+	COMPQUIET(env, NULL);
+	COMPQUIET(mutex, MUTEX_INVALID);
+	return (0);
+}
+
+#ifndef DB_WINCE
+/*
+ * db_env_set_win_security
+ *
+ * Set the SECURITY_ATTRIBUTES to be used by BDB on Windows.
+ * It should not be called while any BDB mutexes are locked.
+ *
+ * EXTERN: #if defined(DB_WIN32) && !defined(DB_WINCE)
+ * EXTERN: int db_env_set_win_security __P((SECURITY_ATTRIBUTES *sa));
+ * EXTERN: #endif
+ */
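+/*
+ * A minimal usage sketch (hypothetical application code, not part of
+ * the library):
+ *
+ *	SECURITY_ATTRIBUTES sa;
+ *	... initialize sa, e.g. with a DACL granting the service access ...
+ *	(void)db_env_set_win_security(&sa);
+ *	... then create and open DB_ENV handles as usual ...
+ */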
+int
+db_env_set_win_security(sa)
+ SECURITY_ATTRIBUTES *sa;
+{
+ DB_GLOBAL(win_sec_attr) = sa;
+ return (0);
+}
+#endif
diff --git a/src/mutex/test_mutex.c b/src/mutex/test_mutex.c
new file mode 100644
index 00000000..24c18016
--- /dev/null
+++ b/src/mutex/test_mutex.c
@@ -0,0 +1,1051 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * Standalone mutex tester for Berkeley DB mutexes.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+#ifdef DB_WIN32
+#define MUTEX_THREAD_TEST 1
+
+extern int getopt(int, char * const *, const char *);
+
+typedef HANDLE os_pid_t;
+typedef HANDLE os_thread_t;
+
+#define os_thread_create(thrp, attr, func, arg) \
+ (((*(thrp) = CreateThread(NULL, 0, \
+ (LPTHREAD_START_ROUTINE)(func), (arg), 0, NULL)) == NULL) ? -1 : 0)
+#define os_thread_join(thr, statusp) \
+ ((WaitForSingleObject((thr), INFINITE) == WAIT_OBJECT_0) && \
+ GetExitCodeThread((thr), (LPDWORD)(statusp)) ? 0 : -1)
+#define os_thread_self() GetCurrentThreadId()
+
+#else /* !DB_WIN32 */
+
+#include <sys/wait.h>
+
+typedef pid_t os_pid_t;
+
+/*
+ * There's only one mutex implementation that can't support thread-level
+ * locking: UNIX/fcntl mutexes.
+ *
+ * The general Berkeley DB library configuration doesn't look for the POSIX
+ * pthread functions, with one exception -- pthread_yield.
+ *
+ * Use these two facts to decide if we're going to build with or without
+ * threads.
+ */
+#if !defined(HAVE_MUTEX_FCNTL) && defined(HAVE_PTHREAD_YIELD)
+#define MUTEX_THREAD_TEST 1
+
+#include <pthread.h>
+
+typedef pthread_t os_thread_t;
+
+#define os_thread_create(thrp, attr, func, arg) \
+ pthread_create((thrp), (attr), (func), (arg))
+#define os_thread_join(thr, statusp) pthread_join((thr), (statusp))
+#define os_thread_self() pthread_self()
+#endif /* HAVE_PTHREAD_YIELD */
+#endif /* !DB_WIN32 */
+
+#define OS_BAD_PID ((os_pid_t)-1)
+
+#define TESTDIR "TESTDIR" /* Working area */
+#define MT_FILE "TESTDIR/mutex.file"
+#define MT_FILE_QUIT "TESTDIR/mutex.file.quit"
+
+/*
+ * The backing data layout:
+ *	TM[1]			global mutex, locking the per-thread array
+ *	TM[nthreads * nprocs]	per-thread mutex array
+ *	TM[maxlocks]		per-lock mutex array
+ */
+typedef struct {
+ db_mutex_t mutex; /* Mutex. */
+ u_long id; /* Holder's ID. */
+ u_int wakeme; /* Request to awake. */
+} TM;
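+
+/*
+ * Illustrative note (not in the original): data_on(), below, carves the
+ * backing chunk into the three regions with plain pointer arithmetic:
+ *
+ *	gm_addr = base;
+ *	tm_addr = base + sizeof(TM);
+ *	lm_addr = tm_addr + sizeof(TM) * (nthreads * nprocs);
+ */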
+
+DB_ENV *dbenv; /* Backing environment */
+ENV *env;
+size_t len; /* Backing data chunk size. */
+
+u_int8_t *gm_addr; /* Global mutex */
+u_int8_t *lm_addr; /* Locker mutexes */
+u_int8_t *tm_addr; /* Thread mutexes */
+
+#ifdef MUTEX_THREAD_TEST
+os_thread_t *kidsp; /* Locker threads */
+os_thread_t wakep; /* Wakeup thread */
+#endif
+
+#ifndef HAVE_MMAP
+u_int nprocs = 1; /* -p: Processes. */
+u_int nthreads = 20; /* -t: Threads. */
+#elif MUTEX_THREAD_TEST
+u_int nprocs = 5; /* -p: Processes. */
+u_int nthreads = 4; /* -t: Threads. */
+#else
+u_int nprocs = 20; /* -p: Processes. */
+u_int nthreads = 1; /* -t: Threads. */
+#endif
+
+u_int maxlocks = 20; /* -l: Backing locks. */
+u_int nlocks = 10000; /* -n: Locks per process. */
+int verbose; /* -v: Verbosity. */
+
+const char *progname;
+
+void data_off(u_int8_t *, DB_FH *);
+void data_on(u_int8_t **, u_int8_t **, u_int8_t **, DB_FH **, int);
+int locker_start(u_long);
+int locker_wait(void);
+os_pid_t os_spawn(const char *, char *const[]);
+int os_wait(os_pid_t *, u_int);
+void *run_lthread(void *);
+void *run_wthread(void *);
+os_pid_t spawn_proc(u_long, char *, char *);
+void tm_env_close(void);
+int tm_env_init(void);
+void tm_mutex_destroy(void);
+void tm_mutex_init(void);
+void tm_mutex_stats(void);
+int usage(void);
+int wakeup_start(u_long);
+int wakeup_wait(void);
+
+int
+main(argc, argv)
+ int argc;
+ char *argv[];
+{
+ enum {LOCKER, WAKEUP, PARENT} rtype;
+ extern int optind;
+ extern char *optarg;
+ os_pid_t wakeup_pid, *pids;
+ u_long id;
+ u_int i;
+ DB_FH *fhp, *map_fhp;
+ int ch, err;
+ char *p, *tmpath, cmd[1024];
+
+ if ((progname = __db_rpath(argv[0])) == NULL)
+ progname = argv[0];
+ else
+ ++progname;
+
+ rtype = PARENT;
+ id = 0;
+ tmpath = argv[0];
+ while ((ch = getopt(argc, argv, "l:n:p:T:t:v")) != EOF)
+ switch (ch) {
+ case 'l':
+ maxlocks = (u_int)atoi(optarg);
+ break;
+ case 'n':
+ nlocks = (u_int)atoi(optarg);
+ break;
+ case 'p':
+ nprocs = (u_int)atoi(optarg);
+ break;
+ case 't':
+ if ((nthreads = (u_int)atoi(optarg)) == 0)
+ nthreads = 1;
+#if !defined(MUTEX_THREAD_TEST)
+ if (nthreads != 1) {
+ fprintf(stderr,
+ "%s: thread support not available or not compiled for this platform.\n",
+ progname);
+ return (EXIT_FAILURE);
+ }
+#endif
+ break;
+ case 'T':
+ if (!memcmp(optarg, "locker", sizeof("locker") - 1))
+ rtype = LOCKER;
+ else if (
+ !memcmp(optarg, "wakeup", sizeof("wakeup") - 1))
+ rtype = WAKEUP;
+ else
+ return (usage());
+ if ((p = strchr(optarg, '=')) == NULL)
+ return (usage());
+ id = (u_long)atoi(p + 1);
+ break;
+ case 'v':
+ verbose = 1;
+ break;
+ case '?':
+ default:
+ return (usage());
+ }
+ argc -= optind;
+ argv += optind;
+
+ /*
+ * If we're not running a multi-process test, we should be running
+ * a multi-thread test.
+ */
+ if (nprocs == 1 && nthreads == 1) {
+ fprintf(stderr,
+ "%s: running in a single process requires multiple threads\n",
+ progname);
+ return (EXIT_FAILURE);
+ }
+
+ len = sizeof(TM) * (1 + nthreads * nprocs + maxlocks);
+
+ /*
+ * In the multi-process test, the parent spawns processes that exec
+ * the original binary, ending up here. Each process joins the DB
+ * environment separately and then calls the supporting function.
+ */
+ if (rtype == LOCKER || rtype == WAKEUP) {
+ __os_yield(env, 3, 0); /* Let everyone catch up. */
+ /* Initialize random numbers. */
+ srand((u_int)time(NULL) % (u_int)getpid());
+
+ if (tm_env_init() != 0) /* Join the environment. */
+ exit(EXIT_FAILURE);
+ /* Join the backing data. */
+ data_on(&gm_addr, &tm_addr, &lm_addr, &map_fhp, 0);
+ if (verbose)
+ printf(
+ "Backing file: global (%#lx), threads (%#lx), locks (%#lx)\n",
+ (u_long)gm_addr, (u_long)tm_addr, (u_long)lm_addr);
+
+ if ((rtype == LOCKER ?
+ locker_start(id) : wakeup_start(id)) != 0)
+ exit(EXIT_FAILURE);
+ if ((rtype == LOCKER ? locker_wait() : wakeup_wait()) != 0)
+ exit(EXIT_FAILURE);
+
+ data_off(gm_addr, map_fhp); /* Detach from backing data. */
+
+ tm_env_close(); /* Detach from environment. */
+
+ exit(EXIT_SUCCESS);
+ }
+
+ /*
+ * The following code is only executed by the original parent process.
+ *
+ * Clean up from any previous runs.
+ */
+ snprintf(cmd, sizeof(cmd), "rm -rf %s", TESTDIR);
+ (void)system(cmd);
+ snprintf(cmd, sizeof(cmd), "mkdir %s", TESTDIR);
+ (void)system(cmd);
+
+ printf(
+ "%s: %u processes, %u threads/process, %u lock requests from %u locks\n",
+ progname, nprocs, nthreads, nlocks, maxlocks);
+ printf("%s: backing data %lu bytes\n", progname, (u_long)len);
+
+ if (tm_env_init() != 0) /* Create the environment. */
+ exit(EXIT_FAILURE);
+ /* Create the backing data. */
+ data_on(&gm_addr, &tm_addr, &lm_addr, &map_fhp, 1);
+ if (verbose)
+ printf(
+ "backing data: global (%#lx), threads (%#lx), locks (%#lx)\n",
+ (u_long)gm_addr, (u_long)tm_addr, (u_long)lm_addr);
+
+ tm_mutex_init(); /* Initialize mutexes. */
+
+ if (nprocs > 1) { /* Run the multi-process test. */
+ /* Allocate array of locker process IDs. */
+ if ((pids = calloc(nprocs, sizeof(os_pid_t))) == NULL) {
+ fprintf(stderr, "%s: %s\n", progname, strerror(errno));
+ goto fail;
+ }
+
+ /* Spawn locker processes and threads. */
+ for (i = 0; i < nprocs; ++i) {
+ if ((pids[i] =
+ spawn_proc(id, tmpath, "locker")) == OS_BAD_PID) {
+ fprintf(stderr,
+ "%s: failed to spawn a locker\n", progname);
+ goto fail;
+ }
+ id += nthreads;
+ }
+
+ /* Spawn wakeup process/thread. */
+ if ((wakeup_pid =
+ spawn_proc(id, tmpath, "wakeup")) == OS_BAD_PID) {
+ fprintf(stderr,
+ "%s: failed to spawn waker\n", progname);
+ goto fail;
+ }
+ ++id;
+
+ /* Wait for all lockers to exit. */
+ if ((err = os_wait(pids, nprocs)) != 0) {
+ fprintf(stderr, "%s: locker wait failed with %d\n",
+ progname, err);
+ goto fail;
+ }
+
+ /* Signal wakeup process to exit. */
+ if ((err = __os_open(
+ env, MT_FILE_QUIT, 0, DB_OSO_CREATE, 0664, &fhp)) != 0) {
+ fprintf(stderr,
+ "%s: open %s\n", progname, db_strerror(err));
+ goto fail;
+ }
+ (void)__os_closehandle(env, fhp);
+
+ /* Wait for wakeup process/thread. */
+ if ((err = os_wait(&wakeup_pid, 1)) != 0) {
+ fprintf(stderr, "%s: %lu: exited %d\n",
+ progname, (u_long)wakeup_pid, err);
+ goto fail;
+ }
+ } else { /* Run the single-process test. */
+ /* Spawn locker threads. */
+ if (locker_start(0) != 0)
+ goto fail;
+
+ /* Spawn wakeup thread. */
+ if (wakeup_start(nthreads) != 0)
+ goto fail;
+
+ /* Wait for all lockers to exit. */
+ if (locker_wait() != 0)
+ goto fail;
+
+ /* Signal wakeup process to exit. */
+ if ((err = __os_open(
+ env, MT_FILE_QUIT, 0, DB_OSO_CREATE, 0664, &fhp)) != 0) {
+ fprintf(stderr,
+ "%s: open %s\n", progname, db_strerror(err));
+ goto fail;
+ }
+ (void)__os_closehandle(env, fhp);
+
+ /* Wait for wakeup thread. */
+ if (wakeup_wait() != 0)
+ goto fail;
+ }
+
+ tm_mutex_stats(); /* Display run statistics. */
+ tm_mutex_destroy(); /* Destroy mutexes. */
+
+ data_off(gm_addr, map_fhp); /* Detach from backing data. */
+
+ tm_env_close(); /* Detach from environment. */
+
+ printf("%s: test succeeded\n", progname);
+ return (EXIT_SUCCESS);
+
+fail: printf("%s: FAILED!\n", progname);
+ return (EXIT_FAILURE);
+}
+
+int
+locker_start(id)
+ u_long id;
+{
+#if defined(MUTEX_THREAD_TEST)
+ u_int i;
+ int err;
+
+ /*
+ * Spawn off threads. We have nthreads all locking and going to
+ * sleep, and one other thread cycling through and waking them up.
+ */
+	if ((kidsp =
+	    (os_thread_t *)calloc(nthreads, sizeof(os_thread_t))) == NULL) {
+ fprintf(stderr, "%s: %s\n", progname, strerror(errno));
+ return (1);
+ }
+ for (i = 0; i < nthreads; i++)
+ if ((err = os_thread_create(
+ &kidsp[i], NULL, run_lthread, (void *)(id + i))) != 0) {
+ fprintf(stderr, "%s: failed spawning thread: %s\n",
+ progname, db_strerror(err));
+ return (1);
+ }
+ return (0);
+#else
+ return (run_lthread((void *)id) == NULL ? 0 : 1);
+#endif
+}
+
+int
+locker_wait()
+{
+#if defined(MUTEX_THREAD_TEST)
+ u_int i;
+ void *retp;
+
+ /* Wait for the threads to exit. */
+ for (i = 0; i < nthreads; i++) {
+ (void)os_thread_join(kidsp[i], &retp);
+ if (retp != NULL) {
+ fprintf(stderr,
+ "%s: thread exited with error\n", progname);
+ return (1);
+ }
+ }
+ free(kidsp);
+#endif
+ return (0);
+}
+
+void *
+run_lthread(arg)
+ void *arg;
+{
+ TM *gp, *mp, *tp;
+ u_long id, tid;
+ u_int lock, nl;
+ int err, i;
+
+ id = (u_long)arg;
+#if defined(MUTEX_THREAD_TEST)
+ tid = (u_long)os_thread_self();
+#else
+ tid = 0;
+#endif
+ printf("Locker: ID %03lu (PID: %lu; TID: %lx)\n",
+ id, (u_long)getpid(), tid);
+
+ gp = (TM *)gm_addr;
+ tp = (TM *)(tm_addr + id * sizeof(TM));
+
+ for (nl = nlocks; nl > 0;) {
+ /* Select and acquire a data lock. */
+ lock = (u_int)rand() % maxlocks;
+ mp = (TM *)(lm_addr + lock * sizeof(TM));
+ if (verbose)
+ printf("%03lu: lock %d (mtx: %lu)\n",
+ id, lock, (u_long)mp->mutex);
+
+ if ((err = dbenv->mutex_lock(dbenv, mp->mutex)) != 0) {
+ fprintf(stderr, "%s: %03lu: never got lock %d: %s\n",
+ progname, id, lock, db_strerror(err));
+ return ((void *)1);
+ }
+ if (mp->id != 0) {
+ fprintf(stderr,
+ "%s: RACE! (%03lu granted lock %d held by %03lu)\n",
+ progname, id, lock, mp->id);
+ return ((void *)1);
+ }
+ mp->id = id;
+
+ /*
+ * Pretend to do some work, periodically checking to see if
+ * we still hold the mutex.
+ */
+ for (i = 0; i < 3; ++i) {
+ __os_yield(env, 0, (u_long)rand() % 3);
+ if (mp->id != id) {
+ fprintf(stderr,
+ "%s: RACE! (%03lu stole lock %d from %03lu)\n",
+ progname, mp->id, lock, id);
+ return ((void *)1);
+ }
+ }
+
+ /*
+ * Test self-blocking and unlocking by other threads/processes:
+ *
+ * acquire the global lock
+ * set our wakeup flag
+ * release the global lock
+ * acquire our per-thread lock
+ *
+ * The wakeup thread will wake us up.
+ */
+ if ((err = dbenv->mutex_lock(dbenv, gp->mutex)) != 0) {
+ fprintf(stderr, "%s: %03lu: global lock: %s\n",
+ progname, id, db_strerror(err));
+ return ((void *)1);
+ }
+ if (tp->id != 0 && tp->id != id) {
+ fprintf(stderr,
+ "%s: %03lu: per-thread mutex isn't mine, owned by %03lu\n",
+ progname, id, tp->id);
+ return ((void *)1);
+ }
+ tp->id = id;
+ if (verbose)
+ printf("%03lu: self-blocking (mtx: %lu)\n",
+ id, (u_long)tp->mutex);
+ if (tp->wakeme) {
+ fprintf(stderr,
+ "%s: %03lu: wakeup flag incorrectly set\n",
+ progname, id);
+ return ((void *)1);
+ }
+ tp->wakeme = 1;
+ if ((err = dbenv->mutex_unlock(dbenv, gp->mutex)) != 0) {
+ fprintf(stderr,
+ "%s: %03lu: global unlock: %s\n",
+ progname, id, db_strerror(err));
+ return ((void *)1);
+ }
+ if ((err = dbenv->mutex_lock(dbenv, tp->mutex)) != 0) {
+ fprintf(stderr, "%s: %03lu: per-thread lock: %s\n",
+ progname, id, db_strerror(err));
+ return ((void *)1);
+ }
+ /* Time passes... */
+ if (tp->wakeme) {
+ fprintf(stderr, "%s: %03lu: wakeup flag not cleared\n",
+ progname, id);
+ return ((void *)1);
+ }
+
+ if (verbose)
+ printf("%03lu: release %d (mtx: %lu)\n",
+ id, lock, (u_long)mp->mutex);
+
+ /* Release the data lock. */
+ mp->id = 0;
+ if ((err = dbenv->mutex_unlock(dbenv, mp->mutex)) != 0) {
+ fprintf(stderr,
+ "%s: %03lu: lock release: %s\n",
+ progname, id, db_strerror(err));
+ return ((void *)1);
+ }
+
+ if (--nl % 1000 == 0)
+ printf("%03lu: %d\n", id, nl);
+ }
+
+ return (NULL);
+}
+
+int
+wakeup_start(id)
+ u_long id;
+{
+#if defined(MUTEX_THREAD_TEST)
+ int err;
+
+ /*
+ * Spawn off wakeup thread.
+ */
+ if ((err = os_thread_create(
+ &wakep, NULL, run_wthread, (void *)id)) != 0) {
+ fprintf(stderr, "%s: failed spawning wakeup thread: %s\n",
+ progname, db_strerror(err));
+ return (1);
+ }
+ return (0);
+#else
+ return (run_wthread((void *)id) == NULL ? 0 : 1);
+#endif
+}
+
+int
+wakeup_wait()
+{
+#if defined(MUTEX_THREAD_TEST)
+ void *retp;
+
+ /*
+ * A file is created when the wakeup thread is no longer needed.
+ */
+ (void)os_thread_join(wakep, &retp);
+ if (retp != NULL) {
+ fprintf(stderr,
+ "%s: wakeup thread exited with error\n", progname);
+ return (1);
+ }
+#endif
+ return (0);
+}
+
+/*
+ * run_wthread --
+ * Thread to wake up other threads that are sleeping.
+ */
+void *
+run_wthread(arg)
+ void *arg;
+{
+ TM *gp, *tp;
+ u_long id, tid;
+ u_int check_id;
+ int err, quitcheck;
+
+ id = (u_long)arg;
+ quitcheck = 0;
+#if defined(MUTEX_THREAD_TEST)
+ tid = (u_long)os_thread_self();
+#else
+ tid = 0;
+#endif
+ printf("Wakeup: ID %03lu (PID: %lu; TID: %lx)\n",
+ id, (u_long)getpid(), tid);
+
+ gp = (TM *)gm_addr;
+
+ /* Loop, waking up sleepers and periodically sleeping ourselves. */
+ for (check_id = 0;; ++check_id) {
+ /* Check to see if the locking threads have finished. */
+ if (++quitcheck >= 100) {
+ quitcheck = 0;
+ if (__os_exists(env, MT_FILE_QUIT, NULL) == 0)
+ break;
+ }
+
+ /* Check for ID wraparound. */
+ if (check_id == nthreads * nprocs)
+ check_id = 0;
+
+ /* Check for a thread that needs a wakeup. */
+ tp = (TM *)(tm_addr + check_id * sizeof(TM));
+ if (!tp->wakeme)
+ continue;
+
+ if (verbose) {
+ printf("%03lu: wakeup thread %03lu (mtx: %lu)\n",
+ id, tp->id, (u_long)tp->mutex);
+ (void)fflush(stdout);
+ }
+
+ /* Acquire the global lock. */
+ if ((err = dbenv->mutex_lock(dbenv, gp->mutex)) != 0) {
+ fprintf(stderr, "%s: wakeup: global lock: %s\n",
+ progname, db_strerror(err));
+ return ((void *)1);
+ }
+
+ tp->wakeme = 0;
+ if ((err = dbenv->mutex_unlock(dbenv, tp->mutex)) != 0) {
+ fprintf(stderr, "%s: wakeup: unlock: %s\n",
+ progname, db_strerror(err));
+ return ((void *)1);
+ }
+
+ if ((err = dbenv->mutex_unlock(dbenv, gp->mutex)) != 0) {
+ fprintf(stderr, "%s: wakeup: global unlock: %s\n",
+ progname, db_strerror(err));
+ return ((void *)1);
+ }
+
+ __os_yield(env, 0, (u_long)rand() % 3);
+ }
+ return (NULL);
+}
+
+/*
+ * tm_env_init --
+ * Create the backing database environment.
+ */
+int
+tm_env_init()
+{
+ u_int32_t flags;
+ int ret;
+ char *home;
+
+ /*
+ * Create an environment object and initialize it for error
+ * reporting.
+ */
+ if ((ret = db_env_create(&dbenv, 0)) != 0) {
+ fprintf(stderr, "%s: %s\n", progname, db_strerror(ret));
+ return (1);
+ }
+ env = dbenv->env;
+ dbenv->set_errfile(dbenv, stderr);
+ dbenv->set_errpfx(dbenv, progname);
+
+ /* Allocate enough mutexes. */
+ if ((ret = dbenv->mutex_set_increment(dbenv,
+ 1 + nthreads * nprocs + maxlocks)) != 0) {
+ dbenv->err(dbenv, ret, "dbenv->mutex_set_increment");
+ return (1);
+ }
+
+ flags = DB_CREATE;
+ if (nprocs == 1) {
+ home = NULL;
+ flags |= DB_PRIVATE;
+ } else
+ home = TESTDIR;
+ if (nthreads != 1)
+ flags |= DB_THREAD;
+ if ((ret = dbenv->open(dbenv, home, flags, 0)) != 0) {
+ dbenv->err(dbenv, ret, "environment open: %s", home);
+ return (1);
+ }
+
+ return (0);
+}
+
+/*
+ * tm_env_close --
+ * Close the backing database environment.
+ */
+void
+tm_env_close()
+{
+ (void)dbenv->close(dbenv, 0);
+}
+
+/*
+ * tm_mutex_init --
+ * Initialize the mutexes.
+ */
+void
+tm_mutex_init()
+{
+ TM *mp;
+ u_int i;
+ int err;
+
+ if (verbose)
+ printf("Allocate the global mutex: ");
+ mp = (TM *)gm_addr;
+ if ((err = dbenv->mutex_alloc(dbenv, 0, &mp->mutex)) != 0) {
+ fprintf(stderr, "%s: DB_ENV->mutex_alloc (global): %s\n",
+ progname, db_strerror(err));
+ exit(EXIT_FAILURE);
+ }
+ if (verbose)
+ printf("%lu\n", (u_long)mp->mutex);
+
+ if (verbose)
+ printf(
+ "Allocate %d per-thread, self-blocking mutexes: ",
+ nthreads * nprocs);
+ for (i = 0; i < nthreads * nprocs; ++i) {
+ mp = (TM *)(tm_addr + i * sizeof(TM));
+ if ((err = dbenv->mutex_alloc(
+ dbenv, DB_MUTEX_SELF_BLOCK, &mp->mutex)) != 0) {
+ fprintf(stderr,
+ "%s: DB_ENV->mutex_alloc (per-thread %d): %s\n",
+ progname, i, db_strerror(err));
+ exit(EXIT_FAILURE);
+ }
+ if ((err = dbenv->mutex_lock(dbenv, mp->mutex)) != 0) {
+ fprintf(stderr,
+ "%s: DB_ENV->mutex_lock (per-thread %d): %s\n",
+ progname, i, db_strerror(err));
+ exit(EXIT_FAILURE);
+ }
+ if (verbose)
+ printf("%lu ", (u_long)mp->mutex);
+ }
+ if (verbose)
+ printf("\n");
+
+ if (verbose)
+ printf("Allocate %d per-lock mutexes: ", maxlocks);
+ for (i = 0; i < maxlocks; ++i) {
+ mp = (TM *)(lm_addr + i * sizeof(TM));
+ if ((err = dbenv->mutex_alloc(dbenv, 0, &mp->mutex)) != 0) {
+ fprintf(stderr,
+ "%s: DB_ENV->mutex_alloc (per-lock: %d): %s\n",
+ progname, i, db_strerror(err));
+ exit(EXIT_FAILURE);
+ }
+ if (verbose)
+ printf("%lu ", (u_long)mp->mutex);
+ }
+ if (verbose)
+ printf("\n");
+}
+
+/*
+ * tm_mutex_destroy --
+ * Destroy the mutexes.
+ */
+void
+tm_mutex_destroy()
+{
+ TM *gp, *mp;
+ u_int i;
+ int err;
+
+ if (verbose)
+ printf("Destroy the global mutex.\n");
+ gp = (TM *)gm_addr;
+ if ((err = dbenv->mutex_free(dbenv, gp->mutex)) != 0) {
+ fprintf(stderr, "%s: DB_ENV->mutex_free (global): %s\n",
+ progname, db_strerror(err));
+ exit(EXIT_FAILURE);
+ }
+
+ if (verbose)
+ printf("Destroy the per-thread mutexes.\n");
+ for (i = 0; i < nthreads * nprocs; ++i) {
+ mp = (TM *)(tm_addr + i * sizeof(TM));
+ if ((err = dbenv->mutex_free(dbenv, mp->mutex)) != 0) {
+ fprintf(stderr,
+ "%s: DB_ENV->mutex_free (per-thread %d): %s\n",
+ progname, i, db_strerror(err));
+ exit(EXIT_FAILURE);
+ }
+ }
+
+ if (verbose)
+ printf("Destroy the per-lock mutexes.\n");
+ for (i = 0; i < maxlocks; ++i) {
+ mp = (TM *)(lm_addr + i * sizeof(TM));
+ if ((err = dbenv->mutex_free(dbenv, mp->mutex)) != 0) {
+ fprintf(stderr,
+ "%s: DB_ENV->mutex_free (per-lock: %d): %s\n",
+ progname, i, db_strerror(err));
+ exit(EXIT_FAILURE);
+ }
+ }
+}
+
+/*
+ * tm_mutex_stats --
+ * Display mutex statistics.
+ */
+void
+tm_mutex_stats()
+{
+#ifdef HAVE_STATISTICS
+ TM *mp;
+ uintmax_t set_wait, set_nowait;
+ u_int i;
+
+ printf("Per-lock mutex statistics.\n");
+ for (i = 0; i < maxlocks; ++i) {
+ mp = (TM *)(lm_addr + i * sizeof(TM));
+ __mutex_set_wait_info(env, mp->mutex, &set_wait, &set_nowait);
+ printf("mutex %2d: wait: %lu; no wait %lu\n", i,
+ (u_long)set_wait, (u_long)set_nowait);
+ }
+#endif
+}
+
+/*
+ * data_on --
+ * Map in or allocate the backing data space.
+ */
+void
+data_on(gm_addrp, tm_addrp, lm_addrp, fhpp, init)
+ u_int8_t **gm_addrp, **tm_addrp, **lm_addrp;
+ DB_FH **fhpp;
+ int init;
+{
+ DB_FH *fhp;
+ size_t nwrite;
+ int err;
+ void *addr;
+
+ fhp = NULL;
+
+ /*
+ * In a single process, use heap memory.
+ */
+ if (nprocs == 1) {
+ if (init) {
+ if ((err =
+ __os_calloc(env, (size_t)len, 1, &addr)) != 0)
+ exit(EXIT_FAILURE);
+ } else {
+ fprintf(stderr,
+ "%s: init should be set for single process call\n",
+ progname);
+ exit(EXIT_FAILURE);
+ }
+ } else {
+ if (init) {
+ if (verbose)
+ printf("Create the backing file.\n");
+
+ if ((err = __os_open(env, MT_FILE, 0,
+ DB_OSO_CREATE | DB_OSO_TRUNC, 0666, &fhp)) == -1) {
+ fprintf(stderr, "%s: %s: open: %s\n",
+ progname, MT_FILE, db_strerror(err));
+ exit(EXIT_FAILURE);
+ }
+
+ if ((err =
+ __os_seek(env, fhp, 0, 0, (u_int32_t)len)) != 0 ||
+ (err =
+ __os_write(env, fhp, &err, 1, &nwrite)) != 0 ||
+ nwrite != 1) {
+ fprintf(stderr, "%s: %s: seek/write: %s\n",
+ progname, MT_FILE, db_strerror(err));
+ exit(EXIT_FAILURE);
+ }
+ } else
+ if ((err = __os_open(env, MT_FILE, 0, 0, 0, &fhp)) != 0)
+ exit(EXIT_FAILURE);
+
+ if ((err =
+ __os_mapfile(env, MT_FILE, fhp, len, 0, &addr)) != 0)
+ exit(EXIT_FAILURE);
+ }
+
+ *gm_addrp = (u_int8_t *)addr;
+ addr = (u_int8_t *)addr + sizeof(TM);
+ *tm_addrp = (u_int8_t *)addr;
+ addr = (u_int8_t *)addr + sizeof(TM) * (nthreads * nprocs);
+ *lm_addrp = (u_int8_t *)addr;
+
+ if (fhpp != NULL)
+ *fhpp = fhp;
+}
+
+/*
+ * data_off --
+ * Discard or de-allocate the backing data space.
+ */
+void
+data_off(addr, fhp)
+ u_int8_t *addr;
+ DB_FH *fhp;
+{
+ if (nprocs == 1)
+ __os_free(env, addr);
+ else {
+ if (__os_unmapfile(env, addr, len) != 0)
+ exit(EXIT_FAILURE);
+ if (__os_closehandle(env, fhp) != 0)
+ exit(EXIT_FAILURE);
+ }
+}
+
+/*
+ * usage --
+ *	Display a usage message.
+ */
+int
+usage()
+{
+ fprintf(stderr, "usage: %s %s\n\t%s\n", progname,
+ "[-v] [-l maxlocks]",
+ "[-n locks] [-p procs] [-T locker=ID|wakeup=ID] [-t threads]");
+ return (EXIT_FAILURE);
+}
+
+/*
+ * os_wait --
+ * Wait for an array of N procs.
+ */
+int
+os_wait(procs, n)
+ os_pid_t *procs;
+ u_int n;
+{
+ u_int i;
+ int status;
+#if defined(DB_WIN32)
+ DWORD ret;
+#endif
+
+ status = 0;
+
+#if defined(DB_WIN32)
+ do {
+ ret = WaitForMultipleObjects(n, procs, FALSE, INFINITE);
+		i = ret - WAIT_OBJECT_0;
+		/* i is unsigned: any failure return wraps well past n. */
+		if (i >= n)
+ return (__os_posix_err(__os_get_syserr()));
+
+ if ((GetExitCodeProcess(procs[i], &ret) == 0) || (ret != 0))
+ return (ret);
+
+ /* remove the process handle from the list */
+ while (++i < n)
+ procs[i - 1] = procs[i];
+ } while (--n);
+#elif !defined(HAVE_VXWORKS)
+ do {
+ if (wait(&status) == -1)
+ return (__os_posix_err(__os_get_syserr()));
+
+ if (WIFEXITED(status) == 0 || WEXITSTATUS(status) != 0) {
+ for (i = 0; i < n; i++)
+ (void)kill(procs[i], SIGKILL);
+ return (WEXITSTATUS(status));
+ }
+ } while (--n);
+#endif
+
+ return (0);
+}
+
+os_pid_t
+spawn_proc(id, tmpath, typearg)
+ u_long id;
+ char *tmpath, *typearg;
+{
+ char *const vbuf = verbose ? "-v" : NULL;
+ char *args[13], lbuf[16], nbuf[16], pbuf[16], tbuf[16], Tbuf[256];
+
+ args[0] = tmpath;
+ args[1] = "-l";
+ snprintf(lbuf, sizeof(lbuf), "%d", maxlocks);
+ args[2] = lbuf;
+ args[3] = "-n";
+ snprintf(nbuf, sizeof(nbuf), "%d", nlocks);
+ args[4] = nbuf;
+ args[5] = "-p";
+ snprintf(pbuf, sizeof(pbuf), "%d", nprocs);
+ args[6] = pbuf;
+ args[7] = "-t";
+ snprintf(tbuf, sizeof(tbuf), "%d", nthreads);
+ args[8] = tbuf;
+ args[9] = "-T";
+ snprintf(Tbuf, sizeof(Tbuf), "%s=%lu", typearg, id);
+ args[10] = Tbuf;
+ args[11] = vbuf;
+ args[12] = NULL;
+
+ return (os_spawn(tmpath, args));
+}
+
+os_pid_t
+os_spawn(path, argv)
+ const char *path;
+ char *const argv[];
+{
+ os_pid_t pid;
+ int status;
+
+ COMPQUIET(pid, 0);
+ COMPQUIET(status, 0);
+
+#ifdef HAVE_VXWORKS
+ fprintf(stderr, "%s: os_spawn not supported for VxWorks.\n", progname);
+ return (OS_BAD_PID);
+#elif defined(HAVE_QNX)
+ /*
+ * For QNX, we cannot fork if we've ever used threads. So
+ * we'll use their spawn function. We use 'spawnl' which
+ * is NOT a POSIX function.
+ *
+ * The return value of spawnl is just what we want depending
+ * on the value of the 'wait' arg.
+ */
+ return (spawnv(P_NOWAIT, path, argv));
+#elif defined(DB_WIN32)
+ return (os_pid_t)(_spawnv(P_NOWAIT, path, argv));
+#else
+ if ((pid = fork()) != 0) {
+ if (pid == -1)
+ return (OS_BAD_PID);
+ return (pid);
+ } else {
+ (void)execv(path, argv);
+ exit(EXIT_FAILURE);
+ }
+#endif
+}
diff --git a/src/mutex/uts4_cc.s b/src/mutex/uts4_cc.s
new file mode 100644
index 00000000..4f59e9c8
--- /dev/null
+++ b/src/mutex/uts4_cc.s
@@ -0,0 +1,26 @@
+ / See the file LICENSE for redistribution information.
+ /
+ / Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ /
+ / $Id$
+ /
+ / int uts_lock ( int *p, int i );
+ / Update the lock word pointed to by p with the
+ / value i, using compare-and-swap.
+ / Returns 0 if update was successful.
+ / Returns 1 if update failed.
+ /
+ entry uts_lock
+ uts_lock:
+ using .,r15
+ st r2,8(sp) / Save R2
+ l r2,64+0(sp) / R2 -> word to update
+ slr r0, r0 / R0 = current lock value must be 0
+ l r1,64+4(sp) / R1 = new lock value
+ cs r0,r1,0(r2) / Try the update ...
+ be x / ... Success. Return 0
+ la r0,1 / ... Failure. Return 1
+ x: /
+ l r2,8(sp) / Restore R2
+ b 2(,r14) / Return to caller
+ drop r15
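+ /
+ / Illustrative C equivalent (an assumption, not part of this file),
+ / written with a hypothetical compare_and_swap(p, oldval, newval)
+ / primitive that returns nonzero on success:
+ /
+ /	int uts_lock(int *p, int i) {
+ /		return (compare_and_swap(p, 0, i) ? 0 : 1);
+ /	}
+ /
+ / That is, the lock is acquired only if the word was 0 beforehand.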
diff --git a/src/os/os_abort.c b/src/os/os_abort.c
new file mode 100644
index 00000000..68b4bc05
--- /dev/null
+++ b/src/os/os_abort.c
@@ -0,0 +1,33 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_abort --
+ *
+ * PUBLIC: void __os_abort __P((ENV *));
+ */
+void
+__os_abort(env)
+ ENV *env;
+{
+ __os_stack(env); /* Try and get a stack trace. */
+
+#ifdef HAVE_ABORT
+ abort(); /* Try and drop core. */
+ /* NOTREACHED */
+#endif
+#ifdef SIGABRT
+ (void)raise(SIGABRT); /* Try and drop core. */
+#endif
+ exit(1); /* Quit anyway. */
+ /* NOTREACHED */
+}
diff --git a/src/os/os_abs.c b/src/os/os_abs.c
new file mode 100644
index 00000000..4a1a5abd
--- /dev/null
+++ b/src/os/os_abs.c
@@ -0,0 +1,24 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_abspath --
+ * Return if a path is an absolute path.
+ *
+ * PUBLIC: int __os_abspath __P((const char *));
+ */
+int
+__os_abspath(path)
+ const char *path;
+{
+ return (path[0] == '/');
+}
diff --git a/src/os/os_addrinfo.c b/src/os/os_addrinfo.c
new file mode 100644
index 00000000..205f41ec
--- /dev/null
+++ b/src/os/os_addrinfo.c
@@ -0,0 +1,179 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2006, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_getaddrinfo and __os_freeaddrinfo wrap the getaddrinfo and freeaddrinfo
+ * calls, as well as the associated platform dependent error handling, mapping
+ * the error return to a ANSI C/POSIX error return.
+ */
+
+/*
+ * __os_getaddrinfo --
+ *
+ * PUBLIC: #if defined(HAVE_REPLICATION_THREADS)
+ * PUBLIC: int __os_getaddrinfo __P((ENV *, const char *, u_int,
+ * PUBLIC: const char *, const ADDRINFO *, ADDRINFO **));
+ * PUBLIC: #endif
+ */
+int
+__os_getaddrinfo(env, nodename, port, servname, hints, res)
+ ENV *env;
+ const char *nodename, *servname;
+ u_int port;
+ const ADDRINFO *hints;
+ ADDRINFO **res;
+{
+#ifdef HAVE_GETADDRINFO
+ int ret;
+
+ if ((ret = getaddrinfo(nodename, servname, hints, res)) == 0)
+ return (0);
+
+ __db_errx(env, DB_STR_A("0153",
+ "%s(%u): host lookup failed: %s", "%s %u %s"),
+ nodename == NULL ? "" : nodename, port,
+#ifdef DB_WIN32
+ gai_strerrorA(ret));
+#else
+ gai_strerror(ret));
+#endif
+ return (__os_posix_err(ret));
+#else
+ ADDRINFO *answer;
+ struct hostent *hostaddr;
+ struct sockaddr_in sin;
+ u_int32_t tmpaddr;
+ int ret;
+
+ COMPQUIET(hints, NULL);
+ COMPQUIET(servname, NULL);
+
+ /* INADDR_NONE is not defined on Solaris 2.6, 2.7 or 2.8. */
+#ifndef INADDR_NONE
+#define INADDR_NONE ((u_long)0xffffffff)
+#endif
+
+ /*
+ * Basic implementation of IPv4 component of getaddrinfo.
+ * Limited to the functionality used by repmgr.
+ */
+ memset(&sin, 0, sizeof(sin));
+ sin.sin_family = AF_INET;
+ if (nodename) {
+ if (nodename[0] == '\0')
+ sin.sin_addr.s_addr = htonl(INADDR_ANY);
+ else if ((tmpaddr = inet_addr(CHAR_STAR_CAST nodename)) !=
+ INADDR_NONE) {
+ sin.sin_addr.s_addr = tmpaddr;
+ } else {
+ hostaddr = gethostbyname(nodename);
+ if (hostaddr == NULL) {
+#ifdef DB_WIN32
+ ret = __os_get_neterr();
+ __db_syserr(env, ret, DB_STR_A("0154",
+ "%s(%u): host lookup failed", "%s %u"),
+ nodename == NULL ? "" : nodename, port);
+ return (__os_posix_err(ret));
+#else
+ /*
+ * Historic UNIX systems used the h_errno
+ * global variable to return gethostbyname
+ * errors. The only function we currently
+ * use that needs h_errno is gethostbyname,
+ * so we deal with it here.
+ *
+ * hstrerror is not available on Solaris 2.6
+ * (it is in libresolv but is a private,
+ * unexported symbol).
+ */
+#ifdef HAVE_HSTRERROR
+ __db_errx(env, DB_STR_A("0155",
+ "%s(%u): host lookup failed: %s",
+ "%s %u %s"),
+ nodename == NULL ? "" : nodename, port,
+ hstrerror(h_errno));
+#else
+ __db_errx(env, DB_STR_A("0156",
+ "%s(%u): host lookup failed: %d",
+ "%s %u %d"),
+ nodename == NULL ? "" : nodename, port,
+ h_errno);
+#endif
+ switch (h_errno) {
+ case HOST_NOT_FOUND:
+ case NO_DATA:
+ return (EHOSTUNREACH);
+ case TRY_AGAIN:
+ return (EAGAIN);
+ case NO_RECOVERY:
+ default:
+ return (EFAULT);
+ }
+ /* NOTREACHED */
+#endif
+ }
+ memcpy(&(sin.sin_addr),
+ hostaddr->h_addr, (size_t)hostaddr->h_length);
+ }
+ } else /* No host specified. */
+ sin.sin_addr.s_addr = htonl(INADDR_ANY);
+ sin.sin_port = htons((u_int16_t)port);
+
+ if ((ret = __os_calloc(env, 1, sizeof(ADDRINFO), &answer)) != 0)
+ return (ret);
+ if ((ret = __os_malloc(env, sizeof(sin), &answer->ai_addr)) != 0) {
+ __os_free(env, answer);
+ return (ret);
+ }
+
+ answer->ai_family = AF_INET;
+ answer->ai_protocol = IPPROTO_TCP;
+ answer->ai_socktype = SOCK_STREAM;
+ answer->ai_addrlen = sizeof(sin);
+ memcpy(answer->ai_addr, &sin, sizeof(sin));
+ *res = answer;
+
+ return (0);
+#endif /* HAVE_GETADDRINFO */
+}
+
+/*
+ * __os_freeaddrinfo --
+ *
+ * PUBLIC: #if defined(HAVE_REPLICATION_THREADS)
+ * PUBLIC: void __os_freeaddrinfo __P((ENV *, ADDRINFO *));
+ * PUBLIC: #endif
+ */
+void
+__os_freeaddrinfo(env, ai)
+ ENV *env;
+ ADDRINFO *ai;
+{
+#ifdef HAVE_GETADDRINFO
+ COMPQUIET(env, NULL);
+
+ freeaddrinfo(ai);
+#else
+ ADDRINFO *next, *tmpaddr;
+
+ for (next = ai; next != NULL; next = tmpaddr) {
+ if (next->ai_canonname != NULL)
+ __os_free(env, next->ai_canonname);
+
+ if (next->ai_addr != NULL)
+ __os_free(env, next->ai_addr);
+
+ tmpaddr = next->ai_next;
+ __os_free(env, next);
+ }
+#endif
+}
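+
+/*
+ * Usage sketch (hypothetical caller, not part of the source): resolve a
+ * listen address, then release the result with the matching free call:
+ *
+ *	ADDRINFO *res;
+ *	int ret;
+ *
+ *	if ((ret = __os_getaddrinfo(env,
+ *	    "localhost", 6000, "6000", NULL, &res)) != 0)
+ *		return (ret);
+ *	... bind/connect using res->ai_addr and res->ai_addrlen ...
+ *	__os_freeaddrinfo(env, res);
+ */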
diff --git a/src/os/os_alloc.c b/src/os/os_alloc.c
new file mode 100644
index 00000000..fb7bf109
--- /dev/null
+++ b/src/os/os_alloc.c
@@ -0,0 +1,464 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+#ifdef DIAGNOSTIC
+static void __os_guard __P((ENV *));
+
+typedef union {
+ size_t size;
+ uintmax_t align;
+} db_allocinfo_t;
+#endif
+
+/*
+ * !!!
+ * Correct for systems that return NULL when you allocate 0 bytes of memory.
+ * There are several places in DB where we allocate the number of bytes held
+ * by the key/data item, and it can be 0. Correct here so that malloc never
+ * returns a NULL for that reason (which behavior is permitted by ANSI). We
+ * could make these calls macros on non-Alpha architectures (that's where we
+ * saw the problem), but it's probably not worth the autoconf complexity.
+ *
+ * !!!
+ * Correct for systems that don't set errno when malloc and friends fail.
+ *
+ * Out of memory.
+ * We wish to hold the whole sky,
+ * But we never will.
+ */
+
+/*
+ * __os_umalloc --
+ * Allocate memory to be used by the application.
+ *
+ * Use, in order of preference, the allocation function specified to the
+ * ENV handle, the allocation function specified as a replacement for
+ * the library malloc, or the library malloc().
+ *
+ * PUBLIC: int __os_umalloc __P((ENV *, size_t, void *));
+ */
+int
+__os_umalloc(env, size, storep)
+ ENV *env;
+ size_t size;
+ void *storep;
+{
+ DB_ENV *dbenv;
+ int ret;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+
+ /* Never allocate 0 bytes -- some C libraries don't like it. */
+ if (size == 0)
+ ++size;
+
+ if (dbenv == NULL || dbenv->db_malloc == NULL) {
+ if (DB_GLOBAL(j_malloc) != NULL)
+ *(void **)storep = DB_GLOBAL(j_malloc)(size);
+ else
+ *(void **)storep = malloc(size);
+ if (*(void **)storep == NULL) {
+ /*
+ * Correct error return, see __os_malloc.
+ */
+ if ((ret = __os_get_errno_ret_zero()) == 0) {
+ ret = ENOMEM;
+ __os_set_errno(ENOMEM);
+ }
+ __db_err(env, ret, DB_STR_A("0143", "malloc: %lu",
+ "%lu"), (u_long)size);
+ return (ret);
+ }
+ return (0);
+ }
+
+ if ((*(void **)storep = dbenv->db_malloc(size)) == NULL) {
+ __db_errx(env, DB_STR("0144",
+ "user-specified malloc function returned NULL"));
+ return (ENOMEM);
+ }
+
+ return (0);
+}
+
+/*
+ * __os_urealloc --
+ * Allocate memory to be used by the application.
+ *
+ * A realloc(3) counterpart to __os_umalloc's malloc(3).
+ *
+ * PUBLIC: int __os_urealloc __P((ENV *, size_t, void *));
+ */
+int
+__os_urealloc(env, size, storep)
+ ENV *env;
+ size_t size;
+ void *storep;
+{
+ DB_ENV *dbenv;
+ int ret;
+ void *ptr;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+ ptr = *(void **)storep;
+
+ /* Never allocate 0 bytes -- some C libraries don't like it. */
+ if (size == 0)
+ ++size;
+
+ if (dbenv == NULL || dbenv->db_realloc == NULL) {
+ if (ptr == NULL)
+ return (__os_umalloc(env, size, storep));
+
+ if (DB_GLOBAL(j_realloc) != NULL)
+ *(void **)storep = DB_GLOBAL(j_realloc)(ptr, size);
+ else
+ *(void **)storep = realloc(ptr, size);
+ if (*(void **)storep == NULL) {
+ /*
+ * Correct errno, see __os_realloc.
+ */
+ if ((ret = __os_get_errno_ret_zero()) == 0) {
+ ret = ENOMEM;
+ __os_set_errno(ENOMEM);
+ }
+ __db_err(env, ret, DB_STR_A("0145",
+ "realloc: %lu", "%lu"), (u_long)size);
+ return (ret);
+ }
+ return (0);
+ }
+
+ if ((*(void **)storep = dbenv->db_realloc(ptr, size)) == NULL) {
+ __db_errx(env, DB_STR("0146",
+ "User-specified realloc function returned NULL"));
+ return (ENOMEM);
+ }
+
+ return (0);
+}
+
+/*
+ * __os_ufree --
+ * Free memory used by the application.
+ *
+ * A free(3) counterpart to __os_umalloc's malloc(3).
+ *
+ * PUBLIC: void __os_ufree __P((ENV *, void *));
+ */
+void
+__os_ufree(env, ptr)
+ ENV *env;
+ void *ptr;
+{
+ DB_ENV *dbenv;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+
+ if (dbenv != NULL && dbenv->db_free != NULL)
+ dbenv->db_free(ptr);
+ else if (DB_GLOBAL(j_free) != NULL)
+ DB_GLOBAL(j_free)(ptr);
+ else
+ free(ptr);
+}
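+
+/*
+ * Calling-convention sketch (illustrative, not from the source): storep is
+ * declared "void *" but is really the address of the caller's pointer:
+ *
+ *	char *p;
+ *	int ret;
+ *
+ *	if ((ret = __os_umalloc(env, 100, &p)) != 0)
+ *		return (ret);		// p is NULL on failure
+ *	...
+ *	__os_ufree(env, p);
+ */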
+
+/*
+ * __os_strdup --
+ * The strdup(3) function for DB.
+ *
+ * PUBLIC: int __os_strdup __P((ENV *, const char *, void *));
+ */
+int
+__os_strdup(env, str, storep)
+ ENV *env;
+ const char *str;
+ void *storep;
+{
+ size_t size;
+ int ret;
+ void *p;
+
+ *(void **)storep = NULL;
+
+ size = strlen(str) + 1;
+ if ((ret = __os_malloc(env, size, &p)) != 0)
+ return (ret);
+
+ memcpy(p, str, size);
+
+ *(void **)storep = p;
+ return (0);
+}
+
+/*
+ * __os_calloc --
+ * The calloc(3) function for DB.
+ *
+ * PUBLIC: int __os_calloc __P((ENV *, size_t, size_t, void *));
+ */
+int
+__os_calloc(env, num, size, storep)
+ ENV *env;
+ size_t num, size;
+ void *storep;
+{
+ int ret;
+
+ size *= num;
+ if ((ret = __os_malloc(env, size, storep)) != 0)
+ return (ret);
+
+ memset(*(void **)storep, 0, size);
+
+ return (0);
+}
+
+/*
+ * __os_malloc --
+ * The malloc(3) function for DB.
+ *
+ * PUBLIC: int __os_malloc __P((ENV *, size_t, void *));
+ */
+int
+__os_malloc(env, size, storep)
+ ENV *env;
+ size_t size;
+ void *storep;
+{
+ int ret;
+ void *p;
+
+ *(void **)storep = NULL;
+
+ /* Never allocate 0 bytes -- some C libraries don't like it. */
+ if (size == 0)
+ ++size;
+
+#ifdef DIAGNOSTIC
+ /* Add room for size and a guard byte. */
+ size += sizeof(db_allocinfo_t) + 1;
+#endif
+
+ if (DB_GLOBAL(j_malloc) != NULL)
+ p = DB_GLOBAL(j_malloc)(size);
+ else
+ p = malloc(size);
+ if (p == NULL) {
+ /*
+ * Some C libraries don't correctly set errno when malloc(3)
+ * fails. We'd like to 0 out errno before calling malloc,
+ * but it turns out that setting errno is quite expensive on
+ * Windows/NT in an MT environment.
+ */
+ if ((ret = __os_get_errno_ret_zero()) == 0) {
+ ret = ENOMEM;
+ __os_set_errno(ENOMEM);
+ }
+ __db_err(env, ret, DB_STR_A("0147", "malloc: %lu", "%lu"),
+ (u_long)size);
+ return (ret);
+ }
+
+#ifdef DIAGNOSTIC
+ /* Overwrite memory. */
+ memset(p, CLEAR_BYTE, size);
+
+ /*
+ * Guard bytes: if #DIAGNOSTIC is defined, we allocate an additional
+ * byte after the memory and set it to a special value that we check
+ * for when the memory is free'd.
+ */
+ ((u_int8_t *)p)[size - 1] = CLEAR_BYTE;
+
+ ((db_allocinfo_t *)p)->size = size;
+ p = &((db_allocinfo_t *)p)[1];
+#endif
+ *(void **)storep = p;
+
+ return (0);
+}
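+
+/*
+ * Illustrative layout (not in the source): with DIAGNOSTIC defined, an
+ * N-byte request is laid out as
+ *
+ *	[db_allocinfo_t: total size][N user bytes ...][guard byte]
+ *	^-- address malloc returned ^-- address handed to the caller
+ *
+ * letting __os_free() step back one db_allocinfo_t to recover the size and
+ * check that the trailing guard byte is still CLEAR_BYTE.
+ */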
+
+/*
+ * __os_realloc --
+ * The realloc(3) function for DB.
+ *
+ * PUBLIC: int __os_realloc __P((ENV *, size_t, void *));
+ */
+int
+__os_realloc(env, size, storep)
+ ENV *env;
+ size_t size;
+ void *storep;
+{
+ int ret;
+ void *p, *ptr;
+
+ ptr = *(void **)storep;
+
+ /* Never allocate 0 bytes -- some C libraries don't like it. */
+ if (size == 0)
+ ++size;
+
+	/* If we haven't allocated anything yet, simply call malloc. */
+ if (ptr == NULL)
+ return (__os_malloc(env, size, storep));
+
+#ifdef DIAGNOSTIC
+ /* Add room for size and a guard byte. */
+ size += sizeof(db_allocinfo_t) + 1;
+
+ /* Back up to the real beginning */
+ ptr = &((db_allocinfo_t *)ptr)[-1];
+
+ {
+ size_t s;
+
+ s = ((db_allocinfo_t *)ptr)->size;
+ if (((u_int8_t *)ptr)[s - 1] != CLEAR_BYTE)
+ __os_guard(env);
+ }
+#endif
+
+ /*
+ * Don't overwrite the original pointer, there are places in DB we
+ * try to continue after realloc fails.
+ */
+ if (DB_GLOBAL(j_realloc) != NULL)
+ p = DB_GLOBAL(j_realloc)(ptr, size);
+ else
+ p = realloc(ptr, size);
+ if (p == NULL) {
+ /*
+ * Some C libraries don't correctly set errno when malloc(3)
+ * fails. We'd like to 0 out errno before calling malloc,
+ * but it turns out that setting errno is quite expensive on
+ * Windows/NT in an MT environment.
+ */
+ if ((ret = __os_get_errno_ret_zero()) == 0) {
+ ret = ENOMEM;
+ __os_set_errno(ENOMEM);
+ }
+ __db_err(env, ret, DB_STR_A("0148", "realloc: %lu", "%lu"),
+ (u_long)size);
+ return (ret);
+ }
+#ifdef DIAGNOSTIC
+ ((u_int8_t *)p)[size - 1] = CLEAR_BYTE; /* Initialize guard byte. */
+
+ ((db_allocinfo_t *)p)->size = size;
+ p = &((db_allocinfo_t *)p)[1];
+#endif
+
+ *(void **)storep = p;
+
+ return (0);
+}
+
+/*
+ * __os_free --
+ * The free(3) function for DB.
+ *
+ * PUBLIC: void __os_free __P((ENV *, void *));
+ */
+void
+__os_free(env, ptr)
+ ENV *env;
+ void *ptr;
+{
+#ifdef DIAGNOSTIC
+ size_t size;
+#endif
+
+ /*
+ * ANSI C requires free(NULL) work. Don't depend on the underlying
+ * library.
+ */
+ if (ptr == NULL)
+ return;
+
+#ifdef DIAGNOSTIC
+ /*
+ * Check that the guard byte (one past the end of the memory) is
+ * still CLEAR_BYTE.
+ */
+ ptr = &((db_allocinfo_t *)ptr)[-1];
+ size = ((db_allocinfo_t *)ptr)->size;
+ if (((u_int8_t *)ptr)[size - 1] != CLEAR_BYTE)
+ __os_guard(env);
+
+ /* Overwrite memory. */
+ if (size != 0)
+ memset(ptr, CLEAR_BYTE, size);
+#else
+ COMPQUIET(env, NULL);
+#endif
+
+ if (DB_GLOBAL(j_free) != NULL)
+ DB_GLOBAL(j_free)(ptr);
+ else
+ free(ptr);
+}
+
+#ifdef DIAGNOSTIC
+/*
+ * __os_guard --
+ * Complain and abort.
+ */
+static void
+__os_guard(env)
+ ENV *env;
+{
+ __db_errx(env, DB_STR("0149",
+ "Guard byte incorrect during free"));
+ __os_abort(env);
+ /* NOTREACHED */
+}
+#endif
+
+/*
+ * __ua_memcpy --
+ * Copy memory to memory without relying on any kind of alignment.
+ *
+ * There are places in DB that we have unaligned data, for example,
+ * when we've stored a structure in a log record as a DBT, and now
+ * we want to look at it. Unfortunately, if you have code like:
+ *
+ * struct a {
+ * int x;
+ * } *p;
+ *
+ * void *func_argument;
+ * int local;
+ *
+ * p = (struct a *)func_argument;
+ * memcpy(&local, p->x, sizeof(local));
+ *
+ * compilers optimize to use inline instructions requiring alignment,
+ * and records in the log don't have any particular alignment. (This
+ * isn't a compiler bug: because the access is through a structure, the
+ * compiler is allowed to assume alignment.)
+ *
+ * Casting the memcpy arguments to (u_int8_t *) appears to work most
+ * of the time, but we've seen examples where it wasn't sufficient
+ * and there's nothing in ANSI C that requires that work.
+ *
+ * PUBLIC: void *__ua_memcpy __P((void *, const void *, size_t));
+ */
+void *
+__ua_memcpy(dst, src, len)
+ void *dst;
+ const void *src;
+ size_t len;
+{
+ return ((void *)memcpy(dst, src, len));
+}
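+
+/*
+ * Usage sketch (hypothetical): pulling a possibly-unaligned u_int32_t out
+ * of a log record buffer:
+ *
+ *	u_int32_t val;
+ *	u_int8_t *recp;		// points into an unaligned log record
+ *
+ *	(void)__ua_memcpy(&val, recp, sizeof(val));
+ */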
diff --git a/src/os/os_clock.c b/src/os/os_clock.c
new file mode 100644
index 00000000..25eeb704
--- /dev/null
+++ b/src/os/os_clock.c
@@ -0,0 +1,73 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_gettime --
+ * Return the current time-of-day clock in seconds and nanoseconds.
+ *
+ * PUBLIC: void __os_gettime __P((ENV *, db_timespec *, int));
+ */
+void
+__os_gettime(env, tp, monotonic)
+ ENV *env;
+ db_timespec *tp;
+ int monotonic;
+{
+ const char *sc;
+ int ret;
+
+#if defined(HAVE_CLOCK_GETTIME)
+#if defined(HAVE_CLOCK_MONOTONIC)
+ if (monotonic)
+ RETRY_CHK((clock_gettime(
+ CLOCK_MONOTONIC, (struct timespec *)tp)), ret);
+ else
+#endif
+ RETRY_CHK((clock_gettime(
+ CLOCK_REALTIME, (struct timespec *)tp)), ret);
+
+ if (ret != 0) {
+ sc = "clock_gettime";
+ goto err;
+ }
+#elif defined(HAVE_GETTIMEOFDAY)
+ struct timeval v;
+
+ RETRY_CHK((gettimeofday(&v, NULL)), ret);
+ if (ret != 0) {
+ sc = "gettimeofday";
+ goto err;
+ }
+
+ tp->tv_sec = v.tv_sec;
+ tp->tv_nsec = v.tv_usec * NS_PER_US;
+#elif defined(HAVE_TIME)
+ time_t now;
+
+ RETRY_CHK((time(&now) == (time_t)-1 ? 1 : 0), ret);
+ if (ret != 0) {
+ sc = "time";
+ goto err;
+ }
+
+ tp->tv_sec = now;
+ tp->tv_nsec = 0;
+#else
+ NO AVAILABLE CLOCK IMPLEMENTATION
+#endif
+ COMPQUIET(monotonic, 0);
+ return;
+
+err: __db_syserr(env, ret, "%s", sc);
+ (void)__env_panic(env, __os_posix_err(ret));
+}
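+
+/*
+ * Usage sketch (an assumption, not from the source): timing an interval,
+ * preferring the monotonic clock where one exists:
+ *
+ *	db_timespec start, end;
+ *
+ *	__os_gettime(env, &start, 1);	// nonzero: monotonic if available
+ *	... work ...
+ *	__os_gettime(env, &end, 1);
+ */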
diff --git a/src/os/os_config.c b/src/os/os_config.c
new file mode 100644
index 00000000..c455a349
--- /dev/null
+++ b/src/os/os_config.c
@@ -0,0 +1,70 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_fs_notzero --
+ * Return 1 if allocated filesystem blocks are not zeroed.
+ *
+ * PUBLIC: int __os_fs_notzero __P((void));
+ */
+int
+__os_fs_notzero()
+{
+ /* Most filesystems zero out implicitly created pages. */
+ return (0);
+}
+
+/*
+ * __os_support_direct_io --
+ * Return 1 if we support direct I/O.
+ *
+ * PUBLIC: int __os_support_direct_io __P((void));
+ */
+int
+__os_support_direct_io()
+{
+ int ret;
+
+ ret = 0;
+
+#ifdef HAVE_O_DIRECT
+ ret = 1;
+#endif
+#if defined(HAVE_DIRECTIO) && defined(DIRECTIO_ON)
+ ret = 1;
+#endif
+ return (ret);
+}
+
+/*
+ * __os_support_db_register --
+ * Return 1 if the system supports DB_REGISTER.
+ *
+ * PUBLIC: int __os_support_db_register __P((void));
+ */
+int
+__os_support_db_register()
+{
+ return (1);
+}
+
+/*
+ * __os_support_replication --
+ * Return 1 if the system supports replication.
+ *
+ * PUBLIC: int __os_support_replication __P((void));
+ */
+int
+__os_support_replication()
+{
+ return (1);
+}
diff --git a/src/os/os_cpu.c b/src/os/os_cpu.c
new file mode 100644
index 00000000..6b7f9f1e
--- /dev/null
+++ b/src/os/os_cpu.c
@@ -0,0 +1,47 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+#ifdef HAVE_SYSTEM_INCLUDE_FILES
+#if defined(HAVE_PSTAT_GETDYNAMIC)
+#include <sys/pstat.h>
+#endif
+#endif
+
+/*
+ * __os_cpu_count --
+ * Return the number of CPUs.
+ *
+ * PUBLIC: u_int32_t __os_cpu_count __P((void));
+ */
+u_int32_t
+__os_cpu_count()
+{
+#if defined(HAVE_PSTAT_GETDYNAMIC)
+ /*
+ * HP/UX.
+ */
+ struct pst_dynamic psd;
+
+ return ((u_int32_t)pstat_getdynamic(&psd,
+ sizeof(psd), (size_t)1, 0) == -1 ? 1 : psd.psd_proc_cnt);
+#elif defined(HAVE_SYSCONF) && defined(_SC_NPROCESSORS_ONLN)
+ /*
+ * Solaris, Linux.
+ */
+ long nproc;
+
+ nproc = sysconf(_SC_NPROCESSORS_ONLN);
+ return ((u_int32_t)(nproc > 1 ? nproc : 1));
+#else
+ return (1);
+#endif
+}
diff --git a/src/os/os_ctime.c b/src/os/os_ctime.c
new file mode 100644
index 00000000..3f656c32
--- /dev/null
+++ b/src/os/os_ctime.c
@@ -0,0 +1,47 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_ctime --
+ * Format a time-stamp.
+ *
+ * PUBLIC: char *__os_ctime __P((const time_t *, char *));
+ */
+char *
+__os_ctime(tod, time_buf)
+ const time_t *tod;
+ char *time_buf;
+{
+ time_buf[CTIME_BUFLEN - 1] = '\0';
+
+ /*
+ * The ctime_r interface is the POSIX standard, thread-safe version of
+ * ctime. However, it was implemented in three different ways (with
+ * and without a buffer length argument, and where the buffer length
+ * argument was an int vs. a size_t *). Also, you can't depend on a
+ * return of (char *) from ctime_r, HP-UX 10.XX's version returned an
+ * int.
+ */
+#if defined(HAVE_VXWORKS)
+ {
+ size_t buflen = CTIME_BUFLEN;
+ (void)ctime_r(tod, time_buf, &buflen);
+ }
+#elif defined(HAVE_CTIME_R_3ARG)
+ (void)ctime_r(tod, time_buf, CTIME_BUFLEN);
+#elif defined(HAVE_CTIME_R)
+ (void)ctime_r(tod, time_buf);
+#else
+ (void)strncpy(time_buf, ctime(tod), CTIME_BUFLEN - 1);
+#endif
+ return (time_buf);
+}
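+
+/*
+ * Usage sketch (illustrative): the caller supplies a buffer of at least
+ * CTIME_BUFLEN bytes; the formatted string includes a trailing newline:
+ *
+ *	time_t now;
+ *	char buf[CTIME_BUFLEN];
+ *
+ *	(void)time(&now);
+ *	(void)printf("%s", __os_ctime(&now, buf));
+ */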
diff --git a/src/os/os_dir.c b/src/os/os_dir.c
new file mode 100644
index 00000000..42bad194
--- /dev/null
+++ b/src/os/os_dir.c
@@ -0,0 +1,140 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#if HAVE_DIRENT_H
+# include <dirent.h>
+# define NAMLEN(dirent) strlen((dirent)->d_name)
+#else
+# define dirent direct
+# define NAMLEN(dirent) (dirent)->d_namlen
+# if HAVE_SYS_NDIR_H
+# include <sys/ndir.h>
+# endif
+# if HAVE_SYS_DIR_H
+# include <sys/dir.h>
+# endif
+# if HAVE_NDIR_H
+# include <ndir.h>
+# endif
+#endif
+
+#include "db_int.h"
+
+/*
+ * __os_dirlist --
+ * Return a list of the files in a directory.
+ *
+ * PUBLIC: int __os_dirlist __P((ENV *, const char *, int, char ***, int *));
+ */
+int
+__os_dirlist(env, dir, returndir, namesp, cntp)
+ ENV *env;
+ const char *dir;
+ int returndir, *cntp;
+ char ***namesp;
+{
+ DB_ENV *dbenv;
+ struct dirent *dp;
+ DIR *dirp;
+ struct stat sb;
+ int arraysz, cnt, ret;
+ char **names, buf[DB_MAXPATHLEN];
+
+ *namesp = NULL;
+ *cntp = 0;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+
+ if (dbenv != NULL &&
+ FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0159",
+ "fileops: directory list %s", "%s"), dir);
+
+ if (DB_GLOBAL(j_dirlist) != NULL)
+ return (DB_GLOBAL(j_dirlist)(dir, namesp, cntp));
+
+ if ((dirp = opendir(CHAR_STAR_CAST dir)) == NULL)
+ return (__os_get_errno());
+ names = NULL;
+ for (arraysz = cnt = 0; (dp = readdir(dirp)) != NULL;) {
+ snprintf(buf, sizeof(buf), "%s/%s", dir, dp->d_name);
+
+ RETRY_CHK(stat(buf, &sb), ret);
+ if (ret != 0) {
+ ret = __os_posix_err(ret);
+ /* Ignore entries that no longer exist. */
+ if (ret == ENOENT)
+ continue;
+
+ goto err;
+ }
+
+ /*
+ * We return regular files, and optionally return directories
+ * (except for dot and dot-dot).
+ *
+ * Shared memory files are of a different type on QNX, and we
+ * return those as well.
+ */
+#ifdef HAVE_QNX
+ if (!S_ISREG(sb.st_mode) && !S_TYPEISSHM(&sb)) {
+#else
+ if (!S_ISREG(sb.st_mode)) {
+#endif
+ if (!returndir || !S_ISDIR(sb.st_mode))
+ continue;
+ if (dp->d_name[0] == '.' && (dp->d_name[1] == '\0' ||
+ (dp->d_name[1] == '.' && dp->d_name[2] == '\0')))
+ continue;
+ }
+
+ if (cnt >= arraysz) {
+ arraysz += 100;
+ if ((ret = __os_realloc(env,
+ (u_int)arraysz * sizeof(names[0]), &names)) != 0)
+ goto err;
+ }
+ if ((ret = __os_strdup(env, dp->d_name, &names[cnt])) != 0)
+ goto err;
+ cnt++;
+ }
+ (void)closedir(dirp);
+
+ *namesp = names;
+ *cntp = cnt;
+ return (0);
+
+err: if (names != NULL)
+ __os_dirfree(env, names, cnt);
+ if (dirp != NULL)
+ (void)closedir(dirp);
+ return (ret);
+}
+
+/*
+ * __os_dirfree --
+ * Free the list of files.
+ *
+ * PUBLIC: void __os_dirfree __P((ENV *, char **, int));
+ */
+void
+__os_dirfree(env, names, cnt)
+ ENV *env;
+ char **names;
+ int cnt;
+{
+ if (DB_GLOBAL(j_dirfree) != NULL)
+ DB_GLOBAL(j_dirfree)(names, cnt);
+ else {
+ while (cnt > 0)
+ __os_free(env, names[--cnt]);
+ __os_free(env, names);
+ }
+}
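+
+/*
+ * Usage sketch (hypothetical caller): a successful __os_dirlist() is paired
+ * with __os_dirfree() of the returned array:
+ *
+ *	char **names;
+ *	int cnt, i, ret;
+ *
+ *	if ((ret = __os_dirlist(env, "TESTDIR", 0, &names, &cnt)) != 0)
+ *		return (ret);
+ *	for (i = 0; i < cnt; ++i)
+ *		(void)printf("%s\n", names[i]);
+ *	__os_dirfree(env, names, cnt);
+ */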
diff --git a/src/os/os_errno.c b/src/os/os_errno.c
new file mode 100644
index 00000000..a8219f90
--- /dev/null
+++ b/src/os/os_errno.c
@@ -0,0 +1,129 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_get_errno_ret_zero --
+ * Return the last system error, including an error of zero.
+ *
+ * PUBLIC: int __os_get_errno_ret_zero __P((void));
+ */
+int
+__os_get_errno_ret_zero()
+{
+ /* This routine must be able to return the same value repeatedly. */
+ return (errno);
+}
+
+/*
+ * We've seen cases where system calls failed but errno was never set. For
+ * that reason, __os_get_errno() and __os_get_syserr() set errno to EAGAIN if
+ * it's not already set, to work around the problem. For obvious reasons,
+ * we can only call this function if we know an error has occurred, that
+ * is, we can't test the return for a non-zero value after the get call.
+ *
+ * __os_get_errno --
+ * Return the last ANSI C "errno" value or EAGAIN if the last error
+ * is zero.
+ *
+ * PUBLIC: int __os_get_errno __P((void));
+ */
+int
+__os_get_errno()
+{
+ /* This routine must be able to return the same value repeatedly. */
+ return (__os_get_syserr());
+}
+
+#if 0
+/*
+ * __os_get_neterr --
+ * Return the last network-related error or EAGAIN if the last
+ * error is zero.
+ *
+ * PUBLIC: int __os_get_neterr __P((void));
+ */
+int
+__os_get_neterr()
+{
+ /* This routine must be able to return the same value repeatedly. */
+ return (__os_get_syserr());
+}
+#endif
+
+/*
+ * __os_get_syserr --
+ * Return the last system error or EAGAIN if the last error is zero.
+ *
+ * PUBLIC: int __os_get_syserr __P((void));
+ */
+int
+__os_get_syserr()
+{
+ /* This routine must be able to return the same value repeatedly. */
+ if (errno == 0)
+ __os_set_errno(EAGAIN);
+ return (errno);
+}
+
+/*
+ * __os_set_errno --
+ * Set the value of errno.
+ *
+ * PUBLIC: void __os_set_errno __P((int));
+ */
+void
+__os_set_errno(evalue)
+ int evalue;
+{
+ /*
+ * This routine is called by the compatibility interfaces (DB 1.85,
+ * dbm and hsearch). Force values > 0, that is, not one of DB 2.X
+ * and later's public error returns. If something bad has happened,
+ * default to EFAULT -- a nasty return. Otherwise, default to EINVAL.
+ * As the compatibility APIs aren't included on Windows, the Windows
+ * version of this routine doesn't need this behavior.
+ */
+ errno =
+ evalue >= 0 ? evalue : (evalue == DB_RUNRECOVERY ? EFAULT : EINVAL);
+}
+
+/*
+ * __os_strerror --
+ * Return a string associated with the system error.
+ *
+ * PUBLIC: char *__os_strerror __P((int, char *, size_t));
+ */
+char *
+__os_strerror(error, buf, len)
+ int error;
+ char *buf;
+ size_t len;
+{
+ /* No translation is needed in the POSIX layer. */
+ (void)strncpy(buf, strerror(error), len - 1);
+ buf[len - 1] = '\0';
+
+ return (buf);
+}
+
+/*
+ * __os_posix_err
+ * Convert a system error to a POSIX error.
+ *
+ * PUBLIC: int __os_posix_err __P((int));
+ */
+int
+__os_posix_err(error)
+ int error;
+{
+ return (error);
+}
diff --git a/src/os/os_fid.c b/src/os/os_fid.c
new file mode 100644
index 00000000..f2d80e25
--- /dev/null
+++ b/src/os/os_fid.c
@@ -0,0 +1,135 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_fileid --
+ * Return a unique identifier for a file.
+ *
+ * PUBLIC: int __os_fileid __P((ENV *, const char *, int, u_int8_t *));
+ */
+int
+__os_fileid(env, fname, unique_okay, fidp)
+ ENV *env;
+ const char *fname;
+ int unique_okay;
+ u_int8_t *fidp;
+{
+ pid_t pid;
+ size_t i;
+ u_int32_t tmp;
+ u_int8_t *p;
+
+#ifdef HAVE_STAT
+ struct stat sb;
+ int ret;
+
+ /*
+ * The structure of a fileid on a POSIX/UNIX system is:
+ *
+ * ino[4] dev[4] unique-ID[4] serial-counter[4] empty[4].
+ *
+ * For real files, which have a backing inode and device, the first
+ * 8 bytes are filled in and the following bytes are left 0. For
+ * temporary files, the following 12 bytes are filled in.
+ *
+ * Clear the buffer.
+ */
+ memset(fidp, 0, DB_FILE_ID_LEN);
+ RETRY_CHK((stat(CHAR_STAR_CAST fname, &sb)), ret);
+ if (ret != 0) {
+ __db_syserr(env, ret, DB_STR_A("0158",
+ "stat: %s", "%s"), fname);
+ return (__os_posix_err(ret));
+ }
+
+ /*
+ * !!!
+ * Nothing is ever big enough -- on Sparc V9, st_ino, st_dev and the
+ * time_t types are all 8 bytes. As DB_FILE_ID_LEN is only 20 bytes,
+ * we convert to a (potentially) smaller fixed-size type and use it.
+ *
+ * We don't worry about byte sexing or the actual variable sizes.
+ *
+ * When this routine is called from the DB access methods, it's only
+ * called once -- whatever ID is generated when a database is created
+ * is stored in the database file's metadata, and that is what is
+ * saved in the mpool region's information to uniquely identify the
+ * file.
+ *
+ * When called from the mpool layer this routine will be called each
+ * time a new thread of control wants to share the file, which makes
+ * things tougher. As far as byte sexing goes, since the mpool region
+ * lives on a single host, there's no issue of that -- the entire
+ * region is byte sex dependent. As far as variable sizes go, we make
+ * the simplifying assumption that 32-bit and 64-bit processes will
+ * get the same 32-bit values if we truncate any returned 64-bit value
+ * to a 32-bit value. When we're called from the mpool layer, though,
+ * we need to be careful not to include anything that isn't
+ * reproducible for a given file, such as the timestamp or serial
+ * number.
+ */
+ tmp = (u_int32_t)sb.st_ino;
+ for (p = (u_int8_t *)&tmp, i = sizeof(u_int32_t); i > 0; --i)
+ *fidp++ = *p++;
+
+ tmp = (u_int32_t)sb.st_dev;
+ for (p = (u_int8_t *)&tmp, i = sizeof(u_int32_t); i > 0; --i)
+ *fidp++ = *p++;
+#else
+ /*
+ * Use the file name.
+ *
+ * XXX
+ * Cast the first argument, the BREW ARM compiler is unhappy if
+ * we don't.
+ */
+ (void)strncpy((char *)fidp, fname, DB_FILE_ID_LEN);
+#endif /* HAVE_STAT */
+
+ if (unique_okay) {
+ /* Add in 32-bits of (hopefully) unique number. */
+ __os_unique_id(env, &tmp);
+ for (p = (u_int8_t *)&tmp, i = sizeof(u_int32_t); i > 0; --i)
+ *fidp++ = *p++;
+
+ /*
+ * Initialize/increment the serial number we use to help
+ * avoid fileid collisions. Note we don't bother with
+ * locking; it's unpleasant to do from down in here, and
+ * if we race on this no real harm will be done, since the
+ * finished fileid has so many other components.
+ *
+ * We use the bottom 32-bits of the process ID, hoping they
+ * are more random than the top 32-bits (should we be on a
+ * machine with 64-bit process IDs).
+ *
+ * We increment by 100000 on each call as a simple way of
+ * randomizing; simply incrementing seems potentially less
+ * useful if pids are also simply incremented, since this
+ * is process-local and we may be one of a set of processes
+ * starting up. 100000 pushes us out of pid space on most
+ * 32-bit platforms, and has few interesting properties in
+ * base 2.
+ */
+ if (DB_GLOBAL(fid_serial) == 0) {
+ __os_id(env->dbenv, &pid, NULL);
+ DB_GLOBAL(fid_serial) = (u_int32_t)pid;
+ } else
+ DB_GLOBAL(fid_serial) += 100000;
+
+ for (p = (u_int8_t *)
+ &DB_GLOBAL(fid_serial), i = sizeof(u_int32_t); i > 0; --i)
+ *fidp++ = *p++;
+ }
+
+ return (0);
+}
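+
+/*
+ * Illustrative summary (not in the source) of the DB_FILE_ID_LEN (20-byte)
+ * buffer filled in above for a regular file:
+ *
+ *	bytes  0-3	low 32 bits of st_ino
+ *	bytes  4-7	low 32 bits of st_dev
+ *	bytes  8-11	unique ID	(unique_okay only)
+ *	bytes 12-15	serial counter	(unique_okay only)
+ *	bytes 16-19	zero
+ */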
diff --git a/src/os/os_flock.c b/src/os/os_flock.c
new file mode 100644
index 00000000..904d5efe
--- /dev/null
+++ b/src/os/os_flock.c
@@ -0,0 +1,64 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_fdlock --
+ * Acquire/release a lock on a byte in a file.
+ *
+ * PUBLIC: int __os_fdlock __P((ENV *, DB_FH *, off_t, int, int));
+ */
+int
+__os_fdlock(env, fhp, offset, acquire, nowait)
+ ENV *env;
+ DB_FH *fhp;
+ int acquire, nowait;
+ off_t offset;
+{
+#ifdef HAVE_FCNTL
+ DB_ENV *dbenv;
+ struct flock fl;
+ int ret, t_ret;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+
+ DB_ASSERT(env, F_ISSET(fhp, DB_FH_OPENED) && fhp->fd != -1);
+
+ if (dbenv != NULL && FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0138",
+ "fileops: flock %s %s offset %lu", "%s %s %lu"), fhp->name,
+ acquire ? DB_STR_P("acquire"): DB_STR_P("release"),
+ (u_long)offset);
+
+ fl.l_start = offset;
+ fl.l_len = 1;
+ fl.l_type = acquire ? F_WRLCK : F_UNLCK;
+ fl.l_whence = SEEK_SET;
+
+ RETRY_CHK_EINTR_ONLY(
+ (fcntl(fhp->fd, nowait ? F_SETLK : F_SETLKW, &fl)), ret);
+
+ if (ret == 0)
+ return (0);
+
+ if ((t_ret = __os_posix_err(ret)) != EACCES && t_ret != EAGAIN)
+ __db_syserr(env, ret, DB_STR("0139", "fcntl"));
+ return (t_ret);
+#else
+ COMPQUIET(fhp, NULL);
+ COMPQUIET(acquire, 0);
+ COMPQUIET(nowait, 0);
+ COMPQUIET(offset, 0);
+ __db_syserr(env, DB_OPNOTSUP, DB_STR("0140",
+ "advisory file locking unavailable"));
+ return (DB_OPNOTSUP);
+#endif
+}
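+
+/*
+ * Illustrative caller-side sketch (the handle and offset are
+ * hypothetical): acquire an exclusive lock on byte 0 without
+ * blocking, then release it.
+ *
+ *	if ((ret = __os_fdlock(env, fhp, 0, 1, 1)) != 0) {
+ *		if (ret == EACCES || ret == EAGAIN)
+ *			-- another process holds the lock; retry later
+ *	}
+ *	...
+ *	(void)__os_fdlock(env, fhp, 0, 0, 0);
+ */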
diff --git a/src/os/os_fsync.c b/src/os/os_fsync.c
new file mode 100644
index 00000000..4b757b2c
--- /dev/null
+++ b/src/os/os_fsync.c
@@ -0,0 +1,104 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+#ifdef HAVE_VXWORKS
+#include "ioLib.h"
+
+#define fsync(fd) __vx_fsync(fd)
+
+int
+__vx_fsync(fd)
+ int fd;
+{
+ int ret;
+
+ /*
+ * The results of ioctl are driver dependent. Some will return the
+ * number of bytes sync'ed. Only if it returns 'ERROR' should we
+ * flag it.
+ */
+ if ((ret = ioctl(fd, FIOSYNC, 0)) != ERROR)
+ return (0);
+ return (ret);
+}
+#endif
+
+#ifdef __hp3000s900
+#define fsync(fd) __mpe_fsync(fd)
+
+int
+__mpe_fsync(fd)
+ int fd;
+{
+ extern FCONTROL(short, short, void *);
+
+ FCONTROL(_MPE_FILENO(fd), 2, NULL); /* Flush the buffers */
+ FCONTROL(_MPE_FILENO(fd), 6, NULL); /* Write the EOF */
+ return (0);
+}
+#endif
+
+/*
+ * __os_fsync --
+ * Flush a file descriptor.
+ *
+ * PUBLIC: int __os_fsync __P((ENV *, DB_FH *));
+ */
+int
+__os_fsync(env, fhp)
+ ENV *env;
+ DB_FH *fhp;
+{
+ DB_ENV *dbenv;
+ int ret;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+
+ DB_ASSERT(env, F_ISSET(fhp, DB_FH_OPENED) && fhp->fd != -1);
+
+ /*
+ * Do nothing if the file descriptor has been marked as not requiring
+ * any sync to disk.
+ */
+ if (F_ISSET(fhp, DB_FH_NOSYNC))
+ return (0);
+
+ if (dbenv != NULL && FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0150", "fileops: flush %s", "%s"),
+ fhp->name);
+
+ if (DB_GLOBAL(j_fsync) != NULL)
+ ret = DB_GLOBAL(j_fsync)(fhp->fd);
+ else {
+#if defined(F_FULLFSYNC)
+ RETRY_CHK((fcntl(fhp->fd, F_FULLFSYNC, 0)), ret);
+ /*
+		 * On OS X, F_FULLFSYNC only works on HFS+, so we need to
+		 * fall back to regular fsync on other filesystems.
+ */
+ if (ret == ENOTSUP)
+ RETRY_CHK((fsync(fhp->fd)), ret);
+#elif defined(HAVE_QNX)
+ ret = __qnx_fsync(fhp);
+#elif defined(HAVE_FDATASYNC)
+ RETRY_CHK((fdatasync(fhp->fd)), ret);
+#else
+ RETRY_CHK((fsync(fhp->fd)), ret);
+#endif
+ }
+
+ if (ret != 0) {
+ __db_syserr(env, ret, DB_STR("0151", "fsync"));
+ ret = __os_posix_err(ret);
+ }
+ return (ret);
+}
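+
+/*
+ * Illustrative note: the flush primitive above is selected at compile
+ * time, with the run-time j_fsync replacement taking precedence:
+ *
+ *	j_fsync		application-supplied replacement, if any
+ *	F_FULLFSYNC	OS X; falls back to fsync when unsupported
+ *	__qnx_fsync	QNX region files
+ *	fdatasync	where available; may skip unneeded metadata
+ *	fsync		the POSIX default
+ */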
diff --git a/src/os/os_getenv.c b/src/os/os_getenv.c
new file mode 100644
index 00000000..05972112
--- /dev/null
+++ b/src/os/os_getenv.c
@@ -0,0 +1,58 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_getenv --
+ * Retrieve an environment variable.
+ *
+ * PUBLIC: int __os_getenv __P((ENV *, const char *, char **, size_t));
+ */
+int
+__os_getenv(env, name, bpp, buflen)
+ ENV *env;
+ const char *name;
+ char **bpp;
+ size_t buflen;
+{
+ /*
+ * If we have getenv, there's a value and the buffer is large enough:
+ * copy value into the pointer, return 0
+ * If we have getenv, there's a value and the buffer is too short:
+ * set pointer to NULL, return EINVAL
+ * If we have getenv and there's no value:
+ * set pointer to NULL, return 0
+ * If we don't have getenv:
+ * set pointer to NULL, return 0
+ */
+#ifdef HAVE_GETENV
+ char *p;
+
+ if ((p = getenv(name)) != NULL) {
+ if (strlen(p) < buflen) {
+ (void)strcpy(*bpp, p);
+ return (0);
+ }
+
+ *bpp = NULL;
+ __db_errx(env, DB_STR_A("0157",
+ "%s: buffer too small to hold environment variable %s",
+ "%s %s"), name, p);
+ return (EINVAL);
+ }
+#else
+ COMPQUIET(env, NULL);
+ COMPQUIET(name, NULL);
+ COMPQUIET(buflen, 0);
+#endif
+ *bpp = NULL;
+ return (0);
+}
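+
+/*
+ * Illustrative caller-side sketch of the contract described above
+ * (__os_tmpdir, later in this change, is a real caller following this
+ * pattern):
+ *
+ *	char *tdir, tdir_buf[DB_MAXPATHLEN];
+ *
+ *	tdir = tdir_buf;
+ *	if ((ret = __os_getenv(env,
+ *	    "TMPDIR", &tdir, sizeof(tdir_buf))) != 0)
+ *		return (ret);		-- value too long: EINVAL
+ *	if (tdir != NULL && tdir[0] != '\0')
+ *		-- set and copied into tdir_buf
+ */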
diff --git a/src/os/os_handle.c b/src/os/os_handle.c
new file mode 100644
index 00000000..8ae9dc7f
--- /dev/null
+++ b/src/os/os_handle.c
@@ -0,0 +1,243 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_openhandle --
+ * Open a file, using POSIX 1003.1 open flags.
+ *
+ * PUBLIC: int __os_openhandle
+ * PUBLIC: __P((ENV *, const char *, int, int, DB_FH **));
+ */
+int
+__os_openhandle(env, name, flags, mode, fhpp)
+ ENV *env;
+ const char *name;
+ int flags, mode;
+ DB_FH **fhpp;
+{
+ DB_FH *fhp;
+ u_int nrepeat, retries;
+ int fcntl_flags, ret;
+#ifdef HAVE_VXWORKS
+ int newflags;
+#endif
+ /*
+ * Allocate the file handle and copy the file name. We generally only
+ * use the name for verbose or error messages, but on systems where we
+ * can't unlink temporary files immediately, we use the name to unlink
+ * the temporary file when the file handle is closed.
+ *
+ * Lock the ENV handle and insert the new file handle on the list.
+ */
+ if ((ret = __os_calloc(env, 1, sizeof(DB_FH), &fhp)) != 0)
+ return (ret);
+ if ((ret = __os_strdup(env, name, &fhp->name)) != 0)
+ goto err;
+ if (env != NULL) {
+ MUTEX_LOCK(env, env->mtx_env);
+ TAILQ_INSERT_TAIL(&env->fdlist, fhp, q);
+ MUTEX_UNLOCK(env, env->mtx_env);
+ F_SET(fhp, DB_FH_ENVLINK);
+ }
+
+ /* If the application specified an interface, use it. */
+ if (DB_GLOBAL(j_open) != NULL) {
+ if ((fhp->fd = DB_GLOBAL(j_open)(name, flags, mode)) == -1) {
+ ret = __os_posix_err(__os_get_syserr());
+ goto err;
+ }
+ goto done;
+ }
+
+ retries = 0;
+ for (nrepeat = 1; nrepeat < 4; ++nrepeat) {
+ ret = 0;
+#ifdef HAVE_VXWORKS
+ /*
+ * VxWorks does not support O_CREAT on open, you have to use
+ * creat() instead. (It does not support O_EXCL or O_TRUNC
+ * either, even though they are defined "for future support".)
+ * We really want the POSIX behavior that if O_CREAT is set,
+ * we open if it exists, or create it if it doesn't exist.
+		 * If O_CREAT is specified, single thread and try to open the
+		 * file.  If that succeeds and O_EXCL is set, return EEXIST.
+		 * If it fails, call creat() and then end single threading.
+ */
+ if (LF_ISSET(O_CREAT)) {
+ DB_BEGIN_SINGLE_THREAD;
+ newflags = flags & ~(O_CREAT | O_EXCL);
+ if ((fhp->fd = open(name, newflags, mode)) != -1) {
+ /*
+ * We need to mark the file opened at this
+ * point so that if we get any error below
+ * we will properly close the fd we just
+ * opened on the error path.
+ */
+ F_SET(fhp, DB_FH_OPENED);
+ if (LF_ISSET(O_EXCL)) {
+ /*
+				 * If we get here, the caller wants an
+				 * O_EXCL create and the file exists.
+				 * Close and return EEXIST.
+ */
+ DB_END_SINGLE_THREAD;
+ ret = EEXIST;
+ goto err;
+ }
+ /*
+ * XXX
+ * Assume any error means non-existence.
+ * Unfortunately return values (even for
+ * non-existence) are driver specific so
+ * there is no single error we can use to
+ * verify we truly got the equivalent of
+ * ENOENT.
+ */
+ } else
+ fhp->fd = creat(name, newflags);
+ DB_END_SINGLE_THREAD;
+ } else
+ /* FALLTHROUGH */
+#endif
+#ifdef __VMS
+ /*
+ * !!!
+ * Open with full sharing on VMS.
+ *
+ * We use these flags because they are the ones set by the VMS
+ * CRTL mmap() call when it opens a file, and we have to be
+ * able to open files that mmap() has previously opened, e.g.,
+ * when we're joining already existing DB regions.
+ */
+ fhp->fd = open(name, flags, mode, "shr=get,put,upd,del,upi");
+#else
+ fhp->fd = open(name, flags, mode);
+#endif
+ if (fhp->fd != -1) {
+ ret = 0;
+ break;
+ }
+
+ switch (ret = __os_posix_err(__os_get_syserr())) {
+ case EMFILE:
+ case ENFILE:
+ case ENOSPC:
+ /*
+ * If it's a "temporary" error, we retry up to 3 times,
+ * waiting up to 12 seconds. While it's not a problem
+ * if we can't open a database, an inability to open a
+ * log file is cause for serious dismay.
+ */
+ __os_yield(env, nrepeat * 2, 0);
+ break;
+ case EAGAIN:
+ case EBUSY:
+ case EINTR:
+ /*
+ * If an EAGAIN, EBUSY or EINTR, retry immediately for
+ * DB_RETRY times.
+ */
+ if (++retries < DB_RETRY)
+ --nrepeat;
+ break;
+ default:
+ /* Open is silent on error. */
+ goto err;
+ }
+ }
+
+ if (ret == 0) {
+#if defined(HAVE_FCNTL_F_SETFD)
+ /* Deny file descriptor access to any child process. */
+ if ((fcntl_flags = fcntl(fhp->fd, F_GETFD)) == -1 ||
+ fcntl(fhp->fd, F_SETFD, fcntl_flags | FD_CLOEXEC) == -1) {
+ ret = __os_get_syserr();
+ __db_syserr(env, ret, DB_STR("0162",
+ "fcntl(F_SETFD)"));
+ ret = __os_posix_err(ret);
+ goto err;
+ }
+#else
+ COMPQUIET(fcntl_flags, 0);
+#endif
+
+done: F_SET(fhp, DB_FH_OPENED);
+ *fhpp = fhp;
+ return (0);
+ }
+
+err: (void)__os_closehandle(env, fhp);
+ return (ret);
+}
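+
+/*
+ * Illustrative note: for the "temporary" errors above, the open is
+ * attempted with nrepeat running 1..3 and the loop sleeps nrepeat * 2
+ * seconds after each failure, so the total wait is 2 + 4 + 6 = 12
+ * seconds, matching the comment in the EMFILE/ENFILE/ENOSPC case.
+ */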
+
+/*
+ * __os_closehandle --
+ * Close a file.
+ *
+ * PUBLIC: int __os_closehandle __P((ENV *, DB_FH *));
+ */
+int
+__os_closehandle(env, fhp)
+ ENV *env;
+ DB_FH *fhp;
+{
+ DB_ENV *dbenv;
+ int ret;
+
+ ret = 0;
+
+ /*
+ * If we linked the DB_FH handle into the ENV, it needs to be
+ * unlinked.
+ */
+ DB_ASSERT(env, env != NULL || !F_ISSET(fhp, DB_FH_ENVLINK));
+
+ if (env != NULL) {
+ dbenv = env->dbenv;
+ if (fhp->name != NULL && FLD_ISSET(
+ dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0163",
+ "fileops: close %s", "%s"), fhp->name);
+
+ if (F_ISSET(fhp, DB_FH_ENVLINK)) {
+ /*
+ * Lock the ENV handle and remove this file
+ * handle from the list.
+ */
+ MUTEX_LOCK(env, env->mtx_env);
+ TAILQ_REMOVE(&env->fdlist, fhp, q);
+ MUTEX_UNLOCK(env, env->mtx_env);
+ }
+ }
+
+ /* Discard any underlying system file reference. */
+ if (F_ISSET(fhp, DB_FH_OPENED)) {
+ if (DB_GLOBAL(j_close) != NULL)
+ ret = DB_GLOBAL(j_close)(fhp->fd);
+ else
+ RETRY_CHK((close(fhp->fd)), ret);
+ if (ret != 0) {
+ __db_syserr(env, ret, DB_STR("0164", "close"));
+ ret = __os_posix_err(ret);
+ }
+ }
+
+ /* Unlink the file if we haven't already done so. */
+ if (F_ISSET(fhp, DB_FH_UNLINK))
+ (void)__os_unlink(env, fhp->name, 0);
+
+ if (fhp->name != NULL)
+ __os_free(env, fhp->name);
+ __os_free(env, fhp);
+
+ return (ret);
+}
diff --git a/src/os/os_map.c b/src/os/os_map.c
new file mode 100644
index 00000000..0528f473
--- /dev/null
+++ b/src/os/os_map.c
@@ -0,0 +1,607 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+#ifdef HAVE_SYSTEM_INCLUDE_FILES
+#ifdef HAVE_MMAP
+#include <sys/mman.h>
+#endif
+
+#ifdef HAVE_SHMGET
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#endif
+#endif
+
+#ifdef HAVE_MMAP
+static int __os_map __P((ENV *, char *, DB_FH *, size_t, int, int, void **));
+#endif
+#ifdef HAVE_SHMGET
+static int __shm_mode __P((ENV *));
+#else
+static int __no_system_mem __P((ENV *));
+#endif
+
+/*
+ * __os_attach --
+ * Create/join a shared memory region.
+ *
+ * PUBLIC: int __os_attach __P((ENV *, REGINFO *, REGION *));
+ */
+int
+__os_attach(env, infop, rp)
+ ENV *env;
+ REGINFO *infop;
+ REGION *rp;
+{
+ DB_ENV *dbenv;
+ int create_ok, ret;
+
+ /*
+ * We pass a DB_ENV handle to the user's replacement map function,
+ * so there must be a valid handle.
+ */
+ DB_ASSERT(env, env != NULL && env->dbenv != NULL);
+ dbenv = env->dbenv;
+
+ if (DB_GLOBAL(j_region_map) != NULL) {
+ /*
+ * We have to find out if the region is being created. Ask
+ * the underlying map function, and use the REGINFO structure
+ * to pass that information back to our caller.
+ */
+ create_ok = F_ISSET(infop, REGION_CREATE) ? 1 : 0;
+ ret = DB_GLOBAL(j_region_map)
+ (dbenv, infop->name, rp->max, &create_ok, &infop->addr);
+ if (create_ok)
+ F_SET(infop, REGION_CREATE);
+ else
+ F_CLR(infop, REGION_CREATE);
+ return (ret);
+ }
+
+ if (F_ISSET(env, ENV_SYSTEM_MEM)) {
+ /*
+ * If the region is in system memory on UNIX, we use shmget(2).
+ *
+ * !!!
+ * There exist spinlocks that don't work in shmget memory, e.g.,
+ * the HP/UX msemaphore interface. If we don't have locks that
+ * will work in shmget memory, we better be private and not be
+ * threaded. If we reach this point, we know we're public, so
+ * it's an error.
+ */
+#if defined(HAVE_MUTEX_HPPA_MSEM_INIT)
+ __db_errx(env, DB_STR("0114",
+ "architecture does not support locks inside system shared memory"));
+ return (EINVAL);
+#endif
+#if defined(HAVE_SHMGET)
+ {
+ key_t segid;
+ int id, mode;
+
+ /*
+ * We could potentially create based on REGION_CREATE_OK, but
+ * that's dangerous -- we might get crammed in sideways if
+ * some of the expected regions exist but others do not. Also,
+ * if the requested size differs from an existing region's
+ * actual size, then all sorts of nasty things can happen.
+ * Basing create solely on REGION_CREATE is much safer -- a
+ * recovery will get us straightened out.
+ */
+ if (F_ISSET(infop, REGION_CREATE)) {
+ /*
+ * The application must give us a base System V IPC key
+ * value. Adjust that value based on the region's ID,
+ * and correct so the user's original value appears in
+ * the ipcs output.
+ */
+ if (dbenv->shm_key == INVALID_REGION_SEGID) {
+ __db_errx(env, DB_STR("0115",
+ "no base system shared memory ID specified"));
+ return (EINVAL);
+ }
+
+ /*
+ * !!!
+ * The BDB API takes a "long" as the base segment ID,
+ * then adds an unsigned 32-bit value and stores it
+ * in a key_t. Wrong, admittedly, but not worth an
+ * API change to fix.
+ */
+ segid = (key_t)
+ ((u_long)dbenv->shm_key + (infop->id - 1));
+
+ /*
+ * If map to an existing region, assume the application
+ * crashed and we're restarting. Delete the old region
+ * and re-try. If that fails, return an error, the
+ * application will have to select a different segment
+ * ID or clean up some other way.
+ */
+ if ((id = shmget(segid, 0, 0)) != -1) {
+ (void)shmctl(id, IPC_RMID, NULL);
+ if ((id = shmget(segid, 0, 0)) != -1) {
+ __db_errx(env, DB_STR_A("0116",
+ "shmget: key: %ld: shared system memory region already exists",
+ "%ld"), (long)segid);
+ return (EAGAIN);
+ }
+ }
+
+ /*
+ * Map the DbEnv::open method file mode permissions to
+ * shmget call permissions.
+ */
+ mode = IPC_CREAT | __shm_mode(env);
+ if ((id = shmget(segid, rp->max, mode)) == -1) {
+ ret = __os_get_syserr();
+ __db_syserr(env, ret, DB_STR_A("0117",
+ "shmget: key: %ld: unable to create shared system memory region",
+ "%ld"), (long)segid);
+ return (__os_posix_err(ret));
+ }
+ rp->size = rp->max;
+ rp->segid = id;
+ } else
+ id = rp->segid;
+
+ if ((infop->addr = shmat(id, NULL, 0)) == (void *)-1) {
+ infop->addr = NULL;
+ ret = __os_get_syserr();
+ __db_syserr(env, ret, DB_STR_A("0118",
+ "shmat: id %d: unable to attach to shared system memory region",
+ "%d"), id);
+ return (__os_posix_err(ret));
+ }
+
+ /* Optionally lock the memory down. */
+ if (F_ISSET(env, ENV_LOCKDOWN)) {
+#ifdef HAVE_SHMCTL_SHM_LOCK
+ ret = shmctl(
+ id, SHM_LOCK, NULL) == 0 ? 0 : __os_get_syserr();
+#else
+ ret = DB_OPNOTSUP;
+#endif
+ if (ret != 0) {
+ __db_syserr(env, ret, DB_STR_A("0119",
+ "shmctl/SHM_LOCK: id %d: unable to lock down shared memory region",
+ "%d"), id);
+ return (__os_posix_err(ret));
+ }
+ }
+
+ return (0);
+ }
+#else
+ return (__no_system_mem(env));
+#endif
+ }
+
+#ifdef HAVE_MMAP
+ {
+ infop->fhp = NULL;
+
+ /*
+ * Try to open/create the shared region file. We DO NOT need to ensure
+ * that multiple threads/processes attempting to simultaneously create
+ * the region are properly ordered, our caller has already taken care
+ * of that.
+ */
+ if ((ret = __os_open(env, infop->name, 0,
+ DB_OSO_REGION |
+ (F_ISSET(infop, REGION_CREATE_OK) ? DB_OSO_CREATE : 0),
+ env->db_mode, &infop->fhp)) != 0)
+ __db_err(env, ret, "%s", infop->name);
+
+ /*
+ * If we created the file, grow it before mapping it in. We really want
+ * to avoid touching the buffer cache after mmap() is called, doing
+ * anything else confuses the hell out of systems without merged
+ * VM/buffer cache systems, or, more to the point, *badly* merged
+ * VM/buffer cache systems.
+ */
+ if (rp->max < rp->size)
+ rp->max = rp->size;
+ if (ret == 0 && F_ISSET(infop, REGION_CREATE)) {
+ if (F_ISSET(dbenv, DB_ENV_REGION_INIT))
+ ret = __db_file_write(env, infop->fhp,
+ rp->size / MEGABYTE, rp->size % MEGABYTE, 0x00);
+ else
+ ret = __db_file_extend(env, infop->fhp, rp->size);
+ }
+
+ /* Map the file in. */
+ if (ret == 0)
+ ret = __os_map(env,
+ infop->name, infop->fhp, rp->max, 1, 0, &infop->addr);
+
+ if (ret != 0 && infop->fhp != NULL) {
+ (void)__os_closehandle(env, infop->fhp);
+ infop->fhp = NULL;
+ }
+
+ return (ret);
+ }
+#else
+ COMPQUIET(infop, NULL);
+ COMPQUIET(rp, NULL);
+ __db_errx(env, DB_STR("0120",
+ "architecture lacks mmap(2), shared environments not possible"));
+ return (DB_OPNOTSUP);
+#endif
+}
+
+/*
+ * __os_detach --
+ * Detach from a shared memory region.
+ *
+ * PUBLIC: int __os_detach __P((ENV *, REGINFO *, int));
+ */
+int
+__os_detach(env, infop, destroy)
+ ENV *env;
+ REGINFO *infop;
+ int destroy;
+{
+ DB_ENV *dbenv;
+ REGION *rp;
+ int ret;
+
+ /*
+ * We pass a DB_ENV handle to the user's replacement unmap function,
+ * so there must be a valid handle.
+ */
+ DB_ASSERT(env, env != NULL && env->dbenv != NULL);
+ dbenv = env->dbenv;
+
+ rp = infop->rp;
+
+ /* If the user replaced the unmap call, call through their interface. */
+ if (DB_GLOBAL(j_region_unmap) != NULL)
+ return (DB_GLOBAL(j_region_unmap)(dbenv, infop->addr));
+
+ if (F_ISSET(env, ENV_SYSTEM_MEM)) {
+#ifdef HAVE_SHMGET
+ int segid;
+
+ /*
+ * We may be about to remove the memory referenced by rp,
+ * save the segment ID, and (optionally) wipe the original.
+ */
+ segid = rp->segid;
+ if (destroy)
+ rp->segid = INVALID_REGION_SEGID;
+
+ if (shmdt(infop->addr) != 0) {
+ ret = __os_get_syserr();
+ __db_syserr(env, ret, DB_STR("0121", "shmdt"));
+ return (__os_posix_err(ret));
+ }
+
+ if (destroy && shmctl(segid, IPC_RMID,
+ NULL) != 0 && (ret = __os_get_syserr()) != EINVAL) {
+ __db_syserr(env, ret, DB_STR_A("0122",
+ "shmctl: id %d: unable to delete system shared memory region",
+ "%d"), segid);
+ return (__os_posix_err(ret));
+ }
+
+ return (0);
+#else
+ return (__no_system_mem(env));
+#endif
+ }
+
+#ifdef HAVE_MMAP
+#ifdef HAVE_MUNLOCK
+ if (F_ISSET(env, ENV_LOCKDOWN))
+ (void)munlock(infop->addr, rp->max);
+#endif
+ if (infop->fhp != NULL) {
+ ret = __os_closehandle(env, infop->fhp);
+ infop->fhp = NULL;
+ if (ret != 0)
+ return (ret);
+ }
+
+ if (munmap(infop->addr, rp->max) != 0) {
+ ret = __os_get_syserr();
+ __db_syserr(env, ret, DB_STR("0123", "munmap"));
+ return (__os_posix_err(ret));
+ }
+
+ if (destroy && (ret = __os_unlink(env, infop->name, 1)) != 0)
+ return (ret);
+
+ return (0);
+#else
+ COMPQUIET(destroy, 0);
+ COMPQUIET(ret, 0);
+ return (EINVAL);
+#endif
+}
+
+/*
+ * __os_mapfile --
+ * Map in a shared memory file.
+ *
+ * PUBLIC: int __os_mapfile __P((ENV *, char *, DB_FH *, size_t, int, void **));
+ */
+int
+__os_mapfile(env, path, fhp, len, is_rdonly, addrp)
+ ENV *env;
+ char *path;
+ DB_FH *fhp;
+ int is_rdonly;
+ size_t len;
+ void **addrp;
+{
+#if defined(HAVE_MMAP) && !defined(HAVE_QNX)
+ DB_ENV *dbenv;
+
+ /* If the user replaced the map call, call through their interface. */
+ if (DB_GLOBAL(j_file_map) != NULL) {
+ /*
+ * We pass a DB_ENV handle to the user's replacement map
+ * function, so there must be a valid handle.
+ */
+ DB_ASSERT(env, env != NULL && env->dbenv != NULL);
+ dbenv = env->dbenv;
+
+ return (
+ DB_GLOBAL(j_file_map)(dbenv, path, len, is_rdonly, addrp));
+ }
+
+ return (__os_map(env, path, fhp, len, 0, is_rdonly, addrp));
+#else
+ COMPQUIET(env, NULL);
+ COMPQUIET(path, NULL);
+ COMPQUIET(fhp, NULL);
+ COMPQUIET(is_rdonly, 0);
+ COMPQUIET(len, 0);
+ COMPQUIET(addrp, NULL);
+ return (DB_OPNOTSUP);
+#endif
+}
+
+/*
+ * __os_unmapfile --
+ * Unmap the shared memory file.
+ *
+ * PUBLIC: int __os_unmapfile __P((ENV *, void *, size_t));
+ */
+int
+__os_unmapfile(env, addr, len)
+ ENV *env;
+ void *addr;
+ size_t len;
+{
+ DB_ENV *dbenv;
+ int ret;
+
+ /*
+ * We pass a DB_ENV handle to the user's replacement unmap function,
+ * so there must be a valid handle.
+ */
+ DB_ASSERT(env, env != NULL && env->dbenv != NULL);
+ dbenv = env->dbenv;
+
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR("0124", "fileops: munmap"));
+
+ /* If the user replaced the map call, call through their interface. */
+ if (DB_GLOBAL(j_file_unmap) != NULL)
+ return (DB_GLOBAL(j_file_unmap)(dbenv, addr));
+
+#ifdef HAVE_MMAP
+#ifdef HAVE_MUNLOCK
+ if (F_ISSET(env, ENV_LOCKDOWN))
+ RETRY_CHK((munlock(addr, len)), ret);
+ /*
+ * !!!
+ * The return value is ignored.
+ */
+#else
+ COMPQUIET(env, NULL);
+#endif
+ RETRY_CHK((munmap(addr, len)), ret);
+ ret = __os_posix_err(ret);
+#else
+ COMPQUIET(env, NULL);
+ ret = EINVAL;
+#endif
+ return (ret);
+}
+
+#ifdef HAVE_MMAP
+/*
+ * __os_map --
+ * Call the mmap(2) function.
+ */
+static int
+__os_map(env, path, fhp, len, is_region, is_rdonly, addrp)
+ ENV *env;
+ char *path;
+ DB_FH *fhp;
+ int is_region, is_rdonly;
+ size_t len;
+ void **addrp;
+{
+ DB_ENV *dbenv;
+ int flags, prot, ret;
+ void *p;
+
+ /*
+ * We pass a DB_ENV handle to the user's replacement map function,
+ * so there must be a valid handle.
+ */
+ DB_ASSERT(env, env != NULL && env->dbenv != NULL);
+ dbenv = env->dbenv;
+
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0125", "fileops: mmap %s",
+ "%s"), path);
+
+ DB_ASSERT(env, F_ISSET(fhp, DB_FH_OPENED) && fhp->fd != -1);
+
+ /*
+ * If it's read-only, it's private, and if it's not, it's shared.
+ * Don't bother with an additional parameter.
+ */
+ flags = is_rdonly ? MAP_PRIVATE : MAP_SHARED;
+
+#ifdef MAP_FILE
+ /*
+ * Historically, MAP_FILE was required for mapping regular files,
+ * even though it was the default. Some systems have it, some
+ * don't, some that have it set it to 0.
+ */
+ flags |= MAP_FILE;
+#endif
+
+ /*
+ * I know of no systems that implement the flag to tell the system
+ * that the region contains semaphores, but it's not an unreasonable
+ * thing to do, and has been part of the design since forever. I
+	 * don't think anyone will object, but we don't set it for read-only
+	 * files; it doesn't make sense.
+ */
+#ifdef MAP_HASSEMAPHORE
+ if (is_region && !is_rdonly)
+ flags |= MAP_HASSEMAPHORE;
+#else
+ COMPQUIET(is_region, 0);
+#endif
+
+ /*
+ * FreeBSD:
+ * Causes data dirtied via this VM map to be flushed to physical media
+	 * only when necessary (usually by the pager) rather than gratuitously.
+ * Typically this prevents the update daemons from flushing pages
+ * dirtied through such maps and thus allows efficient sharing of
+ * memory across unassociated processes using a file-backed shared
+ * memory map.
+ */
+#ifdef MAP_NOSYNC
+ flags |= MAP_NOSYNC;
+#endif
+
+ prot = PROT_READ | (is_rdonly ? 0 : PROT_WRITE);
+
+ /*
+ * XXX
+ * Work around a bug in the VMS V7.1 mmap() implementation. To map
+ * a file into memory on VMS it needs to be opened in a certain way,
+	 * a file into memory on VMS, it originally has to be opened in a
+	 * certain way.  To get the file opened in that certain way, the VMS
+ * doesn't flush any caches out to disk before closing. The problem
+ * this causes us is that when the memory cache doesn't get written
+ * out, the file isn't big enough to match the memory chunk and the
+ * mmap() call fails. This call to fsync() fixes the problem. DEC
+ * thinks this isn't a bug because of language in XPG5 discussing user
+ * responsibility for on-disk and in-memory synchronization.
+ */
+#ifdef VMS
+ if (__os_fsync(env, fhp) == -1)
+ return (__os_posix_err(__os_get_syserr()));
+#endif
+
+ /* MAP_FAILED was not defined in early mmap implementations. */
+#ifndef MAP_FAILED
+#define MAP_FAILED -1
+#endif
+ if ((p = mmap(NULL,
+ len, prot, flags, fhp->fd, (off_t)0)) == (void *)MAP_FAILED) {
+ ret = __os_get_syserr();
+ __db_syserr(env, ret, DB_STR("0126", "mmap"));
+ return (__os_posix_err(ret));
+ }
+
+ /*
+ * If it's a region, we want to make sure that the memory isn't paged.
+ * For example, Solaris will page large mpools because it thinks that
+ * I/O buffer memory is more important than we are. The mlock system
+ * call may or may not succeed (mlock is restricted to the super-user
+ * on some systems). Currently, the only other use of mmap in DB is
+ * to map read-only databases -- we don't want them paged, either, so
+ * the call isn't conditional.
+ */
+ if (F_ISSET(env, ENV_LOCKDOWN)) {
+#ifdef HAVE_MLOCK
+ ret = mlock(p, len) == 0 ? 0 : __os_get_syserr();
+#else
+ ret = DB_OPNOTSUP;
+#endif
+ if (ret != 0) {
+ __db_syserr(env, ret, DB_STR("0127", "mlock"));
+ return (__os_posix_err(ret));
+ }
+ }
+
+ *addrp = p;
+ return (0);
+}
+#endif
+
+#ifdef HAVE_SHMGET
+#ifndef SHM_R
+#define SHM_R 0400
+#endif
+#ifndef SHM_W
+#define SHM_W 0200
+#endif
+
+/*
+ * __shm_mode --
+ * Map the DbEnv::open method file mode permissions to shmget call
+ * permissions.
+ */
+static int
+__shm_mode(env)
+ ENV *env;
+{
+ int mode;
+
+ /* Default to r/w owner, r/w group. */
+ if (env->db_mode == 0)
+ return (SHM_R | SHM_W | SHM_R >> 3 | SHM_W >> 3);
+
+ mode = 0;
+ if (env->db_mode & S_IRUSR)
+ mode |= SHM_R;
+ if (env->db_mode & S_IWUSR)
+ mode |= SHM_W;
+ if (env->db_mode & S_IRGRP)
+ mode |= SHM_R >> 3;
+ if (env->db_mode & S_IWGRP)
+ mode |= SHM_W >> 3;
+ if (env->db_mode & S_IROTH)
+ mode |= SHM_R >> 6;
+ if (env->db_mode & S_IWOTH)
+ mode |= SHM_W >> 6;
+ return (mode);
+}
+#else
+/*
+ * __no_system_mem --
+ * No system memory environments error message.
+ */
+static int
+__no_system_mem(env)
+ ENV *env;
+{
+ __db_errx(env, DB_STR("0128",
+ "architecture doesn't support environments in system memory"));
+ return (DB_OPNOTSUP);
+}
+#endif /* HAVE_SHMGET */
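+
+/*
+ * Illustrative example: __shm_mode shifts the owner bits SHM_R/SHM_W
+ * into the group and other positions, so env->db_mode == 0640 maps to
+ * SHM_R | SHM_W | (SHM_R >> 3), which is 0640 in IPC terms as well.
+ */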
diff --git a/src/os/os_mkdir.c b/src/os/os_mkdir.c
new file mode 100644
index 00000000..800d445c
--- /dev/null
+++ b/src/os/os_mkdir.c
@@ -0,0 +1,52 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_mkdir --
+ * Create a directory.
+ *
+ * PUBLIC: int __os_mkdir __P((ENV *, const char *, int));
+ */
+int
+__os_mkdir(env, name, mode)
+ ENV *env;
+ const char *name;
+ int mode;
+{
+ DB_ENV *dbenv;
+ int ret;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+ if (dbenv != NULL &&
+ FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0129", "fileops: mkdir %s",
+ "%s"), name);
+
+ /* Make the directory, with paranoid permissions. */
+#if defined(HAVE_VXWORKS)
+ RETRY_CHK((mkdir(CHAR_STAR_CAST name)), ret);
+#else
+ RETRY_CHK((mkdir(name, DB_MODE_700)), ret);
+#endif
+ if (ret != 0)
+ return (__os_posix_err(ret));
+
+ /* Set the absolute permissions, if specified. */
+#if !defined(HAVE_VXWORKS)
+ if (mode != 0) {
+ RETRY_CHK((chmod(name, mode)), ret);
+ if (ret != 0)
+ ret = __os_posix_err(ret);
+ }
+#endif
+ return (ret);
+}
diff --git a/src/os/os_open.c b/src/os/os_open.c
new file mode 100644
index 00000000..5090c8e1
--- /dev/null
+++ b/src/os/os_open.c
@@ -0,0 +1,162 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_open --
+ * Open a file descriptor (including page size and log size information).
+ *
+ * PUBLIC: int __os_open __P((ENV *,
+ * PUBLIC: const char *, u_int32_t, u_int32_t, int, DB_FH **));
+ */
+int
+__os_open(env, name, page_size, flags, mode, fhpp)
+ ENV *env;
+ const char *name;
+ u_int32_t page_size, flags;
+ int mode;
+ DB_FH **fhpp;
+{
+ DB_ENV *dbenv;
+ DB_FH *fhp;
+ int oflags, ret;
+
+ COMPQUIET(page_size, 0);
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+ *fhpp = NULL;
+ oflags = 0;
+
+ if (dbenv != NULL &&
+ FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0152",
+ "fileops: open %s", "%s"), name);
+
+#undef OKFLAGS
+#define OKFLAGS \
+ (DB_OSO_ABSMODE | DB_OSO_CREATE | DB_OSO_DIRECT | DB_OSO_DSYNC |\
+ DB_OSO_EXCL | DB_OSO_RDONLY | DB_OSO_REGION | DB_OSO_SEQ | \
+ DB_OSO_TEMP | DB_OSO_TRUNC)
+ if ((ret = __db_fchk(env, "__os_open", flags, OKFLAGS)) != 0)
+ return (ret);
+
+#if defined(O_BINARY)
+ /*
+ * If there's a binary-mode open flag, set it, we never want any
+ * kind of translation. Some systems do translations by default,
+ * e.g., with Cygwin, the default mode for an open() is set by the
+ * mode of the mount that underlies the file.
+ */
+ oflags |= O_BINARY;
+#endif
+
+ /*
+ * DB requires the POSIX 1003.1 semantic that two files opened at the
+ * same time with DB_OSO_CREATE/O_CREAT and DB_OSO_EXCL/O_EXCL flags
+ * set return an EEXIST failure in at least one.
+ */
+ if (LF_ISSET(DB_OSO_CREATE))
+ oflags |= O_CREAT;
+
+ if (LF_ISSET(DB_OSO_EXCL))
+ oflags |= O_EXCL;
+
+#ifdef HAVE_O_DIRECT
+ if (LF_ISSET(DB_OSO_DIRECT))
+ oflags |= O_DIRECT;
+#endif
+#ifdef O_DSYNC
+ if (LF_ISSET(DB_OSO_DSYNC))
+ oflags |= O_DSYNC;
+#endif
+
+ if (LF_ISSET(DB_OSO_RDONLY))
+ oflags |= O_RDONLY;
+ else
+ oflags |= O_RDWR;
+
+ if (LF_ISSET(DB_OSO_TRUNC))
+ oflags |= O_TRUNC;
+
+ /*
+ * Undocumented feature: allow applications to create intermediate
+ * directories whenever a file is opened.
+ */
+ if (dbenv != NULL &&
+ env->dir_mode != 0 && LF_ISSET(DB_OSO_CREATE) &&
+ (ret = __db_mkpath(env, name)) != 0)
+ return (ret);
+
+ /* Open the file. */
+#ifdef HAVE_QNX
+ if (LF_ISSET(DB_OSO_REGION))
+ ret = __os_qnx_region_open(env, name, oflags, mode, &fhp);
+ else
+#endif
+ ret = __os_openhandle(env, name, oflags, mode, &fhp);
+ if (ret != 0)
+ return (ret);
+
+ if (LF_ISSET(DB_OSO_REGION))
+ F_SET(fhp, DB_FH_REGION);
+#ifdef HAVE_FCHMOD
+ /*
+ * If the code using Berkeley DB is a library, that code may not be able
+ * to control the application's umask value. Allow applications to set
+ * absolute file modes. We can't fix the race between file creation and
+ * the fchmod call -- we can't modify the process' umask here since the
+ * process may be multi-threaded and the umask value is per-process, not
+ * per-thread.
+ */
+ if (LF_ISSET(DB_OSO_CREATE) && LF_ISSET(DB_OSO_ABSMODE))
+ (void)fchmod(fhp->fd, mode);
+#endif
+
+#ifdef O_DSYNC
+ /*
+ * If we can configure the file descriptor to flush on write, the
+ * file descriptor does not need to be explicitly sync'd.
+ */
+ if (LF_ISSET(DB_OSO_DSYNC))
+ F_SET(fhp, DB_FH_NOSYNC);
+#endif
+
+#if defined(HAVE_DIRECTIO) && defined(DIRECTIO_ON)
+ /*
+ * The Solaris C library includes directio, but you have to set special
+ * compile flags to #define DIRECTIO_ON. Require both in order to call
+ * directio.
+ */
+ if (LF_ISSET(DB_OSO_DIRECT))
+ (void)directio(fhp->fd, DIRECTIO_ON);
+#endif
+
+ /*
+ * Delete any temporary file.
+ *
+ * !!!
+ * There's a race here, where we've created a file and we crash before
+	 * we can unlink it.  Temporary files aren't common in DB; regardless,
+ * it's not a security problem because the file is empty. There's no
+ * reasonable way to avoid the race (playing signal games isn't worth
+ * the portability nightmare), so we just live with it.
+ */
+ if (LF_ISSET(DB_OSO_TEMP)) {
+#if defined(HAVE_UNLINK_WITH_OPEN_FAILURE) || defined(CONFIG_TEST)
+ F_SET(fhp, DB_FH_UNLINK);
+#else
+ (void)__os_unlink(env, name, 0);
+#endif
+ }
+
+ *fhpp = fhp;
+ return (0);
+}
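+
+/*
+ * Illustrative summary of the flag mapping above:
+ *
+ *	DB_OSO_CREATE -> O_CREAT	DB_OSO_EXCL   -> O_EXCL
+ *	DB_OSO_TRUNC  -> O_TRUNC	DB_OSO_DSYNC  -> O_DSYNC
+ *	DB_OSO_RDONLY -> O_RDONLY	(otherwise O_RDWR)
+ *	DB_OSO_DIRECT -> O_DIRECT or directio(), where available
+ */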
diff --git a/src/os/os_path.c b/src/os/os_path.c
new file mode 100644
index 00000000..478fdf45
--- /dev/null
+++ b/src/os/os_path.c
@@ -0,0 +1,27 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_concat_path --
+ *	Concatenate two elements of a path.
+ * PUBLIC: int __os_concat_path __P((char *,
+ * PUBLIC:	size_t, const char *, const char *));
+ */
+int
+__os_concat_path(dest, destsize, path, file)
+ char *dest;
+ size_t destsize;
+ const char *path, *file;
+{
+ if ((size_t)snprintf(dest, destsize,
+ "%s%c%s", path, PATH_SEPARATOR[0], file) >= destsize)
+ return (EINVAL);
+ return (0);
+}
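+
+/*
+ * Illustrative example: with a UNIX-style PATH_SEPARATOR, the call
+ *
+ *	__os_concat_path(buf, sizeof(buf), "/var/tmp", "envdir")
+ *
+ * writes "/var/tmp/envdir" into buf; if the result does not fit in
+ * destsize, EINVAL is returned rather than a truncated path.
+ */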
diff --git a/src/os/os_pid.c b/src/os/os_pid.c
new file mode 100644
index 00000000..b1b94d60
--- /dev/null
+++ b/src/os/os_pid.c
@@ -0,0 +1,63 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_id --
+ * Return the current process ID.
+ *
+ * PUBLIC: void __os_id __P((DB_ENV *, pid_t *, db_threadid_t*));
+ */
+void
+__os_id(dbenv, pidp, tidp)
+ DB_ENV *dbenv;
+ pid_t *pidp;
+ db_threadid_t *tidp;
+{
+ /*
+ * We can't depend on dbenv not being NULL, this routine is called
+ * from places where there's no DB_ENV handle.
+ *
+ * We cache the pid in the ENV handle, getting the process ID is a
+ * fairly slow call on lots of systems.
+ */
+ if (pidp != NULL) {
+ if (dbenv == NULL) {
+#if defined(HAVE_VXWORKS)
+ *pidp = taskIdSelf();
+#else
+ *pidp = getpid();
+#endif
+ } else
+ *pidp = dbenv->env->pid_cache;
+ }
+
+	/*
+	 * When building on MinGW, we define both HAVE_PTHREAD_SELF and
+	 * DB_WIN32, and we use pthreads instead of the Windows threads
+	 * implementation.  So here, we need to check the thread
+	 * implementation before checking the platform.
+	 */
+ if (tidp != NULL) {
+#if defined(HAVE_PTHREAD_SELF)
+ *tidp = pthread_self();
+#elif defined(HAVE_MUTEX_UI_THREADS)
+ *tidp = thr_self();
+#elif defined(DB_WIN32)
+ *tidp = GetCurrentThreadId();
+#else
+ /*
+ * Default to just getpid.
+ */
+ DB_THREADID_INIT(*tidp);
+#endif
+ }
+}
diff --git a/src/os/os_rename.c b/src/os/os_rename.c
new file mode 100644
index 00000000..63aac7bb
--- /dev/null
+++ b/src/os/os_rename.c
@@ -0,0 +1,53 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_rename --
+ * Rename a file.
+ *
+ * PUBLIC: int __os_rename __P((ENV *,
+ * PUBLIC: const char *, const char *, u_int32_t));
+ */
+int
+__os_rename(env, oldname, newname, silent)
+ ENV *env;
+ const char *oldname, *newname;
+ u_int32_t silent;
+{
+ DB_ENV *dbenv;
+ int ret;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+ if (dbenv != NULL &&
+ FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0168", "fileops: rename %s to %s",
+ "%s %s"), oldname, newname);
+
+ LAST_PANIC_CHECK_BEFORE_IO(env);
+
+ if (DB_GLOBAL(j_rename) != NULL)
+ ret = DB_GLOBAL(j_rename)(oldname, newname);
+ else
+ RETRY_CHK((rename(oldname, newname)), ret);
+
+ /*
+	 * If "silent" is set, the error is expected and we should not output
+ * an error message.
+ */
+ if (ret != 0) {
+ if (!silent)
+ __db_syserr(env, ret, DB_STR_A("0169",
+ "rename %s %s", "%s %s"), oldname, newname);
+ ret = __os_posix_err(ret);
+ }
+ return (ret);
+}
diff --git a/src/os/os_root.c b/src/os/os_root.c
new file mode 100644
index 00000000..77e7a72c
--- /dev/null
+++ b/src/os/os_root.c
@@ -0,0 +1,27 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_isroot --
+ * Return if user has special permissions.
+ *
+ * PUBLIC: int __os_isroot __P((void));
+ */
+int
+__os_isroot()
+{
+#ifdef HAVE_GETUID
+ return (getuid() == 0);
+#else
+ return (0);
+#endif
+}
diff --git a/src/os/os_rpath.c b/src/os/os_rpath.c
new file mode 100644
index 00000000..16f3e54c
--- /dev/null
+++ b/src/os/os_rpath.c
@@ -0,0 +1,36 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __db_rpath --
+ * Return the last path separator in the path or NULL if none found.
+ *
+ * PUBLIC: char *__db_rpath __P((const char *));
+ */
+char *
+__db_rpath(path)
+ const char *path;
+{
+ const char *s, *last;
+
+ s = path;
+ last = NULL;
+ if (PATH_SEPARATOR[1] != '\0') {
+ for (; s[0] != '\0'; ++s)
+ if (strchr(PATH_SEPARATOR, s[0]) != NULL)
+ last = s;
+ } else
+ for (; s[0] != '\0'; ++s)
+ if (s[0] == PATH_SEPARATOR[0])
+ last = s;
+ return ((char *)last);
+}
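+
+/*
+ * Illustrative example: where PATH_SEPARATOR is "/",
+ * __db_rpath("/a/b/c") returns a pointer to the "/" before "c" and
+ * __db_rpath("abc") returns NULL.  The strchr loop exists because
+ * some builds (e.g., Windows) define several separator characters.
+ */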
diff --git a/src/os/os_rw.c b/src/os/os_rw.c
new file mode 100644
index 00000000..c0967514
--- /dev/null
+++ b/src/os/os_rw.c
@@ -0,0 +1,291 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_io --
+ * Do an I/O.
+ *
+ * PUBLIC: int __os_io __P((ENV *, int, DB_FH *, db_pgno_t,
+ * PUBLIC: u_int32_t, u_int32_t, u_int32_t, u_int8_t *, size_t *));
+ */
+int
+__os_io(env, op, fhp, pgno, pgsize, relative, io_len, buf, niop)
+ ENV *env;
+ int op;
+ DB_FH *fhp;
+ db_pgno_t pgno;
+ u_int32_t pgsize, relative, io_len;
+ u_int8_t *buf;
+ size_t *niop;
+{
+#if defined(HAVE_PREAD) && defined(HAVE_PWRITE)
+ DB_ENV *dbenv;
+ off_t offset;
+ ssize_t nio;
+#endif
+ int ret;
+
+ /*
+ * Check for illegal usage.
+ *
+ * This routine is used in one of two ways: reading bytes from an
+ * absolute offset and reading a specific database page. All of
+ * our absolute offsets are known to fit into a u_int32_t, while
+ * our database pages might be at offsets larger than a u_int32_t.
+ */
+ DB_ASSERT(env, F_ISSET(fhp, DB_FH_OPENED) && fhp->fd != -1);
+ DB_ASSERT(env, (pgno == 0 && pgsize == 0) || relative == 0);
+
+#if defined(HAVE_PREAD) && defined(HAVE_PWRITE)
+ dbenv = env == NULL ? NULL : env->dbenv;
+
+ if ((offset = relative) == 0)
+ offset = (off_t)pgno * pgsize;
+ switch (op) {
+ case DB_IO_READ:
+ if (DB_GLOBAL(j_read) != NULL)
+ goto slow;
+#if defined(HAVE_STATISTICS)
+ ++fhp->read_count;
+#endif
+ if (dbenv != NULL &&
+ FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0130",
+ "fileops: read %s: %lu bytes at offset %lu",
+ "%s %lu %lu"), fhp->name, (u_long)io_len,
+ (u_long)offset);
+
+ LAST_PANIC_CHECK_BEFORE_IO(env);
+ nio = DB_GLOBAL(j_pread) != NULL ?
+ DB_GLOBAL(j_pread)(fhp->fd, buf, io_len, offset) :
+ pread(fhp->fd, buf, io_len, offset);
+ break;
+ case DB_IO_WRITE:
+ if (DB_GLOBAL(j_write) != NULL)
+ goto slow;
+#ifdef HAVE_FILESYSTEM_NOTZERO
+ if (__os_fs_notzero())
+ goto slow;
+#endif
+#if defined(HAVE_STATISTICS)
+ ++fhp->write_count;
+#endif
+ if (dbenv != NULL &&
+ FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0131",
+ "fileops: write %s: %lu bytes at offset %lu",
+ "%s %lu %lu"), fhp->name, (u_long)io_len,
+ (u_long)offset);
+
+ LAST_PANIC_CHECK_BEFORE_IO(env);
+ nio = DB_GLOBAL(j_pwrite) != NULL ?
+ DB_GLOBAL(j_pwrite)(fhp->fd, buf, io_len, offset) :
+ pwrite(fhp->fd, buf, io_len, offset);
+ break;
+ default:
+ return (EINVAL);
+ }
+ if (nio == (ssize_t)io_len) {
+ *niop = io_len;
+ return (0);
+ }
+slow:
+#endif
+ MUTEX_LOCK(env, fhp->mtx_fh);
+
+ if ((ret = __os_seek(env, fhp, pgno, pgsize, relative)) != 0)
+ goto err;
+ switch (op) {
+ case DB_IO_READ:
+ ret = __os_read(env, fhp, buf, io_len, niop);
+ break;
+ case DB_IO_WRITE:
+ ret = __os_write(env, fhp, buf, io_len, niop);
+ break;
+ default:
+ ret = EINVAL;
+ break;
+ }
+
+err: MUTEX_UNLOCK(env, fhp->mtx_fh);
+
+ return (ret);
+
+}
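+
+/*
+ * Illustrative note: the pread/pwrite fast path above computes the
+ * offset as (off_t)pgno * pgsize (or uses the absolute "relative"
+ * offset) and needs no mutex, because the offset travels with the
+ * call.  The slow path must hold fhp->mtx_fh so the seek and the
+ * read/write stay atomic for threads sharing the file handle.
+ */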
+
+/*
+ * __os_read --
+ * Read from a file handle.
+ *
+ * PUBLIC: int __os_read __P((ENV *, DB_FH *, void *, size_t, size_t *));
+ */
+int
+__os_read(env, fhp, addr, len, nrp)
+ ENV *env;
+ DB_FH *fhp;
+ void *addr;
+ size_t len;
+ size_t *nrp;
+{
+ DB_ENV *dbenv;
+ size_t offset;
+ ssize_t nr;
+ int ret;
+ u_int8_t *taddr;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+ ret = 0;
+
+ DB_ASSERT(env, F_ISSET(fhp, DB_FH_OPENED) && fhp->fd != -1);
+
+#if defined(HAVE_STATISTICS)
+ ++fhp->read_count;
+#endif
+ if (dbenv != NULL && FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0132",
+ "fileops: read %s: %lu bytes", "%s %lu"),
+ fhp->name, (u_long)len);
+
+ if (DB_GLOBAL(j_read) != NULL) {
+ *nrp = len;
+ LAST_PANIC_CHECK_BEFORE_IO(env);
+ if (DB_GLOBAL(j_read)(fhp->fd, addr, len) != (ssize_t)len) {
+ ret = __os_get_syserr();
+ __db_syserr(env, ret, DB_STR_A("0133",
+ "read: %#lx, %lu", "%#lx %lu"),
+ P_TO_ULONG(addr), (u_long)len);
+ ret = __os_posix_err(ret);
+ }
+ return (ret);
+ }
+
+ for (taddr = addr, offset = 0;
+ offset < len; taddr += nr, offset += (u_int32_t)nr) {
+ LAST_PANIC_CHECK_BEFORE_IO(env);
+ RETRY_CHK(((nr = read(fhp->fd,
+ CHAR_STAR_CAST taddr, len - offset)) < 0 ? 1 : 0), ret);
+ if (nr == 0 || ret != 0)
+ break;
+ }
+ *nrp = (size_t)(taddr - (u_int8_t *)addr);
+ if (ret != 0) {
+ __db_syserr(env, ret, DB_STR_A("0134",
+ "read: %#lx, %lu", "%#lx %lu"),
+ P_TO_ULONG(taddr), (u_long)len - offset);
+ ret = __os_posix_err(ret);
+ }
+ return (ret);
+}
+
+/*
+ * __os_write --
+ * Write to a file handle.
+ *
+ * PUBLIC: int __os_write __P((ENV *, DB_FH *, void *, size_t, size_t *));
+ */
+int
+__os_write(env, fhp, addr, len, nwp)
+ ENV *env;
+ DB_FH *fhp;
+ void *addr;
+ size_t len;
+ size_t *nwp;
+{
+ DB_ASSERT(env, F_ISSET(fhp, DB_FH_OPENED) && fhp->fd != -1);
+
+#ifdef HAVE_FILESYSTEM_NOTZERO
+ /* Zero-fill as necessary. */
+ if (__os_fs_notzero()) {
+ int ret;
+ if ((ret = __db_zero_fill(env, fhp)) != 0)
+ return (ret);
+ }
+#endif
+ return (__os_physwrite(env, fhp, addr, len, nwp));
+}
+
+/*
+ * __os_physwrite --
+ * Physical write to a file handle.
+ *
+ * PUBLIC: int __os_physwrite
+ * PUBLIC: __P((ENV *, DB_FH *, void *, size_t, size_t *));
+ */
+int
+__os_physwrite(env, fhp, addr, len, nwp)
+ ENV *env;
+ DB_FH *fhp;
+ void *addr;
+ size_t len;
+ size_t *nwp;
+{
+ DB_ENV *dbenv;
+ size_t offset;
+ ssize_t nw;
+ int ret;
+ u_int8_t *taddr;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+ ret = 0;
+
+#if defined(HAVE_STATISTICS)
+ ++fhp->write_count;
+#endif
+ if (dbenv != NULL && FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0135",
+ "fileops: write %s: %lu bytes", "%s %lu"),
+ fhp->name, (u_long)len);
+
+#if defined(HAVE_FILESYSTEM_NOTZERO) && defined(DIAGNOSTIC)
+ if (__os_fs_notzero()) {
+ struct stat sb;
+ off_t cur_off;
+
+ DB_ASSERT(env, fstat(fhp->fd, &sb) != -1 &&
+ (cur_off = lseek(fhp->fd, (off_t)0, SEEK_CUR)) != -1 &&
+ cur_off <= sb.st_size);
+ }
+#endif
+ if (DB_GLOBAL(j_write) != NULL) {
+ *nwp = len;
+ LAST_PANIC_CHECK_BEFORE_IO(env);
+ if (DB_GLOBAL(j_write)(fhp->fd, addr, len) != (ssize_t)len) {
+ ret = __os_get_syserr();
+ __db_syserr(env, ret, DB_STR_A("0136",
+ "write: %#lx, %lu", "%#lx %lu"),
+ P_TO_ULONG(addr), (u_long)len);
+ ret = __os_posix_err(ret);
+
+ DB_EVENT(env, DB_EVENT_WRITE_FAILED, NULL);
+ }
+ return (ret);
+ }
+
+ for (taddr = addr, offset = 0;
+ offset < len; taddr += nw, offset += (u_int32_t)nw) {
+ LAST_PANIC_CHECK_BEFORE_IO(env);
+ RETRY_CHK(((nw = write(fhp->fd,
+ CHAR_STAR_CAST taddr, len - offset)) < 0 ? 1 : 0), ret);
+ if (ret != 0)
+ break;
+ }
+ *nwp = len;
+ if (ret != 0) {
+ __db_syserr(env, ret, DB_STR_A("0137",
+ "write: %#lx, %lu", "%#lx %lu"),
+ P_TO_ULONG(taddr), (u_long)len - offset);
+ ret = __os_posix_err(ret);
+
+ DB_EVENT(env, DB_EVENT_WRITE_FAILED, NULL);
+ }
+ return (ret);
+}
diff --git a/src/os/os_seek.c b/src/os/os_seek.c
new file mode 100644
index 00000000..4676d33a
--- /dev/null
+++ b/src/os/os_seek.c
@@ -0,0 +1,66 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_seek --
+ * Seek to a page/byte offset in the file.
+ *
+ * PUBLIC: int __os_seek __P((ENV *,
+ * PUBLIC: DB_FH *, db_pgno_t, u_int32_t, off_t));
+ */
+int
+__os_seek(env, fhp, pgno, pgsize, relative)
+ ENV *env;
+ DB_FH *fhp;
+ db_pgno_t pgno;
+ u_int32_t pgsize;
+ off_t relative;
+{
+ DB_ENV *dbenv;
+ off_t offset;
+ int ret;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+
+ DB_ASSERT(env, F_ISSET(fhp, DB_FH_OPENED) && fhp->fd != -1);
+
+#if defined(HAVE_STATISTICS)
+ ++fhp->seek_count;
+#endif
+
+ offset = (off_t)pgsize * pgno + relative;
+
+ if (dbenv != NULL && FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0170",
+ "fileops: seek %s to %lu", "%s %lu"),
+ fhp->name, (u_long)offset);
+
+ if (DB_GLOBAL(j_seek) != NULL)
+ ret = DB_GLOBAL(j_seek)(fhp->fd, offset, SEEK_SET);
+ else
+ RETRY_CHK((lseek(
+ fhp->fd, offset, SEEK_SET) == -1 ? 1 : 0), ret);
+
+ if (ret == 0) {
+ fhp->pgsize = pgsize;
+ fhp->pgno = pgno;
+ fhp->offset = relative;
+ } else {
+ __db_syserr(env, ret, DB_STR_A("0171",
+ "seek: %lu: (%lu * %lu) + %lu", "%lu %lu %lu %lu"),
+ (u_long)offset, (u_long)pgno, (u_long)pgsize,
+ (u_long)relative);
+ ret = __os_posix_err(ret);
+ }
+
+ return (ret);
+}
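+
+/*
+ * Illustrative example: the target offset is
+ * (off_t)pgsize * pgno + relative, so seeking to page 7 of a file
+ * with 4096-byte pages positions the handle at byte 28672.
+ */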
diff --git a/src/os/os_stack.c b/src/os/os_stack.c
new file mode 100644
index 00000000..037080f3
--- /dev/null
+++ b/src/os/os_stack.c
@@ -0,0 +1,45 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#if defined(HAVE_SYSTEM_INCLUDE_FILES) && defined(HAVE_BACKTRACE) && \
+ defined(HAVE_BACKTRACE_SYMBOLS) && defined(HAVE_EXECINFO_H)
+#include <execinfo.h>
+#endif
+
+/*
+ * __os_stack --
+ * Output a stack trace to the message file handle.
+ *
+ * PUBLIC: void __os_stack __P((ENV *));
+ */
+void
+__os_stack(env)
+ ENV *env;
+{
+#if defined(HAVE_BACKTRACE) && defined(HAVE_BACKTRACE_SYMBOLS)
+ void *array[200];
+ size_t i, size;
+ char **strings;
+
+ /*
+ * Solaris and the GNU C library support this interface. Solaris
+	 * has additional interfaces (printstack and walkcontext); I don't
+	 * know whether they offer any additional value.
+ */
+ size = backtrace(array, sizeof(array) / sizeof(array[0]));
+ strings = backtrace_symbols(array, size);
+
+ for (i = 0; i < size; ++i)
+ __db_errx(env, "%s", strings[i]);
+ free(strings);
+#endif
+ COMPQUIET(env, NULL);
+}
diff --git a/src/os/os_stat.c b/src/os/os_stat.c
new file mode 100644
index 00000000..43c66075
--- /dev/null
+++ b/src/os/os_stat.c
@@ -0,0 +1,108 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_exists --
+ * Return if the file exists.
+ *
+ * PUBLIC: int __os_exists __P((ENV *, const char *, int *));
+ */
+int
+__os_exists(env, path, isdirp)
+ ENV *env;
+ const char *path;
+ int *isdirp;
+{
+ DB_ENV *dbenv;
+ struct stat sb;
+ int ret;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+
+ if (dbenv != NULL &&
+ FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0165",
+ "fileops: stat %s", "%s"), path);
+
+ if (DB_GLOBAL(j_exists) != NULL)
+ return (DB_GLOBAL(j_exists)(path, isdirp));
+
+ RETRY_CHK((stat(CHAR_STAR_CAST path, &sb)), ret);
+ if (ret != 0)
+ return (__os_posix_err(ret));
+
+#if !defined(S_ISDIR) || defined(STAT_MACROS_BROKEN)
+#undef S_ISDIR
+#ifdef _S_IFDIR
+#define S_ISDIR(m) (_S_IFDIR & (m))
+#else
+#define S_ISDIR(m) (((m) & 0170000) == 0040000)
+#endif
+#endif
+ if (isdirp != NULL)
+ *isdirp = S_ISDIR(sb.st_mode);
+
+ return (0);
+}
+
+/*
+ * __os_ioinfo --
+ * Return file size and I/O size; abstracted to make it easier
+ * to replace.
+ *
+ * PUBLIC: int __os_ioinfo __P((ENV *, const char *,
+ * PUBLIC: DB_FH *, u_int32_t *, u_int32_t *, u_int32_t *));
+ */
+int
+__os_ioinfo(env, path, fhp, mbytesp, bytesp, iosizep)
+ ENV *env;
+ const char *path;
+ DB_FH *fhp;
+ u_int32_t *mbytesp, *bytesp, *iosizep;
+{
+ struct stat sb;
+ int ret;
+
+ if (DB_GLOBAL(j_ioinfo) != NULL)
+ return (DB_GLOBAL(j_ioinfo)(path,
+ fhp->fd, mbytesp, bytesp, iosizep));
+
+ DB_ASSERT(env, F_ISSET(fhp, DB_FH_OPENED) && fhp->fd != -1);
+
+ RETRY_CHK((fstat(fhp->fd, &sb)), ret);
+ if (ret != 0) {
+ __db_syserr(env, ret, DB_STR("0166", "fstat"));
+ return (__os_posix_err(ret));
+ }
+
+ /* Return the size of the file. */
+ if (mbytesp != NULL)
+ *mbytesp = (u_int32_t)(sb.st_size / MEGABYTE);
+ if (bytesp != NULL)
+ *bytesp = (u_int32_t)(sb.st_size % MEGABYTE);
+
+ /*
+ * Return the underlying filesystem I/O size, if available.
+ *
+ * XXX
+ * Check for a 0 size -- the HP MPE/iX architecture has st_blksize,
+ * but it's always 0.
+ */
+#ifdef HAVE_STRUCT_STAT_ST_BLKSIZE
+ if (iosizep != NULL && (*iosizep = sb.st_blksize) == 0)
+ *iosizep = DB_DEF_IOSIZE;
+#else
+ if (iosizep != NULL)
+ *iosizep = DB_DEF_IOSIZE;
+#endif
+ return (0);
+}
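+
+/*
+ * Illustrative example: the size is returned as a megabyte/byte pair
+ * so files larger than 4GB fit in two u_int32_t values; a
+ * 5,242,881-byte file comes back as *mbytesp == 5, *bytesp == 1.
+ */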
diff --git a/src/os/os_tmpdir.c b/src/os/os_tmpdir.c
new file mode 100644
index 00000000..06d35ba9
--- /dev/null
+++ b/src/os/os_tmpdir.c
@@ -0,0 +1,141 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+#ifdef HAVE_SYSTEM_INCLUDE_FILES
+#ifdef macintosh
+#include <TFileSpec.h>
+#endif
+#endif
+
+/*
+ * __os_tmpdir --
+ * Set the temporary directory path.
+ *
+ * The order of items in the list structure and the order of checks in
+ * the environment are documented.
+ *
+ * PUBLIC: int __os_tmpdir __P((ENV *, u_int32_t));
+ */
+int
+__os_tmpdir(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ DB_ENV *dbenv;
+ int isdir, ret;
+ char *tdir, tdir_buf[DB_MAXPATHLEN];
+
+ dbenv = env->dbenv;
+
+ /* Use the environment if it's permitted and initialized. */
+ if (LF_ISSET(DB_USE_ENVIRON) ||
+ (LF_ISSET(DB_USE_ENVIRON_ROOT) && __os_isroot())) {
+ /* POSIX: TMPDIR */
+ tdir = tdir_buf;
+ if ((ret = __os_getenv(
+ env, "TMPDIR", &tdir, sizeof(tdir_buf))) != 0)
+ return (ret);
+ if (tdir != NULL && tdir[0] != '\0')
+ goto found;
+
+ /*
+ * Windows: TEMP, TMP
+ */
+ tdir = tdir_buf;
+ if ((ret = __os_getenv(
+ env, "TEMP", &tdir, sizeof(tdir_buf))) != 0)
+ return (ret);
+ if (tdir != NULL && tdir[0] != '\0')
+ goto found;
+
+ tdir = tdir_buf;
+ if ((ret = __os_getenv(
+ env, "TMP", &tdir, sizeof(tdir_buf))) != 0)
+ return (ret);
+ if (tdir != NULL && tdir[0] != '\0')
+ goto found;
+
+ /* Macintosh */
+ tdir = tdir_buf;
+ if ((ret = __os_getenv(
+ env, "TempFolder", &tdir, sizeof(tdir_buf))) != 0)
+ return (ret);
+
+ if (tdir != NULL && tdir[0] != '\0')
+found: return (__os_strdup(env, tdir, &dbenv->db_tmp_dir));
+ }
+
+#ifdef macintosh
+ /* Get the path to the temporary folder. */
+ {FSSpec spec;
+
+ if (!Special2FSSpec(kTemporaryFolderType,
+ kOnSystemDisk, 0, &spec))
+ return (__os_strdup(env,
+ FSp2FullPath(&spec), &dbenv->db_tmp_dir));
+ }
+#endif
+#ifdef DB_WIN32
+ /* Get the path to the temporary directory. */
+ {
+ _TCHAR tpath[DB_MAXPATHLEN + 1];
+ char *path, *eos;
+
+ if (GetTempPath(DB_MAXPATHLEN, tpath) > 2) {
+ FROM_TSTRING(env, tpath, path, ret);
+ if (ret != 0)
+ return (ret);
+
+ eos = path + strlen(path) - 1;
+ if (*eos == '\\' || *eos == '/')
+ *eos = '\0';
+ if (__os_exists(env, path, &isdir) == 0 && isdir) {
+ ret = __os_strdup(env,
+ path, &dbenv->db_tmp_dir);
+ FREE_STRING(env, path);
+ return (ret);
+ }
+ FREE_STRING(env, path);
+ }
+ }
+#endif
+
+ /*
+ * Step through the static list looking for a possibility.
+ *
+ * We don't use the obvious data structure because some C compilers
+ * (and I use the phrase loosely) don't like static data arrays.
+ */
+#define DB_TEMP_DIRECTORY(n) { \
+ char *__p = n; \
+ if (__os_exists(env, __p, &isdir) == 0 && isdir != 0) \
+ return (__os_strdup(env, __p, &dbenv->db_tmp_dir)); \
+ }
+#ifdef DB_WIN32
+ DB_TEMP_DIRECTORY("/temp");
+ DB_TEMP_DIRECTORY("C:/temp");
+ DB_TEMP_DIRECTORY("C:/tmp");
+#else
+ DB_TEMP_DIRECTORY("/var/tmp");
+ DB_TEMP_DIRECTORY("/usr/tmp");
+ DB_TEMP_DIRECTORY("/tmp");
+#if defined(ANDROID) || defined(DB_ANDROID)
+ DB_TEMP_DIRECTORY("/cache");
+#endif
+#endif
+
+ /*
+ * If we don't have any other place to store temporary files, store
+ * them in the current directory.
+ */
+ return (__os_strdup(env, "", &dbenv->db_tmp_dir));
+}
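+
+/*
+ * Illustrative summary of the resulting search order: TMPDIR, TEMP,
+ * TMP and TempFolder in the environment (when DB_USE_ENVIRON is
+ * permitted), then the platform temporary directory, then the static
+ * list above, and finally the current directory as a last resort.
+ */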
diff --git a/src/os/os_truncate.c b/src/os/os_truncate.c
new file mode 100644
index 00000000..f559e9cb
--- /dev/null
+++ b/src/os/os_truncate.c
@@ -0,0 +1,63 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2004, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_truncate --
+ * Truncate the file.
+ *
+ * PUBLIC: int __os_truncate __P((ENV *, DB_FH *, db_pgno_t, u_int32_t));
+ */
+int
+__os_truncate(env, fhp, pgno, pgsize)
+ ENV *env;
+ DB_FH *fhp;
+ db_pgno_t pgno;
+ u_int32_t pgsize;
+{
+ DB_ENV *dbenv;
+ off_t offset;
+ int ret;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+
+ /*
+ * Truncate a file so that "pgno" is discarded from the end of the
+ * file.
+ */
+ offset = (off_t)pgsize * pgno;
+
+ if (dbenv != NULL &&
+ FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0141",
+ "fileops: truncate %s to %lu", "%s %lu"),
+ fhp->name, (u_long)offset);
+
+ LAST_PANIC_CHECK_BEFORE_IO(env);
+
+ if (DB_GLOBAL(j_ftruncate) != NULL)
+ ret = DB_GLOBAL(j_ftruncate)(fhp->fd, offset);
+ else {
+#ifdef HAVE_FTRUNCATE
+ RETRY_CHK((ftruncate(fhp->fd, offset)), ret);
+#else
+ ret = DB_OPNOTSUP;
+#endif
+ }
+
+ if (ret != 0) {
+ __db_syserr(env, ret, DB_STR_A("0142",
+ "ftruncate: %lu", "%lu"), (u_long)offset);
+ ret = __os_posix_err(ret);
+ }
+
+ return (ret);
+}
diff --git a/src/os/os_uid.c b/src/os/os_uid.c
new file mode 100644
index 00000000..2e5c9f87
--- /dev/null
+++ b/src/os/os_uid.c
@@ -0,0 +1,55 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_unique_id --
+ * Return a unique 32-bit value.
+ *
+ * PUBLIC: void __os_unique_id __P((ENV *, u_int32_t *));
+ */
+void
+__os_unique_id(env, idp)
+ ENV *env;
+ u_int32_t *idp;
+{
+ DB_ENV *dbenv;
+ db_timespec v;
+ pid_t pid;
+ u_int32_t id;
+
+ *idp = 0;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+
+ /*
+	 * Our randomized value combines our process ID, the current
+ * time of day and a stack address, all XOR'd together.
+ */
+ __os_id(dbenv, &pid, NULL);
+ __os_gettime(env, &v, 1);
+
+ id = (u_int32_t)pid ^
+ (u_int32_t)v.tv_sec ^ (u_int32_t)v.tv_nsec ^ P_TO_UINT32(&pid);
+
+ /*
+ * We could try and find a reasonable random-number generator, but
+ * that's not all that easy to do. Seed and use srand()/rand(), if
+ * we can find them.
+ */
+ if (DB_GLOBAL(uid_init) == 0) {
+ DB_GLOBAL(uid_init) = 1;
+ srand((u_int)id);
+ }
+ id ^= (u_int)rand();
+
+ *idp = id;
+}
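+
+/*
+ * Illustrative note: the value is built as
+ *
+ *	id  = pid ^ tv_sec ^ tv_nsec ^ (stack address of pid)
+ *	id ^= rand()		-- srand() seeded with the first id
+ *
+ * so no single weak source (a small pid, a coarse clock) dominates
+ * the result.
+ */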
diff --git a/src/os/os_unlink.c b/src/os/os_unlink.c
new file mode 100644
index 00000000..f9a0b688
--- /dev/null
+++ b/src/os/os_unlink.c
@@ -0,0 +1,80 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_unlink --
+ * Remove a file.
+ *
+ * PUBLIC: int __os_unlink __P((ENV *, const char *, int));
+ */
+int
+__os_unlink(env, path, overwrite_test)
+ ENV *env;
+ const char *path;
+ int overwrite_test;
+{
+ DB_ENV *dbenv;
+ int ret, t_ret;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+
+ if (dbenv != NULL &&
+ FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0160", "fileops: unlink %s",
+ "%s"), path);
+
+ /* Optionally overwrite the contents of the file to enhance security. */
+ if (dbenv != NULL && overwrite_test && F_ISSET(dbenv, DB_ENV_OVERWRITE))
+ (void)__db_file_multi_write(env, path);
+
+ LAST_PANIC_CHECK_BEFORE_IO(env);
+
+ if (DB_GLOBAL(j_unlink) != NULL)
+ ret = DB_GLOBAL(j_unlink)(path);
+ else {
+ RETRY_CHK((unlink(CHAR_STAR_CAST path)), ret);
+#ifdef HAVE_QNX
+ /*
+ * The file may be a region file created by shm_open, not a
+ * regular file. Try and delete using unlink, and if that
+ * fails for an unexpected reason, try a shared memory unlink.
+ */
+ if (ret != 0 && __os_posix_err(ret) != ENOENT)
+ RETRY_CHK((shm_unlink(path)), ret);
+#endif
+ }
+
+ /*
+ * !!!
+ * The results of unlink are file system driver specific on VxWorks.
+ * In the case of removing a file that did not exist, some, at least,
+ * return an error, but with an errno of 0, not ENOENT. We do not
+ * have to test for that explicitly, the RETRY_CHK macro resets "ret"
+ * to be the errno, and so we'll just slide right on through.
+ *
+ * XXX
+ * We shouldn't be testing for an errno of ENOENT here, but ENOENT
+ * signals that a file is missing, and we attempt to unlink things
+ * (such as v. 2.x environment regions, in ENV->remove) that we
+ * are expecting not to be there. Reporting errors in these cases
+ * is annoying.
+ */
+ if (ret != 0) {
+ t_ret = __os_posix_err(ret);
+ if (t_ret != ENOENT)
+ __db_syserr(env, ret, DB_STR_A("0161",
+ "unlink: %s", "%s"), path);
+ ret = t_ret;
+ }
+
+ return (ret);
+}
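
The DB_ENV_OVERWRITE path above calls __db_file_multi_write, which does not
appear in this diff. A single-pass sketch of the overwrite-before-unlink
idea, assuming POSIX file I/O; the real routine makes multiple passes:

    /*
     * Sketch: rewrite a file's bytes before unlinking it so stale data
     * is less likely to survive on disk. One pass only; illustrative.
     */
    #include <fcntl.h>
    #include <string.h>
    #include <sys/stat.h>
    #include <unistd.h>

    static int
    overwrite_and_unlink(const char *path)
    {
        struct stat sb;
        char buf[8192];
        ssize_t nw;
        off_t left;
        int fd;

        if ((fd = open(path, O_WRONLY)) == -1)
            return (-1);
        if (fstat(fd, &sb) == -1) {
            (void)close(fd);
            return (-1);
        }
        memset(buf, 0xff, sizeof(buf));
        for (left = sb.st_size; left > 0; left -= nw)
            if ((nw = write(fd, buf, left > (off_t)sizeof(buf) ?
                sizeof(buf) : (size_t)left)) <= 0)
                break;
        (void)fsync(fd);                /* force the pattern to disk */
        (void)close(fd);
        return (unlink(path));
    }

    int
    main(int argc, char *argv[])
    {
        return (argc == 2 && overwrite_and_unlink(argv[1]) == 0 ? 0 : 1);
    }
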
diff --git a/src/os/os_yield.c b/src/os/os_yield.c
new file mode 100644
index 00000000..f0e170f0
--- /dev/null
+++ b/src/os/os_yield.c
@@ -0,0 +1,95 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+#if defined(HAVE_SYSTEM_INCLUDE_FILES) && defined(HAVE_SCHED_YIELD)
+#include <sched.h>
+#endif
+
+static void __os_sleep __P((ENV *, u_long, u_long));
+
+/*
+ * __os_yield --
+ * Yield the processor, optionally pausing until running again.
+ *
+ * PUBLIC: void __os_yield __P((ENV *, u_long, u_long));
+ */
+void
+__os_yield(env, secs, usecs)
+ ENV *env;
+ u_long secs, usecs; /* Seconds and microseconds. */
+{
+ /*
+ * Don't require the values be normalized (some operating systems
+ * return an error if the usecs argument to select is too large).
+ */
+ for (; usecs >= US_PER_SEC; usecs -= US_PER_SEC)
+ ++secs;
+
+ if (DB_GLOBAL(j_yield) != NULL) {
+ (void)DB_GLOBAL(j_yield)(secs, usecs);
+ return;
+ }
+
+ /*
+ * Yield the processor so other processes or threads can run. Use
+ * the local yield call if not pausing, otherwise call the select
+ * function.
+ */
+ if (secs != 0 || usecs != 0)
+ __os_sleep(env, secs, usecs);
+ else {
+#if defined(HAVE_MUTEX_UI_THREADS)
+ thr_yield();
+#elif defined(HAVE_PTHREAD_YIELD)
+ pthread_yield();
+#elif defined(HAVE_SCHED_YIELD)
+ (void)sched_yield();
+#elif defined(HAVE_YIELD)
+ yield();
+#else
+ __os_sleep(env, 0, 0);
+#endif
+ }
+}
+
+/*
+ * __os_sleep --
+ * Pause the thread of control.
+ */
+static void
+__os_sleep(env, secs, usecs)
+ ENV *env;
+ u_long secs, usecs; /* Seconds and microseconds. */
+{
+ struct timeval t;
+ int ret;
+
+ /*
+ * Sheer raving paranoia -- don't select for 0 time, in case some
+ * implementation doesn't yield the processor in that case.
+ */
+ t.tv_sec = (long)secs;
+ t.tv_usec = (long)usecs + 1;
+
+ /*
+ * We don't catch interrupts and restart the system call here, unlike
+ * other Berkeley DB system calls. This may be a user attempting to
+ * interrupt a sleeping DB utility (for example, db_checkpoint), and
+ * we want the utility to see the signal and quit. This assumes it's
+ * always OK for DB to sleep for less time than originally scheduled.
+ */
+ if (select(0, NULL, NULL, NULL, &t) == -1) {
+ ret = __os_get_syserr();
+ if (__os_posix_err(ret) != EINTR)
+ __db_syserr(env, ret, DB_STR("0167", "select"));
+ }
+}
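
The select()-with-no-descriptors trick used by __os_sleep is worth seeing in
isolation: it pauses with microsecond granularity and, unlike most DB system
calls, deliberately returns early on signals. A standalone POSIX sketch:

    /* Sketch of select()-as-sleep, as used by __os_sleep. */
    #include <errno.h>
    #include <stdio.h>
    #include <sys/select.h>

    static void
    sleep_usecs(long secs, long usecs)
    {
        struct timeval t;

        t.tv_sec = secs;
        t.tv_usec = usecs + 1;          /* never select for 0 time */
        if (select(0, NULL, NULL, NULL, &t) == -1 && errno != EINTR)
            perror("select");
    }

    int
    main(void)
    {
        printf("sleeping 250ms...\n");
        sleep_usecs(0, 250000);
        return (0);
    }
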
diff --git a/src/os_qnx/os_qnx_fsync.c b/src/os_qnx/os_qnx_fsync.c
new file mode 100644
index 00000000..827fa446
--- /dev/null
+++ b/src/os_qnx/os_qnx_fsync.c
@@ -0,0 +1,73 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * QNX has a special requirement for fsync: if the file is a shared memory
+ * object, we cannot call fsync because it is not implemented for such
+ * objects. Instead, we set the O_DSYNC flag on the file descriptor and
+ * then do an empty write so that all data are synced. We sync this way
+ * only if the file is a shared memory object; ordinary files are still
+ * synced with a regular fdatasync call, which is both faster and atomic.
+ * We don't simply set the O_DSYNC flag at open time, since that would
+ * force every write to be synchronous, and we remove the O_DSYNC flag
+ * afterward if it was not originally set on the file descriptor before
+ * it was passed to this function.
+ * This differs slightly from the VxWorks and HP code, since QNX does
+ * supply an fsync call; it merely has a unique requirement.
+ */
+int
+__qnx_fsync(fhp)
+ DB_FH *fhp;
+{
+ int ret;
+ int fd, unset, flags;
+
+ fd = fhp->fd;
+ unset = 1;
+ ret = flags = 0;
+	if (F_ISSET(fhp, DB_FH_REGION)) {
+ RETRY_CHK(fcntl(fd, F_GETFL), ret);
+ if (ret == -1)
+ goto err;
+ /*
+		 * If the descriptor already has the O_DSYNC flag set, we
+		 * must not remove it after the empty write.
+		 */
+		if ((ret & O_DSYNC) != 0)
+ unset = 0;
+ else {
+ ret |= O_DSYNC;
+ flags = ret;
+ RETRY_CHK(fcntl(fd, F_SETFL, flags), ret);
+ if (ret == -1)
+ goto err;
+ }
+ /* Do an empty write, to force a sync */
+ RETRY_CHK(write(fd, "", 0), ret);
+ if (ret == -1)
+ goto err;
+ /* remove the O_DSYNC flag if necessary */
+ if (unset) {
+ RETRY_CHK(fcntl(fd, F_GETFL), ret);
+ if (ret == -1)
+ goto err;
+ ret &= ~O_DSYNC;
+ flags = ret;
+ RETRY_CHK(fcntl(fd, F_SETFL, flags), ret);
+ if (ret == -1)
+ goto err;
+ }
+ } else
+ RETRY_CHK(fdatasync(fd), ret);
+
+err: return (ret);
+}
diff --git a/src/os_qnx/os_qnx_open.c b/src/os_qnx/os_qnx_open.c
new file mode 100644
index 00000000..d0214a0d
--- /dev/null
+++ b/src/os_qnx/os_qnx_open.c
@@ -0,0 +1,79 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_qnx_region_open --
+ * Open a shared memory region file using POSIX shm_open.
+ *
+ * PUBLIC: #ifdef HAVE_QNX
+ * PUBLIC: int __os_qnx_region_open
+ * PUBLIC: __P((ENV *, const char *, int, int, DB_FH **));
+ * PUBLIC: #endif
+ */
+int
+__os_qnx_region_open(env, name, oflags, mode, fhpp)
+ ENV *env;
+ const char *name;
+ int oflags, mode;
+ DB_FH **fhpp;
+{
+ DB_FH *fhp;
+ int fcntl_flags;
+ int ret;
+
+ /*
+ * Allocate the file handle and copy the file name. We generally only
+ * use the name for verbose or error messages, but on systems where we
+ * can't unlink temporary files immediately, we use the name to unlink
+ * the temporary file when the file handle is closed.
+ *
+ * Lock the ENV handle and insert the new file handle on the list.
+ */
+ if ((ret = __os_calloc(env, 1, sizeof(DB_FH), &fhp)) != 0)
+ return (ret);
+ if ((ret = __os_strdup(env, name, &fhp->name)) != 0)
+ goto err;
+ if (env != NULL) {
+ MUTEX_LOCK(env, env->mtx_env);
+ TAILQ_INSERT_TAIL(&env->fdlist, fhp, q);
+ MUTEX_UNLOCK(env, env->mtx_env);
+ F_SET(fhp, DB_FH_ENVLINK);
+ }
+
+ /*
+ * Once we have created the object, we don't need the name
+ * anymore. Other callers of this will convert themselves.
+ */
+ if ((fhp->fd = shm_open(name, oflags, mode)) == -1) {
+ ret = __os_posix_err(__os_get_syserr());
+err: (void)__os_closehandle(env, fhp);
+ return (ret);
+ }
+
+ F_SET(fhp, DB_FH_OPENED);
+
+#ifdef HAVE_FCNTL_F_SETFD
+ /* Deny file descriptor access to any child process. */
+ if ((fcntl_flags = fcntl(fhp->fd, F_GETFD)) == -1 ||
+ fcntl(fhp->fd, F_SETFD, fcntl_flags | FD_CLOEXEC) == -1) {
+ ret = __os_get_syserr();
+ __db_syserr(env, ret, DB_STR("0001", "fcntl(F_SETFD)"));
+ (void)__os_closehandle(env, fhp);
+ return (__os_posix_err(ret));
+ }
+#else
+ COMPQUIET(fcntl_flags, 0);
+#endif
+ F_SET(fhp, DB_FH_OPENED);
+ *fhpp = fhp;
+ return (0);
+}
diff --git a/src/os_vxworks/os_vx_abs.c b/src/os_vxworks/os_vx_abs.c
new file mode 100644
index 00000000..69413ee5
--- /dev/null
+++ b/src/os_vxworks/os_vx_abs.c
@@ -0,0 +1,42 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "iosLib.h"
+
+/*
+ * __os_abspath --
+ * Return if a path is an absolute path.
+ */
+int
+__os_abspath(path)
+ const char *path;
+{
+ DEV_HDR *dummy;
+ char *ptail;
+
+ /*
+ * VxWorks devices can be rooted at any name at all.
+ * Use iosDevFind() to see if name matches any of our devices.
+ */
+ if ((dummy = iosDevFind(path, (const char**)&ptail)) == NULL)
+ return (0);
+ /*
+ * If the routine used a device, then ptail points to the
+ * rest and we are an abs path.
+ */
+ if (ptail != path)
+ return (1);
+ /*
+ * If the path starts with a '/', then we are an absolute path,
+ * using the host machine, otherwise we are not.
+ */
+ return (path[0] == '/');
+}
diff --git a/src/os_vxworks/os_vx_config.c b/src/os_vxworks/os_vx_config.c
new file mode 100644
index 00000000..649a3b4a
--- /dev/null
+++ b/src/os_vxworks/os_vx_config.c
@@ -0,0 +1,56 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_fs_notzero --
+ * Return 1 if allocated filesystem blocks are not zeroed.
+ */
+int
+__os_fs_notzero()
+{
+ /*
+ * Some VxWorks FS drivers do not zero-fill pages that were never
+ * explicitly written to the file, they give you random garbage,
+ * and that breaks Berkeley DB.
+ */
+ return (1);
+}
+
+/*
+ * __os_support_direct_io --
+ * Return 1 if we support direct I/O.
+ */
+int
+__os_support_direct_io()
+{
+ return (0);
+}
+
+/*
+ * __os_support_db_register --
+ * Return 1 if the system supports DB_REGISTER.
+ */
+int
+__os_support_db_register()
+{
+ return (0);
+}
+
+/*
+ * __os_support_replication --
+ * Return 1 if the system supports replication.
+ */
+int
+__os_support_replication()
+{
+ return (1);
+}
diff --git a/src/os_vxworks/os_vx_map.c b/src/os_vxworks/os_vx_map.c
new file mode 100644
index 00000000..517cadae
--- /dev/null
+++ b/src/os_vxworks/os_vx_map.c
@@ -0,0 +1,436 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * This code is derived from software contributed to Sleepycat Software by
+ * Frederick G.M. Roeber of Netscape Communications Corp.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * DB uses memory-mapped files for two things:
+ * faster access of read-only databases, and
+ * shared memory for process synchronization and locking.
+ * The code carefully does not mix the two uses. The first-case uses are
+ * actually written such that memory-mapping isn't really required -- it's
+ * merely a convenience -- so we don't have to worry much about it. In the
+ * second case, it's solely used as a shared memory mechanism, so that's
+ * all we have to replace.
+ *
+ * All memory in VxWorks is shared, and a task can allocate memory and keep
+ * notes. So I merely have to allocate memory, remember the "filename" for
+ * that memory, and issue small-integer segment IDs which index the list of
+ * these shared-memory segments. Subsequent opens are checked against the
+ * list of already open segments.
+ */
+typedef struct {
+ void *segment; /* Segment address. */
+ u_int32_t size; /* Segment size. */
+ char *name; /* Segment name. */
+ long segid; /* Segment ID. */
+} os_segdata_t;
+
+static os_segdata_t *__os_segdata; /* Segment table. */
+static int __os_segdata_size; /* Segment table size. */
+
+#define OS_SEGDATA_STARTING_SIZE 16
+#define OS_SEGDATA_INCREMENT 16
+
+static int __os_segdata_allocate
+ __P((ENV *, const char *, REGINFO *, REGION *));
+static int __os_segdata_find_byname
+ __P((ENV *, const char *, REGINFO *, REGION *));
+static int __os_segdata_init __P((ENV *));
+static int __os_segdata_new __P((ENV *, int *));
+static int __os_segdata_release __P((ENV *, REGION *, int));
+
+/*
+ * __os_attach --
+ * Create/join a shared memory region.
+ */
+int
+__os_attach(env, infop, rp)
+ ENV *env;
+ REGINFO *infop;
+ REGION *rp;
+{
+ DB_ENV *dbenv;
+ int ret;
+
+ dbenv = env->dbenv;
+
+	if (__os_segdata == NULL &&
+	    (ret = __os_segdata_init(env)) != 0)
+		return (ret);
+
+ DB_BEGIN_SINGLE_THREAD;
+
+ /* Try to find an already existing segment. */
+ ret = __os_segdata_find_byname(env, infop->name, infop, rp);
+
+ /*
+ * If we are trying to join a region, it is easy, either we
+ * found it and we return, or we didn't find it and we return
+ * an error that it doesn't exist.
+ */
+ if (!F_ISSET(infop, REGION_CREATE)) {
+ if (ret != 0) {
+ __db_errx(env, DB_STR_A("0197",
+ "segment %s does not exist", "%s"),
+ infop->name);
+ ret = EAGAIN;
+ }
+ goto out;
+ }
+
+ /*
+ * If we get here, we are trying to create the region.
+ * There are several things to consider:
+ * - if we have an error (not a found or not-found value), return.
+ * - they better have shm_key set.
+ * - if the region is already there (ret == 0 from above),
+ * assume the application crashed and we're restarting.
+ * Delete the old region.
+ * - try to create the region.
+ */
+ if (ret != 0 && ret != ENOENT)
+ goto out;
+
+ if (dbenv->shm_key == INVALID_REGION_SEGID) {
+ __db_errx(env, DB_STR("0198",
+ "no base shared memory ID specified"));
+ ret = EAGAIN;
+ goto out;
+ }
+ if (ret == 0 && __os_segdata_release(env, rp, 1) != 0) {
+ __db_errx(env,DB_STR_A("0199",
+ "key: %ld: shared memory region already exists", "%ld"),
+ dbenv->shm_key + (infop->id - 1));
+ ret = EAGAIN;
+ goto out;
+ }
+
+ ret = __os_segdata_allocate(env, infop->name, infop, rp);
+out:
+ DB_END_SINGLE_THREAD;
+ return (ret);
+}
+
+/*
+ * __os_detach --
+ * Detach from a shared region.
+ */
+int
+__os_detach(env, infop, destroy)
+ ENV *env;
+ REGINFO *infop;
+ int destroy;
+{
+ /*
+ * If just detaching, there is no mapping to discard.
+ * If destroying, remove the region.
+ */
+ if (destroy)
+ return (__os_segdata_release(env, infop->rp, 0));
+ return (0);
+}
+
+/*
+ * __os_mapfile --
+ * Map in a shared memory file.
+ */
+int
+__os_mapfile(env, path, fhp, len, is_rdonly, addrp)
+ ENV *env;
+ char *path;
+ DB_FH *fhp;
+ int is_rdonly;
+ size_t len;
+ void **addrp;
+{
+ /* We cannot map in regular files in VxWorks. */
+ COMPQUIET(env, NULL);
+ COMPQUIET(path, NULL);
+ COMPQUIET(fhp, NULL);
+ COMPQUIET(is_rdonly, 0);
+ COMPQUIET(len, 0);
+ COMPQUIET(addrp, NULL);
+ return (DB_OPNOTSUP);
+}
+
+/*
+ * __os_unmapfile --
+ * Unmap the shared memory file.
+ */
+int
+__os_unmapfile(env, addr, len)
+ ENV *env;
+ void *addr;
+ size_t len;
+{
+ /* We cannot map in regular files in VxWorks. */
+ COMPQUIET(env, NULL);
+ COMPQUIET(addr, NULL);
+ COMPQUIET(len, 0);
+ return (DB_OPNOTSUP);
+}
+
+/*
+ * __os_segdata_init --
+ * Initializes the library's table of shared memory segments.
+ * Called once on the first time through __os_segdata_new().
+ */
+static int
+__os_segdata_init(env)
+ ENV *env;
+{
+ int ret;
+
+ if (__os_segdata != NULL) {
+ __db_errx(env, DB_STR("0200",
+ "shared memory segment already exists"));
+ return (EEXIST);
+ }
+
+ /*
+	 * Serialize creation and initialization of the segment table.
+ */
+ DB_BEGIN_SINGLE_THREAD;
+ __os_segdata_size = OS_SEGDATA_STARTING_SIZE;
+ ret = __os_calloc(env,
+ __os_segdata_size, sizeof(os_segdata_t), &__os_segdata);
+ DB_END_SINGLE_THREAD;
+ return (ret);
+}
+
+/*
+ * __os_segdata_destroy --
+ * Destroys the library's table of shared memory segments. It also
+ * frees all linked data: the segments themselves, and their names.
+ * Currently not called. This function should be called if the
+ * user creates a function to unload or shutdown.
+ */
+int
+__os_segdata_destroy(env)
+ ENV *env;
+{
+ os_segdata_t *p;
+ int i;
+
+ if (__os_segdata == NULL)
+ return (0);
+
+ DB_BEGIN_SINGLE_THREAD;
+ for (i = 0; i < __os_segdata_size; i++) {
+ p = &__os_segdata[i];
+ if (p->name != NULL) {
+ __os_free(env, p->name);
+ p->name = NULL;
+ }
+ if (p->segment != NULL) {
+ __os_free(env, p->segment);
+ p->segment = NULL;
+ }
+ p->size = 0;
+ }
+
+ __os_free(env, __os_segdata);
+ __os_segdata = NULL;
+ __os_segdata_size = 0;
+ DB_END_SINGLE_THREAD;
+
+ return (0);
+}
+
+/*
+ * __os_segdata_allocate --
+ * Creates a new segment of the specified size, optionally with the
+ * specified name.
+ *
+ * Assumes it is called with the SEGDATA lock taken.
+ */
+static int
+__os_segdata_allocate(env, name, infop, rp)
+ ENV *env;
+ const char *name;
+ REGINFO *infop;
+ REGION *rp;
+{
+ DB_ENV *dbenv;
+ os_segdata_t *p;
+ int id, ret;
+
+ dbenv = env->dbenv;
+
+ if ((ret = __os_segdata_new(env, &id)) != 0)
+ return (ret);
+
+ p = &__os_segdata[id];
+ if ((ret = __os_calloc(env, 1, rp->size, &p->segment)) != 0)
+ return (ret);
+ if ((ret = __os_strdup(env, name, &p->name)) != 0) {
+ __os_free(env, p->segment);
+ p->segment = NULL;
+ return (ret);
+ }
+ p->size = rp->size;
+ p->segid = dbenv->shm_key + infop->id - 1;
+
+ infop->addr = p->segment;
+ rp->segid = id;
+
+ return (0);
+}
+
+/*
+ * __os_segdata_new --
+ *	Finds a new segdata slot. Does not initialise it, so the slot id
+ *	returned is only valid until this function is called again.
+ *
+ * Assumes it is called with the SEGDATA lock taken.
+ */
+static int
+__os_segdata_new(env, segidp)
+ ENV *env;
+ int *segidp;
+{
+ os_segdata_t *p;
+ int i, newsize, ret;
+
+ if (__os_segdata == NULL) {
+ __db_errx(env, DB_STR("0201",
+ "shared memory segment not initialized"));
+ return (EAGAIN);
+ }
+
+ for (i = 0; i < __os_segdata_size; i++) {
+ p = &__os_segdata[i];
+ if (p->segment == NULL) {
+ *segidp = i;
+ return (0);
+ }
+ }
+
+ /*
+ * No more free slots, expand.
+ */
+ newsize = __os_segdata_size + OS_SEGDATA_INCREMENT;
+ if ((ret = __os_realloc(env, newsize * sizeof(os_segdata_t),
+ &__os_segdata)) != 0)
+ return (ret);
+ memset(&__os_segdata[__os_segdata_size],
+ 0, OS_SEGDATA_INCREMENT * sizeof(os_segdata_t));
+
+ *segidp = __os_segdata_size;
+ __os_segdata_size = newsize;
+
+ return (0);
+}
+
+/*
+ * __os_segdata_find_byname --
+ * Finds a segment by its name and shm_key.
+ *
+ * Assumes it is called with the SEGDATA lock taken.
+ */
+static int
+__os_segdata_find_byname(env, name, infop, rp)
+ ENV *env;
+ const char *name;
+ REGINFO *infop;
+ REGION *rp;
+{
+ DB_ENV *dbenv;
+ os_segdata_t *p;
+ long segid;
+ int i;
+
+ dbenv = env->dbenv;
+
+ if (__os_segdata == NULL) {
+ __db_errx(env, DB_STR("0202",
+ "shared memory segment not initialized"));
+ return (EAGAIN);
+ }
+
+ if (name == NULL) {
+ __db_errx(env, DB_STR("0203", "no segment name given"));
+ return (EAGAIN);
+ }
+
+ /*
+ * If we are creating the region, compute the segid.
+ * If we are joining the region, we use the segid in the
+ * index we are given.
+ */
+ if (F_ISSET(infop, REGION_CREATE))
+ segid = dbenv->shm_key + (infop->id - 1);
+ else {
+ if (rp->segid >= __os_segdata_size ||
+ rp->segid == INVALID_REGION_SEGID) {
+ __db_errx(env, DB_STR("0204",
+ "Invalid segment id given"));
+ return (EAGAIN);
+ }
+ segid = __os_segdata[rp->segid].segid;
+ }
+ for (i = 0; i < __os_segdata_size; i++) {
+ p = &__os_segdata[i];
+ if (p->name != NULL && strcmp(name, p->name) == 0 &&
+ p->segid == segid) {
+ infop->addr = p->segment;
+ rp->segid = i;
+ return (0);
+ }
+ }
+ return (ENOENT);
+}
+
+/*
+ * __os_segdata_release --
+ * Free a segdata entry.
+ */
+static int
+__os_segdata_release(env, rp, is_locked)
+ ENV *env;
+ REGION *rp;
+ int is_locked;
+{
+ os_segdata_t *p;
+
+ if (__os_segdata == NULL) {
+ __db_errx(env, DB_STR("0205",
+ "shared memory segment not initialized"));
+ return (EAGAIN);
+ }
+
+ if (rp->segid < 0 || rp->segid >= __os_segdata_size) {
+ __db_errx(env, DB_STR_A("0206",
+ "segment id %ld out of range", "%ld"), rp->segid);
+ return (EINVAL);
+ }
+
+ if (is_locked == 0)
+ DB_BEGIN_SINGLE_THREAD;
+ p = &__os_segdata[rp->segid];
+ if (p->name != NULL) {
+ __os_free(env, p->name);
+ p->name = NULL;
+ }
+ if (p->segment != NULL) {
+ __os_free(env, p->segment);
+ p->segment = NULL;
+ }
+ p->size = 0;
+ if (is_locked == 0)
+ DB_END_SINGLE_THREAD;
+
+ /* Any shrink-table logic could go here */
+
+ return (0);
+}
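
The design described in the comment at the top of this file reduces to a
small find-or-create table. A minimal sketch of that idea with illustrative
names and sizes; unlike the real code there is no locking, no table growth,
and no segment IDs:

    /* Sketch: named "shared" segments as heap memory plus a table. */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    typedef struct {
        void *segment;          /* Segment address. */
        size_t size;            /* Segment size. */
        char *name;             /* Segment name. */
    } seg_t;

    #define TABLE_SIZE 16
    static seg_t table[TABLE_SIZE];

    /* Return an existing segment by name, or create a new one. */
    static void *
    seg_attach(const char *name, size_t size)
    {
        int i, slot;

        for (i = 0, slot = -1; i < TABLE_SIZE; i++) {
            if (table[i].name != NULL &&
                strcmp(table[i].name, name) == 0)
                return (table[i].segment);      /* join */
            if (table[i].name == NULL && slot == -1)
                slot = i;
        }
        if (slot == -1)
            return (NULL);                      /* table full */
        if ((table[slot].segment = calloc(1, size)) == NULL)
            return (NULL);
        table[slot].name = strdup(name);
        table[slot].size = size;
        return (table[slot].segment);
    }

    int
    main(void)
    {
        void *a = seg_attach("mpool", 4096);    /* creates */
        void *b = seg_attach("mpool", 4096);    /* joins: same address */

        printf("create %p, join %p\n", a, b);
        return (0);
    }
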
diff --git a/src/os_vxworks/os_vx_rpath.c b/src/os_vxworks/os_vx_rpath.c
new file mode 100644
index 00000000..1ffd3549
--- /dev/null
+++ b/src/os_vxworks/os_vx_rpath.c
@@ -0,0 +1,55 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+#include "iosLib.h"
+
+/*
+ * __db_rpath --
+ * Return the last path separator in the path or NULL if none found.
+ */
+char *
+__db_rpath(path)
+ const char *path;
+{
+ const char *s, *last;
+ DEV_HDR *dummy;
+ char *ptail;
+
+ /*
+ * VxWorks devices can be rooted at any name. We want to
+ * skip over the device name and not take into account any
+ * PATH_SEPARATOR characters that might be in that name.
+ *
+ * XXX [#2393]
+ * VxWorks supports having a filename directly follow a device
+ * name with no separator. I.e. to access a file 'xxx' in
+ * the top level directory of a device mounted at "mydrive"
+ * you could say "mydrivexxx" or "mydrive/xxx" or "mydrive\xxx".
+ * We do not support the first usage here.
+ * XXX
+ */
+ if ((dummy = iosDevFind(path, (const char**)&ptail)) == NULL)
+ s = path;
+ else
+ s = ptail;
+
+ last = NULL;
+ if (PATH_SEPARATOR[1] != '\0') {
+ for (; s[0] != '\0'; ++s)
+ if (strchr(PATH_SEPARATOR, s[0]) != NULL)
+ last = s;
+ } else
+ for (; s[0] != '\0'; ++s)
+ if (s[0] == PATH_SEPARATOR[0])
+ last = s;
+ return ((char *)last);
+}
diff --git a/src/os_vxworks/os_vx_yield.c b/src/os_vxworks/os_vx_yield.c
new file mode 100644
index 00000000..c7c54cf2
--- /dev/null
+++ b/src/os_vxworks/os_vx_yield.c
@@ -0,0 +1,49 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/* VxWorks API returning the system clock rate in ticks per second. */
+int sysClkRateGet(void);
+
+/*
+ * __os_yield --
+ * Yield the processor, optionally pausing until running again.
+ */
+void
+__os_yield(env, secs, usecs)
+ ENV *env;
+ u_long secs, usecs; /* Seconds and microseconds. */
+{
+ int ticks_delay, ticks_per_second;
+
+ COMPQUIET(env, NULL);
+
+ /* Don't require the values be normalized. */
+ for (; usecs >= US_PER_SEC; usecs -= US_PER_SEC)
+ ++secs;
+
+ /*
+ * Yield the processor so other processes or threads can run.
+ *
+ * As a side effect, taskDelay() moves the calling task to the end of
+ * the ready queue for tasks of the same priority. In particular, you
+ * can yield the CPU to any other tasks of the same priority by
+ * "delaying" for zero clock ticks.
+ *
+ * Never wait less than a tick, if we were supposed to wait at all.
+ */
+ ticks_per_second = sysClkRateGet();
+ ticks_delay =
+ secs * ticks_per_second + (usecs * ticks_per_second) / US_PER_SEC;
+ if (ticks_delay == 0 && (secs != 0 || usecs != 0))
+ ticks_delay = 1;
+ (void)taskDelay(ticks_delay);
+}
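
The conversion above has one edge case that matters: a short but nonzero
pause must never round down to zero ticks, or taskDelay() degenerates into a
pure yield. A small sketch of the arithmetic, assuming an example 60Hz clock
rate:

    /* Sketch of the (secs, usecs) to clock-tick conversion. */
    #include <stdio.h>

    #define US_PER_SEC 1000000L

    static long
    to_ticks(unsigned long secs, unsigned long usecs, long rate)
    {
        long ticks;

        ticks = (long)(secs * rate + (usecs * rate) / US_PER_SEC);
        if (ticks == 0 && (secs != 0 || usecs != 0))
            ticks = 1;          /* wait at least one tick */
        return (ticks);
    }

    int
    main(void)
    {
        /* At 60 ticks/second, a 1ms pause still waits one tick. */
        printf("%ld\n", to_ticks(0, 1000, 60));         /* 1 */
        printf("%ld\n", to_ticks(2, 500000, 60));       /* 150 */
        return (0);
    }
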
diff --git a/src/os_windows/ce_ctime.c b/src/os_windows/ce_ctime.c
new file mode 100644
index 00000000..e8ae76aa
--- /dev/null
+++ b/src/os_windows/ce_ctime.c
@@ -0,0 +1,87 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+static void __os_windows_ct_numb __P((char *, int));
+
+/*
+ * __os_ctime --
+ * Format a time-stamp.
+ */
+char *
+__os_ctime(tod, time_buf)
+ const time_t *tod;
+ char *time_buf;
+{
+ char *ncp;
+ __int64 i64_tod;
+ struct _FILETIME file_tod, file_loc;
+ struct _SYSTEMTIME sys_loc;
+	static const __int64 SECS_BETWEEN_EPOCHS = 11644473600;
+	static const __int64 SECS_TO_100NS = 10000000;	/* 10^7 */
+
+ strcpy(time_buf, "Thu Jan 01 00:00:00 1970");
+ time_buf[CTIME_BUFLEN - 1] = '\0';
+
+ /* Convert the tod to a SYSTEM_TIME struct */
+ i64_tod = *tod;
+ i64_tod = (i64_tod + SECS_BETWEEN_EPOCHS)*SECS_TO_100NS;
+ memcpy(&file_tod, &i64_tod, sizeof(file_tod));
+ FileTimeToLocalFileTime(&file_tod, &file_loc);
+ FileTimeToSystemTime(&file_loc, &sys_loc);
+
+ /*
+ * Convert the _SYSTEMTIME to the correct format in time_buf.
+ * Based closely on the os_brew/ctime.c implementation.
+ *
+	 * wDayOfWeek: day of the week, 0-6 (0=Sunday, 6=Saturday).
+	 */
+	ncp = &"SunMonTueWedThuFriSat"[sys_loc.wDayOfWeek * 3];
+ time_buf[0] = *ncp++;
+ time_buf[1] = *ncp++;
+ time_buf[2] = *ncp;
+ ncp = &"JanFebMarAprMayJunJulAugSepOctNovDec"[(sys_loc.wMonth - 1) * 3];
+ time_buf[4] = *ncp++;
+ time_buf[5] = *ncp++;
+ time_buf[6] = *ncp;
+
+ __os_windows_ct_numb(time_buf + 8, sys_loc.wDay);
+ /* Add 100 to keep the leading zero. */
+ __os_windows_ct_numb(time_buf + 11, sys_loc.wHour + 100);
+ __os_windows_ct_numb(time_buf + 14, sys_loc.wMinute + 100);
+ __os_windows_ct_numb(time_buf + 17, sys_loc.wSecond + 100);
+
+ if (sys_loc.wYear < 100) { /* 9 99 */
+ time_buf[20] = ' ';
+ time_buf[21] = ' ';
+ __os_windows_ct_numb(time_buf + 22, sys_loc.wYear);
+ } else { /* 99 1999 */
+ __os_windows_ct_numb(time_buf + 20, sys_loc.wYear / 100);
+ __os_windows_ct_numb(time_buf + 22, sys_loc.wYear % 100 + 100);
+ }
+
+ return (time_buf);
+}
+
+/*
+ * __os_windows_ct_numb --
+ * Append ASCII representations for two digits to a string.
+ */
+static void
+__os_windows_ct_numb(cp, n)
+ char *cp;
+ int n;
+{
+ cp[0] = ' ';
+ if (n >= 10)
+ cp[0] = (n / 10) % 10 + '0';
+ cp[1] = n % 10 + '0';
+}
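
The constants above encode the standard Unix-to-FILETIME conversion: add the
11,644,473,600 seconds between Jan 1, 1601 and Jan 1, 1970, then scale to
100-nanosecond units. A plain C99 sketch of the round trip, without the
Win32 types:

    /* Sketch of Unix time_t <-> FILETIME epoch arithmetic. */
    #include <stdio.h>
    #include <stdint.h>

    #define SECS_BETWEEN_EPOCHS 11644473600LL
    #define SECS_TO_100NS 10000000LL        /* 10^7 */

    int
    main(void)
    {
        int64_t unix_secs, filetime, back;

        unix_secs = 0;                  /* Jan 1, 1970 00:00:00 UTC */
        filetime = (unix_secs + SECS_BETWEEN_EPOCHS) * SECS_TO_100NS;
        back = filetime / SECS_TO_100NS - SECS_BETWEEN_EPOCHS;

        printf("FILETIME at the Unix epoch: %lld\n", (long long)filetime);
        printf("round trip: %lld\n", (long long)back);
        return (0);
    }
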
diff --git a/src/os_windows/os_abs.c b/src/os_windows/os_abs.c
new file mode 100644
index 00000000..e769ab2c
--- /dev/null
+++ b/src/os_windows/os_abs.c
@@ -0,0 +1,33 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_abspath --
+ * Return if a path is an absolute path.
+ */
+int
+__os_abspath(path)
+ const char *path;
+{
+ /*
+ * !!!
+ * Check for drive specifications, e.g., "C:". In addition, the path
+ * separator used by the win32 DB (PATH_SEPARATOR) is \; look for both
+ * / and \ since these are user-input paths.
+ */
+ if (strlen(path) == 0)
+ return (0);
+
+ if (strlen(path) >= 3 && isalpha(path[0]) && path[1] == ':')
+ path += 2;
+ return (path[0] == '/' || path[0] == '\\');
+}
diff --git a/src/os_windows/os_clock.c b/src/os_windows/os_clock.c
new file mode 100644
index 00000000..e548729b
--- /dev/null
+++ b/src/os_windows/os_clock.c
@@ -0,0 +1,79 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_gettime --
+ * Return the current time-of-day clock in seconds and nanoseconds.
+ */
+void
+__os_gettime(env, tp, monotonic)
+ ENV *env;
+ db_timespec *tp;
+ int monotonic;
+{
+ if (monotonic) {
+ /*
+ * The elapsed time is stored as a DWORD value, so time wraps
+ * around to zero if the system runs for 49.7 days. Initialize
+ * a base value with 50 days worth of seconds, and add 50 more
+ * days every time the counter wraps. That ensures we always
+ * move forward.
+ *
+ * It's possible this code could race, but the danger is we
+ * would increment base_seconds more than once per wrap and
+ * eventually overflow, which is a pretty remote possibility.
+ */
+#define TIMER_WRAP_SECONDS (50 * 24 * 60 * 60)
+ static DWORD last_ticks;
+ static time_t base_seconds;
+ DWORD ticks;
+
+ ticks = GetTickCount();
+ if (ticks < last_ticks)
+ base_seconds += TIMER_WRAP_SECONDS;
+ last_ticks = ticks;
+ tp->tv_sec = base_seconds + (u_int32_t)(ticks / 1000);
+ tp->tv_nsec = (u_int32_t)((ticks % 1000) * NS_PER_MS);
+ } else {
+#ifdef DB_WINCE
+ FILETIME ft;
+ LARGE_INTEGER large_int;
+ LONGLONG ns_since_epoch, utc1970;
+ SYSTEMTIME st;
+
+ (void)GetSystemTime(&st);
+ (void)SystemTimeToFileTime(&st, &ft);
+
+ /*
+ * A FILETIME expresses time as 100 nanosecond chunks from
+ * Jan 1, 1601; convert to a timespec where the time is
+		 * expressed in seconds and nanoseconds from Jan 1, 1970.
+ *
+ * UTC_1970 is the number of 100-nano-second chunks from
+ * 1601 to 1970.
+ */
+#define NS100_PER_SEC (NS_PER_SEC / 100)
+#define UTC_1970 (((LONGLONG)27111902 << 32) + (LONGLONG)3577643008)
+ memcpy(&large_int, &ft, sizeof(large_int));
+ utc1970 = UTC_1970;
+ ns_since_epoch = (large_int.QuadPart - utc1970);
+ tp->tv_sec = (time_t)(ns_since_epoch / NS100_PER_SEC);
+ tp->tv_nsec = (long)(ns_since_epoch % NS100_PER_SEC);
+#else
+ struct _timeb now;
+
+ _ftime(&now);
+ tp->tv_sec = now.time;
+ tp->tv_nsec = now.millitm * NS_PER_MS;
+#endif
+ }
+}
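
The wrap compensation in the monotonic branch can be exercised without
waiting 49.7 days by feeding tick values in by hand. A sketch of the same
logic with GetTickCount() replaced by an argument:

    /* Sketch of the 32-bit tick-counter wrap handling. */
    #include <stdio.h>
    #include <stdint.h>

    #define TIMER_WRAP_SECONDS (50 * 24 * 60 * 60)

    static uint32_t last_ticks;
    static long long base_seconds;

    static long long
    monotonic_seconds(uint32_t ticks)
    {
        if (ticks < last_ticks)         /* the counter wrapped */
            base_seconds += TIMER_WRAP_SECONDS;
        last_ticks = ticks;
        return (base_seconds + ticks / 1000);
    }

    int
    main(void)
    {
        /* Second call is "after" the first despite the smaller ticks. */
        printf("%lld\n", monotonic_seconds(4294967000U));
        printf("%lld\n", monotonic_seconds(5000));
        return (0);
    }
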
diff --git a/src/os_windows/os_config.c b/src/os_windows/os_config.c
new file mode 100644
index 00000000..4250dbd4
--- /dev/null
+++ b/src/os_windows/os_config.c
@@ -0,0 +1,133 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_is_winnt --
+ * Return 1 if Windows/NT, otherwise 0.
+ *
+ * PUBLIC: int __os_is_winnt __P((void));
+ */
+int
+__os_is_winnt()
+{
+#ifdef DB_WINCE
+ return (1);
+#else
+ static int __os_type = -1;
+
+ /*
+ * The value of __os_type is computed only once, and cached to
+ * avoid the overhead of repeated calls to GetVersion().
+ */
+ if (__os_type == -1) {
+ if ((GetVersion() & 0x80000000) == 0)
+ __os_type = 1;
+ else
+ __os_type = 0;
+ }
+ return (__os_type);
+#endif
+}
+
+/*
+ * __os_fs_notzero --
+ * Return 1 if allocated filesystem blocks are not zeroed.
+ */
+int
+__os_fs_notzero()
+{
+#ifdef DB_WINCE
+ return (1);
+#else
+ static int __os_notzero = -1;
+ OSVERSIONINFO osvi;
+
+ /*
+ * Windows/NT zero-fills pages that were never explicitly written to
+ * the file. Note however that this is *NOT* documented. In fact, the
+ * Win32 documentation makes it clear that there are no guarantees that
+ * uninitialized bytes will be zeroed:
+ *
+ * If the file is extended, the contents of the file between the old
+ * EOF position and the new position are not defined.
+ *
+ * Experiments confirm that NT/2K/XP all zero fill for both NTFS and
+ * FAT32. Cygwin also relies on this behavior. This is the relevant
+ * comment from Cygwin:
+ *
+ * Oops, this is the bug case - Win95 uses whatever is on the disk
+ * instead of some known (safe) value, so we must seek back and fill
+ * in the gap with zeros. - DJ
+ * Note: this bug doesn't happen on NT4, even though the
+ * documentation for WriteFile() says that it *may* happen on any OS.
+ *
+ * We're making a bet, here, but we made it a long time ago and haven't
+ * yet seen any evidence that it was wrong.
+ *
+ * Windows 95/98 and On-Time give random garbage, and that breaks
+ * Berkeley DB.
+ *
+ * The value of __os_notzero is computed only once, and cached to
+ * avoid the overhead of repeated calls to GetVersion().
+ */
+ if (__os_notzero == -1) {
+ if (__os_is_winnt()) {
+ osvi.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
+ GetVersionEx(&osvi);
+ if (_tcscmp(osvi.szCSDVersion, _T("RTTarget-32")) == 0)
+ __os_notzero = 1; /* On-Time */
+ else
+ __os_notzero = 0; /* Windows/NT */
+ } else
+ __os_notzero = 1; /* Not Windows/NT */
+ }
+ return (__os_notzero);
+#endif
+}
+
+/*
+ * __os_support_direct_io --
+ * Check to see if we support direct I/O.
+ */
+int
+__os_support_direct_io()
+{
+ return (1);
+}
+/*
+ * __os_support_db_register --
+ * Return 1 if the system supports DB_REGISTER.
+ */
+int
+__os_support_db_register()
+{
+#ifdef DB_WINCE
+ return (0);
+#else
+ return (__os_is_winnt());
+#endif
+}
+
+/*
+ * __os_support_replication --
+ * Return 1 if the system supports replication.
+ */
+int
+__os_support_replication()
+{
+#ifdef DB_WINCE
+ return (0);
+#else
+ return (__os_is_winnt());
+#endif
+}
diff --git a/src/os_windows/os_cpu.c b/src/os_windows/os_cpu.c
new file mode 100644
index 00000000..0922071f
--- /dev/null
+++ b/src/os_windows/os_cpu.c
@@ -0,0 +1,27 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_cpu_count --
+ * Return the number of CPUs.
+ *
+ * PUBLIC: u_int32_t __os_cpu_count __P((void));
+ */
+u_int32_t
+__os_cpu_count()
+{
+ SYSTEM_INFO SystemInfo;
+
+ GetSystemInfo(&SystemInfo);
+
+ return ((u_int32_t)SystemInfo.dwNumberOfProcessors);
+}
diff --git a/src/os_windows/os_dir.c b/src/os_windows/os_dir.c
new file mode 100644
index 00000000..31d364d7
--- /dev/null
+++ b/src/os_windows/os_dir.c
@@ -0,0 +1,122 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_dirlist --
+ * Return a list of the files in a directory.
+ */
+int
+__os_dirlist(env, dir, returndir, namesp, cntp)
+ ENV *env;
+ const char *dir;
+ int returndir, *cntp;
+ char ***namesp;
+{
+ HANDLE dirhandle;
+ WIN32_FIND_DATA fdata;
+ int arraysz, cnt, ret;
+ char **names, *onename;
+ _TCHAR tfilespec[DB_MAXPATHLEN + 1];
+ _TCHAR *tdir;
+
+ *namesp = NULL;
+ *cntp = 0;
+
+ TO_TSTRING(env, dir, tdir, ret);
+ if (ret != 0)
+ return (ret);
+
+ (void)_sntprintf(tfilespec, DB_MAXPATHLEN,
+ _T("%s%hc*"), tdir, PATH_SEPARATOR[0]);
+
+ /*
+	 * On WinCE, FindFirstFile returns INVALID_HANDLE_VALUE when the
+	 * searched directory is empty and sets the last error to
+	 * ERROR_NO_MORE_FILES; on Windows, the search returns "." instead.
+ */
+ if ((dirhandle =
+ FindFirstFile(tfilespec, &fdata)) == INVALID_HANDLE_VALUE) {
+ if (GetLastError() == ERROR_NO_MORE_FILES)
+ return (0);
+ return (__os_posix_err(__os_get_syserr()));
+ }
+
+ names = NULL;
+ arraysz = cnt = ret = 0;
+ for (;;) {
+ if (returndir ||
+ (fdata.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) == 0) {
+ if (fdata.cFileName[0] == _T('.') &&
+ (fdata.cFileName[1] == _T('\0') ||
+ (fdata.cFileName[1] == _T('.') &&
+ fdata.cFileName[2] == _T('\0'))))
+ goto next;
+ if (cnt >= arraysz) {
+ arraysz += 100;
+ if ((ret = __os_realloc(env,
+ arraysz * sizeof(names[0]), &names)) != 0)
+ goto err;
+ }
+ /*
+ * FROM_TSTRING doesn't necessarily allocate new
+ * memory, so we must do that explicitly.
+ * Unfortunately, when compiled with UNICODE, we'll
+ * copy twice.
+ */
+ FROM_TSTRING(env, fdata.cFileName, onename, ret);
+ if (ret != 0)
+ goto err;
+ ret = __os_strdup(env, onename, &names[cnt]);
+ FREE_STRING(env, onename);
+ if (ret != 0)
+ goto err;
+ cnt++;
+ }
+next:
+ if (!FindNextFile(dirhandle, &fdata)) {
+ if (GetLastError() == ERROR_NO_MORE_FILES)
+ break;
+ else {
+ ret = __os_posix_err(__os_get_syserr());
+ goto err;
+ }
+ }
+ }
+
+err: if (!FindClose(dirhandle) && ret == 0)
+ ret = __os_posix_err(__os_get_syserr());
+
+ if (ret == 0) {
+ *namesp = names;
+ *cntp = cnt;
+ } else if (names != NULL)
+ __os_dirfree(env, names, cnt);
+
+ FREE_STRING(env, tdir);
+
+ return (ret);
+}
+
+/*
+ * __os_dirfree --
+ * Free the list of files.
+ */
+void
+__os_dirfree(env, names, cnt)
+ ENV *env;
+ char **names;
+ int cnt;
+{
+ while (cnt > 0)
+ __os_free(env, names[--cnt]);
+ __os_free(env, names);
+}
diff --git a/src/os_windows/os_errno.c b/src/os_windows/os_errno.c
new file mode 100644
index 00000000..ba8ec359
--- /dev/null
+++ b/src/os_windows/os_errno.c
@@ -0,0 +1,428 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_get_errno_ret_zero --
+ * Return the last system error, including an error of zero.
+ */
+int
+__os_get_errno_ret_zero()
+{
+ /* This routine must be able to return the same value repeatedly. */
+ return (errno);
+}
+
+/*
+ * We've seen cases where system calls failed but no error value was ever
+ * set. For that reason, __os_get_errno() and __os_get_syserr() force a
+ * non-zero value (EAGAIN and ERROR_RETRY, respectively) if one is not
+ * already set, to work around the problem. For obvious reasons, we can
+ * only call these functions if we know an error has occurred, that is,
+ * we can't test the return for a non-zero value after the get call.
+ *
+ * __os_get_errno --
+ * Return the last ANSI C "errno" value or EAGAIN if the last error
+ * is zero.
+ */
+int
+__os_get_errno()
+{
+ /* This routine must be able to return the same value repeatedly. */
+ if (errno == 0)
+ __os_set_errno(EAGAIN);
+ return (errno);
+}
+
+#ifdef HAVE_REPLICATION_THREADS
+/*
+ * __os_get_neterr --
+ * Return the last networking error or EAGAIN if the last error is zero.
+ *
+ * PUBLIC: #ifdef HAVE_REPLICATION_THREADS
+ * PUBLIC: int __os_get_neterr __P((void));
+ * PUBLIC: #endif
+ */
+int
+__os_get_neterr()
+{
+ int err;
+
+ /* This routine must be able to return the same value repeatedly. */
+ err = WSAGetLastError();
+ if (err == 0)
+ WSASetLastError(err = ERROR_RETRY);
+ return (err);
+}
+#endif
+
+/*
+ * __os_get_syserr --
+ * Return the last system error or EAGAIN if the last error is zero.
+ */
+int
+__os_get_syserr()
+{
+ int err;
+
+ /* This routine must be able to return the same value repeatedly. */
+ err = GetLastError();
+ if (err == 0)
+ SetLastError(err = ERROR_RETRY);
+ return (err);
+}
+
+/*
+ * __os_set_errno --
+ * Set the value of errno.
+ */
+void
+__os_set_errno(evalue)
+ int evalue;
+{
+ /*
+ * This routine is called by the compatibility interfaces (DB 1.85,
+ * dbm and hsearch). Force values > 0, that is, not one of DB 2.X
+ * and later's public error returns. If something bad has happened,
+ * default to EFAULT -- a nasty return. Otherwise, default to EINVAL.
+	 * The compatibility APIs aren't included on Windows, so the Windows
+	 * version of this routine doesn't strictly need this behavior; we
+	 * keep the same mapping for consistency.
+ */
+ errno =
+ evalue >= 0 ? evalue : (evalue == DB_RUNRECOVERY ? EFAULT : EINVAL);
+}
+
+/*
+ * __os_strerror --
+ * Return a string associated with the system error.
+ */
+char *
+__os_strerror(error, buf, len)
+ int error;
+ char *buf;
+ size_t len;
+{
+#ifdef DB_WINCE
+#define MAX_TMPBUF_LEN 512
+ _TCHAR tbuf[MAX_TMPBUF_LEN];
+ size_t maxlen;
+
+ DB_ASSERT(NULL, error != 0);
+
+ memset(tbuf, 0, sizeof(_TCHAR)*MAX_TMPBUF_LEN);
+ maxlen = (len > MAX_TMPBUF_LEN ? MAX_TMPBUF_LEN : len);
+ FormatMessage(FORMAT_MESSAGE_FROM_SYSTEM, 0, (DWORD)error,
+ 0, tbuf, maxlen-1, NULL);
+
+	if (WideCharToMultiByte(CP_UTF8, 0, tbuf, -1,
+	    buf, (int)len, 0, NULL) == 0)
+		strncpy(buf, DB_STR("0035",
+		    "Error message translation failed."), len - 1);
+	/* strncpy does not guarantee null-termination. */
+	buf[len - 1] = '\0';
+#else
+ DB_ASSERT(NULL, error != 0);
+ /*
+ * Explicitly call FormatMessageA, since we want to receive a char
+ * string back, not a tchar string.
+ */
+ FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM,
+ 0, (DWORD)error, 0, buf, (DWORD)(len - 1), NULL);
+ buf[len - 1] = '\0';
+#endif
+
+ return (buf);
+}
+
+/*
+ * __os_posix_err --
+ * Convert a system error to a POSIX error.
+ */
+int
+__os_posix_err(error)
+ int error;
+{
+ /* Handle calls on successful returns. */
+ if (error == 0)
+ return (0);
+
+ /*
+ * Translate the Windows error codes we care about.
+ */
+ switch (error) {
+ case ERROR_INVALID_PARAMETER:
+ return (EINVAL);
+
+ case ERROR_FILE_NOT_FOUND:
+ case ERROR_INVALID_DRIVE:
+ case ERROR_PATH_NOT_FOUND:
+ return (ENOENT);
+
+ case ERROR_NO_MORE_FILES:
+ case ERROR_TOO_MANY_OPEN_FILES:
+ return (EMFILE);
+
+ case ERROR_ACCESS_DENIED:
+ return (EPERM);
+
+ case ERROR_INVALID_HANDLE:
+ return (EBADF);
+
+ case ERROR_NOT_ENOUGH_MEMORY:
+ return (ENOMEM);
+
+ case ERROR_DISK_FULL:
+ return (ENOSPC);
+
+ case ERROR_ARENA_TRASHED:
+ case ERROR_BAD_COMMAND:
+ case ERROR_BAD_ENVIRONMENT:
+ case ERROR_BAD_FORMAT:
+ case ERROR_GEN_FAILURE:
+ case ERROR_INVALID_ACCESS:
+ case ERROR_INVALID_BLOCK:
+ case ERROR_INVALID_DATA:
+ case ERROR_READ_FAULT:
+ case ERROR_WRITE_FAULT:
+ return (EFAULT);
+
+ case ERROR_ALREADY_EXISTS:
+ case ERROR_FILE_EXISTS:
+ return (EEXIST);
+
+ case ERROR_NOT_SAME_DEVICE:
+ return (EXDEV);
+
+ case ERROR_WRITE_PROTECT:
+ return (EACCES);
+
+ case ERROR_LOCK_FAILED:
+ case ERROR_LOCK_VIOLATION:
+ case ERROR_NOT_READY:
+ case ERROR_SHARING_VIOLATION:
+ return (EBUSY);
+
+ case ERROR_RETRY:
+ return (EINTR);
+ }
+
+ /*
+ * Translate the Windows socket error codes.
+ */
+ switch (error) {
+ case WSAEADDRINUSE:
+#ifdef EADDRINUSE
+ return (EADDRINUSE);
+#else
+ break;
+#endif
+ case WSAEADDRNOTAVAIL:
+#ifdef EADDRNOTAVAIL
+ return (EADDRNOTAVAIL);
+#else
+ break;
+#endif
+ case WSAEAFNOSUPPORT:
+#ifdef EAFNOSUPPORT
+ return (EAFNOSUPPORT);
+#else
+ break;
+#endif
+ case WSAEALREADY:
+#ifdef EALREADY
+ return (EALREADY);
+#else
+ break;
+#endif
+ case WSAEBADF:
+ return (EBADF);
+ case WSAECONNABORTED:
+#ifdef ECONNABORTED
+ return (ECONNABORTED);
+#else
+ break;
+#endif
+ case WSAECONNREFUSED:
+#ifdef ECONNREFUSED
+ return (ECONNREFUSED);
+#else
+ break;
+#endif
+ case WSAECONNRESET:
+#ifdef ECONNRESET
+ return (ECONNRESET);
+#else
+ break;
+#endif
+ case WSAEDESTADDRREQ:
+#ifdef EDESTADDRREQ
+ return (EDESTADDRREQ);
+#else
+ break;
+#endif
+ case WSAEFAULT:
+ return (EFAULT);
+ case WSAEHOSTDOWN:
+#ifdef EHOSTDOWN
+ return (EHOSTDOWN);
+#else
+ break;
+#endif
+ case WSAEHOSTUNREACH:
+#ifdef EHOSTUNREACH
+ return (EHOSTUNREACH);
+#else
+ break;
+#endif
+ case WSAEINPROGRESS:
+#ifdef EINPROGRESS
+ return (EINPROGRESS);
+#else
+ break;
+#endif
+ case WSAEINTR:
+ return (EINTR);
+ case WSAEINVAL:
+ return (EINVAL);
+ case WSAEISCONN:
+#ifdef EISCONN
+ return (EISCONN);
+#else
+ break;
+#endif
+ case WSAELOOP:
+#ifdef ELOOP
+ return (ELOOP);
+#else
+ break;
+#endif
+ case WSAEMFILE:
+ return (EMFILE);
+ case WSAEMSGSIZE:
+#ifdef EMSGSIZE
+ return (EMSGSIZE);
+#else
+ break;
+#endif
+ case WSAENAMETOOLONG:
+ return (ENAMETOOLONG);
+ case WSAENETDOWN:
+#ifdef ENETDOWN
+ return (ENETDOWN);
+#else
+ break;
+#endif
+ case WSAENETRESET:
+#ifdef ENETRESET
+ return (ENETRESET);
+#else
+ break;
+#endif
+ case WSAENETUNREACH:
+#ifdef ENETUNREACH
+ return (ENETUNREACH);
+#else
+ break;
+#endif
+ case WSAENOBUFS:
+#ifdef ENOBUFS
+ return (ENOBUFS);
+#else
+ break;
+#endif
+ case WSAENOPROTOOPT:
+#ifdef ENOPROTOOPT
+ return (ENOPROTOOPT);
+#else
+ break;
+#endif
+ case WSAENOTCONN:
+#ifdef ENOTCONN
+ return (ENOTCONN);
+#else
+ break;
+#endif
+ case WSANOTINITIALISED:
+ return (EAGAIN);
+ case WSAENOTSOCK:
+#ifdef ENOTSOCK
+ return (ENOTSOCK);
+#else
+ break;
+#endif
+ case WSAEOPNOTSUPP:
+ return (DB_OPNOTSUP);
+ case WSAEPFNOSUPPORT:
+#ifdef EPFNOSUPPORT
+ return (EPFNOSUPPORT);
+#else
+ break;
+#endif
+ case WSAEPROTONOSUPPORT:
+#ifdef EPROTONOSUPPORT
+ return (EPROTONOSUPPORT);
+#else
+ break;
+#endif
+ case WSAEPROTOTYPE:
+#ifdef EPROTOTYPE
+ return (EPROTOTYPE);
+#else
+ break;
+#endif
+ case WSAESHUTDOWN:
+#ifdef ESHUTDOWN
+ return (ESHUTDOWN);
+#else
+ break;
+#endif
+ case WSAESOCKTNOSUPPORT:
+#ifdef ESOCKTNOSUPPORT
+ return (ESOCKTNOSUPPORT);
+#else
+ break;
+#endif
+ case WSAETIMEDOUT:
+#ifdef ETIMEDOUT
+ return (ETIMEDOUT);
+#else
+ break;
+#endif
+ case WSAETOOMANYREFS:
+#ifdef ETOOMANYREFS
+ return (ETOOMANYREFS);
+#else
+ break;
+#endif
+ case WSAEWOULDBLOCK:
+#ifdef EWOULDBLOCK
+ return (EWOULDBLOCK);
+#else
+ return (EAGAIN);
+#endif
+ case WSAHOST_NOT_FOUND:
+#ifdef EHOSTUNREACH
+ return (EHOSTUNREACH);
+#else
+ break;
+#endif
+ case WSASYSNOTREADY:
+ return (EAGAIN);
+ case WSATRY_AGAIN:
+ return (EAGAIN);
+ case WSAVERNOTSUPPORTED:
+ return (DB_OPNOTSUP);
+ case WSAEACCES:
+ return (EACCES);
+ }
+
+ /*
+ * EFAULT is the default if we don't have a translation.
+ */
+ return (EFAULT);
+}
diff --git a/src/os_windows/os_fid.c b/src/os_windows/os_fid.c
new file mode 100644
index 00000000..f2d190b1
--- /dev/null
+++ b/src/os_windows/os_fid.c
@@ -0,0 +1,129 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_fileid --
+ * Return a unique identifier for a file.
+ */
+int
+__os_fileid(env, fname, unique_okay, fidp)
+ ENV *env;
+ const char *fname;
+ int unique_okay;
+ u_int8_t *fidp;
+{
+ pid_t pid;
+ size_t i;
+ u_int32_t tmp;
+ u_int8_t *p;
+ int ret;
+
+ /*
+ * The documentation for GetFileInformationByHandle() states that the
+ * inode-type numbers are not constant between processes. Actually,
+ * they are, they're the NTFS MFT indexes. So, this works on NTFS,
+ * but perhaps not on other platforms, and perhaps not over a network.
+ * Can't think of a better solution right now.
+ */
+ DB_FH *fhp;
+ BY_HANDLE_FILE_INFORMATION fi;
+ BOOL retval = FALSE;
+
+ DB_ASSERT(env, fname != NULL);
+
+ /* Clear the buffer. */
+ memset(fidp, 0, DB_FILE_ID_LEN);
+
+ /*
+ * First we open the file, because we're not given a handle to it.
+ * If we can't open it, we're in trouble.
+ */
+ if ((ret = __os_open(env, fname, 0,
+ DB_OSO_RDONLY, DB_MODE_400, &fhp)) != 0)
+ return (ret);
+
+ /* File open, get its info */
+ if ((retval = GetFileInformationByHandle(fhp->handle, &fi)) == FALSE)
+ ret = __os_get_syserr();
+ (void)__os_closehandle(env, fhp);
+
+ if (retval == FALSE)
+ return (__os_posix_err(ret));
+
+ /*
+ * We want the three 32-bit words which tell us the volume ID and
+ * the file ID. We make a crude attempt to copy the bytes over to
+ * the callers buffer.
+ *
+ * We don't worry about byte sexing or the actual variable sizes.
+ *
+ * When this routine is called from the DB access methods, it's only
+ * called once -- whatever ID is generated when a database is created
+ * is stored in the database file's metadata, and that is what is
+ * saved in the mpool region's information to uniquely identify the
+ * file.
+ *
+ * When called from the mpool layer this routine will be called each
+ * time a new thread of control wants to share the file, which makes
+ * things tougher. As far as byte sexing goes, since the mpool region
+ * lives on a single host, there's no issue of that -- the entire
+ * region is byte sex dependent. As far as variable sizes go, we make
+ * the simplifying assumption that 32-bit and 64-bit processes will
+ * get the same 32-bit values if we truncate any returned 64-bit value
+ * to a 32-bit value.
+ */
+ tmp = (u_int32_t)fi.nFileIndexLow;
+ for (p = (u_int8_t *)&tmp, i = sizeof(u_int32_t); i > 0; --i)
+ *fidp++ = *p++;
+ tmp = (u_int32_t)fi.nFileIndexHigh;
+ for (p = (u_int8_t *)&tmp, i = sizeof(u_int32_t); i > 0; --i)
+ *fidp++ = *p++;
+
+ if (unique_okay) {
+ /* Add in 32-bits of (hopefully) unique number. */
+ __os_unique_id(env, &tmp);
+ for (p = (u_int8_t *)&tmp, i = sizeof(u_int32_t); i > 0; --i)
+ *fidp++ = *p++;
+
+ /*
+ * Initialize/increment the serial number we use to help
+ * avoid fileid collisions. Note we don't bother with
+ * locking; it's unpleasant to do from down in here, and
+ * if we race on this no real harm will be done, since the
+ * finished fileid has so many other components.
+ *
+ * We use the bottom 32-bits of the process ID, hoping they
+ * are more random than the top 32-bits (should we be on a
+ * machine with 64-bit process IDs).
+ *
+ * We increment by 100000 on each call as a simple way of
+ * randomizing; simply incrementing seems potentially less
+ * useful if pids are also simply incremented, since this
+ * is process-local and we may be one of a set of processes
+ * starting up. 100000 pushes us out of pid space on most
+ * 32-bit platforms, and has few interesting properties in
+ * base 2.
+ */
+ if (DB_GLOBAL(fid_serial) == 0) {
+ __os_id(env->dbenv, &pid, NULL);
+ DB_GLOBAL(fid_serial) = (u_int32_t)pid;
+ } else
+			DB_GLOBAL(fid_serial) += 100000;
+
+		/* Fold the serial number into the fileid. */
+		for (p = (u_int8_t *)&DB_GLOBAL(fid_serial),
+		    i = sizeof(u_int32_t); i > 0; --i)
+			*fidp++ = *p++;
+
+ } else {
+ tmp = (u_int32_t)fi.dwVolumeSerialNumber;
+ for (p = (u_int8_t *)&tmp, i = sizeof(u_int32_t); i > 0; --i)
+ *fidp++ = *p++;
+ }
+
+ return (0);
+}
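
For reference, the fileid assembled above is a 20-byte buffer
(DB_FILE_ID_LEN) filled with 32-bit words copied byte-by-byte in native
order. A sketch of the layout with made-up values:

    /* Sketch of the fileid layout: 32-bit slots, 16 of 20 bytes used. */
    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    #define FILE_ID_LEN 20

    static void
    put32(uint8_t **fidp, uint32_t v)
    {
        memcpy(*fidp, &v, sizeof(v));   /* native byte order, as in DB */
        *fidp += sizeof(v);
    }

    int
    main(void)
    {
        uint8_t fid[FILE_ID_LEN], *p;
        int i;

        memset(fid, 0, sizeof(fid));
        p = fid;
        put32(&p, 0x00001234);          /* nFileIndexLow (example) */
        put32(&p, 0x00000001);          /* nFileIndexHigh (example) */
        put32(&p, 0xdeadbeef);          /* unique id (example) */
        put32(&p, 100000);              /* fid_serial (example) */

        for (i = 0; i < FILE_ID_LEN; i++)
            printf("%02x%c", fid[i], i == FILE_ID_LEN - 1 ? '\n' : ' ');
        return (0);
    }
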
diff --git a/src/os_windows/os_flock.c b/src/os_windows/os_flock.c
new file mode 100644
index 00000000..cb3e4986
--- /dev/null
+++ b/src/os_windows/os_flock.c
@@ -0,0 +1,90 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_fdlock --
+ * Acquire/release a lock on a byte in a file.
+ */
+int
+__os_fdlock(env, fhp, offset, acquire, nowait)
+ ENV *env;
+ DB_FH *fhp;
+ int acquire, nowait;
+ off_t offset;
+{
+#ifdef DB_WINCE
+ /*
+ * This functionality is not supported by WinCE, so just fail.
+ *
+ * Should only happen if an app attempts to open an environment
+ * with the DB_REGISTER flag.
+ */
+ __db_errx(env, DB_STR("0019",
+ "fdlock API not implemented for WinCE, DB_REGISTER "
+ "environment flag not supported."));
+ return (EFAULT);
+#else
+ DWORD low, high;
+ DB_ENV *dbenv;
+ OVERLAPPED over;
+ int ret;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+
+ DB_ASSERT(env,
+ F_ISSET(fhp, DB_FH_OPENED) && fhp->handle != INVALID_HANDLE_VALUE);
+
+ if (dbenv != NULL && FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0020",
+ "fileops: flock %s %s offset %lu", "%s %s %lu"), fhp->name,
+ acquire ? DB_STR_P("acquire"): DB_STR_P("release"),
+ (u_long)offset);
+
+ /*
+ * Windows file locking interferes with read/write operations, so we
+ * map the ranges to an area past the end of the file.
+ */
+ DB_ASSERT(env, offset < (u_int64_t)INT64_MAX);
+ offset = UINT64_MAX - offset;
+ low = (DWORD)offset;
+ high = (DWORD)(offset >> 32);
+
+ if (acquire) {
+ if (nowait)
+ RETRY_CHK_EINTR_ONLY(
+ !LockFile(fhp->handle, low, high, 1, 0), ret);
+ else if (__os_is_winnt()) {
+ memset(&over, 0, sizeof(over));
+ over.Offset = low;
+ over.OffsetHigh = high;
+ RETRY_CHK_EINTR_ONLY(
+ !LockFileEx(fhp->handle, LOCKFILE_EXCLUSIVE_LOCK,
+ 0, 1, 0, &over),
+ ret);
+ } else {
+ /* Windows 9x/ME doesn't support a blocking call. */
+ for (;;) {
+ RETRY_CHK_EINTR_ONLY(
+ !LockFile(fhp->handle, low, high, 1, 0),
+ ret);
+ if (__os_posix_err(ret) != EAGAIN)
+ break;
+ __os_yield(env, 1, 0);
+ }
+ }
+ } else
+ RETRY_CHK_EINTR_ONLY(
+ !UnlockFile(fhp->handle, low, high, 1, 0), ret);
+
+ return (__os_posix_err(ret));
+#endif
+}
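
The range mapping above keeps lock bytes away from real file data by
reflecting the offset to the top of the 64-bit space and splitting it into
the two DWORDs that LockFile() expects. A small sketch of the arithmetic:

    /* Sketch of the lock-offset reflection and DWORD split. */
    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
        uint64_t offset;
        uint32_t low, high;

        offset = 42;                    /* logical lock slot */
        offset = UINT64_MAX - offset;   /* map past any file data */
        low = (uint32_t)offset;
        high = (uint32_t)(offset >> 32);

        printf("lock byte: low=%#lx high=%#lx\n",
            (unsigned long)low, (unsigned long)high);
        return (0);
    }
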
diff --git a/src/os_windows/os_fsync.c b/src/os_windows/os_fsync.c
new file mode 100644
index 00000000..8824aac1
--- /dev/null
+++ b/src/os_windows/os_fsync.c
@@ -0,0 +1,44 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_fsync --
+ * Flush a file descriptor.
+ */
+int
+__os_fsync(env, fhp)
+ ENV *env;
+ DB_FH *fhp;
+{
+ DB_ENV *dbenv;
+ int ret;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+
+ /*
+ * Do nothing if the file descriptor has been marked as not requiring
+ * any sync to disk.
+ */
+ if (F_ISSET(fhp, DB_FH_NOSYNC))
+ return (0);
+
+ if (dbenv != NULL && FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0023",
+ "fileops: flush %s", "%s"), fhp->name);
+
+ RETRY_CHK((!FlushFileBuffers(fhp->handle)), ret);
+ if (ret != 0) {
+ __db_syserr(env, ret, DB_STR("0024", "FlushFileBuffers"));
+ ret = __os_posix_err(ret);
+ }
+ return (ret);
+}
diff --git a/src/os_windows/os_getenv.c b/src/os_windows/os_getenv.c
new file mode 100644
index 00000000..aad59d01
--- /dev/null
+++ b/src/os_windows/os_getenv.c
@@ -0,0 +1,103 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_getenv --
+ * Retrieve an environment variable.
+ */
+int
+__os_getenv(env, name, bpp, buflen)
+ ENV *env;
+ const char *name;
+ char **bpp;
+ size_t buflen;
+{
+#ifdef DB_WINCE
+	COMPQUIET(name, NULL);
+	COMPQUIET(buflen, 0);
+	/* WinCE does not have a getenv implementation; report no value. */
+	*bpp = NULL;
+	return (0);
+#else
+ _TCHAR *tname, tbuf[1024];
+ int ret;
+ char *p;
+
+ /*
+ * If there's a value and the buffer is large enough:
+ * copy value into the pointer, return 0
+ * If there's a value and the buffer is too short:
+ * set pointer to NULL, return EINVAL
+ * If there's no value:
+ * set pointer to NULL, return 0
+ */
+ if ((p = getenv(name)) != NULL) {
+ if (strlen(p) < buflen) {
+ (void)strcpy(*bpp, p);
+ return (0);
+ }
+ goto small_buf;
+ }
+
+ TO_TSTRING(env, name, tname, ret);
+ if (ret != 0)
+ return (ret);
+ /*
+ * The declared size of the tbuf buffer limits the maximum environment
+ * variable size in Berkeley DB on Windows. If that's too small, or if
+ * we need to get rid of large allocations on the BDB stack, we should
+ * malloc the tbuf memory.
+ */
+	ret = GetEnvironmentVariable(tname, tbuf,
+	    sizeof(tbuf) / sizeof(tbuf[0]));
+ FREE_STRING(env, tname);
+
+ /*
+ * If GetEnvironmentVariable succeeds, the return value is the number
+ * of characters stored in the buffer pointed to by lpBuffer, not
+ * including the terminating null character. If the buffer is not
+ * large enough to hold the data, the return value is the buffer size,
+ * in characters, required to hold the string and its terminating null
+ * character. If GetEnvironmentVariable fails, the return value is
+ * zero. If the specified environment variable was not found in the
+ * environment block, GetLastError returns ERROR_ENVVAR_NOT_FOUND.
+ */
+ if (ret == 0) {
+ if ((ret = __os_get_syserr()) == ERROR_ENVVAR_NOT_FOUND) {
+ *bpp = NULL;
+ return (0);
+ }
+ __db_syserr(env, ret, DB_STR("0026",
+ "GetEnvironmentVariable"));
+ return (__os_posix_err(ret));
+ }
+	if (ret > (int)(sizeof(tbuf) / sizeof(tbuf[0])))
+ goto small_buf;
+
+ FROM_TSTRING(env, tbuf, p, ret);
+ if (ret != 0)
+ return (ret);
+ if (strlen(p) < buflen)
+ (void)strcpy(*bpp, p);
+ else
+ *bpp = NULL;
+ FREE_STRING(env, p);
+ if (*bpp == NULL)
+ goto small_buf;
+
+ return (0);
+
+small_buf:
+ *bpp = NULL;
+	/*
+	 * Don't report the value: "p" is unset or already freed on two of
+	 * the paths to this label.
+	 */
+	__db_errx(env, DB_STR_A("0027",
+	    "buffer too small to hold environment variable %s", "%s"),
+	    name);
+ return (EINVAL);
+#endif
+}
diff --git a/src/os_windows/os_handle.c b/src/os_windows/os_handle.c
new file mode 100644
index 00000000..e6edc3ef
--- /dev/null
+++ b/src/os_windows/os_handle.c
@@ -0,0 +1,167 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_openhandle --
+ * Open a file, using POSIX 1003.1 open flags.
+ */
+int
+__os_openhandle(env, name, flags, mode, fhpp)
+ ENV *env;
+ const char *name;
+ int flags, mode;
+ DB_FH **fhpp;
+{
+#ifdef DB_WINCE
+ /*
+ * __os_openhandle API is not implemented on WinCE.
+ * It is not currently called from within the Berkeley DB library,
+ * so don't log the failure via the __db_err mechanism.
+ */
+ return (EFAULT);
+#else
+ DB_FH *fhp;
+ int ret, nrepeat, retries;
+
+ /*
+ * Allocate the file handle and copy the file name. We generally only
+ * use the name for verbose or error messages, but on systems where we
+ * can't unlink temporary files immediately, we use the name to unlink
+ * the temporary file when the file handle is closed.
+ *
+ * Lock the ENV handle and insert the new file handle on the list.
+ */
+ if ((ret = __os_calloc(env, 1, sizeof(DB_FH), &fhp)) != 0)
+ return (ret);
+ if ((ret = __os_strdup(env, name, &fhp->name)) != 0)
+ goto err;
+ if (env != NULL) {
+ MUTEX_LOCK(env, env->mtx_env);
+ TAILQ_INSERT_TAIL(&env->fdlist, fhp, q);
+ MUTEX_UNLOCK(env, env->mtx_env);
+ F_SET(fhp, DB_FH_ENVLINK);
+ }
+
+ retries = 0;
+ for (nrepeat = 1; nrepeat < 4; ++nrepeat) {
+ fhp->fd = _open(name, flags, mode);
+
+ if (fhp->fd != -1) {
+ ret = 0;
+ break;
+ }
+
+ switch (ret = __os_posix_err(__os_get_syserr())) {
+ case EMFILE:
+ case ENFILE:
+ case ENOSPC:
+ /*
+ * If it's a "temporary" error, we retry up to 3 times,
+ * waiting up to 12 seconds. While it's not a problem
+ * if we can't open a database, an inability to open a
+ * log file is cause for serious dismay.
+ */
+ __os_yield(env, nrepeat * 2, 0);
+ break;
+ case EAGAIN:
+ case EBUSY:
+ case EINTR:
+ /*
+ * If an EAGAIN, EBUSY or EINTR, retry immediately for
+ * DB_RETRY times.
+ */
+ if (++retries < DB_RETRY)
+ --nrepeat;
+ break;
+ default:
+ /* Open is silent on error. */
+ goto err;
+ }
+ }
+
+ if (ret == 0) {
+ F_SET(fhp, DB_FH_OPENED);
+ *fhpp = fhp;
+ return (0);
+ }
+
+err: (void)__os_closehandle(env, fhp);
+ return (ret);
+#endif
+}
+
+/*
+ * __os_closehandle --
+ * Close a file.
+ */
+int
+__os_closehandle(env, fhp)
+ ENV *env;
+ DB_FH *fhp;
+{
+ DB_ENV *dbenv;
+ int ret, t_ret;
+
+ ret = 0;
+
+ if (env != NULL) {
+ dbenv = env->dbenv;
+ if (fhp->name != NULL && FLD_ISSET(
+ dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0031",
+ "fileops: %s: close", "%s"), fhp->name);
+
+ if (F_ISSET(fhp, DB_FH_ENVLINK)) {
+ /*
+ * Lock the ENV handle and remove this file
+ * handle from the list.
+ */
+ MUTEX_LOCK(env, env->mtx_env);
+ TAILQ_REMOVE(&env->fdlist, fhp, q);
+ MUTEX_UNLOCK(env, env->mtx_env);
+ }
+ }
+
+ /* Discard any underlying system file reference. */
+ if (F_ISSET(fhp, DB_FH_OPENED)) {
+ if (fhp->handle != INVALID_HANDLE_VALUE)
+ RETRY_CHK((!CloseHandle(fhp->handle)), ret);
+ else
+#ifdef DB_WINCE
+ ret = EFAULT;
+#else
+ RETRY_CHK((_close(fhp->fd)), ret);
+#endif
+
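+		/*
+		 * Also close the duplicate handle opened for truncating the
+		 * file (see __os_open), keeping the first nonzero error.
+		 */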
+ if (fhp->trunc_handle != INVALID_HANDLE_VALUE) {
+ RETRY_CHK((!CloseHandle(fhp->trunc_handle)), t_ret);
+ if (t_ret != 0 && ret == 0)
+ ret = t_ret;
+ }
+
+ if (ret != 0) {
+ __db_syserr(env, ret, DB_STR("0032",
+ "CloseHandle"));
+ ret = __os_posix_err(ret);
+ }
+ }
+
+ /* Unlink the file if we haven't already done so. */
+ if (F_ISSET(fhp, DB_FH_UNLINK))
+ (void)__os_unlink(env, fhp->name, 0);
+
+ if (fhp->name != NULL)
+ __os_free(env, fhp->name);
+ __os_free(env, fhp);
+
+ return (ret);
+}
diff --git a/src/os_windows/os_map.c b/src/os_windows/os_map.c
new file mode 100644
index 00000000..8f646d68
--- /dev/null
+++ b/src/os_windows/os_map.c
@@ -0,0 +1,397 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+static int __os_map
+ __P((ENV *, char *, REGINFO *, DB_FH *, size_t, int, int, int, void **));
+static int __os_unique_name __P((_TCHAR *, HANDLE, _TCHAR *, size_t));
+
+/*
+ * __os_attach --
+ * Create/join a shared memory region.
+ */
+int
+__os_attach(env, infop, rp)
+ ENV *env;
+ REGINFO *infop;
+ REGION *rp;
+{
+ int ret;
+ int is_sparse;
+#ifndef DB_WINCE
+ DWORD dw;
+#endif
+
+ infop->fhp = NULL;
+ /*
+ * On Windows/9X, files that are opened by multiple processes do not
+ * share data correctly. For this reason, we require that DB_PRIVATE
+ * be specified on that platform.
+ */
+ if (!F_ISSET(env, ENV_PRIVATE) && __os_is_winnt() == 0) {
+ __db_err(env, EINVAL, DB_STR("0006",
+ "Windows 9X systems must specify DB_PRIVATE"));
+ return (EINVAL);
+ }
+
+ /*
+ * Try to open/create the file. We DO NOT need to ensure that multiple
+ * threads/processes attempting to simultaneously create the region are
+ * properly ordered, our caller has already taken care of that.
+ */
+ if ((ret = __os_open(env, infop->name, 0, DB_OSO_REGION |
+ (F_ISSET(infop, REGION_CREATE_OK) ? DB_OSO_CREATE : 0),
+ env->db_mode, &infop->fhp)) != 0) {
+ __db_err(env, ret, "%s", infop->name);
+ return (ret);
+ }
+
+ is_sparse = 0;
+#ifndef DB_WINCE
+ /*
+	 * Sparse files only work on NTFS. If setting the sparse attribute
+	 * fails, just ignore the error and use the normal method.
+ */
+ if (!F_ISSET(env, ENV_SYSTEM_MEM) && (DeviceIoControl(
+ infop->fhp->handle, FSCTL_SET_SPARSE, NULL, 0, NULL, 0,
+ &dw, NULL)))
+ is_sparse = 1;
+#endif
+
+ /*
+ * Map the file in. If we're creating an in-system-memory region,
+ * specify a segment ID (which is never used again) so that the
+ * calling code writes out the REGENV_REF structure to the primary
+ * environment file.
+ */
+ ret = __os_map(env, infop->name, infop, infop->fhp, rp->max,
+ 1, F_ISSET(env, ENV_SYSTEM_MEM), 0, &infop->addr);
+ if (ret == 0 && F_ISSET(env, ENV_SYSTEM_MEM))
+ rp->segid = 1;
+
+ if (ret != 0) {
+ (void)__os_closehandle(env, infop->fhp);
+ infop->fhp = NULL;
+ return (ret);
+ }
+
+ /*
+	 * If we are using a sparse file, we don't need to keep the file
+	 * handle open for writing or extending.
+ */
+ if (is_sparse && infop->fhp != NULL) {
+ ret = __os_closehandle(env, infop->fhp);
+ infop->fhp = NULL;
+ }
+ return (ret);
+}
+
+/*
+ * __os_detach --
+ * Detach from a shared memory region.
+ */
+int
+__os_detach(env, infop, destroy)
+ ENV *env;
+ REGINFO *infop;
+ int destroy;
+{
+ DB_ENV *dbenv;
+ int ret, t_ret;
+
+ dbenv = env->dbenv;
+
+ if (infop->wnt_handle != NULL) {
+ (void)CloseHandle(infop->wnt_handle);
+ infop->wnt_handle = NULL;
+ }
+ if (infop->fhp != NULL) {
+ ret = __os_closehandle(env, infop->fhp);
+ infop->fhp = NULL;
+ if (ret != 0)
+ return (ret);
+ }
+
+ ret = !UnmapViewOfFile(infop->addr) ? __os_get_syserr() : 0;
+ if (ret != 0) {
+ __db_syserr(env, ret, DB_STR("0007", "UnmapViewOfFile"));
+ ret = __os_posix_err(ret);
+ }
+
+ if (!F_ISSET(env, ENV_SYSTEM_MEM) && destroy &&
+ (t_ret = __os_unlink(env, infop->name, 1)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __os_mapfile --
+ * Map in a shared memory file.
+ */
+int
+__os_mapfile(env, path, fhp, len, is_rdonly, addr)
+ ENV *env;
+ char *path;
+ DB_FH *fhp;
+ int is_rdonly;
+ size_t len;
+ void **addr;
+{
+#ifdef DB_WINCE
+ /*
+ * Windows CE has special requirements for file mapping to work.
+ * * The input handle needs to be opened using CreateFileForMapping
+	 * * Concurrent access via a non-mapped file is not supported.
+ * So we disable support for memory mapping files on Windows CE. It is
+ * currently only used as an optimization in mpool for small read only
+ * databases.
+ */
+ return (EFAULT);
+#else
+ DB_ENV *dbenv;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+
+ if (dbenv != NULL &&
+ FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0008", "fileops: mmap %s", "%s"), path);
+ return (__os_map(env, path, NULL, fhp, len, 0, 0, is_rdonly, addr));
+#endif
+}
+
+/*
+ * __os_unmapfile --
+ * Unmap the shared memory file.
+ */
+int
+__os_unmapfile(env, addr, len)
+ ENV *env;
+ void *addr;
+ size_t len;
+{
+ DB_ENV *dbenv;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+
+ if (dbenv != NULL &&
+ FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR("0009", "fileops: munmap"));
+
+ return (!UnmapViewOfFile(addr) ? __os_posix_err(__os_get_syserr()) : 0);
+}
+
+/*
+ * __os_unique_name --
+ * Create a unique identifying name from a pathname (may be absolute or
+ * relative) and/or a file descriptor.
+ *
+ * The name returned must be unique (different files map to different
+ * names), and repeatable (same files map to same names). That's not
+ * easy to do by name alone. We must handle not only:
+ *
+ * foo.bar == ./foo.bar == c:/whatever_path/foo.bar
+ *
+ * but also understand that:
+ *
+ * foo.bar == Foo.Bar (FAT file system)
+ * foo.bar != Foo.Bar (NTFS)
+ *
+ * The best solution is to use the file index, found in the file
+ * information structure (similar to UNIX inode #).
+ *
+ * When a file is deleted, its file index may be reused, but if the
+ * unique name has not yet gone from the namespace, we may get a
+ * conflict. So to ensure some tie to the original pathname, we also
+ * use the creation time and the file basename. This is not a perfect
+ * system, but it should work for all but anomalous test cases.
+ *
+ */
+static int
+__os_unique_name(orig_path, hfile, result_path, result_path_len)
+ _TCHAR *orig_path, *result_path;
+ HANDLE hfile;
+ size_t result_path_len;
+{
+ BY_HANDLE_FILE_INFORMATION fileinfo;
+ _TCHAR *basename, *p;
+
+ /*
+ * In Windows, pathname components are delimited by '/' or '\', and
+ * if neither is present, we need to strip off leading drive letter
+ * (e.g. c:foo.txt).
+ */
+ basename = _tcsrchr(orig_path, '/');
+ p = _tcsrchr(orig_path, '\\');
+ if (basename == NULL || (p != NULL && p > basename))
+ basename = p;
+ if (basename == NULL)
+ basename = _tcsrchr(orig_path, ':');
+
+ if (basename == NULL)
+ basename = orig_path;
+ else
+ basename++;
+
+ if (!GetFileInformationByHandle(hfile, &fileinfo))
+ return (__os_posix_err(__os_get_syserr()));
+
+	/*
+	 * Use both halves of the creation time: passing the high DWORD
+	 * twice would lose half the timestamp and weaken uniqueness.
+	 */
+	(void)_sntprintf(result_path, result_path_len,
+	    _T("__db_shmem.%8.8lx.%8.8lx.%8.8lx.%8.8lx.%8.8lx.%s"),
+	    fileinfo.dwVolumeSerialNumber,
+	    fileinfo.nFileIndexHigh,
+	    fileinfo.nFileIndexLow,
+	    fileinfo.ftCreationTime.dwHighDateTime,
+	    fileinfo.ftCreationTime.dwLowDateTime,
+	    basename);
+
+ return (0);
+}
+
+/*
+ * __os_map --
+ * The mmap(2) function for Windows.
+ */
+static int
+__os_map(env, path, infop, fhp, len, is_region, is_system, is_rdonly, addr)
+ ENV *env;
+ REGINFO *infop;
+ char *path;
+ DB_FH *fhp;
+ int is_region, is_system, is_rdonly;
+ size_t len;
+ void **addr;
+{
+ HANDLE hMemory;
+ int ret, use_pagefile;
+ _TCHAR *tpath, shmem_name[DB_MAXPATHLEN];
+ void *pMemory;
+ unsigned __int64 len64;
+
+ ret = 0;
+ if (infop != NULL)
+ infop->wnt_handle = NULL;
+
+ /*
+ * On 64 bit systems, len is already a 64 bit value.
+ * On 32 bit systems len is a 32 bit value.
+ * Always convert to a 64 bit value, so that the high order
+ * DWORD can be simply extracted on 64 bit platforms.
+ */
+ len64 = len;
+
+ use_pagefile = is_region && is_system;
+
+ /*
+ * If creating a region in system space, get a matching name in the
+ * paging file namespace.
+ */
+ if (use_pagefile) {
+#ifdef DB_WINCE
+ __db_errx(env, DB_STR("0010",
+ "Unable to memory map regions using system "
+ "memory on WinCE."));
+ return (EFAULT);
+#endif
+ TO_TSTRING(env, path, tpath, ret);
+ if (ret != 0)
+ return (ret);
+ ret = __os_unique_name(tpath, fhp->handle,
+ shmem_name, sizeof(shmem_name));
+ FREE_STRING(env, tpath);
+ if (ret != 0)
+ return (ret);
+ }
+
+ /*
+ * XXX
+ * DB: We have not implemented copy-on-write here.
+ *
+	 * If this is a region in system memory, we try to open it with
+	 * OpenFileMapping() first, and only call CreateFileMapping() if we're
+ * really creating the section. There are two reasons:
+ *
+ * 1) We only create the mapping if we have newly created the region.
+ * This avoids a long-running problem caused by Windows reference
+ * counting, where regions that are closed by all processes are
+ * deleted. It turns out that just checking for a zeroed region
+ * is not good enough. See [#4882] and [#7127] for the details.
+ *
+	 * 2) CreateFileMapping seems to mess up making the commit charge to
+	 * the process. It thinks, incorrectly, that when we want to join a
+	 * previously existing section, it should make a commit charge for
+	 * the whole section. In fact, there is no new committed memory
+	 * whatsoever. The call can fail if there is insufficient memory
+	 * free to handle the erroneous commit charge. We find that the
+	 * bogus commit is not made if we call OpenFileMapping instead.
+ */
+ hMemory = NULL;
+ if (use_pagefile) {
+#ifndef DB_WINCE
+ hMemory = OpenFileMapping(
+ is_rdonly ? FILE_MAP_READ : FILE_MAP_ALL_ACCESS,
+ 0, shmem_name);
+
+ if (hMemory == NULL && F_ISSET(infop, REGION_CREATE_OK))
+ hMemory = CreateFileMapping((HANDLE)-1, 0,
+ is_rdonly ? PAGE_READONLY : PAGE_READWRITE,
+ (DWORD)(len64 >> 32), (DWORD)len64, shmem_name);
+#endif
+ } else {
+ hMemory = CreateFileMapping(fhp->handle, 0,
+ is_rdonly ? PAGE_READONLY : PAGE_READWRITE,
+ (DWORD)(len64 >> 32), (DWORD)len64, NULL);
+#ifdef DB_WINCE
+ /*
+ * WinCE automatically closes the handle passed in.
+ * Ensure DB does not attempt to close the handle again.
+ */
+ fhp->handle = INVALID_HANDLE_VALUE;
+ F_CLR(fhp, DB_FH_OPENED);
+#endif
+ }
+
+ if (hMemory == NULL) {
+ ret = __os_get_syserr();
+ __db_syserr(env, ret, DB_STR("0011", "OpenFileMapping"));
+ return (__env_panic(env, __os_posix_err(ret)));
+ }
+
+ pMemory = MapViewOfFile(hMemory,
+ (is_rdonly ? FILE_MAP_READ : FILE_MAP_ALL_ACCESS), 0, 0, len);
+ if (pMemory == NULL) {
+ ret = __os_get_syserr();
+ __db_syserr(env, ret, DB_STR("0012", "MapViewOfFile"));
+ return (__env_panic(env, __os_posix_err(ret)));
+ }
+
+ /*
+ * XXX
+ * It turns out that the kernel object underlying the named section
+ * is reference counted, but that the call to MapViewOfFile() above
+ * does NOT increment the reference count! So, if we close the handle
+ * here, the kernel deletes the object from the kernel namespace.
+ * When a second process comes along to join the region, the kernel
+ * happily creates a new object with the same name, but completely
+ * different identity. The two processes then have distinct isolated
+ * mapped sections, not at all what was wanted. Not closing the handle
+ * here fixes this problem. We carry the handle around in the region
+ * structure so we can close it when unmap is called.
+ */
+ if (use_pagefile && infop != NULL)
+ infop->wnt_handle = hMemory;
+ else
+ CloseHandle(hMemory);
+
+ *addr = pMemory;
+ return (ret);
+}
diff --git a/src/os_windows/os_mkdir.c b/src/os_windows/os_mkdir.c
new file mode 100644
index 00000000..b87f3f9d
--- /dev/null
+++ b/src/os_windows/os_mkdir.c
@@ -0,0 +1,44 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_mkdir --
+ * Create a directory.
+ */
+int
+__os_mkdir(env, name, mode)
+ ENV *env;
+ const char *name;
+ int mode;
+{
+ DB_ENV *dbenv;
+ _TCHAR *tname;
+ int ret;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+
+ if (dbenv != NULL &&
+ FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0013", "fileops: mkdir %s",
+ "%s"), name);
+
+	/*
+	 * Make the directory. The mode argument is ignored here:
+	 * CreateDirectory is called with default security attributes.
+	 */
+ TO_TSTRING(env, name, tname, ret);
+ if (ret != 0)
+ return (ret);
+ RETRY_CHK(!CreateDirectory(tname, NULL), ret);
+ FREE_STRING(env, tname);
+ if (ret != 0)
+ return (__os_posix_err(ret));
+
+ return (ret);
+}
diff --git a/src/os_windows/os_open.c b/src/os_windows/os_open.c
new file mode 100644
index 00000000..44f2faf3
--- /dev/null
+++ b/src/os_windows/os_open.c
@@ -0,0 +1,258 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_open --
+ * Open a file descriptor (including page size and log size information).
+ */
+int
+__os_open(env, name, page_size, flags, mode, fhpp)
+ ENV *env;
+ const char *name;
+ u_int32_t page_size, flags;
+ int mode;
+ DB_FH **fhpp;
+{
+ DB_ENV *dbenv;
+ DB_FH *fhp;
+#ifndef DB_WINCE
+ DWORD cluster_size, sector_size, free_clusters, total_clusters;
+ _TCHAR *drive, dbuf[4]; /* <letter><colon><slash><nul> */
+#endif
+ int access, attr, createflag, nrepeat, ret, share;
+ _TCHAR *tname;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+	*fhpp = NULL;
+	fhp = NULL;
+	tname = NULL;
+
+ if (dbenv != NULL &&
+ FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0025", "fileops: open %s",
+ "%s"), name);
+
+#undef OKFLAGS
+#define OKFLAGS \
+ (DB_OSO_ABSMODE | DB_OSO_CREATE | DB_OSO_DIRECT | DB_OSO_DSYNC |\
+ DB_OSO_EXCL | DB_OSO_RDONLY | DB_OSO_REGION | DB_OSO_SEQ | \
+ DB_OSO_TEMP | DB_OSO_TRUNC)
+ if ((ret = __db_fchk(env, "__os_open", flags, OKFLAGS)) != 0)
+ return (ret);
+
+ TO_TSTRING(env, name, tname, ret);
+ if (ret != 0)
+ goto err;
+
+ /*
+ * Allocate the file handle and copy the file name. We generally only
+ * use the name for verbose or error messages, but on systems where we
+ * can't unlink temporary files immediately, we use the name to unlink
+ * the temporary file when the file handle is closed.
+ *
+ * Lock the ENV handle and insert the new file handle on the list.
+ */
+	if ((ret = __os_calloc(env, 1, sizeof(DB_FH), &fhp)) != 0)
+		goto err;
+ if ((ret = __os_strdup(env, name, &fhp->name)) != 0)
+ goto err;
+ if (env != NULL) {
+ MUTEX_LOCK(env, env->mtx_env);
+ TAILQ_INSERT_TAIL(&env->fdlist, fhp, q);
+ MUTEX_UNLOCK(env, env->mtx_env);
+ F_SET(fhp, DB_FH_ENVLINK);
+ }
+
+ /*
+	 * Use the Windows/32 CreateFile interface so that we can
+ * play magic games with files to get data flush effects similar to
+ * the POSIX O_DSYNC flag.
+ *
+ * !!!
+ * We currently ignore the 'mode' argument. It would be possible
+ * to construct a set of security attributes that we could pass to
+	 * CreateFile that would accurately represent the mode. In the worst
+	 * case, this would require looking up user and all group names and
+ * creating an entry for each. Alternatively, we could call the
+ * _chmod (partial emulation) function after file creation, although
+ * this leaves us with an obvious race. However, these efforts are
+ * largely meaningless on FAT, the most common file system, which
+ * only has a "readable" and "writable" flag, applying to all users.
+ */
+ access = GENERIC_READ;
+ if (!LF_ISSET(DB_OSO_RDONLY))
+ access |= GENERIC_WRITE;
+
+#ifdef DB_WINCE
+ /*
+ * WinCE translates these flags into share flags for
+ * CreateFileForMapping.
+ * Also WinCE does not support the FILE_SHARE_DELETE flag.
+ */
+ if (LF_ISSET(DB_OSO_REGION))
+ share = GENERIC_READ | GENERIC_WRITE;
+ else
+ share = FILE_SHARE_READ | FILE_SHARE_WRITE;
+#else
+ share = FILE_SHARE_READ | FILE_SHARE_WRITE;
+ if (__os_is_winnt())
+ share |= FILE_SHARE_DELETE;
+#endif
+ attr = FILE_ATTRIBUTE_NORMAL;
+
+ /*
+	 * Reproduce POSIX 1003.1 semantics: if O_CREAT and O_EXCL are both
+ * specified, fail, returning EEXIST, unless we create the file.
+ */
+ if (LF_ISSET(DB_OSO_CREATE) && LF_ISSET(DB_OSO_EXCL))
+		createflag = CREATE_NEW; /* create only if !exist */
+ else if (!LF_ISSET(DB_OSO_CREATE) && LF_ISSET(DB_OSO_TRUNC))
+ createflag = TRUNCATE_EXISTING; /* truncate, fail if !exist */
+ else if (LF_ISSET(DB_OSO_TRUNC))
+ createflag = CREATE_ALWAYS; /* create and truncate */
+ else if (LF_ISSET(DB_OSO_CREATE))
+ createflag = OPEN_ALWAYS; /* open or create */
+ else
+ createflag = OPEN_EXISTING; /* open only if existing */
+
+ if (LF_ISSET(DB_OSO_DSYNC)) {
+ F_SET(fhp, DB_FH_NOSYNC);
+ attr |= FILE_FLAG_WRITE_THROUGH;
+ }
+
+#ifndef DB_WINCE
+ if (LF_ISSET(DB_OSO_SEQ))
+ attr |= FILE_FLAG_SEQUENTIAL_SCAN;
+ else
+ attr |= FILE_FLAG_RANDOM_ACCESS;
+#endif
+
+ if (LF_ISSET(DB_OSO_TEMP))
+ attr |= FILE_FLAG_DELETE_ON_CLOSE;
+
+ /*
+ * We can turn filesystem buffering off if the page size is a
+ * multiple of the disk's sector size. To find the sector size,
+ * we call GetDiskFreeSpace, which expects a drive name like "d:\\"
+ * or NULL for the current disk (i.e., a relative path).
+ *
+ * WinCE only has GetDiskFreeSpaceEx which does not
+ * return the sector size.
+ */
+#ifndef DB_WINCE
+ if (LF_ISSET(DB_OSO_DIRECT) && page_size != 0 && name[0] != '\0') {
+ if (name[1] == ':') {
+ drive = dbuf;
+ _sntprintf(dbuf, sizeof(dbuf), _T("%c:\\"), tname[0]);
+ } else
+ drive = NULL;
+
+ /*
+		 * We ignore all results except sector_size, but some
+		 * versions of Windows require that the parameters be
+		 * non-NULL.
+ */
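+		/*
+		 * For example, a 4096-byte page on a disk with 512-byte
+		 * sectors divides evenly, so buffering can be turned off;
+		 * a page size that is not a sector multiple leaves it on.
+		 */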
+ if (GetDiskFreeSpace(drive, &cluster_size,
+ &sector_size, &free_clusters, &total_clusters) &&
+ page_size % sector_size == 0)
+ attr |= FILE_FLAG_NO_BUFFERING;
+ }
+#endif
+
+ fhp->handle = fhp->trunc_handle = INVALID_HANDLE_VALUE;
+ for (nrepeat = 1;; ++nrepeat) {
+ if (fhp->handle == INVALID_HANDLE_VALUE) {
+#ifdef DB_WINCE
+ if (LF_ISSET(DB_OSO_REGION))
+ fhp->handle = CreateFileForMapping(tname,
+ access, share, NULL, createflag, attr, 0);
+ else
+#endif
+ fhp->handle = CreateFile(tname,
+ access, share, NULL, createflag, attr, 0);
+ }
+
+#ifdef HAVE_FTRUNCATE
+		/*
+		 * Older versions of WinCE may not support truncate; if so,
+		 * the HAVE_FTRUNCATE macro should be #undef'ed, and we don't
+		 * need to open this second handle.
+		 *
+		 * WinCE does not support opening a second handle on the same
+		 * file via CreateFileForMapping, but that does not matter,
+		 * since we truncate database files, not region files.
+		 *
+		 * Some older versions of WinCE do not allow a second handle
+		 * to be opened even via CreateFile. If that is the case,
+		 * users will need to #undef the HAVE_FTRUNCATE macro in
+		 * build_wince/db_config.h.
+		 */
+
+ /*
+ * Windows does not provide truncate directly. There is no
+ * safe way to use a handle for truncate concurrently with
+ * reads or writes. To deal with this, we open a second handle
+ * used just for truncating.
+ */
+ if (fhp->handle != INVALID_HANDLE_VALUE &&
+ !LF_ISSET(DB_OSO_RDONLY | DB_OSO_TEMP) &&
+ fhp->trunc_handle == INVALID_HANDLE_VALUE
+#ifdef DB_WINCE
+ /* Do not open trunc handle for region files. */
+ && (!LF_ISSET(DB_OSO_REGION))
+#endif
+ )
+ fhp->trunc_handle = CreateFile(
+ tname, access, share, NULL, OPEN_EXISTING, attr, 0);
+#endif
+
+#ifndef HAVE_FTRUNCATE
+ if (fhp->handle == INVALID_HANDLE_VALUE)
+#else
+ if (fhp->handle == INVALID_HANDLE_VALUE ||
+ (!LF_ISSET(DB_OSO_RDONLY | DB_OSO_TEMP) &&
+ fhp->trunc_handle == INVALID_HANDLE_VALUE
+#ifdef DB_WINCE
+ /* Do not open trunc handle for region files. */
+ && (!LF_ISSET(DB_OSO_REGION))
+#endif
+ ))
+#endif
+ {
+ /*
+			 * If it's a "temporary" error, we retry up to 3 times,
+			 * waiting 2, 4, then 6 seconds (12 seconds in all).
+			 * While it's not a problem if we can't open a
+			 * database, an inability to open a log file is cause
+			 * for serious dismay.
+ */
+ ret = __os_posix_err(__os_get_syserr());
+ if ((ret != ENFILE && ret != EMFILE && ret != ENOSPC) ||
+ nrepeat > 3)
+ goto err;
+
+ __os_yield(env, nrepeat * 2, 0);
+ } else
+ break;
+ }
+
+ FREE_STRING(env, tname);
+
+ if (LF_ISSET(DB_OSO_REGION))
+ F_SET(fhp, DB_FH_REGION);
+ F_SET(fhp, DB_FH_OPENED);
+ *fhpp = fhp;
+ return (0);
+
+err: FREE_STRING(env, tname);
+ if (fhp != NULL)
+ (void)__os_closehandle(env, fhp);
+ return (ret);
+}
diff --git a/src/os_windows/os_rename.c b/src/os_windows/os_rename.c
new file mode 100644
index 00000000..791f53a5
--- /dev/null
+++ b/src/os_windows/os_rename.c
@@ -0,0 +1,82 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_rename --
+ * Rename a file.
+ */
+int
+__os_rename(env, oldname, newname, silent)
+ ENV *env;
+ const char *oldname, *newname;
+ u_int32_t silent;
+{
+ DB_ENV *dbenv;
+ _TCHAR *toldname, *tnewname;
+ int ret;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+
+ if (dbenv != NULL &&
+ FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0036", "fileops: rename %s to %s",
+ "%s %s"), oldname, newname);
+
+ TO_TSTRING(env, oldname, toldname, ret);
+ if (ret != 0)
+ return (ret);
+ TO_TSTRING(env, newname, tnewname, ret);
+ if (ret != 0) {
+ FREE_STRING(env, toldname);
+ return (ret);
+ }
+
+ LAST_PANIC_CHECK_BEFORE_IO(env);
+
+ if (!MoveFile(toldname, tnewname))
+ ret = __os_get_syserr();
+
+ if (__os_posix_err(ret) == EEXIST) {
+ ret = 0;
+#ifndef DB_WINCE
+ if (__os_is_winnt()) {
+ if (!MoveFileEx(
+ toldname, tnewname, MOVEFILE_REPLACE_EXISTING))
+ ret = __os_get_syserr();
+ } else
+#endif
+ {
+ /*
+ * There is no MoveFileEx for Win9x/Me/CE, so we have to
+ * do the best we can. Note that the MoveFile call
+ * above would have succeeded if oldname and newname
+ * refer to the same file, so we don't need to check
+ * that here.
+ */
+ (void)DeleteFile(tnewname);
+ if (!MoveFile(toldname, tnewname))
+ ret = __os_get_syserr();
+ }
+ }
+
+ FREE_STRING(env, tnewname);
+ FREE_STRING(env, toldname);
+
+ if (ret != 0) {
+ if (silent == 0)
+ __db_syserr(env, ret, DB_STR_A("0037",
+ "MoveFileEx %s %s", "%s %s"), oldname, newname);
+ ret = __os_posix_err(ret);
+ }
+
+ return (ret);
+}
diff --git a/src/os_windows/os_rw.c b/src/os_windows/os_rw.c
new file mode 100644
index 00000000..e64a7d08
--- /dev/null
+++ b/src/os_windows/os_rw.c
@@ -0,0 +1,218 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_io --
+ * Do an I/O.
+ */
+int
+__os_io(env, op, fhp, pgno, pgsize, relative, io_len, buf, niop)
+ ENV *env;
+ int op;
+ DB_FH *fhp;
+ db_pgno_t pgno;
+ u_int32_t pgsize, relative, io_len;
+ u_int8_t *buf;
+ size_t *niop;
+{
+ int ret;
+
+#ifndef DB_WINCE
+ if (__os_is_winnt()) {
+ DB_ENV *dbenv;
+ DWORD nbytes;
+ OVERLAPPED over;
+ ULONG64 off;
+ dbenv = env == NULL ? NULL : env->dbenv;
+ if ((off = relative) == 0)
+ off = (ULONG64)pgsize * pgno;
+ over.Offset = (DWORD)(off & 0xffffffff);
+ over.OffsetHigh = (DWORD)(off >> 32);
+ over.hEvent = 0; /* we don't want asynchronous notifications */
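+		/*
+		 * Supplying the offset in the OVERLAPPED structure gives us
+		 * positioned (pread/pwrite style) I/O on NT, so no seek is
+		 * needed; the fallback path below must instead serialize a
+		 * seek and the transfer under the handle's mutex.
+		 */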
+
+ if (dbenv != NULL &&
+ FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0014",
+ "fileops: %s %s: %lu bytes at offset %lu",
+ "%s %s %lu %lu"), op == DB_IO_READ ?
+ DB_STR_P("read") : DB_STR_P("write"),
+ fhp->name, (u_long)io_len, (u_long)off);
+
+ LAST_PANIC_CHECK_BEFORE_IO(env);
+
+ switch (op) {
+ case DB_IO_READ:
+#if defined(HAVE_STATISTICS)
+ ++fhp->read_count;
+#endif
+ if (!ReadFile(fhp->handle,
+ buf, (DWORD)io_len, &nbytes, &over))
+ goto slow;
+ break;
+ case DB_IO_WRITE:
+#ifdef HAVE_FILESYSTEM_NOTZERO
+ if (__os_fs_notzero())
+ goto slow;
+#endif
+#if defined(HAVE_STATISTICS)
+ ++fhp->write_count;
+#endif
+ if (!WriteFile(fhp->handle,
+ buf, (DWORD)io_len, &nbytes, &over))
+ goto slow;
+ break;
+ }
+ if (nbytes == io_len) {
+ *niop = (size_t)nbytes;
+ return (0);
+ }
+ }
+
+slow:
+#endif
+ MUTEX_LOCK(env, fhp->mtx_fh);
+
+ if ((ret = __os_seek(env, fhp, pgno, pgsize, relative)) != 0)
+ goto err;
+
+ switch (op) {
+ case DB_IO_READ:
+ ret = __os_read(env, fhp, buf, io_len, niop);
+ break;
+ case DB_IO_WRITE:
+ ret = __os_write(env, fhp, buf, io_len, niop);
+ break;
+ }
+
+err: MUTEX_UNLOCK(env, fhp->mtx_fh);
+
+ return (ret);
+}
+
+/*
+ * __os_read --
+ * Read from a file handle.
+ */
+int
+__os_read(env, fhp, addr, len, nrp)
+ ENV *env;
+ DB_FH *fhp;
+ void *addr;
+ size_t len;
+ size_t *nrp;
+{
+ DB_ENV *dbenv;
+ DWORD count;
+ size_t offset, nr;
+ u_int8_t *taddr;
+ int ret;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+ ret = 0;
+
+#if defined(HAVE_STATISTICS)
+ ++fhp->read_count;
+#endif
+ if (dbenv != NULL && FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0015", "fileops: read %s: %lu bytes",
+ "%s %lu"), fhp->name, (u_long)len);
+
+ for (taddr = addr,
+ offset = 0; offset < len; taddr += nr, offset += nr) {
+ LAST_PANIC_CHECK_BEFORE_IO(env);
+ RETRY_CHK((!ReadFile(fhp->handle,
+ taddr, (DWORD)(len - offset), &count, NULL)), ret);
+ if (count == 0 || ret != 0)
+ break;
+ nr = (size_t)count;
+ }
+ *nrp = taddr - (u_int8_t *)addr;
+ if (ret != 0) {
+ __db_syserr(env, ret, DB_STR_A("0016",
+ "read: 0x%lx, %lu", "%lx %lu"),
+ P_TO_ULONG(taddr), (u_long)len - offset);
+ ret = __os_posix_err(ret);
+ }
+ return (ret);
+}
+
+/*
+ * __os_write --
+ * Write to a file handle.
+ */
+int
+__os_write(env, fhp, addr, len, nwp)
+ ENV *env;
+ DB_FH *fhp;
+ void *addr;
+ size_t len;
+ size_t *nwp;
+{
+ int ret;
+
+#ifdef HAVE_FILESYSTEM_NOTZERO
+ /* Zero-fill as necessary. */
+ if (__os_fs_notzero() &&
+ (ret = __db_zero_fill(env, fhp)) != 0)
+ return (ret);
+#endif
+ return (__os_physwrite(env, fhp, addr, len, nwp));
+}
+
+/*
+ * __os_physwrite --
+ * Physical write to a file handle.
+ */
+int
+__os_physwrite(env, fhp, addr, len, nwp)
+ ENV *env;
+ DB_FH *fhp;
+ void *addr;
+ size_t len;
+ size_t *nwp;
+{
+ DB_ENV *dbenv;
+ DWORD count;
+ size_t offset, nw;
+ u_int8_t *taddr;
+ int ret;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+ ret = 0;
+
+#if defined(HAVE_STATISTICS)
+ ++fhp->write_count;
+#endif
+ if (dbenv != NULL && FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0017", "fileops: write %s: %lu bytes",
+ "%s %lu"), fhp->name, (u_long)len);
+
+ for (taddr = addr,
+ offset = 0; offset < len; taddr += nw, offset += nw) {
+ LAST_PANIC_CHECK_BEFORE_IO(env);
+ RETRY_CHK((!WriteFile(fhp->handle,
+ taddr, (DWORD)(len - offset), &count, NULL)), ret);
+ if (ret != 0)
+ break;
+ nw = (size_t)count;
+ }
+ *nwp = len;
+ if (ret != 0) {
+ __db_syserr(env, ret, DB_STR_A("0018",
+ "write: %#lx, %lu", "%#lx %lu"),
+ P_TO_ULONG(taddr), (u_long)len - offset);
+ ret = __os_posix_err(ret);
+
+ DB_EVENT(env, DB_EVENT_WRITE_FAILED, NULL);
+ }
+ return (ret);
+}
diff --git a/src/os_windows/os_seek.c b/src/os_windows/os_seek.c
new file mode 100644
index 00000000..7632c15d
--- /dev/null
+++ b/src/os_windows/os_seek.c
@@ -0,0 +1,67 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_seek --
+ * Seek to a page/byte offset in the file.
+ */
+int
+__os_seek(env, fhp, pgno, pgsize, relative)
+ ENV *env;
+ DB_FH *fhp;
+ db_pgno_t pgno;
+ u_int32_t pgsize;
+ off_t relative;
+{
+ /* Yes, this really is how Microsoft designed their API. */
+ union {
+ __int64 bigint;
+ struct {
+ unsigned long low;
+ long high;
+ };
+ } offbytes;
+ DB_ENV *dbenv;
+ off_t offset;
+ int ret;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+
+#if defined(HAVE_STATISTICS)
+ ++fhp->seek_count;
+#endif
+
+ offset = (off_t)pgsize * pgno + relative;
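+	/* For example, pgno 3 with pgsize 4096 and relative 0 is byte 12288. */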
+
+ if (dbenv != NULL && FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0038",
+ "fileops: seek %s to %lu", "%s %lu"),
+ fhp->name, (u_long)offset);
+
+ offbytes.bigint = offset;
+	/*
+	 * SetFilePointer returns the low 32 bits of the new offset, so for
+	 * large files (DWORD)-1 can be a legitimate result; it is an error
+	 * only if GetLastError() also reports one (see __os_truncate).
+	 */
+	ret = (SetFilePointer(fhp->handle, offbytes.low,
+	    &offbytes.high, FILE_BEGIN) == INVALID_SET_FILE_POINTER &&
+	    GetLastError() != NO_ERROR) ? __os_get_syserr() : 0;
+
+ if (ret == 0) {
+ fhp->pgsize = pgsize;
+ fhp->pgno = pgno;
+ fhp->offset = relative;
+ } else {
+ __db_syserr(env, ret, DB_STR_A("0039",
+ "seek: %lu: (%lu * %lu) + %lu", "%lu %lu %lu %lu"),
+ (u_long)offset, (u_long)pgno,
+ (u_long)pgsize, (u_long)relative);
+ ret = __os_posix_err(ret);
+ }
+
+ return (ret);
+}
diff --git a/src/os_windows/os_stat.c b/src/os_windows/os_stat.c
new file mode 100644
index 00000000..11248886
--- /dev/null
+++ b/src/os_windows/os_stat.c
@@ -0,0 +1,231 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * Raw data reads must be done in multiples of the disk sector size. Currently
+ * the sector size is either 512 bytes or 4096 bytes. So we set the
+ * MAX_SECTOR_SIZE to 4096.
+ */
+#define MAX_SECTOR_SIZE 4096
+
+/*
+ * Find the cluster size of the file system that would contain the given path.
+ * If the value can't be determined, an error is returned.
+ */
+int
+__os_get_cluster_size(path, psize)
+ const char *path;
+ u_int32_t *psize;
+{
+#if (WINVER < 0x500) || defined(DB_WINCE)
+ /*
+	 * WinCE and versions of Windows earlier than Windows 2000 (WINVER
+	 * 0x500) don't have the APIs required to retrieve the cluster size.
+ */
+ *psize = DB_DEF_IOSIZE;
+ return (0);
+#else
+ BYTE clustershift, sectorshift, *pcluster;
+ char buffer[MAX_SECTOR_SIZE];
+ DWORD flags, infolen, length, mcl, name_size;
+ HANDLE vhandle;
+ int ret;
+ NTFS_VOLUME_DATA_BUFFER ntfsinfo;
+ size_t name_len;
+ TCHAR *env_path, name_buffer[MAX_PATH + 1], root_path[MAX_PATH + 1];
+ WORD *psector;
+
+ if (path == NULL || psize == NULL) {
+ return (EINVAL);
+ }
+
+ name_size = MAX_PATH + 1;
+ *psize = 0;
+
+ TO_TSTRING(NULL, path, env_path, ret);
+ if (ret != 0)
+ return (ret);
+ /* Retrieve the volume root path where the input path resides. */
+ if (!GetVolumePathName(env_path, root_path, name_size)) {
+ FREE_STRING(NULL, env_path);
+ return (__os_posix_err(__os_get_syserr()));
+ }
+ FREE_STRING(NULL, env_path);
+
+ /* Get the volume GUID name from the root path. */
+ if (!GetVolumeNameForVolumeMountPoint(
+ root_path, name_buffer, name_size))
+ return (__os_posix_err(__os_get_syserr()));
+
+	/* Strip the trailing "\" from the GUID name. */
+ name_len = _tcsclen(name_buffer);
+ if (name_len > 0)
+ name_buffer[name_len - 1] = _T('\0');
+
+ /* Create a handle to the volume. */
+ vhandle = CreateFile(name_buffer, FILE_READ_ATTRIBUTES | FILE_READ_DATA,
+ FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, OPEN_EXISTING,
+ FILE_ATTRIBUTE_NORMAL, NULL);
+
+ /* If open failed, return error */
+ if (vhandle == INVALID_HANDLE_VALUE)
+ return (__os_posix_err(__os_get_syserr()));
+
+ /* Get the volume information through the root path. */
+ if (!GetVolumeInformation(root_path, NULL, name_size, NULL, &mcl,
+ &flags, name_buffer, name_size)) {
+ ret = __os_posix_err(__os_get_syserr());
+ CloseHandle(vhandle);
+ return (ret);
+ }
+
+ ret = 0;
+ if (_tcscmp(name_buffer, _T("NTFS")) == 0) {
+ /*
+ * If this is NTFS file system, use FSCTL_GET_NTFS_VOLUME_DATA
+ * to get the cluster size.
+ */
+		if (DeviceIoControl(
+		    vhandle,			/* volume handle */
+		    FSCTL_GET_NTFS_VOLUME_DATA,	/* control code */
+		    NULL,			/* input buffer: not used */
+		    0,				/* input size: not used */
+		    &ntfsinfo,			/* output buffer */
+		    sizeof(NTFS_VOLUME_DATA_BUFFER),/* output buffer length */
+		    &infolen,			/* number of returned bytes */
+		    NULL))			/* OVERLAPPED: not used */
+ *psize = ntfsinfo.BytesPerCluster;
+ else
+ ret = __os_posix_err(__os_get_syserr());
+ } else if (_tcscmp(name_buffer, _T("exFAT")) == 0) {
+ /*
+ * If this is exFAT file system, read the information of sector
+ * and cluster from the BPB on sector 0
+ * +6C H: BYTE SectorSizeShift
+ * +6D H: BYTE ClusterShift
+ */
+ if (ReadFile(vhandle, buffer, MAX_SECTOR_SIZE, &length, NULL)) {
+ sectorshift = *(BYTE *)(&buffer[0x6C]);
+ clustershift = *(BYTE *)(&buffer[0x6D]);
+ *psize = 1 << sectorshift;
+ *psize = (*psize) << clustershift;
+		} else
+ ret = __os_posix_err(__os_get_syserr());
+ } else if (_tcscmp(name_buffer, _T("FAT")) == 0 ||
+ _tcscmp(name_buffer, _T("FAT32")) == 0) {
+ /*
+ * If this is FAT or FAT32 file system, read the information of
+ * sector and cluster from the BPB on sector 0.
+ * +0B H: WORD Bytes per Sector.
+ * +0D H: BYTE Sectors Per Cluster.
+ */
+ if (ReadFile(vhandle, buffer, MAX_SECTOR_SIZE, &length, NULL)) {
+ psector = (WORD *)(&buffer[0x0B]);
+ pcluster = (BYTE *)(&buffer[0x0D]);
+ *psize = (*psector) * (*pcluster);
+		} else
+ ret = __os_posix_err(__os_get_syserr());
+ }
+
+ CloseHandle(vhandle);
+ return (ret);
+#endif
+}
+
+/*
+ * __os_exists --
+ * Return if the file exists.
+ */
+int
+__os_exists(env, path, isdirp)
+ ENV *env;
+ const char *path;
+ int *isdirp;
+{
+ DB_ENV *dbenv;
+ DWORD attrs;
+ _TCHAR *tpath;
+ int ret;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+
+ TO_TSTRING(env, path, tpath, ret);
+ if (ret != 0)
+ return (ret);
+
+ if (dbenv != NULL &&
+ FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0033", "fileops: stat %s",
+ "%s"), path);
+
+ RETRY_CHK(
+ ((attrs = GetFileAttributes(tpath)) == (DWORD)-1 ? 1 : 0), ret);
+ if (ret == 0) {
+ if (isdirp != NULL)
+ *isdirp = (attrs & FILE_ATTRIBUTE_DIRECTORY);
+ } else
+ ret = __os_posix_err(ret);
+
+ FREE_STRING(env, tpath);
+ return (ret);
+}
+
+/*
+ * __os_ioinfo --
+ * Return file size and I/O size; abstracted to make it easier
+ * to replace.
+ */
+int
+__os_ioinfo(env, path, fhp, mbytesp, bytesp, iosizep)
+ ENV *env;
+ const char *path;
+ DB_FH *fhp;
+ u_int32_t *mbytesp, *bytesp, *iosizep;
+{
+ int ret;
+ BY_HANDLE_FILE_INFORMATION bhfi;
+ unsigned __int64 filesize;
+ u_int32_t io_sz;
+
+ RETRY_CHK((!GetFileInformationByHandle(fhp->handle, &bhfi)), ret);
+ if (ret != 0) {
+ __db_syserr(env, ret, DB_STR("0034",
+ "GetFileInformationByHandle"));
+ return (__os_posix_err(ret));
+ }
+
+ filesize = ((unsigned __int64)bhfi.nFileSizeHigh << 32) +
+ bhfi.nFileSizeLow;
+
+ /* Return the size of the file. */
+ if (mbytesp != NULL)
+ *mbytesp = (u_int32_t)(filesize / MEGABYTE);
+ if (bytesp != NULL)
+ *bytesp = (u_int32_t)(filesize % MEGABYTE);
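+	/*
+	 * For example, a file of 5 * MEGABYTE + 512 bytes is returned as
+	 * *mbytesp == 5 and *bytesp == 512.
+	 */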
+
+ if (iosizep != NULL) {
+ /*
+		 * Attempt to retrieve the file system cluster size; if the
+		 * call succeeds and the value returned is reasonable (more
+		 * than 1024 bytes), use it as the I/O size. Otherwise use a
+		 * sensible default.
+ */
+ if (__os_get_cluster_size(path, &io_sz) != 0 || io_sz < 1025)
+ *iosizep = DB_DEF_IOSIZE;
+ else
+ *iosizep = io_sz;
+ }
+ return (0);
+}
diff --git a/src/os_windows/os_truncate.c b/src/os_windows/os_truncate.c
new file mode 100644
index 00000000..fcbb37b2
--- /dev/null
+++ b/src/os_windows/os_truncate.c
@@ -0,0 +1,99 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2004, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_truncate --
+ * Truncate the file.
+ */
+int
+__os_truncate(env, fhp, pgno, pgsize)
+ ENV *env;
+ DB_FH *fhp;
+ db_pgno_t pgno;
+ u_int32_t pgsize;
+{
+	/* Yes, this really is how Microsoft designed their API. */
+ union {
+ __int64 bigint;
+ struct {
+ unsigned long low;
+ long high;
+ };
+ } off;
+ DB_ENV *dbenv;
+ off_t offset;
+ int ret;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+ offset = (off_t)pgsize * pgno;
+ ret = 0;
+
+ if (dbenv != NULL &&
+ FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0021", "fileops: truncate %s to %lu",
+ "%s %lu"), fhp->name, (u_long)offset);
+
+#ifdef HAVE_FILESYSTEM_NOTZERO
+ /*
+ * If the filesystem doesn't zero fill, it isn't safe to extend the
+ * file, or we end up with junk blocks. Just return in that case.
+ */
+ if (__os_fs_notzero()) {
+ off_t stat_offset;
+ u_int32_t mbytes, bytes;
+
+ /* Stat the file. */
+ if ((ret =
+ __os_ioinfo(env, NULL, fhp, &mbytes, &bytes, NULL)) != 0)
+ return (ret);
+ stat_offset = (off_t)mbytes * MEGABYTE + bytes;
+
+ if (offset > stat_offset)
+ return (0);
+ }
+#endif
+
+ LAST_PANIC_CHECK_BEFORE_IO(env);
+
+ /*
+ * Windows doesn't provide truncate directly. Instead, it has
+ * SetEndOfFile, which truncates to the current position. To
+ * deal with that, we open a duplicate file handle for truncating.
+ *
+ * We want to retry the truncate call, which involves a SetFilePointer
+ * and a SetEndOfFile, but there are several complications:
+ *
+ * 1) since the Windows API deals in 32-bit values, it's possible that
+ * the return from SetFilePointer (the low 32-bits) is
+ * INVALID_SET_FILE_POINTER even when the call has succeeded. So we
+ * have to also check whether GetLastError() returns NO_ERROR.
+ *
+ * 2) when it returns, SetFilePointer overwrites the high bits of the
+ * offset, so if we need to retry, we have to reset the offset each
+ * time.
+ *
+ * We can't switch to SetFilePointerEx, which knows about 64-bit
+ * offsets, because it isn't supported on Win9x/ME.
+ */
+ RETRY_CHK((off.bigint = (__int64)pgsize * pgno,
+ (SetFilePointer(fhp->trunc_handle, off.low, &off.high, FILE_BEGIN)
+ == INVALID_SET_FILE_POINTER && GetLastError() != NO_ERROR) ||
+ !SetEndOfFile(fhp->trunc_handle)), ret);
+
+ if (ret != 0) {
+ __db_syserr(env, ret, DB_STR_A("0022", "SetFilePointer: %lu",
+ "%lu"), pgno * pgsize);
+ ret = __os_posix_err(ret);
+ }
+
+ return (ret);
+}
diff --git a/src/os_windows/os_unlink.c b/src/os_windows/os_unlink.c
new file mode 100644
index 00000000..6a0a6572
--- /dev/null
+++ b/src/os_windows/os_unlink.c
@@ -0,0 +1,123 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_unlink --
+ * Remove a file.
+ */
+int
+__os_unlink(env, path, overwrite_test)
+ ENV *env;
+ const char *path;
+ int overwrite_test;
+{
+ DB_ENV *dbenv;
+ HANDLE h;
+ _TCHAR *tpath, *orig_tpath, buf[DB_MAXPATHLEN];
+ u_int32_t id;
+ int ret, t_ret;
+
+ dbenv = env == NULL ? NULL : env->dbenv;
+
+ if (dbenv != NULL &&
+ FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0028", "fileops: unlink %s",
+ "%s"), path);
+
+ /* Optionally overwrite the contents of the file to enhance security. */
+ if (dbenv != NULL && overwrite_test && F_ISSET(dbenv, DB_ENV_OVERWRITE))
+ (void)__db_file_multi_write(env, path);
+
+ TO_TSTRING(env, path, tpath, ret);
+ if (ret != 0)
+ return (ret);
+ orig_tpath = tpath;
+
+ LAST_PANIC_CHECK_BEFORE_IO(env);
+
+ /*
+ * Windows NT and its descendants allow removal of open files, but the
+ * DeleteFile Win32 system call isn't equivalent to a POSIX unlink.
+ * Firstly, it only succeeds if FILE_SHARE_DELETE is set when the file
+ * is opened. Secondly, it leaves the file in a "zombie" state, where
+ * it can't be opened again, but a new file with the same name can't be
+ * created either.
+ *
+ * Since we depend on being able to recreate files (during recovery,
+ * say), we have to first rename the file, and then delete it. It
+ * still hangs around, but with a name we don't care about. The rename
+ * will fail if the file doesn't exist, which isn't a problem, but if
+ * it fails for some other reason, we need to know about it or a
+ * subsequent open may fail for no apparent reason.
+ */
+ if (__os_is_winnt()) {
+ __os_unique_id(env, &id);
+ _sntprintf(buf, DB_MAXPATHLEN, _T("%s.del.%010u"), tpath, id);
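+		/* E.g., "foo.db" is renamed to "foo.db.del.0000001234". */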
+ if (MoveFile(tpath, buf))
+ tpath = buf;
+		else {
+			ret = __os_get_syserr();
+			/*
+			 * The system doesn't always return ENOENT when the
+			 * file is missing, so double check here: report the
+			 * error if the file exists, otherwise set the return
+			 * value to ENOENT.
+			 */
+			if (__os_posix_err(ret) != ENOENT) {
+				if (__os_exists(env, path, NULL) == 0)
+					__db_err(env, ret, DB_STR_A("0029",
+					    "MoveFile: "
+					    "rename %s to temporary file",
+					    "%s"), path);
+				else
+					ret = ENOENT;
+			}
+		}
+
+ /*
+ * Try removing the file using the delete-on-close flag. This
+ * plays nicer with files that are still open than DeleteFile.
+ */
+ h = CreateFile(tpath, 0,
+ FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
+ NULL, OPEN_EXISTING, FILE_FLAG_DELETE_ON_CLOSE, 0);
+ if (h != INVALID_HANDLE_VALUE) {
+			(void)CloseHandle(h);
+ if (GetFileAttributes(tpath) == INVALID_FILE_ATTRIBUTES)
+ goto skipdel;
+ }
+ }
+
+ RETRY_CHK((!DeleteFile(tpath)), ret);
+
+skipdel:
+ FREE_STRING(env, orig_tpath);
+
+ /*
+ * XXX
+ * We shouldn't be testing for an errno of ENOENT here, but ENOENT
+ * signals that a file is missing, and we attempt to unlink things
+ * (such as v. 2.x environment regions, in ENV->remove) that we
+ * are expecting not to be there. Reporting errors in these cases
+ * is annoying.
+ */
+ if ((ret != 0) && (t_ret = __os_posix_err(ret)) != ENOENT) {
+ /* Double check if the file exists. */
+ if (__os_exists(env, path, NULL) == 0) {
+ __db_syserr(env, ret, DB_STR_A("0030",
+ "DeleteFile: %s", "%s"), path);
+ ret = t_ret;
+ } else
+ ret = ENOENT;
+ }
+
+ return (ret);
+}
diff --git a/src/os_windows/os_yield.c b/src/os_windows/os_yield.c
new file mode 100644
index 00000000..0d32ef69
--- /dev/null
+++ b/src/os_windows/os_yield.c
@@ -0,0 +1,35 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_yield --
+ * Yield the processor, optionally pausing until running again.
+ */
+void
+__os_yield(env, secs, usecs)
+ ENV *env;
+ u_long secs, usecs; /* Seconds and microseconds. */
+{
+ COMPQUIET(env, NULL);
+
+	/* Don't require that the values be normalized. */
+ for (; usecs >= US_PER_SEC; usecs -= US_PER_SEC)
+ ++secs;
+
+ /*
+ * Yield the processor so other processes or threads can run.
+ *
+ * Sheer raving paranoia -- don't sleep for 0 time, in case some
+ * implementation doesn't yield the processor in that case.
+ */
+ Sleep(secs * MS_PER_SEC + (usecs / US_PER_MS) + 1);
+}
diff --git a/src/qam/qam.c b/src/qam/qam.c
new file mode 100644
index 00000000..e81d4795
--- /dev/null
+++ b/src/qam/qam.c
@@ -0,0 +1,1760 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/qam.h"
+
+static int __qam_bulk __P((DBC *, DBT *, u_int32_t));
+static int __qamc_close __P((DBC *, db_pgno_t, int *));
+static int __qamc_del __P((DBC *, u_int32_t));
+static int __qamc_destroy __P((DBC *));
+static int __qamc_get __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
+static int __qamc_put __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
+static int __qam_consume __P((DBC *, QMETA *, db_recno_t));
+static int __qam_getno __P((DB *, const DBT *, db_recno_t *));
+
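+/*
+ * Record locks can be skipped when the cursor has no transaction, or when
+ * it runs at read-committed or read-uncommitted isolation.
+ */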
+#define DONT_NEED_LOCKS(dbc) ((dbc)->txn == NULL || \
+ F_ISSET(dbc, DBC_READ_COMMITTED | DBC_READ_UNCOMMITTED))
+
+/*
+ * __qam_position --
+ * Position a queued access method cursor at a record. This returns
+ * the page locked. *exactp will be set if the record is valid.
+ * PUBLIC: int __qam_position
+ * PUBLIC: __P((DBC *, db_recno_t *, u_int32_t, int *));
+ */
+int
+__qam_position(dbc, recnop, get_mode, exactp)
+ DBC *dbc; /* open cursor */
+ db_recno_t *recnop; /* pointer to recno to find */
+ u_int32_t get_mode; /* flags to __memp_fget */
+ int *exactp; /* indicate if it was found */
+{
+ DB *dbp;
+ QAMDATA *qp;
+ QUEUE_CURSOR *cp;
+ db_pgno_t pg;
+ int ret;
+
+ dbp = dbc->dbp;
+ cp = (QUEUE_CURSOR *)dbc->internal;
+
+ /* Fetch the page for this recno. */
+ cp->pgno = pg = QAM_RECNO_PAGE(dbp, *recnop);
+
+ cp->page = NULL;
+ *exactp = 0;
+ if ((ret = __qam_fget(dbc, &pg, get_mode, &cp->page)) != 0) {
+ if (!FLD_ISSET(get_mode, DB_MPOOL_CREATE) &&
+ (ret == DB_PAGE_NOTFOUND || ret == ENOENT))
+ ret = 0;
+ return (ret);
+ }
+ cp->indx = QAM_RECNO_INDEX(dbp, pg, *recnop);
+
+ if (PGNO(cp->page) == 0) {
+ /*
+ * We have read an uninitialized page: set the page number if
+ * we're creating the page. Otherwise, we know that the record
+ * doesn't exist yet.
+ */
+ if (!FLD_ISSET(get_mode, DB_MPOOL_CREATE)) {
+ *exactp = 0;
+ return (0);
+ }
+ DB_ASSERT(dbp->env, FLD_ISSET(get_mode, DB_MPOOL_CREATE));
+ PGNO(cp->page) = pg;
+ TYPE(cp->page) = P_QAMDATA;
+ }
+
+ qp = QAM_GET_RECORD(dbp, cp->page, cp->indx);
+ *exactp = F_ISSET(qp, QAM_VALID) ? 1 : 0;
+
+ return (ret);
+}
+
+/*
+ * __qam_pitem --
+ * Put an item on a queue page. Copy the data to the page and set the
+ * VALID and SET bits. If logging and the record was previously set,
+ * log that data, otherwise just log the new data.
+ *
+ * pagep must be write locked
+ *
+ * PUBLIC: int __qam_pitem
+ * PUBLIC: __P((DBC *, QPAGE *, u_int32_t, db_recno_t, DBT *));
+ */
+int
+__qam_pitem(dbc, pagep, indx, recno, data)
+ DBC *dbc;
+ QPAGE *pagep;
+ u_int32_t indx;
+ db_recno_t recno;
+ DBT *data;
+{
+ DB *dbp;
+ DBT olddata, pdata, *datap;
+ ENV *env;
+ QAMDATA *qp;
+ QUEUE *t;
+ u_int8_t *dest, *p;
+ int allocated, ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ t = (QUEUE *)dbp->q_internal;
+ allocated = ret = 0;
+
+ if (data->size > t->re_len)
+ return (__db_rec_toobig(env, data->size, t->re_len));
+ qp = QAM_GET_RECORD(dbp, pagep, indx);
+
+ p = qp->data;
+ datap = data;
+ if (F_ISSET(data, DB_DBT_PARTIAL)) {
+ if (data->doff + data->dlen > t->re_len) {
+ __db_errx(env, DB_STR_A("1142",
+"Record length error: data offset plus length larger than record size of %lu",
+ "%s %lu"), (u_long)t->re_len);
+ return (EINVAL);
+ }
+
+ if (data->size != data->dlen)
+ return (__db_rec_repl(env, data->size, data->dlen));
+
+ if (data->size == t->re_len)
+ goto no_partial;
+
+ /*
+ * If we are logging, then we have to build the record
+ * first, otherwise, we can simply drop the change
+ * directly on the page. After this clause, make
+ * sure that datap and p are set up correctly so that
+ * copying datap into p does the right thing.
+ *
+ * Note, I am changing this so that if the existing
+ * record is not valid, we create a complete record
+	 * to log, so that both this and the recovery code are simpler.
+ */
+
+ if (DBC_LOGGING(dbc) || !F_ISSET(qp, QAM_VALID)) {
+ datap = &pdata;
+ memset(datap, 0, sizeof(*datap));
+
+ if ((ret = __os_malloc(env,
+ t->re_len, &datap->data)) != 0)
+ return (ret);
+ allocated = 1;
+ datap->size = t->re_len;
+
+ /*
+ * Construct the record if it's valid, otherwise set it
+ * all to the pad character.
+ */
+ dest = datap->data;
+ if (F_ISSET(qp, QAM_VALID))
+ memcpy(dest, p, t->re_len);
+ else
+ memset(dest, (int)t->re_pad, t->re_len);
+
+ dest += data->doff;
+ memcpy(dest, data->data, data->size);
+ } else {
+ datap = data;
+ p += data->doff;
+ }
+ }
+
+no_partial:
+ if (DBC_LOGGING(dbc)) {
+ olddata.size = 0;
+ if (F_ISSET(qp, QAM_SET)) {
+ olddata.data = qp->data;
+ olddata.size = t->re_len;
+ }
+ if ((ret = __qam_add_log(dbp, dbc->txn, &LSN(pagep),
+ 0, &LSN(pagep), pagep->pgno,
+ indx, recno, datap, qp->flags,
+ olddata.size == 0 ? NULL : &olddata)) != 0)
+ goto err;
+ } else if (!F_ISSET((dbc), DBC_RECOVER))
+ LSN_NOT_LOGGED(LSN(pagep));
+
+ F_SET(qp, QAM_VALID | QAM_SET);
+ memcpy(p, datap->data, datap->size);
+ if (!F_ISSET(data, DB_DBT_PARTIAL))
+ memset(p + datap->size,
+ (int)t->re_pad, t->re_len - datap->size);
+
+err: if (allocated)
+ __os_free(env, datap->data);
+
+ return (ret);
+}
+/*
+ * __qamc_put
+ * Cursor put for queued access method.
+ * BEFORE and AFTER cannot be specified.
+ */
+static int
+__qamc_put(dbc, key, data, flags, pgnop)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+ db_pgno_t *pgnop;
+{
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ QMETA *meta;
+ QUEUE_CURSOR *cp;
+ db_pgno_t metapg;
+ db_recno_t new_cur, new_first;
+ u_int32_t opcode;
+ int exact, ret, t_ret, writelock;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ mpf = dbp->mpf;
+ if (pgnop != NULL)
+ *pgnop = PGNO_INVALID;
+
+ cp = (QUEUE_CURSOR *)dbc->internal;
+
+ switch (flags) {
+ case DB_KEYFIRST:
+ case DB_KEYLAST:
+ case DB_NOOVERWRITE:
+ case DB_OVERWRITE_DUP:
+ if ((ret = __qam_getno(dbp, key, &cp->recno)) != 0)
+ return (ret);
+ /* FALLTHROUGH */
+ case DB_CURRENT:
+ break;
+ default:
+ /* The interface shouldn't let anything else through. */
+ return (__db_ferr(env, "DBC->put", 0));
+ }
+
+ /* Write lock the record. */
+ if ((ret = __db_lget(dbc, LCK_COUPLE,
+ cp->recno, DB_LOCK_WRITE, DB_LOCK_RECORD, &cp->lock)) != 0)
+ return (ret);
+
+ if ((ret = __qam_position(dbc, &cp->recno,
+ DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &exact)) != 0) {
+ /* We could not get the page, we can release the record lock. */
+ (void)__LPUT(dbc, cp->lock);
+ return (ret);
+ }
+
+ if (exact != 0 && flags == DB_NOOVERWRITE)
+ ret = DB_KEYEXIST;
+ else
+ /* Put the item on the page. */
+ ret = __qam_pitem(dbc,
+ (QPAGE *)cp->page, cp->indx, cp->recno, data);
+
+ if ((t_ret = __qam_fput(dbc,
+ cp->pgno, cp->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ cp->page = NULL;
+ cp->lock_mode = DB_LOCK_WRITE;
+ if (ret != 0)
+ return (ret);
+
+ /* Unlock the record if not in a transaction. */
+ if ((ret = __TLPUT(dbc, cp->lock)) != 0)
+ return (ret);
+
+ /* We may need to reset the head or tail of the queue. */
+ metapg = ((QUEUE *)dbp->q_internal)->q_meta;
+
+ writelock = 0;
+ if ((ret = __memp_fget(mpf, &metapg,
+ dbc->thread_info, dbc->txn, 0, &meta)) != 0)
+ return (ret);
+
+ opcode = 0;
+ new_cur = new_first = 0;
+
+ /*
+ * If the put address is outside the queue, adjust the head and
+ * tail of the queue. If the order is inverted we move
+ * the one which is closer. The first case is when the
+ * queue is empty, move first and current to where the new
+ * insert is.
+ */
+
+recheck:
+ if (meta->first_recno == meta->cur_recno) {
+ new_first = cp->recno;
+ new_cur = cp->recno;
+ QAM_INC_RECNO(new_cur);
+ opcode |= QAM_SETFIRST;
+ opcode |= QAM_SETCUR;
+ } else {
+ if (QAM_BEFORE_FIRST(meta, cp->recno)) {
+ new_first = cp->recno;
+ opcode |= QAM_SETFIRST;
+ }
+
+ if (QAM_AFTER_CURRENT(meta, cp->recno)) {
+ new_cur = cp->recno;
+ QAM_INC_RECNO(new_cur);
+ opcode |= QAM_SETCUR;
+ }
+ }
+
+ if (opcode == 0)
+ goto done;
+
+ /* Exclusive latch the metadata page. */
+ if (writelock == 0 && (ret = __memp_dirty(mpf, &meta,
+ dbc->thread_info, dbc->txn, dbc->priority, DB_MPOOL_DIRTY)) != 0)
+ goto done;
+	if (writelock++ == 0) {
+		/*
+		 * The latch upgrade may have let another thread move the
+		 * queue head or tail; recompute the adjustment from scratch.
+		 */
+		opcode = 0;
+		new_cur = new_first = 0;
+		goto recheck;
+	}
+
+ if (DBC_LOGGING(dbc) && (ret = __qam_mvptr_log(dbp, dbc->txn,
+ &meta->dbmeta.lsn, 0, opcode, meta->first_recno,
+ new_first, meta->cur_recno, new_cur,
+ &meta->dbmeta.lsn, PGNO_BASE_MD)) != 0)
+ opcode = 0;
+
+ if (opcode & QAM_SETCUR)
+ meta->cur_recno = new_cur;
+ if (opcode & QAM_SETFIRST)
+ meta->first_recno = new_first;
+
+ QAM_WAKEUP(dbc, ret);
+
+done: if (meta != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, meta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __qam_append --
+ * Perform a put(DB_APPEND) in queue.
+ *
+ * PUBLIC: int __qam_append __P((DBC *, DBT *, DBT *));
+ */
+int
+__qam_append(dbc, key, data)
+ DBC *dbc;
+ DBT *key, *data;
+{
+ DB *dbp;
+ DB_LOCK lock;
+ DB_MPOOLFILE *mpf;
+ QMETA *meta;
+ QPAGE *page;
+ QUEUE *qp;
+ QUEUE_CURSOR *cp;
+ db_pgno_t pg, metapg;
+ db_recno_t recno;
+ int ret, t_ret, waited;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ cp = (QUEUE_CURSOR *)dbc->internal;
+ LOCK_INIT(lock);
+
+ /* Exclusive latch the meta page. */
+ metapg = ((QUEUE *)dbp->q_internal)->q_meta;
+again: if ((ret = __memp_fget(mpf, &metapg,
+ dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &meta)) != 0)
+ return (ret);
+
+ /* Get the next record number. */
+ recno = meta->cur_recno;
+ QAM_INC_RECNO(meta->cur_recno);
+
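+	/*
+	 * If the increment wrapped the record number space all the way
+	 * around to the first record still in the queue, the queue is full.
+	 */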
+ if (meta->cur_recno == meta->first_recno) {
+ QAM_DEC_RECNO(meta->cur_recno);
+ ret = EFBIG;
+ goto err;
+ }
+
+ if (QAM_BEFORE_FIRST(meta, recno))
+ meta->first_recno = recno;
+
+ /* Lock the record. */
+ waited = 0;
+ ret = __db_lget(dbc, 0, recno,
+ DB_LOCK_WRITE, DB_LOCK_NOWAIT | DB_LOCK_RECORD, &lock);
+
+ /* Release the meta page. */
+ if ((t_ret = __memp_fput(mpf,
+ dbc->thread_info, meta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ meta = NULL;
+ /* If we couldn't lock the record try again. */
+ if (t_ret == 0 &&
+ (ret == DB_LOCK_NOTGRANTED || ret == DB_LOCK_DEADLOCK)) {
+ waited = 1;
+ ret = __db_lget(dbc, 0, recno,
+ DB_LOCK_WRITE, DB_LOCK_RECORD, &lock);
+ }
+
+ /*
+ * The application may modify the data based on the selected record
+ * number. We always want to call this even if we ultimately end
+ * up aborting, because we are allocating a record number, regardless.
+ */
+ if (dbc->dbp->db_append_recno != NULL &&
+ (t_ret = dbc->dbp->db_append_recno(dbc->dbp, data, recno)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+
+ /*
+ * Capture errors from either the lock couple or the call to
+ * dbp->db_append_recno.
+ */
+ if (ret != 0)
+ goto err;
+
+ pg = QAM_RECNO_PAGE(dbp, recno);
+
+ /* Fetch for write the data page. */
+ if ((ret = __qam_fget(dbc, &pg,
+ DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &page)) != 0)
+ goto err;
+
+ /* See if this is a new page. */
+ if (page->pgno == 0) {
+ page->pgno = pg;
+ page->type = P_QAMDATA;
+ } else if (waited && F_ISSET(QAM_GET_RECORD(
+ dbp, page, QAM_RECNO_INDEX(dbp, pg, recno)), QAM_VALID)) {
+ /* The record is in use, try again. */
+ if ((ret = __qam_fput(dbc, pg, page, dbc->priority)) != 0)
+ goto err;
+ if ((ret = __LPUT(dbc, lock)) != 0)
+ goto err;
+ goto again;
+ }
+
+ cp->lock = lock;
+ cp->lock_mode = DB_LOCK_WRITE;
+ LOCK_INIT(lock);
+
+ /* Put the item on the page and log it. */
+ ret = __qam_pitem(dbc, page,
+ QAM_RECNO_INDEX(dbp, pg, recno), recno, data);
+
+ if ((t_ret = __qam_fput(dbc,
+ pg, page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Return the record number to the user. */
+ if (ret == 0 && key != NULL)
+ ret = __db_retcopy(dbp->env, key,
+ &recno, sizeof(recno), &dbc->rkey->data, &dbc->rkey->ulen);
+
+ /* Position the cursor on this record. */
+ cp->recno = recno;
+
+ /* See if we are leaving the extent. */
+ qp = (QUEUE *) dbp->q_internal;
+ if (qp->page_ext != 0 &&
+ (recno % (qp->page_ext * qp->rec_page) == 0 ||
+ recno == UINT32_MAX)) {
+ if ((ret = __memp_fget(mpf, &metapg,
+ dbc->thread_info, dbc->txn, 0, &meta)) != 0)
+ goto err;
+ if (!QAM_AFTER_CURRENT(meta, recno))
+ if ((ret = __qam_fclose(dbp, pg)) != 0)
+ goto err;
+ }
+
+ QAM_WAKEUP(dbc, ret);
+
+err: /* Release the meta page. */
+ if (meta != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, meta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __LPUT(dbc, lock)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __qamc_del --
+ * Qam cursor->am_del function
+ */
+static int
+__qamc_del(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DBT data;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ QAMDATA *qp;
+ QMETA *meta;
+ QUEUE_CURSOR *cp;
+ db_pgno_t metapg;
+ db_recno_t first;
+ int exact, ret, t_ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ cp = (QUEUE_CURSOR *)dbc->internal;
+
+ metapg = ((QUEUE *)dbp->q_internal)->q_meta;
+
+ /* Read latch the meta page. */
+ if ((ret = __memp_fget(mpf, &metapg,
+ dbc->thread_info, dbc->txn, 0, &meta)) != 0)
+ return (ret);
+
+ if (QAM_NOT_VALID(meta, cp->recno)) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ first = meta->first_recno;
+
+ /* Don't hold the meta page long term. */
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, meta, dbc->priority)) != 0)
+ goto err;
+ meta = NULL;
+
+ /* Get the record. */
+ if ((ret = __db_lget(dbc, LCK_COUPLE,
+ cp->recno, DB_LOCK_WRITE, DB_LOCK_RECORD, &cp->lock)) != 0)
+ goto err;
+ cp->lock_mode = DB_LOCK_WRITE;
+
+ /* Find the record; delete only deletes exact matches. */
+ if ((ret = __qam_position(dbc, &cp->recno,
+ DB_MPOOL_DIRTY, &exact)) != 0)
+ goto err;
+
+ if (!exact) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+
+ pagep = cp->page;
+ qp = QAM_GET_RECORD(dbp, pagep, cp->indx);
+
+ if (DBC_LOGGING(dbc)) {
+ if (((QUEUE *)dbp->q_internal)->page_ext == 0 ||
+ ((QUEUE *)dbp->q_internal)->re_len == 0) {
+ if ((ret = __qam_del_log(dbp,
+ dbc->txn, &LSN(pagep), 0, &LSN(pagep),
+ pagep->pgno, cp->indx, cp->recno)) != 0)
+ goto err;
+ } else {
+ data.size = ((QUEUE *)dbp->q_internal)->re_len;
+ data.data = qp->data;
+ if ((ret = __qam_delext_log(dbp,
+ dbc->txn, &LSN(pagep), 0, &LSN(pagep),
+ pagep->pgno, cp->indx, cp->recno, &data)) != 0)
+ goto err;
+ }
+ } else
+ LSN_NOT_LOGGED(LSN(pagep));
+
+ F_CLR(qp, QAM_VALID);
+ if ((ret = __qam_fput(dbc,
+ cp->pgno, cp->page, dbc->priority)) != 0)
+ goto err;
+ cp->page = NULL;
+
+ /*
+ * Other threads cannot move first_recno past
+ * our position while we have the record locked.
+ * If it's pointing at the deleted record then get
+ * the metapage and check again, as a lower-numbered
+ * record may have been inserted.
+ */
+ if (LF_ISSET(DB_CONSUME) || cp->recno == first) {
+ if ((ret = __memp_fget(mpf, &metapg,
+ dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &meta)) != 0)
+ goto err;
+ if (LF_ISSET(DB_CONSUME) || cp->recno == meta->first_recno)
+ ret = __qam_consume(dbc, meta, RECNO_OOB);
+ }
+
+err: if (meta != NULL && (t_ret = __memp_fput(mpf, dbc->thread_info,
+ meta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (cp->page != NULL &&
+ (t_ret = __qam_fput(dbc,
+ cp->pgno, cp->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ cp->page = NULL;
+
+ return (ret);
+}
+
+#ifdef DEBUG_WOP
+#define QDEBUG
+#endif
+
+/*
+ * __qamc_get --
+ * Queue DBC->get function.
+ */
+static int
+__qamc_get(dbc, key, data, flags, pgnop)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+ db_pgno_t *pgnop;
+{
+ DB *dbp;
+ DBC *dbcdup;
+ DBT tmp;
+ DB_LOCK lock, metalock;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ PAGE *pg;
+ QAMDATA *qp;
+ QMETA *meta;
+ QUEUE *t;
+ QUEUE_CURSOR *cp;
+ db_lockmode_t lock_mode;
+ db_pgno_t metapno;
+ db_recno_t first;
+ int exact, inorder, is_first, ret, t_ret, wait, with_delete;
+ int retrying;
+ u_int32_t skip, meta_mode;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ mpf = dbp->mpf;
+ cp = (QUEUE_CURSOR *)dbc->internal;
+ LOCK_INIT(lock);
+
+ lock_mode = F_ISSET(dbc, DBC_RMW) ? DB_LOCK_WRITE : DB_LOCK_READ;
+ meta_mode = 0;
+ meta = NULL;
+ *pgnop = 0;
+ pg = NULL;
+ retrying = t_ret = wait = with_delete = 0;
+
+ if (flags == DB_CONSUME_WAIT) {
+ wait = 1;
+ flags = DB_CONSUME;
+ }
+ if (flags == DB_CONSUME) {
+ with_delete = 1;
+ flags = DB_FIRST;
+ meta_mode = DB_MPOOL_DIRTY;
+ lock_mode = DB_LOCK_WRITE;
+ }
+ inorder = F_ISSET(dbp, DB_AM_INORDER) && with_delete;
+
+ DEBUG_LREAD(dbc, dbc->txn, "qamc_get",
+ flags == DB_SET || flags == DB_SET_RANGE ? key : NULL, NULL, flags);
+
+ /* Make lint and friends happy. */
+ is_first = 0;
+ first = 0;
+
+ t = (QUEUE *)dbp->q_internal;
+ metapno = t->q_meta;
+
+ /*
+ * Get the meta page first.
+ */
+ if ((ret = __memp_fget(mpf, &metapno,
+ dbc->thread_info, dbc->txn, meta_mode, &meta)) != 0)
+ return (ret);
+
+ /* Release any previous lock if not in a transaction. */
+ if ((ret = __TLPUT(dbc, cp->lock)) != 0)
+ goto err;
+
+ skip = 0;
+retry: /* Update the record number. */
+ switch (flags) {
+ case DB_CURRENT:
+ break;
+ case DB_NEXT_DUP:
+ case DB_PREV_DUP:
+ ret = DB_NOTFOUND;
+ goto err;
+ /* NOTREACHED */
+ case DB_NEXT:
+ case DB_NEXT_NODUP:
+ if (cp->recno != RECNO_OOB) {
+ if (with_delete && !inorder &&
+ QAM_BEFORE_FIRST(meta, cp->recno))
+ cp->recno = meta->first_recno;
+ else
+ QAM_INC_RECNO(cp->recno);
+ /*
+ * Check to see if we are out of data.
+ */
+ if (QAM_AFTER_CURRENT(meta, cp->recno)) {
+ pg = NULL;
+ if (!wait) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ /*
+ * If we skipped a locked record, go back and
+ * find it. If we find a locked record again
+ * wait for it.
+ */
+ if (skip == 1 &&
+ !QAM_AFTER_CURRENT(meta, first)) {
+ retrying = 1;
+ cp->recno = first;
+ goto dolock;
+ }
+ flags = DB_FIRST;
+
+ if (CDB_LOCKING(env)) {
+ /* Drop the metapage before we wait. */
+ ret = __memp_fput(mpf, dbc->thread_info,
+ meta, dbc->priority);
+ meta = NULL;
+ if (ret != 0)
+ goto err;
+ if ((ret = __lock_get(
+ env, dbc->locker,
+ DB_LOCK_SWITCH, &dbc->lock_dbt,
+ DB_LOCK_WAIT, &dbc->mylock)) != 0)
+ goto err;
+
+ if ((ret = __lock_get(
+ env, dbc->locker,
+ DB_LOCK_UPGRADE, &dbc->lock_dbt,
+ DB_LOCK_WRITE, &dbc->mylock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &metapno,
+ dbc->thread_info,
+ dbc->txn, meta_mode, &meta)) != 0)
+ goto err;
+ goto retry;
+ }
+
+ /*
+ * Put us in the wait queue; when someone
+ * adds something they will unlock it.
+ */
+ if ((ret = __db_lget(dbc,
+ 0, PGNO_INVALID, DB_LOCK_WAIT,
+ DB_LOCK_NOWAIT, &metalock)) != 0)
+ goto err;
+
+ /* Drop the metapage before we wait. */
+ ret = __memp_fput(mpf,
+ dbc->thread_info, meta, dbc->priority);
+ meta = NULL;
+ if (ret != 0)
+ goto err;
+
+ /* Upgrade the lock to wait on it. */
+ if ((ret = __db_lget(dbc, 0,
+ PGNO_INVALID, DB_LOCK_WAIT,
+ DB_LOCK_UPGRADE, &metalock)) != 0) {
+ if (ret == DB_LOCK_DEADLOCK)
+ ret = DB_LOCK_NOTGRANTED;
+ goto err;
+ }
+
+ if ((ret = __memp_fget(mpf,
+ &metapno, dbc->thread_info, dbc->txn,
+ meta_mode, &meta)) != 0)
+ goto err;
+ goto retry;
+ }
+ break;
+ }
+ /* FALLTHROUGH */
+ case DB_FIRST:
+ flags = DB_NEXT;
+ is_first = 1;
+
+ /* Get the first record number. */
+ cp->recno = first = meta->first_recno;
+
+ break;
+ case DB_PREV:
+ case DB_PREV_NODUP:
+ if (cp->recno != RECNO_OOB) {
+ if (cp->recno == meta->first_recno ||
+ QAM_BEFORE_FIRST(meta, cp->recno)) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ QAM_DEC_RECNO(cp->recno);
+ break;
+ }
+ /* FALLTHROUGH */
+ case DB_LAST:
+ if (meta->first_recno == meta->cur_recno) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ cp->recno = meta->cur_recno;
+ QAM_DEC_RECNO(cp->recno);
+ break;
+ case DB_SET:
+ case DB_SET_RANGE:
+ case DB_GET_BOTH:
+ case DB_GET_BOTH_RANGE:
+ if ((ret = __qam_getno(dbp, key, &cp->recno)) != 0)
+ goto err;
+ break;
+ default:
+ ret = __db_unknown_flag(env, "__qamc_get", flags);
+ goto err;
+ }
+
+dolock: if (!with_delete || inorder || retrying) {
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, meta, dbc->priority)) != 0)
+ goto err;
+ meta = NULL;
+ }
+
+ /* Lock the record. */
+ if (((ret = __db_lget(dbc, LCK_COUPLE, cp->recno, lock_mode,
+ (with_delete && !inorder && !retrying) ?
+ DB_LOCK_NOWAIT | DB_LOCK_RECORD : DB_LOCK_RECORD,
+ &lock)) == DB_LOCK_DEADLOCK || ret == DB_LOCK_NOTGRANTED) &&
+ with_delete) {
+#ifdef QDEBUG
+ if (DBC_LOGGING(dbc))
+ (void)__log_printf(env,
+ dbc->txn, "Queue S: %x %u %u",
+ dbc->locker ? dbc->locker->id : 0,
+ cp->recno, first);
+#endif
+ skip = 1;
+ goto retry;
+ }
+
+ if (ret != 0)
+ goto err;
+
+ /*
+ * In the DB_FIRST or DB_LAST cases we must wait and then start over
+ * since the first/last may have moved while we slept. If we are
+ * reading in order and the first record was not there, we can skip it,
+ * as it must have been aborted, was skipped by a non-queue insert,
+ * or we could not have gotten its lock. If we have the wrong
+ * record we release our locks and try again.
+ */
+ switch (flags) {
+ default:
+ if (inorder) {
+ if (first != cp->recno)
+ break;
+ } else if (with_delete || !is_first)
+ break;
+ /* FALLTHROUGH */
+ case DB_SET:
+ case DB_SET_RANGE:
+ case DB_GET_BOTH:
+ case DB_GET_BOTH_RANGE:
+ case DB_LAST:
+ if ((ret = __memp_fget(mpf, &metapno,
+ dbc->thread_info, dbc->txn, meta_mode, &meta)) != 0)
+ goto lerr;
+ if ((is_first && cp->recno != meta->first_recno) ||
+ (flags == DB_LAST && cp->recno != meta->cur_recno - 1)) {
+ if ((ret = __LPUT(dbc, lock)) != 0)
+ goto err;
+ if (is_first)
+ flags = DB_FIRST;
+ goto retry;
+ } else if (!is_first && flags != DB_LAST) {
+ if (QAM_BEFORE_FIRST(meta, cp->recno)) {
+ if (flags == DB_SET_RANGE ||
+ flags == DB_GET_BOTH_RANGE) {
+ cp->lock = lock;
+ LOCK_INIT(lock);
+ goto release_retry;
+ }
+ ret = DB_NOTFOUND;
+ goto lerr;
+ }
+ if (QAM_AFTER_CURRENT(meta, cp->recno)) {
+ ret = DB_NOTFOUND;
+ goto lerr;
+ }
+ }
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, meta, dbc->priority)) != 0)
+ goto err;
+ meta = NULL;
+ }
+
+ /* Position the cursor on the record. */
+ if ((ret = __qam_position(dbc, &cp->recno, 0, &exact)) != 0) {
+ /* We cannot get the page, release the record lock. */
+ (void)__LPUT(dbc, lock);
+ goto err;
+ }
+
+ pg = cp->page;
+ cp->lock = lock;
+ cp->lock_mode = lock_mode;
+ LOCK_INIT(lock);
+
+ if (!exact) {
+release_retry: /* Release locks and retry, if possible. */
+#ifdef QDEBUG
+ if (with_delete && DBC_LOGGING(dbc)) {
+ (void)__log_printf(dbp->env, dbc->txn,
+ "Queue E: %x %u %u",
+ dbc->locker ? dbc->locker->id : 0,
+ cp->recno, first);
+ }
+#endif
+ if (pg != NULL)
+ (void)__qam_fput(dbc, cp->pgno, pg, dbc->priority);
+ cp->page = pg = NULL;
+ if (with_delete) {
+ if ((ret = __LPUT(dbc, cp->lock)) != 0)
+ goto err1;
+ } else if ((ret = __TLPUT(dbc, cp->lock)) != 0)
+ goto err1;
+
+ if (meta == NULL && (ret = __memp_fget(mpf, &metapno,
+ dbc->thread_info, dbc->txn, meta_mode, &meta)) != 0)
+ goto err1;
+ /*
+ * If we don't need locks and we are out of range
+ * then we can just skip to the FIRST/LAST record;
+ * otherwise we must iterate to lock the records
+ * and get serializability.
+ */
+ switch (flags) {
+ case DB_NEXT:
+ case DB_NEXT_NODUP:
+ if (!with_delete)
+ is_first = 0;
+ else if (first == cp->recno)
+ /* We have verified that this record is gone. */
+ QAM_INC_RECNO(first);
+ if (QAM_BEFORE_FIRST(meta, cp->recno) &&
+ DONT_NEED_LOCKS(dbc))
+ flags = DB_FIRST;
+ break;
+ case DB_LAST:
+ case DB_PREV:
+ case DB_PREV_NODUP:
+ if (QAM_AFTER_CURRENT(meta, cp->recno) &&
+ DONT_NEED_LOCKS(dbc))
+ flags = DB_LAST;
+ else
+ flags = DB_PREV;
+ break;
+
+ case DB_GET_BOTH_RANGE:
+ case DB_SET_RANGE:
+ if (QAM_BEFORE_FIRST(meta, cp->recno) &&
+ DONT_NEED_LOCKS(dbc))
+ flags = DB_FIRST;
+ else
+ flags = DB_NEXT;
+ break;
+
+ default:
+ /* This is for the SET and GET_BOTH cases. */
+ ret = DB_KEYEMPTY;
+ goto err1;
+ }
+ retrying = 0;
+ goto retry;
+ }
+
+ if (with_delete && cp->recno == first) {
+ if (meta == NULL &&
+ (ret = __memp_fget(mpf, &metapno, dbc->thread_info,
+ dbc->txn, DB_MPOOL_DIRTY | DB_MPOOL_TRY, &meta)) != 0) {
+ if (ret == DB_LOCK_NOTGRANTED) {
+ first = RECNO_OOB;
+ ret = 0;
+ } else
+ goto err;
+ }
+ if (meta != NULL && cp->recno != meta->cur_recno) {
+ if (DBC_LOGGING(dbc)) {
+#ifdef QDEBUG
+ (void)__log_printf(dbp->env, dbc->txn,
+ "Queue I: %x %u %u %u",
+ dbc->locker ? dbc->locker->id : 0,
+ cp->recno, first, meta->cur_recno);
+#endif
+ if ((ret = __qam_incfirst_log(dbp,
+ dbc->txn, &meta->dbmeta.lsn, 0,
+ cp->recno, PGNO_BASE_MD)) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(meta->dbmeta.lsn);
+
+ meta->first_recno = cp->recno;
+ QAM_INC_RECNO(meta->first_recno);
+ }
+ }
+ if (meta != NULL) {
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, meta, dbc->priority)) != 0)
+ goto err;
+ meta = NULL;
+ }
+
+ qp = QAM_GET_RECORD(dbp, pg, cp->indx);
+
+ /* Return the data item. */
+ if (flags == DB_GET_BOTH || flags == DB_GET_BOTH_RANGE) {
+ /*
+ * Need to compare
+ */
+ tmp.data = qp->data;
+ tmp.size = t->re_len;
+ if ((ret = __bam_defcmp(dbp, data, &tmp)) != 0) {
+ if (flags == DB_GET_BOTH_RANGE)
+ goto release_retry;
+ ret = DB_NOTFOUND;
+ goto err1;
+ }
+ }
+
+ /* Return the key if the user didn't give us one. */
+ if (key != NULL && !F_ISSET(key, DB_DBT_ISSET)) {
+ if ((ret = __db_retcopy(dbp->env,
+ key, &cp->recno, sizeof(cp->recno),
+ &dbc->rkey->data, &dbc->rkey->ulen)) != 0)
+ goto err1;
+ F_SET(key, DB_DBT_ISSET);
+ }
+
+ if (data != NULL &&
+ !F_ISSET(dbc, DBC_MULTIPLE|DBC_MULTIPLE_KEY) &&
+ !F_ISSET(data, DB_DBT_ISSET)) {
+ if ((ret = __db_retcopy(dbp->env, data, qp->data, t->re_len,
+ &dbc->rdata->data, &dbc->rdata->ulen)) != 0)
+ goto err1;
+ F_SET(data, DB_DBT_ISSET);
+ }
+
+ /* Finally, if we are doing DB_CONSUME mark the record. */
+ if (with_delete) {
+ /*
+ * Assert that we're not a secondary index. Doing a DB_CONSUME
+ * on a secondary makes very little sense, since one can't
+ * DB_APPEND there; attempting one should be forbidden by
+ * the interface.
+ */
+ DB_ASSERT(env, !F_ISSET(dbp, DB_AM_SECONDARY));
+
+ /*
+ * If we have any secondary indices, call __dbc_del_primary to
+ * delete the references to the item we're about to delete.
+ *
+ * Note that we work on a duplicated cursor, since the
+ * __db_ret work has already been done, so it's not safe
+ * to perform any additional ops on this cursor.
+ */
+ if (DB_IS_PRIMARY(dbp)) {
+ if ((ret = __dbc_idup(dbc,
+ &dbcdup, DB_POSITION)) != 0)
+ goto err1;
+
+ if ((ret = __qam_fput(dbc,
+ cp->pgno, cp->page, dbc->priority)) != 0)
+ goto err1;
+ cp->page = NULL;
+ if (meta != NULL &&
+ (ret = __memp_fput(mpf,
+ dbc->thread_info, meta, dbc->priority)) != 0)
+ goto err1;
+ meta = NULL;
+ if ((ret = __dbc_del_primary(dbcdup)) != 0) {
+ /*
+ * The __dbc_del_primary return is more
+ * interesting.
+ */
+ (void)__dbc_close(dbcdup);
+ goto err1;
+ }
+
+ if ((ret = __dbc_close(dbcdup)) != 0)
+ goto err1;
+ if ((ret = __qam_fget(dbc,
+ &cp->pgno, DB_MPOOL_DIRTY, &cp->page)) != 0)
+ goto err;
+ } else if ((ret = __qam_dirty(dbc,
+ cp->pgno, &cp->page, dbc->priority)) != 0)
+ goto err1;
+
+ pg = cp->page;
+
+ if (DBC_LOGGING(dbc)) {
+ if (t->page_ext == 0 || t->re_len == 0) {
+ if ((ret = __qam_del_log(dbp, dbc->txn,
+ &LSN(pg), 0, &LSN(pg),
+ pg->pgno, cp->indx, cp->recno)) != 0)
+ goto err1;
+ } else {
+ tmp.data = qp->data;
+ tmp.size = t->re_len;
+ if ((ret = __qam_delext_log(dbp,
+ dbc->txn, &LSN(pg), 0, &LSN(pg),
+ pg->pgno, cp->indx, cp->recno, &tmp)) != 0)
+ goto err1;
+ }
+ } else
+ LSN_NOT_LOGGED(LSN(pg));
+
+ F_CLR(qp, QAM_VALID);
+ if ((ret = __qam_fput(dbc,
+ cp->pgno, cp->page, dbc->priority)) != 0)
+ goto err;
+ cp->page = NULL;
+
+ /*
+ * Clean up the first pointer; we need to check two things:
+ * Are we leaving a page or an extent?
+ * Is the first pointer beyond the first record we looked at?
+ * If we deleted the first record we checked, then we already
+ * moved the first pointer properly.
+ */
+
+ if (first == cp->recno && (skip = (first % t->rec_page)) != 0)
+ goto done;
+ if (meta == NULL &&
+ (ret = __memp_fget(mpf, &metapno,
+ dbc->thread_info, dbc->txn, 0, &meta)) != 0)
+ goto err;
+ if (skip && !QAM_BEFORE_FIRST(meta, first))
+ goto done;
+
+#ifdef QDEBUG
+ if (DBC_LOGGING(dbc))
+ (void)__log_printf(env,
+ dbc->txn, "Queue D: %x %u %u %u",
+ dbc->locker ? dbc->locker->id : 0,
+ cp->recno, first, meta->first_recno);
+#endif
+ ret = __qam_consume(dbc, meta, first);
+ }
+
+err1: if (cp->page != NULL) {
+ if ((t_ret = __qam_fput(dbc,
+ cp->pgno, cp->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ cp->page = NULL;
+ }
+ if (0) {
+lerr: (void)__LPUT(dbc, lock);
+ }
+
+done:
+err: if (meta) {
+ /* Release the meta page. */
+ if ((t_ret = __memp_fput(mpf,
+ dbc->thread_info, meta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+
+ return ((ret == DB_LOCK_NOTGRANTED && !F_ISSET(env->dbenv,
+ DB_ENV_TIME_NOTGRANTED)) ? DB_LOCK_DEADLOCK : ret);
+}
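+
+/*
+ * Illustrative sketch, not part of the library: a minimal consumer
+ * built on the DB_CONSUME_WAIT path handled above.  Error handling
+ * and the record processing step are hypothetical.
+ *
+ *	DBT key, data;
+ *	db_recno_t recno;
+ *	int ret;
+ *
+ *	memset(&key, 0, sizeof(key));
+ *	memset(&data, 0, sizeof(data));
+ *	while ((ret = dbp->get(dbp,
+ *	    NULL, &key, &data, DB_CONSUME_WAIT)) == 0) {
+ *		memcpy(&recno, key.data, sizeof(recno));
+ *		...process data.data (data.size bytes)...
+ *	}
+ */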
+
+/*
+ * __qam_consume -- try to reset the head of the queue.
+ *
+ */
+static int
+__qam_consume(dbc, meta, first)
+ DBC *dbc;
+ QMETA *meta;
+ db_recno_t first;
+{
+ DB *dbp;
+ DB_LOCK lock, save_lock;
+ DB_MPOOLFILE *mpf;
+ QUEUE_CURSOR *cp;
+ db_indx_t save_indx;
+ db_pgno_t save_page;
+ db_recno_t current, save_first, save_recno;
+ u_int32_t rec_extent;
+ int exact, ret, t_ret, wrapped;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ cp = (QUEUE_CURSOR *)dbc->internal;
+ ret = 0;
+
+ save_page = cp->pgno;
+ save_indx = cp->indx;
+ save_recno = cp->recno;
+ save_lock = cp->lock;
+ save_first = first;
+
+ /*
+ * We call this routine for two reasons:
+ * 1) to toss pages and extents as we leave them.
+ * 2) to update meta->first_recno.
+ * We do not need to update first_recno if we deleted
+ * the first record we tried since we updated it then.
+ * If we are not going to update meta->first_recno we
+ * do not need an exclusive latch.
+ */
+ if (first != cp->recno && (ret = __memp_dirty(mpf,
+ &meta, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ goto err;
+ /*
+ * If we skipped some deleted records, we need to
+ * reposition on the first one. Get a lock
+ * in case someone is trying to put it back.
+ */
+ if (first == RECNO_OOB || !QAM_BEFORE_FIRST(meta, first))
+ first = meta->first_recno;
+
+ if (first != cp->recno) {
+ ret = __db_lget(dbc, 0, first, DB_LOCK_READ,
+ DB_LOCK_NOWAIT | DB_LOCK_RECORD, &lock);
+ if (ret == DB_LOCK_NOTGRANTED || ret == DB_LOCK_DEADLOCK) {
+ ret = 0;
+ goto done;
+ }
+ if (ret != 0)
+ goto err;
+ if (cp->page != NULL && (ret =
+ __qam_fput(dbc, cp->pgno, cp->page, dbc->priority)) != 0)
+ goto err;
+ cp->page = NULL;
+ if ((ret = __qam_position(dbc, &first, 0, &exact)) != 0) {
+ (void)__LPUT(dbc, lock);
+ goto err;
+ }
+ if ((ret = __LPUT(dbc, lock)) != 0)
+ goto err;
+ if (exact != 0)
+ goto done;
+ }
+
+ current = meta->cur_recno;
+ wrapped = 0;
+ if (first > current)
+ wrapped = 1;
+ rec_extent = meta->page_ext * meta->rec_page;
+
+ /* Loop until we find a record or hit current. */
+ for (;;) {
+ /*
+ * Check to see if we are moving off the extent,
+ * and if so remove the extent.
+ * If we are moving off a page we need to
+ * get rid of the buffer.
+ */
+ if (rec_extent != 0 &&
+ ((exact = (first % rec_extent == 0)) ||
+ (first % meta->rec_page == 0) ||
+ first == UINT32_MAX)) {
+#ifdef QDEBUG
+ if (DBC_LOGGING(dbc))
+ (void)__log_printf(dbp->env, dbc->txn,
+ "Queue R: %x %u %u %u",
+ dbc->locker ? dbc->locker->id : 0,
+ cp->pgno, first, meta->first_recno);
+#endif
+ if (cp->page != NULL && (ret = __qam_fput(dbc,
+ cp->pgno, cp->page, DB_PRIORITY_VERY_LOW)) != 0)
+ break;
+ cp->page = NULL;
+
+ if (exact == 1 &&
+ (ret = __qam_fremove(dbp, cp->pgno)) != 0)
+ break;
+ } else if (cp->page != NULL && (ret = __qam_fput(dbc,
+ cp->pgno, cp->page, dbc->priority)) != 0)
+ break;
+ cp->page = NULL;
+ first++;
+ if (first == RECNO_OOB) {
+ wrapped = 0;
+ first++;
+ }
+
+ /*
+ * LOOP EXIT when we move on to the current
+ * pointer.
+ */
+ if (!wrapped && first >= current)
+ break;
+
+ ret = __db_lget(dbc, 0, first, DB_LOCK_READ,
+ DB_LOCK_NOWAIT | DB_LOCK_RECORD, &lock);
+ if (ret == DB_LOCK_NOTGRANTED || ret == DB_LOCK_DEADLOCK) {
+ ret = 0;
+ break;
+ }
+ if (ret != 0)
+ break;
+
+ if ((ret = __qam_position(dbc, &first, 0, &exact)) != 0) {
+ (void)__LPUT(dbc, lock);
+ break;
+ }
+ if ((ret = __LPUT(dbc, lock)) != 0 || exact) {
+ if ((t_ret = __qam_fput(dbc, cp->pgno,
+ cp->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ cp->page = NULL;
+ break;
+ }
+ }
+
+ cp->pgno = save_page;
+ cp->indx = save_indx;
+ cp->recno = save_recno;
+ cp->lock = save_lock;
+
+done:
+ /*
+ * We have advanced as far as we can.
+ * Advance first_recno to this point.
+ */
+ if (ret == 0 && meta->first_recno != first && save_first != cp->recno) {
+ if (DBC_LOGGING(dbc)) {
+#ifdef QDEBUG
+ (void)__log_printf(dbp->env, dbc->txn,
+ "Queue M: %x %u %u %u",
+ dbc->locker ? dbc->locker->id : 0,
+ cp->recno, first, meta->first_recno);
+#endif
+ if ((ret = __qam_incfirst_log(dbp,
+ dbc->txn, &meta->dbmeta.lsn, 0,
+ first, PGNO_BASE_MD)) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(meta->dbmeta.lsn);
+ meta->first_recno = first;
+ }
+
+err:
+ return (ret);
+}
+
+static int
+__qam_bulk(dbc, data, flags)
+ DBC *dbc;
+ DBT *data;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_LOCK rlock;
+ DB_MPOOLFILE *mpf;
+ PAGE *pg;
+ QAMDATA *qp;
+ QMETA *meta;
+ QUEUE_CURSOR *cp;
+ db_indx_t indx;
+ db_lockmode_t lkmode;
+ db_pgno_t metapno;
+ u_int32_t *endp, *offp;
+ u_int32_t pagesize, re_len, recs;
+ u_int8_t *dbuf, *dp, *np;
+ int exact, ret, t_ret, valid;
+ int is_key, need_pg, size, space;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ cp = (QUEUE_CURSOR *)dbc->internal;
+
+ lkmode = F_ISSET(dbc, DBC_RMW) ? DB_LOCK_WRITE : DB_LOCK_READ;
+
+ pagesize = dbp->pgsize;
+ re_len = ((QUEUE *)dbp->q_internal)->re_len;
+ recs = ((QUEUE *)dbp->q_internal)->rec_page;
+ metapno = ((QUEUE *)dbp->q_internal)->q_meta;
+
+ is_key = LF_ISSET(DB_MULTIPLE_KEY) ? 1 : 0;
+ size = 0;
+
+ dbuf = data->data;
+ np = dp = dbuf;
+
+ /* Keep track of the space that is left. There is a termination entry. */
+ space = (int)data->ulen;
+ space -= (int)sizeof(*offp);
+
+ /* Build the offset/size table from the end up. */
+ endp = (u_int32_t *)((u_int8_t *)dbuf + data->ulen);
+ endp--;
+ offp = endp;
+ /* Save the lock on the current position of the cursor. */
+ rlock = cp->lock;
+ LOCK_INIT(cp->lock);
+
+ if ((ret = __memp_fget(mpf, &metapno,
+ dbc->thread_info, dbc->txn, 0, &meta)) != 0)
+ return (ret);
+
+next_pg:
+ /* Wrap around, skipping zero. */
+ if (cp->recno == RECNO_OOB)
+ cp->recno++;
+ if ((ret = __qam_position(dbc, &cp->recno, 0, &exact)) != 0)
+ goto done;
+
+ pg = cp->page;
+ indx = cp->indx;
+ need_pg = 1;
+
+ do {
+ /*
+ * If this page is a nonexistent page at the end of an
+ * extent, pg may be NULL. A NULL page has no valid records,
+ * so just keep looping as though qp exists and isn't QAM_VALID;
+ * calling QAM_GET_RECORD is unsafe.
+ */
+ valid = 0;
+
+ if (pg != NULL) {
+ if ((ret = __db_lget(dbc, LCK_COUPLE, cp->recno, lkmode,
+ DB_LOCK_NOWAIT | DB_LOCK_RECORD, &rlock)) != 0) {
+ if (ret != DB_LOCK_NOTGRANTED &&
+ ret != DB_LOCK_DEADLOCK)
+ goto done;
+ /* If we put anything in the buffer, return. */
+ if (offp != endp)
+ break;
+ if ((ret = __memp_fput(mpf, dbc->thread_info,
+ meta, dbc->priority)) != 0)
+ goto done;
+ meta = NULL;
+ if ((ret = __db_lget(dbc, LCK_COUPLE, cp->recno,
+ lkmode, DB_LOCK_RECORD, &rlock)) != 0)
+ goto done;
+ if ((ret = __memp_fget(mpf,
+ &metapno, dbc->thread_info,
+ dbc->txn, 0, &meta)) != 0)
+ goto done;
+ }
+ qp = QAM_GET_RECORD(dbp, pg, indx);
+ if (F_ISSET(qp, QAM_VALID)) {
+ valid = 1;
+ space -= (int)
+ ((is_key ? 3 : 2) * sizeof(*offp));
+ if (space < 0)
+ goto get_space;
+ if (need_pg) {
+ dp = np;
+ size = (int)pagesize - QPAGE_SZ(dbp);
+ if (space < size) {
+get_space:
+ if (offp == endp) {
+ data->size = (u_int32_t)
+ DB_ALIGN((u_int32_t)
+ size + pagesize,
+ sizeof(u_int32_t));
+ ret = DB_BUFFER_SMALL;
+ break;
+ }
+ if (indx != 0)
+ indx--;
+ cp->recno--;
+ space = 0;
+ break;
+ }
+ memcpy(dp,
+ (u_int8_t *)pg + QPAGE_SZ(dbp),
+ (u_int)size);
+ need_pg = 0;
+ space -= size;
+ np += size;
+ }
+ if (is_key)
+ *offp-- = cp->recno;
+ *offp-- = (u_int32_t)((((u_int8_t *)qp -
+ (u_int8_t *)pg) - QPAGE_SZ(dbp)) +
+ (dp - dbuf) + SSZA(QAMDATA, data));
+ *offp-- = re_len;
+ }
+ }
+ if (!valid && is_key == 0) {
+ *offp-- = 0;
+ *offp-- = 0;
+ }
+ cp->recno++;
+ } while (++indx < recs && cp->recno != RECNO_OOB &&
+ !QAM_AFTER_CURRENT(meta, cp->recno));
+
+ if (cp->page != NULL) {
+ if ((t_ret = __qam_fput(dbc,
+ cp->pgno, cp->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ cp->page = NULL;
+ }
+
+ if (ret == 0 && space > 0 &&
+ (indx >= recs || cp->recno == RECNO_OOB) &&
+ !QAM_AFTER_CURRENT(meta, cp->recno))
+ goto next_pg;
+
+ /*
+ * Correct recno in two cases:
+ * 1) If we just wrapped, the fetch must start at record 1, not at FIRST.
+ * 2) We ran out of space exactly at the end of a page.
+ */
+ if (cp->recno == RECNO_OOB || (space == 0 && indx == recs))
+ cp->recno--;
+
+ if (is_key == 1)
+ *offp = RECNO_OOB;
+ else
+ *offp = (u_int32_t)-1;
+
+done: /* Release the meta page. */
+ if ((t_ret = __memp_fput(mpf,
+ dbc->thread_info, meta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ cp->lock = rlock;
+
+ return (ret);
+}
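+
+/*
+ * Illustrative sketch, not part of the library: walking a buffer
+ * filled by the bulk routine above with the DB_MULTIPLE_* macros.
+ * The 64KB buffer size is a hypothetical choice; it must hold at
+ * least one page.
+ *
+ *	DBT key, data;
+ *	db_recno_t recno;
+ *	void *p, *dptr;
+ *	u_int32_t dlen;
+ *	u_int8_t buf[65536];
+ *
+ *	memset(&key, 0, sizeof(key));
+ *	memset(&data, 0, sizeof(data));
+ *	data.data = buf;
+ *	data.ulen = sizeof(buf);
+ *	data.flags = DB_DBT_USERMEM;
+ *	if (dbc->get(dbc, &key, &data, DB_MULTIPLE_KEY | DB_NEXT) == 0) {
+ *		DB_MULTIPLE_INIT(p, &data);
+ *		for (;;) {
+ *			DB_MULTIPLE_RECNO_NEXT(p, &data, recno, dptr, dlen);
+ *			if (p == NULL)
+ *				break;
+ *			...recno, dptr and dlen describe one record...
+ *		}
+ *	}
+ */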
+
+/*
+ * __qamc_close --
+ * Close down the cursor from a single use.
+ */
+static int
+__qamc_close(dbc, root_pgno, rmroot)
+ DBC *dbc;
+ db_pgno_t root_pgno;
+ int *rmroot;
+{
+ QUEUE_CURSOR *cp;
+ int ret;
+
+ COMPQUIET(root_pgno, 0);
+ COMPQUIET(rmroot, NULL);
+
+ cp = (QUEUE_CURSOR *)dbc->internal;
+
+ /* Discard any locks not acquired inside of a transaction. */
+ ret = __TLPUT(dbc, cp->lock);
+
+ LOCK_INIT(cp->lock);
+ cp->page = NULL;
+ cp->pgno = PGNO_INVALID;
+ cp->indx = 0;
+ cp->lock_mode = DB_LOCK_NG;
+ cp->recno = RECNO_OOB;
+ cp->flags = 0;
+
+ return (ret);
+}
+
+/*
+ * __qamc_dup --
+ * Duplicate a queue cursor, such that the new one holds appropriate
+ * locks for the position of the original.
+ *
+ * PUBLIC: int __qamc_dup __P((DBC *, DBC *));
+ */
+int
+__qamc_dup(orig_dbc, new_dbc)
+ DBC *orig_dbc, *new_dbc;
+{
+ QUEUE_CURSOR *orig, *new;
+
+ orig = (QUEUE_CURSOR *)orig_dbc->internal;
+ new = (QUEUE_CURSOR *)new_dbc->internal;
+
+ new->recno = orig->recno;
+
+ return (0);
+}
+
+/*
+ * __qamc_init
+ *
+ * PUBLIC: int __qamc_init __P((DBC *));
+ */
+int
+__qamc_init(dbc)
+ DBC *dbc;
+{
+ DB *dbp;
+ QUEUE_CURSOR *cp;
+ int ret;
+
+ dbp = dbc->dbp;
+
+ /* Allocate the internal structure. */
+ cp = (QUEUE_CURSOR *)dbc->internal;
+ if (cp == NULL) {
+ if ((ret =
+ __os_calloc(dbp->env, 1, sizeof(QUEUE_CURSOR), &cp)) != 0)
+ return (ret);
+ dbc->internal = (DBC_INTERNAL *)cp;
+ }
+
+ /* Initialize methods. */
+ dbc->close = dbc->c_close = __dbc_close_pp;
+ dbc->cmp = __dbc_cmp_pp;
+ dbc->count = dbc->c_count = __dbc_count_pp;
+ dbc->del = dbc->c_del = __dbc_del_pp;
+ dbc->dup = dbc->c_dup = __dbc_dup_pp;
+ dbc->get = dbc->c_get = __dbc_get_pp;
+ dbc->pget = dbc->c_pget = __dbc_pget_pp;
+ dbc->put = dbc->c_put = __dbc_put_pp;
+ dbc->am_bulk = __qam_bulk;
+ dbc->am_close = __qamc_close;
+ dbc->am_del = __qamc_del;
+ dbc->am_destroy = __qamc_destroy;
+ dbc->am_get = __qamc_get;
+ dbc->am_put = __qamc_put;
+ dbc->am_writelock = NULL;
+
+ return (0);
+}
+
+/*
+ * __qamc_destroy --
+ * Close a single cursor -- internal version.
+ */
+static int
+__qamc_destroy(dbc)
+ DBC *dbc;
+{
+ /* Discard the structures. */
+ __os_free(dbc->env, dbc->internal);
+
+ return (0);
+}
+
+/*
+ * __qam_getno --
+ * Check the user's record number.
+ */
+static int
+__qam_getno(dbp, key, rep)
+ DB *dbp;
+ const DBT *key;
+ db_recno_t *rep;
+{
+ /* If passed an empty DBT from Java, key->data may be NULL. */
+ if (key->size != sizeof(db_recno_t)) {
+ __db_errx(dbp->env, DB_STR("1143",
+ "illegal record number size"));
+ return (EINVAL);
+ }
+
+ if ((*rep = *(db_recno_t *)key->data) == 0) {
+ __db_errx(dbp->env, DB_STR("1144",
+ "illegal record number of 0"));
+ return (EINVAL);
+ }
+ return (0);
+}
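+
+/*
+ * For reference, a sketch of the key DBT layout __qam_getno expects
+ * from its callers (the record number 42 is arbitrary):
+ *
+ *	DBT key;
+ *	db_recno_t recno = 42;
+ *	memset(&key, 0, sizeof(key));
+ *	key.data = &recno;
+ *	key.size = sizeof(recno);
+ */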
+
+/*
+ * __qam_truncate --
+ * Truncate a queue database
+ *
+ * PUBLIC: int __qam_truncate __P((DBC *, u_int32_t *));
+ */
+int
+__qam_truncate(dbc, countp)
+ DBC *dbc;
+ u_int32_t *countp;
+{
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ QMETA *meta;
+ db_pgno_t metapno;
+ u_int32_t count;
+ int ret, t_ret;
+
+ dbp = dbc->dbp;
+
+ /* Walk the queue, counting rows. */
+ for (count = 0;
+ (ret = __qamc_get(dbc, NULL, NULL, DB_CONSUME, &metapno)) == 0;)
+ count++;
+ if (ret != DB_NOTFOUND)
+ return (ret);
+
+ mpf = dbp->mpf;
+ /* Update the meta page. */
+ metapno = ((QUEUE *)dbp->q_internal)->q_meta;
+ if ((ret = __memp_fget(mpf, &metapno, dbc->thread_info, dbc->txn,
+ DB_MPOOL_DIRTY, &meta)) != 0)
+ return (ret);
+
+ /* Remove the last extent file. */
+ if (meta->cur_recno > 1 && ((QUEUE *)dbp->q_internal)->page_ext != 0) {
+ if ((ret = __qam_fremove(dbp,
+ QAM_RECNO_PAGE(dbp, meta->cur_recno - 1))) != 0)
+ goto err;
+ }
+
+ if (DBC_LOGGING(dbc)) {
+ ret = __qam_mvptr_log(dbp, dbc->txn, &meta->dbmeta.lsn, 0,
+ QAM_SETCUR | QAM_SETFIRST | QAM_TRUNCATE, meta->first_recno,
+ 1, meta->cur_recno, 1, &meta->dbmeta.lsn, PGNO_BASE_MD);
+ } else
+ LSN_NOT_LOGGED(meta->dbmeta.lsn);
+ if (ret == 0)
+ meta->first_recno = meta->cur_recno = 1;
+
+err: if ((t_ret = __memp_fput(mpf,
+ dbc->thread_info, meta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (countp != NULL)
+ *countp = count;
+
+ return (ret);
+}
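+
+/*
+ * Illustrative sketch, not part of the library: this routine backs
+ * the public DB->truncate method, which an application calls roughly
+ * as follows.
+ *
+ *	u_int32_t count;
+ *	if (dbp->truncate(dbp, NULL, &count, 0) == 0)
+ *		printf("discarded %lu records\n", (u_long)count);
+ */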
+
+/*
+ * __qam_delete --
+ * Queue fast delete function.
+ *
+ * PUBLIC: int __qam_delete __P((DBC *, DBT *, u_int32_t));
+ */
+int
+__qam_delete(dbc, key, flags)
+ DBC *dbc;
+ DBT *key;
+ u_int32_t flags;
+{
+ QUEUE_CURSOR *cp;
+ int ret;
+
+ cp = (QUEUE_CURSOR *)dbc->internal;
+ if ((ret = __qam_getno(dbc->dbp, key, &cp->recno)) != 0)
+ goto err;
+
+ ret = __qamc_del(dbc, flags);
+
+err: return (ret);
+}
diff --git a/src/qam/qam.src b/src/qam/qam.src
new file mode 100644
index 00000000..a8e2e4e0
--- /dev/null
+++ b/src/qam/qam.src
@@ -0,0 +1,89 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+DBPRIVATE
+PREFIX __qam
+
+INCLUDE #include "db_int.h"
+INCLUDE #include "dbinc/crypto.h"
+INCLUDE #include "dbinc/db_page.h"
+INCLUDE #include "dbinc/db_dispatch.h"
+INCLUDE #include "dbinc/db_am.h"
+INCLUDE #include "dbinc/qam.h"
+INCLUDE #include "dbinc/txn.h"
+INCLUDE
+
+/*
+ * incfirst
+ * Used when we increment first_recno.
+ */
+BEGIN incfirst 42 84
+DB fileid int32_t ld
+ARG recno db_recno_t lu
+ARG meta_pgno db_pgno_t lu
+END
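+
+/*
+ * For reference, gen_rec.awk turns the record description above into
+ * a logging function invoked elsewhere in this change as, e.g.:
+ *
+ *	__qam_incfirst_log(dbp, dbc->txn, &meta->dbmeta.lsn, 0,
+ *	    cp->recno, PGNO_BASE_MD);
+ */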
+
+/*
+ * mvptr
+ * Used when we change one or both of cur_recno and first_recno.
+ */
+BEGIN mvptr 42 85
+ARG opcode u_int32_t lu
+DB fileid int32_t ld
+ARG old_first db_recno_t lu
+ARG new_first db_recno_t lu
+ARG old_cur db_recno_t lu
+ARG new_cur db_recno_t lu
+POINTER metalsn DB_LSN * lu
+ARG meta_pgno db_pgno_t lu
+END
+
+
+/*
+ * del
+ * Used when we delete a record.
+ * recno is the record that is being deleted.
+ */
+BEGIN del 42 79
+DB fileid int32_t ld
+POINTER lsn DB_LSN * lu
+ARG pgno db_pgno_t lu
+ARG indx u_int32_t lu
+ARG recno db_recno_t lu
+END
+
+/*
+ * add
+ * Used when we put a record on a page.
+ * recno is the record being added.
+ * data is the record itself.
+ */
+BEGIN add 42 80
+DB fileid int32_t ld
+POINTER lsn DB_LSN * lu
+ARG pgno db_pgno_t lu
+ARG indx u_int32_t lu
+ARG recno db_recno_t lu
+DBT data DBT s
+ARG vflag u_int32_t lu
+DBT olddata DBT s
+END
+
+/*
+ * delext
+ * Used when we delete a record in extent based queue.
+ * recno is the record that is being deleted.
+ */
+BEGIN delext 42 83
+DB fileid int32_t ld
+POINTER lsn DB_LSN * lu
+ARG pgno db_pgno_t lu
+ARG indx u_int32_t lu
+ARG recno db_recno_t lu
+DBT data DBT s
+END
diff --git a/src/qam/qam_auto.c b/src/qam/qam_auto.c
new file mode 100644
index 00000000..604ad3f4
--- /dev/null
+++ b/src/qam/qam_auto.c
@@ -0,0 +1,83 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_dispatch.h"
+#include "dbinc/db_am.h"
+#include "dbinc/qam.h"
+#include "dbinc/txn.h"
+
+DB_LOG_RECSPEC __qam_incfirst_desc[] = {
+ {LOGREC_DB, SSZ(__qam_incfirst_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__qam_incfirst_args, recno), "recno", "%lu"},
+ {LOGREC_ARG, SSZ(__qam_incfirst_args, meta_pgno), "meta_pgno", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __qam_mvptr_desc[] = {
+ {LOGREC_ARG, SSZ(__qam_mvptr_args, opcode), "opcode", "%lu"},
+ {LOGREC_DB, SSZ(__qam_mvptr_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__qam_mvptr_args, old_first), "old_first", "%lu"},
+ {LOGREC_ARG, SSZ(__qam_mvptr_args, new_first), "new_first", "%lu"},
+ {LOGREC_ARG, SSZ(__qam_mvptr_args, old_cur), "old_cur", "%lu"},
+ {LOGREC_ARG, SSZ(__qam_mvptr_args, new_cur), "new_cur", "%lu"},
+ {LOGREC_POINTER, SSZ(__qam_mvptr_args, metalsn), "metalsn", ""},
+ {LOGREC_ARG, SSZ(__qam_mvptr_args, meta_pgno), "meta_pgno", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __qam_del_desc[] = {
+ {LOGREC_DB, SSZ(__qam_del_args, fileid), "fileid", ""},
+ {LOGREC_POINTER, SSZ(__qam_del_args, lsn), "lsn", ""},
+ {LOGREC_ARG, SSZ(__qam_del_args, pgno), "pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__qam_del_args, indx), "indx", "%lu"},
+ {LOGREC_ARG, SSZ(__qam_del_args, recno), "recno", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __qam_add_desc[] = {
+ {LOGREC_DB, SSZ(__qam_add_args, fileid), "fileid", ""},
+ {LOGREC_POINTER, SSZ(__qam_add_args, lsn), "lsn", ""},
+ {LOGREC_ARG, SSZ(__qam_add_args, pgno), "pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__qam_add_args, indx), "indx", "%lu"},
+ {LOGREC_ARG, SSZ(__qam_add_args, recno), "recno", "%lu"},
+ {LOGREC_DBT, SSZ(__qam_add_args, data), "data", ""},
+ {LOGREC_ARG, SSZ(__qam_add_args, vflag), "vflag", "%lu"},
+ {LOGREC_DBT, SSZ(__qam_add_args, olddata), "olddata", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __qam_delext_desc[] = {
+ {LOGREC_DB, SSZ(__qam_delext_args, fileid), "fileid", ""},
+ {LOGREC_POINTER, SSZ(__qam_delext_args, lsn), "lsn", ""},
+ {LOGREC_ARG, SSZ(__qam_delext_args, pgno), "pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__qam_delext_args, indx), "indx", "%lu"},
+ {LOGREC_ARG, SSZ(__qam_delext_args, recno), "recno", "%lu"},
+ {LOGREC_DBT, SSZ(__qam_delext_args, data), "data", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+/*
+ * PUBLIC: int __qam_init_recover __P((ENV *, DB_DISTAB *));
+ */
+int
+__qam_init_recover(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __qam_incfirst_recover, DB___qam_incfirst)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __qam_mvptr_recover, DB___qam_mvptr)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __qam_del_recover, DB___qam_del)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __qam_add_recover, DB___qam_add)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __qam_delext_recover, DB___qam_delext)) != 0)
+ return (ret);
+ return (0);
+}
diff --git a/src/qam/qam_autop.c b/src/qam/qam_autop.c
new file mode 100644
index 00000000..123a0a37
--- /dev/null
+++ b/src/qam/qam_autop.c
@@ -0,0 +1,126 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+
+#ifdef HAVE_QUEUE
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_dispatch.h"
+#include "dbinc/db_am.h"
+#include "dbinc/qam.h"
+#include "dbinc/txn.h"
+
+/*
+ * PUBLIC: int __qam_incfirst_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__qam_incfirst_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__qam_incfirst", __qam_incfirst_desc, info));
+}
+
+/*
+ * PUBLIC: int __qam_mvptr_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__qam_mvptr_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__qam_mvptr", __qam_mvptr_desc, info));
+}
+
+/*
+ * PUBLIC: int __qam_del_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__qam_del_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__qam_del", __qam_del_desc, info));
+}
+
+/*
+ * PUBLIC: int __qam_add_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__qam_add_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__qam_add", __qam_add_desc, info));
+}
+
+/*
+ * PUBLIC: int __qam_delext_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__qam_delext_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__qam_delext", __qam_delext_desc, info));
+}
+
+/*
+ * PUBLIC: int __qam_init_print __P((ENV *, DB_DISTAB *));
+ */
+int
+__qam_init_print(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __qam_incfirst_print, DB___qam_incfirst)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __qam_mvptr_print, DB___qam_mvptr)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __qam_del_print, DB___qam_del)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __qam_add_print, DB___qam_add)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __qam_delext_print, DB___qam_delext)) != 0)
+ return (ret);
+ return (0);
+}
+#endif /* HAVE_QUEUE */
diff --git a/src/qam/qam_conv.c b/src/qam/qam_conv.c
new file mode 100644
index 00000000..beb7c973
--- /dev/null
+++ b/src/qam/qam_conv.c
@@ -0,0 +1,79 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_swap.h"
+#include "dbinc/db_am.h"
+#include "dbinc/qam.h"
+
+/*
+ * __qam_mswap --
+ * Swap the bytes on the queue metadata page.
+ *
+ * PUBLIC: int __qam_mswap __P((ENV *, PAGE *));
+ */
+int
+__qam_mswap(env, pg)
+ ENV *env;
+ PAGE *pg;
+{
+ u_int8_t *p;
+
+ COMPQUIET(env, NULL);
+
+ __db_metaswap(pg);
+ p = (u_int8_t *)pg + sizeof(DBMETA);
+
+ SWAP32(p); /* first_recno */
+ SWAP32(p); /* cur_recno */
+ SWAP32(p); /* re_len */
+ SWAP32(p); /* re_pad */
+ SWAP32(p); /* rec_page */
+ SWAP32(p); /* page_ext */
+ p += 91 * sizeof(u_int32_t); /* unused */
+ SWAP32(p); /* crypto_magic */
+
+ return (0);
+}
+
+/*
+ * __qam_pgin_out --
+ * Convert host-specific page layout to/from the host-independent format
+ * stored on disk.
+ * We only need to fix up a few fields in the header.
+ *
+ * PUBLIC: int __qam_pgin_out __P((ENV *, db_pgno_t, void *, DBT *));
+ */
+int
+__qam_pgin_out(env, pg, pp, cookie)
+ ENV *env;
+ db_pgno_t pg;
+ void *pp;
+ DBT *cookie;
+{
+ DB_PGINFO *pginfo;
+ QPAGE *h;
+
+ COMPQUIET(pg, 0);
+ pginfo = (DB_PGINFO *)cookie->data;
+ if (!F_ISSET(pginfo, DB_AM_SWAP))
+ return (0);
+
+ h = pp;
+ if (h->type == P_QAMMETA)
+ return (__qam_mswap(env, pp));
+
+ M_32_SWAP(h->lsn.file);
+ M_32_SWAP(h->lsn.offset);
+ M_32_SWAP(h->pgno);
+
+ return (0);
+}
diff --git a/src/qam/qam_files.c b/src/qam/qam_files.c
new file mode 100644
index 00000000..e9a9ff07
--- /dev/null
+++ b/src/qam/qam_files.c
@@ -0,0 +1,939 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/fop.h"
+#include "dbinc/mp.h"
+#include "dbinc/qam.h"
+
+#define QAM_EXNAME(Q, I, B, L) \
+ snprintf((B), (L), \
+ QUEUE_EXTENT, (Q)->dir, PATH_SEPARATOR[0], (Q)->name, (I))
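+
+/*
+ * For example (illustrative): with (Q)->dir "data" and (Q)->name
+ * "q.db", QAM_EXNAME(qp, 3, buf, sizeof(buf)) formats an extent name
+ * of the form "data/__dbq.q.db.3", per the QUEUE_EXTENT template.
+ */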
+
+/*
+ * __qam_fprobe -- calculate and open extent
+ *
+ * Calculate which extent the page is in, open and create if necessary.
+ *
+ * PUBLIC: int __qam_fprobe __P((DBC *, db_pgno_t,
+ * PUBLIC: void *, qam_probe_mode, DB_CACHE_PRIORITY, u_int32_t));
+ */
+int
+__qam_fprobe(dbc, pgno, addrp, mode, priority, flags)
+ DBC *dbc;
+ db_pgno_t pgno;
+ void *addrp;
+ qam_probe_mode mode;
+ DB_CACHE_PRIORITY priority;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ MPFARRAY *array;
+ QUEUE *qp;
+ u_int8_t fid[DB_FILE_ID_LEN];
+ u_int32_t i, extid, maxext, numext, lflags, offset, oldext, openflags;
+ char buf[DB_MAXPATHLEN];
+ int ftype, less, ret, t_ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ qp = (QUEUE *)dbp->q_internal;
+ ret = 0;
+
+ if (qp->page_ext == 0) {
+ mpf = dbp->mpf;
+ switch (mode) {
+ case QAM_PROBE_GET:
+ return (__memp_fget(mpf, &pgno,
+ dbc->thread_info, dbc->txn, flags, addrp));
+ case QAM_PROBE_PUT:
+ return (__memp_fput(mpf,
+ dbc->thread_info, addrp, priority));
+ case QAM_PROBE_DIRTY:
+ return (__memp_dirty(mpf, addrp,
+ dbc->thread_info, dbc->txn, priority, flags));
+ case QAM_PROBE_MPF:
+ *(DB_MPOOLFILE **)addrp = mpf;
+ return (0);
+ }
+ }
+
+ mpf = NULL;
+
+ /*
+ * Need to lock long enough to find the mpf or create the file.
+ * The file cannot go away because we must have a record locked
+ * in that file.
+ */
+ MUTEX_LOCK(env, dbp->mutex);
+ extid = QAM_PAGE_EXTENT(dbp, pgno);
+
+ /* Array1 will always be in use if array2 is in use. */
+ array = &qp->array1;
+ if (array->n_extent == 0) {
+ /* Start with 4 extents */
+ array->n_extent = 4;
+ array->low_extent = extid;
+ numext = offset = oldext = 0;
+ less = 0;
+ goto alloc;
+ }
+
+retry:
+ if (extid < array->low_extent) {
+ less = 1;
+ offset = array->low_extent - extid;
+ } else {
+ less = 0;
+ offset = extid - array->low_extent;
+ }
+ if (qp->array2.n_extent != 0 &&
+ (extid >= qp->array2.low_extent ?
+ offset > extid - qp->array2.low_extent :
+ offset > qp->array2.low_extent - extid)) {
+ array = &qp->array2;
+ if (extid < array->low_extent) {
+ less = 1;
+ offset = array->low_extent - extid;
+ } else {
+ less = 0;
+ offset = extid - array->low_extent;
+ }
+ }
+
+ /*
+ * Check to see if the requested extent is outside the range of
+ * extents in the array. This is true by default if there are
+ * no extents here yet.
+ */
+ if (less == 1 || offset >= array->n_extent) {
+ oldext = array->n_extent;
+ numext = (array->hi_extent - array->low_extent) + 1;
+ if (less == 1 && offset + numext <= array->n_extent) {
+ /*
+ * If we can fit this one into the existing array by
+ * shifting the existing entries then we do not have
+ * to allocate.
+ */
+ memmove(&array->mpfarray[offset],
+ array->mpfarray, numext
+ * sizeof(array->mpfarray[0]));
+ memset(array->mpfarray, 0, offset
+ * sizeof(array->mpfarray[0]));
+ offset = 0;
+ } else if (less == 0 && offset == array->n_extent &&
+ (mode == QAM_PROBE_GET || mode == QAM_PROBE_PUT) &&
+ array->mpfarray[0].pinref == 0) {
+ /*
+ * If this is at the end of the array and the file at
+ * the beginning has a zero pin count we can close
+ * the bottom extent and put this one at the end.
+ */
+ mpf = array->mpfarray[0].mpf;
+ if (mpf != NULL && (ret = __memp_fclose(mpf, 0)) != 0)
+ goto err;
+ memmove(&array->mpfarray[0], &array->mpfarray[1],
+ (array->n_extent - 1) * sizeof(array->mpfarray[0]));
+ array->low_extent++;
+ array->hi_extent++;
+ offset--;
+ array->mpfarray[offset].mpf = NULL;
+ array->mpfarray[offset].pinref = 0;
+ } else {
+ /*
+ * See if we have wrapped around the queue.
+ * If it has then allocate the second array.
+ * Otherwise just expand the one we are using.
+ */
+ maxext = (u_int32_t) UINT32_MAX
+ / (qp->page_ext * qp->rec_page);
+ if (offset >= maxext/2) {
+ array = &qp->array2;
+ DB_ASSERT(env, array->n_extent == 0);
+ oldext = 0;
+ array->n_extent = 4;
+ array->low_extent = extid;
+ offset = 0;
+ numext = 0;
+ } else if (array->mpfarray[0].pinref == 0) {
+ /*
+ * Check to see if there are extents marked
+ * for deletion at the beginning of the cache.
+ * If so close them so they will go away.
+ */
+ for (i = 0; i < array->n_extent; i++) {
+ if (array->mpfarray[i].pinref != 0)
+ break;
+ mpf = array->mpfarray[i].mpf;
+ if (mpf == NULL)
+ continue;
+ (void)__memp_get_flags(mpf, &lflags);
+ if (!FLD_ISSET(lflags, DB_MPOOL_UNLINK))
+ break;
+
+ array->mpfarray[i].mpf = NULL;
+ if ((ret = __memp_fclose(mpf, 0)) != 0)
+ goto err;
+ }
+ if (i == 0)
+ goto increase;
+ memmove(&array->mpfarray[0],
+ &array->mpfarray[i],
+ (array->n_extent - i) *
+ sizeof(array->mpfarray[0]));
+ memset(&array->mpfarray[array->n_extent - i],
+ '\0', i * sizeof(array->mpfarray[0]));
+ array->low_extent += i;
+ array->hi_extent += i;
+ goto retry;
+ } else {
+ /*
+ * Increase the size to at least include
+ * the new one and double it.
+ */
+increase: array->n_extent += offset;
+ array->n_extent <<= 2;
+ }
+alloc: if ((ret = __os_realloc(env,
+ array->n_extent * sizeof(struct __qmpf),
+ &array->mpfarray)) != 0)
+ goto err;
+
+ if (less == 1) {
+ /*
+ * Move the array up and put the new one
+ * in the first slot.
+ */
+ memmove(&array->mpfarray[offset],
+ array->mpfarray,
+ numext * sizeof(array->mpfarray[0]));
+ memset(array->mpfarray, 0,
+ offset * sizeof(array->mpfarray[0]));
+ memset(&array->mpfarray[numext + offset], 0,
+ (array->n_extent - (numext + offset))
+ * sizeof(array->mpfarray[0]));
+ offset = 0;
+ }
+ else
+ /* Clear the new part of the array. */
+ memset(&array->mpfarray[oldext], 0,
+ (array->n_extent - oldext) *
+ sizeof(array->mpfarray[0]));
+ }
+ }
+
+ /* Update the low and hi range of saved extents. */
+ if (extid < array->low_extent)
+ array->low_extent = extid;
+ if (extid > array->hi_extent)
+ array->hi_extent = extid;
+
+ /* If the extent file is not yet open, open it. */
+ if (array->mpfarray[offset].mpf == NULL) {
+ QAM_EXNAME(qp, extid, buf, sizeof(buf));
+ if ((ret = __memp_fcreate(
+ env, &array->mpfarray[offset].mpf)) != 0)
+ goto err;
+ mpf = array->mpfarray[offset].mpf;
+ (void)__memp_set_lsn_offset(mpf, 0);
+ (void)__memp_set_pgcookie(mpf, &qp->pgcookie);
+ (void)__memp_get_ftype(dbp->mpf, &ftype);
+ (void)__memp_set_ftype(mpf, ftype);
+ (void)__memp_set_clear_len(mpf, dbp->pgsize);
+
+ /* Set up the fileid for this extent. */
+ __qam_exid(dbp, fid, extid);
+ (void)__memp_set_fileid(mpf, fid);
+ openflags = DB_EXTENT;
+ if (LF_ISSET(DB_MPOOL_CREATE))
+ openflags |= DB_CREATE;
+ if (F_ISSET(dbp, DB_AM_RDONLY))
+ openflags |= DB_RDONLY;
+ if (F_ISSET(env->dbenv, DB_ENV_DIRECT_DB))
+ openflags |= DB_DIRECT;
+ if ((ret = __memp_fopen(mpf, NULL,
+ buf, NULL, openflags, qp->mode, dbp->pgsize)) != 0) {
+ array->mpfarray[offset].mpf = NULL;
+ (void)__memp_fclose(mpf, 0);
+ goto err;
+ }
+ }
+
+ /*
+ * We have found the right file. Update its ref count
+ * before dropping the dbp mutex so it does not go away.
+ */
+ mpf = array->mpfarray[offset].mpf;
+ if (mode == QAM_PROBE_GET)
+ array->mpfarray[offset].pinref++;
+
+ /*
+ * If we may create the page, then we are writing and
+ * the file may no longer be empty after this operation,
+ * so we clear the UNLINK flag.
+ */
+ if (LF_ISSET(DB_MPOOL_CREATE))
+ (void)__memp_set_flags(mpf, DB_MPOOL_UNLINK, 0);
+
+err:
+ MUTEX_UNLOCK(env, dbp->mutex);
+
+ if (ret == 0) {
+ pgno--;
+ pgno %= qp->page_ext;
+ switch (mode) {
+ case QAM_PROBE_GET:
+ ret = __memp_fget(mpf, &pgno,
+ dbc->thread_info, dbc->txn, flags, addrp);
+ if (ret == 0)
+ return (0);
+ break;
+ case QAM_PROBE_PUT:
+ ret = __memp_fput(mpf,
+ dbc->thread_info, addrp, dbp->priority);
+ break;
+ case QAM_PROBE_DIRTY:
+ return (__memp_dirty(mpf, addrp,
+ dbc->thread_info, dbc->txn, dbp->priority, flags));
+ case QAM_PROBE_MPF:
+ *(DB_MPOOLFILE **)addrp = mpf;
+ return (0);
+ }
+
+ MUTEX_LOCK(env, dbp->mutex);
+ /* Recalculate because we dropped the lock. */
+ offset = extid - array->low_extent;
+ DB_ASSERT(env, array->mpfarray[offset].pinref > 0);
+ if (--array->mpfarray[offset].pinref == 0 &&
+ (mode == QAM_PROBE_GET || ret == 0)) {
+ /* Check to see if this file will be unlinked. */
+ (void)__memp_get_flags(mpf, &flags);
+ if (LF_ISSET(DB_MPOOL_UNLINK)) {
+ array->mpfarray[offset].mpf = NULL;
+ if ((t_ret =
+ __memp_fclose(mpf, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ }
+ MUTEX_UNLOCK(env, dbp->mutex);
+ }
+ return (ret);
+}
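+
+/*
+ * Illustrative arithmetic for the probe above: with page_ext pages
+ * per extent, page pgno lives at in-extent offset (pgno - 1) %
+ * page_ext, the adjustment applied just before the __memp_fget call.
+ * For example, with page_ext == 4, pages 1..4 map to offsets 0..3 of
+ * the first extent and page 6 maps to offset 1 of the next.
+ */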
+
+/*
+ * __qam_fclose -- close an extent.
+ *
+ * Calculate which extent the page is in and close it.
+ * We assume the mpf entry is present.
+ *
+ * PUBLIC: int __qam_fclose __P((DB *, db_pgno_t));
+ */
+int
+__qam_fclose(dbp, pgnoaddr)
+ DB *dbp;
+ db_pgno_t pgnoaddr;
+{
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ MPFARRAY *array;
+ QUEUE *qp;
+ u_int32_t extid, offset;
+ int ret;
+
+ ret = 0;
+ env = dbp->env;
+ qp = (QUEUE *)dbp->q_internal;
+
+ MUTEX_LOCK(env, dbp->mutex);
+
+ extid = QAM_PAGE_EXTENT(dbp, pgnoaddr);
+ array = &qp->array1;
+ if (array->low_extent > extid || array->hi_extent < extid)
+ array = &qp->array2;
+ offset = extid - array->low_extent;
+
+ DB_ASSERT(env,
+ extid >= array->low_extent && offset < array->n_extent);
+
+ /* If other threads are still using this file, leave it. */
+ if (array->mpfarray[offset].pinref != 0)
+ goto done;
+
+ mpf = array->mpfarray[offset].mpf;
+ array->mpfarray[offset].mpf = NULL;
+ ret = __memp_fclose(mpf, 0);
+
+done:
+ MUTEX_UNLOCK(env, dbp->mutex);
+ return (ret);
+}
+
+/*
+ * __qam_fremove -- remove an extent.
+ *
+ * Calculate which extent the page is in and remove it. There is no way
+ * to remove an extent without probing it first and seeing that it is empty,
+ * so we assume the mpf entry is present.
+ *
+ * PUBLIC: int __qam_fremove __P((DB *, db_pgno_t));
+ */
+int
+__qam_fremove(dbp, pgnoaddr)
+ DB *dbp;
+ db_pgno_t pgnoaddr;
+{
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ MPFARRAY *array;
+ QUEUE *qp;
+ u_int32_t extid, offset;
+ int ret;
+
+ qp = (QUEUE *)dbp->q_internal;
+ env = dbp->env;
+ ret = 0;
+
+ MUTEX_LOCK(env, dbp->mutex);
+
+ extid = QAM_PAGE_EXTENT(dbp, pgnoaddr);
+ array = &qp->array1;
+ if (array->low_extent > extid || array->hi_extent < extid)
+ array = &qp->array2;
+ offset = extid - array->low_extent;
+
+ DB_ASSERT(env,
+ extid >= array->low_extent && offset < array->n_extent);
+
+ mpf = array->mpfarray[offset].mpf;
+ /* This extent may already be marked for delete and closed. */
+ if (mpf == NULL)
+ goto err;
+
+ /*
+ * The log must be flushed before the file is deleted. We depend on
+ * the log record of the last delete to recreate the file if we crash.
+ */
+ if (LOGGING_ON(env) && (ret = __log_flush(env, NULL)) != 0)
+ goto err;
+
+ (void)__memp_set_flags(mpf, DB_MPOOL_UNLINK, 1);
+ /* Someone could be real slow, let them close it down. */
+ if (array->mpfarray[offset].pinref != 0)
+ goto err;
+ array->mpfarray[offset].mpf = NULL;
+ if ((ret = __memp_fclose(mpf, 0)) != 0)
+ goto err;
+
+ /*
+ * If the file is at the bottom of the array
+ * shift things down and adjust the end points.
+ */
+ if (offset == 0) {
+ memmove(array->mpfarray, &array->mpfarray[1],
+ (array->hi_extent - array->low_extent)
+ * sizeof(array->mpfarray[0]));
+ array->mpfarray[
+ array->hi_extent - array->low_extent].mpf = NULL;
+ if (array->low_extent != array->hi_extent)
+ array->low_extent++;
+ } else {
+ if (extid == array->hi_extent)
+ array->hi_extent--;
+ }
+
+err: MUTEX_UNLOCK(env, dbp->mutex);
+
+ return (ret);
+}
+
+/*
+ * __qam_sync --
+ * Flush the database cache.
+ *
+ * PUBLIC: int __qam_sync __P((DB *));
+ */
+int
+__qam_sync(dbp)
+ DB *dbp;
+{
+ int ret;
+ /*
+ * We can't easily identify the extent files associated with a specific
+ * Queue file, so flush all Queue extent files.
+ */
+ if ((ret = __memp_fsync(dbp->mpf)) != 0)
+ return (ret);
+ if (((QUEUE *)dbp->q_internal)->page_ext != 0)
+ return (__memp_sync_int(
+ dbp->env, NULL, 0, DB_SYNC_QUEUE_EXTENT, NULL, NULL));
+ return (0);
+}
+
+/*
+ * __qam_gen_filelist -- generate a list of extent files.
+ * Another thread may close the handle so this should only
+ * be used single-threaded or with care.
+ *
+ * PUBLIC: int __qam_gen_filelist __P((DB *,
+ * PUBLIC: DB_THREAD_INFO *, QUEUE_FILELIST **));
+ */
+int
+__qam_gen_filelist(dbp, ip, filelistp)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ QUEUE_FILELIST **filelistp;
+{
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ QMETA *meta;
+ QUEUE *qp;
+ size_t extent_cnt;
+ db_recno_t i, current, first, stop, rec_extent;
+ QUEUE_FILELIST *fp;
+ int ret;
+
+ env = dbp->env;
+ mpf = dbp->mpf;
+ qp = (QUEUE *)dbp->q_internal;
+ *filelistp = NULL;
+
+ if (qp->page_ext == 0)
+ return (0);
+
+ /* This may happen during metapage recovery. */
+ if (qp->name == NULL)
+ return (0);
+
+ /* Find out the first and last record numbers in the database. */
+ i = PGNO_BASE_MD;
+ if ((ret = __memp_fget(mpf, &i, ip, NULL, 0, &meta)) != 0)
+ return (ret);
+
+ current = meta->cur_recno;
+ first = meta->first_recno;
+
+ if ((ret = __memp_fput(mpf, ip, meta, dbp->priority)) != 0)
+ return (ret);
+
+ /*
+ * Allocate the extent array. Calculate the worst case number of
+ * pages and convert that to a count of extents. The count of
+ * extents has 3 or 4 extra slots:
+ * roundoff at first (e.g., current record in extent);
+ * roundoff at current (e.g., first record in extent);
+ * NULL termination; and
+ * UINT32_MAX wraparound (the last extent can be small).
+ */
+ rec_extent = qp->rec_page * qp->page_ext;
+ if (current >= first)
+ extent_cnt = (current - first) / rec_extent + 3;
+ else
+ extent_cnt =
+ (current + (UINT32_MAX - first)) / rec_extent + 4;
+
+ if (extent_cnt == 0)
+ return (0);
+ if ((ret = __os_calloc(env,
+ extent_cnt, sizeof(QUEUE_FILELIST), filelistp)) != 0)
+ return (ret);
+ fp = *filelistp;
+ if ((ret = __db_cursor(dbp, ip, NULL, &dbc, 0)) != 0)
+ return (ret);
+
+again:
+ if (current >= first)
+ stop = current;
+ else
+ stop = UINT32_MAX;
+
+ /*
+ * Make sure that first is at the same offset in the extent as stop.
+ * This guarantees that the stop will be reached in the loop below,
+ * even if it is the only record in its extent. This calculation is
+ * safe because first won't move out of its extent.
+ */
+ first -= first % rec_extent;
+ first += stop % rec_extent;
+
+ for (i = first; i >= first && i <= stop; i += rec_extent) {
+ if ((ret = __qam_fprobe(dbc, QAM_RECNO_PAGE(dbp, i),
+ &fp->mpf, QAM_PROBE_MPF, dbp->priority, 0)) != 0) {
+ if (ret == ENOENT)
+ continue;
+ goto err;
+ }
+ fp->id = QAM_RECNO_EXTENT(dbp, i);
+ fp++;
+ DB_ASSERT(env, (size_t)(fp - *filelistp) < extent_cnt);
+ }
+
+ if (current < first) {
+ first = 1;
+ goto again;
+ }
+
+err: (void)__dbc_close(dbc);
+ return (ret);
+}
+
+/*
+ * __qam_extent_names -- generate a list of extent files names.
+ *
+ * PUBLIC: int __qam_extent_names __P((ENV *, char *, char ***));
+ */
+int
+__qam_extent_names(env, name, namelistp)
+ ENV *env;
+ char *name;
+ char ***namelistp;
+{
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ QUEUE *qp;
+ QUEUE_FILELIST *filelist, *fp;
+ size_t len;
+ int cnt, ret, t_ret;
+ char buf[DB_MAXPATHLEN], **cp, *freep;
+
+ *namelistp = NULL;
+ filelist = NULL;
+ ENV_GET_THREAD_INFO(env, ip);
+ if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+ return (ret);
+ if ((ret = __db_open(dbp, ip,
+ NULL, name, NULL, DB_QUEUE, DB_RDONLY, 0, PGNO_BASE_MD)) != 0)
+ goto done;
+ qp = dbp->q_internal;
+ if (qp->page_ext == 0)
+ goto done;
+
+ if ((ret = __qam_gen_filelist(dbp, ip, &filelist)) != 0)
+ goto done;
+
+ if (filelist == NULL)
+ goto done;
+
+ cnt = 0;
+ for (fp = filelist; fp->mpf != NULL; fp++)
+ cnt++;
+
+ /* QUEUE_EXTENT contains extra chars, but add 6 anyway for the int. */
+ len = (size_t)cnt * (sizeof(**namelistp) +
+ strlen(QUEUE_EXTENT) + strlen(qp->dir) + strlen(qp->name) + 6);
+
+ if ((ret = __os_malloc(dbp->env, len, namelistp)) != 0)
+ goto done;
+ cp = *namelistp;
+ freep = (char *)(cp + cnt + 1);
+ for (fp = filelist; fp->mpf != NULL; fp++) {
+ QAM_EXNAME(qp, fp->id, buf, sizeof(buf));
+ len = strlen(buf);
+ *cp++ = freep;
+ (void)strcpy(freep, buf);
+ freep += len + 1;
+ }
+ *cp = NULL;
+
+done:
+ if (filelist != NULL)
+ __os_free(dbp->env, filelist);
+ if ((t_ret = __db_close(dbp, NULL, DB_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __qam_exid --
+ * Generate a fileid for an extent based on the fileid of the main
+ * file. Since we do not log schema creates/deletes explicitly, the log
+ * never captures the fileid of an extent file. In order that masters and
+ * replicas have the same fileids (so they can explicitly delete them), we
+ * use computed fileids for the extent files of Queue files.
+ *
+ * An extent file id retains the low order 12 bytes of the file id and
+ * overwrites the dev/inode fields, placing a 0 in the inode field, and
+ * the extent number in the dev field.
+ *
+ * PUBLIC: void __qam_exid __P((DB *, u_int8_t *, u_int32_t));
+ */
+void
+__qam_exid(dbp, fidp, exnum)
+ DB *dbp;
+ u_int8_t *fidp;
+ u_int32_t exnum;
+{
+ int i;
+ u_int8_t *p;
+
+ /* Copy the fileid from the master. */
+ memcpy(fidp, dbp->fileid, DB_FILE_ID_LEN);
+
+ /* The first four bytes are the inode or the FileIndexLow; 0 it. */
+ for (i = sizeof(u_int32_t); i > 0; --i)
+ *fidp++ = 0;
+
+ /* The next four bytes are the dev/FileIndexHigh; insert the exnum. */
+ for (p = (u_int8_t *)&exnum, i = sizeof(u_int32_t); i > 0; --i)
+ *fidp++ = *p++;
+}
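+
+/*
+ * For reference, the resulting 20-byte fileid layout:
+ *
+ *	bytes 0..3	0 (inode/FileIndexLow, cleared)
+ *	bytes 4..7	exnum (dev/FileIndexHigh)
+ *	bytes 8..19	copied unchanged from the master fileid
+ */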
+
+/*
+ * __qam_nameop --
+ * Remove or rename extent files associated with a particular file.
+ * This is to remove or rename (both in mpool and the file system) any
+ * extent files associated with the given dbp.
+ * This is either called from the QUEUE remove or rename methods or
+ * when undoing a transaction that created the database.
+ *
+ * PUBLIC: int __qam_nameop __P((DB *, DB_TXN *, const char *, qam_name_op));
+ */
+int
+__qam_nameop(dbp, txn, newname, op)
+ DB *dbp;
+ DB_TXN *txn;
+ const char *newname;
+ qam_name_op op;
+{
+ ENV *env;
+ QUEUE *qp;
+ size_t exlen, fulllen, len;
+ u_int8_t fid[DB_FILE_ID_LEN];
+ u_int32_t exid;
+ int cnt, i, ret, t_ret;
+ char buf[DB_MAXPATHLEN], nbuf[DB_MAXPATHLEN], sepsave;
+ char *endname, *endpath, *exname, *fullname, **names;
+ char *ndir, *namep, *new, *cp;
+
+ env = dbp->env;
+ qp = (QUEUE *)dbp->q_internal;
+ cnt = ret = t_ret = 0;
+ namep = exname = fullname = NULL;
+ names = NULL;
+
+ /* If this isn't a queue with extents, we're done. */
+ if (qp->page_ext == 0)
+ return (0);
+
+ /*
+ * Generate the list of all queue extents for this file (from the
+ * file system) and then cycle through removing them and evicting
+ * from mpool. We have two modes of operation here. If we are
+ * undoing log operations, then do not write log records and try
+ * to keep going even if we encounter failures in nameop. If we
+ * are in mainline code, then return as soon as we have a problem.
+ * Memory allocation errors (__db_appname, __os_malloc) are always
+	 * considered failures.
+ *
+	 * Set buf to: dir/__dbq.NAME.0 and fullname to HOME/dir/__dbq.NAME.0
+ * or, in the case of an absolute path: /dir/__dbq.NAME.0
+ */
+ QAM_EXNAME(qp, 0, buf, sizeof(buf));
+ if ((ret = __db_appname(env,
+ DB_APP_DATA, buf, &dbp->dirname, &fullname)) != 0)
+ return (ret);
+
+ /* We should always have a path separator here. */
+ if ((endpath = __db_rpath(fullname)) == NULL) {
+ ret = EINVAL;
+ goto err;
+ }
+ sepsave = *endpath;
+ *endpath = '\0';
+
+ /*
+ * Get the list of all names in the directory and restore the
+ * path separator.
+ */
+ if ((ret = __os_dirlist(env, fullname, 0, &names, &cnt)) != 0)
+ goto err;
+ *endpath = sepsave;
+
+ /* If there aren't any names, don't allocate any space. */
+ if (cnt == 0)
+ goto err;
+
+ /*
+	 * Now make endpath point at the file-name component so we can
+	 * match queue extent names.  Then terminate the string at the
+	 * beginning of the extent number, so the bytes between endpath
+	 * and endname (__dbq.NAME.) form the prefix we compare against.
+ */
+ endpath++;
+ endname = strrchr(endpath, '.');
+ if (endname == NULL) {
+ ret = EINVAL;
+ goto err;
+ }
+ ++endname;
+ *endname = '\0';
+ len = strlen(endpath);
+ fulllen = strlen(fullname);
+
+ /* Allocate space for a full extent name. */
+ exlen = fulllen + 20;
+ if ((ret = __os_malloc(env, exlen, &exname)) != 0)
+ goto err;
+
+ ndir = new = NULL;
+ if (newname != NULL) {
+ if ((ret = __os_strdup(env, newname, &namep)) != 0)
+ goto err;
+ ndir = namep;
+ if ((new = __db_rpath(namep)) != NULL)
+ *new++ = '\0';
+ else {
+ new = namep;
+ ndir = PATH_DOT;
+ }
+ }
+ for (i = 0; i < cnt; i++) {
+ /* Check if this is a queue extent file. */
+ if (strncmp(names[i], endpath, len) != 0)
+ continue;
+ /* Make sure we have all numbers. foo.db vs. foo.db.0. */
+ for (cp = &names[i][len]; *cp != '\0'; cp++)
+ if (!isdigit((int)*cp))
+ break;
+ if (*cp != '\0')
+ continue;
+
+ /*
+ * We have a queue extent file. We need to generate its
+ * name and its fileid.
+ */
+ exid = (u_int32_t)strtoul(names[i] + len, NULL, 10);
+ __qam_exid(dbp, fid, exid);
+
+ switch (op) {
+ case QAM_NAME_DISCARD:
+ snprintf(exname, exlen,
+ "%s%s", fullname, names[i] + len);
+ if ((t_ret = __memp_nameop(dbp->env,
+ fid, NULL, exname, NULL,
+ F_ISSET(dbp, DB_AM_INMEM))) != 0 && ret == 0)
+ ret = t_ret;
+ break;
+
+ case QAM_NAME_RENAME:
+ snprintf(nbuf, sizeof(nbuf), QUEUE_EXTENT,
+ ndir, PATH_SEPARATOR[0], new, exid);
+ QAM_EXNAME(qp, exid, buf, sizeof(buf));
+ if ((ret = __fop_rename(env,
+ txn, buf, nbuf, &dbp->dirname, fid, DB_APP_DATA, 1,
+ F_ISSET(dbp, DB_AM_NOT_DURABLE) ?
+ DB_LOG_NOT_DURABLE : 0)) != 0)
+ goto err;
+ break;
+
+ case QAM_NAME_REMOVE:
+ QAM_EXNAME(qp, exid, buf, sizeof(buf));
+ if ((ret = __fop_remove(env, txn, fid,
+ buf, &dbp->dirname,
+ DB_APP_DATA, F_ISSET(dbp, DB_AM_NOT_DURABLE) ?
+ DB_LOG_NOT_DURABLE : 0)) != 0)
+ goto err;
+ break;
+ }
+ }
+
+err: if (fullname != NULL)
+ __os_free(env, fullname);
+ if (exname != NULL)
+ __os_free(env, exname);
+ if (namep != NULL)
+ __os_free(env, namep);
+ if (names != NULL)
+ __os_dirfree(env, names, cnt);
+ return (ret);
+}
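+
+/*
+ * Editorial worked example (assumed names): for a queue "q.db" in data
+ * directory "d", QAM_EXNAME sets buf to "d/__dbq.q.db.0" and __db_appname
+ * expands fullname to "HOME/d/__dbq.q.db.0".  After the strrchr() trim,
+ * endpath is "__dbq.q.db.", so directory entries "__dbq.q.db.0" and
+ * "__dbq.q.db.12" match, while "q.db" fails the prefix compare and
+ * "__dbq.q.db.tmp" fails the all-digits check.
+ */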
+
+/*
+ * __qam_lsn_reset -- reset the lsns for extents.
+ *
+ * PUBLIC: int __qam_lsn_reset __P((DB *, DB_THREAD_INFO *));
+ */
+int
+__qam_lsn_reset(dbp, ip)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+{
+ QUEUE *qp;
+ QUEUE_FILELIST *filelist, *fp;
+ int ret;
+
+ qp = dbp->q_internal;
+ if (qp->page_ext == 0)
+ return (0);
+
+ if ((ret = __qam_gen_filelist(dbp, ip, &filelist)) != 0)
+ return (ret);
+
+ if (filelist == NULL)
+ return (ret);
+
+ for (fp = filelist; fp->mpf != NULL; fp++)
+ if ((ret = __db_lsn_reset(fp->mpf, ip)) != 0)
+ break;
+
+ __os_free(dbp->env, filelist);
+ return (ret);
+}
+
+/*
+ * __qam_backup_extents --
+ * Routine to safely copy the active queue extents of a database.
+ * PUBLIC: int __qam_backup_extents __P((DB *,
+ * PUBLIC: DB_THREAD_INFO *, const char *, u_int32_t));
+ */
+int
+__qam_backup_extents(dbp, ip, target, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ const char *target;
+ u_int32_t flags;
+{
+ DB_FH *filep;
+ QUEUE *qp;
+ QUEUE_FILELIST *fp, *filelist;
+ int ret, t_ret;
+ char buf[DB_MAXPATHLEN];
+ void *handle;
+
+ if ((ret = __qam_gen_filelist(dbp, ip, &filelist)) != 0)
+ return (ret);
+
+ if (filelist == NULL)
+ return (0);
+
+ qp = dbp->q_internal;
+
+ for (fp = filelist; fp->mpf != NULL; fp++) {
+ QAM_EXNAME(qp, fp->id, buf, sizeof(buf));
+ if ((ret = __memp_backup_open(dbp->dbenv->env,
+ fp->mpf, buf, target, flags, &filep, &handle)) == 0)
+ ret = __memp_backup_mpf(dbp->dbenv->env, fp->mpf, ip,
+ 0, fp->mpf->mfp->last_pgno, filep, handle, flags);
+ if ((t_ret = __memp_backup_close(dbp->dbenv->env,
+ fp->mpf, buf, filep, handle)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ break;
+ }
+
+ __os_free(dbp->env, filelist);
+
+ return (ret);
+}
diff --git a/src/qam/qam_method.c b/src/qam/qam_method.c
new file mode 100644
index 00000000..0867e5dd
--- /dev/null
+++ b/src/qam/qam_method.c
@@ -0,0 +1,399 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/qam.h"
+#include "dbinc/txn.h"
+
+static int __qam_rr __P((DB *, DB_THREAD_INFO *, DB_TXN *,
+ const char *, const char *, const char *, qam_name_op));
+static int __qam_set_extentsize __P((DB *, u_int32_t));
+
+/*
+ * __qam_db_create --
+ * Queue specific initialization of the DB structure.
+ *
+ * PUBLIC: int __qam_db_create __P((DB *));
+ */
+int
+__qam_db_create(dbp)
+ DB *dbp;
+{
+ QUEUE *t;
+ int ret;
+
+ /* Allocate and initialize the private queue structure. */
+ if ((ret = __os_calloc(dbp->env, 1, sizeof(QUEUE), &t)) != 0)
+ return (ret);
+ dbp->q_internal = t;
+ dbp->get_q_extentsize = __qam_get_extentsize;
+ dbp->set_q_extentsize = __qam_set_extentsize;
+
+ t->re_pad = ' ';
+
+ return (0);
+}
+
+/*
+ * __qam_db_close --
+ * Queue specific discard of the DB structure.
+ *
+ * PUBLIC: int __qam_db_close __P((DB *, u_int32_t));
+ */
+int
+__qam_db_close(dbp, flags)
+ DB *dbp;
+ u_int32_t flags;
+{
+ DB_MPOOLFILE *mpf;
+ MPFARRAY *array;
+ QUEUE *t;
+ struct __qmpf *mpfp;
+ u_int32_t i;
+ int ret, t_ret;
+
+ ret = 0;
+ if ((t = dbp->q_internal) == NULL)
+ return (0);
+
+ array = &t->array1;
+again:
+ mpfp = array->mpfarray;
+ if (mpfp != NULL) {
+ for (i = array->low_extent;
+ i <= array->hi_extent; i++, mpfp++) {
+ mpf = mpfp->mpf;
+ mpfp->mpf = NULL;
+ if (mpf != NULL && (t_ret = __memp_fclose(mpf,
+ LF_ISSET(DB_AM_DISCARD) ? DB_MPOOL_DISCARD : 0))
+ != 0 && ret == 0)
+ ret = t_ret;
+ }
+ __os_free(dbp->env, array->mpfarray);
+ }
+ if (t->array2.n_extent != 0) {
+ array = &t->array2;
+ array->n_extent = 0;
+ goto again;
+ }
+
+ if (LF_ISSET(DB_AM_DISCARD) &&
+ (t_ret = __qam_nameop(dbp, NULL,
+ NULL, QAM_NAME_DISCARD)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (t->path != NULL)
+ __os_free(dbp->env, t->path);
+ __os_free(dbp->env, t);
+ dbp->q_internal = NULL;
+
+ return (ret);
+}
+
+/*
+ * __qam_get_extentsize --
+ * The DB->q_get_extentsize method.
+ *
+ * PUBLIC: int __qam_get_extentsize __P((DB *, u_int32_t *));
+ */
+int
+__qam_get_extentsize(dbp, q_extentsizep)
+ DB *dbp;
+ u_int32_t *q_extentsizep;
+{
+ *q_extentsizep = ((QUEUE*)dbp->q_internal)->page_ext;
+ return (0);
+}
+
+static int
+__qam_set_extentsize(dbp, extentsize)
+ DB *dbp;
+ u_int32_t extentsize;
+{
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_extentsize");
+
+ if (extentsize < 1) {
+ __db_errx(dbp->env, DB_STR("1140",
+ "Extent size must be at least 1"));
+ return (EINVAL);
+ }
+
+ ((QUEUE*)dbp->q_internal)->page_ext = extentsize;
+
+ return (0);
+}
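+
+/*
+ * Editorial caller sketch (hypothetical handle and names): the extent
+ * size must be configured before the database is opened, e.g.
+ *
+ *	DB *dbp;
+ *	int ret;
+ *	if ((ret = db_create(&dbp, dbenv, 0)) == 0 &&
+ *	    (ret = dbp->set_q_extentsize(dbp, 4)) == 0)
+ *		ret = dbp->open(dbp,
+ *		    NULL, "q.db", NULL, DB_QUEUE, DB_CREATE, 0);
+ */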
+
+/*
+ * __queue_pageinfo -
+ * Given a dbp, get first/last page information about a queue.
+ *
+ * PUBLIC: int __queue_pageinfo __P((DB *, db_pgno_t *, db_pgno_t *,
+ * PUBLIC: int *, int, u_int32_t));
+ */
+int
+__queue_pageinfo(dbp, firstp, lastp, emptyp, prpage, flags)
+ DB *dbp;
+ db_pgno_t *firstp, *lastp;
+ int *emptyp;
+ int prpage;
+ u_int32_t flags;
+{
+ DB_MPOOLFILE *mpf;
+ DB_THREAD_INFO *ip;
+ QMETA *meta;
+ db_pgno_t first, i, last;
+ int empty, ret, t_ret;
+
+ mpf = dbp->mpf;
+ ENV_GET_THREAD_INFO(dbp->env, ip);
+
+ /* Find out the page number of the last page in the database. */
+ i = PGNO_BASE_MD;
+ if ((ret = __memp_fget(mpf, &i, ip, NULL, 0, &meta)) != 0)
+ return (ret);
+
+ first = QAM_RECNO_PAGE(dbp, meta->first_recno);
+ last = QAM_RECNO_PAGE(
+ dbp, meta->cur_recno == 1 ? 1 : meta->cur_recno - 1);
+
+ empty = meta->cur_recno == meta->first_recno;
+ if (firstp != NULL)
+ *firstp = first;
+ if (lastp != NULL)
+ *lastp = last;
+ if (emptyp != NULL)
+ *emptyp = empty;
+#ifdef HAVE_STATISTICS
+ if (prpage)
+ ret = __db_prpage(dbp, (PAGE *)meta, flags);
+#else
+ COMPQUIET(prpage, 0);
+ COMPQUIET(flags, 0);
+#endif
+
+ if ((t_ret = __memp_fput(mpf,
+ ip, meta, dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
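+
+/*
+ * Editorial note: first > last means the record numbers have wrapped past
+ * UINT32_MAX, so callers such as __db_prqueue() below walk the pages in
+ * two passes, first..QAM_RECNO_PAGE(dbp, UINT32_MAX) and then 1..last.
+ */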
+
+#ifdef HAVE_STATISTICS
+/*
+ * __db_prqueue --
+ * Print out a queue
+ *
+ * PUBLIC: int __db_prqueue __P((DB *, u_int32_t));
+ */
+int
+__db_prqueue(dbp, flags)
+ DB *dbp;
+ u_int32_t flags;
+{
+ DBC *dbc;
+ DB_THREAD_INFO *ip;
+ PAGE *h;
+ db_pgno_t first, i, last, pg_ext, stop;
+ int empty, ret, t_ret;
+
+ if ((ret = __queue_pageinfo(dbp, &first, &last, &empty, 1, flags)) != 0)
+ return (ret);
+
+ if (empty || ret != 0)
+ return (ret);
+
+ ENV_GET_THREAD_INFO(dbp->env, ip);
+ if ((ret = __db_cursor(dbp, ip, NULL, &dbc, 0)) != 0)
+ return (ret);
+ i = first;
+ if (first > last)
+ stop = QAM_RECNO_PAGE(dbp, UINT32_MAX);
+ else
+ stop = last;
+
+ /* Dump each page. */
+ pg_ext = ((QUEUE *)dbp->q_internal)->page_ext;
+begin:
+ for (; i <= stop; ++i) {
+ if ((ret = __qam_fget(dbc, &i, 0, &h)) != 0) {
+ if (pg_ext == 0) {
+ if (ret == DB_PAGE_NOTFOUND && first == last)
+ ret = 0;
+ goto err;
+ }
+ if (ret == ENOENT || ret == DB_PAGE_NOTFOUND) {
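+				/*
+				 * Land on the last page of the missing
+				 * extent; the loop increment then moves
+				 * to the first page of the next extent
+				 * (e.g., pg_ext 4, i 6: i becomes 8,
+				 * then ++i yields 9).
+				 */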
+ i += (pg_ext - ((i - 1) % pg_ext)) - 1;
+ ret = 0;
+ continue;
+ }
+ goto err;
+ }
+ (void)__db_prpage(dbp, h, flags);
+ if ((ret = __qam_fput(dbc, i, h, dbp->priority)) != 0)
+ goto err;
+ }
+
+ if (first > last) {
+ i = 1;
+ stop = last;
+ first = last;
+ goto begin;
+ }
+
+err:
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+#endif
+
+/*
+ * __qam_remove --
+ * Remove method for a Queue.
+ *
+ * PUBLIC: int __qam_remove __P((DB *, DB_THREAD_INFO *, DB_TXN *,
+ * PUBLIC: const char *, const char *, u_int32_t));
+ */
+int
+__qam_remove(dbp, ip, txn, name, subdb, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name, *subdb;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+ return (__qam_rr(dbp, ip, txn, name, subdb, NULL, QAM_NAME_REMOVE));
+}
+
+/*
+ * __qam_rename --
+ * Rename method for a Queue.
+ *
+ * PUBLIC: int __qam_rename __P((DB *, DB_THREAD_INFO *, DB_TXN *,
+ * PUBLIC: const char *, const char *, const char *));
+ */
+int
+__qam_rename(dbp, ip, txn, name, subdb, newname)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name, *subdb, *newname;
+{
+ return (__qam_rr(dbp, ip, txn, name, subdb, newname, QAM_NAME_RENAME));
+}
+
+/*
+ * __qam_rr --
+ * Remove/Rename method for a Queue.
+ */
+static int
+__qam_rr(dbp, ip, txn, name, subdb, newname, op)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name, *subdb, *newname;
+ qam_name_op op;
+{
+ DB *tmpdbp;
+ ENV *env;
+ QUEUE *qp;
+ int ret, t_ret;
+
+ env = dbp->env;
+ ret = 0;
+
+ if (subdb != NULL && name != NULL) {
+ __db_errx(env, DB_STR("1141",
+ "Queue does not support multiple databases per file"));
+ return (EINVAL);
+ }
+
+ /*
+ * Since regular rename no longer opens the database, we may have
+ * to do it here.
+ */
+ if (F_ISSET(dbp, DB_AM_OPEN_CALLED))
+ tmpdbp = dbp;
+ else {
+ if ((ret = __db_create_internal(&tmpdbp, env, 0)) != 0)
+ return (ret);
+
+ /*
+ * We need to make sure we don't self-deadlock, so give
+ * this dbp the same locker as the incoming one.
+ */
+ tmpdbp->locker = dbp->locker;
+ if ((ret = __db_open(tmpdbp, ip, txn,
+ name, NULL, DB_QUEUE, DB_RDONLY, 0, PGNO_BASE_MD)) != 0)
+ goto err;
+ }
+
+ qp = (QUEUE *)tmpdbp->q_internal;
+ if (qp->page_ext != 0)
+ ret = __qam_nameop(tmpdbp, txn, newname, op);
+
+ if (!F_ISSET(dbp, DB_AM_OPEN_CALLED)) {
+err: /*
+ * Since we copied the locker ID from the dbp, we'd better not
+ * free it here.
+ */
+ tmpdbp->locker = NULL;
+
+ /* We need to remove the lock event we associated with this. */
+ if (txn != NULL)
+ __txn_remlock(env,
+ txn, &tmpdbp->handle_lock, DB_LOCK_INVALIDID);
+
+ if ((t_ret = __db_close(tmpdbp,
+ txn, DB_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ return (ret);
+}
+
+/*
+ * __qam_map_flags --
+ * Map queue-specific flags from public to the internal values.
+ *
+ * PUBLIC: void __qam_map_flags __P((DB *, u_int32_t *, u_int32_t *));
+ */
+void
+__qam_map_flags(dbp, inflagsp, outflagsp)
+ DB *dbp;
+ u_int32_t *inflagsp, *outflagsp;
+{
+ COMPQUIET(dbp, NULL);
+
+ if (FLD_ISSET(*inflagsp, DB_INORDER)) {
+ FLD_SET(*outflagsp, DB_AM_INORDER);
+ FLD_CLR(*inflagsp, DB_INORDER);
+ }
+}
+
+/*
+ * __qam_set_flags --
+ * Set queue-specific flags.
+ *
+ * PUBLIC: int __qam_set_flags __P((DB *, u_int32_t *flagsp));
+ */
+int
+__qam_set_flags(dbp, flagsp)
+ DB *dbp;
+ u_int32_t *flagsp;
+{
+
+ __qam_map_flags(dbp, flagsp, &dbp->flags);
+ return (0);
+}
diff --git a/src/qam/qam_open.c b/src/qam/qam_open.c
new file mode 100644
index 00000000..69f6cb75
--- /dev/null
+++ b/src/qam/qam_open.c
@@ -0,0 +1,346 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_swap.h"
+#include "dbinc/db_am.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/qam.h"
+#include "dbinc/fop.h"
+
+static int __qam_init_meta __P((DB *, QMETA *));
+
+/*
+ * __qam_open --
+ *	Open a queue database and set up the private QUEUE information.
+ *
+ * PUBLIC: int __qam_open __P((DB *, DB_THREAD_INFO *,
+ * PUBLIC: DB_TXN *, const char *, db_pgno_t, int, u_int32_t));
+ */
+int
+__qam_open(dbp, ip, txn, name, base_pgno, mode, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name;
+ db_pgno_t base_pgno;
+ int mode;
+ u_int32_t flags;
+{
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ QMETA *qmeta;
+ QUEUE *t;
+ int ret, t_ret;
+
+ env = dbp->env;
+ mpf = dbp->mpf;
+ t = dbp->q_internal;
+ ret = 0;
+ qmeta = NULL;
+
+ if (name == NULL && t->page_ext != 0) {
+ __db_errx(env, DB_STR("1134",
+ "Extent size may not be specified for in-memory queue database"));
+ return (EINVAL);
+ }
+
+ if (MULTIVERSION(dbp)) {
+ __db_errx(env, DB_STR("1135",
+ "Multiversion queue databases are not supported"));
+ return (EINVAL);
+ }
+
+ /* Initialize the remaining fields/methods of the DB. */
+ dbp->db_am_remove = __qam_remove;
+ dbp->db_am_rename = __qam_rename;
+
+ /*
+ * Get a cursor. If DB_CREATE is specified, we may be creating
+ * pages, and to do that safely in CDB we need a write cursor.
+ * In STD_LOCKING mode, we'll synchronize using the meta page
+ * lock instead.
+ */
+ if ((ret = __db_cursor(dbp, ip, txn, &dbc,
+ LF_ISSET(DB_CREATE) && CDB_LOCKING(env) ?
+ DB_WRITECURSOR : 0)) != 0)
+ return (ret);
+
+ /*
+ * Get the meta data page. It must exist, because creates of
+ * files/databases come in through the __qam_new_file interface
+ * and queue doesn't support subdatabases.
+ */
+ if ((ret = __memp_fget(mpf, &base_pgno, ip, txn, 0, &qmeta)) != 0)
+ goto err;
+
+ /* If the magic number is incorrect, that's a fatal error. */
+ if (qmeta->dbmeta.magic != DB_QAMMAGIC) {
+ __db_errx(env, DB_STR_A("1136",
+ "__qam_open: %s: unexpected file type or format", "%s"),
+ name);
+ ret = EINVAL;
+ goto err;
+ }
+
+ /* Setup information needed to open extents. */
+ t->page_ext = qmeta->page_ext;
+
+ if (t->page_ext != 0 && (ret = __qam_set_ext_data(dbp, name)) != 0)
+ goto err;
+
+ if (mode == 0)
+ mode = DB_MODE_660;
+ t->mode = mode;
+ t->re_pad = (int)qmeta->re_pad;
+ t->re_len = qmeta->re_len;
+ t->rec_page = qmeta->rec_page;
+
+ t->q_meta = base_pgno;
+ t->q_root = base_pgno + 1;
+
+err: if (qmeta != NULL && (t_ret =
+ __memp_fput(mpf, ip, qmeta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __qam_set_ext_data --
+ * Setup DBP data for opening queue extents.
+ *
+ * PUBLIC: int __qam_set_ext_data __P((DB*, const char *));
+ */
+int
+__qam_set_ext_data(dbp, name)
+ DB *dbp;
+ const char *name;
+{
+ QUEUE *t;
+ int ret;
+
+ t = dbp->q_internal;
+ t->pginfo.db_pagesize = dbp->pgsize;
+ t->pginfo.flags =
+ F_ISSET(dbp, (DB_AM_CHKSUM | DB_AM_ENCRYPT | DB_AM_SWAP));
+ t->pginfo.type = dbp->type;
+ t->pgcookie.data = &t->pginfo;
+ t->pgcookie.size = sizeof(DB_PGINFO);
+
+ if ((ret = __os_strdup(dbp->env, name, &t->path)) != 0)
+ return (ret);
+ t->dir = t->path;
+ if ((t->name = __db_rpath(t->path)) == NULL) {
+ t->name = t->path;
+ t->dir = PATH_DOT;
+ } else
+ *t->name++ = '\0';
+
+ return (0);
+}
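+
+/*
+ * Editorial example (assumed path): for name "data/q.db", __db_rpath()
+ * finds the last path separator, so t->dir becomes "data" and t->name
+ * becomes "q.db"; for a bare "q.db" there is no separator, so t->dir
+ * falls back to PATH_DOT (".").
+ */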
+
+/*
+ * __qam_metachk --
+ *	Check a queue meta-data page and initialize the DB handle from it.
+ *
+ * PUBLIC: int __qam_metachk __P((DB *, const char *, QMETA *));
+ */
+int
+__qam_metachk(dbp, name, qmeta)
+ DB *dbp;
+ const char *name;
+ QMETA *qmeta;
+{
+ ENV *env;
+ u_int32_t vers;
+ int ret;
+
+ env = dbp->env;
+ ret = 0;
+
+ /*
+ * At this point, all we know is that the magic number is for a Queue.
+ * Check the version, the database may be out of date.
+ */
+ vers = qmeta->dbmeta.version;
+ if (F_ISSET(dbp, DB_AM_SWAP))
+ M_32_SWAP(vers);
+ switch (vers) {
+ case 1:
+ case 2:
+ __db_errx(env, DB_STR_A("1137",
+ "%s: queue version %lu requires a version upgrade",
+ "%s %lu"), name, (u_long)vers);
+ return (DB_OLD_VERSION);
+ case 3:
+ case 4:
+ break;
+ default:
+ __db_errx(env, DB_STR_A("1138",
+ "%s: unsupported qam version: %lu", "%s %lu"),
+ name, (u_long)vers);
+ return (EINVAL);
+ }
+
+ /* Swap the page if we need to. */
+ if (F_ISSET(dbp, DB_AM_SWAP) &&
+ (ret = __qam_mswap(env, (PAGE *)qmeta)) != 0)
+ return (ret);
+
+ /* Check the type. */
+ if (dbp->type != DB_QUEUE && dbp->type != DB_UNKNOWN)
+ return (EINVAL);
+ dbp->type = DB_QUEUE;
+ DB_ILLEGAL_METHOD(dbp, DB_OK_QUEUE);
+
+ /* Set the page size. */
+ dbp->pgsize = qmeta->dbmeta.pagesize;
+
+ /* Copy the file's ID. */
+ memcpy(dbp->fileid, qmeta->dbmeta.uid, DB_FILE_ID_LEN);
+
+ /* Set up AM-specific methods that do not require an open. */
+ dbp->db_am_rename = __qam_rename;
+ dbp->db_am_remove = __qam_remove;
+
+ return (ret);
+}
+
+/*
+ * __qam_init_meta --
+ * Initialize the meta-data for a Queue database.
+ */
+static int
+__qam_init_meta(dbp, meta)
+ DB *dbp;
+ QMETA *meta;
+{
+ ENV *env;
+ QUEUE *t;
+
+ env = dbp->env;
+ t = dbp->q_internal;
+
+ memset(meta, 0, sizeof(QMETA));
+ LSN_NOT_LOGGED(meta->dbmeta.lsn);
+ meta->dbmeta.pgno = PGNO_BASE_MD;
+ meta->dbmeta.last_pgno = 0;
+ meta->dbmeta.magic = DB_QAMMAGIC;
+ meta->dbmeta.version = DB_QAMVERSION;
+ meta->dbmeta.pagesize = dbp->pgsize;
+ if (F_ISSET(dbp, DB_AM_CHKSUM))
+ FLD_SET(meta->dbmeta.metaflags, DBMETA_CHKSUM);
+ if (F_ISSET(dbp, DB_AM_ENCRYPT)) {
+ meta->dbmeta.encrypt_alg = env->crypto_handle->alg;
+ DB_ASSERT(env, meta->dbmeta.encrypt_alg != 0);
+ meta->crypto_magic = meta->dbmeta.magic;
+ }
+ meta->dbmeta.type = P_QAMMETA;
+ meta->re_pad = (u_int32_t)t->re_pad;
+ meta->re_len = t->re_len;
+ meta->rec_page = CALC_QAM_RECNO_PER_PAGE(dbp);
+ meta->cur_recno = 1;
+ meta->first_recno = 1;
+ meta->page_ext = t->page_ext;
+ t->rec_page = meta->rec_page;
+ memcpy(meta->dbmeta.uid, dbp->fileid, DB_FILE_ID_LEN);
+
+ /* Verify that we can fit at least one record per page. */
+ if (QAM_RECNO_PER_PAGE(dbp) < 1) {
+ __db_errx(env, DB_STR_A("1139",
+ "Record size of %lu too large for page size of %lu",
+ "%lu %lu"), (u_long)t->re_len, (u_long)dbp->pgsize);
+ return (EINVAL);
+ }
+
+ return (0);
+}
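+
+/*
+ * Editorial note on the records-per-page check (approximate arithmetic,
+ * assuming a few dozen bytes of page header and 4-byte alignment of the
+ * per-record QAMDATA header): a 4KB page with re_len 100 holds roughly
+ * (4096 - 32) / 104, about 39 records, so the check only fails when
+ * re_len approaches the page size itself.
+ */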
+
+/*
+ * __qam_new_file --
+ * Create the necessary pages to begin a new queue database file.
+ *
+ * PUBLIC: int __qam_new_file __P((DB *,
+ * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DB_FH *, const char *));
+ */
+int
+__qam_new_file(dbp, ip, txn, fhp, name)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ DB_FH *fhp;
+ const char *name;
+{
+ DBT pdbt;
+ DB_MPOOLFILE *mpf;
+ DB_PGINFO pginfo;
+ ENV *env;
+ QMETA *meta;
+ db_pgno_t pgno;
+ int ret, t_ret;
+
+ /*
+ * Build meta-data page.
+ *
+ * This code appears more complex than it is because of the two cases
+ * (named and unnamed).
+ *
+ * For each page being created, there are three parts: 1) a "get page"
+ * chunk (which either uses malloc'd memory or calls __memp_fget), 2)
+ * the initialization, and 3) the "put page" chunk which either does a
+ * fop write or an __memp_fput.
+ */
+ if (F_ISSET(dbp, DB_AM_INMEM)) {
+ mpf = dbp->mpf;
+ pgno = PGNO_BASE_MD;
+ if ((ret = __memp_fget(mpf, &pgno, ip, txn,
+ DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &meta)) != 0)
+ return (ret);
+
+ if ((ret = __qam_init_meta(dbp, meta)) != 0)
+ goto err1;
+
+ if ((ret = __db_log_page(dbp,
+ txn, &meta->dbmeta.lsn, pgno, (PAGE *)meta)) != 0)
+ goto err1;
+err1: if ((t_ret =
+ __memp_fput(mpf, ip, meta, dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ } else {
+ env = dbp->env;
+ if ((ret = __os_calloc(env, 1, dbp->pgsize, &meta)) != 0)
+ return (ret);
+
+ if ((ret = __qam_init_meta(dbp, meta)) != 0)
+ goto err2;
+
+ pginfo.db_pagesize = dbp->pgsize;
+ pginfo.flags =
+ F_ISSET(dbp, (DB_AM_CHKSUM | DB_AM_ENCRYPT | DB_AM_SWAP));
+ pginfo.type = DB_QUEUE;
+ DB_SET_DBT(pdbt, &pginfo, sizeof(pginfo));
+ if ((ret =
+ __db_pgout(env->dbenv, PGNO_BASE_MD, meta, &pdbt)) != 0)
+ goto err2;
+ ret = __fop_write(env, txn, name, dbp->dirname,
+ DB_APP_DATA, fhp, dbp->pgsize, 0, 0, meta, dbp->pgsize, 1,
+ F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0);
+
+err2: __os_free(env, meta);
+ }
+
+ return (ret);
+}
diff --git a/src/qam/qam_rec.c b/src/qam/qam_rec.c
new file mode 100644
index 00000000..c9ff6c83
--- /dev/null
+++ b/src/qam/qam_rec.c
@@ -0,0 +1,687 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/qam.h"
+#include "dbinc/txn.h"
+
+static int __qam_adjust_first __P((DB *, DBC *, QMETA *, db_recno_t));
+
+/*
+ * LSNs in queue data pages are advisory. They do not have to be accurate
+ * as all operations are idempotent on records. They should not be rolled
+ * forward during recovery as a committed transaction may obscure updates from
+ * an incomplete transaction that updates the same page. The incomplete
+ * transaction may be completed during a later hot backup cycle.
+ */
+
+/* Queue version of REC_DIRTY -- needs to probe the correct file. */
+#define QAM_DIRTY(dbc, pgno, pagep) \
+ if ((ret = __qam_dirty((dbc), \
+ pgno, pagep, (dbc)->priority)) != 0) { \
+ ret = __db_pgerr((dbc)->dbp, (pgno), ret); \
+ goto out; \
+ }
+
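+/*
+ * __qam_adjust_first --
+ *	Advance meta->first_recno past deleted records, removing any
+ *	fully-consumed extent files along the way.
+ */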
+static int
+__qam_adjust_first(file_dbp, dbc, meta, recno)
+ DB *file_dbp;
+ DBC *dbc;
+ QMETA *meta;
+ db_recno_t recno;
+{
+ QUEUE_CURSOR *cp;
+ u_int32_t rec_ext;
+ int exact, ret;
+
+ ret = 0;
+ if (meta->page_ext == 0)
+ rec_ext = 0;
+ else
+ rec_ext = meta->page_ext * meta->rec_page;
+ cp = (QUEUE_CURSOR *)dbc->internal;
+ if (meta->first_recno == RECNO_OOB)
+ meta->first_recno++;
+ while (meta->first_recno != meta->cur_recno &&
+ !QAM_BEFORE_FIRST(meta, recno)) {
+ if ((ret = __qam_position(dbc,
+ &meta->first_recno, 0, &exact)) != 0)
+ return (ret);
+ if (cp->page != NULL && (ret = __qam_fput(dbc,
+ cp->pgno, cp->page, dbc->priority)) != 0)
+ return (ret);
+
+ if (exact == 1)
+ break;
+ if (cp->page != NULL &&
+ rec_ext != 0 && meta->first_recno % rec_ext == 0)
+ if ((ret =
+ __qam_fremove(file_dbp, cp->pgno)) != 0)
+ return (ret);
+ REC_DIRTY(file_dbp->mpf,
+ dbc->thread_info, dbc->priority, &meta);
+ QAM_INC_RECNO(meta->first_recno);
+ }
+out: return (ret);
+}
+
+/*
+ * __qam_incfirst_recover --
+ * Recovery function for incfirst.
+ *
+ * PUBLIC: int __qam_incfirst_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__qam_incfirst_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __qam_incfirst_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_LSN trunc_lsn;
+ DB_MPOOLFILE *mpf;
+ QMETA *meta;
+ db_pgno_t metapg;
+ int ret;
+
+ COMPQUIET(meta, NULL);
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ REC_PRINT(__qam_incfirst_print);
+ REC_INTRO(__qam_incfirst_read, ip, 0);
+
+ metapg = ((QUEUE *)file_dbp->q_internal)->q_meta;
+
+ /* Allocate our own cursor without DB_RECOVER as we need a locker. */
+ if ((ret = __db_cursor_int(file_dbp, ip, NULL,
+ DB_QUEUE, PGNO_INVALID, 0, NULL, &dbc)) != 0)
+ goto out;
+ F_SET(dbc, DBC_RECOVER);
+
+ if ((ret = __memp_fget(mpf, &metapg, ip, NULL,
+ 0, &meta)) != 0) {
+ if (DB_REDO(op)) {
+ if ((ret = __memp_fget(mpf, &metapg, ip, NULL,
+ DB_MPOOL_CREATE, &meta)) != 0)
+ goto out;
+ meta->dbmeta.pgno = metapg;
+ meta->dbmeta.type = P_QAMMETA;
+ } else {
+ *lsnp = argp->prev_lsn;
+ goto out;
+ }
+ }
+
+ /*
+ * Only move first_recno backwards so we pick up the aborted delete.
+ * When going forward we need to be careful since
+ * we may have bumped over a locked record.
+ */
+ if (DB_UNDO(op)) {
+ if (QAM_BEFORE_FIRST(meta, argp->recno)) {
+ REC_DIRTY(mpf, ip, dbc->priority, &meta);
+ meta->first_recno = argp->recno;
+ }
+
+ trunc_lsn = ((DB_TXNHEAD *)info)->trunc_lsn;
+ /* if we are truncating, update the LSN */
+ if (!IS_ZERO_LSN(trunc_lsn) &&
+ LOG_COMPARE(&LSN(meta), &trunc_lsn) > 0) {
+ REC_DIRTY(mpf, ip, dbc->priority, &meta);
+ LSN(meta) = trunc_lsn;
+ }
+ } else {
+ if (LOG_COMPARE(&LSN(meta), lsnp) < 0) {
+ REC_DIRTY(mpf, ip, dbc->priority, &meta);
+ LSN(meta) = *lsnp;
+ }
+ if ((ret = __qam_adjust_first(file_dbp,
+ dbc, meta, argp->recno + 1)) != 0)
+ goto err;
+ }
+
+ ret = __memp_fput(mpf, ip, meta, dbc->priority);
+ if (ret != 0)
+ goto out;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+ if (0) {
+err: (void)__memp_fput(mpf, ip, meta, dbc->priority);
+ }
+
+out: REC_CLOSE;
+}
+
+/*
+ * __qam_mvptr_recover --
+ * Recovery function for mvptr.
+ *
+ * PUBLIC: int __qam_mvptr_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__qam_mvptr_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __qam_mvptr_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_LSN trunc_lsn;
+ DB_MPOOLFILE *mpf;
+ QMETA *meta;
+ QUEUE_CURSOR *cp;
+ db_pgno_t metapg;
+ int cmp_n, cmp_p, exact, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ REC_PRINT(__qam_mvptr_print);
+ REC_INTRO(__qam_mvptr_read, ip, 0);
+
+ /* Allocate our own cursor without DB_RECOVER as we need a locker. */
+ if ((ret = __db_cursor_int(file_dbp, ip, NULL,
+ DB_QUEUE, PGNO_INVALID, 0, NULL, &dbc)) != 0)
+ goto out;
+ F_SET(dbc, DBC_RECOVER);
+
+ metapg = ((QUEUE *)file_dbp->q_internal)->q_meta;
+
+ if ((ret = __memp_fget(mpf, &metapg, ip, NULL, 0, &meta)) != 0) {
+ if (DB_REDO(op)) {
+ if ((ret = __memp_fget(mpf, &metapg, ip, NULL,
+ DB_MPOOL_CREATE, &meta)) != 0) {
+ goto out;
+ }
+ meta->dbmeta.pgno = metapg;
+ meta->dbmeta.type = P_QAMMETA;
+ } else {
+ *lsnp = argp->prev_lsn;
+ goto out;
+ }
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(meta));
+ cmp_p = LOG_COMPARE(&LSN(meta), &argp->metalsn);
+
+ /*
+ * Under normal circumstances, we never undo a movement of one of
+ * the pointers. Just move them along regardless of abort/commit.
+ * When going forward we need to verify that this is really where
+ * the pointer belongs. A transaction may roll back and reinsert
+ * a record that was missing at the time of this action.
+ *
+ * If we're undoing a truncate, we need to reset the pointers to
+ * their state before the truncate.
+ */
+ if (DB_UNDO(op)) {
+ if ((argp->opcode & QAM_TRUNCATE) && cmp_n <= 0) {
+ REC_DIRTY(mpf, ip, dbc->priority, &meta);
+ meta->first_recno = argp->old_first;
+ meta->cur_recno = argp->old_cur;
+ LSN(meta) = argp->metalsn;
+ }
+ /* If the page lsn is beyond the truncate point, move it back */
+ trunc_lsn = ((DB_TXNHEAD *)info)->trunc_lsn;
+ if (!IS_ZERO_LSN(trunc_lsn) &&
+ LOG_COMPARE(&trunc_lsn, &LSN(meta)) < 0) {
+ REC_DIRTY(mpf, ip, dbc->priority, &meta);
+ LSN(meta) = argp->metalsn;
+ }
+ } else if (op == DB_TXN_APPLY || cmp_p == 0) {
+ REC_DIRTY(mpf, ip, dbc->priority, &meta);
+ cp = (QUEUE_CURSOR *)dbc->internal;
+ if ((argp->opcode & QAM_SETFIRST) &&
+ meta->first_recno == argp->old_first) {
+ if (argp->old_first > argp->new_first)
+ meta->first_recno = argp->new_first;
+ else {
+ if ((ret = __qam_position(dbc,
+ &meta->first_recno, 0, &exact)) != 0)
+ goto err;
+ if (!exact)
+ meta->first_recno = argp->new_first;
+ if (cp->page != NULL &&
+ (ret = __qam_fput(dbc,
+ cp->pgno, cp->page, dbc->priority)) != 0)
+ goto err;
+ }
+ }
+
+ if ((argp->opcode & QAM_SETCUR) &&
+ meta->cur_recno == argp->old_cur) {
+ if (argp->old_cur < argp->new_cur)
+ meta->cur_recno = argp->new_cur;
+ else {
+ if ((ret = __qam_position(dbc,
+ &meta->cur_recno, 0, &exact)) != 0)
+ goto err;
+ if (!exact)
+ meta->cur_recno = argp->new_cur;
+ if (cp->page != NULL &&
+ (ret = __qam_fput(dbc,
+ cp->pgno, cp->page, dbc->priority)) != 0)
+ goto err;
+ }
+ }
+
+ meta->dbmeta.lsn = *lsnp;
+ }
+
+ if ((ret = __memp_fput(mpf, ip, meta, dbc->priority)) != 0)
+ goto out;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+ if (0) {
+err: (void)__memp_fput(mpf, ip, meta, dbc->priority);
+ }
+
+out: REC_CLOSE;
+}
+
+/*
+ * __qam_del_recover --
+ * Recovery function for del.
+ * Non-extent version or if there is no data (zero len).
+ *
+ * PUBLIC: int __qam_del_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__qam_del_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __qam_del_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ QAMDATA *qp;
+ QMETA *meta;
+ QPAGE *pagep;
+ db_pgno_t metapg;
+ int cmp_n, ret, t_ret;
+
+ COMPQUIET(pagep, NULL);
+ meta = NULL;
+ pagep = NULL;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ REC_PRINT(__qam_del_print);
+ REC_INTRO(__qam_del_read, ip, 0);
+
+ /* Allocate our own cursor without DB_RECOVER as we need a locker. */
+ if ((ret = __db_cursor_int(file_dbp, ip, NULL,
+ DB_QUEUE, PGNO_INVALID, 0, NULL, &dbc)) != 0)
+ goto out;
+ F_SET(dbc, DBC_RECOVER);
+
+ /* Get the meta page before latching the page. */
+ metapg = ((QUEUE *)file_dbp->q_internal)->q_meta;
+ if ((ret = __memp_fget(mpf, &metapg,
+ ip, NULL, DB_MPOOL_EDIT, &meta)) != 0)
+ goto err;
+
+ if ((ret = __qam_fget(dbc, &argp->pgno, DB_MPOOL_CREATE, &pagep)) != 0)
+ goto err;
+
+ if (pagep->pgno == PGNO_INVALID) {
+ QAM_DIRTY(dbc, argp->pgno, &pagep);
+ pagep->pgno = argp->pgno;
+ pagep->type = P_QAMDATA;
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+
+ if (DB_UNDO(op)) {
+ /* make sure first is behind us */
+ if (meta->first_recno == RECNO_OOB ||
+ (QAM_BEFORE_FIRST(meta, argp->recno) &&
+ (meta->first_recno <= meta->cur_recno ||
+ meta->first_recno -
+ argp->recno < argp->recno - meta->cur_recno))) {
+ REC_DIRTY(mpf, ip, dbc->priority, &meta);
+ meta->first_recno = argp->recno;
+ }
+
+ /* Need to undo delete - mark the record as present */
+ QAM_DIRTY(dbc, pagep->pgno, &pagep);
+ qp = QAM_GET_RECORD(file_dbp, pagep, argp->indx);
+ F_SET(qp, QAM_VALID);
+
+ /*
+ * Move the LSN back to this point; do not move it forward.
+ * If we're in an abort, because we don't hold a page lock,
+ * we could foul up a concurrent put. Having too late an
+	 * LSN is harmless in queue except when we're determining
+ * what we need to roll forward during recovery. [#2588]
+ */
+ if (cmp_n <= 0 && op == DB_TXN_BACKWARD_ROLL)
+ LSN(pagep) = argp->lsn;
+
+ if (op == DB_TXN_ABORT)
+ QAM_WAKEUP(dbc, ret);
+
+ } else if (op == DB_TXN_APPLY || (cmp_n > 0 && DB_REDO(op))) {
+ /* Need to redo delete - clear the valid bit */
+ QAM_DIRTY(dbc, pagep->pgno, &pagep);
+ qp = QAM_GET_RECORD(file_dbp, pagep, argp->indx);
+ F_CLR(qp, QAM_VALID);
+
+ /*
+ * We only move the LSN forward during replication.
+ * During recovery we could obscure an update from
+ * a partially completed transaction while processing
+ * a hot backup. [#13823]
+ */
+ if (op == DB_TXN_APPLY)
+ LSN(pagep) = *lsnp;
+ if ((ret = __qam_fput(dbc,
+ argp->pgno, pagep, dbc->priority)) != 0)
+ goto err;
+ pagep = NULL;
+ if ((ret = __qam_adjust_first(file_dbp,
+ dbc, meta, argp->recno)) != 0)
+ goto err;
+ }
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+err: if (pagep != NULL && (t_ret =
+ __qam_fput(dbc, argp->pgno, pagep, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (meta != NULL && (t_ret =
+ __memp_fput(mpf, ip, meta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+out: REC_CLOSE;
+}
+
+/*
+ * __qam_delext_recover --
+ * Recovery function for del in an extent based queue.
+ *
+ * PUBLIC: int __qam_delext_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__qam_delext_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __qam_delext_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ QAMDATA *qp;
+ QMETA *meta;
+ QPAGE *pagep;
+ db_pgno_t metapg;
+ int cmp_n, ret, t_ret;
+
+ COMPQUIET(pagep, NULL);
+ meta = NULL;
+ pagep = NULL;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ REC_PRINT(__qam_delext_print);
+ REC_INTRO(__qam_delext_read, ip, 0);
+
+ /* Allocate our own cursor without DB_RECOVER as we need a locker. */
+ if ((ret = __db_cursor_int(file_dbp, ip, NULL,
+ DB_QUEUE, PGNO_INVALID, 0, NULL, &dbc)) != 0)
+ goto out;
+ F_SET(dbc, DBC_RECOVER);
+
+ metapg = ((QUEUE *)file_dbp->q_internal)->q_meta;
+ if ((ret = __memp_fget(mpf, &metapg, ip, NULL,
+ DB_MPOOL_EDIT, &meta)) != 0)
+ goto err;
+
+ if ((ret = __qam_fget(dbc, &argp->pgno,
+ DB_REDO(op) ? 0 : DB_MPOOL_CREATE, &pagep)) != 0) {
+ /*
+ * If we are redoing a delete and the page is not there
+ * we are done.
+ */
+ if (DB_REDO(op) && (ret == DB_PAGE_NOTFOUND || ret == ENOENT))
+ goto done;
+ goto out;
+ }
+
+ if (pagep->pgno == PGNO_INVALID) {
+ QAM_DIRTY(dbc, argp->pgno, &pagep);
+ pagep->pgno = argp->pgno;
+ pagep->type = P_QAMDATA;
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+
+ if (DB_UNDO(op)) {
+ /* make sure first is behind us */
+ if (meta->first_recno == RECNO_OOB ||
+ (QAM_BEFORE_FIRST(meta, argp->recno) &&
+ (meta->first_recno <= meta->cur_recno ||
+ meta->first_recno -
+ argp->recno < argp->recno - meta->cur_recno))) {
+ meta->first_recno = argp->recno;
+ }
+
+ QAM_DIRTY(dbc, pagep->pgno, &pagep);
+ if ((ret = __qam_pitem(dbc, pagep,
+ argp->indx, argp->recno, &argp->data)) != 0)
+ goto err;
+
+ /*
+ * Move the LSN back to this point; do not move it forward.
+ * If we're in an abort, because we don't hold a page lock,
+ * we could foul up a concurrent put. Having too late an
+ * LSN is harmless in queue except when we're determining
+ * what we need to roll forward during recovery. [#2588]
+ */
+ if (cmp_n <= 0 && op == DB_TXN_BACKWARD_ROLL)
+ LSN(pagep) = argp->lsn;
+
+ if (op == DB_TXN_ABORT)
+ QAM_WAKEUP(dbc, ret);
+
+ } else if (op == DB_TXN_APPLY || (cmp_n > 0 && DB_REDO(op))) {
+ QAM_DIRTY(dbc, pagep->pgno, &pagep);
+ /* Need to redo delete - clear the valid bit */
+ qp = QAM_GET_RECORD(file_dbp, pagep, argp->indx);
+ F_CLR(qp, QAM_VALID);
+ /*
+ * We only move the LSN forward during replication.
+ * During recovery we could obscure an update from
+ * a partially completed transaction while processing
+ * a hot backup. [#13823]
+ */
+ if (op == DB_TXN_APPLY)
+ LSN(pagep) = *lsnp;
+ if ((ret = __qam_fput(dbc,
+ argp->pgno, pagep, dbc->priority)) != 0)
+ goto err;
+ pagep = NULL;
+ if ((ret = __qam_adjust_first(file_dbp,
+ dbc, meta, argp->recno)) != 0)
+ goto err;
+ }
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+err: if (pagep != NULL && (t_ret =
+ __qam_fput(dbc, argp->pgno, pagep, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (meta != NULL && (t_ret =
+ __memp_fput(mpf, ip, meta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+out: REC_CLOSE;
+}
+
+/*
+ * __qam_add_recover --
+ * Recovery function for add.
+ *
+ * PUBLIC: int __qam_add_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__qam_add_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __qam_add_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ QAMDATA *qp;
+ QMETA *meta;
+ QPAGE *pagep;
+ db_pgno_t metapg;
+ int cmp_n, ret;
+
+ COMPQUIET(pagep, NULL);
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ REC_PRINT(__qam_add_print);
+ REC_INTRO(__qam_add_read, ip, 1);
+
+ if ((ret = __qam_fget(dbc, &argp->pgno,
+ DB_UNDO(op) ? 0 : DB_MPOOL_CREATE, &pagep)) != 0) {
+ /*
+ * If we are undoing an append and the page is not there
+ * we are done.
+ */
+ if (DB_UNDO(op) && (ret == DB_PAGE_NOTFOUND || ret == ENOENT))
+ goto done;
+ goto out;
+ }
+
+ if (pagep->pgno == PGNO_INVALID) {
+ QAM_DIRTY(dbc, argp->pgno, &pagep);
+ pagep->pgno = argp->pgno;
+ pagep->type = P_QAMDATA;
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+
+ if (DB_REDO(op)) {
+ /* Fix meta-data page. */
+ metapg = ((QUEUE *)file_dbp->q_internal)->q_meta;
+ if ((ret = __memp_fget(mpf, &metapg, ip, NULL,
+ 0, &meta)) != 0)
+ goto err;
+ if (QAM_BEFORE_FIRST(meta, argp->recno)) {
+ REC_DIRTY(mpf, ip, dbc->priority, &meta);
+ meta->first_recno = argp->recno;
+ }
+ if (argp->recno == meta->cur_recno ||
+ QAM_AFTER_CURRENT(meta, argp->recno)) {
+ REC_DIRTY(mpf, ip, dbc->priority, &meta);
+ meta->cur_recno = argp->recno + 1;
+ }
+ if ((ret = __memp_fput(mpf, ip, meta, dbc->priority)) != 0)
+ goto err;
+
+ /* Now update the actual page if necessary. */
+ if (op == DB_TXN_APPLY || cmp_n > 0) {
+ QAM_DIRTY(dbc, pagep->pgno, &pagep);
+ /* Need to redo add - put the record on page */
+ if ((ret = __qam_pitem(dbc,
+ pagep, argp->indx, argp->recno, &argp->data)) != 0)
+ goto err;
+ /*
+ * We only move the LSN forward during replication.
+ * During recovery we could obscure an update from
+ * a partially completed transaction while processing
+ * a hot backup. [#13823]
+ */
+ if (op == DB_TXN_APPLY) {
+ LSN(pagep) = *lsnp;
+ QAM_WAKEUP(dbc, ret);
+ }
+ }
+ } else if (DB_UNDO(op)) {
+ /*
+		 * Need to undo the add.  If this was an overwrite, put the
+		 * old record back; otherwise just clear the valid bit.
+ */
+ if (argp->olddata.size != 0) {
+ QAM_DIRTY(dbc, pagep->pgno, &pagep);
+ if ((ret = __qam_pitem(dbc, pagep,
+ argp->indx, argp->recno, &argp->olddata)) != 0)
+ goto err;
+
+ if (!(argp->vflag & QAM_VALID)) {
+ qp = QAM_GET_RECORD(
+ file_dbp, pagep, argp->indx);
+ F_CLR(qp, QAM_VALID);
+ }
+ } else {
+ QAM_DIRTY(dbc, pagep->pgno, &pagep);
+ qp = QAM_GET_RECORD(file_dbp, pagep, argp->indx);
+ qp->flags = 0;
+ }
+
+ /*
+ * Move the LSN back to this point; do not move it forward.
+ * If we're in an abort, because we don't hold a page lock,
+ * we could foul up a concurrent put. Having too late an
+ * LSN is harmless in queue except when we're determining
+ * what we need to roll forward during recovery. [#2588]
+ */
+ if (cmp_n <= 0 && op == DB_TXN_BACKWARD_ROLL)
+ LSN(pagep) = argp->lsn;
+ }
+
+ if ((ret = __qam_fput(dbc, argp->pgno, pagep, dbc->priority)) != 0)
+ goto out;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+ if (0) {
+err: (void)__qam_fput(dbc, argp->pgno, pagep, dbc->priority);
+ }
+
+out: REC_CLOSE;
+}
diff --git a/src/qam/qam_stat.c b/src/qam/qam_stat.c
new file mode 100644
index 00000000..15c41bb5
--- /dev/null
+++ b/src/qam/qam_stat.c
@@ -0,0 +1,255 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/qam.h"
+
+#ifdef HAVE_STATISTICS
+/*
+ * __qam_stat --
+ * Gather/print the qam statistics
+ *
+ * PUBLIC: int __qam_stat __P((DBC *, void *, u_int32_t));
+ */
+int
+__qam_stat(dbc, spp, flags)
+ DBC *dbc;
+ void *spp;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_LOCK lock;
+ DB_MPOOLFILE *mpf;
+ DB_QUEUE_STAT *sp;
+ PAGE *h;
+ QAMDATA *qp, *ep;
+ QMETA *meta;
+ QUEUE *t;
+ db_indx_t indx;
+ db_pgno_t first, last, pgno, pg_ext, stop;
+ u_int32_t re_len;
+ int ret, t_ret;
+
+ dbp = dbc->dbp;
+
+ LOCK_INIT(lock);
+ mpf = dbp->mpf;
+ sp = NULL;
+ t = dbp->q_internal;
+
+ if (spp == NULL)
+ return (0);
+
+ /* Allocate and clear the structure. */
+ if ((ret = __os_umalloc(dbp->env, sizeof(*sp), &sp)) != 0)
+ goto err;
+ memset(sp, 0, sizeof(*sp));
+
+ re_len = ((QUEUE *)dbp->q_internal)->re_len;
+
+ /* Determine the last page of the database. */
+ if ((ret = __db_lget(dbc, 0, t->q_meta, DB_LOCK_READ, 0, &lock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &t->q_meta,
+ dbc->thread_info, dbc->txn, 0, &meta)) != 0)
+ goto err;
+
+ if (flags == DB_FAST_STAT) {
+ sp->qs_nkeys = meta->dbmeta.key_count;
+ sp->qs_ndata = meta->dbmeta.record_count;
+ goto meta_only;
+ }
+
+ first = QAM_RECNO_PAGE(dbp, meta->first_recno);
+ last = QAM_RECNO_PAGE(dbp, meta->cur_recno);
+
+ ret = __memp_fput(mpf, dbc->thread_info, meta, dbc->priority);
+ if ((t_ret = __LPUT(dbc, lock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err;
+
+ pgno = first;
+ if (first > last)
+ stop = QAM_RECNO_PAGE(dbp, UINT32_MAX);
+ else
+ stop = last;
+
+ /* Dump each page. */
+ pg_ext = ((QUEUE *)dbp->q_internal)->page_ext;
+begin:
+ /* Walk through the pages and count. */
+ for (; pgno <= stop; ++pgno) {
+ if ((ret =
+ __db_lget(dbc, 0, pgno, DB_LOCK_READ, 0, &lock)) != 0)
+ goto err;
+ ret = __qam_fget(dbc, &pgno, 0, &h);
+ if (ret == ENOENT) {
+ pgno += pg_ext - 1;
+ continue;
+ }
+ if (ret == DB_PAGE_NOTFOUND) {
+ if (pg_ext == 0) {
+ if (pgno != stop && first != last)
+ goto err;
+ ret = 0;
+ break;
+ }
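+			/*
+			 * Same extent-skip arithmetic as in __db_prqueue():
+			 * land on the last page of the missing extent so
+			 * the loop increment starts the next one.
+			 */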
+ pgno += (pg_ext - ((pgno - 1) % pg_ext)) - 1;
+ continue;
+ }
+ if (ret != 0)
+ goto err;
+
+ ++sp->qs_pages;
+
+ ep = (QAMDATA *)((u_int8_t *)h + dbp->pgsize - re_len);
+ for (indx = 0, qp = QAM_GET_RECORD(dbp, h, indx);
+ qp <= ep;
+ ++indx, qp = QAM_GET_RECORD(dbp, h, indx)) {
+ if (F_ISSET(qp, QAM_VALID))
+ sp->qs_ndata++;
+ else
+ sp->qs_pgfree += re_len;
+ }
+
+ ret = __qam_fput(dbc, pgno, h, dbc->priority);
+ if ((t_ret = __LPUT(dbc, lock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err;
+ }
+
+ if ((ret = __LPUT(dbc, lock)) != 0)
+ goto err;
+ if (first > last) {
+ pgno = 1;
+ stop = last;
+ first = last;
+ goto begin;
+ }
+
+ /* Get the meta-data page. */
+ if ((ret = __db_lget(dbc,
+ 0, t->q_meta, F_ISSET(dbp, DB_AM_RDONLY) ?
+ DB_LOCK_READ : DB_LOCK_WRITE, 0, &lock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &t->q_meta, dbc->thread_info, dbc->txn,
+ F_ISSET(dbp, DB_AM_RDONLY) ? 0 : DB_MPOOL_DIRTY, &meta)) != 0)
+ goto err;
+
+ if (!F_ISSET(dbp, DB_AM_RDONLY))
+ meta->dbmeta.key_count =
+ meta->dbmeta.record_count = sp->qs_ndata;
+ sp->qs_nkeys = sp->qs_ndata;
+
+meta_only:
+ /* Get the metadata fields. */
+ sp->qs_magic = meta->dbmeta.magic;
+ sp->qs_version = meta->dbmeta.version;
+ sp->qs_metaflags = meta->dbmeta.flags;
+ sp->qs_pagesize = meta->dbmeta.pagesize;
+ sp->qs_extentsize = meta->page_ext;
+ sp->qs_re_len = meta->re_len;
+ sp->qs_re_pad = meta->re_pad;
+ sp->qs_first_recno = meta->first_recno;
+ sp->qs_cur_recno = meta->cur_recno;
+
+ /* Discard the meta-data page. */
+ ret = __memp_fput(mpf, dbc->thread_info, meta, dbc->priority);
+ if ((t_ret = __LPUT(dbc, lock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err;
+
+ *(DB_QUEUE_STAT **)spp = sp;
+
+ if (0) {
+err: if (sp != NULL)
+ __os_ufree(dbp->env, sp);
+ }
+
+ if ((t_ret = __LPUT(dbc, lock)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __qam_stat_print --
+ * Display queue statistics.
+ *
+ * PUBLIC: int __qam_stat_print __P((DBC *, u_int32_t));
+ */
+int
+__qam_stat_print(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_QUEUE_STAT *sp;
+ ENV *env;
+ int ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ if ((ret = __qam_stat(dbc, &sp, LF_ISSET(DB_FAST_STAT))) != 0)
+ return (ret);
+
+ if (LF_ISSET(DB_STAT_ALL)) {
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "Default Queue database information:");
+ }
+ __db_msg(env, "%lx\tQueue magic number", (u_long)sp->qs_magic);
+ __db_msg(env, "%lu\tQueue version number", (u_long)sp->qs_version);
+ __db_dl(env, "Fixed-length record size", (u_long)sp->qs_re_len);
+ __db_msg(env, "%#x\tFixed-length record pad", (int)sp->qs_re_pad);
+ __db_dl(env,
+ "Underlying database page size", (u_long)sp->qs_pagesize);
+ __db_dl(env,
+ "Underlying database extent size", (u_long)sp->qs_extentsize);
+ __db_dl(env,
+ "Number of records in the database", (u_long)sp->qs_nkeys);
+ __db_dl(env,
+ "Number of data items in the database", (u_long)sp->qs_ndata);
+ __db_dl(env, "Number of database pages", (u_long)sp->qs_pages);
+ __db_dl_pct(env,
+ "Number of bytes free in database pages",
+ (u_long)sp->qs_pgfree,
+ DB_PCT_PG(sp->qs_pgfree, sp->qs_pages, sp->qs_pagesize), "ff");
+ __db_msg(env,
+ "%lu\tFirst undeleted record", (u_long)sp->qs_first_recno);
+ __db_msg(env,
+ "%lu\tNext available record number", (u_long)sp->qs_cur_recno);
+
+ __os_ufree(env, sp);
+
+ return (0);
+}
+
+#else /* !HAVE_STATISTICS */
+
+int
+__qam_stat(dbc, spp, flags)
+ DBC *dbc;
+ void *spp;
+ u_int32_t flags;
+{
+ COMPQUIET(spp, NULL);
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(dbc->env));
+}
+#endif
diff --git a/src/qam/qam_stub.c b/src/qam/qam_stub.c
new file mode 100644
index 00000000..f5140079
--- /dev/null
+++ b/src/qam/qam_stub.c
@@ -0,0 +1,339 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef HAVE_QUEUE
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/qam.h"
+
+/*
+ * If the library wasn't compiled with the Queue access method, various
+ * routines aren't available. Stub them here, returning an appropriate
+ * error.
+ */
+
+/*
+ * __db_no_queue_am --
+ * Error when a Berkeley DB build doesn't include the access method.
+ *
+ * PUBLIC: int __db_no_queue_am __P((ENV *));
+ */
+int
+__db_no_queue_am(env)
+ ENV *env;
+{
+ __db_errx(env, DB_STR("1145",
+ "library build did not include support for the Queue access method"));
+ return (DB_OPNOTSUP);
+}
+
+int
+__db_prqueue(dbp, flags)
+ DB *dbp;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+ return (__db_no_queue_am(dbp->env));
+}
+
+int
+__qam_31_qammeta(dbp, real_name, buf)
+ DB *dbp;
+ char *real_name;
+ u_int8_t *buf;
+{
+ COMPQUIET(real_name, NULL);
+ COMPQUIET(buf, NULL);
+ return (__db_no_queue_am(dbp->env));
+}
+
+int
+__qam_32_qammeta(dbp, real_name, buf)
+ DB *dbp;
+ char *real_name;
+ u_int8_t *buf;
+{
+ COMPQUIET(real_name, NULL);
+ COMPQUIET(buf, NULL);
+ return (__db_no_queue_am(dbp->env));
+}
+
+int
+__qam_append(dbc, key, data)
+ DBC *dbc;
+ DBT *key, *data;
+{
+ COMPQUIET(key, NULL);
+ COMPQUIET(data, NULL);
+ return (__db_no_queue_am(dbc->env));
+}
+
+int
+__qamc_dup(orig_dbc, new_dbc)
+ DBC *orig_dbc, *new_dbc;
+{
+ COMPQUIET(new_dbc, NULL);
+ return (__db_no_queue_am(orig_dbc->env));
+}
+
+int
+__qamc_init(dbc)
+ DBC *dbc;
+{
+ return (__db_no_queue_am(dbc->env));
+}
+
+int
+__qam_db_close(dbp, flags)
+ DB *dbp;
+ u_int32_t flags;
+{
+ COMPQUIET(dbp, NULL);
+ COMPQUIET(flags, 0);
+ return (0);
+}
+
+int
+__qam_db_create(dbp)
+ DB *dbp;
+{
+ COMPQUIET(dbp, NULL);
+ return (0);
+}
+
+int
+__qam_extent_names(env, name, namelistp)
+ ENV *env;
+ char *name;
+ char ***namelistp;
+{
+ COMPQUIET(name, NULL);
+ COMPQUIET(namelistp, NULL);
+ return (__db_no_queue_am(env));
+}
+
+int
+__qam_gen_filelist(dbp, ip, filelistp)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ QUEUE_FILELIST **filelistp;
+{
+ COMPQUIET(ip, NULL);
+ COMPQUIET(filelistp, NULL);
+ return (__db_no_queue_am(dbp->env));
+}
+
+int
+__qam_init_print(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(dtabp, NULL);
+ return (0);
+}
+
+int
+__qam_init_recover(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(dtabp, NULL);
+ return (0);
+}
+
+int
+__qam_metachk(dbp, name, qmeta)
+ DB *dbp;
+ const char *name;
+ QMETA *qmeta;
+{
+ COMPQUIET(name, NULL);
+ COMPQUIET(qmeta, NULL);
+ return (__db_no_queue_am(dbp->env));
+}
+
+int
+__qam_mswap(env, pg)
+ ENV *env;
+ PAGE *pg;
+{
+ COMPQUIET(pg, NULL);
+ return (__db_no_queue_am(env));
+}
+
+int
+__qam_new_file(dbp, ip, txn, fhp, name)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ DB_FH *fhp;
+ const char *name;
+{
+ COMPQUIET(ip, NULL);
+ COMPQUIET(txn, NULL);
+ COMPQUIET(fhp, NULL);
+ COMPQUIET(name, NULL);
+ return (__db_no_queue_am(dbp->env));
+}
+
+int
+__qam_open(dbp, ip, txn, name, base_pgno, mode, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name;
+ db_pgno_t base_pgno;
+ int mode;
+ u_int32_t flags;
+{
+ COMPQUIET(ip, NULL);
+ COMPQUIET(txn, NULL);
+ COMPQUIET(name, NULL);
+ COMPQUIET(base_pgno, 0);
+ COMPQUIET(mode, 0);
+ COMPQUIET(flags, 0);
+ return (__db_no_queue_am(dbp->env));
+}
+
+int
+__qam_pgin_out(env, pg, pp, cookie)
+ ENV *env;
+ db_pgno_t pg;
+ void *pp;
+ DBT *cookie;
+{
+ COMPQUIET(pg, 0);
+ COMPQUIET(pp, NULL);
+ COMPQUIET(cookie, NULL);
+ return (__db_no_queue_am(env));
+}
+
+int
+__qam_salvage(dbp, vdp, pgno, h, handle, callback, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ db_pgno_t pgno;
+ PAGE *h;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ u_int32_t flags;
+{
+ COMPQUIET(vdp, NULL);
+ COMPQUIET(pgno, 0);
+ COMPQUIET(h, NULL);
+ COMPQUIET(handle, NULL);
+ COMPQUIET(callback, NULL);
+ COMPQUIET(flags, 0);
+ return (__db_no_queue_am(dbp->env));
+}
+
+int
+__qam_set_ext_data(dbp, name)
+ DB *dbp;
+ const char *name;
+{
+ COMPQUIET(name, NULL);
+ return (__db_no_queue_am(dbp->env));
+}
+
+int
+__qam_stat(dbc, spp, flags)
+ DBC *dbc;
+ void *spp;
+ u_int32_t flags;
+{
+ COMPQUIET(spp, NULL);
+ COMPQUIET(flags, 0);
+ return (__db_no_queue_am(dbc->env));
+}
+
+int
+__qam_stat_print(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+ return (__db_no_queue_am(dbc->env));
+}
+
+int
+__qam_sync(dbp)
+ DB *dbp;
+{
+ return (__db_no_queue_am(dbp->env));
+}
+
+int
+__qam_truncate(dbc, countp)
+ DBC *dbc;
+ u_int32_t *countp;
+{
+ COMPQUIET(countp, NULL);
+ return (__db_no_queue_am(dbc->env));
+}
+
+int
+__qam_vrfy_data(dbp, vdp, h, pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ QPAGE *h;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ COMPQUIET(vdp, NULL);
+ COMPQUIET(h, NULL);
+ COMPQUIET(pgno, 0);
+ COMPQUIET(flags, 0);
+ return (__db_no_queue_am(dbp->env));
+}
+
+int
+__qam_vrfy_meta(dbp, vdp, meta, pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ QMETA *meta;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ COMPQUIET(vdp, NULL);
+ COMPQUIET(meta, NULL);
+ COMPQUIET(pgno, 0);
+ COMPQUIET(flags, 0);
+ return (__db_no_queue_am(dbp->env));
+}
+
+int
+__qam_vrfy_structure(dbp, vdp, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ u_int32_t flags;
+{
+ COMPQUIET(vdp, NULL);
+ COMPQUIET(flags, 0);
+ return (__db_no_queue_am(dbp->env));
+}
+
+int
+__qam_vrfy_walkqueue(dbp, vdp, handle, callback, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ u_int32_t flags;
+{
+ COMPQUIET(vdp, NULL);
+ COMPQUIET(handle, NULL);
+ COMPQUIET(callback, NULL);
+ COMPQUIET(flags, 0);
+ return (__db_no_queue_am(dbp->env));
+}
+#endif /* !HAVE_QUEUE */
diff --git a/src/qam/qam_upgrade.c b/src/qam/qam_upgrade.c
new file mode 100644
index 00000000..ac96c889
--- /dev/null
+++ b/src/qam/qam_upgrade.c
@@ -0,0 +1,101 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_upgrade.h"
+#include "dbinc/db_page.h"
+#include "dbinc/qam.h"
+
+/*
+ * __qam_31_qammeta --
+ * Upgrade the database from version 1 to version 2.
+ *
+ * PUBLIC: int __qam_31_qammeta __P((DB *, char *, u_int8_t *));
+ */
+int
+__qam_31_qammeta(dbp, real_name, buf)
+ DB *dbp;
+ char *real_name;
+ u_int8_t *buf;
+{
+ QMETA30 *oldmeta;
+ QMETA31 *newmeta;
+
+ COMPQUIET(dbp, NULL);
+ COMPQUIET(real_name, NULL);
+
+ newmeta = (QMETA31 *)buf;
+ oldmeta = (QMETA30 *)buf;
+
+ /*
+ * Copy the fields to their new locations.
+	 * They may overlap, so start at the bottom and use memmove().
+ */
+ newmeta->rec_page = oldmeta->rec_page;
+ newmeta->re_pad = oldmeta->re_pad;
+ newmeta->re_len = oldmeta->re_len;
+ newmeta->cur_recno = oldmeta->cur_recno;
+ newmeta->first_recno = oldmeta->first_recno;
+ newmeta->start = oldmeta->start;
+ memmove(newmeta->dbmeta.uid,
+ oldmeta->dbmeta.uid, sizeof(oldmeta->dbmeta.uid));
+ newmeta->dbmeta.flags = oldmeta->dbmeta.flags;
+ newmeta->dbmeta.record_count = 0;
+ newmeta->dbmeta.key_count = 0;
+ ZERO_LSN(newmeta->dbmeta.unused3);
+
+ /* Update the version. */
+ newmeta->dbmeta.version = 2;
+
+ return (0);
+}
+
+/*
+ * __qam_32_qammeta --
+ * Upgrade the database from version 2 to version 3.
+ *
+ * PUBLIC: int __qam_32_qammeta __P((DB *, char *, u_int8_t *));
+ */
+int
+__qam_32_qammeta(dbp, real_name, buf)
+ DB *dbp;
+ char *real_name;
+ u_int8_t *buf;
+{
+ QMETA31 *oldmeta;
+ QMETA32 *newmeta;
+
+ COMPQUIET(dbp, NULL);
+ COMPQUIET(real_name, NULL);
+
+ newmeta = (QMETA32 *)buf;
+ oldmeta = (QMETA31 *)buf;
+
+ /*
+ * Copy the fields to their new locations.
+	 * We are dropping the first field, so move from the top.
+ */
+ newmeta->first_recno = oldmeta->first_recno;
+ newmeta->cur_recno = oldmeta->cur_recno;
+ newmeta->re_len = oldmeta->re_len;
+ newmeta->re_pad = oldmeta->re_pad;
+ newmeta->rec_page = oldmeta->rec_page;
+ newmeta->page_ext = 0;
+ /* cur_recno now points to the first free slot. */
+ newmeta->cur_recno++;
+ if (newmeta->first_recno == 0)
+ newmeta->first_recno = 1;
+
+ /* Update the version. */
+ newmeta->dbmeta.version = 3;
+
+ return (0);
+}
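+
+/*
+ * Editorial example (follows from the adjustments above): a version-2
+ * queue whose last record was 10 and whose first_recno was 0 upgrades to
+ * cur_recno 11 (the first free slot) and first_recno 1, matching the
+ * version-3 convention that record numbers start at 1.
+ */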
diff --git a/src/qam/qam_verify.c b/src/qam/qam_verify.c
new file mode 100644
index 00000000..af5ab5db
--- /dev/null
+++ b/src/qam/qam_verify.c
@@ -0,0 +1,653 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_verify.h"
+#include "dbinc/db_am.h"
+#include "dbinc/mp.h"
+#include "dbinc/qam.h"
+/*
+ * __qam_vrfy_meta --
+ * Verify the queue-specific part of a metadata page.
+ *
+ * PUBLIC: int __qam_vrfy_meta __P((DB *, VRFY_DBINFO *, QMETA *,
+ * PUBLIC: db_pgno_t, u_int32_t));
+ */
+int
+__qam_vrfy_meta(dbp, vdp, meta, pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ QMETA *meta;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ ENV *env;
+ QUEUE *qp;
+ VRFY_PAGEINFO *pip;
+ db_pgno_t *extents, extid, first, last;
+ size_t len;
+ int count, i, isbad, nextents, ret, t_ret;
+ char *buf, **names;
+
+ COMPQUIET(count, 0);
+
+ env = dbp->env;
+ qp = (QUEUE *)dbp->q_internal;
+ extents = NULL;
+ first = last = 0;
+ isbad = 0;
+ buf = NULL;
+ names = NULL;
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+
+ /*
+ * Queue can't be used in subdatabases, so if this isn't set
+ * something very odd is going on.
+ */
+ if (!F_ISSET(pip, VRFY_INCOMPLETE))
+ EPRINT((env, DB_STR_A("1146",
+ "Page %lu: queue databases must be one-per-file",
+ "%lu"), (u_long)pgno));
+
+ /*
+ * We have already checked the common fields in __db_vrfy_pagezero.
+	 * However, we used the on-disk metadata page, which may have been stale.
+ * We now have the page from mpool, so check that.
+ */
+ if ((ret = __db_vrfy_meta(dbp, vdp, &meta->dbmeta, pgno, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+
+ /*
+ * Because the metapage pointers are rolled forward by
+ * aborting transactions, the extent of the queue may
+ * extend beyond the allocated pages, so we do
+ * not check that meta_current is within the allocated
+ * pages.
+ */
+
+ /*
+ * re_len: If this is bad, we can't safely verify queue data pages, so
+ * return DB_VERIFY_FATAL
+ */
+ if (DB_ALIGN(meta->re_len + sizeof(QAMDATA) - 1, sizeof(u_int32_t)) *
+ meta->rec_page + QPAGE_SZ(dbp) > dbp->pgsize) {
+ EPRINT((env, DB_STR_A("1147",
+ "Page %lu: queue record length %lu too high for page size and recs/page",
+ "%lu %lu"), (u_long)pgno, (u_long)meta->re_len));
+ ret = DB_VERIFY_FATAL;
+ goto err;
+ } else {
+ /*
+ * We initialize the Queue internal pointer; we may need
+ * it when handling extents. It would get set up in open,
+ * if we called open normally, but we don't.
+ */
+ vdp->re_pad = meta->re_pad;
+ qp->re_pad = (int)meta->re_pad;
+ qp->re_len = vdp->re_len = meta->re_len;
+ qp->rec_page = vdp->rec_page = meta->rec_page;
+ qp->page_ext = vdp->page_ext = meta->page_ext;
+ }
+
+ /*
+ * There's no formal maximum extentsize, and a 0 value represents
+ * no extents, so there's nothing to verify.
+ *
+ * Note that since QUEUE databases can't have subdatabases, it's an
+ * error to see more than one QUEUE metadata page in a single
+ * verifier run. Theoretically, this should really be a structure
+ * rather than a per-page check, but since we're setting qp fields
+ * here (and have only one qp to set) we raise the alarm now if
+ * this assumption fails. (We need the qp info to be reasonable
+ * before we do per-page verification of queue extents.)
+ */
+ if (F_ISSET(vdp, VRFY_QMETA_SET)) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1148",
+ "Page %lu: database contains multiple Queue metadata pages",
+ "%lu"), (u_long)pgno));
+ goto err;
+ }
+ F_SET(vdp, VRFY_QMETA_SET);
+ qp->page_ext = meta->page_ext;
+ dbp->pgsize = meta->dbmeta.pagesize;
+ qp->q_meta = pgno;
+ qp->q_root = pgno + 1;
+ vdp->first_recno = meta->first_recno;
+ vdp->last_recno = meta->cur_recno;
+ if (qp->page_ext != 0) {
+ first = QAM_RECNO_EXTENT(dbp, vdp->first_recno);
+ last = QAM_RECNO_EXTENT(dbp, vdp->last_recno);
+ }
+
+ /*
+ * Look in the data directory to see if there are any extents
+ * around that are not in the range of the queue. If so,
+ * then report that and look there if we are salvaging.
+ */
+
+ if ((ret = __db_appname(env,
+ DB_APP_DATA, qp->dir, NULL, &buf)) != 0)
+ goto err;
+ if ((ret = __os_dirlist(env, buf, 0, &names, &count)) != 0)
+ goto err;
+ __os_free(env, buf);
+ buf = NULL;
+
+ /* In-memory dbs cannot have extents. */
+ nextents = 0;
+ if (!F_ISSET(dbp, DB_AM_INMEM)) {
+ len = strlen(QUEUE_EXTENT_HEAD) + strlen(qp->name) + 1;
+ if ((ret = __os_malloc(env, len, &buf)) != 0)
+ goto err;
+ len = (size_t)snprintf(buf, len, QUEUE_EXTENT_HEAD, qp->name);
+ for (i = 0; i < count; i++) {
+ if (strncmp(names[i], buf, len) == 0) {
+ /* Only save extents out of bounds. */
+ extid = (db_pgno_t)strtoul(
+ &names[i][len], NULL, 10);
+ if (qp->page_ext != 0 &&
+ (last > first ?
+ (extid >= first && extid <= last) :
+ (extid >= first || extid <= last)))
+ continue;
+ if (extents == NULL && (ret = __os_malloc(
+ env, (size_t)(count - i) * sizeof(extid),
+ &extents)) != 0)
+ goto err;
+ extents[nextents] = extid;
+ nextents++;
+ }
+ }
+ }
+ if (nextents > 0)
+ __db_errx(env, DB_STR_A("1149",
+ "Warning: %d extra extent files found", "%d"), nextents);
+ vdp->nextents = nextents;
+ vdp->extents = extents;
+
+err: if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+ if (names != NULL)
+ __os_dirfree(env, names, count);
+ if (buf != NULL)
+ __os_free(env, buf);
+ if (ret != 0 && extents != NULL)
+ __os_free(env, extents);
+ if (LF_ISSET(DB_SALVAGE) &&
+ (t_ret = __db_salvage_markdone(vdp, pgno)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret == 0 && isbad == 1 ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __qam_meta2pgset --
+ * For a given Queue meta page, add all of the db's pages to the pgset. Dealing
+ * with extents complicates things, as it is possible for there to be gaps in
+ * the page number sequence (the user could have re-inserted record numbers that
+ * had been on deleted extents) so we test the existence of each extent before
+ * adding its pages to the pgset. If there are no extents, just loop from
+ * first_recno to last_recno.
+ *
+ * PUBLIC: int __qam_meta2pgset __P((DB *, VRFY_DBINFO *, DB *));
+ */
+int
+__qam_meta2pgset(dbp, vdp, pgset)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ DB *pgset;
+{
+ DBC *dbc;
+ PAGE *h;
+ db_pgno_t first, last, pgno, pg_ext, stop;
+ int ret, t_ret;
+ u_int32_t i;
+
+ ret = 0;
+ h = NULL;
+ if (vdp->last_recno <= vdp->first_recno)
+ return (0);
+
+ pg_ext = vdp->page_ext;
+
+ first = QAM_RECNO_PAGE(dbp, vdp->first_recno);
+
+ /*
+ * last_recno gives the next recno to be allocated, we want the last
+ * allocated recno.
+ */
+ last = QAM_RECNO_PAGE(dbp, vdp->last_recno - 1);
+
+ if (first == PGNO_INVALID || last == PGNO_INVALID)
+ return (DB_VERIFY_BAD);
+
+ pgno = first;
+ if (first > last)
+ stop = QAM_RECNO_PAGE(dbp, UINT32_MAX);
+ else
+ stop = last;
+
+ /*
+ * If this db doesn't have extents, just add all page numbers from first
+ * to last.
+ */
+ if (pg_ext == 0) {
+ for (pgno = first; pgno <= stop; pgno++)
+ if ((ret = __db_vrfy_pgset_inc(
+ pgset, vdp->thread_info, vdp->txn, pgno)) != 0)
+ break;
+ if (first > last)
+ for (pgno = 1; pgno <= last; pgno++)
+ if ((ret = __db_vrfy_pgset_inc(pgset,
+ vdp->thread_info, vdp->txn, pgno)) != 0)
+ break;
+
+ return (ret);
+ }
+
+ if ((ret = __db_cursor(dbp, vdp->thread_info, NULL, &dbc, 0)) != 0)
+ return (ret);
+ /*
+ * Check if we can get the first page of each extent. If we can, then
+ * add all of that extent's pages to the pgset. If we can't, assume the
+ * extent doesn't exist and don't add any pages, if we're wrong we'll
+ * find the pages in __db_vrfy_walkpages.
+ */
+begin: for (; pgno <= stop; pgno += pg_ext) {
+ if ((ret = __qam_fget(dbc, &pgno, 0, &h)) != 0) {
+ if (ret == ENOENT || ret == DB_PAGE_NOTFOUND) {
+ ret = 0;
+ continue;
+ }
+ goto err;
+ }
+ if ((ret = __qam_fput(dbc, pgno, h, dbp->priority)) != 0)
+ goto err;
+
+ for (i = 0; i < pg_ext && pgno + i <= last; i++)
+ if ((ret = __db_vrfy_pgset_inc(
+ pgset, vdp->thread_info, vdp->txn, pgno + i)) != 0)
+ goto err;
+
+ /*
+ * The first recno won't always occur on the first page of the
+ * extent. Back up to the beginning of the extent before the
+ * end of the loop so that the increment works correctly.
+ */
+ if (pgno == first)
+ pgno = pgno % pg_ext + 1;
+ }
+
+ if (first > last) {
+ pgno = 1;
+ first = last;
+ stop = last;
+ goto begin;
+ }
+
+err:
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __qam_vrfy_data --
+ * Verify a queue data page.
+ *
+ * PUBLIC: int __qam_vrfy_data __P((DB *, VRFY_DBINFO *, QPAGE *,
+ * PUBLIC: db_pgno_t, u_int32_t));
+ */
+int
+__qam_vrfy_data(dbp, vdp, h, pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ QPAGE *h;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ DB fakedb;
+ struct __queue fakeq;
+ QAMDATA *qp;
+ db_recno_t i;
+
+ /*
+ * Not much to do here, except make sure that flags are reasonable.
+ *
+ * QAM_GET_RECORD assumes a properly initialized q_internal
+ * structure, however, and we don't have one, so we play
+ * some gross games to fake it out.
+ */
+ fakedb.q_internal = &fakeq;
+ fakedb.flags = dbp->flags;
+ fakeq.re_len = vdp->re_len;
+
+ for (i = 0; i < vdp->rec_page; i++) {
+ qp = QAM_GET_RECORD(&fakedb, h, i);
+ if ((u_int8_t *)qp >= (u_int8_t *)h + dbp->pgsize) {
+ EPRINT((dbp->env, DB_STR_A("1150",
+ "Page %lu: queue record %lu extends past end of page",
+ "%lu %lu"), (u_long)pgno, (u_long)i));
+ return (DB_VERIFY_BAD);
+ }
+
+ if (qp->flags & ~(QAM_VALID | QAM_SET)) {
+ EPRINT((dbp->env, DB_STR_A("1151",
+ "Page %lu: queue record %lu has bad flags (%#lx)",
+ "%lu %lu %#lx"), (u_long)pgno, (u_long)i,
+ (u_long)qp->flags));
+ return (DB_VERIFY_BAD);
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * __qam_vrfy_structure --
+ * Verify a queue database structure, such as it is.
+ *
+ * PUBLIC: int __qam_vrfy_structure __P((DB *, VRFY_DBINFO *, u_int32_t));
+ */
+int
+__qam_vrfy_structure(dbp, vdp, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ u_int32_t flags;
+{
+ VRFY_PAGEINFO *pip;
+ db_pgno_t i;
+ int ret, isbad;
+
+ isbad = 0;
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, PGNO_BASE_MD, &pip)) != 0)
+ return (ret);
+
+ if (pip->type != P_QAMMETA) {
+ EPRINT((dbp->env, DB_STR_A("1152",
+ "Page %lu: queue database has no meta page", "%lu"),
+ (u_long)PGNO_BASE_MD));
+ isbad = 1;
+ goto err;
+ }
+
+ if ((ret = __db_vrfy_pgset_inc(
+ vdp->pgset, vdp->thread_info, vdp->txn, 0)) != 0)
+ goto err;
+
+ for (i = 1; i <= vdp->last_pgno; i++) {
+ /* Send feedback to the application about our progress. */
+ if (!LF_ISSET(DB_SALVAGE))
+ __db_vrfy_struct_feedback(dbp, vdp);
+
+ if ((ret = __db_vrfy_putpageinfo(dbp->env, vdp, pip)) != 0 ||
+ (ret = __db_vrfy_getpageinfo(vdp, i, &pip)) != 0)
+ return (ret);
+ if (!F_ISSET(pip, VRFY_IS_ALLZEROES) &&
+ pip->type != P_QAMDATA && !F_ISSET(pip, VRFY_NONEXISTENT)) {
+ EPRINT((dbp->env, DB_STR_A("1153",
+ "Page %lu: queue database page of incorrect type %lu",
+ "%lu %lu"), (u_long)i, (u_long)pip->type));
+ isbad = 1;
+ goto err;
+ } else if ((ret = __db_vrfy_pgset_inc(vdp->pgset,
+ vdp->thread_info, vdp->txn, i)) != 0)
+ goto err;
+ }
+
+err: if ((ret = __db_vrfy_putpageinfo(dbp->env, vdp, pip)) != 0)
+ return (ret);
+ return (isbad == 1 ? DB_VERIFY_BAD : 0);
+}
+
+/*
+ * __qam_vrfy_walkqueue --
+ * Do a "walkpages" per-page verification pass over the set of Queue
+ * extent pages.
+ *
+ * PUBLIC: int __qam_vrfy_walkqueue __P((DB *, VRFY_DBINFO *, void *,
+ * PUBLIC: int (*)(void *, const void *), u_int32_t));
+ */
+int
+__qam_vrfy_walkqueue(dbp, vdp, handle, callback, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ u_int32_t flags;
+{
+ DBC *dbc;
+ ENV *env;
+ PAGE *h;
+ QUEUE *qp;
+ VRFY_PAGEINFO *pip;
+ db_pgno_t first, i, last, pg_ext, stop;
+ int isbad, nextents, ret, t_ret;
+
+ COMPQUIET(h, NULL);
+
+ env = dbp->env;
+ qp = dbp->q_internal;
+ pip = NULL;
+ pg_ext = qp->page_ext;
+ isbad = ret = t_ret = 0;
+ h = NULL;
+
+ /* If this database has no extents, we've seen all the pages already. */
+ if (pg_ext == 0)
+ return (0);
+
+ first = QAM_RECNO_PAGE(dbp, vdp->first_recno);
+ last = QAM_RECNO_PAGE(dbp, vdp->last_recno);
+
+ i = first;
+ if (first > last)
+ stop = QAM_RECNO_PAGE(dbp, UINT32_MAX);
+ else
+ stop = last;
+ nextents = vdp->nextents;
+
+ /* Verify/salvage each page. */
+ if ((ret = __db_cursor(dbp, vdp->thread_info, NULL, &dbc, 0)) != 0)
+ return (ret);
+begin: for (; i <= stop; i++) {
+ /*
+ * If DB_SALVAGE is set, we inspect our database of completed
+ * pages, and skip any we've already printed in the subdb pass.
+ */
+ if (LF_ISSET(DB_SALVAGE) && (__db_salvage_isdone(vdp, i) != 0))
+ continue;
+ if ((t_ret = __qam_fget(dbc, &i, 0, &h)) != 0) {
+ if (t_ret == ENOENT || t_ret == DB_PAGE_NOTFOUND) {
+ i += (pg_ext - ((i - 1) % pg_ext)) - 1;
+ continue;
+ }
+
+ /*
+ * If an individual page get fails, keep going iff
+ * we're salvaging.
+ */
+ if (LF_ISSET(DB_SALVAGE)) {
+ if (ret == 0)
+ ret = t_ret;
+ continue;
+ }
+ h = NULL;
+ ret = t_ret;
+ goto err;
+ }
+
+ if (LF_ISSET(DB_SALVAGE)) {
+ /*
+ * We pretty much don't want to quit unless a
+ * bomb hits. May as well return that something
+ * was screwy, however.
+ */
+ if ((t_ret = __db_salvage_pg(dbp,
+ vdp, i, h, handle, callback, flags)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ isbad = 1;
+ }
+ } else {
+ /*
+ * If we are not salvaging, and we get any error
+ * other than DB_VERIFY_BAD, return immediately;
+ * it may not be safe to proceed. If we get
+ * DB_VERIFY_BAD, keep going; listing more errors
+ * may make it easier to diagnose problems and
+ * determine the magnitude of the corruption.
+ */
+ if ((ret = __db_vrfy_common(dbp,
+ vdp, h, i, flags)) == DB_VERIFY_BAD)
+ isbad = 1;
+ else if (ret != 0)
+ goto err;
+
+ __db_vrfy_struct_feedback(dbp, vdp);
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, i, &pip)) != 0)
+ goto err;
+ if (F_ISSET(pip, VRFY_IS_ALLZEROES))
+ goto put;
+ if (pip->type != P_QAMDATA) {
+ EPRINT((env, DB_STR_A("1154",
+ "Page %lu: queue database page of incorrect type %lu",
+ "%lu %lu"), (u_long)i, (u_long)pip->type));
+ isbad = 1;
+ goto err;
+ }
+ if ((ret = __db_vrfy_pgset_inc(vdp->pgset,
+ vdp->thread_info, vdp->txn, i)) != 0)
+ goto err;
+ if ((ret = __qam_vrfy_data(dbp, vdp,
+ (QPAGE *)h, i, flags)) == DB_VERIFY_BAD)
+ isbad = 1;
+ else if (ret != 0)
+ goto err;
+
+put: if ((ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0)
+ goto err1;
+ pip = NULL;
+ }
+
+ /* Again, keep going iff we're salvaging. */
+ if ((t_ret = __qam_fput(dbc, i, h, dbp->priority)) != 0) {
+ if (LF_ISSET(DB_SALVAGE)) {
+ if (ret == 0)
+ ret = t_ret;
+ continue;
+ }
+ ret = t_ret;
+ goto err1;
+ }
+ }
+
+ if (first > last) {
+ i = 1;
+ stop = last;
+ first = last;
+ goto begin;
+ }
+
+ /*
+ * Now check to see if there were any lingering
+ * extents and dump their data.
+ */
+ if (LF_ISSET(DB_SALVAGE) && nextents != 0) {
+ nextents--;
+ i = 1 +
+ vdp->extents[nextents] * vdp->page_ext;
+ stop = i + vdp->page_ext;
+ goto begin;
+ }
+
+ if (0) {
+err: if (h != NULL && (t_ret =
+ __qam_fput(dbc, i, h, dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (pip != NULL && (t_ret =
+ __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+err1: if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ return ((isbad == 1 && ret == 0) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __qam_salvage --
+ * Safely dump out all recnos and data on a queue page.
+ *
+ * PUBLIC: int __qam_salvage __P((DB *, VRFY_DBINFO *, db_pgno_t, PAGE *,
+ * PUBLIC: void *, int (*)(void *, const void *), u_int32_t));
+ */
+int
+__qam_salvage(dbp, vdp, pgno, h, handle, callback, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ db_pgno_t pgno;
+ PAGE *h;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ u_int32_t flags;
+{
+ DBT dbt, key;
+ QAMDATA *qp, *qep;
+ db_recno_t recno;
+ int ret, err_ret, t_ret;
+ u_int32_t pagesize, qlen;
+ u_int32_t i;
+
+ memset(&dbt, 0, sizeof(DBT));
+ memset(&key, 0, sizeof(DBT));
+
+ err_ret = ret = 0;
+
+ pagesize = (u_int32_t)dbp->mpf->mfp->pagesize;
+ qlen = ((QUEUE *)dbp->q_internal)->re_len;
+ dbt.size = qlen;
+ key.data = &recno;
+ key.size = sizeof(recno);
+ recno = (pgno - 1) * QAM_RECNO_PER_PAGE(dbp) + 1;
+ i = 0;
+ qep = (QAMDATA *)((u_int8_t *)h + pagesize - qlen);
+ for (qp = QAM_GET_RECORD(dbp, h, i); qp < qep;
+ recno++, i++, qp = QAM_GET_RECORD(dbp, h, i)) {
+ if (F_ISSET(qp, ~(QAM_VALID|QAM_SET)))
+ continue;
+ if (!F_ISSET(qp, QAM_SET))
+ continue;
+
+ if (!LF_ISSET(DB_AGGRESSIVE) && !F_ISSET(qp, QAM_VALID))
+ continue;
+
+ dbt.data = qp->data;
+ if ((ret = __db_vrfy_prdbt(&key,
+ 0, " ", handle, callback, 1, 0, vdp)) != 0)
+ err_ret = ret;
+
+ if ((ret = __db_vrfy_prdbt(&dbt,
+ 0, " ", handle, callback, 0, 0, vdp)) != 0)
+ err_ret = ret;
+ }
+
+ if ((t_ret = __db_salvage_markdone(vdp, pgno)) != 0)
+ return (t_ret);
+ return ((ret == 0 && err_ret != 0) ? err_ret : ret);
+}
diff --git a/src/rep/mlease.html b/src/rep/mlease.html
new file mode 100644
index 00000000..7d44b465
--- /dev/null
+++ b/src/rep/mlease.html
@@ -0,0 +1,1198 @@
+<!DOCTYPE doctype PUBLIC "-//w3c//dtd html 4.0 transitional//en">
+<!--Copyright (c) 2011, 2012 Oracle and/or its affiliates. All rights reserved.-->
+<html>
+<head>
+ <meta http-equiv="Content-Type"
+ content="text/html; charset=iso-8859-1">
+ <meta name="GENERATOR"
+ content="Mozilla/4.76 [en] (X11; U; FreeBSD 4.3-RELEASE i386) [Netscape]">
+ <title>Master Lease</title>
+</head>
+<body>
+<center>
+<h1>Master Leases for Berkeley DB</h1>
+</center>
+<center><i>Susan LoVerso</i> <br>
+<i>sue@sleepycat.com</i> <br>
+<i>Rev 1.1</i><br>
+<i>2007 Feb 2</i><br>
+</center>
+<p><br>
+</p>
+<h2>What are Master Leases?</h2>
+A master lease is a mechanism whereby clients grant mastership rights
+to a site and that master, by holding those lease rights, can provide a
+guarantee of durability to a replication group for a given period of
+time.&nbsp; By granting a lease to a master,
+a client will not participate in an election to elect a new
+master until that granted master lease has expired.&nbsp; By holding a
+collection of granted leases, a master is able to satisfy
+authoritative read requests from applications.&nbsp; By holding leases, a
+read operation on a master can guarantee several things to the
+application:<br>
+<ol>
+ <li>Authoritative reads: a guarantee that the data being read by the
+application is durable and can never be rolled back.</li>
+ <li>Freshness: a guarantee that the data being read by the
+application <b>at the master</b> is
+not stale.</li>
+ <li>Master viability: a guarantee that a current master with valid
+leases will not encounter a duplicate master situation.<br>
+ </li>
+</ol>
+<h2>Requirements</h2>
+The requirements of DB to support this include:<br>
+<ul>
+ <li>After turning leases on, users can choose whether or not
+individual read operations ignore them.</li>
+ <li>We are providing read authority on the master only.&nbsp; A
+read on a client is equivalent to a read while ignoring leases.</li>
+ <li>We guarantee that data committed on a master <b>that has been
+read by an application on the
+master</b> will not be rolled back.&nbsp; Data read on a client or
+while ignoring leases <i>or data
+successfully updated/committed but not read,</i>
+may be rolled back.<br>
+ </li>
+ <li>A master will not return successfully from a read operation
+unless it holds a
+majority of leases, or unless the read ignores leases.</li>
+ <li>Master leases will remove the possibility of a current/correct
+master being "shot down" by DUPMASTER.&nbsp; <b>NOTE: Old/Expired
+masters may discover a
+later master and return DUPMASTER to the application however.</b><br>
+ </li>
+ <li>Any send callback failure must result in premature lease
+expiration on the master.<br>
+ </li>
+ <li>Users who change the system clock while master leases are in use
+void the guarantee and may get undefined behavior.&nbsp; We assume time
+always runs forward. <br>
+ </li>
+ <li>Clients are forbidden from participating in elections while they
+have an outstanding lease granted to another site.</li>
+ <li>Clients are forbidden from accepting a new master while they have
+an outstanding lease granted to another site.</li>
+ <li>Clients are forbidden from upgrading themselves to master while
+they have an outstanding lease granted to another site.</li>
+ <li>When asked for a lease grant explicitly by the master, the client
+cannot grant the lease to the master unless the LSN in the master's
+request has been processed by this client.<br>
+ </li>
+</ul>
+The requirements of the
+application using leases include:<br>
+<ul>
+ <li>Users must implement (Base API users on their own, RepMgr users
+via configuration) a majority (or larger) ACK policy. <br>
+ </li>
+ <li>The application must use the election mechanism to decide a master.
+It may not simply declare a site master.</li>
+ <li>The send callback must return an error if the majority ACK policy
+is not met for PERM records (see the sketch following this list).</li>
+ <li>Users must set the number of sites in the group.</li>
+ <li>Using leases in a replication group is all-or-none.&nbsp;
+Therefore, if a site knows it is using leases, it can assume other
+sites are also.<br>
+ </li>
+ <li>All applications that care about read guarantees must forward or
+perform all reads on the master.&nbsp; Reading on the client means a
+read ignoring leases. </li>
+</ul>
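+<p>A minimal sketch of such a transport callback for a Base API
+application (the <i>broadcast_and_count_acks</i> helper and the
+<i>NSITES</i> constant are hypothetical application code, not DB
+interfaces):</p>
+<pre>static int<br>send_cb(DB_ENV *dbenv, const DBT *control, const DBT *rec,<br>    const DB_LSN *lsnp, int eid, u_int32_t flags)<br>{<br> int nacks;<br><br> /* Application-defined: send to all sites and count PERM acks. */<br> nacks = broadcast_and_count_acks(dbenv, control, rec, lsnp, eid);<br><br> /* Fail the send if a PERM record did not reach a majority. */<br> if ((flags &amp; DB_REP_PERMANENT) &amp;&amp; nacks &lt; NSITES / 2)<br>  return (1); /* Nonzero lets DB expire its leases prematurely. */<br> return (0);<br>}<br></pre>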
+<p>There are some open questions
+remaining.</p>
+<ul>
+ <li>There is one major showstopper issue, see Crashing - Potential
+problem near the end of the document.&nbsp; We need a better solution
+than the one shown there (writing to disk every time a lease is
+granted). Perhaps just documenting that durability means it must be
+flushed to disk before success to avoid that situation?<br>
+ </li>
+ <li>What about db-&gt;join?&nbsp; Users can call join, but the calls
+on the join cursor to get the data would be subject to leases and
+therefore protected.&nbsp; Ok, this is not an open question.</li>
+ <li>What about other read-like operations?&nbsp; Clearly <i>
+DB-&gt;get, DB-&gt;pget, DBC-&gt;get,
+DBC-&gt;pget</i> need lease checks.&nbsp; However, other APIs use
+keys.&nbsp; <i>DB-&gt;key_range</i>
+provides an estimate only so it shouldn't need lease checks. <i>
+DB-&gt;stat</i> provides exact counts
+to <i>bt_nkeys</i> and <i>bt_ndata</i> fields.&nbsp; Are those
+fields considered authoritative that providing those values implies a
+durability guarantee and therefore <i>DB-&gt;stat</i>
+should be subject to lease verification?&nbsp; <i>DBC-&gt;count</i>
+provides a count for
+the number of data items associated with a key.&nbsp; Is this
+authoritative information? This is similar to stat - should it be
+subject to lease verification?<br>
+ </li>
+ <li>Do we require master lease checks on write operations?&nbsp; I
+think lease checks are not needed on write operations.&nbsp; It doesn't
+add correctness and adds a lot of complexity (checking leases in put,
+del, and cursors, then what about rename, remove, etc).<br>
+ </li>
+ <li>Do master leases give an iron-clad guarantee of never rolling
+back a transaction? No, but it should mean that a committed transaction
+can never be <b>read</b> on a master
+unless the lease is valid.&nbsp; A committed transaction on a master
+that has never been presented to the application may get rolled back.<br>
+ </li>
+ <li>Do we need to quarantine or prevent reads on an ex-master until
+sync-up is done?&nbsp; No.&nbsp; A master that is simply downgraded to
+client or crashes and reboots is now a client.&nbsp; Reading from that
+client is the same as saying Ignore Leases.</li>
+ <li>What about adding and removing sites while leases are
+active?&nbsp; This is SR 14778.&nbsp; A consistent <i>nsites</i> value
+is required by master
+leases.&nbsp; &nbsp; It isn't
+clear to me what a master is
+supposed to do if the value of nsites gets smaller while leases are
+active.&nbsp; Perhaps it leaves its larger table intact and simply
+checks for a smaller number of granted leases?<br>
+ </li>
+ <li>Can users turn leases off?&nbsp; No.&nbsp; There is no planned <i>turn
+leases off</i> API.</li>
+ <li>Clock skew will be a percentage.&nbsp; However, the smallest, 1%,
+is probably rather large for clock skew.&nbsp; Percentage was chosen
+for simplicity and similarity to other APIs.&nbsp; What granularity is
+appropriate here?</li>
+</ul>
+<h2>API Changes</h2>
+The API changes that are visible
+to the user are fairly minimal.&nbsp;
+There are a few API calls they need to make to configure master leases
+and then there is the API call to turn them on.&nbsp; There is also a
+new flag to existing APIs to allow read operations to ignore leases and
+return data that
+may be non-durable.<br>
+<h3>Lease Timeout<br>
+</h3>
+There is a new timeout the user
+must configure for leases called <b>DB_REP_LEASE_TIMEOUT</b>.&nbsp;
+This timeout will be new to
+the <i>dbenv-&gt;rep_set_timeout</i> method. The <b>DB_REP_LEASE_TIMEOUT</b>
+has no default and it is required that the user configure a timeout
+before they turn on leases (obviously, this timeout need not be set if
+leases will not be used).&nbsp; That timeout is the amount of time
+the lease is valid on the master and how long it is granted
+on the client.&nbsp; This timeout must be the same
+value on all sites (like log file size).&nbsp; The timeout used when
+refreshing leases is the <b>DB_REP_ACK_TIMEOUT</b>
+for RepMgr applications.&nbsp; For Base API applications, lease
+refreshes will use the same mechanism as <b>PERM</b> messages and they
+should
+have no additional burden.&nbsp; This timeout is used for lease
+refreshment and is the amount of time a reader will wait to refresh
+leases before returning failure to the application from a read
+operation.<br>
+<br>
+This timeout will be both stored
+with its original value, and also
+converted to a <i>db_timespec</i>
+using the <b>DB_TIMEOUT_TO_TIMESPEC</b>
+macro and have the clock skew accounted for and stored in the shared
+rep structure:<br>
+<pre>db_timeout_t lease_timeout;<br>db_timespec lease_duration;<br></pre>
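+As a usage sketch (the five-second value is purely illustrative),
+configuring the lease timeout might look like:<br>
+<pre>/* db_timeout_t values are expressed in microseconds. */<br>if ((ret = dbenv-&gt;rep_set_timeout(dbenv,<br>    DB_REP_LEASE_TIMEOUT, 5000000)) != 0)<br> goto err;<br></pre>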
+NOTE:&nbsp; By sending the lease refresh during DB operations, we are
+forcing/assuming that the operation's process has a replication
+transport function set.&nbsp; That is obviously the case for write
+operations, but would it be a burden for read processes (on a
+master)?&nbsp; I think mostly not, but if we need leases for <i>
+DB-&gt;stat</i> then we need to
+document it as it is certainly possible for an application to have a
+separate or dedicated <i>stat</i>
+application or attempt to use <i>db_stat</i>
+(which will not work if leases must be checked).<br>
+<br>
+Leases should be checked after the local operation so that we don't
+have a window where we check leases first, get
+descheduled, then lose our lease, and then perform the operation.&nbsp;
+Do the operation, then check leases before returning to the user.<br>
+<h3>Using Leases</h3>
+There is a new API that the user must call to tell the system to use
+the lease mechanism.&nbsp; The method must be called before the
+application calls <i>dbenv-&gt;rep_start</i>
+or <i>dbenv-&gt;repmgr_start</i>.
+This new
+method is:<br>
+<br>
+<pre>&nbsp;&nbsp;&nbsp; dbenv-&gt;rep_set_lease(DB_ENV *dbenv, u_int32_t clock_scale_factor, u_int32_t flags)<br>
+</pre>
+The <i>clock_scale_factor</i>
+parameter is interpreted as a percentage, greater than 100 (to transmit
+a floating point number as an integer to the API) that represents the
+maximum skew between any two sites' clocks.&nbsp; That is, a <span
+ style="font-style: italic;">clock_scale_factor</span> of 150 suggests
+that the greatest discrepancy between clocks is that one runs 50%
+faster than the others.&nbsp; Both the
+master and client sides
+compensate for possible clock skew.&nbsp; The master uses the value to
+compensate in case the replica has a slow clock and replicas compensate
+in case they have a fast clock.&nbsp; This scaling factor will need to
+be divided by 100 on all sites to truly represent the percentage for
+adjustments made to time values.<br>
+<br>
+Assume the slowest replica's clock is a factor of <i>clock_scale_factor</i>
+slower than the
+fastest clock.&nbsp; Using that assumption, if the fastest clock goes
+from time t1 to t2 in X
+seconds, the slowest clock does it in (<i>clock_scale_factor</i> / 100)
+* X seconds.<br>
+<br>
+The <i>flags</i> parameter is not
+currently used.<br>
+<br>
+When the <i>dbenv-&gt;rep_set_lease</i>
+method is called, we will set a configuration flag indicating that
+leases are turned on:<br>
+<b>#define REP_C_LEASE &lt;value&gt;</b>.&nbsp;
+We will also record the <b>u_int32_t
+clock_skew</b> value passed in.&nbsp; The <i>rep_set_lease</i> method
+will not allow
+calls after <i>rep_start</i>.&nbsp; If
+multiple calls are made prior to calling <i>rep_start</i> then later
+calls will
+overwrite the earlier clock skew value.&nbsp; <br>
+<br>
+We need a new flag to prevent calling <i>rep_set_lease</i>
+after <i>rep_start</i>.&nbsp; The
+simplest solution would be to reject the call to
+<i>rep_set_lease</i> if <b>REP_F_CLIENT</b>
+or <b>REP_F_MASTER</b> is set.&nbsp;
+However that does not work in the cases where a site cleanly closes its
+environment and then opens without running recovery.&nbsp; The
+replication state will still be set.&nbsp; The prevention will be
+implemented as:<br>
+<pre>#define REP_F_START_CALLED &lt;some bit value&gt;<br></pre>
+In __rep_start, at the end:<br>
+<pre>if (ret == 0) {<br> REP_SYSTEM_LOCK<br> F_SET(rep, REP_F_START_CALLED);<br> REP_SYSTEM_UNLOCK<br>}</pre>
+In <i>__rep_env_refresh</i>, if we
+are the last reference closing the env (we already check for that):<br>
+<pre>F_CLR(rep, REP_F_START_CALLED);</pre>
+In order to avoid run-time floating point operations
+on <i>db_timespec</i> structures,
+when a site is declared as a client or master in <i>rep_start</i> we
+will pre-compute the
+lease duration based on the integer-based clock skew and the
+integer-based lease timeout.&nbsp; A master should set a replica's
+lease expiration to the <b>start time of
+the sent message +
+(lease_timeout / clock_scale_factor)</b> in case the replica has a
+slow clock.&nbsp; Replicas extend their leases to <b>received message
+time + (lease_timeout *
+clock_scale_factor)</b> in case this replica has a fast clock.&nbsp;
+Therefore, the computation will be as follows if the site is becoming a
+master:<br>
+<pre>db_timeout_t tmp;<br>tmp = (db_timeout_t)((double)rep-&gt;lease_timeout / ((double)rep-&gt;clock_skew / (double)100));<br>rep-&gt;lease_duration = DB_TIMEOUT_TO_TIMESPEC(&amp;tmp);<br></pre>
+Similarly, on a client the computation is:<br>
+<pre>tmp = (db_timeout_t)((double)rep-&gt;lease_timeout * ((double)rep-&gt;clock_skew / (double)100));<br></pre>
+When a site changes state, its lease duration will change based on
+whether it is becoming a master or client and it will be recomputed
+from the original values.&nbsp; Note that these computations, coupled
+with the fact that the lease on the master is computed based on the
+master's time that it sent the message means that leases on the master
+are more conservatively computed than on the clients.<br>
+<br>
+The <i>dbenv-&gt;rep_set_lease</i>
+method must be called after <i>dbenv-&gt;open</i>,
+similar to <i>dbenv-&gt;rep_set_config</i>.&nbsp;
+The reason is so that we can check that this is a replication
+environment and we have access to the replication shared memory region.<br>
+<h3>Read Operations<br>
+</h3>
+Authoritative read operations on the master with leases enabled will
+abide by leases by default.&nbsp; We will provide a flag that allows an
+operation on a master to ignore leases.&nbsp; <b>All read operations
+on a client imply
+ignoring leases.</b> If an application wants authoritative reads
+they must forward the read requests to the master and it is the
+application's responsibility to provide the forwarding.
+The consensus was that forcing <span style="font-weight: bold;">DB_IGNORE_LEASE</span>
+on client read operations (with leases enabled, obviously) was too
+heavy handed.&nbsp; Read operations on the client will ignore leases,
+but do no special flag checking.<br>
+<br>
+The flag will be called <b>DB_IGNORE_LEASE</b>
+and it will be a flag that can be OR'd into the DB access method and
+cursor operation values.&nbsp; It will be similar to the <b>DB_READ_UNCOMMITTED</b>
+flag.
+<br>
+The methods that will
+adhere to leases are:<br>
+<ul>
+ <li><i>Db-&gt;get</i></li>
+ <li><i>Db-&gt;pget</i></li>
+ <li><i>Dbc-&gt;get</i></li>
+ <li><i>Dbc-&gt;pget</i></li>
+</ul>
+The code that will check leases for a client reading would look
+something
+like this, if we decide to become heavy-handed:<br>
+<pre>if (IS_REP_CLIENT(dbenv)) {<br> [get to rep structure]<br> if (FLD_ISSET(rep-&gt;config, REP_C_LEASE) &amp;&amp; !LF_ISSET(DB_IGNORE_LEASE)) {<br> db_err("Read operations must ignore leases or go to master");<br> ret = EINVAL;<br> goto err;<br> }<br>}<br></pre>
+On the master, the new code to abide by leases is more complex.&nbsp;
+After the call to perform the operation we will check the lease.&nbsp;
+In that checking code, the master will see if it has a valid
+lease.&nbsp; If so, then all is well.&nbsp; If not, it will try to
+refresh the leases.&nbsp; If that refresh attempt results in leases,
+all is well.&nbsp; If the refresh attempt does not get leases, then the
+master cannot respond to the read as an authority and we return an
+error.&nbsp; The new error is called <b>DB_REP_LEASE_EXPIRED</b>.&nbsp;
+The location of the master lease check is down after the internal call
+to read the data is successful:<br>
+<pre>if (IS_REP_MASTER(dbenv) &amp;&amp; !LF_ISSET(DB_IGNORE_LEASE)) {<br> [get to rep structure]<br> if (FLD_ISSET(rep-&gt;config, REP_C_LEASE) &amp;&amp;<br> (ret = __rep_lease_check(dbenv)) != 0) {<br> /*<br> * We don't hold the lease.<br> */<br> goto err;<br> }<br>}<br></pre>
+See below for the details of <i>__rep_lease_check</i>.<br>
+<br>
+Also note that if leases (or replication) are not configured, then <span
+ style="font-weight: bold;">DB_IGNORE_LEASE</span> is a no-op.&nbsp; It
+is ignored (and won't error) if used when leases are not in
+effect.&nbsp; The reason is so that we can generically set that flag in
+utility programs like <span style="font-style: italic;">db_dump</span>
+that walk the database with a cursor.&nbsp; Note that <span
+ style="font-style: italic;">db_dump</span> is the only utility that
+reads with a cursor.<br>
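+<br>
+A sketch of the two kinds of read from the application's point of view
+(error handling elided; the key and data DBTs are assumed to be set up
+as usual):<br>
+<pre>/* Authoritative read on the master: subject to lease checks. */<br>ret = dbp-&gt;get(dbp, NULL, &amp;key, &amp;data, 0);<br><br>/* Possibly-stale read: skip the lease check explicitly. */<br>ret = dbp-&gt;get(dbp, NULL, &amp;key, &amp;data, DB_IGNORE_LEASE);<br></pre>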
+<h3><b>Nsites
+and Elections</b></h3>
+The call to <i>dbenv-&gt;rep_set_nsites</i>
+must be performed before the call to <i>dbenv-&gt;rep_start</i>
+or <i>dbenv-&gt;repmgr_start</i>.&nbsp;
+This document assumes either that <b>SR
+14778</b> gets resolved, or that the value of <i>nsites</i> is
+immutable.&nbsp; The
+master and all clients need to know how many sites and leases are in
+the group.&nbsp; Clients need to know for elections.&nbsp; The master
+needs to know for the size of the lease table and to know what value a
+majority of the group is. <b>[Until
+14778 is resolved, the master lease work must assume <i>nsites</i> is
+immutable and will
+therefore enforce that this is called before <i>rep_start</i> using
+the same mechanism
+as <i>rep_set_lease</i>.]</b><br>
+<br>
+Elections and leases need to agree on the number of sites in the
+group.&nbsp; Therefore, when leases are in effect on clients, all calls
+to <i>dbenv-&gt;rep_elect</i> must
+set the <i>nsites</i> parameter to
+0.&nbsp; The <i>rep_elect</i> code
+path will return <b>EINVAL</b> if <b>REP_C_LEASE</b> is set and <i>nsites</i>
+is non-0.
+<h2>Lease Management</h2>
+<h3>Message Changes</h3>
+In order for clients to grant leases to the master a new message type
+must be added for that purpose.&nbsp; This will be the <b>REP_LEASE_GRANT</b>
+message.&nbsp;
+Granting leases will be a result of applying a <b>DB_REP_PERMANENT</b>
+record and therefore we
+do not need any additional message in order for a master to request a
+lease grant.&nbsp; The <b>REP_LEASE_GRANT</b>
+message will pass a structure as its message DBT:<br>
+<pre>typedef struct __rep_lease_grant {<br> db_timespec msg_time;<br>#ifdef DIAGNOSTIC<br> db_timespec expire_time;<br>#endif<br>} REP_GRANT_INFO;<br></pre>
+In the <b>REP_LEASE_GRANT</b>
+message, the client is actually giving the master several pieces of
+information.&nbsp; We only need the echoed <i>msg_time</i> in this
+structure because
+everything else is already sent.&nbsp; The client is really sending the
+master:<br>
+<ul>
+ <li>Its EID (parameter to <span style="font-style: italic;">rep_send_message</span>
+and <span style="font-style: italic;">rep_process_message</span>)<br>
+ </li>
+ <li>The PERM LSN this message acknowledged (sent in the control
+message)</li>
+ <li>Unique identifier echoed back to master (<i>msg_time</i> sent in
+message as above)</li>
+</ul>
+On the client, we always maintain the maximum PERM LSN already in <i>lp-&gt;max_perm_lsn</i>.&nbsp;
+<h3>Local State Management</h3>
+Each client must maintain a <i>db_timespec</i>
+timestamp containing the expiration of its granted lease.&nbsp; This
+field will be in the replication shared memory structure:<br>
+<pre>db_timespec grant_expire;<br></pre>
+This timestamp already takes into account the clock skew.&nbsp; All
+new fields must be initialized when the region is created. Whenever we
+grant our master lease and want to send the <b>REP_LEASE_GRANT</b>
+message, this value
+will be updated.&nbsp; It will be used in the following way:
+<pre>db_timespec mytime;<br>DB_LSN perm_lsn;<br>DBT lease_dbt;<br>REP_GRANT_INFO gi;<br><br>timespecclear(&amp;mytime);<br>memset(&amp;lease_dbt, 0, sizeof(lease_dbt));<br>memset(&amp;gi, 0, sizeof(gi));<br>__os_gettime(dbenv, &amp;mytime);<br>timespecadd(&amp;mytime, &amp;rep-&gt;lease_duration);<br>MUTEX_LOCK(rep-&gt;clientdb_mutex);<br>perm_lsn = lp-&gt;max_perm_lsn;<br>MUTEX_UNLOCK(rep-&gt;clientdb_mutex);<br>REP_SYSTEM_LOCK(dbenv);<br>if (timespeccmp(&amp;mytime, &amp;rep-&gt;grant_expire, &gt;))<br> rep-&gt;grant_expire = mytime;<br>gi.msg_time = msg-&gt;msg_time;<br>#ifdef DIAGNOSTIC<br>gi.expire_time = rep-&gt;grant_expire;<br>#endif<br>lease_dbt.data = &amp;gi;<br>lease_dbt.size = sizeof(gi);<br>REP_SYSTEM_UNLOCK(dbenv);<br>__rep_send_message(dbenv, eid, REP_LEASE_GRANT, &amp;perm_lsn, &amp;lease_dbt, 0, 0);<br></pre>
+This updating of the lease grant will occur in the <b>PERM</b> code
+path when we have
+successfully applied the permanent record.<br>
+<h3>Maintaining Leases on the
+Master/Rep_start</h3>
+The master maintains a lease table that it checks when fulfilling a
+read request that is subject to leases.&nbsp; This table is initialized
+when a site calls<i>
+dbenv-&gt;rep_start(DB_MASTER)</i> and the site is undergoing a role
+change (i.e. a master making additional calls to <i>dbenv-&gt;rep_start(DB_MASTER)</i>
+does
+not affect an already existing table).<br>
+<br>
+When a non-master site becomes master, it must do two things related to
+leases on a role change.&nbsp; First, a client cannot upgrade to master
+while it has an outstanding lease granted to another site.&nbsp; If a
+client attempts to do so, an error, <b>EINVAL</b>,
+will be returned.&nbsp; The only way this should happen is if the
+application simply declares a site master, instead of using
+elections.&nbsp; Elections will already wait for leases to expire
+before proceeding. (See below.)
+<br>
+<br>
+Second, once we are proceeding with becoming a master, the site must
+allocate the table it will use to maintain lease information.&nbsp;
+This table will be sized based on <i>nsites</i>
+and it will be an array of the following structure:<br>
+<pre>typedef struct {<br> int eid; /* EID of client site. */<br> db_timespec start_time; /* Unique time ID client echoes back on grants. */<br> db_timespec end_time; /* Master's lease expiration time. */<br> DB_LSN lease_lsn; /* Durable LSN this lease applies to. */<br> u_int32_t flags; /* Unused for now?? */<br>} REP_LEASE_ENTRY;<br></pre>
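+A minimal allocation sketch for that role change (assuming unused slots
+are marked with <b>DB_EID_INVALID</b>; the <i>table</i> name is a
+placeholder):<br>
+<pre>REP_LEASE_ENTRY *table;<br>u_int32_t i;<br><br>if ((ret = __os_calloc(dbenv,<br>    (size_t)rep-&gt;nsites, sizeof(REP_LEASE_ENTRY), &amp;table)) != 0)<br> return (ret);<br>for (i = 0; i &lt; rep-&gt;nsites; i++)<br> table[i].eid = DB_EID_INVALID;<br></pre>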
+<h3>Granting Leases</h3>
+It is the burden of the application to make sure that all sites in the
+group
+are using leases, or none are.&nbsp; Therefore, when a client processes
+a <b>PERM</b>
+log record that arrived from the master, it will grant its lease
+automatically if that record is permanent (i.e. <b>DB_REP_ISPERM</b>
+is being returned),
+and leases are configured.&nbsp; A client will not send a
+lease grant when it is processing log records (even <b>PERM</b>
+ones) it receives from other clients that use client-to-client
+synchronization.&nbsp; The reason is that the master requires a unique
+time-of-msg ID (see below) that the client echoes back in its lease
+grant and it will not have such an ID from another client.<br>
+<br>
+The master stores a time-of-msg ID in each message and the client
+simply echoes it back to the master.&nbsp; In its lease table, the
+master keeps the base
+time-of-msg for each valid lease.&nbsp; When a <b>REP_LEASE_GRANT</b>
+message comes in,
+the master does a number of things:
+<ol>
+ <li>Pulls the echoed timespec from the client message, into <i>msg_time</i>.<br>
+ </li>
+ <li>Finds the entry in its lease table for the client's EID.&nbsp; It
+walks the table searching for the ID.&nbsp; EIDs of <span
+ style="font-weight: bold;">DB_EID_INVALID</span> are
+illegal.&nbsp; Either the master will find the entry, or it will find
+an empty slot in the table (i.e. it is still populating the table with
+leases).</li>
+ <li>If this is a previously unknown site lease, the master
+initializes the entry by filling in the <i>eid</i>, <i>start_time</i>, and
+ <i>lease_lsn</i> fields.&nbsp; The master
+also computes the <i>end_time</i>
+based on the adjusted <i>rep-&gt;lease_duration</i>.</li>
+ <li>If this is a lease from a previously known site, the master must
+perform <i>timespeccmp(&amp;msg_time,
+&amp;table[i].start_time, &gt;)</i> and only update the <i>end_time</i>
+of the lease when this is
+a more recent message.&nbsp; If it is a more recent message, then we
+should update
+the <i>lease_lsn</i> to the LSN in
+the message.</li>
+ <li>Lease durations are computed taking the clock skew into
+account: clients compute them based on the current time and the master
+computes them based on the original sending time.&nbsp; For diagnostic
+purposes only, I also plan to send the client's expiration time.&nbsp; The
+client errs on the side of computing a larger lease expiration time and
+the master errs on the side of computing a smaller duration.&nbsp;
+Since both are taking the clock skew
+into account, the client's ending expiration time should never be
+smaller than
+the master's computed expiration time, or their value for clock skew may
+not be correct.<br>
+</ol>
+Any log records (new or resent) that originate from the master and
+result in <b>DB_REP_ISPERM</b> get an
+ack.<br>
+<br>
+<h3>Refreshing Leases</h3>
+Leases get refreshed when a master receives a <b>REP_LEASE_GRANT</b>
+message from a client. There are three pieces to lease
+refreshment.&nbsp; <br>
+<h4>Lazy Lease Refreshing on Read<br>
+</h4>
+If the master discovers that leases are
+expired during the read operation, it attempts to refresh its
+collection of lease grants.&nbsp; It does this by calling a new
+function <i>__rep_lease_refresh</i>.&nbsp;
+This function is very similar to the already-existing function <i>__rep_flush</i>.&nbsp;
+Basically, to
+refresh the lease, the master simply needs to resend the last PERM
+record to the clients.&nbsp; The requirements state that when the
+application send function returns successfully from sending a PERM
+record, the majority of clients have that PERM LSN durable.&nbsp; We
+will have a new public DB error return called <b>DB_REP_LEASE_EXPIRED</b>
+that will be
+returned back to the caller if the master cannot assert its
+authority.&nbsp; The code will look something like this:<br>
+<pre>/*<br> * Use lp-&gt;max_perm_lsn on the master (currently not used on the master)<br> * to keep track of the last PERM record written through the logging system.<br> * need to initialize lp-&gt;max_perm_lsn in rep_start on role_chg.<br> */<br>call __rep_send_message on the last PERM record the master wrote, with DB_REP_PERMANENT<br>if failure<br> expire leases<br> return lease expired error to caller<br>else /* success */<br> recheck lease table<br> /*<br> * We need to recheck the lease table because the client<br> * lease grant messages may not be processed yet, or got<br> * lost, or racing with the application's ACK messages or<br> * whatever. <br> */<br> if we have a majority of valid leases<br> return success<br> else<br> return lease expired error to caller <br></pre>
+<h4>Ongoing Update Refreshment<br>
+</h4>
+Second is having the master indicate to
+the client it needs to send a lease grant in response to the current
+PERM log message.&nbsp; The problem is
+that acknowledgements must contain a master-supplied message timestamp
+that the client sends back to the master.&nbsp; We need to modify the
+structure of the&nbsp; log record messages when leases are configured
+so
+that when a PERM message is sent, the master sends, and the client
+expects, the message timestamp.&nbsp; There are three fairly
+straightforward and different implementations to consider.<br>
+<ol>
+ <li>Adding the timestamp to the <b>REP_CONTROL</b>
+structure.&nbsp; If this option is chosen, then the code trivially
+sends back the timestamp in the client's reply.&nbsp; There is no
+special processing done by either side with the message contents.&nbsp;
+So, on a PERM log record, the master will send a non-zero
+timestamp.&nbsp; On a normal log record the timestamp will be zero or
+some known invalid value.&nbsp; If the client sees a non-zero
+timestamp, it sends a <b>REP_LEASE_GRANT</b>
+with the <i>lp-&gt;max_perm_lsn</i>
+after applying that log record.&nbsp; If it is zero, then the client
+does nothing different.&nbsp; The advantage is ease of code.&nbsp; The
+disadvantage is that for mixed version systems, the client is now
+dealing with different sized control structures.&nbsp; We would have to
+retain the old control structure so that during a mixed version group
+the (upgraded) clients can use, expect and send old control structures
+to the master.&nbsp; This is unfortunate, so let's consider additional
+implementations that don't require modifying the control structure.<br>
+ </li>
+ <li>Adding a new <b>REPCTL_LEASE</b>
+flag to the list of flags for the control structure, but do not change
+the control structure fields.&nbsp; When a master wants to send a
+message that needs a lease ack, it sets the flag.&nbsp; Additionally,
+instead of simply sending a log record DBT as the <i>rec</i> parameter
+for replication, we
+would send a new structure that had the timestamp first and then the
+record (similar to the bulk transfer buffer).&nbsp; The advantage of
+this is that the control structure does not change.&nbsp; Disadvantages
+include more special-cased code in the normal code path where we have
+to check the flag.&nbsp; If the flag is set we have to extract the
+timestamp value and massage the incoming data to pass on the real log
+record to <i>rep_apply</i>.&nbsp; On
+bulk transfer, we would just add the timestamp into the buffer.&nbsp;
+On normal transfers, it would incur an additional data copy on the
+master side.&nbsp; That is unfortunate.&nbsp; Additionally, if this
+record needs to be stored in the temp db, we need some way to get it
+back again later or <span style="font-style: italic;">rep_apply</span>
+would have to extract the timestamp out when it processed the record
+(either live or from the temp db).<br>
+ </li>
+ <li>Adding a different message type, such as <b>REP_LOG_ACK</b>.&nbsp;
+Similarly to <b>REP_LOG_MORE</b> this message would be a
+special-case version of a log record.&nbsp; We would extract out the
+timestamp and then handle as a normal log record.&nbsp; This
+implementation is rejected because it actually would require three new
+message types: <b>REP_LOG_ACK,
+REP_LOG_ACK_MORE, REP_BULK_LOG_ACK</b>.&nbsp; That is just too ugly
+to contemplate.</li>
+</ol>
+<b>[Slight digression:</b> it occurs
+to me while writing about #2 and #3 above, that our implementation of
+all of the *_MORE messages could really be implemented with a <b>REPCTL_MORE</b>
+flag instead of a
+separate message type.&nbsp; We should clean that up and simplify the
+messages but not part of master leases. Hmm, taking that thought
+process further, we really could get rid of the <b>REP_BULK_*</b>
+messages as well if we
+added a <b>REPCTL_BULK</b>
+flag.&nbsp; I think we should definitely do it for the *_MORE
+messages.&nbsp; I am not sure we should do it for bulk because the
+structure of the incoming data record is vastly different.]<br>
+<br>
+Of these options, I believe that modifying the control structure is the
+best alternative.&nbsp; The handling of the old structure will be very
+isolated to code dealing with old versions and is far less complicated
+than injecting the timestamp into the log record DBT and doing a data
+copy.&nbsp; Actually, I will likely combine #1 and the flag from #2
+above.&nbsp; I will have the <b>REPCTL_LEASE</b>
+flag that indicates a lease grant reply is expected and have the
+timestamp in the control structure.&nbsp;
+Also I will probably add in a spare field or two for future use in the <b>REP_CONTROL</b>
+structure.<br>
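+A hypothetical sketch of the resulting control structure (the field
+names and layout are placeholders, not a final definition):<br>
+<pre>typedef struct __rep_control {<br> u_int32_t rep_version; /* Replication version number. */<br> u_int32_t log_version; /* Log version of message. */<br> DB_LSN lsn; /* Log sequence number. */<br> u_int32_t rectype; /* Message type. */<br> u_int32_t gen; /* Generation number. */<br> db_timespec msg_time; /* Master timestamp, echoed in lease grants. */<br> u_int32_t spare; /* Reserved for future use. */<br> u_int32_t flags; /* log_put flag value, REPCTL_LEASE, etc. */<br>} REP_CONTROL;<br></pre>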
+<h4>Gap processing</h4>
+No matter which implementation we choose for ongoing lease refreshment,
+gap processing must be considered.&nbsp; The code above assumes the
+timestamps will be placed on PERM records only.&nbsp; Normal log
+records will not have a timestamp, a flag, or anything else like
+that.&nbsp; However, any log message can fill a gap on a client, and
+processing that normal log record can then return <b>DB_REP_ISPERM</b>
+because later records
+were also processed.
+<br>
+The current implementation should work fine in that case because when
+we store the message in the client temp db we store both the control
+DBT and the record DBT.&nbsp; Therefore, when a normal record fills a
+gap, the later PERM record, when retrieved will look just like it did
+when it arrived.&nbsp; The client will have access to the LSN, and the
+timestamp, etc.&nbsp; However, it does mean that sending the <b>REP_LEASE_GRANT</b>
+message must take
+place down in <i>__rep_apply</i>
+because that is the only place we have access to the contents of those
+stored records with the timestamps.<br>
+<br>
+There are two logical choices to consider for granting the lease when
+processing an update.&nbsp; As we process (either a live record or one
+read from the temp db after filling a gap) a PERM message, we send the <b>REP_LEASE_GRANT</b>
+message for each
+PERM record we successfully apply.&nbsp; Or, second, we keep track of
+the largest timestamp of all PERM records we've processed and at the
+end of the function after we've applied all records, we send back a
+single lease grant with the <i>max_perm_lsn</i>
+and a new <i>max_lease_timestamp</i>
+value to the master.&nbsp; The first is easier to implement, the second
+results in possibly slightly fewer messages at the expense of more
+bookkeeping on the client.<br>
+<br>
+A third, more complicated option would be to have the message timestamp
+on all records, but grants are only sent on the PERM messages.&nbsp; A
+reason to do this is that the later timestamp of a normal log record
+would be used as the timestamp sent in the reply and the master would
+get a more up to date timestamp value and a longer lease.&nbsp; <br>
+<br>
+If we change the <span style="font-weight: bold;">REP_CONTROL</span>
+structure to include the timestamp, we potentially break or at least
+need to revisit the gap processing algorithm.&nbsp; That code assumes
+that the control and record elements for the same LSN look the same
+each and every time.&nbsp; The code stores the <span
+ style="font-style: italic;">control</span> DBT as the key and the <span
+ style="font-style: italic;">rec</span> DBT as the data.&nbsp; We use a
+specialized compare function to sort based on the LSN in the control
+DBT.&nbsp; With master leases, the same record transmitted by a master
+multiple times or client for the same LSN will be different because the
+timestamp field will not be the same.&nbsp; Therefore, the client will
+end up with duplicate entries in the temp database for the same
+LSN.&nbsp; Both solutions (adding the timestamp to <span
+ style="font-weight: bold;">REP_CONTROL</span> and adding a <span
+ style="font-weight: bold;">REPCTL_LEASE</span> flag) can yield
+duplicate entries.&nbsp; The flag would cause the same record from the
+master and client to be different as well.<br>
+<h4>Handling Incoming Lease Grants<br>
+</h4>
+The third piece of lease management is handling the incoming <b>REP_LEASE_GRANT</b>
+message on the
+master.&nbsp; When this message is received, the master must do the
+following:<br>
+<pre>REP_SYSTEM_LOCK<br>msg_timestamp = cntrl-&gt;timestamp;<br>client_lease = __rep_lease_entry(dbenv, client eid);<br>if (client_lease == NULL) {<br> initial lease for this site, DB_ASSERT there is space in the table<br> add this to the table if there is space<br>} else {<br> compare msg_timestamp with client_lease-&gt;start_time<br> if (msg_timestamp is more recent &amp;&amp; msg_lsn &gt;= lease LSN)<br> update entry in table<br>}<br>REP_SYSTEM_UNLOCK<br></pre>
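+Fleshed out slightly in C (a sketch only; the <i>gi</i>, <i>msg_lsn</i>,
+and <i>eid</i> names are assumed from the message-handling context):<br>
+<pre>REP_SYSTEM_LOCK(dbenv);<br>entry = __rep_lease_entry(dbenv, eid);<br>if (entry != NULL &amp;&amp;<br>    timespeccmp(&amp;gi-&gt;msg_time, &amp;entry-&gt;start_time, &gt;) &amp;&amp;<br>    log_compare(&amp;msg_lsn, &amp;entry-&gt;lease_lsn) &gt;= 0) {<br> entry-&gt;start_time = gi-&gt;msg_time;<br> /* The master computes expiration from its own send time. */<br> entry-&gt;end_time = gi-&gt;msg_time;<br> timespecadd(&amp;entry-&gt;end_time, &amp;rep-&gt;lease_duration);<br> entry-&gt;lease_lsn = msg_lsn;<br>}<br>REP_SYSTEM_UNLOCK(dbenv);<br></pre>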
+<h3>Expiring Leases</h3>
+Leases can expire in two ways.&nbsp; First they can expire naturally
+due to the passage of time.&nbsp; When checking leases, if the current
+time is later than the lease entry's <i>end_time</i>
+then the lease is expired.&nbsp; Second, they can be forced with a
+premature expiration when the application's transport function returns
+an error.&nbsp; In the first case, there is nothing to do, in the
+second case we need to manipulate the <i>end_time</i>
+so that all future lease checks fail.&nbsp; Since the lease <i>start_time</i>
+is guaranteed to not be in the future we will have a function <i>__rep_lease_expire</i>
+that will:<br>
+<pre>REP_SYSTEM_LOCK<br>for each entry in the lease table<br> entry-&gt;end_time = entry-&gt;start_time;<br>REP_SYSTEM_UNLOCK<br></pre>
+Is there a potential race or problem with prematurely expiring
+leases?&nbsp; Consider an application that enforces an ALL
+acknowledgement policy for PERM records in its transport
+callback.&nbsp; There are four clients and three send the PERM ack to
+the application.&nbsp; The callback returns an error to the master DB
+code.&nbsp; The DB code will now prematurely expire its leases.&nbsp;
+However, at approximately the same time the three clients are also
+sending their <span style="font-weight: bold;">REP_LEASE_GRANT</span>
+messages to the master.&nbsp; There is a race between the master
+processing those messages and the thread handling the callback failure
+expiring the table.&nbsp; This is only an issue if the messages arrive
+after the table has been expired.<br>
+<br>
+Let's assume all three clients send their grants after the master
+expires the table.&nbsp; If we accept those grants and then a read
+occurs the read will succeed since the master has a majority of leases
+even though the callback failed earlier.&nbsp; Is that a problem?&nbsp;
+The lease code is using a majority and the application policy is using
+some other value.&nbsp; It feels like this should be okay since
+the data is held by leases on a majority.&nbsp; Should we consider
+having the lease checking threshold be the same as the permanent ack
+policy?&nbsp; That is difficult because Base API users implement
+whatever they want and DB does not know what it is.<br>
+<h3>Checking Leases</h3>
+When a read operation on the master completes, the last thing we need
+to do is verify the master leases.&nbsp; We've already discussed
+refreshing them when they are expired above.&nbsp; We need two things
+for a lease to be valid.&nbsp; It must be within the timeframe of the
+lease grant and the lease must be valid for the last PERM record
+LSN.&nbsp; Here is the logic
+for checking the validity of leases in <i>__rep_lease_check</i>:<br>
+<pre>#define MAX_REFRESH_TRIES 3<br>DB_LSN lease_lsn;<br>REP_LEASE_ENTRY *entry;<br>u_int32_t min_leases, valid_leases;<br>db_timespec cur_time;<br>int ret, tries;<br><br> tries = 0;<br>retry:<br> ret = 0;<br> LOG_SYSTEM_LOCK<br> lease_lsn = lp-&gt;lsn;<br> LOG_SYSTEM_UNLOCK<br> REP_SYSTEM_LOCK<br> min_leases = rep-&gt;nsites / 2;<br> __os_gettime(dbenv, &amp;cur_time);<br> for (entry = head of table, valid_leases = 0; entry != NULL &amp;&amp; valid_leases &lt; min_leases; entry++)<br> if (timespeccmp(&amp;entry-&gt;end_time, &amp;cur_time, &gt;=) &amp;&amp; log_compare(&amp;entry-&gt;lease_lsn, &amp;lease_lsn) == 0)<br> valid_leases++;<br> REP_SYSTEM_UNLOCK<br> if (valid_leases &lt; min_leases) {<br> ret = __rep_lease_refresh(dbenv, ...);<br> /*<br> * If we are successful, we need to recheck the leases because<br> * the lease grant messages may have raced with the PERM<br> * acknowledgement. Give those messages a chance to arrive.<br> */<br> if (ret == 0) {<br> if (tries &lt;= MAX_REFRESH_TRIES) {<br> /*<br> * If we were successful sending, but not successful in racing the<br> * message thread, yield the processor so that message<br> * threads may have a chance to run.<br> */<br> if (tries &gt; 0)<br> /* __os_sleep instead?? */<br> __os_yield();<br> tries++;<br> goto retry;<br> } else<br> ret = DB_REP_LEASE_EXPIRED;<br> }<br> }<br> return (ret);</pre>
+If the master has enough valid leases it returns success.&nbsp; If it
+does not have enough, it attempts to refresh them.&nbsp; This attempt
+may fail if sending the PERM record does not receive sufficient
+acks.&nbsp; If we do receive sufficient acknowledgements we may still
+find that scheduling of message threads means the master hasn't yet
+processed the incoming <b>REP_LEASE_GRANT</b>
+messages.&nbsp; We will retry a couple of times (possibly
+parameterized) if the master discovers that situation.&nbsp; <br>
+<h2>Elections</h2>
+When a client grants a lease to a master, it gives up the right to
+participate in an election until that grant expires.&nbsp; If we are
+the master and <i>dbenv-&gt;rep_elect</i>
+is called, it should return, no matter what, like it does today.&nbsp;
+If we are a client and <i>rep_elect</i>
+is called, special processing takes place when leases are in
+effect.&nbsp; First, the easy case is if the lease granted by this
+client has already expired, then the client goes directly into the
+election as normal.&nbsp; If a valid lease grant is outstanding to a
+master, this site cannot participate in an election until that grant
+expires.&nbsp; We have at least two options when a site calls the <i>dbenv-&gt;rep_elect</i>
+API while
+leases are in effect.<br>
+<ol>
+ <li>The simplest coding solution for DB would be simply to refuse to
+participate in the election if this site has a current lease granted to
+a master.&nbsp; We would detect this situation and return EINVAL.&nbsp;
+This is correct behavior and trivial to implement.&nbsp; The
+disadvantage of this solution is that the application would then be
+responsible for repeatedly attempting an election until the lease grant
+expired.<br>
+ </li>
+ <li>The more satisfying solution is for DB to wait the remaining time
+for the grant.&nbsp; If this client hears from the master during that
+time the election does not take place and the call to <i>rep_elect</i>
+returns with the
+information for the current/old master.</li>
+</ol>
+<h3>Election Code Changes</h3>
+The code changes to support leases in the election code are fairly
+isolated.&nbsp; First, if leases are configured, we must verify that the <i>nsites</i>
+parameter passed to the election call is 0.&nbsp;
+Second, in <i>__rep_elect_init</i>
+we must not overwrite the value of <i>rep-&gt;nsites</i>
+for leases because it is controlled by the <i>dbenv-&gt;rep_set_nsites</i>
+API.&nbsp;
+These changes are small and easy to understand.<br>
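+A minimal sketch of those two checks (assuming a predicate such as
+<i>IS_USING_LEASES()</i> to test the configuration):<br>
+<pre>/* In __rep_elect: with leases, nsites must come from rep_set_nsites. */<br>if (IS_USING_LEASES(dbenv) &amp;&amp; nsites != 0)<br> return (EINVAL);<br>/* In __rep_elect_init: do not overwrite the configured value. */<br>if (!IS_USING_LEASES(dbenv))<br> rep-&gt;nsites = nsites;</pre>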
+<br>
+The more complicated code will be the client code when it has an
+outstanding lease granted.&nbsp; The client will wait for the current
+lease grant to expire before proceeding with the election.&nbsp; The
+client will only do so if it does not hear from the master for the
+remainder of the lease grant time.&nbsp; If the client hears from the
+master, it returns and does not begin participating in the
+election.&nbsp; A new election phase, <b>REP_EPHASE0</b>
+will exist so that the call to <i>__rep_wait</i>
+can detect if a master responds.&nbsp; The client, while waiting for
+the lease grant to expire, will send a <b>REP_MASTER_REQ</b>
+message so that the master will respond with a <b>REP_NEWMASTER</b>
+message and thus,
+allow the client to know the master exists.&nbsp; However, it is also
+desirable that the master's
+reply prompt the client to update its lease
+grant.<br>
+<br>
+Recall that the <b>REP_NEWMASTER</b>
+message does not result in a lease grant from the client.&nbsp; The
+client responds with its lease grant, up to the given LSN, when it
+processes a PERM record that has the <b>REPCTL_LEASE</b>
+flag set in the message.&nbsp; Therefore, we want the
+client's <b>REP_MASTER_REQ</b> to
+yield both the discovery of the existing master and have the master
+refresh its leases.&nbsp; The client will also use the <b>REPCTL_LEASE</b>
+flag in its <b>REP_MASTER_REQ</b> message to the
+master.&nbsp; This flag will serve as the indicator to the master that
+it needs to deal with leases and both send the <b>REP_NEWMASTER</b>
+message and refresh
+the lease.<br>
+The code will work as follows:<br>
+<pre>if (leases_configured &amp;&amp; (my_grant_still_valid || lease_never_granted)) {<br> if (lease_never_granted)<br> wait_time = lease_timeout;<br> else<br> wait_time = grant_expiration - current_time;<br> F_SET(REP_F_EPHASE0);<br> __rep_send_message(..., REP_MASTER_REQ, ... REPCTL_LEASE);<br> ret = __rep_wait(..., REP_F_EPHASE0);<br> if (we found a master)<br> return;<br>} /* if we don't return, fall out and proceed with election */<br></pre>
+On the master side, the code handling the <b>REP_MASTER_REQ</b> will
+do:<br>
+<pre>if (I am master) {<br> ...<br> __rep_send_message(REP_NEWMASTER...);<br> if (F_ISSET(rp, REPCTL_LEASE))<br> __rep_lease_refresh(...);<br>}<br></pre>
+Other minor implementation details are that<i> __rep_elect_done</i>
+must also clear
+the <b>REP_F_EPHASE0</b> flag.&nbsp;
+We also, obviously, need to define <b>REP_F_EPHASE0</b>
+in the list of replication flags.&nbsp; Note that the client's call to <i>__rep_wait</i>
+will return upon
+receiving the <b>REP_NEWMASTER</b>
+message.&nbsp; The client will independently refresh its lease when it
+receives the log record from the master's call to refresh the lease.<br>
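+A one-line sketch of that cleanup in <i>__rep_elect_done</i> (clearing
+the new phase alongside the existing election phase flags):<br>
+<pre>F_CLR(rep, REP_F_EPHASE0 | REP_F_EPHASE1 | REP_F_EPHASE2);</pre>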
+<br>
+Again, similar to what I suggested above, the code could simply assume
+global leases are configured, and instead of having the <b>REPCTL_LEASE</b>
+flag at all, the master
+assumes that it needs to refresh leases because it has them configured,
+not because it is specified in the <b>REP_MASTER_REQ</b>
+message it is processing. Right now I don't think every possible
+<b>REP_MASTER_REQ</b> message should result in a lease grant request.<br>
+<h4>Elections and Quiescent Systems</h4>
+It is possible that a master is slow or the client is close to its
+expiration time, or that the master is quiescent and all leases are
+currently expired, but nothing much is going on anyway, yet some client
+calls <i>__rep_elect</i> at that
+time.&nbsp; In the code above, we will not send the <b>REP_MASTER_REQ</b>
+because the lease is
+not valid.&nbsp; The client will simply proceed directly to sending the
+<b>REP_VOTE1</b> message, throwing all
+other clients into an election.&nbsp; The master is still master and
+should stay that way.&nbsp; Currently in response to a vote message, a
+master will broadcast out a <b>REP_NEWMASTER</b>
+to assert its mastership.&nbsp; That causes the election to
+complete.&nbsp; However, the master may also want to proactively
+refresh its leases.&nbsp; This situation indicates to me that the
+master should choose to refresh leases based on configuration, not a
+flag sent from the client.&nbsp; I believe that anytime the master asserts
+its mastership by sending a <b>REP_NEWMASTER</b>
+message, I need to add code to proactively refresh leases at that
+time.<br>
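+A sketch of that addition at each place the master asserts itself:<br>
+<pre>if (I am master) {<br> __rep_send_message(..., REP_NEWMASTER, ...);<br> /* New: proactively refresh when leases are configured. */<br> if (leases_configured)<br> __rep_lease_refresh(dbenv, ...);<br>}</pre>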
+<h2>Other Implementation Details</h2>
+<h3>Role Changes<br>
+</h3>
+When a site changes its role via a call to <i>rep_start</i> in either
+direction, we
+must take action when leases are configured.&nbsp; There are three
+types of role changes that all need changes to deal with leases (a
+sketch of the checks follows the list):<br>
+<ol>
+ <li><i>A master downgrading to a
+client.</i> When a master downgrades to a client, it can do so
+immediately after it has proactively expired all existing leases it
+holds.&nbsp; This situation is similar to an error from the send
+callback, and it effectively cancels all outstanding leases held on
+this site.&nbsp; Note that if this master expires its leases, it does
+not have any effect on when the clients' lease grants expire on the
+client side.&nbsp; The clients must still wait their full expected
+grant time.<br>
+ </li>
+ <li><i>A client upgrading to master.</i>
+If a client is upgrading to a master but it has an outstanding lease
+granted to another site, the code will return an <b>EINVAL</b>
+error.&nbsp; This situation
+only arises if the application simply declares this site master.&nbsp;
+If a site wins an election then the election itself should have waited
+long enough for the granted lease to expire and this state should not
+arise then.</li>
+ <li><i>A client finding a new master.</i>
+When a client discovers a new and different master via a <b>REP_NEWMASTER</b>
+message, the
+client cannot accept that new master until its current lease grant
+expires.&nbsp; This situation should only occur when a site declares
+itself master without an election and that site's lease grant expires
+before this client's grant expires.&nbsp; However, it is <b>possible</b>
+for this situation to arise
+with elections also.&nbsp; If we have 5 sites holding an election and 4
+of those sites have leases expire at about the same time T, and this
+site's lease expires at time T+N and the election timeout is &lt; N,
+then those 4 sites may hold an election and elect a master without this
+site's participation.&nbsp; A client in this situation must call <i>__rep_wait</i>
+with the time remaining
+on its lease.&nbsp; If the lease is expired after waiting the remaining
+time, then the client can accept this new master.&nbsp; If the lease
+was refreshed during the waiting period then the client does not accept
+this new master and returns.<br>
+ </li>
+</ol>
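+A sketch of the lease checks in <i>rep_start</i> for the first two
+cases (helper names illustrative):<br>
+<pre>if (becoming_master &amp;&amp; my_grant_still_valid)<br> return (EINVAL); /* Case 2: cannot upgrade under a live grant. */<br>if (becoming_client &amp;&amp; was_master)<br> __rep_lease_expire(dbenv, ...); /* Case 1: cancel leases held here. */</pre>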
+<h3>DUPMASTER</h3>
+A duplicate master situation can occur if an old master becomes
+disconnected from the rest of the group, that group elects a new master
+and then the partition is resolved.&nbsp; The requirement for master
+leases is that this situation will not cause the newly elected,
+rightful master to receive the <b>DB_REP_DUPMASTER</b>
+return.&nbsp; It is okay for the old master to get that return
+value.&nbsp; When a dual master situation exists, the following will
+happen:<br>
+<ul>
+ <li><i>On the current master and all
+current clients</i> - If the current master receives an update
+message or other conflicting message from the old master then that
+message will be ignored because the generation number is out of date.</li>
+ <li><i>On the old master</i> - If
+the old master receives an update message from the current master, or
+any other message with a later generation from any site, the new
+generation number will trigger this site to return <b>DB_REP_DUPMASTER</b>.&nbsp;
+However,
+instead of broadcasting out the <b>REP_DUPMASTER</b>
+message to shoot down others as well, this site, if leases are
+configured, will call <i>__rep_lease_check</i>
+and if they are expired, return the error.&nbsp; It should be
+impossible for us to receive a later generation message and still hold
+a majority of master leases.&nbsp; Something is seriously wrong in that
+case, so we will <b>DB_ASSERT</b> that this situation
+cannot happen.&nbsp; A sketch follows the list.<br>
+ </li>
+</ul>
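+A sketch of the old master's handling (assuming <i>__rep_lease_check</i>
+returns <b>DB_REP_LEASE_EXPIRED</b> when too few leases are valid):<br>
+<pre>/* Old master sees a message with a later generation. */<br>if (leases_configured) {<br> ret = __rep_lease_check(dbenv, ...);<br> /* Holding a lease majority here should be impossible. */<br> DB_ASSERT(dbenv, ret == DB_REP_LEASE_EXPIRED);<br> ret = DB_REP_DUPMASTER;<br>} else<br> (void)__rep_send_message(..., REP_DUPMASTER, ...);</pre>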
+<h3>Client to Client Synchronization</h3>
+One question to ask is how lease grants interact with client-to-client
+synchronization. The short answer is that they do not.&nbsp; A client
+that is sending log records to another client cannot request the
+receiving client refresh its lease with the master.&nbsp; That client
+does not have a timestamp it can use for the master and clock skew
+makes it meaningless between machines.&nbsp; Therefore, sites that use
+client-to-client synchronization will likely see more lease refreshment
+during the read path and leases will be refreshed during live updates
+only.&nbsp; Of course, if a client supplies log records that fill a
+gap, and the later log records stored came from the master in a live
+update then the client will respond as per the discussion on Gap
+Processing above.<br>
+<h2>Interaction Matrix</h2>
+If leases are granted (by a client) or held (by a master) what should
+the following APIs and messages do?<br>
+<br>
+Other:<br>
+log_archive: Leases do not affect log_archive.&nbsp; OK.<br>
+dbenv-&gt;close: OK.<br>
+crash during lease grant and restart: <b>Potential
+problem here.&nbsp; See discussion below</b>.<br>
+<br>
+Rep Base API method:<br>
+rep_elect: Already discussed above.&nbsp; Must wait for lease to expire.<br>
+rep_flush: Master only, OK - this will be the basis for refreshing
+leases.<br>
+rep_get_*: Not affected by leases.<br>
+rep_process_message: Generally OK.&nbsp; We'll discuss each message
+below.<br>
+rep_set_config: OK.<br>
+rep_set_limit: OK<br>
+rep_set_nsites: Must be called before <i>rep_start</i>
+and <i>nsites</i> is immutable until
+14778 is resolved.<br>
+rep_set_priority: OK<br>
+rep_set_timeout: OK.&nbsp; Used to set lease timeout.<br>
+rep_set_transport: OK.<br>
+rep_start(MASTER): Role changes are discussed above.&nbsp; Make sure
+duplicate rep_start calls are no-ops for leases.<br>
+rep_start(CLIENT): Role changes are discussed above.&nbsp; Make sure
+duplicate calls are no-ops for leases.<br>
+rep_stat: OK.<br>
+rep_sync: Should not be able to happen.&nbsp; Client cannot accept new
+master with outstanding lease grant.&nbsp; Add DB_ASSERT here.<br>
+<br>
+REP_ALIVE: OK.<br>
+REP_ALIVE_REQ: OK.<br>
+REP_ALL_REQ: OK.<br>
+REP_BULK_LOG: OK.&nbsp; Clients check to send ACK.<br>
+REP_BULK_PAGE: Should never process one with lease granted.&nbsp; Add
+DB_ASSERT.<br>
+REP_DUPMASTER: Should never happen, this is what leases are supposed to
+prevent.&nbsp; See above.<br>
+REP_LOG: OK.&nbsp; Clients check to send ACK.<br>
+REP_LOG_MORE: OK.&nbsp; Clients check to send ACK.<br>
+REP_LOG_REQ: OK.<br>
+REP_MASTER_REQ: OK.<br>
+REP_NEWCLIENT: OK.<br>
+REP_NEWFILE: OK.&nbsp; Clients check to send ACK.<br>
+REP_NEWMASTER: See above.<br>
+REP_NEWSITE: OK.<br>
+REP_PAGE: OK.&nbsp; Should never process one with lease granted.&nbsp;
+Add DB_ASSERT.<br>
+REP_PAGE_FAIL:&nbsp; OK.&nbsp; Should never process one with lease
+granted.&nbsp; Add DB_ASSERT.<br>
+REP_PAGE_MORE:&nbsp; OK.&nbsp; Should never process one with lease
+granted.&nbsp; Add DB_ASSERT.<br>
+REP_PAGE_REQ: OK.<br>
+REP_REREQUEST: OK.<br>
+REP_UPDATE: OK.&nbsp; Should never process one with lease
+granted.&nbsp; Add DB_ASSERT.<br>
+REP_UPDATE_REQ: OK.&nbsp; This is a master-only message.<br>
+REP_VERIFY: OK.&nbsp; Should never process one with lease
+granted.&nbsp; Add DB_ASSERT.<br>
+REP_VERIFY_FAIL: OK.&nbsp; Should never process one with lease
+granted.&nbsp; Add DB_ASSERT.<br>
+REP_VERIFY_REQ: OK.<br>
+REP_VOTE1: OK.&nbsp; See Election discussion above.&nbsp; It is
+possible to receive one with a lease granted.&nbsp; Client cannot send
+one with an outstanding lease however.<br>
+REP_VOTE2: OK.&nbsp; See Election discussion above.&nbsp; It is
+possible to receive one with a lease granted.<br>
+<br>
+If the following method or message processing is in progress and a
+client wants to grant a lease, what should it do?&nbsp; Let's examine
+what this means.&nbsp; The client wanting to grant a lease simply means
+it is responding to the receipt of a <b>REP_LOG</b>
+(or its variants) message and applying a log record.&nbsp; Therefore,
+we need to consider a thread processing a log message racing with these
+other actions.<br>
+<br>
+Other:<br>
+log_archive: OK.&nbsp; <br>
+dbenv-&gt;close: User error.&nbsp; User should not be closing the env
+while other threads are using that handle.&nbsp; Should have no effect
+if a 2nd dbenv handle to same env is closed.<br>
+<br>
+Rep Base API method:<br>
+rep_elect: See Election discussion above.&nbsp; <i>rep_elect</i>
+should wait and may grant
+lease while election is in progress.<br>
+rep_flush: Should not be called on client.<br>
+rep_get_*: OK.<br>
+rep_process_message: Generally OK.&nbsp; See handling each message
+below.<br>
+rep_set_config: OK.<br>
+rep_set_limit: OK.<br>
+rep_set_nsites: Must be called before <i>rep_start</i>
+until 14778 is resolved.<br>
+rep_set_priority: OK.<br>
+rep_set_timeout: OK.<br>
+rep_set_transport: OK.<br>
+rep_start(MASTER): OK, can't happen - already protect racing <i>rep_start</i>
+and <i>rep_process_message</i>.<br>
+rep_start(CLIENT): OK, can't happen - already protect racing <i>rep_start</i>
+and <i>rep_process_message</i>.<br>
+rep_stat: OK.<br>
+rep_sync: Shouldn't happen because client cannot grant leases during
+sync-up.&nbsp; Incoming log message ignored.<br>
+<br>
+REP_ALIVE: OK.<br>
+REP_ALIVE_REQ: OK.<br>
+REP_ALL_REQ: OK.<br>
+REP_BULK_LOG: OK.<br>
+REP_BULK_PAGE: OK.&nbsp; Incoming log message ignored during internal
+init.<br>
+REP_DUPMASTER: Shouldn't happen.&nbsp; See DUPMASTER discussion above.<br>
+REP_LOG: OK.<br>
+REP_LOG_MORE: OK.<br>
+REP_LOG_REQ: OK.<br>
+REP_MASTER_REQ: OK.<br>
+REP_NEWCLIENT: OK.<br>
+REP_NEWFILE: OK.<br>
+REP_NEWMASTER: See above.&nbsp; If a client accepts a new master
+because its lease grant expired, and that master then sends a message
+requesting a lease grant, this client will not process the log record
+if it is in sync-up recovery, but it may do so after the master switch is
+complete and the client no longer needs sync-up recovery.&nbsp; Basically,
+this just uses the existing log record processing/newmaster infrastructure.<br>
+REP_NEWSITE: OK.<br>
+REP_PAGE: OK.&nbsp; Receiving a log record during internal init PAGE
+phase should ignore log record.<br>
+REP_PAGE_FAIL: OK.<br>
+REP_PAGE_MORE: OK.<br>
+REP_PAGE_REQ: OK.<br>
+REP_REREQUEST: OK.<br>
+REP_UPDATE: OK.&nbsp; Receiving a log record during internal init
+should ignore log record.<br>
+REP_UPDATE_REQ: OK - master-only message.<br>
+REP_VERIFY: OK.&nbsp; Receiving a log record during verify phase
+ignores log record.<br>
+REP_VERIFY_FAIL: OK.<br>
+REP_VERIFY_REQ: OK.<br>
+REP_VOTE1: OK.&nbsp; This client is processing someone else's vote when
+the lease request comes in.&nbsp; That is fine.&nbsp; We protect our
+own election and lease interaction in <i>__rep_elect</i>.<br>
+REP_VOTE2: OK.<br>
+<h4>Crashing - Potential Problem<br>
+</h4>
+It appears there is one area where we could have a problem.&nbsp; I
+believe that crashes can cause us to break our guarantees of durability,
+authoritative reads and the inability to elect duplicate masters.&nbsp;
+Consider this scenario:<br>
+<ol>
+ <li>A master and 4 clients are all up and running.</li>
+ <li>The master commits a txn and all 4 clients refresh their lease
+grants at time T.</li>
+ <li>All 4 clients have the txn and log records in the cache.&nbsp;
+None are flushing to disk.</li>
+ <li>All 4 clients have responded to the PERM messages as well as
+refreshed their lease with the master.</li>
+ <li>All 4 clients hit the same application coding error and crash
+(machine/OS stays up).</li>
+ <li>Master authoritatively reads data in txn from step 2.</li>
+ <li>All 4 clients restart the application and run recovery, thus the
+txn from step 2 is lost on all clients because it isn't in any log.<br>
+ </li>
+ <li>A network partition happens and the master is alone on its side.</li>
+ <li>All 4 clients are on the other side and elect a new master.</li>
+ <li>Partition resolves itself and we have duplicate masters, where
+the former master still holds all valid lease grants.<br>
+ </li>
+</ol>
+Therefore, we have broken both guarantees.&nbsp; In step 6 the data is
+really not durable and we've given it to the user.&nbsp; One can argue
+that if this is an issue the application had better be syncing somewhere if
+it really wants durability.&nbsp; However, worse than that is that we
+have a legitimate DUPMASTER situation in step 10 where both masters
+hold valid leases.&nbsp; The reason is that all lease knowledge is in
+the shared memory and that is lost when the app restarts and runs
+recovery.<br>
+<br>
+How can we solve this?&nbsp; The obvious solution is (ugh, yet another)
+durable BDB-owned file with some information in it, such as the current
+lease expiration time so that rebooting after a crash leaves the
+knowledge that the lease was granted.&nbsp; However, writing and
+syncing every lease grant on every client out to disk is far too
+expensive.<br>
+<br>
+A second possible solution is to have clients wait a full lease timeout
+before entering an election the first time. This solution solves the
+DUPMASTER issue, but not the non-authoritative read.&nbsp; This
+solution falls naturally out of elections and leases.&nbsp; If a
+client has never granted a lease, it should be considered as having to
+wait a full lease timeout before entering an election.&nbsp;
+Applications already know that leases impact elections and this does
+not seem so bad as it is only on the first election.<br>
+<br>
+Is it sufficient to document that the authoritative read is only as
+authoritative as the durability guarantees the application makes on the
+sites that indicate it is permanent? Yes, I believe this is sufficient.&nbsp; If
+the application says it is permanent and it really isn't, then the
+application is at fault.&nbsp; Believing the application when it
+indicates with the PERM response that it is permanent avoids the
+authoritative problem.&nbsp; <br>
+<h2>Upgrade/Mixed Versions</h2>
+Clearly leases cannot be used with mixed version sites since masters
+running older releases will not have any knowledge of lease
+support.&nbsp; What considerations are needed in the lease code for
+mixed versions?<br>
+<br>
+First, if the <b>REP_CONTROL</b>
+structure changes, we need to maintain and use an old version of the
+structure for talking to older clients and masters.&nbsp; The
+implementation of this would be similar to the way we manage for old <b>REP_VOTE_INFO</b>
+structures.&nbsp;
+Second, any new messages need translation table entries added.&nbsp;
+Third, if we are assuming global leases then clearly any mixed versions
+cannot have leases configured, and leases cannot be used in mixed
+version groups.&nbsp; Maintaining two versions of the control structure
+is not necessary if we choose a different style of implementation and
+don't change the control structure.<br>
+<br>
+However, then how could an old application both run continuously,
+upgrade to the new release and take advantage of leases without taking
+down the entire application?&nbsp; I believe it is possible for clients
+to be configured for leases but defer to the master regarding
+leases, yet the master code can assume that if it has leases
+configured, all client sites do as well.&nbsp; In several places above
+I suggested that a client could make a choice based on either a new <b>REPCTL_LEASE</b>
+flag or simply having
+leases turned on locally.&nbsp; If we choose to use the flag, then we
+can support leases with mixed versions.&nbsp; The upgraded clients can
+configure leases and they simply will not be granted until the old
+master is upgraded and sends PERM messages with the flag indicating it
+wants a lease grant.&nbsp; The clients, while having leases configured,
+will not grant a lease until told to do so and will simply have an
+expired lease.&nbsp; Then, when the old master finally upgrades, it too can
+configure leases and suddenly all sites are using them.&nbsp; I believe
+this should work just fine and I will need to make sure a client's
+granting of leases is only in response to the master asking for a
+grant.&nbsp; If the master never asks, then the client has them
+configured, but doesn't grant them.<br>
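+A sketch of the client-side gating (assuming the flag-based variant
+described above):<br>
+<pre>/* Client applying a PERM record: grant only when the master asks. */<br>if (leases_configured &amp;&amp; F_ISSET(rp, REPCTL_LEASE))<br> __rep_send_message(..., REP_LEASE_GRANT, ...);<br>/* Otherwise leases stay configured locally but simply expired. */</pre>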
+<h2>Testing</h2>
+Clearly any user-facing API changes will need the equivalent reflection
+in the Tcl API for testing, under CONFIG_TEST.<br>
+<br>
+I am sure the list of tests will grow but off the top of my head:<br>
+Basic test: have N sites all configure leases, run some workload, read on
+master, etc.<br>
+Refresh test: Perform update on master, sleep until past expiration,
+read on master and make sure leases are refreshed and the read succeeds.<br>
+Error test: Test error conditions (reading on client with leases but no
+ignore flag, calling after rep_start, etc.)<br>
+Read test: Test reading on both client and master both with and without
+the IGNORE flag.&nbsp; Test that data read with the ignore flag can be
+rolled back.<br>
+Dupmaster test: Force a DUPMASTER situation and verify that the newer
+master cannot get the DUPMASTER error.<br>
+Election test: Call election while grant is outstanding and master
+exists.<br>
+Call election while grant is outstanding and master does not exist.<br>
+Call election after expiration on a quiescent system with the master
+existing.<br>
+Run with a group where some members have leases configured and others do
+not to make sure we get errors instead of dumping core.<br>
+<br>
+</body>
+</html>
diff --git a/src/rep/rep.msg b/src/rep/rep.msg
new file mode 100644
index 00000000..b751a64d
--- /dev/null
+++ b/src/rep/rep.msg
@@ -0,0 +1,160 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+DBPRIVATE
+PREFIX __rep
+
+INCLUDE #include "db_int.h"
+INCLUDE #include "dbinc/db_page.h"
+INCLUDE #include "dbinc/db_am.h"
+INCLUDE #include "dbinc/mp.h"
+INCLUDE #include "dbinc/txn.h"
+INCLUDE
+
+/*
+ * bulk - message for bulk log records or pages
+ */
+BEGIN_MSG bulk check_length
+ARG len u_int32_t
+ARG lsn DB_LSN
+ARG bulkdata DBT
+END
+
+/*
+ * control - replication control message
+ */
+BEGIN_MSG control check_length
+ARG rep_version u_int32_t
+ARG log_version u_int32_t
+ARG lsn DB_LSN
+ARG rectype u_int32_t
+ARG gen u_int32_t
+ARG msg_sec u_int32_t
+ARG msg_nsec u_int32_t
+ARG flags u_int32_t
+END
+
+/*
+ * egen data
+ */
+BEGIN_MSG egen check_length
+ARG egen u_int32_t
+END
+
+/*
+ * file info
+ *
+ * NOTE: The order of the DBTs is important and relevant in the
+ * GET_CURINFO macro.
+ */
+BEGIN_MSG fileinfo alloc check_length version
+ARG pgsize u_int32_t
+ARG pgno db_pgno_t
+ARG max_pgno db_pgno_t
+ARG filenum u_int32_t
+ARG finfo_flags u_int32_t
+ARG type u_int32_t
+ARG db_flags u_int32_t
+ARG uid DBT
+ARG info DBT
+ARG dir DBT
+END
+
+BEGIN_MSG fileinfo_v6 alloc check_length version
+ARG pgsize u_int32_t
+ARG pgno db_pgno_t
+ARG max_pgno db_pgno_t
+ARG filenum u_int32_t
+ARG finfo_flags u_int32_t
+ARG type u_int32_t
+ARG db_flags u_int32_t
+ARG uid DBT
+ARG info DBT
+END
+
+/*
+ * grant info - clients send to masters granting a lease.
+ */
+BEGIN_MSG grant_info check_length
+ARG msg_sec u_int32_t
+ARG msg_nsec u_int32_t
+END
+
+/*
+ * We do not need to do anything with LOG record data.
+ * It is opaque data to us.
+ */
+
+/*
+ * log request
+ */
+BEGIN_MSG logreq check_length
+ARG endlsn DB_LSN
+END
+
+/*
+ * We do not need to do anything with NEWCLIENT/NEWSITE cdata dbt.
+ * It is user data and the app has to do whatever transformation
+ * it needs to with its own data.
+ */
+/*
+ * newfile version
+ */
+BEGIN_MSG newfile check_length
+ARG version u_int32_t
+END
+
+/*
+ * update - send update information
+ */
+BEGIN_MSG update alloc check_length version
+ARG first_lsn DB_LSN
+ARG first_vers u_int32_t
+ARG num_files u_int32_t
+END
+
+/*
+ * vote info. Current version.
+ */
+BEGIN_MSG vote_info check_length
+ARG egen u_int32_t
+ARG nsites u_int32_t
+ARG nvotes u_int32_t
+ARG priority u_int32_t
+ARG spare_pri u_int32_t
+ARG tiebreaker u_int32_t
+ARG data_gen u_int32_t
+END
+/*
+ * vote info old version from REPVERSION 5 and earlier.
+ */
+BEGIN_MSG vote_info_v5 check_length
+ARG egen u_int32_t
+ARG nsites u_int32_t
+ARG nvotes u_int32_t
+ARG priority u_int32_t
+ARG tiebreaker u_int32_t
+END
+
+/*
+ * LSN history database - key
+ */
+BEGIN_MSG lsn_hist_key
+ARG version u_int32_t
+ARG gen u_int32_t
+END
+
+/*
+ * LSN history database - data
+ */
+BEGIN_MSG lsn_hist_data
+ARG envid u_int32_t
+ARG lsn DB_LSN
+ARG hist_sec u_int32_t
+ARG hist_nsec u_int32_t
+END
diff --git a/src/rep/rep_automsg.c b/src/rep/rep_automsg.c
new file mode 100644
index 00000000..5d8155fb
--- /dev/null
+++ b/src/rep/rep_automsg.c
@@ -0,0 +1,1041 @@
+/* Do not edit: automatically built by gen_msg.awk. */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+/*
+ * PUBLIC: int __rep_bulk_marshal __P((ENV *, __rep_bulk_args *,
+ * PUBLIC: u_int8_t *, size_t, size_t *));
+ */
+int
+__rep_bulk_marshal(env, argp, bp, max, lenp)
+ ENV *env;
+ __rep_bulk_args *argp;
+ u_int8_t *bp;
+ size_t *lenp, max;
+{
+ u_int8_t *start;
+
+ if (max < __REP_BULK_SIZE
+ + (size_t)argp->bulkdata.size)
+ return (ENOMEM);
+ start = bp;
+
+ DB_HTONL_COPYOUT(env, bp, argp->len);
+ DB_HTONL_COPYOUT(env, bp, argp->lsn.file);
+ DB_HTONL_COPYOUT(env, bp, argp->lsn.offset);
+ DB_HTONL_COPYOUT(env, bp, argp->bulkdata.size);
+ if (argp->bulkdata.size > 0) {
+ memcpy(bp, argp->bulkdata.data, argp->bulkdata.size);
+ bp += argp->bulkdata.size;
+ }
+
+ *lenp = (size_t)(bp - start);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __rep_bulk_unmarshal __P((ENV *, __rep_bulk_args *,
+ * PUBLIC: u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__rep_bulk_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __rep_bulk_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ size_t needed;
+
+ needed = __REP_BULK_SIZE;
+ if (max < needed)
+ goto too_few;
+ DB_NTOHL_COPYIN(env, argp->len, bp);
+ DB_NTOHL_COPYIN(env, argp->lsn.file, bp);
+ DB_NTOHL_COPYIN(env, argp->lsn.offset, bp);
+ DB_NTOHL_COPYIN(env, argp->bulkdata.size, bp);
+ if (argp->bulkdata.size == 0)
+ argp->bulkdata.data = NULL;
+ else
+ argp->bulkdata.data = bp;
+ needed += (size_t)argp->bulkdata.size;
+ if (max < needed)
+ goto too_few;
+ bp += argp->bulkdata.size;
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __rep_bulk message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: int __rep_control_marshal __P((ENV *, __rep_control_args *,
+ * PUBLIC: u_int8_t *, size_t, size_t *));
+ */
+int
+__rep_control_marshal(env, argp, bp, max, lenp)
+ ENV *env;
+ __rep_control_args *argp;
+ u_int8_t *bp;
+ size_t *lenp, max;
+{
+ u_int8_t *start;
+
+ if (max < __REP_CONTROL_SIZE)
+ return (ENOMEM);
+ start = bp;
+
+ DB_HTONL_COPYOUT(env, bp, argp->rep_version);
+ DB_HTONL_COPYOUT(env, bp, argp->log_version);
+ DB_HTONL_COPYOUT(env, bp, argp->lsn.file);
+ DB_HTONL_COPYOUT(env, bp, argp->lsn.offset);
+ DB_HTONL_COPYOUT(env, bp, argp->rectype);
+ DB_HTONL_COPYOUT(env, bp, argp->gen);
+ DB_HTONL_COPYOUT(env, bp, argp->msg_sec);
+ DB_HTONL_COPYOUT(env, bp, argp->msg_nsec);
+ DB_HTONL_COPYOUT(env, bp, argp->flags);
+
+ *lenp = (size_t)(bp - start);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __rep_control_unmarshal __P((ENV *,
+ * PUBLIC: __rep_control_args *, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__rep_control_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __rep_control_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ if (max < __REP_CONTROL_SIZE)
+ goto too_few;
+ DB_NTOHL_COPYIN(env, argp->rep_version, bp);
+ DB_NTOHL_COPYIN(env, argp->log_version, bp);
+ DB_NTOHL_COPYIN(env, argp->lsn.file, bp);
+ DB_NTOHL_COPYIN(env, argp->lsn.offset, bp);
+ DB_NTOHL_COPYIN(env, argp->rectype, bp);
+ DB_NTOHL_COPYIN(env, argp->gen, bp);
+ DB_NTOHL_COPYIN(env, argp->msg_sec, bp);
+ DB_NTOHL_COPYIN(env, argp->msg_nsec, bp);
+ DB_NTOHL_COPYIN(env, argp->flags, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __rep_control message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: int __rep_egen_marshal __P((ENV *, __rep_egen_args *,
+ * PUBLIC: u_int8_t *, size_t, size_t *));
+ */
+int
+__rep_egen_marshal(env, argp, bp, max, lenp)
+ ENV *env;
+ __rep_egen_args *argp;
+ u_int8_t *bp;
+ size_t *lenp, max;
+{
+ u_int8_t *start;
+
+ if (max < __REP_EGEN_SIZE)
+ return (ENOMEM);
+ start = bp;
+
+ DB_HTONL_COPYOUT(env, bp, argp->egen);
+
+ *lenp = (size_t)(bp - start);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __rep_egen_unmarshal __P((ENV *, __rep_egen_args *,
+ * PUBLIC: u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__rep_egen_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __rep_egen_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ if (max < __REP_EGEN_SIZE)
+ goto too_few;
+ DB_NTOHL_COPYIN(env, argp->egen, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __rep_egen message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: int __rep_fileinfo_marshal __P((ENV *, u_int32_t,
+ * PUBLIC: __rep_fileinfo_args *, u_int8_t *, size_t, size_t *));
+ */
+int
+__rep_fileinfo_marshal(env, version, argp, bp, max, lenp)
+ ENV *env;
+ u_int32_t version;
+ __rep_fileinfo_args *argp;
+ u_int8_t *bp;
+ size_t *lenp, max;
+{
+ int copy_only;
+ u_int8_t *start;
+
+ if (max < __REP_FILEINFO_SIZE
+ + (size_t)argp->uid.size
+ + (size_t)argp->info.size
+ + (size_t)argp->dir.size)
+ return (ENOMEM);
+ start = bp;
+
+ copy_only = 0;
+ if (version < DB_REPVERSION_47)
+ copy_only = 1;
+ if (copy_only) {
+ memcpy(bp, &argp->pgsize, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->pgsize);
+ if (copy_only) {
+ memcpy(bp, &argp->pgno, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->pgno);
+ if (copy_only) {
+ memcpy(bp, &argp->max_pgno, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->max_pgno);
+ if (copy_only) {
+ memcpy(bp, &argp->filenum, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->filenum);
+ if (copy_only) {
+ memcpy(bp, &argp->finfo_flags, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->finfo_flags);
+ if (copy_only) {
+ memcpy(bp, &argp->type, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->type);
+ if (copy_only) {
+ memcpy(bp, &argp->db_flags, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->db_flags);
+ if (copy_only) {
+ memcpy(bp, &argp->uid.size, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->uid.size);
+ if (argp->uid.size > 0) {
+ memcpy(bp, argp->uid.data, argp->uid.size);
+ bp += argp->uid.size;
+ }
+ if (copy_only) {
+ memcpy(bp, &argp->info.size, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->info.size);
+ if (argp->info.size > 0) {
+ memcpy(bp, argp->info.data, argp->info.size);
+ bp += argp->info.size;
+ }
+ if (copy_only) {
+ memcpy(bp, &argp->dir.size, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->dir.size);
+ if (argp->dir.size > 0) {
+ memcpy(bp, argp->dir.data, argp->dir.size);
+ bp += argp->dir.size;
+ }
+
+ *lenp = (size_t)(bp - start);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __rep_fileinfo_unmarshal __P((ENV *, u_int32_t,
+ * PUBLIC: __rep_fileinfo_args **, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__rep_fileinfo_unmarshal(env, version, argpp, bp, max, nextp)
+ ENV *env;
+ u_int32_t version;
+ __rep_fileinfo_args **argpp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ size_t needed;
+ __rep_fileinfo_args *argp;
+ int ret;
+ int copy_only;
+
+ needed = __REP_FILEINFO_SIZE;
+ if (max < needed)
+ goto too_few;
+ if ((ret = __os_malloc(env, sizeof(*argp), &argp)) != 0)
+ return (ret);
+
+ copy_only = 0;
+ if (version < DB_REPVERSION_47)
+ copy_only = 1;
+ if (copy_only) {
+ memcpy(&argp->pgsize, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_NTOHL_COPYIN(env, argp->pgsize, bp);
+ if (copy_only) {
+ memcpy(&argp->pgno, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_NTOHL_COPYIN(env, argp->pgno, bp);
+ if (copy_only) {
+ memcpy(&argp->max_pgno, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_NTOHL_COPYIN(env, argp->max_pgno, bp);
+ if (copy_only) {
+ memcpy(&argp->filenum, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_NTOHL_COPYIN(env, argp->filenum, bp);
+ if (copy_only) {
+ memcpy(&argp->finfo_flags, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_NTOHL_COPYIN(env, argp->finfo_flags, bp);
+ if (copy_only) {
+ memcpy(&argp->type, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_NTOHL_COPYIN(env, argp->type, bp);
+ if (copy_only) {
+ memcpy(&argp->db_flags, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_NTOHL_COPYIN(env, argp->db_flags, bp);
+ if (copy_only) {
+ memcpy(&argp->uid.size, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_NTOHL_COPYIN(env, argp->uid.size, bp);
+ if (argp->uid.size == 0)
+ argp->uid.data = NULL;
+ else
+ argp->uid.data = bp;
+ needed += (size_t)argp->uid.size;
+ if (max < needed)
+ goto too_few;
+ bp += argp->uid.size;
+ if (copy_only) {
+ memcpy(&argp->info.size, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_NTOHL_COPYIN(env, argp->info.size, bp);
+ if (argp->info.size == 0)
+ argp->info.data = NULL;
+ else
+ argp->info.data = bp;
+ needed += (size_t)argp->info.size;
+ if (max < needed)
+ goto too_few;
+ bp += argp->info.size;
+ if (copy_only) {
+ memcpy(&argp->dir.size, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_NTOHL_COPYIN(env, argp->dir.size, bp);
+ if (argp->dir.size == 0)
+ argp->dir.data = NULL;
+ else
+ argp->dir.data = bp;
+ needed += (size_t)argp->dir.size;
+ if (max < needed)
+ goto too_few;
+ bp += argp->dir.size;
+
+ if (nextp != NULL)
+ *nextp = bp;
+ *argpp = argp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __rep_fileinfo message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: int __rep_fileinfo_v6_marshal __P((ENV *, u_int32_t,
+ * PUBLIC: __rep_fileinfo_v6_args *, u_int8_t *, size_t, size_t *));
+ */
+int
+__rep_fileinfo_v6_marshal(env, version, argp, bp, max, lenp)
+ ENV *env;
+ u_int32_t version;
+ __rep_fileinfo_v6_args *argp;
+ u_int8_t *bp;
+ size_t *lenp, max;
+{
+ int copy_only;
+ u_int8_t *start;
+
+ if (max < __REP_FILEINFO_V6_SIZE
+ + (size_t)argp->uid.size
+ + (size_t)argp->info.size)
+ return (ENOMEM);
+ start = bp;
+
+ copy_only = 0;
+ if (version < DB_REPVERSION_47)
+ copy_only = 1;
+ if (copy_only) {
+ memcpy(bp, &argp->pgsize, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->pgsize);
+ if (copy_only) {
+ memcpy(bp, &argp->pgno, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->pgno);
+ if (copy_only) {
+ memcpy(bp, &argp->max_pgno, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->max_pgno);
+ if (copy_only) {
+ memcpy(bp, &argp->filenum, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->filenum);
+ if (copy_only) {
+ memcpy(bp, &argp->finfo_flags, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->finfo_flags);
+ if (copy_only) {
+ memcpy(bp, &argp->type, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->type);
+ if (copy_only) {
+ memcpy(bp, &argp->db_flags, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->db_flags);
+ if (copy_only) {
+ memcpy(bp, &argp->uid.size, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->uid.size);
+ if (argp->uid.size > 0) {
+ memcpy(bp, argp->uid.data, argp->uid.size);
+ bp += argp->uid.size;
+ }
+ if (copy_only) {
+ memcpy(bp, &argp->info.size, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->info.size);
+ if (argp->info.size > 0) {
+ memcpy(bp, argp->info.data, argp->info.size);
+ bp += argp->info.size;
+ }
+
+ *lenp = (size_t)(bp - start);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __rep_fileinfo_v6_unmarshal __P((ENV *, u_int32_t,
+ * PUBLIC: __rep_fileinfo_v6_args **, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__rep_fileinfo_v6_unmarshal(env, version, argpp, bp, max, nextp)
+ ENV *env;
+ u_int32_t version;
+ __rep_fileinfo_v6_args **argpp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ size_t needed;
+ __rep_fileinfo_v6_args *argp;
+ int ret;
+ int copy_only;
+
+ needed = __REP_FILEINFO_V6_SIZE;
+ if (max < needed)
+ goto too_few;
+ if ((ret = __os_malloc(env, sizeof(*argp), &argp)) != 0)
+ return (ret);
+
+ copy_only = 0;
+ if (version < DB_REPVERSION_47)
+ copy_only = 1;
+ if (copy_only) {
+ memcpy(&argp->pgsize, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_NTOHL_COPYIN(env, argp->pgsize, bp);
+ if (copy_only) {
+ memcpy(&argp->pgno, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_NTOHL_COPYIN(env, argp->pgno, bp);
+ if (copy_only) {
+ memcpy(&argp->max_pgno, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_NTOHL_COPYIN(env, argp->max_pgno, bp);
+ if (copy_only) {
+ memcpy(&argp->filenum, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_NTOHL_COPYIN(env, argp->filenum, bp);
+ if (copy_only) {
+ memcpy(&argp->finfo_flags, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_NTOHL_COPYIN(env, argp->finfo_flags, bp);
+ if (copy_only) {
+ memcpy(&argp->type, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_NTOHL_COPYIN(env, argp->type, bp);
+ if (copy_only) {
+ memcpy(&argp->db_flags, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_NTOHL_COPYIN(env, argp->db_flags, bp);
+ if (copy_only) {
+ memcpy(&argp->uid.size, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_NTOHL_COPYIN(env, argp->uid.size, bp);
+ if (argp->uid.size == 0)
+ argp->uid.data = NULL;
+ else
+ argp->uid.data = bp;
+ needed += (size_t)argp->uid.size;
+ if (max < needed)
+ goto too_few;
+ bp += argp->uid.size;
+ if (copy_only) {
+ memcpy(&argp->info.size, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_NTOHL_COPYIN(env, argp->info.size, bp);
+ if (argp->info.size == 0)
+ argp->info.data = NULL;
+ else
+ argp->info.data = bp;
+ needed += (size_t)argp->info.size;
+ if (max < needed)
+ goto too_few;
+ bp += argp->info.size;
+
+ if (nextp != NULL)
+ *nextp = bp;
+ *argpp = argp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __rep_fileinfo_v6 message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: int __rep_grant_info_marshal __P((ENV *,
+ * PUBLIC: __rep_grant_info_args *, u_int8_t *, size_t, size_t *));
+ */
+int
+__rep_grant_info_marshal(env, argp, bp, max, lenp)
+ ENV *env;
+ __rep_grant_info_args *argp;
+ u_int8_t *bp;
+ size_t *lenp, max;
+{
+ u_int8_t *start;
+
+ if (max < __REP_GRANT_INFO_SIZE)
+ return (ENOMEM);
+ start = bp;
+
+ DB_HTONL_COPYOUT(env, bp, argp->msg_sec);
+ DB_HTONL_COPYOUT(env, bp, argp->msg_nsec);
+
+ *lenp = (size_t)(bp - start);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __rep_grant_info_unmarshal __P((ENV *,
+ * PUBLIC: __rep_grant_info_args *, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__rep_grant_info_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __rep_grant_info_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ if (max < __REP_GRANT_INFO_SIZE)
+ goto too_few;
+ DB_NTOHL_COPYIN(env, argp->msg_sec, bp);
+ DB_NTOHL_COPYIN(env, argp->msg_nsec, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __rep_grant_info message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: int __rep_logreq_marshal __P((ENV *, __rep_logreq_args *,
+ * PUBLIC: u_int8_t *, size_t, size_t *));
+ */
+int
+__rep_logreq_marshal(env, argp, bp, max, lenp)
+ ENV *env;
+ __rep_logreq_args *argp;
+ u_int8_t *bp;
+ size_t *lenp, max;
+{
+ u_int8_t *start;
+
+ if (max < __REP_LOGREQ_SIZE)
+ return (ENOMEM);
+ start = bp;
+
+ DB_HTONL_COPYOUT(env, bp, argp->endlsn.file);
+ DB_HTONL_COPYOUT(env, bp, argp->endlsn.offset);
+
+ *lenp = (size_t)(bp - start);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __rep_logreq_unmarshal __P((ENV *, __rep_logreq_args *,
+ * PUBLIC: u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__rep_logreq_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __rep_logreq_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ if (max < __REP_LOGREQ_SIZE)
+ goto too_few;
+ DB_NTOHL_COPYIN(env, argp->endlsn.file, bp);
+ DB_NTOHL_COPYIN(env, argp->endlsn.offset, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __rep_logreq message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: int __rep_newfile_marshal __P((ENV *, __rep_newfile_args *,
+ * PUBLIC: u_int8_t *, size_t, size_t *));
+ */
+int
+__rep_newfile_marshal(env, argp, bp, max, lenp)
+ ENV *env;
+ __rep_newfile_args *argp;
+ u_int8_t *bp;
+ size_t *lenp, max;
+{
+ u_int8_t *start;
+
+ if (max < __REP_NEWFILE_SIZE)
+ return (ENOMEM);
+ start = bp;
+
+ DB_HTONL_COPYOUT(env, bp, argp->version);
+
+ *lenp = (size_t)(bp - start);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __rep_newfile_unmarshal __P((ENV *,
+ * PUBLIC: __rep_newfile_args *, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__rep_newfile_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __rep_newfile_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ if (max < __REP_NEWFILE_SIZE)
+ goto too_few;
+ DB_NTOHL_COPYIN(env, argp->version, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __rep_newfile message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: int __rep_update_marshal __P((ENV *, u_int32_t,
+ * PUBLIC: __rep_update_args *, u_int8_t *, size_t, size_t *));
+ */
+int
+__rep_update_marshal(env, version, argp, bp, max, lenp)
+ ENV *env;
+ u_int32_t version;
+ __rep_update_args *argp;
+ u_int8_t *bp;
+ size_t *lenp, max;
+{
+ int copy_only;
+ u_int8_t *start;
+
+ if (max < __REP_UPDATE_SIZE)
+ return (ENOMEM);
+ start = bp;
+
+ copy_only = 0;
+ if (version < DB_REPVERSION_47)
+ copy_only = 1;
+ if (copy_only) {
+ memcpy(bp, &argp->first_lsn.file, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ memcpy(bp, &argp->first_lsn.offset, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else {
+ DB_HTONL_COPYOUT(env, bp, argp->first_lsn.file);
+ DB_HTONL_COPYOUT(env, bp, argp->first_lsn.offset);
+ }
+ if (copy_only) {
+ memcpy(bp, &argp->first_vers, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->first_vers);
+ if (copy_only) {
+ memcpy(bp, &argp->num_files, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->num_files);
+
+ *lenp = (size_t)(bp - start);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __rep_update_unmarshal __P((ENV *, u_int32_t,
+ * PUBLIC: __rep_update_args **, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__rep_update_unmarshal(env, version, argpp, bp, max, nextp)
+ ENV *env;
+ u_int32_t version;
+ __rep_update_args **argpp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ __rep_update_args *argp;
+ int ret;
+ int copy_only;
+
+ if (max < __REP_UPDATE_SIZE)
+ goto too_few;
+ if ((ret = __os_malloc(env, sizeof(*argp), &argp)) != 0)
+ return (ret);
+
+ copy_only = 0;
+ if (version < DB_REPVERSION_47)
+ copy_only = 1;
+ if (copy_only) {
+ memcpy(&argp->first_lsn.file, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ memcpy(&argp->first_lsn.offset, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else {
+ DB_NTOHL_COPYIN(env, argp->first_lsn.file, bp);
+ DB_NTOHL_COPYIN(env, argp->first_lsn.offset, bp);
+ }
+ if (copy_only) {
+ memcpy(&argp->first_vers, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_NTOHL_COPYIN(env, argp->first_vers, bp);
+ if (copy_only) {
+ memcpy(&argp->num_files, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_NTOHL_COPYIN(env, argp->num_files, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ *argpp = argp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __rep_update message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: int __rep_vote_info_marshal __P((ENV *,
+ * PUBLIC: __rep_vote_info_args *, u_int8_t *, size_t, size_t *));
+ */
+int
+__rep_vote_info_marshal(env, argp, bp, max, lenp)
+ ENV *env;
+ __rep_vote_info_args *argp;
+ u_int8_t *bp;
+ size_t *lenp, max;
+{
+ u_int8_t *start;
+
+ if (max < __REP_VOTE_INFO_SIZE)
+ return (ENOMEM);
+ start = bp;
+
+ DB_HTONL_COPYOUT(env, bp, argp->egen);
+ DB_HTONL_COPYOUT(env, bp, argp->nsites);
+ DB_HTONL_COPYOUT(env, bp, argp->nvotes);
+ DB_HTONL_COPYOUT(env, bp, argp->priority);
+ DB_HTONL_COPYOUT(env, bp, argp->spare_pri);
+ DB_HTONL_COPYOUT(env, bp, argp->tiebreaker);
+ DB_HTONL_COPYOUT(env, bp, argp->data_gen);
+
+ *lenp = (size_t)(bp - start);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __rep_vote_info_unmarshal __P((ENV *,
+ * PUBLIC: __rep_vote_info_args *, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__rep_vote_info_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __rep_vote_info_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ if (max < __REP_VOTE_INFO_SIZE)
+ goto too_few;
+ DB_NTOHL_COPYIN(env, argp->egen, bp);
+ DB_NTOHL_COPYIN(env, argp->nsites, bp);
+ DB_NTOHL_COPYIN(env, argp->nvotes, bp);
+ DB_NTOHL_COPYIN(env, argp->priority, bp);
+ DB_NTOHL_COPYIN(env, argp->spare_pri, bp);
+ DB_NTOHL_COPYIN(env, argp->tiebreaker, bp);
+ DB_NTOHL_COPYIN(env, argp->data_gen, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __rep_vote_info message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: int __rep_vote_info_v5_marshal __P((ENV *,
+ * PUBLIC: __rep_vote_info_v5_args *, u_int8_t *, size_t, size_t *));
+ */
+int
+__rep_vote_info_v5_marshal(env, argp, bp, max, lenp)
+ ENV *env;
+ __rep_vote_info_v5_args *argp;
+ u_int8_t *bp;
+ size_t *lenp, max;
+{
+ u_int8_t *start;
+
+ if (max < __REP_VOTE_INFO_V5_SIZE)
+ return (ENOMEM);
+ start = bp;
+
+ DB_HTONL_COPYOUT(env, bp, argp->egen);
+ DB_HTONL_COPYOUT(env, bp, argp->nsites);
+ DB_HTONL_COPYOUT(env, bp, argp->nvotes);
+ DB_HTONL_COPYOUT(env, bp, argp->priority);
+ DB_HTONL_COPYOUT(env, bp, argp->tiebreaker);
+
+ *lenp = (size_t)(bp - start);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __rep_vote_info_v5_unmarshal __P((ENV *,
+ * PUBLIC: __rep_vote_info_v5_args *, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__rep_vote_info_v5_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __rep_vote_info_v5_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ if (max < __REP_VOTE_INFO_V5_SIZE)
+ goto too_few;
+ DB_NTOHL_COPYIN(env, argp->egen, bp);
+ DB_NTOHL_COPYIN(env, argp->nsites, bp);
+ DB_NTOHL_COPYIN(env, argp->nvotes, bp);
+ DB_NTOHL_COPYIN(env, argp->priority, bp);
+ DB_NTOHL_COPYIN(env, argp->tiebreaker, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __rep_vote_info_v5 message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: void __rep_lsn_hist_key_marshal __P((ENV *,
+ * PUBLIC: __rep_lsn_hist_key_args *, u_int8_t *));
+ */
+void
+__rep_lsn_hist_key_marshal(env, argp, bp)
+ ENV *env;
+ __rep_lsn_hist_key_args *argp;
+ u_int8_t *bp;
+{
+ DB_HTONL_COPYOUT(env, bp, argp->version);
+ DB_HTONL_COPYOUT(env, bp, argp->gen);
+}
+
+/*
+ * PUBLIC: int __rep_lsn_hist_key_unmarshal __P((ENV *,
+ * PUBLIC: __rep_lsn_hist_key_args *, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__rep_lsn_hist_key_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __rep_lsn_hist_key_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ if (max < __REP_LSN_HIST_KEY_SIZE)
+ goto too_few;
+ DB_NTOHL_COPYIN(env, argp->version, bp);
+ DB_NTOHL_COPYIN(env, argp->gen, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __rep_lsn_hist_key message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: void __rep_lsn_hist_data_marshal __P((ENV *,
+ * PUBLIC: __rep_lsn_hist_data_args *, u_int8_t *));
+ */
+void
+__rep_lsn_hist_data_marshal(env, argp, bp)
+ ENV *env;
+ __rep_lsn_hist_data_args *argp;
+ u_int8_t *bp;
+{
+ DB_HTONL_COPYOUT(env, bp, argp->envid);
+ DB_HTONL_COPYOUT(env, bp, argp->lsn.file);
+ DB_HTONL_COPYOUT(env, bp, argp->lsn.offset);
+ DB_HTONL_COPYOUT(env, bp, argp->hist_sec);
+ DB_HTONL_COPYOUT(env, bp, argp->hist_nsec);
+}
+
+/*
+ * PUBLIC: int __rep_lsn_hist_data_unmarshal __P((ENV *,
+ * PUBLIC: __rep_lsn_hist_data_args *, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__rep_lsn_hist_data_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __rep_lsn_hist_data_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ if (max < __REP_LSN_HIST_DATA_SIZE)
+ goto too_few;
+ DB_NTOHL_COPYIN(env, argp->envid, bp);
+ DB_NTOHL_COPYIN(env, argp->lsn.file, bp);
+ DB_NTOHL_COPYIN(env, argp->lsn.offset, bp);
+ DB_NTOHL_COPYIN(env, argp->hist_sec, bp);
+ DB_NTOHL_COPYIN(env, argp->hist_nsec, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __rep_lsn_hist_data message"));
+ return (EINVAL);
+}
+
diff --git a/src/rep/rep_backup.c b/src/rep/rep_backup.c
new file mode 100644
index 00000000..cfde7622
--- /dev/null
+++ b/src/rep/rep_backup.c
@@ -0,0 +1,3568 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2004, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/fop.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/qam.h"
+#include "dbinc/txn.h"
+
+/*
+ * Context information needed for buffer management during the building of a
+ * list of database files present in the environment. When fully built, the
+ * buffer is in the form of an UPDATE message: a (marshaled) update_args,
+ * followed by some number of (marshaled) fileinfo_args.
+ *
+ * Note that the fileinfo for the first file in the list always appears at
+ * (constant) offset __REP_UPDATE_SIZE in the buffer.
+ */
+typedef struct {
+ u_int8_t *buf; /* Buffer base address. */
+ u_int32_t size; /* Total allocated buffer size. */
+ u_int8_t *fillptr; /* Pointer to first unused space. */
+ u_int32_t count; /* Number of entries currently in list. */
+ u_int32_t version; /* Rep version of marshaled format. */
+} FILE_LIST_CTX;
+#define FIRST_FILE_PTR(buf) ((buf) + __REP_UPDATE_SIZE)
+
+/*
+ * Function that performs any desired processing on a single file, as part of
+ * the traversal of a list of database files, such as with internal init.
+ */
+typedef int (FILE_WALK_FN) __P((ENV *, __rep_fileinfo_args *, void *));
+
+static FILE_WALK_FN __rep_check_uid;
+static int __rep_clean_interrupted __P((ENV *));
+static FILE_WALK_FN __rep_cleanup_nimdbs;
+static int __rep_filedone __P((ENV *, DB_THREAD_INFO *ip, int,
+ REP *, __rep_fileinfo_args *, u_int32_t));
+static int __rep_find_dbs __P((ENV *, FILE_LIST_CTX *));
+static FILE_WALK_FN __rep_find_inmem;
+static int __rep_get_fileinfo __P((ENV *, const char *,
+ const char *, __rep_fileinfo_args *, u_int8_t *));
+static int __rep_get_file_list __P((ENV *,
+ DB_FH *, u_int32_t, u_int32_t *, DBT *));
+static int __rep_is_replicated_db __P((const char *, const char *));
+static int __rep_log_setup __P((ENV *,
+ REP *, u_int32_t, u_int32_t, DB_LSN *));
+static int __rep_mpf_open __P((ENV *, DB_MPOOLFILE **,
+ __rep_fileinfo_args *, u_int32_t));
+static int __rep_nextfile __P((ENV *, int, REP *));
+static int __rep_page_gap __P((ENV *,
+ REP *, __rep_fileinfo_args *, u_int32_t));
+static int __rep_page_sendpages __P((ENV *, DB_THREAD_INFO *, int,
+ __rep_control_args *, __rep_fileinfo_args *, DB_MPOOLFILE *, DB *));
+static int __rep_queue_filedone __P((ENV *,
+ DB_THREAD_INFO *, REP *, __rep_fileinfo_args *));
+static int __rep_remove_all __P((ENV *, u_int32_t, DBT *));
+static FILE_WALK_FN __rep_remove_by_list;
+static int __rep_remove_by_prefix __P((ENV *, const char *, const char *,
+ size_t, APPNAME));
+static FILE_WALK_FN __rep_remove_file;
+static int __rep_remove_logs __P((ENV *));
+static int __rep_remove_nimdbs __P((ENV *));
+static int __rep_rollback __P((ENV *, DB_LSN *));
+static int __rep_unlink_by_list __P((ENV *, u_int32_t,
+ u_int8_t *, u_int32_t, u_int32_t));
+static FILE_WALK_FN __rep_unlink_file;
+static int __rep_walk_filelist __P((ENV *, u_int32_t, u_int8_t *,
+ u_int32_t, u_int32_t, FILE_WALK_FN *, void *));
+static int __rep_walk_dir __P((ENV *, const char *, const char *,
+ FILE_LIST_CTX*));
+static int __rep_write_page __P((ENV *,
+ DB_THREAD_INFO *, REP *, __rep_fileinfo_args *));
+
+/*
+ * __rep_update_req -
+ * Process an update_req and send the file information to clients.
+ *
+ * PUBLIC: int __rep_update_req __P((ENV *, __rep_control_args *));
+ */
+int
+__rep_update_req(env, rp)
+ ENV *env;
+ __rep_control_args *rp;
+{
+ DBT updbt, vdbt;
+ DB_LOG *dblp;
+ DB_LOGC *logc;
+ DB_LSN lsn;
+ DB_REP *db_rep;
+ REP *rep;
+ __rep_update_args u_args;
+ FILE_LIST_CTX context;
+ size_t updlen;
+ u_int32_t flag, version;
+ int ret, t_ret;
+
+ /*
+ * Start by allocating 1Meg, which ought to be plenty to describe
+ * all databases in the environment. (If it's not, __rep_walk_dir can
+ * grow the size.)
+ *
+ * The data we send looks like this:
+ * __rep_update_args
+ * __rep_fileinfo_args
+ * __rep_fileinfo_args
+ * ...
+ */
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ REP_SYSTEM_LOCK(env);
+ if (F_ISSET(rep, REP_F_INUPDREQ)) {
+ REP_SYSTEM_UNLOCK(env);
+ return (0);
+ }
+ F_SET(rep, REP_F_INUPDREQ);
+ REP_SYSTEM_UNLOCK(env);
+
+ dblp = env->lg_handle;
+ logc = NULL;
+ if ((ret = __os_calloc(env, 1, MEGABYTE, &context.buf)) != 0)
+ goto err_noalloc;
+ context.size = MEGABYTE;
+ context.count = 0;
+ context.version = rp->rep_version;
+
+ /* Reserve space for the update_args, and fill in file info. */
+ context.fillptr = FIRST_FILE_PTR(context.buf);
+ if ((ret = __rep_find_dbs(env, &context)) != 0)
+ goto err;
+
+ /*
+ * Now get our first LSN. We send the lsn of the first
+ * non-archivable log file.
+ */
+ flag = DB_SET;
+ if ((ret = __log_get_stable_lsn(env, &lsn, 0)) != 0) {
+ if (ret != DB_NOTFOUND)
+ goto err;
+ /*
+ * If ret is DB_NOTFOUND, there is no checkpoint in
+ * this log; that is okay, just start at the beginning.
+ */
+ ret = 0;
+ flag = DB_FIRST;
+ }
+
+ /*
+ * Now get the version number of the log file of that LSN.
+ */
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ goto err;
+
+ memset(&vdbt, 0, sizeof(vdbt));
+ /*
+ * Set our log cursor on the LSN we are sending, or on
+ * the first LSN if we have no stable LSN.
+ */
+ if ((ret = __logc_get(logc, &lsn, &vdbt, flag)) != 0) {
+ /*
+ * We could be racing a fresh master starting up. If we
+ * have no log records, assume an initial LSN and current
+ * log version.
+ */
+ if (ret != DB_NOTFOUND)
+ goto err;
+ INIT_LSN(lsn);
+ version = DB_LOGVERSION;
+ } else {
+ if ((ret = __logc_version(logc, &version)) != 0)
+ goto err;
+ }
+ /*
+ * Package up the update information.
+ */
+ u_args.first_lsn = lsn;
+ u_args.first_vers = version;
+ u_args.num_files = context.count;
+ if ((ret = __rep_update_marshal(env, rp->rep_version,
+ &u_args, context.buf, __REP_UPDATE_SIZE, &updlen)) != 0)
+ goto err;
+ DB_ASSERT(env, updlen == __REP_UPDATE_SIZE);
+
+ /*
+ * We have all the file information now. Send it.
+ */
+ DB_INIT_DBT(updbt, context.buf, context.fillptr - context.buf);
+
+ LOG_SYSTEM_LOCK(env);
+ lsn = ((LOG *)dblp->reginfo.primary)->lsn;
+ LOG_SYSTEM_UNLOCK(env);
+ (void)__rep_send_message(
+ env, DB_EID_BROADCAST, REP_UPDATE, &lsn, &updbt, 0, 0);
+
+err: __os_free(env, context.buf);
+err_noalloc:
+ if (logc != NULL && (t_ret = __logc_close(logc)) != 0 && ret == 0)
+ ret = t_ret;
+ REP_SYSTEM_LOCK(env);
+ F_CLR(rep, REP_F_INUPDREQ);
+ REP_SYSTEM_UNLOCK(env);
+ return (ret);
+}
+
+/*
+ * __rep_find_dbs -
+ * Walk through all the named files/databases including those in the
+ * environment or data_dirs and those that in named and in-memory. We
+ * need to open them, gather the necessary information and then close
+ * them.
+ *
+ * May be called either while holding REP_SYSTEM_LOCK or without.
+ */
+static int
+__rep_find_dbs(env, context)
+ ENV *env;
+ FILE_LIST_CTX *context;
+{
+ DB_ENV *dbenv;
+ int ret;
+ char **ddir, *real_dir;
+
+ dbenv = env->dbenv;
+ ret = 0;
+ real_dir = NULL;
+
+ /*
+ * If we have a data directory, walk it to get a list of the
+ * replicated user databases. If the application has a metadata_dir,
+ * this will also find any persistent internal system databases.
+ */
+ if (dbenv->db_data_dir != NULL) {
+ for (ddir = dbenv->db_data_dir; *ddir != NULL; ++ddir) {
+ if ((ret = __db_appname(env,
+ DB_APP_NONE, *ddir, NULL, &real_dir)) != 0)
+ break;
+ if ((ret = __rep_walk_dir(env,
+ real_dir, *ddir, context)) != 0)
+ break;
+ __os_free(env, real_dir);
+ real_dir = NULL;
+ }
+ }
+ /*
+ * Walk the environment directory. If the application doesn't
+ * have a metadata_dir, this will return persistent internal system
+ * databases. If the application doesn't have a separate data
+ * directory, this will also return all user databases.
+ */
+ if (ret == 0)
+ ret = __rep_walk_dir(env, env->db_home, NULL, context);
+
+ /* Now, collect any in-memory named databases. */
+ if (ret == 0)
+ ret = __rep_walk_dir(env, NULL, NULL, context);
+
+ if (real_dir != NULL)
+ __os_free(env, real_dir);
+ return (ret);
+}
+
+/*
+ * __rep_walk_dir --
+ *
+ * This is the routine that walks a directory and fills in the structures
+ * that we use to generate messages to the client telling it what
+ * files are available. If the directory name is NULL, then we should
+ * walk the list of in-memory named files.
+ */
+static int
+__rep_walk_dir(env, dir, datadir, context)
+ ENV *env;
+ const char *dir, *datadir;
+ FILE_LIST_CTX *context;
+{
+ __rep_fileinfo_args tmpfp;
+ size_t avail, len;
+ int cnt, first_file, i, ret;
+ u_int8_t uid[DB_FILE_ID_LEN];
+ char *file, **names, *subdb;
+
+ if (dir == NULL) {
+ VPRINT(env, (env, DB_VERB_REP_SYNC,
+ "Walk_dir: Getting info for in-memory named files"));
+ if ((ret = __memp_inmemlist(env, &names, &cnt)) != 0)
+ return (ret);
+ } else {
+ VPRINT(env, (env, DB_VERB_REP_SYNC,
+ "Walk_dir: Getting info for datadir %s, dir: %s",
+ datadir == NULL ? "NULL" : datadir, dir));
+ if ((ret = __os_dirlist(env, dir, 0, &names, &cnt)) != 0)
+ return (ret);
+ }
+ VPRINT(env, (env, DB_VERB_REP_SYNC, "Walk_dir: Dir %s has %d files",
+ (dir == NULL) ? "INMEM" : dir, cnt));
+ first_file = 1;
+ for (i = 0; i < cnt; i++) {
+ VPRINT(env, (env, DB_VERB_REP_SYNC,
+ "Walk_dir: File %d name: %s", i, names[i]));
+ if (!__rep_is_replicated_db(names[i], dir))
+ continue;
+
+ /* We found a file to process. */
+ if (dir == NULL) {
+ file = NULL;
+ subdb = names[i];
+ } else {
+ file = names[i];
+ subdb = NULL;
+ }
+ if ((ret = __rep_get_fileinfo(env,
+ file, subdb, &tmpfp, uid)) != 0) {
+ /*
+ * If we find a file that isn't a database, skip it.
+ */
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "Walk_dir: File %d %s: returned error %s",
+ i, names[i], db_strerror(ret)));
+ ret = 0;
+ continue;
+ }
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "Walk_dir: File %s at 0x%lx: pgsize %lu, max_pgno %lu",
+ names[i], P_TO_ULONG(context->fillptr),
+ (u_long)tmpfp.pgsize, (u_long)tmpfp.max_pgno));
+
+ /*
+ * On the first time through the loop, check to see if the file
+ * we're about to add is already on the list. If it is, it must
+ * have been added in a previous call, and that means the
+ * directory we're currently scanning has already been scanned
+ * before. (This can happen if the user called
+ * env->set_data_dir() more than once for the same directory.)
+ * If that's the case, we're done: not only is it a waste of
+ * time to scan the same directory again, but doing so would
+ * result in the same files appearing in the list more than
+ * once.
+ */
+ if (first_file && dir != NULL &&
+ (ret = __rep_walk_filelist(env, context->version,
+ FIRST_FILE_PTR(context->buf), context->size,
+ context->count, __rep_check_uid, uid)) != 0) {
+ if (ret == DB_KEYEXIST)
+ ret = 0;
+ goto err;
+ }
+ first_file = 0;
+
+ /*
+ * Finally we know that this file is a suitable database file
+ * that we haven't yet included on our list.
+ */
+ tmpfp.filenum = context->count++;
+
+ if (datadir != NULL)
+ DB_SET_DBT(tmpfp.dir, datadir, strlen(datadir) + 1);
+ else
+ DB_SET_DBT(tmpfp.dir, NULL, 0);
+ DB_SET_DBT(tmpfp.info, names[i], strlen(names[i]) + 1);
+ DB_SET_DBT(tmpfp.uid, uid, DB_FILE_ID_LEN);
+retry: avail = (size_t)(&context->buf[context->size] -
+ context->fillptr);
+ if (context->version < DB_REPVERSION_53)
+ /*
+ * It is safe to cast to the old struct
+ * because the first part of the current
+ * struct matches the old struct.
+ */
+ ret = __rep_fileinfo_v6_marshal(env, context->version,
+ (__rep_fileinfo_v6_args *)&tmpfp,
+ context->fillptr, avail, &len);
+ else
+ ret = __rep_fileinfo_marshal(env, context->version,
+ &tmpfp, context->fillptr, avail, &len);
+ if (ret == ENOMEM) {
+ /*
+ * Here, 'len' is the total space in use in the buffer.
+ */
+ len = (size_t)(context->fillptr - context->buf);
+ context->size *= 2;
+
+ if ((ret = __os_realloc(env,
+ context->size, &context->buf)) != 0)
+ goto err;
+ context->fillptr = context->buf + len;
+
+ /*
+ * Now that we've reallocated the space, try to
+ * store it again.
+ */
+ goto retry;
+ }
+ /*
+ * Here, 'len' (still) holds the length of the marshaled
+ * information about the current file (as filled in by the last
+ * call to __rep_fileinfo_marshal()).
+ */
+ context->fillptr += len;
+ }
+err:
+ __os_dirfree(env, names, cnt);
+ return (ret);
+}
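+
+/*
+ * Illustrative only: the buffer-doubling pattern used by __rep_walk_dir,
+ * reduced to its essentials. A marshal call failing with ENOMEM triggers
+ * a doubling realloc, after which the append is retried at the preserved
+ * fill offset. The helper name is hypothetical.
+ */
+#if 0
+static int
+example_append_fileinfo(env, context, fp)
+ ENV *env;
+ FILE_LIST_CTX *context;
+ __rep_fileinfo_args *fp;
+{
+ size_t avail, len;
+ int ret;
+
+retry: avail = (size_t)(&context->buf[context->size] - context->fillptr);
+ ret = __rep_fileinfo_marshal(env, context->version,
+ fp, context->fillptr, avail, &len);
+ if (ret == ENOMEM) {
+ /* Remember how much of the buffer is in use. */
+ len = (size_t)(context->fillptr - context->buf);
+ context->size *= 2;
+ if ((ret = __os_realloc(env,
+ context->size, &context->buf)) != 0)
+ return (ret);
+ context->fillptr = context->buf + len;
+ goto retry;
+ }
+ if (ret == 0)
+ context->fillptr += len;
+ return (ret);
+}
+#endif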
+
+/*
+ * Returns a boolean to indicate whether a file/database with the given name
+ * should be included in internal init.
+ */
+static int
+__rep_is_replicated_db(name, dir)
+ const char *name, *dir;
+{
+ if (strcmp(name, "DB_CONFIG") == 0 || strcmp(name, "pragma") == 0)
+ return (0);
+ if (IS_LOG_FILE(name))
+ return (0);
+
+ /*
+ * Remaining things that don't have a "__db" prefix are eligible.
+ */
+ if (!IS_DB_FILE(name))
+ return (1);
+
+ /* Here, we know we have a "__db" name. */
+ if (name[sizeof(DB_REGION_PREFIX) - 1] == 'p')
+ return (1); /* Partition files are eligible. */
+
+ /*
+ * Replicated system databases are eligible. When on disk, both DBs are
+ * sub-databases of a single database file.
+ */
+ if (dir == NULL) {
+ if (strcmp(name, REPMEMBERSHIP) == 0 ||
+ strcmp(name, REPLSNHIST) == 0)
+ return (1);
+ } else {
+ if (IS_REP_FILE(name))
+ return (1);
+ }
+
+ /* Some other "__db" named file. */
+ return (0);
+}
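+
+/*
+ * Illustrative expectations for __rep_is_replicated_db, using hypothetical
+ * names (partition files carry the "__dbp" prefix):
+ *
+ * "customers.db" -> 1 (ordinary user database)
+ * "DB_CONFIG" -> 0 (configuration file)
+ * "log.0000000001" -> 0 (log file)
+ * "__dbp.custs.db.000" -> 1 (partition file)
+ */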
+
+/*
+ * Check whether the given uid is already present in the list of files being
+ * built in the context buffer. A return of DB_KEYEXIST means it is.
+ */
+static int
+__rep_check_uid(env, rfp, uid)
+ ENV *env;
+ __rep_fileinfo_args *rfp;
+ void *uid;
+{
+ int ret;
+
+ ret = 0;
+ if (memcmp(rfp->uid.data, uid, DB_FILE_ID_LEN) == 0) {
+ VPRINT(env, (env, DB_VERB_REP_SYNC,
+ "Check_uid: Found matching file."));
+ ret = DB_KEYEXIST;
+ }
+ return (ret);
+}
+
+static int
+__rep_get_fileinfo(env, file, subdb, rfp, uid)
+ ENV *env;
+ const char *file, *subdb;
+ __rep_fileinfo_args *rfp;
+ u_int8_t *uid;
+{
+ DB *dbp;
+ DBC *dbc;
+ DBMETA *dbmeta;
+ DB_THREAD_INFO *ip;
+ PAGE *pagep;
+ int lorder, ret, t_ret;
+
+ dbp = NULL;
+ dbc = NULL;
+ pagep = NULL;
+
+ ENV_GET_THREAD_INFO(env, ip);
+
+ if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+ goto err;
+ /*
+ * Use DB_AM_RECOVER to prevent getting locks, otherwise exclusive
+ * database handles would block the master from handling UPDATE_REQ.
+ */
+ F_SET(dbp, DB_AM_RECOVER);
+ if ((ret = __db_open(dbp, ip, NULL, file, subdb, DB_UNKNOWN,
+ DB_RDONLY | (F_ISSET(env, ENV_THREAD) ? DB_THREAD : 0),
+ 0, PGNO_BASE_MD)) != 0)
+ goto err;
+
+ if ((ret = __db_cursor(dbp, ip, NULL, &dbc, 0)) != 0)
+ goto err;
+ if ((ret = __memp_fget(dbp->mpf, &dbp->meta_pgno, ip, dbc->txn,
+ 0, &pagep)) != 0)
+ goto err;
+ /*
+ * We have the meta page. Set up our information.
+ */
+ dbmeta = (DBMETA *)pagep;
+ rfp->pgno = 0;
+ /*
+ * Queue is a special-case. We need to set max_pgno to 0 so that
+ * the client can compute the pages from the meta-data.
+ */
+ if (dbp->type == DB_QUEUE)
+ rfp->max_pgno = 0;
+ else
+ rfp->max_pgno = dbmeta->last_pgno;
+ rfp->pgsize = dbp->pgsize;
+ memcpy(uid, dbp->fileid, DB_FILE_ID_LEN);
+ rfp->type = (u_int32_t)dbp->type;
+ rfp->db_flags = dbp->flags;
+ rfp->finfo_flags = 0;
+ /*
+ * Send the lorder of this database.
+ */
+ (void)__db_get_lorder(dbp, &lorder);
+ if (lorder == 1234)
+ FLD_SET(rfp->finfo_flags, REPINFO_DB_LITTLEENDIAN);
+ else
+ FLD_CLR(rfp->finfo_flags, REPINFO_DB_LITTLEENDIAN);
+
+ ret = __memp_fput(dbp->mpf, ip, pagep, dbc->priority);
+ pagep = NULL;
+ if (ret != 0)
+ goto err;
+err:
+ /*
+ * Check status of pagep in case any new error paths out leave
+ * a valid page. All current paths out have pagep NULL.
+ */
+ DB_ASSERT(env, pagep == NULL);
+ if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (dbp != NULL && (t_ret = __db_close(dbp, NULL, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
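+
+/*
+ * Aside (illustrative, not part of the original change): Berkeley DB
+ * encodes byte order as the integers 1234 (little-endian) and 4321
+ * (big-endian), the same values DB->set_lorder() accepts, which is why
+ * the lorder == 1234 test above maps directly onto
+ * REPINFO_DB_LITTLEENDIAN.
+ */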
+
+/*
+ * __rep_page_req
+ * Process a page_req and send the page information to the client.
+ *
+ * PUBLIC: int __rep_page_req __P((ENV *,
+ * PUBLIC: DB_THREAD_INFO *, int, __rep_control_args *, DBT *));
+ */
+int
+__rep_page_req(env, ip, eid, rp, rec)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ int eid;
+ __rep_control_args *rp;
+ DBT *rec;
+{
+ __rep_fileinfo_args *msgfp, msgf;
+ __rep_fileinfo_v6_args *msgfpv6;
+ DB_MPOOLFILE *mpf;
+ DB_REP *db_rep;
+ REP *rep;
+ int ret, t_ret;
+ u_int8_t *next;
+ void *msgfree;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ if (rp->rep_version < DB_REPVERSION_53) {
+ /*
+ * Build a current struct by copying in the older
+ * version struct and then setting up the data_dir.
+ * This is safe because all old fields are in the
+ * same location in the current struct.
+ */
+ if ((ret = __rep_fileinfo_v6_unmarshal(env, rp->rep_version,
+ &msgfpv6, rec->data, rec->size, &next)) != 0)
+ return (ret);
+ memcpy(&msgf, msgfpv6, sizeof(__rep_fileinfo_v6_args));
+ msgf.dir.data = NULL;
+ msgf.dir.size = 0;
+ msgfp = &msgf;
+ msgfree = msgfpv6;
+ } else {
+ if ((ret = __rep_fileinfo_unmarshal(env, rp->rep_version,
+ &msgfp, rec->data, rec->size, &next)) != 0)
+ return (ret);
+ msgfree = msgfp;
+ }
+
+ DB_TEST_SET(env->test_abort, DB_TEST_NO_PAGES);
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "page_req: file %d page %lu to %lu",
+ msgfp->filenum, (u_long)msgfp->pgno, (u_long)msgfp->max_pgno));
+
+ /*
+ * We need to open the file and then send its pages.
+ * If we cannot open the file, we send REP_FILE_FAIL.
+ */
+ VPRINT(env, (env, DB_VERB_REP_SYNC,
+ "page_req: Open %d via mpf_open", msgfp->filenum));
+ if ((ret = __rep_mpf_open(env, &mpf, msgfp, 0)) != 0) {
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "page_req: Open %d failed", msgfp->filenum));
+ if (F_ISSET(rep, REP_F_MASTER))
+ (void)__rep_send_message(env, eid, REP_FILE_FAIL,
+ NULL, rec, 0, 0);
+ else
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+
+ ret = __rep_page_sendpages(env, ip, eid, rp, msgfp, mpf, NULL);
+ t_ret = __memp_fclose(mpf, 0);
+ if (ret == 0 && t_ret != 0)
+ ret = t_ret;
+err:
+DB_TEST_RECOVERY_LABEL
+ __os_free(env, msgfree);
+ return (ret);
+}
+
+static int
+__rep_page_sendpages(env, ip, eid, rp, msgfp, mpf, dbp)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ int eid;
+ __rep_control_args *rp;
+ __rep_fileinfo_args *msgfp;
+ DB_MPOOLFILE *mpf;
+ DB *dbp;
+{
+ DB *qdbp;
+ DBC *qdbc;
+ DBT msgdbt;
+ DB_LOG *dblp;
+ DB_LSN lsn;
+ DB_REP *db_rep;
+ PAGE *pagep;
+ REP *rep;
+ REP_BULK bulk;
+ REP_THROTTLE repth;
+ db_pgno_t p;
+ uintptr_t bulkoff;
+ size_t len, msgsz;
+ u_int32_t bulkflags, use_bulk;
+ int opened, ret, t_ret;
+ u_int8_t *buf;
+
+ dblp = env->lg_handle;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ opened = 0;
+ t_ret = 0;
+ qdbp = NULL;
+ qdbc = NULL;
+ buf = NULL;
+ bulk.addr = NULL;
+ use_bulk = FLD_ISSET(rep->config, REP_C_BULK);
+ if (msgfp->type == (u_int32_t)DB_QUEUE) {
+ if (dbp == NULL) {
+ if ((ret = __db_create_internal(&qdbp, env, 0)) != 0)
+ goto err;
+ /*
+ * We need to check whether this is in-memory so that
+ * we pass the name correctly as either the file or
+ * the database name.
+ */
+ if ((ret = __db_open(qdbp, ip, NULL,
+ FLD_ISSET(msgfp->db_flags, DB_AM_INMEM) ?
+ NULL : msgfp->info.data,
+ FLD_ISSET(msgfp->db_flags, DB_AM_INMEM) ?
+ msgfp->info.data : NULL,
+ DB_UNKNOWN,
+ DB_RDONLY | (F_ISSET(env, ENV_THREAD) ? DB_THREAD : 0),
+ 0, PGNO_BASE_MD)) != 0)
+ goto err;
+ opened = 1;
+ } else
+ qdbp = dbp;
+ if ((ret = __db_cursor(qdbp, ip, NULL, &qdbc, 0)) != 0)
+ goto err;
+ }
+ msgsz = __REP_FILEINFO_SIZE + DB_FILE_ID_LEN + msgfp->pgsize +
+ msgfp->dir.size;
+ if ((ret = __os_calloc(env, 1, msgsz, &buf)) != 0)
+ goto err;
+ memset(&msgdbt, 0, sizeof(msgdbt));
+ VPRINT(env, (env, DB_VERB_REP_SYNC,
+ "sendpages: file %d page %lu to %lu",
+ msgfp->filenum, (u_long)msgfp->pgno, (u_long)msgfp->max_pgno));
+ memset(&repth, 0, sizeof(repth));
+ /*
+ * If we're doing bulk transfer, allocate a bulk buffer to put our
+ * pages in. We still need to initialize the throttle info
+ * because if we encounter a page larger than our entire bulk
+ * buffer, we need to send it as a singleton.
+ *
+ * Use a local var so that we don't need to worry if someone else
+ * turns on/off bulk in the middle of our call here.
+ */
+ if (use_bulk && (ret = __rep_bulk_alloc(env, &bulk, eid,
+ &bulkoff, &bulkflags, REP_BULK_PAGE)) != 0)
+ goto err;
+ REP_SYSTEM_LOCK(env);
+ repth.gbytes = rep->gbytes;
+ repth.bytes = rep->bytes;
+ repth.type = REP_PAGE;
+ repth.data_dbt = &msgdbt;
+ REP_SYSTEM_UNLOCK(env);
+
+ for (p = msgfp->pgno; p <= msgfp->max_pgno; p++) {
+ if (msgfp->type == (u_int32_t)DB_QUEUE && p != 0) {
+ /*
+ * If queue returns ENOENT, or if queue support is not configured,
+ * convert it into DB_PAGE_NOTFOUND. Queue might return
+ * ENOENT if an entire extent file does not exist in the
+ * "middle" of the database.
+ */
+#ifdef HAVE_QUEUE
+ if ((ret = __qam_fget(qdbc, &p, 0, &pagep)) == ENOENT)
+#endif
+ ret = DB_PAGE_NOTFOUND;
+ } else
+ ret = __memp_fget(mpf, &p, ip, NULL, 0, &pagep);
+ msgfp->pgno = p;
+ if (ret == DB_PAGE_NOTFOUND) {
+ if (F_ISSET(rep, REP_F_MASTER)) {
+ ret = 0;
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "sendpages: PAGE_FAIL on page %lu",
+ (u_long)p));
+ if (rp->rep_version < DB_REPVERSION_53)
+ /*
+ * It is safe to cast to the old struct
+ * because the first part of the current
+ * struct matches the old struct.
+ */
+ ret = __rep_fileinfo_v6_marshal(env,
+ rp->rep_version,
+ (__rep_fileinfo_v6_args *)msgfp,
+ buf, msgsz, &len);
+ else
+ ret = __rep_fileinfo_marshal(env,
+ rp->rep_version, msgfp, buf,
+ msgsz, &len);
+ if (ret != 0)
+ goto err;
+ LOG_SYSTEM_LOCK(env);
+ lsn = ((LOG *)dblp->reginfo.primary)->lsn;
+ LOG_SYSTEM_UNLOCK(env);
+ DB_SET_DBT(msgdbt, buf, len);
+ (void)__rep_send_message(env, eid,
+ REP_PAGE_FAIL, &lsn, &msgdbt, 0, 0);
+ continue;
+ } else
+ ret = DB_NOTFOUND;
+ goto err;
+ } else if (ret != 0)
+ goto err;
+ else
+ DB_SET_DBT(msgfp->info, pagep, msgfp->pgsize);
+ len = 0;
+ /*
+ * Send along an indication of the byte order of this mpool
+ * page. Since mpool always keeps pages in the native byte
+ * order of the local environment, this is simply my
+ * environment's byte order.
+ *
+ * Since pages can be served from a variety of sites when using
+ * client-to-client synchronization, the receiving client needs
+ * to know the byte order of each page independently.
+ */
+ if (F_ISSET(env, ENV_LITTLEENDIAN))
+ FLD_SET(msgfp->finfo_flags, REPINFO_PG_LITTLEENDIAN);
+ else
+ FLD_CLR(msgfp->finfo_flags, REPINFO_PG_LITTLEENDIAN);
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "sendpages: %lu, page lsn [%lu][%lu]", (u_long)p,
+ (u_long)pagep->lsn.file, (u_long)pagep->lsn.offset));
+ if (rp->rep_version < DB_REPVERSION_53)
+ /*
+ * It is safe to cast to the old struct
+ * because the first part of the current
+ * struct matches the old struct.
+ */
+ ret = __rep_fileinfo_v6_marshal(env,
+ rp->rep_version,
+ (__rep_fileinfo_v6_args *)msgfp,
+ buf, msgsz, &len);
+ else
+ ret = __rep_fileinfo_marshal(env, rp->rep_version,
+ msgfp, buf, msgsz, &len);
+ if (msgfp->type != (u_int32_t)DB_QUEUE || p == 0)
+ t_ret = __memp_fput(mpf,
+ ip, pagep, DB_PRIORITY_UNCHANGED);
+#ifdef HAVE_QUEUE
+ else
+ /*
+ * We don't need an #else for HAVE_QUEUE here because if
+ * we're not compiled with queue, then we're guaranteed
+ * to have set REP_PAGE_FAIL above.
+ */
+ t_ret = __qam_fput(qdbc, p, pagep, qdbp->priority);
+#endif
+ if (t_ret != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err;
+
+ DB_ASSERT(env, len <= msgsz);
+ DB_SET_DBT(msgdbt, buf, len);
+
+ LOG_SYSTEM_LOCK(env);
+ repth.lsn = ((LOG *)dblp->reginfo.primary)->lsn;
+ LOG_SYSTEM_UNLOCK(env);
+ /*
+ * If we are configured for bulk, try to send this as a bulk
+ * request. If not configured, or it is too big for bulk,
+ * then just send normally.
+ */
+ if (use_bulk)
+ ret = __rep_bulk_message(env, &bulk, &repth,
+ &repth.lsn, &msgdbt, 0);
+ if (!use_bulk || ret == DB_REP_BULKOVF)
+ ret = __rep_send_throttle(env, eid, &repth, 0, 0);
+ VPRINT(env, (env, DB_VERB_REP_SYNC,
+ "sendpages: %lu, lsn [%lu][%lu]", (u_long)p,
+ (u_long)repth.lsn.file, (u_long)repth.lsn.offset));
+ /*
+ * If we have REP_PAGE_MORE we need to break this loop.
+ * Otherwise, with REP_PAGE, we keep going.
+ */
+ if (repth.type == REP_PAGE_MORE || ret != 0) {
+ /* Ignore send failure, except to break the loop. */
+ if (ret == DB_REP_UNAVAIL)
+ ret = 0;
+ break;
+ }
+ }
+
+err:
+ /*
+ * We're done; force out whatever remains in the bulk buffer and
+ * free it.
+ */
+ if (use_bulk && bulk.addr != NULL &&
+ (t_ret = __rep_bulk_free(env, &bulk, 0)) != 0 && ret == 0 &&
+ t_ret != DB_REP_UNAVAIL)
+ ret = t_ret;
+ if (qdbc != NULL && (t_ret = __dbc_close(qdbc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (opened && (t_ret = __db_close(qdbp, NULL, DB_NOSYNC)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ if (buf != NULL)
+ __os_free(env, buf);
+ return (ret);
+}
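+
+/*
+ * Illustrative note on the send decision in the page loop above: when bulk
+ * transfer is configured, each marshaled page is first offered to the bulk
+ * buffer via __rep_bulk_message; a DB_REP_BULKOVF return means the page
+ * cannot fit even in an empty bulk buffer, so it falls back to a throttled
+ * singleton send via __rep_send_throttle.
+ */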
+
+/*
+ * __rep_update_setup
+ * Process an UPDATE message and set up internal init with its file information.
+ *
+ * PUBLIC: int __rep_update_setup __P((ENV *, int, __rep_control_args *,
+ * PUBLIC: DBT *, time_t, DB_LSN *));
+ */
+int
+__rep_update_setup(env, eid, rp, rec, savetime, lsn)
+ ENV *env;
+ int eid;
+ __rep_control_args *rp;
+ DBT *rec;
+ time_t savetime;
+ DB_LSN *lsn;
+{
+ DB_LOG *dblp;
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ LOG *lp;
+ REGENV *renv;
+ REGINFO *infop;
+ REP *rep;
+ __rep_update_args *rup;
+ DB_LSN verify_lsn;
+ int clientdb_locked, *origbuf, ret;
+ u_int32_t count, size;
+ u_int8_t *end, *next;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ infop = env->reginfo;
+ renv = infop->primary;
+ clientdb_locked = 0;
+ ret = 0;
+
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ verify_lsn = lp->verify_lsn;
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ REP_SYSTEM_LOCK(env);
+ if (rep->sync_state != SYNC_UPDATE || IN_ELECTION(rep)) {
+ REP_SYSTEM_UNLOCK(env);
+ return (0);
+ }
+ rep->sync_state = SYNC_OFF;
+
+ if ((ret = __rep_update_unmarshal(env, rp->rep_version,
+ &rup, rec->data, rec->size, &next)) != 0) {
+ /* Do not return with REP_SYSTEM_LOCK still held. */
+ REP_SYSTEM_UNLOCK(env);
+ return (ret);
+ }
+ DB_ASSERT(env, next == FIRST_FILE_PTR((u_int8_t*)rec->data));
+
+ /*
+ * If we're doing an abbreviated internal init, it's because we found a
+ * sync point but we needed to materialize any NIMDBs. However, if we
+ * now see that there are no NIMDBs we can just skip to verify_match,
+ * just as we would have done if we had already loaded the NIMDBs. In
+ * other words, if there are no NIMDBs, then I can trivially say that
+ * I've already loaded all of them! The whole abbreviated internal init
+ * turns out not to have been necessary after all.
+ */
+ if (F_ISSET(rep, REP_F_ABBREVIATED)) {
+ count = rup->num_files;
+ end = &((u_int8_t*)rec->data)[rec->size];
+ size = (u_int32_t)(end - next);
+ if ((ret = __rep_walk_filelist(env, rp->rep_version,
+ next, size, count, __rep_find_inmem, NULL)) == 0) {
+ /*
+ * Not found: there are no NIMDBs on the list. Revert
+ * to VERIFY state, so that we can pick up where we left
+ * off, except that from now on (i.e., future master
+ * changes) we can skip checking for NIMDBs if we find a
+ * sync point.
+ */
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "UPDATE msg reveals no NIMDBs"));
+ F_SET(rep, REP_F_NIMDBS_LOADED);
+ rep->sync_state = SYNC_VERIFY;
+ F_CLR(rep, REP_F_ABBREVIATED);
+ ret = __rep_notify_threads(env, AWAIT_NIMDB);
+
+ REP_SYSTEM_UNLOCK(env);
+ if (ret == 0 && (ret = __rep_verify_match(env,
+ &verify_lsn, savetime)) == DB_REP_WOULDROLLBACK)
+ *lsn = verify_lsn;
+ __os_free(env, rup);
+ return (ret);
+ } else if (ret != DB_KEYEXIST)
+ goto err;
+ }
+
+ /*
+ * We know we're the first to come in here due to the
+ * SYNC_UPDATE state.
+ */
+ rep->sync_state = SYNC_PAGE;
+ /*
+ * We should not ever be in internal init with a lease granted.
+ */
+ DB_ASSERT(env,
+ !IS_USING_LEASES(env) || __rep_islease_granted(env) == 0);
+
+ /*
+ * We do not clear REP_LOCKOUT_* in this code.
+ * We'll eventually call the normal __rep_verify_match recovery
+ * code and that will clear all the flags and allow others to
+ * proceed. We lockout both the messages and API here.
+ * We lockout messages briefly because we are about to reset
+ * all our LSNs and we do not want another thread possibly
+ * using/needing those. We have to lockout the API for
+ * the duration of internal init.
+ */
+ if ((ret = __rep_lockout_msg(env, rep, 1)) != 0)
+ goto err;
+
+ if ((ret = __rep_lockout_api(env, rep)) != 0)
+ goto err;
+ /*
+ * We need to update the timestamp and kill any open handles
+ * on this client. The files are changing completely.
+ */
+ (void)time(&renv->rep_timestamp);
+
+ REP_SYSTEM_UNLOCK(env);
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ __os_gettime(env, &lp->rcvd_ts, 1);
+ lp->wait_ts = rep->request_gap;
+ ZERO_LSN(lp->ready_lsn);
+ ZERO_LSN(lp->verify_lsn);
+ ZERO_LSN(lp->prev_ckp);
+ ZERO_LSN(lp->waiting_lsn);
+ ZERO_LSN(lp->max_wait_lsn);
+ ZERO_LSN(lp->max_perm_lsn);
+ if (db_rep->rep_db == NULL)
+ ret = __rep_client_dbinit(env, 0, REP_DB);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ if (ret != 0)
+ goto err_nolock;
+
+ /*
+ * We need to empty out any old log records that might be in the
+ * temp database.
+ */
+ ENV_GET_THREAD_INFO(env, ip);
+ if ((ret = __db_truncate(db_rep->rep_db, ip, NULL, &count)) != 0)
+ goto err_nolock;
+ STAT_SET(env,
+ rep, log_queued, rep->stat.st_log_queued, 0, &lp->ready_lsn);
+
+ REP_SYSTEM_LOCK(env);
+ if (F_ISSET(rep, REP_F_ABBREVIATED)) {
+ /*
+ * For an abbreviated internal init, the place from which we'll
+ * want to request master's logs after (NIMDB) pages are loaded
+ * is precisely the sync point we found during VERIFY. We'll
+ * roll back to there in a moment.
+ *
+ * We don't need first_vers, because it's only used with
+ * __log_newfile, which only happens with non-ABBREVIATED
+ * internal init.
+ */
+ rep->first_lsn = verify_lsn;
+ } else {
+ /*
+ * We will remove all logs we have so we need to request
+ * from the master's beginning.
+ */
+ rep->first_lsn = rup->first_lsn;
+ rep->first_vers = rup->first_vers;
+ }
+ rep->last_lsn = rp->lsn;
+ rep->nfiles = rup->num_files;
+
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "Update setup for %d files.", rep->nfiles));
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "Update setup: First LSN [%lu][%lu].",
+ (u_long)rep->first_lsn.file, (u_long)rep->first_lsn.offset));
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "Update setup: Last LSN [%lu][%lu]",
+ (u_long)rep->last_lsn.file, (u_long)rep->last_lsn.offset));
+
+ if (rep->nfiles > 0) {
+ rep->infoversion = rp->rep_version;
+ rep->originfolen = rep->infolen =
+ rec->size - __REP_UPDATE_SIZE;
+ MUTEX_LOCK(env, renv->mtx_regenv);
+ ret = __env_alloc(infop, (size_t)rep->infolen, &origbuf);
+ MUTEX_UNLOCK(env, renv->mtx_regenv);
+ if (ret != 0)
+ goto err;
+ else
+ rep->originfo_off = R_OFFSET(infop, origbuf);
+ memcpy(R_ADDR(infop, rep->originfo_off),
+ FIRST_FILE_PTR((u_int8_t*)rec->data), rep->infolen);
+ }
+
+ /*
+ * Clear the decks to make room for the logs and databases that we will
+ * request as part of this internal init. For a normal, full internal
+ * init, that means all logs and databases. For an abbreviated internal
+ * init, it means only the NIMDBs, and only that portion of the log
+ * after the sync point.
+ */
+ if (F_ISSET(rep, REP_F_ABBREVIATED)) {
+ /*
+ * Note that in order to pare the log back to the sync point, we
+ * can't just crudely hack it off there. We need to make sure
+ * that pages in regular databases get rolled back to a state
+ * consistent with that sync point. So we have to do a real
+ * recovery step.
+ */
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "Will roll back for abbreviated internal init"));
+ if ((ret = __rep_rollback(env, &rep->first_lsn)) != 0) {
+ if (ret == DB_REP_WOULDROLLBACK) {
+ DB_ASSERT(env, LOG_COMPARE(&rep->first_lsn,
+ &verify_lsn) == 0);
+ *lsn = verify_lsn;
+ }
+ goto err;
+ }
+ ret = __rep_remove_nimdbs(env);
+ } else
+ ret = __rep_remove_all(env, rp->rep_version, rec);
+ if (ret != 0)
+ goto err;
+ FLD_CLR(rep->lockout_flags, REP_LOCKOUT_MSG);
+
+ REP_SYSTEM_UNLOCK(env);
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ clientdb_locked = 1;
+ REP_SYSTEM_LOCK(env);
+ rep->curfile = 0;
+ ret = __rep_nextfile(env, eid, rep);
+ if (ret != 0)
+ goto err;
+
+ if (0) {
+err_nolock: REP_SYSTEM_LOCK(env);
+ }
+
+err: /*
+ * If we get an error, we cannot leave ourselves in the SYNC_PAGE
+ * state because we have no file information. That also means undoing
+ * the rep_lockout. We need to move back to the SYNC_UPDATE stage.
+ * In the non-error path, we will have already cleared LOCKOUT_MSG,
+ * but it doesn't hurt to clear it again.
+ */
+ FLD_CLR(rep->lockout_flags, REP_LOCKOUT_MSG);
+ if (ret != 0) {
+ if (rep->originfo_off != INVALID_ROFF) {
+ MUTEX_LOCK(env, renv->mtx_regenv);
+ __env_alloc_free(infop,
+ R_ADDR(infop, rep->originfo_off));
+ MUTEX_UNLOCK(env, renv->mtx_regenv);
+ rep->originfo_off = INVALID_ROFF;
+ }
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "Update_setup: Error: Clear PAGE, set UPDATE again. %s",
+ db_strerror(ret)));
+ rep->sync_state = SYNC_UPDATE;
+ CLR_LOCKOUT_BDB(rep);
+ }
+ REP_SYSTEM_UNLOCK(env);
+ if (clientdb_locked)
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ __os_free(env, rup);
+ return (ret);
+}
+
+static int
+__rep_find_inmem(env, rfp, unused)
+ ENV *env;
+ __rep_fileinfo_args *rfp;
+ void *unused;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(unused, NULL);
+
+ return (FLD_ISSET(rfp->db_flags, DB_AM_INMEM) ? DB_KEYEXIST : 0);
+}
+
+/*
+ * Removes any currently existing NIMDBs. We do this at the beginning of
+ * abbreviated internal init, when any existing NIMDBs should be intact, so
+ * walk_dir should produce reliable results.
+ */
+static int
+__rep_remove_nimdbs(env)
+ ENV *env;
+{
+ FILE_LIST_CTX context;
+ int ret;
+
+ if ((ret = __os_calloc(env, 1, MEGABYTE, &context.buf)) != 0)
+ return (ret);
+ context.size = MEGABYTE;
+ context.count = 0;
+ context.fillptr = context.buf;
+ context.version = DB_REPVERSION;
+
+ /* NB: "NULL" asks walk_dir to consider only in-memory DBs */
+ if ((ret = __rep_walk_dir(env, NULL, NULL, &context)) != 0)
+ goto out;
+
+ if ((ret = __rep_closefiles(env)) != 0)
+ goto out;
+
+ ret = __rep_walk_filelist(env, context.version, context.buf,
+ context.size, context.count, __rep_remove_file, NULL);
+
+out:
+ __os_free(env, context.buf);
+ return (ret);
+}
+
+/*
+ * Removes all existing logs and databases, at the start of internal init. But
+ * before we do, write a list of the databases onto the init file, so that in
+ * case we crash in the middle, we'll know how to resume when we restart.
+ * Finally, also write into the init file the UPDATE message from the master (in
+ * the "rec" DBT), which includes the (new) list of databases we intend to
+ * request copies of (again, so that we know what to do if we crash in the
+ * middle).
+ *
+ * For the sake of simplicity, these database lists are in the form of an UPDATE
+ * message (since we already have the mechanisms in place), even though strictly
+ * speaking that contains more information than we really need to store.
+ *
+ * !!! Must be called with the REP_SYSTEM_LOCK held.
+ */
+static int
+__rep_remove_all(env, msg_version, rec)
+ ENV *env;
+ u_int32_t msg_version;
+ DBT *rec;
+{
+ FILE_LIST_CTX context;
+ __rep_update_args u_args;
+ DB_FH *fhp;
+ DB_REP *db_rep;
+#ifdef HAVE_REPLICATION_THREADS
+ DBT dbt;
+#endif
+ REP *rep;
+ size_t cnt, updlen;
+ u_int32_t bufsz, fvers, mvers, zero;
+ int ret, t_ret;
+ char *fname;
+
+ fname = NULL;
+ fhp = NULL;
+#ifdef HAVE_REPLICATION_THREADS
+ dbt.data = NULL;
+#endif
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ /*
+ * 1. Get list of databases currently present at this client, which we
+ * intend to remove.
+ */
+ if ((ret = __os_calloc(env, 1, MEGABYTE, &context.buf)) != 0)
+ return (ret);
+ context.size = MEGABYTE;
+ context.count = 0;
+ context.version = DB_REPVERSION;
+
+ /* Reserve space for the marshaled update_args. */
+ context.fillptr = FIRST_FILE_PTR(context.buf);
+
+ if ((ret = __rep_find_dbs(env, &context)) != 0)
+ goto out;
+ ZERO_LSN(u_args.first_lsn);
+ u_args.first_vers = 0;
+ u_args.num_files = context.count;
+ if ((ret = __rep_update_marshal(env, DB_REPVERSION,
+ &u_args, context.buf, __REP_UPDATE_SIZE, &updlen)) != 0)
+ goto out;
+ DB_ASSERT(env, updlen == __REP_UPDATE_SIZE);
+
+ /*
+ * 2. Before removing anything, safe-store the database list, so that in
+ * case we crash before we've removed them all, when we restart we
+ * can clean up what we were doing. Only write the database list to
+ * a file if not running in-memory replication.
+ *
+ * The original version of the file contains:
+ * data1 size (4 bytes)
+ * data1
+ * data2 size (possibly) (4 bytes)
+ * data2 (possibly)
+ *
+ * As of 4.7 the file has the following form:
+ * 0 (4 bytes - to indicate a new style file)
+ * file version (4 bytes)
+ * data1 version (4 bytes)
+ * data1 size (4 bytes)
+ * data1
+ * data2 version (possibly) (4 bytes)
+ * data2 size (possibly) (4 bytes)
+ * data2 (possibly)
+ */
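+ /*
+ * Illustrative only: the 4.7-and-later layout above written out as
+ * annotated fields (the file is a sequence of raw __os_write calls,
+ * not a packed struct):
+ *
+ * u_int32_t zero; 0, marks a new-style file
+ * u_int32_t fvers; file version (REP_INITVERSION)
+ * u_int32_t mvers1; message version of data1
+ * u_int32_t size1; byte count of data1
+ * u_int8_t data1[size1]; current database list
+ * u_int32_t mvers2; message version of data2 (if any)
+ * u_int32_t size2; byte count of data2 (if any)
+ * u_int8_t data2[size2]; master's UPDATE message (if any)
+ */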
+ if (!FLD_ISSET(rep->config, REP_C_INMEM)) {
+ if ((ret = __db_appname(env,
+ DB_APP_META, REP_INITNAME, NULL, &fname)) != 0)
+ goto out;
+ /* Sanity check that the write size fits into 32 bits. */
+ DB_ASSERT(env, (size_t)(context.fillptr - context.buf) ==
+ (u_int32_t)(context.fillptr - context.buf));
+ bufsz = (u_int32_t)(context.fillptr - context.buf);
+
+ /*
+ * (Short writes aren't possible, so we don't have to verify
+ * 'cnt'.) This first list is generated internally, so it is
+ * always in the form of the current message version.
+ */
+ zero = 0;
+ fvers = REP_INITVERSION;
+ mvers = DB_REPVERSION;
+ if ((ret = __os_open(env, fname, 0,
+ DB_OSO_CREATE | DB_OSO_TRUNC, DB_MODE_600, &fhp)) != 0 ||
+ (ret =
+ __os_write(env, fhp, &zero, sizeof(zero), &cnt)) != 0 ||
+ (ret =
+ __os_write(env, fhp, &fvers, sizeof(fvers), &cnt)) != 0 ||
+ (ret =
+ __os_write(env, fhp, &mvers, sizeof(mvers), &cnt)) != 0 ||
+ (ret =
+ __os_write(env, fhp, &bufsz, sizeof(bufsz), &cnt)) != 0 ||
+ (ret =
+ __os_write(env, fhp, context.buf, bufsz, &cnt)) != 0 ||
+ (ret = __os_fsync(env, fhp)) != 0) {
+ __db_err(env, ret, "%s", fname);
+ goto out;
+ }
+ }
+
+ /*
+ * 3. Go ahead and remove logs and databases. The databases get removed
+ * according to the list we just finished safe-storing.
+ *
+ * Clearing NIMDBS_LOADED might not really be necessary, since once
+ * we've committed to removing all there's no chance of doing an
+ * abbreviated internal init. This just keeps us honest.
+ */
+ if ((ret = __rep_remove_logs(env)) != 0)
+ goto out;
+ if ((ret = __rep_closefiles(env)) != 0)
+ goto out;
+ F_CLR(rep, REP_F_NIMDBS_LOADED);
+ if ((ret = __rep_walk_filelist(env, context.version,
+ FIRST_FILE_PTR(context.buf), context.size,
+ context.count, __rep_remove_file, NULL)) != 0)
+ goto out;
+
+ /*
+ * 4. Safe-store the (new) list of database files we intend to copy from
+ * the master (again, so that in case we crash before we're finished
+ * doing so, we'll have enough information to clean up and start over
+ * again). This list is the list from the master, so it uses
+ * the message version. Only write to the file if not running
+ * in-memory replication.
+ */
+ if (!FLD_ISSET(rep->config, REP_C_INMEM)) {
+ mvers = msg_version;
+ if ((ret =
+ __os_write(env, fhp, &mvers, sizeof(mvers), &cnt)) != 0 ||
+ (ret = __os_write(env, fhp,
+ &rec->size, sizeof(rec->size), &cnt)) != 0 ||
+ (ret =
+ __os_write(env, fhp, rec->data, rec->size, &cnt)) != 0 ||
+ (ret = __os_fsync(env, fhp)) != 0) {
+ __db_err(env, ret, "%s", fname);
+ goto out;
+ }
+#ifdef HAVE_REPLICATION_THREADS
+ /* Invite repmgr to save any info it needs. */
+ if ((ret = __repmgr_init_save(env, &dbt)) != 0)
+ goto out;
+ if (dbt.size > 0 &&
+ ((ret = __os_write(env, fhp,
+ &dbt.size, sizeof(dbt.size), &cnt)) != 0 ||
+ (ret = __os_write(env, fhp,
+ dbt.data, dbt.size, &cnt)) != 0))
+ goto out;
+#endif
+ }
+
+out:
+#ifdef HAVE_REPLICATION_THREADS
+ if (dbt.data != NULL)
+ __os_free(env, dbt.data);
+#endif
+ if (fhp != NULL && (t_ret = __os_closehandle(env, fhp)) != 0 && ret == 0)
+ ret = t_ret;
+ if (fname != NULL)
+ __os_free(env, fname);
+ __os_free(env, context.buf);
+ return (ret);
+}
+
+/*
+ * __rep_remove_logs -
+ * Remove our logs to prepare for internal init.
+ */
+static int
+__rep_remove_logs(env)
+ ENV *env;
+{
+ DB_LOG *dblp;
+ DB_LSN lsn;
+ LOG *lp;
+ u_int32_t fnum, lastfile;
+ int ret;
+ char *name;
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ ret = 0;
+
+ /*
+ * Call memp_sync to flush any pages that might be in the log buffers
+ * and not on disk before we remove files on disk. If there were no
+ * dirty pages, the log isn't flushed. Yet the log buffers could still
+ * be dirty: __log_flush should take care of this rare situation.
+ */
+ if ((ret = __memp_sync_int(env,
+ NULL, 0, DB_SYNC_CACHE | DB_SYNC_INTERRUPT_OK, NULL, NULL)) != 0)
+ return (ret);
+ if ((ret = __log_flush(env, NULL)) != 0)
+ return (ret);
+ /*
+ * Forcibly remove existing log files or reset
+ * the in-memory log space.
+ */
+ if (lp->db_log_inmemory) {
+ ZERO_LSN(lsn);
+ if ((ret = __log_zero(env, &lsn)) != 0)
+ return (ret);
+ } else {
+ lastfile = lp->lsn.file;
+ for (fnum = 1; fnum <= lastfile; fnum++) {
+ if ((ret = __log_name(dblp, fnum, &name, NULL, 0)) != 0)
+ return (ret);
+ (void)time(&lp->timestamp);
+ (void)__os_unlink(env, name, 0);
+ __os_free(env, name);
+ }
+ }
+ return (0);
+}
+
+/*
+ * Removes a file during internal init. Assumes underlying subsystems are
+ * active; therefore, this can't be used for internal init crash recovery.
+ */
+static int
+__rep_remove_file(env, rfp, unused)
+ ENV *env;
+ __rep_fileinfo_args *rfp;
+ void *unused;
+{
+ DB *dbp;
+#ifdef HAVE_QUEUE
+ DB_THREAD_INFO *ip;
+#endif
+ char *name;
+ int ret, t_ret;
+
+ COMPQUIET(unused, NULL);
+ dbp = NULL;
+ name = rfp->info.data;
+
+ /*
+ * Calling __fop_remove will both purge any matching
+ * fileid from mpool and unlink it on disk.
+ */
+#ifdef HAVE_QUEUE
+ /*
+ * Handle queue separately. __fop_remove will not
+ * remove extent files. Use __qam_remove to remove
+ * extent files that might exist under this name. Note that
+ * in-memory queue databases can't have extent files.
+ */
+ if (rfp->type == (u_int32_t)DB_QUEUE &&
+ !FLD_ISSET(rfp->db_flags, DB_AM_INMEM)) {
+ if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+ return (ret);
+
+ /*
+ * At present, qam_remove expects the passed-in dbp to have a
+ * locker allocated, and if not, db_open allocates a locker
+ * which qam_remove then leaks.
+ *
+ * TODO: it would be better to avoid cobbling together this
+ * sequence of low-level operations, if fileops provided some
+ * API to allow us to remove a database without write-locking
+ * its handle.
+ */
+ if ((ret = __lock_id(env, NULL, &dbp->locker)) != 0)
+ goto out;
+
+ ENV_GET_THREAD_INFO(env, ip);
+ VPRINT(env, (env, DB_VERB_REP_SYNC,
+ "QAM: Unlink %s via __qam_remove", name));
+ if ((ret = __qam_remove(dbp, ip, NULL, name, NULL, 0)) != 0) {
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "qam_remove returned %d", ret));
+ goto out;
+ }
+ }
+#endif
+ /*
+ * We call fop_remove even if we've called qam_remove.
+ * That will only have removed extent files. Now
+ * we need to deal with the actual file itself.
+ */
+ if (FLD_ISSET(rfp->db_flags, DB_AM_INMEM)) {
+ if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+ return (ret);
+ MAKE_INMEM(dbp);
+ F_SET(dbp, DB_AM_RECOVER); /* Skirt locking. */
+ ret = __db_inmem_remove(dbp, NULL, name);
+ } else if ((ret = __fop_remove(env,
+ NULL, rfp->uid.data, name, (const char **)&rfp->dir.data,
+ __rep_is_internal_rep_file(rfp->info.data) ?
+ DB_APP_META : DB_APP_DATA, 0)) != 0)
+ /*
+ * If fop_remove fails, it could be because
+ * the client has a different data_dir
+ * structure than the master. Retry with the
+ * local, default settings.
+ */
+ ret = __fop_remove(env,
+ NULL, rfp->uid.data, name, NULL,
+ __rep_is_internal_rep_file(rfp->info.data) ?
+ DB_APP_META : DB_APP_DATA, 0);
+#ifdef HAVE_QUEUE
+out:
+#endif
+ if (dbp != NULL &&
+ (t_ret = __db_close(dbp, NULL, DB_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __rep_bulk_page
+ * Process a bulk page message.
+ *
+ * PUBLIC: int __rep_bulk_page __P((ENV *,
+ * PUBLIC: DB_THREAD_INFO *, int, __rep_control_args *, DBT *));
+ */
+int
+__rep_bulk_page(env, ip, eid, rp, rec)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ int eid;
+ __rep_control_args *rp;
+ DBT *rec;
+{
+ __rep_control_args tmprp;
+ __rep_bulk_args b_args;
+ int ret;
+ u_int8_t *p, *ep;
+
+ /*
+ * We're going to be modifying the rp LSN contents so make
+ * our own private copy to play with. We need to set the
+ * rectype to REP_PAGE because we're calling through __rep_page
+ * to process each page, and lower functions make decisions
+ * based on the rectypes (for throttling/gap processing).
+ */
+ memcpy(&tmprp, rp, sizeof(tmprp));
+ tmprp.rectype = REP_PAGE;
+ ret = 0;
+ for (ep = (u_int8_t *)rec->data + rec->size, p = (u_int8_t *)rec->data;
+ p < ep;) {
+ /*
+ * First thing in the buffer is the length. Then the LSN
+ * of this page, then the page info itself.
+ */
+ if ((ret = __rep_bulk_unmarshal(env,
+ &b_args, p, rec->size, &p)) != 0)
+ return (ret);
+ VPRINT(env, (env, DB_VERB_REP_SYNC,
+ "rep_bulk_page: Processing LSN [%lu][%lu]",
+ (u_long)tmprp.lsn.file, (u_long)tmprp.lsn.offset));
+ VPRINT(env, (env, DB_VERB_REP_SYNC,
+ "rep_bulk_page: p %#lx ep %#lx pgrec data %#lx, size %lu (%#lx)",
+ P_TO_ULONG(p), P_TO_ULONG(ep),
+ P_TO_ULONG(b_args.bulkdata.data),
+ (u_long)b_args.bulkdata.size,
+ (u_long)b_args.bulkdata.size));
+ /*
+ * Now send the page info DBT to the page processing function.
+ */
+ ret = __rep_page(env, ip, eid, &tmprp, &b_args.bulkdata);
+ VPRINT(env, (env, DB_VERB_REP_SYNC,
+ "rep_bulk_page: rep_page ret %d", ret));
+
+ /*
+ * If this set of pages is already done, just return.
+ */
+ if (ret != 0) {
+ if (ret == DB_REP_PAGEDONE)
+ ret = 0;
+ break;
+ }
+ }
+ return (ret);
+}
+
+/*
+ * __rep_page
+ * Process a page message. This handles any page-related
+ * message: REP_PAGE, REP_PAGE_FAIL and REP_PAGE_MORE.
+ *
+ * PUBLIC: int __rep_page __P((ENV *,
+ * PUBLIC: DB_THREAD_INFO *, int, __rep_control_args *, DBT *));
+ */
+int
+__rep_page(env, ip, eid, rp, rec)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ int eid;
+ __rep_control_args *rp;
+ DBT *rec;
+{
+ DB_REP *db_rep;
+ DBT key, data;
+ REP *rep;
+ __rep_fileinfo_args *msgfp, msgf;
+ __rep_fileinfo_v6_args *msgfpv6;
+ db_recno_t recno;
+ int ret;
+ char *msg;
+ void *msgfree;
+
+ ret = 0;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ if (rep->sync_state != SYNC_PAGE)
+ return (DB_REP_PAGEDONE);
+
+ if (rp->rectype == REP_PAGE_FAIL)
+ msg = "PAGE_FAIL";
+ else if (rp->rectype == REP_PAGE_MORE)
+ msg = "PAGE_MORE";
+ else
+ msg = "PAGE";
+ /*
+ * If we restarted internal init, it is possible to receive
+ * an old REP_PAGE message, while we're in the current
+ * stage of recovering pages. Until we have some sort of
+ * an init generation number, ignore any message that has
+ * a message LSN that is before this internal init's first_lsn.
+ */
+ if (LOG_COMPARE(&rp->lsn, &rep->first_lsn) < 0) {
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "%s: Old page: msg LSN [%lu][%lu] first_lsn [%lu][%lu]",
+ msg, (u_long)rp->lsn.file, (u_long)rp->lsn.offset,
+ (u_long)rep->first_lsn.file,
+ (u_long)rep->first_lsn.offset));
+ return (DB_REP_PAGEDONE);
+ }
+ if (rp->rep_version < DB_REPVERSION_53) {
+ /*
+ * Build a current struct by copying in the older
+ * version struct and then setting up the data_dir.
+ * This is safe because all old fields are in the
+ * same location in the current struct.
+ */
+ if ((ret = __rep_fileinfo_v6_unmarshal(env, rp->rep_version,
+ &msgfpv6, rec->data, rec->size, NULL)) != 0)
+ return (ret);
+ memcpy(&msgf, msgfpv6, sizeof(__rep_fileinfo_v6_args));
+ msgf.dir.data = NULL;
+ msgf.dir.size = 0;
+ msgfp = &msgf;
+ msgfree = msgfpv6;
+ } else {
+ if ((ret = __rep_fileinfo_unmarshal(env, rp->rep_version,
+ &msgfp, rec->data, rec->size, NULL)) != 0)
+ return (ret);
+ msgfree = msgfp;
+ }
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ REP_SYSTEM_LOCK(env);
+ /*
+ * Check if the world changed.
+ */
+ if (rep->sync_state != SYNC_PAGE) {
+ ret = DB_REP_PAGEDONE;
+ goto err;
+ }
+ /*
+ * We should not ever be in internal init with a lease granted.
+ */
+ DB_ASSERT(env,
+ !IS_USING_LEASES(env) || __rep_islease_granted(env) == 0);
+
+ VPRINT(env, (env, DB_VERB_REP_SYNC,
+ "%s: Received page %lu from file %d",
+ msg, (u_long)msgfp->pgno, msgfp->filenum));
+ /*
+ * Check if this page is from the file we're expecting.
+ * This may be an old or delayed page message.
+ */
+ /*
+ * !!!
+ * If we allow dbrename/dbremove on the master while a client
+ * is updating, then we'd have to verify the file's uid here too.
+ */
+ if (msgfp->filenum != rep->curfile) {
+ VPRINT(env, (env, DB_VERB_REP_SYNC,
+ "Msg file %d != curfile %d",
+ msgfp->filenum, rep->curfile));
+ ret = DB_REP_PAGEDONE;
+ goto err;
+ }
+ /*
+ * We want to create/open our dbp to the database
+ * where we'll keep our page information.
+ */
+ if ((ret = __rep_client_dbinit(env, 1, REP_PG)) != 0) {
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "%s: Client_dbinit %s", msg, db_strerror(ret)));
+ goto err;
+ }
+
+ memset(&key, 0, sizeof(key));
+ memset(&data, 0, sizeof(data));
+ recno = (db_recno_t)(msgfp->pgno + 1);
+ key.data = &recno;
+ key.ulen = key.size = sizeof(db_recno_t);
+ key.flags = DB_DBT_USERMEM;
+
+ /*
+ * If we already have this page, then we don't want to bother
+ * rewriting it into the file. Otherwise, we want to return
+ * any other error.
+ */
+ ret = __db_put(db_rep->file_dbp, ip, NULL, &key, &data, DB_NOOVERWRITE);
+ if (ret == DB_KEYEXIST) {
+ VPRINT(env, (env, DB_VERB_REP_SYNC,
+ "%s: Received duplicate page %lu from file %d",
+ msg, (u_long)msgfp->pgno, msgfp->filenum));
+ STAT(rep->stat.st_pg_duplicated++);
+ PERFMON4(env, rep, pg_duplicated, eid,
+ msgfp->pgno, msgfp->filenum, rep->stat.st_pg_duplicated);
+ ret = 0;
+ goto err;
+ }
+ if (ret != 0)
+ goto err;
+
+ /*
+ * We put the page in the database file itself.
+ */
+ if (rp->rectype != REP_PAGE_FAIL) {
+ VPRINT(env, (env, DB_VERB_REP_SYNC,
+ "%s: Write page %lu into mpool", msg, (u_long)msgfp->pgno));
+ if ((ret = __rep_write_page(env, ip, rep, msgfp)) != 0) {
+ /*
+ * We got an error storing the page; therefore, we need
+ * to remove this page marker from the page database too.
+ * !!!
+ * I'm ignoring errors from the delete because we want
+ * to return the original error. If we cannot write the
+ * page and we cannot delete the item we just put,
+ * what should we do? Panic the env and return
+ * DB_RUNRECOVERY?
+ */
+ (void)__db_del(db_rep->file_dbp, NULL, NULL, &key, 0);
+ goto err;
+ }
+ }
+ STAT_INC(env, rep, pg_record, rep->stat.st_pg_records, eid);
+ rep->npages++;
+
+ /*
+ * Now check the LSN on the page and save it if it is later
+ * than the one we have.
+ */
+ if (LOG_COMPARE(&rp->lsn, &rep->last_lsn) > 0)
+ rep->last_lsn = rp->lsn;
+
+ /*
+ * We've successfully written the page. Now we need to see if
+ * we're done with this file. __rep_filedone will check if we
+ * have all the pages expected and if so, set up for the next
+ * file and send out a page request for the next file's pages.
+ */
+ ret = __rep_filedone(env, ip, eid, rep, msgfp, rp->rectype);
+
+err: REP_SYSTEM_UNLOCK(env);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+
+ __os_free(env, msgfree);
+ return (ret);
+}
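+
+/*
+ * Illustrative only: the gap-tracking database used above stores one
+ * record per page received, keyed by record number. Recnos are 1-based
+ * while page numbers are 0-based, so pgno N is stored under recno N + 1
+ * (see the recno computation above), and __rep_page_gap subtracts 1 when
+ * converting a key back into a page number.
+ */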
+
+/*
+ * __rep_write_page -
+ * Write this page into a database.
+ */
+static int
+__rep_write_page(env, ip, rep, msgfp)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ REP *rep;
+ __rep_fileinfo_args *msgfp;
+{
+ DB db;
+ DBT pgcookie;
+ DB_MPOOLFILE *mpf;
+ DB_PGINFO *pginfo;
+ DB_REP *db_rep;
+ REGINFO *infop;
+ __rep_fileinfo_args *rfp;
+ int ret;
+ void *dst;
+
+ db_rep = env->rep_handle;
+ infop = env->reginfo;
+ rfp = NULL;
+
+ /*
+ * If this is the first page we're putting in this database, we need
+ * to create the mpool file. Otherwise call memp_fget to create the
+ * page in mpool. Then copy the data to the page, and memp_fput the
+ * page to give it back to mpool.
+ *
+ * We need to create the file, remove any existing file and associate
+ * the correct file ID with the new one.
+ */
+ GET_CURINFO(rep, infop, rfp);
+ if (db_rep->file_mpf == NULL) {
+ if (!FLD_ISSET(rfp->db_flags, DB_AM_INMEM)) {
+ /*
+ * Recreate the file on disk. We'll be putting
+ * the data into the file via mpool. System
+ * databases should go into the environment
+ * directory, not the data directory.
+ */
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "rep_write_page: Calling fop_create for %s",
+ (char *)rfp->info.data));
+ if ((ret = __fop_create(env, NULL, NULL,
+ rfp->info.data, (const char **)&rfp->dir.data,
+ __rep_is_internal_rep_file(rfp->info.data) ?
+ DB_APP_META : DB_APP_DATA, env->db_mode, 0)) != 0) {
+ /*
+ * If fop_create fails, it could be because
+ * the client has a different data_dir
+ * structure than the master. Retry with the
+ * local, default settings.
+ */
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "rep_write_page: fop_create ret %d. Retry for %s, master datadir %s",
+ ret, (char *)rfp->info.data,
+ rfp->dir.data == NULL ? "NULL" :
+ (char *)rfp->dir.data));
+ if ((ret = __fop_create(env, NULL, NULL,
+ rfp->info.data, NULL,
+ __rep_is_internal_rep_file(rfp->info.data) ?
+ DB_APP_META : DB_APP_DATA,
+ env->db_mode, 0)) != 0)
+ goto err;
+ }
+ }
+
+ if ((ret =
+ __rep_mpf_open(env, &db_rep->file_mpf, rfp,
+ FLD_ISSET(rfp->db_flags, DB_AM_INMEM) ?
+ DB_CREATE : 0)) != 0)
+ goto err;
+ }
+ /*
+ * Handle queue specially. If we're a QUEUE database, we need to
+ * use the __qam_fget/put calls. We need to use db_rep->queue_dbc for
+ * that. That dbp is opened after getting the metapage for the
+ * queue database. Since the meta-page is always in the queue file,
+ * we'll use the normal path for that first page. After that we
+ * can assume the dbp is opened.
+ */
+ if (msgfp->type == (u_int32_t)DB_QUEUE && msgfp->pgno != 0) {
+#ifdef HAVE_QUEUE
+ ret = __qam_fget(db_rep->queue_dbc, &msgfp->pgno,
+ DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &dst);
+#else
+ /*
+ * This always returns an error.
+ */
+ ret = __db_no_queue_am(env);
+#endif
+ } else
+ ret = __memp_fget(db_rep->file_mpf, &msgfp->pgno, ip, NULL,
+ DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &dst);
+
+ if (ret != 0)
+ goto err;
+
+ /*
+ * Before writing this page into our local mpool, see if its byte order
+ * needs to be swapped. When in mpool the page should be in the native
+ * byte order of our local environment. But the page image we've
+ * received may be in the opposite order (as indicated in finfo_flags).
+ */
+ if ((F_ISSET(env, ENV_LITTLEENDIAN) &&
+ !FLD_ISSET(msgfp->finfo_flags, REPINFO_PG_LITTLEENDIAN)) ||
+ (!F_ISSET(env, ENV_LITTLEENDIAN) &&
+ FLD_ISSET(msgfp->finfo_flags, REPINFO_PG_LITTLEENDIAN))) {
+ VPRINT(env, (env, DB_VERB_REP_SYNC,
+ "write_page: Page %d needs to be swapped", msgfp->pgno));
+ /*
+ * Set up a dbp to pass into the swap functions. We need
+ * only a few things: The environment and any special
+ * dbp flags and some obvious basics like db type and
+ * pagesize. Those flags were set back in rep_mpf_open
+ * and are available in the pgcookie set up with the
+ * mpoolfile associated with this database.
+ */
+ memset(&db, 0, sizeof(db));
+ db.env = env;
+ db.type = (DBTYPE)msgfp->type;
+ db.pgsize = msgfp->pgsize;
+ mpf = db_rep->file_mpf;
+ if ((ret = __memp_get_pgcookie(mpf, &pgcookie)) != 0)
+ goto err;
+ pginfo = (DB_PGINFO *)pgcookie.data;
+ db.flags = pginfo->flags;
+ if ((ret = __db_pageswap(env,
+ &db, msgfp->info.data, msgfp->pgsize, NULL, 1)) != 0)
+ goto err;
+ }
+
+ memcpy(dst, msgfp->info.data, msgfp->pgsize);
+#ifdef HAVE_QUEUE
+ if (msgfp->type == (u_int32_t)DB_QUEUE && msgfp->pgno != 0)
+ ret = __qam_fput(db_rep->queue_dbc,
+ msgfp->pgno, dst, db_rep->queue_dbc->priority);
+ else
+#endif
+ ret = __memp_fput(db_rep->file_mpf,
+ ip, dst, db_rep->file_dbp->priority);
+
+err: return (ret);
+}
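+
+/*
+ * Illustrative only: the swap test in __rep_write_page, restated as a
+ * predicate. A received page image needs swapping exactly when the
+ * sender's byte order (recorded in finfo_flags) differs from this
+ * environment's; the helper name is hypothetical.
+ */
+#if 0
+static int
+example_page_needs_swap(env, msgfp)
+ ENV *env;
+ __rep_fileinfo_args *msgfp;
+{
+ int local_le, page_le;
+
+ local_le = F_ISSET(env, ENV_LITTLEENDIAN) ? 1 : 0;
+ page_le =
+ FLD_ISSET(msgfp->finfo_flags, REPINFO_PG_LITTLEENDIAN) ? 1 : 0;
+ return (local_le != page_le);
+}
+#endif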
+
+/*
+ * __rep_page_gap -
+ * After we've put the page into the database, we need to check if
+ * we have a page gap and whether we need to request pages.
+ */
+static int
+__rep_page_gap(env, rep, msgfp, type)
+ ENV *env;
+ REP *rep;
+ __rep_fileinfo_args *msgfp;
+ u_int32_t type;
+{
+ DBC *dbc;
+ DBT data, key;
+ DB_LOG *dblp;
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ LOG *lp;
+ REGINFO *infop;
+ __rep_fileinfo_args *rfp;
+ db_recno_t recno;
+ int ret, t_ret;
+
+ db_rep = env->rep_handle;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ infop = env->reginfo;
+ ret = 0;
+ dbc = NULL;
+
+ /*
+ * We've successfully put this page into our file.
+ * Now we need to account for it and re-request new pages
+ * if necessary.
+ */
+ /*
+ * We already hold both the db mutex and rep mutex.
+ */
+ GET_CURINFO(rep, infop, rfp);
+
+ /*
+ * Make sure we're still talking about the same file.
+ * If not, we're done here.
+ */
+ if (rfp->filenum != msgfp->filenum) {
+ ret = DB_REP_PAGEDONE;
+ goto err;
+ }
+
+ /*
+ * We have 3 possible states:
+ * 1. We receive a page we already have accounted for.
+ * msg pgno < ready pgno
+ * 2. We receive a page that is beyond a gap.
+ * msg pgno > ready pgno
+ * 3. We receive the page we're expecting next.
+ * msg pgno == ready pgno
+ */
+ /*
+ * State 1. This can happen once we put our page record into the
+ * database, but by the time we acquire the mutex other
+ * threads have already accounted for this page and moved on.
+ * We just want to return.
+ */
+ if (msgfp->pgno < rep->ready_pg) {
+ VPRINT(env, (env, DB_VERB_REP_SYNC,
+ "PAGE_GAP: pgno %lu < ready %lu, waiting %lu",
+ (u_long)msgfp->pgno, (u_long)rep->ready_pg,
+ (u_long)rep->waiting_pg));
+ goto err;
+ }
+
+ /*
+ * State 2. This page is beyond the page we're expecting.
+ * We need to update waiting_pg if this page is earlier
+ * (less) than the current waiting_pg. There is nothing
+ * to do but see if we need to request.
+ */
+ VPRINT(env, (env, DB_VERB_REP_SYNC,
+ "PAGE_GAP: pgno %lu, max_pg %lu ready %lu, waiting %lu max_wait %lu",
+ (u_long)msgfp->pgno, (u_long)rfp->max_pgno, (u_long)rep->ready_pg,
+ (u_long)rep->waiting_pg, (u_long)rep->max_wait_pg));
+ if (msgfp->pgno > rep->ready_pg) {
+ /*
+ * We received a page beyond the one we're expecting.
+ */
+ __os_gettime(env, &rep->last_pg_ts, 1);
+ if (rep->waiting_pg == PGNO_INVALID ||
+ msgfp->pgno < rep->waiting_pg)
+ rep->waiting_pg = msgfp->pgno;
+ } else {
+ /*
+ * We received the page we're expecting.
+ */
+ rep->ready_pg++;
+ __os_gettime(env, &lp->rcvd_ts, 1);
+ if (rep->ready_pg == rep->waiting_pg) {
+ /*
+ * If we get here we know we just filled a gap.
+ * Move the cursor to that place and then walk
+ * forward looking for the next gap, if it exists.
+ * Similar to log gaps, if we fill a gap we want to
+ * request the next gap right away if it has been
+ * a while since we last received a later page.
+ */
+ lp->rcvd_ts = rep->last_pg_ts;
+ lp->wait_ts = rep->request_gap;
+ rep->max_wait_pg = PGNO_INVALID;
+ /*
+ * We need to walk the recno database looking for the
+ * next page we need or expect.
+ */
+ memset(&key, 0, sizeof(key));
+ memset(&data, 0, sizeof(data));
+ ENV_GET_THREAD_INFO(env, ip);
+ if ((ret = __db_cursor(db_rep->file_dbp, ip, NULL,
+ &dbc, 0)) != 0)
+ goto err;
+ /*
+ * Set cursor to the first waiting page.
+ * Page numbers/record numbers are offset by 1.
+ */
+ recno = (db_recno_t)rep->waiting_pg + 1;
+ key.data = &recno;
+ key.ulen = key.size = sizeof(db_recno_t);
+ key.flags = DB_DBT_USERMEM;
+ /*
+ * We know that page is there; this should
+ * find the record.
+ */
+ ret = __dbc_get(dbc, &key, &data, DB_SET);
+ if (ret != 0)
+ goto err;
+ VPRINT(env, (env, DB_VERB_REP_SYNC,
+ "PAGE_GAP: Set cursor for ready %lu, waiting %lu",
+ (u_long)rep->ready_pg, (u_long)rep->waiting_pg));
+ }
+ while (ret == 0 && rep->ready_pg == rep->waiting_pg) {
+ rep->ready_pg++;
+ ret = __dbc_get(dbc, &key, &data, DB_NEXT);
+ /*
+ * If we get to the end of the list, there are no
+ * more gaps. Reset waiting_pg.
+ */
+ if (ret == DB_NOTFOUND || ret == DB_KEYEMPTY) {
+ rep->waiting_pg = PGNO_INVALID;
+ VPRINT(env, (env, DB_VERB_REP_SYNC,
+ "PAGE_GAP: Next cursor No next - ready %lu, waiting %lu",
+ (u_long)rep->ready_pg,
+ (u_long)rep->waiting_pg));
+ break;
+ }
+ /*
+ * Subtract 1 from waiting_pg because record numbers
+ * are 1-based and pages are 0-based, and we added 1
+ * to the page number when we put it into the db.
+ */
+ rep->waiting_pg = *(db_pgno_t *)key.data;
+ rep->waiting_pg--;
+ VPRINT(env, (env, DB_VERB_REP_SYNC,
+ "PAGE_GAP: Next cursor ready %lu, waiting %lu",
+ (u_long)rep->ready_pg, (u_long)rep->waiting_pg));
+ }
+ }
+
+ /*
+ * If we filled a gap and now have the entire file, there's
+ * nothing to do. We're done when ready_pg is > max_pgno
+ * because ready_pg is larger than the last page we received.
+ */
+ if (rep->ready_pg > rfp->max_pgno)
+ goto err;
+
+ /*
+ * Check if we need to ask for more pages.
+ */
+ if ((rep->waiting_pg != PGNO_INVALID &&
+ rep->ready_pg != rep->waiting_pg) || type == REP_PAGE_MORE) {
+ /*
+ * We got a page but we may still be waiting for more.
+ * If we got REP_PAGE_MORE we always want to ask for more.
+ * We need to set rfp->pgno to the current page number,
+ * which we will use to ask for more pages.
+ */
+ if (type == REP_PAGE_MORE)
+ rfp->pgno = msgfp->pgno;
+ if ((__rep_check_doreq(env, rep) || type == REP_PAGE_MORE) &&
+ ((ret = __rep_pggap_req(env, rep, rfp,
+ (type == REP_PAGE_MORE) ? REP_GAP_FORCE : 0)) != 0))
+ goto err;
+ } else {
+ lp->wait_ts = rep->request_gap;
+ rep->max_wait_pg = PGNO_INVALID;
+ }
+
+err:
+ if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __rep_init_cleanup -
+ * Clean up internal initialization pieces.
+ *
+ * !!!
+ * Caller must hold client database mutex (mtx_clientdb) and REP_SYSTEM_LOCK.
+ *
+ * PUBLIC: int __rep_init_cleanup __P((ENV *, REP *, int));
+ */
+int
+__rep_init_cleanup(env, rep, force)
+ ENV *env;
+ REP *rep;
+ int force;
+{
+ DB *queue_dbp;
+ DB_REP *db_rep;
+ REGENV *renv;
+ REGINFO *infop;
+ int ret, t_ret;
+
+ db_rep = env->rep_handle;
+ infop = env->reginfo;
+ renv = infop->primary;
+ ret = 0;
+ /*
+ * 1. Close up the file data pointer we used.
+ * 2. Close/reset the page database.
+ * 3. Close/reset the queue database if we're forcing a cleanup.
+ * 4. Free current file info.
+ * 5. If we have all files or need to force, free original file info.
+ */
+ if (db_rep->file_mpf != NULL) {
+ ret = __memp_fclose(db_rep->file_mpf, 0);
+ db_rep->file_mpf = NULL;
+ }
+ if (db_rep->file_dbp != NULL) {
+ t_ret = __db_close(db_rep->file_dbp, NULL, DB_NOSYNC);
+ db_rep->file_dbp = NULL;
+ if (ret == 0)
+ ret = t_ret;
+ }
+ if (force && db_rep->queue_dbc != NULL) {
+ queue_dbp = db_rep->queue_dbc->dbp;
+ if ((t_ret = __dbc_close(db_rep->queue_dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ db_rep->queue_dbc = NULL;
+ if ((t_ret = __db_close(queue_dbp, NULL, DB_NOSYNC)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ }
+ if (rep->curinfo_off != INVALID_ROFF) {
+ MUTEX_LOCK(env, renv->mtx_regenv);
+ __env_alloc_free(infop, R_ADDR(infop, rep->curinfo_off));
+ MUTEX_UNLOCK(env, renv->mtx_regenv);
+ rep->curinfo_off = INVALID_ROFF;
+ }
+ if (IN_INTERNAL_INIT(rep) && force) {
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "clean up interrupted internal init"));
+ t_ret = F_ISSET(rep, REP_F_ABBREVIATED) ?
+ __rep_walk_filelist(env, rep->infoversion,
+ R_ADDR(infop, rep->originfo_off), rep->originfolen,
+ rep->nfiles, __rep_cleanup_nimdbs, NULL) :
+ __rep_clean_interrupted(env);
+ if (ret == 0)
+ ret = t_ret;
+
+ if (rep->originfo_off != INVALID_ROFF) {
+ MUTEX_LOCK(env, renv->mtx_regenv);
+ __env_alloc_free(infop,
+ R_ADDR(infop, rep->originfo_off));
+ MUTEX_UNLOCK(env, renv->mtx_regenv);
+ rep->originfo_off = INVALID_ROFF;
+ }
+ }
+
+ return (ret);
+}
+
+/*
+ * Remove NIMDBs that may have been fully or partially loaded during an
+ * abbreviated internal init, when the init gets interrupted. At this point,
+ * we know that any databases we have processed are listed in originfo.
+ */
+static int
+__rep_cleanup_nimdbs(env, rfp, unused)
+ ENV *env;
+ __rep_fileinfo_args *rfp;
+ void *unused;
+{
+ DB *dbp;
+ char *namep;
+ int ret, t_ret;
+
+ COMPQUIET(unused, NULL);
+
+ ret = 0;
+ dbp = NULL;
+
+ if (FLD_ISSET(rfp->db_flags, DB_AM_INMEM)) {
+ namep = rfp->info.data;
+
+ if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+ goto out;
+ MAKE_INMEM(dbp);
+ F_SET(dbp, DB_AM_RECOVER); /* Skirt locking. */
+
+ /*
+ * Some of these "files" (actually NIMDBs) may not exist
+ * yet, simply because the interrupted abbreviated
+ * internal init had not yet progressed far enough to
+ * retrieve them. So ENOENT is an acceptable outcome.
+ */
+ if ((ret = __db_inmem_remove(dbp, NULL, namep)) == ENOENT)
+ ret = 0;
+ if ((t_ret = __db_close(dbp, NULL, DB_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+
+out:
+ return (ret);
+}
+
+/*
+ * Clean up files involved in an interrupted internal init.
+ */
+static int
+__rep_clean_interrupted(env)
+ ENV *env;
+{
+ REP *rep;
+ DB_LOG *dblp;
+ LOG *lp;
+ REGINFO *infop;
+ int ret, t_ret;
+
+ rep = env->rep_handle->region;
+ infop = env->reginfo;
+
+ /*
+ * 1. logs
+ * a) remove old log files
+ * b) set up initial log file #1
+ * 2. database files
+ * 3. the "init file"
+ *
+ * Steps 1 and 2 can be attempted independently. Step 1b is
+ * dependent on successful completion of 1a.
+ */
+
+ /* Step 1a. */
+ if ((ret = __rep_remove_logs(env)) == 0) {
+ /*
+ * Since we have no logs, recover by making it look like
+ * the case when a new client first starts up, namely we
+ * have nothing but a fresh log file #1. This is a
+ * little wasteful, since we may soon remove this log
+ * file again. But it's insignificant in the context of
+ * interrupted internal init.
+ */
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ /* Step 1b. */
+ ret = __rep_log_setup(env,
+ rep, 1, DB_LOGVERSION, &lp->ready_lsn);
+ }
+
+ /* Step 2. */
+ if ((t_ret = __rep_walk_filelist(env, rep->infoversion,
+ R_ADDR(infop, rep->originfo_off), rep->originfolen,
+ rep->nfiles, __rep_remove_by_list, NULL)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /*
+ * Step 3 must not be done if anything fails along the way, because the
+ * init file's raison d'etre is to show that some files remain to be
+ * cleaned up.
+ */
+ if (ret == 0)
+ ret = __rep_remove_init_file(env);
+
+ return (ret);
+}
+
+/*
+ * __rep_filedone -
+ * We need to check if we're done with the current file after
+ * processing the current page. Stat the database to see if
+ * we have all the pages. If so, we need to clean up/close
+ * this one, set up for the next one, and ask for its pages,
+ * or if this is the last file, request the log records and
+ * move to the REP_RECOVER_LOG state.
+ */
+static int
+__rep_filedone(env, ip, eid, rep, msgfp, type)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ int eid;
+ REP *rep;
+ __rep_fileinfo_args *msgfp;
+ u_int32_t type;
+{
+ REGINFO *infop;
+ __rep_fileinfo_args *rfp;
+ int ret;
+
+ /*
+ * We've put our page; now we need to do any gap processing
+ * that might be needed to re-request pages.
+ */
+ ret = __rep_page_gap(env, rep, msgfp, type);
+ /*
+ * The world changed while we were doing gap processing.
+ * We're done here.
+ */
+ if (ret == DB_REP_PAGEDONE)
+ return (0);
+
+ infop = env->reginfo;
+ GET_CURINFO(rep, infop, rfp);
+ /*
+ * max_pgno is 0-based and npages is 1-based, so we don't have
+ * all the pages until npages is > max_pgno.
+ */
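+ /*
+ * For example, max_pgno == 9 describes a 10-page file, so we
+ * are done only once npages reaches 10.
+ */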
+ VPRINT(env, (env, DB_VERB_REP_SYNC,
+ "FILEDONE: have %lu pages. Need %lu.",
+ (u_long)rep->npages, (u_long)rfp->max_pgno + 1));
+ if (rep->npages <= rfp->max_pgno)
+ return (0);
+
+ /*
+ * If this is a queue database and we think we have all the pages
+ * for this file, we need to do special queue processing. Queue is
+ * handled in several stages.
+ */
+ if (rfp->type == (u_int32_t)DB_QUEUE &&
+ ((ret = __rep_queue_filedone(env, ip, rep, rfp)) !=
+ DB_REP_PAGEDONE))
+ return (ret);
+ /*
+ * We have all the pages for this file. Clean up.
+ */
+ if ((ret = __rep_init_cleanup(env, rep, 0)) != 0)
+ goto err;
+
+ rep->curfile++;
+ ret = __rep_nextfile(env, eid, rep);
+err:
+ return (ret);
+}
+
+/*
+ * Starts requesting pages for the next file in the list (if any), or if not,
+ * proceeds to the next stage: requesting logs.
+ *
+ * !!!
+ * Must be called with both mtx_clientdb and REP_SYSTEM_LOCK held, though we
+ * may drop REP_SYSTEM_LOCK momentarily in order to send a LOG_REQ (but not a
+ * PAGE_REQ).
+ */
+static int
+__rep_nextfile(env, eid, rep)
+ ENV *env;
+ int eid;
+ REP *rep;
+{
+ DBT dbt;
+ __rep_logreq_args lr_args;
+ DB_LOG *dblp;
+ LOG *lp;
+ REGENV *renv;
+ REGINFO *infop;
+ __rep_fileinfo_args *curinfo, *rfp, rf;
+ __rep_fileinfo_v6_args *rfpv6;
+ int *curbuf, ret;
+ u_int8_t *buf, *info_ptr, lrbuf[__REP_LOGREQ_SIZE], *nextinfo;
+ size_t len, msgsz;
+ void *rffree;
+
+ infop = env->reginfo;
+ renv = infop->primary;
+ rfp = NULL;
+
+ /*
+ * Always direct the next request to the master (at least nominally),
+ * regardless of where the current response came from. The application
+ * can always still redirect it to another client.
+ */
+ if (rep->master_id != DB_EID_INVALID)
+ eid = rep->master_id;
+
+ while (rep->curfile < rep->nfiles) {
+ /* Set curinfo to next file and examine it. */
+ info_ptr = R_ADDR(infop,
+ rep->originfo_off + (rep->originfolen - rep->infolen));
+ if (rep->infoversion < DB_REPVERSION_53) {
+ /*
+ * Build a current struct by copying in the older
+ * version struct and then setting up the data_dir.
+ * This is safe because all old fields are in the
+ * same location in the current struct.
+ */
+ if ((ret = __rep_fileinfo_v6_unmarshal(env,
+ rep->infoversion, &rfpv6,
+ info_ptr, rep->infolen, &nextinfo)) != 0)
+ return (ret);
+ memcpy(&rf, rfpv6, sizeof(__rep_fileinfo_v6_args));
+ rf.dir.data = NULL;
+ rf.dir.size = 0;
+ rfp = &rf;
+ rffree = rfpv6;
+ } else {
+ if ((ret = __rep_fileinfo_unmarshal(env,
+ rep->infoversion, &rfp, info_ptr,
+ rep->infolen, &nextinfo)) != 0) {
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "NEXTINFO: Fileinfo read: %s",
+ db_strerror(ret)));
+ return (ret);
+ }
+ rffree = rfp;
+ }
+ rep->infolen -= (u_int32_t)(nextinfo - info_ptr);
+ MUTEX_LOCK(env, renv->mtx_regenv);
+ ret = __env_alloc(infop, sizeof(__rep_fileinfo_args) +
+ rfp->uid.size + rfp->info.size + rfp->dir.size, &curbuf);
+ MUTEX_UNLOCK(env, renv->mtx_regenv);
+ if (ret != 0) {
+ __os_free(env, rffree);
+ return (ret);
+ } else
+ rep->curinfo_off = R_OFFSET(infop, curbuf);
+ /* Copy fileinfo basic structure into curinfo. */
+ memcpy(R_ADDR(infop, rep->curinfo_off),
+ (u_int8_t*)rfp, sizeof(__rep_fileinfo_args));
+ /* Set up curinfo pointers to the various DBT data fields. */
+ GET_CURINFO(rep, infop, curinfo);
+ /* Copy uid and info DBT data from originfo buffer. */
+ if (rfp->uid.size > 0)
+ memcpy(curinfo->uid.data,
+ rfp->uid.data, rfp->uid.size);
+ if (rfp->info.size > 0)
+ memcpy(curinfo->info.data,
+ rfp->info.data, rfp->info.size);
+ if (rfp->dir.size > 0)
+ memcpy(curinfo->dir.data,
+ rfp->dir.data, rfp->dir.size);
+ __os_free(env, rffree);
+
+ /* Skip over regular DB's in "abbreviated" internal inits. */
+ if (F_ISSET(rep, REP_F_ABBREVIATED) &&
+ !FLD_ISSET(curinfo->db_flags, DB_AM_INMEM)) {
+ VPRINT(env, (env, DB_VERB_REP_SYNC,
+ "Skipping file %d in abbreviated internal init",
+ curinfo->filenum));
+ MUTEX_LOCK(env, renv->mtx_regenv);
+ __env_alloc_free(infop,
+ R_ADDR(infop, rep->curinfo_off));
+ MUTEX_UNLOCK(env, renv->mtx_regenv);
+ rep->curinfo_off = INVALID_ROFF;
+ rep->curfile++;
+ continue;
+ }
+
+ /* Request this file's pages. */
+ DB_ASSERT(env, curinfo->pgno == 0);
+ rep->ready_pg = 0;
+ rep->npages = 0;
+ rep->waiting_pg = PGNO_INVALID;
+ rep->max_wait_pg = PGNO_INVALID;
+ memset(&dbt, 0, sizeof(dbt));
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "Next file %d: pgsize %lu, maxpg %lu",
+ curinfo->filenum, (u_long)curinfo->pgsize,
+ (u_long)curinfo->max_pgno));
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "name %s dir %s",
+ curinfo->info.size > 0 ? (char *) curinfo->info.data :
+ "NULL", curinfo->dir.size > 0 ?
+ (char *)curinfo->dir.data : "NULL"));
+ msgsz = __REP_FILEINFO_SIZE + curinfo->dir.size +
+ curinfo->uid.size + curinfo->info.size;
+ if ((ret = __os_calloc(env, 1, msgsz, &buf)) != 0)
+ return (ret);
+ if (rep->infoversion < DB_REPVERSION_53)
+ /*
+ * It is safe to cast to the old struct
+ * because the first part of the current
+ * struct matches the old struct.
+ */
+ ret = __rep_fileinfo_v6_marshal(env, rep->infoversion,
+ (__rep_fileinfo_v6_args *)curinfo, buf,
+ msgsz, &len);
+ else
+ ret = __rep_fileinfo_marshal(env, rep->infoversion,
+ curinfo, buf, msgsz, &len);
+ if (ret != 0) {
+ __os_free(env, buf);
+ return (ret);
+ }
+ DB_INIT_DBT(dbt, buf, len);
+ (void)__rep_send_message(env, eid, REP_PAGE_REQ,
+ NULL, &dbt, 0, DB_REP_ANYWHERE);
+ __os_free(env, buf);
+
+ return (0);
+ }
+
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "NEXTFILE: have %d files. RECOVER_LOG now", rep->nfiles));
+ /*
+ * Move to REP_RECOVER_LOG state.
+ * Request logs.
+ */
+ /*
+ * We need to do a sync here so that any later opens
+ * can find the file and file id. We need to do it
+ * before we clear SYNC_PAGE so that we do not
+ * try to flush the log.
+ */
+ if ((ret = __memp_sync_int(env, NULL, 0,
+ DB_SYNC_CACHE | DB_SYNC_INTERRUPT_OK, NULL, NULL)) != 0)
+ return (ret);
+ rep->sync_state = SYNC_LOG;
+ memset(&dbt, 0, sizeof(dbt));
+ lr_args.endlsn = rep->last_lsn;
+ if ((ret = __rep_logreq_marshal(env, &lr_args, lrbuf,
+ __REP_LOGREQ_SIZE, &len)) != 0)
+ return (ret);
+ DB_INIT_DBT(dbt, lrbuf, len);
+
+ /*
+ * Get the logging subsystem ready to receive the first log record we
+ * are going to ask for. In the case of a normal internal init, this is
+ * pretty simple, since we only deal in whole log files. In the
+ * ABBREVIATED case we've already taken care of this, back when we
+ * processed the UPDATE message, because we had to do it by rolling back
+ * to a sync point at an arbitrary LSN.
+ */
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ /*
+ * Update ready_lsn so that future rerequests and VERIFY_FAILs know
+ * where to start.
+ */
+ if (!F_ISSET(rep, REP_F_ABBREVIATED) &&
+ (ret = __rep_log_setup(env, rep,
+ rep->first_lsn.file, rep->first_vers, &lp->ready_lsn)) != 0)
+ return (ret);
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "NEXTFILE: LOG_REQ from LSN [%lu][%lu] to [%lu][%lu]",
+ (u_long)rep->first_lsn.file, (u_long)rep->first_lsn.offset,
+ (u_long)rep->last_lsn.file, (u_long)rep->last_lsn.offset));
+ REP_SYSTEM_UNLOCK(env);
+ __os_gettime(env, &lp->rcvd_ts, 1);
+ lp->wait_ts = rep->request_gap;
+ (void)__rep_send_message(env, eid,
+ REP_LOG_REQ, &rep->first_lsn, &dbt, REPCTL_INIT, DB_REP_ANYWHERE);
+ REP_SYSTEM_LOCK(env);
+ return (0);
+}
+
+/*
+ * Run a recovery, for the purpose of rolling back the client environment to a
+ * specific sync point, in preparation for doing an abbreviated internal init
+ * (materializing only NIMDBs, when we already have the on-disk DBs).
+ *
+ * REP_SYSTEM_LOCK should be held on entry, and will be held on exit, but we
+ * drop it momentarily during the call.
+ */
+static int
+__rep_rollback(env, lsnp)
+ ENV *env;
+ DB_LSN *lsnp;
+{
+ DB_LOG *dblp;
+ DB_REP *db_rep;
+ LOG *lp;
+ REP *rep;
+ DB_THREAD_INFO *ip;
+ DB_LSN trunclsn;
+ int ret;
+ u_int32_t unused;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ ENV_GET_THREAD_INFO(env, ip);
+
+ DB_ASSERT(env, FLD_ISSET(rep->lockout_flags,
+ REP_LOCKOUT_API | REP_LOCKOUT_MSG | REP_LOCKOUT_OP));
+
+ REP_SYSTEM_UNLOCK(env);
+
+ if ((ret = __rep_dorecovery(env, lsnp, &trunclsn)) != 0)
+ goto errlock;
+
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ lp->ready_lsn = trunclsn;
+ ZERO_LSN(lp->waiting_lsn);
+ ZERO_LSN(lp->max_wait_lsn);
+ lp->max_perm_lsn = *lsnp;
+ lp->wait_ts = rep->request_gap;
+ __os_gettime(env, &lp->rcvd_ts, 1);
+ ZERO_LSN(lp->verify_lsn);
+
+ if (db_rep->rep_db == NULL &&
+ (ret = __rep_client_dbinit(env, 0, REP_DB)) != 0) {
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ goto errlock;
+ }
+
+ F_SET(db_rep->rep_db, DB_AM_RECOVER);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ ret = __db_truncate(db_rep->rep_db, ip, NULL, &unused);
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ F_CLR(db_rep->rep_db, DB_AM_RECOVER);
+ STAT_SET(env, rep, log_queued, rep->stat.st_log_queued, 0, lsnp);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+
+errlock:
+ REP_SYSTEM_LOCK(env);
+
+ return (ret);
+}
+
+/*
+ * __rep_mpf_open -
+ * Create and open the mpool file for a database.
+ * Used by both master and client to bring files into mpool.
+ */
+static int
+__rep_mpf_open(env, mpfp, rfp, flags)
+ ENV *env;
+ DB_MPOOLFILE **mpfp;
+ __rep_fileinfo_args *rfp;
+ u_int32_t flags;
+{
+ DB db;
+ int ret;
+
+ if ((ret = __memp_fcreate(env, mpfp)) != 0)
+ return (ret);
+
+ /*
+ * We need a dbp to pass in to __env_mpool. Set up
+ * only the parts that it needs.
+ */
+ memset(&db, 0, sizeof(db));
+ db.env = env;
+ db.type = (DBTYPE)rfp->type;
+ db.pgsize = rfp->pgsize;
+ memcpy(db.fileid, rfp->uid.data, DB_FILE_ID_LEN);
+ db.flags = rfp->db_flags;
+ /* We need to make sure the dbp isn't marked open. */
+ F_CLR(&db, DB_AM_OPEN_CALLED);
+ /*
+ * The byte order of this database may differ from the local native
+ * byte order. If so, set the swap bit so that the necessary swapping
+ * will be done during file I/O.
+ */
+ if ((F_ISSET(env, ENV_LITTLEENDIAN) &&
+ !FLD_ISSET(rfp->finfo_flags, REPINFO_DB_LITTLEENDIAN)) ||
+ (!F_ISSET(env, ENV_LITTLEENDIAN) &&
+ FLD_ISSET(rfp->finfo_flags, REPINFO_DB_LITTLEENDIAN))) {
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "rep_mpf_open: Different endian database. Set swap bit."));
+ F_SET(&db, DB_AM_SWAP);
+ } else
+ F_CLR(&db, DB_AM_SWAP);
+
+ db.mpf = *mpfp;
+ if (F_ISSET(&db, DB_AM_INMEM))
+ (void)__memp_set_flags(db.mpf, DB_MPOOL_NOFILE, 1);
+ if ((ret = __env_mpool(&db, rfp->info.data, flags)) != 0) {
+ (void)__memp_fclose(db.mpf, 0);
+ *mpfp = NULL;
+ }
+ return (ret);
+}
+
+/*
+ * __rep_pggap_req -
+ * Request a page gap. Assumes the caller holds the rep_mutex.
+ *
+ * PUBLIC: int __rep_pggap_req __P((ENV *, REP *, __rep_fileinfo_args *,
+ * PUBLIC: u_int32_t));
+ */
+int
+__rep_pggap_req(env, rep, reqfp, gapflags)
+ ENV *env;
+ REP *rep;
+ __rep_fileinfo_args *reqfp;
+ u_int32_t gapflags;
+{
+ DBT max_pg_dbt;
+ REGINFO *infop;
+ __rep_fileinfo_args *curinfo, *tmpfp, t;
+ size_t len, msgsz;
+ u_int32_t flags;
+ int alloc, master, ret;
+ u_int8_t *buf;
+
+ infop = env->reginfo;
+ ret = 0;
+ alloc = 0;
+ /*
+ * There is a window where we have to set REP_RECOVER_PAGE when
+ * we receive the update information to transition from getting
+ * file information to getting page information. However, that
+ * thread does release and then reacquire mutexes. So, we might
+ * try re-requesting before the original thread can get curinfo
+ * set up. If curinfo isn't set up there is nothing to do.
+ */
+ if (rep->curinfo_off == INVALID_ROFF)
+ return (0);
+ GET_CURINFO(rep, infop, curinfo);
+ if (reqfp == NULL) {
+ if ((ret = __rep_finfo_alloc(env, curinfo, &tmpfp)) != 0)
+ return (ret);
+ alloc = 1;
+ } else {
+ t = *reqfp;
+ tmpfp = &t;
+ }
+
+ /*
+ * If we've never requested this page, then
+ * request everything between it and the first
+ * page we have. If we have requested this page
+ * then only request this page, not the entire gap.
+ */
+ flags = 0;
+ memset(&max_pg_dbt, 0, sizeof(max_pg_dbt));
+ /*
+ * If this is a PAGE_MORE and we're forcing then we want to
+ * force the request to ask for the next page after this one.
+ */
+ if (FLD_ISSET(gapflags, REP_GAP_FORCE))
+ tmpfp->pgno++;
+ else
+ tmpfp->pgno = rep->ready_pg;
+ msgsz = __REP_FILEINFO_SIZE + tmpfp->dir.size +
+ tmpfp->uid.size + tmpfp->info.size;
+ if ((ret = __os_calloc(env, 1, msgsz, &buf)) != 0)
+ goto err;
+ if (rep->max_wait_pg == PGNO_INVALID ||
+ FLD_ISSET(gapflags, REP_GAP_FORCE | REP_GAP_REREQUEST)) {
+ /*
+ * Request the gap - set max to waiting_pg - 1 or if
+ * there is no waiting_pg, just ask for one.
+ */
+ if (rep->waiting_pg == PGNO_INVALID) {
+ if (FLD_ISSET(gapflags,
+ REP_GAP_FORCE | REP_GAP_REREQUEST))
+ rep->max_wait_pg = curinfo->max_pgno;
+ else
+ rep->max_wait_pg = rep->ready_pg;
+ } else {
+ /*
+ * If we're forcing, and waiting_pg is less than
+ * the page we want to start this request at, then
+ * we set max_wait_pg to the max pgno in the file.
+ */
+ if (FLD_ISSET(gapflags, REP_GAP_FORCE) &&
+ rep->waiting_pg < tmpfp->pgno)
+ rep->max_wait_pg = curinfo->max_pgno;
+ else
+ rep->max_wait_pg = rep->waiting_pg - 1;
+ }
+ tmpfp->max_pgno = rep->max_wait_pg;
+ /*
+ * Gap requests are "new" and can go anywhere.
+ */
+ if (FLD_ISSET(gapflags, REP_GAP_REREQUEST))
+ flags = DB_REP_REREQUEST;
+ else
+ flags = DB_REP_ANYWHERE;
+ } else {
+ /*
+ * Request 1 page - set max to ready_pg.
+ */
+ rep->max_wait_pg = rep->ready_pg;
+ tmpfp->max_pgno = rep->ready_pg;
+ /*
+ * If we're dropping to singletons, this is a rerequest.
+ */
+ flags = DB_REP_REREQUEST;
+ }
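+ /*
+ * For example: with ready_pg == 3, waiting_pg == 7 and no request
+ * outstanding (max_wait_pg == PGNO_INVALID), we ask for the whole
+ * gap, pages 3 through 6. A later call with max_wait_pg still set
+ * and no gap flags drops to a singleton and asks for page 3 alone.
+ */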
+ if ((master = rep->master_id) != DB_EID_INVALID) {
+
+ STAT_INC(env,
+ rep, pg_request, rep->stat.st_pg_requested, master);
+ /*
+ * We need to request the pages, but we need to get the
+ * new info into rep->finfo. Assert that the sizes never
+ * change. The only thing this should do is change
+ * the pgno field. Everything else remains the same.
+ */
+ if (rep->infoversion < DB_REPVERSION_53)
+ /*
+ * It is safe to cast to the old struct
+ * because the first part of the current
+ * struct matches the old struct.
+ */
+ ret = __rep_fileinfo_v6_marshal(env, rep->infoversion,
+ (__rep_fileinfo_v6_args *)tmpfp, buf,
+ msgsz, &len);
+ else
+ ret = __rep_fileinfo_marshal(env, rep->infoversion,
+ tmpfp, buf, msgsz, &len);
+ if (ret == 0) {
+ DB_INIT_DBT(max_pg_dbt, buf, len);
+ DB_ASSERT(env, len == max_pg_dbt.size);
+ (void)__rep_send_message(env, master,
+ REP_PAGE_REQ, NULL, &max_pg_dbt, 0, flags);
+ }
+ } else
+ (void)__rep_send_message(env, DB_EID_BROADCAST,
+ REP_MASTER_REQ, NULL, NULL, 0, 0);
+
+ __os_free(env, buf);
+err:
+ if (alloc)
+ __os_free(env, tmpfp);
+ return (ret);
+}
+
+/*
+ * __rep_finfo_alloc -
+ * Allocate and initialize a fileinfo structure.
+ *
+ * PUBLIC: int __rep_finfo_alloc __P((ENV *, __rep_fileinfo_args *,
+ * PUBLIC: __rep_fileinfo_args **));
+ */
+int
+__rep_finfo_alloc(env, rfpsrc, rfpp)
+ ENV *env;
+ __rep_fileinfo_args *rfpsrc, **rfpp;
+{
+ __rep_fileinfo_args *rfp;
+ size_t size;
+ int ret;
+ void *dirp, *infop, *uidp;
+
+ /*
+ * Allocate enough for the structure and the DBT data areas.
+ */
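+ /*
+ * The single allocation is laid out as
+ * [__rep_fileinfo_args][uid bytes][info bytes][dir bytes],
+ * with the DBT data pointers below aimed into the tail.
+ */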
+ size = sizeof(__rep_fileinfo_args) + rfpsrc->uid.size +
+ rfpsrc->info.size + rfpsrc->dir.size;
+ if ((ret = __os_malloc(env, size, &rfp)) != 0)
+ return (ret);
+
+ /*
+ * Copy the structure itself, and then set the DBT data pointers
+ * to their space and copy the data itself as well.
+ */
+ memcpy(rfp, rfpsrc, sizeof(__rep_fileinfo_args));
+ uidp = (u_int8_t *)rfp + sizeof(__rep_fileinfo_args);
+ rfp->uid.data = uidp;
+ memcpy(uidp, rfpsrc->uid.data, rfpsrc->uid.size);
+
+ infop = (u_int8_t *)uidp + rfpsrc->uid.size;
+ rfp->info.data = infop;
+ memcpy(infop, rfpsrc->info.data, rfpsrc->info.size);
+
+ dirp = (u_int8_t *)infop + rfpsrc->info.size;
+ if (rfpsrc->dir.size > 0) {
+ rfp->dir.data = dirp;
+ memcpy(dirp, rfpsrc->dir.data, rfpsrc->dir.size);
+ } else
+ rfp->dir.data = NULL;
+ *rfpp = rfp;
+ return (ret);
+}
+
+/*
+ * __rep_log_setup -
+ * We know our first LSN and need to reset the log subsystem
+ * to get our logs set up for the proper file.
+ */
+static int
+__rep_log_setup(env, rep, file, version, lsnp)
+ ENV *env;
+ REP *rep;
+ u_int32_t file;
+ u_int32_t version;
+ DB_LSN *lsnp;
+{
+ DB_LOG *dblp;
+ DB_LSN lsn;
+ DB_TXNMGR *mgr;
+ DB_TXNREGION *region;
+ LOG *lp;
+ int ret;
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ mgr = env->tx_handle;
+ region = mgr->reginfo.primary;
+
+ /*
+ * Set up the log starting at the file number of the first LSN we
+ * need to get from the master.
+ */
+ LOG_SYSTEM_LOCK(env);
+ if ((ret = __log_newfile(dblp, &lsn, file, version)) == 0 &&
+ lsnp != NULL)
+ *lsnp = lsn;
+ LOG_SYSTEM_UNLOCK(env);
+
+ /*
+ * We reset first_lsn to the lp->lsn. We were given the LSN of
+ * the checkpoint and we now need the LSN for the beginning of
+ * the file, which __log_newfile conveniently set up for us
+ * in lp->lsn.
+ */
+ rep->first_lsn = lp->lsn;
+ TXN_SYSTEM_LOCK(env);
+ ZERO_LSN(region->last_ckp);
+ TXN_SYSTEM_UNLOCK(env);
+ return (ret);
+}
+
+/*
+ * __rep_queue_filedone -
+ * Determine if we're really done getting the pages for a queue file.
+ * Queue is handled in several steps.
+ * 1. First we get the meta page only.
+ * 2. We use the meta-page information to figure out first and last
+ * page numbers (and if the queue wraps, first can be > last).
+ * 3. If first < last, we do a REP_PAGE_REQ for all pages.
+ * 4. If first > last, we REP_PAGE_REQ from first -> max page number.
+ * Then we'll ask for page 1 -> last.
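+ *
+ * For example, a wrapped queue with first == 90 and last == 10 is
+ * fetched as pages 90 -> max page number on the first request, then
+ * pages 1 -> 10 on the second.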
+ *
+ * This function can return several things:
+ * DB_REP_PAGEDONE - if we're done with this file.
+ * 0 - if we're not done with this file.
+ * error - if we get an error doing some operations.
+ *
+ * This function will open a dbp handle to the queue file. This is needed
+ * by most of the QAM macros. We'll open it on the first pass through
+ * here and we'll close it whenever we decide we're done.
+ */
+static int
+__rep_queue_filedone(env, ip, rep, rfp)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ REP *rep;
+ __rep_fileinfo_args *rfp;
+{
+#ifndef HAVE_QUEUE
+ COMPQUIET(ip, NULL);
+ COMPQUIET(rep, NULL);
+ COMPQUIET(rfp, NULL);
+ return (__db_no_queue_am(env));
+#else
+ DB *queue_dbp;
+ DB_REP *db_rep;
+ db_pgno_t first, last;
+ u_int32_t flags;
+ int empty, ret, t_ret;
+
+ db_rep = env->rep_handle;
+ ret = 0;
+ queue_dbp = NULL;
+ if (db_rep->queue_dbc == NULL) {
+ /*
+ * We need to do a sync here so that the open
+ * can find the file and file id.
+ */
+ if ((ret = __memp_sync_int(env, NULL, 0,
+ DB_SYNC_CACHE | DB_SYNC_INTERRUPT_OK, NULL, NULL)) != 0)
+ goto out;
+ if ((ret =
+ __db_create_internal(&queue_dbp, env, 0)) != 0)
+ goto out;
+ flags = DB_NO_AUTO_COMMIT |
+ (F_ISSET(env, ENV_THREAD) ? DB_THREAD : 0);
+ /*
+ * We need to check whether this is in-memory so that we pass
+ * the name correctly as either the file or the database name.
+ */
+ if ((ret = __db_open(queue_dbp, ip, NULL,
+ FLD_ISSET(rfp->db_flags, DB_AM_INMEM) ? NULL :
+ rfp->info.data,
+ FLD_ISSET(rfp->db_flags, DB_AM_INMEM) ? rfp->info.data :
+ NULL,
+ DB_QUEUE, flags, 0, PGNO_BASE_MD)) != 0)
+ goto out;
+
+ if ((ret = __db_cursor(queue_dbp,
+ ip, NULL, &db_rep->queue_dbc, 0)) != 0)
+ goto out;
+ } else
+ queue_dbp = db_rep->queue_dbc->dbp;
+
+ if ((ret = __queue_pageinfo(queue_dbp,
+ &first, &last, &empty, 0, 0)) != 0)
+ goto out;
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "Queue fileinfo: first %lu, last %lu, empty %d",
+ (u_long)first, (u_long)last, empty));
+ /*
+ * We can be in one of 3 possible states.
+ * 1. We have received the meta-page and now need to get the
+ * rest of the pages in the database.
+ * 2. We have received from first -> max_pgno. We might be done,
+ * or we might need to ask for wrapped pages.
+ * 3. We have received all pages in the file. We're done.
+ */
+ if (rfp->max_pgno == 0) {
+ /*
+ * We have just received the meta page. Set up the next
+ * pages to ask for and check if the file is empty.
+ */
+ if (empty)
+ goto out;
+ if (first > last) {
+ rfp->max_pgno =
+ QAM_RECNO_PAGE(db_rep->queue_dbc->dbp, UINT32_MAX);
+ } else
+ rfp->max_pgno = last;
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "Queue fileinfo: First req: first %lu, last %lu",
+ (u_long)first, (u_long)rfp->max_pgno));
+ goto req;
+ } else if (rfp->max_pgno != last) {
+ /*
+ * If max_pgno != last that means we're dealing with a
+ * wrapped situation. Request next batch of pages.
+ * Set npages to 1 because we already have page 0, the
+ * meta-page; now we need pages 1 through max_pgno.
+ */
+ first = 1;
+ rfp->max_pgno = last;
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "Queue fileinfo: Wrap req: first %lu, last %lu",
+ (u_long)first, (u_long)last));
+req:
+ /*
+ * Since we're simulating a "gap" to resend new PAGE_REQ
+ * for this file, we need to set waiting_pg to last + 1
+ * so that we'll ask for all from ready_pg -> last.
+ */
+ rep->npages = first;
+ rep->ready_pg = first;
+ rep->waiting_pg = rfp->max_pgno + 1;
+ rep->max_wait_pg = PGNO_INVALID;
+ ret = __rep_pggap_req(env, rep, rfp, 0);
+ return (ret);
+ }
+ /*
+ * max_pgno == last
+ * If we get here, we have all the pages we need.
+ * Close the dbp and return.
+ */
+out:
+ if (db_rep->queue_dbc != NULL &&
+ (t_ret = __dbc_close(db_rep->queue_dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ db_rep->queue_dbc = NULL;
+
+ if (queue_dbp != NULL &&
+ (t_ret = __db_close(queue_dbp, NULL, DB_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret == 0)
+ ret = DB_REP_PAGEDONE;
+ return (ret);
+#endif
+}
+
+/*
+ * PUBLIC: int __rep_remove_init_file __P((ENV *));
+ */
+int
+__rep_remove_init_file(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ int ret;
+ char *name;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ /*
+ * If running in-memory replication, return without any file
+ * operations.
+ */
+ if (FLD_ISSET(rep->config, REP_C_INMEM))
+ return (0);
+
+ /* Abbreviated internal init doesn't use an init file. */
+ if (F_ISSET(rep, REP_F_ABBREVIATED))
+ return (0);
+
+ if ((ret = __db_appname(env,
+ DB_APP_META, REP_INITNAME, NULL, &name)) != 0)
+ return (ret);
+ (void)__os_unlink(env, name, 0);
+ __os_free(env, name);
+ return (0);
+}
+
+/*
+ * Checks for the existence of the internal init flag file. If it exists, we
+ * remove all logs and databases, and then remove the flag file. This is
+ * intended to force the internal init to start over again, and thus affords
+ * protection against a client crashing during internal init. This function
+ * must be called before normal recovery in order to be properly effective.
+ *
+ * !!!
+ * This function should only be called during initial set-up of the environment,
+ * before various subsystems are initialized. It doesn't rely on the
+ * subsystems' code having been initialized, and it summarily deletes files "out
+ * from under" them, which might disturb the subsystems if they were up.
+ *
+ * PUBLIC: int __rep_reset_init __P((ENV *));
+ */
+int
+__rep_reset_init(env)
+ ENV *env;
+{
+ DB_FH *fhp;
+ __rep_update_args *rup;
+ DBT dbt;
+ char *allocated_dir, *dir, *init_name;
+ size_t cnt;
+ u_int32_t dbtvers, fvers, zero;
+ u_int8_t *next;
+ int ret, t_ret;
+
+ allocated_dir = NULL;
+ rup = NULL;
+ dbt.data = NULL;
+
+ if ((ret = __db_appname(env,
+ DB_APP_META, REP_INITNAME, NULL, &init_name)) != 0)
+ return (ret);
+
+ if ((ret = __os_open(
+ env, init_name, 0, DB_OSO_RDONLY, DB_MODE_600, &fhp)) != 0) {
+ if (ret == ENOENT)
+ ret = 0;
+ goto out;
+ }
+
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "Cleaning up interrupted internal init"));
+
+ /* There are a few possibilities:
+ * 1. no init file, or less than 1 full file list
+ * 2. exactly one full file list
+ * 3. more than one, but less than a second full file list
+ * 4. second file list in full
+ *
+ * In cases 2 or 4, we need to remove all logs, and then remove files
+ * according to the (most recent) file list. (In case 1 or 3, we don't
+ * have to do anything.)
+ *
+ * The __rep_get_file_list function takes care of folding these cases
+ * into two simple outcomes.
+ *
+ * As of 4.7, the first 4 bytes are 0. Read the first 4 bytes now.
+ * If they are non-zero it means we have an old-style init file.
+ * Otherwise, pass the file version in to rep_get_file_list.
+ */
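+ /*
+ * A sketch of the 4.7+ layout as read below, each count a native
+ * u_int32_t: [0][file version], then for each list
+ * [msg version][list length][file list].
+ */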
+ if ((ret = __os_read(env, fhp, &zero, sizeof(zero), &cnt)) != 0)
+ goto out;
+ /*
+ * If we read successfully, but not enough, then unlink the file.
+ */
+ if (cnt != sizeof(zero))
+ goto rm;
+ if (zero != 0) {
+ /*
+ * Old style file. We have to set fvers to the 4.6
+ * version of the file and also rewind the file so
+ * that __rep_get_file_list can read out the length itself.
+ */
+ if ((ret = __os_seek(env, fhp, 0, 0, 0)) != 0)
+ goto out;
+ fvers = REP_INITVERSION_46;
+ } else if ((ret = __os_read(env,
+ fhp, &fvers, sizeof(fvers), &cnt)) != 0)
+ goto out;
+ else if (cnt != sizeof(fvers))
+ goto rm;
+ ret = __rep_get_file_list(env, fhp, fvers, &dbtvers, &dbt);
+ if ((t_ret = __os_closehandle(env, fhp)) != 0 || ret != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ goto out;
+ }
+ if (dbt.data == NULL) {
+ /*
+ * The init file did not end with an intact file list. Since we
+ * never start log/db removal without an intact file list
+ * sync'ed to the init file, this must mean we don't have any
+ * partial set of files to clean up. So all we need to do is
+ * remove the init file.
+ */
+ goto rm;
+ }
+
+ /* Remove all log files. */
+ if (env->dbenv->db_log_dir == NULL)
+ dir = env->db_home;
+ else {
+ if ((ret = __db_appname(env,
+ DB_APP_NONE, env->dbenv->db_log_dir, NULL, &dir)) != 0)
+ goto out;
+ allocated_dir = dir;
+ }
+
+ if ((ret = __rep_remove_by_prefix(env,
+ dir, LFPREFIX, sizeof(LFPREFIX)-1, DB_APP_LOG)) != 0)
+ goto out;
+
+ /*
+ * Remove databases according to the list, and queue extent files by
+ * searching them out on a walk through the data_dir's.
+ */
+ if ((ret = __rep_update_unmarshal(env, dbtvers,
+ &rup, dbt.data, dbt.size, &next)) != 0)
+ goto out;
+ if ((ret = __rep_unlink_by_list(env, dbtvers,
+ next, dbt.size, rup->num_files)) != 0)
+ goto out;
+
+ /* Here, we've established that the file exists. */
+rm: (void)__os_unlink(env, init_name, 0);
+out: if (rup != NULL)
+ __os_free(env, rup);
+ if (allocated_dir != NULL)
+ __os_free(env, allocated_dir);
+ if (dbt.data != NULL)
+ __os_free(env, dbt.data);
+
+ __os_free(env, init_name);
+ return (ret);
+}
+
+/*
+ * Reads the last fully intact file list from the init file. If the file ends
+ * with a partial list (or is empty), we're not interested in it. Lack of a
+ * full file list is indicated by a NULL dbt->data. On success, the list is
+ * returned in allocated space, which becomes the responsibility of the caller.
+ *
+ * The file format is a u_int32_t buffer length, in native format, followed by
+ * the file list itself, in the same format as in an UPDATE message (though
+ * many parts of it in this case are meaningless).
+ */
+static int
+__rep_get_file_list(env, fhp, fvers, dbtvers, dbt)
+ ENV *env;
+ DB_FH *fhp;
+ u_int32_t fvers;
+ u_int32_t *dbtvers;
+ DBT *dbt;
+{
+#ifdef HAVE_REPLICATION_THREADS
+ DBT mgrdbt;
+#endif
+ u_int32_t length, mvers;
+ size_t cnt;
+ int i, ret;
+
+ /* At most 2 file lists: old and new. */
+ dbt->data = NULL;
+ mvers = DB_REPVERSION_46;
+ length = 0;
+#ifdef HAVE_REPLICATION_THREADS
+ mgrdbt.data = NULL;
+#endif
+ for (i = 1; i <= 2; i++) {
+ if (fvers >= REP_INITVERSION_47) {
+ if ((ret = __os_read(env, fhp, &mvers,
+ sizeof(mvers), &cnt)) != 0)
+ goto err;
+ if (cnt == 0 && dbt->data != NULL)
+ break;
+ if (cnt != sizeof(mvers))
+ goto err;
+ }
+ if ((ret = __os_read(env,
+ fhp, &length, sizeof(length), &cnt)) != 0)
+ goto err;
+
+ /*
+ * Reaching the end here is fine, if we've been through at least
+ * once already.
+ */
+ if (cnt == 0 && dbt->data != NULL)
+ break;
+ if (cnt != sizeof(length))
+ goto err;
+
+ if ((ret = __os_realloc(env,
+ (size_t)length, &dbt->data)) != 0)
+ goto err;
+
+ if ((ret = __os_read(
+ env, fhp, dbt->data, length, &cnt)) != 0 ||
+ cnt != (size_t)length)
+ goto err;
+ }
+
+#ifdef HAVE_REPLICATION_THREADS
+ if (i == 3) {
+ if ((ret = __os_read(env, fhp,
+ &mgrdbt.size, sizeof(mgrdbt.size), &cnt)) != 0)
+ goto err;
+ if (cnt == 0)
+ goto absent;
+ if (cnt != sizeof(mgrdbt.size))
+ goto err;
+ if ((ret = __os_malloc(env,
+ (size_t)mgrdbt.size, &mgrdbt.data)) != 0)
+ goto err;
+ if ((ret = __os_read(env, fhp,
+ mgrdbt.data, mgrdbt.size, &cnt)) != 0 &&
+ cnt != (size_t)mgrdbt.size)
+ goto err;
+ /* Repmgr takes ownership of the allocated memory. */
+ if ((ret = __repmgr_init_restore(env, &mgrdbt)) != 0)
+ goto err;
+ }
+absent:
+#endif
+
+ *dbtvers = mvers;
+ dbt->size = length;
+ return (0);
+
+err:
+#ifdef HAVE_REPLICATION_THREADS
+ if (mgrdbt.data != NULL)
+ __os_free(env, mgrdbt.data);
+#endif
+ /*
+ * Note that it's OK to get here with a zero value in 'ret': it means we
+ * read less than we expected, and dbt->data == NULL indicates to the
+ * caller that we don't have an intact list.
+ */
+ if (dbt->data != NULL)
+ __os_free(env, dbt->data);
+ dbt->data = NULL;
+ return (ret);
+}
+
+/*
+ * Removes every file in a given directory that matches a given prefix. Notice
+ * how similar this is to __rep_walk_dir.
+ */
+static int
+__rep_remove_by_prefix(env, dir, prefix, pref_len, appname)
+ ENV *env;
+ const char *dir;
+ const char *prefix;
+ size_t pref_len;
+ APPNAME appname; /* What kind of name. */
+{
+ char *namep, **names;
+ int cnt, i, ret;
+
+ if ((ret = __os_dirlist(env, dir, 0, &names, &cnt)) != 0)
+ return (ret);
+ for (i = 0; i < cnt; i++) {
+ if (strncmp(names[i], prefix, pref_len) == 0) {
+ if ((ret = __db_appname(env,
+ appname, names[i], NULL, &namep)) != 0)
+ goto out;
+ (void)__os_unlink(env, namep, 0);
+ __os_free(env, namep);
+ }
+ }
+out: __os_dirfree(env, names, cnt);
+ return (ret);
+}
+
+/*
+ * Removes database files according to the contents of a list.
+ *
+ * This function must support removal either during environment creation, or
+ * when an internal init is reset in the middle. This means it must work
+ * regardless of whether underlying subsystems are initialized. However, it may
+ * assume that databases are not open. That means there is no REP!
+ */
+static int
+__rep_unlink_by_list(env, version, files, size, count)
+ ENV *env;
+ u_int32_t version;
+ u_int8_t *files;
+ u_int32_t size;
+ u_int32_t count;
+{
+ DB_ENV *dbenv;
+ char **ddir, *dir;
+ int ret;
+
+ dbenv = env->dbenv;
+
+ if ((ret = __rep_walk_filelist(env, version,
+ files, size, count, __rep_unlink_file, NULL)) != 0)
+ goto out;
+
+ /* Notice how similar this code is to __rep_find_dbs. */
+ if (dbenv->db_data_dir == NULL)
+ ret = __rep_remove_by_prefix(env, env->db_home,
+ QUEUE_EXTENT_PREFIX, sizeof(QUEUE_EXTENT_PREFIX) - 1,
+ DB_APP_DATA);
+ else {
+ for (ddir = dbenv->db_data_dir; *ddir != NULL; ++ddir) {
+ if ((ret = __db_appname(env,
+ DB_APP_NONE, *ddir, NULL, &dir)) != 0)
+ break;
+ ret = __rep_remove_by_prefix(env, dir,
+ QUEUE_EXTENT_PREFIX, sizeof(QUEUE_EXTENT_PREFIX)-1,
+ DB_APP_DATA);
+ __os_free(env, dir);
+ if (ret != 0)
+ break;
+ }
+ }
+
+out:
+ return (ret);
+}
+
+static int
+__rep_unlink_file(env, rfp, unused)
+ ENV *env;
+ __rep_fileinfo_args *rfp;
+ void *unused;
+{
+ char *namep;
+ int ret;
+
+ COMPQUIET(unused, NULL);
+
+ if ((ret = __db_appname(env,
+ DB_APP_DATA, rfp->info.data, NULL, &namep)) == 0) {
+ (void)__os_unlink(env, namep, 0);
+ __os_free(env, namep);
+ }
+ return (ret);
+}
+
+static int
+__rep_remove_by_list(env, rfp, unused)
+ ENV *env;
+ __rep_fileinfo_args *rfp;
+ void *unused;
+{
+ int ret;
+
+ COMPQUIET(unused, NULL);
+
+ if ((ret = __rep_remove_file(env, rfp, NULL)) == ENOENT) {
+ /*
+ * If the file already doesn't exist, that's perfectly
+ * OK. This can easily happen if we're cleaning up an
+ * interrupted internal init, and we only got part-way
+ * through the list of files.
+ */
+ ret = 0;
+ }
+ return (ret);
+}
+
+static int
+__rep_walk_filelist(env, version, files, size, count, fn, arg)
+ ENV *env;
+ u_int32_t version;
+ u_int8_t *files;
+ u_int32_t size;
+ u_int32_t count;
+ FILE_WALK_FN *fn;
+ void *arg;
+{
+ __rep_fileinfo_args *rfp, rf;
+ __rep_fileinfo_v6_args *rfpv6;
+ u_int8_t *next;
+ int ret;
+ void *rffree;
+
+ ret = 0;
+ rfp = NULL;
+ rfpv6 = NULL;
+ rffree = NULL;
+ while (count-- > 0) {
+ if (version < DB_REPVERSION_53) {
+ /*
+ * Build a current struct by copying in the older
+ * version struct and then setting up the data_dir.
+ * This is safe because all old fields are in the
+ * same location in the current struct.
+ */
+ if ((ret = __rep_fileinfo_v6_unmarshal(env, version,
+ &rfpv6, files, size, &next)) != 0)
+ break;
+ memcpy(&rf, rfpv6, sizeof(__rep_fileinfo_v6_args));
+ rf.dir.data = NULL;
+ rf.dir.size = 0;
+ rfp = &rf;
+ rffree = rfpv6;
+ } else {
+ if ((ret = __rep_fileinfo_unmarshal(env, version,
+ &rfp, files, size, &next)) != 0)
+ break;
+ rffree = rfp;
+ }
+ size -= (u_int32_t)(next - files);
+ files = next;
+
+ if ((ret = (*fn)(env, rfp, arg)) != 0)
+ break;
+ __os_free(env, rffree);
+ rfp = NULL;
+ rfpv6 = NULL;
+ rffree = NULL;
+ }
+
+ if (rffree != NULL)
+ __os_free(env, rffree);
+ return (ret);
+}
diff --git a/src/rep/rep_elect.c b/src/rep/rep_elect.c
new file mode 100644
index 00000000..9e8c5249
--- /dev/null
+++ b/src/rep/rep_elect.c
@@ -0,0 +1,1486 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2004, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/txn.h"
+
+/*
+ * We need to check sites >= nsites, not more than half
+ * like we do in __rep_elect and the VOTE2 code. The
+ * reason is that we want to process all the incoming votes
+ * and not short-circuit once we reach more than half. The
+ * real winner's vote may be in the last half.
+ */
+#define IS_PHASE1_DONE(rep) \
+ ((rep)->sites >= (rep)->nsites && (rep)->winner != DB_EID_INVALID)
+
+#define I_HAVE_WON(rep, winner) \
+ ((rep)->votes >= (rep)->nvotes && winner == (rep)->eid)
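+ /*
+ * For example, with nsites == 5 and nvotes == 3, IS_PHASE1_DONE
+ * waits until all 5 VOTE1s are in (or phase 1 times out), while
+ * I_HAVE_WON needs only 3 VOTE2s naming this site.
+ */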
+
+static void __rep_cmp_vote __P((ENV *, REP *, int, DB_LSN *,
+ u_int32_t, u_int32_t, u_int32_t, u_int32_t, u_int32_t));
+static int __rep_elect_init
+ __P((ENV *, u_int32_t, u_int32_t, int *, u_int32_t *));
+static int __rep_fire_elected __P((ENV *, REP *, u_int32_t));
+static void __rep_elect_master __P((ENV *, REP *));
+static int __rep_grow_sites __P((ENV *, u_int32_t));
+static void __rep_send_vote __P((ENV *, DB_LSN *, u_int32_t,
+ u_int32_t, u_int32_t, u_int32_t, u_int32_t, u_int32_t, int,
+ u_int32_t, u_int32_t));
+static int __rep_tally __P((ENV *, REP *, int, u_int32_t *, u_int32_t, int));
+static int __rep_wait __P((ENV *, db_timeout_t *, int, u_int32_t, u_int32_t));
+
+/*
+ * __rep_elect_pp --
+ * Called after master failure to hold/participate in an election for
+ * a new master.
+ *
+ * PUBLIC: int __rep_elect_pp
+ * PUBLIC: __P((DB_ENV *, u_int32_t, u_int32_t, u_int32_t));
+ */
+int
+__rep_elect_pp(dbenv, given_nsites, nvotes, flags)
+ DB_ENV *dbenv;
+ u_int32_t given_nsites, nvotes;
+ u_int32_t flags;
+{
+ DB_REP *db_rep;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+ ret = 0;
+
+ ENV_REQUIRES_CONFIG_XX(
+ env, rep_handle, "DB_ENV->rep_elect", DB_INIT_REP);
+
+ if (APP_IS_REPMGR(env)) {
+ __db_errx(env, DB_STR("3527",
+"DB_ENV->rep_elect: cannot call from Replication Manager application"));
+ return (EINVAL);
+ }
+
+ /* We need a transport function because we send messages. */
+ if (db_rep->send == NULL) {
+ __db_errx(env, DB_STR("3528",
+ "DB_ENV->rep_elect: must be called after DB_ENV->rep_set_transport"));
+ return (EINVAL);
+ }
+
+ if (!IS_REP_STARTED(env)) {
+ __db_errx(env, DB_STR("3529",
+ "DB_ENV->rep_elect: must be called after DB_ENV->rep_start"));
+ return (EINVAL);
+ }
+
+ if (IS_USING_LEASES(env) && given_nsites != 0) {
+ __db_errx(env, DB_STR("3530",
+ "DB_ENV->rep_elect: nsites must be zero if leases configured"));
+ return (EINVAL);
+ }
+
+ ret = __rep_elect_int(env, given_nsites, nvotes, flags);
+
+ /*
+ * The DB_REP_IGNORE return code can be of use to repmgr (which of
+ * course calls __rep_elect_int directly), but it may be too subtle to be
+ * useful for (Base API) applications: so preserve the pre-existing API
+ * behavior for applications by making this look like a 0.
+ */
+ if (ret == DB_REP_IGNORE)
+ ret = 0;
+ return (ret);
+}
+
+/*
+ * __rep_elect_int --
+ * Internal processing to hold/participate in an election for
+ * a new master after master failure.
+ *
+ * PUBLIC: int __rep_elect_int
+ * PUBLIC: __P((ENV *, u_int32_t, u_int32_t, u_int32_t));
+ */
+int
+__rep_elect_int(env, given_nsites, nvotes, flags)
+ ENV *env;
+ u_int32_t given_nsites, nvotes;
+ u_int32_t flags;
+{
+ DB_LOG *dblp;
+ DB_LOGC *logc;
+ DB_LSN lsn;
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ LOG *lp;
+ REP *rep;
+ int done, elected, in_progress;
+ int need_req, ret, send_vote, t_ret;
+ u_int32_t ack, ctlflags, data_gen, egen, nsites;
+ u_int32_t orig_tally, priority, realpri, repflags, tiebreaker;
+ db_timeout_t timeout;
+
+ COMPQUIET(flags, 0);
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ elected = 0;
+ egen = 0;
+ ret = 0;
+
+ /*
+ * Specifying 0 for nsites signals us to use the value configured
+ * previously via rep_set_nsites. Similarly, if the given nvotes is 0,
+ * it asks us to compute the value representing a simple majority.
+ */
+ nsites = given_nsites == 0 ? rep->config_nsites : given_nsites;
+ ack = nvotes == 0 ? ELECTION_MAJORITY(nsites) : nvotes;
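+ /*
+ * For example, nsites == 5 with nvotes == 0 yields ack == 3,
+ * a simple majority of the 5 sites.
+ */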
+
+ /*
+ * XXX
+ * If users give us less than a majority, they run the risk of
+ * electing two masters across a network partition. However,
+ * this also allows the scenario of master/1 client to elect
+ * the client. Allow sub-majority values, but give a warning.
+ */
+ if (ack <= (nsites / 2)) {
+ __db_errx(env, DB_STR_A("3531",
+ "DB_ENV->rep_elect:WARNING: nvotes (%d) is sub-majority with nsites (%d)",
+ "%d %d"), nvotes, nsites);
+ }
+
+ if (nsites < ack) {
+ __db_errx(env, DB_STR_A("3532",
+ "DB_ENV->rep_elect: nvotes (%d) is larger than nsites (%d)",
+ "%d %d"), ack, nsites);
+ return (EINVAL);
+ }
+
+ realpri = rep->priority;
+
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+ "Start election nsites %d, ack %d, priority %d",
+ nsites, ack, realpri));
+
+ /*
+ * Special case when having an election while running with
+ * sites of potentially mixed versions. We set a bit indicating
+ * we're an electable site, but set our priority to 0.
+ * Old sites will never elect us, with 0 priority, but if all
+ * we have are new sites, then we can elect the best electable
+ * site of the group.
+ * Thus 'priority' is this special, possibly-fake, effective
+ * priority that we'll use for this election, while 'realpri' is our
+ * real, configured priority, as retrieved from REP region.
+ */
+ ctlflags = realpri != 0 ? REPCTL_ELECTABLE : 0;
+ ENV_ENTER(env, ip);
+
+ orig_tally = 0;
+ /* If we are already master, simply broadcast that fact and return. */
+ if (F_ISSET(rep, REP_F_MASTER)) {
+master: LOG_SYSTEM_LOCK(env);
+ lsn = lp->lsn;
+ LOG_SYSTEM_UNLOCK(env);
+ (void)__rep_send_message(env,
+ DB_EID_BROADCAST, REP_NEWMASTER, &lsn, NULL, 0, 0);
+ if (IS_USING_LEASES(env))
+ ret = __rep_lease_refresh(env);
+ if (ret == 0)
+ ret = DB_REP_IGNORE;
+ goto envleave;
+ }
+ REP_SYSTEM_LOCK(env);
+
+ /*
+ * If leases are configured, wait for them to expire, and
+ * see if we can discover the master while waiting.
+ */
+ if (IS_USING_LEASES(env) &&
+ (timeout = __rep_lease_waittime(env)) != 0) {
+ FLD_SET(rep->elect_flags, REP_E_PHASE0);
+ egen = rep->egen;
+ REP_SYSTEM_UNLOCK(env);
+ VPRINT(env, (env, DB_VERB_REP_ELECT,
+ "PHASE0 waittime from rep_lease_waittime: %lu",
+ (u_long)timeout));
+ (void)__rep_send_message(env, DB_EID_BROADCAST,
+ REP_MASTER_REQ, NULL, NULL, 0, 0);
+
+ /*
+ * The only possible non-zero return from __rep_wait() is a
+ * panic for a mutex failure. So the state of the PHASE0 flag
+ * doesn't matter much. If that changes in the future, it is
+ * still best not to clear the flag after an error, because
+ * another thread might be in the middle of its PHASE0 wait (and
+ * not getting an error), so we wouldn't want to cut short its
+ * wait. If there isn't another concurrent thread, the worst
+ * that would happen would be that we would leave the flag set,
+ * until the next time we came through here and completed a
+ * wait. Note that the code here is the only place where we
+ * check this flag.
+ */
+ if ((ret = __rep_wait(env,
+ &timeout, 0, egen, REP_E_PHASE0)) != 0)
+ goto envleave;
+ REP_SYSTEM_LOCK(env);
+ repflags = rep->elect_flags;
+ FLD_CLR(rep->elect_flags, REP_E_PHASE0);
+ /*
+ * If any other thread cleared PHASE0 while we were waiting,
+ * then we're done. Either we heard from a master, or some
+ * other thread completed its PHASE0 wait.
+ *
+ * Or, we could have waited long enough for our lease grant to
+ * expire. Check it to make sure.
+ */
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+ "after PHASE0 wait, flags 0x%x, elect_flags 0x%x",
+ rep->flags, rep->elect_flags));
+ if (!FLD_ISSET(repflags, REP_E_PHASE0) ||
+ __rep_islease_granted(env) || egen != rep->egen) {
+ VPRINT(env, (env, DB_VERB_REP_ELECT,
+ "PHASE0 Done: repflags 0x%x, egen %d rep->egen %d, lease_granted %d",
+ repflags, egen, rep->egen, __rep_islease_granted(env)));
+ goto unlck_lv;
+ }
+ F_SET(rep, REP_F_LEASE_EXPIRED);
+ }
+
+ /*
+ * After acquiring the mutex, and possibly waiting for leases to
+ * expire, without the mutex, we need to recheck our state. It
+ * may have changed. If we are now master, we're done.
+ */
+ if (F_ISSET(rep, REP_F_MASTER)) {
+ REP_SYSTEM_UNLOCK(env);
+ goto master;
+ }
+ if ((ret = __rep_elect_init(env, nsites, ack,
+ &in_progress, &orig_tally)) != 0)
+ goto unlck_lv;
+ /*
+ * If another thread is in the middle of an election we
+ * just quietly return and not interfere.
+ */
+ if (in_progress) {
+ ret = DB_REP_IGNORE;
+ goto unlck_lv;
+ }
+
+ /*
+ * Count threads in the guts of rep_elect, so that we only clear
+ * lockouts when the last thread is finishing. The "guts" start here,
+ * and do not include the above test where we "quietly return" via
+ * envleave.
+ *
+ * Closely associated with that is the notion that the current thread
+ * "owns" the right to process the election at the current egen. We set
+ * the local variable "egen" now to "our" egen; if rep->egen ever
+ * advances "out from under us" we know it's time to yield to a new
+ * generation. Our egen value was vetted in __rep_elect_init(), and we
+ * have not dropped the mutex since then.
+ *
+ * Other than occasionally checking that "our" egen still matches the
+ * current latest rep->egen, there should be no use of rep->egen in this
+ * function after this point.
+ */
+ rep->elect_th++;
+ egen = rep->egen;
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+ "Election thread owns egen %lu", (u_long)egen));
+
+ priority = lp->persist.version != DB_LOGVERSION ? 0 : realpri;
+#ifdef CONFIG_TEST
+ /*
+ * This allows us to unit test the ELECTABLE flag simply by
+ * using the priority values.
+ */
+ if (priority > 0 && priority <= 5) {
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+ "Artificially setting priority 0 (ELECTABLE) for CONFIG_TEST mode"));
+ DB_ASSERT(env, ctlflags == REPCTL_ELECTABLE);
+ priority = 0;
+ }
+#endif
+ __os_gettime(env, &rep->etime, 1);
+
+ /*
+ * Default to the normal timeout unless the user configured
+ * a full election timeout and we think we need a full election.
+ */
+ rep->full_elect = 0;
+ timeout = rep->elect_timeout;
+ if (!F_ISSET(rep, REP_F_GROUP_ESTD) && rep->full_elect_timeout != 0) {
+ rep->full_elect = 1;
+ timeout = rep->full_elect_timeout;
+ }
+
+ /*
+ * We need to lock out applying incoming log records during
+ * the election. We need to use a special rep_lockout_apply
+ * instead of rep_lockout_msg because we do not want to
+ * lock out all incoming messages, like other VOTEs!
+ */
+ if ((ret = __rep_lockout_apply(env, rep, 0)) != 0)
+ goto err_locked;
+ if ((ret = __rep_lockout_archive(env, rep)) != 0)
+ goto err_locked;
+
+ /*
+ * Since the lockout step (above) could have dropped the mutex, we must
+ * check to see if we still own the right to proceed with the election
+ * at this egen.
+ */
+ if (rep->egen != egen) {
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+ "Found egen %lu, abandon my election at egen %lu",
+ (u_long)rep->egen, (u_long)egen));
+ goto err_locked;
+ }
+
+ /* Generate a randomized tiebreaker value. */
+ __os_unique_id(env, &tiebreaker);
+
+ FLD_SET(rep->elect_flags, REP_E_PHASE1);
+ FLD_CLR(rep->elect_flags, REP_E_TALLY);
+ /*
+ * We made sure that leases were expired before starting the
+ * election, but an existing master may be slow in responding.
+ * If, during lockout, acquiring mutexes, etc, the client has now
+ * re-granted its lease, we're done - a master exists.
+ */
+ if (IS_USING_LEASES(env) &&
+ __rep_islease_granted(env)) {
+ ret = 0;
+ goto err_locked;
+ }
+
+ /*
+ * If we are in the middle of recovery or internal
+ * init, we participate, but we set our priority to 0
+ * and turn off REPCTL_ELECTABLE. If we are not in such
+ * a state, we can be elected (i.e. we are not in an
+ * inconsistent state).
+ */
+ INIT_LSN(lsn);
+ if (ISSET_LOCKOUT_BDB(rep) || IN_INTERNAL_INIT(rep) ||
+ rep->sync_state == SYNC_UPDATE) {
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+ "Setting priority 0, unelectable, due to internal init/recovery"));
+ priority = 0;
+ ctlflags = 0;
+ data_gen = 0;
+ } else {
+ /*
+ * Use the last commit record as the LSN in the vote.
+ */
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ goto err_locked;
+ /*
+ * If we've walked back and there are no commit records,
+ * then reset LSN to INIT_LSN.
+ */
+ if ((ret = __rep_log_backup(env,
+ logc, &lsn, REP_REC_COMMIT)) == DB_NOTFOUND) {
+ INIT_LSN(lsn);
+ ret = 0;
+ }
+ if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err_locked;
+ if ((ret = __rep_get_datagen(env, &data_gen)) != 0)
+ goto err_locked;
+ }
+
+ /*
+ * We are about to participate at this egen. We must
+ * write out the next egen before participating in this one
+ * so that if we crash we can never participate in this egen
+ * again.
+ */
+ if ((ret = __rep_write_egen(env, rep, egen + 1)) != 0)
+ goto err_locked;
+
+ /* Tally our own vote */
+ if ((ret = __rep_tally(env, rep, rep->eid, &rep->sites, egen, 1))
+ != 0) {
+ /*
+ * __rep_tally is telling us that this vote is a duplicate. But
+ * this is our own vote in this case, and that should be
+ * impossible for a given egen.
+ */
+ DB_ASSERT(env, ret != DB_REP_IGNORE);
+ goto err_locked;
+ }
+ __rep_cmp_vote(env, rep, rep->eid, &lsn, priority, rep->gen, data_gen,
+ tiebreaker, ctlflags);
+
+ RPRINT(env, (env, DB_VERB_REP_ELECT, "Beginning an election"));
+
+ /*
+ * Now send vote, remembering the details in case we need them later in
+ * order to send out a duplicate VOTE1. We must save the nsites and
+ * nvotes values that we originally send in the VOTE1 message, separate
+ * from rep->nsites and rep->nvotes, since the latter can change when we
+ * receive a VOTE1 from another site.
+ */
+ send_vote = DB_EID_INVALID;
+ done = IS_PHASE1_DONE(rep);
+ rep->vote1.lsn = lsn;
+ rep->vote1.nsites = nsites;
+ rep->vote1.nvotes = ack;
+ rep->vote1.priority = priority;
+ rep->vote1.tiebreaker = tiebreaker;
+ rep->vote1.ctlflags = ctlflags;
+ rep->vote1.data_gen = data_gen;
+ REP_SYSTEM_UNLOCK(env);
+
+ __rep_send_vote(env, &lsn, nsites, ack, priority, tiebreaker, egen,
+ data_gen, DB_EID_BROADCAST, REP_VOTE1, ctlflags);
+ DB_ENV_TEST_RECOVERY(env, DB_TEST_ELECTVOTE1, ret, NULL);
+ if (done) {
+ REP_SYSTEM_LOCK(env);
+ goto vote;
+ }
+
+ ret = __rep_wait(env, &timeout, rep->full_elect, egen, REP_E_PHASE1);
+ REP_SYSTEM_LOCK(env);
+ if (ret != 0)
+ goto err_locked;
+ if (rep->egen > egen)
+ /*
+ * For one reason or another, this election cycle is over; it
+ * doesn't matter why.
+ */
+ goto out;
+
+ if (FLD_ISSET(rep->elect_flags, REP_E_PHASE2)) {
+ /* Received enough votes while waiting to move us to phase 2. */
+ REP_SYSTEM_UNLOCK(env);
+ goto phase2;
+ }
+
+ /*
+ * If we got here, we haven't heard from everyone, but we've
+ * run out of time, so it's time to decide if we have enough
+ * votes to pick a winner and if so, to send out a vote to
+ * the winner.
+ */
+ if (rep->sites >= rep->nvotes) {
+vote:
+ /* We think we've seen enough to cast a vote. */
+ send_vote = rep->winner;
+ /*
+ * See if we won. This will make sure we
+ * don't count ourselves twice if we're racing
+ * with incoming votes.
+ */
+ if (rep->winner == rep->eid) {
+ if ((ret =__rep_tally(env,
+ rep, rep->eid, &rep->votes, egen, 2)) != 0 &&
+ ret != DB_REP_IGNORE)
+ goto err_locked;
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+ "Counted my vote %d", rep->votes));
+ }
+ FLD_SET(rep->elect_flags, REP_E_PHASE2);
+ FLD_CLR(rep->elect_flags, REP_E_PHASE1);
+ }
+ if (send_vote == DB_EID_INVALID) {
+ /* We do not have enough votes to elect. */
+ if (rep->sites >= rep->nvotes)
+ __db_errx(env, DB_STR_A("3533",
+ "No electable site found: recvd %d of %d votes from %d sites",
+ "%d %d %d"), rep->sites, rep->nvotes, rep->nsites);
+ else
+ __db_errx(env, DB_STR_A("3534",
+ "Not enough votes to elect: recvd %d of %d from %d sites",
+ "%d %d %d"), rep->sites, rep->nvotes, rep->nsites);
+ ret = DB_REP_UNAVAIL;
+ goto err_locked;
+ }
+ REP_SYSTEM_UNLOCK(env);
+
+ /*
+ * We have seen enough vote1's. Now we need to wait
+ * for all the vote2's.
+ */
+ if (send_vote != rep->eid) {
+ RPRINT(env, (env, DB_VERB_REP_ELECT, "Sending vote"));
+ __rep_send_vote(env, NULL, 0, 0, 0, 0, egen, 0,
+ send_vote, REP_VOTE2, 0);
+ /*
+ * If we are NOT the new master we want to send
+ * our vote to the winner, and wait longer. The
+ * reason is that the winner may be "behind" us
+ * in the election waiting and if the master is
+ * down, the winner will wait the full timeout
+ * and we want to give the winner enough time to
+ * process all the votes. Otherwise we could
+ * incorrectly return DB_REP_UNAVAIL and start a
+ * new election before the winner can declare
+ * itself.
+ */
+ timeout = timeout * 2;
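+ /* E.g., a 2-second phase timeout becomes a 4-second wait here. */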
+ }
+
+phase2:
+ if (I_HAVE_WON(rep, rep->winner)) {
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+ "Skipping phase2 wait: already got %d votes", rep->votes));
+ REP_SYSTEM_LOCK(env);
+ goto i_won;
+ }
+ ret = __rep_wait(env, &timeout, rep->full_elect, egen, REP_E_PHASE2);
+ REP_SYSTEM_LOCK(env);
+ /*
+ * Since at "err_lock" we're expected to have the lock, it's convenient
+ * to acquire it before testing "ret" here, since we need it anyway for
+ * the following stuff.
+ */
+ if (ret != 0)
+ goto err_locked;
+ if (rep->egen > egen || !IN_ELECTION(rep))
+ goto out;
+
+ /* We must have timed out. */
+ ret = DB_REP_UNAVAIL;
+
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+ "After phase 2: votes %d, nvotes %d, nsites %d",
+ rep->votes, rep->nvotes, rep->nsites));
+
+ if (I_HAVE_WON(rep, rep->winner)) {
+i_won: __rep_elect_master(env, rep);
+ ret = 0;
+ elected = 1;
+ }
+err_locked:
+ /*
+ * If we get here because of a non-election error, then we did not tally
+ * our vote. In that case we do not want to discard all known election
+ * info.
+ */
+ if (ret == 0 || ret == DB_REP_UNAVAIL)
+ __rep_elect_done(env, rep);
+ else if (orig_tally)
+ FLD_SET(rep->elect_flags, orig_tally);
+
+#ifdef CONFIG_TEST
+ if (0) {
+DB_TEST_RECOVERY_LABEL
+ REP_SYSTEM_LOCK(env);
+ }
+#endif
+
+out:
+ /*
+ * We're leaving, so decrement thread count. If it's still >0 after
+ * that, another thread has come along to handle a later egen. Only the
+ * last thread to come through here should clear the lockouts.
+ */
+ need_req = 0;
+ DB_ASSERT(env, rep->elect_th > 0);
+ rep->elect_th--;
+ if (rep->elect_th == 0) {
+ need_req = F_ISSET(rep, REP_F_SKIPPED_APPLY) &&
+ !I_HAVE_WON(rep, rep->winner);
+ FLD_CLR(rep->lockout_flags, REP_LOCKOUT_APPLY);
+ F_CLR(rep, REP_F_SKIPPED_APPLY);
+ }
+ /*
+ * Only clear archiving lockout if the election failed. If
+ * it succeeded, we keep archiving disabled until we either
+ * become master or complete synchronization with a master.
+ */
+ if (ret != 0 && rep->elect_th == 0)
+ FLD_CLR(rep->lockout_flags, REP_LOCKOUT_ARCHIVE);
+ REP_SYSTEM_UNLOCK(env);
+ /*
+ * If we skipped any log records, request them now.
+ */
+ if (need_req && (t_ret = __rep_resend_req(env, 0)) != 0 &&
+ (ret == 0 || ret == DB_REP_UNAVAIL || ret == DB_REP_IGNORE))
+ ret = t_ret;
+
+ /* Note that "elected" implies ret cannot be DB_REP_UNAVAIL here. */
+ if (elected) {
+ /*
+ * The only way ret can be non-zero is if __rep_resend_req()
+ * failed. So we don't have to check for UNAVAIL and IGNORE in
+ * deciding whether we're overwriting ret, as we did above.
+ */
+ DB_ASSERT(env, ret != DB_REP_UNAVAIL && ret != DB_REP_IGNORE);
+ if ((t_ret = __rep_fire_elected(env, rep, egen)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ }
+
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+ "%s %d, e_th %lu, egen %lu, flag 0x%lx, e_fl 0x%lx, lo_fl 0x%lx",
+ "Ended election with ", ret,
+ (u_long) rep->elect_th, (u_long)rep->egen,
+ (u_long)rep->flags, (u_long)rep->elect_flags,
+ (u_long)rep->lockout_flags));
+
+ if (0) {
+unlck_lv: REP_SYSTEM_UNLOCK(env);
+ }
+envleave:
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __rep_vote1 --
+ * Handle incoming vote1 message on a client.
+ *
+ * PUBLIC: int __rep_vote1 __P((ENV *, __rep_control_args *, DBT *, int));
+ */
+int
+__rep_vote1(env, rp, rec, eid)
+ ENV *env;
+ __rep_control_args *rp;
+ DBT *rec;
+ int eid;
+{
+ DBT data_dbt;
+ DB_LOG *dblp;
+ DB_LSN lsn;
+ DB_REP *db_rep;
+ LOG *lp;
+ REP *rep;
+ REP_OLD_VOTE_INFO *ovi;
+ VOTE1_CONTENT vote1;
+ __rep_egen_args egen_arg;
+ __rep_vote_info_v5_args tmpvi5;
+ __rep_vote_info_args tmpvi, *vi;
+ u_int32_t egen;
+ int elected, master, resend, ret;
+ u_int8_t buf[__REP_MAXMSG_SIZE];
+ size_t len;
+
+ COMPQUIET(egen, 0);
+
+ elected = resend = ret = 0;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ if (F_ISSET(rep, REP_F_MASTER)) {
+ RPRINT(env, (env, DB_VERB_REP_ELECT, "Master received vote"));
+ LOG_SYSTEM_LOCK(env);
+ lsn = lp->lsn;
+ LOG_SYSTEM_UNLOCK(env);
+ (void)__rep_send_message(env,
+ DB_EID_BROADCAST, REP_NEWMASTER, &lsn, NULL, 0, 0);
+ return (ret);
+ }
+
+ /*
+ * In 4.7 we changed to having fixed sized u_int32_t's from
+ * non-fixed 'int' fields in the vote structure.
+ */
+ if (rp->rep_version < DB_REPVERSION_47) {
+ ovi = (REP_OLD_VOTE_INFO *)rec->data;
+ tmpvi.egen = ovi->egen;
+ tmpvi.nsites = (u_int32_t)ovi->nsites;
+ tmpvi.nvotes = (u_int32_t)ovi->nvotes;
+ tmpvi.priority = (u_int32_t)ovi->priority;
+ tmpvi.tiebreaker = ovi->tiebreaker;
+ tmpvi.data_gen = 0;
+ } else if (rp->rep_version < DB_REPVERSION_52) {
+ if ((ret = __rep_vote_info_v5_unmarshal(env,
+ &tmpvi5, rec->data, rec->size, NULL)) != 0)
+ return (ret);
+ tmpvi.egen = tmpvi5.egen;
+ tmpvi.nsites = tmpvi5.nsites;
+ tmpvi.nvotes = tmpvi5.nvotes;
+ tmpvi.priority = tmpvi5.priority;
+ tmpvi.tiebreaker = tmpvi5.tiebreaker;
+ tmpvi.data_gen = 0;
+ } else
+ if ((ret = __rep_vote_info_unmarshal(env,
+ &tmpvi, rec->data, rec->size, NULL)) != 0)
+ return (ret);
+ vi = &tmpvi;
+ REP_SYSTEM_LOCK(env);
+
+ /*
+ * If we get a vote from a later election gen, we
+ * clear everything from the current one, and we'll
+ * start over by tallying it. If we get an old vote,
+ * send an ALIVE to the old participant.
+ */
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+ "Received vote1 egen %lu, egen %lu",
+ (u_long)vi->egen, (u_long)rep->egen));
+ if (vi->egen < rep->egen) {
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+ "Received old vote %lu, egen %lu, ignoring vote1",
+ (u_long)vi->egen, (u_long)rep->egen));
+ egen_arg.egen = rep->egen;
+ REP_SYSTEM_UNLOCK(env);
+ if (rep->version < DB_REPVERSION_47)
+ DB_INIT_DBT(data_dbt, &egen_arg.egen,
+ sizeof(egen_arg.egen));
+ else {
+ if ((ret = __rep_egen_marshal(env,
+ &egen_arg, buf, __REP_EGEN_SIZE, &len)) != 0)
+ return (ret);
+ DB_INIT_DBT(data_dbt, buf, len);
+ }
+ (void)__rep_send_message(env,
+ eid, REP_ALIVE, &rp->lsn, &data_dbt, 0, 0);
+ return (0);
+ }
+ if (vi->egen > rep->egen) {
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+ "Received VOTE1 from egen %lu, my egen %lu",
+ (u_long)vi->egen, (u_long)rep->egen));
+ /*
+ * Terminate an election that may be in progress at the old
+ * egen. Whether or not there was one, this call will result in
+ * HOLDELECTION (assuming no unexpected failures crop up).
+ */
+ __rep_elect_done(env, rep);
+ rep->egen = vi->egen;
+ }
+
+ /*
+ * If this site (sender of the VOTE1) is the first to the party, simply
+ * initialize values from the message. Otherwise, see if the site knows
+ * about more sites, and/or requires more votes, than we do.
+ */
+ if (!IN_ELECTION_TALLY(rep)) {
+ FLD_SET(rep->elect_flags, REP_E_TALLY);
+ rep->nsites = vi->nsites;
+ rep->nvotes = vi->nvotes;
+ } else {
+ if (vi->nsites > rep->nsites)
+ rep->nsites = vi->nsites;
+ if (vi->nvotes > rep->nvotes)
+ rep->nvotes = vi->nvotes;
+ }
+
+ /*
+ * Ignore vote1's if we're in phase 2.
+ */
+ if (FLD_ISSET(rep->elect_flags, REP_E_PHASE2)) {
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+ "In phase 2, ignoring vote1"));
+ goto err;
+ }
+
+ /*
+ * Record this vote. If we're ignoring it, there's nothing more we need
+ * to do.
+ */
+ if ((ret = __rep_tally(env, rep, eid, &rep->sites, vi->egen, 1)) != 0) {
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+ "Tally returned %d, sites %d", ret, rep->sites));
+ if (ret == DB_REP_IGNORE)
+ ret = 0;
+ goto err;
+ }
+
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+"Incoming vote: (eid)%d (pri)%lu %s (gen)%lu (egen)%lu (datagen)%lu [%lu,%lu]",
+ eid, (u_long)vi->priority,
+ F_ISSET(rp, REPCTL_ELECTABLE) ? "ELECTABLE" : "",
+ (u_long)rp->gen, (u_long)vi->egen, (u_long)vi->data_gen,
+ (u_long)rp->lsn.file, (u_long)rp->lsn.offset));
+ if (rep->sites > 1)
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+"Existing vote: (eid)%d (pri)%lu (gen)%lu (datagen)%lu (sites)%d [%lu,%lu]",
+ rep->winner, (u_long)rep->w_priority,
+ (u_long)rep->w_gen, (u_long)rep->w_datagen, rep->sites,
+ (u_long)rep->w_lsn.file,
+ (u_long)rep->w_lsn.offset));
+
+ __rep_cmp_vote(env, rep, eid, &rp->lsn, vi->priority,
+ rp->gen, vi->data_gen, vi->tiebreaker, rp->flags);
+ /*
+	 * If we get a vote and we're not yet "in an election" at the proper
+	 * egen, recording the vote (above) is all we need to do. If we are
+	 * in an election, check whether we ought to send an extra VOTE1. We
+	 * know the VOTE1 we received is not a duplicate, because of the
+	 * successful return from __rep_tally() above.
+ */
+ if (IN_ELECTION(rep)) {
+ /*
+ * If we're doing a full election, and we're into phase 1 (no
+ * REP_E_TALLY), then resend, in case the sender of this VOTE1
+ * missed our VOTE1.
+ */
+ if (rep->full_elect &&
+ FLD_ISSET((rep)->elect_flags, REP_E_PHASE1)) {
+ resend = 1;
+ vote1 = rep->vote1;
+ egen = rep->egen;
+ }
+ } else {
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+ "Not in election, but received vote1 0x%x 0x%x",
+ rep->flags, rep->elect_flags));
+ ret = DB_REP_HOLDELECTION;
+ goto err;
+ }
+
+ master = rep->winner;
+ lsn = rep->w_lsn;
+ if (IS_PHASE1_DONE(rep)) {
+ RPRINT(env, (env, DB_VERB_REP_ELECT, "Phase1 election done"));
+ RPRINT(env, (env, DB_VERB_REP_ELECT, "Voting for %d%s",
+ master, master == rep->eid ? "(self)" : ""));
+ egen = rep->egen;
+ FLD_SET(rep->elect_flags, REP_E_PHASE2);
+ FLD_CLR(rep->elect_flags, REP_E_PHASE1);
+ if (master == rep->eid) {
+			if ((ret = __rep_tally(env, rep, rep->eid,
+ &rep->votes, egen, 2)) != 0 &&
+ ret != DB_REP_IGNORE)
+ goto err;
+ ret = 0;
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+ "After phase 1 done: counted vote %d of %d",
+ rep->votes, rep->nvotes));
+ if (I_HAVE_WON(rep, rep->winner)) {
+ __rep_elect_master(env, rep);
+ elected = 1;
+ }
+ goto err;
+ }
+ REP_SYSTEM_UNLOCK(env);
+
+ /* Vote for someone else. */
+ __rep_send_vote(env, NULL, 0, 0, 0, 0, egen, 0,
+ master, REP_VOTE2, 0);
+ } else
+err: REP_SYSTEM_UNLOCK(env);
+
+ /*
+ * Note that if we're elected, there's no need for resending our VOTE1,
+ * even if we thought it might have been necessary a moment ago.
+ */
+ if (elected)
+ ret = __rep_fire_elected(env, rep, egen);
+ else if (resend)
+ __rep_send_vote(env,
+ &vote1.lsn, vote1.nsites, vote1.nvotes, vote1.priority,
+ vote1.tiebreaker, egen, vote1.data_gen,
+ eid, REP_VOTE1, vote1.ctlflags);
+ return (ret);
+}
+
+/*
+ * __rep_vote2 --
+ * Handle incoming vote2 message on a client.
+ *
+ * PUBLIC: int __rep_vote2 __P((ENV *, __rep_control_args *, DBT *, int));
+ */
+int
+__rep_vote2(env, rp, rec, eid)
+ ENV *env;
+ __rep_control_args *rp;
+ DBT *rec;
+ int eid;
+{
+ DB_LOG *dblp;
+ DB_LSN lsn;
+ DB_REP *db_rep;
+ LOG *lp;
+ REP *rep;
+ REP_OLD_VOTE_INFO *ovi;
+ __rep_vote_info_args tmpvi, *vi;
+ u_int32_t egen;
+ int ret;
+
+ ret = 0;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ RPRINT(env, (env, DB_VERB_REP_ELECT, "We received a vote%s",
+ F_ISSET(rep, REP_F_MASTER) ? " (master)" : ""));
+ if (F_ISSET(rep, REP_F_MASTER)) {
+ LOG_SYSTEM_LOCK(env);
+ lsn = lp->lsn;
+ LOG_SYSTEM_UNLOCK(env);
+ (void)__rep_send_message(env,
+ DB_EID_BROADCAST, REP_NEWMASTER, &lsn, NULL, 0, 0);
+ if (IS_USING_LEASES(env))
+ ret = __rep_lease_refresh(env);
+ return (ret);
+ }
+
+ REP_SYSTEM_LOCK(env);
+ egen = rep->egen;
+
+ /*
+ * We might be the last to the party and we haven't had
+ * time to tally all the vote1's, but others have and
+ * decided we're the winner. So, if we're in the process
+ * of tallying sites, keep the vote so that when our
+ * election thread catches up we'll have the votes we
+ * already received.
+ */
+ /*
+ * In 4.7 we changed to having fixed sized u_int32_t's from
+ * non-fixed 'int' fields in the vote structure.
+ */
+ if (rp->rep_version < DB_REPVERSION_47) {
+ ovi = (REP_OLD_VOTE_INFO *)rec->data;
+ tmpvi.egen = ovi->egen;
+ tmpvi.nsites = (u_int32_t)ovi->nsites;
+ tmpvi.nvotes = (u_int32_t)ovi->nvotes;
+ tmpvi.priority = (u_int32_t)ovi->priority;
+ tmpvi.tiebreaker = ovi->tiebreaker;
+ } else
+ if ((ret = __rep_vote_info_unmarshal(env,
+ &tmpvi, rec->data, rec->size, NULL)) != 0)
+ return (ret);
+ vi = &tmpvi;
+ if (!IN_ELECTION_TALLY(rep) && vi->egen >= rep->egen) {
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+ "Not in election gen %lu, at %lu, got vote",
+ (u_long)vi->egen, (u_long)rep->egen));
+ ret = DB_REP_HOLDELECTION;
+ goto err;
+ }
+
+ /*
+ * Record this vote. In a VOTE2, the only valid entry
+ * in the vote information is the election generation.
+ *
+ * There are several things which can go wrong that we
+ * need to account for:
+ * 1. If we receive a latent VOTE2 from an earlier election,
+ * we want to ignore it.
+ * 2. If we receive a VOTE2 from a site from which we never
+	 * received a VOTE1, we want to record it: we may simply be
+	 * processing messages out of order, or its VOTE1 to us was lost,
+	 * but that site received all the votes it needed to send a VOTE2.
+ * 3. If we have received a duplicate VOTE2 from this election
+ * from the same site we want to ignore it.
+ * 4. If this is from the current election and someone is
+ * really voting for us, then we finally get to record it.
+ */
+ /*
+ * Case 1.
+ */
+ if (vi->egen != rep->egen) {
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+ "Bad vote egen %lu. Mine %lu",
+ (u_long)vi->egen, (u_long)rep->egen));
+ ret = 0;
+ goto err;
+ }
+
+ /*
+ * __rep_tally takes care of cases 2, 3 and 4.
+ */
+ if ((ret = __rep_tally(env, rep, eid, &rep->votes, vi->egen, 2)) != 0) {
+ if (ret == DB_REP_IGNORE)
+ ret = 0;
+ goto err;
+ }
+ RPRINT(env, (env, DB_VERB_REP_ELECT, "Counted vote %d of %d",
+ rep->votes, rep->nvotes));
+ if (I_HAVE_WON(rep, rep->winner)) {
+ __rep_elect_master(env, rep);
+ ret = DB_REP_NEWMASTER;
+ }
+
+err: REP_SYSTEM_UNLOCK(env);
+ if (ret == DB_REP_NEWMASTER)
+ ret = __rep_fire_elected(env, rep, egen);
+ return (ret);
+}
+
+/*
+ * __rep_tally --
+ * Handle incoming vote message on a client. This will record either a
+ * VOTE1 or a VOTE2, depending on the "phase" value the caller passed in.
+ *
+ * This function will return:
+ * 0 if we successfully tally the vote;
+ * DB_REP_IGNORE if the vote is properly ignored;
+ * (anything else) in case of an unexpected error.
+ *
+ * !!! Caller must hold REP_SYSTEM_LOCK.
+ */
+static int
+__rep_tally(env, rep, eid, countp, egen, phase)
+ ENV *env;
+ REP *rep;
+ int eid;
+ u_int32_t *countp;
+ u_int32_t egen;
+ int phase;
+{
+ REP_VTALLY *tally, *vtp;
+ u_int32_t i;
+ int ret;
+
+ if (rep->nsites > rep->asites &&
+ (ret = __rep_grow_sites(env, rep->nsites)) != 0) {
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+ "Grow sites returned error %d", ret));
+ return (ret);
+ }
+ if (phase == 1)
+ tally = R_ADDR(env->reginfo, rep->tally_off);
+ else
+ tally = R_ADDR(env->reginfo, rep->v2tally_off);
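+	/*
+	 * Each tally is an array of REP_VTALLY (eid, egen) entries in the
+	 * region, of which the first *countp are in use: VOTE1's are
+	 * recorded at tally_off, VOTE2's at v2tally_off.
+	 */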
+ vtp = &tally[0];
+ for (i = 0; i < *countp;) {
+ /*
+ * Ignore votes from earlier elections (i.e. we've heard
+ * from this site in this election, but its vote from an
+ * earlier election got delayed and we received it now).
+		 * However, if we happened to hear an earlier vote
+		 * and recorded it, and we're now hearing from a later
+		 * election, we want to keep the updated one. Note that
+ * updating the entry will not increase the count.
+ * Also ignore votes that are duplicates.
+ */
+ if (vtp->eid == eid) {
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+ "Tally found[%d] (%d, %lu), this vote (%d, %lu)",
+ i, vtp->eid, (u_long)vtp->egen,
+ eid, (u_long)egen));
+ if (vtp->egen >= egen)
+ return (DB_REP_IGNORE);
+ else {
+ vtp->egen = egen;
+ return (0);
+ }
+ }
+ i++;
+ vtp = &tally[i];
+ }
+
+ /*
+ * If we get here, we have a new voter we haven't seen before. Tally
+ * this vote.
+ */
+ RPRINT(env, (env, DB_VERB_REP_ELECT, "Tallying VOTE%d[%d] (%d, %lu)",
+ phase, i, eid, (u_long)egen));
+
+ vtp->eid = eid;
+ vtp->egen = egen;
+ (*countp)++;
+ return (0);
+}
+
+/*
+ * __rep_cmp_vote --
+ * Compare incoming vote1 message on a client. Called with the db_rep
+ * mutex held.
+ *
+ */
+static void
+__rep_cmp_vote(env, rep, eid, lsnp, priority, gen, data_gen, tiebreaker, flags)
+ ENV *env;
+ REP *rep;
+ int eid;
+ DB_LSN *lsnp;
+ u_int32_t priority;
+ u_int32_t data_gen, flags, gen, tiebreaker;
+{
+ int cmp, like_pri;
+
+ cmp = LOG_COMPARE(lsnp, &rep->w_lsn);
+ /*
+ * If we've seen more than one, compare us to the best so far.
+ * If we're the first, make ourselves the winner to start.
+ */
+ if (rep->sites > 1 &&
+ (priority != 0 || LF_ISSET(REPCTL_ELECTABLE))) {
+ /*
+ * Special case, if we have a mixed version group of sites,
+ * we set priority to 0, but set the ELECTABLE flag so that
+ * all sites talking at lower versions can correctly elect.
+ * If a non-zero priority comes in and current winner is
+ * zero priority (but was electable), then the non-zero
+ * site takes precedence no matter what its LSN is.
+ *
+ * Then the data_gen determines the winner. The site with
+ * the more recent generation of data wins.
+ *
+ * Then LSN is determinant only if we're comparing
+ * like-styled version/priorities at the same data_gen. I.e.
+ * both with 0/ELECTABLE priority or both with non-zero
+		 * priority. The actual priority value decides if LSNs
+		 * are equal, and the tiebreaker decides if both are equal.
+ */
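+		/*
+		 * A hypothetical ordering example: a vote with priority 50
+		 * at data_gen 3 beats one with priority 100 at data_gen 2;
+		 * at equal data_gen the larger LSN wins; priority and then
+		 * tiebreaker break exact LSN ties.
+		 */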
+ /*
+ * Make note if we're comparing the same types of priorities
+ * that indicate electability or not. We know we are
+ * electable if we are here.
+ */
+ like_pri = (priority == 0 && rep->w_priority == 0) ||
+ (priority != 0 && rep->w_priority != 0);
+
+ if ((priority != 0 && rep->w_priority == 0) ||
+ (like_pri && data_gen > rep->w_datagen) ||
+ (like_pri && data_gen == rep->w_datagen && cmp > 0) ||
+ (cmp == 0 && (priority > rep->w_priority ||
+ (priority == rep->w_priority &&
+ (tiebreaker > rep->w_tiebreaker))))) {
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+ "Accepting new vote"));
+ rep->winner = eid;
+ rep->w_priority = priority;
+ rep->w_lsn = *lsnp;
+ rep->w_gen = gen;
+ rep->w_datagen = data_gen;
+ rep->w_tiebreaker = tiebreaker;
+ }
+ } else if (rep->sites == 1) {
+ if (priority != 0 || LF_ISSET(REPCTL_ELECTABLE)) {
+ /* Make ourselves the winner to start. */
+ rep->winner = eid;
+ rep->w_priority = priority;
+ rep->w_gen = gen;
+ rep->w_datagen = data_gen;
+ rep->w_lsn = *lsnp;
+ rep->w_tiebreaker = tiebreaker;
+ } else {
+ rep->winner = DB_EID_INVALID;
+ rep->w_priority = 0;
+ rep->w_gen = 0;
+ rep->w_datagen = 0;
+ ZERO_LSN(rep->w_lsn);
+ rep->w_tiebreaker = 0;
+ }
+ }
+}
+
+/*
+ * __rep_elect_init
+ * Initialize an election. Sets beginp non-zero if the election is
+ * already in progress; makes it 0 otherwise. Leaves it untouched if we return
+ * DB_REP_NEWMASTER.
+ *
+ * Caller holds the REP_SYSTEM mutex, and relies on us not dropping it.
+ */
+static int
+__rep_elect_init(env, nsites, nvotes, beginp, otally)
+ ENV *env;
+ u_int32_t nsites, nvotes;
+ int *beginp;
+ u_int32_t *otally;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ int ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ ret = 0;
+
+ if (otally != NULL)
+ *otally = FLD_ISSET(rep->elect_flags, REP_E_TALLY);
+
+ DB_ASSERT(env, rep->spent_egen <= rep->egen);
+ *beginp = rep->spent_egen == rep->egen;
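+	/*
+	 * spent_egen is the most recent egen at which this site began an
+	 * election; if it already equals the current egen, an election is
+	 * in progress and we must not start another.
+	 */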
+ if (!*beginp) {
+ /*
+ * Make sure that we always initialize all the election fields
+ * before putting ourselves in an election state. That means
+ * issuing calls that can fail (allocation) before setting all
+ * the variables.
+ */
+ if (nsites > rep->asites &&
+ (ret = __rep_grow_sites(env, nsites)) != 0)
+ goto err;
+ DB_ENV_TEST_RECOVERY(env, DB_TEST_ELECTINIT, ret, NULL);
+ rep->spent_egen = rep->egen;
+
+ STAT_INC(env, rep, election, rep->stat.st_elections, rep->egen);
+
+ /*
+ * If we're the first to the party, we simply set initial
+ * values: pre-existing values would be left over from previous
+ * election.
+ */
+ if (!IN_ELECTION_TALLY(rep)) {
+ rep->nsites = nsites;
+ rep->nvotes = nvotes;
+ } else {
+ if (nsites > rep->nsites)
+ rep->nsites = nsites;
+ if (nvotes > rep->nvotes)
+ rep->nvotes = nvotes;
+ }
+ }
+DB_TEST_RECOVERY_LABEL
+err:
+ return (ret);
+}
+
+/*
+ * __rep_elect_master
+ * Set up for new master from election. Must be called with
+ * the replication region mutex held.
+ */
+static void
+__rep_elect_master(env, rep)
+ ENV *env;
+ REP *rep;
+{
+ if (F_ISSET(rep, REP_F_MASTERELECT | REP_F_MASTER)) {
+ /* We've been through here already; avoid double counting. */
+ return;
+ }
+
+ F_SET(rep, REP_F_MASTERELECT);
+ STAT_INC(env, rep, election_won, rep->stat.st_elections_won, rep->egen);
+
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+ "Got enough votes to win; election done; (prev) gen %lu",
+ (u_long)rep->gen));
+}
+
+static int
+__rep_fire_elected(env, rep, egen)
+ ENV *env;
+ REP *rep;
+ u_int32_t egen;
+{
+ REP_EVENT_LOCK(env);
+ if (rep->notified_egen < egen) {
+ __rep_fire_event(env, DB_EVENT_REP_ELECTED, NULL);
+ rep->notified_egen = egen;
+ }
+ REP_EVENT_UNLOCK(env);
+ return (0);
+}
+
+/*
+ * Compute a sleep interval.
+ *
+ * The user specifies an overall timeout, but checking is cheap and the
+ * timeout may be a generous upper bound. So sleep for the smaller of .5s and
+ * timeout/10. Make sure we sleep at least 1usec if timeout < 10.
+ */
+#define SLEEPTIME(timeout) \
+ ((timeout > 5000000) ? 500000 : ((timeout >= 10) ? timeout / 10 : 1))
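+
+/*
+ * For example: a 2000000usec (2s) timeout yields 200000usec naps, a
+ * 30000000usec (30s) timeout is capped at 500000usec (.5s), and a timeout
+ * of 8usec sleeps the 1usec minimum.
+ */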
+
+/*
+ * __rep_wait --
+ *
+ * Sleep until the indicated phase is over, or the timeout expires. The phase
+ * is over when someone clears the phase flag (in the course of processing an
+ * incoming message). This could either be a normal progression from one phase
+ * to the other, or it could be due to receiving a NEWMASTER or an egen change.
+ * In all cases we simply return 0, and the caller should check the state of the
+ * world (generally under mutex protection) to decide what to do next.
+ */
+static int
+__rep_wait(env, timeoutp, full_elect, egen, flags)
+ ENV *env;
+ db_timeout_t *timeoutp;
+ int full_elect;
+ u_int32_t egen, flags;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ int done;
+ u_int32_t sleeptime, sleeptotal, timeout;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ done = 0;
+
+ timeout = *timeoutp;
+ sleeptime = SLEEPTIME(timeout);
+ sleeptotal = 0;
+ while (sleeptotal < timeout) {
+ __os_yield(env, 0, sleeptime);
+ sleeptotal += sleeptime;
+ REP_SYSTEM_LOCK(env);
+ /*
+ * Check if group membership changed while we were
+ * sleeping. Specifically we're trying for a full
+ * election and someone is telling us we're joining
+ * a previously established replication group. (This is not
+ * applicable for the phase 0 wait, which uses a completely
+ * unrelated timeout value.)
+ */
+ if (!LF_ISSET(REP_E_PHASE0) &&
+ full_elect && F_ISSET(rep, REP_F_GROUP_ESTD)) {
+ *timeoutp = rep->elect_timeout;
+ timeout = *timeoutp;
+ if (sleeptotal >= timeout)
+ done = 1;
+ else
+ sleeptime = SLEEPTIME(timeout);
+ }
+
+ if (egen != rep->egen || !FLD_ISSET(rep->elect_flags, flags))
+ done = 1;
+ REP_SYSTEM_UNLOCK(env);
+
+ if (done)
+ return (0);
+ }
+ return (0);
+}
+
+/*
+ * __rep_grow_sites --
+ * Called to allocate more space in the election tally information.
+ * Called with the rep mutex held. We need to call the region mutex, so
+ * we need to make sure that we *never* acquire those mutexes in the
+ * opposite order.
+ */
+static int
+__rep_grow_sites(env, nsites)
+ ENV *env;
+ u_int32_t nsites;
+{
+ REGENV *renv;
+ REGINFO *infop;
+ REP *rep;
+ int ret, *tally;
+ u_int32_t nalloc;
+
+ rep = env->rep_handle->region;
+
+ /*
+ * Allocate either twice the current allocation or nsites,
+ * whichever is more.
+ */
+ nalloc = 2 * rep->asites;
+ if (nalloc < nsites)
+ nalloc = nsites;
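+	/*
+	 * For example: with asites == 4 and nsites == 6, nalloc is 8; with
+	 * asites == 4 and nsites == 10, nalloc is 10.
+	 */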
+
+ infop = env->reginfo;
+ renv = infop->primary;
+ MUTEX_LOCK(env, renv->mtx_regenv);
+
+ /*
+ * We allocate 2 tally regions, one for tallying VOTE1's and
+	 * one for VOTE2's. Always grow them in tandem: if we can get
+	 * more VOTE1's, we can expect correspondingly more VOTE2's.
+ */
+ if ((ret = __env_alloc(infop,
+ (size_t)nalloc * sizeof(REP_VTALLY), &tally)) == 0) {
+ if (rep->tally_off != INVALID_ROFF)
+ __env_alloc_free(
+ infop, R_ADDR(infop, rep->tally_off));
+ rep->tally_off = R_OFFSET(infop, tally);
+ if ((ret = __env_alloc(infop,
+ (size_t)nalloc * sizeof(REP_VTALLY), &tally)) == 0) {
+ /* Success */
+ if (rep->v2tally_off != INVALID_ROFF)
+ __env_alloc_free(infop,
+ R_ADDR(infop, rep->v2tally_off));
+ rep->v2tally_off = R_OFFSET(infop, tally);
+ rep->asites = nalloc;
+ rep->nsites = nsites;
+ } else {
+ /*
+ * We were unable to allocate both. So, we must
+ * free the first one and reinitialize. If
+ * v2tally_off is valid, it is from an old
+ * allocation and we are clearing it all out due
+ * to the error.
+ */
+ if (rep->v2tally_off != INVALID_ROFF)
+ __env_alloc_free(infop,
+ R_ADDR(infop, rep->v2tally_off));
+ __env_alloc_free(infop,
+ R_ADDR(infop, rep->tally_off));
+ rep->v2tally_off = rep->tally_off = INVALID_ROFF;
+ rep->asites = 0;
+ }
+ }
+ MUTEX_UNLOCK(env, renv->mtx_regenv);
+ return (ret);
+}
+
+/*
+ * __rep_send_vote
+ * Send this site's vote for the election.
+ */
+static void
+__rep_send_vote(env, lsnp,
+ nsites, nvotes, pri, tie, egen, data_gen, eid, vtype, flags)
+ ENV *env;
+ DB_LSN *lsnp;
+ int eid;
+ u_int32_t nsites, nvotes, pri;
+ u_int32_t flags, egen, data_gen, tie, vtype;
+{
+ DB_REP *db_rep;
+ DBT vote_dbt;
+ REP *rep;
+ REP_OLD_VOTE_INFO ovi;
+ __rep_vote_info_args vi;
+ __rep_vote_info_v5_args vi5;
+ u_int8_t buf[__REP_VOTE_INFO_SIZE];
+ size_t len;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ memset(&vi, 0, sizeof(vi));
+ memset(&vote_dbt, 0, sizeof(vote_dbt));
+
+ /*
+ * In 4.7 we went to fixed sized fields. They may not be
+ * the same as the sizes in older versions. In 5.2 we
+ * added the data_gen.
+ */
+ if (rep->version < DB_REPVERSION_47) {
+ ovi.egen = egen;
+ ovi.priority = (int) pri;
+ ovi.nsites = (int) nsites;
+ ovi.nvotes = (int) nvotes;
+ ovi.tiebreaker = tie;
+ DB_INIT_DBT(vote_dbt, &ovi, sizeof(ovi));
+ } else if (rep->version < DB_REPVERSION_52) {
+ vi5.egen = egen;
+ vi5.priority = pri;
+ vi5.nsites = nsites;
+ vi5.nvotes = nvotes;
+ vi5.tiebreaker = tie;
+ (void)__rep_vote_info_v5_marshal(env, &vi5, buf,
+ __REP_VOTE_INFO_SIZE, &len);
+ DB_INIT_DBT(vote_dbt, buf, len);
+ } else {
+ vi.egen = egen;
+ vi.priority = pri;
+ vi.nsites = nsites;
+ vi.nvotes = nvotes;
+ vi.tiebreaker = tie;
+ vi.data_gen = data_gen;
+ (void)__rep_vote_info_marshal(env, &vi, buf,
+ __REP_VOTE_INFO_SIZE, &len);
+ DB_INIT_DBT(vote_dbt, buf, len);
+ }
+
+ (void)__rep_send_message(env, eid, vtype, lsnp, &vote_dbt, flags, 0);
+}
diff --git a/src/rep/rep_lease.c b/src/rep/rep_lease.c
new file mode 100644
index 00000000..047c39a7
--- /dev/null
+++ b/src/rep/rep_lease.c
@@ -0,0 +1,545 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2007, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/log.h"
+
+static void __rep_find_entry __P((ENV *, REP *, int, REP_LEASE_ENTRY **));
+
+/*
+ * __rep_update_grant -
+ * Update a client's lease grant for this perm record
+ * and send the grant to the master. Caller must
+ * hold the mtx_clientdb mutex. Timespec given is in
+ * host local format.
+ *
+ * PUBLIC: int __rep_update_grant __P((ENV *, db_timespec *));
+ */
+int
+__rep_update_grant(env, ts)
+ ENV *env;
+ db_timespec *ts;
+{
+ DBT lease_dbt;
+ DB_LOG *dblp;
+ DB_REP *db_rep;
+ LOG *lp;
+ REP *rep;
+ __rep_grant_info_args gi;
+ db_timespec mytime;
+ u_int8_t buf[__REP_GRANT_INFO_SIZE];
+ int master, ret;
+ size_t len;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ timespecclear(&mytime);
+
+ /*
+ * Get current time, and add in the (skewed) lease duration
+ * time to send the grant to the master.
+ */
+ __os_gettime(env, &mytime, 1);
+ timespecadd(&mytime, &rep->lease_duration);
+ REP_SYSTEM_LOCK(env);
+ /*
+ * If we are in an election, we cannot grant the lease.
+ * We need to check under the region mutex.
+ */
+ if (IN_ELECTION(rep)) {
+ REP_SYSTEM_UNLOCK(env);
+ return (0);
+ }
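+	/*
+	 * Move the grant expiration forward only; processing a stale
+	 * message must never shorten a grant we have already made.
+	 */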
+ if (timespeccmp(&mytime, &rep->grant_expire, >))
+ rep->grant_expire = mytime;
+ F_CLR(rep, REP_F_LEASE_EXPIRED);
+ REP_SYSTEM_UNLOCK(env);
+
+ /*
+ * Send the LEASE_GRANT message with the current lease grant
+ * no matter if we've actually extended the lease or not.
+ */
+ gi.msg_sec = (u_int32_t)ts->tv_sec;
+ gi.msg_nsec = (u_int32_t)ts->tv_nsec;
+
+ if ((ret = __rep_grant_info_marshal(env, &gi, buf,
+ __REP_GRANT_INFO_SIZE, &len)) != 0)
+ return (ret);
+ DB_INIT_DBT(lease_dbt, buf, len);
+ /*
+ * Don't send to the master if this site has zero priority because
+ * our site cannot count toward the data being safe.
+ */
+ if ((master = rep->master_id) != DB_EID_INVALID && rep->priority > 0)
+ (void)__rep_send_message(env, master, REP_LEASE_GRANT,
+ &lp->max_perm_lsn, &lease_dbt, 0, 0);
+ return (0);
+}
+
+/*
+ * __rep_islease_granted -
+ * Return 0 if this client has no outstanding lease granted.
+ * Return 1 otherwise.
+ * Caller must hold the REP_SYSTEM (region) mutex, and (rep_elect) relies
+ * on us not dropping it.
+ *
+ * PUBLIC: int __rep_islease_granted __P((ENV *));
+ */
+int
+__rep_islease_granted(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ db_timespec mytime;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ /*
+ * Get current time and compare against our granted lease.
+ */
+ timespecclear(&mytime);
+ __os_gettime(env, &mytime, 1);
+
+ return (timespeccmp(&mytime, &rep->grant_expire, <=) ? 1 : 0);
+}
+
+/*
+ * __rep_lease_table_alloc -
+ * Allocate the lease table on a master. Called with rep mutex
+ * held. We need to acquire the env region mutex, so we need to
+ * make sure we never acquire those mutexes in the opposite order.
+ *
+ * PUBLIC: int __rep_lease_table_alloc __P((ENV *, u_int32_t));
+ */
+int
+__rep_lease_table_alloc(env, nsites)
+ ENV *env;
+ u_int32_t nsites;
+{
+ REGENV *renv;
+ REGINFO *infop;
+ REP *rep;
+ REP_LEASE_ENTRY *le, *table;
+ int *lease, ret;
+ u_int32_t i;
+
+ rep = env->rep_handle->region;
+
+ infop = env->reginfo;
+ renv = infop->primary;
+ MUTEX_LOCK(env, renv->mtx_regenv);
+ /*
+ * If we have an old table from some other time, free it and
+ * allocate ourselves a new one that is known to be for
+ * the right number of sites.
+ */
+ if (rep->lease_off != INVALID_ROFF) {
+ __env_alloc_free(infop,
+ R_ADDR(infop, rep->lease_off));
+ rep->lease_off = INVALID_ROFF;
+ }
+ ret = __env_alloc(infop, (size_t)nsites * sizeof(REP_LEASE_ENTRY),
+ &lease);
+ MUTEX_UNLOCK(env, renv->mtx_regenv);
+ if (ret != 0)
+ return (ret);
+ else
+ rep->lease_off = R_OFFSET(infop, lease);
+ table = R_ADDR(infop, rep->lease_off);
+ for (i = 0; i < nsites; i++) {
+ le = &table[i];
+ le->eid = DB_EID_INVALID;
+ timespecclear(&le->start_time);
+ timespecclear(&le->end_time);
+ ZERO_LSN(le->lease_lsn);
+ }
+ return (0);
+}
+
+/*
+ * __rep_lease_grant -
+ * Handle incoming REP_LEASE_GRANT message on a master.
+ *
+ * PUBLIC: int __rep_lease_grant __P((ENV *, __rep_control_args *, DBT *, int));
+ */
+int
+__rep_lease_grant(env, rp, rec, eid)
+ ENV *env;
+ __rep_control_args *rp;
+ DBT *rec;
+ int eid;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ __rep_grant_info_args gi;
+ REP_LEASE_ENTRY *le;
+ db_timespec msg_time;
+ int ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ if ((ret = __rep_grant_info_unmarshal(env,
+ &gi, rec->data, rec->size, NULL)) != 0)
+ return (ret);
+ timespecset(&msg_time, gi.msg_sec, gi.msg_nsec);
+ le = NULL;
+
+ /*
+	 * Find this site's entry in the lease table (or the first empty
+	 * entry) and update it while holding the region mutex.
+ */
+ REP_SYSTEM_LOCK(env);
+ __rep_find_entry(env, rep, eid, &le);
+ /*
+ * We either get back this site's entry, or an empty entry
+ * that we need to initialize.
+ */
+ DB_ASSERT(env, le != NULL);
+ /*
+ * Update the entry if it is an empty entry or if the new
+	 * lease grant has a later start time than the current one.
+ */
+ VPRINT(env, (env, DB_VERB_REP_LEASE,
+ "lease_grant: grant msg time %lu %lu",
+ (u_long)msg_time.tv_sec, (u_long)msg_time.tv_nsec));
+ if (le->eid == DB_EID_INVALID ||
+ timespeccmp(&msg_time, &le->start_time, >)) {
+ le->eid = eid;
+ le->start_time = msg_time;
+ le->end_time = le->start_time;
+ timespecadd(&le->end_time, &rep->lease_duration);
+ VPRINT(env, (env, DB_VERB_REP_LEASE,
+ "lease_grant: eid %d, start %lu %lu, end %lu %lu, duration %lu %lu",
+ le->eid, (u_long)le->start_time.tv_sec, (u_long)le->start_time.tv_nsec,
+ (u_long)le->end_time.tv_sec, (u_long)le->end_time.tv_nsec,
+ (u_long)rep->lease_duration.tv_sec, (u_long)rep->lease_duration.tv_nsec));
+ }
+ /*
+ * Only update the lease table with a larger LSN value
+ * than the previous entry. This handles the case of a
+ * lagging record with a later start time, which is
+ * sometimes possible when a failed lease check resends
+ * the last permanent record.
+ */
+ if (LOG_COMPARE(&rp->lsn, &le->lease_lsn) > 0) {
+ le->lease_lsn = rp->lsn;
+ VPRINT(env, (env, DB_VERB_REP_LEASE,
+ "lease_grant: eid %d, lease_lsn [%lu][%lu]",
+ le->eid, (u_long)le->lease_lsn.file,
+ (u_long)le->lease_lsn.offset));
+ }
+ REP_SYSTEM_UNLOCK(env);
+ return (0);
+}
+
+/*
+ * Find the entry for the given EID. Or the first empty one.
+ */
+static void
+__rep_find_entry(env, rep, eid, lep)
+ ENV *env;
+ REP *rep;
+ int eid;
+ REP_LEASE_ENTRY **lep;
+{
+ REGINFO *infop;
+ REP_LEASE_ENTRY *le, *table;
+ u_int32_t i;
+
+ infop = env->reginfo;
+ table = R_ADDR(infop, rep->lease_off);
+
+ for (i = 0; i < rep->config_nsites; i++) {
+ le = &table[i];
+ /*
+ * Find either the one that matches the client's
+ * EID or the first empty one.
+ */
+ if (le->eid == eid || le->eid == DB_EID_INVALID) {
+ *lep = le;
+ return;
+ }
+ }
+ return;
+}
+
+/*
+ * __rep_lease_check -
+ * Return 0 if this master holds valid leases and can confirm
+ * its mastership. If leases are expired, an attempt is made
+ * to refresh the leases. If that fails, then return the
+ * DB_REP_LEASE_EXPIRED error to the user. No mutexes held.
+ *
+ * PUBLIC: int __rep_lease_check __P((ENV *, int));
+ */
+int
+__rep_lease_check(env, refresh)
+ ENV *env;
+ int refresh;
+{
+ DB_LOG *dblp;
+ DB_LSN lease_lsn;
+ DB_REP *db_rep;
+ LOG *lp;
+ REGINFO *infop;
+ REP *rep;
+ REP_LEASE_ENTRY *le, *table;
+ db_timespec curtime;
+ int max_tries, ret, tries;
+ u_int32_t i, min_leases, valid_leases;
+
+ infop = env->reginfo;
+ tries = 0;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ LOG_SYSTEM_LOCK(env);
+ lease_lsn = lp->max_perm_lsn;
+ LOG_SYSTEM_UNLOCK(env);
+#ifdef HAVE_STATISTICS
+ rep->stat.st_lease_chk++;
+#endif
+ /*
+	 * Set the maximum number of retries so that the total retry time
+	 * spans roughly 2x the lease timeout; if a site is waiting to
+	 * sync, it then has a chance to do so.
+ */
+ max_tries = (int)(rep->lease_timeout / (LEASE_REFRESH_USEC / 2));
+ if (max_tries < LEASE_REFRESH_MIN)
+ max_tries = LEASE_REFRESH_MIN;
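+	/*
+	 * Arithmetic check: max_tries == 2 * lease_timeout /
+	 * LEASE_REFRESH_USEC, and each retry below yields for
+	 * LEASE_REFRESH_USEC, so the retries span about twice the timeout.
+	 */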
+retry:
+ REP_SYSTEM_LOCK(env);
+ min_leases = rep->config_nsites / 2;
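+	/*
+	 * For example, with config_nsites == 5, min_leases is 2: this
+	 * master plus two lease holders is a majority of three sites.
+	 */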
+ ret = 0;
+ __os_gettime(env, &curtime, 1);
+ VPRINT(env, (env, DB_VERB_REP_LEASE,
+"%s %d of %d refresh %d min_leases %lu curtime %lu %lu, maxLSN [%lu][%lu]",
+ "lease_check: try ", tries, max_tries, refresh,
+ (u_long)min_leases, (u_long)curtime.tv_sec,
+ (u_long)curtime.tv_nsec,
+ (u_long)lease_lsn.file,
+ (u_long)lease_lsn.offset));
+ table = R_ADDR(infop, rep->lease_off);
+ for (i = 0, valid_leases = 0;
+ i < rep->config_nsites && valid_leases < min_leases; i++) {
+ le = &table[i];
+ /*
+ * Count this lease as valid if:
+ * - It is a valid entry (has an EID).
+ * - The lease has not expired.
+ * - The LSN is up to date.
+ */
+ if (le->eid != DB_EID_INVALID) {
+ VPRINT(env, (env, DB_VERB_REP_LEASE,
+ "lease_check: valid %lu eid %d, lease_lsn [%lu][%lu]",
+ (u_long)valid_leases, le->eid,
+ (u_long)le->lease_lsn.file,
+ (u_long)le->lease_lsn.offset));
+ VPRINT(env, (env, DB_VERB_REP_LEASE,
+ "lease_check: endtime %lu %lu",
+ (u_long)le->end_time.tv_sec,
+ (u_long)le->end_time.tv_nsec));
+ }
+ if (le->eid != DB_EID_INVALID &&
+ timespeccmp(&le->end_time, &curtime, >=) &&
+ LOG_COMPARE(&le->lease_lsn, &lease_lsn) >= 0)
+ valid_leases++;
+ }
+ REP_SYSTEM_UNLOCK(env);
+
+ /*
+ * Now see if we have enough.
+ */
+ VPRINT(env, (env, DB_VERB_REP_LEASE, "valid %lu, min %lu",
+ (u_long)valid_leases, (u_long)min_leases));
+ if (valid_leases < min_leases) {
+#ifdef HAVE_STATISTICS
+ rep->stat.st_lease_chk_misses++;
+#endif
+ if (!refresh || tries > max_tries)
+ ret = DB_REP_LEASE_EXPIRED;
+ else {
+ /*
+ * If we are successful, we need to recheck the leases
+ * because the lease grant messages may have raced with
+ * the PERM acknowledgement. Give the grant messages
+ * a chance to arrive and be processed.
+ */
+ if (((tries % 10) == 5 &&
+ (ret = __rep_lease_refresh(env)) == 0) ||
+ (tries % 10) != 5) {
+ /*
+ * If we were successful sending, but
+ * not in racing the message threads,
+ * then yield the processor so that
+ * the message threads get a chance
+ * to run.
+ */
+ if (tries > 0)
+ __os_yield(env, 0, LEASE_REFRESH_USEC);
+ tries++;
+#ifdef HAVE_STATISTICS
+ rep->stat.st_lease_chk_refresh++;
+#endif
+ goto retry;
+ }
+ }
+ }
+
+ if (ret == DB_REP_LEASE_EXPIRED)
+ RPRINT(env, (env, DB_VERB_REP_LEASE,
+ "lease_check: Expired. Only %lu valid",
+ (u_long)valid_leases));
+ return (ret);
+}
+
+/*
+ * __rep_lease_refresh -
+ * Find the last permanent record and send that out so that it
+ * forces clients to grant their leases.
+ *
+ * If there is no permanent record, this function cannot refresh
+ * leases. That should not happen because the master should write
+ * a checkpoint when it starts, if there is no other perm record.
+ *
+ * PUBLIC: int __rep_lease_refresh __P((ENV *));
+ */
+int
+__rep_lease_refresh(env)
+ ENV *env;
+{
+ DBT rec;
+ DB_LOGC *logc;
+ DB_LSN lsn;
+ int ret, t_ret;
+
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ return (ret);
+
+ memset(&rec, 0, sizeof(rec));
+ memset(&lsn, 0, sizeof(lsn));
+ /*
+ * Use __rep_log_backup to find the last PERM record.
+ */
+ if ((ret = __rep_log_backup(env, logc, &lsn, REP_REC_PERM)) != 0) {
+ /*
+ * If there is no PERM record, then we get DB_NOTFOUND.
+ */
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+ goto err;
+ }
+
+ if ((ret = __logc_get(logc, &lsn, &rec, DB_CURRENT)) != 0)
+ goto err;
+
+ (void)__rep_send_message(env, DB_EID_BROADCAST, REP_LOG, &lsn,
+ &rec, REPCTL_LEASE, 0);
+
+err: if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __rep_lease_expire -
+ * Proactively expire all leases granted to us.
+ * Assume the caller holds the REP_SYSTEM (region) mutex.
+ *
+ * PUBLIC: int __rep_lease_expire __P((ENV *));
+ */
+int
+__rep_lease_expire(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REGINFO *infop;
+ REP *rep;
+ REP_LEASE_ENTRY *le, *table;
+ int ret;
+ u_int32_t i;
+
+ ret = 0;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ infop = env->reginfo;
+
+ if (rep->lease_off != INVALID_ROFF) {
+ table = R_ADDR(infop, rep->lease_off);
+ /*
+ * Expire all leases forcibly. We are guaranteed that the
+ * start_time for all leases are not in the future. Therefore,
+ * set the end_time to the start_time.
+ */
+ for (i = 0; i < rep->config_nsites; i++) {
+ le = &table[i];
+ le->end_time = le->start_time;
+ }
+ }
+ return (ret);
+}
+
+/*
+ * __rep_lease_waittime -
+ * Return the amount of time remaining on a granted lease.
+ * Assume the caller holds the REP_SYSTEM (region) mutex.
+ *
+ * PUBLIC: db_timeout_t __rep_lease_waittime __P((ENV *));
+ */
+db_timeout_t
+__rep_lease_waittime(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ db_timespec exptime, mytime;
+ db_timeout_t to;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ exptime = rep->grant_expire;
+ to = 0;
+ /*
+ * If the lease has never been granted, we must wait a full
+ * lease timeout because we could be freshly rebooted after
+ * a crash and a lease could be granted from a previous
+ * incarnation of this client. However, if the lease has never
+ * been granted, and this client has already waited a full
+ * lease timeout, we know our lease cannot be granted and there
+ * is no need to wait again.
+ */
+ RPRINT(env, (env, DB_VERB_REP_LEASE,
+ "wait_time: grant_expire %lu %lu lease_to %lu",
+ (u_long)exptime.tv_sec, (u_long)exptime.tv_nsec,
+ (u_long)rep->lease_timeout));
+ if (!timespecisset(&exptime)) {
+ if (!F_ISSET(rep, REP_F_LEASE_EXPIRED))
+ to = rep->lease_timeout;
+ } else {
+ __os_gettime(env, &mytime, 1);
+ RPRINT(env, (env, DB_VERB_REP_LEASE,
+ "wait_time: mytime %lu %lu, grant_expire %lu %lu",
+ (u_long)mytime.tv_sec, (u_long)mytime.tv_nsec,
+ (u_long)exptime.tv_sec, (u_long)exptime.tv_nsec));
+ if (timespeccmp(&mytime, &exptime, <=)) {
+ /*
+ * If the current time is before the grant expiration
+ * compute the difference and return remaining grant
+ * time.
+ */
+ timespecsub(&exptime, &mytime);
+ DB_TIMESPEC_TO_TIMEOUT(to, &exptime, 1);
+ }
+ }
+ return (to);
+}
diff --git a/src/rep/rep_log.c b/src/rep/rep_log.c
new file mode 100644
index 00000000..42300685
--- /dev/null
+++ b/src/rep/rep_log.c
@@ -0,0 +1,1060 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2004, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/log.h"
+
+static int __rep_chk_newfile __P((ENV *, DB_LOGC *, REP *,
+ __rep_control_args *, int));
+static int __rep_log_split __P((ENV *, DB_THREAD_INFO *,
+ __rep_control_args *, DBT *, DB_LSN *, DB_LSN *));
+
+/*
+ * __rep_allreq --
+ * Handle a REP_ALL_REQ message.
+ *
+ * PUBLIC: int __rep_allreq __P((ENV *, __rep_control_args *, int));
+ */
+int
+__rep_allreq(env, rp, eid)
+ ENV *env;
+ __rep_control_args *rp;
+ int eid;
+{
+ DBT data_dbt, newfiledbt;
+ DB_LOGC *logc;
+ DB_LSN log_end, oldfilelsn;
+ DB_REP *db_rep;
+ REP *rep;
+ REP_BULK bulk;
+ REP_THROTTLE repth;
+ __rep_newfile_args nf_args;
+ uintptr_t bulkoff;
+ u_int32_t bulkflags, end_flag, flags, use_bulk;
+ int arch_flag, ret, t_ret;
+ u_int8_t buf[__REP_NEWFILE_SIZE];
+ size_t len;
+
+ ret = 0;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ end_flag = 0;
+ arch_flag = 0;
+
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ return (ret);
+ memset(&data_dbt, 0, sizeof(data_dbt));
+ /*
+ * If we're doing bulk transfer, allocate a bulk buffer to put our
+ * log records in. We still need to initialize the throttle info
+ * because if we encounter a log record larger than our entire bulk
+ * buffer, we need to send it as a singleton and also we want to
+ * support throttling with bulk.
+ *
+ * Use a local var so we don't need to worry if someone else turns
+ * on/off bulk in the middle of our call.
+ */
+ use_bulk = FLD_ISSET(rep->config, REP_C_BULK);
+ bulk.addr = NULL;
+ if (use_bulk && (ret = __rep_bulk_alloc(env, &bulk, eid,
+ &bulkoff, &bulkflags, REP_BULK_LOG)) != 0)
+ goto err;
+ memset(&repth, 0, sizeof(repth));
+ REP_SYSTEM_LOCK(env);
+ if ((ret = __rep_lockout_archive(env, rep)) != 0) {
+ REP_SYSTEM_UNLOCK(env);
+ goto err;
+ }
+ arch_flag = 1;
+ repth.gbytes = rep->gbytes;
+ repth.bytes = rep->bytes;
+ oldfilelsn = repth.lsn = rp->lsn;
+ repth.type = REP_LOG;
+ repth.data_dbt = &data_dbt;
+ REP_SYSTEM_UNLOCK(env);
+
+ /*
+ * Get the LSN of the end of the log, so that in our reading loop
+ * (below), we can recognize when we get there, and set the
+ * REPCTL_LOG_END flag.
+ */
+ if ((ret = __logc_get(logc, &log_end, &data_dbt, DB_LAST)) != 0) {
+ if (ret == DB_NOTFOUND && F_ISSET(rep, REP_F_MASTER))
+ ret = 0;
+ goto err;
+ }
+
+ flags = IS_ZERO_LSN(rp->lsn) ||
+ IS_INIT_LSN(rp->lsn) ? DB_FIRST : DB_SET;
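+	/*
+	 * A zero or initial LSN means the client has no log at all and
+	 * wants everything from the beginning; otherwise, position the
+	 * cursor exactly at the requested LSN.
+	 */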
+ /*
+ * We get the first item so that a client servicing requests
+ * can distinguish between not having the records and reaching
+	 * the end of its log.  Return DB_NOTFOUND if the client
+	 * cannot get the record.  Return 0 if we finish the loop,
+	 * having sent all that we have.
+ */
+ ret = __logc_get(logc, &repth.lsn, &data_dbt, flags);
+ /*
+ * If the client is asking for all records
+ * because it doesn't have any, and our first
+ * record is not in the first log file, then
+ * the client is outdated and needs to get a
+ * VERIFY_FAIL.
+ */
+ if (ret == 0 && repth.lsn.file != 1 && flags == DB_FIRST) {
+ if (F_ISSET(rep, REP_F_CLIENT))
+ ret = DB_NOTFOUND;
+ else
+ (void)__rep_send_message(env, eid,
+ REP_VERIFY_FAIL, &repth.lsn, NULL, 0, 0);
+ goto err;
+ }
+ /*
+ * If we got DB_NOTFOUND it could be because the LSN we were
+ * given is at the end of the log file and we need to switch
+ * log files. Reinitialize and get the current record when we return.
+ */
+ if (ret == DB_NOTFOUND) {
+ ret = __rep_chk_newfile(env, logc, rep, rp, eid);
+ /*
+ * If we still get DB_NOTFOUND the client gave us a
+ * bad or unknown LSN. Ignore it if we're the master.
+ * Any other error is returned.
+ */
+ if (ret == 0)
+ ret = __logc_get(logc, &repth.lsn,
+ &data_dbt, DB_CURRENT);
+ if (ret == DB_NOTFOUND && F_ISSET(rep, REP_F_MASTER)) {
+ ret = 0;
+ goto err;
+ }
+ if (ret != 0)
+ goto err;
+ }
+
+ /*
+ * For singleton log records, we break when we get a REP_LOG_MORE.
+	 * If we're not using throttling, or we are using bulk, we stop
+ * when we reach the end (i.e. ret != 0).
+ */
+ for (end_flag = 0;
+ ret == 0 && repth.type != REP_LOG_MORE && end_flag == 0;
+ ret = __logc_get(logc, &repth.lsn, &data_dbt, DB_NEXT)) {
+ /*
+ * If we just changed log files, we need to send the
+ * version of this log file to the client.
+ */
+ if (repth.lsn.file != oldfilelsn.file) {
+ if ((ret = __logc_version(logc, &nf_args.version)) != 0)
+ break;
+ memset(&newfiledbt, 0, sizeof(newfiledbt));
+ if (rep->version < DB_REPVERSION_47)
+ DB_INIT_DBT(newfiledbt, &nf_args.version,
+ sizeof(nf_args.version));
+ else {
+ if ((ret = __rep_newfile_marshal(env, &nf_args,
+ buf, __REP_NEWFILE_SIZE, &len)) != 0)
+ goto err;
+ DB_INIT_DBT(newfiledbt, buf, len);
+ }
+ (void)__rep_send_message(env,
+ eid, REP_NEWFILE, &oldfilelsn, &newfiledbt,
+ REPCTL_RESEND, 0);
+ }
+
+ /*
+ * Mark the end of the ALL_REQ response to show that the
+ * receiving client should now be "caught up" with the
+ * replication group. If we're the master, then our log end is
+ * certainly authoritative. If we're another client, only if we
+ * ourselves have reached STARTUPDONE.
+ */
+ end_flag = (LOG_COMPARE(&repth.lsn, &log_end) >= 0 &&
+ (F_ISSET(rep, REP_F_MASTER) ||
+ rep->stat.st_startup_complete)) ?
+ REPCTL_LOG_END : 0;
+ /*
+ * If we are configured for bulk, try to send this as a bulk
+ * request. If not configured, or it is too big for bulk
+ * then just send normally.
+ */
+ if (use_bulk)
+ ret = __rep_bulk_message(env, &bulk, &repth,
+ &repth.lsn, &data_dbt, (REPCTL_RESEND | end_flag));
+ if (!use_bulk || ret == DB_REP_BULKOVF)
+ ret = __rep_send_throttle(env,
+ eid, &repth, 0, end_flag);
+ if (ret != 0)
+ break;
+ /*
+ * If we are about to change files, then we'll need the
+ * last LSN in the previous file. Save it here.
+ */
+ oldfilelsn = repth.lsn;
+ oldfilelsn.offset += logc->len;
+ }
+
+ if (ret == DB_NOTFOUND || ret == DB_REP_UNAVAIL)
+ ret = 0;
+ /*
+ * We're done, force out whatever remains in the bulk buffer and
+ * free it.
+ */
+err:
+ /*
+ * We could have raced an unlink from an earlier log_archive
+ * and the user is removing the files themselves, now. If
+ * we get an error indicating the log file might no longer
+ * exist, ignore it.
+ */
+ if (ret == ENOENT)
+ ret = 0;
+ if (bulk.addr != NULL && (t_ret = __rep_bulk_free(env, &bulk,
+ (REPCTL_RESEND | end_flag))) != 0 && ret == 0 &&
+ t_ret != DB_REP_UNAVAIL)
+ ret = t_ret;
+ if (arch_flag) {
+ REP_SYSTEM_LOCK(env);
+ FLD_CLR(rep->lockout_flags, REP_LOCKOUT_ARCHIVE);
+ REP_SYSTEM_UNLOCK(env);
+ }
+ if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __rep_log --
+ * Handle a REP_LOG/REP_LOG_MORE message.
+ *
+ * PUBLIC: int __rep_log __P((ENV *, DB_THREAD_INFO *,
+ * PUBLIC: __rep_control_args *, DBT *, int, time_t, DB_LSN *));
+ */
+int
+__rep_log(env, ip, rp, rec, eid, savetime, ret_lsnp)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ __rep_control_args *rp;
+ DBT *rec;
+ int eid;
+ time_t savetime;
+ DB_LSN *ret_lsnp;
+{
+ DB_LOG *dblp;
+ DB_LSN last_lsn, lsn;
+ DB_REP *db_rep;
+ LOG *lp;
+ REP *rep;
+ int is_dup, master, ret;
+ u_int32_t gapflags;
+
+ is_dup = ret = 0;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ ret = __rep_apply(env, ip, rp, rec, ret_lsnp, &is_dup, &last_lsn);
+ switch (ret) {
+ /*
+ * We're in an internal backup and we've gotten
+ * all the log we need to run recovery. Do so now.
+ */
+ case DB_REP_LOGREADY:
+ if ((ret =
+ __rep_logready(env, rep, savetime, &last_lsn)) != 0)
+ goto out;
+ break;
+ /*
+ * If we get any of the "normal" returns, we only process
+ * LOG_MORE if this is not a duplicate record. If the
+ * record is a duplicate we don't want to handle LOG_MORE
+ * and request a multiple data stream (or trigger internal
+ * initialization) since this could be a very old record
+ * that no longer exists on the master.
+ */
+ case DB_REP_ISPERM:
+ case DB_REP_NOTPERM:
+ case 0:
+ if (is_dup)
+ goto out;
+ else
+ break;
+ /*
+ * Any other return (errors), we're done.
+ */
+ default:
+ goto out;
+ }
+ if (rp->rectype == REP_LOG_MORE) {
+ master = rep->master_id;
+
+ /*
+ * Keep the cycle from stalling: In case we got the LOG_MORE out
+ * of order, before some preceding log records, we want to make
+ * sure our follow-up request resumes from where the LOG_MORE
+ * said it should. (If the preceding log records never arrive,
+ * normal gap processing should take care of asking for them.)
+ * But if we already have this record and/or more, we need to
+ * ask to resume from what we need. The upshot is we need the
+		 * max of lp->ready_lsn and the LSN from the message.
+ */
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ lsn = lp->ready_lsn;
+ if (LOG_COMPARE(&rp->lsn, &lsn) > 0)
+ lsn = rp->lsn;
+
+ /*
+ * If the master_id is invalid, this means that since
+ * the last record was sent, somebody declared an
+ * election and we may not have a master to request
+ * things of.
+ *
+ * This is not an error; when we find a new master,
+ * we'll re-negotiate where the end of the log is and
+ * try to bring ourselves up to date again anyway.
+ */
+ if (master == DB_EID_INVALID) {
+ ret = 0;
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ goto out;
+ }
+ /*
+ * If we're waiting for records, set the wait_ts
+ * high so that we avoid re-requesting too soon and
+ * end up with multiple data streams.
+ */
+ if (IS_ZERO_LSN(lp->waiting_lsn))
+ lp->wait_ts = rep->max_gap;
+ /*
+ * If preceding log records were from the master, send the
+ * request for further log records to the master instead of
+ * allowing it to default to ANYWHERE.
+ */
+ gapflags = REP_GAP_FORCE;
+ if (master == eid)
+ gapflags = gapflags | REP_GAP_REREQUEST;
+ ret = __rep_loggap_req(env, rep, &lsn, gapflags);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ }
+out:
+ return (ret);
+}
+
+/*
+ * __rep_bulk_log --
+ * Handle a REP_BULK_LOG message.
+ *
+ * PUBLIC: int __rep_bulk_log __P((ENV *, DB_THREAD_INFO *,
+ * PUBLIC: __rep_control_args *, DBT *, time_t, DB_LSN *));
+ */
+int
+__rep_bulk_log(env, ip, rp, rec, savetime, ret_lsnp)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ __rep_control_args *rp;
+ DBT *rec;
+ time_t savetime;
+ DB_LSN *ret_lsnp;
+{
+ DB_LSN last_lsn;
+ DB_REP *db_rep;
+ REP *rep;
+ int ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ ret = __rep_log_split(env, ip, rp, rec, ret_lsnp, &last_lsn);
+ switch (ret) {
+ /*
+ * We're in an internal backup and we've gotten
+ * all the log we need to run recovery. Do so now.
+ */
+ case DB_REP_LOGREADY:
+ ret = __rep_logready(env, rep, savetime, &last_lsn);
+ break;
+ /*
+ * Any other return (errors), we're done.
+ */
+ default:
+ break;
+ }
+ return (ret);
+}
+
+/*
+ * __rep_log_split --
+ *	Split a log buffer into individual records.
+ *
+ * This is used by a client to process a bulk log message from the
+ * master and convert it into individual __rep_apply requests.
+ */
+static int
+__rep_log_split(env, ip, rp, rec, ret_lsnp, last_lsnp)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ __rep_control_args *rp;
+ DBT *rec;
+ DB_LSN *ret_lsnp;
+ DB_LSN *last_lsnp;
+{
+ DBT logrec;
+ DB_LSN next_new_lsn, save_lsn, tmp_lsn;
+ __rep_control_args tmprp;
+ __rep_bulk_args b_args;
+ int is_dup, ret, save_ret;
+ u_int32_t save_flags;
+ u_int8_t *p, *ep;
+
+ memset(&logrec, 0, sizeof(logrec));
+ ZERO_LSN(next_new_lsn);
+ ZERO_LSN(save_lsn);
+ ZERO_LSN(tmp_lsn);
+ /*
+ * We're going to be modifying the rp LSN contents so make
+ * our own private copy to play with.
+ */
+ memcpy(&tmprp, rp, sizeof(tmprp));
+ /*
+ * We send the bulk buffer on a PERM record, so often we will have
+ * DB_LOG_PERM set. However, we only want to mark the last LSN
+ * we have as a PERM record. So clear it here, and when we're on
+ * the last record below, set it. The same applies if the sender
+ * set REPCTL_LOG_END on this message. We want the end of the
+ * bulk buffer to be marked as the end.
+ */
+ save_flags = F_ISSET(rp, REPCTL_LOG_END | REPCTL_PERM);
+ F_CLR(&tmprp, REPCTL_LOG_END | REPCTL_PERM);
+ is_dup = ret = save_ret = 0;
+ for (ep = (u_int8_t *)rec->data + rec->size, p = (u_int8_t *)rec->data;
+ p < ep; ) {
+ /*
+ * First thing in the buffer is the length. Then the LSN
+ * of this record, then the record itself.
+ */
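+		/*
+		 * For the pre-4.7 format, the buffer is a packed sequence:
+		 *
+		 *	[len][DB_LSN][record] [len][DB_LSN][record] ...
+		 *
+		 * Later versions marshal the same fields, decoded below by
+		 * __rep_bulk_unmarshal.
+		 */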
+ if (rp->rep_version < DB_REPVERSION_47) {
+ memcpy(&b_args.len, p, sizeof(b_args.len));
+ p += sizeof(b_args.len);
+ memcpy(&tmprp.lsn, p, sizeof(DB_LSN));
+ p += sizeof(DB_LSN);
+ logrec.data = p;
+ logrec.size = b_args.len;
+ p += b_args.len;
+ } else {
+ if ((ret = __rep_bulk_unmarshal(env,
+ &b_args, p, rec->size, &p)) != 0)
+ return (ret);
+ tmprp.lsn = b_args.lsn;
+ logrec.data = b_args.bulkdata.data;
+ logrec.size = b_args.len;
+ }
+ VPRINT(env, (env, DB_VERB_REP_MISC,
+ "log_rep_split: Processing LSN [%lu][%lu]",
+ (u_long)tmprp.lsn.file, (u_long)tmprp.lsn.offset));
+ VPRINT(env, (env, DB_VERB_REP_MISC,
+ "log_rep_split: p %#lx ep %#lx logrec data %#lx, size %lu (%#lx)",
+ P_TO_ULONG(p), P_TO_ULONG(ep), P_TO_ULONG(logrec.data),
+ (u_long)logrec.size, (u_long)logrec.size));
+ if (p >= ep && save_flags)
+ F_SET(&tmprp, save_flags);
+ /*
+ * A previous call to __rep_apply indicated an earlier
+ * record is a dup and the next_new_lsn we are waiting for.
+ * Skip log records until we catch up with next_new_lsn.
+ */
+ if (is_dup && LOG_COMPARE(&tmprp.lsn, &next_new_lsn) < 0) {
+ VPRINT(env, (env, DB_VERB_REP_MISC,
+ "log_split: Skip dup LSN [%lu][%lu]",
+ (u_long)tmprp.lsn.file, (u_long)tmprp.lsn.offset));
+ continue;
+ }
+ is_dup = 0;
+ ret = __rep_apply(env, ip,
+ &tmprp, &logrec, &tmp_lsn, &is_dup, last_lsnp);
+ VPRINT(env, (env, DB_VERB_REP_MISC,
+ "log_split: rep_apply ret %d, dup %d, tmp_lsn [%lu][%lu]",
+ ret, is_dup, (u_long)tmp_lsn.file, (u_long)tmp_lsn.offset));
+ if (is_dup)
+ next_new_lsn = tmp_lsn;
+ switch (ret) {
+ /*
+ * If we received the pieces we need for running recovery,
+ * short-circuit because recovery will truncate the log to
+ * the LSN we want anyway.
+ */
+ case DB_REP_LOGREADY:
+ goto out;
+ /*
+ * If we just handled a special record, retain that information.
+ */
+ case DB_REP_ISPERM:
+ case DB_REP_NOTPERM:
+ save_ret = ret;
+ save_lsn = tmp_lsn;
+ ret = 0;
+ break;
+ /*
+ * Normal processing, do nothing, just continue.
+ */
+ case 0:
+ break;
+ /*
+ * If we get an error, then stop immediately.
+ */
+ default:
+ goto out;
+ }
+ }
+out:
+ /*
+ * If we finish processing successfully, set our return values
+ * based on what we saw.
+ */
+ if (ret == 0) {
+ ret = save_ret;
+ *ret_lsnp = save_lsn;
+ }
+ return (ret);
+}
+
+/*
+ * __rep_logreq --
+ * Handle a REP_LOG_REQ message.
+ *
+ * PUBLIC: int __rep_logreq __P((ENV *, __rep_control_args *, DBT *, int));
+ */
+int
+__rep_logreq(env, rp, rec, eid)
+ ENV *env;
+ __rep_control_args *rp;
+ DBT *rec;
+ int eid;
+{
+ DBT data_dbt, newfiledbt;
+ DB_LOGC *logc;
+ DB_LSN firstlsn, lsn, oldfilelsn;
+ DB_REP *db_rep;
+ REP *rep;
+ REP_BULK bulk;
+ REP_THROTTLE repth;
+ __rep_logreq_args lr_args;
+ __rep_newfile_args nf_args;
+ uintptr_t bulkoff;
+ u_int32_t bulkflags, use_bulk;
+ int count, ret, t_ret;
+ u_int8_t buf[__REP_NEWFILE_SIZE];
+ size_t len;
+
+ ret = 0;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ /* COMPQUIET_LSN is what this is... */
+ ZERO_LSN(lr_args.endlsn);
+
+ if (rec != NULL && rec->size != 0) {
+ if (rp->rep_version < DB_REPVERSION_47)
+ lr_args.endlsn = *(DB_LSN *)rec->data;
+ else if ((ret = __rep_logreq_unmarshal(env, &lr_args,
+ rec->data, rec->size, NULL)) != 0)
+ return (ret);
+ RPRINT(env, (env, DB_VERB_REP_MISC,
+ "[%lu][%lu]: LOG_REQ max lsn: [%lu][%lu]",
+ (u_long) rp->lsn.file, (u_long)rp->lsn.offset,
+ (u_long)lr_args.endlsn.file,
+ (u_long)lr_args.endlsn.offset));
+ }
+ /*
+ * There are several different cases here.
+ * 1. We asked logc_get for a particular LSN and got it.
+ * 2. We asked logc_get for an LSN and it's not found because it is
+ *	beyond the end of a log file, so we need to send a NEWFILE msg
+ *	and then the record that was requested.
+ * 3. We asked logc_get for an LSN and it is already archived.
+ * 4. We asked logc_get for an LSN and it simply doesn't exist, but
+ * doesn't meet any of those other criteria, in which case
+ * it's an error (that should never happen on a master).
+ *
+ * If we have a valid LSN and the request has a data_dbt with
+ * it, the sender is asking for a chunk of log records.
+ * Then we need to send all records up to the LSN in the data dbt.
+ */
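+ /*
+ * Of the cases above: case 1 is handled inline below; case 3
+ * via the firstlsn check; cases 2 and 4 are resolved by
+ * __rep_chk_newfile and the DB_NOTFOUND check that follows it.
+ */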
+ memset(&data_dbt, 0, sizeof(data_dbt));
+ oldfilelsn = lsn = rp->lsn;
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ return (ret);
+ REP_SYSTEM_LOCK(env);
+ if ((ret = __rep_lockout_archive(env, rep)) != 0) {
+ REP_SYSTEM_UNLOCK(env);
+ goto err;
+ }
+ REP_SYSTEM_UNLOCK(env);
+ if ((ret = __logc_get(logc, &lsn, &data_dbt, DB_SET)) == 0) {
+ /* Case 1 */
+ (void)__rep_send_message(env,
+ eid, REP_LOG, &lsn, &data_dbt, REPCTL_RESEND, 0);
+ oldfilelsn.offset += logc->len;
+ } else if (ret == DB_NOTFOUND) {
+ /*
+ * If logc_get races with log_archive or the user removing
+ * files from an earlier call to log_archive, it might return
+ * DB_NOTFOUND. We expect there to be some log record
+ * that is the first one. Loop until we either get
+ * a log record or some error. Since we only expect this
+ * when racing log file removal, bound it to a few
+ * tries.
+ */
+ count = 0;
+ do {
+ ret = __logc_get(logc, &firstlsn, &data_dbt, DB_FIRST);
+ /*
+ * If we've raced this many tries and we're still
+ * getting DB_NOTFOUND, then pause a bit to disrupt
+ * the timing cycle that we appear to be in.
+ */
+ if (count > 5)
+ __os_yield(env, 0, 50000);
+ count++;
+ } while (ret == DB_NOTFOUND && count < 10);
+ if (ret != 0) {
+ /*
+ * If we're master we don't want to return DB_NOTFOUND.
+ * We'll just ignore the error and this message.
+ * It will get rerequested if needed.
+ */
+ if (ret == DB_NOTFOUND && F_ISSET(rep, REP_F_MASTER))
+ ret = 0;
+ goto err;
+ }
+ if (LOG_COMPARE(&firstlsn, &rp->lsn) > 0) {
+ /* Case 3 */
+ if (F_ISSET(rep, REP_F_CLIENT)) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ (void)__rep_send_message(env, eid,
+ REP_VERIFY_FAIL, &rp->lsn, NULL, 0, 0);
+ ret = 0;
+ goto err;
+ }
+ ret = __rep_chk_newfile(env, logc, rep, rp, eid);
+ if (ret == DB_NOTFOUND) {
+ /* Case 4 */
+ /*
+ * If we still get DB_NOTFOUND the client gave us an
+ * unknown LSN, perhaps at the end of the log. Ignore
+ * it if we're the master. Return DB_NOTFOUND if
+ * we are the client.
+ */
+ if (F_ISSET(rep, REP_F_MASTER)) {
+ __db_errx(env, DB_STR_A("3501",
+ "Request for LSN [%lu][%lu] not found",
+ "%lu %lu"), (u_long)rp->lsn.file,
+ (u_long)rp->lsn.offset);
+ ret = 0;
+ goto err;
+ } else
+ ret = DB_NOTFOUND;
+ }
+ }
+
+ if (ret != 0)
+ goto err;
+
+ /*
+ * If the user requested a gap, send the whole thing, while observing
+ * the limits from rep_set_limit.
+ *
+ * If we're doing bulk transfer, allocate a bulk buffer to put our
+ * log records in. We still need to initialize the throttle info
+ * because if we encounter a log record larger than our entire bulk
+ * buffer, we need to send it as a singleton.
+ *
+ * Use a local var so we don't need to worry if someone else turns
+ * on/off bulk in the middle of our call.
+ */
+ use_bulk = FLD_ISSET(rep->config, REP_C_BULK);
+ if (use_bulk && (ret = __rep_bulk_alloc(env, &bulk, eid,
+ &bulkoff, &bulkflags, REP_BULK_LOG)) != 0)
+ goto err;
+ memset(&repth, 0, sizeof(repth));
+ REP_SYSTEM_LOCK(env);
+ repth.gbytes = rep->gbytes;
+ repth.bytes = rep->bytes;
+ repth.type = REP_LOG;
+ repth.data_dbt = &data_dbt;
+ REP_SYSTEM_UNLOCK(env);
+ while (ret == 0 && rec != NULL && rec->size != 0 &&
+ repth.type == REP_LOG) {
+ if ((ret =
+ __logc_get(logc, &repth.lsn, &data_dbt, DB_NEXT)) != 0) {
+ /*
+ * If we're a client and we only have part of the gap,
+ * return DB_NOTFOUND so that we send a REREQUEST
+ * back to the requester and it can ask for more.
+ */
+ if (ret == DB_NOTFOUND && F_ISSET(rep, REP_F_MASTER))
+ ret = 0;
+ break;
+ }
+ if (LOG_COMPARE(&repth.lsn, &lr_args.endlsn) >= 0)
+ break;
+ if (repth.lsn.file != oldfilelsn.file) {
+ if ((ret = __logc_version(logc, &nf_args.version)) != 0)
+ break;
+ memset(&newfiledbt, 0, sizeof(newfiledbt));
+ if (rep->version < DB_REPVERSION_47)
+ DB_INIT_DBT(newfiledbt, &nf_args.version,
+ sizeof(nf_args.version));
+ else {
+ if ((ret = __rep_newfile_marshal(env, &nf_args,
+ buf, __REP_NEWFILE_SIZE, &len)) != 0)
+ goto err;
+ DB_INIT_DBT(newfiledbt, buf, len);
+ }
+ (void)__rep_send_message(env,
+ eid, REP_NEWFILE, &oldfilelsn, &newfiledbt,
+ REPCTL_RESEND, 0);
+ }
+ /*
+ * If we are configured for bulk, try to send this as a bulk
+ * request. If not configured, or it is too big for bulk
+ * then just send normally.
+ */
+ if (use_bulk)
+ ret = __rep_bulk_message(env, &bulk, &repth,
+ &repth.lsn, &data_dbt, REPCTL_RESEND);
+ if (!use_bulk || ret == DB_REP_BULKOVF)
+ ret = __rep_send_throttle(env, eid, &repth, 0, 0);
+ if (ret != 0) {
+ /* Ignore send failure, except to break the loop. */
+ if (ret == DB_REP_UNAVAIL)
+ ret = 0;
+ break;
+ }
+ /*
+ * If we are about to change files, then we'll need the
+ * last LSN in the previous file. Save it here.
+ */
+ oldfilelsn = repth.lsn;
+ oldfilelsn.offset += logc->len;
+ }
+
+ /*
+ * We're done, force out whatever remains in the bulk buffer and
+ * free it.
+ */
+ if (use_bulk && (t_ret = __rep_bulk_free(env, &bulk,
+ REPCTL_RESEND)) != 0 && ret == 0 &&
+ t_ret != DB_REP_UNAVAIL)
+ ret = t_ret;
+err:
+ /*
+ * We could have raced an unlink from an earlier log_archive
+ * and the user is removing the files themselves, now. If
+ * we get an error indicating the log file might no longer
+ * exist, ignore it.
+ */
+ if (ret == ENOENT)
+ ret = 0;
+ REP_SYSTEM_LOCK(env);
+ FLD_CLR(rep->lockout_flags, REP_LOCKOUT_ARCHIVE);
+ REP_SYSTEM_UNLOCK(env);
+ if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __rep_loggap_req --
+ * Request a log gap. Assumes the caller holds the REP->mtx_clientdb.
+ *
+ * lsnp is the current LSN we're handling. It is used to help decide
+ * if we ask for a gap or singleton.
+ * gapflags are flags that may override the algorithm or control the
+ * processing in some way.
+ *
+ * PUBLIC: int __rep_loggap_req __P((ENV *, REP *, DB_LSN *, u_int32_t));
+ */
+int
+__rep_loggap_req(env, rep, lsnp, gapflags)
+ ENV *env;
+ REP *rep;
+ DB_LSN *lsnp;
+ u_int32_t gapflags;
+{
+ DBT max_lsn_dbt, *max_lsn_dbtp;
+ DB_LOG *dblp;
+ DB_LSN next_lsn;
+ LOG *lp;
+ __rep_logreq_args lr_args;
+ size_t len;
+ u_int32_t ctlflags, flags, type;
+ int master, ret;
+ u_int8_t buf[__REP_LOGREQ_SIZE];
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ if (FLD_ISSET(gapflags, REP_GAP_FORCE))
+ next_lsn = *lsnp;
+ else
+ next_lsn = lp->ready_lsn;
+ ctlflags = flags = 0;
+ type = REP_LOG_REQ;
+ ret = 0;
+
+ /*
+ * Check if we need to ask for the gap.
+ * We ask for the gap if:
+ * We are forced to with gapflags.
+ * If max_wait_lsn is ZERO_LSN - we've never asked for
+ * records before.
+ * If we asked for a single record and received it.
+ *
+ * If we want a gap, but don't have an ending LSN (waiting_lsn)
+ * send an ALL_REQ. This is primarily used by REP_REREQUEST when
+ * an ALL_REQ was not able to be fulfilled by another client.
+ */
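+ /*
+ * Illustrative outcomes of the check below: with a known
+ * waiting_lsn such as [3][5000] we send a LOG_REQ bounded by
+ * that LSN; if no ending LSN can be determined (max_wait_lsn
+ * remains ZERO_LSN), we fall back to an ALL_REQ.
+ */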
+ if (FLD_ISSET(gapflags, (REP_GAP_FORCE | REP_GAP_REREQUEST)) ||
+ IS_ZERO_LSN(lp->max_wait_lsn) ||
+ (lsnp != NULL && LOG_COMPARE(lsnp, &lp->max_wait_lsn) == 0)) {
+ lp->max_wait_lsn = lp->waiting_lsn;
+ /*
+ * In SYNC_LOG, make sure max_wait_lsn is set to avoid sending
+ * an ALL_REQ that could create an unnecessary dual data stream.
+ */
+ if (rep->sync_state == SYNC_LOG &&
+ IS_ZERO_LSN(lp->max_wait_lsn))
+ lp->max_wait_lsn = rep->last_lsn;
+ /*
+ * If we are forcing a gap, we need to send a max_wait_lsn
+ * that may be beyond the current gap/waiting_lsn (but
+ * it may not be). If we cannot determine any future
+ * waiting LSN, then it should be zero. If we're in
+ * internal init, it should be our ending LSN.
+ */
+ if (FLD_ISSET(gapflags, REP_GAP_FORCE)) {
+ if (LOG_COMPARE(&lp->max_wait_lsn, lsnp) <= 0) {
+ if (rep->sync_state == SYNC_LOG) {
+ DB_ASSERT(env, LOG_COMPARE(lsnp,
+ &rep->last_lsn) <= 0);
+ lp->max_wait_lsn = rep->last_lsn;
+ } else
+ ZERO_LSN(lp->max_wait_lsn);
+ }
+ }
+ if (IS_ZERO_LSN(lp->max_wait_lsn))
+ type = REP_ALL_REQ;
+ memset(&max_lsn_dbt, 0, sizeof(max_lsn_dbt));
+ lr_args.endlsn = lp->max_wait_lsn;
+ if (rep->version < DB_REPVERSION_47)
+ DB_INIT_DBT(max_lsn_dbt, &lp->max_wait_lsn,
+ sizeof(DB_LSN));
+ else {
+ if ((ret = __rep_logreq_marshal(env, &lr_args, buf,
+ __REP_LOGREQ_SIZE, &len)) != 0)
+ goto err;
+ DB_INIT_DBT(max_lsn_dbt, buf, len);
+ }
+ max_lsn_dbtp = &max_lsn_dbt;
+ /*
+ * Gap requests are "new" and can go anywhere, unless
+ * this is already a re-request.
+ */
+ if (FLD_ISSET(gapflags, REP_GAP_REREQUEST))
+ flags = DB_REP_REREQUEST;
+ else
+ flags = DB_REP_ANYWHERE;
+ } else {
+ max_lsn_dbtp = NULL;
+ lp->max_wait_lsn = next_lsn;
+ /*
+ * If we're dropping to singletons, this is a re-request.
+ */
+ flags = DB_REP_REREQUEST;
+ }
+ if ((master = rep->master_id) != DB_EID_INVALID) {
+ STAT_INC(env,
+ rep, log_request, rep->stat.st_log_requested, master);
+ if (rep->sync_state == SYNC_LOG)
+ ctlflags = REPCTL_INIT;
+ (void)__rep_send_message(env, master,
+ type, &next_lsn, max_lsn_dbtp, ctlflags, flags);
+ } else
+ (void)__rep_send_message(env, DB_EID_BROADCAST,
+ REP_MASTER_REQ, NULL, NULL, 0, 0);
+err:
+ return (ret);
+}
+
+/*
+ * __rep_logready --
+ * Handle getting back DB_REP_LOGREADY. Any call to __rep_apply
+ * can return it.
+ *
+ * PUBLIC: int __rep_logready __P((ENV *, REP *, time_t, DB_LSN *));
+ */
+int
+__rep_logready(env, rep, savetime, last_lsnp)
+ ENV *env;
+ REP *rep;
+ time_t savetime;
+ DB_LSN *last_lsnp;
+{
+ REGENV *renv;
+ REGINFO *infop;
+ int ret;
+
+ infop = env->reginfo;
+ renv = infop->primary;
+ if ((ret = __log_flush(env, NULL)) != 0)
+ goto err;
+ if ((ret = __rep_verify_match(env, last_lsnp, savetime)) != 0)
+ goto err;
+
+ REP_SYSTEM_LOCK(env);
+ ZERO_LSN(rep->first_lsn);
+
+ if (rep->originfo_off != INVALID_ROFF) {
+ MUTEX_LOCK(env, renv->mtx_regenv);
+ __env_alloc_free(infop, R_ADDR(infop, rep->originfo_off));
+ MUTEX_UNLOCK(env, renv->mtx_regenv);
+ rep->originfo_off = INVALID_ROFF;
+ }
+
+ rep->sync_state = SYNC_OFF;
+ F_SET(rep, REP_F_NIMDBS_LOADED);
+ ret = __rep_notify_threads(env, AWAIT_NIMDB);
+ REP_SYSTEM_UNLOCK(env);
+ if (ret != 0)
+ goto err;
+
+ return (0);
+
+err:
+ DB_ASSERT(env, ret != DB_REP_WOULDROLLBACK);
+ __db_errx(env, DB_STR("3502",
+ "Client initialization failed. Need to manually restore client"));
+ return (__env_panic(env, ret));
+}
+
+/*
+ * __rep_chk_newfile --
+ * Determine if getting DB_NOTFOUND is because we're at the
+ * end of a log file and need to send a NEWFILE message.
+ *
+ * This function handles these cases:
+ * [Case 1 was that we found the record we were looking for - it
+ * is already handled by the caller.]
+ * 2. We asked logc_get for an LSN and it's not found because it is
+ * beyond the end of a log file and we need a NEWFILE msg.
+ * 3. We asked logc_get for an LSN and it simply doesn't exist, but
+ * doesn't meet any of those other criteria, in which case
+ * we return DB_NOTFOUND and the caller decides if it's an error.
+ *
+ * This function returns 0 if we had to send a message and the bad
+ * LSN has been dealt with, DB_NOTFOUND if this really is an unknown
+ * LSN (on a client), and an error if the LSN isn't found on the
+ * master.
+ */
+static int
+__rep_chk_newfile(env, logc, rep, rp, eid)
+ ENV *env;
+ DB_LOGC *logc;
+ REP *rep;
+ __rep_control_args *rp;
+ int eid;
+{
+ DBT data_dbt, newfiledbt;
+ DB_LOG *dblp;
+ DB_LSN endlsn;
+ LOG *lp;
+ __rep_newfile_args nf_args;
+ int ret;
+ u_int8_t buf[__REP_NEWFILE_SIZE];
+ size_t len;
+
+ ret = 0;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ memset(&data_dbt, 0, sizeof(data_dbt));
+ LOG_SYSTEM_LOCK(env);
+ endlsn = lp->lsn;
+ LOG_SYSTEM_UNLOCK(env);
+ if (endlsn.file > rp->lsn.file) {
+ /*
+ * Case 2:
+ * Need to find the LSN of the last record in
+ * file lsn.file so that we can send it with
+ * the NEWFILE call. In order to do that, we
+ * need to try to get {lsn.file + 1, 0} and
+ * then backup.
+ */
+ endlsn.file = rp->lsn.file + 1;
+ endlsn.offset = 0;
+ if ((ret = __logc_get(logc,
+ &endlsn, &data_dbt, DB_SET)) != 0 ||
+ (ret = __logc_get(logc,
+ &endlsn, &data_dbt, DB_PREV)) != 0) {
+ RPRINT(env, (env, DB_VERB_REP_MISC,
+ "Unable to get prev of [%lu][%lu]",
+ (u_long)rp->lsn.file,
+ (u_long)rp->lsn.offset));
+ /*
+ * We want to push the error back
+ * to the client so that the client
+ * does an internal backup. The
+ * client asked for a log record
+ * we no longer have and it is
+ * outdated.
+ * XXX - This could be optimized by
+ * having the master generate and
+ * send a REP_UPDATE message. We
+ * currently want the client to set
+ * up its 'update' state prior to
+ * sending a REP_UPDATE_REQ.
+ *
+ * If we're a client servicing a request
+ * just return DB_NOTFOUND.
+ */
+ if (F_ISSET(rep, REP_F_MASTER)) {
+ ret = 0;
+ (void)__rep_send_message(env, eid,
+ REP_VERIFY_FAIL, &rp->lsn,
+ NULL, 0, 0);
+ } else
+ ret = DB_NOTFOUND;
+ } else {
+ endlsn.offset += logc->len;
+ if ((ret = __logc_version(logc,
+ &nf_args.version)) == 0) {
+ memset(&newfiledbt, 0,
+ sizeof(newfiledbt));
+ if (rep->version < DB_REPVERSION_47)
+ DB_INIT_DBT(newfiledbt,
+ &nf_args.version,
+ sizeof(nf_args.version));
+ else {
+ if ((ret = __rep_newfile_marshal(env,
+ &nf_args, buf, __REP_NEWFILE_SIZE,
+ &len)) != 0)
+ return (ret);
+ DB_INIT_DBT(newfiledbt, buf, len);
+ }
+ (void)__rep_send_message(env, eid,
+ REP_NEWFILE, &endlsn,
+ &newfiledbt, REPCTL_RESEND, 0);
+ }
+ }
+ } else
+ ret = DB_NOTFOUND;
+
+ return (ret);
+}
diff --git a/src/rep/rep_method.c b/src/rep/rep_method.c
new file mode 100644
index 00000000..f9f1924c
--- /dev/null
+++ b/src/rep/rep_method.c
@@ -0,0 +1,3032 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+static int __rep_abort_prepared __P((ENV *));
+static int __rep_await_condition __P((ENV *,
+ struct rep_waitgoal *, db_timeout_t));
+static int __rep_bt_cmp __P((DB *, const DBT *, const DBT *));
+static int __rep_check_applied __P((ENV *,
+ DB_THREAD_INFO *, DB_COMMIT_INFO *, struct rep_waitgoal *));
+static void __rep_config_map __P((ENV *, u_int32_t *, u_int32_t *));
+static u_int32_t __rep_conv_vers __P((ENV *, u_int32_t));
+static int __rep_read_lsn_history __P((ENV *,
+ DB_THREAD_INFO *, DB_TXN **, DBC **, u_int32_t,
+ __rep_lsn_hist_data_args *, struct rep_waitgoal *, u_int32_t));
+static int __rep_restore_prepared __P((ENV *));
+static int __rep_save_lsn_hist __P((ENV *, DB_THREAD_INFO *, DB_LSN *));
+/*
+ * __rep_env_create --
+ * Replication-specific initialization of the ENV structure.
+ *
+ * PUBLIC: int __rep_env_create __P((DB_ENV *));
+ */
+int
+__rep_env_create(dbenv)
+ DB_ENV *dbenv;
+{
+ DB_REP *db_rep;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ if ((ret = __os_calloc(env, 1, sizeof(DB_REP), &db_rep)) != 0)
+ return (ret);
+
+ db_rep->eid = DB_EID_INVALID;
+ db_rep->bytes = REP_DEFAULT_THROTTLE;
+ DB_TIMEOUT_TO_TIMESPEC(DB_REP_REQUEST_GAP, &db_rep->request_gap);
+ DB_TIMEOUT_TO_TIMESPEC(DB_REP_MAX_GAP, &db_rep->max_gap);
+ db_rep->elect_timeout = 2 * US_PER_SEC; /* 2 seconds */
+ db_rep->chkpt_delay = 30 * US_PER_SEC; /* 30 seconds */
+ db_rep->my_priority = DB_REP_DEFAULT_PRIORITY;
+ /*
+ * Make no clock skew the default. Setting both fields
+ * to the same non-zero value means no skew.
+ */
+ db_rep->clock_skew = 1;
+ db_rep->clock_base = 1;
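+ /*
+ * Illustrative example: an application expecting clocks to
+ * differ by up to 2% might use clock_skew 102 and clock_base
+ * 100; lease durations are then scaled by this ratio (see
+ * __rep_start_int).
+ */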
+ FLD_SET(db_rep->config, REP_C_AUTOINIT);
+ FLD_SET(db_rep->config, REP_C_AUTOROLLBACK);
+
+ /*
+ * Turn on system messages by default.
+ */
+ FLD_SET(dbenv->verbose, DB_VERB_REP_SYSTEM);
+
+#ifdef HAVE_REPLICATION_THREADS
+ if ((ret = __repmgr_env_create(env, db_rep)) != 0) {
+ __os_free(env, db_rep);
+ return (ret);
+ }
+#endif
+
+ env->rep_handle = db_rep;
+ return (0);
+}
+
+/*
+ * __rep_env_destroy --
+ * Replication-specific destruction of the ENV structure.
+ *
+ * PUBLIC: void __rep_env_destroy __P((DB_ENV *));
+ */
+void
+__rep_env_destroy(dbenv)
+ DB_ENV *dbenv;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ if (env->rep_handle != NULL) {
+#ifdef HAVE_REPLICATION_THREADS
+ __repmgr_env_destroy(env, env->rep_handle);
+#endif
+ __os_free(env, env->rep_handle);
+ env->rep_handle = NULL;
+ }
+}
+
+/*
+ * __rep_get_config --
+ * Return the replication subsystem configuration.
+ *
+ * PUBLIC: int __rep_get_config __P((DB_ENV *, u_int32_t, int *));
+ */
+int
+__rep_get_config(dbenv, which, onp)
+ DB_ENV *dbenv;
+ u_int32_t which;
+ int *onp;
+{
+ DB_REP *db_rep;
+ ENV *env;
+ REP *rep;
+ u_int32_t mapped;
+
+ env = dbenv->env;
+
+#undef OK_FLAGS
+#define OK_FLAGS \
+ (DB_REP_CONF_AUTOINIT | DB_REP_CONF_AUTOROLLBACK | \
+ DB_REP_CONF_BULK | DB_REP_CONF_DELAYCLIENT | DB_REP_CONF_INMEM | \
+ DB_REP_CONF_LEASE | DB_REP_CONF_NOWAIT | \
+ DB_REPMGR_CONF_2SITE_STRICT | DB_REPMGR_CONF_ELECTIONS)
+
+ if (FLD_ISSET(which, ~OK_FLAGS))
+ return (__db_ferr(env, "DB_ENV->rep_get_config", 0));
+
+ db_rep = env->rep_handle;
+ ENV_NOT_CONFIGURED(
+ env, db_rep->region, "DB_ENV->rep_get_config", DB_INIT_REP);
+
+ mapped = 0;
+ __rep_config_map(env, &which, &mapped);
+ if (REP_ON(env)) {
+ rep = db_rep->region;
+ if (FLD_ISSET(rep->config, mapped))
+ *onp = 1;
+ else
+ *onp = 0;
+ } else {
+ if (FLD_ISSET(db_rep->config, mapped))
+ *onp = 1;
+ else
+ *onp = 0;
+ }
+ return (0);
+}
+
+/*
+ * __rep_set_config --
+ * Configure the replication subsystem.
+ *
+ * PUBLIC: int __rep_set_config __P((DB_ENV *, u_int32_t, int));
+ */
+int
+__rep_set_config(dbenv, which, on)
+ DB_ENV *dbenv;
+ u_int32_t which;
+ int on;
+{
+ DB_LOG *dblp;
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ LOG *lp;
+ REP *rep;
+ REP_BULK bulk;
+ u_int32_t mapped, orig;
+ int ret, t_ret;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+ ret = 0;
+
+#undef OK_FLAGS
+#define OK_FLAGS \
+ (DB_REP_CONF_AUTOINIT | DB_REP_CONF_AUTOROLLBACK | \
+ DB_REP_CONF_BULK | DB_REP_CONF_DELAYCLIENT | DB_REP_CONF_INMEM | \
+ DB_REP_CONF_LEASE | DB_REP_CONF_NOWAIT | \
+ DB_REPMGR_CONF_2SITE_STRICT | DB_REPMGR_CONF_ELECTIONS)
+#define REPMGR_FLAGS (REP_C_2SITE_STRICT | REP_C_ELECTIONS)
+
+ ENV_NOT_CONFIGURED(
+ env, db_rep->region, "DB_ENV->rep_set_config", DB_INIT_REP);
+
+ if (FLD_ISSET(which, ~OK_FLAGS))
+ return (__db_ferr(env, "DB_ENV->rep_set_config", 0));
+
+ mapped = 0;
+ __rep_config_map(env, &which, &mapped);
+
+ if (APP_IS_BASEAPI(env) && FLD_ISSET(mapped, REPMGR_FLAGS)) {
+ __db_errx(env, DB_STR_A("3548",
+ "%s cannot configure repmgr settings from base replication application",
+ "%s"), "DB_ENV->rep_set_config:");
+ return (EINVAL);
+ }
+
+ if (REP_ON(env)) {
+#ifdef HAVE_REPLICATION_THREADS
+ if ((ret = __repmgr_valid_config(env, mapped)) != 0)
+ return (ret);
+#endif
+
+ ENV_ENTER(env, ip);
+
+ rep = db_rep->region;
+ /*
+ * In-memory replication must be configured before calling
+ * env->open. Toggling it on and off before env->open
+ * doesn't matter; any attempt to turn it on or off after
+ * env->open is intercepted by this error.
+ */
+ if (FLD_ISSET(mapped, REP_C_INMEM)) {
+ __db_errx(env, DB_STR_A("3549",
+"%s in-memory replication must be configured before DB_ENV->open",
+ "%s"), "DB_ENV->rep_set_config:");
+ ENV_LEAVE(env, ip);
+ return (EINVAL);
+ }
+ /*
+ * Leases must be turned on before calling rep_start.
+ * Leases can never be turned off once they're turned on.
+ */
+ if (FLD_ISSET(mapped, REP_C_LEASE)) {
+ if (F_ISSET(rep, REP_F_START_CALLED)) {
+ __db_errx(env, DB_STR("3550",
+ "DB_ENV->rep_set_config: leases must be "
+ "configured before DB_ENV->rep_start"));
+ ret = EINVAL;
+ }
+ if (on == 0) {
+ __db_errx(env, DB_STR("3551",
+ "DB_ENV->rep_set_config: leases cannot be turned off"));
+ ret = EINVAL;
+ }
+ if (ret != 0) {
+ ENV_LEAVE(env, ip);
+ return (ret);
+ }
+ }
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ REP_SYSTEM_LOCK(env);
+ orig = rep->config;
+ if (on)
+ FLD_SET(rep->config, mapped);
+ else
+ FLD_CLR(rep->config, mapped);
+
+ /*
+ * Bulk transfer requires special processing if it is getting
+ * toggled.
+ */
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ if (FLD_ISSET(rep->config, REP_C_BULK) &&
+ !FLD_ISSET(orig, REP_C_BULK))
+ db_rep->bulk = R_ADDR(&dblp->reginfo, lp->bulk_buf);
+ REP_SYSTEM_UNLOCK(env);
+
+ /*
+ * If turning bulk off and it was on, send out whatever is in
+ * the buffer already.
+ */
+ if (FLD_ISSET(orig, REP_C_BULK) &&
+ !FLD_ISSET(rep->config, REP_C_BULK) && lp->bulk_off != 0) {
+ memset(&bulk, 0, sizeof(bulk));
+ if (db_rep->bulk == NULL)
+ bulk.addr =
+ R_ADDR(&dblp->reginfo, lp->bulk_buf);
+ else
+ bulk.addr = db_rep->bulk;
+ bulk.offp = &lp->bulk_off;
+ bulk.len = lp->bulk_len;
+ bulk.type = REP_BULK_LOG;
+ bulk.eid = DB_EID_BROADCAST;
+ bulk.flagsp = &lp->bulk_flags;
+ ret = __rep_send_bulk(env, &bulk, 0);
+ }
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+
+ ENV_LEAVE(env, ip);
+
+#ifdef HAVE_REPLICATION_THREADS
+ /*
+ * If turning ELECTIONS on, and it was off, check whether we
+ * need to start an election immediately.
+ */
+ if (!FLD_ISSET(orig, REP_C_ELECTIONS) &&
+ FLD_ISSET(rep->config, REP_C_ELECTIONS) &&
+ (t_ret = __repmgr_turn_on_elections(env)) != 0 && ret == 0)
+ ret = t_ret;
+#endif
+ } else {
+ if (on)
+ FLD_SET(db_rep->config, mapped);
+ else
+ FLD_CLR(db_rep->config, mapped);
+ }
+ /* Configuring 2SITE_STRICT, etc. makes this a repmgr application */
+ if (ret == 0 && FLD_ISSET(mapped, REPMGR_FLAGS))
+ APP_SET_REPMGR(env);
+ return (ret);
+}
+
+static void
+__rep_config_map(env, inflagsp, outflagsp)
+ ENV *env;
+ u_int32_t *inflagsp, *outflagsp;
+{
+ COMPQUIET(env, NULL);
+
+ if (FLD_ISSET(*inflagsp, DB_REP_CONF_AUTOINIT)) {
+ FLD_SET(*outflagsp, REP_C_AUTOINIT);
+ FLD_CLR(*inflagsp, DB_REP_CONF_AUTOINIT);
+ }
+ if (FLD_ISSET(*inflagsp, DB_REP_CONF_AUTOROLLBACK)) {
+ FLD_SET(*outflagsp, REP_C_AUTOROLLBACK);
+ FLD_CLR(*inflagsp, DB_REP_CONF_AUTOROLLBACK);
+ }
+ if (FLD_ISSET(*inflagsp, DB_REP_CONF_BULK)) {
+ FLD_SET(*outflagsp, REP_C_BULK);
+ FLD_CLR(*inflagsp, DB_REP_CONF_BULK);
+ }
+ if (FLD_ISSET(*inflagsp, DB_REP_CONF_DELAYCLIENT)) {
+ FLD_SET(*outflagsp, REP_C_DELAYCLIENT);
+ FLD_CLR(*inflagsp, DB_REP_CONF_DELAYCLIENT);
+ }
+ if (FLD_ISSET(*inflagsp, DB_REP_CONF_INMEM)) {
+ FLD_SET(*outflagsp, REP_C_INMEM);
+ FLD_CLR(*inflagsp, DB_REP_CONF_INMEM);
+ }
+ if (FLD_ISSET(*inflagsp, DB_REP_CONF_LEASE)) {
+ FLD_SET(*outflagsp, REP_C_LEASE);
+ FLD_CLR(*inflagsp, DB_REP_CONF_LEASE);
+ }
+ if (FLD_ISSET(*inflagsp, DB_REP_CONF_NOWAIT)) {
+ FLD_SET(*outflagsp, REP_C_NOWAIT);
+ FLD_CLR(*inflagsp, DB_REP_CONF_NOWAIT);
+ }
+ if (FLD_ISSET(*inflagsp, DB_REPMGR_CONF_2SITE_STRICT)) {
+ FLD_SET(*outflagsp, REP_C_2SITE_STRICT);
+ FLD_CLR(*inflagsp, DB_REPMGR_CONF_2SITE_STRICT);
+ }
+ if (FLD_ISSET(*inflagsp, DB_REPMGR_CONF_ELECTIONS)) {
+ FLD_SET(*outflagsp, REP_C_ELECTIONS);
+ FLD_CLR(*inflagsp, DB_REPMGR_CONF_ELECTIONS);
+ }
+ DB_ASSERT(env, *inflagsp == 0);
+}
+
+/*
+ * __rep_start_pp --
+ * Become a master or client, and start sending messages to participate
+ * in the replication environment. Must be called after the environment
+ * is open.
+ *
+ * PUBLIC: int __rep_start_pp __P((DB_ENV *, DBT *, u_int32_t));
+ */
+int
+__rep_start_pp(dbenv, dbt, flags)
+ DB_ENV *dbenv;
+ DBT *dbt;
+ u_int32_t flags;
+{
+ DB_REP *db_rep;
+ ENV *env;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+
+ ENV_REQUIRES_CONFIG_XX(
+ env, rep_handle, "DB_ENV->rep_start", DB_INIT_REP);
+
+ if (APP_IS_REPMGR(env)) {
+ __db_errx(env, DB_STR("3552",
+"DB_ENV->rep_start: cannot call from Replication Manager application"));
+ return (EINVAL);
+ }
+
+ switch (LF_ISSET(DB_REP_CLIENT | DB_REP_MASTER)) {
+ case DB_REP_CLIENT:
+ case DB_REP_MASTER:
+ break;
+ default:
+ __db_errx(env, DB_STR("3553",
+ "DB_ENV->rep_start: must specify DB_REP_CLIENT or DB_REP_MASTER"));
+ return (EINVAL);
+ }
+
+ /* We need a transport function because we send messages. */
+ if (db_rep->send == NULL) {
+ __db_errx(env, DB_STR("3554",
+ "DB_ENV->rep_start: must be called after DB_ENV->rep_set_transport"));
+ return (EINVAL);
+ }
+
+ return (__rep_start_int(env, dbt, flags));
+}
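+
+ /*
+ * Illustrative usage sketch (not part of this file): a base API
+ * application typically configures a transport callback and then
+ * calls rep_start, e.g.:
+ *
+ *	ret = dbenv->rep_set_transport(dbenv, self_eid, send_func);
+ *	if (ret == 0)
+ *		ret = dbenv->rep_start(dbenv, &opaque_cdata, DB_REP_CLIENT);
+ *
+ * where self_eid, send_func and opaque_cdata are supplied by the
+ * application.
+ */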
+
+/*
+ * __rep_start_int --
+ * Internal processing to become a master or client and start sending
+ * messages to participate in the replication environment. If this is
+ * a newly created environment, then this site has likely been in an
+ * initial, undefined state - neither master nor client. What that means
+ * is that as a non-client, it can write log records locally (such as
+ * those generated by recovery) and as a non-master, it does not attempt
+ * to send those log records elsewhere.
+ *
+ * We must protect rep_start_int, which may change the world, with the rest
+ * of the DB library. Each API interface will count itself as it enters
+ * the library. Rep_start_int checks the following:
+ *
+ * rep->msg_th - this is the count of threads currently in rep_process_message
+ * rep->handle_cnt - number of threads actively using a dbp in library.
+ * rep->txn_cnt - number of active txns.
+ * REP_LOCKOUT_* - Replication flag that indicates that we wish to run
+ * recovery, and want to prohibit new transactions from entering and cause
+ * existing ones to return immediately (with a DB_LOCK_DEADLOCK error).
+ *
+ * There is also renv->rep_timestamp, which is updated whenever significant
+ * events occur (e.g., new masters, log rollback). Upon creation, a handle
+ * is associated with the current timestamp. Each time a handle enters the
+ * library it must check if the handle timestamp is the same as the one
+ * stored in the replication region. This prevents the use of handles on
+ * clients that reference non-existent files whose creation was backed out
+ * during a synchronizing recovery.
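+ * For example (illustrative scenario), a handle opened before a
+ * synchronizing recovery rolled back the creation of its underlying
+ * file carries a stale timestamp; its next use returns
+ * DB_REP_HANDLE_DEAD so the application can close and reopen it.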
+ *
+ * PUBLIC: int __rep_start_int __P((ENV *, DBT *, u_int32_t));
+ */
+int
+__rep_start_int(env, dbt, flags)
+ ENV *env;
+ DBT *dbt;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_LOG *dblp;
+ DB_LOGC *logc;
+ DB_LSN lsn, perm_lsn;
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ DB_TXNREGION *region;
+ LOG *lp;
+ REGENV *renv;
+ REGINFO *infop;
+ REP *rep;
+ db_timeout_t tmp;
+ u_int32_t new_gen, oldvers, pending_event, role;
+ int interrupting, locked, ret, role_chg, start_th, t_ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ infop = env->reginfo;
+ renv = infop->primary;
+ interrupting = locked = 0;
+ pending_event = DB_EVENT_NO_SUCH_EVENT;
+ role = LF_ISSET(DB_REP_CLIENT | DB_REP_MASTER);
+ start_th = 0;
+
+ /*
+ * If we're using master leases, check that all needed
+ * setup has been done, including setting the lease timeout.
+ */
+ if (IS_USING_LEASES(env) && rep->lease_timeout == 0) {
+ __db_errx(env, DB_STR("3555",
+"DB_ENV->rep_start: must call DB_ENV->rep_set_timeout for leases first"));
+ return (EINVAL);
+ }
+
+ ENV_ENTER(env, ip);
+
+ /* Serialize rep_start() calls. */
+ MUTEX_LOCK(env, rep->mtx_repstart);
+ start_th = 1;
+
+ /*
+ * In order to correctly check log files for old versions, we
+ * need to flush the logs. Serialize log flush to make sure it is
+ * always done just before the log old version check. Otherwise it
+ * is possible that another thread in rep_start could write LSN history
+ * and create a new log file that is not yet fully there for the log
+ * old version check.
+ */
+ if ((ret = __log_flush(env, NULL)) != 0)
+ goto out;
+
+ REP_SYSTEM_LOCK(env);
+ role_chg = (!F_ISSET(rep, REP_F_MASTER) && role == DB_REP_MASTER) ||
+ (!F_ISSET(rep, REP_F_CLIENT) && role == DB_REP_CLIENT);
+
+ /*
+ * There is no need for lockout if all we're doing is sending a message.
+ * In fact, lockout could be harmful: the typical use of this "duplicate
+ * client" style of call is when the application has to poll, seeking
+ * for a master. If the resulting NEWMASTER message were to arrive when
+ * we had messages locked out, we would discard it, resulting in further
+ * delay.
+ */
+ if (role == DB_REP_CLIENT && !role_chg) {
+ REP_SYSTEM_UNLOCK(env);
+ if ((ret = __dbt_usercopy(env, dbt)) == 0)
+ (void)__rep_send_message(env,
+ DB_EID_BROADCAST, REP_NEWCLIENT, NULL, dbt, 0, 0);
+ goto out;
+ }
+
+ if (FLD_ISSET(rep->lockout_flags, REP_LOCKOUT_MSG)) {
+ /*
+ * There is already someone in msg lockout. Return.
+ */
+ RPRINT(env, (env, DB_VERB_REP_MISC,
+ "Thread already in msg lockout"));
+ REP_SYSTEM_UNLOCK(env);
+ goto out;
+ } else if ((ret = __rep_lockout_msg(env, rep, 0)) != 0)
+ goto errunlock;
+
+ /*
+ * If we are internal init and we try to become master, reject it.
+ * Our environment databases/logs are in an inconsistent state and
+ * we cannot become master.
+ */
+ if (IN_INTERNAL_INIT(rep) && role == DB_REP_MASTER) {
+ __db_errx(env, DB_STR("3556",
+ "DB_ENV->rep_start: Cannot become master during internal init"));
+ ret = DB_REP_UNAVAIL;
+ goto errunlock;
+ }
+
+ /*
+ * Wait for any active txns or mpool ops to complete, and
+ * prevent any new ones from occurring, only if we're
+ * changing roles.
+ */
+ if (role_chg) {
+ if ((ret = __rep_lockout_api(env, rep)) != 0)
+ goto errunlock;
+ locked = 1;
+ }
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ if (role == DB_REP_MASTER) {
+ if (role_chg) {
+ /*
+ * If we were previously a client, it's possible we
+ * could have an interruptible STARTSYNC in progress.
+ * Interrupt it now, so that it doesn't slow down our
+ * transition to master, and because its effects aren't
+ * doing us any good anyway.
+ */
+ (void)__memp_set_config(
+ env->dbenv, DB_MEMP_SYNC_INTERRUPT, 1);
+ interrupting = 1;
+
+ /*
+ * If we're upgrading from having been a client,
+ * preclose, so that we close our temporary database
+ * and any files we opened while doing a rep_apply.
+ * If we don't, we can leak file ids indefinitely
+ * when the master crashed with files open (the
+ * likely case): removing such a file can fail,
+ * and long-running applications end up with an
+ * unbounded number of used fileids, each getting
+ * written on checkpoint. Just close them.
+ * Then invalidate all files open in the logging
+ * region. These are files open by other processes
+ * attached to the environment. They must be
+ * closed by the other processes when they notice
+ * the change in role.
+ */
+ if ((ret = __rep_preclose(env)) != 0)
+ goto errunlock;
+
+ new_gen = rep->gen + 1;
+ /*
+ * There could have been any number of failed
+ * elections, so jump the gen if we need to now.
+ */
+ if (rep->egen > rep->gen)
+ new_gen = rep->egen;
+ SET_GEN(new_gen);
+ /*
+ * If the "group" has only one site, it's OK to start as
+ * master without an election. This is how repmgr
+ * builds up a primordial group, by induction.
+ */
+ if (IS_USING_LEASES(env) &&
+ rep->config_nsites > 1 &&
+ !F_ISSET(rep, REP_F_MASTERELECT)) {
+ __db_errx(env, DB_STR("3557",
+"rep_start: Cannot become master without being elected when using leases."));
+ ret = EINVAL;
+ goto errunlock;
+ }
+ if (F_ISSET(rep, REP_F_MASTERELECT)) {
+ __rep_elect_done(env, rep);
+ F_CLR(rep, REP_F_MASTERELECT);
+ } else if (FLD_ISSET(rep->config, REP_C_INMEM))
+ /*
+ * Help detect whether the application has ignored
+ * our recommendation against reappointing the
+ * same master after a crash/reboot when running
+ * in-memory replication. Doing so allows a
+ * slight chance of two masters at the same
+ * generation, resulting in client crashes.
+ */
+ RPRINT(env, (env, DB_VERB_REP_MISC,
+ "Appointed new master while running in-memory replication."));
+ if (rep->egen <= rep->gen)
+ rep->egen = rep->gen + 1;
+ RPRINT(env, (env, DB_VERB_REP_MISC,
+ "New master gen %lu, egen %lu",
+ (u_long)rep->gen, (u_long)rep->egen));
+ /*
+ * If not running in-memory replication, write
+ * gen file.
+ */
+ if (!FLD_ISSET(rep->config, REP_C_INMEM) &&
+ (ret = __rep_write_gen(env, rep, rep->gen)) != 0)
+ goto errunlock;
+ }
+ /*
+ * Set lease duration assuming clients have faster clock.
+ * Master needs to compensate so that clients do not
+ * expire their grant while the master thinks it is valid.
+ */
+ if (IS_USING_LEASES(env) &&
+ (role_chg || !IS_REP_STARTED(env))) {
+ /*
+ * If we have already granted our lease, we
+ * cannot become master.
+ */
+ if ((ret = __rep_islease_granted(env))) {
+ __db_errx(env, DB_STR("3558",
+ "rep_start: Cannot become master with outstanding lease granted."));
+ ret = EINVAL;
+ goto errunlock;
+ }
+ /*
+ * Set max_perm_lsn to last PERM record on master.
+ */
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ goto errunlock;
+ ret = __rep_log_backup(env, logc, &perm_lsn,
+ REP_REC_PERM);
+ (void)__logc_close(logc);
+ /*
+ * If we found a perm LSN use it. Otherwise, if
+ * no perm LSN exists, initialize.
+ */
+ if (ret == 0)
+ lp->max_perm_lsn = perm_lsn;
+ else if (ret == DB_NOTFOUND)
+ INIT_LSN(lp->max_perm_lsn);
+ else
+ goto errunlock;
+
+ /*
+ * Compensate the lease timeout by the clock-skew ratio to
+ * get the master's lease duration.
+ */
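+ /*
+ * Illustrative numbers: a lease_timeout of 1000000us with
+ * clock_skew 102 and clock_base 100 yields a master lease
+ * duration of about 980392us.
+ */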
+ tmp = (db_timeout_t)((double)rep->lease_timeout /
+ ((double)rep->clock_skew /
+ (double)rep->clock_base));
+ DB_TIMEOUT_TO_TIMESPEC(tmp, &rep->lease_duration);
+ if ((ret = __rep_lease_table_alloc(env,
+ rep->config_nsites)) != 0)
+ goto errunlock;
+ }
+ rep->master_id = rep->eid;
+ STAT_INC(env, rep,
+ master_change, rep->stat.st_master_changes, rep->eid);
+
+#ifdef DIAGNOSTIC
+ if (!F_ISSET(rep, REP_F_GROUP_ESTD))
+ RPRINT(env, (env, DB_VERB_REP_MISC,
+ "Establishing group as master."));
+#endif
+ /*
+ * When becoming a master, clear the following flags:
+ * CLIENT: Site is no longer a client.
+ * ABBREVIATED: Indicates abbreviated internal init, which
+ * cannot occur on a master.
+ * MASTERELECT: Indicates that this master is elected
+ * rather than appointed. If we're changing roles we
+ * used this flag above for error checks and election
+ * cleanup.
+ * SKIPPED_APPLY: Indicates that client apply skipped
+ * some log records during an election, no longer
+ * applicable on master.
+ * DELAY: Indicates user config to delay initial client
+ * sync with new master, doesn't apply to master.
+ * LEASE_EXPIRED: Applies to client leases which are
+ * now defunct on master.
+ * NEWFILE: Used to delay client apply during newfile
+ * operation, not applicable to master.
+ */
+ F_CLR(rep, REP_F_CLIENT | REP_F_ABBREVIATED |
+ REP_F_MASTERELECT | REP_F_SKIPPED_APPLY | REP_F_DELAY |
+ REP_F_LEASE_EXPIRED | REP_F_NEWFILE);
+ /*
+ * When becoming a master, set the following flags:
+ * MASTER: Indicate that this site is master.
+ * GROUP_ESTD: Having a master means that a replication
+ * group exists.
+ * NIMDBS_LOADED: Inmem dbs are always present on a master.
+ */
+ F_SET(rep, REP_F_MASTER | REP_F_GROUP_ESTD |
+ REP_F_NIMDBS_LOADED);
+ /* Master cannot be in internal init. */
+ rep->sync_state = SYNC_OFF;
+
+ /*
+ * We're master. Set the versions to the current ones.
+ */
+ oldvers = lp->persist.version;
+ /*
+ * If we're moving forward to the current version, we need
+ * to force the log file to advance and reset the
+ * recovery table since it contains pointers to old
+ * recovery functions.
+ */
+ VPRINT(env, (env, DB_VERB_REP_MISC,
+ "rep_start: Old log version was %lu", (u_long)oldvers));
+ if (lp->persist.version != DB_LOGVERSION) {
+ if ((ret = __env_init_rec(env, DB_LOGVERSION)) != 0)
+ goto errunlock;
+ }
+ rep->version = DB_REPVERSION;
+ /*
+ * When becoming a master, clear the following lockouts:
+ * ARCHIVE: Used to keep logs while client may be
+ * inconsistent, not needed on master.
+ * MSG: We set this above to block message processing while
+ * becoming a master, can turn messages back on here.
+ */
+ FLD_CLR(rep->lockout_flags,
+ REP_LOCKOUT_ARCHIVE | REP_LOCKOUT_MSG);
+ REP_SYSTEM_UNLOCK(env);
+ LOG_SYSTEM_LOCK(env);
+ lsn = lp->lsn;
+ LOG_SYSTEM_UNLOCK(env);
+
+ /*
+ * Send the NEWMASTER message first so that clients know
+ * subsequent messages are coming from the right master.
+ * We need to perform all actions below no matter what
+ * regarding errors.
+ */
+ (void)__rep_send_message(env,
+ DB_EID_BROADCAST, REP_NEWMASTER, &lsn, NULL, 0, 0);
+ ret = 0;
+ if (role_chg) {
+ pending_event = DB_EVENT_REP_MASTER;
+ /*
+ * If prepared transactions have not been restored
+ * look to see if there are any. If there are,
+ * then mark the open files, otherwise close them.
+ */
+ region = env->tx_handle->reginfo.primary;
+ if (region->stat.st_nrestores == 0 &&
+ (t_ret = __rep_restore_prepared(env)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ if (region->stat.st_nrestores != 0) {
+ if ((t_ret = __dbreg_mark_restored(env)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ } else {
+ ret = __dbreg_invalidate_files(env, 0);
+ if ((t_ret = __rep_closefiles(env)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ }
+
+ REP_SYSTEM_LOCK(env);
+ F_SET(rep, REP_F_SYS_DB_OP);
+ REP_SYSTEM_UNLOCK(env);
+ if ((t_ret = __txn_recycle_id(env, 0)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /*
+ * Write LSN history database, ahead of unlocking the
+ * API so that clients can always know the heritage of
+ * any transaction they receive via replication.
+ */
+ if ((t_ret = __rep_save_lsn_hist(env, ip, &lsn)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+
+ REP_SYSTEM_LOCK(env);
+ rep->gen_base_lsn = lsn;
+ rep->master_envid = renv->envid;
+ F_CLR(rep, REP_F_SYS_DB_OP);
+ CLR_LOCKOUT_BDB(rep);
+ locked = 0;
+ REP_SYSTEM_UNLOCK(env);
+ (void)__memp_set_config(
+ env->dbenv, DB_MEMP_SYNC_INTERRUPT, 0);
+ interrupting = 0;
+ }
+ } else {
+ /*
+ * Start a non-client as a client.
+ */
+ rep->master_id = DB_EID_INVALID;
+ /*
+ * A non-client should not have been participating in an
+ * election, so most election flags should be off. The TALLY
+ * flag is an exception because it is set any time we receive
+ * a VOTE1 and there is no reason to clear and lose it for an
+ * election that may begin shortly.
+ */
+ DB_ASSERT(env, !FLD_ISSET(rep->elect_flags, ~REP_E_TALLY));
+ /*
+ * A non-client should not have the following client flags
+ * set and should not be in internal init.
+ */
+ DB_ASSERT(env, !F_ISSET(rep,
+ REP_F_ABBREVIATED | REP_F_DELAY | REP_F_NEWFILE));
+ DB_ASSERT(env, rep->sync_state == SYNC_OFF);
+
+ if ((ret = __log_get_oldversion(env, &oldvers)) != 0)
+ goto errunlock;
+ RPRINT(env, (env, DB_VERB_REP_MISC,
+ "rep_start: Found old version log %d", oldvers));
+ if (oldvers >= DB_LOGVERSION_MIN) {
+ __log_set_version(env, oldvers);
+ if ((ret = __env_init_rec(env, oldvers)) != 0)
+ goto errunlock;
+ oldvers = __rep_conv_vers(env, oldvers);
+ DB_ASSERT(env, oldvers != DB_REPVERSION_INVALID);
+ rep->version = oldvers;
+ }
+ /*
+ * When becoming a client, clear the following flags:
+ * MASTER: Site is no longer a master.
+ * MASTERELECT: Indicates that a master is elected
+ * rather than appointed, not applicable on client.
+ */
+ F_CLR(rep, REP_F_MASTER | REP_F_MASTERELECT);
+ F_SET(rep, REP_F_CLIENT);
+
+ /*
+ * On a client, compute the lease duration on the
+ * assumption that the client has a fast clock.
+ * Expire any existing leases we might have held as
+ * a master.
+ */
+ if (IS_USING_LEASES(env) && !IS_REP_STARTED(env)) {
+ if ((ret = __rep_lease_expire(env)) != 0)
+ goto errunlock;
+ /*
+ * Since the master is also compensating on its
+ * side as well, we're being doubly conservative
+ * to compensate on the client side. Theoretically,
+ * this compensation is not necessary, as it is
+ * effectively doubling the skew compensation.
+ * But we are making guarantees based on time and
+ * skews across machines. So we are being extra
+ * cautious.
+ */
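+ /*
+ * Illustrative numbers: with lease_timeout 1000000us and
+ * clock_skew 102 over clock_base 100, the client holds its
+ * grant for 1020000us, longer than the master's view of it.
+ */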
+ tmp = (db_timeout_t)((double)rep->lease_timeout *
+ ((double)rep->clock_skew /
+ (double)rep->clock_base));
+ DB_TIMEOUT_TO_TIMESPEC(tmp, &rep->lease_duration);
+ if (rep->lease_off != INVALID_ROFF) {
+ MUTEX_LOCK(env, renv->mtx_regenv);
+ __env_alloc_free(infop,
+ R_ADDR(infop, rep->lease_off));
+ MUTEX_UNLOCK(env, renv->mtx_regenv);
+ rep->lease_off = INVALID_ROFF;
+ }
+ }
+ REP_SYSTEM_UNLOCK(env);
+
+ /*
+ * Abort any prepared transactions that were restored
+ * by recovery. We won't be able to create any txns of
+ * our own until they're resolved, but we can't resolve
+ * them ourselves; the master has to. If any get
+ * resolved as commits, we'll redo them when commit
+ * records come in. Aborts will simply be ignored.
+ */
+ if ((ret = __rep_abort_prepared(env)) != 0)
+ goto errlock;
+
+ /*
+ * Since we're changing roles we need to init the db.
+ */
+ if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+ goto errlock;
+ /*
+ * Ignore errors, because if the file doesn't exist,
+ * this is perfectly OK.
+ */
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ (void)__db_remove(dbp, ip, NULL, REPDBNAME,
+ NULL, DB_FORCE);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ /*
+ * Set pending_event after calls that can fail.
+ */
+ pending_event = DB_EVENT_REP_CLIENT;
+
+ REP_SYSTEM_LOCK(env);
+ FLD_CLR(rep->lockout_flags, REP_LOCKOUT_MSG);
+ if (locked) {
+ CLR_LOCKOUT_BDB(rep);
+ locked = 0;
+ }
+
+ if (F_ISSET(env, ENV_PRIVATE))
+ /*
+ * If we think we're a new client, and we have a
+ * private env, set our gen number down to 0.
+ * Otherwise, we can restart and think
+ * we're ready to accept a new record (because our
+ * gen is okay), but really this client needs to
+ * sync with the master.
+ */
+ SET_GEN(0);
+ REP_SYSTEM_UNLOCK(env);
+
+ /*
+ * Announce ourselves and send out our data.
+ */
+ if ((ret = __dbt_usercopy(env, dbt)) != 0)
+ goto out;
+ (void)__rep_send_message(env,
+ DB_EID_BROADCAST, REP_NEWCLIENT, NULL, dbt, 0, 0);
+ }
+
+ if (0) {
+ /*
+ * We have separate labels for errors. If we're returning an
+ * error before we've set REP_LOCKOUT_MSG, we use 'out'. If
+ * we are erroring while holding the region mutex, then we use
+ * the 'errunlock' label. If we error without holding the rep
+ * mutex we must use 'errlock'.
+ */
+errlock: REP_SYSTEM_LOCK(env);
+errunlock: FLD_CLR(rep->lockout_flags, REP_LOCKOUT_MSG);
+ if (locked)
+ CLR_LOCKOUT_BDB(rep);
+ if (interrupting)
+ (void)__memp_set_config(
+ env->dbenv, DB_MEMP_SYNC_INTERRUPT, 0);
+ REP_SYSTEM_UNLOCK(env);
+ }
+out:
+ if (ret == 0) {
+ REP_SYSTEM_LOCK(env);
+ F_SET(rep, REP_F_START_CALLED);
+ REP_SYSTEM_UNLOCK(env);
+ }
+ if (pending_event != DB_EVENT_NO_SUCH_EVENT)
+ __rep_fire_event(env, pending_event, NULL);
+ if (start_th)
+ MUTEX_UNLOCK(env, rep->mtx_repstart);
+ __dbt_userfree(env, dbt, NULL, NULL);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * Write the current generation's base LSN into the history database.
+ */
+static int
+__rep_save_lsn_hist(env, ip, lsnp)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ DB_LSN *lsnp;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ REGENV *renv;
+ DB_TXN *txn;
+ DB *dbp;
+ DBT key_dbt, data_dbt;
+ __rep_lsn_hist_key_args key;
+ __rep_lsn_hist_data_args data;
+ u_int8_t key_buf[__REP_LSN_HIST_KEY_SIZE];
+ u_int8_t data_buf[__REP_LSN_HIST_DATA_SIZE];
+ db_timespec now;
+ int ret, t_ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ renv = env->reginfo->primary;
+ txn = NULL;
+ ret = 0;
+
+ if ((ret = __txn_begin(env, ip, NULL, &txn, DB_IGNORE_LEASE)) != 0)
+ return (ret);
+
+ /*
+ * Use the cached handle to the history database if it is already open.
+ * Since we're becoming master, we don't expect to need it after this,
+ * so clear the cached handle and close the database once we've written
+ * our update.
+ */
+ if ((dbp = db_rep->lsn_db) == NULL &&
+ (ret = __rep_open_sysdb(env,
+ ip, txn, REPLSNHIST, DB_CREATE, &dbp)) != 0)
+ goto err;
+
+ key.version = REP_LSN_HISTORY_FMT_VERSION;
+ key.gen = rep->gen;
+ __rep_lsn_hist_key_marshal(env, &key, key_buf);
+
+ data.envid = renv->envid;
+ data.lsn = *lsnp;
+ __os_gettime(env, &now, 0);
+ data.hist_sec = (u_int32_t)now.tv_sec;
+ data.hist_nsec = (u_int32_t)now.tv_nsec;
+ __rep_lsn_hist_data_marshal(env, &data, data_buf);
+
+ DB_INIT_DBT(key_dbt, key_buf, sizeof(key_buf));
+ DB_INIT_DBT(data_dbt, data_buf, sizeof(data_buf));
+
+ ret = __db_put(dbp, ip, txn, &key_dbt, &data_dbt, 0);
+err:
+ if (dbp != NULL &&
+ (t_ret = __db_close(dbp, txn, DB_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+ db_rep->lsn_db = NULL;
+
+ DB_ASSERT(env, txn != NULL);
+ if ((t_ret = __db_txn_auto_resolve(env, txn, 0, ret)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * Open existing LSN history database, wherever it may be (on disk or in
+ * memory). If it doesn't exist, create it only if DB_CREATE is specified by
+ * our caller.
+ *
+ * If we could be sure that all sites in the replication group had matching
+ * REP_C_INMEM settings (that never changed over time), we could simply look for
+ * the database in the place where we knew it should be. The code here tries to
+ * be more flexible/resilient to mis-matching INMEM settings, even though we
+ * recommend against that.
+ * PUBLIC: int __rep_open_sysdb __P((ENV *,
+ * PUBLIC: DB_THREAD_INFO *, DB_TXN *, const char *, u_int32_t, DB **));
+ */
+int
+__rep_open_sysdb(env, ip, txn, dbname, flags, dbpp)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *dbname;
+ u_int32_t flags;
+ DB **dbpp;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ DB *dbp;
+ char *fname;
+ u_int32_t myflags;
+ int ret, t_ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+ return (ret);
+
+ myflags = DB_INTERNAL_PERSISTENT_DB |
+ (F_ISSET(env, ENV_THREAD) ? DB_THREAD : 0);
+
+ /*
+ * First, try opening it as a sub-database within a disk-resident
+ * database file. (If success, skip to the end.)
+ */
+ if ((ret = __db_open(dbp, ip, txn,
+ REPSYSDBNAME, dbname, DB_BTREE, myflags, 0, PGNO_BASE_MD)) == 0)
+ goto found;
+ if (ret != ENOENT)
+ goto err;
+
+ /*
+ * Here, the file was not found. Next, try opening it as an in-memory
+ * database (after the necessary clean-up).
+ */
+ ret = __db_close(dbp, txn, DB_NOSYNC);
+ dbp = NULL;
+ if (ret != 0 || (ret = __db_create_internal(&dbp, env, 0)) != 0)
+ goto err;
+ if ((ret = __db_open(dbp, ip, txn,
+ NULL, dbname, DB_BTREE, myflags, 0, PGNO_BASE_MD)) == 0)
+ goto found;
+ if (ret != ENOENT)
+ goto err;
+
+ /*
+ * Here, the database was not found either on disk or in memory. Create
+ * it, according to our local INMEM setting.
+ */
+ ret = __db_close(dbp, txn, DB_NOSYNC);
+ dbp = NULL;
+ if (ret != 0)
+ goto err;
+ if (LF_ISSET(DB_CREATE)) {
+ if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+ goto err;
+ if ((ret = __db_set_pagesize(dbp, REPSYSDBPGSZ)) != 0)
+ goto err;
+ FLD_SET(myflags, DB_CREATE);
+ fname = FLD_ISSET(rep->config, REP_C_INMEM) ?
+ NULL : REPSYSDBNAME;
+ if ((ret = __db_open(dbp, ip, txn, fname,
+ dbname, DB_BTREE, myflags, 0, PGNO_BASE_MD)) == 0)
+ goto found;
+ } else
+ ret = ENOENT;
+
+err:
+ if (dbp != NULL && (t_ret = __db_close(dbp, txn, DB_NOSYNC)) != 0 &&
+ (ret == 0 || ret == ENOENT))
+ ret = t_ret;
+ return (ret);
+
+found:
+ *dbpp = dbp;
+ return (0);
+}
+
+/*
+ * __rep_client_dbinit --
+ *
+ * Initialize the LSN database on the client side. This is called from the
+ * client initialization code. The startup flag value indicates if
+ * this is the first thread/process starting up and therefore should create
+ * the LSN database. This routine must be called once by each process acting
+ * as a client.
+ *
+ * Assumes caller holds appropriate mutex.
+ *
+ * PUBLIC: int __rep_client_dbinit __P((ENV *, int, repdb_t));
+ */
+int
+__rep_client_dbinit(env, startup, which)
+ ENV *env;
+ int startup;
+ repdb_t which;
+{
+ DB *dbp, **rdbpp;
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ REP *rep;
+ int ret, t_ret;
+ u_int32_t flags;
+ const char *fname, *name, *subdb;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dbp = NULL;
+
+ if (which == REP_DB) {
+ name = REPDBNAME;
+ rdbpp = &db_rep->rep_db;
+ } else {
+ name = REPPAGENAME;
+ rdbpp = &db_rep->file_dbp;
+ }
+ /* Check if this has already been called on this environment. */
+ if (*rdbpp != NULL)
+ return (0);
+
+ ENV_GET_THREAD_INFO(env, ip);
+
+ /* Set up arguments for __db_remove and __db_open calls. */
+ fname = name;
+ subdb = NULL;
+ if (FLD_ISSET(rep->config, REP_C_INMEM)) {
+ fname = NULL;
+ subdb = name;
+ }
+
+ if (startup) {
+ if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+ goto err;
+ /*
+ * Prevent in-memory database remove from writing to
+ * non-existent logs.
+ */
+ if (FLD_ISSET(rep->config, REP_C_INMEM))
+ (void)__db_set_flags(dbp, DB_TXN_NOT_DURABLE);
+ /*
+ * Ignore errors, because if the file doesn't exist, this
+ * is perfectly OK.
+ */
+ (void)__db_remove(dbp, ip, NULL, fname, subdb, DB_FORCE);
+ }
+
+ if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+ goto err;
+ if (which == REP_DB &&
+ (ret = __bam_set_bt_compare(dbp, __rep_bt_cmp)) != 0)
+ goto err;
+
+ /* Don't write log records on the client. */
+ if ((ret = __db_set_flags(dbp, DB_TXN_NOT_DURABLE)) != 0)
+ goto err;
+
+ flags = DB_NO_AUTO_COMMIT | DB_CREATE | DB_INTERNAL_TEMPORARY_DB |
+ (F_ISSET(env, ENV_THREAD) ? DB_THREAD : 0);
+
+ if ((ret = __db_open(dbp, ip, NULL, fname, subdb,
+ (which == REP_DB ? DB_BTREE : DB_RECNO),
+ flags, 0, PGNO_BASE_MD)) != 0)
+ goto err;
+
+ *rdbpp = dbp;
+
+ if (0) {
+err: if (dbp != NULL &&
+ (t_ret = __db_close(dbp, NULL, DB_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+ *rdbpp = NULL;
+ }
+
+ return (ret);
+}
+
+/*
+ * __rep_bt_cmp --
+ *
+ * Comparison function for the LSN table. We use the entire control
+ * structure as a key (for simplicity, so we don't have to merge the
+ * other fields in the control with the data field), but really only
+ * care about the LSNs.
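+ *
+ * For example, a record with LSN [2][100] sorts before one with
+ * LSN [3][0]; records equal in both file and offset compare equal.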
+ */
+static int
+__rep_bt_cmp(dbp, dbt1, dbt2)
+ DB *dbp;
+ const DBT *dbt1, *dbt2;
+{
+ DB_LSN lsn1, lsn2;
+ __rep_control_args *rp1, *rp2;
+
+ COMPQUIET(dbp, NULL);
+
+ rp1 = dbt1->data;
+ rp2 = dbt2->data;
+
+ (void)__ua_memcpy(&lsn1, &rp1->lsn, sizeof(DB_LSN));
+ (void)__ua_memcpy(&lsn2, &rp2->lsn, sizeof(DB_LSN));
+
+ if (lsn1.file > lsn2.file)
+ return (1);
+
+ if (lsn1.file < lsn2.file)
+ return (-1);
+
+ if (lsn1.offset > lsn2.offset)
+ return (1);
+
+ if (lsn1.offset < lsn2.offset)
+ return (-1);
+
+ return (0);
+}
+
+/*
+ * __rep_abort_prepared --
+ * Abort any prepared transactions that recovery restored.
+ *
+ * This is used by clients that have just run recovery, since
+ * they cannot/should not call txn_recover and handle prepared transactions
+ * themselves.
+ */
+static int
+__rep_abort_prepared(env)
+ ENV *env;
+{
+#define PREPLISTSIZE 50
+ DB_LOG *dblp;
+ DB_PREPLIST prep[PREPLISTSIZE], *p;
+ DB_TXNMGR *mgr;
+ DB_TXNREGION *region;
+ LOG *lp;
+ int ret;
+ long count, i;
+ u_int32_t op;
+
+ mgr = env->tx_handle;
+ region = mgr->reginfo.primary;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ if (region->stat.st_nrestores == 0)
+ return (0);
+
+ op = DB_FIRST;
+ do {
+ if ((ret = __txn_recover(env,
+ prep, PREPLISTSIZE, &count, op)) != 0)
+ return (ret);
+ for (i = 0; i < count; i++) {
+ p = &prep[i];
+ if ((ret = __txn_abort(p->txn)) != 0)
+ return (ret);
+ env->rep_handle->region->op_cnt--;
+ env->rep_handle->region->max_prep_lsn = lp->lsn;
+ region->stat.st_nrestores--;
+ }
+ op = DB_NEXT;
+ } while (count == PREPLISTSIZE);
+
+ return (0);
+}
+
+/*
+ * __rep_restore_prepared --
+ * Restore to a prepared state any prepared but not yet committed
+ * transactions.
+ *
+ * This performs, in effect, a "mini-recovery"; it is called from
+ * __rep_start by newly upgraded masters. There may be transactions that an
+ * old master prepared but did not resolve, which we need to restore to an
+ * active state.
+ */
+static int
+__rep_restore_prepared(env)
+ ENV *env;
+{
+ DBT rec;
+ DB_LOGC *logc;
+ DB_LSN ckp_lsn, lsn;
+ DB_REP *db_rep;
+ DB_TXNHEAD *txninfo;
+ REP *rep;
+ __txn_ckp_args *ckp_args;
+ __txn_regop_args *regop_args;
+ __txn_prepare_args *prep_args;
+ int ret, t_ret;
+ u_int32_t hi_txn, low_txn, rectype, status, txnid, txnop;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ if (IS_ZERO_LSN(rep->max_prep_lsn)) {
+ VPRINT(env, (env, DB_VERB_REP_MISC,
+ "restore_prep: No prepares. Skip."));
+ return (0);
+ }
+ txninfo = NULL;
+ ckp_args = NULL;
+ prep_args = NULL;
+ regop_args = NULL;
+ ZERO_LSN(ckp_lsn);
+ ZERO_LSN(lsn);
+
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ return (ret);
+
+ /*
+ * Get our first LSN to see if the prepared LSN is still
+ * available. If so, it might be unresolved. If not,
+ * then it is guaranteed to be resolved.
+ */
+ memset(&rec, 0, sizeof(DBT));
+ if ((ret = __logc_get(logc, &lsn, &rec, DB_FIRST)) != 0) {
+ __db_errx(env, DB_STR("3559", "First record not found"));
+ goto err;
+ }
+ /*
+ * If the max_prep_lsn is no longer available, we're sure
+ * that txn has been resolved. We're done.
+ */
+ if (rep->max_prep_lsn.file < lsn.file) {
+ VPRINT(env, (env, DB_VERB_REP_MISC,
+ "restore_prep: Prepare resolved. Skip"));
+ ZERO_LSN(rep->max_prep_lsn);
+ goto done;
+ }
+ /*
+ * We need to consider the set of records between the most recent
+ * checkpoint LSN and the end of the log; any txn in that
+ * range, and only txns in that range, could still have been
+ * active, and thus prepared but not yet committed (PBNYC),
+ * when the old master died.
+ *
+ * Find the most recent checkpoint LSN, and get the record there.
+ * If there is no checkpoint in the log, start off by getting
+ * the very first record in the log instead.
+ */
+ if ((ret = __txn_getckp(env, &lsn)) == 0) {
+ if ((ret = __logc_get(logc, &lsn, &rec, DB_SET)) != 0) {
+ __db_errx(env, DB_STR_A("3560",
+ "Checkpoint record at LSN [%lu][%lu] not found",
+ "%lu %lu"), (u_long)lsn.file, (u_long)lsn.offset);
+ goto err;
+ }
+
+ if ((ret = __txn_ckp_read(
+ env, rec.data, &ckp_args)) == 0) {
+ ckp_lsn = ckp_args->ckp_lsn;
+ __os_free(env, ckp_args);
+ }
+ if (ret != 0) {
+ __db_errx(env, DB_STR_A("3561",
+ "Invalid checkpoint record at [%lu][%lu]",
+ "%lu %lu"), (u_long)lsn.file, (u_long)lsn.offset);
+ goto err;
+ }
+
+ if ((ret = __logc_get(logc, &ckp_lsn, &rec, DB_SET)) != 0) {
+ __db_errx(env, DB_STR_A("3562",
+ "Checkpoint LSN record [%lu][%lu] not found",
+ "%lu %lu"),
+ (u_long)ckp_lsn.file, (u_long)ckp_lsn.offset);
+ goto err;
+ }
+ } else if ((ret = __logc_get(logc, &lsn, &rec, DB_FIRST)) != 0) {
+ if (ret == DB_NOTFOUND) {
+ /* An empty log means no PBNYC txns. */
+ ret = 0;
+ goto done;
+ }
+ __db_errx(env, DB_STR("3563",
+ "Attempt to get first log record failed"));
+ goto err;
+ }
+
+ /*
+ * We use the same txnlist infrastructure that recovery does;
+ * it demands an estimate of the high and low txnids for
+ * initialization.
+ *
+ * First, the low txnid.
+ */
+ do {
+ /* txnid is after rectype, which is a u_int32_t. */
+ LOGCOPY_32(env, &low_txn,
+ (u_int8_t *)rec.data + sizeof(u_int32_t));
+ if (low_txn != 0)
+ break;
+ } while ((ret = __logc_get(logc, &lsn, &rec, DB_NEXT)) == 0);
+
+ /* If there are no txns, there are no PBNYC txns. */
+ if (ret == DB_NOTFOUND) {
+ ret = 0;
+ goto done;
+ } else if (ret != 0)
+ goto err;
+
+ /* Now, the high txnid. */
+ if ((ret = __logc_get(logc, &lsn, &rec, DB_LAST)) != 0) {
+ /*
+ * Note that DB_NOTFOUND is unacceptable here because we
+ * had to have looked at some log record to get this far.
+ */
+ __db_errx(env, DB_STR("3564",
+ "Final log record not found"));
+ goto err;
+ }
+ do {
+ /* txnid is after rectype, which is a u_int32_t. */
+ LOGCOPY_32(env, &hi_txn,
+ (u_int8_t *)rec.data + sizeof(u_int32_t));
+ if (hi_txn != 0)
+ break;
+ } while ((ret = __logc_get(logc, &lsn, &rec, DB_PREV)) == 0);
+ if (ret == DB_NOTFOUND) {
+ ret = 0;
+ goto done;
+ } else if (ret != 0)
+ goto err;
+
+ /* We have a high and low txnid. Initialize the txn list. */
+ if ((ret = __db_txnlist_init(env,
+ NULL, low_txn, hi_txn, NULL, &txninfo)) != 0)
+ goto err;
+
+ /*
+ * Now, walk backward from the end of the log to ckp_lsn. Any
+ * prepares that we hit without first hitting a commit or
+ * abort belong to PBNYC txns, and we need to apply them and
+ * restore them to a prepared state.
+ *
+ * Note that we wind up applying transactions out of order.
+ * Since all PBNYC txns still held locks on the old master and
+ * were isolated, this should be safe.
+ */
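+ /*
+ * Illustrative example: if the log tail holds, in order,
+ * prepare(T1), prepare(T2), commit(T2), the backward scan sees
+ * T2's commit first and marks it resolved, so only T1's prepare
+ * is re-applied and restored to its prepared state.
+ */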
+ F_SET(env->lg_handle, DBLOG_RECOVER);
+ for (ret = __logc_get(logc, &lsn, &rec, DB_LAST);
+ ret == 0 && LOG_COMPARE(&lsn, &ckp_lsn) > 0;
+ ret = __logc_get(logc, &lsn, &rec, DB_PREV)) {
+ LOGCOPY_32(env, &rectype, rec.data);
+ switch (rectype) {
+ case DB___txn_regop:
+ /*
+ * It's a commit or abort--but we don't care
+ * which! Just add it to the list of txns
+ * that are resolved.
+ */
+ if ((ret = __txn_regop_read(
+ env, rec.data, &regop_args)) != 0)
+ goto err;
+ txnid = regop_args->txnp->txnid;
+ txnop = regop_args->opcode;
+ __os_free(env, regop_args);
+
+ ret = __db_txnlist_find(env,
+ txninfo, txnid, &status);
+ if (ret == DB_NOTFOUND)
+ ret = __db_txnlist_add(env, txninfo,
+ txnid, txnop, &lsn);
+ else if (ret != 0)
+ goto err;
+ break;
+ case DB___txn_prepare:
+ /*
+ * It's a prepare. If it's not aborted and
+ * we haven't put the txn on our list yet, it
+ * hasn't been resolved, so apply and restore it.
+ */
+ if ((ret = __txn_prepare_read(
+ env, rec.data, &prep_args)) != 0)
+ goto err;
+ ret = __db_txnlist_find(env, txninfo,
+ prep_args->txnp->txnid, &status);
+ if (ret == DB_NOTFOUND) {
+ if (prep_args->opcode == TXN_ABORT)
+ ret = __db_txnlist_add(env, txninfo,
+ prep_args->txnp->txnid,
+ prep_args->opcode, &lsn);
+ else if ((ret =
+ __rep_process_txn(env, &rec)) == 0) {
+ /*
+ * We are guaranteed to be single
+ * threaded here. We need to
+ * account for this newly
+ * instantiated txn in the op_cnt
+ * so that it is counted when it is
+ * resolved.
+ */
+ rep->op_cnt++;
+ ret = __txn_restore_txn(env,
+ &lsn, prep_args);
+ }
+ } else if (ret != 0)
+ goto err;
+ __os_free(env, prep_args);
+ break;
+ default:
+ continue;
+ }
+ }
+
+ /* It's not an error to have hit the beginning of the log. */
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+
+done:
+err: t_ret = __logc_close(logc);
+ F_CLR(env->lg_handle, DBLOG_RECOVER);
+
+ if (txninfo != NULL)
+ __db_txnlist_end(env, txninfo);
+
+ return (ret == 0 ? t_ret : ret);
+}
+
+/*
+ * __rep_get_limit --
+ * Get the limit on the amount of data that will be sent during a single
+ * invocation of __rep_process_message.
+ *
+ * PUBLIC: int __rep_get_limit __P((DB_ENV *, u_int32_t *, u_int32_t *));
+ */
+int
+__rep_get_limit(dbenv, gbytesp, bytesp)
+ DB_ENV *dbenv;
+ u_int32_t *gbytesp, *bytesp;
+{
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ REP *rep;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+
+ ENV_NOT_CONFIGURED(
+ env, db_rep->region, "DB_ENV->rep_get_limit", DB_INIT_REP);
+
+ if (REP_ON(env)) {
+ rep = db_rep->region;
+ ENV_ENTER(env, ip);
+ REP_SYSTEM_LOCK(env);
+ if (gbytesp != NULL)
+ *gbytesp = rep->gbytes;
+ if (bytesp != NULL)
+ *bytesp = rep->bytes;
+ REP_SYSTEM_UNLOCK(env);
+ ENV_LEAVE(env, ip);
+ } else {
+ if (gbytesp != NULL)
+ *gbytesp = db_rep->gbytes;
+ if (bytesp != NULL)
+ *bytesp = db_rep->bytes;
+ }
+
+ return (0);
+}
+
+/*
+ * __rep_set_limit --
+ * Set a limit on the amount of data that will be sent during a single
+ * invocation of __rep_process_message.
+ *
+ * PUBLIC: int __rep_set_limit __P((DB_ENV *, u_int32_t, u_int32_t));
+ */
+int
+__rep_set_limit(dbenv, gbytes, bytes)
+ DB_ENV *dbenv;
+ u_int32_t gbytes, bytes;
+{
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ REP *rep;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+
+ ENV_NOT_CONFIGURED(
+ env, db_rep->region, "DB_ENV->rep_set_limit", DB_INIT_REP);
+
+ if (bytes > GIGABYTE) {
+ gbytes += bytes / GIGABYTE;
+ bytes = bytes % GIGABYTE;
+ }
+
+ if (REP_ON(env)) {
+ rep = db_rep->region;
+ ENV_ENTER(env, ip);
+ REP_SYSTEM_LOCK(env);
+ rep->gbytes = gbytes;
+ rep->bytes = bytes;
+ REP_SYSTEM_UNLOCK(env);
+ ENV_LEAVE(env, ip);
+ } else {
+ db_rep->gbytes = gbytes;
+ db_rep->bytes = bytes;
+ }
+
+ return (0);
+}
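+
+/*
+ * Illustrative usage (a sketch, not part of this file): an application
+ * wanting to cap the data sent during any one __rep_process_message
+ * call at roughly 10 megabytes might configure:
+ *
+ *	if ((ret = dbenv->rep_set_limit(dbenv, 0, 10 * 1024 * 1024)) != 0)
+ *		goto err;
+ *
+ * Byte counts above GIGABYTE are folded into gbytes by the code above;
+ * e.g., rep_set_limit(dbenv, 0, GIGABYTE + 1) stores gbytes == 1 and
+ * bytes == 1.
+ */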
+
+/*
+ * PUBLIC: int __rep_set_nsites_pp __P((DB_ENV *, u_int32_t));
+ */
+int
+__rep_set_nsites_pp(dbenv, n)
+ DB_ENV *dbenv;
+ u_int32_t n;
+{
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+
+ ENV_NOT_CONFIGURED(
+ env, db_rep->region, "DB_ENV->rep_set_nsites", DB_INIT_REP);
+ if (APP_IS_REPMGR(env)) {
+ __db_errx(env, DB_STR("3565",
+"DB_ENV->rep_set_nsites: cannot call from Replication Manager application"));
+ return (EINVAL);
+ }
+ if ((ret = __rep_set_nsites_int(env, n)) == 0)
+ APP_SET_BASEAPI(env);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __rep_set_nsites_int __P((ENV *, u_int32_t));
+ */
+int
+__rep_set_nsites_int(env, n)
+ ENV *env;
+ u_int32_t n;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ int ret;
+
+ db_rep = env->rep_handle;
+
+ ret = 0;
+ if (REP_ON(env)) {
+ rep = db_rep->region;
+ rep->config_nsites = n;
+ if (IS_USING_LEASES(env) &&
+ IS_REP_MASTER(env) && IS_REP_STARTED(env)) {
+ REP_SYSTEM_LOCK(env);
+ ret = __rep_lease_table_alloc(env, n);
+ REP_SYSTEM_UNLOCK(env);
+ }
+ } else
+ db_rep->config_nsites = n;
+ return (ret);
+}
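+
+/*
+ * Illustrative usage (a sketch; base API only, since the wrapper above
+ * rejects Replication Manager applications): a five-site group would
+ * configure its expected group size before holding elections:
+ *
+ *	if ((ret = dbenv->rep_set_nsites(dbenv, 5)) != 0)
+ *		goto err;
+ */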
+
+/*
+ * PUBLIC: int __rep_get_nsites __P((DB_ENV *, u_int32_t *));
+ */
+int
+__rep_get_nsites(dbenv, n)
+ DB_ENV *dbenv;
+ u_int32_t *n;
+{
+ DB_REP *db_rep;
+ ENV *env;
+ REP *rep;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+
+ ENV_NOT_CONFIGURED(
+ env, db_rep->region, "DB_ENV->rep_get_nsites", DB_INIT_REP);
+
+ if (APP_IS_REPMGR(env))
+ return (__repmgr_get_nsites(env, n));
+ if (REP_ON(env)) {
+ rep = db_rep->region;
+ *n = rep->config_nsites;
+ } else
+ *n = db_rep->config_nsites;
+
+ return (0);
+}
+
+/*
+ * PUBLIC: int __rep_set_priority __P((DB_ENV *, u_int32_t));
+ */
+int
+__rep_set_priority(dbenv, priority)
+ DB_ENV *dbenv;
+ u_int32_t priority;
+{
+ DB_REP *db_rep;
+ ENV *env;
+ REP *rep;
+ u_int32_t prev;
+ int ret;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+
+ ENV_NOT_CONFIGURED(
+ env, db_rep->region, "DB_ENV->rep_set_priority", DB_INIT_REP);
+
+ ret = 0;
+ if (REP_ON(env)) {
+ rep = db_rep->region;
+ prev = rep->priority;
+ rep->priority = priority;
+#ifdef HAVE_REPLICATION_THREADS
+ ret = __repmgr_chg_prio(env, prev, priority);
+#endif
+ } else
+ db_rep->my_priority = priority;
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __rep_get_priority __P((DB_ENV *, u_int32_t *));
+ */
+int
+__rep_get_priority(dbenv, priority)
+ DB_ENV *dbenv;
+ u_int32_t *priority;
+{
+ DB_REP *db_rep;
+ ENV *env;
+ REP *rep;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+
+ ENV_NOT_CONFIGURED(
+ env, db_rep->region, "DB_ENV->rep_get_priority", DB_INIT_REP);
+
+ if (REP_ON(env)) {
+ rep = db_rep->region;
+ *priority = rep->priority;
+ } else
+ *priority = db_rep->my_priority;
+ return (0);
+}
+
+/*
+ * PUBLIC: int __rep_set_timeout __P((DB_ENV *, int, db_timeout_t));
+ */
+int
+__rep_set_timeout(dbenv, which, timeout)
+ DB_ENV *dbenv;
+ int which;
+ db_timeout_t timeout;
+{
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ REP *rep;
+ int repmgr_timeout, ret;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ ret = 0;
+ repmgr_timeout = 0;
+
+ if (timeout == 0 && (which == DB_REP_CONNECTION_RETRY ||
+ which == DB_REP_ELECTION_TIMEOUT || which == DB_REP_LEASE_TIMEOUT ||
+ which == DB_REP_ELECTION_RETRY)) {
+ __db_errx(env, DB_STR("3566", "timeout value must be > 0"));
+ return (EINVAL);
+ }
+
+ if (which == DB_REP_ACK_TIMEOUT || which == DB_REP_CONNECTION_RETRY ||
+ which == DB_REP_ELECTION_RETRY ||
+ which == DB_REP_HEARTBEAT_MONITOR ||
+ which == DB_REP_HEARTBEAT_SEND)
+ repmgr_timeout = 1;
+
+ ENV_NOT_CONFIGURED(
+ env, db_rep->region, "DB_ENV->rep_set_timeout", DB_INIT_REP);
+
+ if (APP_IS_BASEAPI(env) && repmgr_timeout) {
+ __db_errx(env, DB_STR_A("3567",
+"%scannot set Replication Manager timeout from base replication application",
+ "%s"), "DB_ENV->rep_set_timeout:");
+ return (EINVAL);
+ }
+ if (which == DB_REP_LEASE_TIMEOUT && IS_REP_STARTED(env)) {
+ __db_errx(env, DB_STR_A("3568",
+"%s: lease timeout must be set before DB_ENV->rep_start.",
+ "%s"), "DB_ENV->rep_set_timeout");
+ return (EINVAL);
+ }
+
+ switch (which) {
+ case DB_REP_CHECKPOINT_DELAY:
+ if (REP_ON(env))
+ rep->chkpt_delay = timeout;
+ else
+ db_rep->chkpt_delay = timeout;
+ break;
+ case DB_REP_ELECTION_TIMEOUT:
+ if (REP_ON(env))
+ rep->elect_timeout = timeout;
+ else
+ db_rep->elect_timeout = timeout;
+ break;
+ case DB_REP_FULL_ELECTION_TIMEOUT:
+ if (REP_ON(env))
+ rep->full_elect_timeout = timeout;
+ else
+ db_rep->full_elect_timeout = timeout;
+ break;
+ case DB_REP_LEASE_TIMEOUT:
+ if (REP_ON(env))
+ rep->lease_timeout = timeout;
+ else
+ db_rep->lease_timeout = timeout;
+ break;
+#ifdef HAVE_REPLICATION_THREADS
+ case DB_REP_ACK_TIMEOUT:
+ if (REP_ON(env))
+ rep->ack_timeout = timeout;
+ else
+ db_rep->ack_timeout = timeout;
+ break;
+ case DB_REP_CONNECTION_RETRY:
+ if (REP_ON(env))
+ rep->connection_retry_wait = timeout;
+ else
+ db_rep->connection_retry_wait = timeout;
+ break;
+ case DB_REP_ELECTION_RETRY:
+ if (REP_ON(env))
+ rep->election_retry_wait = timeout;
+ else
+ db_rep->election_retry_wait = timeout;
+ break;
+ case DB_REP_HEARTBEAT_MONITOR:
+ if (REP_ON(env))
+ rep->heartbeat_monitor_timeout = timeout;
+ else
+ db_rep->heartbeat_monitor_timeout = timeout;
+ break;
+ case DB_REP_HEARTBEAT_SEND:
+ if (REP_ON(env))
+ rep->heartbeat_frequency = timeout;
+ else
+ db_rep->heartbeat_frequency = timeout;
+ break;
+#endif
+ default:
+ __db_errx(env, DB_STR("3569",
+ "Unknown timeout type argument to DB_ENV->rep_set_timeout"));
+ ret = EINVAL;
+ }
+
+	/* Setting a repmgr timeout makes this a repmgr application. */
+ if (ret == 0 && repmgr_timeout)
+ APP_SET_REPMGR(env);
+ return (ret);
+}
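+
+/*
+ * Illustrative usage (a sketch): a base API application setting a
+ * two-second election timeout; db_timeout_t values are in microseconds.
+ * Note that setting one of the repmgr-only timeouts (for example,
+ * DB_REP_ACK_TIMEOUT) would mark the environment as a Replication
+ * Manager application, per the APP_SET_REPMGR call above.
+ *
+ *	if ((ret = dbenv->rep_set_timeout(dbenv,
+ *	    DB_REP_ELECTION_TIMEOUT, 2000000)) != 0)
+ *		goto err;
+ */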
+
+/*
+ * PUBLIC: int __rep_get_timeout __P((DB_ENV *, int, db_timeout_t *));
+ */
+int
+__rep_get_timeout(dbenv, which, timeout)
+ DB_ENV *dbenv;
+ int which;
+ db_timeout_t *timeout;
+{
+ DB_REP *db_rep;
+ ENV *env;
+ REP *rep;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ ENV_NOT_CONFIGURED(
+ env, db_rep->region, "DB_ENV->rep_get_timeout", DB_INIT_REP);
+
+ switch (which) {
+ case DB_REP_CHECKPOINT_DELAY:
+ *timeout = REP_ON(env) ?
+ rep->chkpt_delay : db_rep->chkpt_delay;
+ break;
+ case DB_REP_ELECTION_TIMEOUT:
+ *timeout = REP_ON(env) ?
+ rep->elect_timeout : db_rep->elect_timeout;
+ break;
+ case DB_REP_FULL_ELECTION_TIMEOUT:
+ *timeout = REP_ON(env) ?
+ rep->full_elect_timeout : db_rep->full_elect_timeout;
+ break;
+ case DB_REP_LEASE_TIMEOUT:
+ *timeout = REP_ON(env) ?
+ rep->lease_timeout : db_rep->lease_timeout;
+ break;
+#ifdef HAVE_REPLICATION_THREADS
+ case DB_REP_ACK_TIMEOUT:
+ *timeout = REP_ON(env) ?
+ rep->ack_timeout : db_rep->ack_timeout;
+ break;
+ case DB_REP_CONNECTION_RETRY:
+ *timeout = REP_ON(env) ?
+ rep->connection_retry_wait : db_rep->connection_retry_wait;
+ break;
+ case DB_REP_ELECTION_RETRY:
+ *timeout = REP_ON(env) ?
+ rep->election_retry_wait : db_rep->election_retry_wait;
+ break;
+ case DB_REP_HEARTBEAT_MONITOR:
+ *timeout = REP_ON(env) ? rep->heartbeat_monitor_timeout :
+ db_rep->heartbeat_monitor_timeout;
+ break;
+ case DB_REP_HEARTBEAT_SEND:
+ *timeout = REP_ON(env) ?
+ rep->heartbeat_frequency : db_rep->heartbeat_frequency;
+ break;
+#endif
+ default:
+ __db_errx(env, DB_STR("3570",
+ "unknown timeout type argument to DB_ENV->rep_get_timeout"));
+ return (EINVAL);
+ }
+
+ return (0);
+}
+
+/*
+ * __rep_get_request --
+ *	Get the minimum and maximum amount of time we wait before
+ *	retransmitting a request.
+ *
+ * PUBLIC: int __rep_get_request
+ * PUBLIC: __P((DB_ENV *, db_timeout_t *, db_timeout_t *));
+ */
+int
+__rep_get_request(dbenv, minp, maxp)
+ DB_ENV *dbenv;
+ db_timeout_t *minp, *maxp;
+{
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ REP *rep;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+
+ ENV_NOT_CONFIGURED(
+ env, db_rep->region, "DB_ENV->rep_get_request", DB_INIT_REP);
+
+ if (REP_ON(env)) {
+ rep = db_rep->region;
+ ENV_ENTER(env, ip);
+ /*
+ * We acquire the mtx_region or mtx_clientdb mutexes as needed.
+ */
+ REP_SYSTEM_LOCK(env);
+ if (minp != NULL)
+ DB_TIMESPEC_TO_TIMEOUT((*minp), &rep->request_gap, 0);
+ if (maxp != NULL)
+ DB_TIMESPEC_TO_TIMEOUT((*maxp), &rep->max_gap, 0);
+ REP_SYSTEM_UNLOCK(env);
+ ENV_LEAVE(env, ip);
+ } else {
+ if (minp != NULL)
+ DB_TIMESPEC_TO_TIMEOUT((*minp),
+ &db_rep->request_gap, 0);
+ if (maxp != NULL)
+ DB_TIMESPEC_TO_TIMEOUT((*maxp), &db_rep->max_gap, 0);
+ }
+
+ return (0);
+}
+
+/*
+ * __rep_set_request --
+ *	Set the minimum and maximum amount of time we wait before
+ *	retransmitting a request.
+ *
+ * PUBLIC: int __rep_set_request __P((DB_ENV *, db_timeout_t, db_timeout_t));
+ */
+int
+__rep_set_request(dbenv, min, max)
+ DB_ENV *dbenv;
+ db_timeout_t min, max;
+{
+ DB_LOG *dblp;
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ LOG *lp;
+ REP *rep;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+
+ ENV_NOT_CONFIGURED(
+ env, db_rep->region, "DB_ENV->rep_set_request", DB_INIT_REP);
+
+ if (min == 0 || max < min) {
+ __db_errx(env, DB_STR("3571",
+ "DB_ENV->rep_set_request: Invalid min or max values"));
+ return (EINVAL);
+ }
+ if (REP_ON(env)) {
+ rep = db_rep->region;
+ ENV_ENTER(env, ip);
+ /*
+ * We acquire the mtx_region or mtx_clientdb mutexes as needed.
+ */
+ REP_SYSTEM_LOCK(env);
+ DB_TIMEOUT_TO_TIMESPEC(min, &rep->request_gap);
+ DB_TIMEOUT_TO_TIMESPEC(max, &rep->max_gap);
+ REP_SYSTEM_UNLOCK(env);
+
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ dblp = env->lg_handle;
+ if (dblp != NULL && (lp = dblp->reginfo.primary) != NULL) {
+ DB_TIMEOUT_TO_TIMESPEC(min, &lp->wait_ts);
+ }
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ ENV_LEAVE(env, ip);
+ } else {
+ DB_TIMEOUT_TO_TIMESPEC(min, &db_rep->request_gap);
+ DB_TIMEOUT_TO_TIMESPEC(max, &db_rep->max_gap);
+ }
+
+ return (0);
+}
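+
+/*
+ * Illustrative usage (a sketch; the particular numbers are arbitrary):
+ * rerequest a missing record after 40 milliseconds, backing off to at
+ * most 1.28 seconds between rerequests, both in microseconds:
+ *
+ *	if ((ret = dbenv->rep_set_request(dbenv, 40000, 1280000)) != 0)
+ *		goto err;
+ */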
+
+/*
+ * __rep_set_transport_pp --
+ * Set the transport function for replication.
+ *
+ * PUBLIC: int __rep_set_transport_pp __P((DB_ENV *, int,
+ * PUBLIC: int (*)(DB_ENV *, const DBT *, const DBT *, const DB_LSN *,
+ * PUBLIC: int, u_int32_t)));
+ */
+int
+__rep_set_transport_pp(dbenv, eid, f_send)
+ DB_ENV *dbenv;
+ int eid;
+ int (*f_send) __P((DB_ENV *,
+ const DBT *, const DBT *, const DB_LSN *, int, u_int32_t));
+{
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+ ret = 0;
+
+ ENV_NOT_CONFIGURED(
+ env, db_rep->region, "DB_ENV->rep_set_transport", DB_INIT_REP);
+
+ if (APP_IS_REPMGR(env)) {
+ __db_errx(env, DB_STR("3572",
+ "DB_ENV->rep_set_transport: cannot call from "
+ "Replication Manager application"));
+ return (EINVAL);
+ }
+
+ if (f_send == NULL) {
+ __db_errx(env, DB_STR("3573",
+ "DB_ENV->rep_set_transport: no send function specified"));
+ return (EINVAL);
+ }
+
+ if (eid < 0) {
+ __db_errx(env, DB_STR("3574",
+ "DB_ENV->rep_set_transport: eid must be greater than or equal to 0"));
+ return (EINVAL);
+ }
+
+ if ((ret = __rep_set_transport_int(env, eid, f_send)) == 0)
+ /*
+ * Setting a non-repmgr send function makes this a base API
+ * application.
+ */
+ APP_SET_BASEAPI(env);
+
+ return (ret);
+}
+
+/*
+ * __rep_set_transport_int --
+ * Set the internal values for the transport function for replication.
+ *
+ * PUBLIC: int __rep_set_transport_int __P((ENV *, int,
+ * PUBLIC: int (*)(DB_ENV *, const DBT *, const DBT *, const DB_LSN *,
+ * PUBLIC: int, u_int32_t)));
+ */
+int
+__rep_set_transport_int(env, eid, f_send)
+ ENV *env;
+ int eid;
+ int (*f_send) __P((DB_ENV *,
+ const DBT *, const DBT *, const DB_LSN *, int, u_int32_t));
+{
+ DB_REP *db_rep;
+ REP *rep;
+
+ db_rep = env->rep_handle;
+ db_rep->send = f_send;
+ if (REP_ON(env)) {
+ rep = db_rep->region;
+ rep->eid = eid;
+ } else
+ db_rep->eid = eid;
+ return (0);
+}
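+
+/*
+ * Illustrative usage (a sketch; my_send and MY_EID are hypothetical
+ * application names): the callback must transmit the control and rec
+ * DBTs to the site named by envid (or to all sites, for
+ * DB_EID_BROADCAST), returning nonzero only when the message is known
+ * not to have been delivered:
+ *
+ *	static int
+ *	my_send(DB_ENV *dbenv, const DBT *control, const DBT *rec,
+ *	    const DB_LSN *lsnp, int envid, u_int32_t flags)
+ *	{
+ *		... write control and rec to the wire ...
+ *		return (0);
+ *	}
+ *
+ *	if ((ret = dbenv->rep_set_transport(dbenv, MY_EID, my_send)) != 0)
+ *		goto err;
+ */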
+
+/*
+ * PUBLIC: int __rep_get_clockskew __P((DB_ENV *, u_int32_t *, u_int32_t *));
+ */
+int
+__rep_get_clockskew(dbenv, fast_clockp, slow_clockp)
+ DB_ENV *dbenv;
+ u_int32_t *fast_clockp, *slow_clockp;
+{
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ REP *rep;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+
+ ENV_NOT_CONFIGURED(
+ env, db_rep->region, "DB_ENV->rep_get_clockskew", DB_INIT_REP);
+
+ if (REP_ON(env)) {
+ rep = db_rep->region;
+ ENV_ENTER(env, ip);
+ REP_SYSTEM_LOCK(env);
+ *fast_clockp = rep->clock_skew;
+ *slow_clockp = rep->clock_base;
+ REP_SYSTEM_UNLOCK(env);
+ ENV_LEAVE(env, ip);
+ } else {
+ *fast_clockp = db_rep->clock_skew;
+ *slow_clockp = db_rep->clock_base;
+ }
+
+ return (0);
+}
+
+/*
+ * PUBLIC: int __rep_set_clockskew __P((DB_ENV *, u_int32_t, u_int32_t));
+ */
+int
+__rep_set_clockskew(dbenv, fast_clock, slow_clock)
+ DB_ENV *dbenv;
+ u_int32_t fast_clock, slow_clock;
+{
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ REP *rep;
+ int ret;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+ ret = 0;
+
+ ENV_NOT_CONFIGURED(
+ env, db_rep->region, "DB_ENV->rep_set_clockskew", DB_INIT_REP);
+
+ /*
+ * Check for valid values. The fast clock should be a larger
+ * number than the slow clock. We use the slow clock value as
+	 * our base for adjustment; therefore, a 2% difference should
+	 * be fast == 102, slow == 100.  If both values are 0, set
+	 * them both to 1 internally.
+ *
+ * We will use these numbers to compute the larger ratio to be
+ * most conservative about the user's intention.
+ */
+ if (fast_clock == 0 || slow_clock == 0) {
+ /*
+ * If one value is zero, reject if both aren't zero.
+ */
+ if (slow_clock != 0 || fast_clock != 0) {
+ __db_errx(env, DB_STR("3575",
+			    "DB_ENV->rep_set_clockskew: Zero is only valid "
+			    "when used for both arguments"));
+ return (EINVAL);
+ }
+ fast_clock = 1;
+ slow_clock = 1;
+ }
+ if (fast_clock < slow_clock) {
+ __db_errx(env, DB_STR("3576",
+ "DB_ENV->rep_set_clockskew: slow_clock value is "
+		    "larger than fast_clock value"));
+ return (EINVAL);
+ }
+ if (REP_ON(env)) {
+ rep = db_rep->region;
+ if (IS_REP_STARTED(env)) {
+ __db_errx(env, DB_STR("3577",
+ "DB_ENV->rep_set_clockskew: must be called before DB_ENV->rep_start"));
+ return (EINVAL);
+ }
+ ENV_ENTER(env, ip);
+ REP_SYSTEM_LOCK(env);
+ rep->clock_skew = fast_clock;
+ rep->clock_base = slow_clock;
+ REP_SYSTEM_UNLOCK(env);
+ ENV_LEAVE(env, ip);
+ } else {
+ db_rep->clock_skew = fast_clock;
+ db_rep->clock_base = slow_clock;
+ }
+ return (ret);
+}
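+
+/*
+ * Illustrative usage (a sketch): per the comment above, a group whose
+ * site clocks may differ by up to 2% would be configured as:
+ *
+ *	if ((ret = dbenv->rep_set_clockskew(dbenv, 102, 100)) != 0)
+ *		goto err;
+ *
+ * Per the IS_REP_STARTED check, this must precede DB_ENV->rep_start.
+ */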
+
+/*
+ * __rep_flush --
+ * Re-push the last log record to all clients, in case they've lost
+ * messages and don't know it.
+ *
+ * PUBLIC: int __rep_flush __P((DB_ENV *));
+ */
+int
+__rep_flush(dbenv)
+ DB_ENV *dbenv;
+{
+ DBT rec;
+ DB_LOGC *logc;
+ DB_LSN lsn;
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret, t_ret;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+
+ ENV_REQUIRES_CONFIG_XX(
+ env, rep_handle, "DB_ENV->rep_flush", DB_INIT_REP);
+
+ if (IS_REP_CLIENT(env))
+ return (0);
+
+ /* We need a transport function because we send messages. */
+ if (db_rep->send == NULL) {
+ __db_errx(env, DB_STR("3578",
+ "DB_ENV->rep_flush: must be called after DB_ENV->rep_set_transport"));
+ return (EINVAL);
+ }
+
+ ENV_ENTER(env, ip);
+
+	if ((ret = __log_cursor(env, &logc)) != 0)
+		goto out;
+
+ memset(&rec, 0, sizeof(rec));
+ memset(&lsn, 0, sizeof(lsn));
+
+ if ((ret = __logc_get(logc, &lsn, &rec, DB_LAST)) != 0)
+ goto err;
+
+ (void)__rep_send_message(env,
+ DB_EID_BROADCAST, REP_LOG, &lsn, &rec, 0, 0);
+
+err:	if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
+		ret = t_ret;
+out:	ENV_LEAVE(env, ip);
+	return (ret);
+}
+
+/*
+ * __rep_sync --
+ * Force a synchronization to occur between this client and the master.
+ * This is the other half of configuring DELAYCLIENT.
+ *
+ * PUBLIC: int __rep_sync __P((DB_ENV *, u_int32_t));
+ */
+int
+__rep_sync(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ DB_LOG *dblp;
+ DB_LSN lsn;
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ LOG *lp;
+ REP *rep;
+ int master, ret;
+ u_int32_t repflags, type;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+
+ COMPQUIET(flags, 0);
+
+ ENV_REQUIRES_CONFIG_XX(
+ env, rep_handle, "DB_ENV->rep_sync", DB_INIT_REP);
+
+ /* We need a transport function because we send messages. */
+ if (db_rep->send == NULL) {
+ __db_errx(env, DB_STR("3579",
+ "DB_ENV->rep_sync: must be called after DB_ENV->rep_set_transport"));
+ return (EINVAL);
+ }
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ rep = db_rep->region;
+ ret = 0;
+
+ ENV_ENTER(env, ip);
+
+ /*
+ * Simple cases. If we're not in the DELAY state we have nothing
+ * to do. If we don't know who the master is, send a MASTER_REQ.
+ */
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ lsn = lp->verify_lsn;
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ REP_SYSTEM_LOCK(env);
+ master = rep->master_id;
+ if (master == DB_EID_INVALID) {
+ REP_SYSTEM_UNLOCK(env);
+ (void)__rep_send_message(env, DB_EID_BROADCAST,
+ REP_MASTER_REQ, NULL, NULL, 0, 0);
+ goto out;
+ }
+ /*
+ * We want to hold the rep mutex to test and then clear the
+ * DELAY flag. Racing threads in here could otherwise result
+ * in dual data streams.
+ */
+ if (!F_ISSET(rep, REP_F_DELAY)) {
+ REP_SYSTEM_UNLOCK(env);
+ goto out;
+ }
+
+ DB_ASSERT(env,
+ !IS_USING_LEASES(env) || __rep_islease_granted(env) == 0);
+
+ /*
+ * If we get here, we clear the delay flag and kick off a
+ * synchronization. From this point forward, we will
+ * synchronize until the next time the master changes.
+ */
+ F_CLR(rep, REP_F_DELAY);
+ if (IS_ZERO_LSN(lsn) && !FLD_ISSET(rep->config, REP_C_AUTOINIT)) {
+ FLD_CLR(rep->lockout_flags, REP_LOCKOUT_ARCHIVE);
+ CLR_RECOVERY_SETTINGS(rep);
+ ret = DB_REP_JOIN_FAILURE;
+ REP_SYSTEM_UNLOCK(env);
+ goto out;
+ }
+ REP_SYSTEM_UNLOCK(env);
+ /*
+	 * When we set REP_F_DELAY, we set verify_lsn to the real verify LSN
+	 * if we need to verify, or we zero it out if this is a client that
+	 * needs internal init.  So, now send the type of message whose
+	 * sending __rep_new_master delayed.
+ */
+ if (IS_ZERO_LSN(lsn)) {
+ DB_ASSERT(env, rep->sync_state == SYNC_UPDATE);
+ type = REP_UPDATE_REQ;
+ repflags = 0;
+ } else {
+ DB_ASSERT(env, rep->sync_state == SYNC_VERIFY);
+ type = REP_VERIFY_REQ;
+ repflags = DB_REP_ANYWHERE;
+ }
+ (void)__rep_send_message(env, master, type, &lsn, NULL, 0, repflags);
+
+out: ENV_LEAVE(env, ip);
+ return (ret);
+}
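+
+/*
+ * Illustrative usage (a sketch): the DELAYCLIENT half referred to
+ * above.  A client configured with
+ *
+ *	(void)dbenv->rep_set_config(dbenv, DB_REP_CONF_DELAYCLIENT, 1);
+ *
+ * does not synchronize with a new master until the application decides
+ * it is ready and calls
+ *
+ *	if ((ret = dbenv->rep_sync(dbenv, 0)) != 0)
+ *		goto err;
+ */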
+
+/*
+ * PUBLIC: int __rep_txn_applied __P((ENV *,
+ * PUBLIC: DB_THREAD_INFO *, DB_COMMIT_INFO *, db_timeout_t));
+ */
+int
+__rep_txn_applied(env, ip, commit_info, timeout)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ DB_COMMIT_INFO *commit_info;
+ db_timeout_t timeout;
+{
+ REP *rep;
+ db_timespec limit, now, t;
+ db_timeout_t duration;
+ struct rep_waitgoal reason;
+ int locked, ret, t_ret;
+
+ if (commit_info->gen == 0) {
+ __db_errx(env, DB_STR("3580",
+ "non-replication commit token in replication env"));
+ return (EINVAL);
+ }
+
+ rep = env->rep_handle->region;
+
+ VPRINT(env, (env, DB_VERB_REP_MISC,
+ "checking txn_applied: gen %lu, envid %lu, LSN [%lu][%lu]",
+ (u_long)commit_info->gen, (u_long)commit_info->envid,
+ (u_long)commit_info->lsn.file, (u_long)commit_info->lsn.offset));
+ locked = 0;
+ __os_gettime(env, &limit, 1);
+ TIMESPEC_ADD_DB_TIMEOUT(&limit, timeout);
+
+retry:
+ /*
+ * The checking is done within the scope of the handle count, but if we
+ * end up having to wait that part is not. If a lockout sequence begins
+ * while we're waiting, it will wake us up, and we'll come back here to
+ * try entering the scope again, at which point we'll get an error so
+ * that we return immediately.
+ */
+ if ((ret = __op_handle_enter(env)) != 0)
+ goto out;
+
+ ret = __rep_check_applied(env, ip, commit_info, &reason);
+ t_ret = __env_db_rep_exit(env);
+
+ /*
+ * Between here and __rep_check_applied() we use DB_TIMEOUT privately to
+ * mean that the transaction hasn't been applied yet, but it still
+	 * plausibly could be soon; think of it as meaning "not yet".  So a
+	 * DB_TIMEOUT here doesn't necessarily mean that DB_TIMEOUT is the
+	 * ultimate return value the application will see.
+ *
+ * When we get this "not yet", we check the actual time remaining. If
+ * the time has expired, then indeed we can simply pass DB_TIMEOUT back
+ * up to the calling application. But if not, it tells us that we have
+ * a chance to wait and try again. This is a nice division of labor,
+ * because it means the lower level functions (__rep_check_applied() and
+ * below) do not have to mess with any actual time computations, or
+ * waiting, at all.
+ */
+ if (ret == DB_TIMEOUT && t_ret == 0 && F_ISSET(rep, REP_F_CLIENT)) {
+ __os_gettime(env, &now, 1);
+ if (timespeccmp(&now, &limit, <)) {
+
+ /* Compute how much time remains before the limit. */
+ t = limit;
+ timespecsub(&t, &now);
+ DB_TIMESPEC_TO_TIMEOUT(duration, &t, 1);
+
+ /*
+ * Wait for whatever __rep_check_applied told us we
+ * needed to wait for. But first, check the condition
+ * again under mutex protection, in case there was a
+ * close race.
+ */
+ if (reason.why == AWAIT_LSN ||
+ reason.why == AWAIT_HISTORY) {
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ locked = 1;
+ }
+ REP_SYSTEM_LOCK(env);
+ ret = __rep_check_goal(env, &reason);
+ if (locked) {
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ locked = 0;
+ }
+ if (ret == DB_TIMEOUT) {
+ /*
+ * The usual case: we haven't reached our goal
+ * yet, even after checking again while holding
+ * mutex.
+ */
+ ret = __rep_await_condition(env,
+ &reason, duration);
+
+ /*
+ * If it were possible for
+ * __rep_await_condition() to return DB_TIMEOUT
+ * that would confuse the outer "if" statement
+ * here.
+ */
+ DB_ASSERT(env, ret != DB_TIMEOUT);
+ }
+ REP_SYSTEM_UNLOCK(env);
+ if (ret != 0)
+ goto out;
+
+ /*
+ * Note that the "reason" that check_applied set, and
+ * that await_condition waited for, does not necessarily
+ * represent a final result ready to return to the
+ * user. In some cases there may be a few state changes
+ * necessary before we are able to determine the final
+ * result. Thus whenever we complete a successful wait
+ * we need to cycle back and check the full txn_applied
+ * question again.
+ */
+ goto retry;
+ }
+ }
+
+ if (t_ret != 0 &&
+ (ret == 0 || ret == DB_TIMEOUT || ret == DB_NOTFOUND))
+ ret = t_ret;
+
+out:
+ return (ret);
+}
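+
+/*
+ * Illustrative usage (a sketch, assuming this release's commit-token
+ * API): a writer saves a token at commit time, and a reader later asks
+ * whether that transaction has been applied locally, waiting up to
+ * five seconds:
+ *
+ *	DB_TXN_TOKEN token;
+ *	...
+ *	if ((ret = txn->set_commit_token(txn, &token)) != 0)
+ *		goto err;
+ *	if ((ret = txn->commit(txn, 0)) != 0)
+ *		goto err;
+ *	...
+ *	ret = dbenv->txn_applied(dbenv, &token, 5000000, 0);
+ */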
+
+/*
+ * The only non-zero return code from this function is for unexpected errors.
+ * We normally return 0, regardless of whether the wait terminated because the
+ * condition was satisfied or the timeout expired.
+ */
+static int
+__rep_await_condition(env, reasonp, duration)
+ ENV *env;
+ struct rep_waitgoal *reasonp;
+ db_timeout_t duration;
+{
+ REGENV *renv;
+ REGINFO *infop;
+ REP *rep;
+ struct __rep_waiter *waiter;
+ int ret;
+
+ rep = env->rep_handle->region;
+ infop = env->reginfo;
+ renv = infop->primary;
+
+ /*
+ * Acquire the first lock on the self-blocking mutex when we first
+ * allocate it. Thereafter when it's on the free list we know that
+ * first lock has already been taken.
+ */
+ if ((waiter = SH_TAILQ_FIRST(&rep->free_waiters,
+ __rep_waiter)) == NULL) {
+ MUTEX_LOCK(env, renv->mtx_regenv);
+ if ((ret = __env_alloc(env->reginfo,
+ sizeof(struct __rep_waiter), &waiter)) == 0) {
+ memset(waiter, 0, sizeof(*waiter));
+ if ((ret = __mutex_alloc(env, MTX_REP_WAITER,
+ DB_MUTEX_SELF_BLOCK, &waiter->mtx_repwait)) != 0)
+ __env_alloc_free(infop, waiter);
+ }
+ MUTEX_UNLOCK(env, renv->mtx_regenv);
+ if (ret != 0)
+ return (ret);
+
+ MUTEX_LOCK(env, waiter->mtx_repwait);
+ } else
+ SH_TAILQ_REMOVE(&rep->free_waiters,
+ waiter, links, __rep_waiter);
+ waiter->flags = 0;
+ waiter->goal = *reasonp;
+ SH_TAILQ_INSERT_HEAD(&rep->waiters,
+ waiter, links, __rep_waiter);
+
+ VPRINT(env, (env, DB_VERB_REP_MISC,
+ "waiting for condition %d", (int)reasonp->why));
+ REP_SYSTEM_UNLOCK(env);
+ /* Wait here for conditions to become more favorable. */
+ MUTEX_WAIT(env, waiter->mtx_repwait, duration);
+ REP_SYSTEM_LOCK(env);
+
+ if (!F_ISSET(waiter, REP_F_WOKEN))
+ SH_TAILQ_REMOVE(&rep->waiters, waiter, links, __rep_waiter);
+ SH_TAILQ_INSERT_HEAD(&rep->free_waiters, waiter, links, __rep_waiter);
+
+ return (0);
+}
+
+/*
+ * Check whether the transaction is currently applied. If it is not, but it
+ * might likely become applied in the future, then return DB_TIMEOUT. It's the
+ * caller's duty to figure out whether to wait or not in that case. Here we
+ * only do an immediate check of the current state of affairs.
+ */
+static int
+__rep_check_applied(env, ip, commit_info, reasonp)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ DB_COMMIT_INFO *commit_info;
+ struct rep_waitgoal *reasonp;
+{
+ DB_LOG *dblp;
+ DB_REP *db_rep;
+ LOG *lp;
+ REP *rep;
+ DB_TXN *txn;
+ DBC *dbc;
+ __rep_lsn_hist_data_args hist, hist2;
+ DB_LSN lsn;
+ u_int32_t gen;
+ int ret, t_ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ gen = rep->gen;
+ txn = NULL;
+ dbc = NULL;
+
+ if (F_ISSET(rep, REP_F_MASTER)) {
+ LOG_SYSTEM_LOCK(env);
+ lsn = lp->lsn;
+ LOG_SYSTEM_UNLOCK(env);
+ } else {
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ lsn = lp->max_perm_lsn;
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ }
+
+ /*
+ * The first thing to consider is whether we're in the right gen.
+ * The token gen either matches our current gen, or is left over from an
+ * older gen, or in rare circumstances could be from a "future" gen that
+ * we haven't learned about yet (or that got rolled back).
+ */
+ if (commit_info->gen == gen) {
+ ret = __rep_read_lsn_history(env,
+ ip, &txn, &dbc, gen, &hist, reasonp, DB_SET);
+ if (ret == DB_NOTFOUND) {
+ /*
+ * We haven't yet received the LSN history of the
+ * current generation from the master. Return
+			 * DB_TIMEOUT to tell the caller it needs to wait,
+			 * and that what it is waiting for is the LSN history.
+ *
+ * Note that this also helps by eliminating the weird
+ * period between receiving a new gen (from a NEWMASTER)
+ * and the subsequent syncing with that new gen. We
+ * really only want to return success at the current gen
+ * once we've synced.
+ */
+ ret = DB_TIMEOUT;
+ reasonp->why = AWAIT_HISTORY;
+ reasonp->u.lsn = lsn;
+ }
+ if (ret != 0)
+ goto out;
+
+ if (commit_info->envid != hist.envid) {
+ /*
+ * Gens match, but envids don't: means there were two
+ * masters at the same gen, and the txn of interest was
+ * rolled back.
+ */
+ ret = DB_NOTFOUND;
+ goto out;
+ }
+
+ if (LOG_COMPARE(&commit_info->lsn, &lsn) > 0) {
+ /*
+ * We haven't yet gotten the LSN of interest, but we can
+ * expect it soon; so wait for it.
+ */
+ ret = DB_TIMEOUT;
+ reasonp->why = AWAIT_LSN;
+ reasonp->u.lsn = commit_info->lsn;
+ goto out;
+ }
+
+ if (LOG_COMPARE(&commit_info->lsn, &hist.lsn) >= 0) {
+ /*
+ * The LSN of interest is in the past, but within the
+ * range claimed for this gen. Success! (We have read
+ * consistency.)
+ */
+ ret = 0;
+ goto out;
+ }
+
+ /*
+ * There must have been a DUPMASTER at some point: the
+ * description of the txn of interest doesn't match what we see
+ * in the history available to us now.
+ */
+ ret = DB_NOTFOUND;
+
+ } else if (commit_info->gen < gen || gen == 0) {
+ /*
+ * Transaction from an old gen. Read this gen's base LSN, plus
+ * that of the next higher gen, because we want to check that
+ * the token LSN is within the close/open range defined by
+		 * the token LSN is within the closed/open range defined by
+ */
+ ret = __rep_read_lsn_history(env,
+ ip, &txn, &dbc, commit_info->gen, &hist, reasonp, DB_SET);
+ t_ret = __rep_read_lsn_history(env,
+ ip, &txn, &dbc, commit_info->gen, &hist2, reasonp, DB_NEXT);
+ if (ret == DB_NOTFOUND) {
+ /*
+ * If the desired gen is not in our database, it could
+ * mean either of two things. 1. The whole gen could
+ * have been rolled back. 2. We could just be really
+ * far behind on replication. Reading ahead to the next
+ * following gen, which we likely need anyway, helps us
+ * decide which case to conclude.
+ */
+ if (t_ret == 0)
+ /*
+ * Second read succeeded, so "being behind in
+ * replication" is not a viable reason for
+ * having failed to find the first read.
+ * Therefore, the gen must have been rolled
+ * back, and the proper result is NOTFOUND to
+ * indicate that.
+ */
+ goto out;
+ if (t_ret == DB_NOTFOUND) {
+ /*
+ * Second read also got a NOTFOUND: we're
+ * definitely "behind" (we don't even have
+ * current gen's history). So, waiting is the
+ * correct result.
+ */
+ ret = DB_TIMEOUT;
+ reasonp->why = AWAIT_HISTORY;
+ reasonp->u.lsn = lsn;
+ goto out;
+ }
+ /*
+ * Here, t_ret is something unexpected, which trumps the
+ * NOTFOUND returned from the first read.
+ */
+ ret = t_ret;
+ goto out;
+ }
+ if (ret != 0)
+ goto out; /* Unexpected error, first read. */
+ if (commit_info->envid != hist.envid) {
+ /*
+ * (We don't need the second read in order to make this
+ * test.)
+ *
+ * We have info for the indicated gen, but the envids
+ * don't match, meaning the txn was written at a dup
+ * master and that gen instance was rolled back.
+ */
+ ret = DB_NOTFOUND;
+ goto out;
+ }
+
+ /* Examine result of second read. */
+ if ((ret = t_ret) == DB_NOTFOUND) {
+ /*
+ * We haven't even heard about our current gen yet, so
+ * it's worth waiting for it.
+ */
+ ret = DB_TIMEOUT;
+ reasonp->why = AWAIT_HISTORY;
+ reasonp->u.lsn = lsn;
+ } else if (ret != 0)
+ goto out; /* Second read returned unexpected error. */
+
+ /*
+ * We now have the history info for the gen of the txn, and for
+ * the subsequent gen. All we have to do is see if the LSN is
+ * in range.
+ */
+ if (LOG_COMPARE(&commit_info->lsn, &hist.lsn) >= 0 &&
+ LOG_COMPARE(&commit_info->lsn, &hist2.lsn) < 0)
+ ret = 0;
+ else
+ ret = DB_NOTFOUND;
+ } else {
+ /*
+ * Token names a future gen. If we're a client and the LSN also
+ * is in the future, then it's possible we just haven't caught
+ * up yet, so we can wait for it. Otherwise, it must have been
+ * part of a generation that got lost in a roll-back.
+ */
+ if (F_ISSET(rep, REP_F_CLIENT) &&
+ LOG_COMPARE(&commit_info->lsn, &lsn) > 0) {
+ reasonp->why = AWAIT_GEN;
+ reasonp->u.gen = commit_info->gen;
+ return (DB_TIMEOUT);
+ }
+ return (DB_NOTFOUND);
+ }
+
+out:
+ if (dbc != NULL &&
+ (t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (txn != NULL &&
+ (t_ret = __db_txn_auto_resolve(env, txn, 1, ret)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
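+
+/*
+ * A concrete reading of the old-gen case above (the numbers are
+ * invented): if the token is {gen 3, envid E, LSN [8][1000]}, the txn
+ * counts as applied iff hist(3).envid == E and
+ * hist(3).lsn <= [8][1000] < hist(4).lsn.
+ */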
+
+/*
+ * The txn and dbc handles are owned by caller, though we create them if
+ * necessary. Caller is responsible for closing them.
+ */
+static int
+__rep_read_lsn_history(env, ip, txn, dbc, gen, gen_infop, reasonp, flags)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ DB_TXN **txn;
+ DBC **dbc;
+ u_int32_t gen;
+ __rep_lsn_hist_data_args *gen_infop;
+ struct rep_waitgoal *reasonp;
+ u_int32_t flags;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ DB *dbp;
+ __rep_lsn_hist_key_args key;
+ u_int8_t key_buf[__REP_LSN_HIST_KEY_SIZE];
+ u_int8_t data_buf[__REP_LSN_HIST_DATA_SIZE];
+ DBT key_dbt, data_dbt;
+ u_int32_t desired_gen;
+ int ret, tries;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ ret = 0;
+
+ DB_ASSERT(env, flags == DB_SET || flags == DB_NEXT);
+
+ /* Simply return cached info, if we already have it. */
+ desired_gen = flags == DB_SET ? gen : gen + 1;
+ REP_SYSTEM_LOCK(env);
+ if (rep->gen == desired_gen && !IS_ZERO_LSN(rep->gen_base_lsn)) {
+ gen_infop->lsn = rep->gen_base_lsn;
+ gen_infop->envid = rep->master_envid;
+ goto unlock;
+ }
+ REP_SYSTEM_UNLOCK(env);
+
+ tries = 0;
+retry:
+ if (*txn == NULL &&
+ (ret = __txn_begin(env, ip, NULL, txn, 0)) != 0)
+ return (ret);
+
+ if ((dbp = db_rep->lsn_db) == NULL) {
+ if ((ret = __rep_open_sysdb(env,
+ ip, *txn, REPLSNHIST, 0, &dbp)) != 0) {
+ /*
+ * If the database isn't there, it could be because it's
+ * memory-resident, and we haven't yet sync'ed with the
+ * master to materialize it. (It could make sense to
+ * include a test for INMEM in this conditional
+ * expression, if we were sure all sites had matching
+ * INMEM settings; but since we don't enforce that,
+ * leaving it out makes for more optimistic behavior.)
+ */
+ if (ret == ENOENT &&
+ !F_ISSET(rep, REP_F_NIMDBS_LOADED | REP_F_MASTER)) {
+ ret = DB_TIMEOUT;
+ reasonp->why = AWAIT_NIMDB;
+ }
+ goto err;
+ }
+ db_rep->lsn_db = dbp;
+ }
+
+ if (*dbc == NULL &&
+ (ret = __db_cursor(dbp, ip, *txn, dbc, 0)) != 0)
+ goto err;
+
+ if (flags == DB_SET) {
+ key.version = REP_LSN_HISTORY_FMT_VERSION;
+ key.gen = gen;
+ __rep_lsn_hist_key_marshal(env, &key, key_buf);
+ }
+ DB_INIT_DBT(key_dbt, key_buf, __REP_LSN_HIST_KEY_SIZE);
+ key_dbt.ulen = __REP_LSN_HIST_KEY_SIZE;
+ F_SET(&key_dbt, DB_DBT_USERMEM);
+
+ memset(&data_dbt, 0, sizeof(data_dbt));
+ data_dbt.data = data_buf;
+ data_dbt.ulen = __REP_LSN_HIST_DATA_SIZE;
+ F_SET(&data_dbt, DB_DBT_USERMEM);
+ if ((ret = __dbc_get(*dbc, &key_dbt, &data_dbt, flags)) != 0) {
+ if ((ret == DB_LOCK_DEADLOCK || ret == DB_LOCK_NOTGRANTED) &&
+ ++tries < 5) { /* Limit of 5 is an arbitrary choice. */
+ ret = __dbc_close(*dbc);
+ *dbc = NULL;
+ if (ret != 0)
+ goto err;
+ ret = __txn_abort(*txn);
+ *txn = NULL;
+ if (ret != 0)
+ goto err;
+ __os_yield(env, 0, 10000); /* Arbitrary duration. */
+ goto retry;
+ }
+ goto err;
+ }
+
+ /*
+ * In the DB_NEXT case, we don't know what the next gen is. Unmarshal
+ * the key too, just so that we can check whether it matches the current
+ * gen, for setting the cache. Note that, interestingly, the caller
+ * doesn't care what the key is in that case!
+ */
+ if ((ret = __rep_lsn_hist_key_unmarshal(env,
+ &key, key_buf, __REP_LSN_HIST_KEY_SIZE, NULL)) != 0)
+ goto err;
+ ret = __rep_lsn_hist_data_unmarshal(env,
+ gen_infop, data_buf, __REP_LSN_HIST_DATA_SIZE, NULL);
+
+ REP_SYSTEM_LOCK(env);
+ if (rep->gen == key.gen) {
+ rep->gen_base_lsn = gen_infop->lsn;
+ rep->master_envid = gen_infop->envid;
+ }
+unlock:
+ REP_SYSTEM_UNLOCK(env);
+
+err:
+ return (ret);
+}
+
+/*
+ * __rep_conv_vers --
+ * Convert from a log version to the replication message version
+ * that release used.
+ */
+static u_int32_t
+__rep_conv_vers(env, log_ver)
+ ENV *env;
+ u_int32_t log_ver;
+{
+ COMPQUIET(env, NULL);
+
+ /*
+	 * We can't use a switch statement because some of the
+	 * DB_LOGVERSION_XX constants have the same value.
+ */
+ if (log_ver == DB_LOGVERSION_53)
+ return (DB_REPVERSION_53);
+ if (log_ver == DB_LOGVERSION_52)
+ return (DB_REPVERSION_52);
+ /* 5.0 and 5.1 had identical log and rep versions. */
+ if (log_ver == DB_LOGVERSION_51)
+ return (DB_REPVERSION_51);
+ if (log_ver == DB_LOGVERSION_48p2)
+ return (DB_REPVERSION_48);
+ if (log_ver == DB_LOGVERSION_48)
+ return (DB_REPVERSION_48);
+ if (log_ver == DB_LOGVERSION_47)
+ return (DB_REPVERSION_47);
+ if (log_ver == DB_LOGVERSION_46)
+ return (DB_REPVERSION_46);
+ if (log_ver == DB_LOGVERSION_45)
+ return (DB_REPVERSION_45);
+ if (log_ver == DB_LOGVERSION_44)
+ return (DB_REPVERSION_44);
+ if (log_ver == DB_LOGVERSION)
+ return (DB_REPVERSION);
+ return (DB_REPVERSION_INVALID);
+}
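+
+/*
+ * For example, both DB_LOGVERSION_48 and DB_LOGVERSION_48p2 map to
+ * DB_REPVERSION_48 above.  A switch on log_ver would need duplicate
+ * case labels for the DB_LOGVERSION_XX constants that share a value,
+ * which would not compile; hence the if-chain.
+ */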
diff --git a/src/rep/rep_record.c b/src/rep/rep_record.c
new file mode 100644
index 00000000..f4691974
--- /dev/null
+++ b/src/rep/rep_record.c
@@ -0,0 +1,2586 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+static int __rep_collect_txn __P((ENV *, DB_LSN *, LSN_COLLECTION *));
+static int __rep_do_ckp __P((ENV *, DBT *, __rep_control_args *));
+static int __rep_fire_newmaster __P((ENV *, u_int32_t, int));
+static int __rep_fire_startupdone __P((ENV *, u_int32_t, int));
+static int __rep_getnext __P((ENV *, DB_THREAD_INFO *));
+static int __rep_lsn_cmp __P((const void *, const void *));
+static int __rep_newfile __P((ENV *, __rep_control_args *, DBT *));
+static int __rep_process_rec __P((ENV *, DB_THREAD_INFO *, __rep_control_args *,
+ DBT *, db_timespec *, DB_LSN *));
+static int __rep_remfirst __P((ENV *, DB_THREAD_INFO *, DBT *, DBT *));
+static int __rep_skip_msg __P((ENV *, REP *, int, u_int32_t));
+
+/* Used to consistently designate which messages ought to be received where. */
+
+#define MASTER_ONLY(rep, rp) do { \
+ if (!F_ISSET(rep, REP_F_MASTER)) { \
+ RPRINT(env, (env, DB_VERB_REP_MSGS, \
+ "Master record received on client")); \
+ REP_PRINT_MESSAGE(env, \
+ eid, rp, "rep_process_message", 0); \
+ /* Just skip/ignore it. */ \
+ ret = 0; \
+ goto errlock; \
+ } \
+} while (0)
+
+#define CLIENT_ONLY(rep, rp) do { \
+ if (!F_ISSET(rep, REP_F_CLIENT)) { \
+ RPRINT(env, (env, DB_VERB_REP_MSGS, \
+ "Client record received on master")); \
+ /* \
+ * Only broadcast DUPMASTER if leases are not \
+ * in effect. If I am an old master, using \
+ * leases and I get a newer message, my leases \
+ * had better all be expired. \
+ */ \
+ if (IS_USING_LEASES(env)) \
+ DB_ASSERT(env, \
+ __rep_lease_check(env, 0) == \
+ DB_REP_LEASE_EXPIRED); \
+ else { \
+ REP_PRINT_MESSAGE(env, \
+ eid, rp, "rep_process_message", 0); \
+ (void)__rep_send_message(env, DB_EID_BROADCAST, \
+ REP_DUPMASTER, NULL, NULL, 0, 0); \
+ } \
+ ret = DB_REP_DUPMASTER; \
+ goto errlock; \
+ } \
+} while (0)
+
+/*
+ * If a client is attempting to service a request and its gen is not in
+ * sync with its database state, it cannot service the request. Currently
+ * the only way to know this is with the heavy hammer of knowing (or not)
+ * who the master is. If the master is invalid, force a rerequest.
+ * If we receive an ALIVE, we update the gen and invalidate the
+ * master_id.
+ */
+#define CLIENT_MASTERCHK do { \
+ if (F_ISSET(rep, REP_F_CLIENT)) { \
+ if (master_id == DB_EID_INVALID) { \
+ STAT(rep->stat.st_client_svc_miss++); \
+ ret = __rep_skip_msg(env, rep, eid, rp->rectype);\
+ goto errlock; \
+ } \
+ } \
+} while (0)
+
+/*
+ * If a client is attempting to service a request it does not have,
+ * call __rep_skip_msg to skip this message and force a rerequest to the
+ * sender. We don't hold the mutex for the stats and may miscount.
+ */
+#define CLIENT_REREQ do { \
+ if (F_ISSET(rep, REP_F_CLIENT)) { \
+ STAT(rep->stat.st_client_svc_req++); \
+ if (ret == DB_NOTFOUND) { \
+ STAT(rep->stat.st_client_svc_miss++); \
+ ret = __rep_skip_msg(env, rep, eid, rp->rectype);\
+ } \
+ } \
+} while (0)
+
+#define RECOVERING_SKIP do { \
+ if (IS_REP_CLIENT(env) && recovering) { \
+ /* Not holding region mutex, may miscount */ \
+ STAT(rep->stat.st_msgs_recover++); \
+ ret = __rep_skip_msg(env, rep, eid, rp->rectype); \
+ goto errlock; \
+ } \
+} while (0)
+
+/*
+ * If we're recovering the log we only want log records that are in the
+ * range we need to recover. Otherwise we can end up storing a huge
+ * number of "new" records, only to truncate the temp database later after
+ * we run recovery. If we are actively delaying a sync-up, we also skip
+ * all incoming log records until the application requests sync-up.
+ */
+#define RECOVERING_LOG_SKIP do { \
+ if (F_ISSET(rep, REP_F_DELAY) || \
+ rep->master_id == DB_EID_INVALID || \
+ (recovering && \
+ (rep->sync_state != SYNC_LOG || \
+ LOG_COMPARE(&rp->lsn, &rep->last_lsn) >= 0))) { \
+ /* Not holding region mutex, may miscount */ \
+ STAT(rep->stat.st_msgs_recover++); \
+ ret = __rep_skip_msg(env, rep, eid, rp->rectype); \
+ goto errlock; \
+ } \
+} while (0)
+
+#define ANYSITE(rep)
+
+/*
+ * __rep_process_message_pp --
+ *
+ * This routine takes an incoming message and processes it.
+ *
+ * control: contains the control fields from the record
+ * rec: contains the actual record
+ * eid: the environment id of the sender of the message
+ * ret_lsnp: on DB_REP_ISPERM and DB_REP_NOTPERM returns, contains the
+ *	LSN of the maximum permanent or the current not-yet-permanent log
+ *	record, respectively
+ *
+ * PUBLIC: int __rep_process_message_pp
+ * PUBLIC: __P((DB_ENV *, DBT *, DBT *, int, DB_LSN *));
+ */
+int
+__rep_process_message_pp(dbenv, control, rec, eid, ret_lsnp)
+ DB_ENV *dbenv;
+ DBT *control, *rec;
+ int eid;
+ DB_LSN *ret_lsnp;
+{
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+ ret = 0;
+
+ ENV_REQUIRES_CONFIG_XX(
+ env, rep_handle, "DB_ENV->rep_process_message", DB_INIT_REP);
+
+ if (APP_IS_REPMGR(env)) {
+ __db_errx(env, DB_STR_A("3512",
+ "%s cannot call from Replication Manager application",
+ "%s"), "DB_ENV->rep_process_message:");
+ return (EINVAL);
+ }
+
+	/* Control argument must be non-NULL and non-empty. */
+ if (control == NULL || control->size == 0) {
+ __db_errx(env, DB_STR("3513",
+ "DB_ENV->rep_process_message: control argument must be specified"));
+ return (EINVAL);
+ }
+
+ /*
+ * Make sure site is a master or a client, which implies that
+ * replication has been started.
+ */
+ if (!IS_REP_MASTER(env) && !IS_REP_CLIENT(env)) {
+ __db_errx(env, DB_STR("3514",
+ "Environment not configured as replication master or client"));
+ return (EINVAL);
+ }
+
+ if ((ret = __dbt_usercopy(env, control)) != 0 ||
+ (ret = __dbt_usercopy(env, rec)) != 0) {
+ __dbt_userfree(env, control, rec, NULL);
+ __db_errx(env, DB_STR("3515",
+ "DB_ENV->rep_process_message: error retrieving DBT contents"));
+ return (ret);
+ }
+
+ ret = __rep_process_message_int(env, control, rec, eid, ret_lsnp);
+
+ __dbt_userfree(env, control, rec, NULL);
+ return (ret);
+}
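+
+/*
+ * Illustrative usage (a sketch; the dispatch loop and helper names are
+ * hypothetical): a base API application feeds every received message
+ * to this function and acts on the special return codes:
+ *
+ *	switch (ret = dbenv->rep_process_message(dbenv,
+ *	    &control, &rec, eid, &permlsn)) {
+ *	case 0:
+ *	case DB_REP_ISPERM:
+ *	case DB_REP_NOTPERM:
+ *		break;
+ *	case DB_REP_NEWSITE:
+ *		add_site(eid);
+ *		break;
+ *	case DB_REP_HOLDELECTION:
+ *		call_election();
+ *		break;
+ *	default:
+ *		goto err;
+ *	}
+ *
+ * where add_site and call_election stand in for application code, and
+ * permlsn is meaningful on the ISPERM and NOTPERM returns.
+ */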
+
+/*
+ * __rep_process_message_int --
+ *
+ * This routine performs the internal steps to process an incoming message.
+ *
+ * PUBLIC: int __rep_process_message_int
+ * PUBLIC: __P((ENV *, DBT *, DBT *, int, DB_LSN *));
+ */
+int
+__rep_process_message_int(env, control, rec, eid, ret_lsnp)
+ ENV *env;
+ DBT *control, *rec;
+ int eid;
+ DB_LSN *ret_lsnp;
+{
+ DBT data_dbt;
+ DB_LOG *dblp;
+ DB_LSN last_lsn, lsn;
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ LOG *lp;
+ REGENV *renv;
+ REGINFO *infop;
+ REP *rep;
+ REP_46_CONTROL *rp46;
+ REP_OLD_CONTROL *orp;
+ __rep_control_args *rp, tmprp;
+ __rep_egen_args egen_arg;
+ size_t len;
+ u_int32_t gen, rep_version;
+ int cmp, do_sync, lockout, master_id, recovering, ret, t_ret;
+ time_t savetime;
+ u_int8_t buf[__REP_MAXMSG_SIZE];
+
+ ret = 0;
+ do_sync = 0;
+ lockout = 0;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ infop = env->reginfo;
+ renv = infop->primary;
+ /*
+ * Casting this to REP_OLD_CONTROL is just kind of stylistic: the
+ * rep_version field of course has to be in the same offset in all
+ * versions in order for this to work.
+ *
+ * We can look at the rep_version unswapped here because if we're
+ * talking to an old version, it will always be unswapped. If
+ * we're talking to a new version, the only issue is if it is
+ * swapped and we take one of the old version conditionals
+ * incorrectly. The rep_version would need to be very, very
+ * large for a swapped version to look like a small, older
+ * version. There is no problem here looking at it unswapped.
+ */
+ rep_version = ((REP_OLD_CONTROL *)control->data)->rep_version;
+ if (rep_version <= DB_REPVERSION_45) {
+ orp = (REP_OLD_CONTROL *)control->data;
+ if (rep_version == DB_REPVERSION_45 &&
+ F_ISSET(orp, REPCTL_INIT_45)) {
+ F_CLR(orp, REPCTL_INIT_45);
+ F_SET(orp, REPCTL_INIT);
+ }
+ tmprp.rep_version = orp->rep_version;
+ tmprp.log_version = orp->log_version;
+ tmprp.lsn = orp->lsn;
+ tmprp.rectype = orp->rectype;
+ tmprp.gen = orp->gen;
+ tmprp.flags = orp->flags;
+ tmprp.msg_sec = 0;
+ tmprp.msg_nsec = 0;
+ } else if (rep_version == DB_REPVERSION_46) {
+ rp46 = (REP_46_CONTROL *)control->data;
+ tmprp.rep_version = rp46->rep_version;
+ tmprp.log_version = rp46->log_version;
+ tmprp.lsn = rp46->lsn;
+ tmprp.rectype = rp46->rectype;
+ tmprp.gen = rp46->gen;
+ tmprp.flags = rp46->flags;
+ tmprp.msg_sec = (u_int32_t)rp46->msg_time.tv_sec;
+ tmprp.msg_nsec = (u_int32_t)rp46->msg_time.tv_nsec;
+ } else
+ if ((ret = __rep_control_unmarshal(env, &tmprp,
+ control->data, control->size, NULL)) != 0)
+ return (ret);
+ rp = &tmprp;
+ if (ret_lsnp != NULL)
+ ZERO_LSN(*ret_lsnp);
+
+ ENV_ENTER(env, ip);
+
+ REP_PRINT_MESSAGE(env, eid, rp, "rep_process_message", 0);
+ /*
+ * Check the version number for both rep and log. If it is
+ * an old version we support, convert it. Otherwise complain.
+ */
+ if (rp->rep_version < DB_REPVERSION) {
+ if (rp->rep_version < DB_REPVERSION_MIN) {
+ __db_errx(env, DB_STR_A("3516",
+ "unsupported old replication message version %lu, minimum version %d",
+ "%lu %d"), (u_long)rp->rep_version,
+ DB_REPVERSION_MIN);
+
+ ret = EINVAL;
+ goto errlock;
+ }
+ VPRINT(env, (env, DB_VERB_REP_MSGS,
+ "Received record %lu with old rep version %lu",
+ (u_long)rp->rectype, (u_long)rp->rep_version));
+ rp->rectype = __rep_msg_from_old(rp->rep_version, rp->rectype);
+ DB_ASSERT(env, rp->rectype != REP_INVALID);
+ /*
+ * We should have a valid new record type for all the old
+ * versions.
+ */
+ VPRINT(env, (env, DB_VERB_REP_MSGS,
+ "Converted to record %lu with old rep version %lu",
+ (u_long)rp->rectype, (u_long)rp->rep_version));
+ } else if (rp->rep_version > DB_REPVERSION) {
+ __db_errx(env, DB_STR_A("3517",
+ "unexpected replication message version %lu, expected %d",
+ "%lu %d"), (u_long)rp->rep_version, DB_REPVERSION);
+ ret = EINVAL;
+ goto errlock;
+ }
+
+ if (rp->log_version < DB_LOGVERSION) {
+ if (rp->log_version < DB_LOGVERSION_MIN) {
+ __db_errx(env, DB_STR_A("3518",
+ "unsupported old replication log version %lu, minimum version %d",
+ "%lu %d"), (u_long)rp->log_version,
+ DB_LOGVERSION_MIN);
+ ret = EINVAL;
+ goto errlock;
+ }
+ VPRINT(env, (env, DB_VERB_REP_MSGS,
+ "Received record %lu with old log version %lu",
+ (u_long)rp->rectype, (u_long)rp->log_version));
+ } else if (rp->log_version > DB_LOGVERSION) {
+ __db_errx(env, DB_STR_A("3519",
+ "unexpected log record version %lu, expected %d",
+ "%lu %d"), (u_long)rp->log_version, DB_LOGVERSION);
+ ret = EINVAL;
+ goto errlock;
+ }
+
+ /*
+ * Acquire the replication lock.
+ */
+ REP_SYSTEM_LOCK(env);
+ if (FLD_ISSET(rep->lockout_flags, REP_LOCKOUT_MSG)) {
+ /*
+ * If we're racing with a thread in rep_start, then
+ * just ignore the message and return.
+ */
+ RPRINT(env, (env, DB_VERB_REP_MSGS,
+ "Racing replication msg lockout, ignore message."));
+ /*
+ * Although we're ignoring the message, there are a few
+ * we need to pay a bit of attention to anyway. All of
+ * these cases are mutually exclusive.
+ * 1. If it is a PERM message, we don't want to return 0.
+ * 2. If it is a NEWSITE message let the app know so it can
+ * do whatever it needs for connection purposes.
+ * 3. If it is a c2c request, tell the sender we're not
+ * going to handle it.
+ */
+ if (F_ISSET(rp, REPCTL_PERM))
+ ret = DB_REP_IGNORE;
+ REP_SYSTEM_UNLOCK(env);
+ /*
+ * If this is new site information return DB_REP_NEWSITE so
+ * that the user can use whatever information may have been
+ * sent for connections.
+ */
+ if (rp->rectype == REP_NEWSITE)
+ ret = DB_REP_NEWSITE;
+ /*
+ * If another client has sent a c2c request to us, it may be a
+ * long time before it resends the request (due to its dual data
+ * streams avoidance heuristic); let it know we can't serve the
+ * request just now.
+ */
+ if (F_ISSET(rep, REP_F_CLIENT) && REP_MSG_REQ(rp->rectype)) {
+ STAT(rep->stat.st_client_svc_req++);
+ STAT(rep->stat.st_client_svc_miss++);
+ (void)__rep_send_message(env,
+ eid, REP_REREQUEST, NULL, NULL, 0, 0);
+ }
+ goto out;
+ }
+ rep->msg_th++;
+ gen = rep->gen;
+ master_id = rep->master_id;
+ recovering = IS_REP_RECOVERING(rep);
+ savetime = renv->rep_timestamp;
+
+ STAT(rep->stat.st_msgs_processed++);
+ REP_SYSTEM_UNLOCK(env);
+
+ /*
+ * Check for lease configuration matching. Leases must be
+ * configured all or none. If I am a client and I receive a
+ * message requesting a lease, and I'm not using leases, that
+ * is an error.
+ */
+ if (!IS_USING_LEASES(env) &&
+ (F_ISSET(rp, REPCTL_LEASE) || rp->rectype == REP_LEASE_GRANT)) {
+ __db_errx(env, DB_STR("3520",
+ "Inconsistent lease configuration"));
+ RPRINT(env, (env, DB_VERB_REP_MSGS,
+ "Client received lease message and not using leases"));
+ ret = EINVAL;
+ ret = __env_panic(env, ret);
+ goto errlock;
+ }
+
+ /*
+ * Check for generation number matching. Ignore any old messages
+ * except requests that are indicative of a new client that needs
+ * to get in sync.
+ */
+ if (rp->gen < gen && rp->rectype != REP_ALIVE_REQ &&
+ rp->rectype != REP_NEWCLIENT && rp->rectype != REP_MASTER_REQ &&
+ rp->rectype != REP_DUPMASTER && rp->rectype != REP_VOTE1) {
+ /*
+ * We don't hold the rep mutex, and could miscount if we race.
+ */
+ STAT(rep->stat.st_msgs_badgen++);
+ if (F_ISSET(rp, REPCTL_PERM))
+ ret = DB_REP_IGNORE;
+ goto errlock;
+ }
+
+ if (rp->gen > gen) {
+ /*
+ * If I am a master and am out of date with a lower generation
+ * number, I am in bad shape and should downgrade.
+ */
+ if (F_ISSET(rep, REP_F_MASTER)) {
+ STAT(rep->stat.st_dupmasters++);
+ ret = DB_REP_DUPMASTER;
+ /*
+ * Only broadcast DUPMASTER if leases are not
+ * in effect. If I am an old master, using
+ * leases and I get a newer message, my leases
+ * had better all be expired.
+ */
+ if (IS_USING_LEASES(env))
+ DB_ASSERT(env,
+ __rep_lease_check(env, 0) ==
+ DB_REP_LEASE_EXPIRED);
+ else if (rp->rectype != REP_DUPMASTER)
+ (void)__rep_send_message(env,
+ DB_EID_BROADCAST, REP_DUPMASTER,
+ NULL, NULL, 0, 0);
+ goto errlock;
+ }
+
+ /*
+ * I am a client and am out of date. If this is an election,
+ * or a response from the first site I contacted, then I can
+ * accept the generation number and participate in future
+ * elections and communication. Otherwise, I need to hear about
+ * a new master and sync up.
+ */
+ if (rp->rectype == REP_ALIVE ||
+ rp->rectype == REP_VOTE1 || rp->rectype == REP_VOTE2) {
+ REP_SYSTEM_LOCK(env);
+ RPRINT(env, (env, DB_VERB_REP_MSGS,
+ "Updating gen from %lu to %lu",
+ (u_long)gen, (u_long)rp->gen));
+ rep->master_id = DB_EID_INVALID;
+ gen = rp->gen;
+ SET_GEN(gen);
+ /*
+ * Updating of egen will happen when we process the
+ * message below for each message type.
+ */
+ REP_SYSTEM_UNLOCK(env);
+ if (rp->rectype == REP_ALIVE)
+ (void)__rep_send_message(env,
+ DB_EID_BROADCAST, REP_MASTER_REQ, NULL,
+ NULL, 0, 0);
+ } else if (rp->rectype != REP_NEWMASTER) {
+ /*
+ * Ignore this message, retransmit if needed.
+ */
+ if (__rep_check_doreq(env, rep))
+ (void)__rep_send_message(env,
+ DB_EID_BROADCAST, REP_MASTER_REQ,
+ NULL, NULL, 0, 0);
+ goto errlock;
+ }
+ /*
+ * If you get here, then you're a client and either you're
+ * in an election or you have a NEWMASTER or an ALIVE message
+ * whose processing will do the right thing below.
+ */
+ }
+
+ /*
+ * If the sender is part of an established group, so are we now.
+ */
+ if (F_ISSET(rp, REPCTL_GROUP_ESTD)) {
+ REP_SYSTEM_LOCK(env);
+#ifdef DIAGNOSTIC
+ if (!F_ISSET(rep, REP_F_GROUP_ESTD))
+ RPRINT(env, (env, DB_VERB_REP_MSGS,
+ "I am now part of an established group"));
+#endif
+ F_SET(rep, REP_F_GROUP_ESTD);
+ REP_SYSTEM_UNLOCK(env);
+ }
+
+ /*
+ * We need to check if we're in recovery and if we are
+ * then we need to ignore any messages except VERIFY*, VOTE*,
+ * NEW* and ALIVE_REQ, or backup related messages: UPDATE*,
+ * PAGE* and FILE*. We need to also accept LOG messages
+ * if we're copying the log for recovery/backup.
+ */
+ switch (rp->rectype) {
+ case REP_ALIVE:
+ /*
+ * Handle even if we're recovering.
+ */
+ ANYSITE(rep);
+ if (rp->rep_version < DB_REPVERSION_47)
+ egen_arg.egen = *(u_int32_t *)rec->data;
+ else if ((ret = __rep_egen_unmarshal(env, &egen_arg,
+ rec->data, rec->size, NULL)) != 0)
+ return (ret);
+ REP_SYSTEM_LOCK(env);
+ if (egen_arg.egen > rep->egen) {
+ /*
+ * If we're currently working futilely at processing an
+ * obsolete egen, treat it like an egen update, so that
+ * we abort the current rep_elect() call and signal the
+ * application to start a new one.
+ */
+ if (rep->spent_egen == rep->egen)
+ ret = DB_REP_HOLDELECTION;
+
+ RPRINT(env, (env, DB_VERB_REP_MSGS,
+ "Received ALIVE egen of %lu, mine %lu",
+ (u_long)egen_arg.egen, (u_long)rep->egen));
+ __rep_elect_done(env, rep);
+ rep->egen = egen_arg.egen;
+ }
+ REP_SYSTEM_UNLOCK(env);
+ break;
+ case REP_ALIVE_REQ:
+ /*
+ * Handle even if we're recovering.
+ */
+ ANYSITE(rep);
+ LOG_SYSTEM_LOCK(env);
+ lsn = lp->lsn;
+ LOG_SYSTEM_UNLOCK(env);
+#ifdef CONFIG_TEST
+ /*
+ * Send this first, before the ALIVE message because of the
+ * way the test suite and messaging is done sequentially.
+ * In some sequences it is possible to get into a situation
+ * where the test suite cannot get the later NEWMASTER because
+ * we break out of the messaging loop too early.
+ */
+ if (F_ISSET(rep, REP_F_MASTER))
+ (void)__rep_send_message(env,
+ DB_EID_BROADCAST, REP_NEWMASTER, &lsn, NULL, 0, 0);
+#endif
+ REP_SYSTEM_LOCK(env);
+ egen_arg.egen = rep->egen;
+ REP_SYSTEM_UNLOCK(env);
+ if (rep->version < DB_REPVERSION_47)
+ DB_INIT_DBT(data_dbt, &egen_arg.egen,
+ sizeof(egen_arg.egen));
+ else {
+ if ((ret = __rep_egen_marshal(env,
+ &egen_arg, buf, __REP_EGEN_SIZE, &len)) != 0)
+ goto errlock;
+ DB_INIT_DBT(data_dbt, buf, len);
+ }
+ (void)__rep_send_message(env,
+ eid, REP_ALIVE, &lsn, &data_dbt, 0, 0);
+ break;
+ case REP_ALL_REQ:
+ RECOVERING_SKIP;
+ CLIENT_MASTERCHK;
+ ret = __rep_allreq(env, rp, eid);
+ CLIENT_REREQ;
+ break;
+ case REP_BULK_LOG:
+ RECOVERING_LOG_SKIP;
+ CLIENT_ONLY(rep, rp);
+ ret = __rep_bulk_log(env, ip, rp, rec, savetime, ret_lsnp);
+ break;
+ case REP_BULK_PAGE:
+ /*
+ * Handle even if we're recovering.
+ */
+ CLIENT_ONLY(rep, rp);
+ ret = __rep_bulk_page(env, ip, eid, rp, rec);
+ break;
+ case REP_DUPMASTER:
+ /*
+ * Handle even if we're recovering.
+ */
+ if (F_ISSET(rep, REP_F_MASTER))
+ ret = DB_REP_DUPMASTER;
+ break;
+#ifdef NOTYET
+ case REP_FILE: /* TODO */
+ CLIENT_ONLY(rep, rp);
+ break;
+ case REP_FILE_REQ:
+ ret = __rep_send_file(env, rec, eid);
+ break;
+#endif
+ case REP_FILE_FAIL:
+ /*
+ * Handle even if we're recovering.
+ */
+ CLIENT_ONLY(rep, rp);
+ /*
+ * Clean up any internal init that was in progress.
+ */
+ if (eid == rep->master_id) {
+ REP_SYSTEM_LOCK(env);
+ /*
+ * If we're already locking out messages, give up.
+ */
+ if (FLD_ISSET(rep->lockout_flags, REP_LOCKOUT_MSG))
+ goto errhlk;
+ /*
+ * Lock out other messages to prevent race
+ * conditions.
+ */
+ if ((ret =
+ __rep_lockout_msg(env, rep, 1)) != 0) {
+ goto errhlk;
+ }
+ lockout = 1;
+ /*
+ * Need mtx_clientdb to safely clean up
+ * page database in __rep_init_cleanup().
+ */
+ REP_SYSTEM_UNLOCK(env);
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ REP_SYSTEM_LOCK(env);
+ /*
+ * Clean up internal init if one was in progress.
+ */
+ if (ISSET_LOCKOUT_BDB(rep)) {
+ RPRINT(env, (env, DB_VERB_REP_MSGS,
+ "FILE_FAIL is cleaning up old internal init"));
+#ifdef CONFIG_TEST
+ STAT(rep->stat.st_filefail_cleanups++);
+#endif
+ ret = __rep_init_cleanup(env, rep, DB_FORCE);
+ F_CLR(rep, REP_F_ABBREVIATED);
+ CLR_RECOVERY_SETTINGS(rep);
+ }
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ if (ret != 0) {
+ RPRINT(env, (env, DB_VERB_REP_MSGS,
+ "FILE_FAIL error cleaning up internal init: %d", ret));
+ goto errhlk;
+ }
+ FLD_CLR(rep->lockout_flags, REP_LOCKOUT_MSG);
+ lockout = 0;
+ /*
+ * Restart internal init, setting UPDATE flag and
+ * zeroing applicable LSNs.
+ */
+ rep->sync_state = SYNC_UPDATE;
+ ZERO_LSN(rep->first_lsn);
+ ZERO_LSN(rep->ckp_lsn);
+ REP_SYSTEM_UNLOCK(env);
+ (void)__rep_send_message(env, eid, REP_UPDATE_REQ,
+ NULL, NULL, 0, 0);
+ }
+ break;
+ case REP_LEASE_GRANT:
+ /*
+ * Handle even if we're recovering.
+ */
+ MASTER_ONLY(rep, rp);
+ ret = __rep_lease_grant(env, rp, rec, eid);
+ break;
+ case REP_LOG:
+ case REP_LOG_MORE:
+ RECOVERING_LOG_SKIP;
+ CLIENT_ONLY(rep, rp);
+ ret = __rep_log(env, ip, rp, rec, eid, savetime, ret_lsnp);
+ break;
+ case REP_LOG_REQ:
+ RECOVERING_SKIP;
+ CLIENT_MASTERCHK;
+ if (F_ISSET(rp, REPCTL_INIT))
+ MASTER_UPDATE(env, renv);
+ ret = __rep_logreq(env, rp, rec, eid);
+ CLIENT_REREQ;
+ break;
+ case REP_NEWSITE:
+ /*
+ * Handle even if we're recovering.
+ */
+ /* We don't hold the rep mutex, and may miscount. */
+ STAT(rep->stat.st_newsites++);
+
+ /* This is a rebroadcast; simply tell the application. */
+ if (F_ISSET(rep, REP_F_MASTER)) {
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ LOG_SYSTEM_LOCK(env);
+ lsn = lp->lsn;
+ LOG_SYSTEM_UNLOCK(env);
+ (void)__rep_send_message(env,
+ eid, REP_NEWMASTER, &lsn, NULL, 0, 0);
+ }
+ ret = DB_REP_NEWSITE;
+ break;
+ case REP_NEWCLIENT:
+ /*
+ * Handle even if we're recovering.
+ */
+ /*
+ * This message was received and should have resulted in the
+ * application entering the machine ID in its machine table.
+ * We respond to this with an ALIVE to send relevant information
+ * to the new client (if we are a master, we'll send a
+ * NEWMASTER, so we only need to send the ALIVE if we're a
+ * client). But first, broadcast the new client's record to
+ * all the clients.
+ */
+ (void)__rep_send_message(env,
+ DB_EID_BROADCAST, REP_NEWSITE, &rp->lsn, rec, 0, 0);
+
+ ret = DB_REP_NEWSITE;
+
+ if (F_ISSET(rep, REP_F_CLIENT)) {
+ REP_SYSTEM_LOCK(env);
+ egen_arg.egen = rep->egen;
+
+ /*
+ * Clean up any previous master remnants by making
+ * master_id invalid and cleaning up any internal
+ * init that was in progress.
+ */
+ if (eid == rep->master_id) {
+ rep->master_id = DB_EID_INVALID;
+
+ /*
+				 * If we're already locking out messages,
+				 * we must be in sync-up recovery or
+				 * internal init; give up.
+ */
+ if (FLD_ISSET(rep->lockout_flags,
+ REP_LOCKOUT_MSG))
+ goto errhlk;
+
+ /*
+ * Lock out other messages to prevent race
+ * conditions.
+ */
+ if ((t_ret =
+ __rep_lockout_msg(env, rep, 1)) != 0) {
+ ret = t_ret;
+ goto errhlk;
+ }
+ lockout = 1;
+
+ /*
+ * Need mtx_clientdb to safely clean up
+ * page database in __rep_init_cleanup().
+ */
+ REP_SYSTEM_UNLOCK(env);
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ REP_SYSTEM_LOCK(env);
+
+ /*
+ * Clean up internal init if one was in
+ * progress.
+ */
+ if (ISSET_LOCKOUT_BDB(rep)) {
+ RPRINT(env, (env, DB_VERB_REP_MSGS,
+ "NEWCLIENT is cleaning up old internal init for invalid master"));
+ t_ret = __rep_init_cleanup(env,
+ rep, DB_FORCE);
+ F_CLR(rep, REP_F_ABBREVIATED);
+ CLR_RECOVERY_SETTINGS(rep);
+ }
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ if (t_ret != 0) {
+ ret = t_ret;
+ RPRINT(env, (env, DB_VERB_REP_MSGS,
+ "NEWCLIENT error cleaning up internal init for invalid master: %d", ret));
+ goto errhlk;
+ }
+ FLD_CLR(rep->lockout_flags, REP_LOCKOUT_MSG);
+ lockout = 0;
+ }
+ REP_SYSTEM_UNLOCK(env);
+ if (rep->version < DB_REPVERSION_47)
+ DB_INIT_DBT(data_dbt, &egen_arg.egen,
+ sizeof(egen_arg.egen));
+ else {
+ if ((ret = __rep_egen_marshal(env, &egen_arg,
+ buf, __REP_EGEN_SIZE, &len)) != 0)
+ goto errlock;
+ DB_INIT_DBT(data_dbt, buf, len);
+ }
+ (void)__rep_send_message(env, DB_EID_BROADCAST,
+ REP_ALIVE, &rp->lsn, &data_dbt, 0, 0);
+ break;
+ }
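+		/*
+		 * A non-client site (normally the master) falls through to
+		 * the MASTER_REQ case below, which broadcasts NEWMASTER so
+		 * the new client learns who the master is.
+		 */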
+ /* FALLTHROUGH */
+ case REP_MASTER_REQ:
+ RECOVERING_SKIP;
+ if (F_ISSET(rep, REP_F_MASTER)) {
+ LOG_SYSTEM_LOCK(env);
+ lsn = lp->lsn;
+ LOG_SYSTEM_UNLOCK(env);
+ (void)__rep_send_message(env,
+ DB_EID_BROADCAST, REP_NEWMASTER, &lsn, NULL, 0, 0);
+ if (IS_USING_LEASES(env))
+ (void)__rep_lease_refresh(env);
+ }
+ /*
+		 * If there is no master, an old client that lost the
+		 * initial ALIVE message may be calling an election under
+		 * an old gen and could never get to the current gen.
+ */
+ if (F_ISSET(rep, REP_F_CLIENT) && rp->gen < gen) {
+ REP_SYSTEM_LOCK(env);
+ egen_arg.egen = rep->egen;
+ if (eid == rep->master_id)
+ rep->master_id = DB_EID_INVALID;
+ REP_SYSTEM_UNLOCK(env);
+ if (rep->version < DB_REPVERSION_47)
+ DB_INIT_DBT(data_dbt, &egen_arg.egen,
+ sizeof(egen_arg.egen));
+ else {
+ if ((ret = __rep_egen_marshal(env, &egen_arg,
+ buf, __REP_EGEN_SIZE, &len)) != 0)
+ goto errlock;
+ DB_INIT_DBT(data_dbt, buf, len);
+ }
+ (void)__rep_send_message(env, eid,
+ REP_ALIVE, &rp->lsn, &data_dbt, 0, 0);
+ }
+ break;
+ case REP_NEWFILE:
+ RECOVERING_LOG_SKIP;
+ CLIENT_ONLY(rep, rp);
+ ret = __rep_apply(env,
+ ip, rp, rec, ret_lsnp, NULL, &last_lsn);
+ if (ret == DB_REP_LOGREADY)
+ ret = __rep_logready(env, rep, savetime, &last_lsn);
+ break;
+ case REP_NEWMASTER:
+ /*
+ * Handle even if we're recovering.
+ */
+ ANYSITE(rep);
+ if (F_ISSET(rep, REP_F_MASTER) &&
+ eid != rep->eid) {
+ /* We don't hold the rep mutex, and may miscount. */
+ STAT(rep->stat.st_dupmasters++);
+ ret = DB_REP_DUPMASTER;
+ if (IS_USING_LEASES(env))
+ DB_ASSERT(env,
+ __rep_lease_check(env, 0) ==
+ DB_REP_LEASE_EXPIRED);
+ else
+ (void)__rep_send_message(env,
+ DB_EID_BROADCAST, REP_DUPMASTER,
+ NULL, NULL, 0, 0);
+ break;
+ }
+ if ((ret =
+ __rep_new_master(env, rp, eid)) == DB_REP_NEWMASTER)
+ ret = __rep_fire_newmaster(env, rp->gen, eid);
+ break;
+ case REP_PAGE:
+ case REP_PAGE_FAIL:
+ case REP_PAGE_MORE:
+ /*
+ * Handle even if we're recovering.
+ */
+ CLIENT_ONLY(rep, rp);
+ ret = __rep_page(env, ip, eid, rp, rec);
+ if (ret == DB_REP_PAGEDONE)
+ ret = 0;
+ break;
+ case REP_PAGE_REQ:
+ RECOVERING_SKIP;
+ CLIENT_MASTERCHK;
+ MASTER_UPDATE(env, renv);
+ ret = __rep_page_req(env, ip, eid, rp, rec);
+ CLIENT_REREQ;
+ break;
+ case REP_REREQUEST:
+ /*
+ * Handle even if we're recovering. Don't do a master
+ * check.
+ */
+ CLIENT_ONLY(rep, rp);
+ /*
+ * Don't hold any mutex, may miscount.
+ */
+ STAT(rep->stat.st_client_rerequests++);
+ ret = __rep_resend_req(env, 1);
+ break;
+ case REP_START_SYNC:
+ RECOVERING_SKIP;
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ cmp = LOG_COMPARE(&rp->lsn, &lp->ready_lsn);
+ /*
+ * The comparison needs to be <= because the LSN in
+ * the message can be the LSN of the first outstanding
+ * txn, which may be the LSN immediately after the
+ * previous commit. The ready_lsn is the LSN of the
+ * next record expected. In that case, the LSNs
+ * could be equal and the client has the commit and
+ * wants to sync. [SR #15338]
+ */
+ if (cmp <= 0) {
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
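+			/*
+			 * Defer the actual memp_sync until the end of
+			 * message processing, once all mutexes have been
+			 * dropped; see the do_sync block after this switch.
+			 */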
+ do_sync = 1;
+ } else {
+ STAT(rep->stat.st_startsync_delayed++);
+ /*
+			 * There are cases where keeping the first ckp_lsn
+			 * is advantageous and cases where keeping a later
+			 * LSN is better. If random earlier log records
+			 * are missing, keeping the later LSN seems to be
+			 * better, so that is what we do for now.
+ */
+ if (LOG_COMPARE(&rp->lsn, &rep->ckp_lsn) > 0)
+ rep->ckp_lsn = rp->lsn;
+ RPRINT(env, (env, DB_VERB_REP_MSGS,
+ "Delayed START_SYNC memp_sync due to missing records."));
+ RPRINT(env, (env, DB_VERB_REP_MSGS,
+ "ready LSN [%lu][%lu], ckp_lsn [%lu][%lu]",
+ (u_long)lp->ready_lsn.file, (u_long)lp->ready_lsn.offset,
+ (u_long)rep->ckp_lsn.file, (u_long)rep->ckp_lsn.offset));
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ }
+ break;
+ case REP_UPDATE:
+ /*
+ * Handle even if we're recovering.
+ */
+ CLIENT_ONLY(rep, rp);
+ if ((ret = __rep_update_setup(env,
+ eid, rp, rec, savetime, &lsn)) == DB_REP_WOULDROLLBACK &&
+ ret_lsnp != NULL) {
+ /*
+			 * This can't happen during a normal internal init,
+			 * but it could happen here if we had to ask for an
+			 * UPDATE message in order to check for materializing
+			 * NIMDBs; in other words, during an "abbreviated
+			 * internal init."
+ */
+ *ret_lsnp = lsn;
+ }
+ break;
+ case REP_UPDATE_REQ:
+ /*
+ * Handle even if we're recovering.
+ */
+ MASTER_ONLY(rep, rp);
+ infop = env->reginfo;
+ renv = infop->primary;
+ MASTER_UPDATE(env, renv);
+ ret = __rep_update_req(env, rp);
+ break;
+ case REP_VERIFY:
+ if (recovering) {
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ cmp = LOG_COMPARE(&lp->verify_lsn, &rp->lsn);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ /*
+ * If this is not the verify record I want, skip it.
+ */
+ if (cmp != 0) {
+ ret = __rep_skip_msg(
+ env, rep, eid, rp->rectype);
+ break;
+ }
+ }
+ CLIENT_ONLY(rep, rp);
+ if ((ret = __rep_verify(env, rp, rec, eid, savetime)) ==
+ DB_REP_WOULDROLLBACK && ret_lsnp != NULL)
+ *ret_lsnp = rp->lsn;
+ break;
+ case REP_VERIFY_FAIL:
+ /*
+ * Handle even if we're recovering.
+ */
+ CLIENT_ONLY(rep, rp);
+ ret = __rep_verify_fail(env, rp);
+ break;
+ case REP_VERIFY_REQ:
+ RECOVERING_SKIP;
+ CLIENT_MASTERCHK;
+ ret = __rep_verify_req(env, rp, eid);
+ CLIENT_REREQ;
+ break;
+ case REP_VOTE1:
+ /*
+ * Handle even if we're recovering.
+ */
+ ret = __rep_vote1(env, rp, rec, eid);
+ break;
+ case REP_VOTE2:
+ /*
+ * Handle even if we're recovering.
+ */
+ ret = __rep_vote2(env, rp, rec, eid);
+ break;
+ default:
+ __db_errx(env, DB_STR_A("3521",
+ "DB_ENV->rep_process_message: unknown replication message: type %lu",
+ "%lu"), (u_long)rp->rectype);
+ ret = EINVAL;
+ break;
+ }
+
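+	/*
+	 * The error labels below encode how much lock state the jumping
+	 * code holds: "errlock" is entered with no locks held and
+	 * reacquires the region lock; "errhlk" is entered with the region
+	 * lock already held.  Both paths clear the message lockout if this
+	 * thread set it, then decrement the message-thread count.
+	 */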
+errlock:
+ REP_SYSTEM_LOCK(env);
+errhlk: if (lockout)
+ FLD_CLR(rep->lockout_flags, REP_LOCKOUT_MSG);
+ rep->msg_th--;
+ REP_SYSTEM_UNLOCK(env);
+ if (do_sync) {
+ MUTEX_LOCK(env, rep->mtx_ckp);
+ lsn = rp->lsn;
+ /*
+ * This is the REP_START_SYNC sync, and so we permit it to be
+ * interrupted.
+ */
+ ret = __memp_sync(
+ env, DB_SYNC_CHECKPOINT | DB_SYNC_INTERRUPT_OK, &lsn);
+ MUTEX_UNLOCK(env, rep->mtx_ckp);
+ RPRINT(env, (env, DB_VERB_REP_MSGS,
+ "START_SYNC: Completed sync [%lu][%lu]",
+ (u_long)lsn.file, (u_long)lsn.offset));
+ }
+out:
+ if (ret == 0 && F_ISSET(rp, REPCTL_PERM)) {
+ if (ret_lsnp != NULL)
+ *ret_lsnp = rp->lsn;
+ ret = DB_REP_NOTPERM;
+ }
+ __dbt_userfree(env, control, rec, NULL);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __rep_apply --
+ *
+ * Handle incoming log records on a client, applying when possible and
+ * entering into the bookkeeping table otherwise. This routine manages
+ * the state of the incoming message stream -- processing records, via
+ * __rep_process_rec, when possible and enqueuing in the __db.rep.db
+ * when necessary. As gaps in the stream are filled in, this is where
+ * we try to process as much as possible from __db.rep.db to catch up.
+ *
+ * PUBLIC: int __rep_apply __P((ENV *, DB_THREAD_INFO *, __rep_control_args *,
+ * PUBLIC: DBT *, DB_LSN *, int *, DB_LSN *));
+ */
+int
+__rep_apply(env, ip, rp, rec, ret_lsnp, is_dupp, last_lsnp)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ __rep_control_args *rp;
+ DBT *rec;
+ DB_LSN *ret_lsnp;
+ int *is_dupp;
+ DB_LSN *last_lsnp;
+{
+ DB *dbp;
+ DBT control_dbt, key_dbt;
+ DBT rec_dbt;
+ DB_LOG *dblp;
+ DB_LSN max_lsn, save_lsn;
+ DB_REP *db_rep;
+ LOG *lp;
+ REP *rep;
+ db_timespec msg_time, max_ts;
+ u_int32_t gen, rectype;
+ int cmp, event, master, newfile_seen, ret, set_apply, t_ret;
+
+ COMPQUIET(gen, 0);
+ COMPQUIET(master, DB_EID_INVALID);
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ event = ret = set_apply = 0;
+ memset(&control_dbt, 0, sizeof(control_dbt));
+ memset(&rec_dbt, 0, sizeof(rec_dbt));
+ ZERO_LSN(max_lsn);
+ timespecclear(&max_ts);
+ timespecset(&msg_time, rp->msg_sec, rp->msg_nsec);
+ cmp = -2; /* OOB value that LOG_COMPARE can't return. */
+
+ dblp = env->lg_handle;
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ /*
+ * Lazily open the temp db. Always set the startup flag to 0
+ * because it was initialized from rep_start.
+ */
+ if (db_rep->rep_db == NULL &&
+ (ret = __rep_client_dbinit(env, 0, REP_DB)) != 0) {
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ goto out;
+ }
+ dbp = db_rep->rep_db;
+ lp = dblp->reginfo.primary;
+ newfile_seen = 0;
+ REP_SYSTEM_LOCK(env);
+ if (rep->sync_state == SYNC_LOG &&
+ LOG_COMPARE(&lp->ready_lsn, &rep->first_lsn) < 0)
+ lp->ready_lsn = rep->first_lsn;
+ cmp = LOG_COMPARE(&rp->lsn, &lp->ready_lsn);
+ /*
+ * If we are going to skip or process any message other
+ * than a duplicate, make note of it if we're in an
+ * election so that the election can rerequest proactively.
+ */
+ if (FLD_ISSET(rep->lockout_flags, REP_LOCKOUT_APPLY) && cmp >= 0)
+ F_SET(rep, REP_F_SKIPPED_APPLY);
+
+ /*
+ * If we're in the middle of processing a NEWFILE, we've dropped
+ * the mutex and if this matches it is a duplicate record. We
+ * do not want this call taking the "matching" code below because
+ * we may then process later records in the temp db and the
+ * original NEWFILE may not have the log file ready. It will
+ * process those temp db items when it completes.
+ */
+ if (F_ISSET(rep, REP_F_NEWFILE) && cmp == 0)
+ cmp = -1;
+
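+	/*
+	 * Dispatch on cmp: 0 means this is the record we're expecting, so
+	 * apply it (and anything it unblocks in the temp db); > 0 means it
+	 * arrived early, so queue it in the temp db; < 0 means it is a
+	 * duplicate we have already handled.
+	 */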
+ if (cmp == 0) {
+ /*
+ * If we are in an election (i.e. we've sent a vote
+ * with an LSN in it), then we drop the next record
+ * we're expecting. When we find a master, we'll
+ * either go into sync, or if it was an existing
+ * master, rerequest this one record (later records
+ * are accumulating in the temp db).
+ *
+ * We can simply return here, and rep_process_message
+ * will set NOTPERM if necessary for this record.
+ */
+ if (FLD_ISSET(rep->lockout_flags, REP_LOCKOUT_APPLY)) {
+ /*
+ * We will simply return now. All special return
+ * processing should be ignored because the special
+ * values are just initialized. Variables like
+ * max_lsn are still 0.
+ */
+ RPRINT(env, (env, DB_VERB_REP_MISC,
+ "rep_apply: In election. Ignoring [%lu][%lu]",
+ (u_long)rp->lsn.file, (u_long)rp->lsn.offset));
+ REP_SYSTEM_UNLOCK(env);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ goto out;
+ }
+ rep->apply_th++;
+ set_apply = 1;
+ VPRINT(env, (env, DB_VERB_REP_MISC,
+ "rep_apply: Set apply_th %d", rep->apply_th));
+ REP_SYSTEM_UNLOCK(env);
+ if (rp->rectype == REP_NEWFILE)
+ newfile_seen = 1;
+ if ((ret = __rep_process_rec(env, ip,
+ rp, rec, &max_ts, &max_lsn)) != 0)
+ goto err;
+ /*
+ * If we get the record we are expecting, reset
+ * the count of records we've received and are applying
+ * towards the request interval.
+ */
+ __os_gettime(env, &lp->rcvd_ts, 1);
+ ZERO_LSN(lp->max_wait_lsn);
+
+ /*
+ * The __rep_remfirst() and __rep_getnext() functions each open,
+ * use and then close a cursor on the temp db, each time through
+ * the loop. Although this may seem excessive, it is necessary
+ * to avoid locking problems with checkpoints.
+ */
+ while (ret == 0 &&
+ LOG_COMPARE(&lp->ready_lsn, &lp->waiting_lsn) == 0) {
+ /*
+ * We just filled in a gap in the log record stream.
+ * Write subsequent records to the log.
+ */
+gap_check:
+ if ((ret = __rep_remfirst(env, ip,
+ &control_dbt, &rec_dbt)) != 0)
+ goto err;
+
+ rp = (__rep_control_args *)control_dbt.data;
+ timespecset(&msg_time, rp->msg_sec, rp->msg_nsec);
+ rec = &rec_dbt;
+ if (rp->rectype == REP_NEWFILE)
+ newfile_seen = 1;
+ if ((ret = __rep_process_rec(env, ip,
+ rp, rec, &max_ts, &max_lsn)) != 0)
+ goto err;
+
+ STAT(--rep->stat.st_log_queued);
+
+ /*
+			 * Since we just filled a gap in the log stream and
+			 * are writing subsequent records to the log, reset
+			 * rcvd_ts and wait_ts so that we will request the
+			 * next gap if we end up with one and the temp db
+			 * holds only stale records, but will not request
+			 * if recent records are in the temp db and likely
+			 * to arrive on their own shortly. Also reset
+			 * max_wait_lsn because the next gap is a fresh gap.
+ */
+ lp->rcvd_ts = lp->last_ts;
+ lp->wait_ts = rep->request_gap;
+ if ((ret = __rep_getnext(env, ip)) == DB_NOTFOUND) {
+ __os_gettime(env, &lp->rcvd_ts, 1);
+ ret = 0;
+ break;
+ } else if (ret != 0)
+ goto err;
+ }
+
+ /*
+ * Check if we're at a gap in the table and if so, whether we
+ * need to ask for any records.
+ */
+ if (!IS_ZERO_LSN(lp->waiting_lsn) &&
+ LOG_COMPARE(&lp->ready_lsn, &lp->waiting_lsn) != 0) {
+ /*
+ * We got a record and processed it, but we may
+ * still be waiting for more records. If we
+ * filled a gap we keep a count of how many other
+ * records are in the temp database and if we should
+ * request the next gap at this time.
+ */
+ if (__rep_check_doreq(env, rep) && (ret =
+ __rep_loggap_req(env, rep, &rp->lsn, 0)) != 0)
+ goto err;
+ } else {
+ lp->wait_ts = rep->request_gap;
+ ZERO_LSN(lp->max_wait_lsn);
+ }
+
+ } else if (cmp > 0) {
+ /*
+ * The LSN is higher than the one we were waiting for.
+ * This record isn't in sequence; add it to the temporary
+ * database, update waiting_lsn if necessary, and perform
+ * calculations to determine if we should issue requests
+ * for new records.
+ */
+ REP_SYSTEM_UNLOCK(env);
+ memset(&key_dbt, 0, sizeof(key_dbt));
+ key_dbt.data = rp;
+ key_dbt.size = sizeof(*rp);
+ ret = __db_put(dbp, ip, NULL, &key_dbt, rec, DB_NOOVERWRITE);
+ if (ret == 0) {
+ STAT(rep->stat.st_log_queued++);
+ __os_gettime(env, &lp->last_ts, 1);
+#ifdef HAVE_STATISTICS
+ rep->stat.st_log_queued_total++;
+ if (rep->stat.st_log_queued_max <
+ rep->stat.st_log_queued)
+ rep->stat.st_log_queued_max =
+ rep->stat.st_log_queued;
+#endif
+ }
+
+ if (ret == DB_KEYEXIST)
+ ret = 0;
+ if (ret != 0 && ret != ENOMEM)
+ goto done;
+
+ /*
+		 * If we are using an in-memory temp database and got
+		 * ENOMEM, it is not an error. But in that case we want
+		 * to skip comparing the message LSN since we're not
+		 * storing it. However, we do want to continue checking
+		 * whether we need to send a request for the gap.
+ */
+ if (ret == 0 && (IS_ZERO_LSN(lp->waiting_lsn) ||
+ LOG_COMPARE(&rp->lsn, &lp->waiting_lsn) < 0)) {
+ /*
+ * If this is a new gap, then reset the rcvd_ts so
+ * that an out-of-order record after an idle period
+ * does not (likely) immediately rerequest.
+ */
+ if (IS_ZERO_LSN(lp->waiting_lsn))
+ __os_gettime(env, &lp->rcvd_ts, 1);
+ lp->waiting_lsn = rp->lsn;
+ }
+
+ if (__rep_check_doreq(env, rep) &&
+		    (ret = __rep_loggap_req(env, rep, &rp->lsn, 0)) != 0)
+ goto err;
+
+ /*
+		 * If this is permanent, let the caller know that we have
+		 * accepted it but have not yet written it to disk.
+ */
+ if (ret == 0 && F_ISSET(rp, REPCTL_PERM)) {
+ max_lsn = rp->lsn;
+ ret = DB_REP_NOTPERM;
+ }
+ goto done;
+ } else {
+ STAT(rep->stat.st_log_duplicated++);
+ REP_SYSTEM_UNLOCK(env);
+ if (is_dupp != NULL) {
+ *is_dupp = 1;
+ /*
+ * Could get overwritten by max_lsn later.
+ * But max_lsn is guaranteed <= ready_lsn, so
+ * it would be a more conservative LSN to return.
+ */
+ *ret_lsnp = lp->ready_lsn;
+ }
+ LOGCOPY_32(env, &rectype, rec->data);
+ if (rectype == DB___txn_regop || rectype == DB___txn_ckp)
+ max_lsn = lp->max_perm_lsn;
+ /*
+		 * We check REPCTL_LEASE here because this client may
+		 * have leases configured but the master may not
+		 * (especially in a mixed-version group). If the master
+		 * has leases configured, all clients must also.
+ */
+ if (IS_USING_LEASES(env) &&
+ F_ISSET(rp, REPCTL_LEASE) &&
+ timespecisset(&msg_time)) {
+ if (timespeccmp(&msg_time, &lp->max_lease_ts, >))
+ max_ts = msg_time;
+ else
+ max_ts = lp->max_lease_ts;
+ }
+ goto done;
+ }
+
+ /* Check if we need to go back into the table. */
+ if (ret == 0 && LOG_COMPARE(&lp->ready_lsn, &lp->waiting_lsn) == 0)
+ goto gap_check;
+
+done:
+err: /*
+ * In case of a race, to make sure only one thread can get
+ * DB_REP_LOGREADY, zero out rep->last_lsn to show that we've gotten to
+ * this point.
+ */
+ REP_SYSTEM_LOCK(env);
+ if (ret == 0 &&
+ rep->sync_state == SYNC_LOG &&
+ !IS_ZERO_LSN(rep->last_lsn) &&
+ LOG_COMPARE(&lp->ready_lsn, &rep->last_lsn) >= 0) {
+ *last_lsnp = max_lsn;
+ ZERO_LSN(rep->last_lsn);
+ ZERO_LSN(max_lsn);
+ ret = DB_REP_LOGREADY;
+ }
+ /*
+ * Only decrement if we were actually applying log records.
+ * We do not care if we processed a dup record or put one
+ * in the temp db.
+ */
+ if (set_apply) {
+ rep->apply_th--;
+ VPRINT(env, (env, DB_VERB_REP_MISC,
+ "rep_apply: Decrement apply_th %d [%lu][%lu]",
+ rep->apply_th, (u_long)lp->ready_lsn.file,
+ (u_long)lp->ready_lsn.offset));
+ }
+
+ if (ret == 0 && rep->sync_state != SYNC_LOG &&
+ !IS_ZERO_LSN(max_lsn)) {
+ if (ret_lsnp != NULL)
+ *ret_lsnp = max_lsn;
+ ret = DB_REP_ISPERM;
+ DB_ASSERT(env, LOG_COMPARE(&max_lsn, &lp->max_perm_lsn) >= 0);
+ lp->max_perm_lsn = max_lsn;
+ if ((t_ret = __rep_notify_threads(env, AWAIT_LSN)) != 0)
+ ret = t_ret;
+ }
+
+ /*
+ * Start-up is complete when we process (or have already processed) up
+ * to the end of the replication group's log. In case we miss that
+ * message, as a back-up, we also recognize start-up completion when we
+ * actually process a live log record. Having cmp==0 here (with a good
+ * "ret" value) implies we actually processed the record.
+ */
+ if ((ret == 0 || ret == DB_REP_ISPERM) &&
+ rep->stat.st_startup_complete == 0 &&
+ rep->sync_state != SYNC_LOG &&
+ ((cmp <= 0 && F_ISSET(rp, REPCTL_LOG_END)) ||
+ (cmp == 0 && !F_ISSET(rp, REPCTL_RESEND)))) {
+ rep->stat.st_startup_complete = 1;
+ event = 1;
+ gen = rep->gen;
+ master = rep->master_id;
+ }
+ REP_SYSTEM_UNLOCK(env);
+ /*
+ * If we've processed beyond the needed LSN for a pending
+ * start sync, start it now. We must compare > here
+ * because ready_lsn is the next record we expect and if
+ * the last record is a commit, that will dirty pages on
+ * a client as that txn is applied.
+ */
+ if (!IS_ZERO_LSN(rep->ckp_lsn) &&
+ LOG_COMPARE(&lp->ready_lsn, &rep->ckp_lsn) > 0) {
+ save_lsn = rep->ckp_lsn;
+ ZERO_LSN(rep->ckp_lsn);
+ } else
+ ZERO_LSN(save_lsn);
+
+ /*
+	 * If this is a perm record and we are using leases, update the lease
+ * grant. We must hold the clientdb mutex. We must not hold
+ * the region mutex because rep_update_grant will acquire it.
+ */
+ if (ret == DB_REP_ISPERM && IS_USING_LEASES(env) &&
+ timespecisset(&max_ts)) {
+ if ((t_ret = __rep_update_grant(env, &max_ts)) != 0)
+ ret = t_ret;
+ else if (timespeccmp(&max_ts, &lp->max_lease_ts, >))
+ lp->max_lease_ts = max_ts;
+ }
+
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ if (!IS_ZERO_LSN(save_lsn)) {
+ /*
+ * Now call memp_sync holding only the ckp mutex.
+ */
+ MUTEX_LOCK(env, rep->mtx_ckp);
+ RPRINT(env, (env, DB_VERB_REP_MISC,
+ "Starting delayed __memp_sync call [%lu][%lu]",
+ (u_long)save_lsn.file, (u_long)save_lsn.offset));
+ t_ret = __memp_sync(env,
+ DB_SYNC_CHECKPOINT | DB_SYNC_INTERRUPT_OK, &save_lsn);
+ MUTEX_UNLOCK(env, rep->mtx_ckp);
+ }
+ if (event) {
+ RPRINT(env, (env, DB_VERB_REP_MISC,
+ "Start-up is done [%lu][%lu]",
+ (u_long)rp->lsn.file, (u_long)rp->lsn.offset));
+
+ if ((t_ret = __rep_fire_startupdone(env, gen, master)) != 0) {
+ DB_ASSERT(env, ret == 0 || ret == DB_REP_ISPERM);
+ /* Failure trumps either of those values. */
+ ret = t_ret;
+ goto out;
+ }
+ }
+ if ((ret == 0 || ret == DB_REP_ISPERM) &&
+ newfile_seen && lp->db_log_autoremove)
+ __log_autoremove(env);
+ if (control_dbt.data != NULL)
+ __os_ufree(env, control_dbt.data);
+ if (rec_dbt.data != NULL)
+ __os_ufree(env, rec_dbt.data);
+
+out:
+ switch (ret) {
+ case 0:
+ break;
+ case DB_REP_ISPERM:
+ VPRINT(env, (env, DB_VERB_REP_MSGS,
+ "Returning ISPERM [%lu][%lu], cmp = %d",
+ (u_long)max_lsn.file, (u_long)max_lsn.offset, cmp));
+ break;
+ case DB_REP_LOGREADY:
+ RPRINT(env, (env, DB_VERB_REP_MSGS,
+ "Returning LOGREADY up to [%lu][%lu], cmp = %d",
+ (u_long)last_lsnp->file,
+ (u_long)last_lsnp->offset, cmp));
+ break;
+ case DB_REP_NOTPERM:
+ if (rep->sync_state != SYNC_LOG &&
+ !IS_ZERO_LSN(max_lsn) && ret_lsnp != NULL)
+ *ret_lsnp = max_lsn;
+
+ VPRINT(env, (env, DB_VERB_REP_MSGS,
+ "Returning NOTPERM [%lu][%lu], cmp = %d",
+ (u_long)max_lsn.file, (u_long)max_lsn.offset, cmp));
+ break;
+ default:
+ RPRINT(env, (env, DB_VERB_REP_MSGS,
+ "Returning %d [%lu][%lu], cmp = %d", ret,
+ (u_long)max_lsn.file, (u_long)max_lsn.offset, cmp));
+ break;
+ }
+
+ return (ret);
+}
+
+/*
+ * __rep_process_txn --
+ *
+ * This is the routine that actually gets a transaction ready for
+ * processing.
+ *
+ * PUBLIC: int __rep_process_txn __P((ENV *, DBT *));
+ */
+int
+__rep_process_txn(env, rec)
+ ENV *env;
+ DBT *rec;
+{
+ DBT data_dbt, *lock_dbt;
+ DB_LOCKER *locker;
+ DB_LOCKREQ req, *lvp;
+ DB_LOGC *logc;
+ DB_LSN prev_lsn, *lsnp;
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ DB_TXNHEAD *txninfo;
+ LSN_COLLECTION lc;
+ REP *rep;
+ __txn_regop_args *txn_args;
+ __txn_regop_42_args *txn42_args;
+ __txn_prepare_args *prep_args;
+ u_int32_t rectype;
+ u_int i;
+ int ret, t_ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ logc = NULL;
+ txn_args = NULL;
+ txn42_args = NULL;
+ prep_args = NULL;
+ txninfo = NULL;
+
+ ENV_ENTER(env, ip);
+ memset(&data_dbt, 0, sizeof(data_dbt));
+ if (F_ISSET(env, ENV_THREAD))
+ F_SET(&data_dbt, DB_DBT_REALLOC);
+
+ /*
+ * There are two phases: First, we have to traverse backwards through
+ * the log records gathering the list of all LSNs in the transaction.
+ * Once we have this information, we can loop through and then apply it.
+ *
+ * We may be passed a prepare (if we're restoring a prepare on upgrade)
+ * instead of a commit (the common case). Check which it is and behave
+ * appropriately.
+ */
+ LOGCOPY_32(env, &rectype, rec->data);
+ memset(&lc, 0, sizeof(lc));
+ if (rectype == DB___txn_regop) {
+ /*
+ * We're the end of a transaction. Make sure this is
+ * really a commit and not an abort!
+ */
+ if (rep->version >= DB_REPVERSION_44) {
+ if ((ret = __txn_regop_read(
+ env, rec->data, &txn_args)) != 0)
+ return (ret);
+ if (txn_args->opcode != TXN_COMMIT) {
+ __os_free(env, txn_args);
+ return (0);
+ }
+ prev_lsn = txn_args->prev_lsn;
+ lock_dbt = &txn_args->locks;
+ } else {
+ if ((ret = __txn_regop_42_read(
+ env, rec->data, &txn42_args)) != 0)
+ return (ret);
+ if (txn42_args->opcode != TXN_COMMIT) {
+ __os_free(env, txn42_args);
+ return (0);
+ }
+ prev_lsn = txn42_args->prev_lsn;
+ lock_dbt = &txn42_args->locks;
+ }
+ } else {
+ /* We're a prepare. */
+ DB_ASSERT(env, rectype == DB___txn_prepare);
+
+ if ((ret = __txn_prepare_read(
+ env, rec->data, &prep_args)) != 0)
+ return (ret);
+ prev_lsn = prep_args->prev_lsn;
+ lock_dbt = &prep_args->locks;
+ }
+
+ /* Get locks. */
+ if ((ret = __lock_id(env, NULL, &locker)) != 0)
+ goto err1;
+
+ /* We are always more important than user transactions. */
+ locker->priority = DB_LOCK_MAXPRIORITY;
+
+ if ((ret =
+ __lock_get_list(env, locker, 0, DB_LOCK_WRITE, lock_dbt)) != 0)
+ goto err;
+
+ /* Phase 1. Get a list of the LSNs in this transaction, and sort it. */
+ if ((ret = __rep_collect_txn(env, &prev_lsn, &lc)) != 0)
+ goto err;
+ qsort(lc.array, lc.nlsns, sizeof(DB_LSN), __rep_lsn_cmp);
+
+ /*
+ * The set of records for a transaction may include dbreg_register
+ * records. Create a txnlist so that they can keep track of file
+ * state between records.
+ */
+ if ((ret = __db_txnlist_init(env, ip, 0, 0, NULL, &txninfo)) != 0)
+ goto err;
+
+ /* Phase 2: Apply updates. */
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ goto err;
+ for (lsnp = &lc.array[0], i = 0; i < lc.nlsns; i++, lsnp++) {
+ if ((ret = __logc_get(logc, lsnp, &data_dbt, DB_SET)) != 0) {
+ __db_errx(env, DB_STR_A("3522",
+ "failed to read the log at [%lu][%lu]", "%lu %lu"),
+ (u_long)lsnp->file, (u_long)lsnp->offset);
+ goto err;
+ }
+ if ((ret = __db_dispatch(env, &env->recover_dtab,
+ &data_dbt, lsnp, DB_TXN_APPLY, txninfo)) != 0) {
+ __db_errx(env, DB_STR_A("3523",
+ "transaction failed at [%lu][%lu]", "%lu %lu"),
+ (u_long)lsnp->file, (u_long)lsnp->offset);
+ goto err;
+ }
+ }
+
+err: memset(&req, 0, sizeof(req));
+ req.op = DB_LOCK_PUT_ALL;
+ if ((t_ret =
+ __lock_vec(env, locker, 0, &req, 1, &lvp)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if ((t_ret = __lock_id_free(env, locker)) != 0 && ret == 0)
+ ret = t_ret;
+
+err1: if (txn_args != NULL)
+ __os_free(env, txn_args);
+ if (txn42_args != NULL)
+ __os_free(env, txn42_args);
+ if (prep_args != NULL)
+ __os_free(env, prep_args);
+ if (lc.array != NULL)
+ __os_free(env, lc.array);
+
+ if (logc != NULL && (t_ret = __logc_close(logc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (txninfo != NULL)
+ __db_txnlist_end(env, txninfo);
+
+ if (F_ISSET(&data_dbt, DB_DBT_REALLOC) && data_dbt.data != NULL)
+ __os_ufree(env, data_dbt.data);
+
+#ifdef HAVE_STATISTICS
+ if (ret == 0)
+ /*
+ * We don't hold the rep mutex, and could miscount if we race.
+ */
+ rep->stat.st_txns_applied++;
+#endif
+
+ return (ret);
+}
+
+/*
+ * __rep_collect_txn
+ * Recursive function that will let us visit every entry in a transaction
+ * chain including all child transactions so that we can then apply
+ * the entire transaction family at once.
+ */
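+/*
+ * Sketch of the traversal (illustrative): each log record ends with a
+ * prev_lsn pointing at the previous record of the same transaction, and
+ * a __txn_child record branches to the child's chain:
+ *
+ *	commit -> rec -> __txn_child -> rec -> ... (parent chain)
+ *	                      |
+ *	                      +-> child's last rec -> ... (recursed)
+ *
+ * The walk ends when prev_lsn is zero, and every visited LSN is
+ * appended to the LSN_COLLECTION for sorting and apply.
+ */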
+static int
+__rep_collect_txn(env, lsnp, lc)
+ ENV *env;
+ DB_LSN *lsnp;
+ LSN_COLLECTION *lc;
+{
+ __txn_child_args *argp;
+ DB_LOGC *logc;
+ DB_LSN c_lsn;
+ DBT data;
+ u_int32_t rectype;
+ u_int nalloc;
+ int ret, t_ret;
+
+ memset(&data, 0, sizeof(data));
+ F_SET(&data, DB_DBT_REALLOC);
+
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ return (ret);
+
+ while (!IS_ZERO_LSN(*lsnp) &&
+ (ret = __logc_get(logc, lsnp, &data, DB_SET)) == 0) {
+ LOGCOPY_32(env, &rectype, data.data);
+ if (rectype == DB___txn_child) {
+ if ((ret = __txn_child_read(
+ env, data.data, &argp)) != 0)
+ goto err;
+ c_lsn = argp->c_lsn;
+ *lsnp = argp->prev_lsn;
+ __os_free(env, argp);
+ ret = __rep_collect_txn(env, &c_lsn, lc);
+ } else {
+ if (lc->nalloc < lc->nlsns + 1) {
+ nalloc = lc->nalloc == 0 ? 20 : lc->nalloc * 2;
+ if ((ret = __os_realloc(env,
+ nalloc * sizeof(DB_LSN), &lc->array)) != 0)
+ goto err;
+ lc->nalloc = nalloc;
+ }
+ lc->array[lc->nlsns++] = *lsnp;
+
+ /*
+ * Explicitly copy the previous lsn. The record
+ * starts with a u_int32_t record type, a u_int32_t
+ * txn id, and then the DB_LSN (prev_lsn) that we
+ * want. We copy explicitly because we have no idea
+ * what kind of record this is.
+ */
+ LOGCOPY_TOLSN(env, lsnp, (u_int8_t *)data.data +
+ sizeof(u_int32_t) + sizeof(u_int32_t));
+ }
+
+ if (ret != 0)
+ goto err;
+ }
+ if (ret != 0)
+ __db_errx(env, DB_STR_A("3524",
+ "collect failed at: [%lu][%lu]", "%lu %lu"),
+ (u_long)lsnp->file, (u_long)lsnp->offset);
+
+err: if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (data.data != NULL)
+ __os_ufree(env, data.data);
+ return (ret);
+}
+
+/*
+ * __rep_lsn_cmp --
+ * qsort-type-compatible wrapper for LOG_COMPARE.
+ */
+static int
+__rep_lsn_cmp(lsn1, lsn2)
+ const void *lsn1, *lsn2;
+{
+
+ return (LOG_COMPARE((DB_LSN *)lsn1, (DB_LSN *)lsn2));
+}
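+
+/*
+ * For illustration, __rep_process_txn above sorts the collected LSN
+ * array with this comparator before applying the records in order:
+ *
+ *	qsort(lc.array, lc.nlsns, sizeof(DB_LSN), __rep_lsn_cmp);
+ *
+ * LOG_COMPARE orders LSNs first by file number, then by offset, so the
+ * array ends up in log-sequential order.
+ */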
+
+/*
+ * __rep_newfile --
+ * NEWFILE messages have the LSN of the last record in the previous
+ * log file. When applying a NEWFILE message, make sure we haven't already
+ * swapped files. Assumes the caller holds mtx_clientdb.
+ */
+static int
+__rep_newfile(env, rp, rec)
+ ENV *env;
+ __rep_control_args *rp;
+ DBT *rec;
+{
+ DB_LOG *dblp;
+ DB_LSN tmplsn;
+ DB_REP *db_rep;
+ LOG *lp;
+ REP *rep;
+ __rep_newfile_args nf_args;
+ int ret;
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ /*
+ * If a newfile is already in progress, just ignore.
+ */
+ if (F_ISSET(rep, REP_F_NEWFILE))
+ return (0);
+ if (rp->lsn.file + 1 > lp->ready_lsn.file) {
+ if (rec == NULL || rec->size == 0) {
+ RPRINT(env, (env, DB_VERB_REP_MISC,
+"rep_newfile: Old-style NEWFILE msg. Use control msg log version: %lu",
+ (u_long) rp->log_version));
+ nf_args.version = rp->log_version;
+ } else if (rp->rep_version < DB_REPVERSION_47)
+ nf_args.version = *(u_int32_t *)rec->data;
+ else if ((ret = __rep_newfile_unmarshal(env, &nf_args,
+ rec->data, rec->size, NULL)) != 0)
+ return (ret);
+ RPRINT(env, (env, DB_VERB_REP_MISC,
+ "rep_newfile: File %lu vers %lu",
+ (u_long)rp->lsn.file + 1, (u_long)nf_args.version));
+
+ /*
+ * We drop the mtx_clientdb mutex during
+ * the file operation, and then reacquire it when
+ * we're done. We avoid colliding with new incoming
+ * log records because lp->ready_lsn is not getting
+ * updated and there is no real log record at this
+ * ready_lsn. We avoid colliding with a duplicate
+ * NEWFILE message by setting an in-progress flag.
+ */
+ REP_SYSTEM_LOCK(env);
+ F_SET(rep, REP_F_NEWFILE);
+ REP_SYSTEM_UNLOCK(env);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ LOG_SYSTEM_LOCK(env);
+ ret = __log_newfile(dblp, &tmplsn, 0, nf_args.version);
+ LOG_SYSTEM_UNLOCK(env);
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ REP_SYSTEM_LOCK(env);
+ F_CLR(rep, REP_F_NEWFILE);
+ REP_SYSTEM_UNLOCK(env);
+ if (ret == 0)
+ lp->ready_lsn = tmplsn;
+ return (ret);
+ } else
+ /* We've already applied this NEWFILE. Just ignore it. */
+ return (0);
+}
+
+/*
+ * __rep_do_ckp --
+ * Perform the memp_sync necessary for this checkpoint without holding the
+ * REP->mtx_clientdb. Callers of this function must hold REP->mtx_clientdb
+ * and must not be holding the region mutex.
+ */
+static int
+__rep_do_ckp(env, rec, rp)
+ ENV *env;
+ DBT *rec;
+ __rep_control_args *rp;
+{
+ DB_ENV *dbenv;
+ __txn_ckp_args *ckp_args;
+ DB_LSN ckp_lsn;
+ REP *rep;
+ int ret;
+
+ dbenv = env->dbenv;
+
+ /* Crack the log record and extract the checkpoint LSN. */
+ if ((ret = __txn_ckp_read(env, rec->data, &ckp_args)) != 0)
+ return (ret);
+ ckp_lsn = ckp_args->ckp_lsn;
+ __os_free(env, ckp_args);
+
+ rep = env->rep_handle->region;
+
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ DB_TEST_WAIT(env, env->test_check);
+
+ /*
+ * Sync the memory pool.
+ *
+ * This is the real PERM lock record/ckp. We cannot return ISPERM
+ * if we haven't truly completed the checkpoint, so we don't allow
+ * this call to be interrupted.
+ *
+ * We may be overlapping our log record with an in-progress startsync
+ * of this checkpoint; suppress the max_write settings on any running
+ * cache-flush operation so it completes quickly.
+ */
+ (void)__memp_set_config(dbenv, DB_MEMP_SUPPRESS_WRITE, 1);
+ MUTEX_LOCK(env, rep->mtx_ckp);
+ ret = __memp_sync(env, DB_SYNC_CHECKPOINT, &ckp_lsn);
+ MUTEX_UNLOCK(env, rep->mtx_ckp);
+ (void)__memp_set_config(dbenv, DB_MEMP_SUPPRESS_WRITE, 0);
+
+ /* Update the last_ckp in the txn region. */
+ if (ret == 0)
+ ret = __txn_updateckp(env, &rp->lsn);
+ else {
+ __db_errx(env, DB_STR_A("3525",
+ "Error syncing ckp [%lu][%lu]", "%lu %lu"),
+ (u_long)ckp_lsn.file, (u_long)ckp_lsn.offset);
+ ret = __env_panic(env, ret);
+ }
+
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ return (ret);
+}
+
+/*
+ * __rep_remfirst --
+ * Remove the first entry from the __db.rep.db
+ */
+static int
+__rep_remfirst(env, ip, cntrl, rec)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ DBT *cntrl;
+ DBT *rec;
+{
+ DB *dbp;
+ DBC *dbc;
+ DB_REP *db_rep;
+ int ret, t_ret;
+
+ db_rep = env->rep_handle;
+ dbp = db_rep->rep_db;
+ if ((ret = __db_cursor(dbp, ip, NULL, &dbc, 0)) != 0)
+ return (ret);
+
+ /* The DBTs need to persist through another call. */
+ F_SET(cntrl, DB_DBT_REALLOC);
+ F_SET(rec, DB_DBT_REALLOC);
+ if ((ret = __dbc_get(dbc, cntrl, rec, DB_RMW | DB_FIRST)) == 0)
+ ret = __dbc_del(dbc, 0);
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __rep_getnext --
+ * Get the next record out of the __db.rep.db table.
+ */
+static int
+__rep_getnext(env, ip)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+{
+ DB *dbp;
+ DBC *dbc;
+ DBT lsn_dbt, nextrec_dbt;
+ DB_LOG *dblp;
+ DB_REP *db_rep;
+ LOG *lp;
+ __rep_control_args *rp;
+ int ret, t_ret;
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ db_rep = env->rep_handle;
+ dbp = db_rep->rep_db;
+
+ if ((ret = __db_cursor(dbp, ip, NULL, &dbc, 0)) != 0)
+ return (ret);
+
+ /*
+ * Update waiting_lsn. We need to move it
+ * forward to the LSN of the next record
+ * in the queue.
+ *
+ * If the next item in the database is a log
+ * record--the common case--we're not
+ * interested in its contents, just in its LSN.
+ * Optimize by doing a partial get of the data item.
+ */
+ memset(&nextrec_dbt, 0, sizeof(nextrec_dbt));
+ F_SET(&nextrec_dbt, DB_DBT_PARTIAL);
+ nextrec_dbt.ulen = nextrec_dbt.dlen = 0;
+
+ memset(&lsn_dbt, 0, sizeof(lsn_dbt));
+ ret = __dbc_get(dbc, &lsn_dbt, &nextrec_dbt, DB_FIRST);
+ if (ret != DB_NOTFOUND && ret != 0)
+ goto err;
+
+ if (ret == DB_NOTFOUND) {
+ ZERO_LSN(lp->waiting_lsn);
+ /*
+ * Whether or not the current record is
+ * simple, there's no next one, and
+ * therefore we haven't got anything
+ * else to do right now. Break out.
+ */
+ goto err;
+ }
+ rp = (__rep_control_args *)lsn_dbt.data;
+ lp->waiting_lsn = rp->lsn;
+
+err: if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __rep_process_rec --
+ *
+ * Given a record in 'rp', process it. In the case of a NEWFILE, that means
+ * potentially switching files. In the case of a checkpoint, it means doing
+ * the checkpoint, and in other cases, it means simply writing the record into
+ * the log.
+ */
+static int
+__rep_process_rec(env, ip, rp, rec, ret_tsp, ret_lsnp)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ __rep_control_args *rp;
+ DBT *rec;
+ db_timespec *ret_tsp;
+ DB_LSN *ret_lsnp;
+{
+ DB *dbp;
+ DBT control_dbt, key_dbt, rec_dbt;
+ DB_LOG *dblp;
+ DB_REP *db_rep;
+ DB_LOGC *logc;
+ LOG *lp;
+ REP *rep;
+ DB_LSN lsn;
+ db_timespec msg_time;
+ u_int32_t rectype, txnid;
+ int ret, t_ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ dbp = db_rep->rep_db;
+ ret = 0;
+
+ memset(&rec_dbt, 0, sizeof(rec_dbt));
+ if (rp->rectype == REP_NEWFILE) {
+ if ((ret = __rep_newfile(env, rp, rec)) != 0)
+ return (ret);
+
+ /*
+ * In SYNC_LOG, in case the end-of-log sync point happens to be
+ * right at the file boundary, we need to make sure ret_lsnp
+ * points to a real log record, rather than the "dead space" at
+ * the end of the file that the NEWFILE msg normally points to.
+ */
+ if (rep->sync_state == SYNC_LOG) {
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ return (ret);
+ if ((ret = __logc_get(logc,
+ &lsn, &rec_dbt, DB_LAST)) == 0)
+ *ret_lsnp = lsn;
+ if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ return (ret);
+ }
+
+ LOGCOPY_32(env, &rectype, rec->data);
+ memset(&control_dbt, 0, sizeof(control_dbt));
+ timespecset(&msg_time, rp->msg_sec, rp->msg_nsec);
+
+ /*
+ * We write all records except for checkpoint records here.
+ * All non-checkpoint records need to appear in the log before
+ * we take action upon them (i.e., we enforce write-ahead logging).
+ * However, we can't write the checkpoint record here until the
+ * data buffers are actually written to disk, else we are creating
+ * an invalid log -- one that says all data before a certain point
+ * has been written to disk.
+ *
+ * If two threads are both processing the same checkpoint record
+ * (because, for example, it was resent and the original finally
+ * arrived), we handle that below by checking for the existence of
+ * the log record when we add it to the replication database.
+ *
+ * Any log records that arrive while we are processing the checkpoint
+ * are added to the bookkeeping database because ready_lsn is not yet
+ * updated to point after the checkpoint record.
+ */
+ if (rectype != DB___txn_ckp || rep->sync_state == SYNC_LOG) {
+ if ((ret = __log_rep_put(env, &rp->lsn, rec, 0)) != 0)
+ return (ret);
+ STAT(rep->stat.st_log_records++);
+ if (rep->sync_state == SYNC_LOG) {
+ *ret_lsnp = rp->lsn;
+ goto out;
+ }
+ }
+
+ switch (rectype) {
+ case DB___dbreg_register:
+ /*
+ * DB opens occur in the context of a transaction, so we can
+ * simply handle them when we process the transaction. Closes,
+ * however, are not transaction-protected, so we have to handle
+ * them here.
+ *
+		 * The master should never close a file that was opened
+		 * in an active transaction, so we should be guaranteed
+		 * to get the ordering right.
+ *
+ * !!!
+ * The txn ID is the second 4-byte field of the log record.
+ * We should really be calling __dbreg_register_read() and
+ * working from the __dbreg_register_args structure, but this
+ * is considerably faster and the order of the fields won't
+ * change.
+ */
+ LOGCOPY_32(env, &txnid,
+ (u_int8_t *)rec->data + sizeof(u_int32_t));
+ if (txnid == TXN_INVALID)
+ ret = __db_dispatch(env, &env->recover_dtab,
+ rec, &rp->lsn, DB_TXN_APPLY, NULL);
+ break;
+ case DB___txn_regop:
+ /*
+ * If an application is doing app-specific recovery
+ * and acquires locks while applying a transaction,
+ * it can deadlock. Any other locks held by this
+ * thread should have been discarded in the
+ * __rep_process_txn error path, so if we simply
+ * retry, we should eventually succeed.
+ */
+ do {
+ ret = 0;
+ if (!F_ISSET(db_rep, DBREP_OPENFILES)) {
+ ret = __txn_openfiles(env, ip, NULL, 1);
+ F_SET(db_rep, DBREP_OPENFILES);
+ }
+ if (ret == 0)
+ ret = __rep_process_txn(env, rec);
+ } while (ret == DB_LOCK_DEADLOCK || ret == DB_LOCK_NOTGRANTED);
+
+ /* Now flush the log unless we're running TXN_NOSYNC. */
+ if (ret == 0 && !F_ISSET(env->dbenv, DB_ENV_TXN_NOSYNC))
+ ret = __log_flush(env, NULL);
+ if (ret != 0) {
+ __db_errx(env, DB_STR_A("3526",
+ "Error processing txn [%lu][%lu]", "%lu %lu"),
+ (u_long)rp->lsn.file, (u_long)rp->lsn.offset);
+ ret = __env_panic(env, ret);
+ }
+ *ret_lsnp = rp->lsn;
+ break;
+ case DB___txn_prepare:
+ ret = __log_flush(env, NULL);
+ /*
+ * Save the biggest prepared LSN we've seen.
+ */
+ rep->max_prep_lsn = rp->lsn;
+ VPRINT(env, (env, DB_VERB_REP_MSGS,
+ "process_rec: prepare at [%lu][%lu]",
+ (u_long)rep->max_prep_lsn.file,
+ (u_long)rep->max_prep_lsn.offset));
+ break;
+ case DB___txn_ckp:
+ /*
+ * We do not want to hold the REP->mtx_clientdb mutex while
+ * syncing the mpool, so if we get a checkpoint record we are
+ * supposed to process, add it to the __db.rep.db, do the
+ * memp_sync and then go back and process it later, when the
+ * sync has finished. If this record is already in the table,
+ * then some other thread will process it, so simply return
+		 * DB_REP_NOTPERM.
+ */
+ memset(&key_dbt, 0, sizeof(key_dbt));
+ key_dbt.data = rp;
+ key_dbt.size = sizeof(*rp);
+
+ /*
+ * We want to put this record into the tmp DB only if
+ * it doesn't exist, so use DB_NOOVERWRITE.
+ */
+ ret = __db_put(dbp, ip, NULL, &key_dbt, rec, DB_NOOVERWRITE);
+ if (ret == DB_KEYEXIST) {
+ if (ret_lsnp != NULL)
+ *ret_lsnp = rp->lsn;
+ ret = DB_REP_NOTPERM;
+ }
+ if (ret != 0)
+ break;
+
+ /*
+ * Now, do the checkpoint. Regardless of
+ * whether the checkpoint succeeds or not,
+ * we need to remove the record we just put
+ * in the temporary database. If the
+ * checkpoint failed, return an error. We
+ * will act like we never received the
+ * checkpoint.
+ */
+ if ((ret = __rep_do_ckp(env, rec, rp)) == 0)
+ ret = __log_rep_put(env, &rp->lsn, rec,
+ DB_LOG_CHKPNT);
+ if ((t_ret = __rep_remfirst(env, ip,
+ &control_dbt, &rec_dbt)) != 0 && ret == 0)
+ ret = t_ret;
+ /*
+ * If we're successful putting the log record in the
+ * log, flush it for a checkpoint.
+ */
+ if (ret == 0) {
+ *ret_lsnp = rp->lsn;
+ ret = __log_flush(env, NULL);
+ if (ret == 0 && lp->db_log_autoremove)
+ __log_autoremove(env);
+ }
+ break;
+ default:
+ break;
+ }
+
+out:
+ if (ret == 0 && F_ISSET(rp, REPCTL_PERM))
+ *ret_lsnp = rp->lsn;
+ if (IS_USING_LEASES(env) &&
+ F_ISSET(rp, REPCTL_LEASE))
+ *ret_tsp = msg_time;
+ /*
+ * Set ret_lsnp before flushing the log because if the
+ * flush fails, we've still written the record to the
+ * log and the LSN has been entered.
+ */
+ if (ret == 0 && F_ISSET(rp, REPCTL_FLUSH))
+ ret = __log_flush(env, NULL);
+ if (control_dbt.data != NULL)
+ __os_ufree(env, control_dbt.data);
+ if (rec_dbt.data != NULL)
+ __os_ufree(env, rec_dbt.data);
+
+ return (ret);
+}
+
+/*
+ * __rep_resend_req --
+ * We might have dropped a message, we need to resend our request.
+ * The request we send is dependent on what recovery state we're in.
+ * The caller holds no locks.
+ *
+ * PUBLIC: int __rep_resend_req __P((ENV *, int));
+ */
+int
+__rep_resend_req(env, rereq)
+ ENV *env;
+ int rereq;
+{
+ DB_LOG *dblp;
+ DB_LSN lsn, *lsnp;
+ DB_REP *db_rep;
+ LOG *lp;
+ REP *rep;
+ int master, ret;
+ repsync_t sync_state;
+ u_int32_t gapflags, msgtype, repflags, sendflags;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ ret = 0;
+ lsnp = NULL;
+ msgtype = REP_INVALID;
+ sendflags = 0;
+
+ repflags = rep->flags;
+ sync_state = rep->sync_state;
+ /*
+ * If we are delayed we do not rerequest anything.
+ */
+ if (FLD_ISSET(repflags, REP_F_DELAY))
+ return (ret);
+ gapflags = rereq ? REP_GAP_REREQUEST : 0;
+
+ if (sync_state == SYNC_VERIFY) {
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ lsn = lp->verify_lsn;
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ if (!IS_ZERO_LSN(lsn)) {
+ msgtype = REP_VERIFY_REQ;
+ lsnp = &lsn;
+ sendflags = DB_REP_REREQUEST;
+ }
+ } else if (sync_state == SYNC_UPDATE) {
+ /*
+ * UPDATE_REQ only goes to the master.
+ */
+ msgtype = REP_UPDATE_REQ;
+ } else if (sync_state == SYNC_PAGE) {
+ REP_SYSTEM_LOCK(env);
+ ret = __rep_pggap_req(env, rep, NULL, gapflags);
+ REP_SYSTEM_UNLOCK(env);
+ } else {
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ ret = __rep_loggap_req(env, rep, NULL, gapflags);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ }
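+	/*
+	 * Summary of the dispatch above: SYNC_VERIFY resends VERIFY_REQ,
+	 * SYNC_UPDATE resends UPDATE_REQ (to the master only), SYNC_PAGE
+	 * re-issues the page-gap request and anything else re-issues the
+	 * log-gap request.
+	 */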
+
+ if (msgtype != REP_INVALID) {
+ master = rep->master_id;
+ if (master == DB_EID_INVALID)
+ (void)__rep_send_message(env,
+ DB_EID_BROADCAST, REP_MASTER_REQ, NULL, NULL, 0, 0);
+ else
+ (void)__rep_send_message(env,
+ master, msgtype, lsnp, NULL, 0, sendflags);
+ }
+
+ return (ret);
+}
+
+/*
+ * __rep_check_doreq --
+ * PUBLIC: int __rep_check_doreq __P((ENV *, REP *));
+ *
+ * Check if we need to send another request. If so, compare with
+ * the request limits the user might have set. This assumes the
+ * caller holds the REP->mtx_clientdb mutex. Returns 1 if a request
+ * needs to be made, and 0 if it does not.
+ */
+int
+__rep_check_doreq(env, rep)
+ ENV *env;
+ REP *rep;
+{
+
+ DB_LOG *dblp;
+ LOG *lp;
+ db_timespec now;
+ int req;
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ __os_gettime(env, &now, 1);
+ timespecsub(&now, &lp->rcvd_ts);
+ req = timespeccmp(&now, &lp->wait_ts, >=);
+ if (req) {
+ /*
+ * Add wait_ts to itself to double it.
+ */
+ timespecadd(&lp->wait_ts, &lp->wait_ts);
+ if (timespeccmp(&lp->wait_ts, &rep->max_gap, >))
+ lp->wait_ts = rep->max_gap;
+ __os_gettime(env, &lp->rcvd_ts, 1);
+ }
+ return (req);
+}
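+
+/*
+ * Illustration of the backoff above (values hypothetical, not
+ * necessarily the defaults): starting from a configured request gap of
+ * 40ms with a maximum gap of 1280ms, successive triggered requests
+ * wait 40ms, 80ms, 160ms, ... capped at 1280ms, until a matching
+ * record arrives and __rep_apply resets wait_ts to rep->request_gap.
+ */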
+
+/*
+ * __rep_skip_msg -
+ *
+ * If we're in recovery we want to skip/ignore the message, but
+ * we also need to see if we need to re-request any retransmissions.
+ */
+static int
+__rep_skip_msg(env, rep, eid, rectype)
+ ENV *env;
+ REP *rep;
+ int eid;
+ u_int32_t rectype;
+{
+ int do_req, ret;
+
+ ret = 0;
+ /*
+ * If we have a request message from a client then immediately
+ * send a REP_REREQUEST back to that client since we're skipping it.
+ */
+ if (F_ISSET(rep, REP_F_CLIENT) && REP_MSG_REQ(rectype))
+ do_req = 1;
+ else {
+ /* Check for need to retransmit. */
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ do_req = __rep_check_doreq(env, rep);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ }
+ /*
+ * Don't respond to a MASTER_REQ with
+ * a MASTER_REQ or REREQUEST.
+ */
+ if (do_req && rectype != REP_MASTER_REQ) {
+ /*
+ * There are three cases:
+ * 1. If we don't know who the master is, then send MASTER_REQ.
+ * 2. If the message we're skipping came from the master,
+ * then we need to rerequest.
+		 * 3. If we are a client and the message came from another
+		 * client (i.e. client to client), send a rerequest back to
+		 * the sender so the sender can rerequest it elsewhere.
+ */
+ if (rep->master_id == DB_EID_INVALID) /* Case 1. */
+ (void)__rep_send_message(env,
+ DB_EID_BROADCAST, REP_MASTER_REQ, NULL, NULL, 0, 0);
+ else if (eid == rep->master_id) /* Case 2. */
+ ret = __rep_resend_req(env, 0);
+ else if (F_ISSET(rep, REP_F_CLIENT)) /* Case 3. */
+ (void)__rep_send_message(env,
+ eid, REP_REREQUEST, NULL, NULL, 0, 0);
+ }
+ return (ret);
+}
+
+/*
+ * __rep_check_missing --
+ * PUBLIC: int __rep_check_missing __P((ENV *, u_int32_t, DB_LSN *));
+ *
+ * Check for and request any missing client information.
+ */
+int
+__rep_check_missing(env, gen, master_perm_lsn)
+ ENV *env;
+ u_int32_t gen;
+ DB_LSN *master_perm_lsn;
+{
+ DB_LOG *dblp;
+ DB_LSN *end_lsn;
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ LOG *lp;
+ REGINFO *infop;
+ REP *rep;
+ __rep_fileinfo_args *curinfo;
+ int do_req, has_log_gap, has_page_gap, ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ infop = env->reginfo;
+ has_log_gap = has_page_gap = ret = 0;
+
+ ENV_ENTER(env, ip);
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ REP_SYSTEM_LOCK(env);
+ /*
+ * Check if we are okay to proceed with this operation. If not,
+ * do not rerequest anything.
+ */
+ if (!F_ISSET(rep, REP_F_CLIENT) || rep->master_id == DB_EID_INVALID ||
+ gen != rep->gen || FLD_ISSET(rep->lockout_flags, REP_LOCKOUT_MSG)) {
+ REP_SYSTEM_UNLOCK(env);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ /*
+ * If this client is out-of-date, ask the master to identify
+ * itself so that this client will synchronize with the
+ * master's later generation.
+ */
+ if (gen > rep->gen && __rep_check_doreq(env, rep))
+ (void)__rep_send_message(env,
+ DB_EID_BROADCAST, REP_MASTER_REQ,
+ NULL, NULL, 0, 0);
+ goto out;
+ }
+
+ /*
+	 * Prevent message lockout by counting ourselves here.
+ * Setting rep->msg_th will prevent a major system
+ * change, such as a role change or running recovery, from
+ * occurring before sending out any rerequests.
+ */
+ rep->msg_th++;
+ REP_SYSTEM_UNLOCK(env);
+
+ /* Check that it is time to request missing information. */
+ if ((do_req = __rep_check_doreq(env, rep))) {
+ /* Check for interior or tail page gap. */
+ REP_SYSTEM_LOCK(env);
+ if (rep->sync_state == SYNC_PAGE &&
+ rep->curinfo_off != INVALID_ROFF) {
+ GET_CURINFO(rep, infop, curinfo);
+ has_page_gap =
+ rep->waiting_pg != PGNO_INVALID ||
+ rep->ready_pg <= curinfo->max_pgno;
+ }
+ REP_SYSTEM_UNLOCK(env);
+ }
+ /* Check for interior or tail log gap. */
+ if (do_req && !has_page_gap) {
+ lp = dblp->reginfo.primary;
+ /*
+ * The LOG_COMPARE test is <= because ready_lsn is
+ * the next LSN we are expecting but we do not have
+ * it yet. If the needed LSN is at this LSN, it
+ * means we are missing the last record we need.
+ */
+ if (rep->sync_state == SYNC_LOG)
+ end_lsn = &rep->last_lsn;
+ else
+ end_lsn = master_perm_lsn;
+ has_log_gap = !IS_ZERO_LSN(lp->waiting_lsn) ||
+ LOG_COMPARE(&lp->ready_lsn, end_lsn) <= 0;
+ }
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ /*
+ * If it is time to send a request, only do so if we
+ * have a log gap or a page gap, or we need to resend an
+ * UPDATE_REQ or VERIFY_REQ, or we are in SYNC_LOG to keep
+ * requesting to the current known end of the log.
+ */
+ do_req = do_req && (has_log_gap || has_page_gap ||
+ rep->sync_state == SYNC_LOG ||
+ rep->sync_state == SYNC_UPDATE ||
+ rep->sync_state == SYNC_VERIFY);
+ /*
+	 * This determines the request type from the current
+	 * replication state and resends the request. The request
+	 * may have the DB_REP_ANYWHERE flag set if appropriate.
+ */
+ if (do_req)
+ ret = __rep_resend_req(env, 0);
+
+ REP_SYSTEM_LOCK(env);
+ rep->msg_th--;
+ REP_SYSTEM_UNLOCK(env);
+
+out: ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+static int
+__rep_fire_newmaster(env, gen, master)
+ ENV *env;
+ u_int32_t gen;
+ int master;
+{
+ DB_REP *db_rep;
+ REP *rep;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ REP_EVENT_LOCK(env);
+ /*
+ * The firing of this event should be idempotent with respect to a
+ * particular generation number.
+ */
+ if (rep->newmaster_event_gen < gen) {
+ __rep_fire_event(env, DB_EVENT_REP_NEWMASTER, &master);
+ rep->newmaster_event_gen = gen;
+ }
+ REP_EVENT_UNLOCK(env);
+ return (0);
+}
+
+static int
+__rep_fire_startupdone(env, gen, master)
+ ENV *env;
+ u_int32_t gen;
+ int master;
+{
+ DB_REP *db_rep;
+ REP *rep;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ REP_EVENT_LOCK(env);
+ /*
+ * Usually NEWMASTER will already have been fired. But if not, fire
+ * it here now, to ensure the application receives events in the
+ * expected order.
+ */
+ if (rep->newmaster_event_gen < gen) {
+ __rep_fire_event(env, DB_EVENT_REP_NEWMASTER, &master);
+ rep->newmaster_event_gen = gen;
+ }
+
+ /*
+ * Caller already ensures that it only tries to fire STARTUPDONE once
+ * per generation. If we did not want to rely on that, we could add a
+ * simple boolean flag (to the set of data protected by the mtx_event).
+ * The precise meaning of that flag would be "STARTUPDONE has been fired
+ * for the generation value stored in `newmaster_event_gen'". Then the
+ * more accurate test here would be simply to check that flag, and fire
+ * the event (and set the flag) if it were not already set.
+ */
+ if (rep->newmaster_event_gen == gen)
+ __rep_fire_event(env, DB_EVENT_REP_STARTUPDONE, NULL);
+ REP_EVENT_UNLOCK(env);
+ return (0);
+}
diff --git a/src/rep/rep_region.c b/src/rep/rep_region.c
new file mode 100644
index 00000000..f1d69dff
--- /dev/null
+++ b/src/rep/rep_region.c
@@ -0,0 +1,610 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+
+static int __rep_egen_init __P((ENV *, REP *));
+static int __rep_gen_init __P((ENV *, REP *));
+
+/*
+ * __rep_open --
+ * Initialize the shared memory state for the replication system.
+ *
+ * PUBLIC: int __rep_open __P((ENV *));
+ */
+int
+__rep_open(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REGENV *renv;
+ REGINFO *infop;
+ REP *rep;
+ int i, ret;
+ char *p;
+ char fname[sizeof(REP_DIAGNAME) + 3];
+
+ db_rep = env->rep_handle;
+ infop = env->reginfo;
+ renv = infop->primary;
+ ret = 0;
+ DB_ASSERT(env, DBREP_DIAG_FILES < 100);
+
+ if (renv->rep_off == INVALID_ROFF) {
+ /* Must create the region. */
+ if ((ret = __env_alloc(infop, sizeof(REP), &rep)) != 0)
+ return (ret);
+ memset(rep, 0, sizeof(*rep));
+
+ /*
+ * We have the region; fill in the values. Some values may
+ * have been configured before we open the region, and those
+ * are taken from the DB_REP structure.
+ */
+ if ((ret = __mutex_alloc(
+ env, MTX_REP_REGION, 0, &rep->mtx_region)) != 0)
+ return (ret);
+ /*
+ * Because we have no way to prevent deadlocks and cannot log
+ * changes made to it, we single-thread access to the client
+ * bookkeeping database. This is suboptimal, but it only gets
+ * accessed when messages arrive out-of-order, so it should
+ * stay small and not be used in a high-performance app.
+ */
+ if ((ret = __mutex_alloc(
+ env, MTX_REP_DATABASE, 0, &rep->mtx_clientdb)) != 0)
+ return (ret);
+
+ if ((ret = __mutex_alloc(
+ env, MTX_REP_CHKPT, 0, &rep->mtx_ckp)) != 0)
+ return (ret);
+
+ if ((ret = __mutex_alloc(
+ env, MTX_REP_DIAG, 0, &rep->mtx_diag)) != 0)
+ return (ret);
+
+ if ((ret = __mutex_alloc(
+ env, MTX_REP_EVENT, 0, &rep->mtx_event)) != 0)
+ return (ret);
+
+ if ((ret = __mutex_alloc(
+ env, MTX_REP_START, 0, &rep->mtx_repstart)) != 0)
+ return (ret);
+
+ rep->diag_off = 0;
+ rep->diag_index = 0;
+ rep->newmaster_event_gen = 0;
+ rep->notified_egen = 0;
+ rep->curinfo_off = INVALID_ROFF;
+ rep->lease_off = INVALID_ROFF;
+ rep->originfo_off = INVALID_ROFF;
+ rep->tally_off = INVALID_ROFF;
+ rep->v2tally_off = INVALID_ROFF;
+ rep->eid = db_rep->eid;
+ rep->master_id = DB_EID_INVALID;
+ rep->version = DB_REPVERSION;
+
+ SH_TAILQ_INIT(&rep->waiters);
+ SH_TAILQ_INIT(&rep->free_waiters);
+
+ rep->config = db_rep->config;
+ /*
+		 * The in-memory replication-files setting must be made before
+		 * we open the env, so we know here whether it is in memory.
+		 * In-memory replication cannot use the diag files, so turn
+		 * off the DB_VERB_REP_SYSTEM verbose setting.
+ */
+ if (FLD_ISSET(rep->config, REP_C_INMEM))
+ FLD_CLR(env->dbenv->verbose, DB_VERB_REP_SYSTEM);
+
+ if ((ret = __rep_gen_init(env, rep)) != 0)
+ return (ret);
+ if ((ret = __rep_egen_init(env, rep)) != 0)
+ return (ret);
+ rep->gbytes = db_rep->gbytes;
+ rep->bytes = db_rep->bytes;
+ rep->request_gap = db_rep->request_gap;
+ rep->max_gap = db_rep->max_gap;
+ rep->config_nsites = db_rep->config_nsites;
+ rep->elect_timeout = db_rep->elect_timeout;
+ rep->full_elect_timeout = db_rep->full_elect_timeout;
+ rep->lease_timeout = db_rep->lease_timeout;
+ rep->clock_skew = db_rep->clock_skew;
+ rep->clock_base = db_rep->clock_base;
+ timespecclear(&rep->lease_duration);
+ timespecclear(&rep->grant_expire);
+ rep->chkpt_delay = db_rep->chkpt_delay;
+ rep->priority = db_rep->my_priority;
+
+ if ((ret = __rep_lockout_archive(env, rep)) != 0)
+ return (ret);
+
+ /* Copy application type flags if set before env open. */
+ if (F_ISSET(db_rep, DBREP_APP_REPMGR))
+ F_SET(rep, REP_F_APP_REPMGR);
+ if (F_ISSET(db_rep, DBREP_APP_BASEAPI))
+ F_SET(rep, REP_F_APP_BASEAPI);
+
+ /* Initialize encapsulating region. */
+ renv->rep_off = R_OFFSET(infop, rep);
+ (void)time(&renv->rep_timestamp);
+ renv->op_timestamp = 0;
+ F_CLR(renv, DB_REGENV_REPLOCKED);
+
+#ifdef HAVE_REPLICATION_THREADS
+ if ((ret = __repmgr_open(env, rep)) != 0)
+ return (ret);
+#endif
+ } else {
+ rep = R_ADDR(infop, renv->rep_off);
+ /*
+ * Prevent an application type mismatch between a process
+ * and the environment it is trying to join.
+ */
+ if ((F_ISSET(db_rep, DBREP_APP_REPMGR) &&
+ F_ISSET(rep, REP_F_APP_BASEAPI)) ||
+ (F_ISSET(db_rep, DBREP_APP_BASEAPI) &&
+ F_ISSET(rep, REP_F_APP_REPMGR))) {
+ __db_errx(env, DB_STR("3535",
+ "Application type mismatch for a replication "
+ "process joining the environment"));
+ return (EINVAL);
+ }
+#ifdef HAVE_REPLICATION_THREADS
+ if ((ret = __repmgr_join(env, rep)) != 0)
+ return (ret);
+#endif
+ }
+
+ db_rep->region = rep;
+ /*
+	 * Open the diagnostic message files for this env handle.  We do
+	 * this whether or not we created the environment.
+ */
+ if (FLD_ISSET(rep->config, REP_C_INMEM))
+ goto out;
+ for (i = 0; i < DBREP_DIAG_FILES; i++) {
+ db_rep->diagfile[i] = NULL;
+ (void)snprintf(fname, sizeof(fname), REP_DIAGNAME, i);
+ if ((ret = __db_appname(env, DB_APP_NONE, fname,
+ NULL, &p)) != 0)
+ goto err;
+ ret = __os_open(env, p, 0, DB_OSO_CREATE, DB_MODE_600,
+ &db_rep->diagfile[i]);
+ __os_free(env, p);
+ if (ret != 0)
+ goto err;
+ }
+
+out:
+ return (0);
+
+err:
+ (void)__rep_close_diagfiles(env);
+ return (ret);
+}
+
+/*
+ * __rep_close_diagfiles --
+ * Close any diag message files that are open.
+ *
+ * PUBLIC: int __rep_close_diagfiles __P((ENV *));
+ */
+int
+__rep_close_diagfiles(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ int i, ret, t_ret;
+
+ db_rep = env->rep_handle;
+ ret = t_ret = 0;
+
+ for (i = 0; i < DBREP_DIAG_FILES; i++) {
+ if (db_rep->diagfile[i] != NULL &&
+ (t_ret = __os_closehandle(env, db_rep->diagfile[i])) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ db_rep->diagfile[i] = NULL;
+ }
+ return (ret);
+}
+
+/*
+ * __rep_env_refresh --
+ * Replication-specific refresh of the ENV structure.
+ *
+ * PUBLIC: int __rep_env_refresh __P((ENV *));
+ */
+int
+__rep_env_refresh(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REGENV *renv;
+ REGINFO *infop;
+ REP *rep;
+ struct __rep_waiter *waiter;
+ int ret, t_ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ infop = env->reginfo;
+ renv = infop->primary;
+ ret = 0;
+
+ /*
+	 * If we are the last reference closing the env, clear our knowledge
+	 * of belonging to a group and of there being a valid handle on
+	 * which rep_start has already been called.
+ */
+ if (renv->refcnt == 1) {
+ F_CLR(rep, REP_F_GROUP_ESTD);
+ F_CLR(rep, REP_F_START_CALLED);
+ }
+
+#ifdef HAVE_REPLICATION_THREADS
+ ret = __repmgr_env_refresh(env);
+#endif
+
+ /*
+	 * If a private region, return the memory to the heap.  This is not
+	 * needed for filesystem-backed or system shared memory regions;
+	 * that memory isn't owned by any particular process.
+ */
+ if (F_ISSET(env, ENV_PRIVATE)) {
+ if (rep != NULL) {
+ if ((t_ret = __mutex_free(env,
+ &rep->mtx_region)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __mutex_free(env,
+ &rep->mtx_clientdb)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __mutex_free(env,
+ &rep->mtx_ckp)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __mutex_free(env,
+ &rep->mtx_diag)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __mutex_free(env,
+ &rep->mtx_event)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __mutex_free(env,
+ &rep->mtx_repstart)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Discard commit queue elements. */
+ DB_ASSERT(env, SH_TAILQ_EMPTY(&rep->waiters));
+ while ((waiter = SH_TAILQ_FIRST(&rep->free_waiters,
+ __rep_waiter)) != NULL) {
+ SH_TAILQ_REMOVE(&rep->free_waiters,
+ waiter, links, __rep_waiter);
+ __env_alloc_free(env->reginfo, waiter);
+ }
+
+ if (rep->curinfo_off != INVALID_ROFF)
+ __env_alloc_free(infop,
+ R_ADDR(infop, rep->curinfo_off));
+ if (rep->lease_off != INVALID_ROFF)
+ __env_alloc_free(infop,
+ R_ADDR(infop, rep->lease_off));
+ if (rep->originfo_off != INVALID_ROFF)
+ __env_alloc_free(infop,
+ R_ADDR(infop, rep->originfo_off));
+ if (rep->tally_off != INVALID_ROFF)
+ __env_alloc_free(infop,
+ R_ADDR(infop, rep->tally_off));
+ if (rep->v2tally_off != INVALID_ROFF)
+ __env_alloc_free(infop,
+ R_ADDR(infop, rep->v2tally_off));
+ }
+
+ if (renv->rep_off != INVALID_ROFF)
+ __env_alloc_free(infop, R_ADDR(infop, renv->rep_off));
+ }
+ if ((t_ret = __rep_close_diagfiles(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ env->rep_handle->region = NULL;
+ return (ret);
+}
+
+/*
+ * __rep_env_close --
+ * Shut down all of replication.
+ *
+ * PUBLIC: int __rep_env_close __P((ENV *));
+ */
+int
+__rep_env_close(env)
+ ENV *env;
+{
+ int ret, t_ret;
+
+ ret = __rep_preclose(env);
+ if ((t_ret = __rep_closefiles(env)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __rep_preclose --
+ * If we are a client, shut down our client database and send
+ * any outstanding bulk buffers.
+ *
+ * PUBLIC: int __rep_preclose __P((ENV *));
+ */
+int
+__rep_preclose(env)
+ ENV *env;
+{
+ DB_LOG *dblp;
+ DB_REP *db_rep;
+ LOG *lp;
+ DB *dbp;
+ REP_BULK bulk;
+ int ret, t_ret;
+
+ ret = 0;
+
+ db_rep = env->rep_handle;
+ dblp = env->lg_handle;
+
+ /*
+ * If we have a rep region, we can preclose. Otherwise, return.
+ * If we're on an error path from env open, we may not have
+ * a region, even though we have a handle.
+ */
+ if (db_rep == NULL || db_rep->region == NULL)
+ return (ret);
+
+ if ((dbp = db_rep->lsn_db) != NULL) {
+ ret = __db_close(dbp, NULL, DB_NOSYNC);
+ db_rep->lsn_db = NULL;
+ }
+
+ MUTEX_LOCK(env, db_rep->region->mtx_clientdb);
+ if (db_rep->rep_db != NULL) {
+ if ((t_ret = __db_close(db_rep->rep_db,
+ NULL, DB_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+ db_rep->rep_db = NULL;
+ }
+ /*
+ * We could be called early in an env_open error path, so
+ * only do this if we have a log region set up.
+ */
+ if (dblp == NULL)
+ goto out;
+ lp = dblp->reginfo.primary;
+ /*
+	 * If there is anything in the bulk buffer, send it if we are
+	 * able to.
+ */
+ if (lp->bulk_off != 0 && db_rep->send != NULL) {
+ memset(&bulk, 0, sizeof(bulk));
+ bulk.addr = R_ADDR(&dblp->reginfo, lp->bulk_buf);
+ bulk.offp = &lp->bulk_off;
+ bulk.len = lp->bulk_len;
+ bulk.type = REP_BULK_LOG;
+ bulk.eid = DB_EID_BROADCAST;
+ bulk.flagsp = &lp->bulk_flags;
+ /*
+ * Ignore send errors here. This can be called on the
+ * env->close path - make a best attempt to send.
+ */
+ (void)__rep_send_bulk(env, &bulk, 0);
+ }
+out: MUTEX_UNLOCK(env, db_rep->region->mtx_clientdb);
+ return (ret);
+}
+
+/*
+ * __rep_closefiles --
+ * If we were a client and are now a master, close all databases
+ * we've opened while applying messages as a client. This can
+ *	be called from __env_close, so we need to check whether the
+ *	env, handles and regions are set up.
+ *
+ * PUBLIC: int __rep_closefiles __P((ENV *));
+ */
+int
+__rep_closefiles(env)
+ ENV *env;
+{
+ DB_LOG *dblp;
+ DB_REP *db_rep;
+ int ret;
+
+ ret = 0;
+
+ db_rep = env->rep_handle;
+ dblp = env->lg_handle;
+
+ if (db_rep == NULL || db_rep->region == NULL)
+ return (ret);
+ if (dblp == NULL)
+ return (ret);
+ if ((ret = __dbreg_close_files(env, 0)) == 0)
+ F_CLR(db_rep, DBREP_OPENFILES);
+
+ return (ret);
+}
+
+/*
+ * __rep_egen_init --
+ *	Initialize the value of egen in the region.  Called only from
+ *	__rep_open, which is guaranteed to be single-threaded
+ * as we create the rep region. We set the rep->egen field which
+ * is normally protected by db_rep->region->mutex.
+ */
+static int
+__rep_egen_init(env, rep)
+ ENV *env;
+ REP *rep;
+{
+ DB_FH *fhp;
+ int ret;
+ size_t cnt;
+ char *p;
+
+ if ((ret = __db_appname(env,
+ DB_APP_META, REP_EGENNAME, NULL, &p)) != 0)
+ return (ret);
+ /*
+	 * If the file doesn't exist, create it now and initialize egen
+	 * to one greater than the current generation.
+ */
+ if (__os_exists(env, p, NULL) != 0) {
+ rep->egen = rep->gen + 1;
+ if ((ret = __rep_write_egen(env, rep, rep->egen)) != 0)
+ goto err;
+ } else {
+ /*
+ * File exists, open it and read in our egen.
+ */
+ if ((ret = __os_open(env, p, 0,
+ DB_OSO_RDONLY, DB_MODE_600, &fhp)) != 0)
+ goto err;
+ if ((ret = __os_read(env, fhp, &rep->egen, sizeof(u_int32_t),
+ &cnt)) != 0 || cnt != sizeof(u_int32_t))
+ goto err1;
+ RPRINT(env, (env, DB_VERB_REP_MISC, "Read in egen %lu",
+ (u_long)rep->egen));
+err1: (void)__os_closehandle(env, fhp);
+ }
+err: __os_free(env, p);
+ return (ret);
+}
+
+/*
+ * __rep_write_egen --
+ * Write out the egen into the env file.
+ *
+ * PUBLIC: int __rep_write_egen __P((ENV *, REP *, u_int32_t));
+ *
+ * Caller relies on us not dropping the REP_SYSTEM_LOCK.
+ */
+int
+__rep_write_egen(env, rep, egen)
+ ENV *env;
+ REP *rep;
+ u_int32_t egen;
+{
+ DB_FH *fhp;
+ int ret;
+ size_t cnt;
+ char *p;
+
+ /*
+ * If running in-memory replication, return without any file
+ * operations.
+ */
+ if (FLD_ISSET(rep->config, REP_C_INMEM)) {
+ return (0);
+ }
+
+ if ((ret = __db_appname(env,
+ DB_APP_META, REP_EGENNAME, NULL, &p)) != 0)
+ return (ret);
+ if ((ret = __os_open(
+ env, p, 0, DB_OSO_CREATE | DB_OSO_TRUNC, DB_MODE_600, &fhp)) == 0) {
+ if ((ret = __os_write(env, fhp, &egen, sizeof(u_int32_t),
+ &cnt)) != 0 || ((ret = __os_fsync(env, fhp)) != 0))
+ __db_err(env, ret, "%s", p);
+ (void)__os_closehandle(env, fhp);
+ }
+ __os_free(env, p);
+ return (ret);
+}
+
+/*
+ * __rep_gen_init --
+ *	Initialize the value of gen in the region.  Called only from
+ *	__rep_open, which is guaranteed to be single-threaded
+ * as we create the rep region. We set the rep->gen field which
+ * is normally protected by db_rep->region->mutex.
+ */
+static int
+__rep_gen_init(env, rep)
+ ENV *env;
+ REP *rep;
+{
+ DB_FH *fhp;
+ int ret;
+ size_t cnt;
+ char *p;
+
+ if ((ret = __db_appname(env,
+ DB_APP_META, REP_GENNAME, NULL, &p)) != 0)
+ return (ret);
+
+ if (__os_exists(env, p, NULL) != 0) {
+ /*
+ * File doesn't exist, create it now and initialize with 0.
+ */
+ SET_GEN(0);
+ if ((ret = __rep_write_gen(env, rep, rep->gen)) != 0)
+ goto err;
+ } else {
+ /*
+ * File exists, open it and read in our gen.
+ */
+ if ((ret = __os_open(env, p, 0,
+ DB_OSO_RDONLY, DB_MODE_600, &fhp)) != 0)
+ goto err;
+ if ((ret = __os_read(env, fhp, &rep->gen, sizeof(u_int32_t),
+		    &cnt)) != 0 || cnt != sizeof(u_int32_t))
+ goto err1;
+ RPRINT(env, (env, DB_VERB_REP_MISC, "Read in gen %lu",
+ (u_long)rep->gen));
+err1: (void)__os_closehandle(env, fhp);
+ }
+err: __os_free(env, p);
+ return (ret);
+}
+
+/*
+ * __rep_write_gen --
+ * Write out the gen into the env file.
+ *
+ * PUBLIC: int __rep_write_gen __P((ENV *, REP *, u_int32_t));
+ */
+int
+__rep_write_gen(env, rep, gen)
+ ENV *env;
+ REP *rep;
+ u_int32_t gen;
+{
+ DB_FH *fhp;
+ int ret;
+ size_t cnt;
+ char *p;
+
+ /*
+ * If running in-memory replication, return without any file
+ * operations.
+ */
+ if (FLD_ISSET(rep->config, REP_C_INMEM)) {
+ return (0);
+ }
+
+ if ((ret = __db_appname(env,
+ DB_APP_META, REP_GENNAME, NULL, &p)) != 0)
+ return (ret);
+ if ((ret = __os_open(
+ env, p, 0, DB_OSO_CREATE | DB_OSO_TRUNC, DB_MODE_600, &fhp)) == 0) {
+ if ((ret = __os_write(env, fhp, &gen, sizeof(u_int32_t),
+ &cnt)) != 0 || ((ret = __os_fsync(env, fhp)) != 0))
+ __db_err(env, ret, "%s", p);
+ (void)__os_closehandle(env, fhp);
+ }
+ __os_free(env, p);
+ return (ret);
+}
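+
+/*
+ * A note on the on-disk format (a sketch, not part of the original
+ * code's contract): as the read/write pairs above show, the gen and
+ * egen files each hold one raw, native-endian u_int32_t, rewritten
+ * with DB_OSO_TRUNC and fsync'd on every update.  Assuming REP_GENNAME
+ * names a file such as "__db.rep.gen" in the environment's metadata
+ * directory, a minimal out-of-band reader would be:
+ *
+ *	unsigned int v;
+ *	FILE *fp;
+ *
+ *	if ((fp = fopen("__db.rep.gen", "rb")) != NULL) {
+ *		if (fread(&v, sizeof(v), 1, fp) == 1)
+ *			printf("gen %u\n", v);
+ *		(void)fclose(fp);
+ *	}
+ */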
diff --git a/src/rep/rep_stat.c b/src/rep/rep_stat.c
new file mode 100644
index 00000000..addfee25
--- /dev/null
+++ b/src/rep/rep_stat.c
@@ -0,0 +1,692 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+
+#ifdef HAVE_STATISTICS
+static int __rep_print_all __P((ENV *, u_int32_t));
+static int __rep_print_stats __P((ENV *, u_int32_t));
+static int __rep_stat __P((ENV *, DB_REP_STAT **, u_int32_t));
+static int __rep_stat_summary_print __P((ENV *));
+static const char *__rep_syncstate_to_string __P((repsync_t));
+
+/*
+ * Print the individual statistic for items that appear both in the full and
+ * the summary replication statistics output.
+ */
+#define PRINT_LOGQUEUED(sp) do { \
+ __db_dl(env, "Number of log records currently queued", \
+ (u_long)(sp)->st_log_queued); \
+} while (0)
+
+#define PRINT_MAXPERMLSN(sp) do { \
+ __db_msg(env, "%lu/%lu\t%s", \
+ (u_long)(sp)->st_max_perm_lsn.file, \
+ (u_long)(sp)->st_max_perm_lsn.offset, \
+ (sp)->st_max_perm_lsn.file == 0 ? \
+ "No maximum permanent LSN" : \
+ "Maximum permanent LSN"); \
+} while (0)
+
+#define PRINT_MSGSRECOVER(sp) do { \
+ __db_dl(env, "Number of messages ignored due to pending recovery", \
+ (u_long)(sp)->st_msgs_recover); \
+} while (0)
+
+#define PRINT_MSGSSENDFAILURES(sp) do { \
+ __db_dl(env, "Number of failed message sends", \
+ (u_long)(sp)->st_msgs_send_failures); \
+} while (0)
+
+#define PRINT_STARTUPCOMPLETE(sp) do { \
+ if ((sp)->st_startup_complete == 0) \
+ __db_msg(env, "Startup incomplete"); \
+ else \
+ __db_msg(env, "Startup complete"); \
+} while (0)
+
+#define PRINT_STATUS(sp, is_client) do { \
+ is_client = 0; \
+ switch ((sp)->st_status) { \
+ case DB_REP_MASTER: \
+ __db_msg(env, \
+ "Environment configured as a replication master"); \
+ break; \
+ case DB_REP_CLIENT: \
+ __db_msg(env, \
+ "Environment configured as a replication client"); \
+ is_client = 1; \
+ break; \
+ default: \
+ __db_msg(env, \
+ "Environment not configured for replication"); \
+ break; \
+ } \
+} while (0)
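+
+/*
+ * These shared print macros are wrapped in do { ... } while (0) so that
+ * each one expands to a single statement and composes safely with
+ * unbraced if/else, as in the summary report below:
+ *
+ *	if (is_client)
+ *		PRINT_STARTUPCOMPLETE(sp);
+ */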
+
+/*
+ * __rep_stat_pp --
+ * ENV->rep_stat pre/post processing.
+ *
+ * PUBLIC: int __rep_stat_pp __P((DB_ENV *, DB_REP_STAT **, u_int32_t));
+ */
+int
+__rep_stat_pp(dbenv, statp, flags)
+ DB_ENV *dbenv;
+ DB_REP_STAT **statp;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG_XX(
+ env, rep_handle, "DB_ENV->rep_stat", DB_INIT_REP);
+
+ if ((ret = __db_fchk(env,
+ "DB_ENV->rep_stat", flags, DB_STAT_CLEAR)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+ ret = __rep_stat(env, statp, flags);
+ ENV_LEAVE(env, ip);
+
+ return (ret);
+}
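+
+/*
+ * A usage sketch for the method backed by this function (assumptions:
+ * a DB_ENV opened with DB_INIT_REP, and the usual Berkeley DB
+ * convention that statistics memory allocated on behalf of the
+ * application is released with free(3)):
+ *
+ *	DB_REP_STAT *sp;
+ *	int ret;
+ *
+ *	if ((ret = dbenv->rep_stat(dbenv, &sp, DB_STAT_CLEAR)) == 0) {
+ *		printf("gen %lu egen %lu\n",
+ *		    (u_long)sp->st_gen, (u_long)sp->st_egen);
+ *		free(sp);
+ *	}
+ */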
+
+/*
+ * __rep_stat --
+ * ENV->rep_stat.
+ */
+static int
+__rep_stat(env, statp, flags)
+ ENV *env;
+ DB_REP_STAT **statp;
+ u_int32_t flags;
+{
+ DB_LOG *dblp;
+ DB_REP *db_rep;
+ DB_REP_STAT *stats;
+ LOG *lp;
+ REP *rep;
+ u_int32_t startupdone;
+ uintmax_t queued;
+ int dolock, ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ *statp = NULL;
+
+ /* Allocate a stat struct to return to the user. */
+ if ((ret = __os_umalloc(env, sizeof(DB_REP_STAT), &stats)) != 0)
+ return (ret);
+
+ /*
+ * Read without holding the lock. If we are in client recovery, we
+ * copy just the stats struct so we won't block. We only copy out
+ * those stats that don't require acquiring any mutex.
+ */
+ dolock = IS_REP_RECOVERING(rep) ? 0 : 1;
+ memcpy(stats, &rep->stat, sizeof(*stats));
+
+ /* Copy out election stats. */
+ if (FLD_ISSET(rep->elect_flags, REP_E_PHASE1))
+ stats->st_election_status = 1;
+ else if (FLD_ISSET(rep->elect_flags, REP_E_PHASE2))
+ stats->st_election_status = 2;
+
+ stats->st_election_nsites = rep->sites;
+ stats->st_election_cur_winner = rep->winner;
+ stats->st_election_priority = rep->w_priority;
+ stats->st_election_gen = rep->w_gen;
+ stats->st_election_datagen = rep->w_datagen;
+ stats->st_election_lsn = rep->w_lsn;
+ stats->st_election_votes = rep->votes;
+ stats->st_election_nvotes = rep->nvotes;
+ stats->st_election_tiebreaker = rep->w_tiebreaker;
+
+ /* Copy out other info that's protected by the rep mutex. */
+ stats->st_env_id = rep->eid;
+ stats->st_env_priority = rep->priority;
+ stats->st_nsites = rep->nsites;
+ stats->st_master = rep->master_id;
+ stats->st_gen = rep->gen;
+ stats->st_egen = rep->egen;
+
+ if (F_ISSET(rep, REP_F_MASTER))
+ stats->st_status = DB_REP_MASTER;
+ else if (F_ISSET(rep, REP_F_CLIENT))
+ stats->st_status = DB_REP_CLIENT;
+ else
+ stats->st_status = 0;
+
+ if (LF_ISSET(DB_STAT_CLEAR)) {
+ queued = rep->stat.st_log_queued;
+ startupdone = rep->stat.st_startup_complete;
+ memset(&rep->stat, 0, sizeof(rep->stat));
+ rep->stat.st_log_queued = rep->stat.st_log_queued_total =
+ rep->stat.st_log_queued_max = queued;
+ rep->stat.st_startup_complete = startupdone;
+ }
+
+ /*
+ * Log-related replication info is stored in the log system and
+ * protected by the log region lock.
+ */
+ if (dolock)
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ if (F_ISSET(rep, REP_F_CLIENT)) {
+ stats->st_next_lsn = lp->ready_lsn;
+ stats->st_waiting_lsn = lp->waiting_lsn;
+ stats->st_next_pg = rep->ready_pg;
+ stats->st_waiting_pg = rep->waiting_pg;
+ stats->st_max_lease_sec = (u_int32_t)lp->max_lease_ts.tv_sec;
+ stats->st_max_lease_usec = (u_int32_t)
+ (lp->max_lease_ts.tv_nsec / NS_PER_US);
+ } else {
+ if (F_ISSET(rep, REP_F_MASTER)) {
+ LOG_SYSTEM_LOCK(env);
+ stats->st_next_lsn = lp->lsn;
+ LOG_SYSTEM_UNLOCK(env);
+ } else
+ ZERO_LSN(stats->st_next_lsn);
+ ZERO_LSN(stats->st_waiting_lsn);
+ stats->st_max_lease_sec = 0;
+ stats->st_max_lease_usec = 0;
+ }
+ stats->st_max_perm_lsn = lp->max_perm_lsn;
+ if (dolock)
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+
+ *statp = stats;
+ return (0);
+}
+
+/*
+ * __rep_stat_print_pp --
+ * ENV->rep_stat_print pre/post processing.
+ *
+ * PUBLIC: int __rep_stat_print_pp __P((DB_ENV *, u_int32_t));
+ */
+int
+__rep_stat_print_pp(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG_XX(
+ env, rep_handle, "DB_ENV->rep_stat_print", DB_INIT_REP);
+
+ if ((ret = __db_fchk(env, "DB_ENV->rep_stat_print",
+ flags, DB_STAT_ALL | DB_STAT_CLEAR | DB_STAT_SUMMARY)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+ ret = __rep_stat_print(env, flags);
+ ENV_LEAVE(env, ip);
+
+ return (ret);
+}
+
+/*
+ * __rep_stat_print --
+ * ENV->rep_stat_print method.
+ *
+ * PUBLIC: int __rep_stat_print __P((ENV *, u_int32_t));
+ */
+int
+__rep_stat_print(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ u_int32_t orig_flags;
+ int ret;
+
+ orig_flags = flags;
+ LF_CLR(DB_STAT_CLEAR | DB_STAT_SUBSYSTEM);
+ if (LF_ISSET(DB_STAT_SUMMARY))
+ return (__rep_stat_summary_print(env));
+
+ if (flags == 0 || LF_ISSET(DB_STAT_ALL)) {
+ ret = __rep_print_stats(env, orig_flags);
+ if (flags == 0 || ret != 0)
+ return (ret);
+ }
+
+ if (LF_ISSET(DB_STAT_ALL) &&
+ (ret = __rep_print_all(env, orig_flags)) != 0)
+ return (ret);
+
+ return (0);
+}
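+
+/*
+ * A usage sketch of the flag dispatch above: DB_STAT_SUMMARY
+ * short-circuits to the brief report, while DB_STAT_ALL adds the
+ * debugging region dump to the default statistics:
+ *
+ *	(void)dbenv->rep_stat_print(dbenv, DB_STAT_SUMMARY);
+ *	(void)dbenv->rep_stat_print(dbenv, DB_STAT_ALL | DB_STAT_CLEAR);
+ */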
+
+/*
+ * __rep_print_stats --
+ * Print out default statistics.
+ */
+static int
+__rep_print_stats(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ DB_REP_STAT *sp;
+ int is_client, ret;
+ char *p;
+
+ if ((ret = __rep_stat(env, &sp, flags)) != 0)
+ return (ret);
+
+ if (LF_ISSET(DB_STAT_ALL))
+ __db_msg(env, "Default replication region information:");
+ PRINT_STATUS(sp, is_client);
+
+ __db_msg(env, "%lu/%lu\t%s",
+ (u_long)sp->st_next_lsn.file, (u_long)sp->st_next_lsn.offset,
+ is_client ? "Next LSN expected" : "Next LSN to be used");
+ __db_msg(env, "%lu/%lu\t%s",
+ (u_long)sp->st_waiting_lsn.file, (u_long)sp->st_waiting_lsn.offset,
+ sp->st_waiting_lsn.file == 0 ?
+ "Not waiting for any missed log records" :
+ "LSN of first log record we have after missed log records");
+ PRINT_MAXPERMLSN(sp);
+
+ __db_dl(env, "Next page number expected", (u_long)sp->st_next_pg);
+ p = sp->st_waiting_pg == PGNO_INVALID ?
+ "Not waiting for any missed pages" :
+ "Page number of first page we have after missed pages";
+ __db_msg(env, "%lu\t%s", (u_long)sp->st_waiting_pg, p);
+ __db_dl(env,
+ "Number of duplicate master conditions originally detected at this site",
+ (u_long)sp->st_dupmasters);
+ if (sp->st_env_id != DB_EID_INVALID)
+ __db_dl(env, "Current environment ID", (u_long)sp->st_env_id);
+ else
+ __db_msg(env, "No current environment ID");
+ __db_dl(env,
+ "Current environment priority", (u_long)sp->st_env_priority);
+ __db_dl(env, "Current generation number", (u_long)sp->st_gen);
+ __db_dl(env,
+ "Election generation number for the current or next election",
+ (u_long)sp->st_egen);
+ __db_dl(env, "Number of lease validity checks",
+ (u_long)sp->st_lease_chk);
+ __db_dl(env, "Number of invalid lease validity checks",
+ (u_long)sp->st_lease_chk_misses);
+ __db_dl(env,
+ "Number of lease refresh attempts during lease validity checks",
+ (u_long)sp->st_lease_chk_refresh);
+ __db_dl(env, "Number of live messages sent while using leases",
+ (u_long)sp->st_lease_sends);
+ __db_dl(env, "Number of duplicate log records received",
+ (u_long)sp->st_log_duplicated);
+ PRINT_LOGQUEUED(sp);
+ __db_dl(env, "Maximum number of log records ever queued at once",
+ (u_long)sp->st_log_queued_max);
+ __db_dl(env, "Total number of log records queued",
+ (u_long)sp->st_log_queued_total);
+ __db_dl(env,
+ "Number of log records received and appended to the log",
+ (u_long)sp->st_log_records);
+ __db_dl(env, "Number of log records missed and requested",
+ (u_long)sp->st_log_requested);
+ if (sp->st_master != DB_EID_INVALID)
+ __db_dl(env, "Current master ID", (u_long)sp->st_master);
+ else
+ __db_msg(env, "No current master ID");
+ __db_dl(env, "Number of times the master has changed",
+ (u_long)sp->st_master_changes);
+ __db_dl(env,
+ "Number of messages received with a bad generation number",
+ (u_long)sp->st_msgs_badgen);
+ __db_dl(env, "Number of messages received and processed",
+ (u_long)sp->st_msgs_processed);
+ PRINT_MSGSRECOVER(sp);
+ PRINT_MSGSSENDFAILURES(sp);
+ __db_dl(env, "Number of messages sent", (u_long)sp->st_msgs_sent);
+ __db_dl(env,
+ "Number of new site messages received", (u_long)sp->st_newsites);
+ __db_dl(env,
+ "Number of environments used in the last election",
+ (u_long)(sp)->st_nsites);
+ __db_dl(env, "Transmission limited", (u_long)sp->st_nthrottles);
+ __db_dl(env, "Number of outdated conditions detected",
+ (u_long)sp->st_outdated);
+ __db_dl(env, "Number of duplicate page records received",
+ (u_long)sp->st_pg_duplicated);
+ __db_dl(env, "Number of page records received and added to databases",
+ (u_long)sp->st_pg_records);
+ __db_dl(env, "Number of page records missed and requested",
+ (u_long)sp->st_pg_requested);
+ PRINT_STARTUPCOMPLETE(sp);
+ __db_dl(env,
+ "Number of transactions applied", (u_long)sp->st_txns_applied);
+
+ __db_dl(env, "Number of startsync messages delayed",
+ (u_long)sp->st_startsync_delayed);
+
+ __db_dl(env, "Number of elections held", (u_long)sp->st_elections);
+ __db_dl(env,
+ "Number of elections won", (u_long)sp->st_elections_won);
+
+ if (sp->st_election_status == 0) {
+ __db_msg(env, "No election in progress");
+ if (sp->st_election_sec > 0 || sp->st_election_usec > 0)
+ __db_msg(env,
+ "%lu.%.6lu\tDuration of last election (seconds)",
+ (u_long)sp->st_election_sec,
+ (u_long)sp->st_election_usec);
+ } else {
+ __db_dl(env, "Current election phase",
+ (u_long)sp->st_election_status);
+ __db_dl(env,
+ "Environment ID of the winner of the current or last election",
+ (u_long)sp->st_election_cur_winner);
+ __db_dl(env,
+ "Master generation number of the winner of the current or last election",
+ (u_long)sp->st_election_gen);
+ __db_dl(env,
+ "Master data generation number of the winner of the current or last election",
+ (u_long)sp->st_election_datagen);
+ __db_msg(env,
+ "%lu/%lu\tMaximum LSN of the winner of the current or last election",
+ (u_long)sp->st_election_lsn.file,
+ (u_long)sp->st_election_lsn.offset);
+ __db_dl(env,
+ "Number of sites responding to this site during the current election",
+ (u_long)sp->st_election_nsites);
+ __db_dl(env,
+ "Number of votes required in the current or last election",
+ (u_long)sp->st_election_nvotes);
+ __db_dl(env,
+ "Priority of the winner of the current or last election",
+ (u_long)sp->st_election_priority);
+ __db_dl(env,
+ "Tiebreaker value of the winner of the current or last election",
+ (u_long)sp->st_election_tiebreaker);
+ __db_dl(env,
+ "Number of votes received during the current election",
+ (u_long)sp->st_election_votes);
+ }
+ __db_dl(env, "Number of bulk buffer sends triggered by full buffer",
+ (u_long)sp->st_bulk_fills);
+ __db_dl(env, "Number of single records exceeding bulk buffer size",
+ (u_long)sp->st_bulk_overflows);
+ __db_dl(env, "Number of records added to a bulk buffer",
+ (u_long)sp->st_bulk_records);
+ __db_dl(env, "Number of bulk buffers sent",
+ (u_long)sp->st_bulk_transfers);
+ __db_dl(env, "Number of re-request messages received",
+ (u_long)sp->st_client_rerequests);
+ __db_dl(env,
+ "Number of request messages this client failed to process",
+ (u_long)sp->st_client_svc_miss);
+ __db_dl(env, "Number of request messages received by this client",
+ (u_long)sp->st_client_svc_req);
+ if (sp->st_max_lease_sec > 0 || sp->st_max_lease_usec > 0)
+ __db_msg(env,
+ "%lu.%.6lu\tDuration of maximum lease (seconds)",
+ (u_long)sp->st_max_lease_sec,
+ (u_long)sp->st_max_lease_usec);
+
+ __os_ufree(env, sp);
+
+ return (0);
+}
+
+/*
+ * __rep_print_all --
+ * Display debugging replication region statistics.
+ */
+static int
+__rep_print_all(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ static const FN rep_cfn[] = {
+ { REP_C_2SITE_STRICT, "REP_C_2SITE_STRICT" },
+ { REP_C_AUTOINIT, "REP_C_AUTOINIT" },
+ { REP_C_AUTOROLLBACK, "REP_C_AUTOROLLBACK" },
+ { REP_C_BULK, "REP_C_BULK" },
+ { REP_C_DELAYCLIENT, "REP_C_DELAYCLIENT" },
+ { REP_C_ELECTIONS, "REP_C_ELECTIONS" },
+ { REP_C_INMEM, "REP_C_INMEM" },
+ { REP_C_LEASE, "REP_C_LEASE" },
+ { REP_C_NOWAIT, "REP_C_NOWAIT" },
+ { 0, NULL }
+ };
+ static const FN rep_efn[] = {
+ { REP_E_PHASE0, "REP_E_PHASE0" },
+ { REP_E_PHASE1, "REP_E_PHASE1" },
+ { REP_E_PHASE2, "REP_E_PHASE2" },
+ { REP_E_TALLY, "REP_E_TALLY" },
+ { 0, NULL }
+ };
+ static const FN rep_fn[] = {
+ { REP_F_ABBREVIATED, "REP_F_ABBREVIATED" },
+ { REP_F_APP_BASEAPI, "REP_F_APP_BASEAPI" },
+ { REP_F_APP_REPMGR, "REP_F_APP_REPMGR" },
+ { REP_F_CLIENT, "REP_F_CLIENT" },
+ { REP_F_DELAY, "REP_F_DELAY" },
+ { REP_F_GROUP_ESTD, "REP_F_GROUP_ESTD" },
+ { REP_F_LEASE_EXPIRED, "REP_F_LEASE_EXPIRED" },
+ { REP_F_MASTER, "REP_F_MASTER" },
+ { REP_F_MASTERELECT, "REP_F_MASTERELECT" },
+ { REP_F_NEWFILE, "REP_F_NEWFILE" },
+ { REP_F_NIMDBS_LOADED, "REP_F_NIMDBS_LOADED" },
+ { REP_F_SKIPPED_APPLY, "REP_F_SKIPPED_APPLY" },
+ { REP_F_START_CALLED, "REP_F_START_CALLED" },
+ { 0, NULL }
+ };
+ static const FN rep_lfn[] = {
+ { REP_LOCKOUT_API, "REP_LOCKOUT_API" },
+ { REP_LOCKOUT_APPLY, "REP_LOCKOUT_APPLY" },
+ { REP_LOCKOUT_ARCHIVE, "REP_LOCKOUT_ARCHIVE" },
+ { REP_LOCKOUT_MSG, "REP_LOCKOUT_MSG" },
+ { REP_LOCKOUT_OP, "REP_LOCKOUT_OP" },
+ { 0, NULL }
+ };
+ static const FN dbrep_fn[] = {
+ { DBREP_APP_BASEAPI, "DBREP_APP_BASEAPI" },
+ { DBREP_APP_REPMGR, "DBREP_APP_REPMGR" },
+ { DBREP_OPENFILES, "DBREP_OPENFILES" },
+ { 0, NULL }
+ };
+ DB_LOG *dblp;
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ LOG *lp;
+ REGENV *renv;
+ REGINFO *infop;
+ REP *rep;
+ char time_buf[CTIME_BUFLEN];
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ infop = env->reginfo;
+ renv = infop->primary;
+ ENV_ENTER(env, ip);
+
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "DB_REP handle information:");
+
+ if (db_rep->rep_db == NULL)
+ STAT_ISSET("Bookkeeping database", db_rep->rep_db);
+ else
+ (void)__db_stat_print(db_rep->rep_db, ip, flags);
+
+ __db_prflags(env, NULL, db_rep->flags, dbrep_fn, NULL, "\tFlags");
+
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "REP handle information:");
+ __mutex_print_debug_single(env,
+ "Replication region mutex", rep->mtx_region, flags);
+ __mutex_print_debug_single(env,
+ "Bookkeeping database mutex", rep->mtx_clientdb, flags);
+
+ STAT_LONG("Environment ID", rep->eid);
+ STAT_LONG("Master environment ID", rep->master_id);
+ STAT_ULONG("Election generation", rep->egen);
+ STAT_ULONG("Last active egen", rep->spent_egen);
+ STAT_ULONG("Master generation", rep->gen);
+ STAT_LONG("Space allocated for sites", rep->asites);
+ STAT_LONG("Sites in group", rep->nsites);
+ STAT_LONG("Votes needed for election", rep->nvotes);
+ STAT_LONG("Priority in election", rep->priority);
+ __db_dlbytes(env, "Limit on data sent in a single call",
+ rep->gbytes, (u_long)0, rep->bytes);
+ STAT_LONG("Request gap seconds", rep->request_gap.tv_sec);
+ STAT_LONG("Request gap microseconds",
+ rep->request_gap.tv_nsec / NS_PER_US);
+ STAT_LONG("Maximum gap seconds", rep->max_gap.tv_sec);
+ STAT_LONG("Maximum gap microseconds",
+ rep->max_gap.tv_nsec / NS_PER_US);
+
+ STAT_ULONG("Callers in rep_proc_msg", rep->msg_th);
+ STAT_ULONG("Callers in rep_elect", rep->elect_th);
+ STAT_ULONG("Library handle count", rep->handle_cnt);
+ STAT_ULONG("Multi-step operation count", rep->op_cnt);
+ __db_msg(env, "%.24s\tRecovery timestamp",
+ renv->rep_timestamp == 0 ?
+ "0" : __os_ctime(&renv->rep_timestamp, time_buf));
+
+ STAT_LONG("Sites heard from", rep->sites);
+ STAT_LONG("Current winner", rep->winner);
+ STAT_LONG("Winner priority", rep->w_priority);
+ STAT_ULONG("Winner generation", rep->w_gen);
+ STAT_ULONG("Winner data generation", rep->w_datagen);
+ STAT_LSN("Winner LSN", &rep->w_lsn);
+ STAT_LONG("Winner tiebreaker", rep->w_tiebreaker);
+ STAT_LONG("Votes for this site", rep->votes);
+
+ STAT_STRING("Synchronization State",
+ __rep_syncstate_to_string(rep->sync_state));
+ __db_prflags(env, NULL, rep->config, rep_cfn, NULL,
+ "\tConfig Flags");
+ __db_prflags(env, NULL, rep->elect_flags, rep_efn, NULL,
+ "\tElect Flags");
+ __db_prflags(env, NULL, rep->lockout_flags, rep_lfn,
+ NULL, "\tLockout Flags");
+ __db_prflags(env, NULL, rep->flags, rep_fn, NULL, "\tFlags");
+
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "LOG replication information:");
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ dblp = env->lg_handle;
+ lp = (LOG *)dblp->reginfo.primary;
+ STAT_LSN("First log record after a gap", &lp->waiting_lsn);
+ STAT_LSN("Maximum permanent LSN processed", &lp->max_perm_lsn);
+ STAT_LSN("LSN waiting to verify", &lp->verify_lsn);
+ STAT_LSN("Maximum LSN requested", &lp->max_wait_lsn);
+ STAT_LONG("Time to wait before requesting seconds", lp->wait_ts.tv_sec);
+ STAT_LONG("Time to wait before requesting microseconds",
+ lp->wait_ts.tv_nsec / NS_PER_US);
+ STAT_LSN("Next LSN expected", &lp->ready_lsn);
+ STAT_LONG("Maximum lease timestamp seconds", lp->max_lease_ts.tv_sec);
+ STAT_LONG("Maximum lease timestamp microseconds",
+ lp->max_lease_ts.tv_nsec / NS_PER_US);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ ENV_LEAVE(env, ip);
+
+ return (0);
+}
+
+static const char *
+__rep_syncstate_to_string(state)
+ repsync_t state;
+{
+ switch (state) {
+ case SYNC_OFF:
+ return ("Not Synchronizing");
+ case SYNC_LOG:
+ return ("SYNC_LOG");
+ case SYNC_PAGE:
+ return ("SYNC_PAGE");
+ case SYNC_UPDATE:
+ return ("SYNC_UPDATE");
+ case SYNC_VERIFY:
+ return ("SYNC_VERIFY");
+ default:
+ break;
+ }
+ return ("UNKNOWN STATE");
+}
+
+/*
+ * __rep_stat_summary_print --
+ * Print out a brief summary of replication statistics.
+ */
+static int
+__rep_stat_summary_print(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ DB_REP_STAT *sp;
+ REP *rep;
+ int is_client, ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ ret = 0;
+ if ((ret = __rep_stat(env, &sp, 0)) == 0) {
+ PRINT_STATUS(sp, is_client);
+ if (is_client)
+ PRINT_STARTUPCOMPLETE(sp);
+ PRINT_MAXPERMLSN(sp);
+ /*
+ * Use the number of sites that is kept up-to-date most
+ * frequently. The rep_stat st_nsites is only current
+ * as of the last election.
+ */
+ __db_dl(env, "Number of environments in the replication group",
+ (u_long)rep->config_nsites);
+ PRINT_MSGSSENDFAILURES(sp);
+ PRINT_MSGSRECOVER(sp);
+ PRINT_LOGQUEUED(sp);
+ __os_ufree(env, sp);
+ }
+ return (ret);
+}
+
+#else /* !HAVE_STATISTICS */
+
+int
+__rep_stat_pp(dbenv, statp, flags)
+ DB_ENV *dbenv;
+ DB_REP_STAT **statp;
+ u_int32_t flags;
+{
+ COMPQUIET(statp, NULL);
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(dbenv->env));
+}
+
+int
+__rep_stat_print_pp(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(dbenv->env));
+}
+#endif
diff --git a/src/rep/rep_stub.c b/src/rep/rep_stub.c
new file mode 100644
index 00000000..2d96ea59
--- /dev/null
+++ b/src/rep/rep_stub.c
@@ -0,0 +1,425 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef HAVE_REPLICATION
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+
+/*
+ * If the library wasn't compiled with replication support, various routines
+ * aren't available. Stub them here, returning an appropriate error.
+ */
+static int __db_norep __P((ENV *));
+
+/*
+ * __db_norep --
+ * Error when a Berkeley DB build doesn't include replication support.
+ */
+static int
+__db_norep(env)
+ ENV *env;
+{
+ __db_errx(env, DB_STR("3581",
+ "library build did not include support for replication"));
+ return (DB_OPNOTSUP);
+}
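+
+/*
+ * A caller-side sketch: because most of the stubs below funnel through
+ * __db_norep, an application linked against a build without
+ * replication support can detect the condition and degrade gracefully:
+ *
+ *	if ((ret = dbenv->rep_set_priority(dbenv, 100)) == DB_OPNOTSUP)
+ *		fprintf(stderr, "replication not built in\n");
+ */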
+
+int
+__db_rep_enter(dbp, checkgen, checklock, return_now)
+ DB *dbp;
+ int checkgen, checklock, return_now;
+{
+ COMPQUIET(checkgen, 0);
+ COMPQUIET(checklock, 0);
+ COMPQUIET(return_now, 0);
+ return (__db_norep(dbp->env));
+}
+
+int
+__env_rep_enter(env, checklock)
+ ENV *env;
+ int checklock;
+{
+ COMPQUIET(checklock, 0);
+ return (__db_norep(env));
+}
+
+int
+__env_db_rep_exit(env)
+ ENV *env;
+{
+ return (__db_norep(env));
+}
+
+int
+__op_rep_enter(env, local_nowait, obey_user)
+ ENV *env;
+ int local_nowait, obey_user;
+{
+ COMPQUIET(local_nowait, 0);
+ COMPQUIET(obey_user, 0);
+ return (__db_norep(env));
+}
+
+int
+__op_rep_exit(env)
+ ENV *env;
+{
+ return (__db_norep(env));
+}
+
+int
+__archive_rep_enter(env)
+ ENV *env;
+{
+ COMPQUIET(env, NULL);
+ return (0);
+}
+
+int
+__archive_rep_exit(env)
+ ENV *env;
+{
+ COMPQUIET(env, NULL);
+ return (0);
+}
+
+int
+__rep_bulk_message(env, bulkp, repth, lsnp, dbt, flags)
+ ENV *env;
+ REP_BULK *bulkp;
+ REP_THROTTLE *repth;
+ DB_LSN *lsnp;
+ const DBT *dbt;
+ u_int32_t flags;
+{
+ COMPQUIET(bulkp, NULL);
+ COMPQUIET(repth, NULL);
+ COMPQUIET(lsnp, NULL);
+ COMPQUIET(dbt, NULL);
+ COMPQUIET(flags, 0);
+ return (__db_norep(env));
+}
+
+int
+__rep_env_refresh(env)
+ ENV *env;
+{
+ COMPQUIET(env, NULL);
+ return (0);
+}
+
+int
+__rep_elect_pp(dbenv, nsites, nvotes, flags)
+ DB_ENV *dbenv;
+ u_int32_t nsites, nvotes;
+ u_int32_t flags;
+{
+ COMPQUIET(nsites, 0);
+ COMPQUIET(nvotes, 0);
+ COMPQUIET(flags, 0);
+ return (__db_norep(dbenv->env));
+}
+
+int
+__rep_flush(dbenv)
+ DB_ENV *dbenv;
+{
+ return (__db_norep(dbenv->env));
+}
+
+int
+__rep_lease_check(env, refresh)
+ ENV *env;
+ int refresh;
+{
+ COMPQUIET(refresh, 0);
+ return (__db_norep(env));
+}
+
+int
+__rep_lease_expire(env)
+ ENV *env;
+{
+ return (__db_norep(env));
+}
+
+void
+__rep_msg(env, msg)
+ const ENV *env;
+ const char *msg;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(msg, NULL);
+ return;
+}
+
+int
+__rep_get_clockskew(dbenv, fast_clockp, slow_clockp)
+ DB_ENV *dbenv;
+ u_int32_t *fast_clockp, *slow_clockp;
+{
+ COMPQUIET(fast_clockp, NULL);
+ COMPQUIET(slow_clockp, NULL);
+ return (__db_norep(dbenv->env));
+}
+
+int
+__rep_set_clockskew(dbenv, fast_clock, slow_clock)
+ DB_ENV *dbenv;
+ u_int32_t fast_clock, slow_clock;
+{
+ COMPQUIET(fast_clock, 0);
+ COMPQUIET(slow_clock, 0);
+ return (__db_norep(dbenv->env));
+}
+
+int
+__rep_set_nsites_pp(dbenv, n)
+ DB_ENV *dbenv;
+ u_int32_t n;
+{
+ COMPQUIET(n, 0);
+ return (__db_norep(dbenv->env));
+}
+
+int
+__rep_get_nsites(dbenv, n)
+ DB_ENV *dbenv;
+ u_int32_t *n;
+{
+ COMPQUIET(n, NULL);
+ return (__db_norep(dbenv->env));
+}
+
+int
+__rep_set_priority(dbenv, priority)
+ DB_ENV *dbenv;
+ u_int32_t priority;
+{
+ COMPQUIET(priority, 0);
+ return (__db_norep(dbenv->env));
+}
+
+int
+__rep_get_priority(dbenv, priority)
+ DB_ENV *dbenv;
+ u_int32_t *priority;
+{
+ COMPQUIET(priority, NULL);
+ return (__db_norep(dbenv->env));
+}
+
+int
+__rep_set_timeout(dbenv, which, timeout)
+ DB_ENV *dbenv;
+ int which;
+ db_timeout_t timeout;
+{
+ COMPQUIET(which, 0);
+ COMPQUIET(timeout, 0);
+ return (__db_norep(dbenv->env));
+}
+
+int
+__rep_get_timeout(dbenv, which, timeout)
+ DB_ENV *dbenv;
+ int which;
+ db_timeout_t *timeout;
+{
+ COMPQUIET(which, 0);
+ COMPQUIET(timeout, NULL);
+ return (__db_norep(dbenv->env));
+}
+
+int
+__rep_get_config(dbenv, which, onp)
+ DB_ENV *dbenv;
+ u_int32_t which;
+ int *onp;
+{
+ COMPQUIET(which, 0);
+ COMPQUIET(onp, NULL);
+ return (__db_norep(dbenv->env));
+}
+
+int
+__rep_set_config(dbenv, which, on)
+ DB_ENV *dbenv;
+ u_int32_t which;
+ int on;
+{
+ COMPQUIET(which, 0);
+ COMPQUIET(on, 0);
+ return (__db_norep(dbenv->env));
+}
+
+int
+__rep_get_limit(dbenv, gbytesp, bytesp)
+ DB_ENV *dbenv;
+ u_int32_t *gbytesp, *bytesp;
+{
+ COMPQUIET(gbytesp, NULL);
+ COMPQUIET(bytesp, NULL);
+ return (__db_norep(dbenv->env));
+}
+
+int
+__rep_open(env)
+ ENV *env;
+{
+ COMPQUIET(env, NULL);
+ return (0);
+}
+
+int
+__rep_preclose(env)
+ ENV *env;
+{
+ return (__db_norep(env));
+}
+
+int
+__rep_process_message_pp(dbenv, control, rec, eid, ret_lsnp)
+ DB_ENV *dbenv;
+ DBT *control, *rec;
+ int eid;
+ DB_LSN *ret_lsnp;
+{
+ COMPQUIET(control, NULL);
+ COMPQUIET(rec, NULL);
+ COMPQUIET(eid, 0);
+ COMPQUIET(ret_lsnp, NULL);
+ return (__db_norep(dbenv->env));
+}
+
+int
+__rep_send_message(env, eid, rtype, lsnp, dbtp, logflags, repflags)
+ ENV *env;
+ int eid;
+ u_int32_t rtype;
+ DB_LSN *lsnp;
+ const DBT *dbtp;
+ u_int32_t logflags, repflags;
+{
+ COMPQUIET(eid, 0);
+ COMPQUIET(rtype, 0);
+ COMPQUIET(lsnp, NULL);
+ COMPQUIET(dbtp, NULL);
+ COMPQUIET(logflags, 0);
+ COMPQUIET(repflags, 0);
+ return (__db_norep(env));
+}
+
+int
+__rep_set_limit(dbenv, gbytes, bytes)
+ DB_ENV *dbenv;
+ u_int32_t gbytes, bytes;
+{
+ COMPQUIET(gbytes, 0);
+ COMPQUIET(bytes, 0);
+ return (__db_norep(dbenv->env));
+}
+
+int
+__rep_set_transport_pp(dbenv, eid, f_send)
+ DB_ENV *dbenv;
+ int eid;
+ int (*f_send) __P((DB_ENV *, const DBT *, const DBT *, const DB_LSN *,
+ int, u_int32_t));
+{
+ COMPQUIET(eid, 0);
+ COMPQUIET(f_send, NULL);
+ return (__db_norep(dbenv->env));
+}
+
+int
+__rep_set_request(dbenv, min, max)
+ DB_ENV *dbenv;
+ u_int32_t min, max;
+{
+ COMPQUIET(min, 0);
+ COMPQUIET(max, 0);
+ return (__db_norep(dbenv->env));
+}
+
+int
+__rep_get_request(dbenv, minp, maxp)
+ DB_ENV *dbenv;
+ u_int32_t *minp, *maxp;
+{
+ COMPQUIET(minp, NULL);
+ COMPQUIET(maxp, NULL);
+ return (__db_norep(dbenv->env));
+}
+
+int
+__rep_start_pp(dbenv, dbt, flags)
+ DB_ENV *dbenv;
+ DBT *dbt;
+ u_int32_t flags;
+{
+ COMPQUIET(dbt, NULL);
+ COMPQUIET(flags, 0);
+ return (__db_norep(dbenv->env));
+}
+
+int
+__rep_stat_pp(dbenv, statp, flags)
+ DB_ENV *dbenv;
+ DB_REP_STAT **statp;
+ u_int32_t flags;
+{
+ COMPQUIET(statp, NULL);
+ COMPQUIET(flags, 0);
+ return (__db_norep(dbenv->env));
+}
+
+int
+__rep_stat_print_pp(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+ return (__db_norep(dbenv->env));
+}
+
+int
+__rep_stat_print(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+ return (__db_norep(env));
+}
+
+int
+__rep_sync(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+ return (__db_norep(dbenv->env));
+}
+
+int
+__rep_txn_applied(env, ip, commit_info, timeout)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ DB_COMMIT_INFO *commit_info;
+ db_timeout_t timeout;
+{
+	COMPQUIET(ip, NULL);
+ COMPQUIET(commit_info, NULL);
+ COMPQUIET(timeout, 0);
+ return (__db_norep(env));
+}
+#endif /* !HAVE_REPLICATION */
diff --git a/src/rep/rep_util.c b/src/rep/rep_util.c
new file mode 100644
index 00000000..0dfe6122
--- /dev/null
+++ b/src/rep/rep_util.c
@@ -0,0 +1,2791 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+#ifdef REP_DIAGNOSTIC
+#include "dbinc/db_page.h"
+#include "dbinc/fop.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/qam.h"
+#endif
+
+/*
+ * rep_util.c:
+ * Miscellaneous replication-related utility functions, including
+ * those called by other subsystems.
+ */
+#define TIMESTAMP_CHECK(env, ts, renv) do { \
+ if (renv->op_timestamp != 0 && \
+ renv->op_timestamp + DB_REGENV_TIMEOUT < ts) { \
+ REP_SYSTEM_LOCK(env); \
+ F_CLR(renv, DB_REGENV_REPLOCKED); \
+ renv->op_timestamp = 0; \
+ REP_SYSTEM_UNLOCK(env); \
+ } \
+} while (0)
+
+static int __rep_lockout_int __P((ENV *, REP *, u_int32_t *, u_int32_t,
+ const char *, u_int32_t));
+static int __rep_newmaster_empty __P((ENV *, int));
+static int __rep_print_int __P((ENV *, u_int32_t, const char *, va_list));
+#ifdef REP_DIAGNOSTIC
+static void __rep_print_logmsg __P((ENV *, const DBT *, DB_LSN *));
+#endif
+static int __rep_show_progress __P((ENV *, const char *, int mins));
+
+/*
+ * __rep_bulk_message --
+ * This is a wrapper for putting a record into a bulk buffer. Since
+ * we have different bulk buffers, the caller must hand us the information
+ * we need to put the record into the correct buffer. All bulk buffers
+ * are protected by the REP->mtx_clientdb.
+ *
+ * PUBLIC: int __rep_bulk_message __P((ENV *, REP_BULK *, REP_THROTTLE *,
+ * PUBLIC: DB_LSN *, const DBT *, u_int32_t));
+ */
+int
+__rep_bulk_message(env, bulk, repth, lsn, dbt, flags)
+ ENV *env;
+ REP_BULK *bulk;
+ REP_THROTTLE *repth;
+ DB_LSN *lsn;
+ const DBT *dbt;
+ u_int32_t flags;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ __rep_bulk_args b_args;
+ size_t len;
+ int ret;
+ u_int32_t recsize, typemore;
+ u_int8_t *p;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ ret = 0;
+
+ /*
+ * Figure out the total number of bytes needed for this record.
+ * !!! The marshalling code includes the given len, but also
+ * puts its own copy of the dbt->size with the DBT portion of
+ * the record. Account for that here.
+ */
+ recsize = sizeof(len) + dbt->size + sizeof(DB_LSN) + sizeof(dbt->size);
+
+ /*
+ * If *this* buffer is actively being transmitted, don't wait,
+ * just return so that it can be sent as a singleton.
+ */
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ if (FLD_ISSET(*(bulk->flagsp), BULK_XMIT)) {
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ return (DB_REP_BULKOVF);
+ }
+
+ /*
+	 * If the record is bigger than the entire buffer, send the
+ * current buffer and then return DB_REP_BULKOVF so that this
+ * record is sent as a singleton. Do we have enough info to
+ * do that here? XXX
+ */
+ if (recsize > bulk->len) {
+ RPRINT(env, (env, DB_VERB_REP_MSGS,
+ "bulk_msg: Record %d (0x%x) larger than entire buffer 0x%x",
+ recsize, recsize, bulk->len));
+ STAT(rep->stat.st_bulk_overflows++);
+ (void)__rep_send_bulk(env, bulk, flags);
+ /*
+ * XXX __rep_send_message...
+ */
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ return (DB_REP_BULKOVF);
+ }
+ /*
+ * If this record doesn't fit, send the current buffer.
+ * Sending the buffer will reset the offset, but we will
+ * drop the mutex while sending so we need to keep checking
+ * if we're racing.
+ */
+ while (recsize + *(bulk->offp) > bulk->len) {
+ RPRINT(env, (env, DB_VERB_REP_MSGS,
+ "bulk_msg: Record %lu (%#lx) doesn't fit. Send %lu (%#lx) now.",
+ (u_long)recsize, (u_long)recsize,
+ (u_long)bulk->len, (u_long)bulk->len));
+ STAT(rep->stat.st_bulk_fills++);
+ if ((ret = __rep_send_bulk(env, bulk, flags)) != 0) {
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ return (ret);
+ }
+ }
+
+ /*
+ * If we're using throttling, see if we are at the throttling
+ * limit before we do any more work here, by checking if the
+ * call to rep_send_throttle changed the repth->type to the
+ * *_MORE message type. If the throttling code hits the limit
+ * then we're done here.
+ */
+ if (bulk->type == REP_BULK_LOG)
+ typemore = REP_LOG_MORE;
+ else
+ typemore = REP_PAGE_MORE;
+ if (repth != NULL) {
+ if ((ret = __rep_send_throttle(env,
+ bulk->eid, repth, REP_THROTTLE_ONLY, flags)) != 0) {
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ return (ret);
+ }
+ if (repth->type == typemore) {
+ VPRINT(env, (env, DB_VERB_REP_MSGS,
+ "bulk_msg: Record %lu (0x%lx) hit throttle limit.",
+ (u_long)recsize, (u_long)recsize));
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ return (ret);
+ }
+ }
+
+ /*
+ * Now we own the buffer, and we know our record fits into it.
+ * The buffer is structured with the len, LSN and then the record.
+ * Copy the record into the buffer. Then if we need to,
+ * send the buffer.
+ */
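+	/*
+	 * A sketch of the pre-4.7 layout implied by the memcpy sequence
+	 * below (4.7 and later buffers are laid out by __rep_bulk_marshal
+	 * instead, so their exact format may differ):
+	 *
+	 *	+-----------+--------+-----------------+
+	 *	| dbt->size | DB_LSN | dbt->data ...   |
+	 *	+-----------+--------+-----------------+
+	 */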
+ p = bulk->addr + *(bulk->offp);
+ b_args.len = dbt->size;
+ b_args.lsn = *lsn;
+ b_args.bulkdata = *dbt;
+ /*
+	 * If this is the first record, we need to save the first
+ * LSN in the bulk structure.
+ */
+ if (*(bulk->offp) == 0)
+ bulk->lsn = *lsn;
+ if (rep->version < DB_REPVERSION_47) {
+ len = 0;
+ memcpy(p, &dbt->size, sizeof(dbt->size));
+ p += sizeof(dbt->size);
+ memcpy(p, lsn, sizeof(DB_LSN));
+ p += sizeof(DB_LSN);
+ memcpy(p, dbt->data, dbt->size);
+ p += dbt->size;
+ } else if ((ret = __rep_bulk_marshal(env, &b_args, p,
+ bulk->len, &len)) != 0)
+ goto err;
+ *(bulk->offp) = (roff_t)(p + len - bulk->addr);
+ STAT(rep->stat.st_bulk_records++);
+ /*
+ * Send the buffer if it is a perm record or a force.
+ */
+ if (LF_ISSET(REPCTL_PERM)) {
+ VPRINT(env, (env, DB_VERB_REP_MSGS,
+ "bulk_msg: Send buffer after copy due to PERM"));
+ ret = __rep_send_bulk(env, bulk, flags);
+ }
+err:
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ return (ret);
+}
+
+/*
+ * __rep_send_bulk --
+ * This function transmits the bulk buffer given. It assumes the
+ * caller holds the REP->mtx_clientdb. We may release it and reacquire
+ * it during this call. We will return with it held.
+ *
+ * PUBLIC: int __rep_send_bulk __P((ENV *, REP_BULK *, u_int32_t));
+ */
+int
+__rep_send_bulk(env, bulkp, ctlflags)
+ ENV *env;
+ REP_BULK *bulkp;
+ u_int32_t ctlflags;
+{
+ DBT dbt;
+ DB_REP *db_rep;
+ REP *rep;
+ int ret;
+
+ /*
+ * If the offset is 0, we're done. There is nothing to send.
+ */
+ if (*(bulkp->offp) == 0)
+ return (0);
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ /*
+	 * Mark this buffer as being actively transmitted.
+ */
+ FLD_SET(*(bulkp->flagsp), BULK_XMIT);
+ DB_INIT_DBT(dbt, bulkp->addr, *(bulkp->offp));
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ VPRINT(env, (env, DB_VERB_REP_MSGS,
+ "send_bulk: Send %d (0x%x) bulk buffer bytes", dbt.size, dbt.size));
+
+ /*
+	 * We have unlocked the mutex; now send the message.
+ */
+ STAT(rep->stat.st_bulk_transfers++);
+ if ((ret = __rep_send_message(env,
+ bulkp->eid, bulkp->type, &bulkp->lsn, &dbt, ctlflags, 0)) != 0)
+ ret = DB_REP_UNAVAIL;
+
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ /*
+ * Ready the buffer for further records.
+ */
+ *(bulkp->offp) = 0;
+ FLD_CLR(*(bulkp->flagsp), BULK_XMIT);
+ return (ret);
+}
+
+/*
+ * __rep_bulk_alloc --
+ * This function allocates and initializes an internal bulk buffer.
+ * This is used by the master when fulfilling a request for a chunk of
+ * log records or a bunch of pages.
+ *
+ * PUBLIC: int __rep_bulk_alloc __P((ENV *, REP_BULK *, int, uintptr_t *,
+ * PUBLIC: u_int32_t *, u_int32_t));
+ */
+int
+__rep_bulk_alloc(env, bulkp, eid, offp, flagsp, type)
+ ENV *env;
+ REP_BULK *bulkp;
+ int eid;
+ uintptr_t *offp;
+ u_int32_t *flagsp, type;
+{
+ int ret;
+
+ memset(bulkp, 0, sizeof(REP_BULK));
+ *offp = *flagsp = 0;
+ bulkp->len = MEGABYTE;
+ if ((ret = __os_malloc(env, bulkp->len, &bulkp->addr)) != 0)
+ return (ret);
+
+ /*
+ * The cast is safe because offp is an "out" parameter. The value
+ * of offp is meaningless when calling __rep_bulk_alloc.
+ */
+ bulkp->offp = (roff_t *)offp;
+ bulkp->type = type;
+ bulkp->eid = eid;
+ bulkp->flagsp = flagsp;
+ return (ret);
+}
+
+/*
+ * __rep_bulk_free --
+ * This function sends the remainder of the bulk buffer and frees it.
+ *
+ * PUBLIC: int __rep_bulk_free __P((ENV *, REP_BULK *, u_int32_t));
+ */
+int
+__rep_bulk_free(env, bulkp, flags)
+ ENV *env;
+ REP_BULK *bulkp;
+ u_int32_t flags;
+{
+ DB_REP *db_rep;
+ int ret;
+
+ db_rep = env->rep_handle;
+
+ MUTEX_LOCK(env, db_rep->region->mtx_clientdb);
+ ret = __rep_send_bulk(env, bulkp, flags);
+ MUTEX_UNLOCK(env, db_rep->region->mtx_clientdb);
+ __os_free(env, bulkp->addr);
+ return (ret);
+}
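+
+/*
+ * A lifecycle sketch for the three bulk-buffer routines above, as a
+ * master might use them to batch log records (error handling elided;
+ * "lsn" and "rec" stand for each record's LSN and data):
+ *
+ *	REP_BULK bulk;
+ *	uintptr_t off;
+ *	u_int32_t bulkflags;
+ *
+ *	(void)__rep_bulk_alloc(env, &bulk, eid, &off, &bulkflags,
+ *	    REP_BULK_LOG);
+ *	for each record:
+ *		if (__rep_bulk_message(env, &bulk, NULL, &lsn, &rec,
+ *		    flags) == DB_REP_BULKOVF)
+ *			send the record as a singleton;
+ *	(void)__rep_bulk_free(env, &bulk, flags);
+ */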
+
+/*
+ * __rep_send_message --
+ * This is a wrapper for sending a message. It takes care of constructing
+ * the control structure and calling the user's specified send function.
+ *
+ * PUBLIC: int __rep_send_message __P((ENV *, int,
+ * PUBLIC: u_int32_t, DB_LSN *, const DBT *, u_int32_t, u_int32_t));
+ */
+int
+__rep_send_message(env, eid, rtype, lsnp, dbt, ctlflags, repflags)
+ ENV *env;
+ int eid;
+ u_int32_t rtype;
+ DB_LSN *lsnp;
+ const DBT *dbt;
+ u_int32_t ctlflags, repflags;
+{
+ DBT cdbt, scrap_dbt;
+ DB_ENV *dbenv;
+ DB_LOG *dblp;
+ DB_REP *db_rep;
+ LOG *lp;
+ REP *rep;
+ REP_46_CONTROL cntrl46;
+ REP_OLD_CONTROL ocntrl;
+ __rep_control_args cntrl;
+ db_timespec msg_time;
+ int ret;
+ u_int32_t myflags;
+ u_int8_t buf[__REP_CONTROL_SIZE];
+ size_t len;
+
+ dbenv = env->dbenv;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ ret = 0;
+
+#if defined(DEBUG_ROP) || defined(DEBUG_WOP)
+ if (db_rep->send == NULL)
+ return (0);
+#endif
+
+ /* Set up control structure. */
+ memset(&cntrl, 0, sizeof(cntrl));
+ memset(&ocntrl, 0, sizeof(ocntrl));
+ memset(&cntrl46, 0, sizeof(cntrl46));
+ if (lsnp == NULL)
+ ZERO_LSN(cntrl.lsn);
+ else
+ cntrl.lsn = *lsnp;
+ /*
+ * Set the rectype based on the version we need to speak.
+ */
+ if (rep->version == DB_REPVERSION)
+ cntrl.rectype = rtype;
+ else if (rep->version < DB_REPVERSION) {
+ cntrl.rectype = __rep_msg_to_old(rep->version, rtype);
+ VPRINT(env, (env, DB_VERB_REP_MSGS,
+ "rep_send_msg: rtype %lu to version %lu record %lu.",
+ (u_long)rtype, (u_long)rep->version,
+ (u_long)cntrl.rectype));
+ if (cntrl.rectype == REP_INVALID)
+ return (ret);
+ } else {
+ __db_errx(env, DB_STR_A("3503",
+ "rep_send_message: Unknown rep version %lu, my version %lu",
+ "%lu %lu"), (u_long)rep->version, (u_long)DB_REPVERSION);
+ return (__env_panic(env, EINVAL));
+ }
+ cntrl.flags = ctlflags;
+ cntrl.rep_version = rep->version;
+ cntrl.log_version = lp->persist.version;
+ cntrl.gen = rep->gen;
+
+ /* Don't assume the send function will be tolerant of NULL records. */
+ if (dbt == NULL) {
+ memset(&scrap_dbt, 0, sizeof(DBT));
+ dbt = &scrap_dbt;
+ }
+
+ /*
+ * There are several types of records: commit and checkpoint records
+ * that affect database durability, regular log records that might
+ * be buffered on the master before being transmitted, and control
+ * messages which don't require the guarantees of permanency, but
+ * should not be buffered.
+ *
+ * There are request records that can be sent anywhere, and there
+ * are rerequest records that the app might want to send to the master.
+ */
+ myflags = repflags;
+ if (FLD_ISSET(ctlflags, REPCTL_PERM)) {
+ /*
+ * When writing to a system database, skip setting the PERMANENT
+ * flag. We don't care; we don't want to wait; and the
+ * application shouldn't be distracted/confused in case there is
+ * a failure.
+ */
+ if (!F_ISSET(rep, REP_F_SYS_DB_OP))
+ myflags |= DB_REP_PERMANENT;
+ } else if (rtype != REP_LOG || FLD_ISSET(ctlflags, REPCTL_RESEND))
+ myflags |= DB_REP_NOBUFFER;
+
+ /*
+ * Let everyone know if we've been in an established group.
+ */
+ if (F_ISSET(rep, REP_F_GROUP_ESTD))
+ F_SET(&cntrl, REPCTL_GROUP_ESTD);
+
+ /*
+ * If we are a master sending a perm record, then set the
+ * REPCTL_LEASE flag to have the client reply. Also set
+ * the start time that the client will echo back to us.
+ *
+ * !!! If we are a master, using leases, we had better not be
+ * sending to an older version.
+ */
+ if (IS_REP_MASTER(env) && IS_USING_LEASES(env) &&
+ FLD_ISSET(ctlflags, REPCTL_LEASE | REPCTL_PERM)) {
+ F_SET(&cntrl, REPCTL_LEASE);
+ DB_ASSERT(env, rep->version == DB_REPVERSION);
+ __os_gettime(env, &msg_time, 1);
+ cntrl.msg_sec = (u_int32_t)msg_time.tv_sec;
+ cntrl.msg_nsec = (u_int32_t)msg_time.tv_nsec;
+ }
+
+ REP_PRINT_MESSAGE(env, eid, &cntrl, "rep_send_message", myflags);
+#ifdef REP_DIAGNOSTIC
+ if (FLD_ISSET(
+ env->dbenv->verbose, DB_VERB_REP_MSGS) && rtype == REP_LOG)
+ __rep_print_logmsg(env, dbt, lsnp);
+#endif
+
+ /*
+ * If DB_REP_PERMANENT is set, the LSN better be non-zero.
+ */
+ DB_ASSERT(env, !FLD_ISSET(myflags, DB_REP_PERMANENT) ||
+ !IS_ZERO_LSN(cntrl.lsn));
+
+ /*
+ * If we're talking to an old version, send an old control structure.
+ */
+ memset(&cdbt, 0, sizeof(cdbt));
+ if (rep->version <= DB_REPVERSION_45) {
+ if (rep->version == DB_REPVERSION_45 &&
+ F_ISSET(&cntrl, REPCTL_INIT)) {
+ F_CLR(&cntrl, REPCTL_INIT);
+ F_SET(&cntrl, REPCTL_INIT_45);
+ }
+ ocntrl.rep_version = cntrl.rep_version;
+ ocntrl.log_version = cntrl.log_version;
+ ocntrl.lsn = cntrl.lsn;
+ ocntrl.rectype = cntrl.rectype;
+ ocntrl.gen = cntrl.gen;
+ ocntrl.flags = cntrl.flags;
+ cdbt.data = &ocntrl;
+ cdbt.size = sizeof(ocntrl);
+ } else if (rep->version == DB_REPVERSION_46) {
+ cntrl46.rep_version = cntrl.rep_version;
+ cntrl46.log_version = cntrl.log_version;
+ cntrl46.lsn = cntrl.lsn;
+ cntrl46.rectype = cntrl.rectype;
+ cntrl46.gen = cntrl.gen;
+ cntrl46.msg_time.tv_sec = (time_t)cntrl.msg_sec;
+ cntrl46.msg_time.tv_nsec = (long)cntrl.msg_nsec;
+ cntrl46.flags = cntrl.flags;
+ cdbt.data = &cntrl46;
+ cdbt.size = sizeof(cntrl46);
+ } else {
+ (void)__rep_control_marshal(env, &cntrl, buf,
+ __REP_CONTROL_SIZE, &len);
+ DB_INIT_DBT(cdbt, buf, len);
+ }
+
+ /*
+ * We set the LSN above to something valid. Give the master the
+ * actual LSN so that they can coordinate with permanent records from
+ * the client if they want to.
+ *
+ * !!! Even though we marshalled the control message for transmission,
+ * give the transport function the real LSN.
+ */
+ ret = db_rep->send(dbenv, &cdbt, dbt, &cntrl.lsn, eid, myflags);
+
+ /*
+ * We don't hold the rep lock, so this could miscount if we race.
+ * I don't think it's worth grabbing the mutex for that bit of
+ * extra accuracy.
+ */
+ if (ret != 0) {
+ RPRINT(env, (env, DB_VERB_REP_MSGS,
+ "rep_send_function returned: %d", ret));
+#ifdef HAVE_STATISTICS
+ rep->stat.st_msgs_send_failures++;
+ } else
+ rep->stat.st_msgs_sent++;
+#else
+ }
+#endif
+ return (ret);
+}
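+
+/*
+ * A transport-callback sketch: db_rep->send above is the application
+ * function registered via DB_ENV->rep_set_transport (the signature is
+ * visible in __rep_set_transport_pp).  Assuming a hypothetical
+ * queue_to_site() that hands the message to the app's messaging layer:
+ *
+ *	int
+ *	my_send(dbenv, control, rec, lsnp, envid, flags)
+ *		DB_ENV *dbenv;
+ *		const DBT *control, *rec;
+ *		const DB_LSN *lsnp;
+ *		int envid;
+ *		u_int32_t flags;
+ *	{
+ *		return (queue_to_site(envid, control, rec, lsnp, flags));
+ *	}
+ *
+ * A nonzero return tells the library the send failed; for bulk
+ * transfers that surfaces as DB_REP_UNAVAIL in __rep_send_bulk.
+ */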
+
+#ifdef REP_DIAGNOSTIC
+/*
+ * __rep_print_logmsg --
+ * This is a debugging routine for printing out log records that
+ * we are about to transmit to a client.
+ */
+static void
+__rep_print_logmsg(env, logdbt, lsnp)
+ ENV *env;
+ const DBT *logdbt;
+ DB_LSN *lsnp;
+{
+ static int first = 1;
+ static DB_DISTAB dtab;
+
+ if (first) {
+ first = 0;
+
+ (void)__bam_init_print(env, &dtab);
+ (void)__crdel_init_print(env, &dtab);
+ (void)__db_init_print(env, &dtab);
+ (void)__dbreg_init_print(env, &dtab);
+ (void)__fop_init_print(env, &dtab);
+ (void)__ham_init_print(env, &dtab);
+ (void)__qam_init_print(env, &dtab);
+ (void)__repmgr_init_print(env, &dtab);
+ (void)__txn_init_print(env, &dtab);
+ }
+
+ (void)__db_dispatch(
+ env, &dtab, (DBT *)logdbt, lsnp, DB_TXN_PRINT, NULL);
+}
+#endif
+
+/*
+ * __rep_new_master --
+ * Called after a master election to sync back up with a new master.
+ * It's possible that we already know of this new master, in which case
+ * we don't need to do anything.
+ *
+ * This is written assuming that this message came from the master; we
+ * need to enforce that in __rep_process_record, but right now, we have
+ * no way to identify the master.
+ *
+ * PUBLIC: int __rep_new_master __P((ENV *, __rep_control_args *, int));
+ */
+int
+__rep_new_master(env, cntrl, eid)
+ ENV *env;
+ __rep_control_args *cntrl;
+ int eid;
+{
+ DBT dbt;
+ DB_LOG *dblp;
+ DB_LOGC *logc;
+ DB_LSN first_lsn, lsn;
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ LOG *lp;
+ REGENV *renv;
+ REGINFO *infop;
+ REP *rep;
+ db_timeout_t lease_to;
+ u_int32_t unused, vers;
+ int change, do_req, lockout_msg, ret, t_ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ ret = 0;
+ logc = NULL;
+ lockout_msg = 0;
+ REP_SYSTEM_LOCK(env);
+ change = rep->gen != cntrl->gen || rep->master_id != eid;
+ /*
+ * If we're hearing from a current or new master, then we
+ * want to clear EPHASE0 in case this site is waiting to
+ * hear from the master.
+ */
+ FLD_CLR(rep->elect_flags, REP_E_PHASE0);
+ if (change) {
+ /*
+ * If we are already locking out others, we're either
+ * in the middle of sync-up recovery or internal init
+		 * when this newmaster comes in (we also lock out in
+ * rep_start, but we cannot be racing that because we
+ * don't allow rep_proc_msg when rep_start is going on).
+ *
+ * We're about to become the client of a new master. Since we
+ * want to be able to sync with the new master as quickly as
+ * possible, interrupt any STARTSYNC from the old master. The
+ * new master may need to rely on acks from us and the old
+ * STARTSYNC is now irrelevant.
+ *
+ * Note that, conveniently, the "lockout_msg" flag defines the
+ * section of this code path during which both "message lockout"
+ * and "memp sync interrupt" are in effect.
+ */
+ if (FLD_ISSET(rep->lockout_flags, REP_LOCKOUT_MSG))
+ goto lckout;
+
+ if ((ret = __rep_lockout_msg(env, rep, 1)) != 0)
+ goto errlck;
+
+ (void)__memp_set_config(env->dbenv, DB_MEMP_SYNC_INTERRUPT, 1);
+ lockout_msg = 1;
+ /*
+ * We must wait any remaining lease time before accepting
+ * this new master. This must be after the lockout above
+ * so that no new message can be processed and re-grant
+ * the lease out from under us.
+ */
+ if (IS_USING_LEASES(env) &&
+ ((lease_to = __rep_lease_waittime(env)) != 0)) {
+ REP_SYSTEM_UNLOCK(env);
+ __os_yield(env, 0, (u_long)lease_to);
+ REP_SYSTEM_LOCK(env);
+ F_SET(rep, REP_F_LEASE_EXPIRED);
+ }
+
+ vers = lp->persist.version;
+ if (cntrl->log_version != vers) {
+ /*
+			 * Set everything up to the lower version. If we're
+			 * going to be upgrading to the latest version, that
+			 * can happen automatically as we process later log
+			 * records. Until then, we must sync using the
+			 * earlier version.
+ */
+ DB_ASSERT(env, vers != 0);
+ if (cntrl->log_version < vers)
+ vers = cntrl->log_version;
+ RPRINT(env, (env, DB_VERB_REP_MISC,
+			    "newmaster: Setting log version to %lu",
+			    (u_long)vers));
+ __log_set_version(env, vers);
+ if ((ret = __env_init_rec(env, vers)) != 0)
+ goto errlck;
+ }
+
+ REP_SYSTEM_UNLOCK(env);
+
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ __os_gettime(env, &lp->rcvd_ts, 1);
+ lp->wait_ts = rep->request_gap;
+ ZERO_LSN(lp->verify_lsn);
+ ZERO_LSN(lp->prev_ckp);
+ ZERO_LSN(lp->waiting_lsn);
+ ZERO_LSN(lp->max_wait_lsn);
+ /*
+ * Open if we need to, in preparation for the truncate
+ * we'll do in a moment.
+ */
+ if (db_rep->rep_db == NULL &&
+ (ret = __rep_client_dbinit(env, 0, REP_DB)) != 0) {
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ goto err;
+ }
+
+ /*
+ * If we were in the middle of an internal initialization
+ * and we've discovered a new master instead, clean up
+ * our old internal init information. We need to clean
+ * up any flags and unlock our lockout.
+ */
+ REP_SYSTEM_LOCK(env);
+ if (ISSET_LOCKOUT_BDB(rep)) {
+ ret = __rep_init_cleanup(env, rep, DB_FORCE);
+ /*
+ * Note that if an in-progress internal init was indeed
+ * "cleaned up", clearing these flags now will allow the
+ * application to see a completely empty database
+ * environment for a moment (until the master responds
+ * to our ALL_REQ).
+ */
+ F_CLR(rep, REP_F_ABBREVIATED);
+ CLR_RECOVERY_SETTINGS(rep);
+ }
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ if (ret != 0) {
+ /* TODO: consider add'l error recovery steps. */
+ goto errlck;
+ }
+ ENV_GET_THREAD_INFO(env, ip);
+ if ((ret = __db_truncate(db_rep->rep_db, ip, NULL, &unused))
+ != 0)
+ goto errlck;
+ STAT(rep->stat.st_log_queued = 0);
+
+ /*
+ * This needs to be performed under message lockout
+ * if we're actually changing master.
+ */
+ __rep_elect_done(env, rep);
+ RPRINT(env, (env, DB_VERB_REP_MISC,
+ "Updating gen from %lu to %lu from master %d",
+ (u_long)rep->gen, (u_long)cntrl->gen, eid));
+ SET_GEN(cntrl->gen);
+ rep->mgen = cntrl->gen;
+ if ((ret = __rep_notify_threads(env, AWAIT_GEN)) != 0)
+ goto errlck;
+ (void)__rep_write_gen(env, rep, rep->gen);
+ if (rep->egen <= rep->gen)
+ rep->egen = rep->gen + 1;
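+		/*
+		 * Illustrative note: egen must always lead gen so that the
+		 * next election uses a fresh election generation. For
+		 * example, if the new master reports gen 12 while our egen
+		 * is still 10, the lines above leave gen = 12 and egen = 13.
+		 */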
+ rep->master_id = eid;
+ STAT(rep->stat.st_master_changes++);
+ rep->stat.st_startup_complete = 0;
+ rep->version = cntrl->rep_version;
+ RPRINT(env, (env, DB_VERB_REP_MISC,
+ "egen: %lu. rep version %lu",
+ (u_long)rep->egen, (u_long)rep->version));
+
+ /*
+ * If we're delaying client sync-up, we know we have a
+ * new/changed master now, set flag indicating we are
+ * actively delaying.
+ */
+ if (FLD_ISSET(rep->config, REP_C_DELAYCLIENT))
+ F_SET(rep, REP_F_DELAY);
+ if ((ret = __rep_lockout_archive(env, rep)) != 0)
+ goto errlck;
+ rep->sync_state = SYNC_VERIFY;
+ FLD_CLR(rep->lockout_flags, REP_LOCKOUT_MSG);
+ (void)__memp_set_config(env->dbenv, DB_MEMP_SYNC_INTERRUPT, 0);
+ lockout_msg = 0;
+ } else
+ __rep_elect_done(env, rep);
+ REP_SYSTEM_UNLOCK(env);
+
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ lsn = lp->ready_lsn;
+
+ if (!change) {
+ ret = 0;
+ do_req = __rep_check_doreq(env, rep);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ /*
+ * If there wasn't a change, we might still have some
+ * catching up or verification to do.
+ */
+ if (do_req &&
+ (rep->sync_state != SYNC_OFF ||
+ LOG_COMPARE(&lsn, &cntrl->lsn) < 0)) {
+ ret = __rep_resend_req(env, 0);
+ if (ret != 0)
+ RPRINT(env, (env, DB_VERB_REP_MISC,
+ "resend_req ret is %lu", (u_long)ret));
+ }
+ /*
+ * If we're not in one of the recovery modes, we need to
+		 * clear the ARCHIVE flag. Elections set ARCHIVE, so if we
+		 * called an election and found the same master, we must
+		 * clear ARCHIVE here.
+ */
+ if (rep->sync_state == SYNC_OFF) {
+ REP_SYSTEM_LOCK(env);
+ FLD_CLR(rep->lockout_flags, REP_LOCKOUT_ARCHIVE);
+ REP_SYSTEM_UNLOCK(env);
+ }
+ return (ret);
+ }
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+
+ /*
+ * If the master changed, we need to start the process of
+ * figuring out what our last valid log record is. However,
+ * if both the master and we agree that the max LSN is 0,0,
+ * then there is no recovery to be done. If we are at 0 and
+ * the master is not, then we just need to request all the log
+ * records from the master.
+ */
+ if (IS_INIT_LSN(lsn) || IS_ZERO_LSN(lsn)) {
+ if ((ret = __rep_newmaster_empty(env, eid)) != 0)
+ goto err;
+ goto newmaster_complete;
+ }
+
+ memset(&dbt, 0, sizeof(dbt));
+ /*
+	 * If this client is in a later log file than the master, see if
+	 * there is any overlap in the logs. If not, the client is too
+	 * far ahead of the master and must start over.
+ */
+ if (cntrl->lsn.file < lsn.file) {
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ goto err;
+ ret = __logc_get(logc, &first_lsn, &dbt, DB_FIRST);
+ if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret == DB_NOTFOUND)
+ goto notfound;
+ else if (ret != 0)
+ goto err;
+ if (cntrl->lsn.file < first_lsn.file)
+ goto notfound;
+ }
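+	/*
+	 * Walk our log backwards to the most recent permanent record (a
+	 * checkpoint or commit, see __rep_log_backup); that record's LSN
+	 * is the candidate we ask the new master to verify below.
+	 */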
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ goto err;
+ ret = __rep_log_backup(env, logc, &lsn, REP_REC_PERM);
+ if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret == DB_NOTFOUND)
+ goto notfound;
+ else if (ret != 0)
+ goto err;
+
+ /*
+ * Finally, we have a record to ask for.
+ */
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ lp->verify_lsn = lsn;
+ __os_gettime(env, &lp->rcvd_ts, 1);
+ lp->wait_ts = rep->request_gap;
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ if (!F_ISSET(rep, REP_F_DELAY))
+ (void)__rep_send_message(env,
+ eid, REP_VERIFY_REQ, &lsn, NULL, 0, DB_REP_ANYWHERE);
+ goto newmaster_complete;
+
+err: /*
+ * If we failed, we need to clear the flags we may have set above
+ * because we're not going to be setting the verify_lsn.
+ */
+ REP_SYSTEM_LOCK(env);
+errlck: if (lockout_msg) {
+ FLD_CLR(rep->lockout_flags, REP_LOCKOUT_MSG);
+ (void)__memp_set_config(env->dbenv, DB_MEMP_SYNC_INTERRUPT, 0);
+ }
+ F_CLR(rep, REP_F_DELAY);
+ CLR_RECOVERY_SETTINGS(rep);
+lckout: REP_SYSTEM_UNLOCK(env);
+ return (ret);
+
+notfound:
+ /*
+ * If we don't have an identification record, we still
+ * might have some log records but we're discarding them
+ * to sync up with the master from the start.
+ * Therefore, truncate our log and treat it as if it
+ * were empty. In-memory logs can't be completely
+ * zeroed using __log_vtruncate, so just zero them out.
+ */
+ RPRINT(env, (env, DB_VERB_REP_MISC,
+ "No commit or ckp found. Truncate log."));
+ if (lp->db_log_inmemory) {
+ ZERO_LSN(lsn);
+ ret = __log_zero(env, &lsn);
+ } else {
+ INIT_LSN(lsn);
+ ret = __log_vtruncate(env, &lsn, &lsn, NULL);
+ }
+ if (ret != 0 && ret != DB_NOTFOUND)
+ return (ret);
+ infop = env->reginfo;
+ renv = infop->primary;
+ REP_SYSTEM_LOCK(env);
+ (void)time(&renv->rep_timestamp);
+ REP_SYSTEM_UNLOCK(env);
+ if ((ret = __rep_newmaster_empty(env, eid)) != 0)
+ goto err;
+newmaster_complete:
+ return (DB_REP_NEWMASTER);
+}
+
+/*
+ * __rep_newmaster_empty --
+ *	Handle the case of a NEWMASTER message received when we have an empty
+ * log. This requires internal init. If we can't do that because
+ * AUTOINIT is off, return JOIN_FAILURE. If F_DELAY is in effect, don't
+ * even consider AUTOINIT yet, because the application could change it
+ * before calling rep_sync.
+ */
+static int
+__rep_newmaster_empty(env, eid)
+ ENV *env;
+ int eid;
+{
+ DB_REP *db_rep;
+ LOG *lp;
+ REP *rep;
+ int msg, ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ lp = env->lg_handle->reginfo.primary;
+ msg = ret = 0;
+
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ REP_SYSTEM_LOCK(env);
+ lp->wait_ts = rep->request_gap;
+
+ /* Usual case is to skip to UPDATE state; we may revise this below. */
+ rep->sync_state = SYNC_UPDATE;
+
+ if (F_ISSET(rep, REP_F_DELAY)) {
+ /*
+ * Having properly set up wait_ts for later, nothing more to
+ * do now.
+ */
+ } else if (!FLD_ISSET(rep->config, REP_C_AUTOINIT)) {
+ FLD_CLR(rep->lockout_flags, REP_LOCKOUT_ARCHIVE);
+ CLR_RECOVERY_SETTINGS(rep);
+ ret = DB_REP_JOIN_FAILURE;
+ } else {
+ /* Normal case: not DELAY but AUTOINIT. */
+ msg = 1;
+ }
+ REP_SYSTEM_UNLOCK(env);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+
+ if (msg)
+ (void)__rep_send_message(env, eid, REP_UPDATE_REQ,
+ NULL, NULL, 0, 0);
+ return (ret);
+}
+
+/*
+ * __rep_elect_done --
+ *	Clear all election information for this site. Assumes the
+ * caller holds the region mutex.
+ *
+ * PUBLIC: void __rep_elect_done __P((ENV *, REP *));
+ */
+void
+__rep_elect_done(env, rep)
+ ENV *env;
+ REP *rep;
+{
+ int inelect;
+ db_timespec endtime;
+
+ inelect = IN_ELECTION(rep);
+ FLD_CLR(rep->elect_flags, REP_E_PHASE1 | REP_E_PHASE2 | REP_E_TALLY);
+
+ rep->sites = 0;
+ rep->votes = 0;
+ if (inelect) {
+ if (timespecisset(&rep->etime)) {
+ __os_gettime(env, &endtime, 1);
+ timespecsub(&endtime, &rep->etime);
+#ifdef HAVE_STATISTICS
+ rep->stat.st_election_sec = (u_int32_t)endtime.tv_sec;
+ rep->stat.st_election_usec = (u_int32_t)
+ (endtime.tv_nsec / NS_PER_US);
+#endif
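+			/*
+			 * Worked example: an election lasting 2.5 seconds
+			 * leaves endtime = {2, 500000000} after timespecsub,
+			 * recorded as st_election_sec = 2 and
+			 * st_election_usec = 500000.
+			 */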
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+ "Election finished in %lu.%09lu sec",
+ (u_long)endtime.tv_sec, (u_long)endtime.tv_nsec));
+ timespecclear(&rep->etime);
+ }
+ rep->egen++;
+ }
+ RPRINT(env, (env, DB_VERB_REP_ELECT,
+ "Election done; egen %lu", (u_long)rep->egen));
+}
+
+/*
+ * __env_rep_enter --
+ *
+ * Check if we are in the middle of replication initialization and/or
+ * recovery, and if so, disallow operations. If operations are allowed,
+ * increment handle-counts, so that we do not start recovery while we
+ * are operating in the library.
+ *
+ * PUBLIC: int __env_rep_enter __P((ENV *, int));
+ */
+int
+__env_rep_enter(env, checklock)
+ ENV *env;
+ int checklock;
+{
+ DB_REP *db_rep;
+ REGENV *renv;
+ REGINFO *infop;
+ REP *rep;
+ int cnt, ret;
+ time_t timestamp;
+
+ /* Check if locks have been globally turned off. */
+ if (F_ISSET(env->dbenv, DB_ENV_NOLOCKING))
+ return (0);
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ infop = env->reginfo;
+ renv = infop->primary;
+ if (checklock && F_ISSET(renv, DB_REGENV_REPLOCKED)) {
+ (void)time(&timestamp);
+ TIMESTAMP_CHECK(env, timestamp, renv);
+ /*
+ * Check if we're still locked out after checking
+ * the timestamp.
+ */
+ if (F_ISSET(renv, DB_REGENV_REPLOCKED))
+ return (EINVAL);
+ }
+
+ REP_SYSTEM_LOCK(env);
+ for (cnt = 0; FLD_ISSET(rep->lockout_flags, REP_LOCKOUT_API);) {
+ REP_SYSTEM_UNLOCK(env);
+ /*
+ * We're spinning - environment may be hung. Check if
+ * recovery has been initiated.
+ */
+ PANIC_CHECK(env);
+ if (FLD_ISSET(rep->config, REP_C_NOWAIT)) {
+ __db_errx(env, DB_STR("3504",
+ "Operation locked out. Waiting for replication lockout to complete"));
+ return (DB_REP_LOCKOUT);
+ }
+ __os_yield(env, 1, 0);
+ if (++cnt % 60 == 0 &&
+ (ret = __rep_show_progress(env,
+ DB_STR_P("DB_ENV handle"), cnt / 60)) != 0)
+ return (ret);
+ REP_SYSTEM_LOCK(env);
+ }
+ rep->handle_cnt++;
+ REP_SYSTEM_UNLOCK(env);
+
+ return (0);
+}
+
+static int
+__rep_show_progress(env, which, mins)
+ ENV *env;
+ const char *which;
+ int mins;
+{
+ DB_LOG *dblp;
+ LOG *lp;
+ REP *rep;
+ DB_LSN ready_lsn;
+
+ rep = env->rep_handle->region;
+ dblp = env->lg_handle;
+ lp = dblp == NULL ? NULL : dblp->reginfo.primary;
+
+#define WAITING_MSG DB_STR_A("3505", \
+ "%s waiting %d minutes for replication lockout to complete", "%s %d")
+#define WAITING_ARGS WAITING_MSG, which, mins
+
+ __db_errx(env, WAITING_ARGS);
+ RPRINT(env, (env, DB_VERB_REP_SYNC, WAITING_ARGS));
+
+ if (lp == NULL)
+ ZERO_LSN(ready_lsn);
+ else {
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ ready_lsn = lp->ready_lsn;
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ }
+ REP_SYSTEM_LOCK(env);
+ switch (rep->sync_state) {
+ case SYNC_PAGE:
+#define PAGE_MSG DB_STR_A("3506", \
+ "SYNC_PAGE: files %lu/%lu; pages %lu (%lu next)", "%lu %lu %lu %lu")
+#define PAGE_ARGS (u_long)rep->curfile, (u_long)rep->nfiles, \
+ (u_long)rep->npages, (u_long)rep->ready_pg
+ __db_errx(env, PAGE_MSG, PAGE_ARGS);
+ RPRINT(env, (env, DB_VERB_REP_SYNC, PAGE_MSG, PAGE_ARGS));
+ break;
+ case SYNC_LOG:
+#define LSN_ARG(lsn) (u_long)(lsn).file, (u_long)(lsn).offset
+#define LOG_LSN_ARGS LSN_ARG(ready_lsn), \
+ LSN_ARG(rep->first_lsn), LSN_ARG(rep->last_lsn)
+#ifdef HAVE_STATISTICS
+#define LOG_MSG DB_STR_A("3507", \
+ "SYNC_LOG: thru [%lu][%lu] from [%lu][%lu]/[%lu][%lu] (%lu queued)",\
+ "%lu %lu %lu %lu %lu %lu %lu")
+#define LOG_ARGS LOG_LSN_ARGS, (u_long)rep->stat.st_log_queued
+#else
+#define LOG_MSG DB_STR_A("3508", \
+ "SYNC_LOG: thru [%lu][%lu] from [%lu][%lu]/[%lu][%lu]", \
+ "%lu %lu %lu %lu %lu %lu")
+#define LOG_ARGS LOG_LSN_ARGS
+#endif
+ __db_errx(env, LOG_MSG, LOG_ARGS);
+ RPRINT(env, (env, DB_VERB_REP_SYNC, LOG_MSG, LOG_ARGS));
+ break;
+ default:
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "sync state %d", (int)rep->sync_state));
+ break;
+ }
+ REP_SYSTEM_UNLOCK(env);
+ return (0);
+}
+
+/*
+ * __env_db_rep_exit --
+ *
+ * Decrement handle count upon routine exit.
+ *
+ * PUBLIC: int __env_db_rep_exit __P((ENV *));
+ */
+int
+__env_db_rep_exit(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REP *rep;
+
+ /* Check if locks have been globally turned off. */
+ if (F_ISSET(env->dbenv, DB_ENV_NOLOCKING))
+ return (0);
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ REP_SYSTEM_LOCK(env);
+ rep->handle_cnt--;
+ REP_SYSTEM_UNLOCK(env);
+
+ return (0);
+}
+
+/*
+ * __db_rep_enter --
+ * Called in replicated environments to keep track of in-use handles
+ * and prevent any concurrent operation during recovery. If checkgen is
+ * non-zero, we verify that the dbp's replication timestamp matches the
+ * env's; if it doesn't, the environment has been rolled back or rebuilt
+ * since the handle was opened and we return DB_REP_HANDLE_DEAD.
+ *
+ * If return_now is non-zero, we'll return DB_LOCK_DEADLOCK immediately,
+ * else we'll sleep before returning DB_LOCK_DEADLOCK. Without the sleep,
+ * it is likely
+ * the application will immediately try again and could reach a retry
+ * limit before replication has a chance to finish. The sleep increases
+ * the probability that an application retry will succeed.
+ *
+ * Typically calls with txns set return_now so that we return immediately.
+ * We want to return immediately because we want the txn to abort ASAP
+ * so that the lockout can proceed.
+ *
+ * PUBLIC: int __db_rep_enter __P((DB *, int, int, int));
+ */
+int
+__db_rep_enter(dbp, checkgen, checklock, return_now)
+ DB *dbp;
+ int checkgen, checklock, return_now;
+{
+ DB_REP *db_rep;
+ ENV *env;
+ REGENV *renv;
+ REGINFO *infop;
+ REP *rep;
+ time_t timestamp;
+
+ env = dbp->env;
+ /* Check if locks have been globally turned off. */
+ if (F_ISSET(env->dbenv, DB_ENV_NOLOCKING))
+ return (0);
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ infop = env->reginfo;
+ renv = infop->primary;
+
+ if (checklock && F_ISSET(renv, DB_REGENV_REPLOCKED)) {
+ (void)time(&timestamp);
+ TIMESTAMP_CHECK(env, timestamp, renv);
+ /*
+ * Check if we're still locked out after checking
+ * the timestamp.
+ */
+ if (F_ISSET(renv, DB_REGENV_REPLOCKED))
+ return (EINVAL);
+ }
+
+ /*
+ * Return a dead handle if an internal handle is trying to
+ * get an exclusive lock on this database.
+ */
+ if (checkgen && dbp->mpf->mfp && IS_REP_CLIENT(env)) {
+ if (dbp->mpf->mfp->excl_lockout)
+ return (DB_REP_HANDLE_DEAD);
+ }
+
+ REP_SYSTEM_LOCK(env);
+ /*
+ * !!!
+ * Note, we are checking REP_LOCKOUT_OP, but we are
+ * incrementing rep->handle_cnt. That seems like a mismatch,
+ * but the intention is to return DEADLOCK to the application
+ * which will cause them to abort the txn quickly and allow
+ * the lockout to proceed.
+ *
+ * The correctness of doing this depends on the fact that
+ * lockout of the API always sets REP_LOCKOUT_OP first.
+ */
+ if (FLD_ISSET(rep->lockout_flags, REP_LOCKOUT_OP)) {
+ REP_SYSTEM_UNLOCK(env);
+ if (!return_now)
+ __os_yield(env, 5, 0);
+ return (DB_LOCK_DEADLOCK);
+ }
+
+ if (checkgen && dbp->timestamp != renv->rep_timestamp) {
+ REP_SYSTEM_UNLOCK(env);
+ return (DB_REP_HANDLE_DEAD);
+ }
+ rep->handle_cnt++;
+ REP_SYSTEM_UNLOCK(env);
+
+ return (0);
+}
+
+/*
+ * Check for permission to increment handle_cnt, and do so if possible. Used in
+ * cases where we want to count an operation in the context of a transaction,
+ * but the operation does not involve a DB handle.
+ *
+ * PUBLIC: int __op_handle_enter __P((ENV *));
+ */
+int
+__op_handle_enter(env)
+ ENV *env;
+{
+ REP *rep;
+ int ret;
+
+ rep = env->rep_handle->region;
+ REP_SYSTEM_LOCK(env);
+ if (FLD_ISSET(rep->lockout_flags, REP_LOCKOUT_OP))
+ ret = DB_LOCK_DEADLOCK;
+ else {
+ rep->handle_cnt++;
+ ret = 0;
+ }
+ REP_SYSTEM_UNLOCK(env);
+ return (ret);
+}
+
+/*
+ * __op_rep_enter --
+ *
+ * Check if we are in the middle of replication initialization and/or
+ * recovery, and if so, disallow new multi-step operations, such as
+ * transaction and memp gets. If operations are allowed,
+ * increment the op_cnt, so that we do not start recovery while we have
+ * active operations.
+ *
+ * PUBLIC: int __op_rep_enter __P((ENV *, int, int));
+ */
+int
+__op_rep_enter(env, local_nowait, obey_user)
+ ENV *env;
+ int local_nowait, obey_user;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ int cnt, ret;
+
+ /* Check if locks have been globally turned off. */
+ if (F_ISSET(env->dbenv, DB_ENV_NOLOCKING))
+ return (0);
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ REP_SYSTEM_LOCK(env);
+ for (cnt = 0; FLD_ISSET(rep->lockout_flags, REP_LOCKOUT_OP);) {
+ REP_SYSTEM_UNLOCK(env);
+ /*
+ * We're spinning - environment may be hung. Check if
+ * recovery has been initiated.
+ */
+ PANIC_CHECK(env);
+ if (local_nowait)
+ return (DB_REP_LOCKOUT);
+ if (FLD_ISSET(rep->config, REP_C_NOWAIT) && obey_user) {
+ __db_errx(env, DB_STR("3509",
+ "Operation locked out. Waiting for replication lockout to complete"));
+ return (DB_REP_LOCKOUT);
+ }
+ __os_yield(env, 5, 0);
+		cnt += 5;		/* We slept for 5 seconds. */
+		if (cnt % 60 == 0 &&
+ (ret = __rep_show_progress(env,
+ "__op_rep_enter", cnt / 60)) != 0)
+ return (ret);
+ REP_SYSTEM_LOCK(env);
+ }
+ rep->op_cnt++;
+ REP_SYSTEM_UNLOCK(env);
+
+ return (0);
+}
+
+/*
+ * __op_rep_exit --
+ *
+ * Decrement op count upon transaction commit/abort/discard or
+ * memp_fput.
+ *
+ * PUBLIC: int __op_rep_exit __P((ENV *));
+ */
+int
+__op_rep_exit(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REP *rep;
+
+ /* Check if locks have been globally turned off. */
+ if (F_ISSET(env->dbenv, DB_ENV_NOLOCKING))
+ return (0);
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ REP_SYSTEM_LOCK(env);
+ DB_ASSERT(env, rep->op_cnt > 0);
+ rep->op_cnt--;
+ REP_SYSTEM_UNLOCK(env);
+
+ return (0);
+}
+
+/*
+ * __archive_rep_enter --
+ * Used by log_archive to determine if it is okay to remove
+ * log files.
+ *
+ * PUBLIC: int __archive_rep_enter __P((ENV *));
+ */
+int
+__archive_rep_enter(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REGENV *renv;
+ REGINFO *infop;
+ REP *rep;
+ time_t timestamp;
+ int ret;
+
+ ret = 0;
+ infop = env->reginfo;
+ renv = infop->primary;
+
+ /*
+ * This is tested before REP_ON below because we always need
+ * to obey if any replication process has disabled archiving.
+ * Everything is in the environment region that we need here.
+ */
+ if (F_ISSET(renv, DB_REGENV_REPLOCKED)) {
+ (void)time(&timestamp);
+ TIMESTAMP_CHECK(env, timestamp, renv);
+ /*
+ * Check if we're still locked out after checking
+ * the timestamp.
+ */
+ if (F_ISSET(renv, DB_REGENV_REPLOCKED))
+ return (DB_REP_LOCKOUT);
+ }
+
+ if (!REP_ON(env))
+ return (0);
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ REP_SYSTEM_LOCK(env);
+ if (FLD_ISSET(rep->lockout_flags, REP_LOCKOUT_ARCHIVE))
+ ret = DB_REP_LOCKOUT;
+ else
+ rep->arch_th++;
+ REP_SYSTEM_UNLOCK(env);
+ return (ret);
+}
+
+/*
+ * __archive_rep_exit --
+ * Clean up accounting for log archive threads.
+ *
+ * PUBLIC: int __archive_rep_exit __P((ENV *));
+ */
+int
+__archive_rep_exit(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REP *rep;
+
+ if (!REP_ON(env))
+ return (0);
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ REP_SYSTEM_LOCK(env);
+ rep->arch_th--;
+ REP_SYSTEM_UNLOCK(env);
+ return (0);
+}
+
+/*
+ * __rep_lockout_archive --
+ * Coordinate with other threads archiving log files so that
+ * we can run and know that no log files will be removed out
+ * from underneath us.
+ * Assumes the caller holds the region mutex.
+ *
+ * PUBLIC: int __rep_lockout_archive __P((ENV *, REP *));
+ */
+int
+__rep_lockout_archive(env, rep)
+ ENV *env;
+ REP *rep;
+{
+ return (__rep_lockout_int(env, rep, &rep->arch_th, 0,
+ "arch_th", REP_LOCKOUT_ARCHIVE));
+}
+
+/*
+ * __rep_lockout_api --
+ * Coordinate with other threads in the library and active txns so
+ * that we can run single-threaded, for recovery or internal backup.
+ * Assumes the caller holds the region mutex.
+ *
+ * PUBLIC: int __rep_lockout_api __P((ENV *, REP *));
+ */
+int
+__rep_lockout_api(env, rep)
+ ENV *env;
+ REP *rep;
+{
+ int ret;
+
+ /*
+ * We must drain long-running operations first. We check
+ * REP_LOCKOUT_OP in __db_rep_enter in order to allow them
+ * to abort existing txns quickly. Therefore, we must
+ * always lockout REP_LOCKOUT_OP first, then REP_LOCKOUT_API.
+ */
+ if ((ret = __rep_lockout_int(env, rep, &rep->op_cnt, 0,
+ "op_cnt", REP_LOCKOUT_OP)) != 0)
+ return (ret);
+ if ((ret = __rep_lockout_int(env, rep, &rep->handle_cnt, 0,
+ "handle_cnt", REP_LOCKOUT_API)) != 0)
+ FLD_CLR(rep->lockout_flags, REP_LOCKOUT_OP);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __rep_take_apilockout __P((ENV *));
+ *
+ * For use by repmgr (keep the module boundaries reasonably clean).
+ */
+int
+__rep_take_apilockout(env)
+ ENV *env;
+{
+ REP *rep;
+ int ret;
+
+ rep = env->rep_handle->region;
+ REP_SYSTEM_LOCK(env);
+ ret = __rep_lockout_api(env, rep);
+ REP_SYSTEM_UNLOCK(env);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __rep_clear_apilockout __P((ENV *));
+ */
+int
+__rep_clear_apilockout(env)
+ ENV *env;
+{
+ REP *rep;
+
+ rep = env->rep_handle->region;
+
+ REP_SYSTEM_LOCK(env);
+ CLR_LOCKOUT_BDB(rep);
+ REP_SYSTEM_UNLOCK(env);
+ return (0);
+}
+
+/*
+ * __rep_lockout_apply --
+ * Coordinate with other threads processing messages so that
+ * we can run single-threaded and know that no incoming
+ * message can apply new log records.
+ * This call should be short-term covering a specific critical
+ * operation where we need to make sure no new records change
+ * the log. Currently used to coordinate with elections.
+ * Assumes the caller holds the region mutex.
+ *
+ * PUBLIC: int __rep_lockout_apply __P((ENV *, REP *, u_int32_t));
+ */
+int
+__rep_lockout_apply(env, rep, apply_th)
+ ENV *env;
+ REP *rep;
+ u_int32_t apply_th;
+{
+ return (__rep_lockout_int(env, rep, &rep->apply_th, apply_th,
+ "apply_th", REP_LOCKOUT_APPLY));
+}
+
+/*
+ * __rep_lockout_msg --
+ * Coordinate with other threads processing messages so that
+ * we can run single-threaded and know that no incoming
+ * message can change the world (i.e., like a NEWMASTER message).
+ * This call should be short-term covering a specific critical
+ * operation where we need to make sure no new messages arrive
+ * in the middle and all message threads are out before we start it.
+ * Assumes the caller holds the region mutex.
+ *
+ * PUBLIC: int __rep_lockout_msg __P((ENV *, REP *, u_int32_t));
+ */
+int
+__rep_lockout_msg(env, rep, msg_th)
+ ENV *env;
+ REP *rep;
+ u_int32_t msg_th;
+{
+ return (__rep_lockout_int(env, rep, &rep->msg_th, msg_th,
+ "msg_th", REP_LOCKOUT_MSG));
+}
+
+/*
+ * __rep_lockout_int --
+ * Internal common code for locking out and coordinating
+ * with other areas of the code.
+ * Assumes the caller holds the region mutex.
+ *
+ */
+static int
+__rep_lockout_int(env, rep, fieldp, field_val, msg, lockout_flag)
+ ENV *env;
+ REP *rep;
+ u_int32_t *fieldp;
+ const char *msg;
+ u_int32_t field_val, lockout_flag;
+{
+ int ret, wait_cnt;
+
+ FLD_SET(rep->lockout_flags, lockout_flag);
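+	/*
+	 * Descriptive note: once the lockout flag is set, no new thread can
+	 * enter the locked-out code path; we then wait for the count of
+	 * threads already inside (*fieldp) to drain down to field_val,
+	 * typically 0, or 1 when the caller itself is counted (as in the
+	 * __rep_lockout_msg(env, rep, 1) call from __rep_new_master).
+	 */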
+ for (wait_cnt = 0; *fieldp > field_val;) {
+ if ((ret = __rep_notify_threads(env, LOCKOUT)) != 0)
+ return (ret);
+ REP_SYSTEM_UNLOCK(env);
+		/*
+		 * We're spinning - environment may be hung. Check if
+		 * recovery has been initiated.
+		 */
+ PANIC_CHECK(env);
+ __os_yield(env, 1, 0);
+#ifdef DIAGNOSTIC
+ if (wait_cnt == 5) {
+ RPRINT(env, (env, DB_VERB_REP_MISC,
+ "Waiting for %s (%lu) to complete lockout to %lu",
+ msg, (u_long)*fieldp, (u_long)field_val));
+ __db_errx(env, DB_STR_A("3510",
+"Waiting for %s (%lu) to complete replication lockout",
+ "%s %lu"), msg, (u_long)*fieldp);
+ }
+ if (++wait_cnt % 60 == 0)
+ __db_errx(env, DB_STR_A("3511",
+"Waiting for %s (%lu) to complete replication lockout for %d minutes",
+ "%s %lu %d"), msg, (u_long)*fieldp, wait_cnt / 60);
+#endif
+ REP_SYSTEM_LOCK(env);
+ }
+
+ COMPQUIET(msg, NULL);
+ return (0);
+}
+
+/*
+ * __rep_send_throttle --
+ * Send a record, throttling if necessary. Callers of this function
+ * will throttle - breaking out of their loop, if the repth->type field
+ * changes from the normal message type to the *_MORE message type.
+ * This function will send the normal type unless throttling gets invoked.
+ * Then it sets the type field and sends the _MORE message.
+ *
+ * Throttling is always only relevant in serving requests, so we always send
+ * with REPCTL_RESEND. Additional desired flags can be passed in the ctlflags
+ * argument.
+ *
+ * PUBLIC: int __rep_send_throttle __P((ENV *, int, REP_THROTTLE *,
+ * PUBLIC: u_int32_t, u_int32_t));
+ */
+int
+__rep_send_throttle(env, eid, repth, flags, ctlflags)
+ ENV *env;
+ int eid;
+ REP_THROTTLE *repth;
+ u_int32_t ctlflags, flags;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ u_int32_t size, typemore;
+ int check_limit;
+
+ check_limit = repth->gbytes != 0 || repth->bytes != 0;
+ /*
+ * If we only want to do throttle processing and we don't have it
+ * turned on, return immediately.
+ */
+ if (!check_limit && LF_ISSET(REP_THROTTLE_ONLY))
+ return (0);
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ typemore = 0;
+ if (repth->type == REP_LOG)
+ typemore = REP_LOG_MORE;
+ if (repth->type == REP_PAGE)
+ typemore = REP_PAGE_MORE;
+ DB_ASSERT(env, typemore != 0);
+
+ /*
+ * data_dbt.size is only the size of the log
+ * record; it doesn't count the size of the
+ * control structure. Factor that in as well
+ * so we're not off by a lot if our log records
+ * are small.
+ */
+ size = repth->data_dbt->size + sizeof(__rep_control_args);
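+	/*
+	 * Worked example of the accounting below: with a remaining budget
+	 * of gbytes = 1 and bytes = 100, a 64KB record first converts the
+	 * gigabyte into bytes (bytes += GIGABYTE, gbytes = 0), then sends
+	 * and deducts its size from bytes. Only once both gbytes and bytes
+	 * are exhausted do we switch to the *_MORE type, which tells the
+	 * caller to break out of its send loop.
+	 */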
+ if (check_limit) {
+ while (repth->bytes <= size) {
+ if (repth->gbytes > 0) {
+ repth->bytes += GIGABYTE;
+ --(repth->gbytes);
+ continue;
+ }
+ /*
+ * We don't hold the rep mutex,
+ * and may miscount.
+ */
+ STAT(rep->stat.st_nthrottles++);
+ repth->type = typemore;
+ goto snd;
+ }
+ repth->bytes -= size;
+ }
+ /*
+ * Always send if it is typemore, otherwise send only if
+ * REP_THROTTLE_ONLY is not set.
+ *
+ * NOTE: It is the responsibility of the caller to marshal, if
+ * needed, the data_dbt. This function just sends what it is given.
+ */
+snd: if ((repth->type == typemore || !LF_ISSET(REP_THROTTLE_ONLY)) &&
+ (__rep_send_message(env, eid, repth->type,
+ &repth->lsn, repth->data_dbt, (REPCTL_RESEND | ctlflags), 0) != 0))
+ return (DB_REP_UNAVAIL);
+ return (0);
+}
+
+/*
+ * __rep_msg_to_old --
+ * Convert current message numbers to old message numbers.
+ *
+ * PUBLIC: u_int32_t __rep_msg_to_old __P((u_int32_t, u_int32_t));
+ */
+u_int32_t
+__rep_msg_to_old(version, rectype)
+ u_int32_t version, rectype;
+{
+ /*
+ * We need to convert from current message numbers to old numbers and
+ * we need to convert from old numbers to current numbers. Offset by
+ * one for more readable code.
+ */
+ /*
+	 * Everything for version 0 is invalid; there is no version 0.
+ */
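+	/*
+	 * Worked example (values taken from the table below): under
+	 * DB_REPVERSION 3 (4.4/4.5), REP_LOG (current number 11) converts
+	 * to old number 10, and REP_LEASE_GRANT converts to REP_INVALID
+	 * because leases did not exist in 4.4/4.5.
+	 */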
+ static const u_int32_t table[DB_REPVERSION][REP_MAX_MSG+1] = {
+ /* There is no DB_REPVERSION 0. */
+ { REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID },
+ /*
+ * 4.2/DB_REPVERSION 1 no longer supported.
+ */
+ { REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID },
+ /*
+ * 4.3/DB_REPVERSION 2 no longer supported.
+ */
+ { REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID },
+ /*
+ * From 4.7 message number To 4.4/4.5 message number
+ */
+ { REP_INVALID, /* NO message 0 */
+ 1, /* REP_ALIVE */
+ 2, /* REP_ALIVE_REQ */
+ 3, /* REP_ALL_REQ */
+ 4, /* REP_BULK_LOG */
+ 5, /* REP_BULK_PAGE */
+ 6, /* REP_DUPMASTER */
+ 7, /* REP_FILE */
+ 8, /* REP_FILE_FAIL */
+ 9, /* REP_FILE_REQ */
+ REP_INVALID, /* REP_LEASE_GRANT */
+ 10, /* REP_LOG */
+ 11, /* REP_LOG_MORE */
+ 12, /* REP_LOG_REQ */
+ 13, /* REP_MASTER_REQ */
+ 14, /* REP_NEWCLIENT */
+ 15, /* REP_NEWFILE */
+ 16, /* REP_NEWMASTER */
+ 17, /* REP_NEWSITE */
+ 18, /* REP_PAGE */
+ 19, /* REP_PAGE_FAIL */
+ 20, /* REP_PAGE_MORE */
+ 21, /* REP_PAGE_REQ */
+ 22, /* REP_REREQUEST */
+ REP_INVALID, /* REP_START_SYNC */
+ 23, /* REP_UPDATE */
+ 24, /* REP_UPDATE_REQ */
+ 25, /* REP_VERIFY */
+ 26, /* REP_VERIFY_FAIL */
+ 27, /* REP_VERIFY_REQ */
+ 28, /* REP_VOTE1 */
+ 29 /* REP_VOTE2 */
+ },
+ /*
+ * From 4.7 message number To 4.6 message number. There are
+ * NO message differences between 4.6 and 4.7. The
+ * control structure changed.
+ */
+ { REP_INVALID, /* NO message 0 */
+ 1, /* REP_ALIVE */
+ 2, /* REP_ALIVE_REQ */
+ 3, /* REP_ALL_REQ */
+ 4, /* REP_BULK_LOG */
+ 5, /* REP_BULK_PAGE */
+ 6, /* REP_DUPMASTER */
+ 7, /* REP_FILE */
+ 8, /* REP_FILE_FAIL */
+ 9, /* REP_FILE_REQ */
+ 10, /* REP_LEASE_GRANT */
+ 11, /* REP_LOG */
+ 12, /* REP_LOG_MORE */
+ 13, /* REP_LOG_REQ */
+ 14, /* REP_MASTER_REQ */
+ 15, /* REP_NEWCLIENT */
+ 16, /* REP_NEWFILE */
+ 17, /* REP_NEWMASTER */
+ 18, /* REP_NEWSITE */
+ 19, /* REP_PAGE */
+ 20, /* REP_PAGE_FAIL */
+ 21, /* REP_PAGE_MORE */
+ 22, /* REP_PAGE_REQ */
+ 23, /* REP_REREQUEST */
+ 24, /* REP_START_SYNC */
+ 25, /* REP_UPDATE */
+ 26, /* REP_UPDATE_REQ */
+ 27, /* REP_VERIFY */
+ 28, /* REP_VERIFY_FAIL */
+ 29, /* REP_VERIFY_REQ */
+ 30, /* REP_VOTE1 */
+ 31 /* REP_VOTE2 */
+ },
+ /*
+ * From 5.2 message number To 4.7 message number. There are
+ * NO message differences between 4.7 and 5.2. The
+ * content of vote1 changed.
+ */
+ { REP_INVALID, /* NO message 0 */
+ 1, /* REP_ALIVE */
+ 2, /* REP_ALIVE_REQ */
+ 3, /* REP_ALL_REQ */
+ 4, /* REP_BULK_LOG */
+ 5, /* REP_BULK_PAGE */
+ 6, /* REP_DUPMASTER */
+ 7, /* REP_FILE */
+ 8, /* REP_FILE_FAIL */
+ 9, /* REP_FILE_REQ */
+ 10, /* REP_LEASE_GRANT */
+ 11, /* REP_LOG */
+ 12, /* REP_LOG_MORE */
+ 13, /* REP_LOG_REQ */
+ 14, /* REP_MASTER_REQ */
+ 15, /* REP_NEWCLIENT */
+ 16, /* REP_NEWFILE */
+ 17, /* REP_NEWMASTER */
+ 18, /* REP_NEWSITE */
+ 19, /* REP_PAGE */
+ 20, /* REP_PAGE_FAIL */
+ 21, /* REP_PAGE_MORE */
+ 22, /* REP_PAGE_REQ */
+ 23, /* REP_REREQUEST */
+ 24, /* REP_START_SYNC */
+ 25, /* REP_UPDATE */
+ 26, /* REP_UPDATE_REQ */
+ 27, /* REP_VERIFY */
+ 28, /* REP_VERIFY_FAIL */
+ 29, /* REP_VERIFY_REQ */
+ 30, /* REP_VOTE1 */
+ 31 /* REP_VOTE2 */
+ },
+ /*
+ * From 5.3 message number To 4.7 message number. There are
+ * NO message differences between 4.7 and 5.3. The
+ * content of fileinfo changed.
+ */
+ { REP_INVALID, /* NO message 0 */
+ 1, /* REP_ALIVE */
+ 2, /* REP_ALIVE_REQ */
+ 3, /* REP_ALL_REQ */
+ 4, /* REP_BULK_LOG */
+ 5, /* REP_BULK_PAGE */
+ 6, /* REP_DUPMASTER */
+ 7, /* REP_FILE */
+ 8, /* REP_FILE_FAIL */
+ 9, /* REP_FILE_REQ */
+ 10, /* REP_LEASE_GRANT */
+ 11, /* REP_LOG */
+ 12, /* REP_LOG_MORE */
+ 13, /* REP_LOG_REQ */
+ 14, /* REP_MASTER_REQ */
+ 15, /* REP_NEWCLIENT */
+ 16, /* REP_NEWFILE */
+ 17, /* REP_NEWMASTER */
+ 18, /* REP_NEWSITE */
+ 19, /* REP_PAGE */
+ 20, /* REP_PAGE_FAIL */
+ 21, /* REP_PAGE_MORE */
+ 22, /* REP_PAGE_REQ */
+ 23, /* REP_REREQUEST */
+ 24, /* REP_START_SYNC */
+ 25, /* REP_UPDATE */
+ 26, /* REP_UPDATE_REQ */
+ 27, /* REP_VERIFY */
+ 28, /* REP_VERIFY_FAIL */
+ 29, /* REP_VERIFY_REQ */
+ 30, /* REP_VOTE1 */
+ 31 /* REP_VOTE2 */
+ }
+ };
+ return (table[version][rectype]);
+}
+
+/*
+ * __rep_msg_from_old --
+ * Convert old message numbers to current message numbers.
+ *
+ * PUBLIC: u_int32_t __rep_msg_from_old __P((u_int32_t, u_int32_t));
+ */
+u_int32_t
+__rep_msg_from_old(version, rectype)
+ u_int32_t version, rectype;
+{
+ /*
+ * We need to convert from current message numbers to old numbers and
+ * we need to convert from old numbers to current numbers. Offset by
+ * one for more readable code.
+ */
+ /*
+	 * Everything for version 0 is invalid; there is no version 0.
+ */
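+	/*
+	 * Worked example (values taken from the table below): under
+	 * DB_REPVERSION 3 (4.4/4.5), old message number 10 (REP_LOG in
+	 * that version's numbering) converts to the current number 11.
+	 */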
+ static const u_int32_t table[DB_REPVERSION][REP_MAX_MSG+1] = {
+ /* There is no DB_REPVERSION 0. */
+ { REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID },
+ /*
+ * 4.2/DB_REPVERSION 1 no longer supported.
+ */
+ { REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID },
+ /*
+ * 4.3/DB_REPVERSION 2 no longer supported.
+ */
+ { REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID },
+ /*
+ * From 4.4/4.5 message number To 4.7 message number
+ */
+ { REP_INVALID, /* NO message 0 */
+ 1, /* 1, REP_ALIVE */
+ 2, /* 2, REP_ALIVE_REQ */
+ 3, /* 3, REP_ALL_REQ */
+ 4, /* 4, REP_BULK_LOG */
+ 5, /* 5, REP_BULK_PAGE */
+ 6, /* 6, REP_DUPMASTER */
+ 7, /* 7, REP_FILE */
+ 8, /* 8, REP_FILE_FAIL */
+ 9, /* 9, REP_FILE_REQ */
+ /* 10, REP_LEASE_GRANT doesn't exist */
+ 11, /* 10, REP_LOG */
+ 12, /* 11, REP_LOG_MORE */
+ 13, /* 12, REP_LOG_REQ */
+ 14, /* 13, REP_MASTER_REQ */
+ 15, /* 14, REP_NEWCLIENT */
+ 16, /* 15, REP_NEWFILE */
+ 17, /* 16, REP_NEWMASTER */
+ 18, /* 17, REP_NEWSITE */
+ 19, /* 18, REP_PAGE */
+ 20, /* 19, REP_PAGE_FAIL */
+ 21, /* 20, REP_PAGE_MORE */
+ 22, /* 21, REP_PAGE_REQ */
+ 23, /* 22, REP_REREQUEST */
+ /* 24, REP_START_SYNC doesn't exist */
+ 25, /* 23, REP_UPDATE */
+ 26, /* 24, REP_UPDATE_REQ */
+ 27, /* 25, REP_VERIFY */
+ 28, /* 26, REP_VERIFY_FAIL */
+ 29, /* 27, REP_VERIFY_REQ */
+ 30, /* 28, REP_VOTE1 */
+ 31, /* 29, REP_VOTE2 */
+ REP_INVALID, /* 30, 4.4/4.5 no message */
+ REP_INVALID /* 31, 4.4/4.5 no message */
+ },
+ /*
+ * From 4.6 message number To 4.7 message number. There are
+ * NO message differences between 4.6 and 4.7. The
+ * control structure changed.
+ */
+ { REP_INVALID, /* NO message 0 */
+ 1, /* 1, REP_ALIVE */
+ 2, /* 2, REP_ALIVE_REQ */
+ 3, /* 3, REP_ALL_REQ */
+ 4, /* 4, REP_BULK_LOG */
+ 5, /* 5, REP_BULK_PAGE */
+ 6, /* 6, REP_DUPMASTER */
+ 7, /* 7, REP_FILE */
+ 8, /* 8, REP_FILE_FAIL */
+ 9, /* 9, REP_FILE_REQ */
+ 10, /* 10, REP_LEASE_GRANT */
+ 11, /* 11, REP_LOG */
+ 12, /* 12, REP_LOG_MORE */
+ 13, /* 13, REP_LOG_REQ */
+ 14, /* 14, REP_MASTER_REQ */
+ 15, /* 15, REP_NEWCLIENT */
+ 16, /* 16, REP_NEWFILE */
+ 17, /* 17, REP_NEWMASTER */
+ 18, /* 18, REP_NEWSITE */
+ 19, /* 19, REP_PAGE */
+ 20, /* 20, REP_PAGE_FAIL */
+ 21, /* 21, REP_PAGE_MORE */
+ 22, /* 22, REP_PAGE_REQ */
+	    23,			/* 23, REP_REREQUEST */
+ 24, /* 24, REP_START_SYNC */
+ 25, /* 25, REP_UPDATE */
+ 26, /* 26, REP_UPDATE_REQ */
+ 27, /* 27, REP_VERIFY */
+ 28, /* 28, REP_VERIFY_FAIL */
+ 29, /* 29, REP_VERIFY_REQ */
+ 30, /* 30, REP_VOTE1 */
+ 31 /* 31, REP_VOTE2 */
+ },
+ /*
+ * From 4.7 message number To 5.2 message number. There are
+ * NO message differences between them. The vote1 contents
+ * changed.
+ */
+ { REP_INVALID, /* NO message 0 */
+ 1, /* 1, REP_ALIVE */
+ 2, /* 2, REP_ALIVE_REQ */
+ 3, /* 3, REP_ALL_REQ */
+ 4, /* 4, REP_BULK_LOG */
+ 5, /* 5, REP_BULK_PAGE */
+ 6, /* 6, REP_DUPMASTER */
+ 7, /* 7, REP_FILE */
+ 8, /* 8, REP_FILE_FAIL */
+ 9, /* 9, REP_FILE_REQ */
+ 10, /* 10, REP_LEASE_GRANT */
+ 11, /* 11, REP_LOG */
+ 12, /* 12, REP_LOG_MORE */
+ 13, /* 13, REP_LOG_REQ */
+ 14, /* 14, REP_MASTER_REQ */
+ 15, /* 15, REP_NEWCLIENT */
+ 16, /* 16, REP_NEWFILE */
+ 17, /* 17, REP_NEWMASTER */
+ 18, /* 18, REP_NEWSITE */
+ 19, /* 19, REP_PAGE */
+ 20, /* 20, REP_PAGE_FAIL */
+ 21, /* 21, REP_PAGE_MORE */
+ 22, /* 22, REP_PAGE_REQ */
+	    23,			/* 23, REP_REREQUEST */
+ 24, /* 24, REP_START_SYNC */
+ 25, /* 25, REP_UPDATE */
+ 26, /* 26, REP_UPDATE_REQ */
+ 27, /* 27, REP_VERIFY */
+ 28, /* 28, REP_VERIFY_FAIL */
+ 29, /* 29, REP_VERIFY_REQ */
+ 30, /* 30, REP_VOTE1 */
+ 31 /* 31, REP_VOTE2 */
+ },
+ /*
+ * From 4.7 message number To 5.3 message number. There are
+ * NO message differences between them. The fileinfo contents
+ * changed.
+ */
+ { REP_INVALID, /* NO message 0 */
+ 1, /* 1, REP_ALIVE */
+ 2, /* 2, REP_ALIVE_REQ */
+ 3, /* 3, REP_ALL_REQ */
+ 4, /* 4, REP_BULK_LOG */
+ 5, /* 5, REP_BULK_PAGE */
+ 6, /* 6, REP_DUPMASTER */
+ 7, /* 7, REP_FILE */
+ 8, /* 8, REP_FILE_FAIL */
+ 9, /* 9, REP_FILE_REQ */
+ 10, /* 10, REP_LEASE_GRANT */
+ 11, /* 11, REP_LOG */
+ 12, /* 12, REP_LOG_MORE */
+ 13, /* 13, REP_LOG_REQ */
+ 14, /* 14, REP_MASTER_REQ */
+ 15, /* 15, REP_NEWCLIENT */
+ 16, /* 16, REP_NEWFILE */
+ 17, /* 17, REP_NEWMASTER */
+ 18, /* 18, REP_NEWSITE */
+ 19, /* 19, REP_PAGE */
+ 20, /* 20, REP_PAGE_FAIL */
+ 21, /* 21, REP_PAGE_MORE */
+ 22, /* 22, REP_PAGE_REQ */
+	    23,			/* 23, REP_REREQUEST */
+ 24, /* 24, REP_START_SYNC */
+ 25, /* 25, REP_UPDATE */
+ 26, /* 26, REP_UPDATE_REQ */
+ 27, /* 27, REP_VERIFY */
+ 28, /* 28, REP_VERIFY_FAIL */
+ 29, /* 29, REP_VERIFY_REQ */
+ 30, /* 30, REP_VOTE1 */
+ 31 /* 31, REP_VOTE2 */
+ }
+ };
+ return (table[version][rectype]);
+}
+
+/*
+ * __rep_print_system --
+ * Optionally print a verbose message, including to the system file.
+ *
+ * PUBLIC: int __rep_print_system __P((ENV *, u_int32_t, const char *, ...))
+ * PUBLIC: __attribute__ ((__format__ (__printf__, 3, 4)));
+ */
+int
+#ifdef STDC_HEADERS
+__rep_print_system(ENV *env, u_int32_t verbose, const char *fmt, ...)
+#else
+__rep_print_system(env, verbose, fmt, va_alist)
+ ENV *env;
+ u_int32_t verbose;
+ const char *fmt;
+ va_dcl
+#endif
+{
+ va_list ap;
+ int ret;
+
+#ifdef STDC_HEADERS
+ va_start(ap, fmt);
+#else
+ va_start(ap);
+#endif
+ ret = __rep_print_int(env, verbose | DB_VERB_REP_SYSTEM, fmt, ap);
+ va_end(ap);
+ return (ret);
+}
+
+/*
+ * __rep_print --
+ * Optionally print a verbose message.
+ *
+ * PUBLIC: int __rep_print __P((ENV *, u_int32_t, const char *, ...))
+ * PUBLIC: __attribute__ ((__format__ (__printf__, 3, 4)));
+ */
+int
+#ifdef STDC_HEADERS
+__rep_print(ENV *env, u_int32_t verbose, const char *fmt, ...)
+#else
+__rep_print(env, verbose, fmt, va_alist)
+ ENV *env;
+ u_int32_t verbose;
+ const char *fmt;
+ va_dcl
+#endif
+{
+ va_list ap;
+ int ret;
+
+#ifdef STDC_HEADERS
+ va_start(ap, fmt);
+#else
+ va_start(ap);
+#endif
+ ret = __rep_print_int(env, verbose, fmt, ap);
+ va_end(ap);
+ return (ret);
+}
+
+/*
+ * __rep_print_int --
+ * Optionally print a verbose message.
+ *
+ * NOTE:
+ * One anomaly is that the messaging functions are expected to be
+ * void functions, but the use of a mutex in __rep_print_int requires
+ * a return value.
+ */
+static int
+__rep_print_int(env, verbose, fmt, ap)
+ ENV *env;
+ u_int32_t verbose;
+ const char *fmt;
+ va_list ap;
+{
+ DB_MSGBUF mb;
+ REP *rep;
+ db_timespec ts;
+ pid_t pid;
+ db_threadid_t tid;
+ int diag_msg;
+ u_int32_t regular_msg, tmp_verbose;
+ const char *s;
+ char buf[DB_THREADID_STRLEN];
+
+ tmp_verbose = env->dbenv->verbose;
+ if (FLD_ISSET(tmp_verbose, verbose | DB_VERB_REPLICATION) == 0)
+ return (0);
+ DB_MSGBUF_INIT(&mb);
+
+ diag_msg = 0;
+ if (REP_ON(env)) {
+ rep = env->rep_handle->region;
+ /*
+ * If system diag messages are configured and this message's
+ * verbose level includes DB_VERB_REP_SYSTEM, this is a diag
+ * message. This means it will be written to the diagnostic
+ * message files.
+ */
+ diag_msg = FLD_ISSET(tmp_verbose, DB_VERB_REP_SYSTEM) &&
+ FLD_ISSET(verbose, DB_VERB_REP_SYSTEM) &&
+ !FLD_ISSET(rep->config, REP_C_INMEM);
+ } else
+ rep = NULL;
+ /*
+ * We need to know if this message should be printed out
+ * via the regular, user mechanism.
+ */
+ FLD_CLR(tmp_verbose, DB_VERB_REP_SYSTEM);
+ regular_msg = FLD_ISSET(tmp_verbose,
+ verbose | DB_VERB_REPLICATION);
+
+ /*
+	 * It is possible we are called before the env has finished being
+	 * set up; skip printing in that case.
+ */
+ if (diag_msg == 0 && regular_msg == 0)
+ return (0);
+ s = NULL;
+ if (env->dbenv->db_errpfx != NULL)
+ s = env->dbenv->db_errpfx;
+ else if (rep != NULL) {
+ if (F_ISSET(rep, REP_F_CLIENT))
+ s = "CLIENT";
+ else if (F_ISSET(rep, REP_F_MASTER))
+ s = "MASTER";
+ }
+ if (s == NULL)
+ s = "REP_UNDEF";
+ __os_id(env->dbenv, &pid, &tid);
+ if (diag_msg)
+ MUTEX_LOCK(env, rep->mtx_diag);
+ __os_gettime(env, &ts, 1);
+ __db_msgadd(env, &mb, "[%lu:%lu][%s] %s: ",
+ (u_long)ts.tv_sec, (u_long)ts.tv_nsec/NS_PER_US,
+ env->dbenv->thread_id_string(env->dbenv, pid, tid, buf), s);
+
+ __db_msgadd_ap(env, &mb, fmt, ap);
+
+ DB_MSGBUF_REP_FLUSH(env, &mb, diag_msg, regular_msg);
+ if (diag_msg)
+ MUTEX_UNLOCK(env, rep->mtx_diag);
+ return (0);
+}
+
+/*
+ * PUBLIC: void __rep_print_message
+ * PUBLIC: __P((ENV *, int, __rep_control_args *, char *, u_int32_t));
+ */
+void
+__rep_print_message(env, eid, rp, str, flags)
+ ENV *env;
+ int eid;
+ __rep_control_args *rp;
+ char *str;
+ u_int32_t flags;
+{
+ u_int32_t ctlflags, rectype, verbflag;
+ char ftype[64], *home, *type;
+
+ rectype = rp->rectype;
+ ctlflags = rp->flags;
+ verbflag = DB_VERB_REP_MSGS | DB_VERB_REPLICATION;
+ if (rp->rep_version != DB_REPVERSION)
+ rectype = __rep_msg_from_old(rp->rep_version, rectype);
+ switch (rectype) {
+ case REP_ALIVE:
+ FLD_SET(verbflag, DB_VERB_REP_ELECT | DB_VERB_REP_MISC);
+ type = "alive";
+ break;
+ case REP_ALIVE_REQ:
+ type = "alive_req";
+ break;
+ case REP_ALL_REQ:
+ FLD_SET(verbflag, DB_VERB_REP_MISC);
+ type = "all_req";
+ break;
+ case REP_BULK_LOG:
+ FLD_SET(verbflag, DB_VERB_REP_MISC);
+ type = "bulk_log";
+ break;
+ case REP_BULK_PAGE:
+ FLD_SET(verbflag, DB_VERB_REP_SYNC);
+ type = "bulk_page";
+ break;
+ case REP_DUPMASTER:
+ FLD_SET(verbflag, DB_VERB_REP_SYSTEM);
+ type = "dupmaster";
+ break;
+ case REP_FILE:
+ type = "file";
+ break;
+ case REP_FILE_FAIL:
+ type = "file_fail";
+ break;
+ case REP_FILE_REQ:
+ type = "file_req";
+ break;
+ case REP_LEASE_GRANT:
+ FLD_SET(verbflag, DB_VERB_REP_LEASE);
+ type = "lease_grant";
+ break;
+ case REP_LOG:
+ FLD_SET(verbflag, DB_VERB_REP_MISC);
+ type = "log";
+ break;
+ case REP_LOG_MORE:
+ FLD_SET(verbflag, DB_VERB_REP_MISC);
+ type = "log_more";
+ break;
+ case REP_LOG_REQ:
+ FLD_SET(verbflag, DB_VERB_REP_MISC);
+ type = "log_req";
+ break;
+ case REP_MASTER_REQ:
+ type = "master_req";
+ break;
+ case REP_NEWCLIENT:
+ FLD_SET(verbflag, DB_VERB_REP_MISC | DB_VERB_REP_SYSTEM);
+ type = "newclient";
+ break;
+ case REP_NEWFILE:
+ FLD_SET(verbflag, DB_VERB_REP_MISC);
+ type = "newfile";
+ break;
+ case REP_NEWMASTER:
+ FLD_SET(verbflag, DB_VERB_REP_MISC | DB_VERB_REP_SYSTEM);
+ type = "newmaster";
+ break;
+ case REP_NEWSITE:
+ type = "newsite";
+ break;
+ case REP_PAGE:
+ FLD_SET(verbflag, DB_VERB_REP_SYNC);
+ type = "page";
+ break;
+ case REP_PAGE_FAIL:
+ FLD_SET(verbflag, DB_VERB_REP_SYNC);
+ type = "page_fail";
+ break;
+ case REP_PAGE_MORE:
+ FLD_SET(verbflag, DB_VERB_REP_SYNC);
+ type = "page_more";
+ break;
+ case REP_PAGE_REQ:
+ FLD_SET(verbflag, DB_VERB_REP_SYNC);
+ type = "page_req";
+ break;
+ case REP_REREQUEST:
+ type = "rerequest";
+ break;
+ case REP_START_SYNC:
+ FLD_SET(verbflag, DB_VERB_REP_MISC);
+ type = "start_sync";
+ break;
+ case REP_UPDATE:
+ FLD_SET(verbflag, DB_VERB_REP_SYNC | DB_VERB_REP_SYSTEM);
+ type = "update";
+ break;
+ case REP_UPDATE_REQ:
+ FLD_SET(verbflag, DB_VERB_REP_SYNC | DB_VERB_REP_SYSTEM);
+ type = "update_req";
+ break;
+ case REP_VERIFY:
+ FLD_SET(verbflag, DB_VERB_REP_SYNC | DB_VERB_REP_SYSTEM);
+ type = "verify";
+ break;
+ case REP_VERIFY_FAIL:
+ FLD_SET(verbflag, DB_VERB_REP_SYNC | DB_VERB_REP_SYSTEM);
+ type = "verify_fail";
+ break;
+ case REP_VERIFY_REQ:
+ FLD_SET(verbflag, DB_VERB_REP_SYNC | DB_VERB_REP_SYSTEM);
+ type = "verify_req";
+ break;
+ case REP_VOTE1:
+ FLD_SET(verbflag, DB_VERB_REP_ELECT | DB_VERB_REP_SYSTEM);
+ type = "vote1";
+ break;
+ case REP_VOTE2:
+ FLD_SET(verbflag, DB_VERB_REP_ELECT | DB_VERB_REP_SYSTEM);
+ type = "vote2";
+ break;
+ default:
+ type = "NOTYPE";
+ break;
+ }
+
+ /*
+ * !!!
+	 * If adding new flags to print out, make sure the aggregate
+	 * length cannot overflow the buffer.
+ */
+ ftype[0] = '\0';
+ if (LF_ISSET(DB_REP_ANYWHERE))
+ (void)strcat(ftype, " any"); /* 4 */
+ if (FLD_ISSET(ctlflags, REPCTL_FLUSH))
+ (void)strcat(ftype, " flush"); /* 10 */
+ /*
+	 * We expect that most of the time the messages will indicate
+	 * group membership, so we only print a flag when the message
+	 * does not.
+ */
+ if (!FLD_ISSET(ctlflags, REPCTL_GROUP_ESTD))
+ (void)strcat(ftype, " nogroup"); /* 18 */
+ if (FLD_ISSET(ctlflags, REPCTL_LEASE))
+ (void)strcat(ftype, " lease"); /* 24 */
+ if (LF_ISSET(DB_REP_NOBUFFER))
+ (void)strcat(ftype, " nobuf"); /* 30 */
+ if (FLD_ISSET(ctlflags, REPCTL_PERM))
+ (void)strcat(ftype, " perm"); /* 35 */
+ if (LF_ISSET(DB_REP_REREQUEST))
+ (void)strcat(ftype, " rereq"); /* 41 */
+ if (FLD_ISSET(ctlflags, REPCTL_RESEND))
+ (void)strcat(ftype, " resend"); /* 48 */
+ if (FLD_ISSET(ctlflags, REPCTL_LOG_END))
+ (void)strcat(ftype, " logend"); /* 55 */
+
+ /*
+ * !!!
+ * We selectively turned on bits using different verbose settings
+ * that relate to each message type. Therefore, since the
+ * DB_VERB_REP_SYSTEM flag is explicitly set above when wanted,
+ * we *must* use the VPRINT macro here. It will correctly
+ * handle the messages whether or not the SYSTEM flag is set.
+ */
+ if ((home = env->db_home) == NULL)
+ home = "NULL";
+ VPRINT(env, (env, verbflag,
+ "%s %s: msgv = %lu logv %lu gen = %lu eid %d, type %s, LSN [%lu][%lu] %s",
+ home, str,
+ (u_long)rp->rep_version, (u_long)rp->log_version, (u_long)rp->gen,
+ eid, type, (u_long)rp->lsn.file, (u_long)rp->lsn.offset, ftype));
+ /*
+ * Make sure the version is close, and not swapped
+ * here. Check for current version, +/- a little bit.
+ */
+ DB_ASSERT(env, rp->rep_version <= DB_REPVERSION+10);
+ DB_ASSERT(env, rp->log_version <= DB_LOGVERSION+10);
+}
+
+/*
+ * PUBLIC: void __rep_fire_event __P((ENV *, u_int32_t, void *));
+ */
+void
+__rep_fire_event(env, event, info)
+ ENV *env;
+ u_int32_t event;
+ void *info;
+{
+ int ret;
+
+ /*
+ * Give repmgr first crack at handling all replication-related events.
+ * If it can't (or chooses not to) handle the event fully, then pass it
+ * along to the application.
+ */
+ ret = __repmgr_handle_event(env, event, info);
+ DB_ASSERT(env, ret == 0 || ret == DB_EVENT_NOT_HANDLED);
+
+ if (ret == DB_EVENT_NOT_HANDLED)
+ DB_EVENT(env, event, info);
+}
+
+/*
+ * __rep_msg --
+ * Rep system diagnostic messaging routine.
+ * This function is called from the __db_msg subsystem to
+ * write out diagnostic messages to replication-owned files.
+ *
+ * PUBLIC: void __rep_msg __P((const ENV *, const char *));
+ */
+void
+__rep_msg(env, msg)
+ const ENV *env;
+ const char *msg;
+{
+ DB_FH *fhp;
+ DB_REP *db_rep;
+ REP *rep;
+ int i;
+ size_t cnt, nlcnt;
+ char nl = '\n';
+
+ if (PANIC_ISSET(env))
+ return;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ DB_ASSERT((ENV *)env, !FLD_ISSET(rep->config, REP_C_INMEM));
+ /*
+	 * We know the only way we get here is with the mutex locked, so
+	 * we can safely read and modify all the diag-related fields.
+ */
+ i = rep->diag_index;
+ fhp = db_rep->diagfile[i];
+
+ if (db_rep->diag_off != rep->diag_off)
+ (void)__os_seek((ENV *)env, fhp, 0, 0, rep->diag_off);
+ if (__os_write((ENV *)env, fhp, (void *)msg, strlen(msg), &cnt) != 0)
+ return;
+ if (__os_write((ENV *)env, fhp, &nl, 1, &nlcnt) != 0)
+ return;
+ db_rep->diag_off = rep->diag_off += (cnt + nlcnt);
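+	/*
+	 * Descriptive note: db_rep->diag_off is this process's cached copy
+	 * of the shared rep->diag_off; keeping the two equal lets us skip
+	 * the __os_seek above when no other process wrote in between.
+	 */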
+ /*
+ * If writing this message put us over the file size threshold,
+	 * then we rotate to the next file. We don't care whether we land
+	 * exactly at the size; some amount over the threshold is fine.
+ */
+ if (rep->diag_off >= REP_DIAGSIZE) {
+ rep->diag_index = (++i % DBREP_DIAG_FILES);
+ rep->diag_off = 0;
+ }
+ return;
+}
+
+/*
+ * PUBLIC: int __rep_notify_threads __P((ENV *, rep_waitreason_t));
+ *
+ * Caller must hold rep region mutex. In the AWAIT_LSN case, caller must also
+ * hold mtx_clientdb.
+ */
+int
+__rep_notify_threads(env, wake_reason)
+ ENV *env;
+ rep_waitreason_t wake_reason;
+{
+ REP *rep;
+ struct __rep_waiter *waiter;
+ struct rep_waitgoal *goal;
+ int ret, wake;
+
+ ret = 0;
+ rep = env->rep_handle->region;
+
+ SH_TAILQ_FOREACH(waiter, &rep->waiters, links, __rep_waiter) {
+ goal = &waiter->goal;
+ wake = 0;
+ if (wake_reason == LOCKOUT) {
+ F_SET(waiter, REP_F_PENDING_LOCKOUT);
+ wake = 1;
+ } else if (wake_reason == goal->why ||
+ (goal->why == AWAIT_HISTORY && wake_reason == AWAIT_LSN)) {
+ /*
+ * It's important that we only call __rep_check_goal
+ * with "goals" that match the wake_reason passed to us
+ * (modulo the LSN-to-HISTORY equivalence), because the
+ * caller has ensured that it is holding the appropriate
+ * mutexes depending on the wake_reason.
+ */
+ if ((ret = __rep_check_goal(env, goal)) == 0)
+ wake = 1;
+ else if (ret == DB_TIMEOUT)
+ ret = 0;
+ else
+ goto out;
+ }
+
+ if (wake) {
+ MUTEX_UNLOCK(env, waiter->mtx_repwait);
+ SH_TAILQ_REMOVE(&rep->waiters,
+ waiter, links, __rep_waiter);
+ F_SET(waiter, REP_F_WOKEN);
+ }
+ }
+
+out:
+ return (ret);
+}
+
+/*
+ * A "wait goal" describes a condition that a thread may be waiting for.
+ * Evaluate the condition, returning 0 if the condition has been satisfied, and
+ * DB_TIMEOUT if not.
+ *
+ * Caller must hold REP_SYSTEM lock and/or mtx_clientdb as appropriate.
+ *
+ * PUBLIC: int __rep_check_goal __P((ENV *, struct rep_waitgoal *));
+ */
+int
+__rep_check_goal(env, goal)
+ ENV *env;
+ struct rep_waitgoal *goal;
+{
+ REP *rep;
+ LOG *lp;
+ int ret;
+
+ rep = env->rep_handle->region;
+ lp = env->lg_handle->reginfo.primary;
+ ret = DB_TIMEOUT; /* Pessimistic, to start. */
+
+ /*
+ * Note that while AWAIT_LSN and AWAIT_HISTORY look similar, they are
+ * actually quite different. With AWAIT_LSN, the u.lsn is the LSN of
+ * the commit of the transaction the caller is waiting for. So we need
+ * to make sure we have gotten at least that far, thus ">=".
+ *
+ * For AWAIT_HISTORY, the u.lsn is simply a copy of whatever the current
+	 * max_perm_lsn was at the time we last checked. So if we have
+	 * anything *beyond* that, we should wake up again and check to see
+	 * if we now have the desired history (thus ">"). Thus when we're
+ * waiting for HISTORY we're going to get woken *at every commit we
+ * receive*! Fortunately it should be coming as the first transaction
+ * after the gen change, and waiting for HISTORY should be extremely
+ * rare anyway.
+ */
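+	/*
+	 * Illustrative example: if max_perm_lsn is [5][2000], an AWAIT_LSN
+	 * goal of [5][2000] is satisfied (">="), while an AWAIT_HISTORY
+	 * goal recorded at [5][2000] keeps waiting until some later LSN
+	 * arrives (">").
+	 */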
+ switch (goal->why) {
+ case AWAIT_LSN:
+ /* Have we reached our goal LSN? */
+ if (LOG_COMPARE(&lp->max_perm_lsn, &goal->u.lsn) >= 0)
+ ret = 0;
+ break;
+ case AWAIT_HISTORY:
+ /*
+ * Have we made any progress whatsoever, beyond where we were at
+ * the time the waiting thread noted the current LSN?
+ * When we have to wait for replication of the LSN history
+ * database, we don't know what LSN it's going to occur at. So
+ * we have to wake up every time we get a new transaction.
+ * Fortunately, this should be exceedingly rare, and the number
+ * of transactions we have to plow through should almost never
+ * be more than 1.
+ */
+ if (LOG_COMPARE(&lp->max_perm_lsn, &goal->u.lsn) > 0)
+ ret = 0;
+ break;
+ case AWAIT_GEN:
+ if (rep->gen >= goal->u.gen)
+ ret = 0;
+ break;
+ case AWAIT_NIMDB:
+ if (F_ISSET(rep, REP_F_NIMDBS_LOADED))
+ ret = 0;
+ break;
+ default:
+ DB_ASSERT(env, 0);
+ }
+ return (ret);
+}
+
+/*
+ * __rep_log_backup --
+ *
+ * Walk backwards in the log looking for specific kinds of records.
+ *
+ * PUBLIC: int __rep_log_backup __P((ENV *, DB_LOGC *, DB_LSN *, u_int32_t));
+ */
+int
+__rep_log_backup(env, logc, lsn, match)
+ ENV *env;
+ DB_LOGC *logc;
+ DB_LSN *lsn;
+ u_int32_t match;
+{
+ DBT mylog;
+ u_int32_t rectype;
+ int ret;
+
+ ret = 0;
+ memset(&mylog, 0, sizeof(mylog));
+ while ((ret = __logc_get(logc, lsn, &mylog, DB_PREV)) == 0) {
+ LOGCOPY_32(env, &rectype, mylog.data);
+ /*
+ * Check the record type against the desired match type(s).
+ */
+ if ((match == REP_REC_COMMIT &&
+ rectype == DB___txn_regop) ||
+ (match == REP_REC_PERM &&
+ (rectype == DB___txn_ckp || rectype == DB___txn_regop)))
+ break;
+ }
+ return (ret);
+}
+
+/*
+ * __rep_get_maxpermlsn --
+ *
+ * Safely retrieve the current max_perm_lsn value.
+ *
+ * PUBLIC: int __rep_get_maxpermlsn __P((ENV *, DB_LSN *));
+ */
+int
+__rep_get_maxpermlsn(env, max_perm_lsnp)
+ ENV *env;
+ DB_LSN *max_perm_lsnp;
+{
+ DB_LOG *dblp;
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ LOG *lp;
+ REP *rep;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ ENV_ENTER(env, ip);
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ *max_perm_lsnp = lp->max_perm_lsn;
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ ENV_LEAVE(env, ip);
+ return (0);
+}
+
+/*
+ * __rep_is_internal_rep_file --
+ *
+ * Return 1 if filename is an internal replication file; 0 otherwise.
+ * Works for all internal replication files including internal database
+ * files.
+ *
+ * PUBLIC: int __rep_is_internal_rep_file __P((char *));
+ */
+int
+__rep_is_internal_rep_file(filename)
+ char *filename;
+{
+ return (strncmp(filename,
+ REPFILEPREFIX, sizeof(REPFILEPREFIX) - 1) == 0 ? 1 : 0);
+}
+
+/*
+ * Get the last generation number from the LSN history database.
+ *
+ * PUBLIC: int __rep_get_datagen __P((ENV *, u_int32_t *));
+ */
+int
+__rep_get_datagen(env, data_genp)
+ ENV *env;
+ u_int32_t *data_genp;
+{
+ DB_REP *db_rep;
+ DB_TXN *txn;
+ DB *dbp;
+ DBC *dbc;
+ __rep_lsn_hist_key_args key;
+ u_int8_t key_buf[__REP_LSN_HIST_KEY_SIZE];
+ u_int8_t data_buf[__REP_LSN_HIST_DATA_SIZE];
+ DBT key_dbt, data_dbt;
+ u_int32_t flags;
+ int ret, t_ret, tries;
+
+ db_rep = env->rep_handle;
+ ret = 0;
+ *data_genp = 0;
+ tries = 0;
+ flags = DB_LAST;
+retry:
+ if ((ret = __txn_begin(env, NULL, NULL, &txn, DB_IGNORE_LEASE)) != 0)
+ return (ret);
+
+ if ((dbp = db_rep->lsn_db) == NULL) {
+ if ((ret = __rep_open_sysdb(env,
+ NULL, txn, REPLSNHIST, 0, &dbp)) != 0) {
+ /*
+			 * If the database isn't there, it could be because it's
+			 * memory-resident and we haven't yet sync'ed with the
+			 * master to materialize it, or because this is a brand
+			 * new environment. Either way a datagen of 0 is
+			 * correct; it is not an error.
+ */
+ ret = 0;
+ goto out;
+ }
+ db_rep->lsn_db = dbp;
+ }
+
+ if ((ret = __db_cursor(dbp, NULL, txn, &dbc, 0)) != 0)
+ goto out;
+
+ DB_INIT_DBT(key_dbt, key_buf, __REP_LSN_HIST_KEY_SIZE);
+ key_dbt.ulen = __REP_LSN_HIST_KEY_SIZE;
+ F_SET(&key_dbt, DB_DBT_USERMEM);
+
+ memset(&data_dbt, 0, sizeof(data_dbt));
+ data_dbt.data = data_buf;
+ data_dbt.ulen = __REP_LSN_HIST_DATA_SIZE;
+ F_SET(&data_dbt, DB_DBT_USERMEM);
+ if ((ret = __dbc_get(dbc, &key_dbt, &data_dbt, flags)) != 0) {
+ if ((ret == DB_LOCK_DEADLOCK || ret == DB_LOCK_NOTGRANTED) &&
+ ++tries < 5) /* Limit of 5 is an arbitrary choice. */
+ ret = 0;
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __txn_abort(txn)) != 0 && ret == 0)
+ ret = t_ret;
+ /*
+ * If we have any kind of error at this point, bail.
+ * Otherwise pause and try again.
+ */
+ if (ret != 0)
+ goto err;
+ __os_yield(env, 0, 10000); /* Arbitrary duration. */
+ goto retry;
+ }
+ if ((ret = __dbc_close(dbc)) == 0 &&
+ (ret = __rep_lsn_hist_key_unmarshal(env,
+ &key, key_buf, __REP_LSN_HIST_KEY_SIZE, NULL)) == 0)
+ *data_genp = key.gen;
+out:
+ if ((t_ret = __txn_commit(txn, DB_TXN_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+err:
+ return (ret);
+}
diff --git a/src/rep/rep_verify.c b/src/rep/rep_verify.c
new file mode 100644
index 00000000..5238f900
--- /dev/null
+++ b/src/rep/rep_verify.c
@@ -0,0 +1,751 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2004, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/txn.h"
+
+static int __rep_internal_init __P((ENV *, u_int32_t));
+
+/*
+ * __rep_verify --
+ * Handle a REP_VERIFY message.
+ *
+ * PUBLIC: int __rep_verify __P((ENV *, __rep_control_args *, DBT *,
+ * PUBLIC: int, time_t));
+ */
+int
+__rep_verify(env, rp, rec, eid, savetime)
+ ENV *env;
+ __rep_control_args *rp;
+ DBT *rec;
+ int eid;
+ time_t savetime;
+{
+ DBT mylog;
+ DB_LOG *dblp;
+ DB_LOGC *logc;
+ DB_LSN lsn, prev_ckp;
+ DB_REP *db_rep;
+ LOG *lp;
+ REP *rep;
+ __txn_ckp_args *ckp_args;
+ u_int32_t logflag, rectype;
+ int master, match, ret, t_ret;
+
+ ret = 0;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ /* Do nothing if VERIFY is not set. */
+ if (rep->sync_state != SYNC_VERIFY)
+ return (ret);
+
+#ifdef DIAGNOSTIC
+ /*
+ * We should not ever be in internal init with a lease granted.
+ */
+ if (IS_USING_LEASES(env)) {
+ REP_SYSTEM_LOCK(env);
+ DB_ASSERT(env, __rep_islease_granted(env) == 0);
+ REP_SYSTEM_UNLOCK(env);
+ }
+#endif
+
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ return (ret);
+ memset(&mylog, 0, sizeof(mylog));
+	/* If a verify_lsn of ZERO is passed in, get the last log record. */
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ logflag = IS_ZERO_LSN(lp->verify_lsn) ? DB_LAST : DB_SET;
+ prev_ckp = lp->prev_ckp;
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ if ((ret = __logc_get(logc, &rp->lsn, &mylog, logflag)) != 0)
+ goto out;
+ match = 0;
+ if (mylog.size == rec->size &&
+ memcmp(mylog.data, rec->data, rec->size) == 0)
+ match = 1;
+ /*
+	 * If we don't have a match, back up to the previous
+ * identification record and try again.
+ */
+ if (match == 0) {
+ master = rep->master_id;
+ /*
+ * We will eventually roll back over this log record (unless we
+ * ultimately have to give up and do an internal init). So, if
+ * it was a checkpoint, make sure we don't end up without any
+ * checkpoints left in the entire log.
+ */
+ LOGCOPY_32(env, &rectype, mylog.data);
+ DB_ASSERT(env, ret == 0);
+ if (!lp->db_log_inmemory && rectype == DB___txn_ckp) {
+ if ((ret = __txn_ckp_read(env,
+ mylog.data, &ckp_args)) != 0)
+ goto out;
+ lsn = ckp_args->last_ckp;
+ __os_free(env, ckp_args);
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ lp->prev_ckp = lsn;
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ if (IS_ZERO_LSN(lsn)) {
+ /*
+ * No previous checkpoints? The only way this
+ * is OK is if we have the entire log, all the
+ * way back to file #1.
+ */
+ if ((ret = __logc_get(logc,
+ &lsn, &mylog, DB_FIRST)) != 0)
+ goto out;
+ if (lsn.file != 1) {
+ ret = __rep_internal_init(env, 0);
+ goto out;
+ }
+
+ /* Restore position of log cursor. */
+ if ((ret = __logc_get(logc,
+ &rp->lsn, &mylog, DB_SET)) != 0)
+ goto out;
+ }
+ }
+ if ((ret = __rep_log_backup(env, logc, &lsn,
+ REP_REC_PERM)) == 0) {
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ lp->verify_lsn = lsn;
+ __os_gettime(env, &lp->rcvd_ts, 1);
+ lp->wait_ts = rep->request_gap;
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ if (master != DB_EID_INVALID)
+ eid = master;
+ (void)__rep_send_message(env, eid, REP_VERIFY_REQ,
+ &lsn, NULL, 0, DB_REP_ANYWHERE);
+ } else if (ret == DB_NOTFOUND) {
+ /*
+ * We've either run out of records because
+ * logs have been removed or we've rolled back
+ * all the way to the beginning.
+ */
+ ret = __rep_internal_init(env, 0);
+ }
+ } else {
+ /*
+ * We have a match, so we can probably do a simple sync, without
+ * needing internal init. But first, check for a couple of
+ * special cases.
+ */
+
+ if (!lp->db_log_inmemory && !IS_ZERO_LSN(prev_ckp)) {
+ /*
+ * We previously saw a checkpoint, which means we may
+ * now be about to roll back over it and lose it. Make
+ * sure we'll end up still having at least one other
+ * checkpoint. (Note that if the current record -- the
+ * one we've just matched -- happens to be a checkpoint,
+ * then it must be the same as the prev_ckp we're now
+ * about to try reading. Which means we wouldn't really
+ * have to read it. But checking for that special case
+ * doesn't seem worth the trouble.)
+ */
+ if ((ret = __logc_get(logc,
+ &prev_ckp, &mylog, DB_SET)) != 0) {
+ if (ret == DB_NOTFOUND)
+ ret = __rep_internal_init(env, 0);
+ goto out;
+ }
+ /*
+			 * We succeeded in reading the prev_ckp record, so it's
+			 * safe to fall through to the verify_match.
+ */
+ }
+ /*
+ * Mixed version internal init doesn't work with 4.4, so we
+		 * can't load NIMDBs from a very old-version master. Fib to
+		 * ourselves that they're already loaded, so that we don't try.
+ */
+ if (rep->version == DB_REPVERSION_44) {
+ REP_SYSTEM_LOCK(env);
+ F_SET(rep, REP_F_NIMDBS_LOADED);
+ REP_SYSTEM_UNLOCK(env);
+ }
+ if (F_ISSET(rep, REP_F_NIMDBS_LOADED))
+ ret = __rep_verify_match(env, &rp->lsn, savetime);
+ else {
+ /*
+ * Even though we found a match, we haven't yet loaded
+ * any NIMDBs, so we have to do an abbreviated internal
+ * init. We leave lp->verify_lsn set to the matching
+ * sync point, in case upon eventual examination of the
+ * UPDATE message it turns out there are no NIMDBs
+ * (since we can then skip back to a verify_match
+ * outcome).
+ */
+ ret = __rep_internal_init(env, REP_F_ABBREVIATED);
+ }
+ }
+
+out: if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+static int
+__rep_internal_init(env, abbrev)
+ ENV *env;
+ u_int32_t abbrev;
+{
+ REP *rep;
+ int master, ret;
+
+ rep = env->rep_handle->region;
+ REP_SYSTEM_LOCK(env);
+#ifdef HAVE_STATISTICS
+ if (!abbrev)
+ rep->stat.st_outdated++;
+#endif
+
+ /*
+ * What we call "abbreviated internal init" is really just NIMDB
+ * materialization, and we always do that even if AUTOINIT has been
+ * turned off.
+ */
+ if (!FLD_ISSET(rep->config, REP_C_AUTOINIT) && !abbrev)
+ ret = DB_REP_JOIN_FAILURE;
+ else {
+ rep->sync_state = SYNC_UPDATE;
+ if (abbrev) {
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "send UPDATE_REQ, merely to check for NIMDB refresh"));
+ F_SET(rep, REP_F_ABBREVIATED);
+ } else
+ F_CLR(rep, REP_F_ABBREVIATED);
+ ZERO_LSN(rep->first_lsn);
+ ZERO_LSN(rep->ckp_lsn);
+ ret = 0;
+ }
+ master = rep->master_id;
+ REP_SYSTEM_UNLOCK(env);
+ if (ret == 0 && master != DB_EID_INVALID)
+ (void)__rep_send_message(env,
+ master, REP_UPDATE_REQ, NULL, NULL, 0, 0);
+ return (ret);
+}
+
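+/*
+ * Application-side sketch (editor's illustration, not part of this
+ * file): disabling automatic internal init through the public API,
+ * after which the code above returns DB_REP_JOIN_FAILURE rather than
+ * starting one.  Assumes an open DB_ENV handle; error handling elided.
+ */
+#if 0
+	/* Turn off automatic internal initialization... */
+	(void)dbenv->rep_set_config(dbenv, DB_REP_CONF_AUTOINIT, 0);
+	/* ...and be prepared for DB_REP_JOIN_FAILURE from rep calls. */
+#endif
+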
+/*
+ * __rep_verify_fail --
+ * Handle a REP_VERIFY_FAIL message.
+ *
+ * PUBLIC: int __rep_verify_fail __P((ENV *, __rep_control_args *));
+ */
+int
+__rep_verify_fail(env, rp)
+ ENV *env;
+ __rep_control_args *rp;
+{
+ DB_LOG *dblp;
+ DB_REP *db_rep;
+ LOG *lp;
+ REP *rep;
+ int clnt_lock_held, lockout, master, ret;
+
+ clnt_lock_held = lockout = 0;
+ ret = 0;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ /*
+ * If we are already in the middle of updating (PAGE or UPDATE state),
+ * then we ignore this message.
+ */
+ if (rep->sync_state == SYNC_PAGE || rep->sync_state == SYNC_UPDATE)
+ return (0);
+ REP_SYSTEM_LOCK(env);
+ /*
+ * We should not ever be in internal init with a lease granted.
+ */
+ DB_ASSERT(env,
+ !IS_USING_LEASES(env) || __rep_islease_granted(env) == 0);
+
+ /*
+ * Clean up old internal init in progress if:
+ * REP_C_AUTOINIT is configured and
+ * we are recovering LOG and this LSN is in the range we need.
+ */
+ if (rep->sync_state == SYNC_LOG &&
+ LOG_COMPARE(&rep->first_lsn, &rp->lsn) <= 0 &&
+ LOG_COMPARE(&rep->last_lsn, &rp->lsn) >= 0) {
+ /*
+ * Already locking out messages, give up.
+ */
+ if (FLD_ISSET(rep->lockout_flags, REP_LOCKOUT_MSG))
+ goto unlock;
+
+ /*
+ * Lock out other messages to prevent race conditions.
+ */
+ if ((ret = __rep_lockout_msg(env, rep, 1)) != 0)
+ goto unlock;
+ lockout = 1;
+
+ /*
+ * Clean up internal init if one was in progress.
+ */
+ if (ISSET_LOCKOUT_BDB(rep)) {
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "VERIFY_FAIL is cleaning up old internal init for missing log"));
+ if ((ret =
+ __rep_init_cleanup(env, rep, DB_FORCE)) != 0) {
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "VERIFY_FAIL error cleaning up internal init for missing log: %d", ret));
+ goto msglck;
+ }
+ CLR_RECOVERY_SETTINGS(rep);
+ }
+ FLD_CLR(rep->lockout_flags, REP_LOCKOUT_MSG);
+ lockout = 0;
+ }
+
+ REP_SYSTEM_UNLOCK(env);
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ clnt_lock_held = 1;
+ REP_SYSTEM_LOCK(env);
+ /*
+ * Commence an internal init if:
+ * We are in VERIFY state and the failing LSN is the one we
+ * were verifying or
+ * we're recovering LOG and this LSN is in the range we need or
+ * we are in normal state (no recovery flags set) and
+ * the failing LSN is the one we're ready for.
+ *
+ * We don't want an old or delayed VERIFY_FAIL message to throw us
+ * into internal initialization when we shouldn't be.
+ */
+ if ((rep->sync_state == SYNC_VERIFY &&
+ LOG_COMPARE(&rp->lsn, &lp->verify_lsn) == 0) ||
+ (rep->sync_state == SYNC_LOG &&
+ LOG_COMPARE(&rep->first_lsn, &rp->lsn) <= 0 &&
+ LOG_COMPARE(&rep->last_lsn, &rp->lsn) >= 0) ||
+ (rep->sync_state == SYNC_OFF &&
+ LOG_COMPARE(&rp->lsn, &lp->ready_lsn) >= 0)) {
+ /*
+ * Update stats.
+ */
+ STAT(rep->stat.st_outdated++);
+
+ /*
+ * If REP_C_AUTOINIT is turned off, return
+ * DB_REP_JOIN_FAILURE instead of doing internal init.
+ */
+ if (!FLD_ISSET(rep->config, REP_C_AUTOINIT)) {
+ ret = DB_REP_JOIN_FAILURE;
+ goto unlock;
+ }
+
+ /*
+ * Do the internal init.
+ */
+ rep->sync_state = SYNC_UPDATE;
+ ZERO_LSN(rep->first_lsn);
+ ZERO_LSN(rep->ckp_lsn);
+ lp->wait_ts = rep->request_gap;
+ master = rep->master_id;
+ REP_SYSTEM_UNLOCK(env);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ if (master != DB_EID_INVALID)
+ (void)__rep_send_message(env,
+ master, REP_UPDATE_REQ, NULL, NULL, 0, 0);
+ } else {
+ /*
+ * Otherwise ignore this message.
+ */
+msglck: if (lockout)
+ FLD_CLR(rep->lockout_flags, REP_LOCKOUT_MSG);
+unlock: REP_SYSTEM_UNLOCK(env);
+ if (clnt_lock_held)
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ }
+ return (ret);
+}
+
+/*
+ * __rep_verify_req --
+ * Handle a REP_VERIFY_REQ message.
+ *
+ * PUBLIC: int __rep_verify_req __P((ENV *, __rep_control_args *, int));
+ */
+int
+__rep_verify_req(env, rp, eid)
+ ENV *env;
+ __rep_control_args *rp;
+ int eid;
+{
+ DBT *d, data_dbt;
+ DB_LOGC *logc;
+ DB_REP *db_rep;
+ REP *rep;
+ u_int32_t type;
+ int old, ret;
+
+ ret = 0;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ type = REP_VERIFY;
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ return (ret);
+ d = &data_dbt;
+ memset(d, 0, sizeof(data_dbt));
+ F_SET(logc, DB_LOG_SILENT_ERR);
+ ret = __logc_get(logc, &rp->lsn, d, DB_SET);
+ /*
+	 * If the LSN was invalid, we might get a DB_NOTFOUND, we might
+	 * get an EIO, we could get anything.
+	 * If we get a DB_NOTFOUND, there is a chance that the LSN comes
+	 * before the first file present, in which case we need to send a
+	 * VERIFY_FAIL so that the client can perform an internal init or
+	 * return a DB_REP_JOIN_FAILURE.
+ *
+ * If we're a client servicing this request and we get a
+ * NOTFOUND, return it so the caller can rerequest from
+ * a better source.
+ */
+ if (ret == DB_NOTFOUND) {
+ if (F_ISSET(rep, REP_F_CLIENT)) {
+ (void)__logc_close(logc);
+ return (DB_NOTFOUND);
+ }
+ if (__log_is_outdated(env, rp->lsn.file, &old) == 0 &&
+ old != 0)
+ type = REP_VERIFY_FAIL;
+ }
+
+ if (ret != 0)
+ d = NULL;
+
+ (void)__rep_send_message(env, eid, type, &rp->lsn, d, 0, 0);
+ return (__logc_close(logc));
+}
+
+/*
+ * PUBLIC: int __rep_dorecovery __P((ENV *, DB_LSN *, DB_LSN *));
+ */
+int
+__rep_dorecovery(env, lsnp, trunclsnp)
+ ENV *env;
+ DB_LSN *lsnp, *trunclsnp;
+{
+ DBT mylog;
+ DB_LOGC *logc;
+ DB_LSN last_ckp, lsn;
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ REP *rep;
+ int ret, rollback, skip_rec, t_ret, update;
+ u_int32_t rectype, opcode;
+ __txn_regop_args *txnrec;
+ __txn_regop_42_args *txn42rec;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ ENV_GET_THREAD_INFO(env, ip);
+
+ /* Figure out if we are backing out any committed transactions. */
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ return (ret);
+
+ memset(&mylog, 0, sizeof(mylog));
+ if (rep->sync_state == SYNC_LOG) {
+ /*
+ * Internal init can never skip recovery.
+ * Internal init must always update the timestamp and
+ * force dead handles.
+ */
+ skip_rec = 0;
+ update = 1;
+ } else {
+ skip_rec = 1;
+ update = 0;
+ }
+ rollback = 0;
+ while (update == 0 &&
+ (ret = __logc_get(logc, &lsn, &mylog, DB_PREV)) == 0 &&
+ LOG_COMPARE(&lsn, lsnp) > 0) {
+ LOGCOPY_32(env, &rectype, mylog.data);
+ /*
+ * Find out if we can skip recovery completely. If we
+ * are backing up over any record a client usually
+ * cares about, we must run recovery.
+ *
+ * Skipping sync-up recovery can be pretty scary!
+ * Here's why we can do it:
+ * If a master downgraded to client and is now running
+ * sync-up to a new master, that old master must have
+ * waited for any outstanding txns to resolve before
+ * becoming a client. Also we are in lockout so there
+ * can be no other operations right now.
+ *
+ * If the client wrote a commit record to the log, but
+ * was descheduled before processing the txn, and then
+ * a new master was found, we must've let the txn get
+ * processed because right now we are the only message
+ * thread allowed to be running.
+ */
+ DB_ASSERT(env, rep->op_cnt == 0);
+ DB_ASSERT(env, rep->msg_th == 1);
+ if (rectype == DB___txn_regop || rectype == DB___txn_ckp ||
+ rectype == DB___dbreg_register)
+ skip_rec = 0;
+ if (rectype == DB___txn_regop) {
+ if (rep->version >= DB_REPVERSION_44) {
+ if ((ret = __txn_regop_read(
+ env, mylog.data, &txnrec)) != 0)
+ goto err;
+ opcode = txnrec->opcode;
+ __os_free(env, txnrec);
+ } else {
+ if ((ret = __txn_regop_42_read(
+ env, mylog.data, &txn42rec)) != 0)
+ goto err;
+ opcode = txn42rec->opcode;
+ __os_free(env, txn42rec);
+ }
+ if (opcode != TXN_ABORT) {
+ rollback = 1;
+ update = 1;
+ }
+ }
+ }
+ /*
+	 * Handle the case where the __logc_get loop above failed.
+ */
+ if (ret != 0)
+ goto err;
+
+ /*
+ * If we successfully run recovery, we've opened all the necessary
+ * files. We are guaranteed to be single-threaded here, so no mutex
+ * is necessary.
+ */
+ if (skip_rec) {
+ if ((ret = __log_get_stable_lsn(env, &last_ckp, 0)) != 0) {
+ if (ret != DB_NOTFOUND)
+ goto err;
+ ZERO_LSN(last_ckp);
+ }
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "Skip sync-up rec. Truncate log to [%lu][%lu], ckp [%lu][%lu]",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (u_long)last_ckp.file, (u_long)last_ckp.offset));
+ ret = __log_vtruncate(env, lsnp, &last_ckp, trunclsnp);
+ } else {
+ if (rollback && !FLD_ISSET(rep->config, REP_C_AUTOROLLBACK)) {
+ ret = DB_REP_WOULDROLLBACK;
+ goto err;
+ }
+ ret = __db_apprec(env, ip, lsnp, trunclsnp, update, 0);
+ }
+
+ if (ret != 0)
+ goto err;
+ F_SET(db_rep, DBREP_OPENFILES);
+
+ /*
+ * If we've just updated the env handle timestamp, then we would get
+ * HANDLE_DEAD next time we tried to use our LSN history database. So,
+ * close it here now, to save ourselves the trouble of worrying about it
+ * later.
+ */
+ if (update && db_rep->lsn_db != NULL) {
+ ret = __db_close(db_rep->lsn_db, NULL, DB_NOSYNC);
+ db_rep->lsn_db = NULL;
+ }
+
+err: if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __rep_verify_match --
+ * We have just received a matching log record during verification.
+ * Figure out if we're going to need to run recovery. If so, wait until
+ * everything else has exited the library. If not, set up the world
+ * correctly and move forward.
+ *
+ * PUBLIC: int __rep_verify_match __P((ENV *, DB_LSN *, time_t));
+ */
+int
+__rep_verify_match(env, reclsnp, savetime)
+ ENV *env;
+ DB_LSN *reclsnp;
+ time_t savetime;
+{
+ DB_LOG *dblp;
+ DB_LSN trunclsn;
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ LOG *lp;
+ REGENV *renv;
+ REGINFO *infop;
+ REP *rep;
+ int done, event, master, ret;
+ u_int32_t unused;
+
+ dblp = env->lg_handle;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ lp = dblp->reginfo.primary;
+ ret = 0;
+ event = 0;
+ infop = env->reginfo;
+ renv = infop->primary;
+ ENV_GET_THREAD_INFO(env, ip);
+
+ /*
+	 * Check if the savetime is different from our current timestamp.
+ * If it is, then we're racing with another thread trying to recover
+ * and we lost. We must give up.
+ */
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ done = savetime != renv->rep_timestamp;
+ if (done) {
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ return (0);
+ }
+ ZERO_LSN(lp->verify_lsn);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+
+ /*
+ * Make sure the world hasn't changed while we tried to get
+ * the lock. If it hasn't then it's time for us to kick all
+ * operations out of DB and run recovery.
+ */
+ REP_SYSTEM_LOCK(env);
+ if (FLD_ISSET(rep->lockout_flags, REP_LOCKOUT_MSG) ||
+ (rep->sync_state != SYNC_LOG &&
+ ISSET_LOCKOUT_BDB(rep))) {
+ /*
+ * We lost. The world changed and we should do nothing.
+ */
+ STAT(rep->stat.st_msgs_recover++);
+ goto errunlock;
+ }
+
+ /*
+ * Lockout all message threads but ourselves.
+ */
+ if ((ret = __rep_lockout_msg(env, rep, 1)) != 0)
+ goto errunlock;
+
+ /*
+ * Lockout the API and wait for operations to complete.
+ */
+ if ((ret = __rep_lockout_api(env, rep)) != 0)
+ goto errunlock;
+
+ /* OK, everyone is out, we can now run recovery. */
+ REP_SYSTEM_UNLOCK(env);
+
+ if ((ret = __rep_dorecovery(env, reclsnp, &trunclsn)) != 0 ||
+ (ret = __rep_remove_init_file(env)) != 0) {
+ REP_SYSTEM_LOCK(env);
+ FLD_CLR(rep->lockout_flags,
+ REP_LOCKOUT_API | REP_LOCKOUT_MSG | REP_LOCKOUT_OP);
+ goto errunlock;
+ }
+
+ /*
+	 * The log has been truncated (either directly by us or by __db_apprec).
+ * We want to make sure we're waiting for the LSN at the new end-of-log,
+ * not some later point.
+ */
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ lp->ready_lsn = trunclsn;
+ ZERO_LSN(lp->waiting_lsn);
+ ZERO_LSN(lp->max_wait_lsn);
+ lp->max_perm_lsn = *reclsnp;
+ lp->wait_ts = rep->request_gap;
+ __os_gettime(env, &lp->rcvd_ts, 1);
+ ZERO_LSN(lp->verify_lsn);
+ ZERO_LSN(lp->prev_ckp);
+
+ /*
+ * Discard any log records we have queued; we're about to re-request
+ * them, and can't trust the ones in the queue. We need to set the
+ * DB_AM_RECOVER bit in this handle, so that the operation doesn't
+ * deadlock.
+ */
+ if (db_rep->rep_db == NULL &&
+ (ret = __rep_client_dbinit(env, 0, REP_DB)) != 0) {
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ goto out;
+ }
+
+ F_SET(db_rep->rep_db, DB_AM_RECOVER);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ ret = __db_truncate(db_rep->rep_db, ip, NULL, &unused);
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ F_CLR(db_rep->rep_db, DB_AM_RECOVER);
+
+ REP_SYSTEM_LOCK(env);
+ STAT(rep->stat.st_log_queued = 0);
+ if (IN_INTERNAL_INIT(rep))
+ event = 1;
+ CLR_RECOVERY_SETTINGS(rep);
+ FLD_CLR(rep->lockout_flags, REP_LOCKOUT_ARCHIVE | REP_LOCKOUT_MSG);
+ if (ret != 0)
+ goto errunlock2;
+
+ /*
+ * If the master_id is invalid, this means that since
+ * the last record was sent, something happened to the
+ * master and we may not have a master to request
+ * things of.
+ *
+ * This is not an error; when we find a new master,
+ * we'll re-negotiate where the end of the log is and
+ * try to bring ourselves up to date again anyway.
+ */
+ master = rep->master_id;
+ REP_SYSTEM_UNLOCK(env);
+ if (master == DB_EID_INVALID) {
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ ret = 0;
+ } else {
+ /*
+ * We're making an ALL_REQ. But now that we've
+ * cleared the flags, we're likely receiving new
+ * log records from the master, resulting in a gap
+ * immediately. So to avoid multiple data streams,
+ * set the wait_ts value high now to give the master
+ * a chance to start sending us these records before
+ * the gap code re-requests the same gap. Wait_recs
+ * will get reset once we start receiving these
+ * records.
+ */
+ lp->wait_ts = rep->max_gap;
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ (void)__rep_send_message(env,
+ master, REP_ALL_REQ, reclsnp, NULL, 0, DB_REP_ANYWHERE);
+ }
+ if (event)
+ __rep_fire_event(env, DB_EVENT_REP_INIT_DONE, NULL);
+ if (0) {
+errunlock2: MUTEX_UNLOCK(env, rep->mtx_clientdb);
+errunlock: REP_SYSTEM_UNLOCK(env);
+ }
+out: return (ret);
+}
diff --git a/src/repmgr/repmgr.msg b/src/repmgr/repmgr.msg
new file mode 100644
index 00000000..020f2e9c
--- /dev/null
+++ b/src/repmgr/repmgr.msg
@@ -0,0 +1,119 @@
+PREFIX __repmgr
+
+INCLUDE #include "db_int.h"
+INCLUDE #include "dbinc/db_swap.h"
+INCLUDE
+
+BEGIN_MSG handshake
+ARG port u_int16_t
+ARG alignment u_int16_t
+ARG ack_policy u_int32_t
+ARG flags u_int32_t
+END
+
+BEGIN_MSG v3handshake
+ARG port u_int16_t
+ARG priority u_int32_t
+ARG flags u_int32_t
+END
+
+BEGIN_MSG v2handshake
+ARG port u_int16_t
+ARG priority u_int32_t
+END
+
+BEGIN_MSG parm_refresh
+ARG ack_policy u_int32_t
+ARG flags u_int32_t
+END
+
+BEGIN_MSG permlsn
+ARG generation u_int32_t
+ARG lsn DB_LSN
+END
+
+BEGIN_MSG version_proposal
+ARG min u_int32_t
+ARG max u_int32_t
+END
+
+BEGIN_MSG version_confirmation
+ARG version u_int32_t
+END
+
+BEGIN_MSG msg_hdr
+ARG type u_int8_t
+ARG word1 u_int32_t
+ARG word2 u_int32_t
+END
+
+/* Metadata that goes along with a user message on a DB_CHANNEL. */
+BEGIN_MSG msg_metadata
+ARG tag u_int32_t
+ARG limit u_int32_t
+ARG flags u_int32_t
+END
+
+/*
+ * The membership database has a record for each site in the group, plus one
+ * extra meta-data record. The key of the meta-data record has a zero-length
+ * host, and a port value of 0.
+ */
+BEGIN_MSG membership_key check_length
+ARG host DBT
+ARG port u_int16_t
+END
+
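+/*
+ * Editor's illustration (not a .msg directive): the meta-data record's
+ * key described above -- zero-length host, port 0 -- could be built
+ * with the generated marshal routine, roughly:
+ *
+ *	__repmgr_membership_key_args key;
+ *	u_int8_t buf[__REPMGR_MEMBERSHIP_KEY_SIZE];
+ *	size_t len;
+ *
+ *	memset(&key, 0, sizeof(key));
+ *	(void)__repmgr_membership_key_marshal(env, &key, buf,
+ *	    sizeof(buf), &len);
+ */
+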
+BEGIN_MSG membership_data
+ARG flags u_int32_t
+END
+
+/*
+ * The "format" identifies the content and layout of the records within the
+ * membership database (i.e., some of the items defined here in this *.msg
+ * file). It should be incremented when the layouts change in future Berkeley
+ * DB releases. The "version" counts group changes that the application makes
+ * by adding or removing sites; thus it varies dynamically throughout the
+ * life of a group, during a single release of Berkeley DB.
+ */
+BEGIN_MSG member_metadata
+ARG format u_int32_t
+ARG version u_int32_t
+END
+
+/*
+ * When a new site wants to join a group, it "guesses" that the configured
+ * "helper" site is the master, and sends the request there. When that guess
+ * is wrong, the helper site responds with the location of the current master,
+ * in effect "forwarding" the request.
+ */
+BEGIN_MSG gm_fwd check_length
+ARG host DBT
+ARG port u_int16_t
+ARG gen u_int32_t
+END
+
+/* Membership list version header: */
+BEGIN_MSG membr_vers
+ARG version u_int32_t
+ARG gen u_int32_t
+END
+BEGIN_MSG site_info check_length
+ARG host DBT
+ARG port u_int16_t
+ARG flags u_int32_t
+END
+
+/*
+ * If site A breaks or rejects a connection from site B, it first
+ * tries to send B this message containing site A's currently known
+ * membership DB version. Site B can use this to decide what to do.
+ * If site B knows of a later version, it should retry the connection
+ * to site A later, polling until site A catches up. However, if
+ * site B's known version is less, it means that site B is no longer in
+ * the group, and so instead it should shut down and notify the application.
+ */
+BEGIN_MSG connect_reject
+ARG version u_int32_t
+ARG gen u_int32_t
+END
diff --git a/src/repmgr/repmgr.src b/src/repmgr/repmgr.src
new file mode 100644
index 00000000..68d8c239
--- /dev/null
+++ b/src/repmgr/repmgr.src
@@ -0,0 +1,23 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+
+DBPRIVATE
+PREFIX __repmgr
+
+INCLUDE #include "db_int.h"
+INCLUDE #include "dbinc/db_page.h"
+INCLUDE #include "dbinc/db_dispatch.h"
+INCLUDE #include "dbinc/db_am.h"
+INCLUDE #include "dbinc_auto/repmgr_auto.h"
+INCLUDE
+
+BEGIN member 52 200
+ARG version u_int32_t lu
+ARG prev_status u_int32_t lu
+ARG status u_int32_t lu
+DBT host DBT s
+ARG port u_int32_t lu
+END
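+
+/*
+ * Editor's note (illustrative): each ARG line above reads "name type
+ * printf-format"; gen_rec.awk expands this description into the
+ * __repmgr_member_desc recovery spec and the log-record plumbing in
+ * repmgr_auto.c below.
+ */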
diff --git a/src/repmgr/repmgr_auto.c b/src/repmgr/repmgr_auto.c
new file mode 100644
index 00000000..19eb24d4
--- /dev/null
+++ b/src/repmgr/repmgr_auto.c
@@ -0,0 +1,32 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_dispatch.h"
+#include "dbinc/db_am.h"
+#include "dbinc_auto/repmgr_auto.h"
+
+DB_LOG_RECSPEC __repmgr_member_desc[] = {
+ {LOGREC_ARG, SSZ(__repmgr_member_args, version), "version", "%lu"},
+ {LOGREC_ARG, SSZ(__repmgr_member_args, prev_status), "prev_status", "%lu"},
+ {LOGREC_ARG, SSZ(__repmgr_member_args, status), "status", "%lu"},
+ {LOGREC_DBT, SSZ(__repmgr_member_args, host), "host", ""},
+ {LOGREC_ARG, SSZ(__repmgr_member_args, port), "port", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+/*
+ * PUBLIC: int __repmgr_init_recover __P((ENV *, DB_DISTAB *));
+ */
+int
+__repmgr_init_recover(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __repmgr_member_recover, DB___repmgr_member)) != 0)
+ return (ret);
+ return (0);
+}
diff --git a/src/repmgr/repmgr_automsg.c b/src/repmgr/repmgr_automsg.c
new file mode 100644
index 00000000..90af08ff
--- /dev/null
+++ b/src/repmgr/repmgr_automsg.c
@@ -0,0 +1,757 @@
+/* Do not edit: automatically built by gen_msg.awk. */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_swap.h"
+
+/*
+ * PUBLIC: void __repmgr_handshake_marshal __P((ENV *,
+ * PUBLIC: __repmgr_handshake_args *, u_int8_t *));
+ */
+void
+__repmgr_handshake_marshal(env, argp, bp)
+ ENV *env;
+ __repmgr_handshake_args *argp;
+ u_int8_t *bp;
+{
+ DB_HTONS_COPYOUT(env, bp, argp->port);
+ DB_HTONS_COPYOUT(env, bp, argp->alignment);
+ DB_HTONL_COPYOUT(env, bp, argp->ack_policy);
+ DB_HTONL_COPYOUT(env, bp, argp->flags);
+}
+
+/*
+ * PUBLIC: int __repmgr_handshake_unmarshal __P((ENV *,
+ * PUBLIC: __repmgr_handshake_args *, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__repmgr_handshake_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __repmgr_handshake_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ if (max < __REPMGR_HANDSHAKE_SIZE)
+ goto too_few;
+ DB_NTOHS_COPYIN(env, argp->port, bp);
+ DB_NTOHS_COPYIN(env, argp->alignment, bp);
+ DB_NTOHL_COPYIN(env, argp->ack_policy, bp);
+ DB_NTOHL_COPYIN(env, argp->flags, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __repmgr_handshake message"));
+ return (EINVAL);
+}
+
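+/*
+ * Editor's round-trip sketch (not generated by gen_msg.awk): marshal a
+ * handshake into a stack buffer sized by the same macro the unmarshal
+ * routine checks, then decode it back.  Field values are arbitrary.
+ */
+#if 0
+	__repmgr_handshake_args in, out;
+	u_int8_t buf[__REPMGR_HANDSHAKE_SIZE];
+
+	out.port = 6000;
+	out.alignment = 0;
+	out.ack_policy = 0;
+	out.flags = 0;
+	__repmgr_handshake_marshal(env, &out, buf);
+	if (__repmgr_handshake_unmarshal(env, &in, buf,
+	    sizeof(buf), NULL) == 0)
+		DB_ASSERT(env, in.port == out.port);
+#endif
+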
+/*
+ * PUBLIC: void __repmgr_v3handshake_marshal __P((ENV *,
+ * PUBLIC: __repmgr_v3handshake_args *, u_int8_t *));
+ */
+void
+__repmgr_v3handshake_marshal(env, argp, bp)
+ ENV *env;
+ __repmgr_v3handshake_args *argp;
+ u_int8_t *bp;
+{
+ DB_HTONS_COPYOUT(env, bp, argp->port);
+ DB_HTONL_COPYOUT(env, bp, argp->priority);
+ DB_HTONL_COPYOUT(env, bp, argp->flags);
+}
+
+/*
+ * PUBLIC: int __repmgr_v3handshake_unmarshal __P((ENV *,
+ * PUBLIC: __repmgr_v3handshake_args *, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__repmgr_v3handshake_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __repmgr_v3handshake_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ if (max < __REPMGR_V3HANDSHAKE_SIZE)
+ goto too_few;
+ DB_NTOHS_COPYIN(env, argp->port, bp);
+ DB_NTOHL_COPYIN(env, argp->priority, bp);
+ DB_NTOHL_COPYIN(env, argp->flags, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __repmgr_v3handshake message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: void __repmgr_v2handshake_marshal __P((ENV *,
+ * PUBLIC: __repmgr_v2handshake_args *, u_int8_t *));
+ */
+void
+__repmgr_v2handshake_marshal(env, argp, bp)
+ ENV *env;
+ __repmgr_v2handshake_args *argp;
+ u_int8_t *bp;
+{
+ DB_HTONS_COPYOUT(env, bp, argp->port);
+ DB_HTONL_COPYOUT(env, bp, argp->priority);
+}
+
+/*
+ * PUBLIC: int __repmgr_v2handshake_unmarshal __P((ENV *,
+ * PUBLIC: __repmgr_v2handshake_args *, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__repmgr_v2handshake_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __repmgr_v2handshake_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ if (max < __REPMGR_V2HANDSHAKE_SIZE)
+ goto too_few;
+ DB_NTOHS_COPYIN(env, argp->port, bp);
+ DB_NTOHL_COPYIN(env, argp->priority, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __repmgr_v2handshake message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: void __repmgr_parm_refresh_marshal __P((ENV *,
+ * PUBLIC: __repmgr_parm_refresh_args *, u_int8_t *));
+ */
+void
+__repmgr_parm_refresh_marshal(env, argp, bp)
+ ENV *env;
+ __repmgr_parm_refresh_args *argp;
+ u_int8_t *bp;
+{
+ DB_HTONL_COPYOUT(env, bp, argp->ack_policy);
+ DB_HTONL_COPYOUT(env, bp, argp->flags);
+}
+
+/*
+ * PUBLIC: int __repmgr_parm_refresh_unmarshal __P((ENV *,
+ * PUBLIC: __repmgr_parm_refresh_args *, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__repmgr_parm_refresh_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __repmgr_parm_refresh_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ if (max < __REPMGR_PARM_REFRESH_SIZE)
+ goto too_few;
+ DB_NTOHL_COPYIN(env, argp->ack_policy, bp);
+ DB_NTOHL_COPYIN(env, argp->flags, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __repmgr_parm_refresh message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: void __repmgr_permlsn_marshal __P((ENV *,
+ * PUBLIC: __repmgr_permlsn_args *, u_int8_t *));
+ */
+void
+__repmgr_permlsn_marshal(env, argp, bp)
+ ENV *env;
+ __repmgr_permlsn_args *argp;
+ u_int8_t *bp;
+{
+ DB_HTONL_COPYOUT(env, bp, argp->generation);
+ DB_HTONL_COPYOUT(env, bp, argp->lsn.file);
+ DB_HTONL_COPYOUT(env, bp, argp->lsn.offset);
+}
+
+/*
+ * PUBLIC: int __repmgr_permlsn_unmarshal __P((ENV *,
+ * PUBLIC: __repmgr_permlsn_args *, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__repmgr_permlsn_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __repmgr_permlsn_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ if (max < __REPMGR_PERMLSN_SIZE)
+ goto too_few;
+ DB_NTOHL_COPYIN(env, argp->generation, bp);
+ DB_NTOHL_COPYIN(env, argp->lsn.file, bp);
+ DB_NTOHL_COPYIN(env, argp->lsn.offset, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __repmgr_permlsn message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: void __repmgr_version_proposal_marshal __P((ENV *,
+ * PUBLIC: __repmgr_version_proposal_args *, u_int8_t *));
+ */
+void
+__repmgr_version_proposal_marshal(env, argp, bp)
+ ENV *env;
+ __repmgr_version_proposal_args *argp;
+ u_int8_t *bp;
+{
+ DB_HTONL_COPYOUT(env, bp, argp->min);
+ DB_HTONL_COPYOUT(env, bp, argp->max);
+}
+
+/*
+ * PUBLIC: int __repmgr_version_proposal_unmarshal __P((ENV *,
+ * PUBLIC: __repmgr_version_proposal_args *, u_int8_t *, size_t,
+ * PUBLIC: u_int8_t **));
+ */
+int
+__repmgr_version_proposal_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __repmgr_version_proposal_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ if (max < __REPMGR_VERSION_PROPOSAL_SIZE)
+ goto too_few;
+ DB_NTOHL_COPYIN(env, argp->min, bp);
+ DB_NTOHL_COPYIN(env, argp->max, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __repmgr_version_proposal message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: void __repmgr_version_confirmation_marshal __P((ENV *,
+ * PUBLIC: __repmgr_version_confirmation_args *, u_int8_t *));
+ */
+void
+__repmgr_version_confirmation_marshal(env, argp, bp)
+ ENV *env;
+ __repmgr_version_confirmation_args *argp;
+ u_int8_t *bp;
+{
+ DB_HTONL_COPYOUT(env, bp, argp->version);
+}
+
+/*
+ * PUBLIC: int __repmgr_version_confirmation_unmarshal __P((ENV *,
+ * PUBLIC: __repmgr_version_confirmation_args *, u_int8_t *, size_t,
+ * PUBLIC: u_int8_t **));
+ */
+int
+__repmgr_version_confirmation_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __repmgr_version_confirmation_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ if (max < __REPMGR_VERSION_CONFIRMATION_SIZE)
+ goto too_few;
+ DB_NTOHL_COPYIN(env, argp->version, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __repmgr_version_confirmation message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: void __repmgr_msg_hdr_marshal __P((ENV *,
+ * PUBLIC: __repmgr_msg_hdr_args *, u_int8_t *));
+ */
+void
+__repmgr_msg_hdr_marshal(env, argp, bp)
+ ENV *env;
+ __repmgr_msg_hdr_args *argp;
+ u_int8_t *bp;
+{
+ *bp++ = argp->type;
+ DB_HTONL_COPYOUT(env, bp, argp->word1);
+ DB_HTONL_COPYOUT(env, bp, argp->word2);
+}
+
+/*
+ * PUBLIC: int __repmgr_msg_hdr_unmarshal __P((ENV *,
+ * PUBLIC: __repmgr_msg_hdr_args *, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__repmgr_msg_hdr_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __repmgr_msg_hdr_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ if (max < __REPMGR_MSG_HDR_SIZE)
+ goto too_few;
+ argp->type = *bp++;
+ DB_NTOHL_COPYIN(env, argp->word1, bp);
+ DB_NTOHL_COPYIN(env, argp->word2, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __repmgr_msg_hdr message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: void __repmgr_msg_metadata_marshal __P((ENV *,
+ * PUBLIC: __repmgr_msg_metadata_args *, u_int8_t *));
+ */
+void
+__repmgr_msg_metadata_marshal(env, argp, bp)
+ ENV *env;
+ __repmgr_msg_metadata_args *argp;
+ u_int8_t *bp;
+{
+ DB_HTONL_COPYOUT(env, bp, argp->tag);
+ DB_HTONL_COPYOUT(env, bp, argp->limit);
+ DB_HTONL_COPYOUT(env, bp, argp->flags);
+}
+
+/*
+ * PUBLIC: int __repmgr_msg_metadata_unmarshal __P((ENV *,
+ * PUBLIC: __repmgr_msg_metadata_args *, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__repmgr_msg_metadata_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __repmgr_msg_metadata_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ if (max < __REPMGR_MSG_METADATA_SIZE)
+ goto too_few;
+ DB_NTOHL_COPYIN(env, argp->tag, bp);
+ DB_NTOHL_COPYIN(env, argp->limit, bp);
+ DB_NTOHL_COPYIN(env, argp->flags, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __repmgr_msg_metadata message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: int __repmgr_membership_key_marshal __P((ENV *,
+ * PUBLIC: __repmgr_membership_key_args *, u_int8_t *, size_t, size_t *));
+ */
+int
+__repmgr_membership_key_marshal(env, argp, bp, max, lenp)
+ ENV *env;
+ __repmgr_membership_key_args *argp;
+ u_int8_t *bp;
+ size_t *lenp, max;
+{
+ u_int8_t *start;
+
+ if (max < __REPMGR_MEMBERSHIP_KEY_SIZE
+ + (size_t)argp->host.size)
+ return (ENOMEM);
+ start = bp;
+
+ DB_HTONL_COPYOUT(env, bp, argp->host.size);
+ if (argp->host.size > 0) {
+ memcpy(bp, argp->host.data, argp->host.size);
+ bp += argp->host.size;
+ }
+ DB_HTONS_COPYOUT(env, bp, argp->port);
+
+ *lenp = (size_t)(bp - start);
+ return (0);
+}
+
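+/*
+ * Editor's note on the wire layout produced above (illustrative):
+ *
+ *	+------------------+------------------+-----------+
+ *	| host.size        | host.data        | port      |
+ *	| (4B, net order)  | (host.size B)    | (2B, net) |
+ *	+------------------+------------------+-----------+
+ */
+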
+/*
+ * PUBLIC: int __repmgr_membership_key_unmarshal __P((ENV *,
+ * PUBLIC: __repmgr_membership_key_args *, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__repmgr_membership_key_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __repmgr_membership_key_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ size_t needed;
+
+ needed = __REPMGR_MEMBERSHIP_KEY_SIZE;
+ if (max < needed)
+ goto too_few;
+ DB_NTOHL_COPYIN(env, argp->host.size, bp);
+ if (argp->host.size == 0)
+ argp->host.data = NULL;
+ else
+ argp->host.data = bp;
+ needed += (size_t)argp->host.size;
+ if (max < needed)
+ goto too_few;
+ bp += argp->host.size;
+ DB_NTOHS_COPYIN(env, argp->port, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __repmgr_membership_key message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: void __repmgr_membership_data_marshal __P((ENV *,
+ * PUBLIC: __repmgr_membership_data_args *, u_int8_t *));
+ */
+void
+__repmgr_membership_data_marshal(env, argp, bp)
+ ENV *env;
+ __repmgr_membership_data_args *argp;
+ u_int8_t *bp;
+{
+ DB_HTONL_COPYOUT(env, bp, argp->flags);
+}
+
+/*
+ * PUBLIC: int __repmgr_membership_data_unmarshal __P((ENV *,
+ * PUBLIC: __repmgr_membership_data_args *, u_int8_t *, size_t,
+ * PUBLIC: u_int8_t **));
+ */
+int
+__repmgr_membership_data_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __repmgr_membership_data_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ if (max < __REPMGR_MEMBERSHIP_DATA_SIZE)
+ goto too_few;
+ DB_NTOHL_COPYIN(env, argp->flags, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __repmgr_membership_data message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: void __repmgr_member_metadata_marshal __P((ENV *,
+ * PUBLIC: __repmgr_member_metadata_args *, u_int8_t *));
+ */
+void
+__repmgr_member_metadata_marshal(env, argp, bp)
+ ENV *env;
+ __repmgr_member_metadata_args *argp;
+ u_int8_t *bp;
+{
+ DB_HTONL_COPYOUT(env, bp, argp->format);
+ DB_HTONL_COPYOUT(env, bp, argp->version);
+}
+
+/*
+ * PUBLIC: int __repmgr_member_metadata_unmarshal __P((ENV *,
+ * PUBLIC: __repmgr_member_metadata_args *, u_int8_t *, size_t,
+ * PUBLIC: u_int8_t **));
+ */
+int
+__repmgr_member_metadata_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __repmgr_member_metadata_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ if (max < __REPMGR_MEMBER_METADATA_SIZE)
+ goto too_few;
+ DB_NTOHL_COPYIN(env, argp->format, bp);
+ DB_NTOHL_COPYIN(env, argp->version, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __repmgr_member_metadata message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: int __repmgr_gm_fwd_marshal __P((ENV *,
+ * PUBLIC: __repmgr_gm_fwd_args *, u_int8_t *, size_t, size_t *));
+ */
+int
+__repmgr_gm_fwd_marshal(env, argp, bp, max, lenp)
+ ENV *env;
+ __repmgr_gm_fwd_args *argp;
+ u_int8_t *bp;
+ size_t *lenp, max;
+{
+ u_int8_t *start;
+
+ if (max < __REPMGR_GM_FWD_SIZE
+ + (size_t)argp->host.size)
+ return (ENOMEM);
+ start = bp;
+
+ DB_HTONL_COPYOUT(env, bp, argp->host.size);
+ if (argp->host.size > 0) {
+ memcpy(bp, argp->host.data, argp->host.size);
+ bp += argp->host.size;
+ }
+ DB_HTONS_COPYOUT(env, bp, argp->port);
+ DB_HTONL_COPYOUT(env, bp, argp->gen);
+
+ *lenp = (size_t)(bp - start);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_gm_fwd_unmarshal __P((ENV *,
+ * PUBLIC: __repmgr_gm_fwd_args *, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__repmgr_gm_fwd_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __repmgr_gm_fwd_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ size_t needed;
+
+ needed = __REPMGR_GM_FWD_SIZE;
+ if (max < needed)
+ goto too_few;
+ DB_NTOHL_COPYIN(env, argp->host.size, bp);
+ if (argp->host.size == 0)
+ argp->host.data = NULL;
+ else
+ argp->host.data = bp;
+ needed += (size_t)argp->host.size;
+ if (max < needed)
+ goto too_few;
+ bp += argp->host.size;
+ DB_NTOHS_COPYIN(env, argp->port, bp);
+ DB_NTOHL_COPYIN(env, argp->gen, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __repmgr_gm_fwd message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: void __repmgr_membr_vers_marshal __P((ENV *,
+ * PUBLIC: __repmgr_membr_vers_args *, u_int8_t *));
+ */
+void
+__repmgr_membr_vers_marshal(env, argp, bp)
+ ENV *env;
+ __repmgr_membr_vers_args *argp;
+ u_int8_t *bp;
+{
+ DB_HTONL_COPYOUT(env, bp, argp->version);
+ DB_HTONL_COPYOUT(env, bp, argp->gen);
+}
+
+/*
+ * PUBLIC: int __repmgr_membr_vers_unmarshal __P((ENV *,
+ * PUBLIC: __repmgr_membr_vers_args *, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__repmgr_membr_vers_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __repmgr_membr_vers_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ if (max < __REPMGR_MEMBR_VERS_SIZE)
+ goto too_few;
+ DB_NTOHL_COPYIN(env, argp->version, bp);
+ DB_NTOHL_COPYIN(env, argp->gen, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __repmgr_membr_vers message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: int __repmgr_site_info_marshal __P((ENV *,
+ * PUBLIC: __repmgr_site_info_args *, u_int8_t *, size_t, size_t *));
+ */
+int
+__repmgr_site_info_marshal(env, argp, bp, max, lenp)
+ ENV *env;
+ __repmgr_site_info_args *argp;
+ u_int8_t *bp;
+ size_t *lenp, max;
+{
+ u_int8_t *start;
+
+ if (max < __REPMGR_SITE_INFO_SIZE
+ + (size_t)argp->host.size)
+ return (ENOMEM);
+ start = bp;
+
+ DB_HTONL_COPYOUT(env, bp, argp->host.size);
+ if (argp->host.size > 0) {
+ memcpy(bp, argp->host.data, argp->host.size);
+ bp += argp->host.size;
+ }
+ DB_HTONS_COPYOUT(env, bp, argp->port);
+ DB_HTONL_COPYOUT(env, bp, argp->flags);
+
+ *lenp = (size_t)(bp - start);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_site_info_unmarshal __P((ENV *,
+ * PUBLIC: __repmgr_site_info_args *, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__repmgr_site_info_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __repmgr_site_info_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ size_t needed;
+
+ needed = __REPMGR_SITE_INFO_SIZE;
+ if (max < needed)
+ goto too_few;
+ DB_NTOHL_COPYIN(env, argp->host.size, bp);
+ if (argp->host.size == 0)
+ argp->host.data = NULL;
+ else
+ argp->host.data = bp;
+ needed += (size_t)argp->host.size;
+ if (max < needed)
+ goto too_few;
+ bp += argp->host.size;
+ DB_NTOHS_COPYIN(env, argp->port, bp);
+ DB_NTOHL_COPYIN(env, argp->flags, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __repmgr_site_info message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: void __repmgr_connect_reject_marshal __P((ENV *,
+ * PUBLIC: __repmgr_connect_reject_args *, u_int8_t *));
+ */
+void
+__repmgr_connect_reject_marshal(env, argp, bp)
+ ENV *env;
+ __repmgr_connect_reject_args *argp;
+ u_int8_t *bp;
+{
+ DB_HTONL_COPYOUT(env, bp, argp->version);
+ DB_HTONL_COPYOUT(env, bp, argp->gen);
+}
+
+/*
+ * PUBLIC: int __repmgr_connect_reject_unmarshal __P((ENV *,
+ * PUBLIC: __repmgr_connect_reject_args *, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__repmgr_connect_reject_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __repmgr_connect_reject_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ if (max < __REPMGR_CONNECT_REJECT_SIZE)
+ goto too_few;
+ DB_NTOHL_COPYIN(env, argp->version, bp);
+ DB_NTOHL_COPYIN(env, argp->gen, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __repmgr_connect_reject message"));
+ return (EINVAL);
+}
+
diff --git a/src/repmgr/repmgr_autop.c b/src/repmgr/repmgr_autop.c
new file mode 100644
index 00000000..8d7c1974
--- /dev/null
+++ b/src/repmgr/repmgr_autop.c
@@ -0,0 +1,44 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+
+#ifdef HAVE_REPLICATION_THREADS
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_dispatch.h"
+#include "dbinc/db_am.h"
+#include "dbinc_auto/repmgr_auto.h"
+
+/*
+ * PUBLIC: int __repmgr_member_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__repmgr_member_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__repmgr_member", __repmgr_member_desc, info));
+}
+
+/*
+ * PUBLIC: int __repmgr_init_print __P((ENV *, DB_DISTAB *));
+ */
+int
+__repmgr_init_print(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __repmgr_member_print, DB___repmgr_member)) != 0)
+ return (ret);
+ return (0);
+}
+#endif /* HAVE_REPLICATION_THREADS */
diff --git a/src/repmgr/repmgr_elect.c b/src/repmgr/repmgr_elect.c
new file mode 100644
index 00000000..3a84694a
--- /dev/null
+++ b/src/repmgr/repmgr_elect.c
@@ -0,0 +1,585 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+static db_timeout_t __repmgr_compute_response_time __P((ENV *));
+static int __repmgr_elect __P((ENV *, u_int32_t, db_timespec *));
+static int __repmgr_elect_main __P((ENV *, REPMGR_RUNNABLE *));
+static void *__repmgr_elect_thread __P((void *));
+static int send_membership __P((ENV *));
+
+/*
+ * __repmgr_init_election --
+ *	Starts an election thread.
+ *
+ * PUBLIC: int __repmgr_init_election __P((ENV *, u_int32_t));
+ *
+ * !!!
+ * Caller must hold mutex.
+ */
+int
+__repmgr_init_election(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ DB_REP *db_rep;
+ REPMGR_RUNNABLE *th;
+ int ret;
+ u_int i, new_size;
+
+ COMPQUIET(th, NULL);
+
+ db_rep = env->rep_handle;
+ if (db_rep->repmgr_status == stopped) {
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "ignoring elect thread request %#lx; repmgr is stopped",
+ (u_long)flags));
+ return (0);
+ }
+
+ /* Find an available slot, indexed by 'i'; allocate more if needed. */
+ for (i = 0; i < db_rep->aelect_threads; i++) {
+ th = db_rep->elect_threads[i];
+ if (th == NULL)
+ break;
+ if (th->finished) {
+ if ((ret = __repmgr_thread_join(th)) != 0)
+ return (ret);
+ /* Reuse the space in a moment. */
+ break;
+ }
+ }
+ if (i == db_rep->aelect_threads) {
+ new_size = db_rep->aelect_threads + 1;
+ if ((ret = __os_realloc(env,
+ sizeof(REPMGR_RUNNABLE*) * new_size,
+ &db_rep->elect_threads)) != 0)
+ return (ret);
+ db_rep->aelect_threads = new_size;
+ STAT(db_rep->region->mstat.st_max_elect_threads = new_size);
+ th = db_rep->elect_threads[i] = NULL;
+ }
+
+ if (th == NULL &&
+ (ret = __os_malloc(env, sizeof(REPMGR_RUNNABLE), &th)) != 0)
+ return (ret);
+ th->run = __repmgr_elect_thread;
+ th->args.flags = flags;
+
+ if ((ret = __repmgr_thread_start(env, th)) == 0)
+ STAT(db_rep->region->mstat.st_elect_threads++);
+ else {
+ __os_free(env, th);
+ th = NULL;
+ }
+ db_rep->elect_threads[i] = th;
+
+ return (ret);
+}
+
+static void *
+__repmgr_elect_thread(argsp)
+ void *argsp;
+{
+ REPMGR_RUNNABLE *th;
+ ENV *env;
+ int ret;
+
+ th = argsp;
+ env = th->env;
+
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC, "starting election thread"));
+
+ if ((ret = __repmgr_elect_main(env, th)) != 0) {
+ __db_err(env, ret, "election thread failed");
+ (void)__repmgr_thread_failure(env, ret);
+ }
+
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC, "election thread is exiting"));
+ th->finished = TRUE;
+ return (NULL);
+}
+
+static int
+__repmgr_elect_main(env, th)
+ ENV *env;
+ REPMGR_RUNNABLE *th;
+{
+ DB_REP *db_rep;
+ REP *rep;
+#ifdef DB_WIN32
+ DWORD duration;
+ db_timeout_t t;
+#else
+ struct timespec deadline;
+#endif
+ db_timespec failtime, now, repstart_time, target, wait_til;
+ db_timeout_t delay_time, response_time, tmp_time;
+ u_long sec, usec;
+ u_int32_t flags;
+ int done_repstart, ret, suppress_election;
+ enum { ELECTION, REPSTART } action;
+
+ COMPQUIET(action, ELECTION);
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ flags = th->args.flags;
+
+ if (LF_ISSET(ELECT_F_EVENT_NOTIFY))
+ DB_EVENT(env, DB_EVENT_REP_MASTER_FAILURE, NULL);
+
+ /*
+	 * If leases are enabled, delay the election to allow processing of
+	 * any straggler messages that might grant our lease again and fool
+	 * the base code into thinking the master is still there. Any delay
+	 * here offsets the time the election code will wait for a lease
+	 * grant to expire, so with leases we're not adding more delay.
+ */
+ if (FLD_ISSET(db_rep->region->config, REP_C_LEASE)) {
+ /*
+ * Use the smallest of the lease timeout, ack timeout,
+ * or connection retry timeout. We want to give straggler
+ * messages a chance to get processed, but get an election
+ * underway as soon as possible to find a master.
+ */
+ if ((ret = __rep_get_timeout(env->dbenv,
+ DB_REP_LEASE_TIMEOUT, &delay_time)) != 0)
+ goto out;
+ if ((ret = __rep_get_timeout(env->dbenv,
+ DB_REP_ACK_TIMEOUT, &tmp_time)) != 0)
+ goto out;
+ if (tmp_time < delay_time)
+ delay_time = tmp_time;
+ if ((ret = __rep_get_timeout(env->dbenv,
+ DB_REP_CONNECTION_RETRY, &tmp_time)) != 0)
+ goto out;
+ if (tmp_time < delay_time)
+ delay_time = tmp_time;
+ sec = delay_time / US_PER_SEC;
+ usec = delay_time % US_PER_SEC;
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "Election with leases pause sec %lu, usec %lu", sec, usec));
+ __os_yield(env, sec, usec);
+ }
+
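+	/*
+	 * Worked illustration (editor's note): with a lease timeout of
+	 * 3 seconds, an ack timeout of 1 second and a connection retry
+	 * of 5 seconds, the minimum computed above is 1 second, so the
+	 * pause is sec=1, usec=0.
+	 */
+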
+ /*
+ * As a freshly started thread, lay claim to the title of being
+ * "preferred". If an older thread is sleeping for retry, when it wakes
+ * up it will relinquish its role (since there's no need for multiple
+ * threads to sleep and retry).
+ */
+ LOCK_MUTEX(db_rep->mutex);
+ db_rep->preferred_elect_thr = th;
+ UNLOCK_MUTEX(db_rep->mutex);
+
+ /*
+ * The 'done_repstart' flag keeps track of which was our most recent
+ * operation (repstart or election), so that we can alternate
+ * appropriately. There are a few different ways this thread can be
+	 * invoked, and all but one call for some form of immediate
+	 * election. The one exception is at initial start-up, where we
+ * first probe for a master by sending out rep_start(CLIENT) calls.
+ */
+ if (LF_ISSET(ELECT_F_IMMED)) {
+ /*
+ * When the election succeeds, we've successfully completed
+ * everything we need to do. If it fails in an unexpected way,
+ * we abort all processing as usual. The only time we need to
+ * stay in here and do some more work is on DB_REP_UNAVAIL,
+ * in which case we want to wait a while and retry later.
+ */
+ if ((ret = __repmgr_elect(env, flags, &failtime)) ==
+ DB_REP_UNAVAIL)
+ done_repstart = FALSE;
+ else
+ goto out;
+ } else {
+ /*
+ * We didn't really have an election failure, because in this
+ * case we haven't even done an election yet. But the timing
+ * we want turns out the same: we want to wait for the election
+ * retry time and then call for an election if nothing else
+ * interesting happens before then.
+ */
+ __os_gettime(env, &failtime, 1);
+
+ /*
+ * Although we didn't do a repstart in this thread, we know that
+ * our caller did one just before creating the thread.
+ */
+ done_repstart = TRUE;
+ }
+
+ LOCK_MUTEX(db_rep->mutex);
+ for (;;) {
+ ret = 0;
+
+ if (db_rep->repmgr_status == stopped)
+ goto unlock;
+
+ /*
+ * If we've become the master (which could happen after an
+ * election in another election thread), or we find we have a
+ * working connection to a known master, then we're quite
+ * content: that's really the essential purpose of this whole
+ * thread.
+ */
+ if (__repmgr_master_is_known(env))
+ goto unlock;
+
+ /*
+ * When circumstances force us to do an immediate election, we
+ * may be forced to create multiple threads in order to do so.
+ * But we certainly don't need multiple threads sleeping,
+ * alternating and retrying. The "preferred election thread" is
+ * the one that has the authority and responsibility to
+ * persevere until our work is done. Note that this role can
+ * switch from one thread to another, depending on the timing of
+ * events. In particular, when an election fails the thread
+ * that got the failure becomes the chosen one that will remain
+ * to avenge the failure.
+ */
+ if (db_rep->preferred_elect_thr != th)
+ goto unlock;
+
+ timespecclear(&wait_til);
+ __os_gettime(env, &now, 1);
+
+ /*
+ * See if it's time to retry the operation. Normally it's an
+ * election we're interested in retrying. But we refrain from
+ * calling for elections if so configured.
+ */
+ suppress_election = LF_ISSET(ELECT_F_STARTUP) ?
+ db_rep->init_policy == DB_REP_CLIENT :
+ !FLD_ISSET(rep->config, REP_C_ELECTIONS);
+ repstart_time = db_rep->repstart_time;
+ target = suppress_election ? repstart_time : failtime;
+ TIMESPEC_ADD_DB_TIMEOUT(&target, rep->election_retry_wait);
+ if (timespeccmp(&now, &target, >=)) {
+ /*
+ * We've surpassed our target retry time.
+ * However, elections should generally alternate with
+ * rep_start calls, so do that if we haven't done one
+ * since the last election.
+ */
+ action = suppress_election ? REPSTART :
+ (done_repstart ? ELECTION : REPSTART);
+
+ } else if (db_rep->new_connection) {
+ /* Seen a recent new connection, let's do rep_start. */
+ action = REPSTART;
+ } else
+ wait_til = target;
+
+ if (!timespecisset(&wait_til)) {
+ response_time = __repmgr_compute_response_time(env);
+ target = repstart_time;
+ TIMESPEC_ADD_DB_TIMEOUT(&target, response_time);
+ if (timespeccmp(&now, &target, <)) {
+ /* We haven't waited long enough. */
+ wait_til = target;
+ }
+ }
+
+ if (timespecisset(&wait_til)) {
+#ifdef DB_WIN32
+ timespecsub(&wait_til, &now);
+ DB_TIMESPEC_TO_TIMEOUT(t, &wait_til, TRUE);
+ duration = t / US_PER_MS;
+ if ((ret = SignalObjectAndWait(*db_rep->mutex,
+ db_rep->check_election, duration, FALSE)) !=
+ WAIT_OBJECT_0 && ret != WAIT_TIMEOUT)
+ goto out;
+
+ LOCK_MUTEX(db_rep->mutex);
+
+ /*
+ * Although there could be multiple threads, only the
+ * "preferred" thread resets the event object. If the
+ * others tried to do so, the preferred thread might
+ * miss the wake-up. Another way of saying this is that
+ * the precise meaning of the check_election event is
+ * that "there may be some election-thread-related work
+ * to do, and the correct thread to do it has not yet
+ * been woken up".
+ */
+ if (ret == WAIT_OBJECT_0 &&
+ db_rep->preferred_elect_thr == th &&
+ !ResetEvent(db_rep->check_election)) {
+ ret = GetLastError();
+ goto unlock;
+ }
+#else
+ deadline.tv_sec = wait_til.tv_sec;
+ deadline.tv_nsec = wait_til.tv_nsec;
+ if ((ret = pthread_cond_timedwait(
+ &db_rep->check_election, db_rep->mutex, &deadline))
+ != ETIMEDOUT && ret != 0)
+ goto unlock;
+#endif
+ continue;
+ }
+
+ UNLOCK_MUTEX(db_rep->mutex);
+ if (action == ELECTION) {
+ db_rep->new_connection = FALSE;
+ if ((ret = __repmgr_elect(env, 0, &failtime)) ==
+ DB_REP_UNAVAIL)
+ done_repstart = FALSE;
+ else
+ goto out;
+ LOCK_MUTEX(db_rep->mutex);
+ db_rep->preferred_elect_thr = th;
+ } else {
+ DB_ASSERT(env, action == REPSTART);
+
+ db_rep->new_connection = FALSE;
+ if ((ret = __repmgr_repstart(env, DB_REP_CLIENT)) != 0)
+ goto out;
+ done_repstart = TRUE;
+
+ LOCK_MUTEX(db_rep->mutex);
+ __os_gettime(env, &db_rep->repstart_time, 1);
+ }
+ }
+
+#ifdef HAVE_STATISTICS
+ /*
+ * We normally don't bother taking a mutex to increment statistics. But
+ * in this case, since we're incrementing and decrementing in pairs, it
+ * could be very weird if we were "off somewhat". For example, we could
+ * get a negative value. And this is not a high-traffic, performance-
+ * critical path.
+ * On the other hand, it suffices to take repmgr's (handle-based)
+ * mutex, rather than the rep mutex which normally protects shared
+ * memory, since all election thread activity must be occurring in the
+ * single listener process, under control of one single rep handle.
+ */
+out:
+ LOCK_MUTEX(db_rep->mutex);
+unlock:
+ rep->mstat.st_elect_threads--;
+ UNLOCK_MUTEX(db_rep->mutex);
+#else
+unlock:
+ UNLOCK_MUTEX(db_rep->mutex);
+out:
+#endif
+ return (ret);
+}
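+
+/*
+ * A worked example of the retry arithmetic above, with made-up numbers:
+ * suppose election_retry_wait is 10 seconds and the last election failed
+ * at time 100.  Until time 110 the preferred election thread simply
+ * sleeps (wait_til = target).  At time 110 it acts: if it has done a
+ * rep_start call since the last election (done_repstart), it calls for
+ * an election; otherwise it alternates back to another rep_start call.
+ * A recently seen new connection short-circuits the wait and forces an
+ * immediate rep_start, subject to the "response time" damper computed
+ * below.
+ */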
+
+static db_timeout_t
+__repmgr_compute_response_time(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ db_timeout_t ato, eto;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ /*
+ * Avoid crowding operations too close together. If we've just recently
+ * done a rep_start, wait a moment in case there's a master out there,
+ * to give it a chance to respond with a NEWMASTER message. This is
+ * particularly an issue at start-up time, when we're likely to have
+ * several "new connection establishment" events bombarding us with lots
+ * of rep_start requests in quick succession.
+ *
+ * We don't have a separate user configuration for rep_start response,
+ * but it's reasonable to expect it to be similar to either the ack
+ * timeout or the election timeout, whichever is smaller. However, only
+ * consider the ack timeout if all signs point to it being in use.
+ */
+ ato = rep->ack_timeout;
+ eto = rep->elect_timeout;
+ if (ato > 0 &&
+ rep->perm_policy != DB_REPMGR_ACKS_NONE &&
+ rep->priority > 0 &&
+ ato < eto)
+ return (ato);
+
+ return (eto);
+}
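+
+/*
+ * For example (illustrative values only): with an ack timeout of 1
+ * second, an election timeout of 2 seconds, a non-zero priority and a
+ * perm policy other than DB_REPMGR_ACKS_NONE, the response time is 1
+ * second; if acks are not in use, or the ack timeout is not the smaller
+ * of the two, we use the election timeout instead.
+ */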
+
+static int
+__repmgr_elect(env, flags, failtimep)
+ ENV *env;
+ u_int32_t flags;
+ db_timespec *failtimep;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ u_int32_t invitation, nsites, nvotes;
+ int ret, t_ret;
+
+ db_rep = env->rep_handle;
+ nsites = db_rep->region->config_nsites;
+ DB_ASSERT(env, nsites > 0);
+
+ /*
+ * With only 2 sites in the group, even a single failure could make it
+ * impossible to get a majority. So, fudge a little, unless the user
+ * really wants strict safety.
+ */
+ if (nsites == 2 &&
+ !FLD_ISSET(db_rep->region->config, REP_C_2SITE_STRICT))
+ nvotes = 1;
+ else
+ nvotes = ELECTION_MAJORITY(nsites);
+
+ if (LF_ISSET(ELECT_F_INVITEE)) {
+ /*
+ * We're going to the election party because we were invited by
+ * another site. Accept the other site's suggested value, if
+ * it's reasonable. (I.e., the other site may have wanted to do
+ * a "fast" election after losing contact with the master. If
+ * so, let's not spoil it by imposing our own full nsites count
+ * on it.)
+ */
+ rep = db_rep->region;
+ invitation = rep->nsites;
+ if (invitation == nsites || invitation == nsites - 1) {
+ nsites = invitation;
+ }
+ }
+ if (LF_ISSET(ELECT_F_FAST) && nsites > nvotes) {
+ /*
+ * If we're doing an election because we noticed that the master
+ * failed, it's reasonable to expect that the master won't
+ * participate. By not waiting for its vote, we can probably
+ * complete the election faster. But note that we shouldn't
+ * allow this to affect nvotes calculation.
+ *
+ * However, if we have 2 sites, and strict majority is turned
+ * on, now nvotes would be 2, and it doesn't make sense to
+ * rep_elect to see nsites of 1 in that case. So only decrement
+ * nsites if it currently exceeds nvotes.
+ */
+ nsites--;
+ }
+ /* The rule for leases overrides all of the above. */
+ if (IS_USING_LEASES(env))
+ nsites = 0;
+
+ switch (ret = __rep_elect_int(env, nsites, nvotes, 0)) {
+ case DB_REP_UNAVAIL:
+ __os_gettime(env, failtimep, 1);
+ DB_EVENT(env, DB_EVENT_REP_ELECTION_FAILED, NULL);
+ if ((t_ret = send_membership(env)) != 0)
+ ret = t_ret;
+ break;
+
+ case 0:
+ if (db_rep->takeover_pending)
+ ret = __repmgr_claim_victory(env);
+ break;
+
+ case DB_REP_IGNORE:
+ ret = 0;
+ break;
+
+ default:
+ __db_err(env, ret, DB_STR("3629",
+ "unexpected election failure"));
+ break;
+ }
+ return (ret);
+}
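+
+/*
+ * To make the nsites/nvotes arithmetic above concrete (illustrative
+ * numbers only): in a 5-site group, nvotes = ELECTION_MAJORITY(5) = 3.
+ * For a "fast" election after losing the master, nsites drops to 4 but
+ * nvotes stays 3, so the election can finish without the dead master's
+ * vote.  In a 2-site group without REP_C_2SITE_STRICT, nvotes is 1, so
+ * the surviving site can elect itself.  When leases are in use, nsites
+ * is passed as 0 and the lease machinery governs instead.
+ */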
+
+/*
+ * If an election fails with DB_REP_UNAVAIL, it could be because a participating
+ * site has an obsolete, too-high notion of the group size. (This could happen
+ * if the site was down/disconnected during removal of some (other) sites.) To
+ * remedy this, broadcast a current copy of the membership list. Since all
+ * sites are doing this, and we always ratchet to the most up-to-date version,
+ * this should bring all sites up to date. We only do this after a failure,
+ * during what will normally be an idle period anyway, so that we don't slow
+ * down a first election following the loss of an active master.
+ */
+static int
+send_membership(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ u_int8_t *buf;
+ size_t len;
+ int ret;
+
+ db_rep = env->rep_handle;
+ buf = NULL;
+ LOCK_MUTEX(db_rep->mutex);
+ if ((ret = __repmgr_marshal_member_list(env, &buf, &len)) != 0)
+ goto out;
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "Broadcast latest membership list"));
+ ret = __repmgr_bcast_own_msg(env, REPMGR_SHARING, buf, len);
+out:
+ UNLOCK_MUTEX(db_rep->mutex);
+ if (buf != NULL)
+ __os_free(env, buf);
+ return (ret);
+}
+
+/*
+ * Becomes master after we've won an election, if we can.
+ *
+ * PUBLIC: int __repmgr_claim_victory __P((ENV *));
+ */
+int
+__repmgr_claim_victory(env)
+ ENV *env;
+{
+ int ret;
+
+ env->rep_handle->takeover_pending = FALSE;
+ if ((ret = __repmgr_become_master(env)) == DB_REP_UNAVAIL) {
+ ret = 0;
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "Won election but lost race with DUPMASTER client intent"));
+ }
+ return (ret);
+}
+
+/*
+ * When turning on elections in an already-running system, check to see if we're
+ * in a state where we need an election (i.e., we would have started one
+ * previously if elections hadn't been turned off), and if so start one.
+ *
+ * PUBLIC: int __repmgr_turn_on_elections __P((ENV *));
+ */
+int
+__repmgr_turn_on_elections(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ int ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ ret = 0;
+
+ DB_ASSERT(env, REP_ON(env));
+ LOCK_MUTEX(db_rep->mutex);
+ if (db_rep->selector == NULL ||
+ !FLD_ISSET(rep->config, REP_C_ELECTIONS) ||
+ __repmgr_master_is_known(env))
+ goto out;
+
+ ret = __repmgr_init_election(env, ELECT_F_IMMED);
+
+out:
+ UNLOCK_MUTEX(db_rep->mutex);
+ return (ret);
+}
diff --git a/src/repmgr/repmgr_method.c b/src/repmgr/repmgr_method.c
new file mode 100644
index 00000000..229cf650
--- /dev/null
+++ b/src/repmgr/repmgr_method.c
@@ -0,0 +1,3092 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/txn.h"
+
+/* Context for an API thread waiting for response to a synchronous request. */
+struct response_wait {
+ REPMGR_CONNECTION *conn;
+ u_int32_t index;
+};
+
+static int addr_chk __P((const ENV *, const char *, u_int));
+static void adjust_bulk_response __P((ENV *, DBT *));
+static int bad_callback_method __P((DB_CHANNEL *, const char *));
+static void copy_body __P((u_int8_t *, REPMGR_IOVECS *));
+static int get_shared_netaddr __P((ENV *, int, repmgr_netaddr_t *));
+static int establish_connection __P((ENV *, int, REPMGR_CONNECTION **));
+static int get_channel_connection __P((CHANNEL *, REPMGR_CONNECTION **));
+static int init_dbsite __P((ENV *, int, const char *, u_int, DB_SITE **));
+static int join_group_at_site __P((ENV *, repmgr_netaddr_t *));
+static int kick_blockers __P((ENV *, REPMGR_CONNECTION *, void *));
+static int make_request_conn __P((ENV *,
+ repmgr_netaddr_t *, REPMGR_CONNECTION **));
+static int set_local_site __P((DB_SITE *, u_int32_t));
+static int read_own_msg __P((ENV *,
+ REPMGR_CONNECTION *, u_int32_t *, u_int8_t **, size_t *));
+static int refresh_site __P((DB_SITE *));
+static int __repmgr_await_threads __P((ENV *));
+static int __repmgr_build_data_out __P((ENV *,
+ DBT *, u_int32_t, __repmgr_msg_metadata_args *, REPMGR_IOVECS **iovecsp));
+static int __repmgr_build_msg_out __P((ENV *,
+ DBT *, u_int32_t, __repmgr_msg_metadata_args *, REPMGR_IOVECS **iovecsp));
+static int repmgr_only __P((ENV *, const char *));
+static int __repmgr_restart __P((ENV *, int, u_int32_t));
+static int __repmgr_remove_site __P((DB_SITE *));
+static int __repmgr_remove_site_pp __P((DB_SITE *));
+static int __repmgr_start_msg_threads __P((ENV *, u_int));
+static int request_self __P((ENV *, DBT *, u_int32_t, DBT *, u_int32_t));
+static int response_complete __P((ENV *, void *));
+static int send_msg_conn __P((ENV *, REPMGR_CONNECTION *, DBT *, u_int32_t));
+static int send_msg_self __P((ENV *, REPMGR_IOVECS *, u_int32_t));
+static int site_by_addr __P((ENV *, const char *, u_int, DB_SITE **));
+
+/*
+ * PUBLIC: int __repmgr_start __P((DB_ENV *, int, u_int32_t));
+ */
+int
+__repmgr_start(dbenv, nthreads, flags)
+ DB_ENV *dbenv;
+ int nthreads;
+ u_int32_t flags;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ REPMGR_SITE *me, *site;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int first, is_listener, locked, min, need_masterseek, ret, start_master;
+ u_int i, n;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ switch (flags) {
+ case 0:
+ case DB_REP_CLIENT:
+ case DB_REP_ELECTION:
+ case DB_REP_MASTER:
+ break;
+ default:
+ __db_errx(env, DB_STR("3635",
+ "repmgr_start: unrecognized flags parameter value"));
+ return (EINVAL);
+ }
+
+ ENV_REQUIRES_CONFIG_XX(
+ env, rep_handle, "DB_ENV->repmgr_start", DB_INIT_REP);
+ if (!F_ISSET(env, ENV_THREAD)) {
+ __db_errx(env, DB_STR("3636",
+ "Replication Manager needs an environment with DB_THREAD"));
+ return (EINVAL);
+ }
+
+ if (APP_IS_BASEAPI(env))
+ return (repmgr_only(env, "repmgr_start"));
+
+ /* Check that the required initialization has been done. */
+ if (!IS_VALID_EID(db_rep->self_eid)) {
+ __db_errx(env, DB_STR("3637",
+ "A local site must be named before calling repmgr_start"));
+ return (EINVAL);
+ }
+
+	/* If this site was previously shut down, clean up its resources. */
+ if (db_rep->repmgr_status == stopped) {
+ if ((ret = __repmgr_stop(env)) != 0) {
+ __db_errx(env, DB_STR("3638",
+ "Could not clean up repmgr"));
+ return (ret);
+ }
+ db_rep->repmgr_status = ready;
+ }
+
+ db_rep->init_policy = flags;
+ if ((ret = __rep_set_transport_int(env,
+ db_rep->self_eid, __repmgr_send)) != 0)
+ return (ret);
+ if (!REPMGR_INITED(db_rep) && (ret = __repmgr_init(env)) != 0)
+ return (ret);
+ /*
+ * As a prerequisite to starting replication, get our list of remote
+	 * sites properly set up.  Mainly this involves reading the group
+	 * membership database; when it's not present, we must instead decide
+	 * what to do, which depends on various conditions.
+ */
+ start_master = (flags == DB_REP_MASTER);
+
+ if (db_rep->restored_list != NULL) {
+ ret = __repmgr_refresh_membership(env,
+ db_rep->restored_list, db_rep->restored_list_length);
+ __os_free(env, db_rep->restored_list);
+ db_rep->restored_list = NULL;
+ } else {
+ ret = __repmgr_reload_gmdb(env);
+ me = SITE_FROM_EID(db_rep->self_eid);
+ if (ret == 0) {
+ if (me->membership != SITE_PRESENT)
+ /*
+ * We have a database but the local site is not
+ * shown as "present" in the group. We must
+ * have been removed from the group, or perhaps
+ * we're being created via hot backup. In
+ * either case the thing to do is to try to
+ * join.
+ */
+ ret = __repmgr_join_group(env);
+ } else if (ret == ENOENT) {
+ ENV_ENTER(env, ip);
+ if (FLD_ISSET(me->config, DB_GROUP_CREATOR))
+ start_master = TRUE;
+ /*
+ * LEGACY is inconsistent with CREATOR, but start_master
+ * could still be true due to "flags" being passed as
+ * DB_REP_MASTER. In that case, being started as master
+ * is irrelevant to establishing initial membership
+ * list: LEGACY always takes precedence if set.
+ */
+ if (FLD_ISSET(me->config, DB_LEGACY)) {
+ LOCK_MUTEX(db_rep->mutex);
+ db_rep->membership_version = 1;
+ db_rep->member_version_gen = 1;
+ for (n = i = 0; i < db_rep->site_cnt; i++) {
+ site = SITE_FROM_EID(i);
+ if (!FLD_ISSET(site->config, DB_LEGACY))
+ continue;
+ if ((ret = __repmgr_set_membership(env,
+ site->net_addr.host,
+ site->net_addr.port,
+ SITE_PRESENT)) != 0)
+ break;
+ n++;
+ }
+ ret = __rep_set_nsites_int(env, n);
+ DB_ASSERT(env, ret == 0);
+ UNLOCK_MUTEX(db_rep->mutex);
+ } else if (start_master) {
+ LOCK_MUTEX(db_rep->mutex);
+ db_rep->membership_version = 1;
+ db_rep->member_version_gen = 1;
+ if ((ret = __repmgr_set_membership(env,
+ me->net_addr.host, me->net_addr.port,
+ SITE_PRESENT)) == 0) {
+ ret = __rep_set_nsites_int(env, 1);
+ DB_ASSERT(env, ret == 0);
+ }
+ UNLOCK_MUTEX(db_rep->mutex);
+ } else
+ ret = __repmgr_join_group(env);
+ ENV_LEAVE(env, ip);
+ } else if (ret == DB_DELETED)
+ ret = DB_REP_UNAVAIL;
+ }
+ if (ret != 0)
+ return (ret);
+
+ DB_ASSERT(env, start_master ||
+ SITE_FROM_EID(db_rep->self_eid)->membership == SITE_PRESENT);
+
+ /*
+	 * If this is the first repmgr_start() call, we will have to start
+	 * threads.  Therefore, we require a flags value (to tell us how).
+ */
+ if (db_rep->repmgr_status != running && flags == 0) {
+ __db_errx(env, DB_STR("3639",
+ "a non-zero flags value is required for initial repmgr_start() call"));
+ return (EINVAL);
+ }
+
+ /*
+ * Figure out the current situation. The current invocation of
+ * repmgr_start() is either the first one (on the given env handle), or
+ * a subsequent one.
+ *
+ * Then, in case there could be multiple processes, we're either the
+ * main listener process or a subordinate process. On a "subsequent"
+ * repmgr_start() call we already have enough information to know which
+ * it is. Otherwise, negotiate with information in the shared region to
+ * claim the listener role if possible.
+ *
+ * To avoid a race, once we decide we're in the first call, mark the
+ * handle as started, so that no other thread thinks the same thing.
+ */
+ LOCK_MUTEX(db_rep->mutex);
+ locked = TRUE;
+ if (db_rep->repmgr_status == running) {
+ first = FALSE;
+ is_listener = !IS_SUBORDINATE(db_rep);
+ } else {
+ first = TRUE;
+ db_rep->repmgr_status = running;
+
+ ENV_ENTER(env, ip);
+ MUTEX_LOCK(env, rep->mtx_repmgr);
+ if (rep->listener == 0) {
+ is_listener = TRUE;
+ __os_id(dbenv, &rep->listener, NULL);
+ } else {
+ is_listener = FALSE;
+ nthreads = 0;
+ }
+ MUTEX_UNLOCK(env, rep->mtx_repmgr);
+ ENV_LEAVE(env, ip);
+ }
+ UNLOCK_MUTEX(db_rep->mutex);
+ locked = FALSE;
+
+ if (!first) {
+ /*
+		 * A subsequent call is allowed when ELECTIONS are turned off,
+		 * so that the application can make its own dynamic role
+		 * changes.  It's also allowed in any case, if not trying to
+		 * change roles (flags == 0), in order to change the number of
+		 * message processing threads.  The __repmgr_restart() function
+		 * will take care of these cases entirely.
+ */
+ if (!is_listener || (flags != 0 &&
+ FLD_ISSET(db_rep->region->config, REP_C_ELECTIONS))) {
+ __db_errx(env, DB_STR("3640",
+ "repmgr is already started"));
+ ret = EINVAL;
+ } else
+ ret = __repmgr_restart(env, nthreads, flags);
+ return (ret);
+ }
+
+ /*
+ * The minimum legal number of threads is either 1 or 0, depending upon
+ * whether we're the main process or a subordinate.
+ */
+ min = is_listener ? 1 : 0;
+ if (nthreads < min) {
+ __db_errx(env, DB_STR_A("3641",
+ "repmgr_start: nthreads parameter must be >= %d",
+ "%d"), min);
+ ret = EINVAL;
+ goto err;
+ }
+
+ /*
+ * Ensure at least one more thread (for channel messages and GMDB
+ * requests) beyond those set aside to avoid starvation of rep
+ * messages.
+ *
+ * Note that it's OK to silently fudge the number here, because the
+ * documentation says that "[i]n addition to these message processing
+ * threads, the Replication Manager creates and manages a few of its own
+ * threads of control."
+ */
+ min = RESERVED_MSG_TH(env) + 1;
+ if (nthreads < min && is_listener)
+ nthreads = min;
+
+ if (is_listener) {
+ if ((ret = __repmgr_listen(env)) != 0)
+ goto err;
+ /*
+ * Make some sort of call to rep_start before starting message
+ * processing threads, to ensure that incoming messages being
+ * processed always have a rep context properly configured.
+ * Note that even if we're starting without recovery, we need a
+ * rep_start call in case we're using leases. Leases keep track
+ * of rep_start calls even within an env region lifetime.
+ */
+ if (start_master) {
+ ret = __repmgr_become_master(env);
+ /* No other repmgr threads running yet. */
+ DB_ASSERT(env, ret != DB_REP_UNAVAIL);
+ if (ret != 0)
+ goto err;
+ need_masterseek = FALSE;
+ } else {
+ if ((ret = __repmgr_repstart(env, DB_REP_CLIENT)) != 0)
+ goto err;
+ /*
+ * The repmgr election code starts elections only if
+ * the DB_REP_ELECTION start flag was specified, but
+ * it performs other actions to help find a master for
+ * DB_REP_CLIENT, which is why we need_masterseek for
+ * both cases.
+ */
+ need_masterseek = TRUE;
+ }
+
+ LOCK_MUTEX(db_rep->mutex);
+ locked = TRUE;
+
+ /*
+ * Since these allocated memory blocks are used by other
+ * threads, we have to be a bit careful about freeing them in
+ * case of any errors. __repmgr_await_threads (which we call in
+ * the err: coda below) takes care of that.
+ *
+ * Start by allocating enough space for 2 election threads. We
+ * occasionally need that many; more are possible, but would be
+ * extremely rare.
+ */
+#define ELECT_THREADS_ALLOC 2
+
+ if ((ret = __os_calloc(env, ELECT_THREADS_ALLOC,
+ sizeof(REPMGR_RUNNABLE *), &db_rep->elect_threads)) != 0)
+ goto err;
+ db_rep->aelect_threads = ELECT_THREADS_ALLOC;
+ STAT(rep->mstat.st_max_elect_threads = ELECT_THREADS_ALLOC);
+
+ if ((ret = __os_calloc(env, (u_int)nthreads,
+ sizeof(REPMGR_RUNNABLE *), &db_rep->messengers)) != 0)
+ goto err;
+ db_rep->athreads = (u_int)nthreads;
+
+ db_rep->nthreads = 0;
+ if ((ret =
+ __repmgr_start_msg_threads(env, (u_int)nthreads)) != 0)
+ goto err;
+
+ if (need_masterseek) {
+ /*
+ * The repstart_time field records that time when we
+ * last issued a rep_start(CLIENT) that sent out a
+ * NEWCLIENT message. We use it to avoid doing so
+ * twice in quick succession (to give the master a
+ * reasonable chance to respond). The rep_start()
+ * that we just issued above doesn't count, because we
+ * haven't established any connections yet, and so no
+ * message could have been sent out. The instant we
+ * get our first connection set up we want to send out
+ * our first real NEWCLIENT.
+ */
+ timespecclear(&db_rep->repstart_time);
+
+ if ((ret = __repmgr_init_election(env,
+ ELECT_F_STARTUP)) != 0)
+ goto err;
+ }
+ UNLOCK_MUTEX(db_rep->mutex);
+ locked = FALSE;
+ }
+ /* All processes (even non-listeners) need a select() thread. */
+ if ((ret = __repmgr_start_selector(env)) == 0)
+ return (is_listener ? 0 : DB_REP_IGNORE);
+
+err:
+ /* If we couldn't succeed at everything, undo the parts we did do. */
+ if (db_rep->selector != NULL) {
+ if (!locked)
+ LOCK_MUTEX(db_rep->mutex);
+ (void)__repmgr_stop_threads(env);
+ UNLOCK_MUTEX(db_rep->mutex);
+ locked = FALSE;
+ }
+ (void)__repmgr_await_threads(env);
+ if (!locked)
+ LOCK_MUTEX(db_rep->mutex);
+ (void)__repmgr_net_close(env);
+ UNLOCK_MUTEX(db_rep->mutex);
+ return (ret);
+}
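+
+/*
+ * For orientation, a minimal sketch of how a hypothetical application
+ * would drive this function (host name, port and thread count are
+ * placeholders; error handling omitted):
+ *
+ *	DB_SITE *site;
+ *	dbenv->repmgr_site(dbenv, "site1.example.com", 6000, &site, 0);
+ *	site->set_config(site, DB_LOCAL_SITE, 1);
+ *	site->close(site);
+ *	dbenv->repmgr_start(dbenv, 3, DB_REP_ELECTION);
+ *
+ * The first repmgr_start() call in a process settles the listener-vs-
+ * subordinate question; subsequent calls fall through to
+ * __repmgr_restart() below.
+ */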
+
+/*
+ * PUBLIC: int __repmgr_valid_config __P((ENV *, u_int32_t));
+ */
+int
+__repmgr_valid_config(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ DB_REP *db_rep;
+ int ret;
+
+ db_rep = env->rep_handle;
+ ret = 0;
+
+ DB_ASSERT(env, REP_ON(env));
+ LOCK_MUTEX(db_rep->mutex);
+
+ /* (Can't check IS_SUBORDINATE if select thread isn't running yet.) */
+ if (LF_ISSET(REP_C_ELECTIONS) &&
+ db_rep->selector != NULL && IS_SUBORDINATE(db_rep)) {
+ __db_errx(env, DB_STR("3642",
+ "can't configure repmgr elections from subordinate process"));
+ ret = EINVAL;
+ }
+ UNLOCK_MUTEX(db_rep->mutex);
+ return (ret);
+}
+
+/*
+ * Starts message processing threads. On entry, the actual number of threads
+ * already active is db_rep->nthreads; the desired number of threads is passed
+ * as "n".
+ *
+ * Caller must hold mutex.
+ */
+static int
+__repmgr_start_msg_threads(env, n)
+ ENV *env;
+ u_int n;
+{
+ DB_REP *db_rep;
+ REPMGR_RUNNABLE *messenger;
+ int ret;
+
+ db_rep = env->rep_handle;
+ DB_ASSERT(env, db_rep->athreads >= n);
+ while (db_rep->nthreads < n) {
+ if ((ret = __os_calloc(env,
+ 1, sizeof(REPMGR_RUNNABLE), &messenger)) != 0)
+ return (ret);
+
+ messenger->run = __repmgr_msg_thread;
+ if ((ret = __repmgr_thread_start(env, messenger)) != 0) {
+ __os_free(env, messenger);
+ return (ret);
+ }
+ db_rep->messengers[db_rep->nthreads++] = messenger;
+ }
+ return (0);
+}
+
+/*
+ * Handles a repmgr_start() call that occurs when repmgr is already running.
+ * This is allowed (when elections are not in use) to dynamically change the
+ * master/client role. It is also allowed (regardless of the ELECTIONS setting)
+ * to change the number of msg processing threads.
+ */
+static int
+__repmgr_restart(env, nthreads, flags)
+ ENV *env;
+ int nthreads;
+ u_int32_t flags;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ REPMGR_RUNNABLE **th;
+ u_int32_t cur_repflags;
+ int locked, ret, t_ret;
+ u_int delta, i, min, nth;
+
+ th = NULL;
+ locked = FALSE;
+
+ if (flags == DB_REP_ELECTION) {
+ __db_errx(env, DB_STR("3643",
+ "subsequent repmgr_start() call may not specify DB_REP_ELECTION"));
+ return (EINVAL);
+ }
+ if (nthreads < 0) {
+ __db_errx(env, DB_STR("3644",
+ "repmgr_start: nthreads parameter must be >= 0"));
+ return (EINVAL);
+ }
+
+ ret = 0;
+ db_rep = env->rep_handle;
+ DB_ASSERT(env, REP_ON(env));
+ rep = db_rep->region;
+
+ cur_repflags = F_ISSET(rep, REP_F_MASTER | REP_F_CLIENT);
+ DB_ASSERT(env, cur_repflags);
+ if (FLD_ISSET(cur_repflags, REP_F_MASTER) &&
+ flags == DB_REP_CLIENT)
+ ret = __repmgr_become_client(env);
+ else if (FLD_ISSET(cur_repflags, REP_F_CLIENT) &&
+ flags == DB_REP_MASTER)
+ ret = __repmgr_become_master(env);
+ if (ret != 0)
+ return (ret);
+
+ if (nthreads == 0)
+ return (0);
+ nth = (u_int)nthreads;
+ LOCK_MUTEX(db_rep->mutex);
+ locked = TRUE;
+ min = RESERVED_MSG_TH(env) + db_rep->non_rep_th;
+ if (nth < min)
+ nth = min;
+
+ if (nth > db_rep->nthreads) {
+ /*
+ * To increase the number of threads, first allocate more space,
+ * unless we already have enough unused space available.
+ */
+ if (db_rep->athreads < nth) {
+ if ((ret = __os_realloc(env,
+ sizeof(REPMGR_RUNNABLE *) * nth,
+ &db_rep->messengers)) != 0)
+ goto out;
+ db_rep->athreads = nth;
+ }
+ ret = __repmgr_start_msg_threads(env, nth);
+ } else if (nth < db_rep->nthreads) {
+ /*
+ * Remove losers from array, and then wait for each of them. We
+ * have to make an array copy, because we have to drop the mutex
+ * to wait for the threads to complete, and if we left the real
+ * array in the handle in the pending state while waiting,
+ * another thread could come along wanting to make another
+ * change, and would make a mess.
+ * The alternative is about as inelegant: we could do these
+ * one at a time here if we added another field to the handle,
+ * to keep track of both the actual number of threads and the
+ * user's desired number of threads.
+ */
+ /*
+ * Make sure signalling the condition variable works, before
+ * making a mess of the data structures. Although it may seem a
+ * little backwards, it doesn't really matter since we're
+ * holding the mutex. Once we allocate the temp array and grab
+ * ownership of the loser thread structs, we must continue
+ * trying (even if errors) so that we definitely free the
+ * memory.
+ */
+ if ((ret = __repmgr_wake_msngers(env, nth)) != 0)
+ goto out;
+ delta = db_rep->nthreads - nth;
+ if ((ret = __os_calloc(env, (size_t)delta,
+ sizeof(REPMGR_RUNNABLE *), &th)) != 0)
+ goto out;
+ for (i = 0; i < delta; i++) {
+ th[i] = db_rep->messengers[nth + i];
+ th[i]->quit_requested = TRUE;
+ db_rep->messengers[nth + i] = NULL;
+ }
+ db_rep->nthreads = nth;
+ UNLOCK_MUTEX(db_rep->mutex);
+ locked = FALSE;
+
+ DB_ASSERT(env, ret == 0);
+ for (i = 0; i < delta; i++) {
+ if ((t_ret = __repmgr_thread_join(th[i])) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ __os_free(env, th[i]);
+ }
+ __os_free(env, th);
+ }
+
+out: if (locked)
+ UNLOCK_MUTEX(db_rep->mutex);
+ return (ret);
+}
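+
+/*
+ * For example, an application that manages roles itself (with elections
+ * configured off) might demote the master via a second call such as
+ * (hypothetical snippet):
+ *
+ *	dbenv->repmgr_start(dbenv, 0, DB_REP_CLIENT);
+ *
+ * With nthreads == 0 the message-thread pool is left alone; a positive
+ * value resizes the pool as coded above.
+ */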
+
+/*
+ * PUBLIC: int __repmgr_autostart __P((ENV *));
+ *
+ * Preconditions: rep_start() has been called; we're within an ENV_ENTER.
+ */
+int
+__repmgr_autostart(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ int ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ DB_ASSERT(env, REP_ON(env));
+ LOCK_MUTEX(db_rep->mutex);
+
+ if (REPMGR_INITED(db_rep))
+ ret = 0;
+ else
+ ret = __repmgr_init(env);
+ if (ret != 0)
+ goto out;
+
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "Automatically joining existing repmgr env"));
+
+ /*
+ * We're only called if we're a master, which means we've had a
+ * rep_start() call, which means we must have had a previous
+ * rep_set_transport() call (in the region, in a separate env handle).
+ * We could therefore get away with simply poking in a pointer to our
+ * send function; but we need to dig up our EID value anyway, so we
+ * might as well set it properly.
+ */
+ db_rep->self_eid = rep->eid;
+ if ((ret = __rep_set_transport_int(env,
+ db_rep->self_eid, __repmgr_send)) != 0)
+ goto out;
+
+ if (db_rep->selector == NULL && db_rep->repmgr_status != running)
+ ret = __repmgr_start_selector(env);
+
+out:
+ UNLOCK_MUTEX(db_rep->mutex);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_start_selector __P((ENV *));
+ */
+int
+__repmgr_start_selector(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REPMGR_RUNNABLE *selector;
+ int ret;
+
+ db_rep = env->rep_handle;
+ if ((ret = __os_calloc(env, 1, sizeof(REPMGR_RUNNABLE), &selector))
+ != 0)
+ return (ret);
+ selector->run = __repmgr_select_thread;
+
+ /*
+ * In case the select thread ever examines db_rep->selector, set it
+ * before starting the thread (since once we create it we could be
+ * racing with it).
+ */
+ db_rep->selector = selector;
+ if ((ret = __repmgr_thread_start(env, selector)) != 0) {
+ __db_err(env, ret, DB_STR("3645",
+ "can't start selector thread"));
+ __os_free(env, selector);
+ db_rep->selector = NULL;
+ return (ret);
+ }
+
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_close __P((ENV *));
+ *
+ * Close repmgr during env close.  It stops repmgr and frees the sites array
+ * and its addresses.
+ */
+int
+__repmgr_close(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REPMGR_SITE *site;
+ int ret;
+ u_int i;
+
+ db_rep = env->rep_handle;
+ ret = 0;
+
+ ret = __repmgr_stop(env);
+ if (db_rep->sites != NULL) {
+ for (i = 0; i < db_rep->site_cnt; i++) {
+ site = &db_rep->sites[i];
+ DB_ASSERT(env, TAILQ_EMPTY(&site->sub_conns));
+ __repmgr_cleanup_netaddr(env, &site->net_addr);
+ }
+ __os_free(env, db_rep->sites);
+ db_rep->sites = NULL;
+ }
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_stop __P((ENV *));
+ *
+ * Stop repmgr, either when closing the env or when removing the current site
+ * from the replication group.  It stops threads if necessary, frees resources
+ * allocated after __repmgr_start, and cleans up site membership.
+ */
+int
+__repmgr_stop(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REPMGR_SITE *site;
+ int ret, t_ret;
+ u_int i;
+
+ ret = 0;
+ db_rep = env->rep_handle;
+
+ if (db_rep->selector != NULL) {
+ if (db_rep->repmgr_status != stopped) {
+ LOCK_MUTEX(db_rep->mutex);
+ ret = __repmgr_stop_threads(env);
+ UNLOCK_MUTEX(db_rep->mutex);
+ }
+ if ((t_ret = __repmgr_await_threads(env)) != 0 && ret == 0)
+ ret = t_ret;
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "Repmgr threads are finished"));
+ }
+ __repmgr_net_destroy(env, db_rep);
+ if ((t_ret = __repmgr_deinit(env)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __repmgr_queue_destroy(env)) != 0 && ret == 0)
+ ret = t_ret;
+ if (db_rep->restored_list != NULL) {
+ __os_free(env, db_rep->restored_list);
+ db_rep->restored_list = NULL;
+ }
+ /*
+ * Clean up current site membership and state, so that the obsolete
+ * membership won't mislead us for the next repmgr start.
+ */
+ for (i = 0; i < db_rep->site_cnt; i++) {
+ site = &db_rep->sites[i];
+ site->state = SITE_IDLE;
+ site->membership = 0;
+ }
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_set_ack_policy __P((DB_ENV *, int));
+ */
+int
+__repmgr_set_ack_policy(dbenv, policy)
+ DB_ENV *dbenv;
+ int policy;
+{
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ REP *rep;
+ int ret;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ ENV_NOT_CONFIGURED(
+ env, db_rep->region, "DB_ENV->repmgr_set_ack_policy", DB_INIT_REP);
+
+ if (APP_IS_BASEAPI(env))
+ return (repmgr_only(env, "repmgr_set_ack_policy"));
+
+ switch (policy) {
+ case DB_REPMGR_ACKS_ALL:
+ case DB_REPMGR_ACKS_ALL_AVAILABLE:
+ case DB_REPMGR_ACKS_ALL_PEERS:
+ case DB_REPMGR_ACKS_NONE:
+ case DB_REPMGR_ACKS_ONE:
+ case DB_REPMGR_ACKS_ONE_PEER:
+ case DB_REPMGR_ACKS_QUORUM:
+ if (REP_ON(env)) {
+ if (rep->perm_policy != policy) {
+ rep->perm_policy = policy;
+ if ((ret = __repmgr_bcast_parm_refresh(env))
+ != 0)
+ return (ret);
+ }
+ } else
+ db_rep->perm_policy = policy;
+ /*
+ * Setting an ack policy makes this a replication manager
+ * application.
+ */
+ APP_SET_REPMGR(env);
+ return (0);
+ default:
+ __db_errx(env, DB_STR("3646",
+ "unknown ack_policy in DB_ENV->repmgr_set_ack_policy"));
+ return (EINVAL);
+ }
+}
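+
+/*
+ * Hypothetical usage (error handling omitted):
+ *
+ *	dbenv->repmgr_set_ack_policy(dbenv, DB_REPMGR_ACKS_ALL);
+ *
+ * If replication is already running, the new policy is recorded in the
+ * shared region and broadcast to the other sites via
+ * __repmgr_bcast_parm_refresh(); before the env is opened it is simply
+ * saved in the handle.
+ */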
+
+/*
+ * PUBLIC: int __repmgr_get_ack_policy __P((DB_ENV *, int *));
+ */
+int
+__repmgr_get_ack_policy(dbenv, policy)
+ DB_ENV *dbenv;
+ int *policy;
+{
+ ENV *env;
+ DB_REP *db_rep;
+ REP *rep;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ *policy = REP_ON(env) ? rep->perm_policy : db_rep->perm_policy;
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_env_create __P((ENV *, DB_REP *));
+ */
+int
+__repmgr_env_create(env, db_rep)
+ ENV *env;
+ DB_REP *db_rep;
+{
+ int ret;
+
+ /* Set some default values. */
+ db_rep->ack_timeout = DB_REPMGR_DEFAULT_ACK_TIMEOUT;
+ db_rep->connection_retry_wait = DB_REPMGR_DEFAULT_CONNECTION_RETRY;
+ db_rep->election_retry_wait = DB_REPMGR_DEFAULT_ELECTION_RETRY;
+ db_rep->config_nsites = 0;
+ db_rep->perm_policy = DB_REPMGR_ACKS_QUORUM;
+ FLD_SET(db_rep->config, REP_C_ELECTIONS);
+ FLD_SET(db_rep->config, REP_C_2SITE_STRICT);
+
+ db_rep->self_eid = DB_EID_INVALID;
+ db_rep->listen_fd = INVALID_SOCKET;
+ TAILQ_INIT(&db_rep->connections);
+ TAILQ_INIT(&db_rep->retries);
+
+ db_rep->input_queue.size = 0;
+ STAILQ_INIT(&db_rep->input_queue.header);
+
+ __repmgr_env_create_pf(db_rep);
+ ret = __repmgr_create_mutex(env, &db_rep->mutex);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: void __repmgr_env_destroy __P((ENV *, DB_REP *));
+ */
+void
+__repmgr_env_destroy(env, db_rep)
+ ENV *env;
+ DB_REP *db_rep;
+{
+ if (db_rep->mutex != NULL) {
+ (void)__repmgr_destroy_mutex(env, db_rep->mutex);
+ db_rep->mutex = NULL;
+ }
+}
+
+/*
+ * PUBLIC: int __repmgr_stop_threads __P((ENV *));
+ *
+ * Caller must hold mutex.
+ */
+int
+__repmgr_stop_threads(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ int ret;
+
+ db_rep = env->rep_handle;
+
+ db_rep->repmgr_status = stopped;
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC, "Stopping repmgr threads"));
+ if ((ret = __repmgr_signal(&db_rep->check_election)) != 0)
+ return (ret);
+
+ /*
+ * Because we've set "finished", it's enough to wake msg_avail, even on
+ * Windows. (We don't need to wake per-thread Event Objects here, as we
+ * did in the case of only wanting to stop a subset of msg threads.)
+ */
+ if ((ret = __repmgr_signal(&db_rep->msg_avail)) != 0)
+ return (ret);
+
+ if ((ret = __repmgr_each_connection(env,
+ kick_blockers, NULL, TRUE)) != 0)
+ return (ret);
+
+ return (__repmgr_wake_main_thread(env));
+}
+
+static int
+kick_blockers(env, conn, unused)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ void *unused;
+{
+ int ret, t_ret;
+
+ COMPQUIET(unused, NULL);
+
+ ret = __repmgr_signal(&conn->drained);
+ if ((t_ret = __repmgr_wake_waiters(env,
+ &conn->response_waiters)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * "Joins" all repmgr background threads.
+ */
+static int
+__repmgr_await_threads(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REPMGR_RUNNABLE *th;
+ REPMGR_SITE *site;
+ int ret, t_ret;
+ u_int i;
+
+ db_rep = env->rep_handle;
+ ret = 0;
+
+ /*
+ * First wait for the threads we started explicitly. Then wait for
+ * those "remote descendent" threads that these first threads may have
+ * started. This order is important, because, for example, the select
+ * thread, in its last gasp, may have started yet another new instance
+ * of a connector thread.
+ */
+
+ /* Message processing threads. */
+ for (i = 0;
+ i < db_rep->nthreads && db_rep->messengers[i] != NULL; i++) {
+ th = db_rep->messengers[i];
+ if ((t_ret = __repmgr_thread_join(th)) != 0 && ret == 0)
+ ret = t_ret;
+ __os_free(env, th);
+ }
+ __os_free(env, db_rep->messengers);
+ db_rep->messengers = NULL;
+
+ /* The select() loop thread. */
+ if (db_rep->selector != NULL) {
+ if ((t_ret = __repmgr_thread_join(db_rep->selector)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ __os_free(env, db_rep->selector);
+ db_rep->selector = NULL;
+ }
+
+ /* Election threads. */
+ for (i = 0; i < db_rep->aelect_threads; i++) {
+ th = db_rep->elect_threads[i];
+ if (th != NULL) {
+ if ((t_ret = __repmgr_thread_join(th)) != 0 && ret == 0)
+ ret = t_ret;
+ __os_free(env, th);
+ }
+ }
+ __os_free(env, db_rep->elect_threads);
+ db_rep->aelect_threads = 0;
+
+ /* Threads opening outgoing socket connections. */
+ FOR_EACH_REMOTE_SITE_INDEX(i) {
+ LOCK_MUTEX(db_rep->mutex);
+ site = SITE_FROM_EID(i);
+ th = site->connector;
+ site->connector = NULL;
+ UNLOCK_MUTEX(db_rep->mutex);
+ if (th != NULL) {
+ if ((t_ret = __repmgr_thread_join(th)) != 0 && ret == 0)
+ ret = t_ret;
+ __os_free(env, th);
+ }
+ }
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_local_site __P((DB_ENV *, DB_SITE **));
+ */
+int
+__repmgr_local_site(dbenv, sitep)
+ DB_ENV *dbenv;
+ DB_SITE **sitep;
+{
+ DB_REP *db_rep;
+ ENV *env;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+
+ if (!IS_VALID_EID(db_rep->self_eid))
+ return (DB_NOTFOUND);
+ return (__repmgr_site_by_eid(dbenv, db_rep->self_eid, sitep));
+}
+
+static int
+addr_chk(env, host, port)
+ const ENV *env;
+ const char *host;
+ u_int port;
+{
+ if (host == NULL || host[0] == '\0') {
+ __db_errx(env, DB_STR("3648",
+ "repmgr_site: a host name is required"));
+ return (EINVAL);
+ }
+ if (port == 0 || port > UINT16_MAX) {
+ __db_errx(env, DB_STR_A("3649",
+ "repmgr_site: port out of range [1,%u]", "%u"), UINT16_MAX);
+ return (EINVAL);
+ }
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_channel __P((DB_ENV *, int, DB_CHANNEL **, u_int32_t));
+ */
+int
+__repmgr_channel(dbenv, eid, dbchannelp, flags)
+ DB_ENV *dbenv;
+ int eid;
+ DB_CHANNEL **dbchannelp;
+ u_int32_t flags;
+{
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ REP *rep;
+ DB_REP *db_rep;
+ DB_CHANNEL *dbchannel;
+ CHANNEL *channel;
+ REPMGR_CONNECTION *conn;
+ int cur_eid, master, ret;
+
+ channel = NULL;
+ dbchannel = NULL;
+ conn = NULL;
+
+ env = dbenv->env;
+ if ((ret = __db_fchk(env, "DB_ENV->repmgr_channel", flags, 0)) != 0)
+ return (ret);
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ if (db_rep->selector == NULL) {
+ __db_errx(env, DB_STR("3650",
+ "DB_ENV->repmgr_channel: must be called after DB_ENV->repmgr_start"));
+ return (EINVAL);
+ }
+ /*
+ * Note that repmgr_start() checks DB_INIT_REP, ENV_THREAD and
+ * APP_IS_BASEAPI.
+ */
+ if (db_rep->repmgr_status == stopped) {
+ __db_errx(env, DB_STR("3651", "repmgr is stopped"));
+ return (EINVAL);
+ }
+
+ if (eid == DB_EID_MASTER) {
+ if ((master = rep->master_id) == DB_EID_INVALID)
+ return (DB_REP_UNAVAIL);
+ cur_eid = master;
+ } else if (IS_KNOWN_REMOTE_SITE(eid))
+ cur_eid = eid;
+ else {
+ __db_errx(env, DB_STR_A("3652",
+ "%d is not a valid remote EID", "%d"), eid);
+ return (EINVAL);
+ }
+
+ ENV_ENTER(env, ip);
+ if ((ret = __os_calloc(env, 1, sizeof(DB_CHANNEL), &dbchannel)) != 0 ||
+ (ret = __os_calloc(env, 1, sizeof(CHANNEL), &channel)) != 0)
+ goto err;
+ dbchannel->channel = channel;
+ channel->db_channel = dbchannel;
+ channel->env = env;
+
+ /* Preserve EID as passed by the caller (not cur_eid). */
+ dbchannel->eid = eid;
+ dbchannel->timeout = DB_REPMGR_DEFAULT_CHANNEL_TIMEOUT;
+
+ dbchannel->close = __repmgr_channel_close;
+ dbchannel->send_msg = __repmgr_send_msg;
+ dbchannel->send_request = __repmgr_send_request;
+ dbchannel->set_timeout = __repmgr_channel_timeout;
+
+ if (cur_eid != db_rep->self_eid &&
+ (ret = establish_connection(env, cur_eid, &conn)) != 0)
+ goto err;
+
+ if (IS_VALID_EID(eid)) {
+ DB_ASSERT(env, conn != NULL);
+ channel->c.conn = conn;
+ } else {
+ /*
+ * If the configured EID is one of the special ones (MASTER or
+ * BROADCAST) we need a mutex for dynamic messing with
+ * connections that could happen later.
+ */
+ if ((ret = __repmgr_create_mutex(env,
+ &channel->c.conns.mutex)) != 0)
+ goto err;
+
+ if (conn != NULL) {
+ /*
+ * Allocate enough array elements to use cur_eid as an
+ * index; save the number of slots allocated as "cnt."
+ */
+ if ((ret = __os_calloc(env,
+ (u_int)cur_eid + 1, sizeof(REPMGR_CONNECTION *),
+ &channel->c.conns.array)) != 0)
+ goto err;
+ channel->c.conns.cnt = (u_int)cur_eid + 1;
+ channel->c.conns.array[cur_eid] = conn;
+ }
+ }
+
+ if (conn != NULL) {
+ LOCK_MUTEX(db_rep->mutex);
+ conn->ref_count++;
+ UNLOCK_MUTEX(db_rep->mutex);
+ }
+
+ *dbchannelp = dbchannel;
+
+err:
+ if (ret != 0) {
+ if (conn != NULL)
+ (void)__repmgr_disable_connection(env, conn);
+ if (channel != NULL) {
+ if (!IS_VALID_EID(eid) &&
+ channel->c.conns.mutex != NULL)
+ (void)__repmgr_destroy_mutex(env,
+ channel->c.conns.mutex);
+ __os_free(env, channel);
+ }
+ if (dbchannel != NULL)
+ __os_free(env, dbchannel);
+ }
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
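+
+/*
+ * A hypothetical round trip over a channel created by this function
+ * (payload contents are placeholders; error handling omitted):
+ *
+ *	DB_CHANNEL *ch;
+ *	DBT req, resp;
+ *	dbenv->repmgr_channel(dbenv, DB_EID_MASTER, &ch, 0);
+ *	memset(&req, 0, sizeof(req));
+ *	req.data = "ping";
+ *	req.size = 5;
+ *	memset(&resp, 0, sizeof(resp));
+ *	resp.flags = DB_DBT_MALLOC;
+ *	ch->send_request(ch, &req, 1, &resp, 0, 0);
+ *	ch->close(ch, 0);
+ *
+ * With DB_DBT_MALLOC the library allocates resp.data on the caller's
+ * behalf.
+ */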
+
+static int
+get_shared_netaddr(env, eid, netaddr)
+ ENV *env;
+ int eid;
+ repmgr_netaddr_t *netaddr;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ REGINFO *infop;
+ SITEINFO *base, *p;
+ int ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ MUTEX_LOCK(env, rep->mtx_repmgr);
+
+ if ((u_int)eid >= rep->site_cnt) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ DB_ASSERT(env, rep->siteinfo_off != INVALID_ROFF);
+
+ infop = env->reginfo;
+ base = R_ADDR(infop, rep->siteinfo_off);
+ p = &base[eid];
+ netaddr->host = R_ADDR(infop, p->addr.host);
+ netaddr->port = p->addr.port;
+ ret = 0;
+
+err:
+ MUTEX_UNLOCK(env, rep->mtx_repmgr);
+ return (ret);
+}
+
+static int
+establish_connection(env, eid, connp)
+ ENV *env;
+ int eid;
+ REPMGR_CONNECTION **connp;
+{
+ DB_REP *db_rep;
+ REPMGR_CONNECTION *conn;
+ DBT vi;
+ repmgr_netaddr_t netaddr;
+ __repmgr_msg_hdr_args msg_hdr;
+ __repmgr_version_confirmation_args conf;
+ int alloc, locked, ret, unused;
+
+ db_rep = env->rep_handle;
+ alloc = locked = FALSE;
+
+ if ((ret = get_shared_netaddr(env, eid, &netaddr)) != 0)
+ return (ret);
+
+ if ((ret = __repmgr_connect(env, &netaddr, &conn, &unused)) != 0)
+ return (ret);
+ conn->type = APP_CONNECTION;
+
+ /* Read a handshake msg, to get version confirmation and parameters. */
+ if ((ret = __repmgr_read_conn(conn)) != 0)
+ goto out;
+ /*
+ * We can only get here after having read the full 9 bytes that we
+ * expect, so this can't fail.
+ */
+ DB_ASSERT(env, conn->reading_phase == SIZES_PHASE);
+ ret = __repmgr_msg_hdr_unmarshal(env, &msg_hdr,
+ conn->msg_hdr_buf, __REPMGR_MSG_HDR_SIZE, NULL);
+ DB_ASSERT(env, ret == 0);
+ __repmgr_iovec_init(&conn->iovecs);
+ conn->reading_phase = DATA_PHASE;
+
+ if ((ret = __repmgr_prepare_simple_input(env, conn, &msg_hdr)) != 0)
+ goto out;
+ alloc = TRUE;
+
+ if ((ret = __repmgr_read_conn(conn)) != 0)
+ goto out;
+
+ /*
+ * Analyze the handshake msg, and stash relevant info.
+ */
+ if ((ret = __repmgr_find_version_info(env, conn, &vi)) != 0)
+ goto out;
+ DB_ASSERT(env, vi.size > 0);
+ if ((ret = __repmgr_version_confirmation_unmarshal(env,
+ &conf, vi.data, vi.size, NULL)) != 0)
+ goto out;
+
+ if (conf.version < CHANNEL_MIN_VERSION) {
+ ret = DB_REP_UNAVAIL;
+ goto out;
+ }
+
+ conn->version = conf.version;
+
+ if ((ret = __repmgr_send_handshake(env,
+ conn, NULL, 0, APP_CHANNEL_CONNECTION)) != 0)
+ goto out;
+ conn->state = CONN_READY;
+ __repmgr_reset_for_reading(conn);
+ if ((ret = __repmgr_set_nonblock_conn(conn)) != 0) {
+ __db_err(env, ret, DB_STR("3653", "set_nonblock channel"));
+ goto out;
+ }
+
+ /*
+ * Turn over the responsibility for reading on this connection to the
+ * select() thread.
+ */
+ LOCK_MUTEX(db_rep->mutex);
+ locked = TRUE;
+ if ((ret = __repmgr_wake_main_thread(env)) != 0)
+ goto out;
+
+ /*
+ * Share this new connection with the select thread, which will
+ * hereafter own the exclusive right to read input from it. Once we get
+ * past this point, we can't unilaterally close and destroy the
+ * connection if a retryable connection error happens. Fortunately,
+ * we're now at the point where everything has succeeded; so there will
+ * be no more errors.
+ */
+ TAILQ_INSERT_TAIL(&db_rep->connections, conn, entries);
+ conn->ref_count++;
+ *connp = conn;
+
+out:
+ if (locked)
+ UNLOCK_MUTEX(db_rep->mutex);
+
+ if (ret != 0) {
+ /*
+ * Since we can't have given the connection to the select()
+ * thread yet, clean-up is as simple as this:
+ */
+ (void)__repmgr_close_connection(env, conn);
+ (void)__repmgr_destroy_conn(env, conn);
+ }
+
+ if (alloc) {
+ DB_ASSERT(env, conn->input.repmgr_msg.cntrl.size > 0);
+ __os_free(env, conn->input.repmgr_msg.cntrl.data);
+ DB_ASSERT(env, conn->input.repmgr_msg.rec.size > 0);
+ __os_free(env, conn->input.repmgr_msg.rec.data);
+ }
+ return (ret);
+}
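+
+/*
+ * In summary, the connection dance above is: connect; read the peer's
+ * handshake (header, then body); check that its version is at least
+ * CHANNEL_MIN_VERSION; send our own handshake back; switch the socket
+ * to non-blocking mode; and finally hand the connection to the select()
+ * thread, which owns all reading on it from then on.
+ */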
+
+/*
+ * PUBLIC: int __repmgr_set_msg_dispatch __P((DB_ENV *,
+ * PUBLIC: void (*)(DB_ENV *, DB_CHANNEL *, DBT *, u_int32_t, u_int32_t),
+ * PUBLIC: u_int32_t));
+ */
+int
+__repmgr_set_msg_dispatch(dbenv, dispatch, flags)
+ DB_ENV *dbenv;
+ void (*dispatch) __P((DB_ENV *,
+ DB_CHANNEL *, DBT *, u_int32_t, u_int32_t));
+ u_int32_t flags;
+{
+ ENV *env;
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ int ret;
+
+ env = dbenv->env;
+ if ((ret = __db_fchk(env,
+ "DB_ENV->repmgr_msg_dispatch", flags, 0)) != 0)
+ return (ret);
+ if (APP_IS_BASEAPI(env))
+ return (repmgr_only(env, "repmgr_msg_dispatch"));
+
+ db_rep = env->rep_handle;
+ db_rep->msg_dispatch = dispatch;
+ APP_SET_REPMGR(env);
+ return (0);
+}
+
+/*
+ * Implementation of DB_CHANNEL->send_msg() method for use in a normal channel
+ * explicitly created by the message-originator application.
+ *
+ * PUBLIC: int __repmgr_send_msg __P((DB_CHANNEL *,
+ * PUBLIC: DBT *, u_int32_t, u_int32_t));
+ */
+int
+__repmgr_send_msg(db_channel, msg, nmsg, flags)
+ DB_CHANNEL *db_channel;
+ DBT *msg;
+ u_int32_t nmsg;
+ u_int32_t flags;
+{
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ CHANNEL *channel;
+ REPMGR_CONNECTION *conn;
+ int ret;
+
+ channel = db_channel->channel;
+ env = channel->env;
+ if ((ret = __db_fchk(env,
+ "DB_CHANNEL->send_msg", flags, 0)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+ if ((ret = get_channel_connection(channel, &conn)) == 0)
+ ret = send_msg_conn(env, conn, msg, nmsg);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * Sends an async msg on the given connection (or just copies it locally if conn
+ * is NULL, since that means we're "sending to the master" when we ourselves are
+ * the master).
+ */
+static int
+send_msg_conn(env, conn, msg, nmsg)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ DBT *msg;
+ u_int32_t nmsg;
+{
+ DB_REP *db_rep;
+ REPMGR_IOVECS *iovecs;
+ __repmgr_msg_metadata_args meta;
+ int ret;
+
+ db_rep = env->rep_handle;
+ memset(&meta, 0, sizeof(meta));
+ if (conn == NULL) {
+ /* Sending to DB_EID_MASTER when we ourselves are master. */
+ if ((ret = __repmgr_build_data_out(env,
+ msg, nmsg, &meta, &iovecs)) != 0)
+ return (ret);
+ ret = send_msg_self(env, iovecs, nmsg);
+ } else {
+ if ((ret = __repmgr_build_msg_out(env,
+ msg, nmsg, &meta, &iovecs)) != 0)
+ return (ret);
+ LOCK_MUTEX(db_rep->mutex);
+ ret = __repmgr_send_many(env, conn, iovecs, 0);
+ UNLOCK_MUTEX(db_rep->mutex);
+ }
+
+ __os_free(env, iovecs);
+ return (ret);
+}
+
+/*
+ * Simulate sending by simply copying the message into a msg struct to be
+ * queued.  On input, iovecs is ready to "send", with its first slot set
+ * aside for the message header.
+ */
+static int
+send_msg_self(env, iovecs, nmsg)
+ ENV *env;
+ REPMGR_IOVECS *iovecs;
+ u_int32_t nmsg;
+{
+ REPMGR_MESSAGE *msg;
+ size_t align, bodysize, structsize;
+ u_int8_t *membase;
+ int ret;
+
+ align = sizeof(double);
+ bodysize = iovecs->total_bytes - __REPMGR_MSG_HDR_SIZE;
+ structsize = (size_t)DB_ALIGN((size_t)(sizeof(REPMGR_MESSAGE) +
+ nmsg * sizeof(DBT)), align);
+ if ((ret = __os_malloc(env, structsize + bodysize, &membase)) != 0)
+ return (ret);
+
+ msg = (void*)membase;
+ membase += structsize;
+
+ /*
+ * Build a msg struct that looks like what would be received in the
+ * usual case.
+ */
+ msg->msg_hdr.type = REPMGR_APP_MESSAGE;
+ APP_MSG_BUFFER_SIZE(msg->msg_hdr) = (u_int32_t)bodysize;
+ APP_MSG_SEGMENT_COUNT(msg->msg_hdr) = nmsg;
+
+ msg->v.appmsg.conn = NULL;
+
+ /*
+ * The "buf" is the message body (as [if] transmitted); i.e., it
+ * excludes the header (which we've just constructed separately). So,
+ * skip over slot 0 in the iovecs, which had been reserved for the hdr.
+ */
+ DB_INIT_DBT(msg->v.appmsg.buf, membase, bodysize);
+ copy_body(membase, iovecs);
+
+ return (__repmgr_queue_put(env, msg));
+}
+
+/*
+ * Copies a message body into a single contiguous buffer. The given iovecs is
+ * assumed to have the first slot reserved for a message header, and we skip
+ * that part.
+ */
+static void
+copy_body(membase, iovecs)
+ u_int8_t *membase;
+ REPMGR_IOVECS *iovecs;
+{
+ size_t sz;
+ int i;
+
+ for (i = 1; i < iovecs->count; i++) {
+ if ((sz = (size_t)iovecs->vectors[i].iov_len) > 0) {
+ memcpy(membase, iovecs->vectors[i].iov_base, sz);
+ membase += sz;
+ }
+ }
+}
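+
+/*
+ * Pictorially, the iovecs consumed by copy_body() and send_msg_self()
+ * look like this (segment sizes are whatever the builders chose):
+ *
+ *	vectors[0]: message header (__REPMGR_MSG_HDR_SIZE bytes, skipped)
+ *	vectors[1]: first body segment   \
+ *	    ...                           > copied contiguously to membase
+ *	vectors[n-1]: last body segment  /
+ */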
+
+/*
+ * Gets a connection to be used for sending, either an async message or a
+ * request. On a DB_EID_MASTER channel this entails checking the current
+ * master, and possibly opening a new connection if the master has changed.
+ * Allow an old connection to stay intact, because responses to previous
+ * requests could still be arriving (though often the connection will have died
+ * anyway, if the master changed due to failure of the old master).
+ *
+ * If the local site is currently master, then for a master channel we return
+ * (via connp) a NULL pointer.
+ */
+static int
+get_channel_connection(channel, connp)
+ CHANNEL *channel;
+ REPMGR_CONNECTION **connp;
+{
+ ENV *env;
+ DB_REP *db_rep;
+ REP *rep;
+ REPMGR_CONNECTION *conn;
+ DB_CHANNEL *db_channel;
+ int eid, ret;
+
+ env = channel->env;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ db_channel = channel->db_channel;
+
+ /*
+ * On a specific-EID channel it's very simple, because there is only
+ * ever one connection, which was established when the channel was
+ * created.
+ */
+ if (db_channel->eid >= 0) {
+ *connp = channel->c.conn;
+ return (0);
+ }
+
+ /*
+ * For now we only support one connection at a time. When we support
+ * DB_EID_BROADCAST channels in the future, we will have to loop through
+ * all connected sites.
+ */
+ DB_ASSERT(env, db_channel->eid == DB_EID_MASTER);
+ eid = rep->master_id;
+ if (eid == db_rep->self_eid) {
+ *connp = NULL;
+ return (0);
+ }
+ if (eid == DB_EID_INVALID)
+ return (DB_REP_UNAVAIL);
+
+ LOCK_MUTEX(channel->c.conns.mutex);
+ if ((u_int)eid >= channel->c.conns.cnt) {
+ /*
+ * Allocate an array big enough such that `eid' is a valid
+ * index; initialize the newly allocated (tail) portion.
+ */
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "Grow master-channel array to accommodate EID %d", eid));
+ if ((ret = __os_realloc(env,
+ sizeof(REPMGR_CONNECTION *) * ((u_int)eid + 1),
+ &channel->c.conns.array)) != 0)
+ goto out;
+ memset(&channel->c.conns.array[channel->c.conns.cnt],
+ 0,
+ sizeof(REPMGR_CONNECTION *) *
+ (((u_int)eid + 1) - channel->c.conns.cnt));
+ channel->c.conns.cnt = (u_int)eid + 1;
+ }
+ DB_ASSERT(env, (u_int)eid < channel->c.conns.cnt);
+
+ if ((conn = channel->c.conns.array[eid]) == NULL) {
+ if ((ret = establish_connection(env, eid, &conn)) != 0)
+ goto out;
+
+ /*
+ * Even though `conn' is a newly created object, by the time we
+ * get here it has already been given out to the select()
+ * thread, so we should hold the mutex while incrementing the
+ * ref count.
+ */
+ LOCK_MUTEX(db_rep->mutex);
+ channel->c.conns.array[eid] = conn;
+ conn->ref_count++;
+ UNLOCK_MUTEX(db_rep->mutex);
+ }
+
+ *connp = conn;
+ ret = 0;
+out:
+ UNLOCK_MUTEX(channel->c.conns.mutex);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_send_request __P((DB_CHANNEL *,
+ * PUBLIC: DBT *, u_int32_t, DBT *, db_timeout_t, u_int32_t));
+ */
+int
+__repmgr_send_request(db_channel, request, nrequest, response, timeout, flags)
+ DB_CHANNEL *db_channel;
+ DBT *request;
+ u_int32_t nrequest;
+ DBT *response;
+ db_timeout_t timeout;
+ u_int32_t flags;
+{
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ DB_REP *db_rep;
+ CHANNEL *channel;
+ REPMGR_CONNECTION *conn;
+ REPMGR_IOVECS *iovecs;
+ REPMGR_RESPONSE *resp;
+ struct response_wait ctx;
+ __repmgr_msg_metadata_args meta;
+ size_t sz;
+ void *dummy;
+ u_int32_t i, n;
+ int ret;
+
+ channel = db_channel->channel;
+ env = channel->env;
+ db_rep = env->rep_handle;
+
+ if ((ret = __db_fchk(env,
+ "DB_CHANNEL->send_request", flags, DB_MULTIPLE)) != 0)
+ return (ret);
+
+ if (db_channel->eid == DB_EID_BROADCAST) {
+ __db_errx(env, DB_STR("3654",
+ "DB_CHANNEL->send_request() not supported on DB_EID_BROADCAST channel"));
+ return (EINVAL);
+ }
+
+ ENV_ENTER(env, ip);
+ ret = get_channel_connection(channel, &conn);
+ ENV_LEAVE(env, ip);
+ if (ret != 0)
+ return (ret);
+
+ if (conn == NULL)
+ return (request_self(env, request, nrequest, response, flags));
+
+ /* Find an available array slot, or grow the array if necessary. */
+ LOCK_MUTEX(db_rep->mutex);
+ for (i = 0; i < conn->aresp; i++)
+ if (!(F_ISSET(&conn->responses[i], RESP_IN_USE)))
+ break;
+	if (i == conn->aresp) {
+		n = conn->aresp == 0 ? 1 : conn->aresp * 2;
+		/* Don't touch the (unchanged) array if the realloc failed. */
+		if ((ret = __os_realloc(env,
+		    sizeof(REPMGR_RESPONSE) * n, &conn->responses)) != 0) {
+			UNLOCK_MUTEX(db_rep->mutex);
+			return (ret);
+		}
+		memset(&conn->responses[i], 0,
+		    sizeof(REPMGR_RESPONSE) * (n - i));
+		conn->aresp = n;
+	}
+ resp = &conn->responses[i];
+ resp->flags = RESP_IN_USE | RESP_THREAD_WAITING;
+ resp->dbt = *response;
+ resp->ret = 0;
+ UNLOCK_MUTEX(db_rep->mutex);
+
+ /*
+ * The index "i" is stable, but the address in the "resp" pointer could
+ * change while we drop the mutex, if another thread has to grow the
+ * allocated array. So we can't use "resp" again until after we set it
+ * again, from "i", under mutex protection.
+ */
+
+ meta.tag = i;
+ meta.flags = REPMGR_REQUEST_MSG_TYPE |
+ (LF_ISSET(DB_MULTIPLE) ? REPMGR_MULTI_RESP : 0) |
+ (F_ISSET(response, DB_DBT_USERMEM) ? REPMGR_RESPONSE_LIMIT : 0);
+ meta.limit = response->ulen;
+
+ /*
+ * Build an iovecs structure describing the request message, and then
+ * send it.
+ */
+ if ((ret = __repmgr_build_msg_out(env,
+ request, nrequest, &meta, &iovecs)) != 0) {
+ /*
+ * Since we haven't sent the message yet, there's no chance the
+ * select thread has started relying on the REPMGR_RESPONSE, so
+ * it's easy to deallocate it.
+ */
+ LOCK_MUTEX(db_rep->mutex);
+ F_CLR(&conn->responses[i], RESP_IN_USE | RESP_THREAD_WAITING);
+ UNLOCK_MUTEX(db_rep->mutex);
+ return (ret);
+ }
+
+ timeout = timeout > 0 ? timeout : db_channel->timeout;
+ LOCK_MUTEX(db_rep->mutex);
+ ret = __repmgr_send_many(env, conn, iovecs, timeout);
+ if (ret == DB_TIMEOUT)
+ F_CLR(&conn->responses[i], RESP_IN_USE | RESP_THREAD_WAITING);
+ UNLOCK_MUTEX(db_rep->mutex);
+ __os_free(env, iovecs);
+ if (ret != 0) {
+ /*
+ * An error while writing will force the connection to be
+ * closed, busted, abandoned. Since there could be a few app
+ * threads waiting, *any* abandoning of a connection will have
+ * to wake up those threads, with a COMPLETE indication and an
+ * error code. That's more than we want to tackle here.
+ */
+ return (ret);
+ }
+
+ /*
+ * Here, we've successfully sent the request. Once we've gotten this
+ * far, the select thread owns the REPMGR_RESPONSE slot until it marks
+ * it complete.
+ */
+ ctx.conn = conn;
+ ctx.index = i;
+ LOCK_MUTEX(db_rep->mutex);
+ ret = __repmgr_await_cond(env,
+ response_complete, &ctx, timeout, &conn->response_waiters);
+
+ resp = &conn->responses[i];
+ if (ret == 0) {
+ DB_ASSERT(env, F_ISSET(resp, RESP_COMPLETE));
+ *response = resp->dbt;
+ if ((ret = resp->ret) == 0 && LF_ISSET(DB_MULTIPLE))
+ adjust_bulk_response(env, response);
+ F_CLR(resp, RESP_IN_USE | RESP_THREAD_WAITING);
+
+ } else {
+ F_CLR(resp, RESP_THREAD_WAITING);
+ if (ret == DB_TIMEOUT && F_ISSET(resp, RESP_READING)) {
+ /*
+ * The select thread is in the midst of reading the
+ * response, but we're about to yank the buffer out from
+ * under it. So, replace it with a dummy buffer.
+ * (There's no way to abort the reading of a message
+ * part-way through.)
+ *
+ * Notice that whatever buffer the user is getting back,
+ * including her own in the case of USERMEM, may already
+ * have some partial data written into it.
+ *
+ * We always read responses in just one single chunk, so
+ * figuring out the needed buffer size is fairly simple.
+ */
+ DB_ASSERT(env, conn->iovecs.offset == 0 &&
+ conn->iovecs.count == 1);
+ sz = conn->iovecs.vectors[0].iov_len;
+
+ if ((ret = __os_malloc(env, sz, &dummy)) != 0)
+ goto out;
+ __repmgr_iovec_init(&conn->iovecs);
+ DB_INIT_DBT(resp->dbt, dummy, sz);
+ __repmgr_add_dbt(&conn->iovecs, &resp->dbt);
+ F_SET(resp, RESP_DUMMY_BUF);
+ }
+ }
+
+out:
+ UNLOCK_MUTEX(db_rep->mutex);
+ return (ret);
+}
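+
+/*
+ * A hypothetical caller requesting a multi-segment (bulk) response into
+ * its own buffer, with "ch" and "req" as in the sketch following
+ * __repmgr_channel() above (error handling omitted):
+ *
+ *	DBT resp;
+ *	char buf[10 * 1024];
+ *	memset(&resp, 0, sizeof(resp));
+ *	resp.data = buf;
+ *	resp.ulen = sizeof(buf);
+ *	resp.flags = DB_DBT_USERMEM;
+ *	ch->send_request(ch, &req, 1, &resp, 0, DB_MULTIPLE);
+ *
+ * On success the buffer is in standard bulk format and can be walked
+ * with DB_MULTIPLE_INIT and DB_MULTIPLE_NEXT.
+ */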
+
+static int
+response_complete(env, ctx)
+ ENV *env;
+ void *ctx;
+{
+ REPMGR_CONNECTION *conn;
+ struct response_wait *rw;
+
+ COMPQUIET(env, NULL);
+
+ rw = ctx;
+ conn = rw->conn;
+ return (F_ISSET(&conn->responses[rw->index], RESP_COMPLETE) ||
+ conn->state == CONN_DEFUNCT);
+}
+
+/*
+ * "Send" a request to ourselves, by invoking the application's call-back
+ * function directly, in the case where a channel directed to DB_EID_MASTER is
+ * used on a master.
+ */
+static int
+request_self(env, request, nrequest, response, flags)
+ ENV *env;
+ DBT *request;
+ u_int32_t nrequest;
+ DBT *response;
+ u_int32_t flags;
+{
+ DB_REP *db_rep;
+ DB_CHANNEL db_channel;
+ CHANNEL channel;
+ __repmgr_msg_metadata_args meta;
+
+ db_rep = env->rep_handle;
+ if (db_rep->msg_dispatch == NULL) {
+ __db_errx(env, DB_STR("3655",
+ "No message dispatch call-back function has been configured"));
+ return (DB_NOSERVER);
+ }
+
+ db_channel.channel = &channel;
+ db_channel.send_msg = __repmgr_send_response;
+
+	/*
+	 * Supply stub functions for the methods that are inapplicable
+	 * within a message dispatch function.
+	 */
+ db_channel.close = __repmgr_channel_close_inval;
+ db_channel.send_request = __repmgr_send_request_inval;
+ db_channel.set_timeout = __repmgr_channel_timeout_inval;
+
+ channel.env = env;
+ channel.c.conn = NULL;
+ channel.responded = FALSE;
+ channel.meta = &meta;
+ channel.response.dbt = *response;
+
+ meta.flags = REPMGR_REQUEST_MSG_TYPE |
+ (LF_ISSET(DB_MULTIPLE) ? REPMGR_MULTI_RESP : 0) |
+ (F_ISSET(response, DB_DBT_USERMEM) ? REPMGR_RESPONSE_LIMIT : 0);
+ meta.limit = response->ulen;
+
+ (*db_rep->msg_dispatch)(env->dbenv,
+ &db_channel, request, nrequest, DB_REPMGR_NEED_RESPONSE);
+
+ if (!channel.responded) {
+ __db_errx(env, DB_STR("3656",
+ "Application failed to provide a response"));
+ return (DB_KEYEMPTY);
+ } else {
+ response->data = channel.response.dbt.data;
+ response->size = channel.response.dbt.size;
+ if (LF_ISSET(DB_MULTIPLE))
+ adjust_bulk_response(env, response);
+ }
+ return (0);
+}
+
+static void
+adjust_bulk_response(env, response)
+ ENV *env;
+ DBT *response;
+{
+ u_int32_t n, *p;
+
+#ifndef DIAGNOSTIC
+ COMPQUIET(env, NULL);
+#endif
+
+ /*
+ * Convert bulk-buffer segment info to host byte-order, and count
+ * segments. See the definition of DB_MULTIPLE_INIT for a reminder of
+ * the structure of a bulk buffer. Each segment has both an offset and
+ * a length, so "n" ends up as the number of u_int32_t words we (might)
+ * need to shuffle, below.
+ */
+ p = (u_int32_t *)((u_int8_t *)response->data +
+ response->size - sizeof(u_int32_t));
+ for (n = 1; *p != (u_int32_t)-1; p -= 2) {
+ DB_ASSERT(env, p > (u_int32_t *)response->data);
+ p[0] = ntohl(p[0]);
+ p[-1] = ntohl(p[-1]);
+ n += 2;
+ }
+ /*
+ * The bulk pointers appear at the end of the transmitted response, so
+ * unless the buffer happened to be exactly the right size we need to
+ * shuffle them to the end of the buffer.
+ */
+ if (F_ISSET(response, DB_DBT_USERMEM))
+ memmove((u_int8_t *)response->data +
+ response->ulen - n * sizeof(u_int32_t),
+ p, n * sizeof(u_int32_t));
+ else
+ response->ulen = response->size;
+}
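+
+/*
+ * A concrete (made-up) picture of the transformation above: a response
+ * carrying one 100-byte record arrives with the last three u_int32_t
+ * words of the body being, in ascending address order,
+ *
+ *	[... 100 bytes of data ...][-1][length=100][offset=0]
+ *
+ * where the length and offset are in network byte order.  After the
+ * byte-swapping loop, for a DB_DBT_USERMEM buffer with ulen 1000 the
+ * three words are slid to offsets 988..999, where DB_MULTIPLE_NEXT
+ * expects to find them; for an allocated buffer we instead set ulen =
+ * size, so the walking macros see a "full" buffer.
+ */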
+
+/*
+ * Implementation of DB_CHANNEL->send_msg() method for use in recipient's msg
+ * dispatch callback function.
+ *
+ * PUBLIC: int __repmgr_send_response __P((DB_CHANNEL *,
+ * PUBLIC: DBT *, u_int32_t, u_int32_t));
+ */
+int
+__repmgr_send_response(db_channel, msg, nmsg, flags)
+ DB_CHANNEL *db_channel;
+ DBT *msg;
+ u_int32_t nmsg;
+ u_int32_t flags;
+{
+ ENV *env;
+ DB_REP *db_rep;
+ CHANNEL *channel;
+ REPMGR_CONNECTION *conn;
+ REPMGR_IOVECS iovecs, *iovecsp;
+ DBT *dbt;
+ __repmgr_msg_hdr_args msg_hdr;
+ u_int8_t msg_hdr_buf[__REPMGR_MSG_HDR_SIZE], *msg_hdr_buf_p;
+ size_t sz;
+ int alloc, ret;
+
+ COMPQUIET(iovecsp, NULL);
+
+ channel = db_channel->channel;
+ env = channel->env;
+ db_rep = env->rep_handle;
+ conn = channel->c.conn;
+
+ if ((ret = __db_fchk(env,
+ "DB_CHANNEL->send_msg", flags, 0)) != 0)
+ return (ret);
+
+ if (!F_ISSET(channel->meta, REPMGR_REQUEST_MSG_TYPE))
+ return (send_msg_conn(env, conn, msg, nmsg));
+
+ if (channel->responded) {
+ __db_errx(env, DB_STR("3657",
+ "a response has already been sent"));
+ return (EINVAL);
+ }
+
+ alloc = FALSE;
+ if (F_ISSET(channel->meta, REPMGR_MULTI_RESP)) {
+ /*
+ * Originator accepts bulk format: response can be any number of
+ * segments.
+ */
+ if ((ret = __repmgr_build_data_out(env,
+ msg, nmsg, NULL, &iovecsp)) != 0)
+ goto out;
+ alloc = TRUE;
+
+ /*
+ * Set buffer pointer to space we "know" build_data_out reserved
+ * for us.
+ */
+ msg_hdr_buf_p = (u_int8_t *)iovecsp->vectors[0].iov_base;
+ msg_hdr.type = REPMGR_APP_RESPONSE;
+ APP_RESP_TAG(msg_hdr) = channel->meta->tag;
+ APP_RESP_BUFFER_SIZE(msg_hdr) =
+ (u_int32_t)(iovecsp->total_bytes - __REPMGR_MSG_HDR_SIZE);
+ __repmgr_msg_hdr_marshal(env, &msg_hdr, msg_hdr_buf_p);
+ } else if (nmsg > 1) {
+ __db_errx(env, DB_STR("3658",
+ "originator does not accept multi-segment response"));
+ goto small;
+ } else {
+ iovecsp = &iovecs;
+ __repmgr_iovec_init(iovecsp);
+ msg_hdr.type = REPMGR_APP_RESPONSE;
+ APP_RESP_TAG(msg_hdr) = channel->meta->tag;
+ __repmgr_add_buffer(iovecsp,
+ msg_hdr_buf, __REPMGR_MSG_HDR_SIZE);
+ if (nmsg == 0)
+ APP_RESP_BUFFER_SIZE(msg_hdr) = 0;
+ else if ((APP_RESP_BUFFER_SIZE(msg_hdr) = msg->size) > 0)
+ __repmgr_add_dbt(iovecsp, msg);
+ __repmgr_msg_hdr_marshal(env, &msg_hdr, msg_hdr_buf);
+ }
+
+ if (F_ISSET(channel->meta, REPMGR_RESPONSE_LIMIT) &&
+ (APP_RESP_BUFFER_SIZE(msg_hdr) > channel->meta->limit)) {
+ __db_errx(env, DB_STR("3659",
+ "originator's USERMEM buffer too small"));
+small:
+ if (conn == NULL)
+ channel->response.ret = DB_BUFFER_SMALL;
+ else
+ (void)__repmgr_send_err_resp(env,
+ channel, DB_BUFFER_SMALL);
+ ret = EINVAL;
+ } else {
+ if (conn == NULL) {
+ sz = APP_RESP_BUFFER_SIZE(msg_hdr);
+ dbt = &channel->response.dbt;
+ if (F_ISSET(dbt, DB_DBT_MALLOC))
+ ret = __os_umalloc(env, sz, &dbt->data);
+ else if (F_ISSET(dbt, DB_DBT_REALLOC)) {
+ if (dbt->data == NULL || dbt->size < sz)
+ ret = __os_urealloc(env,
+ sz, &dbt->data);
+ else
+ ret = 0;
+ }
+ /* Don't touch the buffer if the allocation above failed. */
+ if (ret != 0)
+ goto out;
+ dbt->size = (u_int32_t)sz;
+ copy_body(dbt->data, iovecsp);
+ channel->response.ret = 0;
+ } else {
+ LOCK_MUTEX(db_rep->mutex);
+ ret = __repmgr_send_many(env, conn, iovecsp, 0);
+ UNLOCK_MUTEX(db_rep->mutex);
+ }
+ }
+
+out:
+ if (alloc)
+ __os_free(env, iovecsp);
+
+ /*
+ * Once we've handed the tag back to the originator it becomes
+ * meaningless, so we can't use it again. Note the fact that we've
+ * responded, so that we don't try.
+ */
+ channel->responded = TRUE;
+
+ return (ret);
+}
+
+static int
+__repmgr_build_msg_out(env, msg, nmsg, meta, iovecsp)
+ ENV *env;
+ DBT *msg;
+ u_int32_t nmsg;
+ __repmgr_msg_metadata_args *meta;
+ REPMGR_IOVECS **iovecsp;
+{
+ REPMGR_IOVECS *iovecs;
+ __repmgr_msg_hdr_args msg_hdr;
+ u_int8_t *msg_hdr_buf;
+ int ret;
+
+ if ((ret = __repmgr_build_data_out(env, msg, nmsg, meta, &iovecs)) != 0)
+ return (ret);
+
+ /*
+ * The IOVECS holds the entire message to be transmitted, including the
+ * 9-byte header. The header contains the length of the remaining part
+ * of the message. The header buffer area is of course pointed to by
+ * the first of the io vectors.
+ */
+ msg_hdr_buf = (u_int8_t *)iovecs->vectors[0].iov_base;
+ msg_hdr.type = REPMGR_APP_MESSAGE;
+ APP_MSG_BUFFER_SIZE(msg_hdr) =
+ (u_int32_t)(iovecs->total_bytes - __REPMGR_MSG_HDR_SIZE);
+ APP_MSG_SEGMENT_COUNT(msg_hdr) = nmsg;
+ __repmgr_msg_hdr_marshal(env, &msg_hdr, msg_hdr_buf);
+
+ *iovecsp = iovecs;
+ return (0);
+}
+
+/*
+ * Allocate and build most of an outgoing message, leaving it up to the caller
+ * to fill in the header afterwards.
+ */
+static int
+__repmgr_build_data_out(env, msg, nmsg, meta, iovecsp)
+ ENV *env;
+ DBT *msg;
+ u_int32_t nmsg;
+ __repmgr_msg_metadata_args *meta;
+ REPMGR_IOVECS **iovecsp;
+{
+ REPMGR_IOVECS *iovecs;
+ u_int32_t *bulk_base, *bulk_ptr, i, n;
+ u_int8_t *membase, *meta_buf, *msg_hdr_buf, *p, *pad;
+ void *inc_p;
+ size_t align, bulk_area_sz, memsize, segments, sz, offset;
+ int ret;
+
+ COMPQUIET(pad, NULL);
+
+ /*
+ * The actual message as it will be sent on the wire is composed of the
+ * following parts:
+ *
+ * (a) the 9-byte header
+ * (b) for each msg DBT ('nmsg' of them):
+ * (b.1) the data itself, and
+ * (b.2) an alignment pad, if necessary
+ * (c) trailing section for bulk-style pointers (2 words per segment,
+ * plus a -1 end-marker)
+ * (d) message meta-data (optionally)
+ *
+ * Note that nmsg could be 0.
+ */
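+ /*
+ * A layout sketch (illustrative only) for nmsg == 2, where only the
+ * second segment needs padding and meta-data is present:
+ *
+ * [9-byte hdr][seg0][seg1][pad][-1][len1][off1][len0][off0][meta]
+ */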
+
+ /* First, count how many segments need padding. */
+ n = 0;
+ align = sizeof(double);
+ for (i = 0; i < nmsg; i++) {
+ p = msg[i].data;
+ p = &p[msg[i].size];
+ inc_p = ALIGNP_INC(p, align);
+ if ((u_int8_t *)inc_p > p)
+ n++;
+ }
+
+ /*
+ * Here we allocate memory to hold the actual pieces of the message we
+ * will send, plus the iovecs structure that points to those pieces. We
+ * don't include the memory for the user's data (item (b.1) from the
+ * above explanation), since the user is supplying them directly. Also
+ * note that we can reuse just one padding buffer even if we need to
+ * send it (i.e., point to it from an iovec) more than once.
+ *
+ * According to the list of message segments explained above, the total
+ * number of iovec elements we need is (1 + nmsg + n + 1 + f(meta)).
+ */
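+ /*
+ * For example (illustrative only): with nmsg == 2, one segment
+ * needing padding (n == 1) and meta-data present, that is
+ * 1 (hdr) + 2 (data) + 1 (pad) + 1 (bulk ptrs) + 1 (meta) = 6
+ * iovec slots, matching the "segments" computation below.
+ */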
+ segments = nmsg + n + (meta == NULL ? 2 : 3);
+ sz = segments > MIN_IOVEC ? REPMGR_IOVECS_ALLOC_SZ(segments) :
+ sizeof(REPMGR_IOVECS);
+
+ bulk_area_sz = (nmsg * 2 + 1) * sizeof(u_int32_t);
+ memsize = sz + __REPMGR_MSG_HDR_SIZE +
+ bulk_area_sz + (n > 0 ? align : 0) + __REPMGR_MSG_METADATA_SIZE;
+
+ if ((ret = __os_malloc(env, memsize, &membase)) != 0)
+ return (ret);
+ p = membase;
+ iovecs = (REPMGR_IOVECS *)p;
+ p += sz;
+ bulk_base = (u_int32_t *)p;
+ p += bulk_area_sz;
+ if (n > 0) {
+ pad = p;
+ memset(pad, 0, align);
+ p += align;
+ }
+ msg_hdr_buf = p;
+ p += __REPMGR_MSG_HDR_SIZE;
+ meta_buf = p;
+
+ /*
+ * The message header appears first (on the wire), so we have to add its
+ * buffer address to the iovec list first. But we don't actually
+ * compose the content; that's the responsibility of the caller, after
+ * we return.
+ */
+ __repmgr_iovec_init(iovecs);
+ __repmgr_add_buffer(iovecs, msg_hdr_buf, __REPMGR_MSG_HDR_SIZE);
+
+ offset = 0;
+ bulk_ptr = &bulk_base[2*nmsg + 1]; /* Work backward from the end. */
+ for (i = 0; i < nmsg; i++) {
+ p = msg[i].data;
+ sz = (size_t)msg[i].size;
+
+ /*
+ * Format of bulk pointers is similar to the usage of
+ * DB_MULTIPLE_NEXT, but note that the lengths we pass are of
+ * course for the actual data itself, not including any
+ * padding.
+ */
+ *--bulk_ptr = htonl((u_long)offset);
+ *--bulk_ptr = htonl((u_long)sz);
+
+ __repmgr_add_dbt(iovecs, &msg[i]);
+ offset += sz;
+
+ p = &p[sz];
+ inc_p = ALIGNP_INC(p, align);
+ if ((u_int8_t *)inc_p > p) {
+ DB_ASSERT(env, n > 0);
+ sz = (size_t)((u_int8_t *)inc_p - p);
+ DB_ASSERT(env, sz <= align);
+ __repmgr_add_buffer(iovecs, pad, sz);
+ offset += sz;
+ }
+ }
+ *--bulk_ptr = (u_int32_t)-1;
+ __repmgr_add_buffer(iovecs, bulk_ptr, bulk_area_sz);
+
+ if (meta != NULL) {
+ __repmgr_msg_metadata_marshal(env, meta, meta_buf);
+ __repmgr_add_buffer(iovecs,
+ meta_buf, __REPMGR_MSG_METADATA_SIZE);
+ }
+
+ *iovecsp = iovecs;
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_channel_close __P((DB_CHANNEL *, u_int32_t));
+ */
+int
+__repmgr_channel_close(dbchan, flags)
+ DB_CHANNEL *dbchan;
+ u_int32_t flags;
+{
+ ENV *env;
+ DB_REP *db_rep;
+ REPMGR_CONNECTION *conn;
+ CHANNEL *channel;
+ u_int32_t i;
+ int ret, t_ret;
+
+ channel = dbchan->channel;
+ env = channel->env;
+ ret = __db_fchk(env, "DB_CHANNEL->close", flags, 0);
+ db_rep = env->rep_handle;
+
+ /*
+ * Disable connection(s) (if not already done due to an error having
+ * occurred previously); release our reference to conn struct(s).
+ */
+ LOCK_MUTEX(db_rep->mutex);
+ if (dbchan->eid >= 0) {
+ conn = channel->c.conn;
+ if (conn->state != CONN_DEFUNCT &&
+ (t_ret = __repmgr_disable_connection(env, conn)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ if ((t_ret = __repmgr_decr_conn_ref(env, conn)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ } else if (channel->c.conns.cnt > 0) {
+ for (i = 0; i < channel->c.conns.cnt; i++)
+ if ((conn = channel->c.conns.array[i]) != NULL) {
+ if (conn->state != CONN_DEFUNCT &&
+ (t_ret = __repmgr_disable_connection(env,
+ conn)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __repmgr_decr_conn_ref(env,
+ conn)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ __os_free(env, channel->c.conns.array);
+ }
+ UNLOCK_MUTEX(db_rep->mutex);
+
+ if (!IS_VALID_EID(dbchan->eid) && channel->c.conns.mutex != NULL &&
+ (t_ret = __repmgr_destroy_mutex(env,
+ channel->c.conns.mutex)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if ((t_ret = __repmgr_wake_main_thread(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ __os_free(env, channel);
+ __os_free(env, dbchan);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_channel_timeout __P((DB_CHANNEL *, db_timeout_t));
+ */
+int
+__repmgr_channel_timeout(chan, timeout)
+ DB_CHANNEL *chan;
+ db_timeout_t timeout;
+{
+ chan->timeout = timeout;
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_send_request_inval __P((DB_CHANNEL *,
+ * PUBLIC: DBT *, u_int32_t, DBT *, db_timeout_t, u_int32_t));
+ */
+int
+__repmgr_send_request_inval(dbchan, request, nrequest, response, timeout, flags)
+ DB_CHANNEL *dbchan;
+ DBT *request;
+ u_int32_t nrequest;
+ DBT *response;
+ db_timeout_t timeout;
+ u_int32_t flags;
+{
+ COMPQUIET(request, NULL);
+ COMPQUIET(nrequest, 0);
+ COMPQUIET(response, NULL);
+ COMPQUIET(timeout, 0);
+ COMPQUIET(flags, 0);
+ return (bad_callback_method(dbchan, "send_request"));
+}
+
+/*
+ * PUBLIC: int __repmgr_channel_close_inval __P((DB_CHANNEL *, u_int32_t));
+ */
+int
+__repmgr_channel_close_inval(dbchan, flags)
+ DB_CHANNEL *dbchan;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+ return (bad_callback_method(dbchan, "close"));
+}
+
+/*
+ * PUBLIC: int __repmgr_channel_timeout_inval __P((DB_CHANNEL *, db_timeout_t));
+ */
+int
+__repmgr_channel_timeout_inval(dbchan, timeout)
+ DB_CHANNEL *dbchan;
+ db_timeout_t timeout;
+{
+ COMPQUIET(timeout, 0);
+ return (bad_callback_method(dbchan, "set_timeout"));
+}
+
+static int
+bad_callback_method(chan, method)
+ DB_CHANNEL *chan;
+ const char *method;
+{
+ __db_errx(chan->channel->env, DB_STR_A("3660",
+ "%s() invalid on DB_CHANNEL supplied to msg dispatch function",
+ "%s"), method);
+ return (EINVAL);
+}
+
+static int
+repmgr_only(env, method)
+ ENV *env;
+ const char *method;
+{
+ __db_errx(env, DB_STR_A("3661",
+ "%s: cannot call from base replication application",
+ "%s"), method);
+ return (EINVAL);
+}
+
+/*
+ * Attempts to join the replication group by finding a remote "helper" site
+ * and sending a join request message to it.
+ *
+ * PUBLIC: int __repmgr_join_group __P((ENV *));
+ */
+int
+__repmgr_join_group(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REPMGR_SITE *site;
+ repmgr_netaddr_t addr;
+ u_int i;
+ int pass, ret;
+
+ db_rep = env->rep_handle;
+
+ /*
+ * Make two passes through the site list. On the first pass, try
+ * joining via an existing, fully "present" site that we've found in the
+ * membership database. If that proves fruitless, on the second pass try
+ * any site marked as a bootstrap helper.
+ *
+ * On the first attempt to join, when we have found no database, the
+ * first pass will produce nothing. On a later attempt to rejoin after
+ * having been removed, it's better to give priority to existing
+ * remaining sites from the database, and only rely on bootstrap helpers
+ * as a last resort.
+ *
+ * pass 0 => present members
+ * pass 1 => helpers
+ */
+ LOCK_MUTEX(db_rep->mutex);
+ for (pass = 0; pass <= 1; pass++) {
+ FOR_EACH_REMOTE_SITE_INDEX(i) {
+ site = SITE_FROM_EID(i);
+ if (pass == 0 && site->membership != SITE_PRESENT)
+ continue;
+ if (pass == 1 &&
+ !FLD_ISSET(site->config, DB_BOOTSTRAP_HELPER))
+ continue;
+ addr = site->net_addr;
+ UNLOCK_MUTEX(db_rep->mutex);
+ if ((ret = join_group_at_site(env,
+ &addr)) == DB_REP_UNAVAIL) {
+ LOCK_MUTEX(db_rep->mutex);
+ continue;
+ }
+ return (ret);
+ }
+ }
+ UNLOCK_MUTEX(db_rep->mutex);
+ return (DB_REP_UNAVAIL);
+}
+
+/*
+ * Sends a request message to another site, asking for permission to join the
+ * replication group. Ideally the other site is the master, because only the
+ * master can grant that request. But since we're not currently part of the
+ * group, we generally don't know which site is master. If the target site is
+ * not master, it will respond by telling us who is.
+ */
+static int
+join_group_at_site(env, addrp)
+ ENV *env;
+ repmgr_netaddr_t *addrp;
+{
+ DB_REP *db_rep;
+ REPMGR_CONNECTION *conn;
+ SITE_STRING_BUFFER addr_buf;
+ repmgr_netaddr_t addr, myaddr;
+ __repmgr_gm_fwd_args fwd;
+ __repmgr_site_info_args site_info;
+ u_int8_t *p, *response_buf, siteinfo_buf[MAX_MSG_BUF];
+ char host_buf[MAXHOSTNAMELEN + 1], *host;
+ u_int32_t gen, type;
+ size_t len;
+ int ret, t_ret;
+
+ db_rep = env->rep_handle;
+
+ LOCK_MUTEX(db_rep->mutex);
+ myaddr = SITE_FROM_EID(db_rep->self_eid)->net_addr;
+ UNLOCK_MUTEX(db_rep->mutex);
+ len = strlen(myaddr.host) + 1;
+ DB_INIT_DBT(site_info.host, myaddr.host, len);
+ site_info.port = myaddr.port;
+ site_info.flags = 0;
+ ret = __repmgr_site_info_marshal(env,
+ &site_info, siteinfo_buf, sizeof(siteinfo_buf), &len);
+ DB_ASSERT(env, ret == 0);
+
+ conn = NULL;
+ response_buf = NULL;
+ gen = 0;
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC, "try join request to site %s",
+ __repmgr_format_addr_loc(addrp, addr_buf)));
+retry:
+ if ((ret = make_request_conn(env, addrp, &conn)) != 0)
+ return (ret);
+ if ((ret = __repmgr_send_sync_msg(env, conn,
+ REPMGR_JOIN_REQUEST, siteinfo_buf, (u_int32_t)len)) != 0)
+ goto err;
+
+ if ((ret = read_own_msg(env,
+ conn, &type, &response_buf, &len)) != 0)
+ goto err;
+
+ if (type == REPMGR_GM_FAILURE) {
+ ret = DB_REP_UNAVAIL;
+ goto err;
+ }
+ if (type == REPMGR_GM_FORWARD) {
+ /*
+ * The remote site we thought was master is telling us that some
+ * other site has become master. Retry with the new master.
+ * However, in order to avoid an endless cycle, only continue
+ * retrying as long as the master gen is advancing.
+ */
+ ret = __repmgr_close_connection(env, conn);
+ if ((t_ret = __repmgr_destroy_conn(env, conn)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ conn = NULL;
+ if (ret != 0)
+ goto err;
+
+ ret = __repmgr_gm_fwd_unmarshal(env, &fwd,
+ response_buf, len, &p);
+ DB_ASSERT(env, ret == 0);
+ if (fwd.gen > gen) {
+ if (fwd.host.size > MAXHOSTNAMELEN + 1) {
+ ret = DB_REP_UNAVAIL;
+ goto err;
+ }
+ host = fwd.host.data;
+ host[fwd.host.size-1] = '\0'; /* Just to be sure. */
+ (void)strcpy(host_buf, host);
+ addr.host = host_buf;
+ addr.port = fwd.port;
+ addrp = &addr;
+ gen = fwd.gen;
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "will retry join request at forwarded master %s, gen %lu",
+ __repmgr_format_addr_loc(addrp, addr_buf),
+ (u_long)gen));
+ __os_free(env, response_buf);
+ response_buf = NULL;
+ goto retry;
+ } else {
+ ret = DB_REP_UNAVAIL;
+ goto err;
+ }
+ }
+ if (type == REPMGR_JOIN_SUCCESS)
+ ret = __repmgr_refresh_membership(env, response_buf, len);
+ else
+ ret = DB_REP_UNAVAIL; /* Invalid response: protocol violation */
+
+err:
+ if (conn != NULL) {
+ if ((t_ret = __repmgr_close_connection(env, conn)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ if ((t_ret = __repmgr_destroy_conn(env, conn)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ }
+ if (response_buf != NULL)
+ __os_free(env, response_buf);
+
+ return (ret);
+}
+
+/*
+ * Reads a whole message when we expect to get a REPMGR_OWN_MSG.
+ */
+static int
+read_own_msg(env, conn, typep, bufp, lenp)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ u_int32_t *typep;
+ u_int8_t **bufp;
+ size_t *lenp;
+{
+ __repmgr_msg_hdr_args msg_hdr;
+ u_int8_t *buf;
+ u_int32_t type;
+ size_t size;
+ int ret;
+
+ __repmgr_reset_for_reading(conn);
+ if ((ret = __repmgr_read_conn(conn)) != 0)
+ goto err;
+ ret = __repmgr_msg_hdr_unmarshal(env, &msg_hdr,
+ conn->msg_hdr_buf, __REPMGR_MSG_HDR_SIZE, NULL);
+ DB_ASSERT(env, ret == 0);
+
+ if ((conn->msg_type = msg_hdr.type) != REPMGR_OWN_MSG) {
+ ret = DB_REP_UNAVAIL; /* Protocol violation. */
+ goto err;
+ }
+ type = REPMGR_OWN_MSG_TYPE(msg_hdr);
+ if ((size = (size_t)REPMGR_OWN_BUF_SIZE(msg_hdr)) > 0) {
+ conn->reading_phase = DATA_PHASE;
+ __repmgr_iovec_init(&conn->iovecs);
+
+ if ((ret = __os_malloc(env, size, &buf)) != 0)
+ goto err;
+ conn->input.rep_message = NULL;
+
+ __repmgr_add_buffer(&conn->iovecs, buf, size);
+ if ((ret = __repmgr_read_conn(conn)) != 0) {
+ __os_free(env, buf);
+ goto err;
+ }
+ *bufp = buf;
+ }
+
+ *typep = type;
+ *lenp = size;
+
+err:
+ return (ret);
+}
+
+static int
+make_request_conn(env, addr, connp)
+ ENV *env;
+ repmgr_netaddr_t *addr;
+ REPMGR_CONNECTION **connp;
+{
+ DBT vi;
+ __repmgr_msg_hdr_args msg_hdr;
+ __repmgr_version_confirmation_args conf;
+ REPMGR_CONNECTION *conn;
+ int alloc, ret, unused;
+
+ alloc = FALSE;
+ if ((ret = __repmgr_connect(env, addr, &conn, &unused)) != 0)
+ return (ret);
+ conn->type = APP_CONNECTION;
+
+ /* Read a handshake msg to get version confirmation and parameters. */
+ if ((ret = __repmgr_read_conn(conn)) != 0)
+ goto err;
+ /*
+ * We can only get here after having read the full 9 bytes that we
+ * expect, so this can't fail.
+ */
+ DB_ASSERT(env, conn->reading_phase == SIZES_PHASE);
+ ret = __repmgr_msg_hdr_unmarshal(env, &msg_hdr,
+ conn->msg_hdr_buf, __REPMGR_MSG_HDR_SIZE, NULL);
+ DB_ASSERT(env, ret == 0);
+ __repmgr_iovec_init(&conn->iovecs);
+ conn->reading_phase = DATA_PHASE;
+
+ if ((ret = __repmgr_prepare_simple_input(env, conn, &msg_hdr)) != 0)
+ goto err;
+ alloc = TRUE;
+
+ if ((ret = __repmgr_read_conn(conn)) != 0)
+ goto err;
+
+ /*
+ * Analyze the handshake msg, and stash relevant info.
+ */
+ if ((ret = __repmgr_find_version_info(env, conn, &vi)) != 0)
+ goto err;
+ DB_ASSERT(env, vi.size > 0);
+ if ((ret = __repmgr_version_confirmation_unmarshal(env,
+ &conf, vi.data, vi.size, NULL)) != 0)
+ goto err;
+
+ if (conf.version < GM_MIN_VERSION) {
+ ret = DB_REP_UNAVAIL;
+ goto err;
+ }
+ conn->version = conf.version;
+
+err:
+ if (alloc) {
+ DB_ASSERT(env, conn->input.repmgr_msg.cntrl.size > 0);
+ __os_free(env, conn->input.repmgr_msg.cntrl.data);
+ DB_ASSERT(env, conn->input.repmgr_msg.rec.size > 0);
+ __os_free(env, conn->input.repmgr_msg.rec.data);
+ }
+ __repmgr_reset_for_reading(conn);
+ if (ret == 0)
+ *connp = conn;
+ else {
+ (void)__repmgr_close_connection(env, conn);
+ (void)__repmgr_destroy_conn(env, conn);
+ }
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_site __P((DB_ENV *,
+ * PUBLIC: const char *, u_int, DB_SITE **, u_int32_t));
+ */
+int
+__repmgr_site(dbenv, host, port, sitep, flags)
+ DB_ENV *dbenv;
+ const char *host;
+ u_int port;
+ DB_SITE **sitep;
+ u_int32_t flags;
+{
+ int ret;
+
+ if ((ret = __db_fchk(dbenv->env, "repmgr_site", flags, 0)) == 0)
+ ret = site_by_addr(dbenv->env, host, port, sitep);
+
+ return (ret);
+}
+
+static int
+site_by_addr(env, host, port, sitep)
+ ENV *env;
+ const char *host;
+ u_int port;
+ DB_SITE **sitep;
+{
+ DB_THREAD_INFO *ip;
+ DB_REP *db_rep;
+ DB_SITE *dbsite;
+ REPMGR_SITE *site;
+ int eid, locked, ret;
+
+ COMPQUIET(ip, NULL);
+ PANIC_CHECK(env);
+ db_rep = env->rep_handle;
+ ENV_NOT_CONFIGURED(env, db_rep->region, "repmgr_site", DB_INIT_REP);
+ if (APP_IS_BASEAPI(env))
+ return (repmgr_only(env, "repmgr_site"));
+ if ((ret = addr_chk(env, host, port)) != 0)
+ return (ret);
+
+ if (REP_ON(env)) {
+ LOCK_MUTEX(db_rep->mutex);
+ ENV_ENTER(env, ip);
+ locked = TRUE;
+ } else
+ locked = FALSE;
+ ret = __repmgr_find_site(env, host, port, &eid);
+ DB_ASSERT(env, IS_VALID_EID(eid));
+ site = SITE_FROM_EID(eid);
+ /*
+ * Point to the stable, permanent copy of the host name. That's the one
+ * we want the DB_SITE handle to point to, just as site_by_eid() does.
+ */
+ host = site->net_addr.host;
+ if (locked) {
+ ENV_LEAVE(env, ip);
+ UNLOCK_MUTEX(db_rep->mutex);
+ }
+ if (ret != 0)
+ return (ret);
+
+ if ((ret = init_dbsite(env, eid, host, port, &dbsite)) != 0)
+ return (ret);
+
+ /* Manipulating a site makes this a replication manager application. */
+ APP_SET_REPMGR(env);
+ *sitep = dbsite;
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_site_by_eid __P((DB_ENV *, int, DB_SITE **));
+ */
+int
+__repmgr_site_by_eid(dbenv, eid, sitep)
+ DB_ENV *dbenv;
+ int eid;
+ DB_SITE **sitep;
+{
+ ENV *env;
+ DB_REP *db_rep;
+ REPMGR_SITE *site;
+ DB_SITE *dbsite;
+ int ret;
+
+ env = dbenv->env;
+ PANIC_CHECK(env);
+ db_rep = env->rep_handle;
+
+ if (eid < 0 || eid >= (int)db_rep->site_cnt)
+ return (DB_NOTFOUND);
+ site = SITE_FROM_EID(eid);
+
+ if ((ret = init_dbsite(env, eid,
+ site->net_addr.host, site->net_addr.port, &dbsite)) != 0)
+ return (ret);
+ *sitep = dbsite;
+ return (0);
+}
+
+static int
+init_dbsite(env, eid, host, port, sitep)
+ ENV *env;
+ int eid;
+ const char *host;
+ u_int port;
+ DB_SITE **sitep;
+{
+ DB_SITE *dbsite;
+ int ret;
+
+ if ((ret = __os_calloc(env, 1, sizeof(DB_SITE), &dbsite)) != 0)
+ return (ret);
+
+ dbsite->env = env;
+ dbsite->eid = eid;
+ dbsite->host = host;
+ dbsite->port = port;
+ dbsite->flags = (REP_ON(env) ? 0 : DB_SITE_PREOPEN);
+
+ dbsite->get_address = __repmgr_get_site_address;
+ dbsite->get_config = __repmgr_get_config;
+ dbsite->get_eid = __repmgr_get_eid;
+ dbsite->set_config = __repmgr_site_config;
+ dbsite->remove = __repmgr_remove_site_pp;
+ dbsite->close = __repmgr_site_close;
+
+ *sitep = dbsite;
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_get_site_address __P((DB_SITE *,
+ * PUBLIC: const char **, u_int *));
+ */
+int
+__repmgr_get_site_address(dbsite, hostp, port)
+ DB_SITE *dbsite;
+ const char **hostp;
+ u_int *port;
+{
+ if (hostp != NULL)
+ *hostp = dbsite->host;
+ if (port != NULL)
+ *port = dbsite->port;
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_get_eid __P((DB_SITE *, int *));
+ */
+int
+__repmgr_get_eid(dbsite, eidp)
+ DB_SITE *dbsite;
+ int *eidp;
+{
+ int ret;
+
+ if ((ret = refresh_site(dbsite)) != 0)
+ return (ret);
+
+ if (F_ISSET(dbsite, DB_SITE_PREOPEN)) {
+ __db_errx(dbsite->env, DB_STR("3662",
+ "Can't determine EID before env open"));
+ return (EINVAL);
+ }
+ *eidp = dbsite->eid;
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_get_config __P((DB_SITE *, u_int32_t, u_int32_t *));
+ */
+int
+__repmgr_get_config(dbsite, which, valuep)
+ DB_SITE *dbsite;
+ u_int32_t which;
+ u_int32_t *valuep;
+{
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ REGINFO *infop;
+ REP *rep;
+ REPMGR_SITE *site;
+ SITEINFO *sites;
+ int ret;
+
+ env = dbsite->env;
+ db_rep = env->rep_handle;
+
+ if ((ret = refresh_site(dbsite)) != 0)
+ return (ret);
+ LOCK_MUTEX(db_rep->mutex);
+ DB_ASSERT(env, IS_VALID_EID(dbsite->eid));
+ site = SITE_FROM_EID(dbsite->eid);
+ if (REP_ON(env)) {
+ rep = db_rep->region;
+ infop = env->reginfo;
+
+ ENV_ENTER(env, ip);
+ MUTEX_LOCK(env, rep->mtx_repmgr);
+ sites = R_ADDR(infop, rep->siteinfo_off);
+
+ site->config = sites[dbsite->eid].config;
+
+ MUTEX_UNLOCK(env, rep->mtx_repmgr);
+ ENV_LEAVE(env, ip);
+ }
+ *valuep = FLD_ISSET(site->config, which) ? 1 : 0;
+ UNLOCK_MUTEX(db_rep->mutex);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_site_config __P((DB_SITE *, u_int32_t, u_int32_t));
+ */
+int
+__repmgr_site_config(dbsite, which, value)
+ DB_SITE *dbsite;
+ u_int32_t which;
+ u_int32_t value;
+{
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ REGINFO *infop;
+ REP *rep;
+ REPMGR_SITE *site;
+ SITEINFO *sites;
+ int ret;
+
+ env = dbsite->env;
+ db_rep = env->rep_handle;
+
+ if ((ret = refresh_site(dbsite)) != 0)
+ return (ret);
+ switch (which) {
+ case DB_BOOTSTRAP_HELPER:
+ case DB_REPMGR_PEER:
+ if (dbsite->eid == db_rep->self_eid) {
+ __db_errx(env, DB_STR("3663",
+ "Site config value not applicable to local site"));
+ return (EINVAL);
+ }
+ break;
+ case DB_GROUP_CREATOR:
+ /*
+ * Ignore if this is set on a remote site. Users will often
+ * copy and edit a DB_CONFIG for all sites.
+ */
+ break;
+ case DB_LEGACY:
+ /* Applicable to either local or remote site. */
+ break;
+ case DB_LOCAL_SITE:
+ /*
+ * This special case needs extra processing, to set the
+ * "self_eid" index in addition to the flag bit.
+ */
+ if ((ret = set_local_site(dbsite, value)) != 0)
+ return (ret);
+ break;
+ default:
+ __db_errx(env,
+ DB_STR("3665", "Unrecognized site config value"));
+ return (EINVAL);
+ }
+
+ DB_ASSERT(env, IS_VALID_EID(dbsite->eid));
+ if (REP_ON(env)) {
+ rep = db_rep->region;
+ infop = env->reginfo;
+
+ LOCK_MUTEX(db_rep->mutex);
+ ENV_ENTER(env, ip);
+ MUTEX_LOCK(env, rep->mtx_repmgr);
+ sites = R_ADDR(infop, rep->siteinfo_off);
+ site = SITE_FROM_EID(dbsite->eid);
+
+ /*
+ * Make sure we're up to date with the shared-memory version. After
+ * env open, we never set private without also updating shared.
+ * But another process could have set the shared one, so shared
+ * is always "best."
+ */
+ site->config = sites[dbsite->eid].config;
+ if (value)
+ FLD_SET(site->config, which);
+ else
+ FLD_CLR(site->config, which);
+ if (site->config != sites[dbsite->eid].config) {
+ sites[dbsite->eid].config = site->config;
+ rep->siteinfo_seq++;
+ }
+ MUTEX_UNLOCK(env, rep->mtx_repmgr);
+ ENV_LEAVE(env, ip);
+ UNLOCK_MUTEX(db_rep->mutex);
+ } else {
+ site = SITE_FROM_EID(dbsite->eid);
+ if (value)
+ FLD_SET(site->config, which);
+ else
+ FLD_CLR(site->config, which);
+ }
+ return (0);
+}
+
+static int
+set_local_site(dbsite, value)
+ DB_SITE *dbsite;
+ u_int32_t value;
+{
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ REP *rep;
+ REPMGR_SITE *site;
+ int locked, ret;
+
+ COMPQUIET(rep, NULL);
+ COMPQUIET(ip, NULL);
+ env = dbsite->env;
+ db_rep = env->rep_handle;
+ ret = 0;
+
+ locked = FALSE;
+ if (REP_ON(env)) {
+ rep = db_rep->region;
+ LOCK_MUTEX(db_rep->mutex);
+ ENV_ENTER(env, ip);
+ MUTEX_LOCK(env, rep->mtx_repmgr);
+ locked = TRUE;
+ /* Make sure we're in sync first. */
+ if (IS_VALID_EID(rep->self_eid))
+ db_rep->self_eid = rep->self_eid;
+ }
+ if (!value && db_rep->self_eid == dbsite->eid) {
+ __db_errx(env, DB_STR("3666",
+ "A previously given local site may not be unset"));
+ ret = EINVAL;
+ } else if (IS_VALID_EID(db_rep->self_eid) &&
+ db_rep->self_eid != dbsite->eid) {
+ __db_errx(env, DB_STR("3667",
+ "A (different) local site has already been set"));
+ ret = EINVAL;
+ } else {
+ DB_ASSERT(env, IS_VALID_EID(dbsite->eid));
+ site = SITE_FROM_EID(dbsite->eid);
+ if (FLD_ISSET(site->config,
+ DB_BOOTSTRAP_HELPER | DB_REPMGR_PEER)) {
+ __db_errx(env, DB_STR("3668",
+ "Local site cannot have HELPER or PEER attributes"));
+ ret = EINVAL;
+ }
+ }
+ if (ret == 0) {
+ db_rep->self_eid = dbsite->eid;
+ if (locked) {
+ rep->self_eid = dbsite->eid;
+ rep->siteinfo_seq++;
+ }
+ }
+ if (locked) {
+ MUTEX_UNLOCK(env, rep->mtx_repmgr);
+ ENV_LEAVE(env, ip);
+ UNLOCK_MUTEX(db_rep->mutex);
+ }
+ return (ret);
+}
+
+/*
+ * Brings the dbsite's EID up to date, in case it got shuffled around across an
+ * env open.
+ */
+static int
+refresh_site(dbsite)
+ DB_SITE *dbsite;
+{
+ DB_REP *db_rep;
+ ENV *env;
+ REPMGR_SITE *site;
+
+ env = dbsite->env;
+ PANIC_CHECK(env);
+ if (F_ISSET(dbsite, DB_SITE_PREOPEN) && REP_ON(env)) {
+ db_rep = env->rep_handle;
+ LOCK_MUTEX(db_rep->mutex);
+ site = __repmgr_lookup_site(env, dbsite->host, dbsite->port);
+ DB_ASSERT(env, site != NULL);
+ dbsite->eid = EID_FROM_SITE(site);
+ F_CLR(dbsite, DB_SITE_PREOPEN);
+ UNLOCK_MUTEX(db_rep->mutex);
+ }
+ return (0);
+}
+
+static int
+__repmgr_remove_site_pp(dbsite)
+ DB_SITE *dbsite;
+{
+ int ret, t_ret;
+
+ ret = __repmgr_remove_site(dbsite);
+ /*
+ * The remove() method is documented as a destructor, which means that
+ * absolutely all calls must deallocate the handle, including error
+ * cases, even mutex failures.
+ */
+ if ((t_ret = __repmgr_site_close(dbsite)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+static int
+__repmgr_remove_site(dbsite)
+ DB_SITE *dbsite;
+{
+ ENV *env;
+ DB_REP *db_rep;
+ REP *rep;
+ REPMGR_CONNECTION *conn;
+ repmgr_netaddr_t addr;
+ __repmgr_site_info_args site_info;
+ u_int8_t *response_buf, siteinfo_buf[MAX_MSG_BUF];
+ size_t len;
+ u_int32_t type;
+ int master, ret, t_ret;
+
+ if ((ret = refresh_site(dbsite)) != 0)
+ return (ret);
+ env = dbsite->env;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ if (db_rep->repmgr_status != running || !SELECTOR_RUNNING(db_rep)) {
+ __db_errx(env, DB_STR("3669", "repmgr is not running"));
+ return (EINVAL);
+ }
+
+ if (!IS_VALID_EID((master = rep->master_id)))
+ return (DB_REP_UNAVAIL);
+ LOCK_MUTEX(db_rep->mutex);
+ DB_ASSERT(env, IS_VALID_EID(master));
+ addr = SITE_FROM_EID(master)->net_addr;
+ UNLOCK_MUTEX(db_rep->mutex);
+
+ len = strlen(dbsite->host) + 1;
+ DB_INIT_DBT(site_info.host, dbsite->host, len);
+ site_info.port = dbsite->port;
+ site_info.flags = 0;
+ ret = __repmgr_site_info_marshal(env,
+ &site_info, siteinfo_buf, sizeof(siteinfo_buf), &len);
+ DB_ASSERT(env, ret == 0);
+
+ conn = NULL;
+ response_buf = NULL;
+ if ((ret = make_request_conn(env, &addr, &conn)) != 0)
+ return (ret);
+ if ((ret = __repmgr_send_sync_msg(env, conn,
+ REPMGR_REMOVE_REQUEST, siteinfo_buf, (u_int32_t)len)) != 0)
+ goto err;
+ if ((ret = read_own_msg(env,
+ conn, &type, &response_buf, &len)) != 0)
+ goto err;
+ ret = type == REPMGR_REMOVE_SUCCESS ? 0 : DB_REP_UNAVAIL;
+err:
+ if (conn != NULL) {
+ if ((t_ret = __repmgr_close_connection(env, conn)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ if ((t_ret = __repmgr_destroy_conn(env, conn)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ }
+ if (response_buf != NULL)
+ __os_free(env, response_buf);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_site_close __P((DB_SITE *));
+ */
+int
+__repmgr_site_close(dbsite)
+ DB_SITE *dbsite;
+{
+ __os_free(dbsite->env, dbsite);
+ return (0);
+}
diff --git a/src/repmgr/repmgr_msg.c b/src/repmgr/repmgr_msg.c
new file mode 100644
index 00000000..13537823
--- /dev/null
+++ b/src/repmgr/repmgr_msg.c
@@ -0,0 +1,1655 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/txn.h"
+#include "dbinc_auto/repmgr_auto.h"
+
+static int dispatch_app_message __P((ENV *, REPMGR_MESSAGE *));
+static int finish_gmdb_update __P((ENV *,
+ DB_THREAD_INFO *, DBT *, u_int32_t, u_int32_t, __repmgr_member_args *));
+static int incr_gm_version __P((ENV *, DB_THREAD_INFO *, DB_TXN *));
+static void marshal_site_data __P((ENV *, u_int32_t, u_int8_t *, DBT *));
+static void marshal_site_key __P((ENV *,
+ repmgr_netaddr_t *, u_int8_t *, DBT *, __repmgr_member_args *));
+static int message_loop __P((ENV *, REPMGR_RUNNABLE *));
+static int process_message __P((ENV *, DBT *, DBT *, int));
+static int reject_fwd __P((ENV *, REPMGR_CONNECTION *));
+static int rescind_pending __P((ENV *,
+ DB_THREAD_INFO *, int, u_int32_t, u_int32_t));
+static int resolve_limbo_int __P((ENV *, DB_THREAD_INFO *));
+static int resolve_limbo_wrapper __P((ENV *, DB_THREAD_INFO *));
+static int send_permlsn __P((ENV *, u_int32_t, DB_LSN *));
+static int send_permlsn_conn __P((ENV *,
+ REPMGR_CONNECTION *, u_int32_t, DB_LSN *));
+static int serve_join_request __P((ENV *,
+ DB_THREAD_INFO *, REPMGR_MESSAGE *));
+static int serve_remove_request __P((ENV *,
+ DB_THREAD_INFO *, REPMGR_MESSAGE *));
+static int serve_repmgr_request __P((ENV *, REPMGR_MESSAGE *));
+
+/*
+ * Map one of the phase-1/provisional membership status values to its
+ * corresponding ultimate goal status: if "adding", the goal is to be fully
+ * "present". Otherwise ("deleting") the goal is to not even appear in the
+ * database at all (0).
+ */
+#define NEXT_STATUS(s) (u_int32_t)((s) == SITE_ADDING ? SITE_PRESENT : 0)
+
+/*
+ * PUBLIC: void *__repmgr_msg_thread __P((void *));
+ */
+void *
+__repmgr_msg_thread(argsp)
+ void *argsp;
+{
+ REPMGR_RUNNABLE *th;
+ ENV *env;
+ int ret;
+
+ th = argsp;
+ env = th->env;
+
+ if ((ret = message_loop(env, th)) != 0) {
+ __db_err(env, ret, "message thread failed");
+ (void)__repmgr_thread_failure(env, ret);
+ }
+ return (NULL);
+}
+
+static int
+message_loop(env, th)
+ ENV *env;
+ REPMGR_RUNNABLE *th;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ REPMGR_MESSAGE *msg;
+ REPMGR_CONNECTION *conn;
+ REPMGR_SITE *site;
+ __repmgr_permlsn_args permlsn;
+ int incremented, ret, t_ret;
+ u_int32_t membership;
+
+ COMPQUIET(membership, 0);
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ LOCK_MUTEX(db_rep->mutex);
+ while ((ret = __repmgr_queue_get(env, &msg, th)) == 0) {
+ incremented = FALSE;
+ if (IS_DEFERRABLE(msg->msg_hdr.type)) {
+ /*
+ * Count threads currently processing channel requests
+ * or GMDB operations, so that we can limit the number
+ * of them, in order to avoid starving more important
+ * rep messages.
+ */
+ db_rep->non_rep_th++;
+ incremented = TRUE;
+ }
+ if (msg->msg_hdr.type == REPMGR_REP_MESSAGE) {
+ DB_ASSERT(env,
+ IS_VALID_EID(msg->v.repmsg.originating_eid));
+ site = SITE_FROM_EID(msg->v.repmsg.originating_eid);
+ membership = site->membership;
+ }
+ UNLOCK_MUTEX(db_rep->mutex);
+
+ switch (msg->msg_hdr.type) {
+ case REPMGR_REP_MESSAGE:
+ if (membership != SITE_PRESENT)
+ break;
+ while ((ret = process_message(env,
+ &msg->v.repmsg.control, &msg->v.repmsg.rec,
+ msg->v.repmsg.originating_eid)) == DB_LOCK_DEADLOCK)
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "repmgr deadlock retry"));
+ break;
+ case REPMGR_APP_MESSAGE:
+ ret = dispatch_app_message(env, msg);
+ conn = msg->v.appmsg.conn;
+ if (conn != NULL) {
+ LOCK_MUTEX(db_rep->mutex);
+ t_ret = __repmgr_decr_conn_ref(env, conn);
+ UNLOCK_MUTEX(db_rep->mutex);
+ if (t_ret != 0 && ret == 0)
+ ret = t_ret;
+ }
+ break;
+ case REPMGR_OWN_MSG:
+ ret = serve_repmgr_request(env, msg);
+ break;
+ case REPMGR_HEARTBEAT:
+ if ((ret = __repmgr_permlsn_unmarshal(env,
+ &permlsn, msg->v.repmsg.control.data,
+ msg->v.repmsg.control.size, NULL)) != 0)
+ ret = DB_REP_UNAVAIL;
+ else if (rep->master_id == db_rep->self_eid) {
+ /*
+ * If a master receives a heartbeat, there
+ * may be a dupmaster. Resend latest log
+ * message to prompt base replication to
+ * detect it without the need for application
+ * activity.
+ */
+ ret = __rep_flush(env->dbenv);
+ } else {
+ /*
+ * Use heartbeat message to initiate rerequest
+ * processing.
+ */
+ ret = __rep_check_missing(env,
+ permlsn.generation, &permlsn.lsn);
+ }
+ break;
+ default:
+ ret = __db_unknown_path(env, "message loop");
+ break;
+ }
+
+ __os_free(env, msg);
+ LOCK_MUTEX(db_rep->mutex);
+ if (incremented)
+ db_rep->non_rep_th--;
+ if (ret != 0)
+ goto out;
+ }
+ /*
+ * A return of DB_REP_UNAVAIL from __repmgr_queue_get() merely means we
+ * should finish gracefully.
+ */
+ if (ret == DB_REP_UNAVAIL)
+ ret = 0;
+out:
+ UNLOCK_MUTEX(db_rep->mutex);
+ return (ret);
+}
+
+static int
+dispatch_app_message(env, msg)
+ ENV *env;
+ REPMGR_MESSAGE *msg;
+{
+ DB_REP *db_rep;
+ DB_CHANNEL db_channel;
+ CHANNEL channel;
+ __repmgr_msg_metadata_args meta;
+ DBT *dbt, *segment;
+ u_int32_t flags, i, size, *uiptr;
+ u_int8_t *data;
+ void *ptr;
+ int ret;
+
+ COMPQUIET(size, 0);
+
+ db_rep = env->rep_handle;
+
+ db_channel.channel = &channel;
+ db_channel.send_msg = __repmgr_send_response;
+
+ /* Supply stub functions for methods inapplicable in a msg dispatch func. */
+ db_channel.close = __repmgr_channel_close_inval;
+ db_channel.send_request = __repmgr_send_request_inval;
+ db_channel.set_timeout = __repmgr_channel_timeout_inval;
+
+ channel.msg = msg;
+ channel.env = env;
+ channel.c.conn = msg->v.appmsg.conn;
+ channel.responded = FALSE;
+ channel.meta = &meta;
+
+ /*
+ * The user data is in a form similar to that of a bulk buffer.
+ * However, there's also our meta-data tacked on to the end of it.
+ * Fortunately, the meta-data is fixed length, so it's easy to peel it
+ * off.
+ *
+ * The user data "bulk buffer" lacks the usual "-1" end-marker. But
+ * that's OK, because we already know how many segments there are (from
+ * the message header). Convert this information into the DBT array
+ * that we will pass to the user's function.
+ *
+ * (See the definition of DB_MULTIPLE_INIT for a reminder of the format
+ * of a bulk buffer.)
+ */
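+ /*
+ * A sketch (illustrative only) of the received buffer for a
+ * two-segment message:
+ *
+ * [seg0][seg1]...[len1][off1][len0][off0][meta-data]
+ *
+ * Peeling the fixed-size meta-data off the end leaves the
+ * (offset, length) word pairs to be read back-to-front, one
+ * pair per segment.
+ */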
+ dbt = &msg->v.appmsg.buf;
+ data = dbt->data;
+ dbt->size -= __REPMGR_MSG_METADATA_SIZE;
+ ret = __repmgr_msg_metadata_unmarshal(env,
+ &meta, &data[dbt->size], __REPMGR_MSG_METADATA_SIZE, NULL);
+ DB_ASSERT(env, ret == 0);
+
+ dbt->ulen = dbt->size;
+ DB_MULTIPLE_INIT(ptr, dbt);
+ for (i = 0; i < APP_MSG_SEGMENT_COUNT(msg->msg_hdr); i++) {
+ segment = &msg->v.appmsg.segments[i];
+ uiptr = ptr;
+ *uiptr = ntohl(*uiptr);
+ uiptr[-1] = ntohl(uiptr[-1]);
+ DB_MULTIPLE_NEXT(ptr, dbt, data, size);
+ DB_ASSERT(env, data != NULL);
+ DB_INIT_DBT(*segment, data, size);
+ }
+
+ flags = F_ISSET(&meta, REPMGR_REQUEST_MSG_TYPE) ?
+ DB_REPMGR_NEED_RESPONSE : 0;
+
+ if (db_rep->msg_dispatch == NULL) {
+ __db_errx(env, DB_STR("3670",
+ "No message dispatch call-back function has been configured"));
+ if (F_ISSET(channel.meta, REPMGR_REQUEST_MSG_TYPE))
+ return (__repmgr_send_err_resp(env,
+ &channel, DB_NOSERVER));
+ else
+ return (0);
+ }
+
+ (*db_rep->msg_dispatch)(env->dbenv,
+ &db_channel, &msg->v.appmsg.segments[0],
+ APP_MSG_SEGMENT_COUNT(msg->msg_hdr), flags);
+
+ if (F_ISSET(channel.meta, REPMGR_REQUEST_MSG_TYPE) &&
+ !channel.responded) {
+ __db_errx(env, DB_STR("3671",
+ "Application failed to provide a response"));
+ return (__repmgr_send_err_resp(env, &channel, DB_KEYEMPTY));
+ }
+
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_send_err_resp __P((ENV *, CHANNEL *, int));
+ */
+int
+__repmgr_send_err_resp(env, channel, err)
+ ENV *env;
+ CHANNEL *channel;
+ int err;
+{
+ DB_REP *db_rep;
+ REPMGR_CONNECTION *conn;
+ REPMGR_IOVECS iovecs;
+ __repmgr_msg_hdr_args msg_hdr;
+ u_int8_t msg_hdr_buf[__REPMGR_MSG_HDR_SIZE];
+ int ret;
+
+ db_rep = env->rep_handle;
+ msg_hdr.type = REPMGR_RESP_ERROR;
+
+ /* Make it non-negative, so we can send it on the wire without worry. */
+ DB_ASSERT(env, err < 0);
+ RESP_ERROR_CODE(msg_hdr) = (u_int32_t)(-err);
+
+ RESP_ERROR_TAG(msg_hdr) = channel->meta->tag;
+
+ __repmgr_iovec_init(&iovecs);
+ __repmgr_msg_hdr_marshal(env, &msg_hdr, msg_hdr_buf);
+ __repmgr_add_buffer(&iovecs, msg_hdr_buf, __REPMGR_MSG_HDR_SIZE);
+
+ conn = channel->c.conn;
+ LOCK_MUTEX(db_rep->mutex);
+ ret = __repmgr_send_many(env, conn, &iovecs, 0);
+ UNLOCK_MUTEX(db_rep->mutex);
+
+ return (ret);
+}
+
+static int
+process_message(env, control, rec, eid)
+ ENV *env;
+ DBT *control, *rec;
+ int eid;
+{
+ DB_LSN lsn;
+ DB_REP *db_rep;
+ REP *rep;
+ int dirty, ret, t_ret;
+ u_int32_t generation;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ /*
+ * Save initial generation number, in case it changes in a close race
+ * with a NEWMASTER.
+ */
+ generation = rep->gen;
+
+ ret = 0;
+ switch (t_ret =
+ __rep_process_message_int(env, control, rec, eid, &lsn)) {
+ case 0:
+ if (db_rep->takeover_pending)
+ ret = __repmgr_claim_victory(env);
+ break;
+
+ case DB_REP_HOLDELECTION:
+ LOCK_MUTEX(db_rep->mutex);
+ ret = __repmgr_init_election(env,
+ ELECT_F_IMMED | ELECT_F_INVITEE);
+ UNLOCK_MUTEX(db_rep->mutex);
+ break;
+
+ case DB_REP_DUPMASTER:
+ /*
+ * Initiate an election if we're configured to be using
+ * elections, but only if we're *NOT* using leases. When using
+ * leases, there is never any uncertainty over which site is the
+ * rightful master, and only the loser gets the DUPMASTER return
+ * code.
+ */
+ if ((ret = __repmgr_become_client(env)) == 0 &&
+ FLD_ISSET(rep->config, REP_C_LEASE | REP_C_ELECTIONS)
+ == REP_C_ELECTIONS) {
+ LOCK_MUTEX(db_rep->mutex);
+ ret = __repmgr_init_election(env, ELECT_F_IMMED);
+ UNLOCK_MUTEX(db_rep->mutex);
+ }
+ DB_EVENT(env, DB_EVENT_REP_DUPMASTER, NULL);
+ break;
+
+ case DB_REP_ISPERM:
+#ifdef CONFIG_TEST
+ if (env->test_abort == DB_TEST_REPMGR_PERM)
+ VPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "ISPERM: Test hook. Skip ACK for permlsn [%lu][%lu]",
+ (u_long)lsn.file, (u_long)lsn.offset));
+#endif
+ DB_TEST_SET(env->test_abort, DB_TEST_REPMGR_PERM);
+ ret = send_permlsn(env, generation, &lsn);
+DB_TEST_RECOVERY_LABEL
+ break;
+
+ case DB_LOCK_DEADLOCK:
+ case DB_REP_IGNORE:
+ case DB_REP_NEWSITE:
+ case DB_REP_NOTPERM:
+ break;
+
+ case DB_REP_JOIN_FAILURE:
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "repmgr fires join failure event"));
+ DB_EVENT(env, DB_EVENT_REP_JOIN_FAILURE, NULL);
+ break;
+
+ case DB_REP_WOULDROLLBACK:
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "repmgr fires would-rollback event"));
+ DB_EVENT(env, DB_EVENT_REP_WOULD_ROLLBACK, &lsn);
+ break;
+
+ default:
+ __db_err(env, t_ret, "DB_ENV->rep_process_message");
+ ret = t_ret;
+ }
+
+ if (ret != 0)
+ goto err;
+ LOCK_MUTEX(db_rep->mutex);
+ dirty = db_rep->gmdb_dirty;
+ db_rep->gmdb_dirty = FALSE;
+ UNLOCK_MUTEX(db_rep->mutex);
+ if (dirty) {
+ if ((ret = __op_rep_enter(env, FALSE, FALSE)) != 0)
+ goto err;
+ ret = __repmgr_reload_gmdb(env);
+ t_ret = __op_rep_exit(env);
+ if (ret == ENOENT)
+ ret = 0;
+ else if (ret == DB_DELETED)
+ ret = __repmgr_bow_out(env);
+ if (t_ret != 0 && ret == 0)
+ ret = t_ret;
+ }
+err:
+ return (ret);
+}
+
+/*
+ * Handle replication-related events. Returns only 0 or DB_EVENT_NOT_HANDLED;
+ * no other error returns are tolerated.
+ *
+ * PUBLIC: int __repmgr_handle_event __P((ENV *, u_int32_t, void *));
+ */
+int
+__repmgr_handle_event(env, event, info)
+ ENV *env;
+ u_int32_t event;
+ void *info;
+{
+ DB_REP *db_rep;
+
+ db_rep = env->rep_handle;
+
+ if (db_rep->selector == NULL) {
+ /* Repmgr is not in use, so all events go to application. */
+ return (DB_EVENT_NOT_HANDLED);
+ }
+
+ switch (event) {
+ case DB_EVENT_REP_ELECTED:
+ DB_ASSERT(env, info == NULL);
+ db_rep->takeover_pending = TRUE;
+
+ /*
+ * The application doesn't really need to see this, because the
+ * purpose of this event is to tell the winning site that it
+ * should call rep_start(MASTER), and in repmgr we do that
+ * automatically. Still, they could conceivably be curious, and
+ * it doesn't hurt anything to let them know.
+ */
+ break;
+ case DB_EVENT_REP_INIT_DONE:
+ db_rep->gmdb_dirty = TRUE;
+ break;
+ case DB_EVENT_REP_NEWMASTER:
+ DB_ASSERT(env, info != NULL);
+
+ /* Application still needs to see this. */
+ break;
+ default:
+ break;
+ }
+ return (DB_EVENT_NOT_HANDLED);
+}
+
+static int
+send_permlsn(env, generation, lsn)
+ ENV *env;
+ u_int32_t generation;
+ DB_LSN *lsn;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ REPMGR_CONNECTION *conn;
+ REPMGR_SITE *site;
+ int ack, bcast, eid, master, policy, ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ ret = 0;
+ master = rep->master_id;
+ LOCK_MUTEX(db_rep->mutex);
+
+ /*
+ * If the file number has changed, send it to everyone, regardless of
+ * anything else. Otherwise, send it to the master if we know a master,
+ * and that master's ack policy requires it.
+ */
+ bcast = FALSE;
+ if (LOG_COMPARE(lsn, &db_rep->perm_lsn) > 0) {
+ if (lsn->file > db_rep->perm_lsn.file) {
+ bcast = TRUE;
+ VPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "send_permlsn: broadcast [%lu][%lu]",
+ (u_long)lsn->file, (u_long)lsn->offset));
+ }
+ db_rep->perm_lsn = *lsn;
+ }
+ if (IS_KNOWN_REMOTE_SITE(master)) {
+ site = SITE_FROM_EID(master);
+ /*
+ * Use master's ack policy if we know it; use our own if the
+ * master is too old (down-rev) to have told us its policy.
+ */
+ policy = site->ack_policy > 0 ?
+ site->ack_policy : rep->perm_policy;
+ if (policy == DB_REPMGR_ACKS_NONE ||
+ (IS_PEER_POLICY(policy) && rep->priority == 0))
+ ack = FALSE;
+ else
+ ack = TRUE;
+ } else {
+ site = NULL;
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "dropping ack with no known master"));
+ ack = FALSE;
+ }
+
+ /*
+ * Send to master first, since we need to send to all its connections.
+ */
+ if (site != NULL && (bcast || ack)) {
+ if (site->state == SITE_CONNECTED) {
+ if ((conn = site->ref.conn.in) != NULL &&
+ conn->state == CONN_READY &&
+ (ret = send_permlsn_conn(env,
+ conn, generation, lsn)) != 0)
+ goto unlock;
+ if ((conn = site->ref.conn.out) != NULL &&
+ conn->state == CONN_READY &&
+ (ret = send_permlsn_conn(env,
+ conn, generation, lsn)) != 0)
+ goto unlock;
+ }
+ TAILQ_FOREACH(conn, &site->sub_conns, entries) {
+ if ((ret = send_permlsn_conn(env,
+ conn, generation, lsn)) != 0)
+ goto unlock;
+ }
+ }
+ if (bcast) {
+ /*
+ * Send to everyone except the master (since we've already done
+ * that, above).
+ */
+ FOR_EACH_REMOTE_SITE_INDEX(eid) {
+ if (eid == master)
+ continue;
+ site = SITE_FROM_EID(eid);
+ /*
+ * Send the ack out on primary connections only.
+ */
+ if (site->state == SITE_CONNECTED) {
+ if ((conn = site->ref.conn.in) != NULL &&
+ conn->state == CONN_READY &&
+ (ret = send_permlsn_conn(env,
+ conn, generation, lsn)) != 0)
+ goto unlock;
+ if ((conn = site->ref.conn.out) != NULL &&
+ conn->state == CONN_READY &&
+ (ret = send_permlsn_conn(env,
+ conn, generation, lsn)) != 0)
+ goto unlock;
+ }
+ }
+ }
+
+unlock:
+ UNLOCK_MUTEX(db_rep->mutex);
+ return (ret);
+}
+
+/*
+ * Sends a perm LSN message on one connection, if it needs it.
+ *
+ * !!! Called with mutex held.
+ */
+static int
+send_permlsn_conn(env, conn, generation, lsn)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ u_int32_t generation;
+ DB_LSN *lsn;
+{
+ DBT control2, rec2;
+ __repmgr_permlsn_args permlsn;
+ u_int8_t buf[__REPMGR_PERMLSN_SIZE];
+ int ret;
+
+ ret = 0;
+
+ if (conn->state == CONN_READY) {
+ DB_ASSERT(env, conn->version > 0);
+ permlsn.generation = generation;
+ memcpy(&permlsn.lsn, lsn, sizeof(DB_LSN));
+ if (conn->version == 1) {
+ control2.data = &permlsn;
+ control2.size = sizeof(permlsn);
+ } else {
+ __repmgr_permlsn_marshal(env, &permlsn, buf);
+ control2.data = buf;
+ control2.size = __REPMGR_PERMLSN_SIZE;
+ }
+ rec2.size = 0;
+ /*
+ * It's hard to imagine anyone would care about a lost ack if
+ * the path to the master is so congested as to need blocking,
+ * so pass the "maxblock" argument as 0.
+ */
+ if ((ret = __repmgr_send_one(env, conn, REPMGR_PERMLSN,
+ &control2, &rec2, 0)) == DB_REP_UNAVAIL)
+ ret = __repmgr_bust_connection(env, conn);
+ }
+ return (ret);
+}
+
+static int
+serve_repmgr_request(env, msg)
+ ENV *env;
+ REPMGR_MESSAGE *msg;
+{
+ DB_THREAD_INFO *ip;
+ DBT *dbt;
+ REPMGR_CONNECTION *conn;
+ int ret, t_ret;
+
+ ENV_ENTER(env, ip);
+ switch (REPMGR_OWN_MSG_TYPE(msg->msg_hdr)) {
+ case REPMGR_JOIN_REQUEST:
+ ret = serve_join_request(env, ip, msg);
+ break;
+ case REPMGR_REJOIN:
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "One try at rejoining group automatically"));
+ if ((ret = __repmgr_join_group(env)) == DB_REP_UNAVAIL)
+ ret = __repmgr_bow_out(env);
+ break;
+ case REPMGR_REMOVE_REQUEST:
+ ret = serve_remove_request(env, ip, msg);
+ break;
+ case REPMGR_RESOLVE_LIMBO:
+ ret = resolve_limbo_wrapper(env, ip);
+ break;
+ case REPMGR_SHARING:
+ dbt = &msg->v.gmdb_msg.request;
+ ret = __repmgr_refresh_membership(env, dbt->data, dbt->size);
+ break;
+ default:
+ ret = __db_unknown_path(env, "serve_repmgr_request");
+ break;
+ }
+ if ((conn = msg->v.gmdb_msg.conn) != NULL) {
+ if ((t_ret = __repmgr_close_connection(env, conn)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ if ((t_ret = __repmgr_decr_conn_ref(env, conn)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ }
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * Attempts to fulfill a remote site's request to join the replication group.
+ * Only the master can grant this request, so if we receive it while we're not
+ * the master, we send an appropriate failure message instead.
+ */
+static int
+serve_join_request(env, ip, msg)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ REPMGR_MESSAGE *msg;
+{
+ DB_REP *db_rep;
+ REPMGR_CONNECTION *conn;
+ DBT *dbt;
+ __repmgr_site_info_args site_info;
+ u_int8_t *buf;
+ char *host;
+ size_t len;
+ u_int32_t status;
+ int eid, ret, t_ret;
+
+ db_rep = env->rep_handle;
+ COMPQUIET(status, 0);
+
+ conn = msg->v.gmdb_msg.conn;
+ dbt = &msg->v.gmdb_msg.request;
+ ret = __repmgr_site_info_unmarshal(env,
+ &site_info, dbt->data, dbt->size, NULL);
+
+ host = site_info.host.data;
+ host[site_info.host.size - 1] = '\0';
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "Request to join group from %s:%u", host, (u_int)site_info.port));
+
+ if ((ret = __repmgr_hold_master_role(env, conn)) == DB_REP_UNAVAIL)
+ return (0);
+ if (ret != 0)
+ return (ret);
+
+ LOCK_MUTEX(db_rep->mutex);
+ if ((ret = __repmgr_find_site(env, host, site_info.port, &eid)) == 0) {
+ DB_ASSERT(env, eid != db_rep->self_eid);
+ status = SITE_FROM_EID(eid)->membership;
+ }
+ UNLOCK_MUTEX(db_rep->mutex);
+ if (ret != 0)
+ goto err;
+
+ switch (status) {
+ case 0:
+ case SITE_ADDING:
+ ret = __repmgr_update_membership(env, ip, eid, SITE_ADDING);
+ break;
+ case SITE_PRESENT:
+ /* Already in desired state. */
+ break;
+ case SITE_DELETING:
+ ret = rescind_pending(env,
+ ip, eid, SITE_DELETING, SITE_PRESENT);
+ break;
+ default:
+ ret = __db_unknown_path(env, "serve_join_request");
+ break;
+ }
+ if (ret != 0)
+ goto err;
+
+ LOCK_MUTEX(db_rep->mutex);
+ ret = __repmgr_marshal_member_list(env, &buf, &len);
+ UNLOCK_MUTEX(db_rep->mutex);
+ if (ret != 0)
+ goto err;
+ ret = __repmgr_send_sync_msg(env, conn, REPMGR_JOIN_SUCCESS,
+ buf, (u_int32_t)len);
+ __os_free(env, buf);
+
+err:
+
+ if ((t_ret = __repmgr_rlse_master_role(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (ret == DB_REP_UNAVAIL)
+ ret = __repmgr_send_sync_msg(env, conn,
+ REPMGR_GM_FAILURE, NULL, 0);
+
+ return (ret);
+}
+
+static int
+serve_remove_request(env, ip, msg)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ REPMGR_MESSAGE *msg;
+{
+ DB_REP *db_rep;
+ REPMGR_CONNECTION *conn;
+ REPMGR_SITE *site;
+ DBT *dbt;
+ __repmgr_site_info_args site_info;
+ char *host;
+ u_int32_t status, type;
+ int eid, ret, t_ret;
+
+ COMPQUIET(status, 0);
+ db_rep = env->rep_handle;
+
+ conn = msg->v.gmdb_msg.conn;
+ dbt = &msg->v.gmdb_msg.request;
+ ret = __repmgr_site_info_unmarshal(env,
+ &site_info, dbt->data, dbt->size, NULL);
+
+ host = site_info.host.data;
+ host[site_info.host.size - 1] = '\0';
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "Request to remove %s:%u from group", host, (u_int)site_info.port));
+
+ if ((ret = __repmgr_hold_master_role(env, conn)) == DB_REP_UNAVAIL)
+ return (0);
+ if (ret != 0)
+ return (ret);
+
+ LOCK_MUTEX(db_rep->mutex);
+ if ((site = __repmgr_lookup_site(env, host, site_info.port)) == NULL)
+ eid = DB_EID_INVALID;
+ else {
+ eid = EID_FROM_SITE(site);
+ status = site->membership;
+ }
+ UNLOCK_MUTEX(db_rep->mutex);
+ if (eid == DB_EID_INVALID) {
+ /* Doesn't exist: already been removed. */
+ ret = 0;
+ goto err;
+ } else if (eid == db_rep->self_eid) {
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "Reject request to remove current master"));
+ ret = DB_REP_UNAVAIL;
+ goto err;
+ }
+
+ switch (status) {
+ case 0:
+ /* Already in desired state. */
+ break;
+ case SITE_ADDING:
+ ret = rescind_pending(env, ip, eid, SITE_ADDING, 0);
+ break;
+ case SITE_PRESENT:
+ case SITE_DELETING:
+ ret = __repmgr_update_membership(env, ip, eid, SITE_DELETING);
+ break;
+ default:
+ ret = __db_unknown_path(env, "serve_remove_request");
+ break;
+ }
+err:
+ if ((t_ret = __repmgr_rlse_master_role(env)) != 0 && ret == 0)
+ ret = t_ret;
+ switch (ret) {
+ case 0:
+ type = REPMGR_REMOVE_SUCCESS;
+ break;
+ case DB_REP_UNAVAIL:
+ type = REPMGR_GM_FAILURE;
+ break;
+ default:
+ return (ret);
+ }
+ return (__repmgr_send_sync_msg(env, conn, type, NULL, 0));
+}
+
+/*
+ * Runs a limbo resolution on a message processing thread, upon request from the
+ * send() function when it notices that a user transaction has gotten a perm
+ * success. (It wouldn't work for the user thread to do it in-line.)
+ */
+static int
+resolve_limbo_wrapper(env, ip)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+{
+ int do_close, ret, t_ret;
+
+ if ((ret = __repmgr_hold_master_role(env, NULL)) == DB_REP_UNAVAIL)
+ return (0);
+ if (ret != 0)
+ return (ret);
+retry:
+ if ((ret = __repmgr_setup_gmdb_op(env, ip, NULL, 0)) != 0)
+ goto rlse;
+
+ /*
+ * A limbo resolution request is merely a "best effort" attempt to
+ * shorten the duration of a pending change. So if it fails for lack of
+ * acks again, no one really cares.
+ */
+ if ((ret = resolve_limbo_int(env, ip)) == DB_REP_UNAVAIL) {
+ do_close = FALSE;
+ ret = 0;
+ } else
+ do_close = TRUE;
+
+ if ((t_ret = __repmgr_cleanup_gmdb_op(env, do_close)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ if (ret == DB_LOCK_DEADLOCK || ret == DB_LOCK_NOTGRANTED)
+ goto retry;
+rlse:
+ if ((t_ret = __repmgr_rlse_master_role(env)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * Checks for the need to resolve limbo (failure of a previous GMDB update to
+ * get enough acks), and does so if necessary. No-op if none is needed.
+ *
+ * Must be called within setup_gmdb_op/cleanup_gmdb_op context.
+ */
+static int
+resolve_limbo_int(env, ip)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+{
+ DB_REP *db_rep;
+ DB_TXN *txn;
+ REPMGR_SITE *site;
+ DB_LSN orig_lsn;
+ DBT key_dbt, data_dbt;
+ __repmgr_member_args logrec;
+ repmgr_netaddr_t addr;
+ u_int32_t orig_status, status;
+ int eid, locked, ret, t_ret;
+ u_int8_t data_buf[__REPMGR_MEMBERSHIP_DATA_SIZE];
+ u_int8_t key_buf[MAX_MSG_BUF];
+
+ db_rep = env->rep_handle;
+ ret = 0;
+
+ LOCK_MUTEX(db_rep->mutex);
+ locked = TRUE;
+
+ /*
+ * Is there a previous GMDB update failure currently pending? If not,
+ * there's nothing for us to do.
+ */
+ eid = db_rep->limbo_victim;
+ if (!IS_VALID_EID(eid))
+ goto out;
+ site = SITE_FROM_EID(eid);
+ addr = site->net_addr;
+ marshal_site_key(env, &addr, key_buf, &key_dbt, &logrec);
+ orig_status = site->membership;
+ if (orig_status == SITE_PRESENT || orig_status == 0)
+ goto out;
+
+ if (IS_ZERO_LSN(db_rep->limbo_failure))
+ goto out;
+
+ /*
+ * There are potentially two parts: the self-update of the existing
+ * limbo record, and then the finishing-off if the first is successful.
+ * We might only have to do the finishing-off, if some unrelated
+ * txn triggered a limbo resolution request on a msg processing thread.
+ */
+ if (LOG_COMPARE(&db_rep->durable_lsn, &db_rep->limbo_failure) > 0) {
+ /*
+ * Nice! Limbo has been resolved by an arbitrary other txn
+ * succeeding subsequently. So we don't have to do the
+ * "self-update" part.
+ */
+ } else {
+ /*
+ * Do a self-update, to try to trigger a "durable". Since
+ * nothing in the database is changing, we need neither an ASL
+ * hint nor a bump in the version sequence.
+ */
+ orig_lsn = db_rep->limbo_failure;
+ db_rep->active_gmdb_update = gmdb_primary;
+ UNLOCK_MUTEX(db_rep->mutex);
+ locked = FALSE;
+
+ if ((ret = __txn_begin(env,
+ ip, NULL, &txn, DB_IGNORE_LEASE)) != 0)
+ goto out;
+
+ marshal_site_data(env, orig_status, data_buf, &data_dbt);
+
+ ret = __db_put(db_rep->gmdb, ip, txn, &key_dbt, &data_dbt, 0);
+ if ((t_ret = __db_txn_auto_resolve(env, txn, 0, ret)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto out;
+
+ /*
+ * Check to see whether we got another PERM failure. This is
+ * quite possible in the case where a GMDB request is being
+ * retried by a requestor, but unlikely if we had a resolution
+ * via an "arbitrary" txn.
+ */
+ LOCK_MUTEX(db_rep->mutex);
+ locked = TRUE;
+ if (LOG_COMPARE(&db_rep->limbo_failure, &orig_lsn) > 0) {
+ db_rep->limbo_resolution_needed = TRUE;
+ ret = DB_REP_UNAVAIL;
+ goto out;
+ }
+ }
+ DB_ASSERT(env, locked);
+
+ /*
+ * Here, either we didn't need to do the self-update, or we did it and
+ * it succeeded. So now we're ready to do the second phase update.
+ */
+ db_rep->limbo_victim = DB_EID_INVALID;
+ UNLOCK_MUTEX(db_rep->mutex);
+ locked = FALSE;
+ status = NEXT_STATUS(orig_status);
+ if ((ret = finish_gmdb_update(env,
+ ip, &key_dbt, orig_status, status, &logrec)) != 0)
+ goto out;
+
+ /* Track modified membership status in our in-memory sites array. */
+ LOCK_MUTEX(db_rep->mutex);
+ locked = TRUE;
+ if ((ret = __repmgr_set_membership(env,
+ addr.host, addr.port, status)) != 0)
+ goto out;
+ __repmgr_set_sites(env);
+
+out:
+ if (locked)
+ UNLOCK_MUTEX(db_rep->mutex);
+ return (ret);
+}
+
+/*
+ * Update a specific record in the Group Membership database. The record to be
+ * updated is implied by "eid"; "pstatus" is the provisional status (ADDING or
+ * DELETING) to be used in the first phase of the update. The ultimate goal
+ * status is inferred (ADDING -> PRESENT, or DELETING -> 0).
+ *
+ * PUBLIC: int __repmgr_update_membership __P((ENV *,
+ * PUBLIC: DB_THREAD_INFO *, int, u_int32_t));
+ */
+int
+__repmgr_update_membership(env, ip, eid, pstatus)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ int eid;
+ u_int32_t pstatus; /* Provisional status. */
+{
+ DB_REP *db_rep;
+ REPMGR_SITE *site;
+ DB_TXN *txn;
+ DB_LSN lsn, orig_lsn;
+ DBT key_dbt, data_dbt;
+ __repmgr_member_args logrec;
+ repmgr_netaddr_t addr;
+ u_int32_t orig_status, ult_status;
+ int do_close, locked, ret, t_ret;
+ u_int8_t key_buf[MAX_MSG_BUF];
+ u_int8_t status_buf[__REPMGR_MEMBERSHIP_DATA_SIZE];
+
+ DB_ASSERT(env, pstatus == SITE_ADDING || pstatus == SITE_DELETING);
+
+ db_rep = env->rep_handle;
+ COMPQUIET(orig_status, 0);
+ COMPQUIET(addr.host, NULL);
+ COMPQUIET(addr.port, 0);
+
+retry:
+ txn = NULL;
+ locked = FALSE;
+ DB_ASSERT(env, db_rep->gmdb_busy);
+ if ((ret = __repmgr_setup_gmdb_op(env, ip, NULL, 0)) != 0)
+ return (ret);
+
+ /*
+ * Usually we'll keep the GMDB closed, to conserve resources, since
+ * changes should be rare. However, if a PERM_FAIL puts us in limbo, we
+ * expect to clean that up as soon as we can; so leave it open for now
+ * in that case.
+ */
+ do_close = TRUE;
+
+ /*
+ * Before attempting any fresh updates, resolve any lingering incomplete
+ * updates from the past (i.e., those that resulted in PERM_FAIL). If
+ * we can't, then we mustn't proceed with any more updates. Getting an
+ * additional perm failure would increase the dissonance between the
+ * effective group size and the number of sites from which we can safely
+ * accept acks. Besides, if we can't clear the previous failure,
+ * there's practically no hope that a new update would fare any better.
+ */
+ if ((ret = resolve_limbo_int(env, ip)) != 0) {
+ if (ret == DB_REP_UNAVAIL)
+ do_close = FALSE;
+ goto err;
+ }
+
+ /*
+ * If there was a successful limbo resolution, it could have either been
+ * for some unrelated change, or it could have been the same change our
+ * caller is now (re-)trying to perform. In the latter case, we have
+ * nothing more to do -- resolve_limbo() has done it all for us! To
+ * find out, compare the site's current status with the ultimate goal
+ * status associated with the provisional status that was passed to us
+ * as input.
+ */
+ LOCK_MUTEX(db_rep->mutex);
+ locked = TRUE;
+ DB_ASSERT(env, IS_KNOWN_REMOTE_SITE(eid));
+ site = SITE_FROM_EID(eid);
+ if ((orig_status = site->membership) == NEXT_STATUS(pstatus))
+ goto err;
+ addr = site->net_addr;
+
+ /*
+ * Anticipate modified membership status in our in-memory sites array.
+ * This forces us into an awkward rescission, below, if our transaction
+ * suffers a hard failure and must be aborted. But it's necessary
+ * because of the requirement that, on additions, the quorum computation
+ * must be based on the incremented nsites value. An alternative might
+ * possibly be to increment nsites separately from adding the new site
+ * to the array, or even having a special epicycle at the point where
+ * send() counts acks (we'd have to make active_gmdb_update richer), but
+ * those seem even more confusing.
+ */
+ if ((ret = __repmgr_set_membership(env,
+ addr.host, addr.port, pstatus)) != 0)
+ goto err;
+ __repmgr_set_sites(env);
+
+ /*
+ * Hint to our send() function that we want to know the result of ack
+ * counting.
+ */
+ orig_lsn = db_rep->limbo_failure;
+ db_rep->active_gmdb_update = gmdb_primary;
+ UNLOCK_MUTEX(db_rep->mutex);
+ locked = FALSE;
+
+ if ((ret = __txn_begin(env, ip, NULL, &txn, DB_IGNORE_LEASE)) != 0)
+ goto err;
+ marshal_site_key(env, &addr, key_buf, &key_dbt, &logrec);
+ marshal_site_data(env, pstatus, status_buf, &data_dbt);
+ if ((ret = __db_put(db_rep->gmdb,
+ ip, txn, &key_dbt, &data_dbt, 0)) != 0)
+ goto err;
+ if ((ret = incr_gm_version(env, ip, txn)) != 0)
+ goto err;
+
+ /*
+ * Add some information to the log for this txn. This is an annotation,
+ * for the sole purpose of enabling the client to notice whenever a
+ * change has occurred in this database. It has nothing to do with
+ * local recovery.
+ */
+ ZERO_LSN(lsn);
+ if ((ret = __repmgr_member_log(env,
+ txn, &lsn, 0, db_rep->membership_version,
+ orig_status, pstatus, &logrec.host, logrec.port)) != 0)
+ goto err;
+ ret = __txn_commit(txn, 0);
+ txn = NULL;
+ if (ret != 0)
+ goto err;
+
+ LOCK_MUTEX(db_rep->mutex);
+ locked = TRUE;
+
+ if (LOG_COMPARE(&db_rep->limbo_failure, &orig_lsn) > 0) {
+ /*
+ * Failure LSN advanced, meaning this update wasn't acked by
+ * enough clients.
+ */
+ db_rep->limbo_resolution_needed = TRUE;
+ db_rep->limbo_victim = eid;
+ ret = DB_REP_UNAVAIL;
+ do_close = FALSE;
+ goto err;
+ }
+
+ /* Now we'll complete the status change. */
+ ult_status = NEXT_STATUS(pstatus);
+ UNLOCK_MUTEX(db_rep->mutex);
+ locked = FALSE;
+
+ if ((ret = finish_gmdb_update(env, ip,
+ &key_dbt, pstatus, ult_status, &logrec)) != 0)
+ goto err;
+
+ /* Track modified membership status in our in-memory sites array. */
+ LOCK_MUTEX(db_rep->mutex);
+ locked = TRUE;
+ ret = __repmgr_set_membership(env, addr.host, addr.port, ult_status);
+ __repmgr_set_sites(env);
+
+err:
+ if (locked)
+ UNLOCK_MUTEX(db_rep->mutex);
+ if (txn != NULL) {
+ DB_ASSERT(env, ret != 0);
+ (void)__txn_abort(txn);
+ /*
+ * We've just aborted the txn which moved the site info from
+ * orig_status to something else, so restore that value now so
+ * that we keep in sync.
+ */
+ (void)__repmgr_set_membership(env,
+ addr.host, addr.port, orig_status);
+ }
+ if ((t_ret = __repmgr_cleanup_gmdb_op(env, do_close)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ if (ret == DB_LOCK_DEADLOCK || ret == DB_LOCK_NOTGRANTED)
+ goto retry;
+ return (ret);
+}
+
+/*
+ * Rescind a partially completed membership DB change, setting the new status to
+ * the value given.
+ */
+static int
+rescind_pending(env, ip, eid, cur_status, new_status)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ int eid;
+ u_int32_t cur_status, new_status;
+{
+ DB_REP *db_rep;
+ REPMGR_SITE *site;
+ DBT key_dbt;
+ __repmgr_member_args logrec;
+ repmgr_netaddr_t addr;
+ u_int8_t key_buf[MAX_MSG_BUF];
+ int ret, t_ret;
+
+ db_rep = env->rep_handle;
+
+retry:
+ if ((ret = __repmgr_setup_gmdb_op(env, ip, NULL, 0)) != 0)
+ return (ret);
+
+ LOCK_MUTEX(db_rep->mutex);
+ DB_ASSERT(env, IS_KNOWN_REMOTE_SITE(eid));
+ site = SITE_FROM_EID(eid);
+ addr = site->net_addr;
+ UNLOCK_MUTEX(db_rep->mutex);
+
+ marshal_site_key(env, &addr, key_buf, &key_dbt, &logrec);
+ if ((ret = finish_gmdb_update(env,
+ ip, &key_dbt, cur_status, new_status, &logrec)) != 0)
+ goto err;
+
+ /* Track modified membership status in our in-memory sites array. */
+ LOCK_MUTEX(db_rep->mutex);
+ ret = __repmgr_set_membership(env, addr.host, addr.port, new_status);
+ __repmgr_set_sites(env);
+ UNLOCK_MUTEX(db_rep->mutex);
+
+err:
+ if ((t_ret = __repmgr_cleanup_gmdb_op(env, TRUE)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ if (ret == DB_LOCK_DEADLOCK || ret == DB_LOCK_NOTGRANTED)
+ goto retry;
+ return (ret);
+}
+
+/*
+ * Caller must have already taken care of serializing this operation
+ * (hold_master_role(), setup_gmdb_op()).
+ */
+static int
+incr_gm_version(env, ip, txn)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+{
+ DB_REP *db_rep;
+ u_int32_t version;
+ int ret;
+
+ db_rep = env->rep_handle;
+ version = db_rep->membership_version + 1;
+ if ((ret = __repmgr_set_gm_version(env, ip, txn, version)) == 0)
+ db_rep->membership_version = version;
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_set_gm_version __P((ENV *,
+ * PUBLIC: DB_THREAD_INFO *, DB_TXN *, u_int32_t));
+ */
+int
+__repmgr_set_gm_version(env, ip, txn, version)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ u_int32_t version;
+{
+ DB_REP *db_rep;
+ DBT key_dbt, data_dbt;
+ __repmgr_membership_key_args key;
+ __repmgr_member_metadata_args metadata;
+ u_int8_t key_buf[__REPMGR_MEMBERSHIP_KEY_SIZE + 1];
+ u_int8_t metadata_buf[__REPMGR_MEMBER_METADATA_SIZE];
+ size_t len;
+ int ret;
+
+ db_rep = env->rep_handle;
+
+ metadata.format = REPMGR_GMDB_FMT_VERSION;
+ metadata.version = version;
+ __repmgr_member_metadata_marshal(env, &metadata, metadata_buf);
+ DB_INIT_DBT(data_dbt, metadata_buf, __REPMGR_MEMBER_METADATA_SIZE);
+
+ DB_INIT_DBT(key.host, NULL, 0);
+ key.port = 0;
+ ret = __repmgr_membership_key_marshal(env,
+ &key, key_buf, sizeof(key_buf), &len);
+ DB_ASSERT(env, ret == 0);
+ DB_INIT_DBT(key_dbt, key_buf, len);
+
+ if ((ret = __db_put(db_rep->gmdb,
+ ip, txn, &key_dbt, &data_dbt, 0)) != 0)
+ return (ret);
+ return (0);
+}
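+
+/*
+ * A sketch of the resulting GMDB layout (using names from this file;
+ * the on-disk sort order is an implementation detail):
+ *
+ *    key {NULL host, port 0}  ->  {REPMGR_GMDB_FMT_VERSION, version}
+ *    key {"host", port}       ->  {membership status flags}
+ *
+ * The NULL-host key written above can't collide with any real site
+ * record, since real sites always have a non-empty host name, so the
+ * metadata occupies its own reserved slot in the database.
+ */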
+
+/*
+ * Performs the second phase of a 2-phase membership DB operation: an "adding"
+ * site becomes fully "present" in the group; a "deleting" site is finally
+ * really deleted.
+ */
+static int
+finish_gmdb_update(env, ip, key_dbt, prev_status, status, logrec)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ DBT *key_dbt;
+ u_int32_t prev_status, status;
+ __repmgr_member_args *logrec;
+{
+ DB_REP *db_rep;
+ DB_LSN lsn;
+ DB_TXN *txn;
+ DBT data_dbt;
+ u_int8_t data_buf[__REPMGR_MEMBERSHIP_DATA_SIZE];
+ int ret, t_ret;
+
+ db_rep = env->rep_handle;
+
+ db_rep->active_gmdb_update = gmdb_secondary;
+ if ((ret = __txn_begin(env, ip, NULL, &txn, DB_IGNORE_LEASE)) != 0)
+ return (ret);
+
+ if (status == 0)
+ ret = __db_del(db_rep->gmdb, ip, txn, key_dbt, 0);
+ else {
+ marshal_site_data(env, status, data_buf, &data_dbt);
+ ret = __db_put(db_rep->gmdb, ip, txn, key_dbt, &data_dbt, 0);
+ }
+ if (ret != 0)
+ goto err;
+
+ if ((ret = incr_gm_version(env, ip, txn)) != 0)
+ goto err;
+
+ ZERO_LSN(lsn);
+ if ((ret = __repmgr_member_log(env,
+ txn, &lsn, 0, db_rep->membership_version,
+ prev_status, status, &logrec->host, logrec->port)) != 0)
+ goto err;
+
+err:
+ if ((t_ret = __db_txn_auto_resolve(env, txn, 0, ret)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
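+
+/*
+ * To make the two-phase flow concrete, a sketch of the intended
+ * sequence (not literal code; __repmgr_update_membership above is the
+ * real driver):
+ *
+ *    phase 1: __db_put() of the provisional status (e.g. SITE_ADDING)
+ *             plus incr_gm_version(), then wait for acks;
+ *    phase 2: finish_gmdb_update() moves SITE_ADDING -> SITE_PRESENT.
+ *
+ * Deletion runs SITE_DELETING -> 0, in which case the second phase
+ * turns the put into a __db_del(), as coded above.
+ */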
+
+/*
+ * Set up everything we need to update the Group Membership database. This may
+ * or may not include providing a transaction in which to do the updates
+ * (depending on whether the caller wants the creation of the database to be in
+ * the same transaction as the updates).
+ *
+ * PUBLIC: int __repmgr_setup_gmdb_op __P((ENV *,
+ * PUBLIC: DB_THREAD_INFO *, DB_TXN **, u_int32_t));
+ */
+int
+__repmgr_setup_gmdb_op(env, ip, txnp, flags)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ DB_TXN **txnp;
+ u_int32_t flags;
+{
+ DB_REP *db_rep;
+ DB_TXN *txn;
+ DB *dbp;
+ int ret, was_open;
+
+ db_rep = env->rep_handle;
+
+ dbp = NULL;
+ txn = NULL;
+
+ /*
+ * If the caller provided a place to return a txn handle, create it and
+ * perform any open operation as part of that txn. The caller is
+ * responsible for disposing of the txn. Otherwise, only begin a txn if
+ * we need to do the open and in that case commit it right after the
+ * open.
+ */
+ DB_ASSERT(env, db_rep->gmdb_busy);
+ was_open = db_rep->gmdb != NULL;
+ if ((txnp != NULL || !was_open) &&
+ (ret = __txn_begin(env, ip, NULL, &txn, DB_IGNORE_LEASE)) != 0)
+ goto err;
+
+ if (!was_open) {
+ DB_ASSERT(env, txn != NULL);
+ /*
+ * Opening the membership database is like a secondary GMDB
+ * operation, in the sense that we don't care how many clients
+ * ack it, yet we don't want the application to see any perm
+ * failure events.
+ */
+ DB_ASSERT(env, db_rep->active_gmdb_update == none);
+ db_rep->active_gmdb_update = gmdb_secondary;
+ ret = __rep_open_sysdb(env,
+ ip, txn, REPMEMBERSHIP, flags, &dbp);
+ if (ret == 0 && txnp == NULL) {
+ /* The txn was just for the open operation. */
+ ret = __txn_commit(txn, 0);
+ txn = NULL;
+ }
+ db_rep->active_gmdb_update = none;
+ if (ret != 0)
+ goto err;
+ }
+
+ /*
+ * Lock out normal API operations, again because we need to know that
+ * if a PERM_FAIL occurs, it was associated with our txn, and so that
+ * we avoid confusing the application with a PERM_FAIL event for our own
+ * txn.
+ */
+ if ((ret = __rep_take_apilockout(env)) != 0)
+ goto err;
+
+ /*
+ * Here, all steps have succeeded. Stash and/or pass back the fruits of
+ * our labor.
+ */
+ if (!was_open) {
+ DB_ASSERT(env, dbp != NULL);
+ db_rep->gmdb = dbp;
+ }
+ if (txnp != NULL) {
+ DB_ASSERT(env, txn != NULL);
+ *txnp = txn;
+ }
+ /*
+ * In the successful case, a later call to cleanup_gmdb_op will
+ * clear the API lockout.
+ */
+ return (0);
+
+err:
+ DB_ASSERT(env, ret != 0);
+ if (dbp != NULL)
+ (void)__db_close(dbp, txn, DB_NOSYNC);
+ if (txn != NULL)
+ (void)__txn_abort(txn);
+ return (ret);
+}
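+
+/*
+ * A sketch of the typical call pattern around these helpers (the real
+ * callers are __repmgr_update_membership and friends):
+ *
+ *    if ((ret = __repmgr_hold_master_role(env, conn)) != 0)
+ *        return (ret);
+ *    if ((ret = __repmgr_setup_gmdb_op(env, ip, NULL, 0)) == 0) {
+ *        ... __db_put()/__db_del() against db_rep->gmdb ...
+ *        ret = __repmgr_cleanup_gmdb_op(env, TRUE);
+ *    }
+ *    (void)__repmgr_rlse_master_role(env);
+ */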
+
+/*
+ * PUBLIC: int __repmgr_cleanup_gmdb_op __P((ENV *, int));
+ */
+int
+__repmgr_cleanup_gmdb_op(env, do_close)
+ ENV *env;
+ int do_close;
+{
+ DB_REP *db_rep;
+ int ret, t_ret;
+
+ db_rep = env->rep_handle;
+ db_rep->active_gmdb_update = none;
+ ret = __rep_clear_apilockout(env);
+
+ if (do_close && db_rep->gmdb != NULL) {
+ if ((t_ret = __db_close(db_rep->gmdb, NULL, DB_NOSYNC)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ db_rep->gmdb = NULL;
+ }
+ return (ret);
+}
+
+/*
+ * Check whether we're currently master, and if so hold that role so that we can
+ * perform a Group Membership database operation. After a successful call, the
+ * caller must call rlse_master_role to release the hold.
+ *
+ * If we can't guarantee that we can remain master, send an appropriate failure
+ * message on the given connection (unless NULL).
+ *
+ * We also ensure that only one GMDB operation will take place at a time, for a
+ * couple of reasons: if we get a PERM_FAIL it means the fate of the change is
+ * indeterminate, so we have to assume the worst. We must assume the higher
+ * value of nsites, yet we can't accept acks from the questionable site. If we
+ * allowed concurrent operations, this could lead to more than one questionable
+ * site, which would be even worse. Also, when we get a PERM_FAIL we want to
+ * know which txn failed, and that would be messy if there could be several.
+ *
+ * Of course we can't simply take the mutex for the duration, because
+ * the mutex needs to be available in order to send out the log
+ * records.
+ *
+ * PUBLIC: int __repmgr_hold_master_role __P((ENV *, REPMGR_CONNECTION *));
+ */
+int
+__repmgr_hold_master_role(env, conn)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ int ret, t_ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ LOCK_MUTEX(db_rep->mutex);
+ if ((ret = __repmgr_await_gmdbop(env)) == 0) {
+ /*
+ * If we're currently master, but client_intent is set, it means
+ * that another thread is on the way to becoming master, so we
+ * can't promise to hold the master role for the caller: we've
+ * lost a close race.
+ */
+ if (rep->master_id != db_rep->self_eid ||
+ db_rep->client_intent)
+ ret = DB_REP_UNAVAIL;
+ else
+ db_rep->gmdb_busy = TRUE;
+ }
+ UNLOCK_MUTEX(db_rep->mutex);
+ if (conn != NULL && ret == DB_REP_UNAVAIL &&
+ (t_ret = reject_fwd(env, conn)) != 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * Releases the "master role" lock once we're finished performing a membership
+ * DB operation.
+ *
+ * PUBLIC: int __repmgr_rlse_master_role __P((ENV *));
+ */
+int
+__repmgr_rlse_master_role(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ int ret;
+
+ db_rep = env->rep_handle;
+ LOCK_MUTEX(db_rep->mutex);
+ db_rep->gmdb_busy = FALSE;
+ ret = __repmgr_signal(&db_rep->gmdb_idle);
+ UNLOCK_MUTEX(db_rep->mutex);
+ return (ret);
+}
+
+/*
+ * Responds to a membership change request in the case where we're not
+ * currently master. If we know the master, responds with a "forward" message,
+ * to tell the requestor who the master is. Otherwise rejects it outright.
+ */
+static int
+reject_fwd(env, conn)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ SITE_STRING_BUFFER site_string;
+ __repmgr_gm_fwd_args fwd;
+ repmgr_netaddr_t addr;
+ u_int8_t buf[MAX_MSG_BUF];
+ u_int32_t msg_type;
+ size_t len;
+ int ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ if (IS_KNOWN_REMOTE_SITE(rep->master_id)) {
+ msg_type = REPMGR_GM_FORWARD;
+ LOCK_MUTEX(db_rep->mutex);
+ addr = SITE_FROM_EID(rep->master_id)->net_addr;
+ UNLOCK_MUTEX(db_rep->mutex);
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "Forwarding request to master %s",
+ __repmgr_format_addr_loc(&addr, site_string)));
+ fwd.host.data = addr.host;
+ fwd.host.size = (u_int32_t)strlen(fwd.host.data) + 1;
+ fwd.port = addr.port;
+ fwd.gen = rep->mgen;
+ ret = __repmgr_gm_fwd_marshal(env,
+ &fwd, buf, sizeof(buf), &len);
+ DB_ASSERT(env, ret == 0);
+ } else {
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "Rejecting membership request with no known master"));
+ msg_type = REPMGR_GM_FAILURE;
+ len = 0;
+ }
+
+ return (__repmgr_send_sync_msg(env, conn,
+ msg_type, buf, (u_int32_t)len));
+}
+
+/*
+ * The length of "buf" must be at least MAX_GMDB_KEY.
+ */
+static void
+marshal_site_key(env, addr, buf, dbt, logrec)
+ ENV *env;
+ repmgr_netaddr_t *addr;
+ u_int8_t *buf;
+ DBT *dbt;
+ __repmgr_member_args *logrec;
+{
+ __repmgr_membership_key_args key;
+ size_t len;
+ int ret;
+
+ DB_INIT_DBT(key.host, addr->host, strlen(addr->host) + 1);
+ logrec->host = key.host;
+ key.port = addr->port;
+ logrec->port = key.port;
+ ret = __repmgr_membership_key_marshal(env,
+ &key, buf, MAX_MSG_BUF, &len);
+ DB_ASSERT(env, ret == 0);
+ DB_INIT_DBT(*dbt, buf, len);
+}
+
+static void
+marshal_site_data(env, status, buf, dbt)
+ ENV *env;
+ u_int32_t status;
+ u_int8_t *buf;
+ DBT *dbt;
+{
+ __repmgr_membership_data_args member_status;
+
+ member_status.flags = status;
+ __repmgr_membership_data_marshal(env, &member_status, buf);
+ DB_INIT_DBT(*dbt, buf, __REPMGR_MEMBERSHIP_DATA_SIZE);
+}
+
+/*
+ * PUBLIC: void __repmgr_set_sites __P((ENV *));
+ *
+ * Caller must hold mutex.
+ */
+void
+__repmgr_set_sites(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ int ret;
+ u_int32_t n;
+ u_int i;
+
+ db_rep = env->rep_handle;
+
+ for (i = 0, n = 0; i < db_rep->site_cnt; i++) {
+ if (db_rep->sites[i].membership > 0)
+ n++;
+ }
+ ret = __rep_set_nsites_int(env, n);
+ DB_ASSERT(env, ret == 0);
+}
diff --git a/src/repmgr/repmgr_net.c b/src/repmgr/repmgr_net.c
new file mode 100644
index 00000000..54e3d066
--- /dev/null
+++ b/src/repmgr/repmgr_net.c
@@ -0,0 +1,2043 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/mp.h"
+
+/*
+ * The functions in this module implement a simple wire protocol for
+ * transmitting messages of various types. Every message consists of a 9-byte
+ * header followed by a body (though the body could be 0-length). The header is
+ * the marshaled form of the "msg_hdr" structure defined in repmgr.src. The
+ * interpretation of header fields depends on message type, and is defined in
+ * repmgr.h. But as a general principle, in all cases there is enough
+ * information in the header for us to know the total size of the body, and the
+ * total amount of memory we need to allocate for storing and processing the
+ * message.
+ */
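+
+/*
+ * For illustration, a receiver would consume one message from this
+ * protocol roughly as follows (a sketch, not the literal receive path;
+ * read_full() is a hypothetical helper that loops until the requested
+ * byte count has arrived):
+ *
+ *    u_int8_t hdr_buf[__REPMGR_MSG_HDR_SIZE];
+ *    __repmgr_msg_hdr_args msg_hdr;
+ *
+ *    read_full(fd, hdr_buf, __REPMGR_MSG_HDR_SIZE);
+ *    __repmgr_msg_hdr_unmarshal(env, &msg_hdr, hdr_buf,
+ *        __REPMGR_MSG_HDR_SIZE, NULL);
+ *    (the body size is then computed from msg_hdr according to the
+ *    message type, per the rules in repmgr.h, and read the same way)
+ */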
+
+/*
+ * In sending a message, we first try to send it in-line, in the sending thread,
+ * without copying the message, using scatter/gather I/O with iovecs pointing
+ * to the various pieces of the message. If that all works without blocking,
+ * that's optimal.
+ * If we find that, for a particular connection, we can't send without
+ * blocking, then we must copy the message for sending later in the select()
+ * thread. In the course of doing that, we might as well "flatten" the message,
+ * forming one single buffer, to simplify life. Not only that, once we've gone
+ * to the trouble of doing that, other sites to which we also want to send the
+ * message (in the case of a broadcast), may as well take advantage of the
+ * simplified structure also.
+ * The sending_msg structure below holds it all. Note that this structure,
+ * and the "flat_msg" structure, are allocated separately, because (1) the
+ * flat_msg version is usually not needed; and (2) when a flat_msg is needed, it
+ * will need to live longer than the wrapping sending_msg structure.
+ * Note that, for the broadcast case, where we're going to use this
+ * repeatedly, the iovecs structure is a template that must be copied, since in
+ * normal use the iovecs pointers and lengths get adjusted after every partial
+ * write.
+ */
+struct sending_msg {
+ REPMGR_IOVECS *iovecs;
+ REPMGR_FLAT *fmsg;
+};
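+
+/*
+ * Illustration (a sketch, not code that runs here): a broadcast reuses
+ * one sending_msg for every connection, but each write works on its
+ * own scratch copy of the iovecs template, because partial writes
+ * consume the copy in place:
+ *
+ *    struct sending_msg msg;
+ *    REPMGR_IOVECS iovecs;
+ *
+ *    msg.iovecs = &iovecs;
+ *    setup_sending_msg(env, &msg, hdr_buf, type, control, rec);
+ *    (__repmgr_write_iovecs() below then copies the template into
+ *    scratch storage before each attempt to write)
+ */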
+
+/*
+ * Context for a thread waiting for client acks for PERM message. Passed from
+ * the send() function to the got_acks() predicate function, via
+ * __repmgr_await_cond(). The got_acks() function computes two potentially
+ * independent results: (1) do we have enough acks to stop waiting for more (the
+ * function return value, which triggers the behavior of await_cond()); and (2)
+ * whether the PERM message should be considered durable.
+ */
+struct repmgr_permanence {
+ DB_LSN lsn; /* LSN whose ack this thread is waiting for. */
+ u_int threshold; /* Number of client acks to wait for. */
+ u_int quorum; /* Durability threshold for QUORUM policy. */
+ int policy; /* Ack policy to be used for this txn. */
+ int is_durable; /* Result flag. */
+};
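+
+/*
+ * For instance, __repmgr_send() below fills one of these on the stack
+ * (perm.lsn = *lsnp, perm.threshold = needed, and so on) and passes it
+ * to __repmgr_await_cond(env, got_acks, &perm, ...); got_acks() is then
+ * re-evaluated, with the mutex held, each time the waiter is signaled,
+ * until it returns TRUE or the ack timeout expires.
+ */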
+
+#ifdef CONFIG_TEST
+static u_int fake_port __P((ENV *, u_int));
+#endif
+static int final_cleanup __P((ENV *, REPMGR_CONNECTION *, void *));
+static int flatten __P((ENV *, struct sending_msg *));
+static int got_acks __P((ENV *, void *));
+static int __repmgr_finish_connect
+ __P((ENV *, socket_t s, REPMGR_CONNECTION **));
+static int __repmgr_propose_version __P((ENV *, REPMGR_CONNECTION *));
+static int __repmgr_start_connect __P((ENV*, socket_t *, ADDRINFO *, int *));
+static void setup_sending_msg __P((ENV *,
+ struct sending_msg *, u_int8_t *, u_int, const DBT *, const DBT *));
+static int __repmgr_send_internal
+ __P((ENV *, REPMGR_CONNECTION *, struct sending_msg *, db_timeout_t));
+static int enqueue_msg
+ __P((ENV *, REPMGR_CONNECTION *, struct sending_msg *, size_t));
+static REPMGR_SITE *connected_site __P((ENV *, int));
+static REPMGR_SITE *__repmgr_find_available_peer __P((ENV *));
+static int send_connection __P((ENV *, u_int,
+ REPMGR_CONNECTION *, struct sending_msg *, int *));
+
+/*
+ * Connects to the given network address, using blocking operations. Any thread
+ * synchronization is the responsibility of the caller.
+ *
+ * PUBLIC: int __repmgr_connect __P((ENV *,
+ * PUBLIC: repmgr_netaddr_t *, REPMGR_CONNECTION **, int *));
+ */
+int
+__repmgr_connect(env, netaddr, connp, errp)
+ ENV *env;
+ repmgr_netaddr_t *netaddr;
+ REPMGR_CONNECTION **connp;
+ int *errp;
+{
+ REPMGR_CONNECTION *conn;
+ ADDRINFO *ai0, *ai;
+ socket_t sock;
+ int err, ret;
+ u_int port;
+
+ COMPQUIET(err, 0);
+#ifdef CONFIG_TEST
+ port = fake_port(env, netaddr->port);
+#else
+ port = netaddr->port;
+#endif
+ if ((ret = __repmgr_getaddr(env, netaddr->host, port, 0, &ai0)) != 0)
+ return (ret);
+
+ /*
+ * Try each address on the list, until success. Note that if several
+ * addresses on the list produce a retryable error, we can only pass back
+ * to our caller the last one.
+ */
+ for (ai = ai0; ai != NULL; ai = ai->ai_next) {
+ switch ((ret = __repmgr_start_connect(env, &sock, ai, &err))) {
+ case 0:
+ if ((ret = __repmgr_finish_connect(env,
+ sock, &conn)) == 0)
+ *connp = conn;
+ else
+ (void)closesocket(sock);
+ goto out;
+ case DB_REP_UNAVAIL:
+ continue;
+ default:
+ goto out;
+ }
+ }
+
+out:
+ __os_freeaddrinfo(env, ai0);
+ if (ret == DB_REP_UNAVAIL) {
+ __repmgr_print_conn_err(env, netaddr, err);
+ *errp = err;
+ }
+ return (ret);
+}
+
+static int
+__repmgr_start_connect(env, socket_result, ai, err)
+ ENV *env;
+ socket_t *socket_result;
+ ADDRINFO *ai;
+ int *err;
+{
+ socket_t s;
+ int ret;
+
+ if ((s = socket(ai->ai_family,
+ ai->ai_socktype, ai->ai_protocol)) == SOCKET_ERROR) {
+ ret = net_errno;
+ __db_err(env, ret, "create socket");
+ return (ret);
+ }
+
+ if (connect(s, ai->ai_addr, (socklen_t)ai->ai_addrlen) != 0) {
+ *err = net_errno;
+ (void)closesocket(s);
+ return (DB_REP_UNAVAIL);
+ }
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC, "connection established"));
+
+ *socket_result = s;
+ return (0);
+}
+
+static int
+__repmgr_finish_connect(env, s, connp)
+ ENV *env;
+ socket_t s;
+ REPMGR_CONNECTION **connp;
+{
+ REPMGR_CONNECTION *conn;
+ int ret;
+
+ if ((ret = __repmgr_new_connection(env, &conn, s, CONN_CONNECTED)) != 0)
+ return (ret);
+
+ if ((ret = __repmgr_set_keepalive(env, conn)) == 0 &&
+ (ret = __repmgr_propose_version(env, conn)) == 0)
+ *connp = conn;
+ else
+ (void)__repmgr_destroy_conn(env, conn);
+ return (ret);
+}
+
+static int
+__repmgr_propose_version(env, conn)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+{
+ DB_REP *db_rep;
+ __repmgr_version_proposal_args versions;
+ repmgr_netaddr_t *my_addr;
+ size_t hostname_len, rec_length;
+ u_int8_t *buf, *p;
+ int ret;
+
+ db_rep = env->rep_handle;
+ my_addr = &SITE_FROM_EID(db_rep->self_eid)->net_addr;
+
+ /*
+ * In repmgr wire protocol version 1, a handshake message had a rec part
+ * that looked like this:
+ *
+ * +-----------------+----+
+ * | host name ... | \0 |
+ * +-----------------+----+
+ *
+ * To ensure its own sanity, the old repmgr would write a NUL into the
+ * last byte of a received message, and then use normal C library string
+ * operations (e.g., strlen, strcpy).
+ *
+ * Now, a version proposal has a rec part that looks like this:
+ *
+ * +-----------------+----+------------------+------+
+ * | host name ... | \0 | extra info ... | \0 |
+ * +-----------------+----+------------------+------+
+ *
+ * The "extra info" contains the version parameters, in marshaled form.
+ */
+
+ hostname_len = strlen(my_addr->host);
+ rec_length = hostname_len + 1 +
+ __REPMGR_VERSION_PROPOSAL_SIZE + 1;
+ if ((ret = __os_malloc(env, rec_length, &buf)) != 0)
+ goto out;
+ p = buf;
+ (void)strcpy((char*)p, my_addr->host);
+
+ p += hostname_len + 1;
+ versions.min = DB_REPMGR_MIN_VERSION;
+ versions.max = DB_REPMGR_VERSION;
+ __repmgr_version_proposal_marshal(env, &versions, p);
+
+ ret = __repmgr_send_v1_handshake(env, conn, buf, rec_length);
+ __os_free(env, buf);
+out:
+ return (ret);
+}
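+
+/*
+ * Concretely, for a host named "site1" the rec part built above would
+ * look like this (a sketch of the byte layout):
+ *
+ *    's' 'i' 't' 'e' '1' '\0' <marshaled {min, max}> '\0'
+ *
+ * A version-1 peer that NUL-terminates the buffer and applies strlen()
+ * still sees just the host name, which is what keeps this extension
+ * backward compatible.
+ */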
+
+/*
+ * __repmgr_send --
+ * The send function for DB_ENV->rep_set_transport.
+ *
+ * PUBLIC: int __repmgr_send __P((DB_ENV *, const DBT *, const DBT *,
+ * PUBLIC: const DB_LSN *, int, u_int32_t));
+ */
+int
+__repmgr_send(dbenv, control, rec, lsnp, eid, flags)
+ DB_ENV *dbenv;
+ const DBT *control, *rec;
+ const DB_LSN *lsnp;
+ int eid;
+ u_int32_t flags;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ REPMGR_SITE *site;
+ struct repmgr_permanence perm;
+ db_timeout_t maxblock;
+ u_int available, nclients, needed, npeers_sent, nsites_sent, quorum;
+ int missed_peer, policy, ret, t_ret;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ ret = 0;
+
+ LOCK_MUTEX(db_rep->mutex);
+
+ /*
+ * If we're already "stopped", we can't send anything. This covers the
+ * case where a bulk buffer is flushed at env close, or perhaps an
+ * unexpected __repmgr_thread_failure.
+ */
+ if (db_rep->repmgr_status == stopped) {
+ ret = DB_REP_UNAVAIL;
+ goto out;
+ }
+
+ /*
+ * Check whether we need to refresh our site address information with
+ * more recent updates from shared memory.
+ */
+ if (rep->siteinfo_seq > db_rep->siteinfo_seq &&
+ (ret = __repmgr_sync_siteaddr(env)) != 0)
+ goto out;
+
+ if (eid == DB_EID_BROADCAST) {
+ if ((ret = __repmgr_send_broadcast(env,
+ REPMGR_REP_MESSAGE, control, rec,
+ &nsites_sent, &npeers_sent, &missed_peer)) != 0)
+ goto out;
+ } else {
+ DB_ASSERT(env, IS_KNOWN_REMOTE_SITE(eid));
+
+ /*
+ * Since repmgr's simple c2c implementation doesn't truly manage
+ * staged synchronization it doesn't work well with master
+ * leases. So, disable it during the time when a new master may
+ * be trying to establish its first set of lease grants.
+ */
+ if (IS_USING_LEASES(env) && !rep->stat.st_startup_complete)
+ LF_CLR(DB_REP_ANYWHERE);
+ /*
+ * If this is a request that can be sent anywhere, then see if
+ * we can send it to our peer (to save load on the master), but
+ * not if it's a rerequest, because that likely means we tried this
+ * already and failed.
+ */
+ if ((flags & (DB_REP_ANYWHERE | DB_REP_REREQUEST)) ==
+ DB_REP_ANYWHERE &&
+ (site = __repmgr_find_available_peer(env)) != NULL) {
+ VPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "sending request to peer"));
+ } else if ((site = connected_site(env, eid)) == NULL) {
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "ignoring message sent to unavailable site"));
+ ret = DB_REP_UNAVAIL;
+ goto out;
+ }
+
+ /*
+ * In case the connection is clogged up and we have to wait for
+ * space on the output queue, how long shall we wait? We could
+ * of course create a new timeout configuration type, so that
+ * the application could set it directly. But that would start
+ * to overwhelm the user with too many choices to think about.
+ * We already have an ACK timeout, which is the user's estimate
+ * of how long it should take to send a message to the client,
+ * have it be processed, and return a message back to us. We
+ * multiply that by the queue size, because that's how many
+ * messages have to be swallowed up by the client before we're
+ * able to start sending again (at least to a rough
+ * approximation).
+ */
+ maxblock = OUT_QUEUE_LIMIT *
+ (rep->ack_timeout == 0 ?
+ DB_REPMGR_DEFAULT_ACK_TIMEOUT : rep->ack_timeout);
+
+ /*
+ * Assign the conn struct pointer to a local variable ("conn"),
+ * because the pointer in the site struct (ref.conn.in or
+ * ref.conn.out) could get clobbered if the connection gets
+ * busted in another thread during our send_one() call. That
+ * could happen if the outgoing half of the connection is
+ * clogged and we decide to await_drain().
+ */
+#undef SEND_ONE_CONNECTION
+#define SEND_ONE_CONNECTION(c) \
+ do { \
+ if ((conn = (c)) != NULL && \
+ IS_READY_STATE(conn->state) && \
+ (ret = __repmgr_send_one(env, \
+ conn, REPMGR_REP_MESSAGE, \
+ control, rec, maxblock)) == DB_REP_UNAVAIL && \
+ (t_ret = \
+ __repmgr_bust_connection(env, conn)) != 0) \
+ ret = t_ret; \
+ } while (0)
+
+ SEND_ONE_CONNECTION(site->ref.conn.in);
+ if (ret != 0 && ret != DB_REP_UNAVAIL)
+ goto out;
+ SEND_ONE_CONNECTION(site->ref.conn.out);
+ if (ret != 0)
+ goto out;
+#undef SEND_ONE_CONNECTION
+
+ nsites_sent = 1;
+ npeers_sent = F_ISSET(site, SITE_ELECTABLE) ? 1 : 0;
+ missed_peer = FALSE;
+ }
+
+ /*
+ * Traditionally, each ack policy determines how many acks are needed to
+ * constitute successful durability. We would simply wait until we
+ * collected that many acks, and if we got them it was success, or if we
+ * timed out it was failure. And if we knew from the start that we
+ * hadn't even sent the message to enough sites to meet the "needed"
+ * threshold, then there was no point in waiting.
+ * It's a different story for the ALL_AVAILABLE policy. There the
+ * decision to continue awaiting more acks is decoupled from the
+ * durability question: we want to wait until we get acks from all sites
+ * we sent to (though still within the timeout limit).
+ * So now we have to think of "needed" in a slightly more general
+ * way: it's the threshold that controls how many acks we keep waiting
+ * for. It usually still controls the determination of the durability
+ * result as well; the exception is ALL_AVAILABLE.
+ */
+ if (LF_ISSET(DB_REP_PERMANENT)) {
+ /* Adjust so as not to count the local site, which is master. */
+ nclients = db_rep->region->config_nsites - 1;
+
+ /*
+ * When doing membership DB changes, avoid some impossible
+ * situations.
+ */
+ policy = rep->perm_policy;
+ switch (db_rep->active_gmdb_update) {
+ case gmdb_primary:
+ if (policy == DB_REPMGR_ACKS_ALL ||
+ policy == DB_REPMGR_ACKS_ALL_PEERS)
+ policy = DB_REPMGR_ACKS_ALL_AVAILABLE;
+ else if (policy == DB_REPMGR_ACKS_QUORUM &&
+ nclients == 1)
+ nclients = 0;
+ else if ((policy == DB_REPMGR_ACKS_ONE ||
+ policy == DB_REPMGR_ACKS_ONE_PEER) &&
+ nclients == 1) {
+ nclients = 0;
+ policy = DB_REPMGR_ACKS_QUORUM;
+ }
+ break;
+ case gmdb_secondary:
+ policy = DB_REPMGR_ACKS_NONE;
+ break;
+ case none:
+ break;
+ }
+ quorum = 0;
+ switch (policy) {
+ case DB_REPMGR_ACKS_NONE:
+ needed = 0;
+ COMPQUIET(available, 0);
+ break;
+
+ case DB_REPMGR_ACKS_ONE:
+ needed = 1;
+ available = nsites_sent;
+ break;
+
+ case DB_REPMGR_ACKS_ALL:
+ /* Number of sites in the group besides myself. */
+ needed = nclients;
+ available = nsites_sent;
+ break;
+
+ case DB_REPMGR_ACKS_ONE_PEER:
+ needed = 1;
+ available = npeers_sent;
+ break;
+
+ case DB_REPMGR_ACKS_ALL_PEERS:
+ /*
+ * Too hard to figure out "needed", since we're not
+ * keeping track of how many peers we have; so just skip
+ * the optimization in this case.
+ */
+ needed = 1;
+ available = npeers_sent;
+ break;
+
+ case DB_REPMGR_ACKS_QUORUM:
+ case DB_REPMGR_ACKS_ALL_AVAILABLE:
+ /*
+ * The minimum number of acks necessary to ensure that
+ * the transaction is durable if an election is held.
+ *
+ * Unless instructed otherwise, our special handling for
+ * 2-site groups means that a client that loses contact
+ * with the master elects itself master (even though
+ * that doesn't constitute a majority). In order to
+ * provide the expected guarantee implied by the
+ * definition of "quorum" we have to fudge the ack
+ * calculation in this case: specifically, we need to
+ * make sure that the client has received it in order
+ * for us to consider it "perm". Thus, if nclients is
+ * 1, needed should be 1.
+ *
+ * While we're at it, if nclients is 0 (a nascent
+ * "group" consisting of nothing but a master), surely
+ * the number of acks we need should be 0.
+ *
+ * Note that turning the usual strict behavior back on
+ * in a 2-site group results in "0" as the number of
+ * clients needed to ack a txn in order for it to have
+ * arrived at a quorum. This is the correct result,
+ * strange as it may seem! This may well mean that in a
+ * 2-site group the QUORUM policy is rarely the right
+ * choice.
+ *
+ * When a GMDB update adds the second site, force
+ * "strict" behavior: in that case nsites is 2, but the
+ * new site is not yet allowed to contribute an ack.
+ */
+ if (nclients > 1 ||
+ FLD_ISSET(db_rep->region->config,
+ REP_C_2SITE_STRICT) ||
+ db_rep->active_gmdb_update == gmdb_primary)
+ quorum = nclients / 2;
+ else
+ quorum = nclients;
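+
+ /*
+ * Worked examples (sketches): with nclients == 4, quorum ==
+ * 4 / 2 == 2, so two client acks suffice for durability. In
+ * a non-strict 2-site group, nclients == 1 takes the "else"
+ * branch and quorum == 1, so the lone client must ack; with
+ * REP_C_2SITE_STRICT set, quorum == 1 / 2 == 0 instead.
+ */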
+
+ if (policy == DB_REPMGR_ACKS_ALL_AVAILABLE) {
+ if (nsites_sent > 0)
+ needed = available = nsites_sent;
+ else {
+ ret = quorum > 0 ? DB_REP_UNAVAIL : 0;
+ goto out;
+ }
+ } else {
+ DB_ASSERT(env, policy == DB_REPMGR_ACKS_QUORUM);
+ needed = quorum;
+ available = npeers_sent;
+ if (npeers_sent < quorum && !missed_peer) {
+ /*
+ * If we sent to all peers, it doesn't
+ * matter how few there were. This
+ * derives from the definition of the
+ * QUORUM policy: no possible subsequent
+ * election can fail to include the
+ * transaction. If all electable sites
+ * have the transaction, then it can't
+ * be lost in an election, no matter how
+ * few there are.
+ */
+ needed = npeers_sent;
+ }
+ }
+ break;
+
+ default:
+ ret = __db_unknown_path(env, "__repmgr_send");
+ goto out;
+ }
+ if (policy != DB_REPMGR_ACKS_ALL_AVAILABLE) {
+ /*
+ * Skip the waiting if it is unnecessary, or if it would
+ * be futile. For most ack policies, these decisions
+ * are straightforward, and can be computed in the
+ * following generic way. For ALL_AVAILABLE, skipping
+ * is also possible, but it is decided earlier (above,
+ * inside the "switch" statement).
+ *
+ * Note that for ALL, there is a surprising side-effect
+ * if even one client is down. It will not wait for
+ * any acks and the running clients can fall further
+ * and further behind the master.
+ */
+ if (needed == 0)
+ goto out;
+ if (available < needed) {
+ ret = DB_REP_UNAVAIL;
+ goto out;
+ }
+ }
+
+ /* In the ALL_PEERS case, display of "needed" might be confusing. */
+ VPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "will await acknowledgement: need %u", needed));
+ perm.lsn = *lsnp;
+ perm.threshold = needed;
+ perm.policy = policy;
+ perm.quorum = quorum;
+ perm.is_durable = FALSE;
+ ret = __repmgr_await_cond(env, got_acks,
+ &perm, rep->ack_timeout, &db_rep->ack_waiters);
+ if (ret == 0 || ret == DB_TIMEOUT)
+ ret = perm.is_durable ? 0 : DB_REP_UNAVAIL;
+ }
+
+out: UNLOCK_MUTEX(db_rep->mutex);
+ if (LF_ISSET(DB_REP_PERMANENT)) {
+ if (ret != 0) {
+ switch (db_rep->active_gmdb_update) {
+ case none:
+ /*
+ * Fire perm-failed event to the application as
+ * usual; no other bookkeeping needed here.
+ */
+ STAT(db_rep->region->mstat.st_perm_failed++);
+ DB_EVENT(env, DB_EVENT_REP_PERM_FAILED, NULL);
+ break;
+ case gmdb_primary:
+ /*
+ * Since this is a membership DB operation,
+ * refrain from bothering the application about
+ * it (with an event that it wouldn't be
+ * expecting), and make a note of the failure so
+ * we can resolve it later.
+ */
+ db_rep->limbo_failure = *lsnp;
+ /* FALLTHROUGH */
+ case gmdb_secondary:
+ /* Merely refrain from firing event. */
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "GMDB perm failure %d at [%lu][%lu]",
+ (int)db_rep->active_gmdb_update,
+ (u_long)lsnp->file, (u_long)lsnp->offset));
+ break;
+ }
+ } else if (db_rep->limbo_resolution_needed) {
+ /*
+ * A previous membership DB operation failed, leaving us
+ * "in limbo", but now some perm operation has completed
+ * successfully. Since the ack of any txn implies ack
+ * of all txns that occur before it (in LSN order), we
+ * now know that the previous failure can be resolved.
+ * We can't do it here in this thread, so put a request
+ * on the message processing queue to have it handled
+ * later.
+ */
+ db_rep->durable_lsn = *lsnp;
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "perm success [%lu][%lu] with limbo resolution needed",
+ (u_long)lsnp->file, (u_long)lsnp->offset));
+ db_rep->limbo_resolution_needed = FALSE;
+
+ /* Don't trump ret, even if it's zero. */
+ LOCK_MUTEX(db_rep->mutex);
+ if ((t_ret = __repmgr_defer_op(env,
+ REPMGR_RESOLVE_LIMBO)) != 0)
+ __db_err(env, t_ret, "repmgr_defer_op");
+ UNLOCK_MUTEX(db_rep->mutex);
+ }
+ }
+ return (ret);
+}
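+
+/*
+ * Putting the limbo bookkeeping together (a summary, not a new
+ * mechanism): a gmdb_primary perm failure records its LSN in
+ * db_rep->limbo_failure; a later perm success records
+ * db_rep->durable_lsn and defers a REPMGR_RESOLVE_LIMBO request; the
+ * limbo resolution code in the membership module above then compares
+ * the two LSNs to decide whether the stranded GMDB update can be
+ * finished off without a fresh self-update.
+ */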
+
+static REPMGR_SITE *
+connected_site(env, eid)
+ ENV *env;
+ int eid;
+{
+ DB_REP *db_rep;
+ REPMGR_SITE *site;
+
+ db_rep = env->rep_handle;
+ DB_ASSERT(env, IS_VALID_EID(eid));
+ site = SITE_FROM_EID(eid);
+ if (site->state == SITE_CONNECTED)
+ return (site);
+ return (NULL);
+}
+
+/*
+ * Synchronize our list of sites with new information that has been added to the
+ * list in the shared region.
+ *
+ * PUBLIC: int __repmgr_sync_siteaddr __P((ENV *));
+ */
+int
+__repmgr_sync_siteaddr(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ u_int added;
+ int ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ ret = 0;
+
+ MUTEX_LOCK(env, rep->mtx_repmgr);
+
+ if (!IS_VALID_EID(db_rep->self_eid))
+ db_rep->self_eid = rep->self_eid;
+
+ added = db_rep->site_cnt;
+ if ((ret = __repmgr_copy_in_added_sites(env)) == 0)
+ ret = __repmgr_init_new_sites(env, (int)added,
+ (int)db_rep->site_cnt);
+
+ MUTEX_UNLOCK(env, rep->mtx_repmgr);
+ return (ret);
+}
+
+/*
+ * Sends a message to all sites with which we currently have an active
+ * connection. Sets result parameters according to how many sites we attempted
+ * to begin sending to, even if we did nothing more than queue it for later
+ * delivery.
+ *
+ * !!!
+ * Caller must hold env->mutex.
+ * PUBLIC: int __repmgr_send_broadcast __P((ENV *, u_int,
+ * PUBLIC: const DBT *, const DBT *, u_int *, u_int *, int *));
+ */
+int
+__repmgr_send_broadcast(env, type, control, rec, nsitesp, npeersp, missingp)
+ ENV *env;
+ u_int type;
+ const DBT *control, *rec;
+ u_int *nsitesp, *npeersp;
+ int *missingp;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ struct sending_msg msg;
+ REPMGR_SITE *site;
+ REPMGR_IOVECS iovecs;
+ u_int8_t msg_hdr_buf[__REPMGR_MSG_HDR_SIZE];
+ u_int nsites, npeers;
+ int eid, full_member, has_missing_peer, ret, sent1, sent2;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ /*
+ * Sending a broadcast is quick, because we allow no blocking, so the
+ * exact placement of the timestamp shouldn't much matter. But just in
+ * case, take it before sending, so that if anything we err on the side
+ * of keeping clients placated (i.e., possibly sending a heartbeat
+ * slightly more frequently than necessary).
+ */
+ __os_gettime(env, &db_rep->last_bcast, 1);
+
+ msg.iovecs = &iovecs;
+ setup_sending_msg(env, &msg, msg_hdr_buf, type, control, rec);
+ nsites = npeers = 0;
+ has_missing_peer = FALSE;
+
+ /* Send to (only the main connection with) every site. */
+ FOR_EACH_REMOTE_SITE_INDEX(eid) {
+ sent1 = sent2 = FALSE;
+ site = SITE_FROM_EID(eid);
+
+ /*
+ * Exclude non-member sites, unless we're the master, since it's
+ * useful to keep letting a removed site see updates so that it
+ * learns of its own removal, and will know to rejoin at its
+ * next reboot.
+ */
+ if (site->membership == SITE_PRESENT)
+ full_member = TRUE;
+ else {
+ full_member = FALSE;
+ if (rep->master_id != db_rep->self_eid)
+ goto next;
+ }
+
+ /*
+ * Send message on either or both main connections, as
+ * available.
+ */
+ if ((ret = send_connection(env, type,
+ site->ref.conn.in, &msg, &sent1)) != 0)
+ return (ret);
+ if ((ret = send_connection(env, type,
+ site->ref.conn.out, &msg, &sent2)) != 0)
+ return (ret);
+next:
+ /*
+ * Count how many full-fledged member sites we sent to, and how
+ * many of those were electable peers. These values will be
+ * used by the caller to manage waiting for acks. Ignore
+ * non-full-fledged member sites because we don't accept acks
+ * from them.
+ */
+ if (full_member) {
+ if (sent1 || sent2) {
+ nsites++;
+ if (F_ISSET(site, SITE_ELECTABLE))
+ npeers++;
+ } else {
+ /*
+ * Keep track of whether any of the sites we
+ * failed to send to was an electable peer. If
+ * we don't know a site's electability yet, we
+ * assume the worst in order to be safe.
+ */
+ if (!F_ISSET(site, SITE_HAS_PRIO) ||
+ F_ISSET(site, SITE_ELECTABLE))
+ has_missing_peer = TRUE;
+ }
+ }
+ }
+
+ *nsitesp = nsites;
+ *npeersp = npeers;
+ *missingp = has_missing_peer;
+ return (0);
+}
+
+static int
+send_connection(env, type, conn, msg, sent)
+ ENV *env;
+ u_int type;
+ REPMGR_CONNECTION *conn;
+ struct sending_msg *msg;
+ int *sent;
+{
+ DB_REP *db_rep;
+ int ret;
+
+ static const u_int version_max_msg_type[] = {
+ 0,
+ REPMGR_MAX_V1_MSG_TYPE,
+ REPMGR_MAX_V2_MSG_TYPE,
+ REPMGR_MAX_V3_MSG_TYPE,
+ REPMGR_MAX_V4_MSG_TYPE
+ };
+
+ db_rep = env->rep_handle;
+ *sent = FALSE;
+ if (conn == NULL || !IS_READY_STATE(conn->state))
+ return (0);
+
+ DB_ASSERT(env, IS_KNOWN_REMOTE_SITE(conn->eid) &&
+ conn->version > 0 &&
+ conn->version <= DB_REPMGR_VERSION);
+
+ /*
+ * Skip if the type of message we're sending is beyond the range
+ * of known message types for this connection's version.
+ *
+ * !!!
+ * Don't be misled by the apparent generality of this simple
+ * test. It works currently, because the only kinds of messages
+ * that we broadcast are REP_MESSAGE and HEARTBEAT. But in the
+ * future other kinds of messages might require more intricate
+ * per-connection-version customization (for example,
+ * per-version message format conversion, addition of new
+ * fields, etc.).
+ */
+ if (type > version_max_msg_type[conn->version])
+ return (0);
+
+ /*
+ * Broadcast messages either come from application threads
+ * committing transactions, or are replication status messages
+ * that we can afford to lose. So don't allow blocking for them
+ * (pass the maxblock argument as 0).
+ */
+ if ((ret = __repmgr_send_internal(env, conn, msg, 0)) == 0)
+ *sent = TRUE;
+ else if (ret == DB_TIMEOUT) {
+ /*
+ * Couldn't send because of a full output queue.
+ * Indicating that we sent it would be wrong, but it's
+ * otherwise OK in the sense that the connection isn't
+ * definitively known to be broken, and rep protocol
+ * always allows us to drop a message if we have to.
+ */
+ ret = 0;
+ } else if (ret == DB_REP_UNAVAIL)
+ ret = __repmgr_bust_connection(env, conn);
+ return (ret);
+}
+
+/*
+ * __repmgr_send_one --
+ * Send a message to a site, or if you can't just yet, make a copy of it
+ * and arrange to have it sent later. 'rec' may be NULL, in which case we send
+ * a zero length and no data.
+ *
+ * !!!
+ * Note that the mutex should be held through this call.
+ * It doubles as a synchronizer to make sure that two threads don't
+ * intersperse writes that are part of two single messages.
+ *
+ * PUBLIC: int __repmgr_send_one __P((ENV *, REPMGR_CONNECTION *,
+ * PUBLIC: u_int, const DBT *, const DBT *, db_timeout_t));
+ */
+int
+__repmgr_send_one(env, conn, msg_type, control, rec, maxblock)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ u_int msg_type;
+ const DBT *control, *rec;
+ db_timeout_t maxblock;
+{
+ struct sending_msg msg;
+ REPMGR_IOVECS iovecs;
+ u_int8_t hdr_buf[__REPMGR_MSG_HDR_SIZE];
+ int ret;
+
+ msg.iovecs = &iovecs;
+ setup_sending_msg(env, &msg, hdr_buf, msg_type, control, rec);
+ if ((ret =
+ __repmgr_send_internal(env, conn, &msg, maxblock)) == DB_TIMEOUT &&
+ maxblock == 0)
+ ret = 0;
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_send_many __P((ENV *,
+ * PUBLIC: REPMGR_CONNECTION *, REPMGR_IOVECS *, db_timeout_t));
+ */
+int
+__repmgr_send_many(env, conn, iovecs, maxblock)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ REPMGR_IOVECS *iovecs;
+ db_timeout_t maxblock;
+{
+ struct sending_msg msg;
+ int ret;
+
+ if (conn->state == CONN_DEFUNCT)
+ return (DB_REP_UNAVAIL);
+ msg.iovecs = iovecs;
+ msg.fmsg = NULL;
+ if ((ret =
+ __repmgr_send_internal(env, conn, &msg, maxblock)) == DB_TIMEOUT &&
+ maxblock == 0)
+ ret = 0;
+ if (ret != 0 && ret != DB_TIMEOUT)
+ (void)__repmgr_disable_connection(env, conn);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_send_own_msg __P((ENV *,
+ * PUBLIC: REPMGR_CONNECTION *, u_int32_t, u_int8_t *, u_int32_t));
+ */
+int
+__repmgr_send_own_msg(env, conn, type, buf, len)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ u_int8_t *buf;
+ u_int32_t len, type;
+{
+ REPMGR_IOVECS iovecs;
+ struct sending_msg msg;
+ __repmgr_msg_hdr_args msg_hdr;
+ u_int8_t hdr_buf[__REPMGR_MSG_HDR_SIZE];
+
+ if (conn->version < OWN_MIN_VERSION)
+ return (0);
+ msg_hdr.type = REPMGR_OWN_MSG;
+ REPMGR_OWN_BUF_SIZE(msg_hdr) = len;
+ REPMGR_OWN_MSG_TYPE(msg_hdr) = type;
+ __repmgr_msg_hdr_marshal(env, &msg_hdr, hdr_buf);
+
+ __repmgr_iovec_init(&iovecs);
+ __repmgr_add_buffer(&iovecs, hdr_buf, __REPMGR_MSG_HDR_SIZE);
+ if (len > 0)
+ __repmgr_add_buffer(&iovecs, buf, len);
+
+ msg.iovecs = &iovecs;
+ msg.fmsg = NULL;
+ return (__repmgr_send_internal(env, conn, &msg, 0));
+}
+
+/*
+ * Attempts a "best effort" to send a message on the given site. If there is an
+ * excessive backlog of messages already queued on the connection, what shall we
+ * do? If the caller doesn't mind blocking, we'll wait (a limited amount of
+ * time) for the queue to drain. Otherwise we'll simply drop the message. This
+ * is always allowed by the replication protocol. But in the case of a
+ * multi-message response to a request like PAGE_REQ, LOG_REQ or ALL_REQ we
+ * almost always get a flood of messages that instantly fills our queue, so
+ * blocking improves performance (by avoiding the need for the client to
+ * re-request).
+ */
+static int
+__repmgr_send_internal(env, conn, msg, maxblock)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ struct sending_msg *msg;
+ db_timeout_t maxblock;
+{
+ DB_REP *db_rep;
+ SITE_STRING_BUFFER buffer;
+ int ret;
+ size_t total_written;
+
+ db_rep = env->rep_handle;
+
+ DB_ASSERT(env, conn->state != CONN_DEFUNCT);
+ if (!STAILQ_EMPTY(&conn->outbound_queue)) {
+ /*
+ * Output to this site is currently owned by the select()
+ * thread, so we can't try sending in-line here. We can only
+ * queue the msg for later.
+ */
+ VPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "msg to %s to be queued",
+ __repmgr_format_eid_loc(db_rep, conn, buffer)));
+ if (conn->out_queue_length >= OUT_QUEUE_LIMIT &&
+ maxblock > 0 && conn->state != CONN_CONGESTED) {
+ VPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "block thread, awaiting output queue space"));
+ conn->ref_count++;
+ ret = __repmgr_await_drain(env, conn, maxblock);
+ conn->ref_count--;
+ VPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "drain returned %d (%d,%d)", ret,
+ db_rep->repmgr_status, conn->out_queue_length));
+ if (db_rep->repmgr_status == stopped)
+ return (DB_TIMEOUT);
+ if (ret != 0)
+ return (ret);
+ if (STAILQ_EMPTY(&conn->outbound_queue))
+ goto empty;
+ }
+ if (conn->out_queue_length < OUT_QUEUE_LIMIT)
+ return (enqueue_msg(env, conn, msg, 0));
+ else {
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "queue limit exceeded"));
+ STAT(env->rep_handle->
+ region->mstat.st_msgs_dropped++);
+ return (DB_TIMEOUT);
+ }
+ }
+empty:
+ if ((ret = __repmgr_write_iovecs(env,
+ conn, msg->iovecs, &total_written)) == 0)
+ return (0);
+ switch (ret) {
+ case WOULDBLOCK:
+#if defined(DB_REPMGR_EAGAIN) && DB_REPMGR_EAGAIN != WOULDBLOCK
+ case DB_REPMGR_EAGAIN:
+#endif
+ break;
+ default:
+#ifdef EBADF
+ DB_ASSERT(env, ret != EBADF);
+#endif
+ __repmgr_fire_conn_err_event(env, conn, ret);
+ STAT(env->rep_handle->region->mstat.st_connection_drop++);
+ return (DB_REP_UNAVAIL);
+ }
+
+ VPRINT(env, (env, DB_VERB_REPMGR_MISC, "wrote only %lu bytes to %s",
+ (u_long)total_written,
+ __repmgr_format_eid_loc(db_rep, conn, buffer)));
+ /*
+ * We can't send any more without blocking: queue (a pointer to) a
+ * "flattened" copy of the message, so that the select() thread will
+ * finish sending it later.
+ */
+ if ((ret = enqueue_msg(env, conn, msg, total_written)) != 0)
+ return (ret);
+
+ STAT(env->rep_handle->region->mstat.st_msgs_queued++);
+
+ /*
+ * Wake the main select thread so that it can discover that it has
+ * received ownership of this connection. Note that we didn't have to
+ * do this in the previous case (above), because the non-empty queue
+ * implies that the select() thread is already managing ownership of
+ * this connection.
+ */
+ return (__repmgr_wake_main_thread(env));
+}
+
+/*
+ * PUBLIC: int __repmgr_write_iovecs __P((ENV *, REPMGR_CONNECTION *,
+ * PUBLIC: REPMGR_IOVECS *, size_t *));
+ */
+int
+__repmgr_write_iovecs(env, conn, iovecs, writtenp)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ REPMGR_IOVECS *iovecs;
+ size_t *writtenp;
+{
+ REPMGR_IOVECS iovec_buf, *v;
+ size_t nw, sz, total_written;
+ int ret;
+
+ /*
+ * Send as much data to the site as we can, without blocking. Keep
+ * writing as long as we're making some progress.
+ *
+ * Make a scratch copy of iovecs for our use, since we destroy it in the
+ * process of adjusting pointers after each partial I/O. The minimal
+ * REPMGR_IOVECS struct template is usually enough. But for app
+ * messages that need more than 3 segments we allocate a separate
+ * buffer.
+ */
+ if (iovecs->count <= MIN_IOVEC) {
+ v = &iovec_buf;
+ sz = sizeof(iovec_buf);
+ } else {
+ sz = (size_t)REPMGR_IOVECS_ALLOC_SZ((u_int)iovecs->count);
+ if ((ret = __os_malloc(env, sz, &v)) != 0)
+ return (ret);
+ }
+ memcpy(v, iovecs, sz);
+
+ total_written = 0;
+ while ((ret = __repmgr_writev(conn->fd, &v->vectors[v->offset],
+ v->count-v->offset, &nw)) == 0) {
+ total_written += nw;
+ if (__repmgr_update_consumed(v, nw)) /* all written */
+ break;
+ }
+ *writtenp = total_written;
+ if (v != &iovec_buf)
+ __os_free(env, v);
+ return (ret);
+}
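+
+/*
+ * A worked example of the consume-in-place loop above: if the scratch
+ * copy holds segments of 9, 100 and 4000 bytes and writev() reports
+ * nw == 150, __repmgr_update_consumed() advances past the first two
+ * segments entirely and trims 41 bytes off the front of the third, so
+ * the next writev() resumes exactly at the unsent remainder.
+ */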
+
+/*
+ * Count up how many sites have ack'ed the given LSN.
+ *
+ * Computes two results: the main result (function's return code) is a boolean
+ * flag indicating whether we've gotten all the acks we need and can therefore
+ * stop waiting for more. The perm->is_durable field determines whether we got
+ * enough acks to consider the transaction durably replicated. These two
+ * results are almost always the same, except when using the ALL_AVAILABLE
+ * policy.
+ *
+ * !!!
+ * Caller must hold the mutex.
+ */
+static int
+got_acks(env, context)
+ ENV *env;
+ void *context;
+{
+ DB_REP *db_rep;
+ REPMGR_SITE *site;
+ struct repmgr_permanence *perm;
+ u_int sites_acked, peers_acked;
+ int done, eid, has_unacked_peer, is_perm, policy;
+
+ db_rep = env->rep_handle;
+ perm = context;
+ policy = perm->policy;
+
+ sites_acked = peers_acked = 0;
+ has_unacked_peer = FALSE;
+ FOR_EACH_REMOTE_SITE_INDEX(eid) {
+ site = SITE_FROM_EID(eid);
+ if (site->membership != SITE_PRESENT)
+ continue;
+ if (!F_ISSET(site, SITE_HAS_PRIO)) {
+ /*
+ * Never connected to this site: since we can't know
+ * whether it's a peer, assume the worst.
+ */
+ has_unacked_peer = TRUE;
+ continue;
+ }
+
+ if (LOG_COMPARE(&site->max_ack, &perm->lsn) >= 0) {
+ sites_acked++;
+ if (F_ISSET(site, SITE_ELECTABLE))
+ peers_acked++;
+ } else {
+ /* This site hasn't ack'ed the message. */
+ if (F_ISSET(site, SITE_ELECTABLE))
+ has_unacked_peer = TRUE;
+ }
+ }
+ VPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "checking perm result, %lu, %lu, %d",
+ (u_long)sites_acked, (u_long)peers_acked, has_unacked_peer));
+
+ switch (policy) {
+ case DB_REPMGR_ACKS_ALL:
+ case DB_REPMGR_ACKS_ONE:
+ is_perm = (sites_acked >= perm->threshold);
+ break;
+ case DB_REPMGR_ACKS_ONE_PEER:
+ is_perm = (peers_acked >= perm->threshold);
+ break;
+ case DB_REPMGR_ACKS_QUORUM:
+ case DB_REPMGR_ACKS_ALL_AVAILABLE:
+ is_perm = (peers_acked >= perm->quorum) || !has_unacked_peer;
+ break;
+ case DB_REPMGR_ACKS_ALL_PEERS:
+ is_perm = !has_unacked_peer;
+ break;
+ default:
+ is_perm = FALSE;
+ (void)__db_unknown_path(env, "got_acks");
+ }
+ if (is_perm)
+ perm->is_durable = TRUE;
+ if (policy == DB_REPMGR_ACKS_ALL_AVAILABLE)
+ done = sites_acked >= perm->threshold;
+ else
+ done = is_perm;
+ return (done);
+}
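+
+/*
+ * A sketch of how the two results can diverge under ALL_AVAILABLE:
+ * suppose we sent to three sites (perm->threshold == 3) and
+ * perm->quorum == 2. Once two electable peers have acked, is_durable
+ * is latched TRUE, but "done" stays FALSE until all three sites have
+ * acked or the timeout expires; __repmgr_send() then reports success
+ * because is_durable held.
+ */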
+
+/*
+ * Abandons a connection, to recover from an error. Takes necessary recovery
+ * action. Note that we don't actually close and clean up the connection here;
+ * that happens later, in the select() thread main loop. See further
+ * explanation at function __repmgr_disable_connection().
+ *
+ * Idempotent.
+ *
+ * PUBLIC: int __repmgr_bust_connection __P((ENV *, REPMGR_CONNECTION *));
+ *
+ * !!!
+ * Caller holds mutex.
+ */
+int
+__repmgr_bust_connection(env, conn)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ REPMGR_SITE *site;
+ u_int32_t flags;
+ int ret, eid;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ if (conn->state == CONN_DEFUNCT)
+ return (0);
+ eid = conn->eid;
+ if ((ret = __repmgr_disable_connection(env, conn)) != 0)
+ return (ret);
+
+ /*
+ * When we have lost the connection to another site, take any/all
+ * appropriate recovery steps. But what does it mean to lose "the"
+ * connection, now that we actually have various different kinds of
+ * connection?
+ *
+ * 1. We're only talking about "rep" connections. Connections backing
+ * user channels aren't of concern here.
+ * 2. Subordinate connections are also not of concern here.
+ * 3. If we have two "main" connections with a given remote site (one
+ * incoming and the other outgoing), then if we lose one we still
+ * have the other. So, we still "have a connection" with the remote
+ * site.
+ *
+ * Finally, the appropriate recovery steps also depend on the current
+ * replication role (master/client) of both the local site and the
+ * remote site.
+ */
+ if (conn->type != REP_CONNECTION || !IS_KNOWN_REMOTE_SITE(eid))
+ goto out;
+
+ site = SITE_FROM_EID(eid);
+ /*
+ * When closing one of our main connections ("in" or "out"), if we still
+ * have the other one present, then we still consider ourselves to be
+ * connected, so there's nothing more to do. But if we have now become
+ * "not connected", we have some recovery steps to do. (Note that we
+ * don't care at all about subordinate connections, for the purposes of
+ * recovery steps.)
+ */
+ if (conn == site->ref.conn.in) {
+ site->ref.conn.in = NULL;
+ if (site->ref.conn.out != NULL) /* We're still connected. */
+ goto out;
+ } else if (conn == site->ref.conn.out) {
+ site->ref.conn.out = NULL;
+ if (site->ref.conn.in != NULL)
+ goto out;
+ } else /* Subordinate connection. */
+ goto out;
+
+ if ((ret = __repmgr_schedule_connection_attempt(env, eid, FALSE)) != 0)
+ goto out;
+
+ /*
+ * If the failed connection was the one between us and the
+ * master, assume that the master may have failed, and call for
+ * an election. But only do this for the connection to the main
+ * master process, not a subordinate one. And only do it if
+ * we're our site's main process, not a subordinate one. And
+ * skip it if the application has configured us not to do
+ * elections.
+ */
+ if (!IS_SUBORDINATE(db_rep) && eid == rep->master_id) {
+ /*
+ * Even if we're not doing elections, defer the event
+ * notification to later execution in the election
+ * thread. We don't want to fire an event in the select
+ * thread, and certainly not while holding the mutex.
+ */
+ flags = ELECT_F_EVENT_NOTIFY;
+ if (FLD_ISSET(db_rep->region->config, REP_C_ELECTIONS))
+ LF_SET(ELECT_F_IMMED | ELECT_F_FAST);
+ else
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "Master failure, but no elections"));
+
+ if ((ret = __repmgr_init_election(env, flags)) != 0)
+ goto out;
+ }
+
+ /*
+ * If we're the master site, and we lose a main connection to a
+ * client (whether we're the main replication process or a
+ * subordinate process), then the client is going to have
+ * trouble receiving live log records from us. So, set the
+ * temporary log archive block timer, to give the client a
+ * fighting chance to restart/recover/reconnect. (We don't care
+ * about the client's subordinate connections to us -- i.e.,
+ * connections with a subordinate process at the client site --
+ * because those sites can only be reading, not applying updates
+ * from us.)
+ */
+ if (rep->master_id == db_rep->self_eid) {
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "Repmgr: bust connection. Block archive"));
+ MASTER_UPDATE(env, (REGENV *)env->reginfo->primary);
+ }
+out:
+ return (ret);
+}
+
+/*
+ * Removes a connection from any further activity, making sure it ends up on
+ * the main connections list so that it will be cleaned up at the next
+ * opportunity in the select() thread.
+ *
+ * Various threads write onto TCP/IP sockets, and an I/O error could occur at
+ * any time. However, only the dedicated "select()" thread may close the socket
+ * file descriptor, because under POSIX we have to drop our mutex and then call
+ * select() as two distinct (non-atomic) operations.
+ *
+ * To simplify matters, there is a single place in the select thread where we
+ * close and clean up after any defunct connection. Even if the I/O error
+ * happens in the select thread we follow this convention.
+ *
+ * When an error occurs, we disable the connection (mark it defunct so that no
+ * one else will try to use it, and so that the select thread will find it and
+ * clean it up), and then usually take some additional recovery action: schedule
+ * a connection retry for later, and possibly call for an election if it was a
+ * connection to the master. (This happens in the function
+ * __repmgr_bust_connection.) But sometimes we don't want to do the recovery
+ * part; just the disabling part.
+ *
+ * PUBLIC: int __repmgr_disable_connection __P((ENV *, REPMGR_CONNECTION *));
+ */
+int
+__repmgr_disable_connection(env, conn)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+{
+ DB_REP *db_rep;
+ REPMGR_SITE *site;
+ REPMGR_RESPONSE *resp;
+ u_int32_t i;
+ int eid, ret, t_ret;
+
+ db_rep = env->rep_handle;
+ ret = 0;
+
+ conn->state = CONN_DEFUNCT;
+ if (conn->type == REP_CONNECTION) {
+ eid = conn->eid;
+ if (IS_VALID_EID(eid)) {
+ site = SITE_FROM_EID(eid);
+ if (conn != site->ref.conn.in &&
+ conn != site->ref.conn.out)
+ /* It's a subordinate connection. */
+ TAILQ_REMOVE(&site->sub_conns, conn, entries);
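+			/*
+			 * Move the conn onto the main connections list and
+			 * take a reference on behalf of that list; the
+			 * select() thread drops it again in
+			 * __repmgr_cleanup_defunct().
+			 */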
+ TAILQ_INSERT_TAIL(&db_rep->connections, conn, entries);
+ conn->ref_count++;
+ }
+ conn->eid = -1;
+ } else if (conn->type == APP_CONNECTION) {
+ for (i = 0; i < conn->aresp; i++) {
+ resp = &conn->responses[i];
+ if (F_ISSET(resp, RESP_IN_USE) &&
+ F_ISSET(resp, RESP_THREAD_WAITING)) {
+ F_SET(resp, RESP_COMPLETE);
+ resp->ret = DB_REP_UNAVAIL;
+ }
+ }
+ ret = __repmgr_wake_waiters(env, &conn->response_waiters);
+ }
+ if ((t_ret = __repmgr_signal(&conn->drained)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __repmgr_wake_main_thread(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_cleanup_defunct __P((ENV *, REPMGR_CONNECTION *));
+ *
+ * Caller should hold mutex, since we remove connection from main list.
+ */
+int
+__repmgr_cleanup_defunct(env, conn)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+{
+ DB_REP *db_rep;
+ int ret, t_ret;
+
+ db_rep = env->rep_handle;
+
+ ret = __repmgr_close_connection(env, conn);
+
+ TAILQ_REMOVE(&db_rep->connections, conn, entries);
+ if ((t_ret = __repmgr_decr_conn_ref(env, conn)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_close_connection __P((ENV *, REPMGR_CONNECTION *));
+ */
+int
+__repmgr_close_connection(env, conn)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+{
+ int ret;
+#ifdef DB_WIN32
+ int t_ret;
+#endif
+
+ ret = 0;
+ if (conn->fd != INVALID_SOCKET &&
+ closesocket(conn->fd) == SOCKET_ERROR) {
+ ret = net_errno;
+ __db_err(env, ret, DB_STR("3582", "closing socket"));
+ }
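+	/* Reset the fd unconditionally, so we never try to close it twice. */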
+ conn->fd = INVALID_SOCKET;
+#ifdef DB_WIN32
+ if (conn->event_object != WSA_INVALID_EVENT &&
+ !WSACloseEvent(conn->event_object)) {
+ t_ret = net_errno;
+ __db_err(env, t_ret, DB_STR("3583",
+ "releasing WSA event object"));
+ if (ret == 0)
+ ret = t_ret;
+ }
+ conn->event_object = WSA_INVALID_EVENT;
+#endif
+ return (ret);
+}
+
+/*
+ * Decrements a connection's ref count; destroys the connection when the ref
+ * count reaches zero.
+ *
+ * PUBLIC: int __repmgr_decr_conn_ref __P((ENV *, REPMGR_CONNECTION *));
+ */
+int
+__repmgr_decr_conn_ref(env, conn)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+{
+ DB_ASSERT(env, conn->ref_count > 0);
+ return (--conn->ref_count > 0 ? 0 :
+ __repmgr_destroy_conn(env, conn));
+}
+
+/*
+ * Destroys a conn struct, by freeing all memory and associated resources.
+ * (This is a destructor, so it must always run to completion, and of course the
+ * passed-in object no longer exists upon return.)
+ *
+ * PUBLIC: int __repmgr_destroy_conn __P((ENV *, REPMGR_CONNECTION *));
+ *
+ * Caller is responsible for holding mutex if necessary; we make no assumption
+ * here, since we operate only on the given connection, in isolation. (However,
+ * note that if this conn has messages on its outbound queue, those are shared
+ * objects, and we decrement the ref count. So in that case the mutex will need
+ * to be held.)
+ */
+int
+__repmgr_destroy_conn(env, conn)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+{
+ QUEUED_OUTPUT *out;
+ REPMGR_FLAT *msg;
+ REPMGR_RESPONSE *resp;
+ DBT *dbt;
+ int ret, t_ret;
+
+ ret = 0;
+
+ DB_ASSERT(env, conn->ref_count == 0);
+ /*
+ * Deallocate any input and output buffers we may have.
+ */
+ if (conn->reading_phase == DATA_PHASE) {
+ switch (conn->msg_type) {
+ case REPMGR_OWN_MSG:
+ if (conn->input.rep_message == NULL)
+ break;
+ /* FALLTHROUGH */
+ case REPMGR_APP_MESSAGE:
+ case REPMGR_HEARTBEAT:
+ case REPMGR_REP_MESSAGE:
+ __os_free(env, conn->input.rep_message);
+ break;
+
+ case REPMGR_APP_RESPONSE:
+ /*
+ * DATA_PHASE of an APP_RESPONSE is another way of
+ * saying there must be a cur_resp, and it must be
+ * READING.
+ */
+ DB_ASSERT(env, conn->cur_resp < conn->aresp &&
+ conn->responses != NULL);
+ resp = &conn->responses[conn->cur_resp];
+ DB_ASSERT(env, F_ISSET(resp, RESP_READING));
+ if (F_ISSET(resp, RESP_DUMMY_BUF))
+ __os_free(env, resp->dbt.data);
+ break;
+
+ case REPMGR_PERMLSN:
+ case REPMGR_HANDSHAKE:
+ dbt = &conn->input.repmgr_msg.cntrl;
+ if (dbt->size > 0)
+ __os_free(env, dbt->data);
+ dbt = &conn->input.repmgr_msg.rec;
+ if (dbt->size > 0)
+ __os_free(env, dbt->data);
+ break;
+
+ case REPMGR_RESP_ERROR:
+ /*
+ * This type doesn't use a DATA_PHASE, so this should be
+ * impossible.
+ */
+ default:
+ ret = __db_unknown_path(env, "destroy_conn");
+ }
+ }
+
+ if (conn->type == APP_CONNECTION && conn->responses != NULL)
+ __os_free(env, conn->responses);
+
+ if ((t_ret = __repmgr_destroy_waiters(env,
+ &conn->response_waiters)) != 0 && ret == 0)
+ ret = t_ret;
+
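+	/*
+	 * Release this conn's references to any messages still queued for
+	 * output, freeing each message when its last reference is dropped.
+	 */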
+ while (!STAILQ_EMPTY(&conn->outbound_queue)) {
+ out = STAILQ_FIRST(&conn->outbound_queue);
+ STAILQ_REMOVE_HEAD(&conn->outbound_queue, entries);
+ msg = out->msg;
+ if (--msg->ref_count <= 0)
+ __os_free(env, msg);
+ __os_free(env, out);
+ }
+ if ((t_ret = __repmgr_free_cond(&conn->drained)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+
+ __os_free(env, conn);
+ return (ret);
+}
+
+static int
+enqueue_msg(env, conn, msg, offset)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ struct sending_msg *msg;
+ size_t offset;
+{
+ QUEUED_OUTPUT *q_element;
+ int ret;
+
+ if (msg->fmsg == NULL && ((ret = flatten(env, msg)) != 0))
+ return (ret);
+ if ((ret = __os_malloc(env, sizeof(QUEUED_OUTPUT), &q_element)) != 0)
+ return (ret);
+ q_element->msg = msg->fmsg;
+ msg->fmsg->ref_count++; /* encapsulation would be sweeter */
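+	/*
+	 * Note: the offset is evidently the number of bytes of this message
+	 * already written to the socket, letting a later write resume where a
+	 * partial send left off.
+	 */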
+ q_element->offset = offset;
+
+ /* Put it on the connection's outbound queue. */
+ STAILQ_INSERT_TAIL(&conn->outbound_queue, q_element, entries);
+ conn->out_queue_length++;
+ return (0);
+}
+
+/*
+ * Either "control" or "rec" (or both) may be NULL, in which case we treat it
+ * like a zero-length DBT.
+ */
+static void
+setup_sending_msg(env, msg, hdr_buf, type, control, rec)
+ ENV *env;
+ struct sending_msg *msg;
+ u_int8_t *hdr_buf;
+ u_int type;
+ const DBT *control, *rec;
+{
+ __repmgr_msg_hdr_args msg_hdr;
+
+ /*
+ * Since we know that the msg hdr is a fixed size, we can add its buffer
+ * to the iovecs before actually marshaling the content. But the
+ * add_buffer and add_dbt calls have to be in the right order.
+ */
+ __repmgr_iovec_init(msg->iovecs);
+ __repmgr_add_buffer(msg->iovecs, hdr_buf, __REPMGR_MSG_HDR_SIZE);
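+	/*
+	 * On the wire, the fixed-size header is followed by the optional
+	 * control and rec buffers; their sizes travel in the header, so the
+	 * receiver knows how much of each to expect.
+	 */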
+
+ msg_hdr.type = type;
+
+ if ((REP_MSG_CONTROL_SIZE(msg_hdr) =
+ (control == NULL ? 0 : control->size)) > 0)
+ __repmgr_add_dbt(msg->iovecs, control);
+
+ if ((REP_MSG_REC_SIZE(msg_hdr) = (rec == NULL ? 0 : rec->size)) > 0)
+ __repmgr_add_dbt(msg->iovecs, rec);
+
+ __repmgr_msg_hdr_marshal(env, &msg_hdr, hdr_buf);
+ msg->fmsg = NULL;
+}
+
+/*
+ * Converts a message stored as iovec pointers to various pieces into
+ * flattened form by copying all the pieces into one buffer, then points the
+ * iovec at the new, simplified form.
+ */
+static int
+flatten(env, msg)
+ ENV *env;
+ struct sending_msg *msg;
+{
+ u_int8_t *p;
+ size_t msg_size;
+ int i, ret;
+
+ DB_ASSERT(env, msg->fmsg == NULL);
+
+ msg_size = msg->iovecs->total_bytes;
+ if ((ret = __os_malloc(env, sizeof(*msg->fmsg) + msg_size,
+ &msg->fmsg)) != 0)
+ return (ret);
+ msg->fmsg->length = msg_size;
+ msg->fmsg->ref_count = 0;
+ p = &msg->fmsg->data[0];
+
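+	/* Copy each iovec segment, in order, into the single flat buffer. */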
+ for (i = 0; i < msg->iovecs->count; i++) {
+ memcpy(p, msg->iovecs->vectors[i].iov_base,
+ msg->iovecs->vectors[i].iov_len);
+ p = &p[msg->iovecs->vectors[i].iov_len];
+ }
+ __repmgr_iovec_init(msg->iovecs);
+ __repmgr_add_buffer(msg->iovecs, &msg->fmsg->data[0], msg_size);
+ return (0);
+}
+
+/*
+ * Scan the list of remote sites, returning the first one that is a peer,
+ * is not the current master, and is available.
+ */
+static REPMGR_SITE *
+__repmgr_find_available_peer(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ REPMGR_CONNECTION *conn;
+ REPMGR_SITE *site;
+ u_int i;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ FOR_EACH_REMOTE_SITE_INDEX(i) {
+ site = &db_rep->sites[i];
+ if (FLD_ISSET(site->config, DB_REPMGR_PEER) &&
+ EID_FROM_SITE(site) != rep->master_id &&
+ site->state == SITE_CONNECTED &&
+ (((conn = site->ref.conn.in) != NULL &&
+ conn->state == CONN_READY) ||
+ ((conn = site->ref.conn.out) != NULL &&
+ conn->state == CONN_READY)))
+ return (site);
+ }
+ return (NULL);
+}
+
+/*
+ * Copy host/port values into the given netaddr struct. Allocates memory for
+ * the copy of the host name, which becomes the responsibility of the caller.
+ *
+ * PUBLIC: int __repmgr_pack_netaddr __P((ENV *, const char *,
+ * PUBLIC: u_int, repmgr_netaddr_t *));
+ */
+int
+__repmgr_pack_netaddr(env, host, port, addr)
+ ENV *env;
+ const char *host;
+ u_int port;
+ repmgr_netaddr_t *addr;
+{
+ int ret;
+
+ DB_ASSERT(env, host != NULL);
+
+ if ((ret = __os_strdup(env, host, &addr->host)) != 0)
+ return (ret);
+ addr->port = (u_int16_t)port;
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_getaddr __P((ENV *,
+ * PUBLIC: const char *, u_int, int, ADDRINFO **));
+ */
+int
+__repmgr_getaddr(env, host, port, flags, result)
+ ENV *env;
+ const char *host;
+ u_int port;
+ int flags; /* Matches struct addrinfo declaration. */
+ ADDRINFO **result;
+{
+ ADDRINFO *answer, hints;
+ char buffer[10]; /* 2**16 fits in 5 digits. */
+
+ /*
+ * Ports are really 16-bit unsigned values, but it's too painful to
+ * push that type through the API.
+ */
+
+ memset(&hints, 0, sizeof(hints));
+ hints.ai_family = AF_UNSPEC;
+ hints.ai_socktype = SOCK_STREAM;
+ hints.ai_flags = flags;
+ (void)snprintf(buffer, sizeof(buffer), "%u", port);
+
+ /*
+ * Although it's generally bad to discard error information, the return
+ * code from __os_getaddrinfo is undependable. Our callers at least
+ * would like to be able to distinguish errors in getaddrinfo (which we
+ * want to consider re-tryable) from other kinds of failure (e.g., EINVAL).
+ */
+ if (__os_getaddrinfo(env, host, port, buffer, &hints, &answer) != 0)
+ return (DB_REP_UNAVAIL);
+ *result = answer;
+
+ return (0);
+}
+
+/*
+ * Initialize a socket for listening. Sets a file descriptor for the socket,
+ * ready for an accept() call in a thread that we're happy to let block.
+ *
+ * PUBLIC: int __repmgr_listen __P((ENV *));
+ */
+int
+__repmgr_listen(env)
+ ENV *env;
+{
+ ADDRINFO *ai;
+ DB_REP *db_rep;
+ repmgr_netaddr_t *addrp;
+ char *why;
+ int sockopt, ret;
+ socket_t s;
+
+ db_rep = env->rep_handle;
+
+ /* Use OOB value as sentinel to show no socket open. */
+ s = INVALID_SOCKET;
+
+ addrp = &SITE_FROM_EID(db_rep->self_eid)->net_addr;
+ if ((ret = __repmgr_getaddr(env,
+ addrp->host, addrp->port, AI_PASSIVE, &ai)) != 0)
+ return (ret);
+
+ /*
+ * Given that the assert below holds, we execute the loop at least once,
+ * which means 'why' will have been set by the time it's needed. But of
+ * course lint doesn't know about DB_ASSERT.
+ */
+ COMPQUIET(why, "");
+ DB_ASSERT(env, ai != NULL);
+ for (; ai != NULL; ai = ai->ai_next) {
+
+ if ((s = socket(ai->ai_family,
+ ai->ai_socktype, ai->ai_protocol)) == INVALID_SOCKET) {
+ why = DB_STR("3584", "can't create listen socket");
+ continue;
+ }
+
+ /*
+ * When testing, it's common to kill and restart regularly. On
+ * some systems, this causes bind to fail with "address in use"
+ * errors unless this option is set.
+ */
+ sockopt = 1;
+ if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (sockopt_t)&sockopt,
+ sizeof(sockopt)) != 0) {
+ why = DB_STR("3585",
+ "can't set REUSEADDR socket option");
+ break;
+ }
+
+ if (bind(s, ai->ai_addr, (socklen_t)ai->ai_addrlen) != 0) {
+ why = DB_STR("3586",
+ "can't bind socket to listening address");
+ ret = net_errno;
+ (void)closesocket(s);
+ s = INVALID_SOCKET;
+ continue;
+ }
+
+ if (listen(s, 5) != 0) {
+ why = DB_STR("3587", "listen()");
+ break;
+ }
+
+ if ((ret = __repmgr_set_nonblocking(s)) != 0) {
+ __db_err(env, ret, DB_STR("3588",
+ "can't unblock listen socket"));
+ goto clean;
+ }
+
+ db_rep->listen_fd = s;
+ goto out;
+ }
+
+ if (ret == 0)
+ ret = net_errno;
+ __db_err(env, ret, "%s", why);
+clean: if (s != INVALID_SOCKET)
+ (void)closesocket(s);
+out:
+ __os_freeaddrinfo(env, ai);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_net_close __P((ENV *));
+ */
+int
+__repmgr_net_close(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ REPMGR_SITE *site;
+ u_int eid;
+ int ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ if ((ret = __repmgr_each_connection(env, final_cleanup, NULL,
+ FALSE)) == 0) {
+ FOR_EACH_REMOTE_SITE_INDEX(eid) {
+ site = SITE_FROM_EID(eid);
+ site->ref.conn.in = NULL;
+ site->ref.conn.out = NULL;
+ }
+ }
+
+ if (db_rep->listen_fd != INVALID_SOCKET) {
+ if (closesocket(db_rep->listen_fd) == SOCKET_ERROR && ret == 0)
+ ret = net_errno;
+ db_rep->listen_fd = INVALID_SOCKET;
+ rep->listener = 0;
+ }
+ return (ret);
+}
+
+/* Called only from env->close(), so we know we're single threaded. */
+static int
+final_cleanup(env, conn, unused)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ void *unused;
+{
+ DB_REP *db_rep;
+ REPMGR_SITE *site;
+ int ret, t_ret;
+
+ COMPQUIET(unused, NULL);
+ db_rep = env->rep_handle;
+
+ ret = __repmgr_close_connection(env, conn);
+ /* Remove the connection from whatever list it's on, if any. */
+ if (conn->type == REP_CONNECTION && IS_VALID_EID(conn->eid)) {
+ site = SITE_FROM_EID(conn->eid);
+
+ if (site->state == SITE_CONNECTED &&
+ (conn == site->ref.conn.in || conn == site->ref.conn.out)) {
+ /* Not on any list, so no need to do anything. */
+ } else
+ TAILQ_REMOVE(&site->sub_conns, conn, entries);
+ t_ret = __repmgr_destroy_conn(env, conn);
+
+ } else {
+ TAILQ_REMOVE(&db_rep->connections, conn, entries);
+ t_ret = __repmgr_decr_conn_ref(env, conn);
+ }
+ if (t_ret != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * PUBLIC: void __repmgr_net_destroy __P((ENV *, DB_REP *));
+ */
+void
+__repmgr_net_destroy(env, db_rep)
+ ENV *env;
+ DB_REP *db_rep;
+{
+ REPMGR_RETRY *retry;
+
+ while (!TAILQ_EMPTY(&db_rep->retries)) {
+ retry = TAILQ_FIRST(&db_rep->retries);
+ TAILQ_REMOVE(&db_rep->retries, retry, entries);
+ __os_free(env, retry);
+ }
+
+ DB_ASSERT(env, TAILQ_EMPTY(&db_rep->connections));
+}
+
+#ifdef CONFIG_TEST
+/*
+ * Substitute a fake target port instead of the port actually configured, for
+ * certain types of testing, if desired.
+ *
+ * When a DB_TEST_FAKE_PORT environment variable is present, it names a TCP/IP
+ * port on which a "port arbiter" service may be running. If it is indeed
+ * running, we should send it a request to ask it what "fake" port to use in
+ * place of the given "real" port. (The "real" port is the port normally
+ * configured, and present in the membership database.) The arbiter is not
+ * always running for all tests, so if it's not present it simply means we
+ * should not substitute a fake port. Also, even if it is running, in some
+ * tests we don't want to substitute a fake port: in that case, the arbiter's
+ * response could name the same port as the "real" port we sent it.
+ *
+ * !!! This is only used for testing.
+ */
+static u_int
+fake_port(env, port)
+ ENV *env;
+ u_int port;
+{
+#define MIN_PORT 1
+#define MAX_PORT 65535
+ ADDRINFO *ai0, *ai;
+ db_iovec_t iovec;
+ char *arbiter, buf[100], *end, *p;
+ socket_t s;
+ long result;
+ size_t count;
+ int ret;
+ u_int arbiter_port;
+
+ if ((arbiter = getenv("DB_TEST_FAKE_PORT")) == NULL)
+ return (port);
+ if (__db_getlong(env->dbenv, "repmgr_net.c:fake_port",
+ arbiter, MIN_PORT, MAX_PORT, &result) != 0)
+ return (port);
+ arbiter_port = (u_int)result;
+
+ /*
+ * Send a message of the form "{config,Port}" onto a connection to
+ * arbiter_port.
+ */
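+	/* For example, for port 6000 the request is "{config,6000}\r\n". */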
+ if ((ret = __repmgr_getaddr(env,
+ "localhost", arbiter_port, 0, &ai0)) != 0) {
+ __db_err(env, ret, "fake_port:getaddr");
+ return (port);
+ }
+ s = INVALID_SOCKET;
+ for (ai = ai0; ai != NULL; ai = ai->ai_next) {
+ if ((s = socket(ai->ai_family,
+ ai->ai_socktype, ai->ai_protocol)) == INVALID_SOCKET) {
+ ret = net_errno;
+ s = INVALID_SOCKET;
+ __db_err(env, ret, "fake_port:socket");
+ goto err;
+ }
+ /*
+ * Note that port substitution is used in only a small number of
+ * tests. When there is no "port arbiter" running, it's not an
+ * error; it just means we should use the normal configured port
+ * as is.
+ */
+ if (connect(s, ai->ai_addr, (socklen_t)ai->ai_addrlen) == 0) {
+ ret = 0; /* Connected; use this socket. */
+ break;
+ }
+ ret = net_errno;
+ (void)closesocket(s);
+ s = INVALID_SOCKET;
+ }
+ if (ret != 0)
+ goto err;
+ (void)snprintf(buf, sizeof(buf), "{config,%u}\r\n", port);
+ iovec.iov_base = buf;
+ iovec.iov_len = (u_long)strlen(buf);
+ while ((ret = __repmgr_writev(s, &iovec, 1, &count)) == 0) {
+ iovec.iov_base = (u_int8_t *)iovec.iov_base + count;
+ if ((iovec.iov_len -= (u_long)count) == 0)
+ break;
+ }
+ if (ret != 0) {
+ __db_err(env, ret, "fake_port:writev");
+ goto err;
+ }
+
+ /* The response should be a line telling us what port to use. */
+ iovec.iov_base = buf;
+ iovec.iov_len = sizeof(buf);
+ p = buf;
+ while ((ret = __repmgr_readv(s, &iovec, 1, &count)) == 0) {
+ if (count == 0) {
+ __db_errx(env, "fake_port: premature EOF");
+ goto err;
+ }
+ /* Keep reading until we get a line end. */
+ for (p = iovec.iov_base, end = &p[count]; p < end; p++)
+ if (*p == '\r' || *p == '\n')
+ break;
+ if (p < end) {
+ *p = '\0';
+ break;
+ }
+ iovec.iov_base = (u_int8_t *)iovec.iov_base + count;
+ iovec.iov_len -= (u_long)count;
+ DB_ASSERT(env, iovec.iov_len > 0);
+ }
+ if (ret != 0)
+ goto err;
+
+ if (__db_getlong(env->dbenv, "repmgr_net.c:fake_port",
+ buf, MIN_PORT, MAX_PORT, &result) == 0)
+ port = (u_int)result;
+
+err:
+ /*
+ * Note that we always return some port value, even if an error happens.
+ * Since this is just test code, an error that prevented proper fake port
+ * substitution should simply show up as a test failure.
+ */
+ if (s != INVALID_SOCKET)
+ (void)closesocket(s);
+ __os_freeaddrinfo(env, ai0);
+ return (port);
+}
+#endif
diff --git a/src/repmgr/repmgr_posix.c b/src/repmgr/repmgr_posix.c
new file mode 100644
index 00000000..0687681a
--- /dev/null
+++ b/src/repmgr/repmgr_posix.c
@@ -0,0 +1,804 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * Invalid file descriptor value that can be used as an out-of-band sentinel
+ * to mark our signalling pipe as unopened.
+ */
+#define NO_SUCH_FILE_DESC (-1)
+
+/* Aggregated control info needed for preparing for select() call. */
+struct io_info {
+ fd_set *reads, *writes;
+ int maxfd;
+};
+
+static int __repmgr_conn_work __P((ENV *, REPMGR_CONNECTION *, void *));
+static int prepare_io __P((ENV *, REPMGR_CONNECTION *, void *));
+
+/*
+ * Starts the thread described in the argument, and stores the resulting thread
+ * ID therein.
+ *
+ * PUBLIC: int __repmgr_thread_start __P((ENV *, REPMGR_RUNNABLE *));
+ */
+int
+__repmgr_thread_start(env, runnable)
+ ENV *env;
+ REPMGR_RUNNABLE *runnable;
+{
+ pthread_attr_t *attrp;
+#if defined(_POSIX_THREAD_ATTR_STACKSIZE) && defined(DB_STACKSIZE)
+ pthread_attr_t attributes;
+ size_t size;
+ int ret;
+
+ attrp = &attributes;
+ if ((ret = pthread_attr_init(&attributes)) != 0) {
+ __db_err(env, ret, DB_STR("3630",
+ "pthread_attr_init in repmgr_thread_start"));
+ return (ret);
+ }
+
+ size = DB_STACKSIZE;
+
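+	/*
+	 * Use the configured stack size, but never less than the system's
+	 * minimum thread stack size, where one is defined.
+	 */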
+#ifdef PTHREAD_STACK_MIN
+ if (size < PTHREAD_STACK_MIN)
+ size = PTHREAD_STACK_MIN;
+#endif
+ if ((ret = pthread_attr_setstacksize(&attributes, size)) != 0) {
+ __db_err(env, ret, DB_STR("3631",
+ "pthread_attr_setstacksize in repmgr_thread_start"));
+ return (ret);
+ }
+#else
+ attrp = NULL;
+#endif
+
+ runnable->finished = FALSE;
+ runnable->quit_requested = FALSE;
+ runnable->env = env;
+
+ return (pthread_create(&runnable->thread_id, attrp,
+ runnable->run, runnable));
+}
+
+/*
+ * PUBLIC: int __repmgr_thread_join __P((REPMGR_RUNNABLE *));
+ */
+int
+__repmgr_thread_join(thread)
+ REPMGR_RUNNABLE *thread;
+{
+ return (pthread_join(thread->thread_id, NULL));
+}
+
+/*
+ * PUBLIC: int __repmgr_set_nonblock_conn __P((REPMGR_CONNECTION *));
+ */
+int
+__repmgr_set_nonblock_conn(conn)
+ REPMGR_CONNECTION *conn;
+{
+ return (__repmgr_set_nonblocking(conn->fd));
+}
+
+/*
+ * PUBLIC: int __repmgr_set_nonblocking __P((socket_t));
+ */
+int
+__repmgr_set_nonblocking(fd)
+ socket_t fd;
+{
+ int flags;
+
+ if ((flags = fcntl(fd, F_GETFL, 0)) < 0)
+ return (errno);
+ if (fcntl(fd, F_SETFL, flags | O_NONBLOCK) < 0)
+ return (errno);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_wake_waiters __P((ENV *, waiter_t *));
+ *
+ * Wake any "waiter" threads (either sending threads waiting for acks, or
+ * channel users waiting for response to request).
+ *
+ * !!!
+ * Caller must hold the db_rep->mutex, if this thread synchronization is to work
+ * properly.
+ */
+int
+__repmgr_wake_waiters(env, waiter)
+ ENV *env;
+ waiter_t *waiter;
+{
+ COMPQUIET(env, NULL);
+ return (pthread_cond_broadcast(waiter));
+}
+
+/*
+ * Waits a limited time for a condition to become true. (If the limit is 0 we
+ * wait forever.) All calls share just the one db_rep->mutex, but use whatever
+ * waiter_t the caller passes us.
+ *
+ * PUBLIC: int __repmgr_await_cond __P((ENV *,
+ * PUBLIC: PREDICATE, void *, db_timeout_t, waiter_t *));
+ */
+int
+__repmgr_await_cond(env, pred, ctx, timeout, wait_condition)
+ ENV *env;
+ PREDICATE pred;
+ void *ctx;
+ db_timeout_t timeout;
+ waiter_t *wait_condition;
+{
+ DB_REP *db_rep;
+ struct timespec deadline;
+ int ret, timed;
+
+ db_rep = env->rep_handle;
+ if ((timed = (timeout > 0)))
+ __repmgr_compute_wait_deadline(env, &deadline, timeout);
+ else
+ COMPQUIET(deadline.tv_sec, 0);
+
+ while (!(*pred)(env, ctx)) {
+ if (timed)
+ ret = pthread_cond_timedwait(wait_condition,
+ db_rep->mutex, &deadline);
+ else
+ ret = pthread_cond_wait(wait_condition, db_rep->mutex);
+ if (db_rep->repmgr_status == stopped)
+ return (DB_REP_UNAVAIL);
+ if (ret == ETIMEDOUT)
+ return (DB_TIMEOUT);
+ if (ret != 0)
+ return (ret);
+ }
+ return (0);
+}
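+
+/*
+ * A caller packages its wake-up test as a PREDICATE function and invokes us
+ * while holding db_rep->mutex.  A hypothetical sketch (not code from this
+ * file):
+ *
+ *	static int
+ *	queue_nonempty(env, ctx)
+ *		ENV *env;
+ *		void *ctx;
+ *	{
+ *		return (!STAILQ_EMPTY(&env->rep_handle->input_queue.header));
+ *	}
+ *
+ *	ret = __repmgr_await_cond(env, queue_nonempty, NULL,
+ *	    timeout, &db_rep->ack_waiters);
+ */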
+
+/*
+ * Waits for an in-progress membership DB operation (if any) to complete.
+ *
+ * PUBLIC: int __repmgr_await_gmdbop __P((ENV *));
+ *
+ * Caller holds mutex; we drop it while waiting.
+ */
+int
+__repmgr_await_gmdbop(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ int ret;
+
+ db_rep = env->rep_handle;
+ while (db_rep->gmdb_busy)
+ if ((ret = pthread_cond_wait(&db_rep->gmdb_idle,
+ db_rep->mutex)) != 0)
+ return (ret);
+ return (0);
+}
+
+/*
+ * __repmgr_compute_wait_deadline --
+ * Computes a deadline time a certain distance into the future.
+ *
+ * PUBLIC: void __repmgr_compute_wait_deadline __P((ENV*,
+ * PUBLIC: struct timespec *, db_timeout_t));
+ */
+void
+__repmgr_compute_wait_deadline(env, result, wait)
+ ENV *env;
+ struct timespec *result;
+ db_timeout_t wait;
+{
+ /*
+ * The result is suitable for the pthread_cond_timedwait call. (That
+ * call uses nano-second resolution; elsewhere we use microseconds.)
+ *
+ * Start with "now"; then add the "wait" offset.
+ *
+ * A db_timespec is the same as a "struct timespec" so we can pass
+ * result directly to the underlying Berkeley DB OS routine.
+ *
+ * !!!
+ * We use the system clock for the pthread_cond_timedwait call, but
+ * that's not optimal on systems with monotonic timers. Instead,
+ * we should call pthread_condattr_setclock on systems where it and
+ * monotonic timers are available, and then configure both this call
+ * and the subsequent pthread_cond_timedwait call to use a monotonic
+ * timer.
+ */
+ __os_gettime(env, (db_timespec *)result, 0);
+ TIMESPEC_ADD_DB_TIMEOUT(result, wait);
+}
+
+/*
+ * PUBLIC: int __repmgr_await_drain __P((ENV *,
+ * PUBLIC: REPMGR_CONNECTION *, db_timeout_t));
+ *
+ * Waits for space to become available on the connection's output queue.
+ * Various ways we can exit:
+ *
+ * 1. queue becomes non-full
+ * 2. exceed time limit
+ * 3. connection becomes defunct (due to error in another thread)
+ * 4. repmgr is shutting down
+ * 5. any unexpected system resource failure
+ *
+ * In cases #3 and #5 we return an error code. Caller is responsible for
+ * distinguishing the remaining cases if desired, though we do help with #2 by
+ * showing the connection as congested.
+ *
+ * !!!
+ * Caller must hold repmgr->mutex.
+ */
+int
+__repmgr_await_drain(env, conn, timeout)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ db_timeout_t timeout;
+{
+ DB_REP *db_rep;
+ struct timespec deadline;
+ int ret;
+
+ db_rep = env->rep_handle;
+
+ __repmgr_compute_wait_deadline(env, &deadline, timeout);
+
+ ret = 0;
+ while (conn->out_queue_length >= OUT_QUEUE_LIMIT) {
+ ret = pthread_cond_timedwait(&conn->drained,
+ db_rep->mutex, &deadline);
+ switch (ret) {
+ case 0:
+ if (db_rep->repmgr_status == stopped)
+ goto out; /* #4. */
+ /*
+ * Another thread could have stumbled into an error on
+ * the socket while we were waiting.
+ */
+ if (conn->state == CONN_DEFUNCT) {
+ ret = DB_REP_UNAVAIL; /* #3. */
+ goto out;
+ }
+ break;
+ case ETIMEDOUT:
+ conn->state = CONN_CONGESTED;
+ ret = 0;
+ goto out; /* #2. */
+ default:
+ goto out; /* #5. */
+ }
+ }
+ /* #1. */
+
+out:
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_alloc_cond __P((cond_var_t *));
+ *
+ * Initialize a condition variable (in allocated space).
+ */
+int
+__repmgr_alloc_cond(c)
+ cond_var_t *c;
+{
+ return (pthread_cond_init(c, NULL));
+}
+
+/*
+ * PUBLIC: int __repmgr_free_cond __P((cond_var_t *));
+ *
+ * Clean up a previously initialized condition variable.
+ */
+int
+__repmgr_free_cond(c)
+ cond_var_t *c;
+{
+ return (pthread_cond_destroy(c));
+}
+
+/*
+ * PUBLIC: void __repmgr_env_create_pf __P((DB_REP *));
+ */
+void
+__repmgr_env_create_pf(db_rep)
+ DB_REP *db_rep;
+{
+ db_rep->read_pipe = db_rep->write_pipe = NO_SUCH_FILE_DESC;
+}
+
+/*
+ * "Platform"-specific mutex creation function.
+ *
+ * PUBLIC: int __repmgr_create_mutex_pf __P((mgr_mutex_t *));
+ */
+int
+__repmgr_create_mutex_pf(mutex)
+ mgr_mutex_t *mutex;
+{
+ return (pthread_mutex_init(mutex, NULL));
+}
+
+/*
+ * PUBLIC: int __repmgr_destroy_mutex_pf __P((mgr_mutex_t *));
+ */
+int
+__repmgr_destroy_mutex_pf(mutex)
+ mgr_mutex_t *mutex;
+{
+ return (pthread_mutex_destroy(mutex));
+}
+
+/*
+ * PUBLIC: int __repmgr_init __P((ENV *));
+ */
+int
+__repmgr_init(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ struct sigaction sigact;
+ int ack_inited, elect_inited, file_desc[2], gmdb_inited, queue_inited;
+ int ret;
+
+ db_rep = env->rep_handle;
+
+ /*
+ * Make sure we're not ignoring SIGPIPE, because otherwise we'd be killed
+ * just for trying to write onto a socket that had been reset. Note
+ * that we don't undo this in case of a later error, since we document
+ * that we leave the signal handling state like this, even after env
+ * close.
+ */
+ if (sigaction(SIGPIPE, NULL, &sigact) == -1) {
+ ret = errno;
+ __db_err(env, ret, DB_STR("3632",
+ "can't access signal handler"));
+ return (ret);
+ }
+ if (sigact.sa_handler == SIG_DFL) {
+ sigact.sa_handler = SIG_IGN;
+ sigact.sa_flags = 0;
+ if (sigaction(SIGPIPE, &sigact, NULL) == -1) {
+ ret = errno;
+ __db_err(env, ret, DB_STR("3633",
+ "can't access signal handler"));
+ return (ret);
+ }
+ }
+
+ ack_inited = elect_inited = gmdb_inited = queue_inited = FALSE;
+ if ((ret = __repmgr_init_waiters(env, &db_rep->ack_waiters)) != 0)
+ goto err;
+ ack_inited = TRUE;
+
+ if ((ret = pthread_cond_init(&db_rep->check_election, NULL)) != 0)
+ goto err;
+ elect_inited = TRUE;
+
+ if ((ret = pthread_cond_init(&db_rep->gmdb_idle, NULL)) != 0)
+ goto err;
+ gmdb_inited = TRUE;
+
+ if ((ret = pthread_cond_init(&db_rep->msg_avail, NULL)) != 0)
+ goto err;
+ queue_inited = TRUE;
+
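+	/*
+	 * The pipe is the select() thread's wake-up mechanism: writing a byte
+	 * to write_pipe (see __repmgr_wake_main_thread) makes the select()
+	 * call return so the thread can notice new work.
+	 */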
+ if ((ret = pipe(file_desc)) == -1) {
+ ret = errno;
+ goto err;
+ }
+
+ db_rep->read_pipe = file_desc[0];
+ db_rep->write_pipe = file_desc[1];
+ return (0);
+err:
+ if (queue_inited)
+ (void)pthread_cond_destroy(&db_rep->msg_avail);
+ if (gmdb_inited)
+ (void)pthread_cond_destroy(&db_rep->gmdb_idle);
+ if (elect_inited)
+ (void)pthread_cond_destroy(&db_rep->check_election);
+ if (ack_inited)
+ (void)__repmgr_destroy_waiters(env, &db_rep->ack_waiters);
+ db_rep->read_pipe = db_rep->write_pipe = NO_SUCH_FILE_DESC;
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_deinit __P((ENV *));
+ */
+int
+__repmgr_deinit(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ int ret, t_ret;
+
+ db_rep = env->rep_handle;
+
+ if (!(REPMGR_INITED(db_rep)))
+ return (0);
+
+ ret = pthread_cond_destroy(&db_rep->msg_avail);
+
+ if ((t_ret = pthread_cond_destroy(&db_rep->gmdb_idle)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+
+ if ((t_ret = pthread_cond_destroy(&db_rep->check_election)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+
+ if ((t_ret = __repmgr_destroy_waiters(env,
+ &db_rep->ack_waiters)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (close(db_rep->read_pipe) == -1 && ret == 0)
+ ret = errno;
+ if (close(db_rep->write_pipe) == -1 && ret == 0)
+ ret = errno;
+
+ db_rep->read_pipe = db_rep->write_pipe = NO_SUCH_FILE_DESC;
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_init_waiters __P((ENV *, waiter_t *));
+ */
+int
+__repmgr_init_waiters(env, waiters)
+ ENV *env;
+ waiter_t *waiters;
+{
+ COMPQUIET(env, NULL);
+ return (pthread_cond_init(waiters, NULL));
+}
+
+/*
+ * PUBLIC: int __repmgr_destroy_waiters __P((ENV *, waiter_t *));
+ */
+int
+__repmgr_destroy_waiters(env, waiters)
+ ENV *env;
+ waiter_t *waiters;
+{
+ COMPQUIET(env, NULL);
+ return (pthread_cond_destroy(waiters));
+}
+
+/*
+ * PUBLIC: int __repmgr_lock_mutex __P((mgr_mutex_t *));
+ */
+int
+__repmgr_lock_mutex(mutex)
+ mgr_mutex_t *mutex;
+{
+ return (pthread_mutex_lock(mutex));
+}
+
+/*
+ * PUBLIC: int __repmgr_unlock_mutex __P((mgr_mutex_t *));
+ */
+int
+__repmgr_unlock_mutex(mutex)
+ mgr_mutex_t *mutex;
+{
+ return (pthread_mutex_unlock(mutex));
+}
+
+/*
+ * Signals a condition variable.
+ *
+ * !!!
+ * Caller must hold mutex.
+ *
+ * PUBLIC: int __repmgr_signal __P((cond_var_t *));
+ */
+int
+__repmgr_signal(v)
+ cond_var_t *v;
+{
+ return (pthread_cond_broadcast(v));
+}
+
+/*
+ * Wake repmgr message processing threads, expressly for the purpose of shutting
+ * some subset of them down.
+ *
+ * !!!
+ * Caller must hold mutex.
+ *
+ * PUBLIC: int __repmgr_wake_msngers __P((ENV*, u_int));
+ */
+int
+__repmgr_wake_msngers(env, n)
+ ENV *env;
+ u_int n;
+{
+ DB_REP *db_rep;
+
+ COMPQUIET(n, 0);
+
+ db_rep = env->rep_handle;
+ return (__repmgr_signal(&db_rep->msg_avail));
+}
+
+/*
+ * PUBLIC: int __repmgr_wake_main_thread __P((ENV*));
+ *
+ * Can be called either with or without the mutex being held.
+ */
+int
+__repmgr_wake_main_thread(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ u_int8_t any_value;
+
+ COMPQUIET(any_value, 0);
+ db_rep = env->rep_handle;
+
+ /*
+ * It doesn't matter what byte value we write. Just the appearance of a
+ * byte in the stream is enough to wake up the select() thread reading
+ * the pipe.
+ */
+ if (write(db_rep->write_pipe, VOID_STAR_CAST &any_value, 1) == -1)
+ return (errno);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_writev __P((socket_t, db_iovec_t *, int, size_t *));
+ */
+int
+__repmgr_writev(fd, iovec, buf_count, byte_count_p)
+ socket_t fd;
+ db_iovec_t *iovec;
+ int buf_count;
+ size_t *byte_count_p;
+{
+ int result;
+ ssize_t nw;
+
+ if ((nw = writev(fd, iovec, buf_count)) == -1) {
+ /* Why? See note at __repmgr_readv(). */
+ result = errno;
+ DB_ASSERT(NULL, result != 0);
+ return (result);
+ }
+ *byte_count_p = (size_t)nw;
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_readv __P((socket_t, db_iovec_t *, int, size_t *));
+ */
+int
+__repmgr_readv(fd, iovec, buf_count, byte_count_p)
+ socket_t fd;
+ db_iovec_t *iovec;
+ int buf_count;
+ size_t *byte_count_p;
+{
+ int result;
+ ssize_t nw;
+
+ if ((nw = readv(fd, iovec, buf_count)) == -1) {
+ /*
+ * Why bother to assert this obvious "truth"? On some systems
+ * when the library is loaded into a single-threaded Tcl
+ * configuration the differing errno mechanisms apparently
+ * conflict, and we occasionally "see" a 0 value here! And that
+ * turns out to be painful to debug.
+ */
+ result = errno;
+ DB_ASSERT(NULL, result != 0);
+ return (result);
+ }
+ *byte_count_p = (size_t)nw;
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_select_loop __P((ENV *));
+ */
+int
+__repmgr_select_loop(env)
+ ENV *env;
+{
+ struct timeval select_timeout, *select_timeout_p;
+ DB_REP *db_rep;
+ db_timespec timeout;
+ fd_set reads, writes;
+ struct io_info io_info;
+ int ret;
+ u_int8_t buf[10]; /* arbitrary size */
+
+ db_rep = env->rep_handle;
+ /*
+ * Almost this entire thread operates while holding the mutex. But note
+ * that it never blocks, except in the call to select() (which is the
+ * one place we relinquish the mutex).
+ */
+ LOCK_MUTEX(db_rep->mutex);
+ if ((ret = __repmgr_first_try_connections(env)) != 0)
+ goto out;
+ for (;;) {
+ FD_ZERO(&reads);
+ FD_ZERO(&writes);
+
+ /*
+ * Figure out which sockets to ask for input and output. It's
+ * simple for the signalling pipe and listen socket; but depends
+ * on backlog states for the connections to other sites.
+ */
+ FD_SET((u_int)db_rep->read_pipe, &reads);
+ io_info.maxfd = db_rep->read_pipe;
+
+ if (!IS_SUBORDINATE(db_rep)) {
+ FD_SET((u_int)db_rep->listen_fd, &reads);
+ if (db_rep->listen_fd > io_info.maxfd)
+ io_info.maxfd = db_rep->listen_fd;
+ }
+
+ io_info.reads = &reads;
+ io_info.writes = &writes;
+ if ((ret = __repmgr_each_connection(env,
+ prepare_io, &io_info, TRUE)) != 0)
+ goto out;
+
+ if (__repmgr_compute_timeout(env, &timeout)) {
+ /* Convert the timespec to a timeval. */
+ select_timeout.tv_sec = timeout.tv_sec;
+ select_timeout.tv_usec = timeout.tv_nsec / NS_PER_US;
+ select_timeout_p = &select_timeout;
+ } else {
+ /* No time-based events, so wait only for I/O. */
+ select_timeout_p = NULL;
+ }
+
+ UNLOCK_MUTEX(db_rep->mutex);
+
+ if ((ret = select(io_info.maxfd + 1,
+ &reads, &writes, NULL, select_timeout_p)) == -1) {
+ switch (ret = errno) {
+ case EINTR:
+ case EWOULDBLOCK:
+ LOCK_MUTEX(db_rep->mutex);
+ continue; /* simply retry */
+ default:
+ __db_err(env, ret, DB_STR("3634",
+ "select"));
+ return (ret);
+ }
+ }
+ LOCK_MUTEX(db_rep->mutex);
+ if (db_rep->repmgr_status == stopped) {
+ ret = 0;
+ goto out;
+ }
+
+ /*
+ * Timer expiration events include retrying of lost connections.
+ * Obviously elements can be added to the connection list there.
+ */
+ if ((ret = __repmgr_check_timeouts(env)) != 0)
+ goto out;
+
+ if ((ret = __repmgr_each_connection(env,
+ __repmgr_conn_work, &io_info, TRUE)) != 0)
+ goto out;
+
+ /*
+ * Read any bytes in the signalling pipe. Note that we don't
+ * actually need to do anything with them; they're just there to
+ * wake us up when necessary.
+ */
+ if (FD_ISSET((u_int)db_rep->read_pipe, &reads) &&
+ read(db_rep->read_pipe, VOID_STAR_CAST buf,
+ sizeof(buf)) <= 0) {
+ ret = errno;
+ goto out;
+ }
+ /*
+ * Obviously elements can be added to the connection list here.
+ */
+ if (!IS_SUBORDINATE(db_rep) &&
+ FD_ISSET((u_int)db_rep->listen_fd, &reads) &&
+ (ret = __repmgr_accept(env)) != 0)
+ goto out;
+ }
+out:
+ UNLOCK_MUTEX(db_rep->mutex);
+ if (ret == DB_DELETED)
+ ret = __repmgr_bow_out(env);
+ LOCK_MUTEX(db_rep->mutex);
+ (void)__repmgr_net_close(env);
+ UNLOCK_MUTEX(db_rep->mutex);
+ return (ret);
+}
+
+/*
+ * Examines a connection to see what sort of I/O to ask for. Clean up defunct
+ * connections.
+ */
+static int
+prepare_io(env, conn, info_)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ void *info_;
+{
+ struct io_info *info;
+
+ info = info_;
+
+ if (conn->state == CONN_DEFUNCT)
+ return (__repmgr_cleanup_defunct(env, conn));
+
+ if (!STAILQ_EMPTY(&conn->outbound_queue)) {
+ FD_SET((u_int)conn->fd, info->writes);
+ if (conn->fd > info->maxfd)
+ info->maxfd = conn->fd;
+ }
+ /*
+ * For now we always accept incoming data. If we ever implement some
+ * kind of flow control, we should override it for fledgling connections
+ * (!IS_VALID_EID(conn->eid)) -- in other words, allow reading such a
+ * connection even during flow control duress.
+ */
+ FD_SET((u_int)conn->fd, info->reads);
+ if (conn->fd > info->maxfd)
+ info->maxfd = conn->fd;
+
+ return (0);
+}
+
+/*
+ * Examine a connection, to see what work needs to be done.
+ */
+static int
+__repmgr_conn_work(env, conn, info_)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ void *info_;
+{
+ struct io_info *info;
+ int ret;
+ u_int fd;
+
+ ret = 0;
+ fd = (u_int)conn->fd;
+ info = info_;
+
+ if (conn->state == CONN_DEFUNCT)
+ return (0);
+
+ if (FD_ISSET(fd, info->writes))
+ ret = __repmgr_write_some(env, conn);
+
+ if (ret == 0 && FD_ISSET(fd, info->reads))
+ ret = __repmgr_read_from_site(env, conn);
+
+ if (ret == DB_REP_UNAVAIL)
+ ret = __repmgr_bust_connection(env, conn);
+ return (ret);
+}
diff --git a/src/repmgr/repmgr_queue.c b/src/repmgr/repmgr_queue.c
new file mode 100644
index 00000000..6a381acf
--- /dev/null
+++ b/src/repmgr/repmgr_queue.c
@@ -0,0 +1,180 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2006, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+static REPMGR_MESSAGE *available_work __P((ENV *));
+
+/*
+ * Deallocates memory used by all messages on the queue.
+ *
+ * PUBLIC: int __repmgr_queue_destroy __P((ENV *));
+ */
+int
+__repmgr_queue_destroy(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REPMGR_MESSAGE *m;
+ REPMGR_CONNECTION *conn;
+ int ret, t_ret;
+
+ db_rep = env->rep_handle;
+
+ ret = 0;
+ while (!STAILQ_EMPTY(&db_rep->input_queue.header)) {
+ m = STAILQ_FIRST(&db_rep->input_queue.header);
+ STAILQ_REMOVE_HEAD(&db_rep->input_queue.header, entries);
+ if (m->msg_hdr.type == REPMGR_APP_MESSAGE) {
+ if ((conn = m->v.appmsg.conn) != NULL &&
+ (t_ret = __repmgr_decr_conn_ref(env, conn)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ }
+ __os_free(env, m);
+ }
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_queue_get __P((ENV *,
+ * PUBLIC: REPMGR_MESSAGE **, REPMGR_RUNNABLE *));
+ *
+ * Get the first input message from the queue and return it to the caller. The
+ * caller hereby takes responsibility for the entire message buffer, and should
+ * free it when done.
+ *
+ * Caller must hold mutex.
+ */
+int
+__repmgr_queue_get(env, msgp, th)
+ ENV *env;
+ REPMGR_MESSAGE **msgp;
+ REPMGR_RUNNABLE *th;
+{
+ DB_REP *db_rep;
+ REPMGR_MESSAGE *m;
+#ifdef DB_WIN32
+ HANDLE wait_events[2];
+#endif
+ int ret;
+
+ ret = 0;
+ db_rep = env->rep_handle;
+
+ while ((m = available_work(env)) == NULL &&
+ db_rep->repmgr_status == running && !th->quit_requested) {
+#ifdef DB_WIN32
+ /*
+ * On Windows, msg_avail means either there's something in the
+ * queue, or we're all finished. So, reset the event if that is
+ * not true.
+ */
+ if (STAILQ_EMPTY(&db_rep->input_queue.header) &&
+ db_rep->repmgr_status == running &&
+ !ResetEvent(db_rep->msg_avail)) {
+ ret = GetLastError();
+ goto err;
+ }
+ wait_events[0] = db_rep->msg_avail;
+ wait_events[1] = th->quit_event;
+ UNLOCK_MUTEX(db_rep->mutex);
+ ret = WaitForMultipleObjects(2, wait_events, FALSE, INFINITE);
+ LOCK_MUTEX(db_rep->mutex);
+ if (ret == WAIT_FAILED) {
+ ret = GetLastError();
+ goto err;
+ }
+
+#else
+ if ((ret = pthread_cond_wait(&db_rep->msg_avail,
+ db_rep->mutex)) != 0)
+ goto err;
+#endif
+ }
+ if (db_rep->repmgr_status == stopped || th->quit_requested)
+ ret = DB_REP_UNAVAIL;
+ else {
+ STAILQ_REMOVE(&db_rep->input_queue.header,
+ m, __repmgr_message, entries);
+ db_rep->input_queue.size--;
+ *msgp = m;
+ }
+
+err:
+ return (ret);
+}
+
+/*
+ * Gets an "available" item of work (i.e., a message) from the input queue. If
+ * there are plenty of message threads currently available, then we simply
+ * return the first thing on the queue, regardless of what type of message it
+ * is. Otherwise, skip over any message type that might turn out to
+ * be "long-running", so that we avoid starving out the important rep message
+ * processing.
+ */
+static REPMGR_MESSAGE *
+available_work(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REPMGR_MESSAGE *m;
+
+ db_rep = env->rep_handle;
+ if (STAILQ_EMPTY(&db_rep->input_queue.header))
+ return (NULL);
+ /*
+ * The "non_rep_th" field is the dynamically varying count of threads
+ * currently processing non-replication messages (a.k.a. possibly
+ * long-running messages, a.k.a. "deferrable"). We always ensure that
+ * db_rep->nthreads exceeds the reserved count, RESERVED_MSG_TH(env).
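+ *
+ * For example (hypothetical numbers): with nthreads == 4, non_rep_th == 3,
+ * and RESERVED_MSG_TH(env) == 1, the test 4 > 3 + 1 fails, so we hand out
+ * only non-deferrable (replication) messages until a long-running thread
+ * finishes.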
+ */
+ if (db_rep->nthreads > db_rep->non_rep_th + RESERVED_MSG_TH(env))
+ return (STAILQ_FIRST(&db_rep->input_queue.header));
+ STAILQ_FOREACH(m, &db_rep->input_queue.header, entries) {
+ if (!IS_DEFERRABLE(m->msg_hdr.type))
+ return (m);
+ }
+ return (NULL);
+}
+
+/*
+ * PUBLIC: int __repmgr_queue_put __P((ENV *, REPMGR_MESSAGE *));
+ *
+ * !!!
+ * Caller must hold repmgr->mutex.
+ */
+int
+__repmgr_queue_put(env, msg)
+ ENV *env;
+ REPMGR_MESSAGE *msg;
+{
+ DB_REP *db_rep;
+
+ db_rep = env->rep_handle;
+
+ STAILQ_INSERT_TAIL(&db_rep->input_queue.header, msg, entries);
+ db_rep->input_queue.size++;
+
+ return (__repmgr_signal(&db_rep->msg_avail));
+}
+
+/*
+ * PUBLIC: int __repmgr_queue_size __P((ENV *));
+ *
+ * !!!
+ * Caller must hold repmgr->mutex.
+ */
+int
+__repmgr_queue_size(env)
+ ENV *env;
+{
+ return (env->rep_handle->input_queue.size);
+}
diff --git a/src/repmgr/repmgr_rec.c b/src/repmgr/repmgr_rec.c
new file mode 100644
index 00000000..41827aff
--- /dev/null
+++ b/src/repmgr/repmgr_rec.c
@@ -0,0 +1,45 @@
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc_auto/repmgr_auto.h"
+
+/*
+ * __repmgr_member_recover --
+ * Recovery function for member.
+ *
+ * PUBLIC: int __repmgr_member_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__repmgr_member_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __repmgr_member_args *argp;
+ int ret;
+
+ COMPQUIET(info, NULL);
+ COMPQUIET(op, DB_TXN_APPLY);
+
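+	/*
+	 * Standard recovery-function preamble: REC_NOOP_INTRO reads the log
+	 * record into argp, and REC_NOOP_CLOSE below releases it again (as
+	 * these macros are conventionally defined in the db_am headers).
+	 */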
+ REC_PRINT(__repmgr_member_print);
+ REC_NOOP_INTRO(__repmgr_member_read);
+
+ /*
+ * The annotation log record describes the update in enough detail for
+ * us to be able to optimize our tracking of it at client sites.
+ * However, for now we simply reread the whole (small) database
+ * each time, since changes happen so seldom (and we need to have the
+ * code for reading the whole thing anyway, for other cases).
+ */
+ env->rep_handle->gmdb_dirty = TRUE;
+
+ *lsnp = argp->prev_lsn;
+ ret = 0;
+
+ REC_NOOP_CLOSE;
+}
diff --git a/src/repmgr/repmgr_sel.c b/src/repmgr/repmgr_sel.c
new file mode 100644
index 00000000..ba14368f
--- /dev/null
+++ b/src/repmgr/repmgr_sel.c
@@ -0,0 +1,2096 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2006, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+typedef int (*HEARTBEAT_ACTION) __P((ENV *));
+
+static int accept_handshake __P((ENV *, REPMGR_CONNECTION *, char *));
+static int accept_v1_handshake __P((ENV *, REPMGR_CONNECTION *, char *));
+static void check_min_log_file __P((ENV *));
+static int dispatch_msgin __P((ENV *, REPMGR_CONNECTION *));
+static int prepare_input __P((ENV *, REPMGR_CONNECTION *));
+static int process_own_msg __P((ENV *, REPMGR_CONNECTION *));
+static int process_parameters __P((ENV *,
+ REPMGR_CONNECTION *, char *, u_int, u_int32_t, int, u_int32_t));
+static int read_version_response __P((ENV *, REPMGR_CONNECTION *));
+static int record_permlsn __P((ENV *, REPMGR_CONNECTION *));
+static int __repmgr_call_election __P((ENV *));
+static int __repmgr_connector_main __P((ENV *, REPMGR_RUNNABLE *));
+static void *__repmgr_connector_thread __P((void *));
+static int __repmgr_next_timeout __P((ENV *,
+ db_timespec *, HEARTBEAT_ACTION *));
+static int __repmgr_retry_connections __P((ENV *));
+static int __repmgr_send_heartbeat __P((ENV *));
+static int __repmgr_try_one __P((ENV *, int));
+static int resolve_collision __P((ENV *, REPMGR_SITE *, REPMGR_CONNECTION *));
+static int send_version_response __P((ENV *, REPMGR_CONNECTION *));
+
+#define ONLY_HANDSHAKE(env, conn) do { \
+ if (conn->msg_type != REPMGR_HANDSHAKE) { \
+ __db_errx(env, DB_STR_A("3613", \
+ "unexpected msg type %d in state %d", "%d %d"), \
+ (int)conn->msg_type, conn->state); \
+ return (DB_REP_UNAVAIL); \
+ } \
+} while (0)
+
+/*
+ * PUBLIC: void *__repmgr_select_thread __P((void *));
+ */
+void *
+__repmgr_select_thread(argsp)
+ void *argsp;
+{
+ REPMGR_RUNNABLE *args;
+ ENV *env;
+ int ret;
+
+ args = argsp;
+ env = args->env;
+
+ if ((ret = __repmgr_select_loop(env)) != 0) {
+ __db_err(env, ret, DB_STR("3614", "select loop failed"));
+ (void)__repmgr_thread_failure(env, ret);
+ }
+ return (NULL);
+}
+
+/*
+ * PUBLIC: int __repmgr_bow_out __P((ENV *));
+ */
+int
+__repmgr_bow_out(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ int ret;
+
+ db_rep = env->rep_handle;
+ LOCK_MUTEX(db_rep->mutex);
+ ret = __repmgr_stop_threads(env);
+ UNLOCK_MUTEX(db_rep->mutex);
+ DB_EVENT(env, DB_EVENT_REP_LOCAL_SITE_REMOVED, NULL);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_accept __P((ENV *));
+ */
+int
+__repmgr_accept(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REPMGR_CONNECTION *conn;
+ ACCEPT_ADDR siaddr;
+ socklen_t addrlen;
+ socket_t s;
+ int ret;
+
+ db_rep = env->rep_handle;
+ addrlen = sizeof(siaddr);
+ if ((s = accept(db_rep->listen_fd, (struct sockaddr *)&siaddr,
+ &addrlen)) == -1) {
+ /*
+ * Some errors are innocuous and so should be ignored. MSDN
+ * Library documents the Windows ones; the Unix ones are
+ * advocated in Stevens' UNPv1, section 16.6; and Linux
+ * Application Development, p. 416.
+ */
+ switch (ret = net_errno) {
+#ifdef DB_WIN32
+ case WSAECONNRESET:
+ case WSAEWOULDBLOCK:
+#else
+ case EINTR:
+ case EWOULDBLOCK:
+ case ECONNABORTED:
+ case ENETDOWN:
+#ifdef EPROTO
+ case EPROTO:
+#endif
+ case ENOPROTOOPT:
+ case EHOSTDOWN:
+#ifdef ENONET
+ case ENONET:
+#endif
+ case EHOSTUNREACH:
+ case EOPNOTSUPP:
+ case ENETUNREACH:
+#endif
+ VPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "accept error %d considered innocuous", ret));
+ return (0);
+ default:
+ __db_err(env, ret, DB_STR("3615", "accept error"));
+ return (ret);
+ }
+ }
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC, "accepted a new connection"));
+
+ if ((ret =
+ __repmgr_new_connection(env, &conn, s, CONN_NEGOTIATE)) != 0) {
+ (void)closesocket(s);
+ return (ret);
+ }
+ if ((ret = __repmgr_set_keepalive(env, conn)) != 0) {
+ (void)__repmgr_destroy_conn(env, conn);
+ return (ret);
+ }
+ if ((ret = __repmgr_set_nonblock_conn(conn)) != 0) {
+ __db_err(env, ret, DB_STR("3616",
+ "can't set nonblock after accept"));
+ (void)__repmgr_destroy_conn(env, conn);
+ return (ret);
+ }
+
+ /*
+ * We don't yet know which site this connection is coming from. So for
+ * now, put it on the "orphans" list; we'll move it to the appropriate
+ * site struct later when we discover who we're talking with, and what
+ * type of connection it is.
+ */
+ conn->eid = -1;
+ TAILQ_INSERT_TAIL(&db_rep->connections, conn, entries);
+ conn->ref_count++;
+
+ return (0);
+}
+
+/*
+ * Computes how long we should wait for input, in other words how long until we
+ * have to wake up and do something. Returns TRUE if timeout is set; FALSE if
+ * there is nothing to wait for.
+ *
+ * Note that the resulting timeout could be zero; but it can't be negative.
+ *
+ * PUBLIC: int __repmgr_compute_timeout __P((ENV *, db_timespec *));
+ */
+int
+__repmgr_compute_timeout(env, timeout)
+ ENV *env;
+ db_timespec *timeout;
+{
+ DB_REP *db_rep;
+ REPMGR_RETRY *retry;
+ db_timespec now, t;
+ int have_timeout;
+
+ db_rep = env->rep_handle;
+
+ /*
+ * There are two factors to consider: are heartbeats in use? and, do we
+ * have any sites with broken connections that we ought to retry?
+ */
+ have_timeout = __repmgr_next_timeout(env, &t, NULL);
+
+ /* List items are in order, so we only have to examine the first one. */
+ if (!TAILQ_EMPTY(&db_rep->retries)) {
+ retry = TAILQ_FIRST(&db_rep->retries);
+ if (have_timeout) {
+ /* Choose earliest timeout deadline. */
+ t = timespeccmp(&retry->time, &t, <) ? retry->time : t;
+ } else {
+ t = retry->time;
+ have_timeout = TRUE;
+ }
+ }
+
+ if (have_timeout) {
+ __os_gettime(env, &now, 1);
+ if (timespeccmp(&now, &t, >=))
+ timespecclear(timeout);
+ else {
+ *timeout = t;
+ timespecsub(timeout, &now);
+ }
+ }
+
+ return (have_timeout);
+}
+
+/*
+ * Figures out the next heartbeat-related thing to be done, and when it should
+ * be done. The code is factored this way because this computation needs to be
+ * done both before each select() call, and after (when we're checking for timer
+ * expiration).
+ */
+static int
+__repmgr_next_timeout(env, deadline, action)
+ ENV *env;
+ db_timespec *deadline;
+ HEARTBEAT_ACTION *action;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ HEARTBEAT_ACTION my_action;
+ REPMGR_CONNECTION *conn;
+ REPMGR_SITE *master;
+ db_timespec t;
+ u_int32_t version;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ if (rep->master_id == db_rep->self_eid &&
+ rep->heartbeat_frequency > 0) {
+ t = db_rep->last_bcast;
+ TIMESPEC_ADD_DB_TIMEOUT(&t, rep->heartbeat_frequency);
+ my_action = __repmgr_send_heartbeat;
+ } else if ((master = __repmgr_connected_master(env)) != NULL &&
+ !IS_SUBORDINATE(db_rep) &&
+ rep->heartbeat_monitor_timeout > 0) {
+ version = 0;
+ if ((conn = master->ref.conn.in) != NULL &&
+ IS_READY_STATE(conn->state))
+ version = conn->version;
+ if ((conn = master->ref.conn.out) != NULL &&
+ IS_READY_STATE(conn->state) &&
+ conn->version > version)
+ version = conn->version;
+ if (version >= HEARTBEAT_MIN_VERSION) {
+ /*
+ * If we have a working connection to a heartbeat-aware
+ * master, let's monitor it. Otherwise there's really
+ * nothing we can do.
+ */
+ t = master->last_rcvd_timestamp;
+ TIMESPEC_ADD_DB_TIMEOUT(&t,
+ rep->heartbeat_monitor_timeout);
+ my_action = __repmgr_call_election;
+ } else
+ return (FALSE);
+ } else
+ return (FALSE);
+
+ *deadline = t;
+ if (action != NULL)
+ *action = my_action;
+ return (TRUE);
+}
+
+/*
+ * Sends a heartbeat message.
+ *
+ * repmgr also uses the heartbeat facility to manage rerequests. We
+ * send the master's current generation and max_perm_lsn with the heartbeat
+ * message to help a client determine whether it has all master transactions.
+ * When a client receives a heartbeat message, it also checks whether it
+ * needs to rerequest anything. Note that heartbeats must be enabled for
+ * this rerequest processing to occur.
+ */
+static int
+__repmgr_send_heartbeat(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ DBT control, rec;
+ __repmgr_permlsn_args permlsn;
+ u_int8_t buf[__REPMGR_PERMLSN_SIZE];
+ u_int unused1, unused2;
+ int ret, unused3;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ permlsn.generation = rep->gen;
+ if ((ret = __rep_get_maxpermlsn(env, &permlsn.lsn)) != 0)
+ return (ret);
+ __repmgr_permlsn_marshal(env, &permlsn, buf);
+ control.data = buf;
+ control.size = __REPMGR_PERMLSN_SIZE;
+
+ DB_INIT_DBT(rec, NULL, 0);
+ return (__repmgr_send_broadcast(env,
+ REPMGR_HEARTBEAT, &control, &rec, &unused1, &unused2, &unused3));
+}
+
+/*
+ * PUBLIC: REPMGR_SITE *__repmgr_connected_master __P((ENV *));
+ */
+REPMGR_SITE *
+__repmgr_connected_master(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REPMGR_SITE *master;
+ int master_id;
+
+ db_rep = env->rep_handle;
+ master_id = db_rep->region->master_id;
+
+ if (!IS_KNOWN_REMOTE_SITE(master_id))
+ return (NULL);
+ master = SITE_FROM_EID(master_id);
+ if (master->state == SITE_CONNECTED)
+ return (master);
+ return (NULL);
+}
+
+static int
+__repmgr_call_election(env)
+ ENV *env;
+{
+ REPMGR_CONNECTION *conn;
+ REPMGR_SITE *master;
+ int ret;
+
+ master = __repmgr_connected_master(env);
+ if (master == NULL)
+ return (0);
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "heartbeat monitor timeout expired"));
+ STAT(env->rep_handle->region->mstat.st_connection_drop++);
+ if ((conn = master->ref.conn.in) != NULL &&
+ (ret = __repmgr_bust_connection(env, conn)) != 0)
+ return (ret);
+ if ((conn = master->ref.conn.out) != NULL &&
+ (ret = __repmgr_bust_connection(env, conn)) != 0)
+ return (ret);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_check_timeouts __P((ENV *));
+ *
+ * !!!
+ * Assumes caller holds the mutex.
+ */
+int
+__repmgr_check_timeouts(env)
+ ENV *env;
+{
+ db_timespec when, now;
+ HEARTBEAT_ACTION action;
+ int ret;
+
+ /*
+ * Figure out the next heartbeat-related thing to be done. Then, if
+ * it's time to do it, do so.
+ */
+ if (__repmgr_next_timeout(env, &when, &action)) {
+ __os_gettime(env, &now, 1);
+ if (timespeccmp(&when, &now, <=) &&
+ (ret = (*action)(env)) != 0)
+ return (ret);
+ }
+
+ return (__repmgr_retry_connections(env));
+}
+
+/*
+ * Initiates connection attempts for any sites on the idle list whose retry
+ * times have expired.
+ */
+static int
+__repmgr_retry_connections(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REPMGR_SITE *site;
+ REPMGR_RETRY *retry;
+ db_timespec now;
+ int eid, ret;
+
+ db_rep = env->rep_handle;
+ __os_gettime(env, &now, 1);
+
+ while (!TAILQ_EMPTY(&db_rep->retries)) {
+ retry = TAILQ_FIRST(&db_rep->retries);
+ if (timespeccmp(&retry->time, &now, >=))
+ break; /* since items are in time order */
+
+ TAILQ_REMOVE(&db_rep->retries, retry, entries);
+
+ eid = retry->eid;
+ __os_free(env, retry);
+ DB_ASSERT(env, IS_VALID_EID(eid));
+ site = SITE_FROM_EID(eid);
+ DB_ASSERT(env, site->state == SITE_PAUSING);
+
+ if (site->membership == SITE_PRESENT) {
+ if ((ret = __repmgr_try_one(env, eid)) != 0)
+ return (ret);
+ } else
+ site->state = SITE_IDLE;
+ }
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_first_try_connections __P((ENV *));
+ *
+ * !!!
+ * Assumes caller holds the mutex.
+ */
+int
+__repmgr_first_try_connections(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REPMGR_SITE *site;
+ int eid, ret;
+
+ db_rep = env->rep_handle;
+ FOR_EACH_REMOTE_SITE_INDEX(eid) {
+ site = SITE_FROM_EID(eid);
+ /*
+ * Normally all sites would be IDLE here. But if a user thread
+ * triggered an auto-start in a subordinate process, our send()
+ * function may have found new sites when it sync'ed site
+ * addresses, and that action causes connection attempts to be
+ * scheduled (resulting in PAUSING state here, or conceivably
+ * even CONNECTING or CONNECTED).
+ */
+ if (site->state == SITE_IDLE &&
+ site->membership == SITE_PRESENT &&
+ (ret = __repmgr_try_one(env, eid)) != 0)
+ return (ret);
+ }
+ return (0);
+}
+
+/*
+ * Starts a thread to open a connection to the site at the given EID.
+ */
+static int
+__repmgr_try_one(env, eid)
+ ENV *env;
+ int eid;
+{
+ DB_REP *db_rep;
+ REPMGR_SITE *site;
+ REPMGR_RUNNABLE *th;
+ int ret;
+
+ db_rep = env->rep_handle;
+ DB_ASSERT(env, IS_VALID_EID(eid));
+ site = SITE_FROM_EID(eid);
+ th = site->connector;
+ if (th == NULL) {
+ if ((ret = __os_malloc(env, sizeof(REPMGR_RUNNABLE), &th)) != 0)
+ return (ret);
+ site->connector = th;
+ } else if (th->finished) {
+ if ((ret = __repmgr_thread_join(th)) != 0)
+ return (ret);
+ } else {
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "eid %lu previous connector thread still running; will retry",
+ (u_long)eid));
+ return (__repmgr_schedule_connection_attempt(env,
+ eid, FALSE));
+ }
+
+ site->state = SITE_CONNECTING;
+
+ th->run = __repmgr_connector_thread;
+ th->args.eid = eid;
+ if ((ret = __repmgr_thread_start(env, th)) != 0) {
+ __os_free(env, th);
+ site->connector = NULL;
+ }
+ return (ret);
+}
+
+static void *
+__repmgr_connector_thread(argsp)
+ void *argsp;
+{
+ REPMGR_RUNNABLE *th;
+ ENV *env;
+ int ret;
+
+ th = argsp;
+ env = th->env;
+
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "starting connector thread, eid %u", th->args.eid));
+ if ((ret = __repmgr_connector_main(env, th)) != 0) {
+ __db_err(env, ret, DB_STR("3617", "connector thread failed"));
+ (void)__repmgr_thread_failure(env, ret);
+ }
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC, "connector thread is exiting"));
+
+ th->finished = TRUE;
+ return (NULL);
+}
+
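+/*
+ * Body of the connector thread: performs the potentially blocking connect
+ * without holding the mutex, then, if the site is still in CONNECTING state,
+ * installs the result as the site's outgoing connection.  A retryable
+ * failure simply reschedules another attempt.
+ */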
+static int
+__repmgr_connector_main(env, th)
+ ENV *env;
+ REPMGR_RUNNABLE *th;
+{
+ DB_REP *db_rep;
+ REPMGR_SITE *site;
+ REPMGR_CONNECTION *conn;
+ DB_REPMGR_CONN_ERR info;
+ repmgr_netaddr_t netaddr;
+ SITE_STRING_BUFFER site_string;
+ int err, ret, t_ret;
+
+ db_rep = env->rep_handle;
+ ret = 0;
+
+ LOCK_MUTEX(db_rep->mutex);
+ DB_ASSERT(env, IS_VALID_EID(th->args.eid));
+ site = SITE_FROM_EID(th->args.eid);
+ if (site->state != SITE_CONNECTING || db_rep->repmgr_status == stopped)
+ goto unlock;
+
+ /*
+ * Drop the mutex during operations that could block. During those
+ * times, the site struct could move (if we had to grow the sites
+ * array), but host wouldn't.
+ *
+ * Also, during those times we might receive an incoming connection from
+ * the site, which would change its state. So, check state each time we
+ * reacquire the mutex, and quit if the state of the world changed while
+ * we were away.
+ */
+ netaddr = site->net_addr;
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC, "connecting to %s",
+ __repmgr_format_site_loc(site, site_string)));
+ UNLOCK_MUTEX(db_rep->mutex);
+
+ if ((ret = __repmgr_connect(env, &netaddr, &conn, &err)) == 0) {
+ DB_EVENT(env, DB_EVENT_REP_CONNECT_ESTD, &th->args.eid);
+ LOCK_MUTEX(db_rep->mutex);
+ if ((ret = __repmgr_set_nonblock_conn(conn)) != 0) {
+ __db_err(env, ret, DB_STR("3618",
+ "set_nonblock in connect thread"));
+ goto cleanup;
+ }
+ conn->type = REP_CONNECTION;
+ site = SITE_FROM_EID(th->args.eid);
+ if (site->state != SITE_CONNECTING ||
+ db_rep->repmgr_status == stopped)
+ goto cleanup;
+
+ conn->eid = th->args.eid;
+ site = SITE_FROM_EID(th->args.eid);
+ site->ref.conn.out = conn;
+ site->state = SITE_CONNECTED;
+ __os_gettime(env, &site->last_rcvd_timestamp, 1);
+ ret = __repmgr_wake_main_thread(env);
+ } else if (ret == DB_REP_UNAVAIL) {
+ /* Retryable error while trying to connect: retry later. */
+ info.eid = th->args.eid;
+ info.error = err;
+ DB_EVENT(env, DB_EVENT_REP_CONNECT_TRY_FAILED, &info);
+ STAT(db_rep->region->mstat.st_connect_fail++);
+
+ LOCK_MUTEX(db_rep->mutex);
+ site = SITE_FROM_EID(th->args.eid);
+ if (site->state != SITE_CONNECTING ||
+ db_rep->repmgr_status == stopped) {
+ ret = 0;
+ goto unlock;
+ }
+ ret = __repmgr_schedule_connection_attempt(env,
+ th->args.eid, FALSE);
+ } else
+ goto out;
+
+ if (0) {
+cleanup:
+ if ((t_ret = __repmgr_destroy_conn(env, conn)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ }
+
+unlock:
+ UNLOCK_MUTEX(db_rep->mutex);
+out:
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_send_v1_handshake __P((ENV *,
+ * PUBLIC: REPMGR_CONNECTION *, void *, size_t));
+ */
+int
+__repmgr_send_v1_handshake(env, conn, buf, len)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ void *buf;
+ size_t len;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ repmgr_netaddr_t *my_addr;
+ DB_REPMGR_V1_HANDSHAKE buffer;
+ DBT cntrl, rec;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ my_addr = &SITE_FROM_EID(db_rep->self_eid)->net_addr;
+
+ /*
+ * We're about to send from a structure that has padding holes in it.
+ * Initializing it keeps Valgrind happy, plus we really shouldn't be
+ * sending out random garbage anyway (pro forma privacy issue).
+ */
+ memset(&buffer, 0, sizeof(buffer));
+ buffer.version = 1;
+ buffer.priority = htonl(rep->priority);
+ buffer.port = my_addr->port;
+ cntrl.data = &buffer;
+ cntrl.size = sizeof(buffer);
+
+ rec.data = buf;
+ rec.size = (u_int32_t)len;
+
+ /*
+ * It would of course be disastrous to block the select() thread, so
+ * pass the "maxblock" argument as 0. Fortunately blocking should
+ * never be necessary here, because the hand-shake is always the first
+ * thing we send. Which is a good thing, because it would be almost as
+ * disastrous if we allowed ourselves to drop a handshake.
+ */
+ return (__repmgr_send_one(env,
+ conn, REPMGR_HANDSHAKE, &cntrl, &rec, 0));
+}
+
+/*
+ * PUBLIC: int __repmgr_read_from_site __P((ENV *, REPMGR_CONNECTION *));
+ *
+ * !!!
+ * Caller is assumed to hold repmgr->mutex, because we call queue_put() from here.
+ */
+int
+__repmgr_read_from_site(env, conn)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+{
+ DB_REP *db_rep;
+ REPMGR_SITE *site;
+ int ret;
+
+ db_rep = env->rep_handle;
+
+ /*
+ * Loop, just in case we get EINTR and need to restart the I/O. (All
+ * other branches return.)
+ */
+ for (;;) {
+ switch ((ret = __repmgr_read_conn(conn))) {
+#ifndef DB_WIN32
+ case EINTR:
+ continue;
+#endif
+
+#if defined(DB_REPMGR_EAGAIN) && DB_REPMGR_EAGAIN != WOULDBLOCK
+ case DB_REPMGR_EAGAIN:
+#endif
+ case WOULDBLOCK:
+ return (0);
+
+ case DB_REP_UNAVAIL:
+ /* Error 0 is understood to mean EOF. */
+ __repmgr_fire_conn_err_event(env, conn, 0);
+ STAT(env->rep_handle->
+ region->mstat.st_connection_drop++);
+ return (DB_REP_UNAVAIL);
+
+ case 0:
+ if (IS_VALID_EID(conn->eid)) {
+ site = SITE_FROM_EID(conn->eid);
+ __os_gettime(env,
+ &site->last_rcvd_timestamp, 1);
+ }
+ return (conn->reading_phase == SIZES_PHASE ?
+ prepare_input(env, conn) :
+ dispatch_msgin(env, conn));
+
+ default:
+#ifdef EBADF
+ DB_ASSERT(env, ret != EBADF);
+#endif
+ __repmgr_fire_conn_err_event(env, conn, ret);
+ STAT(db_rep->region->mstat.st_connection_drop++);
+ return (DB_REP_UNAVAIL);
+ }
+ }
+}
+
+/*
+ * Reads in the current input phase, as defined by the connection's IOVECS
+ * struct.
+ *
+ * Returns DB_REP_UNAVAIL for EOF.
+ *
+ * Makes no assumption about synchronization: it's up to the caller to hold
+ * mutex if necessary.
+ *
+ * PUBLIC: int __repmgr_read_conn __P((REPMGR_CONNECTION *));
+ */
+int
+__repmgr_read_conn(conn)
+ REPMGR_CONNECTION *conn;
+{
+ size_t nr;
+ int ret;
+
+ /*
+ * Keep reading pieces as long as we're making some progress, or until
+ * we complete the current read phase as defined in iovecs.
+ */
+ for (;;) {
+ if ((ret = __repmgr_readv(conn->fd,
+ &conn->iovecs.vectors[conn->iovecs.offset],
+ conn->iovecs.count - conn->iovecs.offset, &nr)) != 0)
+ return (ret);
+
+ if (nr == 0)
+ return (DB_REP_UNAVAIL);
+
+ if (__repmgr_update_consumed(&conn->iovecs, nr)) {
+ /* We've fully read as much as we wanted. */
+ return (0);
+ }
+ }
+}
+
+/*
+ * Having finished reading the 9-byte message header, figure out what kind of
+ * message we're about to receive, and prepare input buffers accordingly. The
+ * header includes enough information for us to figure out how much buffer space
+ * we need to allocate (though in some cases we need to do a bit of computation
+ * to arrive at the answer).
+ *
+ * Caller must hold mutex.
+ */
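+/*
+ * Header layout, as implied by the unmarshaling code: a 1-byte message type
+ * followed by two 4-byte words whose interpretation depends on that type
+ * (control/rec sizes for rep messages, buffer size and segment count for
+ * app messages, and so on).
+ */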
+static int
+prepare_input(env, conn)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+{
+#define MEM_ALIGN sizeof(double)
+ DBT *dbt;
+ __repmgr_msg_hdr_args msg_hdr;
+ REPMGR_RESPONSE *resp;
+ u_int32_t control_size, rec_size, size;
+ size_t memsize, control_offset, rec_offset;
+ void *membase;
+ int ret, skip;
+
+ DB_ASSERT(env, conn->reading_phase == SIZES_PHASE);
+
+ /*
+ * We can only get here after having read the full 9 bytes that we
+ * expect, so this can't fail.
+ */
+ ret = __repmgr_msg_hdr_unmarshal(env, &msg_hdr,
+ conn->msg_hdr_buf, __REPMGR_MSG_HDR_SIZE, NULL);
+ DB_ASSERT(env, ret == 0);
+
+ __repmgr_iovec_init(&conn->iovecs);
+ skip = FALSE;
+
+ switch ((conn->msg_type = msg_hdr.type)) {
+ case REPMGR_HEARTBEAT:
+ /*
+ * The underlying byte-receiving mechanism will already have
+ * noted the fact that we got some traffic on this connection,
+ * which is all that is needed to monitor the heartbeat. But
+ * we also put the heartbeat message on the message queue so
+ * that it will perform rerequest processing.
+ */
+ case REPMGR_REP_MESSAGE:
+ env->rep_handle->seen_repmsg = TRUE;
+ control_size = REP_MSG_CONTROL_SIZE(msg_hdr);
+ rec_size = REP_MSG_REC_SIZE(msg_hdr);
+ if (control_size == 0) {
+ if (conn->msg_type == REPMGR_HEARTBEAT) {
+ /*
+ * Got an old-style heartbeat without payload,
+ * nothing to do.
+ */
+ skip = TRUE;
+ break;
+ } else {
+ __db_errx(env, DB_STR("3619",
+ "illegal size for rep msg"));
+ return (DB_REP_UNAVAIL);
+ }
+ }
+ /*
+ * Allocate a block of memory large enough to hold a
+ * DB_REPMGR_MESSAGE wrapper, plus the (one or) two DBT
+ * data areas that it points to. Start by calculating
+ * the total memory needed.
+ */
+ memsize = DB_ALIGN(sizeof(REPMGR_MESSAGE), MEM_ALIGN);
+ control_offset = memsize;
+ memsize += control_size;
+ if (rec_size > 0) {
+ memsize = DB_ALIGN(memsize, MEM_ALIGN);
+ rec_offset = memsize;
+ memsize += rec_size;
+ } else
+ COMPQUIET(rec_offset, 0);
+ if ((ret = __os_malloc(env, memsize, &membase)) != 0)
+ return (ret);
+ conn->input.rep_message = membase;
+ conn->input.rep_message->msg_hdr = msg_hdr;
+ conn->input.rep_message->v.repmsg.originating_eid = conn->eid;
+
+ DB_INIT_DBT(conn->input.rep_message->v.repmsg.control,
+ (u_int8_t*)membase + control_offset, control_size);
+ __repmgr_add_dbt(&conn->iovecs,
+ &conn->input.rep_message->v.repmsg.control);
+
+ if (rec_size > 0) {
+ DB_INIT_DBT(conn->input.rep_message->v.repmsg.rec,
+ (u_int8_t*)membase + rec_offset, rec_size);
+ __repmgr_add_dbt(&conn->iovecs,
+ &conn->input.rep_message->v.repmsg.rec);
+ } else
+ DB_INIT_DBT(conn->input.rep_message->v.repmsg.rec,
+ NULL, 0);
+ break;
+
+ case REPMGR_APP_MESSAGE:
+ /*
+ * We need a buffer big enough to hold the REPMGR_MESSAGE struct
+ * and the data that we expect to receive on the wire. We must
+ * extend the struct size for the variable-length DBT array at
+ * the end.
+ */
+ size = DB_ALIGN((size_t)(sizeof(REPMGR_MESSAGE) +
+ APP_MSG_SEGMENT_COUNT(msg_hdr) * sizeof(DBT)),
+ MEM_ALIGN);
+ memsize = size + APP_MSG_BUFFER_SIZE(msg_hdr);
+ if ((ret = __os_malloc(env, memsize, &membase)) != 0)
+ return (ret);
+ conn->input.rep_message = membase;
+ conn->input.rep_message->msg_hdr = msg_hdr;
+ conn->input.rep_message->v.appmsg.conn = conn;
+
+ DB_INIT_DBT(conn->input.rep_message->v.appmsg.buf,
+ (u_int8_t*)membase + size,
+ APP_MSG_BUFFER_SIZE(msg_hdr));
+ __repmgr_add_dbt(&conn->iovecs,
+ &conn->input.rep_message->v.appmsg.buf);
+ break;
+
+ case REPMGR_OWN_MSG:
+ size = sizeof(REPMGR_MESSAGE) + REPMGR_OWN_BUF_SIZE(msg_hdr);
+ if ((ret = __os_malloc(env, size, &membase)) != 0)
+ return (ret);
+ conn->input.rep_message = membase;
+ conn->input.rep_message->msg_hdr = msg_hdr;
+
+ /*
+ * The "conn" pointer is saved into the message later, in
+ * dispatch_msgin(). An OWN msg that arrives in PARAMETERS state has
+ * bypassed the final handshake, implying that this connection is to
+ * be used for a one-shot GMDB request; in that case the message keeps
+ * a reference to the connection, otherwise the pointer is cleared.
+ */
+ if (REPMGR_OWN_BUF_SIZE(msg_hdr) == 0) {
+ __db_errx(env, DB_STR_A("3680",
+ "invalid own buf size %lu in prepare_input", "%lu"),
+ (u_long)REPMGR_OWN_BUF_SIZE(msg_hdr));
+ return (DB_REP_UNAVAIL);
+ }
+ DB_INIT_DBT(conn->input.rep_message->v.gmdb_msg.request,
+ (u_int8_t*)membase + sizeof(REPMGR_MESSAGE),
+ REPMGR_OWN_BUF_SIZE(msg_hdr));
+ __repmgr_add_dbt(&conn->iovecs,
+ &conn->input.rep_message->v.gmdb_msg.request);
+ break;
+
+ case REPMGR_APP_RESPONSE:
+ size = APP_RESP_BUFFER_SIZE(msg_hdr);
+ conn->cur_resp = APP_RESP_TAG(msg_hdr);
+ if (conn->cur_resp >= conn->aresp) {
+ __db_errx(env, DB_STR_A("3681",
+ "invalid cur resp %lu in prepare_input", "%lu"),
+ (u_long)conn->cur_resp);
+ return (DB_REP_UNAVAIL);
+ }
+ resp = &conn->responses[conn->cur_resp];
+ DB_ASSERT(env, F_ISSET(resp, RESP_IN_USE));
+
+ dbt = &resp->dbt;
+
+ /*
+ * Prepare to read message body into either the user-supplied
+ * buffer, or one we allocate here.
+ */
+ ret = 0;
+ if (!F_ISSET(resp, RESP_THREAD_WAITING)) {
+ /* Caller already timed out; allocate dummy buffer. */
+ if (size > 0) {
+ memset(dbt, 0, sizeof(*dbt));
+ ret = __os_malloc(env, size, &dbt->data);
+ F_SET(resp, RESP_DUMMY_BUF);
+ } else
+ F_CLR(resp, RESP_IN_USE);
+ } else if (F_ISSET(dbt, DB_DBT_MALLOC))
+ ret = __os_umalloc(env, size, &dbt->data);
+ else if (F_ISSET(dbt, DB_DBT_REALLOC)) {
+ if (dbt->data == NULL || dbt->size < size)
+ ret = __os_urealloc(env, size, &dbt->data);
+ } else if (F_ISSET(dbt, DB_DBT_USERMEM)) {
+ /* Recipient should have checked size limit. */
+ DB_ASSERT(env, size <= dbt->ulen);
+ }
+ dbt->size = size;
+ if (ret != 0)
+ return (ret);
+
+ if (size > 0) {
+ __repmgr_add_dbt(&conn->iovecs, dbt);
+ F_SET(resp, RESP_READING);
+ } else {
+ skip = TRUE;
+ if (F_ISSET(resp, RESP_THREAD_WAITING)) {
+ F_SET(resp, RESP_COMPLETE);
+ if ((ret = __repmgr_wake_waiters(env,
+ &conn->response_waiters)) != 0)
+ return (ret);
+ }
+ }
+ break;
+
+ case REPMGR_RESP_ERROR:
+ DB_ASSERT(env, RESP_ERROR_TAG(msg_hdr) < conn->aresp &&
+ conn->responses != NULL);
+ resp = &conn->responses[RESP_ERROR_TAG(msg_hdr)];
+ DB_ASSERT(env, !F_ISSET(resp, RESP_READING));
+ if (F_ISSET(resp, RESP_THREAD_WAITING)) {
+ F_SET(resp, RESP_COMPLETE);
+
+ /*
+ * DB errors are always negative, but we only send
+ * unsigned values on the wire.
+ */
+ resp->ret = -((int)RESP_ERROR_CODE(msg_hdr));
+ if ((ret = __repmgr_wake_waiters(env,
+ &conn->response_waiters)) != 0)
+ return (ret);
+ } else
+ F_CLR(resp, RESP_IN_USE);
+ skip = TRUE;
+ break;
+
+ case REPMGR_HANDSHAKE:
+ case REPMGR_PERMLSN:
+ if ((ret = __repmgr_prepare_simple_input(env,
+ conn, &msg_hdr)) != 0)
+ return (ret);
+ break;
+
+ default:
+ __db_errx(env, DB_STR_A("3676",
+ "unexpected msg type %lu in prepare_input", "%lu"),
+ (u_long)conn->msg_type);
+ return (DB_REP_UNAVAIL);
+ }
+
+ if (skip) {
+ /*
+ * We can skip the DATA_PHASE, because the current message type
+ * only has a header, no following data.
+ */
+ __repmgr_reset_for_reading(conn);
+ } else
+ conn->reading_phase = DATA_PHASE;
+
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_prepare_simple_input __P((ENV *,
+ * PUBLIC: REPMGR_CONNECTION *, __repmgr_msg_hdr_args *));
+ */
+int
+__repmgr_prepare_simple_input(env, conn, msg_hdr)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ __repmgr_msg_hdr_args *msg_hdr;
+{
+ DBT *dbt;
+ u_int32_t control_size, rec_size;
+ int ret;
+
+ control_size = REP_MSG_CONTROL_SIZE(*msg_hdr);
+ rec_size = REP_MSG_REC_SIZE(*msg_hdr);
+
+ dbt = &conn->input.repmgr_msg.cntrl;
+ if ((dbt->size = control_size) > 0) {
+ if ((ret = __os_malloc(env,
+ dbt->size, &dbt->data)) != 0)
+ return (ret);
+ __repmgr_add_dbt(&conn->iovecs, dbt);
+ }
+
+ dbt = &conn->input.repmgr_msg.rec;
+ if ((dbt->size = rec_size) > 0) {
+ if ((ret = __os_malloc(env,
+ dbt->size, &dbt->data)) != 0) {
+ dbt = &conn->input.repmgr_msg.cntrl;
+ if (dbt->size > 0)
+ __os_free(env, dbt->data);
+ return (ret);
+ }
+ __repmgr_add_dbt(&conn->iovecs, dbt);
+ }
+ return (0);
+}
+
+/*
+ * Processes an incoming message, depending on our current state.
+ *
+ * Caller must hold mutex.
+ */
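+/*
+ * Connection states, as used below: CONN_CONNECTED is an outgoing connection
+ * awaiting the version response; CONN_NEGOTIATE is an incoming connection
+ * awaiting the initial proposal; in CONN_PARAMETERS a version has been agreed
+ * and we await the parameters handshake (or a one-shot GMDB request);
+ * CONN_READY and CONN_CONGESTED are fully established.
+ */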
+static int
+dispatch_msgin(env, conn)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+{
+ DB_REP *db_rep;
+ REPMGR_SITE *site;
+ REPMGR_RUNNABLE *th;
+ REPMGR_RESPONSE *resp;
+ DBT *dbt;
+ char *hostname;
+ int eid, ret;
+
+ DB_ASSERT(env, conn->reading_phase == DATA_PHASE);
+ db_rep = env->rep_handle;
+
+ switch (conn->state) {
+ case CONN_CONNECTED:
+ /*
+ * In this state, we know we're working with an outgoing
+ * connection. We've sent a version proposal, and now expect
+ * the response (which could be a dumb old V1 handshake).
+ */
+ ONLY_HANDSHAKE(env, conn);
+
+ /*
+ * This is a good opportunity to clean up this site's connector
+ * thread: we generally come through here soon after making an
+ * outgoing connection, and since this handshake processing happens
+ * at most once per connection, the cost of the join is rarely
+ * incurred.
+ */
+ eid = conn->eid;
+ DB_ASSERT(env, IS_KNOWN_REMOTE_SITE(conn->eid));
+ site = SITE_FROM_EID(eid);
+ th = site->connector;
+ if (th != NULL && th->finished) {
+ if ((ret = __repmgr_thread_join(th)) != 0)
+ return (ret);
+ __os_free(env, th);
+ site->connector = NULL;
+ }
+
+ if ((ret = read_version_response(env, conn)) != 0)
+ return (ret);
+ break;
+
+ case CONN_NEGOTIATE:
+ /*
+ * Since we're in this state, we know we're working with an
+ * incoming connection, and this is the first message we've
+ * received. So it must be a version negotiation proposal (or a
+ * legacy V1 handshake). (We'll verify this of course.)
+ */
+ ONLY_HANDSHAKE(env, conn);
+ if ((ret = send_version_response(env, conn)) != 0)
+ return (ret);
+ break;
+
+ case CONN_PARAMETERS:
+ /*
+ * We've previously agreed on a (>1) version, so we expect
+ * either the other side's parameters handshake, or possibly a
+ * GMDB request on a one-shot, dedicated connection.
+ */
+ switch (conn->msg_type) {
+ case REPMGR_HANDSHAKE:
+ dbt = &conn->input.repmgr_msg.rec;
+ hostname = dbt->data;
+ hostname[dbt->size-1] = '\0';
+ if ((ret = accept_handshake(env, conn, hostname)) != 0)
+ return (ret);
+ conn->state = CONN_READY;
+ break;
+ case REPMGR_OWN_MSG:
+ /*
+ * GM change requests arrive in their own dedicated
+ * connections, and when they're served the entire
+ * connection isn't needed any more. So the message
+ * processing thread will do the entire job of serving
+ * the request and finishing off the connection; so we
+ * don't have to read it any more. Note that normally
+ * whenever we remove a connection from our list we
+ * decrement the reference count; but we also increment
+ * it whenever we pass a reference over to the message
+ * processing threads' queue. So in this case it's a
+ * wash.
+ */
+ conn->input.rep_message->v.gmdb_msg.conn = conn;
+ TAILQ_REMOVE(&db_rep->connections, conn, entries);
+ if ((ret = __repmgr_queue_put(env,
+ conn->input.rep_message)) != 0)
+ return (ret);
+ break;
+
+ default:
+ __db_errx(env, DB_STR_A("3620",
+ "unexpected msg type %d in PARAMETERS state", "%d"),
+ (int)conn->msg_type);
+ return (DB_REP_UNAVAIL);
+ }
+
+ break;
+
+ case CONN_READY:
+ case CONN_CONGESTED:
+ /*
+ * We have a complete message, so process it. Acks and
+ * handshakes get processed here, in line. Regular rep messages
+ * get posted to a queue, to be handled by a thread from the
+ * message thread pool.
+ */
+ switch (conn->msg_type) {
+ case REPMGR_PERMLSN:
+ if ((ret = record_permlsn(env, conn)) != 0)
+ return (ret);
+ break;
+
+ case REPMGR_HEARTBEAT:
+ case REPMGR_APP_MESSAGE:
+ case REPMGR_REP_MESSAGE:
+ if ((ret = __repmgr_queue_put(env,
+ conn->input.rep_message)) != 0)
+ return (ret);
+ /*
+ * The queue has taken over responsibility for the
+ * rep_message buffer, and will free it later.
+ */
+ if (conn->msg_type == REPMGR_APP_MESSAGE)
+ conn->ref_count++;
+ break;
+
+ case REPMGR_OWN_MSG:
+ /*
+ * Since we're in one of the "ready" states we know this
+ * isn't a one-shot request, so we are not giving
+ * ownership of this connection over to the message
+ * thread queue; we're going to keep reading on it
+ * ourselves. The message thread that processes this
+ * request has no need for a connection anyway, since
+ * there is no response that needs to be returned.
+ */
+ conn->input.rep_message->v.gmdb_msg.conn = NULL;
+ if ((ret = process_own_msg(env, conn)) != 0)
+ return (ret);
+ break;
+
+ case REPMGR_APP_RESPONSE:
+ DB_ASSERT(env, conn->cur_resp < conn->aresp &&
+ conn->responses != NULL);
+ resp = &conn->responses[conn->cur_resp];
+ DB_ASSERT(env, F_ISSET(resp, RESP_READING));
+ F_CLR(resp, RESP_READING);
+ if (F_ISSET(resp, RESP_THREAD_WAITING)) {
+ F_SET(resp, RESP_COMPLETE);
+ if ((ret = __repmgr_wake_waiters(env,
+ &conn->response_waiters)) != 0)
+ return (ret);
+ } else {
+ /*
+ * If the calling thread is no longer with us,
+ * yet we're reading, it can only mean we're
+ * reading into a dummy buffer, so free it now.
+ */
+ DB_ASSERT(env, F_ISSET(resp, RESP_DUMMY_BUF));
+ __os_free(env, resp->dbt.data);
+ F_CLR(resp, RESP_IN_USE);
+ }
+ break;
+
+ case REPMGR_RESP_ERROR:
+ default:
+ __db_errx(env, DB_STR_A("3621",
+ "unexpected msg type rcvd in ready state: %d",
+ "%d"), (int)conn->msg_type);
+ return (DB_REP_UNAVAIL);
+ }
+ break;
+
+ case CONN_DEFUNCT:
+ break;
+
+ default:
+ DB_ASSERT(env, FALSE);
+ }
+
+ switch (conn->msg_type) {
+ case REPMGR_HANDSHAKE:
+ case REPMGR_PERMLSN:
+ dbt = &conn->input.repmgr_msg.cntrl;
+ if (dbt->size > 0)
+ __os_free(env, dbt->data);
+ dbt = &conn->input.repmgr_msg.rec;
+ if (dbt->size > 0)
+ __os_free(env, dbt->data);
+ break;
+ default:
+ /*
+ * For the remaining message types buffer ownership has already
+ * been settled above: rep and app messages were handed to the
+ * message queue, OWN messages were queued or freed in
+ * process_own_msg(), and responses were read directly into the
+ * waiting thread's buffer (or a dummy buffer).
+ */
+ break;
+ }
+ __repmgr_reset_for_reading(conn);
+ return (0);
+}
+
+/*
+ * Process one of repmgr's "own" message types, and one that occurs on a regular
+ * (not one-shot) connection.
+ */
+static int
+process_own_msg(env, conn)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+{
+ DB_REP *db_rep;
+ DBT *dbt;
+ REPMGR_SITE *site;
+ REPMGR_MESSAGE *msg;
+ __repmgr_connect_reject_args reject;
+ __repmgr_parm_refresh_args parms;
+ int ret;
+
+ ret = 0;
+ /*
+ * Set "msg" to point to the message struct. If we do all necessary
+ * processing here now, leave it set so that it can be freed. On the
+ * other hand, if we pass it off to the message queue for later
+ * processing by a message thread, we want to avoid freeing the memory
+ * here, so clear the pointer in such a case.
+ */
+ switch (REPMGR_OWN_MSG_TYPE((msg = conn->input.rep_message)->msg_hdr)) {
+ case REPMGR_CONNECT_REJECT:
+ dbt = &msg->v.gmdb_msg.request;
+ if ((ret = __repmgr_connect_reject_unmarshal(env,
+ &reject, dbt->data, dbt->size, NULL)) != 0)
+ return (DB_REP_UNAVAIL);
+
+ /*
+ * If we're being rejected by someone who has more up-to-date
+ * membership information than we do, it means we have been
+ * removed from the group. If we've just gotten started, we can
+ * make one attempt at automatically rejoining; otherwise we bow
+ * out gracefully.
+ */
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "got rejection msg citing version %lu/%lu",
+ (u_long)reject.gen, (u_long)reject.version));
+
+ if (__repmgr_gmdb_version_cmp(env,
+ reject.gen, reject.version) > 0) {
+ if (env->rep_handle->seen_repmsg)
+ ret = DB_DELETED;
+ else if ((ret = __repmgr_defer_op(env,
+ REPMGR_REJOIN)) == 0)
+ ret = DB_REP_UNAVAIL;
+ } else
+ ret = DB_REP_UNAVAIL;
+ DB_ASSERT(env, ret != 0);
+ return (ret);
+
+ case REPMGR_SHARING:
+ if ((ret = __repmgr_queue_put(env, msg)) != 0)
+ return (ret);
+ /* Show that we no longer own this memory. */
+ msg = NULL;
+ break;
+
+ case REPMGR_PARM_REFRESH:
+ dbt = &conn->input.rep_message->v.gmdb_msg.request;
+ if ((ret = __repmgr_parm_refresh_unmarshal(env,
+ &parms, dbt->data, dbt->size, NULL)) != 0)
+ return (DB_REP_UNAVAIL);
+ db_rep = env->rep_handle;
+ DB_ASSERT(env, conn->type == REP_CONNECTION &&
+ IS_KNOWN_REMOTE_SITE(conn->eid));
+ site = SITE_FROM_EID(conn->eid);
+ site->ack_policy = (int)parms.ack_policy;
+ if (F_ISSET(&parms, ELECTABLE_SITE))
+ F_SET(site, SITE_ELECTABLE);
+ else
+ F_CLR(site, SITE_ELECTABLE);
+ F_SET(site, SITE_HAS_PRIO);
+ break;
+
+ case REPMGR_GM_FAILURE:
+ case REPMGR_GM_FORWARD:
+ case REPMGR_JOIN_REQUEST:
+ case REPMGR_JOIN_SUCCESS:
+ case REPMGR_REMOVE_REQUEST:
+ case REPMGR_RESOLVE_LIMBO:
+ default:
+ __db_errx(env, DB_STR_A("3677",
+ "unexpected msg type %lu in process_own_msg", "%lu"),
+ (u_long)REPMGR_OWN_MSG_TYPE(msg->msg_hdr));
+ return (DB_REP_UNAVAIL);
+ }
+ /*
+ * If we haven't given ownership of the msg buffer to another thread,
+ * free it now.
+ */
+ if (msg != NULL)
+ __os_free(env, msg);
+ return (ret);
+}
+
+/*
+ * Examine and verify the incoming version proposal message, and send an
+ * appropriate response.
+ */
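+/*
+ * The proposal carries the [min, max] range of protocol versions the
+ * initiator can speak.  We confirm the highest mutually supported version:
+ * our own DB_REPMGR_VERSION if it falls within the proposed range, otherwise
+ * the proposal's max if we can still speak it.
+ */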
+static int
+send_version_response(env, conn)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+{
+ DB_REP *db_rep;
+ __repmgr_version_proposal_args versions;
+ __repmgr_version_confirmation_args conf;
+ repmgr_netaddr_t *my_addr;
+ char *hostname;
+ u_int8_t buf[__REPMGR_VERSION_CONFIRMATION_SIZE+1];
+ DBT vi;
+ int ret;
+
+ db_rep = env->rep_handle;
+ my_addr = &SITE_FROM_EID(db_rep->self_eid)->net_addr;
+
+ if ((ret = __repmgr_find_version_info(env, conn, &vi)) != 0)
+ return (ret);
+ if (vi.size == 0) {
+ /* No version info, so we must be talking to a v1 site. */
+ hostname = conn->input.repmgr_msg.rec.data;
+ if ((ret = accept_v1_handshake(env, conn, hostname)) != 0)
+ return (ret);
+ if ((ret = __repmgr_send_v1_handshake(env,
+ conn, my_addr->host, strlen(my_addr->host) + 1)) != 0)
+ return (ret);
+ conn->state = CONN_READY;
+ } else {
+ if ((ret = __repmgr_version_proposal_unmarshal(env,
+ &versions, vi.data, vi.size, NULL)) != 0)
+ return (DB_REP_UNAVAIL);
+
+ if (DB_REPMGR_VERSION >= versions.min &&
+ DB_REPMGR_VERSION <= versions.max)
+ conf.version = DB_REPMGR_VERSION;
+ else if (versions.max >= DB_REPMGR_MIN_VERSION &&
+ versions.max <= DB_REPMGR_VERSION)
+ conf.version = versions.max;
+ else {
+ /*
+ * User must have wired up a combination of versions
+ * exceeding what we said we'd support.
+ */
+ __db_errx(env, DB_STR_A("3622",
+ "No available version between %lu and %lu",
+ "%lu %lu"), (u_long)versions.min,
+ (u_long)versions.max);
+ return (DB_REP_UNAVAIL);
+ }
+ conn->version = conf.version;
+
+ __repmgr_version_confirmation_marshal(env, &conf, buf);
+ buf[__REPMGR_VERSION_CONFIRMATION_SIZE] = '\0';
+ DB_ASSERT(env, !IS_SUBORDINATE(db_rep));
+ if ((ret = __repmgr_send_handshake(env,
+ conn, buf, sizeof(buf), 0)) != 0)
+ return (ret);
+
+ conn->state = CONN_PARAMETERS;
+ }
+ return (ret);
+}
+
+/*
+ * Sends a version-aware handshake to the remote site, only after we've verified
+ * that it is indeed version-aware. We can send a v2, v3 or v4 handshake,
+ * depending on the connection's version.
+ *
+ * PUBLIC: int __repmgr_send_handshake __P((ENV *,
+ * PUBLIC: REPMGR_CONNECTION *, void *, size_t, u_int32_t));
+ */
+int
+__repmgr_send_handshake(env, conn, opt, optlen, flags)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ void *opt;
+ size_t optlen;
+ u_int32_t flags;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ DBT cntrl, rec;
+ __repmgr_handshake_args hs;
+ __repmgr_v2handshake_args v2hs;
+ __repmgr_v3handshake_args v3hs;
+ repmgr_netaddr_t *my_addr;
+ size_t hostname_len, rec_len;
+ void *buf;
+ u_int8_t *p;
+ u_int32_t cntrl_len;
+ int ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ my_addr = &SITE_FROM_EID(db_rep->self_eid)->net_addr;
+
+ /*
+ * The cntrl part has various parameters (varies by version). The rec
+ * part has the host name, followed by whatever optional extra data was
+ * passed to us.
+ *
+ * Version awareness was introduced with protocol version 2 (so version
+ * 1 is handled elsewhere).
+ */
+ switch (conn->version) {
+ case 2:
+ cntrl_len = __REPMGR_V2HANDSHAKE_SIZE;
+ break;
+ case 3:
+ cntrl_len = __REPMGR_V3HANDSHAKE_SIZE;
+ break;
+ case 4:
+ cntrl_len = __REPMGR_HANDSHAKE_SIZE;
+ break;
+ default:
+ __db_errx(env, DB_STR_A("3678",
+ "unexpected conn version %lu in send_handshake", "%lu"),
+ (u_long)conn->version);
+ return (DB_REP_UNAVAIL);
+ }
+ hostname_len = strlen(my_addr->host);
+ rec_len = hostname_len + 1 +
+ (opt == NULL ? 0 : optlen);
+
+ if ((ret = __os_malloc(env, cntrl_len + rec_len, &buf)) != 0)
+ return (ret);
+
+ cntrl.data = p = buf;
+ switch (conn->version) {
+ case 2:
+ /* Not allowed to use multi-process feature in v2 group. */
+ DB_ASSERT(env, !IS_SUBORDINATE(db_rep));
+ v2hs.port = my_addr->port;
+ v2hs.priority = rep->priority;
+ __repmgr_v2handshake_marshal(env, &v2hs, p);
+ break;
+ case 3:
+ v3hs.port = my_addr->port;
+ v3hs.priority = rep->priority;
+ v3hs.flags = flags;
+ __repmgr_v3handshake_marshal(env, &v3hs, p);
+ break;
+ case 4:
+ hs.port = my_addr->port;
+ hs.alignment = MEM_ALIGN;
+ hs.ack_policy = (u_int32_t)rep->perm_policy;
+ hs.flags = flags;
+ if (rep->priority > 0)
+ F_SET(&hs, ELECTABLE_SITE);
+ __repmgr_handshake_marshal(env, &hs, p);
+ break;
+ default:
+ DB_ASSERT(env, FALSE);
+ break;
+ }
+ cntrl.size = cntrl_len;
+
+ p = rec.data = &p[cntrl_len];
+ (void)strcpy((char*)p, my_addr->host);
+ p += hostname_len + 1;
+ if (opt != NULL) {
+ memcpy(p, opt, optlen);
+ p += optlen;
+ }
+ rec.size = (u_int32_t)(p - (u_int8_t*)rec.data);
+
+ /* Never block on select thread: pass maxblock as 0. */
+ ret = __repmgr_send_one(env,
+ conn, REPMGR_HANDSHAKE, &cntrl, &rec, 0);
+ __os_free(env, buf);
+ return (ret);
+}
+
+static int
+read_version_response(env, conn)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+{
+ DB_REP *db_rep;
+ __repmgr_version_confirmation_args conf;
+ DBT vi;
+ char *hostname;
+ u_int32_t flags;
+ int ret;
+
+ db_rep = env->rep_handle;
+
+ if ((ret = __repmgr_find_version_info(env, conn, &vi)) != 0)
+ return (ret);
+ hostname = conn->input.repmgr_msg.rec.data;
+ if (vi.size == 0) {
+ if ((ret = accept_v1_handshake(env, conn, hostname)) != 0)
+ return (ret);
+ } else {
+ if ((ret = __repmgr_version_confirmation_unmarshal(env,
+ &conf, vi.data, vi.size, NULL)) != 0)
+ return (DB_REP_UNAVAIL);
+ if (conf.version >= DB_REPMGR_MIN_VERSION &&
+ conf.version <= DB_REPMGR_VERSION)
+ conn->version = conf.version;
+ else {
+ /*
+ * Remote site "confirmed" a version outside of the
+ * range we proposed. It should never do that.
+ */
+ __db_errx(env, DB_STR_A("3623",
+ "Can't support confirmed version %lu", "%lu"),
+ (u_long)conf.version);
+ return (DB_REP_UNAVAIL);
+ }
+
+ if ((ret = accept_handshake(env, conn, hostname)) != 0)
+ return (ret);
+ flags = IS_SUBORDINATE(db_rep) ? REPMGR_SUBORDINATE : 0;
+ if ((ret = __repmgr_send_handshake(env,
+ conn, NULL, 0, flags)) != 0)
+ return (ret);
+ }
+ conn->state = CONN_READY;
+ return (ret);
+}
+
+/*
+ * Examine the rec part of a handshake message to see if it has any version
+ * information in it. This is the magic that allows version-aware sites
+ * to exchange information, and yet avoids tripping up v1 sites, which don't
+ * know how to look for it.
+ *
+ * PUBLIC: int __repmgr_find_version_info __P((ENV *,
+ * PUBLIC: REPMGR_CONNECTION *, DBT *));
+ */
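+/*
+ * rec layout, as implied by the parsing below:
+ *
+ *	host name  '\0'  [marshaled version info]  [pad byte]
+ *
+ * A v1 site sends only the NUL-terminated host name; version-aware sites
+ * append marshaled version data plus a trailing pad byte, which this
+ * function overwrites when it NUL-terminates the buffer.
+ */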
+int
+__repmgr_find_version_info(env, conn, vi)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ DBT *vi;
+{
+ DBT *dbt;
+ char *hostname;
+ u_int32_t hostname_len;
+
+ dbt = &conn->input.repmgr_msg.rec;
+ if (dbt->size == 0) {
+ __db_errx(env, DB_STR("3624",
+ "handshake is missing rec part"));
+ return (DB_REP_UNAVAIL);
+ }
+ hostname = dbt->data;
+ hostname[dbt->size-1] = '\0';
+ hostname_len = (u_int32_t)strlen(hostname);
+ if (hostname_len + 1 == dbt->size) {
+ /*
+ * The rec DBT held only the host name. This is a simple legacy
+ * V1 handshake; it contains no version information.
+ */
+ vi->size = 0;
+ } else {
+ /*
+ * There's more data than just the host name. The remainder is
+ * available to be treated as a normal byte buffer (and read in
+ * by one of the unmarshal functions). Note that the remaining
+ * length should not include the padding byte that we have
+ * already clobbered.
+ */
+ vi->data = &((u_int8_t *)dbt->data)[hostname_len + 1];
+ vi->size = (dbt->size - (hostname_len+1)) - 1;
+ }
+ return (0);
+}
+
+static int
+accept_handshake(env, conn, hostname)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ char *hostname;
+{
+ __repmgr_handshake_args hs;
+ __repmgr_v2handshake_args hs2;
+ __repmgr_v3handshake_args hs3;
+ u_int port;
+ u_int32_t ack, flags;
+ int electable;
+
+ switch (conn->version) {
+ case 2:
+ if (__repmgr_v2handshake_unmarshal(env, &hs2,
+ conn->input.repmgr_msg.cntrl.data,
+ conn->input.repmgr_msg.cntrl.size, NULL) != 0)
+ return (DB_REP_UNAVAIL);
+ port = hs2.port;
+ electable = hs2.priority > 0;
+ ack = flags = 0;
+ break;
+ case 3:
+ if (__repmgr_v3handshake_unmarshal(env, &hs3,
+ conn->input.repmgr_msg.cntrl.data,
+ conn->input.repmgr_msg.cntrl.size, NULL) != 0)
+ return (DB_REP_UNAVAIL);
+ port = hs3.port;
+ electable = hs3.priority > 0;
+ flags = hs3.flags;
+ ack = 0;
+ break;
+ case 4:
+ if (__repmgr_handshake_unmarshal(env, &hs,
+ conn->input.repmgr_msg.cntrl.data,
+ conn->input.repmgr_msg.cntrl.size, NULL) != 0)
+ return (DB_REP_UNAVAIL);
+ port = hs.port;
+ electable = F_ISSET(&hs, ELECTABLE_SITE);
+ flags = hs.flags;
+ ack = hs.ack_policy;
+ break;
+ default:
+ __db_errx(env, DB_STR_A("3679",
+ "unexpected conn version %lu in accept_handshake", "%lu"),
+ (u_long)conn->version);
+ return (DB_REP_UNAVAIL);
+ }
+
+ return (process_parameters(env,
+ conn, hostname, port, ack, electable, flags));
+}
+
+static int
+accept_v1_handshake(env, conn, hostname)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ char *hostname;
+{
+ DB_REPMGR_V1_HANDSHAKE *handshake;
+ u_int32_t prio;
+ int electable;
+
+ handshake = conn->input.repmgr_msg.cntrl.data;
+ if (conn->input.repmgr_msg.cntrl.size != sizeof(*handshake) ||
+ handshake->version != 1) {
+ __db_errx(env, DB_STR("3625", "malformed V1 handshake"));
+ return (DB_REP_UNAVAIL);
+ }
+
+ conn->version = 1;
+ prio = ntohl(handshake->priority);
+ electable = prio > 0;
+ return (process_parameters(env,
+ conn, hostname, handshake->port, 0, electable, 0));
+}
+
+/* Caller must hold mutex. */
+static int
+process_parameters(env, conn, host, port, ack, electable, flags)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ char *host;
+ u_int port;
+ int electable;
+ u_int32_t ack, flags;
+{
+ DB_REP *db_rep;
+ REPMGR_RETRY *retry;
+ REPMGR_SITE *site;
+ __repmgr_connect_reject_args reject;
+ u_int8_t reject_buf[__REPMGR_CONNECT_REJECT_SIZE];
+ int eid, ret;
+
+ db_rep = env->rep_handle;
+
+ /* Connection state can be used to discern incoming versus outgoing. */
+ if (conn->state == CONN_CONNECTED) {
+ /*
+ * Since we initiated this as an outgoing connection, we
+ * obviously already know the host, port and site. We just need
+ * the other site's electability flag (which we'll grab below,
+ * after the big "else" clause).
+ */
+ DB_ASSERT(env, IS_KNOWN_REMOTE_SITE(conn->eid));
+ site = SITE_FROM_EID(conn->eid);
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "handshake from connection to %s:%lu EID %u",
+ site->net_addr.host,
+ (u_long)site->net_addr.port, conn->eid));
+ } else {
+ DB_ASSERT(env, conn->state == CONN_NEGOTIATE ||
+ conn->state == CONN_PARAMETERS);
+ /*
+ * Incoming connection: until now we haven't known what kind of
+ * connection we're dealing with (and in the case of a
+ * REP_CONNECTION, what its EID is); so it must be on the
+ * "orphans" list. But now that we've received the parameters
+ * we'll be able to figure all that out.
+ */
+ if (LF_ISSET(APP_CHANNEL_CONNECTION)) {
+ conn->type = APP_CONNECTION;
+ return (0);
+ } else
+ conn->type = REP_CONNECTION;
+
+ /*
+ * Now that we've been given the host and port, use them to find
+ * the site.
+ */
+ if ((site = __repmgr_lookup_site(env, host, port)) != NULL &&
+ site->membership == SITE_PRESENT) {
+ TAILQ_REMOVE(&db_rep->connections, conn, entries);
+ conn->ref_count--;
+
+ eid = EID_FROM_SITE(site);
+ if (LF_ISSET(REPMGR_SUBORDINATE)) {
+ /*
+ * Accept it, as a supplementary source of
+ * input, but nothing else.
+ */
+ TAILQ_INSERT_TAIL(&site->sub_conns,
+ conn, entries);
+ conn->eid = eid;
+ } else {
+ DB_EVENT(env,
+ DB_EVENT_REP_CONNECT_ESTD, &eid);
+ switch (site->state) {
+ case SITE_PAUSING:
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "handshake from paused site %s:%u EID %u",
+ host, port, eid));
+ retry = site->ref.retry;
+ TAILQ_REMOVE(&db_rep->retries,
+ retry, entries);
+ __os_free(env, retry);
+ break;
+ case SITE_CONNECTED:
+ /*
+ * We got an incoming connection for a
+ * site we were already connected to; at
+ * least we thought we were.
+ */
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "connection from %s:%u EID %u while already connected",
+ host, port, eid));
+ if ((ret = resolve_collision(env,
+ site, conn)) != 0)
+ return (ret);
+ break;
+ case SITE_CONNECTING:
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "handshake from connecting site %s:%u EID %u",
+ host, port, eid));
+ /*
+ * Connector thread will give up when it
+ * sees this site's state change, so we
+ * don't have to do anything else here.
+ */
+ break;
+ default:
+ DB_ASSERT(env, FALSE);
+ }
+ conn->eid = eid;
+ site->state = SITE_CONNECTED;
+ site->ref.conn.in = conn;
+ __os_gettime(env,
+ &site->last_rcvd_timestamp, 1);
+ }
+ } else {
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "rejecting connection from unknown or provisional site %s:%u",
+ host, port));
+ reject.version = db_rep->membership_version;
+ reject.gen = db_rep->member_version_gen;
+ __repmgr_connect_reject_marshal(env,
+ &reject, reject_buf);
+
+ if ((ret = __repmgr_send_own_msg(env, conn,
+ REPMGR_CONNECT_REJECT, reject_buf,
+ __REPMGR_CONNECT_REJECT_SIZE)) != 0)
+ return (ret);
+
+ /*
+ * Since we haven't set conn->eid, bust_connection will
+ * not schedule a retry for this "failure", which is
+ * exactly what we want.
+ */
+ return (DB_REP_UNAVAIL);
+ }
+ }
+
+ if (electable)
+ F_SET(site, SITE_ELECTABLE);
+ else
+ F_CLR(site, SITE_ELECTABLE);
+ F_SET(site, SITE_HAS_PRIO);
+ site->ack_policy = (int)ack;
+
+ /*
+ * If we're moping around wishing we knew who the master was, then
+ * getting in touch with another site might finally provide sufficient
+ * connectivity to find out.
+ */
+ if (!IS_SUBORDINATE(db_rep) && /* us */
+ !__repmgr_master_is_known(env) &&
+ !LF_ISSET(REPMGR_SUBORDINATE)) { /* the remote site */
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "handshake with no known master to wake election thread"));
+ db_rep->new_connection = TRUE;
+ if ((ret = __repmgr_signal(&db_rep->check_election)) != 0)
+ return (ret);
+ }
+
+ return (0);
+}
+
+static int
+resolve_collision(env, site, conn)
+ ENV *env;
+ REPMGR_SITE *site;
+ REPMGR_CONNECTION *conn;
+{
+ int ret;
+
+ /*
+ * No need for site-oriented recovery, since we now have a replacement
+ * connection; so skip bust_connection() and call disable_conn()
+ * directly.
+ *
+ * If we already had an incoming connection, this new one always
+ * replaces it. Whether it also/alternatively replaces an outgoing
+ * connection depends on whether we're client or server (so as to avoid
+ * connection collisions resulting in no remaining connections). (If
+ * it's an older version that doesn't know about our collision
+ * resolution protocol, it will behave like a client.)
+ */
+ if (site->ref.conn.in != NULL) {
+ ret = __repmgr_disable_connection(env, site->ref.conn.in);
+ site->ref.conn.in = NULL;
+ if (ret != 0)
+ return (ret);
+ }
+ if (site->ref.conn.out != NULL &&
+ conn->version >= CONN_COLLISION_VERSION &&
+ __repmgr_is_server(env, site)) {
+ ret = __repmgr_disable_connection(env, site->ref.conn.out);
+ site->ref.conn.out = NULL;
+ if (ret != 0)
+ return (ret);
+ }
+ return (0);
+}
+
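+/*
+ * Handles an incoming PERMLSN (ack) message: records the acknowledged LSN if
+ * it improves on the site's previous ack for the current generation, and
+ * wakes any threads waiting for acknowledgments.
+ */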
+static int
+record_permlsn(env, conn)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+{
+ DB_REP *db_rep;
+ REPMGR_SITE *site;
+ __repmgr_permlsn_args *ackp, ack;
+ SITE_STRING_BUFFER location;
+ u_int32_t gen;
+ int ret;
+ u_int do_log_check;
+
+ db_rep = env->rep_handle;
+ do_log_check = 0;
+
+ if (conn->version == 0 ||
+ !IS_READY_STATE(conn->state) || !IS_VALID_EID(conn->eid)) {
+ __db_errx(env, DB_STR("3682",
+ "unexpected connection info in record_permlsn"));
+ return (DB_REP_UNAVAIL);
+ }
+ site = SITE_FROM_EID(conn->eid);
+
+ /*
+ * Extract the LSN. Save it only if it is an improvement over what the
+ * site has already ack'ed.
+ */
+ if (conn->version == 1) {
+ ackp = conn->input.repmgr_msg.cntrl.data;
+ if (conn->input.repmgr_msg.cntrl.size != sizeof(ack) ||
+ conn->input.repmgr_msg.rec.size != 0) {
+ __db_errx(env, DB_STR("3627", "bad ack msg size"));
+ return (DB_REP_UNAVAIL);
+ }
+ } else {
+ ackp = &ack;
+ if ((ret = __repmgr_permlsn_unmarshal(env, ackp,
+ conn->input.repmgr_msg.cntrl.data,
+ conn->input.repmgr_msg.cntrl.size, NULL)) != 0)
+ return (DB_REP_UNAVAIL);
+ }
+
+ /* Ignore stale acks. */
+ gen = db_rep->region->gen;
+ if (ackp->generation < gen) {
+ VPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "ignoring stale ack (%lu<%lu), from %s",
+ (u_long)ackp->generation, (u_long)gen,
+ __repmgr_format_site_loc(site, location)));
+ return (0);
+ }
+ VPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "got ack [%lu][%lu](%lu) from %s", (u_long)ackp->lsn.file,
+ (u_long)ackp->lsn.offset, (u_long)ackp->generation,
+ __repmgr_format_site_loc(site, location)));
+
+ if (ackp->generation == gen &&
+ LOG_COMPARE(&ackp->lsn, &site->max_ack) == 1) {
+ /*
+ * If file number for this site changed, check lowest log
+ * file needed after recording new permlsn for this site.
+ */
+ if (ackp->lsn.file > site->max_ack.file)
+ do_log_check = 1;
+ memcpy(&site->max_ack, &ackp->lsn, sizeof(DB_LSN));
+ if (do_log_check)
+ check_min_log_file(env);
+ if ((ret = __repmgr_wake_waiters(env,
+ &db_rep->ack_waiters)) != 0)
+ return (ret);
+ }
+ return (0);
+}
+
+/*
+ * Maintains lowest log file still needed by the repgroup. This is stored
+ * in shared rep region so that it is accessible to repmgr subordinate
+ * processes that may not themselves have connections to other sites
+ * (e.g. a separate db_archive process.)
+ */
+static void
+check_min_log_file(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ REPMGR_CONNECTION *conn;
+ REPMGR_SITE *site;
+ u_int32_t min_log;
+ int eid;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ min_log = 0;
+
+ /*
+ * Record the lowest log file number from all connected sites. If this
+ * is a client, ignore the master, because the master does not maintain
+ * nor send out its repmgr perm LSN in this way. Only consider connected
+ * sites, so that a site that has been down a long time cannot
+ * indefinitely prevent log archiving.
+ */
+ FOR_EACH_REMOTE_SITE_INDEX(eid) {
+ if (eid == rep->master_id)
+ continue;
+ site = SITE_FROM_EID(eid);
+ if (site->state == SITE_CONNECTED &&
+ (((conn = site->ref.conn.in) != NULL &&
+ conn->state == CONN_READY) ||
+ ((conn = site->ref.conn.out) != NULL &&
+ conn->state == CONN_READY)) &&
+ !IS_ZERO_LSN(site->max_ack) &&
+ (min_log == 0 || site->max_ack.file < min_log))
+ min_log = site->max_ack.file;
+ }
+ /*
+ * During normal operation min_log should increase over time, but if a
+ * site returns after being disconnected for a while, min_log can
+ * decrease.
+ */
+ if (min_log != 0 && min_log != rep->min_log_file)
+ rep->min_log_file = min_log;
+}
+
+/*
+ * PUBLIC: int __repmgr_write_some __P((ENV *, REPMGR_CONNECTION *));
+ */
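+/*
+ * Drains as much of the connection's outbound queue as the non-blocking
+ * socket will accept.  A short write leaves the remainder queued, with
+ * output->offset recording how far into the current message we got.
+ */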
+int
+__repmgr_write_some(env, conn)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+{
+ QUEUED_OUTPUT *output;
+ REPMGR_FLAT *msg;
+ int bytes, ret;
+
+ while (!STAILQ_EMPTY(&conn->outbound_queue)) {
+ output = STAILQ_FIRST(&conn->outbound_queue);
+ msg = output->msg;
+ if ((bytes = sendsocket(conn->fd, &msg->data[output->offset],
+ msg->length - output->offset, 0)) == SOCKET_ERROR) {
+ switch (ret = net_errno) {
+ case WOULDBLOCK:
+#if defined(DB_REPMGR_EAGAIN) && DB_REPMGR_EAGAIN != WOULDBLOCK
+ case DB_REPMGR_EAGAIN:
+#endif
+ return (0);
+ default:
+ __repmgr_fire_conn_err_event(env, conn, ret);
+ STAT(env->rep_handle->
+ region->mstat.st_connection_drop++);
+ return (DB_REP_UNAVAIL);
+ }
+ }
+
+ if ((output->offset += (size_t)bytes) >= msg->length) {
+ STAILQ_REMOVE_HEAD(&conn->outbound_queue, entries);
+ __os_free(env, output);
+ conn->out_queue_length--;
+ if (--msg->ref_count <= 0)
+ __os_free(env, msg);
+
+ /*
+ * We've achieved enough movement to free up at least
+ * one space in the outgoing queue. Wake any message
+ * threads that may be waiting for space. Leave
+ * CONGESTED state so that when the queue reaches the
+ * high-water mark again, the filling thread will be
+ * allowed to try waiting again.
+ */
+ conn->state = CONN_READY;
+ if ((ret = __repmgr_signal(&conn->drained)) != 0)
+ return (ret);
+ }
+ }
+
+ return (0);
+}
diff --git a/src/repmgr/repmgr_stat.c b/src/repmgr/repmgr_stat.c
new file mode 100644
index 00000000..fd6dabd3
--- /dev/null
+++ b/src/repmgr/repmgr_stat.c
@@ -0,0 +1,363 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+#ifdef HAVE_STATISTICS
+static int __repmgr_print_all __P((ENV *, u_int32_t));
+static int __repmgr_print_sites __P((ENV *));
+static int __repmgr_print_stats __P((ENV *, u_int32_t));
+static int __repmgr_stat __P((ENV *, DB_REPMGR_STAT **, u_int32_t));
+
+/*
+ * __repmgr_stat_pp --
+ * DB_ENV->repmgr_stat pre/post processing.
+ *
+ * PUBLIC: int __repmgr_stat_pp __P((DB_ENV *, DB_REPMGR_STAT **, u_int32_t));
+ */
+int
+__repmgr_stat_pp(dbenv, statp, flags)
+ DB_ENV *dbenv;
+ DB_REPMGR_STAT **statp;
+ u_int32_t flags;
+{
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG_XX(
+ env, rep_handle, "DB_ENV->repmgr_stat", DB_INIT_REP);
+
+ if ((ret = __db_fchk(env,
+ "DB_ENV->repmgr_stat", flags, DB_STAT_CLEAR)) != 0)
+ return (ret);
+
+ return (__repmgr_stat(env, statp, flags));
+}
+
+/*
+ * __repmgr_stat --
+ * ENV->repmgr_stat.
+ */
+static int
+__repmgr_stat(env, statp, flags)
+ ENV *env;
+ DB_REPMGR_STAT **statp;
+ u_int32_t flags;
+{
+ DB_REP *db_rep;
+ DB_REPMGR_STAT *copy, *stats;
+ uintmax_t tmp;
+ int ret;
+
+ db_rep = env->rep_handle;
+ stats = &db_rep->region->mstat;
+
+ *statp = NULL;
+
+ /* Allocate a stat struct to return to the user. */
+ if ((ret = __os_umalloc(env, sizeof(DB_REPMGR_STAT), &copy)) != 0)
+ return (ret);
+
+ memcpy(copy, stats, sizeof(*stats));
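+ /*
+ * st_max_elect_threads describes reserved space rather than a running
+ * counter, so preserve it across a DB_STAT_CLEAR.
+ */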
+ if (LF_ISSET(DB_STAT_CLEAR)) {
+ tmp = stats->st_max_elect_threads;
+ memset(stats, 0, sizeof(DB_REPMGR_STAT));
+ stats->st_max_elect_threads = tmp;
+ }
+
+ *statp = copy;
+ return (0);
+}
+
+/*
+ * __repmgr_stat_print_pp --
+ * DB_ENV->repmgr_stat_print pre/post processing.
+ *
+ * PUBLIC: int __repmgr_stat_print_pp __P((DB_ENV *, u_int32_t));
+ */
+int
+__repmgr_stat_print_pp(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG_XX(
+ env, rep_handle, "DB_ENV->repmgr_stat_print", DB_INIT_REP);
+
+ if ((ret = __db_fchk(env, "DB_ENV->repmgr_stat_print",
+ flags, DB_STAT_ALL | DB_STAT_CLEAR)) != 0)
+ return (ret);
+
+ return (__repmgr_stat_print(env, flags));
+}
+
+/*
+ * PUBLIC: int __repmgr_stat_print __P((ENV *, u_int32_t));
+ */
+int
+__repmgr_stat_print(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ u_int32_t orig_flags;
+ int ret;
+
+ orig_flags = flags;
+ LF_CLR(DB_STAT_CLEAR | DB_STAT_SUBSYSTEM);
+ if (flags == 0 || LF_ISSET(DB_STAT_ALL)) {
+ if ((ret = __repmgr_print_stats(env, orig_flags)) == 0)
+ ret = __repmgr_print_sites(env);
+ if (flags == 0 || ret != 0)
+ return (ret);
+ }
+
+ if (LF_ISSET(DB_STAT_ALL) &&
+ (ret = __repmgr_print_all(env, orig_flags)) != 0)
+ return (ret);
+
+ return (0);
+}
+
+static int
+__repmgr_print_stats(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ DB_REPMGR_STAT *sp;
+ int ret;
+
+ if ((ret = __repmgr_stat(env, &sp, flags)) != 0)
+ return (ret);
+
+ __db_dl(env, "Number of PERM messages not acknowledged",
+ (u_long)sp->st_perm_failed);
+ __db_dl(env, "Number of messages queued due to network delay",
+ (u_long)sp->st_msgs_queued);
+ __db_dl(env, "Number of messages discarded due to queue length",
+ (u_long)sp->st_msgs_dropped);
+ __db_dl(env, "Number of existing connections dropped",
+ (u_long)sp->st_connection_drop);
+ __db_dl(env, "Number of failed new connection attempts",
+ (u_long)sp->st_connect_fail);
+ __db_dl(env, "Number of currently active election threads",
+ (u_long)sp->st_elect_threads);
+ __db_dl(env, "Election threads for which space is reserved",
+ (u_long)sp->st_max_elect_threads);
+
+ __os_ufree(env, sp);
+
+ return (0);
+}
+
+static int
+__repmgr_print_sites(env)
+ ENV *env;
+{
+ DB_REPMGR_SITE *list;
+ DB_MSGBUF mb;
+ u_int count, i;
+ int ret;
+
+ if ((ret = __repmgr_site_list(env->dbenv, &count, &list)) != 0)
+ return (ret);
+
+ if (count == 0)
+ return (0);
+
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "DB_REPMGR site information:");
+
+ DB_MSGBUF_INIT(&mb);
+ for (i = 0; i < count; ++i) {
+ __db_msgadd(env, &mb, "%s (eid: %d, port: %u",
+ list[i].host, list[i].eid, list[i].port);
+ if (list[i].status != 0)
+ __db_msgadd(env, &mb, ", %sconnected",
+ list[i].status == DB_REPMGR_CONNECTED ? "" : "dis");
+ __db_msgadd(env, &mb, ", %speer",
+ F_ISSET(&list[i], DB_REPMGR_ISPEER) ? "" : "non-");
+ __db_msgadd(env, &mb, ")");
+ DB_MSGBUF_FLUSH(env, &mb);
+ }
+
+ __os_ufree(env, list);
+
+ return (0);
+}
+
+/*
+ * __repmgr_print_all --
+ * Display debugging replication manager statistics.
+ */
+static int
+__repmgr_print_all(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(flags, 0);
+ return (0);
+}
+
+#else /* !HAVE_STATISTICS */
+
+int
+__repmgr_stat_pp(dbenv, statp, flags)
+ DB_ENV *dbenv;
+ DB_REPMGR_STAT **statp;
+ u_int32_t flags;
+{
+ COMPQUIET(statp, NULL);
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(dbenv->env));
+}
+
+int
+__repmgr_stat_print_pp(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(dbenv->env));
+}
+#endif
+
+/*
+ * PUBLIC: int __repmgr_site_list __P((DB_ENV *, u_int *, DB_REPMGR_SITE **));
+ */
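+/*
+ * Returns a snapshot of the remote sites in a single allocation: an array of
+ * DB_REPMGR_SITE structs followed by the host-name strings they point into,
+ * so the caller can release everything with one free.  A minimal caller
+ * sketch, assuming an opened and configured DB_ENV *dbenv:
+ *
+ *	DB_REPMGR_SITE *list;
+ *	u_int count, i;
+ *	if (dbenv->repmgr_site_list(dbenv, &count, &list) == 0) {
+ *		for (i = 0; i < count; i++)
+ *			printf("%s:%u\n", list[i].host, list[i].port);
+ *		free(list);
+ *	}
+ */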
+int
+__repmgr_site_list(dbenv, countp, listp)
+ DB_ENV *dbenv;
+ u_int *countp;
+ DB_REPMGR_SITE **listp;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ DB_REPMGR_SITE *status;
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ REPMGR_SITE *site;
+ size_t array_size, total_size;
+ int eid, locked, ret;
+ u_int count, i;
+ char *name;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+ ret = 0;
+
+ ENV_NOT_CONFIGURED(
+ env, db_rep->region, "DB_ENV->repmgr_site_list", DB_INIT_REP);
+
+ if (REP_ON(env)) {
+ rep = db_rep->region;
+ LOCK_MUTEX(db_rep->mutex);
+ locked = TRUE;
+
+ ENV_ENTER(env, ip);
+ if (rep->siteinfo_seq > db_rep->siteinfo_seq)
+ ret = __repmgr_sync_siteaddr(env);
+ ENV_LEAVE(env, ip);
+ if (ret != 0)
+ goto err;
+ } else {
+ rep = NULL;
+ locked = FALSE;
+ }
+
+ /* Initialize for empty list or error return. */
+ *countp = 0;
+ *listp = NULL;
+
+ /*
+ * First, add up how much memory we need for the host names, excluding
+ * the local site.
+ */
+ for (i = 0, count = 0, total_size = 0; i < db_rep->site_cnt; i++) {
+ site = &db_rep->sites[i];
+
+ if ((int)i == db_rep->self_eid || site->membership == 0)
+ continue;
+
+ /* Make room for the NUL terminating byte. */
+ total_size += strlen(site->net_addr.host) + 1;
+ count++;
+ }
+ if (count == 0)
+ goto err;
+ array_size = sizeof(DB_REPMGR_SITE) * count;
+ total_size += array_size;
+
+ if ((ret = __os_umalloc(env, total_size, &status)) != 0)
+ goto err;
+
+ /*
+ * Put the storage for the host names after the array of structs. This
+ * way, the caller can free the whole thing in one single operation.
+ */
+ name = (char *)((u_int8_t *)status + array_size);
+ for (eid = 0, i = 0; eid < (int)db_rep->site_cnt; eid++) {
+ site = &db_rep->sites[eid];
+ if (eid == db_rep->self_eid || site->membership == 0)
+ continue;
+
+ /* If we don't have rep, we can't really know EID yet. */
+ status[i].eid = rep ? eid : DB_EID_INVALID;
+
+ status[i].host = name;
+ (void)strcpy(name, site->net_addr.host);
+ name += strlen(name) + 1;
+
+ status[i].port = site->net_addr.port;
+
+ status[i].flags = 0;
+
+ if (FLD_ISSET(site->config, DB_REPMGR_PEER))
+ F_SET(&status[i], DB_REPMGR_ISPEER);
+
+ /*
+ * If we haven't started a communications thread, connection
+ * status is kind of meaningless. This distinction is useful
+ * for calls from the db_stat utility: it could be useful for
+ * db_stat to display known sites with EID; but would be
+ * confusing for it to display "disconnected" if another process
+ * does indeed have a connection established (db_stat can't know
+ * that).
+ */
+ if (db_rep->selector == NULL)
+ status[i].status = 0;
+ else if (site->state != SITE_CONNECTED)
+ status[i].status = DB_REPMGR_DISCONNECTED;
+ else if ((site->ref.conn.in != NULL &&
+ IS_READY_STATE(site->ref.conn.in->state)) ||
+ (site->ref.conn.out != NULL &&
+ IS_READY_STATE(site->ref.conn.out->state)))
+ status[i].status = DB_REPMGR_CONNECTED;
+ else
+ status[i].status = DB_REPMGR_DISCONNECTED;
+
+ i++;
+ }
+
+ *countp = count;
+ *listp = status;
+
+err: if (locked)
+ UNLOCK_MUTEX(db_rep->mutex);
+ return (ret);
+}
diff --git a/src/repmgr/repmgr_stub.c b/src/repmgr/repmgr_stub.c
new file mode 100644
index 00000000..734c2240
--- /dev/null
+++ b/src/repmgr/repmgr_stub.c
@@ -0,0 +1,262 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef HAVE_REPLICATION_THREADS
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * If the library wasn't compiled with replication support, various routines
+ * aren't available. Stub them here, returning an appropriate error.
+ */
+static int __db_norepmgr __P((DB_ENV *));
+
+/*
+ * __db_norepmgr --
+ * Error when a Berkeley DB build doesn't include replication mgr support.
+ */
+static int
+__db_norepmgr(dbenv)
+ DB_ENV *dbenv;
+{
+ __db_errx(dbenv->env, DB_STR("3628",
+ "library build did not include support for the Replication Manager"));
+ return (DB_OPNOTSUP);
+}
+
+/*
+ * PUBLIC: #ifndef HAVE_REPLICATION_THREADS
+ * PUBLIC: int __repmgr_close __P((ENV *));
+ * PUBLIC: #endif
+ */
+int
+__repmgr_close(env)
+ ENV *env;
+{
+ COMPQUIET(env, NULL);
+ return (0);
+}
+
+/*
+ * PUBLIC: #ifndef HAVE_REPLICATION_THREADS
+ * PUBLIC: int __repmgr_get_ack_policy __P((DB_ENV *, int *));
+ * PUBLIC: #endif
+ */
+int
+__repmgr_get_ack_policy(dbenv, policy)
+ DB_ENV *dbenv;
+ int *policy;
+{
+ COMPQUIET(policy, NULL);
+ return (__db_norepmgr(dbenv));
+}
+
+/*
+ * PUBLIC: #ifndef HAVE_REPLICATION_THREADS
+ * PUBLIC: int __repmgr_set_ack_policy __P((DB_ENV *, int));
+ * PUBLIC: #endif
+ */
+int
+__repmgr_set_ack_policy(dbenv, policy)
+ DB_ENV *dbenv;
+ int policy;
+{
+ COMPQUIET(policy, 0);
+ return (__db_norepmgr(dbenv));
+}
+
+/*
+ * PUBLIC: #ifndef HAVE_REPLICATION_THREADS
+ * PUBLIC: int __repmgr_site
+ * PUBLIC: __P((DB_ENV *, const char *, u_int, DB_SITE **, u_int32_t));
+ * PUBLIC: #endif
+ */
+int
+__repmgr_site(dbenv, host, port, dbsitep, flags)
+ DB_ENV *dbenv;
+ const char *host;
+ u_int port;
+ DB_SITE **dbsitep;
+ u_int32_t flags;
+{
+ COMPQUIET(host, NULL);
+ COMPQUIET(port, 0);
+ COMPQUIET(dbsitep, NULL);
+ COMPQUIET(flags, 0);
+ return (__db_norepmgr(dbenv));
+}
+
+/*
+ * PUBLIC: #ifndef HAVE_REPLICATION_THREADS
+ * PUBLIC: int __repmgr_site_by_eid __P((DB_ENV *, int, DB_SITE **));
+ * PUBLIC: #endif
+ */
+int
+__repmgr_site_by_eid(dbenv, eid, dbsitep)
+ DB_ENV *dbenv;
+ int eid;
+ DB_SITE **dbsitep;
+{
+ COMPQUIET(eid, 0);
+ COMPQUIET(dbsitep, NULL);
+ return (__db_norepmgr(dbenv));
+}
+
+/*
+ * PUBLIC: #ifndef HAVE_REPLICATION_THREADS
+ * PUBLIC: int __repmgr_local_site
+ * PUBLIC: __P((DB_ENV *, DB_SITE **));
+ * PUBLIC: #endif
+ */
+int
+__repmgr_local_site(dbenv, dbsitep)
+ DB_ENV *dbenv;
+ DB_SITE **dbsitep;
+{
+ COMPQUIET(dbsitep, NULL);
+ return (__db_norepmgr(dbenv));
+}
+
+/*
+ * PUBLIC: #ifndef HAVE_REPLICATION_THREADS
+ * PUBLIC: int __repmgr_site_list __P((DB_ENV *, u_int *, DB_REPMGR_SITE **));
+ * PUBLIC: #endif
+ */
+int
+__repmgr_site_list(dbenv, countp, listp)
+ DB_ENV *dbenv;
+ u_int *countp;
+ DB_REPMGR_SITE **listp;
+{
+ COMPQUIET(countp, NULL);
+ COMPQUIET(listp, NULL);
+ return (__db_norepmgr(dbenv));
+}
+
+/*
+ * PUBLIC: #ifndef HAVE_REPLICATION_THREADS
+ * PUBLIC: int __repmgr_start __P((DB_ENV *, int, u_int32_t));
+ * PUBLIC: #endif
+ */
+int
+__repmgr_start(dbenv, nthreads, flags)
+ DB_ENV *dbenv;
+ int nthreads;
+ u_int32_t flags;
+{
+ COMPQUIET(nthreads, 0);
+ COMPQUIET(flags, 0);
+ return (__db_norepmgr(dbenv));
+}
+
+/*
+ * PUBLIC: #ifndef HAVE_REPLICATION_THREADS
+ * PUBLIC: int __repmgr_stat_pp __P((DB_ENV *, DB_REPMGR_STAT **, u_int32_t));
+ * PUBLIC: #endif
+ */
+int
+__repmgr_stat_pp(dbenv, statp, flags)
+ DB_ENV *dbenv;
+ DB_REPMGR_STAT **statp;
+ u_int32_t flags;
+{
+ COMPQUIET(statp, NULL);
+ COMPQUIET(flags, 0);
+ return (__db_norepmgr(dbenv));
+}
+
+/*
+ * PUBLIC: #ifndef HAVE_REPLICATION_THREADS
+ * PUBLIC: int __repmgr_stat_print_pp __P((DB_ENV *, u_int32_t));
+ * PUBLIC: #endif
+ */
+int
+__repmgr_stat_print_pp(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+ return (__db_norepmgr(dbenv));
+}
+
+/*
+ * PUBLIC: #ifndef HAVE_REPLICATION_THREADS
+ * PUBLIC: int __repmgr_handle_event __P((ENV *, u_int32_t, void *));
+ * PUBLIC: #endif
+ */
+int
+__repmgr_handle_event(env, event, info)
+ ENV *env;
+ u_int32_t event;
+ void *info;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(event, 0);
+ COMPQUIET(info, NULL);
+
+ /*
+ * It's not an error for this function to be called. Replication calls
+ * this to let repmgr handle events. If repmgr isn't part of the build,
+ * all replication events should be forwarded to the application.
+ */
+ return (DB_EVENT_NOT_HANDLED);
+}
+
+/*
+ * PUBLIC: #ifndef HAVE_REPLICATION_THREADS
+ * PUBLIC: int __repmgr_channel __P((DB_ENV *, int, DB_CHANNEL **, u_int32_t));
+ * PUBLIC: #endif
+ */
+int
+__repmgr_channel(dbenv, eid, dbchannelp, flags)
+ DB_ENV *dbenv;
+ int eid;
+ DB_CHANNEL **dbchannelp;
+ u_int32_t flags;
+{
+ COMPQUIET(eid, 0);
+ COMPQUIET(dbchannelp, NULL);
+ COMPQUIET(flags, 0);
+ return (__db_norepmgr(dbenv));
+}
+
+/*
+ * PUBLIC: #ifndef HAVE_REPLICATION_THREADS
+ * PUBLIC: int __repmgr_set_msg_dispatch __P((DB_ENV *,
+ * PUBLIC: void (*)(DB_ENV *, DB_CHANNEL *, DBT *, u_int32_t, u_int32_t),
+ * PUBLIC: u_int32_t));
+ * PUBLIC: #endif
+ */
+int
+__repmgr_set_msg_dispatch(dbenv, dispatch, flags)
+ DB_ENV *dbenv;
+ void (*dispatch) __P((DB_ENV *,
+ DB_CHANNEL *, DBT *, u_int32_t, u_int32_t));
+ u_int32_t flags;
+{
+ COMPQUIET(dispatch, NULL);
+ COMPQUIET(flags, 0);
+ return (__db_norepmgr(dbenv));
+}
+
+/*
+ * PUBLIC: #ifndef HAVE_REPLICATION_THREADS
+ * PUBLIC: int __repmgr_init_recover __P((ENV *, DB_DISTAB *));
+ * PUBLIC: #endif
+ */
+int
+__repmgr_init_recover(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(dtabp, NULL);
+ return (0);
+}
+#endif /* !HAVE_REPLICATION_THREADS */
diff --git a/src/repmgr/repmgr_util.c b/src/repmgr/repmgr_util.c
new file mode 100644
index 00000000..c2439436
--- /dev/null
+++ b/src/repmgr/repmgr_util.c
@@ -0,0 +1,2086 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/txn.h"
+
+#define INITIAL_SITES_ALLOCATION 3 /* Arbitrary guess. */
+
+static int get_eid __P((ENV *, const char *, u_int, int *));
+static int __repmgr_addrcmp __P((repmgr_netaddr_t *, repmgr_netaddr_t *));
+static int read_gmdb __P((ENV *, DB_THREAD_INFO *, u_int8_t **, size_t *));
+
+/*
+ * Schedules a future attempt to re-establish a connection with the given site.
+ * Usually, we wait the configured retry_wait period. But if the "immediate"
+ * parameter is given as TRUE, we'll make the wait time 0, and put the request
+ * at the _beginning_ of the retry queue.
+ *
+ * PUBLIC: int __repmgr_schedule_connection_attempt __P((ENV *, int, int));
+ *
+ * !!!
+ * Caller should hold mutex.
+ *
+ * Unless an error occurs, we always attempt to wake the main thread;
+ * __repmgr_bust_connection relies on this behavior.
+ */
+int
+__repmgr_schedule_connection_attempt(env, eid, immediate)
+ ENV *env;
+ int eid;
+ int immediate;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ REPMGR_RETRY *retry, *target;
+ REPMGR_SITE *site;
+ db_timespec t;
+ int ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ if ((ret = __os_malloc(env, sizeof(*retry), &retry)) != 0)
+ return (ret);
+
+ DB_ASSERT(env, IS_VALID_EID(eid));
+ site = SITE_FROM_EID(eid);
+ __os_gettime(env, &t, 1);
+ if (immediate)
+ TAILQ_INSERT_HEAD(&db_rep->retries, retry, entries);
+ else {
+ TIMESPEC_ADD_DB_TIMEOUT(&t, rep->connection_retry_wait);
+ /*
+ * Insert the new "retry" on the (time-ordered) list in its
+ * proper position. To do so, find the list entry ("target")
+ * with a later time; insert the new entry just before that.
+ */
+ TAILQ_FOREACH(target, &db_rep->retries, entries) {
+ if (timespeccmp(&target->time, &t, >))
+ break;
+ }
+ if (target == NULL)
+ TAILQ_INSERT_TAIL(&db_rep->retries, retry, entries);
+ else
+ TAILQ_INSERT_BEFORE(target, retry, entries);
+ }
+ retry->eid = eid;
+ retry->time = t;
+
+ site->state = SITE_PAUSING;
+ site->ref.retry = retry;
+
+ return (__repmgr_wake_main_thread(env));
+}
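+
+/*
+ * Editor's note: a hypothetical timeline, not part of the original source.
+ * With connection_retry_wait == 5s, attempts scheduled at t=0 and t=1 land
+ * on the queue due at t=5 and t=6, keeping it time-ordered; an "immediate"
+ * request gets a due time of "now" and jumps straight to the head.
+ */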
+
+/*
+ * Determines whether a remote site should be considered a "server" to us as a
+ * "client" (in typical client/server terminology, not to be confused with our
+ * usual use of the term "client" as in the master/client replication role), or
+ * vice versa.
+ *
+ * PUBLIC: int __repmgr_is_server __P((ENV *, REPMGR_SITE *));
+ */
+int
+__repmgr_is_server(env, site)
+ ENV *env;
+ REPMGR_SITE *site;
+{
+ DB_REP *db_rep;
+ int cmp;
+
+ db_rep = env->rep_handle;
+ cmp = __repmgr_addrcmp(&site->net_addr,
+ &SITE_FROM_EID(db_rep->self_eid)->net_addr);
+ DB_ASSERT(env, cmp != 0);
+
+ /*
+ * The mnemonic here is that a server conventionally has a
+ * small well-known port number, while clients typically use a port
+ * number from the higher ephemeral range. So, for the remote site to
+ * be considered a server, its address should have compared as lower
+ * than ours.
+ */
+ return (cmp == -1);
+}
+
+/*
+ * Compare two network addresses (lexicographically), and return -1, 0, or 1, as
+ * the first is less than, equal to, or greater than the second.
+ */
+static int
+__repmgr_addrcmp(addr1, addr2)
+ repmgr_netaddr_t *addr1, *addr2;
+{
+ int cmp;
+
+ cmp = strcmp(addr1->host, addr2->host);
+ if (cmp != 0)
+ return (cmp < 0 ? -1 : 1);
+
+ if (addr1->port < addr2->port)
+ return (-1);
+ else if (addr1->port > addr2->port)
+ return (1);
+ return (0);
+}
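+
+/*
+ * Editor's note: a hypothetical worked example, not part of the original
+ * source.  Comparing "alpha:9001" against "beta:8000", strcmp() decides on
+ * the host names alone, so "alpha:9001" compares lower and its site would
+ * play the "server" role; the ports only break ties between equal hosts:
+ *
+ *	repmgr_netaddr_t a, b;
+ *	a.host = "alpha"; a.port = 9001;
+ *	b.host = "beta";  b.port = 8000;
+ *	cmp = __repmgr_addrcmp(&a, &b);		(yields -1)
+ */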
+
+/*
+ * Initialize the necessary control structures to begin reading a new input
+ * message.
+ *
+ * PUBLIC: void __repmgr_reset_for_reading __P((REPMGR_CONNECTION *));
+ */
+void
+__repmgr_reset_for_reading(con)
+ REPMGR_CONNECTION *con;
+{
+ con->reading_phase = SIZES_PHASE;
+ __repmgr_iovec_init(&con->iovecs);
+ __repmgr_add_buffer(&con->iovecs,
+ con->msg_hdr_buf, __REPMGR_MSG_HDR_SIZE);
+}
+
+/*
+ * Constructs a DB_REPMGR_CONNECTION structure.
+ *
+ * PUBLIC: int __repmgr_new_connection __P((ENV *,
+ * PUBLIC: REPMGR_CONNECTION **, socket_t, int));
+ */
+int
+__repmgr_new_connection(env, connp, s, state)
+ ENV *env;
+ REPMGR_CONNECTION **connp;
+ socket_t s;
+ int state;
+{
+ REPMGR_CONNECTION *c;
+ int ret;
+
+ if ((ret = __os_calloc(env, 1, sizeof(REPMGR_CONNECTION), &c)) != 0)
+ return (ret);
+ if ((ret = __repmgr_alloc_cond(&c->drained)) != 0) {
+ __os_free(env, c);
+ return (ret);
+ }
+ if ((ret = __repmgr_init_waiters(env, &c->response_waiters)) != 0) {
+ (void)__repmgr_free_cond(&c->drained);
+ __os_free(env, c);
+ return (ret);
+ }
+
+ c->fd = s;
+ c->state = state;
+ c->type = UNKNOWN_CONN_TYPE;
+#ifdef DB_WIN32
+ c->event_object = WSA_INVALID_EVENT;
+#endif
+
+ STAILQ_INIT(&c->outbound_queue);
+ c->out_queue_length = 0;
+
+ __repmgr_reset_for_reading(c);
+ *connp = c;
+
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_set_keepalive __P((ENV *, REPMGR_CONNECTION *));
+ */
+int
+__repmgr_set_keepalive(env, conn)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+{
+ int ret, sockopt;
+
+ ret = 0;
+#ifdef SO_KEEPALIVE
+ sockopt = 1;
+ if (setsockopt(conn->fd, SOL_SOCKET,
+ SO_KEEPALIVE, (sockopt_t)&sockopt, sizeof(sockopt)) != 0) {
+ ret = net_errno;
+ __db_err(env, ret, DB_STR("3626",
+ "can't set KEEPALIVE socket option"));
+ (void)__repmgr_destroy_conn(env, conn);
+ }
+#endif
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_new_site __P((ENV *, REPMGR_SITE**,
+ * PUBLIC: const char *, u_int));
+ *
+ * Manipulates the process-local copy of the sites list. So, callers should
+ * hold the db_rep->mutex (except for single-threaded, pre-open configuration).
+ */
+int
+__repmgr_new_site(env, sitep, host, port)
+ ENV *env;
+ REPMGR_SITE **sitep;
+ const char *host;
+ u_int port;
+{
+ DB_REP *db_rep;
+ REPMGR_CONNECTION *conn;
+ REPMGR_SITE *site, *sites;
+ char *p;
+ u_int i, new_site_max;
+ int ret;
+
+ db_rep = env->rep_handle;
+ if (db_rep->site_cnt >= db_rep->site_max) {
+ new_site_max = db_rep->site_max == 0 ?
+ INITIAL_SITES_ALLOCATION : db_rep->site_max * 2;
+ if ((ret = __os_malloc(env,
+ sizeof(REPMGR_SITE) * new_site_max, &sites)) != 0)
+ return (ret);
+ if (db_rep->site_max > 0) {
+ /*
+ * For each site in the array, copy the old struct to
+ * the space allocated for the new struct. But the
+ * sub_conns list header (and one of the conn structs on
+ * the list, if any) contain pointers to the address of
+ * the old list header; so we have to move them
+ * explicitly. If not for that, we could use a simple
+ * __os_realloc() call.
+ */
+ for (i = 0; i < db_rep->site_cnt; i++) {
+ sites[i] = db_rep->sites[i];
+ TAILQ_INIT(&sites[i].sub_conns);
+ while (!TAILQ_EMPTY(
+ &db_rep->sites[i].sub_conns)) {
+ conn = TAILQ_FIRST(
+ &db_rep->sites[i].sub_conns);
+ TAILQ_REMOVE(
+ &db_rep->sites[i].sub_conns,
+ conn, entries);
+ TAILQ_INSERT_TAIL(&sites[i].sub_conns,
+ conn, entries);
+ }
+ }
+ __os_free(env, db_rep->sites);
+ }
+ db_rep->sites = sites;
+ db_rep->site_max = new_site_max;
+ }
+ if ((ret = __os_strdup(env, host, &p)) != 0) {
+ /* No harm in leaving the increased site_max intact. */
+ return (ret);
+ }
+ site = &db_rep->sites[db_rep->site_cnt++];
+
+ site->net_addr.host = p;
+ site->net_addr.port = (u_int16_t)port;
+
+ ZERO_LSN(site->max_ack);
+ site->ack_policy = 0;
+ site->alignment = 0;
+ site->flags = 0;
+ timespecclear(&site->last_rcvd_timestamp);
+ TAILQ_INIT(&site->sub_conns);
+ site->connector = NULL;
+ site->ref.conn.in = site->ref.conn.out = NULL;
+ site->state = SITE_IDLE;
+
+ site->membership = 0;
+ site->config = 0;
+
+ *sitep = site;
+ return (0);
+}
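+
+/*
+ * Editor's note, not part of the original source: the reason a plain
+ * __os_realloc() is unsafe above is that an empty TAILQ head's tqh_last
+ * field points at its own tqh_first field, and a non-empty list's first
+ * element points back into the head the same way; after the array is
+ * copied, those pointers would still reference the old, freed storage.
+ * TAILQ_INIT() plus re-inserting each connection rebuilds the pointers
+ * at the struct's new address.
+ */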
+
+/*
+ * PUBLIC: int __repmgr_create_mutex __P((ENV *, mgr_mutex_t **));
+ */
+int
+__repmgr_create_mutex(env, mtxp)
+ ENV *env;
+ mgr_mutex_t **mtxp;
+{
+ mgr_mutex_t *mtx;
+ int ret;
+
+ if ((ret = __os_malloc(env, sizeof(mgr_mutex_t), &mtx)) == 0 &&
+ (ret = __repmgr_create_mutex_pf(mtx)) != 0) {
+ __os_free(env, mtx);
+ }
+ if (ret == 0)
+ *mtxp = mtx;
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_destroy_mutex __P((ENV *, mgr_mutex_t *));
+ */
+int
+__repmgr_destroy_mutex(env, mtx)
+ ENV *env;
+ mgr_mutex_t *mtx;
+{
+ int ret;
+
+ ret = __repmgr_destroy_mutex_pf(mtx);
+ __os_free(env, mtx);
+ return (ret);
+}
+
+/*
+ * Kind of like a destructor for a repmgr_netaddr_t: cleans up any subordinate
+ * allocated memory pointed to by the addr, though it does not free the struct
+ * itself.
+ *
+ * PUBLIC: void __repmgr_cleanup_netaddr __P((ENV *, repmgr_netaddr_t *));
+ */
+void
+__repmgr_cleanup_netaddr(env, addr)
+ ENV *env;
+ repmgr_netaddr_t *addr;
+{
+ if (addr->host != NULL) {
+ __os_free(env, addr->host);
+ addr->host = NULL;
+ }
+}
+
+/*
+ * PUBLIC: void __repmgr_iovec_init __P((REPMGR_IOVECS *));
+ */
+void
+__repmgr_iovec_init(v)
+ REPMGR_IOVECS *v;
+{
+ v->offset = v->count = 0;
+ v->total_bytes = 0;
+}
+
+/*
+ * PUBLIC: void __repmgr_add_buffer __P((REPMGR_IOVECS *, void *, size_t));
+ *
+ * !!!
+ * There is no checking for overflow of the vectors[5] array.
+ */
+void
+__repmgr_add_buffer(v, address, length)
+ REPMGR_IOVECS *v;
+ void *address;
+ size_t length;
+{
+ if (length > 0) {
+ v->vectors[v->count].iov_base = address;
+ v->vectors[v->count++].iov_len = (u_long)length;
+ v->total_bytes += length;
+ }
+}
+
+/*
+ * PUBLIC: void __repmgr_add_dbt __P((REPMGR_IOVECS *, const DBT *));
+ */
+void
+__repmgr_add_dbt(v, dbt)
+ REPMGR_IOVECS *v;
+ const DBT *dbt;
+{
+ if (dbt->size > 0) {
+ v->vectors[v->count].iov_base = dbt->data;
+ v->vectors[v->count++].iov_len = dbt->size;
+ v->total_bytes += dbt->size;
+ }
+}
+
+/*
+ * Update a set of iovecs to reflect the number of bytes transferred in an I/O
+ * operation, so that the iovecs can be used to continue transferring where we
+ * left off.
+ * Returns TRUE if the set of buffers is now fully consumed, FALSE if more
+ * remains.
+ *
+ * PUBLIC: int __repmgr_update_consumed __P((REPMGR_IOVECS *, size_t));
+ */
+int
+__repmgr_update_consumed(v, byte_count)
+ REPMGR_IOVECS *v;
+ size_t byte_count;
+{
+ db_iovec_t *iov;
+ int i;
+
+ for (i = v->offset; ; i++) {
+ DB_ASSERT(NULL, i < v->count && byte_count > 0);
+ iov = &v->vectors[i];
+ if (byte_count > iov->iov_len) {
+ /*
+ * We've consumed (more than) this vector's worth.
+ * Adjust count and continue.
+ */
+ byte_count -= iov->iov_len;
+ } else {
+ /*
+ * Adjust length of remaining portion of vector.
+ * byte_count can never be greater than iov_len, or we
+ * would not be in this section of the if clause.
+ */
+ iov->iov_len -= (u_int32_t)byte_count;
+ if (iov->iov_len > 0) {
+ /*
+ * Still some left in this vector. Adjust base
+ * address too, and leave offset pointing here.
+ */
+ iov->iov_base = (void *)
+ ((u_int8_t *)iov->iov_base + byte_count);
+ v->offset = i;
+ } else {
+ /*
+ * Consumed exactly to a vector boundary.
+ * Advance to next vector for next time.
+ */
+ v->offset = i+1;
+ }
+ /*
+ * If offset has reached count, the entire thing is
+ * consumed.
+ */
+ return (v->offset >= v->count);
+ }
+ }
+}
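+
+/*
+ * Editor's note: a minimal usage sketch, not part of the original source,
+ * assuming a writev()-style call that can stop at any byte boundary, with
+ * declarations and error handling elided:
+ *
+ *	do
+ *		nw = writev(fd, &v->vectors[v->offset],
+ *		    (int)(v->count - v->offset));
+ *	while (nw > 0 && !__repmgr_update_consumed(v, (size_t)nw));
+ *
+ * Each call only advances v->offset and trims the partially consumed
+ * vector, so the next transfer resumes exactly where the last one stopped.
+ */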
+
+/*
+ * Builds a buffer containing our network address information, suitable for
+ * publishing as cdata via a call to rep_start, and sets up the given DBT to
+ * point to it. The buffer is dynamically allocated memory, and the caller must
+ * assume responsibility for it.
+ *
+ * PUBLIC: int __repmgr_prepare_my_addr __P((ENV *, DBT *));
+ */
+int
+__repmgr_prepare_my_addr(env, dbt)
+ ENV *env;
+ DBT *dbt;
+{
+ DB_REP *db_rep;
+ repmgr_netaddr_t addr;
+ size_t size, hlen;
+ u_int16_t port_buffer;
+ u_int8_t *ptr;
+ int ret;
+
+ db_rep = env->rep_handle;
+ LOCK_MUTEX(db_rep->mutex);
+ addr = SITE_FROM_EID(db_rep->self_eid)->net_addr;
+ UNLOCK_MUTEX(db_rep->mutex);
+ /*
+ * The cdata message consists of the 2-byte port number, in network byte
+ * order, followed by the null-terminated host name string.
+ */
+ port_buffer = htons(addr.port);
+ size = sizeof(port_buffer) + (hlen = strlen(addr.host) + 1);
+ if ((ret = __os_malloc(env, size, &ptr)) != 0)
+ return (ret);
+
+ DB_INIT_DBT(*dbt, ptr, size);
+
+ memcpy(ptr, &port_buffer, sizeof(port_buffer));
+ ptr = &ptr[sizeof(port_buffer)];
+ memcpy(ptr, addr.host, hlen);
+
+ return (0);
+}
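+
+/*
+ * Editor's note: a sketch of how a receiver would take the buffer built
+ * above apart again; editorial, not part of the original source:
+ *
+ *	u_int16_t port;
+ *	char *host;
+ *
+ *	memcpy(&port, dbt->data, sizeof(port));
+ *	port = ntohs(port);			(undo the htons() above)
+ *	host = (char *)dbt->data + sizeof(port);	(NUL-terminated)
+ */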
+
+/*
+ * !!!
+ * This may only be called after threads have been started, because we don't
+ * know the answer until we have established group membership (e.g., reading the
+ * membership database). That should be OK, because we only need this
+ * for starting an election, or counting acks after sending a PERM message.
+ *
+ * PUBLIC: int __repmgr_get_nsites __P((ENV *, u_int32_t *));
+ */
+int
+__repmgr_get_nsites(env, nsitesp)
+ ENV *env;
+ u_int32_t *nsitesp;
+{
+ DB_REP *db_rep;
+ u_int32_t nsites;
+
+ db_rep = env->rep_handle;
+
+ if ((nsites = db_rep->region->config_nsites) == 0) {
+ __db_errx(env, DB_STR("3672",
+ "Nsites unknown before repmgr_start()"));
+ return (EINVAL);
+ }
+ *nsitesp = nsites;
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_thread_failure __P((ENV *, int));
+ */
+int
+__repmgr_thread_failure(env, why)
+ ENV *env;
+ int why;
+{
+ DB_REP *db_rep;
+
+ db_rep = env->rep_handle;
+ LOCK_MUTEX(db_rep->mutex);
+ (void)__repmgr_stop_threads(env);
+ UNLOCK_MUTEX(db_rep->mutex);
+ return (__env_panic(env, why));
+}
+
+/*
+ * Format a printable representation of a site location, suitable for inclusion
+ * in an error message. The buffer must be at least as big as
+ * MAX_SITE_LOC_STRING.
+ *
+ * PUBLIC: char *__repmgr_format_eid_loc __P((DB_REP *,
+ * PUBLIC: REPMGR_CONNECTION *, char *));
+ *
+ * Caller must hold mutex.
+ */
+char *
+__repmgr_format_eid_loc(db_rep, conn, buffer)
+ DB_REP *db_rep;
+ REPMGR_CONNECTION *conn;
+ char *buffer;
+{
+ int eid;
+
+ if (conn->type == APP_CONNECTION)
+ snprintf(buffer,
+ MAX_SITE_LOC_STRING, "(application channel)");
+ else if (conn->type == REP_CONNECTION &&
+ IS_VALID_EID(eid = conn->eid))
+ (void)__repmgr_format_site_loc(SITE_FROM_EID(eid), buffer);
+ else
+ snprintf(buffer, MAX_SITE_LOC_STRING, "(unidentified site)");
+ return (buffer);
+}
+
+/*
+ * PUBLIC: char *__repmgr_format_site_loc __P((REPMGR_SITE *, char *));
+ */
+char *
+__repmgr_format_site_loc(site, buffer)
+ REPMGR_SITE *site;
+ char *buffer;
+{
+ return (__repmgr_format_addr_loc(&site->net_addr, buffer));
+}
+
+/*
+ * PUBLIC: char *__repmgr_format_addr_loc __P((repmgr_netaddr_t *, char *));
+ */
+char *
+__repmgr_format_addr_loc(addr, buffer)
+ repmgr_netaddr_t *addr;
+ char *buffer;
+{
+ snprintf(buffer, MAX_SITE_LOC_STRING, "site %s:%lu",
+ addr->host, (u_long)addr->port);
+ return (buffer);
+}
+
+/*
+ * PUBLIC: int __repmgr_repstart __P((ENV *, u_int32_t));
+ */
+int
+__repmgr_repstart(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ DBT my_addr;
+ int ret;
+
+ /* Include "cdata" in case sending to old-version site. */
+ if ((ret = __repmgr_prepare_my_addr(env, &my_addr)) != 0)
+ return (ret);
+ ret = __rep_start_int(env, &my_addr, flags);
+ __os_free(env, my_addr.data);
+ if (ret != 0)
+ __db_err(env, ret, DB_STR("3673", "rep_start"));
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_become_master __P((ENV *));
+ */
+int
+__repmgr_become_master(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ DB *dbp;
+ DB_TXN *txn;
+ REPMGR_SITE *site;
+ DBT key_dbt, data_dbt;
+ __repmgr_membership_key_args key;
+ __repmgr_membership_data_args member_status;
+ repmgr_netaddr_t addr;
+ u_int32_t status;
+ u_int8_t data_buf[__REPMGR_MEMBERSHIP_DATA_SIZE];
+ u_int8_t key_buf[MAX_MSG_BUF];
+ size_t len;
+ u_int i;
+ int ret, t_ret;
+
+ db_rep = env->rep_handle;
+ dbp = NULL;
+ txn = NULL;
+
+ /* Examine membership list to see if we have a victim in limbo. */
+ LOCK_MUTEX(db_rep->mutex);
+ ZERO_LSN(db_rep->limbo_failure);
+ ZERO_LSN(db_rep->durable_lsn);
+ db_rep->limbo_victim = DB_EID_INVALID;
+ db_rep->limbo_resolution_needed = FALSE;
+ FOR_EACH_REMOTE_SITE_INDEX(i) {
+ site = SITE_FROM_EID(i);
+ if (site->membership == SITE_ADDING ||
+ site->membership == SITE_DELETING) {
+ db_rep->limbo_victim = (int)i;
+ db_rep->limbo_resolution_needed = TRUE;
+
+ /*
+ * Since there can never be more than one limbo victim,
+ * when we find one we don't have to continue looking
+ * for others.
+ */
+ break;
+ }
+ }
+ db_rep->client_intent = FALSE;
+ UNLOCK_MUTEX(db_rep->mutex);
+
+ if ((ret = __repmgr_repstart(env, DB_REP_MASTER)) != 0)
+ return (ret);
+
+ if (db_rep->have_gmdb)
+ return (0);
+
+ db_rep->member_version_gen = db_rep->region->gen;
+ ENV_ENTER(env, ip);
+ if ((ret = __repmgr_hold_master_role(env, NULL)) != 0)
+ goto leave;
+retry:
+ if ((ret = __repmgr_setup_gmdb_op(env, ip, &txn, DB_CREATE)) != 0)
+ goto err;
+
+ DB_ASSERT(env, txn != NULL);
+ dbp = db_rep->gmdb;
+ DB_ASSERT(env, dbp != NULL);
+
+ /* Write the meta-data record. */
+ if ((ret = __repmgr_set_gm_version(env, ip, txn, 1)) != 0)
+ goto err;
+
+ /* Write a record representing each site in the group. */
+ for (i = 0; i < db_rep->site_cnt; i++) {
+ LOCK_MUTEX(db_rep->mutex);
+ site = SITE_FROM_EID(i);
+ addr = site->net_addr;
+ status = site->membership;
+ UNLOCK_MUTEX(db_rep->mutex);
+ if (status == 0)
+ continue;
+ DB_INIT_DBT(key.host, addr.host, strlen(addr.host) + 1);
+ key.port = addr.port;
+ ret = __repmgr_membership_key_marshal(env,
+ &key, key_buf, sizeof(key_buf), &len);
+ DB_ASSERT(env, ret == 0);
+ DB_INIT_DBT(key_dbt, key_buf, len);
+ member_status.flags = status;
+ __repmgr_membership_data_marshal(env, &member_status, data_buf);
+ DB_INIT_DBT(data_dbt, data_buf, __REPMGR_MEMBERSHIP_DATA_SIZE);
+ if ((ret = __db_put(dbp, ip, txn, &key_dbt, &data_dbt, 0)) != 0)
+ goto err;
+ }
+
+err:
+ if (txn != NULL) {
+ if ((t_ret = __db_txn_auto_resolve(env, txn, 0, ret)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ if ((t_ret = __repmgr_cleanup_gmdb_op(env, TRUE)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ }
+ if (ret == DB_LOCK_DEADLOCK || ret == DB_LOCK_NOTGRANTED)
+ goto retry;
+ if ((t_ret = __repmgr_rlse_master_role(env)) != 0 && ret == 0)
+ ret = t_ret;
+leave:
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * Visits all the connections we know about, performing the desired action.
+ * "err_quit" determines whether we give up, or soldier on, in case of an
+ * error.
+ *
+ * PUBLIC: int __repmgr_each_connection __P((ENV *,
+ * PUBLIC: CONNECTION_ACTION, void *, int));
+ *
+ * !!!
+ * Caller must hold mutex.
+ */
+int
+__repmgr_each_connection(env, callback, info, err_quit)
+ ENV *env;
+ CONNECTION_ACTION callback;
+ void *info;
+ int err_quit;
+{
+ DB_REP *db_rep;
+ REPMGR_CONNECTION *conn, *next;
+ REPMGR_SITE *site;
+ int eid, ret, t_ret;
+
+#define HANDLE_ERROR \
+ do { \
+ if (err_quit) \
+ return (t_ret); \
+ if (ret == 0) \
+ ret = t_ret; \
+ } while (0)
+
+ db_rep = env->rep_handle;
+ ret = 0;
+
+ /*
+ * We might have used TAILQ_FOREACH here, except that in some cases we
+ * need to unlink an element along the way.
+ */
+ for (conn = TAILQ_FIRST(&db_rep->connections);
+ conn != NULL;
+ conn = next) {
+ next = TAILQ_NEXT(conn, entries);
+
+ if ((t_ret = (*callback)(env, conn, info)) != 0)
+ HANDLE_ERROR;
+ }
+
+ FOR_EACH_REMOTE_SITE_INDEX(eid) {
+ site = SITE_FROM_EID(eid);
+
+ if (site->state == SITE_CONNECTED) {
+ if ((conn = site->ref.conn.in) != NULL &&
+ (t_ret = (*callback)(env, conn, info)) != 0)
+ HANDLE_ERROR;
+ if ((conn = site->ref.conn.out) != NULL &&
+ (t_ret = (*callback)(env, conn, info)) != 0)
+ HANDLE_ERROR;
+ }
+
+ for (conn = TAILQ_FIRST(&site->sub_conns);
+ conn != NULL;
+ conn = next) {
+ next = TAILQ_NEXT(conn, entries);
+ if ((t_ret = (*callback)(env, conn, info)) != 0)
+ HANDLE_ERROR;
+ }
+ }
+
+ return (ret);
+}
+
+/*
+ * Initialize repmgr's portion of the shared region area. Note that we can't
+ * simply get the REP* address from the env as we usually do, because at the
+ * time of this call it hasn't yet been linked into the env handle.
+ *
+ * This function is only called during creation of the region. If anything
+ * fails, our caller will panic and remove the region. So, if we have any
+ * failure, we don't have to clean up any partial allocation.
+ *
+ * PUBLIC: int __repmgr_open __P((ENV *, void *));
+ */
+int
+__repmgr_open(env, rep_)
+ ENV *env;
+ void *rep_;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ int ret;
+
+ db_rep = env->rep_handle;
+ rep = rep_;
+
+ if ((ret = __mutex_alloc(env, MTX_REPMGR, 0, &rep->mtx_repmgr)) != 0)
+ return (ret);
+
+ DB_ASSERT(env, rep->siteinfo_seq == 0 && db_rep->siteinfo_seq == 0);
+ rep->siteinfo_off = INVALID_ROFF;
+ rep->siteinfo_seq = 0;
+ if ((ret = __repmgr_share_netaddrs(env, rep, 0, db_rep->site_cnt)) != 0)
+ return (ret);
+
+ rep->self_eid = db_rep->self_eid;
+ rep->perm_policy = db_rep->perm_policy;
+ rep->ack_timeout = db_rep->ack_timeout;
+ rep->connection_retry_wait = db_rep->connection_retry_wait;
+ rep->election_retry_wait = db_rep->election_retry_wait;
+ rep->heartbeat_monitor_timeout = db_rep->heartbeat_monitor_timeout;
+ rep->heartbeat_frequency = db_rep->heartbeat_frequency;
+ return (ret);
+}
+
+/*
+ * Join an existing environment, by setting up our local site info structures
+ * from shared network address configuration in the region.
+ *
+ * As with __repmgr_open(), note that we can't simply get the REP* address from
+ * the env as we usually do, because at the time of this call it hasn't yet
+ * been linked into the env handle.
+ *
+ * PUBLIC: int __repmgr_join __P((ENV *, void *));
+ */
+int
+__repmgr_join(env, rep_)
+ ENV *env;
+ void *rep_;
+{
+ DB_REP *db_rep;
+ REGINFO *infop;
+ REP *rep;
+ SITEINFO *p;
+ REPMGR_SITE *site, temp;
+ repmgr_netaddr_t *addrp;
+ char *host;
+ u_int i, j;
+ int ret;
+
+ db_rep = env->rep_handle;
+ infop = env->reginfo;
+ rep = rep_;
+ ret = 0;
+
+ MUTEX_LOCK(env, rep->mtx_repmgr);
+
+ /*
+ * Merge local and shared lists of remote sites. Note that the
+ * placement of entries in the shared array must not change. To
+ * accomplish the merge, pull in entries from the shared list, into the
+ * proper position, shuffling not-yet-resolved local entries if
+ * necessary. Then add any remaining locally known entries to the
+ * shared list.
+ */
+ i = 0;
+ if (rep->siteinfo_off != INVALID_ROFF) {
+ p = R_ADDR(infop, rep->siteinfo_off);
+
+ /* For each address in the shared list ... */
+ for (; i < rep->site_cnt; i++) {
+ host = R_ADDR(infop, p[i].addr.host);
+
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "Site %s:%lu found at EID %u",
+ host, (u_long)p[i].addr.port, i));
+ /*
+ * Find it in the local list. Everything before 'i'
+ * already matches the shared list, and is therefore in
+ * the right place. So we only need to search starting
+ * from 'i'. When found, local config values will be
+ * used because they are assumed to be "fresher". But
+ * membership status is not, since this process hasn't
+ * been active (running) yet.
+ */
+ for (j = i; j < db_rep->site_cnt; j++) {
+ site = &db_rep->sites[j];
+ addrp = &site->net_addr;
+ if (strcmp(host, addrp->host) == 0 &&
+ p[i].addr.port == addrp->port) {
+ p[i].config = site->config;
+ site->membership = p[i].status;
+ break;
+ }
+ }
+
+ /*
+ * When not found in local list, copy peer values
+ * from shared list.
+ */
+ if (j == db_rep->site_cnt) {
+ if ((ret = __repmgr_new_site(env,
+ &site, host, p[i].addr.port)) != 0)
+ goto unlock;
+ site->config = p[i].config;
+ site->membership = p[i].status;
+ }
+ DB_ASSERT(env, j < db_rep->site_cnt);
+
+ /* Found or added at 'j', but belongs at 'i': swap. */
+ if (i != j) {
+ temp = db_rep->sites[j];
+ db_rep->sites[j] = db_rep->sites[i];
+ db_rep->sites[i] = temp;
+ /*
+ * If we're moving the entry that self_eid
+ * points to, then adjust self_eid to match.
+ * For now this is still merely our original,
+ * in-process pointer; we have yet to make sure
+ * it matches the one from shared memory.
+ */
+ if (db_rep->self_eid == (int)j)
+ db_rep->self_eid = (int)i;
+ }
+ }
+ }
+ if ((ret = __repmgr_share_netaddrs(env, rep, i, db_rep->site_cnt)) != 0)
+ goto unlock;
+ if (db_rep->self_eid == DB_EID_INVALID)
+ db_rep->self_eid = rep->self_eid;
+ else if (rep->self_eid == DB_EID_INVALID)
+ rep->self_eid = db_rep->self_eid;
+ else if (db_rep->self_eid != rep->self_eid) {
+ __db_errx(env, DB_STR("3674",
+ "A mismatching local site address has been set in the environment"));
+ ret = EINVAL;
+ goto unlock;
+ }
+
+ db_rep->siteinfo_seq = rep->siteinfo_seq;
+unlock:
+ MUTEX_UNLOCK(env, rep->mtx_repmgr);
+ return (ret);
+}
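+
+/*
+ * Editor's note: a hypothetical worked example of the merge above, not
+ * part of the original source.  Shared list [A, B], local list [B, C]:
+ * i=0 finds no A locally, appends it (local [B, C, A]) and swaps it into
+ * place (local [A, C, B]); i=1 finds B at j=2 and swaps (local [A, B, C]);
+ * finally __repmgr_share_netaddrs() publishes C into the shared list.
+ */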
+
+/*
+ * PUBLIC: int __repmgr_env_refresh __P((ENV *env));
+ */
+int
+__repmgr_env_refresh(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ REGINFO *infop;
+ SITEINFO *shared_array;
+ u_int i;
+ int ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ infop = env->reginfo;
+ ret = 0;
+ COMPQUIET(i, 0);
+
+ if (F_ISSET(env, ENV_PRIVATE)) {
+ ret = __mutex_free(env, &rep->mtx_repmgr);
+ if (rep->siteinfo_off != INVALID_ROFF) {
+ shared_array = R_ADDR(infop, rep->siteinfo_off);
+ for (i = 0; i < db_rep->site_cnt; i++)
+ __env_alloc_free(infop, R_ADDR(infop,
+ shared_array[i].addr.host));
+ __env_alloc_free(infop, shared_array);
+ rep->siteinfo_off = INVALID_ROFF;
+ }
+ }
+
+ return (ret);
+}
+
+/*
+ * Copies new remote site information from the indicated private array slots
+ * into the shared region. The corresponding shared array slots do not exist
+ * yet; they must be allocated.
+ *
+ * PUBLIC: int __repmgr_share_netaddrs __P((ENV *, void *, u_int, u_int));
+ *
+ * !!! The rep pointer is passed, because it may not yet have been installed
+ * into the env handle.
+ *
+ * !!! Assumes caller holds mtx_repmgr lock.
+ */
+int
+__repmgr_share_netaddrs(env, rep_, start, limit)
+ ENV *env;
+ void *rep_;
+ u_int start, limit;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ REGINFO *infop;
+ REGENV *renv;
+ SITEINFO *orig, *shared_array;
+ char *host, *hostbuf;
+ size_t sz;
+ u_int i, n;
+ int eid, ret, touched;
+
+ db_rep = env->rep_handle;
+ infop = env->reginfo;
+ renv = infop->primary;
+ rep = rep_;
+ ret = 0;
+ touched = FALSE;
+
+ MUTEX_LOCK(env, renv->mtx_regenv);
+
+ for (i = start; i < limit; i++) {
+ if (rep->site_cnt >= rep->site_max) {
+ /* Table is full, we need more space. */
+ if (rep->siteinfo_off == INVALID_ROFF) {
+ n = INITIAL_SITES_ALLOCATION;
+ sz = n * sizeof(SITEINFO);
+ if ((ret = __env_alloc(infop,
+ sz, &shared_array)) != 0)
+ goto out;
+ } else {
+ n = 2 * rep->site_max;
+ sz = n * sizeof(SITEINFO);
+ if ((ret = __env_alloc(infop,
+ sz, &shared_array)) != 0)
+ goto out;
+ orig = R_ADDR(infop, rep->siteinfo_off);
+ memcpy(shared_array, orig,
+ sizeof(SITEINFO) * rep->site_cnt);
+ __env_alloc_free(infop, orig);
+ }
+ rep->siteinfo_off = R_OFFSET(infop, shared_array);
+ rep->site_max = n;
+ } else
+ shared_array = R_ADDR(infop, rep->siteinfo_off);
+
+ DB_ASSERT(env, rep->site_cnt < rep->site_max &&
+ rep->siteinfo_off != INVALID_ROFF);
+
+ host = db_rep->sites[i].net_addr.host;
+ sz = strlen(host) + 1;
+ if ((ret = __env_alloc(infop, sz, &hostbuf)) != 0)
+ goto out;
+ eid = (int)rep->site_cnt++;
+ (void)strcpy(hostbuf, host);
+ shared_array[eid].addr.host = R_OFFSET(infop, hostbuf);
+ shared_array[eid].addr.port = db_rep->sites[i].net_addr.port;
+ shared_array[eid].config = db_rep->sites[i].config;
+ shared_array[eid].status = db_rep->sites[i].membership;
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "EID %d is assigned for site %s:%lu",
+ eid, host, (u_long)shared_array[eid].addr.port));
+ touched = TRUE;
+ }
+
+out:
+ if (touched)
+ db_rep->siteinfo_seq = ++rep->siteinfo_seq;
+ MUTEX_UNLOCK(env, renv->mtx_regenv);
+ return (ret);
+}
+
+/*
+ * Copy into our local list any newly added/changed remote site
+ * configuration information.
+ *
+ * !!! Caller must hold db_rep->mutex and mtx_repmgr locks.
+ *
+ * PUBLIC: int __repmgr_copy_in_added_sites __P((ENV *));
+ */
+int
+__repmgr_copy_in_added_sites(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ REGINFO *infop;
+ SITEINFO *base, *p;
+ REPMGR_SITE *site;
+ char *host;
+ int ret;
+ u_int i;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ if (rep->siteinfo_off == INVALID_ROFF)
+ goto out;
+
+ infop = env->reginfo;
+ base = R_ADDR(infop, rep->siteinfo_off);
+
+ /* Create private array slots for new sites. */
+ for (i = db_rep->site_cnt; i < rep->site_cnt; i++) {
+ p = &base[i];
+ host = R_ADDR(infop, p->addr.host);
+ if ((ret = __repmgr_new_site(env,
+ &site, host, p->addr.port)) != 0)
+ return (ret);
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "Site %s:%lu found at EID %u",
+ host, (u_long)p->addr.port, i));
+ }
+
+ /* Make sure info is up to date for all sites, old and new. */
+ for (i = 0; i < db_rep->site_cnt; i++) {
+ p = &base[i];
+ site = SITE_FROM_EID(i);
+ site->config = p->config;
+ site->membership = p->status;
+ }
+
+out:
+ /*
+ * We always make sure our local list has been brought up to date with
+ * the shared list before adding to the local list (except before env
+ * open of course). So here there should be nothing on our local list
+ * not yet in shared memory.
+ */
+ DB_ASSERT(env, db_rep->site_cnt == rep->site_cnt);
+ db_rep->siteinfo_seq = rep->siteinfo_seq;
+ return (0);
+}
+
+/*
+ * Initialize a range of sites newly added to our site list array. Process each
+ * array entry x in the range from <= x < limit. Passing from >= limit is
+ * allowed, and is effectively a no-op.
+ *
+ * PUBLIC: int __repmgr_init_new_sites __P((ENV *, int, int));
+ *
+ * !!! Assumes caller holds db_rep->mutex.
+ */
+int
+__repmgr_init_new_sites(env, from, limit)
+ ENV *env;
+ int from, limit;
+{
+ DB_REP *db_rep;
+ REPMGR_SITE *site;
+ int i, ret;
+
+ db_rep = env->rep_handle;
+
+ if (db_rep->selector == NULL)
+ return (0);
+
+ DB_ASSERT(env, IS_VALID_EID(from) && IS_VALID_EID(limit) &&
+ from <= limit);
+ for (i = from; i < limit; i++) {
+ site = SITE_FROM_EID(i);
+ if (site->membership == SITE_PRESENT &&
+ (ret = __repmgr_schedule_connection_attempt(env,
+ i, TRUE)) != 0)
+ return (ret);
+ }
+
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_failchk __P((ENV *));
+ */
+int
+__repmgr_failchk(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+ DB_REP *db_rep;
+ REP *rep;
+ db_threadid_t unused;
+
+ dbenv = env->dbenv;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ DB_THREADID_INIT(unused);
+ MUTEX_LOCK(env, rep->mtx_repmgr);
+
+ /*
+ * Check to see if the main (listener) replication process may have died
+ * without cleaning up the flag. If so, we only have to clear it, and
+ * another process should then be able to come along and become the
+ * listener. So in either case we can return success.
+ */
+ if (rep->listener != 0 && !dbenv->is_alive(dbenv,
+ rep->listener, unused, DB_MUTEX_PROCESS_ONLY))
+ rep->listener = 0;
+ MUTEX_UNLOCK(env, rep->mtx_repmgr);
+
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_master_is_known __P((ENV *));
+ */
+int
+__repmgr_master_is_known(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REPMGR_CONNECTION *conn;
+ REPMGR_SITE *master;
+
+ db_rep = env->rep_handle;
+
+ /*
+ * We are the master, or we know of a master and have a healthy
+ * connection to it.
+ */
+ if (db_rep->region->master_id == db_rep->self_eid)
+ return (TRUE);
+ if ((master = __repmgr_connected_master(env)) == NULL)
+ return (FALSE);
+ if ((conn = master->ref.conn.in) != NULL &&
+ IS_READY_STATE(conn->state))
+ return (TRUE);
+ if ((conn = master->ref.conn.out) != NULL &&
+ IS_READY_STATE(conn->state))
+ return (TRUE);
+ return (FALSE);
+}
+
+/*
+ * PUBLIC: int __repmgr_stable_lsn __P((ENV *, DB_LSN *));
+ *
+ * This function may be called before any of repmgr's threads have
+ * been started, but it must not be called before the environment is open.
+ * Currently the latter is impossible, since its only caller is log_archive,
+ * which itself cannot be called before the environment is open.
+ */
+int
+__repmgr_stable_lsn(env, stable_lsn)
+ ENV *env;
+ DB_LSN *stable_lsn;
+{
+ DB_REP *db_rep;
+ REP *rep;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ if (rep->min_log_file != 0 && rep->min_log_file < stable_lsn->file) {
+ /*
+ * Returning an LSN to be consistent with the rest of the
+ * log archiving processing. Construct LSN of format
+ * [filenum][0].
+ */
+ stable_lsn->file = rep->min_log_file;
+ stable_lsn->offset = 0;
+ }
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "Repmgr_stable_lsn: Returning stable_lsn[%lu][%lu]",
+ (u_long)stable_lsn->file, (u_long)stable_lsn->offset));
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_send_sync_msg __P((ENV *, REPMGR_CONNECTION *,
+ * PUBLIC: u_int32_t, u_int8_t *, u_int32_t));
+ */
+int
+__repmgr_send_sync_msg(env, conn, type, buf, len)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ u_int8_t *buf;
+ u_int32_t len, type;
+{
+ REPMGR_IOVECS iovecs;
+ __repmgr_msg_hdr_args msg_hdr;
+ u_int8_t hdr_buf[__REPMGR_MSG_HDR_SIZE];
+ size_t unused;
+
+ msg_hdr.type = REPMGR_OWN_MSG;
+ REPMGR_OWN_BUF_SIZE(msg_hdr) = len;
+ REPMGR_OWN_MSG_TYPE(msg_hdr) = type;
+ __repmgr_msg_hdr_marshal(env, &msg_hdr, hdr_buf);
+
+ __repmgr_iovec_init(&iovecs);
+ __repmgr_add_buffer(&iovecs, hdr_buf, __REPMGR_MSG_HDR_SIZE);
+ if (len > 0)
+ __repmgr_add_buffer(&iovecs, buf, len);
+
+ return (__repmgr_write_iovecs(env, conn, &iovecs, &unused));
+}
+
+/*
+ * Produce a membership list from the known info currently in memory.
+ *
+ * PUBLIC: int __repmgr_marshal_member_list __P((ENV *, u_int8_t **, size_t *));
+ *
+ * Caller must hold mutex.
+ */
+int
+__repmgr_marshal_member_list(env, bufp, lenp)
+ ENV *env;
+ u_int8_t **bufp;
+ size_t *lenp;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ REPMGR_SITE *site;
+ __repmgr_membr_vers_args membr_vers;
+ __repmgr_site_info_args site_info;
+ u_int8_t *buf, *p;
+ size_t bufsize, len;
+ u_int i;
+ int ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ /* Compute a (generous) upper bound on needed buffer size. */
+ bufsize = __REPMGR_MEMBR_VERS_SIZE +
+ db_rep->site_cnt * (__REPMGR_SITE_INFO_SIZE + MAXHOSTNAMELEN + 1);
+ if ((ret = __os_malloc(env, bufsize, &buf)) != 0)
+ return (ret);
+ p = buf;
+
+ membr_vers.version = db_rep->membership_version;
+ membr_vers.gen = rep->gen;
+ __repmgr_membr_vers_marshal(env, &membr_vers, p);
+ p += __REPMGR_MEMBR_VERS_SIZE;
+
+ for (i = 0; i < db_rep->site_cnt; i++) {
+ site = SITE_FROM_EID(i);
+ if (site->membership == 0)
+ continue;
+
+ site_info.host.data = site->net_addr.host;
+ site_info.host.size =
+ (u_int32_t)strlen(site->net_addr.host) + 1;
+ site_info.port = site->net_addr.port;
+ site_info.flags = site->membership;
+
+ ret = __repmgr_site_info_marshal(env,
+ &site_info, p, (size_t)(&buf[bufsize]-p), &len);
+ DB_ASSERT(env, ret == 0);
+ p += len;
+ }
+ len = (size_t)(p - buf);
+
+ *bufp = buf;
+ *lenp = len;
+ DB_ASSERT(env, ret == 0);
+ return (0);
+}
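+
+/*
+ * Editor's note, not part of the original source: the marshaled buffer
+ * produced above (and consumed again by __repmgr_refresh_membership()) is
+ * laid out as
+ *
+ *	[ membr_vers: version, gen ][ site_info 0 ][ site_info 1 ] ...
+ *
+ * Removed sites (membership == 0) are skipped when marshaling, so the
+ * receiver recovers the record count simply by unmarshaling site_info
+ * records until the buffer is exhausted.
+ */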
+
+/*
+ * Produce a membership list by reading the database.
+ */
+static int
+read_gmdb(env, ip, bufp, lenp)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ u_int8_t **bufp;
+ size_t *lenp;
+{
+ DB_TXN *txn;
+ DB *dbp;
+ DBC *dbc;
+ DBT key_dbt, data_dbt;
+ __repmgr_membership_key_args key;
+ __repmgr_membership_data_args member_status;
+ __repmgr_member_metadata_args metadata;
+ __repmgr_membr_vers_args membr_vers;
+ __repmgr_site_info_args site_info;
+ u_int8_t data_buf[__REPMGR_MEMBERSHIP_DATA_SIZE];
+ u_int8_t key_buf[MAX_MSG_BUF];
+ u_int8_t metadata_buf[__REPMGR_MEMBER_METADATA_SIZE];
+ char *host;
+ size_t bufsize, len;
+ u_int8_t *buf, *p;
+ u_int32_t gen;
+ int ret, t_ret;
+
+ txn = NULL;
+ dbp = NULL;
+ dbc = NULL;
+ buf = NULL;
+ COMPQUIET(len, 0);
+
+ if ((ret = __rep_get_datagen(env, &gen)) != 0)
+ return (ret);
+ if ((ret = __txn_begin(env, ip, NULL, &txn, DB_IGNORE_LEASE)) != 0)
+ goto err;
+ if ((ret = __rep_open_sysdb(env, ip, txn, REPMEMBERSHIP, 0, &dbp)) != 0)
+ goto err;
+ if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0)
+ goto err;
+
+ memset(&key_dbt, 0, sizeof(key_dbt));
+ key_dbt.data = key_buf;
+ key_dbt.ulen = sizeof(key_buf);
+ F_SET(&key_dbt, DB_DBT_USERMEM);
+ memset(&data_dbt, 0, sizeof(data_dbt));
+ data_dbt.data = metadata_buf;
+ data_dbt.ulen = sizeof(metadata_buf);
+ F_SET(&data_dbt, DB_DBT_USERMEM);
+
+ /* Get metadata record, make sure key looks right. */
+ if ((ret = __dbc_get(dbc, &key_dbt, &data_dbt, DB_NEXT)) != 0)
+ goto err;
+ ret = __repmgr_membership_key_unmarshal(env,
+ &key, key_buf, key_dbt.size, NULL);
+ DB_ASSERT(env, ret == 0);
+ DB_ASSERT(env, key.host.size == 0);
+ DB_ASSERT(env, key.port == 0);
+ ret = __repmgr_member_metadata_unmarshal(env,
+ &metadata, metadata_buf, data_dbt.size, NULL);
+ DB_ASSERT(env, ret == 0);
+ DB_ASSERT(env, metadata.format == REPMGR_GMDB_FMT_VERSION);
+ DB_ASSERT(env, metadata.version > 0);
+
+ bufsize = 1000; /* Initial guess. */
+ if ((ret = __os_malloc(env, bufsize, &buf)) != 0)
+ goto err;
+ membr_vers.version = metadata.version;
+ membr_vers.gen = gen;
+ __repmgr_membr_vers_marshal(env, &membr_vers, buf);
+ p = &buf[__REPMGR_MEMBR_VERS_SIZE];
+
+ data_dbt.data = data_buf;
+ data_dbt.ulen = sizeof(data_buf);
+ while ((ret = __dbc_get(dbc, &key_dbt, &data_dbt, DB_NEXT)) == 0) {
+ ret = __repmgr_membership_key_unmarshal(env,
+ &key, key_buf, key_dbt.size, NULL);
+ DB_ASSERT(env, ret == 0);
+ DB_ASSERT(env, key.host.size <= MAXHOSTNAMELEN + 1 &&
+ key.host.size > 1);
+ host = (char*)key.host.data;
+ DB_ASSERT(env, host[key.host.size-1] == '\0');
+ DB_ASSERT(env, key.port > 0);
+
+ ret = __repmgr_membership_data_unmarshal(env,
+ &member_status, data_buf, data_dbt.size, NULL);
+ DB_ASSERT(env, ret == 0);
+ DB_ASSERT(env, member_status.flags != 0);
+
+ site_info.host = key.host;
+ site_info.port = key.port;
+ site_info.flags = member_status.flags;
+ if ((ret = __repmgr_site_info_marshal(env, &site_info,
+ p, (size_t)(&buf[bufsize]-p), &len)) == ENOMEM) {
+ bufsize *= 2;
+ len = (size_t)(p - buf);
+ if ((ret = __os_realloc(env, bufsize, &buf)) != 0)
+ goto err;
+ p = &buf[len];
+ ret = __repmgr_site_info_marshal(env,
+ &site_info, p, (size_t)(&buf[bufsize]-p), &len);
+ DB_ASSERT(env, ret == 0);
+ }
+ p += len;
+ }
+ len = (size_t)(p - buf);
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+
+err:
+ if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (dbp != NULL &&
+ (t_ret = __db_close(dbp, txn, DB_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+ if (txn != NULL &&
+ (t_ret = __db_txn_auto_resolve(env, txn, 0, ret)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret == 0) {
+ *bufp = buf;
+ *lenp = len;
+ } else if (buf != NULL)
+ __os_free(env, buf);
+ return (ret);
+}
+
+/*
+ * Refresh our sites array from the given membership list.
+ *
+ * PUBLIC: int __repmgr_refresh_membership __P((ENV *,
+ * PUBLIC: u_int8_t *, size_t));
+ */
+int
+__repmgr_refresh_membership(env, buf, len)
+ ENV *env;
+ u_int8_t *buf;
+ size_t len;
+{
+ DB_REP *db_rep;
+ REPMGR_SITE *site;
+ __repmgr_membr_vers_args membr_vers;
+ __repmgr_site_info_args site_info;
+ char *host;
+ u_int8_t *p;
+ u_int16_t port;
+ u_int32_t i, n;
+ int eid, ret;
+
+ db_rep = env->rep_handle;
+
+ /*
+ * Membership list consists of membr_vers followed by a number of
+ * site_info structs.
+ */
+ ret = __repmgr_membr_vers_unmarshal(env, &membr_vers, buf, len, &p);
+ DB_ASSERT(env, ret == 0);
+
+ if (db_rep->repmgr_status == stopped)
+ return (0);
+ /* Ignore obsolete versions. */
+ if (__repmgr_gmdb_version_cmp(env,
+ membr_vers.gen, membr_vers.version) <= 0)
+ return (0);
+
+ LOCK_MUTEX(db_rep->mutex);
+
+ db_rep->membership_version = membr_vers.version;
+ db_rep->member_version_gen = membr_vers.gen;
+
+ for (i = 0; i < db_rep->site_cnt; i++)
+ F_CLR(SITE_FROM_EID(i), SITE_TOUCHED);
+
+ for (n = 0; p < &buf[len]; ++n) {
+ ret = __repmgr_site_info_unmarshal(env,
+ &site_info, p, (size_t)(&buf[len] - p), &p);
+ DB_ASSERT(env, ret == 0);
+
+ host = site_info.host.data;
+ DB_ASSERT(env,
+ (u_int8_t*)site_info.host.data + site_info.host.size <= p);
+ host[site_info.host.size-1] = '\0';
+ port = site_info.port;
+
+ if ((ret = __repmgr_set_membership(env,
+ host, port, site_info.flags)) != 0)
+ goto err;
+
+ if ((ret = __repmgr_find_site(env, host, port, &eid)) != 0)
+ goto err;
+ DB_ASSERT(env, IS_VALID_EID(eid));
+ F_SET(SITE_FROM_EID(eid), SITE_TOUCHED);
+ }
+ ret = __rep_set_nsites_int(env, n);
+ DB_ASSERT(env, ret == 0);
+
+ /* Scan "touched" flags so as to notice sites that have been removed. */
+ for (i = 0; i < db_rep->site_cnt; i++) {
+ site = SITE_FROM_EID(i);
+ if (F_ISSET(site, SITE_TOUCHED))
+ continue;
+ host = site->net_addr.host;
+ port = site->net_addr.port;
+ if ((ret = __repmgr_set_membership(env, host, port, 0)) != 0)
+ goto err;
+ }
+
+err:
+ UNLOCK_MUTEX(db_rep->mutex);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_reload_gmdb __P((ENV *));
+ */
+int
+__repmgr_reload_gmdb(env)
+ ENV *env;
+{
+ DB_THREAD_INFO *ip;
+ u_int8_t *buf;
+ size_t len;
+ int ret;
+
+ ENV_ENTER(env, ip);
+ if ((ret = read_gmdb(env, ip, &buf, &len)) == 0) {
+ env->rep_handle->have_gmdb = TRUE;
+ ret = __repmgr_refresh_membership(env, buf, len);
+ __os_free(env, buf);
+ }
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * Return 1, 0, or -1, as the given gen/version combination is >, =, or < our
+ * currently known version.
+ *
+ * PUBLIC: int __repmgr_gmdb_version_cmp __P((ENV *, u_int32_t, u_int32_t));
+ */
+int
+__repmgr_gmdb_version_cmp(env, gen, version)
+ ENV *env;
+ u_int32_t gen, version;
+{
+ DB_REP *db_rep;
+ u_int32_t g, v;
+
+ db_rep = env->rep_handle;
+ g = db_rep->member_version_gen;
+ v = db_rep->membership_version;
+
+ if (gen == g)
+ return (version == v ? 0 :
+ (version < v ? -1 : 1));
+ return (gen < g ? -1 : 1);
+}
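+
+/*
+ * Editor's note: a hypothetical worked example, not part of the original
+ * source.  The comparison is lexicographic on (gen, version): if our
+ * current state is gen 3, version 5, then (3, 7) compares as 1 (newer),
+ * (3, 5) as 0, and (2, 9) as -1, since a higher gen outranks any version.
+ */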
+
+/*
+ * PUBLIC: int __repmgr_init_save __P((ENV *, DBT *));
+ */
+int
+__repmgr_init_save(env, dbt)
+ ENV *env;
+ DBT *dbt;
+{
+ DB_REP *db_rep;
+ u_int8_t *buf;
+ size_t len;
+ int ret;
+
+ db_rep = env->rep_handle;
+ LOCK_MUTEX(db_rep->mutex);
+ if (db_rep->site_cnt == 0) {
+ dbt->data = NULL;
+ dbt->size = 0;
+ ret = 0;
+ } else if ((ret = __repmgr_marshal_member_list(env, &buf, &len)) == 0) {
+ dbt->data = buf;
+ dbt->size = (u_int32_t)len;
+ }
+ UNLOCK_MUTEX(db_rep->mutex);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_init_restore __P((ENV *, DBT *));
+ */
+int
+__repmgr_init_restore(env, dbt)
+ ENV *env;
+ DBT *dbt;
+{
+ DB_REP *db_rep;
+
+ db_rep = env->rep_handle;
+ db_rep->restored_list = dbt->data;
+ db_rep->restored_list_length = dbt->size;
+ return (0);
+}
+
+/*
+ * Generates an internal request for a deferred operation, to be performed on a
+ * separate thread (conveniently, a message-processing thread).
+ *
+ * PUBLIC: int __repmgr_defer_op __P((ENV *, u_int32_t));
+ *
+ * Caller should hold mutex.
+ */
+int
+__repmgr_defer_op(env, op)
+ ENV *env;
+ u_int32_t op;
+{
+ REPMGR_MESSAGE *msg;
+ int ret;
+
+ /*
+ * Overload REPMGR_MESSAGE to convey the type of operation being
+ * requested. For now "op" is all we need; plenty of room for expansion
+ * if needed in the future.
+ *
+ * Leave msg->v.gmdb_msg.conn NULL to show no conn to be cleaned up.
+ */
+ if ((ret = __os_calloc(env, 1, sizeof(*msg), &msg)) != 0)
+ return (ret);
+ msg->msg_hdr.type = REPMGR_OWN_MSG;
+ REPMGR_OWN_MSG_TYPE(msg->msg_hdr) = op;
+ ret = __repmgr_queue_put(env, msg);
+ return (ret);
+}
+
+/*
+ * PUBLIC: void __repmgr_fire_conn_err_event __P((ENV *,
+ * PUBLIC: REPMGR_CONNECTION *, int));
+ */
+void
+__repmgr_fire_conn_err_event(env, conn, err)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ int err;
+{
+ DB_REP *db_rep;
+ DB_REPMGR_CONN_ERR info;
+
+ db_rep = env->rep_handle;
+ if (conn->type == REP_CONNECTION && IS_VALID_EID(conn->eid)) {
+ __repmgr_print_conn_err(env,
+ &SITE_FROM_EID(conn->eid)->net_addr, err);
+ info.eid = conn->eid;
+ info.error = err;
+ DB_EVENT(env, DB_EVENT_REP_CONNECT_BROKEN, &info);
+ }
+}
+
+/*
+ * PUBLIC: void __repmgr_print_conn_err __P((ENV *, repmgr_netaddr_t *, int));
+ */
+void
+__repmgr_print_conn_err(env, netaddr, err)
+ ENV *env;
+ repmgr_netaddr_t *netaddr;
+ int err;
+{
+ SITE_STRING_BUFFER site_loc_buf;
+ char msgbuf[200]; /* Arbitrary size. */
+
+ (void)__repmgr_format_addr_loc(netaddr, site_loc_buf);
+ /* TCP/IP sockets API convention: 0 indicates "end-of-file". */
+ if (err == 0)
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "EOF on connection to %s", site_loc_buf));
+ else
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "`%s' (%d) on connection to %s",
+ __os_strerror(err, msgbuf, sizeof(msgbuf)),
+ err, site_loc_buf));
+}
+
+/*
+ * Change role from master to client, but if a GMDB operation is in progress,
+ * wait for it to finish first.
+ *
+ * PUBLIC: int __repmgr_become_client __P((ENV *));
+ */
+int
+__repmgr_become_client(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ int ret;
+
+ db_rep = env->rep_handle;
+ LOCK_MUTEX(db_rep->mutex);
+ if ((ret = __repmgr_await_gmdbop(env)) == 0)
+ db_rep->client_intent = TRUE;
+ UNLOCK_MUTEX(db_rep->mutex);
+ return (ret == 0 ? __repmgr_repstart(env, DB_REP_CLIENT) : ret);
+}
+
+/*
+ * Looks up a site from our local (in-process) list, or returns NULL if not
+ * found.
+ *
+ * PUBLIC: REPMGR_SITE *__repmgr_lookup_site __P((ENV *, const char *, u_int));
+ */
+REPMGR_SITE *
+__repmgr_lookup_site(env, host, port)
+ ENV *env;
+ const char *host;
+ u_int port;
+{
+ DB_REP *db_rep;
+ REPMGR_SITE *site;
+ u_int i;
+
+ db_rep = env->rep_handle;
+ for (i = 0; i < db_rep->site_cnt; i++) {
+ site = &db_rep->sites[i];
+
+ if (strcmp(site->net_addr.host, host) == 0 &&
+ site->net_addr.port == port)
+ return (site);
+ }
+
+ return (NULL);
+}
+
+/*
+ * Look up a site, or add it if it doesn't already exist.
+ *
+ * Caller must hold db_rep mutex and be within ENV_ENTER context, unless this is
+ * a pre-open call.
+ *
+ * PUBLIC: int __repmgr_find_site __P((ENV *, const char *, u_int, int *));
+ */
+int
+__repmgr_find_site(env, host, port, eidp)
+ ENV *env;
+ const char *host;
+ u_int port;
+ int *eidp;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ REPMGR_SITE *site;
+ int eid, ret;
+
+ db_rep = env->rep_handle;
+ ret = 0;
+ if (REP_ON(env)) {
+ rep = db_rep->region;
+ MUTEX_LOCK(env, rep->mtx_repmgr);
+ ret = get_eid(env, host, port, &eid);
+ MUTEX_UNLOCK(env, rep->mtx_repmgr);
+ } else {
+ if ((site = __repmgr_lookup_site(env, host, port)) == NULL &&
+ (ret = __repmgr_new_site(env, &site, host, port)) != 0)
+ return (ret);
+ eid = EID_FROM_SITE(site);
+ }
+ if (ret == 0)
+ *eidp = eid;
+ return (ret);
+}
+
+/*
+ * Get the EID of the named remote site, even if it means creating a new entry
+ * in our table if it doesn't already exist.
+ *
+ * Caller must hold both db_rep mutex and mtx_repmgr.
+ */
+static int
+get_eid(env, host, port, eidp)
+ ENV *env;
+ const char *host;
+ u_int port;
+ int *eidp;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ REPMGR_SITE *site;
+ int eid, ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ if ((ret = __repmgr_copy_in_added_sites(env)) != 0)
+ return (ret);
+ if ((site = __repmgr_lookup_site(env, host, port)) == NULL) {
+ /*
+ * Store both locally and in shared region.
+ */
+ if ((ret = __repmgr_new_site(env, &site, host, port)) != 0)
+ return (ret);
+
+ eid = EID_FROM_SITE(site);
+ DB_ASSERT(env, (u_int)eid == db_rep->site_cnt - 1);
+ if ((ret = __repmgr_share_netaddrs(env,
+ rep, (u_int)eid, db_rep->site_cnt)) == 0) {
+ /* Show that a change was made. */
+ db_rep->siteinfo_seq = ++rep->siteinfo_seq;
+ } else {
+ /*
+ * Rescind the local slot we just added, so that we at
+ * least keep the two lists in sync.
+ */
+ db_rep->site_cnt--;
+ __repmgr_cleanup_netaddr(env, &site->net_addr);
+ }
+ } else
+ eid = EID_FROM_SITE(site);
+ if (ret == 0)
+ *eidp = eid;
+ return (ret);
+}
+
+/*
+ * Sets the named remote site's group membership status to the given value,
+ * creating it first if it doesn't already exist. Adjusts connections
+ * accordingly.
+ *
+ * PUBLIC: int __repmgr_set_membership __P((ENV *,
+ * PUBLIC: const char *, u_int, u_int32_t));
+ *
+ * Caller must hold db_rep mutex, and be in ENV_ENTER context.
+ */
+int
+__repmgr_set_membership(env, host, port, status)
+ ENV *env;
+ const char *host;
+ u_int port;
+ u_int32_t status;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ REGINFO *infop;
+ REPMGR_SITE *site;
+ SITEINFO *sites;
+ u_int32_t orig;
+ int eid, ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ infop = env->reginfo;
+
+ COMPQUIET(orig, 0);
+ COMPQUIET(site, NULL);
+ DB_ASSERT(env, REP_ON(env));
+
+ MUTEX_LOCK(env, rep->mtx_repmgr);
+ if ((ret = get_eid(env, host, port, &eid)) == 0) {
+ DB_ASSERT(env, IS_VALID_EID(eid));
+ site = SITE_FROM_EID(eid);
+ orig = site->membership;
+ sites = R_ADDR(infop, rep->siteinfo_off);
+
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "set membership for %s:%lu %lu (was %lu)",
+ host, (u_long)port, (u_long)status, (u_long)orig));
+ if (status != sites[eid].status) {
+ /*
+ * Show that a change is occurring.
+ *
+ * The call to get_eid() might have also bumped the
+ * sequence number, and since this is all happening
+ * within a single critical section it would be possible
+ * to avoid "wasting" a sequence number. But it's
+ * hardly worth the trouble and mental complexity: the
+ * sequence number counts changes that occur within an
+ * env region lifetime, so there should be plenty.
+ * We'll run out of membership DB version numbers long
+ * before this becomes a problem.
+ */
+ db_rep->siteinfo_seq = ++rep->siteinfo_seq;
+ }
+
+ /* Set both private and shared copies of the info. */
+ site->membership = status;
+ sites[eid].status = status;
+ }
+ MUTEX_UNLOCK(env, rep->mtx_repmgr);
+
+ /*
+ * If our notion of the site's membership changed, we may need to create
+ * or kill a connection.
+ */
+ if (ret == 0 && db_rep->repmgr_status == running &&
+ SELECTOR_RUNNING(db_rep)) {
+ if (eid == db_rep->self_eid && status != SITE_PRESENT)
+ ret = DB_DELETED;
+ else if (orig != SITE_PRESENT && status == SITE_PRESENT &&
+ site->state == SITE_IDLE) {
+ /*
+ * Here we might have just joined a group, or we might
+ * be an existing site and we've just learned of another
+ * site joining the group. In the former case, we
+ * certainly want to connect right away; in the latter
+ * case it might be better to wait, because the new site
+ * probably isn't quite ready to accept our connection.
+ * But deciding which case we're in here would be messy,
+ * so for now we just keep it simple and always try
+ * connecting immediately. The resulting connection
+ * failure shouldn't hurt anything, because we'll just
+ * naturally try again later.
+ */
+ ret = __repmgr_schedule_connection_attempt(env,
+ eid, TRUE);
+ if (eid != db_rep->self_eid)
+ DB_EVENT(env, DB_EVENT_REP_SITE_ADDED, &eid);
+ } else if (orig != 0 && status == 0)
+ DB_EVENT(env, DB_EVENT_REP_SITE_REMOVED, &eid);
+
+ /*
+ * Callers are responsible for adjusting nsites, even though in
+ * a way it would make sense to do it here. It's awkward to do
+ * it here at start-up/join time, when we load up starting from
+ * an empty array. Then we would get rep_set_nsites()
+ * repeatedly, and when leases were in use that would thrash the
+ * lease table adjustment.
+ */
+ }
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_bcast_parm_refresh __P((ENV *));
+ */
+int
+__repmgr_bcast_parm_refresh(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ __repmgr_parm_refresh_args parms;
+ u_int8_t buf[__REPMGR_PARM_REFRESH_SIZE];
+ int ret;
+
+ DB_ASSERT(env, REP_ON(env));
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ LOCK_MUTEX(db_rep->mutex);
+ parms.ack_policy = (u_int32_t)rep->perm_policy;
+ if (rep->priority == 0)
+ parms.flags = 0;
+ else
+ parms.flags = SITE_ELECTABLE;
+ __repmgr_parm_refresh_marshal(env, &parms, buf);
+ ret = __repmgr_bcast_own_msg(env,
+ REPMGR_PARM_REFRESH, buf, __REPMGR_PARM_REFRESH_SIZE);
+ UNLOCK_MUTEX(db_rep->mutex);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_chg_prio __P((ENV *, u_int32_t, u_int32_t));
+ */
+int
+__repmgr_chg_prio(env, prev, cur)
+ ENV *env;
+ u_int32_t prev, cur;
+{
+ if ((prev == 0 && cur != 0) ||
+ (prev != 0 && cur == 0))
+ return (__repmgr_bcast_parm_refresh(env));
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_bcast_own_msg __P((ENV *,
+ * PUBLIC: u_int32_t, u_int8_t *, size_t));
+ *
+ * Caller must hold mutex.
+ */
+int
+__repmgr_bcast_own_msg(env, type, buf, len)
+ ENV *env;
+ u_int32_t type;
+ u_int8_t *buf;
+ size_t len;
+{
+ DB_REP *db_rep;
+ REPMGR_CONNECTION *conn;
+ REPMGR_SITE *site;
+ int ret;
+ u_int i;
+
+ db_rep = env->rep_handle;
+ if (!SELECTOR_RUNNING(db_rep))
+ return (0);
+ FOR_EACH_REMOTE_SITE_INDEX(i) {
+ site = SITE_FROM_EID(i);
+ if (site->state != SITE_CONNECTED)
+ continue;
+ if ((conn = site->ref.conn.in) != NULL &&
+ conn->state == CONN_READY &&
+ (ret = __repmgr_send_own_msg(env,
+ conn, type, buf, (u_int32_t)len)) != 0 &&
+ (ret = __repmgr_bust_connection(env, conn)) != 0)
+ return (ret);
+ if ((conn = site->ref.conn.out) != NULL &&
+ conn->state == CONN_READY &&
+ (ret = __repmgr_send_own_msg(env,
+ conn, type, buf, (u_int32_t)len)) != 0 &&
+ (ret = __repmgr_bust_connection(env, conn)) != 0)
+ return (ret);
+ }
+ return (0);
+}
diff --git a/src/repmgr/repmgr_windows.c b/src/repmgr/repmgr_windows.c
new file mode 100644
index 00000000..d9c2a03d
--- /dev/null
+++ b/src/repmgr/repmgr_windows.c
@@ -0,0 +1,849 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/* Convert time-out from microseconds to milliseconds, rounding up. */
+#define DB_TIMEOUT_TO_WINDOWS_TIMEOUT(t) (((t) + (US_PER_MS - 1)) / US_PER_MS)
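+
+/*
+ * Editor's note, not part of the original source: with US_PER_MS == 1000,
+ * a 1500-microsecond time-out becomes (1500 + 999) / 1000 == 2ms, so any
+ * non-zero time-out rounds up rather than down to a zero-length (i.e.,
+ * non-blocking) Windows wait.
+ */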
+
+typedef struct __cond_waiter {
+ HANDLE event;
+ PREDICATE pred;
+ void *ctx;
+ int next_free;
+} COND_WAITER;
+
+#define WAITER_SLOT_IN_USE(w) ((w)->pred != NULL)
+
+/*
+ * Array slots [0:next_avail-1] are initialized, and either in use or on the
+ * free list. Slots beyond that are virgin territory, whose memory contents
+ * could be garbage. In particular, note that slots [0:next_avail-1] have a
+ * Win32 Event Object created for them, each of which has to be freed when
+ * cleaning up this data structure.
+ *
+ * "first_free" points to a list of not-in-use slots threaded through the first
+ * section of the array.
+ */
+struct __cond_waiters_table {
+ struct __cond_waiter *array;
+ int size;
+ int next_avail;
+ int first_free;
+};
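+
+/*
+ * Editor's note: a hypothetical illustration, not part of the original
+ * source, assuming free_wait_slot() pushes released slots LIFO onto the
+ * list.  With size == 4 and next_avail == 3, releasing slots 0 and then 2
+ * leaves
+ *
+ *	first_free == 2, array[2].next_free == 0, array[0].next_free == -1
+ *
+ * so the next allocation pops slot 2, and virgin slot 3 is touched only
+ * once the free list is empty again (first_free == -1).
+ */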
+
+/*
+ * Aggregated control info needed for preparing for WSAWaitForMultipleEvents()
+ * call.
+ */
+struct io_info {
+ REPMGR_CONNECTION **connections;
+ WSAEVENT *events;
+ DWORD nevents;
+};
+
+static int allocate_wait_slot __P((ENV *, int *, COND_WAITERS_TABLE *));
+static void free_wait_slot __P((ENV *, int, COND_WAITERS_TABLE *));
+static int handle_completion __P((ENV *, REPMGR_CONNECTION *));
+static int prepare_io __P((ENV *, REPMGR_CONNECTION *, void *));
+
+int
+__repmgr_thread_start(env, runnable)
+ ENV *env;
+ REPMGR_RUNNABLE *runnable;
+{
+ HANDLE event, thread_id;
+
+ runnable->finished = FALSE;
+ runnable->quit_requested = FALSE;
+ runnable->env = env;
+
+ if ((event = CreateEvent(NULL, TRUE, FALSE, NULL)) == NULL)
+ return (GetLastError());
+ thread_id = CreateThread(NULL, 0,
+ (LPTHREAD_START_ROUTINE)runnable->run, runnable, 0, NULL);
+ if (thread_id == NULL) {
+ CloseHandle(event);
+ return (GetLastError());
+ }
+ runnable->thread_id = thread_id;
+ runnable->quit_event = event;
+ return (0);
+}
+
+int
+__repmgr_thread_join(thread)
+ REPMGR_RUNNABLE *thread;
+{
+ int ret;
+
+ ret = 0;
+ if (WaitForSingleObject(thread->thread_id, INFINITE) != WAIT_OBJECT_0)
+ ret = GetLastError();
+ if (!CloseHandle(thread->thread_id) && ret == 0)
+ ret = GetLastError();
+ if (!CloseHandle(thread->quit_event) && ret == 0)
+ ret = GetLastError();
+
+ return (ret);
+}
+
+int
+__repmgr_set_nonblocking(s)
+ SOCKET s;
+{
+ int ret;
+ u_long onoff;
+
+ onoff = 1; /* any non-zero value */
+ if ((ret = ioctlsocket(s, FIONBIO, &onoff)) == SOCKET_ERROR)
+ return (WSAGetLastError());
+ return (0);
+}
+
+int
+__repmgr_set_nonblock_conn(conn)
+ REPMGR_CONNECTION *conn;
+{
+ int ret;
+
+ if ((ret = __repmgr_set_nonblocking(conn->fd)) != 0)
+ return (ret);
+
+ if ((conn->event_object = WSACreateEvent()) == WSA_INVALID_EVENT) {
+ ret = net_errno;
+ return (ret);
+ }
+ return (0);
+}
+
+/*
+ * !!!
+ * Caller must hold the repmgr->mutex for this thread synchronization to
+ * work properly.
+ */
+int
+__repmgr_wake_waiters(env, w)
+ ENV *env;
+ waiter_t *w;
+{
+ DB_REP *db_rep;
+ COND_WAITERS_TABLE *waiters;
+ COND_WAITER *slot;
+ int i, ret;
+
+ ret = 0;
+ db_rep = env->rep_handle;
+ waiters = *w;
+ for (i = 0; i < waiters->next_avail; i++) {
+ slot = &waiters->array[i];
+ if (!WAITER_SLOT_IN_USE(slot))
+ continue;
+ if ((*slot->pred)(env, slot->ctx) ||
+ db_rep->repmgr_status == stopped)
+ if (!SetEvent(slot->event) && ret == 0)
+ ret = GetLastError();
+ }
+ return (ret);
+}
+
+/*
+ * !!!
+ * Caller must hold mutex.
+ */
+int
+__repmgr_await_cond(env, pred, ctx, timeout, waiters_p)
+ ENV *env;
+ PREDICATE pred;
+ void *ctx;
+ db_timeout_t timeout;
+ waiter_t *waiters_p;
+{
+ COND_WAITERS_TABLE *waiters;
+ COND_WAITER *waiter;
+ DB_REP *db_rep;
+ REP *rep;
+ DWORD ret, win_timeout;
+ int i;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ waiters = *waiters_p;
+
+ if ((ret = allocate_wait_slot(env, &i, waiters)) != 0)
+ goto err;
+ waiter = &waiters->array[i];
+
+ win_timeout = timeout > 0 ?
+ DB_TIMEOUT_TO_WINDOWS_TIMEOUT(timeout) : INFINITE;
+ waiter->pred = pred;
+ waiter->ctx = ctx;
+ if ((ret = SignalObjectAndWait(*db_rep->mutex,
+ waiter->event, win_timeout, FALSE)) == WAIT_FAILED) {
+ ret = GetLastError();
+ } else if (ret == WAIT_TIMEOUT)
+ ret = DB_TIMEOUT;
+ else
+ DB_ASSERT(env, ret == WAIT_OBJECT_0);
+
+ LOCK_MUTEX(db_rep->mutex);
+ free_wait_slot(env, i, waiters);
+ if (db_rep->repmgr_status == stopped)
+ ret = DB_REP_UNAVAIL;
+
+err:
+ return (ret);
+}
+
+/*
+ * !!!
+ * Caller must hold the mutex.
+ */
+static int
+allocate_wait_slot(env, resultp, table)
+ ENV *env;
+ int *resultp;
+ COND_WAITERS_TABLE *table;
+{
+ COND_WAITER *w;
+ HANDLE event;
+ int i, ret;
+
+	if (table->first_free == -1) {
+		if (table->next_avail >= table->size) {
+			/*
+			 * Grow the array.  Update "size" and "array" only
+			 * after the reallocation succeeds, so that a failed
+			 * reallocation leaves the table consistent.
+			 */
+			w = table->array;
+			if ((ret = __os_realloc(env,
+			    table->size * 2 * sizeof(*w), &w)) != 0)
+				return (ret);
+			table->array = w;
+			table->size *= 2;
+		}
+ if ((event = CreateEvent(NULL,
+ FALSE, FALSE, NULL)) == NULL) {
+ /* No need to rescind the memory reallocation. */
+ return (GetLastError());
+ }
+
+ /*
+		 * We get here when, one way or another, we're ready to use
+		 * the next slot (for the first time).
+ */
+ i = table->next_avail++;
+ w = &table->array[i];
+ w->event = event;
+ } else {
+ i = table->first_free;
+ w = &table->array[i];
+ table->first_free = w->next_free;
+ }
+ /*
+ * Make sure this event state is nonsignaled. It is possible that
+ * late processing could have signaled this event after the end of
+ * the previous wait but before reacquiring the mutex, and this
+ * extra signal would incorrectly cause the next wait to return
+ * immediately.
+ */
+ (void)WaitForSingleObject(w->event, 0);
+ *resultp = i;
+ return (0);
+}
+
+static void
+free_wait_slot(env, slot_index, table)
+ ENV *env;
+ int slot_index;
+ COND_WAITERS_TABLE *table;
+{
+	COND_WAITER *slot;
+
+	COMPQUIET(env, NULL);
+	slot = &table->array[slot_index];
+
+ slot->pred = NULL; /* show it's not in use */
+ slot->next_free = table->first_free;
+ table->first_free = slot_index;
+}
+
+int
+__repmgr_await_gmdbop(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ int ret;
+
+ db_rep = env->rep_handle;
+ while (db_rep->gmdb_busy) {
+ if (!ResetEvent(db_rep->gmdb_idle))
+ return (GetLastError());
+ ret = SignalObjectAndWait(*db_rep->mutex,
+ db_rep->gmdb_idle, INFINITE, FALSE);
+ LOCK_MUTEX(db_rep->mutex);
+ if (ret == WAIT_FAILED)
+ return (GetLastError());
+ DB_ASSERT(env, ret == WAIT_OBJECT_0);
+ }
+ return (0);
+}
+
+/* (See requirements described in repmgr_posix.c.) */
+int
+__repmgr_await_drain(env, conn, timeout)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ db_timeout_t timeout;
+{
+ DB_REP *db_rep;
+ db_timespec deadline, delta, now;
+ db_timeout_t t;
+ DWORD duration, ret;
+ int round_up;
+
+ db_rep = env->rep_handle;
+
+ __os_gettime(env, &deadline, 1);
+ TIMESPEC_ADD_DB_TIMEOUT(&deadline, timeout);
+
+ while (conn->out_queue_length >= OUT_QUEUE_LIMIT) {
+ if (!ResetEvent(conn->drained))
+ return (GetLastError());
+
+ /* How long until the deadline? */
+ __os_gettime(env, &now, 1);
+ if (timespeccmp(&now, &deadline, >=)) {
+ conn->state = CONN_CONGESTED;
+ return (0);
+ }
+ delta = deadline;
+ timespecsub(&delta, &now);
+ round_up = TRUE;
+ DB_TIMESPEC_TO_TIMEOUT(t, &delta, round_up);
+ duration = DB_TIMEOUT_TO_WINDOWS_TIMEOUT(t);
+
+ ret = SignalObjectAndWait(*db_rep->mutex,
+ conn->drained, duration, FALSE);
+ LOCK_MUTEX(db_rep->mutex);
+ if (ret == WAIT_FAILED)
+ return (GetLastError());
+ else if (ret == WAIT_TIMEOUT) {
+ conn->state = CONN_CONGESTED;
+ return (0);
+ } else
+ DB_ASSERT(env, ret == WAIT_OBJECT_0);
+
+ if (db_rep->repmgr_status == stopped)
+ return (0);
+ if (conn->state == CONN_DEFUNCT)
+ return (DB_REP_UNAVAIL);
+ }
+ return (0);
+}
+
+/*
+ * Creates a manual reset event, which is usually our best choice when we may
+ * have multiple threads waiting on a single event: it stays signaled until
+ * explicitly reset, so every waiting thread is released, not just one.
+ */
+int
+__repmgr_alloc_cond(c)
+ cond_var_t *c;
+{
+ HANDLE event;
+
+ if ((event = CreateEvent(NULL, TRUE, FALSE, NULL)) == NULL)
+ return (GetLastError());
+ *c = event;
+ return (0);
+}
+
+int
+__repmgr_free_cond(c)
+ cond_var_t *c;
+{
+ if (CloseHandle(*c))
+ return (0);
+ return (GetLastError());
+}
+
+void
+__repmgr_env_create_pf(db_rep)
+ DB_REP *db_rep;
+{
+}
+
+int
+__repmgr_create_mutex_pf(mutex)
+ mgr_mutex_t *mutex;
+{
+ if ((*mutex = CreateMutex(NULL, FALSE, NULL)) == NULL)
+ return (GetLastError());
+ return (0);
+}
+
+int
+__repmgr_destroy_mutex_pf(mutex)
+ mgr_mutex_t *mutex;
+{
+ return (CloseHandle(*mutex) ? 0 : GetLastError());
+}
+
+int
+__repmgr_init(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ WSADATA wsaData;
+ int ret;
+
+ db_rep = env->rep_handle;
+
+ if ((ret = WSAStartup(MAKEWORD(2, 2), &wsaData)) != 0) {
+ __db_err(env, ret, DB_STR("3589",
+ "unable to initialize Windows networking"));
+ return (ret);
+ }
+
+ if ((db_rep->signaler = CreateEvent(NULL, /* security attr */
+ FALSE, /* (not) of the manual reset variety */
+ FALSE, /* (not) initially signaled */
+ NULL)) == NULL) /* name */
+ goto geterr;
+
+ if ((db_rep->msg_avail = CreateEvent(NULL, TRUE, FALSE, NULL))
+ == NULL)
+ goto geterr;
+
+ if ((db_rep->check_election = CreateEvent(NULL, TRUE, FALSE, NULL))
+ == NULL)
+ goto geterr;
+
+ if ((db_rep->gmdb_idle = CreateEvent(NULL, TRUE, FALSE, NULL))
+ == NULL)
+ goto geterr;
+
+ if ((ret = __repmgr_init_waiters(env, &db_rep->ack_waiters)) != 0)
+ goto err;
+ return (0);
+
+geterr:
+ ret = GetLastError();
+err:
+ if (db_rep->gmdb_idle != NULL)
+ CloseHandle(db_rep->gmdb_idle);
+ if (db_rep->check_election != NULL)
+ CloseHandle(db_rep->check_election);
+ if (db_rep->msg_avail != NULL)
+ CloseHandle(db_rep->msg_avail);
+ if (db_rep->signaler != NULL)
+ CloseHandle(db_rep->signaler);
+ db_rep->msg_avail =
+ db_rep->check_election =
+ db_rep->gmdb_idle =
+ db_rep->signaler = NULL;
+ (void)WSACleanup();
+ return (ret);
+}
+
+int
+__repmgr_deinit(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ int ret, t_ret;
+
+ db_rep = env->rep_handle;
+ if (!(REPMGR_INITED(db_rep)))
+ return (0);
+
+ ret = 0;
+ if (WSACleanup() == SOCKET_ERROR)
+ ret = WSAGetLastError();
+
+ if ((t_ret = __repmgr_destroy_waiters(env, &db_rep->ack_waiters))
+ != 0 && ret == 0)
+ ret = t_ret;
+
+ if (!CloseHandle(db_rep->gmdb_idle) && ret == 0)
+ ret = GetLastError();
+
+ if (!CloseHandle(db_rep->check_election) && ret == 0)
+ ret = GetLastError();
+
+ if (!CloseHandle(db_rep->msg_avail) && ret == 0)
+ ret = GetLastError();
+
+ if (!CloseHandle(db_rep->signaler) && ret == 0)
+ ret = GetLastError();
+ db_rep->msg_avail =
+ db_rep->check_election =
+ db_rep->gmdb_idle =
+ db_rep->signaler = NULL;
+
+ return (ret);
+}
+
+int
+__repmgr_init_waiters(env, waiters)
+ ENV *env;
+ waiter_t *waiters;
+{
+#define INITIAL_ALLOCATION 5 /* arbitrary size */
+ COND_WAITERS_TABLE *table;
+ int ret;
+
+ table = NULL;
+
+ if ((ret =
+ __os_calloc(env, 1, sizeof(COND_WAITERS_TABLE), &table)) != 0)
+ return (ret);
+
+ if ((ret = __os_calloc(env, INITIAL_ALLOCATION, sizeof(COND_WAITER),
+ &table->array)) != 0) {
+ __os_free(env, table);
+ return (ret);
+ }
+
+ table->size = INITIAL_ALLOCATION;
+ table->first_free = -1;
+ table->next_avail = 0;
+
+ /* There's a restaurant joke in there somewhere. */
+ *waiters = table;
+ return (0);
+}
+
+int
+__repmgr_destroy_waiters(env, waitersp)
+ ENV *env;
+ waiter_t *waitersp;
+{
+ waiter_t waiters;
+ int i, ret;
+
+ waiters = *waitersp;
+ ret = 0;
+ for (i = 0; i < waiters->next_avail; i++) {
+ if (!CloseHandle(waiters->array[i].event) && ret == 0)
+ ret = GetLastError();
+ }
+ __os_free(env, waiters->array);
+ __os_free(env, waiters);
+ return (ret);
+}
+
+int
+__repmgr_lock_mutex(mutex)
+ mgr_mutex_t *mutex;
+{
+ if (WaitForSingleObject(*mutex, INFINITE) == WAIT_OBJECT_0)
+ return (0);
+ return (GetLastError());
+}
+
+int
+__repmgr_unlock_mutex(mutex)
+ mgr_mutex_t *mutex;
+{
+ if (ReleaseMutex(*mutex))
+ return (0);
+ return (GetLastError());
+}
+
+int
+__repmgr_signal(v)
+ cond_var_t *v;
+{
+ return (SetEvent(*v) ? 0 : GetLastError());
+}
+
+int
+__repmgr_wake_msngers(env, n)
+ ENV *env;
+ u_int n;
+{
+ DB_REP *db_rep;
+ u_int i;
+
+ db_rep = env->rep_handle;
+
+ /* Ask all threads beyond index 'n' to shut down. */
+	for (i = n; i < db_rep->nthreads; i++)
+ if (!SetEvent(db_rep->messengers[i]->quit_event))
+ return (GetLastError());
+ return (0);
+}
+
+int
+__repmgr_wake_main_thread(env)
+ ENV *env;
+{
+ if (!SetEvent(env->rep_handle->signaler))
+ return (GetLastError());
+ return (0);
+}
+
+int
+__repmgr_writev(fd, iovec, buf_count, byte_count_p)
+ socket_t fd;
+ db_iovec_t *iovec;
+ int buf_count;
+ size_t *byte_count_p;
+{
+ DWORD bytes;
+
+ if (WSASend(fd, iovec,
+ (DWORD)buf_count, &bytes, 0, NULL, NULL) == SOCKET_ERROR)
+ return (net_errno);
+
+ *byte_count_p = (size_t)bytes;
+ return (0);
+}
+
+int
+__repmgr_readv(fd, iovec, buf_count, xfr_count_p)
+ socket_t fd;
+ db_iovec_t *iovec;
+ int buf_count;
+ size_t *xfr_count_p;
+{
+ DWORD bytes, flags;
+
+ flags = 0;
+ if (WSARecv(fd, iovec,
+ (DWORD)buf_count, &bytes, &flags, NULL, NULL) == SOCKET_ERROR)
+ return (net_errno);
+
+ *xfr_count_p = (size_t)bytes;
+ return (0);
+}
+
+int
+__repmgr_select_loop(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ DWORD ret;
+ DWORD select_timeout;
+ REPMGR_CONNECTION *connections[WSA_MAXIMUM_WAIT_EVENTS];
+ WSAEVENT events[WSA_MAXIMUM_WAIT_EVENTS];
+ db_timespec timeout;
+ WSAEVENT listen_event;
+ WSANETWORKEVENTS net_events;
+ struct io_info io_info;
+ int i;
+
+ db_rep = env->rep_handle;
+ io_info.connections = connections;
+ io_info.events = events;
+
+ if ((listen_event = WSACreateEvent()) == WSA_INVALID_EVENT) {
+ __db_err(env, net_errno, DB_STR("3590",
+ "can't create event for listen socket"));
+ return (net_errno);
+ }
+ if (!IS_SUBORDINATE(db_rep) &&
+ WSAEventSelect(db_rep->listen_fd, listen_event, FD_ACCEPT) ==
+ SOCKET_ERROR) {
+ ret = net_errno;
+ __db_err(env, ret, DB_STR("3591",
+ "can't enable event for listener"));
+		goto out;	/* listen_event is closed at "out". */
+ }
+
+ LOCK_MUTEX(db_rep->mutex);
+ if ((ret = __repmgr_first_try_connections(env)) != 0)
+ goto unlock;
+ for (;;) {
+ /* Start with the two events that we always wait for. */
+#define SIGNALER_INDEX 0
+#define LISTENER_INDEX 1
+ events[SIGNALER_INDEX] = db_rep->signaler;
+ if (IS_SUBORDINATE(db_rep))
+ io_info.nevents = 1;
+ else {
+ events[LISTENER_INDEX] = listen_event;
+ io_info.nevents = 2;
+ }
+
+ if ((ret = __repmgr_each_connection(env,
+ prepare_io, &io_info, TRUE)) != 0)
+ goto unlock;
+
+ if (__repmgr_compute_timeout(env, &timeout))
+ select_timeout =
+ (DWORD)(timeout.tv_sec * MS_PER_SEC +
+ timeout.tv_nsec / NS_PER_MS);
+ else {
+ /* No time-based events to wake us up. */
+ select_timeout = WSA_INFINITE;
+ }
+
+ UNLOCK_MUTEX(db_rep->mutex);
+ ret = WSAWaitForMultipleEvents(
+ io_info.nevents, events, FALSE, select_timeout, FALSE);
+ if (db_rep->repmgr_status == stopped) {
+ ret = 0;
+ goto out;
+ }
+ LOCK_MUTEX(db_rep->mutex);
+
+ /*
+ * !!!
+ * Note that `ret' remains set as the return code from
+ * WSAWaitForMultipleEvents, above.
+ */
+ if (ret >= WSA_WAIT_EVENT_0 &&
+ ret < WSA_WAIT_EVENT_0 + io_info.nevents) {
+ if ((i = ret - WSA_WAIT_EVENT_0) == SIGNALER_INDEX) {
+ /* Another thread woke us. */
+ } else if (!IS_SUBORDINATE(db_rep) &&
+ i == LISTENER_INDEX) {
+ if ((ret = WSAEnumNetworkEvents(
+ db_rep->listen_fd, listen_event,
+ &net_events)) == SOCKET_ERROR) {
+ ret = net_errno;
+ goto unlock;
+ }
+ DB_ASSERT(env,
+ net_events.lNetworkEvents & FD_ACCEPT);
+ if ((ret = net_events.iErrorCode[FD_ACCEPT_BIT])
+ != 0)
+ goto unlock;
+ if ((ret = __repmgr_accept(env)) != 0)
+ goto unlock;
+ } else {
+ if (connections[i]->state != CONN_DEFUNCT &&
+ (ret = handle_completion(env,
+ connections[i])) != 0)
+ goto unlock;
+ }
+ } else if (ret == WSA_WAIT_TIMEOUT) {
+ if ((ret = __repmgr_check_timeouts(env)) != 0)
+ goto unlock;
+ } else if (ret == WSA_WAIT_FAILED) {
+ ret = net_errno;
+ goto unlock;
+ }
+ }
+
+unlock:
+ UNLOCK_MUTEX(db_rep->mutex);
+out:
+ if (!CloseHandle(listen_event) && ret == 0)
+ ret = GetLastError();
+ if (ret == DB_DELETED)
+ ret = __repmgr_bow_out(env);
+ LOCK_MUTEX(db_rep->mutex);
+ (void)__repmgr_net_close(env);
+ UNLOCK_MUTEX(db_rep->mutex);
+ return (ret);
+}
+
+static int
+prepare_io(env, conn, info_)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+ void *info_;
+{
+ struct io_info *info;
+ long desired_events;
+ int ret;
+
+ if (conn->state == CONN_DEFUNCT)
+ return (__repmgr_cleanup_defunct(env, conn));
+
+	/*
+	 * Note that even if we were suffering flow control, we would
+	 * nevertheless still read if we hadn't yet gotten a handshake.
+	 * Why? (1) Handshakes are important; and (2) they don't hurt
+	 * anything flow-control-wise.
+	 */
+ info = info_;
+
+ /*
+ * If we ever implemented flow control, we would have some conditions to
+ * examine here. But as it is, we always are willing to accept I/O on
+ * every connection.
+ *
+ * We can only handle as many connections as the number of events the
+ * WSAWaitForMultipleEvents function allows (minus 2, for our overhead:
+ * the listener and the signaler).
+ */
+ DB_ASSERT(env, info->nevents < WSA_MAXIMUM_WAIT_EVENTS);
+ info->events[info->nevents] = conn->event_object;
+ info->connections[info->nevents++] = conn;
+
+ desired_events = FD_READ | FD_CLOSE;
+ if (!STAILQ_EMPTY(&conn->outbound_queue))
+ desired_events |= FD_WRITE;
+ if (WSAEventSelect(conn->fd,
+ conn->event_object, desired_events) == SOCKET_ERROR) {
+ ret = net_errno;
+ __db_err(env, ret, DB_STR_A("3592",
+ "can't set event bits 0x%lx", "%lx"), desired_events);
+ } else
+ ret = 0;
+
+ return (ret);
+}
+
+static int
+handle_completion(env, conn)
+ ENV *env;
+ REPMGR_CONNECTION *conn;
+{
+ int error, ret;
+ WSANETWORKEVENTS events;
+
+ if ((ret = WSAEnumNetworkEvents(conn->fd, conn->event_object, &events))
+ == SOCKET_ERROR) {
+ error = net_errno;
+ __db_err(env, error, DB_STR("3593", "EnumNetworkEvents"));
+ goto report;
+ }
+
+ /* Check both writing and reading. */
+ if (events.lNetworkEvents & FD_CLOSE) {
+ error = events.iErrorCode[FD_CLOSE_BIT];
+ goto report;
+ }
+
+ if (events.lNetworkEvents & FD_WRITE) {
+ if (events.iErrorCode[FD_WRITE_BIT] != 0) {
+ error = events.iErrorCode[FD_WRITE_BIT];
+ goto report;
+ } else if ((ret =
+ __repmgr_write_some(env, conn)) != 0)
+ goto err;
+ }
+
+ if (events.lNetworkEvents & FD_READ) {
+ if (events.iErrorCode[FD_READ_BIT] != 0) {
+ error = events.iErrorCode[FD_READ_BIT];
+ goto report;
+ } else if ((ret =
+ __repmgr_read_from_site(env, conn)) != 0)
+ goto err;
+ }
+
+ if (0) {
+report:
+ __repmgr_fire_conn_err_event(env, conn, error);
+ STAT(env->rep_handle->region->mstat.st_connection_drop++);
+ ret = DB_REP_UNAVAIL;
+ }
+err:
+ if (ret == DB_REP_UNAVAIL)
+ ret = __repmgr_bust_connection(env, conn);
+ return (ret);
+}
diff --git a/src/sequence/seq_stat.c b/src/sequence/seq_stat.c
new file mode 100644
index 00000000..d5b9a401
--- /dev/null
+++ b/src/sequence/seq_stat.c
@@ -0,0 +1,275 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2004, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+#ifdef HAVE_64BIT_TYPES
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc_auto/sequence_ext.h"
+
+#ifdef HAVE_STATISTICS
+static int __seq_print_all __P((DB_SEQUENCE *, u_int32_t));
+static int __seq_print_stats __P((DB_SEQUENCE *, u_int32_t));
+
+/*
+ * __seq_stat --
+ * Get statistics from the sequence.
+ *
+ * PUBLIC: int __seq_stat __P((DB_SEQUENCE *, DB_SEQUENCE_STAT **, u_int32_t));
+ */
+int
+__seq_stat(seq, spp, flags)
+ DB_SEQUENCE *seq;
+ DB_SEQUENCE_STAT **spp;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DBT data;
+ DB_SEQUENCE_STAT *sp;
+ DB_SEQ_RECORD record;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ret, t_ret;
+
+	dbp = seq->seq_dbp;
+	env = dbp->env;
+	sp = NULL;
+
+ SEQ_ILLEGAL_BEFORE_OPEN(seq, "DB_SEQUENCE->stat");
+
+ switch (flags) {
+ case DB_STAT_CLEAR:
+ case DB_STAT_ALL:
+ case 0:
+ break;
+ default:
+ return (__db_ferr(env, "DB_SEQUENCE->stat", 0));
+ }
+
+ ENV_ENTER(env, ip);
+
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check && (ret = __db_rep_enter(dbp, 1, 0, 0)) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ /* Allocate and clear the structure. */
+ if ((ret = __os_umalloc(env, sizeof(*sp), &sp)) != 0)
+ goto err;
+ memset(sp, 0, sizeof(*sp));
+
+ if (seq->mtx_seq != MUTEX_INVALID) {
+ __mutex_set_wait_info(
+ env, seq->mtx_seq, &sp->st_wait, &sp->st_nowait);
+
+ if (LF_ISSET(DB_STAT_CLEAR))
+ __mutex_clear(env, seq->mtx_seq);
+ }
+ memset(&data, 0, sizeof(data));
+ data.data = &record;
+ data.ulen = sizeof(record);
+ data.flags = DB_DBT_USERMEM;
+retry: if ((ret = __db_get(dbp, ip, NULL, &seq->seq_key, &data, 0)) != 0) {
+ if (ret == DB_BUFFER_SMALL &&
+ data.size > sizeof(seq->seq_record)) {
+ if ((ret = __os_malloc(env,
+ data.size, &data.data)) != 0)
+ goto err;
+ data.ulen = data.size;
+ goto retry;
+ }
+		if (data.data != &record)
+			__os_free(env, data.data);
+		goto err;
+ }
+
+ if (data.data != &record)
+ memcpy(&record, data.data, sizeof(record));
+ sp->st_current = record.seq_value;
+ sp->st_value = seq->seq_record.seq_value;
+ sp->st_last_value = seq->seq_last_value;
+ sp->st_min = seq->seq_record.seq_min;
+ sp->st_max = seq->seq_record.seq_max;
+ sp->st_cache_size = seq->seq_cache_size;
+ sp->st_flags = seq->seq_record.flags;
+
+ *spp = sp;
+ if (data.data != &record)
+ __os_free(env, data.data);
+
+	/* Release replication block. */
+err:	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+		ret = t_ret;
+	if (ret != 0 && sp != NULL)
+		__os_ufree(env, sp);
+	ENV_LEAVE(env, ip);
+	return (ret);
+}
+
+/*
+ * __seq_stat_print --
+ * Print statistics from the sequence.
+ *
+ * PUBLIC: int __seq_stat_print __P((DB_SEQUENCE *, u_int32_t));
+ */
+int
+__seq_stat_print(seq, flags)
+ DB_SEQUENCE *seq;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ret, t_ret;
+
+ dbp = seq->seq_dbp;
+ env = dbp->env;
+
+ SEQ_ILLEGAL_BEFORE_OPEN(seq, "DB_SEQUENCE->stat_print");
+
+ ENV_ENTER(env, ip);
+
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check && (ret = __db_rep_enter(dbp, 1, 0, 0)) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ if ((ret = __seq_print_stats(seq, flags)) != 0)
+ goto err;
+
+ if (LF_ISSET(DB_STAT_ALL) &&
+ (ret = __seq_print_all(seq, flags)) != 0)
+ goto err;
+
+ /* Release replication block. */
+err: if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+static const FN __db_seq_flags_fn[] = {
+ { DB_SEQ_DEC, "decrement" },
+ { DB_SEQ_INC, "increment" },
+ { DB_SEQ_RANGE_SET, "range set (internal)" },
+ { DB_SEQ_WRAP, "wraparound at end" },
+ { 0, NULL }
+};
+
+/*
+ * __db_get_seq_flags_fn --
+ * Return the __db_seq_flags_fn array.
+ *
+ * PUBLIC: const FN * __db_get_seq_flags_fn __P((void));
+ */
+const FN *
+__db_get_seq_flags_fn()
+{
+ return (__db_seq_flags_fn);
+}
+
+/*
+ * __seq_print_stats --
+ * Display sequence stat structure.
+ */
+static int
+__seq_print_stats(seq, flags)
+ DB_SEQUENCE *seq;
+ u_int32_t flags;
+{
+ DB_SEQUENCE_STAT *sp;
+ ENV *env;
+ int ret;
+
+ env = seq->seq_dbp->env;
+
+ if ((ret = __seq_stat(seq, &sp, flags)) != 0)
+ return (ret);
+ __db_dl_pct(env, "The number of sequence locks that required waiting",
+ (u_long)sp->st_wait,
+ DB_PCT(sp->st_wait, sp->st_wait + sp->st_nowait), NULL);
+ STAT_FMT("The current sequence value",
+ INT64_FMT, db_seq_t, sp->st_current);
+ STAT_FMT("The cached sequence value",
+ INT64_FMT, db_seq_t, sp->st_value);
+ STAT_FMT("The last cached sequence value",
+ INT64_FMT, db_seq_t, sp->st_last_value);
+ STAT_FMT("The minimum sequence value",
+ INT64_FMT, db_seq_t, sp->st_min);
+ STAT_FMT("The maximum sequence value",
+ INT64_FMT, db_seq_t, sp->st_max);
+ STAT_ULONG("The cache size", sp->st_cache_size);
+ __db_prflags(env, NULL,
+ sp->st_flags, __db_seq_flags_fn, NULL, "\tSequence flags");
+ __os_ufree(seq->seq_dbp->env, sp);
+ return (0);
+}
+
+/*
+ * __seq_print_all --
+ * Display sequence debugging information - none for now.
+ * (The name seems a bit strange, no?)
+ */
+static int
+__seq_print_all(seq, flags)
+ DB_SEQUENCE *seq;
+ u_int32_t flags;
+{
+ COMPQUIET(seq, NULL);
+ COMPQUIET(flags, 0);
+ return (0);
+}
+
+#else /* !HAVE_STATISTICS */
+
+int
+__seq_stat(seq, statp, flags)
+ DB_SEQUENCE *seq;
+ DB_SEQUENCE_STAT **statp;
+ u_int32_t flags;
+{
+ COMPQUIET(statp, NULL);
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(seq->seq_dbp->env));
+}
+
+int
+__seq_stat_print(seq, flags)
+ DB_SEQUENCE *seq;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(seq->seq_dbp->env));
+}
+
+/*
+ * __db_get_seq_flags_fn --
+ * Return the __db_seq_flags_fn array.
+ *
+ * PUBLIC: const FN * __db_get_seq_flags_fn __P((void));
+ */
+const FN *
+__db_get_seq_flags_fn()
+{
+ static const FN __db_seq_flags_fn[] = {
+ { 0, NULL }
+ };
+
+ /*
+ * !!!
+	 * The Tcl API uses this interface; stub it off.
+ */
+ return (__db_seq_flags_fn);
+}
+#endif /* !HAVE_STATISTICS */
+#endif /* HAVE_64BIT_TYPES */
diff --git a/src/sequence/sequence.c b/src/sequence/sequence.c
new file mode 100644
index 00000000..1c19f838
--- /dev/null
+++ b/src/sequence/sequence.c
@@ -0,0 +1,1011 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2004, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/lock.h"
+#include "dbinc/txn.h"
+#include "dbinc_auto/sequence_ext.h"
+
+#ifdef HAVE_64BIT_TYPES
+/*
+ * Sequences must be architecture independent but they are stored as user
+ * data in databases so the code here must handle the byte ordering. We
+ * store them in little-endian byte ordering. If we are on a big-endian
+ * machine we swap in and out when we read from the database. seq->seq_rp
+ * always points to the record in native ordering.
+ *
+ * Version 1 always stored things in native format so if we detect this we
+ * upgrade on the fly and write the record back at open time.
+ */
+#define SEQ_SWAP(rp) \
+ do { \
+ M_32_SWAP((rp)->seq_version); \
+ M_32_SWAP((rp)->flags); \
+ M_64_SWAP((rp)->seq_value); \
+ M_64_SWAP((rp)->seq_max); \
+ M_64_SWAP((rp)->seq_min); \
+ } while (0)
+
+#define SEQ_SWAP_IN(env, seq) \
+ do { \
+ if (!F_ISSET((env), ENV_LITTLEENDIAN)) { \
+ memcpy(&seq->seq_record, seq->seq_data.data, \
+ sizeof(seq->seq_record)); \
+ SEQ_SWAP(&seq->seq_record); \
+ } \
+ } while (0)
+
+#define SEQ_SWAP_OUT(env, seq) \
+ do { \
+ if (!F_ISSET((env), ENV_LITTLEENDIAN)) { \
+ memcpy(seq->seq_data.data, \
+ &seq->seq_record, sizeof(seq->seq_record));\
+ SEQ_SWAP((DB_SEQ_RECORD*)seq->seq_data.data); \
+ } \
+ } while (0)
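+/*
+ * Illustrative note on the round trip (a sketch, not normative): on a
+ * big-endian host SEQ_SWAP_IN copies the little-endian on-disk image into
+ * seq->seq_record and swaps it to native order, and SEQ_SWAP_OUT does the
+ * reverse before the record is written back.  On a little-endian host both
+ * macros are no-ops, and seq_data typically points directly at seq_record
+ * (see __seq_open_pp).
+ */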
+
+static int __seq_chk_cachesize __P((ENV *, int32_t, db_seq_t, db_seq_t));
+static int __seq_close __P((DB_SEQUENCE *, u_int32_t));
+static int __seq_close_pp __P((DB_SEQUENCE *, u_int32_t));
+static int __seq_get
+ __P((DB_SEQUENCE *, DB_TXN *, int32_t, db_seq_t *, u_int32_t));
+static int __seq_get_cachesize __P((DB_SEQUENCE *, int32_t *));
+static int __seq_get_db __P((DB_SEQUENCE *, DB **));
+static int __seq_get_flags __P((DB_SEQUENCE *, u_int32_t *));
+static int __seq_get_key __P((DB_SEQUENCE *, DBT *));
+static int __seq_get_range __P((DB_SEQUENCE *, db_seq_t *, db_seq_t *));
+static int __seq_initial_value __P((DB_SEQUENCE *, db_seq_t));
+static int __seq_open_pp __P((DB_SEQUENCE *, DB_TXN *, DBT *, u_int32_t));
+static int __seq_remove __P((DB_SEQUENCE *, DB_TXN *, u_int32_t));
+static int __seq_set_cachesize __P((DB_SEQUENCE *, int32_t));
+static int __seq_set_flags __P((DB_SEQUENCE *, u_int32_t));
+static int __seq_set_range __P((DB_SEQUENCE *, db_seq_t, db_seq_t));
+static int __seq_update
+ __P((DB_SEQUENCE *, DB_THREAD_INFO *, DB_TXN *, int32_t, u_int32_t));
+
+/*
+ * db_sequence_create --
+ * DB_SEQUENCE constructor.
+ *
+ * EXTERN: int db_sequence_create __P((DB_SEQUENCE **, DB *, u_int32_t));
+ */
+int
+db_sequence_create(seqp, dbp, flags)
+ DB_SEQUENCE **seqp;
+ DB *dbp;
+ u_int32_t flags;
+{
+ DB_SEQUENCE *seq;
+ ENV *env;
+ int ret;
+
+ env = dbp->env;
+
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "db_sequence_create");
+
+ /* Check for invalid function flags. */
+ switch (flags) {
+ case 0:
+ break;
+ default:
+ return (__db_ferr(env, "db_sequence_create", 0));
+ }
+
+ if (dbp->type == DB_HEAP) {
+ __db_errx(env, DB_STR("4016",
+ "Heap databases may not be used with sequences."));
+		return (EINVAL);
+	}
+
+ /* Allocate the sequence. */
+ if ((ret = __os_calloc(env, 1, sizeof(*seq), &seq)) != 0)
+ return (ret);
+
+ seq->seq_dbp = dbp;
+ seq->close = __seq_close_pp;
+ seq->get = __seq_get;
+ seq->get_cachesize = __seq_get_cachesize;
+ seq->set_cachesize = __seq_set_cachesize;
+ seq->get_db = __seq_get_db;
+ seq->get_flags = __seq_get_flags;
+ seq->get_key = __seq_get_key;
+ seq->get_range = __seq_get_range;
+ seq->initial_value = __seq_initial_value;
+ seq->open = __seq_open_pp;
+ seq->remove = __seq_remove;
+ seq->set_flags = __seq_set_flags;
+ seq->set_range = __seq_set_range;
+ seq->stat = __seq_stat;
+ seq->stat_print = __seq_stat_print;
+ seq->seq_rp = &seq->seq_record;
+ *seqp = seq;
+
+ return (0);
+}
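+/*
+ * Illustrative usage sketch (a sketch only: error handling is omitted, and
+ * the key name and cache size are hypothetical):
+ *
+ *	DB_SEQUENCE *seq;
+ *	DBT key;
+ *	db_seq_t val;
+ *
+ *	(void)db_sequence_create(&seq, dbp, 0);
+ *	(void)seq->initial_value(seq, 1);
+ *	(void)seq->set_cachesize(seq, 100);
+ *	memset(&key, 0, sizeof(key));
+ *	key.data = "my_sequence";
+ *	key.size = sizeof("my_sequence") - 1;
+ *	(void)seq->open(seq, NULL, &key, DB_CREATE | DB_THREAD);
+ *	(void)seq->get(seq, NULL, 1, &val, 0);
+ *	(void)seq->close(seq, 0);
+ *
+ * After the get call, val holds the next value in the sequence.
+ */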
+
+/*
+ * __seq_open --
+ * DB_SEQUENCE->open method.
+ *
+ */
+static int
+__seq_open_pp(seq, txn, keyp, flags)
+ DB_SEQUENCE *seq;
+ DB_TXN *txn;
+ DBT *keyp;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_SEQ_RECORD *rp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ u_int32_t tflags;
+ int handle_check, txn_local, ret, t_ret;
+#define SEQ_OPEN_FLAGS (DB_CREATE | DB_EXCL | DB_THREAD)
+
+ dbp = seq->seq_dbp;
+ env = dbp->env;
+ txn_local = 0;
+
+ STRIP_AUTO_COMMIT(flags);
+ SEQ_ILLEGAL_AFTER_OPEN(seq, "DB_SEQUENCE->open");
+
+ ENV_ENTER(env, ip);
+
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check &&
+ (ret = __db_rep_enter(dbp, 1, 0, IS_REAL_TXN(txn))) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ if ((ret = __db_fchk(env,
+ "DB_SEQUENCE->open", flags, SEQ_OPEN_FLAGS)) != 0)
+ goto err;
+
+ if (keyp->size == 0) {
+ __db_errx(env, DB_STR("4001",
+ "Zero length sequence key specified"));
+ ret = EINVAL;
+ goto err;
+ }
+
+ if ((ret = __db_get_flags(dbp, &tflags)) != 0)
+ goto err;
+
+ /*
+ * We can let replication clients open sequences, but must
+ * check later that they do not update them.
+ */
+ if (F_ISSET(dbp, DB_AM_RDONLY)) {
+ ret = __db_rdonly(dbp->env, "DB_SEQUENCE->open");
+ goto err;
+ }
+ if (FLD_ISSET(tflags, DB_DUP)) {
+ __db_errx(env, DB_STR("4002",
+ "Sequences not supported in databases configured for duplicate data"));
+ ret = EINVAL;
+ goto err;
+ }
+
+ if (LF_ISSET(DB_THREAD)) {
+ if ((ret = __mutex_alloc(env,
+ MTX_SEQUENCE, DB_MUTEX_PROCESS_ONLY, &seq->mtx_seq)) != 0)
+ goto err;
+ }
+
+ memset(&seq->seq_data, 0, sizeof(DBT));
+ if (F_ISSET(env, ENV_LITTLEENDIAN)) {
+ seq->seq_data.data = &seq->seq_record;
+ seq->seq_data.flags = DB_DBT_USERMEM;
+ } else {
+ if ((ret = __os_umalloc(env,
+ sizeof(seq->seq_record), &seq->seq_data.data)) != 0)
+ goto err;
+ seq->seq_data.flags = DB_DBT_REALLOC;
+ }
+
+ seq->seq_data.ulen = seq->seq_data.size = sizeof(seq->seq_record);
+ seq->seq_rp = &seq->seq_record;
+
+ if ((ret = __dbt_usercopy(env, keyp)) != 0)
+ goto err;
+
+ memset(&seq->seq_key, 0, sizeof(DBT));
+ if ((ret = __os_malloc(env, keyp->size, &seq->seq_key.data)) != 0)
+ goto err;
+ memcpy(seq->seq_key.data, keyp->data, keyp->size);
+ seq->seq_key.size = seq->seq_key.ulen = keyp->size;
+ seq->seq_key.flags = DB_DBT_USERMEM;
+
+retry: if ((ret = __db_get(dbp, ip,
+ txn, &seq->seq_key, &seq->seq_data, 0)) != 0) {
+ if (ret == DB_BUFFER_SMALL &&
+ seq->seq_data.size > sizeof(seq->seq_record)) {
+ seq->seq_data.flags = DB_DBT_REALLOC;
+ seq->seq_data.data = NULL;
+ goto retry;
+ }
+ if ((ret != DB_NOTFOUND && ret != DB_KEYEMPTY) ||
+ !LF_ISSET(DB_CREATE))
+ goto err;
+ if (IS_REP_CLIENT(env) &&
+ !F_ISSET(dbp, DB_AM_NOT_DURABLE)) {
+ ret = __db_rdonly(env, "DB_SEQUENCE->open");
+ goto err;
+ }
+ ret = 0;
+
+ rp = &seq->seq_record;
+ if (!F_ISSET(rp, DB_SEQ_RANGE_SET)) {
+ rp->seq_max = INT64_MAX;
+ rp->seq_min = INT64_MIN;
+ }
+ /* INC is the default. */
+ if (!F_ISSET(rp, DB_SEQ_DEC))
+ F_SET(rp, DB_SEQ_INC);
+
+ rp->seq_version = DB_SEQUENCE_VERSION;
+
+ if (rp->seq_value > rp->seq_max ||
+ rp->seq_value < rp->seq_min) {
+ __db_errx(env, DB_STR("4003",
+ "Sequence value out of range"));
+ ret = EINVAL;
+ goto err;
+ } else {
+ SEQ_SWAP_OUT(env, seq);
+ /* Create local transaction as necessary. */
+ if (IS_DB_AUTO_COMMIT(dbp, txn)) {
+ if ((ret =
+ __txn_begin(env, ip, NULL, &txn, 0)) != 0)
+ goto err;
+ txn_local = 1;
+ }
+
+ if ((ret = __db_put(dbp, ip, txn, &seq->seq_key,
+ &seq->seq_data, DB_NOOVERWRITE)) != 0) {
+ __db_errx(env, DB_STR("4004",
+ "Sequence create failed"));
+ goto err;
+ }
+ }
+ } else if (LF_ISSET(DB_CREATE) && LF_ISSET(DB_EXCL)) {
+ ret = EEXIST;
+ goto err;
+ } else if (seq->seq_data.size < sizeof(seq->seq_record)) {
+ __db_errx(env, DB_STR("4005",
+ "Bad sequence record format"));
+ ret = EINVAL;
+ goto err;
+ }
+
+ if (F_ISSET(env, ENV_LITTLEENDIAN))
+ seq->seq_rp = seq->seq_data.data;
+
+ /*
+ * The first release was stored in native mode.
+ * Check the version number before swapping.
+ */
+ rp = seq->seq_data.data;
+ if (rp->seq_version == DB_SEQUENCE_OLDVER) {
+oldver: if (IS_REP_CLIENT(env) &&
+ !F_ISSET(dbp, DB_AM_NOT_DURABLE)) {
+ ret = __db_rdonly(env, "DB_SEQUENCE->open");
+ goto err;
+ }
+ rp->seq_version = DB_SEQUENCE_VERSION;
+ if (!F_ISSET(env, ENV_LITTLEENDIAN)) {
+ if (IS_DB_AUTO_COMMIT(dbp, txn)) {
+ if ((ret =
+ __txn_begin(env, ip, NULL, &txn, 0)) != 0)
+ goto err;
+ txn_local = 1;
+ goto retry;
+ }
+ memcpy(&seq->seq_record, rp, sizeof(seq->seq_record));
+ SEQ_SWAP_OUT(env, seq);
+ }
+ if ((ret = __db_put(dbp,
+ ip, txn, &seq->seq_key, &seq->seq_data, 0)) != 0)
+ goto err;
+ }
+ rp = seq->seq_rp;
+
+ SEQ_SWAP_IN(env, seq);
+
+ if (rp->seq_version != DB_SEQUENCE_VERSION) {
+ /*
+		 * The database may have moved from one type of machine to
+		 * another; check here.  If we moved from little-endian to
+		 * big-endian, the swap above will make the version correct.
+		 * If the move was from big-endian to little-endian, we need
+		 * to swap to see if this is an old version.
+ */
+ if (rp->seq_version == DB_SEQUENCE_OLDVER)
+ goto oldver;
+ M_32_SWAP(rp->seq_version);
+ if (rp->seq_version == DB_SEQUENCE_OLDVER) {
+ SEQ_SWAP(rp);
+ goto oldver;
+ }
+ M_32_SWAP(rp->seq_version);
+ __db_errx(env, DB_STR_A("4006",
+ "Unsupported sequence version: %d", "%d"),
+ rp->seq_version);
+ goto err;
+ }
+
+ seq->seq_last_value = seq->seq_prev_value = rp->seq_value;
+ if (F_ISSET(rp, DB_SEQ_INC))
+ seq->seq_last_value--;
+ else
+ seq->seq_last_value++;
+
+ /*
+ * It's an error to specify a cache larger than the range of sequences.
+ */
+ if (seq->seq_cache_size != 0 && (ret = __seq_chk_cachesize(
+ env, seq->seq_cache_size, rp->seq_max, rp->seq_min)) != 0)
+ goto err;
+
+err: if (txn_local &&
+ (t_ret = __db_txn_auto_resolve(env, txn, 0, ret)) && ret == 0)
+ ret = t_ret;
+ if (ret != 0) {
+ __os_free(env, seq->seq_key.data);
+ seq->seq_key.data = NULL;
+ }
+ /* Release replication block. */
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ ENV_LEAVE(env, ip);
+ __dbt_userfree(env, keyp, NULL, NULL);
+ return (ret);
+}
+
+/*
+ * __seq_get_cachesize --
+ * Accessor for value passed into DB_SEQUENCE->set_cachesize call.
+ *
+ */
+static int
+__seq_get_cachesize(seq, cachesize)
+ DB_SEQUENCE *seq;
+ int32_t *cachesize;
+{
+ SEQ_ILLEGAL_BEFORE_OPEN(seq, "DB_SEQUENCE->get_cachesize");
+
+ *cachesize = seq->seq_cache_size;
+ return (0);
+}
+
+/*
+ * __seq_set_cachesize --
+ * DB_SEQUENCE->set_cachesize.
+ *
+ */
+static int
+__seq_set_cachesize(seq, cachesize)
+ DB_SEQUENCE *seq;
+ int32_t cachesize;
+{
+ ENV *env;
+ int ret;
+
+ env = seq->seq_dbp->env;
+
+ if (cachesize < 0) {
+ __db_errx(env, DB_STR("4007",
+ "Cache size must be >= 0"));
+ return (EINVAL);
+ }
+
+ /*
+ * It's an error to specify a cache larger than the range of sequences.
+ */
+ if (SEQ_IS_OPEN(seq) && (ret = __seq_chk_cachesize(env,
+ cachesize, seq->seq_rp->seq_max, seq->seq_rp->seq_min)) != 0)
+ return (ret);
+
+ seq->seq_cache_size = cachesize;
+ return (0);
+}
+
+#define SEQ_SET_FLAGS (DB_SEQ_WRAP | DB_SEQ_INC | DB_SEQ_DEC)
+/*
+ * __seq_get_flags --
+ * Accessor for flags passed into DB_SEQUENCE->open call
+ *
+ */
+static int
+__seq_get_flags(seq, flagsp)
+ DB_SEQUENCE *seq;
+ u_int32_t *flagsp;
+{
+ SEQ_ILLEGAL_BEFORE_OPEN(seq, "DB_SEQUENCE->get_flags");
+
+ *flagsp = F_ISSET(seq->seq_rp, SEQ_SET_FLAGS);
+ return (0);
+}
+
+/*
+ * __seq_set_flags --
+ * DB_SEQUENCE->set_flags.
+ *
+ */
+static int
+__seq_set_flags(seq, flags)
+ DB_SEQUENCE *seq;
+ u_int32_t flags;
+{
+ DB_SEQ_RECORD *rp;
+ ENV *env;
+ int ret;
+
+ env = seq->seq_dbp->env;
+ rp = seq->seq_rp;
+
+ SEQ_ILLEGAL_AFTER_OPEN(seq, "DB_SEQUENCE->set_flags");
+
+ if ((ret = __db_fchk(
+ env, "DB_SEQUENCE->set_flags", flags, SEQ_SET_FLAGS)) != 0)
+ return (ret);
+ if ((ret = __db_fcchk(env,
+ "DB_SEQUENCE->set_flags", flags, DB_SEQ_DEC, DB_SEQ_INC)) != 0)
+ return (ret);
+
+ if (LF_ISSET(DB_SEQ_DEC | DB_SEQ_INC))
+ F_CLR(rp, DB_SEQ_DEC | DB_SEQ_INC);
+ F_SET(rp, flags);
+
+ return (0);
+}
+
+/*
+ * __seq_initial_value --
+ * DB_SEQUENCE->initial_value.
+ *
+ */
+static int
+__seq_initial_value(seq, value)
+ DB_SEQUENCE *seq;
+ db_seq_t value;
+{
+ DB_SEQ_RECORD *rp;
+ ENV *env;
+
+ env = seq->seq_dbp->env;
+ SEQ_ILLEGAL_AFTER_OPEN(seq, "DB_SEQUENCE->initial_value");
+
+ rp = seq->seq_rp;
+ if (F_ISSET(rp, DB_SEQ_RANGE_SET) &&
+ (value > rp->seq_max || value < rp->seq_min)) {
+ __db_errx(env, DB_STR("4008",
+ "Sequence value out of range"));
+ return (EINVAL);
+ }
+
+ rp->seq_value = value;
+
+ return (0);
+}
+
+/*
+ * __seq_get_range --
+ * Accessor for range passed into DB_SEQUENCE->set_range call
+ *
+ */
+static int
+__seq_get_range(seq, minp, maxp)
+ DB_SEQUENCE *seq;
+ db_seq_t *minp, *maxp;
+{
+ SEQ_ILLEGAL_BEFORE_OPEN(seq, "DB_SEQUENCE->get_range");
+
+ *minp = seq->seq_rp->seq_min;
+ *maxp = seq->seq_rp->seq_max;
+ return (0);
+}
+
+/*
+ * __seq_set_range --
+ * SEQUENCE->set_range.
+ *
+ */
+static int
+__seq_set_range(seq, min, max)
+ DB_SEQUENCE *seq;
+ db_seq_t min, max;
+{
+ DB_SEQ_RECORD *rp;
+ ENV *env;
+
+ env = seq->seq_dbp->env;
+ SEQ_ILLEGAL_AFTER_OPEN(seq, "DB_SEQUENCE->set_range");
+
+ rp = seq->seq_rp;
+ if (min >= max) {
+ __db_errx(env, DB_STR("4009",
+ "Minimum sequence value must be less than maximum sequence value"));
+ return (EINVAL);
+ }
+
+ rp->seq_min = min;
+ rp->seq_max = max;
+ F_SET(rp, DB_SEQ_RANGE_SET);
+
+ return (0);
+}
+
+static int
+__seq_update(seq, ip, txn, delta, flags)
+ DB_SEQUENCE *seq;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ int32_t delta;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DBT *data, ldata;
+ DB_SEQ_RECORD *rp;
+ ENV *env;
+ int32_t adjust;
+ int ret, txn_local, need_mutex;
+
+ dbp = seq->seq_dbp;
+ env = dbp->env;
+ need_mutex = 0;
+ data = &seq->seq_data;
+
+ /*
+ * Create a local transaction as necessary, check for consistent
+ * transaction usage, and, if we have no transaction but do have
+ * locking on, acquire a locker id for the handle lock acquisition.
+ */
+ if (IS_DB_AUTO_COMMIT(dbp, txn)) {
+ if ((ret = __txn_begin(env, ip, NULL, &txn, flags)) != 0)
+ return (ret);
+ txn_local = 1;
+ } else
+ txn_local = 0;
+
+ /* Check for consistent transaction usage. */
+ if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 0)) != 0)
+ goto err;
+ /*
+ * If we are in a global transaction avoid deadlocking on the mutex.
+ * The write lock on the data will prevent two updaters getting in
+ * at once. Fetch the data then see if things are what we thought
+ * they were.
+ */
+ if (txn_local == 0 && txn != NULL) {
+ MUTEX_UNLOCK(env, seq->mtx_seq);
+ need_mutex = 1;
+ data = &ldata;
+ data->data = NULL;
+ data->flags = DB_DBT_REALLOC;
+ }
+
+retry: if ((ret = __db_get(dbp, ip,
+ txn, &seq->seq_key, data, DB_RMW)) != 0) {
+ if (ret == DB_BUFFER_SMALL &&
+ seq->seq_data.size > sizeof(seq->seq_record)) {
+ data->flags = DB_DBT_REALLOC;
+ data->data = NULL;
+ goto retry;
+ }
+ goto err;
+ }
+
+ if (data->size < sizeof(seq->seq_record)) {
+ __db_errx(env, DB_STR("4010",
+ "Bad sequence record format"));
+ ret = EINVAL;
+ goto err;
+ }
+
+ /* We have an exclusive lock on the data, see if we raced. */
+ if (need_mutex) {
+ MUTEX_LOCK(env, seq->mtx_seq);
+ need_mutex = 0;
+ rp = seq->seq_rp;
+ /*
+ * Note that caching must be off if we have global
+ * transaction so the value we fetch from the database
+ * is the correct current value.
+ */
+ if (data->size <= seq->seq_data.size) {
+ memcpy(seq->seq_data.data, data->data, data->size);
+ __os_ufree(env, data->data);
+ } else {
+ seq->seq_data.data = data->data;
+ seq->seq_data.size = data->size;
+ }
+ }
+ if (F_ISSET(env, ENV_LITTLEENDIAN))
+ seq->seq_rp = seq->seq_data.data;
+ SEQ_SWAP_IN(env, seq);
+ rp = seq->seq_rp;
+
+ if (F_ISSET(rp, DB_SEQ_WRAPPED))
+ goto overflow;
+
+ adjust = delta > seq->seq_cache_size ? delta : seq->seq_cache_size;
+
+ /*
+ * Check whether this operation will cause the sequence to wrap.
+ *
+ * The sequence minimum and maximum values can be INT64_MIN and
+ * INT64_MAX, so we need to do the test carefully to cope with
+ * arithmetic overflow. The first part of the test below checks
+ * whether we will hit the end of the 64-bit range. The second part
+ * checks whether we hit the end of the sequence.
+ */
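+	/*
+	 * Worked example (illustrative): for an increasing sequence with
+	 * seq_max == INT64_MAX, seq_value == INT64_MAX - 2 and adjust == 5,
+	 * seq_value + adjust - 1 wraps (on two's-complement hardware) to a
+	 * value less than seq_value, so the first comparison catches the
+	 * overflow that would make the seq_max comparison alone unreliable.
+	 */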
+again: if (F_ISSET(rp, DB_SEQ_INC)) {
+ if (rp->seq_value + adjust - 1 < rp->seq_value ||
+ rp->seq_value + adjust - 1 > rp->seq_max) {
+ /* Don't wrap just to fill the cache. */
+ if (adjust > delta) {
+ adjust = delta;
+ goto again;
+ }
+ if (F_ISSET(rp, DB_SEQ_WRAP))
+ rp->seq_value = rp->seq_min;
+ else {
+overflow: __db_errx(env, DB_STR("4011",
+ "Sequence overflow"));
+ ret = EINVAL;
+ goto err;
+ }
+ }
+ /* See if we are at the end of the 64 bit range. */
+ if (!F_ISSET(rp, DB_SEQ_WRAP) &&
+ rp->seq_value + adjust < rp->seq_value)
+ F_SET(rp, DB_SEQ_WRAPPED);
+ } else {
+ if ((rp->seq_value - adjust) + 1 > rp->seq_value ||
+ (rp->seq_value - adjust) + 1 < rp->seq_min) {
+ /* Don't wrap just to fill the cache. */
+ if (adjust > delta) {
+ adjust = delta;
+ goto again;
+ }
+ if (F_ISSET(rp, DB_SEQ_WRAP))
+ rp->seq_value = rp->seq_max;
+ else
+ goto overflow;
+ }
+ /* See if we are at the end of the 64 bit range. */
+ if (!F_ISSET(rp, DB_SEQ_WRAP) &&
+ rp->seq_value - adjust > rp->seq_value)
+ F_SET(rp, DB_SEQ_WRAPPED);
+ adjust = -adjust;
+ }
+
+ rp->seq_value += adjust;
+ SEQ_SWAP_OUT(env, seq);
+ ret = __db_put(dbp, ip, txn, &seq->seq_key, &seq->seq_data, 0);
+ rp->seq_value -= adjust;
+ if (ret != 0) {
+ __db_errx(env, DB_STR("4012",
+ "Sequence update failed"));
+ goto err;
+ }
+ seq->seq_last_value = rp->seq_value + adjust;
+ if (F_ISSET(rp, DB_SEQ_INC))
+ seq->seq_last_value--;
+ else
+ seq->seq_last_value++;
+
+err: if (need_mutex) {
+ if (data->data != NULL)
+ __os_ufree(env, data->data);
+ MUTEX_LOCK(env, seq->mtx_seq);
+ }
+ return (txn_local ? __db_txn_auto_resolve(
+ env, txn, LF_ISSET(DB_TXN_NOSYNC), ret) : ret);
+}
+
+static int
+__seq_get(seq, txn, delta, retp, flags)
+ DB_SEQUENCE *seq;
+ DB_TXN *txn;
+ int32_t delta;
+ db_seq_t *retp;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_SEQ_RECORD *rp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ret, t_ret;
+
+ dbp = seq->seq_dbp;
+ env = dbp->env;
+ rp = seq->seq_rp;
+ ret = 0;
+
+ STRIP_AUTO_COMMIT(flags);
+ SEQ_ILLEGAL_BEFORE_OPEN(seq, "DB_SEQUENCE->get");
+
+ if (delta < 0 || (delta == 0 && !LF_ISSET(DB_CURRENT))) {
+ __db_errx(env, "Sequence delta must be greater than 0");
+ return (EINVAL);
+ }
+
+ if (seq->seq_cache_size != 0 && txn != NULL) {
+ __db_errx(env,
+ "Sequence with non-zero cache may not specify transaction handle");
+ return (EINVAL);
+ }
+
+ ENV_ENTER(env, ip);
+
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check &&
+ (ret = __db_rep_enter(dbp, 1, 0, IS_REAL_TXN(txn))) != 0)
+ return (ret);
+
+ MUTEX_LOCK(env, seq->mtx_seq);
+
+ if (handle_check && IS_REP_CLIENT(env) &&
+ !F_ISSET(dbp, DB_AM_NOT_DURABLE)) {
+ ret = __db_rdonly(env, "DB_SEQUENCE->get");
+ goto err;
+ }
+
+ if (rp->seq_min + delta > rp->seq_max) {
+ __db_errx(env, DB_STR("4013", "Sequence overflow"));
+ ret = EINVAL;
+ goto err;
+ }
+
+ if (LF_ISSET(DB_CURRENT)) {
+ *retp = seq->seq_prev_value;
+ } else if (F_ISSET(rp, DB_SEQ_INC)) {
+ if (seq->seq_last_value + 1 - rp->seq_value < delta &&
+ (ret = __seq_update(seq, ip, txn, delta, flags)) != 0)
+ goto err;
+
+ rp = seq->seq_rp;
+ *retp = rp->seq_value;
+ seq->seq_prev_value = rp->seq_value;
+ rp->seq_value += delta;
+ } else {
+ if ((rp->seq_value - seq->seq_last_value) + 1 < delta &&
+ (ret = __seq_update(seq, ip, txn, delta, flags)) != 0)
+ goto err;
+
+ rp = seq->seq_rp;
+ *retp = rp->seq_value;
+ seq->seq_prev_value = rp->seq_value;
+ rp->seq_value -= delta;
+ }
+
+err: MUTEX_UNLOCK(env, seq->mtx_seq);
+
+ /* Release replication block. */
+ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __seq_get_db --
+ * Accessor for dbp passed into db_sequence_create call
+ *
+ */
+static int
+__seq_get_db(seq, dbpp)
+ DB_SEQUENCE *seq;
+ DB **dbpp;
+{
+ *dbpp = seq->seq_dbp;
+ return (0);
+}
+
+/*
+ * __seq_get_key --
+ * Accessor for key passed into DB_SEQUENCE->open call
+ *
+ */
+static int
+__seq_get_key(seq, key)
+ DB_SEQUENCE *seq;
+ DBT *key;
+{
+ SEQ_ILLEGAL_BEFORE_OPEN(seq, "DB_SEQUENCE->get_key");
+
+ if (F_ISSET(key, DB_DBT_USERCOPY))
+ return (__db_retcopy(seq->seq_dbp->env, key,
+ seq->seq_key.data, seq->seq_key.size, NULL, 0));
+
+ key->data = seq->seq_key.data;
+ key->size = key->ulen = seq->seq_key.size;
+ key->flags = seq->seq_key.flags;
+ return (0);
+}
+
+/*
+ * __seq_close_pp --
+ * Close a sequence pre/post processing
+ *
+ */
+static int
+__seq_close_pp(seq, flags)
+ DB_SEQUENCE *seq;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ int ret;
+
+ ENV_ENTER(seq->seq_dbp->env, ip);
+ ret = __seq_close(seq, flags);
+ ENV_LEAVE(seq->seq_dbp->env, ip);
+
+ return (ret);
+}
+
+/*
+ * __seq_close --
+ * Close a sequence
+ *
+ */
+static int
+__seq_close(seq, flags)
+ DB_SEQUENCE *seq;
+ u_int32_t flags;
+{
+ ENV *env;
+ int ret, t_ret;
+
+ ret = 0;
+ env = seq->seq_dbp->env;
+
+ if (flags != 0)
+ ret = __db_ferr(env, "DB_SEQUENCE->close", 0);
+
+ if ((t_ret = __mutex_free(env, &seq->mtx_seq)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (seq->seq_key.data != NULL)
+ __os_free(env, seq->seq_key.data);
+ if (seq->seq_data.data != NULL &&
+ seq->seq_data.data != &seq->seq_record)
+ __os_ufree(env, seq->seq_data.data);
+ seq->seq_key.data = NULL;
+
+ memset(seq, CLEAR_BYTE, sizeof(*seq));
+ __os_free(env, seq);
+
+ return (ret);
+}
+
+/*
+ * __seq_remove --
+ * Remove a sequence from the database.
+ */
+static int
+__seq_remove(seq, txn, flags)
+ DB_SEQUENCE *seq;
+ DB_TXN *txn;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ret, t_ret, txn_local;
+
+ dbp = seq->seq_dbp;
+ env = dbp->env;
+ txn_local = 0;
+
+ SEQ_ILLEGAL_BEFORE_OPEN(seq, "DB_SEQUENCE->remove");
+
+ /*
+ * Flags can only be 0, unless the database has DB_AUTO_COMMIT enabled.
+ * Then DB_TXN_NOSYNC is allowed.
+ */
+ if (flags != 0 &&
+ (flags != DB_TXN_NOSYNC || !IS_DB_AUTO_COMMIT(dbp, txn)))
+ return (__db_ferr(env, "DB_SEQUENCE->remove illegal flag", 0));
+
+ ENV_ENTER(env, ip);
+
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check &&
+ (ret = __db_rep_enter(dbp, 1, 0, IS_REAL_TXN(txn))) != 0) {
+ handle_check = 0;
+ goto err;
+ }
+
+ /*
+ * Create a local transaction as necessary, check for consistent
+ * transaction usage, and, if we have no transaction but do have
+ * locking on, acquire a locker id for the handle lock acquisition.
+ */
+	if (IS_DB_AUTO_COMMIT(dbp, txn)) {
+		/*
+		 * Use "goto err" rather than returning directly, so that
+		 * ENV_LEAVE and the replication-block release still run.
+		 */
+		if ((ret = __txn_begin(env, ip, NULL, &txn, flags)) != 0)
+			goto err;
+		txn_local = 1;
+	}
+
+	/* Check for consistent transaction usage. */
+	if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 0)) != 0)
+		goto err;
+
+	ret = __db_del(dbp, ip, txn, &seq->seq_key, 0);
+
+	if ((t_ret = __seq_close(seq, 0)) != 0 && ret == 0)
+		ret = t_ret;
+
+err:	if (txn_local && (t_ret =
+	    __db_txn_auto_resolve(env, txn, 0, ret)) != 0 && ret == 0)
+		ret = t_ret;
+
+	/* Release replication block. */
+	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+		ret = t_ret;
+
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __seq_chk_cachesize --
+ * Validate the cache size vs. the range.
+ */
+static int
+__seq_chk_cachesize(env, cachesize, max, min)
+ ENV *env;
+ int32_t cachesize;
+ db_seq_t max, min;
+{
+ /*
+ * It's an error to specify caches larger than the sequence range.
+ *
+	 * The min and max of the range can be either positive or negative,
+	 * but the difference will fit in an unsigned variable of the same
+	 * type.
+ * Assume a 2's complement machine, and simply subtract.
+ */
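+	/*
+	 * Worked example (illustrative): with min == -10 and max == 10,
+	 * (u_int64_t)max - (u_int64_t)min == 20, the width of the range,
+	 * even though the subtraction mixes signs.
+	 */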
+ if ((u_int32_t)cachesize > (u_int64_t)max - (u_int64_t)min) {
+ __db_errx(env, DB_STR("4014",
+ "Number of items to be cached is larger than the sequence range"));
+ return (EINVAL);
+ }
+ return (0);
+}
+
+#else /* !HAVE_64BIT_TYPES */
+
+int
+db_sequence_create(seqp, dbp, flags)
+ DB_SEQUENCE **seqp;
+ DB *dbp;
+ u_int32_t flags;
+{
+ COMPQUIET(seqp, NULL);
+ COMPQUIET(flags, 0);
+ __db_errx(dbp->env, DB_STR("4015",
+ "library build did not include support for sequences"));
+ return (DB_OPNOTSUP);
+}
+#endif /* HAVE_64BIT_TYPES */
diff --git a/src/txn/txn.c b/src/txn/txn.c
new file mode 100644
index 00000000..81225e5c
--- /dev/null
+++ b/src/txn/txn.c
@@ -0,0 +1,2169 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1995, 1996
+ * The President and Fellows of Harvard University. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Margo Seltzer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/hmac.h"
+#include "dbinc/db_page.h"
+#include "dbinc/hash.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+#define LOG_FLAGS(txn) \
+ (DB_LOG_COMMIT | (F_ISSET(txn, TXN_SYNC) ? \
+ DB_FLUSH : (F_ISSET(txn, TXN_WRITE_NOSYNC) ? \
+ DB_LOG_WRNOSYNC : 0)))
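+/*
+ * Illustratively: a TXN_SYNC transaction commits with
+ * DB_LOG_COMMIT | DB_FLUSH, a TXN_WRITE_NOSYNC transaction with
+ * DB_LOG_COMMIT | DB_LOG_WRNOSYNC, and a TXN_NOSYNC transaction with
+ * DB_LOG_COMMIT alone.
+ */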
+
+/*
+ * __txn_isvalid enumerated types. We cannot simply use the transaction
+ * statuses, because different statuses need to be handled differently
+ * depending on the caller.
+ */
+typedef enum {
+ TXN_OP_ABORT,
+ TXN_OP_COMMIT,
+ TXN_OP_DISCARD,
+ TXN_OP_PREPARE
+} txnop_t;
+
+static int __txn_abort_pp __P((DB_TXN *));
+static int __txn_applied __P((ENV *,
+ DB_THREAD_INFO *, DB_COMMIT_INFO *, db_timeout_t));
+static void __txn_build_token __P((DB_TXN *, DB_LSN *));
+static int __txn_begin_int __P((DB_TXN *));
+static int __txn_close_cursors __P((DB_TXN *));
+static int __txn_commit_pp __P((DB_TXN *, u_int32_t));
+static int __txn_discard __P((DB_TXN *, u_int32_t));
+static int __txn_dispatch_undo
+ __P((ENV *, DB_TXN *, DBT *, DB_LSN *, DB_TXNHEAD *));
+static int __txn_end __P((DB_TXN *, int));
+static int __txn_isvalid __P((const DB_TXN *, txnop_t));
+static int __txn_undo __P((DB_TXN *));
+static int __txn_set_commit_token __P((DB_TXN *txn, DB_TXN_TOKEN *));
+static void __txn_set_txn_lsnp __P((DB_TXN *, DB_LSN **, DB_LSN **));
+
+#define TxnAlloc "Unable to allocate a transaction handle"
+
+/*
+ * __txn_begin_pp --
+ * ENV->txn_begin pre/post processing.
+ *
+ * PUBLIC: int __txn_begin_pp __P((DB_ENV *, DB_TXN *, DB_TXN **, u_int32_t));
+ */
+int
+__txn_begin_pp(dbenv, parent, txnpp, flags)
+ DB_ENV *dbenv;
+ DB_TXN *parent, **txnpp;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int rep_check, ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env, env->tx_handle, "txn_begin", DB_INIT_TXN);
+
+ if ((ret = __db_fchk(env,
+ "txn_begin", flags,
+	    DB_IGNORE_LEASE | DB_READ_COMMITTED | DB_READ_UNCOMMITTED |
+ DB_TXN_FAMILY | DB_TXN_NOSYNC | DB_TXN_SNAPSHOT | DB_TXN_SYNC |
+ DB_TXN_WAIT | DB_TXN_WRITE_NOSYNC | DB_TXN_NOWAIT |
+ DB_TXN_BULK)) != 0)
+ return (ret);
+ if ((ret = __db_fcchk(env, "txn_begin", flags,
+ DB_TXN_WRITE_NOSYNC | DB_TXN_NOSYNC, DB_TXN_SYNC)) != 0)
+ return (ret);
+ if ((ret = __db_fcchk(env, "txn_begin",
+ flags, DB_TXN_WRITE_NOSYNC, DB_TXN_NOSYNC)) != 0)
+ return (ret);
+ if (parent != NULL && LF_ISSET(DB_TXN_FAMILY)) {
+ __db_errx(env, DB_STR("4521",
+ "Family transactions cannot have parents"));
+ return (EINVAL);
+ } else if (IS_REAL_TXN(parent) &&
+ !F_ISSET(parent, TXN_SNAPSHOT) && LF_ISSET(DB_TXN_SNAPSHOT)) {
+ __db_errx(env, DB_STR("4522",
+ "Child transaction snapshot setting must match parent"));
+ return (EINVAL);
+ }
+
+ ENV_ENTER(env, ip);
+
+ /* Replication accounts for top-level transactions. */
+ rep_check = IS_ENV_REPLICATED(env) &&
+ !IS_REAL_TXN(parent) && !LF_ISSET(DB_TXN_FAMILY);
+
+ if (rep_check && (ret = __op_rep_enter(env, 0, 1)) != 0)
+ goto err;
+
+ ret = __txn_begin(env, ip, parent, txnpp, flags);
+
+ /*
+ * We only decrement the count if the operation fails.
+ * Otherwise the count will be decremented when the
+ * txn is resolved by txn_commit, txn_abort, etc.
+ */
+ if (ret != 0 && rep_check)
+ (void)__op_rep_exit(env);
+
+err: ENV_LEAVE(env, ip);
+ return (ret);
+}
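+/*
+ * Illustrative use through the public DB_ENV handle (a sketch; error
+ * handling omitted):
+ *
+ *	DB_TXN *txn;
+ *
+ *	(void)dbenv->txn_begin(dbenv, NULL, &txn, DB_TXN_NOSYNC);
+ *	... transactional operations ...
+ *	(void)txn->commit(txn, 0);
+ */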
+
+/*
+ * __txn_begin --
+ * ENV->txn_begin.
+ *
+ * This is a wrapper to the actual begin process. We allocate a DB_TXN
+ * structure for the caller and then call into __txn_begin_int code.
+ *
+ * Internally, we use TXN_DETAIL structures, but the DB_TXN structure
+ * provides access to the transaction ID and the offset in the transaction
+ * region of the TXN_DETAIL structure.
+ *
+ * PUBLIC: int __txn_begin __P((ENV *,
+ * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DB_TXN **, u_int32_t));
+ */
+int
+__txn_begin(env, ip, parent, txnpp, flags)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ DB_TXN *parent, **txnpp;
+ u_int32_t flags;
+{
+ DB_ENV *dbenv;
+ DB_LOCKREGION *region;
+ DB_TXN *txn;
+ TXN_DETAIL *ptd, *td;
+ int ret;
+
+ if (F_ISSET(env, ENV_FORCE_TXN_BULK))
+ flags |= DB_TXN_BULK;
+
+ *txnpp = NULL;
+ if ((ret = __os_calloc(env, 1, sizeof(DB_TXN), &txn)) != 0) {
+ __db_errx(env, TxnAlloc);
+ return (ret);
+ }
+
+ dbenv = env->dbenv;
+ txn->mgrp = env->tx_handle;
+ txn->parent = parent;
+ if (parent != NULL && F_ISSET(parent, TXN_FAMILY))
+ parent = NULL;
+ TAILQ_INIT(&txn->kids);
+ TAILQ_INIT(&txn->events);
+ STAILQ_INIT(&txn->logs);
+ TAILQ_INIT(&txn->my_cursors);
+ TAILQ_INIT(&txn->femfs);
+ txn->flags = TXN_MALLOC;
+ txn->thread_info =
+ ip != NULL ? ip : (parent != NULL ? parent->thread_info : NULL);
+
+ /*
+ * Set the sync mode for commit. Any local bits override those
+ * in the environment. SYNC is the default.
+ */
+ if (LF_ISSET(DB_TXN_SYNC))
+ F_SET(txn, TXN_SYNC);
+ else if (LF_ISSET(DB_TXN_NOSYNC))
+ F_SET(txn, TXN_NOSYNC);
+ else if (LF_ISSET(DB_TXN_WRITE_NOSYNC))
+ F_SET(txn, TXN_WRITE_NOSYNC);
+ else if (F_ISSET(dbenv, DB_ENV_TXN_NOSYNC))
+ F_SET(txn, TXN_NOSYNC);
+ else if (F_ISSET(dbenv, DB_ENV_TXN_WRITE_NOSYNC))
+ F_SET(txn, TXN_WRITE_NOSYNC);
+ else
+ F_SET(txn, TXN_SYNC);
+
+ if (LF_ISSET(DB_TXN_NOWAIT) ||
+ (F_ISSET(dbenv, DB_ENV_TXN_NOWAIT) && !LF_ISSET(DB_TXN_WAIT)))
+ F_SET(txn, TXN_NOWAIT);
+ if (LF_ISSET(DB_READ_COMMITTED))
+ F_SET(txn, TXN_READ_COMMITTED);
+ if (LF_ISSET(DB_READ_UNCOMMITTED))
+ F_SET(txn, TXN_READ_UNCOMMITTED);
+ if (LF_ISSET(DB_TXN_FAMILY))
+ F_SET(txn, TXN_FAMILY | TXN_INFAMILY | TXN_READONLY);
+ if (LF_ISSET(DB_TXN_SNAPSHOT) || F_ISSET(dbenv, DB_ENV_TXN_SNAPSHOT) ||
+ (parent != NULL && F_ISSET(parent, TXN_SNAPSHOT)))
+ F_SET(txn, TXN_SNAPSHOT);
+ if (LF_ISSET(DB_IGNORE_LEASE))
+ F_SET(txn, TXN_IGNORE_LEASE);
+
+ /*
+ * We set TXN_BULK only for the outermost transaction. This
+ * is a temporary limitation; in the future we will allow it
+ * for nested transactions as well. See #17669 for details.
+ *
+ * Also, ignore requests for DB_TXN_BULK if replication is enabled.
+ */
+ if (LF_ISSET(DB_TXN_BULK) && parent == NULL && !REP_ON(txn->mgrp->env))
+ F_SET(txn, TXN_BULK);
+
+ if ((ret = __txn_begin_int(txn)) != 0)
+ goto err;
+ td = txn->td;
+
+ if (parent != NULL) {
+ ptd = parent->td;
+ TAILQ_INSERT_HEAD(&parent->kids, txn, klinks);
+ SH_TAILQ_INSERT_HEAD(&ptd->kids, td, klinks, __txn_detail);
+ }
+
+ if (LOCKING_ON(env)) {
+ region = env->lk_handle->reginfo.primary;
+ if (parent != NULL) {
+ ret = __lock_inherit_timeout(env,
+ parent->locker, txn->locker);
+ /* No parent locker set yet. */
+ if (ret == EINVAL) {
+ parent = NULL;
+ ret = 0;
+ }
+ if (ret != 0)
+ goto err;
+ }
+
+ /*
+ * Parent is NULL if we have no parent
+ * or it has no timeouts set.
+ */
+ if (parent == NULL && region->tx_timeout != 0)
+ if ((ret = __lock_set_timeout(env, txn->locker,
+ region->tx_timeout, DB_SET_TXN_TIMEOUT)) != 0)
+ goto err;
+ }
+
+ *txnpp = txn;
+ PERFMON2(env, txn, begin, txn->txnid, flags);
+ return (0);
+
+err:
+ __os_free(env, txn);
+ return (ret);
+}
+
+/*
+ * __txn_recycle_id --
+ *	Find a range of usable transaction ids.
+ *
+ * PUBLIC: int __txn_recycle_id __P((ENV *, int));
+ */
+int
+__txn_recycle_id(env, locked)
+ ENV *env;
+ int locked;
+{
+ DB_LSN null_lsn;
+ DB_TXNMGR *mgr;
+ DB_TXNREGION *region;
+ TXN_DETAIL *td;
+ u_int32_t *ids;
+ int nids, ret;
+
+ mgr = env->tx_handle;
+ region = mgr->reginfo.primary;
+
+ if ((ret = __os_malloc(env,
+ sizeof(u_int32_t) * region->curtxns, &ids)) != 0) {
+ __db_errx(env, DB_STR("4523",
+ "Unable to allocate transaction recycle buffer"));
+ return (ret);
+ }
+ nids = 0;
+ SH_TAILQ_FOREACH(td, &region->active_txn, links, __txn_detail)
+ ids[nids++] = td->txnid;
+ region->last_txnid = TXN_MINIMUM - 1;
+ region->cur_maxid = TXN_MAXIMUM;
+ if (nids != 0)
+ __db_idspace(ids, nids,
+ &region->last_txnid, &region->cur_maxid);
+ __os_free(env, ids);
+
+ /*
+ * Check LOGGING_ON rather than DBENV_LOGGING as we want to emit this
+ * record at the end of recovery.
+ */
+ if (LOGGING_ON(env)) {
+ if (locked)
+ TXN_SYSTEM_UNLOCK(env);
+ ret = __txn_recycle_log(env, NULL, &null_lsn,
+ 0, region->last_txnid + 1, region->cur_maxid);
+		/*
+		 * Keep it simple for the caller: if the log write failed,
+		 * re-acquire the lock so we return with it still held.
+		 */
+ if (locked && ret != 0)
+ TXN_SYSTEM_LOCK(env);
+ }
+
+ return (ret);
+}
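+
+/*
+ * Worked example, assumed values for illustration only: suppose the only
+ * active transactions hold ids 0x90000001 and 0x90000002.  The call to
+ * __db_idspace above then resets last_txnid/cur_maxid to bound the
+ * largest run of unused ids, e.g. last_txnid = 0x90000002 and
+ * cur_maxid = 0x90000000 (wrapping through TXN_MAXIMUM), so ids handed
+ * out after the recycle cannot collide with ids that are still in use.
+ */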
+
+/*
+ * __txn_begin_int --
+ * Normal DB version of txn_begin.
+ */
+static int
+__txn_begin_int(txn)
+ DB_TXN *txn;
+{
+ DB_ENV *dbenv;
+ DB_TXNMGR *mgr;
+ DB_TXNREGION *region;
+ ENV *env;
+ TXN_DETAIL *td;
+ u_int32_t id;
+ int inserted, ret;
+
+ mgr = txn->mgrp;
+ env = mgr->env;
+ dbenv = env->dbenv;
+ region = mgr->reginfo.primary;
+ td = NULL;
+ inserted = 0;
+
+ TXN_SYSTEM_LOCK(env);
+ if (!F_ISSET(txn, TXN_COMPENSATE) && F_ISSET(region, TXN_IN_RECOVERY)) {
+ __db_errx(env, DB_STR("4524",
+ "operation not permitted during recovery"));
+ ret = EINVAL;
+ goto err;
+ }
+
+ /*
+ * Allocate a new transaction id. Our current valid range can span
+ * the maximum valid value, so check for it and wrap manually.
+ */
+ if (region->last_txnid == TXN_MAXIMUM &&
+ region->cur_maxid != TXN_MAXIMUM)
+ region->last_txnid = TXN_MINIMUM - 1;
+
+ /* Allocate a new transaction detail structure. */
+ if ((ret =
+ __env_alloc(&mgr->reginfo, sizeof(TXN_DETAIL), &td)) != 0) {
+ __db_errx(env, DB_STR("4525",
+ "Unable to allocate memory for transaction detail"));
+ goto err;
+ }
+
+ id = ++region->last_txnid;
+
+#ifdef HAVE_STATISTICS
+ STAT_INC(env, txn, nbegins, region->stat.st_nbegins, id);
+ STAT_INC(env, txn, nactive, region->stat.st_nactive, id);
+ if (region->stat.st_nactive > region->stat.st_maxnactive)
+ STAT_SET(env, txn, maxnactive,
+ region->stat.st_maxnactive, region->stat.st_nactive, id);
+#endif
+
+ td->txnid = id;
+ dbenv->thread_id(dbenv, &td->pid, &td->tid);
+
+ ZERO_LSN(td->last_lsn);
+ ZERO_LSN(td->begin_lsn);
+ SH_TAILQ_INIT(&td->kids);
+ if (txn->parent != NULL && !F_ISSET(txn->parent, TXN_FAMILY))
+ td->parent = R_OFFSET(&mgr->reginfo, txn->parent->td);
+ else
+ td->parent = INVALID_ROFF;
+ td->name = INVALID_ROFF;
+ MAX_LSN(td->read_lsn);
+ MAX_LSN(td->visible_lsn);
+ td->mvcc_ref = 0;
+ td->mvcc_mtx = MUTEX_INVALID;
+ td->status = TXN_RUNNING;
+ td->flags = F_ISSET(txn, TXN_NOWAIT) ? TXN_DTL_NOWAIT : 0;
+ td->nlog_dbs = 0;
+ td->nlog_slots = TXN_NSLOTS;
+ td->log_dbs = R_OFFSET(&mgr->reginfo, td->slots);
+
+ /* XA specific fields. */
+ td->xa_ref = 1;
+ td->xa_br_status = TXN_XA_IDLE;
+
+ /* Place transaction on active transaction list. */
+ SH_TAILQ_INSERT_HEAD(&region->active_txn, td, links, __txn_detail);
+ region->curtxns++;
+
+ /* Increment bulk transaction counter while holding transaction lock. */
+ if (F_ISSET(txn, TXN_BULK))
+ ((DB_TXNREGION *)env->tx_handle->reginfo.primary)->n_bulk_txn++;
+
+ inserted = 1;
+
+ if (region->last_txnid == region->cur_maxid) {
+ if ((ret = __txn_recycle_id(env, 1)) != 0)
+ goto err;
+ } else
+ TXN_SYSTEM_UNLOCK(env);
+
+ txn->txnid = id;
+ txn->td = td;
+
+ /* Allocate a locker for this txn. */
+ if (LOCKING_ON(env) && (ret =
+ __lock_getlocker(env->lk_handle, id, 1, &txn->locker)) != 0)
+ goto err;
+
+ txn->abort = __txn_abort_pp;
+ txn->commit = __txn_commit_pp;
+ txn->discard = __txn_discard;
+ txn->get_name = __txn_get_name;
+ txn->get_priority = __txn_get_priority;
+ txn->id = __txn_id;
+ txn->prepare = __txn_prepare;
+ txn->set_commit_token = __txn_set_commit_token;
+ txn->set_txn_lsnp = __txn_set_txn_lsnp;
+ txn->set_name = __txn_set_name;
+ txn->set_priority = __txn_set_priority;
+ txn->set_timeout = __txn_set_timeout;
+
+	/* We can't call __txn_set_priority until txn->td is set. */
+	if (LOCKING_ON(env)) {
+		if ((ret = __txn_set_priority(txn,
+		    txn->parent == NULL ?
+		    TXN_PRIORITY_DEFAULT : txn->parent->locker->priority)) != 0)
+			goto err;
+	} else
+		td->priority = 0;
+
+ /*
+ * If this is a transaction family, we must link the child to the
+ * maximal grandparent in the lock table for deadlock detection.
+ */
+ if (txn->parent != NULL) {
+ if (LOCKING_ON(env) && (ret = __lock_addfamilylocker(env,
+ txn->parent->txnid, txn->txnid,
+ F_ISSET(txn->parent, TXN_FAMILY))) != 0)
+ goto err;
+
+ /*
+		 * If the parent is only used to establish compatibility, do
+ * not reference it again.
+ */
+ if (F_ISSET(txn->parent, TXN_FAMILY)) {
+ txn->parent = NULL;
+ F_SET(txn, TXN_INFAMILY);
+ }
+ }
+
+ if (F_ISSET(txn, TXN_MALLOC)) {
+ MUTEX_LOCK(env, mgr->mutex);
+ TAILQ_INSERT_TAIL(&mgr->txn_chain, txn, links);
+ MUTEX_UNLOCK(env, mgr->mutex);
+ }
+
+ return (0);
+
+err: if (inserted) {
+ TXN_SYSTEM_LOCK(env);
+ SH_TAILQ_REMOVE(&region->active_txn, td, links, __txn_detail);
+ region->curtxns--;
+ if (F_ISSET(txn, TXN_BULK))
+ ((DB_TXNREGION *)
+ env->tx_handle->reginfo.primary)->n_bulk_txn--;
+ }
+ if (td != NULL)
+ __env_alloc_free(&mgr->reginfo, td);
+ TXN_SYSTEM_UNLOCK(env);
+ return (ret);
+}
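+
+/*
+ * Illustrative sketch, not part of this file: nested transactions as set
+ * up by __txn_begin/__txn_begin_int.  A child is created by passing the
+ * parent handle, and an unresolved child is resolved by the parent's
+ * commit or abort:
+ *
+ *	DB_TXN *parent, *child;
+ *
+ *	if ((ret = dbenv->txn_begin(dbenv, NULL, &parent, 0)) != 0)
+ *		return (ret);
+ *	if ((ret = dbenv->txn_begin(dbenv, parent, &child, 0)) != 0)
+ *		return (ret);
+ *	(child's updates here)
+ *	ret = parent->commit(parent, 0);	(resolves the child too)
+ */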
+
+/*
+ * __txn_continue
+ * Fill in the fields of the local transaction structure given
+ * the detail transaction structure. Optionally link transactions
+ * to transaction manager list.
+ *
+ * PUBLIC: int __txn_continue __P((ENV *,
+ * PUBLIC: DB_TXN *, TXN_DETAIL *, DB_THREAD_INFO *, int));
+ */
+int
+__txn_continue(env, txn, td, ip, add_to_list)
+ ENV *env;
+ DB_TXN *txn;
+ TXN_DETAIL *td;
+ DB_THREAD_INFO *ip;
+ int add_to_list;
+{
+ DB_LOCKREGION *region;
+ DB_TXNMGR *mgr;
+ int ret;
+
+ ret = 0;
+
+ /*
+ * This code follows the order of the structure definition so it
+ * is relatively easy to make sure that we are setting everything.
+ */
+ mgr = txn->mgrp = env->tx_handle;
+ txn->parent = NULL;
+ txn->thread_info = ip;
+ txn->txnid = td->txnid;
+ txn->name = NULL;
+ txn->td = td;
+ td->xa_ref++;
+
+ /* This never seems to be used: txn->expire */
+ txn->txn_list = NULL;
+
+ TAILQ_INIT(&txn->kids);
+ TAILQ_INIT(&txn->events);
+ STAILQ_INIT(&txn->logs);
+
+ /*
+ * These fields should never persist across different processes as we
+ * require that cursors be opened/closed within the same service routine
+ * and we disallow file level operations in XA transactions.
+ */
+ TAILQ_INIT(&txn->my_cursors);
+ TAILQ_INIT(&txn->femfs);
+
+ /* Put the transaction onto the transaction manager's list. */
+ if (add_to_list) {
+ MUTEX_LOCK(env, mgr->mutex);
+ TAILQ_INSERT_TAIL(&mgr->txn_chain, txn, links);
+ MUTEX_UNLOCK(env, mgr->mutex);
+ }
+
+ txn->token_buffer = 0;
+ txn->cursors = 0;
+
+ txn->abort = __txn_abort_pp;
+ txn->commit = __txn_commit_pp;
+ txn->discard = __txn_discard;
+ txn->get_name = __txn_get_name;
+ txn->get_priority = __txn_get_priority;
+ txn->id = __txn_id;
+ txn->prepare = __txn_prepare;
+ txn->set_commit_token = __txn_set_commit_token;
+ txn->set_name = __txn_set_name;
+ txn->set_priority = __txn_set_priority;
+ txn->set_timeout = __txn_set_timeout;
+ txn->set_txn_lsnp = __txn_set_txn_lsnp;
+
+ /* XXX Do we need to explicitly set a SYNC flag here? */
+ txn->flags = TXN_MALLOC |
+ (F_ISSET(td, TXN_DTL_NOWAIT) ? TXN_NOWAIT : 0);
+ txn->xa_thr_status = TXN_XA_THREAD_NOTA;
+
+ /*
+ * If this is a restored transaction, we need to propagate that fact
+ * to the process-local structure. However, if it's not a restored
+ * transaction, we need to make sure that we have a locker associated
+ * with this transaction.
+ */
+ if (F_ISSET(td, TXN_DTL_RESTORED))
+ F_SET(txn, TXN_RESTORED);
+ else
+ if ((ret = __lock_getlocker(env->lk_handle,
+ txn->txnid, 0, &txn->locker)) == 0)
+ ret = __txn_set_priority(txn, td->priority);
+
+ if (LOCKING_ON(env)) {
+ region = env->lk_handle->reginfo.primary;
+ if (region->tx_timeout != 0 &&
+ (ret = __lock_set_timeout(env, txn->locker,
+ region->tx_timeout, DB_SET_TXN_TIMEOUT)) != 0)
+ return (ret);
+ txn->lock_timeout = region->tx_timeout;
+ }
+
+ return (ret);
+}
+
+/*
+ * __txn_commit_pp --
+ * Interface routine to TXN->commit.
+ */
+static int
+__txn_commit_pp(txn, flags)
+ DB_TXN *txn;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int rep_check, ret, t_ret;
+
+ env = txn->mgrp->env;
+ rep_check = IS_ENV_REPLICATED(env) &&
+ txn->parent == NULL && IS_REAL_TXN(txn);
+
+ ENV_ENTER(env, ip);
+ ret = __txn_commit(txn, flags);
+ if (rep_check && (t_ret = __op_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __txn_commit --
+ * Commit a transaction.
+ *
+ * PUBLIC: int __txn_commit __P((DB_TXN *, u_int32_t));
+ */
+int
+__txn_commit(txn, flags)
+ DB_TXN *txn;
+ u_int32_t flags;
+{
+ DBT list_dbt;
+ DB_LOCKREQ request;
+ DB_TXN *kid;
+ ENV *env;
+ REGENV *renv;
+ REGINFO *infop;
+ TXN_DETAIL *td;
+ DB_LSN token_lsn;
+ u_int32_t id;
+ int ret, t_ret;
+
+ env = txn->mgrp->env;
+ td = txn->td;
+ PERFMON2(env, txn, commit, txn->txnid, flags);
+
+ DB_ASSERT(env, txn->xa_thr_status == TXN_XA_THREAD_NOTA ||
+ td->xa_ref == 1);
+ /*
+ * A common mistake in Berkeley DB programs is to mis-handle deadlock
+ * return. If the transaction deadlocked, they want abort, not commit.
+ */
+ if (F_ISSET(txn, TXN_DEADLOCK)) {
+ ret = __db_txn_deadlock_err(env, txn);
+ goto err;
+ }
+
+ /* Close registered cursors before committing. */
+ if ((ret = __txn_close_cursors(txn)) != 0)
+ goto err;
+
+ if ((ret = __txn_isvalid(txn, TXN_OP_COMMIT)) != 0)
+ return (ret);
+
+ /*
+ * Check for master leases at the beginning. If we are a master and
+ * cannot have valid leases now, we error and abort this txn. There
+ * should always be a perm record in the log because the master updates
+ * the LSN history system database in rep_start() (with IGNORE_LEASE
+ * set).
+ *
+ * Only check leases if this txn writes to the log file
+ * (i.e. td->last_lsn).
+ */
+ if (txn->parent == NULL && IS_REP_MASTER(env) &&
+ IS_USING_LEASES(env) && !F_ISSET(txn, TXN_IGNORE_LEASE) &&
+ !IS_ZERO_LSN(td->last_lsn) &&
+ (ret = __rep_lease_check(env, 1)) != 0) {
+ DB_ASSERT(env, ret != DB_NOTFOUND);
+ goto err;
+ }
+
+ infop = env->reginfo;
+ renv = infop->primary;
+ /*
+ * No mutex is needed as envid is read-only once it is set.
+ */
+ id = renv->envid;
+
+ /*
+ * We clear flags that are incorrect, ignoring any flag errors, and
+ * default to synchronous operations. By definition, transaction
+ * handles are dead when we return, and this error should never
+	 * happen, but we don't want to fail in the field because the app is
+ * specifying the wrong flag for some reason.
+ */
+ if (__db_fchk(env, "DB_TXN->commit", flags,
+ DB_TXN_NOSYNC | DB_TXN_SYNC | DB_TXN_WRITE_NOSYNC) != 0)
+ flags = DB_TXN_SYNC;
+ if (__db_fcchk(env, "DB_TXN->commit", flags,
+ DB_TXN_SYNC, DB_TXN_NOSYNC | DB_TXN_WRITE_NOSYNC) != 0)
+ flags = DB_TXN_SYNC;
+
+ if (LF_ISSET(DB_TXN_WRITE_NOSYNC)) {
+ F_CLR(txn, TXN_SYNC_FLAGS);
+ F_SET(txn, TXN_WRITE_NOSYNC);
+ }
+ if (LF_ISSET(DB_TXN_NOSYNC)) {
+ F_CLR(txn, TXN_SYNC_FLAGS);
+ F_SET(txn, TXN_NOSYNC);
+ }
+ if (LF_ISSET(DB_TXN_SYNC)) {
+ F_CLR(txn, TXN_SYNC_FLAGS);
+ F_SET(txn, TXN_SYNC);
+ }
+
+ DB_ASSERT(env, F_ISSET(txn, TXN_SYNC_FLAGS));
+
+ /*
+ * Commit any unresolved children. If anyone fails to commit,
+ * then try to abort the rest of the kids and then abort the parent.
+ * Abort should never fail; if it does, we bail out immediately.
+ */
+ while ((kid = TAILQ_FIRST(&txn->kids)) != NULL)
+ if ((ret = __txn_commit(kid, flags)) != 0)
+ while ((kid = TAILQ_FIRST(&txn->kids)) != NULL)
+ if ((t_ret = __txn_abort(kid)) != 0)
+ return (__env_panic(env, t_ret));
+
+ /*
+ * If there are any log records, write a log record and sync the log,
+ * else do no log writes. If the commit is for a child transaction,
+ * we do not need to commit the child synchronously since it may still
+ * abort (if its parent aborts), and otherwise its parent or ultimate
+ * ancestor will write synchronously.
+ */
+ ZERO_LSN(token_lsn);
+ if (DBENV_LOGGING(env) && (!IS_ZERO_LSN(td->last_lsn) ||
+ STAILQ_FIRST(&txn->logs) != NULL)) {
+ if (txn->parent == NULL) {
+ /*
+ * We are about to free all the read locks for this
+ * transaction below. Some of those locks might be
+ * handle locks which should not be freed, because
+ * they will be freed when the handle is closed. Check
+ * the events and preprocess any trades now so we don't
+ * release the locks below.
+ */
+ if ((ret =
+ __txn_doevents(env, txn, TXN_COMMIT, 1)) != 0)
+ goto err;
+
+ memset(&request, 0, sizeof(request));
+ if (LOCKING_ON(env)) {
+ request.op = DB_LOCK_PUT_READ;
+ if (IS_REP_MASTER(env) &&
+ !IS_ZERO_LSN(td->last_lsn)) {
+ memset(&list_dbt, 0, sizeof(list_dbt));
+ request.obj = &list_dbt;
+ }
+ ret = __lock_vec(env,
+ txn->locker, 0, &request, 1, NULL);
+ }
+
+ if (ret == 0 && !IS_ZERO_LSN(td->last_lsn)) {
+ ret = __txn_flush_fe_files(txn);
+ if (ret == 0)
+ ret = __txn_regop_log(env, txn,
+ &td->visible_lsn, LOG_FLAGS(txn),
+ TXN_COMMIT,
+ (int32_t)time(NULL), id,
+ request.obj);
+ if (ret == 0)
+ token_lsn = td->last_lsn =
+ td->visible_lsn;
+#ifdef DIAGNOSTIC
+ if (ret == 0) {
+ DB_LSN s_lsn;
+
+ DB_ASSERT(env, __log_current_lsn_int(
+ env, &s_lsn, NULL, NULL) == 0);
+ DB_ASSERT(env, LOG_COMPARE(
+ &td->visible_lsn, &s_lsn) <= 0);
+ COMPQUIET(s_lsn.file, 0);
+ }
+#endif
+ }
+
+ if (request.obj != NULL && request.obj->data != NULL)
+ __os_free(env, request.obj->data);
+ if (ret != 0)
+ goto err;
+ } else {
+ /* Log the commit in the parent! */
+ if (!IS_ZERO_LSN(td->last_lsn) &&
+ (ret = __txn_child_log(env, txn->parent,
+ &((TXN_DETAIL *)txn->parent->td)->last_lsn,
+ 0, txn->txnid, &td->last_lsn)) != 0) {
+ goto err;
+ }
+ if (STAILQ_FIRST(&txn->logs) != NULL) {
+ /*
+ * Put the child first so we back it out first.
+ * All records are undone in reverse order.
+ */
+ STAILQ_CONCAT(&txn->logs, &txn->parent->logs);
+ txn->parent->logs = txn->logs;
+ STAILQ_INIT(&txn->logs);
+ }
+
+ F_SET(txn->parent, TXN_CHILDCOMMIT);
+ }
+ }
+ if (txn->token_buffer != NULL && ret == 0 && DBENV_LOGGING(env))
+ __txn_build_token(txn, &token_lsn);
+
+ if (txn->txn_list != NULL) {
+ __db_txnlist_end(env, txn->txn_list);
+ txn->txn_list = NULL;
+ }
+
+ if (ret != 0)
+ goto err;
+
+ /*
+ * Check for master leases at the end of only a normal commit.
+ * If we're a child, that is not a perm record. If we are a
+ * master and cannot get valid leases now, something happened
+ * during the commit. The only thing to do is panic.
+ *
+ * Only check leases if this txn writes to the log file
+ * (i.e. td->last_lsn).
+ */
+ if (txn->parent == NULL && IS_REP_MASTER(env) &&
+ IS_USING_LEASES(env) && !F_ISSET(txn, TXN_IGNORE_LEASE) &&
+ !IS_ZERO_LSN(td->last_lsn) &&
+ (ret = __rep_lease_check(env, 1)) != 0)
+ return (__env_panic(env, ret));
+
+ /*
+ * This is here rather than in __txn_end because __txn_end is
+ * called too late during abort. So commit and abort each
+ * call it independently.
+ */
+ __txn_reset_fe_watermarks(txn);
+
+ /* This is OK because __txn_end can only fail with a panic. */
+ return (__txn_end(txn, 1));
+
+err: /*
+ * If we are prepared, then we "must" be able to commit. We panic here
+ * because even though the coordinator might be able to retry it is not
+ * clear it would know to do that. Otherwise we'll try to abort. If
+ * that is successful, then we return whatever was in ret (that is, the
+ * reason we failed). If the abort was unsuccessful, abort probably
+ * returned DB_RUNRECOVERY and we need to propagate that up.
+ */
+ if (td->status == TXN_PREPARED)
+ return (__env_panic(env, ret));
+
+ if ((t_ret = __txn_abort(txn)) != 0)
+ ret = t_ret;
+ return (ret);
+}
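+
+/*
+ * For reference: as the child branch above shows, a committing child
+ * writes no commit record of its own.  Its commit is logged into the
+ * parent's trail as a __txn_child record (see txn.src), and only the
+ * outermost commit writes and flushes a __txn_regop record, e.g.:
+ *
+ *	(child's log records) ... __txn_child(child, c_lsn)
+ *	(parent's log records) ... __txn_regop(TXN_COMMIT), flushed
+ */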
+
+/*
+ * __txn_close_cursors --
+ *	Close a transaction's registered cursors; all of its cursors are
+ *	guaranteed to be closed when this returns.
+ */
+static int
+__txn_close_cursors(txn)
+ DB_TXN *txn;
+{
+ int ret, tret;
+ DBC *dbc;
+
+ ret = tret = 0;
+ dbc = NULL;
+
+ if (txn == NULL)
+ return (0);
+
+ while ((dbc = TAILQ_FIRST(&txn->my_cursors)) != NULL) {
+
+ DB_ASSERT(dbc->env, txn == dbc->txn);
+
+ /*
+ * Unregister the cursor from its transaction, regardless
+ * of return.
+ */
+ TAILQ_REMOVE(&(txn->my_cursors), dbc, txn_cursors);
+ dbc->txn_cursors.tqe_next = NULL;
+ dbc->txn_cursors.tqe_prev = NULL;
+
+ /* Removed from the active queue here. */
+ if (F_ISSET(dbc, DBC_ACTIVE))
+ ret = __dbc_close(dbc);
+
+ dbc->txn = NULL;
+
+ /* We have to close all cursors anyway, so continue on error. */
+ if (ret != 0) {
+ __db_err(dbc->env, ret, "__dbc_close");
+ if (tret == 0)
+ tret = ret;
+ }
+ }
+ txn->my_cursors.tqh_first = NULL;
+ txn->my_cursors.tqh_last = NULL;
+
+	return (tret);	/* Return the first error, if any. */
+}
+
+/*
+ * __txn_set_commit_token --
+ * Store a pointer to user's commit token buffer, for later use.
+ */
+static int
+__txn_set_commit_token(txn, tokenp)
+ DB_TXN *txn;
+ DB_TXN_TOKEN *tokenp;
+{
+ ENV *env;
+
+ env = txn->mgrp->env;
+ ENV_REQUIRES_CONFIG(env,
+ env->lg_handle, "DB_TXN->set_commit_token", DB_INIT_LOG);
+ if (txn->parent != NULL) {
+ __db_errx(env, DB_STR("4526",
+ "commit token unavailable for nested txn"));
+ return (EINVAL);
+ }
+ if (IS_REP_CLIENT(env)) {
+ __db_errx(env, DB_STR("4527",
+ "may not be called on a replication client"));
+ return (EINVAL);
+ }
+
+ txn->token_buffer = tokenp;
+
+#ifdef DIAGNOSTIC
+ /*
+ * Applications may rely on the contents of the token buffer becoming
+ * valid only after a successful commit(). So it is not strictly
+ * necessary to initialize the buffer here. But in case they get
+ * confused we initialize it here to a recognizably invalid value.
+ */
+ memset(tokenp, 0, DB_TXN_TOKEN_SIZE);
+#endif
+
+ return (0);
+}
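+
+/*
+ * Illustrative sketch, not part of this file: the intended token flow.
+ * The application registers a DB_TXN_TOKEN before committing;
+ * __txn_build_token below fills it in at commit time, and the token can
+ * later be handed to DB_ENV->txn_applied (see __txn_applied_pp) to ask
+ * whether the commit is visible at this environment:
+ *
+ *	DB_TXN_TOKEN token;
+ *
+ *	if ((ret = txn->set_commit_token(txn, &token)) != 0)
+ *		return (ret);
+ *	if ((ret = txn->commit(txn, 0)) != 0)
+ *		return (ret);
+ *	ret = dbenv->txn_applied(dbenv, &token, 0, 0);
+ */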
+
+/*
+ * __txn_build_token --
+ * Stash a token describing the committing transaction into the buffer
+ * previously designated by the user. Called only in the case where the user
+ * has indeed supplied a buffer address.
+ */
+static void
+__txn_build_token(txn, lsnp)
+ DB_TXN *txn;
+ DB_LSN *lsnp;
+{
+ ENV *env;
+ REGENV *renv;
+ u_int8_t *bp;
+ u_int32_t gen, version;
+
+ bp = txn->token_buffer->buf;
+ env = txn->mgrp->env;
+ renv = env->reginfo->primary;
+
+ /* Marshal the information into external form. */
+ version = REP_COMMIT_TOKEN_FMT_VERSION;
+ gen = REP_ON(env) ? env->rep_handle->region->gen : 0;
+ DB_HTONL_COPYOUT(env, bp, version);
+ DB_HTONL_COPYOUT(env, bp, gen);
+ DB_HTONL_COPYOUT(env, bp, renv->envid);
+ DB_HTONL_COPYOUT(env, bp, lsnp->file);
+ DB_HTONL_COPYOUT(env, bp, lsnp->offset);
+}
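+
+/*
+ * For reference, the marshaled token layout produced above: five 32-bit
+ * fields in network byte order, DB_TXN_TOKEN_SIZE (20) bytes in all.
+ *
+ *	version | gen | envid | lsn.file | lsn.offset
+ */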
+
+/*
+ * __txn_abort_pp --
+ * Interface routine to TXN->abort.
+ */
+static int
+__txn_abort_pp(txn)
+ DB_TXN *txn;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int rep_check, ret, t_ret;
+
+ env = txn->mgrp->env;
+ rep_check = IS_ENV_REPLICATED(env) &&
+ txn->parent == NULL && IS_REAL_TXN(txn);
+
+ ENV_ENTER(env, ip);
+ ret = __txn_abort(txn);
+ if (rep_check && (t_ret = __op_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __txn_abort --
+ * Abort a transaction.
+ *
+ * PUBLIC: int __txn_abort __P((DB_TXN *));
+ */
+int
+__txn_abort(txn)
+ DB_TXN *txn;
+{
+ DB_LOCKREQ request;
+ DB_TXN *kid;
+ ENV *env;
+ REGENV *renv;
+ REGINFO *infop;
+ TXN_DETAIL *td;
+ u_int32_t id;
+ int ret;
+
+ env = txn->mgrp->env;
+ td = txn->td;
+ /*
+ * Do not abort an XA transaction if another process is still using
+	 * it; however, make sure that it is aborted when the last process
+ * tries to abort it.
+ */
+ if (txn->xa_thr_status != TXN_XA_THREAD_NOTA && td->xa_ref > 1) {
+ td->status = TXN_NEED_ABORT;
+ return (0);
+ }
+
+ PERFMON1(env, txn, abort, txn->txnid);
+ /*
+ * Close registered cursors before the abort. Even if the call fails,
+ * all cursors are closed.
+ */
+ if ((ret = __txn_close_cursors(txn)) != 0)
+ return (__env_panic(env, ret));
+
+	/* Any failure during abort is fatal: panic on an invalid handle. */
+ if ((ret = __txn_isvalid(txn, TXN_OP_ABORT)) != 0)
+ return (__env_panic(env, ret));
+
+ /*
+ * Clear the watermarks now. Can't do this in __txn_end because
+ * __db_refresh, called from undo, will free the DB_MPOOLFILEs.
+ */
+ __txn_reset_fe_watermarks(txn);
+
+ /*
+ * Try to abort any unresolved children.
+ *
+ * Abort either succeeds or panics the region. As soon as we
+ * see any failure, we just get out of here and return the panic
+ * up.
+ */
+ while ((kid = TAILQ_FIRST(&txn->kids)) != NULL)
+ if ((ret = __txn_abort(kid)) != 0)
+ return (ret);
+
+ infop = env->reginfo;
+ renv = infop->primary;
+ /*
+ * No mutex is needed as envid is read-only once it is set.
+ */
+ id = renv->envid;
+
+ /*
+ * Fast path -- no need to do anything fancy if there were no
+ * modifications (e.g., log records) for this transaction.
+	 * We still call txn_undo to clean up the txn_list from our
+ * children.
+ */
+ if (IS_ZERO_LSN(td->last_lsn) && STAILQ_FIRST(&txn->logs) == NULL) {
+ if (txn->txn_list == NULL)
+ goto done;
+ else
+ goto undo;
+ }
+
+ if (LOCKING_ON(env)) {
+ /* Allocate a locker for this restored txn if necessary. */
+ if (txn->locker == NULL &&
+ (ret = __lock_getlocker(env->lk_handle,
+ txn->txnid, 1, &txn->locker)) != 0)
+ return (__env_panic(env, ret));
+ /*
+ * We are about to free all the read locks for this transaction
+ * below. Some of those locks might be handle locks which
+ * should not be freed, because they will be freed when the
+ * handle is closed. Check the events and preprocess any
+ * trades now so that we don't release the locks below.
+ */
+ if ((ret = __txn_doevents(env, txn, TXN_ABORT, 1)) != 0)
+ return (__env_panic(env, ret));
+
+ /* Turn off timeouts. */
+ if ((ret = __lock_set_timeout(env,
+ txn->locker, 0, DB_SET_TXN_TIMEOUT)) != 0)
+ return (__env_panic(env, ret));
+
+ if ((ret = __lock_set_timeout(env,
+ txn->locker, 0, DB_SET_LOCK_TIMEOUT)) != 0)
+ return (__env_panic(env, ret));
+
+ request.op = DB_LOCK_UPGRADE_WRITE;
+ request.obj = NULL;
+ if ((ret = __lock_vec(
+ env, txn->locker, 0, &request, 1, NULL)) != 0)
+ return (__env_panic(env, ret));
+ }
+undo: if ((ret = __txn_undo(txn)) != 0)
+ return (__env_panic(env, ret));
+
+ /*
+ * Normally, we do not need to log aborts. However, if we
+ * are a distributed transaction (i.e., we have a prepare),
+ * then we log the abort so we know that this transaction
+ * was actually completed.
+ */
+done: if (DBENV_LOGGING(env) && td->status == TXN_PREPARED &&
+ (ret = __txn_regop_log(env, txn, &td->last_lsn,
+ LOG_FLAGS(txn), TXN_ABORT, (int32_t)time(NULL), id, NULL)) != 0)
+ return (__env_panic(env, ret));
+
+ /* __txn_end always panics if it errors, so pass the return along. */
+ return (__txn_end(txn, 0));
+}
+
+/*
+ * __txn_discard --
+ * Interface routine to TXN->discard.
+ */
+static int
+__txn_discard(txn, flags)
+ DB_TXN *txn;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int rep_check, ret, t_ret;
+
+ env = txn->mgrp->env;
+ rep_check = IS_ENV_REPLICATED(env) &&
+ txn->parent == NULL && IS_REAL_TXN(txn);
+
+ ENV_ENTER(env, ip);
+ ret = __txn_discard_int(txn, flags);
+ if (rep_check && (t_ret = __op_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __txn_discard_int --
+ * Free the per-process resources associated with this txn handle.
+ *
+ * PUBLIC: int __txn_discard_int __P((DB_TXN *, u_int32_t flags));
+ */
+int
+__txn_discard_int(txn, flags)
+ DB_TXN *txn;
+ u_int32_t flags;
+{
+ DB_TXNMGR *mgr;
+ ENV *env;
+ int ret;
+
+ COMPQUIET(flags, 0);
+
+ mgr = txn->mgrp;
+ env = mgr->env;
+
+ /* Close registered cursors. */
+ if ((ret = __txn_close_cursors(txn)) != 0)
+ return (ret);
+
+ if ((ret = __txn_isvalid(txn, TXN_OP_DISCARD)) != 0)
+ return (ret);
+
+ /* Should be no children. */
+ DB_ASSERT(env, TAILQ_FIRST(&txn->kids) == NULL);
+
+ /* Free the space. */
+ MUTEX_LOCK(env, mgr->mutex);
+ mgr->n_discards++;
+ if (F_ISSET(txn, TXN_MALLOC)) {
+ TAILQ_REMOVE(&mgr->txn_chain, txn, links);
+ }
+ MUTEX_UNLOCK(env, mgr->mutex);
+ if (F_ISSET(txn, TXN_MALLOC) &&
+ txn->xa_thr_status != TXN_XA_THREAD_ASSOCIATED)
+ __os_free(env, txn);
+
+ return (0);
+}
+
+/*
+ * __txn_prepare --
+ * Flush the log so a future commit is guaranteed to succeed.
+ *
+ * PUBLIC: int __txn_prepare __P((DB_TXN *, u_int8_t *));
+ */
+int
+__txn_prepare(txn, gid)
+ DB_TXN *txn;
+ u_int8_t *gid;
+{
+ DBT list_dbt, gid_dbt;
+ DB_LOCKREQ request;
+ DB_THREAD_INFO *ip;
+ DB_TXN *kid;
+ ENV *env;
+ TXN_DETAIL *td;
+ u_int32_t lflags;
+ int ret;
+
+ env = txn->mgrp->env;
+ td = txn->td;
+ PERFMON2(env, txn, prepare, txn->txnid, gid);
+ DB_ASSERT(env, txn->xa_thr_status == TXN_XA_THREAD_NOTA ||
+ td->xa_ref == 1);
+ ENV_ENTER(env, ip);
+
+ /* Close registered cursors. */
+ if ((ret = __txn_close_cursors(txn)) != 0)
+ goto err;
+
+ if ((ret = __txn_isvalid(txn, TXN_OP_PREPARE)) != 0)
+ goto err;
+ if (F_ISSET(txn, TXN_DEADLOCK)) {
+ ret = __db_txn_deadlock_err(env, txn);
+ goto err;
+ }
+
+ /* Commit any unresolved children. */
+ while ((kid = TAILQ_FIRST(&txn->kids)) != NULL)
+ if ((ret = __txn_commit(kid, DB_TXN_NOSYNC)) != 0)
+ goto err;
+
+ /* We must set the global transaction ID here. */
+ memcpy(td->gid, gid, DB_GID_SIZE);
+ if ((ret = __txn_doevents(env, txn, TXN_PREPARE, 1)) != 0)
+ goto err;
+ memset(&request, 0, sizeof(request));
+ if (LOCKING_ON(env)) {
+ request.op = DB_LOCK_PUT_READ;
+ if (!IS_ZERO_LSN(td->last_lsn)) {
+ memset(&list_dbt, 0, sizeof(list_dbt));
+ request.obj = &list_dbt;
+ }
+ if ((ret = __lock_vec(env,
+ txn->locker, 0, &request, 1, NULL)) != 0)
+ goto err;
+	}
+ if (DBENV_LOGGING(env)) {
+		memset(&gid_dbt, 0, sizeof(gid_dbt));
+ gid_dbt.data = gid;
+ gid_dbt.size = DB_GID_SIZE;
+ lflags = DB_LOG_COMMIT | DB_FLUSH;
+ if ((ret = __txn_prepare_log(env,
+ txn, &td->last_lsn, lflags, TXN_PREPARE,
+ &gid_dbt, &td->begin_lsn, request.obj)) != 0)
+ __db_err(env, ret, DB_STR("4528",
+ "DB_TXN->prepare: log_write failed"));
+
+ if (request.obj != NULL && request.obj->data != NULL)
+ __os_free(env, request.obj->data);
+ if (ret != 0)
+ goto err;
+	}
+
+ MUTEX_LOCK(env, txn->mgrp->mutex);
+ td->status = TXN_PREPARED;
+ MUTEX_UNLOCK(env, txn->mgrp->mutex);
+err: ENV_LEAVE(env, ip);
+ return (ret);
+}
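+
+/*
+ * Illustrative sketch, not part of this file: minimal two-phase commit
+ * from the application's side.  Once prepare() returns, the prepare
+ * record has been flushed (DB_FLUSH above), so a later commit is
+ * guaranteed to succeed even across a crash:
+ *
+ *	u_int8_t gid[DB_GID_SIZE];
+ *
+ *	(fill gid with a globally unique identifier)
+ *	if ((ret = txn->prepare(txn, gid)) != 0)
+ *		return (ret);
+ *	ret = txn->commit(txn, 0);
+ */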
+
+/*
+ * __txn_id --
+ * Return the transaction ID.
+ *
+ * PUBLIC: u_int32_t __txn_id __P((DB_TXN *));
+ */
+u_int32_t
+__txn_id(txn)
+ DB_TXN *txn;
+{
+ return (txn->txnid);
+}
+
+/*
+ * __txn_get_name --
+ * Get a descriptive string from a transaction.
+ *
+ * PUBLIC: int __txn_get_name __P((DB_TXN *, const char **));
+ */
+int
+__txn_get_name(txn, namep)
+ DB_TXN *txn;
+ const char **namep;
+{
+ *namep = txn->name;
+
+ return (0);
+}
+
+/*
+ * __txn_set_name --
+ * Set a descriptive string for a transaction.
+ *
+ * PUBLIC: int __txn_set_name __P((DB_TXN *, const char *));
+ */
+int
+__txn_set_name(txn, name)
+ DB_TXN *txn;
+ const char *name;
+{
+ DB_THREAD_INFO *ip;
+ DB_TXNMGR *mgr;
+ ENV *env;
+ TXN_DETAIL *td;
+ size_t len;
+ int ret;
+ char *p;
+
+ mgr = txn->mgrp;
+ env = mgr->env;
+ td = txn->td;
+ len = strlen(name) + 1;
+
+ if ((ret = __os_realloc(env, len, &txn->name)) != 0)
+ return (ret);
+ memcpy(txn->name, name, len);
+
+ ENV_ENTER(env, ip);
+ TXN_SYSTEM_LOCK(env);
+ if (td->name != INVALID_ROFF) {
+ __env_alloc_free(
+ &mgr->reginfo, R_ADDR(&mgr->reginfo, td->name));
+ td->name = INVALID_ROFF;
+ }
+ if ((ret = __env_alloc(&mgr->reginfo, len, &p)) != 0) {
+ TXN_SYSTEM_UNLOCK(env);
+ __db_errx(env, DB_STR("4529",
+ "Unable to allocate memory for transaction name"));
+
+ __os_free(env, txn->name);
+ txn->name = NULL;
+
+ ENV_LEAVE(env, ip);
+ return (ret);
+ }
+ TXN_SYSTEM_UNLOCK(env);
+ td->name = R_OFFSET(&mgr->reginfo, p);
+ memcpy(p, name, len);
+
+#ifdef DIAGNOSTIC
+ /*
+ * If DIAGNOSTIC is set, map the name into the log so users can track
+ * operations through the log.
+ */
+ if (DBENV_LOGGING(env))
+ (void)__log_printf(env, txn, "transaction %#lx named %s",
+ (u_long)txn->txnid, name);
+#endif
+
+ ENV_LEAVE(env, ip);
+ return (0);
+}
+
+/*
+ * __txn_get_priority --
+ * Get a transaction's priority level
+ * PUBLIC: int __txn_get_priority __P((DB_TXN *, u_int32_t *));
+ */
+int
+__txn_get_priority(txn, priorityp)
+ DB_TXN *txn;
+ u_int32_t *priorityp;
+{
+ if (txn->locker == NULL)
+		return (EINVAL);
+
+ *priorityp = txn->locker->priority;
+ return (0);
+}
+
+/*
+ * __txn_set_priority --
+ * Assign a transaction a priority level
+ * PUBLIC: int __txn_set_priority __P((DB_TXN *, u_int32_t));
+ */
+int
+__txn_set_priority(txn, priority)
+ DB_TXN *txn;
+ u_int32_t priority;
+{
+ if (txn->locker == NULL)
+		return (EINVAL);
+
+ txn->locker->priority = priority;
+ ((TXN_DETAIL *)txn->td)->priority = priority;
+
+ return (0);
+}
+
+/*
+ * __txn_set_timeout --
+ * ENV->set_txn_timeout.
+ * PUBLIC: int __txn_set_timeout __P((DB_TXN *, db_timeout_t, u_int32_t));
+ */
+int
+__txn_set_timeout(txn, timeout, op)
+ DB_TXN *txn;
+ db_timeout_t timeout;
+ u_int32_t op;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = txn->mgrp->env;
+
+ if (op != DB_SET_TXN_TIMEOUT && op != DB_SET_LOCK_TIMEOUT)
+ return (__db_ferr(env, "DB_TXN->set_timeout", 0));
+
+ ENV_ENTER(env, ip);
+	ret = __lock_set_timeout(env, txn->locker, timeout, op);
+	ENV_LEAVE(env, ip);
+ return (ret);
+}
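+
+/*
+ * Illustrative sketch, not part of this file: timeouts are expressed in
+ * microseconds.  DB_SET_TXN_TIMEOUT bounds the transaction's lifetime,
+ * DB_SET_LOCK_TIMEOUT bounds each individual lock request, e.g. a one
+ * second transaction timeout:
+ *
+ *	ret = txn->set_timeout(txn, 1000000, DB_SET_TXN_TIMEOUT);
+ */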
+
+/*
+ * __txn_isvalid --
+ * Return 0 if the DB_TXN is reasonable, otherwise panic.
+ */
+static int
+__txn_isvalid(txn, op)
+ const DB_TXN *txn;
+ txnop_t op;
+{
+ DB_TXNMGR *mgr;
+ DB_TXNREGION *region;
+ ENV *env;
+ TXN_DETAIL *td;
+
+ mgr = txn->mgrp;
+ env = mgr->env;
+ region = mgr->reginfo.primary;
+
+ /* Check for recovery. */
+ if (!F_ISSET(txn, TXN_COMPENSATE) &&
+ F_ISSET(region, TXN_IN_RECOVERY)) {
+ __db_errx(env, DB_STR("4530",
+ "operation not permitted during recovery"));
+ goto err;
+ }
+
+ /* Check for live cursors. */
+ if (txn->cursors != 0) {
+ __db_errx(env, DB_STR("4531",
+ "transaction has active cursors"));
+ goto err;
+ }
+
+ /* Check transaction's state. */
+ td = txn->td;
+
+ /* Handle any operation specific checks. */
+ switch (op) {
+ case TXN_OP_DISCARD:
+ /*
+		 * Since we're just tossing the per-process space, there are
+		 * a lot of problems with the transaction that we can tolerate.
+		 */
+
+		/* The transaction has already been reused. */
+ if (txn->txnid != td->txnid)
+ return (0);
+
+ /*
+ * What we've got had better be either a prepared or
+ * restored transaction.
+ */
+ if (td->status != TXN_PREPARED &&
+ !F_ISSET(td, TXN_DTL_RESTORED)) {
+ __db_errx(env, DB_STR("4532",
+ "not a restored transaction"));
+ return (__env_panic(env, EINVAL));
+ }
+
+ return (0);
+ case TXN_OP_PREPARE:
+ if (txn->parent != NULL) {
+ /*
+ * This is not fatal, because you could imagine an
+ * application that simply prepares everybody because
+ * it doesn't distinguish between children and parents.
+ * I'm not arguing this is good, but I could imagine
+ * someone doing it.
+ */
+ __db_errx(env, DB_STR("4533",
+ "Prepare disallowed on child transactions"));
+ return (EINVAL);
+ }
+ break;
+ case TXN_OP_ABORT:
+ case TXN_OP_COMMIT:
+ default:
+ break;
+ }
+
+ switch (td->status) {
+ case TXN_PREPARED:
+ if (op == TXN_OP_PREPARE) {
+ __db_errx(env, DB_STR("4534",
+ "transaction already prepared"));
+ /*
+ * Txn_prepare doesn't blow away the user handle, so
+ * in this case, give the user the opportunity to
+ * abort or commit.
+ */
+ return (EINVAL);
+ }
+ break;
+ case TXN_RUNNING:
+ case TXN_NEED_ABORT:
+ break;
+ case TXN_ABORTED:
+ case TXN_COMMITTED:
+ default:
+ __db_errx(env, DB_STR_A("4535",
+ "transaction already %s", "%s"),
+ td->status == TXN_COMMITTED ?
+ DB_STR_P("committed") : DB_STR_P("aborted"));
+ goto err;
+ }
+
+ return (0);
+
+err: /*
+ * If there's a serious problem with the transaction, panic. TXN
+ * handles are dead by definition when we return, and if you use
+ * a cursor you forgot to close, we have no idea what will happen.
+ */
+ return (__env_panic(env, EINVAL));
+}
+
+/*
+ * __txn_end --
+ * Internal transaction end routine.
+ */
+static int
+__txn_end(txn, is_commit)
+ DB_TXN *txn;
+ int is_commit;
+{
+ DB_LOCKREQ request;
+ DB_TXNLOGREC *lr;
+ DB_TXNMGR *mgr;
+ DB_TXNREGION *region;
+ ENV *env;
+ TXN_DETAIL *ptd, *td;
+ db_mutex_t mvcc_mtx;
+ int do_closefiles, ret;
+
+ mgr = txn->mgrp;
+ env = mgr->env;
+ region = mgr->reginfo.primary;
+ do_closefiles = 0;
+
+ /* Process commit events. */
+ if ((ret = __txn_doevents(env,
+ txn, is_commit ? TXN_COMMIT : TXN_ABORT, 0)) != 0)
+ return (__env_panic(env, ret));
+
+ /* End the transaction. */
+ td = txn->td;
+ if (td->nlog_dbs != 0 &&
+ (ret = __txn_dref_fname(env, txn)) != 0 && ret != EIO)
+ return (__env_panic(env, ret));
+
+ if (td->mvcc_ref != 0 && IS_MAX_LSN(td->visible_lsn)) {
+ /*
+ * Some pages were dirtied but nothing was logged. This can
+ * happen easily if we are aborting, but there are also cases
+ * in the compact code where pages are dirtied unconditionally
+ * and then we find out that there is no work to do.
+ *
+ * We need to make sure that the versions become visible to
+ * future transactions. We need to set visible_lsn before
+ * setting td->status to ensure safe reads of visible_lsn in
+ * __memp_fget.
+ */
+ if ((ret = __log_current_lsn_int(env, &td->visible_lsn,
+ NULL, NULL)) != 0)
+ return (__env_panic(env, ret));
+ }
+
+ /*
+ * Release the locks.
+ *
+	 * __txn_end cannot return a simple error; we MUST return
+ * success/failure from commit or abort, ignoring any internal
+ * errors. So, we panic if something goes wrong. We can't
+ * deadlock here because we're not acquiring any new locks,
+ * so DB_LOCK_DEADLOCK is just as fatal as any other error.
+ */
+ if (LOCKING_ON(env)) {
+ /* Allocate a locker for this restored txn if necessary. */
+ if (txn->locker == NULL &&
+ (ret = __lock_getlocker(env->lk_handle,
+ txn->txnid, 1, &txn->locker)) != 0)
+ return (__env_panic(env, ret));
+ request.op = txn->parent == NULL ||
+ is_commit == 0 ? DB_LOCK_PUT_ALL : DB_LOCK_INHERIT;
+ request.obj = NULL;
+ if ((ret = __lock_vec(env,
+ txn->locker, 0, &request, 1, NULL)) != 0)
+ return (__env_panic(env, ret));
+ }
+
+ TXN_SYSTEM_LOCK(env);
+ td->status = is_commit ? TXN_COMMITTED : TXN_ABORTED;
+ SH_TAILQ_REMOVE(&region->active_txn, td, links, __txn_detail);
+ region->curtxns--;
+ if (F_ISSET(td, TXN_DTL_RESTORED)) {
+ region->stat.st_nrestores--;
+ do_closefiles = region->stat.st_nrestores == 0;
+ }
+
+ if (td->name != INVALID_ROFF) {
+ __env_alloc_free(&mgr->reginfo,
+ R_ADDR(&mgr->reginfo, td->name));
+ td->name = INVALID_ROFF;
+ }
+ if (td->nlog_slots != TXN_NSLOTS)
+ __env_alloc_free(&mgr->reginfo,
+ R_ADDR(&mgr->reginfo, td->log_dbs));
+
+ if (txn->parent != NULL) {
+ ptd = txn->parent->td;
+ SH_TAILQ_REMOVE(&ptd->kids, td, klinks, __txn_detail);
+ } else if ((mvcc_mtx = td->mvcc_mtx) != MUTEX_INVALID) {
+ MUTEX_LOCK(env, mvcc_mtx);
+ if (td->mvcc_ref != 0) {
+ SH_TAILQ_INSERT_HEAD(&region->mvcc_txn,
+ td, links, __txn_detail);
+
+ /*
+ * The transaction has been added to the list of
+ * committed snapshot transactions with active pages.
+ * It needs to be freed when the last page is evicted.
+ */
+ F_SET(td, TXN_DTL_SNAPSHOT);
+#ifdef HAVE_STATISTICS
+ STAT_INC(env, txn,
+ nsnapshot, region->stat.st_nsnapshot, txn->txnid);
+ if (region->stat.st_nsnapshot >
+ region->stat.st_maxnsnapshot)
+ STAT_SET(env, txn, maxnsnapshot,
+ region->stat.st_maxnsnapshot,
+ region->stat.st_nsnapshot,
+ txn->txnid);
+#endif
+ td = NULL;
+ }
+ MUTEX_UNLOCK(env, mvcc_mtx);
+ if (td != NULL)
+ if ((ret = __mutex_free(env, &td->mvcc_mtx)) != 0)
+ return (__env_panic(env, ret));
+ }
+
+ if (td != NULL)
+ __env_alloc_free(&mgr->reginfo, td);
+
+#ifdef HAVE_STATISTICS
+ if (is_commit)
+ STAT_INC(env,
+ txn, ncommits, region->stat.st_ncommits, txn->txnid);
+ else
+ STAT_INC(env,
+ txn, naborts, region->stat.st_naborts, txn->txnid);
+ STAT_DEC(env, txn, nactive, region->stat.st_nactive, txn->txnid);
+#endif
+
+	/* Decrement bulk transaction counter while holding transaction lock. */
+ if (F_ISSET(txn, TXN_BULK))
+ ((DB_TXNREGION *)env->tx_handle->reginfo.primary)->n_bulk_txn--;
+
+ TXN_SYSTEM_UNLOCK(env);
+
+ /*
+	 * The transaction cannot get more locks; remove its locker info,
+ * if any.
+ */
+ if (LOCKING_ON(env) && (ret =
+ __lock_freelocker(env->lk_handle, txn->locker)) != 0)
+ return (__env_panic(env, ret));
+ if (txn->parent != NULL)
+ TAILQ_REMOVE(&txn->parent->kids, txn, klinks);
+
+ /* Free the space. */
+ while ((lr = STAILQ_FIRST(&txn->logs)) != NULL) {
+ STAILQ_REMOVE(&txn->logs, lr, __txn_logrec, links);
+ __os_free(env, lr);
+ }
+ if (txn->name != NULL) {
+ __os_free(env, txn->name);
+ txn->name = NULL;
+ }
+
+ /*
+ * Free the transaction structure if we allocated it and if we are
+ * not in an XA transaction that will be freed when we exit the XA
+ * wrapper routines.
+ */
+ if (F_ISSET(txn, TXN_MALLOC) &&
+ txn->xa_thr_status != TXN_XA_THREAD_ASSOCIATED) {
+ MUTEX_LOCK(env, mgr->mutex);
+ TAILQ_REMOVE(&mgr->txn_chain, txn, links);
+ MUTEX_UNLOCK(env, mgr->mutex);
+
+ __os_free(env, txn);
+ }
+
+ if (do_closefiles) {
+ /*
+ * Otherwise, we have resolved the last outstanding prepared
+ * txn and need to invalidate the fileids that were left
+ * open for those txns and then close them.
+ */
+ (void)__dbreg_invalidate_files(env, 1);
+ (void)__dbreg_close_files(env, 1);
+ if (IS_REP_MASTER(env))
+ F_CLR(env->rep_handle, DBREP_OPENFILES);
+ F_CLR(env->lg_handle, DBLOG_OPENFILES);
+ mgr->n_discards = 0;
+ (void)__txn_checkpoint(env, 0, 0,
+ DB_CKP_INTERNAL | DB_FORCE);
+ }
+
+ return (0);
+}
+
+static int
+__txn_dispatch_undo(env, txn, rdbt, key_lsn, txnlist)
+ ENV *env;
+ DB_TXN *txn;
+ DBT *rdbt;
+ DB_LSN *key_lsn;
+ DB_TXNHEAD *txnlist;
+{
+ int ret;
+
+ txnlist->td = txn->td;
+ ret = __db_dispatch(env, &env->recover_dtab,
+ rdbt, key_lsn, DB_TXN_ABORT, txnlist);
+ if (ret == DB_SURPRISE_KID) {
+ F_SET(txn, TXN_CHILDCOMMIT);
+ ret = 0;
+ }
+ if (ret == 0 && F_ISSET(txn, TXN_CHILDCOMMIT) && IS_ZERO_LSN(*key_lsn))
+ ret = __db_txnlist_lsnget(env, txnlist, key_lsn, 0);
+
+ return (ret);
+}
+
+/*
+ * __txn_undo --
+ * Undo the transaction with id txnid.
+ */
+static int
+__txn_undo(txn)
+ DB_TXN *txn;
+{
+ DBT rdbt;
+ DB_LOGC *logc;
+ DB_LSN key_lsn;
+ DB_TXN *ptxn;
+ DB_TXNHEAD *txnlist;
+ DB_TXNLOGREC *lr;
+ DB_TXNMGR *mgr;
+ ENV *env;
+ int ret, t_ret;
+
+ mgr = txn->mgrp;
+ env = mgr->env;
+ logc = NULL;
+ txnlist = NULL;
+ ret = 0;
+
+ if (!LOGGING_ON(env))
+ return (0);
+
+ /*
+ * This is the simplest way to code this, but if the mallocs during
+ * recovery turn out to be a performance issue, we can do the
+ * allocation here and use DB_DBT_USERMEM.
+ */
+ memset(&rdbt, 0, sizeof(rdbt));
+
+ /*
+ * Allocate a txnlist for children and aborted page allocs.
+ * We need to associate the list with the maximal parent
+ * so that aborted pages are recovered when that transaction
+ * is committed or aborted.
+ */
+ for (ptxn = txn->parent; ptxn != NULL && ptxn->parent != NULL;)
+ ptxn = ptxn->parent;
+
+ if (ptxn != NULL && ptxn->txn_list != NULL)
+ txnlist = ptxn->txn_list;
+ else if (txn->txn_list != NULL)
+ txnlist = txn->txn_list;
+ else if ((ret = __db_txnlist_init(env,
+ txn->thread_info, 0, 0, NULL, &txnlist)) != 0)
+ return (ret);
+ else if (ptxn != NULL)
+ ptxn->txn_list = txnlist;
+
+ /*
+ * Take log records from the linked list stored in the transaction,
+ * then from the log.
+ */
+ STAILQ_FOREACH(lr, &txn->logs, links) {
+ rdbt.data = lr->data;
+ rdbt.size = 0;
+ LSN_NOT_LOGGED(key_lsn);
+ ret =
+ __txn_dispatch_undo(env, txn, &rdbt, &key_lsn, txnlist);
+ if (ret != 0) {
+ __db_err(env, ret, DB_STR("4536",
+ "DB_TXN->abort: in-memory log undo failed"));
+ goto err;
+ }
+ }
+
+ key_lsn = ((TXN_DETAIL *)txn->td)->last_lsn;
+
+ if (!IS_ZERO_LSN(key_lsn) &&
+ (ret = __log_cursor(env, &logc)) != 0)
+ goto err;
+
+ while (!IS_ZERO_LSN(key_lsn)) {
+ /*
+ * The dispatch routine returns the lsn of the record
+ * before the current one in the key_lsn argument.
+ */
+ if ((ret = __logc_get(logc, &key_lsn, &rdbt, DB_SET)) == 0) {
+ ret = __txn_dispatch_undo(env,
+ txn, &rdbt, &key_lsn, txnlist);
+ }
+
+ if (ret != 0) {
+ __db_err(env, ret, DB_STR_A("4537",
+ "DB_TXN->abort: log undo failed for LSN: %lu %lu",
+ "%lu %lu"), (u_long)key_lsn.file,
+ (u_long)key_lsn.offset);
+ goto err;
+ }
+ }
+
+err: if (logc != NULL && (t_ret = __logc_close(logc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (ptxn == NULL && txnlist != NULL)
+ __db_txnlist_end(env, txnlist);
+ return (ret);
+}
+
+/*
+ * __txn_activekids --
+ *	Return EPERM if this transaction has any active children, else 0.
+ *
+ * PUBLIC: int __txn_activekids __P((ENV *, u_int32_t, DB_TXN *));
+ */
+int
+__txn_activekids(env, rectype, txn)
+ ENV *env;
+ u_int32_t rectype;
+ DB_TXN *txn;
+{
+ /*
+	 * On a child commit, we know that there are children (i.e., the
+	 * committing child, at the least).  In that case, skip this check.
+ */
+ if (F_ISSET(txn, TXN_COMPENSATE) || rectype == DB___txn_child)
+ return (0);
+
+ if (TAILQ_FIRST(&txn->kids) != NULL) {
+ __db_errx(env, DB_STR("4538",
+ "Child transaction is active"));
+ return (EPERM);
+ }
+ return (0);
+}
+
+/*
+ * __txn_force_abort --
+ * Force an abort record into the log if the commit record
+ * failed to get to disk.
+ *
+ * PUBLIC: int __txn_force_abort __P((ENV *, u_int8_t *));
+ */
+int
+__txn_force_abort(env, buffer)
+ ENV *env;
+ u_int8_t *buffer;
+{
+ DB_CIPHER *db_cipher;
+ HDR hdr, *hdrp;
+ u_int32_t offset, opcode, sum_len;
+ u_int8_t *bp, *key;
+ size_t hdrsize, rec_len;
+ int ret;
+
+ db_cipher = env->crypto_handle;
+
+ /*
+ * This routine depends on the layout of HDR and the __txn_regop
+ * record in txn.src. We are passed the beginning of the commit
+ * record in the log buffer and overwrite the commit with an abort
+ * and recalculate the checksum.
+ */
+ hdrsize = CRYPTO_ON(env) ? HDR_CRYPTO_SZ : HDR_NORMAL_SZ;
+
+ hdrp = (HDR *)buffer;
+ memcpy(&hdr.prev, buffer + SSZ(HDR, prev), sizeof(hdr.prev));
+ memcpy(&hdr.len, buffer + SSZ(HDR, len), sizeof(hdr.len));
+ if (LOG_SWAPPED(env))
+ __log_hdrswap(&hdr, CRYPTO_ON(env));
+ rec_len = hdr.len - hdrsize;
+
+ offset = sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN);
+ if (CRYPTO_ON(env)) {
+ key = db_cipher->mac_key;
+ sum_len = DB_MAC_KEY;
+ if ((ret = db_cipher->decrypt(env, db_cipher->data,
+ &hdrp->iv[0], buffer + hdrsize, rec_len)) != 0)
+ return (__env_panic(env, ret));
+ } else {
+ key = NULL;
+ sum_len = sizeof(u_int32_t);
+ }
+ bp = buffer + hdrsize + offset;
+ opcode = TXN_ABORT;
+ LOGCOPY_32(env, bp, &opcode);
+
+ if (CRYPTO_ON(env) &&
+ (ret = db_cipher->encrypt(env,
+ db_cipher->data, &hdrp->iv[0], buffer + hdrsize, rec_len)) != 0)
+ return (__env_panic(env, ret));
+
+#ifdef HAVE_LOG_CHECKSUM
+ __db_chksum(&hdr, buffer + hdrsize, rec_len, key, NULL);
+ if (LOG_SWAPPED(env))
+ __log_hdrswap(&hdr, CRYPTO_ON(env));
+ memcpy(buffer + SSZA(HDR, chksum), hdr.chksum, sum_len);
+#endif
+
+ return (0);
+}
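+
+/*
+ * For reference, the record layout assumed by the offset arithmetic
+ * above.  Following the log record HDR, a generated __txn_regop record
+ * starts with its record type, transaction id and previous LSN, so
+ * "offset" skips those three fields to land on the opcode, which is
+ * rewritten from TXN_COMMIT to TXN_ABORT in place:
+ *
+ *	HDR | rectype (4) | txnid (4) | prev (DB_LSN) | opcode (4) | ...
+ */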
+
+/*
+ * __txn_preclose --
+ * Before we can close an environment, we need to check if we were in the
+ * middle of taking care of restored transactions. If so, close the files
+ * we opened.
+ *
+ * PUBLIC: int __txn_preclose __P((ENV *));
+ */
+int
+__txn_preclose(env)
+ ENV *env;
+{
+ DB_TXNMGR *mgr;
+ DB_TXNREGION *region;
+ int do_closefiles, ret;
+
+ mgr = env->tx_handle;
+ region = mgr->reginfo.primary;
+ do_closefiles = 0;
+
+ TXN_SYSTEM_LOCK(env);
+ if (region != NULL &&
+ region->stat.st_nrestores <= mgr->n_discards &&
+ mgr->n_discards != 0)
+ do_closefiles = 1;
+ TXN_SYSTEM_UNLOCK(env);
+
+ if (do_closefiles) {
+ /*
+ * Set the DBLOG_RECOVER flag while closing these files so they
+ * do not create additional log records that will confuse future
+ * recoveries.
+ */
+ F_SET(env->lg_handle, DBLOG_RECOVER);
+ ret = __dbreg_close_files(env, 0);
+ F_CLR(env->lg_handle, DBLOG_RECOVER);
+ } else
+ ret = 0;
+
+ return (ret);
+}
+
+/*
+ * __txn_reset --
+ * Reset the last txnid to its minimum value, and log the reset.
+ *
+ * PUBLIC: int __txn_reset __P((ENV *));
+ */
+int
+__txn_reset(env)
+ ENV *env;
+{
+ DB_LSN scrap;
+ DB_TXNREGION *region;
+
+ region = env->tx_handle->reginfo.primary;
+ region->last_txnid = TXN_MINIMUM;
+
+ DB_ASSERT(env, LOGGING_ON(env));
+ return (__txn_recycle_log(env,
+ NULL, &scrap, 0, TXN_MINIMUM, TXN_MAXIMUM));
+}
+
+/*
+ * __txn_set_txn_lsnp --
+ * Set the pointer to the begin_lsn field if that field is zero.
+ * Set the pointer to the last_lsn field.
+ */
+static void
+__txn_set_txn_lsnp(txn, blsnp, llsnp)
+ DB_TXN *txn;
+ DB_LSN **blsnp, **llsnp;
+{
+ TXN_DETAIL *td;
+
+ td = txn->td;
+ *llsnp = &td->last_lsn;
+
+ while (txn->parent != NULL)
+ txn = txn->parent;
+
+ td = txn->td;
+ if (IS_ZERO_LSN(td->begin_lsn))
+ *blsnp = &td->begin_lsn;
+}
+
+/*
+ * PUBLIC: int __txn_applied_pp __P((DB_ENV *,
+ * PUBLIC: DB_TXN_TOKEN *, db_timeout_t, u_int32_t));
+ */
+int
+__txn_applied_pp(dbenv, token, timeout, flags)
+ DB_ENV *dbenv;
+ DB_TXN_TOKEN *token;
+ db_timeout_t timeout;
+ u_int32_t flags;
+{
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ DB_COMMIT_INFO commit_info;
+ u_int8_t *bp;
+ int ret;
+
+ env = dbenv->env;
+
+ if (flags != 0)
+ return (__db_ferr(env, "DB_ENV->txn_applied", 0));
+
+ /* Unmarshal the token from its stored form. */
+ bp = token->buf;
+ DB_NTOHL_COPYIN(env, commit_info.version, bp);
+ DB_ASSERT(env, commit_info.version == REP_COMMIT_TOKEN_FMT_VERSION);
+ DB_NTOHL_COPYIN(env, commit_info.gen, bp);
+ DB_NTOHL_COPYIN(env, commit_info.envid, bp);
+ DB_NTOHL_COPYIN(env, commit_info.lsn.file, bp);
+ DB_NTOHL_COPYIN(env, commit_info.lsn.offset, bp);
+
+ /*
+ * Check for a token representing a transaction that committed without
+ * any log records having been written. Ideally an application should
+ * be smart enough to avoid trying to use a token from such an "empty"
+ * transaction. But in some cases it might be difficult for them to
+ * keep track, so we don't really forbid it.
+ */
+ if (IS_ZERO_LSN(commit_info.lsn))
+ return (DB_KEYEMPTY);
+
+ ENV_REQUIRES_CONFIG(env,
+ env->lg_handle, "DB_ENV->txn_applied", DB_INIT_LOG);
+
+ ENV_ENTER(env, ip);
+ ret = __txn_applied(env, ip, &commit_info, timeout);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+static int
+__txn_applied(env, ip, commit_info, timeout)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ DB_COMMIT_INFO *commit_info;
+ db_timeout_t timeout;
+{
+ LOG *lp;
+ DB_LSN lsn;
+ REGENV *renv;
+
+ /*
+ * The lockout protection scope between __op_handle_enter and
+ * __env_db_rep_exit is handled within __rep_txn_applied, and is not
+ * needed here since the rest of this function only runs in a
+ * non-replication env.
+ */
+ if (REP_ON(env))
+ return (__rep_txn_applied(env, ip, commit_info, timeout));
+
+ if (commit_info->gen != 0) {
+ __db_errx(env, DB_STR("4539",
+ "replication commit token in non-replication env"));
+ return (EINVAL);
+ }
+
+ lp = env->lg_handle->reginfo.primary;
+ LOG_SYSTEM_LOCK(env);
+ lsn = lp->lsn;
+ LOG_SYSTEM_UNLOCK(env);
+
+ renv = env->reginfo->primary;
+
+ if (renv->envid == commit_info->envid &&
+ LOG_COMPARE(&commit_info->lsn, &lsn) <= 0)
+ return (0);
+ return (DB_NOTFOUND);
+}
diff --git a/src/txn/txn.src b/src/txn/txn.src
new file mode 100644
index 00000000..7e82dc82
--- /dev/null
+++ b/src/txn/txn.src
@@ -0,0 +1,120 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+DBPRIVATE
+PREFIX __txn
+
+INCLUDE #include "db_int.h"
+INCLUDE #include "dbinc/crypto.h"
+INCLUDE #include "dbinc/db_page.h"
+INCLUDE #include "dbinc/db_dispatch.h"
+INCLUDE #include "dbinc/db_am.h"
+INCLUDE #include "dbinc/lock.h"
+INCLUDE #include "dbinc/txn.h"
+INCLUDE
+
+/*
+ * This is the standard log operation for commit.
+ * Note that we are using an int32_t for the timestamp.  This means that
+ * in 2038, when a signed 32-bit time_t overflows, we will need to
+ * deprecate this log record and create one that
+ * either changes the Epoch or has a 64-bit offset.
+ * NOTE: The opcode MUST be the first argument in these records, because
+ * the force_abort code overwrites it with an ABORT should the write to
+ * the log fail.
+ * envid:
+ * Environment ID of this operation (4.4+).
+ */
+BEGIN_COMPAT regop 42 10
+ARG opcode u_int32_t lu
+TIME timestamp int32_t ld
+LOCKS locks DBT s
+END
+
+BEGIN regop 44 10
+ARG opcode u_int32_t lu
+TIME timestamp int32_t ld
+ARG envid u_int32_t lu
+LOCKS locks DBT s
+END
+
+/*
+ * This is the checkpoint record. It contains the lsn that the checkpoint
+ * guarantees and a pointer to the last checkpoint so we can walk backwards
+ * by checkpoint.
+ *
+ * ckp_lsn:
+ * The lsn in the log of the most recent point at which all begun
+ * transactions have been aborted. This is the point for which
+ * the checkpoint is relevant.
+ * last_ckp:
+ * The previous checkpoint.
+ * timestamp:
+ * See comment in commit about timestamps.
+ * envid:
+ * Environment ID of this checkpoint (4.3+).
+ * rep_gen:
+ * Persistent replication generation number (4.2-4.5 only).
+ * Renamed to 'spare' in 4.6.
+ */
+BEGIN_COMPAT ckp 42 11
+POINTER ckp_lsn DB_LSN * lu
+POINTER last_ckp DB_LSN * lu
+TIME timestamp int32_t ld
+ARG rep_gen u_int32_t lu
+END
+
+BEGIN ckp 43 11
+POINTER ckp_lsn DB_LSN * lu
+POINTER last_ckp DB_LSN * lu
+TIME timestamp int32_t ld
+ARG envid u_int32_t lu
+ARG spare u_int32_t lu
+END
+
+/*
+ * This is the (new) log operation for a child commit. It is
+ * logged as a record in the PARENT. The child field contains
+ * the transaction ID of the child committing and the c_lsn is
+ * the last LSN of the child's log trail.
+ */
+BEGIN child 42 12
+ARG child u_int32_t lx
+POINTER c_lsn DB_LSN * lu
+END
+
+
+/*
+ * This is the standard log operation for prepare.
+ * NOTE: The opcode MUST be the first argument in these records, because
+ * the force_abort code overwrites it with an ABORT should the write to
+ * the log fail.
+ */
+BEGIN_COMPAT xa_regop 42 13
+ARG opcode u_int32_t lu
+DBT xid DBT s
+ARG formatID int32_t ld
+ARG gtrid u_int32_t lu
+ARG bqual u_int32_t lu
+POINTER begin_lsn DB_LSN * lu
+LOCKS locks DBT s
+END
+
+BEGIN prepare 48 13
+ARG opcode u_int32_t lu
+DBT gid DBT s
+POINTER begin_lsn DB_LSN * lu
+LOCKS locks DBT s
+END
+
+/*
+ * Log the fact that we are recycling txnids.
+ */
+BEGIN recycle 42 14
+ARG min u_int32_t lu
+ARG max u_int32_t lu
+END
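+
+/*
+ * Illustrative note, an assumption about the build rather than part of
+ * the record definitions: gen_rec.awk expands each BEGIN block above
+ * into log, read and print routines (see txn_auto.c and txn_autop.c).
+ * The recycle record, for instance, yields the __txn_recycle_log() call
+ * made by __txn_recycle_id() in txn.c:
+ *
+ *	ret = __txn_recycle_log(env, NULL, &lsn, 0, min, max);
+ */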
diff --git a/src/txn/txn_auto.c b/src/txn/txn_auto.c
new file mode 100644
index 00000000..926d3653
--- /dev/null
+++ b/src/txn/txn_auto.c
@@ -0,0 +1,93 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_dispatch.h"
+#include "dbinc/db_am.h"
+#include "dbinc/lock.h"
+#include "dbinc/txn.h"
+
+DB_LOG_RECSPEC __txn_regop_42_desc[] = {
+ {LOGREC_ARG, SSZ(__txn_regop_42_args, opcode), "opcode", "%lu"},
+ {LOGREC_TIME, SSZ(__txn_regop_42_args, timestamp), "timestamp", ""},
+ {LOGREC_LOCKS, SSZ(__txn_regop_42_args, locks), "locks", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __txn_regop_desc[] = {
+ {LOGREC_ARG, SSZ(__txn_regop_args, opcode), "opcode", "%lu"},
+ {LOGREC_TIME, SSZ(__txn_regop_args, timestamp), "timestamp", ""},
+ {LOGREC_ARG, SSZ(__txn_regop_args, envid), "envid", "%lu"},
+ {LOGREC_LOCKS, SSZ(__txn_regop_args, locks), "locks", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __txn_ckp_42_desc[] = {
+ {LOGREC_POINTER, SSZ(__txn_ckp_42_args, ckp_lsn), "ckp_lsn", ""},
+ {LOGREC_POINTER, SSZ(__txn_ckp_42_args, last_ckp), "last_ckp", ""},
+ {LOGREC_TIME, SSZ(__txn_ckp_42_args, timestamp), "timestamp", ""},
+ {LOGREC_ARG, SSZ(__txn_ckp_42_args, rep_gen), "rep_gen", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __txn_ckp_desc[] = {
+ {LOGREC_POINTER, SSZ(__txn_ckp_args, ckp_lsn), "ckp_lsn", ""},
+ {LOGREC_POINTER, SSZ(__txn_ckp_args, last_ckp), "last_ckp", ""},
+ {LOGREC_TIME, SSZ(__txn_ckp_args, timestamp), "timestamp", ""},
+ {LOGREC_ARG, SSZ(__txn_ckp_args, envid), "envid", "%lu"},
+ {LOGREC_ARG, SSZ(__txn_ckp_args, spare), "spare", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __txn_child_desc[] = {
+ {LOGREC_ARG, SSZ(__txn_child_args, child), "child", "%lx"},
+ {LOGREC_POINTER, SSZ(__txn_child_args, c_lsn), "c_lsn", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __txn_xa_regop_42_desc[] = {
+ {LOGREC_ARG, SSZ(__txn_xa_regop_42_args, opcode), "opcode", "%lu"},
+ {LOGREC_DBT, SSZ(__txn_xa_regop_42_args, xid), "xid", ""},
+ {LOGREC_ARG, SSZ(__txn_xa_regop_42_args, formatID), "formatID", "%ld"},
+ {LOGREC_ARG, SSZ(__txn_xa_regop_42_args, gtrid), "gtrid", "%lu"},
+ {LOGREC_ARG, SSZ(__txn_xa_regop_42_args, bqual), "bqual", "%lu"},
+ {LOGREC_POINTER, SSZ(__txn_xa_regop_42_args, begin_lsn), "begin_lsn", ""},
+ {LOGREC_LOCKS, SSZ(__txn_xa_regop_42_args, locks), "locks", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __txn_prepare_desc[] = {
+ {LOGREC_ARG, SSZ(__txn_prepare_args, opcode), "opcode", "%lu"},
+ {LOGREC_DBT, SSZ(__txn_prepare_args, gid), "gid", ""},
+ {LOGREC_POINTER, SSZ(__txn_prepare_args, begin_lsn), "begin_lsn", ""},
+ {LOGREC_LOCKS, SSZ(__txn_prepare_args, locks), "locks", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __txn_recycle_desc[] = {
+ {LOGREC_ARG, SSZ(__txn_recycle_args, min), "min", "%lu"},
+ {LOGREC_ARG, SSZ(__txn_recycle_args, max), "max", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+/*
+ * PUBLIC: int __txn_init_recover __P((ENV *, DB_DISTAB *));
+ */
+int
+__txn_init_recover(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __txn_regop_recover, DB___txn_regop)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __txn_ckp_recover, DB___txn_ckp)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __txn_child_recover, DB___txn_child)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __txn_prepare_recover, DB___txn_prepare)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __txn_recycle_recover, DB___txn_recycle)) != 0)
+ return (ret);
+ return (0);
+}
diff --git a/src/txn/txn_autop.c b/src/txn/txn_autop.c
new file mode 100644
index 00000000..0924a401
--- /dev/null
+++ b/src/txn/txn_autop.c
@@ -0,0 +1,175 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_dispatch.h"
+#include "dbinc/db_am.h"
+#include "dbinc/lock.h"
+#include "dbinc/txn.h"
+
+/*
+ * PUBLIC: int __txn_regop_42_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__txn_regop_42_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__txn_regop_42", __txn_regop_42_desc, info));
+}
+
+/*
+ * PUBLIC: int __txn_regop_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__txn_regop_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__txn_regop", __txn_regop_desc, info));
+}
+
+/*
+ * PUBLIC: int __txn_ckp_42_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__txn_ckp_42_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__txn_ckp_42", __txn_ckp_42_desc, info));
+}
+
+/*
+ * PUBLIC: int __txn_ckp_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__txn_ckp_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__txn_ckp", __txn_ckp_desc, info));
+}
+
+/*
+ * PUBLIC: int __txn_child_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__txn_child_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__txn_child", __txn_child_desc, info));
+}
+
+/*
+ * PUBLIC: int __txn_xa_regop_42_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__txn_xa_regop_42_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__txn_xa_regop_42", __txn_xa_regop_42_desc, info));
+}
+
+/*
+ * PUBLIC: int __txn_prepare_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__txn_prepare_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__txn_prepare", __txn_prepare_desc, info));
+}
+
+/*
+ * PUBLIC: int __txn_recycle_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__txn_recycle_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__txn_recycle", __txn_recycle_desc, info));
+}
+
+/*
+ * PUBLIC: int __txn_init_print __P((ENV *, DB_DISTAB *));
+ */
+int
+__txn_init_print(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __txn_regop_print, DB___txn_regop)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __txn_ckp_print, DB___txn_ckp)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __txn_child_print, DB___txn_child)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __txn_prepare_print, DB___txn_prepare)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __txn_recycle_print, DB___txn_recycle)) != 0)
+ return (ret);
+ return (0);
+}
diff --git a/src/txn/txn_chkpt.c b/src/txn/txn_chkpt.c
new file mode 100644
index 00000000..73715b10
--- /dev/null
+++ b/src/txn/txn_chkpt.c
@@ -0,0 +1,419 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1995, 1996
+ * The President and Fellows of Harvard University. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Margo Seltzer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+/*
+ * __txn_checkpoint_pp --
+ * ENV->txn_checkpoint pre/post processing.
+ *
+ * PUBLIC: int __txn_checkpoint_pp
+ * PUBLIC: __P((DB_ENV *, u_int32_t, u_int32_t, u_int32_t));
+ */
+int
+__txn_checkpoint_pp(dbenv, kbytes, minutes, flags)
+ DB_ENV *dbenv;
+ u_int32_t kbytes, minutes, flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->tx_handle, "txn_checkpoint", DB_INIT_TXN);
+
+ /*
+ * On a replication client, all transactions are read-only; therefore,
+	 * a checkpoint is a no-op.
+	 *
+	 * We permit txn_checkpoint, instead of rendering it illegal, so
+	 * that an application can let a checkpoint thread continue to
+	 * operate as it is promoted or demoted between being a master
+	 * and a client.
+ */
+ if (IS_REP_CLIENT(env))
+ return (0);
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env,
+ (__txn_checkpoint(env, kbytes, minutes, flags)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
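+
+/*
+ * Illustrative sketch, not part of the library: a dedicated checkpoint
+ * thread can call the public DB_ENV->txn_checkpoint method (which
+ * resolves to this function) unconditionally, relying on the client
+ * no-op above to keep working across promotions and demotions.  The
+ * handle name "dbenv" and the one-minute sleep are assumptions for
+ * the example.
+ *
+ *	for (;;) {
+ *		if ((ret = dbenv->txn_checkpoint(dbenv, 0, 0, 0)) != 0)
+ *			dbenv->err(dbenv, ret, "txn_checkpoint");
+ *		sleep(60);
+ *	}
+ */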
+
+/*
+ * __txn_checkpoint --
+ * ENV->txn_checkpoint.
+ *
+ * PUBLIC: int __txn_checkpoint
+ * PUBLIC: __P((ENV *, u_int32_t, u_int32_t, u_int32_t));
+ */
+int
+__txn_checkpoint(env, kbytes, minutes, flags)
+ ENV *env;
+ u_int32_t kbytes, minutes, flags;
+{
+ DB_LOG *dblp;
+ DB_LSN ckp_lsn, last_ckp, msg_lsn;
+ DB_TXNMGR *mgr;
+ DB_TXNREGION *region;
+ LOG *lp;
+ REGENV *renv;
+ REGINFO *infop;
+ time_t last_ckp_time, now;
+ u_int32_t bytes, id, logflags, mbytes, op;
+ int ret;
+
+ ret = 0;
+
+ /*
+	 * A client will only call through here during recovery,
+	 * so just sync the Mpool and go home.  Since queue meta
+	 * pages are not rolled back, we want to be sure they are
+	 * clean in the cache prior to any transaction log
+	 * truncation due to syncup.
+ */
+ if (IS_REP_CLIENT(env)) {
+ if (MPOOL_ON(env) &&
+ (ret = __memp_sync(env, DB_SYNC_CHECKPOINT, NULL)) != 0) {
+ __db_err(env, ret, DB_STR("4518",
+ "txn_checkpoint: failed to flush the buffer cache"));
+ return (ret);
+ }
+ return (0);
+ }
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ mgr = env->tx_handle;
+ region = mgr->reginfo.primary;
+ infop = env->reginfo;
+ renv = infop->primary;
+ /*
+ * No mutex is needed as envid is read-only once it is set.
+ */
+ id = renv->envid;
+
+ MUTEX_LOCK(env, region->mtx_ckp);
+ /*
+ * The checkpoint LSN is an LSN such that all transactions begun before
+ * it are complete. Our first guess (corrected below based on the list
+ * of active transactions) is the last-written LSN.
+ */
+ if ((ret = __log_current_lsn_int(env, &ckp_lsn, &mbytes, &bytes)) != 0)
+ goto err;
+
+ /*
+ * Save for possible use in START_SYNC message.
+ */
+ msg_lsn = ckp_lsn;
+ if (!LF_ISSET(DB_FORCE)) {
+ /* Don't checkpoint a quiescent database. */
+ if (bytes == 0 && mbytes == 0)
+ goto err;
+
+ /*
+ * If either kbytes or minutes is non-zero, then only take the
+ * checkpoint if more than "minutes" minutes have passed or if
+ * more than "kbytes" of log data have been written since the
+ * last checkpoint.
+ */
+ if (kbytes != 0 &&
+ mbytes * 1024 + bytes / 1024 >= (u_int32_t)kbytes)
+ goto do_ckp;
+
+ if (minutes != 0) {
+ (void)time(&now);
+
+ TXN_SYSTEM_LOCK(env);
+ last_ckp_time = region->time_ckp;
+ TXN_SYSTEM_UNLOCK(env);
+
+ if (now - last_ckp_time >= (time_t)(minutes * 60))
+ goto do_ckp;
+ }
+
+ /*
+ * If we checked time and data and didn't go to checkpoint,
+ * we're done.
+ */
+ if (minutes != 0 || kbytes != 0)
+ goto err;
+ }
+
+ /*
+	 * We must single-thread checkpoints, otherwise the ckp_lsn may get
+	 * out of order.  We need to capture the start of the earliest
+	 * currently active transaction (ckp_lsn) and then flush all buffers.
+	 * While doing this we could be overtaken by another checkpoint that
+	 * sees a later ckp_lsn but completes first.  An archive process
+	 * could then remove a log this checkpoint depends on.
+ */
+do_ckp:
+ if ((ret = __txn_getactive(env, &ckp_lsn)) != 0)
+ goto err;
+
+ /*
+ * Checkpoints in replication groups can cause performance problems.
+ *
+ * As on the master, checkpoint on the replica requires the cache be
+ * flushed. The problem occurs when a client has dirty cache pages
+ * to write when the checkpoint record arrives, and the client's PERM
+ * response is necessary in order to meet the system's durability
+ * guarantees. In this case, the master will have to wait until the
+ * client completes its cache flush and writes the checkpoint record
+ * before subsequent transactions can be committed. The delay may
+ * cause transactions to timeout waiting on client response, which
+ * can cause nasty ripple effects in the system's overall throughput.
+ * [#15338]
+ *
+ * First, we send a start-sync record when the checkpoint starts so
+ * clients can start flushing their cache in preparation for the
+ * arrival of the checkpoint record.
+ */
+ if (LOGGING_ON(env) && IS_REP_MASTER(env)) {
+#ifdef HAVE_REPLICATION_THREADS
+ /*
+ * If repmgr is configured in the shared environment, but no
+ * send() function configured for this process, assume we have a
+ * replication-unaware process that wants to automatically
+ * participate in replication (i.e., sending replication
+ * messages to clients).
+ */
+ if (env->rep_handle->send == NULL &&
+ F_ISSET(env, ENV_THREAD) && APP_IS_REPMGR(env) &&
+ (ret = __repmgr_autostart(env)) != 0)
+ goto err;
+#endif
+ /*
+ * Send the LSN (saved in msg_lsn) where the sync starts
+ * on the master. Clients must have this LSN to assure that
+ * they have applied all txns up to this point.
+ */
+ if (env->rep_handle->send != NULL)
+ (void)__rep_send_message(env, DB_EID_BROADCAST,
+ REP_START_SYNC, &msg_lsn, NULL, 0, 0);
+ }
+
+ /* Flush the cache. */
+ if (MPOOL_ON(env) &&
+ (ret = __memp_sync_int(
+ env, NULL, 0, DB_SYNC_CHECKPOINT, NULL, NULL)) != 0) {
+ __db_err(env, ret, DB_STR("4519",
+ "txn_checkpoint: failed to flush the buffer cache"));
+ goto err;
+ }
+
+ /*
+ * The client won't have more dirty pages to flush from its cache than
+ * the master did, but there may be differences between the hardware,
+ * I/O configuration and workload on the master and the client that
+ * can result in the client being unable to finish its cache flush as
+	 * fast as the master.  A way to avoid the problem is to pause after
+	 * the master completes its cache flush and before the actual
+	 * checkpoint record is logged, giving the replicas additional time
+	 * to finish.
+ *
+ * !!!
+ * Currently turned off when testing, because it makes the test suite
+ * take a long time to run.
+ */
+#ifndef CONFIG_TEST
+ if (LOGGING_ON(env) &&
+ IS_REP_MASTER(env) && env->rep_handle->send != NULL &&
+ !LF_ISSET(DB_CKP_INTERNAL) &&
+ env->rep_handle->region->chkpt_delay != 0)
+ __os_yield(env, 0, env->rep_handle->region->chkpt_delay);
+#endif
+
+ /*
+ * Because we can't be a replication client here, and because
+ * recovery (somewhat unusually) calls txn_checkpoint and expects
+ * it to write a log message, LOGGING_ON is the correct macro here.
+ */
+ if (LOGGING_ON(env)) {
+ TXN_SYSTEM_LOCK(env);
+ last_ckp = region->last_ckp;
+ TXN_SYSTEM_UNLOCK(env);
+ /*
+ * Put out records for the open files before we log
+ * the checkpoint. The records are certain to be at
+ * or after ckp_lsn, but before the checkpoint record
+ * itself, so they're sure to be included if we start
+ * recovery from the ckp_lsn contained in this
+ * checkpoint.
+ */
+ logflags = DB_LOG_CHKPNT;
+ /*
+ * If this is a normal checkpoint, log files as checkpoints.
+ * If we are recovering, only log as DBREG_RCLOSE if
+ * there are no prepared txns. Otherwise, it should
+ * stay as DBREG_CHKPNT.
+ */
+ op = DBREG_CHKPNT;
+ if (!IS_RECOVERING(env))
+ logflags |= DB_FLUSH;
+ else if (region->stat.st_nrestores == 0)
+ op = DBREG_RCLOSE;
+ if ((ret = __dbreg_log_files(env, op)) != 0 ||
+ (ret = __txn_ckp_log(env, NULL, &ckp_lsn, logflags,
+ &ckp_lsn, &last_ckp, (int32_t)time(NULL), id, 0)) != 0) {
+ __db_err(env, ret, DB_STR_A("4520",
+ "txn_checkpoint: log failed at LSN [%ld %ld]",
+ "%ld %ld"),
+ (long)ckp_lsn.file, (long)ckp_lsn.offset);
+ goto err;
+ }
+
+ if ((ret = __txn_updateckp(env, &ckp_lsn)) != 0)
+ goto err;
+ }
+
+err: MUTEX_UNLOCK(env, region->mtx_ckp);
+ if (ret == 0 && lp->db_log_autoremove)
+ __log_autoremove(env);
+ return (ret);
+}
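+
+/*
+ * Worked instance of the kbytes test in __txn_checkpoint above
+ * (illustrative numbers only): with kbytes == 2048, a log position of
+ * mbytes == 2 and bytes == 512000 gives 2 * 1024 + 512000 / 1024 ==
+ * 2048 + 500 == 2548KB written since the last checkpoint, which is
+ * >= 2048, so the checkpoint proceeds.
+ */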
+
+/*
+ * __txn_getactive --
+ * Find the oldest active transaction and figure out its "begin" LSN.
+ * This is the lowest LSN we can checkpoint, since any record written
+ * after it may be involved in a transaction and may therefore need
+ * to be undone in the case of an abort.
+ *
+ * We check both the file and offset for 0 since the lsn may be in
+ * transition. If it is then we don't care about this txn because it
+ * must be starting after we set the initial value of lsnp in the caller.
+ * All txns must initialize their begin_lsn before writing to the log.
+ *
+ * PUBLIC: int __txn_getactive __P((ENV *, DB_LSN *));
+ */
+int
+__txn_getactive(env, lsnp)
+ ENV *env;
+ DB_LSN *lsnp;
+{
+ DB_TXNMGR *mgr;
+ DB_TXNREGION *region;
+ TXN_DETAIL *td;
+
+ mgr = env->tx_handle;
+ region = mgr->reginfo.primary;
+
+ TXN_SYSTEM_LOCK(env);
+ SH_TAILQ_FOREACH(td, &region->active_txn, links, __txn_detail)
+ if (td->begin_lsn.file != 0 &&
+ td->begin_lsn.offset != 0 &&
+ LOG_COMPARE(&td->begin_lsn, lsnp) < 0)
+ *lsnp = td->begin_lsn;
+ TXN_SYSTEM_UNLOCK(env);
+
+ return (0);
+}
+
+/*
+ * __txn_getckp --
+ * Get the LSN of the last transaction checkpoint.
+ *
+ * PUBLIC: int __txn_getckp __P((ENV *, DB_LSN *));
+ */
+int
+__txn_getckp(env, lsnp)
+ ENV *env;
+ DB_LSN *lsnp;
+{
+ DB_LSN lsn;
+ DB_TXNMGR *mgr;
+ DB_TXNREGION *region;
+
+ mgr = env->tx_handle;
+ region = mgr->reginfo.primary;
+
+ TXN_SYSTEM_LOCK(env);
+ lsn = region->last_ckp;
+ TXN_SYSTEM_UNLOCK(env);
+
+ if (IS_ZERO_LSN(lsn))
+ return (DB_NOTFOUND);
+
+ *lsnp = lsn;
+ return (0);
+}
+
+/*
+ * __txn_updateckp --
+ * Update the last_ckp field in the transaction region. This happens
+ * at the end of a normal checkpoint and also when a replication client
+ * receives a checkpoint record.
+ *
+ * PUBLIC: int __txn_updateckp __P((ENV *, DB_LSN *));
+ */
+int
+__txn_updateckp(env, lsnp)
+ ENV *env;
+ DB_LSN *lsnp;
+{
+ DB_TXNMGR *mgr;
+ DB_TXNREGION *region;
+
+ mgr = env->tx_handle;
+ region = mgr->reginfo.primary;
+
+ /*
+ * We want to make sure last_ckp only moves forward; since we drop
+ * locks above and in log_put, it's possible for two calls to
+ * __txn_ckp_log to finish in a different order from how they were
+ * called.
+ */
+ TXN_SYSTEM_LOCK(env);
+ if (LOG_COMPARE(&region->last_ckp, lsnp) < 0) {
+ region->last_ckp = *lsnp;
+ (void)time(&region->time_ckp);
+ }
+ TXN_SYSTEM_UNLOCK(env);
+
+ return (0);
+}
diff --git a/src/txn/txn_failchk.c b/src/txn/txn_failchk.c
new file mode 100644
index 00000000..b2007ad6
--- /dev/null
+++ b/src/txn/txn_failchk.c
@@ -0,0 +1,101 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/txn.h"
+
+/*
+ * __txn_failchk --
+ * Check for transactions started by dead threads of control.
+ *
+ * PUBLIC: int __txn_failchk __P((ENV *));
+ */
+int
+__txn_failchk(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+ DB_TXN *ktxn, *txn;
+ DB_TXNMGR *mgr;
+ DB_TXNREGION *region;
+ TXN_DETAIL *ktd, *td;
+ db_threadid_t tid;
+ int ret;
+ char buf[DB_THREADID_STRLEN];
+ pid_t pid;
+
+ mgr = env->tx_handle;
+ dbenv = env->dbenv;
+ region = mgr->reginfo.primary;
+
+retry: TXN_SYSTEM_LOCK(env);
+
+ SH_TAILQ_FOREACH(td, &region->active_txn, links, __txn_detail) {
+ /*
+ * If this is a child transaction, skip it.
+ * The parent will take care of it.
+ */
+ if (td->parent != INVALID_ROFF)
+ continue;
+ /*
+ * If the txn is prepared, then it does not matter
+ * what the state of the thread is.
+ */
+ if (td->status == TXN_PREPARED)
+ continue;
+
+ /* If the thread is still alive, it's not a problem. */
+ if (dbenv->is_alive(dbenv, td->pid, td->tid, 0))
+ continue;
+
+ if (F_ISSET(td, TXN_DTL_INMEMORY)) {
+ TXN_SYSTEM_UNLOCK(env);
+ return (__db_failed(env, DB_STR("4501",
+ "Transaction has in memory logs"),
+ td->pid, td->tid));
+ }
+
+ /* Abort the transaction. */
+ TXN_SYSTEM_UNLOCK(env);
+ if ((ret = __os_calloc(env, 1, sizeof(DB_TXN), &txn)) != 0)
+ return (ret);
+ if ((ret = __txn_continue(env, txn, td, NULL, 1)) != 0)
+ return (ret);
+ SH_TAILQ_FOREACH(ktd, &td->kids, klinks, __txn_detail) {
+ if (F_ISSET(ktd, TXN_DTL_INMEMORY))
+ return (__db_failed(env, DB_STR("4502",
+ "Transaction has in memory logs"),
+ td->pid, td->tid));
+ if ((ret =
+ __os_calloc(env, 1, sizeof(DB_TXN), &ktxn)) != 0)
+ return (ret);
+ if ((ret =
+ __txn_continue(env, ktxn, ktd, NULL, 1)) != 0)
+ return (ret);
+ ktxn->parent = txn;
+ ktxn->mgrp = txn->mgrp;
+ TAILQ_INSERT_HEAD(&txn->kids, ktxn, klinks);
+ }
+ pid = td->pid;
+ tid = td->tid;
+ (void)dbenv->thread_id_string(dbenv, pid, tid, buf);
+ __db_msg(env, DB_STR_A("4503",
+ "Aborting txn %#lx: %s", "%#lx %s"),
+ (u_long)txn->txnid, buf);
+ if ((ret = __txn_abort(txn)) != 0)
+ return (__db_failed(env, DB_STR("4504",
+ "Transaction abort failed"), pid, tid));
+ goto retry;
+ }
+
+ TXN_SYSTEM_UNLOCK(env);
+
+ return (0);
+}
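+
+/*
+ * Illustrative sketch, not part of the library: an application enables
+ * this path by registering an is_alive callback and calling the public
+ * DB_ENV->failchk method from a surviving thread.  The callback body,
+ * the thread count, and the handle name "dbenv" are assumptions for
+ * the example.
+ *
+ *	int
+ *	my_is_alive(DB_ENV *dbenv, pid_t pid, db_threadid_t tid,
+ *	    u_int32_t flags)
+ *	{
+ *		return (kill(pid, 0) == 0);	(process liveness only)
+ *	}
+ *
+ *	(void)dbenv->set_thread_count(dbenv, 128);
+ *	(void)dbenv->set_isalive(dbenv, my_is_alive);
+ *	(open the environment with DB_THREAD, then periodically:)
+ *	if ((ret = dbenv->failchk(dbenv, 0)) != 0)
+ *		(a dead thread left unrecoverable state; run recovery)
+ */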
diff --git a/src/txn/txn_method.c b/src/txn/txn_method.c
new file mode 100644
index 00000000..629eac04
--- /dev/null
+++ b/src/txn/txn_method.c
@@ -0,0 +1,124 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/txn.h"
+
+/*
+ * __txn_env_create --
+ * Transaction specific initialization of the DB_ENV structure.
+ *
+ * PUBLIC: int __txn_env_create __P((DB_ENV *));
+ */
+int
+__txn_env_create(dbenv)
+ DB_ENV *dbenv;
+{
+ /*
+ * !!!
+ * Our caller has not yet had the opportunity to reset the panic
+ * state or turn off mutex locking, and so we can neither check
+ * the panic state or acquire a mutex in the DB_ENV create path.
+ */
+ dbenv->tx_max = 0;
+
+ return (0);
+}
+
+/*
+ * __txn_env_destroy --
+ * Transaction specific destruction of the DB_ENV structure.
+ *
+ * PUBLIC: void __txn_env_destroy __P((DB_ENV *));
+ */
+void
+__txn_env_destroy(dbenv)
+ DB_ENV *dbenv;
+{
+ COMPQUIET(dbenv, NULL);
+}
+
+/*
+ * PUBLIC: int __txn_get_tx_max __P((DB_ENV *, u_int32_t *));
+ */
+int
+__txn_get_tx_max(dbenv, tx_maxp)
+ DB_ENV *dbenv;
+ u_int32_t *tx_maxp;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_NOT_CONFIGURED(env,
+ env->tx_handle, "DB_ENV->get_tx_max", DB_INIT_TXN);
+
+ if (TXN_ON(env)) {
+ /* Cannot be set after open, no lock required to read. */
+ *tx_maxp = ((DB_TXNREGION *)
+ env->tx_handle->reginfo.primary)->maxtxns;
+ } else
+ *tx_maxp = dbenv->tx_max;
+ return (0);
+}
+
+/*
+ * __txn_set_tx_max --
+ * DB_ENV->set_tx_max.
+ *
+ * PUBLIC: int __txn_set_tx_max __P((DB_ENV *, u_int32_t));
+ */
+int
+__txn_set_tx_max(dbenv, tx_max)
+ DB_ENV *dbenv;
+ u_int32_t tx_max;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_tx_max");
+
+ dbenv->tx_max = tx_max;
+ return (0);
+}
+
+/*
+ * PUBLIC: int __txn_get_tx_timestamp __P((DB_ENV *, time_t *));
+ */
+int
+__txn_get_tx_timestamp(dbenv, timestamp)
+ DB_ENV *dbenv;
+ time_t *timestamp;
+{
+ *timestamp = dbenv->tx_timestamp;
+ return (0);
+}
+
+/*
+ * __txn_set_tx_timestamp --
+ * Set the transaction recovery timestamp.
+ *
+ * PUBLIC: int __txn_set_tx_timestamp __P((DB_ENV *, time_t *));
+ */
+int
+__txn_set_tx_timestamp(dbenv, timestamp)
+ DB_ENV *dbenv;
+ time_t *timestamp;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_tx_timestamp");
+
+ dbenv->tx_timestamp = *timestamp;
+ return (0);
+}
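+
+/*
+ * Illustrative sketch, not part of the library: the setters above must
+ * run on the DB_ENV handle before DB_ENV->open, as enforced by
+ * ENV_ILLEGAL_AFTER_OPEN; the getters may run at any time.  Handle and
+ * variable names are assumptions for the example.
+ *
+ *	u_int32_t max;
+ *
+ *	if ((ret = dbenv->set_tx_max(dbenv, 1000)) != 0)
+ *		goto err;
+ *	(... dbenv->open(...) ...)
+ *	if ((ret = dbenv->get_tx_max(dbenv, &max)) != 0)
+ *		goto err;
+ */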
diff --git a/src/txn/txn_rec.c b/src/txn/txn_rec.c
new file mode 100644
index 00000000..b39d56d1
--- /dev/null
+++ b/src/txn/txn_rec.c
@@ -0,0 +1,616 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1996
+ * The President and Fellows of Harvard University. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/lock.h"
+#include "dbinc/txn.h"
+#include "dbinc/db_am.h"
+
+/*
+ * PUBLIC: int __txn_regop_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ *
+ * These records are only ever written for commits. Normally, we redo any
+ * committed transaction; however, if we are doing recovery to a timestamp,
+ * we may treat transactions that committed after the timestamp as aborted.
+ */
+int
+__txn_regop_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __txn_regop_args *argp;
+ DB_TXNHEAD *headp;
+ int ret;
+ u_int32_t status;
+
+#ifdef DEBUG_RECOVER
+ (void)__txn_regop_print(env, dbtp, lsnp, op, info);
+#endif
+
+ if ((ret = __txn_regop_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ headp = info;
+ /*
+ * We are only ever called during FORWARD_ROLL or BACKWARD_ROLL.
+ * We check for the former explicitly and the last two clauses
+ * apply to the BACKWARD_ROLL case.
+ */
+
+ if (op == DB_TXN_FORWARD_ROLL) {
+ /*
+ * If this was a 2-phase-commit transaction, then it
+ * might already have been removed from the list, and
+ * that's OK. Ignore the return code from remove.
+ */
+ if ((ret = __db_txnlist_remove(env,
+ info, argp->txnp->txnid)) != DB_NOTFOUND && ret != 0)
+ goto err;
+ } else if ((env->dbenv->tx_timestamp != 0 &&
+ argp->timestamp > (int32_t)env->dbenv->tx_timestamp) ||
+ (!IS_ZERO_LSN(headp->trunc_lsn) &&
+ LOG_COMPARE(&headp->trunc_lsn, lsnp) < 0)) {
+ /*
+ * We failed either the timestamp check or the trunc_lsn check,
+ * so we treat this as an abort even if it was a commit record.
+ */
+ if ((ret = __db_txnlist_update(env, info,
+ argp->txnp->txnid, TXN_ABORT, NULL, &status, 1)) != 0)
+ goto err;
+ else if (status != TXN_IGNORE && status != TXN_OK)
+ goto err;
+ } else {
+ /* This is a normal commit; mark it appropriately. */
+ if ((ret = __db_txnlist_update(env,
+ info, argp->txnp->txnid, argp->opcode, lsnp,
+ &status, 0)) == DB_NOTFOUND) {
+ if ((ret = __db_txnlist_add(env,
+ info, argp->txnp->txnid,
+ argp->opcode == TXN_ABORT ?
+ TXN_IGNORE : argp->opcode, lsnp)) != 0)
+ goto err;
+ } else if (ret != 0 ||
+ (status != TXN_IGNORE && status != TXN_OK))
+ goto err;
+ }
+
+ if (ret == 0)
+ *lsnp = argp->prev_lsn;
+
+ if (0) {
+err: __db_errx(env, DB_STR_A("4514",
+ "txnid %lx commit record found, already on commit list",
+ "%lx"), (u_long)argp->txnp->txnid);
+ ret = EINVAL;
+ }
+ __os_free(env, argp);
+
+ return (ret);
+}
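+
+/*
+ * Illustrative sketch, not part of the library: the timestamp check
+ * above is driven by the public DB_ENV->set_tx_timestamp method, set
+ * before recovery runs.  The home path and flag choice are assumptions
+ * for the example.
+ *
+ *	time_t stamp;
+ *
+ *	stamp = ...;	(the desired recovery point)
+ *	if ((ret = dbenv->set_tx_timestamp(dbenv, &stamp)) != 0)
+ *		goto err;
+ *	if ((ret = dbenv->open(dbenv, "/db/home", DB_CREATE |
+ *	    DB_INIT_LOG | DB_INIT_MPOOL | DB_INIT_TXN | DB_RECOVER_FATAL,
+ *	    0)) != 0)
+ *		goto err;
+ */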
+
+/*
+ * PUBLIC: int __txn_prepare_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ *
+ * These records are only ever written for prepares.
+ */
+int
+__txn_prepare_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __txn_prepare_args *argp;
+ DBT *lock_dbt;
+ DB_TXNHEAD *headp;
+ DB_LOCKTAB *lt;
+ u_int32_t status;
+ int ret;
+
+#ifdef DEBUG_RECOVER
+ (void)__txn_prepare_print(env, dbtp, lsnp, op, info);
+#endif
+
+ if ((ret = __txn_prepare_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ if (argp->opcode != TXN_PREPARE && argp->opcode != TXN_ABORT) {
+ ret = EINVAL;
+ goto err;
+ }
+ headp = info;
+
+ /*
+	 * The return value here is either DB_NOTFOUND or the
+	 * transaction status from the list.  It is not a normal
+ * error return, so we must make sure that in each of the
+ * cases below, we overwrite the ret value so we return
+ * appropriately.
+ */
+ ret = __db_txnlist_find(env, info, argp->txnp->txnid, &status);
+
+ /*
+ * If we are rolling forward, then an aborted prepare
+ * indicates that this may be the last record we'll see for
+ * this transaction ID, so we should remove it from the list.
+ */
+
+ if (op == DB_TXN_FORWARD_ROLL) {
+ if ((ret = __db_txnlist_remove(env,
+ info, argp->txnp->txnid)) != 0)
+ goto txn_err;
+ } else if (op == DB_TXN_BACKWARD_ROLL && status == TXN_PREPARE) {
+ /*
+ * On the backward pass, we have four possibilities:
+ * 1. The transaction is already committed, no-op.
+ * 2. The transaction is already aborted, no-op.
+ * 3. The prepare failed and was aborted, mark as abort.
+ * 4. The transaction is neither committed nor aborted.
+ * Treat this like a commit and roll forward so that
+ * the transaction can be resurrected in the region.
+ * We handle cases 3 and 4 here; cases 1 and 2
+ * are the final clause below.
+ */
+ if (argp->opcode == TXN_ABORT) {
+ if ((ret = __db_txnlist_update(env,
+ info, argp->txnp->txnid,
+ TXN_ABORT, NULL, &status, 0)) != 0 &&
+ status != TXN_PREPARE)
+ goto txn_err;
+ ret = 0;
+ }
+ /*
+		 * This is a prepared, but not yet committed, transaction.  We
+ * need to add it to the transaction list, so that it gets
+ * rolled forward. We also have to add it to the region's
+ * internal state so it can be properly aborted or committed
+ * after recovery (see txn_recover).
+ */
+ else if ((ret = __db_txnlist_remove(env,
+ info, argp->txnp->txnid)) != 0) {
+txn_err: __db_errx(env,
+ DB_STR_A("4515",
+ "transaction not in list %lx", "%lx"),
+ (u_long)argp->txnp->txnid);
+ ret = DB_NOTFOUND;
+ } else if (IS_ZERO_LSN(headp->trunc_lsn) ||
+ LOG_COMPARE(&headp->trunc_lsn, lsnp) >= 0) {
+ if ((ret = __db_txnlist_add(env,
+ info, argp->txnp->txnid, TXN_COMMIT, lsnp)) == 0) {
+ /* Re-acquire the locks for this transaction. */
+ lock_dbt = &argp->locks;
+ if (LOCKING_ON(env)) {
+ lt = env->lk_handle;
+ if ((ret = __lock_getlocker(lt,
+ argp->txnp->txnid, 1,
+ &argp->txnp->locker)) != 0)
+ goto err;
+ if ((ret = __lock_get_list(env,
+ argp->txnp->locker, 0,
+ DB_LOCK_WRITE, lock_dbt)) != 0)
+ goto err;
+ }
+
+ ret = __txn_restore_txn(env, lsnp, argp);
+ }
+ }
+ } else
+ ret = 0;
+
+ if (ret == 0)
+ *lsnp = argp->prev_lsn;
+
+err: __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __txn_ckp_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__txn_ckp_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __txn_ckp_args *argp;
+ int ret;
+
+#ifdef DEBUG_RECOVER
+ __txn_ckp_print(env, dbtp, lsnp, op, info);
+#endif
+ if ((ret = __txn_ckp_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ if (op == DB_TXN_BACKWARD_ROLL)
+ __db_txnlist_ckp(env, info, lsnp);
+
+ *lsnp = argp->last_ckp;
+ __os_free(env, argp);
+ return (DB_TXN_CKP);
+}
+
+/*
+ * __txn_child_recover
+ * Recover a commit record for a child transaction.
+ *
+ * PUBLIC: int __txn_child_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__txn_child_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __txn_child_args *argp;
+ u_int32_t c_stat, p_stat, tmpstat;
+ int ret, t_ret;
+
+#ifdef DEBUG_RECOVER
+ (void)__txn_child_print(env, dbtp, lsnp, op, info);
+#endif
+ if ((ret = __txn_child_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ /*
+ * This is a record in a PARENT's log trail indicating that a
+	 * child committed.  If we are aborting, return the child's last
+ * record's LSN. If we are in recovery, then if the
+ * parent is committing, we set ourselves up to commit, else
+ * we do nothing.
+ */
+ if (op == DB_TXN_ABORT) {
+ *lsnp = argp->c_lsn;
+ ret = __db_txnlist_lsnadd(env, info, &argp->prev_lsn);
+ goto out;
+ } else if (op == DB_TXN_BACKWARD_ROLL) {
+ /* Child might exist -- look for it. */
+ ret = __db_txnlist_find(env, info, argp->child, &c_stat);
+ t_ret =
+ __db_txnlist_find(env, info, argp->txnp->txnid, &p_stat);
+ if (ret != 0 && ret != DB_NOTFOUND)
+ goto out;
+ if (t_ret != 0 && t_ret != DB_NOTFOUND) {
+ ret = t_ret;
+ goto out;
+ }
+ /*
+ * If the parent is in state COMMIT or IGNORE, then we apply
+ * that to the child, else we need to abort the child.
+ */
+
+ if (ret == DB_NOTFOUND ||
+ c_stat == TXN_OK || c_stat == TXN_COMMIT) {
+ if (t_ret == DB_NOTFOUND ||
+ (p_stat != TXN_COMMIT && p_stat != TXN_IGNORE))
+ c_stat = TXN_ABORT;
+ else
+ c_stat = p_stat;
+
+ if (ret == DB_NOTFOUND)
+ ret = __db_txnlist_add(env,
+ info, argp->child, c_stat, NULL);
+ else
+ ret = __db_txnlist_update(env, info,
+ argp->child, c_stat, NULL, &tmpstat, 0);
+ } else if (c_stat == TXN_EXPECTED) {
+ /*
+ * The open after this create succeeded. If the
+ * parent succeeded, we don't want to redo; if the
+ * parent aborted, we do want to undo.
+ */
+ switch (p_stat) {
+ case TXN_COMMIT:
+ case TXN_IGNORE:
+ c_stat = TXN_IGNORE;
+ break;
+ default:
+ c_stat = TXN_ABORT;
+ }
+ ret = __db_txnlist_update(env,
+ info, argp->child, c_stat, NULL, &tmpstat, 0);
+ } else if (c_stat == TXN_UNEXPECTED) {
+ /*
+ * The open after this create failed. If the parent
+ * is rolling forward, we need to roll forward. If
+ * the parent failed, then we do not want to abort
+ * (because the file may not be the one in which we
+ * are interested).
+ */
+ ret = __db_txnlist_update(env, info, argp->child,
+ p_stat == TXN_COMMIT ? TXN_COMMIT : TXN_IGNORE,
+ NULL, &tmpstat, 0);
+ }
+ } else if (op == DB_TXN_OPENFILES) {
+ /*
+ * If we have a partial subtransaction, then the whole
+ * transaction should be ignored.
+ */
+ if ((ret = __db_txnlist_find(env,
+ info, argp->child, &c_stat)) == DB_NOTFOUND)
+ ret = __db_txnlist_update(env, info,
+ argp->txnp->txnid, TXN_IGNORE,
+ NULL, &p_stat, 1);
+ } else if (DB_REDO(op)) {
+ /* Forward Roll */
+ if ((ret =
+ __db_txnlist_remove(env, info, argp->child)) != 0)
+ __db_errx(env, DB_STR_A("4516",
+ "Transaction not in list %x", "%x"), argp->child);
+ }
+
+ if (ret == 0)
+ *lsnp = argp->prev_lsn;
+
+out: __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * __txn_restore_txn --
+ *	Used only during XA recovery.  If we find any transactions that are
+ * prepared, but not yet committed, then we need to restore the transaction's
+ * state into the shared region, because the TM is going to issue an abort
+ * or commit and we need to respond correctly.
+ *
+ * lsnp is the LSN of the prepare record
+ * argp is the prepare record (in an appropriate structure)
+ *
+ * PUBLIC: int __txn_restore_txn __P((ENV *, DB_LSN *, __txn_prepare_args *));
+ */
+int
+__txn_restore_txn(env, lsnp, argp)
+ ENV *env;
+ DB_LSN *lsnp;
+ __txn_prepare_args *argp;
+{
+ DB_TXNMGR *mgr;
+ DB_TXNREGION *region;
+ TXN_DETAIL *td;
+ int ret;
+
+ if (argp->gid.size == 0)
+ return (0);
+
+ mgr = env->tx_handle;
+ region = mgr->reginfo.primary;
+ TXN_SYSTEM_LOCK(env);
+
+ /* Allocate a new transaction detail structure. */
+ if ((ret = __env_alloc(&mgr->reginfo, sizeof(TXN_DETAIL), &td)) != 0) {
+ TXN_SYSTEM_UNLOCK(env);
+ return (ret);
+ }
+
+ /* Place transaction on active transaction list. */
+ SH_TAILQ_INSERT_HEAD(&region->active_txn, td, links, __txn_detail);
+ region->curtxns++;
+
+ td->txnid = argp->txnp->txnid;
+ __os_id(env->dbenv, &td->pid, &td->tid);
+ td->last_lsn = *lsnp;
+ td->begin_lsn = argp->begin_lsn;
+ td->parent = INVALID_ROFF;
+ td->name = INVALID_ROFF;
+ SH_TAILQ_INIT(&td->kids);
+ MAX_LSN(td->read_lsn);
+ MAX_LSN(td->visible_lsn);
+ td->mvcc_ref = 0;
+ td->mvcc_mtx = MUTEX_INVALID;
+ td->status = TXN_PREPARED;
+ td->flags = TXN_DTL_RESTORED;
+ memcpy(td->gid, argp->gid.data, argp->gid.size);
+ td->nlog_dbs = 0;
+ td->nlog_slots = TXN_NSLOTS;
+ td->log_dbs = R_OFFSET(&mgr->reginfo, td->slots);
+
+ region->stat.st_nrestores++;
+#ifdef HAVE_STATISTICS
+ STAT_INC(env, txn, nactive, region->stat.st_nactive, td->txnid);
+ if (region->stat.st_nactive > region->stat.st_maxnactive)
+ STAT_SET(env, txn, maxnactive, region->stat.st_maxnactive,
+ region->stat.st_nactive, td->txnid);
+#endif
+ TXN_SYSTEM_UNLOCK(env);
+ return (0);
+}
+
+/*
+ * __txn_recycle_recover --
+ * Recovery function for recycle.
+ *
+ * PUBLIC: int __txn_recycle_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__txn_recycle_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __txn_recycle_args *argp;
+ int ret;
+
+#ifdef DEBUG_RECOVER
+	(void)__txn_recycle_print(env, dbtp, lsnp, op, info);
+#endif
+ if ((ret = __txn_recycle_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ COMPQUIET(lsnp, NULL);
+
+ if ((ret = __db_txnlist_gen(env, info,
+ DB_UNDO(op) ? -1 : 1, argp->min, argp->max)) != 0)
+ return (ret);
+
+ __os_free(env, argp);
+
+ return (0);
+}
+
+/*
+ * PUBLIC: int __txn_regop_42_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ *
+ * These records are only ever written for commits. Normally, we redo any
+ * committed transaction; however, if we are doing recovery to a timestamp,
+ * we may treat transactions that committed after the timestamp as aborted.
+ */
+int
+__txn_regop_42_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __txn_regop_42_args *argp;
+ DB_TXNHEAD *headp;
+ u_int32_t status;
+ int ret;
+
+#ifdef DEBUG_RECOVER
+ (void)__txn_regop_42_print(env, dbtp, lsnp, op, info);
+#endif
+
+ if ((ret = __txn_regop_42_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ headp = info;
+ /*
+ * We are only ever called during FORWARD_ROLL or BACKWARD_ROLL.
+ * We check for the former explicitly and the last two clauses
+ * apply to the BACKWARD_ROLL case.
+ */
+
+ if (op == DB_TXN_FORWARD_ROLL) {
+ /*
+ * If this was a 2-phase-commit transaction, then it
+ * might already have been removed from the list, and
+ * that's OK. Ignore the return code from remove.
+ */
+ if ((ret = __db_txnlist_remove(env,
+ info, argp->txnp->txnid)) != DB_NOTFOUND && ret != 0)
+ goto err;
+ } else if ((env->dbenv->tx_timestamp != 0 &&
+ argp->timestamp > (int32_t)env->dbenv->tx_timestamp) ||
+ (!IS_ZERO_LSN(headp->trunc_lsn) &&
+ LOG_COMPARE(&headp->trunc_lsn, lsnp) < 0)) {
+ /*
+ * We failed either the timestamp check or the trunc_lsn check,
+ * so we treat this as an abort even if it was a commit record.
+ */
+ if ((ret = __db_txnlist_update(env, info,
+ argp->txnp->txnid, TXN_ABORT, NULL, &status, 1)) != 0)
+ goto err;
+ else if (status != TXN_IGNORE && status != TXN_OK)
+ goto err;
+ } else {
+ /* This is a normal commit; mark it appropriately. */
+ if ((ret = __db_txnlist_update(env,
+ info, argp->txnp->txnid, argp->opcode, lsnp,
+ &status, 0)) == DB_NOTFOUND) {
+ if ((ret = __db_txnlist_add(env,
+ info, argp->txnp->txnid,
+ argp->opcode == TXN_ABORT ?
+ TXN_IGNORE : argp->opcode, lsnp)) != 0)
+ goto err;
+ } else if (ret != 0 ||
+ (status != TXN_IGNORE && status != TXN_OK))
+ goto err;
+ }
+
+ if (ret == 0)
+ *lsnp = argp->prev_lsn;
+
+ if (0) {
+err: __db_errx(env, DB_STR_A("4517",
+ "txnid %lx commit record found, already on commit list",
+ "%lx"), (u_long)argp->txnp->txnid);
+ ret = EINVAL;
+ }
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __txn_ckp_42_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__txn_ckp_42_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __txn_ckp_42_args *argp;
+ int ret;
+
+#ifdef DEBUG_RECOVER
+ __txn_ckp_42_print(env, dbtp, lsnp, op, info);
+#endif
+ if ((ret = __txn_ckp_42_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ if (op == DB_TXN_BACKWARD_ROLL)
+ __db_txnlist_ckp(env, info, lsnp);
+
+ *lsnp = argp->last_ckp;
+ __os_free(env, argp);
+ return (DB_TXN_CKP);
+}
diff --git a/src/txn/txn_recover.c b/src/txn/txn_recover.c
new file mode 100644
index 00000000..67f24439
--- /dev/null
+++ b/src/txn/txn_recover.c
@@ -0,0 +1,317 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/txn.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_dispatch.h"
+#include "dbinc_auto/db_auto.h"
+#include "dbinc_auto/crdel_auto.h"
+#include "dbinc_auto/db_ext.h"
+
+/*
+ * __txn_recover_pp --
+ * ENV->txn_recover pre/post processing.
+ *
+ * PUBLIC: int __txn_recover_pp __P((DB_ENV *,
+ * PUBLIC: DB_PREPLIST *, long, long *, u_int32_t));
+ */
+int
+__txn_recover_pp(dbenv, preplist, count, retp, flags)
+ DB_ENV *dbenv;
+ DB_PREPLIST *preplist;
+ long count, *retp;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(
+ env, env->tx_handle, "txn_recover", DB_INIT_TXN);
+
+ if (F_ISSET((DB_TXNREGION *)env->tx_handle->reginfo.primary,
+ TXN_IN_RECOVERY)) {
+ __db_errx(env, DB_STR("4505",
+ "operation not permitted while in recovery"));
+ return (EINVAL);
+ }
+
+ if (flags != DB_FIRST && flags != DB_NEXT)
+ return (__db_ferr(env, "DB_ENV->txn_recover", 0));
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env,
+ (__txn_recover(env, preplist, count, retp, flags)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __txn_recover --
+ * ENV->txn_recover.
+ *
+ * PUBLIC: int __txn_recover __P((ENV *,
+ * PUBLIC: DB_PREPLIST *, long, long *, u_int32_t));
+ */
+int
+__txn_recover(env, txns, count, retp, flags)
+ ENV *env;
+ DB_PREPLIST *txns;
+ long count, *retp;
+ u_int32_t flags;
+{
+ /*
+ * Public API to retrieve the list of prepared, but not yet committed
+ * transactions. See __txn_get_prepared for details. This function
+ * and __db_xa_recover both wrap that one.
+ */
+ return (__txn_get_prepared(env,
+ NULL, txns, count, retp, flags));
+}
+
+/*
+ * __txn_get_prepared --
+ * Returns a list of prepared (and for XA, heuristically completed)
+ * transactions (less than or equal to the count parameter). One of
+ * xids or txns must be set to point to an array of the appropriate type.
+ * The count parameter indicates the number of entries in the xids and/or
+ * txns array. The retp parameter will be set to indicate the number of
+ * entries returned in the xids/txns array. Flags indicates the operation,
+ * one of DB_FIRST or DB_NEXT.
+ *
+ * PUBLIC: int __txn_get_prepared __P((ENV *,
+ * PUBLIC: XID *, DB_PREPLIST *, long, long *, u_int32_t));
+ */
+int
+__txn_get_prepared(env, xids, txns, count, retp, flags)
+ ENV *env;
+ XID *xids;
+ DB_PREPLIST *txns;
+ long count; /* This is long for XA compatibility. */
+ long *retp;
+ u_int32_t flags;
+{
+ DB_LSN min;
+ DB_PREPLIST *prepp;
+ DB_THREAD_INFO *ip;
+ DB_TXNMGR *mgr;
+ DB_TXNREGION *region;
+ TXN_DETAIL *td;
+ XID *xidp;
+ long i;
+ int restored, ret;
+
+ *retp = 0;
+ MAX_LSN(min);
+ prepp = txns;
+ xidp = xids;
+ restored = ret = 0;
+
+ /*
+ * If we are starting a scan, then we traverse the active transaction
+ * list once making sure that all transactions are marked as not having
+ * been collected. Then on each pass, we mark the ones we collected
+ * so that if we cannot collect them all at once, we can finish up
+ * next time with a continue.
+ */
+
+ mgr = env->tx_handle;
+ region = mgr->reginfo.primary;
+
+ /*
+ * During this pass we need to figure out if we are going to need
+ * to open files. We need to open files if we've never collected
+ * before (in which case, none of the COLLECTED bits will be set)
+ * and the ones that we are collecting are restored (if they aren't
+ * restored, then we never crashed; just the main server did).
+ */
+ TXN_SYSTEM_LOCK(env);
+ ENV_GET_THREAD_INFO(env, ip);
+
+ /* Now begin collecting active transactions. */
+ for (td = SH_TAILQ_FIRST(&region->active_txn, __txn_detail);
+ td != NULL && *retp < count;
+ td = SH_TAILQ_NEXT(td, links, __txn_detail)) {
+ if (td->status != TXN_PREPARED ||
+ (flags != DB_FIRST && F_ISSET(td, TXN_DTL_COLLECTED)))
+ continue;
+
+ if (F_ISSET(td, TXN_DTL_RESTORED))
+ restored = 1;
+
+ if (xids != NULL) {
+ xidp->formatID = td->format;
+ /*
+			 * The XID structure uses longs; we use u_int32_t's
+			 * as we log them to disk.  Cast them to make the
+			 * conversion explicit.
+ */
+ xidp->gtrid_length = (long)td->gtrid;
+ xidp->bqual_length = (long)td->bqual;
+ memcpy(xidp->data, td->gid, sizeof(td->gid));
+ xidp++;
+ }
+
+ if (txns != NULL) {
+ if ((ret = __os_calloc(env,
+ 1, sizeof(DB_TXN), &prepp->txn)) != 0) {
+ TXN_SYSTEM_UNLOCK(env);
+ goto err;
+ }
+ prepp->txn->td = td;
+ memcpy(prepp->gid, td->gid, sizeof(td->gid));
+ prepp++;
+ }
+
+ if (!IS_ZERO_LSN(td->begin_lsn) &&
+ LOG_COMPARE(&td->begin_lsn, &min) < 0)
+ min = td->begin_lsn;
+
+ (*retp)++;
+ F_SET(td, TXN_DTL_COLLECTED);
+ }
+ if (flags == DB_FIRST)
+ for (; td != NULL; td = SH_TAILQ_NEXT(td, links, __txn_detail))
+ F_CLR(td, TXN_DTL_COLLECTED);
+ TXN_SYSTEM_UNLOCK(env);
+
+ /*
+ * Now link all the transactions into the transaction manager's list.
+ */
+ if (txns != NULL && *retp != 0) {
+ MUTEX_LOCK(env, mgr->mutex);
+ for (i = 0; i < *retp; i++) {
+ if ((ret = __txn_continue(env,
+ txns[i].txn, txns[i].txn->td, ip, 0)) != 0)
+ goto err;
+ F_SET(txns[i].txn, TXN_MALLOC);
+ if (F_ISSET(env->dbenv, DB_ENV_TXN_NOSYNC))
+ F_SET(txns[i].txn, TXN_NOSYNC);
+ else if (F_ISSET(env->dbenv, DB_ENV_TXN_WRITE_NOSYNC))
+ F_SET(txns[i].txn, TXN_WRITE_NOSYNC);
+ else
+ F_SET(txns[i].txn, TXN_SYNC);
+ TAILQ_INSERT_TAIL(&mgr->txn_chain, txns[i].txn, links);
+ }
+ MUTEX_UNLOCK(env, mgr->mutex);
+
+ /*
+ * If we are restoring, update our count of outstanding
+ * transactions.
+ */
+ if (REP_ON(env)) {
+ REP_SYSTEM_LOCK(env);
+ env->rep_handle->region->op_cnt += (u_long)*retp;
+ REP_SYSTEM_UNLOCK(env);
+ }
+
+ }
+
+ /* If recovery already opened the files for us, don't do it here. */
+ if (restored != 0 && flags == DB_FIRST &&
+ !F_ISSET(env->lg_handle, DBLOG_OPENFILES))
+ ret = __txn_openfiles(env, ip, &min, 0);
+
+ if (0) {
+err: TXN_SYSTEM_UNLOCK(env);
+ }
+ return (ret);
+}
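+
+/*
+ * Illustrative sketch, not part of the library: a transaction manager
+ * resolving prepared transactions after a crash might drain the list
+ * with the public DB_ENV->txn_recover method; the array size and the
+ * decision to abort everything are assumptions for the example.
+ *
+ *	DB_PREPLIST prep[32];
+ *	long count, i;
+ *
+ *	if ((ret = dbenv->txn_recover(dbenv,
+ *	    prep, 32, &count, DB_FIRST)) != 0)
+ *		goto err;
+ *	for (i = 0; i < count; i++)
+ *		(void)prep[i].txn->abort(prep[i].txn);
+ *
+ * A subsequent call with DB_NEXT continues the scan where the first
+ * call left off, using the TXN_DTL_COLLECTED marking described above.
+ */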
+
+/*
+ * __txn_openfiles --
+ * Call env_openfiles.
+ *
+ * PUBLIC: int __txn_openfiles __P((ENV *, DB_THREAD_INFO *, DB_LSN *, int));
+ */
+int
+__txn_openfiles(env, ip, min, force)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ DB_LSN *min;
+ int force;
+{
+ DBT data;
+ DB_LOGC *logc;
+ DB_LSN open_lsn;
+ DB_TXNHEAD *txninfo;
+ __txn_ckp_args *ckp_args;
+ int ret, t_ret;
+
+ /*
+ * Figure out the last checkpoint before the smallest
+ * start_lsn in the region.
+ */
+ logc = NULL;
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ goto err;
+
+ memset(&data, 0, sizeof(data));
+ if ((ret = __txn_getckp(env, &open_lsn)) == 0)
+ while (!IS_ZERO_LSN(open_lsn) && (ret =
+ __logc_get(logc, &open_lsn, &data, DB_SET)) == 0 &&
+ (force ||
+ (min != NULL && LOG_COMPARE(min, &open_lsn) < 0))) {
+ /* Format the log record. */
+ if ((ret = __txn_ckp_read(
+ env, data.data, &ckp_args)) != 0) {
+ __db_errx(env, DB_STR_A("4506",
+ "Invalid checkpoint record at [%lu][%lu]",
+ "%lu %lu"), (u_long)open_lsn.file,
+ (u_long)open_lsn.offset);
+ goto err;
+ }
+ /*
+ * If force is set, then we're forcing ourselves
+ * to go back far enough to open files.
+ * Use ckp_lsn and then break out of the loop.
+ */
+ open_lsn = force ? ckp_args->ckp_lsn :
+ ckp_args->last_ckp;
+ __os_free(env, ckp_args);
+ if (force) {
+ if ((ret = __logc_get(logc, &open_lsn,
+ &data, DB_SET)) != 0)
+ goto err;
+ break;
+ }
+ }
+
+ /*
+ * There are several ways by which we may have gotten here.
+ * - We got a DB_NOTFOUND -- we need to read the first
+ * log record.
+ * - We found a checkpoint before min. We're done.
+ *	- We found a checkpoint after min whose last_ckp is 0. We
+ * need to start at the beginning of the log.
+ * - We are forcing an openfiles and we have our ckp_lsn.
+ */
+ if ((ret == DB_NOTFOUND || IS_ZERO_LSN(open_lsn)) && (ret =
+ __logc_get(logc, &open_lsn, &data, DB_FIRST)) != 0) {
+ __db_errx(env, DB_STR("4507", "No log records"));
+ goto err;
+ }
+
+ if ((ret = __db_txnlist_init(env, ip, 0, 0, NULL, &txninfo)) != 0)
+ goto err;
+ ret = __env_openfiles(
+ env, logc, txninfo, &data, &open_lsn, NULL, (double)0, 0);
+ if (txninfo != NULL)
+ __db_txnlist_end(env, txninfo);
+
+err:
+ if (logc != NULL && (t_ret = __logc_close(logc)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
diff --git a/src/txn/txn_region.c b/src/txn/txn_region.c
new file mode 100644
index 00000000..6f43d45f
--- /dev/null
+++ b/src/txn/txn_region.c
@@ -0,0 +1,518 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/log.h"
+#include "dbinc/txn.h"
+
+static int __txn_init __P((ENV *, DB_TXNMGR *));
+
+/*
+ * __txn_open --
+ * Open a transaction region.
+ *
+ * PUBLIC: int __txn_open __P((ENV *));
+ */
+int
+__txn_open(env)
+ ENV *env;
+{
+ DB_TXNMGR *mgr;
+ DB_TXNREGION *region;
+ int ret;
+
+ /* Create/initialize the transaction manager structure. */
+ if ((ret = __os_calloc(env, 1, sizeof(DB_TXNMGR), &mgr)) != 0)
+ return (ret);
+ TAILQ_INIT(&mgr->txn_chain);
+ mgr->env = env;
+
+ /* Join/create the txn region. */
+ if ((ret = __env_region_share(env, &mgr->reginfo)) != 0)
+ goto err;
+
+ /* If we created the region, initialize it. */
+ if (F_ISSET(&mgr->reginfo, REGION_CREATE))
+ if ((ret = __txn_init(env, mgr)) != 0)
+ goto err;
+
+ /* Set the local addresses. */
+ region = mgr->reginfo.primary =
+ R_ADDR(&mgr->reginfo,
+ ((REGENV *)env->reginfo->primary)->tx_primary);
+
+ /* If threaded, acquire a mutex to protect the active TXN list. */
+ if ((ret = __mutex_alloc(
+ env, MTX_TXN_ACTIVE, DB_MUTEX_PROCESS_ONLY, &mgr->mutex)) != 0)
+ goto err;
+
+ mgr->reginfo.mtx_alloc = region->mtx_region;
+ env->tx_handle = mgr;
+ return (0);
+
+err: env->tx_handle = NULL;
+ if (mgr->reginfo.addr != NULL)
+ (void)__env_region_detach(env, &mgr->reginfo, 0);
+
+ (void)__mutex_free(env, &mgr->mutex);
+ __os_free(env, mgr);
+ return (ret);
+}
+
+/*
+ * __txn_init --
+ * Initialize a transaction region in shared memory.
+ */
+static int
+__txn_init(env, mgr)
+ ENV *env;
+ DB_TXNMGR *mgr;
+{
+ DB_ENV *dbenv;
+ DB_LSN last_ckp;
+ DB_TXNREGION *region;
+ int ret;
+
+ dbenv = env->dbenv;
+
+ /*
+ * Find the last checkpoint in the log.
+ */
+ ZERO_LSN(last_ckp);
+ if (LOGGING_ON(env)) {
+ /*
+ * The log system has already walked through the last
+ * file. Get the LSN of a checkpoint it may have found.
+ */
+ if ((ret = __log_get_cached_ckp_lsn(env, &last_ckp)) != 0)
+ return (ret);
+
+ /*
+ * If that didn't work, look backwards from the beginning of
+ * the last log file until we find the last checkpoint.
+ */
+ if (IS_ZERO_LSN(last_ckp) &&
+ (ret = __txn_findlastckp(env, &last_ckp, NULL)) != 0)
+ return (ret);
+ }
+
+ if ((ret = __env_alloc(&mgr->reginfo,
+ sizeof(DB_TXNREGION), &mgr->reginfo.primary)) != 0) {
+ __db_errx(env, DB_STR("4508",
+ "Unable to allocate memory for the transaction region"));
+ return (ret);
+ }
+ ((REGENV *)env->reginfo->primary)->tx_primary =
+ R_OFFSET(&mgr->reginfo, mgr->reginfo.primary);
+ region = mgr->reginfo.primary;
+ memset(region, 0, sizeof(*region));
+
+ /* We share the region so we need the same mutex. */
+ region->mtx_region = ((REGENV *)env->reginfo->primary)->mtx_regenv;
+ mgr->reginfo.mtx_alloc = region->mtx_region;
+
+ region->maxtxns = dbenv->tx_max;
+ region->inittxns = dbenv->tx_init;
+ region->last_txnid = TXN_MINIMUM;
+ region->cur_maxid = TXN_MAXIMUM;
+
+ if ((ret = __mutex_alloc(
+ env, MTX_TXN_CHKPT, 0, &region->mtx_ckp)) != 0)
+ return (ret);
+ region->last_ckp = last_ckp;
+ region->time_ckp = time(NULL);
+
+ memset(&region->stat, 0, sizeof(region->stat));
+#ifdef HAVE_STATISTICS
+ region->stat.st_maxtxns = region->maxtxns;
+ region->stat.st_inittxns = region->inittxns;
+#endif
+
+ SH_TAILQ_INIT(&region->active_txn);
+ SH_TAILQ_INIT(&region->mvcc_txn);
+ return (ret);
+}
+
+/*
+ * __txn_findlastckp --
+ * Find the last checkpoint in the log, walking backwards from the
+ * max_lsn given or the beginning of the last log file. (The
+ * log system looked through the last log file when it started up.)
+ *
+ * PUBLIC: int __txn_findlastckp __P((ENV *, DB_LSN *, DB_LSN *));
+ */
+int
+__txn_findlastckp(env, lsnp, max_lsn)
+ ENV *env;
+ DB_LSN *lsnp;
+ DB_LSN *max_lsn;
+{
+ DBT dbt;
+ DB_LOGC *logc;
+ DB_LSN lsn;
+ int ret, t_ret;
+ u_int32_t rectype;
+
+ ZERO_LSN(*lsnp);
+
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ return (ret);
+
+ /* Get the last LSN. */
+ memset(&dbt, 0, sizeof(dbt));
+ if (max_lsn != NULL) {
+ lsn = *max_lsn;
+ if ((ret = __logc_get(logc, &lsn, &dbt, DB_SET)) != 0)
+ goto err;
+ } else {
+ if ((ret = __logc_get(logc, &lsn, &dbt, DB_LAST)) != 0)
+ goto err;
+ /*
+ * Twiddle the last LSN so it points to the beginning of the
+ * last file; we know there's no checkpoint after that, since
+ * the log system already looked there.
+ */
+ lsn.offset = 0;
+ }
+
+ /* Read backwards, looking for checkpoints. */
+ while ((ret = __logc_get(logc, &lsn, &dbt, DB_PREV)) == 0) {
+ if (dbt.size < sizeof(u_int32_t))
+ continue;
+ LOGCOPY_32(env, &rectype, dbt.data);
+ if (rectype == DB___txn_ckp) {
+ *lsnp = lsn;
+ break;
+ }
+ }
+
+err: if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /*
+ * Not finding a checkpoint is not an error; there may not exist
+ * one in the log.
+ */
+ return ((ret == 0 || ret == DB_NOTFOUND) ? 0 : ret);
+}
+
+/*
+ * __txn_env_refresh --
+ * Clean up after the transaction system on a close or failed open.
+ *
+ * PUBLIC: int __txn_env_refresh __P((ENV *));
+ */
+int
+__txn_env_refresh(env)
+ ENV *env;
+{
+ DB_TXN *txn;
+ DB_TXNMGR *mgr;
+ REGINFO *reginfo;
+ u_int32_t txnid;
+ int aborted, ret, t_ret;
+
+ ret = 0;
+ mgr = env->tx_handle;
+ reginfo = &mgr->reginfo;
+
+ /*
+ * This function can only be called once per process (i.e., not
+ * once per thread), so no synchronization is required.
+ *
+ * The caller is probably doing something wrong if close is called with
+	 * active transactions.  Try to abort any active transactions that are
+	 * not prepared, but it's quite likely the aborts will fail because
+	 * recovery won't find open files.  If we can't abort any of the
+	 * unprepared transactions, panic; we have to run recovery to get back
+	 * to a known state.
+ */
+ aborted = 0;
+ if (TAILQ_FIRST(&mgr->txn_chain) != NULL) {
+ while ((txn = TAILQ_FIRST(&mgr->txn_chain)) != NULL) {
+ /* Prepared transactions are OK. */
+ txnid = txn->txnid;
+ if (((TXN_DETAIL *)txn->td)->status == TXN_PREPARED) {
+ if ((ret = __txn_discard_int(txn, 0)) != 0) {
+ __db_err(env, ret, DB_STR_A("4509",
+ "unable to discard txn %#lx",
+ "%#lx"), (u_long)txnid);
+ break;
+ }
+ continue;
+ }
+ aborted = 1;
+ if ((t_ret = __txn_abort(txn)) != 0) {
+ __db_err(env, t_ret, DB_STR_A("4510",
+ "unable to abort transaction %#lx", "%#lx"),
+ (u_long)txnid);
+ ret = __env_panic(env, t_ret);
+ break;
+ }
+ }
+ if (aborted) {
+ __db_errx(env, DB_STR("4511",
+ "Error: closing the transaction region with active transactions"));
+ if (ret == 0)
+ ret = EINVAL;
+ }
+ }
+
+ /* Discard the per-thread lock. */
+ if ((t_ret = __mutex_free(env, &mgr->mutex)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Detach from the region. */
+ if (F_ISSET(env, ENV_PRIVATE))
+ reginfo->mtx_alloc = MUTEX_INVALID;
+ if ((t_ret = __env_region_detach(env, reginfo, 0)) != 0 && ret == 0)
+ ret = t_ret;
+
+ __os_free(env, mgr);
+
+ env->tx_handle = NULL;
+ return (ret);
+}
+
+/*
+ * __txn_region_mutex_count --
+ * Return the number of mutexes the txn region will need.
+ *
+ * PUBLIC: u_int32_t __txn_region_mutex_count __P((ENV *));
+ */
+u_int32_t
+__txn_region_mutex_count(env)
+ ENV *env;
+{
+ COMPQUIET(env, NULL);
+ /*
+	 * We need one mutex for the DB_TXNMGR structure and two mutexes
+	 * for the DB_TXNREGION structure.
+ */
+ return (1 + 2);
+}
+
+/*
+ * __txn_region_mutex_max --
+ * Return the number of additional mutexes the txn region will need.
+ *
+ * PUBLIC: u_int32_t __txn_region_mutex_max __P((ENV *));
+ */
+u_int32_t
+__txn_region_mutex_max(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+ u_int32_t count;
+
+ dbenv = env->dbenv;
+
+ if ((count = dbenv->tx_max) == 0)
+ count = DEF_MAX_TXNS;
+ /* We may need a mutex for each MVCC txn. */
+ return (count > dbenv->tx_init ? count - dbenv->tx_init : 0);
+}
+
+/*
+ * __txn_region_size --
+ * Return the amount of space needed for the txn region.
+ * PUBLIC: size_t __txn_region_size __P((ENV *));
+ */
+size_t
+__txn_region_size(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+ size_t s;
+
+ dbenv = env->dbenv;
+
+ /*
+ * Make the region large enough to hold the primary transaction region
+ * structure, txn_init transaction detail structures, txn_init chunks of
+ * overhead required by the underlying shared region allocator for each
+ * chunk of memory, txn_max transaction names, at an average of 20
+ * bytes each, and 10KB for safety.
+ */
+ s = sizeof(DB_TXNREGION) + dbenv->tx_init *
+ (sizeof(TXN_DETAIL) + __env_alloc_overhead() + 20) + 10 * 1024;
+ return (s);
+}
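+
+/*
+ * Worked instance of the estimate above (illustrative numbers only):
+ * with tx_init == 100 and an allocator overhead of, say, 16 bytes per
+ * chunk, the reservation is sizeof(DB_TXNREGION) +
+ * 100 * (sizeof(TXN_DETAIL) + 16 + 20) + 10240 bytes.
+ */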
+
+/*
+ * __txn_region_max --
+ * Return the additional amount of space needed for the txn region.
+ * PUBLIC: size_t __txn_region_max __P((ENV *));
+ */
+size_t
+__txn_region_max(env)
+ ENV *env;
+{
+ DB_ENV *dbenv;
+ size_t s;
+ u_int32_t count;
+
+ dbenv = env->dbenv;
+
+ if ((count = dbenv->tx_max) == 0)
+ count = DEF_MAX_TXNS;
+ if (count <= dbenv->tx_init)
+ return (0);
+ s = (count - dbenv->tx_init) *
+ (sizeof(TXN_DETAIL) + __env_alloc_overhead() + 20);
+ return (s);
+}
+
+/*
+ * __txn_id_set --
+ * Set the current transaction ID and current maximum unused ID (for
+ * testing purposes only).
+ *
+ * PUBLIC: int __txn_id_set __P((ENV *, u_int32_t, u_int32_t));
+ */
+int
+__txn_id_set(env, cur_txnid, max_txnid)
+ ENV *env;
+ u_int32_t cur_txnid, max_txnid;
+{
+ DB_TXNMGR *mgr;
+ DB_TXNREGION *region;
+ int ret;
+
+ ENV_REQUIRES_CONFIG(env, env->tx_handle, "txn_id_set", DB_INIT_TXN);
+
+ mgr = env->tx_handle;
+ region = mgr->reginfo.primary;
+ region->last_txnid = cur_txnid;
+ region->cur_maxid = max_txnid;
+
+ ret = 0;
+ if (cur_txnid < TXN_MINIMUM) {
+ __db_errx(env, DB_STR_A("4512",
+ "Current ID value %lu below minimum", "%lu"),
+ (u_long)cur_txnid);
+ ret = EINVAL;
+ }
+ if (max_txnid < TXN_MINIMUM) {
+ __db_errx(env, DB_STR_A("4513",
+ "Maximum ID value %lu below minimum", "%lu"),
+ (u_long)max_txnid);
+ ret = EINVAL;
+ }
+ return (ret);
+}
+
+/*
+ * __txn_oldest_reader --
+ *	Find the oldest "read LSN" of any active transaction;
+ * MVCC changes older than this can safely be discarded from the cache.
+ *
+ * PUBLIC: int __txn_oldest_reader __P((ENV *, DB_LSN *));
+ */
+int
+__txn_oldest_reader(env, lsnp)
+ ENV *env;
+ DB_LSN *lsnp;
+{
+ DB_LSN old_lsn;
+ DB_TXNMGR *mgr;
+ DB_TXNREGION *region;
+ TXN_DETAIL *td;
+ int ret;
+
+ if ((mgr = env->tx_handle) == NULL)
+ return (0);
+ region = mgr->reginfo.primary;
+
+ if ((ret = __log_current_lsn_int(env, &old_lsn, NULL, NULL)) != 0)
+ return (ret);
+
+ TXN_SYSTEM_LOCK(env);
+ SH_TAILQ_FOREACH(td, &region->active_txn, links, __txn_detail)
+ if (LOG_COMPARE(&td->read_lsn, &old_lsn) < 0)
+ old_lsn = td->read_lsn;
+
+ *lsnp = old_lsn;
+ TXN_SYSTEM_UNLOCK(env);
+
+ return (0);
+}
+
+/*
+ * __txn_add_buffer --
+ * Add to the count of buffers created by the given transaction.
+ *
+ * PUBLIC: int __txn_add_buffer __P((ENV *, TXN_DETAIL *));
+ */
+int
+__txn_add_buffer(env, td)
+ ENV *env;
+ TXN_DETAIL *td;
+{
+ DB_ASSERT(env, td != NULL);
+
+ MUTEX_LOCK(env, td->mvcc_mtx);
+ DB_ASSERT(env, td->mvcc_ref < UINT32_MAX);
+ ++td->mvcc_ref;
+ MUTEX_UNLOCK(env, td->mvcc_mtx);
+
+ COMPQUIET(env, NULL);
+ return (0);
+}
+
+/*
+ * __txn_remove_buffer --
+ * Remove a buffer from a transaction -- free the transaction if necessary.
+ *
+ * PUBLIC: int __txn_remove_buffer __P((ENV *, TXN_DETAIL *, db_mutex_t));
+ */
+int
+__txn_remove_buffer(env, td, hash_mtx)
+ ENV *env;
+ TXN_DETAIL *td;
+ db_mutex_t hash_mtx;
+{
+ DB_TXNMGR *mgr;
+ DB_TXNREGION *region;
+ int need_free, ret;
+
+ DB_ASSERT(env, td != NULL);
+ ret = 0;
+ mgr = env->tx_handle;
+ region = mgr->reginfo.primary;
+
+ MUTEX_LOCK(env, td->mvcc_mtx);
+ DB_ASSERT(env, td->mvcc_ref > 0);
+
+ /*
+ * We free the transaction detail here only if this is the last
+ * reference and td is on the list of committed snapshot transactions
+ * with active pages.
+ */
+ need_free = (--td->mvcc_ref == 0) && F_ISSET(td, TXN_DTL_SNAPSHOT);
+ MUTEX_UNLOCK(env, td->mvcc_mtx);
+
+ if (need_free) {
+ MUTEX_UNLOCK(env, hash_mtx);
+
+ ret = __mutex_free(env, &td->mvcc_mtx);
+ td->mvcc_mtx = MUTEX_INVALID;
+
+ TXN_SYSTEM_LOCK(env);
+ SH_TAILQ_REMOVE(&region->mvcc_txn, td, links, __txn_detail);
+ STAT_DEC(env,
+ txn, nsnapshot, region->stat.st_nsnapshot, td->txnid);
+ __env_alloc_free(&mgr->reginfo, td);
+ TXN_SYSTEM_UNLOCK(env);
+
+ MUTEX_READLOCK(env, hash_mtx);
+ }
+
+ return (ret);
+}
diff --git a/src/txn/txn_stat.c b/src/txn/txn_stat.c
new file mode 100644
index 00000000..62fe622d
--- /dev/null
+++ b/src/txn/txn_stat.c
@@ -0,0 +1,461 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/txn.h"
+
+#ifdef HAVE_STATISTICS
+static int __txn_compare __P((const void *, const void *));
+static int __txn_print_all __P((ENV *, u_int32_t));
+static int __txn_print_stats __P((ENV *, u_int32_t));
+static int __txn_stat __P((ENV *, DB_TXN_STAT **, u_int32_t));
+static char *__txn_status __P((DB_TXN_ACTIVE *));
+static char *__txn_xa_status __P((DB_TXN_ACTIVE *));
+static void __txn_gid __P((ENV *, DB_MSGBUF *, DB_TXN_ACTIVE *));
+
+/*
+ * __txn_stat_pp --
+ * DB_ENV->txn_stat pre/post processing.
+ *
+ * PUBLIC: int __txn_stat_pp __P((DB_ENV *, DB_TXN_STAT **, u_int32_t));
+ */
+int
+__txn_stat_pp(dbenv, statp, flags)
+ DB_ENV *dbenv;
+ DB_TXN_STAT **statp;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->tx_handle, "DB_ENV->txn_stat", DB_INIT_TXN);
+
+ if ((ret = __db_fchk(env,
+ "DB_ENV->txn_stat", flags, DB_STAT_CLEAR)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__txn_stat(env, statp, flags)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
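+/*
+ * Illustrative sketch (application-level usage, not part of this file):
+ * the public method fills in a buffer allocated on the caller's behalf,
+ * which the application releases with free(). Here dbenv is assumed to
+ * be an open, DB_INIT_TXN-configured environment handle.
+ *
+ *     DB_TXN_STAT *sp;
+ *     u_int32_t i;
+ *
+ *     if (dbenv->txn_stat(dbenv, &sp, 0) == 0) {
+ *             for (i = 0; i < sp->st_nactive; i++)
+ *                     printf("txn %#lx\n",
+ *                         (u_long)sp->st_txnarray[i].txnid);
+ *             free(sp);
+ *     }
+ */
+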
+/*
+ * __txn_stat --
+ * ENV->txn_stat.
+ */
+static int
+__txn_stat(env, statp, flags)
+ ENV *env;
+ DB_TXN_STAT **statp;
+ u_int32_t flags;
+{
+ DB_TXNMGR *mgr;
+ DB_TXNREGION *region;
+ DB_TXN_STAT *stats;
+ TXN_DETAIL *td;
+ size_t nbytes;
+ u_int32_t maxtxn, ndx;
+ int ret;
+
+ *statp = NULL;
+ mgr = env->tx_handle;
+ region = mgr->reginfo.primary;
+
+ TXN_SYSTEM_LOCK(env);
+ maxtxn = region->curtxns;
+ nbytes = sizeof(DB_TXN_STAT) + sizeof(DB_TXN_ACTIVE) * maxtxn;
+ if ((ret = __os_umalloc(env, nbytes, &stats)) != 0) {
+ TXN_SYSTEM_UNLOCK(env);
+ return (ret);
+ }
+
+ memcpy(stats, &region->stat, sizeof(region->stat));
+ stats->st_last_txnid = region->last_txnid;
+ stats->st_last_ckp = region->last_ckp;
+ stats->st_time_ckp = region->time_ckp;
+ stats->st_txnarray = (DB_TXN_ACTIVE *)&stats[1];
+
+ for (ndx = 0,
+ td = SH_TAILQ_FIRST(&region->active_txn, __txn_detail);
+ td != NULL && ndx < maxtxn;
+ td = SH_TAILQ_NEXT(td, links, __txn_detail), ++ndx) {
+ stats->st_txnarray[ndx].txnid = td->txnid;
+ if (td->parent == INVALID_ROFF)
+ stats->st_txnarray[ndx].parentid = TXN_INVALID;
+ else
+ stats->st_txnarray[ndx].parentid =
+ ((TXN_DETAIL *)R_ADDR(&mgr->reginfo,
+ td->parent))->txnid;
+ stats->st_txnarray[ndx].pid = td->pid;
+ stats->st_txnarray[ndx].tid = td->tid;
+ stats->st_txnarray[ndx].lsn = td->begin_lsn;
+ stats->st_txnarray[ndx].read_lsn = td->read_lsn;
+ stats->st_txnarray[ndx].mvcc_ref = td->mvcc_ref;
+ stats->st_txnarray[ndx].status = td->status;
+ stats->st_txnarray[ndx].xa_status = td->xa_br_status;
+ stats->st_txnarray[ndx].priority = td->priority;
+
+ if (td->status == TXN_PREPARED)
+ memcpy(stats->st_txnarray[ndx].gid,
+ td->gid, sizeof(td->gid));
+ if (td->name != INVALID_ROFF) {
+ (void)strncpy(stats->st_txnarray[ndx].name,
+ R_ADDR(&mgr->reginfo, td->name),
+ sizeof(stats->st_txnarray[ndx].name) - 1);
+ stats->st_txnarray[ndx].name[
+ sizeof(stats->st_txnarray[ndx].name) - 1] = '\0';
+ } else
+ stats->st_txnarray[ndx].name[0] = '\0';
+ }
+
+ __mutex_set_wait_info(env, region->mtx_region,
+ &stats->st_region_wait, &stats->st_region_nowait);
+ stats->st_regsize = (roff_t)mgr->reginfo.rp->size;
+ if (LF_ISSET(DB_STAT_CLEAR)) {
+ if (!LF_ISSET(DB_STAT_SUBSYSTEM))
+ __mutex_clear(env, region->mtx_region);
+ memset(&region->stat, 0, sizeof(region->stat));
+ region->stat.st_maxtxns = region->maxtxns;
+ region->stat.st_inittxns = region->inittxns;
+ region->stat.st_maxnactive =
+ region->stat.st_nactive = stats->st_nactive;
+ region->stat.st_maxnsnapshot =
+ region->stat.st_nsnapshot = stats->st_nsnapshot;
+ }
+
+ TXN_SYSTEM_UNLOCK(env);
+
+ *statp = stats;
+ return (0);
+}
+
+/*
+ * __txn_stat_print_pp --
+ * DB_ENV->txn_stat_print pre/post processing.
+ *
+ * PUBLIC: int __txn_stat_print_pp __P((DB_ENV *, u_int32_t));
+ */
+int
+__txn_stat_print_pp(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->tx_handle, "DB_ENV->txn_stat_print", DB_INIT_TXN);
+
+ if ((ret = __db_fchk(env, "DB_ENV->txn_stat_print",
+ flags, DB_STAT_ALL | DB_STAT_ALLOC | DB_STAT_CLEAR)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__txn_stat_print(env, flags)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __txn_stat_print
+ * ENV->txn_stat_print method.
+ *
+ * PUBLIC: int __txn_stat_print __P((ENV *, u_int32_t));
+ */
+int
+__txn_stat_print(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ u_int32_t orig_flags;
+ int ret;
+
+ orig_flags = flags;
+ LF_CLR(DB_STAT_CLEAR | DB_STAT_SUBSYSTEM);
+ if (flags == 0 || LF_ISSET(DB_STAT_ALL)) {
+ ret = __txn_print_stats(env, orig_flags);
+ if (flags == 0 || ret != 0)
+ return (ret);
+ }
+
+ if (LF_ISSET(DB_STAT_ALL) &&
+ (ret = __txn_print_all(env, orig_flags)) != 0)
+ return (ret);
+
+ return (0);
+}
+
+/*
+ * __txn_print_stats --
+ * Display default transaction region statistics.
+ */
+static int
+__txn_print_stats(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ DB_ENV *dbenv;
+ DB_MSGBUF mb;
+ DB_TXN_ACTIVE *txn;
+ DB_TXN_STAT *sp;
+ u_int32_t i;
+ int ret;
+ char buf[DB_THREADID_STRLEN], time_buf[CTIME_BUFLEN];
+
+ dbenv = env->dbenv;
+
+ if ((ret = __txn_stat(env, &sp, flags)) != 0)
+ return (ret);
+
+ if (LF_ISSET(DB_STAT_ALL))
+ __db_msg(env, "Default transaction region information:");
+ __db_msg(env, "%lu/%lu\t%s",
+ (u_long)sp->st_last_ckp.file, (u_long)sp->st_last_ckp.offset,
+ sp->st_last_ckp.file == 0 ?
+ "No checkpoint LSN" : "File/offset for last checkpoint LSN");
+ if (sp->st_time_ckp == 0)
+ __db_msg(env, "0\tNo checkpoint timestamp");
+ else
+ __db_msg(env, "%.24s\tCheckpoint timestamp",
+ __os_ctime(&sp->st_time_ckp, time_buf));
+ __db_msg(env, "%#lx\tLast transaction ID allocated",
+ (u_long)sp->st_last_txnid);
+ __db_dl(env, "Maximum number of active transactions configured",
+ (u_long)sp->st_maxtxns);
+ __db_dl(env, "Initial number of transactions configured",
+ (u_long)sp->st_inittxns);
+ __db_dl(env, "Active transactions", (u_long)sp->st_nactive);
+ __db_dl(env,
+ "Maximum active transactions", (u_long)sp->st_maxnactive);
+ __db_dl(env,
+ "Number of transactions begun", (u_long)sp->st_nbegins);
+ __db_dl(env,
+ "Number of transactions aborted", (u_long)sp->st_naborts);
+ __db_dl(env,
+ "Number of transactions committed", (u_long)sp->st_ncommits);
+ __db_dl(env, "Snapshot transactions", (u_long)sp->st_nsnapshot);
+ __db_dl(env, "Maximum snapshot transactions",
+ (u_long)sp->st_maxnsnapshot);
+ __db_dl(env,
+ "Number of transactions restored", (u_long)sp->st_nrestores);
+
+ __db_dlbytes(env, "Region size",
+ (u_long)0, (u_long)0, (u_long)sp->st_regsize);
+ __db_dl_pct(env,
+ "The number of region locks that required waiting",
+ (u_long)sp->st_region_wait, DB_PCT(sp->st_region_wait,
+ sp->st_region_wait + sp->st_region_nowait), NULL);
+
+ qsort(sp->st_txnarray,
+ sp->st_nactive, sizeof(sp->st_txnarray[0]), __txn_compare);
+ __db_msg(env, "Active transactions:");
+ DB_MSGBUF_INIT(&mb);
+ for (i = 0; i < sp->st_nactive; ++i) {
+ txn = &sp->st_txnarray[i];
+ __db_msgadd(env, &mb, "\t%lx: %s; xa_status %s;"
+ " pid/thread %s; begin LSN: file/offset %lu/%lu",
+ (u_long)txn->txnid, __txn_status(txn), __txn_xa_status(txn),
+ dbenv->thread_id_string(dbenv, txn->pid, txn->tid, buf),
+ (u_long)txn->lsn.file, (u_long)txn->lsn.offset);
+ if (txn->parentid != 0)
+ __db_msgadd(env, &mb,
+ "; parent: %lx", (u_long)txn->parentid);
+ if (!IS_MAX_LSN(txn->read_lsn))
+ __db_msgadd(env, &mb, "; read LSN: %lu/%lu",
+ (u_long)txn->read_lsn.file,
+ (u_long)txn->read_lsn.offset);
+ if (txn->mvcc_ref != 0)
+ __db_msgadd(env, &mb,
+ "; mvcc refcount: %lu", (u_long)txn->mvcc_ref);
+ if (LOCKING_ON(env))
+ __db_msgadd(env, &mb,
+ "; priority: %lu", (u_long)txn->priority);
+ if (txn->name[0] != '\0')
+ __db_msgadd(env, &mb, "; \"%s\"", txn->name);
+ if (txn->status == TXN_PREPARED)
+ __txn_gid(env, &mb, txn);
+ DB_MSGBUF_FLUSH(env, &mb);
+ }
+
+ __os_ufree(env, sp);
+
+ return (0);
+}
+
+/*
+ * __txn_print_all --
+ * Display debugging transaction region statistics.
+ */
+static int
+__txn_print_all(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ static const FN fn[] = {
+ { TXN_IN_RECOVERY, "TXN_IN_RECOVERY" },
+ { 0, NULL }
+ };
+ DB_TXNMGR *mgr;
+ DB_TXNREGION *region;
+ char time_buf[CTIME_BUFLEN];
+
+ mgr = env->tx_handle;
+ region = mgr->reginfo.primary;
+
+ TXN_SYSTEM_LOCK(env);
+
+ __db_print_reginfo(env, &mgr->reginfo, "Transaction", flags);
+
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "DB_TXNMGR handle information:");
+ __mutex_print_debug_single(env, "DB_TXNMGR mutex", mgr->mutex, flags);
+ __db_dl(env,
+ "Number of transactions discarded", (u_long)mgr->n_discards);
+
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "DB_TXNREGION handle information:");
+ __mutex_print_debug_single(
+ env, "DB_TXNREGION region mutex", region->mtx_region, flags);
+ STAT_ULONG("Maximum number of active txns", region->maxtxns);
+ STAT_HEX("Last transaction ID allocated", region->last_txnid);
+ STAT_HEX("Current maximum unused ID", region->cur_maxid);
+
+ __mutex_print_debug_single(
+ env, "checkpoint mutex", region->mtx_ckp, flags);
+ STAT_LSN("Last checkpoint LSN", &region->last_ckp);
+ __db_msg(env,
+ "%.24s\tLast checkpoint timestamp",
+ region->time_ckp == 0 ? "0" :
+ __os_ctime(&region->time_ckp, time_buf));
+
+ __db_prflags(env, NULL, region->flags, fn, NULL, "\tFlags");
+
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ TXN_SYSTEM_UNLOCK(env);
+
+ return (0);
+}
+
+static char *
+__txn_status(txn)
+ DB_TXN_ACTIVE *txn;
+{
+ switch (txn->status) {
+ case TXN_ABORTED:
+ return ("aborted");
+ case TXN_COMMITTED:
+ return ("committed");
+ case TXN_NEED_ABORT:
+ return ("need abort");
+ case TXN_PREPARED:
+ return ("prepared");
+ case TXN_RUNNING:
+ return ("running");
+ default:
+ break;
+ }
+ return ("unknown state");
+}
+
+static char *
+__txn_xa_status(txn)
+ DB_TXN_ACTIVE *txn;
+{
+ switch (txn->xa_status) {
+ case TXN_XA_ACTIVE:
+ return ("xa active");
+ case TXN_XA_DEADLOCKED:
+ return ("xa deadlock");
+ case TXN_XA_IDLE:
+ return ("xa idle");
+ case TXN_XA_PREPARED:
+ return ("xa prepared");
+ case TXN_XA_ROLLEDBACK:
+ return ("xa rollback");
+ default:
+ break;
+ }
+ return ("no xa state");
+}
+
+static void
+__txn_gid(env, mbp, txn)
+ ENV *env;
+ DB_MSGBUF *mbp;
+ DB_TXN_ACTIVE *txn;
+{
+ u_int32_t v, *xp;
+ u_int i;
+ int cnt;
+
+ __db_msgadd(env, mbp, "\n\tGID:");
+ for (cnt = 0, xp = (u_int32_t *)txn->gid, i = 0;;) {
+ memcpy(&v, xp++, sizeof(u_int32_t));
+ __db_msgadd(env, mbp, "%#lx ", (u_long)v);
+ if ((i += sizeof(u_int32_t)) >= DB_GID_SIZE)
+ break;
+ if (++cnt == 4) {
+ DB_MSGBUF_FLUSH(env, mbp);
+ __db_msgadd(env, mbp, "\t\t");
+ cnt = 0;
+ }
+ }
+}
+
+static int
+__txn_compare(a1, b1)
+ const void *a1, *b1;
+{
+ const DB_TXN_ACTIVE *a, *b;
+
+ a = a1;
+ b = b1;
+
+ if (a->txnid > b->txnid)
+ return (1);
+ if (a->txnid < b->txnid)
+ return (-1);
+ return (0);
+}
+
+#else /* !HAVE_STATISTICS */
+
+int
+__txn_stat_pp(dbenv, statp, flags)
+ DB_ENV *dbenv;
+ DB_TXN_STAT **statp;
+ u_int32_t flags;
+{
+ COMPQUIET(statp, NULL);
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(dbenv->env));
+}
+
+int
+__txn_stat_print_pp(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(dbenv->env));
+}
+#endif
diff --git a/src/txn/txn_util.c b/src/txn/txn_util.c
new file mode 100644
index 00000000..0ecd7f6c
--- /dev/null
+++ b/src/txn/txn_util.c
@@ -0,0 +1,696 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+#include "dbinc/db_am.h"
+
+typedef struct __txn_event TXN_EVENT;
+struct __txn_event {
+ TXN_EVENT_T op;
+ TAILQ_ENTRY(__txn_event) links;
+ union {
+ struct {
+ /* Delayed close. */
+ DB *dbp;
+ } c;
+ struct {
+ /* Delayed remove. */
+ char *name;
+ u_int8_t *fileid;
+ int inmem;
+ } r;
+ struct {
+ /* Lock event. */
+ DB_LOCK lock;
+ DB_LOCKER *locker;
+ DB *dbp;
+ } t;
+ } u;
+};
+
+#define TXN_TOP_PARENT(txn) do { \
+ while (txn->parent != NULL) \
+ txn = txn->parent; \
+} while (0)
+
+static void __clear_fe_watermark __P((DB_TXN *, DB *));
+
+/*
+ * __txn_closeevent --
+ *
+ * Creates a close event that can be added to the [so-called] commit list, so
+ * that we can redo a failed DB handle close once we've aborted the transaction.
+ *
+ * PUBLIC: int __txn_closeevent __P((ENV *, DB_TXN *, DB *));
+ */
+int
+__txn_closeevent(env, txn, dbp)
+ ENV *env;
+ DB_TXN *txn;
+ DB *dbp;
+{
+ int ret;
+ TXN_EVENT *e;
+
+ e = NULL;
+ if ((ret = __os_calloc(env, 1, sizeof(TXN_EVENT), &e)) != 0)
+ return (ret);
+
+ e->u.c.dbp = dbp;
+ e->op = TXN_CLOSE;
+ TXN_TOP_PARENT(txn);
+ TAILQ_INSERT_TAIL(&txn->events, e, links);
+
+ return (0);
+}
+
+/*
+ * __txn_remevent --
+ *
+ * Creates a remove event that can be added to the commit list.
+ *
+ * PUBLIC: int __txn_remevent __P((ENV *,
+ * PUBLIC: DB_TXN *, const char *, u_int8_t *, int));
+ */
+int
+__txn_remevent(env, txn, name, fileid, inmem)
+ ENV *env;
+ DB_TXN *txn;
+ const char *name;
+ u_int8_t *fileid;
+ int inmem;
+{
+ int ret;
+ TXN_EVENT *e;
+
+ e = NULL;
+ if ((ret = __os_calloc(env, 1, sizeof(TXN_EVENT), &e)) != 0)
+ return (ret);
+
+ if ((ret = __os_strdup(env, name, &e->u.r.name)) != 0)
+ goto err;
+
+ if (fileid != NULL) {
+ if ((ret = __os_calloc(env,
+ 1, DB_FILE_ID_LEN, &e->u.r.fileid)) != 0) {
+ __os_free(env, e->u.r.name);
+ goto err;
+ }
+ memcpy(e->u.r.fileid, fileid, DB_FILE_ID_LEN);
+ }
+
+ e->u.r.inmem = inmem;
+ e->op = TXN_REMOVE;
+ TAILQ_INSERT_TAIL(&txn->events, e, links);
+
+ return (0);
+
+err: __os_free(env, e);
+
+ return (ret);
+}
+
+/*
+ * __txn_remrem --
+ * Remove a remove event because the remove has been superseded,
+ * for example by a create of the same name.
+ *
+ * PUBLIC: void __txn_remrem __P((ENV *, DB_TXN *, const char *));
+ */
+void
+__txn_remrem(env, txn, name)
+ ENV *env;
+ DB_TXN *txn;
+ const char *name;
+{
+ TXN_EVENT *e, *next_e;
+
+ for (e = TAILQ_FIRST(&txn->events); e != NULL; e = next_e) {
+ next_e = TAILQ_NEXT(e, links);
+ if (e->op != TXN_REMOVE || strcmp(name, e->u.r.name) != 0)
+ continue;
+ TAILQ_REMOVE(&txn->events, e, links);
+ __os_free(env, e->u.r.name);
+ if (e->u.r.fileid != NULL)
+ __os_free(env, e->u.r.fileid);
+ __os_free(env, e);
+ }
+
+ return;
+}
+
+/*
+ * __txn_lockevent --
+ *
+ * Add a lock event to the commit queue. The lock event indicates a locker
+ * trade.
+ *
+ * PUBLIC: int __txn_lockevent __P((ENV *,
+ * PUBLIC: DB_TXN *, DB *, DB_LOCK *, DB_LOCKER *));
+ */
+int
+__txn_lockevent(env, txn, dbp, lock, locker)
+ ENV *env;
+ DB_TXN *txn;
+ DB *dbp;
+ DB_LOCK *lock;
+ DB_LOCKER *locker;
+{
+ int ret;
+ TXN_EVENT *e;
+
+ if (!LOCKING_ON(env))
+ return (0);
+
+ e = NULL;
+ if ((ret = __os_calloc(env, 1, sizeof(TXN_EVENT), &e)) != 0)
+ return (ret);
+
+ e->u.t.locker = locker;
+ e->u.t.lock = *lock;
+ e->u.t.dbp = dbp;
+ if (F2_ISSET(dbp, DB2_AM_EXCL))
+ e->op = TXN_XTRADE;
+ else
+ e->op = TXN_TRADE;
+ /* This event goes on the current transaction, not its parent. */
+ TAILQ_INSERT_TAIL(&txn->events, e, links);
+ dbp->cur_txn = txn;
+
+ return (0);
+}
+
+/*
+ * __txn_remlock --
+ * Remove a lock event because the locker is going away. We can remove
+ * by lock (using offset) or by locker_id (or by both).
+ *
+ * PUBLIC: void __txn_remlock __P((ENV *, DB_TXN *, DB_LOCK *, DB_LOCKER *));
+ */
+void
+__txn_remlock(env, txn, lock, locker)
+ ENV *env;
+ DB_TXN *txn;
+ DB_LOCK *lock;
+ DB_LOCKER *locker;
+{
+ TXN_EVENT *e, *next_e;
+
+ for (e = TAILQ_FIRST(&txn->events); e != NULL; e = next_e) {
+ next_e = TAILQ_NEXT(e, links);
+ if ((e->op != TXN_TRADE && e->op != TXN_TRADED &&
+ e->op != TXN_XTRADE) ||
+ (e->u.t.lock.off != lock->off && e->u.t.locker != locker))
+ continue;
+ TAILQ_REMOVE(&txn->events, e, links);
+ __os_free(env, e);
+ }
+
+ return;
+}
+
+/*
+ * __txn_doevents --
+ * Process the list of events associated with a transaction. On commit,
+ * apply the events; on abort, just toss the entries.
+ *
+ * PUBLIC: int __txn_doevents __P((ENV *, DB_TXN *, int, int));
+ */
+
+/*
+ * Trade a locker associated with a thread for one that is associated
+ * only with the handle. Mark the locker so failcheck will know.
+ */
+#define DO_TRADE do { \
+ memset(&req, 0, sizeof(req)); \
+ req.lock = e->u.t.lock; \
+ req.op = DB_LOCK_TRADE; \
+ t_ret = __lock_vec(env, txn->parent ? \
+ txn->parent->locker : e->u.t.locker, 0, &req, 1, NULL); \
+ if (t_ret == 0) { \
+ if (txn->parent != NULL) { \
+ e->u.t.dbp->cur_txn = txn->parent; \
+ e->u.t.dbp->cur_locker = txn->parent->locker; \
+ } else { \
+ e->op = TXN_TRADED; \
+ e->u.t.dbp->cur_locker = e->u.t.locker; \
+ F_SET(e->u.t.dbp->cur_locker, \
+ DB_LOCKER_HANDLE_LOCKER); \
+ if (opcode != TXN_PREPARE) \
+ e->u.t.dbp->cur_txn = NULL; \
+ } \
+ } else if (t_ret == DB_NOTFOUND) \
+ t_ret = 0; \
+ if (t_ret != 0 && ret == 0) \
+ ret = t_ret; \
+} while (0)
+
+int
+__txn_doevents(env, txn, opcode, preprocess)
+ ENV *env;
+ DB_TXN *txn;
+ int opcode, preprocess;
+{
+ DB_LOCKREQ req;
+ TXN_EVENT *e, *enext;
+ int ret, t_ret;
+
+ ret = 0;
+
+ /*
+ * This phase only gets called if we have a phase where we
+ * release read locks. Since not all paths will call this
+ * phase, we have to check for it below as well. So, when
+ * we do the trade, we update the opcode of the entry so that
+ * we don't try the trade again.
+ */
+ if (preprocess) {
+ for (e = TAILQ_FIRST(&txn->events);
+ e != NULL; e = enext) {
+ enext = TAILQ_NEXT(e, links);
+ /*
+ * Move all exclusive handle locks and
+ * read handle locks to the handle locker.
+ */
+ if (!(opcode == TXN_COMMIT && e->op == TXN_XTRADE) &&
+ (e->op != TXN_TRADE ||
+ IS_WRITELOCK(e->u.t.lock.mode)))
+ continue;
+ DO_TRADE;
+ if (txn->parent != NULL) {
+ TAILQ_REMOVE(&txn->events, e, links);
+ TAILQ_INSERT_HEAD(
+ &txn->parent->events, e, links);
+ }
+ }
+ return (ret);
+ }
+
+ /*
+ * Prepare should only cause a preprocess, since the transaction
+ * isn't over.
+ */
+ DB_ASSERT(env, opcode != TXN_PREPARE);
+ while ((e = TAILQ_FIRST(&txn->events)) != NULL) {
+ TAILQ_REMOVE(&txn->events, e, links);
+ /*
+ * Most deferred events should only happen on
+ * commits, not aborts or prepares. The two exceptions are
+ * close and xtrade, which get done on commit and abort, but
+ * not prepare. If we're not applying the operations, we
+ * can simply free the resources.
+ */
+ if (opcode == TXN_ABORT && (e->op != TXN_CLOSE &&
+ e->op != TXN_XTRADE))
+ goto dofree;
+ switch (e->op) {
+ case TXN_CLOSE:
+ if ((t_ret = __db_close(e->u.c.dbp,
+ NULL, DB_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+ break;
+ case TXN_REMOVE:
+ if (txn->parent != NULL)
+ TAILQ_INSERT_TAIL(
+ &txn->parent->events, e, links);
+ else if (e->u.r.fileid != NULL) {
+ if ((t_ret = __memp_nameop(env,
+ e->u.r.fileid, NULL, e->u.r.name,
+ NULL, e->u.r.inmem)) != 0 && ret == 0)
+ ret = t_ret;
+ } else if ((t_ret =
+ __os_unlink(env, e->u.r.name, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ break;
+ case TXN_TRADE:
+ case TXN_XTRADE:
+ DO_TRADE;
+ if (txn->parent != NULL) {
+ TAILQ_INSERT_HEAD(
+ &txn->parent->events, e, links);
+ continue;
+ }
+ /* Fall through */
+ case TXN_TRADED:
+ /*
+ * Downgrade the lock if it is not an exclusive
+ * database handle lock. An exclusive database
+ * should not have any locks other than the
+ * handle lock.
+ */
+ if (ret == 0 && !F2_ISSET(e->u.t.dbp, DB2_AM_EXCL)) {
+ if ((t_ret = __lock_downgrade(env,
+ &e->u.t.lock, DB_LOCK_READ, 0)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ /* Update the handle lock mode. */
+ if (ret == 0 && e->u.t.lock.off ==
+ e->u.t.dbp->handle_lock.off &&
+ e->u.t.lock.ndx ==
+ e->u.t.dbp->handle_lock.ndx)
+ e->u.t.dbp->handle_lock.mode =
+ DB_LOCK_READ;
+ }
+ break;
+ default:
+ /* This had better never happen. */
+ DB_ASSERT(env, 0);
+ }
+dofree:
+ /* Free resources here. */
+ switch (e->op) {
+ case TXN_REMOVE:
+ if (txn->parent != NULL)
+ continue;
+ if (e->u.r.fileid != NULL)
+ __os_free(env, e->u.r.fileid);
+ __os_free(env, e->u.r.name);
+ break;
+ case TXN_TRADE:
+ case TXN_XTRADE:
+ if (opcode == TXN_ABORT)
+ e->u.t.dbp->cur_txn = NULL;
+ break;
+ case TXN_CLOSE:
+ case TXN_TRADED:
+ default:
+ break;
+ }
+ __os_free(env, e);
+ }
+
+ return (ret);
+}
+
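+/*
+ * Event life cycle, summarized: __txn_closeevent, __txn_remevent and
+ * __txn_lockevent queue TXN_CLOSE, TXN_REMOVE and TXN_{X}TRADE entries
+ * while the transaction runs. __txn_doevents may then run once with
+ * preprocess set (to trade handle locks early, e.g. at prepare) and once
+ * at resolution: on commit the events are applied, on abort only
+ * TXN_CLOSE and TXN_XTRADE are, and everything else is simply freed.
+ */
+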
+/*
+ * PUBLIC: int __txn_record_fname __P((ENV *, DB_TXN *, FNAME *));
+ */
+int
+__txn_record_fname(env, txn, fname)
+ ENV *env;
+ DB_TXN *txn;
+ FNAME *fname;
+{
+ DB_LOG *dblp;
+ DB_TXNMGR *mgr;
+ TXN_DETAIL *td;
+ roff_t fname_off;
+ roff_t *np, *ldbs;
+ u_int32_t i;
+ int ret;
+
+ if ((td = txn->td) == NULL)
+ return (0);
+ mgr = env->tx_handle;
+ dblp = env->lg_handle;
+ fname_off = R_OFFSET(&dblp->reginfo, fname);
+
+ /* See if we already have a ref to this DB handle. */
+ ldbs = R_ADDR(&mgr->reginfo, td->log_dbs);
+ for (i = 0, np = ldbs; i < td->nlog_dbs; i++, np++)
+ if (*np == fname_off)
+ return (0);
+
+ if (td->nlog_slots <= td->nlog_dbs) {
+ TXN_SYSTEM_LOCK(env);
+ if ((ret = __env_alloc(&mgr->reginfo,
+ sizeof(roff_t) * (td->nlog_slots << 1), &np)) != 0) {
+ TXN_SYSTEM_UNLOCK(env);
+ return (ret);
+ }
+
+ memcpy(np, ldbs, td->nlog_dbs * sizeof(roff_t));
+ if (td->nlog_slots > TXN_NSLOTS)
+ __env_alloc_free(&mgr->reginfo, ldbs);
+
+ TXN_SYSTEM_UNLOCK(env);
+ td->log_dbs = R_OFFSET(&mgr->reginfo, np);
+ ldbs = np;
+ td->nlog_slots = td->nlog_slots << 1;
+ }
+
+ ldbs[td->nlog_dbs] = fname_off;
+ td->nlog_dbs++;
+ fname->txn_ref++;
+
+ return (0);
+}
+
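+/*
+ * Worked example of the slot growth above (illustrative): the offset
+ * array starts with the transaction's inline TXN_NSLOTS entries and
+ * doubles on demand, growing TXN_NSLOTS -> 2*TXN_NSLOTS -> 4*TXN_NSLOTS
+ * as a transaction touches more distinct databases; each doubling copies
+ * the old offsets forward and frees the previous array unless it was the
+ * inline one.
+ */
+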
+/*
+ * __txn_dref_fname --
+ * Either pass the fname to our parent txn or decrement the refcount
+ * and close the fileid if it goes to zero.
+ *
+ * PUBLIC: int __txn_dref_fname __P((ENV *, DB_TXN *));
+ */
+int
+__txn_dref_fname(env, txn)
+ ENV *env;
+ DB_TXN *txn;
+{
+ DB_LOG *dblp;
+ DB_TXNMGR *mgr;
+ FNAME *fname;
+ roff_t *np;
+ TXN_DETAIL *ptd, *td;
+ u_int32_t i;
+ int ret;
+
+ td = txn->td;
+
+ if (td->nlog_dbs == 0)
+ return (0);
+
+ mgr = env->tx_handle;
+ dblp = env->lg_handle;
+ ret = 0;
+
+ ptd = txn->parent != NULL ? txn->parent->td : NULL;
+
+ np = R_ADDR(&mgr->reginfo, td->log_dbs);
+ /*
+ * The order in which FNAMEs are cleaned up matters. Cleaning up
+ * in the wrong order can result in database handles leaking. If
+ * we are passing the FNAMEs to the parent transaction make sure
+ * they are passed in order. If we are cleaning up the FNAMEs,
+ * make sure that is done in reverse order.
+ */
+ if (ptd != NULL) {
+ for (i = 0; i < td->nlog_dbs; i++, np++) {
+ fname = R_ADDR(&dblp->reginfo, *np);
+ MUTEX_LOCK(env, fname->mutex);
+ ret = __txn_record_fname(env, txn->parent, fname);
+ fname->txn_ref--;
+ MUTEX_UNLOCK(env, fname->mutex);
+ if (ret != 0)
+ break;
+ }
+ } else {
+ np += td->nlog_dbs - 1;
+ for (i = 0; i < td->nlog_dbs; i++, np--) {
+ fname = R_ADDR(&dblp->reginfo, *np);
+ MUTEX_LOCK(env, fname->mutex);
+ if (fname->txn_ref == 1) {
+ MUTEX_UNLOCK(env, fname->mutex);
+ DB_ASSERT(env, fname->txn_ref != 0);
+ ret = __dbreg_close_id_int(
+ env, fname, DBREG_CLOSE, 0);
+ } else {
+ fname->txn_ref--;
+ MUTEX_UNLOCK(env, fname->mutex);
+ }
+ if (ret != 0 && ret != EIO)
+ break;
+ }
+ }
+
+ return (ret);
+}
+
+/*
+ * Common removal routine. This is called only after verifying that
+ * the DB_MPOOLFILE is in the list.
+ */
+static void
+__clear_fe_watermark(txn, db)
+ DB_TXN *txn;
+ DB *db;
+{
+ MPOOLFILE *mpf;
+
+ mpf = db->mpf->mfp;
+ mpf->fe_watermark = PGNO_INVALID;
+ mpf->fe_txnid = 0U;
+ mpf->fe_nlws = 0U;
+ TAILQ_REMOVE(&txn->femfs, db, felink);
+}
+
+/*
+ * __txn_reset_fe_watermarks
+ * Reset the file extension state of MPOOLFILEs involved in this transaction.
+ *
+ * PUBLIC: void __txn_reset_fe_watermarks __P((DB_TXN *));
+ */
+void
+__txn_reset_fe_watermarks(txn)
+ DB_TXN *txn;
+{
+ DB *db;
+
+ if (txn->parent) {
+ DB_ASSERT(txn->mgrp->env, TAILQ_FIRST(&txn->femfs) == NULL);
+ }
+
+ while ((db = TAILQ_FIRST(&txn->femfs)))
+ __clear_fe_watermark(txn, db);
+}
+
+/*
+ * __txn_remove_fe_watermark
+ * Remove a watermark from the transaction's list
+ *
+ * PUBLIC: void __txn_remove_fe_watermark __P((DB_TXN *,DB *));
+ */
+void
+__txn_remove_fe_watermark(txn, db)
+ DB_TXN *txn;
+ DB *db;
+{
+ DB *db_tmp;
+
+ if (txn == NULL || !F_ISSET(txn, TXN_BULK))
+ return;
+
+ TAILQ_FOREACH(db_tmp, &txn->femfs, felink) {
+ if (db_tmp == db) {
+ __clear_fe_watermark(txn, db);
+ break;
+ }
+ }
+}
+
+/*
+ * __txn_add_fe_watermark
+ *
+ * Add an entry to the transaction's list of
+ * file_extension_watermarks, if warranted. Also, set the watermark
+ * page number in the MPOOLFILE. The metadata lock associated with
+ * the mfp must be held when this function is called.
+ *
+ * PUBLIC: void __txn_add_fe_watermark __P((DB_TXN *, DB *, db_pgno_t));
+ */
+void
+__txn_add_fe_watermark(txn, db, pgno)
+ DB_TXN *txn;
+ DB *db;
+ db_pgno_t pgno;
+{
+ MPOOLFILE *mfp;
+
+ if (txn == NULL || !F_ISSET(txn, TXN_BULK))
+ return;
+
+ mfp = db->mpf->mfp;
+ /* If the watermark is already set, there's nothing to do. */
+ if (mfp->fe_watermark != PGNO_INVALID) {
+#ifdef DIAGNOSTIC
+ DB_ASSERT(txn->mgrp->env, mfp->fe_txnid == txn->txnid);
+#endif
+ return;
+ }
+
+ /* We can update MPOOLFILE because the metadata lock is held. */
+ mfp->fe_watermark = pgno;
+ mfp->fe_txnid = txn->txnid;
+
+ TAILQ_INSERT_TAIL(&txn->femfs, db, felink);
+}
+
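+/*
+ * Illustrative sketch (assumed caller, not part of this file): when a
+ * TXN_BULK transaction extends a file while holding the metadata lock,
+ * the access method would record where the extension began:
+ *
+ *     __txn_add_fe_watermark(txn, dbp, first_new_pgno);
+ *
+ * where first_new_pgno is a hypothetical name for the first page number
+ * created by the extension.
+ */
+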
+/*
+ * __txn_flush_fe_files
+ * For every extended file in which a log record write was skipped,
+ * flush the data pages. This is called during commit.
+ *
+ * PUBLIC: int __txn_flush_fe_files __P((DB_TXN *));
+ */
+int
+__txn_flush_fe_files(txn)
+ DB_TXN *txn;
+{
+ DB *db;
+ ENV *env;
+ int ret;
+
+ env = txn->mgrp->env;
+
+ DB_ASSERT(env, txn->mgrp != NULL);
+ DB_ASSERT(env, env != NULL);
+
+#ifdef DIAGNOSTIC
+ DB_ASSERT(env, txn->parent == NULL);
+#endif
+
+ TAILQ_FOREACH(db, &txn->femfs, felink) {
+ if (db->mpf->mfp->fe_nlws > 0 &&
+ (ret = __memp_sync_int(env, db->mpf, 0,
+ DB_SYNC_FILE, NULL, NULL)))
+ return (ret);
+ }
+
+ return (0);
+}
+
+/*
+ * __txn_pg_above_fe_watermark --
+ *
+ * Test whether there is a file extension watermark for the given
+ * database, and, if so, whether the given page number is above the
+ * watermark. If this test returns true, then logging of the page's
+ * update can be suppressed when the file extension/bulk loading
+ * optimization is in force.
+ *
+ * PUBLIC: int __txn_pg_above_fe_watermark
+ * PUBLIC: __P((DB_TXN*, MPOOLFILE*, db_pgno_t));
+ */
+int
+__txn_pg_above_fe_watermark(txn, mpf, pgno)
+ DB_TXN *txn;
+ MPOOLFILE *mpf;
+ db_pgno_t pgno;
+{
+ ENV *env;
+ int skip;
+
+ if (txn == NULL || (!F_ISSET(txn, TXN_BULK)) ||
+ mpf->fe_watermark == PGNO_INVALID)
+ return (0);
+
+ env = txn->mgrp->env;
+
+ skip = 0;
+ TXN_SYSTEM_LOCK(env);
+ if (((DB_TXNREGION *)env->tx_handle->reginfo.primary)->n_hotbackup > 0)
+ skip = 1;
+ TXN_SYSTEM_UNLOCK(env);
+ if (skip)
+ return (0);
+
+ /*
+ * If the watermark is a valid page number, then the extending
+ * transaction should be the current outermost transaction.
+ */
+ DB_ASSERT(txn->mgrp->env, mpf->fe_txnid == txn->txnid);
+
+ return (mpf->fe_watermark <= pgno);
+}
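+
+/*
+ * Worked example (illustrative): with fe_watermark == 1000, an update to
+ * page 1500 by the extending TXN_BULK transaction returns nonzero here
+ * and its logging can be suppressed, while an update to page 500 returns
+ * 0 and is logged normally. A hot backup in progress (n_hotbackup > 0)
+ * disables the optimization so that the backup's log is complete.
+ */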
diff --git a/src/xa/xa.c b/src/xa/xa.c
new file mode 100644
index 00000000..ee75e792
--- /dev/null
+++ b/src/xa/xa.c
@@ -0,0 +1,1068 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/txn.h"
+#include "dbinc_auto/xa_ext.h"
+
+static void corrupted_env __P((ENV *, int));
+
+static int __xa_get_txn __P((ENV *,
+ XID *, TXN_DETAIL *, DB_TXN **, u_long, int));
+static void __xa_put_txn __P((ENV *, DB_TXN *));
+
+static int __xa_txn_get_prepared
+ __P((ENV *, XID *, DB_PREPLIST *, long, long *, u_int32_t));
+static int __xa_thread_enter __P((ENV *, DB_THREAD_INFO **));
+
+static int __db_xa_close __P((char *, int, long));
+static int __db_xa_commit __P((XID *, int, long));
+static int __db_xa_complete __P((int *, int *, int, long));
+static int __db_xa_end __P((XID *, int, long));
+static int __db_xa_forget __P((XID *, int, long));
+static int __db_xa_open __P((char *, int, long));
+static int __db_xa_prepare __P((XID *, int, long));
+static int __db_xa_recover __P((XID *, long, int, long));
+static int __db_xa_rollback __P((XID *, int, long));
+static int __db_xa_start __P((XID *, int, long));
+
+/*
+ * Possible flag values:
+ * Dynamic registration 0 => no dynamic registration
+ * TMREGISTER => dynamic registration
+ * Asynchronous operation 0 => no support for asynchrony
+ * TMUSEASYNC => async support
+ * Migration support 0 => migration of transactions across
+ * threads is possible
+ * TMNOMIGRATE => no migration across threads
+ */
+const struct xa_switch_t db_xa_switch = {
+ "Berkeley DB", /* name[RMNAMESZ] */
+ TMNOMIGRATE, /* flags */
+ 0, /* version */
+ __db_xa_open, /* xa_open_entry */
+ __db_xa_close, /* xa_close_entry */
+ __db_xa_start, /* xa_start_entry */
+ __db_xa_end, /* xa_end_entry */
+ __db_xa_rollback, /* xa_rollback_entry */
+ __db_xa_prepare, /* xa_prepare_entry */
+ __db_xa_commit, /* xa_commit_entry */
+ __db_xa_recover, /* xa_recover_entry */
+ __db_xa_forget, /* xa_forget_entry */
+ __db_xa_complete /* xa_complete_entry */
+};
+
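+/*
+ * Illustrative sketch (not part of the library): a transaction manager
+ * drives Berkeley DB through these entry points. A minimal direct use,
+ * assuming a hypothetical rmid of 1 and an existing environment home:
+ *
+ *     if (db_xa_switch.xa_open_entry("/var/dbenv", 1, TMNOFLAGS) == XA_OK)
+ *             (void)db_xa_switch.xa_close_entry("", 1, TMNOFLAGS);
+ */
+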
+/*
+ * __xa_get_txn --
+ * Return a pointer to the current transaction structure for the
+ * designated environment. We take the XA flags so we can specifically
+ * test for TMJOIN and TMRESUME. These are testing for compliance with
+ * the XA state machine. The various cases are:
+ *
+ * TMRESUME: DB_TXN should already exist for this thread and should be
+ * in state SUSPENDED. Either error or change state.
+ * TMJOIN: DB_TXN should *not* exist, but TXN_DETAIL should -- create
+ * the DB_TXN and __txn_continue it.
+ * neither: Neither DB_TXN nor TXN_DETAIL should exist (td should be NULL) --
+ * start transaction.
+ *
+ * In addition, we use this to retrieve the current txn during __db_xa_end.
+ * In this case, the td and the txn should exist and the txn should currently
+ * be associated.
+ *
+ */
+static int
+__xa_get_txn(env, xid, td, txnp, flags, ending)
+ ENV *env;
+ XID *xid;
+ TXN_DETAIL *td;
+ DB_TXN **txnp;
+ u_long flags;
+ int ending;
+{
+ DB_ENV *dbenv;
+ DB_THREAD_INFO *ip;
+ int ret;
+
+ dbenv = env->dbenv;
+ COMPQUIET(ip, NULL);
+ ENV_ENTER_RET(env, ip, ret);
+ if (ret != 0)
+ return (XAER_RMFAIL);
+ else
+ ret = XA_OK;
+ DB_ASSERT(env, ip != NULL);
+ if (ending != 0)
+ DB_ASSERT(env,
+ ip->dbth_xa_status == TXN_XA_THREAD_ASSOCIATED);
+ else
+ DB_ASSERT(env,
+ ip->dbth_xa_status != TXN_XA_THREAD_ASSOCIATED);
+
+ /*
+ * Two cases: the transaction should already exist in this
+ * environment or it should not. If it should exist, then
+ * we should have found its detail and the JOIN or RESUME
+ * flags should have been set.
+ */
+ if (td == NULL) {
+ DB_ASSERT(env, ending == 0);
+ if (LF_ISSET(TMJOIN | TMRESUME))
+ ret = XAER_NOTA;
+ /*
+ * The snapshot flag is ignored if the database is not
+ * enabled for MVCC. This allows MVCC to be used
+ * with XA transactions.
+ */
+ else if ((ret = __txn_begin(env,
+ ip, NULL, txnp, DB_TXN_NOWAIT|DB_TXN_SNAPSHOT)) != 0) {
+ dbenv->err(dbenv, ret, DB_STR("4540",
+ "xa_get_txn: transaction begin failed"));
+ ret = XAER_RMERR;
+ } else {
+ SH_TAILQ_INSERT_HEAD(&ip->dbth_xatxn,
+ *txnp, xa_links, __db_txn);
+ (*txnp)->xa_thr_status = TXN_XA_THREAD_ASSOCIATED;
+ ip->dbth_xa_status = TXN_XA_THREAD_ASSOCIATED;
+
+ /* Initialize XA fields in the detail structure. */
+ /* XXX Does this need protection of the TXN lock? */
+ td = (TXN_DETAIL *)((*txnp)->td);
+ memcpy(td->gid, xid->data, XIDDATASIZE);
+ td->bqual = (u_int32_t)xid->bqual_length;
+ td->gtrid = (u_int32_t)xid->gtrid_length;
+ td->format = (int32_t)xid->formatID;
+ td->xa_br_status = TXN_XA_ACTIVE;
+ }
+ } else {
+ /* If we get here, the transaction exists. */
+ if (ending == 0 && !LF_ISSET(TMRESUME) && !LF_ISSET(TMJOIN)) {
+ ret = XAER_DUPID;
+ goto out;
+ }
+
+ SH_TAILQ_FOREACH(*txnp, &ip->dbth_xatxn, xa_links, __db_txn)
+ if ((*txnp)->td == td)
+ break;
+
+ /* Check that we are not a child transaction. */
+ if (td->parent != INVALID_ROFF) {
+ dbenv->err(dbenv, EINVAL, DB_STR("4541",
+ "xa_get_txn: XA transaction with parent"));
+ ret = XAER_RMERR;
+ goto out;
+ }
+
+ if (*txnp != NULL) {
+ if (ending) {
+ DB_ASSERT(env, (*txnp)->xa_thr_status ==
+ TXN_XA_THREAD_ASSOCIATED);
+ DB_ASSERT(env, (*txnp) ==
+ SH_TAILQ_FIRST(&ip->dbth_xatxn, __db_txn));
+ } else if (LF_ISSET(TMRESUME)) {
+ DB_ASSERT(env, (*txnp)->xa_thr_status ==
+ TXN_XA_THREAD_SUSPENDED);
+ DB_ASSERT(env, ip->dbth_xa_status ==
+ TXN_XA_THREAD_SUSPENDED);
+ (*txnp)->xa_thr_status =
+ TXN_XA_THREAD_ASSOCIATED;
+ ip->dbth_xa_status = TXN_XA_THREAD_ASSOCIATED;
+ if ((*txnp) !=
+ SH_TAILQ_FIRST(&ip->dbth_xatxn, __db_txn)) {
+ SH_TAILQ_REMOVE(&ip->dbth_xatxn,
+ (*txnp), xa_links, __db_txn);
+ SH_TAILQ_INSERT_HEAD(&ip->dbth_xatxn,
+ (*txnp), xa_links, __db_txn);
+ }
+ if (td->xa_br_status == TXN_XA_IDLE)
+ td->xa_br_status = TXN_XA_ACTIVE;
+ } else
+ ret = XAER_PROTO;
+ } else {
+ if (LF_ISSET(TMRESUME)) {
+ dbenv->err(dbenv, EINVAL, DB_STR("4542",
+ "xa_get_txn: transaction does not exist"));
+ ret = XAER_PROTO;
+ } else if ((ret =
+ __os_malloc(env, sizeof(DB_TXN), txnp)) == 0) {
+ /* We are joining this branch. */
+ ret = __txn_continue(env, *txnp, td, ip, 1);
+ if (ret != 0) {
+ dbenv->err(dbenv, ret, DB_STR("4543",
+ "xa_get_txn: txn_continue fails"));
+ ret = XAER_RMFAIL;
+ }
+ ip->dbth_xa_status = TXN_XA_THREAD_ASSOCIATED;
+ (*txnp)->xa_thr_status =
+ TXN_XA_THREAD_ASSOCIATED;
+ SH_TAILQ_INSERT_HEAD(&ip->dbth_xatxn,
+ (*txnp), xa_links, __db_txn);
+ if (td->xa_br_status == TXN_XA_IDLE)
+ td->xa_br_status = TXN_XA_ACTIVE;
+ } else {
+ dbenv->err(dbenv, ret, DB_STR("4544",
+ "xa_get_txn: os_malloc failed"));
+ ret = XAER_RMERR;
+ }
+ }
+ }
+out: ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * Release use of this transaction.
+ */
+static void
+__xa_put_txn(env, txnp)
+ ENV *env;
+ DB_TXN *txnp;
+{
+ DB_THREAD_INFO *ip;
+ TXN_DETAIL *td;
+
+ ip = txnp->thread_info;
+ DB_ASSERT(env, ip != NULL);
+ SH_TAILQ_REMOVE(&ip->dbth_xatxn, txnp, xa_links, __db_txn);
+ TAILQ_REMOVE(&txnp->mgrp->txn_chain, txnp, links);
+ td = txnp->td;
+ DB_ASSERT(env, td->xa_ref > 0);
+ td->xa_ref--;
+ __os_free(env, txnp);
+ ip->dbth_xa_status = TXN_XA_THREAD_UNASSOCIATED;
+}
+
+static int
+__xa_thread_enter(env, ipp)
+ ENV *env;
+ DB_THREAD_INFO **ipp;
+{
+ int ret;
+ DB_THREAD_INFO *ip;
+
+ COMPQUIET(ip, NULL);
+ ENV_ENTER_RET(env, ip, ret);
+ if (ret == 0)
+ ip->dbth_xa_status = TXN_XA_THREAD_UNASSOCIATED;
+ *ipp = ip;
+ return (ret);
+}
+
+/*
+ * __xa_txn_get_prepared --
+ * Wrap the internal call to __txn_get_prepared so that we can call
+ * it from XA. XA routines are not considered to be running "inside" the
+ * library, so when they make calls into the library, we need to use interface
+ * routines that support replication and failchk. Since __txn_get_prepared
+ * is internal, there is no user API to call, so we use this wrapper routine
+ * instead.
+ */
+static int
+__xa_txn_get_prepared(env, xids, txns, count, retp, flags)
+ ENV *env;
+ XID *xids;
+ DB_PREPLIST *txns;
+ long count; /* This is long for XA compatibility. */
+ long *retp;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ int ret;
+
+ ip = NULL;
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env,
+ (__txn_get_prepared(env, xids, txns, count, retp, flags)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+#define XA_FLAGS \
+ (DB_CREATE | DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_MPOOL | \
+ DB_INIT_TXN | DB_THREAD | DB_REGISTER | DB_RECOVER)
+
+/*
+ * __db_xa_open --
+ * The open call in the XA protocol. The rmid field is an id number
+ * that the TM assigned us and will pass us on every xa call. We need to
+ * map that rmid number into an env structure that we create during
+ * initialization. The file xa_map.c implements all such xa->db mappings.
+ * The xa_info field is instance-specific information. We require
+ * that the value of DB_HOME be passed in xa_info. Since xa_info is the
+ * only thing that we get to pass to db_env_create, any config information
+ * will have to be done via a config file instead of via the db_env_create
+ * call.
+ */
+static int
+__db_xa_open(xa_info, rmid, arg_flags)
+ char *xa_info;
+ int rmid;
+ long arg_flags;
+{
+ DB_ENV *dbenv;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int inmem, ret;
+ u_long flags;
+
+ flags = (u_long)arg_flags; /* Conversion for bit operations. */
+ ret = 0;
+
+ if (LF_ISSET(TMASYNC))
+ return (XAER_ASYNC);
+ if (flags != TMNOFLAGS)
+ return (XAER_INVAL);
+
+ /* Verify if we already have this environment open. */
+ if (__db_rmid_to_env(rmid, &env) == 0) {
+ env->xa_ref++;
+ /* Indicate that this thread is in an XA environment. */
+ if ((ret = __xa_thread_enter(env, &ip)) == 0) {
+ DB_ASSERT(env, ip != NULL);
+ ENV_LEAVE(env, ip);
+ return (XA_OK);
+ } else
+ return (XAER_RMERR);
+ }
+
+ /*
+ * Open a new environment. Note that we cannot report a failure
+ * here through dbenv->err: if db_env_create fails, no handle was
+ * created to report through.
+ */
+ if ((ret = db_env_create(&dbenv, 0)) != 0)
+ return (XAER_RMERR);
+ if ((ret = dbenv->set_thread_count(dbenv, 25)) != 0) {
+ dbenv->err(dbenv, ret, DB_STR("4546",
+ "xa_open: Failure setting thread count"));
+ goto err;
+ }
+ env = dbenv->env;
+ if ((ret = dbenv->open(dbenv, xa_info, XA_FLAGS, 0)) != 0) {
+ dbenv->err(dbenv, ret, DB_STR("4547",
+ "xa_open: Failure opening environment"));
+ goto err;
+ }
+
+ /*
+ * Make sure that the environment is not configured for in-memory
+ * logging.
+ */
+ if ((ret = dbenv->log_get_config(dbenv,
+ DB_LOG_IN_MEMORY, &inmem)) != 0) {
+ dbenv->err(dbenv, ret, DB_STR("4548",
+ "xa_open: Failure getting log configuration"));
+ goto err;
+ }
+ if (inmem != 0) {
+ dbenv->err(dbenv, EINVAL, DB_STR("4549",
+ "xa_open: In-memory logging not allowed in XA environment"));
+ goto err;
+ }
+
+ /* Create the mapping. */
+ __db_map_rmid(rmid, env);
+ env->xa_ref = 1;
+
+ /* Indicate that this thread is in an XA environment. */
+ if ((ret = __xa_thread_enter(env, &ip)) == 0) {
+ ENV_LEAVE(env, ip);
+ return (XA_OK);
+ } else
+ return (XAER_RMERR);
+
+err: (void)dbenv->close(dbenv, 0);
+ /*
+ * If the environment is corrupt, then we need to get all threads
+ * and processes out of it and run recovery. There is no particularly
+ * clean way to do that, so we'll use a really big hammer and
+ * crash the server.
+ */
+ if (ret == DB_RUNRECOVERY)
+ exit(1);
+
+ return (XAER_RMERR);
+}
+
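+/*
+ * Illustrative note (configuration, not code in this file): because
+ * xa_info carries only the environment home, a TM open string is simply
+ * the directory name, e.g. "/var/dbenv"; any further environment tuning
+ * must come from a DB_CONFIG file in that directory.
+ */
+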
+/*
+ * __db_xa_close --
+ * The close call of the XA protocol. The only trickiness here
+ * is that if there are any active transactions, we must fail. It is
+ * *not* an error to call close on an environment that has already been
+ * closed (I am interpreting that to mean it's OK to call close on an
+ * environment that has never been opened).
+ */
+static int
+__db_xa_close(xa_info, rmid, arg_flags)
+ char *xa_info;
+ int rmid;
+ long arg_flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret, t_ret;
+ u_long flags;
+
+ COMPQUIET(xa_info, NULL);
+ COMPQUIET(ip, NULL);
+ ret = 0;
+
+ flags = (u_long)arg_flags; /* Conversion for bit operations. */
+
+ if (LF_ISSET(TMASYNC))
+ return (XAER_ASYNC);
+ if (flags != TMNOFLAGS)
+ return (XAER_INVAL);
+
+ /* If the environment is closed, then we're done. */
+ if (__db_rmid_to_env(rmid, &env) != 0)
+ return (XA_OK);
+
+ /* Check if there are any pending transactions. */
+ ENV_ENTER_RET(env, ip, ret);
+ /*
+ * If the environment is corrupt, then we need to get all threads
+ * and processes out of it and run recovery. There is no particularly
+ * clean way to do that, so we'll use a really big hammer and
+ * crash the server.
+ */
+ if (ret == DB_RUNRECOVERY)
+ exit(1);
+ else if (ret != 0)
+ return (XAER_RMFAIL);
+
+ /*
+ * If we are calling close without ever having called open, then we
+ * don't want to do anything, because if we do, our ref counts would
+ * be all wrong.
+ */
+ if (ip->dbth_xa_status == TXN_XA_THREAD_NOTA) {
+ ret = XAER_PROTO;
+ goto err;
+ }
+
+ /*
+ * It is an error for a transaction manager to call xa_close from
+ * a thread of control that is associated with a transaction branch.
+ */
+ if (SH_TAILQ_FIRST(&ip->dbth_xatxn, __db_txn) != NULL) {
+ ret = XAER_PROTO;
+ goto err;
+ }
+
+ if (env->xa_ref > 1) {
+ env->xa_ref--;
+ goto err;
+ } else {
+ /* Destroy the mapping. */
+ ret = __db_unmap_rmid(rmid);
+
+ /* Close the environment. */
+ t_ret = env->dbenv->close(env->dbenv, 0);
+
+ if (ret != 0 || t_ret != 0)
+ ret = XAER_RMERR;
+ /* Don't try to leave an environment we just closed. */
+ goto out;
+ }
+
+err: ENV_LEAVE(env, ip);
+out: return (ret == 0 ? XA_OK : ret);
+}
+
+/*
+ * __db_xa_start --
+ * Begin a transaction for the current resource manager.
+ */
+static int
+__db_xa_start(xid, rmid, arg_flags)
+ XID *xid;
+ int rmid;
+ long arg_flags;
+{
+ DB_ENV *dbenv;
+ DB_TXN *txnp;
+ ENV *env;
+ TXN_DETAIL *td;
+ int ret;
+ u_long flags;
+
+ flags = (u_long)arg_flags; /* Conversion for bit operations. */
+ ret = 0;
+
+#define OK_FLAGS (TMJOIN | TMRESUME | TMNOWAIT | TMASYNC | TMNOFLAGS)
+ if (LF_ISSET(~OK_FLAGS))
+ return (XAER_INVAL);
+
+ if (LF_ISSET(TMJOIN) && LF_ISSET(TMRESUME))
+ return (XAER_INVAL);
+
+ if (LF_ISSET(TMASYNC))
+ return (XAER_ASYNC);
+
+ if (__db_rmid_to_env(rmid, &env) != 0)
+ return (XAER_PROTO);
+ dbenv = env->dbenv;
+
+ /* Die if the environment is corrupted. */
+ PANIC_CHECK_RET(env, ret);
+ if (ret == DB_RUNRECOVERY)
+ exit(1);
+
+ /*
+ * If td comes back NULL, then we know that we don't have a
+ * transaction yet.
+ */
+ if ((ret = __db_xid_to_txn(env, xid, &td)) != 0) {
+ dbenv->err(dbenv, ret, DB_STR("4550",
+ "xa_start: failure mapping xid"));
+ return (XAER_RMFAIL);
+ }
+
+ /*
+ * This can't block, so we can ignore TMNOWAIT.
+ *
+ * Other error conditions: RMERR, OUTSIDE, PROTO, RB*
+ */
+ if (td != NULL) {
+ if (td->xa_br_status == TXN_XA_DEADLOCKED)
+ return (XA_RBDEADLOCK);
+ if (td->xa_br_status == TXN_XA_ROLLEDBACK)
+ return (XA_RBOTHER);
+ }
+ if ((ret = __xa_get_txn(env, xid, td, &txnp, flags, 0)) != 0)
+ return (ret);
+
+ return (XA_OK);
+}
+
+/*
+ * __db_xa_end --
+ * Disassociate the current transaction from the current process.
+ */
+static int
+__db_xa_end(xid, rmid, arg_flags)
+ XID *xid;
+ int rmid;
+ long arg_flags;
+{
+ DB_ENV *dbenv;
+ DB_TXN *txn;
+ ENV *env;
+ TXN_DETAIL *td;
+ int ret;
+ u_long flags;
+
+ flags = (u_long)arg_flags; /* Convert for bit manipulation. */
+ if (flags != TMNOFLAGS && !LF_ISSET(TMSUSPEND | TMSUCCESS | TMFAIL))
+ return (XAER_INVAL);
+
+ if (__db_rmid_to_env(rmid, &env) != 0)
+ return (XAER_PROTO);
+ dbenv = env->dbenv;
+
+ if ((ret = __db_xid_to_txn(env, xid, &td)) != 0) {
+ dbenv->err(dbenv, ret, DB_STR("4551",
+ "xa_end: failure mapping xid"));
+ return (XAER_RMFAIL);
+ }
+ if (td == NULL)
+ return (XAER_NOTA);
+
+ if ((ret = __xa_get_txn(env, xid, td, &txn, flags, 1)) != 0)
+ return (ret);
+
+ /* We are ending; make sure there are no open cursors. */
+ if (txn->cursors != 0) {
+ dbenv->err(dbenv, EINVAL, DB_STR("4552",
+ "xa_end: cannot end with open cursors"));
+ return (XAER_RMERR);
+ }
+
+ if (td != txn->td) {
+ dbenv->err(dbenv, ret, DB_STR("4553",
+ "xa_end: txn_detail mismatch"));
+ return (XAER_RMERR);
+ }
+
+ if (td->xa_br_status == TXN_XA_DEADLOCKED)
+ return (XA_RBDEADLOCK);
+
+ /*
+ * This happens if this process timed out and the TMS called
+ * __db_xa_rollback while this process was still holding the txn.
+ * The txn must then be resolved in this process.
+ */
+ if (td->status == TXN_NEED_ABORT) {
+ if (txn->abort(txn) != 0)
+ return (XAER_RMERR);
+ __xa_put_txn(env, txn);
+ return (XA_RBOTHER);
+ }
+
+ if (td->xa_br_status == TXN_XA_IDLE) {
+ dbenv->err(dbenv, EINVAL, DB_STR("4554",
+ "xa_end: ending transaction that is idle"));
+ return (XAER_PROTO);
+ }
+
+ /*
+ * If we are deadlocked or prepared, don't change this, but
+ * if we are active and the only handle, then make this transaction
+ * idle.
+ */
+ if (td->xa_ref == 1 && td->xa_br_status == TXN_XA_ACTIVE)
+ td->xa_br_status = TXN_XA_IDLE;
+ if (LF_ISSET(TMSUSPEND)) {
+ txn->thread_info->dbth_xa_status = TXN_XA_THREAD_SUSPENDED;
+ txn->xa_thr_status = TXN_XA_THREAD_SUSPENDED;
+ } else {
+ __xa_put_txn(env, txn);
+ }
+ return (XA_OK);
+}
+
+/*
+ * If, during a transaction completion operation (commit, abort, prepare)
+ * we detect a corrupt environment, we must close and reopen the
+ * environment and check if the transaction in question exists. If it
+ * does, then we can complete the operation as requested. If it does
+ * not, then we have to return aborted, because we just recovered the
+ * environment, aborting this transaction.
+ */
+static void
+corrupted_env(env, rmid)
+ ENV *env;
+ int rmid;
+{
+ DB_ENV *dbenv;
+ const char *path;
+ char *home;
+ int ret;
+ ENV *env2;
+
+ COMPQUIET(home, NULL);
+ ret = 0;
+ dbenv = env->dbenv;
+ path = NULL;
+ if (dbenv->get_home(dbenv, &path) != 0)
+ goto err;
+ if (path != NULL && (__os_strdup(NULL, path, &home) != 0))
+ goto err;
+ /*
+ * Check that no one else came in and cleaned
+ * up the environment before we could. If they
+ * did then just call __db_xa_open to get the
+ * new environment. If they have not then
+ * unmap the old handle so no one else can get
+ * it.
+ */
+ if (__db_rmid_to_env(rmid, &env2) == 0) {
+ PANIC_CHECK_RET(env2, ret);
+ if (ret != 0)
+ (void)__db_unmap_rmid(rmid);
+ }
+
+ /*
+ * If we cannot get the environment, then it is corrupted and we
+ * are currently unable to run recovery. In that case all we can
+ * do is crash and restart, and recovery will clean up the lost
+ * transaction.
+ */
+ if (__db_xa_open(home, rmid, 0) != XA_OK)
+ goto err;
+
+ __os_free(NULL, home);
+ if (0) {
+err: exit(1);
+ }
+}
+
+/*
+ * __db_xa_prepare --
+ * Sync the log to disk so we can guarantee recoverability.
+ */
+static int
+__db_xa_prepare(xid, rmid, arg_flags)
+ XID *xid;
+ int rmid;
+ long arg_flags;
+{
+ DB_ENV *dbenv;
+ DB_TXN *txnp;
+ ENV *env;
+ TXN_DETAIL *td;
+ int ret;
+ u_long flags;
+
+ flags = (u_long)arg_flags; /* Conversion for bit operations. */
+ ret = 0;
+
+ if (LF_ISSET(TMASYNC))
+ return (XAER_ASYNC);
+ if (flags != TMNOFLAGS)
+ return (XAER_INVAL);
+
+ /*
+ * We need to know if we've ever called prepare on this.
+ * As part of the prepare, we set the xa_status field to
+ * reflect that fact that prepare has been called, and if
+ * it's ever called again, it's an error.
+ */
+ if (__db_rmid_to_env(rmid, &env) != 0)
+ return (XAER_PROTO);
+ dbenv = env->dbenv;
+
+ /*
+ * If the environment is corrupted, reopen it or die if that
+ * is not possible.
+ */
+ PANIC_CHECK_RET(env, ret);
+ if (ret == DB_RUNRECOVERY) {
+ corrupted_env(env, rmid);
+ if (__db_rmid_to_env(rmid, &env) != 0)
+ return (XAER_PROTO);
+ dbenv = env->dbenv;
+ }
+
+ if ((ret = __db_xid_to_txn(env, xid, &td)) != 0) {
+ dbenv->err(dbenv, ret, DB_STR("4555",
+ "xa_prepare: failure mapping xid"));
+ return (XAER_RMFAIL);
+ }
+ if (td == NULL) {
+ dbenv->err(dbenv, EINVAL, DB_STR("4556",
+ "xa_prepare: xid not found"));
+ return (XAER_NOTA);
+ }
+
+ if (td->xa_br_status == TXN_XA_DEADLOCKED)
+ return (XA_RBDEADLOCK);
+ if (td->xa_br_status == TXN_XA_ROLLEDBACK)
+ return (XA_RBOTHER);
+
+ if (td->xa_br_status != TXN_XA_ACTIVE &&
+ td->xa_br_status != TXN_XA_IDLE) {
+ dbenv->err(dbenv, EINVAL, DB_STR("4557",
+ "xa_prepare: transaction neither active nor idle"));
+ return (XAER_PROTO);
+ }
+
+ /* Now, fill in the global transaction structure. */
+ if ((ret = __xa_get_txn(env, xid, td, &txnp, TMJOIN, 0)) != 0)
+ return (ret);
+
+ if ((ret = txnp->prepare(txnp, (u_int8_t *)xid->data)) != 0) {
+ dbenv->err(dbenv, ret, DB_STR("4558",
+ "xa_prepare: txnp->prepare failed"));
+ td->xa_br_status = TXN_XA_IDLE;
+ return (XAER_RMERR);
+ }
+ td->xa_br_status = TXN_XA_PREPARED;
+
+ __xa_put_txn(env, txnp);
+ return (XA_OK);
+}
+
+/*
+ * __db_xa_commit --
+ * Commit the transaction
+ */
+static int
+__db_xa_commit(xid, rmid, arg_flags)
+ XID *xid;
+ int rmid;
+ long arg_flags;
+{
+ DB_ENV *dbenv;
+ DB_TXN *txnp;
+ ENV *env;
+ TXN_DETAIL *td;
+ int ret;
+ u_long flags;
+
+ flags = (u_long)arg_flags; /* Conversion for bit operations. */
+ ret = 0;
+
+ if (LF_ISSET(TMASYNC))
+ return (XAER_ASYNC);
+#undef OK_FLAGS
+#define OK_FLAGS (TMNOFLAGS | TMNOWAIT | TMONEPHASE)
+ if (LF_ISSET(~OK_FLAGS))
+ return (XAER_INVAL);
+
+ /*
+ * We need to know if we've ever called prepare on this.
+ * We can verify this by examining the xa_status field.
+ */
+ if (__db_rmid_to_env(rmid, &env) != 0)
+ return (XAER_PROTO);
+ dbenv = env->dbenv;
+
+ /*
+ * If the environment is corrupted, reopen it or die if that
+ * is not possible.
+ */
+ PANIC_CHECK_RET(env, ret);
+ if (ret == DB_RUNRECOVERY) {
+ corrupted_env(env, rmid);
+ if (__db_rmid_to_env(rmid, &env) != 0)
+ return (XAER_PROTO);
+ dbenv = env->dbenv;
+ }
+
+ if ((ret = __db_xid_to_txn(env, xid, &td)) != 0) {
+ dbenv->err(dbenv, ret, DB_STR("4559",
+ "xa_commit: failure mapping xid"));
+ return (XAER_RMFAIL);
+ }
+ if (td == NULL) {
+ dbenv->err(dbenv, EINVAL, DB_STR("4560",
+ "xa_commit: xid not found"));
+ return (XAER_NOTA);
+ }
+
+ if (td->xa_br_status == TXN_XA_DEADLOCKED)
+ return (XA_RBDEADLOCK);
+
+ if (td->xa_br_status == TXN_XA_ROLLEDBACK)
+ return (XA_RBOTHER);
+
+ if (LF_ISSET(TMONEPHASE) && td->xa_br_status != TXN_XA_IDLE) {
+ dbenv->err(dbenv, EINVAL, DB_STR("4561",
+ "xa_commit: commiting transaction active in branch"));
+ return (XAER_PROTO);
+ }
+
+ if (!LF_ISSET(TMONEPHASE) && td->xa_br_status != TXN_XA_PREPARED) {
+ dbenv->err(dbenv, EINVAL, DB_STR("4562",
+ "xa_commit: attempting to commit unprepared transaction"));
+ return (XAER_PROTO);
+ }
+
+ /* Now, fill in the global transaction structure. */
+ if ((ret = __xa_get_txn(env, xid, td, &txnp, TMJOIN, 0)) != 0)
+ return (ret);
+
+ /*
+ * Because this transaction is currently associated, commit will not free
+ * the transaction structure, which is good, because we need to do that
+ * in xa_put_txn below.
+ */
+ if ((ret = txnp->commit(txnp, 0)) != 0) {
+ dbenv->err(dbenv, ret, DB_STR("4563",
+ "xa_commit: txnp->commit failed"));
+ return (XAER_RMERR);
+ }
+
+ __xa_put_txn(env, txnp);
+ return (XA_OK);
+}
+
+/*
+ * __db_xa_recover --
+ * Returns a list of prepared and heuristically completed transactions.
+ *
+ * The return value is the number of xids placed into the xid array (less
+ * than or equal to the count parameter). The flags are going to indicate
+ * whether we are starting a scan or continuing one.
+ */
+static int
+__db_xa_recover(xids, count, rmid, flags)
+ XID *xids;
+ long count, flags;
+ int rmid;
+{
+ ENV *env;
+ int ret;
+ u_int32_t newflags;
+ long rval;
+
+ /* If the environment is closed, then we're done. */
+ if (__db_rmid_to_env(rmid, &env) != 0)
+ return (XAER_PROTO);
+
+ if (LF_ISSET(TMSTARTRSCAN))
+ newflags = DB_FIRST;
+ else if (LF_ISSET(TMENDRSCAN))
+ newflags = DB_LAST;
+ else
+ newflags = DB_NEXT;
+
+ rval = 0;
+ if ((ret = __xa_txn_get_prepared(env,
+ xids, NULL, count, &rval, newflags)) != 0) {
+ env->dbenv->err(env->dbenv, ret, DB_STR("4564",
+ "xa_recover: txn_get_prepared failed"));
+ return (XAER_RMERR);
+ }
+
+ return (rval);
+}
+
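+/*
+ * Illustrative sketch (TM-side usage, not part of this file): after a
+ * crash the TM scans for prepared branches and resolves each one. Here
+ * rmid and resolve() are hypothetical:
+ *
+ *     XID xids[10];
+ *     int n;
+ *
+ *     n = db_xa_switch.xa_recover_entry(xids, 10, rmid, TMSTARTRSCAN);
+ *     while (n > 0) {
+ *             resolve(xids, n);
+ *             n = db_xa_switch.xa_recover_entry(xids, 10, rmid, TMNOFLAGS);
+ *     }
+ */
+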
+/*
+ * __db_xa_rollback
+ * Abort an XA transaction.
+ */
+static int
+__db_xa_rollback(xid, rmid, arg_flags)
+ XID *xid;
+ int rmid;
+ long arg_flags;
+{
+ DB_ENV *dbenv;
+ DB_TXN *txnp;
+ ENV *env;
+ TXN_DETAIL *td;
+ int ret;
+ u_long flags;
+
+ flags = (u_long)arg_flags; /* Conversion for bit operations. */
+ ret = 0;
+
+ if (LF_ISSET(TMASYNC))
+ return (XAER_ASYNC);
+ if (flags != TMNOFLAGS)
+ return (XAER_INVAL);
+
+ if (__db_rmid_to_env(rmid, &env) != 0)
+ return (XAER_PROTO);
+ dbenv = env->dbenv;
+
+ /*
+ * If the environment is corrupted, reopen it or die if that
+ * is not possible.
+ */
+ PANIC_CHECK_RET(env, ret);
+ if (ret == DB_RUNRECOVERY) {
+ corrupted_env(env, rmid);
+ if (__db_rmid_to_env(rmid, &env) != 0)
+ return (XAER_PROTO);
+ dbenv = env->dbenv;
+ }
+
+ if ((ret = __db_xid_to_txn(env, xid, &td)) != 0) {
+ dbenv->err(dbenv, ret, DB_STR("4565",
+ "xa_rollback: failure mapping xid"));
+ return (XAER_RMFAIL);
+ }
+ if (td == NULL) {
+ dbenv->err(dbenv, EINVAL, DB_STR("4566",
+ "xa_rollback: xid not found"));
+ return (XAER_NOTA);
+ }
+
+ if (td->xa_br_status == TXN_XA_DEADLOCKED)
+ return (XA_RBDEADLOCK);
+
+ if (td->xa_br_status == TXN_XA_ROLLEDBACK)
+ return (XA_RBOTHER);
+
+ if (td->xa_br_status != TXN_XA_ACTIVE &&
+ td->xa_br_status != TXN_XA_IDLE &&
+ td->xa_br_status != TXN_XA_PREPARED) {
+ dbenv->err(dbenv, EINVAL, DB_STR_A("4567",
+ "xa_rollback: transaction in invalid state %d",
+ "%d"), (int)td->xa_br_status);
+ return (XAER_PROTO);
+ }
+
+ /* Now, fill in the global transaction structure. */
+ if ((ret = __xa_get_txn(env, xid, td, &txnp, TMJOIN, 0)) != 0)
+ return (ret);
+ /*
+ * Normally abort frees the txnp, but if this is an associated XA
+ * transaction, then abort will not free it; we do that below.
+ */
+ if ((ret = txnp->abort(txnp)) != 0) {
+ dbenv->err(dbenv, ret, DB_STR("4568",
+ "xa_rollback: failure aborting transaction"));
+ return (XAER_RMERR);
+ }
+
+ __xa_put_txn(env, txnp);
+
+ return (XA_OK);
+}
+
+/*
+ * __db_xa_forget --
+ * Forget about an XID for a transaction that was heuristically
+ * completed. Since we do not heuristically complete anything, I
+ * don't think we have to do anything here, but we should make sure
+ * that we reclaim the slots in the txnid table.
+ */
+static int
+__db_xa_forget(xid, rmid, arg_flags)
+ XID *xid;
+ int rmid;
+ long arg_flags;
+{
+ DB_ENV *dbenv;
+ DB_TXN *txnp;
+ ENV *env;
+ TXN_DETAIL *td;
+ int ret;
+ u_long flags;
+
+ flags = (u_long)arg_flags; /* Conversion for bit operations. */
+
+ if (LF_ISSET(TMASYNC))
+ return (XAER_ASYNC);
+ if (flags != TMNOFLAGS)
+ return (XAER_INVAL);
+
+ if (__db_rmid_to_env(rmid, &env) != 0)
+ return (XAER_PROTO);
+ dbenv = env->dbenv;
+
+ /*
+ * If mapping is gone, then we're done.
+ */
+ if ((ret = __db_xid_to_txn(env, xid, &td)) != 0) {
+ dbenv->err(dbenv, ret, DB_STR("4569",
+ "xa_forget: failure mapping xid"));
+ return (XAER_RMFAIL);
+ }
+ if (td == NULL) {
+ dbenv->err(dbenv, ret, DB_STR("4570",
+ "xa_forget: xid not found"));
+ return (XA_OK);
+ }
+
+ if ((ret = __xa_get_txn(env, xid, td, &txnp, TMJOIN, 0)) != 0)
+ return (ret);
+
+ if ((ret = txnp->discard(txnp, 0)) != 0) {
+ dbenv->err(dbenv, ret, DB_STR("4571",
+ "xa_forget: txnp->discard failed"));
+ return (XAER_RMFAIL);
+ }
+
+ __xa_put_txn(env, txnp);
+ return (XA_OK);
+}
+
+/*
+ * __db_xa_complete --
+ * Used to wait for asynchronous operations to complete. Since we're
+ * not doing asynch, this is an invalid operation.
+ */
+static int
+__db_xa_complete(handle, retval, rmid, flags)
+ int *handle, *retval, rmid;
+ long flags;
+{
+ COMPQUIET(handle, NULL);
+ COMPQUIET(retval, NULL);
+ COMPQUIET(rmid, 0);
+ COMPQUIET(flags, 0);
+
+ return (XAER_INVAL);
+}
diff --git a/src/xa/xa_map.c b/src/xa/xa_map.c
new file mode 100644
index 00000000..4dcf4d75
--- /dev/null
+++ b/src/xa/xa_map.c
@@ -0,0 +1,152 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/txn.h"
+#include "dbinc_auto/xa_ext.h"
+
+/*
+ * This file contains all the mapping information that we need to support
+ * the DB/XA interface.
+ */
+
+/*
+ * __db_rmid_to_env
+ * Return the environment associated with a given XA rmid.
+ *
+ * PUBLIC: int __db_rmid_to_env __P((int, ENV **));
+ */
+int
+__db_rmid_to_env(rmid, envp)
+ int rmid;
+ ENV **envp;
+{
+ ENV *env;
+
+ *envp = NULL;
+ if (TAILQ_EMPTY(&DB_GLOBAL(envq)))
+ TAILQ_INIT(&DB_GLOBAL(envq));
+
+ /*
+ * When we map an rmid, move that environment to be the first one in
+ * the list of environments, so we acquire the correct environment
+ * in DB->open.
+ */
+ for (env = TAILQ_FIRST(&DB_GLOBAL(envq));
+ env != NULL; env = TAILQ_NEXT(env, links)) {
+ if (env->xa_rmid == rmid) {
+ *envp = env;
+ if (env != TAILQ_FIRST(&DB_GLOBAL(envq))) {
+ TAILQ_REMOVE(&DB_GLOBAL(envq), env, links);
+ TAILQ_INSERT_HEAD(&DB_GLOBAL(envq), env, links);
+ }
+ return (0);
+ }
+ }
+ return (1);
+}
+
+/*
+ * __db_xid_to_txn
+ * Return the txn that corresponds to this XID.
+ *
+ * PUBLIC: int __db_xid_to_txn __P((ENV *, XID *, TXN_DETAIL **));
+ */
+int
+__db_xid_to_txn(env, xid, tdp)
+ ENV *env;
+ XID *xid;
+ TXN_DETAIL **tdp;
+{
+ DB_TXNMGR *mgr;
+ DB_TXNREGION *region;
+ u_int8_t *gid;
+
+ mgr = env->tx_handle;
+ region = mgr->reginfo.primary;
+
+ /*
+ * Search the internal active transaction table to find the
+ * matching xid. If this is a performance hit, then we
+ * can create a hash table, but I doubt it's worth it.
+ */
+ TXN_SYSTEM_LOCK(env);
+ gid = (u_int8_t *)(xid->data);
+ SH_TAILQ_FOREACH(*tdp, &region->active_txn, links, __txn_detail)
+ if (memcmp(gid, (*tdp)->gid, sizeof((*tdp)->gid)) == 0)
+ break;
+ TXN_SYSTEM_UNLOCK(env);
+
+ /*
+ * The return type is int because TXN_SYSTEM_{UN}LOCK could, in
+ * principle, fail; at present this always returns 0.
+ */
+ return (0);
+}
+
+/*
+ * __db_map_rmid
+ * Create a mapping between the specified rmid and environment.
+ *
+ * PUBLIC: void __db_map_rmid __P((int, ENV *));
+ */
+void
+__db_map_rmid(rmid, env)
+ int rmid;
+ ENV *env;
+{
+ env->xa_rmid = rmid;
+ TAILQ_INSERT_HEAD(&DB_GLOBAL(envq), env, links);
+}
+
+/*
+ * __db_unmap_rmid
+ * Destroy the mapping for the given rmid.
+ *
+ * PUBLIC: int __db_unmap_rmid __P((int));
+ */
+int
+__db_unmap_rmid(rmid)
+ int rmid;
+{
+ ENV *e;
+
+ for (e = TAILQ_FIRST(&DB_GLOBAL(envq));
+ e != NULL && e->xa_rmid != rmid;
+ e = TAILQ_NEXT(e, links))
+ ;
+
+ if (e == NULL)
+ return (EINVAL);
+
+ TAILQ_REMOVE(&DB_GLOBAL(envq), e, links);
+ return (0);
+}
+
+/*
+ * __db_unmap_xid
+ * Destroy the mapping for the specified XID.
+ *
+ * PUBLIC: void __db_unmap_xid __P((ENV *, XID *, size_t));
+ */
+void
+__db_unmap_xid(env, xid, off)
+ ENV *env;
+ XID *xid;
+ size_t off;
+{
+ TXN_DETAIL *td;
+
+ COMPQUIET(xid, NULL);
+
+ td = R_ADDR(&env->tx_handle->reginfo, off);
+ memset(td->gid, 0, sizeof(td->gid));
+}