summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/blob/blob_fileops.c352
-rw-r--r--src/blob/blob_page.c374
-rw-r--r--src/blob/blob_stream.c283
-rw-r--r--src/blob/blob_util.c1189
-rw-r--r--src/btree/bt_compact.c239
-rw-r--r--src/btree/bt_compare.c105
-rw-r--r--src/btree/bt_compress.c72
-rw-r--r--src/btree/bt_conv.c9
-rw-r--r--src/btree/bt_curadj.c2
-rw-r--r--src/btree/bt_cursor.c94
-rw-r--r--src/btree/bt_delete.c18
-rw-r--r--src/btree/bt_method.c19
-rw-r--r--src/btree/bt_open.c43
-rw-r--r--src/btree/bt_put.c177
-rw-r--r--src/btree/bt_rec.c2
-rw-r--r--src/btree/bt_reclaim.c2
-rw-r--r--src/btree/bt_recno.c30
-rw-r--r--src/btree/bt_rsearch.c7
-rw-r--r--src/btree/bt_search.c49
-rw-r--r--src/btree/bt_split.c118
-rw-r--r--src/btree/bt_stat.c12
-rw-r--r--src/btree/bt_upgrade.c94
-rw-r--r--src/btree/bt_verify.c261
-rw-r--r--src/btree/btree.src2
-rw-r--r--src/clib/bsearch.c2
-rw-r--r--src/clib/getcwd.c2
-rw-r--r--src/clib/getopt.c2
-rw-r--r--src/clib/isalpha.c2
-rw-r--r--src/clib/isdigit.c2
-rw-r--r--src/clib/isprint.c2
-rw-r--r--src/clib/isspace.c2
-rw-r--r--src/clib/memcmp.c2
-rw-r--r--src/clib/memmove.c2
-rw-r--r--src/clib/printf.c2
-rw-r--r--src/clib/raise.c2
-rw-r--r--src/clib/rand.c2
-rw-r--r--src/clib/snprintf.c2
-rw-r--r--src/clib/strerror.c2
-rw-r--r--src/clib/time.c2
-rw-r--r--src/common/clock.c2
-rw-r--r--src/common/crypto_stub.c2
-rw-r--r--src/common/db_byteorder.c2
-rw-r--r--src/common/db_compint.c2
-rw-r--r--src/common/db_err.c636
-rw-r--r--src/common/db_getlong.c2
-rw-r--r--src/common/db_idspace.c2
-rw-r--r--src/common/db_log2.c2
-rw-r--r--src/common/db_shash.c2
-rw-r--r--src/common/dbt.c2
-rw-r--r--src/common/mkpath.c2
-rw-r--r--src/common/openflags.c2
-rw-r--r--src/common/os_method.c2
-rw-r--r--src/common/util_arg.c2
-rw-r--r--src/common/util_cache.c2
-rw-r--r--src/common/util_log.c2
-rw-r--r--src/common/util_sig.c2
-rw-r--r--src/common/zerofill.c2
-rw-r--r--src/crypto/aes_method.c2
-rw-r--r--src/crypto/crypto.c77
-rw-r--r--src/crypto/mersenne/mt19937db.c2
-rw-r--r--src/crypto/rijndael/rijndael-api-fst.c6
-rw-r--r--src/db/crdel.src2
-rw-r--r--src/db/crdel_rec.c4
-rw-r--r--src/db/db.c46
-rw-r--r--src/db/db.src2
-rw-r--r--src/db/db_am.c18
-rw-r--r--src/db/db_backup.c169
-rw-r--r--src/db/db_cam.c367
-rw-r--r--src/db/db_cds.c22
-rw-r--r--src/db/db_compact.c72
-rw-r--r--src/db/db_conv.c145
-rw-r--r--src/db/db_copy.c2
-rw-r--r--src/db/db_dispatch.c10
-rw-r--r--src/db/db_dup.c2
-rw-r--r--src/db/db_iface.c55
-rw-r--r--src/db/db_join.c9
-rw-r--r--src/db/db_meta.c16
-rw-r--r--src/db/db_method.c225
-rw-r--r--src/db/db_open.c142
-rw-r--r--src/db/db_overflow.c187
-rw-r--r--src/db/db_ovfl_vrfy.c2
-rw-r--r--src/db/db_pr.c343
-rw-r--r--src/db/db_rec.c10
-rw-r--r--src/db/db_reclaim.c3
-rw-r--r--src/db/db_remove.c28
-rw-r--r--src/db/db_rename.c7
-rw-r--r--src/db/db_ret.c122
-rw-r--r--src/db/db_setid.c2
-rw-r--r--src/db/db_setlsn.c2
-rw-r--r--src/db/db_sort_multiple.c8
-rw-r--r--src/db/db_stati.c2
-rw-r--r--src/db/db_truncate.c6
-rw-r--r--src/db/db_upg.c122
-rw-r--r--src/db/db_upg_opd.c7
-rw-r--r--src/db/db_vrfy.c19
-rw-r--r--src/db/db_vrfy_stub.c2
-rw-r--r--src/db/db_vrfyutil.c11
-rw-r--r--src/db/partition.c292
-rw-r--r--src/dbinc/atomic.h17
-rw-r--r--src/dbinc/blob.h103
-rw-r--r--src/dbinc/btree.h9
-rw-r--r--src/dbinc/clock.h9
-rw-r--r--src/dbinc/crypto.h2
-rw-r--r--src/dbinc/cxx_int.h2
-rw-r--r--src/dbinc/db.in278
-rw-r--r--src/dbinc/db_185.in2
-rw-r--r--src/dbinc/db_am.h12
-rw-r--r--src/dbinc/db_cxx.in86
-rw-r--r--src/dbinc/db_dispatch.h2
-rw-r--r--src/dbinc/db_int.in140
-rw-r--r--src/dbinc/db_join.h2
-rw-r--r--src/dbinc/db_page.h185
-rw-r--r--src/dbinc/db_swap.h88
-rw-r--r--src/dbinc/db_upgrade.h119
-rw-r--r--src/dbinc/db_verify.h9
-rw-r--r--src/dbinc/debug.h47
-rw-r--r--src/dbinc/fop.h16
-rw-r--r--src/dbinc/globals.h22
-rw-r--r--src/dbinc/hash.h4
-rw-r--r--src/dbinc/heap.h5
-rw-r--r--src/dbinc/hmac.h2
-rw-r--r--src/dbinc/lock.h17
-rw-r--r--src/dbinc/log.h27
-rw-r--r--src/dbinc/log_verify.h2
-rw-r--r--src/dbinc/mp.h48
-rw-r--r--src/dbinc/mutex.h110
-rw-r--r--src/dbinc/mutex_int.h67
-rw-r--r--src/dbinc/os.h2
-rw-r--r--src/dbinc/partition.h12
-rw-r--r--src/dbinc/perfmon.h2
-rw-r--r--src/dbinc/qam.h2
-rw-r--r--src/dbinc/queue.h2
-rw-r--r--src/dbinc/region.h27
-rw-r--r--src/dbinc/rep.h287
-rw-r--r--src/dbinc/repmgr.h157
-rw-r--r--src/dbinc/shqueue.h13
-rw-r--r--src/dbinc/tcl_db.h53
-rw-r--r--src/dbinc/txn.h2
-rw-r--r--src/dbinc/win_db.h44
-rw-r--r--src/dbinc/xa.h2
-rw-r--r--src/dbinc_auto/api_flags.in78
-rw-r--r--src/dbinc_auto/blob_ext.h41
-rw-r--r--src/dbinc_auto/btree_ext.h12
-rw-r--r--src/dbinc_auto/common_ext.h24
-rw-r--r--src/dbinc_auto/db_ext.h33
-rw-r--r--src/dbinc_auto/dbreg_auto.h31
-rw-r--r--src/dbinc_auto/dbreg_ext.h5
-rw-r--r--src/dbinc_auto/env_ext.h22
-rw-r--r--src/dbinc_auto/fileops_auto.h169
-rw-r--r--src/dbinc_auto/fileops_ext.h20
-rw-r--r--src/dbinc_auto/hash_ext.h4
-rw-r--r--src/dbinc_auto/heap_auto.h48
-rw-r--r--src/dbinc_auto/heap_ext.h6
-rw-r--r--src/dbinc_auto/int_def.in283
-rw-r--r--src/dbinc_auto/lock_ext.h5
-rw-r--r--src/dbinc_auto/log_ext.h11
-rw-r--r--src/dbinc_auto/mp_ext.h5
-rw-r--r--src/dbinc_auto/mutex_ext.h17
-rw-r--r--src/dbinc_auto/os_ext.h38
-rw-r--r--src/dbinc_auto/rep_automsg.h61
-rw-r--r--src/dbinc_auto/rep_ext.h37
-rw-r--r--src/dbinc_auto/repmgr_automsg.h37
-rw-r--r--src/dbinc_auto/repmgr_ext.h61
-rw-r--r--src/dbinc_auto/sequence_ext.h4
-rw-r--r--src/dbinc_auto/tcl_ext.h9
-rw-r--r--src/dbinc_auto/txn_ext.h3
-rw-r--r--src/dbreg/dbreg.c18
-rw-r--r--src/dbreg/dbreg.src27
-rw-r--r--src/dbreg/dbreg_auto.c12
-rw-r--r--src/dbreg/dbreg_autop.c17
-rw-r--r--src/dbreg/dbreg_rec.c154
-rw-r--r--src/dbreg/dbreg_stat.c2
-rw-r--r--src/dbreg/dbreg_util.c73
-rw-r--r--src/env/env_alloc.c2
-rw-r--r--src/env/env_backup.c2
-rw-r--r--src/env/env_config.c38
-rw-r--r--src/env/env_failchk.c141
-rw-r--r--src/env/env_file.c16
-rw-r--r--src/env/env_globals.c19
-rw-r--r--src/env/env_method.c187
-rw-r--r--src/env/env_name.c20
-rw-r--r--src/env/env_open.c250
-rw-r--r--src/env/env_recover.c130
-rw-r--r--src/env/env_region.c117
-rw-r--r--src/env/env_register.c197
-rw-r--r--src/env/env_sig.c17
-rw-r--r--src/env/env_stat.c43
-rw-r--r--src/fileops/fileops.src91
-rw-r--r--src/fileops/fileops_auto.c72
-rw-r--r--src/fileops/fileops_autop.c122
-rw-r--r--src/fileops/fop_basic.c216
-rw-r--r--src/fileops/fop_rec.c759
-rw-r--r--src/fileops/fop_util.c89
-rw-r--r--src/hash/hash.c178
-rw-r--r--src/hash/hash.src2
-rw-r--r--src/hash/hash_compact.c24
-rw-r--r--src/hash/hash_conv.c9
-rw-r--r--src/hash/hash_dup.c7
-rw-r--r--src/hash/hash_func.c2
-rw-r--r--src/hash/hash_meta.c2
-rw-r--r--src/hash/hash_method.c12
-rw-r--r--src/hash/hash_open.c44
-rw-r--r--src/hash/hash_page.c126
-rw-r--r--src/hash/hash_rec.c8
-rw-r--r--src/hash/hash_reclaim.c2
-rw-r--r--src/hash/hash_stat.c16
-rw-r--r--src/hash/hash_stub.c36
-rw-r--r--src/hash/hash_upgrade.c93
-rw-r--r--src/hash/hash_verify.c180
-rw-r--r--src/heap/heap.c592
-rw-r--r--src/heap/heap.src26
-rw-r--r--src/heap/heap_auto.c28
-rw-r--r--src/heap/heap_autop.c34
-rw-r--r--src/heap/heap_backup.c2
-rw-r--r--src/heap/heap_conv.c7
-rw-r--r--src/heap/heap_method.c14
-rw-r--r--src/heap/heap_open.c41
-rw-r--r--src/heap/heap_rec.c216
-rw-r--r--src/heap/heap_reclaim.c4
-rw-r--r--src/heap/heap_stat.c11
-rw-r--r--src/heap/heap_stub.c36
-rw-r--r--src/heap/heap_upgrade.c106
-rw-r--r--src/heap/heap_verify.c169
-rw-r--r--src/hmac/hmac.c2
-rw-r--r--src/lock/Design2
-rw-r--r--src/lock/lock.c50
-rw-r--r--src/lock/lock_alloc.incl2
-rw-r--r--src/lock/lock_deadlock.c47
-rw-r--r--src/lock/lock_failchk.c9
-rw-r--r--src/lock/lock_id.c306
-rw-r--r--src/lock/lock_list.c2
-rw-r--r--src/lock/lock_method.c2
-rw-r--r--src/lock/lock_region.c43
-rw-r--r--src/lock/lock_stat.c208
-rw-r--r--src/lock/lock_stub.c7
-rw-r--r--src/lock/lock_timer.c2
-rw-r--r--src/lock/lock_util.c2
-rw-r--r--src/log/log.c201
-rw-r--r--src/log/log_archive.c4
-rw-r--r--src/log/log_compare.c2
-rw-r--r--src/log/log_debug.c2
-rw-r--r--src/log/log_get.c24
-rw-r--r--src/log/log_method.c35
-rw-r--r--src/log/log_print.c11
-rw-r--r--src/log/log_put.c80
-rw-r--r--src/log/log_stat.c2
-rw-r--r--src/log/log_verify.c29
-rw-r--r--src/log/log_verify_auto.c3
-rw-r--r--src/log/log_verify_int.c439
-rw-r--r--src/log/log_verify_stub.c2
-rw-r--r--src/log/log_verify_util.c39
-rw-r--r--src/mp/mp_alloc.c320
-rw-r--r--src/mp/mp_backup.c7
-rw-r--r--src/mp/mp_bh.c14
-rw-r--r--src/mp/mp_fget.c170
-rw-r--r--src/mp/mp_fmethod.c58
-rw-r--r--src/mp/mp_fopen.c79
-rw-r--r--src/mp/mp_fput.c5
-rw-r--r--src/mp/mp_fset.c2
-rw-r--r--src/mp/mp_method.c21
-rw-r--r--src/mp/mp_mvcc.c20
-rw-r--r--src/mp/mp_region.c260
-rw-r--r--src/mp/mp_register.c2
-rw-r--r--src/mp/mp_resize.c121
-rw-r--r--src/mp/mp_stat.c73
-rw-r--r--src/mp/mp_sync.c21
-rw-r--r--src/mp/mp_trickle.c2
-rw-r--r--src/mutex/mut_alloc.c234
-rw-r--r--src/mutex/mut_failchk.c203
-rw-r--r--src/mutex/mut_fcntl.c248
-rw-r--r--src/mutex/mut_method.c29
-rw-r--r--src/mutex/mut_pthread.c275
-rw-r--r--src/mutex/mut_region.c68
-rw-r--r--src/mutex/mut_stat.c119
-rw-r--r--src/mutex/mut_stub.c12
-rw-r--r--src/mutex/mut_tas.c228
-rw-r--r--src/mutex/mut_win32.c188
-rw-r--r--src/mutex/test_mutex.c89
-rw-r--r--src/mutex/uts4_cc.s2
-rw-r--r--src/os/os_abort.c6
-rw-r--r--src/os/os_abs.c2
-rw-r--r--src/os/os_addrinfo.c2
-rw-r--r--src/os/os_alloc.c28
-rw-r--r--src/os/os_clock.c13
-rw-r--r--src/os/os_config.c2
-rw-r--r--src/os/os_cpu.c2
-rw-r--r--src/os/os_ctime.c7
-rw-r--r--src/os/os_dir.c2
-rw-r--r--src/os/os_errno.c2
-rw-r--r--src/os/os_fid.c2
-rw-r--r--src/os/os_flock.c2
-rw-r--r--src/os/os_fsync.c2
-rw-r--r--src/os/os_getenv.c2
-rw-r--r--src/os/os_handle.c15
-rw-r--r--src/os/os_map.c43
-rw-r--r--src/os/os_mkdir.c2
-rw-r--r--src/os/os_open.c2
-rw-r--r--src/os/os_path.c2
-rw-r--r--src/os/os_pid.c4
-rw-r--r--src/os/os_rename.c2
-rw-r--r--src/os/os_rmdir.c38
-rw-r--r--src/os/os_root.c2
-rw-r--r--src/os/os_rpath.c2
-rw-r--r--src/os/os_rw.c2
-rw-r--r--src/os/os_seek.c2
-rw-r--r--src/os/os_stack.c135
-rw-r--r--src/os/os_stat.c2
-rw-r--r--src/os/os_tmpdir.c2
-rw-r--r--src/os/os_truncate.c10
-rw-r--r--src/os/os_uid.c65
-rw-r--r--src/os/os_unlink.c2
-rw-r--r--src/os/os_yield.c2
-rw-r--r--src/os_qnx/os_qnx_fsync.c2
-rw-r--r--src/os_qnx/os_qnx_open.c2
-rw-r--r--src/os_vxworks/os_vx_abs.c2
-rw-r--r--src/os_vxworks/os_vx_config.c2
-rw-r--r--src/os_vxworks/os_vx_map.c2
-rw-r--r--src/os_vxworks/os_vx_rpath.c2
-rw-r--r--src/os_vxworks/os_vx_yield.c2
-rw-r--r--src/os_windows/ce_ctime.c6
-rw-r--r--src/os_windows/ce_freopen.c52
-rw-r--r--src/os_windows/ce_gmtime.c58
-rw-r--r--src/os_windows/ce_localtime.c44
-rw-r--r--src/os_windows/ce_mktime.c257
-rw-r--r--src/os_windows/ce_remove.c26
-rw-r--r--src/os_windows/ce_util_sig.c35
-rw-r--r--src/os_windows/os_abs.c2
-rw-r--r--src/os_windows/os_clock.c4
-rw-r--r--src/os_windows/os_config.c2
-rw-r--r--src/os_windows/os_cpu.c2
-rw-r--r--src/os_windows/os_dir.c2
-rw-r--r--src/os_windows/os_errno.c2
-rw-r--r--src/os_windows/os_fid.c8
-rw-r--r--src/os_windows/os_flock.c2
-rw-r--r--src/os_windows/os_fsync.c2
-rw-r--r--src/os_windows/os_getenv.c2
-rw-r--r--src/os_windows/os_handle.c2
-rw-r--r--src/os_windows/os_map.c22
-rw-r--r--src/os_windows/os_mkdir.c2
-rw-r--r--src/os_windows/os_open.c2
-rw-r--r--src/os_windows/os_rename.c2
-rw-r--r--src/os_windows/os_rmdir.c42
-rw-r--r--src/os_windows/os_rw.c2
-rw-r--r--src/os_windows/os_seek.c2
-rw-r--r--src/os_windows/os_stat.c2
-rw-r--r--src/os_windows/os_truncate.c9
-rw-r--r--src/os_windows/os_unlink.c2
-rw-r--r--src/os_windows/os_yield.c2
-rw-r--r--src/qam/qam.c102
-rw-r--r--src/qam/qam.src2
-rw-r--r--src/qam/qam_conv.c2
-rw-r--r--src/qam/qam_files.c9
-rw-r--r--src/qam/qam_method.c2
-rw-r--r--src/qam/qam_open.c2
-rw-r--r--src/qam/qam_rec.c6
-rw-r--r--src/qam/qam_stat.c2
-rw-r--r--src/qam/qam_stub.c2
-rw-r--r--src/qam/qam_upgrade.c2
-rw-r--r--src/qam/qam_verify.c6
-rw-r--r--src/rep/mlease.html2
-rw-r--r--src/rep/rep.msg70
-rw-r--r--src/rep/rep_automsg.c467
-rw-r--r--src/rep/rep_backup.c2148
-rw-r--r--src/rep/rep_elect.c74
-rw-r--r--src/rep/rep_lease.c30
-rw-r--r--src/rep/rep_log.c39
-rw-r--r--src/rep/rep_method.c615
-rw-r--r--src/rep/rep_record.c315
-rw-r--r--src/rep/rep_region.c164
-rw-r--r--src/rep/rep_stat.c76
-rw-r--r--src/rep/rep_stub.c18
-rw-r--r--src/rep/rep_util.c568
-rw-r--r--src/rep/rep_verify.c27
-rw-r--r--src/repmgr/repmgr.msg44
-rw-r--r--src/repmgr/repmgr.src2
-rw-r--r--src/repmgr/repmgr_automsg.c206
-rw-r--r--src/repmgr/repmgr_elect.c214
-rw-r--r--src/repmgr/repmgr_method.c954
-rw-r--r--src/repmgr/repmgr_msg.c655
-rw-r--r--src/repmgr/repmgr_net.c186
-rw-r--r--src/repmgr/repmgr_posix.c2
-rw-r--r--src/repmgr/repmgr_queue.c132
-rw-r--r--src/repmgr/repmgr_rec.c10
-rw-r--r--src/repmgr/repmgr_sel.c726
-rw-r--r--src/repmgr/repmgr_stat.c74
-rw-r--r--src/repmgr/repmgr_stub.c74
-rw-r--r--src/repmgr/repmgr_util.c957
-rw-r--r--src/repmgr/repmgr_windows.c50
-rw-r--r--src/sequence/seq_stat.c15
-rw-r--r--src/sequence/sequence.c184
-rw-r--r--src/txn/txn.c42
-rw-r--r--src/txn/txn.src2
-rw-r--r--src/txn/txn_chkpt.c4
-rw-r--r--src/txn/txn_failchk.c4
-rw-r--r--src/txn/txn_method.c2
-rw-r--r--src/txn/txn_rec.c7
-rw-r--r--src/txn/txn_recover.c6
-rw-r--r--src/txn/txn_region.c121
-rw-r--r--src/txn/txn_stat.c2
-rw-r--r--src/txn/txn_util.c43
-rw-r--r--src/xa/xa.c12
-rw-r--r--src/xa/xa_map.c2
402 files changed, 24727 insertions, 4983 deletions
diff --git a/src/blob/blob_fileops.c b/src/blob/blob_fileops.c
new file mode 100644
index 00000000..713e7e83
--- /dev/null
+++ b/src/blob/blob_fileops.c
@@ -0,0 +1,352 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2013, 2015 Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/fop.h"
+
+/*
+ * __blob_file_create --
+ *	Generate a new blob id and create the blob file that goes with it.
+ *
+ *	Blobs are organized in a directory structure consisting of
+ *	<DB_HOME>/__db_bl/<blob_sub_dir>/.  Below that, the blob_id
+ *	is used to construct a path to the blob file, and to name
+ *	the blob file.  blob_id=1 would result in __db.bl001.
+ *	blob_id=12002 would result in 012/__db.bl012002.
+ *
+ *	The new id is stored through blob_id.  *fhpp is set to the handle
+ *	filled in by __fop_create only when every step succeeds; on any
+ *	error it is left NULL and the error code is returned.
+ *
+ * PUBLIC: int __blob_file_create __P
+ * PUBLIC:    ((DBC *, DB_FH **, db_seq_t *));
+ */
+int
+__blob_file_create(dbc, fhpp, blob_id)
+	DBC *dbc;
+	DB_FH **fhpp;
+	db_seq_t *blob_id;
+{
+	DB *dbp;
+	DB_FH *new_fhp;
+	ENV *env;
+	const char *dir;
+	char *rel_path;
+	int ret;
+
+	dbp = dbc->dbp;
+	env = dbp->env;
+	*fhpp = NULL;
+	new_fhp = NULL;
+	rel_path = NULL;
+	dir = NULL;
+	DB_ASSERT(env, !DB_IS_READONLY(dbc->dbp));
+
+	/* Reserve the id first; the file name is derived from it. */
+	ret = __blob_generate_id(dbp, dbc->txn, blob_id);
+	if (ret != 0)
+		goto err;
+
+	ret = __blob_id_to_path(env, dbp->blob_sub_dir, *blob_id, &rel_path);
+	if (ret != 0)
+		goto err;
+
+	ret = __fop_create(env, dbc->txn,
+	    &new_fhp, rel_path, &dir, DB_APP_BLOB, env->db_mode,
+	    (F_ISSET(dbc->dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0));
+	if (ret != 0)
+		__db_errx(env, DB_STR_A("0228",
+		    "Error creating blob file: %llu.", "%llu"),
+		    (unsigned long long)*blob_id);
+
+	/* The path is only needed while creating; hand back the handle. */
+err:	if (rel_path != NULL)
+		__os_free(env, rel_path);
+	if (ret == 0)
+		*fhpp = new_fhp;
+	return (ret);
+}
+
+/*
+ * __blob_file_close --
+ *	Close a blob file handle, syncing it first when DB_FOP_WRITE is set
+ *	in flags.  A NULL handle is a no-op that returns 0.  The handle is
+ *	closed even if the sync fails; a sync error takes precedence over a
+ *	close error in the return value.
+ *
+ * PUBLIC: int __blob_file_close __P ((DBC *, DB_FH *, u_int32_t));
+ */
+int
+__blob_file_close(dbc, fhp, flags)
+	DBC *dbc;
+	DB_FH *fhp;
+	u_int32_t flags;
+{
+	ENV *env;
+	int close_ret, sync_ret;
+
+	env = dbc->env;
+	if (fhp == NULL)
+		return (0);
+
+	/* Only sync if the file was open for writing. */
+	sync_ret = LF_ISSET(DB_FOP_WRITE) ? __os_fsync(env, fhp) : 0;
+	close_ret = __os_closehandle(env, fhp);
+
+	return (sync_ret != 0 ? sync_ret : close_ret);
+}
+
+/*
+ * __blob_file_delete --
+ *	Remove the file that backs blob_id.
+ *
+ *	With a real transaction the removal goes through __fop_remove;
+ *	without one the file name is resolved with __db_appname and the
+ *	file is unlinked directly.  Returns 0 on success or the error from
+ *	the step that failed.
+ *
+ * PUBLIC: int __blob_file_delete __P((DBC *, db_seq_t));
+ */
+int
+__blob_file_delete(dbc, blob_id)
+	DBC *dbc;
+	db_seq_t blob_id;
+{
+	ENV *env;
+	char *abs_path, *rel_path;
+	int ret;
+
+	env = dbc->dbp->env;
+	abs_path = rel_path = NULL;
+
+	ret = __blob_id_to_path(
+	    env, dbc->dbp->blob_sub_dir, blob_id, &rel_path);
+	if (ret != 0) {
+		__db_errx(env, DB_STR_A("0229",
+		    "Failed to construct path for blob file %llu.",
+		    "%llu"), (unsigned long long)blob_id);
+		goto err;
+	}
+
+	if (IS_REAL_TXN(dbc->txn))
+		ret = __fop_remove(
+		    env, dbc->txn, NULL, rel_path, NULL, DB_APP_BLOB, 0);
+	else if ((ret = __db_appname(
+	    env, DB_APP_BLOB, rel_path, NULL, &abs_path)) != 0)
+		goto err;
+	else
+		ret = __os_unlink(env, abs_path, 0);
+
+	if (ret != 0)
+		__db_errx(env, DB_STR_A("0230",
+		    "Failed to remove blob file while deleting: %s.",
+		    "%s"), rel_path);
+
+err:	if (rel_path != NULL)
+		__os_free(env, rel_path);
+	if (abs_path != NULL)
+		__os_free(env, abs_path);
+	return (ret);
+}
+
+/*
+ * __blob_file_open --
+ *	Open the file that backs blob_id and return its handle in *fhpp.
+ *
+ *	The file is opened read-only when DB_FOP_READONLY is set in flags
+ *	or the database itself is read-only.  printerr == 1 asks for an
+ *	error message when the open fails; any other value keeps the
+ *	failure silent.  *fhpp is set to NULL before anything else is done.
+ *
+ * PUBLIC: int __blob_file_open
+ * PUBLIC:    __P((DB *, DB_FH **, db_seq_t, u_int32_t, int));
+ */
+int
+__blob_file_open(dbp, fhpp, blob_id, flags, printerr)
+	DB *dbp;
+	DB_FH **fhpp;
+	db_seq_t blob_id;
+	u_int32_t flags;
+	int printerr;
+{
+	ENV *env;
+	char *abs_path, *rel_path;
+	u_int32_t oflags;
+	int ret;
+
+	env = dbp->env;
+	*fhpp = NULL;
+	abs_path = rel_path = NULL;
+
+	ret = __blob_id_to_path(env, dbp->blob_sub_dir, blob_id, &rel_path);
+	if (ret != 0)
+		goto err;
+
+	ret = __db_appname(env, DB_APP_BLOB, rel_path, NULL, &abs_path);
+	if (ret != 0) {
+		__db_errx(env, DB_STR_A("0231",
+		    "Failed to get path to blob file: %llu.", "%llu"),
+		    (unsigned long long)blob_id);
+		goto err;
+	}
+
+	oflags = (LF_ISSET(DB_FOP_READONLY) || DB_IS_READONLY(dbp)) ?
+	    DB_OSO_RDONLY : 0;
+
+	/*
+	 * In replication it is possible to try to read a blob file
+	 * that has been deleted.  In that case do not print an error.
+	 */
+	ret = __os_open(env, abs_path, 0, oflags, 0, fhpp);
+	if (ret != 0 && printerr == 1)
+		__db_errx(env, DB_STR_A("0232",
+		    "Error opening blob file: %s.", "%s"), abs_path);
+
+err:	if (abs_path != NULL)
+		__os_free(env, abs_path);
+	if (rel_path != NULL)
+		__os_free(env, rel_path);
+	return (ret);
+}
+
+/*
+ * __blob_file_read --
+ *	Read up to size bytes from fhp, starting at offset, into dbt.
+ *
+ *	For a DB_DBT_USERCOPY dbt the bytes are read into a temporary
+ *	buffer and passed to env->dbt_usercopy; otherwise they are read
+ *	straight into dbt->data.  dbt->size is set to the number of bytes
+ *	actually read, which may be smaller than size.
+ *
+ * PUBLIC: int __blob_file_read
+ * PUBLIC:    __P((ENV *, DB_FH *, DBT *, off_t, u_int32_t));
+ */
+int
+__blob_file_read(env, fhp, dbt, offset, size)
+	ENV *env;
+	DB_FH *fhp;
+	DBT *dbt;
+	off_t offset;
+	u_int32_t size;
+{
+	int ret, usercopy;
+	size_t nread;
+	void *dest;
+
+	nread = 0;
+	dest = NULL;
+	usercopy = F_ISSET(dbt, DB_DBT_USERCOPY) ? 1 : 0;
+
+	if ((ret = __os_seek(env, fhp, 0, 0, offset)) != 0)
+		goto err;
+
+	/* Pick the destination: scratch space for user-copy, else the dbt. */
+	if (!usercopy)
+		dest = dbt->data;
+	else if ((ret = __os_malloc(env, size, &dest)) != 0)
+		goto err;
+
+	ret = __os_read(env, fhp, dest, size, &nread);
+	if (ret != 0) {
+		__db_errx(env, DB_STR("0233", "Error reading blob file."));
+		goto err;
+	}
+
+	/*
+	 * Reading off the end of the file is fine; fewer bytes come back
+	 * than were requested.  The DB_DBT_PARTIAL API behaves the same way.
+	 */
+	dbt->size = (u_int32_t)nread;
+
+	if (usercopy && dbt->size != 0)
+		ret = env->dbt_usercopy(
+		    dbt, 0, dest, dbt->size, DB_USERCOPY_SETDATA);
+
+	/* Only the scratch buffer is ours to free. */
+err:	if (dest != NULL && dest != dbt->data)
+		__os_free(env, dest);
+	return (ret);
+}
+
+/*
+ * __blob_file_write --
+ * Write the contents of buf into the open blob file fhp starting at
+ * offset, using __fop_write_file. *file_size is the caller's view of
+ * the current file length: it is read to decide whether the write
+ * extends the file, and it is raised to offset + buf->size when it
+ * does. flags are passed on to __fop_write_file after this function
+ * may have added DB_FOP_PARTIAL_LOG and/or DB_FOP_APPEND to them.
+ * Returns 0 on success, otherwise the first error encountered.
+ *
+ * PUBLIC: int __blob_file_write
+ * PUBLIC: __P((DBC *, DB_FH *, DBT *,
+ * PUBLIC: off_t, db_seq_t, off_t *, u_int32_t));
+ */
+int
+__blob_file_write(dbc, fhp, buf, offset, blob_id, file_size, flags)
+ DBC *dbc;
+ DB_FH *fhp;
+ DBT *buf;
+ off_t offset;
+ db_seq_t blob_id;
+ off_t *file_size;
+ u_int32_t flags;
+{
+ ENV *env;
+ off_t size, write_offset;
+ char *dirname, *name;
+ int ret, blob_lg;
+ size_t data_size;
+ void *ptr;
+
+ env = dbc->env;
+ dirname = name = NULL; /* dirname is never assigned again; NULL is passed down. */
+ size = 0;
+ write_offset = offset;
+ DB_ASSERT(env, !DB_IS_READONLY(dbc->dbp));
+ DB_ASSERT(env, fhp != NULL);
+
+ /* File size is used to tell if the write is extending the file. */
+ size = *file_size;
+
+ if (DBENV_LOGGING(env)) {
+ if ((ret = __log_get_config(
+ env->dbenv, DB_LOG_BLOB, &blob_lg)) != 0)
+ goto err;
+ if (blob_lg == 0 && !REP_ON(env)) /* DB_LOG_BLOB reported as 0 and replication not on. */
+ LF_SET(DB_FOP_PARTIAL_LOG);
+ if (!LF_ISSET(DB_FOP_CREATE) && (size <= offset)) /* Write starts at or past the current end. */
+ LF_SET(DB_FOP_APPEND);
+ }
+
+ if ((ret = __blob_id_to_path(
+ env, dbc->dbp->blob_sub_dir, blob_id, &name)) != 0)
+ goto err;
+
+ if ((ret = __dbt_usercopy(env, buf)) != 0) /* NOTE(review): no matching release is visible in this function; presumably a caller's job -- confirm. */
+ goto err;
+
+ /*
+ * If the write overwrites some of the file, and writes off the end
+ * of the file, break the write into two writes, one that overwrites
+ * data, and an append. Otherwise if the write is aborted, the
+ * data written past the end of the file will not be erased.
+ */
+ if (offset < size && (offset + buf->size) > size) {
+ ptr = buf->data;
+ data_size = (size_t)(size - offset); /* Bytes that land inside the existing file. */
+ if ((ret = __fop_write_file(env, dbc->txn, name, dirname,
+ DB_APP_BLOB, fhp, offset, ptr, data_size, flags)) != 0) {
+ __db_errx(env, DB_STR_A("0235",
+ "Error writing blob file: %s.", "%s"), name);
+ goto err;
+ }
+ LF_SET(DB_FOP_APPEND);
+ ptr = (u_int8_t *)ptr + data_size;
+ data_size = buf->size - data_size; /* Remainder, written at the old end of file below. */
+ write_offset = size;
+ } else {
+ if (!LF_ISSET(DB_FOP_CREATE) && (offset >= size))
+ LF_SET(DB_FOP_APPEND);
+ ptr = buf->data;
+ data_size = buf->size;
+ }
+
+ if ((ret = __fop_write_file(env, dbc->txn, name, dirname,
+ DB_APP_BLOB, fhp, write_offset, ptr, data_size, flags)) != 0) {
+ __db_errx(env, DB_STR_A("0236",
+ "Error writing blob file: %s.", "%s"), name);
+ goto err;
+ }
+
+ if (LF_ISSET(DB_FOP_SYNC_WRITE))
+ if ((ret = __os_fsync(env, fhp)) != 0)
+ goto err;
+
+ /* Update the size of the file. */
+ if ((offset + (off_t)buf->size) > size)
+ *file_size = offset + (off_t)buf->size;
+
+err: if (name != NULL)
+ __os_free(env, name);
+
+ return (ret);
+}
diff --git a/src/blob/blob_page.c b/src/blob/blob_page.c
new file mode 100644
index 00000000..96a2b59b
--- /dev/null
+++ b/src/blob/blob_page.c
@@ -0,0 +1,374 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2013, 2015 Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/fop.h"
+
+/*
+ * Blob file data item code.
+ *
+ * Blob file data entries are stored on linked lists of pages. The initial
+ * reference is a structure with an encoded version of the path where the file
+ * is stored. The blob file contains only the user's data.
+ */
+
+/*
+ * __blob_bulk --
+ *	Dump a blob file into a caller-supplied buffer.
+ *
+ *	Up to len bytes are read from the start of the blob identified by
+ *	blob_id into dp.  The space requirements have already been checked;
+ *	if the blob is larger than UINT32_MAX then DB_BUFFER_SMALL would
+ *	have already been returned.
+ *
+ * PUBLIC: int __blob_bulk
+ * PUBLIC:    __P((DBC *, u_int32_t, db_seq_t, u_int8_t *));
+ */
+int
+__blob_bulk(dbc, len, blob_id, dp)
+	DBC *dbc;
+	u_int32_t len;
+	db_seq_t blob_id;
+	u_int8_t *dp;
+{
+	DBT dest;
+	DB_FH *fhp;
+	ENV *env;
+	int ret, t_ret;
+
+	env = dbc->dbp->env;
+	fhp = NULL;
+
+	/* Wrap the caller's buffer in a user-memory DBT for the read. */
+	memset(&dest, 0, sizeof(dest));
+	F_SET(&dest, DB_DBT_USERMEM);
+	dest.ulen = len;
+	dest.data = (void *)dp;
+
+	ret = __blob_file_open(dbc->dbp, &fhp, blob_id, DB_FOP_READONLY, 1);
+	if (ret == 0)
+		ret = __blob_file_read(env, fhp, &dest, 0, len);
+
+	/* Close the file if it was opened, keeping the first error. */
+	if (fhp != NULL) {
+		t_ret = __blob_file_close(dbc, fhp, 0);
+		if (ret == 0)
+			ret = t_ret;
+	}
+	return (ret);
+}
+
+/*
+ * __blob_get --
+ * Get a blob file item. Analogous to db_overflow.c:__db_goff.
+ *
+ * file_size is the blob's length as supplied by the caller. When it
+ * exceeds UINT32_MAX and dbt is not DB_DBT_PARTIAL, dbt->size is set
+ * to UINT32_MAX and DB_BUFFER_SMALL is returned without opening the
+ * file. Otherwise __db_alloc_dbt prepares the destination, the blob
+ * file is opened read-only, and `needed' bytes are read starting at
+ * dbt->doff. If __db_alloc_dbt reports needed == 0 the function
+ * returns its result without touching the file. A handle that was
+ * opened is always closed before returning; the first error wins.
+ *
+ * NOTE(review): `start' is filled in by __db_alloc_dbt but never used
+ * here; the read offset is dbt->doff even when DB_DBT_PARTIAL is not
+ * set -- confirm callers always leave doff at 0 in that case.
+ *
+ * PUBLIC: int __blob_get __P((DBC *,
+ * PUBLIC: DBT *, db_seq_t, off_t, void **, u_int32_t *));
+ */
+int
+__blob_get(dbc, dbt, blob_id, file_size, bpp, bpsz)
+ DBC *dbc;
+ DBT *dbt;
+ db_seq_t blob_id;
+ off_t file_size;
+ void **bpp;
+ u_int32_t *bpsz;
+{
+ DB_FH *fhp;
+ ENV *env;
+ int ret, t_ret;
+ u_int32_t needed, start, tlen;
+
+ env = dbc->dbp->env;
+ fhp = NULL;
+ ret = 0;
+
+ /*
+ * Blobs larger than UINT32_MAX can only be read using
+ * the DB_STREAM API, or the DB_DBT_PARTIAL API.
+ */
+ if (file_size > UINT32_MAX) {
+ if (!F_ISSET(dbt, DB_DBT_PARTIAL)) {
+ dbt->size = UINT32_MAX;
+ ret = DB_BUFFER_SMALL;
+ goto err;
+ } else
+ tlen = UINT32_MAX; /* Partial read: cap the total length passed to __db_alloc_dbt. */
+ } else
+ tlen = (u_int32_t)file_size;
+
+ if (((ret = __db_alloc_dbt(
+ env, dbt, tlen, &needed, &start, bpp, bpsz)) != 0) || needed == 0)
+ goto err;
+ dbt->size = needed;
+
+ if ((ret = __blob_file_open(
+ dbc->dbp, &fhp, blob_id, DB_FOP_READONLY, 1)) != 0)
+ goto err;
+
+ if ((ret = __blob_file_read(env, fhp, dbt, dbt->doff, needed)) != 0)
+ goto err;
+
+ /* Close any open file descriptors. */
+err: if (fhp != NULL) {
+ t_ret = __blob_file_close(dbc, fhp, 0);
+ if (ret == 0)
+ ret = t_ret;
+ }
+ /* Does the dbt need to be cleaned on error? */
+ return (ret);
+}
+
+/*
+ * __blob_put --
+ *	Put a blob file item.
+ *
+ *	A new blob file is always created; *blob_id must be 0 on entry and
+ *	holds the generated id afterwards.  *size must be 0 on entry and is
+ *	advanced by the writes.  *plsn is zeroed.  Returns 0 on success or
+ *	the first error encountered; the file handle is closed either way.
+ *
+ * PUBLIC: int __blob_put __P((
+ * PUBLIC:    DBC *, DBT *, db_seq_t *, off_t *size, DB_LSN *));
+ */
+int
+__blob_put(dbc, dbt, blob_id, size, plsn)
+	DBC *dbc;
+	DBT *dbt;
+	db_seq_t *blob_id;
+	off_t *size;
+	DB_LSN *plsn;
+{
+	DBT zero_pad;
+	DB_FH *fhp;
+	ENV *env;
+	off_t data_off;
+	int ret, t_ret;
+
+	env = dbc->dbp->env;
+	fhp = NULL;
+	data_off = 0;
+	DB_ASSERT(env, blob_id != NULL);
+	DB_ASSERT(env, *blob_id == 0);
+
+	ZERO_LSN(*plsn);
+
+	/* Generate an id and create the file that will hold the data. */
+	ret = __blob_file_create(dbc, &fhp, blob_id);
+	if (ret != 0)
+		goto err;
+
+	/*
+	 * The file was just created, so its size is always 0.  A partial
+	 * put with doff == 0 is an ordinary put; with doff > 0 the PARTIAL
+	 * API requires the file to be filled with NULLs up to doff before
+	 * the data is written.
+	 */
+	DB_ASSERT(env, *size == 0);
+	if (F_ISSET(dbt, DB_DBT_PARTIAL) && dbt->doff > 0) {
+		memset(&zero_pad, 0, sizeof(zero_pad));
+		ret = __os_malloc(env, dbt->doff, &zero_pad.data);
+		if (ret != 0)
+			goto err;
+		memset(zero_pad.data, 0, dbt->doff);
+		zero_pad.size = dbt->doff;
+		data_off = dbt->doff;
+
+		ret = __blob_file_write(
+		    dbc, fhp, &zero_pad, 0, *blob_id, size, DB_FOP_CREATE);
+		__os_free(env, zero_pad.data);
+		if (ret != 0)
+			goto err;
+	}
+
+	ret = __blob_file_write(
+	    dbc, fhp, dbt, data_off, *blob_id, size, DB_FOP_CREATE);
+
+	/* Close any open file descriptors. */
+err:	if (fhp != NULL) {
+		t_ret = __blob_file_close(dbc, fhp, DB_FOP_WRITE);
+		if (ret == 0)
+			ret = t_ret;
+	}
+	return (ret);
+}
+
+/*
+ * __blob_repl --
+ *	Replace a blob file contents.  It would be nice if this could be done
+ *	by truncating the file and writing in the new data, but undoing a
+ *	truncate would require a lot of logging, so it is performed by
+ *	deleting the old blob file, and creating a new one.
+ *
+ *	blob_id names the existing blob and *size holds its current length.
+ *	On success *new_blob_id is the id that now holds the data (equal to
+ *	blob_id when the file was updated in place) and *size is the length
+ *	of that file.  Returns 0 on success or the first error encountered;
+ *	any file handle still open and the scratch buffer are released on
+ *	every path.
+ *
+ * PUBLIC: int __blob_repl __P((DBC *, DBT *, db_seq_t, db_seq_t *,off_t *));
+ */
+int
+__blob_repl(dbc, nval, blob_id, new_blob_id, size)
+	DBC *dbc;
+	DBT *nval;
+	db_seq_t blob_id;
+	db_seq_t *new_blob_id;
+	off_t *size;
+{
+	DBT partial;
+	DB_FH *fhp, *new_fhp;
+	DB_LSN lsn;
+	ENV *env;
+	int ret, t_ret;
+	off_t current, old_size;
+
+	fhp = new_fhp = NULL;
+	*new_blob_id = 0;
+	old_size = *size;
+	env = dbc->env;
+	memset(&partial, 0, sizeof(partial));
+
+	/*
+	 * Handling partial replace.
+	 * 1. doff > blob file size : Pad the end of the blob file with NULLs
+	 *    up to doff, then append the data.
+	 * 2. doff == blob file size: Write the data to the existing blob
+	 *    file.
+	 * 3. dlen == nval->size (the new data is exactly as long as the
+	 *    range it replaces): Write the data to the existing blob file.
+	 * 4. Create a new blob file.  Copy old blob data up to doff
+	 *    to the new file.  Append the new data.  Append data
+	 *    from the old file from doff + dlen to the end of the
+	 *    old file to the new file.  Delete the old file.
+	 */
+	if (F_ISSET(nval, DB_DBT_PARTIAL)) {
+		if ((nval->doff > *size) ||
+		    ((nval->doff == *size) || (nval->dlen == nval->size))) {
+			/* Open the file for appending. */
+			if ((ret = __blob_file_open(
+			    dbc->dbp, &fhp, blob_id, 0, 1)) != 0)
+				goto err;
+			*new_blob_id = blob_id;
+
+			/* Pad the end of the blob with NULLs. */
+			if (nval->doff > *size) {
+				partial.size = nval->doff - (u_int32_t)*size;
+				if ((ret = __os_malloc(
+				    env, partial.size, &partial.data)) != 0)
+					goto err;
+				memset(partial.data, 0, partial.size);
+				if ((ret = __blob_file_write(dbc, fhp,
+				    &partial, *size, blob_id, size, 0)) != 0)
+					goto err;
+			}
+
+			/* Write in the data. */
+			if ((ret = __blob_file_write(dbc, fhp,
+			    nval, nval->doff, blob_id, size, 0)) != 0)
+				goto err;
+
+			/* Close the file */
+			ret = __blob_file_close(dbc, fhp, DB_FOP_WRITE);
+			fhp = NULL;
+			if (ret != 0)
+				goto err;
+		} else {
+			/* Open the old blob file. */
+			if ((ret = __blob_file_open(
+			    dbc->dbp, &fhp, blob_id, DB_FOP_READONLY, 1)) != 0)
+				goto err;
+			/* Create the new blob file. */
+			if ((ret = __blob_file_create(
+			    dbc, &new_fhp, new_blob_id)) != 0)
+				goto err;
+
+			/* From here on *size tracks the new file. */
+			*size = 0;
+			/* Copy data to the new file up to doff. */
+			if (nval->doff != 0) {
+				partial.ulen = partial.size = nval->doff;
+				if ((ret = __os_malloc(
+				    env, partial.ulen, &partial.data)) != 0)
+					goto err;
+				if ((ret = __blob_file_read(
+				    env, fhp, &partial, 0, partial.size)) != 0)
+					goto err;
+				if ((ret = __blob_file_write(
+				    dbc, new_fhp, &partial, 0,
+				    *new_blob_id, size, DB_FOP_CREATE)) != 0)
+					goto err;
+			}
+
+			/* Write the partial data into the new file. */
+			if ((ret = __blob_file_write(
+			    dbc, new_fhp, nval, nval->doff,
+			    *new_blob_id, size, DB_FOP_CREATE)) != 0)
+				goto err;
+
+			/*
+			 * Copy remaining blob data into the new file.
+			 *
+			 * Widen before adding: doff and dlen are 32-bit DBT
+			 * fields, and their sum can exceed UINT32_MAX.  Adding
+			 * them in 32 bits would wrap and restart the copy at
+			 * the wrong position in the old file.
+			 */
+			current = (off_t)nval->doff + (off_t)nval->dlen;
+			while (current < old_size) {
+				/* Copy in chunks of at most one megabyte. */
+				if (partial.ulen < MEGABYTE) {
+					if ((ret = __os_realloc(env,
+					    MEGABYTE, &partial.data)) != 0)
+						goto err;
+					partial.size = partial.ulen = MEGABYTE;
+				}
+				if ((old_size - current) < partial.ulen) {
+					partial.size =
+					    (u_int32_t)(old_size - current);
+				} else
+					partial.size = MEGABYTE;
+
+				if ((ret = __blob_file_read(env, fhp,
+				    &partial, current, partial.size)) != 0)
+					goto err;
+				if ((ret = __blob_file_write(
+				    dbc, new_fhp, &partial, *size,
+				    *new_blob_id, size, DB_FOP_CREATE)) != 0)
+					goto err;
+				current += partial.size;
+			}
+
+			/* Close the old file. */
+			ret = __blob_file_close(dbc, fhp, 0);
+			fhp = NULL;
+			if (ret != 0)
+				goto err;
+
+			/* Delete the old blob file. */
+			if ((ret = __blob_del(dbc, blob_id)) != 0)
+				goto err;
+		}
+		/* Partial replace is done; the label below is also the exit. */
+		goto err;
+	}
+
+	/* Full replace: drop the old file and put the value as a new blob. */
+	if ((ret = __blob_del(dbc, blob_id)) != 0)
+		goto err;
+
+	*size = 0;
+	if ((ret = __blob_put(dbc, nval, new_blob_id, size, &lsn)) != 0)
+		goto err;
+
+err:	if (fhp != NULL) {
+		t_ret = __blob_file_close(dbc, fhp, DB_FOP_WRITE);
+		if (ret == 0)
+			ret = t_ret;
+	}
+	if (new_fhp != NULL) {
+		t_ret = __blob_file_close(dbc, new_fhp, DB_FOP_WRITE);
+		if (ret == 0)
+			ret = t_ret;
+	}
+	if (partial.data != NULL)
+		__os_free(env, partial.data);
+	return (ret);
+}
+
+/*
+ * __blob_del --
+ *	Delete a blob file by handing blob_id to __blob_file_delete and
+ *	returning its result.  The on-page record is handled separately.
+ *
+ * PUBLIC: int __blob_del __P((DBC *, db_seq_t));
+ */
+int
+__blob_del(dbc, blob_id)
+	DBC *dbc;
+	db_seq_t blob_id;
+{
+	return (__blob_file_delete(dbc, blob_id));
+}
diff --git a/src/blob/blob_stream.c b/src/blob/blob_stream.c
new file mode 100644
index 00000000..ab21aa0f
--- /dev/null
+++ b/src/blob/blob_stream.c
@@ -0,0 +1,283 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2013, 2015 Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/fop.h"
+
+static int __db_stream_close __P((DB_STREAM *, u_int32_t));
+static int __db_stream_read
+ __P((DB_STREAM *, DBT *, db_off_t, u_int32_t, u_int32_t));
+static int __db_stream_size __P((DB_STREAM *, db_off_t *, u_int32_t));
+static int __db_stream_write __P((DB_STREAM *, DBT *, db_off_t, u_int32_t));
+
+/*
+ * __db_stream_init --
+ * Build a DB_STREAM on a dup of dbc, open its blob file, return it in *dbsp.
+ *
+ * PUBLIC: int __db_stream_init __P((DBC *, DB_STREAM **, u_int32_t));
+ */
+int
+__db_stream_init(dbc, dbsp, flags)
+ DBC *dbc;
+ DB_STREAM **dbsp;
+ u_int32_t flags;
+{
+ DB_STREAM *dbs;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+ off_t size;
+
+ dbs = NULL;
+ env = dbc->env;
+
+ if ((ret = __os_malloc(env, sizeof(DB_STREAM), &dbs)) != 0)
+ return (ret);
+ memset(dbs, 0, sizeof(DB_STREAM));
+
+ ENV_ENTER(env, ip);
+ /* Should the copy be transient? */
+ if ((ret = __dbc_idup(dbc, &dbs->dbc, DB_POSITION)) != 0)
+ goto err;
+ dbs->flags = flags;
+
+ /*
+ * Writers need a write lock on the db record. NOTE(review): the flag
+ * is set on dbc, not on the duplicate made above -- confirm intended.
+ */
+ if (F_ISSET(dbs, DB_FOP_WRITE))
+ F_SET(dbc, DBC_RMW);
+
+ if ((ret = __dbc_get_blob_id(dbs->dbc, &dbs->blob_id)) != 0) {
+ if (ret == EINVAL)
+ __db_errx(env, DB_STR("0211",
+ "Error, cursor does not point to a blob."));
+ goto err;
+ }
+
+ if ((ret = __dbc_get_blob_size(dbs->dbc, &size)) != 0)
+ goto err;
+ dbs->file_size = size;
+
+ if ((ret = __blob_file_open(
+ dbs->dbc->dbp, &dbs->fhp, dbs->blob_id, flags, 1)) != 0)
+ goto err;
+ ENV_LEAVE(env, ip);
+
+ dbs->close = __db_stream_close;
+ dbs->read = __db_stream_read;
+ dbs->size = __db_stream_size;
+ dbs->write = __db_stream_write;
+
+ *dbsp = dbs;
+ return (0);
+
+err: if (dbs != NULL && dbs->dbc != NULL)
+ (void)__dbc_close(dbs->dbc);
+ ENV_LEAVE(env, ip);
+ if (dbs != NULL)
+ __os_free(env, dbs);
+ return (ret);
+}
+
+/*
+ * __db_stream_close --
+ *	DB_STREAM->close: check the flags, then close the stream between
+ *	ENV_ENTER and ENV_LEAVE.
+ */
+static int
+__db_stream_close(dbs, flags)
+	DB_STREAM *dbs;
+	u_int32_t flags;
+{
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int ret;
+
+	env = dbs->dbc->env;
+
+	/* The flags are checked against an allowed mask of 0. */
+	ret = __db_fchk(env, "DB_STREAM->close", flags, 0);
+	if (ret != 0)
+		return (ret);
+
+	ENV_ENTER(env, ip);
+	ret = __db_stream_close_int(dbs);
+	ENV_LEAVE(env, ip);
+
+	return (ret);
+}
+
+/*
+ * __db_stream_close_int --
+ * Close the blob file and the stream's cursor, then free the DB_STREAM.
+ *
+ * PUBLIC: int __db_stream_close_int __P ((DB_STREAM *));
+ */
+int
+__db_stream_close_int(dbs)
+ DB_STREAM *dbs;
+{
+ DBC *dbc;
+ ENV *env;
+ int ret, t_ret;
+
+ dbc = dbs->dbc;
+ env = dbc->env;
+
+ ret = __blob_file_close(dbc, dbs->fhp, dbs->flags);
+
+ if ((t_ret = __dbc_close(dbs->dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ __os_free(env, dbs);
+
+ return (ret);
+}
+
+/*
+ * __db_stream_read --
+ * DB_STREAM->read: read size bytes at offset into data (no DB_DBT_PARTIAL).
+ * An offset beyond the blob's size sets data->size to 0 and returns 0.
+ */
+static int
+__db_stream_read(dbs, data, offset, size, flags)
+ DB_STREAM *dbs;
+ DBT *data;
+ db_off_t offset;
+ u_int32_t size;
+ u_int32_t flags;
+{
+ DBC *dbc;
+ ENV *env;
+ int ret;
+ u_int32_t needed, start;
+
+ dbc = dbs->dbc;
+ env = dbc->dbp->env;
+ ret = 0;
+
+ if ((ret = __db_fchk(env, "DB_STREAM->read", flags, 0)) != 0)
+ return (ret);
+
+ if (F_ISSET(data, DB_DBT_PARTIAL)) {
+ ret = EINVAL;
+ __db_errx(env, DB_STR("0212",
+ "Error, do not use DB_DBT_PARTIAL with DB_STREAM."));
+ goto err;
+ }
+
+ if (offset > dbs->file_size) {
+ data->size = 0;
+ goto err;
+ }
+
+ if ((ret = __db_alloc_dbt(
+ env, data, size, &needed, &start, NULL, NULL)) != 0)
+ goto err;
+ data->size = needed;
+
+ if (needed == 0)
+ goto err;
+
+ ret = __blob_file_read(env, dbs->fhp, data, offset, size);
+
+err: return (ret);
+}
+
+/*
+ * __db_stream_size --
+ *	DB_STREAM->size: report the blob size cached in the stream handle.
+ *	The flags are checked against an allowed mask of 0.
+ */
+static int
+__db_stream_size(dbs, size, flags)
+	DB_STREAM *dbs;
+	db_off_t *size;
+	u_int32_t flags;
+{
+	int ret;
+
+	ret = __db_fchk(dbs->dbc->env, "DB_STREAM->size", flags, 0);
+	if (ret == 0) {
+		/* *size is only written once the flag check has passed. */
+		*size = dbs->file_size;
+	}
+	return (ret);
+}
+
+/*
+ * __db_stream_write --
+ * DB_STREAM->write: write data into the blob at offset. If the write
+ * changes the blob's size, the new size is stored via __dbc_set_blob_size.
+ */
+static int
+__db_stream_write(dbs, data, offset, flags)
+ DB_STREAM *dbs;
+ DBT *data;
+ db_off_t offset;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+ off_t file_size;
+ u_int32_t wflags;
+
+ env = dbs->dbc->env;
+
+ if ((ret = __db_fchk(
+ env, "DB_STREAM->write", flags, DB_STREAM_SYNC_WRITE)) != 0)
+ return (ret);
+
+ if (F_ISSET(dbs, DB_FOP_READONLY)) {
+ ret = EINVAL;
+ __db_errx(env, DB_STR("0213", "Error, blob is read only."));
+ return (ret);
+ }
+ if (F_ISSET(data, DB_DBT_PARTIAL)) {
+ ret = EINVAL;
+ __db_errx(env, DB_STR("0214",
+ "Error, do not use DB_DBT_PARTIAL with DB_STREAM."));
+ return (ret);
+ }
+ if (offset < 0 ) {
+ ret = EINVAL;
+ __db_errx(env, DB_STR_A("0215",
+ "Error, invalid offset value: %lld", "%lld"),
+ (long long)offset);
+ return (ret);
+ }
+ /* Catch overflow: offset plus the data size must not wrap around. */
+ if (offset + (db_off_t)data->size < offset) {
+ ret = EINVAL;
+ __db_errx(env, DB_STR_A("0216",
+ "Error, this write will exceed the maximum blob size: %lu %lld",
+ "%lu %lld"), (u_long)data->size, (long long)offset);
+ return (ret);
+ }
+
+ ENV_ENTER(env, ip);
+ wflags = dbs->flags;
+ if (LF_ISSET(DB_STREAM_SYNC_WRITE))
+ wflags |= DB_FOP_SYNC_WRITE;
+ file_size = dbs->file_size;
+ if ((ret = __blob_file_write(dbs->dbc, dbs->fhp,
+ data, offset, dbs->blob_id, &file_size, wflags)) != 0)
+ goto err;
+ if (file_size != dbs->file_size) {
+ dbs->file_size = file_size;
+ if ((ret = __dbc_set_blob_size(dbs->dbc, dbs->file_size)) != 0)
+ goto err;
+ }
+err: ENV_LEAVE(env, ip);
+
+ return (ret);
+}
diff --git a/src/blob/blob_util.c b/src/blob/blob_util.c
new file mode 100644
index 00000000..b2e3474b
--- /dev/null
+++ b/src/blob/blob_util.c
@@ -0,0 +1,1189 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2013, 2015 Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_verify.h"
+#include "dbinc/db_am.h"
+#include "dbinc/blob.h"
+#include "dbinc/fop.h"
+#include "dbinc/txn.h"
+#include "dbinc_auto/sequence_ext.h"
+
+static int __blob_open_meta_db __P((
+ DB *, DB_TXN *, DB **, DB_SEQUENCE **, int, int));
+static int __blob_clean_dir
+ __P((ENV *, DB_TXN *, const char *, const char *, int));
+static int __blob_copy_dir __P((DB *, const char *, const char *));
+
+#define BLOB_ID_KEY "blob_id"
+#define BLOB_SEQ_DB_NAME "blob_id_seq"
+#define BLOB_DIR_ID_KEY "blob_dir_id"
+#define BLOB_DIR_SEQ_DB_NAME "blob_dir_id_seq"
+
+/*
+ * __blob_make_sub_dir --
+ * Build the "<file dir>/[<subdb dir>/]" subdirectory name for the given
+ * database file and subdatabase ids; *blob_sub_dir stays NULL if both are 0.
+ *
+ * PUBLIC: int __blob_make_sub_dir __P((ENV *, char **, db_seq_t, db_seq_t));
+ */
+int
+__blob_make_sub_dir(env, blob_sub_dir, file_id, db_id)
+ ENV *env;
+ char **blob_sub_dir;
+ db_seq_t file_id;
+ db_seq_t db_id;
+{
+ char fname[MAX_BLOB_PATH_SZ], dname[MAX_BLOB_PATH_SZ];
+ int ret;
+ size_t len;
+
+ *blob_sub_dir = NULL;
+ memset(fname, 0, MAX_BLOB_PATH_SZ);
+ memset(dname, 0, MAX_BLOB_PATH_SZ);
+
+ if (db_id == 0 && file_id == 0)
+ return (0);
+
+ if (db_id < 0 || file_id < 0)
+ return (EINVAL);
+
+ /* The master db has no subdb id, so dname is only built for subdbs. */
+ if (db_id != 0)
+ (void)snprintf(dname, MAX_BLOB_PATH_SZ,
+ "%s%llu", BLOB_DIR_PREFIX, (unsigned long long)db_id);
+ (void)snprintf(fname, MAX_BLOB_PATH_SZ, "%s%llu",
+ BLOB_DIR_PREFIX, (unsigned long long)file_id);
+
+ len = strlen(fname) + (db_id ? strlen(dname) : 0) + 3;
+ if ((ret = __os_malloc(env, len, blob_sub_dir)) != 0)
+ goto err;
+ if (db_id != 0)
+ (void)sprintf(*blob_sub_dir, "%s%c%s%c", fname,
+ PATH_SEPARATOR[0], dname, PATH_SEPARATOR[0]);
+ else
+ (void)sprintf(*blob_sub_dir, "%s%c", fname, PATH_SEPARATOR[0]);
+
+ return (0);
+
+err: if (*blob_sub_dir != NULL)
+ __os_free(env, *blob_sub_dir);
+
+ return (ret);
+}
+
+/*
+ * __blob_make_meta_fname --
+ * Construct a (usually partial) path name of a blob metadata database file.
+ * It usually is relative to the environment home directory; only when a
+ * blob directory is configured and is an absolute path does this make a
+ * full path.
+ *
+ * With dbp set the name is prefixed with dbp->blob_sub_dir; with dbp NULL
+ * it is the environment-wide file. The caller must free *meta_fname.
+ *
+ * PUBLIC: int __blob_make_meta_fname __P((ENV *, DB *, char **));
+ */
+int
+__blob_make_meta_fname(env, dbp, meta_fname)
+ ENV *env;
+ DB *dbp;
+ char **meta_fname;
+{
+ char *fname, *sub_dir;
+ int ret;
+ size_t len;
+
+ fname = NULL;
+ len = strlen(BLOB_META_FILE_NAME) + 1;
+ if (dbp == NULL) {
+ sub_dir = "";
+ } else {
+ sub_dir = dbp->blob_sub_dir;
+ DB_ASSERT(env, sub_dir != NULL);
+ len += strlen(sub_dir);
+ }
+ if ((ret = __os_malloc(env, len, &fname)) != 0)
+ goto err;
+
+ snprintf(fname, len, "%s%s", sub_dir, BLOB_META_FILE_NAME);
+ *meta_fname = fname;
+ return (0);
+err:
+ if (fname != NULL)
+ __os_free(env, fname);
+ return (ret);
+}
+
+/*
+ * __blob_get_dir --
+ *	Get the root directory of this database's blob files (NULL if none).
+ *
+ * PUBLIC: int __blob_get_dir __P((DB *, char **));
+ */
+int
+__blob_get_dir(dbp, dirp)
+	DB *dbp;
+	char **dirp;
+{
+	char *blob_dir;
+	int ret;
+
+	/* The error path frees blob_dir if non-NULL, so it must start NULL. */
+	*dirp = blob_dir = NULL;
+	if (dbp->blob_sub_dir == NULL)
+		return (0);
+
+	/* Get the path of the blob directory for this database. */
+	if ((ret = __db_appname(dbp->env,
+	    DB_APP_BLOB, dbp->blob_sub_dir, NULL, &blob_dir)) != 0)
+		goto err;
+
+	*dirp = blob_dir;
+	return (0);
+
+err:	if (blob_dir != NULL)
+		__os_free(dbp->env, blob_dir);
+
+	return (ret);
+}
+
+/*
+ * __blob_open_meta_db --
+ *	Open or create a blob meta database and its sequence: either the
+ *	environment-wide db for blob directory ids (__db1) when file is set, or
+ *	the per-db db for blob ids (__db.bl001).  ENOENT if absent and !create.
+ */
+static int
+__blob_open_meta_db(dbp, txn, meta_db, seq, file, create)
+	DB *dbp;
+	DB_TXN *txn;
+	DB **meta_db;
+	DB_SEQUENCE **seq;
+	int file;
+	int create;
+{
+#ifdef HAVE_64BIT_TYPES
+	ENV *env;
+	DB *blob_meta_db;
+	DBT key;
+	DB_SEQUENCE *blob_seq;
+	DB_THREAD_INFO *ip;
+	DB_TXN *local_txn;
+	char *fullname, *fname, *dname, *path;
+	int free_paths, ret, use_txn;
+	u_int32_t flags;
+
+	flags = 0;
+	fullname = fname = NULL;
+	blob_meta_db = NULL;
+	blob_seq = NULL;
+	local_txn = NULL;
+	env = dbp->env;
+	free_paths = use_txn = 0;
+	memset(&key, 0, sizeof(DBT));
+
+	/*
+	 * Get the directory of the database, the meta db file name,
+	 * and the sub-db name.
+	 *	file: blob directory/meta-file-name
+	 *	else: blob directory/per-db-blobdir/meta-file-name
+	 */
+	if (file) {
+		key.data = BLOB_DIR_ID_KEY;
+		key.size = (u_int32_t)strlen(BLOB_DIR_ID_KEY);
+		dname = BLOB_DIR_SEQ_DB_NAME;
+		fname = BLOB_META_FILE_NAME;
+	} else {
+		key.data = BLOB_ID_KEY;
+		key.size = (u_int32_t)strlen(BLOB_ID_KEY);
+		dname = BLOB_SEQ_DB_NAME;
+		/* Any non-zero return is a failure, not just negative ones. */
+		if ((ret = __blob_make_meta_fname(env, dbp, &fname)) != 0)
+			goto err;
+		free_paths = 1;
+		if (dbp->open_flags & DB_THREAD)
+			LF_SET(DB_THREAD);
+	}
+
+	if ((ret = __db_appname(env, DB_APP_BLOB, fname, NULL, &fullname)) != 0)
+		goto err;
+
+	path = fullname;
+#ifdef DB_WIN32
+	/*
+	 * Absolute paths on windows can result in it creating a "C" or "D"
+	 * directory in the working directory.
+	 */
+	if (__os_abspath(path))
+		path += 2;
+#endif
+	/*
+	 * Create the blob, database file, and database name directories.  The
+	 * mkdir isn't logged, so __fop_create_recover needs to do this as well.
+	 */
+	if (__os_exists(env, fullname, NULL) != 0) {
+		if (!create) {
+			ret = ENOENT;
+			goto err;
+		} else if ((ret = __db_mkpath(env, path)) != 0)
+			goto err;
+	}
+
+	if ((ret = __db_create_internal(&blob_meta_db, env, 0)) != 0)
+		goto err;
+
+	if (create)
+		LF_SET(DB_CREATE);
+
+	/* Disable blobs in the blob meta databases themselves. */
+	if ((ret = __db_set_blob_threshold(blob_meta_db, 0, 0)) != 0)
+		goto err;
+
+	/*
+	 * To avoid concurrency issues, the blob meta database is
+	 * opened and operated on in a local transaction.  The one
+	 * exception is when the blob meta database is created in the
+	 * same txn as the parent db.  Then the blob meta database
+	 * shares the given txn, so if the txn is rolled back, the
+	 * creation of the blob meta database will also be rolled back.
+	 */
+	if (!file && IS_REAL_TXN(dbp->cur_txn))
+		use_txn = 1;
+
+	ENV_GET_THREAD_INFO(env, ip);
+	if (IS_REAL_TXN(txn)) {
+		if (use_txn)
+			local_txn = txn;
+		else {
+			if ((ret = __txn_begin(
+			    env, ip, NULL, &local_txn, DB_IGNORE_LEASE)) != 0)
+				goto err;
+		}
+	}
+	if ((ret = __db_open(blob_meta_db, ip, local_txn, fname, dname,
+	    DB_BTREE, flags | DB_INTERNAL_BLOB_DB, 0, PGNO_BASE_MD)) != 0)
+		goto err;
+
+	/* Open the sequence that holds the blob ids. */
+	if ((ret = db_sequence_create(&blob_seq, blob_meta_db, 0)) != 0)
+		goto err;
+
+	/* No-op if already initialized, 0 is an invalid value for blob ids. */
+	if ((ret = __seq_initial_value(blob_seq, 1)) != 0)
+		goto err;
+	if ((ret = __seq_open(blob_seq, local_txn, &key, flags)) != 0)
+		goto err;
+
+	if (local_txn != NULL && use_txn == 0 &&
+	    (ret = __txn_commit(local_txn, 0)) != 0) {
+		local_txn = NULL;
+		goto err;
+	}
+	__os_free(env, fullname);
+	if (free_paths)
+		__os_free(env, fname);
+	*meta_db = blob_meta_db;
+	*seq = blob_seq;
+	return (0);
+
+err:
+	if (fullname)
+		__os_free(env, fullname);
+	if (fname != NULL && free_paths)
+		__os_free(env, fname);
+	if (local_txn != NULL && use_txn == 0)
+		(void)__txn_abort(local_txn);
+	if (blob_seq != NULL)
+		(void)__seq_close(blob_seq, 0);
+	if (blob_meta_db != NULL)
+		(void)__db_close(blob_meta_db, NULL, 0);
+	return (ret);
+
+#else /*HAVE_64BIT_TYPES*/
+	__db_errx(dbp->env, DB_STR("0217",
+	    "library build did not include support for blobs"));
+	return (DB_OPNOTSUP);
+#endif
+}
+
+/*
+ * __blob_generate_dir_ids --
+ *
+ *	Generate a unique id used to create a blob directory for the database.
+ *	Only one argument is needed.  Files with one database and the master
+ *	database only need the file id; subdatabases inherit the file id from
+ *	the master, so they only need the subdatabase id.  A new id is drawn
+ *	from the sequence only when *id is 0 on entry.
+ *
+ * PUBLIC: int __blob_generate_dir_ids
+ * PUBLIC:	__P((DB *, DB_TXN *, db_seq_t *));
+ */
+int
+__blob_generate_dir_ids(dbp, txn, id)
+	DB *dbp;
+	DB_TXN *txn;
+	db_seq_t *id;
+{
+#ifdef HAVE_64BIT_TYPES
+	DB *blob_meta_db;
+	DB_SEQUENCE *blob_seq;
+	int ret;
+	u_int32_t flags;
+
+	flags = 0;
+	blob_meta_db = NULL;
+	blob_seq = NULL;
+
+	if ((ret = __blob_open_meta_db(
+	    dbp, txn, &blob_meta_db, &blob_seq, 1, 1)) != 0)
+		goto err;
+
+	if (IS_REAL_TXN(txn))
+		LF_SET(DB_AUTO_COMMIT | DB_TXN_NOSYNC);
+
+	DB_ASSERT(dbp->env, id != NULL);
+	if (*id == 0) {
+		if ((ret = __seq_get(blob_seq, 0, 1, id, flags)) != 0)
+			goto err;
+	}
+
+err:	if (blob_seq != NULL)
+		(void)__seq_close(blob_seq, 0);
+	if (blob_meta_db != NULL)
+		(void)__db_close(blob_meta_db, NULL, 0);
+	return (ret);
+#else /*HAVE_64BIT_TYPES*/
+	/* Quiet only the unused arguments: dbp is dereferenced just below. */
+	COMPQUIET(txn, NULL); COMPQUIET(id, NULL);
+	__db_errx(dbp->env, DB_STR("0218",
+	    "library build did not include support for blobs"));
+	return (DB_OPNOTSUP);
+#endif
+}
+
+/*
+ * __blob_generate_id --
+ * Return the next blob id from dbp's sequence, opening the meta db if needed.
+ *
+ * PUBLIC: int __blob_generate_id __P((DB *, DB_TXN *, db_seq_t *));
+ */
+int
+__blob_generate_id(dbp, txn, blob_id)
+ DB *dbp;
+ DB_TXN *txn;
+ db_seq_t *blob_id;
+{
+#ifdef HAVE_64BIT_TYPES
+ DB_TXN *ltxn;
+ int ret;
+ u_int32_t flags;
+ flags = DB_IGNORE_LEASE;
+ ltxn = NULL;
+
+ if (dbp->blob_seq == NULL) {
+ if ((ret = __blob_open_meta_db(dbp, txn,
+ &dbp->blob_meta_db, &dbp->blob_seq, 0, 1)) != 0)
+ goto err;
+ }
+
+ /*
+ * If this is the opening transaction of the database, use it instead
+ * of auto commit. Otherwise it could deadlock with the transaction
+ * used to open the blob meta database in __blob_open_meta_db.
+ */
+ if (IS_REAL_TXN(dbp->cur_txn))
+ ltxn = txn;
+
+ if (IS_REAL_TXN(txn) && ltxn == NULL)
+ LF_SET(DB_AUTO_COMMIT | DB_TXN_NOSYNC);
+
+ if ((ret = __seq_get(dbp->blob_seq, ltxn, 1, blob_id, flags)) != 0)
+ goto err;
+
+err: return (ret);
+#else /*HAVE_64BIT_TYPES*/
+ COMPQUIET(blob_id, NULL);
+ __db_errx(dbp->env, DB_STR("0219",
+ "library build did not include support for blobs"));
+ return (DB_OPNOTSUP);
+#endif
+}
+
+/*
+ * __blob_highest_id --
+ *	Return the highest id in the blob meta database in *id, or leave *id 0
+ *	(and return 0) when the blob meta database does not exist.
+ *
+ * PUBLIC: int __blob_highest_id __P((DB *, DB_TXN *, db_seq_t *));
+ */
+int
+__blob_highest_id(dbp, txn, id)
+	DB *dbp;
+	DB_TXN *txn;
+	db_seq_t *id;
+{
+#ifdef HAVE_64BIT_TYPES
+	int ret;
+
+	*id = 0;
+	if (dbp->blob_sub_dir == NULL) {
+		if ((ret = __blob_make_sub_dir(dbp->env, &dbp->blob_sub_dir,
+		    dbp->blob_file_id, dbp->blob_sdb_id)) != 0)
+			goto err;
+	}
+	if (dbp->blob_seq == NULL) {
+		ret = __blob_open_meta_db(dbp, txn,
+		    &dbp->blob_meta_db, &dbp->blob_seq, 0, 0);
+		/*
+		 * A missing blob meta database is not an error, but it leaves
+		 * no sequence to query: return success with *id still 0.
+		 */
+		if (ret == ENOENT)
+			ret = 0;
+		if (ret != 0 || dbp->blob_seq == NULL)
+			goto err;
+	}
+
+	ret = __seq_get(dbp->blob_seq, txn, 0, id, DB_CURRENT);
+err:
+	return (ret);
+#else /*HAVE_64BIT_TYPES*/
+	COMPQUIET(id, NULL);
+	__db_errx(dbp->env, DB_STR("0245",
+	    "library build did not include support for blobs"));
+	return (DB_OPNOTSUP);
+#endif
+}
+
+/*
+ * __blob_calculate_dirs
+ *
+ * Use a blob id to determine the path below the blob subdirectory in which
+ * the blob file is located, appending it at path + *len and advancing *len.
+ * Sets *depth to the number of levels; path must be large enough.
+ *
+ * PUBLIC: void __blob_calculate_dirs __P((db_seq_t, char *, int *, int *));
+ */
+void
+__blob_calculate_dirs(blob_id, path, len, depth)
+ db_seq_t blob_id;
+ char *path;
+ int *len;
+ int *depth;
+{
+ int i;
+ db_seq_t factor, tmp;
+
+ /* Count the levels; factor ends up as BLOB_DIR_ELEMS to that power. */
+ factor = 1;
+ for ((*depth) = 0, tmp = blob_id/BLOB_DIR_ELEMS;
+ tmp != 0; tmp = tmp/BLOB_DIR_ELEMS, (*depth)++)
+ factor *= BLOB_DIR_ELEMS;
+
+ for (i = (*depth); i > 0; i--) {
+ tmp = (blob_id / factor) % BLOB_DIR_ELEMS;
+ factor /= BLOB_DIR_ELEMS;
+ (*len) += sprintf(path + (*len),
+ "%03llu%c", (unsigned long long)tmp, PATH_SEPARATOR[0]);
+ }
+}
+
+/*
+ * __blob_id_to_path --
+ * Generate the file name and blob specific part of the path for a particular
+ * blob_id; pass the result to __db_appname for a fully qualified path.
+ * Ids below 1 return EINVAL. The caller must deallocate the path.
+ *
+ * PUBLIC: int __blob_id_to_path __P((ENV *, const char *, db_seq_t, char **));
+ */
+int
+__blob_id_to_path(env, blob_sub_dir, blob_id, ppath)
+ ENV *env;
+ const char *blob_sub_dir;
+ db_seq_t blob_id;
+ char **ppath;
+{
+ char *path, *tmp_path;
+ int depth, name_len, ret;
+ size_t len;
+
+ name_len = 0;
+ path = tmp_path = *ppath = NULL;
+
+ if (blob_id < 1) {
+ ret = EINVAL;
+ goto err;
+ }
+
+ len = MAX_BLOB_PATH_SZ + strlen(blob_sub_dir) + 1;
+ if ((ret = __os_malloc(env, len, &path)) != 0)
+ goto err;
+
+ memset(path, 0, len);
+ name_len += sprintf(path, "%s", blob_sub_dir);
+
+ __blob_calculate_dirs(blob_id, path, &name_len, &depth);
+
+ /*
+ * Populate the file name. Ensure there are 3 digits for each directory
+ * level (even if they are 0).
+ */
+ (void)sprintf(path + name_len, "%s%0*llu",
+ BLOB_FILE_PREFIX, (depth + 1) * 3, (unsigned long long)blob_id);
+
+ /* If this is the first file in the directory, ensure it exists. */
+ if (blob_id % BLOB_DIR_ELEMS == 0 && depth > 0) {
+ if ((ret = __db_appname(
+ env, DB_APP_BLOB, path, NULL, &tmp_path)) != 0 )
+ goto err;
+
+ if ((ret = __db_mkpath(env, tmp_path)) != 0) {
+ __db_errx(env, DB_STR("0221",
+ "Error creating blob directory."));
+ ret = EINVAL;
+ goto err;
+ }
+ __os_free(env, tmp_path);
+ }
+
+ *ppath = path;
+ return (0);
+
+err:
+ if (tmp_path != NULL)
+ __os_free(env, tmp_path);
+ if (path != NULL)
+ __os_free(env, path);
+
+ return (ret);
+}
+
+/*
+ * __blob_str_to_id
+ *
+ * Parse the leading decimal digits of *path into *id (0 if none) and move
+ * *path past them; EINVAL if the running value goes negative (overflow).
+ *
+ * PUBLIC: int __blob_str_to_id __P((ENV *, const char **, db_seq_t *));
+ */
+int
+__blob_str_to_id(env, path, id)
+ ENV *env;
+ const char **path;
+ db_seq_t *id;
+{
+ db_seq_t i;
+ const char *p;
+ char buf[2];
+
+ p = *path;
+ i = 10;
+ *id = 0;
+ buf[1] = '\0';
+ while (p[0] >= '0' && p[0] <= '9') {
+ *id *= i;
+ buf[0] = p[0];
+ *id += atoi(buf);
+ if (*id < 0) {
+ __db_errx(env, DB_STR("0246",
+ "Blob id integer overflow."));
+ return (EINVAL);
+ }
+ p++;
+ }
+ *path = p;
+ return (0);
+}
+
+/*
+ * __blob_path_to_dir_ids --
+ * Get the file and subdatabase ids from a path to a blob file or a path
+ * in the blob directory structure; ids not found are returned as 0.
+ * Skips the subdatabase directory id if sdb_id is NULL.
+ *
+ * PUBLIC: int __blob_path_to_dir_ids
+ * PUBLIC: __P((ENV *, const char *, db_seq_t *, db_seq_t *));
+ */
+int
+__blob_path_to_dir_ids(env, path, file_id, sdb_id)
+ ENV *env;
+ const char *path;
+ db_seq_t *file_id;
+ db_seq_t *sdb_id;
+{
+ int ret;
+ size_t len;
+ const char *p;
+
+ *file_id = 0;
+ if (sdb_id != NULL)
+ *sdb_id = 0;
+ ret = 0;
+ p = path;
+
+ /*
+ * Blob file and subdatabase directories look like __db###: find the
+ * first prefix followed by a digit (4 is presumably the prefix length).
+ */
+ len = strlen(path);
+ do {
+ p = strstr(p, BLOB_DIR_PREFIX);
+ if (p == NULL || p > (path + len + 4))
+ return (ret);
+ p += 4;
+ } while (p[0] < '0' || p[0] > '9');
+
+ /* The file id should be next in the path. */
+ if ((ret = __blob_str_to_id(env, &p, file_id)) != 0)
+ return (ret);
+
+ /* Quit now if a subdatabase argument was not passed. */
+ if (sdb_id == NULL)
+ return (ret);
+
+ p = strstr(p, BLOB_DIR_PREFIX);
+ /* It is okay for the path not to include a sdb_id. */
+ if (p == NULL || p > (path + 4 + len))
+ return (ret);
+
+ p += 4;
+ ret = __blob_str_to_id(env, &p, sdb_id);
+
+ return (ret);
+}
+
+/*
+ * __blob_salvage --
+ *
+ * Read size bytes at offset of a blob file into dbt during salvage; the DBT
+ * must already hold a buffer of size bytes. A short read returns EIO.
+ *
+ * PUBLIC: int __blob_salvage __P((ENV *, db_seq_t, off_t, size_t,
+ * PUBLIC: db_seq_t, db_seq_t, DBT *));
+ */
+int
+__blob_salvage(env, blob_id, offset, size, file_id, sdb_id, dbt)
+ ENV *env;
+ db_seq_t blob_id;
+ off_t offset;
+ size_t size;
+ db_seq_t file_id;
+ db_seq_t sdb_id;
+ DBT *dbt;
+{
+ DB_FH *fhp;
+ char *blob_sub_dir, *dir, *path;
+ int ret;
+ size_t bytes;
+
+ blob_sub_dir = dir = path = NULL;
+ fhp = NULL;
+
+ if (file_id == 0 && sdb_id == 0) {
+ ret = ENOENT;
+ goto err;
+ }
+
+ if ((ret = __blob_make_sub_dir(
+ env, &blob_sub_dir, file_id, sdb_id)) != 0)
+ goto err;
+
+ if ((ret = __blob_id_to_path(env, blob_sub_dir, blob_id, &dir)) != 0)
+ goto err;
+
+ if ((ret = __db_appname(env, DB_APP_BLOB, dir, NULL, &path)) != 0)
+ goto err;
+
+ if ((ret = __os_open(env, path, 0, DB_OSO_RDONLY, 0, &fhp)) != 0)
+ goto err;
+
+ if ((ret = __os_seek(env, fhp, 0, 0, offset)) != 0)
+ goto err;
+
+ if ((ret = __os_read(env, fhp, dbt->data, size, &bytes)) != 0)
+ goto err;
+
+ dbt->size = (u_int32_t)bytes;
+ if (bytes != size)
+ ret = EIO;
+
+err: if (fhp != NULL)
+ (void)__os_closehandle(env, fhp);
+ if (dir != NULL)
+ __os_free(env, dir);
+ if (path != NULL)
+ __os_free(env, path);
+ if (blob_sub_dir != NULL)
+ __os_free(env, blob_sub_dir);
+ return (ret);
+}
+
+/*
+ * __blob_vrfy --
+ *	Check that the blob file for blob_id exists and has the given size.
+ *	Returns 0 on a match and DB_VERIFY_BAD when any check fails.
+ *
+ * PUBLIC: int __blob_vrfy __P((ENV *, db_seq_t, off_t,
+ * PUBLIC:	db_seq_t, db_seq_t, db_pgno_t, u_int32_t));
+ */
+int
+__blob_vrfy(env, blob_id, blob_size, file_id, sdb_id, pgno, flags)
+	ENV *env;
+	db_seq_t blob_id;
+	off_t blob_size;
+	db_seq_t file_id;
+	db_seq_t sdb_id;
+	db_pgno_t pgno;
+	u_int32_t flags;
+{
+	DB_FH *fhp;
+	char *blob_sub_dir, *dir, *path;
+	int isdir, ret;
+	off_t actual_size;
+	u_int32_t mbytes, bytes;
+
+	blob_sub_dir = dir = path = NULL;
+	fhp = NULL;
+	isdir = 0;
+
+	if ((ret = __blob_make_sub_dir(
+	    env, &blob_sub_dir, file_id, sdb_id)) != 0)
+		goto err;
+	ret = DB_VERIFY_BAD;	/* Any failure below is a verify error. */
+	if (blob_sub_dir == NULL ||
+	    __blob_id_to_path(env, blob_sub_dir, blob_id, &dir) != 0) {
+		EPRINT((env, DB_STR_A("0222",
+		    "Page %lu: Error getting path to blob file for %llu",
+		    "%lu %llu"), (u_long)pgno, (unsigned long long)blob_id));
+		goto err;
+	}
+	if (__db_appname(env, DB_APP_BLOB, dir, NULL, &path) != 0) {
+		EPRINT((env, DB_STR_A("0223",
+		    "Page %lu: Error getting path to blob file for %llu",
+		    "%lu %llu"), (u_long)pgno, (unsigned long long)blob_id));
+		goto err;
+	}
+	if ((__os_exists(env, path, &isdir)) != 0 || isdir != 0) {
+		EPRINT((env, DB_STR_A("0224",
+		    "Page %lu: blob file does not exist at %s",
+		    "%lu %s"), (u_long)pgno, path));
+		goto err;
+	}
+	if (__os_open(env, path, 0, DB_OSO_RDONLY, 0, &fhp) != 0) {
+		EPRINT((env, DB_STR_A("0225",
+		    "Page %lu: Error opening blob file at %s",
+		    "%lu %s"), (u_long)pgno, path));
+		goto err;
+	}
+	if (__os_ioinfo(env, path, fhp, &mbytes, &bytes, NULL) != 0) {
+		EPRINT((env, DB_STR_A("0226",
+		    "Page %lu: Error getting blob file size at %s",
+		    "%lu %s"), (u_long)pgno, path));
+		goto err;
+	}
+
+	actual_size = ((off_t)mbytes * (off_t)MEGABYTE) + bytes;
+	if (blob_size != actual_size) {
+		EPRINT((env, DB_STR_A("0227",
+"Page %lu: blob file size does not match size in database record: %llu %llu",
+		    "%lu %llu %llu"), (u_long)pgno,
+		    (unsigned long long)actual_size,
+		    (unsigned long long)blob_size));
+		goto err;
+	}
+
+	ret = 0;
+
+err:	if (fhp != NULL)
+		(void)__os_closehandle(env, fhp);
+	if (dir != NULL)
+		__os_free(env, dir);
+	if (path != NULL)
+		__os_free(env, path);
+	if (blob_sub_dir != NULL)
+		__os_free(env, blob_sub_dir);
+	return (ret);
+}
+
+/*
+ * __blob_del_hierarchy --
+ *	Remove everything under the environment's blob directory.
+ *	Used by replication.
+ *
+ * PUBLIC: int __blob_del_hierarchy __P((ENV *));
+ */
+int
+__blob_del_hierarchy(env)
+	ENV *env;
+{
+	char *root;
+	int ret;
+
+	root = NULL;
+
+	/* Resolve the blob root, then empty it with no transaction. */
+	ret = __db_appname(env, DB_APP_BLOB, NULL, NULL, &root);
+	if (ret == 0)
+		ret = __blob_clean_dir(env, NULL, root, NULL, 0);
+
+	/* The path is released on both the success and the failure path. */
+	if (root != NULL)
+		__os_free(env, root);
+	return (ret);
+}
+
+/*
+ * __blob_del_all --
+ * Deletes all the blob files and meta databases in a database's blob
+ * directory. With istruncate set the meta databases and the top directory
+ * are kept. Directories are also kept when the delete is transactionally
+ * protected, since there is no current way to undo a directory delete in
+ * case the operation is aborted.
+ *
+ * PUBLIC: int __blob_del_all __P((DB *, DB_TXN *, int));
+ */
+int
+__blob_del_all(dbp, txn, istruncate)
+ DB *dbp;
+ DB_TXN *txn;
+ int istruncate;
+{
+#ifdef HAVE_64BIT_TYPES
+ ENV *env;
+ char *path;
+ int isdir, ret;
+
+ env = dbp->env;
+ path = NULL;
+ ret = 0;
+
+ if (dbp->blob_sub_dir == NULL) {
+ if ((ret = __blob_make_sub_dir(env, &dbp->blob_sub_dir,
+ dbp->blob_file_id, dbp->blob_sdb_id)) != 0)
+ goto err;
+ }
+
+ /* Do nothing if blobs are not enabled. */
+ if (dbp->blob_sub_dir == NULL ||
+ (dbp->blob_file_id == 0 && dbp->blob_sdb_id == 0))
+ goto err;
+
+ if ((ret = __blob_get_dir(dbp, &path)) != 0)
+ goto err;
+
+ /* Close the blob meta data databases, they are about to be deleted. */
+ if (!istruncate) {
+ if (dbp->blob_seq != NULL) {
+ if ((ret = __seq_close(dbp->blob_seq, 0)) != 0)
+ goto err;
+ dbp->blob_seq = NULL;
+ }
+ if (dbp->blob_meta_db != NULL) {
+ if ((ret =
+ __db_close(dbp->blob_meta_db, NULL, 0)) != 0)
+ goto err;
+ dbp->blob_meta_db = NULL;
+ }
+ }
+
+ /*
+ * The blob directory may not exist if blobs were enabled,
+ * but none were created. That case returns success (ret is 0 here).
+ */
+ if (__os_exists(env, path, &isdir) != 0)
+ goto err;
+
+ if ((ret = __blob_clean_dir(
+ env, txn, path, dbp->blob_sub_dir, istruncate)) != 0)
+ goto err;
+
+ if (!IS_REAL_TXN(txn) && !istruncate) {
+ if ((ret = __os_rmdir(env, path)) != 0)
+ goto err;
+ }
+
+err: if (path != NULL)
+ __os_free(env, path);
+ return (ret);
+
+#else /*HAVE_64BIT_TYPES*/
+ __db_errx(dbp->env, DB_STR("0220",
+ "library build did not include support for blobs"));
+ return (DB_OPNOTSUP);
+#endif
+
+}
+
+/*
+ * __blob_clean_dir --
+ * Delete all files in the given directory, and all files in all
+ * sub-directories. Blob meta databases are removed with __db_remove_int,
+ * or skipped when istruncate is set. Sub-directories are not removed if
+ * the operation is transactionally protected.
+ */
+static int
+__blob_clean_dir(env, txn, dir, subdir, istruncate)
+ ENV *env;
+ DB_TXN *txn;
+ const char *dir;
+ const char *subdir;
+ int istruncate;
+{
+ DB *meta;
+ DB_THREAD_INFO *ip;
+ char *blob_dir, **dirs, *fname, full_path[DB_MAXPATHLEN], *local_path;
+ int count, i, isdir, ret, t_ret;
+
+ count = 0;
+ dirs = NULL;
+ fname = NULL;
+ meta = NULL;
+
+ /* Get a list of all files in the directory; a missing one is fine. */
+ if ((ret = __os_dirlist(env, dir, 1, &dirs, &count)) != 0) {
+ if (ret == ENOENT)
+ ret = 0;
+ goto err;
+ }
+
+ for (i = 0; i < count; i++) {
+ (void)sprintf(full_path, "%s%c%s%c",
+ dir, PATH_SEPARATOR[0], dirs[i], '\0');
+
+ if (__os_exists(env, full_path, &isdir) != 0)
+ continue;
+
+ /* If it is a directory, clean it. Else remove the file. */
+ if (isdir) {
+ if ((ret = __blob_clean_dir(
+ env, txn, full_path, subdir, istruncate)) != 0)
+ goto err;
+ /* Delete the top directory. */
+ if (!IS_REAL_TXN(txn)) {
+ if ((ret = __os_rmdir(env, full_path)) != 0)
+ goto err;
+ }
+ } else if (strcmp(dirs[i], BLOB_META_FILE_NAME) == 0 ) {
+ /* Ignore the meta db when truncating. */
+ if (istruncate)
+ continue;
+ blob_dir = (env->dbenv->db_blob_dir != NULL ?
+ env->dbenv->db_blob_dir : BLOB_DEFAULT_DIR);
+ if ((fname = strstr(full_path, blob_dir)) == NULL)
+ goto err; /* NOTE(review): ret is still 0 here -- confirm. */
+ fname += strlen(blob_dir) + 1;
+ if ((ret = __db_create_internal(&meta, env, 0)) != 0)
+ goto err;
+ ENV_GET_THREAD_INFO(env, ip);
+ if ((ret = __db_remove_int(meta,
+ ip, txn, fname, NULL, 0)) != 0)
+ goto err;
+ /*
+ * Closing the local DB handle releases the transaction
+ * locks, but those have to remain until the
+ * transaction is resolved, so NULL the DB locker.
+ * See __env_dbremove_pp for more details.
+ */
+ if (IS_REAL_TXN(txn))
+ meta->locker = NULL;
+ if ((t_ret = __db_close(
+ meta, NULL, DB_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+ meta = NULL;
+ if (ret != 0)
+ goto err;
+ } else {
+ if (!IS_REAL_TXN(txn))
+ ret = __os_unlink(env, full_path, 0);
+ else {
+ local_path = (subdir == NULL ? full_path :
+ strstr(full_path, subdir));
+ if (local_path != NULL)
+ ret = __fop_remove(env, txn, NULL,
+ local_path, NULL, DB_APP_BLOB, 0);
+ }
+ if (ret != 0)
+ goto err;
+ }
+ }
+err: if (meta != NULL) {
+ if ((t_ret = __db_close(
+ meta, NULL, DB_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ if (dirs != NULL)
+ __os_dirfree(env, dirs, count);
+
+ return (ret);
+}
+
+/*
+ * __blob_copy_all --
+ * Back up the directory id database and this db's blob files under target.
+ *
+ * PUBLIC: int __blob_copy_all __P((DB*, const char *, u_int32_t));
+ */
+int __blob_copy_all(dbp, target, flags)
+ DB *dbp;
+ const char *target;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ char *blobdir, *fullname, *metafname, new_target[DB_MAXPATHLEN];
+ const char *path;
+ int ret;
+
+ env = dbp->env;
+ blobdir = NULL;
+ fullname = NULL;
+ metafname = NULL;
+ ret = 0;
+
+ /* Do nothing if blobs are not enabled. */
+ if (dbp->blob_sub_dir == NULL || dbp->blob_threshold == 0)
+ return (0);
+
+ /* Create the directory structure in the target directory. */
+ if (env->dbenv->db_blob_dir != NULL)
+ path = env->dbenv->db_blob_dir;
+ else
+ path = BLOB_DEFAULT_DIR;
+
+ /*
+ * Default blob directory will be maintained in the target
+ * directory only when it is backing up a single directory.
+ */
+ (void)snprintf(new_target, sizeof(new_target), "%s%c%s%c%c",
+ target, PATH_SEPARATOR[0], LF_ISSET(DB_BACKUP_SINGLE_DIR) ?
+ BLOB_DEFAULT_DIR : path, PATH_SEPARATOR[0], '\0');
+ path = new_target;
+#ifdef DB_WIN32
+ /*
+ * Absolute paths on windows can result in it creating a "C" or "D"
+ * directory in the working directory.
+ */
+ if (__os_abspath(path))
+ path += 2;
+#endif
+ if ((ret = __db_mkpath(env, path)) != 0)
+ goto err;
+
+ /* Copy the directory id database. */
+ if ((ret = __blob_make_meta_fname(env, NULL, &metafname)) != 0)
+ goto err;
+ if ((ret = __db_appname(env,
+ DB_APP_BLOB, metafname, NULL, &fullname)) != 0)
+ goto err;
+ path = fullname;
+ /* Remove env home from the full path of directory id database. */
+ if (!__os_abspath(fullname) &&
+ env->db_home != NULL && (env->db_home)[0] != '\0')
+ path += (strlen(env->db_home) + 1);
+ ENV_GET_THREAD_INFO(env, ip);
+
+ if ((ret = __db_dbbackup(
+ dbp->dbenv, ip, path, new_target, 0, 0, metafname)) != 0)
+ goto err;
+
+ if ((ret = __blob_get_dir(dbp, &blobdir)) != 0)
+ goto err;
+
+ /*
+ * The blob directory may not exist if blobs were enabled,
+ * but none were created. That case returns success (ret is 0 here).
+ */
+ if (__os_exists(env, blobdir, NULL) != 0)
+ goto err;
+
+ (void)sprintf(new_target + strlen(new_target),
+ "%s%c", dbp->blob_sub_dir, '\0');
+ if ((ret = __blob_copy_dir(dbp, blobdir, new_target)) != 0)
+ goto err;
+
+err: if (blobdir != NULL)
+ __os_free(env, blobdir);
+ if (metafname != NULL)
+ __os_free(env, metafname);
+ if (fullname != NULL)
+ __os_free(env, fullname);
+ return (ret);
+}
+
+/*
+ * __blob_copy_dir --
+ * Copy all files in the given directory, and all files in all
+ * sub-directories, to target; target is created first if needed.
+ */
+static int
+__blob_copy_dir(dbp, dir, target)
+ DB *dbp;
+ const char *dir;
+ const char *target;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ char **dirs, full_path[DB_MAXPATHLEN], new_target[DB_MAXPATHLEN];
+ int count, i, isdir, ret;
+
+ env = dbp->env;
+ count = 0;
+ dirs = NULL;
+
+ /* Create the directory structure in the target directory. */
+ if ((ret = __db_mkpath(env, target)) != 0)
+ goto err;
+
+ ENV_GET_THREAD_INFO(env, ip);
+ /* Get a list of all files in the directory. */
+ if ((ret = __os_dirlist(env, dir, 1, &dirs, &count)) != 0)
+ goto err;
+
+ for (i = 0; i < count; i++) {
+ (void)sprintf(full_path, "%s%c%s%c",
+ dir, PATH_SEPARATOR[0], dirs[i], '\0');
+
+ if (__os_exists(env, full_path, &isdir) != 0)
+ continue;
+
+ /*
+ * If it is a directory, copy the files in it.
+ * Else if it is the meta database, call __db_dbbackup, else
+ * copy the file.
+ */
+ if (isdir) {
+ (void)sprintf(new_target,
+ "%s%c%s%c%c", target, PATH_SEPARATOR[0],
+ dirs[i], PATH_SEPARATOR[0], '\0');
+ if ((ret = __blob_copy_dir(
+ dbp, full_path, new_target)) != 0)
+ goto err;
+ } else {
+ if (strcmp(dirs[i], BLOB_META_FILE_NAME) == 0) {
+ (void)sprintf(full_path, "%s%c%s%c",
+ dbp->blob_sub_dir,
+ PATH_SEPARATOR[0], dirs[i], '\0');
+ if ((ret = __db_dbbackup(dbp->dbenv, ip,
+ full_path, target, 0, 0,
+ BLOB_META_FILE_NAME)) != 0)
+ goto err;
+ } else {
+ if ((ret = backup_data_copy(
+ dbp->dbenv, dirs[i], dir, target, 0)) != 0)
+ goto err;
+ }
+ }
+ }
+
+err:
+ if (dirs != NULL)
+ __os_dirfree(env, dirs, count);
+ return (ret);
+}
diff --git a/src/btree/bt_compact.c b/src/btree/bt_compact.c
index b455ff23..be4c6b01 100644
--- a/src/btree/bt_compact.c
+++ b/src/btree/bt_compact.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -22,13 +22,16 @@ static int __bam_csearch __P((DBC *, DBT *, u_int32_t, int));
static int __bam_lock_tree __P((DBC *, EPG *, EPG *csp, u_int32_t, u_int32_t));
static int __bam_lock_subtree __P((DBC *, PAGE *, u_int32_t, u_int32_t));
static int __bam_merge __P((DBC *,
- DBC *, u_int32_t, DBT *, DB_COMPACT *,int *));
-static int __bam_merge_internal __P((DBC *, DBC *, int, DB_COMPACT *, int *));
+ DBC *, u_int32_t, DBT *, DB_COMPACT *, int *, int *));
+static int __bam_merge_internal __P((DBC *,
+ DBC *, int, DB_COMPACT *, int *, int *));
static int __bam_merge_pages __P((DBC *, DBC *, DB_COMPACT *));
-static int __bam_merge_records __P((DBC *, DBC*, u_int32_t, DB_COMPACT *));
-static int __bam_truncate_internal_overflow __P((DBC *, PAGE *, DB_COMPACT *));
+static int __bam_merge_records __P((DBC *,
+ DBC *, u_int32_t, DB_COMPACT *, int *));
+static int __bam_truncate_internal_overflow __P((DBC *,
+ PAGE *, DB_COMPACT *, int *));
static int __bam_truncate_root_page __P((DBC *,
- PAGE *, u_int32_t, DB_COMPACT *));
+ PAGE *, u_int32_t, DB_COMPACT *, int *));
#ifdef HAVE_FTRUNCATE
static int __bam_savekey __P((DBC *, int, DBT *));
@@ -145,13 +148,13 @@ __bam_csearch(dbc, start, sflag, level)
* PUBLIC: DBT *, DBT *, u_int32_t, int *, DB_COMPACT *, int *));
*/
int
-__bam_compact_int(dbc, start, stop, factor, spanp, c_data, donep)
+__bam_compact_int(dbc, start, stop, factor, spanp, c_data, isdonep)
DBC *dbc;
DBT *start, *stop;
u_int32_t factor;
int *spanp;
DB_COMPACT *c_data;
- int *donep;
+ int *isdonep;
{
BTREE_CURSOR *cp, *ncp;
DB *dbp;
@@ -168,7 +171,7 @@ __bam_compact_int(dbc, start, stop, factor, spanp, c_data, donep)
int check_dups, check_trunc, clear_root, do_commit, isdone;
int merged, next_p, pgs_done, ret, t_ret, tdone;
-#ifdef DEBUG
+#ifdef DEBUG_WOP
#define CTRACE(dbc, location, t, start, f) do { \
DBT __trace; \
DB_SET_DBT(__trace, t, strlen(t)); \
@@ -182,8 +185,8 @@ __bam_compact_int(dbc, start, stop, factor, spanp, c_data, donep)
CTRACE(dbc, location, __buf, start, f); \
} while (0)
#else
-#define CTRACE(dbc, location, t, start, f)
-#define PTRACE(dbc, location, p, start, f)
+#define CTRACE(dbc, location, t, start, f) NOP_STATEMENT
+#define PTRACE(dbc, location, p, start, f) NOP_STATEMENT
#endif
ndbc = NULL;
@@ -551,11 +554,10 @@ retry: pg = NULL;
if (ret != 0)
goto err1;
}
- pgs_done++;
- /* Get a fresh low numbered page. */
+ /* Try to swap to a lower numbered page. */
if ((ret = __db_exchange_page(dbc,
&cp->csp->page, ncp->csp->page,
- PGNO_INVALID, DB_EXCH_DEFAULT)) != 0)
+ PGNO_INVALID, DB_EXCH_DEFAULT, &pgs_done)) != 0)
goto err1;
if ((ret = __TLPUT(dbc, prev_lock)) != 0)
goto err1;
@@ -598,8 +600,8 @@ retry: pg = NULL;
merged = 0;
for (epg = cp->sp; epg != cp->csp; epg++) {
PTRACE(dbc, "PMerge", PGNO(epg->page), start, 0);
- if ((ret = __bam_merge_internal(dbc,
- ndbc, LEVEL(epg->page), c_data, &merged)) != 0)
+ if ((ret = __bam_merge_internal(dbc, ndbc,
+ LEVEL(epg->page), c_data, &merged, &pgs_done)) != 0)
break;
if (merged)
break;
@@ -627,7 +629,7 @@ retry: pg = NULL;
}
PTRACE(dbc, "SMerge", PGNO(cp->csp->page), start, 0);
- /* if we remove the next page, then we need its next locked */
+ /* If we remove the next page, then we need its next locked. */
npgno = NEXT_PGNO(ncp->csp->page);
if (npgno != PGNO_INVALID) {
TRY_LOCK2(dbc, ndbc, npgno,
@@ -637,9 +639,8 @@ retry: pg = NULL;
}
/*lint -e{794} */
if ((ret = __bam_merge(dbc,
- ndbc, factor, stop, c_data, &isdone)) != 0)
+ ndbc, factor, stop, c_data, &isdone, &pgs_done)) != 0)
goto err1;
- pgs_done++;
/*
* __bam_merge could have freed our stack if it
* deleted a page possibly collapsing the tree.
@@ -722,8 +723,8 @@ retry: pg = NULL;
/* Get a fresh low numbered page. */
pgno = PGNO(pg);
if ((ret = __db_exchange_page(dbc,
- &cp->csp->page, NULL,
- PGNO_INVALID, DB_EXCH_DEFAULT)) != 0)
+ &cp->csp->page, NULL, PGNO_INVALID,
+ DB_EXCH_DEFAULT, &pgs_done)) != 0)
goto err1;
if ((ret = __TLPUT(dbc, prev_lock)) != 0)
goto err1;
@@ -734,10 +735,7 @@ retry: pg = NULL;
LOCK_INIT(next_lock);
saved_pgno = PGNO_INVALID;
pg = cp->csp->page;
- if (pgno != PGNO(pg)) {
- pgs_done++;
- pgno = PGNO(pg);
- }
+ pgno = PGNO(pg);
}
/*
* If we are going to leave this parent commit
@@ -752,7 +750,7 @@ retry: pg = NULL;
goto next_page;
}
- /* If they have the same parent, just dup the cursor */
+ /* If they have the same parent, just dup the cursor. */
if (ndbc != NULL && (ret = __dbc_close(ndbc)) != 0)
goto err1;
if ((ret = __dbc_dup(dbc, &ndbc, DB_POSITION)) != 0)
@@ -842,17 +840,15 @@ retry: pg = NULL;
pgno = PGNO(pg);
/* Get a fresh low numbered page. */
if ((ret = __db_exchange_page(dbc, &cp->csp->page,
- npg, PGNO_INVALID, DB_EXCH_DEFAULT)) != 0)
+ npg, PGNO_INVALID,
+ DB_EXCH_DEFAULT, &pgs_done)) != 0)
goto err1;
if ((ret = __TLPUT(dbc, prev_lock)) != 0)
goto err1;
LOCK_INIT(prev_lock);
prev_pgno = PGNO_INVALID;
pg = cp->csp->page;
- if (pgno != PGNO(pg)) {
- pgs_done++;
- pgno = PGNO(pg);
- }
+ pgno = PGNO(pg);
}
c_data->compact_pages_examine++;
@@ -887,11 +883,9 @@ retry: pg = NULL;
*/
PTRACE(dbc, "Merge", PGNO(cp->csp->page), start, 0);
if ((ret = __bam_merge(dbc,
- ndbc, factor, stop, c_data, &isdone)) != 0)
+ ndbc, factor, stop, c_data, &isdone, &pgs_done)) != 0)
goto err1;
- pgs_done++;
-
if ((ret = __TLPUT(dbc, nnext_lock)) != 0)
goto err1;
LOCK_INIT(nnext_lock);
@@ -932,7 +926,7 @@ next_page:
pg = NULL;
if ((ret = __bam_stkrel(dbc, STK_PGONLY)) != 0)
goto err;
- if (npgno != PGNO_INVALID &&
+ if (npgno != PGNO_INVALID && !do_commit &&
(ret = __db_lget(dbc, 0, npgno, DB_LOCK_READ, 0, &next_lock)) != 0)
goto err;
if ((ret = __bam_stkrel(dbc, pgs_done == 0 ? STK_NOLOCK : 0)) != 0)
@@ -1010,9 +1004,6 @@ err: /*
if ((t_ret = __bam_stkrel(dbc, sflag)) != 0 && ret == 0)
ret = t_ret;
- if ((t_ret = __TLPUT(dbc, metalock)) != 0 && ret == 0)
- ret = t_ret;
-
if (pg != NULL && (t_ret =
__memp_fput(dbmp,
dbc->thread_info, pg, dbc->priority) != 0) && ret == 0)
@@ -1022,7 +1013,11 @@ err: /*
dbc->thread_info, npg, dbc->priority) != 0) && ret == 0)
ret = t_ret;
-out: *donep = isdone;
+out:
+ if ((t_ret = __TLPUT(dbc, metalock)) != 0 && ret == 0)
+ ret = t_ret;
+
+ *isdonep = isdone;
/* For OPD trees return if we did anything in the span variable. */
if (F_ISSET(dbc, DBC_OPD))
@@ -1035,12 +1030,13 @@ out: *donep = isdone;
* __bam_merge -- do actual merging of leaf pages.
*/
static int
-__bam_merge(dbc, ndbc, factor, stop, c_data, donep)
+__bam_merge(dbc, ndbc, factor, stop, c_data, isdonep, pgs_donep)
DBC *dbc, *ndbc;
u_int32_t factor;
DBT *stop;
DB_COMPACT *c_data;
- int *donep;
+ int *isdonep;
+ int *pgs_donep;
{
BTREE_CURSOR *cp, *ncp;
DB *dbp;
@@ -1064,9 +1060,9 @@ __bam_merge(dbc, ndbc, factor, stop, c_data, donep)
/* Find if the stopping point is on this page. */
if (stop != NULL && stop->size != 0) {
- if ((ret = __bam_compact_isdone(dbc, stop, npg, donep)) != 0)
+ if ((ret = __bam_compact_isdone(dbc, stop, npg, isdonep)) != 0)
return (ret);
- if (*donep)
+ if (*isdonep)
return (0);
}
@@ -1080,20 +1076,23 @@ __bam_merge(dbc, ndbc, factor, stop, c_data, donep)
ncp->csp[-1].indx == 0 && ncp->csp[-1].entries != 1) ||
(int)(P_FREESPACE(dbp, pg) -
((dbp->pgsize - P_OVERHEAD(dbp)) -
- P_FREESPACE(dbp, npg))) < (int)factor)
- ret = __bam_merge_records(dbc, ndbc, factor, c_data);
- else
+ P_FREESPACE(dbp, npg))) < (int)factor) {
+ ret = __bam_merge_records(dbc, ndbc, factor, c_data, pgs_donep);
+ } else {
/*lint -e{794} */
free_page: ret = __bam_merge_pages(dbc, ndbc, c_data);
+ (*pgs_donep)++;
+ }
return (ret);
}
static int
-__bam_merge_records(dbc, ndbc, factor, c_data)
+__bam_merge_records(dbc, ndbc, factor, c_data, pgs_donep)
DBC *dbc, *ndbc;
u_int32_t factor;
DB_COMPACT *c_data;
+ int *pgs_donep;
{
BINTERNAL *bi;
BKEYDATA *bk, *tmp_bk;
@@ -1126,8 +1125,8 @@ __bam_merge_records(dbc, ndbc, factor, c_data)
if (c_data->compact_truncate != PGNO_INVALID &&
PGNO(ncp->csp->page) > c_data->compact_truncate) {
/* Get a fresh low numbered page. */
- if ((ret = __db_exchange_page(ndbc,
- &ncp->csp->page, pg, PGNO_INVALID, DB_EXCH_DEFAULT)) != 0)
+ if ((ret = __db_exchange_page(ndbc, &ncp->csp->page,
+ pg, PGNO_INVALID, DB_EXCH_DEFAULT, pgs_donep)) != 0)
goto err;
}
@@ -1197,6 +1196,7 @@ __bam_merge_records(dbc, ndbc, factor, c_data)
/* If we have hit the first record then there is nothing we can move. */
if (indx == 0)
goto done;
+ (*pgs_donep)++;
if (TYPE(pg) != P_LBTREE && TYPE(pg) != P_LDUP) {
if (indx == nent)
return (__bam_merge_pages(dbc, ndbc, c_data));
@@ -1237,7 +1237,8 @@ __bam_merge_records(dbc, ndbc, factor, c_data)
indx -= adj;
}
bk = GET_BKEYDATA(dbp, npg, indx);
- len = (B_TYPE(bk->type) != B_KEYDATA) ? BOVERFLOW_SIZE : bk->len;
+ len = (B_TYPE(bk->type) == B_KEYDATA) ? bk->len :
+ ((B_TYPE(bk->type) == B_BLOB) ? BBLOB_DSIZE : BOVERFLOW_SIZE);
if (indx != 0 && BINTERNAL_SIZE(len) >= pfree) {
if (F_ISSET(dbc, DBC_OPD)) {
if (dbp->dup_compare == __bam_defcmp)
@@ -1281,8 +1282,9 @@ noprefix:
} while (indx != 0 && ninp[indx] == ninp[indx - adj]);
bk = GET_BKEYDATA(dbp, npg, indx);
- len =
- (B_TYPE(bk->type) != B_KEYDATA) ? BOVERFLOW_SIZE : bk->len;
+ len = (B_TYPE(bk->type) == B_KEYDATA) ?
+ bk->len : ((B_TYPE(bk->type) == B_BLOB) ?
+ BBLOB_DSIZE : BOVERFLOW_SIZE);
}
/*
@@ -1346,6 +1348,13 @@ no_check: is_dup = first_dup = next_dup = 0;
BOVERFLOW_SIZE, &data, NULL)) != 0)
goto err;
break;
+ case B_BLOB:
+ data.size = BBLOB_SIZE;
+ data.data = bk;
+ if ((ret = __db_pitem(dbc, pg,
+ pind, BBLOB_SIZE, &data, NULL)) != 0)
+ goto err;
+ break;
default:
__db_errx(env, DB_STR_A("1022",
"Unknown record format, page %lu, indx 0",
@@ -1538,15 +1547,20 @@ err: return (ret);
/*
* __bam_merge_internal --
* Merge internal nodes of the tree.
+ *
+ * The first key of an internal page does not have a guaranteed-
+ * useful key.
*/
static int
-__bam_merge_internal(dbc, ndbc, level, c_data, merged)
+__bam_merge_internal(dbc, ndbc, level, c_data, merged, pgs_donep)
DBC *dbc, *ndbc;
int level;
DB_COMPACT *c_data;
int *merged;
+ int *pgs_donep;
{
BINTERNAL bi, *bip, *fip;
+ BOVERFLOW bo;
BTREE_CURSOR *cp, *ncp;
DB *dbp;
DBT data, hdr;
@@ -1579,7 +1593,6 @@ __bam_merge_internal(dbc, ndbc, level, c_data, merged)
dbmp = dbp->mpf;
cp = (BTREE_CURSOR *)dbc->internal;
ncp = (BTREE_CURSOR *)ndbc->internal;
- *merged = 0;
ret = 0;
/*
@@ -1608,11 +1621,11 @@ __bam_merge_internal(dbc, ndbc, level, c_data, merged)
* Check for overflow keys on both pages while we have
* them locked.
*/
- if ((ret =
- __bam_truncate_internal_overflow(dbc, pg, c_data)) != 0)
+ if ((ret = __bam_truncate_internal_overflow(dbc,
+ pg, c_data, pgs_donep)) != 0)
goto err;
- if ((ret =
- __bam_truncate_internal_overflow(dbc, npg, c_data)) != 0)
+ if ((ret = __bam_truncate_internal_overflow(dbc,
+ npg, c_data, pgs_donep)) != 0)
goto err;
}
@@ -1624,7 +1637,12 @@ __bam_merge_internal(dbc, ndbc, level, c_data, merged)
*/
fip = NULL;
if (TYPE(pg) == P_IBTREE) {
- /* See where we run out of space. */
+ /* See where we run out of space. This does not yet include
+ * whatever extra pages are needed if an overflow key is
+ * going to be added to one or more parent pages. It would be
+ * better to use as little of the key as is necessary, though
+ * the effort of determining that might not be worthwhile.
+ */
freespace = P_FREESPACE(dbp, pg);
/*
* The leftmost key of an internal page is not accurate.
@@ -1704,12 +1722,37 @@ fits: memset(&bi, 0, sizeof(bi));
if (fip == NULL) {
data.size = bip->len;
data.data = bip->data;
+ } else if (fip->type == B_OVERFLOW) {
+ DB_ASSERT(dbc->env,
+ fip->len == sizeof(BOVERFLOW));
+ /* Cast to "BOVERFLOW *" to calm down lint. */
+ memmove(&bo,
+ (BOVERFLOW *)fip->data, sizeof(BOVERFLOW));
+ memset(&hdr, 0, sizeof(hdr));
+ if ((ret = __db_goff(dbc, &hdr, bo.tlen,
+ bo.pgno, &hdr.data, &hdr.size)) == 0)
+ ret = __db_poff(dbc, &hdr, &bo.pgno);
+ if (hdr.data != NULL)
+ __os_free(dbp->env, hdr.data);
+ if (ret != 0)
+ return (ret);
+ data.size = sizeof(bo);
+ data.data = &bo;
+ } else if (fip->type == B_BLOB) {
+ /* Blobs should never appear as keys. */
+ DB_ASSERT(dbc->env,
+ !(fip->type == B_BLOB &&
+ TYPE(pg) == P_IBTREE));
} else {
data.size = fip->len;
data.data = fip->data;
}
bi.len = data.size;
- B_TSET(bi.type, bip->type);
+ /*
+ * Set bi.type according to the data's type, to ensure
+ * that it is B_OVERFLOW iff the data is a BOVERFLOW.
+ */
+ B_TSET(bi.type, fip == NULL ? bip->type : fip->type);
bi.pgno = bip->pgno;
bi.nrecs = bip->nrecs;
hdr.data = &bi;
@@ -1750,7 +1793,12 @@ fits: memset(&bi, 0, sizeof(bi));
if ((ret = __db_pitem(dbc, pg, pind, size, &hdr, &data)) != 0)
goto err;
pind++;
- if (fip != NULL) {
+ /* add bip test so fortify does not complain */
+ if (fip != NULL && bip != NULL) {
+ if (B_TYPE(bip->type) == B_OVERFLOW &&
+ (ret = __db_doff(dbc,
+ ((BOVERFLOW *)bip->data)->pgno)) != 0)
+ goto err;
/* reset size to be for the record being deleted. */
size = BINTERNAL_SIZE(bip->len);
fip = NULL;
@@ -1848,14 +1896,14 @@ fits: memset(&bi, 0, sizeof(bi));
PGNO(npg) > c_data->compact_truncate &&
ncp->csp != ncp->sp) {
if ((ret = __db_exchange_page(ndbc, &ncp->csp->page,
- pg, PGNO_INVALID, DB_EXCH_DEFAULT)) != 0)
+ pg, PGNO_INVALID, DB_EXCH_DEFAULT, pgs_donep)) != 0)
goto err;
}
if (c_data->compact_truncate != PGNO_INVALID &&
PGNO(pg) > c_data->compact_truncate && cp->csp != cp->sp) {
if ((ret = __db_exchange_page(dbc, &cp->csp->page,
ncp->csp->page,
- PGNO_INVALID, DB_EXCH_DEFAULT)) != 0)
+ PGNO_INVALID, DB_EXCH_DEFAULT, pgs_donep)) != 0)
goto err;
}
}
@@ -1875,13 +1923,13 @@ err: cp->csp = save_csp;
* We may or may not have a write lock on this page.
*/
static int
-__bam_compact_dups(dbc, ppg, factor, have_lock, c_data, donep)
+__bam_compact_dups(dbc, ppg, factor, have_lock, c_data, pgs_donep)
DBC *dbc;
PAGE **ppg;
u_int32_t factor;
int have_lock;
DB_COMPACT *c_data;
- int *donep;
+ int *pgs_donep;
{
BOVERFLOW *bo;
BTREE_CURSOR *cp;
@@ -1896,15 +1944,19 @@ __bam_compact_dups(dbc, ppg, factor, have_lock, c_data, donep)
DB_ASSERT(NULL, dbc != NULL);
dbp = dbc->dbp;
dbmp = dbp->mpf;
+ /* XXX Don't reserve any free bytes (Force 100% fillfactor) in OPD trees
+ * to ensure forward progress.
+ */
+ factor = 0;
cp = (BTREE_CURSOR *)dbc->internal;
for (i = 0; i < NUM_ENT(*ppg); i++) {
bo = GET_BOVERFLOW(dbp, *ppg, i);
- if (B_TYPE(bo->type) == B_KEYDATA)
+ if (B_TYPE(bo->type) == B_KEYDATA ||
+ B_TYPE(bo->type) == B_BLOB)
continue;
c_data->compact_pages_examine++;
if (bo->pgno > c_data->compact_truncate) {
- (*donep)++;
if (!have_lock) {
/*
* The caller should have the page at
@@ -1925,8 +1977,9 @@ __bam_compact_dups(dbc, ppg, factor, have_lock, c_data, donep)
dbc->txn, DB_MPOOL_DIRTY, ppg)) != 0)
goto err;
}
+ pgno = bo->pgno;
if ((ret = __bam_truncate_root_page(dbc,
- *ppg, i, c_data)) != 0)
+ *ppg, i, c_data, pgs_donep)) != 0)
goto err;
/* Just in case it should move. Could it? */
bo = GET_BOVERFLOW(dbp, *ppg, i);
@@ -1934,13 +1987,13 @@ __bam_compact_dups(dbc, ppg, factor, have_lock, c_data, donep)
if (B_TYPE(bo->type) == B_OVERFLOW) {
if ((ret = __db_truncate_overflow(dbc,
- bo->pgno, have_lock ? NULL : ppg, c_data)) != 0)
+ bo->pgno, have_lock ? NULL : ppg,
+ c_data, pgs_donep)) != 0)
goto err;
- (*donep)++;
continue;
}
if ((ret = __bam_compact_opd(dbc, bo->pgno,
- have_lock ? NULL : ppg, factor, c_data, donep)) != 0)
+ have_lock ? NULL : ppg, factor, c_data, pgs_donep)) != 0)
goto err;
}
@@ -1955,13 +2008,13 @@ err:
* PUBLIC: db_pgno_t, PAGE **, u_int32_t, DB_COMPACT *, int *));
*/
int
-__bam_compact_opd(dbc, root_pgno, ppg, factor, c_data, donep)
+__bam_compact_opd(dbc, root_pgno, ppg, factor, c_data, pgs_donep)
DBC *dbc;
db_pgno_t root_pgno;
PAGE **ppg;
u_int32_t factor;
DB_COMPACT *c_data;
- int *donep;
+ int *pgs_donep;
{
BTREE_CURSOR *cp;
DBC *opd;
@@ -2021,7 +2074,7 @@ __bam_compact_opd(dbc, root_pgno, ppg, factor, c_data, donep)
NULL, factor, &span, c_data, &isdone)) != 0)
break;
/* For OPD the number of pages dirtied is returned in span. */
- *donep += span;
+ *pgs_donep += span;
} while (!isdone);
if (start.data != NULL)
@@ -2041,11 +2094,12 @@ done:
* The page is reference by the pg/indx passed in.
*/
static int
-__bam_truncate_root_page(dbc, pg, indx, c_data)
+__bam_truncate_root_page(dbc, pg, indx, c_data, pgs_donep)
DBC *dbc;
PAGE *pg;
u_int32_t indx;
DB_COMPACT *c_data;
+ int *pgs_donep;
{
BINTERNAL *bi;
BOVERFLOW *bo;
@@ -2053,8 +2107,8 @@ __bam_truncate_root_page(dbc, pg, indx, c_data)
db_pgno_t *pgnop;
u_int32_t tlen;
- COMPQUIET(c_data, NULL);
COMPQUIET(bo, NULL);
+ COMPQUIET(c_data, NULL);
dbp = dbc->dbp;
if (TYPE(pg) == P_IBTREE) {
bi = GET_BINTERNAL(dbp, pg, indx);
@@ -2075,7 +2129,7 @@ __bam_truncate_root_page(dbc, pg, indx, c_data)
DB_ASSERT(dbp->env, IS_DIRTY(pg));
- return (__db_truncate_root(dbc, pg, indx, pgnop, tlen));
+ return (__db_truncate_root(dbc, pg, indx, pgnop, tlen, pgs_donep));
}
/*
@@ -2086,10 +2140,11 @@ __bam_truncate_root_page(dbc, pg, indx, c_data)
* nodes they will get copied adding pages to the database.
*/
static int
-__bam_truncate_internal_overflow(dbc, page, c_data)
+__bam_truncate_internal_overflow(dbc, page, c_data, pgs_donep)
DBC *dbc;
PAGE *page;
DB_COMPACT *c_data;
+ int *pgs_donep;
{
BINTERNAL *bi;
BOVERFLOW *bo;
@@ -2104,10 +2159,11 @@ __bam_truncate_internal_overflow(dbc, page, c_data)
continue;
bo = (BOVERFLOW *)(bi->data);
if (bo->pgno > c_data->compact_truncate && (ret =
- __bam_truncate_root_page(dbc, page, indx, c_data)) != 0)
+ __bam_truncate_root_page(dbc, page,
+ indx, c_data, pgs_donep)) != 0)
break;
- if ((ret = __db_truncate_overflow(
- dbc, bo->pgno, NULL, c_data)) != 0)
+ if ((ret = __db_truncate_overflow(dbc,
+ bo->pgno, NULL, c_data, pgs_donep)) != 0)
break;
}
return (ret);
@@ -2142,7 +2198,7 @@ __bam_compact_isdone(dbc, stop, pg, isdone)
} else {
DB_ASSERT(dbc->dbp->env, TYPE(pg) == P_LBTREE);
if ((ret = __bam_cmp(dbc, stop, pg, 0,
- t->bt_compare, &cmp)) != 0)
+ t->bt_compare, &cmp, NULL)) != 0)
return (ret);
*isdone = cmp <= 0;
@@ -2328,7 +2384,7 @@ __bam_savekey(dbc, next, start)
if (len == 0) {
no_key: __db_errx(env, DB_STR("1023",
"Compact cannot handle zero length key"));
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
}
} else {
@@ -2360,14 +2416,15 @@ retry: return (DB_LOCK_NOTGRANTED);
* Find high numbered pages in the internal nodes of a tree and
* swap them for lower numbered pages.
* PUBLIC: int __bam_truncate_ipages __P((DB *,
- * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DB_COMPACT *));
+ * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DB_COMPACT *, int *));
*/
int
-__bam_truncate_ipages(dbp, ip, txn, c_data)
+__bam_truncate_ipages(dbp, ip, txn, c_data, pgs_donep)
DB *dbp;
DB_THREAD_INFO *ip;
DB_TXN *txn;
DB_COMPACT *c_data;
+ int *pgs_donep;
{
BTMETA *meta;
BTREE *bt;
@@ -2480,8 +2537,9 @@ new_txn:
pgno = PGNO(cp->csp->page);
if (pgno > c_data->compact_truncate) {
- if ((ret = __db_exchange_page(dbc, &cp->csp->page,
- NULL, PGNO_INVALID, DB_EXCH_DEFAULT)) != 0)
+ if ((ret = __db_exchange_page(dbc,
+ &cp->csp->page, NULL, PGNO_INVALID,
+ DB_EXCH_DEFAULT, pgs_donep)) != 0)
goto err;
}
@@ -2561,7 +2619,8 @@ again: if (F_ISSET(dbp, DB_AM_SUBDB) &&
}
if (PGNO(meta) > c_data->compact_truncate) {
dbmeta = (DBMETA *)meta;
- ret = __db_move_metadata(dbc, &dbmeta, c_data);
+ ret = __db_move_metadata(dbc,
+ &dbmeta, c_data, pgs_donep);
meta = (BTMETA *)dbmeta;
if (ret != 0)
goto err;
@@ -2583,8 +2642,8 @@ again: if (F_ISSET(dbp, DB_AM_SUBDB) &&
* page latch is released.
*/
++dbp->mpf->mfp->revision;
- if ((ret = __db_exchange_page(dbc,
- &root, NULL, PGNO_INVALID, DB_EXCH_FREE)) != 0)
+ if ((ret = __db_exchange_page(dbc, &root, NULL,
+ PGNO_INVALID, DB_EXCH_FREE, pgs_donep)) != 0)
goto err;
if (PGNO(root) == bt->bt_root)
goto err;
diff --git a/src/btree/bt_compare.c b/src/btree/bt_compare.c
index 5c009071..8923c5fa 100644
--- a/src/btree/bt_compare.c
+++ b/src/btree/bt_compare.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright (c) 1990, 1993, 1994, 1995, 1996
@@ -49,27 +49,39 @@
/*
* __bam_cmp --
- * Compare a key to a given record.
+ * Compare a key to a given record. We always start the comparison
+ * at an offset and update the offset with longest matching count
+ * after the comparison.
*
* PUBLIC: int __bam_cmp __P((DBC *, const DBT *, PAGE *, u_int32_t,
- * PUBLIC: int (*)(DB *, const DBT *, const DBT *), int *));
+ * PUBLIC: int (*)(DB *, const DBT *, const DBT *, size_t *),
+ * PUBLIC: int *, size_t *));
*/
int
-__bam_cmp(dbc, dbt, h, indx, func, cmpp)
+__bam_cmp(dbc, dbt, h, indx, func, cmpp, locp)
DBC *dbc;
const DBT *dbt;
PAGE *h;
u_int32_t indx;
- int (*func)__P((DB *, const DBT *, const DBT *));
+ int (*func)__P((DB *, const DBT *, const DBT *, size_t *));
int *cmpp;
+ size_t *locp;
{
+ BBLOB bl;
BINTERNAL *bi;
BKEYDATA *bk;
BOVERFLOW *bo;
DB *dbp;
DBT pg_dbt;
+ off_t blob_size;
+ int ret;
+ db_seq_t blob_id;
dbp = dbc->dbp;
+ ret = 0;
+
+ /* Assert that the func is non-Null. */
+ DB_ASSERT(dbp->env, func != NULL);
/*
* Returns:
@@ -91,11 +103,49 @@ __bam_cmp(dbc, dbt, h, indx, func, cmpp)
bk = GET_BKEYDATA(dbp, h, indx);
if (B_TYPE(bk->type) == B_OVERFLOW)
bo = (BOVERFLOW *)bk;
- else {
+ else if (B_TYPE(bk->type) == B_BLOB) {
+ /*
+ * This is very slow, but since blobs cannot be
+ * in databases with duplicates or be keys, it should
+ * only happen when using DB_GET_BOTH or DB_SET.
+ */
+ memcpy(&bl, bk, BBLOB_SIZE);
+ memset(&pg_dbt, 0, sizeof(DBT));
+ GET_BLOB_SIZE(dbc->env, bl, blob_size, ret);
+ if (ret != 0)
+ return (ret);
+ if (blob_size > UINT32_MAX)
+ pg_dbt.size = UINT32_MAX;
+ else
+ pg_dbt.size = (u_int32_t)blob_size;
+ blob_id = (db_seq_t)bl.id;
+ pg_dbt.flags = DB_DBT_USERMEM;
+ if ((ret = __os_malloc(
+ dbc->env, pg_dbt.size, &pg_dbt.data)) != 0)
+ return (ret);
+ pg_dbt.ulen = pg_dbt.size;
+ if ((ret = __blob_get(dbc,
+ &pg_dbt, blob_id, blob_size, NULL, NULL)) != 0) {
+ __os_free(dbc->env, pg_dbt.data);
+ return (ret);
+ }
+ *cmpp = func(dbp, dbt, &pg_dbt, locp);
+ /*
+ * There is no way to directly compare a blob file that
+ * is greater in size than UINT32_MAX, so instead we
+ * compare the data up to UINT32_MAX, and if they are
+ * equal return that the blob is larger, since it is
+ * longer than the input data.
+ */
+ if (*cmpp == 0 && (blob_size > UINT32_MAX))
+ *cmpp = -1;
+ __os_free(dbc->env, pg_dbt.data);
+ return (0);
+ } else {
pg_dbt.app_data = NULL;
pg_dbt.data = bk->data;
pg_dbt.size = bk->len;
- *cmpp = func(dbp, dbt, &pg_dbt);
+ *cmpp = func(dbp, dbt, &pg_dbt, locp);
return (0);
}
break;
@@ -123,13 +173,14 @@ __bam_cmp(dbc, dbt, h, indx, func, cmpp)
}
bi = GET_BINTERNAL(dbp, h, indx);
- if (B_TYPE(bi->type) == B_OVERFLOW)
+ if (B_TYPE(bi->type) == B_OVERFLOW) {
+ DB_ASSERT(dbp->env, bi->len == BOVERFLOW_SIZE);
bo = (BOVERFLOW *)(bi->data);
- else {
+ } else {
pg_dbt.app_data = NULL;
pg_dbt.data = bi->data;
pg_dbt.size = bi->len;
- *cmpp = func(dbp, dbt, &pg_dbt);
+ *cmpp = func(dbp, dbt, &pg_dbt, locp);
return (0);
}
break;
@@ -141,42 +192,56 @@ __bam_cmp(dbc, dbt, h, indx, func, cmpp)
* Overflow.
*/
return (__db_moff(dbc, dbt, bo->pgno, bo->tlen,
- func == __bam_defcmp ? NULL : func, cmpp));
+ func == __bam_defcmp ? NULL : func, cmpp, locp));
}
/*
* __bam_defcmp --
- * Default comparison routine.
+ * Keep track of how far along in the two keys we find matching
+ * characters, and use that as an offset into the keys to begin
+ * future comparisons. This will save us the overhead of always
+ * starting the comparisons on the first character.
*
- * PUBLIC: int __bam_defcmp __P((DB *, const DBT *, const DBT *));
+ * PUBLIC: int __bam_defcmp __P((DB *, const DBT *, const DBT *, size_t *));
*/
int
-__bam_defcmp(dbp, a, b)
+__bam_defcmp(dbp, a, b, locp)
DB *dbp;
const DBT *a, *b;
+ size_t *locp;
{
- size_t len;
+ size_t len, i, start;
u_int8_t *p1, *p2;
COMPQUIET(dbp, NULL);
-
+ start = (locp == NULL ? 0 : *locp);
/*
* Returns:
* < 0 if a is < b
* = 0 if a is = b
* > 0 if a is > b
*
+ * We start the comparison from 'locp' and store the last match
+ * location in 'locp'.
+ *
* XXX
* If a size_t doesn't fit into a long, or if the difference between
* any two characters doesn't fit into an int, this routine can lose.
* What we need is a signed integral type that's guaranteed to be at
* least as large as a size_t, and there is no such thing.
*/
+ p1 = (u_int8_t *)a->data + start;
+ p2 = (u_int8_t *)b->data + start;
len = a->size > b->size ? b->size : a->size;
- for (p1 = a->data, p2 = b->data; len--; ++p1, ++p2)
- if (*p1 != *p2)
- return ((long)*p1 - (long)*p2);
- return ((long)a->size - (long)b->size);
+ for (i = start; i < len; ++p1, ++p2, ++i)
+ if (*p1 != *p2) {
+ if (locp != NULL)
+ *locp = i;
+ return (*p1 < *p2 ? -1 : 1);
+ }
+ if (locp != NULL)
+ *locp = len;
+ return (a->size == b->size ? 0 : (a->size < b->size ? -1 : 1));
}
/*
diff --git a/src/btree/bt_compress.c b/src/btree/bt_compress.c
index 3f293461..479e7248 100644
--- a/src/btree/bt_compress.c
+++ b/src/btree/bt_compress.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
#include "db_config.h"
@@ -352,16 +352,20 @@ __bam_compress_marshal_data(dbp, data, destbuf)
* __bam_compress_dupcmp --
* Duplicate comparison function for compressed BTrees.
*
- * PUBLIC: int __bam_compress_dupcmp __P((DB *, const DBT *, const DBT *));
+ * PUBLIC: int __bam_compress_dupcmp __P((DB *, const DBT *, const DBT *,
+ * PUBLIC: size_t *));
*/
int
-__bam_compress_dupcmp(db, a, b)
+__bam_compress_dupcmp(db, a, b, locp)
DB *db;
const DBT *a;
const DBT *b;
+ size_t *locp;
{
DBT dcmp_a, dcmp_b;
+ COMPQUIET(locp, NULL);
+
/* Decompress the initial data in a */
CMP_UNMARSHAL_DATA(a, &dcmp_a);
dcmp_a.ulen = 0;
@@ -380,7 +384,7 @@ __bam_compress_dupcmp(db, a, b)
/* Call the user's duplicate compare function */
return ((BTREE *)db->bt_internal)->
- compress_dup_compare(db, &dcmp_a, &dcmp_b);
+ compress_dup_compare(db, &dcmp_a, &dcmp_b, NULL);
}
/*
@@ -636,7 +640,7 @@ __bamc_next_decompress(dbc)
db = dbc->dbp;
if (cp->compcursor >= cp->compend)
- return (DB_NOTFOUND);
+ return (DBC_ERR(dbc, DB_NOTFOUND));
cp->prevKey = cp->currentKey;
cp->prevData = cp->currentData;
@@ -1251,7 +1255,7 @@ __bamc_compress_merge_delete(dbc, stream, countp)
* chunk, but don't delete any more
* entries.
*/
- bulk_ret = DB_NOTFOUND;
+ bulk_ret = DBC_ERR(dbc, DB_NOTFOUND);
moreStream = 0;
iSmallEnough = 0;
} else
@@ -1318,7 +1322,7 @@ __bamc_compress_merge_delete(dbc, stream, countp)
CMP_FREE_DBT(env, &nextk);
CMP_FREE_DBT(env, &nextc);
- return (ret != 0 ? ret : bulk_ret);
+ return (ret != 0 ? ret : DBC_ERR(dbc, bulk_ret));
}
/*
@@ -1389,7 +1393,7 @@ __bamc_compress_merge_delete_dups(dbc, stream, countp)
* in the database
*/
if (ifound == 0) {
- bulk_ret = DB_NOTFOUND;
+ bulk_ret = DBC_ERR(dbc, DB_NOTFOUND);
} else
++chunk_count;
break;
@@ -1463,7 +1467,7 @@ __bamc_compress_merge_delete_dups(dbc, stream, countp)
* current chunk, but don't delete
* any more entries.
*/
- bulk_ret = DB_NOTFOUND;
+ bulk_ret = DBC_ERR(dbc, DB_NOTFOUND);
moreStream = 0;
iSmallEnough = 0;
} else
@@ -1541,7 +1545,7 @@ __bamc_compress_merge_delete_dups(dbc, stream, countp)
CMP_FREE_DBT(env, &pdestdata);
CMP_FREE_DBT(env, &nextk);
- return (ret != 0 ? ret : bulk_ret);
+ return (ret != 0 ? ret : DBC_ERR(dbc, bulk_ret));
}
/******************************************************************************/
@@ -1641,8 +1645,8 @@ __bamc_compress_get_prev_dup(dbc, flags)
if ((ret = __bamc_compress_get_prev(dbc, flags)) != 0)
return (ret);
- if (t->bt_compare(dbp, cp->currentKey, &cp->del_key) != 0)
- return (DB_NOTFOUND);
+ if (t->bt_compare(dbp, cp->currentKey, &cp->del_key, NULL) != 0)
+ return (DBC_ERR(dbc, DB_NOTFOUND));
return (0);
}
@@ -1684,7 +1688,7 @@ __bamc_compress_get_prev_nodup(dbc, flags)
do
if ((ret = __bamc_compress_get_prev(dbc, flags)) != 0)
return (ret);
- while (t->bt_compare(dbp, cp->currentKey, &cp->del_key) == 0);
+ while (t->bt_compare(dbp, cp->currentKey, &cp->del_key, NULL) == 0);
return (0);
}
@@ -1702,7 +1706,7 @@ __bamc_compress_get_next(dbc, flags)
if (F_ISSET(cp, C_COMPRESS_DELETED)) {
if (cp->currentKey == 0)
- return (DB_NOTFOUND);
+ return (DBC_ERR(dbc, DB_NOTFOUND));
F_CLR(cp, C_COMPRESS_DELETED);
return (0);
} else if (cp->currentKey) {
@@ -1722,7 +1726,7 @@ __bamc_compress_get_next(dbc, flags)
* to the right place
*/
__bamc_compress_reset(dbc);
- return (DB_NOTFOUND);
+ return (DBC_ERR(dbc, DB_NOTFOUND));
} else if (ret != 0)
return (ret);
@@ -1753,17 +1757,18 @@ __bamc_compress_get_next_dup(dbc, key, flags)
* deleted entry.
*/
if (cp->currentKey == 0)
- return (DB_NOTFOUND);
+ return (DBC_ERR(dbc, DB_NOTFOUND));
F_CLR(cp, C_COMPRESS_DELETED);
- return (t->bt_compare(dbp,
- cp->currentKey, &cp->del_key) == 0 ? 0 : DB_NOTFOUND);
+ return (t->bt_compare(dbp, cp->currentKey,
+ &cp->del_key, NULL) == 0 ? 0 : DB_NOTFOUND);
} else if (cp->currentKey == 0)
return (EINVAL);
/* Check that the next entry has the same key as the previous entry */
ret = __bamc_next_decompress(dbc);
- if (ret == 0 && t->bt_compare(dbp, cp->currentKey, cp->prevKey) != 0)
- return (DB_NOTFOUND);
+ if (ret == 0 && t->bt_compare(dbp,
+ cp->currentKey, cp->prevKey, NULL) != 0)
+ return (DBC_ERR(dbc, DB_NOTFOUND));
if (ret != DB_NOTFOUND)
return (ret);
@@ -1783,7 +1788,7 @@ __bamc_compress_get_next_dup(dbc, key, flags)
* will end up pointing to the right place
*/
__bamc_compress_reset(dbc);
- return (DB_NOTFOUND);
+ return (DBC_ERR(dbc, DB_NOTFOUND));
} else if (ret != 0)
return (ret);
@@ -1791,8 +1796,8 @@ __bamc_compress_get_next_dup(dbc, key, flags)
return (ret);
/* Check the keys are the same */
- if (t->bt_compare(dbp, cp->currentKey, key) != 0)
- return (DB_NOTFOUND);
+ if (t->bt_compare(dbp, cp->currentKey, key, NULL) != 0)
+ return (DBC_ERR(dbc, DB_NOTFOUND));
return (0);
}
@@ -1828,7 +1833,7 @@ __bamc_compress_get_next_nodup(dbc, flags)
do
if ((ret = __bamc_compress_get_next(dbc, flags)) != 0)
return (ret);
- while (t->bt_compare(dbp, cp->currentKey, &cp->del_key) == 0);
+ while (t->bt_compare(dbp, cp->currentKey, &cp->del_key, NULL) == 0);
return (ret);
}
@@ -1888,14 +1893,14 @@ __bamc_compress_get_set(dbc, key, data, method, flags)
if (ret == 0 &&
__db_compare_both(dbp, cp->currentKey, 0, key, 0) != 0) {
/* We didn't find the key */
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
}
break;
case DB_GET_BOTH:
if (ret == 0 && (cmp != 0 || (!F_ISSET(dbp, DB_AM_DUPSORT) &&
- __bam_defcmp(dbp, cp->currentData, data) != 0))) {
+ __bam_defcmp(dbp, cp->currentData, data, NULL) != 0))) {
/* We didn't find the key/data pair */
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
}
break;
default:
@@ -1923,7 +1928,7 @@ __bamc_compress_get_bothc(dbc, data, flags)
position */
if (__db_compare_both(dbp, cp->currentKey,
cp->currentData, cp->currentKey, data) >= 0)
- return (DB_NOTFOUND);
+ return (DBC_ERR(dbc, DB_NOTFOUND));
cmp = 0;
/* Perform a linear search for the data in the current chunk */
@@ -1933,7 +1938,7 @@ __bamc_compress_get_bothc(dbc, data, flags)
continue;
if (ret == 0)
- return (cmp == 0 ? 0 : DB_NOTFOUND);
+ return (cmp == 0 ? 0 : DBC_ERR(dbc, DB_NOTFOUND));
if (ret != DB_NOTFOUND)
return (ret);
@@ -2277,7 +2282,7 @@ __bamc_compress_iput(dbc, key, data, flags)
switch (flags) {
case DB_CURRENT:
if (cp->currentKey == 0 || F_ISSET(cp, C_COMPRESS_DELETED)) {
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto end;
}
@@ -2290,7 +2295,7 @@ __bamc_compress_iput(dbc, key, data, flags)
if (F_ISSET(dbp, DB_AM_DUPSORT) &&
((BTREE *)dbp->bt_internal)->compress_dup_compare(
- dbp, cp->currentData, data) != 0) {
+ dbp, cp->currentData, data, NULL) != 0) {
__db_errx(env, DB_STR("1032",
"Existing data sorts differently from put data"));
ret = EINVAL;
@@ -2464,7 +2469,7 @@ __bamc_compress_idel(dbc, flags)
if (F_ISSET(cp, C_COMPRESS_DELETED))
return DB_KEYEMPTY;
if (cp->currentKey == 0)
- return DB_NOTFOUND;
+ return (DBC_ERR(dbc, DB_NOTFOUND));
if ((ret = __bam_compress_set_dbt(dbp, &cp->del_key,
cp->currentKey->data, cp->currentKey->size)) != 0)
@@ -3015,7 +3020,8 @@ __bam_compress_count(dbc, nkeysp, ndatap)
if (ret != 0)
goto err;
- if (t->bt_compare(dbp, cp_n->currentKey, cp_n->prevKey) != 0)
+ if (t->bt_compare(dbp,
+ cp_n->currentKey, cp_n->prevKey, NULL) != 0)
nkeys += 1;
}
diff --git a/src/btree/bt_conv.c b/src/btree/bt_conv.c
index 348ce5c2..85baeed8 100644
--- a/src/btree/bt_conv.c
+++ b/src/btree/bt_conv.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -88,7 +88,12 @@ __bam_mswap(env, pg)
SWAP32(p); /* re_len */
SWAP32(p); /* re_pad */
SWAP32(p); /* root */
- p += 92 * sizeof(u_int32_t); /* unused */
+ SWAP32(p); /* threshold */
+ SWAP32(p); /* file id lo */
+ SWAP32(p); /* file id hi */
+ SWAP32(p); /* sdb id lo */
+ SWAP32(p); /* sdb id hi */
+ p += 87 * sizeof(u_int32_t); /* unused */
SWAP32(p); /* crypto_magic */
return (0);
diff --git a/src/btree/bt_curadj.c b/src/btree/bt_curadj.c
index 78606009..d3398ee8 100644
--- a/src/btree/bt_curadj.c
+++ b/src/btree/bt_curadj.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c
index 860c31ce..d63b7373 100644
--- a/src/btree/bt_cursor.c
+++ b/src/btree/bt_cursor.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -938,7 +938,7 @@ __bamc_get(dbc, key, data, flags, pgnop)
case DB_CURRENT:
/* It's not possible to return a deleted record. */
if (F_ISSET(cp, C_DELETED)) {
- ret = DB_KEYEMPTY;
+ ret = DBC_ERR(dbc, DB_KEYEMPTY);
goto err;
}
@@ -979,7 +979,7 @@ __bamc_get(dbc, key, data, flags, pgnop)
goto err;
if (flags == DB_GET_BOTH) {
if (!exact) {
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
}
break;
@@ -1000,7 +1000,7 @@ __bamc_get(dbc, key, data, flags, pgnop)
dbc, PGNO_INVALID, key, flags, &exact)) != 0)
return (ret);
if (!exact) {
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
}
@@ -1047,7 +1047,7 @@ __bamc_get(dbc, key, data, flags, pgnop)
if ((ret = __bamc_next(dbc, 1, 0)) != 0)
goto err;
if (!IS_CUR_DUPLICATE(dbc, orig_pgno, orig_indx)) {
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
}
break;
@@ -1077,7 +1077,7 @@ __bamc_get(dbc, key, data, flags, pgnop)
if ((ret = __bamc_prev(dbc)) != 0)
goto err;
if (!IS_CUR_DUPLICATE(dbc, orig_pgno, orig_indx)) {
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
}
break;
@@ -1173,12 +1173,15 @@ __bam_bulk(dbc, data, flags)
DBT *data;
u_int32_t flags;
{
+ BBLOB bl;
BKEYDATA *bk;
BOVERFLOW *bo;
BTREE_CURSOR *cp;
PAGE *pg;
db_indx_t *inp, indx, pg_keyoff;
int32_t *endp, key_off, *offp, *saveoffp;
+ off_t blob_size;
+ db_seq_t blob_id;
u_int8_t *dbuf, *dp, *np;
u_int32_t key_size, pagesize, size, space;
int adj, is_key, need_pg, next_key, no_dup, rec_key, ret;
@@ -1279,6 +1282,7 @@ next_pg:
*/
if (is_key && pg_keyoff != inp[indx]) {
bk = GET_BKEYDATA(dbc->dbp, pg, indx);
+ DB_ASSERT(dbc->env, B_TYPE(bk->type) != B_BLOB);
if (B_TYPE(bk->type) == B_OVERFLOW) {
bo = (BOVERFLOW *)bk;
size = key_size = bo->tlen;
@@ -1403,6 +1407,31 @@ get_key_space:
*offp-- = (int32_t)(np - dbuf);
np += size;
*offp-- = (int32_t)size;
+ } else if (B_TYPE(bk->type) == B_BLOB) {
+ blob_size = 0;
+ blob_id = 0;
+ memcpy(&bl, bk, BBLOB_SIZE);
+ GET_BLOB_SIZE(dbc->env, bl, blob_size, ret);
+ if (ret != 0)
+ return (ret);
+ if (blob_size > UINT32_MAX) {
+ size = UINT32_MAX;
+ goto back_up;
+ }
+ size = (u_int32_t)blob_size;
+ if (size > space)
+ goto back_up;
+ blob_id = (db_seq_t)bl.id;
+ if ((ret = __blob_bulk(dbc, size, blob_id, np)) != 0)
+ return (ret);
+ if (is_key) {
+ *offp-- = (int32_t)key_off;
+ *offp-- = (int32_t)key_size;
+ }
+ space -= size;
+ *offp-- = (int32_t)(np - dbuf);
+ np += size;
+ *offp-- = (int32_t)size;
} else {
if (need_pg) {
dp = np;
@@ -1764,11 +1793,11 @@ __bam_getbothc(dbc, data)
*/
if ((ret = __bam_cmp(dbc, data, cp->page, cp->indx,
dbp->dup_compare == NULL ? __bam_defcmp : dbp->dup_compare,
- &cmp)) != 0)
+ &cmp, NULL)) != 0)
return (ret);
if (cmp <= 0)
- return (DB_NOTFOUND);
+ return (DBC_ERR(dbc, DB_NOTFOUND));
/* Discard the current page, we're going to do a full search. */
if ((ret = __memp_fput(mpf,
@@ -1791,7 +1820,7 @@ __bam_getbothc(dbc, data)
*/
if (cp->indx + P_INDX >= NUM_ENT(cp->page) ||
!IS_DUPLICATE(dbc, cp->indx, cp->indx + P_INDX))
- return (DB_NOTFOUND);
+ return (DBC_ERR(dbc, DB_NOTFOUND));
cp->indx += P_INDX;
return (__bam_getboth_finddatum(dbc, data, DB_GET_BOTH));
@@ -1842,7 +1871,7 @@ __bam_getlte(dbc, key, data)
/* Check if we're still on the correct key */
if ((ret = __bam_cmp(dbc, key, cp->page, cp->indx,
- ((BTREE*)dbp->bt_internal)->bt_compare, &exact)) != 0)
+ ((BTREE*)dbp->bt_internal)->bt_compare, &exact, NULL)) != 0)
goto end;
exact = (exact == 0);
}
@@ -1884,8 +1913,8 @@ __bam_getlte(dbc, key, data)
if (data != NULL) {
/* Check if we're still on the correct data */
if ((ret = __bam_cmp(
- dbc, data, ocp->page, ocp->indx,
- dbp->dup_compare, &exact)) != 0)
+ dbc, data, ocp->page, ocp->indx,
+ dbp->dup_compare, &exact, NULL)) != 0)
goto end;
exact = (exact == 0);
} else
@@ -1915,7 +1944,8 @@ __bam_getlte(dbc, key, data)
else {
/* Check if we're still on the correct data */
if ((ret = __bam_cmp(dbc, data, cp->page,
- cp->indx + O_INDX, dbp->dup_compare, &exact)) != 0)
+ cp->indx + O_INDX, dbp->dup_compare,
+ &exact, NULL)) != 0)
goto end;
exact = (exact == 0);
}
@@ -1982,7 +2012,7 @@ __bam_getboth_finddatum(dbc, data, flags)
if (!IS_CUR_DELETED(dbc)) {
if ((ret = __bam_cmp(
dbc, data, cp->page, cp->indx + O_INDX,
- __bam_defcmp, &cmp)) != 0)
+ __bam_defcmp, &cmp, NULL)) != 0)
return (ret);
if (cmp == 0)
return (0);
@@ -1992,7 +2022,8 @@ __bam_getboth_finddatum(dbc, data, flags)
!IS_DUPLICATE(dbc, cp->indx, cp->indx + P_INDX))
break;
}
- return (DB_NOTFOUND);
+
+ return (DBC_ERR(dbc, DB_NOTFOUND));
}
/*
@@ -2008,18 +2039,18 @@ __bam_getboth_finddatum(dbc, data, flags)
break;
if (base == (top - P_INDX)) {
if ((ret = __bam_cmp(dbc, data, cp->page,
- cp->indx + O_INDX, dbp->dup_compare, &cmp)) != 0)
+ cp->indx + O_INDX, dbp->dup_compare, &cmp, NULL)) != 0)
return (ret);
if (cmp == 0 || (cmp < 0 && flags == DB_GET_BOTH_RANGE))
return (0);
cp->indx = top;
- return DB_NOTFOUND;
+ return (DBC_ERR(dbc, DB_NOTFOUND));
}
for (lim = (top - base) / (db_indx_t)P_INDX; lim != 0; lim >>= 1) {
cp->indx = base + ((lim >> 1) * P_INDX);
if ((ret = __bam_cmp(dbc, data, cp->page,
- cp->indx + O_INDX, dbp->dup_compare, &cmp)) != 0)
+ cp->indx + O_INDX, dbp->dup_compare, &cmp, NULL)) != 0)
return (ret);
if (cmp == 0) {
/*
@@ -2039,7 +2070,7 @@ __bam_getboth_finddatum(dbc, data, flags)
/* No match found; if we're looking for an exact match, we're done. */
if (flags == DB_GET_BOTH)
- return (DB_NOTFOUND);
+ return (DBC_ERR(dbc, DB_NOTFOUND));
/*
* Base is the smallest index greater than the data item, may be zero
@@ -2049,7 +2080,7 @@ __bam_getboth_finddatum(dbc, data, flags)
cp->indx = base;
while (cp->indx < top && IS_CUR_DELETED(dbc))
cp->indx += P_INDX;
- return (cp->indx < top ? 0 : DB_NOTFOUND);
+ return (cp->indx < top ? 0 : DBC_ERR(dbc, DB_NOTFOUND));
}
/*
@@ -2082,7 +2113,7 @@ split: ret = stack = 0;
switch (flags) {
case DB_CURRENT:
if (F_ISSET(cp, C_DELETED))
- return (DB_NOTFOUND);
+ return (DBC_ERR(dbc, DB_NOTFOUND));
/* FALLTHROUGH */
case DB_AFTER:
case DB_BEFORE:
@@ -2206,7 +2237,8 @@ split: ret = stack = 0;
*/
for (;; cp->indx += P_INDX) {
if ((ret = __bam_cmp(dbc, data, cp->page,
- cp->indx + O_INDX, dbp->dup_compare, &cmp)) != 0)
+ cp->indx + O_INDX, dbp->dup_compare,
+ &cmp, NULL)) != 0)
goto err;
if (cmp < 0) {
iiop = DB_BEFORE;
@@ -2479,7 +2511,7 @@ __bamc_next(dbc, initial_move, deleted_okay)
*/
if (cp->indx >= NUM_ENT(cp->page)) {
if ((pgno = NEXT_PGNO(cp->page)) == PGNO_INVALID)
- return (DB_NOTFOUND);
+ return (DBC_ERR(dbc, DB_NOTFOUND));
ACQUIRE_CUR(dbc, lock_mode, pgno, 0, ret);
if (ret != 0)
@@ -2539,7 +2571,7 @@ __bamc_prev(dbc)
if (cp->indx == 0) {
if ((pgno =
PREV_PGNO(cp->page)) == PGNO_INVALID)
- return (DB_NOTFOUND);
+ return (DBC_ERR(dbc, DB_NOTFOUND));
ACQUIRE_CUR(dbc, lock_mode, pgno, 0, ret);
if (ret != 0)
@@ -2711,11 +2743,11 @@ __bamc_search(dbc, root_pgno, key, flags, exactp)
if (h->next_pgno == PGNO_INVALID) {
indx = NUM_ENT(h) - P_INDX;
if ((ret = __bam_cmp(dbc, key, h, indx,
- t->bt_compare, &cmp)) != 0)
+ t->bt_compare, &cmp, NULL)) != 0)
goto fast_miss;
if (cmp > 0) {
if (FLD_ISSET(sflags, SR_EXACT))
- return (DB_NOTFOUND);
+ return (DBC_ERR(dbc, DB_NOTFOUND));
else
indx += P_INDX;
}
@@ -2725,10 +2757,10 @@ __bamc_search(dbc, root_pgno, key, flags, exactp)
if (h->prev_pgno == PGNO_INVALID) {
indx = 0;
if ((ret = __bam_cmp(dbc, key, h, indx,
- t->bt_compare, &cmp)) != 0)
+ t->bt_compare, &cmp, NULL)) != 0)
goto fast_miss;
if (cmp < 0 && FLD_ISSET(sflags, SR_EXACT))
- return (DB_NOTFOUND);
+ return (DBC_ERR(dbc, DB_NOTFOUND));
if (cmp <= 0)
goto fast_hit;
}
@@ -2736,7 +2768,7 @@ __bamc_search(dbc, root_pgno, key, flags, exactp)
DB_BINARY_SEARCH_FOR(base, lim, NUM_ENT(h), P_INDX) {
DB_BINARY_SEARCH_INCR(indx, base, lim, P_INDX);
if ((ret = __bam_cmp(dbc, key, h, indx,
- t->bt_compare, &cmp)) != 0)
+ t->bt_compare, &cmp, NULL)) != 0)
goto fast_miss;
if (cmp == 0)
@@ -2752,7 +2784,7 @@ __bamc_search(dbc, root_pgno, key, flags, exactp)
indx = base;
if (indx > 0 && indx < NUM_ENT(h)) {
if (FLD_ISSET(sflags, SR_EXACT))
- return (DB_NOTFOUND);
+ return (DBC_ERR(dbc, DB_NOTFOUND));
goto fast_hit;
}
}
@@ -3068,7 +3100,7 @@ __bam_opd_exists(dbc, pgno)
if (NUM_ENT(h) == 0)
ret = 0;
else
- ret = DB_KEYEXIST;
+ ret = DBC_ERR(dbc, DB_KEYEXIST);
(void)__memp_fput(dbc->dbp->mpf, dbc->thread_info, h, dbc->priority);
diff --git a/src/btree/bt_delete.c b/src/btree/bt_delete.c
index 37496b3f..a1ccef71 100644
--- a/src/btree/bt_delete.c
+++ b/src/btree/bt_delete.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright (c) 1990, 1993, 1994, 1995, 1996
@@ -61,15 +61,18 @@ __bam_ditem(dbc, h, indx)
PAGE *h;
u_int32_t indx;
{
+ BBLOB bl;
BINTERNAL *bi;
BKEYDATA *bk;
DB *dbp;
+ db_seq_t blob_id;
u_int32_t nbytes;
int ret;
db_indx_t *inp;
dbp = dbc->dbp;
inp = P_INP(dbp, h);
+ ret = 0;
/* The page should already have been dirtied by our caller. */
DB_ASSERT(dbp->env, IS_DIRTY(h));
@@ -139,6 +142,13 @@ __bam_ditem(dbc, h, indx)
dbc, (GET_BOVERFLOW(dbp, h, indx))->pgno)) != 0)
return (ret);
break;
+ case B_BLOB:
+ nbytes = BBLOB_SIZE;
+ memcpy(&bl, bk, BBLOB_SIZE);
+ blob_id = (db_seq_t)bl.id;
+ if ((ret = __blob_del(dbc, blob_id)) != 0)
+ return (ret);
+ break;
case B_KEYDATA:
nbytes = BKEYDATA_SIZE(bk->len);
break;
@@ -241,7 +251,7 @@ __bam_dpages(dbc, use_top, flags)
* single item deleted, and the rest of the pages are to be removed.
*
* Recno always has a stack to the root and __bam_merge operations
- * may have unneeded items in the sack. We find the lowest page
+ * may have unneeded items in the stack. We find the lowest page
* in the stack that has more than one record in it and start there.
*/
ret = 0;
@@ -493,7 +503,9 @@ stop: done = 1;
/*
* __bam_pupdate --
- * Update parent key pointers up the tree.
+ * Update parent key pointers up the tree after putting a new key
+ * at the start of a leaf page.
+ *
*
* PUBLIC: int __bam_pupdate __P((DBC *, PAGE *));
*/
diff --git a/src/btree/bt_method.c b/src/btree/bt_method.c
index 5cf93d2e..2fb33be2 100644
--- a/src/btree/bt_method.c
+++ b/src/btree/bt_method.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -15,7 +15,7 @@
static int __bam_set_bt_minkey __P((DB *, u_int32_t));
static int __bam_get_bt_compare
- __P((DB *, int (**)(DB *, const DBT *, const DBT *)));
+ __P((DB *, int (**)(DB *, const DBT *, const DBT *, size_t *)));
static int __bam_get_bt_prefix
__P((DB *, size_t(**)(DB *, const DBT *, const DBT *)));
static int __bam_set_bt_prefix
@@ -233,7 +233,7 @@ incompat:
static int
__bam_get_bt_compare(dbp, funcp)
DB *dbp;
- int (**funcp) __P((DB *, const DBT *, const DBT *));
+ int (**funcp) __P((DB *, const DBT *, const DBT *, size_t *));
{
BTREE *t;
@@ -251,13 +251,13 @@ __bam_get_bt_compare(dbp, funcp)
* __bam_set_bt_compare --
* Set the comparison function.
*
- * PUBLIC: int __bam_set_bt_compare
- * PUBLIC: __P((DB *, int (*)(DB *, const DBT *, const DBT *)));
+ * PUBLIC: int __bam_set_bt_compare __P((DB *,
+ * PUBLIC: int (*)(DB *, const DBT *, const DBT *, size_t *)));
*/
int
__bam_set_bt_compare(dbp, func)
DB *dbp;
- int (*func) __P((DB *, const DBT *, const DBT *));
+ int (*func) __P((DB *, const DBT *, const DBT *, size_t *));
{
BTREE *t;
@@ -351,6 +351,13 @@ __bam_set_bt_compress(dbp, compress, decompress)
return (EINVAL);
}
+ /* Compression is incompatible with blob storage. */
+ if (dbp->blob_threshold > 0) {
+ __db_errx(dbp->env, DB_STR("1198",
+ "compression cannot be used with blobs enabled."));
+ return (EINVAL);
+ }
+
if (compress != 0 && decompress != 0) {
t->bt_compress = compress;
t->bt_decompress = decompress;
diff --git a/src/btree/bt_open.c b/src/btree/bt_open.c
index 7be141c1..46a866d0 100644
--- a/src/btree/bt_open.c
+++ b/src/btree/bt_open.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright (c) 1990, 1993, 1994, 1995, 1996
@@ -44,6 +44,7 @@
#include "db_config.h"
#include "db_int.h"
+#include "dbinc/blob.h"
#include "dbinc/crypto.h"
#include "dbinc/db_page.h"
#include "dbinc/db_swap.h"
@@ -119,6 +120,7 @@ __bam_metachk(dbp, name, btm)
int ret;
env = dbp->env;
+ ret = 0;
/*
* At this point, all we know is that the magic number is for a Btree.
@@ -136,6 +138,7 @@ __bam_metachk(dbp, name, btm)
return (DB_OLD_VERSION);
case 8:
case 9:
+ case 10:
break;
default:
__db_errx(env, DB_STR_A("1009",
@@ -269,6 +272,29 @@ __bam_metachk(dbp, name, btm)
/* Set the page size. */
dbp->pgsize = btm->dbmeta.pagesize;
+ dbp->blob_threshold = btm->blob_threshold;
+ GET_BLOB_FILE_ID(env, btm, dbp->blob_file_id, ret);
+ if (ret != 0)
+ return (ret);
+ GET_BLOB_SDB_ID(env, btm, dbp->blob_sdb_id, ret);
+ if (ret != 0)
+ return (ret);
+ /* Blob databases must be upgraded. */
+ if (vers == 9 && (dbp->blob_file_id != 0 || dbp->blob_sdb_id != 0)) {
+ __db_errx(env, DB_STR_A("1207",
+"%s: databases that support blobs must be upgraded.", "%s"),
+ name);
+ return (EINVAL);
+ }
+#ifndef HAVE_64BIT_TYPES
+ if (dbp->blob_file_id != 0 || dbp->blob_sdb_id != 0) {
+ __db_errx(env, DB_STR_A("1199",
+ "%s: blobs require 64 integer compiler support.", "%s"),
+ name);
+ return (DB_OPNOTSUP);
+ }
+#endif
+
/* Copy the file's ID. */
memcpy(dbp->fileid, btm->dbmeta.uid, DB_FILE_ID_LEN);
@@ -442,6 +468,9 @@ __bam_init_meta(dbp, meta, pgno, lsnp)
meta->minkey = t->bt_minkey;
meta->re_len = t->re_len;
meta->re_pad = (u_int32_t)t->re_pad;
+ meta->blob_threshold = dbp->blob_threshold;
+ SET_BLOB_META_FILE_ID(meta, dbp->blob_file_id, BTMETA);
+ SET_BLOB_META_SDB_ID(meta, dbp->blob_sdb_id, BTMETA);
#ifdef HAVE_PARTITION
if ((part = dbp->p_internal) != NULL) {
@@ -535,6 +564,12 @@ __bam_new_file(dbp, ip, txn, fhp, name)
pginfo.type = dbp->type;
pdbt.data = &pginfo;
pdbt.size = sizeof(pginfo);
+ if (dbp->blob_threshold) {
+ if ((ret = __blob_generate_dir_ids(dbp, txn,
+ &dbp->blob_file_id)) != 0)
+ return (ret);
+
+ }
if ((ret = __os_calloc(env, 1, dbp->pgsize, &buf)) != 0)
return (ret);
meta = (BTMETA *)buf;
@@ -613,6 +648,12 @@ __bam_new_subdb(mdbp, dbp, ip, txn)
meta = NULL;
root = NULL;
+ if (dbp->blob_threshold) {
+ if ((ret = __blob_generate_dir_ids(dbp, txn,
+ &dbp->blob_sdb_id)) != 0)
+ return (ret);
+ }
+
if ((ret = __db_cursor(mdbp, ip, txn,
&dbc, CDB_LOCKING(env) ? DB_WRITECURSOR : 0)) != 0)
return (ret);
diff --git a/src/btree/bt_put.c b/src/btree/bt_put.c
index 13316181..5cd0ac12 100644
--- a/src/btree/bt_put.c
+++ b/src/btree/bt_put.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright (c) 1990, 1993, 1994, 1995, 1996
@@ -56,8 +56,8 @@ static int __bam_dup_check __P((DBC *, u_int32_t,
static int __bam_dup_convert __P((DBC *, PAGE *, u_int32_t, u_int32_t));
static int __bam_ovput
__P((DBC *, u_int32_t, db_pgno_t, PAGE *, u_int32_t, DBT *));
-static u_int32_t
- __bam_partsize __P((DB *, u_int32_t, DBT *, PAGE *, u_int32_t));
+static int __bam_partsize
+ __P((DB *, u_int32_t, DBT *, PAGE *, u_int32_t, u_int32_t *));
/*
* __bam_iitem --
@@ -71,18 +71,22 @@ __bam_iitem(dbc, key, data, op, flags)
DBT *key, *data;
u_int32_t op, flags;
{
+ BBLOB bl, blob_buf;
BKEYDATA *bk, bk_tmp;
BTREE *t;
BTREE_CURSOR *cp;
DB *dbp;
- DBT bk_hdr, tdbt;
+ DBT bk_hdr, blob_dbt, tdbt;
DB_MPOOLFILE *mpf;
ENV *env;
+ DB_LSN lsn;
PAGE *h;
db_indx_t cnt, indx;
+ off_t blob_size;
+ db_seq_t blob_id, new_blob_id;
u_int32_t data_size, have_bytes, need_bytes, needed, pages, pagespace;
char tmp_ch;
- int cmp, bigkey, bigdata, del, dupadjust;
+ int cmp, bigkey, bigdata, blobdata, del, dupadjust;
int padrec, replace, ret, t_ret, was_deleted;
COMPQUIET(cnt, 0);
@@ -95,6 +99,7 @@ __bam_iitem(dbc, key, data, op, flags)
h = cp->page;
indx = cp->indx;
del = dupadjust = replace = was_deleted = 0;
+ blobdata = 0;
/*
* Fixed-length records with partial puts: it's an error to specify
@@ -112,8 +117,12 @@ __bam_iitem(dbc, key, data, op, flags)
* longer than the fixed-length, and we never require less than
* the fixed-length record size.
*/
- data_size = F_ISSET(data, DB_DBT_PARTIAL) ?
- __bam_partsize(dbp, op, data, h, indx) : data->size;
+ if (F_ISSET(data, DB_DBT_PARTIAL)) {
+ if ((ret = __bam_partsize(
+ dbp, op, data, h, indx, &data_size)) != 0)
+ return (ret);
+ } else
+ data_size = data->size;
padrec = 0;
if (F_ISSET(dbp, DB_AM_FIXEDLEN)) {
if (data_size > t->re_len)
@@ -190,6 +199,13 @@ __bam_iitem(dbc, key, data, op, flags)
}
if (!F_ISSET(data, DB_DBT_STREAMING) &&
(padrec || F_ISSET(data, DB_DBT_PARTIAL))) {
+ /* Partial puts need to be handled in the blob functions. */
+ if (op == DB_CURRENT) {
+ bk = GET_BKEYDATA(dbp, h, indx + (TYPE(h) == P_LBTREE ?
+ O_INDX : 0));
+ if (B_TYPE(bk->type) == B_BLOB)
+ goto dup_cmp;
+ }
tdbt = *data;
if ((ret =
__bam_build(dbc, op, &tdbt, h, indx, data_size)) != 0)
@@ -204,10 +220,10 @@ __bam_iitem(dbc, key, data, op, flags)
* screwing up the duplicate sort order. We have to do this after
* we build the real record so that we're comparing the real items.
*/
- if (op == DB_CURRENT && dbp->dup_compare != NULL) {
+dup_cmp:if (op == DB_CURRENT && dbp->dup_compare != NULL) {
if ((ret = __bam_cmp(dbc, data, h,
indx + (TYPE(h) == P_LBTREE ? O_INDX : 0),
- dbp->dup_compare, &cmp)) != 0)
+ dbp->dup_compare, &cmp, NULL)) != 0)
return (ret);
if (cmp != 0) {
__db_errx(env, DB_STR("1004",
@@ -218,10 +234,30 @@ __bam_iitem(dbc, key, data, op, flags)
/*
* If the key or data item won't fit on a page, we'll have to store
- * them on overflow pages.
+ * them on overflow pages. The exception is if we are inserting
+ * into an existing blob file; in that case it remains a blob
+ * file regardless of its new size.
*/
+ if (op == DB_CURRENT) {
+ bk = GET_BKEYDATA(
+ dbp, h, indx + (TYPE(h) == P_LBTREE ? O_INDX : 0));
+ if (B_TYPE(bk->type) == B_BLOB) {
+ blobdata = 1;
+ bigdata = 0;
+ } else
+ bigdata = data_size > cp->ovflsize;
+ } else {
+ if (dbp->blob_threshold &&
+ (dbp->blob_threshold <= data_size ||
+ F_ISSET(data, DB_DBT_BLOB))) {
+ blobdata = 1;
+ bigdata = 0;
+ } else {
+ blobdata = 0;
+ bigdata = data_size > cp->ovflsize;
+ }
+ }
needed = 0;
- bigdata = data_size > cp->ovflsize;
switch (op) {
case DB_KEYFIRST:
/* We're adding a new key and data pair. */
@@ -232,6 +268,8 @@ __bam_iitem(dbc, key, data, op, flags)
needed += BKEYDATA_PSIZE(key->size);
if (bigdata)
needed += BOVERFLOW_PSIZE;
+ else if (blobdata)
+ needed += BBLOB_PSIZE;
else
needed += BKEYDATA_PSIZE(data_size);
break;
@@ -254,6 +292,8 @@ __bam_iitem(dbc, key, data, op, flags)
indx + (TYPE(h) == P_LBTREE ? O_INDX : 0));
if (B_TYPE(bk->type) == B_KEYDATA)
have_bytes = BKEYDATA_PSIZE(bk->len);
+ else if (B_TYPE(bk->type) == B_BLOB)
+ have_bytes = BBLOB_PSIZE;
else
have_bytes = BOVERFLOW_PSIZE;
need_bytes = 0;
@@ -263,6 +303,8 @@ __bam_iitem(dbc, key, data, op, flags)
}
if (bigdata)
need_bytes += BOVERFLOW_PSIZE;
+ else if (blobdata)
+ need_bytes += BBLOB_PSIZE;
else
need_bytes += BKEYDATA_PSIZE(data_size);
@@ -405,7 +447,8 @@ __bam_iitem(dbc, key, data, op, flags)
* because we're going to immediately re-add the item into the
* same slot.
*/
- if (bigdata || B_TYPE(bk->type) != B_KEYDATA) {
+ if (bigdata || (B_TYPE(bk->type) != B_KEYDATA &&
+ B_TYPE(bk->type) != B_BLOB)) {
/*
* If streaming, don't delete the overflow item,
* just delete the item pointing to the overflow item.
@@ -448,13 +491,65 @@ __bam_iitem(dbc, key, data, op, flags)
bk_hdr.size = SSZA(BKEYDATA, data);
ret = __db_pitem(dbc, h, indx,
BKEYDATA_SIZE(data->size), &bk_hdr, data);
- } else if (replace)
- ret = __bam_ritem(dbc, h, indx, data, 0);
- else
- ret = __db_pitem(dbc, h, indx,
- BKEYDATA_SIZE(data->size), NULL, data);
+ } else if (replace) {
+ /*
+ * If updating a blob, replace the blob file with the
+ * new blob data and update the blob db record.
+ */
+ if (blobdata) {
+ memcpy(&bl,
+ P_ENTRY(dbp, h, indx), BBLOB_SIZE);
+ memset(&blob_dbt, 0, sizeof(DBT));
+ blob_dbt.size = BBLOB_DSIZE;
+ if (F_ISSET(data, DB_DBT_BLOB_REC)) {
+ /*
+ * Replace the blob record with the
+ * blob record in the data DBT.
+ */
+ blob_dbt.data = BBLOB_DATA(data->data);
+ } else {
+ blob_id = (db_seq_t)bl.id;
+ GET_BLOB_SIZE(
+ dbp->env, bl, blob_size, ret);
+ if (ret != 0)
+ goto err;
+ if ((ret = __blob_repl(
+ dbc, data, blob_id,
+ &new_blob_id, &blob_size)) != 0)
+ goto err;
+ blob_dbt.data = BBLOB_DATA((&bl));
+ SET_BLOB_ID(&bl, new_blob_id, BBLOB);
+ SET_BLOB_SIZE(&bl, blob_size, BBLOB);
+ }
+ ret = __bam_ritem(
+ dbc, h, indx, &blob_dbt, B_BLOB);
+ } else
+ ret = __bam_ritem(dbc, h, indx, data, 0);
+ } else
+ if (blobdata) {
+ new_blob_id = 0;
+ blob_size = 0;
+ if ((ret = __blob_put(dbc, data,
+ &new_blob_id, &blob_size, &lsn)) != 0)
+ goto err;
+ memset(&blob_buf, 0, BBLOB_SIZE);
+ blob_buf.type = B_BLOB;
+ blob_buf.len = BBLOB_DSIZE;
+ tdbt.data = &blob_buf;
+ tdbt.size = BBLOB_SIZE;
+ SET_BLOB_ID(&blob_buf, new_blob_id, BBLOB);
+ SET_BLOB_SIZE(&blob_buf, blob_size, BBLOB);
+ SET_BLOB_FILE_ID(
+ &blob_buf, dbp->blob_file_id, BBLOB);
+ SET_BLOB_SDB_ID(
+ &blob_buf, dbp->blob_sdb_id, BBLOB);
+ ret = __db_pitem(dbc, h,
+ indx, BBLOB_SIZE, &tdbt, NULL);
+ } else
+ ret = __db_pitem(dbc, h, indx,
+ BKEYDATA_SIZE(data->size), NULL, data);
}
- if (ret != 0) {
+err: if (ret != 0) {
if (del == 1 && (t_ret =
__bam_ca_di(dbc, PGNO(h), indx + 1, -1)) != 0) {
__db_err(env, t_ret, DB_STR("1005",
@@ -504,32 +599,61 @@ __bam_iitem(dbc, key, data, op, flags)
* __bam_partsize --
* Figure out how much space a partial data item is in total.
*/
-static u_int32_t
-__bam_partsize(dbp, op, data, h, indx)
+static int
+__bam_partsize(dbp, op, data, h, indx, data_size)
DB *dbp;
u_int32_t op, indx;
DBT *data;
PAGE *h;
+ u_int32_t *data_size;
{
+ BBLOB bl;
BKEYDATA *bk;
+ int ret;
+ off_t blob_size;
u_int32_t nbytes;
+ ret = 0;
+
/*
* If the record doesn't already exist, it's simply the data we're
* provided.
*/
- if (op != DB_CURRENT)
- return (data->doff + data->size);
+ if (op != DB_CURRENT) {
+ *data_size = data->doff + data->size;
+ return (0);
+ }
/*
* Otherwise, it's the data provided plus any already existing data
* that we're not replacing.
*/
bk = GET_BKEYDATA(dbp, h, indx + (TYPE(h) == P_LBTREE ? O_INDX : 0));
- nbytes =
- B_TYPE(bk->type) == B_OVERFLOW ? ((BOVERFLOW *)bk)->tlen : bk->len;
+ switch (B_TYPE(bk->type)) {
+ case B_BLOB:
+ memcpy(&bl, bk, BBLOB_SIZE);
+ GET_BLOB_SIZE(dbp->env, bl, blob_size, ret);
+ if (ret != 0)
+ return (ret);
+ /*
+ * It is not possible to add data past UINT32_MAX in the
+ * partial API, so this is safe.
+ */
+ if (blob_size > UINT32_MAX)
+ nbytes = UINT32_MAX;
+ else
+ nbytes = (u_int32_t)blob_size;
+ break;
+ case B_OVERFLOW:
+ nbytes = ((BOVERFLOW *)bk)->tlen;
+ break;
+ default:
+ nbytes = bk->len;
+ }
- return (__db_partsize(nbytes, data));
+ *data_size = __db_partsize(nbytes, data);
+
+ return (ret);
}
/*
@@ -848,6 +972,7 @@ __bam_irep(dbc, h, indx, hdr, data)
bi = GET_BINTERNAL(dbp, h, indx);
bn = (BINTERNAL *) hdr->data;
+ DB_ASSERT(dbc->env, B_TYPE(bi->type) != B_BLOB);
if (B_TYPE(bi->type) == B_OVERFLOW &&
(ret = __db_doff(dbc, ((BOVERFLOW *)bi->data)->pgno)) != 0)
return (ret);
@@ -892,6 +1017,7 @@ __bam_dup_check(dbc, op, h, indx, sz, cntp)
/* Count the key once. */
bk = GET_BKEYDATA(dbp, h, indx);
+ DB_ASSERT(dbc->env, B_TYPE(bk->type) != B_BLOB);
sz += B_TYPE(bk->type) == B_KEYDATA ?
BKEYDATA_PSIZE(bk->len) : BOVERFLOW_PSIZE;
@@ -994,6 +1120,7 @@ __bam_dup_convert(dbc, h, indx, cnt)
* overflow, then free up those pages).
*/
bk = GET_BKEYDATA(dbp, h, dindx + 1);
+ DB_ASSERT(dbc->env, B_TYPE(bk->type) != B_BLOB);
hdr.data = bk;
hdr.size = B_TYPE(bk->type) == B_KEYDATA ?
BKEYDATA_SIZE(bk->len) : BOVERFLOW_SIZE;
diff --git a/src/btree/bt_rec.c b/src/btree/bt_rec.c
index 026564b6..eb44d04b 100644
--- a/src/btree/bt_rec.c
+++ b/src/btree/bt_rec.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/btree/bt_reclaim.c b/src/btree/bt_reclaim.c
index f465cc5a..1203ea35 100644
--- a/src/btree/bt_reclaim.c
+++ b/src/btree/bt_reclaim.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1998, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/btree/bt_recno.c b/src/btree/bt_recno.c
index 9356a742..abbd8efb 100644
--- a/src/btree/bt_recno.c
+++ b/src/btree/bt_recno.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -234,7 +234,7 @@ __ramc_del(dbc, flags)
retry: if ((ret = __bam_rsearch(dbc, &cp->recno, SR_DELETE, 1, &exact)) != 0)
goto err;
if (!exact) {
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
}
stack = 1;
@@ -256,7 +256,7 @@ retry: if ((ret = __bam_rsearch(dbc, &cp->recno, SR_DELETE, 1, &exact)) != 0)
* if the record was "deleted", we could never have found it.
*/
if (B_DISSET(GET_BKEYDATA(dbp, cp->page, cp->indx)->type)) {
- ret = DB_KEYEMPTY;
+ ret = DBC_ERR(dbc, DB_KEYEMPTY);
goto err;
}
@@ -391,7 +391,7 @@ retry: switch (flags) {
* a dup, so we set flags to DB_NEXT and keep going.
*/
if (!F_ISSET(dbc, DBC_OPD))
- return (DB_NOTFOUND);
+ return (DBC_ERR(dbc, DB_NOTFOUND));
/* FALLTHROUGH */
case DB_NEXT_NODUP:
/*
@@ -431,7 +431,7 @@ retry: switch (flags) {
* is a dup, so we set flags to DB_PREV and keep going.
*/
if (!F_ISSET(dbc, DBC_OPD))
- return (DB_NOTFOUND);
+ return (DBC_ERR(dbc, DB_NOTFOUND));
/* FALLTHROUGH */
case DB_PREV_NODUP:
/*
@@ -443,7 +443,7 @@ retry: switch (flags) {
flags = DB_PREV;
if (cp->recno != RECNO_OOB) {
if (cp->recno == 1) {
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
}
--cp->recno;
@@ -458,7 +458,7 @@ retry: switch (flags) {
if ((ret = __bam_nrecs(dbc, &cp->recno)) != 0)
goto err;
if (cp->recno == 0) {
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
}
break;
@@ -476,7 +476,7 @@ retry: switch (flags) {
cp->recno++;
break;
}
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
/* NOTREACHED */
case DB_GET_BOTH:
@@ -522,7 +522,7 @@ retry: switch (flags) {
1, &exact)) != 0)
goto err;
if (!exact) {
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
}
@@ -561,22 +561,22 @@ retry: switch (flags) {
(void)__bam_stkrel(dbc, STK_CLRDBC);
continue;
}
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
default:
- ret = DB_KEYEMPTY;
+ ret = DBC_ERR(dbc, DB_KEYEMPTY);
goto err;
}
if (flags == DB_GET_BOTH ||
flags == DB_GET_BOTHC || flags == DB_GET_BOTH_RANGE) {
if ((ret = __bam_cmp(dbc, data, cp->page, cp->indx,
- __bam_defcmp, &cmp)) != 0)
+ __bam_defcmp, &cmp, NULL)) != 0)
return (ret);
if (cmp == 0)
break;
if (!F_ISSET(dbc, DBC_OPD)) {
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
}
(void)__bam_stkrel(dbc, STK_CLRDBC);
@@ -1331,7 +1331,7 @@ __ram_sread(dbc, top)
if (0) {
eof: t->re_eof = 1;
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
}
err: if (!was_modified)
t->re_modified = 0;
@@ -1368,7 +1368,7 @@ retry: /* Find the slot for insertion. */
if (exact && flags == DB_NOOVERWRITE && !CD_ISSET(cp) &&
!B_DISSET(GET_BKEYDATA(dbc->dbp, cp->page, cp->indx)->type)) {
- ret = DB_KEYEXIST;
+ ret = DBC_ERR(dbc, DB_KEYEXIST);
goto err;
}
diff --git a/src/btree/bt_rsearch.c b/src/btree/bt_rsearch.c
index 36d1c667..4ada6e2d 100644
--- a/src/btree/bt_rsearch.c
+++ b/src/btree/bt_rsearch.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright (c) 1990, 1993, 1994, 1995, 1996
@@ -147,7 +147,7 @@ __bam_rsearch(dbc, recnop, flags, stop, exactp)
__TLPUT(dbc, lock)) != 0 && ret == 0)
ret = t_ret;
if (ret == 0)
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto done;
}
}
@@ -197,7 +197,8 @@ __bam_rsearch(dbc, recnop, flags, stop, exactp)
lock)) != 0 && ret == 0)
ret = t_ret;
if (ret == 0)
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc,
+ DB_NOTFOUND);
goto err;
}
}
diff --git a/src/btree/bt_search.c b/src/btree/bt_search.c
index e809a852..e3d69d16 100644
--- a/src/btree/bt_search.c
+++ b/src/btree/bt_search.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright (c) 1990, 1993, 1994, 1995, 1996
@@ -51,8 +51,9 @@
/*
* __bam_get_root --
- * Fetch the root of a tree and see if we want to keep
- * it in the stack.
+ * Try to appropriately lock and fetch the root page of a tree;
+ * if successful enter it into the cursor's stack; on error, leave the stack
+ * unchanged.
*
* PUBLIC: int __bam_get_root __P((DBC *, db_pgno_t, int, u_int32_t, int *));
*/
@@ -232,9 +233,11 @@ retry: if (lock_mode == DB_LOCK_WRITE)
} else if (atomic_read(&mpf->mfp->multiversion) != 0 &&
lock_mode == DB_LOCK_WRITE && (ret = __memp_dirty(mpf, &h,
dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0) {
- (void)__memp_fput(mpf,
- dbc->thread_info, h, dbc->priority);
+ if (h != NULL)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority);
(void)__LPUT(dbc, lock);
+ return (ret);
}
}
@@ -272,9 +275,10 @@ __bam_search(dbc, root_pgno, key, flags, slevel, recnop, exactp)
db_recno_t recno;
int adjust, cmp, deloffset, ret, set_stack, stack, t_ret;
int getlock, was_next;
- int (*func) __P((DB *, const DBT *, const DBT *));
+ int (*func) __P((DB *, const DBT *, const DBT *, size_t *));
u_int32_t get_mode, wait;
u_int8_t level, saved_level;
+ size_t pos, pos_h, pos_l;
if (F_ISSET(dbc, DBC_OPD))
LOCK_CHECK_OFF(dbc->thread_info);
@@ -288,6 +292,7 @@ __bam_search(dbc, root_pgno, key, flags, slevel, recnop, exactp)
t = dbp->bt_internal;
recno = 0;
t_ret = 0;
+ func = NULL;
BT_STK_CLR(cp);
LOCK_INIT(saved_lock);
@@ -339,11 +344,17 @@ retry: if ((ret = __bam_get_root(dbc, start_pgno, slevel, flags, &stack)) != 0)
BT_STK_CLR(cp);
- /* Choose a comparison function. */
+ /*
+ * Choose a comparison function.
+ * We apply the prefix search optimization only when there
+ * is no user-specific comparison function set.
+ */
func = F_ISSET(dbc, DBC_OPD) ?
(dbp->dup_compare == NULL ? __bam_defcmp : dbp->dup_compare) :
t->bt_compare;
+ pos_h = 0;
+ pos_l = 0;
for (;;) {
if (TYPE(h) == P_LBTREE)
adjust = P_INDX;
@@ -389,9 +400,11 @@ retry: if ((ret = __bam_get_root(dbc, start_pgno, slevel, flags, &stack)) != 0)
* match on a leaf page, we're done.
*/
DB_BINARY_SEARCH_FOR(base, lim, NUM_ENT(h), adjust) {
+ /* We compare from the common prefix */
+ pos = pos_l > pos_h ? pos_h : pos_l;
DB_BINARY_SEARCH_INCR(indx, base, lim, adjust);
if ((ret = __bam_cmp(dbc, key, h, indx,
- func, &cmp)) != 0)
+ func, &cmp, &pos)) != 0)
goto err;
if (cmp == 0) {
if (LEVEL(h) == LEAFLEVEL ||
@@ -403,9 +416,19 @@ retry: if ((ret = __bam_get_root(dbc, start_pgno, slevel, flags, &stack)) != 0)
}
goto next;
}
- if (cmp > 0)
+ /*
+ * We have to maintain the offset in the keys where
+ * we begin comparing for both ends of the key range
+ * in which we are binary searching. So, update either
+ * the high or low position here, depending on how
+ * the comparison turned out.
+ */
+ if (cmp > 0) {
DB_BINARY_SEARCH_SHIFT_BASE(indx, base,
lim, adjust);
+ pos_l = pos;
+ } else
+ pos_h = pos;
}
/*
@@ -421,7 +444,7 @@ retry: if ((ret = __bam_get_root(dbc, start_pgno, slevel, flags, &stack)) != 0)
*exactp = 0;
if (LF_ISSET(SR_EXACT)) {
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
}
@@ -444,13 +467,13 @@ get_next: /*
* at the root if the tree recently collapsed.
*/
if (PGNO(h) == root_pgno) {
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
}
indx = cp->sp->indx + 1;
if (indx == NUM_ENT(cp->sp->page)) {
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
cp->csp++;
goto err;
}
@@ -863,7 +886,7 @@ found: *exactp = 1;
* DB_NOTFOUND.
*/
if (B_DISSET(GET_BKEYDATA(dbp, h, indx + deloffset)->type)) {
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
}
diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c
index 8299c69a..f7719dc4 100644
--- a/src/btree/bt_split.c
+++ b/src/btree/bt_split.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright (c) 1990, 1993, 1994, 1995, 1996
@@ -63,7 +63,7 @@ __bam_split(dbc, arg, root_pgnop)
db_pgno_t *root_pgnop;
{
BTREE_CURSOR *cp;
- DB_LOCK metalock, next_lock;
+ DB_LOCK meta_lock, next_lock;
enum { UP, DOWN } dir;
db_pgno_t pgno, next_pgno, root_pgno;
int exact, level, ret;
@@ -72,17 +72,16 @@ __bam_split(dbc, arg, root_pgnop)
LOCK_CHECK_OFF(dbc->thread_info);
cp = (BTREE_CURSOR *)dbc->internal;
+ LOCK_INIT(meta_lock);
LOCK_INIT(next_lock);
next_pgno = PGNO_INVALID;
/*
- * First get a lock on the metadata page, we will have to allocate
+ * First get a lock on the metadata page; we will have to allocate
* pages and cannot get a lock while we have the search tree pinned.
*/
-
pgno = PGNO_BASE_MD;
- if ((ret = __db_lget(dbc,
- 0, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
+ if ((ret = __db_lget(dbc, 0, pgno, DB_LOCK_WRITE, 0, &meta_lock)) != 0)
goto err;
root_pgno = BAM_ROOT_PGNO(dbc);
@@ -189,7 +188,7 @@ no_split: /* Once we've split the leaf page, we're done. */
if (root_pgnop != NULL)
*root_pgnop = BAM_ROOT_PGNO(dbc);
err:
-done: (void)__LPUT(dbc, metalock);
+done: (void)__LPUT(dbc, meta_lock);
(void)__TLPUT(dbc, next_lock);
if (F_ISSET(dbc, DBC_OPD))
@@ -685,6 +684,7 @@ __bam_broot(dbc, rootp, split, lp, rp)
DB_SET_DBT(hdr, &bi, SSZA(BINTERNAL, data));
DB_SET_DBT(data, &bo, BOVERFLOW_SIZE);
break;
+ case B_BLOB:
case B_DUPLICATE:
default:
goto pgfmt;
@@ -772,7 +772,30 @@ __ram_root(dbc, rootp, lp, rp)
/*
* __bam_pinsert --
- * Insert a new key into a parent page, completing the split.
+ *
+ * Construct an internal index item and place it in the parent page. It is
+ * primarily used by __bam_page() to add a new page into the tree. The sole
+ * other use is by __bam_pupdate() after a reverse split or compact has
+ * removed pages underneath it, in order to replace the parent's key/nrecs
+ * to match the new subtree.
+ *
+ * Parameters:
+ * parent - the page from the cursor stack to be modified. The next entry
+ * in the stack (i.e., the next lower level in the tree) contains
+ * the key of the new item. The indx field must have been set
+ * when searching down the tree, to point to the new/replaced
+ * parent item.
+ * split - the indx in the cursor stack of the 'source' of the new item.
+ * lchild - the left child page is used *only* when attempting to use
+ * prefix key compression on a leaf (data) page.
+ * rchild - right child page. The source of the pgno of the new item.
+ * flags - BPI_REPLACE | BPI_NORENCUM
+ * BPI_NOLOGGING
+ *
+ * The pgno of the item always comes from rchild, which often is the same
+ * as parent[1].page. The key for DB_BTREE comes from the next lower page
+ * in the stack under parent, not from either lchild or rchild parameter --
+ * though often rchild is a copy of parent[1].page.
*
* PUBLIC: int __bam_pinsert
* PUBLIC: __P((DBC *, EPG *, u_int32_t, PAGE *, PAGE *, int));
@@ -867,12 +890,27 @@ __bam_pinsert(dbc, parent, split, lchild, rchild, flags)
size = BINTERNAL_SIZE(child_bi->len);
break;
case B_OVERFLOW:
- /* Reuse the overflow key. */
+ /* Copy the overflow key. */
child_bo = (BOVERFLOW *)child_bi->data;
memset(&bo, 0, sizeof(bo));
bo.type = B_OVERFLOW;
bo.tlen = child_bo->tlen;
- bo.pgno = child_bo->pgno;
+ if (LF_ISSET(BPI_REPLACE)) {
+ /*
+ * Replace (compact or reverse split) needs to
+ * copy in case the data item gets removed.
+ */
+ memset(&hdr, 0, sizeof(hdr));
+ if ((ret = __db_goff(dbc, &hdr,
+ child_bo->tlen, child_bo->pgno,
+ &hdr.data, &hdr.size)) == 0)
+ ret = __db_poff(dbc, &hdr, &bo.pgno);
+ if (hdr.data != NULL)
+ __os_free(dbp->env, hdr.data);
+ if (ret != 0)
+ return (ret);
+ } else
+ bo.pgno = child_bo->pgno;
bi.len = BOVERFLOW_SIZE;
B_TSET(bi.type, B_OVERFLOW);
bi.pgno = rchild->pgno;
@@ -881,6 +919,7 @@ __bam_pinsert(dbc, parent, split, lchild, rchild, flags)
DB_SET_DBT(data, &bo, BOVERFLOW_SIZE);
size = BINTERNAL_SIZE(BOVERFLOW_SIZE);
break;
+ case B_BLOB:
case B_DUPLICATE:
default:
goto pgfmt;
@@ -982,8 +1021,8 @@ noprefix: if (P_FREESPACE(dbp, ppage) + oldsize < nbytes)
DB_SET_DBT(hdr, &bi, SSZA(BINTERNAL, data));
DB_SET_DBT(data, &bo, BOVERFLOW_SIZE);
size = BINTERNAL_SIZE(BOVERFLOW_SIZE);
-
break;
+ case B_BLOB:
case B_DUPLICATE:
default:
goto pgfmt;
@@ -1153,23 +1192,32 @@ __bam_psplit(dbc, cp, lp, rp, splitret)
nbytes += BINTERNAL_SIZE(BOVERFLOW_SIZE);
break;
case P_LBTREE:
- if (B_TYPE(GET_BKEYDATA(dbp, pp, off)->type) ==
- B_KEYDATA)
- nbytes += BKEYDATA_SIZE(GET_BKEYDATA(dbp,
- pp, off)->len);
- else
+ switch (B_TYPE(GET_BKEYDATA(dbp, pp, off)->type)) {
+ case B_KEYDATA:
+ nbytes += BKEYDATA_SIZE(
+ GET_BKEYDATA(dbp, pp, off)->len);
+ break;
+ case B_BLOB:
+ nbytes += BBLOB_SIZE;
+ break;
+ default:
nbytes += BOVERFLOW_SIZE;
-
+ }
++off;
/* FALLTHROUGH */
case P_LDUP:
case P_LRECNO:
- if (B_TYPE(GET_BKEYDATA(dbp, pp, off)->type) ==
- B_KEYDATA)
- nbytes += BKEYDATA_SIZE(GET_BKEYDATA(dbp,
- pp, off)->len);
- else
+ switch (B_TYPE(GET_BKEYDATA(dbp, pp, off)->type)) {
+ case B_KEYDATA:
+ nbytes += BKEYDATA_SIZE(
+ GET_BKEYDATA(dbp, pp, off)->len);
+ break;
+ case B_BLOB:
+ nbytes += BBLOB_SIZE;
+ break;
+ default:
nbytes += BOVERFLOW_SIZE;
+ }
break;
case P_IRECNO:
nbytes += RINTERNAL_SIZE;
@@ -1269,7 +1317,7 @@ __bam_copy(dbp, pp, cp, nxt, stop)
PAGE *pp, *cp;
u_int32_t nxt, stop;
{
- BINTERNAL internal;
+ BINTERNAL *bi, internal;
db_indx_t *cinp, nbytes, off, *pinp;
cinp = P_INP(dbp, cp);
@@ -1302,12 +1350,17 @@ __bam_copy(dbp, pp, cp, nxt, stop)
/* FALLTHROUGH */
case P_LDUP:
case P_LRECNO:
- if (B_TYPE(GET_BKEYDATA(dbp, pp, nxt)->type) ==
- B_KEYDATA)
- nbytes = BKEYDATA_SIZE(GET_BKEYDATA(dbp,
- pp, nxt)->len);
- else
+ switch (B_TYPE(GET_BKEYDATA(dbp, pp, nxt)->type)) {
+ case B_KEYDATA:
+ nbytes = BKEYDATA_SIZE(
+ GET_BKEYDATA(dbp, pp, nxt)->len);
+ break;
+ case B_BLOB:
+ nbytes = BBLOB_SIZE;
+ break;
+ default:
nbytes = BOVERFLOW_SIZE;
+ }
break;
case P_IRECNO:
nbytes = RINTERNAL_SIZE;
@@ -1316,17 +1369,18 @@ __bam_copy(dbp, pp, cp, nxt, stop)
return (__db_pgfmt(dbp->env, pp->pgno));
}
cinp[off] = HOFFSET(cp) -= nbytes;
+ /* Minimize the first key on an IBTREE page; it isn't valid. */
+ bi = GET_BINTERNAL(dbp, pp, nxt);
if (off == 0 && nxt != 0 && TYPE(pp) == P_IBTREE) {
internal.len = 0;
UMRW_SET(internal.unused);
internal.type = B_KEYDATA;
- internal.pgno = GET_BINTERNAL(dbp, pp, nxt)->pgno;
- internal.nrecs = GET_BINTERNAL(dbp, pp, nxt)->nrecs;
+ internal.pgno = bi->pgno;
+ internal.nrecs = bi->nrecs;
memcpy(P_ENTRY(dbp, cp, off), &internal, nbytes);
}
else
- memcpy(P_ENTRY(dbp, cp, off),
- P_ENTRY(dbp, pp, nxt), nbytes);
+ memcpy(P_ENTRY(dbp, cp, off), bi, nbytes);
}
return (0);
}
diff --git a/src/btree/bt_stat.c b/src/btree/bt_stat.c
index 668c4fdb..04c0fbcb 100644
--- a/src/btree/bt_stat.c
+++ b/src/btree/bt_stat.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -278,6 +278,8 @@ __bam_stat_print(dbc, flags)
"%#x\tFixed-length record pad", (u_int)sp->bt_re_pad);
}
__db_dl(env,
+ "Number of pages in the database", (u_long)sp->bt_pagecnt);
+ __db_dl(env,
"Underlying database page size", (u_long)sp->bt_pagesize);
if (dbp->type == DB_BTREE)
__db_dl(env, "Overflow key/data size",
@@ -288,6 +290,10 @@ __bam_stat_print(dbc, flags)
"Number of records in the tree", (u_long)sp->bt_nkeys);
__db_dl(env,
"Number of data items in the tree", (u_long)sp->bt_ndata);
+ if (dbp->type == DB_BTREE) {
+ __db_dl(env,
+ "Number of blobs in the tree", (u_long)sp->bt_nblobs);
+ }
__db_dl(env,
"Number of tree internal pages", (u_long)sp->bt_int_pg);
@@ -372,6 +378,10 @@ __bam_stat_callback(dbc, h, cookie, putp)
/* Ignore off-page duplicates. */
if (B_TYPE(type) != B_DUPLICATE)
++sp->bt_ndata;
+
+ /* Count blobs. */
+ if (B_TYPE(type) == B_BLOB)
+ ++sp->bt_nblobs;
}
++sp->bt_leaf_pg;
diff --git a/src/btree/bt_upgrade.c b/src/btree/bt_upgrade.c
index c9123351..66e27d56 100644
--- a/src/btree/bt_upgrade.c
+++ b/src/btree/bt_upgrade.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -9,6 +9,7 @@
#include "db_config.h"
#include "db_int.h"
+#include "dbinc/blob.h"
#include "dbinc/db_page.h"
#include "dbinc/db_upgrade.h"
#include "dbinc/btree.h"
@@ -151,3 +152,94 @@ __bam_31_lbtree(dbp, real_name, flags, fhp, h, dirtyp)
return (ret);
}
+
+/*
+ * __bam_60_btreemeta--
+ * Upgrade the version number.
+ *
+ * Views page h as a BTMETA33 btree metadata page, stores 10 in its
+ * dbmeta.version field and sets *dirtyp to 1 (presumably so the caller
+ * writes the page back -- confirm against the upgrade driver). Always
+ * returns 0. The dbp, real_name, flags and fhp arguments are only handed
+ * to COMPQUIET and are otherwise unused in this function.
+ *
+ * PUBLIC: int __bam_60_btreemeta
+ * PUBLIC: __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
+ */
+int
+__bam_60_btreemeta(dbp, real_name, flags, fhp, h, dirtyp)
+ DB *dbp;
+ char *real_name;
+ u_int32_t flags;
+ DB_FH *fhp;
+ PAGE *h;
+ int *dirtyp;
+{
+ BTMETA33 *bmeta;
+
+ COMPQUIET(flags, 0);
+ COMPQUIET(real_name, NULL);
+ COMPQUIET(fhp, NULL);
+ COMPQUIET(dbp, NULL);
+ bmeta = (BTMETA33 *)h;
+
+ bmeta->dbmeta.version = 10; /* The version this upgrade step produces. */
+ *dirtyp = 1; /* Report that the page contents were changed. */
+
+ return (0);
+}
+
+/*
+ * __bam_60_lbtree --
+ * Upgrade the blob records on the database btree leaf pages.
+ *
+ * Walks the entries of page h starting at O_INDX and stepping by P_INDX
+ * (presumably the data half of each key/data pair -- confirm). Every
+ * entry whose type is B_BLOB is copied into a local BBLOB60, its id,
+ * size, file id and subdatabase id are extracted with the GET_BLOB60_*
+ * macros, and a zero-filled BBLOB60P1 is populated from them (type and
+ * encoding copied, len set to BBLOB_DSIZE) and copied back over the
+ * original entry in place. *dirtyp is set to 1 whenever an entry is
+ * rewritten; it is left untouched if the page holds no blob entries.
+ *
+ * Returns 0 on success, or the first non-zero value one of the
+ * GET_BLOB60_* macros stores in ret; in that case the function returns
+ * immediately and the entry being converted is not written back.
+ * real_name, flags and fhp are only handed to COMPQUIET.
+ *
+ * NOTE(review): GET_BLOB60_ID and GET_BLOB60_SIZE are given bl60 while
+ * GET_BLOB60_FILE_ID and GET_BLOB60_SDB_ID are given &bl60; presumably
+ * this matches how each macro dereferences its argument -- confirm
+ * against the macro definitions.
+ *
+ * PUBLIC: int __bam_60_lbtree
+ * PUBLIC: __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
+ */
+int
+__bam_60_lbtree(dbp, real_name, flags, fhp, h, dirtyp)
+ DB *dbp;
+ char *real_name;
+ u_int32_t flags;
+ DB_FH *fhp;
+ PAGE *h;
+ int *dirtyp;
+{
+ BBLOB60 bl60;
+ BBLOB60P1 bl60p1;
+ BKEYDATA *bk;
+ db_seq_t blob_id, blob_size, file_id, sdb_id;
+ db_indx_t indx;
+ int ret;
+
+ COMPQUIET(flags, 0);
+ COMPQUIET(real_name, NULL);
+ COMPQUIET(fhp, NULL);
+ ret = 0;
+
+ DB_ASSERT(dbp->env, BBLOB60_SIZE == BBLOB_SIZE); /* In-place rewrite below relies on equal sizes. */
+ for (indx = O_INDX; indx < NUM_ENT(h); indx += P_INDX) {
+ bk = GET_BKEYDATA(dbp, h, indx);
+ if (B_TYPE(bk->type) == B_BLOB ) {
+ memcpy(&bl60, bk, BBLOB60_SIZE); /* Work on a local copy of the old record. */
+ memset(&bl60p1, 0, BBLOB_SIZE);
+ bl60p1.type = bl60.type;
+ bl60p1.len = BBLOB_DSIZE;
+ bl60p1.encoding = bl60.encoding;
+ GET_BLOB60_ID(dbp->env, bl60, blob_id, ret);
+ if (ret != 0)
+ return (ret);
+ GET_BLOB60_SIZE(dbp->env, bl60, blob_size, ret);
+ if (ret != 0)
+ return (ret);
+ GET_BLOB60_FILE_ID(dbp->env, &bl60, file_id, ret);
+ if (ret != 0)
+ return (ret);
+ GET_BLOB60_SDB_ID(dbp->env, &bl60, sdb_id, ret);
+ if (ret != 0)
+ return (ret);
+ SET_BLOB_ID(&bl60p1, blob_id, BBLOB60P1);
+ SET_BLOB_SIZE(&bl60p1, blob_size, BBLOB60P1);
+ SET_BLOB_FILE_ID(&bl60p1, file_id, BBLOB60P1);
+ SET_BLOB_SDB_ID(&bl60p1, sdb_id, BBLOB60P1);
+ memcpy(bk, &bl60p1, BBLOB_SIZE); /* Overwrite the page entry with the new layout. */
+ *dirtyp = 1;
+ }
+ }
+
+ return (ret);
+}
diff --git a/src/btree/bt_verify.c b/src/btree/bt_verify.c
index 99354a58..8ceb50e6 100644
--- a/src/btree/bt_verify.c
+++ b/src/btree/bt_verify.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -9,6 +9,7 @@
#include "db_config.h"
#include "db_int.h"
+#include "dbinc/blob.h"
#include "dbinc/db_page.h"
#include "dbinc/db_verify.h"
#include "dbinc/btree.h"
@@ -20,8 +21,8 @@ static int __bam_safe_getdata __P((DB *, DB_THREAD_INFO *,
static int __bam_vrfy_inp __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t,
db_indx_t *, u_int32_t));
static int __bam_vrfy_treeorder __P((DB *, DB_THREAD_INFO *, PAGE *,
- BINTERNAL *, BINTERNAL *, int (*)(DB *, const DBT *, const DBT *),
- u_int32_t));
+ BINTERNAL *, BINTERNAL *,
+ int (*)(DB *, const DBT *, const DBT *, size_t *), u_int32_t));
static int __ram_vrfy_inp __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t,
db_indx_t *, u_int32_t));
@@ -44,6 +45,7 @@ __bam_vrfy_meta(dbp, vdp, meta, pgno, flags)
VRFY_PAGEINFO *pip;
int isbad, t_ret, ret;
db_indx_t ovflsize;
+ db_seq_t blob_id;
env = dbp->env;
isbad = 0;
@@ -201,6 +203,56 @@ __bam_vrfy_meta(dbp, vdp, meta, pgno, flags)
"%lu %lu"), (u_long)pgno, (u_long)pip->re_len));
}
+/*
+ * Where 64-bit integer support is not available,
+ * return an error if the file has any blobs.
+ */
+ t_ret = 0;
+#ifdef HAVE_64BIT_TYPES
+ GET_BLOB_FILE_ID(env, meta, blob_id, t_ret);
+ if (t_ret != 0) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1187",
+ "Page %lu: blob file id overflow.", "%lu"), (u_long)pgno));
+ if (ret == 0)
+ ret = t_ret;
+ }
+ t_ret = 0;
+ GET_BLOB_SDB_ID(env, meta, blob_id, t_ret);
+ if (t_ret != 0) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1188",
+ "Page %lu: blob subdatabase id overflow.",
+ "%lu"), (u_long)pgno));
+ if (ret == 0)
+ ret = t_ret;
+ }
+#else /* HAVE_64BIT_TYPES */
+ /*
+ * db_seq_t is an int on systems that do not have 64-bit integers, so
+ * this will compile and run.
+ */
+ GET_BLOB_FILE_ID(env, meta, blob_id, t_ret);
+ if (t_ret != 0 || blob_id != 0) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1200",
+ "Page %lu: blobs require 64 integer compiler support.",
+ "%lu"), (u_long)pgno));
+ if (ret == 0)
+ ret = t_ret;
+ }
+ t_ret = 0;
+ GET_BLOB_SDB_ID(env, meta, blob_id, t_ret);
+ if (t_ret != 0 || blob_id != 0) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1201",
+ "Page %lu: blobs require 64 integer compiler support.",
+ "%lu"), (u_long)pgno));
+ if (ret == 0)
+ ret = t_ret;
+ }
+#endif
+
/*
* We do not check that the rest of the page is 0, because it may
* not be and may still be correct.
@@ -268,8 +320,7 @@ __ram_vrfy_leaf(dbp, vdp, h, pgno, flags)
if (F_ISSET(pip, VRFY_HAS_DUPS)) {
EPRINT((env, DB_STR_A("1043",
- "Page %lu: Recno database has dups",
- "%lu"), (u_long)pgno));
+ "Page %lu: Recno database has dups", "%lu"), (u_long)pgno));
ret = DB_VERIFY_BAD;
goto err;
}
@@ -547,12 +598,15 @@ __bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags)
db_indx_t *nentriesp;
u_int32_t flags;
{
+ BBLOB bl;
BKEYDATA *bk;
BOVERFLOW *bo;
ENV *env;
VRFY_CHILDINFO child;
VRFY_ITEM *pagelayout;
VRFY_PAGEINFO *pip;
+ off_t blob_size;
+ db_seq_t blob_id, file_id, sdb_id;
u_int32_t himark, offset; /*
* These would be db_indx_ts
* but for alignment.
@@ -563,6 +617,7 @@ __bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags)
env = dbp->env;
isbad = isdupitem = 0;
nentries = 0;
+ file_id = sdb_id = 0;
memset(&child, 0, sizeof(VRFY_CHILDINFO));
if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
return (ret);
@@ -668,6 +723,9 @@ __bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags)
else
endoff = offset + BKEYDATA_SIZE(bk->len) - 1;
break;
+ case B_BLOB:
+ endoff = offset + BBLOB_SIZE - 1;
+ break;
case B_DUPLICATE:
/*
* Flag that we have dups; we'll check whether
@@ -731,6 +789,52 @@ __bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags)
* already been done.
*/
break;
+ case B_BLOB:
+ if (TYPE(h) == P_IBTREE) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1189",
+ "Page %lu: blob item in internal btree page at item %lu",
+ "%lu %lu"), (u_long)pgno, (u_long)i));
+ break;
+ } else if (TYPE(h) == P_LRECNO) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1190",
+ "Page %lu: blob item referenced by recno page at item %lu",
+ "%lu %lu"), (u_long)pgno, (u_long)i));
+ break;
+ }
+ /*
+ * Blob item. Check that the blob file exists and is
+ * the same file size as is stored in the database
+ * record.
+ */
+ memcpy(&bl, bk, BBLOB_SIZE);
+ blob_id = (db_seq_t)bl.id;
+ GET_BLOB_SIZE(env, bl, blob_size, ret);
+ if (ret != 0 || blob_size < 0) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1192",
+ "Page %lu: blob file size value has overflowed at item %lu",
+ "%lu %lu"), (u_long)pgno, (u_long)i));
+ break;
+ }
+ file_id = (db_seq_t)bl.file_id;
+ sdb_id = (db_seq_t)bl.sdb_id;
+ if (file_id == 0 && sdb_id == 0) {
+ isbad = 1;
+ EPRINT((dbp->env, DB_STR_A("1195",
+ "Page %lu: invalid blob dir ids %llu %llu at item %lu",
+ "%lu %ll %ll %lu"), (u_long)pip->pgno,
+ (long long)file_id,
+ (long long)sdb_id, (u_long)i));
+ break;
+ }
+ if ((ret = __blob_vrfy(env, blob_id,
+ blob_size, file_id, sdb_id, pgno, flags)) != 0) {
+ isbad = 1;
+ break;
+ }
+ break;
case B_DUPLICATE:
if (TYPE(h) == P_IBTREE) {
isbad = 1;
@@ -751,9 +855,17 @@ __bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags)
(BOVERFLOW *)(((BINTERNAL *)bk)->data) :
(BOVERFLOW *)bk;
- if (B_TYPE(bk->type) == B_OVERFLOW)
+ if (B_TYPE(bk->type) == B_OVERFLOW) {
+ if (TYPE(h) == P_IBTREE &&
+ bk->len != BOVERFLOW_SIZE) {
+ EPRINT((env, DB_STR_A("1196",
+ "Page %lu: bad length %u in B_OVERFLOW item %lu",
+ "%lu %u %lu"),
+ (u_long)pgno, bk->len, (u_long)i));
+ isbad = 1;
+ }
/* Make sure tlen is reasonable. */
- if (bo->tlen > dbp->pgsize * vdp->last_pgno) {
+ if (bo->tlen >= dbp->pgsize * vdp->last_pgno) {
isbad = 1;
EPRINT((env, DB_STR_A("1056",
"Page %lu: impossible tlen %lu, item %lu",
@@ -762,6 +874,7 @@ __bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags)
/* Don't save as a child. */
break;
}
+ }
if (!IS_VALID_PGNO(bo->pgno) || bo->pgno == pgno ||
bo->pgno == PGNO_INVALID) {
@@ -918,8 +1031,8 @@ __bam_vrfy_itemorder(dbp, vdp, ip, h, pgno, nentries, ovflok, hasdups, flags)
VRFY_PAGEINFO *pip;
db_indx_t i, *inp;
int adj, cmp, freedup_1, freedup_2, isbad, ret, t_ret;
- int (*dupfunc) __P((DB *, const DBT *, const DBT *));
- int (*func) __P((DB *, const DBT *, const DBT *));
+ int (*dupfunc) __P((DB *, const DBT *, const DBT *, size_t *));
+ int (*func) __P((DB *, const DBT *, const DBT *, size_t *));
void *buf1, *buf2, *tmpbuf;
/*
@@ -1066,6 +1179,11 @@ retry: p1 = &dbta;
if (B_TYPE(bk->type) == B_OVERFLOW) {
bo = (BOVERFLOW *)bk;
goto overflow;
+ } else if (B_TYPE(bk->type) == B_BLOB) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1197",
+ "Page %lu: Blob found in key item %lu",
+ "%lu %lu"), (u_long)pgno, (u_long)i));
} else {
p2->data = bk->data;
p2->size = bk->len;
@@ -1124,7 +1242,8 @@ overflow: if (!ovflok) {
/* Compare with the last key. */
if (p1->data != NULL && p2->data != NULL) {
- cmp = inp[i] == inp[i - adj] ? 0 : func(dbp, p1, p2);
+ cmp = inp[i] == inp[i - adj] ? 0 :
+ func(dbp, p1, p2, NULL);
/* comparison succeeded */
if (cmp > 0) {
@@ -1236,8 +1355,8 @@ overflow: if (!ovflok) {
* until we do the structure check
* and see whether DUPSORT is set.
*/
- if (dupfunc(dbp, &dup_1, &dup_2) > 0 &&
- pip != NULL)
+ if (dupfunc(dbp, &dup_1, &dup_2,
+ NULL) > 0 && pip != NULL)
F_SET(pip, VRFY_DUPS_UNSORTED);
if (freedup_1)
@@ -1409,7 +1528,7 @@ __bam_vrfy_subtree(dbp, vdp, pgno, l, r, flags, levelp, nrecsp, relenp)
db_recno_t child_nrecs, nrecs;
u_int32_t child_level, child_relen, j, level, relen, stflags;
u_int8_t leaf_type;
- int (*func) __P((DB *, const DBT *, const DBT *));
+ int (*func) __P((DB *, const DBT *, const DBT *, size_t *));
int isbad, p, ret, t_ret, toplevel;
if (levelp != NULL) /* Don't leave uninitialized on error. */
@@ -1524,7 +1643,7 @@ __bam_vrfy_subtree(dbp, vdp, pgno, l, r, flags, levelp, nrecsp, relenp)
* Don't do the prev/next_pgno checks if we've lost
* leaf pages due to another corruption.
*/
- if (!F_ISSET(vdp, VRFY_LEAFCHAIN_BROKEN)) {
+ if (!F_ISSET(vdp, SALVAGE_LEAFCHAIN_BROKEN)) {
if (pip->pgno != vdp->next_pgno) {
isbad = 1;
EPRINT((env, DB_STR_A("1075",
@@ -1547,7 +1666,7 @@ bad_prev: isbad = 1;
}
vdp->prev_pgno = pip->pgno;
vdp->next_pgno = pip->next_pgno;
- F_CLR(vdp, VRFY_LEAFCHAIN_BROKEN);
+ F_CLR(vdp, SALVAGE_LEAFCHAIN_BROKEN);
/*
* Overflow pages are common to all three leaf types;
@@ -1694,7 +1813,7 @@ bad_prev: isbad = 1;
* spew error messages about erroneous prev/next_pgnos,
* since that's probably not the real problem.
*/
- F_SET(vdp, VRFY_LEAFCHAIN_BROKEN);
+ F_SET(vdp, SALVAGE_LEAFCHAIN_BROKEN);
ret = DB_VERIFY_BAD;
goto err;
@@ -2042,7 +2161,7 @@ __bam_vrfy_treeorder(dbp, ip, h, lp, rp, func, flags)
DB_THREAD_INFO *ip;
PAGE *h;
BINTERNAL *lp, *rp;
- int (*func) __P((DB *, const DBT *, const DBT *));
+ int (*func) __P((DB *, const DBT *, const DBT *, size_t *));
u_int32_t flags;
{
BOVERFLOW *bo;
@@ -2050,7 +2169,7 @@ __bam_vrfy_treeorder(dbp, ip, h, lp, rp, func, flags)
DBT dbt;
ENV *env;
db_indx_t last;
- int ret, cmp;
+ int cmp, ret, t_ret;
env = dbp->env;
memset(&dbt, 0, sizeof(DBT));
@@ -2077,7 +2196,6 @@ __bam_vrfy_treeorder(dbp, ip, h, lp, rp, func, flags)
return (__db_unknown_path(env, "__bam_vrfy_treeorder"));
}
- /* Populate a dummy cursor. */
if ((ret = __db_cursor_int(dbp, ip, NULL, DB_BTREE,
PGNO_INVALID, 0, DB_LOCK_INVALIDID, &dbc)) != 0)
return (ret);
@@ -2095,9 +2213,6 @@ __bam_vrfy_treeorder(dbp, ip, h, lp, rp, func, flags)
* parent and falsely report a failure.)
*/
if (lp != NULL && TYPE(h) != P_IBTREE) {
- if ((ret = __db_cursor_int(dbp, ip, NULL, DB_BTREE,
- PGNO_INVALID, 0, DB_LOCK_INVALIDID, &dbc)) != 0)
- return (ret);
if (lp->type == B_KEYDATA) {
dbt.data = lp->data;
dbt.size = lp->len;
@@ -2105,13 +2220,13 @@ __bam_vrfy_treeorder(dbp, ip, h, lp, rp, func, flags)
bo = (BOVERFLOW *)lp->data;
if ((ret = __db_goff(dbc, &dbt,
bo->tlen, bo->pgno, NULL, NULL)) != 0)
- return (ret);
- } else
- return (
- __db_unknown_path(env, "__bam_vrfy_treeorder"));
+ goto err;
+ } else {
+ ret = __db_unknown_path(env, "__bam_vrfy_treeorder");
+ goto err;
+ }
- /* On error, fall through, free if needed, and return. */
- if ((ret = __bam_cmp(dbc, &dbt, h, 0, func, &cmp)) == 0) {
+ if ((ret = __bam_cmp(dbc, &dbt, h, 0, func, &cmp, NULL)) == 0) {
if (cmp > 0) {
EPRINT((env, DB_STR_A("1092",
"Page %lu: first item on page sorted greater than parent entry",
@@ -2126,7 +2241,7 @@ __bam_vrfy_treeorder(dbp, ip, h, lp, rp, func, flags)
if (dbt.data != lp->data)
__os_ufree(env, dbt.data);
if (ret != 0)
- return (ret);
+ goto err;
}
if (rp != NULL) {
@@ -2137,13 +2252,14 @@ __bam_vrfy_treeorder(dbp, ip, h, lp, rp, func, flags)
bo = (BOVERFLOW *)rp->data;
if ((ret = __db_goff(dbc, &dbt,
bo->tlen, bo->pgno, NULL, NULL)) != 0)
- return (ret);
- } else
- return (
- __db_unknown_path(env, "__bam_vrfy_treeorder"));
+ goto err;
+ } else {
+ ret = __db_unknown_path(env, "__bam_vrfy_treeorder");
+ goto err;
+ }
- /* On error, fall through, free if needed, and return. */
- if ((ret = __bam_cmp(dbc, &dbt, h, last, func, &cmp)) == 0) {
+ if ((ret = __bam_cmp(dbc,
+ &dbt, h, last, func, &cmp, NULL)) == 0) {
if (cmp < 0) {
EPRINT((env, DB_STR_A("1094",
"Page %lu: last item on page sorted greater than parent entry",
@@ -2158,6 +2274,9 @@ __bam_vrfy_treeorder(dbp, ip, h, lp, rp, func, flags)
if (dbt.data != rp->data)
__os_ufree(env, dbt.data);
}
+err:
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
return (ret);
}
@@ -2186,14 +2305,20 @@ __bam_salvage(dbp, vdp, pgno, pgtype, h, handle, callback, key, flags)
{
BKEYDATA *bk;
BOVERFLOW *bo;
+ BBLOB bl;
DBT dbt, repldbt, unknown_key, unknown_data;
ENV *env;
VRFY_ITEM *pgmap;
db_indx_t i, last, beg, end, *inp;
db_pgno_t ovflpg;
+ off_t blob_size, blob_offset, remaining;
+ u_int32_t blob_buf_size;
+ u_int8_t *blob_buf;
u_int32_t himark, ovfl_bufsz;
+ db_seq_t blob_id, file_id, sdb_id;
void *ovflbuf;
int adj, ret, t_ret, t2_ret;
+ char *prefix;
#ifdef HAVE_COMPRESSION
DBT kcpy, *last_key;
int unknown_dup_key;
@@ -2202,6 +2327,8 @@ __bam_salvage(dbp, vdp, pgno, pgtype, h, handle, callback, key, flags)
env = dbp->env;
ovflbuf = pgmap = NULL;
inp = P_INP(dbp, h);
+ blob_buf_size = 0;
+ blob_buf = NULL;
memset(&dbt, 0, sizeof(DBT));
dbt.flags = DB_DBT_REALLOC;
@@ -2543,6 +2670,68 @@ __bam_salvage(dbp, vdp, pgno, pgtype, h, handle, callback, key, flags)
}
#endif
break;
+ case B_BLOB:
+ memcpy(&bl, bk, BBLOB_SIZE);
+ blob_id = (db_seq_t)bl.id;
+ GET_BLOB_SIZE(env, bl, blob_size, ret);
+ if (ret != 0 || blob_size < 0)
+ goto err;
+ file_id = (db_seq_t)bl.file_id;
+ sdb_id = (db_seq_t)bl.sdb_id;
+
+ /* Read the blob, in pieces if it is too large.*/
+ blob_offset = 0;
+ if (blob_size > MEGABYTE) {
+ if (blob_buf_size < MEGABYTE) {
+ if ((ret = __os_realloc(
+ env, MEGABYTE, &blob_buf)) != 0)
+ goto err;
+ blob_buf_size = MEGABYTE;
+ }
+ } else if (blob_buf_size < blob_size) {
+ blob_buf_size = (u_int32_t)blob_size;
+ if ((ret = __os_realloc(env,
+ blob_buf_size, &blob_buf)) != 0)
+ goto err;
+ }
+ dbt.data = blob_buf;
+ dbt.ulen = blob_buf_size;
+ remaining = blob_size;
+ prefix = " ";
+ do {
+ if ((ret = __blob_salvage(env, blob_id,
+ blob_offset,
+ ((remaining < blob_buf_size) ?
+ (size_t)remaining : blob_buf_size),
+ file_id, sdb_id, &dbt)) != 0) {
+ if (LF_ISSET(DB_AGGRESSIVE)) {
+ ret = DB_VERIFY_BAD;
+ break;
+ }
+ F_CLR(vdp, SALVAGE_STREAM_BLOB);
+ goto err;
+ }
+ if (remaining > blob_buf_size)
+ F_SET(vdp, SALVAGE_STREAM_BLOB);
+ else
+ F_CLR(vdp, SALVAGE_STREAM_BLOB);
+ if ((t_ret = __db_vrfy_prdbt(
+ &dbt, 0, prefix,
+ handle, callback, 0, 0, vdp)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ F_CLR(vdp, SALVAGE_STREAM_BLOB);
+ goto err;
+ }
+ prefix = NULL;
+ blob_offset += dbt.size;
+ if (remaining < blob_buf_size)
+ remaining = 0;
+ else
+ remaining -= blob_buf_size;
+ } while (remaining > 0);
+ F_CLR(vdp, SALVAGE_STREAM_BLOB);
+ break;
default:
/*
* We should never get here; __db_vrfy_inpitem should
@@ -2572,6 +2761,8 @@ err: if (pgmap != NULL)
__os_free(env, ovflbuf);
if (repldbt.data != NULL)
__os_free(env, repldbt.data);
+ if (blob_buf != NULL)
+ __os_free(env, blob_buf);
#ifdef HAVE_COMPRESSION
if (kcpy.data != NULL)
__os_free(env, kcpy.data);
diff --git a/src/btree/btree.src b/src/btree/btree.src
index 08e5a206..02088b88 100644
--- a/src/btree/btree.src
+++ b/src/btree/btree.src
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/clib/bsearch.c b/src/clib/bsearch.c
index 3e55009a..de15358b 100644
--- a/src/clib/bsearch.c
+++ b/src/clib/bsearch.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2010, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/clib/getcwd.c b/src/clib/getcwd.c
index 83e8b62d..028fc3f2 100644
--- a/src/clib/getcwd.c
+++ b/src/clib/getcwd.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright (c) 1989, 1991, 1993
diff --git a/src/clib/getopt.c b/src/clib/getopt.c
index ca98e7f1..4e4dc6c8 100644
--- a/src/clib/getopt.c
+++ b/src/clib/getopt.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright (c) 1987, 1993, 1994
diff --git a/src/clib/isalpha.c b/src/clib/isalpha.c
index 6bf1ffb7..39114c08 100644
--- a/src/clib/isalpha.c
+++ b/src/clib/isalpha.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/clib/isdigit.c b/src/clib/isdigit.c
index d1b2a65e..e4e1d3d8 100644
--- a/src/clib/isdigit.c
+++ b/src/clib/isdigit.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/clib/isprint.c b/src/clib/isprint.c
index 685e20ea..310894d5 100644
--- a/src/clib/isprint.c
+++ b/src/clib/isprint.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/clib/isspace.c b/src/clib/isspace.c
index df450d3b..48a20617 100644
--- a/src/clib/isspace.c
+++ b/src/clib/isspace.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/clib/memcmp.c b/src/clib/memcmp.c
index 7fec827c..7db1d3ad 100644
--- a/src/clib/memcmp.c
+++ b/src/clib/memcmp.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright (c) 1990, 1993
diff --git a/src/clib/memmove.c b/src/clib/memmove.c
index 34a181cc..866843dc 100644
--- a/src/clib/memmove.c
+++ b/src/clib/memmove.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright (c) 1990, 1993
diff --git a/src/clib/printf.c b/src/clib/printf.c
index a2c01296..f36eeb15 100644
--- a/src/clib/printf.c
+++ b/src/clib/printf.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/clib/raise.c b/src/clib/raise.c
index ad0e567f..223f797f 100644
--- a/src/clib/raise.c
+++ b/src/clib/raise.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/clib/rand.c b/src/clib/rand.c
index 6b810060..426627a9 100644
--- a/src/clib/rand.c
+++ b/src/clib/rand.c
@@ -13,6 +13,7 @@
* PUBLIC: void srand __P((unsigned int));
* PUBLIC: #endif
*/
+#ifndef HAVE_RAND
int rand(void) /* RAND_MAX assumed to be 32767 */
{
DB_GLOBAL(rand_next) = DB_GLOBAL(rand_next) * 1103515245 + 12345;
@@ -23,3 +24,4 @@ void srand(unsigned int seed)
{
DB_GLOBAL(rand_next) = seed;
}
+#endif
diff --git a/src/clib/snprintf.c b/src/clib/snprintf.c
index 6b31d850..8f1a6855 100644
--- a/src/clib/snprintf.c
+++ b/src/clib/snprintf.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/clib/strerror.c b/src/clib/strerror.c
index 62bd7dd5..b2d148e4 100644
--- a/src/clib/strerror.c
+++ b/src/clib/strerror.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright (c) 1988, 1993
diff --git a/src/clib/time.c b/src/clib/time.c
index abc2ab2d..3a3f0c3e 100644
--- a/src/clib/time.c
+++ b/src/clib/time.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2006, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2006, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/common/clock.c b/src/common/clock.c
index e1f917af..21a17de6 100644
--- a/src/common/clock.c
+++ b/src/common/clock.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/common/crypto_stub.c b/src/common/crypto_stub.c
index 95faebdb..b961a620 100644
--- a/src/common/crypto_stub.c
+++ b/src/common/crypto_stub.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/common/db_byteorder.c b/src/common/db_byteorder.c
index 71428f0a..13bc2d52 100644
--- a/src/common/db_byteorder.c
+++ b/src/common/db_byteorder.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/common/db_compint.c b/src/common/db_compint.c
index 9f5ccf9a..10317b2f 100644
--- a/src/common/db_compint.c
+++ b/src/common/db_compint.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
#include "db_config.h"
diff --git a/src/common/db_err.c b/src/common/db_err.c
index 6edc37b6..7acaa174 100644
--- a/src/common/db_err.c
+++ b/src/common/db_err.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -18,6 +18,11 @@
static void __db_msgcall __P((const DB_ENV *, const char *, va_list));
static void __db_msgfile __P((const DB_ENV *, const char *, va_list));
+#if defined(HAVE_ERROR_HISTORY)
+static void __db_thread_once_func __P((void));
+static void __db_deferred_free __P((void *));
+#endif
+
/*
* __db_fchk --
* General flags checking routine.
@@ -62,6 +67,9 @@ __db_ferr(env, name, iscombo)
const char *name;
int iscombo;
{
+ int ret;
+
+ ret = USR_ERR(env, EINVAL);
if (iscombo)
__db_errx(env, DB_STR_A("0054",
"illegal flag combination specified to %s", "%s"), name);
@@ -69,7 +77,7 @@ __db_ferr(env, name, iscombo)
__db_errx(env, DB_STR_A("0055",
"illegal flag specified to %s", "%s"), name);
- return (EINVAL);
+ return (ret);
}
/*
@@ -145,9 +153,24 @@ __db_assert(env, e, file, line)
if (DB_GLOBAL(j_assert) != NULL)
DB_GLOBAL(j_assert)(e, file, line);
else {
- __db_errx(env, DB_STR_A("0059",
- "assert failure: %s/%d: \"%s\"",
- "%s %d %s"), file, line, e);
+ /*
+ * If a panic has preceded this assertion failure, print that
+ * message as well -- it might be relevant.
+ */
+#ifdef HAVE_FAILCHK_BROADCAST
+ if (PANIC_ISSET(env)) {
+ REGENV *renv;
+ renv = (env == NULL || env->reginfo == NULL) ?
+ NULL : env->reginfo->primary;
+ __db_errx(env, DB_STR_A("0242",
+ "assert failure (%s/%d: %s) after panic %s",
+ "%s %d %s %s"), file, line, e,
+ renv == NULL ? "" : renv->failure_symptom);
+ } else
+#endif
+ __db_errx(env, DB_STR_A("0059",
+ "assert failure: %s/%d: \"%s\"",
+ "%s %d %s"), file, line, e);
__os_abort(env);
/* NOTREACHED */
@@ -156,8 +179,49 @@ __db_assert(env, e, file, line)
#endif
/*
+ * __env_panic_event --
+ * Notify the application of a db_register, failchk, or generic panic.
+ *
+ * PUBLIC: void __env_panic_event __P((ENV *, int));
+ */
+void
+__env_panic_event(env, errval)
+ ENV *env;
+ int errval;
+{
+ DB_ENV *dbenv;
+ REGENV *renv;
+ u_int32_t event;
+ void *info;
+ DB_EVENT_FAILCHK_INFO failinfo;
+
+ dbenv = env->dbenv;
+ info = &errval;
+ if (dbenv->db_paniccall != NULL) /* Deprecated */
+ dbenv->db_paniccall(dbenv, errval);
+ /*
+ * We check for DB_EVENT_FAILCHK_PANIC and DB_EVENT_REG_PANIC first because
+ * they are not set by themselves. If one of those is set, it means that
+ * this panic is somewhat an expected consequence of a previous failure.
+ */
+ renv = (env->reginfo == NULL) ? NULL : env->reginfo->primary;
+ if (renv != NULL && renv->failure_panic) {
+ event = DB_EVENT_FAILCHK_PANIC;
+ failinfo.error = errval;
+ (void)strncpy(failinfo.symptom,
+ renv->failure_symptom, sizeof(failinfo.symptom));
+ failinfo.symptom[sizeof(failinfo.symptom) - 1] = '\0';
+ info = &failinfo;
+ } else if (renv != NULL && renv->reg_panic)
+ event = DB_EVENT_REG_PANIC;
+ else
+ event = DB_EVENT_PANIC;
+ DB_EVENT(env, event, info);
+}
+
+/*
* __env_panic_msg --
- * Just report that someone else paniced.
+ * Report that we noticed a panic which had been set somewhere else.
*
* PUBLIC: int __env_panic_msg __P((ENV *));
*/
@@ -165,28 +229,16 @@ int
__env_panic_msg(env)
ENV *env;
{
- DB_ENV *dbenv;
int ret;
- dbenv = env->dbenv;
-
ret = DB_RUNRECOVERY;
+ /* Make a note saying where this panic was detected. */
+ (void)USR_ERR(env, ret);
__db_errx(env, DB_STR("0060",
"PANIC: fatal region error detected; run recovery"));
- if (dbenv->db_paniccall != NULL) /* Deprecated */
- dbenv->db_paniccall(dbenv, ret);
-
- /* Must check for DB_EVENT_REG_PANIC panic first because it is never
- * set by itself. If set, it means panic came from DB_REGISTER code
- * only, otherwise it could be from many possible places in the code.
- */
- if ((env->reginfo != NULL) &&
- (((REGENV *)env->reginfo->primary)->reg_panic))
- DB_EVENT(env, DB_EVENT_REG_PANIC, &ret);
- else
- DB_EVENT(env, DB_EVENT_PANIC, &ret);
+ __env_panic_event(env, ret);
return (ret);
}
@@ -202,28 +254,13 @@ __env_panic(env, errval)
ENV *env;
int errval;
{
- DB_ENV *dbenv;
-
- dbenv = env->dbenv;
-
if (env != NULL) {
__env_panic_set(env, 1);
- __db_err(env, errval, DB_STR("0061", "PANIC"));
+ if (errval != DB_RUNRECOVERY)
+ __db_err(env, errval, DB_STR("0061", "PANIC"));
- if (dbenv->db_paniccall != NULL) /* Deprecated */
- dbenv->db_paniccall(dbenv, errval);
-
- /* Must check for DB_EVENT_REG_PANIC first because it is never
- * set by itself. If set, it means panic came from DB_REGISTER
- * code only, otherwise it could be from many possible places
- * in the code.
- */
- if ((env->reginfo != NULL) &&
- (((REGENV *)env->reginfo->primary)->reg_panic))
- DB_EVENT(env, DB_EVENT_REG_PANIC, &errval);
- else
- DB_EVENT(env, DB_EVENT_PANIC, &errval);
+ __env_panic_event(env, errval);
}
#if defined(DIAGNOSTIC) && !defined(CONFIG_TEST)
@@ -302,6 +339,9 @@ db_strerror(error)
case DB_LOG_VERIFY_BAD:
return (DB_STR("0071",
"DB_LOG_VERIFY_BAD: Log verification failed"));
+ case DB_META_CHKSUM_FAIL:
+ return (DB_STR("0247",
+ "DB_META_CHKSUM_FAIL: Checksum mismatch detected on a database metadata page"));
case DB_NOSERVER:
return (DB_STR("0072",
"DB_NOSERVER: No message dispatch call-back function has been configured"));
@@ -419,18 +459,21 @@ __db_syserr(env, error, fmt, va_alist)
DB_ENV *dbenv;
dbenv = env == NULL ? NULL : env->dbenv;
+ if (env != NULL)
+ (void)USR_ERR(env, error);
/*
* The same as DB->err, except we don't default to writing to stderr
* after any output channel has been configured, and we use a system-
* specific function to translate errors to strings.
*/
- DB_REAL_ERR(dbenv, error, DB_ERROR_SYSTEM, 0, fmt);
+ DB_REAL_ERR(dbenv,
+ error, error == 0 ? DB_ERROR_NOT_SET : DB_ERROR_SYSTEM, 0, fmt);
}
/*
* __db_err --
- * Standard error routine.
+ * Standard error routine with an error code.
*
* PUBLIC: void __db_err __P((const ENV *, int, const char *, ...))
* PUBLIC: __attribute__ ((__format__ (__printf__, 3, 4)));
@@ -450,6 +493,10 @@ __db_err(env, error, fmt, va_alist)
dbenv = env == NULL ? NULL : env->dbenv;
+ /* (If no deferred messages yet, at least?) add this call's info.
+ (void)USR_ERR(env, error);
+ */
+
/*
* The same as DB->err, except we don't default to writing to stderr
* once an output channel has been configured.
@@ -459,7 +506,7 @@ __db_err(env, error, fmt, va_alist)
/*
* __db_errx --
- * Standard error routine.
+ * Standard error routine without any error code.
*
* PUBLIC: void __db_errx __P((const ENV *, const char *, ...))
* PUBLIC: __attribute__ ((__format__ (__printf__, 2, 3)));
@@ -500,25 +547,54 @@ __db_errcall(dbenv, error, error_set, fmt, ap)
const char *fmt;
va_list ap;
{
- char *p;
- char buf[2048]; /* !!!: END OF THE STACK DON'T TRUST SPRINTF. */
- char sysbuf[1024]; /* !!!: END OF THE STACK DON'T TRUST SPRINTF. */
+ char *end, *p;
+ char buf[2048 + DB_ERROR_HISTORY_SIZE];
+ char sysbuf[1024];
+#ifdef HAVE_ERROR_HISTORY
+ DB_MSGBUF *deferred_mb;
+ ptrdiff_t len;
+#endif
p = buf;
+ /* Reserve 1 byte at the end for '\0'. */
+ end = buf + sizeof(buf) - 1;
if (fmt != NULL)
p += vsnprintf(buf, sizeof(buf), fmt, ap);
+
if (error_set != DB_ERROR_NOT_SET)
- p += snprintf(p,
- sizeof(buf) - (size_t)(p - buf), ": %s",
+ p += snprintf(p, (size_t)(end - p), ": %s",
error_set == DB_ERROR_SET ? db_strerror(error) :
__os_strerror(error, sysbuf, sizeof(sysbuf)));
+#ifdef HAVE_ERROR_HISTORY
+ /*
+ * Append any messages (e.g., diagnostics) stashed away in the deferred
+ * msgbuf. Strncpy() can't be trusted to append '\0', do it "manually".
+ */
+ if ((deferred_mb = __db_deferred_get()) != NULL &&
+ (len = deferred_mb->cur - deferred_mb->buf) != 0) {
+ p += snprintf(p,
+ (size_t)(end - p), "\nErrors during this API call:");
+ if (len > (end - p))
+ len = end - p;
+ if (len != 0) {
+ memmove(p, deferred_mb->buf, (size_t)len);
+ p[len] = '\0';
+ }
+ }
+#endif
+
dbenv->db_errcall(dbenv, dbenv->db_errpfx, buf);
}
/*
* __db_errfile --
- * Do the error message work for FILE *s.
+ * Do the error message work for FILE *s. Combine the messages into a
+ * single fprintf() call, to avoid interspersed output when there are
+ * multiple active threads.
+ *
+ * Display a ": " after the dbenv prefix, if it has one.
+ * Display a ": " before the error message string, if the error was set.
*
* PUBLIC: void __db_errfile
* PUBLIC: __P((const DB_ENV *, int, db_error_set_t, const char *, va_list));
@@ -532,29 +608,62 @@ __db_errfile(dbenv, error, error_set, fmt, ap)
va_list ap;
{
FILE *fp;
- int need_sep;
- char sysbuf[1024]; /* !!!: END OF THE STACK DON'T TRUST SPRINTF. */
+ char *defintro, *defmsgs, *error_str, *prefix, *sep1, *sep2;
+ char sysbuf[200];
+ char prefix_buf[200];
+ char full_fmt[4096];
+#ifdef HAVE_ERROR_HISTORY
+ DB_MSGBUF *deferred_mb;
+ size_t room;
+#endif
+ prefix = sep1 = sep2 = error_str = "";
fp = dbenv == NULL ||
dbenv->db_errfile == NULL ? stderr : dbenv->db_errfile;
- need_sep = 0;
+ if (fmt == NULL)
+ fmt = "";
if (dbenv != NULL && dbenv->db_errpfx != NULL) {
- (void)fprintf(fp, "%s", dbenv->db_errpfx);
- need_sep = 1;
+ prefix = __db_fmt_quote(prefix_buf,
+ sizeof(prefix_buf), dbenv->db_errpfx);
+ sep1 = ": ";
}
- if (fmt != NULL && fmt[0] != '\0') {
- if (need_sep)
- (void)fprintf(fp, ": ");
- need_sep = 1;
- (void)vfprintf(fp, fmt, ap);
+ switch (error_set) {
+ case DB_ERROR_NOT_SET:
+ break;
+ case DB_ERROR_SET:
+ error_str = db_strerror(error);
+ sep2 = ": ";
+ break;
+ case DB_ERROR_SYSTEM:
+ error_str = __os_strerror(error, sysbuf, sizeof(sysbuf));
+ sep2 = ": ";
+ break;
}
- if (error_set != DB_ERROR_NOT_SET)
- (void)fprintf(fp, "%s%s",
- need_sep ? ": " : "",
- error_set == DB_ERROR_SET ? db_strerror(error) :
- __os_strerror(error, sysbuf, sizeof(sysbuf)));
- (void)fprintf(fp, "\n");
+#ifdef HAVE_ERROR_HISTORY
+ if ((deferred_mb = __db_deferred_get()) != NULL &&
+ deferred_mb->cur != deferred_mb->buf) {
+ defmsgs =
+ __db_fmt_quote(deferred_mb->buf, deferred_mb->len, NULL);
+ defintro = "\nErrors during this API call:";
+ /*
+ * If there are more deferred messages than will be displayed
+ * change the introductory message to warn of the truncation.
+ */
+ room = sizeof(full_fmt) - (strlen(sep1) +
+ strlen(fmt) + strlen(sep2) + strlen(error_str));
+ if (deferred_mb->len + strlen(defintro) > room) {
+ defintro =
+ "\nFirst recorded errors during this API call:";
+ memmove(defmsgs + room - 4, "...\n", 4);
+ }
+
+ } else
+#endif
+ defmsgs = defintro = "";
+ (void)snprintf(full_fmt, sizeof(full_fmt), "%s%s%s%s%s%s%s\n", prefix,
+ sep1, fmt, sep2, error_str, defintro, defmsgs);
+ (void)vfprintf(fp, full_fmt, ap);
(void)fflush(fp);
}
@@ -562,15 +671,15 @@ __db_errfile(dbenv, error, error_set, fmt, ap)
* __db_msgadd --
* Aggregate a set of strings into a buffer for the callback API.
*
- * PUBLIC: void __db_msgadd __P((ENV *, DB_MSGBUF *, const char *, ...))
+ * PUBLIC: void __db_msgadd __P((const ENV *, DB_MSGBUF *, const char *, ...))
* PUBLIC: __attribute__ ((__format__ (__printf__, 3, 4)));
*/
void
#ifdef STDC_HEADERS
-__db_msgadd(ENV *env, DB_MSGBUF *mbp, const char *fmt, ...)
+__db_msgadd(const ENV *env, DB_MSGBUF *mbp, const char *fmt, ...)
#else
__db_msgadd(env, mbp, fmt, va_alist)
- ENV *env;
+ const ENV *env;
DB_MSGBUF *mbp;
const char *fmt;
va_dcl
@@ -592,17 +701,17 @@ __db_msgadd(env, mbp, fmt, va_alist)
* Aggregate a set of strings into a buffer for the callback API.
*
* PUBLIC: void __db_msgadd_ap
- * PUBLIC: __P((ENV *, DB_MSGBUF *, const char *, va_list));
+ * PUBLIC: __P((const ENV *, DB_MSGBUF *, const char *, va_list));
*/
void
__db_msgadd_ap(env, mbp, fmt, ap)
- ENV *env;
+ const ENV *env;
DB_MSGBUF *mbp;
const char *fmt;
va_list ap;
{
- size_t len, olen;
- char buf[2048]; /* !!!: END OF THE STACK DON'T TRUST SPRINTF. */
+ size_t len, nlen, olen;
+ char buf[2048];
len = (size_t)vsnprintf(buf, sizeof(buf), fmt, ap);
@@ -613,9 +722,16 @@ __db_msgadd_ap(env, mbp, fmt, ap)
*/
olen = (size_t)(mbp->cur - mbp->buf);
if (olen + len >= mbp->len) {
- if (__os_realloc(env, mbp->len + len + 256, &mbp->buf))
+ /* Don't write too much for preallocated DB_MSGBUFs. */
+ if (F_ISSET(mbp, DB_MSGBUF_PREALLOCATED)) {
+ memset(mbp->cur, '*', mbp->len - olen);
+ mbp->cur = mbp->buf + mbp->len;
return;
- mbp->len += (len + 256);
+ }
+ nlen = mbp->len + len + (env == NULL ? 8192 : 256);
+ if (__os_realloc(env, nlen, &mbp->buf))
+ return;
+ mbp->len = nlen;
mbp->cur = mbp->buf + olen;
}
@@ -648,6 +764,42 @@ __db_msg(env, fmt, va_alist)
}
/*
+ * __db_debug_msg --
+ * Save a message to be displayed only if this API call returns an error.
+ * The message is discarded if this API call succeeds.
+ *
+ * PUBLIC: void __db_debug_msg __P((const ENV *, const char *, ...));
+ */
+void
+#ifdef STDC_HEADERS
+__db_debug_msg(const ENV *env, const char *fmt, ...)
+#else
+__db_debug_msg(env, fmt, va_alist)
+ const ENV *env;
+ const char *fmt;
+ va_dcl
+#endif
+{
+#ifdef HAVE_ERROR_HISTORY
+ DB_MSGBUF *mb;
+ va_list ap;
+
+ if (env == NULL || (mb = __db_deferred_get()) == NULL)
+ return;
+
+#ifdef STDC_HEADERS
+ va_start(ap, fmt);
+#else
+ va_start(ap);
+#endif
+ __db_msgadd_ap(env, mb, fmt, ap);
+ va_end(ap);
+#endif
+ COMPQUIET(env, NULL);
+ COMPQUIET(fmt, NULL);
+}
+
+/*
* __db_repmsg --
* Replication system message routine.
*
@@ -665,7 +817,7 @@ __db_repmsg(env, fmt, va_alist)
#endif
{
va_list ap;
- char buf[2048]; /* !!!: END OF THE STACK DON'T TRUST SPRINTF. */
+ char buf[2048];
#ifdef STDC_HEADERS
va_start(ap, fmt);
@@ -679,7 +831,7 @@ __db_repmsg(env, fmt, va_alist)
/*
* __db_msgcall --
- * Do the message work for callback functions.
+ * Do the message work for callback functions in DB_REAL_MSG().
*/
static void
__db_msgcall(dbenv, fmt, ap)
@@ -687,16 +839,15 @@ __db_msgcall(dbenv, fmt, ap)
const char *fmt;
va_list ap;
{
- char buf[2048]; /* !!!: END OF THE STACK DON'T TRUST SPRINTF. */
+ char buf[2048];
(void)vsnprintf(buf, sizeof(buf), fmt, ap);
-
dbenv->db_msgcall(dbenv, buf);
}
/*
* __db_msgfile --
- * Do the message work for FILE *s.
+ * Do the message work for FILE *s in DB_REAL_MSG().
*/
static void
__db_msgfile(dbenv, fmt, ap)
@@ -805,6 +956,13 @@ __db_check_txn(dbp, txn, assoc_locker, read_op)
if (IS_RECOVERING(env) || F_ISSET(dbp, DB_AM_RECOVER))
return (0);
+ if (txn != NULL && dbp->blob_threshold &&
+ F_ISSET(txn, (TXN_READ_UNCOMMITTED | TXN_SNAPSHOT))) {
+ __db_errx(env, DB_STR("0237",
+"Blob enabled databases do not support DB_READ_UNCOMMITTED and TXN_SNAPSHOT"));
+ return (EINVAL);
+ }
+
/*
* Check for common transaction errors:
* an operation on a handle whose open commit hasn't completed.
@@ -1095,9 +1253,9 @@ __db_space_err(dbp)
/*
* __db_failed --
- * Common failed thread message.
+ * Common failed thread message, e.g., after it is seen to have crashed.
*
- * PUBLIC: int __db_failed __P((const ENV *,
+ * PUBLIC: int __db_failed __P((const ENV *,
* PUBLIC: const char *, pid_t, db_threadid_t));
*/
int
@@ -1108,11 +1266,321 @@ __db_failed(env, msg, pid, tid)
db_threadid_t tid;
{
DB_ENV *dbenv;
- char buf[DB_THREADID_STRLEN];
+ int ret;
+ char tidstr[DB_THREADID_STRLEN], failmsg[DB_FAILURE_SYMPTOM_SIZE];
dbenv = env->dbenv;
+ (void)dbenv->thread_id_string(dbenv, pid, tid, tidstr);
+ ret = USR_ERR(env, DB_RUNRECOVERY);
+ snprintf(failmsg, sizeof(failmsg), DB_STR_A("0113",
+ "Thread/process %s failed: %s", "%s %s"), tidstr, msg);
+ (void)__env_failure_remember(env, failmsg);
+ __db_errx(env, "%s", failmsg);
+ return (ret);
+}
- __db_errx(env, DB_STR_A("0113", "Thread/process %s failed: %s",
- "%s %s"), dbenv->thread_id_string(dbenv, pid, tid, buf), msg);
- return (DB_RUNRECOVERY);
+/*
+ * __env_failure_remember --
+ * If this failure of a process in the environment is about to set panic
+ * for the first time, record that a crashed thread was the culprit.
+ * Do nothing if panic has already been set. There are no mutexes here,
+ * in order to avoid hanging on any crashed threads.
+ *
+ * PUBLIC: int __env_failure_remember __P((const ENV *, const char *));
+ */
+int
+__env_failure_remember(env, reason)
+ const ENV *env;
+ const char *reason;
+{
+ REGENV *renv;
+
+ renv = env->reginfo->primary;
+ if (renv == NULL || renv->panic || renv->failure_panic)
+ return (0);
+ renv->failure_panic = 1;
+ if (renv->failure_symptom[0] == '\0') {
+ (void)strncpy(renv->failure_symptom,
+ reason, sizeof(renv->failure_symptom));
+ renv->failure_symptom[sizeof(renv->failure_symptom) - 1] = '\0';
+ }
+ return (0);
+}
+
+#if defined(HAVE_ERROR_HISTORY)
+/*
+ * __db_deferred_free --
+ * Pthread_exit() calls this to release DB_GLOBAL(msgs_key)'s
+ * thread-local storage.
+ */
+static void
+__db_deferred_free(void *p)
+{
+ DB_MSGBUF *mb;
+
+ if ((mb = p) != NULL) {
+ (void)pthread_setspecific(DB_GLOBAL(msgs_key), NULL);
+ if (mb->buf != NULL)
+ __os_free(NULL, mb->buf);
+ free(mb);
+ }
+}
+
+/*
+ * __db_thread_once_func --
+ * The pthread_once() function that initializes thread-local storage.
+ */
+static void
+__db_thread_once_func()
+{
+ (void)pthread_key_create(&DB_GLOBAL(msgs_key), __db_deferred_free);
+}
+
+/*
+ * __db_thread_init --
+ * Initialization hook to be called at least once per process, before
+ * deferring any messages.
+ *
+ * PUBLIC: #ifdef HAVE_ERROR_HISTORY
+ * PUBLIC: void __db_thread_init __P((void));
+ * PUBLIC: #endif
+ */
+void
+__db_thread_init()
+{
+ /*
+ * Assign the thread-local storage identifier. Tell thread exit to clean
+ * up with __db_deferred_free().
+ */
+ (void)pthread_once(&DB_GLOBAL(thread_once), __db_thread_once_func);
+}
+
+/*
+ * __db_diags --
+ *
+ * Save the context which triggers the "first notice" of an error code;
+ * i.e., its creation. It doesn't touch anything when err == 0.
+ *
+ * PUBLIC: #ifdef HAVE_ERROR_HISTORY
+ * PUBLIC: int __db_diags __P((const ENV *, int));
+ * PUBLIC: #endif
+ */
+ int
+__db_diags(env, err)
+ const ENV *env;
+ int err;
+{
+ DB_MSGBUF *mb;
+
+ if (err != 0 && (mb = __db_deferred_get()) != NULL)
+ (void)__db_remember_context(env, mb, err);
+ return (err);
+}
+
+/*
+ * __db_deferred_get --
+ * Get this thread's deferred DB_MSGBUF, possibly allocating it.
+ *
+ * PUBLIC: #ifdef HAVE_ERROR_HISTORY
+ * PUBLIC: DB_MSGBUF *__db_deferred_get __P((void));
+ * PUBLIC: #endif
+ */
+DB_MSGBUF *
+__db_deferred_get()
+{
+ DB_MSGBUF *mb;
+
+ if ((mb = pthread_getspecific(DB_GLOBAL(msgs_key))) == NULL) {
+ if ((mb = calloc(1, sizeof(*mb))) != NULL)
+ if (pthread_setspecific(DB_GLOBAL(msgs_key), mb) != 0) {
+ /* Nothing else is safe to do on an error. */
+ free(mb);
+ mb = NULL;
+ }
+ }
+ return (mb);
+}
+
+/*
+ * __db_deferred_discard --
+ * Discard any saved-up deferred messages, at e.g. the end of the command.
+ *
+ * PUBLIC: #ifdef HAVE_ERROR_HISTORY
+ * PUBLIC: void __db_deferred_discard __P((void));
+ * PUBLIC: #endif
+ */
+void
+__db_deferred_discard()
+{
+ DB_MSGBUF *mb;
+
+ if ((mb = pthread_getspecific(DB_GLOBAL(msgs_key))) != NULL)
+ mb->cur = mb->buf;
+}
+
+/*
+ * __db_remember_context --
+ * Save the context which triggers the "first notice" of an error code;
+ * i.e., its creation. Include the time, thread, recent portion of the
+ * stack, and the error number. Add replication info too?
+ *
+ * Return the error number passed in, or 0?
+ *
+ * PUBLIC: #ifdef HAVE_ERROR_HISTORY
+ * PUBLIC: int __db_remember_context __P((const ENV *, DB_MSGBUF *, int));
+ * PUBLIC: #endif
+ */
+ int
+ __db_remember_context(env, mb, err)
+ const ENV *env;
+ DB_MSGBUF *mb;
+ int err;
+{
+ DB_ENV *dbenv;
+ LOG *lp;
+ db_timespec now;
+ pid_t pid;
+ db_threadid_t tid;
+ char threadid[DB_THREADID_STRLEN], timestr[CTIME_BUFLEN];
+
+ /* Limit the amount of context messages which are remembered. */
+ if (mb->len >= DB_ERROR_HISTORY_SIZE)
+ return (0);
+
+ lp = NULL;
+ if (env == NULL) {
+ dbenv = NULL;
+ threadid[0] = '\0';
+ } else {
+ dbenv = env->dbenv;
+ dbenv->thread_id(dbenv, &pid, &tid);
+ (void)dbenv->thread_id_string(dbenv, pid, tid, threadid);
+ if (LOGGING_ON(env) && !IS_RECOVERING(env))
+ lp = env->lg_handle->reginfo.primary;
+ }
+
+ __os_gettime(env, &now, 0);
+ (void)__db_ctimespec(&now, timestr);
+ __db_msgadd(env, mb, "\n[%s][%s] %s",
+ timestr, threadid, db_strerror(err));
+ if (lp != NULL)
+ __db_msgadd(env, mb, " lsn [%lu][%lu]",
+ (u_long)lp->lsn.file, (u_long)lp->lsn.offset);
+
+#if defined(HAVE_BACKTRACE) && defined(HAVE_BACKTRACE_SYMBOLS)
+ /*
+ * Add many frames of stack trace to the record, skipping the first two
+ * frames: __os_stack_msgadd() and __db_remember_context().
+ */
+ __db_msgadd(env, mb, " from\n");
+ __os_stack_msgadd(env, mb, 15, 2, NULL);
+#endif
+
+ return (0);
+}
+#endif
+
+/*
+ * __db_ctimespec --
+ * Format a timespec in microseconds, similar to a terse __os_ctime(),
+ * storing the results into a CTIME_BUFLEN sized buffer.
+ * The result format depends on the availability of localtime, etc
+ * MM/DD HH:MM:SS.uuuuuu if strftime is available, or
+ * Jan DD HH:MM:SS.uuuuuu if only __os_ctime() is available.
+ * Both are small enough to use __os_ctime() sized buffer, e.g. 26.
+ * The other fields (year, day-of-week, ...) are intentionally removed.
+ *
+ * PUBLIC: char * __db_ctimespec __P((const db_timespec *, char *));
+ */
+char *
+__db_ctimespec(timespec, buf)
+ const db_timespec *timespec;
+ char *buf;
+{
+ char *d, date[CTIME_BUFLEN];
+#ifdef HAVE_STRFTIME
+ struct tm *tm_p;
+#ifdef HAVE_LOCALTIME_R
+ struct tm tm;
+#endif
+#endif
+
+ /* Print the time readably if possible; else print seconds. */
+#ifdef HAVE_STRFTIME
+#ifdef HAVE_LOCALTIME_R
+ tm_p = localtime_r(&timespec->tv_sec, &tm);
+#else
+ tm_p = localtime(&timespec->tv_sec);
+#endif
+ if (tm_p != NULL) {
+ d = date;
+ (void)strftime(d, sizeof(date), DB_GLOBAL(time_format), tm_p);
+ }
+ else
+#endif
+ {
+ /* Trim off the leading day-of-week; then the trailing year. */
+ d = __os_ctime(&timespec->tv_sec, date) + 4;
+ d[sizeof("Jan 01 00:00:00")] = '\0';
+ }
+ (void)snprintf(buf, CTIME_BUFLEN,
+ "%s.%06lu", d, (u_long)(timespec->tv_nsec / NS_PER_US));
+ buf[CTIME_BUFLEN - 1] = '\0'; /* In case of buggy snprintf. */
+ return (buf);
+}
+
+/*
+ * __db_fmt_quote --
+ * Copy a printf format string, quoting (doubling) each '%' along the way.
+ * Use this when inserting a user-defined string into a *printf format.
+ * If the src parameter is NULL, then quote in-place, shifting the
+ * rest of the string down by one character for each quote.
+ *
+ * PUBLIC: char *__db_fmt_quote __P((char *, size_t, const char *));
+ */
+char *
+__db_fmt_quote(dest, destsize, src)
+ char *dest;
+ size_t destsize;
+ const char *src;
+{
+ char *d, *end;
+ const char *s;
+ size_t len;
+
+ /* Stop early enough so that dest always has room for a '\0'. */
+ end = dest + destsize - 1;
+ if (src == NULL) {
+ d = dest;
+ while ((d = strchr(d, '%')) != NULL && d[1] != '\0') {
+ /*
+ * Shift the rest of the string by one byte to make
+ * space for another '%'. By starting at d and adding 1
+ * to the length, we double the '%' while copying the
+ * string and its terminating '\0'.
+ */
+ len = strlen(d) + 1;
+ memmove(d + 1, d, len);
+ /*
+ * We're done if the string now is larger than the
+ * reserved size; else advance over both '%'s.
+ */
+ if (d + len >= end) {
+ DB_ASSERT(NULL, d + len == end);
+ *end = '\0';
+ break;
+ }
+ d += 2;
+ }
+ } else {
+ for (s = src, d = dest; *s != '\0' && d < end; d++, s++)
+ if ((*d = *s) == '%') {
+ /* Discard a % at the end of the string. */
+ if (s[1] == '\0')
+ break;
+ *++d = '%';
+ }
+ *d = '\0';
+ }
+ return (dest);
}
diff --git a/src/common/db_getlong.c b/src/common/db_getlong.c
index cac55a0e..2dca6891 100644
--- a/src/common/db_getlong.c
+++ b/src/common/db_getlong.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/common/db_idspace.c b/src/common/db_idspace.c
index a9cbb1bf..4ac18e42 100644
--- a/src/common/db_idspace.c
+++ b/src/common/db_idspace.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/common/db_log2.c b/src/common/db_log2.c
index 9c929f84..42eb7e3a 100644
--- a/src/common/db_log2.c
+++ b/src/common/db_log2.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright (c) 1995, 1996
diff --git a/src/common/db_shash.c b/src/common/db_shash.c
index a056e4b1..df862c04 100644
--- a/src/common/db_shash.c
+++ b/src/common/db_shash.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/common/dbt.c b/src/common/dbt.c
index 90409f2c..4a9970d9 100644
--- a/src/common/dbt.c
+++ b/src/common/dbt.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/common/mkpath.c b/src/common/mkpath.c
index c684692c..163dbfba 100644
--- a/src/common/mkpath.c
+++ b/src/common/mkpath.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/common/openflags.c b/src/common/openflags.c
index cec1f081..91d6e51b 100644
--- a/src/common/openflags.c
+++ b/src/common/openflags.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/common/os_method.c b/src/common/os_method.c
index 1ee06d7a..34627d59 100644
--- a/src/common/os_method.c
+++ b/src/common/os_method.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/common/util_arg.c b/src/common/util_arg.c
index 73416cb7..f5db1831 100644
--- a/src/common/util_arg.c
+++ b/src/common/util_arg.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/common/util_cache.c b/src/common/util_cache.c
index 1206940b..f0bc398d 100644
--- a/src/common/util_cache.c
+++ b/src/common/util_cache.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2000, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2000, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/common/util_log.c b/src/common/util_log.c
index d158d3f0..ffe69394 100644
--- a/src/common/util_log.c
+++ b/src/common/util_log.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2000, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2000, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/common/util_sig.c b/src/common/util_sig.c
index 02a0fcb2..b159cc80 100644
--- a/src/common/util_sig.c
+++ b/src/common/util_sig.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2000, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2000, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/common/zerofill.c b/src/common/zerofill.c
index 37662ddc..09d0dafe 100644
--- a/src/common/zerofill.c
+++ b/src/common/zerofill.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/crypto/aes_method.c b/src/crypto/aes_method.c
index 47193539..fed98f2b 100644
--- a/src/crypto/aes_method.c
+++ b/src/crypto/aes_method.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved.
*
* Some parts of this code originally written by Adam Stubblefield,
* -- astubble@rice.edu.
diff --git a/src/crypto/crypto.c b/src/crypto/crypto.c
index b731496f..ba115dd3 100644
--- a/src/crypto/crypto.c
+++ b/src/crypto/crypto.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* Some parts of this code originally written by Adam Stubblefield
* -- astubble@rice.edu
@@ -15,6 +15,8 @@
#include "dbinc/db_page.h"
#include "dbinc/crypto.h"
+static void randomize __P((ENV *, void *, size_t));
+
/*
* __crypto_region_init --
* Initialize crypto.
@@ -110,7 +112,7 @@ __crypto_region_init(env)
* existing one, we are done with the passwd in the env. We smash
* N-1 bytes so that we don't overwrite the nul.
*/
- memset(dbenv->passwd, 0xff, dbenv->passwd_len-1);
+ randomize(env, dbenv->passwd, dbenv->passwd_len - 1);
__os_free(env, dbenv->passwd);
dbenv->passwd = NULL;
dbenv->passwd_len = 0;
@@ -135,9 +137,10 @@ __crypto_env_close(env)
dbenv = env->dbenv;
if (dbenv->passwd != NULL) {
- memset(dbenv->passwd, 0xff, dbenv->passwd_len-1);
+ randomize(env, dbenv->passwd, dbenv->passwd_len - 1);
__os_free(env, dbenv->passwd);
dbenv->passwd = NULL;
+ dbenv->passwd_len = 0;
}
if (!CRYPTO_ON(env))
@@ -225,7 +228,8 @@ __crypto_algsetup(env, db_cipher, alg, do_init)
/*
* __crypto_decrypt_meta --
- * Perform decryption on a metapage if needed.
+ * Perform decryption on a possible metadata page, if needed. This is used
+ * to help decide whether this is a real DB. Don't trust random data.
*
* PUBLIC: int __crypto_decrypt_meta __P((ENV *, DB *, u_int8_t *, int));
*/
@@ -241,6 +245,7 @@ __crypto_decrypt_meta(env, dbp, mbuf, do_metachk)
DB_CIPHER *db_cipher;
size_t pg_off;
int ret;
+ unsigned added_flags;
u_int8_t *iv;
/*
@@ -293,6 +298,7 @@ __crypto_decrypt_meta(env, dbp, mbuf, do_metachk)
*/
if (meta->encrypt_alg != 0) {
db_cipher = env->crypto_handle;
+ added_flags = 0;
if (!F_ISSET(dbp, DB_AM_ENCRYPT)) {
if (!CRYPTO_ON(env)) {
__db_errx(env, DB_STR("0178",
@@ -300,12 +306,14 @@ __crypto_decrypt_meta(env, dbp, mbuf, do_metachk)
return (EINVAL);
}
/*
- * User has a correct, secure env, but has encountered
- * a database in that env that is secure, but user
- * didn't dbp->set_flags. Since it is existing, use
- * encryption if it is that way already.
+ * User has a correct, secure env and has encountered
+ * a database in that env that APPEARS TO BE secure, but
+ * user didn't set the encryption flags. Since the db
+ * already exists, turn encryption on. Remember what was
+ * set, so the flags can be restored if it doesn't decrypt.
*/
- F_SET(dbp, DB_AM_ENCRYPT|DB_AM_CHKSUM);
+ added_flags = DB_AM_ENCRYPT | DB_AM_CHKSUM;
+ F_SET(dbp, added_flags);
}
/*
* This was checked in set_flags when DB_AM_ENCRYPT was set.
@@ -316,6 +324,7 @@ __crypto_decrypt_meta(env, dbp, mbuf, do_metachk)
meta->encrypt_alg != db_cipher->alg) {
__db_errx(env, DB_STR("0179",
"Database encrypted using a different algorithm"));
+ F_CLR(dbp, added_flags);
return (EINVAL);
}
DB_ASSERT(env, F_ISSET(dbp, DB_AM_CHKSUM));
@@ -334,12 +343,14 @@ alg_retry:
if (!F_ISSET(db_cipher, CIPHER_ANY)) {
if (do_metachk && (ret = db_cipher->decrypt(env,
db_cipher->data, iv, mbuf + pg_off,
- DBMETASIZE - pg_off)))
+ DBMETASIZE - pg_off))) {
+ F_CLR(dbp, added_flags);
return (ret);
- if (((BTMETA *)meta)->crypto_magic !=
- meta->magic) {
+ }
+ if (((BTMETA *)meta)->crypto_magic != meta->magic) {
__db_errx(env, DB_STR("0180",
"Invalid password"));
+ F_CLR(dbp, added_flags);
return (EINVAL);
}
/*
@@ -409,3 +420,45 @@ __crypto_set_passwd(env_src, env_dest)
sh_passwd = R_ADDR(infop, cipher->passwd);
return (__env_set_encrypt(env_dest->dbenv, sh_passwd, DB_ENCRYPT_AES));
}
+
+/*
+ * randomize --
+ *	Fill size bytes at base with random data; base[size] is untouched.
+ */
+static void
+randomize(env, base, size)
+	ENV *env;
+	void *base;
+	size_t size;
+{
+	size_t i, copysize;
+	u_int8_t last, *p;
+	u_int32_t value;
+
+	/* Remember the byte just past the range for the overrun check below. */
+	last = ((u_int8_t *)base)[size];
+	for (i = 0, p = base; i < size; i += copysize, p += copysize) {
+		value = __os_random();
+		if ((copysize = (size - i)) > sizeof(int32_t))
+			copysize = sizeof(int32_t);
+		switch (copysize) {
+		default:
+			memmove(p, &value, sizeof(int32_t));
+			break;
+		case 3:
+			p[2] = (u_int8_t)(value >> 16);
+			/* FALLTHROUGH */
+		case 2:
+			p[1] = (u_int8_t)(value >> 8);
+			/* FALLTHROUGH */
+		case 1:
+			p[0] = (u_int8_t)(value);
+			break;
+		case 0:
+			/* Not reachable while i < size. */
+			DB_ASSERT(env, copysize != 0);
+			break;
+		}
+	}
+	DB_ASSERT(env, last == *p);
+}
diff --git a/src/crypto/mersenne/mt19937db.c b/src/crypto/mersenne/mt19937db.c
index 2d53c312..0460b994 100644
--- a/src/crypto/mersenne/mt19937db.c
+++ b/src/crypto/mersenne/mt19937db.c
@@ -156,7 +156,7 @@ __db_genrand(env)
* function will return 4 bytes if we don't send in a key.
*/
do {
- __os_gettime(env, &ts, 1);
+ __os_gettime(env, &ts, 0);
__db_chksum(NULL, (u_int8_t *)&ts.tv_sec,
sizeof(ts.tv_sec), NULL, (u_int8_t *)&seed);
} while (seed == 0);
diff --git a/src/crypto/rijndael/rijndael-api-fst.c b/src/crypto/rijndael/rijndael-api-fst.c
index 3fd6489d..5d67937c 100644
--- a/src/crypto/rijndael/rijndael-api-fst.c
+++ b/src/crypto/rijndael/rijndael-api-fst.c
@@ -56,7 +56,7 @@ __db_makeKey(key, direction, keyLen, keyMaterial)
{
u8 cipherKey[MAXKB];
- if (key == NULL) {
+ if (key == NULL || keyMaterial == NULL) {
return BAD_KEY_INSTANCE;
}
@@ -72,9 +72,7 @@ __db_makeKey(key, direction, keyLen, keyMaterial)
return BAD_KEY_MAT;
}
- if (keyMaterial != NULL) {
- memcpy(cipherKey, keyMaterial, key->keyLen/8);
- }
+ memcpy(cipherKey, keyMaterial, key->keyLen/8);
if (direction == DIR_ENCRYPT) {
key->Nr = __db_rijndaelKeySetupEnc(key->rk, cipherKey, keyLen);
diff --git a/src/db/crdel.src b/src/db/crdel.src
index 70473899..a1cbc0ed 100644
--- a/src/db/crdel.src
+++ b/src/db/crdel.src
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/db/crdel_rec.c b/src/db/crdel_rec.c
index 08e7bae8..2c529627 100644
--- a/src/db/crdel_rec.c
+++ b/src/db/crdel_rec.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -81,7 +81,7 @@ __crdel_metasub_recover(env, dbtp, lsnp, op, info)
/*
* If this was an in-memory database and we are re-creating
* and this is the meta-data page, then we need to set up a
- * bunch of fields in the dbo as well.
+ * bunch of fields in the dbp as well.
*/
if (F_ISSET(file_dbp, DB_AM_INMEM) &&
argp->pgno == PGNO_BASE_MD &&
diff --git a/src/db/db.c b/src/db/db.c
index 0d9d1e6e..ffeb6d2b 100644
--- a/src/db/db.c
+++ b/src/db/db.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright (c) 1990, 1993, 1994, 1995, 1996
@@ -41,6 +41,7 @@
#include "db_config.h"
#include "db_int.h"
+#include "dbinc_auto/sequence_ext.h"
#include "dbinc/db_page.h"
#include "dbinc/db_swap.h"
#include "dbinc/btree.h"
@@ -92,6 +93,9 @@ __db_master_open(subdbp, ip, txn, name, flags, mode, dbpp)
if ((ret = __db_create_internal(&dbp, subdbp->env, 0)) != 0)
return (ret);
+ /* Set the creation directory. */
+ dbp->dirname = subdbp->dirname;
+
/*
* It's always a btree.
* Run in the transaction we've created.
@@ -105,6 +109,20 @@ __db_master_open(subdbp, ip, txn, name, flags, mode, dbpp)
DB_AM_ENCRYPT | DB_AM_CHKSUM | DB_AM_NOT_DURABLE));
/*
+ * If creating the master database, disable blobs, but assign it a
+ * blob file id if blobs are enabled in the subdatabase. This means
+ * that subdatabases can only support blobs if the first subdatabase
+ * supports blobs. This is a temporary restriction, but is needed at
+ * the moment to prevent an infinite loop.
+ */
+ dbp->blob_threshold = 0;
+ if (LF_ISSET(DB_CREATE) && subdbp->blob_threshold != 0) {
+ if ((ret = __blob_generate_dir_ids(
+ dbp, txn, &dbp->blob_file_id)) != 0)
+ return (ret);
+ }
+
+ /*
* If there was a subdb specified, then we only want to apply
* DB_EXCL to the subdb, not the actual file. We only got here
* because there was a subdb specified.
@@ -819,6 +837,21 @@ __db_refresh(dbp, txn, flags, deferred_closep, reuse)
if (dbp->mpf == NULL)
LF_SET(DB_NOSYNC);
+#ifdef HAVE_64BIT_TYPES
+ /* Close the blob meta data databases. */
+ if (dbp->blob_seq != NULL) {
+ if ((t_ret = __seq_close(dbp->blob_seq, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ dbp->blob_seq = NULL;
+ }
+ if (dbp->blob_meta_db != NULL) {
+ if ((t_ret = __db_close(
+ dbp->blob_meta_db, NULL, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ dbp->blob_meta_db = NULL;
+ }
+#endif
+
/* If never opened, or not currently open, it's easy. */
if (!F_ISSET(dbp, DB_AM_OPEN_CALLED))
goto never_opened;
@@ -1164,6 +1197,10 @@ never_opened:
__os_free(dbp->env, dbp->dname);
dbp->dname = NULL;
}
+ if (dbp->blob_sub_dir != NULL) {
+ __os_free(dbp->env, dbp->blob_sub_dir);
+ dbp->blob_sub_dir = NULL;
+ }
/* Discard any memory used to store returned data. */
if (dbp->my_rskey.data != NULL)
@@ -1235,8 +1272,11 @@ __db_disassociate(sdbp)
sdbp->s_refcnt = 0;
while ((dbc = TAILQ_FIRST(&sdbp->free_queue)) != NULL)
- if ((t_ret = __dbc_destroy(dbc)) != 0 && ret == 0)
- ret = t_ret;
+ if ((t_ret = __dbc_destroy(dbc)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ break;
+ }
F_CLR(sdbp, DB_AM_SECONDARY);
return (ret);
diff --git a/src/db/db.src b/src/db/db.src
index 879c7856..4a90ac16 100644
--- a/src/db/db.src
+++ b/src/db/db.src
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/db/db_am.c b/src/db/db_am.c
index 1cf3a505..84bb04bb 100644
--- a/src/db/db_am.c
+++ b/src/db/db_am.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1998, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -205,6 +205,7 @@ __db_cursor_int(dbp, ip, txn, dbtype, root, flags, locker, dbcp)
/* Refresh the DBC structure. */
dbc->dbtype = dbtype;
RESET_RET_MEM(dbc);
+ dbc->db_stream = __dbc_db_stream;
dbc->set_priority = __dbc_set_priority;
dbc->get_priority = __dbc_get_priority;
dbc->priority = dbp->priority;
@@ -314,11 +315,11 @@ __db_cursor_int(dbp, ip, txn, dbtype, root, flags, locker, dbcp)
if (F2_ISSET(dbp, DB2_AM_EXCL)) {
F_SET(dbc, DBC_DONTLOCK);
if (IS_REAL_TXN(txn)&& !LF_ISSET(DBC_OPD | DBC_DUPLICATE)) {
- /*
- * Exclusive databases can only have one active
- * transaction at a time since there are no internal
+ /*
+ * Exclusive databases can only have one active
+ * transaction at a time since there are no internal
* locks to prevent one transaction from reading and
- * writing another's uncommitted changes.
+ * writing another's uncommitted changes.
*/
if (dbp->cur_txn != NULL && dbp->cur_txn != txn) {
__db_errx(env, DB_STR("0749",
@@ -332,7 +333,7 @@ __db_cursor_int(dbp, ip, txn, dbtype, root, flags, locker, dbcp)
memset(&req, 0, sizeof(req));
req.lock = dbp->handle_lock;
req.op = DB_LOCK_TRADE;
- if ((ret = __lock_vec(env, txn->locker, 0,
+ if ((ret = __lock_vec(env, txn->locker, 0,
&req, 1, 0)) != 0)
goto err;
dbp->cur_txn = txn;
@@ -397,10 +398,11 @@ __db_cursor_int(dbp, ip, txn, dbtype, root, flags, locker, dbcp)
if (ip != NULL) {
dbc->thread_info = ip;
#ifdef DIAGNOSTIC
- if (dbc->locker != NULL)
+ if (dbc->locker != NULL) {
+ dbc->locker->prev_locker = ip->dbth_locker;
ip->dbth_locker =
R_OFFSET(&(env->lk_handle->reginfo), dbc->locker);
- else
+ } else
ip->dbth_locker = INVALID_ROFF;
#endif
} else if (txn != NULL)
diff --git a/src/db/db_backup.c b/src/db/db_backup.c
index 66d7382a..1c72e4d7 100644
--- a/src/db/db_backup.c
+++ b/src/db/db_backup.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2011, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -24,8 +24,9 @@ static int backup_read_data_dir
__P((DB_ENV *, DB_THREAD_INFO *, const char *, const char *, u_int32_t));
static int backup_dir_clean
__P((DB_ENV *, const char *, const char *, int *, u_int32_t));
-static int backup_data_copy
- __P((DB_ENV *, const char *, const char *, const char *, int));
+static int backup_lgconf_chk __P((DB_ENV *));
+static int __db_backup
+ __P((DB_ENV *, const char *, DB_THREAD_INFO *, int, u_int32_t));
/*
* __db_dbbackup_pp --
@@ -47,9 +48,9 @@ __db_dbbackup_pp(dbenv, dbfile, target, flags)
"DB_ENV->dbbackup", flags, DB_EXCL)) != 0)
return (ret);
ENV_ENTER(dbenv->env, ip);
-
- ret = __db_dbbackup(dbenv, ip, dbfile, target, flags);
-
+ REPLICATION_WRAP(dbenv->env,
+ (__db_dbbackup(
+ dbenv, ip, dbfile, target, flags, 0, NULL)), 0, ret);
ENV_LEAVE(dbenv->env, ip);
return (ret);
}
@@ -58,15 +59,17 @@ __db_dbbackup_pp(dbenv, dbfile, target, flags)
* __db_dbbackup --
* Copy a database file coordinated with mpool.
*
- * PUBLIC: int __db_dbbackup __P((DB_ENV *, DB_THREAD_INFO *,
- * PUBLIC: const char *, const char *, u_int32_t));
+ * PUBLIC: int __db_dbbackup __P((DB_ENV *, DB_THREAD_INFO *, const char *,
+ * PUBLIC: const char *, u_int32_t, u_int32_t, const char *));
*/
int
-__db_dbbackup(dbenv, ip, dbfile, target, flags)
+__db_dbbackup(dbenv, ip, dbfile, target, flags, oflags, full_path)
DB_ENV *dbenv;
DB_THREAD_INFO *ip;
const char *dbfile, *target;
u_int32_t flags;
+ u_int32_t oflags;
+ const char *full_path;
{
DB *dbp;
DB_FH *fp;
@@ -77,8 +80,8 @@ __db_dbbackup(dbenv, ip, dbfile, target, flags)
retry_count = 0;
retry: if ((ret = __db_create_internal(&dbp, dbenv->env, 0)) == 0 &&
- (ret = __db_open(dbp, ip, NULL, dbfile, NULL,
- DB_UNKNOWN, DB_AUTO_COMMIT | DB_RDONLY, 0, PGNO_BASE_MD)) != 0) {
+ (ret = __db_open(dbp, ip, NULL, dbfile, NULL, DB_UNKNOWN,
+ DB_AUTO_COMMIT | DB_RDONLY | oflags, 0, PGNO_BASE_MD)) != 0) {
if (ret == DB_LOCK_DEADLOCK || ret == DB_LOCK_NOTGRANTED) {
(void)__db_close(dbp, NULL, DB_NOSYNC);
dbp = NULL;
@@ -91,9 +94,16 @@ retry: if ((ret = __db_create_internal(&dbp, dbenv->env, 0)) == 0 &&
}
}
+ /* Hot backup requires DB_LOG_BLOB. */
+ if (ret == 0 && dbp->blob_threshold != 0 &&
+ (ret = backup_lgconf_chk(dbenv)) != 0)
+ goto err;
+
+ if (full_path == NULL)
+ full_path = dbfile;
if (ret == 0) {
if ((ret = __memp_backup_open(dbenv->env,
- dbp->mpf, dbfile, target, flags, &fp, &handle)) == 0) {
+ dbp->mpf, full_path, target, flags, &fp, &handle)) == 0) {
if (dbp->type == DB_HEAP)
ret = __heap_backup(
dbenv, dbp, ip, fp, handle, flags);
@@ -104,10 +114,21 @@ retry: if ((ret = __db_create_internal(&dbp, dbenv->env, 0)) == 0 &&
fp, handle, flags);
}
if ((t_ret = __memp_backup_close(dbenv->env,
- dbp->mpf, dbfile, fp, handle)) != 0 && ret == 0)
+ dbp->mpf, full_path, fp, handle)) != 0 && ret == 0)
ret = t_ret;
}
+ /*
+ * Copy blob files. Since no locking is done here, it is possible
+ * that a blob file may be copied in the middle of being written.
+ * This is not a problem since hotbackup requires DB_LOG_BLOB and
+ * catastrophic recovery, which will fix any inconsistencies in the
+ * blob files.
+ */
+ if (ret == 0 && dbp->blob_threshold != 0 &&
+ (t_ret = __blob_copy_all(dbp, target, flags)) != 0)
+ ret= t_ret;
+
#ifdef HAVE_QUEUE
/*
* For compatibility with the 5.2 and patch versions of db_copy
@@ -117,7 +138,7 @@ retry: if ((ret = __db_create_internal(&dbp, dbenv->env, 0)) == 0 &&
ret = __qam_backup_extents(dbp, ip, target, flags);
#endif
- if (dbp != NULL &&
+err: if (dbp != NULL &&
(t_ret = __db_close(dbp, NULL, DB_NOSYNC)) != 0 && ret == 0)
ret = t_ret;
@@ -205,8 +226,11 @@ backup_dir_clean(dbenv, backup_dir, log_dir, remove_maxp, flags)
/*
* backup_data_copy --
* Copy a non-database file into the backup directory.
+ *
+ * PUBLIC: int backup_data_copy __P((
+ * PUBLIC: DB_ENV *, const char *, const char *, const char *, int));
*/
-static int
+int
backup_data_copy(dbenv, file, from_dir, to_dir, log)
DB_ENV *dbenv;
const char *file, *from_dir, *to_dir;
@@ -352,13 +376,16 @@ backup_read_data_dir(dbenv, ip, dir, backup_dir, flags)
ENV *env;
FILE *savefile;
int fcnt, ret;
- size_t cnt;
+ size_t cnt, len;
const char *bd;
char **names, buf[DB_MAXPATHLEN], bbuf[DB_MAXPATHLEN];
+ char fullpath[DB_MAXPATHLEN];
void (*savecall) (const DB_ENV *, const char *, const char *);
env = dbenv->env;
memset(bbuf, 0, sizeof(bbuf));
+ memset(fullpath, 0, sizeof(fullpath));
+ len = 0;
bd = backup_dir;
if (!LF_ISSET(DB_BACKUP_SINGLE_DIR) && dir != env->db_home) {
@@ -401,6 +428,12 @@ backup_read_data_dir(dbenv, ip, dir, backup_dir, flags)
"%s: path too long", "%s"), buf);
return (EINVAL);
}
+ /* Save the original dir. */
+ if (!LF_ISSET(DB_BACKUP_SINGLE_DIR)) {
+ (void)snprintf(fullpath, sizeof(fullpath),
+ "%s%c%c", dir, PATH_SEPARATOR[0], '\0');
+ len = strlen(fullpath);
+ }
dir = buf;
}
/* Get a list of file names. */
@@ -449,7 +482,16 @@ backup_read_data_dir(dbenv, ip, dir, backup_dir, flags)
savefile = dbenv->db_errfile;
dbenv->db_errfile = NULL;
- ret = __db_dbbackup(dbenv, ip, names[cnt], bd, flags);
+ /*
+ * If it is not backing up to a single directory, prefix
+ * the file with 'dir' so that the file and directory structure
+ * in the source and backup location will be the same.
+ */
+ if (len != 0)
+ (void)snprintf(fullpath + len,
+ sizeof(fullpath) - len, "%s%c", names[cnt], '\0');
+ ret = __db_dbbackup(dbenv, ip, names[cnt],
+ backup_dir, flags, 0, len != 0 ? fullpath : NULL);
dbenv->db_errcall = savecall;
dbenv->db_errfile = savefile;
@@ -662,21 +704,22 @@ err: if (logd != dbenv->db_log_dir && logd != env->db_home)
* __db_backup --
* Backup databases in the enviornment.
*
- * PUBLIC: int __db_backup __P((DB_ENV *, const char *, u_int32_t));
+ * PUBLIC: int __db_backup_pp __P((DB_ENV *, const char *, u_int32_t));
*/
int
-__db_backup(dbenv, target, flags)
+__db_backup_pp(dbenv, target, flags)
DB_ENV *dbenv;
const char *target;
u_int32_t flags;
{
DB_THREAD_INFO *ip;
ENV *env;
- int copy_min, remove_max, ret;
- char **dir;
+ u_int32_t bytes;
+ int remove_max, ret;
env = dbenv->env;
- remove_max = copy_min = 0;
+ bytes = 0;
+ remove_max = 0;
#undef OKFLAGS
#define OKFLAGS \
@@ -692,6 +735,11 @@ __db_backup(dbenv, target, flags)
return (EINVAL);
}
+ /* Hot backup requires DB_LOG_BLOB. */
+ if ((ret = __env_get_blob_threshold_int(env, &bytes)) != 0 ||
+ (bytes != 0 && (ret = backup_lgconf_chk(dbenv)) != 0))
+ return (ret);
+
/*
* If the target directory for the backup does not exist, create it
* with mode read-write-execute for the owner. Ignore errors here,
@@ -714,6 +762,30 @@ __db_backup(dbenv, target, flags)
}
ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env,
+ (__db_backup(dbenv, target, ip, remove_max, flags)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __db_backup --
+ * Backup databases in the environment.
+ */
+static int
+__db_backup(dbenv, target, ip, remove_max, flags)
+ DB_ENV *dbenv;
+ const char *target;
+ DB_THREAD_INFO *ip;
+ int remove_max;
+ u_int32_t flags;
+{
+ ENV *env;
+ int copy_min, ret;
+ char **dir;
+
+ env = dbenv->env;
+ copy_min = 0;
/*
* If the UPDATE option was not specified, copy all database
@@ -724,6 +796,19 @@ __db_backup(dbenv, target, flags)
goto end;
F_SET(dbenv, DB_ENV_HOTBACKUP);
if (!LF_ISSET(DB_BACKUP_UPDATE)) {
+ /*
+ * Don't allow absolute path of blob directory when
+ * it is not backing up to a single directory.
+ */
+ if (!LF_ISSET(DB_BACKUP_SINGLE_DIR) &&
+ dbenv->db_blob_dir != NULL &&
+ __os_abspath(dbenv->db_blob_dir)) {
+ __db_errx(env, DB_STR_A("0780",
+"blob directory '%s' is absolute path, not permitted unless backup is to a single directory",
+ "%s"), dbenv->db_blob_dir);
+ ret = EINVAL;
+ goto err;
+ }
if ((ret = backup_read_data_dir(dbenv,
ip, env->db_home, target, flags)) != 0)
goto err;
@@ -734,8 +819,8 @@ __db_backup(dbenv, target, flags)
* enviroment -- running recovery with them would
* corrupt the source files.
*/
- if (!LF_ISSET(DB_BACKUP_SINGLE_DIR)
- && __os_abspath(*dir)) {
+ if (!LF_ISSET(DB_BACKUP_SINGLE_DIR) &&
+ __os_abspath(*dir)) {
__db_errx(env, DB_STR_A("0725",
"data directory '%s' is absolute path, not permitted unless backup is to a single directory",
"%s"), *dir);
@@ -751,7 +836,17 @@ __db_backup(dbenv, target, flags)
/*
* Copy all log files found in the log directory.
* The log directory defaults to the home directory.
+ * Don't allow absolute path of log directory when
+ * it is not backing up to a single directory.
*/
+ if (!LF_ISSET(DB_BACKUP_SINGLE_DIR) &&
+ dbenv->db_log_dir != NULL && __os_abspath(dbenv->db_log_dir)) {
+ __db_errx(env, DB_STR_A("0781",
+"log directory '%s' is absolute path, not permitted unless backup is to a single directory",
+ "%s"), dbenv->db_log_dir);
+ ret = EINVAL;
+ goto err;
+ }
if ((ret = backup_read_log_dir(dbenv, target, &copy_min, flags)) != 0)
goto err;
/*
@@ -761,7 +856,7 @@ __db_backup(dbenv, target, flags)
* cleanup.
*/
if (LF_ISSET(DB_BACKUP_UPDATE) && remove_max < copy_min &&
- !(remove_max == 0 && copy_min == 1)) {
+ remove_max != 0 && copy_min != 1) {
__db_errx(env, DB_STR_A("0743",
"the largest log file removed (%d) must be greater than or equal the smallest log file copied (%d)",
"%d %d"), remove_max, copy_min);
@@ -770,6 +865,28 @@ __db_backup(dbenv, target, flags)
err: F_CLR(dbenv, DB_ENV_HOTBACKUP);
(void)__env_set_backup(env, 0);
-end: ENV_LEAVE(env, ip);
+end: return (ret);
+}
+
+/*
+ * backup_lgconf_chk --
+ * With logging on, require DB_LOG_BLOB for a backup; else return an error.
+ */
+static int
+backup_lgconf_chk(dbenv)
+ DB_ENV *dbenv;
+{
+ int lgconf, ret;
+
+ ret = 0;
+
+ if (LOGGING_ON(dbenv->env) && ((ret = __log_get_config(dbenv,
+ DB_LOG_BLOB, &lgconf)) != 0 || lgconf == 0)) {
+ __db_errx(dbenv->env, DB_STR("0782",
+ "Hot backup requires DB_LOG_BLOB"));
+ if (ret == 0)
+ ret = EINVAL;
+ }
+
+ return (ret);
+}
diff --git a/src/db/db_cam.c b/src/db/db_cam.c
index 6ee8b579..1a330bdb 100644
--- a/src/db/db_cam.c
+++ b/src/db/db_cam.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2000, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2000, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -11,6 +11,7 @@
#include "db_int.h"
#include "dbinc/db_page.h"
#include "dbinc/btree.h"
+#include "dbinc/fop.h"
#include "dbinc/hash.h"
#include "dbinc/heap.h"
#include "dbinc/lock.h"
@@ -83,6 +84,9 @@ __dbc_close(dbc)
DB *dbp;
DBC *opd;
DBC_INTERNAL *cp;
+#ifdef DIAGNOSTIC
+ DB_THREAD_INFO *ip;
+#endif
DB_TXN *txn;
ENV *env;
int ret, t_ret;
@@ -149,6 +153,14 @@ __dbc_close(dbc)
ret = t_ret;
F_CLR(dbc, DBC_FAMILY);
}
+#ifdef DIAGNOSTIC
+ if (dbc->locker != NULL) {
+ ENV_GET_THREAD_INFO(env, ip);
+ if (ip != NULL)
+ ip->dbth_locker = dbc->locker->prev_locker;
+ dbc->locker->prev_locker = INVALID_ROFF;
+ }
+#endif
if ((txn = dbc->txn) != NULL)
txn->cursors--;
@@ -510,6 +522,305 @@ __dbc_idel(dbc, flags)
return (ret);
}
+/*
+ * __dbc_db_stream --
+ * DBC->db_stream: check the DB_STREAM_* flags, set the matching DB_FOP_*
+ * flags and call __db_stream_init(); a read-only db forces DB_STREAM_READ.
+ *
+ * PUBLIC: int __dbc_db_stream __P((DBC *, DB_STREAM **, u_int32_t));
+ */
+int
+__dbc_db_stream(dbc, dbsp, flags)
+ DBC *dbc;
+ DB_STREAM **dbsp;
+ u_int32_t flags;
+{
+ ENV *env;
+ int ret;
+ u_int32_t oflags;
+
+ env = dbc->env;
+ oflags = flags;
+
+ if ((ret = __db_fchk(
+ env, "DBC->db_stream", flags,
+ DB_STREAM_READ | DB_STREAM_WRITE | DB_STREAM_SYNC_WRITE)) != 0)
+ return (ret);
+
+ if (DB_IS_READONLY(dbc->dbp)) {
+ LF_SET(DB_STREAM_READ);
+ oflags |= DB_STREAM_READ;
+ }
+ if (LF_ISSET(DB_STREAM_READ) && LF_ISSET(DB_STREAM_WRITE)) {
+ ret = EINVAL;
+ __db_errx(env, DB_STR("0750",
+ "Error, cannot set both DB_STREAM_WRITE and DB_STREAM_READ."));
+ goto err;
+ }
+
+ if (oflags & DB_STREAM_READ)
+ LF_SET(DB_FOP_READONLY);
+ else
+ LF_SET(DB_FOP_WRITE);
+ if (oflags & DB_STREAM_SYNC_WRITE)
+ LF_SET(DB_FOP_SYNC_WRITE);
+
+ ret = __db_stream_init(dbc, dbsp, flags);
+
+err: return (ret);
+}
+
+/*
+ * __dbc_get_blob_id --
+ * Btree, hash and heap cursors only; *blob_id is set only on success.
+ * Returns the blob id stored in the data record to which the cursor currently
+ * points. Returns EINVAL if the cursor does not point to a blob record.
+ *
+ * PUBLIC: int __dbc_get_blob_id __P((DBC *, db_seq_t *));
+ */
+int
+__dbc_get_blob_id(dbc, blob_id)
+ DBC *dbc;
+ db_seq_t *blob_id;
+{
+ DBT key, data;
+ BBLOB bl;
+ HBLOB hbl;
+ HEAPBLOBHDR bhdr;
+ int ret;
+
+ if (dbc->dbtype != DB_BTREE &&
+ dbc->dbtype != DB_HEAP && dbc->dbtype != DB_HASH) {
+ return (EINVAL);
+ }
+
+ ret = 0;
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+ /* Get the blob database record instead of the blob. */
+ data.flags |= DB_DBT_BLOB_REC;
+
+ /*
+ * It would be great if there was a more efficient way to do this, but
+ * the complexities of getting a page from a database, especially
+ * when taking into account things like partitions and compression,
+ * make that more trouble than it is worth.
+ */
+ if ((ret = __dbc_get(dbc, &key, &data, DB_CURRENT)) != 0)
+ goto err;
+
+ switch (dbc->dbtype) {
+ case DB_BTREE:
+ if (data.size != BBLOB_SIZE) {
+ ret = EINVAL;
+ goto err;
+ }
+ memcpy(&bl, data.data, BBLOB_SIZE);
+ if (B_TYPE(bl.type) != B_BLOB) {
+ ret = EINVAL;
+ goto err;
+ }
+ *blob_id = (db_seq_t)bl.id;
+ break;
+ case DB_HEAP:
+ if (data.size != HEAPBLOBREC_SIZE) {
+ ret = EINVAL;
+ goto err;
+ }
+ memcpy(&bhdr, data.data, HEAPBLOBREC_SIZE);
+ if (!F_ISSET(&bhdr.std_hdr, HEAP_RECBLOB)) {
+ ret = EINVAL;
+ goto err;
+ }
+ *blob_id = (db_seq_t)bhdr.id;
+ break;
+ case DB_HASH:
+ if (data.size != HBLOB_SIZE) {
+ ret = EINVAL;
+ goto err;
+ }
+ memcpy(&hbl, data.data, HBLOB_SIZE);
+ if (HPAGE_PTYPE(&hbl) != H_BLOB) {
+ ret = EINVAL;
+ goto err;
+ }
+ *blob_id = (db_seq_t)hbl.id;
+ break;
+ default:
+ ret = EINVAL;
+ goto err;
+ }
+
+err: return (ret);
+}
+
+/*
+ * __dbc_get_blob_size --
+ * Btree, hash and heap cursors only; the size is read via GET_BLOB_SIZE.
+ * Returns the blob file size stored in the data record to which the cursor
+ * currently points. Returns EINVAL if the cursor does not point to a blob
+ * record.
+ *
+ * PUBLIC: int __dbc_get_blob_size __P((DBC *, off_t *));
+ */
+int
+__dbc_get_blob_size(dbc, size)
+ DBC *dbc;
+ off_t *size;
+{
+ DBT key, data;
+ ENV *env;
+ BBLOB bl;
+ HBLOB hbl;
+ HEAPBLOBHDR bhdr;
+ int ret;
+
+ if (dbc->dbtype != DB_BTREE &&
+ dbc->dbtype != DB_HEAP && dbc->dbtype != DB_HASH) {
+ return (EINVAL);
+ }
+
+ env = dbc->env;
+ ret = 0;
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+ /* Get the blob database record instead of the blob. */
+ data.flags |= DB_DBT_BLOB_REC;
+
+ /*
+ * It would be great if there was a more efficient way to do this, but
+ * the complexities of getting a page from a database, especially
+ * when taking into account things like partitions and compression,
+ * make that more trouble than it is worth.
+ */
+ if ((ret = __dbc_get(dbc, &key, &data, DB_CURRENT)) != 0)
+ goto err;
+
+ switch (dbc->dbtype) {
+ case DB_BTREE:
+ if (data.size != BBLOB_SIZE) {
+ ret = EINVAL;
+ goto err;
+ }
+ memcpy(&bl, data.data, BBLOB_SIZE);
+ if (B_TYPE(bl.type) != B_BLOB) {
+ ret = EINVAL;
+ goto err;
+ }
+ GET_BLOB_SIZE(env, bl, *size, ret);
+ break;
+ case DB_HEAP:
+ if (data.size != HEAPBLOBREC_SIZE) {
+ ret = EINVAL;
+ goto err;
+ }
+ memcpy(&bhdr, data.data, HEAPBLOBREC_SIZE);
+ if (!F_ISSET(&bhdr.std_hdr, HEAP_RECBLOB)) {
+ ret = EINVAL;
+ goto err;
+ }
+ GET_BLOB_SIZE(env, bhdr, *size, ret);
+ break;
+ case DB_HASH:
+ if (data.size != HBLOB_SIZE) {
+ ret = EINVAL;
+ goto err;
+ }
+ memcpy(&hbl, data.data, HBLOB_SIZE);
+ if (HPAGE_PTYPE(&hbl) != H_BLOB) {
+ ret = EINVAL;
+ goto err;
+ }
+ GET_BLOB_SIZE(env, hbl, *size, ret);
+ break;
+ default:
+ ret = EINVAL;
+ goto err;
+ }
+
+err: return (ret);
+}
+
+/*
+ * __dbc_set_blob_size --
+ *	Btree, hash and heap cursors only; other cursor types get EINVAL.
+ * Sets the blob file size in the data record to which the cursor
+ * currently points. Returns EINVAL if the cursor does not point to a blob
+ * record.
+ *
+ * PUBLIC: int __dbc_set_blob_size __P((DBC *, off_t));
+ */
+int
+__dbc_set_blob_size(dbc, size)
+	DBC *dbc;
+	off_t size;
+{
+	DBT key, data;
+	BBLOB *bl;
+	HBLOB *hbl;
+	HEAPBLOBHDR *bhdr;
+	int ret;
+
+	if (dbc->dbtype != DB_BTREE &&
+	    dbc->dbtype != DB_HEAP && dbc->dbtype != DB_HASH) {
+		return (EINVAL);
+	}
+
+	ret = 0;
+	memset(&key, 0, sizeof(DBT));
+	memset(&data, 0, sizeof(DBT));
+	/* Get the blob database record instead of the blob. */
+	data.flags |= DB_DBT_BLOB_REC;
+
+	/*
+	 * It would be great if there was a more efficient way to do this, but
+	 * getting a page directly is complicated by things like partitions
+	 * and compression. The returned record is updated through a pointer
+	 * below, so its length is checked before any field in it is read.
+	 */
+	if ((ret = __dbc_get(dbc, &key, &data, DB_CURRENT)) != 0)
+		goto err;
+
+	switch (dbc->dbtype) {
+	case DB_BTREE:
+		bl = (BBLOB *)data.data;
+		if (bl == NULL ||
+		    data.size != BBLOB_SIZE || B_TYPE(bl->type) != B_BLOB) {
+			ret = EINVAL;
+			goto err;
+		}
+		SET_BLOB_SIZE(bl, size, BBLOB);
+		break;
+	case DB_HEAP:
+		bhdr = (HEAPBLOBHDR *)data.data;
+		if (bhdr == NULL ||
+		    data.size != HEAPBLOBREC_SIZE ||
+		    !F_ISSET(&bhdr->std_hdr, HEAP_RECBLOB)) {
+			ret = EINVAL;
+			goto err;
+		}
+		SET_BLOB_SIZE(bhdr, size, HEAPBLOBHDR);
+		break;
+	case DB_HASH:
+		hbl = data.data;
+		if (hbl == NULL ||
+		    data.size != HBLOB_SIZE || HPAGE_PTYPE(hbl) != H_BLOB) {
+			ret = EINVAL;
+			goto err;
+		}
+		SET_BLOB_SIZE((HBLOB *)hbl, size, HBLOB);
+		break;
+	default:
+		ret = EINVAL;
+		goto err;
+	}
+
+	if ((ret = __dbc_put(dbc, &key, &data, DB_CURRENT)) != 0)
+		goto err;
+
+err:	return (ret);
+}
+
#ifdef HAVE_COMPRESSION
/*
* __dbc_bulk_del --
@@ -632,6 +943,12 @@ __dbc_idup(dbc_orig, dbcp, flags)
int_n->stream_off = int_orig->stream_off;
int_n->stream_curr_pgno = int_orig->stream_curr_pgno;
+#ifdef HAVE_PARTITION
+ if (DB_IS_PARTITIONED(dbp)) {
+ if ((ret = __partc_dup(dbc_orig, dbc_n)) != 0)
+ goto err;
+ } else
+#endif
switch (dbc_orig->dbtype) {
case DB_QUEUE:
if ((ret = __qamc_dup(dbc_orig, dbc_n)) != 0)
@@ -859,7 +1176,11 @@ __dbc_iget(dbc, key, data, flags)
* we acquire a write lock in the primary tree and no locks in the
* off-page dup tree. If the DB_RMW flag was specified and the get
* operation is done in an off-page duplicate tree, call the primary
- * cursor's upgrade routine first.
+ * cursor's upgrade routine first. We fetch the primary tree's data
+ * page to follow the buffer latching order rules for btrees: latch from
+ * the top of the main tree down, even when also searching OPD trees.
+ * Deadlocks could otherwise occur if we need to fetch the main page
+ * while an OPD page is latched. [#22532]
*/
cp = dbc->internal;
if (cp->opd != NULL &&
@@ -868,6 +1189,10 @@ __dbc_iget(dbc, key, data, flags)
flags == DB_PREV || flags == DB_PREV_DUP)) {
if (tmp_rmw && (ret = dbc->am_writelock(dbc)) != 0)
goto err;
+ if (cp->page == NULL && (ret = __memp_fget(mpf, &cp->pgno,
+ dbc->thread_info, dbc->txn, 0, &cp->page)) != 0)
+ goto err;
+
if (F_ISSET(dbc, DBC_TRANSIENT))
opd = cp->opd;
else if ((ret = __dbc_idup(cp->opd, &opd, DB_POSITION)) != 0)
@@ -1660,7 +1985,7 @@ __dbc_put_secondaries(dbc,
tskeyp, &oldpkey, rmw | DB_SET);
if (ret == 0) {
cmp = __bam_defcmp(sdbp,
- &oldpkey, pkey);
+ &oldpkey, pkey, NULL);
__os_ufree(env, oldpkey.data);
/*
* If the secondary key is unchanged,
@@ -1868,7 +2193,7 @@ __dbc_put_primary(dbc, key, data, flags)
olddata.flags = DB_DBT_PARTIAL | DB_DBT_USERMEM;
ret = __dbc_get(dbc, key, &olddata, DB_SET);
if (ret == 0) {
- ret = DB_KEYEXIST;
+ ret = DBC_ERR(dbc, DB_KEYEXIST);
goto done;
} else if (ret != DB_NOTFOUND && ret != DB_KEYEMPTY)
goto err;
@@ -2100,7 +2425,7 @@ __dbc_iput(dbc, key, data, flags)
if (dbc->dbtype == DB_HASH && F_ISSET(
((BTREE_CURSOR *)(dbc->internal->opd->internal)),
C_DELETED)) {
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
}
@@ -2228,7 +2553,7 @@ __dbc_del_oldskey(sdbp, dbc, skey, pkey, olddata)
*/
for (i = 0, tskeyp = skey; i < nskey; i++, tskeyp++)
if (((BTREE *)sdbp->bt_internal)->bt_compare(sdbp,
- toldskeyp, tskeyp) == 0) {
+ toldskeyp, tskeyp, NULL) == 0) {
nsame++;
F_CLR(tskeyp, DB_DBT_ISSET);
break;
@@ -2382,12 +2707,14 @@ __dbc_cleanup(dbc, dbc_n, failed)
* cursors.
*/
if (!failed && ret == 0) {
+ MUTEX_LOCK(dbp->env, dbp->mutex);
if (opd != NULL)
opd->internal->pdbc = dbc;
if (internal->opd != NULL)
internal->opd->internal->pdbc = dbc_n;
dbc->internal = dbc_n->internal;
dbc_n->internal = internal;
+ MUTEX_UNLOCK(dbp->env, dbp->mutex);
}
/*
@@ -3501,6 +3828,32 @@ __db_check_skeyset(sdbp, skeyp)
for (key2 = key1 + 1; key2 < last_key; key2++)
DB_ASSERT(env,
((BTREE *)sdbp->bt_internal)->bt_compare(sdbp,
- key1, key2) != 0);
+ key1, key2, NULL) != 0);
+}
+#endif
+
+#ifdef HAVE_ERROR_HISTORY
+/*
+ * __dbc_diags --
+ * When err != 0, remember the error context and append the cursor's
+ * "file:database" names to the deferred message buffer (if one is
+ * available). Does nothing when err == 0. Always returns err unchanged.
+ * PUBLIC: int __dbc_diags __P((DBC *, int));
+ */
+ int
+ __dbc_diags(dbc, err)
+ DBC *dbc;
+ int err;
+{
+ DB_MSGBUF *mb;
+
+ if (err != 0 && dbc->env != NULL &&
+ (mb = __db_deferred_get()) != NULL) {
+ (void)__db_remember_context(dbc->env, mb, err);
+ __db_msgadd(dbc->env, mb, "DB: %s:%s\n",
+ dbc->dbp->fname == NULL ? "in-mem" : dbc->dbp->fname,
+ dbc->dbp->dname == NULL ? "" : dbc->dbp->dname);
+ }
+ return (err);
}
#endif
diff --git a/src/db/db_cds.c b/src/db/db_cds.c
index 185d5487..d3cc990a 100644
--- a/src/db/db_cds.c
+++ b/src/db/db_cds.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2000, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2000, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -43,7 +43,15 @@ static int
__cdsgroup_abort(txn)
DB_TXN *txn;
{
- return (__cdsgroup_notsup(txn->mgrp->env, "abort"));
+ ENV *env;
+
+ env = txn->mgrp->env;
+ /*
+ * As the txn handle can not be used any more, we call
+ * __cdsgroup_commit to release the lock and destroy the handle.
+ */
+ (void)__cdsgroup_commit(txn, 0);
+ return (__cdsgroup_notsup(env, "abort"));
}
static int
@@ -83,8 +91,16 @@ static int __cdsgroup_discard(txn, flags)
DB_TXN *txn;
u_int32_t flags;
{
+ ENV *env;
+
COMPQUIET(flags, 0);
- return (__cdsgroup_notsup(txn->mgrp->env, "discard"));
+ env = txn->mgrp->env;
+ /*
+ * As the txn handle can not be used any more, we call
+ * __cdsgroup_commit to release the lock and destroy the handle.
+ */
+ (void)__cdsgroup_commit(txn, 0);
+ return (__cdsgroup_notsup(env, "discard"));
}
static u_int32_t __cdsgroup_id(txn)
diff --git a/src/db/db_compact.c b/src/db/db_compact.c
index d0f4801e..afe5a997 100644
--- a/src/db/db_compact.c
+++ b/src/db/db_compact.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -262,9 +262,11 @@ err: if (txn_local && txn != NULL) {
done: if (LF_ISSET(DB_FREE_SPACE)) {
DBMETA *meta;
db_pgno_t pgno;
+ int pgs_done;
pgno = PGNO_BASE_MD;
isdone = 1;
+ pgs_done = 0;
if (ret == 0 && !LF_ISSET(DB_FREELIST_ONLY) &&
__memp_fget(dbp->mpf, &pgno, ip, txn, 0, &meta) == 0) {
isdone = meta->free == PGNO_INVALID;
@@ -281,7 +283,8 @@ done: if (LF_ISSET(DB_FREE_SPACE)) {
} else
#endif
if (!isdone)
- ret = __bam_truncate_ipages(dbp, ip, txn_orig, c_data);
+ ret = __bam_truncate_ipages(dbp,
+ ip, txn_orig, c_data, &pgs_done);
/* Clean up the free list. */
if (list != NULL)
@@ -387,17 +390,26 @@ err: if (dbc != NULL && (t_ret = __LPUT(dbc, lock)) != 0 && ret == 0)
#endif
/*
- * __db_exchange_page -- swap a page with a lower numbered page.
- * The routine will optionally free the higher numbered page. The cursor
- * has a stack which includes at least the immediate parent of this page.
- * PUBLIC: int __db_exchange_page __P((DBC *, PAGE **, PAGE *, db_pgno_t, int));
+ * __db_exchange_page -- try to move a page 'down', to earlier in the file.
+ *
+ * This tries to move a page to a lower location in the file, by swapping it
+ * with an earlier free page. The free page comes either from the free list or
+ * the newpgno parameter (e.g., __ham_compact_hash()). If the new page turns
+ * out to be higher than the original one, the allocation is undone and
+ * the caller is left unchanged. After a successful swap, this routine can
+ * optionally free the old, higher numbered page.
+ * The cursor's stack includes at least the immediate parent of this page.
+ *
+ * PUBLIC: int __db_exchange_page
+ * PUBLIC: __P((DBC *, PAGE **, PAGE *, db_pgno_t, int, int *));
*/
int
-__db_exchange_page(dbc, pgp, opg, newpgno, flags)
+__db_exchange_page(dbc, pgp, opg, newpgno, flags, pgs_donep)
DBC *dbc;
PAGE **pgp, *opg;
db_pgno_t newpgno;
int flags;
+ int *pgs_donep;
{
BTREE_CURSOR *cp;
DB *dbp;
@@ -445,7 +457,9 @@ __db_exchange_page(dbc, pgp, opg, newpgno, flags)
* are allocating at the same time, if so, just put it back.
*/
if (PGNO(newpage) > PGNO(*pgp)) {
- /* Its unfortunate but you can't just free a new overflow. */
+ /* It is unfortunate but you can't just free a new overflow. */
+ /* XXX Is the above comment still true? */
+ /* XXX Should __db_new(OVERFLOW) zero OV_LEN()? */
if (TYPE(newpage) == P_OVERFLOW)
OV_LEN(newpage) = 0;
if ((ret = __LPUT(dbc, lock)) != 0)
@@ -572,7 +586,9 @@ __db_exchange_page(dbc, pgp, opg, newpgno, flags)
if ((ret = __TLPUT(dbc, lock)) != 0)
return (ret);
-done: return (0);
+done:
+ (*pgs_donep)++;
+ return (0);
err: (void)__memp_fput(dbp->mpf, dbc->thread_info, newpage, dbc->priority);
(void)__TLPUT(dbc, lock);
@@ -584,15 +600,16 @@ err: (void)__memp_fput(dbp->mpf, dbc->thread_info, newpage, dbc->priority);
* Walk the pages of an overflow chain and swap out
* high numbered pages. We are passed the first page
* but only deal with the second and subsequent pages.
- * PUBLIC: int __db_truncate_overflow __P((DBC *,
- * PUBLIC: db_pgno_t, PAGE **, DB_COMPACT *));
+ * PUBLIC: int __db_truncate_overflow __P((DBC *, db_pgno_t,
+ * PUBLIC: PAGE **, DB_COMPACT *, int *));
*/
int
-__db_truncate_overflow(dbc, pgno, ppg, c_data)
+__db_truncate_overflow(dbc, pgno, ppg, c_data, pgs_donep)
DBC *dbc;
db_pgno_t pgno;
PAGE **ppg;
DB_COMPACT *c_data;
+ int *pgs_donep;
{
DB *dbp;
DB_LOCK lock;
@@ -618,7 +635,7 @@ __db_truncate_overflow(dbc, pgno, ppg, c_data)
return (ret);
if (pgno <= c_data->compact_truncate)
continue;
- if (have_lock == 0) {
+ if (!have_lock) {
DB_ASSERT(dbp->env, ppg != NULL);
ppgno = PGNO(*ppg);
if ((ret = __memp_fput(dbp->mpf, dbc->thread_info,
@@ -635,30 +652,32 @@ __db_truncate_overflow(dbc, pgno, ppg, c_data)
have_lock = 1;
}
if ((ret = __db_exchange_page(dbc,
- &page, NULL, PGNO_INVALID, DB_EXCH_FREE)) != 0)
+ &page, NULL, PGNO_INVALID, DB_EXCH_FREE, pgs_donep)) != 0)
break;
}
err: if (page != NULL &&
- (t_ret = __memp_fput( dbp->mpf,
+ (t_ret = __memp_fput(dbp->mpf,
dbc->thread_info, page, dbc->priority)) != 0 && ret == 0)
ret = t_ret;
if ((t_ret = __TLPUT(dbc, lock)) != 0 && ret == 0)
ret = t_ret;
return (ret);
}
+
/*
* __db_truncate_root -- swap a root page for a lower numbered page.
* PUBLIC: int __db_truncate_root __P((DBC *,
- * PUBLIC: PAGE *, u_int32_t, db_pgno_t *, u_int32_t));
+ * PUBLIC: PAGE *, u_int32_t, db_pgno_t *, u_int32_t, int *));
*/
int
-__db_truncate_root(dbc, ppg, indx, pgnop, tlen)
+__db_truncate_root(dbc, ppg, indx, pgnop, tlen, pgs_donep)
DBC *dbc;
PAGE *ppg;
u_int32_t indx;
db_pgno_t *pgnop;
u_int32_t tlen;
+ int *pgs_donep;
{
DB *dbp;
DBT orig;
@@ -693,7 +712,7 @@ __db_truncate_root(dbc, ppg, indx, pgnop, tlen)
} else {
LOCK_CHECK_OFF(dbc->thread_info);
ret = __db_exchange_page(dbc,
- &page, NULL, PGNO_INVALID, DB_EXCH_FREE);
+ &page, NULL, PGNO_INVALID, DB_EXCH_FREE, pgs_donep);
LOCK_CHECK_ON(dbc->thread_info);
if (ret != 0)
goto err;
@@ -705,8 +724,7 @@ __db_truncate_root(dbc, ppg, indx, pgnop, tlen)
/* Update the reference. */
if (DBC_LOGGING(dbc)) {
- if ((ret = __db_pgno_log(dbp,
- dbc->txn, &LSN(ppg), 0, PGNO(ppg),
+ if ((ret = __db_pgno_log(dbp, dbc->txn, &LSN(ppg), 0, PGNO(ppg),
&LSN(ppg), (u_int32_t)indx, *pgnop, newpgno)) != 0)
goto err;
} else
@@ -780,13 +798,13 @@ __db_find_free(dbc, type, size, bstart, freep)
goto err;
if (nelems == 0) {
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
}
for (i = 0; i < nelems; i++) {
if (list[i] > bstart) {
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
}
start = i;
@@ -812,7 +830,7 @@ __db_find_free(dbc, type, size, bstart, freep)
goto found;
}
}
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
found: /* We have size range of pages. Remove them. */
@@ -1005,13 +1023,15 @@ err: if (np != NULL && np != otherp)
* __db_move_metadata -- move a meta data page to a lower page number.
* The meta data page must be exclusively latched on entry.
*
- * PUBLIC: int __db_move_metadata __P((DBC *, DBMETA **, DB_COMPACT *));
+ * PUBLIC: int __db_move_metadata
+ * PUBLIC: __P((DBC *, DBMETA **, DB_COMPACT *, int *));
*/
int
-__db_move_metadata(dbc, metap, c_data)
+__db_move_metadata(dbc, metap, c_data, pgs_donep)
DBC *dbc;
DBMETA **metap;
DB_COMPACT *c_data;
+ int *pgs_donep;
{
BTREE *bt;
DB *dbp, *mdbp;
@@ -1023,7 +1043,7 @@ __db_move_metadata(dbc, metap, c_data)
c_data->compact_pages_examine++;
if ((ret = __db_exchange_page(dbc,
- (PAGE**)metap, NULL, PGNO_INVALID, DB_EXCH_FREE)) != 0)
+ (PAGE **)metap, NULL, PGNO_INVALID, DB_EXCH_FREE, pgs_donep)) != 0)
return (ret);
if (PGNO(*metap) == dbp->meta_pgno)
diff --git a/src/db/db_conv.c b/src/db/db_conv.c
index 210b4d6e..77c6b760 100644
--- a/src/db/db_conv.c
+++ b/src/db/db_conv.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright (c) 1990, 1993, 1994, 1995, 1996
@@ -487,8 +487,12 @@ __db_byteswap(dbp, pg, h, pagesize, pgin)
{
ENV *env;
BINTERNAL *bi;
+ BBLOB *bl;
BKEYDATA *bk;
BOVERFLOW *bo;
+ HEAPBLOBHDR *bhdr;
+ HEAPHDR *hh;
+ HEAPSPLITHDR *hsh;
RINTERNAL *ri;
db_indx_t i, *inp, len, tmp;
u_int8_t *end, *p, *pgend;
@@ -500,8 +504,14 @@ __db_byteswap(dbp, pg, h, pagesize, pgin)
M_32_SWAP(h->lsn.file);
M_32_SWAP(h->lsn.offset);
M_32_SWAP(h->pgno);
- M_32_SWAP(h->prev_pgno);
- M_32_SWAP(h->next_pgno);
+ if (TYPE(h) == P_HEAP) {
+ M_32_SWAP(((HEAPPG *)h)->high_pgno);
+ M_16_SWAP(((HEAPPG *)h)->high_indx);
+ M_16_SWAP(((HEAPPG *)h)->free_indx);
+ } else {
+ M_32_SWAP(h->prev_pgno);
+ M_32_SWAP(h->next_pgno);
+ }
M_16_SWAP(h->entries);
M_16_SWAP(h->hf_offset);
}
@@ -527,6 +537,14 @@ __db_byteswap(dbp, pg, h, pagesize, pgin)
continue;
switch (HPAGE_TYPE(dbp, h, i)) {
+ case H_BLOB:
+ p = HBLOB_ID(P_ENTRY(dbp, h, i));
+ SWAP64(p); /* id */
+ SWAP64(p); /* size */
+ p = HBLOB_FILE_ID(P_ENTRY(dbp, h, i));
+ SWAP64(p); /* file id */
+ SWAP64(p); /* sdb id */
+ break;
case H_KEYDATA:
break;
case H_DUPLICATE:
@@ -599,6 +617,14 @@ __db_byteswap(dbp, pg, h, pagesize, pgin)
if ((u_int8_t *)bk >= pgend)
continue;
switch (B_TYPE(bk->type)) {
+ case B_BLOB:
+ bl = (BBLOB *)bk;
+ M_16_SWAP(bl->len);
+ M_64_SWAP(bl->id); /* id */
+ M_64_SWAP(bl->size); /* size */
+ M_64_SWAP(bl->file_id); /* file id */
+ M_64_SWAP(bl->sdb_id); /* sdb id */
+ break;
case B_KEYDATA:
M_16_SWAP(bk->len);
break;
@@ -663,6 +689,32 @@ __db_byteswap(dbp, pg, h, pagesize, pgin)
}
break;
case P_HEAP:
+ for (i = 0; i <= HEAP_HIGHINDX(h); i++) {
+ if (pgin)
+ M_16_SWAP(inp[i]);
+ if (inp[i] == 0)
+ continue;
+
+ hh = (HEAPHDR *)P_ENTRY(dbp, h, i);
+ if ((u_int8_t *)hh >= pgend)
+ continue;
+ M_16_SWAP(hh->size);
+ if (F_ISSET(hh, HEAP_RECSPLIT)) {
+ hsh = (HEAPSPLITHDR *)hh;
+ M_32_SWAP(hsh->tsize);
+ M_32_SWAP(hsh->nextpg);
+ M_16_SWAP(hsh->nextindx);
+ } else if (F_ISSET(hh, HEAP_RECBLOB)) {
+ bhdr = (HEAPBLOBHDR *)hh;
+ M_64_SWAP(bhdr->id); /* id */
+ M_64_SWAP(bhdr->size); /* size */
+ M_64_SWAP(bhdr->file_id); /* file id */
+ }
+
+ if (!pgin)
+ M_16_SWAP(inp[i]);
+ }
+ break;
case P_IHEAP:
case P_INVALID:
case P_OVERFLOW:
@@ -678,8 +730,14 @@ out: if (!pgin) {
M_32_SWAP(h->lsn.file);
M_32_SWAP(h->lsn.offset);
M_32_SWAP(h->pgno);
- M_32_SWAP(h->prev_pgno);
- M_32_SWAP(h->next_pgno);
+ if (TYPE(h) == P_HEAP) {
+ M_32_SWAP(((HEAPPG *)h)->high_pgno);
+ M_16_SWAP(((HEAPPG *)h)->high_indx);
+ M_16_SWAP(((HEAPPG *)h)->free_indx);
+ } else {
+ M_32_SWAP(h->prev_pgno);
+ M_32_SWAP(h->next_pgno);
+ }
M_16_SWAP(h->entries);
M_16_SWAP(h->hf_offset);
}
@@ -718,7 +776,10 @@ __db_pageswap(env, dbp, pp, len, pdata, pgin)
case P_HASHMETA:
return (__ham_mswap(env, pp));
-
+#ifdef HAVE_HEAP
+ case P_HEAPMETA:
+ return (__heap_mswap(env, pp));
+#endif
case P_QAMMETA:
return (__qam_mswap(env, pp));
@@ -794,12 +855,17 @@ __db_recordswap(op, size, hdr, data, pgin)
void *hdr, *data;
u_int32_t pgin;
{
+ BBLOB *bl;
BKEYDATA *bk;
BOVERFLOW *bo;
BINTERNAL *bi;
+ DBT *dbt;
+ HEAPHDR *hh;
+ HEAPBLOBHDR bhdr;
+ HEAPSPLITHDR *hsh;
RINTERNAL *ri;
db_indx_t tmp;
- u_int8_t *p, *end;
+ u_int8_t buf[HEAPBLOBREC_SIZE], *end, *p;
if (size == 0)
return;
@@ -812,6 +878,14 @@ __db_recordswap(op, size, hdr, data, pgin)
case B_KEYDATA:
M_16_SWAP(bk->len);
break;
+ case B_BLOB:
+ bl = (BBLOB *)bk;
+ M_16_SWAP(bl->len);
+ M_64_SWAP(bl->id); /* id */
+ M_64_SWAP(bl->size); /* size */
+ M_64_SWAP(bl->file_id); /* file id */
+ M_64_SWAP(bl->sdb_id); /* sdb id */
+ break;
case B_DUPLICATE:
case B_OVERFLOW:
bo = (BOVERFLOW *)hdr;
@@ -835,6 +909,7 @@ __db_recordswap(op, size, hdr, data, pgin)
} else
bo = (BOVERFLOW *)data;
M_32_SWAP(bo->pgno);
+ M_32_SWAP(bo->tlen);
}
break;
case P_IRECNO:
@@ -867,10 +942,10 @@ __db_recordswap(op, size, hdr, data, pgin)
SWAP16(p);
}
break;
- /* These two record types include the full header. */
+ /* These three record types include the full header. */
case H_OFFDUP:
p = (u_int8_t *)hdr;
- p += SSZ(HOFFPAGE, pgno);
+ p += SSZ(HOFFDUP, pgno);
SWAP32(p); /* pgno */
break;
case H_OFFPAGE:
@@ -879,11 +954,61 @@ __db_recordswap(op, size, hdr, data, pgin)
SWAP32(p); /* pgno */
SWAP32(p); /* tlen */
break;
+ case H_BLOB:
+ p = HBLOB_ID(hdr);
+ SWAP64(p); /* id */
+ SWAP64(p); /* size */
+ p = HBLOB_FILE_ID(hdr);
+ SWAP64(p); /* file id */
+ SWAP64(p); /* sdb id */
+ break;
default:
DB_ASSERT(NULL, op != op);
}
break;
-
+ case P_HEAP:
+ hh = (HEAPHDR *)hdr;
+ M_16_SWAP(hh->size);
+ if (F_ISSET(hh, HEAP_RECSPLIT)) {
+ hsh = (HEAPSPLITHDR *)hdr;
+ M_32_SWAP(hsh->tsize);
+ M_32_SWAP(hsh->nextpg);
+ M_16_SWAP(hsh->nextindx);
+ }else if (F_ISSET(hh, HEAP_RECBLOB)) {
+ /*
+ * Heap blob records are broken into two parts when
+ * logged, the shared header and the part that is
+ * unique to blob records, which is stored in the
+ * log data field.
+ */
+ if (data != NULL) {
+ dbt = NULL;
+ if (pgin) {
+ dbt = data;
+ memcpy(buf + sizeof(HEAPHDR),
+ dbt->data, HEAPBLOBREC_DSIZE);
+ } else {
+ memcpy(buf + sizeof(HEAPHDR),
+ data, HEAPBLOBREC_DSIZE);
+ }
+ memcpy(&bhdr, buf, HEAPBLOBREC_SIZE);
+ M_64_SWAP(bhdr.id); /* id */
+ M_64_SWAP(bhdr.size); /* size */
+ M_64_SWAP(bhdr.file_id); /* file id */
+ memcpy(buf, &bhdr, HEAPBLOBREC_SIZE);
+ if (pgin) {
+ memcpy(dbt->data,
+ HEAPBLOBREC_DATA(buf),
+ HEAPBLOBREC_DSIZE);
+ } else {
+ memcpy(data,
+ HEAPBLOBREC_DATA(buf),
+ HEAPBLOBREC_DSIZE);
+ }
+ }
+ break;
+ }
+ break;
default:
DB_ASSERT(NULL, op != op);
}
diff --git a/src/db/db_copy.c b/src/db/db_copy.c
index 359c74be..d9786702 100644
--- a/src/db/db_copy.c
+++ b/src/db/db_copy.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2011, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/db/db_dispatch.c b/src/db/db_dispatch.c
index 06de4ef7..7cb7f9ca 100644
--- a/src/db/db_dispatch.c
+++ b/src/db/db_dispatch.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright (c) 1995, 1996
@@ -639,7 +639,7 @@ __db_txnlist_find(env, hp, txnid, statusp)
DB_TXNLIST *entry;
if (txnid == 0)
- return (DB_NOTFOUND);
+ return (USR_ERR(env, DB_NOTFOUND));
return (__db_txnlist_find_internal(env, hp,
TXNLIST_TXNID, txnid, &entry, 0, statusp));
@@ -666,7 +666,7 @@ __db_txnlist_update(env, hp, txnid, status, lsn, ret_status, add_ok)
int ret;
if (txnid == 0)
- return (DB_NOTFOUND);
+ return (USR_ERR(env, DB_NOTFOUND));
ret = __db_txnlist_find_internal(env,
hp, TXNLIST_TXNID, txnid, &elp, 0, ret_status);
@@ -715,7 +715,7 @@ __db_txnlist_find_internal(env,
ret = 0;
if (hp == NULL)
- return (DB_NOTFOUND);
+ return (USR_ERR(env, DB_NOTFOUND));
switch (type) {
case TXNLIST_TXNID:
@@ -759,7 +759,7 @@ __db_txnlist_find_internal(env,
return (ret);
}
- return (DB_NOTFOUND);
+ return (USR_ERR(env, DB_NOTFOUND));
}
/*
diff --git a/src/db/db_dup.c b/src/db/db_dup.c
index 9fd04791..e66ec92b 100644
--- a/src/db/db_dup.c
+++ b/src/db/db_dup.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/db/db_iface.c b/src/db/db_iface.c
index 59e0ba53..da6140a4 100644
--- a/src/db/db_iface.c
+++ b/src/db/db_iface.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -159,9 +159,15 @@ __db_associate_arg(dbp, sdbp, callback, flags)
env = dbp->env;
+ if (dbp->blob_threshold || sdbp->blob_threshold) {
+ __db_errx(env, DB_STR("0751",
+ "Secondary and primary databases cannot support blobs."));
+ return (EINVAL);
+ }
+
if (sdbp->type == DB_HEAP) {
- __db_errx(env,
- "Heap databases may not be used as secondary databases");
+ __db_errx(env, DB_STR("0752",
+ "Heap databases may not be used as secondary databases"));
return (EINVAL);
}
@@ -288,6 +294,7 @@ __db_cursor_pp(dbp, txn, dbcp, flags)
int rep_blocked, ret;
env = dbp->env;
+ (*dbcp) = NULL;
DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->cursor");
@@ -331,7 +338,8 @@ __db_cursor_pp(dbp, txn, dbcp, flags)
* If a family transaction was passed in, the transaction handle in
* the cursor may not match.
*/
- txn = (*dbcp)->txn;
+ if ((*dbcp) != NULL)
+ txn = (*dbcp)->txn;
if (txn != NULL && ret == 0)
TAILQ_INSERT_HEAD(&(txn->my_cursors), *dbcp, txn_cursors);
@@ -434,6 +442,13 @@ __db_cursor_arg(dbp, flags)
return (__db_fnl(env, "DB->cursor"));
}
+ if (dbp->blob_threshold &&
+ LF_ISSET(DB_READ_UNCOMMITTED | DB_TXN_SNAPSHOT)) {
+ __db_errx(dbp->env, DB_STR("0753",
+"Blob enabled databases do not support READ_UNCOMMITTED and TXN_SNAPSHOT."));
+ return (EINVAL);
+ }
+
LF_CLR(DB_CURSOR_BULK |
DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_TXN_SNAPSHOT);
@@ -828,6 +843,12 @@ __db_get_arg(dbp, key, data, flags)
env = dbp->env;
+ if (dbp->blob_threshold && LF_ISSET(DB_READ_UNCOMMITTED)) {
+ __db_errx(env, DB_STR("0754",
+ "Blob enabled databases do not support DB_READ_UNCOMMITTED."));
+ return (EINVAL);
+ }
+
/*
* Check for read-modify-write validity. DB_RMW doesn't make sense
* with CDB cursors since if you're going to write the cursor, you
@@ -876,6 +897,9 @@ __db_get_arg(dbp, key, data, flags)
break;
case DB_CONSUME:
case DB_CONSUME_WAIT:
+ if (DB_IS_READONLY(dbp))
+ return (__db_rdonly(env,
+ "DB->get CONSUME/CONSUME_WAIT"));
if (dirty) {
__db_errx(env, DB_STR_A("0583",
"%s is not supported with DB_CONSUME or DB_CONSUME_WAIT",
@@ -1148,6 +1172,13 @@ __db_open_pp(dbp, txn, fname, dname, type, flags, mode)
/* Save the current DB handle flags for refresh. */
dbp->orig_flags = dbp->flags;
+ if (fname == 0 && PREFMAS_IS_SET(env)) {
+ __db_errx(env, DB_STR("0783", "In-memory databases are not "
+ "supported in Replication Manager preferred master mode"));
+ ret = EINVAL;
+ goto err;
+ }
+
/* Check for replication block. */
handle_check = IS_ENV_REPLICATED(env);
if (handle_check &&
@@ -1389,6 +1420,18 @@ __db_open_arg(dbp, txn, fname, dname, type, flags)
return (EINVAL);
}
+ if (LF_ISSET(DB_MULTIVERSION) && dbp->blob_threshold) {
+ __db_errx(env, DB_STR("0755",
+ "DB_MULTIVERSION illegal with blob enabled databases"));
+ return (EINVAL);
+ }
+
+ if (LF_ISSET(DB_READ_UNCOMMITTED) && dbp->blob_threshold) {
+ __db_errx(env, DB_STR("0756",
+ "DB_READ_UNCOMMITTED illegal with blob enabled databases"));
+ return (EINVAL);
+ }
+
/* DB_TRUNCATE is neither transaction recoverable nor lockable. */
if (LF_ISSET(DB_TRUNCATE) && (LOCKING_ON(env) || txn != NULL)) {
__db_errx(env, DB_STR_A("0599",
@@ -1901,8 +1944,6 @@ __db_compact_pp(dbp, txn, start, stop, c_data, flags, end)
ret = __db_compact_int(dbp, ip,
txn, start, stop, dp, flags, end);
break;
- case DB_HEAP:
- break;
default:
ret = __dbh_am_chk(dbp, DB_OK_BTREE);
break;
@@ -2893,7 +2934,7 @@ __dbt_ferr(dbp, name, dbt, check_thread)
* database, without having to clear flags.
*/
if ((ret = __db_fchk(env, name, dbt->flags,
- DB_DBT_APPMALLOC | DB_DBT_BULK | DB_DBT_DUPOK |
+ DB_DBT_APPMALLOC | DB_DBT_BLOB | DB_DBT_BULK | DB_DBT_DUPOK |
DB_DBT_MALLOC | DB_DBT_REALLOC | DB_DBT_USERCOPY |
DB_DBT_USERMEM | DB_DBT_PARTIAL | DB_DBT_READONLY)) != 0)
return (ret);
diff --git a/src/db/db_join.c b/src/db/db_join.c
index 751cf9e2..24d5260e 100644
--- a/src/db/db_join.c
+++ b/src/db/db_join.c
@@ -1,7 +1,7 @@
/*
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1998, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -717,7 +717,6 @@ __db_join_close(dbc)
DBC *dbc;
{
DB *dbp;
- DB_THREAD_INFO *ip;
ENV *env;
JOIN_CURSOR *jc;
int ret, t_ret;
@@ -737,7 +736,6 @@ __db_join_close(dbc)
TAILQ_REMOVE(&dbp->join_queue, dbc, links);
MUTEX_UNLOCK(env, dbp->mutex);
- ENV_ENTER(env, ip);
/*
* Close any open scratch cursors. In each case, there may
* not be as many outstanding as there are cursors in
@@ -757,7 +755,6 @@ __db_join_close(dbc)
(t_ret = __dbc_close(jc->j_fdupcurs[i])) != 0)
ret = t_ret;
}
- ENV_LEAVE(env, ip);
__os_free(env, jc->j_exhausted);
__os_free(env, jc->j_curslist);
@@ -796,7 +793,7 @@ __db_join_getnext(dbc, key, data, exhausted, opmods)
int ret, cmp;
DB *dbp;
DBT ldata;
- int (*func) __P((DB *, const DBT *, const DBT *));
+ int (*func) __P((DB *, const DBT *, const DBT *, size_t *));
dbp = dbc->dbp;
func = (dbp->dup_compare == NULL) ? __bam_defcmp : dbp->dup_compare;
@@ -812,7 +809,7 @@ __db_join_getnext(dbc, key, data, exhausted, opmods)
if ((ret = __dbc_get(dbc,
key, &ldata, opmods | DB_CURRENT)) != 0)
break;
- cmp = func(dbp, data, &ldata);
+ cmp = func(dbp, data, &ldata, NULL);
if (cmp == 0) {
/*
* We have to return the real data value. Copy
diff --git a/src/db/db_meta.c b/src/db/db_meta.c
index 8f97ebd8..53cf77cc 100644
--- a/src/db/db_meta.c
+++ b/src/db/db_meta.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright (c) 1990, 1993, 1994, 1995, 1996
@@ -939,12 +939,14 @@ done: if (last_pgnop != NULL)
*last_pgnop = meta->last_pgno;
/*
- * The truncate point is the number of pages in the free
- * list back from the last page. The number of pages
- * in the free list are the number that we can swap in.
- * Adjust it down slightly so if we find higher numbered
- * pages early and then free other pages later we can
- * truncate them.
+ * Set the truncation point which determines which pages may be
+ * relocated. Pages above are candidates to be swapped with a lower one
+ * from the freelist by __db_exchange_page(); pages before the truncate
+ * point are not relocated.
+ * The truncation point starts as N pages less than the last_pgno, where
+ * N is the size of the free list. This is reduced by 1/4 in the hope
+ * that partially full pages will be coalesced together, creating
+ * additional free pages during the compact.
*/
if (c_data) {
c_data->compact_truncate = (u_int32_t)meta->last_pgno - nelems;
diff --git a/src/db/db_method.c b/src/db/db_method.c
index 82d03e5f..d807bab6 100644
--- a/src/db/db_method.c
+++ b/src/db/db_method.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -9,6 +9,7 @@
#include "db_config.h"
#include "db_int.h"
+#include "dbinc/blob.h"
#include "dbinc/crypto.h"
#include "dbinc/db_page.h"
#include "dbinc/btree.h"
@@ -36,14 +37,15 @@ static int __db_set_alloc __P((DB *, void *(*)(size_t),
static int __db_get_append_recno __P((DB *,
int (**)(DB *, DBT *, db_recno_t)));
static int __db_set_append_recno __P((DB *, int (*)(DB *, DBT *, db_recno_t)));
+static int __db_get_blob_dir __P((DB *, const char **));
+static int __db_set_blob_dir __P((DB *, const char *));
+static int __db_get_blob_sub_dir __P((DB *, const char **));
static int __db_get_cachesize __P((DB *, u_int32_t *, u_int32_t *, int *));
static int __db_set_cachesize __P((DB *, u_int32_t, u_int32_t, int));
static int __db_get_create_dir __P((DB *, const char **));
static int __db_set_create_dir __P((DB *, const char *));
static int __db_get_dup_compare
- __P((DB *, int (**)(DB *, const DBT *, const DBT *)));
-static int __db_set_dup_compare
- __P((DB *, int (*)(DB *, const DBT *, const DBT *)));
+ __P((DB *, int (**)(DB *, const DBT *, const DBT *, size_t *)));
static int __db_get_encrypt_flags __P((DB *, u_int32_t *));
static int __db_set_encrypt __P((DB *, const char *, u_int32_t));
static int __db_get_feedback __P((DB *, void (**)(DB *, int, int)));
@@ -90,6 +92,12 @@ db_create(dbpp, dbenv, flags)
ip = NULL;
env = dbenv == NULL ? NULL : dbenv->env;
+#ifdef HAVE_ERROR_HISTORY
+ /* Call thread local storage initializer at least once per process. */
+ if (env == NULL)
+ __db_thread_init();
+#endif
+
/* Check for invalid function flags. */
switch (flags) {
case 0:
@@ -206,12 +214,11 @@ __db_create_internal(dbpp, env, flags)
err: if (dbp != NULL) {
if (dbp->mpf != NULL)
(void)__memp_fclose(dbp->mpf, 0);
+ if (F_ISSET(env, ENV_DBLOCAL))
+ (void)__env_close(dbp->dbenv, 0);
__os_free(env, dbp);
}
- if (dbp != NULL && F_ISSET(env, ENV_DBLOCAL))
- (void)__env_close(dbp->dbenv, 0);
-
return (ret);
}
@@ -225,6 +232,7 @@ __db_init(dbp, flags)
u_int32_t flags;
{
int ret;
+ u_int32_t bytes;
dbp->locker = NULL;
dbp->alt_close = NULL;
@@ -254,6 +262,9 @@ __db_init(dbp, flags)
dbp->get_alloc = __db_get_alloc;
dbp->get_append_recno = __db_get_append_recno;
dbp->get_assoc_flags = __db_get_assoc_flags;
+ dbp->get_blob_dir = __db_get_blob_dir;
+ dbp->get_blob_sub_dir = __db_get_blob_sub_dir;
+ dbp->get_blob_threshold = __db_get_blob_threshold;
dbp->get_byteswapped = __db_get_byteswapped;
dbp->get_cachesize = __db_get_cachesize;
dbp->get_create_dir = __db_get_create_dir;
@@ -290,6 +301,8 @@ __db_init(dbp, flags)
dbp->rename = __db_rename_pp;
dbp->set_alloc = __db_set_alloc;
dbp->set_append_recno = __db_set_append_recno;
+ dbp->set_blob_dir = __db_set_blob_dir;
+ dbp->set_blob_threshold = __db_set_blob_threshold;
dbp->set_cachesize = __db_set_cachesize;
dbp->set_create_dir = __db_set_create_dir;
dbp->set_dup_compare = __db_set_dup_compare;
@@ -316,7 +329,11 @@ __db_init(dbp, flags)
dbp->verify = __db_verify_pp;
/* DB PUBLIC HANDLE LIST END */
- /* Access method specific. */
+ if ((ret = __env_get_blob_threshold_int(dbp->env, &bytes)) != 0)
+ return (ret);
+ dbp->blob_threshold = bytes;
+
+ /* Access method specific. */
if ((ret = __bam_db_create(dbp)) != 0)
return (ret);
if ((ret = __ham_db_create(dbp)) != 0)
@@ -535,6 +552,182 @@ __db_set_append_recno(dbp, func)
}
/*
+ * __db_get_blob_threshold --
+ * Get the current threshold size at which records are stored as blobs.
+ *
+ * PUBLIC: int __db_get_blob_threshold __P((DB *, u_int32_t *));
+ */
+int
+__db_get_blob_threshold(dbp, bytes)
+ DB *dbp;
+ u_int32_t *bytes;
+{
+ /*
+ * Store the handle's blob_threshold field in *bytes. This getter
+ * has no failure path and always returns 0.
+ *
+ * While shared, this value never changes after open, so it is safe
+ * to access it without mutex protection.
+ */
+ *bytes = dbp->blob_threshold;
+
+ return (0);
+}
+
+/*
+ * __db_set_blob_threshold --
+ * API to allow setting the threshold size at which records are stored
+ * as blobs rather than in database items. No flags currently supported.
+ * PUBLIC: int __db_set_blob_threshold __P((DB *, u_int32_t, u_int32_t));
+ */
+int
+__db_set_blob_threshold(dbp, bytes, flags)
+ DB *dbp;
+ u_int32_t bytes;
+ u_int32_t flags;
+{
+ /*
+ * Validate flags against an allowed mask of 0. Whatever non-zero
+ * value __db_fchk returns, the caller sees EINVAL.
+ */
+ if (__db_fchk(dbp->env, "DB->set_blob_threshold", flags, 0) != 0)
+ return (EINVAL);
+
+ /*
+ * NOTE(review): DB_ILLEGAL_AFTER_OPEN is defined elsewhere; presumably
+ * it returns an error when the handle has already been opened.
+ */
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_blob_threshold");
+
+ /*
+ * The restrictions below apply only when blobs are being enabled
+ * (bytes != 0); a zero threshold is accepted regardless of the
+ * handle's checksum, encryption, duplicate or compression settings.
+ */
+ if (bytes != 0 && F_ISSET(dbp,
+ (DB_AM_CHKSUM | DB_AM_ENCRYPT | DB_AM_DUP | DB_AM_DUPSORT))) {
+ __db_errx(dbp->env, DB_STR("0760",
+"Cannot enable blobs in databases with checksum, encryption, or duplicates."));
+ return (EINVAL);
+ }
+#ifdef HAVE_COMPRESSION
+ if (DB_IS_COMPRESSED(dbp) && bytes != 0) {
+ __db_errx(dbp->env, DB_STR("0761",
+ "Cannot enable blobs in databases with compression."));
+ return (EINVAL);
+ }
+#endif
+
+ /* All checks passed: record the new threshold on the handle. */
+ dbp->blob_threshold = bytes;
+
+ return (0);
+}
+
+/*
+ * __db_blobs_enabled --
+ *
+ * Used to tell if the database is configured to support blobs.
+ * PUBLIC: int __db_blobs_enabled __P((DB *));
+ */
+int
+__db_blobs_enabled(dbp)
+ DB *dbp;
+{
+ /*
+ * Returns 1 only when every check below passes, 0 as soon as any
+ * one of them rules blobs out. Nothing on the handle is modified.
+ */
+
+ /* Blob threshold must be non-0. */
+ if (!dbp->blob_threshold)
+ return (0);
+ /* Blobs cannot support encryption or checksum, but that may change. */
+ if (F_ISSET(dbp, (DB_AM_CHKSUM | DB_AM_ENCRYPT)))
+ return (0);
+ /* Blobs do not support compression, but that may change. */
+#ifdef HAVE_COMPRESSION
+ if (DB_IS_COMPRESSED(dbp))
+ return (0);
+#endif
+ /* Blobs are off when the environment has DB_ENV_TXN_SNAPSHOT set. */
+ if (dbp->env->dbenv != NULL &&
+ F_ISSET(dbp->env->dbenv, DB_ENV_TXN_SNAPSHOT))
+ return (0);
+ /* Cannot support blobs in recno or queue. */
+ if (dbp->type == DB_RECNO || dbp->type == DB_QUEUE)
+ return (0);
+ /*
+ * Cannot support dups because that would require comparing
+ * blob data items.
+ */
+ if (F_ISSET(dbp, (DB_AM_DUP | DB_AM_DUPSORT)))
+ return (0);
+ /* No place to put blob files when using an in-memory db. */
+ if (F_ISSET(dbp, (DB_AM_INMEM)))
+ return (0);
+
+ /*
+ * BDB managed databases should not support blobs. Both the file
+ * name and the database name are tested with IS_DB_FILE, and a
+ * NULL name is simply skipped.
+ */
+ if ((dbp->fname != NULL && IS_DB_FILE(dbp->fname)) ||
+ (dbp->dname != NULL && IS_DB_FILE(dbp->dname)))
+ return (0);
+
+ return (1);
+}
+
+/*
+ * __db_get_blob_sub_dir --
+ *
+ * Returns the subdirectory of the blob directory in which the blob files
+ * for the given db are stored, or NULL if there is none.
+ *
+ */
+static int
+__db_get_blob_sub_dir(dbp, dir)
+ DB *dbp;
+ const char **dir;
+{
+ /*
+ * NOTE(review): DB_ILLEGAL_BEFORE_OPEN is defined elsewhere; presumably
+ * it returns an error when the handle has not been opened yet.
+ */
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->get_blob_sub_dir");
+
+ /*
+ * Hand back the handle's own pointer, not a copy; it may be NULL.
+ * The caller must not free or modify the string.
+ */
+ *dir = dbp->blob_sub_dir;
+
+ return (0);
+}
+
+/*
+ * __db_get_blob_dir --
+ *
+ * Get the blob directory for this database.
+ */
+static int
+__db_get_blob_dir(dbp, dir)
+ DB *dbp;
+ const char **dir;
+{
+ DB_ENV *dbenv;
+ ENV *env;
+
+ env = dbp->env;
+ dbenv = dbp->env->dbenv;
+ /* Default answer: no blob directory. */
+ *dir = NULL;
+
+ /* Without a DB_ENV there is nothing to report; *dir stays NULL. */
+ if (dbenv == NULL)
+ return (0);
+
+ /*
+ * Prefer the directory stored on the DB_ENV. Otherwise report
+ * BLOB_DEFAULT_DIR, but only when the environment has a home
+ * directory; with neither, *dir remains NULL. The pointer returned
+ * is not a copy. This function always returns 0.
+ */
+ if (dbenv->db_blob_dir != NULL)
+ *dir = dbenv->db_blob_dir;
+ else if (env->db_home != NULL)
+ *dir = BLOB_DEFAULT_DIR;
+
+ return (0);
+}
+
+/*
+ * __db_set_blob_dir --
+ *
+ * Set the blob directory in a local environment.
+ */
+static int
+__db_set_blob_dir(dbp, dir)
+ DB *dbp;
+ const char *dir;
+{
+ DB_ENV *dbenv;
+ ENV *env;
+
+ /*
+ * NOTE(review): both macros are defined elsewhere; presumably they
+ * return an error when the handle belongs to a user-created
+ * environment, or has already been opened, respectively.
+ */
+ DB_ILLEGAL_IN_ENV(dbp, "DB->set_blob_dir");
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_blob_dir");
+ env = dbp->env;
+ dbenv = dbp->env->dbenv;
+
+ /* No DB_ENV to store the setting on: silently succeed. */
+ if (dbenv == NULL)
+ return (0);
+
+ /*
+ * Release any previous setting, then store a private copy of dir.
+ * The old value is freed before the copy is attempted, so if
+ * __os_strdup fails the previous directory is lost and db_blob_dir
+ * is left NULL.
+ * NOTE(review): dir is not checked for NULL here -- confirm that
+ * __os_strdup tolerates it or that callers never pass NULL.
+ */
+ if (dbenv->db_blob_dir != NULL)
+ __os_free(env, dbenv->db_blob_dir);
+ dbenv->db_blob_dir = NULL;
+
+ return (__os_strdup(env, dir, &dbenv->db_blob_dir));
+}
+
+/*
* __db_get_cachesize --
* Get underlying cache size.
*/
@@ -607,7 +800,7 @@ __db_get_create_dir(dbp, dirp)
static int
__db_get_dup_compare(dbp, funcp)
DB *dbp;
- int (**funcp) __P((DB *, const DBT *, const DBT *));
+ int (**funcp) __P((DB *, const DBT *, const DBT *, size_t *));
{
DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE | DB_OK_HASH);
@@ -628,11 +821,14 @@ __db_get_dup_compare(dbp, funcp)
/*
* __db_set_dup_compare --
* Set duplicate comparison routine.
+ *
+ * PUBLIC: int __db_set_dup_compare __P((DB *,
+ * PUBLIC: int (*)(DB *, const DBT *, const DBT *, size_t *)));
*/
-static int
+int
__db_set_dup_compare(dbp, func)
DB *dbp;
- int (*func) __P((DB *, const DBT *, const DBT *));
+ int (*func) __P((DB *, const DBT *, const DBT *, size_t *));
{
int ret;
@@ -900,6 +1096,13 @@ __db_set_flags(dbp, flags)
ENV_REQUIRES_CONFIG(env,
env->tx_handle, "DB_NOT_DURABLE", DB_INIT_TXN);
+ if (dbp->blob_threshold &&
+ LF_ISSET(DB_CHKSUM | DB_ENCRYPT | DB_DUP | DB_DUPSORT)) {
+ __db_errx(dbp->env, DB_STR("0763",
+"Cannot enable checksum, encryption, or duplicates with blob support."));
+ return (EINVAL);
+ }
+
__db_map_flags(dbp, &flags, &dbp->flags);
if ((ret = __bam_set_flags(dbp, &flags)) != 0)
diff --git a/src/db/db_open.c b/src/db/db_open.c
index fefda48f..21074b15 100644
--- a/src/db/db_open.c
+++ b/src/db/db_open.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -119,6 +119,15 @@ __db_open(dbp, ip, txn, fname, dname, type, flags, mode, meta_pgno)
goto err;
/*
+ * Silently disable blobs in databases that cannot support them.
+ * Most illegal configurations will have already been caught; this
+ * is to allow a user to set an environment wide blob threshold, but
+ * not have to explicitly turn it off for in-memory or queue databases.
+ */
+ if (!__db_blobs_enabled(dbp))
+ dbp->blob_threshold = 0;
+
+ /*
* If both fname and subname are NULL, it's always a create, so make
* sure that we have both DB_CREATE and a type specified. It would
* be nice if this checking were done in __db_open where most of the
@@ -259,6 +268,11 @@ __db_open(dbp, ip, txn, fname, dname, type, flags, mode, meta_pgno)
if (ret != 0)
goto err;
+ if (dbp->blob_file_id != 0)
+ if ((ret = __blob_make_sub_dir(env, &dbp->blob_sub_dir,
+ dbp->blob_file_id, dbp->blob_sdb_id)) != 0)
+ goto err;
+
#ifdef HAVE_PARTITION
if (dbp->p_internal != NULL && (ret =
__partition_open(dbp, ip, txn, fname, type, flags, mode, 1)) != 0)
@@ -432,8 +446,10 @@ err: return (ret);
/*
* __db_chk_meta --
- * Take a buffer containing a meta-data page and check it for a valid LSN,
- * checksum (and verify the checksum if necessary) and possibly decrypt it.
+ * Validate a buffer containing a possible meta-data page. It is
+ * byte-swapped as necessary and checked for having a valid magic number.
+ * If it does, then it can validate the LSN, checksum (if necessary),
+ * and possibly decrypt it.
*
* Return 0 on success, >0 (errno).
*
@@ -447,44 +463,64 @@ __db_chk_meta(env, dbp, meta, flags)
u_int32_t flags;
{
DB_LSN swap_lsn;
- int is_hmac, ret, swapped;
- u_int32_t magic, orig_chk;
+ int is_hmac, needs_swap, ret;
+ u_int32_t magic;
u_int8_t *chksum;
ret = 0;
- swapped = 0;
+ needs_swap = 0;
+ /*
+ * We can verify that this is some kind of db now, before any potential
+ * decryption, because the first P_OVERHEAD() bytes of most pages are
+ * cleartext. This gets called both before and after swapping, so we
+ * need to check for byte swapping ourselves.
+ */
+ magic = meta->magic;
+magic_retry:
+ switch (magic) {
+ case DB_BTREEMAGIC:
+ case DB_HASHMAGIC:
+ case DB_HEAPMAGIC:
+ case DB_QAMMAGIC:
+ case DB_RENAMEMAGIC:
+ break;
+ default:
+ if (needs_swap)
+ /* It's already been swapped, so it isn't a BDB file. */
+ return (EINVAL);
+ M_32_SWAP(magic);
+ needs_swap = 1;
+ goto magic_retry;
+ }
+
+ if (LOGGING_ON(env) && !LF_ISSET(DB_CHK_NOLSN)) {
+ swap_lsn = meta->lsn;
+ if (needs_swap) {
+ M_32_SWAP(swap_lsn.file);
+ M_32_SWAP(swap_lsn.offset);
+ }
+ if (!IS_REP_CLIENT(env) && !IS_NOT_LOGGED_LSN(swap_lsn) &&
+ !IS_ZERO_LSN(swap_lsn) && (ret =
+ __log_check_page_lsn(env, dbp, &swap_lsn)) != 0)
+ return (ret);
+ }
if (FLD_ISSET(meta->metaflags, DBMETA_CHKSUM)) {
if (dbp != NULL)
F_SET(dbp, DB_AM_CHKSUM);
-
- is_hmac = meta->encrypt_alg == 0 ? 0 : 1;
- chksum = ((BTMETA *)meta)->chksum;
-
- /*
- * If we need to swap, the checksum function overwrites the
- * original checksum with 0, so we need to save a copy of the
- * original for swapping later.
- */
- orig_chk = *(u_int32_t *)chksum;
-
/*
* We cannot add this to __db_metaswap because that gets done
* later after we've verified the checksum or decrypted.
*/
if (LF_ISSET(DB_CHK_META)) {
- swapped = 0;
-chk_retry: if ((ret =
+ is_hmac = meta->encrypt_alg != 0;
+ chksum = ((BTMETA *)meta)->chksum;
+ if (needs_swap && !is_hmac)
+ M_32_SWAP(*(u_int32_t *)chksum);
+ if ((ret =
__db_check_chksum(env, NULL, env->crypto_handle,
- chksum, meta, DBMETASIZE, is_hmac)) != 0) {
- if (is_hmac || swapped)
- return (DB_CHKSUM_FAIL);
-
- M_32_SWAP(orig_chk);
- swapped = 1;
- *(u_int32_t *)chksum = orig_chk;
- goto chk_retry;
- }
+ chksum, meta, DBMETASIZE, is_hmac)) != 0)
+ return (DB_CHKSUM_FAIL);
}
} else if (dbp != NULL)
F_CLR(dbp, DB_AM_CHKSUM);
@@ -492,44 +528,8 @@ chk_retry: if ((ret =
#ifdef HAVE_CRYPTO
if (__crypto_decrypt_meta(env,
dbp, (u_int8_t *)meta, LF_ISSET(DB_CHK_META)) != 0)
- ret = DB_CHKSUM_FAIL;
- else
+ ret = DB_CHKSUM_FAIL;
#endif
-
- /* Now that we're decrypted, we can check LSN. */
- if (LOGGING_ON(env) && !LF_ISSET(DB_CHK_NOLSN)) {
- /*
- * This gets called both before and after swapping, so we
- * need to check ourselves. If we already swapped it above,
- * we'll know that here.
- */
-
- swap_lsn = meta->lsn;
- magic = meta->magic;
-lsn_retry:
- if (swapped) {
- M_32_SWAP(swap_lsn.file);
- M_32_SWAP(swap_lsn.offset);
- M_32_SWAP(magic);
- }
- switch (magic) {
- case DB_BTREEMAGIC:
- case DB_HASHMAGIC:
- case DB_HEAPMAGIC:
- case DB_QAMMAGIC:
- case DB_RENAMEMAGIC:
- break;
- default:
- if (swapped)
- return (EINVAL);
- swapped = 1;
- goto lsn_retry;
- }
- if (!IS_REP_CLIENT(env) &&
- !IS_NOT_LOGGED_LSN(swap_lsn) && !IS_ZERO_LSN(swap_lsn))
- /* Need to do check. */
- ret = __log_check_page_lsn(env, dbp, &swap_lsn);
- }
return (ret);
}
@@ -598,7 +598,6 @@ swap_retry:
}
/*
- * We can only check the meta page if we are sure we have a meta page.
* If it is random data, then this check can fail. So only now can we
* checksum and decrypt. Don't distinguish between configuration and
* checksum match errors here, because we haven't opened the database
@@ -606,9 +605,9 @@ swap_retry:
* If DB_SKIP_CHK is set, it means the checksum was already checked
* and the page was already decrypted.
*/
- if (!LF_ISSET(DB_SKIP_CHK) &&
+ if (!LF_ISSET(DB_SKIP_CHK) &&
(ret = __db_chk_meta(env, dbp, meta, flags)) != 0) {
- if (ret == DB_CHKSUM_FAIL)
+ if (ret == DB_CHKSUM_FAIL)
__db_errx(env, DB_STR_A("0640",
"%s: metadata page checksum error", "%s"), name);
goto bad_format;
@@ -669,10 +668,9 @@ swap_retry:
}
if (FLD_ISSET(meta->metaflags,
- DBMETA_PART_RANGE | DBMETA_PART_CALLBACK))
- if ((ret =
- __partition_init(dbp, meta->metaflags)) != 0)
- return (ret);
+ DBMETA_PART_RANGE | DBMETA_PART_CALLBACK) &&
+ (ret = __partition_init(dbp, meta->metaflags)) != 0)
+ return (ret);
return (0);
bad_format:
diff --git a/src/db/db_overflow.c b/src/db/db_overflow.c
index d992ec0d..22f349ed 100644
--- a/src/db/db_overflow.c
+++ b/src/db/db_overflow.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright (c) 1990, 1993, 1994, 1995, 1996
@@ -58,39 +58,26 @@
*/
/*
- * __db_goff --
- * Get an offpage item.
+ * __db_alloc_dbt
*
- * PUBLIC: int __db_goff __P((DBC *,
- * PUBLIC: DBT *, u_int32_t, db_pgno_t, void **, u_int32_t *));
+ * Allocate enough space in the dbt to hold the data. Also used by the
+ * blob file API.
+ *
+ * PUBLIC: int __db_alloc_dbt __P((ENV *, DBT *, u_int32_t, u_int32_t *,
+ * PUBLIC: u_int32_t *, void **, u_int32_t *));
*/
int
-__db_goff(dbc, dbt, tlen, pgno, bpp, bpsz)
- DBC *dbc;
+__db_alloc_dbt(env, dbt, tlen, nd, st, bpp, bpsz)
+ ENV *env;
DBT *dbt;
u_int32_t tlen;
- db_pgno_t pgno;
+ u_int32_t *nd;
+ u_int32_t *st;
void **bpp;
u_int32_t *bpsz;
{
- DB *dbp;
- DB_MPOOLFILE *mpf;
- DB_TXN *txn;
- DBC_INTERNAL *cp;
- ENV *env;
- PAGE *h;
- DB_THREAD_INFO *ip;
- db_indx_t bytes;
- u_int32_t curoff, needed, start;
- u_int8_t *p, *src;
int ret;
-
- dbp = dbc->dbp;
- cp = dbc->internal;
- env = dbp->env;
- ip = dbc->thread_info;
- mpf = dbp->mpf;
- txn = dbc->txn;
+ u_int32_t needed, start;
/*
* Check if the buffer is big enough; if it is not and we are
@@ -110,6 +97,8 @@ __db_goff(dbc, dbt, tlen, pgno, bpp, bpsz)
start = 0;
needed = tlen;
}
+ *nd = needed;
+ *st = start;
/*
* If the caller has not requested any data, return success. This
@@ -123,7 +112,7 @@ __db_goff(dbc, dbt, tlen, pgno, bpp, bpsz)
}
if (F_ISSET(dbt, DB_DBT_USERCOPY))
- goto skip_alloc;
+ return (0);
/* Allocate any necessary memory. */
if (F_ISSET(dbt, DB_DBT_USERMEM)) {
@@ -152,7 +141,48 @@ __db_goff(dbc, dbt, tlen, pgno, bpp, bpsz)
return (DB_BUFFER_SMALL);
}
-skip_alloc:
+ return (0);
+}
+
+/*
+ * __db_goff --
+ * Get an offpage item.
+ *
+ * PUBLIC: int __db_goff __P((DBC *,
+ * PUBLIC: DBT *, u_int32_t, db_pgno_t, void **, u_int32_t *));
+ */
+int
+__db_goff(dbc, dbt, tlen, pgno, bpp, bpsz)
+ DBC *dbc;
+ DBT *dbt;
+ u_int32_t tlen;
+ db_pgno_t pgno;
+ void **bpp;
+ u_int32_t *bpsz;
+{
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ DB_TXN *txn;
+ DBC_INTERNAL *cp;
+ ENV *env;
+ PAGE *h;
+ DB_THREAD_INFO *ip;
+ db_indx_t bytes;
+ u_int32_t curoff, needed, start;
+ u_int8_t *p, *src;
+ int ret;
+
+ dbp = dbc->dbp;
+ cp = dbc->internal;
+ env = dbp->env;
+ ip = dbc->thread_info;
+ mpf = dbp->mpf;
+ txn = dbc->txn;
+
+ if (((ret = __db_alloc_dbt(
+ env, dbt, tlen, &needed, &start, bpp, bpsz)) != 0) || needed == 0)
+ return (ret);
+
/* Set up a start page in the overflow chain if streaming. */
if (cp->stream_start_pgno != PGNO_INVALID &&
pgno == cp->stream_start_pgno && start >= cp->stream_off &&
@@ -485,28 +515,33 @@ __db_doff(dbc, pgno)
/*
* __db_moff --
- * Match on overflow pages.
+ * Match on overflow pages from a specific offset.
*
- * Given a starting page number and a key, return <0, 0, >0 to indicate if the
- * key on the page is less than, equal to or greater than the key specified.
- * We optimize this by doing chunk at a time comparison unless the user has
- * specified a comparison function. In this case, we need to materialize
- * the entire object and call their comparison routine.
+ * Given a starting page number and a key, store <0, 0, >0 in 'cmpp' to indicate
+ * if the key on the page is less than, equal to or greater than the key
+ * specified. We optimize this by doing a chunk at a time comparison unless the
+ * user has specified a comparison function. In this case, we need to
+ * materialize the entire object and call their comparison routine.
+ *
+ * We start the comparison at an offset and update the offset with the
+ * longest matching count after the comparison.
*
* __db_moff and __db_coff are generic functions useful in searching and
* ordering off page items. __db_moff matches an overflow DBT with an offpage
* item. __db_coff compares two offpage items for lexicographic sort order.
*
* PUBLIC: int __db_moff __P((DBC *, const DBT *, db_pgno_t, u_int32_t,
- * PUBLIC: int (*)(DB *, const DBT *, const DBT *), int *));
+ * PUBLIC: int (*)(DB *, const DBT *, const DBT *, size_t *),
+ * PUBLIC: int *, size_t *));
*/
int
-__db_moff(dbc, dbt, pgno, tlen, cmpfunc, cmpp)
+__db_moff(dbc, dbt, pgno, tlen, cmpfunc, cmpp, locp)
DBC *dbc;
const DBT *dbt;
db_pgno_t pgno;
u_int32_t tlen;
- int (*cmpfunc) __P((DB *, const DBT *, const DBT *)), *cmpp;
+ int (*cmpfunc) __P((DB *, const DBT *, const DBT *, size_t *)), *cmpp;
+ size_t *locp;
{
DB *dbp;
DBT local_dbt;
@@ -517,6 +552,7 @@ __db_moff(dbc, dbt, pgno, tlen, cmpfunc, cmpp)
u_int32_t bufsize, cmp_bytes, key_left;
u_int8_t *p1, *p2;
int ret;
+ size_t pos, start;
dbp = dbc->dbp;
ip = dbc->thread_info;
@@ -535,39 +571,76 @@ __db_moff(dbc, dbt, pgno, tlen, cmpfunc, cmpp)
&local_dbt, tlen, pgno, &buf, &bufsize)) != 0)
return (ret);
/* Pass the key as the first argument */
- *cmpp = cmpfunc(dbp, dbt, &local_dbt);
+ *cmpp = cmpfunc(dbp, dbt, &local_dbt, NULL);
__os_free(dbp->env, buf);
return (0);
}
+ /*
+ * We start the comparison from the location of 'locp' and store the
+ * last matching location into 'locp'.
+ */
+ start = (locp == NULL ? 0 : *locp);
+ pos = 0;
+
+ /* Subtract prefix length from lengths. */
+ tlen -= (u_int32_t)start;
+ key_left = dbt->size - (u_int32_t)start;
+ p1 = (u_int8_t *)dbt->data + start;
+
/* While there are both keys to compare. */
- for (*cmpp = 0, p1 = dbt->data,
- key_left = dbt->size; key_left > 0 && pgno != PGNO_INVALID;) {
+ for (*cmpp = 0; key_left > 0 &&
+ tlen > 0 && pgno != PGNO_INVALID;) {
if ((ret =
__memp_fget(mpf, &pgno, ip, dbc->txn, 0, &pagep)) != 0)
return (ret);
- cmp_bytes = OV_LEN(pagep) < key_left ? OV_LEN(pagep) : key_left;
- tlen -= cmp_bytes;
- key_left -= cmp_bytes;
- for (p2 = (u_int8_t *)pagep + P_OVERHEAD(dbp);
- cmp_bytes-- > 0; ++p1, ++p2)
- if (*p1 != *p2) {
- *cmpp = (long)*p1 - (long)*p2;
- break;
+ /*
+ * Figure out where to start comparison, and how many
+ * bytes to compare.
+ */
+ if (pos >= start) {
+ p2 = (u_int8_t *)pagep + P_OVERHEAD(dbp);
+ cmp_bytes = OV_LEN(pagep);
+ } else if (pos + OV_LEN(pagep) > start) {
+ p2 = (u_int8_t *)pagep +
+ P_OVERHEAD(dbp) + (start - pos);
+ cmp_bytes = OV_LEN(pagep) - (u_int32_t)(start - pos);
+ } else {
+ p2 = NULL;
+ cmp_bytes = 0;
+ }
+
+ pos += OV_LEN(pagep);
+
+ if (cmp_bytes != 0) {
+ if (cmp_bytes > key_left)
+ cmp_bytes = key_left;
+ tlen -= cmp_bytes;
+ key_left -= cmp_bytes;
+ for (;cmp_bytes-- > 0; ++p1, ++p2) {
+ if (*p1 != *p2) {
+ *cmpp = (long)*p1 - (long)*p2;
+ break;
+ }
+ if (locp != NULL)
+ ++(*locp);
}
+
+ }
pgno = NEXT_PGNO(pagep);
if ((ret = __memp_fput(mpf, ip, pagep, dbp->priority)) != 0)
return (ret);
if (*cmpp != 0)
return (0);
}
- if (key_left > 0) /* DBT is longer than the page key. */
- *cmpp = 1;
- else if (tlen > 0) /* DBT is shorter than the page key. */
- *cmpp = -1;
- else
- *cmpp = 0;
+
+ if (*cmpp == 0) {
+ if (key_left > 0) /* DBT is longer than the page key. */
+ *cmpp = 1;
+ else if (tlen > 0) /* DBT is shorter than the page key. */
+ *cmpp = -1;
+ }
return (0);
}
@@ -587,13 +660,13 @@ __db_moff(dbc, dbt, pgno, tlen, cmpfunc, cmpp)
* DBT type.
*
* PUBLIC: int __db_coff __P((DBC *, const DBT *, const DBT *,
- * PUBLIC: int (*)(DB *, const DBT *, const DBT *), int *));
+ * PUBLIC: int (*)(DB *, const DBT *, const DBT *, size_t *), int *));
*/
int
__db_coff(dbc, dbt, match, cmpfunc, cmpp)
DBC *dbc;
const DBT *dbt, *match;
- int (*cmpfunc) __P((DB *, const DBT *, const DBT *)), *cmpp;
+ int (*cmpfunc) __P((DB *, const DBT *, const DBT *, size_t *)), *cmpp;
{
DB *dbp;
DB_THREAD_INFO *ip;
@@ -643,7 +716,7 @@ __db_coff(dbc, dbt, match, cmpfunc, cmpp)
match_pgno, &match_buf, &match_bufsz)) != 0)
goto err1;
/* The key needs to be the first argument for sort order */
- *cmpp = cmpfunc(dbp, &local_key, &local_match);
+ *cmpp = cmpfunc(dbp, &local_key, &local_match, NULL);
err1: if (dbt_buf != NULL)
__os_free(dbp->env, dbt_buf);
@@ -657,6 +730,7 @@ err1: if (dbt_buf != NULL)
if ((ret =
__memp_fget(mpf, &dbt_pgno, ip, txn, 0, &dbt_pagep)) != 0)
return (ret);
+ DB_ASSERT(dbc->env, TYPE(dbt_pagep) == P_OVERFLOW);
if ((ret =
__memp_fget(mpf, &match_pgno,
ip, txn, 0, &match_pagep)) != 0) {
@@ -664,6 +738,7 @@ err1: if (dbt_buf != NULL)
mpf, ip, dbt_pagep, DB_PRIORITY_UNCHANGED);
return (ret);
}
+ DB_ASSERT(dbc->env, TYPE(match_pagep) == P_OVERFLOW);
cmp_bytes = page_space < max_data ? page_space : max_data;
for (p1 = (u_int8_t *)dbt_pagep + P_OVERHEAD(dbp),
p2 = (u_int8_t *)match_pagep + P_OVERHEAD(dbp);
diff --git a/src/db/db_ovfl_vrfy.c b/src/db/db_ovfl_vrfy.c
index fa630f7b..55eb2b70 100644
--- a/src/db/db_ovfl_vrfy.c
+++ b/src/db/db_ovfl_vrfy.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright (c) 1990, 1993, 1994, 1995, 1996
diff --git a/src/db/db_pr.c b/src/db/db_pr.c
index d95440f9..4933498e 100644
--- a/src/db/db_pr.c
+++ b/src/db/db_pr.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -11,6 +11,7 @@
#include "db_int.h"
#include "dbinc/db_page.h"
#include "dbinc/btree.h"
+#include "dbinc/fop.h"
#include "dbinc/hash.h"
#include "dbinc/heap.h"
#include "dbinc/mp.h"
@@ -25,6 +26,11 @@ static int __db_hmeta __P((ENV *, DB *, HMETA *, u_int32_t));
static void __db_meta __P((ENV *, DB *, DBMETA *, FN const *, u_int32_t));
static void __db_proff __P((ENV *, DB_MSGBUF *, void *));
static int __db_qmeta __P((ENV *, DB *, QMETA *, u_int32_t));
+static int __db_prblob __P((DBC *, DBT *, DBT *, int, const char *,
+ void *, int (*callback) __P((void *, const void *)), int, int));
+static int __db_prblob_id __P((DB *, db_seq_t,
+ off_t, DBT *, int, const char *, void *,
+ int (*callback) __P((void *, const void *))));
#ifdef HAVE_STATISTICS
static void __db_prdb __P((DB *, u_int32_t));
static int __db_prtree __P((DB *, DB_TXN *,
@@ -515,6 +521,11 @@ __db_bmeta(env, dbp, h, flags)
__db_msg(env, "\tre_len: %#lx re_pad: %#lx",
(u_long)h->re_len, (u_long)h->re_pad);
__db_msg(env, "\troot: %lu", (u_long)h->root);
+ __db_msg(env, "\tblob_threshold: %lu", (u_long)h->blob_threshold);
+ __db_msg(env, "\tblob_file_lo: %lu", (u_long)h->blob_file_lo);
+ __db_msg(env, "\tblob_file_hi: %lu", (u_long)h->blob_file_hi);
+ __db_msg(env, "\tblob_sdb_lo: %lu", (u_long)h->blob_sdb_lo);
+ __db_msg(env, "\tblob_sdb_hi: %lu", (u_long)h->blob_sdb_hi);
return (0);
}
@@ -549,6 +560,11 @@ __db_hmeta(env, dbp, h, flags)
__db_msg(env, "\tffactor: %lu", (u_long)h->ffactor);
__db_msg(env, "\tnelem: %lu", (u_long)h->nelem);
__db_msg(env, "\th_charkey: %#lx", (u_long)h->h_charkey);
+ __db_msg(env, "\tblob_threshold: %lu", (u_long)h->blob_threshold);
+ __db_msg(env, "\tblob_file_lo: %lu", (u_long)h->blob_file_lo);
+ __db_msg(env, "\tblob_file_hi: %lu", (u_long)h->blob_file_hi);
+ __db_msg(env, "\tblob_sdb_lo: %lu", (u_long)h->blob_sdb_lo);
+ __db_msg(env, "\tblob_sdb_hi: %lu", (u_long)h->blob_sdb_hi);
__db_msgadd(env, &mb, "\tspare points:\n\t");
for (i = 0; i < NCACHED; i++) {
__db_msgadd(env, &mb, "%lu (%lu) ", (u_long)h->spares[i],
@@ -604,6 +620,9 @@ __db_heapmeta(env, dbp, h, flags)
__db_msg(env, "\tnregions: %lu", (u_long)h->nregions);
__db_msg(env, "\tgbytes: %lu", (u_long)h->gbytes);
__db_msg(env, "\tbytes: %lu", (u_long)h->bytes);
+ __db_msg(env, "\tblob_threshold: %lu", (u_long)h->blob_threshold);
+ __db_msg(env, "\tblob_file_lo: %lu", (u_long)h->blob_file_lo);
+ __db_msg(env, "\tblob_file_hi: %lu", (u_long)h->blob_file_hi);
return (0);
}
@@ -682,14 +701,19 @@ __db_prpage_int(env, mbp, dbp, lead, h, pagesize, data, flags)
{
BINTERNAL *bi;
BKEYDATA *bk;
+ BBLOB bl;
HOFFPAGE a_hkd;
+ HBLOB hblob;
QAMDATA *qp, *qep;
RINTERNAL *ri;
HEAPHDR *hh;
HEAPSPLITHDR *hs;
+ HEAPBLOBHDR bhdr;
db_indx_t dlen, len, i, *inp, max;
db_pgno_t pgno;
db_recno_t recno;
+ off_t blob_size;
+ db_seq_t blob_id;
u_int32_t qlen;
u_int8_t *ep, *hk, *p;
int deleted, ret;
@@ -899,6 +923,23 @@ __db_prpage_int(env, mbp, dbp, lead, h, pagesize, data, flags)
(u_long)a_hkd.tlen, (u_long)a_hkd.pgno);
DB_MSGBUF_FLUSH(env, mbp);
break;
+ case H_BLOB:
+ memcpy(&hblob, hk, HBLOB_SIZE);
+ blob_id = (db_seq_t)hblob.id;
+ __db_msgadd(env, mbp, "blob: id: %llu ",
+ (long long)blob_id);
+ GET_BLOB_SIZE(env, hblob, blob_size, ret);
+ if (ret != 0)
+ __db_msgadd(env, mbp,
+ "blob: blob_size overflow. ");
+ __db_msgadd(env, mbp, "blob: size: %llu",
+ (long long)blob_size);
+ /*
+ * No point printing the blob file, it is
+ * likely not readable by humans.
+ */
+ DB_MSGBUF_FLUSH(env, mbp);
+ break;
default:
DB_MSGBUF_FLUSH(env, mbp);
__db_msg(env, "ILLEGAL HASH PAGE TYPE: %lu",
@@ -925,6 +966,7 @@ __db_prpage_int(env, mbp, dbp, lead, h, pagesize, data, flags)
__db_proff(env, mbp, bi->data);
break;
default:
+ /* B_BLOB does not appear on internal pages. */
DB_MSGBUF_FLUSH(env, mbp);
__db_msg(env, "ILLEGAL BINTERNAL TYPE: %lu",
(u_long)B_TYPE(bi->type));
@@ -950,6 +992,19 @@ __db_prpage_int(env, mbp, dbp, lead, h, pagesize, data, flags)
case B_OVERFLOW:
__db_proff(env, mbp, bk);
break;
+ case B_BLOB:
+ memcpy(&bl, bk, BBLOB_SIZE);
+ blob_id = (db_seq_t)bl.id;
+ __db_msgadd(env, mbp, "blob: id: %llu ",
+ (long long)blob_id);
+ GET_BLOB_SIZE(env, bl, blob_size, ret);
+ if (ret != 0)
+ __db_msgadd(env, mbp,
+ "blob: blob_size overflow. ");
+ __db_msgadd(env, mbp, "blob: size: %llu",
+ (long long)blob_size);
+ DB_MSGBUF_FLUSH(env, mbp);
+ break;
default:
DB_MSGBUF_FLUSH(env, mbp);
__db_msg(env,
@@ -961,9 +1016,27 @@ __db_prpage_int(env, mbp, dbp, lead, h, pagesize, data, flags)
break;
case P_HEAP:
hh = sp;
- if (!F_ISSET(hh,HEAP_RECSPLIT))
+ if (!F_ISSET(hh,HEAP_RECSPLIT) &&
+ !F_ISSET(hh, HEAP_RECBLOB))
hdata = (u_int8_t *)hh + sizeof(HEAPHDR);
- else {
+ else if (F_ISSET(hh, HEAP_RECBLOB)) {
+ memcpy(&bhdr, hh, HEAPBLOBREC_SIZE);
+ blob_id = (db_seq_t)bhdr.id;
+ __db_msgadd(env, mbp, "blob: id: %llu ",
+ (long long)blob_id);
+ GET_BLOB_SIZE(env, bhdr, blob_size, ret);
+ if (ret != 0)
+ __db_msgadd(env, mbp,
+ "blob: blob_size overflow. ");
+ __db_msgadd(env, mbp, "blob: size: %llu",
+ (long long)blob_size);
+ /*
+ * No point printing the blob file, it is
+ * likely not readable by humans.
+ */
+ DB_MSGBUF_FLUSH(env, mbp);
+ break;
+ } else {
hs = sp;
__db_msgadd(env, mbp,
"split: 0x%02x tsize: %lu next: %lu.%lu ",
@@ -1276,10 +1349,16 @@ __db_dump(dbp, subname, callback, handle, pflag, keyflag)
ENV *env;
db_recno_t recno;
int is_recno, is_heap, ret, t_ret;
+ u_int32_t blob_threshold;
void *pointer;
env = dbp->env;
is_heap = 0;
+ memset(&dataret, 0, sizeof(DBT));
+ memset(&keyret, 0, sizeof(DBT));
+
+ if ((ret = __db_get_blob_threshold(dbp, &blob_threshold)) != 0)
+ return (ret);
if ((ret = __db_prheader(
dbp, subname, pflag, keyflag, handle, callback, NULL, 0)) != 0)
@@ -1317,8 +1396,8 @@ retry: while ((ret =
!is_heap ? DB_NEXT | DB_MULTIPLE_KEY : DB_NEXT )) == 0) {
if (is_heap) {
/* Never dump keys for HEAP */
- if ((ret = __db_prdbt(
- &data, pflag, " ", handle, callback, 0, 0)) != 0)
+ if ((ret = __db_prdbt(&data,
+ pflag, " ", handle, callback, 0, 0, 0)) != 0)
goto err;
continue;
}
@@ -1337,17 +1416,24 @@ retry: while ((ret =
if ((keyflag &&
(ret = __db_prdbt(&keyret, pflag, " ",
- handle, callback, is_recno, 0)) != 0) ||
+ handle, callback, is_recno, 0, 0)) != 0) ||
(ret = __db_prdbt(&dataret, pflag, " ",
- handle, callback, 0, 0)) != 0)
+ handle, callback, 0, 0, 0)) != 0)
goto err;
}
}
if (ret == DB_BUFFER_SMALL) {
- data.size = (u_int32_t)DB_ALIGN(data.size, 1024);
- if ((ret = __os_realloc(env, data.size, &data.data)) != 0)
- goto err;
- data.ulen = data.size;
+ if (blob_threshold != 0 && data.size >= blob_threshold) {
+ if ((ret = __db_prblob(dbcp, &key, &data, pflag,
+ " ", handle, callback, is_heap, keyflag)) != 0)
+ goto err;
+ } else {
+ data.size = (u_int32_t)DB_ALIGN(data.size, 1024);
+ if ((ret = __os_realloc(
+ env, data.size, &data.data)) != 0)
+ goto err;
+ data.ulen = data.size;
+ }
goto retry;
}
if (ret == DB_NOTFOUND)
@@ -1365,14 +1451,153 @@ err: if ((t_ret = __dbc_close(dbcp)) != 0 && ret == 0)
}
/*
+ * __db_prblob
+ * Print a blob file.
+ */
+static int
+__db_prblob(dbc, key, data, checkprint,
+    prefix, handle, callback, is_heap, keyflag)
+	DBC *dbc;
+	DBT *key;
+	DBT *data;
+	int checkprint;
+	const char *prefix;
+	void *handle;
+	int (*callback) __P((void *, const void *));
+	int is_heap;
+	int keyflag;
+{
+	DBC *local;
+	DBT partial;
+	int ret, t_ret;
+	off_t blob_size;
+	db_seq_t blob_id;
+
+	/*
+	 * Dump the record that follows the position of 'dbc'.
+	 *
+	 * The record is located with a duplicate cursor so that 'dbc' is
+	 * only advanced once the blob has been printed.  If the record
+	 * turns out not to be a blob, nothing is printed: 'data' is grown
+	 * to hold it, 0 is returned, and 'dbc' is left where it was.
+	 *
+	 * dbc        - cursor positioned just before the record.
+	 * key        - DBT used to fetch (and optionally print) the key.
+	 * data       - scratch DBT; its buffer is grown with __os_realloc.
+	 * checkprint, prefix, handle, callback
+	 *            - passed through to __db_prdbt / __db_prblob_id.
+	 * is_heap    - when non-zero the key is never printed.
+	 * keyflag    - when zero the key is never printed.
+	 *
+	 * Returns 0 on success, otherwise the error of the failing call.
+	 * The duplicate cursor is closed on every path.
+	 */
+	local = NULL;
+	memset(&partial, 0, sizeof(DBT));
+	partial.flags = DB_DBT_PARTIAL;
+
+	if ((ret = __dbc_idup(dbc, &local, DB_POSITION)) != 0)
+		goto err;
+
+	/*
+	 * Move the duplicate cursor to the blob.  On failure go through
+	 * the cleanup label so that 'local' is not leaked.
+	 */
+	if ((ret = __dbc_get(local, key, &partial, DB_NEXT)) != 0)
+		goto err;
+
+	if ((ret = __dbc_get_blob_id(local, &blob_id)) != 0) {
+		/*
+		 * It is possible this is not a blob.  Non-blob items that are
+		 * larger than the blob threshold can exist if the item was
+		 * smaller than the threshold when created, then later updated
+		 * to larger than the threshold value.  In that case just make
+		 * the buffer large enough for the item.
+		 */
+		if (ret == EINVAL) {
+			data->size = (u_int32_t)DB_ALIGN(data->size, 1024);
+			if ((ret = __os_realloc(
+			    dbc->env, data->size, &data->data)) == 0)
+				data->ulen = data->size;
+		}
+		goto err;
+	}
+
+	/*
+	 * Blob contents are read in chunks of at least a megabyte.  Grow
+	 * the buffer with __os_realloc, the allocator used for it above,
+	 * and only publish the new length once the allocation succeeded,
+	 * so a failure leaves 'data' consistent.
+	 */
+	if (data->ulen < MEGABYTE) {
+		if ((ret = __os_realloc(
+		    dbc->env, MEGABYTE, &data->data)) != 0)
+			goto err;
+		data->ulen = MEGABYTE;
+	}
+
+	if ((ret = __dbc_get_blob_size(local, &blob_size)) != 0)
+		goto err;
+
+	if (keyflag && !is_heap && (ret = __db_prdbt(
+	    key, checkprint, " ", handle, callback, 0, 0, 0)) != 0)
+		goto err;
+
+	if ((ret = __db_prblob_id(local->dbp, blob_id, blob_size,
+	    data, checkprint, prefix, handle, callback)) != 0)
+		goto err;
+
+	/* The blob has been printed: advance the caller's cursor past it. */
+	ret = __dbc_get(dbc, key, &partial, DB_NEXT);
+
+err:	if (local != NULL) {
+		if ((t_ret = __dbc_close(local)) != 0 && ret == 0)
+			ret = t_ret;
+	}
+
+	return (ret);
+}
+
+/*
+ * __db_prblob_id --
+ * Print a blob file identified by the given id.
+ */
+static int
+__db_prblob_id(dbp, blob_id,
+ blob_size, data, checkprint, prefix, handle, callback)
+ DB *dbp;
+ db_seq_t blob_id;
+ off_t blob_size;
+ DBT *data;
+ int checkprint;
+ const char *prefix;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+{
+ DB_FH *fhp;
+ const char *pre;
+ int ret, skip_newline, t_ret;
+ off_t left, offset;
+
+ /*
+ * Open the blob file for blob_id with DB_FOP_READONLY and print
+ * blob_size bytes of it through __db_prdbt, reading at most
+ * data->ulen bytes per iteration into the caller's buffer.
+ * Returns 0 on success or the first error encountered; the file
+ * handle is closed on every path once it has been opened.
+ */
+ fhp = NULL;
+ offset = 0;
+
+ if ((ret = __blob_file_open(
+ dbp, &fhp, blob_id, DB_FOP_READONLY, 1)) != 0)
+ goto err;
+
+ left = blob_size;
+ while (left > 0) {
+ if ((ret = __blob_file_read(
+ dbp->env, fhp, data, offset, data->ulen)) != 0)
+ goto err;
+ /* Only the first chunk carries the line prefix. */
+ if (offset == 0)
+ pre = prefix;
+ else
+ pre = NULL;
+ /*
+ * Suppress the trailing newline while more data remains, so
+ * the chunks are emitted as one continuous line.
+ */
+ skip_newline = data->size < left ? 1 : 0;
+ if ((ret = __db_prdbt(data, checkprint, pre,
+ handle, callback, 0, 0, skip_newline)) != 0)
+ goto err;
+ /*
+ * A read may return more than is left (ulen is requested
+ * regardless of 'left'); clamp instead of going negative.
+ * NOTE(review): if __blob_file_read can succeed with
+ * data->size == 0, neither 'left' nor 'offset' advances and
+ * this loop does not terminate -- confirm against the helper.
+ */
+ if (data->size > left)
+ left = 0;
+ else
+ left = left - data->size;
+ offset = offset + data->size;
+ }
+
+err: if (fhp != NULL) {
+ if ((t_ret = __os_closehandle(dbp->env, fhp)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+
+ return (ret);
+}
+
+/*
* __db_prdbt --
* Print out a DBT data element.
*
* PUBLIC: int __db_prdbt __P((DBT *, int, const char *, void *,
- * PUBLIC: int (*)(void *, const void *), int, int));
+ * PUBLIC: int (*)(void *, const void *), int, int, int));
*/
int
-__db_prdbt(dbtp, checkprint, prefix, handle, callback, is_recno, is_heap)
+__db_prdbt(dbtp, checkprint,
+ prefix, handle, callback, is_recno, is_heap, no_newline)
DBT *dbtp;
int checkprint;
const char *prefix;
@@ -1380,16 +1605,17 @@ __db_prdbt(dbtp, checkprint, prefix, handle, callback, is_recno, is_heap)
int (*callback) __P((void *, const void *));
int is_recno;
int is_heap;
+ int no_newline;
{
- static const u_char hex[] = "0123456789abcdef";
db_recno_t recno;
DB_HEAP_RID rid;
- size_t len;
+ size_t count, len;
int ret;
+ u_int8_t *p;
#define DBTBUFLEN 100
- u_int8_t *p, *hp;
- char buf[DBTBUFLEN], hbuf[DBTBUFLEN];
+ char buf[DBTBUFLEN], hexbuf[2 * DBTBUFLEN + 1];
+ ret = 0;
/*
* !!!
* This routine is the routine that dumps out items in the format
@@ -1409,13 +1635,8 @@ __db_prdbt(dbtp, checkprint, prefix, handle, callback, is_recno, is_heap)
/* If we're printing data as hex, print keys as hex too. */
if (!checkprint) {
- for (len = strlen(buf), p = (u_int8_t *)buf,
- hp = (u_int8_t *)hbuf; len-- > 0; ++p) {
- *hp++ = hex[(u_int8_t)(*p & 0xf0) >> 4];
- *hp++ = hex[*p & 0x0f];
- }
- *hp = '\0';
- ret = callback(handle, hbuf);
+ (void)__db_tohex(buf, strlen(buf), hexbuf);
+ ret = callback(handle, hexbuf);
} else
ret = callback(handle, buf);
@@ -1433,44 +1654,46 @@ __db_prdbt(dbtp, checkprint, prefix, handle, callback, is_recno, is_heap)
/* If we're printing data as hex, print keys as hex too. */
if (!checkprint) {
- for (len = strlen(buf), p = (u_int8_t *)buf,
- hp = (u_int8_t *)hbuf; len-- > 0; ++p) {
- *hp++ = hex[(u_int8_t)(*p & 0xf0) >> 4];
- *hp++ = hex[*p & 0x0f];
- }
- *hp = '\0';
- ret = callback(handle, hbuf);
+ (void)__db_tohex(buf, strlen(buf), hexbuf);
+ ret = callback(handle, hexbuf);
} else
ret = callback(handle, buf);
if (ret != 0)
return (ret);
} else if (checkprint) {
+ /*
+ * Prepare buf for the 'isprint()' case: printable single char
+ * strings; prepare hexbuf for the other case '\<2 hex digits>'.
+ */
+ buf[1] = '\0';
+ hexbuf[0] = '\\';
for (len = dbtp->size, p = dbtp->data; len--; ++p)
if (isprint((int)*p)) {
if (*p == '\\' &&
(ret = callback(handle, "\\")) != 0)
return (ret);
- snprintf(buf, DBTBUFLEN, "%c", *p);
+ buf[0] = (char)*p;
if ((ret = callback(handle, buf)) != 0)
return (ret);
} else {
- snprintf(buf, DBTBUFLEN, "\\%c%c",
- hex[(u_int8_t)(*p & 0xf0) >> 4],
- hex[*p & 0x0f]);
- if ((ret = callback(handle, buf)) != 0)
+ (void)__db_tohex(p, 1, hexbuf + 1);
+ if ((ret = callback(handle, hexbuf)) != 0)
return (ret);
}
} else
- for (len = dbtp->size, p = dbtp->data; len--; ++p) {
- snprintf(buf, DBTBUFLEN, "%c%c",
- hex[(u_int8_t)(*p & 0xf0) >> 4],
- hex[*p & 0x0f]);
- if ((ret = callback(handle, buf)) != 0)
+ for (len = dbtp->size, p = dbtp->data, count = DBTBUFLEN;
+ len > 0; len -= count, p += count) {
+ if (count > len)
+ count = len;
+ (void)__db_tohex(p, count, hexbuf);
+ if ((ret = callback(handle, hexbuf)) != 0)
return (ret);
}
-
- return (callback(handle, "\n"));
+ if (no_newline == 0)
+ return (callback(handle, "\n"));
+ else
+ return (ret);
}
/*
@@ -1598,7 +1821,7 @@ __db_prheader(dbp, subname, pflag, keyflag, handle, callback, vdp, meta_pgno)
goto err;
DB_INIT_DBT(dbt, subname, strlen(subname));
if ((ret = __db_prdbt(&dbt, 1,
- NULL, handle, callback, 0, 0)) != 0)
+ NULL, handle, callback, 0, 0, 0)) != 0)
goto err;
}
switch (dbtype) {
@@ -1868,7 +2091,7 @@ __db_prheader(dbp, subname, pflag, keyflag, handle, callback, vdp, meta_pgno)
goto err;
for (i = 0; i < tmp_u_int32 - 1; i++)
if ((ret = __db_prdbt(&keys[i],
- pflag, " ", handle, callback, 0, 0)) != 0)
+ pflag, " ", handle, callback, 0, 0, 0)) != 0)
goto err;
}
}
@@ -1954,3 +2177,33 @@ __db_dbtype_to_string(type)
}
return ("UNKNOWN TYPE");
}
+
+/*
+ * __db_tohex --
+ * Generate a hex string representation of a byte array.
+ * Each of the len bytes read from source is written to dest as two
+ * lowercase hex digits, high nibble first; when len is 0 only the
+ * terminator is written.
+ * The size of the destination must be at least 2*len + 1 bytes long,
+ * to allow for the '\0' terminator, which is always added.
+ * Returns dest.
+ *
+ * PUBLIC: char *__db_tohex __P((const void *, size_t, char *));
+ */
+char *
+__db_tohex(source, len, dest)
+ const void *source;
+ size_t len;
+ char *dest;
+{
+ static const char hex[] = "0123456789abcdef";
+ const u_int8_t *s;
+ char *d;
+
+ s = source;
+ d = dest;
+ while (len > 0) {
+ *d++ = hex[(*s & 0xf0) >> 4];
+ *d++ = hex[*s & 0x0f];
+ s++;
+ len--;
+ }
+ *d = '\0';
+ return ((char *)dest);
+}
diff --git a/src/db/db_rec.c b/src/db/db_rec.c
index 8ba1124e..98b29b22 100644
--- a/src/db/db_rec.c
+++ b/src/db/db_rec.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -1194,8 +1194,9 @@ __db_pg_init_recover(env, dbtp, lsnp, op, info)
DB_LSN copy_lsn;
DB_MPOOLFILE *mpf;
PAGE *pagep;
- int cmp_n, cmp_p, ret, type;
+ int cmp_n, cmp_p, ret, t_ret, type;
+ pagep = NULL;
ip = ((DB_TXNHEAD *)info)->thread_info;
REC_PRINT(__db_pg_init_print);
REC_INTRO(__db_pg_init_read, ip, 0);
@@ -1247,11 +1248,12 @@ __db_pg_init_recover(env, dbtp, lsnp, op, info)
memcpy((u_int8_t*)pagep + HOFFSET(pagep),
argp->data.data, argp->data.size);
}
- if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
- goto out;
done: *lsnp = argp->prev_lsn;
out:
+ if (pagep != NULL && (t_ret =
+ __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
REC_CLOSE;
}
diff --git a/src/db/db_reclaim.c b/src/db/db_reclaim.c
index b902769a..abae33d9 100644
--- a/src/db/db_reclaim.c
+++ b/src/db/db_reclaim.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -181,6 +181,7 @@ __db_truncate_callback(dbc, p, cookie, putp)
switch (*H_PAIRDATA(dbp, p, indx)) {
case H_OFFDUP:
break;
+ case H_BLOB:
case H_OFFPAGE:
case H_KEYDATA:
++*countp;
diff --git a/src/db/db_remove.c b/src/db/db_remove.c
index 591a29b2..d6118fae 100644
--- a/src/db/db_remove.c
+++ b/src/db/db_remove.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -18,7 +18,7 @@
#include "dbinc/txn.h"
static int __db_dbtxn_remove __P((DB *,
- DB_THREAD_INFO *, DB_TXN *, const char *, const char *));
+ DB_THREAD_INFO *, DB_TXN *, const char *, const char *, APPNAME));
static int __db_subdb_remove __P((DB *,
DB_THREAD_INFO *, DB_TXN *, const char *, const char *, u_int32_t));
@@ -264,7 +264,7 @@ __db_remove_int(dbp, ip, txn, name, subdb, flags)
/* Handle transactional file removes separately. */
if (IS_REAL_TXN(txn)) {
- ret = __db_dbtxn_remove(dbp, ip, txn, name, subdb);
+ ret = __db_dbtxn_remove(dbp, ip, txn, name, subdb, DB_APP_DATA);
goto err;
}
@@ -293,6 +293,10 @@ __db_remove_int(dbp, ip, txn, name, subdb, flags)
(ret = dbp->db_am_remove(dbp, ip, NULL, name, subdb, flags)) != 0)
goto err;
+ if (dbp->db_am_remove == NULL &&
+ (ret = __blob_del_all(dbp, txn, 0)) != 0)
+ goto err;
+
ret = F_ISSET(dbp, DB_AM_INMEM) ?
__db_inmem_remove(dbp, NULL, real_name) :
__fop_remove(env,
@@ -407,6 +411,10 @@ __db_subdb_remove(dbp, ip, txn, name, subdb, flags)
txn, name, subdb, DB_UNKNOWN, DB_WRITEOPEN, 0, PGNO_BASE_MD)) != 0)
goto err;
+ if (sdbp->blob_threshold != 0)
+ if ((ret = __blob_del_all(sdbp, txn, 0)) != 0)
+ goto err;
+
DB_TEST_RECOVERY(sdbp, DB_TEST_PREDESTROY, ret, name);
/* Have the handle locked so we will not lock pages. */
@@ -460,18 +468,21 @@ err:
}
static int
-__db_dbtxn_remove(dbp, ip, txn, name, subdb)
+__db_dbtxn_remove(dbp, ip, txn, name, subdb, appname)
DB *dbp;
DB_THREAD_INFO *ip;
DB_TXN *txn;
const char *name, *subdb;
+ APPNAME appname;
{
ENV *env;
int ret;
char *tmpname;
+ u_int32_t flags;
env = dbp->env;
tmpname = NULL;
+ flags = DB_NOSYNC;
/*
* This is a transactional remove, so we have to keep the name
@@ -488,7 +499,12 @@ __db_dbtxn_remove(dbp, ip, txn, name, subdb)
DB_TEST_RECOVERY(dbp, DB_TEST_PREDESTROY, ret, name);
if ((ret = __db_rename_int(dbp,
- txn->thread_info, txn, name, subdb, tmpname, DB_NOSYNC)) != 0)
+ txn->thread_info, txn, name, subdb, tmpname, flags)) != 0)
+ goto err;
+
+ /* Delete all blob files, if this database supports blobs. */
+ if (appname != DB_APP_BLOB && (dbp->blob_file_id != 0 ||
+ dbp->blob_sdb_id != 0) && (ret = __blob_del_all(dbp, txn, 0)) != 0)
goto err;
/*
@@ -501,7 +517,7 @@ __db_dbtxn_remove(dbp, ip, txn, name, subdb)
ret = F_ISSET(dbp, DB_AM_INMEM) ?
__db_inmem_remove(dbp, txn, tmpname) :
__fop_remove(env,
- txn, dbp->fileid, tmpname, &dbp->dirname, DB_APP_DATA,
+ txn, dbp->fileid, tmpname, &dbp->dirname, appname,
F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0);
DB_TEST_RECOVERY(dbp, DB_TEST_POSTDESTROY, ret, name);
diff --git a/src/db/db_rename.c b/src/db/db_rename.c
index 2812b948..5b2bed42 100644
--- a/src/db/db_rename.c
+++ b/src/db/db_rename.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -285,10 +285,11 @@ __db_rename_int(dbp, ip, txn, name, subdb, newname, flags)
* taken care of in the fop layer.
*/
if (IS_REAL_TXN(txn)) {
- if ((ret = __fop_dummy(dbp, txn, old, newname)) != 0)
+ if ((ret =
+ __fop_dummy(dbp, txn, old, newname, DB_APP_DATA)) != 0)
goto err;
} else {
- if ((ret = __fop_dbrename(dbp, old, newname)) != 0)
+ if ((ret = __fop_dbrename(dbp, old, newname, DB_APP_DATA)) != 0)
goto err;
}
diff --git a/src/db/db_ret.c b/src/db/db_ret.c
index 709605f6..ddd0ef51 100644
--- a/src/db/db_ret.c
+++ b/src/db/db_ret.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -29,18 +29,27 @@ __db_ret(dbc, h, indx, dbt, memp, memsize)
void **memp;
u_int32_t *memsize;
{
+ BBLOB bl;
BKEYDATA *bk;
BOVERFLOW *bo;
DB *dbp;
+ ENV *env;
+ HBLOB hblob;
+ HEAPBLOBHDR bhdr;
HEAPHDR *hdr;
+ db_seq_t blob_id;
+ int ret;
HOFFPAGE ho;
+ off_t blob_size;
u_int32_t len;
u_int8_t *hk;
void *data;
if (F_ISSET(dbt, DB_DBT_READONLY))
return (0);
+ ret = 0;
dbp = dbc->dbp;
+ env = dbp->env;
switch (TYPE(h)) {
case P_HASH_UNSORTED:
@@ -50,6 +59,20 @@ __db_ret(dbc, h, indx, dbt, memp, memsize)
memcpy(&ho, hk, sizeof(HOFFPAGE));
return (__db_goff(dbc, dbt,
ho.tlen, ho.pgno, memp, memsize));
+ } else if (HPAGE_PTYPE(hk) == H_BLOB) {
+ /* Get the record instead of the blob item. */
+ if (F_ISSET(dbt, DB_DBT_BLOB_REC)) {
+ data = P_ENTRY(dbp, h, indx);
+ len = HBLOB_SIZE;
+ break;
+ }
+ memcpy(&hblob, hk, HBLOB_SIZE);
+ blob_id = (db_seq_t)hblob.id;
+ GET_BLOB_SIZE(env, hblob, blob_size, ret);
+ if (ret != 0)
+ return (ret);
+ return (__blob_get(
+ dbc, dbt, blob_id, blob_size, memp, memsize));
}
len = LEN_HKEYDATA(dbp, h, dbp->pgsize, indx);
data = HKEYDATA_DATA(hk);
@@ -58,6 +81,21 @@ __db_ret(dbc, h, indx, dbt, memp, memsize)
hdr = (HEAPHDR *)P_ENTRY(dbp, h, indx);
if (F_ISSET(hdr,(HEAP_RECSPLIT | HEAP_RECFIRST)))
return (__heapc_gsplit(dbc, dbt, memp, memsize));
+ else if (F_ISSET(hdr, HEAP_RECBLOB)) {
+ /* Get the record instead of the blob item. */
+ if (F_ISSET(dbt, DB_DBT_BLOB_REC)) {
+ data = P_ENTRY(dbp, h, indx);
+ len = HEAPBLOBREC_SIZE;
+ break;
+ }
+ memcpy(&bhdr, hdr, HEAPBLOBREC_SIZE);
+ blob_id = (db_seq_t)bhdr.id;
+ GET_BLOB_SIZE(env, bhdr, blob_size, ret);
+ if (ret != 0)
+ return (ret);
+ return (__blob_get(
+ dbc, dbt, blob_id, blob_size, memp, memsize));
+ }
len = hdr->size;
data = (u_int8_t *)hdr + sizeof(HEAPHDR);
break;
@@ -69,6 +107,20 @@ __db_ret(dbc, h, indx, dbt, memp, memsize)
bo = (BOVERFLOW *)bk;
return (__db_goff(dbc, dbt,
bo->tlen, bo->pgno, memp, memsize));
+ } else if (B_TYPE(bk->type) == B_BLOB) {
+ /* Get the record instead of the blob item. */
+ if (F_ISSET(dbt, DB_DBT_BLOB_REC)) {
+ data = P_ENTRY(dbp, h, indx);
+ len = BBLOB_SIZE;
+ break;
+ }
+ memcpy(&bl, bk, BBLOB_SIZE);
+ blob_id = (db_seq_t)bl.id;
+ GET_BLOB_SIZE(env, bl, blob_size, ret);
+ if (ret != 0)
+ return (ret);
+ return (__blob_get(
+ dbc, dbt, blob_id, blob_size, memp, memsize));
}
len = bk->len;
data = bk->data;
@@ -167,3 +219,71 @@ __db_retcopy(env, dbt, data, len, memp, memsize)
return (ret);
}
+
+/*
+ * __db_dbt_clone --
+ * Clone a DBT from another DBT.
+ * The input dest DBT must be a zero initialized DBT that will be populated.
+ * The function does not allocate a dest DBT to allow for cloning into stack
+ * or locally allocated variables. It is the caller's responsibility to free
+ * the memory allocated in dest->data.
+ *
+ * On success dest->data is a newly allocated copy of the src->size bytes
+ * at src->data, dest->size and dest->ulen are both src->size, and
+ * dest->flags is exactly DB_DBT_USERMEM; no other dest field is written.
+ *
+ * Returns 0 on success, EINVAL (after reporting an error) if src has any
+ * of DB_DBT_MALLOC, DB_DBT_REALLOC, DB_DBT_MULTIPLE or DB_DBT_PARTIAL set,
+ * or the error returned by __os_malloc. dest is not written on the
+ * EINVAL path.
+ *
+ * PUBLIC: int __db_dbt_clone __P((ENV *, DBT *, const DBT *));
+ */
+int
+__db_dbt_clone(env, dest, src)
+ ENV *env;
+ DBT *dest;
+ const DBT *src;
+{
+ u_int32_t err_flags;
+ int ret;
+
+ DB_ASSERT(env, dest->data == NULL);
+
+ ret = 0;
+
+ /* Reject any src carrying a flag this function does not support. */
+ err_flags = DB_DBT_MALLOC | DB_DBT_REALLOC |
+ DB_DBT_MULTIPLE | DB_DBT_PARTIAL;
+ if (F_ISSET(src, err_flags)) {
+ __db_errx(env, DB_STR("0758",
+ "Unsupported flags when cloning the DBT."));
+ return (EINVAL);
+ }
+
+ if ((ret = __os_malloc(env, src->size, &dest->data)) != 0)
+ return (ret);
+
+ memcpy(dest->data, src->data, src->size);
+ dest->ulen = src->size;
+ dest->size = src->size;
+ dest->flags = DB_DBT_USERMEM;
+
+ return (ret);
+}
+
+/*
+ * __db_dbt_clone_free --
+ * Free a DBT cloned by __db_dbt_clone
+ * Releases dbt->data (when non-NULL) with __os_free and resets
+ * dbt->size and dbt->ulen to 0. The DBT structure itself is not freed.
+ *
+ * Returns 0 on success. If dbt->flags is anything other than exactly
+ * DB_DBT_USERMEM, an error is reported, nothing is freed, and EINVAL
+ * is returned.
+ *
+ * NOTE(review): dbt->data is not reset to NULL after the free, so the
+ * DBT keeps the stale pointer; callers should not reuse it as-is --
+ * confirm no caller depends on this.
+ *
+ * PUBLIC: int __db_dbt_clone_free __P((ENV *, DBT *));
+ */
+int
+__db_dbt_clone_free(env, dbt)
+ ENV *env;
+ DBT *dbt;
+{
+ /*
+ * Currently only DB_DBT_USERMEM is supported: flags must equal it
+ * exactly, not merely include it.
+ */
+ if (dbt->flags != DB_DBT_USERMEM) {
+ __db_errx(env, DB_STR("0759",
+ "Unsupported flags when freeing the cloned DBT."));
+ return (EINVAL);
+ }
+
+ if (dbt->data != NULL)
+ __os_free(env, dbt->data);
+ dbt->size = dbt->ulen = 0;
+
+ return (0);
+}
diff --git a/src/db/db_setid.c b/src/db/db_setid.c
index 697c3ff7..5c61a139 100644
--- a/src/db/db_setid.c
+++ b/src/db/db_setid.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2000, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2000, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/db/db_setlsn.c b/src/db/db_setlsn.c
index 1a3280ed..acee80f6 100644
--- a/src/db/db_setlsn.c
+++ b/src/db/db_setlsn.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2000, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2000, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/db/db_sort_multiple.c b/src/db/db_sort_multiple.c
index c5e2e941..7facb80e 100644
--- a/src/db/db_sort_multiple.c
+++ b/src/db/db_sort_multiple.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
#include "db_config.h"
@@ -34,7 +34,7 @@ __db_compare_both(db, akey, adata, bkey, bdata)
t = (BTREE *)db->bt_internal;
- cmp = t->bt_compare(db, akey, bkey);
+ cmp = t->bt_compare(db, akey, bkey, NULL);
if (cmp != 0) return cmp;
if (!F_ISSET(db, DB_AM_DUPSORT))
return (0);
@@ -44,9 +44,9 @@ __db_compare_both(db, akey, adata, bkey, bdata)
#ifdef HAVE_COMPRESSION
if (DB_IS_COMPRESSED(db))
- return t->compress_dup_compare(db, adata, bdata);
+ return t->compress_dup_compare(db, adata, bdata, NULL);
#endif
- return db->dup_compare(db, adata, bdata);
+ return db->dup_compare(db, adata, bdata, NULL);
}
#define DB_SORT_SWAP(a, ad, b, bd) \
diff --git a/src/db/db_stati.c b/src/db/db_stati.c
index 61744e81..b7367f37 100644
--- a/src/db/db_stati.c
+++ b/src/db/db_stati.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/db/db_truncate.c b/src/db/db_truncate.c
index 0eeb0c64..d57a23b2 100644
--- a/src/db/db_truncate.c
+++ b/src/db/db_truncate.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -191,6 +191,10 @@ __db_truncate(dbp, ip, txn, countp)
if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0)
ret = t_ret;
+ /* Delete all blob files. */
+ if (ret == 0)
+ ret = __blob_del_all(dbp, txn, 1);
+
DB_TEST_RECOVERY(dbp, DB_TEST_POSTDESTROY, ret, NULL);
DB_TEST_RECOVERY_LABEL
diff --git a/src/db/db_upg.c b/src/db/db_upg.c
index de5d0dc7..7dcc3b1c 100644
--- a/src/db/db_upg.c
+++ b/src/db/db_upg.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -13,6 +13,7 @@
#include "dbinc/db_swap.h"
#include "dbinc/btree.h"
#include "dbinc/hash.h"
+#include "dbinc/heap.h"
#include "dbinc/qam.h"
/*
@@ -98,6 +99,27 @@ static int (* const func_46_list[P_PAGETYPE_MAX])
NULL, /* P_IHEAP */
};
+static int (* const func_60_list[P_PAGETYPE_MAX])
+ __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *)) = {
+ NULL, /* P_INVALID */
+ NULL, /* __P_DUPLICATE */
+ NULL, /* P_HASH_UNSORTED */
+ NULL, /* P_IBTREE */
+ NULL, /* P_IRECNO */
+ __bam_60_lbtree, /* P_LBTREE */
+ NULL, /* P_LRECNO */
+ NULL, /* P_OVERFLOW */
+ __ham_60_hashmeta, /* P_HASHMETA */
+ __bam_60_btreemeta, /* P_BTREEMETA */
+ NULL, /* P_QAMMETA */
+ NULL, /* P_QAMDATA */
+ NULL, /* P_LDUP */
+ __ham_60_hash, /* P_HASH */
+ __heap_60_heapmeta, /* P_HEAPMETA */
+ __heap_60_heap, /* P_HEAP */
+ NULL, /* P_IHEAP */
+};
+
static int __db_page_pass __P((DB *, char *, u_int32_t, int (* const [])
(DB *, char *, u_int32_t, DB_FH *, PAGE *, int *), DB_FH *));
static int __db_set_lastpgno __P((DB *, char *, DB_FH *));
@@ -181,6 +203,34 @@ __db_upgrade(dbp, fname, flags)
goto err;
/* FALLTHROUGH */
case 9:
+ /*
+ * In early 6.0, various blob ids and sizes were stored
+ * as two u_int32_t values representing one 64 bit
+ * integer. Change those values to 64 bit integers.
+ */
+ /*
+ * Read the encrypt_alg and chksum fields from the
+ * metadata page.
+ */
+ meta = (DBMETA *)mbuf;
+ if (FLD_ISSET(meta->metaflags, DBMETA_CHKSUM))
+ F_SET(dbp, DB_AM_CHKSUM);
+ if (meta->encrypt_alg != 0) {
+ if (!CRYPTO_ON(dbp->env)) {
+ __db_errx(env, DB_STR("0777",
+"Attempt to upgrade an encrypted database without providing a password."));
+ ret = EINVAL;
+ goto err;
+ }
+ F_SET(dbp, DB_AM_ENCRYPT);
+ }
+ memcpy(&dbp->pgsize,
+ &meta->pagesize, sizeof(u_int32_t));
+ if ((ret = __db_page_pass(dbp,
+ real_name, flags, func_60_list, fhp)) != 0)
+ goto err;
+ /* FALLTHROUGH */
+ case 10:
break;
default:
__db_errx(env, DB_STR_A("0666",
@@ -307,6 +357,34 @@ __db_upgrade(dbp, fname, flags)
/* FALLTHROUGH */
case 9:
+ /*
+ * In early 6.0, various blob ids and sizes were stored
+ * as two u_int32_t values representing one 64 bit
+ * integer. Change those values to 64 bit integers.
+ */
+ meta = (DBMETA*)mbuf;
+ memcpy(&dbp->pgsize,
+ &meta->pagesize, sizeof(u_int32_t));
+ /*
+ * Read the encrypt_alg and chksum fields from the
+ * metadata page.
+ */
+ if (FLD_ISSET(meta->metaflags, DBMETA_CHKSUM))
+ F_SET(dbp, DB_AM_CHKSUM);
+ if (meta->encrypt_alg != 0) {
+ if (!CRYPTO_ON(dbp->env)) {
+ __db_errx(env, DB_STR("0778",
+"Attempt to upgrade an encrypted database without providing a password."));
+ ret = EINVAL;
+ goto err;
+ }
+ F_SET(dbp, DB_AM_ENCRYPT);
+ }
+ if ((ret = __db_page_pass(dbp,
+ real_name, flags, func_60_list, fhp)) != 0)
+ goto err;
+ /* FALLTHROUGH */
+ case 10:
break;
default:
__db_errx(env, DB_STR_A("0668",
@@ -317,9 +395,45 @@ __db_upgrade(dbp, fname, flags)
}
break;
case DB_HEAPMAGIC:
- /*
- * There's no upgrade needed for Heap yet.
- */
+ switch (((DBMETA *)mbuf)->version) {
+ case 1:
+ /*
+ * In early 6.0, various blob ids and sizes were stored
+ * as two u_int32_t values representing one 64 bit
+ * integer. Change those values to 64 bit integers.
+ */
+ meta = (DBMETA*)mbuf;
+ memcpy(&dbp->pgsize,
+ &meta->pagesize, sizeof(u_int32_t));
+ /*
+ * Read the encrypt_alg and chksum fields from the
+ * metadata page.
+ */
+ if (FLD_ISSET(meta->metaflags, DBMETA_CHKSUM))
+ F_SET(dbp, DB_AM_CHKSUM);
+ if (meta->encrypt_alg != 0) {
+ if (!CRYPTO_ON(dbp->env)) {
+ __db_errx(env, DB_STR("0779",
+"Attempt to upgrade an encrypted database without providing a password."));
+ ret = EINVAL;
+ goto err;
+ }
+ F_SET(dbp, DB_AM_ENCRYPT);
+ }
+ if ((ret = __db_page_pass(dbp,
+ real_name, flags, func_60_list, fhp)) != 0)
+ goto err;
+ /* FALLTHROUGH */
+ case 2:
+ break;
+ default:
+ __db_errx(env, DB_STR_A("0776",
+ "%s: unsupported heap version: %lu",
+ "%s %lu"), real_name,
+ (u_long)((DBMETA *)mbuf)->version);
+ ret = DB_OLD_VERSION;
+ goto err;
+ }
break;
case DB_QAMMAGIC:
switch (((DBMETA *)mbuf)->version) {
diff --git a/src/db/db_upg_opd.c b/src/db/db_upg_opd.c
index 992115ad..6f6dfb71 100644
--- a/src/db/db_upg_opd.c
+++ b/src/db/db_upg_opd.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -37,6 +37,9 @@ static int __db_up_ovref __P((DB *, DB_FH *, db_pgno_t));
* __db_31_offdup --
* Convert 3.0 off-page duplicates to 3.1 off-page duplicates.
*
+ * This code and its descendants should be removed when support for
+ * upgrading from a 3.0 database format is removed.
+ *
* PUBLIC: int __db_31_offdup __P((DB *, char *, DB_FH *, int, db_pgno_t *));
*/
int
@@ -317,7 +320,7 @@ __db_build_ri(dbp, fhp, ipage, page, indx, nomemp)
/*
* __db_up_ovref --
- * Increment/decrement the reference count on an overflow page.
+ * Increment the reference count on an overflow page.
*/
static int
__db_up_ovref(dbp, fhp, pgno)
diff --git a/src/db/db_vrfy.c b/src/db/db_vrfy.c
index 9cb94ad2..a8c80cae 100644
--- a/src/db/db_vrfy.c
+++ b/src/db/db_vrfy.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2000, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2000, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -553,7 +553,7 @@ __db_vrfy_pagezero(dbp, vdp, fhp, name, flags)
if ((ret = __db_vrfy_getpageinfo(vdp, PGNO_BASE_MD, &pip)) != 0)
return (ret);
- if ((ret = __db_chk_meta(env, dbp, meta, 1)) != 0) {
+ if ((ret = __db_chk_meta(env, dbp, meta, DB_CHK_META)) != 0) {
EPRINT((env, DB_STR_A("0522",
"Page %lu: metadata page corrupted", "%lu"),
(u_long)PGNO_BASE_MD));
@@ -920,7 +920,7 @@ err1: if (ret == 0)
* If we've seen a Queue metadata page, we may need to walk Queue
* extent pages that won't show up between 0 and vdp->last_pgno.
*/
- if (F_ISSET(vdp, VRFY_QMETA_SET) && (t_ret =
+ if (F_ISSET(vdp, SALVAGE_QMETA_SET) && (t_ret =
__qam_vrfy_walkqueue(dbp, vdp, handle, callback, flags)) != 0) {
if (ret == 0)
ret = t_ret;
@@ -1563,6 +1563,10 @@ __db_vrfy_meta(dbp, vdp, meta, pgno, flags)
* If we don't have FTRUNCATE then mpool could include some
* zeroed pages at the end of the file, we assume the meta page
* is correct. Queue does not update the meta page's last_pgno.
+ *
+ * We have seen one false positive after a failure while rolling the log
+ * forward, last_pgno was updated and the file had not yet been
+ * extended. [#18418]
*/
if (pgno == PGNO_BASE_MD &&
dbtype != DB_QUEUE && meta->last_pgno != vdp->last_pgno) {
@@ -2401,6 +2405,15 @@ __db_vrfy_inpitem(dbp, h, pgno, i, is_btree, flags, himarkp, offsetp)
* length, so it's not possible to certify it as safe.
*/
switch (B_TYPE(bk->type)) {
+ case B_BLOB:
+ len = bk->len;
+ if (len != BBLOB_DSIZE) {
+ EPRINT((env, DB_STR_A("0771",
+ "Page %lu: item %lu illegal size.",
+ "%lu %lu"), (u_long)pgno, (u_long)i));
+ return (DB_VERIFY_BAD);
+ }
+ break;
case B_KEYDATA:
len = bk->len;
break;
diff --git a/src/db/db_vrfy_stub.c b/src/db/db_vrfy_stub.c
index 5037f33e..a9eed84c 100644
--- a/src/db/db_vrfy_stub.c
+++ b/src/db/db_vrfy_stub.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/db/db_vrfyutil.c b/src/db/db_vrfyutil.c
index d72e1188..3a64bd50 100644
--- a/src/db/db_vrfyutil.c
+++ b/src/db/db_vrfyutil.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2000, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2000, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -43,6 +43,9 @@ __db_vrfy_dbinfo_create(env, ip, pgsize, vdpp)
if ((ret = __db_create_internal(&cdbp, env, 0)) != 0)
goto err;
+ if ((ret = __db_set_blob_threshold(cdbp, 0, 0)) != 0)
+ goto err;
+
if ((ret = __db_set_flags(cdbp, DB_DUP)) != 0)
goto err;
@@ -60,6 +63,9 @@ __db_vrfy_dbinfo_create(env, ip, pgsize, vdpp)
if ((ret = __db_create_internal(&pgdbp, env, 0)) != 0)
goto err;
+ if ((ret = __db_set_blob_threshold(pgdbp, 0, 0)) != 0)
+ goto err;
+
if ((ret = __db_set_pagesize(pgdbp, pgsize)) != 0)
goto err;
@@ -928,5 +934,6 @@ __db_vrfy_prdbt(dbtp, checkprint, prefix,
}
return (
__db_prdbt(dbtp, checkprint,
- prefix, handle, callback, is_recno, is_heap));
+ prefix, handle, callback, is_recno, is_heap,
+ vdp != NULL && F_ISSET(vdp, SALVAGE_STREAM_BLOB) ? 1 : 0));
}
diff --git a/src/db/partition.c b/src/db/partition.c
index f8beaf16..86491ba3 100644
--- a/src/db/partition.c
+++ b/src/db/partition.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -32,13 +32,12 @@ static int __partc_writelock __P((DBC*));
static int __partition_chk_meta __P((DB *,
DB_THREAD_INFO *, DB_TXN *, u_int32_t));
static int __partition_setup_keys __P((DBC *,
- DB_PARTITION *, DBMETA *, u_int32_t));
+ DB_PARTITION *, u_int32_t, u_int32_t));
static int __part_key_cmp __P((const void *, const void *));
static inline void __part_search __P((DB *,
DB_PARTITION *, DBT *, u_int32_t *));
-static char *Alloc_err = DB_STR_A("0644",
- "Partition open failed to allocate %d bytes", "%d");
+#define ALLOC_ERR DB_STR_A("0764","Partition failed to allocate %d bytes","%d")
/*
* Allocate a partition cursor and copy flags to the partition cursor.
@@ -70,20 +69,27 @@ static inline void __part_search(dbp, part, key, part_idp)
{
db_indx_t base, indx, limit;
int cmp;
- int (*func) __P((DB *, const DBT *, const DBT *));
+ int (*func) __P((DB *, const DBT *, const DBT *, size_t *));
+ size_t pos, pos_h, pos_l;
DB_ASSERT(dbp->env, part->nparts != 0);
COMPQUIET(cmp, 0);
COMPQUIET(indx, 0);
+ pos_h = 0;
+ pos_l = 0;
func = ((BTREE *)dbp->bt_internal)->bt_compare;
DB_BINARY_SEARCH_FOR(base, limit, part->nparts, O_INDX) {
+ pos = pos_l > pos_h ? pos_h : pos_l;
DB_BINARY_SEARCH_INCR(indx, base, limit, O_INDX);
- cmp = func(dbp, key, &part->keys[indx]);
+ cmp = func(dbp, key, &part->keys[indx], &pos);
if (cmp == 0)
break;
- if (cmp > 0)
+ if (cmp > 0) {
DB_BINARY_SEARCH_SHIFT_BASE(indx, base, limit, O_INDX);
+ pos_l = pos;
+ } else
+ pos_h = pos;
}
if (cmp == 0)
*part_idp = indx;
@@ -146,7 +152,8 @@ __partition_set(dbp, parts, keys, callback)
{
DB_PARTITION *part;
ENV *env;
- int ret;
+ u_int32_t i;
+ int ret, t_ret;
DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_partition");
env = dbp->dbenv->env;
@@ -155,6 +162,11 @@ __partition_set(dbp, parts, keys, callback)
__db_errx(env, DB_STR("0646",
"Must specify at least 2 partitions."));
return (EINVAL);
+ } else if (parts > PART_MAXIMUM) {
+ __db_errx(env, DB_STR_A("0772",
+ "Must not specify more than %u partitions.", "%u"),
+ (unsigned int)PART_MAXIMUM);
+ return (EINVAL);
}
if (keys == NULL && callback == NULL) {
@@ -178,11 +190,59 @@ bad: __db_errx(env, DB_STR("0648",
(part->callback != NULL && keys != NULL))
goto bad;
+ /*
+ * Free a key array that was allocated by an earlier set_partition call.
+ */
+ if (part->keys != NULL) {
+ for (i = 0; i < part->nparts - 1; i++) {
+ /*
+ * Always free all entries in the key array and return
+ * the first error code.
+ */
+ if ((t_ret = __db_dbt_clone_free(dbp->env,
+ &part->keys[i])) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ __os_free(dbp->env, part->keys);
+ part->keys = NULL;
+ }
+
+ if (ret != 0)
+ return (ret);
+
part->nparts = parts;
- part->keys = keys;
part->callback = callback;
- return (0);
+ /*
+ * Take a copy of the users key array otherwise we cannot be sure
+ * that the memory will still be valid when the database is opened.
+ */
+ if (keys != NULL) {
+ if ((ret = __os_calloc(dbp->env,
+ part->nparts - 1, sizeof(DBT), &part->keys)) != 0)
+ goto err;
+
+ for (i = 0, parts = 0; i < part->nparts - 1; i++, parts++)
+ if ((ret = __db_dbt_clone(dbp->env,
+ &part->keys[i], &keys[i])) != 0)
+ goto err;
+ }
+
+err: if (ret != 0 && part->keys != NULL) {
+ /*
+ * Always free the entries that were cloned successfully into
+ * the key array, plus the one that failed in __db_dbt_clone,
+ * and return the first error code. Because ret != 0 here, it
+ * is safe to ignore any error from __db_dbt_clone_free.
+ */
+ for (i = 0; i < parts; i++)
+ (void)__db_dbt_clone_free(dbp->env, &part->keys[i]);
+ if (parts < part->nparts - 1 && part->keys[parts].data != NULL)
+ __os_free(dbp->env, part->keys[parts].data);
+ __os_free(dbp->env, part->keys);
+ part->keys = NULL;
+ }
+ return (ret);
}
/*
@@ -288,15 +348,16 @@ __partition_open(dbp, ip, txn, fname, type, flags, mode, do_open)
if ((ret = __os_calloc(env,
part->nparts, sizeof(*part->handles), &part->handles)) != 0) {
- __db_errx(env,
- Alloc_err, part->nparts * sizeof(*part->handles));
+ __db_errx(env, ALLOC_ERR,
+ (int)(part->nparts * sizeof(*part->handles)));
goto err;
}
DB_ASSERT(env, fname != NULL);
if ((ret = __os_malloc(env,
strlen(fname) + PART_LEN + 1, &name)) != 0) {
- __db_errx(env, Alloc_err, strlen(fname) + PART_LEN + 1);
+ __db_errx(env, ALLOC_ERR,
+ (int)(strlen(fname) + PART_LEN + 1));
goto err;
}
@@ -330,6 +391,9 @@ __partition_open(dbp, ip, txn, fname, type, flags, mode, do_open)
part_db->dup_compare = dbp->dup_compare;
part_db->app_private = dbp->app_private;
part_db->api_internal = dbp->api_internal;
+ part_db->blob_threshold = dbp->blob_threshold;
+ part_db->blob_file_id = dbp->blob_file_id;
+ part_db->blob_sdb_id = dbp->blob_sdb_id;
if (dbp->type == DB_BTREE)
__bam_copy_config(dbp, part_db, part->nparts);
@@ -388,7 +452,8 @@ __partition_chk_meta(dbp, ip, txn, flags)
DB_MPOOLFILE *mpf;
ENV *env;
db_pgno_t base_pgno;
- int ret, t_ret;
+ int ret, set_keys, t_ret;
+ u_int32_t pgsize;
dbc = NULL;
meta = NULL;
@@ -397,6 +462,14 @@ __partition_chk_meta(dbp, ip, txn, flags)
mpf = dbp->mpf;
env = dbp->env;
ret = 0;
+ set_keys = 0;
+
+ /*
+ * Just to fix the lint warning.
+ * The real value will be set later, and we will
+ * only use the value after being set properly.
+ */
+ pgsize = dbp->pgsize;
/* Get a cursor on the main db. */
dbp->p_internal = NULL;
@@ -475,10 +548,12 @@ __partition_chk_meta(dbp, ip, txn, flags)
}
} else if (meta->magic != DB_BTREEMAGIC) {
__db_errx(env, DB_STR("0658",
- "Partitioning only supported on BTREE nad HASH."));
+ "Partitioning only supported on BTREE and HASH."));
ret = EINVAL;
- } else
- ret = __partition_setup_keys(dbc, part, meta, flags);
+ } else {
+ set_keys = 1;
+ pgsize = meta->pagesize;
+ }
err: /* Put the metadata page back. */
if (meta != NULL && (t_ret = __memp_fput(mpf,
@@ -487,6 +562,15 @@ err: /* Put the metadata page back. */
if ((t_ret = __LPUT(dbc, metalock)) != 0 && ret == 0)
ret = t_ret;
+ /*
+ * We can only call __partition_setup_keys after putting
+ * the meta page and releasing the meta lock, or self-deadlock
+ * will occur.
+ */
+ if (ret == 0 && set_keys && (t_ret =
+ __partition_setup_keys(dbc, part, pgsize, flags)) != 0)
+ ret = t_ret;
+
if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0)
ret = t_ret;
@@ -502,7 +586,7 @@ err: /* Put the metadata page back. */
struct key_sort {
DB *dbp;
DBT *key;
- int (*compare) __P((DB *, const DBT *, const DBT *));
+ int (*compare) __P((DB *, const DBT *, const DBT *, size_t *));
};
static int __part_key_cmp(a, b)
@@ -512,7 +596,7 @@ static int __part_key_cmp(a, b)
ka = a;
kb = b;
- return (ka->compare(ka->dbp, ka->key, kb->key));
+ return (ka->compare(ka->dbp, ka->key, kb->key, NULL));
}
/*
* __partition_setup_keys --
@@ -520,25 +604,22 @@ static int __part_key_cmp(a, b)
* are creating a partitioned database.
*/
static int
-__partition_setup_keys(dbc, part, meta, flags)
+__partition_setup_keys(dbc, part, pgsize, flags)
DBC *dbc;
DB_PARTITION *part;
- DBMETA *meta;
- u_int32_t flags;
+ u_int32_t flags, pgsize;
{
BTREE *t;
DB *dbp;
- DBT data, key, *keys, *kp;
+ DBT data, key, *keys, *kp, *okp;
ENV *env;
- u_int32_t ds, i, j;
- u_int8_t *dd;
+ db_pgno_t last_pgno;
+ u_int32_t cgetflags, i, j;
+ size_t dsize;
struct key_sort *ks;
- int have_keys, ret;
- int (*compare) __P((DB *, const DBT *, const DBT *));
- void *dp;
+ int have_keys, ret, t_ret;
+ int (*compare) __P((DB *, const DBT *, const DBT *, size_t *));
- COMPQUIET(dd, NULL);
- COMPQUIET(ds, 0);
memset(&data, 0, sizeof(data));
memset(&key, 0, sizeof(key));
ks = NULL;
@@ -549,6 +630,9 @@ __partition_setup_keys(dbc, part, meta, flags)
/* Need to just read the main database. */
dbp->p_internal = NULL;
have_keys = 0;
+ dsize = 0;
+
+ keys = part->keys;
/* First verify that things what we expect. */
if ((ret = __dbc_get(dbc, &key, &data, DB_FIRST)) != 0) {
@@ -581,11 +665,15 @@ __partition_setup_keys(dbc, part, meta, flags)
}
if (LF_ISSET(DB_CREATE) && have_keys == 0) {
- /* Insert the keys into the master database. */
+ /*
+ * Insert the keys into the master database. We will also
+ * compute the total size of the keys for later use.
+ */
for (i = 0; i < part->nparts - 1; i++) {
if ((ret = __db_put(dbp, dbc->thread_info,
dbc->txn, &part->keys[i], &data, 0)) != 0)
goto err;
+ dsize += part->keys[i].size;
}
/*
@@ -604,39 +692,71 @@ __partition_setup_keys(dbc, part, meta, flags)
}
done: if (F_ISSET(part, PART_RANGE)) {
/*
- * Allocate one page to hold the keys plus space at the
- * end of the buffer to put an array of DBTs. If there
- * is not enough space __dbc_get will return how much
- * is needed and we realloc.
+ * If we just did the insert, we already know the total size of
+ * the keys. Otherwise, the keys must already be in the database,
+ * and we estimate their size from the last pgno of
+ * the corresponding mpoolfile times the page size.
+ *
+ * We align the size to 1024 for performance.
*/
+ if (dsize == 0) {
+ ret = __memp_get_last_pgno(dbp->mpf, &last_pgno);
+ if (ret != 0)
+ goto err;
+ if (last_pgno > 1)
+ last_pgno--;
+ dsize = last_pgno * pgsize;
+ }
+ dsize = DB_ALIGN(dsize, 1024);
+
if ((ret = __os_malloc(env,
- meta->pagesize + (sizeof(DBT) * part->nparts),
+ dsize + (sizeof(DBT) * part->nparts),
&part->data)) != 0) {
- __db_errx(env, Alloc_err, meta->pagesize);
+ __db_errx(env, ALLOC_ERR, (int)dsize);
goto err;
}
+ memset(part->data, 0,
+ dsize + (sizeof(DBT) * part->nparts));
+
+ kp = okp = (DBT *)
+ ((u_int8_t *)part->data + dsize);
memset(&key, 0, sizeof(key));
memset(&data, 0, sizeof(data));
- data.data = part->data;
- data.ulen = meta->pagesize;
data.flags = DB_DBT_USERMEM;
-again: if ((ret = __dbc_get(dbc, &key, &data,
- DB_FIRST | DB_MULTIPLE_KEY)) == DB_BUFFER_SMALL) {
- if ((ret = __os_realloc(env,
- data.size + (sizeof(DBT) * part->nparts),
- &part->data)) != 0)
+ j = 0;
+ cgetflags = DB_FIRST;
+ while ((ret = __dbc_get(dbc, &key, &data, cgetflags)) == 0) {
+ /* Only nparts DBTs were allocated: fail before writing past them. */
+ if ((u_int32_t)(kp - okp) >= part->nparts) {
+ ret = EINVAL;
goto err;
- data.data = part->data;
- data.ulen = data.size;
- goto again;
+ }
+ kp->size = key.size;
+ kp->data = (u_int8_t *)part->data + j;
+ /* It is an error if the keys overflow the space. */
+ if (j + kp->size > dsize) {
+ ret = EINVAL;
+ goto err;
+ }
+ memcpy(kp->data, key.data, kp->size);
+ j += kp->size;
+ cgetflags = DB_NEXT;
+ kp++;
}
+
+ /*
+ * We should get part->nparts keys back, otherwise it means
+ * the passed-in keys are not valid.
+ */
+ if (ret == DB_NOTFOUND && (u_int32_t)(kp - okp) == part->nparts)
+ ret = 0;
+
if (ret == 0) {
/*
* They passed in keys, they must match.
*/
- keys = NULL;
compare = NULL;
- if (have_keys == 1 && (keys = part->keys) != NULL) {
+ if (have_keys == 1 && keys != NULL) {
t = dbc->dbp->bt_internal;
compare = t->bt_compare;
if ((ret = __os_malloc(env, (part->nparts - 1)
@@ -651,20 +771,15 @@ again: if ((ret = __dbc_get(dbc, &key, &data,
qsort(ks, (size_t)part->nparts - 1,
sizeof(struct key_sort), __part_key_cmp);
}
- DB_MULTIPLE_INIT(dp, &data);
part->keys = (DBT *)
- ((u_int8_t *)part->data + data.size);
+ ((u_int8_t *)part->data + dsize);
+ F_SET(part, PART_KEYS_SETUP);
j = 0;
for (kp = part->keys;
kp < &part->keys[part->nparts]; kp++, j++) {
- DB_MULTIPLE_KEY_NEXT(dp,
- &data, kp->data, kp->size, dd, ds);
- if (dp == NULL) {
- ret = DB_NOTFOUND;
- break;
- }
- if (keys != NULL && j != 0 &&
- compare(dbc->dbp, ks[j - 1].key, kp) != 0) {
+ if (have_keys == 1 && keys != NULL && j != 0 &&
+ compare(dbc->dbp, ks[j - 1].key,
+ kp, NULL) != 0) {
if (kp->data == NULL &&
F_ISSET(dbp, DB_AM_RECOVER))
goto err;
@@ -683,6 +798,24 @@ again: if ((ret = __dbc_get(dbc, &key, &data,
err: dbp->p_internal = part;
if (ks != NULL)
__os_free(env, ks);
+
+ /*
+ * We only free the original copy of the key array when
+ * the keys have been set up properly; otherwise we let
+ * the close function free the memory.
+ */
+ if (keys != NULL && F_ISSET(part, PART_KEYS_SETUP)) {
+ for (i = 0; i < part->nparts - 1; i++)
+ /*
+ * Always free all entries in the key array and return
+ * the first error code.
+ */
+ if ((t_ret = __db_dbt_clone_free(env,
+ &keys[i])) != 0 && ret == 0)
+ ret = t_ret;
+ __os_free(env, keys);
+ }
+
return (ret);
}
@@ -1183,6 +1316,15 @@ __partition_close(dbp, txn, flags)
ret = t_ret;
__os_free(env, part->handles);
}
+ if (!F_ISSET(part, PART_KEYS_SETUP) && part->keys != NULL) {
+ for (i = 0; i < part->nparts - 1; i++) {
+ if (part->keys[i].data != NULL && (t_ret =
+ __db_dbt_clone_free(env, &part->keys[i])) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ }
+ __os_free(env, part->keys);
+ }
if (part->dirs != NULL)
__os_free(env, (char **)part->dirs);
if (part->data != NULL)
@@ -1471,7 +1613,8 @@ __part_fileid_reset(env, ip, fname, nparts, encrypted)
if ((ret = __os_malloc(env,
strlen(fname) + PART_LEN + 1, &name)) != 0) {
- __db_errx(env, Alloc_err, strlen(fname) + PART_LEN + 1);
+ __db_errx(env, ALLOC_ERR,
+ (int)(strlen(fname) + PART_LEN + 1));
return (ret);
}
@@ -1747,7 +1890,8 @@ __part_rr(dbp, ip, txn, name, subdb, newname, flags)
COMPQUIET(np, NULL);
if (newname != NULL && (ret = __os_malloc(env,
strlen(newname) + PART_LEN + 1, &np)) != 0) {
- __db_errx(env, Alloc_err, strlen(newname) + PART_LEN + 1);
+ __db_errx(env, ALLOC_ERR,
+ (int)(strlen(newname) + PART_LEN + 1));
goto err;
}
for (i = 0; i < part->nparts; i++, pdbp++) {
@@ -1790,6 +1934,32 @@ err: /*
}
return (ret);
}
+
+/*
+ * __partc_dup --
+ * Duplicate a partitioned-database cursor: dbc_orig's state into dbc_n.
+ *
+ * PUBLIC: int __partc_dup __P((DBC *, DBC *));
+ */
+int
+__partc_dup(dbc_orig, dbc_n)
+ DBC *dbc_orig;
+ DBC *dbc_n;
+{
+ PART_CURSOR *orig, *new;
+
+ orig = (PART_CURSOR *)dbc_orig->internal;
+ new = (PART_CURSOR *)dbc_n->internal;
+
+ /*
+ * A cursor on a partitioned database contains the identifier
+ * of the underlying database and a regular cursor on that database.
+ * Copy the id; the sub-cursor is duplicated by __dbc_dup below.
+ */
+ new->part_id = orig->part_id;
+
+ return (__dbc_dup(orig->sub_cursor, &new->sub_cursor, DB_POSITION));
+}
#ifdef HAVE_VERIFY
/*
* __part_verify --
diff --git a/src/dbinc/atomic.h b/src/dbinc/atomic.h
index 096176a5..61f2ead9 100644
--- a/src/dbinc/atomic.h
+++ b/src/dbinc/atomic.h
@@ -1,7 +1,7 @@
/*
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2009, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2009, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -79,12 +79,11 @@ typedef struct {
#define WINCE_ATOMIC_MAGIC(p) \
/* \
* Memory mapped regions on Windows CE cause problems with \
- * InterlockedXXX calls. Each page in a mapped region needs to \
- * have been written to prior to an InterlockedXXX call, or the \
- * InterlockedXXX call hangs. This does not seem to be \
- * documented anywhere. For now, read/write a non-critical \
- * piece of memory from the shared region prior to attempting \
- * shared region prior to attempting an InterlockedExchange \
+ * InterlockedXXX calls. Each process making an InterlockedXXX \
+ * call must make sure that it has written to the page prior to \
+ * the call, or the InterlockedXXX call hangs. This does not \
+ * seem to be documented anywhere. Write a non-critical piece \
+ * of memory from the shared region prior to attempting an \
* InterlockedXXX operation. \
*/ \
(p)->dummy = 0
@@ -144,7 +143,7 @@ typedef LONG volatile *interlocked_val;
#define atomic_inc(env, p) __atomic_inc(p)
#define atomic_dec(env, p) __atomic_dec(p)
#define atomic_compare_exchange(env, p, o, n) \
- __atomic_compare_exchange((p), (o), (n))
+ __atomic_compare_exchange_int((p), (o), (n))
static inline int __atomic_inc(db_atomic_t *p)
{
int temp;
@@ -176,7 +175,7 @@ static inline int __atomic_dec(db_atomic_t *p)
* http://gcc.gnu.org/onlinedocs/gcc-4.1.0/gcc/Atomic-Builtins.html
* which configure could be changed to use.
*/
-static inline int __atomic_compare_exchange(
+static inline int __atomic_compare_exchange_int(
db_atomic_t *p, atomic_value_t oldval, atomic_value_t newval)
{
atomic_value_t was;
diff --git a/src/dbinc/blob.h b/src/dbinc/blob.h
new file mode 100644
index 00000000..f4ff475b
--- /dev/null
+++ b/src/dbinc/blob.h
@@ -0,0 +1,103 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2013, 2015 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_BLOB_H_
+#define _DB_BLOB_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * How many characters can the path for a blob file use?
+ * Up to 6 subdirectory separators.
+ * Up to 6 directory names of up to three characters each.
+ * Up to 21 characters for blob_id identifier.
+ * 7 characters for the standard prefix (__db.bl)
+ * 1 for luck (or NUL)
+ * The largest blob id, 9,223,372,036,854,775,807 would
+ * produce a path and file name:
+ * 009/223/372/036/854/775/807/__db.bl009223372036854775807
+ */
+#define MAX_BLOB_PATH "009/223/372/036/854/775/807/__db.bl009223372036854775807"
+#define MAX_BLOB_PATH_SZ sizeof(MAX_BLOB_PATH)
+#define BLOB_DEFAULT_DIR "__db_bl"
+#define BLOB_META_FILE_NAME "__db_blob_meta.db"
+#define BLOB_DIR_PREFIX "__db"
+#define BLOB_FILE_PREFIX "__db.bl"
+
+#define BLOB_DIR_ELEMS 1000
+
+#define IS_BLOB_META(name) /* name contains BLOB_META_FILE_NAME? */ \
+ ((name) != NULL && strstr((name), BLOB_META_FILE_NAME) != NULL)
+#define IS_BLOB_FILE(name) /* name contains BLOB_FILE_PREFIX? */ \
+ ((name) != NULL && strstr((name), BLOB_FILE_PREFIX) != NULL)
+
+/*
+ * Combines two unsigned 32 bit integers (lo, hi) into the integer o.
+ * Blob file ids and subdatabase ids are 64 bit, but are stored on metadata
+ * pages as two 32 bit halves so that 32 bit only compilers can read them.
+ * If o is narrower than 8 bytes and hi is non-zero, an error is reported,
+ * ret is set to EINVAL and o still receives lo; otherwise ret is untouched.
+ */
+#define GET_LO_HI(e, lo, hi, o, ret) do { \
+ DB_ASSERT((e), sizeof(o) <= 8); \
+ if (sizeof(o) == 8) { \
+ (o) = (hi); \
+ (o) = ((o) << 32); \
+ (o) += (lo); \
+ } else { \
+ if ((hi) > 0) { \
+ __db_errx((e), DB_STR("0765", \
+ "Offset or id size overflow.")); \
+ (ret) = EINVAL; \
+ } \
+ (o) = (lo); \
+ } \
+} while (0); /* NOTE(review): trailing ';' defeats the do/while (0) idiom. */
+
+#define GET_BLOB_FILE_ID(e, p, o, ret) \
+ GET_LO_HI(e, (p)->blob_file_lo, (p)->blob_file_hi, o, ret);
+
+#define GET_BLOB_SDB_ID(e, p, o, ret) \
+ GET_LO_HI(e, (p)->blob_sdb_lo, (p)->blob_sdb_hi, o, ret);
+
+/* Splits v into two unsigned 32 bit halves stored at type's field_lo/field_hi in p. */
+#define SET_LO_HI(p, v, type, field_lo, field_hi) do { \
+ u_int32_t __tmp; \
+ if (sizeof((v)) == 8) { \
+ __tmp = (u_int32_t)((v) >> 32); \
+ memcpy(((u_int8_t *)(p)) + SSZ(type, field_hi), \
+ &__tmp, sizeof(u_int32_t)); \
+ } else { \
+ memset(((u_int8_t *)(p)) + SSZ(type, field_hi), \
+ 0, sizeof(u_int32_t)); \
+ } \
+ __tmp = (u_int32_t)(v); \
+ memcpy(((u_int8_t *)(p)) + SSZ(type, field_lo), \
+ &__tmp, sizeof(u_int32_t)); \
+} while (0);
+
+#define SET_LO_HI_VAR(v, field_lo, field_hi) do { \
+ if (sizeof((v)) == 8) \
+ field_hi = (u_int32_t)((v) >> 32); \
+ else \
+ field_hi = 0; \
+ field_lo = (u_int32_t)(v); \
+} while (0);
+
+#define SET_BLOB_META_FILE_ID(p, v, type) \
+ SET_LO_HI(p, v, type, blob_file_lo, blob_file_hi);
+
+#define SET_BLOB_META_SDB_ID(p, v, type) \
+ SET_LO_HI(p, v, type, blob_sdb_lo, blob_sdb_hi);
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_DB_BLOB_H_ */
diff --git a/src/dbinc/btree.h b/src/dbinc/btree.h
index 86bbec14..a8b9e1ee 100644
--- a/src/dbinc/btree.h
+++ b/src/dbinc/btree.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright (c) 1990, 1993, 1994, 1995, 1996
@@ -472,7 +472,7 @@ struct __btree { /* Btree access method. */
u_int32_t bt_minkey; /* Minimum keys per page. */
/* Btree comparison function. */
- int (*bt_compare) __P((DB *, const DBT *, const DBT *));
+ int (*bt_compare) __P((DB *, const DBT *, const DBT *, size_t *));
/* Btree prefix function. */
size_t (*bt_prefix) __P((DB *, const DBT *, const DBT *));
/* Btree compress function. */
@@ -483,7 +483,8 @@ struct __btree { /* Btree access method. */
int (*bt_decompress) __P((DB *, const DBT *, const DBT *, DBT *, DBT *,
DBT *));
/* dup_compare for compression */
- int (*compress_dup_compare) __P((DB *, const DBT *, const DBT *));
+ int (*compress_dup_compare) __P((DB *, const DBT *, const DBT *,
+ size_t *));
#endif
/* Recno access method. */
@@ -539,7 +540,7 @@ typedef enum {
* Flags for __bam_pinsert.
*/
#define BPI_SPACEONLY 0x01 /* Only check for space to update. */
-#define BPI_NORECNUM 0x02 /* Not update the recnum on the left. */
+#define BPI_NORECNUM 0x02 /* Don't update the left's recnum. */
#define BPI_NOLOGGING 0x04 /* Don't log the update. */
#define BPI_REPLACE 0x08 /* Replace the record. */
diff --git a/src/dbinc/clock.h b/src/dbinc/clock.h
index caeaee70..b2815ea2 100644
--- a/src/dbinc/clock.h
+++ b/src/dbinc/clock.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -125,6 +125,13 @@ typedef struct {
timespecadd((vvp), &__tmp); \
} while (0)
+#define TIMESPEC_SUB_DB_TIMEOUT(vvp, t) \
+ do { \
+ db_timespec __tmp; \
+ DB_TIMEOUT_TO_TIMESPEC(t, &__tmp); \
+ timespecsub((vvp), &__tmp); \
+ } while (0)
+
#if defined(__cplusplus)
}
#endif
diff --git a/src/dbinc/crypto.h b/src/dbinc/crypto.h
index ea7a9cf0..4d889fd9 100644
--- a/src/dbinc/crypto.h
+++ b/src/dbinc/crypto.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/dbinc/cxx_int.h b/src/dbinc/cxx_int.h
index 5492ead7..368bac86 100644
--- a/src/dbinc/cxx_int.h
+++ b/src/dbinc/cxx_int.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/dbinc/db.in b/src/dbinc/db.in
index a948910e..b592b746 100644
--- a/src/dbinc/db.in
+++ b/src/dbinc/db.in
@@ -1,7 +1,7 @@
/*
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*
@@ -102,6 +102,7 @@ extern "C" {
@FILE_t_decl@
@off_t_decl@
+@db_off_t_decl@
@pid_t_decl@
@size_t_decl@
#ifdef HAVE_MIXED_SIZE_ADDRESSING
@@ -131,9 +132,9 @@ typedef u_int16_t db_indx_t; /* Page offset type. */
#define DB_MAX_PAGES 0xffffffff /* >= # of pages in a file */
typedef u_int32_t db_recno_t; /* Record number type. */
-#define DB_MAX_RECORDS 0xffffffff /* >= # of records in a tree */
+#define DB_MAX_RECORDS 0xffffffff /* >= # of records in a recno tree. */
-typedef u_int32_t db_timeout_t; /* Type of a timeout. */
+typedef u_int32_t db_timeout_t; /* Type of a timeout in microseconds. */
/*
* Region offsets are the difference between a pointer in a region and the
@@ -157,6 +158,10 @@ struct __db_compact; typedef struct __db_compact DB_COMPACT;
struct __db_dbt; typedef struct __db_dbt DBT;
struct __db_distab; typedef struct __db_distab DB_DISTAB;
struct __db_env; typedef struct __db_env DB_ENV;
+struct __db_event_mutex_died_info;
+ typedef struct __db_event_mutex_died_info DB_EVENT_MUTEX_DIED_INFO;
+struct __db_event_failchk_info;
+ typedef struct __db_event_failchk_info DB_EVENT_FAILCHK_INFO;
struct __db_h_stat; typedef struct __db_h_stat DB_HASH_STAT;
struct __db_heap_rid; typedef struct __db_heap_rid DB_HEAP_RID;
struct __db_heap_stat; typedef struct __db_heap_stat DB_HEAP_STAT;
@@ -189,6 +194,7 @@ struct __db_repmgr_site;typedef struct __db_repmgr_site DB_REPMGR_SITE;
struct __db_repmgr_stat;typedef struct __db_repmgr_stat DB_REPMGR_STAT;
struct __db_seq_record; typedef struct __db_seq_record DB_SEQ_RECORD;
struct __db_seq_stat; typedef struct __db_seq_stat DB_SEQUENCE_STAT;
+struct __db_stream; typedef struct __db_stream DB_STREAM;
struct __db_site; typedef struct __db_site DB_SITE;
struct __db_sequence; typedef struct __db_sequence DB_SEQUENCE;
struct __db_thread_info;typedef struct __db_thread_info DB_THREAD_INFO;
@@ -226,18 +232,20 @@ struct __db_dbt {
void *app_data;
-#define DB_DBT_APPMALLOC 0x001 /* Callback allocated memory. */
-#define DB_DBT_BULK 0x002 /* Internal: Insert if duplicate. */
-#define DB_DBT_DUPOK 0x004 /* Internal: Insert if duplicate. */
-#define DB_DBT_ISSET 0x008 /* Lower level calls set value. */
-#define DB_DBT_MALLOC 0x010 /* Return in malloc'd memory. */
-#define DB_DBT_MULTIPLE 0x020 /* References multiple records. */
-#define DB_DBT_PARTIAL 0x040 /* Partial put/get. */
-#define DB_DBT_REALLOC 0x080 /* Return in realloc'd memory. */
-#define DB_DBT_READONLY 0x100 /* Readonly, don't update. */
-#define DB_DBT_STREAMING 0x200 /* Internal: DBT is being streamed. */
-#define DB_DBT_USERCOPY 0x400 /* Use the user-supplied callback. */
-#define DB_DBT_USERMEM 0x800 /* Return in user's memory. */
+#define DB_DBT_APPMALLOC 0x0001 /* Callback allocated memory. */
+#define DB_DBT_BULK 0x0002 /* Internal: Insert if duplicate. */
+#define DB_DBT_DUPOK 0x0004 /* Internal: Insert if duplicate. */
+#define DB_DBT_ISSET 0x0008 /* Lower level calls set value. */
+#define DB_DBT_MALLOC 0x0010 /* Return in malloc'd memory. */
+#define DB_DBT_MULTIPLE 0x0020 /* References multiple records. */
+#define DB_DBT_PARTIAL 0x0040 /* Partial put/get. */
+#define DB_DBT_REALLOC 0x0080 /* Return in realloc'd memory. */
+#define DB_DBT_READONLY 0x0100 /* Readonly, don't update. */
+#define DB_DBT_STREAMING 0x0200 /* Internal: DBT is being streamed. */
+#define DB_DBT_USERCOPY 0x0400 /* Use the user-supplied callback. */
+#define DB_DBT_USERMEM 0x0800 /* Return in user's memory. */
+#define DB_DBT_BLOB 0x1000 /* Data item is a blob. */
+#define DB_DBT_BLOB_REC 0x2000 /* Internal: Blob database record. */
u_int32_t flags;
};
@@ -274,6 +282,23 @@ struct __db_mutex_stat { /* SHARED */
#endif
};
+/* Buffers passed to __mutex_describe() must be at least this large. */
+#define DB_MUTEX_DESCRIBE_STRLEN 128
+
+/* This is the info of a DB_EVENT_MUTEX_DIED event notification. */
+struct __db_event_mutex_died_info {
+ pid_t pid; /* Process which last owned the mutex */
+ db_threadid_t tid; /* Thread which last owned the mutex */
+ db_mutex_t mutex; /* ID of the mutex */
+ char desc[DB_MUTEX_DESCRIBE_STRLEN];
+};
+
+/* This is the info of a DB_EVENT_FAILCHK event notification. */
+#define DB_FAILURE_SYMPTOM_SIZE 120
+struct __db_event_failchk_info {
+ int error;
+ char symptom[DB_FAILURE_SYMPTOM_SIZE];
+};
/* This is the length of the buffer passed to DB_ENV->thread_id_string() */
#define DB_THREADID_STRLEN 128
@@ -400,6 +425,8 @@ struct __db_lock_stat { /* SHARED */
uintmax_t st_lockers_nowait; /* Locker lock granted without wait. */
uintmax_t st_region_wait; /* Region lock granted after wait. */
uintmax_t st_region_nowait; /* Region lock granted without wait. */
+ uintmax_t st_nlockers_hit; /* Lockers found in thread info. */
+ uintmax_t st_nlockers_reused; /* Lockers reallocated from thread info. */
u_int32_t st_hash_len; /* Max length of bucket. */
roff_t st_regsize; /* Region size. */
#endif
@@ -469,7 +496,7 @@ struct __db_lockreq {
/*******************************************************
* Logging.
*******************************************************/
-#define DB_LOGVERSION 19 /* Current log version. */
+#define DB_LOGVERSION 22 /* Current log version. */
#define DB_LOGVERSION_LATCHING 15 /* Log version using latching: db-4.8 */
#define DB_LOGCHKSUM 12 /* Check sum headers: db-4.5 */
#define DB_LOGOLDVER 8 /* Oldest version supported: db-4.2 */
@@ -595,7 +622,8 @@ typedef enum {
LOGREC_PGDDBT,
LOGREC_PGLIST,
LOGREC_POINTER,
- LOGREC_TIME
+ LOGREC_TIME,
+ LOGREC_LONGARG
} log_rec_type_t;
typedef const struct __log_rec_spec {
@@ -755,6 +783,7 @@ struct __db_mpool_stat { /* SHARED */
uintmax_t st_mvcc_frozen; /* Buffers frozen. */
uintmax_t st_mvcc_thawed; /* Buffers thawed. */
uintmax_t st_mvcc_freed; /* Frozen buffers freed. */
+ uintmax_t st_mvcc_reused; /* Outdated invisible buffers reused. */
uintmax_t st_alloc; /* Number of page allocations. */
uintmax_t st_alloc_buckets; /* Buckets checked during allocation. */
uintmax_t st_alloc_max_buckets;/* Max checked during allocation. */
@@ -762,6 +791,8 @@ struct __db_mpool_stat { /* SHARED */
uintmax_t st_alloc_max_pages; /* Max checked during allocation. */
uintmax_t st_io_wait; /* Thread waited on buffer I/O. */
uintmax_t st_sync_interrupted; /* Number of times sync interrupted. */
+ u_int32_t st_oddfsize_detect; /* Odd file size detected. */
+ u_int32_t st_oddfsize_resolve; /* Odd file size resolved. */
roff_t st_regsize; /* Region size. */
roff_t st_regmax; /* Region max. */
#endif
@@ -956,7 +987,7 @@ struct __db_txn {
#define TXN_SNAPSHOT 0x08000 /* Snapshot Isolation. */
#define TXN_SYNC 0x10000 /* Write and sync on prepare/commit. */
#define TXN_WRITE_NOSYNC 0x20000 /* Write only on prepare/commit. */
-#define TXN_BULK 0x40000 /* Enable bulk loading optimization. */
+#define TXN_BULK 0x40000 /* Enable bulk loading optimization. */
u_int32_t flags;
};
@@ -1065,30 +1096,34 @@ struct __db_txn_token {
/*
* Event notification types. (Tcl testing interface currently assumes there are
- * no more than 32 of these.)
+ * no more than 32 of these.) Comments include any relevant event_info types.
*/
#define DB_EVENT_PANIC 0
-#define DB_EVENT_REG_ALIVE 1
-#define DB_EVENT_REG_PANIC 2
-#define DB_EVENT_REP_CLIENT 3
-#define DB_EVENT_REP_CONNECT_BROKEN 4
-#define DB_EVENT_REP_CONNECT_ESTD 5
-#define DB_EVENT_REP_CONNECT_TRY_FAILED 6
-#define DB_EVENT_REP_DUPMASTER 7
-#define DB_EVENT_REP_ELECTED 8
-#define DB_EVENT_REP_ELECTION_FAILED 9
-#define DB_EVENT_REP_INIT_DONE 10
-#define DB_EVENT_REP_JOIN_FAILURE 11
-#define DB_EVENT_REP_LOCAL_SITE_REMOVED 12
-#define DB_EVENT_REP_MASTER 13
-#define DB_EVENT_REP_MASTER_FAILURE 14
-#define DB_EVENT_REP_NEWMASTER 15
-#define DB_EVENT_REP_PERM_FAILED 16
-#define DB_EVENT_REP_SITE_ADDED 17
-#define DB_EVENT_REP_SITE_REMOVED 18
-#define DB_EVENT_REP_STARTUPDONE 19
-#define DB_EVENT_REP_WOULD_ROLLBACK 20 /* Undocumented; C API only. */
-#define DB_EVENT_WRITE_FAILED 21
+#define DB_EVENT_REG_ALIVE 1 /* int: pid which was in env */
+#define DB_EVENT_REG_PANIC 2 /* int: error causing the panic. */
+#define DB_EVENT_REP_AUTOTAKEOVER_FAILED 3
+#define DB_EVENT_REP_CLIENT 4
+#define DB_EVENT_REP_CONNECT_BROKEN 5 /* DB_REPMGR_CONN_ERR */
+#define DB_EVENT_REP_CONNECT_ESTD 6 /* int: EID of remote site */
+#define DB_EVENT_REP_CONNECT_TRY_FAILED 7 /* DB_REPMGR_CONN_ERR */
+#define DB_EVENT_REP_DUPMASTER 8
+#define DB_EVENT_REP_ELECTED 9
+#define DB_EVENT_REP_ELECTION_FAILED 10
+#define DB_EVENT_REP_INIT_DONE 11
+#define DB_EVENT_REP_INQUEUE_FULL 12
+#define DB_EVENT_REP_JOIN_FAILURE 13
+#define DB_EVENT_REP_LOCAL_SITE_REMOVED 14
+#define DB_EVENT_REP_MASTER 15
+#define DB_EVENT_REP_MASTER_FAILURE 16
+#define DB_EVENT_REP_NEWMASTER 17 /* int: new master's site id */
+#define DB_EVENT_REP_PERM_FAILED 18
+#define DB_EVENT_REP_SITE_ADDED 19 /* int: eid */
+#define DB_EVENT_REP_SITE_REMOVED 20 /* int: eid */
+#define DB_EVENT_REP_STARTUPDONE 21
+#define DB_EVENT_REP_WOULD_ROLLBACK 22 /* Undocumented; C API only. */
+#define DB_EVENT_WRITE_FAILED 23
+#define DB_EVENT_MUTEX_DIED 24 /* DB_EVENT_MUTEX_DIED_INFO */
+#define DB_EVENT_FAILCHK_PANIC 25 /* DB_EVENT_FAILCHK_INFO */
#define DB_EVENT_NO_SUCH_EVENT 0xffffffff /* OOB sentinel value */
/* Replication Manager site status. */
@@ -1102,6 +1137,7 @@ struct __db_repmgr_site {
u_int32_t status;
#define DB_REPMGR_ISPEER 0x01
+#define DB_REPMGR_ISVIEW 0x02
u_int32_t flags;
};
@@ -1117,6 +1153,7 @@ struct __db_rep_stat { /* SHARED */
* circumstances, garbaged).
*/
u_int32_t st_startup_complete; /* Site completed client sync-up. */
+ u_int32_t st_view; /* Site is a view. */
#ifndef __TEST_DB_NO_STATISTICS
uintmax_t st_log_queued; /* Log records currently queued.+ */
u_int32_t st_status; /* Current replication status. */
@@ -1194,6 +1231,7 @@ struct __db_rep_stat { /* SHARED */
/* Undocumented statistics only used by the test system. */
#ifdef CONFIG_TEST
u_int32_t st_filefail_cleanups; /* # of FILE_FAIL cleanups done. */
+ uintmax_t st_log_futuredup; /* Future log records that are dups. */
#endif
#endif
};
@@ -1204,10 +1242,18 @@ struct __db_repmgr_stat { /* SHARED */
uintmax_t st_msgs_queued; /* # msgs queued for network delay. */
uintmax_t st_msgs_dropped; /* # msgs discarded due to excessive
queue length. */
+ u_int32_t st_incoming_queue_gbytes; /* Incoming queue size: GB. */
+ u_int32_t st_incoming_queue_bytes; /* Incoming queue size: B. */
+ uintmax_t st_incoming_msgs_dropped; /* # of msgs discarded due to
+ incoming queue full. */
uintmax_t st_connection_drop; /* Existing connections dropped. */
uintmax_t st_connect_fail; /* Failed new connection attempts. */
- uintmax_t st_elect_threads; /* # of active election threads. */
- uintmax_t st_max_elect_threads; /* Max concurrent e-threads ever. */
+ u_int32_t st_elect_threads; /* # of active election threads. */
+ u_int32_t st_max_elect_threads; /* Max concurrent e-threads ever. */
+ u_int32_t st_site_participants; /* # of repgroup participant sites. */
+ u_int32_t st_site_total; /* # of repgroup total sites. */
+ u_int32_t st_site_views; /* # of repgroup view sites. */
+ uintmax_t st_takeovers; /* # of automatic listener takeovers. */
};
/* Replication Manager connection error. */
@@ -1238,7 +1284,7 @@ struct __db_sequence {
db_mutex_t mtx_seq; /* Mutex if sequence is threaded. */
DB_SEQ_RECORD *seq_rp; /* Pointer to current data. */
DB_SEQ_RECORD seq_record; /* Data from DB_SEQUENCE. */
- int32_t seq_cache_size; /* Number of values cached. */
+ u_int32_t seq_cache_size; /* Number of values cached. */
db_seq_t seq_last_value; /* Last value cached. */
db_seq_t seq_prev_value; /* Last value returned. */
DBT seq_key; /* DBT pointing to sequence key. */
@@ -1250,8 +1296,8 @@ struct __db_sequence {
/* DB_SEQUENCE PUBLIC HANDLE LIST BEGIN */
int (*close) __P((DB_SEQUENCE *, u_int32_t));
int (*get) __P((DB_SEQUENCE *,
- DB_TXN *, int32_t, db_seq_t *, u_int32_t));
- int (*get_cachesize) __P((DB_SEQUENCE *, int32_t *));
+ DB_TXN *, u_int32_t, db_seq_t *, u_int32_t));
+ int (*get_cachesize) __P((DB_SEQUENCE *, u_int32_t *));
int (*get_db) __P((DB_SEQUENCE *, DB **));
int (*get_flags) __P((DB_SEQUENCE *, u_int32_t *));
int (*get_key) __P((DB_SEQUENCE *, DBT *));
@@ -1261,7 +1307,7 @@ struct __db_sequence {
int (*open) __P((DB_SEQUENCE *,
DB_TXN *, DBT *, u_int32_t));
int (*remove) __P((DB_SEQUENCE *, DB_TXN *, u_int32_t));
- int (*set_cachesize) __P((DB_SEQUENCE *, int32_t));
+ int (*set_cachesize) __P((DB_SEQUENCE *, u_int32_t));
int (*set_flags) __P((DB_SEQUENCE *, u_int32_t));
int (*set_range) __P((DB_SEQUENCE *, db_seq_t, db_seq_t));
int (*stat) __P((DB_SEQUENCE *,
@@ -1278,7 +1324,7 @@ struct __db_seq_stat { /* SHARED */
db_seq_t st_last_value; /* Last cached value. */
db_seq_t st_min; /* Minimum value. */
db_seq_t st_max; /* Maximum value. */
- int32_t st_cache_size; /* Cache size. */
+ u_int32_t st_cache_size; /* Cache size. */
u_int32_t st_flags; /* Flag value. */
};
@@ -1300,15 +1346,15 @@ typedef enum {
#define DB_RENAMEMAGIC 0x030800 /* File has been renamed. */
-#define DB_BTREEVERSION 9 /* Current btree version. */
+#define DB_BTREEVERSION 10 /* Current btree version. */
#define DB_BTREEOLDVER 8 /* Oldest btree version supported. */
#define DB_BTREEMAGIC 0x053162
-#define DB_HASHVERSION 9 /* Current hash version. */
+#define DB_HASHVERSION 10 /* Current hash version. */
#define DB_HASHOLDVER 7 /* Oldest hash version supported. */
#define DB_HASHMAGIC 0x061561
-#define DB_HEAPVERSION 1 /* Current heap version. */
+#define DB_HEAPVERSION 2 /* Current heap version. */
#define DB_HEAPOLDVER 1 /* Oldest heap version supported. */
#define DB_HEAPMAGIC 0x074582
@@ -1377,6 +1423,7 @@ typedef enum {
#define DB_LOCK_NOTGRANTED (-30992)/* Lock unavailable. */
#define DB_LOG_BUFFER_FULL (-30991)/* In-memory log buffer full. */
#define DB_LOG_VERIFY_BAD (-30990)/* Log verification failed. */
+#define DB_META_CHKSUM_FAIL (-30968)/* Metadata page checksum failed. */
#define DB_NOSERVER (-30989)/* Server panic return. */
#define DB_NOTFOUND (-30988)/* Key/data pair not found (EOF). */
#define DB_OLD_VERSION (-30987)/* Out-of-date version. */
@@ -1405,6 +1452,8 @@ typedef enum {
#define DB_DELETED (-30897)/* Recovery file marked deleted. */
#define DB_EVENT_NOT_HANDLED (-30896)/* Forward event to application. */
#define DB_NEEDSPLIT (-30895)/* Page needs to be split. */
+#define DB_NOINTMP (-30886)/* Sequences not supported in temporary
+ or in-memory databases. */
#define DB_REP_BULKOVF (-30894)/* Rep bulk buffer overflow. */
#define DB_REP_LOGREADY (-30893)/* Rep log ready for recovery. */
#define DB_REP_NEWMASTER (-30892)/* We have learned of a new master. */
@@ -1415,6 +1464,13 @@ typedef enum {
#define DB_TXN_CKP (-30888)/* Encountered ckp record in log. */
#define DB_VERIFY_FATAL (-30887)/* DB->verify cannot proceed. */
+/*
+ * This exit status indicates that a BDB utility failed because it needed a
+ * resource which had been held by a process which crashed or otherwise did
+ * not exit cleanly.
+ */
+#define DB_EXIT_FAILCHK 3
+
/* Database handle. */
struct __db {
/*******************************************************
@@ -1426,7 +1482,7 @@ struct __db {
/* Callbacks. */
int (*db_append_recno) __P((DB *, DBT *, db_recno_t));
void (*db_feedback) __P((DB *, int, int));
- int (*dup_compare) __P((DB *, const DBT *, const DBT *));
+ int (*dup_compare) __P((DB *, const DBT *, const DBT *, size_t *));
void *app_private; /* Application-private handle. */
@@ -1450,6 +1506,8 @@ struct __db {
u_int32_t adj_fileid; /* File's unique ID for curs. adj. */
+ u_int32_t blob_threshold; /* Blob threshold record size. */
+
#define DB_LOGFILEID_INVALID -1
FNAME *log_filename; /* File's naming info for logging. */
@@ -1593,6 +1651,12 @@ struct __db {
/* Reference to foreign -- set in the secondary. */
DB *s_foreign;
+ DB *blob_meta_db; /* Databases holding blob metadata. */
+ DB_SEQUENCE *blob_seq; /* Sequence of blob ids. */
+ char *blob_sub_dir; /* Subdirectory for blob files */
+ db_seq_t blob_file_id; /* Id of the file blob directory. */
+ db_seq_t blob_sdb_id; /* Id of the subdb blob directory. */
+
/* API-private structure: used by DB 1.85, C++, Java, Perl and Tcl */
void *api_internal;
@@ -1623,8 +1687,11 @@ struct __db {
void *(**)(void *, size_t), void (**)(void *)));
int (*get_append_recno) __P((DB *, int (**)(DB *, DBT *, db_recno_t)));
int (*get_assoc_flags) __P((DB *, u_int32_t *));
+ int (*get_blob_dir) __P((DB *, const char **));
+ int (*get_blob_sub_dir) __P((DB *, const char **));
+ int (*get_blob_threshold) __P((DB *, u_int32_t *));
int (*get_bt_compare)
- __P((DB *, int (**)(DB *, const DBT *, const DBT *)));
+ __P((DB *, int (**)(DB *, const DBT *, const DBT *, size_t *)));
int (*get_bt_compress) __P((DB *,
int (**)(DB *,
const DBT *, const DBT *, const DBT *, const DBT *, DBT *),
@@ -1637,7 +1704,7 @@ struct __db {
int (*get_create_dir) __P((DB *, const char **));
int (*get_dbname) __P((DB *, const char **, const char **));
int (*get_dup_compare)
- __P((DB *, int (**)(DB *, const DBT *, const DBT *)));
+ __P((DB *, int (**)(DB *, const DBT *, const DBT *, size_t *)));
int (*get_encrypt_flags) __P((DB *, u_int32_t *));
DB_ENV *(*get_env) __P((DB *));
void (*get_errcall) __P((DB *,
@@ -1647,7 +1714,7 @@ struct __db {
int (*get_feedback) __P((DB *, void (**)(DB *, int, int)));
int (*get_flags) __P((DB *, u_int32_t *));
int (*get_h_compare)
- __P((DB *, int (**)(DB *, const DBT *, const DBT *)));
+ __P((DB *, int (**)(DB *, const DBT *, const DBT *, size_t *)));
int (*get_h_ffactor) __P((DB *, u_int32_t *));
int (*get_h_hash)
__P((DB *, u_int32_t (**)(DB *, const void *, u_int32_t)));
@@ -1688,8 +1755,10 @@ struct __db {
int (*set_alloc) __P((DB *, void *(*)(size_t),
void *(*)(void *, size_t), void (*)(void *)));
int (*set_append_recno) __P((DB *, int (*)(DB *, DBT *, db_recno_t)));
+ int (*set_blob_dir) __P((DB *, const char *));
+ int (*set_blob_threshold) __P((DB *, u_int32_t, u_int32_t));
int (*set_bt_compare)
- __P((DB *, int (*)(DB *, const DBT *, const DBT *)));
+ __P((DB *, int (*)(DB *, const DBT *, const DBT *, size_t *)));
int (*set_bt_compress) __P((DB *,
int (*)(DB *, const DBT *, const DBT *, const DBT *, const DBT *, DBT *),
int (*)(DB *, const DBT *, const DBT *, DBT *, DBT *, DBT *)));
@@ -1699,7 +1768,7 @@ struct __db {
int (*set_cachesize) __P((DB *, u_int32_t, u_int32_t, int));
int (*set_create_dir) __P((DB *, const char *));
int (*set_dup_compare)
- __P((DB *, int (*)(DB *, const DBT *, const DBT *)));
+ __P((DB *, int (*)(DB *, const DBT *, const DBT *, size_t *)));
int (*set_encrypt) __P((DB *, const char *, u_int32_t));
void (*set_errcall) __P((DB *,
void (*)(const DB_ENV *, const char *, const char *)));
@@ -1708,7 +1777,7 @@ struct __db {
int (*set_feedback) __P((DB *, void (*)(DB *, int, int)));
int (*set_flags) __P((DB *, u_int32_t));
int (*set_h_compare)
- __P((DB *, int (*)(DB *, const DBT *, const DBT *)));
+ __P((DB *, int (*)(DB *, const DBT *, const DBT *, size_t *)));
int (*set_h_ffactor) __P((DB *, u_int32_t));
int (*set_h_hash)
__P((DB *, u_int32_t (*)(DB *, const void *, u_int32_t)));
@@ -1808,13 +1877,34 @@ struct __db {
u_int32_t orig_flags; /* Flags at open, for refresh */
u_int32_t flags;
-#define DB2_AM_EXCL 0x00000001 /* Exclusively lock the handle */
-#define DB2_AM_INTEXCL 0x00000002 /* Internal exclusive lock. */
-#define DB2_AM_NOWAIT 0x00000004 /* Do not wait for handle lock */
- u_int32_t orig_flags2; /* Second flags word; for refresh */
+#define DB2_AM_EXCL 0x00000001 /* Exclusively lock the handle */
+#define DB2_AM_INTEXCL 0x00000002 /* Internal exclusive lock. */
+#define DB2_AM_NOWAIT 0x00000004 /* Do not wait for handle lock */
u_int32_t flags2; /* Second flags word */
};
+/*
+ * Stream interface for blob files; see the db_stream method of struct __dbc.
+ */
+struct __db_stream {
+ DBC *dbc; /* Cursor pointing to the db blob record. */
+ DB_FH *fhp; /* NOTE(review): presumably the open blob file -- confirm. */
+
+ /* DB_STREAM PUBLIC HANDLE LIST BEGIN */
+ int (*close) __P((DB_STREAM *, u_int32_t));
+ int (*read) __P((DB_STREAM *, DBT *, db_off_t, u_int32_t, u_int32_t));
+ int (*size) __P((DB_STREAM *, db_off_t *, u_int32_t));
+ int (*write) __P((DB_STREAM *, DBT *, db_off_t, u_int32_t));
+ /* DB_STREAM PUBLIC HANDLE LIST END */
+
+ u_int32_t flags; /* DB_STREAM_* bits defined below. */
+#define DB_STREAM_READ 0x00000001 /* Stream is read only. */
+#define DB_STREAM_WRITE 0x00000002 /* Stream is writeable. */
+#define DB_STREAM_SYNC_WRITE 0x00000004 /* Sync file on each write. */
+ db_seq_t blob_id; /* NOTE(review): presumably the blob's id -- confirm. */
+ db_off_t file_size; /* NOTE(review): presumably blob file size -- confirm. */
+};
+
/*
* Macros for bulk operations. These are only intended for the C API.
* For C++, use DbMultiple*Iterator or DbMultiple*Builder.
@@ -1889,7 +1979,7 @@ struct __db {
pointer = __p; \
} while (0)
-#define DB_MULTIPLE_WRITE_INIT(pointer, dbt) \
+#define DB_MULTIPLE_WRITE_INIT(pointer, dbt) \
do { \
(dbt)->flags |= DB_DBT_BULK; \
pointer = (u_int8_t *)(dbt)->data + \
@@ -1897,7 +1987,7 @@ struct __db {
*(u_int32_t *)(pointer) = (u_int32_t)-1; \
} while (0)
-#define DB_MULTIPLE_RESERVE_NEXT(pointer, dbt, writedata, writedlen) \
+#define DB_MULTIPLE_RESERVE_NEXT(pointer, dbt, writedata, writedlen) \
do { \
u_int32_t *__p = (u_int32_t *)(pointer); \
u_int32_t __off = ((pointer) == (u_int8_t *)(dbt)->data +\
@@ -1914,7 +2004,7 @@ struct __db {
} \
} while (0)
-#define DB_MULTIPLE_WRITE_NEXT(pointer, dbt, writedata, writedlen) \
+#define DB_MULTIPLE_WRITE_NEXT(pointer, dbt, writedata, writedlen) \
do { \
void *__destd; \
DB_MULTIPLE_RESERVE_NEXT((pointer), (dbt), \
@@ -1925,7 +2015,7 @@ struct __db {
memcpy(__destd, (writedata), (writedlen)); \
} while (0)
-#define DB_MULTIPLE_KEY_RESERVE_NEXT(pointer, dbt, writekey, writeklen, writedata, writedlen) \
+#define DB_MULTIPLE_KEY_RESERVE_NEXT(pointer, dbt, writekey, writeklen, writedata, writedlen) \
do { \
u_int32_t *__p = (u_int32_t *)(pointer); \
u_int32_t __off = ((pointer) == (u_int8_t *)(dbt)->data +\
@@ -1948,7 +2038,7 @@ struct __db {
} \
} while (0)
-#define DB_MULTIPLE_KEY_WRITE_NEXT(pointer, dbt, writekey, writeklen, writedata, writedlen) \
+#define DB_MULTIPLE_KEY_WRITE_NEXT(pointer, dbt, writekey, writeklen, writedata, writedlen) \
do { \
void *__destk, *__destd; \
DB_MULTIPLE_KEY_RESERVE_NEXT((pointer), (dbt), \
@@ -1962,7 +2052,7 @@ struct __db {
} \
} while (0)
-#define DB_MULTIPLE_RECNO_WRITE_INIT(pointer, dbt) \
+#define DB_MULTIPLE_RECNO_WRITE_INIT(pointer, dbt) \
do { \
(dbt)->flags |= DB_DBT_BULK; \
pointer = (u_int8_t *)(dbt)->data + \
@@ -1970,7 +2060,7 @@ struct __db {
*(u_int32_t *)(pointer) = 0; \
} while (0)
-#define DB_MULTIPLE_RECNO_RESERVE_NEXT(pointer, dbt, recno, writedata, writedlen) \
+#define DB_MULTIPLE_RECNO_RESERVE_NEXT(pointer, dbt, recno, writedata, writedlen) \
do { \
u_int32_t *__p = (u_int32_t *)(pointer); \
u_int32_t __off = ((pointer) == (u_int8_t *)(dbt)->data +\
@@ -1988,7 +2078,7 @@ struct __db {
} \
} while (0)
-#define DB_MULTIPLE_RECNO_WRITE_NEXT(pointer, dbt, recno, writedata, writedlen)\
+#define DB_MULTIPLE_RECNO_WRITE_NEXT(pointer, dbt, recno, writedata, writedlen)\
do { \
void *__destd; \
DB_MULTIPLE_RECNO_RESERVE_NEXT((pointer), (dbt), \
@@ -2003,7 +2093,7 @@ struct __db_heap_rid {
db_pgno_t pgno; /* Page number. */
db_indx_t indx; /* Index in the offset table. */
};
-#define DB_HEAP_RID_SZ (sizeof(db_pgno_t) + sizeof(db_indx_t))
+#define DB_HEAP_RID_SZ (sizeof(db_pgno_t) + sizeof(db_indx_t))
/*******************************************************
* Access method cursors.
@@ -2074,6 +2164,7 @@ struct __dbc {
int (*close) __P((DBC *));
int (*cmp) __P((DBC *, DBC *, int *, u_int32_t));
int (*count) __P((DBC *, db_recno_t *, u_int32_t));
+ int (*db_stream) __P((DBC *, DB_STREAM **, u_int32_t));
int (*del) __P((DBC *, u_int32_t));
int (*dup) __P((DBC *, DBC **, u_int32_t));
int (*get) __P((DBC *, DBT *, DBT *, u_int32_t));
@@ -2151,6 +2242,7 @@ struct __db_bt_stat { /* SHARED */
u_int32_t bt_pagecnt; /* Page count. */
u_int32_t bt_pagesize; /* Page size. */
u_int32_t bt_minkey; /* Minkey value. */
+ u_int32_t bt_nblobs; /* Number of blobs. */
u_int32_t bt_re_len; /* Fixed-length record length. */
u_int32_t bt_re_pad; /* Fixed-length record pad. */
u_int32_t bt_levels; /* Tree levels. */
@@ -2179,7 +2271,7 @@ struct __db_compact {
u_int32_t compact_deadlock; /* Number of deadlocks. */
db_pgno_t compact_pages_truncated; /* Pages truncated to OS. */
/* Internal. */
- db_pgno_t compact_truncate; /* Page number for truncation */
+ db_pgno_t compact_truncate; /* Exchange pages above here. */
};
/* Hash statistics structure. */
@@ -2189,6 +2281,7 @@ struct __db_h_stat { /* SHARED */
u_int32_t hash_metaflags; /* Metadata flags. */
u_int32_t hash_nkeys; /* Number of unique keys. */
u_int32_t hash_ndata; /* Number of data items. */
+ u_int32_t hash_nblobs; /* Number of blobs. */
u_int32_t hash_pagecnt; /* Page count. */
u_int32_t hash_pagesize; /* Page size. */
u_int32_t hash_ffactor; /* Fill factor specified at create. */
@@ -2208,6 +2301,7 @@ struct __db_heap_stat { /* SHARED */
u_int32_t heap_magic; /* Magic number. */
u_int32_t heap_version; /* Version number. */
u_int32_t heap_metaflags; /* Metadata flags. */
+ u_int32_t heap_nblobs; /* Number of blobs. */
u_int32_t heap_nrecs; /* Number of records. */
u_int32_t heap_pagecnt; /* Page count. */
u_int32_t heap_pagesize; /* Page size. */
@@ -2267,21 +2361,15 @@ typedef enum {
* Backup configuration types.
*/
typedef enum {
- DB_BACKUP_READ_COUNT = 1,
- DB_BACKUP_READ_SLEEP = 2,
- DB_BACKUP_SIZE = 3,
- DB_BACKUP_WRITE_DIRECT = 4
+ DB_BACKUP_READ_COUNT=1,
+ DB_BACKUP_READ_SLEEP=2,
+ DB_BACKUP_SIZE=3,
+ DB_BACKUP_WRITE_DIRECT=4
} DB_BACKUP_CONFIG;
struct __db_env {
ENV *env; /* Linked ENV structure */
- /*
- * The DB_ENV structure can be used concurrently, so field access is
- * protected.
- */
- db_mutex_t mtx_db_env; /* DB_ENV structure mutex */
-
/* Error message callback */
void (*db_errcall) __P((const DB_ENV *, const char *, const char *));
FILE *db_errfile; /* Error message file stream */
@@ -2304,6 +2392,7 @@ struct __db_env {
char *(*thread_id_string) __P((DB_ENV *, pid_t, db_threadid_t, char *));
/* Application specified paths */
+ char *db_blob_dir; /* Blob file directory */
char *db_log_dir; /* Database log file directory */
char *db_md_dir; /* Persistent metadata directory */
char *db_tmp_dir; /* Database tmp file directory */
@@ -2327,6 +2416,8 @@ struct __db_env {
u_int32_t verbose; /* DB_VERB_XXX flags */
+ u_int32_t blob_threshold; /* Blob threshold record size */
+
/* Mutex configuration */
u_int32_t mutex_align; /* Mutex alignment */
u_int32_t mutex_cnt; /* Number of mutexes to configure */
@@ -2395,6 +2486,11 @@ struct __db_env {
* build settings.
*/
db_timeout_t envreg_timeout; /* DB_REGISTER wait timeout */
+ /*
+ * When failchk broadcasting is active, any wait for a mutex will wake
+ * up this frequently in order to check whether the mutex has died.
+ */
+ db_timeout_t mutex_failchk_timeout;
#define DB_ENV_AUTO_COMMIT 0x00000001 /* DB_AUTO_COMMIT */
#define DB_ENV_CDB_ALLDB 0x00000002 /* CDB environment wide locking */
@@ -2414,8 +2510,8 @@ struct __db_env {
#define DB_ENV_TXN_SNAPSHOT 0x00008000 /* DB_TXN_SNAPSHOT set */
#define DB_ENV_TXN_WRITE_NOSYNC 0x00010000 /* DB_TXN_WRITE_NOSYNC set */
#define DB_ENV_YIELDCPU 0x00020000 /* DB_YIELDCPU set */
-#define DB_ENV_HOTBACKUP 0x00040000 /* DB_HOTBACKUP_IN_PROGRESS set */
-#define DB_ENV_NOFLUSH 0x00080000 /* DB_NOFLUSH set */
+#define DB_ENV_HOTBACKUP 0x00040000 /* DB_HOTBACKUP_IN_PROGRESS set */
+#define DB_ENV_NOFLUSH 0x00080000 /* DB_NOFLUSH set */
u_int32_t flags;
/* DB_ENV PUBLIC HANDLE LIST BEGIN */
@@ -2436,6 +2532,8 @@ struct __db_env {
void *(**)(void *, size_t), void (**)(void *)));
int (*get_app_dispatch)
__P((DB_ENV *, int (**)(DB_ENV *, DBT *, DB_LSN *, db_recops)));
+ int (*get_blob_dir) __P((DB_ENV *, const char **));
+ int (*get_blob_threshold) __P((DB_ENV*, u_int32_t *));
int (*get_cache_max) __P((DB_ENV *, u_int32_t *, u_int32_t *));
int (*get_cachesize) __P((DB_ENV *, u_int32_t *, u_int32_t *, int *));
int (*get_create_dir) __P((DB_ENV *, const char **));
@@ -2451,8 +2549,8 @@ struct __db_env {
void (**)(const DB_ENV *, const char *, const char *)));
void (*get_errfile) __P((DB_ENV *, FILE **));
void (*get_errpfx) __P((DB_ENV *, const char **));
- int (*get_flags) __P((DB_ENV *, u_int32_t *));
int (*get_feedback) __P((DB_ENV *, void (**)(DB_ENV *, int, int)));
+ int (*get_flags) __P((DB_ENV *, u_int32_t *));
int (*get_home) __P((DB_ENV *, const char **));
int (*get_intermediate_dir_mode) __P((DB_ENV *, const char **));
int (*get_isalive) __P((DB_ENV *,
@@ -2568,17 +2666,23 @@ struct __db_env {
int (*rep_set_timeout) __P((DB_ENV *, int, db_timeout_t));
int (*rep_set_transport) __P((DB_ENV *, int, int (*)(DB_ENV *,
const DBT *, const DBT *, const DB_LSN *, int, u_int32_t)));
+ int (*rep_set_view) __P((DB_ENV *, int (*)(DB_ENV *,
+ const char *, int *, u_int32_t)));
int (*rep_start) __P((DB_ENV *, DBT *, u_int32_t));
int (*rep_stat) __P((DB_ENV *, DB_REP_STAT **, u_int32_t));
int (*rep_stat_print) __P((DB_ENV *, u_int32_t));
int (*rep_sync) __P((DB_ENV *, u_int32_t));
int (*repmgr_channel) __P((DB_ENV *, int, DB_CHANNEL **, u_int32_t));
int (*repmgr_get_ack_policy) __P((DB_ENV *, int *));
+ int (*repmgr_get_incoming_queue_max)
+ __P((DB_ENV *, u_int32_t *, u_int32_t *));
int (*repmgr_local_site) __P((DB_ENV *, DB_SITE **));
int (*repmgr_msg_dispatch) __P((DB_ENV *,
void (*)(DB_ENV *, DB_CHANNEL *, DBT *, u_int32_t, u_int32_t),
u_int32_t));
int (*repmgr_set_ack_policy) __P((DB_ENV *, int));
+ int (*repmgr_set_incoming_queue_max)
+ __P((DB_ENV *, u_int32_t, u_int32_t));
int (*repmgr_site)
__P((DB_ENV *, const char *, u_int, DB_SITE**, u_int32_t));
int (*repmgr_site_by_eid) __P((DB_ENV *, int, DB_SITE**));
@@ -2590,6 +2694,8 @@ struct __db_env {
void *(*)(void *, size_t), void (*)(void *)));
int (*set_app_dispatch)
__P((DB_ENV *, int (*)(DB_ENV *, DBT *, DB_LSN *, db_recops)));
+ int (*set_blob_dir) __P((DB_ENV *, const char *));
+ int (*set_blob_threshold) __P((DB_ENV *, u_int32_t, u_int32_t));
int (*set_cache_max) __P((DB_ENV *, u_int32_t, u_int32_t));
int (*set_cachesize) __P((DB_ENV *, u_int32_t, u_int32_t, int));
int (*set_create_dir) __P((DB_ENV *, const char *));
@@ -2662,8 +2768,8 @@ struct __db_env {
/* DB_ENV PUBLIC HANDLE LIST END */
/* DB_ENV PRIVATE HANDLE LIST BEGIN */
- int (*prdbt) __P((DBT *, int,
- const char *, void *, int (*)(void *, const void *), int, int));
+ int (*prdbt) __P((DBT *, int, const char *, void *,
+ int (*)(void *, const void *), int, int, int));
/* DB_ENV PRIVATE HANDLE LIST END */
};
diff --git a/src/dbinc/db_185.in b/src/dbinc/db_185.in
index 43735344..3aef2eca 100644
--- a/src/dbinc/db_185.in
+++ b/src/dbinc/db_185.in
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright (c) 1990, 1993, 1994
diff --git a/src/dbinc/db_am.h b/src/dbinc/db_am.h
index f34578c4..2b5c49d2 100644
--- a/src/dbinc/db_am.h
+++ b/src/dbinc/db_am.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -199,12 +199,16 @@ struct __db_foreign_info {
#define DB_IS_PRIMARY(dbp) (LIST_FIRST(&dbp->s_secondaries) != NULL)
/*
* A database should be required to be readonly if it's been explicitly
- * specified as such or if we're a client in a replicated environment
- * and the user did not specify DB_TXN_NOT_DURABLE.
+ * specified as such, if we're a client in a replicated environment
+ * and the user did not specify DB_TXN_NOT_DURABLE, or if we're a master
+ * in a replicated environment and the REP_F_READONLY_MASTER flag has been
+ * set in preparation for a preferred master takeover.
*/
#define DB_IS_READONLY(dbp) \
(F_ISSET(dbp, DB_AM_RDONLY) || \
- (IS_REP_CLIENT((dbp)->env) && !F_ISSET((dbp), DB_AM_NOT_DURABLE)))
+ (IS_REP_CLIENT((dbp)->env) && !F_ISSET((dbp), DB_AM_NOT_DURABLE)) \
+ || (IS_REP_MASTER((dbp)->env) && \
+ F_ISSET((dbp)->env->rep_handle->region, REP_F_READONLY_MASTER)))
#ifdef HAVE_COMPRESSION
/*
diff --git a/src/dbinc/db_cxx.in b/src/dbinc/db_cxx.in
index 84fc0f88..5b29f7e8 100644
--- a/src/dbinc/db_cxx.in
+++ b/src/dbinc/db_cxx.in
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -76,6 +76,7 @@ class DbMpoolFile; // forward
class DbPreplist; // forward
class DbSequence; // forward
class DbSite; // forward
+class DbStream; // forward
class Dbt; // forward
class DbTxn; // forward
@@ -159,13 +160,13 @@ extern "C" {
typedef void (*db_free_fcn_type)
(void *);
typedef int (*bt_compare_fcn_type) /*C++ version available*/
- (DB *, const DBT *, const DBT *);
+ (DB *, const DBT *, const DBT *, size_t *);
typedef size_t (*bt_prefix_fcn_type) /*C++ version available*/
(DB *, const DBT *, const DBT *);
typedef int (*dup_compare_fcn_type) /*C++ version available*/
- (DB *, const DBT *, const DBT *);
+ (DB *, const DBT *, const DBT *, size_t *);
typedef int (*h_compare_fcn_type) /*C++ version available*/
- (DB *, const DBT *, const DBT *);
+ (DB *, const DBT *, const DBT *, size_t *);
typedef u_int32_t (*h_hash_fcn_type) /*C++ version available*/
(DB *, const void *, u_int32_t);
typedef int (*pgin_fcn_type)
@@ -204,7 +205,10 @@ public:
virtual int get_alloc(
db_malloc_fcn_type *, db_realloc_fcn_type *, db_free_fcn_type *);
virtual int get_append_recno(int (**)(Db *, Dbt *, db_recno_t));
- virtual int get_bt_compare(int (**)(Db *, const Dbt *, const Dbt *));
+ virtual int get_blob_dir(const char **);
+ virtual int get_blob_threshold(u_int32_t *);
+ virtual int get_bt_compare(
+ int (**)(Db *, const Dbt *, const Dbt *, size_t *));
virtual int get_bt_compress(
int (**)(
Db *, const Dbt *, const Dbt *, const Dbt *, const Dbt *, Dbt *),
@@ -215,7 +219,8 @@ public:
virtual int get_cachesize(u_int32_t *, u_int32_t *, int *);
virtual int get_create_dir(const char **);
virtual int get_dbname(const char **, const char **);
- virtual int get_dup_compare(int (**)(Db *, const Dbt *, const Dbt *));
+ virtual int get_dup_compare(
+ int (**)(Db *, const Dbt *, const Dbt *, size_t *));
virtual int get_encrypt_flags(u_int32_t *);
virtual void get_errcall(
void (**)(const DbEnv *, const char *, const char *));
@@ -225,7 +230,8 @@ public:
virtual int get_flags(u_int32_t *);
virtual int get_heapsize(u_int32_t *, u_int32_t *);
virtual int get_heap_regionsize(u_int32_t *);
- virtual int get_h_compare(int (**)(Db *, const Dbt *, const Dbt *));
+ virtual int get_h_compare(
+ int (**)(Db *, const Dbt *, const Dbt *, size_t *));
virtual int get_h_ffactor(u_int32_t *);
virtual int get_h_hash(u_int32_t (**)(Db *, const void *, u_int32_t));
virtual int get_h_nelem(u_int32_t *);
@@ -261,8 +267,11 @@ public:
db_malloc_fcn_type, db_realloc_fcn_type, db_free_fcn_type);
virtual void set_app_private(void *);
virtual int set_append_recno(int (*)(Db *, Dbt *, db_recno_t));
+ virtual int set_blob_dir(const char *);
+ virtual int set_blob_threshold(u_int32_t, u_int32_t);
virtual int set_bt_compare(bt_compare_fcn_type); /*deprecated*/
- virtual int set_bt_compare(int (*)(Db *, const Dbt *, const Dbt *));
+ virtual int set_bt_compare(
+ int (*)(Db *, const Dbt *, const Dbt *, size_t *));
virtual int set_bt_compress(
int (*)
(Db *, const Dbt *, const Dbt *, const Dbt *, const Dbt *, Dbt *),
@@ -273,7 +282,8 @@ public:
virtual int set_cachesize(u_int32_t, u_int32_t, int);
virtual int set_create_dir(const char *);
virtual int set_dup_compare(dup_compare_fcn_type); /*deprecated*/
- virtual int set_dup_compare(int (*)(Db *, const Dbt *, const Dbt *));
+ virtual int set_dup_compare(
+ int (*)(Db *, const Dbt *, const Dbt *, size_t *));
virtual int set_encrypt(const char *, u_int32_t);
virtual void set_errcall(
void (*)(const DbEnv *, const char *, const char *));
@@ -284,7 +294,8 @@ public:
virtual int set_heapsize(u_int32_t, u_int32_t);
virtual int set_heap_regionsize(u_int32_t);
virtual int set_h_compare(h_compare_fcn_type); /*deprecated*/
- virtual int set_h_compare(int (*)(Db *, const Dbt *, const Dbt *));
+ virtual int set_h_compare(
+ int (*)(Db *, const Dbt *, const Dbt *, size_t *));
virtual int set_h_ffactor(u_int32_t);
virtual int set_h_hash(h_hash_fcn_type); /*deprecated*/
virtual int set_h_hash(u_int32_t (*)(Db *, const void *, u_int32_t));
@@ -383,16 +394,16 @@ public:
int (*associate_callback_)(Db *, const Dbt *, const Dbt *, Dbt *);
int (*associate_foreign_callback_)
(Db *, const Dbt *, Dbt *, const Dbt *, int *);
- int (*bt_compare_callback_)(Db *, const Dbt *, const Dbt *);
+ int (*bt_compare_callback_)(Db *, const Dbt *, const Dbt *, size_t *);
int (*bt_compress_callback_)(
Db *, const Dbt *, const Dbt *, const Dbt *, const Dbt *, Dbt *);
int (*bt_decompress_callback_)(
Db *, const Dbt *, const Dbt *, Dbt *, Dbt *, Dbt *);
size_t (*bt_prefix_callback_)(Db *, const Dbt *, const Dbt *);
u_int32_t (*db_partition_callback_)(Db *, Dbt *);
- int (*dup_compare_callback_)(Db *, const Dbt *, const Dbt *);
+ int (*dup_compare_callback_)(Db *, const Dbt *, const Dbt *, size_t *);
void (*feedback_callback_)(Db *, int, int);
- int (*h_compare_callback_)(Db *, const Dbt *, const Dbt *);
+ int (*h_compare_callback_)(Db *, const Dbt *, const Dbt *, size_t *);
u_int32_t (*h_hash_callback_)(Db *, const void *, u_int32_t);
};
@@ -407,6 +418,7 @@ public:
int close();
int cmp(Dbc *other_csr, int *result, u_int32_t flags);
int count(db_recno_t *countp, u_int32_t flags);
+ int db_stream(DbStream **dbsp, u_int32_t flags);
int del(u_int32_t flags);
int dup(Dbc** cursorp, u_int32_t flags);
int get(Dbt* key, Dbt *data, u_int32_t flags);
@@ -527,6 +539,10 @@ public:
int (*)(DbEnv *, const char *, void *));
virtual int get_backup_config(DB_BACKUP_CONFIG, u_int32_t *);
virtual int set_backup_config(DB_BACKUP_CONFIG, u_int32_t);
+ virtual int get_blob_dir(const char **);
+ virtual int set_blob_dir(const char *);
+ virtual int get_blob_threshold(u_int32_t *);
+ virtual int set_blob_threshold(u_int32_t, u_int32_t);
virtual int get_cachesize(u_int32_t *, u_int32_t *, int *);
virtual int set_cachesize(u_int32_t, u_int32_t, int);
virtual int get_cache_max(u_int32_t *, u_int32_t *);
@@ -761,10 +777,16 @@ public:
virtual int rep_set_priority(u_int32_t priority);
virtual int rep_get_timeout(int which, db_timeout_t *timeout);
virtual int rep_set_timeout(int which, db_timeout_t timeout);
+ virtual int rep_set_view(int (*)(DbEnv *,
+ const char *, int *, u_int32_t));
virtual int repmgr_channel(int eid, DbChannel **channel,
u_int32_t flags);
virtual int repmgr_get_ack_policy(int *policy);
virtual int repmgr_set_ack_policy(int policy);
+ virtual int repmgr_get_incoming_queue_max(u_int32_t *gbytesp,
+ u_int32_t *bytesp);
+ virtual int repmgr_set_incoming_queue_max(u_int32_t gbytes,
+ u_int32_t bytes);
virtual int repmgr_local_site(DbSite **site);
virtual int repmgr_msg_dispatch(void (*) (DbEnv *,
DbChannel *, Dbt *, u_int32_t, u_int32_t), u_int32_t flags);
@@ -824,6 +846,8 @@ public:
static int _backup_write_intercept(DB_ENV *dbenv, u_int32_t off_gbytes,
u_int32_t off_bytes, u_int32_t size, u_int8_t *buf, void *handle);
static void _paniccall_intercept(DB_ENV *dbenv, int errval);
+ static int _partial_rep_intercept(DB_ENV *dbenv,
+ const char *name, int *result, u_int32_t flags);
static void _feedback_intercept(DB_ENV *dbenv, int opcode, int pct);
static void _event_func_intercept(DB_ENV *dbenv, u_int32_t, void *);
static int _isalive_intercept(DB_ENV *dbenv, pid_t pid,
@@ -872,6 +896,7 @@ private:
void (*feedback_callback_)(DbEnv *, int, int);
void (*message_callback_)(const DbEnv *, const char *);
void (*paniccall_callback_)(DbEnv *, int);
+ int (*partial_rep_callback_)(DbEnv *, const char *, int *, u_int32_t);
void (*event_func_callback_)(DbEnv *, u_int32_t, void *);
int (*rep_send_callback_)(DbEnv *, const Dbt *, const Dbt *,
const DbLsn *, int, u_int32_t);
@@ -1057,9 +1082,9 @@ public:
int stat(DB_SEQUENCE_STAT **sp, u_int32_t flags);
int stat_print(u_int32_t flags);
- int get(DbTxn *txnid, int32_t delta, db_seq_t *retp, u_int32_t flags);
- int get_cachesize(int32_t *sizep);
- int set_cachesize(int32_t size);
+ int get(DbTxn *txnid, u_int32_t delta, db_seq_t *retp, u_int32_t flags);
+ int get_cachesize(u_int32_t *sizep);
+ int set_cachesize(u_int32_t size);
int get_flags(u_int32_t *flagsp);
int set_flags(u_int32_t flags);
int get_range(db_seq_t *minp, db_seq_t *maxp);
@@ -1137,6 +1162,34 @@ private:
};
//
+// DbStream -- C++ class with DB_STREAM as its protected base
+//
+class _exported DbStream : protected DB_STREAM
+{
+ friend class Dbc;
+
+public:
+ int close(u_int32_t flags); // use this, not delete, to release a stream
+ int read(Dbt *data, db_off_t offset, u_int32_t size, u_int32_t flags);
+ int size(db_off_t *size, u_int32_t flags);
+ int write(Dbt *data, db_off_t offset, u_int32_t flags);
+
+private:
+ // No data is permitted in this class (see comment at top)
+
+ // Note: use Dbc::db_stream() to get pointers to a DbStream,
+ // and call DbStream::close() rather than delete to release them.
+ //
+ DbStream();
+ ~DbStream();
+
+ // No copying: the copy constructor and assignment operator are private.
+ DbStream(const DbStream &);
+ DbStream &operator = (const DbStream &);
+
+};
+
+//
// Transaction
//
class _exported DbTxn
@@ -1245,6 +1298,7 @@ class _exported Dbt : private DBT
friend class DbEnv;
friend class DbLogc;
friend class DbSequence;
+ friend class DbStream;
public:
// key/data
diff --git a/src/dbinc/db_dispatch.h b/src/dbinc/db_dispatch.h
index b6382871..b3aedab1 100644
--- a/src/dbinc/db_dispatch.h
+++ b/src/dbinc/db_dispatch.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright (c) 1995, 1996
diff --git a/src/dbinc/db_int.in b/src/dbinc/db_int.in
index 42439107..593deef6 100644
--- a/src/dbinc/db_int.in
+++ b/src/dbinc/db_int.in
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -73,6 +73,17 @@
#endif /* !HAVE_SYSTEM_INCLUDE_FILES */
+/*
+ * The Windows compiler needs to be told about structures that are available
+ * outside a dll.
+ */
+#if defined(DB_WIN32) && defined(_MSC_VER) && \
+ !defined(DB_CREATE_DLL) && !defined(_LIB)
+#define __DB_IMPORT __declspec(dllimport)
+#else
+#define __DB_IMPORT
+#endif
+
#ifdef DB_WIN32
#include "dbinc/win_db.h"
#endif
@@ -88,22 +99,12 @@
#include "dbinc/queue.h"
#include "dbinc/shqueue.h"
#include "dbinc/perfmon.h"
+#include "dbinc/clock.h"
#if defined(__cplusplus)
extern "C" {
#endif
-/*
- * The Windows compiler needs to be told about structures that are available
- * outside a dll.
- */
-#if defined(DB_WIN32) && defined(_MSC_VER) && \
- !defined(DB_CREATE_DLL) && !defined(_LIB)
-#define __DB_IMPORT __declspec(dllimport)
-#else
-#define __DB_IMPORT
-#endif
-
/*******************************************************
* Forward structure declarations.
*******************************************************/
@@ -366,22 +367,27 @@ typedef struct __fn {
/*
* Structure used for callback message aggregation.
*
- * Display values in XXX_stat_print calls.
+ * DB_MSGBUF_FLUSH displays values in XXX_stat_print calls.
+ * DB_MSGBUF_REP_FLUSH displays replication system messages.
*/
typedef struct __db_msgbuf {
char *buf; /* Heap allocated buffer. */
char *cur; /* Current end of message. */
size_t len; /* Allocated length of buffer. */
+ int flags;
} DB_MSGBUF;
+#define DB_MSGBUF_PREALLOCATED 0x0001
+
#define DB_MSGBUF_INIT(a) do { \
(a)->buf = (a)->cur = NULL; \
- (a)->len = 0; \
+ (a)->len = (a)->flags = 0; \
} while (0)
#define DB_MSGBUF_FLUSH(env, a) do { \
if ((a)->buf != NULL) { \
if ((a)->cur != (a)->buf) \
__db_msg(env, "%s", (a)->buf); \
- __os_free(env, (a)->buf); \
+ if (!F_ISSET((a), DB_MSGBUF_PREALLOCATED)) \
+ __os_free(env, (a)->buf); \
DB_MSGBUF_INIT(a); \
} \
} while (0)
@@ -392,18 +398,14 @@ typedef struct __db_msgbuf {
if (regular_msg) \
DB_MSGBUF_FLUSH(env, a); \
else { \
- __os_free(env, (a)->buf); \
+ if (!F_ISSET((a), DB_MSGBUF_PREALLOCATED)) \
+ __os_free(env, (a)->buf); \
DB_MSGBUF_INIT(a); \
} \
} \
} while (0)
-#define STAT_FMT(msg, fmt, type, v) do { \
- DB_MSGBUF __mb; \
- DB_MSGBUF_INIT(&__mb); \
- __db_msgadd(env, &__mb, fmt, (type)(v)); \
- __db_msgadd(env, &__mb, "\t%s", msg); \
- DB_MSGBUF_FLUSH(env, &__mb); \
-} while (0)
+#define STAT_FMT(msg, fmt, type, v) /* Emit (type)(v) via fmt, a tab, then msg. */ \
+ __db_msg(env, fmt "\t%s", (type)(v), msg) /* No ';' here: the caller adds it, as with STAT_HEX. */
#define STAT_HEX(msg, v) \
__db_msg(env, "%#lx\t%s", (u_long)(v), msg)
#define STAT_ISSET(msg, p) \
@@ -441,25 +443,21 @@ typedef struct __db_msgbuf {
*
* Error message IDs are automatically assigned by dist/s_message_id script.
*/
-#ifdef HAVE_LOCALIZATION
-#define _(msg) msg /* Replace with localization function. */
-#else
-#define _(msg) msg
-#endif
-
#ifdef HAVE_STRIPPED_MESSAGES
#define DB_STR_C(msg, fmt) fmt
#else
-#define DB_STR_C(msg, fmt) _(msg)
+#define DB_STR_C(msg, fmt) msg
#endif
-#define DB_MSGID(id) "BDB" id
-
-#define DB_STR(id, msg) DB_MSGID(id) " " DB_STR_C(msg, "")
-
-#define DB_STR_A(id, msg, fmt) DB_MSGID(id) " " DB_STR_C(msg, fmt)
+#ifdef HAVE_LOCALIZATION
+#define _(msg) (msg) /* Replace with localization function. */
+#else
+#define _(msg) msg
+#endif
-#define DB_STR_P(msg) _(msg)
+#define DB_STR(id, msg) _("BDB" id " " DB_STR_C(msg, ""))
+#define DB_STR_A(id, msg, fmt) _("BDB" id " " DB_STR_C(msg, fmt))
+#define DB_STR_P(msg) _(msg)
/*
* There are quite a few places in Berkeley DB where we want to initialize
@@ -542,6 +540,7 @@ typedef struct __db_msgbuf {
/* Type passed to __db_appname(). */
typedef enum {
DB_APP_NONE=0, /* No type (region). */
+ DB_APP_BLOB, /* Blob file. */
DB_APP_DATA, /* Data file. */
DB_APP_LOG, /* Log file. */
DB_APP_META, /* Persistent metadata file. */
@@ -612,8 +611,13 @@ typedef enum {
if (F_ISSET((env), ENV_OPEN_CALLED)) \
ENV_REQUIRES_CONFIG(env, handle, i, flags)
+/*
+ * The ENV_ENTER and ENV_LEAVE macros announce to other threads that
+ * the current thread is entering or leaving the BDB API.
+ */
#define ENV_ENTER_RET(env, ip, ret) do { \
ret = 0; \
+ DISCARD_HISTORY(env); \
PANIC_CHECK_RET(env, ret); \
if (ret == 0) { \
if ((env)->thr_hashtab == NULL) \
@@ -631,6 +635,10 @@ typedef enum {
return (__ret); \
} while (0)
+/*
+ * Publicize the current thread's intention to run failchk. This invokes
+ * DB_ENV->is_alive() in the mutex code, to avoid hanging on dead processes.
+ */
#define FAILCHK_THREAD(env, ip) do { \
if ((ip) != NULL) \
(ip)->dbth_state = THREAD_FAILCHK; \
@@ -638,20 +646,15 @@ typedef enum {
#define ENV_GET_THREAD_INFO(env, ip) ENV_ENTER(env, ip)
-#ifdef DIAGNOSTIC
#define ENV_LEAVE(env, ip) do { \
- if ((ip) != NULL) { \
- DB_ASSERT(env, ((ip)->dbth_state == THREAD_ACTIVE || \
- (ip)->dbth_state == THREAD_FAILCHK)); \
+ if ((ip) != NULL) { \
+ DB_ASSERT((env), (ip)->dbth_state == THREAD_ACTIVE || \
+ (ip)->dbth_state == THREAD_FAILCHK); \
(ip)->dbth_state = THREAD_OUT; \
} \
} while (0)
-#else
-#define ENV_LEAVE(env, ip) do { \
- if ((ip) != NULL) \
- (ip)->dbth_state = THREAD_OUT; \
-} while (0)
-#endif
+
+
#ifdef DIAGNOSTIC
#define CHECK_THREAD(env) do { \
if ((env)->thr_hashtab != NULL) \
@@ -688,6 +691,23 @@ typedef struct __pin_list {
} PIN_LIST;
#define PINMAX 4
+typedef enum {
+ MUTEX_ACTION_UNLOCKED=0, /* NOTE(review): presumably no read lock held or sought -- confirm. */
+ MUTEX_ACTION_INTEND_SHARE, /* Thread is attempting a read lock. */
+ MUTEX_ACTION_SHARED /* Thread has gotten a read lock. */
+} MUTEX_ACTION;
+
+typedef struct __mutex_state { /* SHARED */
+ db_mutex_t mutex; /* NOTE(review): presumably the latch this entry tracks -- confirm. */
+ MUTEX_ACTION action; /* One of the MUTEX_ACTION values. */
+#ifdef DIAGNOSTIC
+ db_timespec when; /* DIAGNOSTIC builds only. NOTE(review): presumably when action last changed -- confirm. */
+#endif
+} MUTEX_STATE;
+
+#define MUTEX_STATE_MAX 10 /* It only needs enough for shared latches. */
+
+
struct __db_thread_info { /* SHARED */
pid_t dbth_pid;
db_threadid_t dbth_tid;
@@ -707,11 +727,25 @@ struct __db_thread_info { /* SHARED */
u_int16_t dbth_pinmax; /* Number of slots allocated. */
roff_t dbth_pinlist; /* List of pins. */
PIN_LIST dbth_pinarray[PINMAX]; /* Initial array of slots. */
+
+ /*
+ * While thread tracking is active this caches one of the lockers
+ * created by each thread. This locker remains allocated, with an
+ * invalid id, even after the locker id is freed.
+ */
+ roff_t dbth_local_locker;
+ /*
+ * Each latch shared by this thread has an entry here. Exclusive
+ * ownership, for both latches and mutexes, is recorded in the DB_MUTEX.
+ */
+ MUTEX_STATE dbth_latches[MUTEX_STATE_MAX];
#ifdef DIAGNOSTIC
roff_t dbth_locker; /* Current locker for this thread. */
u_int32_t dbth_check_off; /* Count of number of LOCK_OFF calls. */
#endif
+ db_timespec dbth_failtime; /* Time when its crash was detected. */
};
+
#ifdef DIAGNOSTIC
#define LOCK_CHECK_OFF(ip) if ((ip) != NULL) \
(ip)->dbth_check_off++
@@ -729,7 +763,7 @@ struct __db_thread_info { /* SHARED */
#define LOCK_CHECK(dbc, pgno, mode) NOP_STATEMENT
#endif
-typedef struct __env_thread_info {
+typedef struct __env_thread_info { /* SHARED */
u_int32_t thr_count;
u_int32_t thr_init;
u_int32_t thr_max;
@@ -803,6 +837,11 @@ struct __env {
#define ENV_DEF_DATA_LEN 100
u_int32_t data_len; /* Data length in __db_prbytes. */
+ /* Registered processes */
+ size_t num_active_pids; /* number of entries in active_pids */
+ size_t size_active_pids; /* allocated size of active_pids */
+ pid_t *active_pids; /* array active pids */
+
/* Thread tracking */
u_int32_t thr_nbucket; /* Number of hash buckets */
DB_HASHTAB *thr_hashtab; /* Hash table of DB_THREAD_INFO */
@@ -866,6 +905,7 @@ struct __env {
#define DB_TEST_PREOPEN 10 /* before __os_open */
#define DB_TEST_REPMGR_PERM 11 /* repmgr perm/archiving tests */
#define DB_TEST_SUBDB_LOCKS 12 /* subdb locking tests */
+#define DB_TEST_REPMGR_HEARTBEAT 13 /* repmgr stop sending heartbeats */
int test_abort; /* Abort value for testing */
int test_check; /* Checkpoint value for testing */
int test_copy; /* Copy value for testing */
@@ -881,7 +921,9 @@ struct __env {
#define ENV_REF_COUNTED 0x00000100 /* Region references this handle */
#define ENV_SYSTEM_MEM 0x00000200 /* DB_SYSTEM_MEM set */
#define ENV_THREAD 0x00000400 /* DB_THREAD set */
-#define ENV_FORCE_TXN_BULK 0x00000800 /* Txns use bulk mode-for testing */
+#define ENV_FORCE_TXN_BULK 0x00000800 /* Txns use bulk mode-for testing */
+#define ENV_REMEMBER_PANIC 0x00001000 /* Panic was on during cleanup. */
+#define ENV_FORCESYNCENV 0x00002000 /* Force msync on closing. */
u_int32_t flags;
};
@@ -1106,7 +1148,6 @@ typedef struct __dbpginfo {
@db_int_def@
#include "dbinc/globals.h"
-#include "dbinc/clock.h"
#include "dbinc/debug.h"
#include "dbinc/region.h"
#include "dbinc_auto/env_ext.h"
@@ -1118,6 +1159,7 @@ typedef struct __dbpginfo {
#include "dbinc/os.h"
#include "dbinc_auto/clib_ext.h"
#include "dbinc_auto/common_ext.h"
+#include "dbinc_auto/blob_ext.h"
/*******************************************************
* Remaining Log.
diff --git a/src/dbinc/db_join.h b/src/dbinc/db_join.h
index aecf059a..8f22adcb 100644
--- a/src/dbinc/db_join.h
+++ b/src/dbinc/db_join.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1998, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/dbinc/db_page.h b/src/dbinc/db_page.h
index 2d4de2e5..4694c4cf 100644
--- a/src/dbinc/db_page.h
+++ b/src/dbinc/db_page.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -93,6 +93,7 @@ typedef struct _dbmeta33 {
u_int8_t uid[DB_FILE_ID_LEN];
} DBMETA33, DBMETA;
+
/************************************************************************
BTREE METADATA PAGE LAYOUT
************************************************************************/
@@ -113,7 +114,13 @@ typedef struct _btmeta33 {
u_int32_t re_len; /* 80-83: Recno: fixed-length record length. */
u_int32_t re_pad; /* 84-87: Recno: fixed-length record pad. */
u_int32_t root; /* 88-91: Root page. */
- u_int32_t unused2[92]; /* 92-459: Unused space. */
+ u_int32_t blob_threshold;
+ /* 92-95: Minimum blob file size. */
+ u_int32_t blob_file_lo; /* 96-99: Blob file dir id lo. */
+ u_int32_t blob_file_hi; /* 100-103: Blob file dir id hi. */
+ u_int32_t blob_sdb_lo; /* 104-107: Blob sdb dir id lo */
+ u_int32_t blob_sdb_hi; /* 108-111: Blob sdb dir id hi */
+ u_int32_t unused2[87]; /* 112-459: Unused space. */
u_int32_t crypto_magic; /* 460-463: Crypto magic number */
u_int32_t trash[3]; /* 464-475: Trash space - Do not use */
u_int8_t iv[DB_IV_BYTES]; /* 476-495: Crypto IV */
@@ -142,7 +149,13 @@ typedef struct _hashmeta33 {
#define NCACHED 32 /* number of spare points */
/* 96-223: Spare pages for overflow */
u_int32_t spares[NCACHED];
- u_int32_t unused[59]; /* 224-459: Unused space */
+ u_int32_t blob_threshold;
+ /* 224-227: Minimum blob file size. */
+ u_int32_t blob_file_lo; /* 228-231: Blob file dir id lo. */
+ u_int32_t blob_file_hi; /* 232-235: Blob file dir id hi. */
+ u_int32_t blob_sdb_lo; /* 236-239: Blob sdb dir id lo. */
+ u_int32_t blob_sdb_hi; /* 240-243: Blob sdb dir id hi. */
+ u_int32_t unused[54]; /* 244-459: Unused space */
u_int32_t crypto_magic; /* 460-463: Crypto magic number */
u_int32_t trash[3]; /* 464-475: Trash space - Do not use */
u_int8_t iv[DB_IV_BYTES]; /* 476-495: Crypto IV */
@@ -168,7 +181,10 @@ typedef struct _heapmeta {
u_int32_t gbytes; /* 80-83: GBytes for fixed size heap. */
u_int32_t bytes; /* 84-87: Bytes for fixed size heap. */
u_int32_t region_size; /* 88-91: Max region size. */
- u_int32_t unused2[92]; /* 92-459: Unused space.*/
+ u_int32_t blob_threshold; /* 92-95: Minimum blob file size. */
+ u_int32_t blob_file_lo; /* 96-99: Blob file dir id lo. */
+ u_int32_t blob_file_hi; /* 100-103: Blob file dir id hi. */
+ u_int32_t unused2[89]; /* 104-459: Unused space.*/
u_int32_t crypto_magic; /* 460-463: Crypto magic number */
u_int32_t trash[3]; /* 464-475: Trash space - Do not use */
u_int8_t iv[DB_IV_BYTES]; /* 476-495: Crypto IV */
@@ -371,6 +387,7 @@ typedef struct __heaphdr {
#define HEAP_RECSPLIT 0x01 /* Heap data record is split */
#define HEAP_RECFIRST 0x02 /* First piece of a split record */
#define HEAP_RECLAST 0x04 /* Last piece of a split record */
+#define HEAP_RECBLOB 0x08 /* Record refers to a blob */
u_int8_t flags; /* 00: Flags describing record. */
u_int8_t unused; /* 01: Padding. */
u_int16_t size; /* 02-03: The size of the stored data piece. */
@@ -384,8 +401,35 @@ typedef struct __heaphdrsplt {
u_int16_t unused; /* 14-15: Padding. */
} HEAPSPLITHDR;
+/*
+ * HEAPBLOB, the blob database record for heap.
+ * Saving bytes is not a concern for the blob record type - if too many
+ * fit onto a single page, then we're likely to introduce unnecessary
+ * contention for blobs. Using blobs implies storing large items, thus slightly
+ * more per-item overhead is acceptable.
+ * If this proves untrue, the crypto section of the record could be optional.
+ * encoding, lsn, encryption, and checksum fields are unused at the moment, but
+ * included to make adding those features easier.
+ */
+typedef struct _heapblob {
+ HEAPHDR std_hdr; /* 00-03: The standard data header */
+ u_int8_t encoding; /* 04: Encoding of blob file. */
+ u_int8_t unused[7]; /* 05-11: Padding, unused. */
+ u_int8_t chksum[DB_MAC_KEY]; /* 12-31: Checksum */
+ u_int8_t iv[DB_IV_BYTES]; /* 32-47: IV */
+ DB_LSN lsn; /* 48-55: LSN for blob file update. */
+ u_int64_t id; /* 56-63: Blob file identifier. */
+ u_int64_t size; /* 64-71: Blob file size. */
+ u_int64_t file_id; /* 72-79: File directory. */
+} HEAPBLOBHDR, HEAPBLOBHDR60P1;
+
#define HEAP_HDRSIZE(hdr) \
- (F_ISSET((hdr), HEAP_RECSPLIT) ? sizeof(HEAPSPLITHDR) : sizeof(HEAPHDR))
+ (F_ISSET((hdr), HEAP_RECSPLIT) ? sizeof(HEAPSPLITHDR) : \
+ sizeof(HEAPHDR))
+
+#define HEAPBLOBREC_SIZE (sizeof(HEAPBLOBHDR))
+#define HEAPBLOBREC_DSIZE (sizeof(HEAPBLOBHDR) - sizeof(HEAPHDR))
+#define HEAPBLOBREC_DATA(p) (((u_int8_t *)p) + sizeof(HEAPHDR))
#define HEAPPG_SZ(dbp) \
(F_ISSET((dbp), DB_AM_ENCRYPT) ? HEAPPG_SEC : \
@@ -441,12 +485,12 @@ typedef struct __heaphdrsplt {
/* Return the amount of free space on a heap data page. */
#define HEAP_FREESPACE(dbp, p) \
- (HOFFSET(p) - HEAPPG_SZ(dbp) - \
+ ((HOFFSET(p) - HEAPPG_SZ(dbp)) - \
(NUM_ENT(p) == 0 ? 0 : ((HEAP_HIGHINDX(p) + 1) * sizeof(db_indx_t))))
/* The maximum amount of data that can fit on an empty heap data page. */
#define HEAP_MAXDATASIZE(dbp) \
- ((dbp)->pgsize - HEAPPG_SZ(dbp) - sizeof(db_indx_t))
+ (((dbp)->pgsize - HEAPPG_SZ(dbp)) - sizeof(db_indx_t))
#define HEAP_FREEINDX(p) (((HEAPPG *)p)->free_indx)
#define HEAP_HIGHINDX(p) (((HEAPPG *)p)->high_indx)
@@ -549,9 +593,9 @@ typedef struct _qpage {
* The amount of overflow data stored on each page is stored in the
* hf_offset field.
*
- * The implementation reference counts overflow items as it's possible
- * for them to be promoted onto btree internal pages. The reference
- * count is stored in the entries field.
+ * Before 4.3 the implementation reference counted overflow items as it
+ * once was possible for them to be promoted onto btree internal pages.
+ * The reference count is stored in the entries field.
*/
#define OV_LEN(p) (((PAGE *)p)->hf_offset)
#define OV_REF(p) (((PAGE *)p)->entries)
@@ -571,6 +615,7 @@ typedef struct _qpage {
#define H_DUPLICATE 2 /* Duplicate key/data item. */
#define H_OFFPAGE 3 /* Overflow key/data item. */
#define H_OFFDUP 4 /* Overflow page of duplicates. */
+#define H_BLOB 5 /* Blob file data item. */
/*
* !!!
@@ -685,6 +730,78 @@ typedef struct _hoffdup {
*/
#define HOFFDUP_SIZE (sizeof(HOFFDUP))
+/*
+ * The fifth type is the H_BLOB, represented by the HBLOB structure.
+ * Saving bytes is not a concern for the blob record type - if too many
+ * fit onto a single page, then we're likely to introduce unnecessary
+ * contention for blobs. Using blobs implies storing large items, thus slightly
+ * more per-item overhead is acceptable.
+ * If this proves untrue, the crypto section of the record could be optional.
+ * encoding, encryption, and checksum fields are unused at the moment, but
+ * included to make adding those features easier.
+ */
+typedef struct _hblob {
+ u_int8_t type; /* 00: Page type and delete flag. */
+ u_int8_t encoding; /* 01: Encoding of blob file. */
+ u_int8_t unused[10]; /* 02-11: Padding, unused. */
+ u_int8_t chksum[DB_MAC_KEY]; /* 12-31: Checksum */
+ u_int8_t iv[DB_IV_BYTES]; /* 32-47: IV */
+ u_int64_t id; /* 48-55: Blob file identifier. */
+ u_int64_t size; /* 56-63: Blob file size. */
+ u_int64_t file_id; /* 64-71: File directory. */
+ u_int64_t sdb_id; /* 72-79: Subdb that owns this blob. */
+} HBLOB, HBLOB60P1;
+
+#define HBLOB_ID(p) (((u_int8_t *)p) + SSZ(HBLOB, id))
+#define HBLOB_FILE_ID(p) (((u_int8_t *)p) + SSZ(HBLOB, file_id))
+
+/*
+ * Store the u_int64_t blob size of record (p) into (o) as an off_t. Where
+ * off_t is narrower than 64 bits and the size exceeds INT_MAX, report the
+ * overflow through (e) and set (ret) to EINVAL; (o) then gets a truncated value.
+ */
+#define GET_BLOB_SIZE(e, p, o, ret) do { \
+ DB_ASSERT((e), sizeof(o) <= 8); \
+ if (sizeof(o) == 8) { \
+ (o) = (off_t)(p).size; \
+ } else { \
+ if ((p).size > INT_MAX) { \
+ __db_errx((e), DB_STR("0769", \
+ "Blob size overflow.")); \
+ (ret) = EINVAL; \
+ } \
+ (o) = (int32_t)(p).size; \
+ } \
+} while (0)
+
+#define SET_BLOB_FIELD(p, v, type, field) do { /* Copy (v), widened to 64 bits, into (p). */ \
+ u_int64_t _tmp; /* Underscore name avoids capturing a caller's "tmp". */ \
+ _tmp = (u_int64_t)(v); \
+ memcpy((u_int8_t *)(p) + SSZ(type, field), /* SSZ(type, field) bytes into (p). */ \
+ &_tmp, sizeof(u_int64_t)); \
+} while (0)
+
+#define SET_BLOB_ID(p, v, type) \
+ SET_BLOB_FIELD(p, v, type, id)
+
+#define SET_BLOB_SIZE(p, v, type) \
+ SET_BLOB_FIELD(p, v, type, size)
+
+#define SET_BLOB_FILE_ID(p, v, type) \
+ SET_BLOB_FIELD(p, v, type, file_id)
+
+#define SET_BLOB_SDB_ID(p, v, type) \
+ SET_BLOB_FIELD(p, v, type, sdb_id)
+
+/*
+ * Page space required to add a new HBLOB item to the page, with and
+ * without the index value.
+ */
+#define HBLOB_SIZE (sizeof(HBLOB))
+#define HBLOB_DSIZE (sizeof(HBLOB) - SSZA(HKEYDATA, data))
+#define HBLOB_PSIZE (HBLOB_SIZE + sizeof(db_indx_t))
+
+
/************************************************************************
BTREE PAGE LAYOUT
************************************************************************/
@@ -693,6 +810,7 @@ typedef struct _hoffdup {
#define B_KEYDATA 1 /* Key/data item. */
#define B_DUPLICATE 2 /* Duplicate key/data item. */
#define B_OVERFLOW 3 /* Overflow key/data item. */
+#define B_BLOB 4 /* Blob file key/data item. */
/*
* We have to store a deleted entry flag in the page. The reason is complex,
@@ -746,6 +864,32 @@ typedef struct _boverflow {
u_int32_t tlen; /* 08-11: Total length of item. */
} BOVERFLOW;
+/*
+ * The fourth type is the B_BLOB, represented by the BBLOB structure.
+ * Saving bytes is not a concern for the blob record type - if too many
+ * fit onto a single page, then we're likely to introduce unnecessary
+ * contention for blobs. Using blobs implies storing large items, thus slightly
+ * more per-item overhead is acceptable.
+ * The len field is set to BBLOB_DSIZE, so that a B_BLOB can be treated just
+ * like a B_KEYDATA for the purposes of moving items between or on a page.
+ * If this proves untrue, the crypto section of the record could be optional.
+ * encoding, encryption, and checksum fields are unused at the moment, but
+ * included to make adding those features easier.
+ */
+typedef struct _bblob {
+ db_indx_t len; /* 00-01: BBLOB_DSIZE. */
+ u_int8_t type; /* 02: Page type and delete flag. */
+ u_int8_t encoding; /* 03: Encoding of blob file. */
+ u_int8_t unused[8]; /* 04-11: Padding, unused. */
+ u_int8_t chksum[DB_MAC_KEY]; /* 12-31: Checksum */
+ u_int8_t iv[DB_IV_BYTES]; /* 32-47: IV */
+ u_int64_t id; /* 48-55: Blob file identifier. */
+ u_int64_t size; /* 56-63: Blob file size. */
+ u_int64_t file_id; /* 64-71: File directory. */
+ u_int64_t sdb_id; /* 72-79: Subdb that owns this blob. */
+} BBLOB, BBLOB60P1;
+#define BBLOB_DATA(p) ((u_int8_t *)((BKEYDATA *)p)->data)
+
/* Get a BOVERFLOW item for a specific index. */
#define GET_BOVERFLOW(dbp, pg, indx) \
((BOVERFLOW *)P_ENTRY(dbp, pg, indx))
@@ -759,13 +903,26 @@ typedef struct _boverflow {
#define BOVERFLOW_PSIZE \
(BOVERFLOW_SIZE + sizeof(db_indx_t))
+/*
+ * Page space required to add a new BBLOB item to the page, with and
+ * without the index value. BBLOB_DSIZE is used so that a B_BLOB item
+ * can be treated just like a B_KEYDATA for the purposes of moving items
+ * between or on a page, such as when doing compaction.
+ */
+#define BBLOB_SIZE \
+ ((u_int16_t)DB_ALIGN(sizeof(BBLOB), sizeof(u_int32_t)))
+#define BBLOB_DSIZE \
+ (BBLOB_SIZE - SSZA(BKEYDATA, data))
+#define BBLOB_PSIZE \
+ (BBLOB_SIZE + sizeof(db_indx_t))
+
#define BITEM_SIZE(bk) \
- (B_TYPE((bk)->type) != B_KEYDATA ? BOVERFLOW_SIZE : \
- BKEYDATA_SIZE((bk)->len))
+ (B_TYPE((bk)->type) == B_KEYDATA ? BKEYDATA_SIZE((bk)->len) : \
+ (B_TYPE((bk)->type) == B_BLOB ? BBLOB_SIZE : BOVERFLOW_SIZE))
#define BITEM_PSIZE(bk) \
- (B_TYPE((bk)->type) != B_KEYDATA ? BOVERFLOW_PSIZE : \
- BKEYDATA_PSIZE((bk)->len))
+ (B_TYPE((bk)->type) == B_KEYDATA ? BKEYDATA_PSIZE((bk)->len) : \
+ (B_TYPE((bk)->type) == B_BLOB ? BBLOB_PSIZE : BOVERFLOW_PSIZE))
/*
* Btree leaf and hash page layouts group indices in sets of two, one for the
diff --git a/src/dbinc/db_swap.h b/src/dbinc/db_swap.h
index 352ae227..06f4eb47 100644
--- a/src/dbinc/db_swap.h
+++ b/src/dbinc/db_swap.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright (c) 1990, 1993, 1994
@@ -51,15 +51,26 @@ extern "C" {
#define M_64_SWAP(a) { \
u_int64_t _tmp; \
_tmp = (u_int64_t)a; \
- ((u_int8_t *)&a)[0] = ((u_int8_t *)&_tmp)[7]; \
- ((u_int8_t *)&a)[1] = ((u_int8_t *)&_tmp)[6]; \
- ((u_int8_t *)&a)[2] = ((u_int8_t *)&_tmp)[5]; \
- ((u_int8_t *)&a)[3] = ((u_int8_t *)&_tmp)[4]; \
- ((u_int8_t *)&a)[4] = ((u_int8_t *)&_tmp)[3]; \
- ((u_int8_t *)&a)[5] = ((u_int8_t *)&_tmp)[2]; \
- ((u_int8_t *)&a)[6] = ((u_int8_t *)&_tmp)[1]; \
- ((u_int8_t *)&a)[7] = ((u_int8_t *)&_tmp)[0]; \
+ ((u_int8_t *)&(a))[0] = ((u_int8_t *)&_tmp)[7]; \
+ ((u_int8_t *)&(a))[1] = ((u_int8_t *)&_tmp)[6]; \
+ ((u_int8_t *)&(a))[2] = ((u_int8_t *)&_tmp)[5]; \
+ ((u_int8_t *)&(a))[3] = ((u_int8_t *)&_tmp)[4]; \
+ ((u_int8_t *)&(a))[4] = ((u_int8_t *)&_tmp)[3]; \
+ ((u_int8_t *)&(a))[5] = ((u_int8_t *)&_tmp)[2]; \
+ ((u_int8_t *)&(a))[6] = ((u_int8_t *)&_tmp)[1]; \
+ ((u_int8_t *)&(a))[7] = ((u_int8_t *)&_tmp)[0]; \
}
+#undef P_64_COPYSWAP
+#define P_64_COPYSWAP(a, b) do { /* Copy 8 bytes from (a) to (b), reversing their order. */ \
+ ((u_int8_t *)(b))[0] = ((u_int8_t *)(a))[7]; \
+ ((u_int8_t *)(b))[1] = ((u_int8_t *)(a))[6]; \
+ ((u_int8_t *)(b))[2] = ((u_int8_t *)(a))[5]; \
+ ((u_int8_t *)(b))[3] = ((u_int8_t *)(a))[4]; \
+ ((u_int8_t *)(b))[4] = ((u_int8_t *)(a))[3]; \
+ ((u_int8_t *)(b))[5] = ((u_int8_t *)(a))[2]; \
+ ((u_int8_t *)(b))[6] = ((u_int8_t *)(a))[1]; \
+ ((u_int8_t *)(b))[7] = ((u_int8_t *)(a))[0]; \
+} while (0)
#undef P_64_COPY
#define P_64_COPY(a, b) { \
((u_int8_t *)b)[0] = ((u_int8_t *)a)[0]; \
@@ -113,7 +124,7 @@ extern "C" {
P_32_COPYSWAP(&_tmp, a); \
} while (0)
#undef M_32_SWAP
-#define M_32_SWAP(a) P_32_SWAP(&a)
+#define M_32_SWAP(a) P_32_SWAP(&(a))
/*
* Little endian <==> big endian 16-bit swap macros.
@@ -139,8 +150,13 @@ extern "C" {
P_16_COPYSWAP(&_tmp, a); \
} while (0)
#undef M_16_SWAP
-#define M_16_SWAP(a) P_16_SWAP(&a)
+#define M_16_SWAP(a) P_16_SWAP(&(a))
+#undef SWAP64
+#define SWAP64(p) { \
+ P_64_SWAP(p); \
+ (p) += sizeof(u_int64_t); \
+}
#undef SWAP32
#define SWAP32(p) { \
P_32_SWAP(p); \
@@ -168,6 +184,25 @@ extern "C" {
P_32_SWAP(p); \
} while (0)
+#undef DB_NTOHLL_COPYIN
+#define DB_NTOHLL_COPYIN(env, i, p) do { \
+ u_int8_t *tmp; \
+ tmp = (u_int8_t *)&(i); \
+ if (F_ISSET(env, ENV_LITTLEENDIAN)) { \
+ tmp[7] = *p++; \
+ tmp[6] = *p++; \
+ tmp[5] = *p++; \
+ tmp[4] = *p++; \
+ tmp[3] = *p++; \
+ tmp[2] = *p++; \
+ tmp[1] = *p++; \
+ tmp[0] = *p++; \
+ } else { \
+ memcpy(&(i), p, sizeof(u_int64_t)); \
+ p = (u_int8_t *)p + sizeof(u_int64_t); \
+ } \
+} while (0)
+
#undef DB_NTOHL_COPYIN
#define DB_NTOHL_COPYIN(env, i, p) do { \
u_int8_t *tmp; \
@@ -178,7 +213,7 @@ extern "C" {
tmp[1] = *p++; \
tmp[0] = *p++; \
} else { \
- memcpy(&i, p, sizeof(u_int32_t)); \
+ memcpy(&(i), p, sizeof(u_int32_t)); \
p = (u_int8_t *)p + sizeof(u_int32_t); \
} \
} while (0)
@@ -191,11 +226,29 @@ extern "C" {
tmp[1] = *p++; \
tmp[0] = *p++; \
} else { \
- memcpy(&i, p, sizeof(u_int16_t)); \
+ memcpy(&(i), p, sizeof(u_int16_t)); \
p = (u_int8_t *)p + sizeof(u_int16_t); \
} \
} while (0)
+#undef DB_HTONLL_COPYOUT
+#define DB_HTONLL_COPYOUT(env, p, i) do { \
+ u_int8_t *tmp; \
+ tmp = (u_int8_t *)p; \
+ if (F_ISSET(env, ENV_LITTLEENDIAN)) { \
+ *tmp++ = ((u_int8_t *)&(i))[7]; \
+ *tmp++ = ((u_int8_t *)&(i))[6]; \
+ *tmp++ = ((u_int8_t *)&(i))[5]; \
+ *tmp++ = ((u_int8_t *)&(i))[4]; \
+ *tmp++ = ((u_int8_t *)&(i))[3]; \
+ *tmp++ = ((u_int8_t *)&(i))[2]; \
+ *tmp++ = ((u_int8_t *)&(i))[1]; \
+ *tmp++ = ((u_int8_t *)&(i))[0]; \
+ } else \
+ memcpy(p, &(i), sizeof(u_int64_t)); \
+ p = (u_int8_t *)p + sizeof(u_int64_t); \
+} while (0)
+
#undef DB_HTONL_COPYOUT
#define DB_HTONL_COPYOUT(env, p, i) do { \
u_int8_t *tmp; \
@@ -206,7 +259,7 @@ extern "C" {
*tmp++ = ((u_int8_t *)&(i))[1]; \
*tmp++ = ((u_int8_t *)&(i))[0]; \
} else \
- memcpy(p, &i, sizeof(u_int32_t)); \
+ memcpy(p, &(i), sizeof(u_int32_t)); \
p = (u_int8_t *)p + sizeof(u_int32_t); \
} while (0)
@@ -229,6 +282,13 @@ extern "C" {
*/
#define LOG_SWAPPED(env) !F_ISSET(env, ENV_LITTLEENDIAN)
+#define LOGCOPY_64(env, x, p) do { \
+ if (LOG_SWAPPED(env)) \
+ P_64_COPYSWAP((p), (x)); \
+ else \
+ memcpy((x), (p), sizeof(u_int64_t)); \
+} while (0)
+
#define LOGCOPY_32(env, x, p) do { \
if (LOG_SWAPPED(env)) \
P_32_COPYSWAP((p), (x)); \
diff --git a/src/dbinc/db_upgrade.h b/src/dbinc/db_upgrade.h
index 45fb624d..716594c9 100644
--- a/src/dbinc/db_upgrade.h
+++ b/src/dbinc/db_upgrade.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -242,6 +242,123 @@ typedef struct hashhdr { /* Disk resident portion */
*/
} HASHHDR;
+
+/************************************************************************
+ BLOB RECORD LAYOUTS
+ ************************************************************************/
+
+/*
+ * Hash BLOB record layout.
+ */
+typedef struct _hblob60 {
+ u_int8_t type; /* 00: Page type and delete flag. */
+ u_int8_t encoding; /* 01: Encoding of blob file. */
+ u_int8_t unused[2]; /* 02-03: Padding, unused. */
+ u_int32_t id_lo; /* 04-07: Blob file identifier. */
+ u_int32_t id_hi; /* 08-11: Blob file identifier. */
+ u_int32_t size_lo; /* 12-15: Blob file size. */
+ u_int32_t size_hi; /* 16-19: Blob file size. */
+ DB_LSN lsn; /* 20-27: LSN for blob file update. */
+ u_int8_t chksum[DB_MAC_KEY]; /* 28-47: Checksum */
+ u_int8_t iv[DB_IV_BYTES]; /* 48-63: IV */
+ u_int32_t file_id_lo; /* 64-67: File directory lo. */
+ u_int32_t file_id_hi; /* 68-71: File directory hi. */
+ u_int32_t sdb_id_lo; /* 72-75: Subdb that owns this blob. */
+ u_int32_t sdb_id_hi; /* 76-79: Subdb that owns this blob. */
+} HBLOB60;
+
+#define HBLOB60_SIZE (sizeof(HBLOB60))
+
+/*
+ * Btree BLOB record layout.
+ */
+typedef struct _bblob60 {
+ db_indx_t len; /* 00-01: BBLOB_DSIZE. */
+ u_int8_t type; /* 02: Page type and delete flag. */
+ u_int8_t encoding; /* 03: Encoding of blob file. */
+ u_int32_t id_lo; /* 04-07: Blob file identifier. */
+ u_int32_t id_hi; /* 08-11: Blob file identifier. */
+ u_int32_t size_lo; /* 12-15: Blob file size. */
+ u_int32_t size_hi; /* 16-19: Blob file size. */
+ DB_LSN lsn; /* 20-27: LSN for blob file update. */
+ u_int8_t chksum[DB_MAC_KEY]; /* 28-47: Checksum */
+ u_int8_t iv[DB_IV_BYTES]; /* 48-63: IV */
+ u_int32_t file_id_lo; /* 64-67: File directory lo. */
+ u_int32_t file_id_hi; /* 68-71: File directory hi. */
+ u_int32_t sdb_id_lo; /* 72-75: Subdb that owns this blob. */
+ u_int32_t sdb_id_hi; /* 76-79: Subdb that owns this blob. */
+} BBLOB60;
+
+#define BBLOB60_SIZE \
+ ((u_int16_t)DB_ALIGN(sizeof(BBLOB60), sizeof(u_int32_t)))
+/*
+ * Heap BLOB record layout.
+ */
+typedef struct _heapblob60 {
+ u_int8_t flags; /* 00: Flags describing record. */
+ u_int8_t unused; /* 01: Padding. */
+ u_int16_t size; /* 02-03: The size of the stored data piece. */
+ u_int8_t encoding; /* 04: Encoding of blob file. */
+ u_int8_t unused2[3]; /* 05-07: Padding, unused. */
+ u_int32_t id_lo; /* 08-11: Blob file identifier. */
+ u_int32_t id_hi; /* 12-15: Blob file identifier. */
+ u_int32_t size_lo; /* 16-19: Blob file size. */
+ u_int32_t size_hi; /* 20-23: Blob file size. */
+ u_int8_t unused3[4]; /* 24-27: Padding, unused. */
+ u_int8_t chksum[DB_MAC_KEY]; /* 28-47: Checksum */
+ u_int8_t iv[DB_IV_BYTES]; /* 48-63: IV */
+ DB_LSN lsn; /* 64-71: LSN for blob file update. */
+ u_int32_t file_id_lo; /* 72-75: File directory lo. */
+ u_int32_t file_id_hi; /* 76-79: File directory hi. */
+} HEAPBLOBHDR60;
+
+#define HEAPBLOBREC60_SIZE (sizeof(HEAPBLOBHDR60))
+
+#define GET_BLOB60_FILE_ID(e, p, o, ret) \
+ GET_LO_HI(e, (p)->file_id_lo, (p)->file_id_hi, o, ret);
+
+#define GET_BLOB60_SDB_ID(e, p, o, ret) \
+ GET_LO_HI(e, (p)->sdb_id_lo, (p)->sdb_id_hi, o, ret);
+
+/* Join id_hi/id_lo of (p) into (o); if (o) is not 8 bytes and id_hi != 0, set (ret) to EINVAL. */
+#define GET_BLOB60_ID(e, p, o, ret) do { \
+ DB_ASSERT((e), sizeof(o) <= 8); \
+ if (sizeof(o) == 8) { \
+ (o) = (p).id_hi; \
+ (o) = (o) << 32; \
+ (o) += (p).id_lo; \
+ } else { \
+ if ((p).id_hi > 0) { \
+ __db_errx((e), DB_STR("0766", \
+ "Blob identifier overflow.")); \
+ (ret) = EINVAL; \
+ } \
+ (o) = (p).id_lo; \
+ } \
+} while (0)
+
+/* Join size_hi/size_lo of (p) into (o), e.g. an off_t; on overflow set (ret) to EINVAL. */
+#define GET_BLOB60_SIZE(e, p, o, ret) do { \
+ DB_ASSERT((e), sizeof(o) <= 8); \
+ if (sizeof(o) == 8) { \
+ (o) = (p).size_hi; \
+ (o) = (o) << 32; \
+ (o) += (p).size_lo; \
+ } else { \
+ if ((p).size_hi > 0) { \
+ __db_errx((e), DB_STR("0767", \
+ "Blob size overflow.")); \
+ (ret) = EINVAL; \
+ } \
+ if ((p).size_lo > INT_MAX) { \
+ __db_errx((e), DB_STR("0768", \
+ "Blob size overflow.")); \
+ (ret) = EINVAL; \
+ } \
+ (o) = (int32_t)(p).size_lo; \
+ } \
+} while (0)
+
#if defined(__cplusplus)
}
#endif
diff --git a/src/dbinc/db_verify.h b/src/dbinc/db_verify.h
index 68acbf6c..ea87680f 100644
--- a/src/dbinc/db_verify.h
+++ b/src/dbinc/db_verify.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -120,9 +120,10 @@ struct __vrfy_dbinfo {
#define SALVAGE_PRINTABLE 0x01 /* Output printable chars literally. */
#define SALVAGE_PRINTHEADER 0x02 /* Print the unknown-key header. */
#define SALVAGE_PRINTFOOTER 0x04 /* Print the unknown-key footer. */
-#define SALVAGE_HASSUBDBS 0x08 /* There are subdatabases to salvage. */
-#define VRFY_LEAFCHAIN_BROKEN 0x10 /* Lost one or more Btree leaf pgs. */
-#define VRFY_QMETA_SET 0x20 /* We've seen a QUEUE meta page and
+#define SALVAGE_STREAM_BLOB 0x08 /* Currently streaming a blob. */
+#define SALVAGE_HASSUBDBS 0x10 /* There are subdatabases to salvage. */
+#define SALVAGE_LEAFCHAIN_BROKEN 0x20 /* Lost one or more Btree leaf pgs. */
+#define SALVAGE_QMETA_SET 0x40 /* We've seen a QUEUE meta page and
set things up for it. */
u_int32_t flags;
}; /* VRFY_DBINFO */
diff --git a/src/dbinc/debug.h b/src/dbinc/debug.h
index a8da000d..5388b791 100644
--- a/src/dbinc/debug.h
+++ b/src/dbinc/debug.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1998, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -36,7 +36,13 @@ extern "C" {
#define DB_ASSERT(env, e) \
((e) ? (void)0 : __db_assert(env, #e, __FILE__, __LINE__))
#else
-#define DB_ASSERT(env, e) NOP_STATEMENT
+#define DB_ASSERT(env, e) ((void)0)
+#endif
+
+#if defined(HAVE_ERROR_HISTORY)
+#define DB_DEBUG_MSG __db_debug_msg
+#else
+#define DB_DEBUG_MSG if (0) __db_debug_msg
#endif
/*
@@ -55,10 +61,11 @@ extern "C" {
* of structure fields whose only purpose is padding, as well as when heap
* memory that was never initialized is written to disk.
*/
+#define UMRW_SET(var) UMRW_SET_VALUE((var), 0)
#ifdef UMRW
-#define UMRW_SET(v) (v) = 0
+#define UMRW_SET_VALUE(var, value) (var) = (value)
#else
-#define UMRW_SET(v) NOP_STATEMENT
+#define UMRW_SET_VALUE(var, value) NOP_STATEMENT
#endif
/*
@@ -73,6 +80,34 @@ typedef enum {
} db_error_set_t;
/*
+ * Use these macros wherever an error condition is initially noticed, e.g., when
+ * setting a value to any of the user visible error return codes, whether
+ * defined by Berkeley DB or by the operating environment (EINVAL). They save
+ * the specific source of an instance of an error code, including the
+ * time, stack, db name, current LSN, etc. If the error turns out to be
+ * important, the deferred message text is added to the text produced by
+ * __db_err(), __db_errx(), and __db_syserr(). The additional information can be
+ * useful for diagnosing the behavior of applications under error conditions.
+ * It is enabled by configuring with --enable-error_history. The current
+ * implementation requires pthreads' version of thread local storage.
+ */
+#ifdef HAVE_ERROR_HISTORY
+#define USR_ERR(env, errcode) __db_diags((env), (errcode))
+#define DBC_ERR(dbc, errcode) __dbc_diags((dbc), (errcode))
+#define MUTEX_ERR(env, mutex, errcode) __mutex_diags((env), (mutex), (errcode))
+#define DISCARD_HISTORY(env) __db_deferred_discard()
+/* Save at most 10KB of error history in an API call. Adjust this as desired. */
+#define DB_ERROR_HISTORY_SIZE (10 * 1024)
+#else
+#define USR_ERR(env, errcode) (errcode)
+#define DBC_ERR(dbc, errcode) (errcode)
+#define MUTEX_ERR(env, mutex, errcode) (errcode)
+#define DISCARD_HISTORY(env) NOP_STATEMENT
+/* No space is needed when error history is disabled. */
+#define DB_ERROR_HISTORY_SIZE 0
+#endif
+
+/*
* Message handling. Use a macro instead of a function because va_list
* references to variadic arguments cannot be reset to the beginning of the
* variadic argument list (and then rescanned), by functions other than the
@@ -102,6 +137,7 @@ typedef enum {
((app_call) || F_ISSET((dbenv)->env, ENV_NO_OUTPUT_SET)))) \
__db_errfile(dbenv, error, error_set, fmt, __ap); \
va_end(__ap); \
+ DISCARD_HISTORY((dbenv)->env); \
}
#else
#define DB_REAL_ERR(dbenv, error, error_set, app_call, fmt) { \
@@ -127,6 +163,7 @@ typedef enum {
((app_call) || F_ISSET((dbenv)->env, ENV_NO_OUTPUT_SET)))) \
__db_errfile(env, error, error_set, fmt, __ap); \
va_end(__ap); \
+ DISCARD_HISTORY(env); \
}
#endif
#if defined(STDC_HEADERS) || defined(__cplusplus)
@@ -192,7 +229,7 @@ typedef enum {
#define LOG_OP(C, T, O, K, A, F) { \
DB_LSN __lsn; \
DBT __op; \
- if (DBC_LOGGING((C))) { \
+ if ((C)->dbp->log_filename != NULL && DBC_LOGGING((C))) { \
memset(&__op, 0, sizeof(__op)); \
__op.data = O; \
__op.size = (u_int32_t)strlen(O) + 1; \
diff --git a/src/dbinc/fop.h b/src/dbinc/fop.h
index 94f27f9f..7ea62023 100644
--- a/src/dbinc/fop.h
+++ b/src/dbinc/fop.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -23,6 +23,20 @@ extern "C" {
(void)__memp_set_flags((D)->mpf, DB_MPOOL_NOFILE, 0); \
} while (0)
+/*
+ * Never change the value of DB_FOP_CREATE (0x00000002),
+ * DB_FOP_APPEND (0x00000001), and DB_FOP_REDO(0x00000008),
+ * as those values are used in write_file logs.
+ */
+#define DB_FOP_APPEND 0x00000001 /* Appending to a file. */
+#define DB_FOP_CREATE 0x00000002 /* Creating the file. */
+#define DB_FOP_PARTIAL_LOG 0x00000004 /* Partial logging of file data. */
+#define DB_FOP_REDO 0x00000008 /* File operation can be redone. */
+#define DB_FOP_READONLY 0x00000010 /* File is read only. */
+#define DB_FOP_WRITE 0x00000020 /* File is writeable. */
+#define DB_FOP_SYNC_WRITE 0x00000040 /* Sync file on each write. */
+
+
#include "dbinc_auto/fileops_auto.h"
#include "dbinc_auto/fileops_ext.h"
diff --git a/src/dbinc/globals.h b/src/dbinc/globals.h
index 95e5c118..becd6365 100644
--- a/src/dbinc/globals.h
+++ b/src/dbinc/globals.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -52,21 +52,27 @@ typedef struct __db_globals {
char error_buf[40]; /* Error string buffer. */
- int uid_init; /* srand set in UID generator */
+ int random_seeded; /* Has __os_srandom been called? */
- u_long rand_next; /* rand/srand value */
+#if defined(HAVE_RANDOM_R)
+ struct random_data random_data; /* srandom_r/random_r argument */
+ char random_state[64]; /* random number state */
+#elif !defined(HAVE_RAND) && !defined(HAVE_RANDOM)
+ u_long rand_next; /* next rand value for clib/rand.c */
+#endif
u_int32_t fid_serial; /* file id counter */
int db_errno; /* Errno value if not available */
- size_t num_active_pids; /* number of entries in active_pids */
-
- size_t size_active_pids; /* allocated size of active_pids */
+ char *saved_errstr; /* saved error string from backup */
- pid_t *active_pids; /* array active pids */
+ char *time_format; /* strftime-format for printing dates */
- char *saved_errstr; /* saved error string from backup */
+#if defined(HAVE_ERROR_HISTORY) && defined(HAVE_PTHREAD_SELF)
+ pthread_key_t msgs_key;
+ pthread_once_t thread_once;
+#endif
/* Underlying OS interface jump table.*/
void (*j_assert) __P((const char *, const char *, int));
diff --git a/src/dbinc/hash.h b/src/dbinc/hash.h
index f485128a..55a64f87 100644
--- a/src/dbinc/hash.h
+++ b/src/dbinc/hash.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright (c) 1990, 1993, 1994
@@ -56,7 +56,7 @@ typedef struct hash_t {
u_int32_t h_nelem; /* Number of elements. */
/* Hash and compare functions. */
u_int32_t (*h_hash) __P((DB *, const void *, u_int32_t));
- int (*h_compare) __P((DB *, const DBT *, const DBT *));
+ int (*h_compare) __P((DB *, const DBT *, const DBT *, size_t *));
} HASH;
/* Cursor structure definitions. */
diff --git a/src/dbinc/heap.h b/src/dbinc/heap.h
index ca3407e0..bb96ebec 100644
--- a/src/dbinc/heap.h
+++ b/src/dbinc/heap.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2010, 2015 Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _DB_HEAP_H_
@@ -26,7 +26,8 @@ struct __heap { /* Heap access method. */
db_pgno_t curregion; /* The region of the next insert. */
db_pgno_t maxpgno; /* Maximum page number of a fixed size heap. */
- int curpgindx; /* The last used offset in the region's space bitmap. */
+ u_int32_t curpgindx; /* The last used offset in the
+ * region's space bitmap. */
};
struct __heap_cursor {
diff --git a/src/dbinc/hmac.h b/src/dbinc/hmac.h
index 2a495b17..f87965eb 100644
--- a/src/dbinc/hmac.h
+++ b/src/dbinc/hmac.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/dbinc/lock.h b/src/dbinc/lock.h
index eab51832..298b8527 100644
--- a/src/dbinc/lock.h
+++ b/src/dbinc/lock.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -37,7 +37,10 @@ extern "C" {
*/
#define LOCK_INVALID INVALID_ROFF
#define LOCK_ISSET(lock) ((lock).off != LOCK_INVALID)
-#define LOCK_INIT(lock) ((lock).off = LOCK_INVALID)
+#define LOCK_INIT(lock) do { \
+ (lock).off = LOCK_INVALID; \
+ UMRW_SET_VALUE((lock).mode, DB_LOCK_NG); \
+} while(0)
/*
* Macro to identify a write lock for the purpose of counting locks
@@ -66,8 +69,8 @@ extern "C" {
typedef struct __db_lockregion { /* SHARED */
db_mutex_t mtx_region; /* Region mutex. */
- u_int32_t need_dd; /* flag for deadlock detector */
- u_int32_t detect; /* run dd on every conflict */
+ u_int32_t need_dd; /* flag for deadlock detector */
+ u_int32_t detect; /* run dd on every conflict */
db_timespec next_timeout; /* next time to expire a lock */
db_mutex_t mtx_dd; /* mutex for lock object dd list. */
db_mutex_t mtx_lockers; /* mutex for locker allocation. */
@@ -92,7 +95,7 @@ typedef struct __db_lockregion { /* SHARED */
u_int32_t lock_id; /* Current lock(er) id to allocate. */
u_int32_t cur_maxid; /* Current max lock(er) id. */
- u_int32_t nlockers; /* Current number of lockers. */
+ u_int32_t nlockers; /* Current number of locker ids. */
int32_t nmodes; /* Number of modes in conflict table. */
DB_LOCK_STAT stat; /* stats about locking. */
} DB_LOCKREGION;
@@ -157,12 +160,16 @@ struct __db_locker { /* SHARED */
db_timespec lk_expire; /* When current lock expires. */
db_timespec tx_expire; /* When this txn expires. */
db_timeout_t lk_timeout; /* How long do we let locks live. */
+#ifdef DIAGNOSTIC
+ roff_t prev_locker; /* The thread's previous dbth_locker. */
+#endif
#define DB_LOCKER_DIRTY 0x0001 /* Has write locks. */
#define DB_LOCKER_INABORT 0x0002 /* Is aborting, don't abort again. */
#define DB_LOCKER_TIMEOUT 0x0004 /* Has timeout set. */
#define DB_LOCKER_FAMILY_LOCKER 0x0008 /* Part of a family of lockers. */
#define DB_LOCKER_HANDLE_LOCKER 0x0010 /* Not associated with a thread. */
+#define DB_LOCKER_FREE 0x0020 /* Diag: it is on the free list. */
u_int32_t flags;
};
diff --git a/src/dbinc/log.h b/src/dbinc/log.h
index c4dea6fc..2e2929f0 100644
--- a/src/dbinc/log.h
+++ b/src/dbinc/log.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -55,6 +55,8 @@ struct __fname {
/* number of txn referencing + 1 for the db handle. */
u_int32_t txn_ref;
+ db_seq_t blob_file_id; /* BLOB file directory id. */
+
#define DB_FNAME_CLOSED 0x01 /* DBP was closed. */
#define DB_FNAME_DURABLE 0x02 /* File is durable. */
#define DB_FNAME_INMEM 0x04 /* File is in memory. */
@@ -137,16 +139,18 @@ struct __db_log {
ENV *env; /* Environment */
REGINFO reginfo; /* Region information. */
-#define DBLOG_AUTOREMOVE 0x01 /* Autoremove log files. */
-#define DBLOG_DIRECT 0x02 /* Do direct I/O on the log. */
-#define DBLOG_DSYNC 0x04 /* Set OS_DSYNC on the log. */
-#define DBLOG_FORCE_OPEN 0x08 /* Force the DB open even if it appears
+#define DBLOG_AUTOREMOVE 0x001 /* Autoremove log files. */
+#define DBLOG_BLOB 0x002 /* Full logging of blob data. */
+#define DBLOG_DIRECT 0x004 /* Do direct I/O on the log. */
+#define DBLOG_DSYNC 0x008 /* Set OS_DSYNC on the log. */
+#define DBLOG_FORCE_OPEN 0x010 /* Force the DB open even if it appears
* to be deleted. */
-#define DBLOG_INMEMORY 0x10 /* Logging is in memory. */
-#define DBLOG_OPENFILES 0x20 /* Prepared files need to be open. */
-#define DBLOG_RECOVER 0x40 /* We are in recovery. */
-#define DBLOG_ZERO 0x80 /* Zero fill the log. */
-#define DBLOG_VERIFYING 0x100 /* The log is being verified. */
+#define DBLOG_INMEMORY 0x020 /* Logging is in memory. */
+#define DBLOG_NOSYNC 0x040 /* Don't sync log files during flush. */
+#define DBLOG_OPENFILES 0x080 /* Prepared files need to be open. */
+#define DBLOG_RECOVER 0x100 /* We are in recovery. */
+#define DBLOG_ZERO 0x200 /* Zero fill the log. */
+#define DBLOG_VERIFYING 0x400 /* The log is being verified. */
u_int32_t flags;
};
@@ -251,7 +255,8 @@ struct __log { /* SHARED */
* rather than by the region mutex.
*/
db_mutex_t mtx_flush; /* Mutex guarding flushing. */
- int32_t in_flush; /* Log flush in progress. */
+ int32_t in_flush; /* Log flush in progress. */
+ int32_t nosync; /* log_set_config(DB_LOG_NOSYNC) */
DB_LSN s_lsn; /* LSN of the last sync. */
DB_LOG_STAT stat; /* Log statistics. */
diff --git a/src/dbinc/log_verify.h b/src/dbinc/log_verify.h
index fa90ace4..ec43c4d7 100644
--- a/src/dbinc/log_verify.h
+++ b/src/dbinc/log_verify.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/dbinc/mp.h b/src/dbinc/mp.h
index 9a10c6d9..598ca366 100644
--- a/src/dbinc/mp.h
+++ b/src/dbinc/mp.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -226,10 +226,15 @@ struct __mpool { /* SHARED */
#define DB_MEMP_SYNC_INTERRUPT 0x02
u_int32_t config_flags;
- /* Free frozen buffer headers, protected by the region lock. */
+ /* These MVCC fields are protected by the mpool region lock. */
+
+ /* This is the free list of BH_FROZEN_PAGEs, the frozen headers. */
SH_TAILQ_HEAD(__free_frozen) free_frozen;
- /* Allocated blocks of frozen buffer headers. */
+ /*
+ * This list of BH_FROZEN_ALLOCs contains all the BH_FROZEN_PAGEs,
+ * whether they are in free_frozen or busy (in a bh.vc version chain).
+ */
SH_TAILQ_HEAD(__alloc_frozen) alloc_frozen;
};
@@ -550,9 +555,10 @@ struct __bh { /* SHARED */
#define BH_FROZEN 0x040 /* Frozen buffer: allocate & re-read. */
#define BH_TRASH 0x080 /* Page is garbage. */
#define BH_THAWED 0x100 /* Page was thawed. */
+#define BH_UNREACHABLE 0x200 /* Discard this defunct MVCC version. */
u_int16_t flags;
- u_int32_t priority; /* Priority. */
+ u_int32_t priority; /* Cache priority. */
SH_TAILQ_ENTRY hq; /* MPOOL hash bucket queue. */
db_pgno_t pgno; /* Underlying MPOOLFILE page number. */
@@ -587,9 +593,12 @@ struct __bh_frozen_p {
/*
* BH_FROZEN_ALLOC --
- * Frozen buffer headers are allocated a page at a time in general. This
- * structure is allocated at the beginning of the page so that the
- * allocation chunks can be tracked and freed (for private environments).
+ * This structure is the container for one or more frozen buffer headers.
+ * Blocks of BH_FROZEN_PAGE structs are usually allocated a page at a time,
+ * though when an mpool is nearly full and a whole page isn't available
+ * there can be single-item blocks. BH_FROZEN_ALLOC is the block header
+ * allocated at the beginning of the chunk and is linked to the mpool's
+ * alloc_frozen so that the allocation chunks can be tracked and freed.
*/
struct __bh_frozen_a {
SH_TAILQ_ENTRY links;
@@ -602,33 +611,36 @@ struct __bh_frozen_a {
(F_ISSET(PAGE_TO_BH(p), BH_DIRTY|BH_EXCLUSIVE) == (BH_DIRTY|BH_EXCLUSIVE))
#define BH_OWNER(env, bhp) \
- ((TXN_DETAIL *)R_ADDR(&env->tx_handle->reginfo, bhp->td_off))
+ ((TXN_DETAIL *)R_ADDR(&(env)->tx_handle->reginfo, (bhp)->td_off))
#define BH_OWNED_BY(env, bhp, txn) ((txn) != NULL && \
- (bhp)->td_off != INVALID_ROFF && \
- (txn)->td == BH_OWNER(env, bhp))
+ (bhp)->td_off != INVALID_ROFF && (txn)->td == BH_OWNER(env, bhp))
-#define VISIBLE_LSN(env, bhp) \
- (&BH_OWNER(env, bhp)->visible_lsn)
+#define VISIBLE_LSN(env, bhp) (&BH_OWNER(env, bhp)->visible_lsn)
/*
- * Make a copy of the buffer's visible LSN, one field at a time. We rely on the
- * 32-bit operations being atomic. The visible_lsn starts at MAX_LSN and is
- * set during commit or abort to the current LSN.
+ * MVCC Versions are visible only to snapshot transactions whose read_lsn is at
+ * least as recent (large) as the buffer's lsn. Visibility checks must be made
+ * from newest to oldest along bhp.vc, stopping at the first visible one.
+ * Unversioned buffers (those with invalid td_off) are always visible.
+ *
+ * BH_VISIBLE() makes a copy of the buffer's visible LSN, one field at a time.
+ * We rely on the 32-bit operations being atomic. The visible_lsn starts at
+ * MAX_LSN and is set during commit or abort to the current LSN.
*
- * If we race with a commit / abort, we may see either the file or the offset
+ * If we race with a commit or abort, we may see either the file or the offset
* still at UINT32_MAX, so vlsn is guaranteed to be in the future. That's OK,
* since we had to take the log region lock to allocate the read LSN so we were
* never going to see this buffer anyway.
*/
#define BH_VISIBLE(env, bhp, read_lsnp, vlsn) \
(bhp->td_off == INVALID_ROFF || \
- ((vlsn).file = VISIBLE_LSN(env, bhp)->file, \
+ ((vlsn).file = VISIBLE_LSN(env, bhp)->file, \
(vlsn).offset = VISIBLE_LSN(env, bhp)->offset, \
LOG_COMPARE((read_lsnp), &(vlsn)) >= 0))
#define BH_OBSOLETE(bhp, old_lsn, vlsn) (SH_CHAIN_HASNEXT(bhp, vc) ? \
- BH_VISIBLE(env, SH_CHAIN_NEXTP(bhp, vc, __bh), &(old_lsn), vlsn) :\
+ BH_VISIBLE(env, SH_CHAIN_NEXTP(bhp, vc, __bh), &(old_lsn), vlsn) : \
BH_VISIBLE(env, bhp, &(old_lsn), vlsn))
#define MVCC_SKIP_CURADJ(dbc, pgno) (dbc->txn != NULL && \
diff --git a/src/dbinc/mutex.h b/src/dbinc/mutex.h
index b699142c..334d8f96 100644
--- a/src/dbinc/mutex.h
+++ b/src/dbinc/mutex.h
@@ -1,7 +1,7 @@
/*
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -24,10 +24,14 @@ extern "C" {
#endif
/*
- * By default, spin 50 times per processor if fail to acquire a test-and-set
- * mutex, we have anecdotal evidence it's a reasonable value.
+ * These specify the default spin parameters for test-and-set mutexes. A single
+ * processor system spins just once, a multiprocessor system spins 50 times per
+ * processor up to a default maximum of 200. This limit reduces excessive
+ * busy-waiting on machines with many hyperthreads. We have anecdotal evidence
+ * that these are reasonable default values.
*/
#define MUTEX_SPINS_PER_PROCESSOR 50
+#define MUTEX_SPINS_DEFAULT_MAX 200
/*
* Mutexes are represented by unsigned, 32-bit integral values. As the
@@ -163,13 +167,6 @@ static inline int __db_pthread_mutex_tryreadlock(ENV *env, db_mutex_t mutex)
#define __mutex_rdlock(a, b) __db_win32_mutex_readlock(a, b)
#define __mutex_tryrdlock(a, b) __db_win32_mutex_tryreadlock(a, b)
#endif
-#elif defined(HAVE_MUTEX_FCNTL)
-#define __mutex_init(a, b, c) __db_fcntl_mutex_init(a, b, c)
-#define __mutex_lock(a, b) __db_fcntl_mutex_lock(a, b, 0)
-#define __mutex_timedlock(a, b, c) __db_fcntl_lock(a, b, c)
-#define __mutex_trylock(a, b) __db_fcntl_mutex_trylock(a, b)
-#define __mutex_unlock(a, b) __db_fcntl_mutex_unlock(a, b)
-#define __mutex_destroy(a, b) __db_fcntl_mutex_destroy(a, b)
#else
#define __mutex_init(a, b, c) __db_tas_mutex_init(a, b, c)
#define __mutex_lock(a, b) __db_tas_mutex_lock(a, b, 0)
@@ -184,9 +181,8 @@ static inline int __db_pthread_mutex_tryreadlock(ENV *env, db_mutex_t mutex)
#endif
/*
- * When there is no method to get a shared latch, fall back to
- * implementing __mutex_rdlock() as getting an exclusive one.
- * This occurs either when !HAVE_SHARED_LATCHES or HAVE_MUTEX_FCNTL.
+ * When there is no method to get a shared latch, fall back to implementing
+ * __mutex_rdlock() as an exclusive one. This may no longer be supported?
*/
#ifndef __mutex_rdlock
#define __mutex_rdlock(a, b) __mutex_lock(a, b)
@@ -199,17 +195,25 @@ static inline int __db_pthread_mutex_tryreadlock(ENV *env, db_mutex_t mutex)
* Lock/unlock a mutex. If the mutex was never required, the thread of
* control can proceed without it.
*
- * We never fail to acquire or release a mutex without panicing. Simplify
+ * We rarely fail to acquire or release a mutex without panicking. Simplify
* the macros to always return a panic value rather than saving the actual
- * return value of the mutex routine.
+ * return value of the mutex routine. Use MUTEX_LOCK_RET() when the caller has
+ * a code path for a mutex failure, e.g., when cleaning up after a panic.
*/
#ifdef HAVE_MUTEX_SUPPORT
#define MUTEX_LOCK(env, mutex) do { \
- if ((mutex) != MUTEX_INVALID && \
- __mutex_lock(env, mutex) != 0) \
+ if ((mutex) != MUTEX_INVALID && __mutex_lock(env, mutex) != 0) \
return (DB_RUNRECOVERY); \
} while (0)
+#define MUTEX_LOCK_RET(env, mutex) \
+ ((mutex) == MUTEX_INVALID ? 0 : __mutex_lock(env, mutex))
+
+/*
+ * Always check the return value of MUTEX_TRYLOCK()! Expect 0 on success,
+ * or possibly DB_RUNRECOVERY for failchk.
+ */
+
/*
* Always check the return value of MUTEX_TRYLOCK()! Expect 0 on success,
* or DB_LOCK_NOTGRANTED, or possibly DB_RUNRECOVERY for failchk.
@@ -217,9 +221,7 @@ static inline int __db_pthread_mutex_tryreadlock(ENV *env, db_mutex_t mutex)
#define MUTEX_TRYLOCK(env, mutex) \
(((mutex) == MUTEX_INVALID) ? 0 : __mutex_trylock(env, mutex))
-/*
- * Acquire a DB_MUTEX_SHARED "mutex" in shared mode.
- */
+/* Acquire a latch (a DB_MUTEX_SHARED "mutex") in shared mode. */
#define MUTEX_READLOCK(env, mutex) do { \
if ((mutex) != MUTEX_INVALID && \
__mutex_rdlock(env, mutex) != 0) \
@@ -234,30 +236,68 @@ static inline int __db_pthread_mutex_tryreadlock(ENV *env, db_mutex_t mutex)
return (DB_RUNRECOVERY); \
} while (0)
-#define MUTEX_WAIT(env, mutex, duration) do { \
- int __ret; \
- if ((mutex) != MUTEX_INVALID && \
- (__ret = __mutex_timedlock(env, mutex, duration)) != 0 && \
- __ret != DB_TIMEOUT) \
- return (DB_RUNRECOVERY); \
+#define MUTEX_WAIT(env, mutex, duration) do { \
+ int __ret; \
+ if ((mutex) != MUTEX_INVALID && \
+ (__ret = __mutex_timedlock(env, mutex, duration)) != 0 && \
+ __ret != DB_TIMEOUT) \
+ return (DB_RUNRECOVERY); \
} while (0)
+
+/*
+ * True if the mutex is exclusively held by someone (not necessarily this
+ * thread), or if it is MUTEX_INVALID, !MUTEX_ON(env) or DB_ENV_NOLOCKING.
+ */
+#define MUTEX_IS_OWNED(env, mutex) \
+ ((mutex) == MUTEX_INVALID || !MUTEX_ON(env) || \
+ F_ISSET((env)->dbenv, DB_ENV_NOLOCKING) || \
+ F_ISSET(MUTEXP_SET(env, mutex), DB_MUTEX_LOCKED))
#else
/*
* There are calls to lock/unlock mutexes outside of #ifdef's -- replace
* the call with something the compiler can discard, but which will make
- * if-then-else blocks work correctly.
+ * if-then-else blocks work correctly, and suppress unused variable messages.
+ */
+/* Statement forms use do/while (0) so they are safe before an else. */
+#define MUTEX_LOCK(env, mutex) do { env = (env); mutex = (mutex); } while (0)
+#define MUTEX_LOCK_RET(env, mutex) ( env = (env), mutex = (mutex), 0)
+#define MUTEX_TRYLOCK(env, mutex) ( env = (env), mutex = (mutex), 0)
+#define MUTEX_READLOCK(env, mutex) do { env = (env); mutex = (mutex); } while (0)
+#define MUTEX_TRY_READLOCK(env, mutex) ( env = (env), mutex = (mutex), 0 )
+#define MUTEX_UNLOCK(env, mutex) do { env = (env); mutex = (mutex); } while (0)
+#define MUTEX_REQUIRED(env, mutex) do { env = (env); mutex = (mutex); } while (0)
+#define MUTEX_REQUIRED_READ(env, mutex) do { env = (env); mutex = (mutex); } while (0)
+#define MUTEX_WAIT(env, mutex, duration) do { \
+ (env) = (env); (mutex) = (mutex); (duration) = (duration); } while (0)
+
+/*
+ * Every MUTEX_IS_OWNED() caller expects to own it. When there is no mutex
+ * support, act as if we have ownership.
*/
-#define MUTEX_LOCK(env, mutex) (mutex) = (mutex)
-#define MUTEX_TRYLOCK(env, mutex) (mutex) = (mutex)
-#define MUTEX_READLOCK(env, mutex) (mutex) = (mutex)
-#define MUTEX_TRY_READLOCK(env, mutex) (mutex) = (mutex)
-#define MUTEX_UNLOCK(env, mutex) (mutex) = (mutex)
-#define MUTEX_REQUIRED(env, mutex) (mutex) = (mutex)
-#define MUTEX_REQUIRED_READ(env, mutex) (mutex) = (mutex)
-#define MUTEX_WAIT(env, mutex, duration) (mutex) = (mutex)
+#define MUTEX_IS_OWNED(env, mutex) 1
#endif
/*
+ * Bulk initialization of mutexes in regions.
+ */
+
+#define MUTEX_BULK_INIT(env, region, start, howmany) do { \
+ DB_MUTEX *__mutexp; \
+ db_mutex_t __i = (start); \
+ u_int32_t __n = (howmany); \
+ /* Link all but the last; __n > 1 cannot wrap when howmany is 0. */ \
+ for (__mutexp = MUTEXP_SET(env, __i); __n > 1; \
+ --__n, __mutexp = MUTEXP_SET(env, __i)) { \
+ __mutexp->flags = 0; \
+ __i = (F_ISSET(env, ENV_PRIVATE)) ? \
+ ((uintptr_t)__mutexp + (region)->mutex_size) : __i + 1; \
+ __mutexp->mutex_next_link = __i; \
+ } \
+ __mutexp->flags = 0; \
+ __mutexp->mutex_next_link = MUTEX_INVALID; \
+} while (0)
+
+/*
* Berkeley DB ports may require single-threading at places in the code.
*/
#ifdef HAVE_MUTEX_VXWORKS
diff --git a/src/dbinc/mutex_int.h b/src/dbinc/mutex_int.h
index b9bccdf7..4a4468af 100644
--- a/src/dbinc/mutex_int.h
+++ b/src/dbinc/mutex_int.h
@@ -1,7 +1,7 @@
/*
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -73,6 +73,14 @@ extern "C" {
else \
RET_SET((pthread_mutex_lock(&(mutexp)->u.m.mutex)), ret); \
} while (0)
+#define RET_SET_PTHREAD_TIMEDLOCK(mutexp, timespec, ret) do { \
+ if (F_ISSET(mutexp, DB_MUTEX_SHARED)) \
+ RET_SET(pthread_rwlock_timedwrlock(&(mutexp)->u.rwlock, \
+ (timespec)), ret); \
+ else \
+ RET_SET(pthread_mutex_timedlock(&(mutexp)->u.m.mutex, \
+ (timespec)), ret); \
+} while (0)
#define RET_SET_PTHREAD_TRYLOCK(mutexp, ret) do { \
if (F_ISSET(mutexp, DB_MUTEX_SHARED)) \
RET_SET((pthread_rwlock_trywrlock(&(mutexp)->u.rwlock)), \
@@ -84,6 +92,9 @@ extern "C" {
#else
#define RET_SET_PTHREAD_LOCK(mutexp, ret) \
RET_SET(pthread_mutex_lock(&(mutexp)->u.m.mutex), ret);
+#define RET_SET_PTHREAD_TIMEDLOCK(mutexp, timespec, ret) \
+ RET_SET(pthread_mutex_timedlock(&(mutexp)->u.m.mutex, \
+ (timespec)), ret);
#define RET_SET_PTHREAD_TRYLOCK(mutexp, ret) \
RET_SET(pthread_mutex_trylock(&(mutexp)->u.m.mutex), ret);
#endif
@@ -267,6 +278,11 @@ typedef abilock_t tsl_t;
#include <sys/machlock.h>
typedef lock_t tsl_t;
+/*
+ * Solaris requires 8 byte alignment for pthread_mutex_t values.
+ */
+#define MUTEX_ALIGN 8
+
/*
* The functions are declared in <sys/machlock.h>, but under #ifdef KERNEL.
* Re-declare them here to avoid warnings.
@@ -778,6 +794,7 @@ MUTEX_SET(tsl_t *tsl) {
static inline void
MUTEX_UNSET(tsl_t *tsl) {
__asm__ volatile(
+ " .set mips2 \n"
" .set noreorder \n"
" sync \n"
" sw $0, %0 \n"
@@ -892,15 +909,22 @@ struct __db_mutexmgr {
REGINFO reginfo; /* Region information */
void *mutex_array; /* Base of the mutex array */
+#ifdef HAVE_FAILCHK_BROADCAST
+ /*
+ * The mutex lock functions wait for at most this long between checks
+ * for DB_MUTEX_OWNER_DEAD. This field needs no mutex protection.
+ */
+ db_timeout_t failchk_polltime;
+#endif
};
/* Macros to lock/unlock the mutex region as a whole. */
-#define MUTEX_SYSTEM_LOCK(dbenv) \
- MUTEX_LOCK(dbenv, ((DB_MUTEXREGION *) \
- (dbenv)->mutex_handle->reginfo.primary)->mtx_region)
-#define MUTEX_SYSTEM_UNLOCK(dbenv) \
- MUTEX_UNLOCK(dbenv, ((DB_MUTEXREGION *) \
- (dbenv)->mutex_handle->reginfo.primary)->mtx_region)
+#define MUTEX_SYSTEM_LOCK(env) \
+ MUTEX_LOCK(env, ((DB_MUTEXREGION *) \
+ (env)->mutex_handle->reginfo.primary)->mtx_region)
+#define MUTEX_SYSTEM_UNLOCK(env) \
+ MUTEX_UNLOCK(env, ((DB_MUTEXREGION *) \
+ (env)->mutex_handle->reginfo.primary)->mtx_region)
/*
* DB_MUTEXREGION --
@@ -927,6 +951,16 @@ typedef struct __db_mutexregion { /* SHARED */
} DB_MUTEXREGION;
#ifdef HAVE_MUTEX_SUPPORT
+/*
+ * MUTEX_DIAG turns on the recording of when and where a mutex was locked. It has
+ * a large impact, and should only be turned on when debugging mutexes.
+ */
+#define MUTEX_STACK_TEXT_SIZE 600
+typedef struct __mutex_history { /* SHARED */
+ db_timespec when;
+ char stacktext[MUTEX_STACK_TEXT_SIZE];
+} MUTEX_HISTORY;
+
struct __db_mutex_t { /* SHARED */ /* Mutex. */
#ifdef MUTEX_FIELDS
MUTEX_FIELDS /* Opaque thread mutex structures. */
@@ -959,9 +993,9 @@ struct __db_mutex_t { /* SHARED */ /* Mutex. */
db_mutex_t mutex_next_link; /* Linked list of free mutexes. */
-#ifdef HAVE_STATISTICS
int alloc_id; /* Allocation ID. */
+#ifdef HAVE_STATISTICS
u_int32_t mutex_set_wait; /* Granted after wait. */
u_int32_t mutex_set_nowait; /* Granted without waiting. */
#ifdef HAVE_SHARED_LATCHES
@@ -973,7 +1007,9 @@ struct __db_mutex_t { /* SHARED */ /* Mutex. */
u_int32_t hybrid_wakeup; /* for counting spurious wakeups */
#endif
#endif
-
+#ifdef MUTEX_DIAG
+ MUTEX_HISTORY mutex_history;
+#endif
/*
* A subset of the flag arguments for __mutex_alloc().
*
@@ -992,19 +1028,6 @@ struct __db_mutex_t { /* SHARED */ /* Mutex. */
(indx) * \
((DB_MUTEXREGION *)env->mutex_handle->reginfo.primary)->mutex_size))
-/*
- * Check that a particular mutex is exclusively held at least by someone, not
- * necessarily the current thread.
- */
-#ifdef HAVE_MUTEX_SUPPORT
-#define MUTEX_IS_OWNED(env, mutex) \
- (mutex == MUTEX_INVALID || !MUTEX_ON(env) || \
- F_ISSET(env->dbenv, DB_ENV_NOLOCKING) || \
- F_ISSET(MUTEXP_SET(env, mutex), DB_MUTEX_LOCKED))
-#else
-#define MUTEX_IS_OWNED(env, mutex) 0
-#endif
-
#if defined(HAVE_MUTEX_HYBRID) || defined(DB_WIN32) || \
(defined(HAVE_SHARED_LATCHES) && !defined(HAVE_MUTEX_PTHREADS))
#define MUTEXP_IS_BUSY(mutexp) \
diff --git a/src/dbinc/os.h b/src/dbinc/os.h
index 2515e6ee..ea1fd2c4 100644
--- a/src/dbinc/os.h
+++ b/src/dbinc/os.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/dbinc/partition.h b/src/dbinc/partition.h
index 09e42573..11cdfa6f 100644
--- a/src/dbinc/partition.h
+++ b/src/dbinc/partition.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
/*
* $Id$
@@ -22,6 +22,7 @@ typedef struct __db_partition {
u_int32_t (*callback) (DB *, DBT *);
#define PART_CALLBACK 0x01
#define PART_RANGE 0x02
+#define PART_KEYS_SETUP 0x04
u_int32_t flags;
} DB_PARTITION;
@@ -36,7 +37,14 @@ typedef struct __part_internal {
#ifdef HAVE_PARTITION
#define PART_NAME "__dbp.%s.%03d"
-#define PART_LEN (strlen("__dbp..")+3)
+/*
+ * Currently we support at most 1000000 partitions.
+ * If the limit is changed, PART_DIGITS and PART_MAXIMUM
+ * must be changed accordingly.
+ */
+#define PART_DIGITS 6
+#define PART_MAXIMUM 1000000
+#define PART_LEN (sizeof("__dbp..") + PART_DIGITS)
#define PART_PREFIX "__dbp."
#define IS_PARTITION_DB_FILE(name) (strncmp(name, PART_PREFIX, \
sizeof(PART_PREFIX) - 1) == 0)
diff --git a/src/dbinc/perfmon.h b/src/dbinc/perfmon.h
index c3b9b9fa..e89eba33 100644
--- a/src/dbinc/perfmon.h
+++ b/src/dbinc/perfmon.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2010, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/dbinc/qam.h b/src/dbinc/qam.h
index 657c11e2..d18f91f3 100644
--- a/src/dbinc/qam.h
+++ b/src/dbinc/qam.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/dbinc/queue.h b/src/dbinc/queue.h
index 5a62741a..c53941ab 100644
--- a/src/dbinc/queue.h
+++ b/src/dbinc/queue.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright (c) 1991, 1993
diff --git a/src/dbinc/region.h b/src/dbinc/region.h
index ac0ff16f..070aff5f 100644
--- a/src/dbinc/region.h
+++ b/src/dbinc/region.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1998, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -134,7 +134,10 @@ typedef enum {
REGION_TYPE_LOG,
REGION_TYPE_MPOOL,
REGION_TYPE_MUTEX,
- REGION_TYPE_TXN } reg_type_t;
+ REGION_TYPE_TXN,
+ /* This enumerator must always be last; it is the largest valid type. */
+ REGION_TYPE_MAX = REGION_TYPE_TXN
+} reg_type_t;
#define INVALID_REGION_SEGID -1 /* Segment IDs are either shmget(2) or
* Win16 segment identifiers. They are
@@ -196,10 +199,10 @@ typedef struct __db_reg_env { /* SHARED */
/*
- * The mtx_regenv mutex protects the environment reference count and
- * memory allocation from the primary shared region (the crypto, thread
- * control block and replication implementations allocate memory from
- * the primary shared region).
+ * The mtx_regenv mutex protects the environment reference count,
+ * blob threshold and memory allocation from the primary shared region
+ * (the crypto, thread control block and replication implementations
+ * allocate memory from the primary shared region).
*
* The rest of the fields are initialized at creation time, and don't
* need mutex protection. The flags, op_timestamp and rep_timestamp
@@ -209,6 +212,7 @@ typedef struct __db_reg_env { /* SHARED */
*/
db_mutex_t mtx_regenv; /* Refcnt, region allocation mutex. */
u_int32_t refcnt; /* References to the environment. */
+ u_int32_t blob_threshold; /* Environment wide blob threshold. */
u_int32_t region_cnt; /* Number of REGIONs. */
roff_t region_off; /* Offset of region array */
@@ -227,6 +231,8 @@ typedef struct __db_reg_env { /* SHARED */
time_t op_timestamp; /* Timestamp for operations. */
time_t rep_timestamp; /* Timestamp for rep db handles. */
u_int32_t reg_panic; /* DB_REGISTER triggered panic */
+ u_int32_t failure_panic; /* Failchk or mutex lock saw a crash. */
+ char failure_symptom[DB_FAILURE_SYMPTOM_SIZE];
uintmax_t unused; /* The ALLOC_LAYOUT structure follows
* the REGENV structure in memory and
* contains uintmax_t fields. Force
@@ -308,11 +314,14 @@ struct __db_reginfo_t { /* __env_region_attach IN parameters. */
/*
* PANIC_ISSET, PANIC_CHECK:
- * Check to see if the DB environment is dead.
+ * Check to see if the DB environment is dead. If the environment is still
+ * attached to its regions, look in the REGENV. Otherwise, check whether
+ * the region had the panic state set when this env detached from it.
*/
#define PANIC_ISSET(env) \
- ((env) != NULL && (env)->reginfo != NULL && \
- ((REGENV *)(env)->reginfo->primary)->panic != 0 && \
+ ((env) != NULL && ((env)->reginfo != NULL ? \
+ ((REGENV *)(env)->reginfo->primary)->panic != 0 : \
+ F_ISSET(env, ENV_REMEMBER_PANIC)) && \
!F_ISSET((env)->dbenv, DB_ENV_NOPANIC))
#define PANIC_CHECK(env) \
diff --git a/src/dbinc/rep.h b/src/dbinc/rep.h
index 75004239..f3bdf481 100644
--- a/src/dbinc/rep.h
+++ b/src/dbinc/rep.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -19,6 +19,7 @@ extern "C" {
* Names of client temp databases.
*/
#define REPFILEPREFIX "__db.rep"
+#define REPBLOBNAME "__db.rep.blob.db"
#define REPDBNAME "__db.rep.db"
#define REPPAGENAME "__db.reppg.db"
@@ -42,43 +43,58 @@ extern "C" {
/*
* Message types
*/
-#define REP_INVALID 0 /* Invalid message type. */
-#define REP_ALIVE 1 /* I am alive message. */
-#define REP_ALIVE_REQ 2 /* Request for alive messages. */
-#define REP_ALL_REQ 3 /* Request all log records greater than LSN. */
-#define REP_BULK_LOG 4 /* Bulk transfer of log records. */
-#define REP_BULK_PAGE 5 /* Bulk transfer of pages. */
-#define REP_DUPMASTER 6 /* Duplicate master detected; propagate. */
-#define REP_FILE 7 /* Page of a database file. NOTUSED */
-#define REP_FILE_FAIL 8 /* File requested does not exist. */
-#define REP_FILE_REQ 9 /* Request for a database file. NOTUSED */
-#define REP_LEASE_GRANT 10 /* Client grants a lease to a master. */
-#define REP_LOG 11 /* Log record. */
-#define REP_LOG_MORE 12 /* There are more log records to request. */
-#define REP_LOG_REQ 13 /* Request for a log record. */
-#define REP_MASTER_REQ 14 /* Who is the master */
-#define REP_NEWCLIENT 15 /* Announces the presence of a new client. */
-#define REP_NEWFILE 16 /* Announce a log file change. */
-#define REP_NEWMASTER 17 /* Announces who the master is. */
-#define REP_NEWSITE 18 /* Announces that a site has heard from a new
- * site; like NEWCLIENT, but indirect. A
- * NEWCLIENT message comes directly from the new
- * client while a NEWSITE comes indirectly from
- * someone who heard about a NEWSITE.
- */
-#define REP_PAGE 19 /* Database page. */
-#define REP_PAGE_FAIL 20 /* Requested page does not exist. */
-#define REP_PAGE_MORE 21 /* There are more pages to request. */
-#define REP_PAGE_REQ 22 /* Request for a database page. */
-#define REP_REREQUEST 23 /* Force rerequest. */
-#define REP_START_SYNC 24 /* Tell client to begin syncing a ckp.*/
-#define REP_UPDATE 25 /* Environment hotcopy information. */
-#define REP_UPDATE_REQ 26 /* Request for hotcopy information. */
-#define REP_VERIFY 27 /* A log record for verification. */
-#define REP_VERIFY_FAIL 28 /* The client is outdated. */
-#define REP_VERIFY_REQ 29 /* Request for a log record to verify. */
-#define REP_VOTE1 30 /* Send out your information for an election. */
-#define REP_VOTE2 31 /* Send a "you are master" vote. */
+#define REP_INVALID 0 /* Invalid message type. */
+#define REP_ALIVE 1 /* I am alive message. */
+#define REP_ALIVE_REQ 2 /* Request for alive messages. */
+#define REP_ALL_REQ 3 /* Request all log records greater than
+ * LSN. */
+#define REP_BLOB_ALL_REQ 4 /* Request all the given blob files. */
+#define REP_BLOB_CHUNK 5 /* A piece of data contained in a blob
+ * file. */
+#define REP_BLOB_CHUNK_REQ 6 /* Request a piece of data from a blob
+ * file. */
+#define REP_BLOB_UPDATE 7 /* A list of blob files for a
+ * database. */
+#define REP_BLOB_UPDATE_REQ 8 /* Request blob files. */
+#define REP_BULK_LOG 9 /* Bulk transfer of log records. */
+#define REP_BULK_PAGE 10 /* Bulk transfer of pages. */
+#define REP_DUPMASTER 11 /* Duplicate master detected;
+ * propagate. */
+#define REP_FILE 12 /* Page of a database file. NOTUSED */
+#define REP_FILE_FAIL 13 /* File requested does not exist. */
+#define REP_FILE_REQ 14 /* Request for a database file.
+ * NOTUSED */
+#define REP_LEASE_GRANT 15 /* Client grants a lease to a master. */
+#define REP_LOG 16 /* Log record. */
+#define REP_LOG_MORE 17 /* There are more log records to
+ * request. */
+#define REP_LOG_REQ 18 /* Request for a log record. */
+#define REP_MASTER_REQ 19 /* Who is the master */
+#define REP_NEWCLIENT 20 /* Announces the presence of a new
+ * client. */
+#define REP_NEWFILE 21 /* Announce a log file change. */
+#define REP_NEWMASTER 22 /* Announces who the master is. */
+#define REP_NEWSITE 23 /* Announces that a site has heard from
+ * a new site; like NEWCLIENT, but
+ * indirect. A NEWCLIENT message comes
+ * directly from the new client while a
+ * NEWSITE comes indirectly from
+ * someone who heard about a NEWSITE.*/
+#define REP_PAGE 24 /* Database page. */
+#define REP_PAGE_FAIL 25 /* Requested page does not exist. */
+#define REP_PAGE_MORE 26 /* There are more pages to request. */
+#define REP_PAGE_REQ 27 /* Request for a database page. */
+#define REP_REREQUEST 28 /* Force rerequest. */
+#define REP_START_SYNC 29 /* Tell client to begin syncing a ckp.*/
+#define REP_UPDATE 30 /* Environment hotcopy information. */
+#define REP_UPDATE_REQ 31 /* Request for hotcopy information. */
+#define REP_VERIFY 32 /* A log record for verification. */
+#define REP_VERIFY_FAIL 33 /* The client is outdated. */
+#define REP_VERIFY_REQ 34 /* Request for a log record to
+ * verify. */
+#define REP_VOTE1 35 /* Send out your information for an
+ * election. */
+#define REP_VOTE2 36 /* Send a "you are master" vote. */
/*
* Maximum message number for conversion tables. Update this
* value as the largest message number above increases.
@@ -90,7 +106,7 @@ extern "C" {
* NOTE: When changing messages above, the two tables for upgrade support
* need adjusting. They are in rep_util.c.
*/
-#define REP_MAX_MSG 31
+#define REP_MAX_MSG 36
/*
* This is the list of client-to-client requests messages.
@@ -99,6 +115,8 @@ extern "C" {
*/
#define REP_MSG_REQ(rectype) \
(rectype == REP_ALL_REQ || \
+ rectype == REP_BLOB_ALL_REQ || \
+ rectype == REP_BLOB_CHUNK_REQ || \
rectype == REP_LOG_REQ || \
rectype == REP_PAGE_REQ || \
rectype == REP_VERIFY_REQ)
@@ -125,6 +143,9 @@ extern "C" {
#define DB_LOGVERSION_51 17
#define DB_LOGVERSION_52 18
#define DB_LOGVERSION_53 19
+#define DB_LOGVERSION_60 20
+#define DB_LOGVERSION_60p1 21
+#define DB_LOGVERSION_61 22
#define DB_LOGVERSION_MIN DB_LOGVERSION_44
#define DB_REPVERSION_INVALID 0
#define DB_REPVERSION_44 3
@@ -132,11 +153,12 @@ extern "C" {
#define DB_REPVERSION_46 4
#define DB_REPVERSION_47 5
#define DB_REPVERSION_48 5
-#define DB_REPVERSION_50 5
#define DB_REPVERSION_51 5
#define DB_REPVERSION_52 6
#define DB_REPVERSION_53 7
-#define DB_REPVERSION DB_REPVERSION_53
+#define DB_REPVERSION_60 7
+#define DB_REPVERSION_61 8
+#define DB_REPVERSION DB_REPVERSION_61
#define DB_REPVERSION_MIN DB_REPVERSION_44
/*
@@ -204,9 +226,20 @@ extern "C" {
#define REP_INITVERSION 3
/*
+ * View/partial replication file name.
+ * The file is empty. It exists as a permanent indicator that this
+ * environment can never be master.
+ */
+#define REPVIEW "__db.rep.view"
+/*
+ * IS_VIEW_SITE: true when REP_ON(env) holds and the st_view field of the
+ * rep region's statistics is nonzero. The env argument is expanded more
+ * than once, so it should not have side effects.
+ */
+#define IS_VIEW_SITE(env) \
+ (REP_ON(env) && \
+ ((env)->rep_handle->region->stat.st_view != 0))
+
+/*
* Database types for __rep_client_dbinit
*/
typedef enum {
+ REP_BLOB, /* Blob file database. */
REP_DB, /* Log record database. */
REP_PG /* Pg database. */
} repdb_t;
@@ -239,7 +272,7 @@ typedef enum {
typedef enum {
SYNC_OFF, /* No recovery. */
SYNC_LOG, /* Recovery - log. */
- SYNC_PAGE, /* Recovery - pages. */
+ SYNC_PAGE, /* Recovery - pages and blobs. */
SYNC_UPDATE, /* Recovery - update. */
SYNC_VERIFY /* Recovery - verify. */
} repsync_t;
@@ -346,6 +379,17 @@ typedef struct __rep { /* SHARED */
u_int32_t first_vers; /* Log version of first log file. */
DB_LSN last_lsn; /* Latest LSN we need. */
/* These are protected by mtx_clientdb. */
+ db_seq_t gap_bl_hi_id; /* Last id in the blob gap. */
+ db_seq_t gap_bl_hi_sid; /* Last sid in the blob gap. */
+ off_t gap_bl_hi_off; /* Last offset in the blob gap. */
+ db_seq_t last_blob_id; /* Last id on the list to process. */
+ db_seq_t last_blob_sid; /* Last sid on the list to process. */
+ db_seq_t prev_blob_id; /* Previous last id on list. */
+ db_seq_t prev_blob_sid; /* Previous last sid on list. */
+ db_seq_t highest_id; /* Highest file id to request. */
+ u_int32_t blob_more_files;/* More blob files to be processed. */
+ int blob_sync; /* Currently handling blobs. */
+ int blob_rereq; /* When to rereq a blob update msg. */
db_timespec last_pg_ts; /* Last page stored timestamp. */
db_pgno_t ready_pg; /* Next pg expected. */
db_pgno_t waiting_pg; /* First pg after gap. */
@@ -391,11 +435,13 @@ typedef struct __rep { /* SHARED */
roff_t siteinfo_off; /* Offset of site array region. */
u_int site_cnt; /* Array slots in use. */
u_int site_max; /* Total array slots allocated. */
+ u_int sites_avail; /* Total number of available sites. */
int self_eid; /* Where to find the local site. */
u_int siteinfo_seq; /* Number of updates to this info. */
u_int32_t min_log_file; /* Earliest log needed by repgroup. */
pid_t listener;
+ u_int listener_nthreads; /* # of msg threads in listener. */
int perm_policy;
db_timeout_t ack_timeout;
@@ -403,6 +449,11 @@ typedef struct __rep { /* SHARED */
db_timeout_t connection_retry_wait;
db_timeout_t heartbeat_frequency; /* Max period between msgs. */
db_timeout_t heartbeat_monitor_timeout;
+ u_int32_t inqueue_max_gbytes;
+ u_int32_t inqueue_max_bytes;
+ u_int32_t inqueue_rz_gbytes;
+ u_int32_t inqueue_rz_bytes;
+ u_int32_t inqueue_full_event_on;
#endif /* HAVE_REPLICATION_THREADS */
/* Statistics. */
@@ -419,12 +470,16 @@ typedef struct __rep { /* SHARED */
#define REP_C_2SITE_STRICT 0x00001 /* Don't cheat on elections. */
#define REP_C_AUTOINIT 0x00002 /* Auto initialization. */
#define REP_C_AUTOROLLBACK 0x00004 /* Discard client txns: sync. */
-#define REP_C_BULK 0x00008 /* Bulk transfer. */
-#define REP_C_DELAYCLIENT 0x00010 /* Delay client sync-up. */
-#define REP_C_ELECTIONS 0x00020 /* Repmgr to use elections. */
-#define REP_C_INMEM 0x00040 /* In-memory replication. */
-#define REP_C_LEASE 0x00080 /* Leases configured. */
-#define REP_C_NOWAIT 0x00100 /* Immediate error return. */
+#define REP_C_AUTOTAKEOVER 0x00008 /* Auto listener take over. */
+#define REP_C_BULK 0x00010 /* Bulk transfer. */
+#define REP_C_DELAYCLIENT 0x00020 /* Delay client sync-up. */
+#define REP_C_ELECT_LOGLENGTH 0x00040 /* Log length wins election. */
+#define REP_C_ELECTIONS 0x00080 /* Repmgr to use elections. */
+#define REP_C_INMEM 0x00100 /* In-memory replication. */
+#define REP_C_LEASE 0x00200 /* Leases configured. */
+#define REP_C_NOWAIT 0x00400 /* Immediate error return. */
+#define REP_C_PREFMAS_CLIENT 0x00800 /* Preferred master client. */
+#define REP_C_PREFMAS_MASTER 0x01000 /* Preferred master site. */
u_int32_t config; /* Configuration flags. */
/* Election. */
@@ -455,15 +510,17 @@ typedef struct __rep { /* SHARED */
#define REP_F_CLIENT 0x00000008 /* Client replica. */
#define REP_F_DELAY 0x00000010 /* Delaying client sync-up. */
#define REP_F_GROUP_ESTD 0x00000020 /* Rep group is established. */
-#define REP_F_INUPDREQ 0x00000040 /* Thread in rep_update_req. */
-#define REP_F_LEASE_EXPIRED 0x00000080 /* Leases guaranteed expired. */
-#define REP_F_MASTER 0x00000100 /* Master replica. */
-#define REP_F_MASTERELECT 0x00000200 /* Master elect. */
-#define REP_F_NEWFILE 0x00000400 /* Newfile in progress. */
-#define REP_F_NIMDBS_LOADED 0x00000800 /* NIMDBs are materialized. */
-#define REP_F_SKIPPED_APPLY 0x00001000 /* Skipped applying a record. */
-#define REP_F_START_CALLED 0x00002000 /* Rep_start called. */
-#define REP_F_SYS_DB_OP 0x00004000 /* Operation in progress. */
+#define REP_F_HOLD_GEN 0x00000040 /* PrefMas startup hold gen. */
+#define REP_F_INUPDREQ 0x00000080 /* Thread in rep_update_req. */
+#define REP_F_LEASE_EXPIRED 0x00000100 /* Leases guaranteed expired. */
+#define REP_F_MASTER 0x00000200 /* Master replica. */
+#define REP_F_MASTERELECT 0x00000400 /* Master elect. */
+#define REP_F_NEWFILE 0x00000800 /* Newfile in progress. */
+#define REP_F_NIMDBS_LOADED 0x00001000 /* NIMDBs are materialized. */
+#define REP_F_READONLY_MASTER 0x00002000 /* PrefMas readonly master. */
+#define REP_F_SKIPPED_APPLY 0x00004000 /* Skipped applying a record. */
+#define REP_F_START_CALLED 0x00008000 /* Rep_start called. */
+#define REP_F_SYS_DB_OP 0x00010000 /* Operation in progress. */
u_int32_t flags;
} REP;
@@ -525,7 +582,7 @@ do { \
/*
* REP_F_EPHASE0 is not a *real* election phase. It is used for
* master leases and allowing the client to find the master or
- * expire its lease. However, EPHASE0 is cleared by __rep_elect_done.
+ * expire its lease.
*/
#define IN_ELECTION(R) \
FLD_ISSET((R)->elect_flags, REP_E_PHASE1 | REP_E_PHASE2)
@@ -594,6 +651,22 @@ do { \
} while (0)
+/*
+ * Macros to determine current replication configuration options.
+ *
+ * REP_CONFIG_IS_SET tests flags against the config word of the rep region
+ * when REP_ON(env) is true, and against the config word of the rep_handle
+ * itself otherwise.
+ */
+#define REP_CONFIG_IS_SET(env, flags) \
+ (REP_ON(env) ? \
+ FLD_ISSET(((env)->rep_handle->region)->config, flags) : \
+ FLD_ISSET(((env)->rep_handle)->config, flags))
+/*
+ * PREFMAS_IS_SET tests the REP_C_PREFMAS_MASTER | REP_C_PREFMAS_CLIENT mask
+ * through REP_CONFIG_IS_SET. It is the constant 0 when
+ * HAVE_REPLICATION_THREADS is not defined.
+ */
+#ifdef HAVE_REPLICATION_THREADS
+#define PREFMAS_IS_SET(env) \
+ (REP_CONFIG_IS_SET(env, \
+ (REP_C_PREFMAS_MASTER | REP_C_PREFMAS_CLIENT)))
+#else
+#define PREFMAS_IS_SET(env) 0
+#endif
+/*
+ * IS_PREFMAS_MODE: REP_ON(env) holds, PREFMAS_IS_SET(env) holds, and the
+ * rep region's config_nsites is less than 3.
+ */
+#define IS_PREFMAS_MODE(env) \
+ (REP_ON(env) && PREFMAS_IS_SET(env) && \
+ ((env)->rep_handle->region)->config_nsites < 3)
+
/*
* Gap processing flags. These provide control over the basic
* gap processing algorithm for some special cases.
@@ -603,11 +676,28 @@ do { \
/* REREQUEST is a superset of FORCE. */
/*
+ * Internal options for rep_start_int(). These are used by preferred master
+ * mode to help coordinate between the sites during changes of master.
+ */
+#define REP_START_FORCE_ROLECHG 0x001 /* Force role change to advance gen. */
+#define REP_START_HOLD_CLIGEN 0x002 /* Hold client gen before doing
+ * lsnhist match. */
+#define REP_START_WAIT_LOCKMSG 0x004 /* Wait for REP_LOCKOUT_MSG. */
+
+/*
* Flags indicating what kind of record we want to back up to, in the log.
*/
-#define REP_REC_COMMIT 0x001 /* Most recent commit record. */
-#define REP_REC_PERM 0x002 /* Most recent perm record. */
+#define REP_REC_COMMIT 0x001 /* Most recent commit record. */
+#define REP_REC_PERM 0x002 /* Most recent perm record. */
/* PERM is a superset of COMMIT. */
+#define REP_REC_PERM_DEL 0x004 /* Most recent PERM, or fail if a
+ * file delete is found first. */
+
+/*
+ * Permanent record types: IS_PERM_RECTYPE is true when rectype equals
+ * DB___txn_regop or DB___txn_ckp. The rectype argument is expanded twice,
+ * so it should not have side effects.
+ */
+#define IS_PERM_RECTYPE(rectype) \
+ ((rectype) == DB___txn_regop || (rectype) == DB___txn_ckp)
/*
* Basic pre/post-amble processing.
@@ -692,7 +782,7 @@ do { \
* machine instruction. A single 32-bit integer value is safe without a
* mutex, but most other types of value should use a mutex.
*
- * Any use of a mutex must be inside a matched pair of ENV_ENTER() and
+ * Use of a db_mutex_t mutex must be inside a matched pair of ENV_ENTER() and
* ENV_LEAVE() macros. This ensures that if a thread dies while holding
* a lock (i.e. a mutex), recovery can clean it up so that it does not
* indefinitely block other threads.
@@ -727,6 +817,9 @@ struct __db_rep {
/*
* End of shared configuration information.
*/
+ int (*partial) /* View/partial replication function. */
+ __P((DB_ENV *, const char *, int *, u_int32_t));
+
int (*send) /* Send function. */
__P((DB_ENV *, const DBT *, const DBT *,
const DB_LSN *, int, u_int32_t));
@@ -745,6 +838,7 @@ struct __db_rep {
DB_MPOOLFILE *file_mpf; /* Mpoolfile for current database. */
DB *file_dbp; /* This file's page info. */
DBC *queue_dbc; /* Dbc for a queue file. */
+ DB *blob_dbp; /* Blob file database. */
/*
* Please change __rep_print_all (rep_stat.c) to track any changes made
@@ -759,6 +853,7 @@ struct __db_rep {
/*
* Replication Framework (repmgr) per-process information.
*/
+ int config_nthreads;/* Configured msg processing threads. */
u_int nthreads; /* Msg processing threads. */
u_int athreads; /* Space allocated for msg threads. */
u_int non_rep_th; /* Threads in GMDB or channel msgs. */
@@ -771,10 +866,13 @@ struct __db_rep {
db_timeout_t connection_retry_wait;
db_timeout_t heartbeat_frequency; /* Max period between msgs. */
db_timeout_t heartbeat_monitor_timeout;
+ u_int32_t inqueue_max_gbytes;
+ u_int32_t inqueue_max_bytes;
/* Thread synchronization. */
REPMGR_RUNNABLE *selector, **messengers, **elect_threads;
REPMGR_RUNNABLE *preferred_elect_thr;
+ REPMGR_RUNNABLE *takeover_thread;
db_timespec repstart_time;
mgr_mutex_t *mutex;
cond_var_t check_election, gmdb_idle, msg_avail;
@@ -799,12 +897,18 @@ struct __db_rep {
CONNECTION_LIST connections;
RETRY_Q_HEADER retries; /* Sites needing connection retry. */
struct {
- int size;
+ u_int32_t gbytes;
+ u_int32_t bytes;
STAILQ_HEAD(__repmgr_q_header, __repmgr_message) header;
} input_queue;
socket_t listen_fd;
db_timespec last_bcast; /* Time of last broadcast msg. */
+ db_timespec last_hbeat; /* Time of last heartbeat (prefmas). */
+ db_timespec l_listener_chk; /* Time to check local listener. */
+ db_timeout_t l_listener_wait;/* Timeout to check local listener. */
+ db_timespec m_listener_chk; /* Time to check master listener. */
+ db_timeout_t m_listener_wait;/* Timeout to check master listener. */
/*
* Status of repmgr. It is ready when repmgr is not yet started. It
@@ -813,12 +917,15 @@ struct __db_rep {
*/
enum { ready, running, stopped } repmgr_status;
int new_connection; /* Since last master seek attempt. */
+ int demotion_pending; /* We're being demoted to a view. */
int takeover_pending; /* We've been elected master. */
+ int rejoin_pending; /* Join group retry after rejection. */
int gmdb_busy;
int client_intent; /* Will relinquish master role. */
int gmdb_dirty;
int have_gmdb;
int seen_repmsg;
+ int view_mismatch; /* View callback and gmdb don't match. */
/*
* Flag to show what kind of transaction is currently in progress.
@@ -854,6 +961,16 @@ struct __db_rep {
u_int8_t *restored_list;
size_t restored_list_length;
+ /*
+ * Preferred master mode indicator for a pending action. A
+ * master_switch is initiated when the preferred master site is
+ * ready to take over as master. A start_temp_master is initiated
+ * when the client site needs to start as the temporary master.
+ */
+ enum { no_action, master_switch, start_temp_master } prefmas_pending;
+ /* The LSN at the very beginning of preferred master site startup. */
+ DB_LSN prefmas_init_lsn;
+
/* Application's message dispatch call-back function. */
void (*msg_dispatch) __P((DB_ENV *, DB_CHANNEL *,
DBT *, u_int32_t, u_int32_t));
@@ -920,6 +1037,10 @@ struct __db_rep {
} else if (!F_ISSET((env)->rep_handle, DBREP_APP_REPMGR)) \
F_SET((env)->rep_handle, DBREP_APP_BASEAPI); \
} while (0)
+/*
+ * ADJUST_AUTOTAKEOVER_WAITS: set the local listener check wait to timeout
+ * and the master listener check wait to three times timeout. The timeout
+ * argument is parenthesized so that an expression argument (e.g. a + b) is
+ * multiplied as a whole; it is still expanded twice, so it should not have
+ * side effects.
+ */
+#define ADJUST_AUTOTAKEOVER_WAITS(db_rep, timeout) do { \
+ (db_rep)->l_listener_wait = (timeout); \
+ (db_rep)->m_listener_wait = 3 * (timeout); \
+} while (0)
#else
/*
@@ -935,6 +1056,9 @@ struct __db_rep {
#define APP_SET_BASEAPI(env) do { \
; \
} while (0)
+#define ADJUST_AUTOTAKEOVER_WAITS(db_rep, timeout) do { \
+ ; \
+} while (0)
#endif /* HAVE_REPLICATION_THREADS */
/*
@@ -945,22 +1069,27 @@ struct __db_rep {
* compatibility with old versions, these values must be reserved explicitly in
* the list of flag values (below)
*/
-#define DB_LOG_PERM_42_44 0x20
-#define DB_LOG_RESEND_42_44 0x40
-#define REPCTL_INIT_45 0x02 /* Back compatible flag value. */
-
-#define REPCTL_ELECTABLE 0x01 /* Upgraded client is electable. */
-#define REPCTL_FLUSH 0x02 /* Record should be flushed. */
-#define REPCTL_GROUP_ESTD 0x04 /* Message from site in a group. */
-#define REPCTL_INIT 0x08 /* Internal init message. */
-#define REPCTL_LEASE 0x10 /* Lease related message.. */
+#define DB_LOG_PERM_42_44 0x020
+#define DB_LOG_RESEND_42_44 0x040
+#define REPCTL_INIT_45 0x002 /* Back compatible flag value. */
+
+/*
+ * Add new REPCTL flags to the end of this list to preserve compatibility
+ * with old versions.
+ */
+#define REPCTL_ELECTABLE 0x001 /* Upgraded client is electable. */
+#define REPCTL_FLUSH 0x002 /* Record should be flushed. */
+#define REPCTL_GROUP_ESTD 0x004 /* Message from site in a group. */
+#define REPCTL_INIT 0x008 /* Internal init message. */
+#define REPCTL_LEASE 0x010 /* Lease related message. */
/*
* Skip over reserved values 0x20
* and 0x40, as explained above.
*/
-#define REPCTL_LOG_END 0x80 /* Approximate end of group-wide log. */
+#define REPCTL_LOG_END 0x080 /* Approximate end of group-wide log. */
#define REPCTL_PERM DB_LOG_PERM_42_44
#define REPCTL_RESEND DB_LOG_RESEND_42_44
+#define REPCTL_INMEM_ONLY 0x100 /* In-memory databases only. */
/*
* File info flags for internal init. The per-database (i.e., file) flag
@@ -1094,6 +1223,20 @@ typedef struct {
DBT *objs;
} linfo_t;
+/*
+ * Used to store information on the child transaction that opens a blob meta
+ * database. In partial replication processing the child transaction of the
+ * blob meta database must be delayed until after processing the child
+ * transaction that opens the database that owns the BMD.
+ *
+ * NOTE(review): next and prev are untyped (void *); presumably they link
+ * DELAYED_BLOB_LIST entries into a doubly linked list -- confirm against
+ * the code that builds and walks the list.
+ */
+typedef struct {
+ db_seq_t blob_file_id;
+ DB_LSN lsn;
+ u_int32_t child;
+ void *next;
+ void *prev;
+} DELAYED_BLOB_LIST;
+
#if defined(__cplusplus)
}
#endif
diff --git a/src/dbinc/repmgr.h b/src/dbinc/repmgr.h
index d8fd199c..a38defa2 100644
--- a/src/dbinc/repmgr.h
+++ b/src/dbinc/repmgr.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2006, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2006, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -47,20 +47,29 @@ extern "C" {
* In protocol version one there were only three message types: 1, 2, and 3; so
* 3 was the max. In protocol version 2 we introduced heartbeats, type 4.
* (Protocol version 3 did not introduce any new message types.) In version 4
- * we introduced a few more new message types, the largest of which had value 7.
+ * we introduced a few more new message types, the largest of which had value 8.
+ * Protocol version 5 did not introduce any new message types, but changed
+ * the format of site info and membership data to support views.
+ *
+ * Protocol version 6 introduced preferred master mode, which added several
+ * new REPMGR_OWN messages.
*/
#define REPMGR_MAX_V1_MSG_TYPE 3
#define REPMGR_MAX_V2_MSG_TYPE 4
#define REPMGR_MAX_V3_MSG_TYPE 4
#define REPMGR_MAX_V4_MSG_TYPE 8
+#define REPMGR_MAX_V5_MSG_TYPE 8
+#define REPMGR_MAX_V6_MSG_TYPE 8
#define HEARTBEAT_MIN_VERSION 2
#define CHANNEL_MIN_VERSION 4
#define CONN_COLLISION_VERSION 4
#define GM_MIN_VERSION 4
#define OWN_MIN_VERSION 4
+#define VIEW_MIN_VERSION 5
+#define PREFMAS_MIN_VERSION 6
/* The range of protocol versions we're willing to support. */
-#define DB_REPMGR_VERSION 4
+#define DB_REPMGR_VERSION 6
#define DB_REPMGR_MIN_VERSION 1
/*
@@ -73,18 +82,30 @@ extern "C" {
* Like the message format types, these message type values should be
* permanently frozen.
*/
-#define REPMGR_CONNECT_REJECT 1
-#define REPMGR_GM_FAILURE 2
-#define REPMGR_GM_FORWARD 3
-#define REPMGR_JOIN_REQUEST 4
-#define REPMGR_JOIN_SUCCESS 5
-#define REPMGR_PARM_REFRESH 6
-#define REPMGR_REJOIN 7
-#define REPMGR_REMOVE_REQUEST 8
-#define REPMGR_REMOVE_SUCCESS 9
-#define REPMGR_RESOLVE_LIMBO 10
-#define REPMGR_SHARING 11
-
+#define REPMGR_CONNECT_REJECT 1
+#define REPMGR_GM_FAILURE 2
+#define REPMGR_GM_FORWARD 3
+#define REPMGR_JOIN_REQUEST 4
+#define REPMGR_JOIN_SUCCESS 5
+#define REPMGR_PARM_REFRESH 6
+#define REPMGR_REJOIN 7
+#define REPMGR_REMOVE_REQUEST 8
+#define REPMGR_REMOVE_SUCCESS 9
+#define REPMGR_RESOLVE_LIMBO 10
+#define REPMGR_SHARING 11
+#define REPMGR_LSNHIST_REQUEST 12
+#define REPMGR_LSNHIST_RESPONSE 13
+#define REPMGR_PREFMAS_FAILURE 14
+#define REPMGR_PREFMAS_SUCCESS 15
+#define REPMGR_READONLY_MASTER 16
+#define REPMGR_READONLY_RESPONSE 17
+#define REPMGR_RESTART_CLIENT 18
+
+/*
+ * Detect inconsistencies between view callback and site's gmdb.
+ *
+ * PARTICIPANT_TO_VIEW: the partial (view) callback is set in db_rep but the
+ * site's gmdb_flags do not have SITE_VIEW.
+ * VIEW_TO_PARTICIPANT: the partial callback is not set in db_rep but the
+ * site's gmdb_flags do have SITE_VIEW.
+ */
+#define PARTICIPANT_TO_VIEW(db_rep, site) \
+ ((db_rep)->partial && !FLD_ISSET((site)->gmdb_flags, SITE_VIEW))
+#define VIEW_TO_PARTICIPANT(db_rep, site) \
+ (!(db_rep)->partial && FLD_ISSET((site)->gmdb_flags, SITE_VIEW))
struct __repmgr_connection;
typedef struct __repmgr_connection REPMGR_CONNECTION;
@@ -98,7 +119,8 @@ struct __cond_waiters_table;
typedef struct __cond_waiters_table COND_WAITERS_TABLE;
/* Current Group Membership DB format ID. */
-#define REPMGR_GMDB_FMT_VERSION 1
+#define REPMGR_GMDB_FMT_VERSION 2
+#define REPMGR_GMDB_FMT_MIN_VERSION 1
#ifdef DB_WIN32
typedef SOCKET socket_t;
@@ -151,6 +173,17 @@ typedef char SITE_STRING_BUFFER[MAX_SITE_LOC_STRING+1];
#define DB_REPMGR_DEFAULT_ELECTION_RETRY (10 * US_PER_SEC)
#define DB_REPMGR_DEFAULT_CHANNEL_TIMEOUT (5 * US_PER_SEC)
+/* Default preferred master automatic configuration values. */
+#define DB_REPMGR_PREFMAS_ELECTION_RETRY (1 * US_PER_SEC)
+#define DB_REPMGR_PREFMAS_HEARTBEAT_MONITOR (2 * US_PER_SEC)
+#define DB_REPMGR_PREFMAS_HEARTBEAT_SEND (75 * (US_PER_SEC / 100))
+#define DB_REPMGR_PREFMAS_PRIORITY_CLIENT 75
+#define DB_REPMGR_PREFMAS_PRIORITY_MASTER 200
+
+/* Defaults for the undocumented incoming queue size limit and red zone. */
+#define DB_REPMGR_DEFAULT_INQUEUE_MAX (100 * MEGABYTE)
+#define DB_REPMGR_INQUEUE_REDZONE_PERCENT 85
+
typedef TAILQ_HEAD(__repmgr_conn_list, __repmgr_connection) CONNECTION_LIST;
typedef STAILQ_HEAD(__repmgr_out_q_head, __queued_output) OUT_Q_HEADER;
typedef TAILQ_HEAD(__repmgr_retry_q, __repmgr_retry) RETRY_Q_HEADER;
@@ -170,14 +203,20 @@ struct __repmgr_runnable {
/*
* Options governing requested behavior of election thread.
*/
-#define ELECT_F_EVENT_NOTIFY 0x01 /* Notify application of master failure. */
-#define ELECT_F_FAST 0x02 /* First election "fast" (n-1 trick). */
-#define ELECT_F_IMMED 0x04 /* Start with immediate election. */
-#define ELECT_F_INVITEE 0x08 /* Honor (remote) inviter's nsites. */
-#define ELECT_F_STARTUP 0x10 /* Observe repmgr_start() policy. */
+#define ELECT_F_CLIENT_RESTART 0x01 /* Do client restarts but no elections. */
+#define ELECT_F_EVENT_NOTIFY 0x02 /* Notify application of master failure. */
+#define ELECT_F_FAST 0x04 /* First election "fast" (n-1 trick). */
+#define ELECT_F_IMMED 0x08 /* Start with immediate election. */
+#define ELECT_F_INVITEE 0x10 /* Honor (remote) inviter's nsites. */
+#define ELECT_F_STARTUP 0x20 /* Observe repmgr_start() policy. */
u_int32_t flags;
- int eid; /* For Connector thread. */
+ /* For connector thread. */
+ struct {
+ int eid;
+#define CONNECT_F_REFRESH 0x01 /* New connection to replace old one. */
+ u_int32_t flags;
+ } conn_th;
/*
* Args for other thread types can be added here in the future
@@ -265,6 +304,7 @@ struct __queued_output {
*/
typedef struct __repmgr_message {
STAILQ_ENTRY(__repmgr_message) entries;
+ size_t size;
__repmgr_msg_hdr_args msg_hdr;
union {
struct {
@@ -343,6 +383,7 @@ struct __repmgr_connection {
#define CONN_PARAMETERS 5 /* Awaiting parameters handshake. */
#define CONN_READY 6 /* Everything's fine. */
int state;
+ u_int32_t auto_takeover;/* Connection to remote listener candidate. */
/*
* Input: while we're reading a message, we keep track of what phase
@@ -464,6 +505,8 @@ typedef struct {
SITEADDR addr; /* Unprocessed network address of site. */
u_int32_t config; /* Configuration flags: peer, helper, etc. */
u_int32_t status; /* Group membership status. */
+ u_int32_t flags; /* Group membership flags. */
+ u_int32_t listener_cand;/* Number of listener candidates of site. */
} SITEINFO;
/*
@@ -489,6 +532,42 @@ typedef struct {
((u_int)i) < db_rep->site_cnt; \
(int)(++(i)) == db_rep->self_eid ? ++(i) : i)
+/*
+ * Enable replication manager auto listener takeover.
+ */
+#define HAVE_REPLICATION_LISTENER_TAKEOVER 1
+
+/*
+ * Listener candidate, that is, a subordinate rep-aware process: true when
+ * REP_C_AUTOTAKEOVER is set in the region config, IS_SUBORDINATE(db_rep)
+ * holds, and repmgr_status is running.
+ */
+#define IS_LISTENER_CAND(db_rep) \
+ (FLD_ISSET((db_rep)->region->config, REP_C_AUTOTAKEOVER) && \
+ IS_SUBORDINATE(db_rep) && (db_rep)->repmgr_status == running)
+
+/*
+ * The number of listener candidates for each remote site is maintained in
+ * the listener process and used in subordinate rep-aware processes.
+ *
+ * SET_LISTENER_CAND applies op to sites[eid].listener_cand while holding
+ * mtx_repmgr, but only when REP_C_AUTOTAKEOVER is set in rep->config, this
+ * process is not a subordinate, and cond is true. op is pasted textually
+ * after the field, so it must be an operator suffix.
+ *
+ * The macro is not hygienic: it uses the caller's env, rep, db_rep, sites
+ * and eid variables, and it overwrites sites.
+ */
+#define SET_LISTENER_CAND(cond, op) \
+ do { \
+ if (FLD_ISSET(rep->config, REP_C_AUTOTAKEOVER) && \
+ !IS_SUBORDINATE(db_rep) && (cond)) { \
+ MUTEX_LOCK(env, rep->mtx_repmgr); \
+ sites = R_ADDR(env->reginfo, rep->siteinfo_off);\
+ (sites[eid].listener_cand)op; \
+ MUTEX_UNLOCK(env, rep->mtx_repmgr); \
+ } \
+ } while (0)
+
+/*
+ * CHECK_LISTENER_CAND: when this process is a listener candidate (see
+ * IS_LISTENER_CAND), evaluate "sites[eid].listener_cand op" while holding
+ * mtx_repmgr and assign tval to val if the result is true, fval otherwise.
+ * When this process is not a listener candidate, val is left unchanged.
+ *
+ * The macro is not hygienic: it uses the caller's env, rep, db_rep, sites
+ * and eid variables, and it overwrites sites.
+ */
+#define CHECK_LISTENER_CAND(val, op, tval, fval) \
+ do { \
+ if (IS_LISTENER_CAND(db_rep)) { \
+ MUTEX_LOCK(env, rep->mtx_repmgr); \
+ sites = R_ADDR(env->reginfo, rep->siteinfo_off);\
+ val = ((sites[eid].listener_cand)op) ? \
+ (tval) : (fval); \
+ MUTEX_UNLOCK(env, rep->mtx_repmgr); \
+ } \
+ } while (0)
+
struct __repmgr_site {
repmgr_netaddr_t net_addr;
@@ -499,12 +578,14 @@ struct __repmgr_site {
* host/port network address is promised to be associated with the
* locally known EID for the life of the environment.
*/
- u_int32_t membership; /* Status flags from GMDB. */
+ u_int32_t membership; /* Status value from GMDB. */
+ u_int32_t gmdb_flags; /* Flags from GMDB. */
u_int32_t config; /* Flags from site->set_config() */
/*
* Everything below here is applicable only to remote sites.
*/
+ u_int32_t max_ack_gen; /* Master generation for max_ack. */
DB_LSN max_ack; /* Best ack we've heard from this site. */
int ack_policy; /* Or 0 if unknown. */
u_int16_t alignment; /* Requirements for app channel msgs. */
@@ -604,11 +685,11 @@ struct __channel {
* connections may be found: (1) SITE->ref.conn, (2) SITE->sub_conns, and
* (3) db_rep->connections.
*
- * 1. SITE->ref.conn points to our connection with the main process running
- * at the given site, if such a connection exists. We may have initiated
- * the connection to the site ourselves, or we may have received it as an
- * incoming connection. Once it is established there is very little
- * difference between those two cases.
+ * 1. SITE->ref.conn points to our connection with the listener process
+ * running at the given site, if such a connection exists. We may have
+ * initiated the connection to the site ourselves, or we may have received
+ * it as an incoming connection. Once it is established there is very
+ * little difference between those two cases.
*
* 2. SITE->sub_conns is a list of connections we have with subordinate
* processes running at the given site. There can be any number of these
@@ -694,6 +775,7 @@ struct __channel {
*/
#define APP_CHANNEL_CONNECTION 0x02 /* Connection used for app channel. */
#define ELECTABLE_SITE 0x04
+#define REPMGR_AUTOTAKEOVER 0x08 /* Could become main connection. */
#define REPMGR_SUBORDINATE 0x01 /* This is a subordinate connection. */
/*
@@ -719,13 +801,20 @@ typedef struct {
* As with message formats, stored formats are defined in repmgr.msg.
*/
/*
- * Flags for the Group Membership data portion of a record. Like message type
- * codes, these values are frozen across releases, in order to avoid pointless
- * churn.
+ * Status values for the Group Membership data portion of a record. Like
+ * message type codes, these values are frozen across releases, in order to
+ * avoid pointless churn. These values are mutually exclusive.
*/
#define SITE_ADDING 0x01
#define SITE_DELETING 0x02
#define SITE_PRESENT 0x04
+/*
+ * Flags for the Group Membership data portion of a record. These values are
+ * also frozen across releases. These values are bit fields and may be OR'ed
+ * together.
+ */
+#define SITE_VIEW 0x01
+#define SITE_JOIN_ELECTABLE 0x02
/*
* Message types whose processing could take a long time. We're careful to
@@ -755,9 +844,9 @@ typedef struct {
* fraction of the code, it's a tiny fraction of the time: repmgr spends most of
* its time in a call to select(), and as well a bit in calls into the Base
* replication API. All of those release the mutex.
- * Access to repmgr's shared list of site addresses is protected by
- * another mutex: mtx_repmgr. And, when changing space allocation for that site
- * list we conform to the convention of acquiring renv->mtx_regenv. These are
+ * Access to repmgr's shared values is protected by another mutex:
+ * mtx_repmgr. And, when changing space allocation for the shared site list
+ * we conform to the convention of acquiring renv->mtx_regenv. These are
* less frequent of course.
* When it's necessary to acquire more than one of these mutexes, the
* ordering priority (or "lock ordering protocol") is:
diff --git a/src/dbinc/shqueue.h b/src/dbinc/shqueue.h
index 22464462..20e0fae7 100644
--- a/src/dbinc/shqueue.h
+++ b/src/dbinc/shqueue.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -140,6 +140,17 @@ struct { \
((struct type *)(((u_int8_t *)(elm)) + (elm)->field.sle_next)))
/*
+ * __SH_LIST_WAS_EMPTY is private API. SH_LIST_FIRST is not thread-safe;
+ * the slh_first field could be evaluated multiple times if the optimizer
+ * does not eliminate the second load. __SH_LIST_WAS_EMPTY tests whether a
+ * prior call of SH_LIST_FIRSTP occurred while the list was empty; i.e., its
+ * relative offset was -1. It is thread-safe to call SH_LIST_FIRSTP and then
+ * test the resulting pointer with __SH_LIST_WAS_EMPTY.
+ */
+#define __SH_LIST_WAS_EMPTY(head, ptr) \
+ ((u_int8_t *)(ptr) == (((u_int8_t *)(head)) + (-1)))
+
+ /*
*__SH_LIST_PREV_OFF is private API. It calculates the address of
* the elm->field.sle_next member of a SH_LIST structure. All offsets
* between elements are relative to that point in SH_LIST structures.
diff --git a/src/dbinc/tcl_db.h b/src/dbinc/tcl_db.h
index 4c56164f..99992467 100644
--- a/src/dbinc/tcl_db.h
+++ b/src/dbinc/tcl_db.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -16,7 +16,7 @@ extern "C" {
#define MSG_SIZE 100 /* Message size */
enum INFOTYPE {
- I_AUX, I_DB, I_DBC, I_ENV, I_LOCK, I_LOGC, I_MP, I_NDBM, I_PG, I_SEQ, I_TXN};
+ I_AUX, I_DB, I_DBC, I_DBSTREAM, I_ENV, I_LOCK, I_LOGC, I_MP, I_NDBM, I_PG, I_SEQ, I_TXN};
#define MAX_ID 8 /* Maximum number of sub-id's we need */
#define DBTCL_PREP 64 /* Size of txn_recover preplist */
@@ -24,9 +24,11 @@ enum INFOTYPE {
#define DBTCL_DBM 1
#define DBTCL_NDBM 2
-#define DBTCL_GETCLOCK 0
-#define DBTCL_GETLIMIT 1
-#define DBTCL_GETREQ 2
+#define DBTCL_GETCLOCK 0
+#define DBTCL_GETINQUEUE_MAX 1
+#define DBTCL_GETINQUEUE_REDZONE 2
+#define DBTCL_GETLIMIT 3
+#define DBTCL_GETREQ 4
#define DBTCL_MUT_ALIGN 0
#define DBTCL_MUT_INCR 1
@@ -36,9 +38,11 @@ enum INFOTYPE {
/*
* Data structure to record information about events that have occurred. Tcl
- * command "env event_info" can retrieve the information. For now, we record
- * only one occurrence per event type; "env event_info -clear" can be used to
- * reset the info.
+ * command "env event_info" can retrieve all the information except the event
+ * counts, and "env event_count" can retrieve the number of times a specific
+ * event has fired. We added "env event_count" instead of merging the count
+ * information into "env event_info" to avoid breaking the existing tests.
+ * Tcl command "env event_info -clear" can be used to reset the info.
*
* Besides the bit flag that records the fact that an event type occurred, some
* event types have associated "info" and we record that here too. When new
@@ -47,16 +51,17 @@ enum INFOTYPE {
* with the "env event_info" results.
*/
typedef struct dbtcl_event_info {
- u_int32_t events; /* Bit flag on for each event fired. */
- int panic_error;
- int newmaster_eid;
- int added_eid;
- int removed_eid;
- pid_t attached_process;
- int connected_eid;
+ u_int32_t events; /* Bit flag on for each event fired. */
+ int panic_error;
+ int newmaster_eid;
+ int added_eid;
+ int removed_eid;
+ pid_t attached_process;
+ int connected_eid;
DB_REPMGR_CONN_ERR conn_broken_info;
DB_REPMGR_CONN_ERR conn_failed_try_info;
- DB_LSN sync_point;
+ DB_LSN sync_point;
+ size_t count[32]; /* Times each event fired; presumably 1 slot per events bit. */
} DBTCL_EVENT_INFO;
/*
@@ -99,6 +104,7 @@ typedef struct dbtcl_info {
DB_LOCK *lock;
DB_LOGC *logc;
DB_MPOOLFILE *mp;
+ DB_STREAM *dbsp;
DB_TXN *txnp;
void *anyp;
} un;
@@ -128,6 +134,7 @@ typedef struct dbtcl_info {
Tcl_Obj *i_isalive;
Tcl_Obj *i_part_callback;
Tcl_Obj *i_rep_send;
+ Tcl_Obj *i_rep_view;
Tcl_Obj *i_second_call;
/* Environment ID for the i_rep_send callback. */
@@ -144,6 +151,7 @@ typedef struct dbtcl_info {
#define i_anyp un.anyp
#define i_dbp un.dbp
#define i_dbcp un.dbcp
+#define i_dbsp un.dbsp
#define i_envp un.envp
#define i_lock un.lock
#define i_logc un.logc
@@ -170,6 +178,8 @@ typedef struct dbtcl_info {
#define i_dbdbcid i_otherid[0]
+#define i_dbcdbsid i_otherid[0]
+
extern int __debug_on, __debug_print, __debug_stop, __debug_test;
typedef struct dbtcl_global {
@@ -202,6 +212,7 @@ extern DBTCL_GLOBAL __dbtcl_global;
* functions this will typically go before the "free" function to free the
* stat structure returned by DB.
*/
+#ifdef HAVE_STATISTICS
#define MAKE_STAT_LIST(s, v) do { \
result = _SetListElemInt(interp, res, (s), (long)(v)); \
if (result != TCL_OK) \
@@ -213,6 +224,11 @@ extern DBTCL_GLOBAL __dbtcl_global;
if (result != TCL_OK) \
goto error; \
} while (0)
+#else
+/* These do-nothing versions reduce warnings; do/while keeps them else-safe. */
+#define MAKE_STAT_LIST(s, v) do { if (0) goto error; } while (0)
+#define MAKE_WSTAT_LIST(s, v) do { if (0) goto error; } while (0)
+#endif
/*
* MAKE_STAT_LSN appends a {name {LSNfile LSNoffset}} pair to a result list
@@ -257,13 +273,14 @@ extern DBTCL_GLOBAL __dbtcl_global;
* This macro also assumes a label "error" to go to in the event of a Tcl
* error.
*/
-#define MAKE_SITE_LIST(e, h, p, s, pr) do { \
- myobjc = 5; \
+#define MAKE_SITE_LIST(e, h, p, s, pr, vw) do { \
+ myobjc = 6; \
myobjv[0] = Tcl_NewIntObj(e); \
myobjv[1] = Tcl_NewStringObj((h), (int)strlen(h)); \
myobjv[2] = Tcl_NewIntObj((int)p); \
myobjv[3] = Tcl_NewStringObj((s), (int)strlen(s)); \
myobjv[4] = Tcl_NewStringObj((pr), (int)strlen(pr)); \
+ myobjv[5] = Tcl_NewStringObj((vw), (int)strlen(vw)); \
thislist = Tcl_NewListObj(myobjc, myobjv); \
result = Tcl_ListObjAppendElement(interp, res, thislist); \
if (result != TCL_OK) \
diff --git a/src/dbinc/txn.h b/src/dbinc/txn.h
index 7cbae263..682d7c42 100644
--- a/src/dbinc/txn.h
+++ b/src/dbinc/txn.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/dbinc/win_db.h b/src/dbinc/win_db.h
index ba57cd1f..e22aba98 100644
--- a/src/dbinc/win_db.h
+++ b/src/dbinc/win_db.h
@@ -1,17 +1,21 @@
/*-
- * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2010, 2015 Oracle and/or its affiliates. All rights reserved.
*
* The following provides the information necessary to build Berkeley
* DB on native Windows, and other Windows environments such as MinGW.
*/
/*
- * Berkeley DB requires at least Windows 2000, tell Visual Studio of the
- * requirement.
+ * Berkeley DB requires at least Windows 2000, and Windows XP if we are using
+ * Visual Studio 2012 or later. Tell Visual Studio of the requirement.
*/
#ifndef _WIN32_WINNT
+#if defined(_MSC_VER) && _MSC_VER >= 1700 /* Guarded: unset on non-MSVC. */
+#define _WIN32_WINNT 0x0501
+#else
#define _WIN32_WINNT 0x0500
#endif
+#endif
#ifndef DB_WINCE
#include <sys/types.h>
@@ -69,12 +73,46 @@
#endif
#define getpid GetCurrentProcessId
#define snprintf _snprintf
+#ifndef strcasecmp
#define strcasecmp _stricmp
#define strncasecmp _strnicmp
+#endif
#define vsnprintf _vsnprintf
#define h_errno WSAGetLastError()
+#ifdef DB_WINCE
+/* Macros used by setvbuf on WINCE */
+#ifndef _IOFBF
+#define _IOFBF 0x0000
+#endif
+#ifndef _IOLBF
+#define _IOLBF 0x0040
+#endif
+#ifndef _IONBF
+#define _IONBF 0x0004
+#endif
+/* The macros for time functions */
+#define freopen __ce_freopen
+#define gmtime __ce_gmtime
+#define mktime __ce_mktime
+#define remove __ce_remove
+#define SECSPERMIN 60
+#define MINSPERHOUR 60
+#define HOURSPERDAY 24
+#define DAYSPERWEEK 7
+#define DAYSPERNYEAR 365
+#define DAYSPERLYEAR 366
+#define SECSPERHOUR (SECSPERMIN * MINSPERHOUR)
+#define SECSPERDAY ((long) SECSPERHOUR * HOURSPERDAY)
+#define MONSPERYEAR 12
+#define TM_YEAR_BASE 1900
+#define TM_YEAR_EPOCH 1970
+#define isleap(y) ((((y) % 4) == 0 && ((y) % 100) != 0) || ((y) % 400) == 0)
+extern const __DB_IMPORT unsigned int mon_lengths[][MONSPERYEAR];
+extern const __DB_IMPORT unsigned int year_lengths[];
+#endif
+
/*
* Win32 does not have getopt.
*
diff --git a/src/dbinc/xa.h b/src/dbinc/xa.h
index 7283c1ea..7b7e2cb0 100644
--- a/src/dbinc/xa.h
+++ b/src/dbinc/xa.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1998, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/dbinc_auto/api_flags.in b/src/dbinc_auto/api_flags.in
index 9727ede2..a10b6b62 100644
--- a/src/dbinc_auto/api_flags.in
+++ b/src/dbinc_auto/api_flags.in
@@ -36,6 +36,7 @@
#define DB_FLUSH 0x00000002
#define DB_FORCE 0x00000001
#define DB_FORCESYNC 0x00000001
+#define DB_FORCESYNCENV 0x00000002
#define DB_FOREIGN_ABORT 0x00000001
#define DB_FOREIGN_CASCADE 0x00000002
#define DB_FOREIGN_NULLIFY 0x00000004
@@ -53,8 +54,9 @@
#define DB_INIT_REP 0x00001000
#define DB_INIT_TXN 0x00002000
#define DB_INORDER 0x00000020
-#define DB_INTERNAL_PERSISTENT_DB 0x00001000
-#define DB_INTERNAL_TEMPORARY_DB 0x00002000
+#define DB_INTERNAL_BLOB_DB 0x00001000
+#define DB_INTERNAL_PERSISTENT_DB 0x00002000
+#define DB_INTERNAL_TEMPORARY_DB 0x00004000
#define DB_JOIN_NOSORT 0x00000001
#define DB_LEGACY 0x00000004
#define DB_LOCAL_SITE 0x00000008
@@ -67,12 +69,14 @@
#define DB_LOCK_SWITCH 0x00000020
#define DB_LOCK_UPGRADE 0x00000040
#define DB_LOG_AUTO_REMOVE 0x00000001
+#define DB_LOG_BLOB 0x00000002
#define DB_LOG_CHKPNT 0x00000001
#define DB_LOG_COMMIT 0x00000004
-#define DB_LOG_DIRECT 0x00000002
-#define DB_LOG_DSYNC 0x00000004
-#define DB_LOG_IN_MEMORY 0x00000008
+#define DB_LOG_DIRECT 0x00000004
+#define DB_LOG_DSYNC 0x00000008
+#define DB_LOG_IN_MEMORY 0x00000010
#define DB_LOG_NOCOPY 0x00000008
+#define DB_LOG_NOSYNC 0x00000020
#define DB_LOG_NOT_DURABLE 0x00000010
#define DB_LOG_NO_DATA 0x00000002
#define DB_LOG_VERIFY_CAF 0x00000001
@@ -84,7 +88,7 @@
#define DB_LOG_VERIFY_VERBOSE 0x00000040
#define DB_LOG_VERIFY_WARNING 0x00000080
#define DB_LOG_WRNOSYNC 0x00000020
-#define DB_LOG_ZERO 0x00000010
+#define DB_LOG_ZERO 0x00000040
#define DB_MPOOL_CREATE 0x00000001
#define DB_MPOOL_DIRTY 0x00000002
#define DB_MPOOL_DISCARD 0x00000001
@@ -102,17 +106,18 @@
#define DB_MUTEX_ALLOCATED 0x00000001
#define DB_MUTEX_LOCKED 0x00000002
#define DB_MUTEX_LOGICAL_LOCK 0x00000004
+#define DB_MUTEX_OWNER_DEAD 0x00000020
#define DB_MUTEX_PROCESS_ONLY 0x00000008
#define DB_MUTEX_SELF_BLOCK 0x00000010
-#define DB_MUTEX_SHARED 0x00000020
-#define DB_NOERROR 0x00004000
+#define DB_MUTEX_SHARED 0x00000040
+#define DB_NOERROR 0x00008000
#define DB_NOFLUSH 0x00001000
#define DB_NOLOCKING 0x00002000
#define DB_NOMMAP 0x00000010
#define DB_NOORDERCHK 0x00000002
#define DB_NOPANIC 0x00004000
#define DB_NOSYNC 0x00000001
-#define DB_NO_AUTO_COMMIT 0x00008000
+#define DB_NO_AUTO_COMMIT 0x00010000
#define DB_NO_CHECKPOINT 0x00008000
#define DB_ODDFILESIZE 0x00000080
#define DB_ORDERCHKONLY 0x00000004
@@ -123,7 +128,7 @@
#define DB_PR_PAGE 0x00000010
#define DB_PR_RECOVERYTEST 0x00000020
#define DB_RDONLY 0x00000400
-#define DB_RDWRMASTER 0x00010000
+#define DB_RDWRMASTER 0x00020000
#define DB_READ_COMMITTED 0x00000400
#define DB_READ_UNCOMMITTED 0x00000200
#define DB_RECNUM 0x00000040
@@ -134,17 +139,20 @@
#define DB_RENUMBER 0x00000080
#define DB_REPMGR_CONF_2SITE_STRICT 0x00000001
#define DB_REPMGR_CONF_ELECTIONS 0x00000002
+#define DB_REPMGR_CONF_PREFMAS_CLIENT 0x00000004
+#define DB_REPMGR_CONF_PREFMAS_MASTER 0x00000008
#define DB_REPMGR_NEED_RESPONSE 0x00000001
#define DB_REPMGR_PEER 0x00000010
#define DB_REP_ANYWHERE 0x00000001
#define DB_REP_CLIENT 0x00000001
-#define DB_REP_CONF_AUTOINIT 0x00000004
-#define DB_REP_CONF_AUTOROLLBACK 0x00000008
-#define DB_REP_CONF_BULK 0x00000010
-#define DB_REP_CONF_DELAYCLIENT 0x00000020
-#define DB_REP_CONF_INMEM 0x00000040
-#define DB_REP_CONF_LEASE 0x00000080
-#define DB_REP_CONF_NOWAIT 0x00000100
+#define DB_REP_CONF_AUTOINIT 0x00000010
+#define DB_REP_CONF_AUTOROLLBACK 0x00000020
+#define DB_REP_CONF_BULK 0x00000040
+#define DB_REP_CONF_DELAYCLIENT 0x00000080
+#define DB_REP_CONF_ELECT_LOGLENGTH 0x00000100
+#define DB_REP_CONF_INMEM 0x00000200
+#define DB_REP_CONF_LEASE 0x00000400
+#define DB_REP_CONF_NOWAIT 0x00000800
#define DB_REP_ELECTION 0x00000004
#define DB_REP_MASTER 0x00000002
#define DB_REP_NOBUFFER 0x00000002
@@ -161,8 +169,9 @@
#define DB_SEQ_WRAP 0x00000008
#define DB_SEQ_WRAPPED 0x00000010
#define DB_SET_LOCK_TIMEOUT 0x00000001
-#define DB_SET_REG_TIMEOUT 0x00000004
-#define DB_SET_TXN_NOW 0x00000008
+#define DB_SET_MUTEX_FAILCHK_TIMEOUT 0x00000004
+#define DB_SET_REG_TIMEOUT 0x00000008
+#define DB_SET_TXN_NOW 0x00000010
#define DB_SET_TXN_TIMEOUT 0x00000002
#define DB_SHALLOW_DUP 0x00000100
#define DB_SNAPSHOT 0x00000200
@@ -188,7 +197,7 @@
#define DB_SYSTEM_MEM 0x00080000
#define DB_THREAD 0x00000020
#define DB_TIME_NOTGRANTED 0x00040000
-#define DB_TRUNCATE 0x00020000
+#define DB_TRUNCATE 0x00040000
#define DB_TXN_BULK 0x00000010
#define DB_TXN_FAMILY 0x00000040
#define DB_TXN_NOSYNC 0x00000001
@@ -206,23 +215,24 @@
#define DB_VERB_DEADLOCK 0x00000002
#define DB_VERB_FILEOPS 0x00000004
#define DB_VERB_FILEOPS_ALL 0x00000008
-#define DB_VERB_RECOVERY 0x00000010
-#define DB_VERB_REGISTER 0x00000020
-#define DB_VERB_REPLICATION 0x00000040
-#define DB_VERB_REPMGR_CONNFAIL 0x00000080
-#define DB_VERB_REPMGR_MISC 0x00000100
-#define DB_VERB_REP_ELECT 0x00000200
-#define DB_VERB_REP_LEASE 0x00000400
-#define DB_VERB_REP_MISC 0x00000800
-#define DB_VERB_REP_MSGS 0x00001000
-#define DB_VERB_REP_SYNC 0x00002000
-#define DB_VERB_REP_SYSTEM 0x00004000
-#define DB_VERB_REP_TEST 0x00008000
-#define DB_VERB_WAITSFOR 0x00010000
+#define DB_VERB_MVCC 0x00000010
+#define DB_VERB_RECOVERY 0x00000020
+#define DB_VERB_REGISTER 0x00000040
+#define DB_VERB_REPLICATION 0x00000080
+#define DB_VERB_REPMGR_CONNFAIL 0x00000100
+#define DB_VERB_REPMGR_MISC 0x00000200
+#define DB_VERB_REP_ELECT 0x00000400
+#define DB_VERB_REP_LEASE 0x00000800
+#define DB_VERB_REP_MISC 0x00001000
+#define DB_VERB_REP_MSGS 0x00002000
+#define DB_VERB_REP_SYNC 0x00004000
+#define DB_VERB_REP_SYSTEM 0x00008000
+#define DB_VERB_REP_TEST 0x00010000
+#define DB_VERB_WAITSFOR 0x00020000
#define DB_VERIFY 0x00000002
#define DB_VERIFY_PARTITION 0x00040000
#define DB_WRITECURSOR 0x00000010
#define DB_WRITELOCK 0x00000020
-#define DB_WRITEOPEN 0x00040000
+#define DB_WRITEOPEN 0x00080000
#define DB_XA_CREATE 0x00000001
#define DB_YIELDCPU 0x00080000
diff --git a/src/dbinc_auto/blob_ext.h b/src/dbinc_auto/blob_ext.h
new file mode 100644
index 00000000..3eac5c8d
--- /dev/null
+++ b/src/dbinc_auto/blob_ext.h
@@ -0,0 +1,41 @@
+/* DO NOT EDIT: automatically built by dist/s_include. */
+#ifndef _blob_ext_h_
+#define _blob_ext_h_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+int __blob_file_create __P ((DBC *, DB_FH **, db_seq_t *));
+int __blob_file_close __P ((DBC *, DB_FH *, u_int32_t));
+int __blob_file_delete __P((DBC *, db_seq_t));
+int __blob_file_open __P((DB *, DB_FH **, db_seq_t, u_int32_t, int));
+int __blob_file_read __P((ENV *, DB_FH *, DBT *, off_t, u_int32_t));
+int __blob_file_write __P((DBC *, DB_FH *, DBT *, off_t, db_seq_t, off_t *, u_int32_t));
+int __blob_bulk __P((DBC *, u_int32_t, db_seq_t, u_int8_t *));
+int __blob_get __P((DBC *, DBT *, db_seq_t, off_t, void **, u_int32_t *));
+int __blob_put __P(( DBC *, DBT *, db_seq_t *, off_t *size, DB_LSN *));
+int __blob_repl __P((DBC *, DBT *, db_seq_t, db_seq_t *,off_t *));
+int __blob_del __P((DBC *, db_seq_t));
+int __db_stream_init __P((DBC *, DB_STREAM **, u_int32_t));
+int __db_stream_close_int __P ((DB_STREAM *));
+int __blob_make_sub_dir __P((ENV *, char **, db_seq_t, db_seq_t));
+int __blob_make_meta_fname __P((ENV *, DB *, char **));
+int __blob_get_dir __P((DB *, char **));
+int __blob_generate_dir_ids __P((DB *, DB_TXN *, db_seq_t *));
+int __blob_generate_id __P((DB *, DB_TXN *, db_seq_t *));
+int __blob_highest_id __P((DB *, DB_TXN *, db_seq_t *));
+void __blob_calculate_dirs __P((db_seq_t, char *, int *, int *));
+int __blob_id_to_path __P((ENV *, const char *, db_seq_t, char **));
+int __blob_str_to_id __P((ENV *, const char **, db_seq_t *));
+int __blob_path_to_dir_ids __P((ENV *, const char *, db_seq_t *, db_seq_t *));
+int __blob_salvage __P((ENV *, db_seq_t, off_t, size_t, db_seq_t, db_seq_t, DBT *));
+int __blob_vrfy __P((ENV *, db_seq_t, off_t, db_seq_t, db_seq_t, db_pgno_t, u_int32_t));
+int __blob_del_hierarchy __P((ENV *));
+int __blob_del_all __P((DB *, DB_TXN *, int));
+int __blob_copy_all __P((DB*, const char *, u_int32_t));
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_blob_ext_h_ */
diff --git a/src/dbinc_auto/btree_ext.h b/src/dbinc_auto/btree_ext.h
index c90f5b80..bdd95750 100644
--- a/src/dbinc_auto/btree_ext.h
+++ b/src/dbinc_auto/btree_ext.h
@@ -8,11 +8,11 @@ extern "C" {
int __bam_compact_int __P((DBC *, DBT *, DBT *, u_int32_t, int *, DB_COMPACT *, int *));
int __bam_compact_opd __P((DBC *, db_pgno_t, PAGE **, u_int32_t, DB_COMPACT *, int *));
-int __bam_truncate_ipages __P((DB *, DB_THREAD_INFO *, DB_TXN *, DB_COMPACT *));
-int __bam_cmp __P((DBC *, const DBT *, PAGE *, u_int32_t, int (*)(DB *, const DBT *, const DBT *), int *));
-int __bam_defcmp __P((DB *, const DBT *, const DBT *));
+int __bam_truncate_ipages __P((DB *, DB_THREAD_INFO *, DB_TXN *, DB_COMPACT *, int *));
+int __bam_cmp __P((DBC *, const DBT *, PAGE *, u_int32_t, int (*)(DB *, const DBT *, const DBT *, size_t *), int *, size_t *));
+int __bam_defcmp __P((DB *, const DBT *, const DBT *, size_t *));
size_t __bam_defpfx __P((DB *, const DBT *, const DBT *));
-int __bam_compress_dupcmp __P((DB *, const DBT *, const DBT *));
+int __bam_compress_dupcmp __P((DB *, const DBT *, const DBT *, size_t *));
int __bam_defcompress __P((DB *, const DBT *, const DBT *, const DBT *, const DBT *, DBT *));
int __bam_defdecompress __P((DB *, const DBT *, const DBT *, DBT *, DBT *, DBT *));
int __bamc_compress_get __P((DBC *, DBT *, DBT *, u_int32_t));
@@ -52,7 +52,7 @@ int __bam_db_create __P((DB *));
int __bam_db_close __P((DB *));
void __bam_map_flags __P((DB *, u_int32_t *, u_int32_t *));
int __bam_set_flags __P((DB *, u_int32_t *flagsp));
-int __bam_set_bt_compare __P((DB *, int (*)(DB *, const DBT *, const DBT *)));
+int __bam_set_bt_compare __P((DB *, int (*)(DB *, const DBT *, const DBT *, size_t *)));
int __bam_set_bt_compress __P((DB *, int (*)(DB *, const DBT *, const DBT *, const DBT *, const DBT *, DBT *), int (*)(DB *, const DBT *, const DBT *, DBT *, DBT *, DBT *)));
int __bam_get_bt_minkey __P((DB *, u_int32_t *));
void __bam_copy_config __P((DB *, DB*, u_int32_t));
@@ -115,6 +115,8 @@ int __bam_traverse __P((DBC *, db_lockmode_t, db_pgno_t, int (*)(DBC *, PAGE *,
int __bam_30_btreemeta __P((DB *, char *, u_int8_t *));
int __bam_31_btreemeta __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
int __bam_31_lbtree __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
+int __bam_60_btreemeta __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
+int __bam_60_lbtree __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
int __bam_vrfy_meta __P((DB *, VRFY_DBINFO *, BTMETA *, db_pgno_t, u_int32_t));
int __ram_vrfy_leaf __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, u_int32_t));
int __bam_vrfy __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, u_int32_t));
diff --git a/src/dbinc_auto/common_ext.h b/src/dbinc_auto/common_ext.h
index ac16e9db..1a94d3a1 100644
--- a/src/dbinc_auto/common_ext.h
+++ b/src/dbinc_auto/common_ext.h
@@ -25,6 +25,7 @@ int __db_pgfmt __P((ENV *, db_pgno_t));
#ifdef DIAGNOSTIC
void __db_assert __P((ENV *, const char *, const char *, int));
#endif
+void __env_panic_event __P((ENV *, int));
int __env_panic_msg __P((ENV *));
int __env_panic __P((ENV *, int));
char *__db_unknown_error __P((int));
@@ -33,9 +34,10 @@ void __db_err __P((const ENV *, int, const char *, ...)) __attribute__ ((__forma
void __db_errx __P((const ENV *, const char *, ...)) __attribute__ ((__format__ (__printf__, 2, 3)));
void __db_errcall __P((const DB_ENV *, int, db_error_set_t, const char *, va_list));
void __db_errfile __P((const DB_ENV *, int, db_error_set_t, const char *, va_list));
-void __db_msgadd __P((ENV *, DB_MSGBUF *, const char *, ...)) __attribute__ ((__format__ (__printf__, 3, 4)));
-void __db_msgadd_ap __P((ENV *, DB_MSGBUF *, const char *, va_list));
+void __db_msgadd __P((const ENV *, DB_MSGBUF *, const char *, ...)) __attribute__ ((__format__ (__printf__, 3, 4)));
+void __db_msgadd_ap __P((const ENV *, DB_MSGBUF *, const char *, va_list));
void __db_msg __P((const ENV *, const char *, ...)) __attribute__ ((__format__ (__printf__, 2, 3)));
+void __db_debug_msg __P((const ENV *, const char *, ...));
void __db_repmsg __P((const ENV *, const char *, ...)) __attribute__ ((__format__ (__printf__, 2, 3)));
int __db_unknown_flag __P((ENV *, char *, u_int32_t));
int __db_unknown_type __P((ENV *, char *, DBTYPE));
@@ -50,6 +52,24 @@ int __db_check_lsn __P((ENV *, DB_LSN *, DB_LSN *));
int __db_rdonly __P((const ENV *, const char *));
int __db_space_err __P((const DB *));
int __db_failed __P((const ENV *, const char *, pid_t, db_threadid_t));
+int __env_failure_remember __P((const ENV *, const char *));
+#ifdef HAVE_ERROR_HISTORY
+void __db_thread_init __P((void));
+#endif
+#ifdef HAVE_ERROR_HISTORY
+int __db_diags __P((const ENV *, int));
+#endif
+#ifdef HAVE_ERROR_HISTORY
+DB_MSGBUF *__db_deferred_get __P((void));
+#endif
+#ifdef HAVE_ERROR_HISTORY
+void __db_deferred_discard __P((void));
+#endif
+#ifdef HAVE_ERROR_HISTORY
+int __db_remember_context __P((const ENV *, DB_MSGBUF *, int));
+#endif
+char * __db_ctimespec __P((const db_timespec *, char *));
+char *__db_fmt_quote __P((char *, size_t, const char *));
int __db_getlong __P((DB_ENV *, const char *, char *, long, long, long *));
int __db_getulong __P((DB_ENV *, const char *, char *, u_long, u_long, u_long *));
void __db_idspace __P((u_int32_t *, int, u_int32_t *, u_int32_t *));
diff --git a/src/dbinc_auto/db_ext.h b/src/dbinc_auto/db_ext.h
index de2a6ce4..719fc0c5 100644
--- a/src/dbinc_auto/db_ext.h
+++ b/src/dbinc_auto/db_ext.h
@@ -62,14 +62,19 @@ int __db_merge_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __db_pgno_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __db_init_print __P((ENV *, DB_DISTAB *));
int __db_dbbackup_pp __P((DB_ENV *, const char *, const char *, u_int32_t));
-int __db_dbbackup __P((DB_ENV *, DB_THREAD_INFO *, const char *, const char *, u_int32_t));
-int __db_backup __P((DB_ENV *, const char *, u_int32_t));
+int __db_dbbackup __P((DB_ENV *, DB_THREAD_INFO *, const char *, const char *, u_int32_t, u_int32_t, const char *));
+int backup_data_copy __P(( DB_ENV *, const char *, const char *, const char *, int));
+int __db_backup_pp __P((DB_ENV *, const char *, u_int32_t));
int __dbc_close __P((DBC *));
int __dbc_destroy __P((DBC *));
int __dbc_cmp __P((DBC *, DBC *, int *));
int __dbc_count __P((DBC *, db_recno_t *));
int __dbc_del __P((DBC *, u_int32_t));
int __dbc_idel __P((DBC *, u_int32_t));
+int __dbc_db_stream __P((DBC *, DB_STREAM **, u_int32_t));
+int __dbc_get_blob_id __P((DBC *, db_seq_t *));
+int __dbc_get_blob_size __P((DBC *, off_t *));
+int __dbc_set_blob_size __P((DBC *, off_t));
#ifdef HAVE_COMPRESSION
int __dbc_bulk_del __P((DBC *, DBT *, u_int32_t));
#endif
@@ -93,15 +98,16 @@ u_int32_t __db_partsize __P((u_int32_t, DBT *));
#ifdef DIAGNOSTIC
void __db_check_skeyset __P((DB *, DBT *));
#endif
+int __dbc_diags __P((DBC *, int));
int __cdsgroup_begin __P((ENV *, DB_TXN **));
int __cdsgroup_begin_pp __P((DB_ENV *, DB_TXN **));
int __db_compact_int __P((DB *, DB_THREAD_INFO *, DB_TXN *, DBT *, DBT *, DB_COMPACT *, u_int32_t, DBT *));
-int __db_exchange_page __P((DBC *, PAGE **, PAGE *, db_pgno_t, int));
-int __db_truncate_overflow __P((DBC *, db_pgno_t, PAGE **, DB_COMPACT *));
-int __db_truncate_root __P((DBC *, PAGE *, u_int32_t, db_pgno_t *, u_int32_t));
+int __db_exchange_page __P((DBC *, PAGE **, PAGE *, db_pgno_t, int, int *));
+int __db_truncate_overflow __P((DBC *, db_pgno_t, PAGE **, DB_COMPACT *, int *));
+int __db_truncate_root __P((DBC *, PAGE *, u_int32_t, db_pgno_t *, u_int32_t, int *));
int __db_find_free __P((DBC *, u_int32_t, u_int32_t, db_pgno_t, db_pgno_t *));
int __db_relink __P((DBC *, PAGE *, PAGE *, db_pgno_t));
-int __db_move_metadata __P((DBC *, DBMETA **, DB_COMPACT *));
+int __db_move_metadata __P((DBC *, DBMETA **, DB_COMPACT *, int *));
int __db_pgin __P((DB_ENV *, db_pgno_t, void *, DBT *));
int __db_pgout __P((DB_ENV *, db_pgno_t, void *, DBT *));
int __db_decrypt_pg __P((ENV *, DB *, PAGE *));
@@ -185,6 +191,10 @@ int __db_has_pagelock __P((ENV *, DB_LOCKER *, DB_MPOOLFILE *, PAGE *, db_lockmo
int __db_lput __P((DBC *, DB_LOCK *));
int __db_create_internal __P((DB **, ENV *, u_int32_t));
int __dbh_am_chk __P((DB *, u_int32_t));
+int __db_get_blob_threshold __P((DB *, u_int32_t *));
+int __db_set_blob_threshold __P((DB *, u_int32_t, u_int32_t));
+int __db_blobs_enabled __P((DB *));
+int __db_set_dup_compare __P((DB *, int (*)(DB *, const DBT *, const DBT *, size_t *)));
int __db_get_flags __P((DB *, u_int32_t *));
int __db_set_flags __P((DB *, u_int32_t));
int __db_get_lorder __P((DB *, int *));
@@ -197,12 +207,13 @@ int __db_init_subdb __P((DB *, DB *, const char *, DB_THREAD_INFO *, DB_TXN *));
int __db_chk_meta __P((ENV *, DB *, DBMETA *, u_int32_t));
int __db_meta_setup __P((ENV *, DB *, const char *, DBMETA *, u_int32_t, u_int32_t));
int __db_reopen __P((DBC *));
+int __db_alloc_dbt __P((ENV *, DBT *, u_int32_t, u_int32_t *, u_int32_t *, void **, u_int32_t *));
int __db_goff __P((DBC *, DBT *, u_int32_t, db_pgno_t, void **, u_int32_t *));
int __db_poff __P((DBC *, const DBT *, db_pgno_t *));
int __db_ovref __P((DBC *, db_pgno_t));
int __db_doff __P((DBC *, db_pgno_t));
-int __db_moff __P((DBC *, const DBT *, db_pgno_t, u_int32_t, int (*)(DB *, const DBT *, const DBT *), int *));
-int __db_coff __P((DBC *, const DBT *, const DBT *, int (*)(DB *, const DBT *, const DBT *), int *));
+int __db_moff __P((DBC *, const DBT *, db_pgno_t, u_int32_t, int (*)(DB *, const DBT *, const DBT *, size_t *), int *, size_t *));
+int __db_coff __P((DBC *, const DBT *, const DBT *, int (*)(DB *, const DBT *, const DBT *, size_t *), int *));
int __db_vrfy_overflow __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, u_int32_t));
int __db_vrfy_ovfl_structure __P((DB *, VRFY_DBINFO *, db_pgno_t, u_int32_t, u_int32_t));
int __db_safe_goff __P((DB *, VRFY_DBINFO *, db_pgno_t, DBT *, void *, u_int32_t *, u_int32_t));
@@ -221,11 +232,12 @@ int __db_name_to_val __P((FN const *, char *));
const char *__db_pagetype_to_string __P((u_int32_t));
int __db_dump_pp __P((DB *, const char *, int (*)(void *, const void *), void *, int, int));
int __db_dump __P((DB *, const char *, int (*)(void *, const void *), void *, int, int));
-int __db_prdbt __P((DBT *, int, const char *, void *, int (*)(void *, const void *), int, int));
+int __db_prdbt __P((DBT *, int, const char *, void *, int (*)(void *, const void *), int, int, int));
int __db_prheader __P((DB *, const char *, int, int, void *, int (*)(void *, const void *), VRFY_DBINFO *, db_pgno_t));
int __db_prfooter __P((void *, int (*)(void *, const void *)));
int __db_pr_callback __P((void *, const void *));
const char * __db_dbtype_to_string __P((DBTYPE));
+char *__db_tohex __P((const void *, size_t, char *));
int __db_addrem_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __db_addrem_42_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __db_big_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
@@ -263,6 +275,8 @@ int __db_rename_pp __P((DB *, const char *, const char *, const char *, u_int32_
int __db_rename_int __P((DB *, DB_THREAD_INFO *, DB_TXN *, const char *, const char *, const char *, u_int32_t));
int __db_ret __P((DBC *, PAGE *, u_int32_t, DBT *, void **, u_int32_t *));
int __db_retcopy __P((ENV *, DBT *, void *, u_int32_t, void **, u_int32_t *));
+int __db_dbt_clone __P((ENV *, DBT *, const DBT *));
+int __db_dbt_clone_free __P((ENV *, DBT *));
int __env_fileid_reset_pp __P((DB_ENV *, const char *, u_int32_t));
int __env_fileid_reset __P((ENV *, DB_THREAD_INFO *, const char *, int));
int __env_lsn_reset_pp __P((DB_ENV *, const char *, u_int32_t));
@@ -329,6 +343,7 @@ int __part_fileid_reset __P((ENV *, DB_THREAD_INFO *, const char *, u_int32_t, i
int __part_key_range __P((DBC *, DBT *, DB_KEY_RANGE *, u_int32_t));
int __part_remove __P((DB *, DB_THREAD_INFO *, DB_TXN *, const char *, const char *, u_int32_t));
int __part_rename __P((DB *, DB_THREAD_INFO *, DB_TXN *, const char *, const char *, const char *));
+int __partc_dup __P((DBC *, DBC *));
int __part_verify __P((DB *, VRFY_DBINFO *, const char *, void *, int (*)(void *, const void *), u_int32_t));
int __part_testdocopy __P((DB *, const char *));
int __db_no_partition __P((ENV *));
diff --git a/src/dbinc_auto/dbreg_auto.h b/src/dbinc_auto/dbreg_auto.h
index 63ad0cd3..22f1e84c 100644
--- a/src/dbinc_auto/dbreg_auto.h
+++ b/src/dbinc_auto/dbreg_auto.h
@@ -3,6 +3,28 @@
#ifndef __dbreg_AUTO_H
#define __dbreg_AUTO_H
#include "dbinc/log.h"
+#define DB___dbreg_register_42 2
+typedef struct ___dbreg_register_42_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ u_int32_t opcode;
+ DBT name;
+ DBT uid;
+ int32_t fileid;
+ DBTYPE ftype;
+ db_pgno_t meta_pgno;
+ u_int32_t id;
+} __dbreg_register_42_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __dbreg_register_42_desc[];
+static inline int __dbreg_register_42_read(ENV *env,
+ void *data, __dbreg_register_42_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __dbreg_register_42_desc, sizeof(__dbreg_register_42_args), (void**)arg));
+}
#define DB___dbreg_register 2
typedef struct ___dbreg_register_args {
u_int32_t type;
@@ -15,22 +37,25 @@ typedef struct ___dbreg_register_args {
DBTYPE ftype;
db_pgno_t meta_pgno;
u_int32_t id;
+ u_int32_t blob_fid_lo;
+ u_int32_t blob_fid_hi;
} __dbreg_register_args;
extern __DB_IMPORT DB_LOG_RECSPEC __dbreg_register_desc[];
static inline int
__dbreg_register_log(ENV *env, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags,
u_int32_t opcode, const DBT *name, const DBT *uid, int32_t fileid, DBTYPE ftype,
- db_pgno_t meta_pgno, u_int32_t id)
+ db_pgno_t meta_pgno, u_int32_t id, u_int32_t blob_fid_lo, u_int32_t blob_fid_hi)
{
return (__log_put_record(env, NULL, txnp, ret_lsnp,
flags, DB___dbreg_register, 0,
sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
sizeof(u_int32_t) + LOG_DBT_SIZE(name) + LOG_DBT_SIZE(uid) +
sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
- sizeof(u_int32_t),
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t),
__dbreg_register_desc,
- opcode, name, uid, fileid, ftype, meta_pgno, id));
+ opcode, name, uid, fileid, ftype, meta_pgno, id, blob_fid_lo,
+ blob_fid_hi));
}
static inline int __dbreg_register_read(ENV *env,
diff --git a/src/dbinc_auto/dbreg_ext.h b/src/dbinc_auto/dbreg_ext.h
index 0f495c33..421c7989 100644
--- a/src/dbinc_auto/dbreg_ext.h
+++ b/src/dbinc_auto/dbreg_ext.h
@@ -20,9 +20,11 @@ int __dbreg_failchk __P((ENV *));
int __dbreg_log_close __P((ENV *, FNAME *, DB_TXN *, u_int32_t));
int __dbreg_log_id __P((DB *, DB_TXN *, int32_t, int));
int __dbreg_init_recover __P((ENV *, DB_DISTAB *));
+int __dbreg_register_42_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __dbreg_register_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __dbreg_init_print __P((ENV *, DB_DISTAB *));
int __dbreg_register_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __dbreg_register_42_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __dbreg_stat_print __P((ENV *, u_int32_t));
void __dbreg_print_fname __P((ENV *, FNAME *));
int __dbreg_add_dbentry __P((ENV *, DB_LOG *, DB *, int32_t));
@@ -36,8 +38,9 @@ int __dbreg_invalidate_files __P((ENV *, int));
int __dbreg_id_to_db __P((ENV *, DB_TXN *, DB **, int32_t, int));
int __dbreg_id_to_fname __P((DB_LOG *, int32_t, int, FNAME **));
int __dbreg_fid_to_fname __P((DB_LOG *, u_int8_t *, int, FNAME **));
+int __dbreg_blob_file_to_fname __P((DB_LOG *, db_seq_t, int, FNAME **));
int __dbreg_get_name __P((ENV *, u_int8_t *, char **, char **));
-int __dbreg_do_open __P((ENV *, DB_TXN *, DB_LOG *, u_int8_t *, char *, DBTYPE, int32_t, db_pgno_t, void *, u_int32_t, u_int32_t));
+int __dbreg_do_open __P((ENV *, DB_TXN *, DB_LOG *, u_int8_t *, char *, DBTYPE, int32_t, db_pgno_t, void *, u_int32_t, u_int32_t, db_seq_t));
int __dbreg_lazy_id __P((DB *));
#if defined(__cplusplus)
diff --git a/src/dbinc_auto/env_ext.h b/src/dbinc_auto/env_ext.h
index 55dbcba4..7df61ea9 100644
--- a/src/dbinc_auto/env_ext.h
+++ b/src/dbinc_auto/env_ext.h
@@ -36,9 +36,13 @@ void __db_env_destroy __P((DB_ENV *));
int __env_get_alloc __P((DB_ENV *, void *(**)(size_t), void *(**)(void *, size_t), void (**)(void *)));
int __env_set_alloc __P((DB_ENV *, void *(*)(size_t), void *(*)(void *, size_t), void (*)(void *)));
int __env_get_memory_init __P((DB_ENV *, DB_MEM_CONFIG, u_int32_t *));
+int __env_get_blob_threshold_pp __P ((DB_ENV *, u_int32_t *));
+int __env_get_blob_threshold_int __P ((ENV *, u_int32_t *));
+int __env_set_blob_threshold __P((DB_ENV *, u_int32_t, u_int32_t));
int __env_set_memory_init __P((DB_ENV *, DB_MEM_CONFIG, u_int32_t));
int __env_get_memory_max __P((DB_ENV *, u_int32_t *, u_int32_t *));
int __env_set_memory_max __P((DB_ENV *, u_int32_t, u_int32_t));
+int __env_set_blob_dir __P((DB_ENV *, const char *));
int __env_get_encrypt_flags __P((DB_ENV *, u_int32_t *));
int __env_set_encrypt __P((DB_ENV *, const char *, u_int32_t));
void __env_map_flags __P((const FLAG_MAP *, u_int, u_int32_t *, u_int32_t *));
@@ -91,6 +95,7 @@ void __env_panic_set __P((ENV *, int));
int __env_ref_increment __P((ENV *));
int __env_ref_decrement __P((ENV *));
int __env_ref_get __P((DB_ENV *, u_int32_t *));
+int __env_region_cleanup __P((ENV *));
int __env_detach __P((ENV *, int));
int __env_remove_env __P((ENV *));
int __env_region_attach __P((ENV *, REGINFO *, size_t, size_t));
@@ -102,6 +107,7 @@ int __envreg_xunlock __P((ENV *));
int __envreg_isalive __P((DB_ENV *, pid_t, db_threadid_t, u_int32_t));
u_int32_t __env_struct_sig __P((void));
int __env_stat_print_pp __P((DB_ENV *, u_int32_t));
+int __env_print_thread __P((ENV *));
void __db_print_fh __P((ENV *, const char *, DB_FH *, u_int32_t));
void __db_print_fileid __P((ENV *, u_int8_t *, const char *));
void __db_dl __P((ENV *, const char *, u_long));
@@ -119,6 +125,18 @@ int __repmgr_get_ack_policy __P((DB_ENV *, int *));
int __repmgr_set_ack_policy __P((DB_ENV *, int));
#endif
#ifndef HAVE_REPLICATION_THREADS
+int __repmgr_get_incoming_queue_max __P((DB_ENV *, u_int32_t *, u_int32_t *));
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+int __repmgr_set_incoming_queue_max __P((DB_ENV *, u_int32_t, u_int32_t));
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+int __repmgr_get_incoming_queue_redzone __P((DB_ENV *, u_int32_t *, u_int32_t *));
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+int __repmgr_get_incoming_queue_fullevent __P((DB_ENV *, int *));
+#endif
+#ifndef HAVE_REPLICATION_THREADS
int __repmgr_site __P((DB_ENV *, const char *, u_int, DB_SITE **, u_int32_t));
#endif
#ifndef HAVE_REPLICATION_THREADS
@@ -128,10 +146,10 @@ int __repmgr_site_by_eid __P((DB_ENV *, int, DB_SITE **));
int __repmgr_local_site __P((DB_ENV *, DB_SITE **));
#endif
#ifndef HAVE_REPLICATION_THREADS
-int __repmgr_site_list __P((DB_ENV *, u_int *, DB_REPMGR_SITE **));
+int __repmgr_site_list_pp __P((DB_ENV *, u_int *, DB_REPMGR_SITE **));
#endif
#ifndef HAVE_REPLICATION_THREADS
-int __repmgr_start __P((DB_ENV *, int, u_int32_t));
+int __repmgr_start_pp __P((DB_ENV *, int, u_int32_t));
#endif
#ifndef HAVE_REPLICATION_THREADS
int __repmgr_stat_pp __P((DB_ENV *, DB_REPMGR_STAT **, u_int32_t));
diff --git a/src/dbinc_auto/fileops_auto.h b/src/dbinc_auto/fileops_auto.h
index 59385c88..3894c23d 100644
--- a/src/dbinc_auto/fileops_auto.h
+++ b/src/dbinc_auto/fileops_auto.h
@@ -21,6 +21,25 @@ static inline int __fop_create_42_read(ENV *env,
return (__log_read_record(env,
NULL, NULL, data, __fop_create_42_desc, sizeof(__fop_create_42_args), (void**)arg));
}
+#define DB___fop_create_60 143
+typedef struct ___fop_create_60_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ DBT name;
+ DBT dirname;
+ u_int32_t appname;
+ u_int32_t mode;
+} __fop_create_60_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __fop_create_60_desc[];
+static inline int __fop_create_60_read(ENV *env,
+ void *data, __fop_create_60_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __fop_create_60_desc, sizeof(__fop_create_60_args), (void**)arg));
+}
#define DB___fop_create 143
typedef struct ___fop_create_args {
u_int32_t type;
@@ -53,6 +72,24 @@ static inline int __fop_create_read(ENV *env,
return (__log_read_record(env,
NULL, NULL, data, __fop_create_desc, sizeof(__fop_create_args), (void**)arg));
}
+#define DB___fop_remove_60 144
+typedef struct ___fop_remove_60_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ DBT name;
+ DBT fid;
+ u_int32_t appname;
+} __fop_remove_60_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __fop_remove_60_desc[];
+static inline int __fop_remove_60_read(ENV *env,
+ void *data, __fop_remove_60_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __fop_remove_60_desc, sizeof(__fop_remove_60_args), (void**)arg));
+}
#define DB___fop_remove 144
typedef struct ___fop_remove_args {
u_int32_t type;
@@ -105,6 +142,29 @@ static inline int __fop_write_42_read(ENV *env,
return (__log_read_record(env,
NULL, NULL, data, __fop_write_42_desc, sizeof(__fop_write_42_args), (void**)arg));
}
+#define DB___fop_write_60 145
+typedef struct ___fop_write_60_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ DBT name;
+ DBT dirname;
+ u_int32_t appname;
+ u_int32_t pgsize;
+ db_pgno_t pageno;
+ u_int32_t offset;
+ DBT page;
+ u_int32_t flag;
+} __fop_write_60_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __fop_write_60_desc[];
+static inline int __fop_write_60_read(ENV *env,
+ void *data, __fop_write_60_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __fop_write_60_desc, sizeof(__fop_write_60_args), (void**)arg));
+}
#define DB___fop_write 145
typedef struct ___fop_write_args {
u_int32_t type;
@@ -143,6 +203,66 @@ static inline int __fop_write_read(ENV *env,
return (__log_read_record(env,
NULL, NULL, data, __fop_write_desc, sizeof(__fop_write_args), (void**)arg));
}
+#define DB___fop_write_file_60 86
+typedef struct ___fop_write_file_60_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ DBT name;
+ DBT dirname;
+ u_int32_t appname;
+ u_int32_t offset_lo;
+ u_int32_t offset_hi;
+ DBT old_data;
+ DBT new_data;
+ u_int32_t flag;
+} __fop_write_file_60_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __fop_write_file_60_desc[];
+static inline int __fop_write_file_60_read(ENV *env,
+ void *data, __fop_write_file_60_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __fop_write_file_60_desc, sizeof(__fop_write_file_60_args), (void**)arg));
+}
+#define DB___fop_write_file 86
+typedef struct ___fop_write_file_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ DBT name;
+ DBT dirname;
+ u_int32_t appname;
+ u_int64_t offset;
+ DBT old_data;
+ DBT new_data;
+ u_int32_t flag;
+} __fop_write_file_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __fop_write_file_desc[];
+static inline int
+__fop_write_file_log(ENV *env, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags,
+ const DBT *name, const DBT *dirname, u_int32_t appname, u_int64_t offset, const DBT *old_data,
+ const DBT *new_data, u_int32_t flag)
+{
+ return (__log_put_record(env, NULL, txnp, ret_lsnp,
+ flags, DB___fop_write_file, 0,
+ sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
+ LOG_DBT_SIZE(name) + LOG_DBT_SIZE(dirname) + sizeof(u_int32_t) +
+ sizeof(u_int64_t) + LOG_DBT_SIZE(old_data) + LOG_DBT_SIZE(new_data) +
+ sizeof(u_int32_t),
+ __fop_write_file_desc,
+ name, dirname, appname, offset, old_data, new_data, flag));
+}
+
+static inline int __fop_write_file_read(ENV *env,
+ void *data, __fop_write_file_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __fop_write_file_desc, sizeof(__fop_write_file_args), (void**)arg));
+}
#define DB___fop_rename_42 146
#define DB___fop_rename_noundo_46 150
typedef struct ___fop_rename_42_args {
@@ -171,6 +291,35 @@ static inline int __fop_rename_noundo_46_read(ENV *env,
return (__log_read_record(env,
NULL, NULL, data, __fop_rename_noundo_46_desc, sizeof(__fop_rename_42_args), (void**)arg));
}
+#define DB___fop_rename_60 146
+#define DB___fop_rename_noundo_60 150
+typedef struct ___fop_rename_60_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ DBT oldname;
+ DBT newname;
+ DBT dirname;
+ DBT fileid;
+ u_int32_t appname;
+} __fop_rename_60_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __fop_rename_60_desc[];
+static inline int __fop_rename_60_read(ENV *env,
+ void *data, __fop_rename_60_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __fop_rename_60_desc, sizeof(__fop_rename_60_args), (void**)arg));
+}
+extern __DB_IMPORT DB_LOG_RECSPEC __fop_rename_noundo_60_desc[];
+static inline int __fop_rename_noundo_60_read(ENV *env,
+ void *data, __fop_rename_60_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __fop_rename_noundo_60_desc, sizeof(__fop_rename_60_args), (void**)arg));
+}
#define DB___fop_rename 146
#define DB___fop_rename_noundo 150
typedef struct ___fop_rename_args {
@@ -226,6 +375,26 @@ static inline int __fop_rename_noundo_read(ENV *env,
return (__log_read_record(env,
NULL, NULL, data, __fop_rename_noundo_desc, sizeof(__fop_rename_args), (void**)arg));
}
+#define DB___fop_file_remove_60 141
+typedef struct ___fop_file_remove_60_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ DBT real_fid;
+ DBT tmp_fid;
+ DBT name;
+ u_int32_t appname;
+ u_int32_t child;
+} __fop_file_remove_60_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __fop_file_remove_60_desc[];
+static inline int __fop_file_remove_60_read(ENV *env,
+ void *data, __fop_file_remove_60_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ NULL, NULL, data, __fop_file_remove_60_desc, sizeof(__fop_file_remove_60_args), (void**)arg));
+}
#define DB___fop_file_remove 141
typedef struct ___fop_file_remove_args {
u_int32_t type;
diff --git a/src/dbinc_auto/fileops_ext.h b/src/dbinc_auto/fileops_ext.h
index 0aa6c1e1..89306183 100644
--- a/src/dbinc_auto/fileops_ext.h
+++ b/src/dbinc_auto/fileops_ext.h
@@ -8,35 +8,51 @@ extern "C" {
int __fop_init_recover __P((ENV *, DB_DISTAB *));
int __fop_create_42_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_create_60_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __fop_create_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_remove_60_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __fop_remove_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __fop_write_42_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_write_60_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __fop_write_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_write_file_60_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_write_file_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __fop_rename_42_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_rename_60_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __fop_rename_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_file_remove_60_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __fop_file_remove_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __fop_init_print __P((ENV *, DB_DISTAB *));
int __fop_create __P((ENV *, DB_TXN *, DB_FH **, const char *, const char **, APPNAME, int, u_int32_t));
int __fop_remove __P((ENV *, DB_TXN *, u_int8_t *, const char *, const char **, APPNAME, u_int32_t));
int __fop_write __P((ENV *, DB_TXN *, const char *, const char *, APPNAME, DB_FH *, u_int32_t, db_pgno_t, u_int32_t, void *, u_int32_t, u_int32_t, u_int32_t));
+int __fop_write_file __P((ENV *, DB_TXN *, const char *, const char *, APPNAME, DB_FH *, off_t, void *, size_t, u_int32_t));
int __fop_rename __P((ENV *, DB_TXN *, const char *, const char *, const char **, u_int8_t *, APPNAME, int, u_int32_t));
int __fop_create_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_create_60_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __fop_create_42_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __fop_remove_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_remove_60_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __fop_write_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_write_60_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __fop_write_42_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_write_file_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_write_file_60_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __fop_rename_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __fop_rename_noundo_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_rename_60_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_rename_noundo_60_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __fop_rename_42_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __fop_rename_noundo_46_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __fop_file_remove_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_file_remove_60_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __fop_lock_handle __P((ENV *, DB *, DB_LOCKER *, db_lockmode_t, DB_LOCK *, u_int32_t));
int __fop_file_setup __P((DB *, DB_THREAD_INFO *ip, DB_TXN *, const char *, int, u_int32_t, u_int32_t *));
int __fop_subdb_setup __P((DB *, DB_THREAD_INFO *, DB_TXN *, const char *, const char *, int, u_int32_t));
int __fop_remove_setup __P((DB *, DB_TXN *, const char *, u_int32_t));
int __fop_read_meta __P((ENV *, const char *, u_int8_t *, size_t, DB_FH *, int, size_t *));
-int __fop_dummy __P((DB *, DB_TXN *, const char *, const char *));
-int __fop_dbrename __P((DB *, const char *, const char *));
+int __fop_dummy __P((DB *, DB_TXN *, const char *, const char *, APPNAME));
+int __fop_dbrename __P((DB *, const char *, const char *, APPNAME));
#if defined(__cplusplus)
}
diff --git a/src/dbinc_auto/hash_ext.h b/src/dbinc_auto/hash_ext.h
index e83fe817..4d7c2e9c 100644
--- a/src/dbinc_auto/hash_ext.h
+++ b/src/dbinc_auto/hash_ext.h
@@ -57,7 +57,7 @@ int __ham_return_meta __P((DBC *, u_int32_t, DBMETA **));
int __ham_db_create __P((DB *));
int __ham_db_close __P((DB *));
int __ham_get_h_ffactor __P((DB *, u_int32_t *));
-int __ham_set_h_compare __P((DB *, int (*)(DB *, const DBT *, const DBT *)));
+int __ham_set_h_compare __P((DB *, int (*)(DB *, const DBT *, const DBT *, size_t *)));
int __ham_get_h_nelem __P((DB *, u_int32_t *));
void __ham_copy_config __P((DB *, DB*, u_int32_t));
int __ham_open __P((DB *, DB_THREAD_INFO *, DB_TXN *, const char * name, db_pgno_t, u_int32_t));
@@ -116,6 +116,8 @@ int __ham_31_hashmeta __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
int __ham_31_hash __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
int __ham_46_hashmeta __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
int __ham_46_hash __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
+int __ham_60_hashmeta __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
+int __ham_60_hash __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
int __ham_vrfy_meta __P((DB *, VRFY_DBINFO *, HMETA *, db_pgno_t, u_int32_t));
int __ham_vrfy __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, u_int32_t));
int __ham_vrfy_structure __P((DB *, VRFY_DBINFO *, db_pgno_t, u_int32_t));
diff --git a/src/dbinc_auto/heap_auto.h b/src/dbinc_auto/heap_auto.h
index bf288627..f91cacfe 100644
--- a/src/dbinc_auto/heap_auto.h
+++ b/src/dbinc_auto/heap_auto.h
@@ -26,7 +26,7 @@ __heap_addrem_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags,
const DBT *hdr, const DBT *dbt, DB_LSN * pagelsn)
{
return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp,
- flags, DB___heap_addrem, 0,
+ flags, DB___heap_addrem, 1,
sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) +
sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) +
sizeof(u_int32_t) + sizeof(u_int32_t) + LOG_DBT_SIZE(hdr) +
@@ -42,6 +42,52 @@ static inline int __heap_addrem_read(ENV *env,
return (__log_read_record(env,
dbpp, td, data, __heap_addrem_desc, sizeof(__heap_addrem_args), (void**)arg));
}
+#define DB___heap_addrem_60 151
+typedef struct ___heap_addrem_60_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ u_int32_t opcode;
+ int32_t fileid;
+ db_pgno_t pgno;
+ u_int32_t indx;
+ u_int32_t nbytes;
+ DBT hdr;
+ DBT dbt;
+ DB_LSN pagelsn;
+} __heap_addrem_60_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __heap_addrem_60_desc[];
+static inline int __heap_addrem_60_read(ENV *env,
+ DB **dbpp, void *td, void *data, __heap_addrem_60_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __heap_addrem_60_desc, sizeof(__heap_addrem_60_args), (void**)arg));
+}
+#define DB___heap_addrem_50 151
+typedef struct ___heap_addrem_50_args {
+ u_int32_t type;
+ DB_TXN *txnp;
+ DB_LSN prev_lsn;
+ u_int32_t opcode;
+ int32_t fileid;
+ db_pgno_t pgno;
+ u_int32_t indx;
+ u_int32_t nbytes;
+ DBT hdr;
+ DBT dbt;
+ DB_LSN pagelsn;
+} __heap_addrem_50_args;
+
+extern __DB_IMPORT DB_LOG_RECSPEC __heap_addrem_50_desc[];
+static inline int __heap_addrem_50_read(ENV *env,
+ DB **dbpp, void *td, void *data, __heap_addrem_50_args **arg)
+{
+ *arg = NULL;
+ return (__log_read_record(env,
+ dbpp, td, data, __heap_addrem_50_desc, sizeof(__heap_addrem_50_args), (void**)arg));
+}
#define DB___heap_pg_alloc 152
typedef struct ___heap_pg_alloc_args {
u_int32_t type;
diff --git a/src/dbinc_auto/heap_ext.h b/src/dbinc_auto/heap_ext.h
index 8bc24b61..e886d6c9 100644
--- a/src/dbinc_auto/heap_ext.h
+++ b/src/dbinc_auto/heap_ext.h
@@ -15,6 +15,8 @@ int __heapc_gsplit __P((DBC *, DBT *, void **, u_int32_t *));
int __heapc_refresh __P((DBC *));
int __heap_init_recover __P((ENV *, DB_DISTAB *));
int __heap_addrem_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __heap_addrem_60_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __heap_addrem_50_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __heap_pg_alloc_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __heap_trunc_meta_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __heap_trunc_page_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
@@ -39,6 +41,8 @@ int __heap_addrem_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __heap_pg_alloc_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __heap_trunc_meta_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __heap_trunc_page_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __heap_addrem_60_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __heap_addrem_50_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __heap_truncate __P((DBC *, u_int32_t *));
int __heap_stat __P((DBC *, void *, u_int32_t));
int __heap_stat_print __P((DBC *, u_int32_t));
@@ -46,6 +50,8 @@ void __heap_print_cursor __P((DBC *));
int __heap_stat_callback __P((DBC *, PAGE *, void *, int *));
int __heap_traverse __P((DBC *, int (*)(DBC *, PAGE *, void *, int *), void *));
int __db_no_heap_am __P((ENV *));
+int __heap_60_heapmeta __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
+int __heap_60_heap __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
int __heap_vrfy_meta __P((DB *, VRFY_DBINFO *, HEAPMETA *, db_pgno_t, u_int32_t));
int __heap_vrfy __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, u_int32_t));
int __heap_vrfy_structure __P((DB *, VRFY_DBINFO *, u_int32_t));
diff --git a/src/dbinc_auto/int_def.in b/src/dbinc_auto/int_def.in
index dce2831c..5042dfd0 100644
--- a/src/dbinc_auto/int_def.in
+++ b/src/dbinc_auto/int_def.in
@@ -85,13 +85,18 @@
#define __db_init_print __db_init_print@DB_VERSION_UNIQUE_NAME@
#define __db_dbbackup_pp __db_dbbackup_pp@DB_VERSION_UNIQUE_NAME@
#define __db_dbbackup __db_dbbackup@DB_VERSION_UNIQUE_NAME@
-#define __db_backup __db_backup@DB_VERSION_UNIQUE_NAME@
+#define backup_data_copy backup_data_copy@DB_VERSION_UNIQUE_NAME@
+#define __db_backup_pp __db_backup_pp@DB_VERSION_UNIQUE_NAME@
#define __dbc_close __dbc_close@DB_VERSION_UNIQUE_NAME@
#define __dbc_destroy __dbc_destroy@DB_VERSION_UNIQUE_NAME@
#define __dbc_cmp __dbc_cmp@DB_VERSION_UNIQUE_NAME@
#define __dbc_count __dbc_count@DB_VERSION_UNIQUE_NAME@
#define __dbc_del __dbc_del@DB_VERSION_UNIQUE_NAME@
#define __dbc_idel __dbc_idel@DB_VERSION_UNIQUE_NAME@
+#define __dbc_db_stream __dbc_db_stream@DB_VERSION_UNIQUE_NAME@
+#define __dbc_get_blob_id __dbc_get_blob_id@DB_VERSION_UNIQUE_NAME@
+#define __dbc_get_blob_size __dbc_get_blob_size@DB_VERSION_UNIQUE_NAME@
+#define __dbc_set_blob_size __dbc_set_blob_size@DB_VERSION_UNIQUE_NAME@
#ifdef HAVE_COMPRESSION
#define __dbc_bulk_del __dbc_bulk_del@DB_VERSION_UNIQUE_NAME@
#endif
@@ -115,6 +120,7 @@
#ifdef DIAGNOSTIC
#define __db_check_skeyset __db_check_skeyset@DB_VERSION_UNIQUE_NAME@
#endif
+#define __dbc_diags __dbc_diags@DB_VERSION_UNIQUE_NAME@
#define __cdsgroup_begin __cdsgroup_begin@DB_VERSION_UNIQUE_NAME@
#define __cdsgroup_begin_pp __cdsgroup_begin_pp@DB_VERSION_UNIQUE_NAME@
#define __db_compact_int __db_compact_int@DB_VERSION_UNIQUE_NAME@
@@ -207,6 +213,10 @@
#define __db_lput __db_lput@DB_VERSION_UNIQUE_NAME@
#define __db_create_internal __db_create_internal@DB_VERSION_UNIQUE_NAME@
#define __dbh_am_chk __dbh_am_chk@DB_VERSION_UNIQUE_NAME@
+#define __db_get_blob_threshold __db_get_blob_threshold@DB_VERSION_UNIQUE_NAME@
+#define __db_set_blob_threshold __db_set_blob_threshold@DB_VERSION_UNIQUE_NAME@
+#define __db_blobs_enabled __db_blobs_enabled@DB_VERSION_UNIQUE_NAME@
+#define __db_set_dup_compare __db_set_dup_compare@DB_VERSION_UNIQUE_NAME@
#define __db_get_flags __db_get_flags@DB_VERSION_UNIQUE_NAME@
#define __db_set_flags __db_set_flags@DB_VERSION_UNIQUE_NAME@
#define __db_get_lorder __db_get_lorder@DB_VERSION_UNIQUE_NAME@
@@ -219,6 +229,7 @@
#define __db_chk_meta __db_chk_meta@DB_VERSION_UNIQUE_NAME@
#define __db_meta_setup __db_meta_setup@DB_VERSION_UNIQUE_NAME@
#define __db_reopen __db_reopen@DB_VERSION_UNIQUE_NAME@
+#define __db_alloc_dbt __db_alloc_dbt@DB_VERSION_UNIQUE_NAME@
#define __db_goff __db_goff@DB_VERSION_UNIQUE_NAME@
#define __db_poff __db_poff@DB_VERSION_UNIQUE_NAME@
#define __db_ovref __db_ovref@DB_VERSION_UNIQUE_NAME@
@@ -248,6 +259,7 @@
#define __db_prfooter __db_prfooter@DB_VERSION_UNIQUE_NAME@
#define __db_pr_callback __db_pr_callback@DB_VERSION_UNIQUE_NAME@
#define __db_dbtype_to_string __db_dbtype_to_string@DB_VERSION_UNIQUE_NAME@
+#define __db_tohex __db_tohex@DB_VERSION_UNIQUE_NAME@
#define __db_addrem_recover __db_addrem_recover@DB_VERSION_UNIQUE_NAME@
#define __db_addrem_42_recover __db_addrem_42_recover@DB_VERSION_UNIQUE_NAME@
#define __db_big_recover __db_big_recover@DB_VERSION_UNIQUE_NAME@
@@ -285,6 +297,8 @@
#define __db_rename_int __db_rename_int@DB_VERSION_UNIQUE_NAME@
#define __db_ret __db_ret@DB_VERSION_UNIQUE_NAME@
#define __db_retcopy __db_retcopy@DB_VERSION_UNIQUE_NAME@
+#define __db_dbt_clone __db_dbt_clone@DB_VERSION_UNIQUE_NAME@
+#define __db_dbt_clone_free __db_dbt_clone_free@DB_VERSION_UNIQUE_NAME@
#define __env_fileid_reset_pp __env_fileid_reset_pp@DB_VERSION_UNIQUE_NAME@
#define __env_fileid_reset __env_fileid_reset@DB_VERSION_UNIQUE_NAME@
#define __env_lsn_reset_pp __env_lsn_reset_pp@DB_VERSION_UNIQUE_NAME@
@@ -351,6 +365,7 @@
#define __part_key_range __part_key_range@DB_VERSION_UNIQUE_NAME@
#define __part_remove __part_remove@DB_VERSION_UNIQUE_NAME@
#define __part_rename __part_rename@DB_VERSION_UNIQUE_NAME@
+#define __partc_dup __partc_dup@DB_VERSION_UNIQUE_NAME@
#define __part_verify __part_verify@DB_VERSION_UNIQUE_NAME@
#define __part_testdocopy __part_testdocopy@DB_VERSION_UNIQUE_NAME@
#define __db_no_partition __db_no_partition@DB_VERSION_UNIQUE_NAME@
@@ -361,6 +376,34 @@
#define __partition_init __partition_init@DB_VERSION_UNIQUE_NAME@
#define __part_fileid_reset __part_fileid_reset@DB_VERSION_UNIQUE_NAME@
#define __partition_set_dirs __partition_set_dirs@DB_VERSION_UNIQUE_NAME@
+#define __blob_file_create __blob_file_create@DB_VERSION_UNIQUE_NAME@
+#define __blob_file_close __blob_file_close@DB_VERSION_UNIQUE_NAME@
+#define __blob_file_delete __blob_file_delete@DB_VERSION_UNIQUE_NAME@
+#define __blob_file_open __blob_file_open@DB_VERSION_UNIQUE_NAME@
+#define __blob_file_read __blob_file_read@DB_VERSION_UNIQUE_NAME@
+#define __blob_file_write __blob_file_write@DB_VERSION_UNIQUE_NAME@
+#define __blob_bulk __blob_bulk@DB_VERSION_UNIQUE_NAME@
+#define __blob_get __blob_get@DB_VERSION_UNIQUE_NAME@
+#define __blob_put __blob_put@DB_VERSION_UNIQUE_NAME@
+#define __blob_repl __blob_repl@DB_VERSION_UNIQUE_NAME@
+#define __blob_del __blob_del@DB_VERSION_UNIQUE_NAME@
+#define __db_stream_init __db_stream_init@DB_VERSION_UNIQUE_NAME@
+#define __db_stream_close_int __db_stream_close_int@DB_VERSION_UNIQUE_NAME@
+#define __blob_make_sub_dir __blob_make_sub_dir@DB_VERSION_UNIQUE_NAME@
+#define __blob_make_meta_fname __blob_make_meta_fname@DB_VERSION_UNIQUE_NAME@
+#define __blob_get_dir __blob_get_dir@DB_VERSION_UNIQUE_NAME@
+#define __blob_generate_dir_ids __blob_generate_dir_ids@DB_VERSION_UNIQUE_NAME@
+#define __blob_generate_id __blob_generate_id@DB_VERSION_UNIQUE_NAME@
+#define __blob_highest_id __blob_highest_id@DB_VERSION_UNIQUE_NAME@
+#define __blob_calculate_dirs __blob_calculate_dirs@DB_VERSION_UNIQUE_NAME@
+#define __blob_id_to_path __blob_id_to_path@DB_VERSION_UNIQUE_NAME@
+#define __blob_str_to_id __blob_str_to_id@DB_VERSION_UNIQUE_NAME@
+#define __blob_path_to_dir_ids __blob_path_to_dir_ids@DB_VERSION_UNIQUE_NAME@
+#define __blob_salvage __blob_salvage@DB_VERSION_UNIQUE_NAME@
+#define __blob_vrfy __blob_vrfy@DB_VERSION_UNIQUE_NAME@
+#define __blob_del_hierarchy __blob_del_hierarchy@DB_VERSION_UNIQUE_NAME@
+#define __blob_del_all __blob_del_all@DB_VERSION_UNIQUE_NAME@
+#define __blob_copy_all __blob_copy_all@DB_VERSION_UNIQUE_NAME@
#define __bam_compact_int __bam_compact_int@DB_VERSION_UNIQUE_NAME@
#define __bam_compact_opd __bam_compact_opd@DB_VERSION_UNIQUE_NAME@
#define __bam_truncate_ipages __bam_truncate_ipages@DB_VERSION_UNIQUE_NAME@
@@ -470,6 +513,8 @@
#define __bam_30_btreemeta __bam_30_btreemeta@DB_VERSION_UNIQUE_NAME@
#define __bam_31_btreemeta __bam_31_btreemeta@DB_VERSION_UNIQUE_NAME@
#define __bam_31_lbtree __bam_31_lbtree@DB_VERSION_UNIQUE_NAME@
+#define __bam_60_btreemeta __bam_60_btreemeta@DB_VERSION_UNIQUE_NAME@
+#define __bam_60_lbtree __bam_60_lbtree@DB_VERSION_UNIQUE_NAME@
#define __bam_vrfy_meta __bam_vrfy_meta@DB_VERSION_UNIQUE_NAME@
#define __ram_vrfy_leaf __ram_vrfy_leaf@DB_VERSION_UNIQUE_NAME@
#define __bam_vrfy __bam_vrfy@DB_VERSION_UNIQUE_NAME@
@@ -628,6 +673,7 @@
#ifdef DIAGNOSTIC
#define __db_assert __db_assert@DB_VERSION_UNIQUE_NAME@
#endif
+#define __env_panic_event __env_panic_event@DB_VERSION_UNIQUE_NAME@
#define __env_panic_msg __env_panic_msg@DB_VERSION_UNIQUE_NAME@
#define __env_panic __env_panic@DB_VERSION_UNIQUE_NAME@
#define __db_unknown_error __db_unknown_error@DB_VERSION_UNIQUE_NAME@
@@ -639,6 +685,7 @@
#define __db_msgadd __db_msgadd@DB_VERSION_UNIQUE_NAME@
#define __db_msgadd_ap __db_msgadd_ap@DB_VERSION_UNIQUE_NAME@
#define __db_msg __db_msg@DB_VERSION_UNIQUE_NAME@
+#define __db_debug_msg __db_debug_msg@DB_VERSION_UNIQUE_NAME@
#define __db_repmsg __db_repmsg@DB_VERSION_UNIQUE_NAME@
#define __db_unknown_flag __db_unknown_flag@DB_VERSION_UNIQUE_NAME@
#define __db_unknown_type __db_unknown_type@DB_VERSION_UNIQUE_NAME@
@@ -653,6 +700,24 @@
#define __db_rdonly __db_rdonly@DB_VERSION_UNIQUE_NAME@
#define __db_space_err __db_space_err@DB_VERSION_UNIQUE_NAME@
#define __db_failed __db_failed@DB_VERSION_UNIQUE_NAME@
+#define __env_failure_remember __env_failure_remember@DB_VERSION_UNIQUE_NAME@
+#ifdef HAVE_ERROR_HISTORY
+#define __db_thread_init __db_thread_init@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifdef HAVE_ERROR_HISTORY
+#define __db_diags __db_diags@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifdef HAVE_ERROR_HISTORY
+#define __db_deferred_get __db_deferred_get@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifdef HAVE_ERROR_HISTORY
+#define __db_deferred_discard __db_deferred_discard@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifdef HAVE_ERROR_HISTORY
+#define __db_remember_context __db_remember_context@DB_VERSION_UNIQUE_NAME@
+#endif
+#define __db_ctimespec __db_ctimespec@DB_VERSION_UNIQUE_NAME@
+#define __db_fmt_quote __db_fmt_quote@DB_VERSION_UNIQUE_NAME@
#define __db_getlong __db_getlong@DB_VERSION_UNIQUE_NAME@
#define __db_getulong __db_getulong@DB_VERSION_UNIQUE_NAME@
#define __db_idspace __db_idspace@DB_VERSION_UNIQUE_NAME@
@@ -709,11 +774,14 @@
#define __dbreg_failchk __dbreg_failchk@DB_VERSION_UNIQUE_NAME@
#define __dbreg_log_close __dbreg_log_close@DB_VERSION_UNIQUE_NAME@
#define __dbreg_log_id __dbreg_log_id@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_register_42_desc __dbreg_register_42_desc@DB_VERSION_UNIQUE_NAME@
#define __dbreg_register_desc __dbreg_register_desc@DB_VERSION_UNIQUE_NAME@
#define __dbreg_init_recover __dbreg_init_recover@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_register_42_print __dbreg_register_42_print@DB_VERSION_UNIQUE_NAME@
#define __dbreg_register_print __dbreg_register_print@DB_VERSION_UNIQUE_NAME@
#define __dbreg_init_print __dbreg_init_print@DB_VERSION_UNIQUE_NAME@
#define __dbreg_register_recover __dbreg_register_recover@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_register_42_recover __dbreg_register_42_recover@DB_VERSION_UNIQUE_NAME@
#define __dbreg_stat_print __dbreg_stat_print@DB_VERSION_UNIQUE_NAME@
#define __dbreg_print_fname __dbreg_print_fname@DB_VERSION_UNIQUE_NAME@
#define __dbreg_add_dbentry __dbreg_add_dbentry@DB_VERSION_UNIQUE_NAME@
@@ -727,6 +795,7 @@
#define __dbreg_id_to_db __dbreg_id_to_db@DB_VERSION_UNIQUE_NAME@
#define __dbreg_id_to_fname __dbreg_id_to_fname@DB_VERSION_UNIQUE_NAME@
#define __dbreg_fid_to_fname __dbreg_fid_to_fname@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_blob_file_to_fname __dbreg_blob_file_to_fname@DB_VERSION_UNIQUE_NAME@
#define __dbreg_get_name __dbreg_get_name@DB_VERSION_UNIQUE_NAME@
#define __dbreg_do_open __dbreg_do_open@DB_VERSION_UNIQUE_NAME@
#define __dbreg_lazy_id __dbreg_lazy_id@DB_VERSION_UNIQUE_NAME@
@@ -760,9 +829,13 @@
#define __env_get_alloc __env_get_alloc@DB_VERSION_UNIQUE_NAME@
#define __env_set_alloc __env_set_alloc@DB_VERSION_UNIQUE_NAME@
#define __env_get_memory_init __env_get_memory_init@DB_VERSION_UNIQUE_NAME@
+#define __env_get_blob_threshold_pp __env_get_blob_threshold_pp@DB_VERSION_UNIQUE_NAME@
+#define __env_get_blob_threshold_int __env_get_blob_threshold_int@DB_VERSION_UNIQUE_NAME@
+#define __env_set_blob_threshold __env_set_blob_threshold@DB_VERSION_UNIQUE_NAME@
#define __env_set_memory_init __env_set_memory_init@DB_VERSION_UNIQUE_NAME@
#define __env_get_memory_max __env_get_memory_max@DB_VERSION_UNIQUE_NAME@
#define __env_set_memory_max __env_set_memory_max@DB_VERSION_UNIQUE_NAME@
+#define __env_set_blob_dir __env_set_blob_dir@DB_VERSION_UNIQUE_NAME@
#define __env_get_encrypt_flags __env_get_encrypt_flags@DB_VERSION_UNIQUE_NAME@
#define __env_set_encrypt __env_set_encrypt@DB_VERSION_UNIQUE_NAME@
#define __env_map_flags __env_map_flags@DB_VERSION_UNIQUE_NAME@
@@ -815,6 +888,7 @@
#define __env_ref_increment __env_ref_increment@DB_VERSION_UNIQUE_NAME@
#define __env_ref_decrement __env_ref_decrement@DB_VERSION_UNIQUE_NAME@
#define __env_ref_get __env_ref_get@DB_VERSION_UNIQUE_NAME@
+#define __env_region_cleanup __env_region_cleanup@DB_VERSION_UNIQUE_NAME@
#define __env_detach __env_detach@DB_VERSION_UNIQUE_NAME@
#define __env_remove_env __env_remove_env@DB_VERSION_UNIQUE_NAME@
#define __env_region_attach __env_region_attach@DB_VERSION_UNIQUE_NAME@
@@ -826,6 +900,7 @@
#define __envreg_isalive __envreg_isalive@DB_VERSION_UNIQUE_NAME@
#define __env_struct_sig __env_struct_sig@DB_VERSION_UNIQUE_NAME@
#define __env_stat_print_pp __env_stat_print_pp@DB_VERSION_UNIQUE_NAME@
+#define __env_print_thread __env_print_thread@DB_VERSION_UNIQUE_NAME@
#define __db_print_fh __db_print_fh@DB_VERSION_UNIQUE_NAME@
#define __db_print_fileid __db_print_fileid@DB_VERSION_UNIQUE_NAME@
#define __db_dl __db_dl@DB_VERSION_UNIQUE_NAME@
@@ -843,6 +918,18 @@
#define __repmgr_set_ack_policy __repmgr_set_ack_policy@DB_VERSION_UNIQUE_NAME@
#endif
#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_get_incoming_queue_max __repmgr_get_incoming_queue_max@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_set_incoming_queue_max __repmgr_set_incoming_queue_max@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_get_incoming_queue_redzone __repmgr_get_incoming_queue_redzone@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_get_incoming_queue_fullevent __repmgr_get_incoming_queue_fullevent@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
#define __repmgr_site __repmgr_site@DB_VERSION_UNIQUE_NAME@
#endif
#ifndef HAVE_REPLICATION_THREADS
@@ -852,10 +939,10 @@
#define __repmgr_local_site __repmgr_local_site@DB_VERSION_UNIQUE_NAME@
#endif
#ifndef HAVE_REPLICATION_THREADS
-#define __repmgr_site_list __repmgr_site_list@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_site_list_pp __repmgr_site_list_pp@DB_VERSION_UNIQUE_NAME@
#endif
#ifndef HAVE_REPLICATION_THREADS
-#define __repmgr_start __repmgr_start@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_start_pp __repmgr_start_pp@DB_VERSION_UNIQUE_NAME@
#endif
#ifndef HAVE_REPLICATION_THREADS
#define __repmgr_stat_pp __repmgr_stat_pp@DB_VERSION_UNIQUE_NAME@
@@ -876,39 +963,63 @@
#define __repmgr_init_recover __repmgr_init_recover@DB_VERSION_UNIQUE_NAME@
#endif
#define __fop_create_42_desc __fop_create_42_desc@DB_VERSION_UNIQUE_NAME@
+#define __fop_create_60_desc __fop_create_60_desc@DB_VERSION_UNIQUE_NAME@
#define __fop_create_desc __fop_create_desc@DB_VERSION_UNIQUE_NAME@
+#define __fop_remove_60_desc __fop_remove_60_desc@DB_VERSION_UNIQUE_NAME@
#define __fop_remove_desc __fop_remove_desc@DB_VERSION_UNIQUE_NAME@
#define __fop_write_42_desc __fop_write_42_desc@DB_VERSION_UNIQUE_NAME@
+#define __fop_write_60_desc __fop_write_60_desc@DB_VERSION_UNIQUE_NAME@
#define __fop_write_desc __fop_write_desc@DB_VERSION_UNIQUE_NAME@
+#define __fop_write_file_60_desc __fop_write_file_60_desc@DB_VERSION_UNIQUE_NAME@
+#define __fop_write_file_desc __fop_write_file_desc@DB_VERSION_UNIQUE_NAME@
#define __fop_rename_42_desc __fop_rename_42_desc@DB_VERSION_UNIQUE_NAME@
#define __fop_rename_noundo_46_desc __fop_rename_noundo_46_desc@DB_VERSION_UNIQUE_NAME@
+#define __fop_rename_60_desc __fop_rename_60_desc@DB_VERSION_UNIQUE_NAME@
+#define __fop_rename_noundo_60_desc __fop_rename_noundo_60_desc@DB_VERSION_UNIQUE_NAME@
#define __fop_rename_desc __fop_rename_desc@DB_VERSION_UNIQUE_NAME@
#define __fop_rename_noundo_desc __fop_rename_noundo_desc@DB_VERSION_UNIQUE_NAME@
+#define __fop_file_remove_60_desc __fop_file_remove_60_desc@DB_VERSION_UNIQUE_NAME@
#define __fop_file_remove_desc __fop_file_remove_desc@DB_VERSION_UNIQUE_NAME@
#define __fop_init_recover __fop_init_recover@DB_VERSION_UNIQUE_NAME@
#define __fop_create_42_print __fop_create_42_print@DB_VERSION_UNIQUE_NAME@
+#define __fop_create_60_print __fop_create_60_print@DB_VERSION_UNIQUE_NAME@
#define __fop_create_print __fop_create_print@DB_VERSION_UNIQUE_NAME@
+#define __fop_remove_60_print __fop_remove_60_print@DB_VERSION_UNIQUE_NAME@
#define __fop_remove_print __fop_remove_print@DB_VERSION_UNIQUE_NAME@
#define __fop_write_42_print __fop_write_42_print@DB_VERSION_UNIQUE_NAME@
+#define __fop_write_60_print __fop_write_60_print@DB_VERSION_UNIQUE_NAME@
#define __fop_write_print __fop_write_print@DB_VERSION_UNIQUE_NAME@
+#define __fop_write_file_60_print __fop_write_file_60_print@DB_VERSION_UNIQUE_NAME@
+#define __fop_write_file_print __fop_write_file_print@DB_VERSION_UNIQUE_NAME@
#define __fop_rename_42_print __fop_rename_42_print@DB_VERSION_UNIQUE_NAME@
+#define __fop_rename_60_print __fop_rename_60_print@DB_VERSION_UNIQUE_NAME@
#define __fop_rename_print __fop_rename_print@DB_VERSION_UNIQUE_NAME@
+#define __fop_file_remove_60_print __fop_file_remove_60_print@DB_VERSION_UNIQUE_NAME@
#define __fop_file_remove_print __fop_file_remove_print@DB_VERSION_UNIQUE_NAME@
#define __fop_init_print __fop_init_print@DB_VERSION_UNIQUE_NAME@
#define __fop_create __fop_create@DB_VERSION_UNIQUE_NAME@
#define __fop_remove __fop_remove@DB_VERSION_UNIQUE_NAME@
#define __fop_write __fop_write@DB_VERSION_UNIQUE_NAME@
+#define __fop_write_file __fop_write_file@DB_VERSION_UNIQUE_NAME@
#define __fop_rename __fop_rename@DB_VERSION_UNIQUE_NAME@
#define __fop_create_recover __fop_create_recover@DB_VERSION_UNIQUE_NAME@
+#define __fop_create_60_recover __fop_create_60_recover@DB_VERSION_UNIQUE_NAME@
#define __fop_create_42_recover __fop_create_42_recover@DB_VERSION_UNIQUE_NAME@
#define __fop_remove_recover __fop_remove_recover@DB_VERSION_UNIQUE_NAME@
+#define __fop_remove_60_recover __fop_remove_60_recover@DB_VERSION_UNIQUE_NAME@
#define __fop_write_recover __fop_write_recover@DB_VERSION_UNIQUE_NAME@
+#define __fop_write_60_recover __fop_write_60_recover@DB_VERSION_UNIQUE_NAME@
#define __fop_write_42_recover __fop_write_42_recover@DB_VERSION_UNIQUE_NAME@
+#define __fop_write_file_recover __fop_write_file_recover@DB_VERSION_UNIQUE_NAME@
+#define __fop_write_file_60_recover __fop_write_file_60_recover@DB_VERSION_UNIQUE_NAME@
#define __fop_rename_recover __fop_rename_recover@DB_VERSION_UNIQUE_NAME@
#define __fop_rename_noundo_recover __fop_rename_noundo_recover@DB_VERSION_UNIQUE_NAME@
+#define __fop_rename_60_recover __fop_rename_60_recover@DB_VERSION_UNIQUE_NAME@
+#define __fop_rename_noundo_60_recover __fop_rename_noundo_60_recover@DB_VERSION_UNIQUE_NAME@
#define __fop_rename_42_recover __fop_rename_42_recover@DB_VERSION_UNIQUE_NAME@
#define __fop_rename_noundo_46_recover __fop_rename_noundo_46_recover@DB_VERSION_UNIQUE_NAME@
#define __fop_file_remove_recover __fop_file_remove_recover@DB_VERSION_UNIQUE_NAME@
+#define __fop_file_remove_60_recover __fop_file_remove_60_recover@DB_VERSION_UNIQUE_NAME@
#define __fop_lock_handle __fop_lock_handle@DB_VERSION_UNIQUE_NAME@
#define __fop_file_setup __fop_file_setup@DB_VERSION_UNIQUE_NAME@
#define __fop_subdb_setup __fop_subdb_setup@DB_VERSION_UNIQUE_NAME@
@@ -1041,6 +1152,8 @@
#define __ham_31_hash __ham_31_hash@DB_VERSION_UNIQUE_NAME@
#define __ham_46_hashmeta __ham_46_hashmeta@DB_VERSION_UNIQUE_NAME@
#define __ham_46_hash __ham_46_hash@DB_VERSION_UNIQUE_NAME@
+#define __ham_60_hashmeta __ham_60_hashmeta@DB_VERSION_UNIQUE_NAME@
+#define __ham_60_hash __ham_60_hash@DB_VERSION_UNIQUE_NAME@
#define __ham_vrfy_meta __ham_vrfy_meta@DB_VERSION_UNIQUE_NAME@
#define __ham_vrfy __ham_vrfy@DB_VERSION_UNIQUE_NAME@
#define __ham_vrfy_structure __ham_vrfy_structure@DB_VERSION_UNIQUE_NAME@
@@ -1055,11 +1168,15 @@
#define __heapc_gsplit __heapc_gsplit@DB_VERSION_UNIQUE_NAME@
#define __heapc_refresh __heapc_refresh@DB_VERSION_UNIQUE_NAME@
#define __heap_addrem_desc __heap_addrem_desc@DB_VERSION_UNIQUE_NAME@
+#define __heap_addrem_60_desc __heap_addrem_60_desc@DB_VERSION_UNIQUE_NAME@
+#define __heap_addrem_50_desc __heap_addrem_50_desc@DB_VERSION_UNIQUE_NAME@
#define __heap_pg_alloc_desc __heap_pg_alloc_desc@DB_VERSION_UNIQUE_NAME@
#define __heap_trunc_meta_desc __heap_trunc_meta_desc@DB_VERSION_UNIQUE_NAME@
#define __heap_trunc_page_desc __heap_trunc_page_desc@DB_VERSION_UNIQUE_NAME@
#define __heap_init_recover __heap_init_recover@DB_VERSION_UNIQUE_NAME@
#define __heap_addrem_print __heap_addrem_print@DB_VERSION_UNIQUE_NAME@
+#define __heap_addrem_60_print __heap_addrem_60_print@DB_VERSION_UNIQUE_NAME@
+#define __heap_addrem_50_print __heap_addrem_50_print@DB_VERSION_UNIQUE_NAME@
#define __heap_pg_alloc_print __heap_pg_alloc_print@DB_VERSION_UNIQUE_NAME@
#define __heap_trunc_meta_print __heap_trunc_meta_print@DB_VERSION_UNIQUE_NAME@
#define __heap_trunc_page_print __heap_trunc_page_print@DB_VERSION_UNIQUE_NAME@
@@ -1084,6 +1201,8 @@
#define __heap_pg_alloc_recover __heap_pg_alloc_recover@DB_VERSION_UNIQUE_NAME@
#define __heap_trunc_meta_recover __heap_trunc_meta_recover@DB_VERSION_UNIQUE_NAME@
#define __heap_trunc_page_recover __heap_trunc_page_recover@DB_VERSION_UNIQUE_NAME@
+#define __heap_addrem_60_recover __heap_addrem_60_recover@DB_VERSION_UNIQUE_NAME@
+#define __heap_addrem_50_recover __heap_addrem_50_recover@DB_VERSION_UNIQUE_NAME@
#define __heap_truncate __heap_truncate@DB_VERSION_UNIQUE_NAME@
#define __heap_stat __heap_stat@DB_VERSION_UNIQUE_NAME@
#define __heap_stat_print __heap_stat_print@DB_VERSION_UNIQUE_NAME@
@@ -1091,6 +1210,8 @@
#define __heap_stat_callback __heap_stat_callback@DB_VERSION_UNIQUE_NAME@
#define __heap_traverse __heap_traverse@DB_VERSION_UNIQUE_NAME@
#define __db_no_heap_am __db_no_heap_am@DB_VERSION_UNIQUE_NAME@
+#define __heap_60_heapmeta __heap_60_heapmeta@DB_VERSION_UNIQUE_NAME@
+#define __heap_60_heap __heap_60_heap@DB_VERSION_UNIQUE_NAME@
#define __heap_vrfy_meta __heap_vrfy_meta@DB_VERSION_UNIQUE_NAME@
#define __heap_vrfy __heap_vrfy@DB_VERSION_UNIQUE_NAME@
#define __heap_vrfy_structure __heap_vrfy_structure@DB_VERSION_UNIQUE_NAME@
@@ -1129,6 +1250,7 @@
#define __lock_addfamilylocker __lock_addfamilylocker@DB_VERSION_UNIQUE_NAME@
#define __lock_freelocker __lock_freelocker@DB_VERSION_UNIQUE_NAME@
#define __lock_familyremove __lock_familyremove@DB_VERSION_UNIQUE_NAME@
+#define __lock_local_locker_invalidate __lock_local_locker_invalidate@DB_VERSION_UNIQUE_NAME@
#define __lock_fix_list __lock_fix_list@DB_VERSION_UNIQUE_NAME@
#define __lock_get_list __lock_get_list@DB_VERSION_UNIQUE_NAME@
#define __lock_list_print __lock_list_print@DB_VERSION_UNIQUE_NAME@
@@ -1154,6 +1276,7 @@
#define __lock_set_env_timeout __lock_set_env_timeout@DB_VERSION_UNIQUE_NAME@
#define __lock_open __lock_open@DB_VERSION_UNIQUE_NAME@
#define __lock_env_refresh __lock_env_refresh@DB_VERSION_UNIQUE_NAME@
+#define __lock_region_detach __lock_region_detach@DB_VERSION_UNIQUE_NAME@
#define __lock_region_mutex_count __lock_region_mutex_count@DB_VERSION_UNIQUE_NAME@
#define __lock_region_mutex_max __lock_region_mutex_max@DB_VERSION_UNIQUE_NAME@
#define __lock_region_max __lock_region_max@DB_VERSION_UNIQUE_NAME@
@@ -1162,6 +1285,7 @@
#define __lock_stat_print_pp __lock_stat_print_pp@DB_VERSION_UNIQUE_NAME@
#define __lock_stat_print __lock_stat_print@DB_VERSION_UNIQUE_NAME@
#define __lock_printlock __lock_printlock@DB_VERSION_UNIQUE_NAME@
+#define __lock_dump_locker __lock_dump_locker@DB_VERSION_UNIQUE_NAME@
#define __lock_set_timeout __lock_set_timeout@DB_VERSION_UNIQUE_NAME@
#define __lock_set_timeout_internal __lock_set_timeout_internal@DB_VERSION_UNIQUE_NAME@
#define __lock_inherit_timeout __lock_inherit_timeout@DB_VERSION_UNIQUE_NAME@
@@ -1169,6 +1293,7 @@
#define __lock_lhash __lock_lhash@DB_VERSION_UNIQUE_NAME@
#define __lock_nomem __lock_nomem@DB_VERSION_UNIQUE_NAME@
#define __log_open __log_open@DB_VERSION_UNIQUE_NAME@
+#define __log_region_detach __log_region_detach@DB_VERSION_UNIQUE_NAME@
#define __log_find __log_find@DB_VERSION_UNIQUE_NAME@
#define __log_valid __log_valid@DB_VERSION_UNIQUE_NAME@
#define __log_env_refresh __log_env_refresh@DB_VERSION_UNIQUE_NAME@
@@ -1234,6 +1359,7 @@
#define __log_file_pp __log_file_pp@DB_VERSION_UNIQUE_NAME@
#define __log_name __log_name@DB_VERSION_UNIQUE_NAME@
#define __log_rep_put __log_rep_put@DB_VERSION_UNIQUE_NAME@
+#define __log_rep_write __log_rep_write@DB_VERSION_UNIQUE_NAME@
#define __log_put_record_pp __log_put_record_pp@DB_VERSION_UNIQUE_NAME@
#define __log_put_record __log_put_record@DB_VERSION_UNIQUE_NAME@
#define __log_stat_pp __log_stat_pp@DB_VERSION_UNIQUE_NAME@
@@ -1277,6 +1403,7 @@
#define __db_merge_verify __db_merge_verify@DB_VERSION_UNIQUE_NAME@
#define __db_pgno_verify __db_pgno_verify@DB_VERSION_UNIQUE_NAME@
#define __dbreg_register_verify __dbreg_register_verify@DB_VERSION_UNIQUE_NAME@
+#define __dbreg_register_42_verify __dbreg_register_42_verify@DB_VERSION_UNIQUE_NAME@
#define __bam_split_verify __bam_split_verify@DB_VERSION_UNIQUE_NAME@
#define __bam_split_42_verify __bam_split_42_verify@DB_VERSION_UNIQUE_NAME@
#define __bam_rsplit_verify __bam_rsplit_verify@DB_VERSION_UNIQUE_NAME@
@@ -1291,12 +1418,19 @@
#define __bam_relink_43_verify __bam_relink_43_verify@DB_VERSION_UNIQUE_NAME@
#define __bam_merge_44_verify __bam_merge_44_verify@DB_VERSION_UNIQUE_NAME@
#define __fop_create_42_verify __fop_create_42_verify@DB_VERSION_UNIQUE_NAME@
+#define __fop_create_60_verify __fop_create_60_verify@DB_VERSION_UNIQUE_NAME@
#define __fop_create_verify __fop_create_verify@DB_VERSION_UNIQUE_NAME@
+#define __fop_remove_60_verify __fop_remove_60_verify@DB_VERSION_UNIQUE_NAME@
#define __fop_remove_verify __fop_remove_verify@DB_VERSION_UNIQUE_NAME@
#define __fop_write_42_verify __fop_write_42_verify@DB_VERSION_UNIQUE_NAME@
+#define __fop_write_60_verify __fop_write_60_verify@DB_VERSION_UNIQUE_NAME@
#define __fop_write_verify __fop_write_verify@DB_VERSION_UNIQUE_NAME@
+#define __fop_write_file_60_verify __fop_write_file_60_verify@DB_VERSION_UNIQUE_NAME@
+#define __fop_write_file_verify __fop_write_file_verify@DB_VERSION_UNIQUE_NAME@
#define __fop_rename_42_verify __fop_rename_42_verify@DB_VERSION_UNIQUE_NAME@
+#define __fop_rename_60_verify __fop_rename_60_verify@DB_VERSION_UNIQUE_NAME@
#define __fop_rename_verify __fop_rename_verify@DB_VERSION_UNIQUE_NAME@
+#define __fop_file_remove_60_verify __fop_file_remove_60_verify@DB_VERSION_UNIQUE_NAME@
#define __fop_file_remove_verify __fop_file_remove_verify@DB_VERSION_UNIQUE_NAME@
#define __ham_insdel_verify __ham_insdel_verify@DB_VERSION_UNIQUE_NAME@
#define __ham_newpage_verify __ham_newpage_verify@DB_VERSION_UNIQUE_NAME@
@@ -1312,6 +1446,7 @@
#define __ham_curadj_verify __ham_curadj_verify@DB_VERSION_UNIQUE_NAME@
#define __ham_chgpg_verify __ham_chgpg_verify@DB_VERSION_UNIQUE_NAME@
#define __heap_addrem_verify __heap_addrem_verify@DB_VERSION_UNIQUE_NAME@
+#define __heap_addrem_60_verify __heap_addrem_60_verify@DB_VERSION_UNIQUE_NAME@
#define __heap_pg_alloc_verify __heap_pg_alloc_verify@DB_VERSION_UNIQUE_NAME@
#define __heap_trunc_meta_verify __heap_trunc_meta_verify@DB_VERSION_UNIQUE_NAME@
#define __heap_trunc_page_verify __heap_trunc_page_verify@DB_VERSION_UNIQUE_NAME@
@@ -1363,6 +1498,7 @@
#define __del_txn_pages __del_txn_pages@DB_VERSION_UNIQUE_NAME@
#define __is_ancestor_txn __is_ancestor_txn@DB_VERSION_UNIQUE_NAME@
#define __return_txn_pages __return_txn_pages@DB_VERSION_UNIQUE_NAME@
+#define __memp_bh_unreachable __memp_bh_unreachable@DB_VERSION_UNIQUE_NAME@
#define __memp_alloc __memp_alloc@DB_VERSION_UNIQUE_NAME@
#define __memp_free __memp_free@DB_VERSION_UNIQUE_NAME@
#define __memp_backup_open __memp_backup_open@DB_VERSION_UNIQUE_NAME@
@@ -1375,6 +1511,7 @@
#define __memp_bhfree __memp_bhfree@DB_VERSION_UNIQUE_NAME@
#define __memp_fget_pp __memp_fget_pp@DB_VERSION_UNIQUE_NAME@
#define __memp_fget __memp_fget@DB_VERSION_UNIQUE_NAME@
+#define __memp_find_obsolete_version __memp_find_obsolete_version@DB_VERSION_UNIQUE_NAME@
#define __memp_fcreate_pp __memp_fcreate_pp@DB_VERSION_UNIQUE_NAME@
#define __memp_fcreate __memp_fcreate@DB_VERSION_UNIQUE_NAME@
#define __memp_set_clear_len __memp_set_clear_len@DB_VERSION_UNIQUE_NAME@
@@ -1385,6 +1522,7 @@
#define __memp_get_ftype __memp_get_ftype@DB_VERSION_UNIQUE_NAME@
#define __memp_set_ftype __memp_set_ftype@DB_VERSION_UNIQUE_NAME@
#define __memp_set_lsn_offset __memp_set_lsn_offset@DB_VERSION_UNIQUE_NAME@
+#define __memp_set_maxpgno __memp_set_maxpgno@DB_VERSION_UNIQUE_NAME@
#define __memp_get_pgcookie __memp_get_pgcookie@DB_VERSION_UNIQUE_NAME@
#define __memp_set_pgcookie __memp_set_pgcookie@DB_VERSION_UNIQUE_NAME@
#define __memp_get_priority __memp_get_priority@DB_VERSION_UNIQUE_NAME@
@@ -1432,10 +1570,12 @@
#define __memp_bh_freeze __memp_bh_freeze@DB_VERSION_UNIQUE_NAME@
#define __memp_bh_thaw __memp_bh_thaw@DB_VERSION_UNIQUE_NAME@
#define __memp_open __memp_open@DB_VERSION_UNIQUE_NAME@
+#define __memp_region_detach __memp_region_detach@DB_VERSION_UNIQUE_NAME@
#define __memp_init __memp_init@DB_VERSION_UNIQUE_NAME@
#define __memp_max_regions __memp_max_regions@DB_VERSION_UNIQUE_NAME@
#define __memp_region_mutex_count __memp_region_mutex_count@DB_VERSION_UNIQUE_NAME@
#define __memp_env_refresh __memp_env_refresh@DB_VERSION_UNIQUE_NAME@
+#define __memp_region_bhfree __memp_region_bhfree@DB_VERSION_UNIQUE_NAME@
#define __memp_register_pp __memp_register_pp@DB_VERSION_UNIQUE_NAME@
#define __memp_register __memp_register@DB_VERSION_UNIQUE_NAME@
#define __memp_get_bucket __memp_get_bucket@DB_VERSION_UNIQUE_NAME@
@@ -1460,13 +1600,13 @@
#define __mutex_alloc_int __mutex_alloc_int@DB_VERSION_UNIQUE_NAME@
#define __mutex_free __mutex_free@DB_VERSION_UNIQUE_NAME@
#define __mutex_free_int __mutex_free_int@DB_VERSION_UNIQUE_NAME@
+#define __mutex_died __mutex_died@DB_VERSION_UNIQUE_NAME@
#define __mutex_refresh __mutex_refresh@DB_VERSION_UNIQUE_NAME@
-#define __mut_failchk __mut_failchk@DB_VERSION_UNIQUE_NAME@
-#define __db_fcntl_mutex_init __db_fcntl_mutex_init@DB_VERSION_UNIQUE_NAME@
-#define __db_fcntl_mutex_lock __db_fcntl_mutex_lock@DB_VERSION_UNIQUE_NAME@
-#define __db_fcntl_mutex_trylock __db_fcntl_mutex_trylock@DB_VERSION_UNIQUE_NAME@
-#define __db_fcntl_mutex_unlock __db_fcntl_mutex_unlock@DB_VERSION_UNIQUE_NAME@
-#define __db_fcntl_mutex_destroy __db_fcntl_mutex_destroy@DB_VERSION_UNIQUE_NAME@
+#define __mutex_record_lock __mutex_record_lock@DB_VERSION_UNIQUE_NAME@
+#define __mutex_record_unlock __mutex_record_unlock@DB_VERSION_UNIQUE_NAME@
+#define __mutex_record_print __mutex_record_print@DB_VERSION_UNIQUE_NAME@
+#define __mutex_failchk __mutex_failchk@DB_VERSION_UNIQUE_NAME@
+#define __mutex_failchk_thread __mutex_failchk_thread@DB_VERSION_UNIQUE_NAME@
#define __mutex_alloc_pp __mutex_alloc_pp@DB_VERSION_UNIQUE_NAME@
#define __mutex_free_pp __mutex_free_pp@DB_VERSION_UNIQUE_NAME@
#define __mutex_lock_pp __mutex_lock_pp@DB_VERSION_UNIQUE_NAME@
@@ -1481,6 +1621,9 @@
#define __mutex_set_max __mutex_set_max@DB_VERSION_UNIQUE_NAME@
#define __mutex_get_tas_spins __mutex_get_tas_spins@DB_VERSION_UNIQUE_NAME@
#define __mutex_set_tas_spins __mutex_set_tas_spins@DB_VERSION_UNIQUE_NAME@
+#ifdef HAVE_ERROR_HISTORY
+#define __mutex_diags __mutex_diags@DB_VERSION_UNIQUE_NAME@
+#endif
#if !defined(HAVE_ATOMIC_SUPPORT) && defined(HAVE_MUTEX_SUPPORT)
#define __atomic_inc __atomic_inc@DB_VERSION_UNIQUE_NAME@
#endif
@@ -1503,6 +1646,7 @@
#define __db_pthread_mutex_unlock __db_pthread_mutex_unlock@DB_VERSION_UNIQUE_NAME@
#define __db_pthread_mutex_destroy __db_pthread_mutex_destroy@DB_VERSION_UNIQUE_NAME@
#define __mutex_open __mutex_open@DB_VERSION_UNIQUE_NAME@
+#define __mutex_region_detach __mutex_region_detach@DB_VERSION_UNIQUE_NAME@
#define __mutex_env_refresh __mutex_env_refresh@DB_VERSION_UNIQUE_NAME@
#define __mutex_resource_return __mutex_resource_return@DB_VERSION_UNIQUE_NAME@
#define __mutex_stat_pp __mutex_stat_pp@DB_VERSION_UNIQUE_NAME@
@@ -1512,6 +1656,7 @@
#define __mutex_print_debug_stats __mutex_print_debug_stats@DB_VERSION_UNIQUE_NAME@
#define __mutex_set_wait_info __mutex_set_wait_info@DB_VERSION_UNIQUE_NAME@
#define __mutex_clear __mutex_clear@DB_VERSION_UNIQUE_NAME@
+#define __mutex_describe __mutex_describe@DB_VERSION_UNIQUE_NAME@
#define __db_tas_mutex_init __db_tas_mutex_init@DB_VERSION_UNIQUE_NAME@
#define __db_tas_mutex_lock __db_tas_mutex_lock@DB_VERSION_UNIQUE_NAME@
#define __db_tas_mutex_trylock __db_tas_mutex_trylock@DB_VERSION_UNIQUE_NAME@
@@ -1582,6 +1727,7 @@
#define __os_concat_path __os_concat_path@DB_VERSION_UNIQUE_NAME@
#define __os_id __os_id@DB_VERSION_UNIQUE_NAME@
#define __os_rename __os_rename@DB_VERSION_UNIQUE_NAME@
+#define __os_rmdir __os_rmdir@DB_VERSION_UNIQUE_NAME@
#define __os_isroot __os_isroot@DB_VERSION_UNIQUE_NAME@
#define __db_rpath __db_rpath@DB_VERSION_UNIQUE_NAME@
#define __os_io __os_io@DB_VERSION_UNIQUE_NAME@
@@ -1590,16 +1736,37 @@
#define __os_physwrite __os_physwrite@DB_VERSION_UNIQUE_NAME@
#define __os_seek __os_seek@DB_VERSION_UNIQUE_NAME@
#define __os_stack __os_stack@DB_VERSION_UNIQUE_NAME@
+#define __os_stack_top __os_stack_top@DB_VERSION_UNIQUE_NAME@
+#define __os_stack_text __os_stack_text@DB_VERSION_UNIQUE_NAME@
+#define __os_stack_save __os_stack_save@DB_VERSION_UNIQUE_NAME@
+#define __os_stack_msgadd __os_stack_msgadd@DB_VERSION_UNIQUE_NAME@
#define __os_exists __os_exists@DB_VERSION_UNIQUE_NAME@
#define __os_ioinfo __os_ioinfo@DB_VERSION_UNIQUE_NAME@
#define __os_tmpdir __os_tmpdir@DB_VERSION_UNIQUE_NAME@
#define __os_truncate __os_truncate@DB_VERSION_UNIQUE_NAME@
#define __os_unique_id __os_unique_id@DB_VERSION_UNIQUE_NAME@
+#define __os_srandom __os_srandom@DB_VERSION_UNIQUE_NAME@
+#define __os_random __os_random@DB_VERSION_UNIQUE_NAME@
#define __os_unlink __os_unlink@DB_VERSION_UNIQUE_NAME@
#define __os_yield __os_yield@DB_VERSION_UNIQUE_NAME@
#ifdef HAVE_QNX
#define __os_qnx_region_open __os_qnx_region_open@DB_VERSION_UNIQUE_NAME@
#endif
+#ifdef DB_WINCE
+#define __ce_freopen __ce_freopen@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifdef DB_WINCE
+#define __ce_gmtime __ce_gmtime@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifdef DB_WINCE
+#define localtime localtime@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifdef DB_WINCE
+#define __ce_mktime __ce_mktime@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifdef DB_WINCE
+#define __ce_remove __ce_remove@DB_VERSION_UNIQUE_NAME@
+#endif
#define __os_is_winnt __os_is_winnt@DB_VERSION_UNIQUE_NAME@
#define __os_cpu_count __os_cpu_count@DB_VERSION_UNIQUE_NAME@
#ifdef HAVE_REPLICATION_THREADS
@@ -1673,6 +1840,8 @@
#define __rep_egen_unmarshal __rep_egen_unmarshal@DB_VERSION_UNIQUE_NAME@
#define __rep_fileinfo_marshal __rep_fileinfo_marshal@DB_VERSION_UNIQUE_NAME@
#define __rep_fileinfo_unmarshal __rep_fileinfo_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_fileinfo_v7_marshal __rep_fileinfo_v7_marshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_fileinfo_v7_unmarshal __rep_fileinfo_v7_unmarshal@DB_VERSION_UNIQUE_NAME@
#define __rep_fileinfo_v6_marshal __rep_fileinfo_v6_marshal@DB_VERSION_UNIQUE_NAME@
#define __rep_fileinfo_v6_unmarshal __rep_fileinfo_v6_unmarshal@DB_VERSION_UNIQUE_NAME@
#define __rep_grant_info_marshal __rep_grant_info_marshal@DB_VERSION_UNIQUE_NAME@
@@ -1691,13 +1860,29 @@
#define __rep_lsn_hist_key_unmarshal __rep_lsn_hist_key_unmarshal@DB_VERSION_UNIQUE_NAME@
#define __rep_lsn_hist_data_marshal __rep_lsn_hist_data_marshal@DB_VERSION_UNIQUE_NAME@
#define __rep_lsn_hist_data_unmarshal __rep_lsn_hist_data_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_blob_update_req_marshal __rep_blob_update_req_marshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_blob_update_req_unmarshal __rep_blob_update_req_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_blob_update_marshal __rep_blob_update_marshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_blob_update_unmarshal __rep_blob_update_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_blob_file_marshal __rep_blob_file_marshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_blob_file_unmarshal __rep_blob_file_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_blob_chunk_marshal __rep_blob_chunk_marshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_blob_chunk_unmarshal __rep_blob_chunk_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_blob_chunk_req_marshal __rep_blob_chunk_req_marshal@DB_VERSION_UNIQUE_NAME@
+#define __rep_blob_chunk_req_unmarshal __rep_blob_chunk_req_unmarshal@DB_VERSION_UNIQUE_NAME@
#define __rep_update_req __rep_update_req@DB_VERSION_UNIQUE_NAME@
+#define __rep_blob_update_req __rep_blob_update_req@DB_VERSION_UNIQUE_NAME@
#define __rep_page_req __rep_page_req@DB_VERSION_UNIQUE_NAME@
#define __rep_update_setup __rep_update_setup@DB_VERSION_UNIQUE_NAME@
+#define __rep_blob_update __rep_blob_update@DB_VERSION_UNIQUE_NAME@
+#define __rep_blob_allreq __rep_blob_allreq@DB_VERSION_UNIQUE_NAME@
#define __rep_bulk_page __rep_bulk_page@DB_VERSION_UNIQUE_NAME@
#define __rep_page __rep_page@DB_VERSION_UNIQUE_NAME@
+#define __rep_blob_chunk __rep_blob_chunk@DB_VERSION_UNIQUE_NAME@
#define __rep_init_cleanup __rep_init_cleanup@DB_VERSION_UNIQUE_NAME@
+#define __rep_blob_chunk_req __rep_blob_chunk_req@DB_VERSION_UNIQUE_NAME@
#define __rep_pggap_req __rep_pggap_req@DB_VERSION_UNIQUE_NAME@
+#define __rep_blob_rereq __rep_blob_rereq@DB_VERSION_UNIQUE_NAME@
#define __rep_finfo_alloc __rep_finfo_alloc@DB_VERSION_UNIQUE_NAME@
#define __rep_remove_init_file __rep_remove_init_file@DB_VERSION_UNIQUE_NAME@
#define __rep_reset_init __rep_reset_init@DB_VERSION_UNIQUE_NAME@
@@ -1727,24 +1912,32 @@
#define __rep_start_int __rep_start_int@DB_VERSION_UNIQUE_NAME@
#define __rep_open_sysdb __rep_open_sysdb@DB_VERSION_UNIQUE_NAME@
#define __rep_client_dbinit __rep_client_dbinit@DB_VERSION_UNIQUE_NAME@
+#define __rep_blob_cmp __rep_blob_cmp@DB_VERSION_UNIQUE_NAME@
+#define __rep_offset_cmp __rep_offset_cmp@DB_VERSION_UNIQUE_NAME@
#define __rep_get_limit __rep_get_limit@DB_VERSION_UNIQUE_NAME@
#define __rep_set_limit __rep_set_limit@DB_VERSION_UNIQUE_NAME@
#define __rep_set_nsites_pp __rep_set_nsites_pp@DB_VERSION_UNIQUE_NAME@
#define __rep_set_nsites_int __rep_set_nsites_int@DB_VERSION_UNIQUE_NAME@
#define __rep_get_nsites __rep_get_nsites@DB_VERSION_UNIQUE_NAME@
-#define __rep_set_priority __rep_set_priority@DB_VERSION_UNIQUE_NAME@
+#define __rep_set_priority_pp __rep_set_priority_pp@DB_VERSION_UNIQUE_NAME@
+#define __rep_set_priority_int __rep_set_priority_int@DB_VERSION_UNIQUE_NAME@
#define __rep_get_priority __rep_get_priority@DB_VERSION_UNIQUE_NAME@
-#define __rep_set_timeout __rep_set_timeout@DB_VERSION_UNIQUE_NAME@
+#define __rep_set_timeout_pp __rep_set_timeout_pp@DB_VERSION_UNIQUE_NAME@
+#define __rep_set_timeout_int __rep_set_timeout_int@DB_VERSION_UNIQUE_NAME@
#define __rep_get_timeout __rep_get_timeout@DB_VERSION_UNIQUE_NAME@
#define __rep_get_request __rep_get_request@DB_VERSION_UNIQUE_NAME@
#define __rep_set_request __rep_set_request@DB_VERSION_UNIQUE_NAME@
+#define __rep_set_view __rep_set_view@DB_VERSION_UNIQUE_NAME@
+#define __rep_call_partial __rep_call_partial@DB_VERSION_UNIQUE_NAME@
#define __rep_set_transport_pp __rep_set_transport_pp@DB_VERSION_UNIQUE_NAME@
#define __rep_set_transport_int __rep_set_transport_int@DB_VERSION_UNIQUE_NAME@
#define __rep_get_clockskew __rep_get_clockskew@DB_VERSION_UNIQUE_NAME@
#define __rep_set_clockskew __rep_set_clockskew@DB_VERSION_UNIQUE_NAME@
-#define __rep_flush __rep_flush@DB_VERSION_UNIQUE_NAME@
+#define __rep_flush_pp __rep_flush_pp@DB_VERSION_UNIQUE_NAME@
+#define __rep_flush_int __rep_flush_int@DB_VERSION_UNIQUE_NAME@
#define __rep_sync __rep_sync@DB_VERSION_UNIQUE_NAME@
#define __rep_txn_applied __rep_txn_applied@DB_VERSION_UNIQUE_NAME@
+#define __rep_read_lsn_history __rep_read_lsn_history@DB_VERSION_UNIQUE_NAME@
#define __rep_process_message_pp __rep_process_message_pp@DB_VERSION_UNIQUE_NAME@
#define __rep_process_message_int __rep_process_message_int@DB_VERSION_UNIQUE_NAME@
#define __rep_apply __rep_apply@DB_VERSION_UNIQUE_NAME@
@@ -1760,6 +1953,7 @@
#define __rep_closefiles __rep_closefiles@DB_VERSION_UNIQUE_NAME@
#define __rep_write_egen __rep_write_egen@DB_VERSION_UNIQUE_NAME@
#define __rep_write_gen __rep_write_gen@DB_VERSION_UNIQUE_NAME@
+#define __rep_check_view __rep_check_view@DB_VERSION_UNIQUE_NAME@
#define __rep_stat_pp __rep_stat_pp@DB_VERSION_UNIQUE_NAME@
#define __rep_stat_print_pp __rep_stat_print_pp@DB_VERSION_UNIQUE_NAME@
#define __rep_stat_print __rep_stat_print@DB_VERSION_UNIQUE_NAME@
@@ -1798,6 +1992,8 @@
#define __rep_get_maxpermlsn __rep_get_maxpermlsn@DB_VERSION_UNIQUE_NAME@
#define __rep_is_internal_rep_file __rep_is_internal_rep_file@DB_VERSION_UNIQUE_NAME@
#define __rep_get_datagen __rep_get_datagen@DB_VERSION_UNIQUE_NAME@
+#define __rep_become_readonly_master __rep_become_readonly_master@DB_VERSION_UNIQUE_NAME@
+#define __rep_get_lsnhist_data __rep_get_lsnhist_data@DB_VERSION_UNIQUE_NAME@
#define __rep_verify __rep_verify@DB_VERSION_UNIQUE_NAME@
#define __rep_verify_fail __rep_verify_fail@DB_VERSION_UNIQUE_NAME@
#define __rep_verify_req __rep_verify_req@DB_VERSION_UNIQUE_NAME@
@@ -1827,6 +2023,8 @@
#define __repmgr_membership_key_unmarshal __repmgr_membership_key_unmarshal@DB_VERSION_UNIQUE_NAME@
#define __repmgr_membership_data_marshal __repmgr_membership_data_marshal@DB_VERSION_UNIQUE_NAME@
#define __repmgr_membership_data_unmarshal __repmgr_membership_data_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_v4membership_data_marshal __repmgr_v4membership_data_marshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_v4membership_data_unmarshal __repmgr_v4membership_data_unmarshal@DB_VERSION_UNIQUE_NAME@
#define __repmgr_member_metadata_marshal __repmgr_member_metadata_marshal@DB_VERSION_UNIQUE_NAME@
#define __repmgr_member_metadata_unmarshal __repmgr_member_metadata_unmarshal@DB_VERSION_UNIQUE_NAME@
#define __repmgr_gm_fwd_marshal __repmgr_gm_fwd_marshal@DB_VERSION_UNIQUE_NAME@
@@ -1835,21 +2033,34 @@
#define __repmgr_membr_vers_unmarshal __repmgr_membr_vers_unmarshal@DB_VERSION_UNIQUE_NAME@
#define __repmgr_site_info_marshal __repmgr_site_info_marshal@DB_VERSION_UNIQUE_NAME@
#define __repmgr_site_info_unmarshal __repmgr_site_info_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_v4site_info_marshal __repmgr_v4site_info_marshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_v4site_info_unmarshal __repmgr_v4site_info_unmarshal@DB_VERSION_UNIQUE_NAME@
#define __repmgr_connect_reject_marshal __repmgr_connect_reject_marshal@DB_VERSION_UNIQUE_NAME@
#define __repmgr_connect_reject_unmarshal __repmgr_connect_reject_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_v4connect_reject_marshal __repmgr_v4connect_reject_marshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_v4connect_reject_unmarshal __repmgr_v4connect_reject_unmarshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_lsnhist_match_marshal __repmgr_lsnhist_match_marshal@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_lsnhist_match_unmarshal __repmgr_lsnhist_match_unmarshal@DB_VERSION_UNIQUE_NAME@
#define __repmgr_member_print __repmgr_member_print@DB_VERSION_UNIQUE_NAME@
#define __repmgr_init_print __repmgr_init_print@DB_VERSION_UNIQUE_NAME@
#define __repmgr_init_election __repmgr_init_election@DB_VERSION_UNIQUE_NAME@
#define __repmgr_claim_victory __repmgr_claim_victory@DB_VERSION_UNIQUE_NAME@
#define __repmgr_turn_on_elections __repmgr_turn_on_elections@DB_VERSION_UNIQUE_NAME@
-#define __repmgr_start __repmgr_start@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_start_pp __repmgr_start_pp@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_start_int __repmgr_start_int@DB_VERSION_UNIQUE_NAME@
#define __repmgr_valid_config __repmgr_valid_config@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_prefmas_auto_config __repmgr_prefmas_auto_config@DB_VERSION_UNIQUE_NAME@
#define __repmgr_autostart __repmgr_autostart@DB_VERSION_UNIQUE_NAME@
#define __repmgr_start_selector __repmgr_start_selector@DB_VERSION_UNIQUE_NAME@
#define __repmgr_close __repmgr_close@DB_VERSION_UNIQUE_NAME@
#define __repmgr_stop __repmgr_stop@DB_VERSION_UNIQUE_NAME@
#define __repmgr_set_ack_policy __repmgr_set_ack_policy@DB_VERSION_UNIQUE_NAME@
#define __repmgr_get_ack_policy __repmgr_get_ack_policy@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_set_incoming_queue_max __repmgr_set_incoming_queue_max@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_get_incoming_queue_max __repmgr_get_incoming_queue_max@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_set_incoming_queue_redzone __repmgr_set_incoming_queue_redzone@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_get_incoming_queue_redzone __repmgr_get_incoming_queue_redzone@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_get_incoming_queue_fullevent __repmgr_get_incoming_queue_fullevent@DB_VERSION_UNIQUE_NAME@
#define __repmgr_env_create __repmgr_env_create@DB_VERSION_UNIQUE_NAME@
#define __repmgr_env_destroy __repmgr_env_destroy@DB_VERSION_UNIQUE_NAME@
#define __repmgr_stop_threads __repmgr_stop_threads@DB_VERSION_UNIQUE_NAME@
@@ -1870,7 +2081,8 @@
#define __repmgr_get_site_address __repmgr_get_site_address@DB_VERSION_UNIQUE_NAME@
#define __repmgr_get_eid __repmgr_get_eid@DB_VERSION_UNIQUE_NAME@
#define __repmgr_get_config __repmgr_get_config@DB_VERSION_UNIQUE_NAME@
-#define __repmgr_site_config __repmgr_site_config@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_site_config_pp __repmgr_site_config_pp@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_site_config_int __repmgr_site_config_int@DB_VERSION_UNIQUE_NAME@
#define __repmgr_site_close __repmgr_site_close@DB_VERSION_UNIQUE_NAME@
#define __repmgr_msg_thread __repmgr_msg_thread@DB_VERSION_UNIQUE_NAME@
#define __repmgr_send_err_resp __repmgr_send_err_resp@DB_VERSION_UNIQUE_NAME@
@@ -1930,7 +2142,6 @@
#define __repmgr_queue_destroy __repmgr_queue_destroy@DB_VERSION_UNIQUE_NAME@
#define __repmgr_queue_get __repmgr_queue_get@DB_VERSION_UNIQUE_NAME@
#define __repmgr_queue_put __repmgr_queue_put@DB_VERSION_UNIQUE_NAME@
-#define __repmgr_queue_size __repmgr_queue_size@DB_VERSION_UNIQUE_NAME@
#define __repmgr_member_recover __repmgr_member_recover@DB_VERSION_UNIQUE_NAME@
#define __repmgr_select_thread __repmgr_select_thread@DB_VERSION_UNIQUE_NAME@
#define __repmgr_bow_out __repmgr_bow_out@DB_VERSION_UNIQUE_NAME@
@@ -1938,6 +2149,7 @@
#define __repmgr_compute_timeout __repmgr_compute_timeout@DB_VERSION_UNIQUE_NAME@
#define __repmgr_connected_master __repmgr_connected_master@DB_VERSION_UNIQUE_NAME@
#define __repmgr_check_timeouts __repmgr_check_timeouts@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_refresh_selector __repmgr_refresh_selector@DB_VERSION_UNIQUE_NAME@
#define __repmgr_first_try_connections __repmgr_first_try_connections@DB_VERSION_UNIQUE_NAME@
#define __repmgr_send_v1_handshake __repmgr_send_v1_handshake@DB_VERSION_UNIQUE_NAME@
#define __repmgr_read_from_site __repmgr_read_from_site@DB_VERSION_UNIQUE_NAME@
@@ -1949,7 +2161,8 @@
#define __repmgr_stat_pp __repmgr_stat_pp@DB_VERSION_UNIQUE_NAME@
#define __repmgr_stat_print_pp __repmgr_stat_print_pp@DB_VERSION_UNIQUE_NAME@
#define __repmgr_stat_print __repmgr_stat_print@DB_VERSION_UNIQUE_NAME@
-#define __repmgr_site_list __repmgr_site_list@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_site_list_pp __repmgr_site_list_pp@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_site_list_int __repmgr_site_list_int@DB_VERSION_UNIQUE_NAME@
#ifndef HAVE_REPLICATION_THREADS
#define __repmgr_close __repmgr_close@DB_VERSION_UNIQUE_NAME@
#endif
@@ -1960,6 +2173,18 @@
#define __repmgr_set_ack_policy __repmgr_set_ack_policy@DB_VERSION_UNIQUE_NAME@
#endif
#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_get_incoming_queue_max __repmgr_get_incoming_queue_max@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_set_incoming_queue_max __repmgr_set_incoming_queue_max@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_get_incoming_queue_redzone __repmgr_get_incoming_queue_redzone@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+#define __repmgr_get_incoming_queue_fullevent __repmgr_get_incoming_queue_fullevent@DB_VERSION_UNIQUE_NAME@
+#endif
+#ifndef HAVE_REPLICATION_THREADS
#define __repmgr_site __repmgr_site@DB_VERSION_UNIQUE_NAME@
#endif
#ifndef HAVE_REPLICATION_THREADS
@@ -1969,10 +2194,10 @@
#define __repmgr_local_site __repmgr_local_site@DB_VERSION_UNIQUE_NAME@
#endif
#ifndef HAVE_REPLICATION_THREADS
-#define __repmgr_site_list __repmgr_site_list@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_site_list_pp __repmgr_site_list_pp@DB_VERSION_UNIQUE_NAME@
#endif
#ifndef HAVE_REPLICATION_THREADS
-#define __repmgr_start __repmgr_start@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_start_pp __repmgr_start_pp@DB_VERSION_UNIQUE_NAME@
#endif
#ifndef HAVE_REPLICATION_THREADS
#define __repmgr_stat_pp __repmgr_stat_pp@DB_VERSION_UNIQUE_NAME@
@@ -2023,7 +2248,14 @@
#define __repmgr_failchk __repmgr_failchk@DB_VERSION_UNIQUE_NAME@
#define __repmgr_master_is_known __repmgr_master_is_known@DB_VERSION_UNIQUE_NAME@
#define __repmgr_stable_lsn __repmgr_stable_lsn@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_make_request_conn __repmgr_make_request_conn@DB_VERSION_UNIQUE_NAME@
#define __repmgr_send_sync_msg __repmgr_send_sync_msg@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_read_own_msg __repmgr_read_own_msg@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_prefmas_connected __repmgr_prefmas_connected@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_restart_site_as_client __repmgr_restart_site_as_client@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_make_site_readonly_master __repmgr_make_site_readonly_master@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_lsnhist_match __repmgr_lsnhist_match@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_prefmas_get_wait __repmgr_prefmas_get_wait@DB_VERSION_UNIQUE_NAME@
#define __repmgr_marshal_member_list __repmgr_marshal_member_list@DB_VERSION_UNIQUE_NAME@
#define __repmgr_refresh_membership __repmgr_refresh_membership@DB_VERSION_UNIQUE_NAME@
#define __repmgr_reload_gmdb __repmgr_reload_gmdb@DB_VERSION_UNIQUE_NAME@
@@ -2040,10 +2272,15 @@
#define __repmgr_bcast_parm_refresh __repmgr_bcast_parm_refresh@DB_VERSION_UNIQUE_NAME@
#define __repmgr_chg_prio __repmgr_chg_prio@DB_VERSION_UNIQUE_NAME@
#define __repmgr_bcast_own_msg __repmgr_bcast_own_msg@DB_VERSION_UNIQUE_NAME@
+#define __repmgr_bcast_member_list __repmgr_bcast_member_list@DB_VERSION_UNIQUE_NAME@
#define __seq_stat __seq_stat@DB_VERSION_UNIQUE_NAME@
#define __seq_stat_print __seq_stat_print@DB_VERSION_UNIQUE_NAME@
#define __db_get_seq_flags_fn __db_get_seq_flags_fn@DB_VERSION_UNIQUE_NAME@
#define __db_get_seq_flags_fn __db_get_seq_flags_fn@DB_VERSION_UNIQUE_NAME@
+#define __seq_open __seq_open@DB_VERSION_UNIQUE_NAME@
+#define __seq_initial_value __seq_initial_value@DB_VERSION_UNIQUE_NAME@
+#define __seq_get __seq_get@DB_VERSION_UNIQUE_NAME@
+#define __seq_close __seq_close@DB_VERSION_UNIQUE_NAME@
#define bdb_HCommand bdb_HCommand@DB_VERSION_UNIQUE_NAME@
#if DB_DBM_HSEARCH != 0
#define bdb_NdbmOpen bdb_NdbmOpen@DB_VERSION_UNIQUE_NAME@
@@ -2057,9 +2294,12 @@
#define tcl_CompactStat tcl_CompactStat@DB_VERSION_UNIQUE_NAME@
#define tcl_rep_send tcl_rep_send@DB_VERSION_UNIQUE_NAME@
#define dbc_Cmd dbc_Cmd@DB_VERSION_UNIQUE_NAME@
+#define dbstream_Cmd dbstream_Cmd@DB_VERSION_UNIQUE_NAME@
#define env_Cmd env_Cmd@DB_VERSION_UNIQUE_NAME@
#define tcl_EnvRemove tcl_EnvRemove@DB_VERSION_UNIQUE_NAME@
#define tcl_EnvClose tcl_EnvClose@DB_VERSION_UNIQUE_NAME@
+#define tcl_EnvBackup tcl_EnvBackup@DB_VERSION_UNIQUE_NAME@
+#define tcl_EnvDbBackup tcl_EnvDbBackup@DB_VERSION_UNIQUE_NAME@
#define tcl_EnvIdReset tcl_EnvIdReset@DB_VERSION_UNIQUE_NAME@
#define tcl_EnvLsnReset tcl_EnvLsnReset@DB_VERSION_UNIQUE_NAME@
#define tcl_EnvVerbose tcl_EnvVerbose@DB_VERSION_UNIQUE_NAME@
@@ -2069,6 +2309,7 @@
#define tcl_EnvGetEncryptFlags tcl_EnvGetEncryptFlags@DB_VERSION_UNIQUE_NAME@
#define tcl_EnvSetErrfile tcl_EnvSetErrfile@DB_VERSION_UNIQUE_NAME@
#define tcl_EnvSetMsgfile tcl_EnvSetMsgfile@DB_VERSION_UNIQUE_NAME@
+#define tcl_EnvCloseMsgfile tcl_EnvCloseMsgfile@DB_VERSION_UNIQUE_NAME@
#define tcl_EnvSetErrpfx tcl_EnvSetErrpfx@DB_VERSION_UNIQUE_NAME@
#define tcl_EnvStatPrint tcl_EnvStatPrint@DB_VERSION_UNIQUE_NAME@
#define _NewInfo _NewInfo@DB_VERSION_UNIQUE_NAME@
@@ -2111,9 +2352,11 @@
#define tcl_LogPut tcl_LogPut@DB_VERSION_UNIQUE_NAME@
#define tcl_LogStat tcl_LogStat@DB_VERSION_UNIQUE_NAME@
#define tcl_LogStatPrint tcl_LogStatPrint@DB_VERSION_UNIQUE_NAME@
+#define tcl_LogVerify tcl_LogVerify@DB_VERSION_UNIQUE_NAME@
#define logc_Cmd logc_Cmd@DB_VERSION_UNIQUE_NAME@
#define tcl_LogConfig tcl_LogConfig@DB_VERSION_UNIQUE_NAME@
#define tcl_LogGetConfig tcl_LogGetConfig@DB_VERSION_UNIQUE_NAME@
+#define tcl_LogSetMax tcl_LogSetMax@DB_VERSION_UNIQUE_NAME@
#define _MpInfoDelete _MpInfoDelete@DB_VERSION_UNIQUE_NAME@
#define tcl_MpSync tcl_MpSync@DB_VERSION_UNIQUE_NAME@
#define tcl_MpTrickle tcl_MpTrickle@DB_VERSION_UNIQUE_NAME@
@@ -2121,6 +2364,7 @@
#define tcl_MpStat tcl_MpStat@DB_VERSION_UNIQUE_NAME@
#define tcl_MpStatPrint tcl_MpStatPrint@DB_VERSION_UNIQUE_NAME@
#define tcl_Mutex tcl_Mutex@DB_VERSION_UNIQUE_NAME@
+#define tcl_MutexFailchkTimeout tcl_MutexFailchkTimeout@DB_VERSION_UNIQUE_NAME@
#define tcl_MutFree tcl_MutFree@DB_VERSION_UNIQUE_NAME@
#define tcl_MutGet tcl_MutGet@DB_VERSION_UNIQUE_NAME@
#define tcl_MutLock tcl_MutLock@DB_VERSION_UNIQUE_NAME@
@@ -2227,6 +2471,7 @@
#define __txn_get_prepared __txn_get_prepared@DB_VERSION_UNIQUE_NAME@
#define __txn_openfiles __txn_openfiles@DB_VERSION_UNIQUE_NAME@
#define __txn_open __txn_open@DB_VERSION_UNIQUE_NAME@
+#define __txn_region_detach __txn_region_detach@DB_VERSION_UNIQUE_NAME@
#define __txn_findlastckp __txn_findlastckp@DB_VERSION_UNIQUE_NAME@
#define __txn_env_refresh __txn_env_refresh@DB_VERSION_UNIQUE_NAME@
#define __txn_region_mutex_count __txn_region_mutex_count@DB_VERSION_UNIQUE_NAME@
@@ -2234,7 +2479,7 @@
#define __txn_region_size __txn_region_size@DB_VERSION_UNIQUE_NAME@
#define __txn_region_max __txn_region_max@DB_VERSION_UNIQUE_NAME@
#define __txn_id_set __txn_id_set@DB_VERSION_UNIQUE_NAME@
-#define __txn_oldest_reader __txn_oldest_reader@DB_VERSION_UNIQUE_NAME@
+#define __txn_get_readers __txn_get_readers@DB_VERSION_UNIQUE_NAME@
#define __txn_add_buffer __txn_add_buffer@DB_VERSION_UNIQUE_NAME@
#define __txn_remove_buffer __txn_remove_buffer@DB_VERSION_UNIQUE_NAME@
#define __txn_stat_pp __txn_stat_pp@DB_VERSION_UNIQUE_NAME@
diff --git a/src/dbinc_auto/lock_ext.h b/src/dbinc_auto/lock_ext.h
index d5981e18..3d2c37a3 100644
--- a/src/dbinc_auto/lock_ext.h
+++ b/src/dbinc_auto/lock_ext.h
@@ -28,10 +28,11 @@ int __lock_id_free_pp __P((DB_ENV *, u_int32_t));
int __lock_id_free __P((ENV *, DB_LOCKER *));
int __lock_id_set __P((ENV *, u_int32_t, u_int32_t));
int __lock_getlocker __P((DB_LOCKTAB *, u_int32_t, int, DB_LOCKER **));
-int __lock_getlocker_int __P((DB_LOCKTAB *, u_int32_t, int, DB_LOCKER **));
+int __lock_getlocker_int __P((DB_LOCKTAB *, u_int32_t, int, DB_THREAD_INFO *, DB_LOCKER **));
int __lock_addfamilylocker __P((ENV *, u_int32_t, u_int32_t, u_int32_t));
int __lock_freelocker __P((DB_LOCKTAB *, DB_LOCKER *));
int __lock_familyremove __P((DB_LOCKTAB *, DB_LOCKER *));
+int __lock_local_locker_invalidate __P((ENV *, db_mutex_t));
int __lock_fix_list __P((ENV *, DBT *, u_int32_t));
int __lock_get_list __P((ENV *, DB_LOCKER *, u_int32_t, db_lockmode_t, DBT *));
void __lock_list_print __P((ENV *, DB_MSGBUF *, DBT *));
@@ -57,6 +58,7 @@ int __lock_get_env_timeout __P((DB_ENV *, db_timeout_t *, u_int32_t));
int __lock_set_env_timeout __P((DB_ENV *, db_timeout_t, u_int32_t));
int __lock_open __P((ENV *));
int __lock_env_refresh __P((ENV *));
+int __lock_region_detach __P((ENV *, DB_LOCKTAB *));
u_int32_t __lock_region_mutex_count __P((ENV *));
u_int32_t __lock_region_mutex_max __P((ENV *));
size_t __lock_region_max __P((ENV *));
@@ -65,6 +67,7 @@ int __lock_stat_pp __P((DB_ENV *, DB_LOCK_STAT **, u_int32_t));
int __lock_stat_print_pp __P((DB_ENV *, u_int32_t));
int __lock_stat_print __P((ENV *, u_int32_t));
void __lock_printlock __P((DB_LOCKTAB *, DB_MSGBUF *mbp, struct __db_lock *, int));
+int __lock_dump_locker __P((ENV *, DB_MSGBUF *, DB_LOCKTAB *, DB_LOCKER *));
int __lock_set_timeout __P((ENV *, DB_LOCKER *, db_timeout_t, u_int32_t));
int __lock_set_timeout_internal __P((ENV *, DB_LOCKER *, db_timeout_t, u_int32_t));
int __lock_inherit_timeout __P((ENV *, DB_LOCKER *, DB_LOCKER *));
diff --git a/src/dbinc_auto/log_ext.h b/src/dbinc_auto/log_ext.h
index dde6742d..769643fa 100644
--- a/src/dbinc_auto/log_ext.h
+++ b/src/dbinc_auto/log_ext.h
@@ -7,6 +7,7 @@ extern "C" {
#endif
int __log_open __P((ENV *));
+int __log_region_detach __P((ENV *, DB_LOG *));
int __log_find __P((DB_LOG *, int, u_int32_t *, logfile_validity *));
int __log_valid __P((DB_LOG *, u_int32_t, int, DB_FH **, u_int32_t, logfile_validity *, u_int32_t *));
int __log_env_refresh __P((ENV *));
@@ -72,6 +73,7 @@ int __log_flush_int __P((DB_LOG *, const DB_LSN *, int));
int __log_file_pp __P((DB_ENV *, const DB_LSN *, char *, size_t));
int __log_name __P((DB_LOG *, u_int32_t, char **, DB_FH **, u_int32_t));
int __log_rep_put __P((ENV *, DB_LSN *, const DBT *, u_int32_t));
+int __log_rep_write __P((ENV *));
int __log_put_record_pp __P((DB_ENV *, DB *, DB_TXN *, DB_LSN *, u_int32_t, u_int32_t, u_int32_t, u_int32_t, DB_LOG_RECSPEC *, ...));
int __log_put_record __P((ENV *, DB *, DB_TXN *, DB_LSN *, u_int32_t, u_int32_t, u_int32_t, u_int32_t, DB_LOG_RECSPEC *, ...));
int __log_stat_pp __P((DB_ENV *, DB_LOG_STAT **, u_int32_t));
@@ -115,6 +117,7 @@ int __db_relink_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __db_merge_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __db_pgno_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __dbreg_register_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __dbreg_register_42_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __bam_split_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __bam_split_42_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __bam_rsplit_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
@@ -129,12 +132,19 @@ int __bam_rcuradj_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __bam_relink_43_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __bam_merge_44_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __fop_create_42_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_create_60_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __fop_create_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_remove_60_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __fop_remove_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __fop_write_42_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_write_60_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __fop_write_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_write_file_60_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_write_file_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __fop_rename_42_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_rename_60_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __fop_rename_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __fop_file_remove_60_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __fop_file_remove_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __ham_insdel_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __ham_newpage_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
@@ -150,6 +160,7 @@ int __ham_contract_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __ham_curadj_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __ham_chgpg_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __heap_addrem_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+int __heap_addrem_60_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __heap_pg_alloc_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __heap_trunc_meta_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __heap_trunc_page_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
diff --git a/src/dbinc_auto/mp_ext.h b/src/dbinc_auto/mp_ext.h
index d142b584..3f5a397b 100644
--- a/src/dbinc_auto/mp_ext.h
+++ b/src/dbinc_auto/mp_ext.h
@@ -6,6 +6,7 @@
extern "C" {
#endif
+int __memp_bh_unreachable __P((ENV *, BH *, DB_LSN *, int));
int __memp_alloc __P((DB_MPOOL *, REGINFO *, MPOOLFILE *, size_t, roff_t *, void *));
void __memp_free __P((REGINFO *, void *));
int __memp_backup_open __P((ENV *, DB_MPOOLFILE *, const char *, const char *, u_int32_t, DB_FH **, void**));
@@ -18,6 +19,7 @@ int __memp_pg __P((DB_MPOOLFILE *, db_pgno_t, void *, int));
int __memp_bhfree __P((DB_MPOOL *, REGINFO *, MPOOLFILE *, DB_MPOOL_HASH *, BH *, u_int32_t));
int __memp_fget_pp __P((DB_MPOOLFILE *, db_pgno_t *, DB_TXN *, u_int32_t, void *));
int __memp_fget __P((DB_MPOOLFILE *, db_pgno_t *, DB_THREAD_INFO *, DB_TXN *, u_int32_t, void *));
+int __memp_find_obsolete_version __P((ENV *, BH *, DB_MPOOL_HASH *, BH **));
int __memp_fcreate_pp __P((DB_ENV *, DB_MPOOLFILE **, u_int32_t));
int __memp_fcreate __P((ENV *, DB_MPOOLFILE **));
int __memp_set_clear_len __P((DB_MPOOLFILE *, u_int32_t));
@@ -28,6 +30,7 @@ int __memp_set_flags __P((DB_MPOOLFILE *, u_int32_t, int));
int __memp_get_ftype __P((DB_MPOOLFILE *, int *));
int __memp_set_ftype __P((DB_MPOOLFILE *, int));
int __memp_set_lsn_offset __P((DB_MPOOLFILE *, int32_t));
+void __memp_set_maxpgno __P((MPOOLFILE *, u_int32_t, u_int32_t));
int __memp_get_pgcookie __P((DB_MPOOLFILE *, DBT *));
int __memp_set_pgcookie __P((DB_MPOOLFILE *, DBT *));
int __memp_get_priority __P((DB_MPOOLFILE *, DB_CACHE_PRIORITY *));
@@ -75,10 +78,12 @@ int __memp_skip_curadj __P((DBC *, db_pgno_t));
int __memp_bh_freeze __P((DB_MPOOL *, REGINFO *, DB_MPOOL_HASH *, BH *, int *));
int __memp_bh_thaw __P((DB_MPOOL *, REGINFO *, DB_MPOOL_HASH *, BH *, BH *));
int __memp_open __P((ENV *, int));
+int __memp_region_detach __P((ENV *, DB_MPOOL *));
int __memp_init __P((ENV *, DB_MPOOL *, u_int, u_int32_t, u_int));
u_int32_t __memp_max_regions __P((ENV *));
u_int32_t __memp_region_mutex_count __P((ENV *));
int __memp_env_refresh __P((ENV *));
+int __memp_region_bhfree __P((REGINFO *));
int __memp_register_pp __P((DB_ENV *, int, int (*)(DB_ENV *, db_pgno_t, void *, DBT *), int (*)(DB_ENV *, db_pgno_t, void *, DBT *)));
int __memp_register __P((ENV *, int, int (*)(DB_ENV *, db_pgno_t, void *, DBT *), int (*)(DB_ENV *, db_pgno_t, void *, DBT *)));
int __memp_get_bucket __P((ENV *, MPOOLFILE *, db_pgno_t, REGINFO **, DB_MPOOL_HASH **, u_int32_t *));
diff --git a/src/dbinc_auto/mutex_ext.h b/src/dbinc_auto/mutex_ext.h
index 1a2a1b2b..673c18d0 100644
--- a/src/dbinc_auto/mutex_ext.h
+++ b/src/dbinc_auto/mutex_ext.h
@@ -10,13 +10,13 @@ int __mutex_alloc __P((ENV *, int, u_int32_t, db_mutex_t *));
int __mutex_alloc_int __P((ENV *, int, int, u_int32_t, db_mutex_t *));
int __mutex_free __P((ENV *, db_mutex_t *));
int __mutex_free_int __P((ENV *, int, db_mutex_t *));
+int __mutex_died __P((ENV *, db_mutex_t));
int __mutex_refresh __P((ENV *, db_mutex_t));
-int __mut_failchk __P((ENV *));
-int __db_fcntl_mutex_init __P((ENV *, db_mutex_t, u_int32_t));
-int __db_fcntl_mutex_lock __P((ENV *, db_mutex_t, db_timeout_t));
-int __db_fcntl_mutex_trylock __P((ENV *, db_mutex_t));
-int __db_fcntl_mutex_unlock __P((ENV *, db_mutex_t));
-int __db_fcntl_mutex_destroy __P((ENV *, db_mutex_t));
+int __mutex_record_lock __P((ENV *, db_mutex_t, MUTEX_ACTION, MUTEX_STATE **));
+int __mutex_record_unlock __P((ENV *, db_mutex_t));
+int __mutex_record_print __P((ENV *, DB_THREAD_INFO *));
+int __mutex_failchk __P((ENV *));
+int __mutex_failchk_thread __P((ENV *, DB_THREAD_INFO *));
int __mutex_alloc_pp __P((DB_ENV *, u_int32_t, db_mutex_t *));
int __mutex_free_pp __P((DB_ENV *, db_mutex_t));
int __mutex_lock_pp __P((DB_ENV *, db_mutex_t));
@@ -31,6 +31,9 @@ int __mutex_get_max __P((DB_ENV *, u_int32_t *));
int __mutex_set_max __P((DB_ENV *, u_int32_t));
int __mutex_get_tas_spins __P((DB_ENV *, u_int32_t *));
int __mutex_set_tas_spins __P((DB_ENV *, u_int32_t));
+#ifdef HAVE_ERROR_HISTORY
+int __mutex_diags __P((ENV *, db_mutex_t, int));
+#endif
#if !defined(HAVE_ATOMIC_SUPPORT) && defined(HAVE_MUTEX_SUPPORT)
atomic_value_t __atomic_inc __P((ENV *, db_atomic_t *));
#endif
@@ -53,6 +56,7 @@ int __db_hybrid_mutex_suspend __P((ENV *, db_mutex_t, db_timespec *, int));
int __db_pthread_mutex_unlock __P((ENV *, db_mutex_t));
int __db_pthread_mutex_destroy __P((ENV *, db_mutex_t));
int __mutex_open __P((ENV *, int));
+int __mutex_region_detach __P((ENV *, DB_MUTEXMGR *));
int __mutex_env_refresh __P((ENV *));
void __mutex_resource_return __P((ENV *, REGINFO *));
int __mutex_stat_pp __P((DB_ENV *, DB_MUTEX_STAT **, u_int32_t));
@@ -62,6 +66,7 @@ void __mutex_print_debug_single __P((ENV *, const char *, db_mutex_t, u_int32_t)
void __mutex_print_debug_stats __P((ENV *, DB_MSGBUF *, db_mutex_t, u_int32_t));
void __mutex_set_wait_info __P((ENV *, db_mutex_t, uintmax_t *, uintmax_t *));
void __mutex_clear __P((ENV *, db_mutex_t));
+char *__mutex_describe __P((ENV *, db_mutex_t, char *));
int __db_tas_mutex_init __P((ENV *, db_mutex_t, u_int32_t));
int __db_tas_mutex_lock __P((ENV *, db_mutex_t, db_timeout_t));
int __db_tas_mutex_trylock __P((ENV *, db_mutex_t));
diff --git a/src/dbinc_auto/os_ext.h b/src/dbinc_auto/os_ext.h
index a0a7b791..26cf2127 100644
--- a/src/dbinc_auto/os_ext.h
+++ b/src/dbinc_auto/os_ext.h
@@ -6,7 +6,7 @@
extern "C" {
#endif
-void __os_abort __P((ENV *));
+void __os_abort __P((const ENV *));
int __os_abspath __P((const char *));
#if defined(HAVE_REPLICATION_THREADS)
int __os_getaddrinfo __P((ENV *, const char *, u_int, const char *, const ADDRINFO *, ADDRINFO **));
@@ -18,12 +18,12 @@ int __os_umalloc __P((ENV *, size_t, void *));
int __os_urealloc __P((ENV *, size_t, void *));
void __os_ufree __P((ENV *, void *));
int __os_strdup __P((ENV *, const char *, void *));
-int __os_calloc __P((ENV *, size_t, size_t, void *));
-int __os_malloc __P((ENV *, size_t, void *));
-int __os_realloc __P((ENV *, size_t, void *));
-void __os_free __P((ENV *, void *));
+int __os_calloc __P((const ENV *, size_t, size_t, void *));
+int __os_malloc __P((const ENV *, size_t, void *));
+int __os_realloc __P((const ENV *, size_t, void *));
+void __os_free __P((const ENV *, void *));
void *__ua_memcpy __P((void *, const void *, size_t));
-void __os_gettime __P((ENV *, db_timespec *, int));
+void __os_gettime __P((const ENV *, db_timespec *, int));
int __os_fs_notzero __P((void));
int __os_support_direct_io __P((void));
int __os_support_db_register __P((void));
@@ -54,6 +54,7 @@ int __os_open __P((ENV *, const char *, u_int32_t, u_int32_t, int, DB_FH **));
int __os_concat_path __P((char *, size_t, const char *, const char *));
void __os_id __P((DB_ENV *, pid_t *, db_threadid_t*));
int __os_rename __P((ENV *, const char *, const char *, u_int32_t));
+int __os_rmdir __P((ENV *, const char *));
int __os_isroot __P((void));
char *__db_rpath __P((const char *));
int __os_io __P((ENV *, int, DB_FH *, db_pgno_t, u_int32_t, u_int32_t, u_int32_t, u_int8_t *, size_t *));
@@ -61,17 +62,38 @@ int __os_read __P((ENV *, DB_FH *, void *, size_t, size_t *));
int __os_write __P((ENV *, DB_FH *, void *, size_t, size_t *));
int __os_physwrite __P((ENV *, DB_FH *, void *, size_t, size_t *));
int __os_seek __P((ENV *, DB_FH *, db_pgno_t, u_int32_t, off_t));
-void __os_stack __P((ENV *));
+void __os_stack __P((const ENV *));
+void __os_stack_top __P((const ENV *, unsigned, unsigned));
+void __os_stack_text __P((const ENV *, char *, size_t, unsigned, unsigned));
+int __os_stack_save __P((const ENV *, unsigned, void **));
+void __os_stack_msgadd __P((const ENV *, DB_MSGBUF *, unsigned, unsigned, void **));
int __os_exists __P((ENV *, const char *, int *));
int __os_ioinfo __P((ENV *, const char *, DB_FH *, u_int32_t *, u_int32_t *, u_int32_t *));
int __os_tmpdir __P((ENV *, u_int32_t));
-int __os_truncate __P((ENV *, DB_FH *, db_pgno_t, u_int32_t));
+int __os_truncate __P((ENV *, DB_FH *, db_pgno_t, u_int32_t, off_t));
void __os_unique_id __P((ENV *, u_int32_t *));
+void __os_srandom __P((u_int));
+u_int __os_random __P((void));
int __os_unlink __P((ENV *, const char *, int));
void __os_yield __P((ENV *, u_long, u_long));
#ifdef HAVE_QNX
int __os_qnx_region_open __P((ENV *, const char *, int, int, DB_FH **));
#endif
+#ifdef DB_WINCE
+FILE * __ce_freopen __P((const char *, const char *, FILE *));
+#endif
+#ifdef DB_WINCE
+struct tm * __ce_gmtime __P((const time_t *));
+#endif
+#ifdef DB_WINCE
+struct tm * localtime __P((const time_t *));
+#endif
+#ifdef DB_WINCE
+time_t __ce_mktime __P((struct tm *));
+#endif
+#ifdef DB_WINCE
+int __ce_remove __P((const char *path));
+#endif
int __os_is_winnt __P((void));
u_int32_t __os_cpu_count __P((void));
#ifdef HAVE_REPLICATION_THREADS
diff --git a/src/dbinc_auto/rep_automsg.h b/src/dbinc_auto/rep_automsg.h
index 584040cf..f52c8907 100644
--- a/src/dbinc_auto/rep_automsg.h
+++ b/src/dbinc_auto/rep_automsg.h
@@ -32,7 +32,7 @@ typedef struct ___rep_egen_args {
u_int32_t egen;
} __rep_egen_args;
-#define __REP_FILEINFO_SIZE 40
+#define __REP_FILEINFO_SIZE 48
typedef struct ___rep_fileinfo_args {
u_int32_t pgsize;
db_pgno_t pgno;
@@ -44,8 +44,24 @@ typedef struct ___rep_fileinfo_args {
DBT uid;
DBT info;
DBT dir;
+ u_int32_t blob_fid_lo;
+ u_int32_t blob_fid_hi;
} __rep_fileinfo_args;
+#define __REP_FILEINFO_V7_SIZE 40
+typedef struct ___rep_fileinfo_v7_args {
+ u_int32_t pgsize;
+ db_pgno_t pgno;
+ db_pgno_t max_pgno;
+ u_int32_t filenum;
+ u_int32_t finfo_flags;
+ u_int32_t type;
+ u_int32_t db_flags;
+ DBT uid;
+ DBT info;
+ DBT dir;
+} __rep_fileinfo_v7_args;
+
#define __REP_FILEINFO_V6_SIZE 36
typedef struct ___rep_fileinfo_v6_args {
u_int32_t pgsize;
@@ -116,5 +132,46 @@ typedef struct ___rep_lsn_hist_data_args {
u_int32_t hist_nsec;
} __rep_lsn_hist_data_args;
-#define __REP_MAXMSG_SIZE 40
+#define __REP_BLOB_UPDATE_REQ_SIZE 32
+typedef struct ___rep_blob_update_req_args {
+ u_int64_t blob_fid;
+ u_int64_t blob_sid;
+ u_int64_t blob_id;
+ u_int64_t highest_id;
+} __rep_blob_update_req_args;
+
+#define __REP_BLOB_UPDATE_SIZE 24
+typedef struct ___rep_blob_update_args {
+ u_int64_t blob_fid;
+ u_int64_t highest_id;
+ u_int32_t flags;
+ u_int32_t num_blobs;
+} __rep_blob_update_args;
+
+#define __REP_BLOB_FILE_SIZE 24
+typedef struct ___rep_blob_file_args {
+ u_int64_t blob_sid;
+ u_int64_t blob_id;
+ u_int64_t blob_size;
+} __rep_blob_file_args;
+
+#define __REP_BLOB_CHUNK_SIZE 40
+typedef struct ___rep_blob_chunk_args {
+ u_int32_t flags;
+ u_int64_t blob_fid;
+ u_int64_t blob_sid;
+ u_int64_t blob_id;
+ u_int64_t offset;
+ DBT data;
+} __rep_blob_chunk_args;
+
+#define __REP_BLOB_CHUNK_REQ_SIZE 32
+typedef struct ___rep_blob_chunk_req_args {
+ u_int64_t blob_fid;
+ u_int64_t blob_sid;
+ u_int64_t blob_id;
+ u_int64_t offset;
+} __rep_blob_chunk_req_args;
+
+#define __REP_MAXMSG_SIZE 48
#endif
diff --git a/src/dbinc_auto/rep_ext.h b/src/dbinc_auto/rep_ext.h
index 89bdc797..97740acf 100644
--- a/src/dbinc_auto/rep_ext.h
+++ b/src/dbinc_auto/rep_ext.h
@@ -14,6 +14,8 @@ int __rep_egen_marshal __P((ENV *, __rep_egen_args *, u_int8_t *, size_t, size_t
int __rep_egen_unmarshal __P((ENV *, __rep_egen_args *, u_int8_t *, size_t, u_int8_t **));
int __rep_fileinfo_marshal __P((ENV *, u_int32_t, __rep_fileinfo_args *, u_int8_t *, size_t, size_t *));
int __rep_fileinfo_unmarshal __P((ENV *, u_int32_t, __rep_fileinfo_args **, u_int8_t *, size_t, u_int8_t **));
+int __rep_fileinfo_v7_marshal __P((ENV *, u_int32_t, __rep_fileinfo_v7_args *, u_int8_t *, size_t, size_t *));
+int __rep_fileinfo_v7_unmarshal __P((ENV *, u_int32_t, __rep_fileinfo_v7_args **, u_int8_t *, size_t, u_int8_t **));
int __rep_fileinfo_v6_marshal __P((ENV *, u_int32_t, __rep_fileinfo_v6_args *, u_int8_t *, size_t, size_t *));
int __rep_fileinfo_v6_unmarshal __P((ENV *, u_int32_t, __rep_fileinfo_v6_args **, u_int8_t *, size_t, u_int8_t **));
int __rep_grant_info_marshal __P((ENV *, __rep_grant_info_args *, u_int8_t *, size_t, size_t *));
@@ -32,13 +34,29 @@ void __rep_lsn_hist_key_marshal __P((ENV *, __rep_lsn_hist_key_args *, u_int8_t
int __rep_lsn_hist_key_unmarshal __P((ENV *, __rep_lsn_hist_key_args *, u_int8_t *, size_t, u_int8_t **));
void __rep_lsn_hist_data_marshal __P((ENV *, __rep_lsn_hist_data_args *, u_int8_t *));
int __rep_lsn_hist_data_unmarshal __P((ENV *, __rep_lsn_hist_data_args *, u_int8_t *, size_t, u_int8_t **));
+void __rep_blob_update_req_marshal __P((ENV *, __rep_blob_update_req_args *, u_int8_t *));
+int __rep_blob_update_req_unmarshal __P((ENV *, __rep_blob_update_req_args *, u_int8_t *, size_t, u_int8_t **));
+void __rep_blob_update_marshal __P((ENV *, __rep_blob_update_args *, u_int8_t *));
+int __rep_blob_update_unmarshal __P((ENV *, __rep_blob_update_args *, u_int8_t *, size_t, u_int8_t **));
+void __rep_blob_file_marshal __P((ENV *, __rep_blob_file_args *, u_int8_t *));
+int __rep_blob_file_unmarshal __P((ENV *, __rep_blob_file_args *, u_int8_t *, size_t, u_int8_t **));
+void __rep_blob_chunk_marshal __P((ENV *, __rep_blob_chunk_args *, u_int8_t *));
+int __rep_blob_chunk_unmarshal __P((ENV *, __rep_blob_chunk_args *, u_int8_t *, size_t, u_int8_t **));
+void __rep_blob_chunk_req_marshal __P((ENV *, __rep_blob_chunk_req_args *, u_int8_t *));
+int __rep_blob_chunk_req_unmarshal __P((ENV *, __rep_blob_chunk_req_args *, u_int8_t *, size_t, u_int8_t **));
int __rep_update_req __P((ENV *, __rep_control_args *));
+int __rep_blob_update_req __P((ENV *, DB_THREAD_INFO *, DBT *));
int __rep_page_req __P((ENV *, DB_THREAD_INFO *, int, __rep_control_args *, DBT *));
int __rep_update_setup __P((ENV *, int, __rep_control_args *, DBT *, time_t, DB_LSN *));
+int __rep_blob_update __P((ENV *, int, DB_THREAD_INFO *, DBT *));
+int __rep_blob_allreq __P((ENV *, int, DBT *));
int __rep_bulk_page __P((ENV *, DB_THREAD_INFO *, int, __rep_control_args *, DBT *));
int __rep_page __P((ENV *, DB_THREAD_INFO *, int, __rep_control_args *, DBT *));
+int __rep_blob_chunk __P((ENV *, int, DB_THREAD_INFO *, DBT *));
int __rep_init_cleanup __P((ENV *, REP *, int));
+int __rep_blob_chunk_req __P((ENV *, int, DBT *));
int __rep_pggap_req __P((ENV *, REP *, __rep_fileinfo_args *, u_int32_t));
+int __rep_blob_rereq __P((ENV *, REP *));
int __rep_finfo_alloc __P((ENV *, __rep_fileinfo_args *, __rep_fileinfo_args **));
int __rep_remove_init_file __P((ENV *));
int __rep_reset_init __P((ENV *));
@@ -65,27 +83,35 @@ void __rep_env_destroy __P((DB_ENV *));
int __rep_get_config __P((DB_ENV *, u_int32_t, int *));
int __rep_set_config __P((DB_ENV *, u_int32_t, int));
int __rep_start_pp __P((DB_ENV *, DBT *, u_int32_t));
-int __rep_start_int __P((ENV *, DBT *, u_int32_t));
+int __rep_start_int __P((ENV *, DBT *, u_int32_t, u_int32_t));
int __rep_open_sysdb __P((ENV *, DB_THREAD_INFO *, DB_TXN *, const char *, u_int32_t, DB **));
int __rep_client_dbinit __P((ENV *, int, repdb_t));
+int __rep_blob_cmp __P((DB *, const DBT *, const DBT *, size_t *));
+int __rep_offset_cmp __P((DB *, const DBT *, const DBT *, size_t *));
int __rep_get_limit __P((DB_ENV *, u_int32_t *, u_int32_t *));
int __rep_set_limit __P((DB_ENV *, u_int32_t, u_int32_t));
int __rep_set_nsites_pp __P((DB_ENV *, u_int32_t));
int __rep_set_nsites_int __P((ENV *, u_int32_t));
int __rep_get_nsites __P((DB_ENV *, u_int32_t *));
-int __rep_set_priority __P((DB_ENV *, u_int32_t));
+int __rep_set_priority_pp __P((DB_ENV *, u_int32_t));
+int __rep_set_priority_int __P((ENV *, u_int32_t));
int __rep_get_priority __P((DB_ENV *, u_int32_t *));
-int __rep_set_timeout __P((DB_ENV *, int, db_timeout_t));
+int __rep_set_timeout_pp __P((DB_ENV *, int, db_timeout_t));
+int __rep_set_timeout_int __P((ENV *, int, db_timeout_t));
int __rep_get_timeout __P((DB_ENV *, int, db_timeout_t *));
int __rep_get_request __P((DB_ENV *, db_timeout_t *, db_timeout_t *));
int __rep_set_request __P((DB_ENV *, db_timeout_t, db_timeout_t));
+int __rep_set_view __P((DB_ENV *, int (*)(DB_ENV *, const char *, int *, u_int32_t)));
+int __rep_call_partial __P((ENV *, const char *, int *, u_int32_t, DELAYED_BLOB_LIST **));
int __rep_set_transport_pp __P((DB_ENV *, int, int (*)(DB_ENV *, const DBT *, const DBT *, const DB_LSN *, int, u_int32_t)));
int __rep_set_transport_int __P((ENV *, int, int (*)(DB_ENV *, const DBT *, const DBT *, const DB_LSN *, int, u_int32_t)));
int __rep_get_clockskew __P((DB_ENV *, u_int32_t *, u_int32_t *));
int __rep_set_clockskew __P((DB_ENV *, u_int32_t, u_int32_t));
-int __rep_flush __P((DB_ENV *));
+int __rep_flush_pp __P((DB_ENV *));
+int __rep_flush_int __P((ENV *));
int __rep_sync __P((DB_ENV *, u_int32_t));
int __rep_txn_applied __P((ENV *, DB_THREAD_INFO *, DB_COMMIT_INFO *, db_timeout_t));
+int __rep_read_lsn_history __P((ENV *, DB_THREAD_INFO *, DB_TXN **, DBC **, u_int32_t, __rep_lsn_hist_data_args *, struct rep_waitgoal *, u_int32_t, int));
int __rep_process_message_pp __P((DB_ENV *, DBT *, DBT *, int, DB_LSN *));
int __rep_process_message_int __P((ENV *, DBT *, DBT *, int, DB_LSN *));
int __rep_apply __P((ENV *, DB_THREAD_INFO *, __rep_control_args *, DBT *, DB_LSN *, int *, DB_LSN *));
@@ -101,6 +127,7 @@ int __rep_preclose __P((ENV *));
int __rep_closefiles __P((ENV *));
int __rep_write_egen __P((ENV *, REP *, u_int32_t));
int __rep_write_gen __P((ENV *, REP *, u_int32_t));
+int __rep_check_view __P((ENV *));
int __rep_stat_pp __P((DB_ENV *, DB_REP_STAT **, u_int32_t));
int __rep_stat_print_pp __P((DB_ENV *, u_int32_t));
int __rep_stat_print __P((ENV *, u_int32_t));
@@ -139,6 +166,8 @@ int __rep_log_backup __P((ENV *, DB_LOGC *, DB_LSN *, u_int32_t));
int __rep_get_maxpermlsn __P((ENV *, DB_LSN *));
int __rep_is_internal_rep_file __P((char *));
int __rep_get_datagen __P((ENV *, u_int32_t *));
+int __rep_become_readonly_master __P((ENV *, u_int32_t *, DB_LSN *));
+int __rep_get_lsnhist_data __P((ENV *, DB_THREAD_INFO *, u_int32_t, __rep_lsn_hist_data_args *));
int __rep_verify __P((ENV *, __rep_control_args *, DBT *, int, time_t));
int __rep_verify_fail __P((ENV *, __rep_control_args *));
int __rep_verify_req __P((ENV *, __rep_control_args *, int));
diff --git a/src/dbinc_auto/repmgr_automsg.h b/src/dbinc_auto/repmgr_automsg.h
index 1b2b928c..17e467e9 100644
--- a/src/dbinc_auto/repmgr_automsg.h
+++ b/src/dbinc_auto/repmgr_automsg.h
@@ -72,11 +72,17 @@ typedef struct ___repmgr_membership_key_args {
u_int16_t port;
} __repmgr_membership_key_args;
-#define __REPMGR_MEMBERSHIP_DATA_SIZE 4
+#define __REPMGR_MEMBERSHIP_DATA_SIZE 8
typedef struct ___repmgr_membership_data_args {
+ u_int32_t status;
u_int32_t flags;
} __repmgr_membership_data_args;
+#define __REPMGR_V4MEMBERSHIP_DATA_SIZE 4
+typedef struct ___repmgr_v4membership_data_args {
+ u_int32_t flags;
+} __repmgr_v4membership_data_args;
+
#define __REPMGR_MEMBER_METADATA_SIZE 8
typedef struct ___repmgr_member_metadata_args {
u_int32_t format;
@@ -96,18 +102,41 @@ typedef struct ___repmgr_membr_vers_args {
u_int32_t gen;
} __repmgr_membr_vers_args;
-#define __REPMGR_SITE_INFO_SIZE 10
+#define __REPMGR_SITE_INFO_SIZE 14
typedef struct ___repmgr_site_info_args {
DBT host;
u_int16_t port;
+ u_int32_t status;
u_int32_t flags;
} __repmgr_site_info_args;
-#define __REPMGR_CONNECT_REJECT_SIZE 8
+#define __REPMGR_V4SITE_INFO_SIZE 10
+typedef struct ___repmgr_v4site_info_args {
+ DBT host;
+ u_int16_t port;
+ u_int32_t flags;
+} __repmgr_v4site_info_args;
+
+#define __REPMGR_CONNECT_REJECT_SIZE 12
typedef struct ___repmgr_connect_reject_args {
u_int32_t version;
u_int32_t gen;
+ u_int32_t status;
} __repmgr_connect_reject_args;
-#define __REPMGR_MAXMSG_SIZE 12
+#define __REPMGR_V4CONNECT_REJECT_SIZE 8
+typedef struct ___repmgr_v4connect_reject_args {
+ u_int32_t version;
+ u_int32_t gen;
+} __repmgr_v4connect_reject_args;
+
+#define __REPMGR_LSNHIST_MATCH_SIZE 24
+typedef struct ___repmgr_lsnhist_match_args {
+ DB_LSN lsn;
+ u_int32_t hist_sec;
+ u_int32_t hist_nsec;
+ DB_LSN next_gen_lsn;
+} __repmgr_lsnhist_match_args;
+
+#define __REPMGR_MAXMSG_SIZE 24
#endif
diff --git a/src/dbinc_auto/repmgr_ext.h b/src/dbinc_auto/repmgr_ext.h
index b1237950..3ff59ffe 100644
--- a/src/dbinc_auto/repmgr_ext.h
+++ b/src/dbinc_auto/repmgr_ext.h
@@ -29,6 +29,8 @@ int __repmgr_membership_key_marshal __P((ENV *, __repmgr_membership_key_args *,
int __repmgr_membership_key_unmarshal __P((ENV *, __repmgr_membership_key_args *, u_int8_t *, size_t, u_int8_t **));
void __repmgr_membership_data_marshal __P((ENV *, __repmgr_membership_data_args *, u_int8_t *));
int __repmgr_membership_data_unmarshal __P((ENV *, __repmgr_membership_data_args *, u_int8_t *, size_t, u_int8_t **));
+void __repmgr_v4membership_data_marshal __P((ENV *, __repmgr_v4membership_data_args *, u_int8_t *));
+int __repmgr_v4membership_data_unmarshal __P((ENV *, __repmgr_v4membership_data_args *, u_int8_t *, size_t, u_int8_t **));
void __repmgr_member_metadata_marshal __P((ENV *, __repmgr_member_metadata_args *, u_int8_t *));
int __repmgr_member_metadata_unmarshal __P((ENV *, __repmgr_member_metadata_args *, u_int8_t *, size_t, u_int8_t **));
int __repmgr_gm_fwd_marshal __P((ENV *, __repmgr_gm_fwd_args *, u_int8_t *, size_t, size_t *));
@@ -37,21 +39,34 @@ void __repmgr_membr_vers_marshal __P((ENV *, __repmgr_membr_vers_args *, u_int8_
int __repmgr_membr_vers_unmarshal __P((ENV *, __repmgr_membr_vers_args *, u_int8_t *, size_t, u_int8_t **));
int __repmgr_site_info_marshal __P((ENV *, __repmgr_site_info_args *, u_int8_t *, size_t, size_t *));
int __repmgr_site_info_unmarshal __P((ENV *, __repmgr_site_info_args *, u_int8_t *, size_t, u_int8_t **));
+int __repmgr_v4site_info_marshal __P((ENV *, __repmgr_v4site_info_args *, u_int8_t *, size_t, size_t *));
+int __repmgr_v4site_info_unmarshal __P((ENV *, __repmgr_v4site_info_args *, u_int8_t *, size_t, u_int8_t **));
void __repmgr_connect_reject_marshal __P((ENV *, __repmgr_connect_reject_args *, u_int8_t *));
int __repmgr_connect_reject_unmarshal __P((ENV *, __repmgr_connect_reject_args *, u_int8_t *, size_t, u_int8_t **));
+void __repmgr_v4connect_reject_marshal __P((ENV *, __repmgr_v4connect_reject_args *, u_int8_t *));
+int __repmgr_v4connect_reject_unmarshal __P((ENV *, __repmgr_v4connect_reject_args *, u_int8_t *, size_t, u_int8_t **));
+void __repmgr_lsnhist_match_marshal __P((ENV *, __repmgr_lsnhist_match_args *, u_int8_t *));
+int __repmgr_lsnhist_match_unmarshal __P((ENV *, __repmgr_lsnhist_match_args *, u_int8_t *, size_t, u_int8_t **));
int __repmgr_member_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
int __repmgr_init_print __P((ENV *, DB_DISTAB *));
int __repmgr_init_election __P((ENV *, u_int32_t));
int __repmgr_claim_victory __P((ENV *));
int __repmgr_turn_on_elections __P((ENV *));
-int __repmgr_start __P((DB_ENV *, int, u_int32_t));
+int __repmgr_start_pp __P((DB_ENV *, int, u_int32_t));
+int __repmgr_start_int __P((ENV *, int, u_int32_t));
int __repmgr_valid_config __P((ENV *, u_int32_t));
+int __repmgr_prefmas_auto_config __P((DB_ENV *, u_int32_t *));
int __repmgr_autostart __P((ENV *));
int __repmgr_start_selector __P((ENV *));
int __repmgr_close __P((ENV *));
int __repmgr_stop __P((ENV *));
int __repmgr_set_ack_policy __P((DB_ENV *, int));
int __repmgr_get_ack_policy __P((DB_ENV *, int *));
+int __repmgr_set_incoming_queue_max __P((DB_ENV *, u_int32_t, u_int32_t));
+int __repmgr_get_incoming_queue_max __P((DB_ENV *, u_int32_t *, u_int32_t *));
+void __repmgr_set_incoming_queue_redzone __P((void *, u_int32_t, u_int32_t));
+int __repmgr_get_incoming_queue_redzone __P((DB_ENV *, u_int32_t *, u_int32_t *));
+int __repmgr_get_incoming_queue_fullevent __P((DB_ENV *, int *));
int __repmgr_env_create __P((ENV *, DB_REP *));
void __repmgr_env_destroy __P((ENV *, DB_REP *));
int __repmgr_stop_threads __P((ENV *));
@@ -72,12 +87,13 @@ int __repmgr_site_by_eid __P((DB_ENV *, int, DB_SITE **));
int __repmgr_get_site_address __P((DB_SITE *, const char **, u_int *));
int __repmgr_get_eid __P((DB_SITE *, int *));
int __repmgr_get_config __P((DB_SITE *, u_int32_t, u_int32_t *));
-int __repmgr_site_config __P((DB_SITE *, u_int32_t, u_int32_t));
+int __repmgr_site_config_pp __P((DB_SITE *, u_int32_t, u_int32_t));
+int __repmgr_site_config_int __P((DB_SITE *, u_int32_t, u_int32_t));
int __repmgr_site_close __P((DB_SITE *));
void *__repmgr_msg_thread __P((void *));
int __repmgr_send_err_resp __P((ENV *, CHANNEL *, int));
int __repmgr_handle_event __P((ENV *, u_int32_t, void *));
-int __repmgr_update_membership __P((ENV *, DB_THREAD_INFO *, int, u_int32_t));
+int __repmgr_update_membership __P((ENV *, DB_THREAD_INFO *, int, u_int32_t, u_int32_t));
int __repmgr_set_gm_version __P((ENV *, DB_THREAD_INFO *, DB_TXN *, u_int32_t));
int __repmgr_setup_gmdb_op __P((ENV *, DB_THREAD_INFO *, DB_TXN **, u_int32_t));
int __repmgr_cleanup_gmdb_op __P((ENV *, int));
@@ -132,7 +148,6 @@ int __repmgr_select_loop __P((ENV *));
int __repmgr_queue_destroy __P((ENV *));
int __repmgr_queue_get __P((ENV *, REPMGR_MESSAGE **, REPMGR_RUNNABLE *));
int __repmgr_queue_put __P((ENV *, REPMGR_MESSAGE *));
-int __repmgr_queue_size __P((ENV *));
int __repmgr_member_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
void *__repmgr_select_thread __P((void *));
int __repmgr_bow_out __P((ENV *));
@@ -140,6 +155,7 @@ int __repmgr_accept __P((ENV *));
int __repmgr_compute_timeout __P((ENV *, db_timespec *));
REPMGR_SITE *__repmgr_connected_master __P((ENV *));
int __repmgr_check_timeouts __P((ENV *));
+int __repmgr_refresh_selector __P((ENV *));
int __repmgr_first_try_connections __P((ENV *));
int __repmgr_send_v1_handshake __P((ENV *, REPMGR_CONNECTION *, void *, size_t));
int __repmgr_read_from_site __P((ENV *, REPMGR_CONNECTION *));
@@ -151,7 +167,8 @@ int __repmgr_write_some __P((ENV *, REPMGR_CONNECTION *));
int __repmgr_stat_pp __P((DB_ENV *, DB_REPMGR_STAT **, u_int32_t));
int __repmgr_stat_print_pp __P((DB_ENV *, u_int32_t));
int __repmgr_stat_print __P((ENV *, u_int32_t));
-int __repmgr_site_list __P((DB_ENV *, u_int *, DB_REPMGR_SITE **));
+int __repmgr_site_list_pp __P((DB_ENV *, u_int *, DB_REPMGR_SITE **));
+int __repmgr_site_list_int __P((ENV *, u_int *, DB_REPMGR_SITE **));
#ifndef HAVE_REPLICATION_THREADS
int __repmgr_close __P((ENV *));
#endif
@@ -162,6 +179,18 @@ int __repmgr_get_ack_policy __P((DB_ENV *, int *));
int __repmgr_set_ack_policy __P((DB_ENV *, int));
#endif
#ifndef HAVE_REPLICATION_THREADS
+int __repmgr_get_incoming_queue_max __P((DB_ENV *, u_int32_t *, u_int32_t *));
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+int __repmgr_set_incoming_queue_max __P((DB_ENV *, u_int32_t, u_int32_t));
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+int __repmgr_get_incoming_queue_redzone __P((DB_ENV *, u_int32_t *, u_int32_t *));
+#endif
+#ifndef HAVE_REPLICATION_THREADS
+int __repmgr_get_incoming_queue_fullevent __P((DB_ENV *, int *));
+#endif
+#ifndef HAVE_REPLICATION_THREADS
int __repmgr_site __P((DB_ENV *, const char *, u_int, DB_SITE **, u_int32_t));
#endif
#ifndef HAVE_REPLICATION_THREADS
@@ -171,10 +200,10 @@ int __repmgr_site_by_eid __P((DB_ENV *, int, DB_SITE **));
int __repmgr_local_site __P((DB_ENV *, DB_SITE **));
#endif
#ifndef HAVE_REPLICATION_THREADS
-int __repmgr_site_list __P((DB_ENV *, u_int *, DB_REPMGR_SITE **));
+int __repmgr_site_list_pp __P((DB_ENV *, u_int *, DB_REPMGR_SITE **));
#endif
#ifndef HAVE_REPLICATION_THREADS
-int __repmgr_start __P((DB_ENV *, int, u_int32_t));
+int __repmgr_start_pp __P((DB_ENV *, int, u_int32_t));
#endif
#ifndef HAVE_REPLICATION_THREADS
int __repmgr_stat_pp __P((DB_ENV *, DB_REPMGR_STAT **, u_int32_t));
@@ -213,8 +242,8 @@ int __repmgr_thread_failure __P((ENV *, int));
char *__repmgr_format_eid_loc __P((DB_REP *, REPMGR_CONNECTION *, char *));
char *__repmgr_format_site_loc __P((REPMGR_SITE *, char *));
char *__repmgr_format_addr_loc __P((repmgr_netaddr_t *, char *));
-int __repmgr_repstart __P((ENV *, u_int32_t));
-int __repmgr_become_master __P((ENV *));
+int __repmgr_repstart __P((ENV *, u_int32_t, u_int32_t));
+int __repmgr_become_master __P((ENV *, u_int32_t));
int __repmgr_each_connection __P((ENV *, CONNECTION_ACTION, void *, int));
int __repmgr_open __P((ENV *, void *));
int __repmgr_join __P((ENV *, void *));
@@ -225,9 +254,16 @@ int __repmgr_init_new_sites __P((ENV *, int, int));
int __repmgr_failchk __P((ENV *));
int __repmgr_master_is_known __P((ENV *));
int __repmgr_stable_lsn __P((ENV *, DB_LSN *));
+int __repmgr_make_request_conn __P((ENV *, repmgr_netaddr_t *, REPMGR_CONNECTION **));
int __repmgr_send_sync_msg __P((ENV *, REPMGR_CONNECTION *, u_int32_t, u_int8_t *, u_int32_t));
-int __repmgr_marshal_member_list __P((ENV *, u_int8_t **, size_t *));
-int __repmgr_refresh_membership __P((ENV *, u_int8_t *, size_t));
+int __repmgr_read_own_msg __P((ENV *, REPMGR_CONNECTION *, u_int32_t *, u_int8_t **, size_t *));
+int __repmgr_prefmas_connected __P((ENV *));
+int __repmgr_restart_site_as_client __P((ENV *, int));
+int __repmgr_make_site_readonly_master __P((ENV *, int, u_int32_t *, DB_LSN *));
+int __repmgr_lsnhist_match __P((ENV *, DB_THREAD_INFO *, int, int *));
+int __repmgr_prefmas_get_wait __P((ENV *, u_int32_t *, u_long *));
+int __repmgr_marshal_member_list __P((ENV *, u_int32_t, u_int8_t **, size_t *));
+int __repmgr_refresh_membership __P((ENV *, u_int8_t *, size_t, u_int32_t));
int __repmgr_reload_gmdb __P((ENV *));
int __repmgr_gmdb_version_cmp __P((ENV *, u_int32_t, u_int32_t));
int __repmgr_init_save __P((ENV *, DBT *));
@@ -238,10 +274,11 @@ void __repmgr_print_conn_err __P((ENV *, repmgr_netaddr_t *, int));
int __repmgr_become_client __P((ENV *));
REPMGR_SITE *__repmgr_lookup_site __P((ENV *, const char *, u_int));
int __repmgr_find_site __P((ENV *, const char *, u_int, int *));
-int __repmgr_set_membership __P((ENV *, const char *, u_int, u_int32_t));
+int __repmgr_set_membership __P((ENV *, const char *, u_int, u_int32_t, u_int32_t));
int __repmgr_bcast_parm_refresh __P((ENV *));
int __repmgr_chg_prio __P((ENV *, u_int32_t, u_int32_t));
int __repmgr_bcast_own_msg __P((ENV *, u_int32_t, u_int8_t *, size_t));
+int __repmgr_bcast_member_list __P((ENV *));
#if defined(__cplusplus)
}
diff --git a/src/dbinc_auto/sequence_ext.h b/src/dbinc_auto/sequence_ext.h
index a2c114cf..8f8b8473 100644
--- a/src/dbinc_auto/sequence_ext.h
+++ b/src/dbinc_auto/sequence_ext.h
@@ -10,6 +10,10 @@ int __seq_stat __P((DB_SEQUENCE *, DB_SEQUENCE_STAT **, u_int32_t));
int __seq_stat_print __P((DB_SEQUENCE *, u_int32_t));
const FN * __db_get_seq_flags_fn __P((void));
const FN * __db_get_seq_flags_fn __P((void));
+int __seq_open __P((DB_SEQUENCE *, DB_TXN *, DBT *, u_int32_t));
+int __seq_initial_value __P((DB_SEQUENCE *, db_seq_t));
+int __seq_get __P((DB_SEQUENCE *, DB_TXN *, u_int32_t, db_seq_t *, u_int32_t));
+int __seq_close __P((DB_SEQUENCE *, u_int32_t));
#if defined(__cplusplus)
}
diff --git a/src/dbinc_auto/tcl_ext.h b/src/dbinc_auto/tcl_ext.h
index 8b076c8b..4ea037c0 100644
--- a/src/dbinc_auto/tcl_ext.h
+++ b/src/dbinc_auto/tcl_ext.h
@@ -19,9 +19,12 @@ int db_Cmd __P((ClientData, Tcl_Interp *, int, Tcl_Obj * CONST*));
int tcl_CompactStat __P((Tcl_Interp *, DBTCL_INFO *));
int tcl_rep_send __P((DB_ENV *, const DBT *, const DBT *, const DB_LSN *, int, u_int32_t));
int dbc_Cmd __P((ClientData, Tcl_Interp *, int, Tcl_Obj * CONST*));
+int dbstream_Cmd __P((ClientData, Tcl_Interp *, int, Tcl_Obj * CONST*));
int env_Cmd __P((ClientData, Tcl_Interp *, int, Tcl_Obj * CONST*));
int tcl_EnvRemove __P((Tcl_Interp *, int, Tcl_Obj * CONST*));
int tcl_EnvClose __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *, DBTCL_INFO *));
+int tcl_EnvBackup __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_EnvDbBackup __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
int tcl_EnvIdReset __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
int tcl_EnvLsnReset __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
int tcl_EnvVerbose __P((Tcl_Interp *, DB_ENV *, Tcl_Obj *, Tcl_Obj *));
@@ -30,7 +33,8 @@ int tcl_EnvSetFlags __P((Tcl_Interp *, DB_ENV *, Tcl_Obj *, Tcl_Obj *));
int tcl_EnvTest __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
int tcl_EnvGetEncryptFlags __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
void tcl_EnvSetErrfile __P((Tcl_Interp *, DB_ENV *, DBTCL_INFO *, char *));
-void tcl_EnvSetMsgfile __P((Tcl_Interp *, DB_ENV *, DBTCL_INFO *, char *));
+int tcl_EnvSetMsgfile __P((Tcl_Interp *, DB_ENV *, DBTCL_INFO *, char *));
+int tcl_EnvCloseMsgfile __P((Tcl_Interp *, DB_ENV *, DBTCL_INFO *));
int tcl_EnvSetErrpfx __P((Tcl_Interp *, DB_ENV *, DBTCL_INFO *, char *));
int tcl_EnvStatPrint __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
DBTCL_INFO *_NewInfo __P((Tcl_Interp *, void *, char *, enum INFOTYPE));
@@ -73,9 +77,11 @@ int tcl_LogGet __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
int tcl_LogPut __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
int tcl_LogStat __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
int tcl_LogStatPrint __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_LogVerify __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
int logc_Cmd __P((ClientData, Tcl_Interp *, int, Tcl_Obj * CONST*));
int tcl_LogConfig __P((Tcl_Interp *, DB_ENV *, Tcl_Obj *, Tcl_Obj *));
int tcl_LogGetConfig __P((Tcl_Interp *, DB_ENV *, Tcl_Obj *));
+int tcl_LogSetMax __P((Tcl_Interp *, DB_ENV *,Tcl_Obj *,u_int32_t *,u_int32_t *));
void _MpInfoDelete __P((Tcl_Interp *, DBTCL_INFO *));
int tcl_MpSync __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
int tcl_MpTrickle __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
@@ -83,6 +89,7 @@ int tcl_Mp __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *, DBTCL_INFO *));
int tcl_MpStat __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
int tcl_MpStatPrint __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
int tcl_Mutex __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
+int tcl_MutexFailchkTimeout __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
int tcl_MutFree __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
int tcl_MutGet __P((Tcl_Interp *, DB_ENV *, int));
int tcl_MutLock __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *));
diff --git a/src/dbinc_auto/txn_ext.h b/src/dbinc_auto/txn_ext.h
index 7c21455f..2fbcd147 100644
--- a/src/dbinc_auto/txn_ext.h
+++ b/src/dbinc_auto/txn_ext.h
@@ -60,6 +60,7 @@ int __txn_recover __P((ENV *, DB_PREPLIST *, long, long *, u_int32_t));
int __txn_get_prepared __P((ENV *, XID *, DB_PREPLIST *, long, long *, u_int32_t));
int __txn_openfiles __P((ENV *, DB_THREAD_INFO *, DB_LSN *, int));
int __txn_open __P((ENV *));
+int __txn_region_detach __P((ENV *, DB_TXNMGR *));
int __txn_findlastckp __P((ENV *, DB_LSN *, DB_LSN *));
int __txn_env_refresh __P((ENV *));
u_int32_t __txn_region_mutex_count __P((ENV *));
@@ -67,7 +68,7 @@ u_int32_t __txn_region_mutex_max __P((ENV *));
size_t __txn_region_size __P((ENV *));
size_t __txn_region_max __P((ENV *));
int __txn_id_set __P((ENV *, u_int32_t, u_int32_t));
-int __txn_oldest_reader __P((ENV *, DB_LSN *));
+int __txn_get_readers __P((ENV *, DB_LSN **, int *));
int __txn_add_buffer __P((ENV *, TXN_DETAIL *));
int __txn_remove_buffer __P((ENV *, TXN_DETAIL *, db_mutex_t));
int __txn_stat_pp __P((DB_ENV *, DB_TXN_STAT **, u_int32_t));
diff --git a/src/dbreg/dbreg.c b/src/dbreg/dbreg.c
index 5067edac..99a80959 100644
--- a/src/dbreg/dbreg.c
+++ b/src/dbreg/dbreg.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -9,6 +9,7 @@
#include "db_config.h"
#include "db_int.h"
+#include "dbinc/blob.h"
#include "dbinc/db_page.h"
#include "dbinc/log.h"
#include "dbinc/txn.h"
@@ -171,6 +172,7 @@ __dbreg_setup(dbp, fname, dname, create_txnid)
F_SET(fnp, DBREG_EXCL);
fnp->txn_ref = 1;
fnp->mutex = dbp->mutex;
+ fnp->blob_file_id = dbp->blob_file_id;
dbp->log_filename = fnp;
@@ -722,7 +724,7 @@ __dbreg_failchk(env)
MUTEX_LOCK(env, lp->mtx_filelist);
for (fnp = SH_TAILQ_FIRST(&lp->fq, __fname); fnp != NULL; fnp = nnp) {
nnp = SH_TAILQ_NEXT(fnp, q, __fname);
- if (dbenv->is_alive(dbenv,
+ if (dbenv->is_alive(dbenv,
fnp->pid, unused, DB_MUTEX_PROCESS_ONLY))
continue;
MUTEX_LOCK(env, fnp->mutex);
@@ -773,6 +775,7 @@ __dbreg_log_close(env, fnp, txn, op)
DB_LOG *dblp;
DB_LSN r_unused;
int ret;
+ u_int32_t blob_file_lo, blob_file_hi;
dblp = env->lg_handle;
ret = 0;
@@ -788,10 +791,12 @@ __dbreg_log_close(env, fnp, txn, op)
memset(&fid_dbt, 0, sizeof(fid_dbt));
fid_dbt.data = fnp->ufid;
fid_dbt.size = DB_FILE_ID_LEN;
+ SET_LO_HI_VAR(fnp->blob_file_id, blob_file_lo, blob_file_hi);
if ((ret = __dbreg_register_log(env, txn, &r_unused,
F_ISSET(fnp, DB_FNAME_DURABLE) ? 0 : DB_LOG_NOT_DURABLE,
op, dbtp, &fid_dbt, fnp->id,
- fnp->s_type, fnp->meta_pgno, TXN_INVALID)) != 0) {
+ fnp->s_type, fnp->meta_pgno, TXN_INVALID, blob_file_lo,
+ blob_file_hi)) != 0) {
/*
* We are trying to close, but the log write failed.
* Unfortunately, close needs to plow forward, because
@@ -958,6 +963,7 @@ __dbreg_log_id(dbp, txn, id, needlock)
LOG *lp;
u_int32_t op;
int i, ret;
+ u_int32_t blob_file_lo, blob_file_hi;
env = dbp->env;
dblp = env->lg_handle;
@@ -996,14 +1002,16 @@ __dbreg_log_id(dbp, txn, id, needlock)
fid_dbt.size = DB_FILE_ID_LEN;
op = !F_ISSET(dbp, DB_AM_OPEN_CALLED) ? DBREG_PREOPEN :
- (F_ISSET(dbp, DB_AM_INMEM) ?
+ (F_ISSET(dbp, DB_AM_INMEM) ?
(F2_ISSET(dbp, DB2_AM_EXCL) ? DBREG_XREOPEN : DBREG_REOPEN):
(F2_ISSET(dbp, DB2_AM_EXCL) ? DBREG_XOPEN : DBREG_OPEN));
+ SET_LO_HI_VAR(fnp->blob_file_id, blob_file_lo, blob_file_hi);
ret = __dbreg_register_log(env, txn, &unused,
F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0,
op | F_ISSET(fnp, DB_FNAME_DBREG_MASK),
r_name.size == 0 ? NULL : &r_name, &fid_dbt, id,
- fnp->s_type, fnp->meta_pgno, fnp->create_txnid);
+ fnp->s_type, fnp->meta_pgno, fnp->create_txnid,
+ blob_file_lo, blob_file_hi);
if (needlock)
MUTEX_UNLOCK(env, lp->mtx_filelist);
diff --git a/src/dbreg/dbreg.src b/src/dbreg/dbreg.src
index c7740d63..3187bc4f 100644
--- a/src/dbreg/dbreg.src
+++ b/src/dbreg/dbreg.src
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -26,7 +26,7 @@ INCLUDE
* ftype: database type
* id: transaction id of the subtransaction that created the fs object
*/
-BEGIN register 42 2
+BEGIN_COMPAT register 42 2
DBOP opcode u_int32_t lu
DBT name DBT s
DBT uid DBT s
@@ -35,3 +35,26 @@ ARG ftype DBTYPE lx
ARG meta_pgno db_pgno_t lu
ARG id u_int32_t lx
END
+
+/*
+ * Used for registering name/id translations at open or close.
+ * opcode: register or unregister
+ * name: file name
+ * uid: unique file id
+ * fileid: dbreg id of the file
+ * ftype: database type
+ * id: transaction id of the subtransaction that created the fs object
+ * blob_fid_lo/hi: The blob file directory id
+ */
+BEGIN register 61 2
+DBOP opcode u_int32_t lu
+DBT name DBT s
+DBT uid DBT s
+ARG fileid int32_t ld
+ARG ftype DBTYPE lx
+ARG meta_pgno db_pgno_t lu
+ARG id u_int32_t lx
+ARG blob_fid_lo u_int32_t lu
+ARG blob_fid_hi u_int32_t lu
+END
+
diff --git a/src/dbreg/dbreg_auto.c b/src/dbreg/dbreg_auto.c
index a26e5527..3d9f01c7 100644
--- a/src/dbreg/dbreg_auto.c
+++ b/src/dbreg/dbreg_auto.c
@@ -8,6 +8,16 @@
#include "dbinc/db_am.h"
#include "dbinc/txn.h"
+DB_LOG_RECSPEC __dbreg_register_42_desc[] = {
+ {LOGREC_DBOP, SSZ(__dbreg_register_42_args, opcode), "opcode", ""},
+ {LOGREC_DBT, SSZ(__dbreg_register_42_args, name), "name", ""},
+ {LOGREC_DBT, SSZ(__dbreg_register_42_args, uid), "uid", ""},
+ {LOGREC_ARG, SSZ(__dbreg_register_42_args, fileid), "fileid", "%ld"},
+ {LOGREC_ARG, SSZ(__dbreg_register_42_args, ftype), "ftype", "%lx"},
+ {LOGREC_ARG, SSZ(__dbreg_register_42_args, meta_pgno), "meta_pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__dbreg_register_42_args, id), "id", "%lx"},
+ {LOGREC_Done, 0, "", ""}
+};
DB_LOG_RECSPEC __dbreg_register_desc[] = {
{LOGREC_DBOP, SSZ(__dbreg_register_args, opcode), "opcode", ""},
{LOGREC_DBT, SSZ(__dbreg_register_args, name), "name", ""},
@@ -16,6 +26,8 @@ DB_LOG_RECSPEC __dbreg_register_desc[] = {
{LOGREC_ARG, SSZ(__dbreg_register_args, ftype), "ftype", "%lx"},
{LOGREC_ARG, SSZ(__dbreg_register_args, meta_pgno), "meta_pgno", "%lu"},
{LOGREC_ARG, SSZ(__dbreg_register_args, id), "id", "%lx"},
+ {LOGREC_ARG, SSZ(__dbreg_register_args, blob_fid_lo), "blob_fid_lo", "%lu"},
+ {LOGREC_ARG, SSZ(__dbreg_register_args, blob_fid_hi), "blob_fid_hi", "%lu"},
{LOGREC_Done, 0, "", ""}
};
/*
diff --git a/src/dbreg/dbreg_autop.c b/src/dbreg/dbreg_autop.c
index ea43addd..931bc2d9 100644
--- a/src/dbreg/dbreg_autop.c
+++ b/src/dbreg/dbreg_autop.c
@@ -10,6 +10,23 @@
#include "dbinc/txn.h"
/*
+ * __dbreg_register_42_print --
+ * Print a dbreg register log record that uses the older "42" record
+ * layout, by handing it to __log_print_record together with the
+ * __dbreg_register_42_desc field table. The db_recops argument is
+ * not used.
+ *
+ * PUBLIC: int __dbreg_register_42_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__dbreg_register_42_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__dbreg_register_42", __dbreg_register_42_desc, info));
+}
+
+/*
* PUBLIC: int __dbreg_register_print __P((ENV *, DBT *, DB_LSN *,
* PUBLIC: db_recops, void *));
*/
diff --git a/src/dbreg/dbreg_rec.c b/src/dbreg/dbreg_rec.c
index 1b387bb7..066efa03 100644
--- a/src/dbreg/dbreg_rec.c
+++ b/src/dbreg/dbreg_rec.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright (c) 1995, 1996
@@ -37,12 +37,16 @@
#include "db_config.h"
#include "db_int.h"
+#include "dbinc/blob.h"
#include "dbinc/db_page.h"
#include "dbinc/db_am.h"
#include "dbinc/txn.h"
static int __dbreg_open_file __P((ENV *,
DB_TXN *, __dbreg_register_args *, void *));
+static int __dbreg_register_recover_int
+ __P((ENV *, DBT *, db_recops, void *, __dbreg_register_args *));
+
/*
* PUBLIC: int __dbreg_register_recover
* PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
@@ -56,21 +60,97 @@ __dbreg_register_recover(env, dbtp, lsnp, op, info)
void *info;
{
__dbreg_register_args *argp;
+ int ret;
+
+ argp = NULL;
+
+ if ((ret = __dbreg_register_read(env, dbtp->data, &argp)) != 0)
+ goto out;
+
+ ret = __dbreg_register_recover_int(env, dbtp, op, info, argp);
+
+ if (ret == 0)
+ *lsnp = argp->prev_lsn;
+out: if (argp != NULL)
+ __os_free(env, argp);
+ return (ret);
+}
+
+/*
+ * __dbreg_register_42_recover --
+ * Recovery function for dbreg register records written in the older
+ * "42" layout, which carries no blob file id. The record is read with
+ * __dbreg_register_42_read, copied into a current-format
+ * __dbreg_register_args with both blob_fid halves set to 0, and then
+ * processed by __dbreg_register_recover_int. On success *lsnp is set
+ * to the record's prev_lsn. The structure allocated by the read is
+ * freed before returning. Returns 0 on success, otherwise the error
+ * from the read or from the internal recovery routine.
+ *
+ * PUBLIC: int __dbreg_register_42_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__dbreg_register_42_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __dbreg_register_42_args *argp;
+ __dbreg_register_args arg;
+ int ret;
+
+ argp = NULL;
+ if ((ret = __dbreg_register_42_read(env, dbtp->data, &argp)) != 0)
+ goto err;
+
+ /*
+ * Databases before 6.0 cannot support blobs, so the blob_fid is 0.
+ * After 6.0 they can support blobs, so it is possible it has a non-0
+ * blob_fid, but since logging that value in dbreg_register
+ * is only used in replication, and replication does not support blobs
+ * until 6.1, this is safe.
+ *
+ * NOTE(review): the memcpy below assumes __dbreg_register_42_args has
+ * the same leading layout as __dbreg_register_args, so that only the
+ * two trailing blob_fid fields need to be filled in afterwards.
+ * Confirm against the generated structure definitions.
+ */
+ memcpy(&arg, argp, sizeof(__dbreg_register_42_args));
+ arg.blob_fid_lo = 0;
+ arg.blob_fid_hi = 0;
+
+ ret = __dbreg_register_recover_int(env, dbtp, op, info, &arg);
+
+ if (ret == 0)
+ *lsnp = argp->prev_lsn;
+err: if (argp != NULL)
+ __os_free(env, argp);
+ return (ret);
+}
+
+/*
+ * Internal register recovery function for both the 42 log version and the
+ * 61 log version.
+ */
+static int
+__dbreg_register_recover_int(env, dbtp, op, info, argp)
+ ENV *env;
+ DBT *dbtp;
+ db_recops op;
+ void *info;
+ __dbreg_register_args *argp;
+{
DB_ENTRY *dbe;
DB_LOG *dblp;
DB *dbp;
u_int32_t opcode, status;
int do_close, do_open, do_rem, ret, t_ret;
+#ifdef HAVE_REPLICATION
+ DB_REP *db_rep;
+ DELAYED_BLOB_LIST *dbl;
+ int view_partial;
+
+ dbl = NULL;
+#endif
dblp = env->lg_handle;
dbp = NULL;
+ ret = 0;
#ifdef DEBUG_RECOVER
REC_PRINT(__dbreg_register_print);
+#else
+ COMPQUIET(dbtp, NULL);
#endif
do_open = do_close = 0;
- if ((ret = __dbreg_register_read(env, dbtp->data, &argp)) != 0)
- goto out;
opcode = FLD_ISSET(argp->opcode, DBREG_OP_MASK);
switch (opcode) {
@@ -123,12 +203,54 @@ __dbreg_register_recover(env, dbtp, lsnp, op, info)
}
if (do_open) {
+#ifdef HAVE_REPLICATION
+ /*
+ * Partial replication may apply at this time. Invoke
+ * the callback if several conditions are met:
+ * - We are a view.
+ * - This is the OPENFILES pass of recovery.
+ * - The file is not a BDB owned database.
+ * - The dbreg operation is a create (id != TXN_INVALID).
+ *
+ * If the file is to be skipped, then we have to TXN_IGNORE
+ * the txnlist for that create operation.
+ */
+ if (IS_VIEW_SITE(env) && op == DB_TXN_OPENFILES &&
+ (!IS_DB_FILE(argp->name.data) ||
+ IS_BLOB_META(argp->name.data)) &&
+ argp->id != TXN_INVALID) {
+ db_rep = env->rep_handle;
+ /*
+ * Once a view, always a view. Must have set
+ * a callback already.
+ */
+ if (db_rep->partial == NULL) {
+ __db_errx(env, DB_STR("1592",
+ "Must set a view callback."));
+ ret = EINVAL;
+ goto out;
+ }
+ if ((ret = __rep_call_partial(env,
+ argp->name.data, &view_partial, 0, &dbl)) != 0)
+ goto out;
+ DB_ASSERT(env, dbl == NULL);
+
+ /*
+ * If this should not be replicated, then set
+ * the child txnlist to TXN_IGNORE.
+ */
+ if (view_partial == 0 &&
+ (ret = __db_txnlist_update(env, info,
+ argp->id, TXN_IGNORE, NULL, &status, 1)) != 0)
+ goto out;
+ }
+#endif
/*
* We must open the db even if the meta page is not
* yet written as we may be creating subdatabase.
*/
- if (op == DB_TXN_OPENFILES && opcode != DBREG_CHKPNT
- && opcode != DBREG_XCHKPNT)
+ if (op == DB_TXN_OPENFILES && opcode != DBREG_CHKPNT &&
+ opcode != DBREG_XCHKPNT)
F_SET(dblp, DBLOG_FORCE_OPEN);
/*
@@ -205,7 +327,7 @@ __dbreg_register_recover(env, dbtp, lsnp, op, info)
if (dbe->dbp == NULL && !dbe->deleted) {
/* No valid entry here. Nothing to do. */
MUTEX_UNLOCK(env, dblp->mtx_dbreg);
- goto done;
+ goto out;
}
/* We have either an open entry or a deleted entry. */
@@ -273,11 +395,7 @@ __dbreg_register_recover(env, dbtp, lsnp, op, info)
}
}
}
-done: if (ret == 0)
- *lsnp = argp->prev_lsn;
-out: if (argp != NULL)
- __os_free(env, argp);
- return (ret);
+out: return (ret);
}
/*
@@ -296,11 +414,13 @@ __dbreg_open_file(env, txn, argp, info)
DB *dbp;
DB_ENTRY *dbe;
DB_LOG *dblp;
+ db_seq_t blob_file_id;
u_int32_t id, opcode, status;
int ret;
dblp = env->lg_handle;
opcode = FLD_ISSET(argp->opcode, DBREG_OP_MASK);
+ ret = 0;
/*
* When we're opening, we have to check that the name we are opening
@@ -336,7 +456,7 @@ __dbreg_open_file(env, txn, argp, info)
* bit and try to open it again.
*/
if ((dbp = dbe->dbp) != NULL) {
- if (opcode == DBREG_REOPEN ||
+ if (opcode == DBREG_REOPEN ||
opcode == DBREG_XREOPEN ||
!F_ISSET(dbp, DB_AM_OPEN_CALLED) ||
dbp->meta_pgno != argp->meta_pgno ||
@@ -393,7 +513,11 @@ reopen:
txn->mgrp = env->tx_handle;
}
- return (__dbreg_do_open(env,
- txn, dblp, argp->uid.data, argp->name.data, argp->ftype,
- argp->fileid, argp->meta_pgno, info, argp->id, opcode));
+ GET_LO_HI(env,
+ argp->blob_fid_lo, argp->blob_fid_hi, blob_file_id, ret);
+ if (ret != 0)
+ return (ret);
+ return (__dbreg_do_open(env, txn, dblp, argp->uid.data,
+ argp->name.data, argp->ftype, argp->fileid,
+ argp->meta_pgno, info, argp->id, opcode, blob_file_id));
}
diff --git a/src/dbreg/dbreg_stat.c b/src/dbreg/dbreg_stat.c
index 6dfb3869..ad4bbdc2 100644
--- a/src/dbreg/dbreg_stat.c
+++ b/src/dbreg/dbreg_stat.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/dbreg/dbreg_util.c b/src/dbreg/dbreg_util.c
index 80de4d91..0d483f93 100644
--- a/src/dbreg/dbreg_util.c
+++ b/src/dbreg/dbreg_util.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -9,6 +9,7 @@
#include "db_config.h"
#include "db_int.h"
+#include "dbinc/blob.h"
#include "dbinc/db_page.h"
#include "dbinc/db_am.h"
#include "dbinc/fop.h"
@@ -103,6 +104,7 @@ __dbreg_log_files(env, opcode)
LOG *lp;
u_int32_t lopcode;
int ret;
+ u_int32_t blob_file_hi, blob_file_lo;
dblp = env->lg_handle;
lp = dblp->reginfo.primary;
@@ -137,11 +139,12 @@ __dbreg_log_files(env, opcode)
lopcode = opcode;
if ( opcode == DBREG_CHKPNT && F_ISSET(fnp, DBREG_EXCL))
lopcode = DBREG_XCHKPNT;
+ SET_LO_HI_VAR(fnp->blob_file_id, blob_file_lo, blob_file_hi);
if ((ret = __dbreg_register_log(env, NULL, &r_unused,
F_ISSET(fnp, DB_FNAME_DURABLE) ? 0 : DB_LOG_NOT_DURABLE,
lopcode | F_ISSET(fnp, DB_FNAME_DBREG_MASK),
dbtp, &fid_dbt, fnp->id, fnp->s_type, fnp->meta_pgno,
- TXN_INVALID)) != 0)
+ TXN_INVALID, blob_file_lo, blob_file_hi)) != 0)
break;
}
@@ -429,7 +432,7 @@ __dbreg_id_to_db(env, txn, dbpp, ndx, tryopen)
if ((ret = __dbreg_do_open(env, txn, dblp,
fname->ufid, name, fname->s_type, ndx, fname->meta_pgno,
NULL, TXN_INVALID, F_ISSET(fname, DB_FNAME_INMEM) ?
- DBREG_REOPEN : DBREG_OPEN)) != 0)
+ DBREG_REOPEN : DBREG_OPEN, fname->blob_file_id)) != 0)
return (ret);
*dbpp = dblp->dbentry[ndx].dbp;
@@ -540,6 +543,53 @@ __dbreg_fid_to_fname(dblp, fid, have_lock, fnamep)
}
/*
+ * __dbreg_blob_file_to_fname --
+ * Traverse the shared-memory list of database file names, looking for
+ * the entry that matches the passed blob file id. Returns 0 and stores
+ * the matching entry in *fnamep on success; returns -1, leaving *fnamep
+ * unwritten, if blob_file_id is 0 or no entry in the list matches.
+ *
+ * If have_lock is zero the file list mutex (lp->mtx_filelist) is locked
+ * around the traversal and unlocked afterwards; if it is non-zero no
+ * locking is done here (presumably the caller already holds the mutex).
+ *
+ * PUBLIC: int __dbreg_blob_file_to_fname
+ * PUBLIC: __P((DB_LOG *, db_seq_t, int, FNAME **));
+ */
+int
+__dbreg_blob_file_to_fname(dblp, blob_file_id, have_lock, fnamep)
+ DB_LOG *dblp;
+ db_seq_t blob_file_id;
+ int have_lock;
+ FNAME **fnamep;
+{
+ ENV *env;
+ FNAME *fnp;
+ LOG *lp;
+ int ret;
+
+ env = dblp->env;
+ lp = dblp->reginfo.primary;
+
+ ret = -1;
+
+ /*
+ * If blob_file_id is 0 then blobs are not enabled and the value is
+ * not unique, so there is nothing meaningful to look up: report
+ * "not found" without touching the list or the mutex.
+ */
+ if (blob_file_id == 0)
+ return (ret);
+
+ if (!have_lock)
+ MUTEX_LOCK(env, lp->mtx_filelist);
+ SH_TAILQ_FOREACH(fnp, &lp->fq, q, __fname)
+ if (fnp->blob_file_id == blob_file_id) {
+ *fnamep = fnp;
+ ret = 0;
+ break;
+ }
+ if (!have_lock)
+ MUTEX_UNLOCK(env, lp->mtx_filelist);
+
+ return (ret);
+}
+
+/*
* __dbreg_get_name
*
* Interface to get name of registered files. This is mainly diagnostic
@@ -577,14 +627,14 @@ __dbreg_get_name(env, fid, fnamep, dnamep)
* is not protected by the thread mutex.
* PUBLIC: int __dbreg_do_open __P((ENV *,
* PUBLIC: DB_TXN *, DB_LOG *, u_int8_t *, char *, DBTYPE,
- * PUBLIC: int32_t, db_pgno_t, void *, u_int32_t, u_int32_t));
+ * PUBLIC: int32_t, db_pgno_t, void *, u_int32_t, u_int32_t, db_seq_t));
*/
int
-__dbreg_do_open(env,
- txn, lp, uid, name, ftype, ndx, meta_pgno, info, id, opcode)
+__dbreg_do_open(env, txn,
+ dblp, uid, name, ftype, ndx, meta_pgno, info, id, opcode, blob_file_id)
ENV *env;
DB_TXN *txn;
- DB_LOG *lp;
+ DB_LOG *dblp;
u_int8_t *uid;
char *name;
DBTYPE ftype;
@@ -592,6 +642,7 @@ __dbreg_do_open(env,
db_pgno_t meta_pgno;
void *info;
u_int32_t id, opcode;
+ db_seq_t blob_file_id;
{
DB *dbp;
u_int32_t cstat, ret_stat;
@@ -604,7 +655,7 @@ __dbreg_do_open(env,
try_inmem = 0;
retry_inmem:
- if ((ret = __db_create_internal(&dbp, lp->env, 0)) != 0)
+ if ((ret = __db_create_internal(&dbp, dblp->env, 0)) != 0)
return (ret);
/*
@@ -700,7 +751,7 @@ err: if (cstat == TXN_UNEXPECTED)
* handling those cases specially, above.
*/
if (try_inmem == 0 &&
- opcode != DBREG_PREOPEN && opcode != DBREG_REOPEN &&
+ opcode != DBREG_PREOPEN && opcode != DBREG_REOPEN &&
opcode != DBREG_XREOPEN) {
if ((ret = __db_close(dbp, NULL, DB_NOSYNC)) != 0)
return (ret);
@@ -725,6 +776,7 @@ err: if (cstat == TXN_UNEXPECTED)
* we are closing a non-existent file and need to mark
* it as deleted.
*/
+ dbp->blob_file_id = blob_file_id;
if (dbp->log_filename == NULL &&
(ret = __dbreg_setup(dbp, name, NULL, id)) != 0)
return (ret);
@@ -736,7 +788,8 @@ not_right:
return (ret == 0 ? t_ret : ret);
/* Add this file as deleted. */
- if ((t_ret = __dbreg_add_dbentry(env, lp, NULL, ndx)) != 0 && ret == 0)
+ if ((t_ret = __dbreg_add_dbentry(env, dblp, NULL, ndx)) != 0 &&
+ ret == 0)
ret = t_ret;
return (ret);
}
diff --git a/src/env/env_alloc.c b/src/env/env_alloc.c
index 700bfb27..9c8fd046 100644
--- a/src/env/env_alloc.c
+++ b/src/env/env_alloc.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/env/env_backup.c b/src/env/env_backup.c
index 9c79dbb4..2940f44b 100644
--- a/src/env/env_backup.c
+++ b/src/env/env_backup.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2011, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/env/env_config.c b/src/env/env_config.c
index 57496909..56cebb63 100644
--- a/src/env/env_config.c
+++ b/src/env/env_config.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -84,8 +84,10 @@ static const CFG_DESC config_descs[] = {
{ "rep_set_clockskew", CFG_2UINT, __rep_set_clockskew },
{ "rep_set_limit", CFG_2UINT, __rep_set_limit },
{ "rep_set_nsites", CFG_UINT, __rep_set_nsites_pp },
- { "rep_set_priority", CFG_UINT, __rep_set_priority },
+ { "rep_set_priority", CFG_UINT, __rep_set_priority_pp },
{ "rep_set_request", CFG_2UINT, __rep_set_request },
+ { "set_blob_dir", CFG_STRING, __env_set_blob_dir },
+ { "set_blob_threshold", CFG_2UINT, __env_set_blob_threshold },
{ "set_cache_max", CFG_2UINT, __memp_set_cache_max },
{ "set_create_dir", CFG_STRING, __env_set_create_dir },
{ "set_data_dir", CFG_STRING, __env_set_data_dir },
@@ -133,11 +135,16 @@ static const FN config_rep_config[] = {
{ DB_REP_CONF_AUTOROLLBACK, "db_rep_conf_autorollback" },
{ DB_REP_CONF_BULK, "db_rep_conf_bulk" },
{ DB_REP_CONF_DELAYCLIENT, "db_rep_conf_delayclient" },
+ { DB_REP_CONF_ELECT_LOGLENGTH, "db_rep_conf_elect_loglength" },
{ DB_REP_CONF_INMEM, "db_rep_conf_inmem" },
{ DB_REP_CONF_LEASE, "db_rep_conf_lease" },
{ DB_REP_CONF_NOWAIT, "db_rep_conf_nowait" },
{ DB_REPMGR_CONF_2SITE_STRICT, "db_repmgr_conf_2site_strict" },
{ DB_REPMGR_CONF_ELECTIONS, "db_repmgr_conf_elections" },
+ { DB_REPMGR_CONF_PREFMAS_CLIENT,
+ "db_repmgr_conf_prefmas_client" },
+ { DB_REPMGR_CONF_PREFMAS_MASTER,
+ "db_repmgr_conf_prefmas_master" },
{ 0, NULL }
};
@@ -198,7 +205,9 @@ static const FN config_set_flags_forlog[] = {
{ DB_LOG_DIRECT, "db_direct_log" },
{ DB_LOG_DSYNC, "db_dsync_log" },
{ DB_LOG_AUTO_REMOVE, "db_log_autoremove" },
+ { DB_LOG_BLOB, "db_log_blob" },
{ DB_LOG_IN_MEMORY, "db_log_inmemory" },
+ { DB_LOG_NOSYNC, "db_log_nosync" },
{ 0, NULL }
};
@@ -206,7 +215,9 @@ static const FN config_log_set_config[] = {
{ DB_LOG_DIRECT, "db_log_direct" },
{ DB_LOG_DSYNC, "db_log_dsync" },
{ DB_LOG_AUTO_REMOVE, "db_log_auto_remove" },
+ { DB_LOG_BLOB, "db_log_blob" },
{ DB_LOG_IN_MEMORY, "db_log_in_memory" },
+ { DB_LOG_NOSYNC, "db_log_nosync" },
{ DB_LOG_ZERO, "db_log_zero" },
{ 0, NULL }
};
@@ -237,6 +248,7 @@ static const FN config_set_verbose[] = {
{ DB_VERB_DEADLOCK, "db_verb_deadlock" },
{ DB_VERB_FILEOPS, "db_verb_fileops" },
{ DB_VERB_FILEOPS_ALL, "db_verb_fileops_all" },
+ { DB_VERB_MVCC, "db_verb_mvcc" },
{ DB_VERB_RECOVERY, "db_verb_recovery" },
{ DB_VERB_REGISTER, "db_verb_register" },
{ DB_VERB_REPLICATION, "db_verb_replication" },
@@ -462,7 +474,7 @@ format: __db_errx(env, DB_STR_A("1584",
if ((lv1 = __db_name_to_val(config_rep_timeout, argv[1])) == -1)
goto format;
CFG_GET_UINT32(argv[2], &uv2);
- return (__rep_set_timeout(dbenv, lv1, (db_timeout_t)uv2));
+ return (__rep_set_timeout_pp(dbenv, lv1, (db_timeout_t)uv2));
}
/* repmgr_set_ack_policy db_repmgr_acks_XXX */
@@ -475,6 +487,15 @@ format: __db_errx(env, DB_STR_A("1584",
return (__repmgr_set_ack_policy(dbenv, lv1));
}
+ if (strcasecmp(argv[0], "repmgr_set_incoming_queue_max") == 0) {
+ if (nf != 3)
+ goto format;
+ CFG_GET_UINT32(argv[1], &uv1);
+ CFG_GET_UINT32(argv[2], &uv2);
+ return (__repmgr_set_incoming_queue_max(
+ dbenv, (u_int32_t)uv1, (u_int32_t)uv2));
+ }
+
/*
* Configure name/value pairs of config information for a site (local or
* remote).
@@ -503,7 +524,7 @@ format: __db_errx(env, DB_STR_A("1584",
uv2 = 0;
else
CFG_GET_UINT32(argv[i + 1], &uv2);
- if ((ret = __repmgr_site_config(site,
+ if ((ret = __repmgr_site_config_int(site,
(u_int32_t)lv1, (u_int32_t)uv2)) != 0)
break;
}
@@ -630,6 +651,15 @@ format: __db_errx(env, DB_STR_A("1584",
dbenv, DB_REGION_INIT, lv1 == 0 ? 0 : 1));
}
+ /* set_mutex_failchk_timeout <unsigned timeout> */
+ if (strcasecmp(argv[0], "set_mutex_failchk_timeout") == 0) {
+ if (nf != 2)
+ goto format;
+ CFG_GET_UINT32(argv[1], &uv1);
+ return (__env_set_timeout(
+ dbenv, (u_int32_t)uv1, DB_SET_MUTEX_FAILCHK_TIMEOUT));
+ }
+
/* set_reg_timeout <unsigned timeout> */
if (strcasecmp(argv[0], "set_reg_timeout") == 0) {
if (nf != 2)
diff --git a/src/env/env_failchk.c b/src/env/env_failchk.c
index 05752f07..ad9bed0b 100644
--- a/src/env/env_failchk.c
+++ b/src/env/env_failchk.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -22,9 +22,26 @@ static int __env_in_api __P((ENV *));
static void __env_clear_state __P((ENV *));
/*
+ * FAILCHK_PROCESS_ERROR(t_ret, ret) --
+ * When failchk broadcast is enabled continue after the first error, to try to
+ * find all of them; without broadcasting stop at the first failure.
+ *
+ * With HAVE_FAILCHK_BROADCAST the macro copies t_ret into ret only when
+ * t_ret is non-zero and ret is still 0, so the first error is the one kept.
+ * Without it the macro assigns t_ret to ret unconditionally and, if the
+ * result is non-zero, jumps to a label named "err", which must therefore
+ * exist in the function using the macro.
+ *
+ * NOTE(review): both expansions are bare "if" statements with no
+ * do { } while (0) wrapper, so an "else" written after an invocation would
+ * bind to the macro's own "if". Do not follow an invocation with "else".
+ */
+#ifdef HAVE_FAILCHK_BROADCAST
+#define FAILCHK_PROCESS_ERROR(t_ret, ret) \
+ if ((t_ret) != 0 && (ret) == 0) (ret) = (t_ret)
+#else
+#define FAILCHK_PROCESS_ERROR(t_ret, ret) \
+ if (((ret) = (t_ret)) != 0) goto err
+#endif
+
+/*
* __env_failchk_pp --
* ENV->failchk pre/post processing.
*
+ * Single process failchk continues after recoverable failures but stops as
+ * soon as recovery is required. Broadcast failchks continue even after
+ * DB_RUNRECOVERY failures are detected, to maximize the chance of
+ * waking up processes blocked on dead resources, e.g. mutexes.
+ *
* PUBLIC: int __env_failchk_pp __P((DB_ENV *, u_int32_t));
*/
int
@@ -46,7 +63,7 @@ __env_failchk_pp(dbenv, flags)
*/
if (!ALIVE_ON(env)) {
__db_errx(env, DB_STR("1503",
- "DB_ENV->failchk requires DB_ENV->is_alive be configured"));
+ "DB_ENV->failchk requires DB_ENV->is_alive be configured"));
return (EINVAL);
}
@@ -59,10 +76,14 @@ __env_failchk_pp(dbenv, flags)
ENV_LEAVE(env, ip);
return (ret);
}
+
/*
* __env_failchk_int --
* Process the subsystem failchk routines
*
+ * The FAILCHK_PROCESS_ERROR macro (defined at the top of this file)
+ * differs between the broadcast and single process versions of failchk.
+ *
* PUBLIC: int __env_failchk_int __P((DB_ENV *));
*/
int
@@ -70,42 +91,52 @@ __env_failchk_int(dbenv)
DB_ENV *dbenv;
{
ENV *env;
- int ret;
+ int ret, t_ret;
env = dbenv->env;
+ ret = 0;
F_SET(dbenv, DB_ENV_FAILCHK);
/*
- * We check for dead threads in the API first as this would be likely
- * to hang other things we try later, like locks and transactions.
+ * We check for dead threads in the API first as this would likely
+ * hang other things we try later, like locks and transactions.
*/
- if ((ret = __env_in_api(env)) != 0)
+ if ((ret = __env_in_api(env)) != 0) {
+ __db_err(env, ret, "__env_in_api");
goto err;
+ }
- if (LOCKING_ON(env) && (ret = __lock_failchk(env)) != 0)
- goto err;
+ if (LOCKING_ON(env) && (t_ret = __lock_failchk(env)) != 0)
+ FAILCHK_PROCESS_ERROR(t_ret, ret);
- if (TXN_ON(env) &&
- ((ret = __txn_failchk(env)) != 0 ||
- (ret = __dbreg_failchk(env)) != 0))
- goto err;
+ if (TXN_ON(env) && ret == 0 && ((t_ret = __txn_failchk(env)) != 0 ||
+ (t_ret = __dbreg_failchk(env)) != 0))
+ FAILCHK_PROCESS_ERROR(t_ret, ret);
- if ((ret = __memp_failchk(env)) != 0)
- goto err;
+ if ((t_ret = __memp_failchk(env)) != 0)
+ FAILCHK_PROCESS_ERROR(t_ret, ret);
#ifdef HAVE_REPLICATION_THREADS
- if (REP_ON(env) && (ret = __repmgr_failchk(env)) != 0)
- goto err;
+ if (REP_ON(env) && (t_ret = __repmgr_failchk(env)) != 0)
+ FAILCHK_PROCESS_ERROR(t_ret, ret);
#endif
- /* Mark any dead blocked threads as dead. */
- __env_clear_state(env);
+err:
#ifdef HAVE_MUTEX_SUPPORT
- ret = __mut_failchk(env);
+ if ((t_ret = __mutex_failchk(env)) != 0 && ret == 0)
+ ret = t_ret;
#endif
-err: F_CLR(dbenv, DB_ENV_FAILCHK);
+ /* Any dead blocked thread slots are no longer needed; allow reuse. */
+ if (ret == 0)
+ __env_clear_state(env);
+ if (ret == DB_RUNRECOVERY) {
+ /* Announce a panic; avoid __env_panic()'s diag core dump. */
+ __env_panic_set(env, 1);
+ __env_panic_event(env, ret);
+ }
+ F_CLR(dbenv, DB_ENV_FAILCHK);
return (ret);
}
@@ -312,7 +343,8 @@ __env_in_api(env)
REGINFO *infop;
THREAD_INFO *thread;
u_int32_t i;
- int unpin, ret;
+ pid_t pid;
+ int unpin, ret, t_ret;
if ((htab = env->thr_hashtab) == NULL)
return (EINVAL);
@@ -322,10 +354,13 @@ __env_in_api(env)
renv = infop->primary;
thread = R_ADDR(infop, renv->thread_off);
unpin = 0;
+ ret = 0;
for (i = 0; i < env->thr_nbucket; i++)
SH_TAILQ_FOREACH(ip, &htab[i], dbth_links, __db_thread_info) {
+ pid = ip->dbth_pid;
if (ip->dbth_state == THREAD_SLOT_NOT_IN_USE ||
+ ip->dbth_state == THREAD_BLOCKED_DEAD ||
(ip->dbth_state == THREAD_OUT &&
thread->thr_count < thread->thr_max))
continue;
@@ -341,26 +376,63 @@ __env_in_api(env)
ip->dbth_state = THREAD_SLOT_NOT_IN_USE;
continue;
}
- return (__db_failed(env, DB_STR("1507",
+ /*
+ * The above tests are not atomic, so it is possible that
+ * the process pointed to by ip has changed during the tests.
+ * In particular, if the process that ip pointed to when
+ * is_alive was executed has since terminated normally, a new
+ * process may reuse the same ip structure and change its
+ * dbth_state before the next two tests were performed.
+ * Therefore, we need to test here that all four tests above
+ * were done on the same process. If the process pointed to by
+ * ip changed, all tests are invalid and can be ignored.
+ * Similarly, it's also possible for two processes to race to
+ * change the dbth_state of the same ip structure. For example,
+ * both process A and B reach the above test for the same
+ * terminated process C where C's dbth_state is THREAD_OUT.
+ * If A goes into the 'if' block and changes C's dbth_state to
+ * THREAD_SLOT_NOT_IN_USE before B checks the condition, B
+ * would incorrectly fail the test and run into this line.
+ * Therefore, we need to check C's dbth_state again and fail
+ * the db only if C's dbth_state is indeed THREAD_ACTIVE.
+ */
+ if (ip->dbth_state != THREAD_ACTIVE || ip->dbth_pid != pid)
+ continue;
+ __os_gettime(env, &ip->dbth_failtime, 0);
+ t_ret = __db_failed(env, DB_STR("1507",
"Thread died in Berkeley DB library"),
- ip->dbth_pid, ip->dbth_tid));
+ ip->dbth_pid, ip->dbth_tid);
+ if (ret == 0)
+ ret = t_ret;
+ /*
+ * Classic failchk stops after one dead thread in the
+ * api, but broadcasting looks for all.
+ */
+#ifndef HAVE_FAILCHK_BROADCAST
+ return (ret);
+#endif
}
if (unpin == 0)
- return (0);
+ return (ret);
for (i = 0; i < env->thr_nbucket; i++)
SH_TAILQ_FOREACH(ip, &htab[i], dbth_links, __db_thread_info)
if (ip->dbth_state == THREAD_BLOCKED_DEAD &&
- (ret = __memp_unpin_buffers(env, ip)) != 0)
+ (t_ret = __memp_unpin_buffers(env, ip)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+#ifndef HAVE_FAILCHK_BROADCAST
return (ret);
+#endif
+ }
- return (0);
+ return (ret);
}
/*
* __env_clear_state --
- * Look for threads which died while blockedi and clear them..
+ * Look for threads which died while blocked and clear them.
*/
static void
__env_clear_state(env)
@@ -441,6 +513,9 @@ __env_set_state(env, ipp, state)
#endif
}
+ /* A failchk thread must not block on a lock -- that would be faulty. */
+ if (state == THREAD_BLOCKED && ip != NULL)
+ DB_ASSERT(env, ip->dbth_state != THREAD_FAILCHK);
/*
* If ipp is not null, return the thread control block if found.
* Check to ensure the thread of control has been registered.
@@ -457,7 +532,9 @@ __env_set_state(env, ipp, state)
*ipp = NULL;
ret = 0;
- if (ip == NULL) {
+ if (ip != NULL)
+ ip->dbth_state = state;
+ else {
infop = env->reginfo;
renv = infop->primary;
thread = R_ADDR(infop, renv->thread_off);
@@ -503,11 +580,13 @@ __env_set_state(env, ipp, state)
init: ip->dbth_pid = id.pid;
ip->dbth_tid = id.tid;
ip->dbth_state = state;
+ for (indx = 0; indx != MUTEX_STATE_MAX; indx++)
+ ip->dbth_latches[indx].mutex = MUTEX_INVALID;
SH_TAILQ_INIT(&ip->dbth_xatxn);
}
MUTEX_UNLOCK(env, renv->mtx_regenv);
- } else
- ip->dbth_state = state;
+ }
+
*ipp = ip;
DB_ASSERT(env, ret == 0);
@@ -535,7 +614,7 @@ __env_thread_id_string(dbenv, pid, tid, buf)
#ifdef UINT64_FMT
char fmt[20];
- snprintf(fmt, sizeof(fmt), "%s/%s", UINT64_FMT, UINT64_FMT);
+ snprintf(fmt, sizeof(fmt), "%s/%s", INT64_FMT, UINT64_FMT);
snprintf(buf,
DB_THREADID_STRLEN, fmt, (u_int64_t)pid, (u_int64_t)(uintptr_t)tid);
#else
diff --git a/src/env/env_file.c b/src/env/env_file.c
index b102404d..d6e29b21 100644
--- a/src/env/env_file.c
+++ b/src/env/env_file.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2002, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2002, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -12,7 +12,7 @@
/*
* __db_file_extend --
- * Initialize a regular file by writing the last page of the file.
+ * Initialize or extend a regular file by writing to its last page.
*
* PUBLIC: int __db_file_extend __P((ENV *, DB_FH *, size_t));
*/
@@ -27,7 +27,19 @@ __db_file_extend(env, fhp, size)
u_int32_t relative;
int ret;
char buf;
+#ifdef HAVE_MMAP_EXTEND
+ unsigned pagesize;
+ /*
+ * Round up size to the VM pagesize. If it isn't aligned, then the bytes
+ * ending the mapping might have no corresponding backing location on
+ * disk, and could be silently lost when the process exits. [#23290]
+ */
+ if (F_ISSET(fhp, DB_FH_REGION)) {
+ pagesize = (unsigned)getpagesize();
+ size = DB_ALIGN(size, pagesize);
+ }
+#endif
buf = '\0';
/*
* Extend the file by writing the last page. If the region is >4Gb,
diff --git a/src/env/env_globals.c b/src/env/env_globals.c
index 955e6738..2d665661 100644
--- a/src/env/env_globals.c
+++ b/src/env/env_globals.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -31,14 +31,21 @@ DB_GLOBALS __db_global_values = {
"=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=", /* db_line */
{ 0 }, /* error_buf */
- 0, /* uid_init */
- 0, /* rand_next */
+ 0, /* random_seeded */
+#if defined(HAVE_RANDOM_R)
+ { 0 }, /* random_r random_data */
+ { 0 }, /* random_r state */
+#elif !defined(HAVE_RAND) && !defined(HAVE_RANDOM)
+ 0, /* rand/srand value */
+#endif
0, /* fid_serial */
0, /* db_errno */
- 0, /* num_active_pids */
- 0, /* size_active_pids */
- NULL, /* active_pids */
NULL, /* saved_errstr */
+ "%m/%d %H:%M:%S", /* strftime format for dates */
+#if defined(HAVE_ERROR_HISTORY)
+ 0, /* thread local msgs_key */
+ PTHREAD_ONCE_INIT, /* pthread_once initializer */
+#endif
NULL, /* j_assert */
NULL, /* j_close */
NULL, /* j_dirfree */
diff --git a/src/env/env_method.c b/src/env/env_method.c
index 63deacea..c246febc 100644
--- a/src/env/env_method.c
+++ b/src/env/env_method.c
@@ -1,9 +1,9 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved.
*
- * $Id: env_method.c,v dabaaeb7d839 2010/08/03 17:28:53 mike $
+ * $Id$
*/
#include "db_config.h"
@@ -40,6 +40,7 @@ static int __env_get_app_dispatch
__P((DB_ENV *, int (**)(DB_ENV *, DBT *, DB_LSN *, db_recops)));
static int __env_set_app_dispatch
__P((DB_ENV *, int (*)(DB_ENV *, DBT *, DB_LSN *, db_recops)));
+static int __env_get_blob_dir __P((DB_ENV *, const char **));
static int __env_set_event_notify
__P((DB_ENV *, void (*)(DB_ENV *, u_int32_t, void *)));
static int __env_get_feedback __P((DB_ENV *, void (**)(DB_ENV *, int, int)));
@@ -81,6 +82,11 @@ db_env_create(dbenvpp, flags)
if (flags != 0)
return (EINVAL);
+#ifdef HAVE_ERROR_HISTORY
+ /* Call thread local storage initializer at least once per process. */
+ __db_thread_init();
+#endif
+
/* Allocate the DB_ENV and ENV structures -- we always have both. */
if ((ret = __os_calloc(NULL, 1, sizeof(DB_ENV), &dbenv)) != 0)
return (ret);
@@ -159,7 +165,7 @@ __db_env_init(dbenv)
*/
/* DB_ENV PUBLIC HANDLE LIST BEGIN */
dbenv->add_data_dir = __env_add_data_dir;
- dbenv->backup = __db_backup;
+ dbenv->backup = __db_backup_pp;
dbenv->dbbackup = __db_dbbackup_pp;
dbenv->cdsgroup_begin = __cdsgroup_begin_pp;
dbenv->close = __env_close_pp;
@@ -175,6 +181,8 @@ __db_env_init(dbenv)
dbenv->get_cachesize = __memp_get_cachesize;
dbenv->get_backup_callbacks = __env_get_backup_callbacks;
dbenv->get_backup_config = __env_get_backup_config;
+ dbenv->get_blob_dir = __env_get_blob_dir;
+ dbenv->get_blob_threshold = __env_get_blob_threshold_pp;
dbenv->get_create_dir = __env_get_create_dir;
dbenv->get_data_dirs = __env_get_data_dirs;
dbenv->get_data_len = __env_get_data_len;
@@ -269,7 +277,7 @@ __db_env_init(dbenv)
dbenv->open = __env_open_pp;
dbenv->remove = __env_remove;
dbenv->rep_elect = __rep_elect_pp;
- dbenv->rep_flush = __rep_flush;
+ dbenv->rep_flush = __rep_flush_pp;
dbenv->rep_get_clockskew = __rep_get_clockskew;
dbenv->rep_get_config = __rep_get_config;
dbenv->rep_get_limit = __rep_get_limit;
@@ -282,29 +290,34 @@ __db_env_init(dbenv)
dbenv->rep_set_config = __rep_set_config;
dbenv->rep_set_limit = __rep_set_limit;
dbenv->rep_set_nsites = __rep_set_nsites_pp;
- dbenv->rep_set_priority = __rep_set_priority;
+ dbenv->rep_set_priority = __rep_set_priority_pp;
dbenv->rep_set_request = __rep_set_request;
- dbenv->rep_set_timeout = __rep_set_timeout;
+ dbenv->rep_set_timeout = __rep_set_timeout_pp;
dbenv->rep_set_transport = __rep_set_transport_pp;
+ dbenv->rep_set_view = __rep_set_view;
dbenv->rep_start = __rep_start_pp;
dbenv->rep_stat = __rep_stat_pp;
dbenv->rep_stat_print = __rep_stat_print_pp;
dbenv->rep_sync = __rep_sync;
dbenv->repmgr_channel = __repmgr_channel;
dbenv->repmgr_get_ack_policy = __repmgr_get_ack_policy;
+ dbenv->repmgr_get_incoming_queue_max = __repmgr_get_incoming_queue_max;
dbenv->repmgr_local_site = __repmgr_local_site;
dbenv->repmgr_msg_dispatch = __repmgr_set_msg_dispatch;
dbenv->repmgr_set_ack_policy = __repmgr_set_ack_policy;
+ dbenv->repmgr_set_incoming_queue_max = __repmgr_set_incoming_queue_max;
dbenv->repmgr_site = __repmgr_site;
dbenv->repmgr_site_by_eid = __repmgr_site_by_eid;
- dbenv->repmgr_site_list = __repmgr_site_list;
- dbenv->repmgr_start = __repmgr_start;
+ dbenv->repmgr_site_list = __repmgr_site_list_pp;
+ dbenv->repmgr_start = __repmgr_start_pp;
dbenv->repmgr_stat = __repmgr_stat_pp;
dbenv->repmgr_stat_print = __repmgr_stat_print_pp;
dbenv->set_alloc = __env_set_alloc;
dbenv->set_app_dispatch = __env_set_app_dispatch;
dbenv->set_backup_callbacks = __env_set_backup_callbacks;
dbenv->set_backup_config = __env_set_backup_config;
+ dbenv->set_blob_dir = __env_set_blob_dir;
+ dbenv->set_blob_threshold = __env_set_blob_threshold;
dbenv->set_cache_max = __memp_set_cache_max;
dbenv->set_cachesize = __memp_set_cachesize;
dbenv->set_create_dir = __env_set_create_dir;
@@ -370,10 +383,11 @@ __db_env_init(dbenv)
dbenv->thread_id = __os_id;
dbenv->thread_id_string = __env_thread_id_string;
+ dbenv->mutex_failchk_timeout = US_PER_SEC;
+
env = dbenv->env;
__os_id(NULL, &env->pid_cache, NULL);
- env->db_ref = 0;
env->log_verify_wrap = __log_verify_wrap;
env->data_len = ENV_DEF_DATA_LEN;
TAILQ_INIT(&env->fdlist);
@@ -561,6 +575,97 @@ __env_get_memory_init(dbenv, type, countp)
}
/*
+ * __env_get_blob_threshold_pp --
+ * Get the blob threshold for the environment. Any data item larger
+ * than the blob threshold is automatically saved as a blob file.
+ *
+ * This is the pre/post processing entry point: it brackets a call to
+ * __env_get_blob_threshold_int with ENV_ENTER/ENV_LEAVE and returns
+ * that function's result. The threshold is stored through bytes.
+ *
+ * PUBLIC: int __env_get_blob_threshold_pp
+ * PUBLIC: __P ((DB_ENV *, u_int32_t *));
+ */
+int
+__env_get_blob_threshold_pp(dbenv, bytes)
+ DB_ENV *dbenv;
+ u_int32_t *bytes;
+{
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_ENTER(env, ip);
+ ret = __env_get_blob_threshold_int(env, bytes);
+ ENV_LEAVE(env, ip);
+
+ return (ret);
+}
+
+/*
+ * __env_get_blob_threshold_int --
+ * Get the blob threshold for the environment. Any data item larger
+ * than the blob threshold is automatically saved as a blob file.
+ *
+ * Once the environment has been opened (ENV_OPEN_CALLED is set) the
+ * value is read from the REGENV structure in the environment region
+ * while holding renv->mtx_regenv; before that it is read from the
+ * DB_ENV handle's own blob_threshold field, with no locking. The
+ * value is stored through bytes and the function always returns 0.
+ *
+ * PUBLIC: int __env_get_blob_threshold_int
+ * PUBLIC: __P ((ENV *, u_int32_t *));
+ */
+int
+__env_get_blob_threshold_int(env, bytes)
+ ENV *env;
+ u_int32_t *bytes;
+{
+ REGENV *renv;
+ REGINFO *infop;
+
+ if (F_ISSET(env, ENV_OPEN_CALLED)) {
+ infop = env->reginfo;
+ renv = infop->primary;
+ MUTEX_LOCK(env, renv->mtx_regenv);
+ *bytes = renv->blob_threshold;
+ MUTEX_UNLOCK(env, renv->mtx_regenv);
+ } else
+ *bytes = env->dbenv->blob_threshold;
+
+ return (0);
+}
+
+/*
+ * __env_set_blob_threshold --
+ * Set the default blob threshold for the environment. Any data item larger
+ * than the blob threshold is automatically saved as a blob file.
+ *
+ * flags is validated with __db_fchk against an allowed mask of 0; if that
+ * check fails EINVAL is returned and nothing is changed. Otherwise, once
+ * the environment has been opened (ENV_OPEN_CALLED is set) the new value is
+ * written to the REGENV structure in the environment region while holding
+ * renv->mtx_regenv, inside ENV_ENTER/ENV_LEAVE; before open it is written
+ * only to the DB_ENV handle's blob_threshold field. Returns 0 on success.
+ *
+ * PUBLIC: int __env_set_blob_threshold __P((DB_ENV *, u_int32_t, u_int32_t));
+ */
+int
+__env_set_blob_threshold(dbenv, bytes, flags)
+ DB_ENV *dbenv;
+ u_int32_t bytes;
+ u_int32_t flags;
+{
+ ENV *env;
+ REGENV *renv;
+ REGINFO *infop;
+ DB_THREAD_INFO *ip;
+
+ env = dbenv->env;
+
+ if (__db_fchk(dbenv->env, "DB_ENV->set_blob_threshold", flags, 0) != 0)
+ return (EINVAL);
+
+ if (F_ISSET(env, ENV_OPEN_CALLED)) {
+ infop = env->reginfo;
+ renv = infop->primary;
+ ENV_ENTER(env, ip);
+ MUTEX_LOCK(env, renv->mtx_regenv);
+ renv->blob_threshold = bytes;
+ MUTEX_UNLOCK(env, renv->mtx_regenv);
+ ENV_LEAVE(env, ip);
+ } else
+ dbenv->blob_threshold = bytes;
+
+ return (0);
+}
+
+/*
* __env_set_memory_init --
* DB_ENV->set_memory_init.
*
@@ -697,6 +802,43 @@ __env_set_app_dispatch(dbenv, app_dispatch)
}
/*
+ * __env_set_blob_dir --
+ * API to allow the user to override the default blob file
+ * root directory. Must be set if blobs are enabled and an
+ * unnamed environment is created.
+ *
+ * Rejected via ENV_ILLEGAL_AFTER_OPEN once the environment is open.
+ * Any directory string already stored in dbenv->db_blob_dir is freed
+ * and replaced with a copy of dir made by __os_strdup, whose return
+ * value is this function's return value.
+ *
+ * NOTE(review): dir is handed to __os_strdup unchecked, and the old
+ * string is freed before the copy is made -- presumably callers never
+ * pass NULL or the handle's own current string; confirm.
+ *
+ * PUBLIC: int __env_set_blob_dir __P((DB_ENV *, const char *));
+ */
+int
+__env_set_blob_dir(dbenv, dir)
+ DB_ENV *dbenv;
+ const char *dir;
+{
+ ENV *env;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_blob_dir");
+
+ if (dbenv->db_blob_dir != NULL)
+ __os_free(env, dbenv->db_blob_dir);
+ return (__os_strdup(env, dir, &dbenv->db_blob_dir));
+}
+
+/*
+ * __env_get_blob_dir --
+ * Get the blob file root directory.
+ *
+ * Stores the handle's db_blob_dir pointer through dirp and returns 0.
+ * The pointer refers to the handle's own string, not a copy, and is
+ * whatever db_blob_dir currently holds (the setter treats NULL as
+ * "no directory stored yet").
+ */
+static int
+__env_get_blob_dir(dbenv, dirp)
+ DB_ENV *dbenv;
+ const char **dirp;
+{
+ *dirp = dbenv->db_blob_dir;
+ return (0);
+}
+
+/*
* __env_get_encrypt_flags --
* {DB_ENV,DB}->get_encrypt_flags.
*
@@ -1061,6 +1203,10 @@ __env_set_backup(env, on)
return (EINVAL);
}
+ /*
+ * This code does not need env_rep_enter for the checkpoint because
+ * it can only happen if there is an active bulk txn.
+ */
if (needs_checkpoint && (ret = __txn_checkpoint(env, 0, 0, 0)))
return (ret);
return (0);
@@ -1244,6 +1390,11 @@ __env_set_data_len(dbenv, data_len)
DB_ENV *dbenv;
u_int32_t data_len;
{
+ if (data_len == 0) {
+ __db_errx(dbenv->env, DB_STR("1593",
+"Maximum number of bytes to display for each key/data item can not be 0."));
+ return (EINVAL);
+ }
dbenv->env->data_len = data_len;
return (0);
@@ -1720,6 +1871,7 @@ __env_get_verbose(dbenv, which, onoffp)
case DB_VERB_DEADLOCK:
case DB_VERB_FILEOPS:
case DB_VERB_FILEOPS_ALL:
+ case DB_VERB_MVCC:
case DB_VERB_RECOVERY:
case DB_VERB_REGISTER:
case DB_VERB_REPLICATION:
@@ -1758,6 +1910,7 @@ __env_set_verbose(dbenv, which, on)
case DB_VERB_DEADLOCK:
case DB_VERB_FILEOPS:
case DB_VERB_FILEOPS_ALL:
+ case DB_VERB_MVCC:
case DB_VERB_RECOVERY:
case DB_VERB_REGISTER:
case DB_VERB_REPLICATION:
@@ -1888,9 +2041,15 @@ __env_get_timeout(dbenv, timeoutp, flags)
int ret;
ret = 0;
- if (flags == DB_SET_REG_TIMEOUT) {
+ if (flags == DB_SET_REG_TIMEOUT)
*timeoutp = dbenv->envreg_timeout;
- } else
+ else if (flags == DB_SET_MUTEX_FAILCHK_TIMEOUT)
+#ifdef HAVE_FAILCHK_BROADCAST
+ *timeoutp = dbenv->mutex_failchk_timeout;
+#else
+ ret = USR_ERR(dbenv->env, DB_OPNOTSUP);
+#endif
+ else
ret = __lock_get_env_timeout(dbenv, timeoutp, flags);
return (ret);
}
@@ -1912,6 +2071,12 @@ __env_set_timeout(dbenv, timeout, flags)
ret = 0;
if (flags == DB_SET_REG_TIMEOUT)
dbenv->envreg_timeout = timeout;
+ else if (flags == DB_SET_MUTEX_FAILCHK_TIMEOUT)
+#ifdef HAVE_FAILCHK_BROADCAST
+ dbenv->mutex_failchk_timeout = timeout;
+#else
+ ret = USR_ERR(dbenv->env, DB_OPNOTSUP);
+#endif
else
ret = __lock_set_env_timeout(dbenv, timeout, flags);
return (ret);
diff --git a/src/env/env_name.c b/src/env/env_name.c
index a3a0b371..d0dd5635 100644
--- a/src/env/env_name.c
+++ b/src/env/env_name.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -9,6 +9,7 @@
#include "db_config.h"
#include "db_int.h"
+#include "dbinc/blob.h"
static int __db_fullpath
__P((ENV *, const char *, const char *, int, int, char **));
@@ -122,7 +123,7 @@ __db_appname(env, appname, file, dirp, namep)
{
DB_ENV *dbenv;
char **ddp;
- const char *dir;
+ const char *blob_dir, *dir;
int ret;
dbenv = env->dbenv;
@@ -141,6 +142,8 @@ __db_appname(env, appname, file, dirp, namep)
/*
* DB_APP_NONE:
* DB_HOME/file
+ * DB_APP_BLOB:
+ * DB_HOME/DB_BLOB_DIR/file
* DB_APP_DATA:
* DB_HOME/DB_DATA_DIR/file
* DB_APP_LOG:
@@ -151,6 +154,12 @@ __db_appname(env, appname, file, dirp, namep)
switch (appname) {
case DB_APP_NONE:
break;
+ case DB_APP_BLOB:
+ if (dbenv != NULL && dbenv->db_blob_dir != NULL)
+ dir = dbenv->db_blob_dir;
+ else
+ dir = BLOB_DEFAULT_DIR;
+ break;
case DB_APP_RECOVER:
case DB_APP_DATA:
/*
@@ -164,6 +173,13 @@ __db_appname(env, appname, file, dirp, namep)
/* Second, look in the environment home directory. */
DB_CHECKFILE(file, NULL, 1, 0, namep, dirp);
+ /* Third, check the blob directory. */
+ if (dbenv != NULL && dbenv->db_blob_dir != NULL)
+ blob_dir = dbenv->db_blob_dir;
+ else
+ blob_dir = BLOB_DEFAULT_DIR;
+ DB_CHECKFILE(file, blob_dir, 1, 0, namep, dirp);
+
/*
* Otherwise, we're going to create. Use the specified
* directory unless we're in recovery and it doesn't exist.
diff --git a/src/env/env_open.c b/src/env/env_open.c
index 7eddca3a..85189369 100644
--- a/src/env/env_open.c
+++ b/src/env/env_open.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -107,10 +107,16 @@ __env_open_pp(dbenv, db_home, flags, mode)
__db_errx(env, DB_STR("1589", "DB_PRIVATE is not "
"supported by 64-bit applications in "
"mixed-size-addressing mode"));
- return (EINVAL);
- }
+ return (EINVAL);
+ }
#endif
+ if (LF_ISSET(DB_PRIVATE) && PREFMAS_IS_SET(env)) {
+ __db_errx(env, DB_STR("1594", "DB_PRIVATE is not "
+ "supported in Replication Manager preferred master mode"));
+ return (EINVAL);
+ }
+
return (__env_open(dbenv, db_home, flags, mode));
}
@@ -129,12 +135,20 @@ __env_open(dbenv, db_home, flags, mode)
{
DB_THREAD_INFO *ip;
ENV *env;
- u_int32_t orig_flags;
- int register_recovery, ret, t_ret;
+ u_int32_t orig_flags, retry_flags;
+ int recovery_failed, register_recovery, ret, t_ret;
+ char *old_passwd;
+ size_t old_passwd_len;
+ u_int32_t old_encrypt_flags;
ip = NULL;
env = dbenv->env;
+ recovery_failed = 1;
register_recovery = 0;
+ retry_flags = 0;
+ old_passwd = NULL;
+ old_passwd_len = 0;
+ old_encrypt_flags = 0;
/* Initial configuration. */
if ((ret = __env_config(dbenv, db_home, &flags, mode)) != 0)
@@ -171,13 +185,27 @@ __env_open(dbenv, db_home, flags, mode)
dbenv->is_alive = __envreg_isalive;
}
- if ((ret =
- __envreg_register(env, &register_recovery, flags)) != 0)
+ /*
+ * Back up the current key, because it will be consumed by
+ * __envreg_register below.
+ */
+ if (dbenv->passwd != NULL) {
+ if ((ret = __os_strdup(env, dbenv->passwd, &old_passwd)) != 0)
+ goto err;
+ old_passwd_len = dbenv->passwd_len;
+ (void)__env_get_encrypt_flags(dbenv, &old_encrypt_flags);
+ }
+
+ F_SET(dbenv, DB_ENV_NOPANIC);
+ ret = __envreg_register(env, &register_recovery, flags);
+ dbenv->flags = orig_flags;
+ if (ret != 0)
goto err;
if (register_recovery) {
if (!LF_ISSET(DB_RECOVER)) {
__db_errx(env, DB_STR("1567",
"The DB_RECOVER flag was not specified, and recovery is needed"));
+ recovery_failed = 0;
ret = DB_RUNRECOVERY;
goto err;
}
@@ -197,16 +225,27 @@ __env_open(dbenv, db_home, flags, mode)
* want to remove files left over for any reason, from any session.
*/
retry: if (LF_ISSET(DB_RECOVER | DB_RECOVER_FATAL))
+ if (
#ifdef HAVE_REPLICATION
- if ((ret = __rep_reset_init(env)) != 0 ||
- (ret = __env_remove_env(env)) != 0 ||
-#else
- if ((ret = __env_remove_env(env)) != 0 ||
+ (ret = __rep_reset_init(env)) != 0 ||
#endif
- (ret = __env_refresh(dbenv, orig_flags, 0)) != 0)
+ (ret = __env_remove_env(env)) != 0 ||
+ (ret = __env_refresh(dbenv,
+ orig_flags | retry_flags, 0)) != 0)
goto err;
- if ((ret = __env_attach_regions(dbenv, flags, orig_flags, 1)) != 0)
+	/*
+	 * Restore the database key from the copy saved before registration.
+	 *
+	 * This statement sits below the retry: label, so it can be reached a
+	 * second time via "goto retry".  The saved copy is therefore scrubbed,
+	 * freed and then forgotten; leaving the stale pointer in place would
+	 * hand freed memory to __env_set_encrypt and free it twice.
+	 */
+	if (LF_ISSET(DB_REGISTER) && old_passwd != NULL) {
+		ret = __env_set_encrypt(dbenv, old_passwd, old_encrypt_flags);
+		/* Overwrite all but the final byte before releasing the copy. */
+		memset(old_passwd, 0xff, old_passwd_len - 1);
+		__os_free(env, old_passwd);
+		old_passwd = NULL;
+		if (ret != 0)
+			goto err;
+	}
+
+ DB_ASSERT(env, ret == 0);
+ if ((ret = __env_attach_regions(dbenv,
+ flags, orig_flags | retry_flags, 1)) != 0)
goto err;
/*
@@ -216,8 +255,18 @@ retry: if (LF_ISSET(DB_RECOVER | DB_RECOVER_FATAL))
*/
if (LF_ISSET(DB_FAILCHK) && !register_recovery) {
ENV_ENTER(env, ip);
- if ((ret = __env_failchk_int(dbenv)) != 0)
+ /*
+ * Set the thread state so that any waiting for a potentially
+ * dead thread will call is_alive() in order to avoid hanging.
+ */
+ FAILCHK_THREAD(env, ip);
+ ret = __env_failchk_int(dbenv);
+ if (ret != 0) {
+ __db_err(env, ret,
+ DB_STR("1595",
+ "failchk crash after clean registry"));
goto err;
+ }
ENV_LEAVE(env, ip);
}
@@ -230,12 +279,12 @@ err: if (ret != 0)
* processes can now proceed.
*
* If recovery failed, unregister now and let another process
- * clean up.
+ * clean up and run recovery.
*/
if (ret == 0 && (t_ret = __envreg_xunlock(env)) != 0)
ret = t_ret;
if (ret != 0)
- (void)__envreg_unregister(env, 1);
+ (void)__envreg_unregister(env, recovery_failed);
}
/*
@@ -247,7 +296,11 @@ err: if (ret != 0)
*/
if (ret == DB_RUNRECOVERY && !register_recovery &&
!LF_ISSET(DB_RECOVER) && LF_ISSET(DB_REGISTER)) {
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
+ __db_msg(env, DB_STR("1596",
+ "env_open DB_REGISTER w/o RECOVER panic: trying w/recovery"));
LF_SET(DB_RECOVER);
+ retry_flags = DB_ENV_NOPANIC;
goto retry;
}
@@ -304,6 +357,9 @@ __env_open_arg(dbenv, flags)
"replication requires transaction support"));
return (EINVAL);
}
+ if ((ret =
+ __log_set_config_int(dbenv, DB_LOG_BLOB, 1, 1)) != 0)
+ return (ret);
}
if (LF_ISSET(DB_RECOVER | DB_RECOVER_FATAL)) {
if ((ret = __db_fcchk(env,
@@ -349,30 +405,6 @@ __env_open_arg(dbenv, flags)
}
#endif
-#ifdef HAVE_MUTEX_FCNTL
- /*
- * !!!
- * We need a file descriptor for fcntl(2) locking. We use the file
- * handle from the REGENV file for this purpose.
- *
- * Since we may be using shared memory regions, e.g., shmget(2), and
- * not a mapped-in regular file, the backing file may be only a few
- * bytes in length. So, this depends on the ability to call fcntl to
- * lock file offsets much larger than the actual physical file. I
- * think that's safe -- besides, very few systems actually need this
- * kind of support, SunOS is the only one still in wide use of which
- * I'm aware.
- *
- * The error case is if an application lacks spinlocks and wants to be
- * threaded. That doesn't work because fcntl will lock the underlying
- * process, including all its threads.
- */
- if (F_ISSET(env, ENV_THREAD)) {
- __db_errx(env, DB_STR("1578",
- "architecture lacks fast mutexes: applications cannot be threaded"));
- return (EINVAL);
- }
-#endif
return (ret);
}
@@ -506,7 +538,7 @@ __env_close_pp(dbenv, flags)
{
DB_THREAD_INFO *ip;
ENV *env;
- int rep_check, ret, t_ret;
+ int ret, t_ret;
u_int32_t close_flags, flags_orig;
env = dbenv->env;
@@ -517,65 +549,75 @@ __env_close_pp(dbenv, flags)
* Validate arguments, but as a DB_ENV handle destructor, we can't
* fail.
*/
- if (flags != 0 && flags != DB_FORCESYNC &&
- (t_ret = __db_ferr(env, "DB_ENV->close", 0)) != 0 && ret == 0)
- ret = t_ret;
+#undef OKFLAGS
+#define OKFLAGS (DB_FORCESYNC | DB_FORCESYNCENV)
+
+ ret = __db_fchk(env, "DB_ENV->close", flags, OKFLAGS);
#define DBENV_FORCESYNC 0x00000001
#define DBENV_CLOSE_REPCHECK 0x00000010
- if (flags == DB_FORCESYNC)
+ if (LF_ISSET(DB_FORCESYNC))
close_flags |= DBENV_FORCESYNC;
+ if (LF_ISSET(DB_FORCESYNCENV))
+ F_SET(env, ENV_FORCESYNCENV);
+
+ /*
+ * Call __env_close() to clean up resources even though the open
+ * didn't fully succeed.
+ */
+ if (!F_ISSET(env, ENV_OPEN_CALLED))
+ goto do_close;
/*
* If the environment has panic'd, all we do is try and discard
* the important resources.
*/
if (PANIC_ISSET(env)) {
+ /*
+ * Temporarily set no panic so we do not trigger the
+ * LAST_PANIC_CHECK_BEFORE_IO check in __os_physwrite thus
+ * allowing the unregister to happen correctly.
+ */
+ flags_orig = dbenv->flags;
+ F_SET(dbenv, DB_ENV_NOPANIC);
+ ENV_ENTER(env, ip);
/* clean up from registry file */
- if (dbenv->registry != NULL) {
- /*
- * Temporarily set no panic so we do not trigger the
- * LAST_PANIC_CHECK_BEFORE_IO check in __os_physwr
- * thus allowing the unregister to happen correctly.
- */
- flags_orig = F_ISSET(dbenv, DB_ENV_NOPANIC);
- F_SET(dbenv, DB_ENV_NOPANIC);
+ if (dbenv->registry != NULL)
(void)__envreg_unregister(env, 0);
- dbenv->registry = NULL;
- if (!flags_orig)
- F_CLR(dbenv, DB_ENV_NOPANIC);
- }
/* Close all underlying threads and sockets. */
- if (IS_ENV_REPLICATED(env))
- (void)__repmgr_close(env);
+ (void)__repmgr_close(env);
/* Close all underlying file handles. */
(void)__file_handle_cleanup(env);
+ ENV_LEAVE(env, ip);
+
+ dbenv->flags = flags_orig;
+ (void)__env_region_cleanup(env);
- PANIC_CHECK(env);
+ return (__env_panic_msg(env));
}
ENV_ENTER(env, ip);
- rep_check = IS_ENV_REPLICATED(env) ? 1 : 0;
- if (rep_check) {
#ifdef HAVE_REPLICATION_THREADS
- /*
- * Shut down Replication Manager threads first of all. This
- * must be done before __env_rep_enter to avoid a deadlock that
- * could occur if repmgr's background threads try to do a rep
- * operation that needs __rep_lockout.
- */
- if ((t_ret = __repmgr_close(env)) != 0 && ret == 0)
- ret = t_ret;
+ /*
+ * Shut down Replication Manager threads first of all. This
+ * must be done before __env_rep_enter to avoid a deadlock that
+ * could occur if repmgr's background threads try to do a rep
+ * operation that needs __rep_lockout.
+ */
+ if ((t_ret = __repmgr_close(env)) != 0 && ret == 0)
+ ret = t_ret;
#endif
+ if (IS_ENV_REPLICATED(env)) {
if ((t_ret = __env_rep_enter(env, 0)) != 0 && ret == 0)
ret = t_ret;
+ if (ret == 0)
+ close_flags |= DBENV_CLOSE_REPCHECK;
}
- if (rep_check)
- close_flags |= DBENV_CLOSE_REPCHECK;
+do_close:
if ((t_ret = __env_close(dbenv, close_flags)) != 0 && ret == 0)
ret = t_ret;
@@ -640,8 +682,11 @@ __env_close(dbenv, flags)
t_ret = dbp->alt_close(dbp, close_flags);
else
t_ret = __db_close(dbp, NULL, close_flags);
- if (t_ret != 0 && ret == 0)
- ret = t_ret;
+ if (t_ret != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ break;
+ }
}
/*
@@ -661,10 +706,8 @@ __env_close(dbenv, flags)
#endif
/* If we're registered, clean up. */
- if (dbenv->registry != NULL) {
+ if (dbenv->registry != NULL)
(void)__envreg_unregister(env, 0);
- dbenv->registry = NULL;
- }
/* Check we've closed all underlying file handles. */
if ((t_ret = __file_handle_cleanup(env)) != 0 && ret == 0)
@@ -680,6 +723,9 @@ __env_close(dbenv, flags)
if (dbenv->db_md_dir != NULL)
__os_free(env, dbenv->db_md_dir);
dbenv->db_md_dir = NULL;
+ if (dbenv->db_blob_dir != NULL)
+ __os_free(env, dbenv->db_blob_dir);
+ dbenv->db_blob_dir = NULL;
if (dbenv->db_data_dir != NULL) {
for (p = dbenv->db_data_dir; *p != NULL; ++p)
__os_free(env, *p);
@@ -761,9 +807,7 @@ __env_refresh(dbenv, orig_flags, rep_check)
ret = t_ret;
}
- /* Discard the DB_ENV, ENV handle mutexes. */
- if ((t_ret = __mutex_free(env, &dbenv->mtx_db_env)) != 0 && ret == 0)
- ret = t_ret;
+ /* Discard the ENV handle mutex. */
if ((t_ret = __mutex_free(env, &env->mtx_env)) != 0 && ret == 0)
ret = t_ret;
@@ -936,17 +980,38 @@ __file_handle_cleanup(env)
ENV *env;
{
DB_FH *fhp;
+ DB_MPOOL *dbmp;
+ u_int i;
- if (TAILQ_FIRST(&env->fdlist) == NULL)
+ if (TAILQ_EMPTY(&env->fdlist))
return (0);
- __db_errx(env, DB_STR("1581",
- "File handles still open at environment close"));
+ __db_errx(env,
+ DB_STR("1581", "File handles still open at environment close"));
while ((fhp = TAILQ_FIRST(&env->fdlist)) != NULL) {
- __db_errx(env, DB_STR_A("1582", "Open file handle: %s", "%s"),
- fhp->name);
- (void)__os_closehandle(env, fhp);
+ __db_errx(env,
+ DB_STR_A("1582", "Open file handle: %s", "%s"), fhp->name);
+ if (__os_closehandle(env, fhp) != 0)
+ break;
}
+ if (env->lockfhp != NULL)
+ env->lockfhp = NULL;
+ /* Invalidate saved pointers to the regions' files: all are closed. */
+ if (env->reginfo != NULL)
+ env->reginfo->fhp = NULL;
+ if (env->lg_handle != NULL)
+ env->lg_handle->reginfo.fhp = NULL;
+ if (env->lk_handle != NULL)
+ env->lk_handle->reginfo.fhp = NULL;
+#ifdef HAVE_MUTEX_SUPPORT
+ if (env->mutex_handle != NULL)
+ env->mutex_handle->reginfo.fhp = NULL;
+#endif
+ if (env->tx_handle != NULL)
+ env->tx_handle->reginfo.fhp = NULL;
+ if ((dbmp = env->mp_handle) != NULL && dbmp->reginfo != NULL)
+ for (i = 0; i < env->dbenv->mp_ncache; ++i)
+ dbmp->reginfo[i].fhp = NULL;
return (EINVAL);
}
@@ -1109,11 +1174,9 @@ __env_attach_regions(dbenv, flags, orig_flags, retry_ok)
goto err;
/*
- * Initialize the handle mutexes.
+ * Initialize the handle mutex.
*/
if ((ret = __mutex_alloc(env,
- MTX_ENV_HANDLE, DB_MUTEX_PROCESS_ONLY, &dbenv->mtx_db_env)) != 0 ||
- (ret = __mutex_alloc(env,
MTX_ENV_HANDLE, DB_MUTEX_PROCESS_ONLY, &env->mtx_env)) != 0)
goto err;
@@ -1125,8 +1188,15 @@ __env_attach_regions(dbenv, flags, orig_flags, retry_ok)
goto err;
rep_check = IS_ENV_REPLICATED(env) ? 1 : 0;
- if (rep_check && (ret = __env_rep_enter(env, 0)) != 0)
+ if (rep_check && (ret = __env_rep_enter(env, 0)) != 0) {
+ /*
+ * If we get an error we didn't increment handle_cnt,
+ * so we don't want to decrement it later. Turn off
+ * rep_check here.
+ */
+ rep_check = 0;
goto err;
+ }
if (LF_ISSET(DB_INIT_MPOOL)) {
if ((ret = __memp_open(env, create_ok)) != 0)
diff --git a/src/env/env_recover.c b/src/env/env_recover.c
index 9636554a..fb7ddee7 100644
--- a/src/env/env_recover.c
+++ b/src/env/env_recover.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -18,17 +18,15 @@
#include "dbinc/qam.h"
#include "dbinc/txn.h"
-#ifndef lint
-static const char copyright[] =
- "Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.\n";
-#endif
-
static int __db_log_corrupt __P((ENV *, DB_LSN *));
static int __env_init_rec_42 __P((ENV *));
static int __env_init_rec_43 __P((ENV *));
static int __env_init_rec_46 __P((ENV *));
static int __env_init_rec_47 __P((ENV *));
static int __env_init_rec_48 __P((ENV *));
+static int __env_init_rec_53 __P((ENV *));
+static int __env_init_rec_60 __P((ENV *));
+static int __env_init_rec_60p1 __P((ENV *));
static int __log_earliest __P((ENV *, DB_LOGC *, int32_t *, DB_LSN *));
static double __lsn_diff __P((DB_LSN *, DB_LSN *, DB_LSN *, u_int32_t, int));
@@ -632,6 +630,12 @@ err: if (logc != NULL && (t_ret = __logc_close(logc)) != 0 && ret == 0)
dbenv->tx_timestamp = 0;
+ /*
+ * Failure means that the env has panicked. Disable locking so that the
+ * env can close without its mutex calls causing additional panics.
+ */
+ if (ret != 0)
+ F_SET(env->dbenv, DB_ENV_NOLOCKING);
F_CLR(env->lg_handle, DBLOG_RECOVER);
F_CLR(region, TXN_IN_RECOVERY);
@@ -690,7 +694,8 @@ __lsn_diff(low, high, current, max, is_forward)
* is trying to sync up with a master whose max LSN is less than this
* client's max lsn; we want to roll back everything after that.
*
- * Find the latest checkpoint whose ckp_lsn is less than the max lsn.
+ * Find the latest checkpoint less than or equal to max lsn and
+ * return the ckp_lsn from that checkpoint.
*/
static int
__log_backup(env, logc, max_lsn, start_lsn)
@@ -713,10 +718,11 @@ __log_backup(env, logc, max_lsn, start_lsn)
return (ret);
/*
* Follow checkpoints through the log until
- * we find one with a ckp_lsn less than
- * or equal max_lsn.
+ * we find one less than or equal max_lsn.
+ * Then return the ckp_lsn from that checkpoint as it
+ * is our earliest outstanding txn needed.
*/
- if (LOG_COMPARE(&ckp_args->ckp_lsn, max_lsn) <= 0) {
+ if (LOG_COMPARE(&lsn, max_lsn) <= 0) {
*start_lsn = ckp_args->ckp_lsn;
break;
}
@@ -727,7 +733,7 @@ __log_backup(env, logc, max_lsn, start_lsn)
* done. Break with DB_NOTFOUND.
*/
if (IS_ZERO_LSN(lsn)) {
- ret = DB_NOTFOUND;
+ ret = USR_ERR(env, DB_NOTFOUND);
break;
}
__os_free(env, ckp_args);
@@ -880,6 +886,9 @@ __db_log_corrupt(env, lsnp)
/*
* __env_init_rec --
*
+ * Install recover functions in the environment. Whenever this is updated,
+ * corresponding changes are needed by db_printlog's env_init_print().
+ *
* PUBLIC: int __env_init_rec __P((ENV *, u_int32_t));
*/
int
@@ -924,6 +933,29 @@ __env_init_rec(env, version)
* oldest revision that applies must be used. Therefore we override
* the recovery functions in reverse log version order.
*/
+ if (version == DB_LOGVERSION)
+ goto done;
+
+ /* DB_LOGVERSION_61 added the blob file id to the dbreg logs. */
+ if (version > DB_LOGVERSION_60p1)
+ goto done;
+ if ((ret = __env_init_rec_60p1(env)) != 0)
+ goto err;
+
+ /*
+ * DB_LOGVERSION_60p1 changed the two u_int32_t offset fields in the
+ * log for fop_write_file into a single int64.
+ */
+ if (version > DB_LOGVERSION_60)
+ goto done;
+ if ((ret = __env_init_rec_60(env)) != 0)
+ goto err;
+
+ /* DB_LOGVERSION_53 changed the heap addrem log record. */
+ if (version > DB_LOGVERSION_53)
+ goto done;
+ if ((ret = __env_init_rec_53(env)) != 0)
+ goto err;
/*
* DB_LOGVERSION_53 is a strict superset of DB_LOGVERSION_50.
* So, only check > DB_LOGVERSION_48p2. If/When log records are
@@ -931,6 +963,8 @@ __env_init_rec(env, version)
*/
if (version > DB_LOGVERSION_48p2)
goto done;
+ if (version >= DB_LOGVERSION_50)
+ goto done;
if ((ret = __env_init_rec_48(env)) != 0)
goto err;
/*
@@ -1091,3 +1125,77 @@ __env_init_rec_48(env)
err:
return (ret);
}
+
+/*
+ * __env_init_rec_53 --
+ *	Add __heap_addrem_50_recover to the environment's recovery dispatch
+ *	table under DB___heap_addrem_50.  Returns whatever
+ *	__db_add_recovery_int returns; builds without HAVE_HEAP install
+ *	nothing and return 0.
+ */
+static int
+__env_init_rec_53(env)
+	ENV *env;
+{
+#ifdef HAVE_HEAP
+	return (__db_add_recovery_int(env, &env->recover_dtab,
+	    __heap_addrem_50_recover, DB___heap_addrem_50));
+#else
+	/* Nothing to register; only quiet the unused-parameter warning. */
+	COMPQUIET(env, NULL);
+	return (0);
+#endif
+}
+
+/*
+ * __env_init_rec_60 --
+ *	Add the "_60" variants of the fop recovery functions (create, remove,
+ *	rename, rename_noundo, file_remove, write and write_file) to the
+ *	environment's recovery dispatch table, in that order.  Returns 0 on
+ *	success, or the first non-zero value from __db_add_recovery_int; no
+ *	further functions are added after a failure.
+ */
+static int
+__env_init_rec_60(env)
+	ENV *env;
+{
+	int ret;
+
+	/* The && chain short-circuits at the first registration that fails. */
+	if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
+	    __fop_create_60_recover, DB___fop_create_60)) == 0 &&
+	    (ret = __db_add_recovery_int(env, &env->recover_dtab,
+	    __fop_remove_60_recover, DB___fop_remove_60)) == 0 &&
+	    (ret = __db_add_recovery_int(env, &env->recover_dtab,
+	    __fop_rename_60_recover, DB___fop_rename_60)) == 0 &&
+	    (ret = __db_add_recovery_int(env, &env->recover_dtab,
+	    __fop_rename_noundo_60_recover, DB___fop_rename_noundo_60)) == 0 &&
+	    (ret = __db_add_recovery_int(env, &env->recover_dtab,
+	    __fop_file_remove_60_recover, DB___fop_file_remove_60)) == 0 &&
+	    (ret = __db_add_recovery_int(env, &env->recover_dtab,
+	    __fop_write_60_recover, DB___fop_write_60)) == 0)
+		ret = __db_add_recovery_int(env, &env->recover_dtab,
+		    __fop_write_file_60_recover, DB___fop_write_file_60);
+
+	return (ret);
+}
+
+/*
+ * __env_init_rec_60p1 --
+ *	Add __dbreg_register_42_recover and, when built with HAVE_HEAP,
+ *	__heap_addrem_60_recover to the environment's recovery dispatch
+ *	table.  Returns 0 on success, or the first non-zero value from
+ *	__db_add_recovery_int.
+ */
+static int
+__env_init_rec_60p1(env)
+	ENV *env;
+{
+	int ret;
+
+	ret = __db_add_recovery_int(env, &env->recover_dtab,
+	    __dbreg_register_42_recover, DB___dbreg_register_42);
+#ifdef HAVE_HEAP
+	/* Only attempt the heap registration if the dbreg one succeeded. */
+	if (ret == 0)
+		ret = __db_add_recovery_int(env, &env->recover_dtab,
+		    __heap_addrem_60_recover, DB___heap_addrem_60);
+#endif
+	return (ret);
+}
diff --git a/src/env/env_region.c b/src/env/env_region.c
index 113bea21..cf7085b7 100644
--- a/src/env/env_region.c
+++ b/src/env/env_region.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -90,8 +90,11 @@ loop: renv = NULL;
* it's actually a creation or not, and we'll have to fall-back to a
* join if it's not a create.
*/
- if (F_ISSET(env, ENV_PRIVATE) || DB_GLOBAL(j_region_map) != NULL)
+ if (F_ISSET(env, ENV_PRIVATE) || DB_GLOBAL(j_region_map) != NULL) {
+ DB_DEBUG_MSG(env, "env_attach: creating %s",
+ F_ISSET(env, ENV_PRIVATE) ? "private" : "user map func");
goto creation;
+ }
/*
* Try to create the file, if we have the authority. We have to ensure
@@ -179,14 +182,15 @@ loop: renv = NULL;
* something in the region file other than meta-data and that
* shouldn't happen.
*/
- if (size < sizeof(ref))
+ if (size < sizeof(ref)) {
+ DB_DEBUG_MSG(env, "region size %d is too small", (int)size);
goto retry;
- else {
+ } else {
if (size == sizeof(ref))
F_SET(env, ENV_SYSTEM_MEM);
else if (F_ISSET(env, ENV_SYSTEM_MEM)) {
- ret = EINVAL;
+ ret = USR_ERR(env, EINVAL);
__db_err(env, ret, DB_STR_A("1535",
"%s: existing environment not created in system memory",
"%s"), infop->name);
@@ -197,6 +201,7 @@ loop: renv = NULL;
nrw < (size_t)sizeof(rbuf) ||
(ret = __os_seek(env,
env->lockfhp, 0, 0, rbuf.region_off)) != 0) {
+ ret = USR_ERR(env, ret);
__db_err(env, ret, DB_STR_A("1536",
"%s: unable to read region info", "%s"),
infop->name);
@@ -207,7 +212,8 @@ loop: renv = NULL;
if ((ret = __os_read(env, env->lockfhp, &ref,
sizeof(ref), &nrw)) != 0 || nrw < (size_t)sizeof(ref)) {
if (ret == 0)
- ret = EIO;
+ ret = USR_ERR(env, EIO);
+ (void)USR_ERR(env, ret);
__db_err(env, ret, DB_STR_A("1537",
"%s: unable to read system-memory information",
"%s"), infop->name);
@@ -218,18 +224,16 @@ loop: renv = NULL;
segid = ref.segid;
}
-#ifndef HAVE_MUTEX_FCNTL
/*
- * If we're not doing fcntl locking, we can close the file handle. We
- * no longer need it and the less contact between the buffer cache and
- * the VM, the better.
+ * We no longer need the file handle; the less contact between the
+ * buffer cache and the VM, the better.
*/
(void)__os_closehandle(env, env->lockfhp);
env->lockfhp = NULL;
-#endif
/* Call the region join routine to acquire the region. */
memset(&tregion, 0, sizeof(tregion));
+ tregion.type = REGION_TYPE_ENV;
tregion.size = (roff_t)size;
tregion.max = (roff_t)max;
tregion.segid = segid;
@@ -257,15 +261,15 @@ user_map_functions:
"Program version %d.%d doesn't match environment version %d.%d",
"%d %d %d %d"), DB_VERSION_MAJOR, DB_VERSION_MINOR,
renv->majver, renv->minver);
- ret = DB_VERSION_MISMATCH;
+ ret = USR_ERR(env, DB_VERSION_MISMATCH);
} else
- ret = EINVAL;
+ ret = USR_ERR(env, EINVAL);
goto err;
}
if (renv->signature != signature) {
__db_errx(env, DB_STR("1539",
"Build signature doesn't match environment"));
- ret = DB_VERSION_MISMATCH;
+ ret = USR_ERR(env, DB_VERSION_MISMATCH);
goto err;
}
@@ -287,8 +291,16 @@ user_map_functions:
ret = __env_panic_msg(env);
goto err;
}
- if (renv->magic != DB_REGION_MAGIC)
+ if (renv->magic != DB_REGION_MAGIC) {
+ DB_DEBUG_MSG(env,
+ "attach sees bad region magic 0x%lx", (u_long)renv->magic);
goto retry;
+ }
+
+ if (dbenv->blob_threshold != 0 &&
+ renv->blob_threshold != dbenv->blob_threshold)
+ __db_msg(env, DB_STR("1591",
+"Warning: Ignoring blob_threshold size when joining environment"));
/*
* Get a reference to the underlying REGION information for this
@@ -329,7 +341,7 @@ user_map_functions:
if (*init_flagsp != 0) {
__db_errx(env, DB_STR("1540",
"configured environment flags incompatible with existing environment"));
- ret = EINVAL;
+ ret = USR_ERR(env, EINVAL);
goto err;
}
*init_flagsp = renv->init_flags;
@@ -437,6 +449,8 @@ creation:
renv->minver = (u_int32_t)minver;
renv->patchver = (u_int32_t)patchver;
renv->signature = signature;
+ renv->failure_panic = 0;
+ renv->failure_symptom[0] = '\0';
(void)time(&renv->timestamp);
__os_unique_id(env, &renv->envid);
@@ -447,6 +461,8 @@ creation:
*/
renv->init_flags = (init_flagsp == NULL) ? 0 : *init_flagsp;
+ renv->blob_threshold = dbenv->blob_threshold;
+
/*
* Set up the region array. We use an array rather than a linked list
* as we have to traverse this list after failure in some cases, and
@@ -513,17 +529,14 @@ find_err: __db_errx(env, DB_STR_A("1544",
}
}
-#ifndef HAVE_MUTEX_FCNTL
/*
- * If we're not doing fcntl locking, we can close the file handle. We
- * no longer need it and the less contact between the buffer cache and
- * the VM, the better.
+ * We no longer need the file handle and the less contact between the
+ * buffer cache and the VM, the better.
*/
if (env->lockfhp != NULL) {
(void)__os_closehandle(env, env->lockfhp);
env->lockfhp = NULL;
}
-#endif
/* Everything looks good, we're done. */
env->reginfo = infop;
@@ -550,7 +563,7 @@ retry: /* Close any open file handle. */
(void)__env_sys_detach(env,
infop, F_ISSET(infop, REGION_CREATE));
- if (rp != NULL && F_ISSET(env, DB_PRIVATE))
+ if (rp != NULL && F_ISSET(env, ENV_PRIVATE))
__env_alloc_free(infop, rp);
}
@@ -674,8 +687,23 @@ __env_panic_set(env, on)
ENV *env;
int on;
{
- if (env != NULL && env->reginfo != NULL)
- ((REGENV *)env->reginfo->primary)->panic = on ? 1 : 0;
+ REGENV *renv;
+
+ if (env != NULL && env->reginfo != NULL) {
+ /*
+ * Remember it in the process' env as well, so that the
+ * panic-ness is still known on exit from the final close.
+ */
+ renv = env->reginfo->primary;
+ if (on) {
+ F_SET(env, ENV_REMEMBER_PANIC);
+ if (F_ISSET(env->dbenv, DB_ENV_FAILCHK))
+ renv->failure_panic = 1;
+ }
+ else
+ F_CLR(env, ENV_REMEMBER_PANIC);
+ renv->panic = on ? 1 : 0;
+ }
}
/*
@@ -775,6 +803,31 @@ __env_ref_get(dbenv, countp)
}
/*
+ * __env_region_cleanup --
+ * Detach from any regions, e.g., when closing after a panic.
+ *
+ * PUBLIC: int __env_region_cleanup __P((ENV *));
+ */
+int
+__env_region_cleanup(env)
+	ENV *env;
+{
+	/* Without primary region info there is nothing to detach from. */
+	if (env->reginfo == NULL)
+		return (0);
+
+	/* Best effort: the result of every detach call is discarded. */
+#ifdef HAVE_MUTEX_SUPPORT
+	(void)__lock_region_detach(env, env->lk_handle);
+	(void)__mutex_region_detach(env, env->mutex_handle);
+#endif
+	(void)__log_region_detach(env, env->lg_handle);
+	(void)__memp_region_detach(env, env->mp_handle);
+	(void)__txn_region_detach(env, env->tx_handle);
+	(void)__env_detach(env, 0);
+
+	/* Remember the panic state after detaching. */
+	F_SET(env, ENV_REMEMBER_PANIC);
+	return (0);
+}
+
+/*
* __env_detach --
* Detach from the environment.
*
@@ -796,9 +849,7 @@ __env_detach(env, destroy)
/* Close the locking file handle. */
if (env->lockfhp != NULL) {
- if ((t_ret =
- __os_closehandle(env, env->lockfhp)) != 0 && ret == 0)
- ret = t_ret;
+ ret = __os_closehandle(env, env->lockfhp);
env->lockfhp = NULL;
}
@@ -1249,13 +1300,13 @@ __env_sys_attach(env, infop, rp)
__db_errx(env, DB_STR_A("1548",
"region size %lu is too large; maximum is %lu", "%lu %lu"),
(u_long)rp->size, (u_long)DB_REGIONSIZE_MAX);
- return (EINVAL);
+ return (USR_ERR(env, EINVAL));
}
if (rp->max > DB_REGIONSIZE_MAX) {
__db_errx(env, DB_STR_A("1549",
"region max %lu is too large; maximum is %lu", "%lu %lu"),
(u_long)rp->max, (u_long)DB_REGIONSIZE_MAX);
- return (EINVAL);
+ return (USR_ERR(env, EINVAL));
}
#endif
@@ -1281,7 +1332,7 @@ __env_sys_attach(env, infop, rp)
"architecture does not support locks inside process-local (malloc) memory"));
__db_errx(env, DB_STR("1551",
"application may not specify both DB_PRIVATE and DB_THREAD"));
- return (EINVAL);
+ return (USR_ERR(env, EINVAL));
}
#endif
if ((ret = __os_malloc(
@@ -1310,7 +1361,7 @@ __env_sys_attach(env, infop, rp)
"region memory was not correctly aligned"));
(void)__env_sys_detach(env, infop,
F_ISSET(infop, REGION_CREATE));
- return (EINVAL);
+ return (USR_ERR(env, EINVAL));
}
return (0);
@@ -1402,7 +1453,7 @@ __env_des_get(env, env_infop, infop, rpp)
* the region, fail. The caller generates any error message.
*/
if (!F_ISSET(infop, REGION_CREATE_OK))
- return (ENOENT);
+ return (USR_ERR(env, ENOENT));
/*
* If we didn't find a region and don't have room to create the region
@@ -1411,7 +1462,7 @@ __env_des_get(env, env_infop, infop, rpp)
if (empty_slot == NULL) {
__db_errx(env, DB_STR("1553",
"no room remaining for additional REGIONs"));
- return (ENOENT);
+ return (USR_ERR(env, ENOENT));
}
/*
diff --git a/src/env/env_register.c b/src/env/env_register.c
index 7475444d..731ddd1f 100644
--- a/src/env/env_register.c
+++ b/src/env/env_register.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2004, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2004, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -30,6 +30,7 @@
static int __envreg_add __P((ENV *, int *, u_int32_t));
static int __envreg_pid_compare __P((const void *, const void *));
static int __envreg_create_active_pid __P((ENV *, char *));
+static int __envreg_add_active_pid __P((ENV*, char *));
/*
* Support for portable, multi-process database environment locking, based on
@@ -137,7 +138,7 @@ __envreg_register(env, need_recoveryp, flags)
if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
__db_msg(env, DB_STR_A("1524",
- "%lu: register environment", "%lu"), (u_long)pid);
+ "%lu: register environment", "%lu"), (u_long)pid);
/* Build the path name and open the registry file. */
if ((ret = __db_appname(env,
@@ -176,7 +177,6 @@ __envreg_register(env, need_recoveryp, flags)
/* Register this process. */
if ((ret = __envreg_add(env, need_recoveryp, flags)) != 0)
goto err;
-
/*
* Release our exclusive lock if we don't need to run recovery. If
* we need to run recovery, ENV->open will call back into register
@@ -186,8 +186,7 @@ __envreg_register(env, need_recoveryp, flags)
goto err;
if (0) {
-err: *need_recoveryp = 0;
-
+err:
/*
* !!!
* Closing the file handle must release all of our locks.
@@ -196,7 +195,6 @@ err: *need_recoveryp = 0;
(void)__os_closehandle(env, dbenv->registry);
dbenv->registry = NULL;
}
-
if (pp != NULL)
__os_free(env, pp);
@@ -222,11 +220,11 @@ __envreg_add(env, need_recoveryp, flags)
size_t nr, nw;
u_int lcnt;
u_int32_t bytes, mbytes, orig_flags;
- int need_recovery, ret, t_ret;
+ int need_failchk, ret, t_ret;
char *p, buf[PID_LEN + 10], pid_buf[PID_LEN + 10];
dbenv = env->dbenv;
- need_recovery = 0;
+ need_failchk = t_ret = 0;
COMPQUIET(dead, 0);
COMPQUIET(p, NULL);
ip = NULL;
@@ -269,7 +267,7 @@ kill_all: /*
* registering.
*/
if (nr != PID_LEN) {
- need_recovery = 1;
+ need_failchk = 1;
break;
}
@@ -299,7 +297,7 @@ kill_all: /*
}
#if DB_ENVREG_KILL_ALL
- if (need_recovery) {
+ if (need_failchk) {
pid = (pid_t)strtoul(buf, NULL, 10);
(void)kill(pid, SIGKILL);
@@ -318,7 +316,7 @@ kill_all: /*
__db_msg(env, DB_STR_A("1530",
"%02u: %s: FAILED", "%02u %s"), lcnt, p);
- need_recovery = 1;
+ need_failchk = 1;
dead = pos;
#if DB_ENVREG_KILL_ALL
goto kill_all;
@@ -331,16 +329,27 @@ kill_all: /*
"%02u: %s: LOCKED", "%02u %s"), lcnt, p);
}
+ /* Check for a panic; if so there's no need to call failchk. */
+ if (__env_attach(env, NULL, 0, 0) != 0)
+ goto sig_proc;
+ infop = env->reginfo;
+ renv = infop->primary;
+ *need_recoveryp = renv->panic != 0;
+ (void)__env_detach(env, 0);
+ if (*need_recoveryp)
+ return (0);
+
/*
- * If we have to perform recovery...
+ * If we have to perform failchk...
*
* Mark all slots empty. Registry ignores empty slots we can't lock,
* so it doesn't matter if any of the processes are in the middle of
* exiting Berkeley DB -- they'll discard their lock when they exit.
*/
- if (need_recovery) {
+ if (need_failchk) {
if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
- __db_msg(env, "%lu: recovery required", (u_long)pid);
+ __db_msg(env,
+ "%lu: failchk recovery required", (u_long)pid);
if (LF_ISSET(DB_FAILCHK) || LF_ISSET(DB_FAILCHK_ISALIVE)) {
if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
@@ -352,13 +361,14 @@ kill_all: /*
env, pid_buf)) != 0)
goto sig_proc;
- /* The environment will already exist, so we do not
+ /*
+ * The environment will already exist, so we do not
* want DB_CREATE set, nor do we want any recovery at
* this point. No need to put values back as flags is
* passed in by value. Save original dbenv flags in
* case we need to recover/remove existing environment.
* Set DB_ENV_FAILCHK before attach to help ensure we
- * dont block on a mutex held by the dead process.
+ * don't block on a mutex held by the dead process.
*/
LF_CLR(DB_CREATE | DB_RECOVER | DB_RECOVER_FATAL);
orig_flags = dbenv->flags;
@@ -367,44 +377,53 @@ kill_all: /*
if ((ret = __env_attach_regions(
dbenv, flags, orig_flags, 0)) != 0)
goto sig_proc;
- if ((t_ret =
- __env_set_state(env, &ip, THREAD_FAILCHK)) != 0 &&
- ret == 0)
+ if ((t_ret = __env_set_state(env,
+ &ip, THREAD_FAILCHK)) != 0 && ret == 0)
ret = t_ret;
- if ((t_ret =
- __env_failchk_int(dbenv)) != 0 && ret == 0)
+ if (ret == 0 && (t_ret = __env_failchk_int(dbenv)) != 0)
ret = t_ret;
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
+ __db_msg(env,
+ "%lu: failchk returned %d, ret is %d",
+ (u_long)pid, t_ret, ret);
/* Free active pid array if used. */
if (LF_ISSET(DB_FAILCHK_ISALIVE)) {
- DB_GLOBAL(num_active_pids) = 0;
- DB_GLOBAL(size_active_pids) = 0;
- __os_free( env, DB_GLOBAL(active_pids));
+ env->num_active_pids = 0;
+ env->size_active_pids = 0;
+ __os_free(env, env->active_pids);
+ env->active_pids = NULL;
}
/* Detach from environment and deregister thread. */
- if ((t_ret =
- __env_refresh(dbenv, orig_flags, 0)) != 0 &&
- ret == 0)
+ if ((t_ret = __env_refresh(dbenv,
+ orig_flags, 0)) != 0 && ret == 0)
ret = t_ret;
+ F_CLR(env, ENV_OPEN_CALLED);
+
if (ret == 0) {
if ((ret = __os_seek(env, dbenv->registry,
- 0, 0,(u_int32_t)dead)) != 0 ||
+ 0, 0, (u_int32_t)dead)) != 0 ||
(ret = __os_write(env, dbenv->registry,
PID_EMPTY, PID_LEN, &nw)) != 0)
return (ret);
- need_recovery = 0;
+ need_failchk = 0;
goto add;
}
}
/* If we can't attach, then we cannot set DB_REGISTER panic. */
-sig_proc: if (__env_attach(env, NULL, 0, 0) == 0) {
+sig_proc:
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
+ __db_msg(env, "%lu: sig_proc attaching errs %s/ret %s",
+ (u_long)pid, db_strerror(t_ret), db_strerror(ret));
+ if (__env_attach(env, NULL, 0, 0) == 0) {
infop = env->reginfo;
renv = infop->primary;
- /* Indicate DB_REGSITER panic. Also, set environment
- * panic as this is the panic trigger mechanism in
- * the code that everything looks for.
+ /*
+ * Indicate DB_REGISTER panic. Also, set (or re-set)
+ * environment panic as this is the panic trigger
+ * mechanism in the code that everything looks for.
*/
renv->reg_panic = 1;
renv->panic = 1;
@@ -484,7 +503,7 @@ add: if ((ret = __os_seek(env, dbenv->registry, 0, 0, 0)) != 0)
}
}
- if (need_recovery)
+ if (need_failchk)
*need_recoveryp = 1;
return (ret);
@@ -543,8 +562,9 @@ __envreg_unregister(env, recovery_failed)
* also releasing our slot lock, we could race. That can't happen, I
* don't think.
*/
-err: if ((t_ret =
- __os_closehandle(env, dbenv->registry)) != 0 && ret == 0)
+err:
+ if (dbenv->registry != NULL &&
+ (t_ret = __os_closehandle(env, dbenv->registry)) != 0 && ret == 0)
ret = t_ret;
dbenv->registry = NULL;
@@ -610,6 +630,10 @@ __envreg_isalive(dbenv, pid, tid, flags )
db_threadid_t tid;
u_int32_t flags;
{
+ ENV *env;
+
+ env = dbenv->env;
+
/* in this case we really do not care about tid, simply for lint */
DB_THREADID_INIT(tid);
@@ -617,15 +641,14 @@ __envreg_isalive(dbenv, pid, tid, flags )
if (!((flags == 0) || (flags == DB_MUTEX_PROCESS_ONLY)))
return (EINVAL);
- if (DB_GLOBAL(active_pids) == NULL ||
- DB_GLOBAL(num_active_pids) == 0 || dbenv == NULL)
+ if (env->active_pids == NULL || env->num_active_pids == 0)
return (0);
/*
* bsearch returns a pointer to an entry in active_pids if a match
* is found on pid, else no match found it returns NULL. This
* routine will return a 1 if a match is found, else a 0.
*/
- if (bsearch(&pid, DB_GLOBAL(active_pids), DB_GLOBAL(num_active_pids),
+ if (bsearch(&pid, env->active_pids, env->num_active_pids,
sizeof(pid_t), __envreg_pid_compare))
return 1;
@@ -635,7 +658,8 @@ __envreg_isalive(dbenv, pid, tid, flags )
/*
* __envreg_create_active_pid --
* Create array of pids, if need more room in array then double size.
- * Only add active pids from DB_REGISTER file into array.
+ * Only add active pids from DB_REGISTER file into array. The caller's
+ * own pid, my_pid, is also added to the array.
*/
static int
__envreg_create_active_pid(env, my_pid)
@@ -646,8 +670,7 @@ __envreg_create_active_pid(env, my_pid)
char buf[PID_LEN + 10];
int ret;
off_t pos;
- pid_t pid, *tmparray;
- size_t tmpsize, nr;
+ size_t nr;
u_int lcnt;
dbenv = env->dbenv;
@@ -655,6 +678,15 @@ __envreg_create_active_pid(env, my_pid)
ret = 0;
/*
+ * The process getting here has not been added to the DB_REGISTER
+ * file yet, so include it as the first item in array
+ */
+ if (env->num_active_pids == 0) {
+ if ((ret = __envreg_add_active_pid(env, my_pid)) != 0)
+ return (ret);
+ }
+
+ /*
* Walk through DB_REGISTER file, we grab pid entries that are locked
* as those represent processes that are still alive. Ignore empty
* slots, or those that are unlocked.
@@ -678,53 +710,50 @@ __envreg_create_active_pid(env, my_pid)
if ((ret = REGISTRY_UNLOCK(env, pos)) != 0)
return (ret);
} else {
- /* first, check to make sure we have room in arrary */
- if (DB_GLOBAL(num_active_pids) + 1 >
- DB_GLOBAL(size_active_pids)) {
- tmpsize =
- DB_GLOBAL(size_active_pids) * sizeof(pid_t);
-
- /* start with 512, then double if must grow */
- tmpsize = tmpsize>0 ? tmpsize*2 : 512;
- if ((ret = __os_malloc
- (env, tmpsize, &tmparray )) != 0)
- return (ret);
-
- /* if array exists, then copy and free */
- if (DB_GLOBAL(active_pids)) {
- memcpy( tmparray,
- DB_GLOBAL(active_pids),
- DB_GLOBAL(num_active_pids) *
- sizeof(pid_t));
- __os_free( env, DB_GLOBAL(active_pids));
- }
-
- DB_GLOBAL(active_pids) = tmparray;
- DB_GLOBAL(size_active_pids) = tmpsize;
-
- /*
- * The process getting here has not been added
- * to the DB_REGISTER file yet, so include it
- * as the first item in array
- */
- if (DB_GLOBAL(num_active_pids) == 0) {
- pid = (pid_t)strtoul(my_pid, NULL, 10);
- DB_GLOBAL(active_pids)
- [DB_GLOBAL(num_active_pids)++] = pid;
- }
- }
-
- /* insert into array */
- pid = (pid_t)strtoul(buf, NULL, 10);
- DB_GLOBAL(active_pids)
- [DB_GLOBAL(num_active_pids)++] = pid;
-
+ if ((ret = __envreg_add_active_pid(env, buf)) != 0)
+ return (ret);
}
}
/* lets sort the array to allow for binary search in isalive func */
- qsort(DB_GLOBAL(active_pids), DB_GLOBAL(num_active_pids),
+ qsort(env->active_pids, env->num_active_pids,
sizeof(pid_t), __envreg_pid_compare);
return (ret);
}
+
+/*
+ * __envreg_add_active_pid --
+ *	Append one pid, given as a decimal string, to env->active_pids.
+ *	The array starts at 512 bytes and doubles whenever it is full;
+ *	size_active_pids counts entries, not bytes.
+ */
+static int
+__envreg_add_active_pid(env, pid)
+	ENV *env;
+	char *pid;
+{
+	size_t nbytes;
+	int ret;
+
+	/* Grow the array when the next entry would not fit. */
+	if (env->num_active_pids + 1 > env->size_active_pids) {
+		nbytes = env->size_active_pids * sizeof(pid_t);
+		if (nbytes == 0)
+			nbytes = 512;
+		else
+			nbytes *= 2;
+		ret = __os_realloc(env, nbytes, &(env->active_pids));
+		if (ret != 0)
+			return (ret);
+
+		env->size_active_pids = nbytes / sizeof(pid_t);
+	}
+
+	/* Store the pid in the first free slot. */
+	env->active_pids[env->num_active_pids] =
+	    (pid_t)strtoul(pid, NULL, 10);
+	env->num_active_pids++;
+
+	return (0);
+}
diff --git a/src/env/env_sig.c b/src/env/env_sig.c
index 6d127f85..57e64228 100644
--- a/src/env/env_sig.c
+++ b/src/env/env_sig.c
@@ -28,9 +28,9 @@
* shared memory.
*/
#ifdef HAVE_MIXED_SIZE_ADDRESSING
-#define __STRUCTURE_COUNT 41
+#define __STRUCTURE_COUNT 48
#else
-#define __STRUCTURE_COUNT (41 + 104)
+#define __STRUCTURE_COUNT (48 + 108)
#endif
/*
@@ -66,7 +66,11 @@ __env_struct_sig()
__ADD(__db_h_stat);
__ADD(__db_heap_stat);
__ADD(__db_qam_stat);
+#ifdef HAVE_MUTEX_SUPPORT
+ __ADD(__mutex_state);
+#endif
__ADD(__db_thread_info);
+ __ADD(__env_thread_info);
__ADD(__db_lockregion);
__ADD(__sh_dbt);
__ADD(__db_lockobj);
@@ -82,6 +86,9 @@ __env_struct_sig()
__ADD(__db_mutexregion);
#endif
#ifdef HAVE_MUTEX_SUPPORT
+ __ADD(__mutex_history);
+#endif
+#ifdef HAVE_MUTEX_SUPPORT
__ADD(__db_mutex_t);
#endif
__ADD(__db_reg_env);
@@ -92,6 +99,10 @@ __env_struct_sig()
#ifndef HAVE_MIXED_SIZE_ADDRESSING
__ADD(__db_dbt);
+#ifdef HAVE_MUTEX_SUPPORT
+ __ADD(__db_event_mutex_died_info);
+#endif
+ __ADD(__db_event_failchk_info);
__ADD(__db_lockreq);
__ADD(__db_log_cursor);
__ADD(__log_rec_spec);
@@ -113,6 +124,7 @@ __env_struct_sig()
__ADD(__cq_fq);
__ADD(__cq_aq);
__ADD(__cq_jq);
+ __ADD(__db_stream);
__ADD(__db_heap_rid);
__ADD(__dbc);
__ADD(__key_range);
@@ -125,7 +137,6 @@ __env_struct_sig()
__ADD(__fn);
__ADD(__db_msgbuf);
__ADD(__pin_list);
- __ADD(__env_thread_info);
__ADD(__flag_map);
__ADD(__db_backup_handle);
__ADD(__env);
diff --git a/src/env/env_stat.c b/src/env/env_stat.c
index 9bc3fe7e..094d0545 100644
--- a/src/env/env_stat.c
+++ b/src/env/env_stat.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -21,11 +21,9 @@ static int __env_print_dbenv_all __P((ENV *, u_int32_t));
static int __env_print_env_all __P((ENV *, u_int32_t));
static int __env_print_fh __P((ENV *));
static int __env_print_stats __P((ENV *, u_int32_t));
-static int __env_print_thread __P((ENV *));
static int __env_stat_print __P((ENV *, u_int32_t));
static char *__env_thread_state_print __P((DB_THREAD_STATE));
-static const char *
- __reg_type __P((reg_type_t));
+static const char * __reg_type __P((reg_type_t));
/*
* __env_stat_print_pp --
@@ -146,7 +144,6 @@ __env_stat_print(env, flags)
/*
* __env_print_stats --
* Display the default environment statistics.
- *
*/
static int
__env_print_stats(env, flags)
@@ -186,6 +183,10 @@ __env_print_stats(env, flags)
(u_long)0, (u_long)0, (u_long)infop->rp->size);
__db_dlbytes(env, "Maximum region size",
(u_long)0, (u_long)0, (u_long)infop->rp->max);
+ STAT_LONG("Process failure detected", renv->failure_panic);
+ if (renv->failure_symptom[0] != '\0')
+ __db_msg(env,
+ "%s:\tFirst failure symptom", renv->failure_symptom);
return (0);
}
@@ -267,8 +268,6 @@ __env_print_dbenv_all(env, flags)
__db_msg(env, "%s", DB_GLOBAL(db_line));
STAT_POINTER("ENV", dbenv->env);
- __mutex_print_debug_single(
- env, "DB_ENV handle mutex", dbenv->mtx_db_env, flags);
STAT_ISSET("Errcall", dbenv->db_errcall);
STAT_ISSET("Errfile", dbenv->db_errfile);
STAT_STRING("Errpfx", dbenv->db_errpfx);
@@ -286,6 +285,7 @@ __env_print_dbenv_all(env, flags)
STAT_ISSET("ThreadId", dbenv->thread_id);
STAT_ISSET("ThreadIdString", dbenv->thread_id_string);
+ STAT_STRING("Blob dir", dbenv->db_blob_dir);
STAT_STRING("Log dir", dbenv->db_log_dir);
STAT_STRING("Metadata dir", dbenv->db_md_dir);
STAT_STRING("Tmp dir", dbenv->db_tmp_dir);
@@ -304,6 +304,8 @@ __env_print_dbenv_all(env, flags)
STAT_ISSET("Password", dbenv->passwd);
+ STAT_ULONG("Blob threshold", dbenv->blob_threshold);
+
STAT_ISSET("App private", dbenv->app_private);
STAT_ISSET("Api1 internal", dbenv->api1_internal);
STAT_ISSET("Api2 internal", dbenv->api2_internal);
@@ -314,6 +316,7 @@ __env_print_dbenv_all(env, flags)
STAT_ULONG("Mutex cnt", dbenv->mutex_cnt);
STAT_ULONG("Mutex inc", dbenv->mutex_inc);
STAT_ULONG("Mutex tas spins", dbenv->mutex_tas_spins);
+ STAT_LONG("Mutex failchk timeout", dbenv->mutex_failchk_timeout);
STAT_ISSET("Lock conflicts", dbenv->lk_conflicts);
STAT_LONG("Lock modes", dbenv->lk_modes);
@@ -356,6 +359,7 @@ __env_print_dbenv_all(env, flags)
__db_prflags(env,
NULL, dbenv->flags, db_env_fn, NULL, "\tPublic environment flags");
+ COMPQUIET(flags, 0);
return (0);
}
@@ -507,6 +511,8 @@ __env_thread_state_print(state)
return ("blocked and dead");
case THREAD_OUT:
return ("out");
+ case THREAD_VERIFY:
+ return ("verify");
default:
return ("unknown");
}
@@ -516,14 +522,17 @@ __env_thread_state_print(state)
/*
* __env_print_thread --
* Display the thread block state.
+ *
+ * PUBLIC: int __env_print_thread __P((ENV *));
*/
-static int
+int
__env_print_thread(env)
ENV *env;
{
BH *bhp;
DB_ENV *dbenv;
DB_HASHTAB *htab;
+ DB_LOCKER *locker;
DB_MPOOL *dbmp;
DB_THREAD_INFO *ip;
PIN_LIST *list, *lp;
@@ -532,6 +541,7 @@ __env_print_thread(env)
THREAD_INFO *thread;
u_int32_t i;
char buf[DB_THREADID_STRLEN];
+ char time_buf[CTIME_BUFLEN];
dbenv = env->dbenv;
@@ -561,6 +571,10 @@ __env_print_thread(env)
dbenv->thread_id_string(
dbenv, ip->dbth_pid, ip->dbth_tid, buf),
__env_thread_state_print(ip->dbth_state));
+ if (timespecisset(&ip->dbth_failtime))
+ __db_msg(env, "Crashed at %s",
+ __db_ctimespec(&ip->dbth_failtime,
+ time_buf));
list = R_ADDR(env->reginfo, ip->dbth_pinlist);
for (lp = list; lp < &list[ip->dbth_pinmax]; lp++) {
if (lp->b_ref == INVALID_ROFF)
@@ -570,6 +584,18 @@ __env_print_thread(env)
__db_msg(env,
"\t\tpins: %lu", (u_long)bhp->pgno);
}
+ if (ip->dbth_local_locker != INVALID_ROFF) {
+ locker = (DB_LOCKER *)
+ R_ADDR(&env->lk_handle->reginfo,
+ ip->dbth_local_locker);
+ __db_msg(env, "\t\tcached locker %lx mtx %lu",
+ (u_long)locker->id,
+ (u_long)locker->mtx_locker);
+
+ }
+#ifdef HAVE_MUTEX_SUPPORT
+ (void)__mutex_record_print(env, ip);
+#endif
}
return (0);
}
@@ -846,6 +872,7 @@ __reg_type(t)
return ("Transaction");
case INVALID_REGION_TYPE:
return ("Invalid");
+ /*lint -e{787} */
}
return ("Unknown");
}
diff --git a/src/fileops/fileops.src b/src/fileops/fileops.src
index cdb6af27..3cb874b7 100644
--- a/src/fileops/fileops.src
+++ b/src/fileops/fileops.src
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -30,7 +30,14 @@ ARG appname u_int32_t lu
ARG mode u_int32_t o
END
-BEGIN create 48 143
+BEGIN_COMPAT create 60 143
+DBT name DBT s
+DBT dirname DBT s
+ARG appname u_int32_t lu
+ARG mode u_int32_t o
+END
+
+BEGIN create 60p1 143
DBT name DBT s
DBT dirname DBT s
ARG appname u_int32_t lu
@@ -43,7 +50,13 @@ END
* name: name in the file system
* appname: indicates if the name needs to go through __db_appname
*/
-BEGIN remove 42 144
+BEGIN_COMPAT remove 60 144
+DBT name DBT s
+DBT fid DBT s
+ARG appname u_int32_t lu
+END
+
+BEGIN remove 60p1 144
DBT name DBT s
DBT fid DBT s
ARG appname u_int32_t lu
@@ -71,7 +84,18 @@ DBT page DBT s
ARG flag u_int32_t lu
END
-BEGIN write 48 145
+BEGIN_COMPAT write 60 145
+DBT name DBT s
+DBT dirname DBT s
+ARG appname u_int32_t lu
+ARG pgsize u_int32_t lu
+ARG pageno db_pgno_t lu
+ARG offset u_int32_t lu
+DBT page DBT s
+ARG flag u_int32_t lu
+END
+
+BEGIN write 60p1 145
DBT name DBT s
DBT dirname DBT s
ARG appname u_int32_t lu
@@ -83,6 +107,42 @@ ARG flag u_int32_t lu
END
/*
+ * write_file: log the writing of data into a file.
+ *
+ * name: file containing the data.
+ * appname: indicates if the name needs to go through __db_appname
+ * offset_lo: offset in the file, low part of a 64 bit integer.
+ * offset_hi: offset in the file, high part of a 64 bit integer.
+ * old_data: Data being overwritten, if there is any
+ * new_data: Data being written to the file.
+ * flag: DB_FOP_APPEND (0x00000001), DB_FOP_CREATE (0x00000002) and
+ * DB_FOP_REDO (0x00000008). Used to tell how the operation can be
+ * undone, truncating in the case of append and deleting the file in
+ * the case of create, and whether enough information was logged so
+ * that the operation can be redone.
+ */
+BEGIN_COMPAT write_file 60 86
+DBT name DBT s
+DBT dirname DBT s
+ARG appname u_int32_t lu
+ARG offset_lo u_int32_t lu
+ARG offset_hi u_int32_t lu
+DBT old_data DBT s
+DBT new_data DBT s
+ARG flag u_int32_t lu
+END
+
+BEGIN write_file 60p1 86
+DBT name DBT s
+DBT dirname DBT s
+ARG appname u_int32_t lu
+LONGARG offset u_int64_t llu
+DBT old_data DBT s
+DBT new_data DBT s
+ARG flag u_int32_t lu
+END
+
+/*
* rename: move a file from one name to another.
* The appname value indicates if this is a path name that should be used
* directly (i.e., no interpretation) or if it is a pathname that should
@@ -105,8 +165,17 @@ DBT fileid DBT s
ARG appname u_int32_t lu
END
-BEGIN rename 48 146
-DUPLICATE rename_noundo 46 150
+BEGIN_COMPAT rename 60 146
+DUPLICATE rename_noundo 60 150
+DBT oldname DBT s
+DBT newname DBT s
+DBT dirname DBT s
+DBT fileid DBT s
+ARG appname u_int32_t lu
+END
+
+BEGIN rename 60p1 146
+DUPLICATE rename_noundo 60p1 150
DBT oldname DBT s
DBT newname DBT s
DBT dirname DBT s
@@ -128,7 +197,15 @@ END
* child: The transaction that removed or renamed the file.
*/
*/
-BEGIN file_remove 42 141
+BEGIN_COMPAT file_remove 60 141
+DBT real_fid DBT s
+DBT tmp_fid DBT s
+DBT name DBT s
+ARG appname u_int32_t lu
+ARG child u_int32_t lx
+END
+
+BEGIN file_remove 60p1 141
DBT real_fid DBT s
DBT tmp_fid DBT s
DBT name DBT s
diff --git a/src/fileops/fileops_auto.c b/src/fileops/fileops_auto.c
index 0db619a5..eff1377b 100644
--- a/src/fileops/fileops_auto.c
+++ b/src/fileops/fileops_auto.c
@@ -14,6 +14,13 @@ DB_LOG_RECSPEC __fop_create_42_desc[] = {
{LOGREC_ARG, SSZ(__fop_create_42_args, mode), "mode", "%o"},
{LOGREC_Done, 0, "", ""}
};
+DB_LOG_RECSPEC __fop_create_60_desc[] = {	/* fields of the BEGIN_COMPAT "create 60" record in fileops.src */
+ {LOGREC_DBT, SSZ(__fop_create_60_args, name), "name", ""},
+ {LOGREC_DBT, SSZ(__fop_create_60_args, dirname), "dirname", ""},
+ {LOGREC_ARG, SSZ(__fop_create_60_args, appname), "appname", "%lu"},
+ {LOGREC_ARG, SSZ(__fop_create_60_args, mode), "mode", "%o"},
+ {LOGREC_Done, 0, "", ""}
+};
DB_LOG_RECSPEC __fop_create_desc[] = {
{LOGREC_DBT, SSZ(__fop_create_args, name), "name", ""},
{LOGREC_DBT, SSZ(__fop_create_args, dirname), "dirname", ""},
@@ -21,6 +28,12 @@ DB_LOG_RECSPEC __fop_create_desc[] = {
{LOGREC_ARG, SSZ(__fop_create_args, mode), "mode", "%o"},
{LOGREC_Done, 0, "", ""}
};
+DB_LOG_RECSPEC __fop_remove_60_desc[] = {	/* fields of the BEGIN_COMPAT "remove 60" record in fileops.src */
+ {LOGREC_DBT, SSZ(__fop_remove_60_args, name), "name", ""},
+ {LOGREC_DBT, SSZ(__fop_remove_60_args, fid), "fid", ""},
+ {LOGREC_ARG, SSZ(__fop_remove_60_args, appname), "appname", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
DB_LOG_RECSPEC __fop_remove_desc[] = {
{LOGREC_DBT, SSZ(__fop_remove_args, name), "name", ""},
{LOGREC_DBT, SSZ(__fop_remove_args, fid), "fid", ""},
@@ -37,6 +50,17 @@ DB_LOG_RECSPEC __fop_write_42_desc[] = {
{LOGREC_ARG, SSZ(__fop_write_42_args, flag), "flag", "%lu"},
{LOGREC_Done, 0, "", ""}
};
+DB_LOG_RECSPEC __fop_write_60_desc[] = {	/* fields of the BEGIN_COMPAT "write 60" record in fileops.src */
+ {LOGREC_DBT, SSZ(__fop_write_60_args, name), "name", ""},
+ {LOGREC_DBT, SSZ(__fop_write_60_args, dirname), "dirname", ""},
+ {LOGREC_ARG, SSZ(__fop_write_60_args, appname), "appname", "%lu"},
+ {LOGREC_ARG, SSZ(__fop_write_60_args, pgsize), "pgsize", "%lu"},
+ {LOGREC_ARG, SSZ(__fop_write_60_args, pageno), "pageno", "%lu"},
+ {LOGREC_ARG, SSZ(__fop_write_60_args, offset), "offset", "%lu"},
+ {LOGREC_DBT, SSZ(__fop_write_60_args, page), "page", ""},
+ {LOGREC_ARG, SSZ(__fop_write_60_args, flag), "flag", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
DB_LOG_RECSPEC __fop_write_desc[] = {
{LOGREC_DBT, SSZ(__fop_write_args, name), "name", ""},
{LOGREC_DBT, SSZ(__fop_write_args, dirname), "dirname", ""},
@@ -48,6 +72,27 @@ DB_LOG_RECSPEC __fop_write_desc[] = {
{LOGREC_ARG, SSZ(__fop_write_args, flag), "flag", "%lu"},
{LOGREC_Done, 0, "", ""}
};
+DB_LOG_RECSPEC __fop_write_file_60_desc[] = {	/* BEGIN_COMPAT "write_file 60": offset split into offset_lo/offset_hi */
+ {LOGREC_DBT, SSZ(__fop_write_file_60_args, name), "name", ""},
+ {LOGREC_DBT, SSZ(__fop_write_file_60_args, dirname), "dirname", ""},
+ {LOGREC_ARG, SSZ(__fop_write_file_60_args, appname), "appname", "%lu"},
+ {LOGREC_ARG, SSZ(__fop_write_file_60_args, offset_lo), "offset_lo", "%lu"},
+ {LOGREC_ARG, SSZ(__fop_write_file_60_args, offset_hi), "offset_hi", "%lu"},
+ {LOGREC_DBT, SSZ(__fop_write_file_60_args, old_data), "old_data", ""},
+ {LOGREC_DBT, SSZ(__fop_write_file_60_args, new_data), "new_data", ""},
+ {LOGREC_ARG, SSZ(__fop_write_file_60_args, flag), "flag", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __fop_write_file_desc[] = {	/* BEGIN "write_file 60p1": offset is a single u_int64_t LONGARG */
+ {LOGREC_DBT, SSZ(__fop_write_file_args, name), "name", ""},
+ {LOGREC_DBT, SSZ(__fop_write_file_args, dirname), "dirname", ""},
+ {LOGREC_ARG, SSZ(__fop_write_file_args, appname), "appname", "%lu"},
+ {LOGREC_LONGARG, SSZ(__fop_write_file_args, offset), "offset", ""},
+ {LOGREC_DBT, SSZ(__fop_write_file_args, old_data), "old_data", ""},
+ {LOGREC_DBT, SSZ(__fop_write_file_args, new_data), "new_data", ""},
+ {LOGREC_ARG, SSZ(__fop_write_file_args, flag), "flag", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
DB_LOG_RECSPEC __fop_rename_42_desc[] = {
{LOGREC_DBT, SSZ(__fop_rename_42_args, oldname), "oldname", ""},
{LOGREC_DBT, SSZ(__fop_rename_42_args, newname), "newname", ""},
@@ -62,6 +107,22 @@ DB_LOG_RECSPEC __fop_rename_noundo_46_desc[] = {
{LOGREC_ARG, SSZ(__fop_rename_42_args, appname), "appname", "%lu"},
{LOGREC_Done, 0, "", ""}
};
+DB_LOG_RECSPEC __fop_rename_60_desc[] = {	/* fields of the BEGIN_COMPAT "rename 60" record in fileops.src */
+ {LOGREC_DBT, SSZ(__fop_rename_60_args, oldname), "oldname", ""},
+ {LOGREC_DBT, SSZ(__fop_rename_60_args, newname), "newname", ""},
+ {LOGREC_DBT, SSZ(__fop_rename_60_args, dirname), "dirname", ""},
+ {LOGREC_DBT, SSZ(__fop_rename_60_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__fop_rename_60_args, appname), "appname", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __fop_rename_noundo_60_desc[] = {	/* declared as DUPLICATE of "rename 60" in fileops.src; uses the __fop_rename_60_args offsets */
+ {LOGREC_DBT, SSZ(__fop_rename_60_args, oldname), "oldname", ""},
+ {LOGREC_DBT, SSZ(__fop_rename_60_args, newname), "newname", ""},
+ {LOGREC_DBT, SSZ(__fop_rename_60_args, dirname), "dirname", ""},
+ {LOGREC_DBT, SSZ(__fop_rename_60_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__fop_rename_60_args, appname), "appname", "%lu"},
+ {LOGREC_Done, 0, "", ""}
+};
DB_LOG_RECSPEC __fop_rename_desc[] = {
{LOGREC_DBT, SSZ(__fop_rename_args, oldname), "oldname", ""},
{LOGREC_DBT, SSZ(__fop_rename_args, newname), "newname", ""},
@@ -78,6 +139,14 @@ DB_LOG_RECSPEC __fop_rename_noundo_desc[] = {
{LOGREC_ARG, SSZ(__fop_rename_args, appname), "appname", "%lu"},
{LOGREC_Done, 0, "", ""}
};
+DB_LOG_RECSPEC __fop_file_remove_60_desc[] = {	/* fields of the BEGIN_COMPAT "file_remove 60" record in fileops.src */
+ {LOGREC_DBT, SSZ(__fop_file_remove_60_args, real_fid), "real_fid", ""},
+ {LOGREC_DBT, SSZ(__fop_file_remove_60_args, tmp_fid), "tmp_fid", ""},
+ {LOGREC_DBT, SSZ(__fop_file_remove_60_args, name), "name", ""},
+ {LOGREC_ARG, SSZ(__fop_file_remove_60_args, appname), "appname", "%lu"},
+ {LOGREC_ARG, SSZ(__fop_file_remove_60_args, child), "child", "%lx"},
+ {LOGREC_Done, 0, "", ""}
+};
DB_LOG_RECSPEC __fop_file_remove_desc[] = {
{LOGREC_DBT, SSZ(__fop_file_remove_args, real_fid), "real_fid", ""},
{LOGREC_DBT, SSZ(__fop_file_remove_args, tmp_fid), "tmp_fid", ""},
@@ -106,6 +175,9 @@ __fop_init_recover(env, dtabp)
__fop_write_recover, DB___fop_write)) != 0)
return (ret);
if ((ret = __db_add_recovery_int(env, dtabp,
+ __fop_write_file_recover, DB___fop_write_file)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
__fop_rename_recover, DB___fop_rename)) != 0)
return (ret);
if ((ret = __db_add_recovery_int(env, dtabp,
diff --git a/src/fileops/fileops_autop.c b/src/fileops/fileops_autop.c
index 6e271a17..784aa1d0 100644
--- a/src/fileops/fileops_autop.c
+++ b/src/fileops/fileops_autop.c
@@ -27,6 +27,23 @@ __fop_create_42_print(env, dbtp, lsnp, notused2, info)
}
/*
+ * PUBLIC: int __fop_create_60_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__fop_create_60_print(env, dbtp, lsnp, notused2, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops notused2;
+	void *info;
+{
+	COMPQUIET(notused2, DB_TXN_PRINT);	/* Not used when printing. */
+	return (__log_print_record(env, dbtp, lsnp,
+	    "__fop_create_60", __fop_create_60_desc, info));
+}
+
+/*
* PUBLIC: int __fop_create_print __P((ENV *, DBT *, DB_LSN *,
* PUBLIC: db_recops, void *));
*/
@@ -44,6 +61,23 @@ __fop_create_print(env, dbtp, lsnp, notused2, info)
}
/*
+ * PUBLIC: int __fop_remove_60_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__fop_remove_60_print(env, dbtp, lsnp, notused2, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops notused2;
+	void *info;
+{
+	COMPQUIET(notused2, DB_TXN_PRINT);	/* Not used when printing. */
+	return (__log_print_record(env, dbtp, lsnp,
+	    "__fop_remove_60", __fop_remove_60_desc, info));
+}
+
+/*
* PUBLIC: int __fop_remove_print __P((ENV *, DBT *, DB_LSN *,
* PUBLIC: db_recops, void *));
*/
@@ -78,6 +112,23 @@ __fop_write_42_print(env, dbtp, lsnp, notused2, info)
}
/*
+ * PUBLIC: int __fop_write_60_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__fop_write_60_print(env, dbtp, lsnp, notused2, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops notused2;
+	void *info;
+{
+	COMPQUIET(notused2, DB_TXN_PRINT);	/* Not used when printing. */
+	return (__log_print_record(env, dbtp, lsnp,
+	    "__fop_write_60", __fop_write_60_desc, info));
+}
+
+/*
* PUBLIC: int __fop_write_print __P((ENV *, DBT *, DB_LSN *,
* PUBLIC: db_recops, void *));
*/
@@ -95,6 +146,40 @@ __fop_write_print(env, dbtp, lsnp, notused2, info)
}
/*
+ * PUBLIC: int __fop_write_file_60_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__fop_write_file_60_print(env, dbtp, lsnp, notused2, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops notused2;
+	void *info;
+{
+	COMPQUIET(notused2, DB_TXN_PRINT);	/* Not used when printing. */
+	return (__log_print_record(env, dbtp, lsnp,
+	    "__fop_write_file_60", __fop_write_file_60_desc, info));
+}
+
+/*
+ * PUBLIC: int __fop_write_file_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__fop_write_file_print(env, dbtp, lsnp, notused2, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops notused2;
+	void *info;
+{
+	COMPQUIET(notused2, DB_TXN_PRINT);	/* Not used when printing. */
+	return (__log_print_record(env, dbtp, lsnp,
+	    "__fop_write_file", __fop_write_file_desc, info));
+}
+
+/*
* PUBLIC: int __fop_rename_42_print __P((ENV *, DBT *, DB_LSN *,
* PUBLIC: db_recops, void *));
*/
@@ -112,6 +197,23 @@ __fop_rename_42_print(env, dbtp, lsnp, notused2, info)
}
/*
+ * PUBLIC: int __fop_rename_60_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__fop_rename_60_print(env, dbtp, lsnp, notused2, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops notused2;
+	void *info;
+{
+	COMPQUIET(notused2, DB_TXN_PRINT);	/* Not used when printing. */
+	return (__log_print_record(env, dbtp, lsnp,
+	    "__fop_rename_60", __fop_rename_60_desc, info));
+}
+
+/*
* PUBLIC: int __fop_rename_print __P((ENV *, DBT *, DB_LSN *,
* PUBLIC: db_recops, void *));
*/
@@ -129,6 +231,23 @@ __fop_rename_print(env, dbtp, lsnp, notused2, info)
}
/*
+ * PUBLIC: int __fop_file_remove_60_print __P((ENV *, DBT *,
+ * PUBLIC: DB_LSN *, db_recops, void *));
+ */
+int
+__fop_file_remove_60_print(env, dbtp, lsnp, notused2, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops notused2;
+	void *info;
+{
+	COMPQUIET(notused2, DB_TXN_PRINT);	/* Not used when printing. */
+	return (__log_print_record(env, dbtp, lsnp,
+	    "__fop_file_remove_60", __fop_file_remove_60_desc, info));
+}
+
+/*
* PUBLIC: int __fop_file_remove_print __P((ENV *, DBT *, DB_LSN *,
* PUBLIC: db_recops, void *));
*/
@@ -165,6 +284,9 @@ __fop_init_print(env, dtabp)
__fop_write_print, DB___fop_write)) != 0)
return (ret);
if ((ret = __db_add_recovery_int(env, dtabp,
+ __fop_write_file_print, DB___fop_write_file)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
__fop_rename_print, DB___fop_rename)) != 0)
return (ret);
if ((ret = __db_add_recovery_int(env, dtabp,
diff --git a/src/fileops/fop_basic.c b/src/fileops/fop_basic.c
index d6c707f2..c1280d76 100644
--- a/src/fileops/fop_basic.c
+++ b/src/fileops/fop_basic.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -253,6 +253,220 @@ err: if (local_open &&
}
/*
+ * Used to reduce the maximum amount of data that will be logged at a time.
+ * Large writes are logged as a series of smaller writes to prevent a
+ * single log record from being larger than the log buffer or a log file.
+ */
+#define LOG_OVERWRITE_MULTIPLIER 0.75
+#define LOG_REDO_MULTIPLIER 0.75
+#define LOG_OVERWRITE_REDO_MULTIPLIER 0.33
+
+/*
+ * __fop_write_file --
+ *	Write "size" bytes from "buf" to file "name" beginning at offset
+ *	"off", logging the write first when the environment is logging and
+ *	a transaction is given.  dirname is the directory in which the file
+ *	is stored; fhp is the handle to write to, and when it is NULL the
+ *	file is opened here and closed before returning.  flags says whether
+ *	this write creates or appends data (and whether only undo data is
+ *	logged), which changes how the data is logged.
+ *	The other __fop_write is designed for writing pages to databases;
+ *	this function writes generic data to files, usually blob files.
+ *	Returns 0 on success, the failing call's error otherwise, or EIO
+ *	when fewer than "size" bytes were written.
+ *
+ * PUBLIC: int __fop_write_file __P((ENV *, DB_TXN *,
+ * PUBLIC:     const char *, const char *, APPNAME, DB_FH *,
+ * PUBLIC:     off_t, void *, size_t, u_int32_t));
+ */
+int
+__fop_write_file(env, txn,
+    name, dirname, appname, fhp, off, buf, size, flags)
+	ENV *env;
+	DB_TXN *txn;
+	const char *name, *dirname;
+	APPNAME appname;
+	DB_FH *fhp;
+	off_t off;
+	void *buf;
+	size_t size;
+	u_int32_t flags;
+{
+	DBT new_data, old_data, namedbt, dirdbt;
+	DB_LOG *dblp;
+	DB_LSN lsn;
+	off_t cur_off;
+	int local_open, ret, t_ret;
+	size_t cur_size, nbytes, tmp_size;
+	u_int32_t lflags, lgbuf_size, lgsize, lgfile_size;
+	char *real_name;
+	void *cur_ptr;
+
+	ret = local_open = 0;
+	real_name = NULL;
+	lflags = 0;
+	memset(&new_data, 0, sizeof(new_data));
+	memset(&old_data, 0, sizeof(old_data));
+	ZERO_LSN(lsn);
+
+	if (fhp == NULL) {
+		/* File isn't open; we need to reopen it. */
+		if ((ret = __db_appname(env,
+		    appname, name, &dirname, &real_name)) != 0)
+			return (ret);
+
+		if ((ret = __os_open(env, real_name, 0, 0, 0, &fhp)) != 0)
+			goto err;
+		local_open = 1;
+	}
+
+	if (DBENV_LOGGING(env)
+#if !defined(DEBUG_WOP)
+	    && txn != NULL
+#endif
+	    ) {
+		DB_INIT_DBT(namedbt, name, strlen(name) + 1);
+		if (dirname != NULL)
+			DB_INIT_DBT(dirdbt, dirname, strlen(dirname) + 1);
+		else
+			memset(&dirdbt, 0, sizeof(dirdbt));
+		/*
+		 * If the write is larger than the log buffer or file size,
+		 * then log it as a set of smaller writes.
+		 */
+		cur_off = off;
+		cur_ptr = buf;
+		cur_size = size;
+		dblp = env->lg_handle;
+		LOG_SYSTEM_LOCK(env);
+		lgfile_size = ((LOG *)dblp->reginfo.primary)->log_nsize;
+		LOG_SYSTEM_UNLOCK(env);
+		if ((ret = __log_get_lg_bsize(env->dbenv, &lgbuf_size)) != 0)
+			goto err;
+
+		lgsize = lgfile_size > lgbuf_size ? lgbuf_size : lgfile_size;
+
+		/*
+		 * Partial logging only logs enough data to undo an operation.
+		 */
+		if (LF_ISSET(DB_FOP_PARTIAL_LOG)) {
+			/* No data needs to be logged for append and create. */
+			if (LF_ISSET(DB_FOP_APPEND | DB_FOP_CREATE)) {
+				lflags |=
+				    flags & (DB_FOP_APPEND | DB_FOP_CREATE);
+				cur_size = 0;
+				goto log;
+			}
+			/*
+			 * Writing in the middle of the blob requires
+			 * logging the data being overwritten.
+			 */
+			lgsize = (u_int32_t)
+			    (lgsize * LOG_OVERWRITE_MULTIPLIER);
+		} else {
+			/* Log that the operation can be redone from logs. */
+			lflags |= DB_FOP_REDO;
+			/* Just log the new data for append and create */
+			if (LF_ISSET(DB_FOP_APPEND | DB_FOP_CREATE)) {
+				lgsize = (u_int32_t)
+				    (lgsize * LOG_REDO_MULTIPLIER);
+				lflags |= flags &
+				    (DB_FOP_APPEND | DB_FOP_CREATE);
+			} else {
+				/*
+				 * Writing in the middle of the blob requires
+				 * logging both the old and new data.
+				 */
+				lgsize = (u_int32_t)
+				    (lgsize * LOG_OVERWRITE_REDO_MULTIPLIER);
+			}
+		}
+
+		while (cur_size > 0) {
+			new_data.data = cur_ptr;
+			if (cur_size > lgsize) {
+				new_data.size = lgsize;
+				cur_size -= lgsize;
+			} else {
+				new_data.size = (u_int32_t)cur_size;
+				cur_size = 0;
+			}
+			cur_ptr = (unsigned char *)cur_ptr + new_data.size;
+			/*
+			 * If not creating or appending the file, then
+			 * the data being overwritten needs to be read
+			 * in so it can be written back in on abort.
+			 */
+			if (!(lflags & (DB_FOP_CREATE | DB_FOP_APPEND))) {
+				DB_ASSERT(env, old_data.data == NULL ||
+				    new_data.size <= old_data.size);
+				old_data.size = new_data.size;
+				if (old_data.data == NULL &&
+				    (ret = __os_malloc(env,
+				    old_data.size, &old_data.data)) != 0)
+					goto err;
+				if ((ret = __os_seek(
+				    env, fhp, 0, 0, cur_off)) != 0)
+					goto err;
+				if ((ret = __os_read(env, fhp, old_data.data,
+				    old_data.size, &nbytes)) != 0)
+					goto err;
+			}
+log:			tmp_size = new_data.size;
+			/*
+			 * No need to log the new data if this operation
+			 * cannot be redone from logs.
+			 */
+			if (!(lflags & DB_FOP_REDO))
+				memset(&new_data, 0, sizeof(new_data));
+			if ((ret = __fop_write_file_log(
+			    env, txn, &lsn, flags, &namedbt, &dirdbt,
+			    (u_int32_t)appname, (u_int64_t)cur_off,
+			    &old_data, &new_data, lflags)) != 0)
+				goto err;
+			cur_off += tmp_size;
+		}
+		/*
+		 * If not creating, we have to flush the logs so that they
+		 * will be available to undo internal writes and appends in case
+		 * of a crash.
+		 */
+		if (!(LF_ISSET(DB_FOP_CREATE)) &&
+		    txn != NULL && !F_ISSET(txn, TXN_NOSYNC))
+			if ((ret = __log_flush(env, &lsn)) != 0)
+				goto err;
+	}
+
+	/* Seek to offset. */
+	if ((ret = __os_seek(env, fhp, 0, 0, off)) != 0)
+		goto err;
+
+	/* Now do the write. */
+	if ((ret = __os_write(env, fhp, buf, size, &nbytes)) != 0)
+		goto err;
+
+	/* A short write is a failure; ret is still 0 here, so set it. */
+	if (nbytes != size) {
+		__db_errx(env, DB_STR_A("0238",
+		    "Error wrote %lld bytes to file %s instead of %lld .",
+		    "%lld %s %lld"),
+		    (long long)nbytes, name, (long long)size);
+		ret = EIO;
+		goto err;
+	}
+
+err:	if (local_open &&
+	    (t_ret = __os_closehandle(env, fhp)) != 0 && ret == 0)
+		ret = t_ret;
+
+	if (real_name != NULL)
+		__os_free(env, real_name);
+	if (old_data.data != NULL)
+		__os_free(env, old_data.data);
+	return (ret);
+}
+
+/*
* __fop_rename --
* Change a file's name.
*
diff --git a/src/fileops/fop_rec.c b/src/fileops/fop_rec.c
index 52d6175d..71a81ad6 100644
--- a/src/fileops/fop_rec.c
+++ b/src/fileops/fop_rec.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -9,16 +9,63 @@
#include "db_config.h"
#include "db_int.h"
+#include "dbinc/blob.h"
#include "dbinc/db_page.h"
#include "dbinc/fop.h"
#include "dbinc/db_am.h"
#include "dbinc/mp.h"
#include "dbinc/txn.h"
+typedef enum {	/* Input type of __fop_convert_appname; presumably APPNAME as numbered before 6.0 added DB_APP_BLOB -- confirm. */
+ DB_APP53_NONE=0, /* No type (region). */
+ DB_APP53_DATA, /* Data file. */
+ DB_APP53_LOG, /* Log file. */
+ DB_APP53_META, /* Persistent metadata file. */
+ DB_APP53_RECOVER, /* We are in recovery. */
+ DB_APP53_TMP /* Temporary file. */
+} APPNAME53;
+
+static APPNAME __fop_convert_appname __P((ENV *, APPNAME53));
+static int __fop_create_recover_int __P((ENV *, char *, db_recops, int));
static int __fop_rename_recover_int
__P((ENV *, DBT *, DB_LSN *, db_recops, void *, int));
+static int __fop_rename_60_recover_int
+ __P((ENV *, DBT *, DB_LSN *, db_recops, void *, int));
static int __fop_rename_42_recover_int
__P((ENV *, DBT *, DB_LSN *, db_recops, void *, int));
+static int __fop_write_file_recover_int
+ __P((ENV *, db_recops,
+ APPNAME, u_int32_t, DBT *, DBT *, DBT *, DBT *, off_t, DB_TXN *));
+
+/*
+ * The APPNAME enumeration was changed in 6.0 to include DB_APP_BLOB. APPNAME
+ * is used by the log records __fop_create, __fop_write, and __fop_rename.
+ * __fop_write_file also includes an APPNAME field, but that record was created
+ * in 6.0.
+ */
+static APPNAME
+__fop_convert_appname(env, appname)
+	ENV *env;
+	APPNAME53 appname;
+{
+	APPNAME current;
+
+	/* Each old code maps to the current code of the same name. */
+	switch (appname) {
+	case DB_APP53_NONE:
+		current = DB_APP_NONE;
+		break;
+	case DB_APP53_DATA:
+		current = DB_APP_DATA;
+		break;
+	case DB_APP53_LOG:
+		current = DB_APP_LOG;
+		break;
+	case DB_APP53_META:
+		current = DB_APP_META;
+		break;
+	case DB_APP53_RECOVER:
+		current = DB_APP_RECOVER;
+		break;
+	case DB_APP53_TMP:
+		current = DB_APP_TMP;
+		break;
+	default:
+		/* Not one of the old codes: assert, then fall back to NONE. */
+		DB_ASSERT(env, 0);
+		current = DB_APP_NONE;
+		break;
+	}
+	return (current);
+}
/*
* The transactional guarantees Berkeley DB provides for file
@@ -50,6 +97,85 @@ static int __fop_rename_42_recover_int
* it does not apply.
*/
+/*
+ * __fop_create_recover_int --
+ *	Shared worker for the create recovery functions: undo or redo the
+ *	creation of the file named by real_name.
+ *
+ *	env:		environment handle.
+ *	real_name:	resolved path of the file; it is not freed here.
+ *	op:		recovery operation; only undo and redo operations do work.
+ *	mode:		file mode handed to __os_open() when re-creating.
+ *
+ *	Returns 0 on success, otherwise the error from __memp_nameop(),
+ *	__rep_call_partial(), __db_mkpath() or the creating __os_open().
+ */
+static int
+__fop_create_recover_int(env, real_name, op, mode)
+	ENV *env;
+	char *real_name;
+	db_recops op;
+	int mode;
+{
+	DB_FH *fhp;
+	DBMETA *meta;
+	u_int8_t mbuf[DBMETASIZE];
+	int ret;
+	char *path;
+#ifdef HAVE_REPLICATION
+	DELAYED_BLOB_LIST *dbl;
+	int view_partial;
+
+	dbl = NULL;
+#endif
+	meta = (DBMETA *)mbuf;
+	ret = 0;
+
+	if (DB_UNDO(op)) {
+		if (__os_open(env, real_name, 0, 0, 0, &fhp) != 0) {
+			/* Cannot open it: best-effort unlink only. */
+			(void)__os_unlink(env, real_name, 0);
+		} else if (__fop_read_meta(env,
+		    real_name, mbuf, DBMETASIZE, fhp, 1, NULL) == 0 &&
+		    __db_chk_meta(env, NULL, meta, DB_CHK_META) == 0) {
+			/*
+			 * If the file was opened in mpool, we must mark it as
+			 * dead via nameop which will also unlink the file.
+			 * The handle is closed whether or not nameop succeeds,
+			 * so a nameop failure no longer leaks it.
+			 */
+			ret = __memp_nameop(env,
+			    meta->uid, NULL, real_name, NULL, 0);
+			(void)__os_closehandle(env, fhp);
+		} else {
+			/* No usable meta-data page: close, then unlink. */
+			(void)__os_closehandle(env, fhp);
+			(void)__os_unlink(env, real_name, 0);
+		}
+	} else if (DB_REDO(op)) {
+		path = real_name;
+#ifdef DB_WIN32
+		/*
+		 * Absolute paths on windows can result in it creating a
+		 * "C" or "D" directory in the working directory.
+		 */
+		if (__os_abspath(real_name))
+			path += 2;
+#endif
+
+#ifdef HAVE_REPLICATION
+		/*
+		 * Prevent replication of blob files if their owning database
+		 * is not replicated.
+		 */
+		if (IS_VIEW_SITE(env) && IS_BLOB_FILE(path)) {
+			if ((ret = __rep_call_partial(env,
+			    path, &view_partial, 0, &dbl)) != 0)
+				goto out;
+			DB_ASSERT(env, dbl == NULL);
+			/* Not part of this view: skip the create, ret is 0. */
+			if (view_partial == 0)
+				goto out;
+		}
+#endif
+		/* Blob directories might not exist yet. */
+		if (__os_exists(env, real_name, NULL) != 0 &&
+		    (ret = __db_mkpath(env, path)) != 0)
+			goto out;
+
+		/* Create the file; the handle itself is not needed. */
+		ret = __os_open(env, real_name, 0, DB_OSO_CREATE, mode, &fhp);
+		if (ret == 0)
+			(void)__os_closehandle(env, fhp);
+	}
+out:	return (ret);
+}
+
/*
* __fop_create_recover --
* Recovery function for create.
@@ -66,9 +192,6 @@ __fop_create_recover(env, dbtp, lsnp, op, info)
void *info;
{
__fop_create_args *argp;
- DB_FH *fhp;
- DBMETA *meta;
- u_int8_t mbuf[DBMETASIZE];
int ret;
char *real_name;
const char *dirname;
@@ -78,7 +201,6 @@ __fop_create_recover(env, dbtp, lsnp, op, info)
real_name = NULL;
REC_PRINT(__fop_create_print);
REC_NOOP_INTRO(__fop_create_read);
- meta = (DBMETA *)mbuf;
if (argp->dirname.size == 0)
dirname = NULL;
@@ -90,32 +212,60 @@ __fop_create_recover(env, dbtp, lsnp, op, info)
(const char *)argp->name.data, &dirname, &real_name)) != 0)
goto out;
- if (DB_UNDO(op)) {
- /*
- * If the file was opened in mpool, we must mark it as
- * dead via nameop which will also unlink the file.
- */
- if (__os_open(env, real_name, 0, 0, 0, &fhp) == 0) {
- if (__fop_read_meta(env,
- real_name, mbuf, DBMETASIZE, fhp, 1, NULL) == 0 &&
- __db_chk_meta(env, NULL, meta, 1) == 0) {
- if ((ret = __memp_nameop(env,
- meta->uid, NULL, real_name, NULL, 0)) != 0)
- goto out;
- } else {
- (void)__os_closehandle(env, fhp);
- goto do_unlink;
- }
- (void)__os_closehandle(env, fhp);
- } else
-do_unlink: (void)__os_unlink(env, real_name, 0);
- } else if (DB_REDO(op)) {
- if ((ret = __os_open(env, real_name, 0,
- DB_OSO_CREATE, (int)argp->mode, &fhp)) == 0)
- (void)__os_closehandle(env, fhp);
- else
- goto out;
- }
+ if ((ret = __fop_create_recover_int(
+ env, real_name, op, (int)argp->mode)) != 0)
+ goto out;
+
+ *lsnp = argp->prev_lsn;
+
+out: if (real_name != NULL)
+ __os_free(env, real_name);
+
+ REC_NOOP_CLOSE;
+}
+
+/*
+ * __fop_create_60_recover --
+ * Recovery function for create.
+ *
+ * PUBLIC: int __fop_create_60_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__fop_create_60_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __fop_create_60_args *argp;
+ APPNAME appname;
+ int ret;
+ char *real_name;
+ const char *dirname;
+
+ COMPQUIET(info, NULL);
+
+ real_name = NULL;
+ REC_PRINT(__fop_create_60_print);
+ REC_NOOP_INTRO(__fop_create_60_read);
+
+ if (argp->dirname.size == 0)
+ dirname = NULL;
+ else
+ dirname = (const char *)argp->dirname.data;
+
+ appname = __fop_convert_appname(env, (APPNAME53)argp->appname);
+
+ if ((ret = __db_appname(env,
+ appname == DB_APP_DATA ? DB_APP_RECOVER : appname,
+ (const char *)argp->name.data, &dirname, &real_name)) != 0)
+ goto out;
+
+ if ((ret = __fop_create_recover_int(
+ env, real_name, op, (int)argp->mode)) != 0)
+ goto out;
*lsnp = argp->prev_lsn;
@@ -144,6 +294,7 @@ __fop_create_42_recover(env, dbtp, lsnp, op, info)
DB_FH *fhp;
DBMETA *meta;
u_int8_t mbuf[DBMETASIZE];
+ APPNAME appname;
int ret;
char *real_name;
@@ -153,8 +304,9 @@ __fop_create_42_recover(env, dbtp, lsnp, op, info)
REC_PRINT(__fop_create_print);
REC_NOOP_INTRO(__fop_create_read);
meta = (DBMETA *)mbuf;
+ appname = __fop_convert_appname(env, (APPNAME53)argp->appname);
- if ((ret = __db_appname(env, (APPNAME)argp->appname,
+ if ((ret = __db_appname(env, appname,
(const char *)argp->name.data, NULL, &real_name)) != 0)
goto out;
@@ -166,7 +318,7 @@ __fop_create_42_recover(env, dbtp, lsnp, op, info)
if (__os_open(env, real_name, 0, 0, 0, &fhp) == 0) {
if (__fop_read_meta(env,
real_name, mbuf, DBMETASIZE, fhp, 1, NULL) == 0 &&
- __db_chk_meta(env, NULL, meta, 1) == 0) {
+ __db_chk_meta(env, NULL, meta, DB_CHK_META) == 0) {
if ((ret = __memp_nameop(env,
meta->uid, NULL, real_name, NULL, 0)) != 0)
goto out;
@@ -232,6 +384,49 @@ out: if (real_name != NULL)
}
/*
+ * __fop_remove_60_recover --
+ * Recovery function for remove.
+ *
+ * PUBLIC: int __fop_remove_60_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__fop_remove_60_recover(env, dbtp, lsnp, op, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	__fop_remove_60_args *argp;
+	int ret;
+	char *path;
+
+	COMPQUIET(info, NULL);
+
+	path = NULL;
+	REC_PRINT(__fop_remove_60_print);
+	REC_NOOP_INTRO(__fop_remove_60_read);
+
+	/* Resolve the file name using the converted (current) appname. */
+	ret = __db_appname(env,
+	    __fop_convert_appname(env, (APPNAME53)argp->appname),
+	    (const char *)argp->name.data, NULL, &path);
+
+	if (ret == 0) {
+		/*
+		 * It's ok if the file is not there, so the result of the
+		 * remove is ignored.  Only redo does any work.
+		 */
+		if (DB_REDO(op))
+			(void)__memp_nameop(env,
+			    (u_int8_t *)argp->fid.data, NULL, path, NULL, 0);
+
+		*lsnp = argp->prev_lsn;
+	}
+
+	if (path != NULL)
+		__os_free(env, path);
+	REC_NOOP_CLOSE;
+}
+
+/*
* __fop_write_recover --
* Recovery function for writechunk.
*
@@ -251,6 +446,15 @@ __fop_write_recover(env, dbtp, lsnp, op, info)
COMPQUIET(info, NULL);
+#ifndef HAVE_64BIT_TYPES
+ COMPQUIET(dbtp, NULL);
+ COMPQUIET(lsnp, NULL);
+ COMPQUIET(op, 0);
+ __db_errx(env, DB_STR("0243",
+ "Blobs require 64 integer compiler support."));
+ return (DB_OPNOTSUP);
+#endif
+
REC_PRINT(__fop_write_print);
REC_NOOP_INTRO(__fop_write_read);
@@ -272,6 +476,48 @@ __fop_write_recover(env, dbtp, lsnp, op, info)
}
/*
+ * __fop_write_60_recover --
+ * Recovery function for writechunk.
+ *
+ * PUBLIC: int __fop_write_60_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__fop_write_60_recover(env, dbtp, lsnp, op, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	__fop_write_60_args *argp;
+	APPNAME app;
+	int ret;
+
+	COMPQUIET(info, NULL);
+
+	REC_PRINT(__fop_write_60_print);
+	REC_NOOP_INTRO(__fop_write_60_read);
+
+	ret = 0;
+	if (DB_UNDO(op)) {
+		/* Nothing is written back on undo; only the flag is checked. */
+		DB_ASSERT(env, argp->flag != 0);
+	} else if (DB_REDO(op)) {
+		/* Map the old appname code; DATA is written as RECOVER. */
+		app = __fop_convert_appname(env, (APPNAME53)argp->appname);
+		if (app == DB_APP_DATA)
+			app = DB_APP_RECOVER;
+
+		ret = __fop_write(env, argp->txnp, argp->name.data,
+		    argp->dirname.size == 0 ? NULL : argp->dirname.data,
+		    app, NULL, argp->pgsize, argp->pageno, argp->offset,
+		    argp->page.data, argp->page.size, argp->flag, 0);
+	}
+
+	/* Step back to the previous record only when nothing failed. */
+	if (ret == 0)
+		*lsnp = argp->prev_lsn;
+	REC_NOOP_CLOSE;
+}
+
+/*
* __fop_write_42_recover --
* Recovery function for writechunk.
*
@@ -287,6 +533,7 @@ __fop_write_42_recover(env, dbtp, lsnp, op, info)
void *info;
{
__fop_write_args *argp;
+ APPNAME appname;
int ret;
COMPQUIET(info, NULL);
@@ -297,18 +544,194 @@ __fop_write_42_recover(env, dbtp, lsnp, op, info)
ret = 0;
if (DB_UNDO(op))
DB_ASSERT(env, argp->flag != 0);
- else if (DB_REDO(op))
+ else if (DB_REDO(op)) {
+ appname = __fop_convert_appname(env, (APPNAME53)argp->appname);
ret = __fop_write(env,
- argp->txnp, argp->name.data, NULL, (APPNAME)argp->appname,
+ argp->txnp, argp->name.data, NULL, appname,
NULL, argp->pgsize, argp->pageno, argp->offset,
argp->page.data, argp->page.size, argp->flag, 0);
+ }
+
+ if (ret == 0)
+ *lsnp = argp->prev_lsn;
+ REC_NOOP_CLOSE;
+}
+
+static int
+__fop_write_file_recover_int(
+    env, op, appname, flag, dirname, name, new_data, old_data, offset, txn)
+	ENV *env;
+	db_recops op;
+	APPNAME appname;
+	u_int32_t flag;
+	DBT *dirname;
+	DBT *name;
+	DBT *new_data;
+	DBT *old_data;
+	off_t offset;
+	DB_TXN *txn;
+{
+	APPNAME rec_app;
+	DB_FH *fh;
+	size_t written;
+	int ret;
+	char *real_path;
+
+	fh = NULL;
+	real_path = NULL;
+	ret = 0;
+	/* DB_APP_DATA is replaced by DB_APP_RECOVER in both directions. */
+	rec_app = appname == DB_APP_DATA ? DB_APP_RECOVER : appname;
+
+	if (DB_UNDO(op)) {
+		/*
+		 * A file created in this transaction needs nothing here:
+		 * destroying the file will undo the write.
+		 */
+		if (flag & DB_FOP_CREATE)
+			goto end;
+
+		ret = __db_appname(env, rec_app, name->data, NULL, &real_path);
+		if (ret != 0)
+			goto end;
+
+		/* A file that cannot be opened is skipped without error. */
+		if (__os_open(env, real_path, 0, 0, DB_MODE_600, &fh) != 0)
+			goto end;
+
+		if (flag & DB_FOP_APPEND) {
+			/* Undo an append by truncating the file at offset. */
+			(void)__os_truncate(env, fh, 0, 0, offset);
+		} else if (__os_seek(env, fh, 0, 0, offset) == 0) {
+			/*
+			 * Undo an overwrite in the middle of the file by
+			 * writing the old data back.  A failed seek is
+			 * ignored.
+			 */
+			ret = __os_write(env, fh,
+			    old_data->data, old_data->size, &written);
+		}
+	} else if (DB_REDO(op)) {
+		/*
+		 * Not every operation logs enough data to be redone; only
+		 * records flagged DB_FOP_REDO are replayed.  Files are
+		 * flushed before commit, so this only matters on an HA
+		 * client or when initializing from a backup.
+		 */
+		if (flag & DB_FOP_REDO) {
+			ret = __fop_write_file(env, txn, name->data,
+			    dirname->size == 0 ? NULL : dirname->data,
+			    rec_app, NULL, offset,
+			    new_data->data, new_data->size, 0);
+#ifdef HAVE_REPLICATION
+			/*
+			 * Blob files of databases that are not replicated are
+			 * not replicated either, so on a view site ENOENT is
+			 * assumed to mean the file was never replicated.
+			 */
+			if (ret == ENOENT && IS_VIEW_SITE(env))
+				ret = 0;
+#endif
+		}
+	}
+
+end:	if (real_path != NULL)
+		__os_free(env, real_path);
+	if (fh != NULL)
+		(void)__os_closehandle(env, fh);
+	return (ret);
+}
+/*
+ * __fop_write_file_recover --
+ * Recovery function for writing to a blob file. Files are flushed before
+ * the transaction is committed, so often the file operations do not need
+ * to be redone or undone. However, since no lsn is stored in the file,
+ * we always try to redo or undo the operation, since it will not change
+ * the final state of the file if the operation is not needed. This also
+ * means that this function has to be very tolerant of errors, such as
+ * trying to open a file that was deleted, or truncate a file that is
+ * already short.
+ *
+ * PUBLIC: int __fop_write_file_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__fop_write_file_recover(env, dbtp, lsnp, op, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	__fop_write_file_args *argp;
+	APPNAME appname;
+	off_t offset;
+	int ret;
+
+	COMPQUIET(info, NULL);
+
+#ifndef HAVE_64BIT_TYPES
+	/* Without 64-bit integer types the record is rejected outright. */
+	COMPQUIET(dbtp, NULL);
+	COMPQUIET(lsnp, NULL);
+	COMPQUIET(op, 0);
+	__db_errx(env, DB_STR("0244",
+	    "Blobs require 64 integer compiler support."));
+	return (DB_OPNOTSUP);
+#endif
+
+	REC_PRINT(__fop_write_file_print);
+	REC_NOOP_INTRO(__fop_write_file_read);
+
+	/* Unpack the record fields, then let the shared worker apply them. */
+	appname = (APPNAME)argp->appname;
+	offset = (off_t)argp->offset;
+	ret = __fop_write_file_recover_int(env, op, appname, argp->flag,
+	    &argp->dirname, &argp->name, &argp->new_data, &argp->old_data,
+	    offset, argp->txnp);
 	if (ret == 0)
 		*lsnp = argp->prev_lsn;
 	REC_NOOP_CLOSE;
 }
/*
+ * __fop_write_file_60_recover --
+ *
+ * PUBLIC: int __fop_write_file_60_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__fop_write_file_60_recover(env, dbtp, lsnp, op, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	__fop_write_file_60_args *argp;
+	off_t file_off;
+	int ret;
+
+	COMPQUIET(info, NULL);
+
+	REC_PRINT(__fop_write_file_60_print);
+	REC_NOOP_INTRO(__fop_write_file_60_read);
+
+	/*
+	 * The offset is stored as two u_int32_t values; combine them first
+	 * and only call the shared worker if that succeeded.
+	 */
+	GET_LO_HI(env, argp->offset_lo, argp->offset_hi, file_off, ret);
+	if (ret == 0)
+		ret = __fop_write_file_recover_int(env, op,
+		    (APPNAME)argp->appname, argp->flag,
+		    &argp->dirname, &argp->name,
+		    &argp->new_data, &argp->old_data,
+		    file_off, argp->txnp);
+
+	if (ret == 0)
+		*lsnp = argp->prev_lsn;
+	REC_NOOP_CLOSE;
+}
+
+/*
* __fop_rename_recover --
* Recovery functions for rename. There are two variants that
* both use the same utility function. Had we known about this on day
@@ -408,7 +831,148 @@ __fop_rename_recover_int(env, dbtp, lsnp, op, info, undo)
if (__fop_read_meta(env,
src, mbuf, DBMETASIZE, fhp, 1, NULL) != 0)
goto done;
- if (__db_chk_meta(env, NULL, meta, 1) != 0)
+ if (__db_chk_meta(env, NULL, meta, DB_CHK_META) != 0)
+ goto done;
+ if (memcmp(argp->fileid.data, meta->uid, DB_FILE_ID_LEN) != 0)
+ goto done;
+ (void)__os_closehandle(env, fhp);
+ fhp = NULL;
+ if (DB_REDO(op)) {
+ /*
+ * Check to see if the target file exists. If it
+ * does and it does not have the proper id then
+ * it is a later version. We just remove the source
+ * file since the state of the world is beyond this
+ * point.
+ */
+ if (__os_open(env, real_new, 0, 0, 0, &fhp) == 0 &&
+ __fop_read_meta(env, src, mbuf,
+ DBMETASIZE, fhp, 1, NULL) == 0 &&
+ __db_chk_meta(env, NULL, meta, DB_CHK_META) == 0 &&
+ memcmp(argp->fileid.data,
+ meta->uid, DB_FILE_ID_LEN) != 0) {
+ (void)__memp_nameop(env,
+ fileid, NULL, real_old, NULL, 0);
+ goto done;
+ }
+ }
+ }
+
+ if (undo && DB_UNDO(op))
+ (void)__memp_nameop(env, fileid,
+ (const char *)argp->oldname.data, real_new, real_old, 0);
+ if (DB_REDO(op))
+ (void)__memp_nameop(env, fileid,
+ (const char *)argp->newname.data, real_old, real_new, 0);
+
+done: *lsnp = argp->prev_lsn;
+out: if (real_new != NULL)
+ __os_free(env, real_new);
+ if (real_old != NULL)
+ __os_free(env, real_old);
+ if (fhp != NULL)
+ (void)__os_closehandle(env, fhp);
+
+ REC_NOOP_CLOSE;
+}
+
+/*
+ * __fop_rename_60_recover --
+ *
+ * PUBLIC: int __fop_rename_60_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ *
+ * PUBLIC: int __fop_rename_noundo_60_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+
+int
+__fop_rename_60_recover(env, dbtp, lsnp, op, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	int ret;
+
+	/* Delegate to the shared worker with its undo argument set to 1. */
+	ret = __fop_rename_60_recover_int(env, dbtp, lsnp, op, info, 1);
+	return (ret);
+}
+
+int
+__fop_rename_noundo_60_recover(env, dbtp, lsnp, op, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	int ret;
+
+	/* Delegate to the shared worker with its undo argument set to 0. */
+	ret = __fop_rename_60_recover_int(env, dbtp, lsnp, op, info, 0);
+	return (ret);
+}
+
+static int
+__fop_rename_60_recover_int(env, dbtp, lsnp, op, info, undo)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+ int undo;
+{
+ __fop_rename_60_args *argp;
+ APPNAME appname;
+ DB_FH *fhp;
+ DBMETA *meta;
+ u_int8_t *fileid, mbuf[DBMETASIZE];
+ int ret;
+ char *real_new, *real_old, *src;
+ const char *dirname;
+
+ COMPQUIET(info, NULL);
+
+ fhp = NULL;
+ meta = (DBMETA *)&mbuf[0];
+ ret = 0;
+ real_new = real_old = NULL;
+
+ REC_PRINT(__fop_rename_60_print);
+ REC_NOOP_INTRO(__fop_rename_60_read);
+ fileid = argp->fileid.data;
+
+ if (argp->dirname.size == 0)
+ dirname = NULL;
+ else
+ dirname = (const char *)argp->dirname.data;
+
+
+ appname = __fop_convert_appname(env, (APPNAME53)argp->appname);
+ if (appname == DB_APP_DATA)
+ appname = DB_APP_RECOVER;
+
+ if ((ret = __db_appname(env, appname, (const char *)argp->newname.data,
+ &dirname, &real_new)) != 0)
+ goto out;
+ if ((ret = __db_appname(env, appname, (const char *)argp->oldname.data,
+ &dirname, &real_old)) != 0)
+ goto out;
+
+ /*
+ * Verify that we are manipulating the correct file. We should always
+ * be OK on an ABORT or an APPLY, but during recovery, we have to
+ * check.
+ */
+ if (op != DB_TXN_ABORT && op != DB_TXN_APPLY) {
+ src = DB_UNDO(op) ? real_new : real_old;
+ /*
+ * Interpret any error as meaning that the file either doesn't
+ * exist, doesn't have a meta-data page, or is in some other
+ * way, shape or form, incorrect, so that we should not restore
+ * it.
+ */
+ if (__os_open(env, src, 0, 0, 0, &fhp) != 0)
+ goto done;
+ if (__fop_read_meta(env,
+ src, mbuf, DBMETASIZE, fhp, 1, NULL) != 0)
+ goto done;
+ if (__db_chk_meta(env, NULL, meta, DB_CHK_META) != 0)
goto done;
if (memcmp(argp->fileid.data, meta->uid, DB_FILE_ID_LEN) != 0)
goto done;
@@ -425,7 +989,7 @@ __fop_rename_recover_int(env, dbtp, lsnp, op, info, undo)
if (__os_open(env, real_new, 0, 0, 0, &fhp) == 0 &&
__fop_read_meta(env, src, mbuf,
DBMETASIZE, fhp, 1, NULL) == 0 &&
- __db_chk_meta(env, NULL, meta, 1) == 0 &&
+ __db_chk_meta(env, NULL, meta, DB_CHK_META) == 0 &&
memcmp(argp->fileid.data,
meta->uid, DB_FILE_ID_LEN) != 0) {
(void)__memp_nameop(env,
@@ -501,6 +1065,7 @@ __fop_rename_42_recover_int(env, dbtp, lsnp, op, info, undo)
DB_FH *fhp;
DBMETA *meta;
u_int8_t *fileid, mbuf[DBMETASIZE];
+ APPNAME appname;
int ret;
char *real_new, *real_old, *src;
@@ -515,10 +1080,11 @@ __fop_rename_42_recover_int(env, dbtp, lsnp, op, info, undo)
REC_NOOP_INTRO(__fop_rename_read);
fileid = argp->fileid.data;
- if ((ret = __db_appname(env, (APPNAME)argp->appname,
+ appname = __fop_convert_appname(env, (APPNAME53)argp->appname);
+ if ((ret = __db_appname(env, appname,
(const char *)argp->newname.data, NULL, &real_new)) != 0)
goto out;
- if ((ret = __db_appname(env, (APPNAME)argp->appname,
+ if ((ret = __db_appname(env, appname,
(const char *)argp->oldname.data, NULL, &real_old)) != 0)
goto out;
@@ -540,7 +1106,7 @@ __fop_rename_42_recover_int(env, dbtp, lsnp, op, info, undo)
if (__fop_read_meta(env,
src, mbuf, DBMETASIZE, fhp, 1, NULL) != 0)
goto done;
- if (__db_chk_meta(env, NULL, meta, 1) != 0)
+ if (__db_chk_meta(env, NULL, meta, DB_CHK_META) != 0)
goto done;
if (memcmp(argp->fileid.data, meta->uid, DB_FILE_ID_LEN) != 0)
goto done;
@@ -557,7 +1123,7 @@ __fop_rename_42_recover_int(env, dbtp, lsnp, op, info, undo)
if (__os_open(env, real_new, 0, 0, 0, &fhp) == 0 &&
__fop_read_meta(env, src, mbuf,
DBMETASIZE, fhp, 1, NULL) == 0 &&
- __db_chk_meta(env, NULL, meta, 1) == 0 &&
+ __db_chk_meta(env, NULL, meta, DB_CHK_META) == 0 &&
memcmp(argp->fileid.data,
meta->uid, DB_FILE_ID_LEN) != 0) {
(void)__memp_nameop(env,
@@ -652,7 +1218,115 @@ __fop_file_remove_recover(env, dbtp, lsnp, op, info)
* We can ignore errors here since we'll simply fail the
* checks below and assume this is the wrong file.
*/
- (void)__db_chk_meta(env, NULL, meta, 1);
+ (void)__db_chk_meta(env, NULL, meta, DB_CHK_META);
+ is_real =
+ memcmp(argp->real_fid.data, meta->uid, DB_FILE_ID_LEN) == 0;
+ is_tmp =
+ memcmp(argp->tmp_fid.data, meta->uid, DB_FILE_ID_LEN) == 0;
+
+ if (!is_real && !is_tmp)
+ /* File exists, but isn't what we were removing. */
+ cstat = TXN_IGNORE;
+ else
+ /* File exists and is the one that we were removing. */
+ cstat = TXN_COMMIT;
+ }
+ if (fhp != NULL) {
+ (void)__os_closehandle(env, fhp);
+ fhp = NULL;
+ }
+
+ if (DB_UNDO(op)) {
+ /* On the backward pass, we leave a note for the child txn. */
+ if ((ret = __db_txnlist_update(env,
+ info, argp->child, cstat, NULL, &ret_stat, 1)) != 0)
+ goto out;
+ } else if (DB_REDO(op)) {
+ /*
+ * On the forward pass, check if someone recreated the
+ * file while we weren't looking.
+ */
+ if (cstat == TXN_COMMIT)
+ (void)__memp_nameop(env,
+ is_real ? argp->real_fid.data : argp->tmp_fid.data,
+ NULL, real_name, NULL, 0);
+ }
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (real_name != NULL)
+ __os_free(env, real_name);
+ if (fhp != NULL)
+ (void)__os_closehandle(env, fhp);
+ REC_NOOP_CLOSE;
+}
+
+/*
+ * __fop_file_remove_60_recover --
+ *
+ * PUBLIC: int __fop_file_remove_60_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__fop_file_remove_60_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __fop_file_remove_60_args *argp;
+ DBMETA *meta;
+ DB_FH *fhp;
+ size_t len;
+ u_int8_t mbuf[DBMETASIZE];
+ u_int32_t cstat, ret_stat;
+ APPNAME appname;
+ int is_real, is_tmp, ret;
+ char *real_name;
+
+ fhp = NULL;
+ meta = (DBMETA *)&mbuf[0];
+ is_real = is_tmp = 0;
+ real_name = NULL;
+ REC_PRINT(__fop_file_remove_60_print);
+ REC_NOOP_INTRO(__fop_file_remove_60_read);
+
+ /*
+ * This record is only interesting on the backward, forward, and
+ * apply phases.
+ */
+ if (op != DB_TXN_BACKWARD_ROLL &&
+ op != DB_TXN_FORWARD_ROLL && op != DB_TXN_APPLY)
+ goto done;
+
+ appname = __fop_convert_appname(env, (APPNAME53)argp->appname);
+ if ((ret = __db_appname(env, appname,
+ argp->name.data, NULL, &real_name)) != 0)
+ goto out;
+
+ /* Verify that we are manipulating the correct file. */
+ len = 0;
+ if (__os_open(env, real_name, 0, 0, 0, &fhp) != 0 ||
+ (ret = __fop_read_meta(env, real_name,
+ mbuf, DBMETASIZE, fhp, 1, &len)) != 0) {
+ /*
+ * If len is non-zero, then the file exists and has something
+ * in it, but that something isn't a full meta-data page, so
+ * this is very bad. Bail out!
+ */
+ if (len != 0)
+ goto out;
+
+ /* File does not exist. */
+ cstat = TXN_EXPECTED;
+ } else {
+ /*
+ * We can ignore errors here since we'll simply fail the
+ * checks below and assume this is the wrong file.
+ */
+ (void)__db_chk_meta(env, NULL, meta, DB_CHK_META);
is_real =
memcmp(argp->real_fid.data, meta->uid, DB_FILE_ID_LEN) == 0;
is_tmp =
@@ -695,3 +1369,4 @@ out: if (real_name != NULL)
(void)__os_closehandle(env, fhp);
REC_NOOP_CLOSE;
}
+
diff --git a/src/fileops/fop_util.c b/src/fileops/fop_util.c
index 1925ffd1..d51aba0f 100644
--- a/src/fileops/fop_util.c
+++ b/src/fileops/fop_util.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -24,9 +24,10 @@ static int __fop_inmem_read_meta __P((DB *, DB_TXN *, const char *, u_int32_t,
u_int32_t));
static int __fop_inmem_swap __P((DB *, DB *, DB_TXN *,
const char *, const char *, const char *, DB_LOCKER *));
-static int __fop_ondisk_dummy __P((DB *, DB_TXN *, const char *, u_int8_t *));
+static int __fop_ondisk_dummy __P((
+ DB *, DB_TXN *, const char *, u_int8_t *, APPNAME));
static int __fop_ondisk_swap __P((DB *, DB *, DB_TXN *,
- const char *, const char *, const char *, DB_LOCKER *));
+ const char *, const char *, const char *, DB_LOCKER *, APPNAME));
/*
* Acquire the environment meta-data lock. The parameters are the
@@ -115,7 +116,7 @@ __fop_lock_handle(env, dbp, locker, mode, elockp, flags)
/*
* If we are in recovery, the only locking we should be
* doing is on the global environment. The one exception
- * is if we are opening an exclusive database on a client
+ * is if we are opening an exclusive database on a client
* syncing with the master.
*/
if (IS_RECOVERING(env) && !F2_ISSET(dbp, DB2_AM_INTEXCL))
@@ -234,8 +235,8 @@ __fop_file_setup(dbp, ip, txn, name, mode, flags, retidp)
real_name = real_tmpname = tmpname = NULL;
dflags = F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0;
aflags = LF_ISSET(DB_INTERNAL_PERSISTENT_DB) ? DB_APP_META :
- (LF_ISSET(DB_INTERNAL_TEMPORARY_DB) ? DB_APP_NONE : DB_APP_DATA);
- LF_CLR(DB_INTERNAL_PERSISTENT_DB | DB_INTERNAL_TEMPORARY_DB);
+ (LF_ISSET(DB_INTERNAL_BLOB_DB) ? DB_APP_BLOB :
+ (LF_ISSET(DB_INTERNAL_TEMPORARY_DB) ? DB_APP_NONE : DB_APP_DATA));
ret = 0;
retries = 0;
@@ -394,14 +395,14 @@ reopen: if (!F_ISSET(dbp, DB_AM_INMEM) && (ret =
goto done;
}
- /*
+ /*
* Case 4: This is a valid file. Now check the
- * checksum and decrypt the file so the file
+ * checksum and decrypt the file so the file
* id can be obtained for the handle lock. Note that
* the checksum can fail if the database is being
* written (possible because the handle lock has
* not been obtained yet). So on checksum fail retry
- * until the checksum succeeds or the number of
+ * until the checksum succeeds or the number of
* retries is exhausted, then throw an error.
*/
if (ret == 0 && (ret = __db_chk_meta(env, dbp,
@@ -410,7 +411,7 @@ reopen: if (!F_ISSET(dbp, DB_AM_INMEM) && (ret =
ret = t_ret;
goto err;
}
- /*
+ /*
* Retry unless the number of retries is
* exhausted.
*/
@@ -423,8 +424,7 @@ reopen: if (!F_ISSET(dbp, DB_AM_INMEM) && (ret =
ret = EINVAL;
goto err;
}
- if ((ret = __os_closehandle(env, fhp)) != 0)
- goto err;
+ CLOSE_HANDLE(dbp, fhp);
goto retry;
}
/* Get the file id for the handle lock. */
@@ -464,11 +464,8 @@ reopen: if (!F_ISSET(dbp, DB_AM_INMEM) && (ret =
* any application level FCNTL semantics.
*/
DB_ASSERT(env, !LF_ISSET(DB_FCNTL_LOCKING));
- if (!F_ISSET(dbp, DB_AM_INMEM)) {
- if ((ret = __os_closehandle(env, fhp)) != 0)
- goto err;
- fhp = NULL;
- }
+ if (!F_ISSET(dbp, DB_AM_INMEM))
+ CLOSE_HANDLE(dbp, fhp);
if ((ret = __fop_lock_handle(env,
dbp, locker, lockmode, &elock, 0)) != 0) {
if (F_ISSET(dbp, DB_AM_INMEM))
@@ -495,7 +492,7 @@ reopen: if (!F_ISSET(dbp, DB_AM_INMEM) && (ret =
}
- /*
+ /*
* If we got here, then we have the handle lock, it is now
* safe to check the rest of the meta data, since the file
* will not be deleted out from under the handle.
@@ -505,7 +502,7 @@ reopen: if (!F_ISSET(dbp, DB_AM_INMEM) && (ret =
dbp, txn, name, flags, DB_SKIP_CHK)) != 0)
goto err;
} else {
- if ((ret = __db_meta_setup(env, dbp, real_name,
+ if ((ret = __db_meta_setup(env, dbp, real_name,
(DBMETA *)mbuf, flags, DB_SKIP_CHK)) != 0)
goto err;
}
@@ -524,9 +521,8 @@ reopen: if (!F_ISSET(dbp, DB_AM_INMEM) && (ret =
if (create_ok) {
if (F_ISSET(dbp, DB_AM_INMEM)) {
RESET_MPF(dbp, DB_MPOOL_DISCARD);
- } else if ((ret =
- __os_closehandle(env, fhp)) != 0)
- goto err;
+ } else
+ CLOSE_HANDLE(dbp, fhp);
LF_SET(DB_CREATE);
goto create;
} else {
@@ -856,6 +852,7 @@ retry: if ((ret = __db_master_open(dbp,
/* Copy the pagesize and set the sub-database flag. */
dbp->pgsize = mdbp->pgsize;
F_SET(dbp, DB_AM_SUBDB);
+ dbp->blob_file_id = mdbp->blob_file_id;
if (name != NULL && (ret = __db_master_update(mdbp, dbp,
ip, txn, name, dbp->type, MU_OPEN, NULL, flags)) != 0) {
@@ -881,6 +878,8 @@ retry: if ((ret = __db_master_open(dbp,
DB_TEST_RECOVERY(dbp, DB_TEST_POSTLOG, ret, mname);
+ dbp->dirname = mdbp->dirname;
+
/*
* We copy our fileid from our master so that we all open
* the same file in mpool. We'll use the meta-pgno to lock
@@ -1174,13 +1173,14 @@ err:
* remove).
*
* PUBLIC: int __fop_dummy __P((DB *,
- * PUBLIC: DB_TXN *, const char *, const char *));
+ * PUBLIC: DB_TXN *, const char *, const char *, APPNAME));
*/
int
-__fop_dummy(dbp, txn, old, new)
+__fop_dummy(dbp, txn, old, new, appname)
DB *dbp;
DB_TXN *txn;
const char *old, *new;
+ APPNAME appname;
{
DB *tmpdbp;
DB_TXN *stxn;
@@ -1214,17 +1214,19 @@ __fop_dummy(dbp, txn, old, new)
if (F_ISSET(dbp, DB_AM_NOT_DURABLE) &&
(ret = __db_set_flags(tmpdbp, DB_TXN_NOT_DURABLE)) != 0)
goto err;
+ tmpdbp->dirname = dbp->dirname;
memset(mbuf, 0, sizeof(mbuf));
ret = F_ISSET(dbp, DB_AM_INMEM) ?
__fop_inmem_dummy(tmpdbp, stxn, back, mbuf) :
- __fop_ondisk_dummy(tmpdbp, stxn, back, mbuf);
+ __fop_ondisk_dummy(tmpdbp, stxn, back, mbuf, appname);
if (ret != 0)
goto err;
ret = F_ISSET(dbp, DB_AM_INMEM) ?
__fop_inmem_swap(dbp, tmpdbp, stxn, old, new, back, txn->locker) :
- __fop_ondisk_swap(dbp, tmpdbp, stxn, old, new, back, txn->locker);
+ __fop_ondisk_swap(
+ dbp, tmpdbp, stxn, old, new, back, txn->locker, appname);
stxn = NULL;
if (ret != 0)
goto err;
@@ -1246,12 +1248,13 @@ err: if (stxn != NULL)
* and the subsequent calls in __db_rename do the work for the
* transactional case).
*
- * PUBLIC: int __fop_dbrename __P((DB *, const char *, const char *));
+ * PUBLIC: int __fop_dbrename __P((DB *, const char *, const char *, APPNAME));
*/
int
-__fop_dbrename(dbp, old, new)
+__fop_dbrename(dbp, old, new, appname)
DB *dbp;
const char *old, *new;
+ APPNAME appname;
{
DB_LOCK elock;
ENV *env;
@@ -1269,11 +1272,11 @@ __fop_dbrename(dbp, old, new)
} else {
/* Get full names. */
if ((ret = __db_appname(env,
- DB_APP_DATA, old, &dbp->dirname, &real_old)) != 0)
+ appname, old, &dbp->dirname, &real_old)) != 0)
goto err;
if ((ret = __db_appname(env,
- DB_APP_DATA, new, &dbp->dirname, &real_new)) != 0)
+ appname, new, &dbp->dirname, &real_new)) != 0)
goto err;
}
@@ -1414,9 +1417,11 @@ __fop_inmem_read_meta(dbp, txn, name, flags, chkflags)
if ((ret = __db_chk_meta(dbp->env, dbp, metap, chkflags)) == 0)
memcpy(dbp->fileid,
((DBMETA *)metap)->uid, DB_FILE_ID_LEN);
- } else
+ } else
ret = __db_meta_setup(
dbp->env, dbp, name, metap, flags, chkflags);
+ if (ret == DB_CHKSUM_FAIL)
+ ret = DB_META_CHKSUM_FAIL;
if ((t_ret =
__memp_fput(dbp->mpf, ip, metap, dbp->priority)) && ret == 0)
@@ -1426,11 +1431,12 @@ __fop_inmem_read_meta(dbp, txn, name, flags, chkflags)
}
static int
-__fop_ondisk_dummy(dbp, txn, name, mbuf)
+__fop_ondisk_dummy(dbp, txn, name, mbuf, appname)
DB *dbp;
DB_TXN *txn;
const char *name;
u_int8_t *mbuf;
+ APPNAME appname;
{
ENV *env;
int ret;
@@ -1442,11 +1448,11 @@ __fop_ondisk_dummy(dbp, txn, name, mbuf)
dflags = F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0;
if ((ret = __db_appname(env,
- DB_APP_DATA, name, &dbp->dirname, &realname)) != 0)
+ appname, name, &dbp->dirname, &realname)) != 0)
goto err;
if ((ret = __fop_create(env,
- txn, NULL, name, &dbp->dirname, DB_APP_DATA, 0, dflags)) != 0)
+ txn, NULL, name, &dbp->dirname, appname, 0, dflags)) != 0)
goto err;
if ((ret =
@@ -1455,7 +1461,7 @@ __fop_ondisk_dummy(dbp, txn, name, mbuf)
((DBMETA *)mbuf)->magic = DB_RENAMEMAGIC;
if ((ret = __fop_write(env, txn, name, dbp->dirname,
- DB_APP_DATA, NULL, 0, 0, 0, mbuf, DBMETASIZE, 1, dflags)) != 0)
+ appname, NULL, 0, 0, 0, mbuf, DBMETASIZE, 1, dflags)) != 0)
goto err;
memcpy(dbp->fileid, ((DBMETA *)mbuf)->uid, DB_FILE_ID_LEN);
@@ -1511,11 +1517,12 @@ err: return (ret);
}
static int
-__fop_ondisk_swap(dbp, tmpdbp, txn, old, new, back, locker)
+__fop_ondisk_swap(dbp, tmpdbp, txn, old, new, back, locker, appname)
DB *dbp, *tmpdbp;
DB_TXN *txn;
const char *old, *new, *back;
DB_LOCKER *locker;
+ APPNAME appname;
{
DBT fiddbt, namedbt, tmpdbt;
DB_FH *fhp;
@@ -1538,7 +1545,7 @@ __fop_ondisk_swap(dbp, tmpdbp, txn, old, new, back, locker)
dflags = F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0;
if ((ret = __db_appname(env,
- DB_APP_DATA, new, &dbp->dirname, &realnew)) != 0)
+ appname, new, &dbp->dirname, &realnew)) != 0)
goto err;
/* Now, lock the name space while we initialize this file. */
@@ -1634,10 +1641,10 @@ retry: GET_ENVLOCK(env, locker, &elock);
* swap for the handle lock.
*/
if ((ret = __fop_rename(env, txn,
- old, new, &dbp->dirname, dbp->fileid, DB_APP_DATA, 1, dflags)) != 0)
+ old, new, &dbp->dirname, dbp->fileid, appname, 1, dflags)) != 0)
goto err;
if ((ret = __fop_rename(env, txn, back, old,
- &dbp->dirname, tmpdbp->fileid, DB_APP_DATA, 0, dflags)) != 0)
+ &dbp->dirname, tmpdbp->fileid, appname, 0, dflags)) != 0)
goto err;
if ((ret = __fop_lock_handle(env,
tmpdbp, locker, DB_LOCK_WRITE, &elock, NOWAIT_FLAG(txn))) != 0)
@@ -1673,12 +1680,12 @@ retry: GET_ENVLOCK(env, locker, &elock);
DB_INIT_DBT(namedbt, old, strlen(old) + 1);
if ((t_ret = __fop_file_remove_log(env,
parent, &lsn, dflags, &fiddbt, &tmpdbt, &namedbt,
- (u_int32_t)DB_APP_DATA, child_txnid)) != 0 && ret == 0)
+ (u_int32_t)appname, child_txnid)) != 0 && ret == 0)
ret = t_ret;
/* This is a delayed delete of the dummy file. */
if ((ret = __db_appname(env,
- DB_APP_DATA, old, &dbp->dirname, &realold)) != 0)
+ appname, old, &dbp->dirname, &realold)) != 0)
goto err;
if ((ret = __txn_remevent(env, parent, realold, NULL, 0)) != 0)
diff --git a/src/hash/hash.c b/src/hash/hash.c
index ae5736e7..5bff1dee 100644
--- a/src/hash/hash.c
+++ b/src/hash/hash.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright (c) 1990, 1993, 1994
@@ -298,6 +298,7 @@ __hamc_count(dbc, recnop)
}
switch (HPAGE_PTYPE(H_PAIRDATA(dbp, hcp->page, hcp->indx))) {
+ case H_BLOB:
case H_KEYDATA:
case H_OFFPAGE:
recno = 1;
@@ -379,7 +380,7 @@ __hamc_del(dbc, flags)
hcp = (HASH_CURSOR *)dbc->internal;
if (F_ISSET(hcp, H_DELETED))
- return (DB_NOTFOUND);
+ return (DBC_ERR(dbc, DB_NOTFOUND));
if ((ret = __ham_get_meta(dbc)) != 0)
goto out;
@@ -535,7 +536,7 @@ next: ret = __ham_item_next(dbc, lock_type, pgnop);
case DB_CURRENT:
/* cgetchk has already determined that the cursor is set. */
if (F_ISSET(hcp, H_DELETED)) {
- ret = DB_KEYEMPTY;
+ ret = DBC_ERR(dbc, DB_KEYEMPTY);
goto err;
}
@@ -554,7 +555,8 @@ next: ret = __ham_item_next(dbc, lock_type, pgnop);
if (ret != 0 && ret != DB_NOTFOUND)
goto err;
else if (F_ISSET(hcp, H_OK)) {
- if (*pgnop == PGNO_INVALID)
+ if (*pgnop == PGNO_INVALID && HPAGE_PTYPE(
+ H_PAIRDATA(dbp, hcp->page, hcp->indx)) != H_BLOB)
ret = __ham_dup_return(dbc, data, flags);
break;
} else if (!F_ISSET(hcp, H_NOMORE)) {
@@ -576,7 +578,7 @@ next: ret = __ham_item_next(dbc, lock_type, pgnop);
dbc->thread_info, hcp->page, dbc->priority);
hcp->page = NULL;
if (hcp->bucket == 0) {
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
hcp->pgno = PGNO_INVALID;
goto err;
}
@@ -598,7 +600,7 @@ next: ret = __ham_item_next(dbc, lock_type, pgnop);
F_CLR(hcp, H_ISDUP);
hcp->pgno = BUCKET_TO_PAGE(hcp, hcp->bucket);
if (hcp->bucket > hcp->hdr->max_bucket) {
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
hcp->pgno = PGNO_INVALID;
goto err;
}
@@ -612,7 +614,7 @@ next: ret = __ham_item_next(dbc, lock_type, pgnop);
case DB_SET:
case DB_SET_RANGE:
/* Key not found. */
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
case DB_CURRENT:
/*
@@ -621,7 +623,7 @@ next: ret = __ham_item_next(dbc, lock_type, pgnop);
* locking. We return the same error code as we would
* if the cursor were deleted.
*/
- ret = DB_KEYEMPTY;
+ ret = DBC_ERR(dbc, DB_KEYEMPTY);
goto err;
default:
DB_ASSERT(env, 0);
@@ -649,11 +651,14 @@ __ham_bulk(dbc, data, flags)
DB *dbp;
DB_MPOOLFILE *mpf;
HASH_CURSOR *cp;
+ HBLOB hblob;
PAGE *pg;
db_indx_t dup_len, dup_off, dup_tlen, indx, *inp;
db_lockmode_t lock_mode;
db_pgno_t pgno;
+ off_t blob_size;
int32_t *endp, *offp, *saveoff;
+ db_seq_t blob_id;
u_int32_t key_off, key_size, pagesize, size, space;
u_int8_t *dbuf, *dp, *hk, *np, *tmp;
int is_dup, is_key;
@@ -708,6 +713,10 @@ next_pg:
space -= key_size;
key_off = (u_int32_t)(np - dbuf);
np += key_size;
+ } else if (HPAGE_PTYPE(hk) == H_BLOB) {
+ __db_errx(dbp->env, DB_STR("1185",
+ "Blob item key."));
+ (void)__env_panic(dbp->env, DB_RUNRECOVERY);
} else {
if (need_pg) {
dp = np;
@@ -982,6 +991,38 @@ get_space:
np += size;
space -= size;
break;
+ case H_BLOB:
+ space -= (is_key ? 4 : 2) * sizeof(*offp);
+ if (space > data->ulen)
+ goto back_up;
+
+ memcpy(&hblob, hk, HBLOB_SIZE);
+ blob_id = (db_seq_t)hblob.id;
+ GET_BLOB_SIZE(dbc->env, hblob, blob_size, ret);
+ if (ret != 0)
+ return (ret);
+ if (blob_size > UINT32_MAX) {
+ size = UINT32_MAX;
+ goto back_up;
+ }
+ size = (u_int32_t)blob_size;
+ if (size > space)
+ goto back_up;
+
+ if ((ret = __blob_bulk(dbc, size, blob_id, np)) != 0)
+ return (ret);
+
+ if (is_key) {
+ *offp-- = (int32_t)key_off;
+ *offp-- = (int32_t)key_size;
+ }
+
+ *offp-- = (int32_t)(np - dbuf);
+ *offp-- = (int32_t)size;
+
+ np += size;
+ space -= size;
+ break;
default:
/* Do nothing. */
break;
@@ -1014,7 +1055,7 @@ get_space:
* DBC->get(DB_NEXT) will return DB_NOTFOUND.
*/
cp->bucket--;
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
} else {
/*
* Start on the next bucket.
@@ -1071,7 +1112,7 @@ __hamc_put(dbc, key, data, flags, pgnop)
if (F_ISSET(hcp, H_DELETED) && flags != DB_KEYFIRST &&
flags != DB_KEYLAST && flags != DB_OVERWRITE_DUP)
- return (DB_NOTFOUND);
+ return (DBC_ERR(dbc, DB_NOTFOUND));
if ((ret = __ham_get_meta(dbc)) != 0)
goto err1;
@@ -1083,9 +1124,15 @@ __hamc_put(dbc, key, data, flags, pgnop)
case DB_NOOVERWRITE:
case DB_OVERWRITE_DUP:
nbytes = (ISBIG(hcp, key->size) ? HOFFPAGE_PSIZE :
- HKEYDATA_PSIZE(key->size)) +
- (ISBIG(hcp, data->size) ? HOFFPAGE_PSIZE :
- HKEYDATA_PSIZE(data->size));
+ HKEYDATA_PSIZE(key->size));
+ if (dbp->blob_threshold && (data->size >=
+ dbp->blob_threshold || F_ISSET(data, DB_DBT_BLOB)))
+ nbytes += HBLOB_PSIZE;
+ else if (ISBIG(hcp, data->size))
+ nbytes += HOFFPAGE_PSIZE;
+ else
+ nbytes += HKEYDATA_PSIZE(data->size);
+
if ((ret = __ham_lookup(dbc,
key, nbytes, DB_LOCK_WRITE, pgnop)) == DB_NOTFOUND) {
if (hcp->seek_found_page != PGNO_INVALID &&
@@ -1124,7 +1171,7 @@ __hamc_put(dbc, key, data, flags, pgnop)
} else if (ret == 0 && flags == DB_NOOVERWRITE &&
!F_ISSET(hcp, H_DELETED)) {
if (*pgnop == PGNO_INVALID)
- ret = DB_KEYEXIST;
+ ret = DBC_ERR(dbc, DB_KEYEXIST);
else
ret = __bam_opd_exists(dbc, *pgnop);
if (ret != 0)
@@ -1468,6 +1515,7 @@ __ham_dup_return(dbc, val, flags)
type = HPAGE_TYPE(dbp, hcp->page, ndx);
pp = hcp->page;
myval = val;
+ cmp = 0;
/*
* There are 4 cases:
@@ -1545,9 +1593,13 @@ __ham_dup_return(dbc, val, flags)
memcpy(&pgno,
HOFFPAGE_PGNO(hk), sizeof(db_pgno_t));
if ((ret = __db_moff(dbc, val, pgno, tlen,
- dbp->dup_compare, &cmp)) != 0)
+ dbp->dup_compare, &cmp, NULL)) != 0)
return (ret);
cmp = -cmp;
+ } else if (((HKEYDATA *)hk)->type == H_BLOB) {
+ __db_errx(dbp->env, DB_STR("1186",
+ "Error - found a blob file in a duplicate data set."));
+ (void)__env_panic(dbp->env, DB_RUNRECOVERY);
} else {
/*
* We do not zero tmp_val since the comparison
@@ -1557,8 +1609,8 @@ __ham_dup_return(dbc, val, flags)
tmp_val.size = LEN_HDATA(dbp, hcp->page,
dbp->pgsize, hcp->indx);
cmp = dbp->dup_compare == NULL ?
- __bam_defcmp(dbp, &tmp_val, val) :
- dbp->dup_compare(dbp, &tmp_val, val);
+ __bam_defcmp(dbp, &tmp_val, val, NULL) :
+ dbp->dup_compare(dbp, &tmp_val, val, NULL);
}
if (cmp > 0 && flags == DB_GET_BOTH_RANGE &&
@@ -1567,7 +1619,7 @@ __ham_dup_return(dbc, val, flags)
}
if (cmp != 0)
- return (DB_NOTFOUND);
+ return (DBC_ERR(dbc, DB_NOTFOUND));
}
/*
@@ -1654,17 +1706,21 @@ __ham_overwrite(dbc, nval, flags)
u_int32_t flags;
{
DB *dbp;
- DBT *myval, tmp_val, tmp_val2;
+ DBT *myval, tmp_val, tmp_val2, old_rec, new_rec;
ENV *env;
HASH_CURSOR *hcp;
+ HBLOB hblob;
void *newrec;
u_int8_t *hk, *p;
u_int32_t len, nondup_size;
+ db_seq_t blob_id, new_blob_id;
db_indx_t newsize;
+ off_t blob_size;
int ret;
dbp = dbc->dbp;
env = dbp->env;
+ ret = 0;
hcp = (HASH_CURSOR *)dbc->internal;
if (F_ISSET(hcp, H_ISDUP)) {
/*
@@ -1717,7 +1773,7 @@ __ham_overwrite(dbc, nval, flags)
NULL, nval, flags, NULL));
}
- if ((ret = __os_malloc(dbp->env,
+ if ((ret = __os_malloc(env,
DUP_SIZE(newsize), &newrec)) != 0)
return (ret);
memset(&tmp_val2, 0, sizeof(tmp_val2));
@@ -1765,7 +1821,7 @@ __ham_overwrite(dbc, nval, flags)
(u_int8_t *)newrec + sizeof(db_indx_t);
tmp_val2.size = newsize;
if (dbp->dup_compare(
- dbp, &tmp_val, &tmp_val2) != 0) {
+ dbp, &tmp_val, &tmp_val2, NULL) != 0) {
__os_free(env, newrec);
return (__db_duperr(dbp, flags));
}
@@ -1816,7 +1872,7 @@ __ham_overwrite(dbc, nval, flags)
sizeof(db_indx_t);
tmp_val2.size = hcp->dup_len;
if (dbp->dup_compare(
- dbp, nval, &tmp_val2) != 0) {
+ dbp, nval, &tmp_val2, NULL) != 0) {
__db_errx(env, DB_STR("1131",
"Existing data sorts differently from put data"));
return (EINVAL);
@@ -1848,16 +1904,84 @@ __ham_overwrite(dbc, nval, flags)
hcp->dup_len = (db_indx_t)nval->size;
}
myval = &tmp_val;
+ goto end;
+ }
+ hk = H_PAIRDATA(dbp, hcp->page, hcp->indx);
+ if (HPAGE_PTYPE(hk) == H_BLOB) {
+ memcpy(&hblob, hk, HBLOB_SIZE);
+ memset(&old_rec, 0, sizeof(DBT));
+ memset(&new_rec, 0, sizeof(DBT));
+ if (DBC_LOGGING(dbc)) {
+ new_rec.data = HKEYDATA_DATA(&hblob);
+ if ((ret = __os_malloc(
+ env, HBLOB_SIZE, &old_rec.data)) != 0)
+ return (ret);
+ memcpy(old_rec.data,
+ HKEYDATA_DATA(&hblob), HBLOB_DSIZE);
+ new_rec.size = old_rec.size = HBLOB_DSIZE;
+ }
+ /*
+ * Inserting a blob record instead of blob data, only
+ * used internally by the DB_STREAM api.
+ */
+ if (F_ISSET(nval, DB_DBT_BLOB_REC)) {
+ DB_ASSERT(env, nval->size == HBLOB_SIZE);
+ DB_ASSERT(env, HPAGE_PTYPE(nval->data) == H_BLOB);
+ memcpy(&hblob, nval->data, nval->size);
+ } else {
+ /*
+ * A blob file overwrite is simpler than other
+ * replace operations. It's simply a matter of
+ * deleting the old blob file, and creating a
+ * new one. We may need to be careful of
+ * cursors when we have support for blob
+ * cursors.
+ * That means that we can skip the replpair
+ * call.
+ */
+ blob_id = (db_seq_t)hblob.id;
+ GET_BLOB_SIZE(env, hblob, blob_size, ret);
+ if (ret != 0)
+ return (ret);
+ if ((ret = __blob_repl(dbc,
+ nval, blob_id, &new_blob_id, &blob_size)) == 0) {
+ SET_BLOB_ID(&hblob, new_blob_id, HBLOB);
+ SET_BLOB_SIZE(&hblob, blob_size, HBLOB);
+ }
+ }
+ if (ret == 0) {
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __ham_replace_log(dbp,
+ dbc->txn, &LSN(hcp->page), 0,
+ PGNO(hcp->page),
+ (u_int32_t)H_DATAINDEX(hcp->indx),
+ &LSN(hcp->page), 0,
+ OP_SET(H_BLOB, hcp->page), &old_rec,
+ OP_SET(H_BLOB, hcp->page),
+ &new_rec)) != 0) {
+ memcpy(HKEYDATA_DATA(&hblob),
+ old_rec.data, HBLOB_DSIZE);
+ __os_free(env, old_rec.data);
+ return (ret);
+ }
+
+ } else
+ LSN_NOT_LOGGED(LSN(hcp->page));
+ }
+ /* Copy the updated blob data back to the page. */
+ memcpy(hk, &hblob, HBLOB_SIZE);
+ if (old_rec.data != NULL)
+ __os_free(env, old_rec.data);
+ return (ret);
} else if (!F_ISSET(nval, DB_DBT_PARTIAL)) {
/* Put/overwrite */
memcpy(&tmp_val, nval, sizeof(*nval));
F_SET(&tmp_val, DB_DBT_PARTIAL);
tmp_val.doff = 0;
- hk = H_PAIRDATA(dbp, hcp->page, hcp->indx);
- if (HPAGE_PTYPE(hk) == H_OFFPAGE)
+ if (HPAGE_PTYPE(hk) == H_OFFPAGE) {
memcpy(&tmp_val.dlen,
HOFFPAGE_TLEN(hk), sizeof(u_int32_t));
- else
+ } else
tmp_val.dlen = LEN_HDATA(dbp, hcp->page,
hcp->hdr->dbmeta.pagesize, hcp->indx);
myval = &tmp_val;
@@ -1865,7 +1989,7 @@ __ham_overwrite(dbc, nval, flags)
/* Regular partial put */
myval = nval;
- return (__ham_replpair(dbc, myval,
+end: return (__ham_replpair(dbc, myval,
F_ISSET(hcp, H_ISDUP) ? H_DUPLICATE : H_KEYDATA));
}
@@ -1955,7 +2079,7 @@ __ham_lookup(dbc, key, sought, mode, pgnop)
return (ret);
}
F_SET(hcp, H_NOMORE);
- return (DB_NOTFOUND);
+ return (DBC_ERR(dbc, DB_NOTFOUND));
}
/*
diff --git a/src/hash/hash.src b/src/hash/hash.src
index e544c6f3..f56a9c5b 100644
--- a/src/hash/hash.src
+++ b/src/hash/hash.src
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/hash/hash_compact.c b/src/hash/hash_compact.c
index 83b5ffb1..79fb6004 100644
--- a/src/hash/hash_compact.c
+++ b/src/hash/hash_compact.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
* $Id$
*/
@@ -118,7 +118,8 @@ __ham_compact_int(dbc, start, stop, factor, c_data, donep, flags)
break;
origpgno = pgno;
if ((ret = __db_truncate_root(dbc, hcp->page,
- H_DATAINDEX(hcp->indx), &pgno, 0)) != 0)
+ H_DATAINDEX(hcp->indx),
+ &pgno, 0, &pgs_done)) != 0)
break;
if (pgno != origpgno) {
memcpy(HOFFDUP_PGNO(H_PAIRDATA(dbp,
@@ -247,7 +248,7 @@ __ham_compact_bucket(dbc, c_data, pgs_donep)
if (check_trunc && PREV_PGNO(pg) != PGNO_INVALID &&
PGNO(pg) > c_data->compact_truncate &&
(ret = __db_exchange_page(dbc, &pg,
- hcp->page, PGNO_INVALID, DB_EXCH_FREE)) != 0)
+ hcp->page, PGNO_INVALID, DB_EXCH_FREE, pgs_donep)) != 0)
break;
if (pgno != PGNO(pg))
(*pgs_donep)++;
@@ -400,8 +401,8 @@ __ham_truncate_overflow(dbc, indx, c_data, pgs_done)
if ((ret = __memp_dirty(dbp->mpf, &hcp->page,
dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
return (ret);
- if ((ret =
- __db_truncate_root(dbc, hcp->page, indx, &pgno, 0)) != 0)
+ if ((ret = __db_truncate_root(dbc,
+ hcp->page, indx, &pgno, 0, pgs_done)) != 0)
return (ret);
if (pgno != origpgno) {
memcpy(HOFFPAGE_PGNO(P_ENTRY(dbp, hcp->page, indx)),
@@ -410,7 +411,8 @@ __ham_truncate_overflow(dbc, indx, c_data, pgs_done)
c_data->compact_pages--;
}
}
- if ((ret = __db_truncate_overflow(dbc, pgno, NULL, c_data)) != 0)
+ if ((ret =
+ __db_truncate_overflow(dbc, pgno, NULL, c_data, pgs_done)) != 0)
return (ret);
return (0);
}
@@ -434,10 +436,11 @@ __ham_compact_hash(dbp, ip, txn, c_data)
HMETA *meta;
PAGE *oldpage;
db_pgno_t free_pgno, last_pgno, pgno, start_pgno;
- int flags, local_txn, ret, t_ret;
+ int flags, local_txn, pgs_done, ret, t_ret;
u_int32_t bucket, i, size;
local_txn = IS_DB_AUTO_COMMIT(dbp, txn);
+ pgs_done = 0;
oldpage = NULL;
dbc = NULL;
LOCK_INIT(lock);
@@ -506,8 +509,8 @@ __ham_compact_hash(dbp, ip, txn, c_data)
flags = 0;
else
flags = DB_EXCH_FREE;
- if ((ret = __db_exchange_page(dbc,
- &oldpage, NULL, free_pgno, flags)) != 0)
+ if ((ret = __db_exchange_page(dbc, &oldpage,
+ NULL, free_pgno, flags, &pgs_done)) != 0)
goto err;
} else if (pgno >= last_pgno) {
if ((ret = __db_free(dbc, oldpage, 0)) != 0)
@@ -526,7 +529,8 @@ __ham_compact_hash(dbp, ip, txn, c_data)
}
if (ret == 0 && F_ISSET(dbp, DB_AM_SUBDB) &&
PGNO(hcp->hdr) > c_data->compact_truncate)
- ret = __db_move_metadata(dbc, (DBMETA**)&hcp->hdr, c_data);
+ ret = __db_move_metadata(dbc, (DBMETA**)&hcp->hdr,
+ c_data, &pgs_done);
err: if (oldpage != NULL && (t_ret = __memp_fput(dbp->mpf,
dbc->thread_info, oldpage, dbc->priority)) != 0 && ret == 0)
diff --git a/src/hash/hash_conv.c b/src/hash/hash_conv.c
index fa084f2a..7a53a037 100644
--- a/src/hash/hash_conv.c
+++ b/src/hash/hash_conv.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -104,7 +104,12 @@ __ham_mswap(env, pg)
SWAP32(p); /* h_charkey */
for (i = 0; i < NCACHED; ++i)
SWAP32(p); /* spares */
- p += 59 * sizeof(u_int32_t); /* unused */
+ SWAP32(p); /* threshold */
+ SWAP32(p); /* file id lo */
+ SWAP32(p); /* file id hi */
+ SWAP32(p); /* sdb id lo */
+ SWAP32(p); /* sdb id hi */
+ p += 54 * sizeof(u_int32_t); /* unused */
SWAP32(p); /* crypto_magic */
return (0);
}
diff --git a/src/hash/hash_dup.c b/src/hash/hash_dup.c
index 879c33d7..523d7227 100644
--- a/src/hash/hash_dup.c
+++ b/src/hash/hash_dup.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright (c) 1990, 1993, 1994
@@ -368,6 +368,7 @@ finish: if (ret == 0) {
off += len + 2 * sizeof(db_indx_t);
}
break;
+ case H_BLOB:
default:
ret = __db_pgfmt(env, hcp->pgno);
break;
@@ -772,7 +773,7 @@ __ham_dsearch(dbc, dbt, offp, cmpp, flags)
DBT cur;
HASH_CURSOR *hcp;
db_indx_t i, len;
- int (*func) __P((DB *, const DBT *, const DBT *));
+ int (*func) __P((DB *, const DBT *, const DBT *, size_t *));
u_int8_t *data;
dbp = dbc->dbp;
@@ -794,7 +795,7 @@ __ham_dsearch(dbc, dbt, offp, cmpp, flags)
* we're done. In the latter case, if permitting partial
* matches, it's not a failure.
*/
- *cmpp = func(dbp, dbt, &cur);
+ *cmpp = func(dbp, dbt, &cur, NULL);
if (*cmpp == 0)
break;
if (*cmpp < 0 && dbp->dup_compare != NULL) {
diff --git a/src/hash/hash_func.c b/src/hash/hash_func.c
index baf6061c..1e83b00a 100644
--- a/src/hash/hash_func.c
+++ b/src/hash/hash_func.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright (c) 1990, 1993
diff --git a/src/hash/hash_meta.c b/src/hash/hash_meta.c
index d9a35cb4..aefdffb8 100644
--- a/src/hash/hash_meta.c
+++ b/src/hash/hash_meta.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/hash/hash_method.c b/src/hash/hash_method.c
index 1da81e70..a05bcea6 100644
--- a/src/hash/hash_method.c
+++ b/src/hash/hash_method.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -20,7 +20,7 @@ static int __ham_set_h_hash
static int __ham_set_h_nelem __P((DB *, u_int32_t));
static int __ham_get_h_compare
- __P((DB *, int (**)(DB *, const DBT *, const DBT *)));
+ __P((DB *, int (**)(DB *, const DBT *, const DBT *, size_t *)));
/*
* __ham_db_create --
@@ -153,7 +153,7 @@ __ham_set_h_hash(dbp, func)
static int
__ham_get_h_compare(dbp, funcp)
DB *dbp;
- int (**funcp) __P((DB *, const DBT *, const DBT *));
+ int (**funcp) __P((DB *, const DBT *, const DBT *, size_t *));
{
HASH *t;
@@ -170,13 +170,13 @@ __ham_get_h_compare(dbp, funcp)
* __ham_set_h_compare --
* Set the comparison function.
*
- * PUBLIC: int __ham_set_h_compare
- * PUBLIC: __P((DB *, int (*)(DB *, const DBT *, const DBT *)));
+ * PUBLIC: int __ham_set_h_compare __P((DB *,
+ * PUBLIC: int (*)(DB *, const DBT *, const DBT *, size_t *)));
*/
int
__ham_set_h_compare(dbp, func)
DB *dbp;
- int (*func) __P((DB *, const DBT *, const DBT *));
+ int (*func) __P((DB *, const DBT *, const DBT *, size_t *));
{
HASH *t;
diff --git a/src/hash/hash_open.c b/src/hash/hash_open.c
index 3d0bb220..0104a57f 100644
--- a/src/hash/hash_open.c
+++ b/src/hash/hash_open.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright (c) 1990, 1993, 1994
@@ -44,6 +44,7 @@
#include "db_config.h"
#include "db_int.h"
+#include "dbinc/blob.h"
#include "dbinc/crypto.h"
#include "dbinc/db_page.h"
#include "dbinc/hash.h"
@@ -149,6 +150,7 @@ __ham_metachk(dbp, name, hashm)
int ret;
env = dbp->env;
+ ret = 0;
/*
* At this point, all we know is that the magic number is for a Hash.
@@ -168,6 +170,7 @@ __ham_metachk(dbp, name, hashm)
case 7:
case 8:
case 9:
+ case 10:
break;
default:
__db_errx(env, DB_STR_A("1126",
@@ -230,6 +233,29 @@ __ham_metachk(dbp, name, hashm)
/* Set the page size. */
dbp->pgsize = hashm->dbmeta.pagesize;
+ dbp->blob_threshold = hashm->blob_threshold;
+ GET_BLOB_FILE_ID(env, hashm, dbp->blob_file_id, ret);
+ if (ret != 0)
+ return (ret);
+ GET_BLOB_SDB_ID(env, hashm, dbp->blob_sdb_id, ret);
+ if (ret != 0)
+ return (ret);
+ /* Blob databases must be upgraded. */
+ if (vers == 9 && (dbp->blob_file_id != 0 || dbp->blob_sdb_id != 0)) {
+ __db_errx(env, DB_STR_A("1208",
+"%s: databases that support blobs must be upgraded.", "%s"),
+ name);
+ return (EINVAL);
+ }
+#ifndef HAVE_64BIT_TYPES
+ if (dbp->blob_file_id != 0 || dbp->blob_sdb_id != 0) {
+ __db_errx(env, DB_STR_A("1202",
+ "%s: blobs require 64 integer compiler support.", "%s"),
+ name);
+ return (EINVAL);
+ }
+#endif
+
/* Copy the file's ID. */
memcpy(dbp->fileid, hashm->dbmeta.uid, DB_FILE_ID_LEN);
@@ -297,6 +323,9 @@ __ham_init_meta(dbp, meta, pgno, lsnp)
meta->nelem = hashp->h_nelem;
meta->h_charkey = hashp->h_hash(dbp, CHARKEY, sizeof(CHARKEY));
memcpy(meta->dbmeta.uid, dbp->fileid, DB_FILE_ID_LEN);
+ meta->blob_threshold = dbp->blob_threshold;
+ SET_BLOB_META_FILE_ID(meta, dbp->blob_file_id, HMETA);
+ SET_BLOB_META_SDB_ID(meta, dbp->blob_sdb_id, HMETA);
if (F_ISSET(dbp, DB_AM_DUP))
F_SET(&meta->dbmeta, DB_HASH_DUP);
@@ -414,6 +443,12 @@ __ham_new_file(dbp, ip, txn, fhp, name)
F_ISSET(dbp, (DB_AM_CHKSUM | DB_AM_ENCRYPT | DB_AM_SWAP));
pdbt.data = &pginfo;
pdbt.size = sizeof(pginfo);
+ if (dbp->blob_threshold) {
+ if ((ret = __blob_generate_dir_ids(
+ dbp, txn, &dbp->blob_file_id)) != 0)
+ return (ret);
+
+ }
if ((ret = __os_calloc(dbp->env, 1, dbp->pgsize, &buf)) != 0)
return (ret);
meta = (HMETA *)buf;
@@ -491,6 +526,13 @@ __ham_new_subdb(mdbp, dbp, ip, txn)
LOCK_INIT(metalock);
LOCK_INIT(mmlock);
+ if (dbp->blob_threshold) {
+ if ((ret = __blob_generate_dir_ids(
+ dbp, txn, &dbp->blob_sdb_id)) != 0)
+ return (ret);
+
+ }
+
if ((ret = __db_cursor(mdbp, ip, txn,
&dbc, CDB_LOCKING(env) ? DB_WRITECURSOR : 0)) != 0)
return (ret);
diff --git a/src/hash/hash_page.c b/src/hash/hash_page.c
index 7576fe61..8e0f897d 100644
--- a/src/hash/hash_page.c
+++ b/src/hash/hash_page.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright (c) 1990, 1993, 1994
@@ -129,7 +129,7 @@ recheck:
/* Fetch next page. */
if (NEXT_PGNO(hcp->page) == PGNO_INVALID) {
F_SET(hcp, H_NOMORE);
- return (DB_NOTFOUND);
+ return (DBC_ERR(dbc, DB_NOTFOUND));
}
next_pgno = NEXT_PGNO(hcp->page);
hcp->indx = 0;
@@ -344,7 +344,7 @@ __ham_item_prev(dbc, mode, pgnop)
if (hcp->pgno == PGNO_INVALID) {
/* Beginning of bucket. */
F_SET(hcp, H_NOMORE);
- return (DB_NOTFOUND);
+ return (DBC_ERR(dbc, DB_NOTFOUND));
} else if ((ret =
__ham_next_cpage(dbc, hcp->pgno)) != 0)
return (ret);
@@ -371,7 +371,7 @@ __ham_item_prev(dbc, mode, pgnop)
if (hcp->indx == 0) {
/* Bucket was empty. */
F_SET(hcp, H_NOMORE);
- return (DB_NOTFOUND);
+ return (DBC_ERR(dbc, DB_NOTFOUND));
}
}
@@ -497,7 +497,8 @@ __ham_insertpair(dbc, p, indxp, key_dbt, data_dbt, key_type, data_type)
inp = P_INP(dbp, p);
ksize = (key_type == H_OFFPAGE) ?
key_dbt->size : HKEYDATA_SIZE(key_dbt->size);
- dsize = (data_type == H_OFFPAGE || data_type == H_OFFDUP) ?
+ dsize = (data_type == H_OFFPAGE ||
+ data_type == H_OFFDUP || data_type == H_BLOB) ?
data_dbt->size : HKEYDATA_SIZE(data_dbt->size);
increase = ksize + dsize;
@@ -579,7 +580,8 @@ __ham_insertpair(dbc, p, indxp, key_dbt, data_dbt, key_type, data_type)
else
PUT_HKEYDATA(P_ENTRY(dbp, p, indx), key_dbt->data,
key_dbt->size, key_type);
- if (data_type == H_OFFPAGE || data_type == H_OFFDUP)
+ if (data_type == H_BLOB ||
+ data_type == H_OFFPAGE || data_type == H_OFFDUP)
memcpy(P_ENTRY(dbp, p, indx+1), data_dbt->data,
data_dbt->size);
else
@@ -618,6 +620,8 @@ __ham_getindex(dbc, p, key, key_type, match, indx)
{
/* Since all entries are key/data pairs. */
DB_ASSERT(dbc->env, NUM_ENT(p)%2 == 0 );
+ /* Blob files can only be stored as data items. */
+ DB_ASSERT(dbc->env, key_type != H_BLOB );
/* Support pre 4.6 unsorted hash pages. */
if (p->type == P_HASH_UNSORTED)
@@ -672,7 +676,7 @@ __ham_getindex_unsorted(dbc, p, key, match, indx)
memcpy(&pgno,
HOFFPAGE_PGNO(hk), sizeof(db_pgno_t));
if ((ret = __db_moff(dbc, key, pgno, tlen,
- t->h_compare, &res)) != 0)
+ t->h_compare, &res, NULL)) != 0)
return (ret);
}
break;
@@ -681,7 +685,7 @@ __ham_getindex_unsorted(dbc, p, key, match, indx)
DB_INIT_DBT(pg_dbt,
HKEYDATA_DATA(hk), key->size);
if (t->h_compare(
- dbp, key, &pg_dbt) != 0)
+ dbp, key, &pg_dbt, NULL) != 0)
break;
} else if (key->size ==
LEN_HKEY(dbp, p, dbp->pgsize, i))
@@ -784,7 +788,7 @@ __ham_getindex_sorted(dbc, p, key, key_type, match, indxp)
(void)__ua_memcpy(&off_pgno,
HOFFPAGE_PGNO(offp), sizeof(db_pgno_t));
if ((ret = __db_moff(dbc, key, off_pgno,
- itemlen, t->h_compare, &res)) != 0)
+ itemlen, t->h_compare, &res, NULL)) != 0)
return (ret);
}
} else {
@@ -799,7 +803,7 @@ __ham_getindex_sorted(dbc, p, key, key_type, match, indxp)
(void)__ua_memcpy(&off_len, HOFFPAGE_TLEN(offp),
sizeof(u_int32_t));
if ((ret = __db_moff(dbc, &tmp_dbt, off_pgno,
- off_len, t->h_compare, &res)) != 0)
+ off_len, t->h_compare, &res, NULL)) != 0)
return (ret);
/*
* Since we switched the key/match parameters
@@ -810,7 +814,7 @@ __ham_getindex_sorted(dbc, p, key, key_type, match, indxp)
} else if (t->h_compare != NULL) {
/* Case 4, with a user comparison func */
DB_INIT_DBT(tmp_dbt, data, itemlen);
- res = t->h_compare(dbp, key, &tmp_dbt);
+ res = t->h_compare(dbp, key, &tmp_dbt, NULL);
} else {
/* Case 4, without a user comparison func */
if ((res = memcmp(key->data, data,
@@ -899,8 +903,8 @@ __ham_verify_sorted_page (dbc, p)
sizeof(u_int32_t));
memcpy(&tpgno, HOFFPAGE_PGNO(H_PAIRKEY(dbp, p, i-2)),
sizeof(db_pgno_t));
- if ((ret = __db_moff(dbc,
- &curr_dbt, tpgno, tlen, t->h_compare, &res)) != 0)
+ if ((ret = __db_moff(dbc, &curr_dbt,
+ tpgno, tlen, t->h_compare, &res, NULL)) != 0)
return (ret);
} else if (HPAGE_TYPE(dbp, p, i) == H_OFFPAGE) {
memset(&prev_dbt, 0, sizeof(prev_dbt));
@@ -910,8 +914,8 @@ __ham_verify_sorted_page (dbc, p)
sizeof(u_int32_t));
memcpy(&tpgno, HOFFPAGE_PGNO(H_PAIRKEY(dbp, p, i)),
sizeof(db_pgno_t));
- if ((ret = __db_moff(dbc,
- &prev_dbt, tpgno, tlen, t->h_compare, &res)) != 0)
+ if ((ret = __db_moff(dbc, &prev_dbt, tpgno, tlen,
+ t->h_compare, &res, NULL)) != 0)
return (ret);
} else
res = memcmp(prev, curr, min(curr_len, prev_len));
@@ -1047,9 +1051,11 @@ __ham_del_pair(dbc, flags, ppg)
DBT data_dbt, key_dbt;
DB_LSN new_lsn, *n_lsn, tmp_lsn;
DB_MPOOLFILE *mpf;
+ HBLOB hblob;
HASH_CURSOR *hcp;
PAGE *n_pagep, *nn_pagep, *p, *p_pagep;
db_ham_mode op;
+ db_seq_t blob_id;
db_indx_t ndx;
db_pgno_t chg_pgno, pgno, tmp_pgno;
u_int32_t data_type, key_type, order;
@@ -1067,6 +1073,8 @@ __ham_del_pair(dbc, flags, ppg)
DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &hcp->page)) != 0)
return (ret);
p = hcp->page;
+ key_type = HPAGE_PTYPE(H_PAIRKEY(dbp, p, ndx));
+ data_type = HPAGE_PTYPE(H_PAIRDATA(dbp, p, ndx));
/*
* We optimize for the normal case which is when neither the key nor
@@ -1075,8 +1083,7 @@ __ham_del_pair(dbc, flags, ppg)
* to remove the big item and then update the page to remove the
* entry referring to the big item.
*/
- if (!LF_ISSET(HAM_DEL_IGNORE_OFFPAGE) &&
- HPAGE_PTYPE(H_PAIRKEY(dbp, p, ndx)) == H_OFFPAGE) {
+ if (!LF_ISSET(HAM_DEL_IGNORE_OFFPAGE) && key_type == H_OFFPAGE) {
memcpy(&pgno, HOFFPAGE_PGNO(P_ENTRY(dbp, p, H_KEYINDEX(ndx))),
sizeof(db_pgno_t));
ret = __db_doff(dbc, pgno);
@@ -1084,7 +1091,13 @@ __ham_del_pair(dbc, flags, ppg)
ret = 0;
if (!LF_ISSET(HAM_DEL_IGNORE_OFFPAGE) && ret == 0)
- switch (HPAGE_PTYPE(H_PAIRDATA(dbp, p, ndx))) {
+ switch (data_type) {
+ case H_BLOB:
+ memcpy(&hblob,
+ P_ENTRY(dbp, p, H_DATAINDEX(ndx)), HBLOB_SIZE);
+ blob_id = (db_seq_t)hblob.id;
+ ret = __blob_del(dbc, blob_id);
+ break;
case H_OFFPAGE:
memcpy(&pgno,
HOFFPAGE_PGNO(P_ENTRY(dbp, p, H_DATAINDEX(ndx))),
@@ -1111,7 +1124,7 @@ __ham_del_pair(dbc, flags, ppg)
/* Now log the delete off this page. */
if (DBC_LOGGING(dbc)) {
hk = H_PAIRKEY(dbp, hcp->page, ndx);
- if ((key_type = HPAGE_PTYPE(hk)) == H_OFFPAGE) {
+ if (key_type == H_OFFPAGE) {
key_dbt.data = hk;
key_dbt.size = HOFFPAGE_SIZE;
} else {
@@ -1120,9 +1133,12 @@ __ham_del_pair(dbc, flags, ppg)
LEN_HKEY(dbp, hcp->page, dbp->pgsize, ndx);
}
hk = H_PAIRDATA(dbp, hcp->page, ndx);
- if ((data_type = HPAGE_PTYPE(hk)) == H_OFFPAGE) {
+ if (data_type == H_OFFPAGE) {
data_dbt.data = hk;
data_dbt.size = HOFFPAGE_SIZE;
+ } else if (data_type == H_BLOB) {
+ data_dbt.data = hk;
+ data_dbt.size = HBLOB_SIZE;
} else if (data_type == H_OFFDUP) {
data_dbt.data = hk;
data_dbt.size = HOFFDUP_SIZE;
@@ -1404,6 +1420,8 @@ __ham_replpair(dbc, dbt, newtype)
* unless it is an append, when we extend the offpage item, and
* update the HOFFPAGE item on the current page to have the new size
* via a delete/add.
+ *
+ * Updating a record won't cause it to become a blob file or vice versa.
*/
dbp = dbc->dbp;
env = dbp->env;
@@ -2464,15 +2482,18 @@ __ham_add_el(dbc, key, val, type)
const DBT *pkey, *pdata;
DB *dbp;
DBT key_dbt, data_dbt;
- DB_LSN new_lsn;
+ DB_LSN blob_lsn, new_lsn;
DB_MPOOLFILE *mpf;
HASH_CURSOR *hcp;
HOFFPAGE doff, koff;
+ HBLOB dblob;
PAGE *new_pagep;
db_pgno_t next_pgno, pgno;
+ off_t file_size;
+ db_seq_t blob_id;
u_int32_t data_size, data_type, key_size, key_type;
u_int32_t pages, pagespace, pairsize;
- int do_expand, is_keybig, is_databig, match, ret;
+ int do_expand, is_keybig, match, ret;
dbp = dbc->dbp;
mpf = dbp->mpf;
@@ -2485,14 +2506,33 @@ __ham_add_el(dbc, key, val, type)
dbc->thread_info, dbc->txn, DB_MPOOL_CREATE, &hcp->page)) != 0)
return (ret);
+ /*
+ * Key is either:
+ * - On page
+ * - On overflow page(s)
+ */
key_size = HKEYDATA_PSIZE(key->size);
- data_size = HKEYDATA_PSIZE(val->size);
is_keybig = ISBIG(hcp, key->size);
- is_databig = ISBIG(hcp, val->size);
if (is_keybig)
key_size = HOFFPAGE_PSIZE;
- if (is_databig)
+ /*
+ * Data is either:
+ * - On page (H_KEYDATA or H_DUPLICATE)
+ * - On overflow page(s)
+ * - In a blob file
+ */
+ data_type =
+ (dbp->blob_threshold && (val->size >= dbp->blob_threshold ||
+ F_ISSET(val, DB_DBT_BLOB))) ?
+ H_BLOB : (ISBIG(hcp, val->size)) ? H_OFFPAGE : H_KEYDATA;
+ if (data_type == H_KEYDATA || data_type == H_DUPLICATE)
+ data_size = HKEYDATA_PSIZE(val->size);
+ else if (data_type == H_OFFPAGE)
data_size = HOFFPAGE_PSIZE;
+ else { /* H_BLOB */
+ DB_ASSERT(dbp->env, data_type == H_BLOB);
+ data_size = HBLOB_PSIZE;
+ }
pairsize = key_size + data_size;
@@ -2536,17 +2576,17 @@ __ham_add_el(dbc, key, val, type)
* run out of file space before updating the key or data.
*/
if (dbc->txn == NULL &&
- dbp->mpf->mfp->maxpgno != 0 && (is_keybig || is_databig)) {
+ dbp->mpf->mfp->maxpgno != 0 &&
+ (is_keybig || data_type == H_OFFPAGE)) {
pagespace = P_MAXSPACE(dbp, dbp->pgsize);
pages = 0;
- if (is_databig)
+ if (data_type == H_OFFPAGE)
pages = ((data_size - 1) / pagespace) + 1;
- if (is_keybig) {
+ if (is_keybig)
pages += ((key->size - 1) / pagespace) + 1;
- if (pages >
- (dbp->mpf->mfp->maxpgno - dbp->mpf->mfp->last_pgno))
- return (__db_space_err(dbp));
- }
+ if (pages >
+ (dbp->mpf->mfp->maxpgno - dbp->mpf->mfp->last_pgno))
+ return (__db_space_err(dbp));
}
if ((ret = __memp_dirty(mpf,
@@ -2575,7 +2615,7 @@ __ham_add_el(dbc, key, val, type)
key_type = H_KEYDATA;
}
- if (is_databig) {
+ if (data_type == H_OFFPAGE) {
doff.type = H_OFFPAGE;
UMRW_SET(doff.unused[0]);
UMRW_SET(doff.unused[1]);
@@ -2587,6 +2627,22 @@ __ham_add_el(dbc, key, val, type)
data_dbt.size = sizeof(doff);
pdata = &data_dbt;
data_type = H_OFFPAGE;
+ } else if (data_type == H_BLOB) {
+ memset(&dblob, 0, HBLOB_SIZE);
+ dblob.type = H_BLOB;
+ blob_id = 0;
+ file_size = 0;
+ if ((ret = __blob_put(
+ dbc, (DBT *)val, &blob_id, &file_size, &blob_lsn)) != 0)
+ return (ret);
+ SET_BLOB_ID(&dblob, blob_id, HBLOB);
+ SET_BLOB_SIZE(&dblob, file_size, HBLOB);
+ SET_BLOB_FILE_ID(&dblob, dbp->blob_file_id, HBLOB);
+ SET_BLOB_SDB_ID(&dblob, dbp->blob_sdb_id, HBLOB);
+ data_dbt.data = &dblob;
+ data_dbt.size = sizeof(dblob);
+ pdata = &data_dbt;
+ data_type = H_BLOB;
} else {
pdata = val;
data_type = type;
@@ -2673,7 +2729,7 @@ __ham_add_el(dbc, key, val, type)
/*
* Special insert pair call -- copies a key/data pair from one page to
* another. Works for all types of hash entries (H_OFFPAGE, H_KEYDATA,
- * H_DUPLICATE, H_OFFDUP). Since we log splits at a high level, we
+ * H_DUPLICATE, H_OFFDUP, H_BLOB). Since we log splits at a high level, we
* do not need to log them here.
*
* dest_indx is an optional parameter, it serves several purposes:
@@ -2715,7 +2771,7 @@ __ham_copypair(dbc, src_page, src_ndx, dest_page, dest_indx, log)
tkey.data = HKEYDATA_DATA(P_ENTRY(dbp, src_page, kindx));
tkey.size = LEN_HKEYDATA(dbp, src_page, dbp->pgsize, kindx);
}
- if (dtype == H_OFFPAGE || dtype == H_OFFDUP) {
+ if (dtype == H_OFFPAGE || dtype == H_OFFDUP || dtype == H_BLOB) {
tdata.data = P_ENTRY(dbp, src_page, dindx);
tdata.size = LEN_HITEM(dbp, src_page, dbp->pgsize, dindx);
} else {
diff --git a/src/hash/hash_rec.c b/src/hash/hash_rec.c
index 58965569..8a39d880 100644
--- a/src/hash/hash_rec.c
+++ b/src/hash/hash_rec.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright (c) 1995, 1996
@@ -232,6 +232,7 @@ __ham_insdel_42_recover(env, dbtp, lsnp, op, info)
REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
ktype = DB_UNDO(op) || PAIR_ISKEYBIG(argp->opcode) ?
H_OFFPAGE : H_KEYDATA;
+ /* TODO: May need a PAIR_ISDATABLOB here. */
if (PAIR_ISDATADUP(argp->opcode))
dtype = H_DUPLICATE;
else if (DB_UNDO(op) || PAIR_ISDATABIG(argp->opcode))
@@ -957,9 +958,8 @@ __ham_metagroup_recover(env, dbtp, lsnp, op, info)
if (IS_ZERO_LSN(LSN(pagep))) {
REC_DIRTY(mpf, ip, dbc->priority, &pagep);
- P_INIT(pagep, file_dbp->pgsize,
- PGNO_INVALID, PGNO_INVALID, PGNO_INVALID,
- 0, P_HASH);
+ P_INIT(pagep, file_dbp->pgsize, pgno,
+ PGNO_INVALID, PGNO_INVALID, 0, P_HASH);
}
if ((ret =
__memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
diff --git a/src/hash/hash_reclaim.c b/src/hash/hash_reclaim.c
index ce3f6d9e..55980444 100644
--- a/src/hash/hash_reclaim.c
+++ b/src/hash/hash_reclaim.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/hash/hash_stat.c b/src/hash/hash_stat.c
index 683ce5a6..7ccf472d 100644
--- a/src/hash/hash_stat.c
+++ b/src/hash/hash_stat.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -188,15 +188,19 @@ __ham_stat_print(dbc, flags)
sp->hash_bfree, sp->hash_buckets, sp->hash_pagesize), "ff");
__db_dl(env,
- "Number of overflow pages", (u_long)sp->hash_bigpages);
- __db_dl_pct(env, "Number of bytes free in overflow pages",
+ "Number of blobs", (u_long)sp->hash_nblobs);
+ __db_dl(env,
+ "Number of hash overflow (big item) pages",
+ (u_long)sp->hash_bigpages);
+ __db_dl_pct(env,
+ "Number of bytes free in hash overflow (big item) pages",
(u_long)sp->hash_big_bfree, DB_PCT_PG(
sp->hash_big_bfree, sp->hash_bigpages, sp->hash_pagesize), "ff");
__db_dl(env,
"Number of bucket overflow pages", (u_long)sp->hash_overflows);
__db_dl_pct(env,
- "Number of bytes free in bucket overflow pages",
+ "Number of bytes free on bucket overflow pages",
(u_long)sp->hash_ovfl_free, DB_PCT_PG(
sp->hash_ovfl_free, sp->hash_overflows, sp->hash_pagesize), "ff");
@@ -258,6 +262,9 @@ __ham_stat_callback(dbc, pagep, cookie, putp)
switch (*H_PAIRDATA(dbp, pagep, indx)) {
case H_OFFDUP:
break;
+ case H_BLOB:
+ sp->hash_nblobs++;
+ /* fall through */
case H_OFFPAGE:
case H_KEYDATA:
sp->hash_ndata++;
@@ -480,6 +487,7 @@ __ham_traverse(dbc, mode, callback, cookie, look_past_max)
opgno, callback, cookie)) != 0)
goto err;
break;
+ case H_BLOB:
case H_KEYDATA:
case H_DUPLICATE:
break;
diff --git a/src/hash/hash_stub.c b/src/hash/hash_stub.c
index 57337ea9..89307670 100644
--- a/src/hash/hash_stub.c
+++ b/src/hash/hash_stub.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -127,6 +127,40 @@ __ham_46_hashmeta(dbp, real_name, flags, fhp, h, dirtyp)
}
+/*
+ * __ham_60_hashmeta --
+ * Stub counterpart (hash_stub.c) of the routine of the same name in
+ * hash_upgrade.c. Every argument except dbp is only quieted with
+ * COMPQUIET; nothing is read from or written to the page. The return
+ * value is whatever __db_no_hash_am() yields for dbp->env (presumably
+ * a "hash access method not available" error -- confirm against its
+ * definition).
+ */
int
+__ham_60_hashmeta(dbp, real_name, flags, fhp, h, dirtyp)
+ DB *dbp;
+ char *real_name;
+ u_int32_t flags;
+ DB_FH *fhp;
+ PAGE *h;
+ int *dirtyp;
+{
+ COMPQUIET(real_name, NULL);
+ COMPQUIET(flags, 0);
+ COMPQUIET(fhp, NULL);
+ COMPQUIET(h, NULL);
+ COMPQUIET(dirtyp, NULL);
+ return (__db_no_hash_am(dbp->env));
+}
+
+/*
+ * __ham_60_hash --
+ * Stub counterpart (hash_stub.c) of the routine of the same name in
+ * hash_upgrade.c. Every argument except dbp is only quieted with
+ * COMPQUIET; no page is examined or modified. The return value is
+ * whatever __db_no_hash_am() yields for dbp->env (presumably a
+ * "hash access method not available" error -- confirm against its
+ * definition).
+ */
+int
+__ham_60_hash(dbp, real_name, flags, fhp, h, dirtyp)
+ DB *dbp;
+ char *real_name;
+ u_int32_t flags;
+ DB_FH *fhp;
+ PAGE *h;
+ int *dirtyp;
+{
+ COMPQUIET(real_name, NULL);
+ COMPQUIET(flags, 0);
+ COMPQUIET(fhp, NULL);
+ COMPQUIET(h, NULL);
+ COMPQUIET(dirtyp, NULL);
+ return (__db_no_hash_am(dbp->env));
+}
+
+int
__hamc_cmp(dbc, other_dbc, result)
DBC *dbc, *other_dbc;
int *result;
diff --git a/src/hash/hash_upgrade.c b/src/hash/hash_upgrade.c
index f66a7a58..17014a5c 100644
--- a/src/hash/hash_upgrade.c
+++ b/src/hash/hash_upgrade.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -9,6 +9,7 @@
#include "db_config.h"
#include "db_int.h"
+#include "dbinc/blob.h"
#include "dbinc/db_page.h"
#include "dbinc/hash.h"
#include "dbinc/db_upgrade.h"
@@ -321,3 +322,93 @@ __ham_46_hash(dbp, real_name, flags, fhp, h, dirtyp)
return (ret);
}
+
+/*
+ * __ham_60_hashmeta --
+ * Upgrade the version number: treat page h as a hash metadata page,
+ * set its dbmeta.version to 10 and report the page as modified through
+ * *dirtyp. The dbp, real_name, flags and fhp arguments are unused.
+ * Always returns 0.
+ *
+ * PUBLIC: int __ham_60_hashmeta
+ * PUBLIC: __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
+ */
+int
+__ham_60_hashmeta(dbp, real_name, flags, fhp, h, dirtyp)
+ DB *dbp;
+ char *real_name;
+ u_int32_t flags;
+ DB_FH *fhp;
+ PAGE *h;
+ int *dirtyp;
+{
+ HMETA33 *hmeta;
+
+ COMPQUIET(flags, 0);
+ COMPQUIET(real_name, NULL);
+ COMPQUIET(fhp, NULL);
+ COMPQUIET(dbp, NULL);
+ hmeta = (HMETA33 *)h;
+
+ /* Only the version field changes; the page is always marked dirty. */
+ hmeta->dbmeta.version = 10;
+ *dirtyp = 1;
+
+ return (0);
+}
+
+/*
+ * __ham_60_hash --
+ * Upgrade the blob records on the database hash leaf pages.
+ *
+ * Visits every second entry of page h (the item H_PAIRDATA yields for
+ * each pair) and, for each one typed H_BLOB, rewrites it in place from
+ * the HBLOB60 layout to the HBLOB60P1 layout. *dirtyp is set to 1 only
+ * when at least one record was rewritten. Returns 0 on success, or the
+ * first non-zero status produced by a GET_BLOB60_* macro.
+ *
+ * NOTE(review): an early error return leaves any records already
+ * rewritten on this page in the new layout, with *dirtyp possibly set.
+ *
+ * PUBLIC: int __ham_60_hash
+ * PUBLIC: __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
+ */
+int
+__ham_60_hash(dbp, real_name, flags, fhp, h, dirtyp)
+ DB *dbp;
+ char *real_name;
+ u_int32_t flags;
+ DB_FH *fhp;
+ PAGE *h;
+ int *dirtyp;
+{
+ HBLOB60 hb60;
+ HBLOB60P1 hb60p1;
+ HKEYDATA *hk;
+ db_seq_t blob_id, blob_size, file_id, sdb_id;
+ db_indx_t indx;
+ int ret;
+
+ COMPQUIET(flags, 0);
+ COMPQUIET(real_name, NULL);
+ COMPQUIET(fhp, NULL);
+ ret = 0;
+
+ /* The in-place overwrite below relies on both layouts being one size. */
+ DB_ASSERT(dbp->env, HBLOB60_SIZE == HBLOB_SIZE);
+ /* Entries are stored as pairs, so step two slots at a time. */
+ for (indx = 0; indx < NUM_ENT(h); indx += 2) {
+ hk = (HKEYDATA *)H_PAIRDATA(dbp, h, indx);
+ if (HPAGE_PTYPE(hk) == H_BLOB) {
+ /* Work on local copies rather than on the page itself. */
+ memcpy(&hb60, hk, HBLOB60_SIZE);
+ memset(&hb60p1, 0, HBLOB_SIZE);
+ hb60p1.type = hb60.type;
+ hb60p1.encoding = hb60.encoding;
+ /* Decode the four old-layout values, stopping on any error. */
+ GET_BLOB60_ID(dbp->env, hb60, blob_id, ret);
+ if (ret != 0)
+ return (ret);
+ GET_BLOB60_SIZE(dbp->env, hb60, blob_size, ret);
+ if (ret != 0)
+ return (ret);
+ GET_BLOB60_FILE_ID(dbp->env, &hb60, file_id, ret);
+ if (ret != 0)
+ return (ret);
+ GET_BLOB60_SDB_ID(dbp->env, &hb60, sdb_id, ret);
+ if (ret != 0)
+ return (ret);
+ /* Re-encode them in the new layout and write it back. */
+ SET_BLOB_ID(&hb60p1, blob_id, HBLOB60P1);
+ SET_BLOB_SIZE(&hb60p1, blob_size, HBLOB60P1);
+ SET_BLOB_FILE_ID(&hb60p1, file_id, HBLOB60P1);
+ SET_BLOB_SDB_ID(&hb60p1, sdb_id, HBLOB60P1);
+ memcpy(hk, &hb60p1, HBLOB_SIZE);
+ *dirtyp = 1;
+ }
+ }
+
+ return (ret);
+}
diff --git a/src/hash/hash_verify.c b/src/hash/hash_verify.c
index 662e7ac8..302d42d8 100644
--- a/src/hash/hash_verify.c
+++ b/src/hash/hash_verify.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -9,6 +9,7 @@
#include "db_config.h"
#include "db_int.h"
+#include "dbinc/blob.h"
#include "dbinc/db_page.h"
#include "dbinc/db_verify.h"
#include "dbinc/btree.h"
@@ -47,6 +48,7 @@ __ham_vrfy_meta(dbp, vdp, m, pgno, flags)
int i, ret, t_ret, isbad;
u_int32_t pwr, mbucket;
u_int32_t (*hfunc) __P((DB *, const void *, u_int32_t));
+ db_seq_t blob_id;
env = dbp->env;
isbad = 0;
@@ -164,6 +166,55 @@ __ham_vrfy_meta(dbp, vdp, m, pgno, flags)
}
}
+/*
+ * Check the blob file id and blob subdatabase id stored on the meta page.
+ * Where 64-bit integer support is not available,
+ * return an error if the file has any blobs.
+ * In every failure case the page is flagged bad and the first error code
+ * seen is kept in ret.
+ */
+ t_ret = 0;
+#ifdef HAVE_64BIT_TYPES
+ GET_BLOB_FILE_ID(env, m, blob_id, t_ret);
+ if (t_ret != 0) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1178",
+ "Page %lu: blob file id overflow.", "%lu"), (u_long)pgno));
+ if (ret == 0)
+ ret = t_ret;
+ }
+ t_ret = 0;
+ GET_BLOB_SDB_ID(env, m, blob_id, t_ret);
+ if (t_ret != 0) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1179",
+ "Page %lu: blob subdatabase id overflow.",
+ "%lu"), (u_long)pgno));
+ if (ret == 0)
+ ret = t_ret;
+ }
+#else /* HAVE_64BIT_TYPES */
+ /*
+ * db_seq_t is an int on systems that do not have 64-bit integer types,
+ * so this will compile and run. Any non-zero id means the file uses
+ * blobs, which this build cannot handle.
+ */
+ GET_BLOB_FILE_ID(env, m, blob_id, t_ret);
+ if (t_ret != 0 || blob_id != 0) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1203",
+ "Page %lu: blobs require 64 integer compiler support.",
+ "%lu"), (u_long)pgno));
+ if (ret == 0)
+ ret = t_ret;
+ }
+ GET_BLOB_SDB_ID(env, m, blob_id, t_ret);
+ if (t_ret != 0 || blob_id != 0) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1204",
+ "Page %lu: blobs require 64 integer compiler support.",
+ "%lu"), (u_long)pgno));
+ /* Assignment, not comparison: keep the first error code. */
+ if (ret == 0)
+ ret = t_ret;
+ }
+#endif
+
err: if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
ret = t_ret;
if (LF_ISSET(DB_SALVAGE) &&
@@ -272,12 +323,15 @@ __ham_vrfy_item(dbp, vdp, pgno, h, i, flags)
PAGE *h;
u_int32_t i, flags;
{
+ HBLOB hblob;
HOFFDUP hod;
HOFFPAGE hop;
VRFY_CHILDINFO child;
VRFY_PAGEINFO *pip;
db_indx_t offset, len, dlen, elen;
int ret, t_ret;
+ off_t blob_size;
+ db_seq_t blob_id, file_id, sdb_id;
u_int8_t *databuf;
if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
@@ -287,6 +341,38 @@ __ham_vrfy_item(dbp, vdp, pgno, h, i, flags)
case H_KEYDATA:
/* Nothing to do here--everything but the type field is data */
break;
+ case H_BLOB:
+ /*
+ * Blob item. Check that the blob file exists and is the same
+ * file size as is stored in the database record.
+ */
+ memcpy(&hblob, P_ENTRY(dbp, h, i), HBLOB_SIZE);
+ blob_id = (db_seq_t)hblob.id;
+ GET_BLOB_SIZE(dbp->env, hblob, blob_size, ret);
+ if (ret != 0 || blob_size < 0) {
+ EPRINT((dbp->env, DB_STR_A("1181",
+ "Page %lu: blob file size value has overflowed",
+ "%lu"), (u_long)pip->pgno));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+ file_id = (db_seq_t)hblob.file_id;
+ sdb_id = (db_seq_t)hblob.sdb_id;
+ if (file_id == 0 && sdb_id == 0) {
+ EPRINT((dbp->env, DB_STR_A("1184",
+ "Page %lu: invalid blob dir ids %llu %llu at item %lu",
+ "%lu %llu %llu %lu"),
+ (u_long)pip->pgno, (unsigned long long)file_id,
+ (unsigned long long)sdb_id, (u_long)i));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+ if ((ret = __blob_vrfy(dbp->env, blob_id,
+ blob_size, file_id, sdb_id, pip->pgno, flags)) != 0) {
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+ break;
case H_DUPLICATE:
/* Are we a datum or a key? Better be the former. */
if (i % 2 == 0) {
@@ -822,15 +908,23 @@ __ham_salvage(dbp, vdp, pgno, h, handle, callback, flags)
u_int32_t flags;
{
DBT dbt, key_dbt, unkdbt;
+ ENV *env;
+ HBLOB hblob;
+ char *prefix;
db_pgno_t dpgno;
int ret, err_ret, t_ret;
- u_int32_t himark, i, ovfl_bufsz;
- u_int8_t *hk, *p;
+ off_t blob_size, blob_offset, remaining;
+ u_int32_t blob_buf_size, himark, i, ovfl_bufsz;
+ u_int8_t *blob_buf, *hk, *p;
+ db_seq_t blob_id, file_id, sdb_id;
void *buf, *key_buf;
db_indx_t dlen, len, tlen;
memset(&dbt, 0, sizeof(DBT));
dbt.flags = DB_DBT_REALLOC;
+ blob_buf = NULL;
+ blob_buf_size = 0;
+ env = dbp->env;
DB_INIT_DBT(unkdbt, "UNKNOWN", sizeof("UNKNOWN") - 1);
@@ -840,9 +934,9 @@ __ham_salvage(dbp, vdp, pgno, h, handle, callback, flags)
* Allocate a buffer for overflow items. Start at one page;
* __db_safe_goff will realloc as needed.
*/
- if ((ret = __os_malloc(dbp->env, dbp->pgsize, &buf)) != 0)
+ if ((ret = __os_malloc(env, dbp->pgsize, &buf)) != 0)
return (ret);
- ovfl_bufsz = dbp->pgsize;
+ ovfl_bufsz = dbp->pgsize;
himark = dbp->pgsize;
for (i = 0;; i++) {
@@ -886,6 +980,70 @@ keydata: memcpy(buf, HKEYDATA_DATA(hk), len);
0, " ", handle, callback, 0, 0, vdp)) != 0)
err_ret = ret;
break;
+ case H_BLOB:
+ /*
+ * Blob item: the page holds only a descriptor, so copy
+ * it out, decode the ids and size, then print the blob
+ * contents read through __blob_salvage.
+ */
+ memcpy(&hblob, hk, HBLOB_SIZE);
+ blob_id = (db_seq_t)hblob.id;
+ GET_BLOB_SIZE(env, hblob, blob_size, ret);
+ if (ret != 0 || blob_size < 0) {
+ err_ret = DB_VERIFY_BAD;
+ continue;
+ }
+ file_id = (db_seq_t)hblob.file_id;
+ sdb_id = (db_seq_t)hblob.sdb_id;
+ /*
+ * Read the blob, in pieces if too large. The scratch
+ * buffer is reused across items and capped at MEGABYTE.
+ * blob_buf_size is updated only after a successful
+ * realloc so it never overstates the real allocation.
+ */
+ blob_offset = 0;
+ if (blob_size > MEGABYTE) {
+ if (blob_buf_size < MEGABYTE) {
+ if ((ret = __os_realloc(
+ env, MEGABYTE,
+ &blob_buf)) != 0) {
+ err_ret = ret;
+ continue;
+ }
+ blob_buf_size = MEGABYTE;
+ }
+ } else if (blob_buf_size < blob_size) {
+ if ((ret = __os_realloc(env,
+ (size_t)blob_size, &blob_buf)) != 0) {
+ err_ret = ret;
+ continue;
+ }
+ blob_buf_size = (u_int32_t)blob_size;
+ }
+ dbt.data = blob_buf;
+ dbt.ulen = blob_buf_size;
+ remaining = blob_size;
+ prefix = " ";
+ do {
+ /* Fetch at most one buffer's worth per pass. */
+ if ((ret = __blob_salvage(env, blob_id,
+ blob_offset,
+ (remaining < blob_buf_size ?
+ (size_t)remaining : blob_buf_size),
+ file_id, sdb_id, &dbt)) != 0) {
+ err_ret = DB_VERIFY_BAD;
+ break;
+ }
+ /*
+ * SALVAGE_STREAM_BLOB stays set while more
+ * pieces of this blob are still to come.
+ */
+ if (remaining > blob_buf_size)
+ F_SET(
+ vdp, SALVAGE_STREAM_BLOB);
+ else
+ F_CLR(
+ vdp, SALVAGE_STREAM_BLOB);
+ if ((ret = __db_vrfy_prdbt(
+ &dbt, 0, prefix, handle,
+ callback, 0, 0, vdp)) != 0) {
+ err_ret = ret;
+ break;
+ }
+ /* Only the first piece carries the prefix. */
+ prefix = NULL;
+ blob_offset += dbt.size;
+ if (remaining < blob_buf_size)
+ remaining = 0;
+ else
+ remaining -= blob_buf_size;
+ } while (remaining > 0);
+ F_CLR(vdp, SALVAGE_STREAM_BLOB);
+ break;
case H_OFFPAGE:
if (len < HOFFPAGE_SIZE) {
err_ret = DB_VERIFY_BAD;
@@ -960,7 +1118,7 @@ keydata: memcpy(buf, HKEYDATA_DATA(hk), len);
*/
memset(&key_dbt, 0, sizeof(key_dbt));
if ((ret = __os_malloc(
- dbp->env, dbt.size, &key_buf)) != 0)
+ env, dbt.size, &key_buf)) != 0)
return (ret);
memcpy(key_buf, buf, dbt.size);
key_dbt.data = key_buf;
@@ -1002,7 +1160,7 @@ keydata: memcpy(buf, HKEYDATA_DATA(hk), len);
handle, callback, 0, 0, vdp)) != 0)
err_ret = ret;
}
- __os_free(dbp->env, key_buf);
+ __os_free(env, key_buf);
break;
default:
if (!LF_ISSET(DB_AGGRESSIVE))
@@ -1013,7 +1171,9 @@ keydata: memcpy(buf, HKEYDATA_DATA(hk), len);
}
}
- __os_free(dbp->env, buf);
+ if (blob_buf != NULL)
+ __os_free(env, blob_buf);
+ __os_free(env, buf);
if ((t_ret = __db_salvage_markdone(vdp, pgno)) != 0)
return (t_ret);
return ((ret == 0 && err_ret != 0) ? err_ret : ret);
@@ -1129,7 +1289,7 @@ __ham_dups_unsorted(dbp, buf, len)
{
DBT a, b;
db_indx_t offset, dlen;
- int (*func) __P((DB *, const DBT *, const DBT *));
+ int (*func) __P((DB *, const DBT *, const DBT *, size_t *));
memset(&a, 0, sizeof(DBT));
memset(&b, 0, sizeof(DBT));
@@ -1146,7 +1306,7 @@ __ham_dups_unsorted(dbp, buf, len)
b.data = buf + offset + sizeof(db_indx_t);
b.size = dlen;
- if (a.data != NULL && func(dbp, &a, &b) > 0)
+ if (a.data != NULL && func(dbp, &a, &b, NULL) > 0)
return (1);
a.data = b.data;
diff --git a/src/heap/heap.c b/src/heap/heap.c
index ab404658..7aec416b 100644
--- a/src/heap/heap.c
+++ b/src/heap/heap.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2010, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -24,6 +24,8 @@ static int __heapc_get __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
static int __heapc_put __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
static int __heapc_reloc __P((DBC *, DBT *, DBT *));
static int __heapc_reloc_partial __P((DBC *, DBT *, DBT *));
+static void __heapc_search __P((DBC *, HEAPPG *, db_indx_t,
+ int, db_indx_t *, int *));
static int __heapc_split __P((DBC *, DBT *, DBT *, int));
/*
@@ -134,12 +136,15 @@ __heap_bulk(dbc, data, flags)
DB_HEAP_RID prev_rid, rid;
DBT sdata;
HEAP_CURSOR *cp;
+ HEAPBLOBHDR bhdr;
HEAPHDR *hdr;
HEAPSPLITHDR *shdr;
PAGE *pg;
db_lockmode_t lock_type;
int is_key, ret;
int32_t *offp;
+ off_t blob_size;
+ db_seq_t blob_id;
u_int32_t data_size, key_size, needed, space;
u_int8_t *dbuf, *np;
@@ -183,6 +188,7 @@ __heap_bulk(dbc, data, flags)
next_pg:
rid.indx = cp->indx;
rid.pgno = cp->pgno;
+ prev_rid = rid;
pg = cp->page;
/*
@@ -213,6 +219,14 @@ next_pg:
if (F_ISSET(hdr, HEAP_RECSPLIT)) {
shdr = (HEAPSPLITHDR *)hdr;
data_size = DB_ALIGN(shdr->tsize, sizeof(u_int32_t));
+ } else if (F_ISSET(hdr, HEAP_RECBLOB)) {
+ memcpy(&bhdr, hdr, HEAPBLOBREC_SIZE);
+ GET_BLOB_SIZE(dbp->env, bhdr, blob_size, ret);
+ if (ret != 0)
+ return (ret);
+ if (blob_size > UINT32_MAX)
+ return (DB_BUFFER_SMALL);
+ data_size = (u_int32_t)blob_size;
} else
data_size = DB_ALIGN(hdr->size, sizeof(u_int32_t));
needed += 2 * sizeof(*offp) + data_size;
@@ -250,13 +264,21 @@ next_pg:
if ((ret = __heapc_gsplit(
dbc, &sdata, NULL, NULL)) != 0)
return (ret);
- } else {
+ } else if (F_ISSET(hdr, HEAP_RECBLOB)) {
+ memcpy(&bhdr, hdr, HEAPBLOBREC_SIZE);
+ blob_id = (db_seq_t)bhdr.id;
+ if ((ret = __blob_bulk(
+ dbc, data_size, blob_id, np)) != 0)
+ return (ret);
+ }else {
memcpy(np,
(u_int8_t *)hdr + sizeof(HEAPHDR), hdr->size);
}
*offp-- = (int32_t)(np - dbuf);
if (F_ISSET(hdr, HEAP_RECSPLIT))
*offp-- = (int32_t)shdr->tsize;
+ else if (F_ISSET(hdr, HEAP_RECBLOB))
+ *offp-- = (int32_t)data_size;
else
*offp-- = (int32_t)hdr->size;
np += data_size;
@@ -296,7 +318,6 @@ __heapc_close(dbc, root_pgno, rmroot)
db_pgno_t root_pgno;
int *rmroot;
{
- DB_MPOOLFILE *mpf;
HEAP_CURSOR *cp;
int ret;
@@ -304,7 +325,6 @@ __heapc_close(dbc, root_pgno, rmroot)
COMPQUIET(rmroot, 0);
cp = (HEAP_CURSOR *)dbc->internal;
- mpf = dbc->dbp->mpf;
ret = 0;
/* Release the page/lock held by the cursor. */
@@ -325,11 +345,14 @@ __heapc_del(dbc, flags)
DB_MPOOLFILE *mpf;
DBT hdr_dbt, log_dbt;
HEAP *h;
+ HEAPBLOBHDR bhdr;
HEAPHDR *hdr;
HEAPPG *rpage;
HEAP_CURSOR *cp;
db_pgno_t region_pgno;
- int oldspacebits, ret, spacebits, t_ret;
+ int ret, t_ret;
+ db_seq_t blob_id;
+ u_int32_t oldspacebits, spacebits;
u_int16_t data_size, size;
dbp = dbc->dbp;
@@ -337,6 +360,7 @@ __heapc_del(dbc, flags)
h = dbp->heap_internal;
cp = (HEAP_CURSOR *)dbc->internal;
rpage = NULL;
+ ret = 0;
COMPQUIET(flags, 0);
/*
@@ -377,6 +401,14 @@ start: if (STD_LOCKING(dbc) && (ret = __db_lget(dbc,
next_rid.indx = 0;
}
+ /* Delete the blob file. */
+ if (F_ISSET(hdr, HEAP_RECBLOB)) {
+ memcpy(&bhdr, hdr, HEAPBLOBREC_SIZE);
+ blob_id = (db_seq_t)bhdr.id;
+ if ((ret = __blob_del(dbc, blob_id)) != 0)
+ return (ret);
+ }
+
/* Log the deletion. */
if (DBC_LOGGING(dbc)) {
hdr_dbt.data = hdr;
@@ -384,8 +416,9 @@ start: if (STD_LOCKING(dbc) && (ret = __db_lget(dbc,
log_dbt.data = (u_int8_t *)hdr + hdr_dbt.size;
log_dbt.size = data_size;
if ((ret = __heap_addrem_log(dbp, dbc->txn, &LSN(cp->page),
- 0, DB_REM_HEAP, cp->pgno, (u_int32_t)cp->indx,
- size, &hdr_dbt, &log_dbt, &LSN(cp->page))) != 0)
+ 0, OP_SET(DB_REM_HEAP, cp->page),
+ cp->pgno, (u_int32_t)cp->indx, size,
+ &hdr_dbt, &log_dbt, &LSN(cp->page))) != 0)
goto err;
} else
LSN_NOT_LOGGED(LSN(cp->page));
@@ -414,7 +447,7 @@ start: if (STD_LOCKING(dbc) && (ret = __db_lget(dbc,
dbc->thread_info, NULL, DB_MPOOL_DIRTY, &rpage)) != 0)
goto err;
HEAP_SETSPACE(dbp, rpage,
- cp->pgno - region_pgno - 1, spacebits);
+ (cp->pgno - region_pgno) - 1, spacebits);
}
err: DB_ASSERT(dbp->env, ret != DB_PAGE_NOTFOUND);
@@ -443,7 +476,8 @@ err: DB_ASSERT(dbp->env, ret != DB_PAGE_NOTFOUND);
/*
* __heap_ditem --
- * Remove an item from a page.
+ * Remove an item from a page. Note when deleting blob records that the file
+ * has to be deleted separately from calling this function.
*
* PUBLIC: int __heap_ditem
* PUBLIC: __P((DBC *, PAGE *, u_int32_t, u_int32_t));
@@ -537,19 +571,21 @@ __heapc_get(dbc, key, data, flags, pgnop)
DB_MPOOLFILE *mpf;
DB_LOCK meta_lock;
DBT tmp_val;
- HEAP *h;
+ HEAPBLOBHDR bhdr;
HEAPHDR *hdr;
HEAPMETA *meta;
HEAPPG *dpage;
HEAP_CURSOR *cp;
db_lockmode_t lock_type;
db_pgno_t pgno;
- int cmp, f_indx, found, getpage, indx, ret;
+ int cmp, np_inc, f_indx, found, getpage, indx, ret;
+ off_t blob_size;
+ db_seq_t blob_id;
dbp = dbc->dbp;
mpf = dbp->mpf;
- h = dbp->heap_internal;
cp = (HEAP_CURSOR *)dbc->internal;
+ pgno = PGNO_INVALID;
LOCK_INIT(meta_lock);
COMPQUIET(pgnop, NULL);
@@ -564,7 +600,7 @@ __heapc_get(dbc, key, data, flags, pgnop)
else
lock_type = DB_LOCK_READ;
- ret = 0;
+ np_inc = ret = 0;
found = getpage = FALSE;
meta = NULL;
dpage = NULL;
@@ -579,7 +615,7 @@ __heapc_get(dbc, key, data, flags, pgnop)
ACQUIRE_CUR(dbc, lock_type, cp->pgno, 0, 0, ret);
if (ret != 0) {
if (ret == DB_PAGE_NOTFOUND)
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
}
@@ -591,7 +627,7 @@ __heapc_get(dbc, key, data, flags, pgnop)
hdr = (HEAPHDR *)P_ENTRY(dbp, dpage, cp->indx);
if (F_ISSET(hdr, HEAP_RECSPLIT) &&
!F_ISSET(hdr, HEAP_RECFIRST)) {
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
}
@@ -610,7 +646,7 @@ first: pgno = FIRST_HEAP_DPAGE;
ACQUIRE_CUR(dbc, lock_type, pgno, 0, 0, ret);
if (ret != 0 ) {
if (ret == DB_PAGE_NOTFOUND)
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
}
dpage = (HEAPPG *)cp->page;
@@ -620,25 +656,10 @@ first: pgno = FIRST_HEAP_DPAGE;
* finding first non-split record or first piece of a
* split record, then set up cursor.
*/
- if (TYPE(dpage) == P_HEAP && NUM_ENT(dpage) != 0) {
- for (indx = 0;
- indx <= HEAP_HIGHINDX(dpage); indx++) {
- if (HEAP_OFFSETTBL(
- dbp, dpage)[indx] == 0)
- continue;
- hdr = (HEAPHDR *)P_ENTRY(
- dbp, dpage, indx);
- if (!F_ISSET(hdr, HEAP_RECSPLIT) ||
- F_ISSET(hdr, HEAP_RECFIRST)) {
- found = TRUE;
- cp->pgno = pgno;
- cp->indx = indx;
- break;
- }
- }
- if (!found)
- pgno++;
- } else
+ __heapc_search(dbc, dpage, 0, 1, &cp->indx, &found);
+ if (found)
+ cp->pgno = pgno;
+ else
pgno++;
}
break;
@@ -668,7 +689,7 @@ last: pgno = PGNO_BASE_MD;
while (!found) {
/* Don't look earlier than the first data page. */
if (pgno < FIRST_HEAP_DPAGE) {
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
}
@@ -683,33 +704,33 @@ last: pgno = PGNO_BASE_MD;
* non-split record or the first piece of a split record
* is found.
*/
- if (TYPE(dpage) == P_HEAP && NUM_ENT(dpage) != 0) {
- for (indx = HEAP_HIGHINDX(dpage);
- indx >= 0; indx--) {
- if (HEAP_OFFSETTBL(
- dbp, dpage)[indx] == 0)
- continue;
- hdr = (HEAPHDR *)P_ENTRY(
- dbp, dpage, indx);
- if (!F_ISSET(hdr, HEAP_RECSPLIT) ||
- F_ISSET(hdr, HEAP_RECFIRST)) {
- found = TRUE;
- cp->pgno = pgno;
- cp->indx = indx;
- break;
- }
- }
- if (!found)
- pgno--;
- } else
+ /*
+ * Start at the highest used slot and walk toward slot 0, as
+ * the loop this call replaces did. Searching upward from
+ * HEAP_HIGHINDX would examine only that single slot and
+ * skip earlier records on the page.
+ */
+ __heapc_search(dbc,
+ dpage, HEAP_HIGHINDX(dpage), -1, &cp->indx, &found);
+ if (found)
+ cp->pgno = pgno;
+ else
pgno--;
}
break;
case DB_NEXT_NODUP:
case DB_NEXT:
- /* If cursor not initialize, behave as DB_FIRST */
- if (dbc->internal->pgno == PGNO_INVALID)
- goto first;
+ case DB_PREV_NODUP:
+ case DB_PREV:
+ /*
+ * np_inc stores whether to increment or decrement when
+ * iterating through records on a page and pages in the file.
+ */
+ if (flags == DB_NEXT_NODUP || flags == DB_NEXT)
+ np_inc = 1;
+ else
+ np_inc = -1;
+ /* If cursor not initialized, behave as DB_FIRST/DB_LAST */
+ if (dbc->internal->pgno == PGNO_INVALID) {
+ if (np_inc == 1)
+ goto first;
+ else
+ goto last;
+ }
/*
* Acquire the current page with the lock we have already,
@@ -720,108 +741,49 @@ last: pgno = PGNO_BASE_MD;
goto err;
dpage = (HEAPPG *)cp->page;
- /* At end of current page, must get next page */
- if (cp->indx >= HEAP_HIGHINDX(dpage))
+ if (np_inc == 1 && cp->indx >= HEAP_HIGHINDX(dpage))
+ /* At end of current page, must get next page. */
getpage = TRUE;
-
- while (!found) {
- if (getpage) {
- pgno = cp->pgno + 1;
-
- /* Put current page/lock and get next one */
- ACQUIRE_CUR(dbc, lock_type, pgno, 0, 0, ret);
- if (ret != 0) {
- /* Beyond last page? */
- if (ret == DB_PAGE_NOTFOUND)
- ret = DB_NOTFOUND;
- goto err;
- }
- dpage = (HEAPPG *)cp->page;
-
- /*
- * If page is a spam page or its a data
- * page without entries, try again.
- */
- if (TYPE(dpage) != P_HEAP ||
- (TYPE(dpage) == P_HEAP &&
- NUM_ENT(dpage) == 0))
- continue;
-
- /* When searching, indx gets bumped to 0 */
- cp->indx = -1;
- getpage = FALSE;
- }
-
+ else if (np_inc == -1) {
/*
- * Bump index and loop through the offset table finding
- * first nonzero entry. If the offset is for a split
- * record, make sure it's the first piece of the split
- * record. HEAP_HIGHINDX always points to highest filled
- * entry on page.
+ * Loop through indexes and find first used slot. Check
+ * if already at the first slot.
*/
- cp->indx++;
- for (indx=cp->indx;
- indx <= HEAP_HIGHINDX(dpage); indx++) {
- if (HEAP_OFFSETTBL(dbp, dpage)[indx] == 0)
- continue;
- hdr = (HEAPHDR *)P_ENTRY(dbp, dpage, indx);
- if (!F_ISSET(hdr, HEAP_RECSPLIT) ||
- F_ISSET(hdr, HEAP_RECFIRST)) {
- found = TRUE;
- cp->indx = indx;
- break;
- }
+ for (f_indx=0; (f_indx <= HEAP_HIGHINDX(dpage)) &&
+ (HEAP_OFFSETTBL(dbp, dpage)[f_indx] == 0); f_indx++)
+ {
+ /* No-op. */
}
- /* Nothing of interest on page, so try next */
- if (!found)
+ /* At the beginning of current page, get new page */
+ if (cp->indx == 0 || cp->indx <= f_indx) {
+ if (cp->pgno == FIRST_HEAP_DPAGE) {
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
+ goto err;
+ }
getpage = TRUE;
- }
- break;
- case DB_PREV_NODUP:
- case DB_PREV:
- /* If cursor not initialize, behave as DB_LAST */
- if (dbc->internal->pgno == PGNO_INVALID)
- goto last;
-
- /*
- * Acquire the current page with the lock we have already,
- * unless user has asked for a write lock.
- */
- ACQUIRE_CUR(dbc, lock_type, cp->pgno, 0, 0, ret);
- if (ret != 0)
- goto err;
- dpage = (HEAPPG *)cp->page;
-
- /*
- * Loop through indexes and find first used slot. Check if
- * already at the first slot.
- */
- for (f_indx=0; (f_indx <= HEAP_HIGHINDX(dpage)) &&
- (HEAP_OFFSETTBL(dbp, dpage)[f_indx] == 0); f_indx++) ;
-
- /* At the beginning of current page, must get new page */
- if (cp->indx == 0 || cp->indx <= f_indx) {
- if (cp->pgno == FIRST_HEAP_DPAGE) {
- ret = DB_NOTFOUND;
- goto err;
}
- getpage = TRUE;
}
while (!found) {
if (getpage) {
- pgno = cp->pgno - 1;
- /* Do not go past first page */
+ if (np_inc == -1)
+ pgno = cp->pgno - 1;
+ else if (np_inc == 1)
+ pgno = cp->pgno + 1;
if (pgno < FIRST_HEAP_DPAGE) {
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
}
- /* Put current page/lock and get prev page. */
+ /* Put current page/lock and get next one */
ACQUIRE_CUR(dbc, lock_type, pgno, 0, 0, ret);
- if (ret != 0)
+ if (ret != 0) {
+ if (np_inc == 1 &&
+ ret == DB_PAGE_NOTFOUND)
+ /* Beyond last page */
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
-
+ }
dpage = (HEAPPG *)cp->page;
/*
@@ -833,31 +795,36 @@ last: pgno = PGNO_BASE_MD;
NUM_ENT(dpage) == 0))
continue;
- /* When search, this gets bumped to high indx */
- cp->indx = HEAP_HIGHINDX(dpage) + 1;
+ if (np_inc == 1)
+ /*
+ * When searching, indx gets
+ * bumped to 0
+ */
+ cp->indx = UINT16_MAX;
+ else
+ /*
+ * When searching, indx gets bumped to
+ * high indx
+ */
+ cp->indx = HEAP_HIGHINDX(dpage) + 1;
getpage = FALSE;
}
/*
- * Decrement index and loop through the offset table
- * finding previous nonzero entry.
+ * Bump index and loop through the offset table finding
+ * first nonzero entry. If the offset is for a split
+ * record, make sure it's the first piece of the split
+ * record. HEAP_HIGHINDX always points to highest filled
+ * entry on page.
*/
- cp->indx--;
- for (indx=cp->indx;
- indx >= 0; indx--) {
- if (HEAP_OFFSETTBL(dbp, dpage)[indx] == 0)
- continue;
- hdr = (HEAPHDR *)P_ENTRY(dbp, dpage, indx);
- if (!F_ISSET(hdr, HEAP_RECSPLIT) ||
- F_ISSET(hdr, HEAP_RECFIRST)) {
- found = TRUE;
- cp->indx = indx;
- break;
- }
- }
-
- /* Nothing of interest on page, so try previous */
+ if (np_inc == -1)
+ cp->indx--;
+ else if (np_inc == 1)
+ cp->indx++;
+ __heapc_search(dbc,
+ dpage, cp->indx, np_inc, &cp->indx, &found);
if (!found)
+ /* Nothing of interest on page, so try next */
getpage = TRUE;
}
break;
@@ -871,7 +838,7 @@ last: pgno = PGNO_BASE_MD;
/* First make sure we're trying to get a data page. */
if (pgno == PGNO_BASE_MD ||
pgno == HEAP_REGION_PGNO(dbp, pgno)) {
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
}
@@ -880,7 +847,7 @@ last: pgno = PGNO_BASE_MD;
if (ret != 0) {
if (ret == DB_PAGE_NOTFOUND)
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
}
dpage = (HEAPPG *)cp->page;
@@ -889,14 +856,14 @@ last: pgno = PGNO_BASE_MD;
if ((indx > HEAP_HIGHINDX(dpage)) ||
(HEAP_OFFSETTBL(dbp, dpage)[indx] == 0)) {
DISCARD(dbc, cp->page, cp->lock, 0, ret);
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
}
hdr = (HEAPHDR *)P_ENTRY(dbp, dpage, indx);
if (F_ISSET(hdr, HEAP_RECSPLIT) &&
!F_ISSET(hdr, HEAP_RECFIRST)) {
DISCARD(dbc, cp->page, cp->lock, 0, ret);
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
}
@@ -911,16 +878,30 @@ last: pgno = PGNO_BASE_MD;
if ((ret = __heapc_gsplit(
dbc, &tmp_val, NULL, 0)) != 0)
goto err;
+ } else if (F_ISSET(hdr, HEAP_RECBLOB)) {
+ memcpy(&bhdr, hdr, HEAPBLOBREC_SIZE);
+ blob_id = (db_seq_t)bhdr.id;
+ GET_BLOB_SIZE(dbc->env, bhdr, blob_size, ret);
+ if (ret != 0)
+ goto err;
+ if (blob_size > UINT32_MAX) {
+ ret = DB_BUFFER_SMALL;
+ goto err;
+ }
+ tmp_val.flags = DB_DBT_MALLOC;
+ if ((ret = __blob_get(dbc, &tmp_val,
+ blob_id, blob_size, NULL, 0)) != 0)
+ goto err;
} else {
tmp_val.data =
(void *)((u_int8_t *)hdr + sizeof(HEAPHDR));
tmp_val.size = hdr->size;
}
- cmp = __bam_defcmp(dbp, &tmp_val, data);
+ cmp = __bam_defcmp(dbp, &tmp_val, data, NULL);
if (F_ISSET(&tmp_val, DB_DBT_MALLOC))
__os_ufree(dbp->env, tmp_val.data);
if (cmp != 0) {
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
}
}
@@ -928,7 +909,7 @@ last: pgno = PGNO_BASE_MD;
break;
case DB_NEXT_DUP:
case DB_PREV_DUP:
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
default:
/* DB_GET_RECNO, DB_JOIN_ITEM, DB_SET_RECNO are invalid */
@@ -959,6 +940,53 @@ err: if (ret == 0 ) {
return (ret);
}
+/*
+ * __heapc_search --
+ * Search a given heap page, starting at a given index, for a viable heap
+ * record. Return the index of the found record in indxp.
+ *
+ * A record is viable when its offset table entry is non-zero and it is
+ * either not a split record or is the first piece of one.
+ *
+ * dbc: cursor; only dbc->dbp is used.
+ * dpage: page to search; nothing is found unless it is a P_HEAP page
+ * with at least one entry.
+ * begin: first slot examined.
+ * dir: 1 to walk toward HEAP_HIGHINDX(dpage), -1 to walk toward slot 0.
+ * indxp: receives the slot of the record; written only on success.
+ * found: set to TRUE or FALSE on every call.
+ *
+ * NOTE(review): with dir == 1 the walk stops only when indx equals
+ * HEAP_HIGHINDX(dpage), so callers presumably guarantee that begin does
+ * not exceed it -- confirm.
+ */
+static void
+__heapc_search(dbc, dpage, begin, dir, indxp, found)
+ DBC *dbc;
+ HEAPPG *dpage;
+ db_indx_t begin;
+ int dir;
+ db_indx_t *indxp;
+ int *found;
+{
+ DB *dbp;
+ HEAPHDR *hdr;
+ db_indx_t indx;
+
+ dbp = dbc->dbp;
+ DB_ASSERT(dbp->env, dir == -1 || dir == 1);
+
+ *found = FALSE;
+ if (TYPE(dpage) != P_HEAP || NUM_ENT(dpage) == 0)
+ return;
+
+ indx = begin;
+ for (;;) {
+ /* A zero offset table entry marks an unused slot. */
+ if (HEAP_OFFSETTBL(dbp, dpage)[indx] != 0) {
+ hdr = (HEAPHDR *)P_ENTRY(dbp, dpage, indx);
+ if (!F_ISSET(hdr, HEAP_RECSPLIT) ||
+ F_ISSET(hdr, HEAP_RECFIRST)) {
+ *found = TRUE;
+ *indxp = indx;
+ break;
+ }
+ }
+ /* Test the boundary before stepping: indx is unsigned. */
+ if ((dir == -1 && indx == 0) ||
+ (dir == 1 && indx == HEAP_HIGHINDX(dpage)))
+ break;
+
+ if (dir == -1)
+ indx--;
+ else
+ indx++;
+ }
+}
+
#undef IS_FIRST
#define IS_FIRST (last_rid.pgno == PGNO_INVALID)
/*
@@ -993,6 +1021,7 @@ __heapc_reloc_partial(dbc, key, data)
/* We only work on partial puts. */
DB_ASSERT(dbp->env, F_ISSET(data, DB_DBT_PARTIAL));
+ DB_ASSERT(dbp->env, !F_ISSET(old_hdr, HEAP_RECBLOB));
/*
* Start by calculating the data_size, total size of the new record, and
@@ -1014,7 +1043,7 @@ __heapc_reloc_partial(dbc, key, data)
dlen = old_size - doff;
else
dlen = data->dlen;
- data_size = old_size - dlen + data->size;
+ data_size = (old_size - dlen) + data->size;
}
/*
@@ -1075,8 +1104,8 @@ __heapc_reloc_partial(dbc, key, data)
*/
data_size = doff + (add_bytes ? data->size : 0);
else
- data_size = old_hdr->size -
- dlen + (add_bytes ? data->size : 0);
+ data_size = (old_hdr->size -
+ dlen) + (add_bytes ? data->size : 0);
data_size += remaining;
if (data_size > buflen) {
@@ -1120,7 +1149,7 @@ __heapc_reloc_partial(dbc, key, data)
if (doff + dlen < old_hdr->size) {
olddata += dlen;
memcpy(buf,
- olddata, old_hdr->size - doff - dlen);
+ olddata, (old_hdr->size - doff) - dlen);
dlen = 0;
} else
/*
@@ -1145,8 +1174,8 @@ __heapc_reloc_partial(dbc, key, data)
log_dbt.size = DB_ALIGN(
old_hdr->size, sizeof(u_int32_t));
if ((ret = __heap_addrem_log(dbp, dbc->txn,
- &LSN(cp->page), 0, DB_REM_HEAP, cp->pgno,
- (u_int32_t)cp->indx, old_size,
+ &LSN(cp->page), 0, OP_SET(DB_REM_HEAP, cp->page),
+ cp->pgno, (u_int32_t)cp->indx, old_size,
&hdr_dbt, &log_dbt, &LSN(cp->page))) != 0)
goto err;
} else
@@ -1185,7 +1214,8 @@ __heapc_reloc_partial(dbc, key, data)
log_dbt.size = DB_ALIGN(
old_hdr->size, sizeof(u_int32_t));
if ((ret = __heap_addrem_log(dbp, dbc->txn,
- &LSN(cp->page), 0, DB_REM_HEAP, cp->pgno,
+ &LSN(cp->page), 0,
+ OP_SET(DB_REM_HEAP, cp->page), cp->pgno,
(u_int32_t)cp->indx, old_size,
&hdr_dbt, &log_dbt, &LSN(cp->page))) != 0)
goto err;
@@ -1197,7 +1227,8 @@ __heapc_reloc_partial(dbc, key, data)
if (DBC_LOGGING(dbc)) {
if ((ret = __heap_addrem_log(dbp, dbc->txn,
- &LSN(cp->page), 0, DB_ADD_HEAP, cp->pgno,
+ &LSN(cp->page), 0,
+ OP_SET(DB_ADD_HEAP, cp->page), cp->pgno,
(u_int32_t)cp->indx, old_size,
&hdr_dbt, &log_dbt, &LSN(cp->page))) != 0)
goto err;
@@ -1231,7 +1262,7 @@ __heapc_reloc_partial(dbc, key, data)
size -= sizeof(db_indx_t);
/* Round down to a multiple of 4. */
size = DB_ALIGN(
- size - sizeof(u_int32_t) + 1, sizeof(u_int32_t));
+ (size - sizeof(u_int32_t)) + 1, sizeof(u_int32_t));
DB_ASSERT(dbp->env, size >= sizeof(HEAPSPLITHDR));
/*
@@ -1261,7 +1292,8 @@ __heapc_reloc_partial(dbc, key, data)
if (DBC_LOGGING(dbc)) {
if ((ret = __heap_addrem_log(dbp,
dbc->txn, &LSN(cp->page), 0,
- DB_ADD_HEAP, cp->pgno, (u_int32_t)cp->indx,
+ OP_SET(DB_ADD_HEAP, cp->page), cp->pgno,
+ (u_int32_t)cp->indx,
size, &hdr_dbt, &t_data, &LSN(cp->page))) != 0)
goto err;
} else
@@ -1343,7 +1375,8 @@ next_pg: last_rid.pgno = cp->pgno;
log_dbt.size = DB_ALIGN(
old_hdr->size, sizeof(u_int32_t));
if ((ret = __heap_addrem_log(dbp, dbc->txn,
- &LSN(cp->page), 0, DB_REM_HEAP, cp->pgno,
+ &LSN(cp->page), 0,
+ OP_SET(DB_REM_HEAP, cp->page), cp->pgno,
(u_int32_t)cp->indx, old_size,
&hdr_dbt, &log_dbt, &LSN(cp->page))) != 0)
goto err;
@@ -1355,7 +1388,8 @@ next_pg: last_rid.pgno = cp->pgno;
if (DBC_LOGGING(dbc)) {
if ((ret = __heap_addrem_log(dbp, dbc->txn,
- &LSN(cp->page), 0, DB_ADD_HEAP, cp->pgno,
+ &LSN(cp->page), 0,
+ OP_SET(DB_ADD_HEAP, cp->page), cp->pgno,
(u_int32_t)cp->indx, old_size,
&hdr_dbt, &log_dbt, &LSN(cp->page))) != 0)
goto err;
@@ -1397,6 +1431,8 @@ __heapc_reloc(dbc, key, data)
memset(&hdr_dbt, 0, sizeof(DBT));
memset(&log_dbt, 0, sizeof(DBT));
COMPQUIET(key, NULL);
+ /* Blob database records never change size. */
+ DB_ASSERT(dbp->env, !F_ISSET(old_hdr, HEAP_RECBLOB));
/*
* We are updating an existing record, which will grow into a split
@@ -1436,7 +1472,8 @@ __heapc_reloc(dbc, key, data)
log_dbt.size = DB_ALIGN(
old_hdr->size, sizeof(u_int32_t));
if ((ret = __heap_addrem_log(dbp, dbc->txn,
- &LSN(cp->page), 0, DB_REM_HEAP, cp->pgno,
+ &LSN(cp->page), 0,
+ OP_SET(DB_REM_HEAP, cp->page), cp->pgno,
(u_int32_t)cp->indx, old_size,
&hdr_dbt, &log_dbt, &LSN(cp->page))) != 0)
goto err;
@@ -1471,7 +1508,7 @@ __heapc_reloc(dbc, key, data)
size -= sizeof(db_indx_t);
/* Round down to a multiple of 4. */
size = DB_ALIGN(
- size - sizeof(u_int32_t) + 1, sizeof(u_int32_t));
+ (size - sizeof(u_int32_t)) + 1, sizeof(u_int32_t));
DB_ASSERT(dbp->env, size >= sizeof(HEAPSPLITHDR));
new_hdr.std_hdr.size =
(u_int16_t)(size - sizeof(HEAPSPLITHDR));
@@ -1495,7 +1532,8 @@ __heapc_reloc(dbc, key, data)
if (DBC_LOGGING(dbc)) {
if ((ret = __heap_addrem_log(dbp,
dbc->txn, &LSN(cp->page), 0,
- DB_ADD_HEAP, cp->pgno, (u_int32_t)cp->indx,
+ OP_SET(DB_ADD_HEAP, cp->page),
+ cp->pgno, (u_int32_t)cp->indx,
size, &hdr_dbt, &t_data, &LSN(cp->page))) != 0)
goto err;
} else
@@ -1565,7 +1603,8 @@ next_pg: if (next_rid.pgno != PGNO_INVALID) {
log_dbt.size = DB_ALIGN(
old_hdr->size, sizeof(u_int32_t));
if ((ret = __heap_addrem_log(dbp, dbc->txn,
- &LSN(cp->page), 0, DB_REM_HEAP, cp->pgno,
+ &LSN(cp->page), 0,
+ OP_SET(DB_REM_HEAP, cp->page), cp->pgno,
(u_int32_t)cp->indx, old_size,
&hdr_dbt, &log_dbt, &LSN(cp->page))) != 0)
goto err;
@@ -1577,7 +1616,8 @@ next_pg: if (next_rid.pgno != PGNO_INVALID) {
if (DBC_LOGGING(dbc)) {
if ((ret = __heap_addrem_log(dbp, dbc->txn,
- &LSN(cp->page), 0, DB_ADD_HEAP, cp->pgno,
+ &LSN(cp->page), 0,
+ OP_SET(DB_ADD_HEAP, cp->page), cp->pgno,
(u_int32_t)cp->indx,old_size,
&hdr_dbt, &log_dbt, &LSN(cp->page))) != 0)
goto err;
@@ -1608,20 +1648,26 @@ __heapc_put(dbc, key, data, flags, pgnop)
DB *dbp;
DBT hdr_dbt, log_dbt, new_data;
DB_MPOOLFILE *mpf;
+ HEAPBLOBHDR bhdr;
HEAPHDR hdr, *old_hdr;
HEAP_CURSOR *cp;
PAGE *rpage;
db_pgno_t region_pgno;
- int oldspace, ret, space, t_ret;
- u_int32_t data_size, dlen, new_size, old_flags, old_size, tot_size;
- u_int8_t *buf, *olddata, *src, *dest;
+ int buf_alloc, ret, t_ret;
+ off_t blob_size;
+ db_seq_t blob_id, new_blob_id;
+ u_int32_t data_size, dlen, new_size, old_flags, old_size;
+ u_int32_t oldspace, space, tot_size;
+ u_int8_t *buf, *olddata;
dbp = dbc->dbp;
mpf = dbp->mpf;
cp = (HEAP_CURSOR *)dbc->internal;
rpage = NULL;
- buf = dest = src = NULL;
+ buf = NULL;
+ buf_alloc = 0;
dlen = 0;
+ blob_id = new_blob_id = 0;
if (flags != DB_CURRENT) {
/* We're going to write following the get, so use RMW. */
@@ -1668,7 +1714,8 @@ __heapc_put(dbc, key, data, flags, pgnop)
DB_ALIGN(old_hdr->size + HEAP_HDRSIZE(old_hdr), sizeof(u_int32_t));
if (old_size < sizeof(HEAPSPLITHDR))
old_size = sizeof(HEAPSPLITHDR);
- if (F_ISSET(data, DB_DBT_PARTIAL)) {
+ /* Partial puts on blobs are dealt with in the blob code. */
+ if (F_ISSET(data, DB_DBT_PARTIAL) && !F_ISSET(old_hdr, HEAP_RECBLOB)) {
if (F_ISSET(old_hdr, HEAP_RECSPLIT))
tot_size = ((HEAPSPLITHDR *)old_hdr)->tsize;
else
@@ -1682,9 +1729,11 @@ __heapc_put(dbc, key, data, flags, pgnop)
dlen = tot_size - data->doff;
else
dlen = data->dlen;
- data_size = tot_size - dlen + data->size;
+ data_size = (tot_size - dlen) + data->size;
}
- } else
+ } else if F_ISSET(old_hdr, HEAP_RECBLOB)
+ data_size = HEAPBLOBREC_DSIZE;
+ else
data_size = data->size;
new_size = DB_ALIGN(data_size + sizeof(HEAPHDR), sizeof(u_int32_t));
if (new_size < sizeof(HEAPSPLITHDR))
@@ -1694,6 +1743,8 @@ __heapc_put(dbc, key, data, flags, pgnop)
if (F_ISSET(old_hdr, HEAP_RECSPLIT) ||
(new_size > old_size &&
new_size - old_size > HEAP_FREESPACE(dbp, cp->page))) {
+ /* Blob database records never change size. */
+ DB_ASSERT(dbp->env, !F_ISSET(old_hdr, HEAP_RECBLOB));
/*
* We've got to split the record, not enough room on the
* page. Splitting the record will remove old_size bytes and
@@ -1707,13 +1758,14 @@ __heapc_put(dbc, key, data, flags, pgnop)
memset(&new_data, 0, sizeof(DBT));
new_data.size = data_size;
- if (F_ISSET(data, DB_DBT_PARTIAL)) {
+ if (F_ISSET(data, DB_DBT_PARTIAL) && !F_ISSET(old_hdr, HEAP_RECBLOB)) {
/*
* Before replacing the old data, we need to use it to build the
* new data.
*/
if ((ret = __os_malloc(dbp->env, data_size, &buf)) != 0)
goto err;
+ buf_alloc = 1;
new_data.data = buf;
/*
@@ -1736,10 +1788,32 @@ __heapc_put(dbc, key, data, flags, pgnop)
buf += data->size;
/* Fill in remaining data from the old record, skipping dlen. */
- if (data->doff < old_hdr->size) {
+ if ((data->doff + data->dlen) < old_hdr->size) {
olddata += data->doff + data->dlen;
- memcpy(buf,
- olddata, old_hdr->size - data->doff - data->dlen);
+ memcpy(buf, olddata,
+ (old_hdr->size - data->doff) - data->dlen);
+ }
+ } else if (F_ISSET(old_hdr, HEAP_RECBLOB)) {
+ data_size = HEAPBLOBREC_DSIZE;
+ new_data.size = HEAPBLOBREC_DSIZE;
+ if (F_ISSET(data, DB_DBT_BLOB_REC)) {
+ DB_ASSERT(dbp->env,
+ F_ISSET(((HEAPHDR *)data->data), HEAP_RECBLOB));
+ new_data.data = HEAPBLOBREC_DATA(data->data);
+ } else {
+ memcpy(&bhdr, old_hdr, HEAPBLOBREC_SIZE);
+ blob_id = (db_seq_t)bhdr.id;
+ GET_BLOB_SIZE(dbp->env, bhdr, blob_size, ret);
+ if (ret != 0)
+ goto err;
+ if ((ret = __blob_repl(dbc,
+ data, blob_id, &new_blob_id, &blob_size)) != 0)
+ goto err;
+ bhdr.std_hdr.flags = HEAP_RECBLOB;
+ bhdr.std_hdr.size = HEAPBLOBREC_DSIZE;
+ SET_BLOB_SIZE(&bhdr, blob_size, HEAPBLOBHDR);
+ SET_BLOB_ID(&bhdr, new_blob_id, HEAPBLOBHDR);
+ new_data.data = HEAPBLOBREC_DATA(&bhdr);
}
} else {
new_data.data = data->data;
@@ -1751,19 +1825,23 @@ __heapc_put(dbc, key, data, flags, pgnop)
*/
memset(&hdr, 0, sizeof(HEAPHDR));
hdr.size = data_size;
+ if (F_ISSET(old_hdr, HEAP_RECBLOB))
+ hdr.flags = HEAP_RECBLOB;
if (DBC_LOGGING(dbc)) {
hdr_dbt.data = old_hdr;
hdr_dbt.size = HEAP_HDRSIZE(old_hdr);
log_dbt.data = (u_int8_t *)old_hdr + hdr_dbt.size;
log_dbt.size = DB_ALIGN(old_hdr->size, sizeof(u_int32_t));
if ((ret = __heap_addrem_log(dbp, dbc->txn, &LSN(cp->page),
- 0, DB_REM_HEAP, cp->pgno, (u_int32_t)cp->indx,
+ 0, OP_SET(DB_REM_HEAP, cp->page), cp->pgno,
+ (u_int32_t)cp->indx,
old_size, &hdr_dbt, &log_dbt, &LSN(cp->page))) != 0)
goto err;
hdr_dbt.data = &hdr;
hdr_dbt.size = HEAP_HDRSIZE(&hdr);
if ((ret = __heap_addrem_log(dbp, dbc->txn, &LSN(cp->page),
- 0, DB_ADD_HEAP, cp->pgno, (u_int32_t)cp->indx,
+ 0, OP_SET(DB_ADD_HEAP, cp->page), cp->pgno,
+ (u_int32_t)cp->indx,
new_size, &hdr_dbt, &new_data, &LSN(cp->page))) != 0)
goto err;
} else
@@ -1788,14 +1866,14 @@ __heapc_put(dbc, key, data, flags, pgnop)
dbc->thread_info, NULL, DB_MPOOL_DIRTY, &rpage)) != 0)
goto err;
- HEAP_SETSPACE(dbp, rpage, cp->pgno - region_pgno - 1, space);
+ HEAP_SETSPACE(dbp, rpage, (cp->pgno - region_pgno) - 1, space);
}
err: DB_ASSERT(dbp->env, ret != DB_PAGE_NOTFOUND);
if (rpage != NULL && (t_ret = __memp_fput(mpf,
dbc->thread_info, rpage, dbc->priority)) != 0 && ret == 0)
ret = t_ret;
- if (F_ISSET(data, DB_DBT_PARTIAL))
+ if (buf_alloc)
__os_free(dbp->env, new_data.data);
if (ret != 0 && LOCK_ISSET(cp->lock))
@@ -1823,18 +1901,21 @@ __heap_getpage(dbc, size, avail)
HEAP *h;
HEAPPG *rpage;
HEAP_CURSOR *cp;
- db_pgno_t data_pgno, *lkd_pgs, meta_pgno, region_pgno, start_region;
- int i, lk_mode, max, p, ret, space, start, t_ret;
+ db_pgno_t data_pgno, i, max, meta_pgno, p, region_pgno, start;
+ db_pgno_t start_region;
+ int ret, t_ret;
+ u_int32_t lk_mode, space;
LOCK_INIT(meta_lock);
+ data_pgno = PGNO_INVALID;
dbp = dbc->dbp;
mpf = dbp->mpf;
cp = (HEAP_CURSOR *)dbc->internal;
h = dbp->heap_internal;
start_region = region_pgno = h->curregion;
max = HEAP_REGION_SIZE(dbp);
- i = ret = t_ret = 0;
- lkd_pgs = NULL;
+ i = 0;
+ ret = t_ret = 0;
/*
* The algorithm for finding a page:
@@ -1897,10 +1978,10 @@ find: while ((ret = __memp_fget(mpf, &region_pgno,
max = h->maxpgno - region_pgno;
/*
* Look in the bitmap for a page with sufficient free space. We use i
- * in a slightly strange way. Because the 2-bits in the bitmap are only
- * an estimate, there is a chance the data won't fit on the page we
- * choose. In that case, we re-start the process and want to be able to
- * resume this loop where we left off.
+ * in a slightly strange way. Because the 2-bits in the bitmap are
+ * only an estimate, there is a chance the data won't fit on the page
+ * we choose. In that case, we re-start the process and want to be
+ * able to resume this loop where we left off.
*/
for (; i < max; i++) {
p = start + i;
@@ -1908,7 +1989,7 @@ find: while ((ret = __memp_fget(mpf, &region_pgno,
p -= max;
if ((*avail = HEAP_SPACE(dbp, rpage, p)) > space)
continue;
- data_pgno = region_pgno + p + 1;
+ data_pgno = (region_pgno + p) + 1;
ACQUIRE_CUR(dbc,
DB_LOCK_WRITE, data_pgno, DB_LOCK_NOWAIT, 0, ret);
/*
@@ -2071,7 +2152,7 @@ pg_err: if (p != 0) {
if (ret == DB_LOCK_NOTGRANTED)
ret = 0;
else if (ret != 0) {
- /*
+ /*
* Free up the metadata lock. If this was an error
* other than a missing region page, bail.
*/
@@ -2165,7 +2246,7 @@ check: if (size + sizeof(db_indx_t) > HEAP_FREESPACE(dbp, cp->page)) {
}
}
- h->curpgindx = data_pgno - region_pgno - 1;
+ h->curpgindx = (data_pgno - region_pgno) - 1;
err: DB_ASSERT(dbp->env, ret != DB_PAGE_NOTFOUND);
if (rpage != NULL && (t_ret = __memp_fput(mpf,
dbc->thread_info, rpage, dbc->priority)) != 0 && ret == 0)
@@ -2187,26 +2268,40 @@ __heap_append(dbc, key, data)
DBT *data, *key;
{
DB *dbp;
- DBT tmp_dbt;
+ DBT tmp_dbt, data_dbt;
DB_HEAP_RID rid;
+ DB_LSN lsn;
DB_MPOOLFILE *mpf;
HEAPPG *rpage;
+ HEAPBLOBHDR bhdr;
HEAPHDR hdr;
HEAP_CURSOR *cp;
db_indx_t indx;
db_pgno_t region_pgno;
- int ret, space, t_ret;
+ int is_blob, ret, t_ret;
+ off_t blob_size;
+ db_seq_t blob_id;
u_int8_t avail;
- u_int32_t data_size;
+ u_int32_t data_size, space;
dbp = dbc->dbp;
mpf = dbp->mpf;
ret = t_ret = 0;
rpage = NULL;
cp = (HEAP_CURSOR *)dbc->internal;
+ blob_size = 0;
+ blob_id = 0;
+
+ if (dbp->blob_threshold &&
+ (data->size >= dbp->blob_threshold || F_ISSET(data, DB_DBT_BLOB)))
+ is_blob = 1;
+ else
+ is_blob = 0;
/* Need data.size + header size, 4-byte aligned. */
- if (F_ISSET(data, DB_DBT_PARTIAL))
+ if (is_blob)
+ data_size = HEAPBLOBREC_SIZE;
+ else if (F_ISSET(data, DB_DBT_PARTIAL))
data_size = DB_ALIGN(data->doff +
data->size + sizeof(HEAPHDR), sizeof(u_int32_t));
else
@@ -2222,24 +2317,42 @@ __heap_append(dbc, key, data)
goto err;
indx = HEAP_FREEINDX(cp->page);
- memset(&hdr, 0, sizeof(HEAPHDR));
- hdr.size = data->size;
- if (F_ISSET(data, DB_DBT_PARTIAL))
- hdr.size += data->doff;
- tmp_dbt.data = &hdr;
- tmp_dbt.size = sizeof(HEAPHDR);
+ if (is_blob) {
+ if ((ret = __blob_put(
+ dbc, data, &blob_id, &blob_size, &lsn)) != 0)
+ goto err;
+ memset(&bhdr, 0, HEAPBLOBREC_SIZE);
+ bhdr.std_hdr.flags = HEAP_RECBLOB;
+ bhdr.std_hdr.size = HEAPBLOBREC_DSIZE;
+ SET_BLOB_SIZE(&bhdr, blob_size, HEAPBLOBHDR);
+ SET_BLOB_ID(&bhdr, blob_id, HEAPBLOBHDR);
+ SET_BLOB_FILE_ID(&bhdr, dbp->blob_file_id, HEAPBLOBHDR);
+ tmp_dbt.data = &bhdr;
+ tmp_dbt.size = sizeof(HEAPHDR);
+ memset(&data_dbt, 0, sizeof(DBT));
+ data_dbt.data = HEAPBLOBREC_DATA((&bhdr));
+ data_dbt.size = HEAPBLOBREC_DSIZE;
+ } else {
+ memset(&hdr, 0, sizeof(HEAPHDR));
+ hdr.size = data->size;
+ if (F_ISSET(data, DB_DBT_PARTIAL))
+ hdr.size += data->doff;
+ tmp_dbt.data = &hdr;
+ tmp_dbt.size = sizeof(HEAPHDR);
+ memcpy(&data_dbt, data, sizeof(DBT));
+ }
/* Log the write. */
if (DBC_LOGGING(dbc)) {
if ((ret = __heap_addrem_log(dbp, dbc->txn, &LSN(cp->page),
- 0, DB_ADD_HEAP, cp->pgno, (u_int32_t)indx,
- data_size, &tmp_dbt, data, &LSN(cp->page))) != 0)
+ 0, OP_SET(DB_ADD_HEAP, cp->page), cp->pgno, (u_int32_t)indx,
+ data_size, &tmp_dbt, &data_dbt, &LSN(cp->page))) != 0)
goto err;
} else
LSN_NOT_LOGGED(LSN(cp->page));
if ((ret = __heap_pitem(
- dbc, (PAGE *)cp->page, indx, data_size, &tmp_dbt, data)) != 0)
+ dbc, (PAGE *)cp->page, indx, data_size, &tmp_dbt, &data_dbt)) != 0)
goto err;
rid.pgno = cp->pgno;
@@ -2256,7 +2369,7 @@ __heap_append(dbc, key, data)
dbc->thread_info, NULL, DB_MPOOL_DIRTY, &rpage)) != 0)
goto err;
- HEAP_SETSPACE(dbp, rpage, cp->pgno - region_pgno - 1, space);
+ HEAP_SETSPACE(dbp, rpage, (cp->pgno - region_pgno) - 1, space);
}
err: DB_ASSERT(dbp->env, ret != DB_PAGE_NOTFOUND);
@@ -2292,8 +2405,8 @@ __heapc_split(dbc, key, data, is_first)
HEAP_CURSOR *cp;
db_indx_t indx;
db_pgno_t region_pgno;
- int ret, spacebits, t_ret;
- u_int32_t buflen, doff, left, size;
+ int ret, t_ret;
+ u_int32_t buflen, doff, left, size, spacebits;
u_int8_t availbits, *buf;
dbp = dbc->dbp;
@@ -2308,7 +2421,6 @@ __heapc_split(dbc, key, data, is_first)
ret = t_ret = 0;
indx = 0;
buf = NULL;
- buflen = 0;
/*
* Write the record to multiple pages, in chunks starting from the end.
@@ -2322,6 +2434,9 @@ __heapc_split(dbc, key, data, is_first)
left += data->doff;
}
hdrs.tsize = left;
+ buflen = 1;
+ if ((ret = __os_malloc(dbp->env, buflen, &buf)) != 0)
+ return (ret);
while (left > 0) {
size = DB_ALIGN(left + sizeof(HEAPSPLITHDR), sizeof(u_int32_t));
if (size < sizeof(HEAPSPLITHDR))
@@ -2336,8 +2451,10 @@ __heapc_split(dbc, key, data, is_first)
else
hdrs.std_hdr.flags |= HEAP_RECFIRST;
- if ((ret = __heap_getpage(dbc, size, &availbits)) != 0)
+ if ((ret = __heap_getpage(dbc, size, &availbits)) != 0) {
+ __os_free(dbp->env, buf);
return (ret);
+ }
/*
* size is the total number of bytes being written to the page.
@@ -2363,7 +2480,7 @@ __heapc_split(dbc, key, data, is_first)
size -= sizeof(db_indx_t);
/* Round down to a multiple of 4. */
size = DB_ALIGN(
- size - sizeof(u_int32_t) + 1, sizeof(u_int32_t));
+ (size - sizeof(u_int32_t)) + 1, sizeof(u_int32_t));
DB_ASSERT(dbp->env, size >= sizeof(HEAPSPLITHDR));
hdrs.std_hdr.size =
(u_int16_t)(size - sizeof(HEAPSPLITHDR));
@@ -2401,10 +2518,10 @@ __heapc_split(dbc, key, data, is_first)
* page minus the bytes we're taking from data.
*/
t_data.data = buf;
- memset(buf, '\0', t_data.size - left + doff);
- buf += t_data.size - left + doff;
+ memset(buf, 0, (t_data.size - left) + doff);
+ buf += (t_data.size - left) + doff;
memcpy(buf, data->data, left - doff);
- doff -= t_data.size - left + doff;
+ doff -= (t_data.size - left) + doff;
buf = t_data.data;
}
hdr_dbt.data = &hdrs;
@@ -2415,7 +2532,8 @@ __heapc_split(dbc, key, data, is_first)
if (DBC_LOGGING(dbc)) {
if ((ret = __heap_addrem_log(dbp,
dbc->txn, &LSN(cp->page), 0,
- DB_ADD_HEAP, cp->pgno, (u_int32_t)indx,
+ OP_SET(DB_ADD_HEAP, cp->page),
+ cp->pgno, (u_int32_t)indx,
size, &hdr_dbt, &t_data, &LSN(cp->page))) != 0)
goto err;
} else
@@ -2447,7 +2565,7 @@ __heapc_split(dbc, key, data, is_first)
goto err;
HEAP_SETSPACE(dbp,
- rpage, cp->pgno - region_pgno - 1, spacebits);
+ rpage, (cp->pgno - region_pgno) - 1, spacebits);
ret = __memp_fput(mpf,
dbc->thread_info, rpage, dbc->priority);
rpage = NULL;
diff --git a/src/heap/heap.src b/src/heap/heap.src
index 47bd4bb0..a08ad5eb 100644
--- a/src/heap/heap.src
+++ b/src/heap/heap.src
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2010, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -29,7 +29,29 @@ INCLUDE
* dbt: data that is to be added or deleted.
* pagelsn: former lsn of the page.
*/
-BEGIN addrem 49 151
+BEGIN addrem 61 151
+OP opcode u_int32_t lu
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+ARG indx u_int32_t lu
+ARG nbytes u_int32_t lu
+HDR hdr DBT s
+DATA dbt DBT s
+POINTER pagelsn DB_LSN * lu
+END
+
+BEGIN_COMPAT addrem 60 151
+OP opcode u_int32_t lu
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+ARG indx u_int32_t lu
+ARG nbytes u_int32_t lu
+HDR hdr DBT s
+DBT dbt DBT s
+POINTER pagelsn DB_LSN * lu
+END
+
+BEGIN_COMPAT addrem 50 151
ARG opcode u_int32_t lu
DB fileid int32_t ld
ARG pgno db_pgno_t lu
diff --git a/src/heap/heap_auto.c b/src/heap/heap_auto.c
index 1cb705f4..9fdcce7a 100644
--- a/src/heap/heap_auto.c
+++ b/src/heap/heap_auto.c
@@ -9,16 +9,38 @@
#include "dbinc/txn.h"
DB_LOG_RECSPEC __heap_addrem_desc[] = {
- {LOGREC_ARG, SSZ(__heap_addrem_args, opcode), "opcode", "%lu"},
+ {LOGREC_OP, SSZ(__heap_addrem_args, opcode), "opcode", "%lu"},
{LOGREC_DB, SSZ(__heap_addrem_args, fileid), "fileid", ""},
{LOGREC_ARG, SSZ(__heap_addrem_args, pgno), "pgno", "%lu"},
{LOGREC_ARG, SSZ(__heap_addrem_args, indx), "indx", "%lu"},
{LOGREC_ARG, SSZ(__heap_addrem_args, nbytes), "nbytes", "%lu"},
- {LOGREC_DBT, SSZ(__heap_addrem_args, hdr), "hdr", ""},
- {LOGREC_DBT, SSZ(__heap_addrem_args, dbt), "dbt", ""},
+ {LOGREC_HDR, SSZ(__heap_addrem_args, hdr), "hdr", ""},
+ {LOGREC_DATA, SSZ(__heap_addrem_args, dbt), "dbt", ""},
{LOGREC_POINTER, SSZ(__heap_addrem_args, pagelsn), "pagelsn", ""},
{LOGREC_Done, 0, "", ""}
};
+DB_LOG_RECSPEC __heap_addrem_60_desc[] = {
+ {LOGREC_OP, SSZ(__heap_addrem_60_args, opcode), "opcode", "%lu"},
+ {LOGREC_DB, SSZ(__heap_addrem_60_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__heap_addrem_60_args, pgno), "pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__heap_addrem_60_args, indx), "indx", "%lu"},
+ {LOGREC_ARG, SSZ(__heap_addrem_60_args, nbytes), "nbytes", "%lu"},
+ {LOGREC_HDR, SSZ(__heap_addrem_60_args, hdr), "hdr", ""},
+ {LOGREC_DBT, SSZ(__heap_addrem_60_args, dbt), "dbt", ""},
+ {LOGREC_POINTER, SSZ(__heap_addrem_60_args, pagelsn), "pagelsn", ""},
+ {LOGREC_Done, 0, "", ""}
+};
+DB_LOG_RECSPEC __heap_addrem_50_desc[] = {
+ {LOGREC_ARG, SSZ(__heap_addrem_50_args, opcode), "opcode", "%lu"},
+ {LOGREC_DB, SSZ(__heap_addrem_50_args, fileid), "fileid", ""},
+ {LOGREC_ARG, SSZ(__heap_addrem_50_args, pgno), "pgno", "%lu"},
+ {LOGREC_ARG, SSZ(__heap_addrem_50_args, indx), "indx", "%lu"},
+ {LOGREC_ARG, SSZ(__heap_addrem_50_args, nbytes), "nbytes", "%lu"},
+ {LOGREC_DBT, SSZ(__heap_addrem_50_args, hdr), "hdr", ""},
+ {LOGREC_DBT, SSZ(__heap_addrem_50_args, dbt), "dbt", ""},
+ {LOGREC_POINTER, SSZ(__heap_addrem_50_args, pagelsn), "pagelsn", ""},
+ {LOGREC_Done, 0, "", ""}
+};
DB_LOG_RECSPEC __heap_pg_alloc_desc[] = {
{LOGREC_DB, SSZ(__heap_pg_alloc_args, fileid), "fileid", ""},
{LOGREC_POINTER, SSZ(__heap_pg_alloc_args, meta_lsn), "meta_lsn", ""},
diff --git a/src/heap/heap_autop.c b/src/heap/heap_autop.c
index b767203b..ac08441b 100644
--- a/src/heap/heap_autop.c
+++ b/src/heap/heap_autop.c
@@ -28,6 +28,40 @@ __heap_addrem_print(env, dbtp, lsnp, notused2, info)
}
/*
+ * PUBLIC: int __heap_addrem_60_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__heap_addrem_60_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__heap_addrem_60", __heap_addrem_60_desc, info));
+}
+
+/*
+ * PUBLIC: int __heap_addrem_50_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__heap_addrem_50_print(env, dbtp, lsnp, notused2, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *info;
+{
+ COMPQUIET(notused2, DB_TXN_PRINT);
+
+ return (__log_print_record(env, dbtp, lsnp, "__heap_addrem_50", __heap_addrem_50_desc, info));
+}
+
+/*
* PUBLIC: int __heap_pg_alloc_print __P((ENV *, DBT *, DB_LSN *,
* PUBLIC: db_recops, void *));
*/
diff --git a/src/heap/heap_backup.c b/src/heap/heap_backup.c
index 4588b0ba..77b0eaaa 100644
--- a/src/heap/heap_backup.c
+++ b/src/heap/heap_backup.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2011, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/heap/heap_conv.c b/src/heap/heap_conv.c
index 9f432d13..dbf059a4 100644
--- a/src/heap/heap_conv.c
+++ b/src/heap/heap_conv.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2010, 2015 Oracle and/or its affiliates. All rights reserved.
*/
#include "db_config.h"
@@ -86,7 +86,10 @@ __heap_mswap(env, pg)
SWAP32(p); /* gbytes */
SWAP32(p); /* bytes */
SWAP32(p); /* region_size */
- p += 92 * sizeof(u_int32_t); /* unused */
+ SWAP32(p); /* threshold */
+ SWAP32(p); /* file id lo */
+ SWAP32(p); /* file id hi */
+ p += 89 * sizeof(u_int32_t); /* unused */
SWAP32(p); /* crypto_magic */
return (0);
diff --git a/src/heap/heap_method.c b/src/heap/heap_method.c
index f938b5e7..2667f4fe 100644
--- a/src/heap/heap_method.c
+++ b/src/heap/heap_method.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2010, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -49,15 +49,11 @@ __heap_db_close(dbp)
DB *dbp;
{
HEAP *h;
- int ret;
-
- ret = 0;
- if ((h = dbp->heap_internal) == NULL)
- return (0);
-
- __os_free(dbp->env, h);
- dbp->heap_internal = NULL;
+ if ((h = dbp->heap_internal) != NULL) {
+ __os_free(dbp->env, h);
+ dbp->heap_internal = NULL;
+ }
return (0);
}
diff --git a/src/heap/heap_open.c b/src/heap/heap_open.c
index 6827450d..f5bb72ae 100644
--- a/src/heap/heap_open.c
+++ b/src/heap/heap_open.c
@@ -1,19 +1,19 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2010, 2015 Oracle and/or its affiliates. All rights reserved.
*/
#include "db_config.h"
#include "db_int.h"
+#include "dbinc/blob.h"
#include "dbinc/crypto.h"
#include "dbinc/db_page.h"
#include "dbinc/db_swap.h"
#include "dbinc/fop.h"
#include "dbinc/heap.h"
#include "dbinc/lock.h"
-#include "dbinc/log.h"
#include "dbinc/mp.h"
static void __heap_init_meta __P((DB *, HEAPMETA *, db_pgno_t, DB_LSN*));
@@ -82,6 +82,7 @@ __heap_metachk(dbp, name, hm)
env = dbp->env;
h = (HEAP *)dbp->heap_internal;
+ ret = 0;
/*
* At this point, all we know is that the magic number is for a Heap.
@@ -92,6 +93,7 @@ __heap_metachk(dbp, name, hm)
M_32_SWAP(vers);
switch (vers) {
case 1:
+ case 2:
break;
default:
__db_errx(env,
@@ -116,6 +118,26 @@ __heap_metachk(dbp, name, hm)
/* Set the page size. */
dbp->pgsize = hm->dbmeta.pagesize;
+ dbp->blob_threshold = hm->blob_threshold;
+ GET_BLOB_FILE_ID(env, hm, dbp->blob_file_id, ret);
+ if (ret != 0)
+ return (ret);
+ /* Blob databases must be upgraded. */
+ if (vers == 1 && dbp->blob_file_id != 0) {
+ __db_errx(env, DB_STR_A("1209",
+"%s: databases that support blobs must be upgraded.", "%s"),
+ name);
+ return (EINVAL);
+ }
+#ifndef HAVE_64BIT_TYPES
+ if (dbp->blob_file_id != 0) {
+ __db_errx(env, DB_STR_A("1205",
+ "%s: blobs require 64 integer compiler support.", "%s"),
+ name);
+ return (EINVAL);
+ }
+#endif
+
/* Copy the file's ID. */
memcpy(dbp->fileid, hm->dbmeta.uid, DB_FILE_ID_LEN);
@@ -179,7 +201,8 @@ __heap_read_meta(dbp, ip, txn, meta_pgno, flags)
h->region_size = meta->region_size;
if (PGNO(meta) == PGNO_BASE_MD && !F_ISSET(dbp, DB_AM_RECOVER))
- __memp_set_last_pgno(mpf, meta->dbmeta.last_pgno);
+ (void)__memp_set_last_pgno(
+ mpf, meta->dbmeta.last_pgno);
} else {
DB_ASSERT(dbp->env,
IS_RECOVERING(dbp->env) || F_ISSET(dbp, DB_AM_RECOVER));
@@ -285,6 +308,12 @@ __heap_new_file(dbp, ip, txn, fhp, name)
pginfo.type = dbp->type;
pdbt.data = &pginfo;
pdbt.size = sizeof(pginfo);
+ if (dbp->blob_threshold) {
+ if ((ret = __blob_generate_dir_ids(
+ dbp, txn, &dbp->blob_file_id)) != 0)
+ return (ret);
+
+ }
if ((ret = __os_calloc(env, 1, dbp->pgsize, &buf)) != 0)
return (ret);
meta = (HEAPMETA *)buf;
@@ -394,7 +423,9 @@ done: if (region != NULL && (t_ret = __memp_fput(mpf,
dbc->thread_info, region, dbc->priority)) != 0 && ret == 0)
ret = t_ret;
- ret = __memp_fput(mpf, dbc->thread_info, meta, dbc->priority);
+ if ((t_ret = __memp_fput(mpf,
+ dbc->thread_info, meta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
if ((t_ret = __TLPUT(dbc, meta_lock)) != 0 && ret == 0)
ret = t_ret;
@@ -436,4 +467,6 @@ __heap_init_meta(dbp, meta, pgno, lsnp)
meta->region_size = h->region_size;
meta->nregions = 1;
meta->curregion = 1;
+ meta->blob_threshold = dbp->blob_threshold;
+ SET_BLOB_META_FILE_ID(meta, dbp->blob_file_id, HEAPMETA);
}
diff --git a/src/heap/heap_rec.c b/src/heap/heap_rec.c
index 578a61c4..01803a70 100644
--- a/src/heap/heap_rec.c
+++ b/src/heap/heap_rec.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2010, 2015 Oracle and/or its affiliates. All rights reserved.
*/
#include "db_config.h"
@@ -9,7 +9,6 @@
#include "db_int.h"
#include "dbinc/db_page.h"
#include "dbinc/heap.h"
-#include "dbinc/log.h"
#include "dbinc/mp.h"
/*
@@ -34,7 +33,8 @@ __heap_addrem_recover(env, dbtp, lsnp, op, info)
DB_THREAD_INFO *ip;
PAGE *pagep, *regionp;
db_pgno_t region_pgno;
- int cmp_n, cmp_p, modified, oldspace, ret, space;
+ int cmp_n, cmp_p, modified, ret;
+ u_int32_t oldspace, opcode, space;
ip = ((DB_TXNHEAD *)info)->thread_info;
pagep = NULL;
@@ -44,19 +44,20 @@ __heap_addrem_recover(env, dbtp, lsnp, op, info)
REC_FGET(mpf, ip, argp->pgno, &pagep, done);
modified = 0;
+ opcode = OP_MODE_GET(argp->opcode);
cmp_n = log_compare(lsnp, &LSN(pagep));
cmp_p = log_compare(&LSN(pagep), &argp->pagelsn);
- if ((cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_ADD_HEAP) ||
- (cmp_n == 0 && DB_UNDO(op) && argp->opcode == DB_REM_HEAP)) {
+ if ((cmp_p == 0 && DB_REDO(op) && opcode == DB_ADD_HEAP) ||
+ (cmp_n == 0 && DB_UNDO(op) && opcode == DB_REM_HEAP)) {
/* We are either redo-ing an add or undoing a delete. */
REC_DIRTY(mpf, ip, dbc->priority, &pagep);
if ((ret = __heap_pitem(dbc, pagep,
argp->indx, argp->nbytes, &argp->hdr, &argp->dbt)) != 0)
goto out;
modified = 1;
- } else if ((cmp_n == 0 && DB_UNDO(op) && argp->opcode == DB_ADD_HEAP) ||
- (cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_REM_HEAP)) {
+ } else if ((cmp_n == 0 && DB_UNDO(op) && opcode == DB_ADD_HEAP) ||
+ (cmp_p == 0 && DB_REDO(op) && opcode == DB_REM_HEAP)) {
/* We are either undoing an add or redo-ing a delete. */
REC_DIRTY(mpf, ip, dbc->priority, &pagep);
if ((ret = __heap_ditem(
@@ -76,11 +77,11 @@ __heap_addrem_recover(env, dbtp, lsnp, op, info)
HEAP_CALCSPACEBITS(
file_dbp, HEAP_FREESPACE(file_dbp, pagep), space);
oldspace = HEAP_SPACE(file_dbp, regionp,
- argp->pgno - region_pgno - 1);
+ (argp->pgno - region_pgno) - 1);
if (space != oldspace) {
REC_DIRTY(mpf, ip, dbc->priority, &regionp);
HEAP_SETSPACE(file_dbp,
- regionp, argp->pgno - region_pgno - 1, space);
+ regionp, (argp->pgno - region_pgno) - 1, space);
}
if ((ret = __memp_fput(mpf, ip, regionp, dbc->priority)) != 0)
goto out;
@@ -384,3 +385,200 @@ out: if (pagep != NULL)
(void)__memp_fput(mpf, ip, pagep, dbc->priority);
REC_CLOSE;
}
+
+/*
+ * __heap_addrem_60_recover --
+ * Recovery function for 6.0 addrem records; byte-swaps blob ids if needed.
+ *
+ * PUBLIC: int __heap_addrem_60_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__heap_addrem_60_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __heap_addrem_60_args *argp;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ DB_THREAD_INFO *ip;
+ HEAPBLOBHDR bhdr;
+ HEAPHDR *hhdr;
+ PAGE *pagep, *regionp;
+ db_pgno_t region_pgno;
+ int cmp_n, cmp_p, modified, ret;
+ u_int32_t oldspace, opcode, space;
+ u_int8_t buf[HEAPBLOBREC_SIZE];
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__heap_addrem_60_print);
+ REC_INTRO(__heap_addrem_60_read, ip, 1);
+ region_pgno = HEAP_REGION_PGNO(file_dbp, argp->pgno);
+
+ REC_FGET(mpf, ip, argp->pgno, &pagep, done);
+ modified = 0;
+ opcode = OP_MODE_GET(argp->opcode);
+ cmp_n = log_compare(lsnp, &LSN(pagep));
+ cmp_p = log_compare(&LSN(pagep), &argp->pagelsn);
+
+ if ((cmp_p == 0 && DB_REDO(op) && opcode == DB_ADD_HEAP) ||
+ (cmp_n == 0 && DB_UNDO(op) && opcode == DB_REM_HEAP)) {
+ hhdr = argp->hdr.data;
+ /*
+ * In 6.0 heap blob log records were not correctly byte
+ * swapped, so do the swapping here if the blob file id of the
+ * database does not match the blob file id stored in the
+ * record. Technically byte swapping the blob file id could
+ * produce the same value, but that would only happen in
+ * practice if the environment contained over 4 billion blob
+ * databases. 0 is an invalid blob file id.
+ */
+ if (F_ISSET(hhdr, HEAP_RECBLOB)) {
+ memcpy(buf + sizeof(HEAPHDR),
+ argp->dbt.data, HEAPBLOBREC_DSIZE);
+ memcpy(&bhdr, buf, HEAPBLOBREC_SIZE);
+ if ((db_seq_t)bhdr.file_id != dbc->dbp->blob_file_id) {
+ M_64_SWAP(bhdr.id);
+ M_64_SWAP(bhdr.size);
+ M_64_SWAP(bhdr.file_id);
+ DB_ASSERT(env,
+ (db_seq_t)bhdr.file_id
+ == dbc->dbp->blob_file_id);
+ memcpy(buf, &bhdr, HEAPBLOBREC_SIZE);
+ memcpy(argp->dbt.data,
+ buf + sizeof(HEAPHDR), HEAPBLOBREC_DSIZE);
+ }
+ }
+ /* We are either redo-ing an add or undoing a delete. */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ if ((ret = __heap_pitem(dbc, pagep,
+ argp->indx, argp->nbytes, &argp->hdr, &argp->dbt)) != 0)
+ goto out;
+ modified = 1;
+ } else if ((cmp_n == 0 && DB_UNDO(op) && opcode == DB_ADD_HEAP) ||
+ (cmp_p == 0 && DB_REDO(op) && opcode == DB_REM_HEAP)) {
+ /* We are either undoing an add or redo-ing a delete. */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ if ((ret = __heap_ditem(
+ dbc, pagep, argp->indx, argp->nbytes)) != 0)
+ goto out;
+ modified = 1;
+ }
+
+ if (modified) {
+ REC_FGET(mpf, ip, region_pgno, &regionp, done);
+ if (DB_REDO(op))
+ LSN(pagep) = *lsnp;
+ else
+ LSN(pagep) = argp->pagelsn;
+
+ /* Update this page's space bits on the region page, if changed. */
+ HEAP_CALCSPACEBITS(
+ file_dbp, HEAP_FREESPACE(file_dbp, pagep), space);
+ oldspace = HEAP_SPACE(file_dbp, regionp,
+ (argp->pgno - region_pgno) - 1);
+ if (space != oldspace) {
+ REC_DIRTY(mpf, ip, dbc->priority, &regionp);
+ HEAP_SETSPACE(file_dbp,
+ regionp, (argp->pgno - region_pgno) - 1, space);
+ }
+ if ((ret = __memp_fput(mpf, ip, regionp, dbc->priority)) != 0)
+ goto out;
+ }
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, dbc->priority);
+ REC_CLOSE;
+
+}
+
+/*
+ * __heap_addrem_50_recover --
+ * Recovery function for addrem records read by __heap_addrem_50_read.
+ *
+ * PUBLIC: int __heap_addrem_50_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__heap_addrem_50_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __heap_addrem_50_args *argp;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ DB_THREAD_INFO *ip;
+ PAGE *pagep, *regionp;
+ db_pgno_t region_pgno;
+ int cmp_n, cmp_p, modified, ret;
+ u_int32_t oldspace, space;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__heap_addrem_50_print);
+ REC_INTRO(__heap_addrem_50_read, ip, 1);
+ region_pgno = HEAP_REGION_PGNO(file_dbp, argp->pgno);
+
+ REC_FGET(mpf, ip, argp->pgno, &pagep, done);
+ modified = 0;
+ cmp_n = log_compare(lsnp, &LSN(pagep));
+ cmp_p = log_compare(&LSN(pagep), &argp->pagelsn);
+
+ if ((cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_ADD_HEAP) ||
+ (cmp_n == 0 && DB_UNDO(op) && argp->opcode == DB_REM_HEAP)) {
+ /* We are either redo-ing an add or undoing a delete. */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ if ((ret = __heap_pitem(dbc, pagep,
+ argp->indx, argp->nbytes, &argp->hdr, &argp->dbt)) != 0)
+ goto out;
+ modified = 1;
+ } else if ((cmp_n == 0 && DB_UNDO(op) && argp->opcode == DB_ADD_HEAP) ||
+ (cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_REM_HEAP)) {
+ /* We are either undoing an add or redo-ing a delete. */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ if ((ret = __heap_ditem(
+ dbc, pagep, argp->indx, argp->nbytes)) != 0)
+ goto out;
+ modified = 1;
+ }
+
+ if (modified) {
+ REC_FGET(mpf, ip, region_pgno, &regionp, done);
+ if (DB_REDO(op))
+ LSN(pagep) = *lsnp;
+ else
+ LSN(pagep) = argp->pagelsn;
+
+ /* Update this page's space bits on the region page, if changed. */
+ HEAP_CALCSPACEBITS(
+ file_dbp, HEAP_FREESPACE(file_dbp, pagep), space);
+ oldspace = HEAP_SPACE(file_dbp,
+ regionp, (argp->pgno - region_pgno) - 1);
+ if (space != oldspace) {
+ REC_DIRTY(mpf, ip, dbc->priority, &regionp);
+ HEAP_SETSPACE(file_dbp,
+ regionp, (argp->pgno - region_pgno) - 1, space);
+ }
+ if ((ret = __memp_fput(mpf, ip, regionp, dbc->priority)) != 0)
+ goto out;
+ }
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, dbc->priority);
+ REC_CLOSE;
+}
diff --git a/src/heap/heap_reclaim.c b/src/heap/heap_reclaim.c
index 8cedb223..463e40c0 100644
--- a/src/heap/heap_reclaim.c
+++ b/src/heap/heap_reclaim.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1998, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -50,7 +50,7 @@ __heap_truncate(dbc, countp)
return (ret);
if ((ret = __memp_fget(mpf, &pgno,
dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &meta)) != 0) {
- __TLPUT(dbc, lock);
+ (void)__TLPUT(dbc, lock);
goto err;
}
diff --git a/src/heap/heap_stat.c b/src/heap/heap_stat.c
index 9f4361a7..13bd36a2 100644
--- a/src/heap/heap_stat.c
+++ b/src/heap/heap_stat.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2010, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -29,7 +29,7 @@ __heap_stat(dbc, spp, flags)
{
DB *dbp;
DB_HEAP_STAT *sp;
- DB_LOCK lock, metalock;
+ DB_LOCK metalock;
DB_MPOOLFILE *mpf;
ENV *env;
HEAPMETA *meta;
@@ -41,7 +41,6 @@ __heap_stat(dbc, spp, flags)
meta = NULL;
LOCK_INIT(metalock);
- LOCK_INIT(lock);
mpf = dbp->mpf;
sp = NULL;
ret = t_ret = write_meta = 0;
@@ -147,6 +146,8 @@ __heap_stat_print(dbc, flags)
"Underlying database page size", (u_long)sp->heap_pagesize);
__db_dl(env,
"Number of records in the database", (u_long)sp->heap_nrecs);
+ __db_dl(env,
+ "Number of blobs in the database", (u_long)sp->heap_nblobs);
__db_dl(env, "Number of database pages", (u_long)sp->heap_pagecnt);
__db_dl(env, "Number of database regions", (u_long)sp->heap_nregions);
__db_dl(env,
@@ -200,11 +201,13 @@ __heap_stat_callback(dbc, h, cookie, putp)
* We can't just use NUM_ENT, otherwise we'd mis-count split
* records.
*/
- for (i = 0; i < NUM_ENT(h); i++) {
+ for (i = 0; i <= HEAP_HIGHINDX(h); i++) {
hdr = (HEAPHDR *)P_ENTRY(dbp, h, i);
if (!F_ISSET(hdr, HEAP_RECSPLIT) ||
F_ISSET(hdr, HEAP_RECFIRST))
sp->heap_nrecs++;
+ if (F_ISSET(hdr, HEAP_RECBLOB))
+ sp->heap_nblobs++;
}
break;
case P_HEAPMETA: /* Fallthrough */
diff --git a/src/heap/heap_stub.c b/src/heap/heap_stub.c
index b4feb2f3..3093abc2 100644
--- a/src/heap/heap_stub.c
+++ b/src/heap/heap_stub.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2010, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id:
*/
@@ -35,6 +35,40 @@ __db_no_heap_am(env)
}
int
+__heap_60_heapmeta(dbp, real_name, flags, fhp, h, dirtyp)
+ DB *dbp;
+ char *real_name;
+ u_int32_t flags;
+ DB_FH *fhp;
+ PAGE *h;
+ int *dirtyp;
+{
+ COMPQUIET(real_name, NULL); /* Only dbp is used by this stub. */
+ COMPQUIET(flags, 0);
+ COMPQUIET(fhp, NULL);
+ COMPQUIET(h, NULL);
+ COMPQUIET(dirtyp, NULL);
+ return (__db_no_heap_am(dbp->env));
+}
+
+int
+__heap_60_heap(dbp, real_name, flags, fhp, h, dirtyp)
+ DB *dbp;
+ char *real_name;
+ u_int32_t flags;
+ DB_FH *fhp;
+ PAGE *h;
+ int *dirtyp;
+{
+ COMPQUIET(real_name, NULL); /* Only dbp is used by this stub. */
+ COMPQUIET(flags, 0);
+ COMPQUIET(fhp, NULL);
+ COMPQUIET(h, NULL);
+ COMPQUIET(dirtyp, NULL);
+ return (__db_no_heap_am(dbp->env));
+}
+
+int
__heap_db_create(dbp)
DB *dbp;
{
diff --git a/src/heap/heap_upgrade.c b/src/heap/heap_upgrade.c
new file mode 100644
index 00000000..35fa78b9
--- /dev/null
+++ b/src/heap/heap_upgrade.c
@@ -0,0 +1,106 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/blob.h"
+#include "dbinc/db_page.h"
+#include "dbinc/heap.h"
+#include "dbinc/db_upgrade.h"
+
+/*
+ * __heap_60_heapmeta --
+ * Set the heap metadata page version to 2 and mark the page dirty.
+ *
+ * PUBLIC: int __heap_60_heapmeta
+ * PUBLIC: __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
+ */
+int
+__heap_60_heapmeta(dbp, real_name, flags, fhp, h, dirtyp)
+ DB *dbp;
+ char *real_name;
+ u_int32_t flags;
+ DB_FH *fhp;
+ PAGE *h;
+ int *dirtyp;
+{
+ HEAPMETA *hmeta;
+
+ COMPQUIET(flags, 0);
+ COMPQUIET(real_name, NULL);
+ COMPQUIET(fhp, NULL);
+ COMPQUIET(dbp, NULL);
+ hmeta = (HEAPMETA *)h;
+
+ hmeta->dbmeta.version = 2;
+ *dirtyp = 1;
+
+ return (0);
+}
+
+/*
+ * __heap_60_heap --
+ * Rewrite each HEAPBLOBHDR60 blob record on a heap page as HEAPBLOBHDR60P1.
+ *
+ * PUBLIC: int __heap_60_heap
+ * PUBLIC: __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
+ */
+int
+__heap_60_heap(dbp, real_name, flags, fhp, h, dirtyp)
+ DB *dbp;
+ char *real_name;
+ u_int32_t flags;
+ DB_FH *fhp;
+ PAGE *h;
+ int *dirtyp;
+{
+ HEAPBLOBHDR60 hb60;
+ HEAPBLOBHDR60P1 hb60p1;
+ HEAPHDR *hdr;
+ db_seq_t blob_id, blob_size, file_id;
+ db_indx_t indx, *offtbl;
+ int ret;
+
+ COMPQUIET(flags, 0);
+ COMPQUIET(real_name, NULL);
+ COMPQUIET(fhp, NULL);
+ offtbl = (db_indx_t *)HEAP_OFFSETTBL(dbp, h);
+ ret = 0;
+
+ DB_ASSERT(dbp->env, HEAPBLOBREC60_SIZE == HEAPBLOBREC_SIZE);
+ for (indx = 0; indx <= HEAP_HIGHINDX(h); indx++) {
+ if (offtbl[indx] == 0)
+ continue;
+ hdr = (HEAPHDR *)P_ENTRY(dbp, h, indx);
+ if (F_ISSET(hdr, HEAP_RECBLOB)) {
+ memcpy(&hb60, hdr, HEAPBLOBREC60_SIZE);
+ memset(&hb60p1, 0, HEAPBLOBREC_SIZE);
+ hb60p1.std_hdr.flags = hb60.flags;
+ hb60p1.std_hdr.size = hb60.size;
+ hb60p1.encoding = hb60.encoding;
+ hb60p1.lsn = hb60.lsn;
+ GET_BLOB60_ID(dbp->env, hb60, blob_id, ret);
+ if (ret != 0)
+ return (ret);
+ GET_BLOB60_SIZE(dbp->env, hb60, blob_size, ret);
+ if (ret != 0)
+ return (ret);
+ GET_BLOB60_FILE_ID(dbp->env, &hb60, file_id, ret);
+ if (ret != 0)
+ return (ret);
+ SET_BLOB_ID(&hb60p1, blob_id, HEAPBLOBHDR60P1);
+ SET_BLOB_SIZE(&hb60p1, blob_size, HEAPBLOBHDR60P1);
+ SET_BLOB_FILE_ID(&hb60p1, file_id, HEAPBLOBHDR60P1);
+ memcpy(hdr, &hb60p1, HEAPBLOBREC_SIZE);
+ *dirtyp = 1;
+ }
+ }
+
+ return (ret);
+}
diff --git a/src/heap/heap_verify.c b/src/heap/heap_verify.c
index ea15c28b..7c90caf0 100644
--- a/src/heap/heap_verify.c
+++ b/src/heap/heap_verify.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2010, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -9,13 +9,14 @@
#include "db_config.h"
#include "db_int.h"
+#include "dbinc/blob.h"
#include "dbinc/db_page.h"
#include "dbinc/db_verify.h"
#include "dbinc/heap.h"
#include "dbinc/lock.h"
#include "dbinc/mp.h"
-static int __heap_safe_gsplit __P((DB *, VRFY_DBINFO *, PAGE *, db_indx_t,
+static int __heap_safe_gsplit __P((DB *, VRFY_DBINFO *, PAGE *, unsigned,
DBT *));
static int __heap_verify_offset_cmp __P((const void *, const void *));
@@ -37,7 +38,8 @@ __heap_vrfy_meta(dbp, vdp, meta, pgno, flags)
HEAP *h;
VRFY_PAGEINFO *pip;
db_pgno_t last_pgno, max_pgno, npgs;
- int isbad, ret;
+ int isbad, ret, t_ret;
+ db_seq_t blob_id;
if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
return (ret);
@@ -97,8 +99,40 @@ __heap_vrfy_meta(dbp, vdp, meta, pgno, flags)
"%lu"), (u_long)pgno));
isbad = 1;
}
+ h->gbytes = meta->gbytes;
+ h->bytes = meta->bytes;
}
+/*
+ * Where 64-bit integer support is not available,
+ * return an error if the file has any blobs.
+ */
+ t_ret = 0;
+#ifdef HAVE_64BIT_TYPES
+ GET_BLOB_FILE_ID(dbp->env, meta, blob_id, t_ret);
+ if (t_ret != 0) {
+ isbad = 1;
+ EPRINT((dbp->env, DB_STR_A("1173",
+ "Page %lu: blob file id overflow.", "%lu"), (u_long)pgno));
+ if (ret == 0)
+ ret = t_ret;
+ }
+#else /* HAVE_64BIT_TYPES */
+ /*
+ * db_seq_t is an int on systems that do not have 64-bit integer types,
+ * so this will compile and run.
+ */
+ GET_BLOB_FILE_ID(env, meta, blob_id, t_ret);
+ if (t_ret != 0 || blob_id != 0) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1206",
+ "Page %lu: blobs require 64 integer compiler support.",
+ "%lu"), (u_long)pgno));
+ if (ret == 0)
+ ret = t_ret;
+ }
+#endif
+
err: if (LF_ISSET(DB_SALVAGE))
ret = __db_salvage_markdone(vdp, pgno);
@@ -120,12 +154,16 @@ __heap_vrfy(dbp, vdp, h, pgno, flags)
db_pgno_t pgno;
u_int32_t flags;
{
+ HEAPBLOBHDR bhdr;
HEAPHDR *hdr;
- int cnt, i, j, ret;
+ int i, j, ret;
+ off_t blob_size;
+ db_seq_t blob_id, file_id;
db_indx_t *offsets, *offtbl, end;
+ u_int32_t cnt;
if ((ret = __db_vrfy_datapage(dbp, vdp, h, pgno, flags)) != 0)
- goto err;
+ return (ret);
if (TYPE(h) == P_IHEAP)
/* Nothing to verify on a region page. */
@@ -140,7 +178,7 @@ __heap_vrfy(dbp, vdp, h, pgno, flags)
/*
* Build a sorted list of all the offsets in the table. Entries in the
* offset table are not always sorted. While we're here, check that
- * flags are sane.
+ * flags are sane, and that the blob entries are sane.
*/
cnt = 0;
for (i = 0; i <= HEAP_HIGHINDX(h); i++) {
@@ -164,6 +202,36 @@ __heap_vrfy(dbp, vdp, h, pgno, flags)
ret = DB_VERIFY_BAD;
goto err;
}
+ if (F_ISSET(hdr, HEAP_RECBLOB)) {
+ /*
+ * Check that the blob file exists and is the same
+ * file size as is stored in the database record.
+ */
+ memcpy(&bhdr, hdr, sizeof(HEAPBLOBHDR));
+ blob_id = (db_seq_t)bhdr.id;
+ GET_BLOB_SIZE(dbp->env, bhdr, blob_size, ret);
+ if (ret != 0 || blob_size < 0) {
+ EPRINT((dbp->env, DB_STR_A("1175",
+ "Page %lu: blob file size value has overflowed",
+ "%lu"), (u_long)pgno));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+ file_id = (db_seq_t)bhdr.file_id;
+ if (file_id == 0) {
+ EPRINT((dbp->env, DB_STR_A("1177",
+ "Page %lu: invalid blob dir id %llu at item %lu",
+ "%lu %llu, %lu"), (u_long)pgno,
+ (unsigned long long)file_id, (u_long)i));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+ if ((ret = __blob_vrfy(dbp->env, blob_id,
+ blob_size, file_id, 0, pgno, flags)) != 0) {
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+ }
offsets[cnt] = offtbl[i];
cnt++;
@@ -180,7 +248,7 @@ __heap_vrfy(dbp, vdp, h, pgno, flags)
* record. We can't use the P_ENTRY macro because we've kept track of
* the offsets, not the indexes.
*/
- for (i = 0; i < cnt - 1; i++) {
+ for (i = 0; i < (int)cnt - 1; i++) {
hdr = (HEAPHDR *)((u_int8_t *)h + offsets[i]);
end = offsets[i] + HEAP_HDRSIZE(hdr) + hdr->size;
if (end > offsets[i+1]) {
@@ -328,12 +396,22 @@ __heap_salvage(dbp, vdp, pgno, h, handle, callback, flags)
u_int32_t flags;
{
DBT dbt;
+ ENV *env;
HEAPHDR *hdr;
+ HEAPBLOBHDR bhdr;
db_indx_t i, *offtbl;
+ char *prefix;
int err_ret, ret, t_ret;
+ off_t blob_size, blob_offset, remaining;
+ u_int32_t blob_buf_size;
+ u_int8_t *blob_buf;
+ db_seq_t blob_id, file_id;
COMPQUIET(flags, 0);
memset(&dbt, 0, sizeof(DBT));
+ blob_buf = NULL;
+ blob_buf_size = 0;
+ env = dbp->env;
offtbl = (db_indx_t *)HEAP_OFFSETTBL(dbp, h);
err_ret = ret = t_ret = 0;
@@ -357,9 +435,74 @@ __heap_salvage(dbp, vdp, pgno, h, handle, callback, flags)
if (dbt.size > dbp->pgsize * 4)
dbt.size = dbp->pgsize * 4;
if ((ret =
- __os_malloc(dbp->env, dbt.size, &dbt.data)) != 0)
+ __os_malloc(env, dbt.size, &dbt.data)) != 0)
goto err;
- __heap_safe_gsplit(dbp, vdp, h, i, &dbt);
+ if ((ret = __heap_safe_gsplit
+ (dbp, vdp, h, i, &dbt)) != 0) {
+ err_ret = ret;
+ __os_free(env, dbt.data);
+ continue;
+ }
+ } else if (F_ISSET(hdr, HEAP_RECBLOB)) {
+ memcpy(&bhdr, hdr, HEAPBLOBREC_SIZE);
+ blob_id = (db_seq_t)bhdr.id;
+ GET_BLOB_SIZE(env, bhdr, blob_size, ret);
+ if (ret != 0 || blob_size < 0)
+ goto err;
+ file_id = (db_seq_t)bhdr.file_id;
+ /* Read the blob, in pieces if it is too large. */
+ blob_offset = 0;
+ if (blob_size > MEGABYTE) {
+ if (blob_buf_size < MEGABYTE) {
+ if ((ret = __os_realloc(
+ env, MEGABYTE, &blob_buf)) != 0)
+ goto err;
+ blob_buf_size = MEGABYTE;
+ }
+ } else if (blob_buf_size < blob_size) {
+ blob_buf_size = (u_int32_t)blob_size;
+ if ((ret = __os_realloc(
+ env, blob_buf_size, &blob_buf)) != 0)
+ goto err;
+ }
+ dbt.data = blob_buf;
+ dbt.ulen = blob_buf_size;
+ remaining = blob_size;
+ prefix = " ";
+ do {
+ if ((ret = __blob_salvage(env, blob_id,
+ blob_offset,
+ ((remaining < blob_buf_size) ?
+ (size_t)remaining : blob_buf_size),
+ file_id, 0, &dbt)) != 0) {
+ if (LF_ISSET(DB_AGGRESSIVE)) {
+ ret = DB_VERIFY_BAD;
+ break;
+ }
+ F_CLR(vdp, SALVAGE_STREAM_BLOB);
+ goto err;
+ }
+ if (remaining > blob_buf_size)
+ F_SET(vdp, SALVAGE_STREAM_BLOB);
+ else
+ F_CLR(vdp, SALVAGE_STREAM_BLOB);
+ if ((t_ret = __db_vrfy_prdbt(
+ &dbt, 0, prefix, handle,
+ callback, 0, 0, vdp)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ F_CLR(vdp, SALVAGE_STREAM_BLOB);
+ goto err;
+ }
+ prefix = NULL;
+ blob_offset += dbt.size;
+ if (remaining < blob_buf_size)
+ remaining = 0;
+ else
+ remaining -= blob_buf_size;
+ } while (remaining > 0);
+ F_CLR(vdp, SALVAGE_STREAM_BLOB);
+ continue;
} else {
dbt.data = (u_int8_t *)hdr + HEAP_HDRSIZE(hdr);
dbt.size = hdr->size;
@@ -369,11 +512,13 @@ __heap_salvage(dbp, vdp, pgno, h, handle, callback, flags)
0, " ", handle, callback, 0, 0, vdp)) != 0)
err_ret = ret;
if (F_ISSET(hdr, HEAP_RECSPLIT))
- __os_free(dbp->env, dbt.data);
+ __os_free(env, dbt.data);
}
err: if ((t_ret = __db_salvage_markdone(vdp, pgno)) != 0)
return (t_ret);
+ if (blob_buf != NULL)
+ __os_free(env, blob_buf);
return ((ret == 0 && err_ret != 0) ? err_ret : ret);
}
@@ -386,7 +531,7 @@ __heap_safe_gsplit(dbp, vdp, h, i, dbt)
DB *dbp;
VRFY_DBINFO *vdp;
PAGE *h;
- db_indx_t i;
+ unsigned i;
DBT *dbt;
{
DB_MPOOLFILE *mpf;
@@ -433,7 +578,7 @@ __heap_safe_gsplit(dbp, vdp, h, i, dbt)
err: if (gotpg && (t_ret = __memp_fput(
mpf, vdp->thread_info, h, DB_PRIORITY_UNCHANGED)) != 0 && ret == 0)
- t_ret = ret;
+ ret = t_ret;
return (ret);
}
diff --git a/src/hmac/hmac.c b/src/hmac/hmac.c
index 4febfc60..acaca6bc 100644
--- a/src/hmac/hmac.c
+++ b/src/hmac/hmac.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved.
*
* Some parts of this code originally written by Adam Stubblefield,
* -- astubble@rice.edu.
diff --git a/src/lock/Design b/src/lock/Design
index f82bc7e8..2a1d1145 100644
--- a/src/lock/Design
+++ b/src/lock/Design
@@ -298,4 +298,4 @@ A: We currently do not support any automatic configuration for FINE_GRAIN
locking. When we do, will need to document that atomicity discussion
listed above (it is bug-report #553).
-Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved.
+Copyright (c) 2010, 2015 Oracle and/or its affiliates. All rights reserved.
diff --git a/src/lock/lock.c b/src/lock/lock.c
index e4627734..bcebbe44 100644
--- a/src/lock/lock.c
+++ b/src/lock/lock.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -31,8 +31,8 @@ static int __lock_trade __P((ENV *, DB_LOCK *, DB_LOCKER *));
static int __lock_vec_api __P((ENV *,
u_int32_t, u_int32_t, DB_LOCKREQ *, int, DB_LOCKREQ **));
-static const char __db_lock_invalid[] = "%s: Lock is no longer valid";
-static const char __db_locker_invalid[] = "Locker is not valid";
+#define LOCK_INVALID_ERR DB_STR_A("2056", "%s: Lock is no longer valid", "%s")
+#define LOCKER_INVALID_ERR DB_STR("2057", "Locker is not valid")
#ifdef DEBUG
extern void __db_loadme (void);
@@ -111,7 +111,8 @@ __lock_vec(env, sh_locker, flags, list, nlist, elistp)
DB_LOCKREQ *list, **elistp;
{
struct __db_lock *lp, *next_lock;
- DB_LOCK lock; DB_LOCKOBJ *sh_obj;
+ DB_LOCK lock;
+ DB_LOCKOBJ *sh_obj;
DB_LOCKREGION *region;
DB_LOCKTAB *lt;
DBT *objlist, *np;
@@ -200,12 +201,18 @@ __lock_vec(env, sh_locker, flags, list, nlist, elistp)
if (writes == 1 ||
lp->mode == DB_LOCK_READ ||
lp->mode == DB_LOCK_READ_UNCOMMITTED) {
- SH_LIST_REMOVE(lp,
- locker_links, __db_lock);
+ /*
+ * It is safe to look at lp before
+ * locking because any threads sharing
+ * this locker must not be in the API
+ * at the same time.
+ */
sh_obj = SH_OFF_TO_PTR(lp,
lp->obj, DB_LOCKOBJ);
ndx = sh_obj->indx;
OBJECT_LOCK_NDX(lt, region, ndx);
+ SH_LIST_REMOVE(lp,
+ locker_links, __db_lock);
/*
* We are not letting lock_put_internal
* unlink the lock, so we'll have to
@@ -423,7 +430,7 @@ __lock_get_api(env, locker, flags, obj, lock_mode, lock)
region = env->lk_handle->reginfo.primary;
LOCK_LOCKERS(env, region);
- ret = __lock_getlocker_int(env->lk_handle, locker, 0, &sh_locker);
+ ret = __lock_getlocker_int(env->lk_handle, locker, 0, NULL, &sh_locker);
UNLOCK_LOCKERS(env, region);
LOCK_SYSTEM_LOCK(env->lk_handle, region);
if (ret == 0)
@@ -979,12 +986,21 @@ in_abort: newl->status = DB_LSTAT_WAITING;
goto err;
}
+ /*
+ * Sleep until someone releases a lock which might let us in.
+ * Since we want to set the thread state back to ACTIVE, don't
+ * use the normal MUTEX_LOCK() macro, which would immediately
+ * return a panic error code. Instead, return the panic after
+ * restoring the thread state.
+ */
PERFMON2(env, lock, suspend, (DBT *) obj, lock_mode);
- MUTEX_LOCK(env, newl->mtx_lock);
+ ret = __mutex_lock(env, newl->mtx_lock);
PERFMON2(env, lock, resume, (DBT *) obj, lock_mode);
if (ip != NULL)
ip->dbth_state = THREAD_ACTIVE;
+ if (ret != 0)
+ return (ret);
LOCK_SYSTEM_LOCK(lt, region);
OBJECT_LOCK_NDX(lt, region, ndx);
@@ -1165,7 +1181,7 @@ __lock_put_nolock(env, lock, runp, flags)
lockp = R_ADDR(&lt->reginfo, lock->off);
DB_ASSERT(env, lock->gen == lockp->gen);
if (lock->gen != lockp->gen) {
- __db_errx(env, __db_lock_invalid, "DB_LOCK->lock_put");
+ __db_errx(env, LOCK_INVALID_ERR, "DB_LOCK->lock_put");
LOCK_INIT(*lock);
return (EINVAL);
}
@@ -1224,7 +1240,7 @@ __lock_downgrade(env, lock, new_mode, flags)
lockp = R_ADDR(&lt->reginfo, lock->off);
if (lock->gen != lockp->gen) {
- __db_errx(env, __db_lock_invalid, "lock_downgrade");
+ __db_errx(env, LOCK_INVALID_ERR, "lock_downgrade");
ret = EINVAL;
goto out;
}
@@ -1662,7 +1678,7 @@ __lock_inherit_locks(lt, sh_locker, flags)
* locks, so inheritance is easy!
*/
if (sh_locker == NULL) {
- __db_errx(env, __db_locker_invalid);
+ __db_errx(env, LOCKER_INVALID_ERR);
return (EINVAL);
}
@@ -1683,11 +1699,15 @@ __lock_inherit_locks(lt, sh_locker, flags)
for (lp = SH_LIST_FIRST(&sh_locker->heldby, __db_lock);
lp != NULL;
lp = SH_LIST_FIRST(&sh_locker->heldby, __db_lock)) {
- SH_LIST_REMOVE(lp, locker_links, __db_lock);
-
- /* See if the parent already has a lock. */
+ /*
+ * See if the parent already has a lock. It is safe to look at
+ * lp before locking it because any threads sharing this locker
+ * must not be in the API at the same time.
+ */
obj = SH_OFF_TO_PTR(lp, lp->obj, DB_LOCKOBJ);
OBJECT_LOCK_NDX(lt, region, obj->indx);
+ SH_LIST_REMOVE(lp, locker_links, __db_lock);
+
SH_TAILQ_FOREACH(hlp, &obj->holders, links, __db_lock)
if (hlp->holder == poff && lp->mode == hlp->mode)
break;
@@ -1917,7 +1937,7 @@ __lock_trade(env, lock, new_locker)
/* If the lock is already released, simply return. */
if (lp->gen != lock->gen)
- return (DB_NOTFOUND);
+ return (USR_ERR(env, DB_NOTFOUND));
if (new_locker == NULL) {
__db_errx(env, DB_STR("2040", "Locker does not exist"));
diff --git a/src/lock/lock_alloc.incl b/src/lock/lock_alloc.incl
index edea07d2..e10cbcbf 100644
--- a/src/lock/lock_alloc.incl
+++ b/src/lock/lock_alloc.incl
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2011, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/lock/lock_deadlock.c b/src/lock/lock_deadlock.c
index 3c00d7f1..79086687 100644
--- a/src/lock/lock_deadlock.c
+++ b/src/lock/lock_deadlock.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -683,38 +683,45 @@ again: memset(bitmap, 0, count * sizeof(u_int32_t) * nentries);
/*
* Now for each locker, record its last lock and set abort status.
* We need to look at the heldby list carefully. We have the LOCKERS
- * locked so they cannot go away. The lock at the head of the
- * list can be removed by locking the object it points at.
- * Since lock memory is not freed if we get a lock we can look
- * at it safely but SH_LIST_FIRST is not atomic, so we check that
- * the list has not gone empty during that macro. We check abort
- * status after building the bit maps so that we will not detect
- * a blocked transaction without noting that it is already aborting.
+ * locked so they cannot go away. The LOCK_SYSTEM_LOCK keeps things
+ * steady when the lock table is not partitioned. However, if there are
+ * multiple lock partitions then the head of the heldby list can be
+ * changed by another thread locking the object it points at. That
+ * thread will have OBJECT_LOCK()'d that lock's partition. We need to
+ * look at the lock entry in order to determine which partition to
+ * mutex_lock. Since lock structs are never really freed, once we get
+ * the pointer we can look at it safely. However SH_LIST_FIRST is not
+ * atomic, so we first fetch the pointer and then check that the list
+ * was not empty during the fetch. This lets us at least mutex_lock the
+ * partition of the lock. Afterwards, we retry if the lock is no longer
+ * the first for that locker -- it might have changed to something ELSE
+ * since then. We check abort status after building the bit maps so that
+ * we will not pick a blocked transaction without noting that it is
+ * already aborting.
*/
for (id = 0; id < count; id++) {
if (!id_array[id].valid)
continue;
- if ((ret = __lock_getlocker_int(lt,
- id_array[id].id, 0, &lockerp)) != 0 || lockerp == NULL)
+ if ((ret = __lock_getlocker_int(lt, id_array[id].id,
+ 0, NULL, &lockerp)) != 0 || lockerp == NULL)
continue;
/*
- * If this is a master transaction, try to
- * find one of its children's locks first,
- * as they are probably more recent.
+ * If this is a master transaction, try to find one of its
+ * children's locks first, as they are probably more recent.
*/
child = SH_LIST_FIRST(&lockerp->child_locker, __db_locker);
if (child != NULL) {
do {
-c_retry: lp = SH_LIST_FIRST(&child->heldby, __db_lock);
- if (SH_LIST_EMPTY(&child->heldby) || lp == NULL)
+c_retry: lp = SH_LIST_FIRSTP(&child->heldby, __db_lock);
+ if (__SH_LIST_WAS_EMPTY(&child->heldby, lp))
goto c_next;
if (F_ISSET(child, DB_LOCKER_INABORT))
id_array[id].in_abort = 1;
ndx = lp->indx;
OBJECT_LOCK_NDX(lt, region, ndx);
- if (lp != SH_LIST_FIRST(
+ if (lp != SH_LIST_FIRSTP(
&child->heldby, __db_lock) ||
ndx != lp->indx) {
OBJECT_UNLOCK(lt, region, ndx);
@@ -733,11 +740,11 @@ c_next: child = SH_LIST_NEXT(
} while (child != NULL);
}
-l_retry: lp = SH_LIST_FIRST(&lockerp->heldby, __db_lock);
- if (!SH_LIST_EMPTY(&lockerp->heldby) && lp != NULL) {
+l_retry: lp = SH_LIST_FIRSTP(&lockerp->heldby, __db_lock);
+ if (!__SH_LIST_WAS_EMPTY(&lockerp->heldby, lp)) {
ndx = lp->indx;
OBJECT_LOCK_NDX(lt, region, ndx);
- if (lp != SH_LIST_FIRST(&lockerp->heldby, __db_lock) ||
+ if (lp != SH_LIST_FIRSTP(&lockerp->heldby, __db_lock) ||
lp->indx != ndx) {
OBJECT_UNLOCK(lt, region, ndx);
goto l_retry;
@@ -869,7 +876,7 @@ __dd_abort(env, info, statusp)
* detecting, return that.
*/
if ((ret = __lock_getlocker_int(lt,
- info->last_locker_id, 0, &lockerp)) != 0)
+ info->last_locker_id, 0, NULL, &lockerp)) != 0)
goto err;
if (lockerp == NULL || F_ISSET(lockerp, DB_LOCKER_INABORT)) {
*statusp = DB_ALREADY_ABORTED;
diff --git a/src/lock/lock_failchk.c b/src/lock/lock_failchk.c
index 59fb010f..84f757bf 100644
--- a/src/lock/lock_failchk.c
+++ b/src/lock/lock_failchk.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -15,7 +15,7 @@
/*
* __lock_failchk --
* Check for locks held by dead threads of control and release
- * read locks. If any write locks were held by dead non-trasnactional
+ * read locks. If any write locks were held by dead non-transactional
* lockers then we must abort and run recovery. Otherwise we release
* read locks for lockers owned by dead threads. Write locks for
* dead transactional lockers will be freed when we abort the transaction.
@@ -98,9 +98,8 @@ retry: LOCK_LOCKERS(env, lrp);
/*
* This locker is most likely referenced by a cursor
* which is owned by a dead thread. Normally the
- * cursor would be available for other threads
- * but we assume the dead thread will never release
- * it.
+ * cursor would be available for other threads but we
+ * assume the dead thread will never release it.
*/
if (lip->id < TXN_MINIMUM &&
(ret = __lock_freelocker(lt, lip)) != 0)
diff --git a/src/lock/lock_id.c b/src/lock/lock_id.c
index 24b545d1..e0dbaa01 100644
--- a/src/lock/lock_id.c
+++ b/src/lock/lock_id.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -17,7 +17,7 @@ static int __lock_freelocker_int
/*
* __lock_id_pp --
- * ENV->lock_id pre/post processing.
+ * DB_ENV->lock_id pre/post processing.
*
* PUBLIC: int __lock_id_pp __P((DB_ENV *, u_int32_t *));
*/
@@ -43,7 +43,11 @@ __lock_id_pp(dbenv, idp)
/*
* __lock_id --
- * ENV->lock_id.
+ * Allocate a new lock id as well as a locker struct to hold it. If we wrap
+ * around then we find the minimum currently in use and make sure we can
+ * stay below that. This is similar to __txn_begin_int's code to recover
+ * txn ids.
+ *
*
* PUBLIC: int __lock_id __P((ENV *, u_int32_t *, DB_LOCKER **));
*/
@@ -59,22 +63,15 @@ __lock_id(env, idp, lkp)
u_int32_t id, *ids;
int nids, ret;
- lk = NULL;
lt = env->lk_handle;
region = lt->reginfo.primary;
id = DB_LOCK_INVALIDID;
- ret = 0;
-
- id = DB_LOCK_INVALIDID;
lk = NULL;
+ ret = 0;
LOCK_LOCKERS(env, region);
/*
- * Allocate a new lock id. If we wrap around then we find the minimum
- * currently in use and make sure we can stay below that. This code is
- * similar to code in __txn_begin_int for recovering txn ids.
- *
* Our current valid range can span the maximum valid value, so check
* for it and wrap manually.
*/
@@ -98,7 +95,7 @@ __lock_id(env, idp, lkp)
id = ++region->lock_id;
/* Allocate a locker for this id. */
- ret = __lock_getlocker_int(lt, id, 1, &lk);
+ ret = __lock_getlocker_int(lt, id, 1, NULL, &lk);
err: UNLOCK_LOCKERS(env, region);
@@ -165,7 +162,8 @@ __lock_id_free_pp(dbenv, id)
LOCK_LOCKERS(env, region);
if ((ret =
- __lock_getlocker_int(env->lk_handle, id, 0, &sh_locker)) == 0) {
+ __lock_getlocker_int(env->lk_handle,
+ id, 0, NULL, &sh_locker)) == 0) {
if (sh_locker != NULL)
ret = __lock_freelocker_int(lt, region, sh_locker, 1);
else {
@@ -194,8 +192,10 @@ __lock_id_free(env, sh_locker)
ENV *env;
DB_LOCKER *sh_locker;
{
+ DB_LOCKER locker;
DB_LOCKREGION *region;
DB_LOCKTAB *lt;
+ DB_MSGBUF mb;
int ret;
lt = env->lk_handle;
@@ -203,9 +203,14 @@ __lock_id_free(env, sh_locker)
ret = 0;
if (sh_locker->nlocks != 0) {
- __db_errx(env, DB_STR("2046",
- "Locker still has locks"));
- ret = EINVAL;
+ locker = *sh_locker;
+ ret = USR_ERR(env, EINVAL);
+ __db_errx(env, DB_STR_A("2046",
+ "Locker %d still has %d locks", "%d %d"),
+ locker.id, locker.nlocks );
+ DB_MSGBUF_INIT(&mb);
+ (void)__lock_dump_locker(env, &mb, lt, sh_locker);
+ DB_MSGBUF_FLUSH(env, &mb);
goto err;
}
@@ -243,17 +248,19 @@ __lock_id_set(env, cur_id, max_id)
}
/*
- * __lock_getlocker --
- * Get a locker in the locker hash table. The create parameter
- * indicates if the locker should be created if it doesn't exist in
- * the table.
+ * __lock_getlocker,__lock_getlocker_int --
+ * Get a locker in the locker hash table. The create parameter indicates
+ * whether the locker should be created if it doesn't exist in the table. If
+ * there's a matching locker cached in the thread info, use that without
+ * locking.
*
- * This must be called with the locker mutex lock if create == 1.
+ * The internal version does not check the thread info cache; it must be called
+ * with the locker mutex locked.
*
* PUBLIC: int __lock_getlocker __P((DB_LOCKTAB *,
* PUBLIC: u_int32_t, int, DB_LOCKER **));
* PUBLIC: int __lock_getlocker_int __P((DB_LOCKTAB *,
- * PUBLIC: u_int32_t, int, DB_LOCKER **));
+ * PUBLIC: u_int32_t, int, DB_THREAD_INFO *, DB_LOCKER **));
*/
int
__lock_getlocker(lt, locker, create, retp)
@@ -263,32 +270,47 @@ __lock_getlocker(lt, locker, create, retp)
DB_LOCKER **retp;
{
DB_LOCKREGION *region;
+ DB_THREAD_INFO *ip;
ENV *env;
int ret;
COMPQUIET(region, NULL);
env = lt->env;
region = lt->reginfo.primary;
-
+ ENV_GET_THREAD_INFO(env, ip);
+
+ /* Check to see if the locker is already in the thread info */
+ if (ip != NULL && ip->dbth_local_locker != INVALID_ROFF) {
+ *retp = (DB_LOCKER *)
+ R_ADDR(&lt->reginfo, ip->dbth_local_locker);
+ if ((*retp)->id == locker) {
+ DB_ASSERT(env, !F_ISSET(*retp, DB_LOCKER_FREE));
+#ifdef HAVE_STATISTICS
+ region->stat.st_nlockers_hit++;
+#endif
+ return (0);
+ }
+ }
LOCK_LOCKERS(env, region);
- ret = __lock_getlocker_int(lt, locker, create, retp);
+ ret = __lock_getlocker_int(lt, locker, create, ip, retp);
UNLOCK_LOCKERS(env, region);
-
return (ret);
}
int
-__lock_getlocker_int(lt, locker, create, retp)
+__lock_getlocker_int(lt, locker, create, ip, retp)
DB_LOCKTAB *lt;
u_int32_t locker;
int create;
+ DB_THREAD_INFO *ip;
DB_LOCKER **retp;
{
DB_LOCKER *sh_locker;
DB_LOCKREGION *region;
- DB_THREAD_INFO *ip;
+#ifdef DIAGNOSTIC
+ DB_THREAD_INFO *diag;
+#endif
ENV *env;
- db_mutex_t mutex;
u_int32_t i, indx, nlockers;
int ret;
@@ -304,59 +326,85 @@ __lock_getlocker_int(lt, locker, create, retp)
SH_TAILQ_FOREACH(sh_locker, &lt->locker_tab[indx], links, __db_locker)
if (sh_locker->id == locker)
break;
+
if (sh_locker == NULL && create) {
- nlockers = 0;
- /* Create new locker and then insert it into hash table. */
- if ((ret = __mutex_alloc(env, MTX_LOGICAL_LOCK,
- DB_MUTEX_LOGICAL_LOCK | DB_MUTEX_SELF_BLOCK,
- &mutex)) != 0)
- return (ret);
- else
- MUTEX_LOCK(env, mutex);
- if ((sh_locker = SH_TAILQ_FIRST(
- &region->free_lockers, __db_locker)) == NULL) {
- nlockers = region->stat.st_lockers >> 2;
- /* Just in case. */
- if (nlockers == 0)
- nlockers = 1;
- if (region->stat.st_maxlockers != 0 &&
- region->stat.st_maxlockers <
- region->stat.st_lockers + nlockers)
- nlockers = region->stat.st_maxlockers -
- region->stat.st_lockers;
- /*
- * Don't hold lockers when getting the region,
- * we could deadlock. When creating a locker
- * there is no race since the id allocation
- * is synchronized.
- */
- UNLOCK_LOCKERS(env, region);
- LOCK_REGION_LOCK(env);
- /*
- * If the max memory is not sized for max objects,
- * allocate as much as possible.
- */
- F_SET(&lt->reginfo, REGION_TRACKED);
- while (__env_alloc(&lt->reginfo, nlockers *
- sizeof(struct __db_locker), &sh_locker) != 0)
- if ((nlockers >> 1) == 0)
- break;
- F_CLR(&lt->reginfo, REGION_TRACKED);
- LOCK_REGION_UNLOCK(lt->env);
- LOCK_LOCKERS(env, region);
- for (i = 0; i < nlockers; i++) {
+ /* Can we reuse a locker struct cached in the thread info? */
+ if (ip != NULL && ip->dbth_local_locker != INVALID_ROFF &&
+ (sh_locker = (DB_LOCKER*)R_ADDR(&lt->reginfo,
+ ip->dbth_local_locker))->id == DB_LOCK_INVALIDID) {
+ DB_ASSERT(env, !F_ISSET(sh_locker, DB_LOCKER_FREE));
+#ifdef HAVE_STATISTICS
+ region->stat.st_nlockers_reused++;
+#endif
+ } else {
+ /* Create new locker and insert it into hash table. */
+ if ((sh_locker = SH_TAILQ_FIRST(
+ &region->free_lockers, __db_locker)) == NULL) {
+ nlockers = region->stat.st_lockers >> 2;
+ /* Just in case. */
+ if (nlockers == 0)
+ nlockers = 1;
+ if (region->stat.st_maxlockers != 0 &&
+ region->stat.st_maxlockers <
+ region->stat.st_lockers + nlockers)
+ nlockers = region->stat.st_maxlockers -
+ region->stat.st_lockers;
+ /*
+ * Don't hold lockers when getting the region,
+ * we could deadlock. When creating a locker
+ * there is no race since the id allocation
+ * is synchronized.
+ */
+ UNLOCK_LOCKERS(env, region);
+ LOCK_REGION_LOCK(env);
+ /*
+ * If the max memory is not sized for max
+ * objects, allocate as much as possible.
+ */
+ F_SET(&lt->reginfo, REGION_TRACKED);
+ while (__env_alloc(&lt->reginfo, nlockers *
+ sizeof(struct __db_locker),
+ &sh_locker) != 0) {
+ nlockers >>= 1;
+ if (nlockers == 0)
+ break;
+ }
+ F_CLR(&lt->reginfo, REGION_TRACKED);
+ LOCK_REGION_UNLOCK(lt->env);
+ LOCK_LOCKERS(env, region);
+ for (i = 0; i < nlockers; i++) {
+ SH_TAILQ_INSERT_HEAD(
+ &region->free_lockers,
+ sh_locker, links, __db_locker);
+ sh_locker->mtx_locker = MUTEX_INVALID;
+#ifdef DIAGNOSTIC
+ sh_locker->prev_locker = INVALID_ROFF;
+#endif
+ sh_locker++;
+ }
+ if (nlockers == 0)
+ return (__lock_nomem(env,
+ "locker entries"));
+ region->stat.st_lockers += nlockers;
+ sh_locker = SH_TAILQ_FIRST(
+ &region->free_lockers, __db_locker);
+ }
+ SH_TAILQ_REMOVE(
+ &region->free_lockers,
+ sh_locker, links, __db_locker);
+ }
+ F_CLR(sh_locker, DB_LOCKER_FREE);
+ if (sh_locker->mtx_locker == MUTEX_INVALID) {
+ if ((ret = __mutex_alloc(env, MTX_LOGICAL_LOCK,
+ DB_MUTEX_LOGICAL_LOCK | DB_MUTEX_SELF_BLOCK,
+ &sh_locker->mtx_locker)) != 0) {
SH_TAILQ_INSERT_HEAD(&region->free_lockers,
sh_locker, links, __db_locker);
- sh_locker++;
+ return (ret);
}
- if (nlockers == 0)
- return (__lock_nomem(env, "locker entries"));
- region->stat.st_lockers += nlockers;
- sh_locker = SH_TAILQ_FIRST(
- &region->free_lockers, __db_locker);
+ MUTEX_LOCK(env, sh_locker->mtx_locker);
}
- SH_TAILQ_REMOVE(
- &region->free_lockers, sh_locker, links, __db_locker);
+
++region->nlockers;
#ifdef HAVE_STATISTICS
STAT_PERFMON2(env, lock, nlockers, region->nlockers, locker);
@@ -365,10 +413,10 @@ __lock_getlocker_int(lt, locker, create, retp)
region->stat.st_maxnlockers,
region->nlockers, locker);
#endif
+
sh_locker->id = locker;
env->dbenv->thread_id(
env->dbenv, &sh_locker->pid, &sh_locker->tid);
- sh_locker->mtx_locker = mutex;
sh_locker->dd_id = 0;
sh_locker->master_locker = INVALID_ROFF;
sh_locker->parent_locker = INVALID_ROFF;
@@ -386,10 +434,20 @@ __lock_getlocker_int(lt, locker, create, retp)
&lt->locker_tab[indx], sh_locker, links, __db_locker);
SH_TAILQ_INSERT_HEAD(&region->lockers,
sh_locker, ulinks, __db_locker);
- ENV_GET_THREAD_INFO(env, ip);
+
+ if (ip != NULL && ip->dbth_local_locker == INVALID_ROFF)
+ ip->dbth_local_locker =
+ R_OFFSET(&lt->reginfo, sh_locker);
#ifdef DIAGNOSTIC
- if (ip != NULL)
- ip->dbth_locker = R_OFFSET(&lt->reginfo, sh_locker);
+ /*
+ * __db_has_pagelock checks for proper locking by dbth_locker.
+ */
+ if ((diag = ip) == NULL)
+ ENV_GET_THREAD_INFO(env, diag);
+ if (diag != NULL) {
+ sh_locker->prev_locker = diag->dbth_locker;
+ diag->dbth_locker = R_OFFSET(&lt->reginfo, sh_locker);
+ }
#endif
}
@@ -420,7 +478,7 @@ __lock_addfamilylocker(env, pid, id, is_family)
LOCK_LOCKERS(env, region);
/* get/create the parent locker info */
- if ((ret = __lock_getlocker_int(lt, pid, 1, &mlockerp)) != 0)
+ if ((ret = __lock_getlocker_int(lt, pid, 1, NULL, &mlockerp)) != 0)
goto err;
/*
@@ -430,7 +488,7 @@ __lock_addfamilylocker(env, pid, id, is_family)
* we manipulate it, nor can another child in the
* family be created at the same time.
*/
- if ((ret = __lock_getlocker_int(lt, id, 1, &lockerp)) != 0)
+ if ((ret = __lock_getlocker_int(lt, id, 1, NULL, &lockerp)) != 0)
goto err;
/* Point to our parent. */
@@ -466,9 +524,9 @@ err: UNLOCK_LOCKERS(env, region);
}
/*
- * __lock_freelocker_int
+ * __lock_freelocker_int --
* Common code for deleting a locker; must be called with the
- * locker bucket locked.
+ * lockers mutex locked.
*/
static int
__lock_freelocker_int(lt, region, sh_locker, reallyfree)
@@ -478,15 +536,21 @@ __lock_freelocker_int(lt, region, sh_locker, reallyfree)
int reallyfree;
{
ENV *env;
+ DB_MSGBUF mb;
+ DB_THREAD_INFO *ip;
u_int32_t indx;
int ret;
env = lt->env;
-
- if (SH_LIST_FIRST(&sh_locker->heldby, __db_lock) != NULL) {
- __db_errx(env, DB_STR("2047",
- "Freeing locker with locks"));
- return (EINVAL);
+ if (!SH_LIST_EMPTY(&sh_locker->heldby)) {
+ ret = USR_ERR(env, EINVAL);
+ __db_errx(env,
+ DB_STR("2060", "Freeing locker %x with locks"),
+ sh_locker->id);
+ DB_MSGBUF_INIT(&mb);
+ (void)__lock_dump_locker(env, &mb, lt, sh_locker);
+ DB_MSGBUF_FLUSH(env, &mb);
+ return (ret);
}
/* If this is part of a family, we must fix up its links. */
@@ -499,16 +563,29 @@ __lock_freelocker_int(lt, region, sh_locker, reallyfree)
LOCKER_HASH(lt, region, sh_locker->id, indx);
SH_TAILQ_REMOVE(&lt->locker_tab[indx], sh_locker,
links, __db_locker);
- if (sh_locker->mtx_locker != MUTEX_INVALID &&
- (ret = __mutex_free(env, &sh_locker->mtx_locker)) != 0)
- return (ret);
- SH_TAILQ_INSERT_HEAD(&region->free_lockers, sh_locker,
- links, __db_locker);
SH_TAILQ_REMOVE(&region->lockers, sh_locker,
ulinks, __db_locker);
region->nlockers--;
STAT_PERFMON2(env,
lock, nlockers, region->nlockers, sh_locker->id);
+ /*
+ * If this locker is cached in the thread info, zero the id and
+ * leave it allocated. Otherwise, put it back on the free list.
+ */
+ ENV_GET_THREAD_INFO(env, ip);
+ if (ip != NULL && ip->dbth_local_locker ==
+ R_OFFSET(&lt->reginfo, sh_locker)) {
+ DB_ASSERT(env,
+ MUTEX_IS_BUSY(env, sh_locker->mtx_locker));
+ sh_locker->id = DB_LOCK_INVALIDID;
+ } else {
+ if (sh_locker->mtx_locker != MUTEX_INVALID && (ret =
+ __mutex_free(env, &sh_locker->mtx_locker)) != 0)
+ return (ret);
+ F_SET(sh_locker, DB_LOCKER_FREE);
+ SH_TAILQ_INSERT_HEAD(&region->free_lockers, sh_locker,
+ links, __db_locker);
+ }
}
return (0);
@@ -518,7 +595,7 @@ __lock_freelocker_int(lt, region, sh_locker, reallyfree)
* __lock_freelocker
* Remove a locker and its family from the hash table.
*
- * This must be called without the locker bucket locked.
+ * This must be called without the lockers mutex locked.
*
* PUBLIC: int __lock_freelocker __P((DB_LOCKTAB *, DB_LOCKER *));
*/
@@ -570,3 +647,42 @@ __lock_familyremove(lt, sh_locker)
return (ret);
}
+
+/*
+ * __lock_local_locker_invalidate --
+ * Search the thread info table's cached lockers and discard any reference
+ * to this mutex.
+ *
+ * PUBLIC: int __lock_local_locker_invalidate __P((ENV *, db_mutex_t));
+ */
+int
+__lock_local_locker_invalidate(env, mutex)
+ ENV *env;
+ db_mutex_t mutex;
+{
+ DB_HASHTAB *htab;
+ DB_LOCKER *locker;
+ DB_THREAD_INFO *ip;
+ u_int32_t i;
+ char buf[DB_THREADID_STRLEN];
+
+ htab = env->thr_hashtab;
+ for (i = 0; i < env->thr_nbucket; i++) {
+ SH_TAILQ_FOREACH(ip, &htab[i], dbth_links, __db_thread_info) {
+ if (ip->dbth_local_locker == INVALID_ROFF)
+ continue;
+ locker = (DB_LOCKER *)R_ADDR(&env->lk_handle->reginfo,
+ ip->dbth_local_locker);
+ if (locker->mtx_locker == mutex) {
+ __db_msg(env,
+DB_STR_A("2061", "Removing cached locker mutex %lu reference by %s", "%lu %s"),
+ (u_long)mutex,
+ env->dbenv->thread_id_string(env->dbenv,
+ locker->pid, locker->tid, buf));
+ locker->mtx_locker = MUTEX_INVALID;
+ return (0);
+ }
+ }
+ }
+ return (0);
+}
diff --git a/src/lock/lock_list.c b/src/lock/lock_list.c
index 1e3d2a55..5d55e4a0 100644
--- a/src/lock/lock_list.c
+++ b/src/lock/lock_list.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/lock/lock_method.c b/src/lock/lock_method.c
index 0cc2e19d..0e6c0428 100644
--- a/src/lock/lock_method.c
+++ b/src/lock/lock_method.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/lock/lock_region.c b/src/lock/lock_region.c
index 1aae1815..ecc7ba47 100644
--- a/src/lock/lock_region.c
+++ b/src/lock/lock_region.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -120,13 +120,15 @@ __lock_open(env)
}
/*
- * A process joining the region may have reset the lock and transaction
- * timeouts.
+ * Lock and transaction timeouts will be ignored when joining the
+ * environment, so print a warning if either was set.
*/
- if (dbenv->lk_timeout != 0)
- region->lk_timeout = dbenv->lk_timeout;
- if (dbenv->tx_timeout != 0)
- region->tx_timeout = dbenv->tx_timeout;
+ if (dbenv->lk_timeout != 0 && region->lk_timeout != dbenv->lk_timeout)
+ __db_msg(env, DB_STR("2058",
+"Warning: Ignoring DB_SET_LOCK_TIMEOUT when joining the environment."));
+ if (dbenv->tx_timeout != 0 && region->tx_timeout != dbenv->tx_timeout)
+ __db_msg(env, DB_STR("2059",
+"Warning: Ignoring DB_SET_TXN_TIMEOUT when joining the environment."));
LOCK_REGION_UNLOCK(env);
region_locked = 0;
@@ -396,13 +398,30 @@ __lock_env_refresh(env)
R_ADDR(reginfo, lr->locker_mem_off));
}
- /* Detach from the region. */
- ret = __env_region_detach(env, reginfo, 0);
+ ret = __lock_region_detach(env, lt);
- /* Discard DB_LOCKTAB. */
- __os_free(env, lt);
- env->lk_handle = NULL;
+ return (ret);
+}
+
+/*
+ * __lock_region_detach --
+ *
+ * PUBLIC: int __lock_region_detach __P((ENV *, DB_LOCKTAB *));
+ */
+int
+__lock_region_detach(env, lt)
+ ENV *env;
+ DB_LOCKTAB *lt;
+{
+ int ret;
+ ret = 0;
+ if (lt != NULL) {
+ ret = __env_region_detach(env, &lt->reginfo, 0);
+ /* Discard DB_LOCKTAB. */
+ __os_free(env, lt);
+ env->lk_handle = NULL;
+ }
return (ret);
}
diff --git a/src/lock/lock_stat.c b/src/lock/lock_stat.c
index 11b934aa..1ce0796a 100644
--- a/src/lock/lock_stat.c
+++ b/src/lock/lock_stat.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -15,8 +15,6 @@
#include "dbinc/db_am.h"
#ifdef HAVE_STATISTICS
-static int __lock_dump_locker
- __P((ENV *, DB_MSGBUF *, DB_LOCKTAB *, DB_LOCKER *));
static int __lock_dump_object __P((DB_LOCKTAB *, DB_MSGBUF *, DB_LOCKOBJ *));
static int __lock_print_all __P((ENV *, u_int32_t));
static int __lock_print_stats __P((ENV *, u_int32_t));
@@ -363,6 +361,11 @@ __lock_print_stats(env, flags)
__db_dl(env, "Maximum number of lockers at any one time",
(u_long)sp->st_maxnlockers);
__db_dl(env,
+ "Number of hits in the thread locker cache",
+ (u_long)sp->st_nlockers_hit);
+ __db_dl(env,
+ "Total number of lockers reused", (u_long)sp->st_nlockers_reused);
+ __db_dl(env,
"Number of current lock objects", (u_long)sp->st_nobjects);
__db_dl(env, "Maximum number of lock objects at any one time",
(u_long)sp->st_maxnobjects);
@@ -463,9 +466,17 @@ __lock_print_all(env, flags)
if (timespecisset(&lrp->next_timeout)) {
#ifdef HAVE_STRFTIME
time_t t = (time_t)lrp->next_timeout.tv_sec;
+ struct tm *tm_p;
char tbuf[64];
+#ifdef HAVE_LOCALTIME_R
+ struct tm tm;
+
+ tm_p = localtime_r(&t, &tm);
+#else
+ tm_p = localtime(&t);
+#endif
if (strftime(tbuf, sizeof(tbuf),
- "%m-%d-%H:%M:%S", localtime(&t)) != 0)
+ "%m-%d-%H:%M:%S", tm_p) != 0)
__db_msg(env, "next_timeout: %s.%09lu",
tbuf, (u_long)lrp->next_timeout.tv_nsec);
else
@@ -519,80 +530,6 @@ __lock_print_all(env, flags)
}
static int
-__lock_dump_locker(env, mbp, lt, lip)
- ENV *env;
- DB_MSGBUF *mbp;
- DB_LOCKTAB *lt;
- DB_LOCKER *lip;
-{
- DB_LOCKREGION *lrp;
- struct __db_lock *lp;
- char buf[DB_THREADID_STRLEN];
- u_int32_t ndx;
-
- lrp = lt->reginfo.primary;
-
- __db_msgadd(env,
- mbp, "%8lx dd=%2ld locks held %-4d write locks %-4d pid/thread %s",
- (u_long)lip->id, (long)lip->dd_id, lip->nlocks, lip->nwrites,
- env->dbenv->thread_id_string(env->dbenv, lip->pid, lip->tid, buf));
- __db_msgadd(env, mbp,
- " flags %-4x priority %-10u", lip->flags, lip->priority);
-
- if (timespecisset(&lip->tx_expire)) {
-#ifdef HAVE_STRFTIME
- time_t t = (time_t)lip->tx_expire.tv_sec;
- char tbuf[64];
- if (strftime(tbuf, sizeof(tbuf),
- "%m-%d-%H:%M:%S", localtime(&t)) != 0)
- __db_msgadd(env, mbp, "expires %s.%09lu",
- tbuf, (u_long)lip->tx_expire.tv_nsec);
- else
-#endif
- __db_msgadd(env, mbp, "expires %lu.%09lu",
- (u_long)lip->tx_expire.tv_sec,
- (u_long)lip->tx_expire.tv_nsec);
- }
- if (F_ISSET(lip, DB_LOCKER_TIMEOUT))
- __db_msgadd(
- env, mbp, " lk timeout %lu", (u_long)lip->lk_timeout);
- if (timespecisset(&lip->lk_expire)) {
-#ifdef HAVE_STRFTIME
- time_t t = (time_t)lip->lk_expire.tv_sec;
- char tbuf[64];
- if (strftime(tbuf,
- sizeof(tbuf), "%m-%d-%H:%M:%S", localtime(&t)) != 0)
- __db_msgadd(env, mbp, " lk expires %s.%09lu",
- tbuf, (u_long)lip->lk_expire.tv_nsec);
- else
-#endif
- __db_msgadd(env, mbp, " lk expires %lu.%09lu",
- (u_long)lip->lk_expire.tv_sec,
- (u_long)lip->lk_expire.tv_nsec);
- }
- DB_MSGBUF_FLUSH(env, mbp);
-
- /*
- * We need some care here since the list may change while we
- * look.
- */
-retry: SH_LIST_FOREACH(lp, &lip->heldby, locker_links, __db_lock) {
- if (!SH_LIST_EMPTY(&lip->heldby) && lp != NULL) {
- ndx = lp->indx;
- OBJECT_LOCK_NDX(lt, lrp, ndx);
- if (lp->indx == ndx)
- __lock_printlock(lt, mbp, lp, 1);
- else {
- OBJECT_UNLOCK(lt, lrp, ndx);
- goto retry;
- }
- OBJECT_UNLOCK(lt, lrp, ndx);
- }
- }
- return (0);
-}
-
-static int
__lock_dump_object(lt, mbp, op)
DB_LOCKTAB *lt;
DB_MSGBUF *mbp;
@@ -619,6 +556,31 @@ __lock_print_header(env)
"Count", "Status", "----------------- Object ---------------");
}
+#else /* !HAVE_STATISTICS */
+
+int
+__lock_stat_pp(dbenv, statp, flags)
+ DB_ENV *dbenv;
+ DB_LOCK_STAT **statp;
+ u_int32_t flags;
+{
+ COMPQUIET(statp, NULL);
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(dbenv->env));
+}
+
+int
+__lock_stat_print_pp(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(dbenv->env));
+}
+#endif
+
/*
* __lock_printlock --
*
@@ -744,27 +706,81 @@ __lock_printlock(lt, mbp, lp, ispgno)
DB_MSGBUF_FLUSH(env, mbp);
}
-#else /* !HAVE_STATISTICS */
-
+/*
+ * __lock_dump_locker --
+ * Display the identity and statistics of a locker. This is used during
+ * diagnostic error paths as well as when printing statistics.
+ *
+ * PUBLIC: int __lock_dump_locker
+ * PUBLIC: __P((ENV *, DB_MSGBUF *, DB_LOCKTAB *, DB_LOCKER *));
+ */
int
-__lock_stat_pp(dbenv, statp, flags)
- DB_ENV *dbenv;
- DB_LOCK_STAT **statp;
- u_int32_t flags;
+__lock_dump_locker(env, mbp, lt, lip)
+ ENV *env;
+ DB_MSGBUF *mbp;
+ DB_LOCKTAB *lt;
+ DB_LOCKER *lip;
{
- COMPQUIET(statp, NULL);
- COMPQUIET(flags, 0);
+ DB_LOCKREGION *lrp;
+ struct __db_lock *lp;
+ char buf[DB_THREADID_STRLEN];
+ u_int32_t ndx;
- return (__db_stat_not_built(dbenv->env));
-}
+ lrp = lt->reginfo.primary;
-int
-__lock_stat_print_pp(dbenv, flags)
- DB_ENV *dbenv;
- u_int32_t flags;
-{
- COMPQUIET(flags, 0);
+ __db_msgadd(env,
+ mbp, "%8lx dd=%2ld locks held %-4d write locks %-4d pid/thread %s",
+ (u_long)lip->id, (long)lip->dd_id, lip->nlocks, lip->nwrites,
+ env->dbenv->thread_id_string(env->dbenv, lip->pid, lip->tid, buf));
+ __db_msgadd(env, mbp,
+ " flags %-4x priority %-10u", lip->flags, lip->priority);
- return (__db_stat_not_built(dbenv->env));
-}
+ if (timespecisset(&lip->tx_expire)) {
+#ifdef HAVE_STRFTIME
+ time_t t = (time_t)lip->tx_expire.tv_sec;
+ char tbuf[64];
+ if (strftime(tbuf, sizeof(tbuf),
+ "%m-%d-%H:%M:%S", localtime(&t)) != 0)
+ __db_msgadd(env, mbp, "expires %s.%09lu",
+ tbuf, (u_long)lip->tx_expire.tv_nsec);
+ else
#endif
+ __db_msgadd(env, mbp, "expires %lu.%09lu",
+ (u_long)lip->tx_expire.tv_sec,
+ (u_long)lip->tx_expire.tv_nsec);
+ }
+ if (F_ISSET(lip, DB_LOCKER_TIMEOUT))
+ __db_msgadd(
+ env, mbp, " lk timeout %lu", (u_long)lip->lk_timeout);
+ if (timespecisset(&lip->lk_expire)) {
+#ifdef HAVE_STRFTIME
+ time_t t = (time_t)lip->lk_expire.tv_sec;
+ char tbuf[64];
+ if (strftime(tbuf,
+ sizeof(tbuf), "%m-%d-%H:%M:%S", localtime(&t)) != 0)
+ __db_msgadd(env, mbp, " lk expires %s.%09lu",
+ tbuf, (u_long)lip->lk_expire.tv_nsec);
+ else
+#endif
+ __db_msgadd(env, mbp, " lk expires %lu.%09lu",
+ (u_long)lip->lk_expire.tv_sec,
+ (u_long)lip->lk_expire.tv_nsec);
+ }
+ DB_MSGBUF_FLUSH(env, mbp);
+
+ /* We need some care here since the list may change while we look. */
+retry: SH_LIST_FOREACH(lp, &lip->heldby, locker_links, __db_lock) {
+ if (!SH_LIST_EMPTY(&lip->heldby) && lp != NULL) {
+ ndx = lp->indx;
+ OBJECT_LOCK_NDX(lt, lrp, ndx);
+ if (lp->indx == ndx)
+ __lock_printlock(lt, mbp, lp, 1);
+ else {
+ OBJECT_UNLOCK(lt, lrp, ndx);
+ goto retry;
+ }
+ OBJECT_UNLOCK(lt, lrp, ndx);
+ }
+ }
+ return (0);
+}
diff --git a/src/lock/lock_stub.c b/src/lock/lock_stub.c
index 3875af55..a916c6df 100644
--- a/src/lock/lock_stub.c
+++ b/src/lock/lock_stub.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -359,6 +359,7 @@ size_t
__lock_region_max(env)
ENV *env;
{
+ COMPQUIET(env, NULL);
return (0);
}
@@ -367,6 +368,7 @@ __lock_region_size(env, other_alloc)
ENV *env;
size_t other_alloc;
{
+ COMPQUIET(env, NULL);
COMPQUIET(other_alloc, 0);
return (0);
}
@@ -584,6 +586,7 @@ __lock_list_print(env, mbp, list)
DBT *list;
{
COMPQUIET(env, NULL);
+ COMPQUIET(mbp, NULL);
COMPQUIET(list, NULL);
}
@@ -625,7 +628,7 @@ __lock_change(env, old_lock, new_lock)
ENV *env;
DB_LOCK *old_lock, *new_lock;
{
- COMPQUIET(env, NULL);
COMPQUIET(old_lock, NULL);
COMPQUIET(new_lock, NULL);
+ return (__db_nolocking(env));
}
diff --git a/src/lock/lock_timer.c b/src/lock/lock_timer.c
index 943047f0..9744438a 100644
--- a/src/lock/lock_timer.c
+++ b/src/lock/lock_timer.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/lock/lock_util.c b/src/lock/lock_util.c
index f7029cd7..07fdce72 100644
--- a/src/lock/lock_util.c
+++ b/src/lock/lock_util.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/log/log.c b/src/log/log.c
index 5808145f..9bef8d69 100644
--- a/src/log/log.c
+++ b/src/log/log.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -32,6 +32,7 @@ __log_open(env)
DB_ENV *dbenv;
DB_LOG *dblp;
LOG *lp;
+ u_int32_t log_flags;
u_int8_t *bulk;
int region_locked, ret;
@@ -130,47 +131,59 @@ __log_open(env)
}
} else {
/*
- * A process joining the region may have reset the log file
- * size, too. If so, it only affects the next log file we
- * create. We need to check that the size is reasonable given
- * the buffer size in the region.
+ * The log file size and DB_LOG_AUTO_REMOVE will be ignored
+ * when joining the environment, so print a warning if either
+ * was set.
*/
- LOG_SYSTEM_LOCK(env);
- region_locked = 1;
-
- if (dbenv->lg_size != 0) {
- if ((ret =
- __log_check_sizes(env, dbenv->lg_size, 0)) != 0)
- goto err;
-
- lp->log_nsize = dbenv->lg_size;
- }
-
- LOG_SYSTEM_UNLOCK(env);
- region_locked = 0;
-
- if (dbenv->lg_flags != 0 && (ret =
- __log_set_config_int(dbenv, dbenv->lg_flags, 1, 0)) != 0)
+ if (dbenv->lg_size != 0 && lp->log_nsize != dbenv->lg_size)
+ __db_msg(env, DB_STR("2585",
+"Warning: Ignoring maximum log file size when joining the environment"));
+
+ log_flags = dbenv->lg_flags & ~DB_LOG_AUTO_REMOVE;
+ if ((dbenv->lg_flags & DB_LOG_AUTO_REMOVE) &&
+ lp->db_log_autoremove == 0)
+ __db_msg(env, DB_STR("2586",
+"Warning: Ignoring DB_LOG_AUTO_REMOVE when joining the environment."));
+ if (log_flags != 0 && (ret =
+ __log_set_config_int(dbenv, log_flags, 1, 0)) != 0)
return (ret);
}
dblp->reginfo.mtx_alloc = lp->mtx_region;
return (0);
-err: if (dblp->reginfo.addr != NULL) {
- if (region_locked)
- LOG_SYSTEM_UNLOCK(env);
- (void)__env_region_detach(env, &dblp->reginfo, 0);
- }
- env->lg_handle = NULL;
-
+err: if (region_locked)
+ LOG_SYSTEM_UNLOCK(env);
(void)__mutex_free(env, &dblp->mtx_dbreg);
- __os_free(env, dblp);
+ (void)__log_region_detach(env, dblp);
return (ret);
}
/*
+ * __log_region_detach --
+ *
+ * PUBLIC: int __log_region_detach __P((ENV *, DB_LOG *));
+ */
+int
+__log_region_detach(env, dblp)
+ ENV *env;
+ DB_LOG *dblp;
+{
+ int ret;
+
+ ret = 0;
+ if (dblp != NULL) {
+ if (dblp->reginfo.addr != NULL)
+ ret = __env_region_detach(env, &dblp->reginfo, 0);
+ /* Discard DB_LOG. */
+ __os_free(env, dblp);
+ env->lg_handle = NULL;
+ }
+ return (ret);
+}
+
+/*
* __log_init --
* Initialize a log region in shared memory.
*/
@@ -638,7 +651,6 @@ __log_valid(dblp, number, set_persist, fhpp, flags, statusp, versionp)
recsize = sizeof(LOGP);
if (CRYPTO_ON(env)) {
hdrsize = HDR_CRYPTO_SZ;
- recsize = sizeof(LOGP);
recsize += db_cipher->adj_size(recsize);
is_hmac = 1;
}
@@ -700,7 +712,7 @@ __log_valid(dblp, number, set_persist, fhpp, flags, statusp, versionp)
* we can only detect that by having an unreasonable
* data length for our persistent data.
*/
- if ((hdr->len - hdrsize) != sizeof(LOGP)) {
+ if ((hdr->len - hdrsize) != recsize) {
__db_errx(env, "log record size mismatch");
goto err;
}
@@ -722,10 +734,10 @@ __log_valid(dblp, number, set_persist, fhpp, flags, statusp, versionp)
hdr->len - hdrsize, is_hmac)) != 0)
goto bad_checksum;
/*
- * The checksum verifies without the header. Make note
- * of that, because it is only acceptable when the log
- * version < DB_LOGCHKSUM. Later, when we determine log
- * version, we will confirm this.
+ * The checksum verifies without the header. Make note
+ * of that, because it is only acceptable when the log
+ * version < DB_LOGCHKSUM. Later, when we determine log
+ * version, we will confirm this.
*/
chksum_includes_hdr = 0;
}
@@ -800,7 +812,7 @@ __log_valid(dblp, number, set_persist, fhpp, flags, statusp, versionp)
/*
* We might have to declare a checksum failure here, if:
* - the checksum verified only by ignoring the header, and
- * - the log version indicates that the header should have
+ * - the log version indicates that the header should have
* been included.
*/
if (!chksum_includes_hdr && logversion >= DB_LOGCHKSUM)
@@ -899,66 +911,69 @@ __log_env_refresh(env)
/*
* After we close the files, check for any unlogged closes left in
* the shared memory queue. If we find any, try to log it, otherwise
- * return the error. We cannot say the environment was closed
- * cleanly.
+ * return the error; we cannot say the environment was closed cleanly.
+ * This does not use the typical MUTEX_LOCK(), but MUTEX_LOCK_RET(). The
+ * normal function would immediately return DB_RUNRECOVERY if we are
+ * closing the env down during a panic. By using MUTEX_LOCK_RET(), we
+ * continue with the rest of the cleanup.
*/
- MUTEX_LOCK(env, lp->mtx_filelist);
- SH_TAILQ_FOREACH(fnp, &lp->fq, q, __fname)
- if (F_ISSET(fnp, DB_FNAME_NOTLOGGED) &&
- (t_ret = __dbreg_close_id_int(
- env, fnp, DBREG_CLOSE, 1)) != 0)
- ret = t_ret;
- MUTEX_UNLOCK(env, lp->mtx_filelist);
-
+ if (MUTEX_LOCK_RET(env, lp->mtx_filelist) == 0) {
+ SH_TAILQ_FOREACH(fnp, &lp->fq, q, __fname)
+ if (F_ISSET(fnp, DB_FNAME_NOTLOGGED) &&
+ (t_ret = __dbreg_close_id_int(
+ env, fnp, DBREG_CLOSE, 1)) != 0)
+ ret = t_ret;
+ MUTEX_UNLOCK(env, lp->mtx_filelist);
+ }
/*
- * If a private region, return the memory to the heap. Not needed for
- * filesystem-backed or system shared memory regions, that memory isn't
- * owned by any particular process.
+ * If a private region, return the memory to the heap. Not
+ * needed for filesystem-backed or system shared memory regions,
+ * that memory isn't owned by any particular process.
*/
if (F_ISSET(env, ENV_PRIVATE)) {
- reginfo->mtx_alloc = MUTEX_INVALID;
- /* Discard the flush mutex. */
- if ((t_ret =
- __mutex_free(env, &lp->mtx_flush)) != 0 && ret == 0)
- ret = t_ret;
-
- /* Discard the buffer. */
- __env_alloc_free(reginfo, R_ADDR(reginfo, lp->buffer_off));
-
- /* Discard stack of free file IDs. */
- if (lp->free_fid_stack != INVALID_ROFF)
- __env_alloc_free(reginfo,
- R_ADDR(reginfo, lp->free_fid_stack));
-
- /* Discard the list of in-memory log file markers. */
- while ((filestart = SH_TAILQ_FIRST(&lp->logfiles,
- __db_filestart)) != NULL) {
- SH_TAILQ_REMOVE(&lp->logfiles, filestart, links,
- __db_filestart);
- __env_alloc_free(reginfo, filestart);
- }
-
- while ((filestart = SH_TAILQ_FIRST(&lp->free_logfiles,
- __db_filestart)) != NULL) {
- SH_TAILQ_REMOVE(&lp->free_logfiles, filestart, links,
- __db_filestart);
- __env_alloc_free(reginfo, filestart);
- }
-
- /* Discard commit queue elements. */
- while ((commit = SH_TAILQ_FIRST(&lp->free_commits,
- __db_commit)) != NULL) {
- SH_TAILQ_REMOVE(&lp->free_commits, commit, links,
- __db_commit);
- __env_alloc_free(reginfo, commit);
- }
-
- /* Discard replication bulk buffer. */
- if (lp->bulk_buf != INVALID_ROFF) {
- __env_alloc_free(reginfo,
- R_ADDR(reginfo, lp->bulk_buf));
- lp->bulk_buf = INVALID_ROFF;
- }
+ reginfo->mtx_alloc = MUTEX_INVALID;
+ /* Discard the flush mutex. */
+ if ((t_ret =
+ __mutex_free(env, &lp->mtx_flush)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Discard the log buffer. */
+ __env_alloc_free(reginfo, R_ADDR(reginfo, lp->buffer_off));
+
+ /* Discard stack of free file IDs. */
+ if (lp->free_fid_stack != INVALID_ROFF)
+ __env_alloc_free(reginfo,
+ R_ADDR(reginfo, lp->free_fid_stack));
+
+ /* Discard the list of in-memory log file markers. */
+ while ((filestart = SH_TAILQ_FIRST(&lp->logfiles,
+ __db_filestart)) != NULL) {
+ SH_TAILQ_REMOVE(&lp->logfiles, filestart, links,
+ __db_filestart);
+ __env_alloc_free(reginfo, filestart);
+ }
+
+ while ((filestart = SH_TAILQ_FIRST(&lp->free_logfiles,
+ __db_filestart)) != NULL) {
+ SH_TAILQ_REMOVE(&lp->free_logfiles, filestart, links,
+ __db_filestart);
+ __env_alloc_free(reginfo, filestart);
+ }
+
+ /* Discard commit queue elements. */
+ while ((commit = SH_TAILQ_FIRST(&lp->free_commits,
+ __db_commit)) != NULL) {
+ SH_TAILQ_REMOVE(&lp->free_commits, commit, links,
+ __db_commit);
+ __env_alloc_free(reginfo, commit);
+ }
+
+ /* Discard replication bulk buffer. */
+ if (lp->bulk_buf != INVALID_ROFF) {
+ __env_alloc_free(reginfo,
+ R_ADDR(reginfo, lp->bulk_buf));
+ lp->bulk_buf = INVALID_ROFF;
+ }
}
/* Discard the per-thread DBREG mutex. */
@@ -1394,7 +1409,7 @@ __log_inmem_lsnoff(dblp, lsnp, offsetp)
return (0);
}
- return (DB_NOTFOUND);
+ return (USR_ERR(dblp->env, DB_NOTFOUND));
}
/*
diff --git a/src/log/log_archive.c b/src/log/log_archive.c
index 280a2071..fb98e10b 100644
--- a/src/log/log_archive.c
+++ b/src/log/log_archive.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -304,7 +304,7 @@ __log_get_stable_lsn(env, stable_lsn, group_wide)
* so that the caller knows it may be done.
*/
if (IS_ZERO_LSN(*stable_lsn)) {
- ret = DB_NOTFOUND;
+ ret = USR_ERR(env, DB_NOTFOUND);
goto err;
}
} else if ((ret = __txn_getckp(env, stable_lsn)) != 0)
diff --git a/src/log/log_compare.c b/src/log/log_compare.c
index 97b59338..9bd28854 100644
--- a/src/log/log_compare.c
+++ b/src/log/log_compare.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/log/log_debug.c b/src/log/log_debug.c
index 32fb2542..d8f10798 100644
--- a/src/log/log_debug.c
+++ b/src/log/log_debug.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/log/log_get.c b/src/log/log_get.c
index db30c969..332dab8e 100644
--- a/src/log/log_get.c
+++ b/src/log/log_get.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -472,7 +472,7 @@ nextrec:
/* If at start-of-file, move to the previous file. */
if (nlsn.offset == 0) {
if (nlsn.file == 1) {
- ret = DB_NOTFOUND;
+ ret = USR_ERR(env, DB_NOTFOUND);
goto err;
}
if ((!lp->db_log_inmemory &&
@@ -480,7 +480,7 @@ nextrec:
0, &status, NULL) != 0 ||
(status != DB_LV_NORMAL &&
status != DB_LV_OLD_READABLE)))) {
- ret = DB_NOTFOUND;
+ ret = USR_ERR(env, DB_NOTFOUND);
goto err;
}
@@ -607,7 +607,7 @@ nohdr: switch (flags) {
if (eof && logc->bp_lsn.file != nlsn.file)
__db_errx(env, DB_STR_A("2583",
"Log file %d not found, check log directory configuration", "%d"),
- nlsn.file);
+ nlsn.file);
else
__db_errx(env, DB_STR("2576",
"Encountered zero length records while traversing backwards"));
@@ -624,7 +624,7 @@ nohdr: switch (flags) {
/* FALLTHROUGH */
case DB_SET:
default:
- ret = DB_NOTFOUND;
+ ret = USR_ERR(env, DB_NOTFOUND);
goto err;
}
}
@@ -830,7 +830,7 @@ __logc_incursor(logc, lsn, hdr, pp)
if (LOG_SWAPPED(env))
__log_hdrswap(hdr, CRYPTO_ON(env));
if (__logc_hdrchk(logc, lsn, hdr, &eof))
- return (DB_NOTFOUND);
+ return (USR_ERR(env, DB_NOTFOUND));
if (eof || logc->bp_lsn.offset + logc->bp_rlen < lsn->offset + hdr->len)
return (0);
@@ -914,7 +914,7 @@ __logc_inregion(logc, lsn, rlockp, last_lsn, hdr, pp, need_cksump)
if (IS_ZERO_LSN(lp->lsn))
return (0);
if (LOG_COMPARE(lsn, &lp->lsn) >= 0)
- return (DB_NOTFOUND);
+ return (USR_ERR(env, DB_NOTFOUND));
else if (lp->db_log_inmemory) {
if ((ret = __log_inmem_lsnoff(dblp, lsn, &b_region)) != 0)
return (ret);
@@ -949,14 +949,14 @@ __logc_inregion(logc, lsn, rlockp, last_lsn, hdr, pp, need_cksump)
if (LOG_SWAPPED(env))
__log_hdrswap(hdr, CRYPTO_ON(env));
if (__logc_hdrchk(logc, lsn, hdr, &eof) != 0)
- return (DB_NOTFOUND);
+ return (USR_ERR(env, DB_NOTFOUND));
if (eof)
return (0);
if (lp->db_log_inmemory) {
if (RINGBUF_LEN(lp, b_region, lp->b_off) < hdr->len)
- return (DB_NOTFOUND);
+ return (USR_ERR(env, DB_NOTFOUND));
} else if (lsn->offset + hdr->len > lp->w_off + lp->buffer_size)
- return (DB_NOTFOUND);
+ return (USR_ERR(env, DB_NOTFOUND));
if (logc->bp_size <= hdr->len) {
len = (size_t)DB_ALIGN((uintmax_t)hdr->len * 2, 128);
if ((ret =
@@ -1535,6 +1535,10 @@ __log_read_record(env, dbpp, td, recbuf, spec, size, argpp)
LOGCOPY_32(env, ap + sp->offset, bp);
bp += sizeof(uinttmp);
break;
+ case LOGREC_LONGARG:
+ LOGCOPY_64(env, ap + sp->offset, bp);
+ bp += sizeof(u_int64_t);
+ break;
case LOGREC_OP:
LOGCOPY_32(env, &op, bp);
*(u_int32_t *)(ap + sp->offset) = op;
diff --git a/src/log/log_method.c b/src/log/log_method.c
index d5aec116..09fbe863 100644
--- a/src/log/log_method.c
+++ b/src/log/log_method.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -342,6 +342,10 @@ __log_get_flags(dbenv, flagsp)
LF_SET(DB_LOG_IN_MEMORY);
else
LF_CLR(DB_LOG_IN_MEMORY);
+ if (lp->nosync)
+ LF_SET(DB_LOG_NOSYNC);
+ else
+ LF_CLR(DB_LOG_NOSYNC);
*flagsp = flags;
}
@@ -369,6 +373,8 @@ __log_set_flags(env, flags, on)
lp->db_log_autoremove = on ? 1 : 0;
if (LF_ISSET(DB_LOG_IN_MEMORY))
lp->db_log_inmemory = on ? 1 : 0;
+ if (LF_ISSET(DB_LOG_NOSYNC))
+ lp->nosync = on ? 1 : 0;
}
/*
@@ -377,13 +383,15 @@ __log_set_flags(env, flags, on)
*/
#undef OK_FLAGS
#define OK_FLAGS \
- (DB_LOG_AUTO_REMOVE | DB_LOG_DIRECT | \
- DB_LOG_DSYNC | DB_LOG_IN_MEMORY | DB_LOG_ZERO)
+ (DB_LOG_AUTO_REMOVE | DB_LOG_BLOB | DB_LOG_DIRECT | \
+ DB_LOG_DSYNC | DB_LOG_IN_MEMORY | DB_LOG_NOSYNC | DB_LOG_ZERO)
static const FLAG_MAP LogMap[] = {
{ DB_LOG_AUTO_REMOVE, DBLOG_AUTOREMOVE},
+ { DB_LOG_BLOB, DBLOG_BLOB},
{ DB_LOG_DIRECT, DBLOG_DIRECT},
{ DB_LOG_DSYNC, DBLOG_DSYNC},
{ DB_LOG_IN_MEMORY, DBLOG_INMEMORY},
+ { DB_LOG_NOSYNC, DBLOG_NOSYNC},
{ DB_LOG_ZERO, DBLOG_ZERO}
};
/*
@@ -406,10 +414,14 @@ __log_get_config(dbenv, which, onp)
if (FLD_ISSET(which, ~OK_FLAGS))
return (__db_ferr(env, "DB_ENV->log_get_config", 0));
dblp = env->lg_handle;
- ENV_REQUIRES_CONFIG(env, dblp, "DB_ENV->log_get_config", DB_INIT_LOG);
+ ENV_NOT_CONFIGURED(env, dblp, "DB_ENV->log_get_config", DB_INIT_LOG);
+
+ if (LOGGING_ON(env)) {
+ __env_fetch_flags(LogMap, sizeof(LogMap), &dblp->flags, &flags);
+ __log_get_flags(dbenv, &flags);
+ } else
+ flags = dbenv->lg_flags;
- __env_fetch_flags(LogMap, sizeof(LogMap), &dblp->flags, &flags);
- __log_get_flags(dbenv, &flags);
if (LF_ISSET(which))
*onp = 1;
else
@@ -459,6 +471,17 @@ __log_set_config_int(dbenv, flags, on, in_open)
"DB_ENV->log_set_config: direct I/O either not configured or not supported");
return (EINVAL);
}
+ if (REP_ON(env) && LF_ISSET(DB_LOG_BLOB) && !on) {
+ __db_errx(env,
+"DB_ENV->log_set_config: DB_LOG_BLOB must be enabled with replication.");
+ return (EINVAL);
+ }
+ if (FLD_ISSET(flags, DB_LOG_IN_MEMORY) && on > 0 &&
+ PREFMAS_IS_SET(env)) {
+ __db_errx(env, DB_STR("2587", "DB_LOG_IN_MEMORY is not "
+ "supported in Replication Manager preferred master mode"));
+ return (EINVAL);
+ }
if (LOGGING_ON(env)) {
if (!in_open && LF_ISSET(DB_LOG_IN_MEMORY) &&
diff --git a/src/log/log_print.c b/src/log/log_print.c
index d2cda519..e5c920b6 100644
--- a/src/log/log_print.c
+++ b/src/log/log_print.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -37,6 +37,7 @@ __log_print_record(env, recbuf, lsnp, name, spec, info)
LOG *lp;
PAGE *hdrstart, *hdrtmp;
int32_t inttmp;
+ u_int64_t ulltmp;
u_int32_t hdrsize, op, uinttmp;
u_int32_t type, txnid;
u_int8_t *bp, *datatmp;
@@ -150,6 +151,14 @@ __log_print_record(env, recbuf, lsnp, name, spec, info)
__db_msgadd(env, &msgbuf, "\n");
bp += sizeof(uinttmp);
break;
+ case LOGREC_LONGARG:
+ LOGCOPY_64(env, &ulltmp, bp);
+ __db_msgadd(env, &msgbuf, "\t%s: ", sp->name);
+ __db_msgadd(env,
+ &msgbuf, "%llu", (unsigned long long)ulltmp);
+ __db_msgadd(env, &msgbuf, "\n");
+ bp += sizeof(ulltmp);
+ break;
case LOGREC_TIME:
/* time_t is long but we only store 32 bits. */
LOGCOPY_32(env, &uinttmp, bp);
diff --git a/src/log/log_put.c b/src/log/log_put.c
index 8f7e23d8..4d6c3d2f 100644
--- a/src/log/log_put.c
+++ b/src/log/log_put.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -280,8 +280,7 @@ __log_put(env, lsnp, udbt, flags)
* If the send fails and we're a commit or checkpoint,
* there's nothing we can do; the record's in the log.
* Flush it, even if we're running with TXN_NOSYNC,
- * on the grounds that it should be in durable
- * form somewhere.
+ * on the grounds that it should be in durable form somewhere.
*/
if (ret != 0 && FLD_ISSET(ctlflags, REPCTL_PERM))
LF_SET(DB_FLUSH);
@@ -473,12 +472,12 @@ __log_put_next(env, lsn, dbt, hdr, old_lsnp)
*/
if (adv_file || lp->lsn.offset == 0 ||
lp->lsn.offset + hdr->size + dbt->size > lp->log_size) {
- if (hdr->size + sizeof(LOGP) + dbt->size > lp->log_size) {
+ if (hdr->size + sizeof(LOGP) + dbt->size > lp->log_nsize) {
__db_errx(env, DB_STR_A("2513",
"DB_ENV->log_put: record larger than maximum file size (%lu > %lu)",
"%lu %lu"),
(u_long)hdr->size + sizeof(LOGP) + dbt->size,
- (u_long)lp->log_size);
+ (u_long)lp->log_nsize);
return (EINVAL);
}
@@ -561,7 +560,12 @@ __log_flush_commit(env, lsnp, flags)
"Write failed on MASTER commit."));
return (__env_panic(env, ret));
}
-
+ /*
+ * If this is a panic don't attempt to abort just this transaction;
+ * it may trip over the panic, and the whole env needs to go anyway.
+ */
+ if (ret == DB_RUNRECOVERY)
+ return (__env_panic(env, ret));
/*
* Else, make sure that the commit record does not get out after we
* abort the transaction. Do this by overwriting the commit record
@@ -735,7 +739,7 @@ __log_newfile(dblp, lsnp, logfile, version)
__log_persistswap(tpersist);
if ((ret =
- __log_encrypt_record(env, &t, &hdr, (u_int32_t)tsize)) != 0)
+ __log_encrypt_record(env, &t, &hdr, (u_int32_t)sizeof(LOGP))) != 0)
goto err;
if ((ret = __log_putr(dblp, &lsn,
@@ -1118,12 +1122,15 @@ flush: MUTEX_LOCK(env, lp->mtx_flush);
LOG_SYSTEM_UNLOCK(env);
/* Sync all writes to disk. */
- if ((ret = __os_fsync(env, dblp->lfhp)) != 0) {
- MUTEX_UNLOCK(env, lp->mtx_flush);
- if (release)
- LOG_SYSTEM_LOCK(env);
- lp->in_flush--;
- goto done;
+ if (!lp->nosync) {
+ if ((ret = __os_fsync(env, dblp->lfhp)) != 0) {
+ MUTEX_UNLOCK(env, lp->mtx_flush);
+ if (release)
+ LOG_SYSTEM_LOCK(env);
+ lp->in_flush--;
+ goto done;
+ }
+ STAT(++lp->stat.st_scount);
}
/*
@@ -1143,7 +1150,6 @@ flush: MUTEX_LOCK(env, lp->mtx_flush);
LOG_SYSTEM_LOCK(env);
lp->in_flush--;
- STAT(++lp->stat.st_scount);
/*
* How many flush calls (usually commits) did this call actually sync?
@@ -1440,7 +1446,7 @@ __log_newfh(dblp, create)
"DB_ENV->log_newfh: %lu", (u_long)lp->lsn.file);
else if (status != DB_LV_NORMAL && status != DB_LV_INCOMPLETE &&
status != DB_LV_OLD_READABLE)
- ret = DB_NOTFOUND;
+ ret = USR_ERR(env, DB_NOTFOUND);
return (ret);
}
@@ -1621,6 +1627,37 @@ err:
return (ret);
}
+/*
+ * __log_rep_write --
+ * Way for replication clients to write the log buffer for the
+ * DB_TXN_WRITE_NOSYNC option. This is just a thin PUBLIC wrapper
+ * for __log_write that is similar to __log_flush_commit. Nothing is
+ * written for in-memory logs or when b_off is 0; b_off resets on success.
+ * Note that the REP->mtx_clientdb should be held when this is called.
+ * Note that we acquire the log region mutex while holding mtx_clientdb.
+ *
+ * PUBLIC: int __log_rep_write __P((ENV *));
+ */
+int
+__log_rep_write(env)
+ ENV *env;
+{
+ DB_LOG *dblp;
+ LOG *lp;
+ int ret;
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ ret = 0;
+ LOG_SYSTEM_LOCK(env);
+ if (!lp->db_log_inmemory && lp->b_off != 0)
+ if ((ret = __log_write(dblp, dblp->bufp,
+ (u_int32_t)lp->b_off)) == 0)
+ lp->b_off = 0;
+ LOG_SYSTEM_UNLOCK(env);
+ return (ret);
+}
+
static int
__log_encrypt_record(env, dbt, hdr, orig)
ENV *env;
@@ -1773,6 +1810,7 @@ __log_put_record_int(env, dbp, txnp, ret_lsnp,
DB_TXNLOGREC *lr;
LOG *lp;
PAGE *pghdrstart;
+ u_int64_t ulltmp;
u_int32_t hdrsize, op, zero, uinttmp, txn_num;
u_int npad;
u_int8_t *bp;
@@ -1819,7 +1857,7 @@ __log_put_record_int(env, dbp, txnp, ret_lsnp,
return (ret);
/*
* We need to assign begin_lsn while holding region mutex.
- * That assignment is done inside the DbEnv->log_put call,
+ * That assignment is done inside the __log_put call,
* so pass in the appropriate memory location to be filled
* in by the log_put code.
*/
@@ -1842,8 +1880,7 @@ __log_put_record_int(env, dbp, txnp, ret_lsnp,
}
if (is_durable || txnp == NULL) {
- if ((ret =
- __os_malloc(env, logrec.size, &logrec.data)) != 0)
+ if ((ret = __os_malloc(env, logrec.size, &logrec.data)) != 0)
return (ret);
} else {
if ((ret = __os_malloc(env,
@@ -1891,10 +1928,15 @@ __log_put_record_int(env, dbp, txnp, ret_lsnp,
LOGCOPY_32(env, bp, &uinttmp);
bp += sizeof(uinttmp);
break;
+ case LOGREC_LONGARG:
+ ulltmp = va_arg(argp, u_int64_t);
+ LOGCOPY_64(env, bp, &ulltmp);
+ bp += sizeof(ulltmp);
+ break;
case LOGREC_OP:
op = va_arg(argp, u_int32_t);
LOGCOPY_32(env, bp, &op);
- bp += sizeof(uinttmp);
+ bp += sizeof(op);
break;
case LOGREC_DBT:
case LOGREC_PGLIST:
diff --git a/src/log/log_stat.c b/src/log/log_stat.c
index 37b74c74..95fe0e2e 100644
--- a/src/log/log_stat.c
+++ b/src/log/log_stat.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/log/log_verify.c b/src/log/log_verify.c
index e7f8f688..2ed2f0f2 100644
--- a/src/log/log_verify.c
+++ b/src/log/log_verify.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -38,6 +38,12 @@ __log_verify_pp(dbenv, lvconfig)
lsnrg = ret = timerg = 0;
phome = NULL;
+ if (lvconfig == NULL) {
+ __db_errx(dbenv->env, DB_STR("2584",
+ "Must provide a configuration structure."));
+ ret = EINVAL;
+ goto err;
+ }
if (!IS_ZERO_LSN(lvconfig->start_lsn) ||
!IS_ZERO_LSN(lvconfig->end_lsn))
lsnrg = 1;
@@ -64,7 +70,8 @@ __log_verify_pp(dbenv, lvconfig)
}
ENV_ENTER(dbenv->env, ip);
- ret = __log_verify(dbenv, lvconfig, ip);
+ REPLICATION_WRAP(dbenv->env,
+ (__log_verify(dbenv, lvconfig, ip)), 0, ret);
ENV_LEAVE(dbenv->env, ip);
err: return (ret);
}
@@ -79,18 +86,16 @@ __log_verify(dbenv, lvconfig, ip)
const DB_LOG_VERIFY_CONFIG *lvconfig;
DB_THREAD_INFO *ip;
{
-
- u_int32_t logcflag, max_fileno;
+ DB_LOG_VRFY_INFO *logvrfy_hdl;
DB_LOGC *logc;
- ENV *env;
- DBT data;
DB_DISTAB dtab;
DB_LSN key, start, start2, stop, stop2, verslsn;
- u_int32_t newversion, version;
+ DBT data;
+ ENV *env;
+ u_int32_t logcflag, max_fileno, newversion, version;
int cmp, fwdscroll, goprev, ret, tret;
time_t starttime, endtime;
const char *okmsg;
- DB_LOG_VRFY_INFO *logvrfy_hdl;
okmsg = NULL;
fwdscroll = 1;
@@ -98,6 +103,7 @@ __log_verify(dbenv, lvconfig, ip)
goprev = 0;
env = dbenv->env;
logc = NULL;
+ logvrfy_hdl = NULL;
memset(&dtab, 0, sizeof(dtab));
memset(&data, 0, sizeof(data));
version = newversion = 0;
@@ -333,11 +339,12 @@ out:
err:
if (logc != NULL)
(void)__logc_close(logc);
- if ((tret = __destroy_log_vrfy_info(logvrfy_hdl)) != 0 && ret == 0)
+ if (logvrfy_hdl != NULL &&
+ (tret = __destroy_log_vrfy_info(logvrfy_hdl)) != 0 && ret == 0)
ret = tret;
- if (dtab.int_dispatch)
+ if (dtab.int_dispatch != NULL)
__os_free(dbenv->env, dtab.int_dispatch);
- if (dtab.ext_dispatch)
+ if (dtab.ext_dispatch != NULL)
__os_free(dbenv->env, dtab.ext_dispatch);
return (ret);
diff --git a/src/log/log_verify_auto.c b/src/log/log_verify_auto.c
index 08bc5d64..de08998d 100644
--- a/src/log/log_verify_auto.c
+++ b/src/log/log_verify_auto.c
@@ -174,6 +174,9 @@ __fop_init_verify(env, dtabp)
__fop_write_verify, DB___fop_write)) != 0)
return (ret);
if ((ret = __db_add_recovery_int(env, dtabp,
+ __fop_write_file_verify, DB___fop_write_file)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
__fop_rename_verify, DB___fop_rename)) != 0)
return (ret);
if ((ret = __db_add_recovery_int(env, dtabp,
diff --git a/src/log/log_verify_int.c b/src/log/log_verify_int.c
index abe564c6..f69f01c0 100644
--- a/src/log/log_verify_int.c
+++ b/src/log/log_verify_int.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -593,7 +593,7 @@ __crdel_metasub_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -628,7 +628,7 @@ __crdel_inmem_create_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret = __crdel_inmem_create_read(env, dbtp->data, &argp)) != 0)
@@ -661,7 +661,7 @@ __crdel_inmem_rename_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret = __crdel_inmem_rename_read(env, dbtp->data, &argp)) != 0)
@@ -694,7 +694,7 @@ __crdel_inmem_remove_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret = __crdel_inmem_remove_read(env, dbtp->data, &argp)) != 0)
@@ -727,7 +727,7 @@ __db_addrem_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -762,7 +762,7 @@ __db_big_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -797,7 +797,7 @@ __db_ovref_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -832,7 +832,7 @@ __db_relink_42_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -864,7 +864,7 @@ __db_debug_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret = __db_debug_read(env, dbtp->data, &argp)) != 0)
@@ -897,7 +897,7 @@ __db_noop_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -931,7 +931,7 @@ __db_pg_alloc_42_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -963,7 +963,7 @@ __db_pg_alloc_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -998,7 +998,7 @@ __db_pg_free_42_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -1030,7 +1030,7 @@ __db_pg_free_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -1065,7 +1065,7 @@ __db_cksum_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret = __db_cksum_read(env, dbtp->data, &argp)) != 0)
@@ -1098,7 +1098,7 @@ __db_pg_freedata_42_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -1130,7 +1130,7 @@ __db_pg_freedata_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -1165,7 +1165,7 @@ __db_pg_init_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -1200,7 +1200,7 @@ __db_pg_sort_44_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -1232,7 +1232,7 @@ __db_pg_trunc_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -1264,7 +1264,7 @@ __db_realloc_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -1299,7 +1299,7 @@ __db_relink_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -1334,7 +1334,7 @@ __db_merge_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -1369,7 +1369,7 @@ __db_pgno_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -1515,7 +1515,7 @@ __dbreg_register_verify(env, dbtp, lsnp, notused2, lvhp)
opcode = 0;
ret = ret2 = rmv_dblife = 0;
puid = NULL;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
fregp = NULL;
pflife = NULL;
@@ -1749,6 +1749,36 @@ err:
}
/*
+ * PUBLIC: int __dbreg_register_42_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__dbreg_register_42_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __dbreg_register_42_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+ /* argp is unset if the read fails: return, do not free it via err. */
+ if ((ret = __dbreg_register_42_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ ON_NOT_SUPPORTED(env, lvh, *lsnp, argp->type);
+ /* Disabled: LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid); */
+
+err:
+ __os_free(env, argp);
+ return (ret);
+}
+
+/*
* PUBLIC: int __bam_split_verify __P((ENV *, DBT *, DB_LSN *,
* PUBLIC: db_recops, void *));
*/
@@ -1764,7 +1794,7 @@ __bam_split_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -1804,7 +1834,7 @@ __bam_split_42_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -1836,7 +1866,7 @@ __bam_rsplit_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -1873,7 +1903,7 @@ __bam_adj_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -1910,7 +1940,7 @@ __bam_irep_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -1947,7 +1977,7 @@ __bam_cadjust_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -1984,7 +2014,7 @@ __bam_cdel_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -2021,7 +2051,7 @@ __bam_repl_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -2058,7 +2088,7 @@ __bam_root_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -2093,7 +2123,7 @@ __bam_curadj_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -2129,7 +2159,7 @@ __bam_rcuradj_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -2165,7 +2195,7 @@ __bam_relink_43_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -2197,7 +2227,7 @@ __bam_merge_44_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -2229,7 +2259,7 @@ __fop_create_42_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret = __fop_create_42_read(env, dbtp->data, &argp)) != 0)
@@ -2245,6 +2275,37 @@ err:
}
/*
+ * PUBLIC: int __fop_create_60_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__fop_create_60_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __fop_create_60_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+ /* Decode the record; on failure return without freeing argp. */
+ if ((ret = __fop_create_60_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ ON_NOT_SUPPORTED(env, lvh, *lsnp, argp->type);
+ /* Disabled: LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID); */
+err:
+ /* Release the decoded record on every path that reaches here. */
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
* PUBLIC: int __fop_create_verify __P((ENV *, DBT *, DB_LSN *,
* PUBLIC: db_recops, void *));
*/
@@ -2260,7 +2321,7 @@ __fop_create_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret = __fop_create_read(env, dbtp->data, &argp)) != 0)
@@ -2278,6 +2339,38 @@ err:
}
/*
+ * PUBLIC: int __fop_remove_60_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__fop_remove_60_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __fop_remove_60_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+ /* Decode the record; on failure return without freeing argp. */
+ if ((ret = __fop_remove_60_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ ON_NOT_SUPPORTED(env, lvh, *lsnp, argp->type);
+ /* Disabled: LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID); */
+
+err:
+ /* Release the decoded record on every path that reaches here. */
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
* PUBLIC: int __fop_remove_verify __P((ENV *, DBT *, DB_LSN *,
* PUBLIC: db_recops, void *));
*/
@@ -2293,7 +2386,7 @@ __fop_remove_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret = __fop_remove_read(env, dbtp->data, &argp)) != 0)
@@ -2326,7 +2419,7 @@ __fop_write_42_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret = __fop_write_42_read(env, dbtp->data, &argp)) != 0)
@@ -2341,6 +2434,36 @@ err:
}
/*
+ * PUBLIC: int __fop_write_60_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__fop_write_60_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __fop_write_60_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+ /* Decode the record; on failure return without freeing argp. */
+ if ((ret = __fop_write_60_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ ON_NOT_SUPPORTED(env, lvh, *lsnp, argp->type);
+ /* Disabled: LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID); */
+err:
+ /* Release the decoded record on every path that reaches here. */
+ __os_free(env, argp);
+ return (ret);
+}
+
+/*
* PUBLIC: int __fop_write_verify __P((ENV *, DBT *, DB_LSN *,
* PUBLIC: db_recops, void *));
*/
@@ -2356,7 +2479,7 @@ __fop_write_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret = __fop_write_read(env, dbtp->data, &argp)) != 0)
@@ -2373,6 +2496,67 @@ err:
}
/*
+ * PUBLIC: int __fop_write_file_60_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__fop_write_file_60_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __fop_write_file_60_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+ /* Decode the record; on failure return without freeing argp. */
+ if ((ret = __fop_write_file_60_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ ON_NOT_SUPPORTED(env, lvh, *lsnp, argp->type);
+ /* Disabled: LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID); */
+err:
+ __os_free(env, argp);
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __fop_write_file_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__fop_write_file_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __fop_write_file_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+ /* Decode the record; on failure return without freeing argp. */
+ if ((ret = __fop_write_file_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID);
+ ON_PAGE_UPDATE4 /* No pages are locked by txns. */
+out:
+
+err:
+ /* Both labels fall through to the common cleanup below. */
+ __os_free(env, argp);
+ return (ret);
+}
+
+/*
* PUBLIC: int __fop_rename_42_verify __P((ENV *, DBT *, DB_LSN *,
* PUBLIC: db_recops, void *));
*/
@@ -2388,7 +2572,7 @@ __fop_rename_42_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret = __fop_rename_42_read(env, dbtp->data, &argp)) != 0)
@@ -2404,6 +2588,37 @@ err:
}
/*
+ * __fop_rename_60_verify --
+ * Log verification callback for __fop_rename_60 records.
+ *
+ * Decodes the record held in dbtp->data with __fop_rename_60_read() and
+ * passes its type to ON_NOT_SUPPORTED. No further processing is done:
+ * the LOG_VRFY_PROC call in the body is commented out. The decoded
+ * argp is released with __os_free() before returning.
+ *
+ * env: environment handle, used for decoding and freeing.
+ * dbtp: DBT whose data field holds the raw log record.
+ * lsnp: LSN of the record, forwarded to ON_NOT_SUPPORTED.
+ * notused2: ignored; only quieted with COMPQUIET.
+ * lvhp: opaque handle, cast to DB_LOG_VRFY_INFO *.
+ *
+ * Returns the decode error if the read fails (argp is not freed on that
+ * path); otherwise whatever ON_NOT_SUPPORTED leaves in ret. The macro
+ * is not visible here -- presumably it sets ret and may jump to err;
+ * confirm against its definition.
+ *
+ * PUBLIC: int __fop_rename_60_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__fop_rename_60_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __fop_rename_60_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret = __fop_rename_60_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ ON_NOT_SUPPORTED(env, lvh, *lsnp, argp->type);
+ /* LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID); */
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
* PUBLIC: int __fop_rename_verify __P((ENV *, DBT *, DB_LSN *,
* PUBLIC: db_recops, void *));
*/
@@ -2423,7 +2638,7 @@ __fop_rename_verify(env, dbtp, lsnp, notused2, lvhp)
VRFY_FILEREG_INFO freg, *fregp;
memset(&freg, 0, sizeof(freg));
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
buf = NULL;
@@ -2470,6 +2685,38 @@ err:
}
/*
+ * __fop_file_remove_60_verify --
+ * Log verification callback for __fop_file_remove_60 records.
+ *
+ * Decodes the record held in dbtp->data with
+ * __fop_file_remove_60_read() and passes its type to ON_NOT_SUPPORTED.
+ * No further processing is done: the LOG_VRFY_PROC call in the body is
+ * commented out. The decoded argp is released with __os_free() before
+ * returning.
+ *
+ * env: environment handle, used for decoding and freeing.
+ * dbtp: DBT whose data field holds the raw log record.
+ * lsnp: LSN of the record, forwarded to ON_NOT_SUPPORTED.
+ * notused2: ignored; only quieted with COMPQUIET.
+ * lvhp: opaque handle, cast to DB_LOG_VRFY_INFO *.
+ *
+ * Returns the decode error if the read fails (argp is not freed on that
+ * path); otherwise whatever ON_NOT_SUPPORTED leaves in ret. The macro
+ * is not visible here -- presumably it sets ret and may jump to err;
+ * confirm against its definition.
+ *
+ * PUBLIC: int __fop_file_remove_60_verify __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__fop_file_remove_60_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __fop_file_remove_60_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret = __fop_file_remove_60_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ ON_NOT_SUPPORTED(env, lvh, *lsnp, argp->type);
+ /* LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID); */
+
+err:
+
+ __os_free(env, argp);
+
+ return (ret);
+}
+
+/*
* PUBLIC: int __fop_file_remove_verify __P((ENV *, DBT *, DB_LSN *,
* PUBLIC: db_recops, void *));
*/
@@ -2485,7 +2732,7 @@ __fop_file_remove_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret = __fop_file_remove_read(env, dbtp->data, &argp)) != 0)
@@ -2519,7 +2766,7 @@ __ham_insdel_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -2555,7 +2802,7 @@ __ham_newpage_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -2592,7 +2839,7 @@ __ham_splitdata_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -2629,7 +2876,7 @@ __ham_replace_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -2667,7 +2914,7 @@ __ham_copypage_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -2703,7 +2950,7 @@ __ham_metagroup_42_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -2735,7 +2982,7 @@ __ham_metagroup_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -2771,7 +3018,7 @@ __ham_groupalloc_42_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -2807,7 +3054,7 @@ __ham_groupalloc_verify(env, dbtp, lsnp, notused2, lvhp)
ret = 0;
pflife = NULL;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -2863,7 +3110,7 @@ __ham_changeslot_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -2900,7 +3147,7 @@ __ham_contract_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -2936,7 +3183,7 @@ __ham_curadj_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -2973,7 +3220,7 @@ __ham_chgpg_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -3011,7 +3258,7 @@ __heap_addrem_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -3030,6 +3277,40 @@ err:
}
/*
+ * __heap_addrem_60_verify --
+ * Log verification callback for __heap_addrem_60 records.
+ *
+ * Decodes the record held in dbtp->data with __heap_addrem_60_read()
+ * (called with NULL for its second and third arguments), runs
+ * LOG_VRFY_PROC with the record's fileid and ON_PAGE_UPDATE with its
+ * pgno, then reports the record to __lv_on_heap_log(). The decoded
+ * argp is released with __os_free() before returning.
+ *
+ * env: environment handle, used for decoding and freeing.
+ * dbtp: DBT whose data field holds the raw log record.
+ * lsnp: LSN of the record, forwarded to the macros and helper.
+ * notused2: ignored; only quieted with COMPQUIET.
+ * lvhp: opaque handle, cast to DB_LOG_VRFY_INFO *.
+ *
+ * Returns the decode error if the read fails (argp is not freed on that
+ * path); otherwise the value left in ret by the macros or by
+ * __lv_on_heap_log(). The out label is not referenced explicitly in
+ * this function -- presumably it is a jump target of the macros;
+ * confirm against the macro definitions.
+ *
+ * PUBLIC: int __heap_addrem_60_verify
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__heap_addrem_60_verify(env, dbtp, lsnp, notused2, lvhp)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *lvhp;
+{
+ __heap_addrem_60_args *argp;
+ DB_LOG_VRFY_INFO *lvh;
+ int ret;
+
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
+ lvh = (DB_LOG_VRFY_INFO *)lvhp;
+
+ if ((ret =
+ __heap_addrem_60_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+
+ LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid);
+ ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno);
+ if ((ret = __lv_on_heap_log(lvh, *lsnp, argp->fileid)) != 0)
+ goto err;
+out:
+
+err:
+ __os_free(env, argp);
+ return (ret);
+}
+
+/*
* PUBLIC: int __heap_pg_alloc_verify
* PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
*/
@@ -3045,7 +3326,7 @@ __heap_pg_alloc_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -3060,7 +3341,7 @@ out:
err:
__os_free(env, argp);
- return (ret);
+ return (ret);
}
/*
@@ -3079,7 +3360,7 @@ __heap_trunc_meta_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -3095,7 +3376,7 @@ out:
err:
__os_free(env, argp);
- return (ret);
+ return (ret);
}
/*
@@ -3114,7 +3395,7 @@ __heap_trunc_page_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -3150,7 +3431,7 @@ __qam_incfirst_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -3186,7 +3467,7 @@ __qam_mvptr_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -3222,7 +3503,7 @@ __qam_del_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -3258,7 +3539,7 @@ __qam_add_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -3294,7 +3575,7 @@ __qam_delext_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret =
@@ -3331,7 +3612,7 @@ __txn_regop_42_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret = __txn_regop_42_read(env, dbtp->data, &argp)) != 0)
@@ -3365,7 +3646,7 @@ __txn_regop_verify(env, dbtp, lsnp, notused2, lvhp)
VRFY_TIMESTAMP_INFO tsinfo;
ptvi = pptvi = NULL;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
ret = ret2 = started = 0;
@@ -3480,7 +3761,7 @@ __txn_ckp_42_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret = __txn_ckp_42_read(env, dbtp->data, &argp)) != 0)
@@ -3517,7 +3798,7 @@ __txn_ckp_verify(env, dbtp, lsnp, notused2, lvhp)
time_t ckp_time, lastckp_time;
lastckp = NULL;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
memset(&ckpinfo, 0, sizeof(ckpinfo));
memset(&cvp, 0, sizeof(cvp));
@@ -3675,7 +3956,7 @@ __txn_child_verify(env, dbtp, lsnp, notused2, lvhp)
* we never know the T0 has an active child txn T1, all child txns
* we know are committed.
*/
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
ptvi = ptvi2 = NULL;
ret = ret2 = started = 0;
@@ -3811,7 +4092,7 @@ __txn_xa_regop_42_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret = __txn_xa_regop_42_read(env, dbtp->data, &argp)) != 0)
@@ -3844,7 +4125,7 @@ __txn_prepare_verify(env, dbtp, lsnp, notused2, lvhp)
ret = ret2 = started = 0;
ptvi = NULL;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
if ((ret = __txn_prepare_read(env, dbtp->data, &argp)) != 0)
@@ -3924,7 +4205,7 @@ __txn_recycle_verify(env, dbtp, lsnp, notused2, lvhp)
DB_LOG_VRFY_INFO *lvh;
int ret;
- notused2 = DB_TXN_LOG_VERIFY;
+ COMPQUIET(notused2, DB_TXN_LOG_VERIFY);
lvh = (DB_LOG_VRFY_INFO *)lvhp;
ret = 0;
diff --git a/src/log/log_verify_stub.c b/src/log/log_verify_stub.c
index e6589a50..fdd9a795 100644
--- a/src/log/log_verify_stub.c
+++ b/src/log/log_verify_stub.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/log/log_verify_util.c b/src/log/log_verify_util.c
index 88682921..b0cfe0cb 100644
--- a/src/log/log_verify_util.c
+++ b/src/log/log_verify_util.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -53,16 +53,16 @@
} \
} while (0)
-typedef int (*btcmp_funct)(DB *, const DBT *, const DBT *);
-typedef int (*dupcmp_funct)(DB *, const DBT *, const DBT *);
+typedef int (*btcmp_funct)(DB *, const DBT *, const DBT *, size_t *);
+typedef int (*dupcmp_funct)(DB *, const DBT *, const DBT *, size_t *);
static int __lv_add_recycle_handler __P((
DB_LOG_VRFY_INFO *, VRFY_TXN_INFO *, void *));
static int __lv_add_recycle_lsn __P((VRFY_TXN_INFO *, const DB_LSN *));
static size_t __lv_dbt_arrsz __P((const DBT *, u_int32_t));
-static int __lv_fidpgno_cmp __P((DB *, const DBT *, const DBT *));
-static int __lv_i32_cmp __P((DB *, const DBT *, const DBT *));
-static int __lv_lsn_cmp __P((DB *, const DBT *, const DBT *));
+static int __lv_fidpgno_cmp __P((DB *, const DBT *, const DBT *, size_t *));
+static int __lv_i32_cmp __P((DB *, const DBT *, const DBT *, size_t *));
+static int __lv_lsn_cmp __P((DB *, const DBT *, const DBT *, size_t *));
static void __lv_on_bdbop_err __P((int));
static int __lv_open_db __P((DB_ENV *, DB **, DB_THREAD_INFO *,
const char *, int, btcmp_funct, u_int32_t, dupcmp_funct));
@@ -73,8 +73,8 @@ static int __lv_seccbk_fname __P((DB *, const DBT *, const DBT *, DBT *));
static int __lv_seccbk_lsn __P((DB *, const DBT *, const DBT *, DBT *));
static int __lv_seccbk_txnpg __P((DB *, const DBT *, const DBT *, DBT *));
static void __lv_setup_logtype_names __P((DB_LOG_VRFY_INFO *lvinfo));
-static int __lv_txnrgns_lsn_cmp __P((DB *, const DBT *, const DBT *));
-static int __lv_ui32_cmp __P((DB *, const DBT *, const DBT *));
+static int __lv_txnrgns_lsn_cmp __P((DB *, const DBT *, const DBT *, size_t *));
+static int __lv_ui32_cmp __P((DB *, const DBT *, const DBT *, size_t *));
static int __lv_unpack_txn_vrfy_info __P((VRFY_TXN_INFO **, const DBT *));
static int __lv_unpack_filereg __P((const DBT *, VRFY_FILEREG_INFO **));
@@ -383,16 +383,18 @@ err:
/* Btree compare function for a [fileid, pgno] key. */
static int
-__lv_fidpgno_cmp(db, dbt1, dbt2)
+__lv_fidpgno_cmp(db, dbt1, dbt2, locp)
DB *db;
const DBT *dbt1;
const DBT *dbt2;
+ size_t * locp;
{
db_pgno_t pgno1, pgno2;
int ret;
size_t len;
COMPQUIET(db, NULL);
+ COMPQUIET(locp, NULL);
len = DB_FILE_ID_LEN;
ret = memcmp(dbt1->data, dbt2->data, len);
if (ret == 0) {
@@ -408,14 +410,16 @@ __lv_fidpgno_cmp(db, dbt1, dbt2)
/* Btree compare function for a int32_t type of key. */
static int
-__lv_i32_cmp(db, dbt1, dbt2)
+__lv_i32_cmp(db, dbt1, dbt2, locp)
DB *db;
const DBT *dbt1;
const DBT *dbt2;
+ size_t *locp;
{
int32_t k1, k2;
COMPQUIET(db, NULL);
+ COMPQUIET(locp, NULL);
memcpy(&k1, dbt1->data, sizeof(k1));
memcpy(&k2, dbt2->data, sizeof(k2));
@@ -424,14 +428,16 @@ __lv_i32_cmp(db, dbt1, dbt2)
/* Btree compare function for a u_int32_t type of key. */
static int
-__lv_ui32_cmp(db, dbt1, dbt2)
+__lv_ui32_cmp(db, dbt1, dbt2, locp)
DB *db;
const DBT *dbt1;
const DBT *dbt2;
+ size_t *locp;
{
u_int32_t k1, k2;
COMPQUIET(db, NULL);
+ COMPQUIET(locp, NULL);
memcpy(&k1, dbt1->data, sizeof(k1));
memcpy(&k2, dbt2->data, sizeof(k2));
@@ -440,18 +446,21 @@ __lv_ui32_cmp(db, dbt1, dbt2)
/* Btree compare function for a DB_LSN type of key. */
static int
-__lv_lsn_cmp(db, dbt1, dbt2)
+__lv_lsn_cmp(db, dbt1, dbt2, locp)
DB *db;
const DBT *dbt1;
const DBT *dbt2;
+ size_t *locp;
{
DB_LSN lsn1, lsn2;
+ COMPQUIET(locp, NULL);
DB_ASSERT(db->env, dbt1->size == sizeof(DB_LSN));
DB_ASSERT(db->env, dbt2->size == sizeof(DB_LSN));
memcpy(&lsn1, dbt1->data, sizeof(DB_LSN));
memcpy(&lsn2, dbt2->data, sizeof(DB_LSN));
+ COMPQUIET(db, NULL);
return (LOG_COMPARE(&lsn1, &lsn2));
}
@@ -1663,17 +1672,21 @@ int __put_timestamp_info (lvinfo, tsinfo)
}
static int
-__lv_txnrgns_lsn_cmp (db, d1, d2)
+__lv_txnrgns_lsn_cmp (db, d1, d2, locp)
DB *db;
const DBT *d1, *d2;
+ size_t *locp;
{
struct __lv_txnrange r1, r2;
+ COMPQUIET(locp, NULL);
+
DB_ASSERT(db->env, d1->size == sizeof(r1));
DB_ASSERT(db->env, d2->size == sizeof(r2));
memcpy(&r1, d1->data, d1->size);
memcpy(&r2, d2->data, d2->size);
+ COMPQUIET(db, NULL);
return (LOG_COMPARE(&(r1.end), &(r2.end)));
}
diff --git a/src/mp/mp_alloc.c b/src/mp/mp_alloc.c
index dc331215..011f54c6 100644
--- a/src/mp/mp_alloc.c
+++ b/src/mp/mp_alloc.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -22,8 +22,112 @@
#endif
/*
+ * __memp_bh_unreachable --
+ *
+ * Determine whether this buffer can never be seen again: is the next
+ * newer version visible to the same transaction which sees this one?
+ * If both versions are visible to the same transaction, there is no
+ * reason to keep the older one: it can be purged.
+ *
+ * If this buffer has a more recent version, and there is a transaction
+ * with a read_lsn between this buffer's and that more recent version's,
+ * the buffer is visible to at least that transaction, so return FALSE.
+ * Otherwise return TRUE.
+ *
+ * Example (each txn is listed under the version it sees):
+ *
+ *   txns:           3/10                2/10                2/5, 2/1  1/10
+ *   vers: 3/15      2/15      2/14      2/10      2/8       1/150
+ *         vis       vis       unreach   vis       unreach   vis
+ *   who   new txns  3/10                2/10                2/5, 2/1
+ *   sees
+ *
+ * Note: in the above example, the page was allocated after txn 1/10
+ * started. 1/10 would not see any version of the page.
+ *
+ * env: environment handle.
+ * bhp: the buffer header being considered for purging.
+ * snapshots: array of reader LSNs to test against. The search loop
+ * stops at the first entry older than bhp's version, so
+ * the array is assumed to be ordered newest first --
+ * confirm against the code that builds it.
+ * n_snapshots: number of entries in snapshots.
+ *
+ * In DIAGNOSTIC builds with DB_VERB_MVCC set, the decision, both
+ * visibility LSNs, the index at which the search stopped and the whole
+ * snapshot list are written to the message stream.
+ *
+ * PUBLIC: int __memp_bh_unreachable __P((ENV *, BH *, DB_LSN *, int));
+ */
+int
+__memp_bh_unreachable(env, bhp, snapshots, n_snapshots)
+ ENV *env;
+ BH *bhp;
+ DB_LSN *snapshots;
+ int n_snapshots;
+{
+ BH *newer_bhp;
+ DB_LSN b_vlsn, n_vlsn;
+ int i, ret;
+#ifdef DIAGNOSTIC
+ DB_MPOOL *dbmp;
+ DB_MSGBUF mb;
+ MPOOLFILE *bh_mfp;
+#endif
+
+ /*
+ * The buffer can't be purged if it is being used (non-zero refcount),
+ * or is the most recent version (no next entry on the version chain),
+ * or the next newer version isn't a copy yet (its td_off is
+ * INVALID_ROFF).
+ */
+ if (BH_REFCOUNT(bhp) != 0 ||
+ (newer_bhp = SH_CHAIN_NEXT(bhp, vc, __bh)) == NULL ||
+ newer_bhp->td_off == INVALID_ROFF)
+ return (FALSE);
+
+ /*
+ * Find the visibility LSNs for this buffer (b_vlsn) and the more recent,
+ * newer buffer (n_vlsn). If the newer version hasn't committed yet
+ * (its visibility LSN is still the maximum LSN) the bhp could be
+ * needed. A bhp with no td_off has b_vlsn set by INIT_LSN instead.
+ */
+ n_vlsn = *VISIBLE_LSN(env, newer_bhp);
+ if (IS_MAX_LSN(n_vlsn))
+ return (FALSE);
+ if (bhp->td_off == INVALID_ROFF)
+ INIT_LSN(b_vlsn);
+ else
+ b_vlsn = *VISIBLE_LSN(env, bhp);
+
+ ret = TRUE;
+ /*
+ * Look for a transaction which is between n_vlsn and b_vlsn -
+ * determining that bhp is reachable. Stop looking once the
+ * transactions get so small (old) that they precede the buffer's
+ * version; no earlier txn could be between n_vlsn and b_vlsn.
+ */
+ for (i = 0;
+ i < n_snapshots && LOG_COMPARE(&snapshots[i], &b_vlsn) >= 0;
+ i++) {
+ if (LOG_COMPARE(&snapshots[i], &n_vlsn) < 0) {
+ /*
+ * This txn can see (started after) bhp, but not
+ * newer_bhp (which committed after this txn started).
+ */
+ ret = FALSE;
+ break;
+ }
+ }
+
+#ifdef DIAGNOSTIC
+ if (FLD_ISSET(env->dbenv->verbose, DB_VERB_MVCC)) {
+ dbmp = env->mp_handle;
+ bh_mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
+ DB_MSGBUF_INIT(&mb);
+ __db_msgadd(env, &mb,
+ "bh_unreachable %s pgno %d %s %lu/%lu %x newer %lu/%lu txn #%d in\n",
+ __memp_fns(dbmp, bh_mfp), bhp->pgno,
+ ret ? "purgeable" : "needed",
+ (u_long)b_vlsn.file, (u_long)b_vlsn.offset, bhp->flags,
+ (u_long)n_vlsn.file, (u_long)n_vlsn.offset, i);
+ for (i = 0; i != n_snapshots; i++)
+ __db_msgadd(env, &mb, " %lu/%lu",
+ (u_long)snapshots[i].file,
+ (u_long)snapshots[i].offset);
+ DB_MSGBUF_FLUSH(env, &mb);
+ }
+#endif
+ return (ret);
+}
+
+/*
* __memp_alloc --
- * Allocate some space from a cache region.
+ * Allocate some space from a cache region. If the region is full then
+ * reuse one or more cache buffers.
*
* PUBLIC: int __memp_alloc __P((DB_MPOOL *,
* PUBLIC: REGINFO *, MPOOLFILE *, size_t, roff_t *, void *));
@@ -39,7 +143,7 @@ __memp_alloc(dbmp, infop, mfp, len, offsetp, retp)
{
BH *bhp, *current_bhp, *mvcc_bhp, *oldest_bhp;
BH_FROZEN_PAGE *frozen_bhp;
- DB_LSN oldest_reader, vlsn;
+ DB_LSN *snapshots, vlsn;
DB_MPOOL_HASH *dbht, *hp, *hp_end, *hp_saved, *hp_tmp;
ENV *env;
MPOOL *c_mp;
@@ -49,7 +153,7 @@ __memp_alloc(dbmp, infop, mfp, len, offsetp, retp)
u_int32_t dirty_eviction, high_priority, priority, versions;
u_int32_t priority_saved, put_counter, lru_generation, total_buckets;
int aggressive, alloc_freeze, b_lock, giveup;
- int h_locked, need_free, obsolete, ret, write_error;
+ int h_locked, need_free, n_snapshots, obsolete, ret, write_error;
u_int8_t *endp;
void *p;
@@ -58,11 +162,10 @@ __memp_alloc(dbmp, infop, mfp, len, offsetp, retp)
dbht = R_ADDR(infop, c_mp->htab);
hp_end = &dbht[c_mp->htab_buckets];
hp_saved = NULL;
- priority_saved = 0;
- write_error = 0;
-
+ snapshots = NULL;
+ priority_saved = write_error = 0;
buckets = buffers = put_counter = total_buckets = versions = 0;
- aggressive = alloc_freeze = giveup = h_locked = 0;
+ aggressive = alloc_freeze = giveup = h_locked = n_snapshots = 0;
/*
* If we're allocating a buffer, and the one we're discarding is the
@@ -138,13 +241,15 @@ found: if (offsetp != NULL)
c_mp->stat.st_alloc_pages, buffers, infop->id);
}
#endif
- return (0);
+ goto done;
} else if (giveup || c_mp->pages == 0) {
MPOOL_REGION_UNLOCK(env, infop);
__db_errx(env, DB_STR("3017",
"unable to allocate space from the buffer cache"));
- return ((ret == ENOMEM && write_error != 0) ? EIO : ret);
+ if (ret == ENOMEM && write_error != 0)
+ ret = EIO;
+ goto done;
}
search:
@@ -158,7 +263,6 @@ search:
lru_generation = c_mp->lru_generation;
ret = 0;
- MAX_LSN(oldest_reader);
/*
* We re-attempt the allocation every time we've freed 3 times what
@@ -222,6 +326,13 @@ search:
goto alloc;
MPOOL_REGION_UNLOCK(env, infop);
+ /* Refresh the list of mvcc reader transactions. */
+ if (snapshots != NULL)
+ __os_free(env, snapshots);
+ if ((ret = __txn_get_readers(
+ env, &snapshots, &n_snapshots)) != 0)
+ goto err;
+
aggressive++;
/*
* Once aggressive, we consider all buffers. By setting
@@ -266,13 +377,6 @@ search:
if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
continue;
- /* Set aggressive if we have already searched for too long. */
- if (aggressive == 0 && buckets >= MPOOL_ALLOC_SEARCH_LIMIT) {
- aggressive = 1;
- /* Once aggressive, we consider all buffers. */
- high_priority = MPOOL_LRU_MAX;
- }
-
/* Unlock the region and lock the hash bucket. */
MPOOL_REGION_UNLOCK(env, infop);
MUTEX_READLOCK(env, hp->mtx_hash);
@@ -280,29 +384,45 @@ search:
b_lock = 0;
/*
+ * Set aggressive to consider all buffers if we have already
+ * searched in too many buckets.
+ */
+ if (buckets > MPOOL_ALLOC_SEARCH_LIMIT && aggressive == 0) {
+ aggressive = 1;
+ /* Once aggressive, we consider all buffers. */
+ high_priority = MPOOL_LRU_MAX;
+ if (snapshots == NULL && (ret = __txn_get_readers(
+ env, &snapshots, &n_snapshots)) != 0)
+ goto err;
+ }
+
+ /*
* Find a buffer we can use.
+ * Skip over refcount > 0 buffers; we can't get rid of them.
*
- * We use the lowest-LRU singleton buffer if we find one and
- * it's better than the result of another hash bucket we've
+ * Without MVCC we use the lowest-LRU singleton buffer we find
+ * that's better than the result of another hash bucket we've
* reviewed. We do not use a buffer which has a priority
* greater than high_priority unless we are being aggressive.
*
- * With MVCC buffers, the situation is more complicated: we
- * don't want to free a buffer out of the middle of an MVCC
- * chain, since that requires I/O. So, walk the buffers,
- * looking for an obsolete buffer at the end of an MVCC chain.
- * Once a buffer becomes obsolete, its LRU priority is
- * irrelevant because that version can never be accessed again.
+ * MVCC requires looking at additional factors: we don't want to
+ * free a still-relevant buffer out of the middle of an MVCC
+ * chain, since that requires freezing - lots of I/O. So,
+ * walk the buffers, looking for an obsolete buffer at the
+ * end of the MVCC chain. Once a buffer becomes obsolete, its
+ * LRU priority is irrelevant because that version can never
+ * be accessed again.
*
* If we don't find any obsolete MVCC buffers, we will get
* aggressive, and in that case consider the lowest priority
* buffer within a chain.
- *
- * Ignore referenced buffers, we can't get rid of them.
*/
retry_search: bhp = NULL;
bucket_priority = high_priority;
obsolete = 0;
+ if (n_snapshots > 0 && LOG_COMPARE(&snapshots[n_snapshots - 1],
+ &hp->old_reader) > 0)
+ hp->old_reader = snapshots[n_snapshots - 1];
SH_TAILQ_FOREACH(current_bhp, &hp->hash_bucket, hq, __bh) {
/*
* First, do the standard LRU check for singletons.
@@ -340,55 +460,63 @@ retry_search: bhp = NULL;
mvcc_bhp != NULL;
oldest_bhp = mvcc_bhp,
mvcc_bhp = SH_CHAIN_PREV(mvcc_bhp, vc, __bh)) {
+ DB_ASSERT(env, mvcc_bhp !=
+ SH_CHAIN_PREV(mvcc_bhp, vc, __bh));
#ifdef MPOOL_ALLOC_SEARCH_DYN
if (aggressive == 0 &&
- ++high_priority >= c_mp->lru_priority)
+ ++high_priority >= c_mp->lru_priority) {
aggressive = 1;
+ if (snapshots == NULL && (ret =
+ __txn_readers(env,
+ &snapshots, &n_snapshots)) != 0)
+ goto err;
+ }
#endif
- DB_ASSERT(env, mvcc_bhp !=
- SH_CHAIN_PREV(mvcc_bhp, vc, __bh));
- if ((aggressive < 2 &&
- ++versions < (buffers >> 2)) ||
- BH_REFCOUNT(mvcc_bhp) != 0)
+ if (n_snapshots > 0 &&
+ __memp_bh_unreachable(env,
+ mvcc_bhp, snapshots, n_snapshots)) {
+ oldest_bhp = mvcc_bhp;
+ goto is_obsolete;
+ }
+ if (bhp != NULL &&
+ mvcc_bhp->priority >= bhp->priority)
+ continue;
+ if (BH_REFCOUNT(mvcc_bhp) != 0)
+ continue;
+ /*
+ * Since taking still-relevant versions requires
+ * freezing, skip over them at low aggression
+ * levels unless we see that a high proportion
+ * of buffers (over 1/4) are MVCC copies.
+ */
+ if (aggressive < 2 &&
+ ++versions < (buffers >> 2))
continue;
buffers++;
- if (!F_ISSET(mvcc_bhp, BH_FROZEN) &&
- (bhp == NULL ||
- bhp->priority > mvcc_bhp->priority)) {
- if (bhp != NULL)
- atomic_dec(env, &bhp->ref);
- bhp = mvcc_bhp;
- atomic_inc(env, &bhp->ref);
- }
+ if (F_ISSET(mvcc_bhp, BH_FROZEN))
+ continue;
+ /*
+ * Select mvcc_bhp as current best candidate,
+ * releasing the current candidate, if any.
+ */
+ if (bhp != NULL)
+ atomic_dec(env, &bhp->ref);
+ bhp = mvcc_bhp;
+ atomic_inc(env, &bhp->ref);
}
/*
* oldest_bhp is the last buffer on the MVCC chain, and
* an obsolete buffer at the end of the MVCC chain gets
- * used without further search. Before checking for
- * obsolescence, update the cached oldest reader LSN in
- * the bucket if it is older than call's oldest_reader.
+ * used without further search.
*/
if (BH_REFCOUNT(oldest_bhp) != 0)
continue;
- if (LOG_COMPARE(&oldest_reader, &hp->old_reader) > 0) {
- if (IS_MAX_LSN(oldest_reader) &&
- (ret = __txn_oldest_reader(
- env, &oldest_reader)) != 0) {
- MUTEX_UNLOCK(env, hp->mtx_hash);
- if (bhp != NULL)
- atomic_dec(env, &bhp->ref);
- return (ret);
- }
- if (LOG_COMPARE(&oldest_reader,
- &hp->old_reader) > 0)
- hp->old_reader = oldest_reader;
- }
-
if (BH_OBSOLETE(oldest_bhp, hp->old_reader, vlsn)) {
if (aggressive < 2)
buffers++;
+is_obsolete:
obsolete = 1;
if (bhp != NULL)
atomic_dec(env, &bhp->ref);
@@ -410,10 +538,18 @@ retry_search: bhp = NULL;
/*
* Compare two hash buckets and select the one with the lower
- * priority. Performance testing showed looking at two improves
- * the LRU-ness and looking at more only does a little better.
+ * priority, except mvcc at high aggression levels. Performance
+ * testing shows looking at two improves the LRU-ness and
+ * looking at more only does a little better.
*/
if (hp_saved == NULL) {
+ /*
+ * At high aggressive levels when mvcc is active, stop
+ * looking for candidate once one has been found.
+ * Freezing takes more time than writing out to a db.
+ */
+ if (aggressive > 1 && n_snapshots > 1)
+ goto this_buffer;
hp_saved = hp;
priority_saved = priority;
goto next_hb;
@@ -487,11 +623,15 @@ this_buffer: /*
/* We cannot block as the caller is probably holding locks. */
if ((ret = MUTEX_TRYLOCK(env, bhp->mtx_buf)) != 0) {
- if (ret != DB_LOCK_NOTGRANTED)
- return (ret);
+ if (ret != DB_LOCK_NOTGRANTED) {
+ goto err;
+ }
+ ret = 0;
goto next_hb;
}
F_SET(bhp, BH_EXCLUSIVE);
+ if (obsolete)
+ F_SET(bhp, BH_UNREACHABLE);
b_lock = 1;
/* Someone may have grabbed it while we got the lock. */
@@ -557,7 +697,7 @@ this_buffer: /*
F_CLR(bhp, BH_EXCLUSIVE);
MUTEX_UNLOCK(env, bhp->mtx_buf);
DB_ASSERT(env, !h_locked);
- return (ret);
+ goto err;
}
}
@@ -573,16 +713,25 @@ this_buffer: /*
if (BH_REFCOUNT(bhp) != 1 || F_ISSET(bhp, BH_DIRTY) ||
(SH_CHAIN_HASNEXT(bhp, vc) &&
SH_CHAIN_NEXTP(bhp, vc, __bh)->td_off != bhp->td_off &&
- !BH_OBSOLETE(bhp, hp->old_reader, vlsn)))
+ !(obsolete || BH_OBSOLETE(bhp, hp->old_reader, vlsn)))) {
+ if (FLD_ISSET(env->dbenv->verbose, DB_VERB_MVCC))
+ __db_msg(env,
+ "memp_alloc next_hb past bhp %lx flags %x ref %d %lx/%lx",
+ (u_long)R_OFFSET(infop, bhp), bhp->flags,
+ BH_REFCOUNT(bhp),
+ (u_long)R_OFFSET(infop, SH_CHAIN_NEXTP(bhp, vc, __bh)),
+ (u_long)R_OFFSET(infop, SH_CHAIN_PREVP(bhp, vc, __bh)));
goto next_hb;
+ }
/*
* If the buffer is frozen, thaw it and look for another one
- * we can use. (Calling __memp_bh_freeze above will not
- * mark bhp BH_FROZEN.)
+ * we can use. (Calling __memp_bh_freeze above will not mark
+ * this bhp BH_FROZEN; it creates another frozen one.)
*/
if (F_ISSET(bhp, BH_FROZEN)) {
- DB_ASSERT(env, obsolete || SH_CHAIN_SINGLETON(bhp, vc));
+ DB_ASSERT(env, SH_CHAIN_SINGLETON(bhp, vc) ||
+ obsolete || BH_OBSOLETE(bhp, hp->old_reader, vlsn));
DB_ASSERT(env, BH_REFCOUNT(bhp) > 0);
if (!F_ISSET(bhp, BH_THAWED)) {
/*
@@ -592,10 +741,10 @@ this_buffer: /*
*/
if ((ret = __memp_bh_thaw(dbmp,
infop, hp, bhp, NULL)) != 0)
- return (ret);
+ goto done;
MUTEX_READLOCK(env, hp->mtx_hash);
} else {
- need_free = (atomic_dec(env, &bhp->ref) == 0);
+ need_free = atomic_dec(env, &bhp->ref) == 0;
F_CLR(bhp, BH_EXCLUSIVE);
MUTEX_UNLOCK(env, bhp->mtx_buf);
if (need_free) {
@@ -626,7 +775,10 @@ this_buffer: /*
if (alloc_freeze) {
if ((ret = __memp_bhfree(dbmp,
infop, bh_mfp, hp, bhp, 0)) != 0)
- return (ret);
+ goto err;
+ DB_ASSERT(env, bhp->mtx_buf != MUTEX_INVALID);
+ if ((ret = __mutex_free(env, &bhp->mtx_buf)) != 0)
+ goto err;
b_lock = 0;
h_locked = 0;
@@ -654,23 +806,21 @@ this_buffer: /*
}
/*
- * Check to see if the buffer is the size we're looking for.
- * If so, we can simply reuse it. Otherwise, free the buffer
- * and its space and keep looking.
+ * If the buffer is the size we're looking for, we can simply
+ * reuse it. Otherwise, free it and keep looking.
*/
if (mfp != NULL && mfp->pagesize == bh_mfp->pagesize) {
if ((ret = __memp_bhfree(dbmp,
infop, bh_mfp, hp, bhp, 0)) != 0)
- return (ret);
+ goto err;
p = bhp;
goto found;
}
freed_space += sizeof(*bhp) + bh_mfp->pagesize;
- if ((ret =
- __memp_bhfree(dbmp, infop,
- bh_mfp, hp, bhp, BH_FREE_FREEMEM)) != 0)
- return (ret);
+ if ((ret = __memp_bhfree(dbmp,
+ infop, bh_mfp, hp, bhp, BH_FREE_FREEMEM)) != 0)
+ goto err;
/* Reset "aggressive" and "write_error" if we free any space. */
if (aggressive > 1)
@@ -689,12 +839,14 @@ next_hb: if (bhp != NULL) {
if (b_lock) {
F_CLR(bhp, BH_EXCLUSIVE);
MUTEX_UNLOCK(env, bhp->mtx_buf);
+ b_lock = 0;
}
}
if (h_locked)
MUTEX_UNLOCK(env, hp->mtx_hash);
h_locked = 0;
}
+ obsolete = 0;
MPOOL_REGION_LOCK(env, infop);
/*
@@ -706,7 +858,15 @@ next_hb: if (bhp != NULL) {
if (freed_space >= 3 * len)
goto alloc;
}
- /* NOTREACHED */
+err:
+ if (h_locked) {
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ h_locked = 0;
+ }
+done:
+ if (snapshots != NULL)
+ __os_free(env, snapshots);
+ return (ret);
}
/*
diff --git a/src/mp/mp_backup.c b/src/mp/mp_backup.c
index f376cda7..f1072292 100644
--- a/src/mp/mp_backup.c
+++ b/src/mp/mp_backup.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -145,6 +145,9 @@ __memp_backup_mpf(env, mpf, ip, first_pgno, last_pgno, fp, handle, flags)
if (backup == NULL || (len = backup->size) == 0)
len = MEGABYTE;
+ /* Ensure backup page size is at least as big as db page size */
+ if (len < mfp->pagesize)
+ len = mfp->pagesize;
if ((ret = __os_malloc(env, len, &buf)) != 0)
return (ret);
write_size = (u_int32_t)(len / mfp->pagesize);
@@ -188,7 +191,7 @@ __memp_backup_mpf(env, mpf, ip, first_pgno, last_pgno, fp, handle, flags)
if (backup != NULL && backup->write != NULL) {
if ((ret = backup->write(
- env->dbenv, gigs, off, (u_int32_t)nr,
+ env->dbenv, gigs, off, (u_int32_t)nr,
buf, handle)) != 0)
break;
} else {
diff --git a/src/mp/mp_bh.c b/src/mp/mp_bh.c
index 1df8e206..30293f29 100644
--- a/src/mp/mp_bh.c
+++ b/src/mp/mp_bh.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -157,7 +157,7 @@ __memp_bhwrite(dbmp, hp, mfp, bhp, open_extents)
opened = 1;
if ((ret = __memp_fopen(dbmfp, mfp, NULL,
NULL, DB_FLUSH | DB_DURABLE_UNKNOWN, 0, mfp->pagesize)) != 0) {
- dbmfp->ref--;
+ dbmfp->ref--;
(void)__memp_fclose(dbmfp, 0);
/*
@@ -264,7 +264,7 @@ __memp_pgread(dbmfp, bhp, can_create)
* how to handle the error.
*/
if (!can_create) {
- ret = DB_PAGE_NOTFOUND;
+ ret = USR_ERR(env, DB_PAGE_NOTFOUND);
goto err;
}
@@ -557,6 +557,9 @@ err: __db_errx(env, DB_STR_A("3016",
* __memp_bhfree --
* Free a bucket header and its referenced data.
*
+ * The hash bucket is unlocked before returning except when flags includes
+ * BH_FREE_UNLOCKED -- or there was no hp passed in to begin with.
+ *
* PUBLIC: int __memp_bhfree __P((DB_MPOOL *,
* PUBLIC: REGINFO *, MPOOLFILE *, DB_MPOOL_HASH *, BH *, u_int32_t));
*/
@@ -600,10 +603,13 @@ __memp_bhfree(dbmp, infop, mfp, hp, bhp, flags)
(SH_CHAIN_NEXTP(bhp, vc, __bh)->td_off == bhp->td_off ||
bhp->td_off == INVALID_ROFF ||
IS_MAX_LSN(*VISIBLE_LSN(env, bhp)) ||
+ F_ISSET(bhp, BH_UNREACHABLE) ||
BH_OBSOLETE(bhp, hp->old_reader, vlsn))));
PERFMON3(env, mpool, evict, __memp_fns(dbmp, mfp), bhp->pgno, bhp);
-
+ if (FLD_ISSET(env->dbenv->verbose, DB_VERB_MVCC))
+ __db_msg(env, "bhfree pgno %lu roff %lx",
+ (u_long)bhp->pgno, (u_long)R_OFFSET(dbmp->reginfo, bhp));
/*
* Delete the buffer header from the hash bucket queue or the
* version chain.
diff --git a/src/mp/mp_fget.c b/src/mp/mp_fget.c
index 5f9a4bf9..270135bd 100644
--- a/src/mp/mp_fget.c
+++ b/src/mp/mp_fget.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -53,15 +53,19 @@ __memp_fget_pp(dbmfp, pgnoaddr, txnp, flags, addrp)
* time, which we don't want to do because one of our big goals in life
* is to keep database files small. It's sleazy as hell, but we catch
* any attempt to actually write the file in memp_fput().
+ *
+ * CREATE, LAST, and NEW are mutually exclusive. DIRTY and EDIT are also
+ * mutually exclusive - that is checked in __memp_fget() itself.
*/
+#undef OKMODE
#undef OKFLAGS
-#define OKFLAGS (DB_MPOOL_CREATE | DB_MPOOL_DIRTY | \
- DB_MPOOL_EDIT | DB_MPOOL_LAST | DB_MPOOL_NEW)
+#define OKMODE (DB_MPOOL_CREATE | DB_MPOOL_LAST | DB_MPOOL_NEW)
+#define OKFLAGS (OKMODE | DB_MPOOL_DIRTY | DB_MPOOL_EDIT)
if (flags != 0) {
if ((ret = __db_fchk(env, "memp_fget", flags, OKFLAGS)) != 0)
return (ret);
- switch (FLD_CLR(flags, DB_MPOOL_DIRTY | DB_MPOOL_EDIT)) {
+ switch (FLD_ISSET(flags, OKMODE)) {
case DB_MPOOL_CREATE:
case DB_MPOOL_LAST:
case DB_MPOOL_NEW:
@@ -131,6 +135,7 @@ __memp_fget(dbmfp, pgnoaddr, ip, txn, flags, addrp)
#ifdef DIAGNOSTIC
DB_LOCKTAB *lt;
DB_LOCKER *locker;
+ int pagelock_err;
#endif
*(void **)addrp = NULL;
@@ -274,7 +279,7 @@ retry: MUTEX_LOCK(env, hp->mtx_hash);
* the BTREE in a subsequent txn).
*/
if (bhp == NULL) {
- ret = DB_PAGE_NOTFOUND;
+ ret = USR_ERR(env, DB_PAGE_NOTFOUND);
goto err;
}
}
@@ -303,7 +308,10 @@ retry: MUTEX_LOCK(env, hp->mtx_hash);
MUTEX_UNLOCK(env, hp->mtx_hash);
h_locked = 0;
if (dirty || extending || makecopy || F_ISSET(bhp, BH_FROZEN)) {
-xlatch: if (LF_ISSET(DB_MPOOL_TRY)) {
+#ifdef HAVE_SHARED_LATCHES
+xlatch:
+#endif
+ if (LF_ISSET(DB_MPOOL_TRY)) {
if ((ret =
MUTEX_TRYLOCK(env, bhp->mtx_buf)) != 0)
goto err;
@@ -373,11 +381,11 @@ thawed: need_free = (atomic_dec(env, &bhp->ref) == 0);
bhp = NULL;
goto retry;
} else if (dirty && SH_CHAIN_HASNEXT(bhp, vc)) {
- ret = DB_LOCK_DEADLOCK;
+ ret = USR_ERR(env, DB_LOCK_DEADLOCK);
goto err;
} else if (F_ISSET(bhp, BH_FREED) && flags != DB_MPOOL_CREATE &&
flags != DB_MPOOL_NEW && flags != DB_MPOOL_FREE) {
- ret = DB_PAGE_NOTFOUND;
+ ret = USR_ERR(env, DB_PAGE_NOTFOUND);
goto err;
}
@@ -508,9 +516,13 @@ revive: if (F_ISSET(bhp, BH_FREED))
/*
* With multiversion databases, we might need to
* allocate a new buffer into which we can copy the one
- * that we found. In that case, check the last buffer
+ * that we found. In that case, check the old versions
* in the chain to see whether we can reuse an obsolete
- * buffer.
+ * or unreachable buffer. First see whether the oldest
+ * version is truly obsolete. If not, look for somewhat
+ * more recent versions which are no longer needed
+ * because the snapshot transactions which once could
+ * have seen them have now exited.
*
* To provide snapshot isolation, we need to make sure
* that we've seen a buffer older than the oldest
@@ -523,24 +535,17 @@ reuse: if ((makecopy || F_ISSET(bhp, BH_FROZEN)) &&
}
if ((makecopy || F_ISSET(bhp, BH_FROZEN)) &&
SH_CHAIN_HASPREV(bhp, vc)) {
- oldest_bhp = SH_CHAIN_PREVP(bhp, vc, __bh);
- while (SH_CHAIN_HASPREV(oldest_bhp, vc))
- oldest_bhp = SH_CHAIN_PREVP(
- oldest_bhp, vc, __bh);
-
- if (BH_REFCOUNT(oldest_bhp) == 0 &&
- !BH_OBSOLETE(
- oldest_bhp, hp->old_reader, vlsn) &&
- (ret = __txn_oldest_reader(env,
- &hp->old_reader)) != 0)
+ if ((ret = __memp_find_obsolete_version(env,
+ bhp, hp, &oldest_bhp)) != 0)
goto err;
-
- if (BH_OBSOLETE(
- oldest_bhp, hp->old_reader, vlsn) &&
- BH_REFCOUNT(oldest_bhp) == 0) {
+ if (oldest_bhp != NULL) {
DB_ASSERT(env,
!F_ISSET(oldest_bhp, BH_DIRTY));
atomic_inc(env, &oldest_bhp->ref);
+#ifdef HAVE_STATISTICS
+ if (SH_CHAIN_HASPREV(oldest_bhp, vc))
+ c_mp->stat.st_mvcc_reused++;
+#endif
if (F_ISSET(oldest_bhp, BH_FROZEN)) {
/*
* This call will release the
@@ -606,7 +611,7 @@ newpg: /*
mfp->last_pgno >= mfp->maxpgno) {
__db_errx(env, DB_STR_A("3023",
"%s: file limited to %lu pages", "%s %lu"),
- __memp_fn(dbmfp), (u_long)mfp->maxpgno);
+ __memp_fn(dbmfp), (u_long)mfp->maxpgno + 1);
ret = ENOSPC;
} else
*pgnoaddr = mfp->last_pgno + 1;
@@ -615,7 +620,7 @@ newpg: /*
if (mfp->maxpgno != 0 && *pgnoaddr > mfp->maxpgno) {
__db_errx(env, DB_STR_A("3024",
"%s: file limited to %lu pages", "%s %lu"),
- __memp_fn(dbmfp), (u_long)mfp->maxpgno);
+ __memp_fn(dbmfp), (u_long)mfp->maxpgno + 1);
ret = ENOSPC;
} else if (!extending)
extending = *pgnoaddr > mfp->last_pgno;
@@ -937,8 +942,17 @@ alloc: /* Allocate a new buffer header and data space. */
* need to make copy, so we now need to allocate another buffer
* to hold the new copy.
*/
- if (alloc_bhp == NULL)
+ if (alloc_bhp == NULL) {
+ if (FLD_ISSET(env->dbenv->verbose, DB_VERB_MVCC))
+ __db_msg(env,
+ "fget makecopy txn %08x %lu/%lu going to reuse pgno %d from %lu/%lu",
+ txn->txnid, td == NULL ? 0L :
+ (u_long)td->read_lsn.file, td == NULL ? 0L :
+ (u_long)td->read_lsn.offset, bhp->pgno,
+ (u_long)VISIBLE_LSN(env, bhp)->file,
+ (u_long)VISIBLE_LSN(env, bhp)->offset);
goto reuse;
+ }
DB_ASSERT(env, bhp != NULL && alloc_bhp != bhp);
DB_ASSERT(env, bhp->td_off == INVALID_ROFF ||
@@ -1019,6 +1033,15 @@ alloc: /* Allocate a new buffer header and data space. */
F_CLR(bhp, BH_EXCLUSIVE);
MUTEX_UNLOCK(env, bhp->mtx_buf);
+ if (FLD_ISSET(env->dbenv->verbose, DB_VERB_MVCC))
+ __db_msg(env,
+ "fget makecopy txn %08x %lx pgno %d from %lu/%lu",
+ txn->txnid, (u_long)R_OFFSET(infop, bhp),
+ bhp->pgno, bhp->td_off == INVALID_ROFF ? 0L :
+ (u_long)VISIBLE_LSN(env, bhp)->file,
+ bhp->td_off == INVALID_ROFF ? 0L :
+ (u_long)VISIBLE_LSN(env, bhp)->offset);
+
bhp = alloc_bhp;
DB_ASSERT(env, BH_REFCOUNT(bhp) > 0);
b_incr = 1;
@@ -1164,8 +1187,15 @@ alloc: /* Allocate a new buffer header and data space. */
lt = env->lk_handle;
locker = (DB_LOCKER *)
(R_ADDR(&lt->reginfo, ip->dbth_locker));
- DB_ASSERT(env, __db_has_pagelock(env, locker, dbmfp,
- (PAGE*)bhp->buf, DB_LOCK_WRITE) == 0);
+ pagelock_err = __db_has_pagelock(env, locker, dbmfp,
+ (PAGE *)bhp->buf, DB_LOCK_WRITE);
+ if (pagelock_err != 0) {
+ if (pagelock_err == DB_RUNRECOVERY)
+ return (pagelock_err);
+ __db_syserr(env, pagelock_err,
+ "Locker %x has no page lock for pgno %d",
+ locker->id, ((PAGE *)bhp->buf)->pgno);
+ }
}
#endif
@@ -1228,3 +1258,85 @@ err: /*
return (ret);
}
+
+/*
+ * __memp_find_obsolete_version --
+ *
+ * Search the version chain, from oldest to youngest, looking for buffers
+ * which are no longer BH_VISIBLE() to any existing transaction.
+ *
+ * The hash bucket is locked, no buffer is locked.
+ *
+ * PUBLIC: int __memp_find_obsolete_version
+ * PUBLIC: __P((ENV *, BH *, DB_MPOOL_HASH *, BH **));
+ */
+int
+__memp_find_obsolete_version(env, vis_bhp, hp, foundp)
+ ENV *env;
+ BH *vis_bhp;
+ DB_MPOOL_HASH *hp;
+ BH **foundp;
+{
+ BH *bhp;
+ DB_LSN *readers, vlsn;
+ int n_readers, ret;
+
+ *foundp = NULL;
+ readers = NULL;
+ ret = 0;
+ bhp = SH_CHAIN_PREVP(vis_bhp, vc, __bh);
+ while (SH_CHAIN_HASPREV(bhp, vc))
+ bhp = SH_CHAIN_PREVP(bhp, vc, __bh);
+
+ /*
+ * The least-expensive case is finding an obsolete version without
+ * needing to build the active snapshot transaction list.
+ */
+ if (BH_OBSOLETE(bhp, hp->old_reader, vlsn) && BH_REFCOUNT(bhp) == 0) {
+ *foundp = bhp;
+ goto out;
+ }
+
+ if ((ret = __txn_get_readers(env, &readers, &n_readers)) != 0)
+ goto out;
+
+ if (LOG_COMPARE(&readers[n_readers - 1], &hp->old_reader) > 0) {
+ hp->old_reader = readers[n_readers - 1];
+ if (BH_OBSOLETE(bhp, hp->old_reader, vlsn) &&
+ BH_REFCOUNT(bhp) == 0) {
+ *foundp = bhp;
+ goto cleanup;
+ }
+ }
+
+ while ((bhp = SH_CHAIN_NEXT(bhp, vc, __bh)) != vis_bhp) {
+ if (BH_REFCOUNT(bhp) == 0 &&
+ __memp_bh_unreachable(env, bhp, readers, n_readers)) {
+ *foundp = bhp;
+#ifdef DIAGNOSTIC
+ /*
+ * Usually when the hash bucket is locked, the refcount
+ * is incremented and the bucket unlocked before the
+ * buffer is locked; this avoids mtx_buf deadlocks.
+ * This unreachable version cannot be involved with any
+ * deadlock-creating locking, though the head of the
+ * version chain could be locked. No TRYLOCK needed.
+ */
+ MUTEX_LOCK(env, bhp->mtx_buf);
+ F_SET(bhp, BH_UNREACHABLE);
+ MUTEX_UNLOCK(env, bhp->mtx_buf);
+#endif
+ break;
+ }
+ }
+
+cleanup:
+ if (readers != NULL)
+ __os_free(env, readers);
+out:
+ if (FLD_ISSET(env->dbenv->verbose, DB_VERB_MVCC) && *foundp != NULL)
+ __db_msg(env, "fget reusing %p pgno %d @%lu/%lu", bhp,
+ bhp->pgno, (u_long)VISIBLE_LSN(env, bhp)->file,
+ (u_long)VISIBLE_LSN(env, bhp)->offset);
+ return (ret);
+}
diff --git a/src/mp/mp_fmethod.c b/src/mp/mp_fmethod.c
index 41bd638c..4974f57c 100644
--- a/src/mp/mp_fmethod.c
+++ b/src/mp/mp_fmethod.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -315,7 +315,7 @@ __memp_set_lsn_offset(dbmfp, lsn_offset)
/*
* __memp_get_maxsize --
- * Get the file's maximum size.
+ * Get the file's maximum size, returning zeroes if none is set.
*/
static int
__memp_get_maxsize(dbmfp, gbytesp, bytesp)
@@ -334,11 +334,22 @@ __memp_get_maxsize(dbmfp, gbytesp, bytesp)
ENV_ENTER(env, ip);
MUTEX_LOCK(env, mfp->mutex);
- *gbytesp = (u_int32_t)
- (mfp->maxpgno / (GIGABYTE / mfp->pagesize));
- *bytesp = (u_int32_t)
- ((mfp->maxpgno % (GIGABYTE / mfp->pagesize)) *
- mfp->pagesize);
+ if (mfp->maxpgno == 0) {
+ *gbytesp = *bytesp = 0;
+ } else {
+ *gbytesp = (u_int32_t)
+ (mfp->maxpgno / (GIGABYTE / mfp->pagesize));
+ *bytesp = (u_int32_t) (mfp->maxpgno %
+ (GIGABYTE / mfp->pagesize) + 1) * mfp->pagesize;
+ /*
+ * After converting from 0-based maxpgno to #pages, we
+ * might have bumped into the next gigabyte boundary.
+ */
+ if (*bytesp >= GIGABYTE) {
+ *bytesp -= GIGABYTE;
+ *gbytesp += 1;
+ }
+ }
MUTEX_UNLOCK(env, mfp->mutex);
ENV_LEAVE(env, ip);
@@ -348,8 +359,34 @@ __memp_get_maxsize(dbmfp, gbytesp, bytesp)
}
/*
+ * __memp_set_maxpgno --
+ * Set the file's maxpgno from the configured max size. If that size is
+ * pagesize or less then the filesize limit is disabled.
+ *
+ * PUBLIC: void __memp_set_maxpgno __P((MPOOLFILE *, u_int32_t, u_int32_t));
+ */
+void
+__memp_set_maxpgno(mfp, gbytes, bytes)
+ MPOOLFILE *mfp;
+ u_int32_t gbytes, bytes;
+{
+ if (gbytes == 0 && bytes <= mfp->pagesize)
+ mfp->maxpgno = 0;
+ else {
+ mfp->maxpgno = (db_pgno_t)
+ (gbytes * (GIGABYTE / mfp->pagesize));
+ /* Round up to account for any fractional page. */
+ mfp->maxpgno += (db_pgno_t)
+ ((bytes + mfp->pagesize - 1) / mfp->pagesize);
+ /* Convert from #pages to the zero-based max pgno. */
+ mfp->maxpgno--;
+ }
+}
+
+/*
* __memp_set_maxsize --
- * Set the file's maximum size.
+ * Set the file's maximum size; if the size is <= pagesize then
+ * remove any file size limit.
*/
static int
__memp_set_maxsize(dbmfp, gbytes, bytes)
@@ -368,10 +405,7 @@ __memp_set_maxsize(dbmfp, gbytes, bytes)
ENV_ENTER(env, ip);
MUTEX_LOCK(env, mfp->mutex);
- mfp->maxpgno = (db_pgno_t)
- (gbytes * (GIGABYTE / mfp->pagesize));
- mfp->maxpgno += (db_pgno_t)
- ((bytes + mfp->pagesize - 1) / mfp->pagesize);
+ __memp_set_maxpgno(mfp, gbytes, bytes);
MUTEX_UNLOCK(env, mfp->mutex);
ENV_LEAVE(env, ip);
diff --git a/src/mp/mp_fopen.c b/src/mp/mp_fopen.c
index ef7f886a..dbe7b9c8 100644
--- a/src/mp/mp_fopen.c
+++ b/src/mp/mp_fopen.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -89,8 +89,9 @@ __memp_fopen_pp(dbmfp, path, flags, mode, pagesize)
* Generate the number of user opens. If there is no backing file
* there is an extra open count to keep the in memory db around.
*/
-#define MFP_OPEN_CNT(mfp) ((mfp)->mpf_cnt - ((mfp)->neutral_cnt + \
+#define MFP_OPEN_CNT(mfp) ((mfp)->mpf_cnt - ((mfp)->neutral_cnt + \
(u_int32_t)(mfp)->no_backing_file))
+#define MP_IOINFO_RETRIES 5
/*
* __memp_fopen --
* DB_MPOOLFILE->open.
@@ -118,7 +119,7 @@ __memp_fopen(dbmfp, mfp, path, dirp, flags, mode, pgsize)
size_t maxmap;
db_pgno_t last_pgno;
u_int32_t bucket, mbytes, bytes, oflags, pagesize;
- int refinc, ret, isdir;
+ int isdir, refinc, ret, tries;
char *rpath;
/* If this handle is already open, return. */
@@ -249,7 +250,7 @@ __memp_fopen(dbmfp, mfp, path, dirp, flags, mode, pgsize)
if (MFP_OPEN_CNT(mfp) > 0 &&
atomic_read(&mfp->multiversion) == 0) {
mvcc_err: __db_errx(env, DB_STR("3041",
-"DB_MULTIVERSION cannot be specified on a database file which is already open"));
+"DB_MULTIVERSION cannot be specified on a database file that is already open"));
ret = EINVAL;
goto err;
}
@@ -399,11 +400,44 @@ mvcc_err: __db_errx(env, DB_STR("3041",
if (LF_ISSET(DB_ODDFILESIZE))
bytes -= (u_int32_t)(bytes % pagesize);
else {
- __db_errx(env, DB_STR_A("3037",
- "%s: file size not a multiple of the pagesize", "%s"),
- rpath);
- ret = EINVAL;
- goto err;
+ /*
+ * If the file size is not a multiple of the
+ * pagesize, it is likely because the ioinfo
+ * call is racing with a write that is extending
+ * the file. Many file systems will extend
+ * in fs block size units, and if the pagesize
+ * is larger than that, we can briefly see a
+ * file size that is not a multiple of pagesize.
+ *
+ * Yield the processor to allow that to finish
+ * and try again a few times.
+ */
+ tries = 0;
+ STAT((mp->stat.st_oddfsize_detect++));
+ while (tries < MP_IOINFO_RETRIES) {
+ if ((ret = __os_ioinfo(env, rpath,
+ dbmfp->fhp, &mbytes, &bytes,
+ NULL)) != 0) {
+ __db_err(env, ret, "%s", rpath);
+ goto err;
+ }
+ if (bytes % pagesize != 0) {
+ __os_yield(env, 0, 50000);
+ tries++;
+ } else {
+ STAT((
+ mp->stat.st_oddfsize_resolve++));
+ break;
+ }
+ }
+ if (tries == MP_IOINFO_RETRIES) {
+ __db_errx(env, DB_STR_A("3043",
+ "%s: file size (%lu %lu) not a multiple of the pagesize %lu",
+ "%s %lu %lu %lu"),
+ rpath, (u_long)mbytes, (u_long)bytes, (u_long)pagesize);
+ ret = EINVAL;
+ goto err;
+ }
}
}
@@ -786,13 +820,7 @@ __memp_mpf_alloc(dbmp, dbmfp, path, pagesize, flags, retmfp)
mfp->lsn_off = dbmfp->lsn_offset;
mfp->clear_len = dbmfp->clear_len;
mfp->priority = dbmfp->priority;
- if (dbmfp->gbytes != 0 || dbmfp->bytes != 0) {
- mfp->maxpgno = (db_pgno_t)
- (dbmfp->gbytes * (GIGABYTE / mfp->pagesize));
- mfp->maxpgno += (db_pgno_t)
- ((dbmfp->bytes + mfp->pagesize - 1) /
- mfp->pagesize);
- }
+ __memp_set_maxpgno(mfp, dbmfp->gbytes, dbmfp->bytes);
if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE))
mfp->no_backing_file = 1;
if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_UNLINK))
@@ -1019,6 +1047,7 @@ __memp_fclose(dbmfp, flags)
ret = t_ret;
__os_free(env, rpath);
}
+ mfp->unlink_on_close = 0;
}
if (MFP_OPEN_CNT(mfp) == 0) {
F_CLR(mfp, MP_NOT_DURABLE);
@@ -1068,6 +1097,7 @@ __memp_mf_discard(dbmp, mfp, hp_locked)
DB_MPOOL_STAT *sp;
#endif
MPOOL *mp;
+ char *rpath;
int need_sync, ret, t_ret;
env = dbmp->env;
@@ -1095,6 +1125,23 @@ __memp_mf_discard(dbmp, mfp, hp_locked)
*/
mfp->deadfile = 1;
+ /* We should unlink the file if necessary. */
+ if (mfp->block_cnt == 0 && mfp->mpf_cnt == 0 && mfp->unlink_on_close &&
+ !F_ISSET(mfp, MP_TEMP) && !mfp->no_backing_file) {
+ if ((t_ret = __db_appname(env, DB_APP_DATA,
+ R_ADDR(dbmp->reginfo, mfp->path_off), NULL,
+ &rpath)) != 0 && ret == 0)
+ ret = t_ret;
+ if (t_ret == 0) {
+ if ((t_ret = __os_unlink(
+ dbmp->env, rpath, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ __os_free(env, rpath);
+ }
+ mfp->unlink_on_close = 0;
+ need_sync = 0;
+ }
+
/* Discard the mutex we're holding and return it too the pool. */
MUTEX_UNLOCK(env, mfp->mutex);
if ((t_ret = __mutex_free(env, &mfp->mutex)) != 0 && ret == 0)
diff --git a/src/mp/mp_fput.c b/src/mp/mp_fput.c
index 7a900fd0..06b30fd4 100644
--- a/src/mp/mp_fput.c
+++ b/src/mp/mp_fput.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -52,7 +52,8 @@ __memp_fput_pp(dbmfp, pgaddr, priority, flags)
/*
* __memp_fput --
- * DB_MPOOLFILE->put.
+ * DB_MPOOLFILE->put. Release this reference to the page. If the reference
+ * count drops to zero, adjust the buffer's cache priority.
*
* PUBLIC: int __memp_fput __P((DB_MPOOLFILE *,
* PUBLIC: DB_THREAD_INFO *, void *, DB_CACHE_PRIORITY));
diff --git a/src/mp/mp_fset.c b/src/mp/mp_fset.c
index 1129853f..770ec5c8 100644
--- a/src/mp/mp_fset.c
+++ b/src/mp/mp_fset.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/mp/mp_method.c b/src/mp/mp_method.c
index 7afae248..56d6c42b 100644
--- a/src/mp/mp_method.c
+++ b/src/mp/mp_method.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -67,6 +67,7 @@ __memp_get_cachesize(dbenv, gbytesp, bytesp, ncachep)
int *ncachep;
{
DB_MPOOL *dbmp;
+ DB_THREAD_INFO *ip;
ENV *env;
MPOOL *mp;
@@ -78,12 +79,16 @@ __memp_get_cachesize(dbenv, gbytesp, bytesp, ncachep)
if (MPOOL_ON(env)) {
dbmp = env->mp_handle;
mp = dbmp->reginfo[0].primary;
+ ENV_ENTER(env, ip);
+ MUTEX_LOCK(env, mp->mtx_resize);
if (gbytesp != NULL)
*gbytesp = mp->gbytes;
if (bytesp != NULL)
*bytesp = mp->bytes;
if (ncachep != NULL)
*ncachep = (int)mp->nreg;
+ MUTEX_UNLOCK(env, mp->mtx_resize);
+ ENV_LEAVE(env, ip);
} else {
if (gbytesp != NULL)
*gbytesp = dbenv->mp_gbytes;
@@ -380,7 +385,7 @@ __memp_set_mp_max_write(dbenv, maxwrite, maxwrite_sleep)
env = dbenv->env;
ENV_NOT_CONFIGURED(env,
- env->mp_handle, "DB_ENV->get_mp_max_write", DB_INIT_MPOOL);
+ env->mp_handle, "DB_ENV->set_mp_max_write", DB_INIT_MPOOL);
if (MPOOL_ON(env)) {
dbmp = env->mp_handle;
@@ -448,7 +453,7 @@ __memp_set_mp_mmapsize(dbenv, mp_mmapsize)
env = dbenv->env;
ENV_NOT_CONFIGURED(env,
- env->mp_handle, "DB_ENV->set_mp_max_mmapsize", DB_INIT_MPOOL);
+ env->mp_handle, "DB_ENV->set_mp_mmapsize", DB_INIT_MPOOL);
if (MPOOL_ON(env)) {
dbmp = env->mp_handle;
@@ -512,7 +517,7 @@ __memp_set_mp_pagesize(dbenv, mp_pagesize)
env = dbenv->env;
ENV_NOT_CONFIGURED(env,
- env->mp_handle, "DB_ENV->get_mp_max_mmapsize", DB_INIT_MPOOL);
+ env->mp_handle, "DB_ENV->set_mp_pagesize", DB_INIT_MPOOL);
ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_mp_pagesize");
dbenv->mp_pagesize = mp_pagesize;
@@ -561,7 +566,7 @@ __memp_set_mp_tablesize(dbenv, mp_tablesize)
env = dbenv->env;
ENV_NOT_CONFIGURED(env,
- env->mp_handle, "DB_ENV->get_mp_max_mmapsize", DB_INIT_MPOOL);
+ env->mp_handle, "DB_ENV->set_mp_tablesize", DB_INIT_MPOOL);
ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_mp_tablesize");
dbenv->mp_tablesize = mp_tablesize;
@@ -583,7 +588,7 @@ __memp_get_mp_mtxcount(dbenv, mp_mtxcountp)
env = dbenv->env;
ENV_NOT_CONFIGURED(env,
- env->mp_handle, "DB_ENV->get_mp_max_mtxcount", DB_INIT_MPOOL);
+ env->mp_handle, "DB_ENV->get_mp_mtxcount", DB_INIT_MPOOL);
if (MPOOL_ON(env)) {
dbmp = env->mp_handle;
@@ -610,7 +615,7 @@ __memp_set_mp_mtxcount(dbenv, mp_mtxcount)
env = dbenv->env;
ENV_NOT_CONFIGURED(env,
- env->mp_handle, "DB_ENV->get_mp_max_mmapsize", DB_INIT_MPOOL);
+ env->mp_handle, "DB_ENV->set_mp_mtxcount", DB_INIT_MPOOL);
ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_mp_mtxcount");
dbenv->mp_mtxcount = mp_mtxcount;
@@ -870,7 +875,7 @@ __memp_ftruncate(dbmfp, txn, ip, pgno, flags)
!mfp->no_backing_file && pgno <= mfp->last_flushed_pgno)
#ifdef HAVE_FTRUNCATE
ret = __os_truncate(env,
- dbmfp->fhp, pgno, mfp->pagesize);
+ dbmfp->fhp, pgno, mfp->pagesize, 0);
#else
ret = __db_zero_extend(env,
dbmfp->fhp, pgno, mfp->last_pgno, mfp->pagesize);
diff --git a/src/mp/mp_mvcc.c b/src/mp/mp_mvcc.c
index 47531528..b51ae135 100644
--- a/src/mp/mp_mvcc.c
+++ b/src/mp/mp_mvcc.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2006, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2006, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -151,6 +151,11 @@ __memp_bh_freeze(dbmp, infop, hp, bhp, need_frozenp)
real_name = NULL;
fhp = NULL;
+ if (FLD_ISSET(env->dbenv->verbose, DB_VERB_MVCC))
+ __db_msg(env, "freeze %s %d @%lu/%lu", __memp_fns(dbmp, mfp),
+ bhp->pgno, (u_long)VISIBLE_LSN(env, bhp)->file,
+ (u_long)VISIBLE_LSN(env, bhp)->offset);
+
MVCC_MPROTECT(bhp->buf, pagesize, PROT_READ | PROT_WRITE);
MPOOL_REGION_LOCK(env, infop);
@@ -161,7 +166,7 @@ __memp_bh_freeze(dbmp, infop, hp, bhp, need_frozenp)
} else {
*need_frozenp = 1;
- /* There might be a small amount of unallocated space. */
+ /* There might be enough space for a single-item block. */
if (__env_alloc(infop,
sizeof(BH_FROZEN_ALLOC) + sizeof(BH_FROZEN_PAGE),
&frozen_alloc) == 0) {
@@ -405,6 +410,12 @@ __memp_bh_thaw(dbmp, infop, hp, frozen_bhp, alloc_bhp)
ret = 0;
real_name = NULL;
+ if (FLD_ISSET(env->dbenv->verbose, DB_VERB_MVCC))
+ __db_msg(env, "thaw %s %d @%lu/%lu", __memp_fns(dbmp, mfp),
+ frozen_bhp->pgno,
+ (u_long)VISIBLE_LSN(env, frozen_bhp)->file,
+ (u_long)VISIBLE_LSN(env, frozen_bhp)->offset);
+
MUTEX_REQUIRED(env, hp->mtx_hash);
DB_ASSERT(env, F_ISSET(frozen_bhp, BH_EXCLUSIVE) || alloc_bhp == NULL);
h_locked = 1;
@@ -414,7 +425,8 @@ __memp_bh_thaw(dbmp, infop, hp, frozen_bhp, alloc_bhp)
DB_ASSERT(env, alloc_bhp != NULL ||
SH_CHAIN_SINGLETON(frozen_bhp, vc) ||
(SH_CHAIN_HASNEXT(frozen_bhp, vc) &&
- BH_OBSOLETE(frozen_bhp, hp->old_reader, vlsn)));
+ BH_OBSOLETE(frozen_bhp, hp->old_reader, vlsn)) ||
+ F_ISSET(frozen_bhp, BH_UNREACHABLE));
DB_ASSERT(env, alloc_bhp == NULL || !F_ISSET(alloc_bhp, BH_FROZEN));
spgno = ((BH_FROZEN_PAGE *)frozen_bhp)->spgno;
@@ -516,7 +528,7 @@ __memp_bh_thaw(dbmp, infop, hp, frozen_bhp, alloc_bhp)
else {
maxpgno -= (db_pgno_t)ntrunc;
if ((ret = __os_truncate(env, fhp,
- maxpgno + 1, pagesize)) != 0)
+ maxpgno + 1, pagesize, 0)) != 0)
goto err;
/* Fix up the linked list */
diff --git a/src/mp/mp_region.c b/src/mp/mp_region.c
index 07134de7..ba836cf4 100644
--- a/src/mp/mp_region.c
+++ b/src/mp/mp_region.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -11,7 +11,7 @@
#include "db_int.h"
#include "dbinc/mp.h"
-static int __memp_init_config __P((ENV *, MPOOL *));
+static int __memp_init_config __P((ENV *, MPOOL *, int));
static void __memp_region_size __P((ENV *, roff_t *, u_int32_t *));
#define MPOOL_DEFAULT_PAGESIZE (4 * 1024)
@@ -34,7 +34,7 @@ __memp_open(env, create_ok)
roff_t cache_size, max_size, reg_size;
u_int i, max_nreg;
u_int32_t htab_buckets, *regids;
- int ret;
+ int create, ret;
dbenv = env->dbenv;
cache_size = 0;
@@ -77,7 +77,8 @@ __memp_open(env, create_ok)
* If we created the region, initialize it. Create or join any
* additional regions.
*/
- if (F_ISSET(&reginfo, REGION_CREATE)) {
+ create = F_ISSET(&reginfo, REGION_CREATE);
+ if (create) {
/*
* We define how many regions there are going to be, allocate
* the REGINFO structures and create them. Make sure we don't
@@ -167,23 +168,38 @@ __memp_open(env, create_ok)
env->mp_handle = dbmp;
/* A process joining the region may reset the mpool configuration. */
- if ((ret = __memp_init_config(env, mp)) != 0)
+ if ((ret = __memp_init_config(env, mp, create)) != 0)
return (ret);
return (0);
-err: env->mp_handle = NULL;
- if (dbmp->reginfo != NULL && dbmp->reginfo[0].addr != NULL) {
- for (i = 0; i < dbenv->mp_ncache; ++i)
+err: (void)__mutex_free(env, &dbmp->mutex);
+ (void)__memp_region_detach(env, dbmp);
+ return (ret);
+}
+
+/* __memp_region_detach
+ * Detach from any attached mempool regions.
+ *
+ * PUBLIC: int __memp_region_detach __P((ENV *, DB_MPOOL *));
+ */
+int
+__memp_region_detach(env, dbmp)
+ ENV *env;
+ DB_MPOOL *dbmp;
+{
+ u_int i;
+
+ if (dbmp != NULL &&
+ dbmp->reginfo != NULL && dbmp->reginfo[0].addr != NULL) {
+ for (i = 0; i < env->dbenv->mp_ncache; ++i)
if (dbmp->reginfo[i].id != INVALID_REGION_ID)
(void)__env_region_detach(
env, &dbmp->reginfo[i], 0);
__os_free(env, dbmp->reginfo);
}
-
- (void)__mutex_free(env, &dbmp->mutex);
- __os_free(env, dbmp);
- return (ret);
+ env->mp_handle = NULL;
+ return (0);
}
/*
@@ -207,7 +223,7 @@ __memp_init(env, dbmp, reginfo_off, htab_buckets, max_nreg)
MPOOL *mp, *main_mp;
REGINFO *infop;
db_mutex_t mtx_base, mtx_discard, mtx_prev;
- u_int32_t i;
+ u_int32_t i, mp_mtxcount;
int ret;
void *p;
@@ -224,6 +240,23 @@ __memp_init(env, dbmp, reginfo_off, htab_buckets, max_nreg)
__mutex_alloc(env, MTX_MPOOL_REGION, 0, &mp->mtx_region)) != 0)
return (ret);
+ /*
+ * Initializing the first mpool region allocates the mpool region id
+ * array, file table and, if not ENV_PRIVATE, all the cache regions'
+ * hash bucket mutexes in a single contiguous block of mutex ids, which
+ * remain allocated when the cache is resized. The block is 'known' to
+ * start with the first id (mtx_base), and to end #regions * mp_mtxcount
+ * later. In private environments, mutex ids are not smallish integers,
+ * but __env_alloc()'d pointers. Since a range of (base, count) doesn't
+ * work for these likely-scattered mutexes, we allocate private threaded
+ * mutexes as they are needed. Private non-threaded caches don't need
+ * any mutexes at all.
+ */
+ if ((mp_mtxcount = dbenv->mp_mtxcount) == 0)
+ mp_mtxcount = dbenv->mp_mtxcount = htab_buckets;
+ if (!MUTEX_ON(env) ||
+ F_ISSET(env, ENV_PRIVATE | ENV_THREAD) == ENV_PRIVATE)
+ mp_mtxcount = dbenv->mp_mtxcount = 0;
if (reginfo_off == 0) {
ZERO_LSN(mp->lsn);
@@ -248,15 +281,10 @@ __memp_init(env, dbmp, reginfo_off, htab_buckets, max_nreg)
atomic_init(&htab[i].hash_page_dirty, 0);
}
- /*
- * Allocate all of the hash bucket mutexes up front. We do
- * this so that we don't need to free and reallocate mutexes as
- * the cache is resized.
- */
mtx_base = mtx_prev = MUTEX_INVALID;
- if (!MUTEX_ON(env) || F_ISSET(env, ENV_PRIVATE))
+ if (F_ISSET(env, ENV_PRIVATE))
goto no_prealloc;
- for (i = 0; i < mp->max_nreg * dbenv->mp_mtxcount; i++) {
+ for (i = 0; i < mp->max_nreg * mp_mtxcount; i++) {
if ((ret = __mutex_alloc(env, MTX_MPOOL_HASH_BUCKET,
DB_MUTEX_SHARED, &mtx_discard)) != 0)
return (ret);
@@ -274,13 +302,12 @@ __memp_init(env, dbmp, reginfo_off, htab_buckets, max_nreg)
}
/*
- * We preallocated all of the mutexes in a block, so for regions after
- * the first, we skip mutexes in use in earlier regions. Each region
- * has the same number of buckets
+ * If we preallocated all the mutexes, then in regions after the first,
+ * we skip mutexes in use in earlier regions. Each region has the same
+ * number of buckets.
*/
no_prealloc:
- if (MUTEX_ON(env))
- mtx_base += reginfo_off * dbenv->mp_mtxcount;
+ mtx_base += reginfo_off * mp_mtxcount;
/* Allocate hash table space and initialize it. */
if ((ret = __env_alloc(infop,
@@ -289,18 +316,21 @@ no_prealloc:
mp->htab = R_OFFSET(infop, htab);
for (i = 0; i < htab_buckets; i++) {
hp = &htab[i];
- if (!MUTEX_ON(env) || dbenv->mp_mtxcount == 0)
+ /*
+ * Set mtx_hash to do no locking, or share a mutex with an
+ * earlier hash bucket in this region, or assign it from the
+ * block of mutexes allocated above, or (in a private
+ * environment) allocate a new mutex.
+ */
+ if (mp_mtxcount == 0)
hp->mtx_hash = MUTEX_INVALID;
- else if (F_ISSET(env, ENV_PRIVATE)) {
- if (i >= dbenv->mp_mtxcount)
- hp->mtx_hash =
- htab[i % dbenv->mp_mtxcount].mtx_hash;
- else if
- ((ret = __mutex_alloc(env, MTX_MPOOL_HASH_BUCKET,
- DB_MUTEX_SHARED, &hp->mtx_hash)) != 0)
- return (ret);
- } else
- hp->mtx_hash = mtx_base + (i % dbenv->mp_mtxcount);
+ else if (i >= mp_mtxcount)
+ hp->mtx_hash = htab[i % mp_mtxcount].mtx_hash;
+ else if (!F_ISSET(env, ENV_PRIVATE))
+ hp->mtx_hash = mtx_base + i;
+ else if ((ret = __mutex_alloc(env, MTX_MPOOL_HASH_BUCKET,
+ DB_MUTEX_SHARED, &hp->mtx_hash)) != 0)
+ return (ret);
SH_TAILQ_INIT(&hp->hash_bucket);
atomic_init(&hp->hash_page_dirty, 0);
#ifdef HAVE_STATISTICS
@@ -311,7 +341,7 @@ no_prealloc:
ZERO_LSN(hp->old_reader);
}
mp->htab_buckets = htab_buckets;
- mp->htab_mutexes = dbenv->mp_mtxcount;
+ mp->htab_mutexes = mp_mtxcount;
mp->pagesize = dbenv->mp_pagesize == 0 ?
MPOOL_DEFAULT_PAGESIZE : dbenv->mp_pagesize;
@@ -443,11 +473,21 @@ __memp_region_mutex_count(env)
dbenv = env->dbenv;
__memp_region_size(env, &reg_size, &htab_buckets);
- if (F_ISSET(env->dbenv, DB_ENV_MULTIVERSION))
- pgsize = sizeof(BH_FROZEN_ALLOC) + sizeof(BH_FROZEN_PAGE);
- if ((pgsize = dbenv->mp_pagesize) == 0)
- pgsize = MPOOL_DEFAULT_PAGESIZE;
+ if (dbenv->mp_mtxcount != 0)
+ htab_buckets = dbenv->mp_mtxcount;
max_region = __memp_max_regions(env);
+ if ((pgsize = dbenv->mp_pagesize) == 0) {
+ /*
+ * If MVCC is on during environment creation, provide enough
+ * mutexes so that half the cache can be frozen buffer headers.
+ */
+ if (F_ISSET(env->dbenv, DB_ENV_MULTIVERSION))
+ pgsize = (MPOOL_DEFAULT_PAGESIZE +
+ sizeof(BH_FROZEN_ALLOC) +
+ sizeof(BH_FROZEN_PAGE)) / 2;
+ else
+ pgsize = MPOOL_DEFAULT_PAGESIZE;
+ }
/*
* We need a couple of mutexes for the region itself, one for each
@@ -456,10 +496,6 @@ __memp_region_mutex_count(env)
* hash bucket. We then need one mutex per page in the cache,
* the worst case is really big if the pages are 512 bytes.
*/
- if (dbenv->mp_mtxcount != 0)
- htab_buckets = dbenv->mp_mtxcount;
- else
- dbenv->mp_mtxcount = htab_buckets;
num_per_cache = htab_buckets + (u_int32_t)(reg_size / pgsize);
return ((max_region * num_per_cache) + 50 + MPOOL_FILE_BUCKETS);
}
@@ -469,23 +505,39 @@ __memp_region_mutex_count(env)
* Initialize shared configuration information.
*/
static int
-__memp_init_config(env, mp)
+__memp_init_config(env, mp, create)
ENV *env;
MPOOL *mp;
+ int create;
{
DB_ENV *dbenv;
dbenv = env->dbenv;
MPOOL_SYSTEM_LOCK(env);
- if (dbenv->mp_mmapsize != 0)
+ if (create) {
mp->mp_mmapsize = (db_size_t)dbenv->mp_mmapsize;
- if (dbenv->mp_maxopenfd != 0)
mp->mp_maxopenfd = dbenv->mp_maxopenfd;
- if (dbenv->mp_maxwrite != 0)
mp->mp_maxwrite = dbenv->mp_maxwrite;
- if (dbenv->mp_maxwrite_sleep != 0)
mp->mp_maxwrite_sleep = dbenv->mp_maxwrite_sleep;
+ } else {
+ if (dbenv->mp_mmapsize != 0 &&
+ mp->mp_mmapsize != (db_size_t)dbenv->mp_mmapsize)
+ __db_msg(env, DB_STR("3044",
+"Warning: Ignoring maximum memory map size when joining environment"));
+
+ if (dbenv->mp_maxopenfd != 0 &&
+ mp->mp_maxopenfd != dbenv->mp_maxopenfd)
+ __db_msg(env, DB_STR("3045",
+"Warning: Ignoring max open file descriptors value when joining environment"));
+
+ if ((dbenv->mp_maxwrite != 0 &&
+ mp->mp_maxwrite != dbenv->mp_maxwrite) ||
+ (dbenv->mp_maxwrite_sleep != 0 &&
+ mp->mp_maxwrite_sleep != dbenv->mp_maxwrite_sleep))
+ __db_msg(env, DB_STR("3046",
+"Warning: Ignoring maximum sequential writes value when joining environment"));
+ }
MPOOL_SYSTEM_UNLOCK(env);
return (0);
@@ -501,22 +553,18 @@ int
__memp_env_refresh(env)
ENV *env;
{
- BH *bhp;
- BH_FROZEN_ALLOC *frozen_alloc;
DB_MPOOL *dbmp;
DB_MPOOLFILE *dbmfp;
- DB_MPOOL_HASH *hp;
DB_MPREG *mpreg;
MPOOL *mp, *c_mp;
REGINFO *infop;
- u_int32_t bucket, i, nreg;
+ u_int32_t i, nreg;
int ret, t_ret;
ret = 0;
dbmp = env->mp_handle;
mp = dbmp->reginfo[0].primary;
nreg = mp->nreg;
- hp = R_ADDR(&dbmp->reginfo[0], mp->htab);
/*
* If a private region, return the memory to the heap. Not needed for
@@ -526,49 +574,20 @@ __memp_env_refresh(env)
if (!F_ISSET(env, ENV_PRIVATE))
goto not_priv;
- /* Discard buffers. */
for (i = 0; i < nreg; ++i) {
infop = &dbmp->reginfo[i];
- c_mp = infop->primary;
- for (hp = R_ADDR(infop, c_mp->htab), bucket = 0;
- bucket < c_mp->htab_buckets; ++hp, ++bucket) {
- while ((bhp = SH_TAILQ_FIRST(
- &hp->hash_bucket, __bh)) != NULL)
- if (F_ISSET(bhp, BH_FROZEN))
- SH_TAILQ_REMOVE(
- &hp->hash_bucket, bhp,
- hq, __bh);
- else {
- if (F_ISSET(bhp, BH_DIRTY)) {
- atomic_dec(env,
- &hp->hash_page_dirty);
- F_CLR(bhp,
- BH_DIRTY | BH_DIRTY_CREATE);
- }
- atomic_inc(env, &bhp->ref);
- if ((t_ret = __memp_bhfree(dbmp, infop,
- R_ADDR(dbmp->reginfo,
- bhp->mf_offset), hp, bhp,
- BH_FREE_FREEMEM |
- BH_FREE_UNLOCKED)) != 0 && ret == 0)
- ret = t_ret;
- }
- }
- MPOOL_REGION_LOCK(env, infop);
- while ((frozen_alloc = SH_TAILQ_FIRST(
- &c_mp->alloc_frozen, __bh_frozen_a)) != NULL) {
- SH_TAILQ_REMOVE(&c_mp->alloc_frozen, frozen_alloc,
- links, __bh_frozen_a);
- __env_alloc_free(infop, frozen_alloc);
- }
- MPOOL_REGION_UNLOCK(env, infop);
+ if ((t_ret = __memp_region_bhfree(infop)) != 0 && ret == 0)
+ ret = t_ret;
}
not_priv:
/* Discard DB_MPOOLFILEs. */
while ((dbmfp = TAILQ_FIRST(&dbmp->dbmfq)) != NULL)
- if ((t_ret = __memp_fclose(dbmfp, DB_FLUSH)) != 0 && ret == 0)
- ret = t_ret;
+ if ((t_ret = __memp_fclose(dbmfp, DB_FLUSH)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ break;
+ }
/* Discard DB_MPREGs. */
if (dbmp->pg_inout != NULL)
@@ -618,3 +637,62 @@ not_priv:
env->mp_handle = NULL;
return (ret);
}
+
+/*
+ * __memp_region_bhfree --
+ * Discard the buffers for a region.
+ *
+ * PUBLIC: int __memp_region_bhfree __P((REGINFO *));
+ */
+int
+__memp_region_bhfree(infop)
+ REGINFO *infop;
+{
+ BH *bhp;
+ BH_FROZEN_ALLOC *frozen_alloc;
+ DB_MPOOL *dbmp;
+ DB_MPOOL_HASH *hp;
+ ENV *env;
+ MPOOL *c_mp;
+ u_int32_t bucket;
+ int ret, t_ret;
+
+ env = infop->env;
+ dbmp = env->mp_handle;
+ ret = 0;
+
+ /* Discard buffers. */
+ c_mp = infop->primary;
+ for (hp = R_ADDR(infop, c_mp->htab), bucket = 0;
+ bucket < c_mp->htab_buckets; ++hp, ++bucket) {
+ while ((bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) != NULL)
+ if (F_ISSET(bhp, BH_FROZEN))
+ SH_TAILQ_REMOVE(&hp->hash_bucket,
+ bhp, hq, __bh);
+ else {
+ if (F_ISSET(bhp, BH_DIRTY)) {
+ atomic_dec(env, &hp->hash_page_dirty);
+ F_CLR(bhp, BH_DIRTY | BH_DIRTY_CREATE);
+ }
+ atomic_inc(env, &bhp->ref);
+ if ((t_ret = __memp_bhfree(dbmp, infop,
+ R_ADDR(dbmp->reginfo, bhp->mf_offset),
+ hp, bhp, BH_FREE_FREEMEM |
+ BH_FREE_UNLOCKED)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ break;
+ }
+ }
+ }
+ MPOOL_REGION_LOCK(env, infop);
+ while ((frozen_alloc = SH_TAILQ_FIRST(
+ &c_mp->alloc_frozen, __bh_frozen_a)) != NULL) {
+ SH_TAILQ_REMOVE(&c_mp->alloc_frozen,
+ frozen_alloc, links, __bh_frozen_a);
+ __env_alloc_free(infop, frozen_alloc);
+ }
+ MPOOL_REGION_UNLOCK(env, infop);
+
+ return (ret);
+}
diff --git a/src/mp/mp_register.c b/src/mp/mp_register.c
index dc7015a7..cc59af9c 100644
--- a/src/mp/mp_register.c
+++ b/src/mp/mp_register.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/mp/mp_resize.c b/src/mp/mp_resize.c
index 97719554..932a1baa 100644
--- a/src/mp/mp_resize.c
+++ b/src/mp/mp_resize.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2006, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2006, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -126,12 +126,13 @@ __memp_merge_buckets(dbmp, new_nbuckets, old_bucket, new_bucket)
MPOOLFILE *mfp;
REGINFO *new_infop, *old_infop;
u_int32_t bucket, high_mask, new_region, old_region;
- int ret;
+ int expanding, ret;
env = dbmp->env;
mp = dbmp->reginfo[0].primary;
new_bhp = NULL;
ret = 0;
+ expanding = (mp->nbuckets > new_nbuckets) ? 0 : 1;
MP_MASK(new_nbuckets, high_mask);
@@ -150,36 +151,42 @@ __memp_merge_buckets(dbmp, new_nbuckets, old_bucket, new_bucket)
/*
* Before merging, we need to check that there are no old buffers left
* in the target hash bucket after a previous split.
+ * Only free the buffers if we are expanding into new buckets. If
+ * we are contracting, the buffers in the original (old) bucket should
+ * not be freed.
*/
free_old:
- MUTEX_LOCK(env, new_hp->mtx_hash);
- SH_TAILQ_FOREACH(bhp, &new_hp->hash_bucket, hq, __bh) {
- MP_BUCKET(bhp->mf_offset, bhp->pgno, mp->nbuckets, bucket);
+ if (expanding != 0) {
+ MUTEX_LOCK(env, new_hp->mtx_hash);
+ SH_TAILQ_FOREACH(bhp, &new_hp->hash_bucket, hq, __bh) {
+ MP_BUCKET(
+ bhp->mf_offset, bhp->pgno, mp->nbuckets, bucket);
+
+ if (bucket != new_bucket) {
+ /*
+ * There is no way that an old buffer can be
+ * locked after a split, since everyone will
+ * look for it in the new hash bucket.
+ */
+ DB_ASSERT(env, !F_ISSET(bhp, BH_DIRTY) &&
+ atomic_read(&bhp->ref) == 0);
+ atomic_inc(env, &bhp->ref);
+ mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
+ if ((ret = __memp_bhfree(dbmp, new_infop,
+ mfp, new_hp, bhp, BH_FREE_FREEMEM)) != 0) {
+ MUTEX_UNLOCK(env, new_hp->mtx_hash);
+ return (ret);
+ }
- if (bucket != new_bucket) {
- /*
- * There is no way that an old buffer can be locked
- * after a split, since everyone will look for it in
- * the new hash bucket.
- */
- DB_ASSERT(env, !F_ISSET(bhp, BH_DIRTY) &&
- atomic_read(&bhp->ref) == 0);
- atomic_inc(env, &bhp->ref);
- mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
- if ((ret = __memp_bhfree(dbmp, new_infop,
- mfp, new_hp, bhp, BH_FREE_FREEMEM)) != 0) {
- MUTEX_UNLOCK(env, new_hp->mtx_hash);
- return (ret);
+ /*
+ * The free has modified the list of buffers and
+ * dropped the mutex. We need to start again.
+ */
+ goto free_old;
}
-
- /*
- * The free has modified the list of buffers and
- * dropped the mutex. We need to start again.
- */
- goto free_old;
}
+ MUTEX_UNLOCK(env, new_hp->mtx_hash);
}
- MUTEX_UNLOCK(env, new_hp->mtx_hash);
/*
* Before we begin, make sure that all of the buffers we care about are
@@ -305,7 +312,9 @@ err: atomic_dec(env, &bhp->ref);
next_bhp, alloc_bhp, vc, __bh);
}
- DB_ASSERT(env, new_hp->mtx_hash != old_hp->mtx_hash);
+ /* The mutexes must be different, unless they aren't in use. */
+ DB_ASSERT(env, new_hp->mtx_hash != old_hp->mtx_hash ||
+ new_hp->mtx_hash == MUTEX_INVALID);
MUTEX_LOCK(env, new_hp->mtx_hash);
SH_TAILQ_INSERT_TAIL(&new_hp->hash_bucket, new_bhp, hq);
if (F_ISSET(new_bhp, BH_DIRTY))
@@ -362,16 +371,15 @@ __memp_add_region(dbmp)
MPOOL *mp;
REGINFO *infop;
int ret;
- roff_t cache_size, reg_size;
+ roff_t reg_size;
u_int i;
u_int32_t *regids;
env = dbmp->env;
mp = dbmp->reginfo[0].primary;
- cache_size = (roff_t)mp->gbytes * GIGABYTE + mp->bytes;
/* All cache regions are the same size. */
- reg_size = dbmp->reginfo[0].rp->size;
+ reg_size = dbmp->reginfo[0].rp->max;
ret = 0;
infop = &dbmp->reginfo[mp->nreg];
@@ -384,9 +392,6 @@ __memp_add_region(dbmp)
if ((ret = __memp_init(env,
dbmp, mp->nreg, mp->htab_buckets, mp->max_nreg)) != 0)
return (ret);
- cache_size += reg_size;
- mp->gbytes = (u_int32_t)(cache_size / GIGABYTE);
- mp->bytes = (u_int32_t)(cache_size % GIGABYTE);
regids = R_ADDR(dbmp->reginfo, mp->regids);
regids[mp->nreg++] = infop->id;
@@ -425,16 +430,13 @@ __memp_remove_region(dbmp)
{
DB_MPOOL_HASH *hp;
ENV *env;
- MPOOL *mp;
+ MPOOL *mp, *c_mp;
REGINFO *infop;
int ret;
- roff_t cache_size, reg_size;
u_int i;
env = dbmp->env;
mp = dbmp->reginfo[0].primary;
- reg_size = dbmp->reginfo[0].rp->size;
- cache_size = (roff_t)mp->gbytes * GIGABYTE + mp->bytes;
ret = 0;
if (mp->nreg == 1) {
@@ -448,21 +450,36 @@ __memp_remove_region(dbmp)
return (ret);
/* Detach from the region then destroy it. */
- infop = &dbmp->reginfo[mp->nreg];
+ infop = &dbmp->reginfo[mp->nreg - 1];
+ c_mp = infop->primary;
+ hp = R_ADDR(infop, c_mp->htab);
+ /*
+ * For a private environment, we need to free everything; for a
+ * non-private environment, we need to refresh the mutexes so
+ * that they are in a ready state for a later resize.
+ */
if (F_ISSET(env, ENV_PRIVATE)) {
- hp = R_ADDR(infop, ((MPOOL*)infop->primary)->htab);
- for (i = 0; i < env->dbenv->mp_mtxcount; i++)
- if ((ret = __mutex_free(env, &hp[i].mtx_hash)) != 0)
+ if ((ret = __memp_region_bhfree(infop)) != 0)
+ return (ret);
+ if (MUTEX_ON(env)) {
+ DB_ASSERT(env,
+ env->dbenv->mp_mtxcount == mp->htab_mutexes);
+ for (i = 0; i < mp->htab_mutexes; i++)
+ if ((ret = __mutex_free(env,
+ &hp[i].mtx_hash)) != 0)
+ return (ret);
+ }
+ __env_alloc_free(infop, hp);
+ } else if (MUTEX_ON(env)) {
+ DB_ASSERT(env, env->dbenv->mp_mtxcount == mp->htab_mutexes);
+ for (i = 0; i < mp->htab_mutexes; i++)
+ if ((ret = __mutex_refresh(env, hp[i].mtx_hash)) != 0)
return (ret);
}
ret = __env_region_detach(env, infop, 1);
- if (ret == 0) {
+ if (ret == 0)
mp->nreg--;
- cache_size -= reg_size;
- mp->gbytes = (u_int32_t)(cache_size / GIGABYTE);
- mp->bytes = (u_int32_t)(cache_size % GIGABYTE);
- }
return (ret);
}
@@ -511,6 +528,9 @@ __memp_map_regions(dbmp)
}
/*
+ * __memp_resize --
+ * Change the overall cache size by adding or removing cache regions.
+ *
* PUBLIC: int __memp_resize __P((DB_MPOOL *, u_int32_t, u_int32_t));
*/
int
@@ -526,7 +546,7 @@ __memp_resize(dbmp, gbytes, bytes)
env = dbmp->env;
mp = dbmp->reginfo[0].primary;
- reg_size = dbmp->reginfo[0].rp->size;
+ reg_size = dbmp->reginfo[0].rp->max;
total_size = (roff_t)gbytes * GIGABYTE + bytes;
ncache = (u_int32_t)((total_size + reg_size / 2) / reg_size);
@@ -546,6 +566,9 @@ __memp_resize(dbmp, gbytes, bytes)
__memp_add_region(dbmp) :
__memp_remove_region(dbmp))) != 0)
break;
+ total_size = reg_size * (roff_t)mp->nreg;
+ mp->gbytes = (u_int32_t)(total_size / GIGABYTE);
+ mp->bytes = (u_int32_t)(total_size % GIGABYTE);
MUTEX_UNLOCK(env, mp->mtx_resize);
return (ret);
@@ -567,13 +590,13 @@ __memp_get_cache_max(dbenv, max_gbytesp, max_bytesp)
env = dbenv->env;
ENV_NOT_CONFIGURED(env,
- env->mp_handle, "DB_ENV->get_mp_max_ncache", DB_INIT_MPOOL);
+ env->mp_handle, "DB_ENV->get_cache_max", DB_INIT_MPOOL);
if (MPOOL_ON(env)) {
/* Cannot be set after open, no lock required to read. */
dbmp = env->mp_handle;
mp = dbmp->reginfo[0].primary;
- reg_size = dbmp->reginfo[0].rp->size;
+ reg_size = dbmp->reginfo[0].rp->max;
max_size = mp->max_nreg * reg_size;
*max_gbytesp = (u_int32_t)(max_size / GIGABYTE);
*max_bytesp = (u_int32_t)(max_size % GIGABYTE);
diff --git a/src/mp/mp_stat.c b/src/mp/mp_stat.c
index 246b44d7..81ea35c1 100644
--- a/src/mp/mp_stat.c
+++ b/src/mp/mp_stat.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -133,7 +133,14 @@ __memp_stat(env, gspp, fspp, flags)
sp->st_ro_evict += c_mp->stat.st_ro_evict;
sp->st_rw_evict += c_mp->stat.st_rw_evict;
sp->st_page_trickle += c_mp->stat.st_page_trickle;
+ sp->st_mvcc_reused += c_mp->stat.st_mvcc_reused;
sp->st_pages += c_mp->pages;
+ /* Undocumented field used by tests only. */
+ sp->st_oddfsize_detect +=
+ c_mp->stat.st_oddfsize_detect;
+ /* Undocumented field used by tests only. */
+ sp->st_oddfsize_resolve +=
+ c_mp->stat.st_oddfsize_resolve;
/*
* st_page_dirty calculated by __memp_stat_hash
* st_page_clean calculated here
@@ -195,7 +202,12 @@ __memp_stat(env, gspp, fspp, flags)
/* Count the MPOOLFILE structures. */
i = 0;
- len = 0;
+ /*
+ * Allow space for the first __memp_get_files() to align the
+ * structure array to uintmax_t, DB_MPOOL_STAT's most
+ * restrictive field. [#23150]
+ */
+ len = sizeof(uintmax_t);
if ((ret = __memp_walk_files(env,
mp, __memp_count_files, &len, &i, flags)) != 0)
return (ret);
@@ -252,6 +264,11 @@ __memp_file_stats(env, mfp, argp, countp, flags)
return (0);
}
+/*
+ * __memp_count_files --
+ * This __memp_walk_files() iterator counts the number of files as well as
+ * the space needed for their statistics, including file names.
+ */
static int
__memp_count_files(env, mfp, argp, countp, flags)
ENV *env;
@@ -277,13 +294,25 @@ __memp_count_files(env, mfp, argp, countp, flags)
/*
* __memp_get_files --
- * get file specific statistics
+ * get another file's specific statistics
*
- * Build each individual entry. We assume that an array of pointers are
- * aligned correctly to be followed by an array of structures, which should
- * be safe (in this particular case, the first element of the structure
- * is a pointer, so we're doubly safe). The array is followed by space
- * for the text file names.
+ * Add a file statistics entry to the current list. The chunk of memory
+ * starts with an array of DB_MPOOL_FSTAT pointers, a null pointer to mark
+ * the last one, then an aligned array of DB_MPOOL_FSTAT structures, then
+ * character space for the file names.
+ * +-----------------------------------------------+
+ * | count * DB_MPOOL_FSTAT pointers |
+ * +-----------------------------------------------+
+ * | null pointer |
+ * +-----------------------------------------------+
+ * | [space for aligning DB_MPOOL_FSTAT array] |
+ * +-----------------------------------------------+
+ * | count * DB_MPOOL_FSTAT structs |
+ * +-----------------------------------------------+
+ * | first file name | second file name | third... |
+ * +-----------------------------------------------+
+ * | file name | ... |
+ * +-----------------------------------------------+
*/
static int
__memp_get_files(env, mfp, argp, countp, flags)
@@ -305,11 +334,21 @@ __memp_get_files(env, mfp, argp, countp, flags)
tfsp = *(DB_MPOOL_FSTAT ***)argp;
if (*tfsp == NULL) {
- /* Add 1 to count because we need to skip over the NULL. */
- tstruct = (DB_MPOOL_FSTAT *)(tfsp + *countp + 1);
- tname = (char *)(tstruct + *countp);
+ /*
+ * Add 1 to count to skip over the NULL end marker.
+ * Align it further for DB_MPOOL_STAT's most restrictive field
+ * because uintmax_t might require stricter alignment than
+ * pointers; e.g., IP32 LL64 SPARC. [#23150]
+ */
+ tstruct = (DB_MPOOL_FSTAT *)&tfsp[*countp + 1];
+ tstruct = ALIGNP_INC(tstruct, sizeof(uintmax_t));
+ tname = (char *)&tstruct[*countp];
*tfsp = tstruct;
} else {
+ /*
+ * This stat struct follows the previous one; the file name
+ * follows the previous entry's filename.
+ */
tstruct = *tfsp + 1;
tname = (*tfsp)->file_name + strlen((*tfsp)->file_name) + 1;
*++tfsp = tstruct;
@@ -486,6 +525,8 @@ __memp_print_stats(env, flags)
(u_long)gsp->st_mvcc_thawed);
__db_dl(env, "The number of frozen buffers freed",
(u_long)gsp->st_mvcc_freed);
+ __db_dl(env, "The number of outdated intermediate versions reused",
+ (u_long)gsp->st_mvcc_reused);
__db_dl(env, "The number of page allocations", (u_long)gsp->st_alloc);
__db_dl(env,
"The number of hash buckets examined during allocations",
@@ -744,11 +785,18 @@ __memp_print_hash(env, dbmp, reginfo, fmap, flags)
vbhp != NULL;
vbhp = SH_CHAIN_PREV(vbhp, vc, __bh)) {
__memp_print_bh(env, dbmp,
- " next:\t", vbhp, fmap);
+ " prev:\t", vbhp, fmap);
}
}
MUTEX_UNLOCK(env, hp->mtx_hash);
}
+#ifdef DIAGNOSTIC
+ SH_TAILQ_FOREACH(bhp, &c_mp->free_frozen, hq, __bh) {
+ __db_msg(env, "free frozen %lu pgno %lu mtx_buf %lu",
+ (u_long)R_OFFSET(dbmp->reginfo, bhp),
+ (u_long)bhp->pgno, (u_long)bhp->mtx_buf);
+ }
+#endif
return (0);
}
@@ -775,6 +823,7 @@ __memp_print_bh(env, dbmp, prefix, bhp, fmap)
{ BH_FROZEN, "frozen" },
{ BH_TRASH, "trash" },
{ BH_THAWED, "thawed" },
+ { BH_UNREACHABLE, "unreachable" },
{ 0, NULL }
};
DB_MSGBUF mb;
diff --git a/src/mp/mp_sync.c b/src/mp/mp_sync.c
index fa06b1d4..82d5c8de 100644
--- a/src/mp/mp_sync.c
+++ b/src/mp/mp_sync.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -95,9 +95,11 @@ __memp_discard_all_mpfs (env, mp)
while ((mfp = SH_TAILQ_FIRST(
&hp->hash_bucket, __mpoolfile)) != NULL) {
MUTEX_LOCK(env, mfp->mutex);
- if ((t_ret = __memp_mf_discard(dbmp, mfp, 1)) != 0 &&
- ret == 0)
- ret = t_ret;
+ if ((t_ret = __memp_mf_discard(dbmp, mfp, 1)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ break;
+ }
}
MUTEX_UNLOCK(env, hp->mtx_hash);
}
@@ -837,6 +839,7 @@ __memp_mf_sync(dbmp, mfp, locked)
MPOOLFILE *mfp;
int locked;
{
+ APPNAME appname;
DB_FH *fhp;
DB_MPOOL_HASH *hp;
ENV *env;
@@ -846,6 +849,7 @@ __memp_mf_sync(dbmp, mfp, locked)
COMPQUIET(hp, NULL);
env = dbmp->env;
+ appname = DB_APP_DATA;
/*
* We need to be holding the hash lock: we're using the path name
@@ -859,13 +863,20 @@ __memp_mf_sync(dbmp, mfp, locked)
MUTEX_LOCK(env, hp->mtx_hash);
}
- if ((ret = __db_appname(env, DB_APP_DATA,
+mpsync: if ((ret = __db_appname(env, appname,
R_ADDR(dbmp->reginfo, mfp->path_off), NULL, &rpath)) == 0) {
if ((ret = __os_open(env, rpath, 0, 0, 0, &fhp)) == 0) {
ret = __os_fsync(env, fhp);
if ((t_ret =
__os_closehandle(env, fhp)) != 0 && ret == 0)
ret = t_ret;
+ } else {
+ /* We may be syncing the blob meta db. */
+ if (appname != DB_APP_BLOB) {
+ __os_free(env, rpath);
+ appname = DB_APP_BLOB;
+ goto mpsync;
+ }
}
__os_free(env, rpath);
}
diff --git a/src/mp/mp_trickle.c b/src/mp/mp_trickle.c
index fba528b3..ff8cb875 100644
--- a/src/mp/mp_trickle.c
+++ b/src/mp/mp_trickle.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/mutex/mut_alloc.c b/src/mutex/mut_alloc.c
index 5df3de53..06b3541e 100644
--- a/src/mutex/mut_alloc.c
+++ b/src/mutex/mut_alloc.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -9,6 +9,9 @@
#include "db_config.h"
#include "db_int.h"
+#include "dbinc/log.h"
+
+static char *__mutex_action_print __P((MUTEX_ACTION));
/*
* __mutex_alloc --
@@ -35,8 +38,7 @@ __mutex_alloc(env, alloc_id, flags, indxp)
if (alloc_id != MTX_APPLICATION && alloc_id != MTX_MUTEX_TEST &&
(F_ISSET(env->dbenv, DB_ENV_NOLOCKING) ||
(!F_ISSET(env, ENV_THREAD) &&
- (LF_ISSET(DB_MUTEX_PROCESS_ONLY) ||
- F_ISSET(env, ENV_PRIVATE)))))
+ (LF_ISSET(DB_MUTEX_PROCESS_ONLY) || F_ISSET(env, ENV_PRIVATE)))))
return (0);
/* Private environments never share mutexes. */
@@ -109,13 +111,17 @@ nomem: __db_errx(env, DB_STR("2034",
mtxregion->stat.st_mutex_max)
cnt = mtxregion->stat.st_mutex_max -
mtxregion->stat.st_mutex_cnt;
+
+ /* Set i to the first newly created db_mutex_t. */
if (F_ISSET(env, ENV_PRIVATE)) {
F_SET(&mtxmgr->reginfo, REGION_TRACKED);
while (__env_alloc(&mtxmgr->reginfo,
(cnt * mtxregion->mutex_size) +
- mtxregion->stat.st_mutex_align, &i) != 0)
- if ((cnt >> 1) == 0)
+ mtxregion->stat.st_mutex_align, &i) != 0) {
+ cnt >>= 1;
+ if (cnt == 0)
break;
+ }
F_CLR(&mtxmgr->reginfo, REGION_TRACKED);
i = (db_mutex_t)ALIGNP_INC(i,
mtxregion->stat.st_mutex_align);
@@ -130,21 +136,16 @@ nomem: __db_errx(env, DB_STR("2034",
}
if (cnt == 0)
goto nomem;
- mutexp = MUTEXP_SET(env, i);
+
mtxregion->stat.st_mutex_free = cnt;
mtxregion->mutex_next = i;
mtxregion->stat.st_mutex_cnt += cnt;
- while (--cnt > 0) {
- mutexp->flags = 0;
- if (F_ISSET(env, ENV_PRIVATE))
- mutexp->mutex_next_link =
- (uintptr_t)(mutexp + 1);
- else
- mutexp->mutex_next_link = ++i;
- mutexp++;
- }
- mutexp->flags = 0;
- mutexp->mutex_next_link = MUTEX_INVALID;
+
+ /*
+ * Now link the rest of the newly allocated db_mutex_t's into
+ * the free list.
+ */
+ MUTEX_BULK_INIT(env, mtxregion, i, cnt);
}
*indxp = mtxregion->mutex_next;
@@ -158,14 +159,12 @@ nomem: __db_errx(env, DB_STR("2034",
if (mtxregion->stat.st_mutex_inuse > mtxregion->stat.st_mutex_inuse_max)
mtxregion->stat.st_mutex_inuse_max =
mtxregion->stat.st_mutex_inuse;
- if (locksys)
- MUTEX_SYSTEM_UNLOCK(env);
/* Initialize the mutex. */
memset(mutexp, 0, sizeof(*mutexp));
F_SET(mutexp, DB_MUTEX_ALLOCATED |
- LF_ISSET(DB_MUTEX_LOGICAL_LOCK |
- DB_MUTEX_PROCESS_ONLY | DB_MUTEX_SHARED));
+ LF_ISSET(DB_MUTEX_LOGICAL_LOCK | DB_MUTEX_PROCESS_ONLY |
+ DB_MUTEX_SELF_BLOCK | DB_MUTEX_SHARED));
/*
* If the mutex is associated with a single process, set the process
@@ -182,7 +181,9 @@ nomem: __db_errx(env, DB_STR("2034",
#endif
if ((ret = __mutex_init(env, *indxp, flags)) != 0)
- (void)__mutex_free_int(env, locksys, indxp);
+ (void)__mutex_free_int(env, 0, indxp);
+ if (locksys)
+ MUTEX_SYSTEM_UNLOCK(env);
return (ret);
}
@@ -262,6 +263,44 @@ __mutex_free_int(env, locksys, indxp)
return (ret);
}
+#ifdef HAVE_FAILCHK_BROADCAST
+/*
+ * __mutex_died --
+ * Announce that a mutex request couldn't be granted because the last
+ * thread to own it was killed by failchk. Sets ENV_DEAD_MUTEX in the
+ * possibly shared environment so that mutex unlock calls don't complain.
+ *
+ *
+ * PUBLIC: int __mutex_died __P((ENV *, db_mutex_t));
+ */
+int
+__mutex_died(env, mutex)
+ ENV *env;
+ db_mutex_t mutex;
+{
+ DB_ENV *dbenv;
+ DB_EVENT_MUTEX_DIED_INFO info;
+ DB_MUTEX *mutexp;
+ char tidstr[DB_THREADID_STRLEN], failmsg[DB_FAILURE_SYMPTOM_SIZE];
+
+ dbenv = env->dbenv;
+
+ mutexp = MUTEXP_SET(env, mutex);
+ info.mutex = mutex;
+ info.pid = mutexp->pid;
+ info.tid = mutexp->tid;
+ (void)dbenv->thread_id_string(dbenv, mutexp->pid, mutexp->tid, tidstr);
+ (void)__mutex_describe(env, mutex, info.desc);
+ (void)snprintf(failmsg, sizeof(failmsg), DB_STR_A("2073",
+ "Mutex died: %s owned %s", "%s %s"), tidstr, info.desc);
+ __db_errx(env, "%s", failmsg);
+ /* If this is the first crashed process, save its description. */
+ (void)__env_failure_remember(env, failmsg);
+ DB_EVENT(env, DB_EVENT_MUTEX_DIED, &info);
+ return (__env_panic(env, USR_ERR(env, DB_RUNRECOVERY)));
+}
+#endif
+
/*
* __mutex_refresh --
* Reinitialize a mutex, if we are not sure of its state.
@@ -289,3 +328,154 @@ __mutex_refresh(env, mutex)
}
return (ret);
}
+
+/*
+ * __mutex_record_lock --
+ * Record that this thread is about to lock a latch.
+ * The last parameter is updated to point to this mutex's entry in the
+ * per-thread mutex state array, so that it can update it if it gets the
+ * mutex, or free it if the mutex is not acquired (e.g. it times out).
+ * Mutexes which can be unlocked by other threads are not placed in this
+ * list, because it would be too costly for that other thread to find
+ * the right slot to clear. The caller has already checked that thread
+ * tracking is enabled.
+ *
+ * PUBLIC: int __mutex_record_lock
+ * PUBLIC: __P((ENV *, db_mutex_t, MUTEX_ACTION, MUTEX_STATE **));
+ */
+int
+__mutex_record_lock(env, mutex, action, retp)
+ ENV *env;
+ db_mutex_t mutex;
+ MUTEX_ACTION action;
+ MUTEX_STATE **retp;
+{
+ DB_MUTEX *mutexp;
+ DB_THREAD_INFO *ip;
+ int i, ret;
+
+ *retp = NULL;
+ mutexp = MUTEXP_SET(env, mutex);
+ if (!F_ISSET(mutexp, DB_MUTEX_SHARED))
+ return (0);
+ if ((ret = __env_set_state(env, &ip, THREAD_VERIFY)) != 0)
+ return (ret);
+ for (i = 0; i != MUTEX_STATE_MAX; i++) {
+ if (ip->dbth_latches[i].action == MUTEX_ACTION_UNLOCKED) {
+ ip->dbth_latches[i].mutex = mutex;
+ ip->dbth_latches[i].action = action;
+#ifdef DIAGNOSTIC
+ __os_gettime(env, &ip->dbth_latches[i].when, 0);
+#endif
+ *retp = &ip->dbth_latches[i];
+ return (0);
+ }
+ }
+ __db_errx(env, DB_STR_A("2074",
+ "No space available in latch table for %lu", "%lu"), (u_long)mutex);
+ (void)__mutex_record_print(env, ip);
+ return (__env_panic(env, USR_ERR(env, DB_RUNRECOVERY)));
+}
+
+/*
+ * __mutex_record_unlock --
+ * Verify that this thread owns the mutex it is about to unlock.
+ *
+ * PUBLIC: int __mutex_record_unlock __P((ENV *, db_mutex_t));
+ */
+int
+__mutex_record_unlock(env, mutex)
+ ENV *env;
+ db_mutex_t mutex;
+{
+ DB_MUTEX *mutexp;
+ DB_THREAD_INFO *ip;
+ int i, ret;
+
+ if (env->thr_hashtab == NULL)
+ return (0);
+ mutexp = MUTEXP_SET(env, mutex);
+ if (!F_ISSET(mutexp, DB_MUTEX_SHARED))
+ return (0);
+ if ((ret = __env_set_state(env, &ip, THREAD_VERIFY)) != 0)
+ return (ret);
+ for (i = 0; i != MUTEX_STATE_MAX; i++) {
+ if (ip->dbth_latches[i].mutex == mutex &&
+ ip->dbth_latches[i].action != MUTEX_ACTION_UNLOCKED) {
+ ip->dbth_latches[i].action = MUTEX_ACTION_UNLOCKED;
+ return (0);
+ }
+ }
+ (void)__mutex_record_print(env, ip);
+ if (ip->dbth_state == THREAD_FAILCHK) {
+ DB_DEBUG_MSG(env, "mutex_record_unlock %lu by failchk thread",
+ (u_long)mutex);
+ return (0);
+ }
+ __db_errx(env, DB_STR_A("2075",
+ "Latch %lu was not held", "%lu"), (u_long)mutex);
+ return (__env_panic(env, USR_ERR(env, DB_RUNRECOVERY)));
+}
+
+static char *
+__mutex_action_print(action)
+ MUTEX_ACTION action;
+{
+ switch (action) {
+ case MUTEX_ACTION_UNLOCKED:
+ return ("unlocked");
+ case MUTEX_ACTION_INTEND_SHARE:
+ return ("waiting to share");
+ case MUTEX_ACTION_SHARED:
+ return ("sharing");
+ default:
+ return ("unknown");
+ }
+ /* NOTREACHED */
+}
+
+/*
+ * __mutex_record_print --
+ * Display the thread's mutex state via __db_msg(), including any
+ * information which would be relevant for db_stat or diagnostic messages.
+ *
+ * PUBLIC: int __mutex_record_print __P((ENV *, DB_THREAD_INFO *));
+ */
+int
+__mutex_record_print(env, ip)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+{
+ DB_MSGBUF mb, *mbp;
+ db_mutex_t mutex;
+ int i;
+ char desc[DB_MUTEX_DESCRIBE_STRLEN];
+ char time_buf[CTIME_BUFLEN];
+
+ DB_MSGBUF_INIT(&mb);
+ mbp = &mb;
+ for (i = 0; i != MUTEX_STATE_MAX; i++) {
+ if (ip->dbth_latches[i].action == MUTEX_ACTION_UNLOCKED)
+ continue;
+ if ((mutex = ip->dbth_latches[i].mutex) ==
+ MUTEX_INVALID)
+ continue;
+ time_buf[4] = '\0';
+#ifdef DIAGNOSTIC
+ if (timespecisset(&ip->dbth_latches[i].when))
+ (void)__db_ctimespec(&ip->dbth_latches[i].when,
+ time_buf);
+ else
+#endif
+ time_buf[0] = '\0';
+
+ __db_msgadd(env, mbp, "%s %s %s ",
+ __mutex_describe(env, mutex, desc),
+ __mutex_action_print(ip->dbth_latches[i].action), time_buf);
+#ifdef HAVE_STATISTICS
+ __mutex_print_debug_stats(env, mbp, mutex, 0);
+#endif
+ DB_MSGBUF_FLUSH(env, mbp);
+ }
+ return (0);
+}
diff --git a/src/mutex/mut_failchk.c b/src/mutex/mut_failchk.c
index 1425389f..28e5d992 100644
--- a/src/mutex/mut_failchk.c
+++ b/src/mutex/mut_failchk.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -9,68 +9,193 @@
#include "db_config.h"
#include "db_int.h"
+#include "dbinc/lock.h"
+
+static int __mutex_failchk_single __P((ENV *, db_mutex_t, DB_THREAD_INFO *));
/*
- * __mut_failchk --
- * Check for mutexes held by dead processes.
+ * __mutex_failchk --
+ * Clean up after dead processes which left behind allocated per-process or
+ * locked mutexes.
*
- * PUBLIC: int __mut_failchk __P((ENV *));
+ * PUBLIC: int __mutex_failchk __P((ENV *));
*/
int
-__mut_failchk(env)
+__mutex_failchk(env)
ENV *env;
{
- DB_ENV *dbenv;
- DB_MUTEX *mutexp;
+ DB_HASHTAB *htab;
DB_MUTEXMGR *mtxmgr;
DB_MUTEXREGION *mtxregion;
- db_mutex_t i;
- int ret;
- char buf[DB_THREADID_STRLEN];
- db_threadid_t unused;
+ DB_THREAD_INFO *ip;
+ db_mutex_t mutex;
+ unsigned i;
+ int count;
- if (F_ISSET(env, ENV_PRIVATE))
+ if (F_ISSET(env, ENV_PRIVATE) || (htab = env->thr_hashtab) == NULL)
return (0);
- DB_THREADID_INIT(unused);
-
- dbenv = env->dbenv;
mtxmgr = env->mutex_handle;
mtxregion = mtxmgr->reginfo.primary;
- ret = 0;
+ count = 0;
+ DB_ASSERT(env, F_ISSET(env->dbenv, DB_ENV_FAILCHK));
MUTEX_SYSTEM_LOCK(env);
- for (i = 1; i <= mtxregion->stat.st_mutex_cnt; ++i, ++mutexp) {
- mutexp = MUTEXP_SET(env, i);
- /*
- * We're looking for per-process mutexes where the process
- * has died.
- */
- if (!F_ISSET(mutexp, DB_MUTEX_ALLOCATED) ||
- !F_ISSET(mutexp, DB_MUTEX_PROCESS_ONLY))
+ /*
+ * The first loop does each thread's read-locked latches; the second
+ * does all locked mutexes.
+ */
+ for (i = 0; i < env->thr_nbucket; i++)
+ SH_TAILQ_FOREACH(ip, &htab[i], dbth_links, __db_thread_info) {
+ if (ip->dbth_state == THREAD_SLOT_NOT_IN_USE)
+ continue;
+ count += __mutex_failchk_thread(env, ip);
+ }
+
+ for (mutex = 1; mutex <= mtxregion->stat.st_mutex_cnt; mutex++)
+ if (__mutex_failchk_single(env, mutex, NULL) != 0)
+ count++;
+
+ MUTEX_SYSTEM_UNLOCK(env);
+
+ if (count == 0)
+ return (count);
+ else
+ return (USR_ERR(env, DB_RUNRECOVERY));
+}
+
+/*
+ * __mutex_failchk_thread --
+ * Do the per-latch failchk work on each of this thread's shared latches.
+ *
+ * PUBLIC: int __mutex_failchk_thread __P((ENV *, DB_THREAD_INFO *));
+ */
+int
+__mutex_failchk_thread(env, ip)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+{
+ db_mutex_t mutex;
+ int count, i;
+
+ count = 0;
+ for (i = 0; i != MUTEX_STATE_MAX; i++) {
+ if (ip->dbth_latches[i].action == MUTEX_ACTION_UNLOCKED ||
+ (mutex = ip->dbth_latches[i].mutex) == MUTEX_INVALID)
continue;
+ if (__mutex_failchk_single(env, mutex, ip) != 0)
+ count++;
+ }
+ return (count);
+}
+/*
+ * __mutex_failchk_single --
+ * Determine whether this mutex is locked or shared by a potentially
+ * dead thread. If so, and the call to is_alive() finds that it is dead,
+ * clean up if possible (a process-only mutex); else wake up any waiters.
+ */
+static int
+__mutex_failchk_single(env, mutex, ip)
+ ENV *env;
+ db_mutex_t mutex;
+ DB_THREAD_INFO *ip;
+{
+ DB_ENV *dbenv;
+ DB_MUTEX *mutexp;
+ db_threadid_t threadid;
+ pid_t pid;
+ int already_dead, ret;
+ u_int32_t flags;
+ char id_str[DB_THREADID_STRLEN];
+ char mtx_desc[DB_MUTEX_DESCRIBE_STRLEN];
+
+ dbenv = env->dbenv;
+ mutexp = MUTEXP_SET(env, mutex);
+ flags = mutexp->flags;
+ /*
+ * Filter out mutexes which couldn't possibly be "interesting", in order
+ * to reduce the number of possibly costly is_alive() calls. Check that:
+ * it is allocated
+ * it is either locked, or a shared latch, or a per-process mutex
+ * it is neither a logical lock, nor self-block, nor already dead.
+ * Self-blocking mutexes are skipped because it is expected that they
+ * can still be locked even though they are really 'idle', as with
+ * the wait case in __lock_get_internal(), LOG->free_commits, and
+ * __rep_waiter->mtx_repwait; or they were allocated by the application.
+ */
+ if (!LF_ISSET(DB_MUTEX_ALLOCATED))
+ return (0);
+ if (!LF_ISSET(
+ DB_MUTEX_SHARED | DB_MUTEX_LOCKED | DB_MUTEX_PROCESS_ONLY))
+ return (0);
+ if (LF_ISSET(
+ DB_MUTEX_SELF_BLOCK | DB_MUTEX_LOGICAL_LOCK | DB_MUTEX_OWNER_DEAD))
+ return (0);
+
+ already_dead = ip != NULL && timespecisset(&ip->dbth_failtime);
+ /*
+ * The pid in the mutex is valid for locked or per-process mutexes.
+ * The tid is correct only when exclusively locked. It's okay to look at
+ * the tid of an unlocked per-process mutex, we won't use it in the
+ * is_alive() call.
+ */
+ if (LF_ISSET(DB_MUTEX_LOCKED | DB_MUTEX_PROCESS_ONLY)) {
+ pid = mutexp->pid;
+ threadid = mutexp->tid;
+ } else {
+ DB_ASSERT(env, LF_ISSET(DB_MUTEX_SHARED));
/*
- * The thread that allocated the mutex may have exited, but
- * we cannot reclaim the mutex if the process is still alive.
+ * If we get here with no thread, then this is a shared latch
+ * which is neither locked nor shared; we're done with it.
*/
- if (dbenv->is_alive(
- dbenv, mutexp->pid, unused, DB_MUTEX_PROCESS_ONLY))
- continue;
+ if (ip == NULL)
+ return (0);
+ pid = ip->dbth_pid;
+ threadid = ip->dbth_tid;
+ }
+ if (!already_dead && dbenv->is_alive(dbenv,
+ pid, threadid, LF_ISSET(DB_MUTEX_PROCESS_ONLY)))
+ return (0);
+
+ /* The thread is dead; the mutex type indicates the kind of cleanup. */
+ (void)dbenv->thread_id_string(dbenv, pid, threadid, id_str);
+ (void)__mutex_describe(env, mutex, mtx_desc);
- __db_msg(env, DB_STR_A("2017",
- "Freeing mutex for process: %s", "%s"),
- dbenv->thread_id_string(dbenv, mutexp->pid, unused, buf));
+ if (LF_ISSET(DB_MUTEX_PROCESS_ONLY)) {
+ if (already_dead)
+ return (0);
+
+ __db_errx(env, DB_STR_A("2065",
+ "Freeing %s for process: %s", "%s %s"), mtx_desc, id_str);
+
+ /* Clear the mutex id if it is in a cached locker. */
+ if ((ret = __lock_local_locker_invalidate(env, mutex)) != 0)
+ return (ret);
/* Unlock and free the mutex. */
- if (F_ISSET(mutexp, DB_MUTEX_LOCKED))
- MUTEX_UNLOCK(env, i);
+ if (LF_ISSET(DB_MUTEX_LOCKED))
+ MUTEX_UNLOCK(env, mutex);
- if ((ret = __mutex_free_int(env, 0, &i)) != 0)
- break;
+ return (__mutex_free_int(env, 0, &mutex));
}
- MUTEX_SYSTEM_UNLOCK(env);
-
- return (ret);
+#ifdef HAVE_FAILCHK_BROADCAST
+ else if (LF_ISSET(DB_MUTEX_LOCKED)) {
+ __db_errx(env, DB_STR_A("2066",
+ "Marking %s as owned by dead thread %s", "%lu %s"),
+ mtx_desc, id_str);
+ F_SET(mutexp, DB_MUTEX_OWNER_DEAD);
+ } else if (LF_ISSET(DB_MUTEX_SHARED)) {
+ __db_errx(env, DB_STR_A("2067",
+ "Marking %s as shared by dead thread %s", "%lu %s"),
+ mtx_desc, id_str);
+ F_SET(mutexp, DB_MUTEX_OWNER_DEAD);
+ } else {
+ __db_errx(env, DB_STR_A("2068",
+ "mutex_failchk: unknown state for %s with dead thread %s", "%lu %s"),
+ mtx_desc, id_str);
+ }
+#endif
+ return (USR_ERR(env, DB_RUNRECOVERY));
}
diff --git a/src/mutex/mut_fcntl.c b/src/mutex/mut_fcntl.c
deleted file mode 100644
index 0694aa59..00000000
--- a/src/mutex/mut_fcntl.c
+++ /dev/null
@@ -1,248 +0,0 @@
-/*-
- * See the file LICENSE for redistribution information.
- *
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
- *
- * $Id$
- */
-
-#include "db_config.h"
-
-#include "db_int.h"
-
-static inline int __db_fcntl_mutex_lock_int
- __P((ENV *, db_mutex_t, db_timeout_t, int));
-
-/*
- * __db_fcntl_mutex_init --
- * Initialize a fcntl mutex.
- *
- * PUBLIC: int __db_fcntl_mutex_init __P((ENV *, db_mutex_t, u_int32_t));
- */
-int
-__db_fcntl_mutex_init(env, mutex, flags)
- ENV *env;
- db_mutex_t mutex;
- u_int32_t flags;
-{
- COMPQUIET(env, NULL);
- COMPQUIET(mutex, MUTEX_INVALID);
- COMPQUIET(flags, 0);
-
- return (0);
-}
-
-/*
- * __db_fcntl_mutex_lock_int
- * Internal function to lock a mutex, blocking only when requested
- */
-inline int
-__db_fcntl_mutex_lock_int(env, mutex, timeout, wait)
- ENV *env;
- db_mutex_t mutex;
- db_timeout_t timeout;
- int wait;
-{
- DB_ENV *dbenv;
- DB_MUTEX *mutexp;
- DB_THREAD_INFO *ip;
- struct flock k_lock;
- int locked, ms, ret;
- db_timespec now, timespec;
- db_timeout_t time_left;
-
- dbenv = env->dbenv;
-
- if (!MUTEX_ON(env) || F_ISSET(dbenv, DB_ENV_NOLOCKING))
- return (0);
-
- mutexp = MUTEXP_SET(env, mutex);
-
- CHECK_MTX_THREAD(env, mutexp);
-
-#ifdef HAVE_STATISTICS
- if (F_ISSET(mutexp, DB_MUTEX_LOCKED))
- ++mutexp->mutex_set_wait;
- else
- ++mutexp->mutex_set_nowait;
-#endif
-
- /* Initialize the lock. */
- k_lock.l_whence = SEEK_SET;
- k_lock.l_start = mutex;
- k_lock.l_len = 1;
-
- if (timeout != 0) {
- timespecclear(&timespec);
- __clock_set_expires(env, &timespec, timeout);
- }
-
- /*
- * Only check the thread state once, by initializing the thread
- * control block pointer to null. If it is not the failchk
- * thread, then ip will have a valid value subsequent times
- * in the loop.
- */
- ip = NULL;
-
- for (locked = 0;;) {
- /*
- * Wait for the lock to become available; wait 1ms initially,
- * up to 1 second.
- */
- for (ms = 1; F_ISSET(mutexp, DB_MUTEX_LOCKED);) {
- if (F_ISSET(dbenv, DB_ENV_FAILCHK) &&
- ip == NULL && dbenv->is_alive(dbenv,
- mutexp->pid, mutexp->tid, 0) == 0) {
- ret = __env_set_state(env, &ip, THREAD_VERIFY);
- if (ret != 0 ||
- ip->dbth_state == THREAD_FAILCHK)
- return (DB_RUNRECOVERY);
- }
- if (!wait)
- return (DB_LOCK_NOTGRANTED);
- if (timeout != 0) {
- timespecclear(&now);
- if (__clock_expired(env, &now, &timespec))
- return (DB_TIMEOUT);
- DB_TIMESPEC_TO_TIMEOUT(time_left, &now, 0);
- time_left = timeout - time_left;
- if (ms * US_PER_MS > time_left)
- ms = time_left / US_PER_MS;
- }
- __os_yield(NULL, 0, ms * US_PER_MS);
- if ((ms <<= 1) > MS_PER_SEC)
- ms = MS_PER_SEC;
- }
-
- /* Acquire an exclusive kernel lock on the byte. */
- k_lock.l_type = F_WRLCK;
- if (fcntl(env->lockfhp->fd, F_SETLKW, &k_lock))
- goto err;
-
- /* If the resource is still available, it's ours. */
- if (!F_ISSET(mutexp, DB_MUTEX_LOCKED)) {
- locked = 1;
-
- F_SET(mutexp, DB_MUTEX_LOCKED);
- dbenv->thread_id(dbenv, &mutexp->pid, &mutexp->tid);
- }
-
- /* Release the kernel lock. */
- k_lock.l_type = F_UNLCK;
- if (fcntl(env->lockfhp->fd, F_SETLK, &k_lock))
- goto err;
-
- /*
- * If we got the resource lock we're done.
- *
- * !!!
- * We can't check to see if the lock is ours, because we may
- * be trying to block ourselves in the lock manager, and so
- * the holder of the lock that's preventing us from getting
- * the lock may be us! (Seriously.)
- */
- if (locked)
- break;
- }
-
-#ifdef DIAGNOSTIC
- /*
- * We want to switch threads as often as possible. Yield every time
- * we get a mutex to ensure contention.
- */
- if (F_ISSET(dbenv, DB_ENV_YIELDCPU))
- __os_yield(env, 0, 0);
-#endif
- return (0);
-
-err: ret = __os_get_syserr();
- __db_syserr(env, ret, DB_STR("2019", "fcntl lock failed"));
- return (__env_panic(env, __os_posix_err(ret)));
-}
-
-/*
- * __db_fcntl_mutex_lock
- * Lock a mutex, blocking if necessary.
- *
- * PUBLIC: int __db_fcntl_mutex_lock __P((ENV *, db_mutex_t, db_timeout_t));
- */
-int
-__db_fcntl_mutex_lock(env, mutex, timeout)
- ENV *env;
- db_mutex_t mutex;
- db_timeout_t timeout;
-{
- return (__db_fcntl_mutex_lock_int(env, mutex, timeout, 1));
-}
-
-/*
- * __db_fcntl_mutex_trylock
- * Try to lock a mutex, without blocking when it is busy.
- *
- * PUBLIC: int __db_fcntl_mutex_trylock __P((ENV *, db_mutex_t));
- */
-int
-__db_fcntl_mutex_trylock(env, mutex)
- ENV *env;
- db_mutex_t mutex;
-{
- return (__db_fcntl_mutex_lock_int(env, mutex, 0, 0));
-}
-
-/*
- * __db_fcntl_mutex_unlock --
- * Release a mutex.
- *
- * PUBLIC: int __db_fcntl_mutex_unlock __P((ENV *, db_mutex_t));
- */
-int
-__db_fcntl_mutex_unlock(env, mutex)
- ENV *env;
- db_mutex_t mutex;
-{
- DB_ENV *dbenv;
- DB_MUTEX *mutexp;
-
- dbenv = env->dbenv;
-
- if (!MUTEX_ON(env) || F_ISSET(dbenv, DB_ENV_NOLOCKING))
- return (0);
-
- mutexp = MUTEXP_SET(env, mutex);
-
-#ifdef DIAGNOSTIC
- if (!F_ISSET(mutexp, DB_MUTEX_LOCKED)) {
- __db_errx(env, DB_STR("2020",
- "fcntl unlock failed: lock already unlocked"));
- return (__env_panic(env, EACCES));
- }
-#endif
-
- /*
- * Release the resource. We don't have to acquire any locks because
- * processes trying to acquire the lock are waiting for the flag to
- * go to 0. Once that happens the waiters will serialize acquiring
- * an exclusive kernel lock before locking the mutex.
- */
- F_CLR(mutexp, DB_MUTEX_LOCKED);
-
- return (0);
-}
-
-/*
- * __db_fcntl_mutex_destroy --
- * Destroy a mutex.
- *
- * PUBLIC: int __db_fcntl_mutex_destroy __P((ENV *, db_mutex_t));
- */
-int
-__db_fcntl_mutex_destroy(env, mutex)
- ENV *env;
- db_mutex_t mutex;
-{
- COMPQUIET(env, NULL);
- COMPQUIET(mutex, MUTEX_INVALID);
-
- return (0);
-}
diff --git a/src/mutex/mut_method.c b/src/mutex/mut_method.c
index cb666082..99bafeae 100644
--- a/src/mutex/mut_method.c
+++ b/src/mutex/mut_method.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -371,6 +371,33 @@ __mutex_set_tas_spins(dbenv, tas_spins)
return (0);
}
+#ifdef HAVE_ERROR_HISTORY
+/*
+ * __mutex_diags --
+ * If __db_deferred_get() supplies a message buffer, pass it to
+ * __db_remember_context() together with the error code, then append
+ * "Mutex <n> " and, when HAVE_STATISTICS is defined, the mutex's debug
+ * statistics. When no buffer is available nothing is recorded.
+ *
+ * Always returns the 'error' argument unchanged.
+ *
+ * PUBLIC: #ifdef HAVE_ERROR_HISTORY
+ * PUBLIC: int __mutex_diags __P((ENV *, db_mutex_t, int));
+ * PUBLIC: #endif
+ */
+int
+__mutex_diags(env, mutex, error)
+ ENV *env;
+ db_mutex_t mutex;
+ int error;
+{
+ DB_MSGBUF *mb;
+
+ if ((mb = __db_deferred_get()) != NULL) {
+ (void)__db_remember_context(env, mb, error);
+ __db_msgadd(env, mb, "Mutex %u ", (unsigned int)mutex);
+#ifdef HAVE_STATISTICS
+ __mutex_print_debug_stats(env, mb, mutex, 0);
+#endif
+ }
+ return (error); /* The caller's error code is passed through as-is. */
+}
+#endif
+
#if !defined(HAVE_ATOMIC_SUPPORT) && defined(HAVE_MUTEX_SUPPORT)
/*
* Provide atomic operations for platforms which have mutexes yet do not have
diff --git a/src/mutex/mut_pthread.c b/src/mutex/mut_pthread.c
index 1ec4fb9c..4b2cfb81 100644
--- a/src/mutex/mut_pthread.c
+++ b/src/mutex/mut_pthread.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -64,6 +64,19 @@
} while (0)
/*
+ * !!!
+ * Solaris bug workaround: pthread_cond_wait() sometimes returns ETIME -- out
+ * of sheer paranoia, check both ETIME and ETIMEDOUT. We believe this happens
+ * when the application uses SIGALRM for some purpose, e.g., the C library sleep
+ * call, and Solaris delivers the signal to the wrong LWP.
+ */
+#ifdef ETIME
+#define ETIME_TO_ETIMEDOUT(ret) ((ret) == ETIME ? ETIMEDOUT : (ret))
+#else
+#define ETIME_TO_ETIMEDOUT(ret) (ret)
+#endif
+
+/*
* __db_pthread_mutex_init --
* Initialize a pthread mutex: either a native one or
* just the mutex for block/wakeup of a hybrid test-and-set mutex
@@ -104,18 +117,18 @@ __db_pthread_mutex_init(env, mutex, flags)
pthread_rwlockattr_t rwlockattr, *rwlockattrp = NULL;
#ifndef HAVE_MUTEX_THREAD_ONLY
if (!LF_ISSET(DB_MUTEX_PROCESS_ONLY)) {
- RET_SET((pthread_rwlockattr_init(&rwlockattr)), ret);
+ RET_SET(pthread_rwlockattr_init(&rwlockattr), ret);
if (ret != 0)
goto err;
- RET_SET((pthread_rwlockattr_setpshared(
- &rwlockattr, PTHREAD_PROCESS_SHARED)), ret);
+ RET_SET(pthread_rwlockattr_setpshared(
+ &rwlockattr, PTHREAD_PROCESS_SHARED), ret);
rwlockattrp = &rwlockattr;
}
#endif
if (ret == 0)
- RET_SET((pthread_rwlock_init(&mutexp->u.rwlock,
- rwlockattrp)), ret);
+ RET_SET(pthread_rwlock_init(&mutexp->u.rwlock,
+ rwlockattrp), ret);
if (rwlockattrp != NULL)
(void)pthread_rwlockattr_destroy(rwlockattrp);
@@ -127,18 +140,18 @@ __db_pthread_mutex_init(env, mutex, flags)
#endif
#ifndef HAVE_MUTEX_THREAD_ONLY
if (!LF_ISSET(DB_MUTEX_PROCESS_ONLY)) {
- RET_SET((pthread_mutexattr_init(&mutexattr)), ret);
+ RET_SET(pthread_mutexattr_init(&mutexattr), ret);
if (ret != 0)
goto err;
- RET_SET((pthread_mutexattr_setpshared(
- &mutexattr, PTHREAD_PROCESS_SHARED)), ret);
+ RET_SET(pthread_mutexattr_setpshared(
+ &mutexattr, PTHREAD_PROCESS_SHARED), ret);
mutexattrp = &mutexattr;
}
#endif
if (ret == 0)
RET_SET(
- (pthread_mutex_init(&mutexp->u.m.mutex, mutexattrp)), ret);
+ pthread_mutex_init(&mutexp->u.m.mutex, mutexattrp), ret);
if (mutexattrp != NULL)
(void)pthread_mutexattr_destroy(mutexattrp);
@@ -147,19 +160,19 @@ __db_pthread_mutex_init(env, mutex, flags)
if (LF_ISSET(DB_MUTEX_SELF_BLOCK)) {
#ifndef HAVE_MUTEX_THREAD_ONLY
if (!LF_ISSET(DB_MUTEX_PROCESS_ONLY)) {
- RET_SET((pthread_condattr_init(&condattr)), ret);
+ RET_SET(pthread_condattr_init(&condattr), ret);
if (ret != 0)
goto err;
condattrp = &condattr;
- RET_SET((pthread_condattr_setpshared(
- &condattr, PTHREAD_PROCESS_SHARED)), ret);
+ RET_SET(pthread_condattr_setpshared(
+ &condattr, PTHREAD_PROCESS_SHARED), ret);
}
#endif
if (ret == 0)
- RET_SET((pthread_cond_init(
- &mutexp->u.m.cond, condattrp)), ret);
+ RET_SET(pthread_cond_init(
+ &mutexp->u.m.cond, condattrp), ret);
F_SET(mutexp, DB_MUTEX_SELF_BLOCK);
if (condattrp != NULL)
@@ -239,6 +252,9 @@ __db_pthread_mutex_prep(env, mutex, mutexp, exclusive)
{
DB_ENV *dbenv;
DB_THREAD_INFO *ip;
+#ifdef HAVE_FAILCHK_BROADCAST
+ db_timespec timespec;
+#endif
int ret;
dbenv = env->dbenv;
@@ -266,13 +282,32 @@ __db_pthread_mutex_prep(env, mutex, mutexp, exclusive)
* hadn't gone down the 'if
* DB_ENV_FAILCHK' path to start with.
*/
- RET_SET_PTHREAD_LOCK(mutexp, ret);
- break;
+ goto lockit;
}
+ __os_yield(env, 0, 10);
}
}
- } else
- RET_SET_PTHREAD_LOCK(mutexp, ret);
+ } else {
+lockit:
+#ifdef HAVE_FAILCHK_BROADCAST
+ if (dbenv->mutex_failchk_timeout != 0) {
+ timespecclear(&timespec);
+ __clock_set_expires(env,
+ &timespec, dbenv->mutex_failchk_timeout);
+ do {
+ RET_SET_PTHREAD_TIMEDLOCK(mutexp,
+ (struct timespec *)&timespec, ret);
+ ret = ETIME_TO_ETIMEDOUT(ret);
+ if (ret == ETIMEDOUT &&
+ F_ISSET(mutexp, DB_MUTEX_OWNER_DEAD) &&
+ !F_ISSET(dbenv, DB_ENV_FAILCHK))
+ ret = USR_ERR(env,
+ __mutex_died(env, mutex));
+ } while (ret == ETIMEDOUT);
+ } else
+#endif
+ RET_SET_PTHREAD_LOCK(mutexp, ret);
+ }
PERFMON4(env,
mutex, resume, mutex, exclusive, mutexp->alloc_id, mutexp);
@@ -302,49 +337,75 @@ __db_pthread_mutex_condwait(env, mutex, mutexp, timespec)
DB_MUTEX *mutexp;
db_timespec *timespec;
{
+ DB_ENV *dbenv;
int ret;
-
-#ifdef MUTEX_DIAG
- printf("condwait %ld %x wait busy %x count %d\n",
- mutex, pthread_self(), MUTEXP_BUSY_FIELD(mutexp), mutexp->wait);
+#ifdef HAVE_FAILCHK_BROADCAST
+ db_timespec failchk_timespec;
#endif
+
+ dbenv = env->dbenv;
PERFMON4(env, mutex, suspend, mutex, TRUE, mutexp->alloc_id, mutexp);
+#ifdef HAVE_FAILCHK_BROADCAST
+ /*
+ * If the failchk timeout would expire sooner than the timeout passed in
+ * as an argument, use the failchk timeout. The caller handles "short" waits.
+ */
+ if (dbenv->mutex_failchk_timeout != 0) {
+ timespecclear(&failchk_timespec);
+ __clock_set_expires(env,
+ &failchk_timespec, dbenv->mutex_failchk_timeout);
+ if (timespec == NULL ||
+ timespeccmp(timespec, &failchk_timespec, >))
+ timespec = &failchk_timespec;
+ }
+#endif
+
if (timespec != NULL) {
- RET_SET((pthread_cond_timedwait(&mutexp->u.m.cond,
- &mutexp->u.m.mutex, (struct timespec *) timespec)), ret);
+ RET_SET(pthread_cond_timedwait(&mutexp->u.m.cond,
+ &mutexp->u.m.mutex, (struct timespec *) timespec), ret);
+ ret = ETIME_TO_ETIMEDOUT(ret);
+#ifdef HAVE_FAILCHK_BROADCAST
+ if (F_ISSET(mutexp, DB_MUTEX_OWNER_DEAD) &&
+ !F_ISSET(dbenv, DB_ENV_FAILCHK)) {
+ ret = USR_ERR(env, __mutex_died(env, mutex));
+ goto err;
+ }
+#endif
if (ret == ETIMEDOUT) {
ret = DB_TIMEOUT;
- goto ret;
+ goto err;
}
} else
- RET_SET((pthread_cond_wait(&mutexp->u.m.cond,
- &mutexp->u.m.mutex)), ret);
-#ifdef MUTEX_DIAG
- printf("condwait %ld %x wait returns %d busy %x\n",
- mutex, pthread_self(), ret, MUTEXP_BUSY_FIELD(mutexp));
+ RET_SET(pthread_cond_wait(&mutexp->u.m.cond,
+ &mutexp->u.m.mutex), ret);
+#ifdef HAVE_FAILCHK_BROADCAST
+ if (ret == 0 && F_ISSET(mutexp, DB_MUTEX_OWNER_DEAD) &&
+ !F_ISSET(dbenv, DB_ENV_FAILCHK)) {
+ ret = USR_ERR(env, __mutex_died(env, mutex));
+ goto err;
+ }
#endif
/*
* !!!
* Solaris bug workaround: pthread_cond_wait() sometimes returns ETIME
- * -- out of sheer paranoia, check both ETIME and ETIMEDOUT. We
+ * -- out of sheer paranoia, check both ETIME and ETIMEDOUT. We
* believe this happens when the application uses SIGALRM for some
* purpose, e.g., the C library sleep call, and Solaris delivers the
- * signal to the wrong LWP.
+ * signal to the wrong LWP.
*/
if (ret != 0) {
- if (ret == ETIMEDOUT ||
-#ifdef ETIME
- ret == ETIME ||
-#endif
+ if ((ret = ETIME_TO_ETIMEDOUT(ret)) == ETIMEDOUT ||
ret == EINTR)
ret = 0;
- else
+ else {
/* Failure, caller shouldn't condwait again. */
(void)pthread_mutex_unlock(&mutexp->u.m.mutex);
+ (void)MUTEX_ERR(env, mutex, ret);
+ }
}
-ret:
+err:
PERFMON4(env, mutex, resume, mutex, TRUE, mutexp->alloc_id, mutexp);
COMPQUIET(mutex, 0);
@@ -356,7 +417,10 @@ ret:
/*
* __db_pthread_mutex_lock
* Lock on a mutex, blocking if necessary.
- * Timeouts are supported only for self-blocking mutexes.
+ * Timeouts are supported only for self-blocking mutexes. When both a
+ * given timeout and a dbenv-wide failchk timeout are specified, the
+ * given timeout takes precedence -- a process failure might not be noticed
+ * for a little while.
*
* Self-blocking shared latches are not supported.
*
@@ -372,6 +436,7 @@ __db_pthread_mutex_lock(env, mutex, timeout)
{
DB_ENV *dbenv;
DB_MUTEX *mutexp;
+ db_timeout_t checktimeout;
db_timespec timespec;
int ret, t_ret;
@@ -385,7 +450,6 @@ __db_pthread_mutex_lock(env, mutex, timeout)
CHECK_MTX_THREAD(env, mutexp);
-#if defined(HAVE_STATISTICS)
/*
* We want to know which mutexes are contentious, but don't want to
* do an interlocked test here -- that's slower when the underlying
@@ -398,6 +462,11 @@ __db_pthread_mutex_lock(env, mutex, timeout)
else
STAT_INC(env,
mutex, set_nowait, mutexp->mutex_set_nowait, mutex);
+
+ checktimeout = timeout;
+#ifdef HAVE_FAILCHK_BROADCAST
+ if (checktimeout == 0 || checktimeout > dbenv->mutex_failchk_timeout)
+ checktimeout = dbenv->mutex_failchk_timeout;
#endif
/* Single-thread the next block, except during the possible condwait. */
@@ -405,14 +474,12 @@ __db_pthread_mutex_lock(env, mutex, timeout)
goto err;
if (F_ISSET(mutexp, DB_MUTEX_SELF_BLOCK)) {
- if (timeout != 0)
+ if (checktimeout != 0)
timespecclear(&timespec);
while (MUTEXP_IS_BUSY(mutexp)) {
/* Set expiration timer upon first need. */
- if (timeout != 0 && !timespecisset(&timespec)) {
- timespecclear(&timespec);
+ if (checktimeout != 0 && !timespecisset(&timespec))
__clock_set_expires(env, &timespec, timeout);
- }
t_ret = __db_pthread_mutex_condwait(env,
mutex, mutexp, timeout == 0 ? NULL : &timespec);
if (t_ret != 0) {
@@ -428,18 +495,20 @@ __db_pthread_mutex_lock(env, mutex, timeout)
out:
/* #2471: HP-UX can sporadically return EFAULT. See above */
RETRY_ON_EFAULT(pthread_mutex_unlock(&mutexp->u.m.mutex), ret);
- if (ret != 0)
+ if (ret != 0) {
+ (void)MUTEX_ERR(env, mutex, ret);
goto err;
+ }
} else {
#ifdef DIAGNOSTIC
if (F_ISSET(mutexp, DB_MUTEX_LOCKED)) {
char buf[DB_THREADID_STRLEN];
(void)dbenv->thread_id_string(dbenv,
mutexp->pid, mutexp->tid, buf);
+ ret = MUTEX_ERR(env, mutex, EINVAL);
__db_errx(env, DB_STR_A("2022",
"pthread lock failed: lock currently in use: pid/tid: %s",
"%s"), buf);
- ret = EINVAL;
goto err;
}
#endif
@@ -455,6 +524,13 @@ out:
if (F_ISSET(dbenv, DB_ENV_YIELDCPU))
__os_yield(env, 0, 0);
#endif
+#ifdef MUTEX_DIAG
+ if (t_ret == 0) {
+ __os_gettime(env, &mutexp->mutex_history.when, 0);
+ __os_stack_text(env, mutexp->mutex_history.stacktext,
+ sizeof(mutexp->mutex_history.stacktext), 12, 2);
+ }
+#endif
return (t_ret);
err:
@@ -479,6 +555,10 @@ __db_pthread_mutex_readlock(env, mutex)
{
DB_ENV *dbenv;
DB_MUTEX *mutexp;
+ MUTEX_STATE *state;
+#ifdef HAVE_FAILCHK_BROADCAST
+ db_timespec timespec;
+#endif
int ret;
dbenv = env->dbenv;
@@ -491,7 +571,6 @@ __db_pthread_mutex_readlock(env, mutex)
CHECK_MTX_THREAD(env, mutexp);
-#if defined(HAVE_STATISTICS)
/*
* We want to know which mutexes are contentious, but don't want to
* do an interlocked test here -- that's slower when the underlying
@@ -505,15 +584,52 @@ __db_pthread_mutex_readlock(env, mutex)
else
STAT_INC(env,
mutex, set_rd_nowait, mutexp->mutex_set_rd_nowait, mutex);
-#endif
+
+ state = NULL;
+ if (env->thr_hashtab != NULL && (ret = __mutex_record_lock(env,
+ mutex, MUTEX_ACTION_INTEND_SHARE, &state)) != 0)
+ return (ret);
PERFMON4(env, mutex, suspend, mutex, FALSE, mutexp->alloc_id, mutexp);
- RET_SET((pthread_rwlock_rdlock(&mutexp->u.rwlock)), ret);
+
+#ifdef HAVE_FAILCHK_BROADCAST
+ if (dbenv->mutex_failchk_timeout != 0) {
+ /*
+ * Wait in slices of the failchk timeout so that a dead owner is
+ * noticed. pthread_rwlock_timedrdlock() reports an expired wait as
+ * ETIMEDOUT (ETIME_TO_ETIMEDOUT folds in the ETIME variant), never as
+ * DB_TIMEOUT, so ETIMEDOUT is the value that must restart the wait.
+ * Any other result, success included, ends the loop.
+ */
+ do {
+ timespecclear(&timespec);
+ __clock_set_expires(env,
+ &timespec, dbenv->mutex_failchk_timeout);
+ RET_SET(pthread_rwlock_timedrdlock(&mutexp->u.rwlock,
+ (struct timespec *)&timespec), ret);
+ ret = ETIME_TO_ETIMEDOUT(ret);
+ if (F_ISSET(mutexp, DB_MUTEX_OWNER_DEAD) &&
+ !F_ISSET(dbenv, DB_ENV_FAILCHK)) {
+ /* Give back a lock we did get before reporting the death. */
+ if (ret == 0)
+ RETRY_ON_EFAULT(pthread_rwlock_unlock(
+ &mutexp->u.rwlock), ret);
+ ret = USR_ERR(env, __mutex_died(env, mutex));
+ goto err;
+ }
+ } while (ret == ETIMEDOUT);
+ } else
+#endif
+ RET_SET(pthread_rwlock_rdlock(&mutexp->u.rwlock), ret);
+
PERFMON4(env, mutex, resume, mutex, FALSE, mutexp->alloc_id, mutexp);
DB_ASSERT(env, !F_ISSET(mutexp, DB_MUTEX_LOCKED));
if (ret != 0)
goto err;
+#ifdef HAVE_FAILCHK_BROADCAST
+ if (F_ISSET(mutexp, DB_MUTEX_OWNER_DEAD) &&
+ !F_ISSET(dbenv, DB_ENV_FAILCHK)) {
+ ret = USR_ERR(env, __mutex_died(env, mutex));
+ goto err;
+ }
+#endif
+#ifdef MUTEX_DIAG
+ __os_gettime(env, &mutexp->mutex_history.when, 0);
+ __os_stack_text(env, mutexp->mutex_history.stacktext,
+ sizeof(mutexp->mutex_history.stacktext), 12, 2);
+#endif
#ifdef DIAGNOSTIC
/*
* We want to switch threads as often as possible. Yield every time
@@ -524,7 +640,10 @@ __db_pthread_mutex_readlock(env, mutex)
#endif
return (0);
-err: __db_err(env, ret, DB_STR("2024", "pthread readlock failed"));
+err:
+ if (state != NULL)
+ state->action = MUTEX_ACTION_UNLOCKED;
+ __db_err(env, ret, DB_STR("2024", "pthread readlock failed"));
return (__env_panic(env, ret));
}
#endif
@@ -532,8 +651,10 @@ err: __db_err(env, ret, DB_STR("2024", "pthread readlock failed"));
#ifdef HAVE_MUTEX_HYBRID
/*
* __db_hybrid_mutex_suspend
- * Suspend this thread until the mutex is free enough to give the caller a
- * good chance of getting the mutex in the requested exclusivity mode.
+ * Suspend this thread, usually until the mutex is free enough to give the
+ * caller a good chance of getting the mutex in the requested exclusivity
+ * mode. Return early if the timeout is reached or the mutex's owner is
+ * found to be dead.
*
* The major difference between this and the old __db_pthread_mutex_lock()
* is the additional 'exclusive' parameter.
@@ -551,6 +672,9 @@ __db_hybrid_mutex_suspend(env, mutex, timespec, exclusive)
int exclusive;
{
DB_MUTEX *mutexp;
+#ifdef HAVE_FAILCHECK_BROADCAST
+ db_timespec failchk_timespec;
+#endif
int ret, t_ret;
t_ret = 0;
@@ -571,7 +695,7 @@ __db_hybrid_mutex_suspend(env, mutex, timespec, exclusive)
* before checking the wait counter.
*/
mutexp->wait++;
- MUTEX_MEMBAR(mutexp->wait);
+ (void)MUTEX_MEMBAR(mutexp->wait);
while (exclusive ? MUTEXP_IS_BUSY(mutexp) :
atomic_read(&mutexp->sharecount) == MUTEX_SHARE_ISEXCLUSIVE) {
t_ret = __db_pthread_mutex_condwait(env,
@@ -582,7 +706,7 @@ __db_hybrid_mutex_suspend(env, mutex, timespec, exclusive)
ret = t_ret;
goto err;
}
- MUTEX_MEMBAR(mutexp->flags);
+ (void)MUTEX_MEMBAR(mutexp->flags);
}
mutexp->wait--;
@@ -627,8 +751,8 @@ __db_pthread_mutex_unlock(env, mutex)
DB_ENV *dbenv;
DB_MUTEX *mutexp;
int ret;
-#if defined(MUTEX_DIAG) && defined(HAVE_MUTEX_HYBRID)
- int waiters;
+#ifndef HAVE_MUTEX_HYBRID
+ char description[DB_MUTEX_DESCRIBE_STRLEN];
#endif
dbenv = env->dbenv;
@@ -637,14 +761,13 @@ __db_pthread_mutex_unlock(env, mutex)
return (0);
mutexp = MUTEXP_SET(env, mutex);
-#if defined(MUTEX_DIAG) && defined(HAVE_MUTEX_HYBRID)
- waiters = mutexp->wait;
-#endif
-#if !defined(HAVE_MUTEX_HYBRID) && defined(DIAGNOSTIC)
+#if !defined(HAVE_MUTEX_HYBRID)
if (!F_ISSET(mutexp, DB_MUTEX_LOCKED | DB_MUTEX_SHARED)) {
- __db_errx(env, DB_STR("2025",
- "pthread unlock failed: lock already unlocked"));
+ if (!PANIC_ISSET(env))
+ __db_errx(env, DB_STR("2069",
+ "pthread unlock %s: already unlocked"),
+ __mutex_describe(env, mutex, description));
return (__env_panic(env, EACCES));
}
#endif
@@ -662,14 +785,19 @@ __db_pthread_mutex_unlock(env, mutex)
if (F_ISSET(mutexp, DB_MUTEX_SHARED))
RET_SET(
- (pthread_cond_broadcast(&mutexp->u.m.cond)), ret);
+ pthread_cond_broadcast(&mutexp->u.m.cond), ret);
else
- RET_SET((pthread_cond_signal(&mutexp->u.m.cond)), ret);
+ RET_SET(pthread_cond_signal(&mutexp->u.m.cond), ret);
if (ret != 0)
goto err;
} else {
#ifndef HAVE_MUTEX_HYBRID
- F_CLR(mutexp, DB_MUTEX_LOCKED);
+
+ if (F_ISSET(mutexp, DB_MUTEX_LOCKED))
+ F_CLR(mutexp, DB_MUTEX_LOCKED);
+ else if (env->thr_hashtab != NULL &&
+ (ret = __mutex_record_unlock(env, mutex)) != 0)
+ goto err;
#endif
}
@@ -685,12 +813,6 @@ err: if (ret != 0) {
__db_err(env, ret, "pthread unlock failed");
return (__env_panic(env, ret));
}
-#if defined(MUTEX_DIAG) && defined(HAVE_MUTEX_HYBRID)
- if (!MUTEXP_IS_BUSY(mutexp) && mutexp->wait != 0)
- printf("unlock %ld %x busy %x waiters %d/%d\n",
- mutex, pthread_self(), ret,
- MUTEXP_BUSY_FIELD(mutexp), waiters, mutexp->wait);
-#endif
return (ret);
}
@@ -739,7 +861,7 @@ __db_pthread_mutex_destroy(env, mutex)
if (!failchk_thread)
#endif
RET_SET(
- (pthread_rwlock_destroy(&mutexp->u.rwlock)), ret);
+ pthread_rwlock_destroy(&mutexp->u.rwlock), ret);
/* For rwlocks, we're done - must not destroy rest of union */
return (ret);
#endif
@@ -754,15 +876,14 @@ __db_pthread_mutex_destroy(env, mutex)
#ifdef HAVE_PTHREAD_COND_REINIT_OKAY
if (!failchk_thread)
#endif
- RET_SET((pthread_cond_destroy(&mutexp->u.m.cond)), ret);
+ RET_SET(pthread_cond_destroy(&mutexp->u.m.cond), ret);
if (ret != 0)
__db_err(env, ret, DB_STR("2026",
"unable to destroy cond"));
}
- RET_SET((pthread_mutex_destroy(&mutexp->u.m.mutex)), t_ret);
+ RET_SET(pthread_mutex_destroy(&mutexp->u.m.mutex), t_ret);
if (t_ret != 0 && !failchk_thread) {
- __db_err(env, t_ret, DB_STR("2027",
- "unable to destroy mutex"));
+ __db_err(env, t_ret, DB_STR("2027", "unable to destroy mutex"));
if (ret == 0)
ret = t_ret;
}
diff --git a/src/mutex/mut_region.c b/src/mutex/mut_region.c
index 26ae0a03..976ff231 100644
--- a/src/mutex/mut_region.c
+++ b/src/mutex/mut_region.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -34,7 +34,7 @@ __mutex_open(env, create_ok)
DB_MUTEXMGR *mtxmgr;
DB_MUTEXREGION *mtxregion;
size_t size;
- u_int32_t cpu_count;
+ u_int32_t cpu_count, tas_spins;
int ret;
#ifndef HAVE_ATOMIC_SUPPORT
u_int i;
@@ -55,8 +55,14 @@ __mutex_open(env, create_ok)
dbenv->mutex_align = MUTEX_ALIGN;
if (dbenv->mutex_tas_spins == 0) {
cpu_count = __os_cpu_count();
- if ((ret = __mutex_set_tas_spins(dbenv, cpu_count == 1 ?
- cpu_count : cpu_count * MUTEX_SPINS_PER_PROCESSOR)) != 0)
+ if (cpu_count == 1)
+ tas_spins = 1;
+ else {
+ tas_spins = cpu_count * MUTEX_SPINS_PER_PROCESSOR;
+ if (tas_spins > MUTEX_SPINS_DEFAULT_MAX)
+ tas_spins = MUTEX_SPINS_DEFAULT_MAX;
+ }
+ if ((ret = __mutex_set_tas_spins(dbenv, tas_spins)) != 0)
return (ret);
}
@@ -118,11 +124,29 @@ __mutex_open(env, create_ok)
return (0);
-err: env->mutex_handle = NULL;
- if (mtxmgr->reginfo.addr != NULL)
- (void)__env_region_detach(env, &mtxmgr->reginfo, 0);
+err: (void)__mutex_region_detach(env, mtxmgr);
+ return (ret);
+}
- __os_free(env, mtxmgr);
+/*
+ * __mutex_region_detach --
+ * Tear down a mutex manager handle: if its region is attached
+ * (reginfo.addr is set) detach from it, then free the handle and clear
+ * env->mutex_handle. A NULL mtxmgr is accepted and does nothing.
+ *
+ * Returns the result of __env_region_detach(), or 0 when no detach was
+ * needed. The handle is freed even when the detach reports an error.
+ *
+ * PUBLIC: int __mutex_region_detach __P((ENV *, DB_MUTEXMGR *));
+ */
+int
+__mutex_region_detach(env, mtxmgr)
+ ENV *env;
+ DB_MUTEXMGR *mtxmgr;
+{
+ int ret;
+
+ ret = 0;
+ if (mtxmgr != NULL) {
+ if (mtxmgr->reginfo.addr != NULL)
+ ret = __env_region_detach(env, &mtxmgr->reginfo, 0);
+ __os_free(env, mtxmgr);
+ env->mutex_handle = NULL; /* mtxmgr has just been freed. */
+ }
return (ret);
}
@@ -136,7 +160,6 @@ __mutex_region_init(env, mtxmgr)
DB_MUTEXMGR *mtxmgr;
{
DB_ENV *dbenv;
- DB_MUTEX *mutexp;
DB_MUTEXREGION *mtxregion;
db_mutex_t mutex;
int ret;
@@ -144,8 +167,6 @@ __mutex_region_init(env, mtxmgr)
dbenv = env->dbenv;
- COMPQUIET(mutexp, NULL);
-
if ((ret = __env_alloc(&mtxmgr->reginfo,
sizeof(DB_MUTEXREGION), &mtxmgr->reginfo.primary)) != 0) {
__db_errx(env, DB_STR("2013",
@@ -205,26 +226,11 @@ __mutex_region_init(env, mtxmgr)
* in each link.
*/
env->mutex_handle = mtxmgr;
- if (F_ISSET(env, ENV_PRIVATE)) {
- mutexp = (DB_MUTEX *)mutex_array;
- mutexp++;
- mutexp = ALIGNP_INC(mutexp, mtxregion->stat.st_mutex_align);
- mtxregion->mutex_next = (db_mutex_t)mutexp;
- } else {
- mtxregion->mutex_next = 1;
- mutexp = MUTEXP_SET(env, 1);
- }
- for (mutex = 1; mutex < mtxregion->stat.st_mutex_cnt; ++mutex) {
- mutexp->flags = 0;
- if (F_ISSET(env, ENV_PRIVATE))
- mutexp->mutex_next_link = (db_mutex_t)(mutexp + 1);
- else
- mutexp->mutex_next_link = mutex + 1;
- mutexp++;
- mutexp = ALIGNP_INC(mutexp, mtxregion->stat.st_mutex_align);
- }
- mutexp->flags = 0;
- mutexp->mutex_next_link = MUTEX_INVALID;
+ mtxregion->mutex_next = (F_ISSET(env, ENV_PRIVATE) ?
+ ((uintptr_t)mutex_array + mtxregion->mutex_size) : 1);
+ MUTEX_BULK_INIT(env,
+ mtxregion, mtxregion->mutex_next, mtxregion->stat.st_mutex_cnt);
+
mtxregion->stat.st_mutex_free = mtxregion->stat.st_mutex_cnt;
mtxregion->stat.st_mutex_inuse = mtxregion->stat.st_mutex_inuse_max = 0;
if ((ret = __mutex_alloc(env, MTX_MUTEX_REGION, 0, &mutex)) != 0)
diff --git a/src/mutex/mut_stat.c b/src/mutex/mut_stat.c
index b64207fa..af622c7d 100644
--- a/src/mutex/mut_stat.c
+++ b/src/mutex/mut_stat.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -19,6 +19,17 @@ static int __mutex_print_stats __P((ENV *, u_int32_t));
static void __mutex_print_summary __P((ENV *));
static int __mutex_stat __P((ENV *, DB_MUTEX_STAT **, u_int32_t));
+/*
+ * Printable names for the DB_MUTEX flag bits. The table is handed to
+ * __db_prflags() by the mutex statistics and description routines in this
+ * file, so each label appears verbatim in diagnostic output.
+ */
+static const FN MutexFlagNames[] = {
+ { DB_MUTEX_ALLOCATED, "alloc" },
+ { DB_MUTEX_LOCKED, "locked" },
+ { DB_MUTEX_LOGICAL_LOCK, "logical" },
+ { DB_MUTEX_OWNER_DEAD, "owner-dead" },
+ { DB_MUTEX_PROCESS_ONLY, "process-private" },
+ { DB_MUTEX_SELF_BLOCK, "self-block" },
+ { DB_MUTEX_SHARED, "shared" },
+ { 0, NULL }
+};
+
/*
* __mutex_stat_pp --
* ENV->mutex_stat pre/post processing.
@@ -170,11 +181,12 @@ __mutex_print_summary(env)
size = 0;
if (F_ISSET(env, ENV_PRIVATE)) {
- mutexp = (DB_MUTEX *)mtxmgr->mutex_array + 1;
+ mutexp = (DB_MUTEX *)((uintptr_t)mtxmgr->mutex_array +
+ mtxregion->mutex_size);
chunk = NULL;
size = __env_elem_size(env,
ROFF_TO_P(mtxregion->mutex_off_alloc));
- size -= sizeof(*mutexp);
+ size -= mtxregion->mutex_size;
} else
mutexp = MUTEXP_SET(env, 1);
for (i = 1; i <= mtxregion->stat.st_mutex_cnt; ++i) {
@@ -185,13 +197,15 @@ __mutex_print_summary(env)
else
counts[mutexp->alloc_id]++;
- mutexp++;
+ mutexp = (DB_MUTEX *)((uintptr_t)mutexp +
+ mtxregion->mutex_size);
if (F_ISSET(env, ENV_PRIVATE) &&
(size -= sizeof(*mutexp)) < sizeof(*mutexp)) {
mutexp =
__env_get_chunk(&mtxmgr->reginfo, &chunk, &size);
+ mutexp = ALIGNP_INC(mutexp,
+ mtxregion->stat.st_mutex_align);
}
- mutexp = ALIGNP_INC(mutexp, mtxregion->stat.st_mutex_align);
}
__db_msg(env, "Mutex counts");
__db_msg(env, "%d\tUnallocated", counts[0]);
@@ -252,14 +266,6 @@ __mutex_print_all(env, flags)
ENV *env;
u_int32_t flags;
{
- static const FN fn[] = {
- { DB_MUTEX_ALLOCATED, "alloc" },
- { DB_MUTEX_LOCKED, "locked" },
- { DB_MUTEX_LOGICAL_LOCK, "logical" },
- { DB_MUTEX_PROCESS_ONLY, "process-private" },
- { DB_MUTEX_SELF_BLOCK, "self-block" },
- { 0, NULL }
- };
DB_MSGBUF mb, *mbp;
DB_MUTEX *mutexp;
DB_MUTEXMGR *mtxmgr;
@@ -294,37 +300,32 @@ __mutex_print_all(env, flags)
__db_msg(env, "mutex\twait/nowait, pct wait, holder, flags");
size = 0;
if (F_ISSET(env, ENV_PRIVATE)) {
- mutexp = (DB_MUTEX *)mtxmgr->mutex_array + 1;
+ mutexp = (DB_MUTEX *)((uintptr_t)mtxmgr->mutex_array +
+ mtxregion->mutex_size);
chunk = NULL;
size = __env_elem_size(env,
ROFF_TO_P(mtxregion->mutex_off_alloc));
- size -= sizeof(*mutexp);
+ size -= mtxregion->mutex_size;
} else
mutexp = MUTEXP_SET(env, 1);
for (i = 1; i <= mtxregion->stat.st_mutex_cnt; ++i) {
if (F_ISSET(mutexp, DB_MUTEX_ALLOCATED)) {
__db_msgadd(env, mbp, "%5lu\t", (u_long)i);
-
__mutex_print_debug_stats(env, mbp,
F_ISSET(env, ENV_PRIVATE) ?
(db_mutex_t)mutexp : i, flags);
-
- if (mutexp->alloc_id != 0)
- __db_msgadd(env, mbp,
- ", %s", __mutex_print_id(mutexp->alloc_id));
-
- __db_prflags(env, mbp, mutexp->flags, fn, " (", ")");
-
DB_MSGBUF_FLUSH(env, mbp);
}
- mutexp++;
+ mutexp = (DB_MUTEX *)((uintptr_t)mutexp +
+ mtxregion->mutex_size);
if (F_ISSET(env, ENV_PRIVATE) &&
- (size -= sizeof(*mutexp)) < sizeof(*mutexp)) {
+ (size -= mtxregion->mutex_size) < mtxregion->mutex_size) {
mutexp =
__env_get_chunk(&mtxmgr->reginfo, &chunk, &size);
+ mutexp = ALIGNP_INC(mutexp,
+ mtxregion->stat.st_mutex_align);
}
- mutexp = ALIGNP_INC(mutexp, mtxregion->stat.st_mutex_align);
}
return (0);
@@ -332,8 +333,7 @@ __mutex_print_all(env, flags)
/*
* __mutex_print_debug_single --
- * Print mutex internal debugging statistics for a single mutex on a
- * single output line.
+ * Print mutex internal debugging statistics for a single mutex.
*
* PUBLIC: void __mutex_print_debug_single
* PUBLIC: __P((ENV *, const char *, db_mutex_t, u_int32_t));
@@ -359,8 +359,9 @@ __mutex_print_debug_single(env, tag, mutex, flags)
/*
* __mutex_print_debug_stats --
- * Print mutex internal debugging statistics, that is, the statistics
- * in the [] square brackets.
+ * Print the mutex internal debugging statistics in square brackets,
+ * followed by the allocation id and flags, on a single line. When MUTEX_DIAG
+ * is on and the mutex is held, append the owner's stack trace.
*
* PUBLIC: void __mutex_print_debug_stats
* PUBLIC: __P((ENV *, DB_MSGBUF *, db_mutex_t, u_int32_t));
@@ -380,6 +381,9 @@ __mutex_print_debug_stats(env, mbp, mutex, flags)
!defined(HAVE_MUTEX_PTHREADS))
int sharecount;
#endif
+#ifdef MUTEX_DIAG
+ char timestr[CTIME_BUFLEN];
+#endif
if (mutex == MUTEX_INVALID) {
__db_msgadd(env, mbp, "[!Set]");
@@ -448,6 +452,22 @@ __mutex_print_debug_stats(env, mbp, mutex, flags)
mutexp->hybrid_wait, mutexp->hybrid_wakeup);
#endif
+ if (mutexp->alloc_id != 0)
+ __db_msgadd(env,
+ mbp, ", %s", __mutex_print_id(mutexp->alloc_id));
+
+ __db_prflags(env, mbp, mutexp->flags, MutexFlagNames, " (", ")");
+#ifdef MUTEX_DIAG
+ if (mutexp->alloc_id != MTX_LOGICAL_LOCK &&
+ timespecisset(&mutexp->mutex_history.when)) {
+ __db_ctimespec(&mutexp->mutex_history.when, timestr);
+ __db_msgadd(env, mbp, "\nLocked %s", timestr);
+ if (mutexp->mutex_history.stacktext[0] != '\0')
+ __db_msgadd(env, mbp, "\n%.*s",
+ (int)sizeof(mutexp->mutex_history.stacktext) - 1,
+ mutexp->mutex_history.stacktext);
+ }
+#endif
if (LF_ISSET(DB_STAT_CLEAR))
__mutex_clear(env, mutex);
}
@@ -495,7 +515,8 @@ __mutex_print_id(alloc_id)
case MTX_TXN_COMMIT: return ("txn commit");
case MTX_TXN_MVCC: return ("txn mvcc");
case MTX_TXN_REGION: return ("txn region");
- default: return ("unknown mutex type");
+ case 0: return ("invalid 0 mutex type");
+ default: return ("unknown non-zero mutex type");
/* NOTREACHED */
}
}
@@ -577,3 +598,39 @@ __mutex_stat_print_pp(dbenv, flags)
return (__db_stat_not_built(dbenv->env));
}
#endif
+
+/*
+ * __mutex_describe --
+ * Format a short diagnostic description of a mutex into dest: its type
+ * (latch or mutex), id and flags, plus the alloc_id name in statistics
+ * builds. dest must hold DB_MUTEX_DESCRIBE_STRLEN bytes; returns dest.
+ *
+ * PUBLIC: char *__mutex_describe __P((ENV *, db_mutex_t, char *));
+ */
+char *
+__mutex_describe(env, mutex, dest)
+ ENV *env;
+ db_mutex_t mutex;
+ char *dest;
+{
+ DB_MUTEX *mutexp;
+ DB_MSGBUF mb, *mbp;
+ const char *type;
+
+ DB_MSGBUF_INIT(&mb);
+ mbp = &mb;
+ mutexp = MUTEXP_SET(env, mutex);
+ type = F_ISSET(mutexp, DB_MUTEX_SHARED) ? "latch" : "mutex";
+#ifdef HAVE_STATISTICS
+ __db_msgadd(env, mbp, "%s %s id %ld ",
+ __mutex_print_id(mutexp->alloc_id), type, (long)mutex);
+ __db_prflags(env, mbp, mutexp->flags, MutexFlagNames, " (", ")");
+#else
+ __db_msgadd(env, mbp, "%s flags %x id %ld ",
+ type, mutexp->flags, (long)mutex);
+#endif
+ (void)snprintf(dest, DB_MUTEX_DESCRIBE_STRLEN - 1,
+ "%.*s", (int)(mbp->cur - mbp->buf), mbp->buf);
+ dest[DB_MUTEX_DESCRIBE_STRLEN - 1] = '\0';
+ return (dest);
+}
diff --git a/src/mutex/mut_stub.c b/src/mutex/mut_stub.c
index 61ecc80c..0ece9a9d 100644
--- a/src/mutex/mut_stub.c
+++ b/src/mutex/mut_stub.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -157,6 +157,16 @@ __mutex_print_debug_stats(env, mbp, mutex, flags)
}
int
+__mutex_refresh(env, mutex)
+ ENV *env;
+ db_mutex_t mutex;
+{
+ COMPQUIET(env, NULL);
+ COMPQUIET(mutex, MUTEX_INVALID);
+ return (0);
+}
+
+int
__mutex_set_align(dbenv, align)
DB_ENV *dbenv;
u_int32_t align;
diff --git a/src/mutex/mut_tas.c b/src/mutex/mut_tas.c
index 0899d237..c7cc3ea5 100644
--- a/src/mutex/mut_tas.c
+++ b/src/mutex/mut_tas.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -52,8 +52,7 @@ __db_tas_mutex_init(env, mutex, flags)
#endif
if (MUTEX_INIT(&mutexp->tas)) {
ret = __os_get_syserr();
- __db_syserr(env, ret, DB_STR("2029",
- "TAS: mutex initialize"));
+ __db_syserr(env, ret, DB_STR("2029", "TAS: mutex initialize"));
return (__os_posix_err(ret));
}
#ifdef HAVE_MUTEX_HYBRID
@@ -66,7 +65,9 @@ __db_tas_mutex_init(env, mutex, flags)
/*
* __db_tas_mutex_lock_int
- * Internal function to lock a mutex, or just try to lock it without waiting
+ * Internal function to lock a mutex, or just try to lock it without
+ * waiting. MUTEX_WAIT() passes in a timeout to allow an early exit
+ * returning DB_TIMEOUT.
*/
inline static int
__db_tas_mutex_lock_int(env, mutex, timeout, nowait)
@@ -80,13 +81,15 @@ __db_tas_mutex_lock_int(env, mutex, timeout, nowait)
DB_MUTEXMGR *mtxmgr;
DB_MUTEXREGION *mtxregion;
DB_THREAD_INFO *ip;
- db_timespec now, timespec;
+ db_timespec now, timeout_timespec;
u_int32_t nspins;
+ u_long micros;
int ret;
-#ifdef HAVE_MUTEX_HYBRID
- const u_long micros = 0;
-#else
- u_long micros, max_micros;
+#ifdef DIAGNOSTIC
+ char buf[DB_THREADID_STRLEN];
+#endif
+#ifndef HAVE_MUTEX_HYBRID
+ u_long max_micros;
db_timeout_t time_left;
#endif
@@ -95,21 +98,23 @@ __db_tas_mutex_lock_int(env, mutex, timeout, nowait)
if (!MUTEX_ON(env) || F_ISSET(dbenv, DB_ENV_NOLOCKING))
return (0);
+ PANIC_CHECK(env);
+
mtxmgr = env->mutex_handle;
mtxregion = mtxmgr->reginfo.primary;
mutexp = MUTEXP_SET(env, mutex);
CHECK_MTX_THREAD(env, mutexp);
-#ifdef HAVE_STATISTICS
if (F_ISSET(mutexp, DB_MUTEX_LOCKED))
STAT_INC(env, mutex, set_wait, mutexp->mutex_set_wait, mutex);
else
STAT_INC(env,
mutex, set_nowait, mutexp->mutex_set_nowait, mutex);
-#endif
-#ifndef HAVE_MUTEX_HYBRID
+#ifdef HAVE_MUTEX_HYBRID
+ micros = 0;
+#else
/*
* Wait 1ms initially, up to 10ms for mutexes backing logical database
* locks, and up to 25 ms for mutual exclusion data structure mutexes.
@@ -119,16 +124,15 @@ __db_tas_mutex_lock_int(env, mutex, timeout, nowait)
max_micros = F_ISSET(mutexp, DB_MUTEX_LOGICAL_LOCK) ? 10000 : 25000;
#endif
- /* Clear the ending timespec so it'll be initialed upon first need. */
+ /* Clear the ending timespec so it'll be initialized upon first need. */
if (timeout != 0)
- timespecclear(&timespec);
+ timespecclear(&timeout_timespec);
/*
- * Only check the thread state once, by initializing the thread
- * control block pointer to null. If it is not the failchk
- * thread, then ip will have a valid value subsequent times
- * in the loop.
- */
+ * Only check the thread state once, by initializing the thread
+ * control block pointer to null. If it is not the failchk thread,
+ * then ip will be valid during the subsequent times in the loop.
+ */
ip = NULL;
loop: /* Attempt to acquire the resource for N spins. */
@@ -151,16 +155,45 @@ loop: /* Attempt to acquire the resource for N spins. */
if (F_ISSET(dbenv, DB_ENV_FAILCHK) &&
ip == NULL && dbenv->is_alive(dbenv,
mutexp->pid, mutexp->tid, 0) == 0) {
+ /*
+ * The process owning the mutex is "dead" now, but it may
+ * have already released the mutex. We need to check again
+ * by going back to the top of the loop if the mutex is
+ * still held by the "dead" process. We yield 10 us to
+ * increase the likelihood of mutexp fields being up-to-date.
+ * Set spin so we spin one more time because there is no need
+ * to spin more if the dead process owns the mutex.
+ */
+ if (nspins > 1) {
+ nspins = 2;
+ __os_yield(env, 0, 10);
+ continue;
+ }
ret = __env_set_state(env, &ip, THREAD_VERIFY);
if (ret != 0 ||
- ip->dbth_state == THREAD_FAILCHK)
- return (DB_RUNRECOVERY);
+ ip->dbth_state == THREAD_FAILCHK) {
+ /*
+ * Either we could not get the thread
+ * state or we did and found that this
+ * is the failchk thread. Return a panic
+ * code in either case, but if this is the
+ * failchk thread, don't give further
+ * notice of the already-existing panic.
+ */
+ if (ret == 0)
+ return (USR_ERR(env,
+ DB_RUNRECOVERY));
+ else
+ return (__env_panic(env,
+ USR_ERR(env, ret)));
+ }
}
if (nowait)
- return (DB_LOCK_NOTGRANTED);
+ return (USR_ERR(env, DB_LOCK_NOTGRANTED));
/*
* Some systems (notably those with newer Intel CPUs)
* need a small pause here. [#6975]
+ * XXX Is there something better for post-Pentium 4 CPUs?
*/
MUTEX_PAUSE
continue;
@@ -189,9 +222,14 @@ loop: /* Attempt to acquire the resource for N spins. */
* the DB mutex unlock function.
*/
#endif
+#ifdef HAVE_FAILCHK_BROADCAST
+ if (F_ISSET(mutexp, DB_MUTEX_OWNER_DEAD)) {
+ MUTEX_UNSET(&mutexp->tas);
+ return (__mutex_died(env, mutex));
+ }
+#endif
#ifdef DIAGNOSTIC
if (F_ISSET(mutexp, DB_MUTEX_LOCKED)) {
- char buf[DB_THREADID_STRLEN];
__db_errx(env, DB_STR_A("2030",
"TAS lock failed: lock %ld currently in use: ID: %s",
"%ld %s"), (long)mutex,
@@ -202,6 +240,12 @@ loop: /* Attempt to acquire the resource for N spins. */
#endif
F_SET(mutexp, DB_MUTEX_LOCKED);
dbenv->thread_id(dbenv, &mutexp->pid, &mutexp->tid);
+#if defined(MUTEX_DIAG)
+ __os_gettime(env, &mutexp->mutex_history.when, 0);
+ /* Why 3? Skip __os_stack_text, __db_tas_mutex_lock{_int,} */
+ __os_stack_text(env, mutexp->mutex_history.stacktext,
+ sizeof(mutexp->mutex_history.stacktext), 12, 3);
+#endif
#ifdef DIAGNOSTIC
/*
@@ -215,20 +259,20 @@ loop: /* Attempt to acquire the resource for N spins. */
}
/*
- * We need to wait for the lock to become available.
- * Possibly setup timeouts if this is the first wait, or
- * check expiration times for the second and subsequent waits.
+ * We need to wait for the lock to become available. Setup timeouts if
+ * this is the first wait, or the failchk timeout is smaller than the
+ * wait timeout. Check expiration times for subsequent waits.
*/
if (timeout != 0) {
/* Set the expiration time if this is the first sleep . */
- if (!timespecisset(&timespec))
- __clock_set_expires(env, &timespec, timeout);
+ if (!timespecisset(&timeout_timespec))
+ __clock_set_expires(env, &timeout_timespec, timeout);
else {
timespecclear(&now);
- if (__clock_expired(env, &now, &timespec))
- return (DB_TIMEOUT);
+ if (__clock_expired(env, &now, &timeout_timespec))
+ return (USR_ERR(env, DB_TIMEOUT));
#ifndef HAVE_MUTEX_HYBRID
- timespecsub(&now, &timespec);
+ timespecsub(&now, &timeout_timespec);
DB_TIMESPEC_TO_TIMEOUT(time_left, &now, 0);
time_left = timeout - time_left;
if (micros > time_left)
@@ -253,13 +297,21 @@ loop: /* Attempt to acquire the resource for N spins. */
goto loop;
/* Wait until the mutex can be obtained exclusively or it times out. */
if ((ret = __db_hybrid_mutex_suspend(env,
- mutex, timeout == 0 ? NULL : &timespec, TRUE)) != 0)
+ mutex, timeout == 0 ? NULL : &timeout_timespec, TRUE)) != 0) {
+ DB_DEBUG_MSG(env,
+ "mutex_lock %ld suspend returned %d", (u_long)mutex, ret);
return (ret);
+ }
#else
if ((micros <<= 1) > max_micros)
micros = max_micros;
#endif
+#ifdef HAVE_FAILCHK_BROADCAST
+ if (F_ISSET(mutexp, DB_MUTEX_OWNER_DEAD) &&
+ dbenv->mutex_failchk_timeout != 0)
+ return (__mutex_died(env, mutex));
+#endif
/*
* We're spinning. The environment might be hung, and somebody else
* has already recovered it. The first thing recovery does is panic
@@ -291,7 +343,7 @@ __db_tas_mutex_lock(env, mutex, timeout)
* Try to exclusively lock a mutex without ever blocking - ever!
*
* Returns 0 on success,
- * DB_LOCK_NOTGRANTED on timeout
+ * DB_LOCK_NOTGRANTED if it is busy.
* Possibly DB_RUNRECOVERY if DB_ENV_FAILCHK or panic.
*
* This will work for DB_MUTEX_SHARED, though it always tries
@@ -324,9 +376,9 @@ __db_tas_mutex_readlock_int(env, mutex, nowait)
DB_MUTEXMGR *mtxmgr;
DB_MUTEXREGION *mtxregion;
DB_THREAD_INFO *ip;
- int lock;
+ MUTEX_STATE *state;
+ int lock, ret;
u_int32_t nspins;
- int ret;
#ifndef HAVE_MUTEX_HYBRID
u_long micros, max_micros;
#endif
@@ -342,14 +394,17 @@ __db_tas_mutex_readlock_int(env, mutex, nowait)
CHECK_MTX_THREAD(env, mutexp);
DB_ASSERT(env, F_ISSET(mutexp, DB_MUTEX_SHARED));
-#ifdef HAVE_STATISTICS
if (F_ISSET(mutexp, DB_MUTEX_LOCKED))
STAT_INC(env,
mutex, set_rd_wait, mutexp->mutex_set_rd_wait, mutex);
else
STAT_INC(env,
mutex, set_rd_nowait, mutexp->mutex_set_rd_nowait, mutex);
-#endif
+
+ state = NULL;
+ if (env->thr_hashtab != NULL && (ret = __mutex_record_lock(env,
+ mutex, MUTEX_ACTION_INTEND_SHARE, &state)) != 0)
+ return (ret);
#ifndef HAVE_MUTEX_HYBRID
/*
@@ -375,25 +430,52 @@ loop: /* Attempt to acquire the resource for N spins. */
MUTEX_PAUSE
continue;
}
+#ifdef HAVE_FAILCHK_BROADCAST
+ if (F_ISSET(mutexp, DB_MUTEX_OWNER_DEAD) &&
+ !F_ISSET(dbenv, DB_ENV_FAILCHK)) {
+ (void)atomic_compare_exchange(env,
+ &mutexp->sharecount, lock, lock - 1);
+ if (state != NULL)
+ state->action = MUTEX_ACTION_UNLOCKED;
+ return (__mutex_died(env, mutex));
+ }
+#endif
MEMBAR_ENTER();
+#ifdef MUTEX_DIAG
+ __os_gettime(env, &mutexp->mutex_history.when, 0);
+ __os_stack_text(env, mutexp->mutex_history.stacktext,
+ sizeof(mutexp->mutex_history.stacktext), 12, 3);
+#endif
/* For shared latches the threadid is the last requestor's id.
*/
dbenv->thread_id(dbenv, &mutexp->pid, &mutexp->tid);
+ if (state != NULL)
+ state->action = MUTEX_ACTION_SHARED;
return (0);
}
- /*
- * Waiting for the latched must be avoided when it could allow a
- * 'failchk'ing thread to hang.
- */
+ /* Waiting for the latch must be avoided if it could hang up failchk. */
if (F_ISSET(dbenv, DB_ENV_FAILCHK) &&
dbenv->is_alive(dbenv, mutexp->pid, mutexp->tid, 0) == 0) {
ret = __env_set_state(env, &ip, THREAD_VERIFY);
- if (ret != 0 || ip->dbth_state == THREAD_FAILCHK)
- return (DB_RUNRECOVERY);
+ if (ret != 0 || ip->dbth_state == THREAD_FAILCHK) {
+ if (state != NULL)
+ state->action = MUTEX_ACTION_UNLOCKED;
+ if (ret == 0)
+ return (USR_ERR(env, DB_RUNRECOVERY));
+ else
+ return (__env_panic(env, USR_ERR(env, ret)));
+ }
}
+#ifdef HAVE_FAILCHK_BROADCAST
+ if (F_ISSET(mutexp, DB_MUTEX_OWNER_DEAD)) {
+ if (state != NULL)
+ state->action = MUTEX_ACTION_UNLOCKED;
+ return (__mutex_died(env, mutex));
+ }
+#endif
/*
* It is possible to spin out when the latch is just shared, due to
@@ -403,6 +485,8 @@ loop: /* Attempt to acquire the resource for N spins. */
if (nowait) {
if (atomic_read(&mutexp->sharecount) != MUTEX_SHARE_ISEXCLUSIVE)
goto loop;
+ if (state != NULL)
+ state->action = MUTEX_ACTION_UNLOCKED;
return (DB_LOCK_NOTGRANTED);
}
@@ -419,8 +503,11 @@ loop: /* Attempt to acquire the resource for N spins. */
if (atomic_read(&mutexp->sharecount) != MUTEX_SHARE_ISEXCLUSIVE)
goto loop;
/* Wait until the mutex is no longer exclusively locked. */
- if ((ret = __db_hybrid_mutex_suspend(env, mutex, NULL, FALSE)) != 0)
+ if ((ret = __db_hybrid_mutex_suspend(env, mutex, NULL, FALSE)) != 0) {
+ if (state != NULL)
+ state->action = MUTEX_ACTION_UNLOCKED;
return (ret);
+ }
#else
PERFMON4(env, mutex, suspend, mutex, FALSE, mutexp->alloc_id, mutexp);
__os_yield(env, 0, micros);
@@ -486,17 +573,13 @@ __db_tas_mutex_tryreadlock(env, mutex)
*/
int
__db_tas_mutex_unlock(env, mutex)
- ENV *env;
+ ENV *env;
db_mutex_t mutex;
{
DB_ENV *dbenv;
DB_MUTEX *mutexp;
-#ifdef HAVE_MUTEX_HYBRID
int ret;
-#ifdef MUTEX_DIAG
- int waiters;
-#endif
-#endif
+ char description[DB_MUTEX_DESCRIBE_STRLEN];
#ifdef HAVE_SHARED_LATCHES
int sharecount;
#endif
@@ -506,14 +589,14 @@ __db_tas_mutex_unlock(env, mutex)
return (0);
mutexp = MUTEXP_SET(env, mutex);
-#if defined(HAVE_MUTEX_HYBRID) && defined(MUTEX_DIAG)
- waiters = mutexp->wait;
-#endif
#if defined(DIAGNOSTIC)
#if defined(HAVE_SHARED_LATCHES)
if (F_ISSET(mutexp, DB_MUTEX_SHARED)) {
if (atomic_read(&mutexp->sharecount) == 0) {
+ if (PANIC_ISSET(env))
+ return (__env_panic(env,
+ USR_ERR(env, DB_RUNRECOVERY)));
__db_errx(env, DB_STR_A("2031",
"shared unlock %ld already unlocked", "%ld"),
(long)mutex);
@@ -522,16 +605,39 @@ __db_tas_mutex_unlock(env, mutex)
} else
#endif
if (!F_ISSET(mutexp, DB_MUTEX_LOCKED)) {
+ if (PANIC_ISSET(env))
+ return (__env_panic(env,
+ USR_ERR(env, DB_RUNRECOVERY)));
__db_errx(env, DB_STR_A("2032",
"unlock %ld already unlocked", "%ld"), (long)mutex);
return (__env_panic(env, EACCES));
}
#endif
+#ifdef MUTEX_DIAG
+ timespecclear(&mutexp->mutex_history.when);
+#endif
#ifdef HAVE_SHARED_LATCHES
if (F_ISSET(mutexp, DB_MUTEX_SHARED)) {
sharecount = atomic_read(&mutexp->sharecount);
- /*MUTEX_MEMBAR(mutexp->sharecount);*/ /* XXX why? */
+ /*
+ * Many code paths contain sequences of the form
+ * MUTEX_LOCK(); ret = function(); MUTEX_UNLOCK();
+ * If function() sees or causes a panic while it had temporarily
+ * unlocked the mutex it won't be locked anymore. Don't confuse
+ * the error by generating spurious follow-on messages.
+ */
+ if (sharecount == 0) {
+was_not_locked:
+ if (!PANIC_ISSET(env)) {
+ __db_errx(env, DB_STR_A("2070",
+ "Shared unlock %s: already unlocked", "%s"),
+ __mutex_describe(env, mutex, description));
+ return (__env_panic(env,
+ USR_ERR(env, DB_RUNRECOVERY)));
+ }
+ return (__env_panic(env, EACCES));
+ }
if (sharecount == MUTEX_SHARE_ISEXCLUSIVE) {
F_CLR(mutexp, DB_MUTEX_LOCKED);
/* Flush flag update before zeroing count */
@@ -542,12 +648,17 @@ __db_tas_mutex_unlock(env, mutex)
MEMBAR_EXIT();
sharecount = atomic_dec(env, &mutexp->sharecount);
DB_ASSERT(env, sharecount >= 0);
+ if (env->thr_hashtab != NULL &&
+ (ret = __mutex_record_unlock(env, mutex)) != 0)
+ return (ret);
if (sharecount > 0)
return (0);
}
} else
#endif
{
+ if (!F_ISSET(mutexp, DB_MUTEX_LOCKED))
+ goto was_not_locked;
F_CLR(mutexp, DB_MUTEX_LOCKED);
MUTEX_UNSET(&mutexp->tas);
}
@@ -559,17 +670,10 @@ __db_tas_mutex_unlock(env, mutex)
#endif
/* Prevent the load of wait from being hoisted before MUTEX_UNSET */
- MUTEX_MEMBAR(mutexp->flags);
+ (void)MUTEX_MEMBAR(mutexp->flags);
if (mutexp->wait &&
(ret = __db_pthread_mutex_unlock(env, mutex)) != 0)
return (ret);
-
-#ifdef MUTEX_DIAG
- if (mutexp->wait)
- printf("tas_unlock %ld %x waiters! busy %x waiters %d/%d\n",
- mutex, pthread_self(),
- MUTEXP_BUSY_FIELD(mutexp), waiters, mutexp->wait);
-#endif
#endif
return (0);
diff --git a/src/mutex/mut_win32.c b/src/mutex/mut_win32.c
index 07d5a8dd..270e03fb 100644
--- a/src/mutex/mut_win32.c
+++ b/src/mutex/mut_win32.c
@@ -1,7 +1,7 @@
/*
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2002, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2002, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -92,6 +92,9 @@ __db_win32_mutex_lock_int(env, mutex, timeout, wait)
db_timespec now, tempspec, timeoutspec;
db_timeout_t time_left;
int ret;
+#ifdef DIAGNOSTIC
+ char buf[DB_THREADID_STRLEN];
+#endif
#ifdef MUTEX_DIAG
LARGE_INTEGER now;
#endif
@@ -143,8 +146,10 @@ loop: /* Attempt to acquire the mutex mutex_tas_spins times, if waiting. */
mutexp->pid, mutexp->tid, 0) == 0) {
ret = __env_set_state(env, &ip, THREAD_VERIFY);
if (ret != 0 ||
- ip->dbth_state == THREAD_FAILCHK)
- return (DB_RUNRECOVERY);
+ ip->dbth_state == THREAD_FAILCHK) {
+ ret = DB_RUNRECOVERY;
+ goto failed;
+ }
}
if (!wait)
return (DB_LOCK_NOTGRANTED);
@@ -155,15 +160,20 @@ loop: /* Attempt to acquire the mutex mutex_tas_spins times, if waiting. */
MUTEX_PAUSE
continue;
}
-
+#ifdef HAVE_FAILCHK_BROADCAST
+ if (F_ISSET(mutexp, DB_MUTEX_OWNER_DEAD)) {
+ MUTEX_UNSET(&mutexp->tas);
+ goto died;
+ }
+#endif
#ifdef DIAGNOSTIC
if (F_ISSET(mutexp, DB_MUTEX_LOCKED)) {
- char buf[DB_THREADID_STRLEN];
__db_errx(env, DB_STR_A("2003",
"Win32 lock failed: mutex already locked by %s",
"%s"), dbenv->thread_id_string(dbenv,
mutexp->pid, mutexp->tid, buf));
- return (__env_panic(env, EACCES));
+ ret = __env_panic(env, EACCES);
+ goto failed;
}
#endif
F_SET(mutexp, DB_MUTEX_LOCKED);
@@ -179,11 +189,12 @@ loop: /* Attempt to acquire the mutex mutex_tas_spins times, if waiting. */
CloseHandle(event);
InterlockedDecrement(&mutexp->nwaiters);
#ifdef MUTEX_DIAG
+ /* "ret" was set by WaitForSingleObject(). */
if (ret != WAIT_OBJECT_0) {
QueryPerformanceCounter(&diag_now);
printf(DB_STR_A("2004",
- "[%I64d]: Lost signal on mutex %p, "
- "id %d, ms %d\n", "%I64d %p %d %d"),
+ "[%lld]: Lost signal on mutex %p, "
+ "id %d, ms %d\n", "%lld %p %d %d"),
diag_now.QuadPart, mutexp, mutexp->id, ms);
}
#endif
@@ -210,11 +221,8 @@ loop: /* Attempt to acquire the mutex mutex_tas_spins times, if waiting. */
if (timeout != 0) {
timespecclear(&now);
if (__clock_expired(env, &now, &timeoutspec)) {
- if (event != NULL) {
- CloseHandle(event);
- InterlockedDecrement(&mutexp->nwaiters);
- }
- return (DB_TIMEOUT);
+ ret = DB_TIMEOUT;
+ goto failed;
}
/* Reduce the event wait if the timeout would happen first. */
tempspec = timeoutspec;
@@ -228,24 +236,41 @@ loop: /* Attempt to acquire the mutex mutex_tas_spins times, if waiting. */
#ifdef MUTEX_DIAG
QueryPerformanceCounter(&diag_now);
printf(DB_STR_A("2005",
- "[%I64d]: Waiting on mutex %p, id %d\n",
- "%I64d %p %d"), diag_now.QuadPart, mutexp, mutexp->id);
+ "[%lld]: Waiting on mutex %p, id %d\n",
+ "%lld %p %d"), diag_now.QuadPart, mutexp, mutexp->id);
#endif
InterlockedIncrement(&mutexp->nwaiters);
- if ((ret = get_handle(env, mutexp, &event)) != 0)
- goto err;
+ if ((ret = get_handle(env, mutexp, &event)) != 0) {
+ InterlockedDecrement(&mutexp->nwaiters);
+ goto syserr;
+ }
}
if ((ret = WaitForSingleObject(event, ms)) == WAIT_FAILED) {
ret = __os_get_syserr();
- goto err;
+ goto syserr;
}
if ((ms <<= 1) > MS_PER_SEC)
ms = MS_PER_SEC;
+#ifdef HAVE_FAILCHK_BROADCAST
+ if (F_ISSET(mutexp, DB_MUTEX_OWNER_DEAD) &&
+ !F_ISSET(dbenv, DB_ENV_FAILCHK)) {
+died:
+ ret = __mutex_died(env, mutex);
+ goto failed;
+ }
+#endif
PANIC_CHECK(env);
goto loop;
-err: __db_syserr(env, ret, DB_STR("2006", "Win32 lock failed"));
+failed:
+ if (event != NULL) {
+ CloseHandle(event);
+ InterlockedDecrement(&mutexp->nwaiters);
+ }
+ return (ret);
+
+syserr: __db_syserr(env, ret, DB_STR("2006", "Win32 lock failed"));
return (__env_panic(env, __os_posix_err(ret)));
}
@@ -266,6 +291,12 @@ __db_win32_mutex_init(env, mutex, flags)
mutexp = MUTEXP_SET(env, mutex);
mutexp->id = ((getpid() & 0xffff) << 16) ^ P_TO_UINT32(mutexp);
F_SET(mutexp, flags);
+ /*
+ * See WINCE_ATOMIC_MAGIC definition for details.
+ * Use sharecount, because the value just needs to be a db_atomic_t
+ * memory mapped onto the same page as those being Interlocked*.
+ */
+ WINCE_ATOMIC_MAGIC(&mutexp->sharecount);
return (0);
}
@@ -315,9 +346,11 @@ __db_win32_mutex_readlock_int(env, mutex, nowait)
DB_MUTEXMGR *mtxmgr;
DB_MUTEXREGION *mtxregion;
HANDLE event;
+ MUTEX_STATE *state;
u_int32_t nspins;
- int ms, ret;
- long exch_ret, mtx_val;
+ int max_ms, ms, ret;
+ long mtx_val;
+
#ifdef MUTEX_DIAG
LARGE_INTEGER diag_now;
#endif
@@ -342,11 +375,23 @@ __db_win32_mutex_readlock_int(env, mutex, nowait)
event = NULL;
ms = 50;
ret = 0;
+
+ state = NULL;
+ if (env->thr_hashtab != NULL && (ret = __mutex_record_lock(env,
+ mutex, MUTEX_ACTION_INTEND_SHARE, &state)) != 0)
+ return (ret);
+#ifdef HAVE_FAILCHK_BROADCAST
/*
- * This needs to be initialized, since if mutexp->tas
- * is write locked on the first pass, it needs a value.
+ * Limit WaitForSingleObject() sleeps to at most the failchk timeout,
+ * and at least 1 millisecond. When failchk broadcasting is not
+ * supported check at least every second.
*/
- exch_ret = 0;
+ if (dbenv->mutex_failchk_timeout != 0 &&
+ (max_ms = (dbenv->mutex_failchk_timeout / US_PER_MS)) == 0)
+ max_ms = 1;
+ else
+#endif
+ max_ms = MS_PER_SEC;
loop: /* Attempt to acquire the resource for N spins. */
for (nspins =
@@ -357,9 +402,10 @@ loop: /* Attempt to acquire the resource for N spins. */
*/
retry: mtx_val = atomic_read(&mutexp->sharecount);
if (mtx_val == MUTEX_SHARE_ISEXCLUSIVE) {
- if (nowait)
- return (DB_LOCK_NOTGRANTED);
-
+ if (nowait) {
+ ret = DB_LOCK_NOTGRANTED;
+ goto failed;
+ }
continue;
} else if (!atomic_compare_exchange(env, &mutexp->sharecount,
mtx_val, mtx_val + 1)) {
@@ -370,6 +416,15 @@ retry: mtx_val = atomic_read(&mutexp->sharecount);
MUTEX_PAUSE
goto retry;
}
+#ifdef HAVE_FAILCHK_BROADCAST
+ if (F_ISSET(mutexp, DB_MUTEX_OWNER_DEAD) &&
+ !F_ISSET(dbenv, DB_ENV_FAILCHK)) {
+ InterlockedDecrement(
+ (interlocked_val)&mutexp->sharecount);
+ ret = __mutex_died(env, mutex);
+ goto failed;
+ }
+#endif
#ifdef HAVE_STATISTICS
if (event == NULL)
@@ -384,12 +439,14 @@ retry: mtx_val = atomic_read(&mutexp->sharecount);
if (ret != WAIT_OBJECT_0) {
QueryPerformanceCounter(&diag_now);
printf(DB_STR_A("2007",
- "[%I64d]: Lost signal on mutex %p, "
- "id %d, ms %d\n", "%I64d %p %d %d"),
+ "[%lld]: Lost signal on mutex %p, "
+ "id %d, ms %d\n", "%lld %p %d %d"),
diag_now.QuadPart, mutexp, mutexp->id, ms);
}
#endif
}
+ if (state != NULL)
+ state->action = MUTEX_ACTION_SHARED;
#ifdef DIAGNOSTIC
/*
@@ -404,17 +461,17 @@ retry: mtx_val = atomic_read(&mutexp->sharecount);
}
/*
- * Yield the processor; wait 50 ms initially, up to 1 second. This
- * loop is needed to work around a race where the signal from the
- * unlocking thread gets lost. We start at 50 ms because it's unlikely
- * to happen often and we want to avoid wasting CPU.
+ * Yield the processor; wait 50 ms initially, up to 1 second or the
+ * failchk timeout. This loop works around a race where the signal from
+ * the unlocking thread gets lost. We start at 50 ms because it's
+ * unlikely to happen often and we want to avoid wasting CPU.
*/
if (event == NULL) {
#ifdef MUTEX_DIAG
QueryPerformanceCounter(&diag_now);
printf(DB_STR_A("2008",
- "[%I64d]: Waiting on mutex %p, id %d\n",
- "%I64d %p %d"), diag_now.QuadPart, mutexp, mutexp->id);
+ "[%lld]: Waiting on mutex %p, id %d\n",
+ "%lld %p %d"), diag_now.QuadPart, mutexp, mutexp->id);
#endif
InterlockedIncrement(&mutexp->nwaiters);
if ((ret = get_handle(env, mutexp, &event)) != 0)
@@ -424,12 +481,32 @@ retry: mtx_val = atomic_read(&mutexp->sharecount);
ret = __os_get_syserr();
goto err;
}
- if ((ms <<= 1) > MS_PER_SEC)
- ms = MS_PER_SEC;
+
+#ifdef HAVE_FAILCHK_BROADCAST
+ if (F_ISSET(mutexp, DB_MUTEX_OWNER_DEAD) &&
+ !F_ISSET(dbenv, DB_ENV_FAILCHK)) {
+ (void)atomic_compare_exchange(env,
+ &mutexp->sharecount, mtx_val, mtx_val - 1);
+ ret = __mutex_died(env, mutex);
+ goto failed;
+ }
+#endif
PANIC_CHECK(env);
+
+ if ((ms <<= 1) > max_ms)
+ ms = max_ms;
goto loop;
+failed:
+ if (event != NULL) {
+ CloseHandle(event);
+ InterlockedDecrement(&mutexp->nwaiters);
+ }
+ if (state != NULL)
+ state->action = MUTEX_ACTION_UNLOCKED;
+ return (ret);
+
err: __db_syserr(env, ret, DB_STR("2009",
"Win32 read lock failed"));
return (__env_panic(env, __os_posix_err(ret)));
@@ -482,7 +559,8 @@ __db_win32_mutex_unlock(env, mutex)
DB_ENV *dbenv;
DB_MUTEX *mutexp;
HANDLE event;
- int ret;
+ int ret, sharecount;
+ char description[DB_MUTEX_DESCRIBE_STRLEN];
#ifdef MUTEX_DIAG
LARGE_INTEGER diag_now;
#endif
@@ -510,6 +588,16 @@ __db_win32_mutex_unlock(env, mutex)
*/
#ifdef HAVE_SHARED_LATCHES
if (F_ISSET(mutexp, DB_MUTEX_SHARED)) {
+ sharecount = atomic_read(&mutexp->sharecount);
+ if (sharecount == 0) {
+ if (!PANIC_ISSET(env)) {
+ __db_errx(env, DB_STR_A("2071",
+ "Shared unlock %s: already unlocked", "%s"),
+ __mutex_describe(env, mutex, description));
+ return (DB_RUNRECOVERY);
+ }
+ return (__env_panic(env, EACCES));
+ }
if (F_ISSET(mutexp, DB_MUTEX_LOCKED)) {
F_CLR(mutexp, DB_MUTEX_LOCKED);
if ((ret = InterlockedExchange(
@@ -519,12 +607,26 @@ __db_win32_mutex_unlock(env, mutex)
ret = DB_RUNRECOVERY;
goto err;
}
- } else if (InterlockedDecrement(
- (interlocked_val)(&atomic_read(&mutexp->sharecount))) > 0)
- return (0);
+ } else {
+ if (env->thr_hashtab != NULL &&
+ (ret = __mutex_record_unlock(env, mutex)) != 0)
+ return (ret);
+ if (InterlockedDecrement((interlocked_val)
+ (&atomic_read(&mutexp->sharecount))) > 0)
+ return (0);
+ }
} else
#endif
{
+ if (!F_ISSET(mutexp, DB_MUTEX_LOCKED)) {
+ if (!PANIC_ISSET(env)) {
+ __db_errx(env, DB_STR_A("2072",
+ "Unlock %s: already unlocked", "%s"),
+ __mutex_describe(env, mutex, description));
+ return (DB_RUNRECOVERY);
+ }
+ return (__env_panic(env, EACCES));
+ }
F_CLR(mutexp, DB_MUTEX_LOCKED);
MUTEX_UNSET(&mutexp->tas);
}
@@ -536,8 +638,8 @@ __db_win32_mutex_unlock(env, mutex)
#ifdef MUTEX_DIAG
QueryPerformanceCounter(&diag_now);
printf(DB_STR_A("2011",
- "[%I64d]: Signalling mutex %p, id %d\n",
- "%I64d %p %d"), diag_now.QuadPart, mutexp, mutexp->id);
+ "[%lld]: Signalling mutex %p, id %d\n",
+ "%lld %p %d"), diag_now.QuadPart, mutexp, mutexp->id);
#endif
if (!PulseEvent(event)) {
ret = __os_get_syserr();
diff --git a/src/mutex/test_mutex.c b/src/mutex/test_mutex.c
index 24c18016..d6183bdb 100644
--- a/src/mutex/test_mutex.c
+++ b/src/mutex/test_mutex.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved.
*
* Standalone mutex tester for Berkeley DB mutexes.
*
@@ -13,7 +13,6 @@
#include "db_int.h"
#ifdef DB_WIN32
-#define MUTEX_THREAD_TEST 1
extern int getopt(int, char * const *, const char *);
@@ -33,29 +32,13 @@ typedef HANDLE os_thread_t;
#include <sys/wait.h>
typedef pid_t os_pid_t;
-
-/*
- * There's only one mutex implementation that can't support thread-level
- * locking: UNIX/fcntl mutexes.
- *
- * The general Berkeley DB library configuration doesn't look for the POSIX
- * pthread functions, with one exception -- pthread_yield.
- *
- * Use these two facts to decide if we're going to build with or without
- * threads.
- */
-#if !defined(HAVE_MUTEX_FCNTL) && defined(HAVE_PTHREAD_YIELD)
-#define MUTEX_THREAD_TEST 1
-
-#include <pthread.h>
-
typedef pthread_t os_thread_t;
#define os_thread_create(thrp, attr, func, arg) \
pthread_create((thrp), (attr), (func), (arg))
#define os_thread_join(thr, statusp) pthread_join((thr), (statusp))
#define os_thread_self() pthread_self()
-#endif /* HAVE_PTHREAD_YIELD */
+
#endif /* !DB_WIN32 */
#define OS_BAD_PID ((os_pid_t)-1)
@@ -76,28 +59,25 @@ typedef struct {
u_int wakeme; /* Request to awake. */
} TM;
-DB_ENV *dbenv; /* Backing environment */
+DB_ENV *dbenv; /* Backing environment. */
ENV *env;
size_t len; /* Backing data chunk size. */
+u_int alignment = 0; /* Specify mutex alignment. */
+
u_int8_t *gm_addr; /* Global mutex */
u_int8_t *lm_addr; /* Locker mutexes */
u_int8_t *tm_addr; /* Thread mutexes */
-#ifdef MUTEX_THREAD_TEST
os_thread_t *kidsp; /* Locker threads */
os_thread_t wakep; /* Wakeup thread */
-#endif
#ifndef HAVE_MMAP
u_int nprocs = 1; /* -p: Processes. */
u_int nthreads = 20; /* -t: Threads. */
-#elif MUTEX_THREAD_TEST
+#else
u_int nprocs = 5; /* -p: Processes. */
u_int nthreads = 4; /* -t: Threads. */
-#else
-u_int nprocs = 20; /* -p: Processes. */
-u_int nthreads = 1; /* -t: Threads. */
#endif
u_int maxlocks = 20; /* -l: Backing locks. */
@@ -147,8 +127,11 @@ main(argc, argv)
rtype = PARENT;
id = 0;
tmpath = argv[0];
- while ((ch = getopt(argc, argv, "l:n:p:T:t:v")) != EOF)
+ while ((ch = getopt(argc, argv, "a:l:n:p:T:t:v")) != EOF)
switch (ch) {
+ case 'a':
+ alignment = (u_int)atoi(optarg);
+ break;
case 'l':
maxlocks = (u_int)atoi(optarg);
break;
@@ -161,14 +144,6 @@ main(argc, argv)
case 't':
if ((nthreads = (u_int)atoi(optarg)) == 0)
nthreads = 1;
-#if !defined(MUTEX_THREAD_TEST)
- if (nthreads != 1) {
- fprintf(stderr,
- "%s: thread support not available or not compiled for this platform.\n",
- progname);
- return (EXIT_FAILURE);
- }
-#endif
break;
case 'T':
if (!memcmp(optarg, "locker", sizeof("locker") - 1))
@@ -242,7 +217,11 @@ main(argc, argv)
*
* Clean up from any previous runs.
*/
+#ifdef DB_WIN32
+ snprintf(cmd, sizeof(cmd), "rmdir /S /Q %s", TESTDIR);
+#else
snprintf(cmd, sizeof(cmd), "rm -rf %s", TESTDIR);
+#endif
(void)system(cmd);
snprintf(cmd, sizeof(cmd), "mkdir %s", TESTDIR);
(void)system(cmd);
@@ -292,8 +271,8 @@ main(argc, argv)
/* Wait for all lockers to exit. */
if ((err = os_wait(pids, nprocs)) != 0) {
- fprintf(stderr, "%s: locker wait failed with %d\n",
- progname, err);
+ fprintf(stderr, "%s: locker wait failed with %s\n",
+ progname, db_strerror(err));
goto fail;
}
@@ -357,7 +336,6 @@ int
locker_start(id)
u_long id;
{
-#if defined(MUTEX_THREAD_TEST)
u_int i;
int err;
@@ -378,17 +356,13 @@ locker_start(id)
return (1);
}
return (0);
-#else
- return (run_lthread((void *)id) == NULL ? 0 : 1);
-#endif
}
int
locker_wait()
{
-#if defined(MUTEX_THREAD_TEST)
u_int i;
- void *retp;
+ void *retp = NULL;
/* Wait for the threads to exit. */
for (i = 0; i < nthreads; i++) {
@@ -400,7 +374,6 @@ locker_wait()
}
}
free(kidsp);
-#endif
return (0);
}
@@ -414,11 +387,7 @@ run_lthread(arg)
int err, i;
id = (u_long)arg;
-#if defined(MUTEX_THREAD_TEST)
tid = (u_long)os_thread_self();
-#else
- tid = 0;
-#endif
printf("Locker: ID %03lu (PID: %lu; TID: %lx)\n",
id, (u_long)getpid(), tid);
@@ -534,7 +503,6 @@ int
wakeup_start(id)
u_long id;
{
-#if defined(MUTEX_THREAD_TEST)
int err;
/*
@@ -547,16 +515,12 @@ wakeup_start(id)
return (1);
}
return (0);
-#else
- return (run_wthread((void *)id) == NULL ? 0 : 1);
-#endif
}
int
wakeup_wait()
{
-#if defined(MUTEX_THREAD_TEST)
- void *retp;
+ void *retp = NULL;
/*
* A file is created when the wakeup thread is no longer needed.
@@ -567,7 +531,6 @@ wakeup_wait()
"%s: wakeup thread exited with error\n", progname);
return (1);
}
-#endif
return (0);
}
@@ -586,11 +549,7 @@ run_wthread(arg)
id = (u_long)arg;
quitcheck = 0;
-#if defined(MUTEX_THREAD_TEST)
tid = (u_long)os_thread_self();
-#else
- tid = 0;
-#endif
printf("Wakeup: ID %03lu (PID: %lu; TID: %lx)\n",
id, (u_long)getpid(), tid);
@@ -683,6 +642,12 @@ tm_env_init()
home = TESTDIR;
if (nthreads != 1)
flags |= DB_THREAD;
+ if (alignment != 0 &&
+ (ret = dbenv->mutex_set_align(dbenv, alignment)) != 0) {
+ dbenv->err(dbenv, ret, "set_align(%d): %s", alignment, home);
+ return (1);
+ }
+
if ((ret = dbenv->open(dbenv, home, flags, 0)) != 0) {
dbenv->err(dbenv, ret, "environment open: %s", home);
return (1);
@@ -748,8 +713,10 @@ tm_mutex_init()
if (verbose)
printf("\n");
- if (verbose)
+ if (verbose) {
+ (void)dbenv->mutex_stat_print(dbenv, DB_STAT_ALL);
printf("Allocate %d per-lock mutexes: ", maxlocks);
+ }
for (i = 0; i < maxlocks; ++i) {
mp = (TM *)(lm_addr + i * sizeof(TM));
if ((err = dbenv->mutex_alloc(dbenv, 0, &mp->mutex)) != 0) {
@@ -930,7 +897,7 @@ int
usage()
{
fprintf(stderr, "usage: %s %s\n\t%s\n", progname,
- "[-v] [-l maxlocks]",
+ "[-a alignment] [-v] [-l maxlocks]",
"[-n locks] [-p procs] [-T locker=ID|wakeup=ID] [-t threads]");
return (EXIT_FAILURE);
}
diff --git a/src/mutex/uts4_cc.s b/src/mutex/uts4_cc.s
index 4f59e9c8..76eeed6c 100644
--- a/src/mutex/uts4_cc.s
+++ b/src/mutex/uts4_cc.s
@@ -1,6 +1,6 @@
/ See the file LICENSE for redistribution information.
/
- / Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ / Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
/
/ $Id$
/
diff --git a/src/os/os_abort.c b/src/os/os_abort.c
index 68b4bc05..72ac6751 100644
--- a/src/os/os_abort.c
+++ b/src/os/os_abort.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -13,11 +13,11 @@
/*
* __os_abort --
*
- * PUBLIC: void __os_abort __P((ENV *));
+ * PUBLIC: void __os_abort __P((const ENV *));
*/
void
__os_abort(env)
- ENV *env;
+ const ENV *env;
{
__os_stack(env); /* Try and get a stack trace. */
diff --git a/src/os/os_abs.c b/src/os/os_abs.c
index 4a1a5abd..a241c653 100644
--- a/src/os/os_abs.c
+++ b/src/os/os_abs.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/os/os_addrinfo.c b/src/os/os_addrinfo.c
index 205f41ec..aec30386 100644
--- a/src/os/os_addrinfo.c
+++ b/src/os/os_addrinfo.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2006, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2006, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/os/os_alloc.c b/src/os/os_alloc.c
index fb7bf109..478924df 100644
--- a/src/os/os_alloc.c
+++ b/src/os/os_alloc.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -11,7 +11,7 @@
#include "db_int.h"
#ifdef DIAGNOSTIC
-static void __os_guard __P((ENV *));
+static void __os_guard __P((const ENV *));
typedef union {
size_t size;
@@ -204,11 +204,11 @@ __os_strdup(env, str, storep)
* __os_calloc --
* The calloc(3) function for DB.
*
- * PUBLIC: int __os_calloc __P((ENV *, size_t, size_t, void *));
+ * PUBLIC: int __os_calloc __P((const ENV *, size_t, size_t, void *));
*/
int
__os_calloc(env, num, size, storep)
- ENV *env;
+ const ENV *env;
size_t num, size;
void *storep;
{
@@ -227,11 +227,11 @@ __os_calloc(env, num, size, storep)
* __os_malloc --
* The malloc(3) function for DB.
*
- * PUBLIC: int __os_malloc __P((ENV *, size_t, void *));
+ * PUBLIC: int __os_malloc __P((const ENV *, size_t, void *));
*/
int
__os_malloc(env, size, storep)
- ENV *env;
+ const ENV *env;
size_t size;
void *storep;
{
@@ -261,9 +261,11 @@ __os_malloc(env, size, storep)
* Windows/NT in an MT environment.
*/
if ((ret = __os_get_errno_ret_zero()) == 0) {
- ret = ENOMEM;
+ ret = USR_ERR(env, ENOMEM);
__os_set_errno(ENOMEM);
}
+ else
+ (void)USR_ERR(env, ret);
__db_err(env, ret, DB_STR_A("0147", "malloc: %lu", "%lu"),
(u_long)size);
return (ret);
@@ -292,11 +294,11 @@ __os_malloc(env, size, storep)
* __os_realloc --
* The realloc(3) function for DB.
*
- * PUBLIC: int __os_realloc __P((ENV *, size_t, void *));
+ * PUBLIC: int __os_realloc __P((const ENV *, size_t, void *));
*/
int
__os_realloc(env, size, storep)
- ENV *env;
+ const ENV *env;
size_t size;
void *storep;
{
@@ -345,7 +347,7 @@ __os_realloc(env, size, storep)
* Windows/NT in an MT environment.
*/
if ((ret = __os_get_errno_ret_zero()) == 0) {
- ret = ENOMEM;
+ ret = USR_ERR(env, ENOMEM);
__os_set_errno(ENOMEM);
}
__db_err(env, ret, DB_STR_A("0148", "realloc: %lu", "%lu"),
@@ -368,11 +370,11 @@ __os_realloc(env, size, storep)
* __os_free --
* The free(3) function for DB.
*
- * PUBLIC: void __os_free __P((ENV *, void *));
+ * PUBLIC: void __os_free __P((const ENV *, void *));
*/
void
__os_free(env, ptr)
- ENV *env;
+ const ENV *env;
void *ptr;
{
#ifdef DIAGNOSTIC
@@ -416,7 +418,7 @@ __os_free(env, ptr)
*/
static void
__os_guard(env)
- ENV *env;
+ const ENV *env;
{
__db_errx(env, DB_STR("0149",
"Guard byte incorrect during free"));
diff --git a/src/os/os_clock.c b/src/os/os_clock.c
index 25eeb704..78f1c8df 100644
--- a/src/os/os_clock.c
+++ b/src/os/os_clock.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -14,11 +14,15 @@
* __os_gettime --
* Return the current time-of-day clock in seconds and nanoseconds.
*
- * PUBLIC: void __os_gettime __P((ENV *, db_timespec *, int));
+ * If you want the time of day, pass 0 in the monotonic argument. If you pass
+ * non-zero, you might get time-of-day or you might get a non-decreasing number
+ * which is unrelated to the time of day, such as the seconds since system boot.
+ *
+ * PUBLIC: void __os_gettime __P((const ENV *, db_timespec *, int));
*/
void
__os_gettime(env, tp, monotonic)
- ENV *env;
+ const ENV *env;
db_timespec *tp;
int monotonic;
{
@@ -35,7 +39,6 @@ __os_gettime(env, tp, monotonic)
RETRY_CHK((clock_gettime(
CLOCK_REALTIME, (struct timespec *)tp)), ret);
- RETRY_CHK((clock_gettime(CLOCK_REALTIME, (struct timespec *)tp)), ret);
if (ret != 0) {
sc = "clock_gettime";
goto err;
@@ -69,5 +72,5 @@ __os_gettime(env, tp, monotonic)
return;
err: __db_syserr(env, ret, "%s", sc);
- (void)__env_panic(env, __os_posix_err(ret));
+ (void)__env_panic((ENV *) env, __os_posix_err(ret));
}
diff --git a/src/os/os_config.c b/src/os/os_config.c
index c455a349..3fe2f045 100644
--- a/src/os/os_config.c
+++ b/src/os/os_config.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1998, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/os/os_cpu.c b/src/os/os_cpu.c
index 6b7f9f1e..53cadecb 100644
--- a/src/os/os_cpu.c
+++ b/src/os/os_cpu.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/os/os_ctime.c b/src/os/os_ctime.c
index 3f656c32..82925cc1 100644
--- a/src/os/os_ctime.c
+++ b/src/os/os_ctime.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -32,10 +32,7 @@ __os_ctime(tod, time_buf)
* int.
*/
#if defined(HAVE_VXWORKS)
- {
- size_t buflen = CTIME_BUFLEN;
- (void)ctime_r(tod, time_buf, &buflen);
- }
+ (void)ctime_r(tod, time_buf);
#elif defined(HAVE_CTIME_R_3ARG)
(void)ctime_r(tod, time_buf, CTIME_BUFLEN);
#elif defined(HAVE_CTIME_R)
diff --git a/src/os/os_dir.c b/src/os/os_dir.c
index 42bad194..7bd91bff 100644
--- a/src/os/os_dir.c
+++ b/src/os/os_dir.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/os/os_errno.c b/src/os/os_errno.c
index a8219f90..9bc15513 100644
--- a/src/os/os_errno.c
+++ b/src/os/os_errno.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/os/os_fid.c b/src/os/os_fid.c
index f2d80e25..43c61202 100644
--- a/src/os/os_fid.c
+++ b/src/os/os_fid.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/os/os_flock.c b/src/os/os_flock.c
index 904d5efe..8f58f244 100644
--- a/src/os/os_flock.c
+++ b/src/os/os_flock.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/os/os_fsync.c b/src/os/os_fsync.c
index 4b757b2c..377d7ff3 100644
--- a/src/os/os_fsync.c
+++ b/src/os/os_fsync.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/os/os_getenv.c b/src/os/os_getenv.c
index 05972112..b7c4e990 100644
--- a/src/os/os_getenv.c
+++ b/src/os/os_getenv.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/os/os_handle.c b/src/os/os_handle.c
index 8ae9dc7f..7dbe31e1 100644
--- a/src/os/os_handle.c
+++ b/src/os/os_handle.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1998, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -90,7 +90,7 @@ __os_openhandle(env, name, flags, mode, fhpp)
* return EEXISTS.
*/
DB_END_SINGLE_THREAD;
- ret = EEXIST;
+ ret = USR_ERR(env, EEXIST);
goto err;
}
/*
@@ -127,7 +127,10 @@ __os_openhandle(env, name, flags, mode, fhpp)
break;
}
- switch (ret = __os_posix_err(__os_get_syserr())) {
+ ret = __os_posix_err(__os_get_syserr());
+ if (ret != ENOENT)
+ (void)USR_ERR(env, ret);
+ switch (ret) {
case EMFILE:
case ENFILE:
case ENOSPC:
@@ -160,9 +163,8 @@ __os_openhandle(env, name, flags, mode, fhpp)
/* Deny file descriptor access to any child process. */
if ((fcntl_flags = fcntl(fhp->fd, F_GETFD)) == -1 ||
fcntl(fhp->fd, F_SETFD, fcntl_flags | FD_CLOEXEC) == -1) {
- ret = __os_get_syserr();
- __db_syserr(env, ret, DB_STR("0162",
- "fcntl(F_SETFD)"));
+ ret = USR_ERR(env, __os_get_syserr());
+ __db_syserr(env, ret, DB_STR("0162", "fcntl(F_SETFD)"));
ret = __os_posix_err(ret);
goto err;
}
@@ -226,6 +228,7 @@ __os_closehandle(env, fhp)
else
RETRY_CHK((close(fhp->fd)), ret);
if (ret != 0) {
+ ret = USR_ERR(env, ret);
__db_syserr(env, ret, DB_STR("0164", "close"));
ret = __os_posix_err(ret);
}
diff --git a/src/os/os_map.c b/src/os/os_map.c
index 0528f473..b17bf107 100644
--- a/src/os/os_map.c
+++ b/src/os/os_map.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -213,6 +213,15 @@ __os_attach(env, infop, rp)
if (rp->max < rp->size)
rp->max = rp->size;
if (ret == 0 && F_ISSET(infop, REGION_CREATE)) {
+#ifdef HAVE_MLOCK
+ /*
+ * When locking the region in memory extend it fully so that it
+ * can all be mlock()'d now, and not later when paging could
+ * interfere with the application. [#21379]
+ */
+ if (F_ISSET(env, ENV_LOCKDOWN))
+ rp->size = rp->max;
+#endif
if (F_ISSET(dbenv, DB_ENV_REGION_INIT))
ret = __db_file_write(env, infop->fhp,
rp->size / MEGABYTE, rp->size % MEGABYTE, 0x00);
@@ -255,7 +264,7 @@ __os_detach(env, infop, destroy)
{
DB_ENV *dbenv;
REGION *rp;
- int ret;
+ int ret, t_ret;
/*
* We pass a DB_ENV handle to the user's replacement unmap function,
@@ -263,8 +272,16 @@ __os_detach(env, infop, destroy)
*/
DB_ASSERT(env, env != NULL && env->dbenv != NULL);
dbenv = env->dbenv;
+ ret = 0;
+ /*
+ * Don't use a region which is no longer valid, e.g., after the
+ * env has been removed.
+ */
rp = infop->rp;
+ if ((rp->id != 0 && rp->id != infop->id) ||
+ rp->type <= INVALID_REGION_TYPE || rp->type > REGION_TYPE_MAX)
+ return (EINVAL);
/* If the user replaced the unmap call, call through their interface. */
if (DB_GLOBAL(j_region_unmap) != NULL)
@@ -314,16 +331,26 @@ __os_detach(env, infop, destroy)
return (ret);
}
+ if (F_ISSET(env, ENV_FORCESYNCENV))
+ if (msync(infop->addr, rp->max, MS_INVALIDATE | MS_SYNC) != 0) {
+ t_ret = __os_get_syserr();
+ __db_syserr(env, t_ret, DB_STR("0248",
+ "msync failed on closing environment"));
+ if (ret == 0)
+ ret = t_ret;
+ }
+
if (munmap(infop->addr, rp->max) != 0) {
- ret = __os_get_syserr();
- __db_syserr(env, ret, DB_STR("0123", "munmap"));
- return (__os_posix_err(ret));
+ t_ret = __os_get_syserr();
+ __db_syserr(env, t_ret, DB_STR("0123", "munmap"));
+ if (ret == 0)
+ ret = t_ret;
}
- if (destroy && (ret = __os_unlink(env, infop->name, 1)) != 0)
- return (ret);
+ if (destroy && (t_ret = __os_unlink(env, infop->name, 1)) != 0 && ret == 0)
+ ret = t_ret;
- return (0);
+ return (ret);
#else
COMPQUIET(destroy, 0);
COMPQUIET(ret, 0);
diff --git a/src/os/os_mkdir.c b/src/os/os_mkdir.c
index 800d445c..b3034e30 100644
--- a/src/os/os_mkdir.c
+++ b/src/os/os_mkdir.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/os/os_open.c b/src/os/os_open.c
index 5090c8e1..0c58848e 100644
--- a/src/os/os_open.c
+++ b/src/os/os_open.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/os/os_path.c b/src/os/os_path.c
index 478fdf45..b712b31a 100644
--- a/src/os/os_path.c
+++ b/src/os/os_path.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/os/os_pid.c b/src/os/os_pid.c
index b1b94d60..9efe4633 100644
--- a/src/os/os_pid.c
+++ b/src/os/os_pid.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -40,7 +40,7 @@ __os_id(dbenv, pidp, tidp)
*pidp = dbenv->env->pid_cache;
}
-/*
+/*
* When building on MinGW, we define both HAVE_PTHREAD_SELF and DB_WIN32,
* and we are using pthreads instead of Windows threads implementation.
* So here, we need to check the thread implementations before checking
diff --git a/src/os/os_rename.c b/src/os/os_rename.c
index 63aac7bb..1a3d7cbd 100644
--- a/src/os/os_rename.c
+++ b/src/os/os_rename.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/os/os_rmdir.c b/src/os/os_rmdir.c
new file mode 100644
index 00000000..ab3a1556
--- /dev/null
+++ b/src/os/os_rmdir.c
@@ -0,0 +1,38 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_rmdir --
+ * Remove a directory.
+ *
+ * PUBLIC: int __os_rmdir __P((ENV *, const char *));
+ */
+int
+__os_rmdir(env, name)
+ ENV *env; /* May be NULL; then no verbose message is printed. */
+ const char *name; /* Path handed unchanged to rmdir(). */
+{
+ DB_ENV *dbenv;
+ int ret;
+ /* Reach the DB_ENV only when the caller supplied an ENV. */
+ dbenv = env == NULL ? NULL : env->dbenv;
+ if (dbenv != NULL &&
+ FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL))
+ __db_msg(env, DB_STR_A("0239", "fileops: rmdir %s",
+ "%s"), name);
+ /* RETRY_CHK leaves the outcome of the rmdir() call in ret. */
+ RETRY_CHK((rmdir(name)), ret);
+ if (ret != 0)
+ return (__os_posix_err(ret)); /* Failure: return the mapped error. */
+ /* Success: ret is 0 here. */
+ return (ret);
+}
diff --git a/src/os/os_root.c b/src/os/os_root.c
index 77e7a72c..6634a4a2 100644
--- a/src/os/os_root.c
+++ b/src/os/os_root.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/os/os_rpath.c b/src/os/os_rpath.c
index 16f3e54c..48c59b3d 100644
--- a/src/os/os_rpath.c
+++ b/src/os/os_rpath.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/os/os_rw.c b/src/os/os_rw.c
index c0967514..cc665ee4 100644
--- a/src/os/os_rw.c
+++ b/src/os/os_rw.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/os/os_seek.c b/src/os/os_seek.c
index 4676d33a..95408f3d 100644
--- a/src/os/os_seek.c
+++ b/src/os/os_seek.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/os/os_stack.c b/src/os/os_stack.c
index 037080f3..9844930f 100644
--- a/src/os/os_stack.c
+++ b/src/os/os_stack.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -14,32 +14,143 @@
#include <execinfo.h>
#endif
+#undef __DB_STACK_MAXFRAMES
+#define __DB_STACK_MAXFRAMES 25
+
/*
* __os_stack --
- * Output a stack trace to the message file handle.
+ * Output a stack trace in a single write to the error file handle.
*
- * PUBLIC: void __os_stack __P((ENV *));
+ * PUBLIC: void __os_stack __P((const ENV *));
*/
void
__os_stack(env)
- ENV *env;
+ const ENV *env; /* Only forwarded to __os_stack_top(). */
+{
+ /* Adjust both counts by 2 to exclude __os_stack() and __os_stack_top(). */
+ __os_stack_top(env, __DB_STACK_MAXFRAMES - 2, 2);
+}
+
+/*
+ * __os_stack_top --
+ * Output just a certain range of stack frames to the error file handle.
+ *
+ * PUBLIC: void __os_stack_top __P((const ENV *, unsigned, unsigned));
+ */
+void
+__os_stack_top(env, nframes, skipframes)
+ const ENV *env;
+ unsigned nframes; /* Frame count handed on to __os_stack_text(). */
+ unsigned skipframes; /* Leading frames to omit; one more is added below. */
{
#if defined(HAVE_BACKTRACE) && defined(HAVE_BACKTRACE_SYMBOLS)
- void *array[200];
- size_t i, size;
- char **strings;
+ char buf[__DB_STACK_MAXFRAMES * 80] = ""; /* 80 chars/line; "" if no frames. */
+ __os_stack_text(env, buf, sizeof(buf), nframes, skipframes + 1);
+ __db_errx(env, "Top of stack:\n%s", buf);
+#else
+ COMPQUIET(env, NULL);
+ COMPQUIET(nframes, 0);
+ COMPQUIET(skipframes, 0);
+#endif
+}
+
+/*
+ * __os_stack_text --
+ * 'Print' the current stack into a char text buffer.
+ *
+ * PUBLIC: void __os_stack_text
+ * PUBLIC: __P((const ENV *, char *, size_t, unsigned, unsigned));
+ */
+void
+__os_stack_text(env, result, bufsize, nframes, skip)
+ const ENV *env;
+ char *result; /* Caller-supplied buffer that receives the text. */
+ size_t bufsize; /* Size of result; __os_stack_top() passes sizeof(buf). */
+ unsigned nframes; /* Forwarded as the frame count. */
+ unsigned skip; /* Forwarded as the number of frames to skip. */
+{
+ DB_MSGBUF mb;
+ /* Point a DB_MSGBUF at the caller's buffer and flag it as preallocated. */
+ DB_MSGBUF_INIT(&mb);
+ mb.buf = mb.cur = result;
+ mb.len = bufsize;
+ F_SET(&mb, DB_MSGBUF_PREALLOCATED);
+ __os_stack_msgadd(env, &mb, nframes, skip, NULL); /* NULL: current stack. */
+}
+
+/*
+ * __os_stack_save --
+ * Save a certain range of stack frames into the frames argument.
+ *
+ * PUBLIC: int __os_stack_save __P((const ENV *, unsigned, void **));
+ */
+int
+__os_stack_save(env, nframes, frames)
+ const ENV *env; /* Unused; quieted below. */
+ unsigned nframes; /* Passed to backtrace() as the size of frames. */
+ void **frames; /* Array that backtrace() fills in. */
+{
+ COMPQUIET(env, NULL);
+#if defined(HAVE_BACKTRACE) && defined(HAVE_BACKTRACE_SYMBOLS)
/*
* Solaris and the GNU C library support this interface. Solaris
* has additional interfaces (printstack and walkcontext), I don't
* know if they offer any additional value or not.
*/
- size = backtrace(array, sizeof(array) / sizeof(array[0]));
- strings = backtrace_symbols(array, size);
+ return ((int) backtrace(frames, nframes)); /* backtrace()'s result as int. */
+#else
+ COMPQUIET(nframes, 0);
+ COMPQUIET(frames, NULL);
+ return (0); /* No backtrace support: nothing was saved. */
+#endif
+}
+
+/*
+ * __os_stack_msgadd --
+ * Decode a stack and add it to a DB_MSGBUF. The stack is either a
+ * previously obtained stack, e.g., from __os_stack_save(), or, if it is
+ * null, the current stack, which is fetched here.
+ *
+ * PUBLIC: void __os_stack_msgadd
+ * PUBLIC: __P((const ENV *, DB_MSGBUF *, unsigned, unsigned, void **));
+ */
+void
+__os_stack_msgadd(env, mb, totalframes, skipframes, stack)
+ const ENV *env;
+ DB_MSGBUF *mb; /* Receives one "\t<symbol>\n" entry per frame. */
+ unsigned totalframes; /* Entries in stack, or how many to capture. */
+ unsigned skipframes; /* Leading entries that are not printed. */
+ void **stack; /* Saved frames, or NULL to capture them here. */
+{
+#if defined(HAVE_BACKTRACE) && defined(HAVE_BACKTRACE_SYMBOLS)
+ char **strings;
+ void *local_frames[__DB_STACK_MAXFRAMES];
+ unsigned i;
+
+ if (stack == NULL) {
+ stack = local_frames;
+ if (totalframes > __DB_STACK_MAXFRAMES)
+ totalframes = __DB_STACK_MAXFRAMES;
+ totalframes = backtrace(local_frames, totalframes);
+ skipframes++; /* The capture above adds this function's frame. */
+ }
+
+ /*
+ * Solaris and the GNU C library support this interface; Solaris also
+ * has printstack and walkcontext, of unknown additional value. A NULL
+ * return (out of memory) adds no frames, and free(NULL) is a no-op.
+ */
+ strings = backtrace_symbols(stack, totalframes);
- for (i = 0; i < size; ++i)
- __db_errx(env, "%s", strings[i]);
+ for (i = skipframes; strings != NULL && i < totalframes; ++i)
+ __db_msgadd((ENV *)env, mb, "\t%s\n", strings[i]);
free(strings);
-#endif
+#else
COMPQUIET(env, NULL);
+ COMPQUIET(mb, NULL);
+ COMPQUIET(totalframes, 0);
+ COMPQUIET(skipframes, 0);
+ COMPQUIET(stack, NULL);
+#endif
}
diff --git a/src/os/os_stat.c b/src/os/os_stat.c
index 43c66075..493531b7 100644
--- a/src/os/os_stat.c
+++ b/src/os/os_stat.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/os/os_tmpdir.c b/src/os/os_tmpdir.c
index 06d35ba9..f41383d7 100644
--- a/src/os/os_tmpdir.c
+++ b/src/os/os_tmpdir.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1998, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/os/os_truncate.c b/src/os/os_truncate.c
index f559e9cb..473db9cc 100644
--- a/src/os/os_truncate.c
+++ b/src/os/os_truncate.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2004, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2004, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -14,14 +14,16 @@
* __os_truncate --
* Truncate the file.
*
- * PUBLIC: int __os_truncate __P((ENV *, DB_FH *, db_pgno_t, u_int32_t));
+ * PUBLIC: int __os_truncate
+ * PUBLIC: __P((ENV *, DB_FH *, db_pgno_t, u_int32_t, off_t));
*/
int
-__os_truncate(env, fhp, pgno, pgsize)
+__os_truncate(env, fhp, pgno, pgsize, relative)
ENV *env;
DB_FH *fhp;
db_pgno_t pgno;
u_int32_t pgsize;
+ off_t relative;
{
DB_ENV *dbenv;
off_t offset;
@@ -33,7 +35,7 @@ __os_truncate(env, fhp, pgno, pgsize)
* Truncate a file so that "pgno" is discarded from the end of the
* file.
*/
- offset = (off_t)pgsize * pgno;
+ offset = (off_t)pgsize * pgno + relative;
if (dbenv != NULL &&
FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL))
diff --git a/src/os/os_uid.c b/src/os/os_uid.c
index 2e5c9f87..c3bccb3d 100644
--- a/src/os/os_uid.c
+++ b/src/os/os_uid.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -26,8 +26,6 @@ __os_unique_id(env, idp)
pid_t pid;
u_int32_t id;
- *idp = 0;
-
dbenv = env == NULL ? NULL : env->dbenv;
/*
@@ -35,21 +33,60 @@ __os_unique_id(env, idp)
* time of day and a stack address, all XOR'd together.
*/
__os_id(dbenv, &pid, NULL);
- __os_gettime(env, &v, 1);
+ __os_gettime(env, &v, 0);
id = (u_int32_t)pid ^
(u_int32_t)v.tv_sec ^ (u_int32_t)v.tv_nsec ^ P_TO_UINT32(&pid);
- /*
- * We could try and find a reasonable random-number generator, but
- * that's not all that easy to do. Seed and use srand()/rand(), if
- * we can find them.
- */
- if (DB_GLOBAL(uid_init) == 0) {
- DB_GLOBAL(uid_init) = 1;
- srand((u_int)id);
- }
- id ^= (u_int)rand();
+ if (DB_GLOBAL(random_seeded) == 0)
+ __os_srandom(id);
+ id ^= __os_random();
*idp = id;
}
+
+/*
+ * __os_srandom --
+ * Set the random number generator seed for BDB.
+ *
+ * PUBLIC: void __os_srandom __P((u_int));
+ */
+void
+__os_srandom(seed)
+ u_int seed; /* Seed given to whichever generator is compiled in. */
+{
+ DB_GLOBAL(random_seeded) = 1; /* Tested by __os_random() and __os_unique_id(). */
+#ifdef HAVE_RANDOM_R /* Generator state is kept in DB_GLOBAL fields. */
+ (void)initstate_r(seed, &DB_GLOBAL(random_state),
+ sizeof(DB_GLOBAL(random_state)), &DB_GLOBAL(random_data));
+ (void)srandom_r(seed, &DB_GLOBAL(random_data));
+#elif defined(HAVE_RANDOM) /* Otherwise seed random(3). */
+ srandom(seed);
+#else /* Otherwise fall back to srand(3). */
+ srand(seed);
+#endif
+}
+
+/*
+ * __os_random --
+ * Return the next value from the random number generator for BDB.
+ *
+ * PUBLIC: u_int __os_random __P((void));
+ */
+u_int
+__os_random()
+{
+#ifdef HAVE_RANDOM_R
+ int32_t result; /* Output slot for random_r(). */
+#endif
+ if (DB_GLOBAL(random_seeded) == 0) /* Nobody has seeded yet: */
+ __os_srandom((u_int)time(NULL)); /* seed from the current time. */
+#ifdef HAVE_RANDOM_R
+ random_r(&DB_GLOBAL(random_data), &result); /* Status is not checked. */
+ return ((u_int)result);
+#elif defined(HAVE_RANDOM) /* Same preference order as __os_srandom(). */
+ return ((u_int)random());
+#else
+ return ((u_int)rand());
+#endif
+}
diff --git a/src/os/os_unlink.c b/src/os/os_unlink.c
index f9a0b688..9b6d26fa 100644
--- a/src/os/os_unlink.c
+++ b/src/os/os_unlink.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/os/os_yield.c b/src/os/os_yield.c
index f0e170f0..ff54921e 100644
--- a/src/os/os_yield.c
+++ b/src/os/os_yield.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/os_qnx/os_qnx_fsync.c b/src/os_qnx/os_qnx_fsync.c
index 827fa446..6ea04b00 100644
--- a/src/os_qnx/os_qnx_fsync.c
+++ b/src/os_qnx/os_qnx_fsync.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/os_qnx/os_qnx_open.c b/src/os_qnx/os_qnx_open.c
index d0214a0d..cf2f781e 100644
--- a/src/os_qnx/os_qnx_open.c
+++ b/src/os_qnx/os_qnx_open.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/os_vxworks/os_vx_abs.c b/src/os_vxworks/os_vx_abs.c
index 69413ee5..78342fce 100644
--- a/src/os_vxworks/os_vx_abs.c
+++ b/src/os_vxworks/os_vx_abs.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/os_vxworks/os_vx_config.c b/src/os_vxworks/os_vx_config.c
index 649a3b4a..7c7fa4c8 100644
--- a/src/os_vxworks/os_vx_config.c
+++ b/src/os_vxworks/os_vx_config.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/os_vxworks/os_vx_map.c b/src/os_vxworks/os_vx_map.c
index 517cadae..859bde6c 100644
--- a/src/os_vxworks/os_vx_map.c
+++ b/src/os_vxworks/os_vx_map.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1998, 2015 Oracle and/or its affiliates. All rights reserved.
*
* This code is derived from software contributed to Sleepycat Software by
* Frederick G.M. Roeber of Netscape Communications Corp.
diff --git a/src/os_vxworks/os_vx_rpath.c b/src/os_vxworks/os_vx_rpath.c
index 1ffd3549..d7202c78 100644
--- a/src/os_vxworks/os_vx_rpath.c
+++ b/src/os_vxworks/os_vx_rpath.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/os_vxworks/os_vx_yield.c b/src/os_vxworks/os_vx_yield.c
index c7c54cf2..e3741c3f 100644
--- a/src/os_vxworks/os_vx_yield.c
+++ b/src/os_vxworks/os_vx_yield.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/os_windows/ce_ctime.c b/src/os_windows/ce_ctime.c
index e8ae76aa..d4e6a4fc 100644
--- a/src/os_windows/ce_ctime.c
+++ b/src/os_windows/ce_ctime.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -25,8 +25,8 @@ __os_ctime(tod, time_buf)
__int64 i64_tod;
struct _FILETIME file_tod, file_loc;
struct _SYSTEMTIME sys_loc;
-static const __int64 SECS_BETWEEN_EPOCHS = 11644473600;
-static const __int64 SECS_TO_100NS = 10000000; /* 10^7 */
+ static const __int64 SECS_BETWEEN_EPOCHS = 11644473600;
+ static const __int64 SECS_TO_100NS = 10000000; /* 10^7 */
strcpy(time_buf, "Thu Jan 01 00:00:00 1970");
time_buf[CTIME_BUFLEN - 1] = '\0';
diff --git a/src/os_windows/ce_freopen.c b/src/os_windows/ce_freopen.c
new file mode 100644
index 00000000..331450d0
--- /dev/null
+++ b/src/os_windows/ce_freopen.c
@@ -0,0 +1,52 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __ce_freopen --
+ * Reopen a stream on WinCE.
+ *
+ * PUBLIC: #ifdef DB_WINCE
+ * PUBLIC: FILE * __ce_freopen
+ * PUBLIC: __P((const char *, const char *, FILE *));
+ * PUBLIC: #endif
+ */
+FILE *
+__ce_freopen(path, mode, stream)
+	const char *path, *mode;
+	FILE *stream;
+{
+	size_t lenm, lenp;
+	wchar_t *wpath, *wmode;
+	FILE *handle;
+
+	/*
+	 * Convert the multibyte path and mode to wide strings and hand them
+	 * to _wfreopen.  Returns whatever _wfreopen returns, or NULL when an
+	 * allocation or a conversion fails.  Both temporary buffers are
+	 * released on every path.
+	 */
+	wpath = NULL;
+	wmode = NULL;
+	handle = NULL;
+	/* One wide character per input byte, plus the terminator, is enough. */
+	lenp = strlen(path) + 1;
+	lenm = strlen(mode) + 1;
+
+	if (__os_malloc(NULL, lenp * sizeof(wchar_t), &wpath) != 0 ||
+	    __os_malloc(NULL, lenm * sizeof(wchar_t), &wmode) != 0)
+		goto err;
+
+	/*
+	 * mbstowcs returns the number of wide characters converted, not
+	 * counting the terminating NUL, and (size_t)-1 on an invalid
+	 * sequence.  Comparing the result against the buffer length would
+	 * reject every successful conversion, so test for the error value.
+	 */
+	if (mbstowcs(wpath, path, lenp) == (size_t)-1 ||
+	    mbstowcs(wmode, mode, lenm) == (size_t)-1)
+		goto err;
+	/* Guarantee termination regardless of how many characters were used. */
+	wpath[lenp - 1] = L'\0';
+	wmode[lenm - 1] = L'\0';
+
+	handle = _wfreopen(wpath, wmode, stream);
+err:
+	if (wpath != NULL)
+		__os_free(NULL, wpath);
+	if (wmode != NULL)
+		__os_free(NULL, wmode);
+	return handle;
+}
diff --git a/src/os_windows/ce_gmtime.c b/src/os_windows/ce_gmtime.c
new file mode 100644
index 00000000..55605c89
--- /dev/null
+++ b/src/os_windows/ce_gmtime.c
@@ -0,0 +1,58 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2012, 2015 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __ce_gmtime --
+ * gmtime implementation on WinCE.
+ *
+ * PUBLIC: #ifdef DB_WINCE
+ * PUBLIC: struct tm * __ce_gmtime __P((const time_t *));
+ * PUBLIC: #endif
+ */
+
+struct tm *
+__ce_gmtime(timer)
+	const time_t *timer;
+{
+	static struct tm br_time;
+	unsigned long days, secs_of_day, when;
+	int leap, mon, yr;
+
+	/*
+	 * Break *timer down into calendar fields with no time zone applied.
+	 * The result lives in a static buffer that is overwritten by the next
+	 * call.
+	 */
+	when = (unsigned long)*timer;
+	secs_of_day = when % SECSPERDAY;
+	days = when / SECSPERDAY;
+
+	/* Time of day. */
+	br_time.tm_hour = secs_of_day / 3600;
+	br_time.tm_min = (secs_of_day % 3600) / 60;
+	br_time.tm_sec = secs_of_day % 60;
+
+	/* day 0 was a thursday */
+	br_time.tm_wday = (days + 4) % 7;
+
+	/* Strip off whole years, starting from the epoch year. */
+	for (yr = TM_YEAR_EPOCH; days >= year_lengths[isleap(yr)]; yr++)
+		days -= year_lengths[isleap(yr)];
+	br_time.tm_year = yr - TM_YEAR_BASE;
+	br_time.tm_yday = days;
+
+	/* Strip off whole months of the remaining year. */
+	leap = isleap(yr);
+	for (mon = 0; days >= mon_lengths[leap][mon]; mon++)
+		days -= mon_lengths[leap][mon];
+	br_time.tm_mon = mon;
+	br_time.tm_mday = days + 1;
+	br_time.tm_isdst = 0;
+
+	return &br_time;
+}
diff --git a/src/os_windows/ce_localtime.c b/src/os_windows/ce_localtime.c
new file mode 100644
index 00000000..23c53bed
--- /dev/null
+++ b/src/os_windows/ce_localtime.c
@@ -0,0 +1,44 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2012, 2015 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __ce_localtime --
+ * localtime implementation on WinCE.
+ *
+ * PUBLIC: #ifdef DB_WINCE
+ * PUBLIC: struct tm * localtime __P((const time_t *));
+ * PUBLIC: #endif
+ */
+struct tm *
+localtime(t)
+	const time_t *t;
+{
+	static struct tm y;
+	FILETIME uTm, lTm;
+	SYSTEMTIME pTm;
+	int64_t t64;
+
+	/*
+	 * Convert *t to local broken-down time using the Win32 FILETIME
+	 * conversions.  The result lives in a static buffer that is
+	 * overwritten by the next call.  tm_yday and tm_isdst are never
+	 * assigned here, so they keep the zero value of static storage.
+	 */
+	t64 = *t;
+	/* Shift by 11644473600 seconds, then scale by 10^7 into FILETIME units. */
+	t64 = (t64 + 11644473600)*10000000;
+	uTm.dwLowDateTime = (DWORD)(t64 & 0xFFFFFFFF);
+	uTm.dwHighDateTime= (DWORD)(t64 >> 32);
+
+	/*
+	 * If either conversion fails the SYSTEMTIME would otherwise be read
+	 * uninitialized; zero it so the result is at least deterministic.
+	 * NULL is deliberately not returned: callers in this library
+	 * dereference the result without checking it.
+	 */
+	if (!FileTimeToLocalFileTime(&uTm, &lTm) ||
+	    !FileTimeToSystemTime(&lTm, &pTm))
+		memset(&pTm, 0, sizeof(pTm));
+
+	y.tm_year = pTm.wYear - 1900;
+	y.tm_mon = pTm.wMonth - 1;
+	y.tm_wday = pTm.wDayOfWeek;
+	y.tm_mday = pTm.wDay;
+	y.tm_hour = pTm.wHour;
+	y.tm_min = pTm.wMinute;
+	y.tm_sec = pTm.wSecond;
+	return &y;
+}
diff --git a/src/os_windows/ce_mktime.c b/src/os_windows/ce_mktime.c
new file mode 100644
index 00000000..0d3a0906
--- /dev/null
+++ b/src/os_windows/ce_mktime.c
@@ -0,0 +1,257 @@
+/*
+ * Copyright (c) 1987, 1989 Regents of the University of California.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Arthur David Olson of the National Cancer Institute.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE. */
+
+/*static char *sccsid = "from: @(#)ctime.c 5.26 (Berkeley) 2/23/91";*/
+
+/*
+ * This implementation of mktime is lifted straight from the NetBSD (BSD 4.4)
+ * version. I modified it slightly to divorce it from the internals of the
+ * ctime library. Thus this version can't use details of the internal
+ * timezone state file to figure out strange unnormalized struct tm values,
+ * as might result from someone doing date math on the tm struct then passing
+ * it to mktime.
+ *
+ * It just does as well as it can at normalizing the tm input, then does a
+ * binary search of the time space using the system's localtime() function.
+ *
+ * The original binary search was defective in that it didn't consider the
+ * setting of tm_isdst when comparing tm values, causing the search to be
+ * flubbed for times near the dst/standard time changeover. The original
+ * code seems to make up for this by grubbing through the timezone info
+ * whenever the binary search barfed. Since I don't have that luxury in
+ * portable code, I have to take care of tm_isdst in the comparison routine.
+ * This requires knowing how many minutes offset dst is from standard time.
+ *
+ * So, if you live somewhere in the world where dst is not 60 minutes offset,
+ * and your vendor doesn't supply mktime(), you'll have to edit this variable
+ * by hand. Sorry about that.
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+#undef DSTMINUTES
+#define DSTMINUTES 60
+
+#undef FALSE
+#undef TRUE
+#define FALSE 0
+#define TRUE 1
+
+/*
+** Adapted from code provided by Robert Elz, who writes:
+** The "best" way to do mktime I think is based on an idea of Bob
+** Kridle's (so its said...) from a long time ago. (mtxinu!kridle now).
+** It does a binary search of the time_t space. Since time_t's are
+** just 32 bits, its a max of 32 iterations (even at 64 bits it
+** would still be very reasonable).
+*/
+
+#undef WRONG
+#define WRONG (-1)
+
+/*
+ * Days in each month, indexed as mon_lengths[isleap(year)][tm_mon]: the first
+ * row has a 28-day February, the second a 29-day February.
+ */
+const unsigned int mon_lengths[2][MONSPERYEAR] = {
+ { 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 },
+ { 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 }
+};
+/*
+ * Days in a year, indexed as year_lengths[isleap(year)].
+ * NOTE(review): presumably DAYSPERNYEAR/DAYSPERLYEAR are 365/366; they are
+ * defined outside this file.
+ */
+const unsigned int year_lengths[2] = {
+ DAYSPERNYEAR, DAYSPERLYEAR
+};
+
+/*
+ * normalize --
+ *	Carry or borrow between two adjacent time fields so that *unitsptr
+ *	ends up in the range [0, base), adjusting *tensptr so the combined
+ *	value (*tensptr * base + *unitsptr) is unchanged.  base must be
+ *	positive.
+ */
+static void
+normalize(tensptr, unitsptr, base)
+	int base, *tensptr, *unitsptr;
+{
+	int borrow, deficit;
+
+	if (*unitsptr >= base) {
+		*tensptr += *unitsptr / base;
+		*unitsptr %= base;
+	} else if (*unitsptr < 0) {
+		/*
+		 * Work on the positive magnitude so the division does not
+		 * depend on how negative operands are rounded.  Borrow just
+		 * enough whole "tens" to make the units non-negative; an exact
+		 * multiple of base must yield 0, never base itself, because
+		 * callers use the result as an array index.
+		 */
+		deficit = -*unitsptr;
+		borrow = deficit / base;
+		if (deficit % base != 0)
+			++borrow;
+		*tensptr -= borrow;
+		*unitsptr = borrow * base - deficit;
+	}
+}
+
+/*
+ * mkdst --
+ *	Return a copy of *tmp moved forward by DSTMINUTES and marked as
+ *	daylight time.  The copy lives in a static buffer that is overwritten
+ *	by the next call; *tmp itself is not modified.
+ */
+static struct tm *
+mkdst(tmp)
+	struct tm * tmp;
+{
+	static struct tm shifted;
+
+	shifted = *tmp;
+	shifted.tm_min += DSTMINUTES;
+	shifted.tm_isdst = 1;
+	/* Fold any overflow of the minutes into the hour. */
+	normalize(&shifted.tm_hour, &shifted.tm_min, MINSPERHOUR);
+	return &shifted;
+}
+
+/*
+ * tmcomp --
+ *	Order two broken-down times.  Returns a negative, zero or positive
+ *	value as *atmp is earlier than, equal to or later than *btmp.
+ */
+static int
+tmcomp(atmp, btmp)
+	register struct tm *atmp, *btmp;
+{
+	int diff;
+
+	/* Different calendar days decide the result immediately. */
+	if ((diff = atmp->tm_year - btmp->tm_year) != 0)
+		return diff;
+	if ((diff = atmp->tm_mon - btmp->tm_mon) != 0)
+		return diff;
+	if ((diff = atmp->tm_mday - btmp->tm_mday) != 0)
+		return diff;
+
+	/*
+	 * Same day.  If exactly one side is flagged as daylight time, shift
+	 * the other one so both are expressed on the same footing.
+	 */
+	if (atmp->tm_isdst == 1 && !btmp->tm_isdst)
+		btmp = mkdst(btmp);
+	else if (btmp->tm_isdst == 1 && !atmp->tm_isdst)
+		atmp = mkdst(atmp);
+
+	/* Time of day, most significant field first. */
+	if ((diff = atmp->tm_hour - btmp->tm_hour) != 0)
+		return diff;
+	if ((diff = atmp->tm_min - btmp->tm_min) != 0)
+		return diff;
+	return atmp->tm_sec - btmp->tm_sec;
+}
+
+/*
+ * time2 --
+ * Convert the broken-down time in *tmp to a time_t by binary search.
+ *
+ * A normalized copy of *tmp is compared, via tmcomp, with the broken-down
+ * form of successive candidate values: localtime() is used when usezn is
+ * non-zero, gmtime() otherwise. On success *okayp is set to TRUE, *tmp is
+ * overwritten with the broken-down form of the result and the time_t is
+ * returned. On failure WRONG is returned, *okayp stays FALSE and *tmp is
+ * left untouched.
+ *
+ * NOTE(review): the pointers returned by localtime()/gmtime() are
+ * dereferenced without a NULL check.
+ */
+static time_t
+time2(tmp, okayp, usezn)
+ struct tm *tmp;
+ int *okayp, usezn;
+{
+ register int bits, dir, i, saved_seconds;
+ time_t t;
+ struct tm yourtm, mytm;
+
+ *okayp = FALSE;
+ /* Work on a copy so the caller's struct is only changed on success. */
+ yourtm = *tmp;
+ /*
+ * Seconds in [0, SECSPERMIN + 1] are left as they are (presumably to
+ * allow for leap seconds); anything else is folded into the minutes.
+ */
+ if (yourtm.tm_sec >= SECSPERMIN + 2 || yourtm.tm_sec < 0)
+ normalize(&yourtm.tm_min, &yourtm.tm_sec, SECSPERMIN);
+ normalize(&yourtm.tm_hour, &yourtm.tm_min, MINSPERHOUR);
+ normalize(&yourtm.tm_mday, &yourtm.tm_hour, HOURSPERDAY);
+ normalize(&yourtm.tm_year, &yourtm.tm_mon, MONSPERYEAR);
+ /* Borrow whole years until the day of the month is positive. */
+ while (yourtm.tm_mday <= 0) {
+ --yourtm.tm_year;
+ yourtm.tm_mday +=
+ year_lengths[isleap(yourtm.tm_year + TM_YEAR_BASE)];
+ }
+ /* Carry surplus days forward one month at a time. */
+ for ( ; ; ) {
+ i = mon_lengths[isleap(yourtm.tm_year +
+ TM_YEAR_BASE)][yourtm.tm_mon];
+ if (yourtm.tm_mday <= i)
+ break;
+ yourtm.tm_mday -= i;
+ if (++yourtm.tm_mon >= MONSPERYEAR) {
+ yourtm.tm_mon = 0;
+ ++yourtm.tm_year;
+ }
+ }
+ /* Search on a zero seconds field; the seconds are added back below. */
+ saved_seconds = yourtm.tm_sec;
+ yourtm.tm_sec = 0;
+ /*
+ ** Calculate the number of magnitude bits in a time_t
+ ** (this works regardless of whether time_t is
+ ** signed or unsigned, though lint complains if unsigned).
+ */
+ /*
+ * NOTE(review): for a signed time_t this loop ends only when the shift
+ * reaches the sign bit, which the C standard does not define.
+ */
+ for (bits = 0, t = 1; t > 0; ++bits, t <<= 1)
+ ;
+ /*
+ ** If time_t is signed, then 0 is the median value,
+ ** if time_t is unsigned, then 1 << bits is median.
+ */
+ t = (t < 0) ? 0 : ((time_t) 1 << bits);
+ for ( ; ; ) {
+ if (usezn)
+ mytm = *localtime(&t);
+ else
+ mytm = *gmtime(&t);
+ /* dir > 0: the candidate is later than the target; < 0: earlier. */
+ dir = tmcomp(&mytm, &yourtm);
+ if (dir != 0) {
+ /* No bits left to adjust: no time_t matches. */
+ if (bits-- < 0)
+ return WRONG;
+ /* Last step is a single decrement; otherwise halve the step. */
+ if (bits < 0)
+ --t;
+ else if (dir > 0)
+ t -= (time_t) 1 << bits;
+ else t += (time_t) 1 << bits;
+ continue;
+ }
+ /*
+ * The fields match. Accept unless the caller asked for a
+ * specific tm_isdst that this candidate does not have.
+ */
+ if (yourtm.tm_isdst < 0 || mytm.tm_isdst == yourtm.tm_isdst)
+ break;
+
+ return WRONG;
+ }
+ t += saved_seconds;
+ /* Hand the broken-down form of the result back to the caller. */
+ if (usezn)
+ *tmp = *localtime(&t);
+ else
+ *tmp = *gmtime(&t);
+ *okayp = TRUE;
+ return t;
+}
+
+/*
+ * time1 --
+ *	Local-time front end for time2: clamps tm_isdst to at most 1, runs the
+ *	search with the time zone applied, and returns WRONG when the search
+ *	failed and the caller had specified tm_isdst.
+ */
+static time_t
+time1(tmp)
+	struct tm * tmp;
+{
+	time_t result;
+	int found;
+
+	/* Any value above 1 is treated as plain "daylight time". */
+	if (tmp->tm_isdst > 1)
+		tmp->tm_isdst = 1;
+
+	result = time2(tmp, &found, 1);
+	if (!found && tmp->tm_isdst >= 0)
+		return WRONG;
+
+	return result;
+}
+
+/*
+ * __ce_mktime --
+ * mktime implementation on WinCE. Passes tmp to time1() and returns its
+ * result unchanged, which is WRONG (-1) when no matching time_t is found.
+ *
+ * PUBLIC: #ifdef DB_WINCE
+ * PUBLIC: time_t __ce_mktime __P((struct tm *));
+ * PUBLIC: #endif
+ */
+time_t
+__ce_mktime(tmp)
+ struct tm * tmp;
+{
+ return time1(tmp);
+}
diff --git a/src/os_windows/ce_remove.c b/src/os_windows/ce_remove.c
new file mode 100644
index 00000000..f955f3b4
--- /dev/null
+++ b/src/os_windows/ce_remove.c
@@ -0,0 +1,26 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2012, 2015 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __ce_remove --
+ * remove implementation on WinCE. Passes path to __os_unlink with a NULL
+ * environment and 0 as the third argument, and returns its result unchanged.
+ *
+ * PUBLIC: #ifdef DB_WINCE
+ * PUBLIC: int __ce_remove __P((const char *path));
+ * PUBLIC: #endif
+ */
+
+int
+__ce_remove(path)
+ const char *path;
+{
+ return __os_unlink(NULL, path, 0);
+}
diff --git a/src/os_windows/ce_util_sig.c b/src/os_windows/ce_util_sig.c
new file mode 100644
index 00000000..11fb4ad7
--- /dev/null
+++ b/src/os_windows/ce_util_sig.c
@@ -0,0 +1,35 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2012, 2015 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * The stub functions for signal handling.
+ * WinCE does not support signal handling, so we just define stub functions to
+ * avoid linkage errors for utilities build.
+ */
+
+/*
+ * __db_util_siginit --
+ * Stub for WinCE: does nothing.
+ */
+void
+__db_util_siginit()
+{
+ return;
+}
+
+/*
+ * __db_util_interrupted --
+ * Stub for WinCE: always returns 0.
+ */
+int
+__db_util_interrupted()
+{
+ return (0);
+}
+
+/*
+ * __db_util_sigresend --
+ * Stub for WinCE: does nothing.
+ */
+void
+__db_util_sigresend()
+{
+ return;
+}
diff --git a/src/os_windows/os_abs.c b/src/os_windows/os_abs.c
index e769ab2c..f9be934e 100644
--- a/src/os_windows/os_abs.c
+++ b/src/os_windows/os_abs.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/os_windows/os_clock.c b/src/os_windows/os_clock.c
index e548729b..80a96785 100644
--- a/src/os_windows/os_clock.c
+++ b/src/os_windows/os_clock.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -16,7 +16,7 @@
*/
void
__os_gettime(env, tp, monotonic)
- ENV *env;
+ const ENV *env;
db_timespec *tp;
int monotonic;
{
diff --git a/src/os_windows/os_config.c b/src/os_windows/os_config.c
index 4250dbd4..c4b61700 100644
--- a/src/os_windows/os_config.c
+++ b/src/os_windows/os_config.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/os_windows/os_cpu.c b/src/os_windows/os_cpu.c
index 0922071f..41004753 100644
--- a/src/os_windows/os_cpu.c
+++ b/src/os_windows/os_cpu.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/os_windows/os_dir.c b/src/os_windows/os_dir.c
index 31d364d7..4065d182 100644
--- a/src/os_windows/os_dir.c
+++ b/src/os_windows/os_dir.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/os_windows/os_errno.c b/src/os_windows/os_errno.c
index ba8ec359..a8c35480 100644
--- a/src/os_windows/os_errno.c
+++ b/src/os_windows/os_errno.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/os_windows/os_fid.c b/src/os_windows/os_fid.c
index f2d190b1..bfd4182c 100644
--- a/src/os_windows/os_fid.c
+++ b/src/os_windows/os_fid.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -118,12 +118,12 @@ __os_fileid(env, fname, unique_okay, fidp)
DB_GLOBAL(fid_serial) = (u_int32_t)pid;
} else
DB_GLOBAL(fid_serial) += 100000;
-
+ tmp = (u_int32_t)DB_GLOBAL(fid_serial);
} else {
tmp = (u_int32_t)fi.dwVolumeSerialNumber;
- for (p = (u_int8_t *)&tmp, i = sizeof(u_int32_t); i > 0; --i)
- *fidp++ = *p++;
}
+ for (p = (u_int8_t *)&tmp, i = sizeof(u_int32_t); i > 0; --i)
+ *fidp++ = *p++;
return (0);
}
diff --git a/src/os_windows/os_flock.c b/src/os_windows/os_flock.c
index cb3e4986..9dcd1e81 100644
--- a/src/os_windows/os_flock.c
+++ b/src/os_windows/os_flock.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/os_windows/os_fsync.c b/src/os_windows/os_fsync.c
index 8824aac1..5194c00b 100644
--- a/src/os_windows/os_fsync.c
+++ b/src/os_windows/os_fsync.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/os_windows/os_getenv.c b/src/os_windows/os_getenv.c
index aad59d01..0ac1db0a 100644
--- a/src/os_windows/os_getenv.c
+++ b/src/os_windows/os_getenv.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/os_windows/os_handle.c b/src/os_windows/os_handle.c
index e6edc3ef..65809017 100644
--- a/src/os_windows/os_handle.c
+++ b/src/os_windows/os_handle.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1998, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/os_windows/os_map.c b/src/os_windows/os_map.c
index 8f646d68..eefa3e8b 100644
--- a/src/os_windows/os_map.c
+++ b/src/os_windows/os_map.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -105,9 +105,12 @@ __os_detach(env, infop, destroy)
int destroy;
{
DB_ENV *dbenv;
+ REGION *rp;
int ret, t_ret;
dbenv = env->dbenv;
+ rp = infop->rp;
+ ret = 0;
if (infop->wnt_handle != NULL) {
(void)CloseHandle(infop->wnt_handle);
@@ -120,10 +123,19 @@ __os_detach(env, infop, destroy)
return (ret);
}
- ret = !UnmapViewOfFile(infop->addr) ? __os_get_syserr() : 0;
- if (ret != 0) {
- __db_syserr(env, ret, DB_STR("0007", "UnmapViewOfFile"));
- ret = __os_posix_err(ret);
+ if (F_ISSET(env, ENV_FORCESYNCENV))
+ if (!FlushViewOfFile(infop->addr, rp->max)) {
+ ret = __os_get_syserr();
+ __db_syserr(env, ret, DB_STR("0249",
+ "FlushViewOfFile failed on closing environment"));
+ ret = __os_posix_err(ret);
+ }
+
+ t_ret = !UnmapViewOfFile(infop->addr) ? __os_get_syserr() : 0;
+ if (t_ret != 0) {
+ __db_syserr(env, t_ret, DB_STR("0007", "UnmapViewOfFile"));
+ if (ret == 0)
+ ret = __os_posix_err(t_ret);
}
if (!F_ISSET(env, ENV_SYSTEM_MEM) && destroy &&
diff --git a/src/os_windows/os_mkdir.c b/src/os_windows/os_mkdir.c
index b87f3f9d..7ad7eed2 100644
--- a/src/os_windows/os_mkdir.c
+++ b/src/os_windows/os_mkdir.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/os_windows/os_open.c b/src/os_windows/os_open.c
index 44f2faf3..bc715a96 100644
--- a/src/os_windows/os_open.c
+++ b/src/os_windows/os_open.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/os_windows/os_rename.c b/src/os_windows/os_rename.c
index 791f53a5..d70f20ca 100644
--- a/src/os_windows/os_rename.c
+++ b/src/os_windows/os_rename.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/os_windows/os_rmdir.c b/src/os_windows/os_rmdir.c
new file mode 100644
index 00000000..18090f09
--- /dev/null
+++ b/src/os_windows/os_rmdir.c
@@ -0,0 +1,42 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_rmdir --
+ * Remove a directory.
+ */
+int
+__os_rmdir(env, name)
+	ENV *env;
+	const char *name;
+{
+	DB_ENV *dbenv;
+	_TCHAR *tname;
+	int ret;
+
+	/* env may be NULL, in which case there is nothing to log through. */
+	dbenv = NULL;
+	if (env != NULL)
+		dbenv = env->dbenv;
+
+	/* Trace the operation when file-operation verbosity is enabled. */
+	if (dbenv != NULL &&
+	    FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL))
+		__db_msg(env, DB_STR_A("0240", "fileops: rmdir %s",
+		    "%s"), name);
+
+	/* Convert the name to the system string type; fail if that fails. */
+	TO_TSTRING(env, name, tname, ret);
+	if (ret != 0)
+		return (ret);
+
+	RETRY_CHK(!RemoveDirectory(tname), ret);
+	FREE_STRING(env, tname);
+
+	/* Map a system error onto its POSIX equivalent; 0 means success. */
+	return (ret == 0 ? 0 : __os_posix_err(ret));
+}
diff --git a/src/os_windows/os_rw.c b/src/os_windows/os_rw.c
index e64a7d08..20644e6e 100644
--- a/src/os_windows/os_rw.c
+++ b/src/os_windows/os_rw.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/os_windows/os_seek.c b/src/os_windows/os_seek.c
index 7632c15d..613e4a7c 100644
--- a/src/os_windows/os_seek.c
+++ b/src/os_windows/os_seek.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/os_windows/os_stat.c b/src/os_windows/os_stat.c
index 11248886..5c3a0fcc 100644
--- a/src/os_windows/os_stat.c
+++ b/src/os_windows/os_stat.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/os_windows/os_truncate.c b/src/os_windows/os_truncate.c
index fcbb37b2..d1150c85 100644
--- a/src/os_windows/os_truncate.c
+++ b/src/os_windows/os_truncate.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2004, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2004, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -15,11 +15,12 @@
* Truncate the file.
*/
int
-__os_truncate(env, fhp, pgno, pgsize)
+__os_truncate(env, fhp, pgno, pgsize, relative)
ENV *env;
DB_FH *fhp;
db_pgno_t pgno;
u_int32_t pgsize;
+ off_t relative;
{
/* Yes, this really is how Microsoft have designed their API */
union {
@@ -34,7 +35,7 @@ __os_truncate(env, fhp, pgno, pgsize)
int ret;
dbenv = env == NULL ? NULL : env->dbenv;
- offset = (off_t)pgsize * pgno;
+ offset = (off_t)pgsize * pgno + relative;
ret = 0;
if (dbenv != NULL &&
@@ -84,7 +85,7 @@ __os_truncate(env, fhp, pgno, pgsize)
* We can't switch to SetFilePointerEx, which knows about 64-bit
* offsets, because it isn't supported on Win9x/ME.
*/
- RETRY_CHK((off.bigint = (__int64)pgsize * pgno,
+ RETRY_CHK((off.bigint = (__int64)pgsize * pgno + relative,
(SetFilePointer(fhp->trunc_handle, off.low, &off.high, FILE_BEGIN)
== INVALID_SET_FILE_POINTER && GetLastError() != NO_ERROR) ||
!SetEndOfFile(fhp->trunc_handle)), ret);
diff --git a/src/os_windows/os_unlink.c b/src/os_windows/os_unlink.c
index 6a0a6572..5c63a5e6 100644
--- a/src/os_windows/os_unlink.c
+++ b/src/os_windows/os_unlink.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/os_windows/os_yield.c b/src/os_windows/os_yield.c
index 0d32ef69..bf326ee2 100644
--- a/src/os_windows/os_yield.c
+++ b/src/os_windows/os_yield.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/qam/qam.c b/src/qam/qam.c
index e81d4795..0c71fd0d 100644
--- a/src/qam/qam.c
+++ b/src/qam/qam.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -260,7 +260,7 @@ __qamc_put(dbc, key, data, flags, pgnop)
}
if (exact != 0 && flags == DB_NOOVERWRITE)
- ret = DB_KEYEXIST;
+ ret = DBC_ERR(dbc, DB_KEYEXIST);
else
/* Put the item on the page. */
ret = __qam_pitem(dbc,
@@ -526,7 +526,7 @@ __qamc_del(dbc, flags)
return (ret);
if (QAM_NOT_VALID(meta, cp->recno)) {
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
}
first = meta->first_recno;
@@ -549,7 +549,7 @@ __qamc_del(dbc, flags)
goto err;
if (!exact) {
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
}
@@ -636,9 +636,9 @@ __qamc_get(dbc, key, data, flags, pgnop)
QUEUE_CURSOR *cp;
db_lockmode_t lock_mode;
db_pgno_t metapno;
- db_recno_t first;
+ db_recno_t first, old_first;
int exact, inorder, is_first, ret, t_ret, wait, with_delete;
- int retrying;
+ int retrying, stay;
u_int32_t skip, meta_mode;
dbp = dbc->dbp;
@@ -652,7 +652,9 @@ __qamc_get(dbc, key, data, flags, pgnop)
meta = NULL;
*pgnop = 0;
pg = NULL;
- retrying = t_ret = wait = with_delete = 0;
+ retrying = t_ret = wait = with_delete = 0;
+ stay = 1;
+ old_first = RECNO_OOB;
if (flags == DB_CONSUME_WAIT) {
wait = 1;
@@ -676,25 +678,25 @@ __qamc_get(dbc, key, data, flags, pgnop)
t = (QUEUE *)dbp->q_internal;
metapno = t->q_meta;
- /*
- * Get the meta page first
- */
- if ((ret = __memp_fget(mpf, &metapno,
- dbc->thread_info, dbc->txn, meta_mode, &meta)) != 0)
- return (ret);
-
/* Release any previous lock if not in a transaction. */
if ((ret = __TLPUT(dbc, cp->lock)) != 0)
goto err;
skip = 0;
-retry: /* Update the record number. */
+retry:
+ /*
+ * Get the meta page first
+ */
+ if (meta == NULL && (ret = __memp_fget(mpf, &metapno,
+ dbc->thread_info, dbc->txn, meta_mode, &meta)) != 0)
+ return (ret); /* Update the record number. */
+
switch (flags) {
case DB_CURRENT:
break;
case DB_NEXT_DUP:
case DB_PREV_DUP:
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
/* NOTREACHED */
case DB_NEXT:
@@ -711,7 +713,7 @@ retry: /* Update the record number. */
if (QAM_AFTER_CURRENT(meta, cp->recno)) {
pg = NULL;
if (!wait) {
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
}
/*
@@ -774,6 +776,7 @@ retry: /* Update the record number. */
DB_LOCK_UPGRADE, &metalock)) != 0) {
if (ret == DB_LOCK_DEADLOCK)
ret = DB_LOCK_NOTGRANTED;
+ (void)DBC_ERR(dbc, ret);
goto err;
}
@@ -792,6 +795,8 @@ retry: /* Update the record number. */
/* get the first record number */
cp->recno = first = meta->first_recno;
+ if (old_first == RECNO_OOB)
+ old_first = first;
break;
case DB_PREV:
@@ -799,7 +804,7 @@ retry: /* Update the record number. */
if (cp->recno != RECNO_OOB) {
if (cp->recno == meta->first_recno ||
QAM_BEFORE_FIRST(meta, cp->recno)) {
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
}
QAM_DEC_RECNO(cp->recno);
@@ -808,7 +813,7 @@ retry: /* Update the record number. */
/* FALLTHROUGH */
case DB_LAST:
if (meta->first_recno == meta->cur_recno) {
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
}
cp->recno = meta->cur_recno;
@@ -892,11 +897,11 @@ dolock: if (!with_delete || inorder || retrying) {
LOCK_INIT(lock);
goto release_retry;
}
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto lerr;
}
if (QAM_AFTER_CURRENT(meta, cp->recno)) {
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto lerr;
}
}
@@ -951,9 +956,37 @@ release_retry: /* Release locks and retry, if possible. */
case DB_NEXT_NODUP:
if (!with_delete)
is_first = 0;
- else if (first == cp->recno)
+ else if (first == cp->recno) {
/* we have verified that this record is gone. */
QAM_INC_RECNO(first);
+ /*
+ * If we are reading in order and the first
+ * record was not there, we need to reflect
+ * this in the meta page, so that we can
+ * avoid checking this record again and again.
+ */
+ if (inorder && cp->recno == meta->first_recno) {
+ if (DBC_LOGGING(dbc)) {
+#ifdef QDEBUG
+ (void)__log_printf(
+ dbp->env, dbc->txn,
+ "Queue O: %x %u %u %u",
+ dbc->locker ?
+ dbc->locker->id : 0,
+ cp->recno, first,
+ meta->cur_recno);
+#endif
+ if ((ret = __qam_incfirst_log(
+ dbp, dbc->txn,
+ &meta->dbmeta.lsn, 0,
+ cp->recno,
+ PGNO_BASE_MD)) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(meta->dbmeta.lsn);
+ meta->first_recno = first;
+ }
+ }
if (QAM_BEFORE_FIRST(meta, cp->recno) &&
DONT_NEED_LOCKS(dbc))
flags = DB_FIRST;
@@ -979,7 +1012,7 @@ release_retry: /* Release locks and retry, if possible. */
default:
/* this is for the SET and GET_BOTH cases */
- ret = DB_KEYEMPTY;
+ ret = DBC_ERR(dbc, DB_KEYEMPTY);
goto err1;
}
retrying = 0;
@@ -1031,10 +1064,10 @@ release_retry: /* Release locks and retry, if possible. */
*/
tmp.data = qp->data;
tmp.size = t->re_len;
- if ((ret = __bam_defcmp(dbp, data, &tmp)) != 0) {
+ if ((ret = __bam_defcmp(dbp, data, &tmp, NULL)) != 0) {
if (flags == DB_GET_BOTH_RANGE)
goto release_retry;
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err1;
}
}
@@ -1139,14 +1172,17 @@ release_retry: /* Release locks and retry, if possible. */
* If we deleted the first record we checked then we moved
* the first pointer properly.
*/
-
- if (first == cp->recno && (skip = (first % t->rec_page)) != 0)
+ if (((QUEUE *)dbp->q_internal)->page_ext != 0)
+ stay = (QAM_RECNO_EXTENT(dbp, old_first) ==
+ QAM_RECNO_EXTENT(dbp, first));
+ if (stay && first == cp->recno &&
+ (skip = (first % t->rec_page)) != 0)
goto done;
if (meta == NULL &&
(ret = __memp_fget(mpf, &metapno,
dbc->thread_info, dbc->txn, 0, &meta)) != 0)
goto err;
- if (skip && !QAM_BEFORE_FIRST(meta, first))
+ if (stay && skip && !QAM_BEFORE_FIRST(meta, first))
goto done;
#ifdef QDEBUG
@@ -1156,7 +1192,11 @@ release_retry: /* Release locks and retry, if possible. */
dbc->locker ? dbc->locker->id : 0,
cp->recno, first, meta->first_recno);
#endif
- ret = __qam_consume(dbc, meta, first);
+ if (stay) {
+ ret = __qam_consume(dbc, meta, first);
+ } else {
+ ret = __qam_consume(dbc, meta, old_first);
+ }
}
err1: if (cp->page != NULL) {
@@ -1272,8 +1312,8 @@ __qam_consume(dbc, meta, first)
*/
if (rec_extent != 0 &&
((exact = (first % rec_extent == 0)) ||
- (first % meta->rec_page == 0) ||
- first == UINT32_MAX)) {
+ (exact = (first == UINT32_MAX)) ||
+ (first % meta->rec_page == 0))) {
#ifdef QDEBUG
if (DBC_LOGGING(dbc))
(void)__log_printf(dbp->env, dbc->txn,
diff --git a/src/qam/qam.src b/src/qam/qam.src
index a8e2e4e0..eca6c07c 100644
--- a/src/qam/qam.src
+++ b/src/qam/qam.src
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/qam/qam_conv.c b/src/qam/qam_conv.c
index beb7c973..34ce321a 100644
--- a/src/qam/qam_conv.c
+++ b/src/qam/qam_conv.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/qam/qam_files.c b/src/qam/qam_files.c
index e9a9ff07..f5c7d2ec 100644
--- a/src/qam/qam_files.c
+++ b/src/qam/qam_files.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -445,6 +445,8 @@ __qam_fremove(dbp, pgnoaddr)
* sizeof(array->mpfarray[0]));
array->mpfarray[
array->hi_extent - array->low_extent].mpf = NULL;
+ array->mpfarray[
+ array->hi_extent - array->low_extent].pinref = 0;
if (array->low_extent != array->hi_extent)
array->low_extent++;
} else {
@@ -570,8 +572,11 @@ again:
for (i = first; i >= first && i <= stop; i += rec_extent) {
if ((ret = __qam_fprobe(dbc, QAM_RECNO_PAGE(dbp, i),
&fp->mpf, QAM_PROBE_MPF, dbp->priority, 0)) != 0) {
- if (ret == ENOENT)
+ if (ret == ENOENT) {
+ /* Missing extents are acceptable; skip them. */
+ ret = 0;
continue;
+ }
goto err;
}
fp->id = QAM_RECNO_EXTENT(dbp, i);
diff --git a/src/qam/qam_method.c b/src/qam/qam_method.c
index 0867e5dd..5d796cdb 100644
--- a/src/qam/qam_method.c
+++ b/src/qam/qam_method.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/qam/qam_open.c b/src/qam/qam_open.c
index 69f6cb75..5be78f68 100644
--- a/src/qam/qam_open.c
+++ b/src/qam/qam_open.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/qam/qam_rec.c b/src/qam/qam_rec.c
index c9ff6c83..c5f6b3f4 100644
--- a/src/qam/qam_rec.c
+++ b/src/qam/qam_rec.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -152,6 +152,10 @@ __qam_incfirst_recover(env, dbtp, lsnp, op, info)
REC_DIRTY(mpf, ip, dbc->priority, &meta);
LSN(meta) = *lsnp;
}
+ if (QAM_BEFORE_FIRST(meta, argp->recno)) {
+ REC_DIRTY(mpf, ip, dbc->priority, &meta);
+ meta->first_recno = argp->recno;
+ }
if ((ret = __qam_adjust_first(file_dbp,
dbc, meta, argp->recno + 1)) != 0)
goto err;
diff --git a/src/qam/qam_stat.c b/src/qam/qam_stat.c
index 15c41bb5..19e09383 100644
--- a/src/qam/qam_stat.c
+++ b/src/qam/qam_stat.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/qam/qam_stub.c b/src/qam/qam_stub.c
index f5140079..6df0536c 100644
--- a/src/qam/qam_stub.c
+++ b/src/qam/qam_stub.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/qam/qam_upgrade.c b/src/qam/qam_upgrade.c
index ac96c889..4b9e9453 100644
--- a/src/qam/qam_upgrade.c
+++ b/src/qam/qam_upgrade.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/qam/qam_verify.c b/src/qam/qam_verify.c
index af5ab5db..d2f8ab79 100644
--- a/src/qam/qam_verify.c
+++ b/src/qam/qam_verify.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -115,14 +115,14 @@ __qam_vrfy_meta(dbp, vdp, meta, pgno, flags)
* this assumption fails. (We need the qp info to be reasonable
* before we do per-page verification of queue extents.)
*/
- if (F_ISSET(vdp, VRFY_QMETA_SET)) {
+ if (F_ISSET(vdp, SALVAGE_QMETA_SET)) {
isbad = 1;
EPRINT((env, DB_STR_A("1148",
"Page %lu: database contains multiple Queue metadata pages",
"%lu"), (u_long)pgno));
goto err;
}
- F_SET(vdp, VRFY_QMETA_SET);
+ F_SET(vdp, SALVAGE_QMETA_SET);
qp->page_ext = meta->page_ext;
dbp->pgsize = meta->dbmeta.pagesize;
qp->q_meta = pgno;
diff --git a/src/rep/mlease.html b/src/rep/mlease.html
index 7d44b465..4e82f63c 100644
--- a/src/rep/mlease.html
+++ b/src/rep/mlease.html
@@ -1,5 +1,5 @@
<!DOCTYPE doctype PUBLIC "-//w3c//dtd html 4.0 transitional//en">
-<!--Copyright (c) 2011, 2012 Oracle and/or its affiliates. All rights reserved.-->
+<!--Copyright (c) 2011, 2015 Oracle and/or its affiliates. All rights reserved.-->
<html>
<head>
<meta http-equiv="Content-Type"
diff --git a/src/rep/rep.msg b/src/rep/rep.msg
index b751a64d..d5c56d93 100644
--- a/src/rep/rep.msg
+++ b/src/rep/rep.msg
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -57,7 +57,22 @@ ARG pgsize u_int32_t
ARG pgno db_pgno_t
ARG max_pgno db_pgno_t
ARG filenum u_int32_t
-ARG finfo_flags u_int32_t
+ARG finfo_flags u_int32_t
+ARG type u_int32_t
+ARG db_flags u_int32_t
+ARG uid DBT
+ARG info DBT
+ARG dir DBT
+ARG blob_fid_lo u_int32_t
+ARG blob_fid_hi u_int32_t
+END
+
+BEGIN_MSG fileinfo_v7 alloc check_length version
+ARG pgsize u_int32_t
+ARG pgno db_pgno_t
+ARG max_pgno db_pgno_t
+ARG filenum u_int32_t
+ARG finfo_flags u_int32_t
ARG type u_int32_t
ARG db_flags u_int32_t
ARG uid DBT
@@ -158,3 +173,54 @@ ARG lsn DB_LSN
ARG hist_sec u_int32_t
ARG hist_nsec u_int32_t
END
+
+/*
+ * Request for blob files.
+ */
+BEGIN_MSG blob_update_req
+ARG blob_fid u_int64_t
+ARG blob_sid u_int64_t
+ARG blob_id u_int64_t
+ARG highest_id u_int64_t
+END
+
+/*
+ * A list of blob files for a database.
+ */
+BEGIN_MSG blob_update
+ARG blob_fid u_int64_t
+ARG highest_id u_int64_t
+ARG flags u_int32_t
+ARG num_blobs u_int32_t
+END
+
+/*
+ * Blob file description, part of blob_update.
+ */
+BEGIN_MSG blob_file
+ARG blob_sid u_int64_t
+ARG blob_id u_int64_t
+ARG blob_size u_int64_t
+END
+
+/*
+ * A piece of data from a blob file.
+ */
+BEGIN_MSG blob_chunk
+ARG flags u_int32_t
+ARG blob_fid u_int64_t
+ARG blob_sid u_int64_t
+ARG blob_id u_int64_t
+ARG offset u_int64_t
+ARG data DBT
+END
+
+/*
+ * Request for data from a blob file at the given offset.
+ */
+BEGIN_MSG blob_chunk_req
+ARG blob_fid u_int64_t
+ARG blob_sid u_int64_t
+ARG blob_id u_int64_t
+ARG offset u_int64_t
+END
diff --git a/src/rep/rep_automsg.c b/src/rep/rep_automsg.c
index 5d8155fb..cab68b3e 100644
--- a/src/rep/rep_automsg.c
+++ b/src/rep/rep_automsg.c
@@ -280,6 +280,16 @@ __rep_fileinfo_marshal(env, version, argp, bp, max, lenp)
memcpy(bp, argp->dir.data, argp->dir.size);
bp += argp->dir.size;
}
+ if (copy_only) {
+ memcpy(bp, &argp->blob_fid_lo, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->blob_fid_lo);
+ if (copy_only) {
+ memcpy(bp, &argp->blob_fid_hi, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->blob_fid_hi);
*lenp = (size_t)(bp - start);
return (0);
@@ -386,6 +396,16 @@ __rep_fileinfo_unmarshal(env, version, argpp, bp, max, nextp)
if (max < needed)
goto too_few;
bp += argp->dir.size;
+ if (copy_only) {
+ memcpy(&argp->blob_fid_lo, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_NTOHL_COPYIN(env, argp->blob_fid_lo, bp);
+ if (copy_only) {
+ memcpy(&argp->blob_fid_hi, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_NTOHL_COPYIN(env, argp->blob_fid_hi, bp);
if (nextp != NULL)
*nextp = bp;
@@ -399,6 +419,211 @@ too_few:
}
/*
+ * __rep_fileinfo_v7_marshal --
+ * Serialize *argp into the buffer at bp using the v7 fileinfo layout:
+ * seven 32-bit fields (pgsize, pgno, max_pgno, filenum, finfo_flags,
+ * type, db_flags) followed by three DBTs (uid, info, dir), each written
+ * as a 32-bit size and then its raw bytes. No blob id fields are
+ * written by this function.
+ *
+ * Returns ENOMEM when max is smaller than __REP_FILEINFO_V7_SIZE plus
+ * the three DBT sizes; otherwise returns 0 and stores the number of
+ * bytes written in *lenp.
+ *
+ * PUBLIC: int __rep_fileinfo_v7_marshal __P((ENV *, u_int32_t,
+ * PUBLIC: __rep_fileinfo_v7_args *, u_int8_t *, size_t, size_t *));
+ */
+int
+__rep_fileinfo_v7_marshal(env, version, argp, bp, max, lenp)
+ ENV *env;
+ u_int32_t version;
+ __rep_fileinfo_v7_args *argp;
+ u_int8_t *bp;
+ size_t *lenp, max;
+{
+ int copy_only;
+ u_int8_t *start;
+
+ /* Refuse to write if the caller's buffer cannot hold the message. */
+ if (max < __REP_FILEINFO_V7_SIZE
+ + (size_t)argp->uid.size
+ + (size_t)argp->info.size
+ + (size_t)argp->dir.size)
+ return (ENOMEM);
+ start = bp;
+
+ /*
+ * For versions before DB_REPVERSION_47 each 32-bit field is copied
+ * with memcpy exactly as stored in memory; otherwise it goes through
+ * DB_HTONL_COPYOUT (defined outside this excerpt; presumably it
+ * converts to network byte order and advances bp -- confirm).
+ */
+ copy_only = 0;
+ if (version < DB_REPVERSION_47)
+ copy_only = 1;
+ if (copy_only) {
+ memcpy(bp, &argp->pgsize, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->pgsize);
+ if (copy_only) {
+ memcpy(bp, &argp->pgno, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->pgno);
+ if (copy_only) {
+ memcpy(bp, &argp->max_pgno, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->max_pgno);
+ if (copy_only) {
+ memcpy(bp, &argp->filenum, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->filenum);
+ if (copy_only) {
+ memcpy(bp, &argp->finfo_flags, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->finfo_flags);
+ if (copy_only) {
+ memcpy(bp, &argp->type, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->type);
+ if (copy_only) {
+ memcpy(bp, &argp->db_flags, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->db_flags);
+ /* DBT uid: 32-bit length, then the bytes (nothing when empty). */
+ if (copy_only) {
+ memcpy(bp, &argp->uid.size, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->uid.size);
+ if (argp->uid.size > 0) {
+ memcpy(bp, argp->uid.data, argp->uid.size);
+ bp += argp->uid.size;
+ }
+ /* DBT info: same length-prefixed encoding. */
+ if (copy_only) {
+ memcpy(bp, &argp->info.size, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->info.size);
+ if (argp->info.size > 0) {
+ memcpy(bp, argp->info.data, argp->info.size);
+ bp += argp->info.size;
+ }
+ /* DBT dir: same length-prefixed encoding. */
+ if (copy_only) {
+ memcpy(bp, &argp->dir.size, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else
+ DB_HTONL_COPYOUT(env, bp, argp->dir.size);
+ if (argp->dir.size > 0) {
+ memcpy(bp, argp->dir.data, argp->dir.size);
+ bp += argp->dir.size;
+ }
+
+ /* Report how far the write cursor moved. */
+ *lenp = (size_t)(bp - start);
+ return (0);
+}
+
+/*
+ * __rep_fileinfo_v7_unmarshal --
+ *	Decode a v7 fileinfo message from bp into a newly allocated
+ *	__rep_fileinfo_v7_args structure.
+ *
+ *	On success returns 0, stores the structure in *argpp and, when nextp
+ *	is not NULL, stores the address just past the consumed bytes in
+ *	*nextp.  The uid/info/dir DBT data pointers refer directly into the
+ *	caller's buffer (nothing is copied), so the buffer must outlive the
+ *	returned structure.
+ *
+ *	Returns the __os_malloc error if the allocation fails, or EINVAL
+ *	(after logging message 3675) if max is too small for the fixed
+ *	fields plus the DBT payloads seen so far.  On any failure *argpp is
+ *	left untouched and the partially filled structure is released; it
+ *	used to be leaked on the late too_few paths.
+ *
+ * PUBLIC: int __rep_fileinfo_v7_unmarshal __P((ENV *, u_int32_t,
+ * PUBLIC:	 __rep_fileinfo_v7_args **, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__rep_fileinfo_v7_unmarshal(env, version, argpp, bp, max, nextp)
+	ENV *env;
+	u_int32_t version;
+	__rep_fileinfo_v7_args **argpp;
+	u_int8_t *bp;
+	size_t max;
+	u_int8_t **nextp;
+{
+	size_t needed;
+	__rep_fileinfo_v7_args *argp;
+	int ret;
+	int copy_only;
+
+	/* Nothing is allocated yet; too_few checks this before freeing. */
+	argp = NULL;
+
+	needed = __REP_FILEINFO_V7_SIZE;
+	if (max < needed)
+		goto too_few;
+	if ((ret = __os_malloc(env, sizeof(*argp), &argp)) != 0)
+		return (ret);
+
+	/*
+	 * Versions before DB_REPVERSION_47 carry the 32-bit fields as raw
+	 * memory copies; later versions are read with DB_NTOHL_COPYIN.
+	 */
+	copy_only = 0;
+	if (version < DB_REPVERSION_47)
+		copy_only = 1;
+	if (copy_only) {
+		memcpy(&argp->pgsize, bp, sizeof(u_int32_t));
+		bp += sizeof(u_int32_t);
+	} else
+		DB_NTOHL_COPYIN(env, argp->pgsize, bp);
+	if (copy_only) {
+		memcpy(&argp->pgno, bp, sizeof(u_int32_t));
+		bp += sizeof(u_int32_t);
+	} else
+		DB_NTOHL_COPYIN(env, argp->pgno, bp);
+	if (copy_only) {
+		memcpy(&argp->max_pgno, bp, sizeof(u_int32_t));
+		bp += sizeof(u_int32_t);
+	} else
+		DB_NTOHL_COPYIN(env, argp->max_pgno, bp);
+	if (copy_only) {
+		memcpy(&argp->filenum, bp, sizeof(u_int32_t));
+		bp += sizeof(u_int32_t);
+	} else
+		DB_NTOHL_COPYIN(env, argp->filenum, bp);
+	if (copy_only) {
+		memcpy(&argp->finfo_flags, bp, sizeof(u_int32_t));
+		bp += sizeof(u_int32_t);
+	} else
+		DB_NTOHL_COPYIN(env, argp->finfo_flags, bp);
+	if (copy_only) {
+		memcpy(&argp->type, bp, sizeof(u_int32_t));
+		bp += sizeof(u_int32_t);
+	} else
+		DB_NTOHL_COPYIN(env, argp->type, bp);
+	if (copy_only) {
+		memcpy(&argp->db_flags, bp, sizeof(u_int32_t));
+		bp += sizeof(u_int32_t);
+	} else
+		DB_NTOHL_COPYIN(env, argp->db_flags, bp);
+
+	/* DBT uid: length, then payload referenced in place. */
+	if (copy_only) {
+		memcpy(&argp->uid.size, bp, sizeof(u_int32_t));
+		bp += sizeof(u_int32_t);
+	} else
+		DB_NTOHL_COPYIN(env, argp->uid.size, bp);
+	if (argp->uid.size == 0)
+		argp->uid.data = NULL;
+	else
+		argp->uid.data = bp;
+	needed += (size_t)argp->uid.size;
+	if (max < needed)
+		goto too_few;
+	bp += argp->uid.size;
+
+	/* DBT info: length, then payload referenced in place. */
+	if (copy_only) {
+		memcpy(&argp->info.size, bp, sizeof(u_int32_t));
+		bp += sizeof(u_int32_t);
+	} else
+		DB_NTOHL_COPYIN(env, argp->info.size, bp);
+	if (argp->info.size == 0)
+		argp->info.data = NULL;
+	else
+		argp->info.data = bp;
+	needed += (size_t)argp->info.size;
+	if (max < needed)
+		goto too_few;
+	bp += argp->info.size;
+
+	/* DBT dir: length, then payload referenced in place. */
+	if (copy_only) {
+		memcpy(&argp->dir.size, bp, sizeof(u_int32_t));
+		bp += sizeof(u_int32_t);
+	} else
+		DB_NTOHL_COPYIN(env, argp->dir.size, bp);
+	if (argp->dir.size == 0)
+		argp->dir.data = NULL;
+	else
+		argp->dir.data = bp;
+	needed += (size_t)argp->dir.size;
+	if (max < needed)
+		goto too_few;
+	bp += argp->dir.size;
+
+	if (nextp != NULL)
+		*nextp = bp;
+	*argpp = argp;
+	return (0);
+
+too_few:
+	/*
+	 * The structure is never handed to the caller on this path, so
+	 * release it here.  The DBT data pointers alias the input buffer
+	 * and must not be freed.
+	 */
+	if (argp != NULL)
+		__os_free(env, argp);
+	__db_errx(env, DB_STR("3675",
+	    "Not enough input bytes to fill a __rep_fileinfo_v7 message"));
+	return (EINVAL);
+}
+
+/*
* PUBLIC: int __rep_fileinfo_v6_marshal __P((ENV *, u_int32_t,
* PUBLIC: __rep_fileinfo_v6_args *, u_int8_t *, size_t, size_t *));
*/
@@ -1039,3 +1264,245 @@ too_few:
return (EINVAL);
}
+/*
+ * __rep_blob_update_req_marshal --
+ * Write the four 64-bit fields of a blob_update_req message to bp in
+ * the order blob_fid, blob_sid, blob_id, highest_id.
+ *
+ * No buffer length is passed in and nothing is returned, so the caller
+ * has to provide a large enough buffer (presumably at least
+ * __REP_BLOB_UPDATE_REQ_SIZE bytes, the size the unmarshal side
+ * requires -- confirm).
+ *
+ * PUBLIC: void __rep_blob_update_req_marshal __P((ENV *,
+ * PUBLIC: __rep_blob_update_req_args *, u_int8_t *));
+ */
+void
+__rep_blob_update_req_marshal(env, argp, bp)
+ ENV *env;
+ __rep_blob_update_req_args *argp;
+ u_int8_t *bp;
+{
+ /* Field order here defines the wire layout; keep it in sync with unmarshal. */
+ DB_HTONLL_COPYOUT(env, bp, argp->blob_fid);
+ DB_HTONLL_COPYOUT(env, bp, argp->blob_sid);
+ DB_HTONLL_COPYOUT(env, bp, argp->blob_id);
+ DB_HTONLL_COPYOUT(env, bp, argp->highest_id);
+}
+
+/*
+ * __rep_blob_update_req_unmarshal --
+ *	Read a blob_update_req message (blob_fid, blob_sid, blob_id,
+ *	highest_id, in that order) from bp into *argp.
+ *
+ *	Returns EINVAL, after reporting message 3675, when max is below
+ *	__REP_BLOB_UPDATE_REQ_SIZE.  Otherwise returns 0 and, if nextp is not
+ *	NULL, stores the position after the message in *nextp.
+ *
+ * PUBLIC: int __rep_blob_update_req_unmarshal __P((ENV *,
+ * PUBLIC:	 __rep_blob_update_req_args *, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__rep_blob_update_req_unmarshal(env, argp, bp, max, nextp)
+	ENV *env;
+	__rep_blob_update_req_args *argp;
+	u_int8_t *bp;
+	size_t max;
+	u_int8_t **nextp;
+{
+	/* Reject short input before reading anything from the buffer. */
+	if (max < __REP_BLOB_UPDATE_REQ_SIZE) {
+		__db_errx(env, DB_STR("3675",
+	"Not enough input bytes to fill a __rep_blob_update_req message"));
+		return (EINVAL);
+	}
+
+	DB_NTOHLL_COPYIN(env, argp->blob_fid, bp);
+	DB_NTOHLL_COPYIN(env, argp->blob_sid, bp);
+	DB_NTOHLL_COPYIN(env, argp->blob_id, bp);
+	DB_NTOHLL_COPYIN(env, argp->highest_id, bp);
+
+	if (nextp != NULL)
+		*nextp = bp;
+	return (0);
+}
+
+/*
+ * __rep_blob_update_marshal --
+ * Write a blob_update header to bp: two 64-bit fields (blob_fid,
+ * highest_id) followed by two 32-bit fields (flags, num_blobs).
+ *
+ * No buffer length is passed in and nothing is returned; the caller
+ * must supply a buffer big enough for the header (presumably
+ * __REP_BLOB_UPDATE_SIZE bytes -- confirm).
+ *
+ * PUBLIC: void __rep_blob_update_marshal __P((ENV *,
+ * PUBLIC: __rep_blob_update_args *, u_int8_t *));
+ */
+void
+__rep_blob_update_marshal(env, argp, bp)
+ ENV *env;
+ __rep_blob_update_args *argp;
+ u_int8_t *bp;
+{
+ /* 64-bit fields use the LL macro, 32-bit fields the L macro. */
+ DB_HTONLL_COPYOUT(env, bp, argp->blob_fid);
+ DB_HTONLL_COPYOUT(env, bp, argp->highest_id);
+ DB_HTONL_COPYOUT(env, bp, argp->flags);
+ DB_HTONL_COPYOUT(env, bp, argp->num_blobs);
+}
+
+/*
+ * __rep_blob_update_unmarshal --
+ *	Read a blob_update header (blob_fid, highest_id, flags, num_blobs,
+ *	in that order) from bp into *argp.
+ *
+ *	Returns EINVAL, after reporting message 3675, when max is below
+ *	__REP_BLOB_UPDATE_SIZE.  Otherwise returns 0 and, if nextp is not
+ *	NULL, stores the position after the header in *nextp.
+ *
+ * PUBLIC: int __rep_blob_update_unmarshal __P((ENV *,
+ * PUBLIC:	 __rep_blob_update_args *, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__rep_blob_update_unmarshal(env, argp, bp, max, nextp)
+	ENV *env;
+	__rep_blob_update_args *argp;
+	u_int8_t *bp;
+	size_t max;
+	u_int8_t **nextp;
+{
+	/* Reject short input before reading anything from the buffer. */
+	if (max < __REP_BLOB_UPDATE_SIZE) {
+		__db_errx(env, DB_STR("3675",
+	    "Not enough input bytes to fill a __rep_blob_update message"));
+		return (EINVAL);
+	}
+
+	DB_NTOHLL_COPYIN(env, argp->blob_fid, bp);
+	DB_NTOHLL_COPYIN(env, argp->highest_id, bp);
+	DB_NTOHL_COPYIN(env, argp->flags, bp);
+	DB_NTOHL_COPYIN(env, argp->num_blobs, bp);
+
+	if (nextp != NULL)
+		*nextp = bp;
+	return (0);
+}
+
+/*
+ * __rep_blob_file_marshal --
+ * Write one blob_file entry to bp as three 64-bit fields in the order
+ * blob_sid, blob_id, blob_size.
+ *
+ * No buffer length is passed in and nothing is returned; the caller
+ * must supply room for the entry (presumably __REP_BLOB_FILE_SIZE
+ * bytes -- confirm).
+ *
+ * PUBLIC: void __rep_blob_file_marshal __P((ENV *,
+ * PUBLIC: __rep_blob_file_args *, u_int8_t *));
+ */
+void
+__rep_blob_file_marshal(env, argp, bp)
+ ENV *env;
+ __rep_blob_file_args *argp;
+ u_int8_t *bp;
+{
+ /* Field order here defines the wire layout; keep it in sync with unmarshal. */
+ DB_HTONLL_COPYOUT(env, bp, argp->blob_sid);
+ DB_HTONLL_COPYOUT(env, bp, argp->blob_id);
+ DB_HTONLL_COPYOUT(env, bp, argp->blob_size);
+}
+
+/*
+ * __rep_blob_file_unmarshal --
+ *	Read one blob_file entry (blob_sid, blob_id, blob_size, in that
+ *	order) from bp into *argp.
+ *
+ *	Returns EINVAL, after reporting message 3675, when max is below
+ *	__REP_BLOB_FILE_SIZE.  Otherwise returns 0 and, if nextp is not NULL,
+ *	stores the position after the entry in *nextp.
+ *
+ * PUBLIC: int __rep_blob_file_unmarshal __P((ENV *,
+ * PUBLIC:	 __rep_blob_file_args *, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__rep_blob_file_unmarshal(env, argp, bp, max, nextp)
+	ENV *env;
+	__rep_blob_file_args *argp;
+	u_int8_t *bp;
+	size_t max;
+	u_int8_t **nextp;
+{
+	/* Reject short input before reading anything from the buffer. */
+	if (max < __REP_BLOB_FILE_SIZE) {
+		__db_errx(env, DB_STR("3675",
+	    "Not enough input bytes to fill a __rep_blob_file message"));
+		return (EINVAL);
+	}
+
+	DB_NTOHLL_COPYIN(env, argp->blob_sid, bp);
+	DB_NTOHLL_COPYIN(env, argp->blob_id, bp);
+	DB_NTOHLL_COPYIN(env, argp->blob_size, bp);
+
+	if (nextp != NULL)
+		*nextp = bp;
+	return (0);
+}
+
+/*
+ * __rep_blob_chunk_marshal --
+ * Write a blob_chunk message to bp: 32-bit flags, then four 64-bit
+ * fields (blob_fid, blob_sid, blob_id, offset), then the data DBT as a
+ * 32-bit size followed by data.size raw bytes.
+ *
+ * No buffer length is passed in and nothing is returned, so the caller
+ * must size the buffer for the fixed fields plus argp->data.size
+ * (presumably __REP_BLOB_CHUNK_SIZE + data.size -- confirm).
+ *
+ * PUBLIC: void __rep_blob_chunk_marshal __P((ENV *,
+ * PUBLIC: __rep_blob_chunk_args *, u_int8_t *));
+ */
+void
+__rep_blob_chunk_marshal(env, argp, bp)
+ ENV *env;
+ __rep_blob_chunk_args *argp;
+ u_int8_t *bp;
+{
+ DB_HTONL_COPYOUT(env, bp, argp->flags);
+ DB_HTONLL_COPYOUT(env, bp, argp->blob_fid);
+ DB_HTONLL_COPYOUT(env, bp, argp->blob_sid);
+ DB_HTONLL_COPYOUT(env, bp, argp->blob_id);
+ DB_HTONLL_COPYOUT(env, bp, argp->offset);
+ DB_HTONL_COPYOUT(env, bp, argp->data.size);
+ /* The payload is copied verbatim; an empty DBT contributes only its size. */
+ if (argp->data.size > 0) {
+ memcpy(bp, argp->data.data, argp->data.size);
+ /* bp is a local cursor; this final advance is not observed by the caller. */
+ bp += argp->data.size;
+ }
+}
+
+/*
+ * __rep_blob_chunk_unmarshal --
+ *	Read a blob_chunk message from bp into *argp: flags, blob_fid,
+ *	blob_sid, blob_id, offset, then the length-prefixed data DBT.
+ *
+ *	argp->data.data is set to point into the caller's buffer (or to NULL
+ *	for an empty payload); no bytes are copied.
+ *
+ *	Returns EINVAL, after reporting message 3675, when max cannot cover
+ *	__REP_BLOB_CHUNK_SIZE, or that size plus the decoded payload length.
+ *	Otherwise returns 0 and, if nextp is not NULL, stores the position
+ *	after the payload in *nextp.
+ *
+ * PUBLIC: int __rep_blob_chunk_unmarshal __P((ENV *,
+ * PUBLIC:	 __rep_blob_chunk_args *, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__rep_blob_chunk_unmarshal(env, argp, bp, max, nextp)
+	ENV *env;
+	__rep_blob_chunk_args *argp;
+	u_int8_t *bp;
+	size_t max;
+	u_int8_t **nextp;
+{
+	size_t fixed_len;
+
+	/* The fixed-size fields must all be present before decoding. */
+	fixed_len = __REP_BLOB_CHUNK_SIZE;
+	if (max < fixed_len)
+		goto too_few;
+
+	DB_NTOHL_COPYIN(env, argp->flags, bp);
+	DB_NTOHLL_COPYIN(env, argp->blob_fid, bp);
+	DB_NTOHLL_COPYIN(env, argp->blob_sid, bp);
+	DB_NTOHLL_COPYIN(env, argp->blob_id, bp);
+	DB_NTOHLL_COPYIN(env, argp->offset, bp);
+	DB_NTOHL_COPYIN(env, argp->data.size, bp);
+
+	/* Reference the payload in place, then check it really fits. */
+	argp->data.data = argp->data.size == 0 ? NULL : bp;
+	if (max < fixed_len + (size_t)argp->data.size)
+		goto too_few;
+	bp += argp->data.size;
+
+	if (nextp != NULL)
+		*nextp = bp;
+	return (0);
+
+too_few:
+	__db_errx(env, DB_STR("3675",
+	    "Not enough input bytes to fill a __rep_blob_chunk message"));
+	return (EINVAL);
+}
+
+/*
+ * __rep_blob_chunk_req_marshal --
+ * Write a blob_chunk_req message to bp as four 64-bit fields in the
+ * order blob_fid, blob_sid, blob_id, offset.
+ *
+ * No buffer length is passed in and nothing is returned; the caller
+ * must supply room for the message (presumably
+ * __REP_BLOB_CHUNK_REQ_SIZE bytes -- confirm).
+ *
+ * PUBLIC: void __rep_blob_chunk_req_marshal __P((ENV *,
+ * PUBLIC: __rep_blob_chunk_req_args *, u_int8_t *));
+ */
+void
+__rep_blob_chunk_req_marshal(env, argp, bp)
+ ENV *env;
+ __rep_blob_chunk_req_args *argp;
+ u_int8_t *bp;
+{
+ /* Field order here defines the wire layout; keep it in sync with unmarshal. */
+ DB_HTONLL_COPYOUT(env, bp, argp->blob_fid);
+ DB_HTONLL_COPYOUT(env, bp, argp->blob_sid);
+ DB_HTONLL_COPYOUT(env, bp, argp->blob_id);
+ DB_HTONLL_COPYOUT(env, bp, argp->offset);
+}
+
+/*
+ * __rep_blob_chunk_req_unmarshal --
+ *	Read a blob_chunk_req message (blob_fid, blob_sid, blob_id, offset,
+ *	in that order) from bp into *argp.
+ *
+ *	Returns EINVAL, after reporting message 3675, when max is below
+ *	__REP_BLOB_CHUNK_REQ_SIZE.  Otherwise returns 0 and, if nextp is not
+ *	NULL, stores the position after the message in *nextp.
+ *
+ * PUBLIC: int __rep_blob_chunk_req_unmarshal __P((ENV *,
+ * PUBLIC:	 __rep_blob_chunk_req_args *, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__rep_blob_chunk_req_unmarshal(env, argp, bp, max, nextp)
+	ENV *env;
+	__rep_blob_chunk_req_args *argp;
+	u_int8_t *bp;
+	size_t max;
+	u_int8_t **nextp;
+{
+	/* Reject short input before reading anything from the buffer. */
+	if (max < __REP_BLOB_CHUNK_REQ_SIZE) {
+		__db_errx(env, DB_STR("3675",
+	    "Not enough input bytes to fill a __rep_blob_chunk_req message"));
+		return (EINVAL);
+	}
+
+	DB_NTOHLL_COPYIN(env, argp->blob_fid, bp);
+	DB_NTOHLL_COPYIN(env, argp->blob_sid, bp);
+	DB_NTOHLL_COPYIN(env, argp->blob_id, bp);
+	DB_NTOHLL_COPYIN(env, argp->offset, bp);
+
+	if (nextp != NULL)
+		*nextp = bp;
+	return (0);
+}
+
diff --git a/src/rep/rep_backup.c b/src/rep/rep_backup.c
index cfde7622..14bc63bb 100644
--- a/src/rep/rep_backup.c
+++ b/src/rep/rep_backup.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2004, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2004, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -9,6 +9,7 @@
#include "db_config.h"
#include "db_int.h"
+#include "dbinc/blob.h"
#include "dbinc/db_page.h"
#include "dbinc/db_am.h"
#include "dbinc/fop.h"
@@ -26,21 +27,45 @@
* Note that the fileinfo for the first file in the list always appears at
* (constant) offset __REP_UPDATE_SIZE in the buffer.
*/
+#define FILE_CTX_INMEM_ONLY 0x01
typedef struct {
u_int8_t *buf; /* Buffer base address. */
u_int32_t size; /* Total allocated buffer size. */
u_int8_t *fillptr; /* Pointer to first unused space. */
u_int32_t count; /* Number of entries currently in list. */
u_int32_t version; /* Rep version of marshaled format. */
+ u_int32_t flags; /* Context flags. */
} FILE_LIST_CTX;
#define FIRST_FILE_PTR(buf) ((buf) + __REP_UPDATE_SIZE)
/*
+ * Flags used to show the state of blob files on the master in messages
+ * sent to the client.
+ */
+#define BLOB_DONE 0x01
+#define BLOB_DELETE 0x02
+#define BLOB_CHUNK_FAIL 0x04
+
+#define BLOB_ID_SIZE sizeof(db_seq_t)
+#define BLOB_KEY_SIZE (2 * BLOB_ID_SIZE)
+
+/*
* Function that performs any desired processing on a single file, as part of
* the traversal of a list of database files, such as with internal init.
*/
typedef int (FILE_WALK_FN) __P((ENV *, __rep_fileinfo_args *, void *));
+static int __rep_add_files_to_list __P((
+ ENV *, const char *, const char *, FILE_LIST_CTX *, const char **, int));
+static int __rep_blob_chunk_gap
+ __P((ENV *, int, DB_THREAD_INFO *, REP *, int *, db_seq_t, int));
+static int __rep_blob_cleanup __P((ENV *, REP *));
+static int __rep_blobdone
+ __P((ENV *, int, DB_THREAD_INFO *, REP *, db_seq_t, int));
+static int __rep_blob_find_files __P((ENV *, DB_THREAD_INFO *, const char *,
+ db_seq_t *, db_seq_t, db_seq_t, db_seq_t *, DBT *, size_t *, u_int32_t *));
+static int __rep_blob_sort_dirs __P((ENV *,
+ int (*)(const char *), char **, int, char ***, int *));
static FILE_WALK_FN __rep_check_uid;
static int __rep_clean_interrupted __P((ENV *));
static FILE_WALK_FN __rep_cleanup_nimdbs;
@@ -52,6 +77,8 @@ static int __rep_get_fileinfo __P((ENV *, const char *,
const char *, __rep_fileinfo_args *, u_int8_t *));
static int __rep_get_file_list __P((ENV *,
DB_FH *, u_int32_t, u_int32_t *, DBT *));
+static int __rep_init_file_list_context __P((ENV *,
+ u_int32_t, u_int32_t, int, FILE_LIST_CTX *));
static int __rep_is_replicated_db __P((const char *, const char *));
static int __rep_log_setup __P((ENV *,
REP *, u_int32_t, u_int32_t, DB_LSN *));
@@ -72,9 +99,12 @@ static FILE_WALK_FN __rep_remove_file;
static int __rep_remove_logs __P((ENV *));
static int __rep_remove_nimdbs __P((ENV *));
static int __rep_rollback __P((ENV *, DB_LSN *));
+static int __rep_select_blob_file __P((const char *));
+static int __rep_select_blob_sdb __P((const char *));
static int __rep_unlink_by_list __P((ENV *, u_int32_t,
u_int8_t *, u_int32_t, u_int32_t));
static FILE_WALK_FN __rep_unlink_file;
+static int __rep_walk_blob_dir __P((ENV *, FILE_LIST_CTX*));
static int __rep_walk_filelist __P((ENV *, u_int32_t, u_int8_t *,
u_int32_t, u_int32_t, FILE_WALK_FN *, void *));
static int __rep_walk_dir __P((ENV *, const char *, const char *,
@@ -129,14 +159,12 @@ __rep_update_req(env, rp)
dblp = env->lg_handle;
logc = NULL;
- if ((ret = __os_calloc(env, 1, MEGABYTE, &context.buf)) != 0)
- goto err_noalloc;
- context.size = MEGABYTE;
- context.count = 0;
- context.version = rp->rep_version;
/* Reserve space for the update_args, and fill in file info. */
- context.fillptr = FIRST_FILE_PTR(context.buf);
+ if ((ret = __rep_init_file_list_context(env, rp->rep_version,
+ F_ISSET(rp, REPCTL_INMEM_ONLY) ? FILE_CTX_INMEM_ONLY : 0,
+ 1, &context)) != 0)
+ goto err_noalloc;
if ((ret = __rep_find_dbs(env, &context)) != 0)
goto err;
@@ -214,6 +242,472 @@ err_noalloc:
}
/*
+ * __rep_select_blob_file --
+ *	Selection callback for __rep_blob_sort_dirs.  Returns 1 when the name
+ *	starts with BLOB_FILE_PREFIX (blob files, of the form __db.bl###) and
+ *	0 otherwise.
+ */
+static int
+__rep_select_blob_file(file)
+	const char *file;
+{
+	size_t prefix_len;
+
+	prefix_len = strlen(BLOB_FILE_PREFIX);
+	return (strncmp(BLOB_FILE_PREFIX, file, prefix_len) == 0 ? 1 : 0);
+}
+
+/*
+ * __rep_select_blob_sdb --
+ *	Selection callback for __rep_blob_sort_dirs.  Returns 1 for blob
+ *	subdatabase directories, of the form __db###: the name must start
+ *	with BLOB_DIR_PREFIX, must not start with BLOB_FILE_PREFIX, and must
+ *	not be BLOB_META_FILE_NAME.  Returns 0 for everything else.
+ */
+static int
+__rep_select_blob_sdb(file)
+	const char *file;
+{
+	/* Without the directory prefix the entry is not a candidate. */
+	if (strncmp(BLOB_DIR_PREFIX, file, strlen(BLOB_DIR_PREFIX)) != 0)
+		return (0);
+	/* Entries carrying the blob file prefix are excluded. */
+	if (strncmp(BLOB_FILE_PREFIX, file, strlen(BLOB_FILE_PREFIX)) == 0)
+		return (0);
+	/* So is the blob meta file itself. */
+	if (strcmp(BLOB_META_FILE_NAME, file) == 0)
+		return (0);
+	return (1);
+}
+
+/*
+ * __rep_blob_sort_dirs
+ *	Build a sorted array holding the entries of dirs that select_fn
+ *	accepts.
+ *
+ *	On success returns 0; *sorted is a newly allocated array of
+ *	*sorted_cnt pointers, ordered by strcmp.  The pointers alias the
+ *	strings in dirs (nothing is copied), so the caller frees only the
+ *	array.  If the allocation fails the error is returned with *sorted
+ *	NULL and *sorted_cnt 0.
+ */
+static int
+__rep_blob_sort_dirs(env, select_fn, dirs, dirs_cnt, sorted, sorted_cnt)
+	ENV *env;
+	int (*select_fn) __P((const char *));
+	char **dirs;
+	int dirs_cnt;
+	char ***sorted;
+	int *sorted_cnt;
+{
+	char **picked, *hold;
+	int idx, limit, moved, npicked, ret;
+
+	*sorted = NULL;
+	*sorted_cnt = 0;
+
+	/* Room for every input entry, in case the filter accepts them all. */
+	if ((ret = __os_malloc(env,
+	    (sizeof(char *) * (unsigned int)dirs_cnt), &picked)) != 0)
+		return (ret);
+
+	/* Keep only the names the caller's filter accepts. */
+	npicked = 0;
+	for (idx = 0; idx < dirs_cnt; idx++)
+		if (select_fn(dirs[idx]))
+			picked[npicked++] = dirs[idx];
+
+	/*
+	 * Directory listings usually arrive sorted or nearly so, which makes
+	 * a bubble sort with early exit cheap: stop as soon as a pass moves
+	 * nothing, and shrink the unsorted range by one after every pass.
+	 */
+	moved = 1;
+	for (limit = npicked; moved == 1 && limit > 1; limit--) {
+		moved = 0;
+		for (idx = 1; idx < limit; idx++) {
+			if (strcmp(picked[idx - 1], picked[idx]) <= 0)
+				continue;
+			hold = picked[idx - 1];
+			picked[idx - 1] = picked[idx];
+			picked[idx] = hold;
+			moved = 1;
+		}
+	}
+
+	*sorted = picked;
+	*sorted_cnt = npicked;
+
+	return (0);
+}
+
+/* Send limit used below when the limit computed from the region is zero. */
+#define BLOB_THROTTLE_DEFAULT (10 * MEGABYTE)
+
+/*
+ * __rep_blob_update_req
+ * Send a list of blob files, starting after the blob id and sub-database
+ * id sent in the BLOB_UPDATE_REQ message.
+ *
+ * The request in rec is unmarshaled, a one megabyte reply buffer is
+ * allocated with room reserved at the front for the blob_update header,
+ * and __rep_blob_find_files is called once per blob directory to append
+ * entries. Directories are visited until the running "sent" total
+ * reaches the throttle or no directory is left; in the latter case the
+ * reply is flagged BLOB_DONE. The header is marshaled last and the
+ * buffer is broadcast as REP_BLOB_UPDATE.
+ *
+ * Returns 0 or the first error from a helper. The result of sending the
+ * message is not part of the return value.
+ *
+ * PUBLIC: int __rep_blob_update_req __P((ENV *, DB_THREAD_INFO *, DBT *));
+ */
+int
+__rep_blob_update_req(env, ip, rec)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ DBT *rec;
+{
+ DBT rbudbt;
+ REP *rep;
+ __rep_blob_update_args rbu;
+ __rep_blob_update_req_args rbur;
+ db_seq_t blob_fid, blob_id, blob_sdb, tmp;
+ int cur, dirs_cnt, ret, sdb_cnt;
+ size_t sent;
+ char *blob_sub_dir, *dir, **dirs, **sdb;
+ u_int32_t num_blobs, throttle;
+ u_int8_t *ptr;
+
+ /* Everything the err: path frees must start out NULL/zero. */
+ memset(&rbu, 0, sizeof(__rep_blob_update_args));
+ memset(&rbudbt, 0, sizeof(DBT));
+ blob_sub_dir = dir = NULL;
+ dirs = sdb = NULL;
+ sent = 0;
+ num_blobs = 0;
+ cur = dirs_cnt = sdb_cnt = 0;
+ rep = env->rep_handle->region;
+ /*
+ * NOTE(review): throttle is a u_int32_t, so gbytes * GIGABYTE may wrap
+ * for multi-gigabyte limits -- confirm the types of rep->gbytes and
+ * rep->bytes.
+ */
+ throttle = rep->gbytes * GIGABYTE + rep->bytes;
+ if (throttle == 0)
+ throttle = BLOB_THROTTLE_DEFAULT;
+
+ if ((ret = __rep_blob_update_req_unmarshal(
+ env, &rbur, rec->data, rec->size, &ptr)) != 0)
+ goto err;
+
+ /*
+ * NOTE(review): the %llu conversions below are paired with (long long)
+ * casts; confirm whether (unsigned long long) was intended.
+ */
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+"blob_update_req: file_id %llu sdb_id %llu blob_id %llu highest %llu",
+ (long long)rbur.blob_fid, (long long)rbur.blob_sid,
+ (long long)rbur.blob_id, (long long)rbur.highest_id));
+
+ rbu.blob_fid = rbur.blob_fid;
+
+ /*
+ * Allocate the reply buffer and skip past the space for the
+ * blob_update header, which is marshaled into the front of the buffer
+ * only after the file list is complete.
+ */
+ if ((ret = __os_malloc(env, MEGABYTE, &rbudbt.data)) != 0)
+ goto err;
+ rbudbt.ulen = MEGABYTE;
+ rbudbt.size = __REP_BLOB_UPDATE_SIZE;
+
+ blob_fid = (db_seq_t)rbur.blob_fid;
+ blob_sdb = (db_seq_t)rbur.blob_sid;
+ blob_id = (db_seq_t)rbur.blob_id;
+
+ /* Find the first blob file if it is unknown. */
+ if (blob_id == 0 && blob_sdb == 0) {
+ /*
+ * find_sdb is also entered by goto from further down, once a
+ * subdatabase directory has been searched and the throttle has not
+ * been reached. The directory listing is read only the first time.
+ */
+find_sdb: if (dirs == NULL) {
+ if ((ret = __blob_make_sub_dir(
+ env, &blob_sub_dir, blob_fid, 0)) != 0)
+ goto err;
+ if ((ret = __db_appname(
+ env, DB_APP_BLOB, blob_sub_dir, NULL, &dir)) != 0)
+ goto err;
+ /* If no directory, there are no blobs to send. */
+ if (__os_exists(env, dir, NULL) != 0)
+ goto filedone;
+
+ if ((ret = __os_dirlist(
+ env, dir, 1, &dirs, &dirs_cnt)) != 0)
+ goto err;
+
+ if (dirs_cnt == 0)
+ goto filedone;
+
+ /*
+ * NOTE(review): the list is ordered with strcmp; the scan
+ * below assumes that order matches increasing directory id
+ * -- confirm against the directory naming scheme.
+ */
+ if ((ret = __rep_blob_sort_dirs(
+ env, __rep_select_blob_sdb,
+ dirs, dirs_cnt, &sdb, &sdb_cnt)) != 0)
+ goto err;
+ }
+ /*
+ * Iterate through the list of subdirectories, until we find
+ * one that has an id larger than the current subdirectory id.
+ */
+ while (cur < sdb_cnt) {
+ if ((ret = __blob_path_to_dir_ids(
+ env, sdb[cur], &tmp, NULL)) != 0)
+ goto err;
+ if (blob_sdb < tmp) {
+ blob_sdb = tmp;
+ break;
+ }
+ cur++;
+ }
+ /* Check if no more subdirectories to search */
+ if (sdb_cnt != 0 && cur == sdb_cnt)
+ goto filedone;
+ /* Drop the parent paths; they are rebuilt below for blob_sdb. */
+ if (dir != NULL)
+ __os_free(env, dir);
+ dir = NULL;
+ if (blob_sub_dir != NULL)
+ __os_free(env, blob_sub_dir);
+ blob_sub_dir = NULL;
+ }
+
+ if (blob_sub_dir == NULL && (ret =
+ __blob_make_sub_dir(env, &blob_sub_dir, blob_fid, blob_sdb)) != 0)
+ goto err;
+
+ if (dir == NULL && (ret = __db_appname(
+ env, DB_APP_BLOB, blob_sub_dir, NULL, &dir)) != 0)
+ goto err;
+ /* Search the current directory for blob files with id > blob_id. */
+ if ((ret = __rep_blob_find_files(
+ env, ip, dir, &blob_id, blob_sdb, blob_fid,
+ (db_seq_t *)&rbur.highest_id, &rbudbt, &sent, &num_blobs)) != 0)
+ goto err;
+
+ /*
+ * If we have not reached the send limit, and there are still
+ * directories to search, then search the next directory.
+ */
+ if (sent < throttle) {
+ if (blob_sdb != 0) {
+ /* Reset the per-directory state before moving on. */
+ rbur.highest_id = 0;
+ blob_id = 0;
+ __os_free(env, blob_sub_dir);
+ blob_sub_dir = NULL;
+ __os_free(env, dir);
+ dir = NULL;
+ goto find_sdb;
+ } else {
+ /* Mark as the end of the files. */
+filedone: F_SET(&rbu, BLOB_DONE);
+ rbur.highest_id = 0;
+ }
+ } else
+ STAT(rep->stat.st_nthrottles++);
+
+ /* Fill in the header now that the entry count is known. */
+ rbu.num_blobs = num_blobs;
+ rbu.highest_id = rbur.highest_id;
+ __rep_blob_update_marshal(env, &rbu, rbudbt.data);
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "Sending blob_update: file_id %llu, num_blobs %lu, flags %lu",
+ (long long)rbu.blob_fid,
+ (long)num_blobs, (unsigned long)rbu.flags));
+ /* The send result is explicitly discarded. */
+ (void)__rep_send_message(
+ env, DB_EID_BROADCAST, REP_BLOB_UPDATE, NULL, &rbudbt, 0, 0);
+
+ /* Common exit, reached on success as well as on error. */
+err: if (sdb != NULL)
+ __os_free(env, sdb);
+ if (dirs != NULL)
+ __os_dirfree(env, dirs, dirs_cnt);
+ if (dir != NULL)
+ __os_free(env, dir);
+ if (blob_sub_dir != NULL)
+ __os_free(env, blob_sub_dir);
+ if (rbudbt.data != NULL)
+ __os_free(env, rbudbt.data);
+ return (ret);
+}
+
+/*
+ * __rep_blob_find_files
+ *
+ * Search a directory for blob files, starting with the given blob id and
+ * sub-database id. Add information for each blob to the message buffer until
+ * there are no more files, or it has reached the maximum send amount in terms
+ * of combined blob files size.
+ *
+ * This search is complicated because the blobs have to be sent in order by id,
+ * but there can be huge holes between a blob file and the one with the next
+ * highest id, so iterating through the ids looking to see if the file exists
+ * for each id will take too long. The solution is to walk the directory
+ * hierarchy in order, reading every file in that directory, sorting them by
+ * id, and adding them to the update list.
+ *
+ * Arguments, as used by the code below:
+ * env, ip - environment and thread info handed to the helper routines.
+ * dir - directory to search; the sub-directory computed from the blob id
+ * is appended to it.
+ * blob_id - in/out. Incremented on entry, set to the id of each file that
+ * is processed, and advanced to the next multiple of BLOB_DIR_ELEMS after
+ * each directory.
+ * blob_sid - sub-database id recorded in every entry added to buf.
+ * blob_fid - file id, used together with blob_sid to look up the highest
+ * blob id when *highest is 0.
+ * highest - in/out. When 0 on entry it is set to one more than the value
+ * returned by __blob_highest_id(); the search stops once *blob_id reaches
+ * it.
+ * buf - entries are marshaled at buf->data + buf->size; buf->size grows by
+ * __REP_BLOB_FILE_SIZE per entry and buf->data is doubled when there is
+ * not room for another entry.
+ * sent - in/out running total of the sizes of the files listed so far.
+ * num - in/out count of entries added.
+ *
+ * The size limit is rep->gbytes * GIGABYTE + rep->bytes, or
+ * BLOB_THROTTLE_DEFAULT when that is zero.
+ *
+ * Returns 0 on success, including when the size limit ends the search early,
+ * otherwise the error returned by the failing helper.
+ */
+static int
+__rep_blob_find_files(
+ env, ip, dir, blob_id, blob_sid, blob_fid, highest, buf, sent, num)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ const char *dir;
+ db_seq_t *blob_id;
+ db_seq_t blob_sid;
+ db_seq_t blob_fid;
+ db_seq_t *highest;
+ DBT *buf;
+ size_t *sent;
+ u_int32_t *num;
+{
+ DB *bmd;
+ DB_FH *fhp;
+ DB_TXN *txn;
+ REP *rep;
+ __rep_blob_file_args rbf;
+ char blob_path[MAX_BLOB_PATH_SZ], **dirs, **files, *path, *ptr;
+ db_seq_t tmp;
+ int blob_path_len, cur, depth, dirs_cnt, files_cnt, ret;
+ off_t blob_size;
+ size_t len;
+ u_int32_t bytes, mbytes, throttle;
+
+ bmd = NULL;
+ txn = NULL;
+ fhp = NULL;
+ path = NULL;
+ dirs = files = NULL;
+ dirs_cnt = files_cnt = 0;
+ rbf.blob_sid = (u_int64_t)blob_sid;
+ rep = env->rep_handle->region;
+ throttle = rep->gbytes * GIGABYTE + rep->bytes;
+ if (throttle == 0)
+ throttle = BLOB_THROTTLE_DEFAULT;
+
+ if ((ret = __os_malloc(
+ env, strlen(dir) + MAX_BLOB_PATH_SZ, &path)) != 0)
+ goto err;
+
+ /*
+ * Read the highest possible blob id from the blob meta database, so
+ * we know when to stop looking for files for this database. The
+ * highest value is reset every time we switch to a new subdatabase.
+ * The lookup uses a temporary handle and a transaction that is
+ * aborted once the value has been read. The value is then incremented
+ * so that the loop below can use a strict "less than" comparison.
+ */
+ if (*highest == 0) {
+ if ((ret = __db_create_internal(&bmd, env, 0)) != 0)
+ goto err;
+
+ if ((ret = __txn_begin(
+ env, ip, NULL, &txn, DB_IGNORE_LEASE)) != 0)
+ goto err;
+
+ bmd->blob_file_id = blob_fid;
+ bmd->blob_sdb_id = blob_sid;
+ if ((ret = __blob_highest_id(bmd, txn, highest) ) != 0)
+ goto err;
+
+ if ((ret = __txn_abort(txn)) != 0)
+ goto err;
+ txn = NULL;
+ if ((ret = __db_close(bmd, NULL, 0)) != 0)
+ goto err;
+ bmd = NULL;
+ (*highest)++;
+ }
+
+ (*blob_id)++; /* Resume at the id following the one passed in. */
+ while (*sent < throttle && *blob_id < *highest) {
+ memset(blob_path, 0, MAX_BLOB_PATH_SZ);
+ blob_path_len = depth = 0;
+
+ /*
+ * Calculate the subdirectory from the blob id, then build the
+ * full directory path in "path". "len" remembers where the
+ * directory part ends so file names can be appended later.
+ */
+ __blob_calculate_dirs(
+ *blob_id, blob_path, &blob_path_len, &depth);
+ if (blob_path_len != 0) {
+ (void)sprintf(path, "%s%c%s%c",
+ dir, PATH_SEPARATOR[0], blob_path, PATH_SEPARATOR[0]);
+ } else
+ (void)sprintf(path, "%s", dir);
+ len = strlen(path);
+
+ /*
+ * If the sub-directory does not exist, look for the next by
+ * rounding the blob id up to the next multiple of
+ * BLOB_DIR_ELEMS.
+ */
+ if (__os_exists(env, path, NULL) != 0) {
+ (*blob_id) +=
+ BLOB_DIR_ELEMS - (*blob_id % BLOB_DIR_ELEMS);
+ continue;
+ }
+
+ /*
+ * Get a list of all the blob files, sorted by id. "dirs" holds
+ * the raw directory listing and "files" the selected, sorted
+ * entries; both are released at the bottom of the loop or at
+ * the err label.
+ */
+ if ((ret = __os_dirlist(env, path, 0, &dirs, &dirs_cnt)) != 0)
+ goto err;
+
+ if ((ret = __rep_blob_sort_dirs(env, __rep_select_blob_file,
+ dirs, dirs_cnt, &files, &files_cnt)) != 0)
+ goto err;
+
+ /*
+ * Find the first blob file with an id greater than or equal to
+ * the last id. The id is parsed from the part of the file name
+ * that follows BLOB_FILE_PREFIX.
+ */
+ for (cur = 0; cur < files_cnt; cur++) {
+ ptr = files[cur];
+ ptr += strlen(BLOB_FILE_PREFIX);
+ if ((ret = __blob_str_to_id(
+ env, (const char **)&ptr, &tmp)) != 0)
+ goto err;
+ DB_ASSERT(env, tmp != 0);
+ if (tmp >= *blob_id)
+ break;
+ }
+
+ /* Add each remaining blob file to the message buffer. */
+ while (cur < files_cnt) {
+ /*
+ * Get the blob id from the current file name. The name
+ * is appended to the directory part of "path" so the
+ * same buffer can be used to open the file.
+ */
+ (void)sprintf(path + len, "%s", files[cur]);
+ ptr = path + len + strlen(BLOB_FILE_PREFIX);
+ if ((ret = __blob_str_to_id(
+ env, (const char **)&ptr, blob_id)) != 0)
+ goto err;
+ rbf.blob_id = (u_int64_t)*blob_id;
+ /*
+ * Open the file and get its size. A file that has
+ * disappeared since the directory was listed (ENOENT)
+ * is skipped without being treated as an error.
+ */
+ if ((ret = __os_open(
+ env, path, 0, DB_OSO_RDONLY, 0, &fhp)) != 0) {
+ if (ret == ENOENT) {
+ ret = 0;
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "blob_update blob file: %llu deleted, skipping.",
+ (long long)rbf.blob_id));
+ cur++;
+ continue;
+ }
+ goto err;
+ }
+ if ((ret = __os_ioinfo(
+ env, path, fhp, &mbytes, &bytes, NULL)) != 0)
+ goto err;
+ if ((ret =__os_closehandle(env, fhp)) != 0)
+ goto err;
+ fhp = NULL;
+ blob_size = ((off_t)mbytes * (off_t)MEGABYTE) + bytes;
+ rbf.blob_size = (u_int64_t)blob_size;
+ if (blob_size > UINT32_MAX)
+ (*sent) = throttle + 1; /* Force the limit check below. */
+ else {
+ if (((*sent) + (size_t)blob_size) < (*sent))
+ (*sent) = throttle + 1; /* The sum wrapped. */
+ else
+ (*sent) += (size_t)blob_size;
+ }
+ __rep_blob_file_marshal(
+ env, &rbf, (u_int8_t *)buf->data + buf->size);
+ (*num)++;
+ buf->size += __REP_BLOB_FILE_SIZE;
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "blob_update adding: blob_sid %llu, blob_id %llu blob_size %llu",
+ (long long)rbf.blob_sid,
+ (long long)rbf.blob_id, (long long)rbf.blob_size));
+ if ((*sent) > throttle)
+ goto err; /* Limit reached; ret is 0 at this point. */
+
+ /*
+ * Resize if there is not enough space to grow, that is,
+ * if another entry of __REP_BLOB_FILE_SIZE bytes would
+ * not fit. The buffer is doubled.
+ */
+ if (buf->size > (buf->ulen - __REP_BLOB_FILE_SIZE)) {
+ if ((ret = __os_realloc(
+ env, buf->ulen * 2, &buf->data)) != 0)
+ goto err;
+ buf->ulen *= 2;
+ }
+ cur++;
+ }
+ /*
+ * Move to the next directory of blob files by setting the blob
+ * id to the next largest possible value, then release the
+ * listings for this directory.
+ */
+ (*blob_id) += BLOB_DIR_ELEMS - (*blob_id % BLOB_DIR_ELEMS);
+ __os_free(env, files);
+ files = NULL;
+ __os_dirfree(env, dirs, dirs_cnt);
+ dirs = NULL;
+ }
+err:
+ if (path != NULL)
+ __os_free(env, path);
+ if (files != NULL)
+ __os_free(env, files);
+ if (dirs != NULL)
+ __os_dirfree(env, dirs, dirs_cnt);
+ if (fhp != NULL)
+ (void)__os_closehandle(env, fhp);
+ if (txn != NULL)
+ (void)__txn_abort(txn);
+ if (bmd != NULL)
+ (void)__db_close(bmd, NULL, 0);
+
+ return (ret);
+}
+
+/*
* __rep_find_dbs -
* Walk through all the named files/databases including those in the
* environment or data_dirs and those that in named and in-memory. We
@@ -240,7 +734,8 @@ __rep_find_dbs(env, context)
* replicated user databases. If the application has a metadata_dir,
* this will also find any persistent internal system databases.
*/
- if (dbenv->db_data_dir != NULL) {
+ if (!F_ISSET(context, FILE_CTX_INMEM_ONLY) &&
+ dbenv->db_data_dir != NULL) {
for (ddir = dbenv->db_data_dir; *ddir != NULL; ++ddir) {
if ((ret = __db_appname(env,
DB_APP_NONE, *ddir, NULL, &real_dir)) != 0)
@@ -252,16 +747,24 @@ __rep_find_dbs(env, context)
real_dir = NULL;
}
}
+
/*
* Walk the environment directory. If the application doesn't
* have a metadata_dir, this will return persistent internal system
* databases. If the application doesn't have a separate data
* directory, this will also return all user databases.
*/
- if (ret == 0)
+ if (!F_ISSET(context, FILE_CTX_INMEM_ONLY) && ret == 0)
ret = __rep_walk_dir(env, env->db_home, NULL, context);
- /* Now, collect any in-memory named databases. */
+ /* Gather the databases in the blob directory. */
+ if (!F_ISSET(context, FILE_CTX_INMEM_ONLY) && ret == 0)
+ ret = __rep_walk_blob_dir(env, context);
+
+ /*
+ * Now, collect any in-memory named databases. We do this no
+ * matter if the INMEM_ONLY flag is set or not.
+ */
if (ret == 0)
ret = __rep_walk_dir(env, NULL, NULL, context);
@@ -271,6 +774,148 @@ __rep_find_dbs(env, context)
}
/*
+ * __rep_walk_blob_dir --
+ *
+ * The blob directory hierarchy consists of a top layer that contains the
+ * blob meta database (BMD) and a set of blob directories (BLDIR).
+ * Each BLDIR corresponds to a database file. If the database file doesn't
+ * contain subdatabases, the BLDIR contains a BMD and blob files. If the
+ * database file contains subdatabases, the BLDIR contains a BLSDIR
+ * subdirectory for each subdatabase. Each BLSDIR contains a BMD and blob
+ * files.
+ *
+ * This function walks the blob directory hierarchy and records any BMD.
+ * It first checks if the top level BMD exists, and if it does searches
+ * the first and second layers of the hierarchy for BMDs.
+ */
+static int
+__rep_walk_blob_dir(env, context)
+ ENV *env;
+ FILE_LIST_CTX *context;
+{
+ int cnt, cnt2, i, j, ret;
+ size_t len;
+ char *blob_dir, *blob_sub, **dirs, *name, *name2, **subdirs;
+ char blob_sub_buf[MAX_BLOB_PATH_SZ];
+ const char *bmd, *dirp;
+
+ /*
+ * Add every blob meta database found in the top level of the blob
+ * directory, and in its first and second level sub-directories, to
+ * the file list in "context". Names are added relative to the blob
+ * directory. Returns 0 on success, including when the environment has
+ * no top level blob meta database, otherwise the error from the
+ * failing helper.
+ *
+ * NOTE(review): the size given to each snprintf() below is the
+ * computed length of the string, not the capacity of the destination,
+ * so it does not protect against an unexpectedly long directory name.
+ * Presumably MAX_BLOB_PATH_SZ bounds these names -- confirm.
+ */
+ cnt = cnt2 = 0;
+ blob_dir = name = name2 = NULL;
+ dirs = subdirs = NULL;
+ bmd = BLOB_META_FILE_NAME;
+ blob_sub = blob_sub_buf;
+
+ if ((ret = __db_appname(
+ env, DB_APP_BLOB, BLOB_META_FILE_NAME, &dirp, &name)) != 0)
+ goto err;
+
+ /*
+ * If the main blob meta database does not exist, then no databases in
+ * the environment supports blobs.
+ */
+ if ((ret = __os_exists(env, name, NULL)) != 0) {
+ ret = 0;
+ goto err;
+ }
+
+ /* Get the blob directory. */
+ if ((ret = __db_appname(
+ env, DB_APP_BLOB, NULL, &dirp, &blob_dir)) != 0)
+ goto err;
+
+ /* Record the top level blob meta database. */
+ if ((ret = __rep_add_files_to_list(
+ env, blob_dir, NULL, context, &bmd, 1)) != 0)
+ goto err;
+
+ if ((ret = __os_dirlist(env, blob_dir, 1, &dirs, &cnt)) != 0)
+ goto err;
+
+ /* Replace "name" with a scratch buffer for first level paths. */
+ __os_free(env, name);
+ name = NULL;
+ if ((ret = __os_malloc(
+ env, MAX_BLOB_PATH_SZ + strlen(blob_dir), &name)) != 0)
+ goto err;
+
+ for (i = 0; i < cnt; i++) {
+ /*
+ * Skip blob files and the top level BMD
+ * (which was handled above).
+ */
+ if (IS_BLOB_META(dirs[i]) || IS_BLOB_FILE(dirs[i]))
+ continue;
+ len = strlen(blob_dir) +
+ strlen(dirs[i]) + strlen(BLOB_META_FILE_NAME) + 3;
+ (void)snprintf(name, len, "%s%c%s%c%s", blob_dir,
+ PATH_SEPARATOR[0], dirs[i], PATH_SEPARATOR[0],
+ BLOB_META_FILE_NAME);
+ /*
+ * If a blob meta database exists, add it to the list, and move
+ * on to the next directory, otherwise get a directory list and
+ * check the second layer for BMD. If a directory contains a
+ * BMD, then it cannot contain subdirectories with BMD.
+ */
+ if (__os_exists(env, name, NULL) == 0) {
+ (void)snprintf(blob_sub,
+ strlen(dirs[i]) + strlen(bmd) + 2,
+ "%s%c%s", dirs[i], PATH_SEPARATOR[0], bmd);
+ if ((ret = __rep_add_files_to_list(env, blob_dir,
+ NULL, context, (const char **)&blob_sub, 1)) != 0)
+ goto err;
+ } else {
+ len = strlen(blob_dir) + strlen(dirs[i]) + 2;
+ (void)snprintf(name, len, "%s%c%s",
+ blob_dir, PATH_SEPARATOR[0], dirs[i]);
+ if ((ret = __os_dirlist(
+ env, name, 1, &subdirs, &cnt2)) != 0)
+ goto err;
+ /*
+ * The second level scratch buffer is allocated on
+ * first use and then reused for later directories.
+ */
+ if (name2 == NULL) {
+ if ((ret = __os_malloc(env,
+ MAX_BLOB_PATH_SZ + strlen(name),
+ &name2)) != 0)
+ goto err;
+ }
+ for (j = 0; j < cnt2; j++) {
+ if (IS_BLOB_FILE(subdirs[j]))
+ continue;
+ len = strlen(name) + strlen(subdirs[j])
+ + strlen(BLOB_META_FILE_NAME) + 3;
+ (void)snprintf(name2, len, "%s%c%s%c%s",
+ name, PATH_SEPARATOR[0], subdirs[j],
+ PATH_SEPARATOR[0], BLOB_META_FILE_NAME);
+ /*
+ * A missing BMD just means there is nothing
+ * to record for this entry. Do not store the
+ * result of the existence test in ret, or a
+ * miss on the last entry would be returned
+ * to the caller as an error.
+ */
+ if (__os_exists(env, name2, NULL) == 0) {
+ len = strlen(dirs[i])
+ + strlen(subdirs[j])
+ + strlen(bmd) + 3;
+ (void)snprintf(blob_sub,
+ len, "%s%c%s%c%s", dirs[i],
+ PATH_SEPARATOR[0], subdirs[j],
+ PATH_SEPARATOR[0], bmd);
+ if ((ret = __rep_add_files_to_list(
+ env, blob_dir, NULL, context,
+ (const char **)&blob_sub, 1)) != 0)
+ goto err;
+ }
+ }
+ __os_dirfree(env, subdirs, cnt2);
+ subdirs = NULL;
+ }
+ }
+
+err: if (name != NULL)
+ __os_free(env, name);
+ if (name2 != NULL)
+ __os_free(env, name2);
+ if (blob_dir != NULL)
+ __os_free(env, blob_dir);
+ if (dirs != NULL)
+ __os_dirfree(env, dirs, cnt);
+ if (subdirs != NULL)
+ __os_dirfree(env, subdirs, cnt2);
+ return (ret);
+}
+
+/*
* __rep_walk_dir --
*
* This is the routine that walks a directory and fills in the structures
@@ -284,11 +929,8 @@ __rep_walk_dir(env, dir, datadir, context)
const char *dir, *datadir;
FILE_LIST_CTX *context;
{
- __rep_fileinfo_args tmpfp;
- size_t avail, len;
- int cnt, first_file, i, ret;
- u_int8_t uid[DB_FILE_ID_LEN];
- char *file, **names, *subdb;
+ int cnt, ret;
+ char **names;
if (dir == NULL) {
VPRINT(env, (env, DB_VERB_REP_SYNC,
@@ -304,7 +946,34 @@ __rep_walk_dir(env, dir, datadir, context)
}
VPRINT(env, (env, DB_VERB_REP_SYNC, "Walk_dir: Dir %s has %d files",
(dir == NULL) ? "INMEM" : dir, cnt));
+ ret = __rep_add_files_to_list(
+ env, dir, datadir, context, (const char **)names, cnt);
+
+ __os_dirfree(env, names, cnt);
+ return (ret);
+}
+
+/*
+ * __rep_add_files_to_list --
+ *
+ * Add the given files to the file list.
+ */
+static int
+__rep_add_files_to_list(env, dir, datadir, context, names, cnt)
+ ENV *env;
+ const char *dir, *datadir;
+ FILE_LIST_CTX *context;
+ const char **names;
+ int cnt;
+{
+ __rep_fileinfo_args tmpfp;
+ size_t avail, len;
+ int first_file, i, ret;
+ u_int8_t uid[DB_FILE_ID_LEN];
+ const char *file, *subdb;
+
first_file = 1;
+ ret = 0;
for (i = 0; i < cnt; i++) {
VPRINT(env, (env, DB_VERB_REP_SYNC,
"Walk_dir: File %d name: %s", i, names[i]));
@@ -372,15 +1041,19 @@ __rep_walk_dir(env, dir, datadir, context)
DB_SET_DBT(tmpfp.uid, uid, DB_FILE_ID_LEN);
retry: avail = (size_t)(&context->buf[context->size] -
context->fillptr);
+ /*
+ * It is safe to cast to the old structs
+ * because the first part of the current
+ * struct matches the old structs.
+ */
if (context->version < DB_REPVERSION_53)
- /*
- * It is safe to cast to the old struct
- * because the first part of the current
- * struct matches the old struct.
- */
ret = __rep_fileinfo_v6_marshal(env, context->version,
(__rep_fileinfo_v6_args *)&tmpfp,
context->fillptr, avail, &len);
+ else if (context->version < DB_REPVERSION_61)
+ ret = __rep_fileinfo_v7_marshal(env, context->version,
+ (__rep_fileinfo_v7_args *)&tmpfp,
+ context->fillptr, avail, &len);
else
ret = __rep_fileinfo_marshal(env, context->version,
&tmpfp, context->fillptr, avail, &len);
@@ -409,9 +1082,7 @@ retry: avail = (size_t)(&context->buf[context->size] -
*/
context->fillptr += len;
}
-err:
- __os_dirfree(env, names, cnt);
- return (ret);
+err: return (ret);
}
/*
@@ -430,7 +1101,7 @@ __rep_is_replicated_db(name, dir)
/*
* Remaining things that don't have a "__db" prefix are eligible.
*/
- if (!IS_DB_FILE(name))
+ if (!IS_DB_FILE(name) || IS_BLOB_META(name))
return (1);
/* Here, we know we have a "__db" name. */
@@ -470,7 +1141,7 @@ __rep_check_uid(env, rfp, uid)
if (memcmp(rfp->uid.data, uid, DB_FILE_ID_LEN) == 0) {
VPRINT(env, (env, DB_VERB_REP_SYNC,
"Check_uid: Found matching file."));
- ret = DB_KEYEXIST;
+ ret = USR_ERR(env, DB_KEYEXIST);
}
return (ret);
@@ -489,6 +1160,7 @@ __rep_get_fileinfo(env, file, subdb, rfp, uid)
DB_THREAD_INFO *ip;
PAGE *pagep;
int lorder, ret, t_ret;
+ u_int32_t flags;
dbp = NULL;
dbc = NULL;
@@ -503,11 +1175,15 @@ __rep_get_fileinfo(env, file, subdb, rfp, uid)
* database handles would block the master from handling UPDATE_REQ.
*/
F_SET(dbp, DB_AM_RECOVER);
- if ((ret = __db_open(dbp, ip, NULL, file, subdb, DB_UNKNOWN,
- DB_RDONLY | (F_ISSET(env, ENV_THREAD) ? DB_THREAD : 0),
- 0, PGNO_BASE_MD)) != 0)
+ flags = DB_RDONLY | (F_ISSET(env, ENV_THREAD) ? DB_THREAD : 0);
+ if (file != NULL && IS_BLOB_META(file))
+ LF_SET(DB_INTERNAL_BLOB_DB);
+ if ((ret = __db_open(dbp, ip, NULL,
+ file, subdb, DB_UNKNOWN, flags, 0, PGNO_BASE_MD)) != 0)
goto err;
+ SET_LO_HI_VAR(dbp->blob_file_id, rfp->blob_fid_lo, rfp->blob_fid_hi);
+
if ((ret = __db_cursor(dbp, ip, NULL, &dbc, 0)) != 0)
goto err;
if ((ret = __memp_fget(dbp->mpf, &dbp->meta_pgno, ip, dbc->txn,
@@ -574,6 +1250,7 @@ __rep_page_req(env, ip, eid, rp, rec)
{
__rep_fileinfo_args *msgfp, msgf;
__rep_fileinfo_v6_args *msgfpv6;
+ __rep_fileinfo_v7_args *msgfpv7;
DB_MPOOLFILE *mpf;
DB_REP *db_rep;
REP *rep;
@@ -584,21 +1261,30 @@ __rep_page_req(env, ip, eid, rp, rec)
db_rep = env->rep_handle;
rep = db_rep->region;
+ /*
+ * Build a current struct by copying in the older
+ * version struct and then setting up the new fields.
+ * This is safe because all old fields are in the
+ * same location in the current struct.
+ */
if (rp->rep_version < DB_REPVERSION_53) {
- /*
- * Build a current struct by copying in the older
- * version struct and then setting up the data_dir.
- * This is safe because all old fields are in the
- * same location in the current struct.
- */
if ((ret = __rep_fileinfo_v6_unmarshal(env, rp->rep_version,
&msgfpv6, rec->data, rec->size, &next)) != 0)
return (ret);
memcpy(&msgf, msgfpv6, sizeof(__rep_fileinfo_v6_args));
msgf.dir.data = NULL;
msgf.dir.size = 0;
+ msgf.blob_fid_lo = msgf.blob_fid_hi = 0;
msgfp = &msgf;
msgfree = msgfpv6;
+ } else if (rp->rep_version < DB_REPVERSION_61) {
+ if ((ret = __rep_fileinfo_v7_unmarshal(env, rp->rep_version,
+ &msgfpv7, rec->data, rec->size, &next)) != 0)
+ return (ret);
+ memcpy(&msgf, msgfpv7, sizeof(__rep_fileinfo_v7_args));
+ msgf.blob_fid_lo = msgf.blob_fid_hi = 0;
+ msgfp = &msgf;
+ msgfree = msgfpv7;
} else {
if ((ret = __rep_fileinfo_unmarshal(env, rp->rep_version,
&msgfp, rec->data, rec->size, &next)) != 0)
@@ -624,7 +1310,7 @@ __rep_page_req(env, ip, eid, rp, rec)
(void)__rep_send_message(env, eid, REP_FILE_FAIL,
NULL, rec, 0, 0);
else
- ret = DB_NOTFOUND;
+ ret = USR_ERR(env, DB_NOTFOUND);
goto err;
}
@@ -738,7 +1424,7 @@ __rep_page_sendpages(env, ip, eid, rp, msgfp, mpf, dbp)
#ifdef HAVE_QUEUE
if ((ret = __qam_fget(qdbc, &p, 0, &pagep)) == ENOENT)
#endif
- ret = DB_PAGE_NOTFOUND;
+ ret = USR_ERR(env, DB_PAGE_NOTFOUND);
} else
ret = __memp_fget(mpf, &p, ip, NULL, 0, &pagep);
msgfp->pgno = p;
@@ -748,16 +1434,21 @@ __rep_page_sendpages(env, ip, eid, rp, msgfp, mpf, dbp)
RPRINT(env, (env, DB_VERB_REP_SYNC,
"sendpages: PAGE_FAIL on page %lu",
(u_long)p));
+ /*
+ * It is safe to cast to the old structs
+ * because the first part of the current
+ * struct matches the old structs.
+ */
if (rp->rep_version < DB_REPVERSION_53)
- /*
- * It is safe to cast to the old struct
- * because the first part of the current
- * struct matches the old struct.
- */
ret = __rep_fileinfo_v6_marshal(env,
rp->rep_version,
(__rep_fileinfo_v6_args *)msgfp,
buf, msgsz, &len);
+ else if (rp->rep_version < DB_REPVERSION_61)
+ ret = __rep_fileinfo_v7_marshal(env,
+ rp->rep_version,
+ (__rep_fileinfo_v7_args *)msgfp,
+ buf, msgsz, &len);
else
ret = __rep_fileinfo_marshal(env,
rp->rep_version, msgfp, buf,
@@ -772,7 +1463,7 @@ __rep_page_sendpages(env, ip, eid, rp, msgfp, mpf, dbp)
REP_PAGE_FAIL, &lsn, &msgdbt, 0, 0);
continue;
} else
- ret = DB_NOTFOUND;
+ ret = USR_ERR(env, DB_NOTFOUND);
goto err;
} else if (ret != 0)
goto err;
@@ -796,16 +1487,21 @@ __rep_page_sendpages(env, ip, eid, rp, msgfp, mpf, dbp)
RPRINT(env, (env, DB_VERB_REP_SYNC,
"sendpages: %lu, page lsn [%lu][%lu]", (u_long)p,
(u_long)pagep->lsn.file, (u_long)pagep->lsn.offset));
+ /*
+ * It is safe to cast to the old structs
+ * because the first part of the current
+ * structs matches the old struct.
+ */
if (rp->rep_version < DB_REPVERSION_53)
- /*
- * It is safe to cast to the old struct
- * because the first part of the current
- * struct matches the old struct.
- */
ret = __rep_fileinfo_v6_marshal(env,
rp->rep_version,
(__rep_fileinfo_v6_args *)msgfp,
buf, msgsz, &len);
+ else if (rp->rep_version < DB_REPVERSION_61)
+ ret = __rep_fileinfo_v7_marshal(env,
+ rp->rep_version,
+ (__rep_fileinfo_v7_args *)msgfp,
+ buf, msgsz, &len);
else
ret = __rep_fileinfo_marshal(env, rp->rep_version,
msgfp, buf, msgsz, &len);
@@ -1010,7 +1706,8 @@ __rep_update_setup(env, eid, rp, rec, savetime, lsn)
ZERO_LSN(lp->waiting_lsn);
ZERO_LSN(lp->max_wait_lsn);
ZERO_LSN(lp->max_perm_lsn);
- if (db_rep->rep_db == NULL)
+ ret = __rep_blob_cleanup(env, rep);
+ if (ret == 0 && db_rep->rep_db == NULL)
ret = __rep_client_dbinit(env, 0, REP_DB);
MUTEX_UNLOCK(env, rep->mtx_clientdb);
if (ret != 0)
@@ -1148,6 +1845,337 @@ err: /*
return (ret);
}
+/*
+ * __rep_blob_update
+ * Prepare to receive blob file data by setting up the blob gap database,
+ * then requesting the blob file data.
+ *
+ * The message in rec is a marshaled __rep_blob_update_args followed by
+ * rbu.num_blobs marshaled __rep_blob_file_args entries. The message is
+ * ignored, and 0 is returned, when the sync state is not SYNC_PAGE, when
+ * the file id does not match the file currently being processed, or when
+ * the blob gap database already has entries. Otherwise the return value
+ * is 0 or the error from the failing helper.
+ *
+ * PUBLIC: int __rep_blob_update __P((ENV *, int, DB_THREAD_INFO *, DBT *));
+ */
+int
+__rep_blob_update(env, eid, ip, rec)
+ ENV *env;
+ int eid;
+ DB_THREAD_INFO *ip;
+ DBT *rec;
+{
+ DBC *dbc;
+ DB_REP *db_rep;
+ DBT data, key;
+ REP *rep;
+ REGINFO *infop;
+ __rep_blob_file_args rbf;
+ __rep_blob_update_args rbu;
+ __rep_fileinfo_args *rfp;
+ db_seq_t blob_fid;
+ int ret;
+ off_t offset;
+ size_t len;
+ u_int32_t num_blobs;
+ u_int8_t keybuf[BLOB_KEY_SIZE], *ptr;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ infop = env->reginfo;
+ rfp = NULL;
+ dbc = NULL;
+ memset(&rbu, 0, sizeof(__rep_blob_update_args));
+ memset(&rbf, 0, sizeof(__rep_blob_file_args));
+
+ /* ptr is left pointing at the first blob file entry. */
+ if ((ret = __rep_blob_update_unmarshal(
+ env, &rbu, rec->data, rec->size, &ptr)) != 0)
+ return (ret);
+ len = rec->size - __REP_BLOB_UPDATE_SIZE;
+
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+"blob_update: file_id %llu, num_blobs %lu, flags %lu, highest %llu",
+ (long long)rbu.blob_fid, (long)rbu.num_blobs,
+ (unsigned long)rbu.flags, (long long)rbu.highest_id));
+
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ REP_SYSTEM_LOCK(env);
+
+ /*
+ * Check if the world changed.
+ */
+ if (rep->sync_state != SYNC_PAGE)
+ goto unlock;
+
+ /* Make sure this is for the current database. */
+ GET_CURINFO(rep, infop, rfp);
+ GET_LO_HI(env, rfp->blob_fid_lo, rfp->blob_fid_hi, blob_fid, ret);
+ if (ret != 0)
+ goto unlock;
+
+ if (blob_fid != (db_seq_t)rbu.blob_fid)
+ goto unlock;
+
+ rep->highest_id = (db_seq_t)rbu.highest_id;
+ /*
+ * For each blob file, add an entry to the database for each 1 MB
+ * section of that file. The entries will be deleted as the
+ * corresponding blob chunks arrive and are written to disk.
+ */
+ if (db_rep->blob_dbp == NULL &&
+ (ret = __rep_client_dbinit(env, 0, REP_BLOB)) != 0)
+ goto unlock;
+
+ if ((ret = __db_cursor(db_rep->blob_dbp, ip, NULL, &dbc, 0)) != 0)
+ goto unlock;
+
+ /*
+ * Make sure no one else has populated the database, this could happen
+ * if the update message is sent twice. Anything other than
+ * DB_NOTFOUND (an existing entry, or an error) ends processing.
+ */
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+ if ((ret = __dbc_get(dbc, &key, &data, DB_FIRST)) != DB_NOTFOUND)
+ goto unlock;
+
+ /*
+ * It is possible for a blob database to have no blobs. In that case
+ * reset the blob tracking fields and finish this file right away.
+ */
+ if (rbu.num_blobs == 0) {
+ (void)__dbc_close(dbc);
+ dbc = NULL;
+ rep->blob_more_files = 0;
+ rep->gap_bl_hi_id = rep->gap_bl_hi_sid = 0;
+ rep->last_blob_id = rep->last_blob_sid = 0;
+ rep->prev_blob_id = rep->prev_blob_sid = 0;
+ rep->gap_bl_hi_off = 0;
+ rep->blob_sync = 0;
+ rep->highest_id = 0;
+ rep->blob_rereq = 0;
+ ret = __rep_blobdone(env, eid, ip, rep, blob_fid, 0);
+ goto unlock;
+ }
+
+ /*
+ * The key is the sub-database id followed by the blob id, and the
+ * data is the offset of the chunk within the blob file. Both DBTs
+ * point at local storage that is rewritten for each entry.
+ */
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+ data.flags = key.flags = DB_DBT_USERMEM;
+ key.data = keybuf;
+ key.ulen = key.size = BLOB_KEY_SIZE;
+ data.data = (void *)&offset;
+ data.ulen = data.size = sizeof(offset);
+ num_blobs = 0;
+ while (num_blobs < rbu.num_blobs) {
+ if ((ret =
+ __rep_blob_file_unmarshal(env, &rbf, ptr, len, &ptr)) != 0)
+ goto unlock;
+ len -= __REP_BLOB_FILE_SIZE;
+
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "blob_update adding file: blob_id %llu, sdb_id %llu, blob_size %llu",
+ (long long)rbf.blob_id, (long long)rbf.blob_sid,
+ (long long)rbf.blob_size));
+
+ memcpy(keybuf, &rbf.blob_sid, BLOB_ID_SIZE);
+ memcpy(&(keybuf[BLOB_ID_SIZE]), &rbf.blob_id, BLOB_ID_SIZE);
+ offset = 0;
+ /*
+ * Add an entry for each megabyte of the blob file. Zero
+ * length blob files should have at least one entry.
+ */
+ do {
+ if ((ret = __dbc_put(dbc, &key, &data, 0)) != 0)
+ goto unlock;
+ offset += MEGABYTE;
+ /*
+ * Check for overflow, this can happen when the master
+ * supports 64-bit file offsets, but the client does
+ * not.
+ */
+ if (offset < 0) {
+ __db_errx(env,
+ DB_STR("3704",
+ "Blob file offset overflow"));
+ ret = EINVAL;
+ goto unlock;
+ }
+ /*
+ * Compare at the full width of blob_size. Truncating
+ * the offset to 32 bits would wrap it back to 0 at
+ * 4GB, and the loop would never end for a file of
+ * that size or larger.
+ */
+ } while ((u_int64_t)offset < rbf.blob_size);
+ num_blobs++;
+ }
+ /* Set whether there are more files after the ones on the list. */
+ if (F_ISSET(&rbu, BLOB_DONE))
+ rep->blob_more_files = 0;
+ else
+ rep->blob_more_files = 1;
+ /* Remember the last file of the previous and of the current list. */
+ rep->prev_blob_id = rep->last_blob_id;
+ rep->prev_blob_sid = rep->last_blob_sid;
+ rep->last_blob_sid = (db_seq_t)rbf.blob_sid;
+ rep->last_blob_id = (db_seq_t)rbf.blob_id;
+
+ /*
+ * Send the same message payload in a REP_BLOB_ALL_REQ message to get
+ * the blob data. Peer-to-peer initialization is not supported for
+ * blobs, so we can only send this back to the master despite the fact
+ * that building the list of blob files is expensive.
+ */
+ (void)__rep_send_message(
+ env, rep->master_id, REP_BLOB_ALL_REQ, NULL, rec, 0, 0);
+
+unlock: REP_SYSTEM_UNLOCK(env);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ if (dbc != NULL)
+ (void)__dbc_close(dbc);
+
+ return (ret);
+}
+
+/*
+ * __rep_blob_allreq
+ * Request blob file data.
+ *
+ * Handles a REP_BLOB_ALL_REQ message. For every blob file listed in the
+ * message payload, the file is read and sent to eid as a series of
+ * REP_BLOB_CHUNK messages, each carrying at most MEGABYTE bytes of data.
+ * A file that no longer exists is reported with a single chunk flagged
+ * BLOB_DELETE, and a file that has become shorter than listed ends with
+ * a chunk flagged BLOB_CHUNK_FAIL.
+ *
+ * Returns 0 on success, otherwise the error from the failing helper.
+ * Failures of __rep_send_message() are deliberately ignored.
+ *
+ * PUBLIC: int __rep_blob_allreq __P((ENV *, int, DBT *));
+ */
+int
+__rep_blob_allreq(env, eid, rec)
+ ENV *env;
+ int eid;
+ DBT *rec;
+{
+ DB *dbp;
+ DB_FH *fhp;
+ DBT msg;
+ __rep_blob_chunk_args rbc;
+ __rep_blob_file_args rbf;
+ __rep_blob_update_args rbu;
+ db_seq_t old_sdb_id;
+ int done, ret;
+ off_t offset;
+ size_t len;
+ u_int32_t num_blobs;
+ u_int8_t *chunk_buf, *msg_buf, *ptr;
+
+ dbp = NULL;
+ fhp = NULL;
+ chunk_buf = msg_buf = NULL;
+ memset(&rbu, 0, sizeof(__rep_blob_update_args));
+ memset(&rbc, 0, sizeof(__rep_blob_chunk_args));
+ memset(&msg, 0, sizeof(DBT));
+
+ /*
+ * msg_buf holds a marshaled chunk header plus up to one megabyte of
+ * data; chunk_buf receives the data read from the blob file.
+ */
+ if ((ret =
+ __os_malloc(env, MEGABYTE + __REP_BLOB_CHUNK_SIZE, &msg_buf)) != 0)
+ goto err;
+ msg.data = msg_buf;
+ msg.ulen = MEGABYTE + __REP_BLOB_CHUNK_SIZE;
+ if ((ret = __os_malloc(env, MEGABYTE, &chunk_buf)) != 0)
+ goto err;
+ rbc.data.data = chunk_buf;
+ rbc.data.ulen = MEGABYTE;
+ rbc.data.flags = DB_DBT_USERMEM;
+
+ /*
+ * The REP_BLOB_ALL_REQ message sends the REP_BLOB_UPDATE message
+ * payload back to the master to request the actual blobs after the
+ * client has prepared itself to receive them.
+ */
+ len = rec->size;
+ if ((ret = __rep_blob_update_unmarshal(
+ env, &rbu, rec->data, rec->size, &ptr)) != 0)
+ goto err;
+ len -= __REP_BLOB_UPDATE_SIZE;
+
+ RPRINT(env, (env, DB_VERB_REP_SYNC,
+ "blob_all_req: file_id %llu, num_blobs %lu, flags %lu",
+ (long long)rbu.blob_fid, (long)rbu.num_blobs,
+ (unsigned long)rbu.flags));
+
+ /* A temporary handle carries the ids used to locate blob files. */
+ if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+ goto err;
+ dbp->blob_file_id = (db_seq_t)rbu.blob_fid;
+ rbc.blob_fid = rbu.blob_fid;
+ num_blobs = 0;
+ /*
+ * The list of files to send is included in the message, go
+ * through the list and send each file in pieces.
+ */
+ while (num_blobs < rbu.num_blobs) {
+ num_blobs++;
+ if ((ret = __rep_blob_file_unmarshal(
+ env, &rbf, ptr, len, &ptr)) != 0)
+ goto err;
+ len -= __REP_BLOB_FILE_SIZE;
+ old_sdb_id = dbp->blob_sdb_id;
+ dbp->blob_sdb_id = (db_seq_t)rbf.blob_sid;
+ rbc.flags = 0;
+ rbc.blob_sid = rbf.blob_sid;
+ rbc.blob_id = rbf.blob_id;
+ /* Free the sub-directory information if it has changed. */
+ if (old_sdb_id != dbp->blob_sdb_id &&
+ dbp->blob_sub_dir != NULL) {
+ __os_free(env, dbp->blob_sub_dir);
+ dbp->blob_sub_dir = NULL;
+ }
+ if (dbp->blob_sub_dir == NULL) {
+ if ((ret = __blob_make_sub_dir(env, &dbp->blob_sub_dir,
+ dbp->blob_file_id, dbp->blob_sdb_id)) != 0)
+ goto err;
+ }
+ if ((ret = __blob_file_open(dbp,
+ &fhp, (db_seq_t)rbf.blob_id, DB_FOP_READONLY, 0)) != 0) {
+ /*
+ * The file may have been deleted between creating the
+ * list and sending the data. Send a message saying
+ * the file has been deleted.
+ */
+ if (ret == ENOENT) {
+ F_SET(&rbc, BLOB_DELETE);
+ rbc.data.size = 0;
+ __rep_blob_chunk_marshal(env, &rbc, msg.data);
+ msg.size = __REP_BLOB_CHUNK_SIZE;
+ (void)__rep_send_message(env,
+ eid, REP_BLOB_CHUNK, NULL, &msg, 0, 0);
+ ret = 0;
+ fhp = NULL;
+ continue;
+ }
+ goto err;
+ }
+ /*
+ * Send the file one megabyte at a time. The loop body runs at
+ * least once, so a zero length file still produces one chunk.
+ */
+ offset = 0;
+ do {
+ done = 0;
+ rbc.flags = 0;
+ if ((ret = __blob_file_read(
+ env, fhp, &rbc.data, offset, MEGABYTE)) != 0)
+ goto err;
+ DB_ASSERT(env, rbc.data.size <= MEGABYTE);
+
+ /*
+ * In rare cases the blob file may have gotten shorter
+ * since the list was created.
+ */
+ if (rbc.data.size < (u_int32_t)MEGABYTE && (u_int64_t)
+ (offset + rbc.data.size) < rbf.blob_size) {
+ F_SET(&rbc, BLOB_CHUNK_FAIL);
+ done = 1;
+ }
+ /*
+ * File may have grown since the list was made. Only
+ * send data up to the size that was listed.
+ */
+ if ((u_int64_t)
+ (offset + rbc.data.size) > rbf.blob_size) {
+ rbc.data.size =
+ (u_int32_t)((off_t)rbf.blob_size - offset);
+ }
+ rbc.offset = (u_int64_t)offset;
+ __rep_blob_chunk_marshal(env, &rbc, msg.data);
+ msg.size = __REP_BLOB_CHUNK_SIZE + rbc.data.size;
+ (void)__rep_send_message(
+ env, eid, REP_BLOB_CHUNK, NULL, &msg, 0, 0);
+ offset += MEGABYTE;
+ } while ((u_int64_t)offset < rbf.blob_size && !done);
+
+ if (fhp != NULL && (ret = __os_closehandle(env, fhp)) != 0)
+ goto err;
+ fhp = NULL;
+ }
+err: if (chunk_buf != NULL)
+ __os_free(env, chunk_buf);
+ if (msg_buf != NULL)
+ __os_free(env, msg_buf);
+ if (fhp != NULL)
+ (void)__os_closehandle(env, fhp);
+ /* dbp is a pointer; test it against NULL like the others above. */
+ if (dbp != NULL)
+ (void)__db_close(dbp, NULL, 0);
+ return (ret);
+}
+
static int
__rep_find_inmem(env, rfp, unused)
ENV *env;
@@ -1157,6 +2185,11 @@ __rep_find_inmem(env, rfp, unused)
COMPQUIET(env, NULL);
COMPQUIET(unused, NULL);
+ /*
+ * Cannot assume all databases are in-memory because abbreviated
+ * internal inits from 5.3 and earlier are not limited to in-memory
+ * databases.
+ */
return (FLD_ISSET(rfp->db_flags, DB_AM_INMEM) ? DB_KEYEXIST : 0);
}
@@ -1172,12 +2205,9 @@ __rep_remove_nimdbs(env)
FILE_LIST_CTX context;
int ret;
- if ((ret = __os_calloc(env, 1, MEGABYTE, &context.buf)) != 0)
+ if ((ret = __rep_init_file_list_context(env,
+ DB_REPVERSION, 0, 0, &context)) != 0)
return (ret);
- context.size = MEGABYTE;
- context.count = 0;
- context.fillptr = context.buf;
- context.version = DB_REPVERSION;
/* NB: "NULL" asks walk_dir to consider only in-memory DBs */
if ((ret = __rep_walk_dir(env, NULL, NULL, &context)) != 0)
@@ -1240,14 +2270,11 @@ __rep_remove_all(env, msg_version, rec)
* 1. Get list of databases currently present at this client, which we
* intend to remove.
*/
- if ((ret = __os_calloc(env, 1, MEGABYTE, &context.buf)) != 0)
- return (ret);
- context.size = MEGABYTE;
- context.count = 0;
- context.version = DB_REPVERSION;
/* Reserve space for the marshaled update_args. */
- context.fillptr = FIRST_FILE_PTR(context.buf);
+ if ((ret = __rep_init_file_list_context(env,
+ DB_REPVERSION, 0, 1, &context)) != 0)
+ return (ret);
if ((ret = __rep_find_dbs(env, &context)) != 0)
goto out;
@@ -1333,6 +2360,9 @@ __rep_remove_all(env, msg_version, rec)
FIRST_FILE_PTR(context.buf), context.size,
context.count, __rep_remove_file, NULL)) != 0)
goto out;
+ /* Remove the blob directory. */
+ if ((ret = __blob_del_hierarchy(env)) != 0)
+ goto out;
/*
* 4. Safe-store the (new) list of database files we intend to copy from
@@ -1445,6 +2475,8 @@ __rep_remove_file(env, rfp, unused)
#ifdef HAVE_QUEUE
DB_THREAD_INFO *ip;
#endif
+ APPNAME appname;
+ db_seq_t blob_fid, blob_sid;
char *name;
int ret, t_ret;
@@ -1496,29 +2528,53 @@ __rep_remove_file(env, rfp, unused)
* That will only have removed extent files. Now
* we need to deal with the actual file itself.
*/
+ appname = __rep_is_internal_rep_file(rfp->info.data) ?
+ DB_APP_META : (IS_BLOB_META(rfp->info.data) ?
+ DB_APP_BLOB : DB_APP_DATA);
if (FLD_ISSET(rfp->db_flags, DB_AM_INMEM)) {
if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
return (ret);
MAKE_INMEM(dbp);
F_SET(dbp, DB_AM_RECOVER); /* Skirt locking. */
ret = __db_inmem_remove(dbp, NULL, name);
- } else if ((ret = __fop_remove(env,
- NULL, rfp->uid.data, name, (const char **)&rfp->dir.data,
- __rep_is_internal_rep_file(rfp->info.data) ?
- DB_APP_META : DB_APP_DATA, 0)) != 0)
+ } else if ((ret = __fop_remove(env, NULL, rfp->uid.data, name,
+ (const char **)&rfp->dir.data, appname, 0)) != 0) {
/*
* If fop_remove fails, it could be because
* the client has a different data_dir
* structure than the master. Retry with the
- * local, default settings.
+ * local, default settings.
*/
ret = __fop_remove(env,
- NULL, rfp->uid.data, name, NULL,
- __rep_is_internal_rep_file(rfp->info.data) ?
- DB_APP_META : DB_APP_DATA, 0);
-#ifdef HAVE_QUEUE
-out:
+ NULL, rfp->uid.data, name, NULL, appname, 0);
+#ifdef DB_WIN32
+ /*
+ * Deleting a blob meta database can result in a
+ * ERROR_PATH_NOT_FOUND error on windows, so treat
+ * that as an ENOENT.
+ */
+ if (__os_posix_err(ret) == ENOENT)
+ ret = ENOENT;
#endif
+ }
+ /* Clean any blob directories. */
+ if (ret == 0 && appname == DB_APP_BLOB) {
+ /* dbp has not been set, since queues do not support blobs. */
+ DB_ASSERT(env, dbp == NULL);
+ if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+ goto out;
+ if ((ret = __blob_path_to_dir_ids(
+ env, name, &blob_fid, &blob_sid)) != 0)
+ goto out;
+ /* blob_fid == 0 if it is the top level blob meta db. */
+ if (blob_fid != 0) {
+ dbp->blob_file_id = blob_fid;
+ dbp->blob_sdb_id = blob_sid;
+ if ((ret = __blob_del_all(dbp, NULL, 0)) != 0)
+ goto out;
+ }
+ }
+out:
if (dbp != NULL &&
(t_ret = __db_close(dbp, NULL, DB_NOSYNC)) != 0 && ret == 0)
ret = t_ret;
@@ -1610,10 +2666,11 @@ __rep_page(env, ip, eid, rp, rec)
{
DB_REP *db_rep;
- DBT key, data;
+ DBT data, key;
REP *rep;
__rep_fileinfo_args *msgfp, msgf;
__rep_fileinfo_v6_args *msgfpv6;
+ __rep_fileinfo_v7_args *msgfpv7;
db_recno_t recno;
int ret;
char *msg;
@@ -1647,21 +2704,30 @@ __rep_page(env, ip, eid, rp, rec)
(u_long)rep->first_lsn.offset));
return (DB_REP_PAGEDONE);
}
+ /*
+ * Build a current struct by copying in the older
+ * version struct and then setting up the new fields.
+ * This is safe because all old fields are in the
+ * same location in the current struct.
+ */
if (rp->rep_version < DB_REPVERSION_53) {
- /*
- * Build a current struct by copying in the older
- * version struct and then setting up the data_dir.
- * This is safe because all old fields are in the
- * same location in the current struct.
- */
if ((ret = __rep_fileinfo_v6_unmarshal(env, rp->rep_version,
&msgfpv6, rec->data, rec->size, NULL)) != 0)
return (ret);
memcpy(&msgf, msgfpv6, sizeof(__rep_fileinfo_v6_args));
msgf.dir.data = NULL;
msgf.dir.size = 0;
+ msgf.blob_fid_lo = msgf.blob_fid_hi = 0;
msgfp = &msgf;
msgfree = msgfpv6;
+ } else if (rp->rep_version < DB_REPVERSION_61) {
+ if ((ret = __rep_fileinfo_v7_unmarshal(env, rp->rep_version,
+ &msgfpv7, rec->data, rec->size, NULL)) != 0)
+ return (ret);
+ memcpy(&msgf, msgfpv7, sizeof(__rep_fileinfo_v7_args));
+ msgf.blob_fid_lo = msgf.blob_fid_hi = 0;
+ msgfp = &msgf;
+ msgfree = msgfpv7;
} else {
if ((ret = __rep_fileinfo_unmarshal(env, rp->rep_version,
&msgfp, rec->data, rec->size, NULL)) != 0)
@@ -1671,9 +2737,9 @@ __rep_page(env, ip, eid, rp, rec)
MUTEX_LOCK(env, rep->mtx_clientdb);
REP_SYSTEM_LOCK(env);
/*
- * Check if the world changed.
+ * Check if the world changed or if we are in the blob sync phase.
*/
- if (rep->sync_state != SYNC_PAGE) {
+ if (rep->sync_state != SYNC_PAGE || rep->blob_sync != 0) {
ret = DB_REP_PAGEDONE;
goto err;
}
@@ -1785,6 +2851,218 @@ err: REP_SYSTEM_UNLOCK(env);
}
/*
+ * __rep_blob_chunk
+ * Process a blob chunk message. When a blob chunk arrives, delete its
+ * entry in the blob chunk gap database to show that it has arrived, and
+ * write the data to the blob file.
+ *
+ * PUBLIC: int __rep_blob_chunk __P((ENV *, int, DB_THREAD_INFO *, DBT *));
+ */
+int
+__rep_blob_chunk(env, eid, ip, rec)
+	ENV *env;
+	int eid;
+	DB_THREAD_INFO *ip;
+	DBT *rec;
+{
+	DB_REP *db_rep;
+	DBC *dbc;
+	DB_FH *fhp;
+	DBT data, key;
+	REP *rep;
+	REGINFO *infop;
+	__rep_blob_chunk_args rbc;
+	__rep_fileinfo_args *rfp;
+	db_seq_t blob_fid;
+	char *blob_sub_dir, *last, *mkpath, *name, *path;
+	int ret;
+	off_t offset;
+	u_int8_t keybuf[BLOB_KEY_SIZE], *ptr;
+
+	ret = 0;
+	db_rep = env->rep_handle;
+	rep = db_rep->region;
+	infop = env->reginfo;
+	dbc = NULL;
+	blob_sub_dir = name = NULL;
+	path = NULL;
+	fhp = NULL;
+	/* The data DBT points at offset, so never leave it indeterminate. */
+	offset = 0;
+
+	/* Cheap unlocked check; it is repeated below under the locks. */
+	if (rep->sync_state != SYNC_PAGE)
+		return (DB_REP_PAGEDONE);
+
+	if ((ret = __rep_blob_chunk_unmarshal(
+	    env, &rbc, rec->data, rec->size, &ptr)) != 0)
+		return (ret);
+
+	MUTEX_LOCK(env, rep->mtx_clientdb);
+	REP_SYSTEM_LOCK(env);
+	/*
+	 * Check if the world changed.
+	 */
+	if (rep->sync_state != SYNC_PAGE) {
+		ret = DB_REP_PAGEDONE;
+		goto err;
+	}
+	/*
+	 * We should not ever be in internal init with a lease granted.
+	 */
+	DB_ASSERT(env,
+	    !IS_USING_LEASES(env) || __rep_islease_granted(env) == 0);
+
+	/* Make sure this is for the current file. */
+	GET_CURINFO(rep, infop, rfp);
+	GET_LO_HI(env, rfp->blob_fid_lo, rfp->blob_fid_hi, blob_fid, ret);
+	if (ret != 0)
+		goto err;
+
+	if (blob_fid != (db_seq_t)rbc.blob_fid) {
+		ret = DB_REP_PAGEDONE;
+		goto err;
+	}
+
+	RPRINT(env, (env, DB_VERB_REP_SYNC,
+"REP_BLOB_CHUNK: blob_fid %llu, blob_sid %llu, blob_id %llu, offset %llu",
+	    (unsigned long long)rbc.blob_fid,
+	    (unsigned long long)rbc.blob_sid,
+	    (unsigned long long)rbc.blob_id,
+	    (unsigned long long)rbc.offset));
+
+	if (db_rep->blob_dbp == NULL &&
+	    (ret = __rep_client_dbinit(env, 0, REP_BLOB)) != 0) {
+		RPRINT(env, (env, DB_VERB_REP_SYNC,
+		    "REP_BLOB_CHUNK: Client_dbinit %s",
+		    db_strerror(ret)));
+		goto err;
+	}
+
+	/*
+	 * Set the highest blob chunk received.  Chunks are ordered by
+	 * (blob_sid, blob_id, offset), compared in that order.
+	 */
+	if (rbc.blob_sid > (u_int64_t)rep->gap_bl_hi_sid ||
+	    (rbc.blob_sid == (u_int64_t)rep->gap_bl_hi_sid &&
+	    rbc.blob_id > (u_int64_t)rep->gap_bl_hi_id) ||
+	    (rbc.blob_sid == (u_int64_t)rep->gap_bl_hi_sid &&
+	    rbc.blob_id == (u_int64_t)rep->gap_bl_hi_id &&
+	    rbc.offset > (u_int64_t)rep->gap_bl_hi_off)) {
+		rep->gap_bl_hi_id = (db_seq_t)rbc.blob_id;
+		rep->gap_bl_hi_sid = (db_seq_t)rbc.blob_sid;
+		rep->gap_bl_hi_off = (off_t)rbc.offset;
+	}
+
+	/*
+	 * Gap database records: the key is blob_sid followed by blob_id,
+	 * the data is the chunk offset.
+	 */
+	memset(&key, 0, sizeof(DBT));
+	memset(&data, 0, sizeof(DBT));
+	data.flags = key.flags = DB_DBT_USERMEM;
+	key.data = keybuf;
+	key.ulen = key.size = BLOB_KEY_SIZE;
+	data.data = (void *)&offset;
+	data.ulen = data.size = sizeof(offset);
+	/* BLOB_DELETE is set if the blob file was deleted. */
+	if (F_ISSET(&rbc, BLOB_DELETE)) {
+		memcpy(keybuf, &rbc.blob_sid, BLOB_ID_SIZE);
+		memcpy(&(keybuf[BLOB_ID_SIZE]), &rbc.blob_id, BLOB_ID_SIZE);
+		if ((ret = __db_del(
+		    db_rep->blob_dbp, ip, NULL, &key, 0)) != 0) {
+			/*
+			 * NOTE(review): a missing key skips the "done"
+			 * processing here, unlike the DB_GET_BOTH case
+			 * below -- confirm this is intended.
+			 */
+			if (ret == DB_NOTFOUND)
+				ret = 0;
+			goto err;
+		}
+		goto done;
+	}
+
+	if ((ret = __db_cursor(db_rep->blob_dbp, ip, NULL, &dbc, 0)) != 0)
+		goto err;
+	offset = (off_t)rbc.offset;
+	memcpy(keybuf, &rbc.blob_sid, BLOB_ID_SIZE);
+	memcpy(&(keybuf[BLOB_ID_SIZE]), &rbc.blob_id, BLOB_ID_SIZE);
+	/* If not found we have already dealt with this chunk. */
+	if ((ret = __dbc_get(dbc, &key, &data, DB_GET_BOTH)) != 0) {
+		if (ret == DB_NOTFOUND) {
+			ret = 0;
+			goto done;
+		}
+		goto err;
+	}
+	/*
+	 * BLOB_CHUNK_FAIL is set if the blob file was truncated to shorter
+	 * than the BLOB_CHUNK offset.
+	 */
+	if (F_ISSET(&rbc, BLOB_CHUNK_FAIL)) {
+		/* Drop this record and every following duplicate. */
+		while (ret == 0) {
+			if ((ret = __dbc_del(dbc, 0)) != 0)
+				goto err;
+			ret = __dbc_get(dbc, &key, &data, DB_NEXT_DUP);
+		}
+		/* Anything but running off the duplicates is an error. */
+		if (ret != DB_NOTFOUND)
+			goto err;
+		/*
+		 * Forget the cursor before checking the close result so the
+		 * cleanup code below cannot close it a second time.
+		 */
+		ret = __dbc_close(dbc);
+		dbc = NULL;
+		if (ret != 0)
+			goto err;
+		goto done;
+	}
+	if ((ret = __dbc_del(dbc, 0)) != 0)
+		goto err;
+	ret = __dbc_close(dbc);
+	dbc = NULL;
+	if (ret != 0)
+		goto err;
+
+	if ((ret = __blob_make_sub_dir(env, &blob_sub_dir,
+	    (db_seq_t)rbc.blob_fid, (db_seq_t)rbc.blob_sid)) != 0)
+		goto err;
+
+	if ((ret = __blob_id_to_path(
+	    env, blob_sub_dir, (db_seq_t)rbc.blob_id, &name)) != 0)
+		goto err;
+
+	if ((ret = __db_appname(env, DB_APP_BLOB, name, NULL, &path)) != 0)
+		goto err;
+
+	/*
+	 * Temporarily cut the path at its last separator to test whether
+	 * the containing directory exists, creating it if it does not.
+	 */
+	last = __db_rpath(path);
+	DB_ASSERT(env, last != NULL);
+	*last = '\0';
+	if (__os_exists(env, path, NULL) != 0) {
+		*last = PATH_SEPARATOR[0];
+		mkpath = path;
+#ifdef DB_WIN32
+		/*
+		 * Absolute paths on windows can result in it creating a "C"
+		 * or "D" directory in the working directory.
+		 */
+		if (__os_abspath(mkpath))
+			mkpath += 2;
+#endif
+		if ((ret = __db_mkpath(env, mkpath)) != 0)
+			goto err;
+	}
+	*last = PATH_SEPARATOR[0];
+	if ((ret = __os_open(
+	    env, path, 0, DB_OSO_CREATE, env->db_mode, &fhp)) != 0)
+		goto err;
+
+	/* Write the data into the blob file. */
+	if ((ret = __fop_write_file(env, NULL, name, NULL, DB_APP_BLOB,
+	    fhp, (off_t)rbc.offset, rbc.data.data, rbc.data.size, 0)) != 0)
+		goto err;
+	/* As with the cursor: clear the handle before testing the result. */
+	ret = __os_closehandle(env, fhp);
+	fhp = NULL;
+	if (ret != 0)
+		goto err;
+
+done:	ret = __rep_blobdone(env, eid, ip, rep, blob_fid, 0);
+
+err:	REP_SYSTEM_UNLOCK(env);
+	MUTEX_UNLOCK(env, rep->mtx_clientdb);
+	if (path != NULL)
+		__os_free(env, path);
+	if (blob_sub_dir != NULL)
+		__os_free(env, blob_sub_dir);
+	if (name != NULL)
+		__os_free(env, name);
+	if (fhp != NULL)
+		(void)__os_closehandle(env, fhp);
+	if (dbc != NULL)
+		(void)__dbc_close(dbc);
+
+	return (ret);
+}
+
+/*
* __rep_write_page -
* Write this page into a database.
*/
@@ -1801,13 +3079,16 @@ __rep_write_page(env, ip, rep, msgfp)
DB_PGINFO *pginfo;
DB_REP *db_rep;
REGINFO *infop;
+ APPNAME appname;
__rep_fileinfo_args *rfp;
+ char *blob_path;
int ret;
void *dst;
db_rep = env->rep_handle;
infop = env->reginfo;
rfp = NULL;
+ blob_path = NULL;
/*
* If this is the first page we're putting in this database, we need
@@ -1830,15 +3111,39 @@ __rep_write_page(env, ip, rep, msgfp)
RPRINT(env, (env, DB_VERB_REP_SYNC,
"rep_write_page: Calling fop_create for %s",
(char *)rfp->info.data));
+ appname = (__rep_is_internal_rep_file(rfp->info.data) ?
+ DB_APP_META : (IS_BLOB_META((char *)rfp->info.data)
+ ? DB_APP_BLOB : DB_APP_DATA));
+ /*
+ * May have to create the directory structure for blob
+ * metadata databases.
+ */
+ if (appname == DB_APP_BLOB) {
+ if ((ret = __db_appname(env,
+ appname, rfp->info.data,
+ (const char **)&rfp->dir.data,
+ &blob_path)) != 0)
+ goto err;
+#ifdef DB_WIN32
+ /*
+ * Absolute paths on windows can result in
+ * it creating a "C" or "D"
+ * directory in the working directory.
+ */
+ if (__os_abspath(blob_path))
+ blob_path += 2;
+#endif
+ if ((ret = __db_mkpath(env, blob_path)) != 0)
+ goto err;
+ }
if ((ret = __fop_create(env, NULL, NULL,
rfp->info.data, (const char **)&rfp->dir.data,
- __rep_is_internal_rep_file(rfp->info.data) ?
- DB_APP_META : DB_APP_DATA, env->db_mode, 0)) != 0) {
+ appname, env->db_mode, 0)) != 0) {
/*
* If fop_create fails, it could be because
* the client has a different data_dir
* structure than the master. Retry with the
- * local, default settings.
+ * local, default settings.
*/
RPRINT(env, (env, DB_VERB_REP_SYNC,
"rep_write_page: fop_create ret %d. Retry for %s, master datadir %s",
@@ -1929,7 +3234,10 @@ __rep_write_page(env, ip, rep, msgfp)
ret = __memp_fput(db_rep->file_mpf,
ip, dst, db_rep->file_dbp->priority);
-err: return (ret);
+err: if (blob_path != NULL)
+ __os_free(env, blob_path);
+
+ return (ret);
}
/*
@@ -1976,7 +3284,7 @@ __rep_page_gap(env, rep, msgfp, type)
* Make sure we're still talking about the same file.
* If not, we're done here.
*/
- if (rfp->filenum != msgfp->filenum) {
+ if (rfp->filenum != msgfp->filenum || rep->blob_sync != 0) {
ret = DB_REP_PAGEDONE;
goto err;
}
@@ -2135,6 +3443,53 @@ err:
}
/*
+ * __rep_blob_cleanup -
+ * Clean up blob internal init information.
+ *
+ * Caller must hold client database mutex (mtx_clientdb) and
+ * REP_SYSTEM_LOCK.
+ */
+static int
+__rep_blob_cleanup(env, rep)
+	ENV *env;
+	REP *rep;
+{
+	DB_REP *db_rep;
+	DB_THREAD_INFO *ip;
+	int close_ret, ret;
+	u_int32_t nrecs;
+
+	ret = 0;
+	db_rep = env->rep_handle;
+
+	/*
+	 * Whatever is still recorded in the blob chunk database describes
+	 * chunks that never arrived.  Empty it and close the handle so the
+	 * stale entries cannot interfere with how the next REP_BLOB_UPDATE
+	 * message is handled.  The first failure, if any, is what we report.
+	 */
+	if (db_rep->blob_dbp != NULL) {
+		ENV_GET_THREAD_INFO(env, ip);
+		ret = __db_truncate(db_rep->blob_dbp, ip, NULL, &nrecs);
+		close_ret = __db_close(db_rep->blob_dbp, NULL, DB_NOSYNC);
+		db_rep->blob_dbp = NULL;
+		if (ret == 0)
+			ret = close_ret;
+	}
+
+	/* Put every blob internal init control value back to zero. */
+	rep->blob_sync = 0;
+	rep->blob_rereq = 0;
+	rep->blob_more_files = 0;
+	rep->highest_id = 0;
+	rep->gap_bl_hi_off = 0;
+	rep->gap_bl_hi_sid = 0;
+	rep->gap_bl_hi_id = 0;
+	rep->last_blob_sid = 0;
+	rep->last_blob_id = 0;
+	rep->prev_blob_sid = 0;
+	rep->prev_blob_id = 0;
+
+	return (ret);
+}
+
+/*
* __rep_init_cleanup -
* Clean up internal initialization pieces.
*
@@ -2162,9 +3517,10 @@ __rep_init_cleanup(env, rep, force)
/*
* 1. Close up the file data pointer we used.
* 2. Close/reset the page database.
- * 3. Close/reset the queue database if we're forcing a cleanup.
- * 4. Free current file info.
- * 5. If we have all files or need to force, free original file info.
+ * 3. Close/truncate the blob chunk gap database.
+ * 4. Close/reset the queue database if we're forcing a cleanup.
+ * 5. Free current file info.
+ * 6. If we have all files or need to force, free original file info.
*/
if (db_rep->file_mpf != NULL) {
ret = __memp_fclose(db_rep->file_mpf, 0);
@@ -2176,6 +3532,15 @@ __rep_init_cleanup(env, rep, force)
if (ret == 0)
ret = t_ret;
}
+ /*
+ * Truncate the blob chunk gap database, since entries in the database
+ * are for blob chunks we are expecting to arrive. Also reset blob
+ * internal init control values.
+ */
+ t_ret = __rep_blob_cleanup(env, rep);
+ if (ret == 0)
+ ret = t_ret;
+
if (force && db_rep->queue_dbc != NULL) {
queue_dbp = db_rep->queue_dbc->dbp;
if ((t_ret = __dbc_close(db_rep->queue_dbc)) != 0 && ret == 0)
@@ -2324,8 +3689,8 @@ __rep_clean_interrupted(env)
* __rep_filedone -
* We need to check if we're done with the current file after
* processing the current page. Stat the database to see if
- * we have all the pages. If so, we need to clean up/close
- * this one, set up for the next one, and ask for its pages,
+ * we have all the pages and blobs. If so, we need to clean up/close
+ * this one, set up for the next one, and ask for its pages and blobs,
* or if this is the last file, request the log records and
* move to the REP_RECOVER_LOG state.
*/
@@ -2338,9 +3703,14 @@ __rep_filedone(env, ip, eid, rep, msgfp, type)
__rep_fileinfo_args *msgfp;
u_int32_t type;
{
+ DBT msg;
REGINFO *infop;
__rep_fileinfo_args *rfp;
+ __rep_blob_update_req_args rbur;
int ret;
+ u_int8_t buf[__REP_BLOB_UPDATE_REQ_SIZE];
+
+ memset(&msg, 0, sizeof(DBT));
/*
* We've put our page, now we need to do any gap processing
@@ -2375,8 +3745,96 @@ __rep_filedone(env, ip, eid, rep, msgfp, type)
((ret = __rep_queue_filedone(env, ip, rep, rfp)) !=
DB_REP_PAGEDONE))
return (ret);
+
+ /* Request blob files. */
+ if (rfp->blob_fid_lo != 0 || rfp->blob_fid_hi != 0) {
+ ret = 0;
+ rep->blob_sync = 1;
+ memset(&rbur, 0, sizeof(__rep_blob_update_req_args));
+ GET_LO_HI(env,
+ rfp->blob_fid_lo, rfp->blob_fid_hi, rbur.blob_fid, ret);
+ msg.size = __REP_BLOB_UPDATE_REQ_SIZE;
+ msg.data = buf;
+ __rep_blob_update_req_marshal(env, &rbur, msg.data);
+ (void)__rep_send_message(env,
+ rep->master_id, REP_BLOB_UPDATE_REQ, NULL, &msg, 0, 0);
+ return (ret);
+ }
+
+ /*
+ * We have all the data for this file. Clean up.
+ */
+ if ((ret = __rep_init_cleanup(env, rep, 0)) != 0)
+ return (ret);
+
+ rep->curfile++;
+ ret = __rep_nextfile(env, eid, rep);
+
+ return (ret);
+}
+
+/*
+ * __rep_blobdone -
+ * We need to check if we're done with the current file after
+ * processing the current blob chunk.
+ *
+ * Caller must hold client database mutex (mtx_clientdb) and
+ * REP_SYSTEM_LOCK.
+ */
+static int
+__rep_blobdone(env, eid, ip, rep, blob_fid, force)
+ ENV *env;
+ int eid;
+ DB_THREAD_INFO *ip;
+ REP *rep;
+ db_seq_t blob_fid;
+ int force;
+{
+ DBT msg;
+ __rep_blob_update_req_args rbur;
+ int done, ret;
+ u_int8_t buf[__REP_BLOB_UPDATE_REQ_SIZE];
+
/*
- * We have all the pages for this file. Clean up.
+ * We've written our blob chunk, now we need to do any gap processing
+ * that might be needed to re-request chunks.
+ */
+ done = 0;
+ ret = __rep_blob_chunk_gap(env, eid, ip, rep, &done, blob_fid, force);
+ /*
+ * The world changed while we were doing gap processing.
+ * We're done here.
+ */
+ if (ret == DB_REP_PAGEDONE)
+ return (0);
+ else if (ret != 0)
+ goto err;
+
+ /*
+ * If the blob database is empty then all files in the current list
+ * have been processed. However, there may be more files on the
+ * master, so request the next list if that is the case.
+ */
+ if (done && rep->blob_more_files) {
+ memset(&rbur, 0, sizeof(__rep_blob_update_req_args));
+ rbur.blob_fid = (u_int64_t)blob_fid;
+ rbur.blob_sid = (u_int64_t)rep->last_blob_sid;
+ rbur.blob_id = (u_int64_t)rep->last_blob_id;
+ rbur.highest_id = (u_int64_t)rep->highest_id;
+ rep->gap_bl_hi_id = rep->gap_bl_hi_sid = 0;
+ rep->gap_bl_hi_off = 0;
+ rep->blob_rereq = 0;
+ msg.size = __REP_BLOB_UPDATE_REQ_SIZE;
+ msg.data = buf;
+ __rep_blob_update_req_marshal(env, &rbur, msg.data);
+ (void)__rep_send_message(env,
+ rep->master_id, REP_BLOB_UPDATE_REQ, NULL, &msg, 0, 0);
+ return (0);
+ } else if (!done)
+ return (0);
+
+ /*
+ * We have all the data for this file. Clean up.
*/
if ((ret = __rep_init_cleanup(env, rep, 0)) != 0)
goto err;
@@ -2388,6 +3846,255 @@ err:
}
/*
+ * __rep_blob_chunk_gap -
+ * We have written a blob chunk. Now check if there are any that need
+ * to be re-requested. The blob chunk gap database contains
+ * descriptions of all the blob chunks that have yet to arrive.
+ *
+ * Caller must hold client database mutex (mtx_clientdb) and
+ * REP_SYSTEM_LOCK.
+ */
+static int
+__rep_blob_chunk_gap(env, eid, ip, rep, done, blob_fid, force)
+	ENV *env;
+	int eid;
+	DB_THREAD_INFO *ip;
+	REP *rep;
+	int *done;
+	db_seq_t blob_fid;
+	int force;
+{
+	DBC *dbc;
+	DBT data, high, key, msg;
+	DB_LOG *dblp;
+	DB_REP *db_rep;
+	LOG *lp;
+	REGINFO *infop;
+	__rep_blob_chunk_req_args rbcr;
+	__rep_fileinfo_args *rfp;
+	db_seq_t cur_blob_fid;
+	off_t offset;
+	int cmp, past_high, ret;
+	u_int8_t buf[BLOB_KEY_SIZE], msgbuf[__REP_BLOB_CHUNK_REQ_SIZE];
+
+	db_rep = env->rep_handle;
+	dblp = env->lg_handle;
+	lp = dblp->reginfo.primary;
+	infop = env->reginfo;
+	ret = 0;
+	dbc = NULL;
+	*done = 0;
+	/* Only size and data are set below; zero the remaining fields. */
+	memset(&msg, 0, sizeof(DBT));
+
+	/* eid will be used when peer-to-peer is re-enabled for blobs. */
+	COMPQUIET(eid, 0);
+
+	/*
+	 * Make sure we're still talking about the same file.
+	 * If not, we're done here.
+	 */
+	GET_CURINFO(rep, infop, rfp);
+	GET_LO_HI(env, rfp->blob_fid_lo, rfp->blob_fid_hi, cur_blob_fid, ret);
+	if (ret != 0)
+		goto err;
+	if (cur_blob_fid != blob_fid) {
+		ret = DB_REP_PAGEDONE;
+		goto err;
+	}
+
+	/* Get the first missing blob chunk. */
+	if ((ret = __db_cursor(db_rep->blob_dbp, ip, NULL, &dbc, 0)) != 0)
+		goto err;
+	memset(&key, 0, sizeof(DBT));
+	memset(&data, 0, sizeof(DBT));
+	ret = __dbc_get(dbc, &key, &data, DB_FIRST);
+	if (ret == DB_NOTFOUND) {
+		/* All blobs received. */
+		ret = 0;
+		*done = 1;
+		goto err;
+	} else if (ret != 0)
+		goto err;
+
+	DB_ASSERT(env, key.size == BLOB_KEY_SIZE);
+	DB_ASSERT(env, data.size == sizeof(off_t));
+	/* Copy rather than dereference: no alignment assumption needed. */
+	memcpy(&offset, data.data, sizeof(off_t));
+	/*
+	 * Format the sdbid and id of the high chunk as a blob gap
+	 * database key, so it can be compared with the entries in that
+	 * database.
+	 */
+	memset(&high, 0, sizeof(DBT));
+	memcpy(buf, &rep->gap_bl_hi_sid, BLOB_ID_SIZE);
+	memcpy(buf + BLOB_ID_SIZE, &rep->gap_bl_hi_id, BLOB_ID_SIZE);
+	high.data = buf;
+	high.size = BLOB_KEY_SIZE;
+
+	/*
+	 * past_high is true when the chunk under the cursor sorts after the
+	 * highest chunk received so far.
+	 */
+	cmp = __rep_blob_cmp(NULL, &key, &high, NULL);
+	past_high = cmp > 0 || (cmp == 0 && offset > rep->gap_bl_hi_off);
+
+	/*
+	 * If the first chunk in the database is larger than the highest chunk
+	 * received, then there is no gap.
+	 *
+	 * If a gap does exist, check if it is time to do a re-request.  If so,
+	 * re-request every chunk that exists before the highest received.
+	 */
+	if (!force && past_high) {
+		lp->wait_ts = db_rep->request_gap;
+		__os_gettime(env, &lp->rcvd_ts, 1);
+	} else if (force || __rep_check_doreq(env, rep)) {
+		/*
+		 * Re-request every chunk less than the highest one, plus the
+		 * next blob chunk that we are expecting.  The next expected
+		 * blob chunk is requested in case the last blob chunk is lost
+		 * in transit.
+		 */
+		for (;;) {
+			memset(&rbcr, 0, sizeof(__rep_blob_chunk_req_args));
+			memcpy(&(rbcr.blob_sid), key.data, BLOB_ID_SIZE);
+			memcpy(&(rbcr.blob_id),
+			    (u_int8_t *)key.data + BLOB_ID_SIZE, BLOB_ID_SIZE);
+			rbcr.offset = (u_int64_t)offset;
+			rbcr.blob_fid = (u_int64_t)blob_fid;
+			msg.size = __REP_BLOB_CHUNK_REQ_SIZE;
+			msg.data = msgbuf;
+			RPRINT(env, (env, DB_VERB_REP_SYNC,
+"blob_chunk_gap: Req file_id %llu, sdb_id %llu, blob_id %llu, offset %llu",
+			    (unsigned long long)rbcr.blob_fid,
+			    (unsigned long long)rbcr.blob_sid,
+			    (unsigned long long)rbcr.blob_id,
+			    (unsigned long long)rbcr.offset));
+			__rep_blob_chunk_req_marshal(env, &rbcr, msg.data);
+			/*
+			 * Note that peer-to-peer initialization is not
+			 * supported for blobs.
+			 */
+			(void)__rep_send_message(
+			    env, rep->master_id,
+			    REP_BLOB_CHUNK_REQ, NULL, &msg, 0, 0);
+			/*
+			 * Break after requesting the chunk after the highest
+			 * one.
+			 */
+			if (past_high)
+				break;
+			if ((ret = __dbc_get(
+			    dbc, &key, &data, DB_NEXT)) != 0) {
+				if (ret == DB_NOTFOUND) {
+					ret = 0;
+					break;
+				}
+				goto err;
+			}
+			/*
+			 * Re-read the offset of the record just fetched so
+			 * the comparison against the highest chunk is made
+			 * with this record's offset, not the first one's.
+			 */
+			DB_ASSERT(env, key.size == BLOB_KEY_SIZE);
+			DB_ASSERT(env, data.size == sizeof(off_t));
+			memcpy(&offset, data.data, sizeof(off_t));
+			cmp = __rep_blob_cmp(NULL, &key, &high, NULL);
+			past_high = cmp > 0 ||
+			    (cmp == 0 && offset > rep->gap_bl_hi_off);
+		}
+	}
+
+err:	if (dbc != NULL)
+		(void)__dbc_close(dbc);
+
+	return (ret);
+}
+
+/*
+ * __rep_blob_chunk_req
+ *	Service a request for one blob chunk: read up to MEGABYTE bytes of
+ *	the named blob file starting at the requested offset and send them
+ *	to the requesting site in a REP_BLOB_CHUNK message.  The reply is
+ *	flagged BLOB_DELETE if the file no longer exists and BLOB_CHUNK_FAIL
+ *	if nothing could be read at that offset.
+ *
+ * PUBLIC: int __rep_blob_chunk_req __P((ENV *, int, DBT *));
+ */
+int
+__rep_blob_chunk_req(env, eid, rec)
+	ENV *env;
+	int eid;
+	DBT *rec;
+{
+	DB *dbp;
+	DBT msg;
+	DB_FH *fhp;
+	__rep_blob_chunk_args rbc;
+	__rep_blob_chunk_req_args rbcr;
+	int ret;
+	u_int8_t *data_buf, *reply_buf, *ptr;
+
+	data_buf = reply_buf = NULL;
+	fhp = NULL;
+	dbp = NULL;
+
+	/* One buffer for the marshalled reply, one for the raw file data. */
+	ret = __os_malloc(env, MEGABYTE + __REP_BLOB_CHUNK_SIZE, &reply_buf);
+	if (ret != 0)
+		goto err;
+	memset(&msg, 0, sizeof(DBT));
+	msg.ulen = MEGABYTE + __REP_BLOB_CHUNK_SIZE;
+	msg.data = reply_buf;
+
+	ret = __os_malloc(env, MEGABYTE, &data_buf);
+	if (ret != 0)
+		goto err;
+	memset(&rbc, 0, sizeof(__rep_blob_chunk_args));
+	rbc.data.flags = DB_DBT_USERMEM;
+	rbc.data.ulen = MEGABYTE;
+	rbc.data.data = data_buf;
+
+	ret = __rep_blob_chunk_req_unmarshal(
+	    env, &rbcr, rec->data, rec->size, &ptr);
+	if (ret != 0)
+		goto err;
+
+	RPRINT(env, (env, DB_VERB_REP_SYNC,
+	    "blob_chunk_req: file_id %llu, sdbid %llu, id %llu, offset %llu",
+	    (long long)rbcr.blob_fid, (long long)rbcr.blob_sid,
+	    (long long)rbcr.blob_id, (long long)rbcr.offset));
+
+	/* The reply identifies the chunk exactly as the request did. */
+	rbc.offset = rbcr.offset;
+	rbc.blob_sid = rbcr.blob_sid;
+	rbc.blob_id = rbcr.blob_id;
+	rbc.blob_fid = rbcr.blob_fid;
+
+	/* A throwaway handle carrying the ids needed to open the file. */
+	if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+		goto err;
+	dbp->blob_file_id = (db_seq_t)rbcr.blob_fid;
+	dbp->blob_sdb_id = (db_seq_t)rbcr.blob_sid;
+	ret = __blob_make_sub_dir(env, &dbp->blob_sub_dir,
+	    (db_seq_t)rbcr.blob_fid, (db_seq_t)rbcr.blob_sid);
+	if (ret != 0)
+		goto err;
+
+	ret = __blob_file_open(
+	    dbp, &fhp, (db_seq_t)rbcr.blob_id, DB_FOP_READONLY, 0);
+	if (ret != 0) {
+		if (ret != ENOENT)
+			goto err;
+		/*
+		 * The file may have been deleted between creating the
+		 * list and sending the request.  Send a message saying
+		 * the file has been deleted.
+		 */
+		ret = 0;
+		F_SET(&rbc, BLOB_DELETE);
+		rbc.data.size = 0;
+		__rep_blob_chunk_marshal(env, &rbc, msg.data);
+		msg.size = __REP_BLOB_CHUNK_SIZE;
+		(void)__rep_send_message(
+		    env, eid, REP_BLOB_CHUNK, NULL, &msg, 0, 0);
+		goto err;
+	}
+
+	ret = __blob_file_read(
+	    env, fhp, &rbc.data, (off_t)rbcr.offset, MEGABYTE);
+	if (ret != 0)
+		goto err;
+	DB_ASSERT(env, rbc.data.size <= MEGABYTE);
+
+	/*
+	 * In rare cases the blob file may have gotten shorter
+	 * since the list was created.
+	 */
+	if (rbc.data.size == 0)
+		F_SET(&rbc, BLOB_CHUNK_FAIL);
+	__rep_blob_chunk_marshal(env, &rbc, msg.data);
+	msg.size = __REP_BLOB_CHUNK_SIZE + rbc.data.size;
+	(void)__rep_send_message(env, eid, REP_BLOB_CHUNK, NULL, &msg, 0, 0);
+
+err:	if (data_buf != NULL)
+		__os_free(env, data_buf);
+	if (reply_buf != NULL)
+		__os_free(env, reply_buf);
+	if (fhp != NULL)
+		(void)__os_closehandle(env, fhp);
+	if (dbp != NULL)
+		(void)__db_close(dbp, NULL, 0);
+	return (ret);
+}
+
+/*
* Starts requesting pages for the next file in the list (if any), or if not,
* proceeds to the next stage: requesting logs.
*
@@ -2404,19 +4111,25 @@ __rep_nextfile(env, eid, rep)
DBT dbt;
__rep_logreq_args lr_args;
DB_LOG *dblp;
+ DB_REP *db_rep;
+ DELAYED_BLOB_LIST *dbl;
LOG *lp;
REGENV *renv;
REGINFO *infop;
__rep_fileinfo_args *curinfo, *rfp, rf;
__rep_fileinfo_v6_args *rfpv6;
- int *curbuf, ret;
+ __rep_fileinfo_v7_args *rfpv7;
+ int *curbuf, ret, view_partial;
u_int8_t *buf, *info_ptr, lrbuf[__REP_LOGREQ_SIZE], *nextinfo;
size_t len, msgsz;
+ char *name;
void *rffree;
infop = env->reginfo;
renv = infop->primary;
+ db_rep = env->rep_handle;
rfp = NULL;
+ dbl = NULL;
/*
* Always direct the next request to the master (at least nominally),
@@ -2430,13 +4143,13 @@ __rep_nextfile(env, eid, rep)
/* Set curinfo to next file and examine it. */
info_ptr = R_ADDR(infop,
rep->originfo_off + (rep->originfolen - rep->infolen));
+ /*
+ * Build a current struct by copying in the older
+ * version struct and then setting up the new fields.
+ * This is safe because all old fields are in the
+ * same location in the current struct.
+ */
if (rep->infoversion < DB_REPVERSION_53) {
- /*
- * Build a current struct by copying in the older
- * version struct and then setting up the data_dir.
- * This is safe because all old fields are in the
- * same location in the current struct.
- */
if ((ret = __rep_fileinfo_v6_unmarshal(env,
rep->infoversion, &rfpv6,
info_ptr, rep->infolen, &nextinfo)) != 0)
@@ -2444,8 +4157,18 @@ __rep_nextfile(env, eid, rep)
memcpy(&rf, rfpv6, sizeof(__rep_fileinfo_v6_args));
rf.dir.data = NULL;
rf.dir.size = 0;
+ rf.blob_fid_lo = rf.blob_fid_hi = 0;
rfp = &rf;
rffree = rfpv6;
+ } else if (rep->infoversion < DB_REPVERSION_61) {
+ if ((ret = __rep_fileinfo_v7_unmarshal(env,
+ rep->infoversion, &rfpv7,
+ info_ptr, rep->infolen, &nextinfo)) != 0)
+ return (ret);
+ memcpy(&rf, rfpv7, sizeof(__rep_fileinfo_v7_args));
+ rf.blob_fid_lo = rf.blob_fid_hi = 0;
+ rfp = &rf;
+ rffree = rfpv7;
} else {
if ((ret = __rep_fileinfo_unmarshal(env,
rep->infoversion, &rfp, info_ptr,
@@ -2457,6 +4180,14 @@ __rep_nextfile(env, eid, rep)
}
rffree = rfp;
}
+#ifndef HAVE_64BIT_TYPES
+ if (rfp->blob_fid_lo != 0 || rfp->blob_fid_hi != 0) {
+ __db_errx(env, DB_STR("3705",
+ "Blobs require 64 integer compiler support."));
+ __os_free(env, rffree);
+ return (DB_OPNOTSUP);
+ }
+#endif
rep->infolen -= (u_int32_t)(nextinfo - info_ptr);
MUTEX_LOCK(env, renv->mtx_regenv);
ret = __env_alloc(infop, sizeof(__rep_fileinfo_args) +
@@ -2484,19 +4215,55 @@ __rep_nextfile(env, eid, rep)
rfp->dir.data, rfp->dir.size);
__os_free(env, rffree);
- /* Skip over regular DB's in "abbreviated" internal inits. */
- if (F_ISSET(rep, REP_F_ABBREVIATED) &&
+ /*
+ * If a partial callback is set, invoke the callback to see if
+ * this file should be replicated.
+ */
+ if (IS_VIEW_SITE(env) && curinfo->info.size > 0 &&
!FLD_ISSET(curinfo->db_flags, DB_AM_INMEM)) {
+ name = (char *)curinfo->info.data;
+ DB_ASSERT(env, db_rep->partial != NULL);
+ /*
+ * Always replicate system owned databases.
+ */
+ if (IS_DB_FILE(name) && !IS_BLOB_META(name))
+ view_partial = 1;
+ else if ((ret = __rep_call_partial(env,
+ name, &view_partial, 0, &dbl)) != 0) {
+ VPRINT(env, (env, DB_VERB_REP_SYNC,
+ "rep_nextfile: partial cb err %d for %s",
+ ret, name));
+ return (ret);
+ }
+ /*
+ * dbl != NULL when we could not find the name of the
+ * database that owns a blob meta database. If that
+ * happens then it was never opened, which means it
+ * was not replicated, and as such neither should its
+ * bmd be replicated.
+ */
+ if (dbl != NULL) {
+ view_partial = 0;
+ __os_free(env, dbl);
+ dbl = NULL;
+ }
VPRINT(env, (env, DB_VERB_REP_SYNC,
- "Skipping file %d in abbreviated internal init",
- curinfo->filenum));
- MUTEX_LOCK(env, renv->mtx_regenv);
- __env_alloc_free(infop,
- R_ADDR(infop, rep->curinfo_off));
- MUTEX_UNLOCK(env, renv->mtx_regenv);
- rep->curinfo_off = INVALID_ROFF;
- rep->curfile++;
- continue;
+ "rep_nextfile: %s file %s %d on view site.",
+ view_partial == 0 ?
+ "Skipping" : "Replicating",
+ name, curinfo->filenum));
+ /*
+ * If we're skipping the file, move to the next one.
+ */
+ if (view_partial == 0) {
+ MUTEX_LOCK(env, renv->mtx_regenv);
+ __env_alloc_free(infop,
+ R_ADDR(infop, rep->curinfo_off));
+ MUTEX_UNLOCK(env, renv->mtx_regenv);
+ rep->curinfo_off = INVALID_ROFF;
+ rep->curfile++;
+ continue;
+ }
}
/* Request this file's pages. */
@@ -2519,15 +4286,19 @@ __rep_nextfile(env, eid, rep)
curinfo->uid.size + curinfo->info.size;
if ((ret = __os_calloc(env, 1, msgsz, &buf)) != 0)
return (ret);
+ /*
+ * It is safe to cast to the old structs
+ * because the first part of the current
+ * struct matches the old structs.
+ */
if (rep->infoversion < DB_REPVERSION_53)
- /*
- * It is safe to cast to the old struct
- * because the first part of the current
- * struct matches the old struct.
- */
ret = __rep_fileinfo_v6_marshal(env, rep->infoversion,
(__rep_fileinfo_v6_args *)curinfo, buf,
msgsz, &len);
+ else if (rep->infoversion < DB_REPVERSION_61)
+ ret = __rep_fileinfo_v7_marshal(env, rep->infoversion,
+ (__rep_fileinfo_v7_args *)curinfo, buf,
+ msgsz, &len);
else
ret = __rep_fileinfo_marshal(env, rep->infoversion,
curinfo, buf, msgsz, &len);
@@ -2834,16 +4605,19 @@ __rep_pggap_req(env, rep, reqfp, gapflags)
* new info into rep->finfo. Assert that the sizes never
* change. The only thing this should do is change
* the pgno field. Everything else remains the same.
+ *
+ * It is safe to cast to the old structs
+ * because the first part of the current
+ * struct matches the old structs.
*/
if (rep->infoversion < DB_REPVERSION_53)
- /*
- * It is safe to cast to the old struct
- * because the first part of the current
- * struct matches the old struct.
- */
ret = __rep_fileinfo_v6_marshal(env, rep->infoversion,
(__rep_fileinfo_v6_args *)tmpfp, buf,
msgsz, &len);
+ else if (rep->infoversion < DB_REPVERSION_61)
+ ret = __rep_fileinfo_v7_marshal(env, rep->infoversion,
+ (__rep_fileinfo_v7_args *)tmpfp, buf,
+ msgsz, &len);
else
ret = __rep_fileinfo_marshal(env, rep->infoversion,
tmpfp, buf, msgsz, &len);
@@ -2865,6 +4639,94 @@ err:
}
/*
+ * __rep_blob_rereq -
+ *
+ * Re-request lost blob messages, such as REP_BLOB_CHUNK_REQ, REP_BLOB_ALL_REQ,
+ * or REP_BLOB_UPDATE_REQ. Note that the blob chunk gap database contains
+ * descriptions of the blob chunks that we are expecting to arrive.
+ *
+ * Assumes the caller holds mtx_clientdb and rep_mutex.
+ *
+ * PUBLIC: int __rep_blob_rereq __P((ENV *, REP *));
+ */
+int
+__rep_blob_rereq(env, rep)
+	ENV *env;
+	REP *rep;
+{
+	DB_REP *db_rep;
+	DB_THREAD_INFO *ip;
+	REGINFO *infop;
+	__rep_fileinfo_args *rfp;
+	db_seq_t blob_fid;
+	int master, ret;
+	u_int32_t count;
+
+	ret = 0;
+	rfp = NULL;
+	infop = env->reginfo;
+	db_rep = env->rep_handle;
+
+	/*
+	 * Without a known master nobody can answer a re-request; ask who
+	 * the master is and leave it at that.
+	 */
+	master = rep->master_id;
+	if (master == DB_EID_INVALID) {
+		(void)__rep_send_message(env,
+		    DB_EID_BROADCAST, REP_MASTER_REQ, NULL, NULL, 0, 0);
+		return (ret);
+	}
+
+	/* Open the blob chunk gap database if it is not open yet. */
+	if (db_rep->blob_dbp == NULL) {
+		ret = __rep_client_dbinit(env, 0, REP_BLOB);
+		if (ret != 0) {
+			RPRINT(env, (env, DB_VERB_REP_SYNC,
+			    "REP_BLOB_CHUNK: Client_dbinit %s",
+			    db_strerror(ret)));
+			return (ret);
+		}
+	}
+
+	/*
+	 * A gap blob id of 0 means a REP_BLOB_ALL_REQ or REP_BLOB_UPDATE_REQ
+	 * message was lost.  There is not enough information to rebuild a
+	 * REP_BLOB_ALL_REQ, so empty the blob gap database and restart from
+	 * the REP_BLOB_UPDATE_REQ stage.
+	 *
+	 * A non-zero gap blob id means a REP_BLOB_CHUNK_REQ message was
+	 * lost, which ordinary blob gap processing takes care of.
+	 */
+	ENV_GET_THREAD_INFO(env, ip);
+	if (rep->gap_bl_hi_id == 0) {
+		/*
+		 * It takes a while to create the blob update message, so
+		 * the first request only records that we were asked.
+		 */
+		if (rep->blob_rereq == 0) {
+			rep->blob_rereq = 1;
+			return (ret);
+		}
+		rep->blob_rereq = 0;
+		ret = __db_truncate(db_rep->blob_dbp, ip, NULL, &count);
+		if (ret != 0)
+			return (ret);
+		rep->blob_more_files = 1;
+		rep->last_blob_id = rep->prev_blob_id;
+		rep->last_blob_sid = rep->prev_blob_sid;
+	}
+
+	GET_CURINFO(rep, infop, rfp);
+	GET_LO_HI(env, rfp->blob_fid_lo, rfp->blob_fid_hi, blob_fid, ret);
+	if (ret != 0)
+		return (ret);
+
+	/*
+	 * If there are entries in the blob gap database, __rep_blobdone
+	 * will perform gap processing, otherwise it will send
+	 * a REP_BLOB_UPDATE_REQ.
+	 */
+	return (__rep_blobdone(env, master, ip, rep, blob_fid, 1));
+}
+
+/*
* __rep_finfo_alloc -
* Allocate and initialize a fileinfo structure.
*
@@ -3521,6 +5383,7 @@ __rep_walk_filelist(env, version, files, size, count, fn, arg)
{
__rep_fileinfo_args *rfp, rf;
__rep_fileinfo_v6_args *rfpv6;
+ __rep_fileinfo_v7_args *rfpv7;
u_int8_t *next;
int ret;
void *rffree;
@@ -3530,21 +5393,30 @@ __rep_walk_filelist(env, version, files, size, count, fn, arg)
rfpv6 = NULL;
rffree = NULL;
while (count-- > 0) {
+ /*
+ * Build a current struct by copying in the older
+ * version struct and then setting up the new fields.
+ * This is safe because all old fields are in the
+ * same location in the current struct.
+ */
if (version < DB_REPVERSION_53) {
- /*
- * Build a current struct by copying in the older
- * version struct and then setting up the data_dir.
- * This is safe because all old fields are in the
- * same location in the current struct.
- */
if ((ret = __rep_fileinfo_v6_unmarshal(env, version,
&rfpv6, files, size, &next)) != 0)
break;
memcpy(&rf, rfpv6, sizeof(__rep_fileinfo_v6_args));
rf.dir.data = NULL;
rf.dir.size = 0;
+ rf.blob_fid_lo = rf.blob_fid_hi = 0;
rfp = &rf;
rffree = rfpv6;
+ } else if (version < DB_REPVERSION_61) {
+ if ((ret = __rep_fileinfo_v7_unmarshal(env, version,
+ &rfpv7, files, size, &next)) != 0)
+ break;
+ memcpy(&rf, rfpv7, sizeof(__rep_fileinfo_v7_args));
+ rf.blob_fid_lo = rf.blob_fid_hi = 0;
+ rfp = &rf;
+ rffree = rfpv7;
} else {
if ((ret = __rep_fileinfo_unmarshal(env, version,
&rfp, files, size, &next)) != 0)
@@ -3566,3 +5438,33 @@ __rep_walk_filelist(env, version, files, size, count, fn, arg)
__os_free(env, rffree);
return (ret);
}
+
+/*
+ * __rep_init_file_list_context -
+ *	Set up a FILE_LIST_CTX: allocate a zeroed MEGABYTE buffer, record
+ *	the caller's version and flags, and start with an empty file count.
+ *
+ * When update_space is non-zero the fill pointer starts at
+ * FIRST_FILE_PTR(buf), leaving room at the front of the buffer for
+ * update_args; otherwise it starts at the beginning of the buffer.
+ */
+static int
+__rep_init_file_list_context(env, version, flags, update_space, context)
+	ENV *env;
+	u_int32_t version;
+	u_int32_t flags;
+	int update_space;
+	FILE_LIST_CTX *context;
+{
+	int ret;
+
+	ret = __os_calloc(env, 1, MEGABYTE, &context->buf);
+	if (ret != 0)
+		return (ret);
+
+	context->version = version;
+	context->flags = flags;
+	context->count = 0;
+	context->size = MEGABYTE;
+
+	/* Reserve space for update_args only when asked to. */
+	if (!update_space)
+		context->fillptr = context->buf;
+	else
+		context->fillptr = FIRST_FILE_PTR(context->buf);
+
+	return (ret);
+}
diff --git a/src/rep/rep_elect.c b/src/rep/rep_elect.c
index 9e8c5249..234daf31 100644
--- a/src/rep/rep_elect.c
+++ b/src/rep/rep_elect.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2004, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2004, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -53,8 +53,9 @@ __rep_elect_pp(dbenv, given_nsites, nvotes, flags)
u_int32_t given_nsites, nvotes;
u_int32_t flags;
{
- DB_REP *db_rep;
ENV *env;
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
int ret;
env = dbenv->env;
@@ -89,7 +90,9 @@ __rep_elect_pp(dbenv, given_nsites, nvotes, flags)
return (EINVAL);
}
+ ENV_ENTER(env, ip);
ret = __rep_elect_int(env, given_nsites, nvotes, flags);
+ ENV_LEAVE(env, ip);
/*
* The DB_REP_IGNORE return code can be of use to repmgr (which of
@@ -120,7 +123,6 @@ __rep_elect_int(env, given_nsites, nvotes, flags)
DB_LOGC *logc;
DB_LSN lsn;
DB_REP *db_rep;
- DB_THREAD_INFO *ip;
LOG *lp;
REP *rep;
int done, elected, in_progress;
@@ -140,6 +142,15 @@ __rep_elect_int(env, given_nsites, nvotes, flags)
ret = 0;
/*
+ * View sites never participate in elections.
+ */
+ if (IS_VIEW_SITE(env)) {
+ __db_errx(env, DB_STR("3687",
+ "View sites may not participate in elections"));
+ return (EINVAL);
+ }
+
+ /*
* Specifying 0 for nsites signals us to use the value configured
* previously via rep_set_nsites. Similarly, if the given nvotes is 0,
* it asks us to compute the value representing a simple majority.
@@ -185,7 +196,6 @@ __rep_elect_int(env, given_nsites, nvotes, flags)
* real, configured priority, as retrieved from REP region.
*/
ctlflags = realpri != 0 ? REPCTL_ELECTABLE : 0;
- ENV_ENTER(env, ip);
orig_tally = 0;
/* If we are already master, simply broadcast that fact and return. */
@@ -597,8 +607,7 @@ out:
DB_ASSERT(env, rep->elect_th > 0);
rep->elect_th--;
if (rep->elect_th == 0) {
- need_req = F_ISSET(rep, REP_F_SKIPPED_APPLY) &&
- !I_HAVE_WON(rep, rep->winner);
+ need_req = F_ISSET(rep, REP_F_SKIPPED_APPLY) && !elected;
FLD_CLR(rep->lockout_flags, REP_LOCKOUT_APPLY);
F_CLR(rep, REP_F_SKIPPED_APPLY);
}
@@ -641,7 +650,6 @@ out:
unlck_lv: REP_SYSTEM_UNLOCK(env);
}
envleave:
- ENV_LEAVE(env, ip);
return (ret);
}
@@ -1106,7 +1114,7 @@ __rep_cmp_vote(env, rep, eid, lsnp, priority, gen, data_gen, tiebreaker, flags)
u_int32_t priority;
u_int32_t data_gen, flags, gen, tiebreaker;
{
- int cmp, like_pri;
+ int cmp, genlog_cmp, like_pri;
cmp = LOG_COMPARE(lsnp, &rep->w_lsn);
/*
@@ -1140,9 +1148,18 @@ __rep_cmp_vote(env, rep, eid, lsnp, priority, gen, data_gen, tiebreaker, flags)
like_pri = (priority == 0 && rep->w_priority == 0) ||
(priority != 0 && rep->w_priority != 0);
- if ((priority != 0 && rep->w_priority == 0) ||
- (like_pri && data_gen > rep->w_datagen) ||
- (like_pri && data_gen == rep->w_datagen && cmp > 0) ||
+ /*
+ * The undocumented ELECT_LOGLENGTH option requires that the
+ * election should be won based on log length without regard
+ * for datagen. Do not include datagen in the comparison if
+ * this option is enabled.
+ */
+ if (FLD_ISSET(rep->config, REP_C_ELECT_LOGLENGTH))
+ genlog_cmp = like_pri && cmp > 0;
+ else
+ genlog_cmp = (like_pri && data_gen > rep->w_datagen) ||
+ (like_pri && data_gen == rep->w_datagen && cmp > 0);
+ if ((priority != 0 && rep->w_priority == 0) || genlog_cmp ||
(cmp == 0 && (priority > rep->w_priority ||
(priority == rep->w_priority &&
(tiebreaker > rep->w_tiebreaker))))) {
@@ -1306,8 +1323,9 @@ __rep_wait(env, timeoutp, full_elect, egen, flags)
{
DB_REP *db_rep;
REP *rep;
- int done;
- u_int32_t sleeptime, sleeptotal, timeout;
+ db_timespec exptime, mytime;
+ int diff_timeout, done;
+ u_int32_t sleeptime, timeout;
db_rep = env->rep_handle;
rep = db_rep->region;
@@ -1315,10 +1333,20 @@ __rep_wait(env, timeoutp, full_elect, egen, flags)
timeout = *timeoutp;
sleeptime = SLEEPTIME(timeout);
- sleeptotal = 0;
- while (sleeptotal < timeout) {
+ __os_gettime(env, &exptime, 0);
+ TIMESPEC_ADD_DB_TIMEOUT(&exptime, timeout);
+ while (!done) {
+ __os_gettime(env, &mytime, 0);
+ /*
+ * Check if the timeout has expired. __os_yield might sleep
+ * a slightly shorter time than requested, so check the exact
+ * amount of time that has passed. If we do not sleep the
+ * full PHASE0 time, old unexpired lease grants could
+ * incorrectly prevent the election from happening.
+ */
+ if (timespeccmp(&mytime, &exptime, >))
+ break;
__os_yield(env, 0, sleeptime);
- sleeptotal += sleeptime;
REP_SYSTEM_LOCK(env);
/*
* Check if group membership changed while we were
@@ -1331,19 +1359,19 @@ __rep_wait(env, timeoutp, full_elect, egen, flags)
if (!LF_ISSET(REP_E_PHASE0) &&
full_elect && F_ISSET(rep, REP_F_GROUP_ESTD)) {
*timeoutp = rep->elect_timeout;
+ if ((diff_timeout = (int)(*timeoutp - timeout)) > 0)
+ TIMESPEC_ADD_DB_TIMEOUT(&exptime, diff_timeout);
+ else {
+ diff_timeout = -diff_timeout;
+ TIMESPEC_SUB_DB_TIMEOUT(&exptime, diff_timeout);
+ }
timeout = *timeoutp;
- if (sleeptotal >= timeout)
- done = 1;
- else
- sleeptime = SLEEPTIME(timeout);
+ sleeptime = SLEEPTIME(timeout);
}
if (egen != rep->egen || !FLD_ISSET(rep->elect_flags, flags))
done = 1;
REP_SYSTEM_UNLOCK(env);
-
- if (done)
- return (0);
}
return (0);
}
diff --git a/src/rep/rep_lease.c b/src/rep/rep_lease.c
index 047c39a7..b6010046 100644
--- a/src/rep/rep_lease.c
+++ b/src/rep/rep_lease.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2007, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2007, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -45,10 +45,20 @@ __rep_update_grant(env, ts)
timespecclear(&mytime);
/*
+ * If we are a view, we never grant a lease.
+ */
+ if (IS_VIEW_SITE(env))
+ return (0);
+
+ /*
* Get current time, and add in the (skewed) lease duration
- * time to send the grant to the master.
+ * time to send the grant to the master. We need to use '0'
+ * for a non-monotonic (i.e. realtime) timestamp. Some systems
+ * use "time since boot" for monotonic time, which would not
+ * work between machines here. We already document that for leases,
+ * the time cannot go backward.
*/
- __os_gettime(env, &mytime, 1);
+ __os_gettime(env, &mytime, 0);
timespecadd(&mytime, &rep->lease_duration);
REP_SYSTEM_LOCK(env);
/*
@@ -108,7 +118,7 @@ __rep_islease_granted(env)
* Get current time and compare against our granted lease.
*/
timespecclear(&mytime);
- __os_gettime(env, &mytime, 1);
+ __os_gettime(env, &mytime, 0);
return (timespeccmp(&mytime, &rep->grant_expire, <=) ? 1 : 0);
}
@@ -319,9 +329,15 @@ __rep_lease_check(env, refresh)
max_tries = LEASE_REFRESH_MIN;
retry:
REP_SYSTEM_LOCK(env);
- min_leases = rep->config_nsites / 2;
+ /*
+ * We need enough leases so that we're guaranteed any successful
+ * election will include at least one site with the lease-guaranteed
+ * data. Note this is based on total number of sites so leases
+ * cannot be used with half or more unelectable sites.
+ */
+ min_leases = (rep->config_nsites - 1) / 2;
ret = 0;
- __os_gettime(env, &curtime, 1);
+ __os_gettime(env, &curtime, 0);
VPRINT(env, (env, DB_VERB_REP_LEASE,
"%s %d of %d refresh %d min_leases %lu curtime %lu %lu, maxLSN [%lu][%lu]",
"lease_check: try ", tries, max_tries, refresh,
@@ -526,7 +542,7 @@ __rep_lease_waittime(env)
if (!F_ISSET(rep, REP_F_LEASE_EXPIRED))
to = rep->lease_timeout;
} else {
- __os_gettime(env, &mytime, 1);
+ __os_gettime(env, &mytime, 0);
RPRINT(env, (env, DB_VERB_REP_LEASE,
"wait_time: mytime %lu %lu, grant_expire %lu %lu",
(u_long)mytime.tv_sec, (u_long)mytime.tv_nsec,
diff --git a/src/rep/rep_log.c b/src/rep/rep_log.c
index 42300685..bf72db9e 100644
--- a/src/rep/rep_log.c
+++ b/src/rep/rep_log.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2004, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2004, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -110,7 +110,7 @@ __rep_allreq(env, rp, eid)
*/
if (ret == 0 && repth.lsn.file != 1 && flags == DB_FIRST) {
if (F_ISSET(rep, REP_F_CLIENT))
- ret = DB_NOTFOUND;
+ ret = USR_ERR(env, DB_NOTFOUND);
else
(void)__rep_send_message(env, eid,
REP_VERIFY_FAIL, &repth.lsn, NULL, 0, 0);
@@ -466,8 +466,8 @@ __rep_log_split(env, ip, rp, rec, ret_lsnp, last_lsnp)
if (p >= ep && save_flags)
F_SET(&tmprp, save_flags);
/*
- * A previous call to __rep_apply indicated an earlier
- * record is a dup and the next_new_lsn we are waiting for.
+ * A previous call to __rep_apply indicated an earlier record
+ * is a past dup and the next_new_lsn for which we are waiting.
* Skip log records until we catch up with next_new_lsn.
*/
if (is_dup && LOG_COMPARE(&tmprp.lsn, &next_new_lsn) < 0) {
@@ -482,7 +482,20 @@ __rep_log_split(env, ip, rp, rec, ret_lsnp, last_lsnp)
VPRINT(env, (env, DB_VERB_REP_MISC,
"log_split: rep_apply ret %d, dup %d, tmp_lsn [%lu][%lu]",
ret, is_dup, (u_long)tmp_lsn.file, (u_long)tmp_lsn.offset));
- if (is_dup)
+ /*
+ * We can skip log records between a past dup and tmp_lsn
+ * returned by rep_apply() because we know we have all
+ * those log records. For a past dup, this log record is
+ * less than or equal to tmp_lsn (which is either ready_lsn
+ * or max_perm_lsn) and we only have records to skip when
+ * it is less than tmp_lsn.
+ *
+ * We cannot skip log records for a future dup because we
+ * may not have all of them. In this case, this log record
+ * is greater than or equal to tmp_lsn (which is either
+ * ready_lsn or this log record).
+ */
+ if (is_dup && LOG_COMPARE(&tmprp.lsn, &tmp_lsn) < 0)
next_new_lsn = tmp_lsn;
switch (ret) {
/*
@@ -637,7 +650,7 @@ __rep_logreq(env, rp, rec, eid)
if (LOG_COMPARE(&firstlsn, &rp->lsn) > 0) {
/* Case 3 */
if (F_ISSET(rep, REP_F_CLIENT)) {
- ret = DB_NOTFOUND;
+ ret = USR_ERR(env, DB_NOTFOUND);
goto err;
}
(void)__rep_send_message(env, eid,
@@ -662,7 +675,7 @@ __rep_logreq(env, rp, rec, eid)
ret = 0;
goto err;
} else
- ret = DB_NOTFOUND;
+ ret = USR_ERR(env, DB_NOTFOUND);
}
}
@@ -812,6 +825,14 @@ __rep_loggap_req(env, rep, lsnp, gapflags)
ret = 0;
/*
+ * If we are in SYNC_LOG and have all the log we need (i.e.
+ * rep->last_lsn is ZERO_LSN), just return, as there is nothing
+ * to do while recovery is running.
+ */
+ if (rep->sync_state == SYNC_LOG && IS_ZERO_LSN(rep->last_lsn))
+ return (0);
+
+ /*
* Check if we need to ask for the gap.
* We ask for the gap if:
* We are forced to with gapflags.
@@ -1030,7 +1051,7 @@ __rep_chk_newfile(env, logc, rep, rp, eid)
REP_VERIFY_FAIL, &rp->lsn,
NULL, 0, 0);
} else
- ret = DB_NOTFOUND;
+ ret = USR_ERR(env, DB_NOTFOUND);
} else {
endlsn.offset += logc->len;
if ((ret = __logc_version(logc,
@@ -1054,7 +1075,7 @@ __rep_chk_newfile(env, logc, rep, rp, eid)
}
}
} else
- ret = DB_NOTFOUND;
+ ret = USR_ERR(env, DB_NOTFOUND);
return (ret);
}
diff --git a/src/rep/rep_method.c b/src/rep/rep_method.c
index f9f1924c..e0e7dd19 100644
--- a/src/rep/rep_method.c
+++ b/src/rep/rep_method.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -10,6 +10,7 @@
#include "db_int.h"
#include "dbinc/db_page.h"
+#include "dbinc/blob.h"
#include "dbinc/btree.h"
#include "dbinc/mp.h"
#include "dbinc/txn.h"
@@ -17,14 +18,12 @@
static int __rep_abort_prepared __P((ENV *));
static int __rep_await_condition __P((ENV *,
struct rep_waitgoal *, db_timeout_t));
-static int __rep_bt_cmp __P((DB *, const DBT *, const DBT *));
+static int __rep_bt_cmp __P((DB *, const DBT *, const DBT *, size_t *));
static int __rep_check_applied __P((ENV *,
DB_THREAD_INFO *, DB_COMMIT_INFO *, struct rep_waitgoal *));
static void __rep_config_map __P((ENV *, u_int32_t *, u_int32_t *));
static u_int32_t __rep_conv_vers __P((ENV *, u_int32_t));
-static int __rep_read_lsn_history __P((ENV *,
- DB_THREAD_INFO *, DB_TXN **, DBC **, u_int32_t,
- __rep_lsn_hist_data_args *, struct rep_waitgoal *, u_int32_t));
+static int __rep_defview __P((DB_ENV *, const char *, int *, u_int32_t));
static int __rep_restore_prepared __P((ENV *));
static int __rep_save_lsn_hist __P((ENV *, DB_THREAD_INFO *, DB_LSN *));
/*
@@ -123,9 +122,11 @@ __rep_get_config(dbenv, which, onp)
#undef OK_FLAGS
#define OK_FLAGS \
(DB_REP_CONF_AUTOINIT | DB_REP_CONF_AUTOROLLBACK | \
- DB_REP_CONF_BULK | DB_REP_CONF_DELAYCLIENT | DB_REP_CONF_INMEM | \
+ DB_REP_CONF_BULK | DB_REP_CONF_DELAYCLIENT | \
+ DB_REP_CONF_ELECT_LOGLENGTH | DB_REP_CONF_INMEM | \
DB_REP_CONF_LEASE | DB_REP_CONF_NOWAIT | \
- DB_REPMGR_CONF_2SITE_STRICT | DB_REPMGR_CONF_ELECTIONS)
+ DB_REPMGR_CONF_2SITE_STRICT | DB_REPMGR_CONF_ELECTIONS | \
+ DB_REPMGR_CONF_PREFMAS_CLIENT | DB_REPMGR_CONF_PREFMAS_MASTER)
if (FLD_ISSET(which, ~OK_FLAGS))
return (__db_ferr(env, "DB_ENV->rep_get_config", 0));
@@ -171,19 +172,30 @@ __rep_set_config(dbenv, which, on)
REP *rep;
REP_BULK bulk;
u_int32_t mapped, orig;
- int ret, t_ret;
+ int inmemlog, pm_ret, ret, t_ret;
env = dbenv->env;
db_rep = env->rep_handle;
ret = 0;
+ pm_ret = 0;
+ inmemlog = 0;
#undef OK_FLAGS
#define OK_FLAGS \
(DB_REP_CONF_AUTOINIT | DB_REP_CONF_AUTOROLLBACK | \
- DB_REP_CONF_BULK | DB_REP_CONF_DELAYCLIENT | DB_REP_CONF_INMEM | \
+ DB_REP_CONF_BULK | DB_REP_CONF_DELAYCLIENT | \
+ DB_REP_CONF_ELECT_LOGLENGTH | DB_REP_CONF_INMEM | \
DB_REP_CONF_LEASE | DB_REP_CONF_NOWAIT | \
- DB_REPMGR_CONF_2SITE_STRICT | DB_REPMGR_CONF_ELECTIONS)
-#define REPMGR_FLAGS (REP_C_2SITE_STRICT | REP_C_ELECTIONS)
+ DB_REPMGR_CONF_2SITE_STRICT | DB_REPMGR_CONF_ELECTIONS | \
+ DB_REPMGR_CONF_PREFMAS_CLIENT | DB_REPMGR_CONF_PREFMAS_MASTER)
+#define REPMGR_FLAGS (REP_C_2SITE_STRICT | REP_C_ELECTIONS | \
+ REP_C_PREFMAS_CLIENT | REP_C_PREFMAS_MASTER)
+
+#define TURNING_ON_PREFMAS(orig, curr) \
+ ((FLD_ISSET(curr, REP_C_PREFMAS_MASTER) && \
+ !FLD_ISSET(orig, REP_C_PREFMAS_MASTER)) || \
+ (FLD_ISSET(curr, REP_C_PREFMAS_CLIENT) && \
+ !FLD_ISSET(orig, REP_C_PREFMAS_CLIENT)))
ENV_NOT_CONFIGURED(
env, db_rep->region, "DB_ENV->rep_set_config", DB_INIT_REP);
@@ -224,6 +236,62 @@ __rep_set_config(dbenv, which, on)
return (EINVAL);
}
/*
+ * The undocumented ELECT_LOGLENGTH option and the preferred
+ * master options cannot be changed after calling repmgr_start.
+ */
+ if (FLD_ISSET(mapped, (REP_C_ELECT_LOGLENGTH |
+ REP_C_PREFMAS_MASTER | REP_C_PREFMAS_CLIENT)) &&
+ F_ISSET(rep, REP_F_START_CALLED)) {
+ __db_errx(env, DB_STR("3706",
+ "DB_ENV->rep_set_config: %s "
+ "must be configured before DB_ENV->repmgr_start"),
+ FLD_ISSET(mapped, REP_C_ELECT_LOGLENGTH) ?
+ "ELECT_LOGLENGTH" : "preferred master");
+ ENV_LEAVE(env, ip);
+ return (EINVAL);
+ }
+ /*
+ * Do not allow users to turn on preferred master if
+ * leases or in-memory replication files are in effect,
+ * or with a private environment or in-memory log files.
+ */
+ if (FLD_ISSET(mapped,
+ (REP_C_PREFMAS_MASTER | REP_C_PREFMAS_CLIENT)) &&
+ (REP_CONFIG_IS_SET(env, (REP_C_LEASE | REP_C_INMEM)) ||
+ (__log_get_config(dbenv,
+ DB_LOG_IN_MEMORY, &inmemlog) == 0 &&
+ (inmemlog > 0 || F_ISSET(env, ENV_PRIVATE))))) {
+ __db_errx(env, DB_STR("3707",
+ "DB_ENV->rep_set_config: preferred master mode "
+ "cannot be used with %s"),
+ REP_CONFIG_IS_SET(env, REP_C_LEASE) ?
+ "master leases" :
+ REP_CONFIG_IS_SET(env, REP_C_INMEM) ?
+ "in-memory replication files" :
+ inmemlog > 0 ? "in-memory log files" :
+ "a private environment");
+ ENV_LEAVE(env, ip);
+ return (EINVAL);
+ }
+ /*
+ * If we are already in preferred master mode, we can't
+ * turn off elections or 2site_strict and we can't turn on
+ * leases.
+ */
+ if (PREFMAS_IS_SET(env) && ((FLD_ISSET(mapped,
+ (REP_C_ELECTIONS | REP_C_2SITE_STRICT)) && on == 0) ||
+ (FLD_ISSET(mapped, REP_C_LEASE) && on > 0))) {
+ __db_errx(env, DB_STR("3708",
+ "DB_ENV->rep_set_config: cannot %s %s "
+ "in preferred master mode"),
+ on == 0 ? "disable" : "enable",
+ FLD_ISSET(mapped, REP_C_ELECTIONS) ? "elections" :
+ FLD_ISSET(mapped, REP_C_LEASE) ? "leases" :
+ "2SITE_STRICT");
+ ENV_LEAVE(env, ip);
+ return (EINVAL);
+ }
+ /*
* Leases must be turned on before calling rep_start.
* Leases can never be turned off once they're turned on.
*/
@@ -252,6 +320,17 @@ __rep_set_config(dbenv, which, on)
else
FLD_CLR(rep->config, mapped);
+#ifdef HAVE_REPLICATION_THREADS
+ /* Do automatic preferred master configuration. */
+ if (TURNING_ON_PREFMAS(orig, rep->config) &&
+ (pm_ret = __repmgr_prefmas_auto_config(dbenv,
+ &rep->config)) != 0) {
+ REP_SYSTEM_UNLOCK(env);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ ENV_LEAVE(env, ip);
+ goto prefmas_err;
+ }
+#endif
/*
* Bulk transfer requires special processing if it is getting
* toggled.
@@ -297,10 +376,25 @@ __rep_set_config(dbenv, which, on)
ret = t_ret;
#endif
} else {
+ orig = db_rep->config;
if (on)
FLD_SET(db_rep->config, mapped);
else
FLD_CLR(db_rep->config, mapped);
+#ifdef HAVE_REPLICATION_THREADS
+ /* Do automatic preferred master configuration. */
+ if (TURNING_ON_PREFMAS(orig, db_rep->config))
+ pm_ret =
+ __repmgr_prefmas_auto_config(dbenv,
+ &db_rep->config);
+#endif
+ }
+prefmas_err:
+ if (pm_ret != 0) {
+ __db_errx(env, DB_STR("3709",
+ "DB_ENV->rep_set_config: could not complete automatic "
+ "preferred master configuration"));
+ ret = EINVAL;
}
/* Configuring 2SITE_STRICT, etc. makes this a repmgr application */
if (ret == 0 && FLD_ISSET(mapped, REPMGR_FLAGS))
@@ -331,6 +425,10 @@ __rep_config_map(env, inflagsp, outflagsp)
FLD_SET(*outflagsp, REP_C_DELAYCLIENT);
FLD_CLR(*inflagsp, DB_REP_CONF_DELAYCLIENT);
}
+ if (FLD_ISSET(*inflagsp, DB_REP_CONF_ELECT_LOGLENGTH)) {
+ FLD_SET(*outflagsp, REP_C_ELECT_LOGLENGTH);
+ FLD_CLR(*inflagsp, DB_REP_CONF_ELECT_LOGLENGTH);
+ }
if (FLD_ISSET(*inflagsp, DB_REP_CONF_INMEM)) {
FLD_SET(*outflagsp, REP_C_INMEM);
FLD_CLR(*inflagsp, DB_REP_CONF_INMEM);
@@ -351,6 +449,14 @@ __rep_config_map(env, inflagsp, outflagsp)
FLD_SET(*outflagsp, REP_C_ELECTIONS);
FLD_CLR(*inflagsp, DB_REPMGR_CONF_ELECTIONS);
}
+ if (FLD_ISSET(*inflagsp, DB_REPMGR_CONF_PREFMAS_CLIENT)) {
+ FLD_SET(*outflagsp, REP_C_PREFMAS_CLIENT);
+ FLD_CLR(*inflagsp, DB_REPMGR_CONF_PREFMAS_CLIENT);
+ }
+ if (FLD_ISSET(*inflagsp, DB_REPMGR_CONF_PREFMAS_MASTER)) {
+ FLD_SET(*outflagsp, REP_C_PREFMAS_MASTER);
+ FLD_CLR(*inflagsp, DB_REPMGR_CONF_PREFMAS_MASTER);
+ }
DB_ASSERT(env, *inflagsp == 0);
}
@@ -368,8 +474,10 @@ __rep_start_pp(dbenv, dbt, flags)
DBT *dbt;
u_int32_t flags;
{
- DB_REP *db_rep;
ENV *env;
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ int ret;
env = dbenv->env;
db_rep = env->rep_handle;
@@ -400,7 +508,11 @@ __rep_start_pp(dbenv, dbt, flags)
return (EINVAL);
}
- return (__rep_start_int(env, dbt, flags));
+ ENV_ENTER(env, ip);
+ ret = __rep_start_int(env, dbt, flags, 0);
+ ENV_LEAVE(env, ip);
+
+ return (ret);
}
/*
@@ -432,13 +544,14 @@ __rep_start_pp(dbenv, dbt, flags)
* clients that reference non-existent files whose creation was backed out
* during a synchronizing recovery.
*
- * PUBLIC: int __rep_start_int __P((ENV *, DBT *, u_int32_t));
+ * PUBLIC: int __rep_start_int __P((ENV *, DBT *, u_int32_t, u_int32_t));
*/
int
-__rep_start_int(env, dbt, flags)
+__rep_start_int(env, dbt, flags, startopts)
ENV *env;
DBT *dbt;
u_int32_t flags;
+ u_int32_t startopts;
{
DB *dbp;
DB_LOG *dblp;
@@ -474,9 +587,31 @@ __rep_start_int(env, dbt, flags)
return (EINVAL);
}
- ENV_ENTER(env, ip);
+ /*
+ * If we are a view, we can never become master.
+ */
+ if (IS_VIEW_SITE(env) && role == DB_REP_MASTER) {
+ __db_errx(env, DB_STR("3685",
+ "View site cannot become master"));
+ return (EINVAL);
+ }
+
+ /*
+ * Check for consistent view usage. We need to check here rather
+ * than in __rep_open because non-rep-aware processes such as
+ * db_stat may open/join the environment. Rep-aware handles must
+ * consistently set the view.
+ */
+ if ((ret = __rep_check_view(env)) != 0) {
+ RPRINT(env, (env, DB_VERB_REP_MISC,
+ "Application env/view mismatch."));
+ __db_errx(env, DB_STR("3686",
+ "Application environment and view callback mismatch"));
+ return (ret);
+ }
/* Serialize rep_start() calls. */
+ ENV_GET_THREAD_INFO(env, ip);
MUTEX_LOCK(env, rep->mtx_repstart);
start_th = 1;
@@ -492,8 +627,14 @@ __rep_start_int(env, dbt, flags)
goto out;
REP_SYSTEM_LOCK(env);
+ /*
+ * The FORCE_ROLECHG option is used when a side-effect of the role
+ * change such as incrementing the master gen is needed regardless
+ * of the previous role.
+ */
role_chg = (!F_ISSET(rep, REP_F_MASTER) && role == DB_REP_MASTER) ||
- (!F_ISSET(rep, REP_F_CLIENT) && role == DB_REP_CLIENT);
+ (!F_ISSET(rep, REP_F_CLIENT) && role == DB_REP_CLIENT) ||
+ FLD_ISSET(startopts, REP_START_FORCE_ROLECHG);
/*
* There is no need for lockout if all we're doing is sending a message.
@@ -511,9 +652,11 @@ __rep_start_int(env, dbt, flags)
goto out;
}
- if (FLD_ISSET(rep->lockout_flags, REP_LOCKOUT_MSG)) {
+ if (!FLD_ISSET(startopts, REP_START_WAIT_LOCKMSG) &&
+ FLD_ISSET(rep->lockout_flags, REP_LOCKOUT_MSG)) {
/*
- * There is already someone in msg lockout. Return.
+ * There is already someone in msg lockout and we are not
+ * waiting. Return.
*/
RPRINT(env, (env, DB_VERB_REP_MISC,
"Thread already in msg lockout"));
@@ -702,10 +845,15 @@ __rep_start_int(env, dbt, flags)
* now defunct on master.
* NEWFILE: Used to delay client apply during newfile
* operation, not applicable to master.
+ * READONLY_MASTER: Used to coordinate preferred master
+ * takeover, should not remain in effect after restart.
+ * HOLD_GEN: Freeze gen for preferred master, should not
+ * remain in effect after restart.
*/
F_CLR(rep, REP_F_CLIENT | REP_F_ABBREVIATED |
REP_F_MASTERELECT | REP_F_SKIPPED_APPLY | REP_F_DELAY |
- REP_F_LEASE_EXPIRED | REP_F_NEWFILE);
+ REP_F_LEASE_EXPIRED | REP_F_NEWFILE |
+ REP_F_READONLY_MASTER | REP_F_HOLD_GEN);
/*
* When becoming a master, set the following flags:
* MASTER: Indicate that this site is master.
@@ -842,11 +990,16 @@ __rep_start_int(env, dbt, flags)
}
/*
* When becoming a client, clear the following flags:
+ * HOLD_GEN: Freeze gen for preferred master, should not
+ * remain in effect after restart.
* MASTER: Site is no longer a master.
* MASTERELECT: Indicates that a master is elected
* rather than appointed, not applicable on client.
+ * READONLY_MASTER: Used to coordinate preferred master
+ * takeover, should not remain in effect after restart.
*/
- F_CLR(rep, REP_F_MASTER | REP_F_MASTERELECT);
+ F_CLR(rep, REP_F_HOLD_GEN | REP_F_MASTER | REP_F_MASTERELECT |
+ REP_F_READONLY_MASTER);
F_SET(rep, REP_F_CLIENT);
/*
@@ -928,6 +1081,15 @@ __rep_start_int(env, dbt, flags)
* sync with the master.
*/
SET_GEN(0);
+ /*
+ * If we are changing role to client, reset our min log file
+ * until we hear from a master or another client. In
+ * particular, in a dupmaster situation, if this site loses
+ * an election a stale min_log_file would prevent archiving.
+ */
+#ifdef HAVE_REPLICATION_THREADS
+ rep->min_log_file = 0;
+#endif
REP_SYSTEM_UNLOCK(env);
/*
@@ -935,6 +1097,15 @@ __rep_start_int(env, dbt, flags)
*/
if ((ret = __dbt_usercopy(env, dbt)) != 0)
goto out;
+ /*
+ * The HOLD_CLIGEN option does not allow this client's
+ * gen to change until the REP_F_HOLD_GEN flag is cleared.
+ * It prevents this site from responding to NEWMASTER messages
+ * and disables updating the gen from other incoming messages.
+ */
+ if (FLD_ISSET(startopts, REP_START_HOLD_CLIGEN))
+ F_SET(rep, REP_F_HOLD_GEN);
+
(void)__rep_send_message(env,
DB_EID_BROADCAST, REP_NEWCLIENT, NULL, dbt, 0, 0);
}
@@ -967,7 +1138,6 @@ out:
if (start_th)
MUTEX_UNLOCK(env, rep->mtx_repstart);
__dbt_userfree(env, dbt, NULL, NULL);
- ENV_LEAVE(env, ip);
return (ret);
}
@@ -1170,6 +1340,9 @@ __rep_client_dbinit(env, startup, which)
if (which == REP_DB) {
name = REPDBNAME;
rdbpp = &db_rep->rep_db;
+ } else if (which == REP_BLOB) {
+ name = REPBLOBNAME;
+ rdbpp = &db_rep->blob_dbp;
} else {
name = REPPAGENAME;
rdbpp = &db_rep->file_dbp;
@@ -1209,16 +1382,28 @@ __rep_client_dbinit(env, startup, which)
if (which == REP_DB &&
(ret = __bam_set_bt_compare(dbp, __rep_bt_cmp)) != 0)
goto err;
+ /*
+ * Install the key comparator for the blob gap database.  The
+ * duplicate comparator is installed further down, once duplicates
+ * have been enabled: chaining the two calls with && skipped the
+ * second call whenever the first one succeeded, so the duplicate
+ * comparator was never installed, and it also hid a failure of the
+ * first call unless the second call failed too.
+ */
+ if (which == REP_BLOB &&
+ (ret = __bam_set_bt_compare(dbp, __rep_blob_cmp)) != 0)
+ goto err;
/* Don't write log records on the client. */
if ((ret = __db_set_flags(dbp, DB_TXN_NOT_DURABLE)) != 0)
goto err;
+ /* Blob gap processing requires sorted duplicates. */
+ if (which == REP_BLOB) {
+ if ((ret = __db_set_blob_threshold(dbp, 0, 0)) != 0)
+ goto err;
+ if ((ret = __db_set_flags(dbp, DB_DUPSORT)) != 0)
+ goto err;
+ /*
+ * Order the duplicates by blob offset.  This is done after the
+ * blob threshold has been reset and DB_DUPSORT has been set so
+ * that it cannot run before duplicates are permitted on dbp.
+ */
+ if ((ret = __db_set_dup_compare(dbp, __rep_offset_cmp)) != 0)
+ goto err;
+ }
+
flags = DB_NO_AUTO_COMMIT | DB_CREATE | DB_INTERNAL_TEMPORARY_DB |
(F_ISSET(env, ENV_THREAD) ? DB_THREAD : 0);
if ((ret = __db_open(dbp, ip, NULL, fname, subdb,
- (which == REP_DB ? DB_BTREE : DB_RECNO),
+ (which == REP_PG ? DB_RECNO : DB_BTREE),
flags, 0, PGNO_BASE_MD)) != 0)
goto err;
@@ -1243,14 +1428,16 @@ err: if (dbp != NULL &&
* care about the LSNs.
*/
static int
-__rep_bt_cmp(dbp, dbt1, dbt2)
+__rep_bt_cmp(dbp, dbt1, dbt2, locp)
DB *dbp;
const DBT *dbt1, *dbt2;
+ size_t *locp;
{
DB_LSN lsn1, lsn2;
__rep_control_args *rp1, *rp2;
COMPQUIET(dbp, NULL);
+ COMPQUIET(locp, NULL);
rp1 = dbt1->data;
rp2 = dbt2->data;
@@ -1274,6 +1461,82 @@ __rep_bt_cmp(dbp, dbt1, dbt2)
}
/*
+ * __rep_blob_cmp --
+ *
+ * Comparison function for the blob gap database. The key is the blob_sid
+ * appended with the blob_id.
+ *
+ * Each key is read as two consecutive db_seq_t values: the blob_sid
+ * first, then the blob_id.  Keys are ordered by blob_sid and, when those
+ * are equal, by blob_id.  Returns 1 when dbt1 sorts after dbt2, -1 when
+ * it sorts before, and 0 when both values match.  The dbp and locp
+ * arguments are unused.
+ *
+ * NOTE(review): the DBT sizes are not checked here, so both keys are
+ * assumed to hold at least 2 * sizeof(db_seq_t) bytes -- confirm
+ * against the code that writes these keys.
+ *
+ * PUBLIC: int __rep_blob_cmp __P((DB *, const DBT *, const DBT *, size_t *));
+ */
+int
+__rep_blob_cmp(dbp, dbt1, dbt2, locp)
+ DB *dbp;
+ const DBT *dbt1, *dbt2;
+ size_t *locp;
+{
+ db_seq_t blob_id1, blob_id2, blob_sid1, blob_sid2;
+ u_int8_t *p;
+
+ COMPQUIET(dbp, NULL);
+ COMPQUIET(locp, NULL);
+
+ /* Use memcpy here to prevent alignment issues. */
+ p = dbt1->data;
+ memcpy(&blob_sid1, p, sizeof(db_seq_t));
+ p += sizeof(db_seq_t);
+ memcpy(&blob_id1, p, sizeof(db_seq_t));
+ p = dbt2->data;
+ memcpy(&blob_sid2, p, sizeof(db_seq_t));
+ p += sizeof(db_seq_t);
+ memcpy(&blob_id2, p, sizeof(db_seq_t));
+
+ if (blob_sid1 > blob_sid2)
+ return (1);
+
+ if (blob_sid1 < blob_sid2)
+ return (-1);
+
+ if (blob_id1 > blob_id2)
+ return (1);
+
+ if (blob_id1 < blob_id2)
+ return (-1);
+
+ return (0);
+}
+
+/*
+ * __rep_offset_cmp --
+ *
+ * Comparison function for duplicates in the blob gap database.
+ *
+ * Each duplicate data item is read as an off_t stored at the start of
+ * the DBT.  Returns 0 when the two offsets are equal, 1 when dbt1's
+ * offset is greater, and -1 when it is smaller.  The dbp and locp
+ * arguments are unused.
+ *
+ * NOTE(review): the DBT sizes are not checked here, so both items are
+ * assumed to hold at least sizeof(off_t) bytes -- confirm against the
+ * code that writes these items.
+ *
+ * PUBLIC: int __rep_offset_cmp
+ * PUBLIC: __P((DB *, const DBT *, const DBT *, size_t *));
+ */
+int
+__rep_offset_cmp(dbp, dbt1, dbt2, locp)
+ DB *dbp;
+ const DBT *dbt1, *dbt2;
+ size_t *locp;
+{
+ off_t offset1, offset2;
+
+ COMPQUIET(dbp, NULL);
+ COMPQUIET(locp, NULL);
+
+ /* Use memcpy here to prevent alignment issues. */
+ memcpy(&offset1, dbt1->data, sizeof(off_t));
+ memcpy(&offset2, dbt2->data, sizeof(off_t));
+
+ if (offset1 == offset2)
+ return (0);
+ else if (offset1 > offset2)
+ return (1);
+
+ return (-1);
+}
+
+/*
* __rep_abort_prepared --
* Abort any prepared transactions that recovery restored.
*
@@ -1684,7 +1947,10 @@ __rep_set_nsites_pp(dbenv, n)
"DB_ENV->rep_set_nsites: cannot call from Replication Manager application"));
return (EINVAL);
}
- if ((ret = __rep_set_nsites_int(env, n)) == 0)
+ ENV_ENTER(env, ip);
+ ret = __rep_set_nsites_int(env, n);
+ ENV_LEAVE(env, ip);
+ if (ret == 0)
APP_SET_BASEAPI(env);
return (ret);
}
@@ -1748,18 +2014,15 @@ __rep_get_nsites(dbenv, n)
}
/*
- * PUBLIC: int __rep_set_priority __P((DB_ENV *, u_int32_t));
+ * PUBLIC: int __rep_set_priority_pp __P((DB_ENV *, u_int32_t));
*/
int
-__rep_set_priority(dbenv, priority)
+__rep_set_priority_pp(dbenv, priority)
DB_ENV *dbenv;
u_int32_t priority;
{
DB_REP *db_rep;
ENV *env;
- REP *rep;
- u_int32_t prev;
- int ret;
env = dbenv->env;
db_rep = env->rep_handle;
@@ -1767,6 +2030,30 @@ __rep_set_priority(dbenv, priority)
ENV_NOT_CONFIGURED(
env, db_rep->region, "DB_ENV->rep_set_priority", DB_INIT_REP);
+ if (PREFMAS_IS_SET(env)) {
+ __db_errx(env, DB_STR_A("3710",
+"%s: cannot change priority in preferred master mode.",
+ "%s"), "DB_ENV->rep_set_priority");
+ return (EINVAL);
+ }
+
+ return (__rep_set_priority_int(env, priority));
+}
+
+/*
+ * PUBLIC: int __rep_set_priority_int __P((ENV *, u_int32_t));
+ */
+int
+__rep_set_priority_int(env, priority)
+ ENV *env;
+ u_int32_t priority;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ u_int32_t prev;
+ int ret;
+
+ db_rep = env->rep_handle;
ret = 0;
if (REP_ON(env)) {
rep = db_rep->region;
@@ -1807,10 +2094,10 @@ __rep_get_priority(dbenv, priority)
}
/*
- * PUBLIC: int __rep_set_timeout __P((DB_ENV *, int, db_timeout_t));
+ * PUBLIC: int __rep_set_timeout_pp __P((DB_ENV *, int, db_timeout_t));
*/
int
-__rep_set_timeout(dbenv, which, timeout)
+__rep_set_timeout_pp(dbenv, which, timeout)
DB_ENV *dbenv;
int which;
db_timeout_t timeout;
@@ -1818,13 +2105,10 @@ __rep_set_timeout(dbenv, which, timeout)
DB_REP *db_rep;
DB_THREAD_INFO *ip;
ENV *env;
- REP *rep;
int repmgr_timeout, ret;
env = dbenv->env;
db_rep = env->rep_handle;
- rep = db_rep->region;
- ret = 0;
repmgr_timeout = 0;
if (timeout == 0 && (which == DB_REP_CONNECTION_RETRY ||
@@ -1850,12 +2134,46 @@ __rep_set_timeout(dbenv, which, timeout)
return (EINVAL);
}
if (which == DB_REP_LEASE_TIMEOUT && IS_REP_STARTED(env)) {
- ret = EINVAL;
__db_errx(env, DB_STR_A("3568",
"%s: lease timeout must be set before DB_ENV->rep_start.",
"%s"), "DB_ENV->rep_set_timeout");
return (EINVAL);
}
+ if (PREFMAS_IS_SET(env) &&
+ (which == DB_REP_HEARTBEAT_MONITOR ||
+ which == DB_REP_HEARTBEAT_SEND) &&
+ timeout == 0) {
+ __db_errx(env, DB_STR_A("3711",
+"%s: cannot turn off heartbeat timeout in preferred master mode.",
+ "%s"), "DB_ENV->rep_set_timeout");
+ return (EINVAL);
+ }
+
+ ret = __rep_set_timeout_int(env, which, timeout);
+
+ /* Setting a repmgr timeout makes this a repmgr application */
+ if (ret == 0 && repmgr_timeout)
+ APP_SET_REPMGR(env);
+ return (ret);
+
+}
+
+/*
+ * PUBLIC: int __rep_set_timeout_int __P((ENV *, int, db_timeout_t));
+ */
+int
+__rep_set_timeout_int(env, which, timeout)
+ ENV *env;
+ int which;
+ db_timeout_t timeout;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ int ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ ret = 0;
switch (which) {
case DB_REP_CHECKPOINT_DELAY:
@@ -1888,6 +2206,7 @@ __rep_set_timeout(dbenv, which, timeout)
rep->ack_timeout = timeout;
else
db_rep->ack_timeout = timeout;
+ ADJUST_AUTOTAKEOVER_WAITS(db_rep, timeout);
break;
case DB_REP_CONNECTION_RETRY:
if (REP_ON(env))
@@ -1919,10 +2238,6 @@ __rep_set_timeout(dbenv, which, timeout)
"Unknown timeout type argument to DB_ENV->rep_set_timeout"));
ret = EINVAL;
}
-
- /* Setting a repmgr timeout makes this a repmgr application */
- if (ret == 0 && repmgr_timeout)
- APP_SET_REPMGR(env);
return (ret);
}
@@ -2099,6 +2414,144 @@ __rep_set_request(dbenv, min, max)
}
/*
+ * __rep_set_view --
+ * Set the view/partial replication function.
+ *
+ * PUBLIC: int __rep_set_view __P((DB_ENV *,
+ * PUBLIC: int (*)(DB_ENV *, const char *, int *, u_int32_t)));
+ */
+int
+__rep_set_view(dbenv, f_partial)
+	DB_ENV *dbenv;
+	int (*f_partial) __P((DB_ENV *, const char *, int *, u_int32_t));
+{
+	DB_REP *db_rep;
+	ENV *env;
+
+	env = dbenv->env;
+	db_rep = env->rep_handle;
+
+	/* Configuration checks, both reported as DB_ENV->rep_set_view. */
+	ENV_NOT_CONFIGURED(env,
+	    db_rep->region, "DB_ENV->rep_set_view", DB_INIT_REP);
+	ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->rep_set_view");
+
+	/* A NULL callback selects the default view function. */
+	db_rep->partial = f_partial != NULL ? f_partial : __rep_defview;
+	return (0);
+}
+
+/*
+ * __rep_defview --
+ *	Fallback view callback, installed by __rep_set_view when the
+ *	application supplies none.  It answers "replicate" for every name.
+ */
+static int
+__rep_defview(dbenv, name, result, flags)
+	DB_ENV *dbenv;
+	const char *name;
+	int *result;
+	u_int32_t flags;
+{
+	/* The answer never depends on the inputs. */
+	*result = 1;
+
+	/* The remaining arguments are not consulted. */
+	COMPQUIET(flags, 0);
+	COMPQUIET(name, NULL);
+	COMPQUIET(dbenv, NULL);
+	return (0);
+}
+
+/*
+ * __rep_call_partial --
+ * Calls the partial function, after doing some checks required for
+ * handling blobs.
+ *
+ * On return *result holds the callback's answer, 1 for the top level
+ * blob meta database, or 0 when the decision had to be deferred. A
+ * deferred decision pushes a new entry, carrying only the blob file id,
+ * onto the head of *lsp. Returns 0, or the error from the callback,
+ * from __blob_path_to_dir_ids or from __os_malloc.
+ *
+ * PUBLIC: int __rep_call_partial
+ * PUBLIC: __P((ENV *, const char *, int *, u_int32_t, DELAYED_BLOB_LIST **));
+ */
+int
+__rep_call_partial(env, name, result, flags, lsp)
+ ENV *env;
+ const char *name;
+ int *result;
+ u_int32_t flags;
+ DELAYED_BLOB_LIST **lsp;
+{
+ DB_LOG *dblp;
+ DB_REP *db_rep;
+ DELAYED_BLOB_LIST *dbl;
+ FNAME *fname;
+ db_seq_t blob_file_id;
+ char *file_name;
+ int ret;
+
+ ret = 0;
+ blob_file_id = 0;
+ db_rep = env->rep_handle;
+ dblp = env->lg_handle;
+ fname = NULL;
+
+ /*
+ * If the database being sent is a blob meta database or file, then the
+ * name of its associated database needs to be passed to the partial
+ * function. To do this, use the blob file id in the path to the
+ * file to look up the blob_file_id of the associated database. That
+ * can be used to look up the name of the associated database through
+ * dbreg.
+ *
+ * The default view function and every non-blob name skip that lookup
+ * and go straight to the callback with the name as given.
+ */
+ if (db_rep->partial == __rep_defview ||
+ (!IS_BLOB_META(name) && !IS_BLOB_FILE(name))) {
+ ret = db_rep->partial(env->dbenv, name, result, flags);
+ } else {
+ /*
+ * The top level blob meta database must always be replicated.
+ */
+ if (strcmp(name, BLOB_META_FILE_NAME) == 0) {
+ *result = 1;
+ return (ret); /* ret is still 0 here. */
+ }
+ if ((ret = __blob_path_to_dir_ids(
+ env, name, &blob_file_id, NULL)) != 0)
+ return (ret);
+ DB_ASSERT(env, blob_file_id > 0);
+
+ /*
+ * It is possible that the database that owns this blob meta
+ * database has not yet been processed on the client when
+ * processing the transaction, so assume it is not replicated.
+ * Return its information and process it later when its
+ * owning database is processed (which must happen in the
+ * same transaction).
+ *
+ * The new entry is zero-filled apart from blob_file_id and
+ * becomes the head of the caller's list.
+ */
+ if (__dbreg_blob_file_to_fname(
+ dblp, blob_file_id, 0, &fname) != 0) {
+ if ((ret = __os_malloc(
+ env, sizeof(DELAYED_BLOB_LIST), &dbl)) != 0)
+ return (ret);
+ memset(dbl, 0, sizeof(DELAYED_BLOB_LIST));
+ dbl->blob_file_id = blob_file_id;
+ if (*lsp == NULL)
+ *lsp = dbl;
+ else {
+ dbl->next = *lsp;
+ (*lsp)->prev = dbl;
+ *lsp = dbl;
+ }
+ *result = 0; /* Deferred: not replicated for now. */
+ return (0);
+ }
+
+ file_name = fname->fname_off == INVALID_ROFF ?
+ NULL : R_ADDR(&dblp->reginfo, fname->fname_off);
+ DB_ASSERT(env, file_name != NULL);
+ ret = db_rep->partial(env->dbenv, file_name, result, flags);
+ }
+
+ return (ret);
+}
+
+/*
* __rep_set_transport_pp --
* Set the transport function for replication.
*
@@ -2288,25 +2741,46 @@ __rep_set_clockskew(dbenv, fast_clock, slow_clock)
}
/*
- * __rep_flush --
+ * __rep_flush_pp --
* Re-push the last log record to all clients, in case they've lost
* messages and don't know it.
*
- * PUBLIC: int __rep_flush __P((DB_ENV *));
+ * PUBLIC: int __rep_flush_pp __P((DB_ENV *));
*/
int
-__rep_flush(dbenv)
+__rep_flush_pp (dbenv)
DB_ENV *dbenv;
{
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_ENTER(env, ip);
+ ret = __rep_flush_int(env);
+ ENV_LEAVE(env, ip);
+
+ return (ret);
+}
+
+/*
+ * __rep_flush_int --
+ * Re-push the last log record to all clients, in case they've lost
+ * messages and don't know it.
+ *
+ * PUBLIC: int __rep_flush_int __P((ENV *));
+ */
+int
+__rep_flush_int(env)
+ ENV *env;
+{
DBT rec;
DB_LOGC *logc;
DB_LSN lsn;
DB_REP *db_rep;
- DB_THREAD_INFO *ip;
- ENV *env;
int ret, t_ret;
- env = dbenv->env;
db_rep = env->rep_handle;
ENV_REQUIRES_CONFIG_XX(
@@ -2322,8 +2796,6 @@ __rep_flush(dbenv)
return (EINVAL);
}
- ENV_ENTER(env, ip);
-
if ((ret = __log_cursor(env, &logc)) != 0)
return (ret);
@@ -2338,7 +2810,6 @@ __rep_flush(dbenv)
err: if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
ret = t_ret;
- ENV_LEAVE(env, ip);
return (ret);
}
@@ -2693,7 +3164,7 @@ __rep_check_applied(env, ip, commit_info, reasonp)
*/
if (commit_info->gen == gen) {
ret = __rep_read_lsn_history(env,
- ip, &txn, &dbc, gen, &hist, reasonp, DB_SET);
+ ip, &txn, &dbc, gen, &hist, reasonp, DB_SET, 1);
if (ret == DB_NOTFOUND) {
/*
* We haven't yet received the LSN history of the
@@ -2720,7 +3191,7 @@ __rep_check_applied(env, ip, commit_info, reasonp)
* masters at the same gen, and the txn of interest was
* rolled back.
*/
- ret = DB_NOTFOUND;
+ ret = USR_ERR(env, DB_NOTFOUND);
goto out;
}
@@ -2750,7 +3221,7 @@ __rep_check_applied(env, ip, commit_info, reasonp)
* description of the txn of interest doesn't match what we see
* in the history available to us now.
*/
- ret = DB_NOTFOUND;
+ ret = USR_ERR(env, DB_NOTFOUND);
} else if (commit_info->gen < gen || gen == 0) {
/*
@@ -2759,10 +3230,10 @@ __rep_check_applied(env, ip, commit_info, reasonp)
* the token LSN is within the close/open range defined by
* [base,next).
*/
- ret = __rep_read_lsn_history(env,
- ip, &txn, &dbc, commit_info->gen, &hist, reasonp, DB_SET);
- t_ret = __rep_read_lsn_history(env,
- ip, &txn, &dbc, commit_info->gen, &hist2, reasonp, DB_NEXT);
+ ret = __rep_read_lsn_history(env, ip,
+ &txn, &dbc, commit_info->gen, &hist, reasonp, DB_SET, 1);
+ t_ret = __rep_read_lsn_history(env, ip,
+ &txn, &dbc, commit_info->gen, &hist2, reasonp, DB_NEXT, 1);
if (ret == DB_NOTFOUND) {
/*
* If the desired gen is not in our database, it could
@@ -2812,7 +3283,7 @@ __rep_check_applied(env, ip, commit_info, reasonp)
* don't match, meaning the txn was written at a dup
* master and that gen instance was rolled back.
*/
- ret = DB_NOTFOUND;
+ ret = USR_ERR(env, DB_NOTFOUND);
goto out;
}
@@ -2837,7 +3308,7 @@ __rep_check_applied(env, ip, commit_info, reasonp)
LOG_COMPARE(&commit_info->lsn, &hist2.lsn) < 0)
ret = 0;
else
- ret = DB_NOTFOUND;
+ ret = USR_ERR(env, DB_NOTFOUND);
} else {
/*
* Token names a future gen. If we're a client and the LSN also
@@ -2851,7 +3322,7 @@ __rep_check_applied(env, ip, commit_info, reasonp)
reasonp->u.gen = commit_info->gen;
return (DB_TIMEOUT);
}
- return (DB_NOTFOUND);
+ return (USR_ERR(env, DB_NOTFOUND));
}
out:
@@ -2867,9 +3338,19 @@ out:
/*
* The txn and dbc handles are owned by caller, though we create them if
* necessary. Caller is responsible for closing them.
+ *
+ * The use_cache option is enabled for the read-your-writes feature, which
+ * makes frequent requests for the cached information (envid and lsn) when it
+ * is in use. Callers that require information that is not cached (e.g.
+ * timestamp) should not set use_cache.
+ *
+ * PUBLIC: int __rep_read_lsn_history __P((ENV *, DB_THREAD_INFO *, DB_TXN **,
+ * PUBLIC: DBC **, u_int32_t, __rep_lsn_hist_data_args *,
+ * PUBLIC: struct rep_waitgoal *, u_int32_t, int));
*/
-static int
-__rep_read_lsn_history(env, ip, txn, dbc, gen, gen_infop, reasonp, flags)
+int
+__rep_read_lsn_history(env, ip, txn, dbc, gen, gen_infop, reasonp, flags,
+ use_cache)
ENV *env;
DB_THREAD_INFO *ip;
DB_TXN **txn;
@@ -2878,6 +3359,7 @@ __rep_read_lsn_history(env, ip, txn, dbc, gen, gen_infop, reasonp, flags)
__rep_lsn_hist_data_args *gen_infop;
struct rep_waitgoal *reasonp;
u_int32_t flags;
+ int use_cache;
{
DB_REP *db_rep;
REP *rep;
@@ -2898,7 +3380,8 @@ __rep_read_lsn_history(env, ip, txn, dbc, gen, gen_infop, reasonp, flags)
/* Simply return cached info, if we already have it. */
desired_gen = flags == DB_SET ? gen : gen + 1;
REP_SYSTEM_LOCK(env);
- if (rep->gen == desired_gen && !IS_ZERO_LSN(rep->gen_base_lsn)) {
+ if (use_cache && rep->gen == desired_gen &&
+ !IS_ZERO_LSN(rep->gen_base_lsn)) {
gen_infop->lsn = rep->gen_base_lsn;
gen_infop->envid = rep->master_envid;
goto unlock;
@@ -3005,8 +3488,14 @@ __rep_conv_vers(env, log_ver)
/*
* We can't use a switch statement, some of the DB_LOGVERSION_XX
- * constants are the same
+ * constants are the same.
*/
+ if (log_ver == DB_LOGVERSION_61)
+ return (DB_REPVERSION_61);
+ if (log_ver == DB_LOGVERSION_60p1)
+ return (DB_REPVERSION_60);
+ if (log_ver == DB_LOGVERSION_60)
+ return (DB_REPVERSION_60);
if (log_ver == DB_LOGVERSION_53)
return (DB_REPVERSION_53);
if (log_ver == DB_LOGVERSION_52)
diff --git a/src/rep/rep_record.c b/src/rep/rep_record.c
index f4691974..b206e60e 100644
--- a/src/rep/rep_record.c
+++ b/src/rep/rep_record.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -9,13 +9,17 @@
#include "db_config.h"
#include "db_int.h"
+#include "dbinc/blob.h"
#include "dbinc/db_page.h"
#include "dbinc/db_am.h"
#include "dbinc/lock.h"
#include "dbinc/mp.h"
#include "dbinc/txn.h"
-static int __rep_collect_txn __P((ENV *, DB_LSN *, LSN_COLLECTION *));
+static int __rep_collect_txn
+ __P((ENV *, DB_LSN *, LSN_COLLECTION *, DELAYED_BLOB_LIST **));
+static int __rep_remove_delayed_blobs
+ __P((ENV *, db_seq_t, u_int32_t ,DELAYED_BLOB_LIST **));
static int __rep_do_ckp __P((ENV *, DBT *, __rep_control_args *));
static int __rep_fire_newmaster __P((ENV *, u_int32_t, int));
static int __rep_fire_startupdone __P((ENV *, u_int32_t, int));
@@ -153,6 +157,7 @@ __rep_process_message_pp(dbenv, control, rec, eid, ret_lsnp)
DB_LSN *ret_lsnp;
{
ENV *env;
+ DB_THREAD_INFO *ip;
int ret;
env = dbenv->env;
@@ -193,7 +198,9 @@ __rep_process_message_pp(dbenv, control, rec, eid, ret_lsnp)
return (ret);
}
+ ENV_ENTER(env, ip);
ret = __rep_process_message_int(env, control, rec, eid, ret_lsnp);
+ ENV_LEAVE(env, ip);
__dbt_userfree(env, control, rec, NULL);
return (ret);
@@ -289,8 +296,7 @@ __rep_process_message_int(env, control, rec, eid, ret_lsnp)
if (ret_lsnp != NULL)
ZERO_LSN(*ret_lsnp);
- ENV_ENTER(env, ip);
-
+ ENV_GET_THREAD_INFO(env, ip);
REP_PRINT_MESSAGE(env, eid, rp, "rep_process_message", 0);
/*
* Check the version number for both rep and log. If it is
@@ -303,8 +309,7 @@ __rep_process_message_int(env, control, rec, eid, ret_lsnp)
"%lu %d"), (u_long)rp->rep_version,
DB_REPVERSION_MIN);
- ret = EINVAL;
- goto errlock;
+ return (EINVAL);
}
VPRINT(env, (env, DB_VERB_REP_MSGS,
"Received record %lu with old rep version %lu",
@@ -322,8 +327,7 @@ __rep_process_message_int(env, control, rec, eid, ret_lsnp)
__db_errx(env, DB_STR_A("3517",
"unexpected replication message version %lu, expected %d",
"%lu %d"), (u_long)rp->rep_version, DB_REPVERSION);
- ret = EINVAL;
- goto errlock;
+ return (EINVAL);
}
if (rp->log_version < DB_LOGVERSION) {
@@ -332,8 +336,7 @@ __rep_process_message_int(env, control, rec, eid, ret_lsnp)
"unsupported old replication log version %lu, minimum version %d",
"%lu %d"), (u_long)rp->log_version,
DB_LOGVERSION_MIN);
- ret = EINVAL;
- goto errlock;
+ return (EINVAL);
}
VPRINT(env, (env, DB_VERB_REP_MSGS,
"Received record %lu with old log version %lu",
@@ -342,8 +345,7 @@ __rep_process_message_int(env, control, rec, eid, ret_lsnp)
__db_errx(env, DB_STR_A("3519",
"unexpected log record version %lu, expected %d",
"%lu %d"), (u_long)rp->log_version, DB_LOGVERSION);
- ret = EINVAL;
- goto errlock;
+ return (EINVAL);
}
/*
@@ -465,9 +467,14 @@ __rep_process_message_int(env, control, rec, eid, ret_lsnp)
* accept the generation number and participate in future
* elections and communication. Otherwise, I need to hear about
* a new master and sync up.
+ *
+ * But do not do any of this if REP_F_HOLD_GEN is set. In
+ * this case we keep the site at its current gen until we
+ * clear this flag.
*/
- if (rp->rectype == REP_ALIVE ||
- rp->rectype == REP_VOTE1 || rp->rectype == REP_VOTE2) {
+ if ((rp->rectype == REP_ALIVE ||
+ rp->rectype == REP_VOTE1 || rp->rectype == REP_VOTE2) &&
+ !F_ISSET(rep, REP_F_HOLD_GEN)) {
REP_SYSTEM_LOCK(env);
RPRINT(env, (env, DB_VERB_REP_MSGS,
"Updating gen from %lu to %lu",
@@ -593,6 +600,38 @@ __rep_process_message_int(env, control, rec, eid, ret_lsnp)
ret = __rep_allreq(env, rp, eid);
CLIENT_REREQ;
break;
+ case REP_BLOB_ALL_REQ:
+ /* Blobs do not support peer-to-peer. */
+ RECOVERING_SKIP;
+ MASTER_ONLY(rep, rp);
+ ret = __rep_blob_allreq(env, eid, rec);
+ CLIENT_REREQ;
+ break;
+ case REP_BLOB_CHUNK:
+ /* Handle even if in recovery. */
+ CLIENT_ONLY(rep, rp);
+ ret = __rep_blob_chunk(env, eid, ip, rec);
+ if (ret == DB_REP_PAGEDONE)
+ ret = 0;
+ break;
+ case REP_BLOB_CHUNK_REQ:
+ /* Blobs do not support peer-to-peer. */
+ RECOVERING_SKIP;
+ MASTER_ONLY(rep, rp);
+ ret = __rep_blob_chunk_req(env, eid, rec);
+ CLIENT_REREQ;
+ break;
+ case REP_BLOB_UPDATE:
+ CLIENT_ONLY(rep, rp);
+ ret = __rep_blob_update(env, eid, ip, rec);
+ break;
+ case REP_BLOB_UPDATE_REQ:
+ MASTER_ONLY(rep, rp);
+ infop = env->reginfo;
+ renv = infop->primary;
+ MASTER_UPDATE(env, renv);
+ ret = __rep_blob_update_req(env, ip, rec);
+ break;
case REP_BULK_LOG:
RECOVERING_LOG_SKIP;
CLIENT_ONLY(rep, rp);
@@ -1059,8 +1098,6 @@ out:
*ret_lsnp = rp->lsn;
ret = DB_REP_NOTPERM;
}
- __dbt_userfree(env, control, rec, NULL);
- ENV_LEAVE(env, ip);
return (ret);
}
@@ -1290,8 +1327,24 @@ gap_check:
#endif
}
- if (ret == DB_KEYEXIST)
+ if (ret == DB_KEYEXIST) {
+ STAT(rep->stat.st_log_duplicated++);
+#ifdef CONFIG_TEST
+ STAT(rep->stat.st_log_futuredup++);
+#endif
+ if (is_dupp != NULL) {
+ *is_dupp = 1;
+ /*
+ * Could get overwritten by max_lsn later,
+ * but only when returning NOTPERM for a
+ * REPCTL_PERM record, in which case max_lsn
+ * is this log record.
+ */
+ if (ret_lsnp != NULL)
+ *ret_lsnp = lp->ready_lsn;
+ }
ret = 0;
+ }
if (ret != 0 && ret != ENOMEM)
goto done;
@@ -1337,10 +1390,11 @@ gap_check:
* But max_lsn is guaranteed <= ready_lsn, so
* it would be a more conservative LSN to return.
*/
- *ret_lsnp = lp->ready_lsn;
+ if (ret_lsnp != NULL)
+ *ret_lsnp = lp->ready_lsn;
}
LOGCOPY_32(env, &rectype, rec->data);
- if (rectype == DB___txn_regop || rectype == DB___txn_ckp)
+ if (IS_PERM_RECTYPE(rectype))
max_lsn = lp->max_perm_lsn;
/*
* We check REPCTL_LEASE here, because this client may
@@ -1536,6 +1590,7 @@ __rep_process_txn(env, rec)
DB_REP *db_rep;
DB_THREAD_INFO *ip;
DB_TXNHEAD *txninfo;
+ DELAYED_BLOB_LIST *dblp, *dummy;
LSN_COLLECTION lc;
REP *rep;
__txn_regop_args *txn_args;
@@ -1548,12 +1603,12 @@ __rep_process_txn(env, rec)
db_rep = env->rep_handle;
rep = db_rep->region;
logc = NULL;
+ dblp = dummy = NULL;
txn_args = NULL;
txn42_args = NULL;
prep_args = NULL;
txninfo = NULL;
- ENV_ENTER(env, ip);
memset(&data_dbt, 0, sizeof(data_dbt));
if (F_ISSET(env, ENV_THREAD))
F_SET(&data_dbt, DB_DBT_REALLOC);
@@ -1618,8 +1673,19 @@ __rep_process_txn(env, rec)
goto err;
/* Phase 1. Get a list of the LSNs in this transaction, and sort it. */
- if ((ret = __rep_collect_txn(env, &prev_lsn, &lc)) != 0)
+ if ((ret = __rep_collect_txn(env, &prev_lsn, &lc, &dblp)) != 0)
goto err;
+ /* Deal with any child transactions that had to be delayed. */
+ while (dblp != NULL) {
+ if ((ret = __rep_collect_txn(
+ env, &dblp->lsn, &lc, &dummy)) != 0)
+ goto err;
+ DB_ASSERT(env, dummy == NULL);
+ dummy = dblp;
+ dblp = dummy->next;
+ __os_free(env, dummy);
+ dummy = NULL;
+ }
qsort(lc.array, lc.nlsns, sizeof(DB_LSN), __rep_lsn_cmp);
/*
@@ -1627,6 +1693,7 @@ __rep_process_txn(env, rec)
* records. Create a txnlist so that they can keep track of file
* state between records.
*/
+ ENV_GET_THREAD_INFO(env, ip);
if ((ret = __db_txnlist_init(env, ip, 0, 0, NULL, &txninfo)) != 0)
goto err;
@@ -1647,6 +1714,7 @@ __rep_process_txn(env, rec)
(u_long)lsnp->file, (u_long)lsnp->offset);
goto err;
}
+ LOGCOPY_32(env, &rectype, data_dbt.data);
}
err: memset(&req, 0, sizeof(req));
@@ -1658,6 +1726,12 @@ err: memset(&req, 0, sizeof(req));
if ((t_ret = __lock_id_free(env, locker)) != 0 && ret == 0)
ret = t_ret;
+ while (dblp != NULL) {
+ dummy = dblp;
+ dblp = dummy->next;
+ __os_free(env, dummy);
+ }
+
err1: if (txn_args != NULL)
__os_free(env, txn_args);
if (txn42_args != NULL)
@@ -1694,25 +1768,52 @@ err1: if (txn_args != NULL)
* the entire transaction family at once.
*/
static int
-__rep_collect_txn(env, lsnp, lc)
+__rep_collect_txn(env, lsnp, lc, dbl)
ENV *env;
DB_LSN *lsnp;
LSN_COLLECTION *lc;
+ DELAYED_BLOB_LIST **dbl;
{
+ __dbreg_register_args *dbregargp;
__txn_child_args *argp;
DB_LOGC *logc;
DB_LSN c_lsn;
+ DB_REP *db_rep;
DBT data;
- u_int32_t rectype;
+ db_seq_t blob_file_id;
+ u_int32_t child, rectype, skip_txnid;
u_int nalloc;
- int ret, t_ret;
+ int ret, t_ret, view_partial;
+ char *name;
memset(&data, 0, sizeof(data));
F_SET(&data, DB_DBT_REALLOC);
+ skip_txnid = TXN_INVALID;
if ((ret = __log_cursor(env, &logc)) != 0)
return (ret);
+ /*
+ * For partial replication we assume a certain sequence of
+ * log records to detect a database create and skip it if
+ * desired. We are walking backward through the records of
+ * a single transaction right now.
+ *
+ * A create operation is done inside a BDB-owned child txn.
+ * Nothing else is done within this BDB-owned child txn.
+ * The last piece of a create operation is the dbreg_register
+ * log record that records the opening of the file. That
+ * log record contains the child txnid in the 'id' field, and
+ * the file name. At this point we invoke the partial callback
+ * to determine if this database should be replicated. If it
+ * should not be replicated, we need to avoid collecting the
+ * entire child txn referenced in the 'id' field.
+ *
+ * So if processing the dbreg_register record finds a database
+ * to skip, we store the child txnid in 'skip_txnid'. We use
+ * 'skip_txnid' to avoid processing log records or making
+ * recursive calls for that txnid.
+ */
while (!IS_ZERO_LSN(*lsnp) &&
(ret = __logc_get(logc, lsnp, &data, DB_SET)) == 0) {
LOGCOPY_32(env, &rectype, data.data);
@@ -1722,9 +1823,66 @@ __rep_collect_txn(env, lsnp, lc)
goto err;
c_lsn = argp->c_lsn;
*lsnp = argp->prev_lsn;
+ child = argp->child;
__os_free(env, argp);
- ret = __rep_collect_txn(env, &c_lsn, lc);
- } else {
+
+ if (child == skip_txnid && *dbl != NULL &&
+ (*dbl)->child == child)
+ (*dbl)->lsn = c_lsn;
+ /*
+ * If skip_txnid is set, it is the id of the child txnid
+ * that creates a database we should skip. So, if
+ * this is that child txn, do not collect it.
+ */
+ if (skip_txnid == TXN_INVALID || child != skip_txnid)
+ ret = __rep_collect_txn(env, &c_lsn, lc, dbl);
+ } else if (IS_VIEW_SITE(env) &&
+ rectype == DB___dbreg_register) {
+ db_rep = env->rep_handle;
+ /*
+ * If we are a view see if this is a file creation
+ * stream. On-disk files have the creating child txn
+ * in the 'id' field and the name. See if this view
+ * wants this file.
+ */
+ if ((ret = __dbreg_register_read(
+ env, data.data, &dbregargp)) != 0)
+ goto err;
+ child = dbregargp->id;
+ name = (char *)dbregargp->name.data;
+ skip_txnid = TXN_INVALID;
+ if (child != TXN_INVALID &&
+ (!IS_DB_FILE(name) || IS_BLOB_META(name))) {
+ /*
+ * The 'id' has a child txn so it is a create.
+ */
+ DB_ASSERT(env, db_rep->partial != NULL);
+ GET_LO_HI(env, dbregargp->blob_fid_lo,
+ dbregargp->blob_fid_hi, blob_file_id, ret);
+ if (ret != 0)
+ goto err;
+ if ((ret = __rep_call_partial(env,
+ name, &view_partial, 0, dbl)) != 0) {
+ VPRINT(env, (env, DB_VERB_REP_MISC,
+ "rep_collect_txn: partial cb err %d for %s", ret, name));
+ __os_free(env, dbregargp);
+ goto err;
+ }
+ /*
+ * Save the child txnid for when we walk back
+ * into the txn_child record.
+ */
+ if (view_partial == 0) {
+ skip_txnid = child;
+ if ((ret =
+ __rep_remove_delayed_blobs(env,
+ blob_file_id, child, dbl)) != 0)
+ goto err;
+ }
+ }
+ __os_free(env, dbregargp);
+ }
+ if (rectype != DB___txn_child) {
if (lc->nalloc < lc->nlsns + 1) {
nalloc = lc->nalloc == 0 ? 20 : lc->nalloc * 2;
if ((ret = __os_realloc(env,
@@ -1761,6 +1919,62 @@ err: if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
}
/*
+ * __rep_remove_delayed_blobs --
+ *
+ * If a blob meta database is opened in the same transaction as the database
+ * that owns it, then deciding whether it should be replicated or not needs
+ * to be delayed until after the rest of the transaction is processed. To do
+ * this, the transaction's information is added to a DELAYED_BLOB_LIST. When
+ * the owning database is processed, if it is not replicated then remove the
+ * entry of its blob meta database from the delayed list.
+ */
+static int
+__rep_remove_delayed_blobs(env, blob_file_id, child, dbl)
+	ENV *env;
+	db_seq_t blob_file_id;
+	u_int32_t child;
+	DELAYED_BLOB_LIST **dbl;
+{
+	DELAYED_BLOB_LIST *after, *before, *cur, *head;
+
+	/* Nothing has been delayed. */
+	if ((head = *dbl) == NULL)
+		return (0);
+
+	/*
+	 * A head entry without a child transaction was only just added to
+	 * the list: record the child on it and stop.
+	 */
+	if (head->child == 0) {
+		head->child = child;
+		return (0);
+	}
+
+	/* Without a blob file id there is nothing to match against. */
+	if (blob_file_id == 0)
+		return (0);
+
+	/*
+	 * The owning database is not replicated, so its blob meta database
+	 * must not be either.  Unlink and free the first entry that names
+	 * this blob file id under a different child transaction, so it is
+	 * not processed later.
+	 */
+	for (cur = head; cur != NULL; cur = (DELAYED_BLOB_LIST *)cur->next) {
+		if (cur->blob_file_id != blob_file_id || cur->child == child)
+			continue;
+
+		after = (DELAYED_BLOB_LIST *)cur->next;
+		before = (DELAYED_BLOB_LIST *)cur->prev;
+		if (cur == head)
+			*dbl = after;
+		if (before != NULL)
+			before->next = cur->next;
+		if (after != NULL)
+			after->prev = cur->prev;
+		__os_free(env, cur);
+		return (0);
+	}
+	return (0);
+}
+
+/*
* __rep_lsn_cmp --
* qsort-type-compatible wrapper for LOG_COMPARE.
*/
@@ -2138,9 +2352,13 @@ __rep_process_rec(env, ip, rp, rec, ret_tsp, ret_lsnp)
ret = __rep_process_txn(env, rec);
} while (ret == DB_LOCK_DEADLOCK || ret == DB_LOCK_NOTGRANTED);
- /* Now flush the log unless we're running TXN_NOSYNC. */
- if (ret == 0 && !F_ISSET(env->dbenv, DB_ENV_TXN_NOSYNC))
- ret = __log_flush(env, NULL);
+ /* Now write/flush the log as appropriate. */
+ if (ret == 0) {
+ if (F_ISSET(env->dbenv, DB_ENV_TXN_WRITE_NOSYNC))
+ ret = __log_rep_write(env);
+ else if (!F_ISSET(env->dbenv, DB_ENV_TXN_NOSYNC))
+ ret = __log_flush(env, NULL);
+ }
if (ret != 0) {
__db_errx(env, DB_STR_A("3526",
"Error processing txn [%lu][%lu]", "%lu %lu"),
@@ -2256,7 +2474,7 @@ __rep_resend_req(env, rereq)
DB_REP *db_rep;
LOG *lp;
REP *rep;
- int master, ret;
+ int blob_sync, master, ret;
repsync_t sync_state;
u_int32_t gapflags, msgtype, repflags, sendflags;
@@ -2271,6 +2489,7 @@ __rep_resend_req(env, rereq)
repflags = rep->flags;
sync_state = rep->sync_state;
+ blob_sync = rep->blob_sync;
/*
* If we are delayed we do not rerequest anything.
*/
@@ -2293,9 +2512,17 @@ __rep_resend_req(env, rereq)
*/
msgtype = REP_UPDATE_REQ;
} else if (sync_state == SYNC_PAGE) {
- REP_SYSTEM_LOCK(env);
- ret = __rep_pggap_req(env, rep, NULL, gapflags);
- REP_SYSTEM_UNLOCK(env);
+ if (blob_sync == 0) {
+ REP_SYSTEM_LOCK(env);
+ ret = __rep_pggap_req(env, rep, NULL, gapflags);
+ REP_SYSTEM_UNLOCK(env);
+ } else {
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ REP_SYSTEM_LOCK(env);
+ ret = __rep_blob_rereq(env, rep);
+ REP_SYSTEM_UNLOCK(env);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ }
} else {
MUTEX_LOCK(env, rep->mtx_clientdb);
ret = __rep_loggap_req(env, rep, NULL, gapflags);
@@ -2397,9 +2624,20 @@ __rep_skip_msg(env, rep, eid, rectype)
if (rep->master_id == DB_EID_INVALID) /* Case 1. */
(void)__rep_send_message(env,
DB_EID_BROADCAST, REP_MASTER_REQ, NULL, NULL, 0, 0);
- else if (eid == rep->master_id) /* Case 2. */
- ret = __rep_resend_req(env, 0);
- else if (F_ISSET(rep, REP_F_CLIENT)) /* Case 3. */
+ else if (eid == rep->master_id) { /* Case 2. */
+ /*
+ * When we receive log messages in the SYNC_PAGE stage
+ * and we decide to rerequest, it often means the pages
+ * we expect have been dropped. Send a rerequest with
+ * gapflags for better performance.
+ */
+ if ((rectype == REP_LOG || rectype == REP_BULK_LOG ||
+ rectype == REP_LOG_MORE) &&
+ rep->sync_state == SYNC_PAGE)
+ ret = __rep_resend_req(env, 1);
+ else
+ ret = __rep_resend_req(env, 0);
+ } else if (F_ISSET(rep, REP_F_CLIENT)) /* Case 3. */
(void)__rep_send_message(env,
eid, REP_REREQUEST, NULL, NULL, 0, 0);
}
@@ -2421,7 +2659,6 @@ __rep_check_missing(env, gen, master_perm_lsn)
DB_LOG *dblp;
DB_LSN *end_lsn;
DB_REP *db_rep;
- DB_THREAD_INFO *ip;
LOG *lp;
REGINFO *infop;
REP *rep;
@@ -2434,7 +2671,6 @@ __rep_check_missing(env, gen, master_perm_lsn)
infop = env->reginfo;
has_log_gap = has_page_gap = ret = 0;
- ENV_ENTER(env, ip);
MUTEX_LOCK(env, rep->mtx_clientdb);
REP_SYSTEM_LOCK(env);
/*
@@ -2518,8 +2754,7 @@ __rep_check_missing(env, gen, master_perm_lsn)
rep->msg_th--;
REP_SYSTEM_UNLOCK(env);
-out: ENV_LEAVE(env, ip);
- return (ret);
+out: return (ret);
}
static int
diff --git a/src/rep/rep_region.c b/src/rep/rep_region.c
index f1d69dff..72372bff 100644
--- a/src/rep/rep_region.c
+++ b/src/rep/rep_region.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -14,6 +14,8 @@
static int __rep_egen_init __P((ENV *, REP *));
static int __rep_gen_init __P((ENV *, REP *));
+static int __rep_view_init __P((ENV *, REP *));
+static int __rep_viewfile_exists __P((ENV *, int *));
/*
* __rep_open --
@@ -29,7 +31,7 @@ __rep_open(env)
REGENV *renv;
REGINFO *infop;
REP *rep;
- int i, ret;
+ int i, ret, view;
char *p;
char fname[sizeof(REP_DIAGNAME) + 3];
@@ -37,10 +39,15 @@ __rep_open(env)
infop = env->reginfo;
renv = infop->primary;
ret = 0;
+ view = 0;
DB_ASSERT(env, DBREP_DIAG_FILES < 100);
if (renv->rep_off == INVALID_ROFF) {
- /* Must create the region. */
+ /*
+ * Must create the region. This environment either is being
+ * created for the first time or has just had its regions
+ * cleared by a recovery.
+ */
if ((ret = __env_alloc(infop, sizeof(REP), &rep)) != 0)
return (ret);
memset(rep, 0, sizeof(*rep));
@@ -108,6 +115,23 @@ __rep_open(env)
return (ret);
if ((ret = __rep_egen_init(env, rep)) != 0)
return (ret);
+ /*
+ * Determine if this is a view site or not. It is a view
+ * if the callback is set. If the site was a view in the
+ * past, we mark it as a view, but will check consistency
+ * later when starting replication.
+ */
+ if (db_rep->partial != NULL) {
+ rep->stat.st_view = 1;
+ if ((ret = __rep_view_init(env, rep)) != 0)
+ return (ret);
+ } else {
+ if ((ret = __rep_viewfile_exists(env, &view)) != 0)
+ return (ret);
+ if (view)
+ rep->stat.st_view = 1;
+ }
+
rep->gbytes = db_rep->gbytes;
rep->bytes = db_rep->bytes;
rep->request_gap = db_rep->request_gap;
@@ -157,6 +181,32 @@ __rep_open(env)
"process joining the environment"));
return (EINVAL);
}
+ /*
+ * If we are joining an existing environment and we
+ * have a view callback set, then the environment must
+ * already be a view. If not, error.
+ *
+ * The other mismatch is not an error here (no callback
+ * set, but environment is a view) because we may be a
+ * rep unaware process such as db_stat and that is allowed
+ * to proceed. There is additional checking in other rep
+ * functions like rep_start to confirm consistency before
+ * using replication.
+ */
+ if (db_rep->partial != NULL) {
+ if ((ret = __rep_viewfile_exists(env, &view)) != 0)
+ return (ret);
+ /*
+ * If there is a callback, and we are not in-memory,
+ * there better be a view system file too.
+ */
+ if (view == 0 && !FLD_ISSET(rep->config, REP_C_INMEM)) {
+ __db_errx(env, DB_STR("3688",
+ "Application environment and view mismatch "
+ "joining the environment"));
+ return (EINVAL);
+ }
+ }
#ifdef HAVE_REPLICATION_THREADS
if ((ret = __repmgr_join(env, rep)) != 0)
return (ret);
@@ -506,9 +556,8 @@ __rep_write_egen(env, rep, egen)
* If running in-memory replication, return without any file
* operations.
*/
- if (FLD_ISSET(rep->config, REP_C_INMEM)) {
+ if (FLD_ISSET(rep->config, REP_C_INMEM))
return (0);
- }
if ((ret = __db_appname(env,
DB_APP_META, REP_EGENNAME, NULL, &p)) != 0)
@@ -591,9 +640,8 @@ __rep_write_gen(env, rep, gen)
* If running in-memory replication, return without any file
* operations.
*/
- if (FLD_ISSET(rep->config, REP_C_INMEM)) {
+ if (FLD_ISSET(rep->config, REP_C_INMEM))
return (0);
- }
if ((ret = __db_appname(env,
DB_APP_META, REP_GENNAME, NULL, &p)) != 0)
@@ -608,3 +656,105 @@ __rep_write_gen(env, rep, gen)
__os_free(env, p);
return (ret);
}
+
+/*
+ * __rep_view_init --
+ *	Make sure the permanent view marker file exists.  The file has no
+ *	content; its presence alone records that this site is a view.
+ */
+static int
+__rep_view_init(env, rep)
+	ENV *env;
+	REP *rep;
+{
+	DB_FH *fhp;
+	char *path;
+	int ret;
+
+	/* In-memory replication performs no file operations at all. */
+	if (FLD_ISSET(rep->config, REP_C_INMEM))
+		return (0);
+
+	ret = __db_appname(env, DB_APP_META, REPVIEW, NULL, &path);
+	if (ret != 0)
+		return (ret);
+
+	/*
+	 * Only a missing file needs work: open it with the create flag and
+	 * close it again.  An existing file is left untouched.
+	 */
+	if (__os_exists(env, path, NULL) != 0) {
+		RPRINT(env,
+		    (env, DB_VERB_REP_MISC, "View init: Create %s", path));
+		ret = __os_open(env, path, 0,
+		    DB_OSO_CREATE | DB_OSO_TRUNC, DB_MODE_600, &fhp);
+		if (ret == 0)
+			(void)__os_closehandle(env, fhp);
+	}
+
+	__os_free(env, path);
+	return (ret);
+}
+
+/*
+ * __rep_check_view --
+ *	Verify that the recorded view state and the db_rep handle agree: a
+ *	partial callback must be set when the site is recorded as a view and
+ *	must not be set when it is not.  Returns EINVAL on a mismatch.
+ *
+ * PUBLIC: int __rep_check_view __P((ENV *));
+ */
+int
+__rep_check_view(env)
+	ENV *env;
+{
+	DB_REP *db_rep;
+	REP *rep;
+	int exist, have_cb, ret;
+
+	db_rep = env->rep_handle;
+	rep = db_rep->region;
+
+	/*
+	 * In-memory replication has no view file to look for; the only
+	 * record is the st_view field kept in the region.
+	 */
+	if (FLD_ISSET(rep->config, REP_C_INMEM)) {
+		exist = (int)rep->stat.st_view;
+	} else {
+		ret = __rep_viewfile_exists(env, &exist);
+		if (ret != 0)
+			return (ret);
+	}
+
+	have_cb = db_rep->partial != NULL;
+	RPRINT(env, (env, DB_VERB_REP_MISC, "Check view. Exist %d, cb %d",
+	    exist, have_cb));
+
+	/* No view record, yet a callback is configured. */
+	if (exist == 0 && have_cb)
+		return (EINVAL);
+	/* View record present, yet no callback is configured. */
+	if (exist == 1 && !have_cb)
+		return (EINVAL);
+	return (0);
+}
+
+/*
+ * __rep_viewfile_exists --
+ *	Set *existp to 1 when the REPVIEW file is found and to 0 otherwise.
+ *	Only a failure to build the file's path is returned as an error.
+ */
+static int
+__rep_viewfile_exists(env, existp)
+	ENV *env;
+	int *existp;
+{
+	char *path;
+	int ret;
+
+	/* Report "absent" unless the lookup below finds the file. */
+	*existp = 0;
+
+	ret = __db_appname(env, DB_APP_META, REPVIEW, NULL, &path);
+	if (ret != 0)
+		return (ret);
+
+	*existp = __os_exists(env, path, NULL) == 0 ? 1 : 0;
+	__os_free(env, path);
+	return (0);
+}
diff --git a/src/rep/rep_stat.c b/src/rep/rep_stat.c
index addfee25..ffb9f262 100644
--- a/src/rep/rep_stat.c
+++ b/src/rep/rep_stat.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -73,6 +73,13 @@ static const char *__rep_syncstate_to_string __P((repsync_t));
} \
} while (0)
+/*
+ * PRINT_VIEW --
+ *	Print one message saying whether the st_view statistic marks this
+ *	environment as a view site.  Expects 'env' in the caller's scope.
+ */
+#define	PRINT_VIEW(sp) do {						\
+	if ((sp)->st_view == 0)						\
+		__db_msg(env,						\
+		    "Environment not configured as view site");		\
+	else								\
+		__db_msg(env, "Environment configured as view site");	\
+} while (0)
+
/*
* __rep_stat_pp --
* ENV->rep_stat pre/post processing.
@@ -120,7 +127,7 @@ __rep_stat(env, statp, flags)
DB_REP_STAT *stats;
LOG *lp;
REP *rep;
- u_int32_t startupdone;
+ u_int32_t startupdone, view;
uintmax_t queued;
int dolock, ret;
@@ -177,10 +184,12 @@ __rep_stat(env, statp, flags)
if (LF_ISSET(DB_STAT_CLEAR)) {
queued = rep->stat.st_log_queued;
startupdone = rep->stat.st_startup_complete;
+ view = rep->stat.st_view;
memset(&rep->stat, 0, sizeof(rep->stat));
rep->stat.st_log_queued = rep->stat.st_log_queued_total =
rep->stat.st_log_queued_max = queued;
rep->stat.st_startup_complete = startupdone;
+ rep->stat.st_view = view;
}
/*
@@ -377,6 +386,7 @@ __rep_print_stats(env, flags)
__db_dl(env, "Number of page records missed and requested",
(u_long)sp->st_pg_requested);
PRINT_STARTUPCOMPLETE(sp);
+ PRINT_VIEW(sp);
__db_dl(env,
"Number of transactions applied", (u_long)sp->st_txns_applied);
@@ -462,16 +472,20 @@ __rep_print_all(env, flags)
u_int32_t flags;
{
static const FN rep_cfn[] = {
- { REP_C_2SITE_STRICT, "REP_C_2SITE_STRICT" },
- { REP_C_AUTOINIT, "REP_C_AUTOINIT" },
- { REP_C_AUTOROLLBACK, "REP_C_AUTOROLLBACK" },
- { REP_C_BULK, "REP_C_BULK" },
- { REP_C_DELAYCLIENT, "REP_C_DELAYCLIENT" },
- { REP_C_ELECTIONS, "REP_C_ELECTIONS" },
- { REP_C_INMEM, "REP_C_INMEM" },
- { REP_C_LEASE, "REP_C_LEASE" },
- { REP_C_NOWAIT, "REP_C_NOWAIT" },
- { 0, NULL }
+ { REP_C_2SITE_STRICT, "REP_C_2SITE_STRICT" },
+ { REP_C_AUTOINIT, "REP_C_AUTOINIT" },
+ { REP_C_AUTOROLLBACK, "REP_C_AUTOROLLBACK" },
+ { REP_C_AUTOTAKEOVER, "REP_C_AUTOTAKEOVER" },
+ { REP_C_BULK, "REP_C_BULK" },
+ { REP_C_DELAYCLIENT, "REP_C_DELAYCLIENT" },
+ { REP_C_ELECT_LOGLENGTH, "REP_C_ELECT_LOGLENGTH" },
+ { REP_C_ELECTIONS, "REP_C_ELECTIONS" },
+ { REP_C_INMEM, "REP_C_INMEM" },
+ { REP_C_LEASE, "REP_C_LEASE" },
+ { REP_C_NOWAIT, "REP_C_NOWAIT" },
+ { REP_C_PREFMAS_CLIENT, "REP_C_PREFMAS_CLIENT" },
+ { REP_C_PREFMAS_MASTER, "REP_C_PREFMAS_MASTER" },
+ { 0, NULL }
};
static const FN rep_efn[] = {
{ REP_E_PHASE0, "REP_E_PHASE0" },
@@ -481,19 +495,21 @@ __rep_print_all(env, flags)
{ 0, NULL }
};
static const FN rep_fn[] = {
- { REP_F_ABBREVIATED, "REP_F_ABBREVIATED" },
- { REP_F_APP_BASEAPI, "REP_F_APP_BASEAPI" },
- { REP_F_APP_REPMGR, "REP_F_APP_REPMGR" },
- { REP_F_CLIENT, "REP_F_CLIENT" },
- { REP_F_DELAY, "REP_F_DELAY" },
- { REP_F_GROUP_ESTD, "REP_F_GROUP_ESTD" },
- { REP_F_LEASE_EXPIRED, "REP_F_LEASE_EXPIRED" },
- { REP_F_MASTER, "REP_F_MASTER" },
- { REP_F_MASTERELECT, "REP_F_MASTERELECT" },
- { REP_F_NEWFILE, "REP_F_NEWFILE" },
- { REP_F_NIMDBS_LOADED, "REP_F_NIMDBS_LOADED" },
- { REP_F_SKIPPED_APPLY, "REP_F_SKIPPED_APPLY" },
- { REP_F_START_CALLED, "REP_F_START_CALLED" },
+ { REP_F_ABBREVIATED, "REP_F_ABBREVIATED" },
+ { REP_F_APP_BASEAPI, "REP_F_APP_BASEAPI" },
+ { REP_F_APP_REPMGR, "REP_F_APP_REPMGR" },
+ { REP_F_CLIENT, "REP_F_CLIENT" },
+ { REP_F_DELAY, "REP_F_DELAY" },
+ { REP_F_GROUP_ESTD, "REP_F_GROUP_ESTD" },
+ { REP_F_HOLD_GEN, "REP_F_HOLD_GEN" },
+ { REP_F_LEASE_EXPIRED, "REP_F_LEASE_EXPIRED" },
+ { REP_F_MASTER, "REP_F_MASTER" },
+ { REP_F_MASTERELECT, "REP_F_MASTERELECT" },
+ { REP_F_NEWFILE, "REP_F_NEWFILE" },
+ { REP_F_NIMDBS_LOADED, "REP_F_NIMDBS_LOADED" },
+ { REP_F_READONLY_MASTER, "REP_F_READONLY_MASTER" },
+ { REP_F_SKIPPED_APPLY, "REP_F_SKIPPED_APPLY" },
+ { REP_F_START_CALLED, "REP_F_START_CALLED" },
{ 0, NULL }
};
static const FN rep_lfn[] = {
@@ -523,15 +539,16 @@ __rep_print_all(env, flags)
rep = db_rep->region;
infop = env->reginfo;
renv = infop->primary;
- ENV_ENTER(env, ip);
__db_msg(env, "%s", DB_GLOBAL(db_line));
__db_msg(env, "DB_REP handle information:");
if (db_rep->rep_db == NULL)
STAT_ISSET("Bookkeeping database", db_rep->rep_db);
- else
+ else {
+ ENV_GET_THREAD_INFO(env, ip);
(void)__db_stat_print(db_rep->rep_db, ip, flags);
+ }
__db_prflags(env, NULL, db_rep->flags, dbrep_fn, NULL, "\tFlags");
@@ -604,7 +621,6 @@ __rep_print_all(env, flags)
STAT_LONG("Maximum lease timestamp microseconds",
lp->max_lease_ts.tv_nsec / NS_PER_US);
MUTEX_UNLOCK(env, rep->mtx_clientdb);
- ENV_LEAVE(env, ip);
return (0);
}
@@ -648,8 +664,10 @@ __rep_stat_summary_print(env)
ret = 0;
if ((ret = __rep_stat(env, &sp, 0)) == 0) {
PRINT_STATUS(sp, is_client);
- if (is_client)
+ if (is_client) {
PRINT_STARTUPCOMPLETE(sp);
+ PRINT_VIEW(sp);
+ }
PRINT_MAXPERMLSN(sp);
/*
* Use the number of sites that is kept up-to-date most
diff --git a/src/rep/rep_stub.c b/src/rep/rep_stub.c
index 2d96ea59..51c79eb0 100644
--- a/src/rep/rep_stub.c
+++ b/src/rep/rep_stub.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -130,7 +130,7 @@ __rep_elect_pp(dbenv, nsites, nvotes, flags)
}
int
-__rep_flush(dbenv)
+__rep_flush_pp(dbenv)
DB_ENV *dbenv;
{
return (__db_norep(dbenv->env));
@@ -201,7 +201,7 @@ __rep_get_nsites(dbenv, n)
}
int
-__rep_set_priority(dbenv, priority)
+__rep_set_priority_pp(dbenv, priority)
DB_ENV *dbenv;
u_int32_t priority;
{
@@ -219,7 +219,7 @@ __rep_get_priority(dbenv, priority)
}
int
-__rep_set_timeout(dbenv, which, timeout)
+__rep_set_timeout_pp(dbenv, which, timeout)
DB_ENV *dbenv;
int which;
db_timeout_t timeout;
@@ -342,6 +342,16 @@ __rep_set_transport_pp(dbenv, eid, f_send)
}
int
+__rep_set_view(dbenv, f_partial)
+ DB_ENV *dbenv;
+ int (*f_partial) __P((DB_ENV *,
+ const char *, int *, u_int32_t));
+{
+ COMPQUIET(f_partial, NULL); /* The callback is never invoked here. */
+ return (__db_norep(dbenv->env)); /* Same return as the other stubs here. */
+}
+
+int
__rep_set_request(dbenv, min, max)
DB_ENV *dbenv;
u_int32_t min, max;
diff --git a/src/rep/rep_util.c b/src/rep/rep_util.c
index 0dfe6122..5ee2592f 100644
--- a/src/rep/rep_util.c
+++ b/src/rep/rep_util.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -11,6 +11,7 @@
#include "db_int.h"
#include "dbinc/db_page.h"
#include "dbinc/db_am.h"
+#include "dbinc/fop.h"
#include "dbinc/mp.h"
#include "dbinc/txn.h"
@@ -437,7 +438,7 @@ __rep_send_message(env, eid, rtype, lsnp, dbt, ctlflags, repflags)
FLD_ISSET(ctlflags, REPCTL_LEASE | REPCTL_PERM)) {
F_SET(&cntrl, REPCTL_LEASE);
DB_ASSERT(env, rep->version == DB_REPVERSION);
- __os_gettime(env, &msg_time, 1);
+ __os_gettime(env, &msg_time, 0);
cntrl.msg_sec = (u_int32_t)msg_time.tv_sec;
cntrl.msg_nsec = (u_int32_t)msg_time.tv_nsec;
}
@@ -591,6 +592,15 @@ __rep_new_master(env, cntrl, eid)
ret = 0;
logc = NULL;
lockout_msg = 0;
+
+ /*
+ * If REP_F_HOLD_GEN is set, we want to keep this site at its
+ * current gen. Do not process an incoming NEWMASTER, which
+ * would change the gen.
+ */
+ if (F_ISSET(rep, REP_F_HOLD_GEN))
+ return (ret);
+
REP_SYSTEM_LOCK(env);
change = rep->gen != cntrl->gen || rep->master_id != eid;
/*
@@ -1128,6 +1138,8 @@ __env_db_rep_exit(env)
rep = db_rep->region;
REP_SYSTEM_LOCK(env);
+ /* If we have a reference, it better not already be 0. */
+ DB_ASSERT(env, rep->handle_cnt != 0);
rep->handle_cnt--;
REP_SYSTEM_UNLOCK(env);
@@ -1190,7 +1202,7 @@ __db_rep_enter(dbp, checkgen, checklock, return_now)
* get an exclusive lock on this database.
*/
if (checkgen && dbp->mpf->mfp && IS_REP_CLIENT(env)) {
- if (dbp->mpf->mfp->excl_lockout)
+ if (dbp->mpf->mfp->excl_lockout)
return (DB_REP_HANDLE_DEAD);
}
@@ -1328,7 +1340,8 @@ __op_rep_exit(env)
rep = db_rep->region;
REP_SYSTEM_LOCK(env);
- DB_ASSERT(env, rep->op_cnt > 0);
+ /* If we have a reference, it better not already be 0. */
+ DB_ASSERT(env, rep->op_cnt != 0);
rep->op_cnt--;
REP_SYSTEM_UNLOCK(env);
@@ -1697,7 +1710,9 @@ __rep_msg_to_old(version, rectype)
REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
- REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID },
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID },
/*
* 4.2/DB_REPVERSION 1 no longer supported.
*/
@@ -1708,7 +1723,9 @@ __rep_msg_to_old(version, rectype)
REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
- REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID },
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID },
/*
* 4.3/DB_REPVERSION 2 no longer supported.
*/
@@ -1719,7 +1736,9 @@ __rep_msg_to_old(version, rectype)
REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
- REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID },
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID },
/*
* From 4.7 message number To 4.4/4.5 message number
*/
@@ -1727,6 +1746,11 @@ __rep_msg_to_old(version, rectype)
1, /* REP_ALIVE */
2, /* REP_ALIVE_REQ */
3, /* REP_ALL_REQ */
+ REP_INVALID, /* REP_BLOB_ALL_REQ */
+ REP_INVALID, /* REP_BLOB_CHUNK */
+ REP_INVALID, /* REP_BLOB_CHUNK_REQ */
+ REP_INVALID, /* REP_BLOB_UPDATE */
+ REP_INVALID, /* REP_BLOB_UPDATE_REQ */
4, /* REP_BULK_LOG */
5, /* REP_BULK_PAGE */
6, /* REP_DUPMASTER */
@@ -1765,6 +1789,11 @@ __rep_msg_to_old(version, rectype)
1, /* REP_ALIVE */
2, /* REP_ALIVE_REQ */
3, /* REP_ALL_REQ */
+ REP_INVALID, /* REP_BLOB_ALL_REQ */
+ REP_INVALID, /* REP_BLOB_CHUNK */
+ REP_INVALID, /* REP_BLOB_CHUNK_REQ */
+ REP_INVALID, /* REP_BLOB_UPDATE */
+ REP_INVALID, /* REP_BLOB_UPDATE_REQ */
4, /* REP_BULK_LOG */
5, /* REP_BULK_PAGE */
6, /* REP_DUPMASTER */
@@ -1803,6 +1832,11 @@ __rep_msg_to_old(version, rectype)
1, /* REP_ALIVE */
2, /* REP_ALIVE_REQ */
3, /* REP_ALL_REQ */
+ REP_INVALID, /* REP_BLOB_ALL_REQ */
+ REP_INVALID, /* REP_BLOB_CHUNK */
+ REP_INVALID, /* REP_BLOB_CHUNK_REQ */
+ REP_INVALID, /* REP_BLOB_UPDATE */
+ REP_INVALID, /* REP_BLOB_UPDATE_REQ */
4, /* REP_BULK_LOG */
5, /* REP_BULK_PAGE */
6, /* REP_DUPMASTER */
@@ -1841,6 +1875,53 @@ __rep_msg_to_old(version, rectype)
1, /* REP_ALIVE */
2, /* REP_ALIVE_REQ */
3, /* REP_ALL_REQ */
+ REP_INVALID, /* REP_BLOB_ALL_REQ */
+ REP_INVALID, /* REP_BLOB_CHUNK */
+ REP_INVALID, /* REP_BLOB_CHUNK_REQ */
+ REP_INVALID, /* REP_BLOB_UPDATE */
+ REP_INVALID, /* REP_BLOB_UPDATE_REQ */
+ 4, /* REP_BULK_LOG */
+ 5, /* REP_BULK_PAGE */
+ 6, /* REP_DUPMASTER */
+ 7, /* REP_FILE */
+ 8, /* REP_FILE_FAIL */
+ 9, /* REP_FILE_REQ */
+ 10, /* REP_LEASE_GRANT */
+ 11, /* REP_LOG */
+ 12, /* REP_LOG_MORE */
+ 13, /* REP_LOG_REQ */
+ 14, /* REP_MASTER_REQ */
+ 15, /* REP_NEWCLIENT */
+ 16, /* REP_NEWFILE */
+ 17, /* REP_NEWMASTER */
+ 18, /* REP_NEWSITE */
+ 19, /* REP_PAGE */
+ 20, /* REP_PAGE_FAIL */
+ 21, /* REP_PAGE_MORE */
+ 22, /* REP_PAGE_REQ */
+ 23, /* REP_REREQUEST */
+ 24, /* REP_START_SYNC */
+ 25, /* REP_UPDATE */
+ 26, /* REP_UPDATE_REQ */
+ 27, /* REP_VERIFY */
+ 28, /* REP_VERIFY_FAIL */
+ 29, /* REP_VERIFY_REQ */
+ 30, /* REP_VOTE1 */
+ 31 /* REP_VOTE2 */
+ },
+ /*
+ * From 6.1 message number To 5.3 message number. Messages
+ * handling BLOBs were added.
+ */
+ { REP_INVALID, /* NO message 0 */
+ 1, /* REP_ALIVE */
+ 2, /* REP_ALIVE_REQ */
+ 3, /* REP_ALL_REQ */
+ REP_INVALID, /* REP_BLOB_ALL_REQ */
+ REP_INVALID, /* REP_BLOB_CHUNK */
+ REP_INVALID, /* REP_BLOB_CHUNK_REQ */
+ REP_INVALID, /* REP_BLOB_UPDATE */
+ REP_INVALID, /* REP_BLOB_UPDATE_REQ */
4, /* REP_BULK_LOG */
5, /* REP_BULK_PAGE */
6, /* REP_DUPMASTER */
@@ -1901,7 +1982,9 @@ __rep_msg_from_old(version, rectype)
REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
- REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID },
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID },
/*
* 4.2/DB_REPVERSION 1 no longer supported.
*/
@@ -1912,7 +1995,9 @@ __rep_msg_from_old(version, rectype)
REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
- REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID },
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID },
/*
* 4.3/DB_REPVERSION 2 no longer supported.
*/
@@ -1923,7 +2008,9 @@ __rep_msg_from_old(version, rectype)
REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
- REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID },
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+ REP_INVALID },
/*
* From 4.4/4.5 message number To 4.7 message number
*/
@@ -1931,36 +2018,41 @@ __rep_msg_from_old(version, rectype)
1, /* 1, REP_ALIVE */
2, /* 2, REP_ALIVE_REQ */
3, /* 3, REP_ALL_REQ */
- 4, /* 4, REP_BULK_LOG */
- 5, /* 5, REP_BULK_PAGE */
- 6, /* 6, REP_DUPMASTER */
- 7, /* 7, REP_FILE */
- 8, /* 8, REP_FILE_FAIL */
- 9, /* 9, REP_FILE_REQ */
- /* 10, REP_LEASE_GRANT doesn't exist */
- 11, /* 10, REP_LOG */
- 12, /* 11, REP_LOG_MORE */
- 13, /* 12, REP_LOG_REQ */
- 14, /* 13, REP_MASTER_REQ */
- 15, /* 14, REP_NEWCLIENT */
- 16, /* 15, REP_NEWFILE */
- 17, /* 16, REP_NEWMASTER */
- 18, /* 17, REP_NEWSITE */
- 19, /* 18, REP_PAGE */
- 20, /* 19, REP_PAGE_FAIL */
- 21, /* 20, REP_PAGE_MORE */
- 22, /* 21, REP_PAGE_REQ */
- 23, /* 22, REP_REREQUEST */
- /* 24, REP_START_SYNC doesn't exist */
- 25, /* 23, REP_UPDATE */
- 26, /* 24, REP_UPDATE_REQ */
- 27, /* 25, REP_VERIFY */
- 28, /* 26, REP_VERIFY_FAIL */
- 29, /* 27, REP_VERIFY_REQ */
- 30, /* 28, REP_VOTE1 */
- 31, /* 29, REP_VOTE2 */
+ 9, /* 4, REP_BULK_LOG */
+ 10, /* 5, REP_BULK_PAGE */
+ 11, /* 6, REP_DUPMASTER */
+ 12, /* 7, REP_FILE */
+ 13, /* 8, REP_FILE_FAIL */
+ 14, /* 9, REP_FILE_REQ */
+ /* 15, REP_LEASE_GRANT doesn't exist */
+ 16, /* 10, REP_LOG */
+ 17, /* 11, REP_LOG_MORE */
+ 18, /* 12, REP_LOG_REQ */
+ 19, /* 13, REP_MASTER_REQ */
+ 20, /* 14, REP_NEWCLIENT */
+ 21, /* 15, REP_NEWFILE */
+ 22, /* 16, REP_NEWMASTER */
+ 23, /* 17, REP_NEWSITE */
+ 24, /* 18, REP_PAGE */
+ 25, /* 19, REP_PAGE_FAIL */
+ 26, /* 20, REP_PAGE_MORE */
+ 27, /* 21, REP_PAGE_REQ */
+ 28, /* 22, REP_REREQUEST */
+ /* 29, REP_START_SYNC doesn't exist */
+ 30, /* 23, REP_UPDATE */
+ 31, /* 24, REP_UPDATE_REQ */
+ 32, /* 25, REP_VERIFY */
+ 33, /* 26, REP_VERIFY_FAIL */
+ 34, /* 27, REP_VERIFY_REQ */
+ 35, /* 28, REP_VOTE1 */
+ 36, /* 29, REP_VOTE2 */
REP_INVALID, /* 30, 4.4/4.5 no message */
- REP_INVALID /* 31, 4.4/4.5 no message */
+ REP_INVALID, /* 31, 4.4/4.5 no message */
+ REP_INVALID, /* 32, 4.4/4.5 no message */
+ REP_INVALID, /* 33, 4.4/4.5 no message */
+ REP_INVALID, /* 34, 4.4/4.5 no message */
+ REP_INVALID, /* 35, 4.4/4.5 no message */
+ REP_INVALID /* 36, 4.4/4.5 no message */
},
/*
* From 4.6 message number To 4.7 message number. There are
@@ -1971,34 +2063,39 @@ __rep_msg_from_old(version, rectype)
1, /* 1, REP_ALIVE */
2, /* 2, REP_ALIVE_REQ */
3, /* 3, REP_ALL_REQ */
- 4, /* 4, REP_BULK_LOG */
- 5, /* 5, REP_BULK_PAGE */
- 6, /* 6, REP_DUPMASTER */
- 7, /* 7, REP_FILE */
- 8, /* 8, REP_FILE_FAIL */
- 9, /* 9, REP_FILE_REQ */
- 10, /* 10, REP_LEASE_GRANT */
- 11, /* 11, REP_LOG */
- 12, /* 12, REP_LOG_MORE */
- 13, /* 13, REP_LOG_REQ */
- 14, /* 14, REP_MASTER_REQ */
- 15, /* 15, REP_NEWCLIENT */
- 16, /* 16, REP_NEWFILE */
- 17, /* 17, REP_NEWMASTER */
- 18, /* 18, REP_NEWSITE */
- 19, /* 19, REP_PAGE */
- 20, /* 20, REP_PAGE_FAIL */
- 21, /* 21, REP_PAGE_MORE */
- 22, /* 22, REP_PAGE_REQ */
- 23, /* 22, REP_REREQUEST */
- 24, /* 24, REP_START_SYNC */
- 25, /* 25, REP_UPDATE */
- 26, /* 26, REP_UPDATE_REQ */
- 27, /* 27, REP_VERIFY */
- 28, /* 28, REP_VERIFY_FAIL */
- 29, /* 29, REP_VERIFY_REQ */
- 30, /* 30, REP_VOTE1 */
- 31 /* 31, REP_VOTE2 */
+ 9, /* 4, REP_BULK_LOG */
+ 10, /* 5, REP_BULK_PAGE */
+ 11, /* 6, REP_DUPMASTER */
+ 12, /* 7, REP_FILE */
+ 13, /* 8, REP_FILE_FAIL */
+ 14, /* 9, REP_FILE_REQ */
+ 15, /* 10, REP_LEASE_GRANT */
+ 16, /* 11, REP_LOG */
+ 17, /* 12, REP_LOG_MORE */
+ 18, /* 13, REP_LOG_REQ */
+ 19, /* 14, REP_MASTER_REQ */
+ 20, /* 15, REP_NEWCLIENT */
+ 21, /* 16, REP_NEWFILE */
+ 22, /* 17, REP_NEWMASTER */
+ 23, /* 18, REP_NEWSITE */
+ 24, /* 19, REP_PAGE */
+ 25, /* 20, REP_PAGE_FAIL */
+ 26, /* 21, REP_PAGE_MORE */
+ 27, /* 22, REP_PAGE_REQ */
+ 28, /* 23, REP_REREQUEST */
+ 29, /* 24, REP_START_SYNC */
+ 30, /* 25, REP_UPDATE */
+ 31, /* 26, REP_UPDATE_REQ */
+ 32, /* 27, REP_VERIFY */
+ 33, /* 28, REP_VERIFY_FAIL */
+ 34, /* 29, REP_VERIFY_REQ */
+ 35, /* 30, REP_VOTE1 */
+ 36, /* 31, REP_VOTE2 */
+ REP_INVALID, /* 32, 4.6/4.7 no message */
+ REP_INVALID, /* 33, 4.6/4.7 no message */
+ REP_INVALID, /* 34, 4.6/4.7 no message */
+ REP_INVALID, /* 35, 4.6/4.7 no message */
+ REP_INVALID /* 36, 4.6/4.7 no message */
},
/*
* From 4.7 message number To 5.2 message number. There are
@@ -2009,34 +2106,39 @@ __rep_msg_from_old(version, rectype)
1, /* 1, REP_ALIVE */
2, /* 2, REP_ALIVE_REQ */
3, /* 3, REP_ALL_REQ */
- 4, /* 4, REP_BULK_LOG */
- 5, /* 5, REP_BULK_PAGE */
- 6, /* 6, REP_DUPMASTER */
- 7, /* 7, REP_FILE */
- 8, /* 8, REP_FILE_FAIL */
- 9, /* 9, REP_FILE_REQ */
- 10, /* 10, REP_LEASE_GRANT */
- 11, /* 11, REP_LOG */
- 12, /* 12, REP_LOG_MORE */
- 13, /* 13, REP_LOG_REQ */
- 14, /* 14, REP_MASTER_REQ */
- 15, /* 15, REP_NEWCLIENT */
- 16, /* 16, REP_NEWFILE */
- 17, /* 17, REP_NEWMASTER */
- 18, /* 18, REP_NEWSITE */
- 19, /* 19, REP_PAGE */
- 20, /* 20, REP_PAGE_FAIL */
- 21, /* 21, REP_PAGE_MORE */
- 22, /* 22, REP_PAGE_REQ */
- 23, /* 22, REP_REREQUEST */
- 24, /* 24, REP_START_SYNC */
- 25, /* 25, REP_UPDATE */
- 26, /* 26, REP_UPDATE_REQ */
- 27, /* 27, REP_VERIFY */
- 28, /* 28, REP_VERIFY_FAIL */
- 29, /* 29, REP_VERIFY_REQ */
- 30, /* 30, REP_VOTE1 */
- 31 /* 31, REP_VOTE2 */
+ 9, /* 4, REP_BULK_LOG */
+ 10, /* 5, REP_BULK_PAGE */
+ 11, /* 6, REP_DUPMASTER */
+ 12, /* 7, REP_FILE */
+ 13, /* 8, REP_FILE_FAIL */
+ 14, /* 9, REP_FILE_REQ */
+ 15, /* 10, REP_LEASE_GRANT */
+ 16, /* 11, REP_LOG */
+ 17, /* 12, REP_LOG_MORE */
+ 18, /* 13, REP_LOG_REQ */
+ 19, /* 14, REP_MASTER_REQ */
+ 20, /* 15, REP_NEWCLIENT */
+ 21, /* 16, REP_NEWFILE */
+ 22, /* 17, REP_NEWMASTER */
+ 23, /* 18, REP_NEWSITE */
+ 24, /* 19, REP_PAGE */
+ 25, /* 20, REP_PAGE_FAIL */
+ 26, /* 21, REP_PAGE_MORE */
+ 27, /* 22, REP_PAGE_REQ */
+ 28, /* 23, REP_REREQUEST */
+ 29, /* 24, REP_START_SYNC */
+ 30, /* 25, REP_UPDATE */
+ 31, /* 26, REP_UPDATE_REQ */
+ 32, /* 27, REP_VERIFY */
+ 33, /* 28, REP_VERIFY_FAIL */
+ 34, /* 29, REP_VERIFY_REQ */
+ 35, /* 30, REP_VOTE1 */
+ 36, /* 31, REP_VOTE2 */
+ REP_INVALID, /* 32, 4.7/5.2 no message */
+ REP_INVALID, /* 33, 4.7/5.2 no message */
+ REP_INVALID, /* 34, 4.7/5.2 no message */
+ REP_INVALID, /* 35, 4.7/5.2 no message */
+ REP_INVALID /* 36, 4.7/5.2 no message */
},
/*
* From 4.7 message number To 5.3 message number. There are
@@ -2047,34 +2149,86 @@ __rep_msg_from_old(version, rectype)
1, /* 1, REP_ALIVE */
2, /* 2, REP_ALIVE_REQ */
3, /* 3, REP_ALL_REQ */
- 4, /* 4, REP_BULK_LOG */
- 5, /* 5, REP_BULK_PAGE */
- 6, /* 6, REP_DUPMASTER */
- 7, /* 7, REP_FILE */
- 8, /* 8, REP_FILE_FAIL */
- 9, /* 9, REP_FILE_REQ */
- 10, /* 10, REP_LEASE_GRANT */
- 11, /* 11, REP_LOG */
- 12, /* 12, REP_LOG_MORE */
- 13, /* 13, REP_LOG_REQ */
- 14, /* 14, REP_MASTER_REQ */
- 15, /* 15, REP_NEWCLIENT */
- 16, /* 16, REP_NEWFILE */
- 17, /* 17, REP_NEWMASTER */
- 18, /* 18, REP_NEWSITE */
- 19, /* 19, REP_PAGE */
- 20, /* 20, REP_PAGE_FAIL */
- 21, /* 21, REP_PAGE_MORE */
- 22, /* 22, REP_PAGE_REQ */
- 23, /* 22, REP_REREQUEST */
- 24, /* 24, REP_START_SYNC */
- 25, /* 25, REP_UPDATE */
- 26, /* 26, REP_UPDATE_REQ */
- 27, /* 27, REP_VERIFY */
- 28, /* 28, REP_VERIFY_FAIL */
- 29, /* 29, REP_VERIFY_REQ */
- 30, /* 30, REP_VOTE1 */
- 31 /* 31, REP_VOTE2 */
+ 9, /* 4, REP_BULK_LOG */
+ 10, /* 5, REP_BULK_PAGE */
+ 11, /* 6, REP_DUPMASTER */
+ 12, /* 7, REP_FILE */
+ 13, /* 8, REP_FILE_FAIL */
+ 14, /* 9, REP_FILE_REQ */
+ 15, /* 10, REP_LEASE_GRANT */
+ 16, /* 11, REP_LOG */
+ 17, /* 12, REP_LOG_MORE */
+ 18, /* 13, REP_LOG_REQ */
+ 19, /* 14, REP_MASTER_REQ */
+ 20, /* 15, REP_NEWCLIENT */
+ 21, /* 16, REP_NEWFILE */
+ 22, /* 17, REP_NEWMASTER */
+ 23, /* 18, REP_NEWSITE */
+ 24, /* 19, REP_PAGE */
+ 25, /* 20, REP_PAGE_FAIL */
+ 26, /* 21, REP_PAGE_MORE */
+ 27, /* 22, REP_PAGE_REQ */
+ 28, /* 23, REP_REREQUEST */
+ 29, /* 24, REP_START_SYNC */
+ 30, /* 25, REP_UPDATE */
+ 31, /* 26, REP_UPDATE_REQ */
+ 32, /* 27, REP_VERIFY */
+ 33, /* 28, REP_VERIFY_FAIL */
+ 34, /* 29, REP_VERIFY_REQ */
+ 35, /* 30, REP_VOTE1 */
+ 36, /* 31, REP_VOTE2 */
+ REP_INVALID, /* 32, 4.7/5.3 no message */
+ REP_INVALID, /* 33, 4.7/5.3 no message */
+ REP_INVALID, /* 34, 4.7/5.3 no message */
+ REP_INVALID, /* 35, 4.7/5.3 no message */
+ REP_INVALID /* 36, 4.7/5.3 no message */
+ },
+ /*
+ * From 5.3 message number To 6.1 message number. Messages to
+ * handle BLOBs were added.
+ */
+ { REP_INVALID, /* NO message 0 */
+ 1, /* 1, REP_ALIVE */
+ 2, /* 2, REP_ALIVE_REQ */
+ 3, /* 3, REP_ALL_REQ */
+ /* 4, REP_BLOB_ALL_REQ doesn't exist */
+ /* 5, REP_BLOB_CHUNK doesn't exist */
+ /* 6, REP_BLOB_CHUNK_REQ doesn't exist */
+ /* 7, REP_BLOB_UPDATE doesn't exist */
+ /* 8, REP_BLOB_UPDATE_REQ doesn't exist */
+ 9, /* 4, REP_BULK_LOG */
+ 10, /* 5, REP_BULK_PAGE */
+ 11, /* 6, REP_DUPMASTER */
+ 12, /* 7, REP_FILE */
+ 13, /* 8, REP_FILE_FAIL */
+ 14, /* 9, REP_FILE_REQ */
+ 15, /* 10, REP_LEASE_GRANT */
+ 16, /* 11, REP_LOG */
+ 17, /* 12, REP_LOG_MORE */
+ 18, /* 13, REP_LOG_REQ */
+ 19, /* 14, REP_MASTER_REQ */
+ 20, /* 15, REP_NEWCLIENT */
+ 21, /* 16, REP_NEWFILE */
+ 22, /* 17, REP_NEWMASTER */
+ 23, /* 18, REP_NEWSITE */
+ 24, /* 19, REP_PAGE */
+ 25, /* 20, REP_PAGE_FAIL */
+ 26, /* 21, REP_PAGE_MORE */
+ 27, /* 22, REP_PAGE_REQ */
+ 28, /* 23, REP_REREQUEST */
+ 29, /* 24, REP_START_SYNC */
+ 30, /* 25, REP_UPDATE */
+ 31, /* 26, REP_UPDATE_REQ */
+ 32, /* 27, REP_VERIFY */
+ 33, /* 28, REP_VERIFY_FAIL */
+ 34, /* 29, REP_VERIFY_REQ */
+ 35, /* 30, REP_VOTE1 */
+ 36, /* 31, REP_VOTE2 */
+ REP_INVALID, /* 32, 5.3/6.1 no message */
+ REP_INVALID, /* 33, 5.3/6.1 no message */
+ REP_INVALID, /* 34, 5.3/6.1 no message */
+ REP_INVALID, /* 35, 5.3/6.1 no message */
+ REP_INVALID /* 36, 5.3/6.1 no message */
}
};
return (table[version][rectype]);
@@ -2215,9 +2369,9 @@ __rep_print_int(env, verbose, fmt, ap)
__os_id(env->dbenv, &pid, &tid);
if (diag_msg)
MUTEX_LOCK(env, rep->mtx_diag);
- __os_gettime(env, &ts, 1);
+ __os_gettime(env, &ts, 0);
__db_msgadd(env, &mb, "[%lu:%lu][%s] %s: ",
- (u_long)ts.tv_sec, (u_long)ts.tv_nsec/NS_PER_US,
+ (u_long)ts.tv_sec, (u_long)ts.tv_nsec / NS_PER_US,
env->dbenv->thread_id_string(env->dbenv, pid, tid, buf), s);
__db_msgadd_ap(env, &mb, fmt, ap);
@@ -2260,6 +2414,26 @@ __rep_print_message(env, eid, rp, str, flags)
FLD_SET(verbflag, DB_VERB_REP_MISC);
type = "all_req";
break;
+ case REP_BLOB_ALL_REQ:
+ FLD_SET(verbflag, DB_VERB_REP_MISC);
+ type = "all_blob_req";
+ break;
+ case REP_BLOB_CHUNK:
+ FLD_SET(verbflag, DB_VERB_REP_MISC);
+ type = "blob_chunk";
+ break;
+ case REP_BLOB_CHUNK_REQ:
+ FLD_SET(verbflag, DB_VERB_REP_MISC);
+ type = "blob_chunk_req";
+ break;
+ case REP_BLOB_UPDATE:
+ FLD_SET(verbflag, DB_VERB_REP_MISC);
+ type = "blob_update";
+ break;
+ case REP_BLOB_UPDATE_REQ:
+ FLD_SET(verbflag, DB_VERB_REP_MISC);
+ type = "blob_update_req";
+ break;
case REP_BULK_LOG:
FLD_SET(verbflag, DB_VERB_REP_MISC);
type = "bulk_log";
@@ -2650,9 +2824,19 @@ __rep_log_backup(env, logc, lsn, match)
*/
if ((match == REP_REC_COMMIT &&
rectype == DB___txn_regop) ||
- (match == REP_REC_PERM &&
- (rectype == DB___txn_ckp || rectype == DB___txn_regop)))
+ ((match == REP_REC_PERM || match == REP_REC_PERM_DEL) &&
+ IS_PERM_RECTYPE(rectype)))
break;
+ /*
+ * Break early if a file remove is discovered in the logs.
+ * BDB cannot restore a deleted database or blob file from
+ * logs, so trigger internal init to recover the file.
+ * Used by Instant Internal Init in replication.
+ */
+ if (match == REP_REC_PERM_DEL && rectype == DB___fop_remove) {
+ ret = DB_NOTFOUND;
+ break;
+ }
}
return (ret);
}
@@ -2671,7 +2855,6 @@ __rep_get_maxpermlsn(env, max_perm_lsnp)
{
DB_LOG *dblp;
DB_REP *db_rep;
- DB_THREAD_INFO *ip;
LOG *lp;
REP *rep;
@@ -2680,11 +2863,9 @@ __rep_get_maxpermlsn(env, max_perm_lsnp)
dblp = env->lg_handle;
lp = dblp->reginfo.primary;
- ENV_ENTER(env, ip);
MUTEX_LOCK(env, rep->mtx_clientdb);
*max_perm_lsnp = lp->max_perm_lsn;
MUTEX_UNLOCK(env, rep->mtx_clientdb);
- ENV_LEAVE(env, ip);
return (0);
}
@@ -2724,12 +2905,13 @@ __rep_get_datagen(env, data_genp)
u_int8_t data_buf[__REP_LSN_HIST_DATA_SIZE];
DBT key_dbt, data_dbt;
u_int32_t flags;
- int ret, t_ret, tries;
+ int ret, t_ret, tries, was_open;
db_rep = env->rep_handle;
ret = 0;
*data_genp = 0;
tries = 0;
+ was_open = 0;
flags = DB_LAST;
retry:
if ((ret = __txn_begin(env, NULL, NULL, &txn, DB_IGNORE_LEASE)) != 0)
@@ -2746,10 +2928,10 @@ retry:
* That is not an error.
*/
ret = 0;
- goto out;
+ goto noclose;
}
- db_rep->lsn_db = dbp;
- }
+ } else
+ was_open = 1;
if ((ret = __db_cursor(dbp, NULL, txn, &dbc, 0)) != 0)
goto out;
@@ -2784,8 +2966,126 @@ retry:
&key, key_buf, __REP_LSN_HIST_KEY_SIZE, NULL)) == 0)
*data_genp = key.gen;
out:
+ if (!was_open && dbp != NULL &&
+ (t_ret = __db_close(dbp, txn, DB_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+noclose:
if ((t_ret = __txn_commit(txn, DB_TXN_NOSYNC)) != 0 && ret == 0)
ret = t_ret;
err:
return (ret);
}
+
+/*
+ * __rep_become_readonly_master --
+ *
+ * Put this master into a state where it no longer accepts writes but it
+ * is still a master that can respond to requests for missing messages.
+ * REP_F_READONLY_MASTER is set while the message and API lockouts are
+ * held; the lockouts taken here are cleared again before returning.
+ *
+ * On success *gen receives the current generation and *sync_lsnp the
+ * log region's current lsn, providing a mechanism to know the LSN of the
+ * next log record expected on this site. Generally, this site should
+ * be restarted as a client shortly after becoming a readonly master.
+ *
+ * If a thread is already in message lockout nothing is changed: the
+ * function returns 0 with *gen == 0 and *sync_lsnp zeroed. The outputs
+ * are likewise left zeroed when either lockout call fails.
+ *
+ * PUBLIC: int __rep_become_readonly_master
+ * PUBLIC: __P((ENV *, u_int32_t *, DB_LSN *));
+ */
+int
+__rep_become_readonly_master(env, gen, sync_lsnp)
+	ENV *env;
+	u_int32_t *gen;
+	DB_LSN *sync_lsnp;
+{
+	DB_REP *db_rep;
+	LOG *lp;
+	REP *rep;
+	int api_locked, ret;
+
+	db_rep = env->rep_handle;
+	rep = db_rep->region;
+	lp = env->lg_handle->reginfo.primary;
+	api_locked = 0;
+	ret = 0;
+
+	/* Outputs stay zeroed unless we get all the way through. */
+	*gen = 0;
+	ZERO_LSN(*sync_lsnp);
+
+	REP_SYSTEM_LOCK(env);
+
+	/*
+	 * Lock out replication message thread processing so that replication
+	 * world won't change (e.g. restart, client sync).
+	 */
+	if (FLD_ISSET(rep->lockout_flags, REP_LOCKOUT_MSG)) {
+		/* The lockout is not ours, so leave the flag untouched. */
+		RPRINT(env, (env, DB_VERB_REP_MISC,
+		    "Readonly master: thread already in msg lockout"));
+		goto errunlock;
+	}
+	if ((ret = __rep_lockout_msg(env, rep, 0)) != 0)
+		goto errclearlockouts;
+
+	/*
+	 * Lock out API to wait for active txn/mpool operations to complete
+	 * and prevent new ones from starting.
+	 */
+	ret = __rep_lockout_api(env, rep);
+	if (ret != 0)
+		goto errclearlockouts;
+	api_locked = 1;
+
+	/* Make this site a readonly master and get master generation. */
+	F_SET(rep, REP_F_READONLY_MASTER);
+	*gen = rep->gen;
+
+	/*
+	 * Get the next log record the logging subsystem expects to write.
+	 * The rep system lock is dropped around the log system lock and
+	 * re-taken afterwards for the cleanup below.
+	 */
+	REP_SYSTEM_UNLOCK(env);
+	LOG_SYSTEM_LOCK(env);
+	*sync_lsnp = lp->lsn;
+	LOG_SYSTEM_UNLOCK(env);
+	REP_SYSTEM_LOCK(env);
+
+errclearlockouts:
+	/* Entered with the rep system lock held on every path. */
+	FLD_CLR(rep->lockout_flags, REP_LOCKOUT_MSG);
+	if (api_locked)
+		CLR_LOCKOUT_BDB(rep);
+errunlock:
+	REP_SYSTEM_UNLOCK(env);
+	return (ret);
+}
+
+/*
+ * __rep_get_lsnhist_data --
+ *
+ * A utility function to get the full LSN history database record for a
+ * particular gen. The record is read into lsnhist_data through
+ * __rep_read_lsn_history(); any cursor or transaction that call hands
+ * back is closed/resolved here before returning. The return value is
+ * the first non-zero error encountered, or 0.
+ *
+ * PUBLIC: int __rep_get_lsnhist_data __P((ENV *, DB_THREAD_INFO *,
+ * PUBLIC: u_int32_t, __rep_lsn_hist_data_args *));
+ */
+int
+__rep_get_lsnhist_data(env, ip, gen, lsnhist_data)
+	ENV *env;
+	DB_THREAD_INFO *ip;
+	u_int32_t gen;
+	__rep_lsn_hist_data_args *lsnhist_data;
+{
+	DB_TXN *txn;
+	DBC *dbc;
+	struct rep_waitgoal reason;	/* Passed to the callee; not read here. */
+	int ret, t_ret;
+
+	dbc = NULL;
+	txn = NULL;
+
+	/*
+	 * Cannot use cached LSN history values because we need the
+	 * timestamp value here, which is not cached.
+	 */
+	ret = __rep_read_lsn_history(env,
+	    ip, &txn, &dbc, gen, lsnhist_data, &reason, DB_SET, 0);
+
+	/* Cursor first, then the transaction; keep the earliest error. */
+	if (dbc != NULL) {
+		t_ret = __dbc_close(dbc);
+		if (ret == 0 && t_ret != 0)
+			ret = t_ret;
+	}
+	if (txn != NULL) {
+		/* The resolve call is told how things went so far via ret. */
+		t_ret = __db_txn_auto_resolve(env, txn, 1, ret);
+		if (ret == 0 && t_ret != 0)
+			ret = t_ret;
+	}
+	return (ret);
+}
diff --git a/src/rep/rep_verify.c b/src/rep/rep_verify.c
index 5238f900..40a0dfce 100644
--- a/src/rep/rep_verify.c
+++ b/src/rep/rep_verify.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2004, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2004, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -119,8 +119,15 @@ __rep_verify(env, rp, rec, eid, savetime)
goto out;
}
}
+ /*
+ * Search for a matching perm record. If none is found,
+ * or a database or file delete is encountered before the
+ * perm record, begin internal init. Database and blob file
+ * deletes cannot be undone once committed, so internal init
+ * must be used to re-create the files.
+ */
if ((ret = __rep_log_backup(env, logc, &lsn,
- REP_REC_PERM)) == 0) {
+ REP_REC_PERM_DEL)) == 0) {
MUTEX_LOCK(env, rep->mtx_clientdb);
lp->verify_lsn = lsn;
__os_gettime(env, &lp->rcvd_ts, 1);
@@ -205,8 +212,10 @@ __rep_internal_init(env, abbrev)
u_int32_t abbrev;
{
REP *rep;
+ u_int32_t ctlflags;
int master, ret;
+ ctlflags = 0;
rep = env->rep_handle->region;
REP_SYSTEM_LOCK(env);
#ifdef HAVE_STATISTICS
@@ -227,6 +236,7 @@ __rep_internal_init(env, abbrev)
RPRINT(env, (env, DB_VERB_REP_SYNC,
"send UPDATE_REQ, merely to check for NIMDB refresh"));
F_SET(rep, REP_F_ABBREVIATED);
+ FLD_SET(ctlflags, REPCTL_INMEM_ONLY);
} else
F_CLR(rep, REP_F_ABBREVIATED);
ZERO_LSN(rep->first_lsn);
@@ -237,7 +247,7 @@ __rep_internal_init(env, abbrev)
REP_SYSTEM_UNLOCK(env);
if (ret == 0 && master != DB_EID_INVALID)
(void)__rep_send_message(env,
- master, REP_UPDATE_REQ, NULL, NULL, 0, 0);
+ master, REP_UPDATE_REQ, NULL, NULL, ctlflags, 0);
return (ret);
}
@@ -504,8 +514,7 @@ __rep_dorecovery(env, lsnp, trunclsnp)
*/
DB_ASSERT(env, rep->op_cnt == 0);
DB_ASSERT(env, rep->msg_th == 1);
- if (rectype == DB___txn_regop || rectype == DB___txn_ckp ||
- rectype == DB___dbreg_register)
+ if (IS_PERM_RECTYPE(rectype) || rectype == DB___dbreg_register)
skip_rec = 0;
if (rectype == DB___txn_regop) {
if (rep->version >= DB_REPVERSION_44) {
@@ -653,8 +662,10 @@ __rep_verify_match(env, reclsnp, savetime)
/*
* Lockout the API and wait for operations to complete.
*/
- if ((ret = __rep_lockout_api(env, rep)) != 0)
+ if ((ret = __rep_lockout_api(env, rep)) != 0) {
+ FLD_CLR(rep->lockout_flags, REP_LOCKOUT_MSG);
goto errunlock;
+ }
/* OK, everyone is out, we can now run recovery. */
REP_SYSTEM_UNLOCK(env);
@@ -690,6 +701,10 @@ __rep_verify_match(env, reclsnp, savetime)
*/
if (db_rep->rep_db == NULL &&
(ret = __rep_client_dbinit(env, 0, REP_DB)) != 0) {
+ REP_SYSTEM_LOCK(env);
+ FLD_CLR(rep->lockout_flags,
+ REP_LOCKOUT_API | REP_LOCKOUT_MSG | REP_LOCKOUT_OP);
+ REP_SYSTEM_UNLOCK(env);
MUTEX_UNLOCK(env, rep->mtx_clientdb);
goto out;
}
diff --git a/src/repmgr/repmgr.msg b/src/repmgr/repmgr.msg
index 020f2e9c..ba544936 100644
--- a/src/repmgr/repmgr.msg
+++ b/src/repmgr/repmgr.msg
@@ -65,6 +65,11 @@ ARG port u_int16_t
END
BEGIN_MSG membership_data
+ARG status u_int32_t
+ARG flags u_int32_t
+END
+
+BEGIN_MSG v4membership_data
ARG flags u_int32_t
END
@@ -98,22 +103,51 @@ BEGIN_MSG membr_vers
ARG version u_int32_t
ARG gen u_int32_t
END
+
BEGIN_MSG site_info check_length
ARG host DBT
ARG port u_int16_t
+ARG status u_int32_t
+ARG flags u_int32_t
+END
+
+BEGIN_MSG v4site_info check_length
+ARG host DBT
+ARG port u_int16_t
ARG flags u_int32_t
END
/*
* If site A breaks or rejects a connection from site B, it first
* tries to send B this message containing site A's currently known
- * membership DB version. Site B can use this to decide what to do.
- * If site B knows of a later version, it should retry the connection
- * to site A later, polling at it until site A catches up. However, if
- * site B's known version is less, it means that site B is no longer in
- * the group, and so instead it should shut down and notify the application.
+ * membership DB version and site B's status in site A's membership DB.
+ * Site B can use them to decide what to do. If site B knows of a later
+ * version, it should retry the connection to site A later, polling
+ * until site A catches up. However, if site B's known version is
+ * lower and its status in site A's membership DB is "adding", a
+ * badly-timed change of master may have caused the current master to
+ * lose the update marking B as present, so site B should retry the
+ * connection to site A later. Otherwise, site B is no longer in the
+ * group and it should shut down and notify the application.
*/
BEGIN_MSG connect_reject
ARG version u_int32_t
ARG gen u_int32_t
+ARG status u_int32_t
+END
+
+BEGIN_MSG v4connect_reject
+ARG version u_int32_t
+ARG gen u_int32_t
+END
+
+/*
+ * For preferred master LSN history comparison between the sites.
+ * The next_gen_lsn is [0,0] if the next generation doesn't yet exist.
+ */
+BEGIN_MSG lsnhist_match
+ARG lsn DB_LSN
+ARG hist_sec u_int32_t
+ARG hist_nsec u_int32_t
+ARG next_gen_lsn DB_LSN
END
diff --git a/src/repmgr/repmgr.src b/src/repmgr/repmgr.src
index 68d8c239..f42e159f 100644
--- a/src/repmgr/repmgr.src
+++ b/src/repmgr/repmgr.src
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2010, 2015 Oracle and/or its affiliates. All rights reserved.
*/
DBPRIVATE
diff --git a/src/repmgr/repmgr_automsg.c b/src/repmgr/repmgr_automsg.c
index 90af08ff..31bc4c35 100644
--- a/src/repmgr/repmgr_automsg.c
+++ b/src/repmgr/repmgr_automsg.c
@@ -463,6 +463,7 @@ __repmgr_membership_data_marshal(env, argp, bp)
__repmgr_membership_data_args *argp;
u_int8_t *bp;
{
+ DB_HTONL_COPYOUT(env, bp, argp->status);
DB_HTONL_COPYOUT(env, bp, argp->flags);
}
@@ -481,6 +482,7 @@ __repmgr_membership_data_unmarshal(env, argp, bp, max, nextp)
{
if (max < __REPMGR_MEMBERSHIP_DATA_SIZE)
goto too_few;
+ DB_NTOHL_COPYIN(env, argp->status, bp);
DB_NTOHL_COPYIN(env, argp->flags, bp);
if (nextp != NULL)
@@ -494,6 +496,46 @@ too_few:
}
/*
+ * PUBLIC: void __repmgr_v4membership_data_marshal __P((ENV *,
+ * PUBLIC: __repmgr_v4membership_data_args *, u_int8_t *));
+ */
+void
+__repmgr_v4membership_data_marshal(env, argp, bp)
+ ENV *env;
+ __repmgr_v4membership_data_args *argp;
+ u_int8_t *bp;
+{
+ DB_HTONL_COPYOUT(env, bp, argp->flags);
+}
+
+/*
+ * PUBLIC: int __repmgr_v4membership_data_unmarshal __P((ENV *,
+ * PUBLIC: __repmgr_v4membership_data_args *, u_int8_t *, size_t,
+ * PUBLIC: u_int8_t **));
+ */
+int
+__repmgr_v4membership_data_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __repmgr_v4membership_data_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ if (max < __REPMGR_V4MEMBERSHIP_DATA_SIZE)
+ goto too_few;
+ DB_NTOHL_COPYIN(env, argp->flags, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __repmgr_v4membership_data message"));
+ return (EINVAL);
+}
+
+/*
* PUBLIC: void __repmgr_member_metadata_marshal __P((ENV *,
* PUBLIC: __repmgr_member_metadata_args *, u_int8_t *));
*/
@@ -669,6 +711,7 @@ __repmgr_site_info_marshal(env, argp, bp, max, lenp)
bp += argp->host.size;
}
DB_HTONS_COPYOUT(env, bp, argp->port);
+ DB_HTONL_COPYOUT(env, bp, argp->status);
DB_HTONL_COPYOUT(env, bp, argp->flags);
*lenp = (size_t)(bp - start);
@@ -702,6 +745,7 @@ __repmgr_site_info_unmarshal(env, argp, bp, max, nextp)
goto too_few;
bp += argp->host.size;
DB_NTOHS_COPYIN(env, argp->port, bp);
+ DB_NTOHL_COPYIN(env, argp->status, bp);
DB_NTOHL_COPYIN(env, argp->flags, bp);
if (nextp != NULL)
@@ -715,6 +759,75 @@ too_few:
}
/*
+ * PUBLIC: int __repmgr_v4site_info_marshal __P((ENV *,
+ * PUBLIC: __repmgr_v4site_info_args *, u_int8_t *, size_t, size_t *));
+ */
+int
+__repmgr_v4site_info_marshal(env, argp, bp, max, lenp)
+ ENV *env;
+ __repmgr_v4site_info_args *argp;
+ u_int8_t *bp;
+ size_t *lenp, max;
+{
+ u_int8_t *start;
+
+ if (max < __REPMGR_V4SITE_INFO_SIZE
+ + (size_t)argp->host.size)
+ return (ENOMEM);
+ start = bp;
+
+ DB_HTONL_COPYOUT(env, bp, argp->host.size);
+ if (argp->host.size > 0) {
+ memcpy(bp, argp->host.data, argp->host.size);
+ bp += argp->host.size;
+ }
+ DB_HTONS_COPYOUT(env, bp, argp->port);
+ DB_HTONL_COPYOUT(env, bp, argp->flags);
+
+ *lenp = (size_t)(bp - start);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_v4site_info_unmarshal __P((ENV *,
+ * PUBLIC: __repmgr_v4site_info_args *, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__repmgr_v4site_info_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __repmgr_v4site_info_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ size_t needed;
+
+ needed = __REPMGR_V4SITE_INFO_SIZE;
+ if (max < needed)
+ goto too_few;
+ DB_NTOHL_COPYIN(env, argp->host.size, bp);
+ if (argp->host.size == 0)
+ argp->host.data = NULL;
+ else
+ argp->host.data = bp;
+ needed += (size_t)argp->host.size;
+ if (max < needed)
+ goto too_few;
+ bp += argp->host.size;
+ DB_NTOHS_COPYIN(env, argp->port, bp);
+ DB_NTOHL_COPYIN(env, argp->flags, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __repmgr_v4site_info message"));
+ return (EINVAL);
+}
+
+/*
* PUBLIC: void __repmgr_connect_reject_marshal __P((ENV *,
* PUBLIC: __repmgr_connect_reject_args *, u_int8_t *));
*/
@@ -726,6 +839,7 @@ __repmgr_connect_reject_marshal(env, argp, bp)
{
DB_HTONL_COPYOUT(env, bp, argp->version);
DB_HTONL_COPYOUT(env, bp, argp->gen);
+ DB_HTONL_COPYOUT(env, bp, argp->status);
}
/*
@@ -744,6 +858,7 @@ __repmgr_connect_reject_unmarshal(env, argp, bp, max, nextp)
goto too_few;
DB_NTOHL_COPYIN(env, argp->version, bp);
DB_NTOHL_COPYIN(env, argp->gen, bp);
+ DB_NTOHL_COPYIN(env, argp->status, bp);
if (nextp != NULL)
*nextp = bp;
@@ -755,3 +870,94 @@ too_few:
return (EINVAL);
}
+/*
+ * PUBLIC: void __repmgr_v4connect_reject_marshal __P((ENV *,
+ * PUBLIC: __repmgr_v4connect_reject_args *, u_int8_t *));
+ */
+void
+__repmgr_v4connect_reject_marshal(env, argp, bp)
+ ENV *env;
+ __repmgr_v4connect_reject_args *argp;
+ u_int8_t *bp;
+{
+ DB_HTONL_COPYOUT(env, bp, argp->version);
+ DB_HTONL_COPYOUT(env, bp, argp->gen);
+}
+
+/*
+ * PUBLIC: int __repmgr_v4connect_reject_unmarshal __P((ENV *,
+ * PUBLIC: __repmgr_v4connect_reject_args *, u_int8_t *, size_t,
+ * PUBLIC: u_int8_t **));
+ */
+int
+__repmgr_v4connect_reject_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __repmgr_v4connect_reject_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ if (max < __REPMGR_V4CONNECT_REJECT_SIZE)
+ goto too_few;
+ DB_NTOHL_COPYIN(env, argp->version, bp);
+ DB_NTOHL_COPYIN(env, argp->gen, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __repmgr_v4connect_reject message"));
+ return (EINVAL);
+}
+
+/*
+ * PUBLIC: void __repmgr_lsnhist_match_marshal __P((ENV *,
+ * PUBLIC: __repmgr_lsnhist_match_args *, u_int8_t *));
+ */
+void
+__repmgr_lsnhist_match_marshal(env, argp, bp)
+ ENV *env;
+ __repmgr_lsnhist_match_args *argp;
+ u_int8_t *bp;
+{
+ DB_HTONL_COPYOUT(env, bp, argp->lsn.file);
+ DB_HTONL_COPYOUT(env, bp, argp->lsn.offset);
+ DB_HTONL_COPYOUT(env, bp, argp->hist_sec);
+ DB_HTONL_COPYOUT(env, bp, argp->hist_nsec);
+ DB_HTONL_COPYOUT(env, bp, argp->next_gen_lsn.file);
+ DB_HTONL_COPYOUT(env, bp, argp->next_gen_lsn.offset);
+}
+
+/*
+ * PUBLIC: int __repmgr_lsnhist_match_unmarshal __P((ENV *,
+ * PUBLIC: __repmgr_lsnhist_match_args *, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__repmgr_lsnhist_match_unmarshal(env, argp, bp, max, nextp)
+ ENV *env;
+ __repmgr_lsnhist_match_args *argp;
+ u_int8_t *bp;
+ size_t max;
+ u_int8_t **nextp;
+{
+ if (max < __REPMGR_LSNHIST_MATCH_SIZE)
+ goto too_few;
+ DB_NTOHL_COPYIN(env, argp->lsn.file, bp);
+ DB_NTOHL_COPYIN(env, argp->lsn.offset, bp);
+ DB_NTOHL_COPYIN(env, argp->hist_sec, bp);
+ DB_NTOHL_COPYIN(env, argp->hist_nsec, bp);
+ DB_NTOHL_COPYIN(env, argp->next_gen_lsn.file, bp);
+ DB_NTOHL_COPYIN(env, argp->next_gen_lsn.offset, bp);
+
+ if (nextp != NULL)
+ *nextp = bp;
+ return (0);
+
+too_few:
+ __db_errx(env, DB_STR("3675",
+ "Not enough input bytes to fill a __repmgr_lsnhist_match message"));
+ return (EINVAL);
+}
+
diff --git a/src/repmgr/repmgr_elect.c b/src/repmgr/repmgr_elect.c
index 3a84694a..15a2de7b 100644
--- a/src/repmgr/repmgr_elect.c
+++ b/src/repmgr/repmgr_elect.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -12,9 +12,9 @@
static db_timeout_t __repmgr_compute_response_time __P((ENV *));
static int __repmgr_elect __P((ENV *, u_int32_t, db_timespec *));
-static int __repmgr_elect_main __P((ENV *, REPMGR_RUNNABLE *));
+static int __repmgr_elect_main __P((ENV *,
+ DB_THREAD_INFO *, REPMGR_RUNNABLE *));
static void *__repmgr_elect_thread __P((void *));
-static int send_membership __P((ENV *));
/*
* Starts an election thread.
@@ -90,26 +90,39 @@ __repmgr_elect_thread(argsp)
{
REPMGR_RUNNABLE *th;
ENV *env;
+ DB_THREAD_INFO *ip;
int ret;
th = argsp;
env = th->env;
+ ip = NULL;
+ ret = 0;
- RPRINT(env, (env, DB_VERB_REPMGR_MISC, "starting election thread"));
+ ENV_ENTER_RET(env, ip, ret);
+ if (ret == 0)
+ RPRINT(env, (env,
+ DB_VERB_REPMGR_MISC, "starting election thread"));
- if ((ret = __repmgr_elect_main(env, th)) != 0) {
+ if (ret != 0 || (ret = __repmgr_elect_main(env, ip, th)) != 0) {
__db_err(env, ret, "election thread failed");
+ RPRINT(env, (env,
+ DB_VERB_REPMGR_MISC, "election thread is exiting"));
+ ENV_LEAVE(env, ip);
(void)__repmgr_thread_failure(env, ret);
}
-
- RPRINT(env, (env, DB_VERB_REPMGR_MISC, "election thread is exiting"));
+ if (ret == 0) {
+ RPRINT(env, (env,
+ DB_VERB_REPMGR_MISC, "election thread is exiting"));
+ ENV_LEAVE(env, ip);
+ }
th->finished = TRUE;
return (NULL);
}
static int
-__repmgr_elect_main(env, th)
+__repmgr_elect_main(env, ip, th)
ENV *env;
+ DB_THREAD_INFO *ip;
REPMGR_RUNNABLE *th;
{
DB_REP *db_rep;
@@ -123,10 +136,13 @@ __repmgr_elect_main(env, th)
db_timespec failtime, now, repstart_time, target, wait_til;
db_timeout_t delay_time, response_time, tmp_time;
u_long sec, usec;
- u_int32_t flags;
- int done_repstart, ret, suppress_election;
+ u_int32_t flags, max_tries, tries;
+ int client_detected, done_repstart, lsnhist_match, master_detected;
+ int ret, suppress_election;
enum { ELECTION, REPSTART } action;
+ COMPQUIET(usec, 0);
+ COMPQUIET(max_tries, 0);
COMPQUIET(action, ELECTION);
db_rep = env->rep_handle;
@@ -181,6 +197,120 @@ __repmgr_elect_main(env, th)
UNLOCK_MUTEX(db_rep->mutex);
/*
+ * In preferred master mode, the select thread signals when a
+ * client has lost its connection to the master via prefmas_pending,
+ * but the actual restart as temporary master is done here in an
+ * election thread.
+ */
+ if (IS_PREFMAS_MODE(env) && F_ISSET(rep, REP_F_CLIENT) &&
+ db_rep->prefmas_pending == start_temp_master) {
+ db_rep->prefmas_pending = no_action;
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "elect_main preferred master restart temp master"));
+ ret = __repmgr_become_master(env, 0);
+ goto out;
+ }
+
+ /* Get preferred master wait limits for detecting the other site. */
+ if (IS_PREFMAS_MODE(env) &&
+ (ret = __repmgr_prefmas_get_wait(env, &max_tries, &usec)) != 0)
+ goto out;
+
+ /* Preferred master mode master site start-up. */
+ if (IS_PREFMAS_MODE(env) &&
+ FLD_ISSET(rep->config, REP_C_PREFMAS_MASTER) &&
+ LF_ISSET(ELECT_F_STARTUP)) {
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "elect_main preferred master site startup"));
+ client_detected = FALSE;
+ lsnhist_match = FALSE;
+ tries = 0;
+ while (!client_detected && tries < max_tries) {
+ __os_yield(env, 0, usec);
+ tries++;
+ client_detected = __repmgr_prefmas_connected(env);
+ }
+ if (client_detected) {
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "elect_main preferred master client detected"));
+ /*
+ * Restart remote site as a client. Depending on the
+ * outcome of lsnhist_match below, this site will
+ * either restart as master or it will start an
+ * election. In either case, the remote site should
+ * be running as a client.
+ *
+ * Then perform the lsnhist_match comparison.
+ */
+ if ((ret = __repmgr_restart_site_as_client(
+ env, 1)) != 0 ||
+ (ret = __repmgr_lsnhist_match(env,
+ ip, 1, &lsnhist_match)) != 0)
+ goto out;
+ /*
+ * An lsnhist_match means that we have a continuous
+ * set of transactions and it is safe to call a
+ * comparison election to preserve any temporary master
+ * transactions that were committed while this site
+ * was down.
+ */
+ if (lsnhist_match) {
+ F_CLR(rep, REP_F_HOLD_GEN);
+ LF_SET(ELECT_F_IMMED);
+ LF_CLR(ELECT_F_STARTUP);
+ /* Continue on to election code below. */
+ }
+ }
+ /*
+ * If we didn't detect a client within a reasonable time or
+ * we failed the lsnhist_match (meaning we have conflicting
+ * sets of transactions), we start this site as a master and
+ * possibly force rollback of temporary master transactions.
+ */
+ if (!client_detected || !lsnhist_match) {
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "elect_main preferred master site start master"));
+ ret = __repmgr_become_master(env, 0);
+ F_CLR(rep, REP_F_HOLD_GEN);
+ goto out;
+ }
+ }
+
+ /* Preferred master mode client site start-up. */
+ if (IS_PREFMAS_MODE(env) &&
+ FLD_ISSET(rep->config, REP_C_PREFMAS_CLIENT) &&
+ LF_ISSET(ELECT_F_STARTUP)) {
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "elect_main preferred master client site startup"));
+ master_detected = FALSE;
+ tries = 0;
+ while (!master_detected && tries < max_tries) {
+ __os_yield(env, 0, usec);
+ tries++;
+ master_detected = __repmgr_prefmas_connected(env);
+ }
+ /*
+ * If we find the master, restart as client here so that we
+ * send a newclient message after we are connected to the
+ * master. The master will send a newmaster message so that
+ * we can start the client sync process.
+ *
+ * If we haven't found the master after the timeout, start as
+ * temporary master.
+ */
+ if (master_detected) {
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "elect_main preferred master detected"));
+ ret = __repmgr_become_client(env);
+ } else {
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "elect_main preferred master client start master"));
+ ret = __repmgr_become_master(env, 0);
+ }
+ goto out;
+ }
+
+ /*
* The 'done_repstart' flag keeps track of which was our most recent
* operation (repstart or election), so that we can alternate
* appropriately. There are a few different ways this thread can be
@@ -188,7 +318,7 @@ __repmgr_elect_main(env, th)
* called. The one exception is at initial start-up, where we
* first probe for a master by sending out rep_start(CLIENT) calls.
*/
- if (LF_ISSET(ELECT_F_IMMED)) {
+ if (LF_ISSET(ELECT_F_IMMED) && !IS_VIEW_SITE(env)) {
/*
* When the election succeeds, we've successfully completed
* everything we need to do. If it fails in an unexpected way,
@@ -256,11 +386,13 @@ __repmgr_elect_main(env, th)
/*
* See if it's time to retry the operation. Normally it's an
* election we're interested in retrying. But we refrain from
- * calling for elections if so configured.
+ * calling for elections if so configured or we are a view.
*/
- suppress_election = LF_ISSET(ELECT_F_STARTUP) ?
+ suppress_election = IS_VIEW_SITE(env) ||
+ (LF_ISSET(ELECT_F_STARTUP) ?
db_rep->init_policy == DB_REP_CLIENT :
- !FLD_ISSET(rep->config, REP_C_ELECTIONS);
+ !FLD_ISSET(rep->config, REP_C_ELECTIONS)) ||
+ LF_ISSET(ELECT_F_CLIENT_RESTART);
repstart_time = db_rep->repstart_time;
target = suppress_election ? repstart_time : failtime;
TIMESPEC_ADD_DB_TIMEOUT(&target, rep->election_retry_wait);
@@ -343,7 +475,8 @@ __repmgr_elect_main(env, th)
DB_ASSERT(env, action == REPSTART);
db_rep->new_connection = FALSE;
- if ((ret = __repmgr_repstart(env, DB_REP_CLIENT)) != 0)
+ if ((ret = __repmgr_repstart(env,
+ DB_REP_CLIENT, 0)) != 0)
goto out;
done_repstart = TRUE;
@@ -476,7 +609,20 @@ __repmgr_elect(env, flags, failtimep)
case DB_REP_UNAVAIL:
__os_gettime(env, failtimep, 1);
DB_EVENT(env, DB_EVENT_REP_ELECTION_FAILED, NULL);
- if ((t_ret = send_membership(env)) != 0)
+ /*
+ * If an election fails with DB_REP_UNAVAIL, it could be
+ * because a participating site has an obsolete, too-high
+ * notion of the group size. (This could happen if the site
+ * was down/disconnected during removal of some (other) sites.)
+ * To remedy this, broadcast a current copy of the membership
+ * list. Since all sites are doing this, and we always ratchet
+ * to the most up-to-date version, this should bring all sites
+ * up to date. We only do this after a failure, during what
+ * will normally be an idle period anyway, so that we don't
+ * slow down a first election following the loss of an active
+ * master.
+ */
+ if ((t_ret = __repmgr_bcast_member_list(env)) != 0)
ret = t_ret;
break;
@@ -498,40 +644,6 @@ __repmgr_elect(env, flags, failtimep)
}
/*
- * If an election fails with DB_REP_UNAVAIL, it could be because a participating
- * site has an obsolete, too-high notion of the group size. (This could happen
- * if the site was down/disconnected during removal of some (other) sites.) To
- * remedy this, broadcast a current copy of the membership list. Since all
- * sites are doing this, and we always ratchet to the most up-to-date version,
- * this should bring all sites up to date. We only do this after a failure,
- * during what will normally be an idle period anyway, so that we don't slow
- * down a first election following the loss of an active master.
- */
-static int
-send_membership(env)
- ENV *env;
-{
- DB_REP *db_rep;
- u_int8_t *buf;
- size_t len;
- int ret;
-
- db_rep = env->rep_handle;
- buf = NULL;
- LOCK_MUTEX(db_rep->mutex);
- if ((ret = __repmgr_marshal_member_list(env, &buf, &len)) != 0)
- goto out;
- RPRINT(env, (env, DB_VERB_REPMGR_MISC,
- "Broadcast latest membership list"));
- ret = __repmgr_bcast_own_msg(env, REPMGR_SHARING, buf, len);
-out:
- UNLOCK_MUTEX(db_rep->mutex);
- if (buf != NULL)
- __os_free(env, buf);
- return (ret);
-}
-
-/*
* Becomes master after we've won an election, if we can.
*
* PUBLIC: int __repmgr_claim_victory __P((ENV *));
@@ -543,7 +655,7 @@ __repmgr_claim_victory(env)
int ret;
env->rep_handle->takeover_pending = FALSE;
- if ((ret = __repmgr_become_master(env)) == DB_REP_UNAVAIL) {
+ if ((ret = __repmgr_become_master(env, 0)) == DB_REP_UNAVAIL) {
ret = 0;
RPRINT(env, (env, DB_VERB_REPMGR_MISC,
"Won election but lost race with DUPMASTER client intent"));
diff --git a/src/repmgr/repmgr_method.c b/src/repmgr/repmgr_method.c
index 229cf650..729ba5ff 100644
--- a/src/repmgr/repmgr_method.c
+++ b/src/repmgr/repmgr_method.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -29,19 +29,17 @@ static int get_channel_connection __P((CHANNEL *, REPMGR_CONNECTION **));
static int init_dbsite __P((ENV *, int, const char *, u_int, DB_SITE **));
static int join_group_at_site __P((ENV *, repmgr_netaddr_t *));
static int kick_blockers __P((ENV *, REPMGR_CONNECTION *, void *));
-static int make_request_conn __P((ENV *,
- repmgr_netaddr_t *, REPMGR_CONNECTION **));
static int set_local_site __P((DB_SITE *, u_int32_t));
-static int read_own_msg __P((ENV *,
- REPMGR_CONNECTION *, u_int32_t *, u_int8_t **, size_t *));
static int refresh_site __P((DB_SITE *));
static int __repmgr_await_threads __P((ENV *));
static int __repmgr_build_data_out __P((ENV *,
DBT *, u_int32_t, __repmgr_msg_metadata_args *, REPMGR_IOVECS **iovecsp));
static int __repmgr_build_msg_out __P((ENV *,
DBT *, u_int32_t, __repmgr_msg_metadata_args *, REPMGR_IOVECS **iovecsp));
+static int __repmgr_demote_site(ENV *, int);
static int repmgr_only __P((ENV *, const char *));
static int __repmgr_restart __P((ENV *, int, u_int32_t));
+static int __repmgr_remove_and_close_site __P((DB_SITE *));
static int __repmgr_remove_site __P((DB_SITE *));
static int __repmgr_remove_site_pp __P((DB_SITE *));
static int __repmgr_start_msg_threads __P((ENV *, u_int));
@@ -52,25 +50,21 @@ static int send_msg_self __P((ENV *, REPMGR_IOVECS *, u_int32_t));
static int site_by_addr __P((ENV *, const char *, u_int, DB_SITE **));
/*
- * PUBLIC: int __repmgr_start __P((DB_ENV *, int, u_int32_t));
+ * PUBLIC: int __repmgr_start_pp __P((DB_ENV *, int, u_int32_t));
*/
int
-__repmgr_start(dbenv, nthreads, flags)
+__repmgr_start_pp(dbenv, nthreads, flags)
DB_ENV *dbenv;
int nthreads;
u_int32_t flags;
{
DB_REP *db_rep;
- REP *rep;
- REPMGR_SITE *me, *site;
- DB_THREAD_INFO *ip;
ENV *env;
- int first, is_listener, locked, min, need_masterseek, ret, start_master;
- u_int i, n;
+ DB_THREAD_INFO *ip;
+ int ret;
env = dbenv->env;
db_rep = env->rep_handle;
- rep = db_rep->region;
switch (flags) {
case 0:
@@ -102,7 +96,27 @@ __repmgr_start(dbenv, nthreads, flags)
return (EINVAL);
}
- /* Check if it is a shut-down site, if so, clean the resources. */
+ /* A view site cannot be started as MASTER or ELECTION. */
+ if (IS_VIEW_SITE(env) &&
+ (flags == DB_REP_MASTER || flags == DB_REP_ELECTION)) {
+ __db_errx(env, DB_STR("3694",
+ "A view site must be started with DB_REP_CLIENT"));
+ return (EINVAL);
+ }
+
+ /* Must start site as client in preferred master mode. */
+ if (PREFMAS_IS_SET(env) &&
+ (flags == DB_REP_MASTER || flags == DB_REP_ELECTION)) {
+ __db_errx(env, DB_STR("3702",
+ "A preferred master site must be started with "
+ "DB_REP_CLIENT"));
+ return (EINVAL);
+ }
+
+ /*
+ * Check if it is a shut-down site; if so, clean up the resources and
+ * reset the status in order to get ready to start replication.
+ */
if (db_rep->repmgr_status == stopped) {
if ((ret = __repmgr_stop(env)) != 0) {
__db_errx(env, DB_STR("3638",
@@ -112,7 +126,55 @@ __repmgr_start(dbenv, nthreads, flags)
db_rep->repmgr_status = ready;
}
+ /* Record the original configurations given by application. */
+ ENV_ENTER(env, ip);
db_rep->init_policy = flags;
+ db_rep->config_nthreads = nthreads;
+ ret = __repmgr_start_int(env, nthreads, flags);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * Internal processing to start replication manager.
+ *
+ * PUBLIC: int __repmgr_start_int __P((ENV *, int, u_int32_t));
+ */
+int
+__repmgr_start_int(env, nthreads, flags)
+ ENV *env;
+ int nthreads;
+ u_int32_t flags;
+{
+ DB_LOG *dblp;
+ DB_REP *db_rep;
+ LOG *lp;
+ REP *rep;
+ REPMGR_SITE *me, *site;
+ u_int32_t startopts;
+ int first, flags_error, is_listener, locked, min;
+ int need_masterseek, ret, start_master;
+ u_int i, n;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ flags_error = 0;
+ startopts = 0;
+
+ /*
+ * For preferred master mode master site startup, we need to save the
+ * log location at the end of our previous transactions for
+ * the lsnhist_match comparisons. Starting repmgr adds a few
+ * more log records that we don't want to count in lsnhist_match.
+ */
+ if (FLD_ISSET(rep->config, REP_C_PREFMAS_MASTER)) {
+ LOG_SYSTEM_LOCK(env);
+ db_rep->prefmas_init_lsn = lp->lsn;
+ LOG_SYSTEM_UNLOCK(env);
+ }
+
if ((ret = __rep_set_transport_int(env,
db_rep->self_eid, __repmgr_send)) != 0)
return (ret);
@@ -128,7 +190,8 @@ __repmgr_start(dbenv, nthreads, flags)
if (db_rep->restored_list != NULL) {
ret = __repmgr_refresh_membership(env,
- db_rep->restored_list, db_rep->restored_list_length);
+ db_rep->restored_list, db_rep->restored_list_length,
+ DB_REPMGR_VERSION);
__os_free(env, db_rep->restored_list);
db_rep->restored_list = NULL;
} else {
@@ -145,9 +208,15 @@ __repmgr_start(dbenv, nthreads, flags)
* join.
*/
ret = __repmgr_join_group(env);
+ else if (VIEW_TO_PARTICIPANT(db_rep, me)) {
+ __db_errx(env, DB_STR("3695",
+ "A view site must be started with a view callback"));
+ return (EINVAL);
+ }
} else if (ret == ENOENT) {
- ENV_ENTER(env, ip);
- if (FLD_ISSET(me->config, DB_GROUP_CREATOR))
+ if (FLD_ISSET(me->config, DB_GROUP_CREATOR) ||
+ (IS_PREFMAS_MODE(env) &&
+ FLD_ISSET(rep->config, REP_C_PREFMAS_MASTER)))
start_master = TRUE;
/*
* LEGACY is inconsistent with CREATOR, but start_master
@@ -166,10 +235,12 @@ __repmgr_start(dbenv, nthreads, flags)
continue;
if ((ret = __repmgr_set_membership(env,
site->net_addr.host,
- site->net_addr.port,
- SITE_PRESENT)) != 0)
+ site->net_addr.port, SITE_PRESENT,
+ site->gmdb_flags)) != 0)
break;
- n++;
+ if (!FLD_ISSET(site->gmdb_flags,
+ SITE_VIEW))
+ n++;
}
ret = __rep_set_nsites_int(env, n);
DB_ASSERT(env, ret == 0);
@@ -180,30 +251,27 @@ __repmgr_start(dbenv, nthreads, flags)
db_rep->member_version_gen = 1;
if ((ret = __repmgr_set_membership(env,
me->net_addr.host, me->net_addr.port,
- SITE_PRESENT)) == 0) {
+ SITE_PRESENT, 0)) == 0) {
ret = __rep_set_nsites_int(env, 1);
DB_ASSERT(env, ret == 0);
}
UNLOCK_MUTEX(db_rep->mutex);
} else
ret = __repmgr_join_group(env);
- ENV_LEAVE(env, ip);
} else if (ret == DB_DELETED)
ret = DB_REP_UNAVAIL;
}
if (ret != 0)
return (ret);
- DB_ASSERT(env, start_master ||
- SITE_FROM_EID(db_rep->self_eid)->membership == SITE_PRESENT);
-
/*
- * If we're the first repmgr_start() call, we will have to start threads.
- * Therefore, we require a flags value (to tell us how).
+ * Catch case where user defines a different local site address than
+ * the one in the restored_list from an ongoing internal init.
*/
- if (db_rep->repmgr_status != running && flags == 0) {
- __db_errx(env, DB_STR("3639",
- "a non-zero flags value is required for initial repmgr_start() call"));
+ if (!start_master &&
+ SITE_FROM_EID(db_rep->self_eid)->membership != SITE_PRESENT) {
+ __db_errx(env, DB_STR("3696",
+ "Current local site conflicts with earlier definition"));
return (EINVAL);
}
@@ -214,37 +282,54 @@ __repmgr_start(dbenv, nthreads, flags)
*
* Then, in case there could be multiple processes, we're either the
* main listener process or a subordinate process. On a "subsequent"
- * repmgr_start() call we already have enough information to know which
- * it is. Otherwise, negotiate with information in the shared region to
- * claim the listener role if possible.
+ * repmgr_start() call, with a running main listener process, we already
+ * have enough information to know which it is. Otherwise, if there is
+ * no listener, negotiate with information in the shared region to claim
+ * the listener role if possible. Once we decide we're the listener,
+ * mark the listener id in the shared region, so that no other process
+ * thinks the same thing.
*
* To avoid a race, once we decide we're in the first call, mark the
* handle as started, so that no other thread thinks the same thing.
*/
+ first = FALSE;
+ is_listener = FALSE;
LOCK_MUTEX(db_rep->mutex);
locked = TRUE;
- if (db_rep->repmgr_status == running) {
- first = FALSE;
+ if (db_rep->repmgr_status == running && !(rep->listener == 0 &&
+ FLD_ISSET(rep->config, REP_C_AUTOTAKEOVER)))
is_listener = !IS_SUBORDINATE(db_rep);
- } else {
+ else if (db_rep->repmgr_status != running &&
+ rep->listener == 0 && flags == 0)
+ flags_error = 1;
+ else {
first = TRUE;
db_rep->repmgr_status = running;
- ENV_ENTER(env, ip);
MUTEX_LOCK(env, rep->mtx_repmgr);
if (rep->listener == 0) {
is_listener = TRUE;
- __os_id(dbenv, &rep->listener, NULL);
- } else {
- is_listener = FALSE;
+ __os_id(env->dbenv, &rep->listener, NULL);
+ } else
nthreads = 0;
- }
MUTEX_UNLOCK(env, rep->mtx_repmgr);
- ENV_LEAVE(env, ip);
}
UNLOCK_MUTEX(db_rep->mutex);
locked = FALSE;
+ /*
+ * The first repmgr_start() call for the main listener process
+ * requires a flags value to tell us how to start up the site.
+ * But we don't require a flags value for the repmgr_start()
+ * call for a subordinate process because the site is already
+ * started and we would only ignore the value anyway.
+ */
+ if (flags_error) {
+ __db_errx(env, DB_STR("3639",
+ "A non-zero flags value is required for initial repmgr_start() call"));
+ return (EINVAL);
+ }
+
if (!first) {
/*
* Subsequent call is allowed when ELECTIONS are turned off, so
@@ -266,7 +351,7 @@ __repmgr_start(dbenv, nthreads, flags)
/*
* The minimum legal number of threads is either 1 or 0, depending upon
- * whether we're the main process or a subordinate.
+ * whether we're the listener process or a subordinate.
*/
min = is_listener ? 1 : 0;
if (nthreads < min) {
@@ -303,14 +388,24 @@ __repmgr_start(dbenv, nthreads, flags)
* of rep_start calls even within an env region lifetime.
*/
if (start_master) {
- ret = __repmgr_become_master(env);
+ ret = __repmgr_become_master(env, 0);
/* No other repmgr threads running yet. */
DB_ASSERT(env, ret != DB_REP_UNAVAIL);
if (ret != 0)
goto err;
need_masterseek = FALSE;
} else {
- if ((ret = __repmgr_repstart(env, DB_REP_CLIENT)) != 0)
+ /*
+ * The preferred master site cannot allow its gen
+ * to change until it has done its lsnhist_match to
+ * guarantee that no preferred master transactions
+ * will be rolled back.
+ */
+ if (IS_PREFMAS_MODE(env) &&
+ FLD_ISSET(rep->config, REP_C_PREFMAS_MASTER))
+ startopts = REP_START_HOLD_CLIGEN;
+ if ((ret = __repmgr_repstart(env,
+ DB_REP_CLIENT, startopts)) != 0)
goto err;
/*
* The repmgr election code starts elections only if
@@ -352,6 +447,7 @@ __repmgr_start(dbenv, nthreads, flags)
if ((ret =
__repmgr_start_msg_threads(env, (u_int)nthreads)) != 0)
goto err;
+ rep->listener_nthreads = (u_int)nthreads;
if (need_masterseek) {
/*
@@ -374,10 +470,47 @@ __repmgr_start(dbenv, nthreads, flags)
}
UNLOCK_MUTEX(db_rep->mutex);
locked = FALSE;
+ /*
+ * Turn on the DB_EVENT_REP_INQUEUE_FULL event firing. We only
+ * do this for the main listener process. For a subordinate
+ * process, it is always turned on.
+ */
+ rep->inqueue_full_event_on = 1;
+ }
+ if (db_rep->selector == NULL) {
+ /* All processes (even non-listeners) need a select() thread. */
+ if ((ret = __repmgr_start_selector(env)) == 0) {
+ /*
+ * A view callback is set but this site isn't yet a
+ * view in the internal site list. Do the view
+ * demotion here, which will update the internal
+ * site list. We need the select() thread for the
+ * demotion because the demotion performs gmdb
+ * operations.
+ */
+ if (PARTICIPANT_TO_VIEW(db_rep,
+ SITE_FROM_EID(db_rep->self_eid)) &&
+ (ret = __repmgr_demote_site(env,
+ db_rep->self_eid)) != 0)
+ goto err;
+ return (is_listener ? 0 : DB_REP_IGNORE);
+ }
+ } else {
+ /*
+ * If the selector thread already exists, the current process
+ * should be the new listener which has just finished a
+ * takeover. Now, all active connections need to be refreshed
+ * to notify remote sites about the new listener. If a new
+ * connection is established immediately, disable the existing
+ * main connection to the same site. Otherwise, schedule a
+ * second immediate attempt. If it still fails, disable the
+ * main connection and retry a connection as usual.
+ */
+ DB_ASSERT(env, is_listener &&
+ FLD_ISSET(rep->config, REP_C_AUTOTAKEOVER));
+ if ((ret = __repmgr_refresh_selector(env)) == 0)
+ return (0);
}
- /* All processes (even non-listeners) need a select() thread. */
- if ((ret = __repmgr_start_selector(env)) == 0)
- return (is_listener ? 0 : DB_REP_IGNORE);
err:
/* If we couldn't succeed at everything, undo the parts we did do. */
@@ -392,6 +525,16 @@ err:
if (!locked)
LOCK_MUTEX(db_rep->mutex);
(void)__repmgr_net_close(env);
+ /* Reset the listener when we fail before having a valid listen_fd. */
+ if (first && is_listener)
+ rep->listener = 0;
+ /*
+ * Reset repmgr_status when we fail before starting a selector if the
+ * earlier call to __repmgr_stop_threads() hasn't already reset it to
+ * stopped.
+ */
+ if (db_rep->repmgr_status == running)
+ db_rep->repmgr_status = ready;
UNLOCK_MUTEX(db_rep->mutex);
return (ret);
}
@@ -425,6 +568,53 @@ __repmgr_valid_config(env, flags)
}
/*
+ * Set priority, heartbeat and election_retry timeouts for preferred master
+ * mode. Turn on 2SITE_STRICT and ELECTIONS. Can be called whether or not
+ * REP_ON() is true.
+ *
+ * PUBLIC: int __repmgr_prefmas_auto_config __P((DB_ENV *, u_int32_t *));
+ */
+int __repmgr_prefmas_auto_config (dbenv, config_flags)
+ DB_ENV *dbenv;
+ u_int32_t *config_flags;
+{
+ ENV * env;
+ db_timeout_t timeout;
+ int ret;
+
+ env = dbenv->env;
+ timeout = 0;
+
+ /* Change heartbeat timeouts if they are not already set. */
+ if ((ret = __rep_get_timeout(dbenv,
+ DB_REP_HEARTBEAT_MONITOR, &timeout)) == 0 &&
+ timeout == 0 && (ret = __rep_set_timeout_int(env,
+ DB_REP_HEARTBEAT_MONITOR,
+ DB_REPMGR_PREFMAS_HEARTBEAT_MONITOR)) != 0)
+ return (ret);
+ if ((ret = __rep_get_timeout(dbenv,
+ DB_REP_HEARTBEAT_SEND, &timeout)) == 0 &&
+ timeout == 0 && (ret = __rep_set_timeout_int(env,
+ DB_REP_HEARTBEAT_SEND, DB_REPMGR_PREFMAS_HEARTBEAT_SEND)) != 0)
+ return (ret);
+
+ /* Change election_retry timeout if it is still the default value. */
+ if ((ret = __rep_get_timeout(dbenv,
+ DB_REP_ELECTION_RETRY, &timeout)) == 0 &&
+ timeout == DB_REPMGR_DEFAULT_ELECTION_RETRY &&
+ (ret = __rep_set_timeout_int(env,
+ DB_REP_ELECTION_RETRY, DB_REPMGR_PREFMAS_ELECTION_RETRY)) != 0)
+ return (ret);
+
+ if ((ret = __rep_set_priority_int(env, FLD_ISSET(*config_flags,
+ REP_C_PREFMAS_MASTER) ? DB_REPMGR_PREFMAS_PRIORITY_MASTER :
+ DB_REPMGR_PREFMAS_PRIORITY_CLIENT)) != 0)
+ return (ret);
+ FLD_SET(*config_flags, REP_C_ELECTIONS | REP_C_2SITE_STRICT);
+ return (0);
+}
+
+/*
* Starts message processing threads. On entry, the actual number of threads
* already active is db_rep->nthreads; the desired number of threads is passed
* as "n".
@@ -473,7 +663,7 @@ __repmgr_restart(env, nthreads, flags)
REP *rep;
REPMGR_RUNNABLE **th;
u_int32_t cur_repflags;
- int locked, ret, t_ret;
+ int locked, ret, role_change, t_ret;
u_int delta, i, min, nth;
th = NULL;
@@ -491,6 +681,7 @@ __repmgr_restart(env, nthreads, flags)
}
ret = 0;
+ role_change = 0;
db_rep = env->rep_handle;
DB_ASSERT(env, REP_ON(env));
rep = db_rep->region;
@@ -498,11 +689,14 @@ __repmgr_restart(env, nthreads, flags)
cur_repflags = F_ISSET(rep, REP_F_MASTER | REP_F_CLIENT);
DB_ASSERT(env, cur_repflags);
if (FLD_ISSET(cur_repflags, REP_F_MASTER) &&
- flags == DB_REP_CLIENT)
+ flags == DB_REP_CLIENT) {
ret = __repmgr_become_client(env);
- else if (FLD_ISSET(cur_repflags, REP_F_CLIENT) &&
- flags == DB_REP_MASTER)
- ret = __repmgr_become_master(env);
+ role_change = 1;
+ } else if (FLD_ISSET(cur_repflags, REP_F_CLIENT) &&
+ flags == DB_REP_MASTER) {
+ ret = __repmgr_become_master(env, 0);
+ role_change = 1;
+ }
if (ret != 0)
return (ret);
@@ -574,6 +768,9 @@ __repmgr_restart(env, nthreads, flags)
}
__os_free(env, th);
}
+ /* We will always turn on the inqueue full event after role change. */
+ if (role_change)
+ rep->inqueue_full_event_on = 1;
out: if (locked)
UNLOCK_MUTEX(db_rep->mutex);
@@ -668,7 +865,8 @@ __repmgr_start_selector(env)
* PUBLIC: int __repmgr_close __P((ENV *));
*
* Close repmgr during env close. It stops repmgr, frees sites array and
- * its addresses.
+ * its addresses. Note that it is possible for the sites array to exist
+ * and require deallocation independently of whether repmgr was started.
*/
int
__repmgr_close(env)
@@ -679,10 +877,15 @@ __repmgr_close(env)
int ret;
u_int i;
- db_rep = env->rep_handle;
+ if ((db_rep = env->rep_handle) == NULL)
+ return (0);
ret = 0;
- ret = __repmgr_stop(env);
+ /* Stop repmgr and all of its threads if it was previously started. */
+ if (IS_ENV_REPLICATED(env))
+ ret = __repmgr_stop(env);
+
+ /* Clean up sites array regardless of whether we could stop repmgr. */
if (db_rep->sites != NULL) {
for (i = 0; i < db_rep->site_cnt; i++) {
site = &db_rep->sites[i];
@@ -756,9 +959,9 @@ __repmgr_set_ack_policy(dbenv, policy)
DB_ENV *dbenv;
int policy;
{
+ ENV *env;
DB_REP *db_rep;
DB_THREAD_INFO *ip;
- ENV *env;
REP *rep;
int ret;
@@ -823,6 +1026,208 @@ __repmgr_get_ack_policy(dbenv, policy)
}
/*
+ * PUBLIC: int __repmgr_set_incoming_queue_max __P((DB_ENV *, u_int32_t,
+ * PUBLIC: u_int32_t));
+ *
+ * Sets the maximum amount of dynamic memory used by the Replication Manager
+ * incoming queue.
+ */
+int
+__repmgr_set_incoming_queue_max(dbenv, gbytes, bytes)
+ DB_ENV *dbenv;
+ u_int32_t gbytes, bytes;
+{
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ REP *rep;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ ENV_NOT_CONFIGURED(
+ env, db_rep->region, "DB_ENV->repmgr_set_incoming_queue_max",
+ DB_INIT_REP);
+
+ if (APP_IS_BASEAPI(env)) {
+ __db_errx(env, "%s %s",
+ "DB_ENV->repmgr_set_incoming_queue_max:",
+ "cannot call from base replication application");
+ return (EINVAL);
+ }
+
+ /*
+ * If the caller provided 0 for the size, the size will be unlimited.
+ */
+ if (gbytes == 0 && bytes == 0) {
+ gbytes = UINT32_MAX;
+ bytes = GIGABYTE - 1;
+ }
+
+ while (bytes >= GIGABYTE) {
+ bytes -= GIGABYTE;
+ if (gbytes < UINT32_MAX)
+ gbytes++;
+ }
+
+ if (REP_ON(env)) {
+ ENV_ENTER(env, ip);
+ MUTEX_LOCK(env, rep->mtx_repmgr);
+ rep->inqueue_max_gbytes = gbytes;
+ rep->inqueue_max_bytes = bytes;
+ __repmgr_set_incoming_queue_redzone(rep, gbytes, bytes);
+ MUTEX_UNLOCK(env, rep->mtx_repmgr);
+ ENV_LEAVE(env, ip);
+ } else {
+ db_rep->inqueue_max_gbytes = gbytes;
+ db_rep->inqueue_max_bytes = bytes;
+ }
+
+ /*
+ * Setting incoming queue maximum sizes makes this a replication
+ * manager application.
+ */
+ APP_SET_REPMGR(env);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_get_incoming_queue_max __P((DB_ENV *, u_int32_t *,
+ * PUBLIC: u_int32_t *));
+ *
+ * Gets the maximum amount of dynamic memory that can be used by the
+ * Replication Manager incoming queue.
+ */
+int
+__repmgr_get_incoming_queue_max(dbenv, gbytesp, bytesp)
+ DB_ENV *dbenv;
+ u_int32_t *gbytesp, *bytesp;
+{
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ DB_REP *db_rep;
+ REP *rep;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ if (REP_ON(env)) {
+ ENV_ENTER(env, ip);
+ MUTEX_LOCK(env, rep->mtx_repmgr);
+ *gbytesp = rep->inqueue_max_gbytes;
+ *bytesp = rep->inqueue_max_bytes;
+ MUTEX_UNLOCK(env, rep->mtx_repmgr);
+ ENV_LEAVE(env, ip);
+ } else {
+ *gbytesp = db_rep->inqueue_max_gbytes;
+ *bytesp = db_rep->inqueue_max_bytes;
+ }
+
+ return (0);
+}
+
+/*
+ * PUBLIC: void __repmgr_set_incoming_queue_redzone __P((void *, u_int32_t,
+ * PUBLIC: u_int32_t));
+ *
+ * Sets the lower bound of the repmgr incoming queue red zone.
+ * !!! Assumes caller holds mtx_repmgr lock.
+ *
+ * Note that we can't simply get the REP* address from the env as we usually do,
+ * because at the time of this call it may not have been linked into there yet.
+ * Also note that REP is not a public structure, so we use "void *" here.
+ */
+void __repmgr_set_incoming_queue_redzone(rep_, gbytes, bytes)
+ void *rep_;
+ u_int32_t gbytes, bytes;
+{
+ REP *rep;
+ double rdgbytes, rdbytes;
+
+ rep = rep_;
+
+ /*
+ * We use 'double' values to do the computation for precision, and
+ * to avoid overflow.
+ */
+ rdgbytes = gbytes * 1.00 * DB_REPMGR_INQUEUE_REDZONE_PERCENT / 100.00;
+ rdbytes = (rdgbytes - (u_int32_t)rdgbytes) * GIGABYTE;
+ rdbytes += bytes * 1.00 * DB_REPMGR_INQUEUE_REDZONE_PERCENT / 100.00;
+ if (rdbytes >= GIGABYTE) {
+ rdgbytes += 1;
+ rdbytes -= GIGABYTE;
+ }
+ rep->inqueue_rz_gbytes = (u_int32_t)rdgbytes;
+ rep->inqueue_rz_bytes = (u_int32_t)rdbytes;
+}
+
+/*
+ * PUBLIC: int __repmgr_get_incoming_queue_redzone __P((DB_ENV *,
+ * PUBLIC: u_int32_t *, u_int32_t *));
+ *
+ * Gets the lower bound of the repmgr incoming queue red zone.
+ * This method must be called after environment open.
+ */
+int __repmgr_get_incoming_queue_redzone(dbenv, gbytesp, bytesp)
+ DB_ENV *dbenv;
+ u_int32_t *gbytesp, *bytesp;
+{
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ REP *rep;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ ENV_REQUIRES_CONFIG(
+ env, db_rep->region, "__repmgr_get_incoming_queue_redzone",
+ DB_INIT_REP);
+
+ ENV_ENTER(env, ip);
+ MUTEX_LOCK(env, rep->mtx_repmgr);
+ *gbytesp = rep->inqueue_rz_gbytes;
+ *bytesp = rep->inqueue_rz_bytes;
+ MUTEX_UNLOCK(env, rep->mtx_repmgr);
+ ENV_LEAVE(env, ip);
+
+ return (0);
+}
+
+/*
+ * PUBLIC: int __repmgr_get_incoming_queue_fullevent __P((DB_ENV *,
+ * PUBLIC: int *));
+ *
+ * Return whether the DB_EVENT_REP_INQUEUE_FULL event firing is
+ * turned on or off.
+ * This method must be called after environment open.
+ */
+int __repmgr_get_incoming_queue_fullevent(dbenv, onoffp)
+ DB_ENV *dbenv;
+ int *onoffp;
+{
+ DB_REP *db_rep;
+ ENV *env;
+ REP *rep;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ ENV_REQUIRES_CONFIG(
+ env, db_rep->region,
+ "DB_ENV->__repmgr_get_incoming_queue_fullevent",
+ DB_INIT_REP);
+
+ *onoffp = rep->inqueue_full_event_on ? 1 : 0;
+
+ return (0);
+}
+
+/*
* PUBLIC: int __repmgr_env_create __P((ENV *, DB_REP *));
*/
int
@@ -837,7 +1242,13 @@ __repmgr_env_create(env, db_rep)
db_rep->connection_retry_wait = DB_REPMGR_DEFAULT_CONNECTION_RETRY;
db_rep->election_retry_wait = DB_REPMGR_DEFAULT_ELECTION_RETRY;
db_rep->config_nsites = 0;
+ ADJUST_AUTOTAKEOVER_WAITS(db_rep, DB_REPMGR_DEFAULT_ACK_TIMEOUT);
db_rep->perm_policy = DB_REPMGR_ACKS_QUORUM;
+ db_rep->inqueue_max_gbytes = 0;
+ db_rep->inqueue_max_bytes = 0;
+#ifdef HAVE_REPLICATION_LISTENER_TAKEOVER
+ FLD_SET(db_rep->config, REP_C_AUTOTAKEOVER);
+#endif
FLD_SET(db_rep->config, REP_C_ELECTIONS);
FLD_SET(db_rep->config, REP_C_2SITE_STRICT);
@@ -846,7 +1257,8 @@ __repmgr_env_create(env, db_rep)
TAILQ_INIT(&db_rep->connections);
TAILQ_INIT(&db_rep->retries);
- db_rep->input_queue.size = 0;
+ db_rep->input_queue.gbytes = 0;
+ db_rep->input_queue.bytes = 0;
STAILQ_INIT(&db_rep->input_queue.header);
__repmgr_env_create_pf(db_rep);
@@ -944,6 +1356,15 @@ __repmgr_await_threads(env)
* of a connector thread.
*/
+ /* Takeover thread. */
+ if (db_rep->takeover_thread != NULL) {
+ if ((t_ret = __repmgr_thread_join(db_rep->takeover_thread)) !=
+ 0 && ret == 0)
+ ret = t_ret;
+ __os_free(env, db_rep->takeover_thread);
+ db_rep->takeover_thread = NULL;
+ }
+
/* Message processing threads. */
for (i = 0;
i < db_rep->nthreads && db_rep->messengers[i] != NULL; i++) {
@@ -1178,7 +1599,7 @@ get_shared_netaddr(env, eid, netaddr)
MUTEX_LOCK(env, rep->mtx_repmgr);
if ((u_int)eid >= rep->site_cnt) {
- ret = DB_NOTFOUND;
+ ret = USR_ERR(env, DB_NOTFOUND);
goto err;
}
DB_ASSERT(env, rep->siteinfo_off != INVALID_ROFF);
@@ -1423,7 +1844,7 @@ send_msg_self(env, iovecs, nmsg)
u_int32_t nmsg;
{
REPMGR_MESSAGE *msg;
- size_t align, bodysize, structsize;
+ size_t align, bodysize, msgsize, structsize;
u_int8_t *membase;
int ret;
@@ -1431,10 +1852,12 @@ send_msg_self(env, iovecs, nmsg)
bodysize = iovecs->total_bytes - __REPMGR_MSG_HDR_SIZE;
structsize = (size_t)DB_ALIGN((size_t)(sizeof(REPMGR_MESSAGE) +
nmsg * sizeof(DBT)), align);
- if ((ret = __os_malloc(env, structsize + bodysize, &membase)) != 0)
+ msgsize = structsize + bodysize;
+ if ((ret = __os_malloc(env, msgsize, &membase)) != 0)
return (ret);
msg = (void*)membase;
+ msg->size = msgsize;
membase += structsize;
/*
@@ -1616,13 +2039,14 @@ __repmgr_send_request(db_channel, request, nrequest, response, timeout, flags)
}
ENV_ENTER(env, ip);
- ret = get_channel_connection(channel, &conn);
- ENV_LEAVE(env, ip);
- if (ret != 0)
- return (ret);
+ if ((ret = get_channel_connection(channel, &conn)) != 0)
+ goto out;
- if (conn == NULL)
- return (request_self(env, request, nrequest, response, flags));
+ /* If conn is NULL, call request_self and then we are done here. */
+ if (conn == NULL) {
+ ret = request_self(env, request, nrequest, response, flags);
+ goto out;
+ }
/* Find an available array slot, or grow the array if necessary. */
LOCK_MUTEX(db_rep->mutex);
@@ -1670,7 +2094,7 @@ __repmgr_send_request(db_channel, request, nrequest, response, timeout, flags)
LOCK_MUTEX(db_rep->mutex);
F_CLR(&conn->responses[i], RESP_IN_USE | RESP_THREAD_WAITING);
UNLOCK_MUTEX(db_rep->mutex);
- return (ret);
+ goto out;
}
timeout = timeout > 0 ? timeout : db_channel->timeout;
@@ -1688,7 +2112,7 @@ __repmgr_send_request(db_channel, request, nrequest, response, timeout, flags)
* to wake up those threads, with a COMPLETE indication and an
* error code. That's more than we want to tackle here.
*/
- return (ret);
+ goto out;
}
/*
@@ -1732,7 +2156,7 @@ __repmgr_send_request(db_channel, request, nrequest, response, timeout, flags)
sz = conn->iovecs.vectors[0].iov_len;
if ((ret = __os_malloc(env, sz, &dummy)) != 0)
- goto out;
+ goto out_unlck;
__repmgr_iovec_init(&conn->iovecs);
DB_INIT_DBT(resp->dbt, dummy, sz);
__repmgr_add_dbt(&conn->iovecs, &resp->dbt);
@@ -1740,8 +2164,9 @@ __repmgr_send_request(db_channel, request, nrequest, response, timeout, flags)
}
}
-out:
+out_unlck:
UNLOCK_MUTEX(db_rep->mutex);
+out: ENV_LEAVE(env, ip);
return (ret);
}
@@ -2168,6 +2593,7 @@ __repmgr_channel_close(dbchan, flags)
{
ENV *env;
DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
REPMGR_CONNECTION *conn;
CHANNEL *channel;
u_int32_t i;
@@ -2182,6 +2608,7 @@ __repmgr_channel_close(dbchan, flags)
* Disable connection(s) (if not already done due to an error having
* occurred previously); release our reference to conn struct(s).
*/
+ ENV_ENTER(env, ip);
LOCK_MUTEX(db_rep->mutex);
if (dbchan->eid >= 0) {
conn = channel->c.conn;
@@ -2218,6 +2645,7 @@ __repmgr_channel_close(dbchan, flags)
__os_free(env, channel);
__os_free(env, dbchan);
+ ENV_LEAVE(env, ip);
return (ret);
}
@@ -2369,29 +2797,26 @@ join_group_at_site(env, addrp)
repmgr_netaddr_t *addrp;
{
DB_REP *db_rep;
+ REP *rep;
REPMGR_CONNECTION *conn;
SITE_STRING_BUFFER addr_buf;
repmgr_netaddr_t addr, myaddr;
__repmgr_gm_fwd_args fwd;
__repmgr_site_info_args site_info;
+ __repmgr_v4site_info_args v4site_info;
u_int8_t *p, *response_buf, siteinfo_buf[MAX_MSG_BUF];
char host_buf[MAXHOSTNAMELEN + 1], *host;
u_int32_t gen, type;
- size_t len;
+ size_t host_len, msg_len, req_len;
int ret, t_ret;
db_rep = env->rep_handle;
+ rep = db_rep->region;
LOCK_MUTEX(db_rep->mutex);
myaddr = SITE_FROM_EID(db_rep->self_eid)->net_addr;
UNLOCK_MUTEX(db_rep->mutex);
- len = strlen(myaddr.host) + 1;
- DB_INIT_DBT(site_info.host, myaddr.host, len);
- site_info.port = myaddr.port;
- site_info.flags = 0;
- ret = __repmgr_site_info_marshal(env,
- &site_info, siteinfo_buf, sizeof(siteinfo_buf), &len);
- DB_ASSERT(env, ret == 0);
+ host_len = strlen(myaddr.host) + 1;
conn = NULL;
response_buf = NULL;
@@ -2399,14 +2824,35 @@ join_group_at_site(env, addrp)
RPRINT(env, (env, DB_VERB_REPMGR_MISC, "try join request to site %s",
__repmgr_format_addr_loc(addrp, addr_buf)));
retry:
- if ((ret = make_request_conn(env, addrp, &conn)) != 0)
+ if ((ret = __repmgr_make_request_conn(env, addrp, &conn)) != 0)
return (ret);
+ DB_ASSERT(env, conn->version > 0 && conn->version <= DB_REPMGR_VERSION);
+ if (conn->version < 5) {
+ DB_INIT_DBT(v4site_info.host, myaddr.host, host_len);
+ v4site_info.port = myaddr.port;
+ v4site_info.flags = 0;
+ ret = __repmgr_v4site_info_marshal(env,
+ &v4site_info, siteinfo_buf, sizeof(siteinfo_buf), &req_len);
+ } else {
+ DB_INIT_DBT(site_info.host, myaddr.host, host_len);
+ site_info.port = myaddr.port;
+ site_info.status = 0;
+ site_info.flags = 0;
+ if (IS_VIEW_SITE(env))
+ FLD_SET(site_info.flags, SITE_VIEW);
+ if (rep->priority > 0)
+ FLD_SET(site_info.flags, SITE_JOIN_ELECTABLE);
+ ret = __repmgr_site_info_marshal(env,
+ &site_info, siteinfo_buf, sizeof(siteinfo_buf), &req_len);
+ }
+ DB_ASSERT(env, ret == 0);
+ /* Preserve separate request length in case there is a retry. */
if ((ret = __repmgr_send_sync_msg(env, conn,
- REPMGR_JOIN_REQUEST, siteinfo_buf, (u_int32_t)len)) != 0)
+ REPMGR_JOIN_REQUEST, siteinfo_buf, (u_int32_t)req_len)) != 0)
goto err;
- if ((ret = read_own_msg(env,
- conn, &type, &response_buf, &len)) != 0)
+ if ((ret = __repmgr_read_own_msg(env,
+ conn, &type, &response_buf, &msg_len)) != 0)
goto err;
if (type == REPMGR_GM_FAILURE) {
@@ -2429,7 +2875,7 @@ retry:
goto err;
ret = __repmgr_gm_fwd_unmarshal(env, &fwd,
- response_buf, len, &p);
+ response_buf, msg_len, &p);
DB_ASSERT(env, ret == 0);
if (fwd.gen > gen) {
if (fwd.host.size > MAXHOSTNAMELEN + 1) {
@@ -2456,7 +2902,8 @@ retry:
}
}
if (type == REPMGR_JOIN_SUCCESS)
- ret = __repmgr_refresh_membership(env, response_buf, len);
+ ret = __repmgr_refresh_membership(env, response_buf, msg_len,
+ conn->version);
else
ret = DB_REP_UNAVAIL; /* Invalid response: protocol violation */
@@ -2476,129 +2923,6 @@ err:
}
/*
- * Reads a whole message, when we expect to get a REPMGR_OWN_MSG.
- */
-static int
-read_own_msg(env, conn, typep, bufp, lenp)
- ENV *env;
- REPMGR_CONNECTION *conn;
- u_int32_t *typep;
- u_int8_t **bufp;
- size_t *lenp;
-{
- __repmgr_msg_hdr_args msg_hdr;
- u_int8_t *buf;
- u_int32_t type;
- size_t size;
- int ret;
-
- __repmgr_reset_for_reading(conn);
- if ((ret = __repmgr_read_conn(conn)) != 0)
- goto err;
- ret = __repmgr_msg_hdr_unmarshal(env, &msg_hdr,
- conn->msg_hdr_buf, __REPMGR_MSG_HDR_SIZE, NULL);
- DB_ASSERT(env, ret == 0);
-
- if ((conn->msg_type = msg_hdr.type) != REPMGR_OWN_MSG) {
- ret = DB_REP_UNAVAIL; /* Protocol violation. */
- goto err;
- }
- type = REPMGR_OWN_MSG_TYPE(msg_hdr);
- if ((size = (size_t)REPMGR_OWN_BUF_SIZE(msg_hdr)) > 0) {
- conn->reading_phase = DATA_PHASE;
- __repmgr_iovec_init(&conn->iovecs);
-
- if ((ret = __os_malloc(env, size, &buf)) != 0)
- goto err;
- conn->input.rep_message = NULL;
-
- __repmgr_add_buffer(&conn->iovecs, buf, size);
- if ((ret = __repmgr_read_conn(conn)) != 0) {
- __os_free(env, buf);
- goto err;
- }
- *bufp = buf;
- }
-
- *typep = type;
- *lenp = size;
-
-err:
- return (ret);
-}
-
-static int
-make_request_conn(env, addr, connp)
- ENV *env;
- repmgr_netaddr_t *addr;
- REPMGR_CONNECTION **connp;
-{
- DBT vi;
- __repmgr_msg_hdr_args msg_hdr;
- __repmgr_version_confirmation_args conf;
- REPMGR_CONNECTION *conn;
- int alloc, ret, unused;
-
- alloc = FALSE;
- if ((ret = __repmgr_connect(env, addr, &conn, &unused)) != 0)
- return (ret);
- conn->type = APP_CONNECTION;
-
- /* Read a handshake msg, to get version confirmation and parameters. */
- if ((ret = __repmgr_read_conn(conn)) != 0)
- goto err;
- /*
- * We can only get here after having read the full 9 bytes that we
- * expect, so this can't fail.
- */
- DB_ASSERT(env, conn->reading_phase == SIZES_PHASE);
- ret = __repmgr_msg_hdr_unmarshal(env, &msg_hdr,
- conn->msg_hdr_buf, __REPMGR_MSG_HDR_SIZE, NULL);
- DB_ASSERT(env, ret == 0);
- __repmgr_iovec_init(&conn->iovecs);
- conn->reading_phase = DATA_PHASE;
-
- if ((ret = __repmgr_prepare_simple_input(env, conn, &msg_hdr)) != 0)
- goto err;
- alloc = TRUE;
-
- if ((ret = __repmgr_read_conn(conn)) != 0)
- goto err;
-
- /*
- * Analyze the handshake msg, and stash relevant info.
- */
- if ((ret = __repmgr_find_version_info(env, conn, &vi)) != 0)
- goto err;
- DB_ASSERT(env, vi.size > 0);
- if ((ret = __repmgr_version_confirmation_unmarshal(env,
- &conf, vi.data, vi.size, NULL)) != 0)
- goto err;
-
- if (conf.version < GM_MIN_VERSION) {
- ret = DB_REP_UNAVAIL;
- goto err;
- }
- conn->version = conf.version;
-
-err:
- if (alloc) {
- DB_ASSERT(env, conn->input.repmgr_msg.cntrl.size > 0);
- __os_free(env, conn->input.repmgr_msg.cntrl.data);
- DB_ASSERT(env, conn->input.repmgr_msg.rec.size > 0);
- __os_free(env, conn->input.repmgr_msg.rec.data);
- }
- __repmgr_reset_for_reading(conn);
- if (ret == 0)
- *connp = conn;
- else {
- (void)__repmgr_close_connection(env, conn);
- (void)__repmgr_destroy_conn(env, conn);
- }
- return (ret);
-}
-
-/*
* PUBLIC: int __repmgr_site __P((DB_ENV *,
* PUBLIC: const char *, u_int, DB_SITE **, u_int32_t));
*/
@@ -2640,9 +2964,9 @@ site_by_addr(env, host, port, sitep)
if ((ret = addr_chk(env, host, port)) != 0)
return (ret);
+ ENV_ENTER(env, ip);
if (REP_ON(env)) {
LOCK_MUTEX(db_rep->mutex);
- ENV_ENTER(env, ip);
locked = TRUE;
} else
locked = FALSE;
@@ -2654,10 +2978,9 @@ site_by_addr(env, host, port, sitep)
* we want the DB_SITE handle to point to; just like site_by_eid() does.
*/
host = site->net_addr.host;
- if (locked) {
- ENV_LEAVE(env, ip);
+ if (locked)
UNLOCK_MUTEX(db_rep->mutex);
- }
+ ENV_LEAVE(env, ip);
if (ret != 0)
return (ret);
@@ -2723,7 +3046,7 @@ init_dbsite(env, eid, host, port, sitep)
dbsite->get_address = __repmgr_get_site_address;
dbsite->get_config = __repmgr_get_config;
dbsite->get_eid = __repmgr_get_eid;
- dbsite->set_config = __repmgr_site_config;
+ dbsite->set_config = __repmgr_site_config_pp;
dbsite->remove = __repmgr_remove_site_pp;
dbsite->close = __repmgr_site_close;
@@ -2756,9 +3079,16 @@ __repmgr_get_eid(dbsite, eidp)
DB_SITE *dbsite;
int *eidp;
{
+ DB_THREAD_INFO *ip;
+ ENV *env;
int ret;
- if ((ret = refresh_site(dbsite)) != 0)
+ env = dbsite->env;
+
+ ENV_ENTER(env, ip);
+ ret = refresh_site(dbsite);
+ ENV_LEAVE(env, ip);
+ if (ret != 0)
return (ret);
if (F_ISSET(dbsite, DB_SITE_PREOPEN)) {
@@ -2791,8 +3121,11 @@ __repmgr_get_config(dbsite, which, valuep)
env = dbsite->env;
db_rep = env->rep_handle;
- if ((ret = refresh_site(dbsite)) != 0)
+ ENV_ENTER(env, ip);
+ if ((ret = refresh_site(dbsite)) != 0) {
+ ENV_LEAVE(env, ip);
return (ret);
+ }
LOCK_MUTEX(db_rep->mutex);
DB_ASSERT(env, IS_VALID_EID(dbsite->eid));
site = SITE_FROM_EID(dbsite->eid);
@@ -2800,32 +3133,52 @@ __repmgr_get_config(dbsite, which, valuep)
rep = db_rep->region;
infop = env->reginfo;
- ENV_ENTER(env, ip);
MUTEX_LOCK(env, rep->mtx_repmgr);
sites = R_ADDR(infop, rep->siteinfo_off);
site->config = sites[dbsite->eid].config;
MUTEX_UNLOCK(env, rep->mtx_repmgr);
- ENV_LEAVE(env, ip);
}
*valuep = FLD_ISSET(site->config, which) ? 1 : 0;
UNLOCK_MUTEX(db_rep->mutex);
+ ENV_LEAVE(env, ip);
return (0);
}
/*
- * PUBLIC: int __repmgr_site_config __P((DB_SITE *, u_int32_t, u_int32_t));
+ * PUBLIC: int __repmgr_site_config_pp __P((DB_SITE *, u_int32_t, u_int32_t));
*/
int
-__repmgr_site_config(dbsite, which, value)
+__repmgr_site_config_pp(dbsite, which, value)
DB_SITE *dbsite;
u_int32_t which;
u_int32_t value;
{
- DB_REP *db_rep;
DB_THREAD_INFO *ip;
ENV *env;
+ int ret;
+
+ env = dbsite->env;
+
+ ENV_ENTER(env, ip);
+ ret = __repmgr_site_config_int(dbsite, which, value);
+ ENV_LEAVE(env, ip);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_site_config_int __P((DB_SITE *, u_int32_t, u_int32_t));
+ */
+int
+__repmgr_site_config_int(dbsite, which, value)
+ DB_SITE *dbsite;
+ u_int32_t which;
+ u_int32_t value;
+{
+ DB_REP *db_rep;
+ ENV *env;
REGINFO *infop;
REP *rep;
REPMGR_SITE *site;
@@ -2875,7 +3228,6 @@ __repmgr_site_config(dbsite, which, value)
infop = env->reginfo;
LOCK_MUTEX(db_rep->mutex);
- ENV_ENTER(env, ip);
MUTEX_LOCK(env, rep->mtx_repmgr);
sites = R_ADDR(infop, rep->siteinfo_off);
site = SITE_FROM_EID(dbsite->eid);
@@ -2896,7 +3248,6 @@ __repmgr_site_config(dbsite, which, value)
rep->siteinfo_seq++;
}
MUTEX_UNLOCK(env, rep->mtx_repmgr);
- ENV_LEAVE(env, ip);
UNLOCK_MUTEX(db_rep->mutex);
} else {
site = SITE_FROM_EID(dbsite->eid);
@@ -2930,7 +3281,6 @@ set_local_site(dbsite, value)
if (REP_ON(env)) {
rep = db_rep->region;
LOCK_MUTEX(db_rep->mutex);
- ENV_ENTER(env, ip);
MUTEX_LOCK(env, rep->mtx_repmgr);
locked = TRUE;
/* Make sure we're in sync first. */
@@ -2941,31 +3291,32 @@ set_local_site(dbsite, value)
__db_errx(env, DB_STR("3666",
"A previously given local site may not be unset"));
ret = EINVAL;
- } else if (IS_VALID_EID(db_rep->self_eid) &&
- db_rep->self_eid != dbsite->eid) {
- __db_errx(env, DB_STR("3667",
- "A (different) local site has already been set"));
- ret = EINVAL;
- } else {
- DB_ASSERT(env, IS_VALID_EID(dbsite->eid));
- site = SITE_FROM_EID(dbsite->eid);
- if (FLD_ISSET(site->config,
- DB_BOOTSTRAP_HELPER | DB_REPMGR_PEER)) {
- __db_errx(env, DB_STR("3668",
- "Local site cannot have HELPER or PEER attributes"));
+ } else if (value) {
+ if (IS_VALID_EID(db_rep->self_eid) &&
+ db_rep->self_eid != dbsite->eid) {
+ __db_errx(env, DB_STR("3697",
+ "A (different) local site has already been set"));
ret = EINVAL;
+ } else {
+ DB_ASSERT(env, IS_VALID_EID(dbsite->eid));
+ site = SITE_FROM_EID(dbsite->eid);
+ if (FLD_ISSET(site->config,
+ DB_BOOTSTRAP_HELPER | DB_REPMGR_PEER)) {
+ __db_errx(env, DB_STR("3698",
+ "Local site cannot have HELPER or PEER attributes"));
+ ret = EINVAL;
+ }
}
}
- if (ret == 0) {
+ if (ret == 0 && value) {
db_rep->self_eid = dbsite->eid;
if (locked) {
- rep->self_eid = dbsite->eid;
+ rep->self_eid = db_rep->self_eid;
rep->siteinfo_seq++;
}
}
if (locked) {
MUTEX_UNLOCK(env, rep->mtx_repmgr);
- ENV_LEAVE(env, ip);
UNLOCK_MUTEX(db_rep->mutex);
}
return (ret);
@@ -2998,7 +3349,7 @@ refresh_site(dbsite)
}
static int
-__repmgr_remove_site_pp(dbsite)
+__repmgr_remove_and_close_site(dbsite)
DB_SITE *dbsite;
{
int ret, t_ret;
@@ -3011,6 +3362,23 @@ __repmgr_remove_site_pp(dbsite)
*/
if ((t_ret = __repmgr_site_close(dbsite)) != 0 && ret == 0)
ret = t_ret;
+
+ return (ret);
+}
+
+static int
+__repmgr_remove_site_pp(dbsite)
+ DB_SITE *dbsite;
+{
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ int ret;
+
+ env = dbsite->env;
+
+ ENV_ENTER(env, ip);
+ ret = __repmgr_remove_and_close_site(dbsite);
+ ENV_LEAVE(env, ip);
return (ret);
}
@@ -3024,6 +3392,7 @@ __repmgr_remove_site(dbsite)
REPMGR_CONNECTION *conn;
repmgr_netaddr_t addr;
__repmgr_site_info_args site_info;
+ __repmgr_v4site_info_args v4site_info;
u_int8_t *response_buf, siteinfo_buf[MAX_MSG_BUF];
size_t len;
u_int32_t type;
@@ -3046,23 +3415,33 @@ __repmgr_remove_site(dbsite)
DB_ASSERT(env, IS_VALID_EID(master));
addr = SITE_FROM_EID(master)->net_addr;
UNLOCK_MUTEX(db_rep->mutex);
-
len = strlen(dbsite->host) + 1;
- DB_INIT_DBT(site_info.host, dbsite->host, len);
- site_info.port = dbsite->port;
- site_info.flags = 0;
- ret = __repmgr_site_info_marshal(env,
- &site_info, siteinfo_buf, sizeof(siteinfo_buf), &len);
- DB_ASSERT(env, ret == 0);
conn = NULL;
response_buf = NULL;
- if ((ret = make_request_conn(env, &addr, &conn)) != 0)
+ if ((ret = __repmgr_make_request_conn(env, &addr, &conn)) != 0)
return (ret);
+ DB_ASSERT(env, conn->version > 0 && conn->version <= DB_REPMGR_VERSION);
+ if (conn->version < 5) {
+ DB_INIT_DBT(v4site_info.host, dbsite->host, len);
+ v4site_info.port = dbsite->port;
+ v4site_info.flags = 0;
+ ret = __repmgr_v4site_info_marshal(env,
+ &v4site_info, siteinfo_buf, sizeof(siteinfo_buf), &len);
+ } else {
+ DB_INIT_DBT(site_info.host, dbsite->host, len);
+ site_info.port = dbsite->port;
+ site_info.status = 0;
+ site_info.flags = 0;
+ ret = __repmgr_site_info_marshal(env,
+ &site_info, siteinfo_buf, sizeof(siteinfo_buf), &len);
+ }
+ DB_ASSERT(env, ret == 0);
+
if ((ret = __repmgr_send_sync_msg(env, conn,
REPMGR_REMOVE_REQUEST, siteinfo_buf, (u_int32_t)len)) != 0)
goto err;
- if ((ret = read_own_msg(env,
+ if ((ret = __repmgr_read_own_msg(env,
conn, &type, &response_buf, &len)) != 0)
goto err;
ret = type == REPMGR_REMOVE_SUCCESS ? 0 : DB_REP_UNAVAIL;
@@ -3090,3 +3469,82 @@ __repmgr_site_close(dbsite)
__os_free(dbsite->env, dbsite);
return (0);
}
+
+/*
+ * Demotes a participant site to a view. This is a one-way and one-time
+ * operation.
+ *
+ * The demotion occurs at the very end of repmgr_start() because it
+ * requires a select thread to perform the gmdb operations that remove
+ * the site from the replication group and immediately add the site back
+ * into the group as a view. The demotion also preserves any other threads
+ * created by repmgr_start() so that they are there to be used by the
+ * demoted site after it is re-added as a view site.
+ *
+ * We remove and re-add the site to propagate the site's change from
+ * participant to view to all sites in the replication group. This includes
+ * updates to each site's gmdb and in-memory site list.
+ */
+#define REPMGR_DEMOTION_MASTER_RETRIES 10
+#define REPMGR_DEMOTION_RETRY_USECS 500000
+static int
+__repmgr_demote_site(env, eid)
+ ENV *env;
+ int eid;
+{
+ DB_REP *db_rep;
+ DB_SITE *dbsite;
+ REP *rep;
+ REPMGR_SITE *site;
+ int ret, t_ret, tries;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ site = SITE_FROM_EID(eid);
+ dbsite = NULL;
+
+ /* Inform other repmgr threads that a demotion is in progress. */
+ db_rep->demotion_pending = TRUE;
+
+ if ((ret = init_dbsite(env, eid, site->net_addr.host,
+ site->net_addr.port, &dbsite)) != 0)
+ goto err;
+
+ /*
+ * We need a master to perform the gmdb updates. Poll periodically
+ * for a limited time to find one.
+ */
+ tries = 0;
+ while (rep->master_id == DB_EID_INVALID) {
+ __os_yield(env, 0, REPMGR_DEMOTION_RETRY_USECS);
+ if (++tries >= REPMGR_DEMOTION_MASTER_RETRIES) {
+ ret = DB_REP_UNAVAIL;
+ goto err;
+ }
+ }
+
+ /* Remove site from replication group. */
+ if ((ret = __repmgr_remove_site(dbsite)) != 0)
+ goto err;
+
+ /*
+ * Add site back into replication group as a view. This demotion is
+ * occurring because this site now has a view callback but its
+ * SITE_VIEW flag is not set. Now, __repmgr_join_group() will detect
+ * the view callback and set the SITE_VIEW flag before sending this
+ * site's information to the rest of the replication group.
+ */
+ if ((ret = __repmgr_join_group(env)) != 0)
+ goto err;
+
+err:
+ /* Deallocates dbsite. */
+ if (dbsite != NULL) {
+ t_ret = __repmgr_site_close(dbsite);
+ if (ret == 0 && t_ret != 0)
+ ret = t_ret;
+ }
+ /* Must reset demotion_pending before leaving this routine. */
+ db_rep->demotion_pending = FALSE;
+ return (ret);
+}
diff --git a/src/repmgr/repmgr_msg.c b/src/repmgr/repmgr_msg.c
index 13537823..71cb2ada 100644
--- a/src/repmgr/repmgr_msg.c
+++ b/src/repmgr/repmgr_msg.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -15,15 +15,19 @@
#include "dbinc_auto/repmgr_auto.h"
static int dispatch_app_message __P((ENV *, REPMGR_MESSAGE *));
-static int finish_gmdb_update __P((ENV *,
- DB_THREAD_INFO *, DBT *, u_int32_t, u_int32_t, __repmgr_member_args *));
+static int finish_gmdb_update __P((ENV *, DB_THREAD_INFO *,
+ DBT *, u_int32_t, u_int32_t, u_int32_t, __repmgr_member_args *));
static int incr_gm_version __P((ENV *, DB_THREAD_INFO *, DB_TXN *));
-static void marshal_site_data __P((ENV *, u_int32_t, u_int8_t *, DBT *));
+static void marshal_site_data __P((ENV *,
+ u_int32_t, u_int32_t, u_int8_t *, DBT *));
static void marshal_site_key __P((ENV *,
repmgr_netaddr_t *, u_int8_t *, DBT *, __repmgr_member_args *));
static int message_loop __P((ENV *, REPMGR_RUNNABLE *));
+static int preferred_master_takeover __P((ENV*));
static int process_message __P((ENV*, DBT*, DBT*, int));
static int reject_fwd __P((ENV *, REPMGR_CONNECTION *));
+static int rejoin_connections(ENV *);
+static int rejoin_deferred_election(ENV *);
static int rescind_pending __P((ENV *,
DB_THREAD_INFO *, int, u_int32_t, u_int32_t));
static int resolve_limbo_int __P((ENV *, DB_THREAD_INFO *));
@@ -33,9 +37,13 @@ static int send_permlsn_conn __P((ENV *,
REPMGR_CONNECTION *, u_int32_t, DB_LSN *));
static int serve_join_request __P((ENV *,
DB_THREAD_INFO *, REPMGR_MESSAGE *));
+static int serve_lsnhist_request __P((ENV *, DB_THREAD_INFO *,
+ REPMGR_MESSAGE *));
+static int serve_readonly_master_request __P((ENV *, REPMGR_MESSAGE *));
static int serve_remove_request __P((ENV *,
DB_THREAD_INFO *, REPMGR_MESSAGE *));
static int serve_repmgr_request __P((ENV *, REPMGR_MESSAGE *));
+static int serve_restart_client_request __P((ENV *, REPMGR_MESSAGE *));
/*
* Map one of the phase-1/provisional membership status values to its
@@ -72,6 +80,7 @@ message_loop(env, th)
REPMGR_RUNNABLE *th;
{
DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
REP *rep;
REPMGR_MESSAGE *msg;
REPMGR_CONNECTION *conn;
@@ -83,6 +92,7 @@ message_loop(env, th)
COMPQUIET(membership, 0);
db_rep = env->rep_handle;
rep = db_rep->region;
+ ENV_ENTER(env, ip);
LOCK_MUTEX(db_rep->mutex);
while ((ret = __repmgr_queue_get(env, &msg, th)) == 0) {
incremented = FALSE;
@@ -141,7 +151,21 @@ message_loop(env, th)
* detect it without the need for application
* activity.
*/
- ret = __rep_flush(env->dbenv);
+ ret = __rep_flush_int(env);
+ } else if (db_rep->prefmas_pending == master_switch &&
+ IS_PREFMAS_MODE(env) &&
+ FLD_ISSET(rep->config, REP_C_PREFMAS_MASTER) &&
+ F_ISSET(rep, REP_F_CLIENT)) {
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+"message_loop heartbeat preferred master switch"));
+ /*
+ * We are a preferred master site currently
+ * running as a client and we have finished
+ * syncing with the temporary master. It is
+ * now time to take over as master.
+ */
+ db_rep->prefmas_pending = no_action;
+ ret = preferred_master_takeover(env);
} else {
/*
* Use heartbeat message to initiate rerequest
@@ -162,6 +186,12 @@ message_loop(env, th)
db_rep->non_rep_th--;
if (ret != 0)
goto out;
+ if (db_rep->view_mismatch) {
+ __db_errx(env, DB_STR("3699",
+ "Site is not recorded as a view in the group membership database"));
+ ret = EINVAL;
+ goto out;
+ }
}
/*
* A return of DB_REP_UNAVAIL from __repmgr_queue_get() merely means we
@@ -171,6 +201,7 @@ message_loop(env, th)
ret = 0;
out:
UNLOCK_MUTEX(db_rep->mutex);
+ ENV_LEAVE(env, ip);
return (ret);
}
@@ -341,16 +372,45 @@ process_message(env, control, rec, eid)
break;
case DB_REP_DUPMASTER:
- /*
- * Initiate an election if we're configured to be using
- * elections, but only if we're *NOT* using leases. When using
- * leases, there is never any uncertainty over which site is the
- * rightful master, and only the loser gets the DUPMASTER return
- * code.
- */
- if ((ret = __repmgr_become_client(env)) == 0 &&
+ if (IS_PREFMAS_MODE(env) &&
+ FLD_ISSET(rep->config, REP_C_PREFMAS_MASTER)) {
+ /*
+ * The preferred master site must restart as a master
+ * so that it sends out a NEWMASTER to help the client
+ * sync. It must force a role change so that it
+ * advances its gen even though it is already master.
+ * This is needed if there was a temporary master at
+ * a higher gen that is now restarting as a client.
+ * A client won't process messages from a master at
+ * a lower gen than its own.
+ */
+ ret = __repmgr_repstart(env, DB_REP_MASTER,
+ REP_START_FORCE_ROLECHG);
+ } else if (IS_PREFMAS_MODE(env) &&
+ FLD_ISSET(rep->config, REP_C_PREFMAS_CLIENT) &&
+ (ret = __repmgr_become_client(env)) == 0) {
+ /*
+ * The preferred master client site must restart as
+ * client without any elections to enable the preferred
+ * master site to preserve its own transactions. It
+ * uses an election thread to repeatedly perform client
+ * startups so that it will perform its client sync
+ * when the preferred master's gen has caught up.
+ */
+ LOCK_MUTEX(db_rep->mutex);
+ ret = __repmgr_init_election(env,
+ ELECT_F_CLIENT_RESTART);
+ UNLOCK_MUTEX(db_rep->mutex);
+ } else if ((ret = __repmgr_become_client(env)) == 0 &&
FLD_ISSET(rep->config, REP_C_LEASE | REP_C_ELECTIONS)
== REP_C_ELECTIONS) {
+ /*
+ * Initiate an election if we're configured to be using
+ * elections, but only if we're *NOT* using leases.
+ * When using leases, there is never any uncertainty
+ * over which site is the rightful master, and only the
+ * loser gets the DUPMASTER return code.
+ */
LOCK_MUTEX(db_rep->mutex);
ret = __repmgr_init_election(env, ELECT_F_IMMED);
UNLOCK_MUTEX(db_rep->mutex);
@@ -406,6 +466,14 @@ DB_TEST_RECOVERY_LABEL
t_ret = __op_rep_exit(env);
if (ret == ENOENT)
ret = 0;
+ else if (ret == DB_DELETED && db_rep->demotion_pending)
+ /*
+ * If a demotion is in progress, we want to keep
+ * the repmgr threads instead of bowing out because
+ * they are needed when we rejoin the replication group
+ * immediately as a view.
+ */
+ ret = 0;
else if (ret == DB_DELETED)
ret = __repmgr_bow_out(env);
if (t_ret != 0 && ret == 0)
@@ -428,8 +496,10 @@ __repmgr_handle_event(env, event, info)
void *info;
{
DB_REP *db_rep;
+ REP *rep;
db_rep = env->rep_handle;
+ rep = db_rep->region;
if (db_rep->selector == NULL) {
/* Repmgr is not in use, so all events go to application. */
@@ -457,9 +527,46 @@ __repmgr_handle_event(env, event, info)
/* Application still needs to see this. */
break;
+ case DB_EVENT_REP_MASTER:
+ case DB_EVENT_REP_STARTUPDONE:
+ /*
+ * Detect a rare case where a dupmaster or incomplete gmdb
+ * operation has left the site's gmdb inconsistent with
+ * a view callback definition. The user would have correctly
+ * defined a view callback and called repmgr_start(), but the
+ * gmdb operation to update this site to a view would have been
+ * incomplete or rolled back. The site cannot operate in this
+ * inconsistent state, so set an indicator to cause a message
+ * thread to panic and terminate.
+ *
+ * The one exception is during a demotion to view, when
+ * this inconsistency is expected for a short time.
+ */
+ if (IS_VALID_EID(db_rep->self_eid) &&
+ PARTICIPANT_TO_VIEW(db_rep,
+ SITE_FROM_EID(db_rep->self_eid)) &&
+ !db_rep->demotion_pending)
+ db_rep->view_mismatch = TRUE;
+
+ /*
+ * In preferred master mode, when the preferred master site
+ * finishes synchronizing with the temporary master it must
+ * prepare to take over as master. This is detected by the
+ * next heartbeat in a message thread, where the takeover is
+ * actually performed.
+ */
+ if (event == DB_EVENT_REP_STARTUPDONE &&
+ IS_PREFMAS_MODE(env) &&
+ FLD_ISSET(rep->config, REP_C_PREFMAS_MASTER)) {
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "startupdone set preferred master switch"));
+ db_rep->prefmas_pending = master_switch;
+ }
+ break;
default:
break;
}
+ COMPQUIET(info, NULL);
return (DB_EVENT_NOT_HANDLED);
}
@@ -504,7 +611,7 @@ send_permlsn(env, generation, lsn)
*/
policy = site->ack_policy > 0 ?
site->ack_policy : rep->perm_policy;
- if (policy == DB_REPMGR_ACKS_NONE ||
+ if (IS_VIEW_SITE(env) || policy == DB_REPMGR_ACKS_NONE ||
(IS_PEER_POLICY(policy) && rep->priority == 0))
ack = FALSE;
else
@@ -614,26 +721,149 @@ send_permlsn_conn(env, conn, generation, lsn)
return (ret);
}
+/*
+ * Perform the steps on the preferred master site to take over again as
+ * preferred master from a temporary master.  This routine should only be
+ * called after the preferred master has restarted as a client and finished
+ * a client sync with the temporary master.
+ *
+ * This routine makes a best effort to wait until all temporary master
+ * transactions have been applied on this site before taking over.
+ *
+ * Returns 0 without doing anything when the environment is not in
+ * preferred master mode.  Otherwise returns the first error from making
+ * the other site a readonly master, from fetching the wait parameters or
+ * from restarting the other site as a client; failing those, the result
+ * of restarting this site as master.
+ */
+static int
+preferred_master_takeover(env)
+	ENV *env;
+{
+	DB_LOG *dblp;
+	DB_REP *db_rep;
+	LOG *lp;
+	REP *rep;
+	DB_LSN last_ready_lsn, ready_lsn, sync_lsn;
+	u_long usec;
+	u_int32_t gen, max_tries, tries;
+	int ret, synced;
+
+	dblp = env->lg_handle;
+	lp = dblp->reginfo.primary;
+	db_rep = env->rep_handle;
+	rep = db_rep->region;
+	gen = 0;
+	ZERO_LSN(sync_lsn);
+	ret = 0;
+
+	if (!IS_PREFMAS_MODE(env))
+		return (ret);
+
+	/*
+	 * Start by making the temporary master a readonly master so that we
+	 * can know when we have applied all of its transactions on this
+	 * site before taking over.
+	 *
+	 * NOTE(review): the literal 1 is presumably the EID of the other
+	 * site in the two-site group; confirm against the callee.
+	 */
+	if ((ret = __repmgr_make_site_readonly_master(env,
+	    1, &gen, &sync_lsn)) != 0)
+		return (ret);
+	DB_ASSERT(env, gen >= rep->gen);
+
+	/*
+	 * Make a best effort to wait until this site has all transactions
+	 * from the temporary master.  We want to preserve temporary master
+	 * transactions, but we can't wait forever.  If we exceed our wait,
+	 * we restart this site as preferred master anyway.  This may
+	 * sacrifice some temporary master transactions in order to preserve
+	 * repgroup write availability.
+	 *
+	 * We restart the number of tries each time we make progress in
+	 * transactions applied, until either we apply through sync_lsn or
+	 * we exceed max_tries without progress.
+	 */
+	if ((ret = __repmgr_prefmas_get_wait(env, &max_tries, &usec)) != 0)
+		return (ret);
+	tries = 0;
+	synced = 0;
+	ZERO_LSN(ready_lsn);
+	ZERO_LSN(last_ready_lsn);
+	while (!synced && tries < max_tries) {
+		__os_yield(env, 0, usec);
+		tries++;
+		/*
+		 * lp->ready_lsn is the next LSN we expect to receive,
+		 * which also indicates how much we've applied.  sync_lsn
+		 * is the lp->lsn (indicating the next log record expected)
+		 * from the other site.
+		 */
+		MUTEX_LOCK(env, rep->mtx_clientdb);
+		ready_lsn = lp->ready_lsn;
+		MUTEX_UNLOCK(env, rep->mtx_clientdb);
+		if (gen == rep->gen && LOG_COMPARE(&ready_lsn, &sync_lsn) >= 0)
+			synced = 1;
+		else if (LOG_COMPARE(&ready_lsn, &last_ready_lsn) > 0) {
+			/*
+			 * We are making progress, restart number of tries.
+			 * The comparison must be strict: an unchanged
+			 * ready_lsn is not progress, and counting it as
+			 * such would reset tries on every pass and make
+			 * the wait unbounded.
+			 */
+			last_ready_lsn = ready_lsn;
+			tries = 0;
+		}
+	}
+
+	/* Restart the remote readonly temporary master as a client. */
+	if ((ret = __repmgr_restart_site_as_client(env, 1)) != 0)
+		return (ret);
+
+	/*
+	 * Restart this site as the preferred master, waiting for
+	 * REP_LOCKOUT_MSG.  The NEWCLIENT message sent back from
+	 * restarting the other site as client can briefly lock
+	 * REP_LOCKOUT_MSG to do some cleanup.  We don't want this
+	 * to cause the rep_start_int() call to restart this site
+	 * as master to return 0 without doing anything.
+	 */
+	ret = __repmgr_become_master(env, REP_START_WAIT_LOCKMSG);
+	return (ret);
+}
+
static int
serve_repmgr_request(env, msg)
ENV *env;
REPMGR_MESSAGE *msg;
{
- DB_THREAD_INFO *ip;
+ DB_REP *db_rep;
DBT *dbt;
+ DB_THREAD_INFO *ip;
REPMGR_CONNECTION *conn;
+ u_int32_t mtype;
int ret, t_ret;
- ENV_ENTER(env, ip);
- switch (REPMGR_OWN_MSG_TYPE(msg->msg_hdr)) {
+ db_rep = env->rep_handle;
+ ENV_GET_THREAD_INFO(env, ip);
+ conn = msg->v.gmdb_msg.conn;
+ mtype = REPMGR_OWN_MSG_TYPE(msg->msg_hdr);
+ switch (mtype) {
case REPMGR_JOIN_REQUEST:
ret = serve_join_request(env, ip, msg);
break;
+ case REPMGR_LSNHIST_REQUEST:
+ ret = serve_lsnhist_request(env, ip, msg);
+ break;
+ case REPMGR_READONLY_MASTER:
+ ret = serve_readonly_master_request(env, msg);
+ break;
case REPMGR_REJOIN:
RPRINT(env, (env, DB_VERB_REPMGR_MISC,
"One try at rejoining group automatically"));
if ((ret = __repmgr_join_group(env)) == DB_REP_UNAVAIL)
ret = __repmgr_bow_out(env);
+ else if (ret == 0 && IS_PREFMAS_MODE(env)) {
+ /*
+ * For preferred master mode, we need to get
+ * a "regular" connection to the other site without
+ * calling an election prematurely here.
+ */
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "Establishing connections after rejoin"));
+ ret = rejoin_connections(env);
+ } else if (ret == 0 && db_rep->rejoin_pending) {
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "Calling deferred election after rejoin"));
+ ret = rejoin_deferred_election(env);
+ }
+ db_rep->rejoin_pending = FALSE;
break;
case REPMGR_REMOVE_REQUEST:
ret = serve_remove_request(env, ip, msg);
@@ -641,23 +871,32 @@ serve_repmgr_request(env, msg)
case REPMGR_RESOLVE_LIMBO:
ret = resolve_limbo_wrapper(env, ip);
break;
+ case REPMGR_RESTART_CLIENT:
+ ret = serve_restart_client_request(env, msg);
+ break;
case REPMGR_SHARING:
dbt = &msg->v.gmdb_msg.request;
- ret = __repmgr_refresh_membership(env, dbt->data, dbt->size);
+ ret = __repmgr_refresh_membership(env, dbt->data, dbt->size,
+ (conn == NULL ? DB_REPMGR_VERSION : conn->version));
break;
default:
ret = __db_unknown_path(env, "serve_repmgr_request");
break;
}
- if ((conn = msg->v.gmdb_msg.conn) != NULL) {
+ if (conn != NULL) {
+ /*
+ * A site that removed itself may have already closed its
+ * connections. Do not return an error and panic if we
+ * can't close the one-shot GMDB connection for a remove
+ * request here.
+ */
if ((t_ret = __repmgr_close_connection(env, conn)) != 0 &&
- ret == 0)
+ ret == 0 && mtype != REPMGR_REMOVE_REQUEST)
ret = t_ret;
if ((t_ret = __repmgr_decr_conn_ref(env, conn)) != 0 &&
ret == 0)
ret = t_ret;
}
- ENV_LEAVE(env, ip);
return (ret);
}
@@ -674,8 +913,10 @@ serve_join_request(env, ip, msg)
{
DB_REP *db_rep;
REPMGR_CONNECTION *conn;
+ REPMGR_SITE *site;
DBT *dbt;
__repmgr_site_info_args site_info;
+ __repmgr_v4site_info_args v4site_info;
u_int8_t *buf;
char *host;
size_t len;
@@ -686,9 +927,18 @@ serve_join_request(env, ip, msg)
COMPQUIET(status, 0);
conn = msg->v.gmdb_msg.conn;
+ DB_ASSERT(env, conn->version > 0 && conn->version <= DB_REPMGR_VERSION);
dbt = &msg->v.gmdb_msg.request;
- ret = __repmgr_site_info_unmarshal(env,
- &site_info, dbt->data, dbt->size, NULL);
+ if (conn->version < 5) {
+ ret = __repmgr_v4site_info_unmarshal(env,
+ &v4site_info, dbt->data, dbt->size, NULL);
+ site_info.host = v4site_info.host;
+ site_info.port = v4site_info.port;
+ site_info.status = v4site_info.flags;
+ site_info.flags = 0;
+ } else
+ ret = __repmgr_site_info_unmarshal(env,
+ &site_info, dbt->data, dbt->size, NULL);
host = site_info.host.data;
host[site_info.host.size - 1] = '\0';
@@ -703,7 +953,23 @@ serve_join_request(env, ip, msg)
LOCK_MUTEX(db_rep->mutex);
if ((ret = __repmgr_find_site(env, host, site_info.port, &eid)) == 0) {
DB_ASSERT(env, eid != db_rep->self_eid);
- status = SITE_FROM_EID(eid)->membership;
+ site = SITE_FROM_EID(eid);
+ status = site->membership;
+ /*
+ * Remote site electability is usually exchanged when
+ * a connection is established, but when a new site
+ * joins the repgroup there is a brief gap between the
+ * join and the connection. Record electability for
+ * the joining site so that we are not overly conservative
+ * about the number of acks we require for a PERM
+ * transaction if the joining site is unelectable.
+ */
+ if (FLD_ISSET(site_info.flags, SITE_JOIN_ELECTABLE)) {
+ F_SET(site, SITE_ELECTABLE);
+ FLD_CLR(site_info.flags, SITE_JOIN_ELECTABLE);
+ } else
+ F_CLR(site, SITE_ELECTABLE);
+ F_SET(site, SITE_HAS_PRIO);
}
UNLOCK_MUTEX(db_rep->mutex);
if (ret != 0)
@@ -712,7 +978,8 @@ serve_join_request(env, ip, msg)
switch (status) {
case 0:
case SITE_ADDING:
- ret = __repmgr_update_membership(env, ip, eid, SITE_ADDING);
+ ret = __repmgr_update_membership(env, ip, eid, SITE_ADDING,
+ site_info.flags);
break;
case SITE_PRESENT:
/* Already in desired state. */
@@ -729,7 +996,7 @@ serve_join_request(env, ip, msg)
goto err;
LOCK_MUTEX(db_rep->mutex);
- ret = __repmgr_marshal_member_list(env, &buf, &len);
+ ret = __repmgr_marshal_member_list(env, conn->version, &buf, &len);
UNLOCK_MUTEX(db_rep->mutex);
if (ret != 0)
goto err;
@@ -760,6 +1027,7 @@ serve_remove_request(env, ip, msg)
REPMGR_SITE *site;
DBT *dbt;
__repmgr_site_info_args site_info;
+ __repmgr_v4site_info_args v4site_info;
char *host;
u_int32_t status, type;
int eid, ret, t_ret;
@@ -768,9 +1036,18 @@ serve_remove_request(env, ip, msg)
db_rep = env->rep_handle;
conn = msg->v.gmdb_msg.conn;
+ DB_ASSERT(env, conn->version > 0 && conn->version <= DB_REPMGR_VERSION);
dbt = &msg->v.gmdb_msg.request;
- ret = __repmgr_site_info_unmarshal(env,
- &site_info, dbt->data, dbt->size, NULL);
+ if (conn->version < 5) {
+ ret = __repmgr_v4site_info_unmarshal(env,
+ &v4site_info, dbt->data, dbt->size, NULL);
+ site_info.host = v4site_info.host;
+ site_info.port = v4site_info.port;
+ site_info.status = v4site_info.flags;
+ site_info.flags = 0;
+ } else
+ ret = __repmgr_site_info_unmarshal(env,
+ &site_info, dbt->data, dbt->size, NULL);
host = site_info.host.data;
host[site_info.host.size - 1] = '\0';
@@ -810,7 +1087,8 @@ serve_remove_request(env, ip, msg)
break;
case SITE_PRESENT:
case SITE_DELETING:
- ret = __repmgr_update_membership(env, ip, eid, SITE_DELETING);
+ ret = __repmgr_update_membership(env, ip, eid, SITE_DELETING,
+ site_info.flags);
break;
default:
ret = __db_unknown_path(env, "serve_remove_request");
@@ -829,7 +1107,175 @@ err:
default:
return (ret);
}
- return (__repmgr_send_sync_msg(env, conn, type, NULL, 0));
+ /*
+ * It is possible when a site removes itself that by now it has
+ * already acted on the first GMDB update and closed its connections.
+ * Do not return an error and panic if we can't send the final
+ * status of the remove operation.
+ */
+ if ((ret = __repmgr_send_sync_msg(env, conn, type, NULL, 0)) != 0)
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "Problem sending remove site status message %d", ret));
+ return (0);
+}
+
+/*
+ * Serve the REPMGR_RESTART_CLIENT message by restarting this site as a
+ * client if it is not already a client.  Always sends back a
+ * REPMGR_PREFMAS_SUCCESS message with an empty payload.
+ *
+ * Returns the error from becoming a client if there was one, otherwise
+ * the error (if any) from sending the response.
+ */
+static int
+serve_restart_client_request(env, msg)
+	ENV *env;
+	REPMGR_MESSAGE *msg;
+{
+	DB_REP *db_rep;
+	REP *rep;
+	REPMGR_CONNECTION *conn;
+	int ret, t_ret;
+
+	db_rep = env->rep_handle;
+	rep = db_rep->region;
+	ret = 0;
+
+	RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+	    "Serving restart_client request"));
+	conn = msg->v.gmdb_msg.conn;
+	DB_ASSERT(env, conn->version > 0 && conn->version <= DB_REPMGR_VERSION);
+	/* No need to read payload - it is just a dummy byte. */
+
+	if (IS_PREFMAS_MODE(env) && !F_ISSET(rep, REP_F_CLIENT))
+		ret = __repmgr_become_client(env);
+
+	/* Report the send failure itself (t_ret), not the earlier result. */
+	if ((t_ret = __repmgr_send_sync_msg(env, conn,
+	    REPMGR_PREFMAS_SUCCESS, NULL, 0)) != 0)
+		RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+		    "Problem sending restart client success message %d",
+		    t_ret));
+
+	if (ret == 0 && t_ret != 0)
+		ret = t_ret;
+	RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+	    "Request for restart_client returning %d", ret));
+	return (ret);
+}
+
+/*
+ * Serve the REPMGR_READONLY_MASTER message by turning this site into a
+ * readonly master.  Always sends back a REPMGR_READONLY_RESPONSE message with
+ * a payload containing this site's gen and next LSN expected.  If this site
+ * is not in preferred master mode, the gen is 0 and the next LSN is [0,0].
+ *
+ * Returns the error from becoming a readonly master if there was one,
+ * otherwise the error (if any) from sending the response.
+ */
+static int
+serve_readonly_master_request(env, msg)
+	ENV *env;
+	REPMGR_MESSAGE *msg;
+{
+	REPMGR_CONNECTION *conn;
+	__repmgr_permlsn_args permlsn;
+	u_int8_t buf[__REPMGR_PERMLSN_SIZE];
+	int ret, t_ret;
+
+	ret = 0;
+	/*
+	 * Start from the documented "error" payload so that the response is
+	 * well defined even when the readonly master call below is skipped.
+	 */
+	permlsn.generation = 0;
+	ZERO_LSN(permlsn.lsn);
+
+	RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+	    "Serving readonly_master request"));
+	conn = msg->v.gmdb_msg.conn;
+	DB_ASSERT(env, conn->version > 0 && conn->version <= DB_REPMGR_VERSION);
+	/* No need to read payload - it is just a dummy byte. */
+
+	if (IS_PREFMAS_MODE(env))
+		ret = __rep_become_readonly_master(env,
+		    &permlsn.generation, &permlsn.lsn);
+
+	__repmgr_permlsn_marshal(env, &permlsn, buf);
+	/* Report the send failure itself (t_ret), not the earlier result. */
+	if ((t_ret = __repmgr_send_sync_msg(env, conn,
+	    REPMGR_READONLY_RESPONSE, buf, __REPMGR_PERMLSN_SIZE)) != 0)
+		RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+		    "Problem sending readonly response message %d", t_ret));
+	if (ret == 0 && t_ret != 0)
+		ret = t_ret;
+	RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+	    "Request for readonly_master returning %d", ret));
+	return (ret);
+}
+
+/*
+ * Serve the REPMGR_LSNHIST_REQUEST message by retrieving information from
+ * this site's LSN history database for the requested gen.  If the requested
+ * gen exists at this site, sends back a REPMGR_LSNHIST_RESPONSE message
+ * containing the LSN and timestamp at the requested gen and the LSN for the
+ * next gen if that gen exists (next gen LSN is [0,0] if next gen doesn't
+ * yet exist at this site.)  Sends back a PREFMAS_FAILURE message if the
+ * lookup of the requested gen fails for any reason, including the gen not
+ * yet existing at this site.
+ *
+ * No response is sent when the request payload cannot be unmarshalled
+ * (the unmarshal error is returned) or when the key's format version does
+ * not match REP_LSN_HISTORY_FMT_VERSION (0 is returned).
+ *
+ * A DB_NOTFOUND lookup result is not treated as an error by this routine;
+ * otherwise the lookup error, or failing that the send error, is returned.
+ */
+static int
+serve_lsnhist_request(env, ip, msg)
+	ENV *env;
+	DB_THREAD_INFO *ip;
+	REPMGR_MESSAGE *msg;
+{
+	REPMGR_CONNECTION *conn;
+	DBT *dbt;
+	__repmgr_lsnhist_match_args lsnhist_match;
+	__rep_lsn_hist_data_args lsnhist_data, next_lsnhist_data;
+	__rep_lsn_hist_key_args key;
+	u_int8_t match_buf[__REPMGR_LSNHIST_MATCH_SIZE];
+	DB_LSN next_gen_lsn;
+	int ret, t_ret;
+
+	RPRINT(env, (env, DB_VERB_REPMGR_MISC, "Serving lsnhist request"));
+	conn = msg->v.gmdb_msg.conn;
+	DB_ASSERT(env, conn->version > 0 && conn->version <= DB_REPMGR_VERSION);
+	/* Read lsn_hist_key incoming payload to get gen being requested. */
+	dbt = &msg->v.gmdb_msg.request;
+	if ((ret = __rep_lsn_hist_key_unmarshal(env,
+	    &key, dbt->data, dbt->size, NULL)) != 0)
+		return (ret);
+	if (key.version != REP_LSN_HISTORY_FMT_VERSION) {
+		RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+		    "serve_lsnhist_request version mismatch"));
+		return (0);
+	}
+
+	/*
+	 * There's no need to retry if we don't find an lsnhist record for
+	 * requested gen.  This site is either a temporary master or a client,
+	 * which means that if it doesn't already have an lsnhist record at
+	 * this gen, it is highly unlikely to get one in the near future.
+	 */
+	if ((ret = __rep_get_lsnhist_data(env,
+	    ip, key.gen, &lsnhist_data)) == 0) {
+
+		/* A failed lookup of the next gen is reported as [0,0]. */
+		if ((t_ret = __rep_get_lsnhist_data(env,
+		    ip, key.gen + 1, &next_lsnhist_data)) == 0)
+			next_gen_lsn = next_lsnhist_data.lsn;
+		else
+			ZERO_LSN(next_gen_lsn);
+
+		lsnhist_match.lsn = lsnhist_data.lsn;
+		lsnhist_match.hist_sec = lsnhist_data.hist_sec;
+		lsnhist_match.hist_nsec = lsnhist_data.hist_nsec;
+		lsnhist_match.next_gen_lsn = next_gen_lsn;
+		__repmgr_lsnhist_match_marshal(env, &lsnhist_match, match_buf);
+		/* Log the send failure itself (t_ret); ret is 0 here. */
+		if ((t_ret = __repmgr_send_sync_msg(env, conn,
+		    REPMGR_LSNHIST_RESPONSE, match_buf,
+		    __REPMGR_LSNHIST_MATCH_SIZE)) != 0)
+			RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+			    "Problem sending lsnhist response message %d",
+			    t_ret));
+	} else if ((t_ret = __repmgr_send_sync_msg(env, conn,
+	    REPMGR_PREFMAS_FAILURE, NULL, 0)) != 0)
+		/* Log the send failure (t_ret), not the lookup error. */
+		RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+		    "Problem sending prefmas failure message %d", t_ret));
+
+	/* Do not return an error if LSN history record not found. */
+	if (ret == DB_NOTFOUND)
+		ret = 0;
+	if (ret == 0 && t_ret != 0)
+		ret = t_ret;
+	RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+	    "Request for lsnhist returning %d", ret));
+	return (ret);
}
/*
@@ -917,7 +1363,13 @@ resolve_limbo_int(env, ip)
if (orig_status == SITE_PRESENT || orig_status == 0)
goto out;
- if (IS_ZERO_LSN(db_rep->limbo_failure))
+ /*
+ * It is possible after an autotakeover on a master to have no
+ * limbo_failure LSN but to have a limbo_victim that was found
+ * in the gmdb that still needs to be resolved.
+ */
+ if (IS_ZERO_LSN(db_rep->limbo_failure) &&
+ !db_rep->limbo_resolution_needed)
goto out;
/*
@@ -947,7 +1399,8 @@ resolve_limbo_int(env, ip)
ip, NULL, &txn, DB_IGNORE_LEASE)) != 0)
goto out;
- marshal_site_data(env, orig_status, data_buf, &data_dbt);
+ marshal_site_data(env,
+ orig_status, site->gmdb_flags, data_buf, &data_dbt);
ret = __db_put(db_rep->gmdb, ip, txn, &key_dbt, &data_dbt, 0);
if ((t_ret = __db_txn_auto_resolve(env, txn, 0, ret)) != 0 &&
@@ -980,15 +1433,15 @@ resolve_limbo_int(env, ip)
UNLOCK_MUTEX(db_rep->mutex);
locked = FALSE;
status = NEXT_STATUS(orig_status);
- if ((ret = finish_gmdb_update(env,
- ip, &key_dbt, orig_status, status, &logrec)) != 0)
+ if ((ret = finish_gmdb_update(env, ip,
+ &key_dbt, orig_status, status, site->gmdb_flags, &logrec)) != 0)
goto out;
/* Track modified membership status in our in-memory sites array. */
LOCK_MUTEX(db_rep->mutex);
locked = TRUE;
if ((ret = __repmgr_set_membership(env,
- addr.host, addr.port, status)) != 0)
+ addr.host, addr.port, status, site->gmdb_flags)) != 0)
goto out;
__repmgr_set_sites(env);
@@ -1005,14 +1458,15 @@ out:
* status is inferred (ADDING -> PRESENT, or DELETING -> 0).
*
* PUBLIC: int __repmgr_update_membership __P((ENV *,
- * PUBLIC: DB_THREAD_INFO *, int, u_int32_t));
+ * PUBLIC: DB_THREAD_INFO *, int, u_int32_t, u_int32_t));
*/
int
-__repmgr_update_membership(env, ip, eid, pstatus)
+__repmgr_update_membership(env, ip, eid, pstatus, site_flags)
ENV *env;
DB_THREAD_INFO *ip;
int eid;
u_int32_t pstatus; /* Provisional status. */
+ u_int32_t site_flags;
{
DB_REP *db_rep;
REPMGR_SITE *site;
@@ -1092,7 +1546,7 @@ retry:
* those seem even more confusing.
*/
if ((ret = __repmgr_set_membership(env,
- addr.host, addr.port, pstatus)) != 0)
+ addr.host, addr.port, pstatus, site_flags)) != 0)
goto err;
__repmgr_set_sites(env);
@@ -1108,7 +1562,7 @@ retry:
if ((ret = __txn_begin(env, ip, NULL, &txn, DB_IGNORE_LEASE)) != 0)
goto err;
marshal_site_key(env, &addr, key_buf, &key_dbt, &logrec);
- marshal_site_data(env, pstatus, status_buf, &data_dbt);
+ marshal_site_data(env, pstatus, site_flags, status_buf, &data_dbt);
if ((ret = __db_put(db_rep->gmdb,
ip, txn, &key_dbt, &data_dbt, 0)) != 0)
goto err;
@@ -1152,13 +1606,14 @@ retry:
locked = FALSE;
if ((ret = finish_gmdb_update(env, ip,
- &key_dbt, pstatus, ult_status, &logrec)) != 0)
+ &key_dbt, pstatus, ult_status, site_flags, &logrec)) != 0)
goto err;
/* Track modified membership status in our in-memory sites array. */
LOCK_MUTEX(db_rep->mutex);
locked = TRUE;
- ret = __repmgr_set_membership(env, addr.host, addr.port, ult_status);
+ ret = __repmgr_set_membership(env, addr.host, addr.port,
+ ult_status, site_flags);
__repmgr_set_sites(env);
err:
@@ -1173,7 +1628,7 @@ err:
* that we keep in sync.
*/
(void)__repmgr_set_membership(env,
- addr.host, addr.port, orig_status);
+ addr.host, addr.port, orig_status, site_flags);
}
if ((t_ret = __repmgr_cleanup_gmdb_op(env, do_close)) != 0 &&
ret == 0)
@@ -1215,13 +1670,14 @@ retry:
UNLOCK_MUTEX(db_rep->mutex);
marshal_site_key(env, &addr, key_buf, &key_dbt, &logrec);
- if ((ret = finish_gmdb_update(env,
- ip, &key_dbt, cur_status, new_status, &logrec)) != 0)
+ if ((ret = finish_gmdb_update(env, ip,
+ &key_dbt, cur_status, new_status, site->gmdb_flags, &logrec)) != 0)
goto err;
/* Track modified membership status in our in-memory sites array. */
LOCK_MUTEX(db_rep->mutex);
- ret = __repmgr_set_membership(env, addr.host, addr.port, new_status);
+ ret = __repmgr_set_membership(env, addr.host, addr.port,
+ new_status, site->gmdb_flags);
__repmgr_set_sites(env);
UNLOCK_MUTEX(db_rep->mutex);
@@ -1301,11 +1757,11 @@ __repmgr_set_gm_version(env, ip, txn, version)
* really deleted.
*/
static int
-finish_gmdb_update(env, ip, key_dbt, prev_status, status, logrec)
+finish_gmdb_update(env, ip, key_dbt, prev_status, status, flags, logrec)
ENV *env;
DB_THREAD_INFO *ip;
DBT *key_dbt;
- u_int32_t prev_status, status;
+ u_int32_t prev_status, status, flags;
__repmgr_member_args *logrec;
{
DB_REP *db_rep;
@@ -1324,7 +1780,7 @@ finish_gmdb_update(env, ip, key_dbt, prev_status, status, logrec)
if (status == 0)
ret = __db_del(db_rep->gmdb, ip, txn, key_dbt, 0);
else {
- marshal_site_data(env, status, data_buf, &data_dbt);
+ marshal_site_data(env, status, flags, data_buf, &data_dbt);
ret = __db_put(db_rep->gmdb, ip, txn, key_dbt, &data_dbt, 0);
}
if (ret != 0)
@@ -1617,16 +2073,18 @@ marshal_site_key(env, addr, buf, dbt, logrec)
}
static void
-marshal_site_data(env, status, buf, dbt)
+marshal_site_data(env, status, flags, buf, dbt)
ENV *env;
u_int32_t status;
+ u_int32_t flags;
u_int8_t *buf;
DBT *dbt;
{
- __repmgr_membership_data_args member_status;
+ __repmgr_membership_data_args member_data;
- member_status.flags = status;
- __repmgr_membership_data_marshal(env, &member_status, buf);
+ member_data.status = status;
+ member_data.flags = flags;
+ __repmgr_membership_data_marshal(env, &member_data, buf);
DB_INIT_DBT(*dbt, buf, __REPMGR_MEMBERSHIP_DATA_SIZE);
}
@@ -1640,16 +2098,107 @@ __repmgr_set_sites(env)
ENV *env;
{
DB_REP *db_rep;
+ REP *rep;
int ret;
u_int32_t n;
u_int i;
db_rep = env->rep_handle;
+ rep = db_rep->region;
for (i = 0, n = 0; i < db_rep->site_cnt; i++) {
- if (db_rep->sites[i].membership > 0)
+ /*
+ * Views do not count towards nsites because they cannot
+ * vote in elections, become master or contribute to
+ * durability.
+ */
+ if (db_rep->sites[i].membership > 0 &&
+ !FLD_ISSET(db_rep->sites[i].gmdb_flags, SITE_VIEW))
n++;
}
ret = __rep_set_nsites_int(env, n);
DB_ASSERT(env, ret == 0);
+ if (FLD_ISSET(rep->config,
+ REP_C_PREFMAS_MASTER | REP_C_PREFMAS_CLIENT) &&
+ rep->config_nsites > 2)
+ __db_errx(env, DB_STR("3701",
+ "More than two sites in preferred master replication group"));
+}
+
+/*
+ * If a site is rejoining a 2-site repgroup with 2SITE_STRICT off
+ * and has a rejection because it needs to catch up with the latest
+ * group membership database, it cannot call an election right away
+ * because it would win with only its own vote and ignore an existing
+ * master in the repgroup.  Instead, this routine is used to call the
+ * deferred election after the site has rejoined the repgroup successfully.
+ *
+ * Returns the first error from scheduling a connection attempt, in which
+ * case no election is started; otherwise the result of starting the
+ * election.
+ */
+static int
+rejoin_deferred_election(env)
+	ENV *env;
+{
+	DB_REP *db_rep;
+	u_int32_t flags;
+	int eid, ret;
+
+	db_rep = env->rep_handle;
+	ret = 0;
+	LOCK_MUTEX(db_rep->mutex);
+
+	/*
+	 * First, retry all connections so that the election can communicate
+	 * with the other sites.  Normally there should only be one other
+	 * site in the repgroup, but it is safest to retry all remote sites
+	 * found in case the group membership changed while we were gone.
+	 *
+	 * A scheduling failure is returned to the caller rather than being
+	 * overwritten by the election result below.
+	 */
+	FOR_EACH_REMOTE_SITE_INDEX(eid) {
+		if ((ret =
+		    __repmgr_schedule_connection_attempt(env, eid, TRUE)) != 0)
+			goto out;
+	}
+
+	/*
+	 * Call an immediate, but not a fast, election because a fast
+	 * election reduces the number of votes needed by 1.
+	 */
+	flags = ELECT_F_EVENT_NOTIFY;
+	if (FLD_ISSET(db_rep->region->config, REP_C_ELECTIONS))
+		LF_SET(ELECT_F_IMMED);
+	else
+		RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+		    "Deferred rejoin election, but no elections"));
+	ret = __repmgr_init_election(env, flags);
+
+out:	UNLOCK_MUTEX(db_rep->mutex);
+	return (ret);
+}
+/*
+ * Used when a site rejoining a preferred master replication group was
+ * rejected because it had to catch up with the latest group membership
+ * database.  Such a site still needs its "regular" connection to the other
+ * site before it can continue through the preferred master startup
+ * sequence, so schedule a connection attempt for every remote site.
+ *
+ * Returns 0 on success, or the first error from scheduling an attempt.
+ */
+static int
+rejoin_connections(env)
+	ENV *env;
+{
+	DB_REP *db_rep;
+	int eid, ret;
+
+	ret = 0;
+	db_rep = env->rep_handle;
+
+	LOCK_MUTEX(db_rep->mutex);
+	/*
+	 * Only one other site is expected in the repgroup, but walk every
+	 * remote site in case the membership changed while we were away.
+	 * Stop at the first failure.
+	 */
+	FOR_EACH_REMOTE_SITE_INDEX(eid) {
+		ret = __repmgr_schedule_connection_attempt(env, eid, TRUE);
+		if (ret != 0)
+			break;
+	}
+	UNLOCK_MUTEX(db_rep->mutex);
+
+	return (ret);
}
diff --git a/src/repmgr/repmgr_net.c b/src/repmgr/repmgr_net.c
index 54e3d066..334fd150 100644
--- a/src/repmgr/repmgr_net.c
+++ b/src/repmgr/repmgr_net.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -57,6 +57,7 @@ struct sending_msg {
* whether the PERM message should be considered durable.
*/
struct repmgr_permanence {
+ u_int32_t gen; /* Master generation for LSN. */
DB_LSN lsn; /* LSN whose ack this thread is waiting for. */
u_int threshold; /* Number of client acks to wait for. */
u_int quorum; /* Durability threshold for QUORUM policy. */
@@ -378,7 +379,7 @@ __repmgr_send(dbenv, control, rec, lsnp, eid, flags)
goto out;
#undef SEND_ONE_CONNECTION
- nsites_sent = 1;
+ nsites_sent = FLD_ISSET(site->gmdb_flags, SITE_VIEW) ? 0 : 1;
npeers_sent = F_ISSET(site, SITE_ELECTABLE) ? 1 : 0;
missed_peer = FALSE;
}
@@ -418,7 +419,13 @@ __repmgr_send(dbenv, control, rec, lsnp, eid, flags)
nclients = 0;
else if ((policy == DB_REPMGR_ACKS_ONE ||
policy == DB_REPMGR_ACKS_ONE_PEER) &&
- nclients == 1) {
+ nclients < 2) {
+ /*
+ * Adjust to QUORUM when first other
+ * participant joins (nclients=1) or when there
+ * are no other participants but a view joins
+ * (nclients=0) to get enough acks.
+ */
nclients = 0;
policy = DB_REPMGR_ACKS_QUORUM;
}
@@ -498,9 +505,16 @@ __repmgr_send(dbenv, control, rec, lsnp, eid, flags)
if (nclients > 1 ||
FLD_ISSET(db_rep->region->config,
REP_C_2SITE_STRICT) ||
- db_rep->active_gmdb_update == gmdb_primary)
+ db_rep->active_gmdb_update == gmdb_primary) {
quorum = nclients / 2;
- else
+ /*
+ * An unelectable master can't be part of the
+ * QUORUM policy quorum.
+ */
+ if (rep->priority == 0 &&
+ policy == DB_REPMGR_ACKS_QUORUM)
+ quorum++;
+ } else
quorum = nclients;
if (policy == DB_REPMGR_ACKS_ALL_AVAILABLE) {
@@ -560,6 +574,7 @@ __repmgr_send(dbenv, control, rec, lsnp, eid, flags)
/* In ALL_PEERS case, display of "needed" might be confusing. */
VPRINT(env, (env, DB_VERB_REPMGR_MISC,
"will await acknowledgement: need %u", needed));
+ perm.gen = rep->gen;
perm.lsn = *lsnp;
perm.threshold = needed;
perm.policy = policy;
@@ -734,8 +749,13 @@ __repmgr_send_broadcast(env, type, control, rec, nsitesp, npeersp, missingp)
* useful to keep letting a removed site see updates so that it
* learns of its own removal, and will know to rejoin at its
* next reboot.
+ *
+ * We never count sends to views because views cannot
+ * contribute to durability, but we always do the sends.
*/
- if (site->membership == SITE_PRESENT)
+ if (FLD_ISSET(site->gmdb_flags, SITE_VIEW))
+ full_member = FALSE;
+ else if (site->membership == SITE_PRESENT)
full_member = TRUE;
else {
full_member = FALSE;
@@ -802,7 +822,9 @@ send_connection(env, type, conn, msg, sent)
REPMGR_MAX_V1_MSG_TYPE,
REPMGR_MAX_V2_MSG_TYPE,
REPMGR_MAX_V3_MSG_TYPE,
- REPMGR_MAX_V4_MSG_TYPE
+ REPMGR_MAX_V4_MSG_TYPE,
+ REPMGR_MAX_V5_MSG_TYPE,
+ REPMGR_MAX_V6_MSG_TYPE
};
db_rep = env->rep_handle;
@@ -1132,18 +1154,24 @@ got_acks(env, context)
has_unacked_peer = FALSE;
FOR_EACH_REMOTE_SITE_INDEX(eid) {
site = SITE_FROM_EID(eid);
- if (site->membership != SITE_PRESENT)
+ /*
+ * Do not count an ack from a view because a view cannot
+ * contribute to durability.
+ */
+ if (FLD_ISSET(site->gmdb_flags, SITE_VIEW))
continue;
if (!F_ISSET(site, SITE_HAS_PRIO)) {
/*
- * Never connected to this site: since we can't know
- * whether it's a peer, assume the worst.
+ * We have not reconnected to this site since the last
+ * recovery. Since we don't yet know whether it's a
+ * peer, assume the worst.
*/
has_unacked_peer = TRUE;
continue;
}
- if (LOG_COMPARE(&site->max_ack, &perm->lsn) >= 0) {
+ if (site->max_ack_gen == perm->gen &&
+ LOG_COMPARE(&site->max_ack, &perm->lsn) >= 0) {
sites_acked++;
if (F_ISSET(site, SITE_ELECTABLE))
peers_acked++;
@@ -1206,6 +1234,7 @@ __repmgr_bust_connection(env, conn)
DB_REP *db_rep;
REP *rep;
REPMGR_SITE *site;
+ db_timespec now;
u_int32_t flags;
int ret, eid;
@@ -1259,7 +1288,9 @@ __repmgr_bust_connection(env, conn)
} else /* Subordinate connection. */
goto out;
- if ((ret = __repmgr_schedule_connection_attempt(env, eid, FALSE)) != 0)
+ /* Defer connection attempt if rejoining 2SITE_STRICT=off repgroup. */
+ if (!db_rep->rejoin_pending &&
+ (ret = __repmgr_schedule_connection_attempt(env, eid, FALSE)) != 0)
goto out;
/*
@@ -1267,11 +1298,47 @@ __repmgr_bust_connection(env, conn)
* master, assume that the master may have failed, and call for
* an election. But only do this for the connection to the main
* master process, not a subordinate one. And only do it if
- * we're our site's main process, not a subordinate one. And
+ * we're our site's listener process, not a subordinate one. And
* skip it if the application has configured us not to do
* elections.
*/
if (!IS_SUBORDINATE(db_rep) && eid == rep->master_id) {
+ if (FLD_ISSET(rep->config, REP_C_AUTOTAKEOVER)) {
+ /*
+ * When the connection is from master's listener, if
+ * there is any other connection from a master's
+ * subordinate process that could take over as
+ * listener, we delay the election to allow some time
+ * for a new master listener to start. At the end of
+ * the delay, if there is still no master listener,
+ * call an election. There is a slight chance that
+ * we will delay the election to wait for an inactive
+ * connection which would never become the next main
+ * connection.
+ */
+ TAILQ_FOREACH(conn, &site->sub_conns, entries) {
+ if (conn->auto_takeover) {
+ if (!timespecisset(
+ &db_rep->m_listener_chk)) {
+ __os_gettime(env, &now, 1);
+ TIMESPEC_ADD_DB_TIMEOUT(&now,
+ db_rep->m_listener_wait);
+ db_rep->m_listener_chk = now;
+ }
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "Master failure, but delay elections for takeover on master"));
+ return (0);
+ }
+ }
+ }
+
+ /* Defer election if rejoining 2SITE_STRICT=off repgroup. */
+ if (db_rep->rejoin_pending) {
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "Deferring election after rejoin rejection"));
+ goto out;
+ }
+
/*
* Even if we're not doing elections, defer the event
* notification to later execution in the election
@@ -1285,6 +1352,17 @@ __repmgr_bust_connection(env, conn)
RPRINT(env, (env, DB_VERB_REPMGR_MISC,
"Master failure, but no elections"));
+ /*
+ * In preferred master mode, a client that has lost its
+ * connection to the master uses an election thread to
+ * restart as master.
+ */
+ if (IS_PREFMAS_MODE(env)) {
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+"bust_connection setting preferred master temp master"));
+ db_rep->prefmas_pending = start_temp_master;
+ }
+
if ((ret = __repmgr_init_election(env, flags)) != 0)
goto out;
}
@@ -1340,25 +1418,59 @@ __repmgr_disable_connection(env, conn)
REPMGR_CONNECTION *conn;
{
DB_REP *db_rep;
- REPMGR_SITE *site;
+ REP *rep;
REPMGR_RESPONSE *resp;
+ REPMGR_SITE *site;
+ SITEINFO *sites;
u_int32_t i;
- int eid, ret, t_ret;
+ int eid, is_subord, orig_state, ret, t_ret;
db_rep = env->rep_handle;
+ rep = db_rep->region;
ret = 0;
+ is_subord = 0;
+ orig_state = conn->state;
conn->state = CONN_DEFUNCT;
if (conn->type == REP_CONNECTION) {
eid = conn->eid;
if (IS_VALID_EID(eid)) {
site = SITE_FROM_EID(eid);
if (conn != site->ref.conn.in &&
- conn != site->ref.conn.out)
- /* It's a subordinate connection. */
+ conn != site->ref.conn.out) {
+ /*
+ * It is a subordinate connection to disable.
+ * Remove it from the subordinate connection
+ * list, and decrease the number of listener
+ * candidates by 1 if it is from a subordinate
+ * rep-aware process that allows takeover.
+ */
TAILQ_REMOVE(&site->sub_conns, conn, entries);
+ SET_LISTENER_CAND(conn->auto_takeover, --);
+ is_subord = 1;
+ }
TAILQ_INSERT_TAIL(&db_rep->connections, conn, entries);
conn->ref_count++;
+ /*
+ * Do not decrease sites_avail for a subordinate
+ * connection.
+ */
+ if (site->state == SITE_CONNECTED && !is_subord &&
+ (orig_state == CONN_READY ||
+ orig_state == CONN_CONGESTED)) {
+ /*
+ * Some thread orderings can cause a brief
+ * dip into a negative sites_avail value.
+ * Once it goes negative it stays negative,
+ * so avoid this. Future connections will
+ * be counted correctly.
+ */
+ if (rep->sites_avail > 0)
+ rep->sites_avail--;
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "disable_conn: EID %lu disabled. sites_avail %lu",
+ (u_long)eid, (u_long)rep->sites_avail));
+ }
}
conn->eid = -1;
} else if (conn->type == APP_CONNECTION) {
@@ -1646,8 +1758,10 @@ flatten(env, msg)
}
/*
- * Scan the list of remote sites, returning the first one that is a peer,
- * is not the current master, and is available.
+ * Scan the list of remote sites, returning the first participant that is a
+ * peer, is not the current master, and is available. If there are no
+ * available participant peers but there is an available view peer, return the
+ * first available view peer.
*/
static REPMGR_SITE *
__repmgr_find_available_peer(env)
@@ -1656,23 +1770,28 @@ __repmgr_find_available_peer(env)
DB_REP *db_rep;
REP *rep;
REPMGR_CONNECTION *conn;
- REPMGR_SITE *site;
- u_int i;
+ REPMGR_SITE *site, *view;
+ u_int avail, i;
db_rep = env->rep_handle;
rep = db_rep->region;
+ view = NULL;
FOR_EACH_REMOTE_SITE_INDEX(i) {
site = &db_rep->sites[i];
- if (FLD_ISSET(site->config, DB_REPMGR_PEER) &&
- EID_FROM_SITE(site) != rep->master_id &&
- site->state == SITE_CONNECTED &&
+ avail = (site->state == SITE_CONNECTED &&
(((conn = site->ref.conn.in) != NULL &&
conn->state == CONN_READY) ||
((conn = site->ref.conn.out) != NULL &&
- conn->state == CONN_READY)))
+ conn->state == CONN_READY)));
+ if (FLD_ISSET(site->config, DB_REPMGR_PEER) &&
+ !FLD_ISSET(site->gmdb_flags, SITE_VIEW) &&
+ EID_FROM_SITE(site) != rep->master_id && avail)
return (site);
+ if (!view && FLD_ISSET(site->config, DB_REPMGR_PEER) &&
+ FLD_ISSET(site->gmdb_flags, SITE_VIEW) && avail)
+ view = site;
}
- return (NULL);
+ return (view);
}
/*
@@ -1852,6 +1971,7 @@ __repmgr_net_close(env)
site->ref.conn.out = NULL;
}
}
+ rep->sites_avail = 0;
if (db_rep->listen_fd != INVALID_SOCKET) {
if (closesocket(db_rep->listen_fd) == SOCKET_ERROR && ret == 0)
@@ -1870,22 +1990,28 @@ final_cleanup(env, conn, unused)
void *unused;
{
DB_REP *db_rep;
+ REP *rep;
REPMGR_SITE *site;
- int ret, t_ret;
+ SITEINFO *sites;
+ int eid, ret, t_ret;
COMPQUIET(unused, NULL);
db_rep = env->rep_handle;
+ rep = db_rep->region;
+ eid = conn->eid;
ret = __repmgr_close_connection(env, conn);
/* Remove the connection from whatever list it's on, if any. */
- if (conn->type == REP_CONNECTION && IS_VALID_EID(conn->eid)) {
- site = SITE_FROM_EID(conn->eid);
+ if (conn->type == REP_CONNECTION && IS_VALID_EID(eid)) {
+ site = SITE_FROM_EID(eid);
if (site->state == SITE_CONNECTED &&
(conn == site->ref.conn.in || conn == site->ref.conn.out)) {
/* Not on any list, so no need to do anything. */
- } else
+ } else {
TAILQ_REMOVE(&site->sub_conns, conn, entries);
+ SET_LISTENER_CAND(conn->auto_takeover, --);
+ }
t_ret = __repmgr_destroy_conn(env, conn);
} else {
diff --git a/src/repmgr/repmgr_posix.c b/src/repmgr/repmgr_posix.c
index 0687681a..c49017ff 100644
--- a/src/repmgr/repmgr_posix.c
+++ b/src/repmgr/repmgr_posix.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/repmgr/repmgr_queue.c b/src/repmgr/repmgr_queue.c
index 6a381acf..3a51b32b 100644
--- a/src/repmgr/repmgr_queue.c
+++ b/src/repmgr/repmgr_queue.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2006, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2006, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -22,13 +22,28 @@ __repmgr_queue_destroy(env)
ENV *env;
{
DB_REP *db_rep;
+ REP *rep;
REPMGR_MESSAGE *m;
REPMGR_CONNECTION *conn;
+ u_int32_t mtype;
int ret, t_ret;
+ COMPQUIET(mtype, 0);
+
db_rep = env->rep_handle;
+ rep = db_rep->region;
ret = 0;
+
+ /*
+ * Turn on the DB_EVENT_REP_INQUEUE_FULL event firing. We only do
+ * this for the main listener process. For a subordinate process,
+ * it is always turned on.
+ */
+ if (!STAILQ_EMPTY(&db_rep->input_queue.header) &&
+ !IS_SUBORDINATE(db_rep))
+ rep->inqueue_full_event_on = 1;
+
while (!STAILQ_EMPTY(&db_rep->input_queue.header)) {
m = STAILQ_FIRST(&db_rep->input_queue.header);
STAILQ_REMOVE_HEAD(&db_rep->input_queue.header, entries);
@@ -38,8 +53,25 @@ __repmgr_queue_destroy(env)
ret == 0)
ret = t_ret;
}
+ if (m->msg_hdr.type == REPMGR_OWN_MSG) {
+ mtype = REPMGR_OWN_MSG_TYPE(m->msg_hdr);
+ if ((conn = m->v.gmdb_msg.conn) != NULL) {
+ /*
+ * A site that removed itself may have already
+ * closed its connections.
+ */
+ if ((t_ret = __repmgr_close_connection(env,
+ conn)) != 0 && ret == 0 &&
+ mtype != REPMGR_REMOVE_REQUEST)
+ ret = t_ret;
+ if ((t_ret = __repmgr_decr_conn_ref(env,
+ conn)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ }
__os_free(env, m);
}
+
return (ret);
}
@@ -60,14 +92,17 @@ __repmgr_queue_get(env, msgp, th)
REPMGR_RUNNABLE *th;
{
DB_REP *db_rep;
+ REP *rep;
REPMGR_MESSAGE *m;
#ifdef DB_WIN32
HANDLE wait_events[2];
#endif
+ u_int32_t msgsize;
int ret;
ret = 0;
db_rep = env->rep_handle;
+ rep = db_rep->region;
while ((m = available_work(env)) == NULL &&
db_rep->repmgr_status == running && !th->quit_requested) {
@@ -104,10 +139,42 @@ __repmgr_queue_get(env, msgp, th)
else {
STAILQ_REMOVE(&db_rep->input_queue.header,
m, __repmgr_message, entries);
- db_rep->input_queue.size--;
+ msgsize = (u_int32_t)m->size;
+ while (msgsize >= GIGABYTE) {
+ DB_ASSERT(env, db_rep->input_queue.gbytes > 0);
+ db_rep->input_queue.gbytes--;
+ msgsize -= GIGABYTE;
+ }
+ if (db_rep->input_queue.bytes < msgsize) {
+ DB_ASSERT(env, db_rep->input_queue.gbytes > 0);
+ db_rep->input_queue.gbytes--;
+ db_rep->input_queue.bytes += GIGABYTE;
+ }
+ db_rep->input_queue.bytes -= msgsize;
+
+ /*
+ * Check if current size is out of the red zone.
+ * If it is, we will turn on the DB_EVENT_REP_INQUEUE_FULL
+ * event firing.
+ *
+ * We only have the redzone mechanism for the main listener
+ * process.
+ */
+ if (!IS_SUBORDINATE(db_rep) &&
+ rep->inqueue_full_event_on == 0) {
+ MUTEX_LOCK(env, rep->mtx_repmgr);
+ if (db_rep->input_queue.gbytes <
+ rep->inqueue_rz_gbytes ||
+ (db_rep->input_queue.gbytes ==
+ rep->inqueue_rz_gbytes &&
+ db_rep->input_queue.bytes <
+ rep->inqueue_rz_bytes))
+ rep->inqueue_full_event_on = 1;
+ MUTEX_UNLOCK(env, rep->mtx_repmgr);
+ }
+
*msgp = m;
}
-
err:
return (ret);
}
@@ -157,24 +224,55 @@ __repmgr_queue_put(env, msg)
REPMGR_MESSAGE *msg;
{
DB_REP *db_rep;
+ REP *rep;
+ u_int32_t msgsize;
db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ /*
+ * Drop message if incoming queue contains more messages than the
+ * limit. See dbenv->repmgr_set_incoming_queue_max() for more
+ * information.
+ */
+ MUTEX_LOCK(env, rep->mtx_repmgr);
+ if (db_rep->input_queue.gbytes > rep->inqueue_max_gbytes ||
+ (db_rep->input_queue.gbytes == rep->inqueue_max_gbytes &&
+ db_rep->input_queue.bytes >= rep->inqueue_max_bytes)) {
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "incoming queue limit exceeded"));
+ STAT(rep->mstat.st_incoming_msgs_dropped++);
+ if (IS_SUBORDINATE(db_rep) || rep->inqueue_full_event_on) {
+ DB_EVENT(env, DB_EVENT_REP_INQUEUE_FULL, NULL);
+ /*
+ * We will always disable the event firing after
+ * the queue is full. It will be enabled again
+ * after the incoming queue size is out of the
+ * redzone.
+ *
+ * We only have the redzone mechanism for the main
+ * listener process.
+ */
+ if (!IS_SUBORDINATE(db_rep))
+ rep->inqueue_full_event_on = 0;
+ }
+ MUTEX_UNLOCK(env, rep->mtx_repmgr);
+ __os_free(env, msg);
+ return (0);
+ }
+ MUTEX_UNLOCK(env, rep->mtx_repmgr);
STAILQ_INSERT_TAIL(&db_rep->input_queue.header, msg, entries);
- db_rep->input_queue.size++;
+ msgsize = (u_int32_t)msg->size;
+ while (msgsize >= GIGABYTE) {
+ msgsize -= GIGABYTE;
+ db_rep->input_queue.gbytes++;
+ }
+ db_rep->input_queue.bytes += msgsize;
+ if (db_rep->input_queue.bytes >= GIGABYTE) {
+ db_rep->input_queue.gbytes++;
+ db_rep->input_queue.bytes -= GIGABYTE;
+ }
return (__repmgr_signal(&db_rep->msg_avail));
}
-
-/*
- * PUBLIC: int __repmgr_queue_size __P((ENV *));
- *
- * !!!
- * Caller must hold repmgr->mutex.
- */
-int
-__repmgr_queue_size(env)
- ENV *env;
-{
- return (env->rep_handle->input_queue.size);
-}
diff --git a/src/repmgr/repmgr_rec.c b/src/repmgr/repmgr_rec.c
index 41827aff..568df45d 100644
--- a/src/repmgr/repmgr_rec.c
+++ b/src/repmgr/repmgr_rec.c
@@ -1,3 +1,11 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2014, 2015 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
#include "db_config.h"
#include "db_int.h"
@@ -31,7 +39,7 @@ __repmgr_member_recover(env, dbtp, lsnp, op, info)
/*
* The annotation log record describes the update in enough detail for
- * us to be able to optimize our tracking of it at clients sites.
+ * us to be able to optimize our tracking of it at client sites.
* However, for now we just simply reread the whole (small) database
* each time, since changes happen so seldom (and we need to have the
* code for reading the whole thing anyway, for other cases).
diff --git a/src/repmgr/repmgr_sel.c b/src/repmgr/repmgr_sel.c
index ba14368f..c32dad25 100644
--- a/src/repmgr/repmgr_sel.c
+++ b/src/repmgr/repmgr_sel.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2006, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2006, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -12,7 +12,7 @@
typedef int (*HEARTBEAT_ACTION) __P((ENV *));
-static int accept_handshake __P((ENV *, REPMGR_CONNECTION *, char *));
+static int accept_handshake __P((ENV *, REPMGR_CONNECTION *, char *, int *));
static int accept_v1_handshake __P((ENV *, REPMGR_CONNECTION *, char *));
static void check_min_log_file __P((ENV *));
static int dispatch_msgin __P((ENV *, REPMGR_CONNECTION *));
@@ -23,13 +23,18 @@ static int process_parameters __P((ENV *,
static int read_version_response __P((ENV *, REPMGR_CONNECTION *));
static int record_permlsn __P((ENV *, REPMGR_CONNECTION *));
static int __repmgr_call_election __P((ENV *));
+static int __repmgr_check_listener __P((ENV *));
+static int __repmgr_check_master_listener __P((ENV *));
static int __repmgr_connector_main __P((ENV *, REPMGR_RUNNABLE *));
static void *__repmgr_connector_thread __P((void *));
static int __repmgr_next_timeout __P((ENV *,
db_timespec *, HEARTBEAT_ACTION *));
+static int __repmgr_reset_last_rcvd __P((ENV *));
static int __repmgr_retry_connections __P((ENV *));
static int __repmgr_send_heartbeat __P((ENV *));
-static int __repmgr_try_one __P((ENV *, int));
+static int __repmgr_start_takeover __P((ENV *));
+static void *__repmgr_takeover_thread __P((void *));
+static int __repmgr_try_one __P((ENV *, int, int));
static int resolve_collision __P((ENV *, REPMGR_SITE *, REPMGR_CONNECTION *));
static int send_version_response __P((ENV *, REPMGR_CONNECTION *));
@@ -49,17 +54,24 @@ void *
__repmgr_select_thread(argsp)
void *argsp;
{
- REPMGR_RUNNABLE *args;
ENV *env;
+ DB_THREAD_INFO *ip;
int ret;
+ REPMGR_RUNNABLE *args;
args = argsp;
env = args->env;
+ ip = NULL;
+ ret = 0;
- if ((ret = __repmgr_select_loop(env)) != 0) {
+ ENV_ENTER_RET(env, ip, ret);
+ if (ret != 0 || (ret = __repmgr_select_loop(env)) != 0) {
__db_err(env, ret, DB_STR("3614", "select loop failed"));
+ ENV_LEAVE(env, ip);
(void)__repmgr_thread_failure(env, ret);
}
+ if (ret == 0)
+ ENV_LEAVE(env, ip);
return (NULL);
}
@@ -71,12 +83,19 @@ __repmgr_bow_out(env)
ENV *env;
{
DB_REP *db_rep;
+ REP *rep;
int ret;
db_rep = env->rep_handle;
+ rep = db_rep->region;
LOCK_MUTEX(db_rep->mutex);
ret = __repmgr_stop_threads(env);
UNLOCK_MUTEX(db_rep->mutex);
+ /*
+ * Reset sites_avail so that it will be calculated correctly if this
+ * site rejoins the group in the future.
+ */
+ rep->sites_avail = 0;
DB_EVENT(env, DB_EVENT_REP_LOCAL_SITE_REMOVED, NULL);
return (ret);
}
@@ -187,23 +206,53 @@ __repmgr_compute_timeout(env, timeout)
db_rep = env->rep_handle;
/*
- * There are two factors to consider: are heartbeats in use? and, do we
+ * There are four factors to consider: are heartbeats in use? do we
* have any sites with broken connections that we ought to retry?
+ * is there a listener process running locally? do we need to call
+ * an election if no master listener exists?
*/
have_timeout = __repmgr_next_timeout(env, &t, NULL);
/* List items are in order, so we only have to examine the first one. */
if (!TAILQ_EMPTY(&db_rep->retries)) {
retry = TAILQ_FIRST(&db_rep->retries);
- if (have_timeout) {
+ if (have_timeout)
/* Choose earliest timeout deadline. */
t = timespeccmp(&retry->time, &t, <) ? retry->time : t;
- } else {
+ else {
t = retry->time;
have_timeout = TRUE;
}
}
+ /* Check listener every timeout in subordinate rep-aware process. */
+ if (IS_LISTENER_CAND(db_rep)) {
+ if (!timespecisset(&db_rep->l_listener_chk)) {
+ __os_gettime(env, &now, 1);
+ TIMESPEC_ADD_DB_TIMEOUT(&now, db_rep->l_listener_wait);
+ db_rep->l_listener_chk = now;
+ }
+ if (have_timeout)
+ t = timespeccmp(&db_rep->l_listener_chk, &t, <) ?
+ db_rep->l_listener_chk : t;
+ else {
+ t = db_rep->l_listener_chk;
+ have_timeout = TRUE;
+ }
+ }
+
+ /* Check master listener if needed. */
+ if (FLD_ISSET(db_rep->region->config, REP_C_AUTOTAKEOVER) &&
+ timespecisset(&db_rep->m_listener_chk)) {
+ if (have_timeout)
+ t = timespeccmp(&db_rep->m_listener_chk, &t, <) ?
+ db_rep->m_listener_chk : t;
+ else {
+ t = db_rep->m_listener_chk;
+ have_timeout = TRUE;
+ }
+ }
+
if (have_timeout) {
__os_gettime(env, &now, 1);
if (timespeccmp(&now, &t, >=))
@@ -242,7 +291,17 @@ __repmgr_next_timeout(env, deadline, action)
if (rep->master_id == db_rep->self_eid &&
rep->heartbeat_frequency > 0) {
- t = db_rep->last_bcast;
+ /*
+ * A temporary master in preferred master mode must send
+ * regular heartbeats regardless of other activity because
+ * the preferred master requires a heartbeat to take over as
+ * master after it has synced with the temporary master.
+ */
+ if (IS_PREFMAS_MODE(env) &&
+ FLD_ISSET(rep->config, REP_C_PREFMAS_CLIENT))
+ t = db_rep->last_hbeat;
+ else
+ t = db_rep->last_bcast;
TIMESPEC_ADD_DB_TIMEOUT(&t, rep->heartbeat_frequency);
my_action = __repmgr_send_heartbeat;
} else if ((master = __repmgr_connected_master(env)) != NULL &&
@@ -301,6 +360,24 @@ __repmgr_send_heartbeat(env)
db_rep = env->rep_handle;
rep = db_rep->region;
+ ret = 0;
+
+ /*
+ * Check test hook preventing heartbeats and connection attempts.
+ * This is used to create and maintain a dupmaster condition in
+ * a test until the test hook is rescinded.
+ */
+ DB_TEST_SET(env->test_abort, DB_TEST_REPMGR_HEARTBEAT);
+
+ /*
+ * Track last heartbeat for temporary master in preferred master
+ * mode so that it will send regular heartbeats regardless of
+ * other activity.
+ */
+ if (IS_PREFMAS_MODE(env) &&
+ FLD_ISSET(rep->config, REP_C_PREFMAS_CLIENT) &&
+ rep->master_id == db_rep->self_eid)
+ __os_gettime(env, &db_rep->last_hbeat, 1);
permlsn.generation = rep->gen;
if ((ret = __rep_get_maxpermlsn(env, &permlsn.lsn)) != 0)
@@ -310,8 +387,11 @@ __repmgr_send_heartbeat(env)
control.size = __REPMGR_PERMLSN_SIZE;
DB_INIT_DBT(rec, NULL, 0);
- return (__repmgr_send_broadcast(env,
- REPMGR_HEARTBEAT, &control, &rec, &unused1, &unused2, &unused3));
+ ret =__repmgr_send_broadcast(env,
+ REPMGR_HEARTBEAT, &control, &rec, &unused1, &unused2, &unused3);
+
+DB_TEST_RECOVERY_LABEL
+ return (ret);
}
/*
@@ -373,6 +453,8 @@ __repmgr_check_timeouts(env)
HEARTBEAT_ACTION action;
int ret;
+ ret = 0;
+
/*
* Figure out the next heartbeat-related thing to be done. Then, if
* it's time to do it, do so.
@@ -384,7 +466,342 @@ __repmgr_check_timeouts(env)
return (ret);
}
- return (__repmgr_retry_connections(env));
+ /* Check the existence of local listener. */
+ if ((ret = __repmgr_check_listener(env)) != 0)
+ return (ret);
+
+ /* Check the existence of master listener. */
+ if ((ret = __repmgr_check_master_listener(env)) != 0)
+ return (ret);
+
+ /*
+ * Check test hook preventing heartbeats and connection attempts.
+ * This is used to create and maintain a dupmaster condition in
+ * a test until the test hook is rescinded.
+ */
+ DB_TEST_SET(env->test_abort, DB_TEST_REPMGR_HEARTBEAT);
+
+ ret = __repmgr_retry_connections(env);
+
+DB_TEST_RECOVERY_LABEL
+ return (ret);
+}
+
+/*
+ * Check the existence of the listener process on the local site. If one
+ * does not exist and the current process is a subordinate rep-aware process,
+ * then start a takeover thread to convert this process to the listener
+ * process.
+ *
+ * Returns 0 without doing anything when this process is not a listener
+ * candidate, when the local site is marked SITE_DELETING in the shared
+ * region, or when the l_listener_chk deadline has not been reached yet.
+ * Once the deadline has passed, the next deadline is computed and the
+ * return value is 0 or the first error reported by
+ * __repmgr_sync_siteaddr() or __repmgr_start_takeover().
+ */
+static int
+__repmgr_check_listener(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ SITEINFO *sites;
+ db_timespec t;
+ int ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ ret = 0;
+
+ /*
+ * Only subordinate rep-aware process can take over listener role, so
+ * no need to check listener in listener process or rep unaware process.
+ */
+ if (!IS_LISTENER_CAND(db_rep))
+ return (0);
+
+ /*
+ * If the listener quits due to site removal, no subordinate process
+ * should take over as listener as the current site is not expected
+ * to be active in the group. Check the status from the site array
+ * in the shared region instead of that in the GMDB. We do this
+ * because the GMDB doesn't apply the change yet when replication
+ * is stopped on the removed site.
+ */
+ sites = R_ADDR(env->reginfo, rep->siteinfo_off);
+ if (sites[rep->self_eid].status == SITE_DELETING)
+ return (0);
+
+ /*
+ * Check the listener after timeout. If there is no listener, we
+ * take over. During takeover, we will refresh all connections.
+ * A subordinate process does not have an up-to-date site list, so sync
+ * up addresses from the in-memory site array before takeover.
+ */
+ __os_gettime(env, &t, 1);
+ if (timespeccmp(&t, &db_rep->l_listener_chk, >=)) {
+ /* Compute the next timeout. */
+ TIMESPEC_ADD_DB_TIMEOUT(&t, db_rep->l_listener_wait);
+ db_rep->l_listener_chk = t;
+
+ /* Check if site address information needs to be refreshed. */
+ if ((rep->siteinfo_seq > db_rep->siteinfo_seq) &&
+ (ret = __repmgr_sync_siteaddr(env)) != 0)
+ return (ret);
+
+ if (rep->listener == 0)
+ ret = __repmgr_start_takeover(env);
+ }
+ return (ret);
+}
+
+/*
+ * Start a thread to take over the listener role in the current subordinate
+ * process.
+ *
+ * The REPMGR_RUNNABLE kept in db_rep->takeover_thread is reused across
+ * calls: it is allocated on first use, and a previous thread that has
+ * marked itself finished is joined before a new one is started. If the
+ * previous takeover thread has not finished yet, nothing is started and
+ * 0 is returned.
+ *
+ * Returns 0 on success, or the error from __os_calloc(),
+ * __repmgr_thread_join() or __repmgr_thread_start(). When the thread
+ * fails to start, the runnable is freed and db_rep->takeover_thread is
+ * reset to NULL.
+ */
+static int
+__repmgr_start_takeover(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REPMGR_RUNNABLE *th;
+ int ret;
+
+ db_rep = env->rep_handle;
+ th = db_rep->takeover_thread;
+ if (th == NULL) {
+ if ((ret = __os_calloc(env, 1, sizeof(REPMGR_RUNNABLE),
+ &th)) != 0)
+ return (ret);
+ db_rep->takeover_thread = th;
+ } else if (th->finished) {
+ if ((ret = __repmgr_thread_join(th)) != 0)
+ return (ret);
+ } else {
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "takeover thread still running"));
+ return (0);
+ }
+ th->run = __repmgr_takeover_thread;
+ if ((ret = __repmgr_thread_start(env, th)) != 0) {
+ __os_free(env, th);
+ db_rep->takeover_thread = NULL;
+ }
+ return (ret);
+}
+
+/*
+ * Take over listener role in the current subordinate process.
+ *
+ * Thread entry point: argsp is the REPMGR_RUNNABLE for this thread. The
+ * function always returns NULL; the outcome is reported only through
+ * verbose messages, the st_takeovers statistic and the
+ * DB_EVENT_REP_AUTOTAKEOVER_FAILED event. th->finished is set to TRUE on
+ * every exit path.
+ */
+static void *
+__repmgr_takeover_thread(argsp)
+ void *argsp;
+{
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ REP *rep;
+ REPMGR_RUNNABLE *th;
+ int nthreads, ret, save_policy;
+
+ th = argsp;
+ env = th->env;
+ db_rep = env->rep_handle;
+ ip = NULL;
+ rep = db_rep->region;
+ ret = 0;
+
+ ENV_ENTER_RET(env, ip, ret);
+ if (ret != 0)
+ goto out;
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC, "starting takeover thread"));
+ /*
+ * It is likely that there is an old heartbeat ready to expire
+ * immediately upon restarting repmgr, leading to an unnecessary
+ * election. Reset the expiration countdown here to avoid this.
+ *
+ * NOTE(review): a failure here jumps to "out", which bypasses the
+ * ENV_LEAVE below even though ENV_ENTER_RET succeeded -- confirm
+ * whether that is intended.
+ */
+ if ((ret = __repmgr_reset_last_rcvd(env)) != 0)
+ goto out;
+ /*
+ * If nthreads is set to be 0 in the current subordinate process, use
+ * the value in the last listener. The nthreads should be larger than
+ * 0 in listener.
+ */
+ nthreads = db_rep->config_nthreads == 0 ? (int)rep->listener_nthreads :
+ db_rep->config_nthreads;
+ /*
+ * It is possible that this subordinate process does not have intact
+ * connections to the other sites. For most ack policies, restarting
+ * repmgr will wait for acks when it commits its transaction to reload
+ * the gmdb. Temporarily set the ack policy to NONE for the takeover
+ * so that it is not delayed waiting for acks that can never come.
+ * The saved policy is written back to rep->perm_policy after
+ * __repmgr_start_int() returns, whatever its result.
+ */
+ save_policy = rep->perm_policy;
+ rep->perm_policy = DB_REPMGR_ACKS_NONE;
+ /*
+ * Restart the repmgr as listener. If DB_REP_IGNORE is returned,
+ * the current process has become listener. If DB_REP_UNAVAIL is
+ * returned, the site has been removed from the group and no listener
+ * should be started. For any other error, if the replication is
+ * stopped because of the takeover thread, we will notify the
+ * application.
+ */
+ ret = __repmgr_start_int(env, nthreads, F_ISSET(rep, REP_F_MASTER) ?
+ DB_REP_MASTER : DB_REP_CLIENT);
+ if (ret == 0 && !IS_SUBORDINATE(db_rep) &&
+ db_rep->repmgr_status == running) {
+ STAT(rep->mstat.st_takeovers++);
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "finished takeover and became listener"));
+ } else if (ret != 0 && db_rep->repmgr_status == stopped) {
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "failed to take over, repmgr was stopped"));
+ DB_EVENT(env, DB_EVENT_REP_AUTOTAKEOVER_FAILED, NULL);
+ } else {
+ /* The current process is not changed to listener. */
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC, "failed to take over"));
+ }
+ rep->perm_policy = save_policy;
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC, "takeover thread is exiting"));
+ ENV_LEAVE(env, ip);
+out: th->finished = TRUE;
+ return (NULL);
+}
+
+/*
+ * Reset the last_rcvd_timestamp to restart the wait for a heartbeat
+ * monitor expiration.
+ *
+ * Under db_rep->mutex, stamps the current time into the connected
+ * master's last_rcvd_timestamp; does nothing when
+ * __repmgr_connected_master() reports no master. Returns 0 on the paths
+ * visible here. LOCK_MUTEX/UNLOCK_MUTEX are macros defined elsewhere and
+ * may add an error return of their own -- TODO confirm.
+ */
+static int
+__repmgr_reset_last_rcvd(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REPMGR_SITE *master;
+
+ db_rep = env->rep_handle;
+
+ LOCK_MUTEX(db_rep->mutex);
+ if ((master = __repmgr_connected_master(env)) != NULL)
+ __os_gettime(env, &master->last_rcvd_timestamp, 1);
+ UNLOCK_MUTEX(db_rep->mutex);
+ return (0);
+}
+
+/*
+ * Monitor the connection to master listener. When the master listener is
+ * disconnected and some other master process might take over as listener
+ * soon, we will delay the election. After the delay if there is still no
+ * connection from master listener, call an election then.
+ *
+ * Returns 0 when there is nothing to check (REP_C_AUTOTAKEOVER is off,
+ * m_listener_chk is not set, rep->master_id is not a valid EID, or the
+ * deadline has not passed). Once the deadline has passed, returns the
+ * result of __repmgr_init_election() if the master site has neither an
+ * incoming nor an outgoing connection, and 0 otherwise.
+ */
+static int
+__repmgr_check_master_listener(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ REPMGR_SITE *master;
+ db_timespec t;
+ u_int32_t flags;
+ int ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ ret = 0;
+
+ /*
+ * We only check for a master listener if m_listener_chk is set.
+ * The field is only set when __repmgr_bust_connection() previously
+ * detected the loss of our connection to the master listener.
+ * If rep->master_id is invalid, wait until it is ready to check.
+ */
+ if (!FLD_ISSET((db_rep)->region->config, REP_C_AUTOTAKEOVER) ||
+ !timespecisset(&db_rep->m_listener_chk) ||
+ !IS_VALID_EID(rep->master_id))
+ return (0);
+
+ __os_gettime(env, &t, 1);
+ if (timespeccmp(&t, &db_rep->m_listener_chk, >=)) {
+ master = SITE_FROM_EID(db_rep->region->master_id);
+ if (master->ref.conn.out == NULL &&
+ master->ref.conn.in == NULL) {
+ flags = ELECT_F_EVENT_NOTIFY;
+ if (FLD_ISSET(db_rep->region->config, REP_C_ELECTIONS))
+ LF_SET(ELECT_F_IMMED | ELECT_F_FAST);
+ else
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "Master failure, but no elections"));
+
+ /*
+ * In preferred master mode, a client that has lost its
+ * connection to the master uses an election thread to
+ * restart as master.
+ */
+ if (IS_PREFMAS_MODE(env)) {
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+"check_master_listener setting preferred master temp master"));
+ db_rep->prefmas_pending = start_temp_master;
+ }
+
+ ret = __repmgr_init_election(env, flags);
+ }
+ /*
+ * If the delay has expired reset m_listener_chk. We reset
+ * it whether or not the master listener process comes back
+ * so that we will not continue checking for a master listener
+ * indefinitely.
+ */
+ timespecclear(&db_rep->m_listener_chk);
+ }
+ return (ret);
+}
+
+/*
+ * Wake up I/O waiting in selector thread, refresh connections to all connected
+ * and present sites.
+ *
+ * Returns 0 on success, or the first error from
+ * __repmgr_wake_main_thread() or __repmgr_try_one(); an error stops the
+ * walk over the remaining remote sites.
+ *
+ * NOTE(review): the locals rep and sites are not referenced directly in
+ * the body; presumably the SET_LISTENER_CAND() macro, defined elsewhere,
+ * expands to code that uses them -- confirm against its definition.
+ *
+ * PUBLIC: int __repmgr_refresh_selector __P((ENV *));
+ */
+int
+__repmgr_refresh_selector(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ REPMGR_RETRY *retry;
+ REPMGR_SITE *site;
+ SITEINFO *sites;
+ int eid, ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ if ((ret = __repmgr_wake_main_thread(env)) != 0)
+ return (ret);
+
+ FOR_EACH_REMOTE_SITE_INDEX(eid) {
+ SET_LISTENER_CAND(1, = 0);
+ site = SITE_FROM_EID(eid);
+
+ /*
+ * It is possible some sites were left in a paused state
+ * during the switch, so they have to be removed from the
+ * retry list.
+ */
+ if (site->state == SITE_PAUSING) {
+ retry = site->ref.retry;
+ if (retry != NULL) {
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "Removing site from retry list eid %lu",
+ (u_long)eid));
+ TAILQ_REMOVE(&db_rep->retries, retry, entries);
+ __os_free(env, retry);
+ site->ref.retry = NULL;
+ }
+
+ }
+ /*
+ * Try to connect to any site that is now PRESENT after
+ * rereading the gmdb. The TRUE argument is the refresh flag
+ * of __repmgr_try_one().
+ */
+ if (site->membership == SITE_PRESENT &&
+ (ret = __repmgr_try_one(env, eid, TRUE)) != 0)
+ return (ret);
+ }
+ return (0);
}
/*
@@ -415,10 +832,11 @@ __repmgr_retry_connections(env)
__os_free(env, retry);
DB_ASSERT(env, IS_VALID_EID(eid));
site = SITE_FROM_EID(eid);
+ site->ref.retry = NULL;
DB_ASSERT(env, site->state == SITE_PAUSING);
if (site->membership == SITE_PRESENT) {
- if ((ret = __repmgr_try_one(env, eid)) != 0)
+ if ((ret = __repmgr_try_one(env, eid, FALSE)) != 0)
return (ret);
} else
site->state = SITE_IDLE;
@@ -437,11 +855,23 @@ __repmgr_first_try_connections(env)
ENV *env;
{
DB_REP *db_rep;
+ REP *rep;
REPMGR_SITE *site;
+ SITEINFO *sites;
int eid, ret;
db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ /*
+ * Check test hook preventing heartbeats and connection attempts.
+ * This is used to create and maintain a dupmaster condition in
+ * a test until the test hook is rescinded.
+ */
+ DB_TEST_SET(env->test_abort, DB_TEST_REPMGR_HEARTBEAT);
+
FOR_EACH_REMOTE_SITE_INDEX(eid) {
+ SET_LISTENER_CAND(1, = 0);
site = SITE_FROM_EID(eid);
/*
* Normally all sites would be IDLE here. But if a user thread
@@ -453,19 +883,22 @@ __repmgr_first_try_connections(env)
*/
if (site->state == SITE_IDLE &&
site->membership == SITE_PRESENT &&
- (ret = __repmgr_try_one(env, eid)) != 0)
+ (ret = __repmgr_try_one(env, eid, FALSE)) != 0)
return (ret);
}
+DB_TEST_RECOVERY_LABEL
return (0);
}
/*
- * Starts a thread to open a connection to the site at the given EID.
+ * Starts a thread to open a connection to the site at the given EID. We might
+ * have no connection to the site, or an existing connection to be replaced.
*/
static int
-__repmgr_try_one(env, eid)
+__repmgr_try_one(env, eid, refresh)
ENV *env;
int eid;
+ int refresh;
{
DB_REP *db_rep;
REPMGR_SITE *site;
@@ -488,13 +921,22 @@ __repmgr_try_one(env, eid)
"eid %lu previous connector thread still running; will retry",
(u_long)eid));
return (__repmgr_schedule_connection_attempt(env,
- eid, FALSE));
+ eid, refresh));
}
site->state = SITE_CONNECTING;
th->run = __repmgr_connector_thread;
- th->args.eid = eid;
+ th->args.conn_th.eid = eid;
+ /*
+ * The flag CONNECT_F_REFRESH indicates an immediate connection attempt
+ * should be scheduled if the current connection attempt fails. It is
+ * turned on before the first attempt to refresh the connection but
+ * turned off if the first attempt fails. In this way, when refreshing
+ * the connection, there will be at most two immediate connection
+ * attempts, after that, retry as usual.
+ */
+ th->args.conn_th.flags = refresh ? CONNECT_F_REFRESH : 0;
if ((ret = __repmgr_thread_start(env, th)) != 0) {
__os_free(env, th);
site->connector = NULL;
@@ -506,21 +948,33 @@ static void *
__repmgr_connector_thread(argsp)
void *argsp;
{
- REPMGR_RUNNABLE *th;
ENV *env;
+ DB_THREAD_INFO *ip;
+ REPMGR_RUNNABLE *th;
int ret;
th = argsp;
env = th->env;
+ ip = NULL;
+ ret = 0;
- RPRINT(env, (env, DB_VERB_REPMGR_MISC,
- "starting connector thread, eid %u", th->args.eid));
- if ((ret = __repmgr_connector_main(env, th)) != 0) {
+ ENV_ENTER_RET(env, ip, ret);
+ if (ret == 0)
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "starting connector thread, eid %u",
+ th->args.conn_th.eid));
+ if (ret != 0 || (ret = __repmgr_connector_main(env, th)) != 0) {
__db_err(env, ret, DB_STR("3617", "connector thread failed"));
+ RPRINT(env, (env,
+ DB_VERB_REPMGR_MISC, "connector thread is exiting"));
+ ENV_LEAVE(env, ip);
(void)__repmgr_thread_failure(env, ret);
}
- RPRINT(env, (env, DB_VERB_REPMGR_MISC, "connector thread is exiting"));
-
+ if (ret == 0) {
+ RPRINT(env, (env,
+ DB_VERB_REPMGR_MISC, "connector thread is exiting"));
+ ENV_LEAVE(env, ip);
+ }
th->finished = TRUE;
return (NULL);
}
@@ -542,8 +996,8 @@ __repmgr_connector_main(env, th)
ret = 0;
LOCK_MUTEX(db_rep->mutex);
- DB_ASSERT(env, IS_VALID_EID(th->args.eid));
- site = SITE_FROM_EID(th->args.eid);
+ DB_ASSERT(env, IS_VALID_EID(th->args.conn_th.eid));
+ site = SITE_FROM_EID(th->args.conn_th.eid);
if (site->state != SITE_CONNECTING && db_rep->repmgr_status == stopped)
goto unlock;
@@ -563,7 +1017,8 @@ __repmgr_connector_main(env, th)
UNLOCK_MUTEX(db_rep->mutex);
if ((ret = __repmgr_connect(env, &netaddr, &conn, &err)) == 0) {
- DB_EVENT(env, DB_EVENT_REP_CONNECT_ESTD, &th->args.eid);
+ DB_EVENT(env,
+ DB_EVENT_REP_CONNECT_ESTD, &th->args.conn_th.eid);
LOCK_MUTEX(db_rep->mutex);
if ((ret = __repmgr_set_nonblock_conn(conn)) != 0) {
__db_err(env, ret, DB_STR("3618",
@@ -571,33 +1026,53 @@ __repmgr_connector_main(env, th)
goto cleanup;
}
conn->type = REP_CONNECTION;
- site = SITE_FROM_EID(th->args.eid);
+ site = SITE_FROM_EID(th->args.conn_th.eid);
if (site->state != SITE_CONNECTING ||
db_rep->repmgr_status == stopped)
goto cleanup;
- conn->eid = th->args.eid;
- site = SITE_FROM_EID(th->args.eid);
- site->ref.conn.out = conn;
+ conn->eid = th->args.conn_th.eid;
+ site = SITE_FROM_EID(th->args.conn_th.eid);
+ /*
+ * If there is an existing outgoing connection, disable it and
+ * replace it with a new connection. The sites for a formerly
+ * subordinate handle that is now taking over might still be
+ * SITE_CONNECTING. Set to SITE_CONNECTED before disabling
+ * connection so that sites_avail is correctly maintained.
+ */
site->state = SITE_CONNECTED;
+ if (site->ref.conn.out != NULL)
+ (void)__repmgr_disable_connection(env,
+ site->ref.conn.out);
+ site->ref.conn.out = conn;
__os_gettime(env, &site->last_rcvd_timestamp, 1);
ret = __repmgr_wake_main_thread(env);
} else if (ret == DB_REP_UNAVAIL) {
/* Retryable error while trying to connect: retry later. */
- info.eid = th->args.eid;
+ info.eid = th->args.conn_th.eid;
info.error = err;
DB_EVENT(env, DB_EVENT_REP_CONNECT_TRY_FAILED, &info);
STAT(db_rep->region->mstat.st_connect_fail++);
LOCK_MUTEX(db_rep->mutex);
- site = SITE_FROM_EID(th->args.eid);
+ site = SITE_FROM_EID(th->args.conn_th.eid);
if (site->state != SITE_CONNECTING ||
db_rep->repmgr_status == stopped) {
ret = 0;
goto unlock;
}
+ /*
+ * If it fails to create a new outgoing connection to replace
+ * the existing one in the first attempt, schedule another
+ * immediate attempt. If it is our second attempt, disable
+ * the existing connections and retry as normal.
+ */
+ if (site->ref.conn.out != NULL && th->args.conn_th.flags == 0)
+ (void)__repmgr_disable_connection(env,
+ site->ref.conn.out);
ret = __repmgr_schedule_connection_attempt(env,
- th->args.eid, FALSE);
+ th->args.conn_th.eid,
+ th->args.conn_th.flags == CONNECT_F_REFRESH);
} else
goto out;
@@ -842,6 +1317,7 @@ prepare_input(env, conn)
if ((ret = __os_malloc(env, memsize, &membase)) != 0)
return (ret);
conn->input.rep_message = membase;
+ conn->input.rep_message->size = memsize;
conn->input.rep_message->msg_hdr = msg_hdr;
conn->input.rep_message->v.repmsg.originating_eid = conn->eid;
@@ -876,6 +1352,7 @@ prepare_input(env, conn)
if ((ret = __os_malloc(env, memsize, &membase)) != 0)
return (ret);
conn->input.rep_message = membase;
+ conn->input.rep_message->size = memsize;
conn->input.rep_message->msg_hdr = msg_hdr;
conn->input.rep_message->v.appmsg.conn = conn;
@@ -891,6 +1368,7 @@ prepare_input(env, conn)
if ((ret = __os_malloc(env, size, &membase)) != 0)
return (ret);
conn->input.rep_message = membase;
+ conn->input.rep_message->size = size;
conn->input.rep_message->msg_hdr = msg_hdr;
/*
@@ -1065,16 +1543,18 @@ dispatch_msgin(env, conn)
ENV *env;
REPMGR_CONNECTION *conn;
{
+ DBT *dbt;
DB_REP *db_rep;
- REPMGR_SITE *site;
- REPMGR_RUNNABLE *th;
+ REP *rep;
REPMGR_RESPONSE *resp;
- DBT *dbt;
+ REPMGR_RUNNABLE *th;
+ REPMGR_SITE *site;
char *hostname;
- int eid, ret;
+ int eid, ret, subord;
DB_ASSERT(env, conn->reading_phase == DATA_PHASE);
db_rep = env->rep_handle;
+ rep = db_rep->region;
switch (conn->state) {
case CONN_CONNECTED:
@@ -1129,9 +1609,22 @@ dispatch_msgin(env, conn)
dbt = &conn->input.repmgr_msg.rec;
hostname = dbt->data;
hostname[dbt->size-1] = '\0';
- if ((ret = accept_handshake(env, conn, hostname)) != 0)
+ if ((ret = accept_handshake(env,
+ conn, hostname, &subord)) != 0)
return (ret);
conn->state = CONN_READY;
+ site = SITE_FROM_EID(conn->eid);
+ /*
+ * Do not increase sites_avail redundantly for an
+ * incoming subordinate connection.
+ */
+ if (conn->type == REP_CONNECTION &&
+ site->state == SITE_CONNECTED && !subord) {
+ rep->sites_avail++;
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "msgin: EID %lu CONNECTED, READY. sites_avail %lu",
+ (u_long)conn->eid, (u_long)rep->sites_avail));
+ }
break;
case REPMGR_OWN_MSG:
/*
@@ -1279,9 +1772,11 @@ process_own_msg(env, conn)
REPMGR_SITE *site;
REPMGR_MESSAGE *msg;
__repmgr_connect_reject_args reject;
+ __repmgr_v4connect_reject_args v4reject;
__repmgr_parm_refresh_args parms;
int ret;
+ db_rep = env->rep_handle;
ret = 0;
/*
* Set "msg" to point to the message struct. If we do all necessary
@@ -1293,28 +1788,61 @@ process_own_msg(env, conn)
switch (REPMGR_OWN_MSG_TYPE((msg = conn->input.rep_message)->msg_hdr)) {
case REPMGR_CONNECT_REJECT:
dbt = &msg->v.gmdb_msg.request;
- if ((ret = __repmgr_connect_reject_unmarshal(env,
- &reject, dbt->data, dbt->size, NULL)) != 0)
- return (DB_REP_UNAVAIL);
+ if (conn->version < 5) {
+ if ((ret = __repmgr_v4connect_reject_unmarshal(env,
+ &v4reject, dbt->data, dbt->size, NULL)) != 0)
+ return (DB_REP_UNAVAIL);
+ reject.version = v4reject.version;
+ reject.gen = v4reject.gen;
+ reject.status = 0;
+ } else {
+ if ((ret = __repmgr_connect_reject_unmarshal(env,
+ &reject, dbt->data, dbt->size, NULL)) != 0)
+ return (DB_REP_UNAVAIL);
+ }
/*
* If we're being rejected by someone who has more up-to-date
- * membership information than we do, it means we have been
- * removed from the group. If we've just gotten started, we can
- * make one attempt at automatically rejoining; otherwise we bow
- * out gracefully.
+ * membership information than we do, it means we are not in
+ * the group. If we've just gotten started, or our status is
+ * adding, we can make one attempt at automatically rejoining;
+ * otherwise we bow out gracefully.
*/
RPRINT(env, (env, DB_VERB_REPMGR_MISC,
- "got rejection msg citing version %lu/%lu",
- (u_long)reject.gen, (u_long)reject.version));
+ "got rejection msg citing version %lu/%lu mine %lu/%lu membership %lu",
+ (u_long)reject.gen, (u_long)reject.version,
+ (u_long)db_rep->member_version_gen,
+ (u_long)db_rep->membership_version,
+ (u_long)reject.status));
if (__repmgr_gmdb_version_cmp(env,
reject.gen, reject.version) > 0) {
- if (env->rep_handle->seen_repmsg)
+ if (db_rep->seen_repmsg && reject.status != SITE_ADDING)
ret = DB_DELETED;
- else if ((ret = __repmgr_defer_op(env,
- REPMGR_REJOIN)) == 0)
- ret = DB_REP_UNAVAIL;
+ else {
+ /*
+ * If 2SITE_STRICT is off, we are likely to
+ * win an election with our own vote before
+ * discovering there is already a master.
+ * Set indicator to defer the election until
+ * after rejoining group.
+ *
+ * In preferred master mode, either site
+ * should defer the election (which
+ * executes the preferred master startup
+ * code and only calls an election if it is
+ * safe) and also avoid scheduling an extra
+ * reconnect attempt in bust_connection()
+ * by setting the indicator.
+ */
+ if (!FLD_ISSET(db_rep->region->config,
+ REP_C_2SITE_STRICT) ||
+ IS_PREFMAS_MODE(env))
+ db_rep->rejoin_pending = TRUE;
+ if ((ret = __repmgr_defer_op(env,
+ REPMGR_REJOIN)) == 0)
+ ret = DB_REP_UNAVAIL;
+ }
} else
ret = DB_REP_UNAVAIL;
DB_ASSERT(env, ret != 0);
@@ -1332,7 +1860,6 @@ process_own_msg(env, conn)
if ((ret = __repmgr_parm_refresh_unmarshal(env,
&parms, dbt->data, dbt->size, NULL)) != 0)
return (DB_REP_UNAVAIL);
- db_rep = env->rep_handle;
DB_ASSERT(env, conn->type == REP_CONNECTION &&
IS_KNOWN_REMOTE_SITE(conn->eid));
site = SITE_FROM_EID(conn->eid);
@@ -1348,8 +1875,15 @@ process_own_msg(env, conn)
case REPMGR_GM_FORWARD:
case REPMGR_JOIN_REQUEST:
case REPMGR_JOIN_SUCCESS:
+ case REPMGR_LSNHIST_REQUEST:
+ case REPMGR_LSNHIST_RESPONSE:
+ case REPMGR_PREFMAS_FAILURE:
+ case REPMGR_PREFMAS_SUCCESS:
+ case REPMGR_READONLY_MASTER:
+ case REPMGR_READONLY_RESPONSE:
case REPMGR_REMOVE_REQUEST:
case REPMGR_RESOLVE_LIMBO:
+ case REPMGR_RESTART_CLIENT:
default:
__db_errx(env, DB_STR_A("3677",
"unexpected msg type %lu in process_own_msg", "%lu"),
@@ -1482,6 +2016,8 @@ __repmgr_send_handshake(env, conn, opt, optlen, flags)
cntrl_len = __REPMGR_V3HANDSHAKE_SIZE;
break;
case 4:
+ case 5:
+ case 6:
cntrl_len = __REPMGR_HANDSHAKE_SIZE;
break;
default:
@@ -1513,6 +2049,8 @@ __repmgr_send_handshake(env, conn, opt, optlen, flags)
__repmgr_v3handshake_marshal(env, &v3hs, p);
break;
case 4:
+ case 5:
+ case 6:
hs.port = my_addr->port;
hs.alignment = MEM_ALIGN;
hs.ack_policy = (u_int32_t)rep->perm_policy;
@@ -1551,11 +2089,14 @@ read_version_response(env, conn)
DB_REP *db_rep;
__repmgr_version_confirmation_args conf;
DBT vi;
+ REP *rep;
+ REPMGR_SITE *site;
char *hostname;
u_int32_t flags;
- int ret;
+ int ret, subord;
db_rep = env->rep_handle;
+ rep = db_rep->region;
if ((ret = __repmgr_find_version_info(env, conn, &vi)) != 0)
return (ret);
@@ -1581,14 +2122,37 @@ read_version_response(env, conn)
return (DB_REP_UNAVAIL);
}
- if ((ret = accept_handshake(env, conn, hostname)) != 0)
+ if ((ret = accept_handshake(env, conn, hostname, &subord)) != 0)
return (ret);
- flags = IS_SUBORDINATE(db_rep) ? REPMGR_SUBORDINATE : 0;
+ if (!IS_SUBORDINATE(db_rep))
+ flags = 0;
+ else {
+ flags = REPMGR_SUBORDINATE;
+ if (FLD_ISSET(rep->config, REP_C_AUTOTAKEOVER) &&
+ db_rep->repmgr_status == running)
+ /*
+ * Takeover is enabled in rep-aware subordinate
+ * process.
+ */
+ flags |= REPMGR_AUTOTAKEOVER;
+ }
if ((ret = __repmgr_send_handshake(env,
conn, NULL, 0, flags)) != 0)
return (ret);
}
conn->state = CONN_READY;
+ site = SITE_FROM_EID(conn->eid);
+ /*
+ * Do not increase sites_avail redundantly for a new outgoing
+ * connection from a subordinate process.
+ */
+ if (conn->type == REP_CONNECTION &&
+ site->state == SITE_CONNECTED && !IS_SUBORDINATE(db_rep)) {
+ rep->sites_avail++;
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "vers_resp: EID %lu CONNECTED, READY. sites_avail %lu",
+ (u_long)conn->eid, (u_long)rep->sites_avail));
+ }
return (ret);
}
@@ -1641,10 +2205,11 @@ __repmgr_find_version_info(env, conn, vi)
}
static int
-accept_handshake(env, conn, hostname)
+accept_handshake(env, conn, hostname, subordinate)
ENV *env;
REPMGR_CONNECTION *conn;
char *hostname;
+ int *subordinate;
{
__repmgr_handshake_args hs;
__repmgr_v2handshake_args hs2;
@@ -1653,6 +2218,7 @@ accept_handshake(env, conn, hostname)
u_int32_t ack, flags;
int electable;
+ *subordinate = 0;
switch (conn->version) {
case 2:
if (__repmgr_v2handshake_unmarshal(env, &hs2,
@@ -1674,6 +2240,8 @@ accept_handshake(env, conn, hostname)
ack = 0;
break;
case 4:
+ case 5:
+ case 6:
if (__repmgr_handshake_unmarshal(env, &hs,
conn->input.repmgr_msg.cntrl.data,
conn->input.repmgr_msg.cntrl.size, NULL) != 0)
@@ -1682,6 +2250,8 @@ accept_handshake(env, conn, hostname)
electable = F_ISSET(&hs, ELECTABLE_SITE);
flags = hs.flags;
ack = hs.ack_policy;
+ if (LF_ISSET(REPMGR_SUBORDINATE))
+ *subordinate = 1;
break;
default:
__db_errx(env, DB_STR_A("3679",
@@ -1729,13 +2299,17 @@ process_parameters(env, conn, host, port, ack, electable, flags)
u_int32_t ack, flags;
{
DB_REP *db_rep;
+ REP *rep;
REPMGR_RETRY *retry;
REPMGR_SITE *site;
+ SITEINFO *sites;
__repmgr_connect_reject_args reject;
+ __repmgr_v4connect_reject_args v4reject;
u_int8_t reject_buf[__REPMGR_CONNECT_REJECT_SIZE];
int eid, ret;
db_rep = env->rep_handle;
+ rep = db_rep->region;
/* Connection state can be used to discern incoming versus outgoing. */
if (conn->state == CONN_CONNECTED) {
@@ -1785,6 +2359,13 @@ process_parameters(env, conn, host, port, ack, electable, flags)
TAILQ_INSERT_TAIL(&site->sub_conns,
conn, entries);
conn->eid = eid;
+ conn->auto_takeover =
+ LF_ISSET(REPMGR_AUTOTAKEOVER) ? 1 : 0;
+ SET_LISTENER_CAND(conn->auto_takeover, ++);
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "handshake from subordinate %sconnection at site %s:%u EID %u",
+ LF_ISSET(REPMGR_AUTOTAKEOVER)?
+ "takeover ": "", host, port, eid));
} else {
DB_EVENT(env,
DB_EVENT_REP_CONNECT_ESTD, &eid);
@@ -1797,6 +2378,7 @@ process_parameters(env, conn, host, port, ack, electable, flags)
TAILQ_REMOVE(&db_rep->retries,
retry, entries);
__os_free(env, retry);
+ site->ref.retry = NULL;
break;
case SITE_CONNECTED:
/*
@@ -1821,6 +2403,16 @@ process_parameters(env, conn, host, port, ack, electable, flags)
* don't have to do anything else here.
*/
break;
+ case SITE_IDLE:
+ /*
+ * This can occur after the heartbeat
+ * test hook artificially kept this
+ * site from first trying to connect.
+ */
+ RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+ "handshake from idle site %s:%u EID %u",
+ host, port, eid));
+ break;
default:
DB_ASSERT(env, FALSE);
}
@@ -1834,10 +2426,18 @@ process_parameters(env, conn, host, port, ack, electable, flags)
RPRINT(env, (env, DB_VERB_REPMGR_MISC,
"rejecting connection from unknown or provisional site %s:%u",
host, port));
- reject.version = db_rep->membership_version;
- reject.gen = db_rep->member_version_gen;
- __repmgr_connect_reject_marshal(env,
- &reject, reject_buf);
+ if (conn->version < 5) {
+ v4reject.version = db_rep->membership_version;
+ v4reject.gen = db_rep->member_version_gen;
+ __repmgr_v4connect_reject_marshal(env,
+ &v4reject, reject_buf);
+ } else {
+ reject.version = db_rep->membership_version;
+ reject.gen = db_rep->member_version_gen;
+ reject.status = (site) ? site->membership : 0;
+ __repmgr_connect_reject_marshal(env,
+ &reject, reject_buf);
+ }
if ((ret = __repmgr_send_own_msg(env, conn,
REPMGR_CONNECT_REJECT, reject_buf,
@@ -1867,7 +2467,8 @@ process_parameters(env, conn, host, port, ack, electable, flags)
*/
if (!IS_SUBORDINATE(db_rep) && /* us */
!__repmgr_master_is_known(env) &&
- !LF_ISSET(REPMGR_SUBORDINATE)) { /* the remote site */
+ !LF_ISSET(REPMGR_SUBORDINATE) && /* the remote site */
+ !IS_PREFMAS_MODE(env)) {
RPRINT(env, (env, DB_VERB_REPMGR_MISC,
"handshake with no known master to wake election thread"));
db_rep->new_connection = TRUE;
@@ -1980,6 +2581,7 @@ record_permlsn(env, conn)
*/
if (ackp->lsn.file > site->max_ack.file)
do_log_check = 1;
+ site->max_ack_gen = ackp->generation;
memcpy(&site->max_ack, &ackp->lsn, sizeof(DB_LSN));
if (do_log_check)
check_min_log_file(env);
diff --git a/src/repmgr/repmgr_stat.c b/src/repmgr/repmgr_stat.c
index fd6dabd3..215f4719 100644
--- a/src/repmgr/repmgr_stat.c
+++ b/src/repmgr/repmgr_stat.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -55,7 +55,9 @@ __repmgr_stat(env, statp, flags)
{
DB_REP *db_rep;
DB_REPMGR_STAT *copy, *stats;
- uintmax_t tmp;
+ REPMGR_SITE *site;
+ u_int32_t tmp;
+ u_int i;
int ret;
db_rep = env->rep_handle;
@@ -73,6 +75,20 @@ __repmgr_stat(env, statp, flags)
memset(stats, 0, sizeof(DB_REPMGR_STAT));
stats->st_max_elect_threads = tmp;
}
+ stats->st_incoming_queue_gbytes = db_rep->input_queue.gbytes;
+ stats->st_incoming_queue_bytes = db_rep->input_queue.bytes;
+ LOCK_MUTEX(db_rep->mutex);
+ for (i = 0; i < db_rep->site_cnt; i++) {
+ site = SITE_FROM_EID(i);
+ if (site->membership != 0) {
+ copy->st_site_total++;
+ if (FLD_ISSET(site->gmdb_flags, SITE_VIEW))
+ copy->st_site_views++;
+ else
+ copy->st_site_participants++;
+ }
+ }
+ UNLOCK_MUTEX(db_rep->mutex);
*statp = copy;
return (0);
@@ -148,6 +164,11 @@ __repmgr_print_stats(env, flags)
(u_long)sp->st_msgs_queued);
__db_dl(env, "Number of messages discarded due to queue length",
(u_long)sp->st_msgs_dropped);
+ __db_dlbytes(env, "Incoming message size in queue",
+ (u_long)sp->st_incoming_queue_gbytes, (u_long)0,
+ (u_long)sp->st_incoming_queue_bytes);
+ __db_dl(env, "Number of messages discarded due to incoming queue full",
+ (u_long)sp->st_incoming_msgs_dropped);
__db_dl(env, "Number of existing connections dropped",
(u_long)sp->st_connection_drop);
__db_dl(env, "Number of failed new connection attempts",
@@ -156,6 +177,14 @@ __repmgr_print_stats(env, flags)
(u_long)sp->st_elect_threads);
__db_dl(env, "Election threads for which space is reserved",
(u_long)sp->st_max_elect_threads);
+ __db_dl(env, "Number of participant sites in replication group",
+ (u_long)sp->st_site_participants);
+ __db_dl(env, "Total number of sites in replication group",
+ (u_long)sp->st_site_total);
+ __db_dl(env, "Number of view sites in replication group",
+ (u_long)sp->st_site_views);
+ __db_dl(env, "Number of automatic replication process takeovers",
+ (u_long)sp->st_takeovers);
__os_ufree(env, sp);
@@ -171,7 +200,7 @@ __repmgr_print_sites(env)
u_int count, i;
int ret;
- if ((ret = __repmgr_site_list(env->dbenv, &count, &list)) != 0)
+ if ((ret = __repmgr_site_list_int(env, &count, &list)) != 0)
return (ret);
if (count == 0)
@@ -189,6 +218,9 @@ __repmgr_print_sites(env)
list[i].status == DB_REPMGR_CONNECTED ? "" : "dis");
__db_msgadd(env, &mb, ", %speer",
F_ISSET(&list[i], DB_REPMGR_ISPEER) ? "" : "non-");
+ __db_msgadd(env, &mb, ", %s",
+ F_ISSET(&list[i], DB_REPMGR_ISVIEW) ?
+ "view" : "participant");
__db_msgadd(env, &mb, ")");
DB_MSGBUF_FLUSH(env, &mb);
}
@@ -238,26 +270,46 @@ __repmgr_stat_print_pp(dbenv, flags)
#endif
/*
- * PUBLIC: int __repmgr_site_list __P((DB_ENV *, u_int *, DB_REPMGR_SITE **));
+ * PUBLIC: int __repmgr_site_list_pp
+ * PUBLIC: __P((DB_ENV *, u_int *, DB_REPMGR_SITE **));
*/
int
-__repmgr_site_list(dbenv, countp, listp)
+__repmgr_site_list_pp(dbenv, countp, listp)
DB_ENV *dbenv;
u_int *countp;
DB_REPMGR_SITE **listp;
{
- DB_REP *db_rep;
- REP *rep;
- DB_REPMGR_SITE *status;
ENV *env;
DB_THREAD_INFO *ip;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_ENTER(env, ip);
+ ret = __repmgr_site_list_int(env, countp, listp);
+ ENV_LEAVE(env, ip);
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __repmgr_site_list_int __P((ENV *, u_int *, DB_REPMGR_SITE **));
+ */
+int
+__repmgr_site_list_int(env, countp, listp)
+ ENV *env;
+ u_int *countp;
+ DB_REPMGR_SITE **listp;
+{
+ DB_REP *db_rep;
+ DB_REPMGR_SITE *status;
+ REP *rep;
REPMGR_SITE *site;
size_t array_size, total_size;
int eid, locked, ret;
u_int count, i;
char *name;
- env = dbenv->env;
db_rep = env->rep_handle;
ret = 0;
@@ -269,10 +321,8 @@ __repmgr_site_list(dbenv, countp, listp)
LOCK_MUTEX(db_rep->mutex);
locked = TRUE;
- ENV_ENTER(env, ip);
if (rep->siteinfo_seq > db_rep->siteinfo_seq)
ret = __repmgr_sync_siteaddr(env);
- ENV_LEAVE(env, ip);
if (ret != 0)
goto err;
} else {
@@ -329,6 +379,8 @@ __repmgr_site_list(dbenv, countp, listp)
if (FLD_ISSET(site->config, DB_REPMGR_PEER))
F_SET(&status[i], DB_REPMGR_ISPEER);
+ if (FLD_ISSET(site->gmdb_flags, SITE_VIEW))
+ F_SET(&status[i], DB_REPMGR_ISVIEW);
/*
* If we haven't started a communications thread, connection
diff --git a/src/repmgr/repmgr_stub.c b/src/repmgr/repmgr_stub.c
index 734c2240..999b759f 100644
--- a/src/repmgr/repmgr_stub.c
+++ b/src/repmgr/repmgr_stub.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -73,6 +73,69 @@ __repmgr_set_ack_policy(dbenv, policy)
/*
* PUBLIC: #ifndef HAVE_REPLICATION_THREADS
+ * PUBLIC: int __repmgr_get_incoming_queue_max __P((DB_ENV *, u_int32_t *,
+ * PUBLIC: u_int32_t *));
+ * PUBLIC: #endif
+ */
+int
+__repmgr_get_incoming_queue_max(dbenv, messagesp, bulk_messagesp)
+ DB_ENV *dbenv;
+ u_int32_t *messagesp;
+ u_int32_t *bulk_messagesp;
+{
+ COMPQUIET(messagesp, NULL);
+ COMPQUIET(bulk_messagesp, NULL);
+ return (__db_norepmgr(dbenv));
+}
+
+/*
+ * PUBLIC: #ifndef HAVE_REPLICATION_THREADS
+ * PUBLIC: int __repmgr_set_incoming_queue_max __P((DB_ENV *, u_int32_t,
+ * PUBLIC: u_int32_t));
+ * PUBLIC: #endif
+ */
+int
+__repmgr_set_incoming_queue_max(dbenv, messages, bulk_messages)
+ DB_ENV *dbenv;
+ u_int32_t messages;
+ u_int32_t bulk_messages;
+{
+ COMPQUIET(messages, 0);
+ COMPQUIET(bulk_messages, 0);
+ return (__db_norepmgr(dbenv));
+}
+
+/*
+ * PUBLIC: #ifndef HAVE_REPLICATION_THREADS
+ * PUBLIC: int __repmgr_get_incoming_queue_redzone __P((DB_ENV *,
+ * PUBLIC: u_int32_t *, u_int32_t *));
+ * PUBLIC: #endif
+ */
+int __repmgr_get_incoming_queue_redzone(dbenv, gbytesp, bytesp)
+ DB_ENV *dbenv;
+ u_int32_t *gbytesp, *bytesp;
+{
+ COMPQUIET(gbytesp, NULL);
+ COMPQUIET(bytesp, NULL);
+ return (__db_norepmgr(dbenv));
+}
+
+/*
+ * PUBLIC: #ifndef HAVE_REPLICATION_THREADS
+ * PUBLIC: int __repmgr_get_incoming_queue_fullevent __P((DB_ENV *,
+ * PUBLIC: int *));
+ * PUBLIC: #endif
+ */
+int __repmgr_get_incoming_queue_fullevent(dbenv, onoffp)
+ DB_ENV *dbenv;
+ int *onoffp;
+{
+ COMPQUIET(onoffp, NULL);
+ return (__db_norepmgr(dbenv));
+}
+
+/*
+ * PUBLIC: #ifndef HAVE_REPLICATION_THREADS
* PUBLIC: int __repmgr_site
* PUBLIC: __P((DB_ENV *, const char *, u_int, DB_SITE **, u_int32_t));
* PUBLIC: #endif
@@ -125,11 +188,12 @@ __repmgr_local_site(dbenv, dbsitep)
/*
* PUBLIC: #ifndef HAVE_REPLICATION_THREADS
- * PUBLIC: int __repmgr_site_list __P((DB_ENV *, u_int *, DB_REPMGR_SITE **));
+ * PUBLIC: int __repmgr_site_list_pp
+ * PUBLIC: __P((DB_ENV *, u_int *, DB_REPMGR_SITE **));
* PUBLIC: #endif
*/
int
-__repmgr_site_list(dbenv, countp, listp)
+__repmgr_site_list_pp(dbenv, countp, listp)
DB_ENV *dbenv;
u_int *countp;
DB_REPMGR_SITE **listp;
@@ -141,11 +205,11 @@ __repmgr_site_list(dbenv, countp, listp)
/*
* PUBLIC: #ifndef HAVE_REPLICATION_THREADS
- * PUBLIC: int __repmgr_start __P((DB_ENV *, int, u_int32_t));
+ * PUBLIC: int __repmgr_start_pp __P((DB_ENV *, int, u_int32_t));
* PUBLIC: #endif
*/
int
-__repmgr_start(dbenv, nthreads, flags)
+__repmgr_start_pp(dbenv, nthreads, flags)
DB_ENV *dbenv;
int nthreads;
u_int32_t flags;
diff --git a/src/repmgr/repmgr_util.c b/src/repmgr/repmgr_util.c
index c2439436..1c5ebe59 100644
--- a/src/repmgr/repmgr_util.c
+++ b/src/repmgr/repmgr_util.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -15,9 +15,13 @@
#define INITIAL_SITES_ALLOCATION 3 /* Arbitrary guess. */
+static int convert_gmdb(ENV *, DB_THREAD_INFO *, DB *, DB_TXN *);
static int get_eid __P((ENV *, const char *, u_int, int *));
-static int __repmgr_addrcmp __P((repmgr_netaddr_t *, repmgr_netaddr_t *));
static int read_gmdb __P((ENV *, DB_THREAD_INFO *, u_int8_t **, size_t *));
+static int __repmgr_addrcmp __P((repmgr_netaddr_t *, repmgr_netaddr_t *));
+static int __repmgr_find_commit __P((ENV *, DB_LSN *, DB_LSN *, int *));
+static int __repmgr_remote_lsnhist(ENV *, int, u_int32_t,
+ __repmgr_lsnhist_match_args *);
/*
* Schedules a future attempt to re-establish a connection with the given site.
@@ -43,6 +47,8 @@ __repmgr_schedule_connection_attempt(env, eid, immediate)
REP *rep;
REPMGR_RETRY *retry, *target;
REPMGR_SITE *site;
+ SITEINFO *sites;
+ db_timeout_t timeout;
db_timespec t;
int ret;
@@ -57,7 +63,24 @@ __repmgr_schedule_connection_attempt(env, eid, immediate)
if (immediate)
TAILQ_INSERT_HEAD(&db_rep->retries, retry, entries);
else {
- TIMESPEC_ADD_DB_TIMEOUT(&t, rep->connection_retry_wait);
+ /*
+ * Normally we retry a connection after connection retry
+ * timeout. In a subordinate rep-aware process, we retry sooner
+ * when there is a listener candidate on the disconnected site.
+ * The listener process will be connected from the new listener,
+ * but subordinate rep-aware process can only wait for retry.
+ * It matters when the subordinate process becomes listener and
+ * the disconnected site is master. The m_listener_wait is set
+ * to retry after enough time has passed for a takeover. The
+ * number of listener candidates is maintained in the listener
+ * process as it has connections to all subordinate processes
+ * from other sites.
+ */
+ timeout = rep->connection_retry_wait;
+ CHECK_LISTENER_CAND(timeout, >0, db_rep->m_listener_wait,
+ timeout);
+ TIMESPEC_ADD_DB_TIMEOUT(&t, timeout);
+
/*
* Insert the new "retry" on the (time-ordered) list in its
* proper position. To do so, find the list entry ("target")
@@ -284,6 +307,7 @@ __repmgr_new_site(env, sitep, host, port)
site->net_addr.host = p;
site->net_addr.port = (u_int16_t)port;
+ site->max_ack_gen = 0;
ZERO_LSN(site->max_ack);
site->ack_policy = 0;
site->alignment = 0;
@@ -295,6 +319,7 @@ __repmgr_new_site(env, sitep, host, port)
site->state = SITE_IDLE;
site->membership = 0;
+ site->gmdb_flags = 0;
site->config = 0;
*sitep = site;
@@ -535,11 +560,14 @@ __repmgr_thread_failure(env, why)
int why;
{
DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
db_rep = env->rep_handle;
+ ENV_ENTER(env, ip);
LOCK_MUTEX(db_rep->mutex);
(void)__repmgr_stop_threads(env);
UNLOCK_MUTEX(db_rep->mutex);
+ ENV_LEAVE(env, ip);
return (__env_panic(env, why));
}
@@ -597,12 +625,13 @@ __repmgr_format_addr_loc(addr, buffer)
}
/*
- * PUBLIC: int __repmgr_repstart __P((ENV *, u_int32_t));
+ * PUBLIC: int __repmgr_repstart __P((ENV *, u_int32_t, u_int32_t));
*/
int
-__repmgr_repstart(env, flags)
+__repmgr_repstart(env, flags, startopts)
ENV *env;
u_int32_t flags;
+ u_int32_t startopts;
{
DBT my_addr;
int ret;
@@ -610,7 +639,11 @@ __repmgr_repstart(env, flags)
/* Include "cdata" in case sending to old-version site. */
if ((ret = __repmgr_prepare_my_addr(env, &my_addr)) != 0)
return (ret);
- ret = __rep_start_int(env, &my_addr, flags);
+ /*
+ * force_role_chg and hold_client_gen are used by preferred master
+ * mode to help control site startup.
+ */
+ ret = __rep_start_int(env, &my_addr, flags, startopts);
__os_free(env, my_addr.data);
if (ret != 0)
__db_err(env, ret, DB_STR("3673", "rep_start"));
@@ -618,11 +651,12 @@ __repmgr_repstart(env, flags)
}
/*
- * PUBLIC: int __repmgr_become_master __P((ENV *));
+ * PUBLIC: int __repmgr_become_master __P((ENV *, u_int32_t));
*/
int
-__repmgr_become_master(env)
+__repmgr_become_master(env, startopts)
ENV *env;
+ u_int32_t startopts;
{
DB_REP *db_rep;
DB_THREAD_INFO *ip;
@@ -631,7 +665,7 @@ __repmgr_become_master(env)
REPMGR_SITE *site;
DBT key_dbt, data_dbt;
__repmgr_membership_key_args key;
- __repmgr_membership_data_args member_status;
+ __repmgr_membership_data_args member_data;
repmgr_netaddr_t addr;
u_int32_t status;
u_int8_t data_buf[__REPMGR_MEMBERSHIP_DATA_SIZE];
@@ -668,16 +702,23 @@ __repmgr_become_master(env)
db_rep->client_intent = FALSE;
UNLOCK_MUTEX(db_rep->mutex);
- if ((ret = __repmgr_repstart(env, DB_REP_MASTER)) != 0)
+ if ((ret = __repmgr_repstart(env, DB_REP_MASTER, startopts)) != 0)
return (ret);
+ /*
+ * Make sure member_version_gen is current so that this master
+ * can reject obsolete member lists from other sites.
+ */
+ db_rep->member_version_gen = db_rep->region->gen;
+
+ /* If there is already a gmdb, we are finished. */
if (db_rep->have_gmdb)
return (0);
- db_rep->member_version_gen = db_rep->region->gen;
- ENV_ENTER(env, ip);
+ /* There isn't a gmdb. Create one from the in-memory site list. */
if ((ret = __repmgr_hold_master_role(env, NULL)) != 0)
goto leave;
+ ENV_GET_THREAD_INFO(env, ip);
retry:
if ((ret = __repmgr_setup_gmdb_op(env, ip, &txn, DB_CREATE)) != 0)
goto err;
@@ -705,8 +746,9 @@ retry:
&key, key_buf, sizeof(key_buf), &len);
DB_ASSERT(env, ret == 0);
DB_INIT_DBT(key_dbt, key_buf, len);
- member_status.flags = status;
- __repmgr_membership_data_marshal(env, &member_status, data_buf);
+ member_data.status = status;
+ member_data.flags = site->gmdb_flags;
+ __repmgr_membership_data_marshal(env, &member_data, data_buf);
DB_INIT_DBT(data_dbt, data_buf, __REPMGR_MEMBERSHIP_DATA_SIZE);
if ((ret = __db_put(dbp, ip, txn, &key_dbt, &data_dbt, 0)) != 0)
goto err;
@@ -726,7 +768,6 @@ err:
if ((t_ret = __repmgr_rlse_master_role(env)) != 0 && ret == 0)
ret = t_ret;
leave:
- ENV_LEAVE(env, ip);
return (ret);
}
@@ -840,6 +881,14 @@ __repmgr_open(env, rep_)
rep->election_retry_wait = db_rep->election_retry_wait;
rep->heartbeat_monitor_timeout = db_rep->heartbeat_monitor_timeout;
rep->heartbeat_frequency = db_rep->heartbeat_frequency;
+ rep->inqueue_max_gbytes = db_rep->inqueue_max_gbytes;
+ rep->inqueue_max_bytes = db_rep->inqueue_max_bytes;
+ if (rep->inqueue_max_gbytes == 0 && rep->inqueue_max_bytes == 0) {
+ rep->inqueue_max_bytes = DB_REPMGR_DEFAULT_INQUEUE_MAX;
+ }
+ __repmgr_set_incoming_queue_redzone(rep, rep->inqueue_max_gbytes,
+ rep->inqueue_max_bytes);
+
return (ret);
}
@@ -958,6 +1007,18 @@ __repmgr_join(env, rep_)
}
db_rep->siteinfo_seq = rep->siteinfo_seq;
+	/*
+	 * Update the incoming queue limit settings if necessary: only when
+	 * this handle has a limit configured (either field non-zero) and
+	 * that limit differs from the values currently stored in rep.
+	 * Each field is compared against its own counterpart (gbytes with
+	 * gbytes, bytes with bytes).
+	 */
+	if ((db_rep->inqueue_max_gbytes != 0 ||
+	    db_rep->inqueue_max_bytes != 0) &&
+	    (db_rep->inqueue_max_gbytes != rep->inqueue_max_gbytes ||
+	    db_rep->inqueue_max_bytes != rep->inqueue_max_bytes)) {
+		rep->inqueue_max_gbytes = db_rep->inqueue_max_gbytes;
+		rep->inqueue_max_bytes = db_rep->inqueue_max_bytes;
+		__repmgr_set_incoming_queue_redzone(rep,
+		    rep->inqueue_max_gbytes, rep->inqueue_max_bytes);
+	}
unlock:
MUTEX_UNLOCK(env, rep->mtx_repmgr);
return (ret);
@@ -1073,6 +1134,7 @@ __repmgr_share_netaddrs(env, rep_, start, limit)
shared_array[eid].addr.port = db_rep->sites[i].net_addr.port;
shared_array[eid].config = db_rep->sites[i].config;
shared_array[eid].status = db_rep->sites[i].membership;
+ shared_array[eid].flags = db_rep->sites[i].gmdb_flags;
RPRINT(env, (env, DB_VERB_REPMGR_MISC,
"EID %d is assigned for site %s:%lu",
eid, host, (u_long)shared_array[eid].addr.port));
@@ -1134,6 +1196,7 @@ __repmgr_copy_in_added_sites(env)
site = SITE_FROM_EID(i);
site->config = p->config;
site->membership = p->status;
+ site->gmdb_flags = p->flags;
}
out:
@@ -1266,7 +1329,9 @@ __repmgr_stable_lsn(env, stable_lsn)
db_rep = env->rep_handle;
rep = db_rep->region;
- if (rep->min_log_file != 0 && rep->min_log_file < stable_lsn->file) {
+ LOCK_MUTEX(db_rep->mutex);
+ if (rep->sites_avail != 0 && rep->min_log_file != 0 &&
+ rep->min_log_file < stable_lsn->file) {
/*
* Returning an LSN to be consistent with the rest of the
* log archiving processing. Construct LSN of format
@@ -1276,12 +1341,91 @@ __repmgr_stable_lsn(env, stable_lsn)
stable_lsn->offset = 0;
}
RPRINT(env, (env, DB_VERB_REPMGR_MISC,
- "Repmgr_stable_lsn: Returning stable_lsn[%lu][%lu]",
- (u_long)stable_lsn->file, (u_long)stable_lsn->offset));
+"Repmgr_stable_lsn: Returning stable_lsn[%lu][%lu] sites_avail %lu min_log %lu",
+ (u_long)stable_lsn->file, (u_long)stable_lsn->offset,
+ (u_long)rep->sites_avail, (u_long)rep->min_log_file));
+ UNLOCK_MUTEX(db_rep->mutex);
return (0);
}
/*
+ * PUBLIC: int __repmgr_make_request_conn __P((ENV *,
+ * PUBLIC: repmgr_netaddr_t *, REPMGR_CONNECTION **));
+ */
+int
+__repmgr_make_request_conn(env, addr, connp)
+	ENV *env;
+	repmgr_netaddr_t *addr;
+	REPMGR_CONNECTION **connp;
+{
+	DBT version_dbt;
+	__repmgr_msg_hdr_args hdr;
+	__repmgr_version_confirmation_args confirm;
+	REPMGR_CONNECTION *conn;
+	int connect_err, have_bufs, ret;
+
+	/*
+	 * Connect to addr, mark the connection as an APP_CONNECTION, and
+	 * consume the peer's handshake message so that we learn the message
+	 * version it confirmed.  On success the connection is handed back
+	 * through connp with conn->version filled in; on any failure after
+	 * the connect, the connection is closed and destroyed here and
+	 * connp is left untouched.
+	 */
+	have_bufs = FALSE;
+	ret = __repmgr_connect(env, addr, &conn, &connect_err);
+	if (ret != 0)
+		return (ret);
+	conn->type = APP_CONNECTION;
+
+	/* First read: the fixed-size message header of the handshake. */
+	ret = __repmgr_read_conn(conn);
+	if (ret != 0)
+		goto cleanup;
+
+	/*
+	 * The read above only returns success once the whole header has
+	 * arrived, so decoding it is not expected to fail.
+	 */
+	DB_ASSERT(env, conn->reading_phase == SIZES_PHASE);
+	ret = __repmgr_msg_hdr_unmarshal(env,
+	    &hdr, conn->msg_hdr_buf, __REPMGR_MSG_HDR_SIZE, NULL);
+	DB_ASSERT(env, ret == 0);
+	__repmgr_iovec_init(&conn->iovecs);
+	conn->reading_phase = DATA_PHASE;
+
+	/* Second read: the handshake body, into freshly prepared buffers. */
+	ret = __repmgr_prepare_simple_input(env, conn, &hdr);
+	if (ret != 0)
+		goto cleanup;
+	have_bufs = TRUE;
+
+	ret = __repmgr_read_conn(conn);
+	if (ret != 0)
+		goto cleanup;
+
+	/* Pull the version confirmation out of the handshake. */
+	ret = __repmgr_find_version_info(env, conn, &version_dbt);
+	if (ret != 0)
+		goto cleanup;
+	DB_ASSERT(env, version_dbt.size > 0);
+	ret = __repmgr_version_confirmation_unmarshal(env,
+	    &confirm, version_dbt.data, version_dbt.size, NULL);
+	if (ret != 0)
+		goto cleanup;
+
+	/*
+	 * Refuse a peer whose version is below the minimum required for
+	 * group membership, or below the view / preferred master minimum
+	 * when the corresponding feature is in use locally.
+	 */
+	if (confirm.version < GM_MIN_VERSION)
+		ret = DB_REP_UNAVAIL;
+	else if (IS_VIEW_SITE(env) && confirm.version < VIEW_MIN_VERSION)
+		ret = DB_REP_UNAVAIL;
+	else if (PREFMAS_IS_SET(env) &&
+	    confirm.version < PREFMAS_MIN_VERSION)
+		ret = DB_REP_UNAVAIL;
+	else
+		conn->version = confirm.version;
+
+cleanup:
+	/* The handshake input buffers are no longer needed either way. */
+	if (have_bufs) {
+		DB_ASSERT(env, conn->input.repmgr_msg.cntrl.size > 0);
+		__os_free(env, conn->input.repmgr_msg.cntrl.data);
+		DB_ASSERT(env, conn->input.repmgr_msg.rec.size > 0);
+		__os_free(env, conn->input.repmgr_msg.rec.data);
+	}
+	__repmgr_reset_for_reading(conn);
+	if (ret != 0) {
+		(void)__repmgr_close_connection(env, conn);
+		(void)__repmgr_destroy_conn(env, conn);
+		return (ret);
+	}
+	*connp = conn;
+	return (0);
+}
+
+/*
* PUBLIC: int __repmgr_send_sync_msg __P((ENV *, REPMGR_CONNECTION *,
* PUBLIC: u_int32_t, u_int8_t *, u_int32_t));
*/
@@ -1311,15 +1455,511 @@ __repmgr_send_sync_msg(env, conn, type, buf, len)
}
/*
+ * Reads a whole message, when we expect to get a REPMGR_OWN_MSG.
+ */
+/*
+ * PUBLIC: int __repmgr_read_own_msg __P((ENV *, REPMGR_CONNECTION *,
+ * PUBLIC: u_int32_t *, u_int8_t **, size_t *));
+ *
+ * Reads one complete message from conn, which must be a REPMGR_OWN_MSG;
+ * any other message type yields DB_REP_UNAVAIL.
+ *
+ * On success *typep is the own-message type and *lenp the payload size.
+ * When the payload is non-empty, *bufp is a buffer obtained from
+ * __os_malloc that the caller must release with __os_free.  When the
+ * payload is empty, or on any error, *bufp is NULL, so callers can free
+ * it conditionally without having to pre-initialize their pointer.
+ */
+int
+__repmgr_read_own_msg(env, conn, typep, bufp, lenp)
+	ENV *env;
+	REPMGR_CONNECTION *conn;
+	u_int32_t *typep;
+	u_int8_t **bufp;
+	size_t *lenp;
+{
+	__repmgr_msg_hdr_args msg_hdr;
+	u_int8_t *buf;
+	u_int32_t type;
+	size_t size;
+	int ret;
+
+	/* Never leave the caller's buffer pointer indeterminate. */
+	*bufp = NULL;
+
+	/* Read and decode the fixed-size message header. */
+	__repmgr_reset_for_reading(conn);
+	if ((ret = __repmgr_read_conn(conn)) != 0)
+		goto err;
+	ret = __repmgr_msg_hdr_unmarshal(env, &msg_hdr,
+	    conn->msg_hdr_buf, __REPMGR_MSG_HDR_SIZE, NULL);
+	DB_ASSERT(env, ret == 0);
+
+	if ((conn->msg_type = msg_hdr.type) != REPMGR_OWN_MSG) {
+		ret = DB_REP_UNAVAIL;	/* Protocol violation. */
+		goto err;
+	}
+	type = REPMGR_OWN_MSG_TYPE(msg_hdr);
+	if ((size = (size_t)REPMGR_OWN_BUF_SIZE(msg_hdr)) > 0) {
+		/* Read the payload straight into a caller-owned buffer. */
+		conn->reading_phase = DATA_PHASE;
+		__repmgr_iovec_init(&conn->iovecs);
+
+		if ((ret = __os_malloc(env, size, &buf)) != 0)
+			goto err;
+		conn->input.rep_message = NULL;
+
+		__repmgr_add_buffer(&conn->iovecs, buf, size);
+		if ((ret = __repmgr_read_conn(conn)) != 0) {
+			__os_free(env, buf);
+			goto err;
+		}
+		*bufp = buf;
+	}
+
+	*typep = type;
+	*lenp = size;
+
+err:
+	return (ret);
+}
+
+/*
+ * Reports whether this site currently has a connection to the other site
+ * of a preferred master replication group.  Returns TRUE when the other
+ * site's state is SITE_CONNECTED or when either its incoming or outgoing
+ * connection is in a ready state; FALSE otherwise, including when this
+ * site is not in preferred master mode or EID 1 is not a known remote
+ * site.
+ *
+ * PUBLIC: int __repmgr_prefmas_connected __P((ENV *));
+ */
+int
+__repmgr_prefmas_connected(env)
+	ENV *env;
+{
+	DB_REP *db_rep;
+	REPMGR_CONNECTION *link;
+	REPMGR_SITE *peer;
+
+	db_rep = env->rep_handle;
+
+	/*
+	 * A preferred master group consists of exactly two sites, which is
+	 * why the other site can always be looked up as EID 1.
+	 */
+	if (IS_PREFMAS_MODE(env) && IS_KNOWN_REMOTE_SITE(1)) {
+		peer = SITE_FROM_EID(1);
+		if (peer->state == SITE_CONNECTED)
+			return (TRUE);
+
+		/* Otherwise a ready connection in either direction counts. */
+		link = peer->ref.conn.in;
+		if (link != NULL && IS_READY_STATE(link->state))
+			return (TRUE);
+		link = peer->ref.conn.out;
+		if (link != NULL && IS_READY_STATE(link->state))
+			return (TRUE);
+	}
+
+	return (FALSE);
+}
+
+/*
+ * Used by a preferred master site to restart the remote temporary master
+ * site as a client.  This is used to help guarantee that the preferred master
+ * site's transactions are never rolled back.
+ *
+ * Does nothing and returns 0 when this site is not in preferred master
+ * mode.  Otherwise opens a dedicated request connection to site eid, sends
+ * REPMGR_RESTART_CLIENT and waits for the reply; any reply other than
+ * REPMGR_PREFMAS_SUCCESS is reported as DB_REP_UNAVAIL.  The connection
+ * and the reply buffer are always released before returning.
+ *
+ * PUBLIC: int __repmgr_restart_site_as_client __P((ENV *, int));
+ */
+int
+__repmgr_restart_site_as_client(env, eid)
+	ENV *env;
+	int eid;
+{
+	DB_REP *db_rep;
+	REPMGR_CONNECTION *conn;
+	repmgr_netaddr_t addr;
+	u_int32_t type;
+	size_t len;
+	u_int8_t any_value, *response_buf;
+	int ret, t_ret;
+
+	COMPQUIET(any_value, 0);
+	db_rep = env->rep_handle;
+	conn = NULL;
+	response_buf = NULL;
+
+	if (!IS_PREFMAS_MODE(env))
+		return (0);
+
+	/* Copy the site's address while holding the mutex. */
+	LOCK_MUTEX(db_rep->mutex);
+	addr = SITE_FROM_EID(eid)->net_addr;
+	UNLOCK_MUTEX(db_rep->mutex);
+	if ((ret = __repmgr_make_request_conn(env, &addr, &conn)) != 0)
+		return (ret);
+
+	/*
+	 * No payload needed, but must send at least a dummy byte for the
+	 * other side to recognize that a message has arrived.
+	 */
+	if ((ret = __repmgr_send_sync_msg(env, conn,
+	    REPMGR_RESTART_CLIENT, VOID_STAR_CAST &any_value, 1)) != 0)
+		goto err;
+
+	if ((ret = __repmgr_read_own_msg(env,
+	    conn, &type, &response_buf, &len)) != 0)
+		goto err;
+	if (type != REPMGR_PREFMAS_SUCCESS) {
+		RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+		    "restart_site_as_client got unexpected message type %d",
+		    type));
+		ret = DB_REP_UNAVAIL; /* Invalid response: protocol violation */
+	}
+err:
+	/*
+	 * Tear down the connection.  A cleanup failure is reported only
+	 * when nothing failed earlier, so the first error is the one the
+	 * caller sees.
+	 */
+	if (conn != NULL) {
+		if ((t_ret = __repmgr_close_connection(env,
+		    conn)) != 0 && ret == 0)
+			ret = t_ret;
+		if ((t_ret = __repmgr_destroy_conn(env,
+		    conn)) != 0 && ret == 0)
+			ret = t_ret;
+	}
+	/* The reply payload is not used, but we own it and must free it. */
+	if (response_buf != NULL)
+		__os_free(env, response_buf);
+	return (ret);
+}
+
+/*
+ * Used by a preferred master site to make the remote temporary master
+ * site a readonly master.  This is used to help preserve all temporary
+ * master transactions.
+ *
+ * *gen and *sync_lsnp are cleared on entry.  When this site is not in
+ * preferred master mode the function returns 0 without contacting the
+ * remote site.  Otherwise it sends REPMGR_READONLY_MASTER to site eid on
+ * a dedicated request connection; a REPMGR_READONLY_RESPONSE reply
+ * supplies the generation and LSN returned through gen and sync_lsnp,
+ * and any other reply type is reported as DB_REP_UNAVAIL.
+ *
+ * PUBLIC: int __repmgr_make_site_readonly_master __P((ENV *, int,
+ * PUBLIC: u_int32_t *, DB_LSN *));
+ */
+int
+__repmgr_make_site_readonly_master(env, eid, gen, sync_lsnp)
+	ENV *env;
+	int eid;
+	u_int32_t *gen;
+	DB_LSN *sync_lsnp;
+{
+	DB_REP *db_rep;
+	REPMGR_CONNECTION *conn;
+	repmgr_netaddr_t addr;
+	__repmgr_permlsn_args permlsn;
+	u_int32_t type;
+	size_t len;
+	u_int8_t any_value, *response_buf;
+	int ret, t_ret;
+
+	COMPQUIET(any_value, 0);
+	db_rep = env->rep_handle;
+	conn = NULL;
+	response_buf = NULL;
+	*gen = 0;
+	ZERO_LSN(*sync_lsnp);
+
+	if (!IS_PREFMAS_MODE(env))
+		return (0);
+
+	/* Copy the site's address while holding the mutex. */
+	LOCK_MUTEX(db_rep->mutex);
+	addr = SITE_FROM_EID(eid)->net_addr;
+	UNLOCK_MUTEX(db_rep->mutex);
+	if ((ret = __repmgr_make_request_conn(env, &addr, &conn)) != 0)
+		return (ret);
+
+	/*
+	 * No payload needed, but must send at least a dummy byte for the
+	 * other side to recognize that a message has arrived.
+	 */
+	if ((ret = __repmgr_send_sync_msg(env, conn,
+	    REPMGR_READONLY_MASTER, VOID_STAR_CAST &any_value, 1)) != 0)
+		goto err;
+
+	if ((ret = __repmgr_read_own_msg(env,
+	    conn, &type, &response_buf, &len)) != 0)
+		goto err;
+
+	if (type == REPMGR_READONLY_RESPONSE) {
+		if ((ret = __repmgr_permlsn_unmarshal(env,
+		    &permlsn, response_buf, len, NULL)) != 0)
+			goto err;
+		*gen = permlsn.generation;
+		*sync_lsnp = permlsn.lsn;
+	} else {
+		RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+		    "make_site_readonly_master got unexpected message type %d",
+		    type));
+		ret = DB_REP_UNAVAIL; /* Invalid response: protocol violation */
+	}
+
+err:
+	/*
+	 * Tear down the connection.  A cleanup failure is reported only
+	 * when nothing failed earlier, so the first error is the one the
+	 * caller sees.
+	 */
+	if (conn != NULL) {
+		if ((t_ret = __repmgr_close_connection(env,
+		    conn)) != 0 && ret == 0)
+			ret = t_ret;
+		if ((t_ret = __repmgr_destroy_conn(env,
+		    conn)) != 0 && ret == 0)
+			ret = t_ret;
+	}
+	if (response_buf != NULL)
+		__os_free(env, response_buf);
+	return (ret);
+}
+
+/*
+ * Used by a preferred master site to perform the LSN history comparisons to
+ * determine whether there are continuous or conflicting sets of
+ * transactions between this site and the remote temporary master.
+ *
+ * *match is set to TRUE only when the histories are found to be
+ * continuous; it is FALSE in every other case, including when this site
+ * is not in preferred master mode.  A DB_REP_UNAVAIL result from the
+ * remote query is not treated as an error: the function then returns 0
+ * with *match FALSE.
+ *
+ * PUBLIC: int __repmgr_lsnhist_match __P((ENV *,
+ * PUBLIC: DB_THREAD_INFO *, int, int *));
+ */
+int
+__repmgr_lsnhist_match(env, ip, eid, match)
+	ENV *env;
+	DB_THREAD_INFO *ip;
+	int eid;
+	int *match;
+{
+	DB_REP *db_rep;
+	REP *rep;
+	__rep_lsn_hist_data_args my_lsnhist;
+	__repmgr_lsnhist_match_args remote_lsnhist;
+	u_int32_t my_gen;
+	int found_commit, ret;
+
+	db_rep = env->rep_handle;
+	rep = db_rep->region;
+	*match = FALSE;
+	my_gen = rep->gen;
+	found_commit = FALSE;
+	/*
+	 * The remote query can fail, or return without filling in its
+	 * result; start from zeroes so the diagnostics and the comparison
+	 * below never read indeterminate values.
+	 */
+	memset(&remote_lsnhist, 0, sizeof(remote_lsnhist));
+
+	if (!IS_PREFMAS_MODE(env))
+		return (0);
+
+	/* Get local LSN history information for comparison. */
+	if ((ret = __rep_get_lsnhist_data(env, ip, my_gen, &my_lsnhist)) != 0)
+		return (ret);
+
+	/* Get remote LSN history information for comparison. */
+	ret = __repmgr_remote_lsnhist(env, eid, my_gen, &remote_lsnhist);
+
+	/*
+	 * If the current gen doesn't exist at the remote site, the match
+	 * fails.
+	 *
+	 * If the remote LSN or timestamp at the current gen doesn't match
+	 * ours, we probably had a whack-a-mole situation where each site
+	 * was up and down in isolation one or more times and the match fails.
+	 *
+	 * If the remote LSN for the next generation is lower than this
+	 * site's startup LSN and there are any commit operations between
+	 * these LSNs, there are conflicting sets of transactions and the
+	 * match fails.
+	 */
+	RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+	    "lsnhist_match my_lsn [%lu][%lu] remote_lsn [%lu][%lu]",
+	    (u_long)my_lsnhist.lsn.file, (u_long)my_lsnhist.lsn.offset,
+	    (u_long)remote_lsnhist.lsn.file,
+	    (u_long)remote_lsnhist.lsn.offset));
+	RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+	    "lsnhist_match my_time %lu:%lu remote_time %lu:%lu",
+	    (u_long)my_lsnhist.hist_sec, (u_long)my_lsnhist.hist_nsec,
+	    (u_long)remote_lsnhist.hist_sec, (u_long)remote_lsnhist.hist_nsec));
+	RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+	    "lsnhist_match pminit_lsn [%lu][%lu] next_gen_lsn [%lu][%lu]",
+	    (u_long)db_rep->prefmas_init_lsn.file,
+	    (u_long)db_rep->prefmas_init_lsn.offset,
+	    (u_long)remote_lsnhist.next_gen_lsn.file,
+	    (u_long)remote_lsnhist.next_gen_lsn.offset));
+	/* Only compare when the remote history was actually retrieved. */
+	if (ret == 0 &&
+	    LOG_COMPARE(&my_lsnhist.lsn, &remote_lsnhist.lsn) == 0 &&
+	    my_lsnhist.hist_sec == remote_lsnhist.hist_sec &&
+	    my_lsnhist.hist_nsec == remote_lsnhist.hist_nsec) {
+		/*
+		 * If the remote site doesn't yet have the next gen or if
+		 * our startup LSN is <= the remote next gen LSN, we
+		 * have a match.
+		 *
+		 * Otherwise, our startup LSN is higher than the remote
+		 * next gen LSN.  If we have any commit operations between
+		 * these two LSNs, we have preferred master operations we
+		 * must preserve and there is not a match.  But if we just
+		 * have uncommitted operations between these LSNs it doesn't
+		 * matter if they are rolled back, so we call it a match and
+		 * try to retain temporary master transactions if possible.
+		 */
+		if (IS_ZERO_LSN(remote_lsnhist.next_gen_lsn) ||
+		    LOG_COMPARE(&db_rep->prefmas_init_lsn,
+		    &remote_lsnhist.next_gen_lsn) <= 0)
+			*match = TRUE;
+		else if ((ret = __repmgr_find_commit(env,
+		    &remote_lsnhist.next_gen_lsn,
+		    &db_rep->prefmas_init_lsn, &found_commit)) == 0 &&
+		    !found_commit) {
+			RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+			    "lsnhist_match !found_commit set match TRUE"));
+			*match = TRUE;
+		}
+	}
+
+	/* Don't return an error if current gen didn't exist at remote site. */
+	if (ret == DB_REP_UNAVAIL)
+		ret = 0;
+	RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+	    "lsnhist_match match %d returning %d", *match, ret));
+	return (ret);
+}
+
+/*
+ * Scans the log records starting at low_lsn, continuing while the record
+ * LSN is <= high_lsn, looking for a transaction commit.  *found_commit is
+ * TRUE on return only if a __txn_regop record with opcode TXN_COMMIT was
+ * seen.
+ *
+ * NOTE(review): a failure to position the cursor on low_lsn is not
+ * reported; the function then returns 0 with *found_commit FALSE.  An
+ * error from stepping to the next record (which presumably includes
+ * running off the end of the log) is returned to the caller.  Confirm
+ * both are intended.
+ */
+static int
+__repmgr_find_commit(env, low_lsn, high_lsn, found_commit)
+	ENV *env;
+	DB_LSN *low_lsn;
+	DB_LSN *high_lsn;
+	int *found_commit;
+{
+	DB_LOGC *logc;
+	DB_LSN cur_lsn;
+	DBT logrec;
+	__txn_regop_args *regop;
+	u_int32_t rectype;
+	int is_commit, ret, t_ret;
+
+	*found_commit = FALSE;
+	cur_lsn = *low_lsn;
+
+	if ((ret = __log_cursor(env, &logc)) != 0)
+		return (ret);
+	memset(&logrec, 0, sizeof(logrec));
+
+	/* Nothing to scan if the starting record cannot be read. */
+	if (__logc_get(logc, &cur_lsn, &logrec, DB_SET) != 0)
+		goto done;
+
+	for (;;) {
+		/* The record type is the first 32-bit field of a record. */
+		LOGCOPY_32(env, &rectype, logrec.data);
+		if (rectype == DB___txn_regop) {
+			ret = __txn_regop_read(env, logrec.data, &regop);
+			if (ret != 0)
+				goto done;
+			is_commit = (regop->opcode == TXN_COMMIT);
+			__os_free(env, regop);
+			if (is_commit) {
+				*found_commit = TRUE;
+				break;
+			}
+		}
+
+		/* Advance; stop on error or once past the high LSN. */
+		ret = __logc_get(logc, &cur_lsn, &logrec, DB_NEXT);
+		if (ret != 0 || LOG_COMPARE(&cur_lsn, high_lsn) > 0)
+			break;
+	}
+
+done:
+	t_ret = __logc_close(logc);
+	if (t_ret != 0 && ret == 0)
+		ret = t_ret;
+	return (ret);
+}
+
+/*
+ * Used by a preferred master site to get remote LSN history information
+ * from the other site in the replication group.
+ *
+ * *lsnhist_match is zeroed on entry and is filled in only when the remote
+ * site answers with REPMGR_LSNHIST_RESPONSE.  Returns 0 without
+ * contacting anyone when eid is not a known remote site, and
+ * DB_REP_UNAVAIL when the remote site answers with any other message
+ * type.
+ */
+static int
+__repmgr_remote_lsnhist(env, eid, gen, lsnhist_match)
+	ENV *env;
+	int eid;
+	u_int32_t gen;
+	__repmgr_lsnhist_match_args *lsnhist_match;
+{
+	DB_REP *db_rep;
+	REPMGR_CONNECTION *conn;
+	repmgr_netaddr_t addr;
+	__rep_lsn_hist_key_args lsnhist_key;
+	u_int8_t lsnhist_key_buf[__REP_LSN_HIST_KEY_SIZE];
+	u_int32_t type;
+	size_t len;
+	u_int8_t *response_buf;
+	int ret, t_ret;
+
+	db_rep = env->rep_handle;
+	conn = NULL;
+	response_buf = NULL;
+	/* Never hand back indeterminate data, whichever way we return. */
+	memset(lsnhist_match, 0, sizeof(*lsnhist_match));
+
+	if (!IS_KNOWN_REMOTE_SITE(eid))
+		return (0);
+
+	/* Copy the site's address while holding the mutex. */
+	LOCK_MUTEX(db_rep->mutex);
+	addr = SITE_FROM_EID(eid)->net_addr;
+	UNLOCK_MUTEX(db_rep->mutex);
+	if ((ret = __repmgr_make_request_conn(env, &addr, &conn)) != 0)
+		return (ret);
+
+	/* Marshal generation for which to request remote lsnhist data. */
+	lsnhist_key.version = REP_LSN_HISTORY_FMT_VERSION;
+	lsnhist_key.gen = gen;
+	__rep_lsn_hist_key_marshal(env, &lsnhist_key, lsnhist_key_buf);
+	if ((ret = __repmgr_send_sync_msg(env, conn, REPMGR_LSNHIST_REQUEST,
+	    lsnhist_key_buf, sizeof(lsnhist_key_buf))) != 0)
+		goto err;
+
+	if ((ret = __repmgr_read_own_msg(env,
+	    conn, &type, &response_buf, &len)) != 0)
+		goto err;
+
+	/* Unmarshal remote lsnhist time and LSNs for comparison. */
+	if (type == REPMGR_LSNHIST_RESPONSE) {
+		/*
+		 * Bound the unmarshal by the number of bytes actually
+		 * received rather than by the expected size, so a short
+		 * reply cannot cause a read past the end of the buffer.
+		 */
+		if ((ret = __repmgr_lsnhist_match_unmarshal(env, lsnhist_match,
+		    response_buf, len, NULL)) != 0)
+			goto err;
+	} else {
+		/*
+		 * If the other site sent back REPMGR_PREFMAS_FAILURE, it means
+		 * no lsnhist record for the requested gen was found on other
+		 * site.
+		 */
+		if (type != REPMGR_PREFMAS_FAILURE)
+			RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+			    "remote_lsnhist got unexpected message type %d",
+			    type));
+		ret = DB_REP_UNAVAIL;
+	}
+
+err:
+	/*
+	 * Tear down the connection.  A cleanup failure is reported only
+	 * when nothing failed earlier, so the first error is the one the
+	 * caller sees.
+	 */
+	if (conn != NULL) {
+		if ((t_ret = __repmgr_close_connection(env,
+		    conn)) != 0 && ret == 0)
+			ret = t_ret;
+		if ((t_ret = __repmgr_destroy_conn(env,
+		    conn)) != 0 && ret == 0)
+			ret = t_ret;
+	}
+	if (response_buf != NULL)
+		__os_free(env, response_buf);
+	return (ret);
+}
+
+/*
+ * Computes the polling parameters for preferred master waits: the length
+ * of each yield (a fixed 250000 microseconds) and how many yields make up
+ * the total wait.  The total wait is whichever is larger, twice
+ * DB_REPMGR_DEFAULT_ACK_TIMEOUT or three times the configured
+ * ack_timeout.  Always returns 0.
+ *
+ * PUBLIC: int __repmgr_prefmas_get_wait __P((ENV *, u_int32_t *, u_long *));
+ */
+int
+__repmgr_prefmas_get_wait(env, tries, yield_usecs)
+	ENV *env;
+	u_int32_t *tries;
+	u_long *yield_usecs;
+{
+	REP *rep;
+	db_timeout_t total_wait;
+	u_long per_yield;
+
+	rep = env->rep_handle->region;
+	per_yield = 250000;
+
+	/* Start from the minimum total wait, then raise it if needed. */
+	total_wait = DB_REPMGR_DEFAULT_ACK_TIMEOUT * 2;
+	if (total_wait < (rep->ack_timeout * 3))
+		total_wait = rep->ack_timeout * 3;
+
+	*yield_usecs = per_yield;
+	*tries = total_wait / (u_int32_t)per_yield;
+	return (0);
+}
+
+/*
* Produce a membership list from the known info currently in memory.
*
- * PUBLIC: int __repmgr_marshal_member_list __P((ENV *, u_int8_t **, size_t *));
+ * PUBLIC: int __repmgr_marshal_member_list __P((ENV *, u_int32_t,
+ * PUBLIC: u_int8_t **, size_t *));
*
* Caller must hold mutex.
*/
int
-__repmgr_marshal_member_list(env, bufp, lenp)
+__repmgr_marshal_member_list(env, msg_version, bufp, lenp)
ENV *env;
+ u_int32_t msg_version;
u_int8_t **bufp;
size_t *lenp;
{
@@ -1328,6 +1968,7 @@ __repmgr_marshal_member_list(env, bufp, lenp)
REPMGR_SITE *site;
__repmgr_membr_vers_args membr_vers;
__repmgr_site_info_args site_info;
+ __repmgr_v4site_info_args v4site_info;
u_int8_t *buf, *p;
size_t bufsize, len;
u_int i;
@@ -1353,14 +1994,24 @@ __repmgr_marshal_member_list(env, bufp, lenp)
if (site->membership == 0)
continue;
- site_info.host.data = site->net_addr.host;
- site_info.host.size =
- (u_int32_t)strlen(site->net_addr.host) + 1;
- site_info.port = site->net_addr.port;
- site_info.flags = site->membership;
-
- ret = __repmgr_site_info_marshal(env,
- &site_info, p, (size_t)(&buf[bufsize]-p), &len);
+ if (msg_version < 5) {
+ v4site_info.host.data = site->net_addr.host;
+ v4site_info.host.size =
+ (u_int32_t)strlen(site->net_addr.host) + 1;
+ v4site_info.port = site->net_addr.port;
+ v4site_info.flags = site->membership;
+ ret = __repmgr_v4site_info_marshal(env,
+ &v4site_info, p, (size_t)(&buf[bufsize]-p), &len);
+ } else {
+ site_info.host.data = site->net_addr.host;
+ site_info.host.size =
+ (u_int32_t)strlen(site->net_addr.host) + 1;
+ site_info.port = site->net_addr.port;
+ site_info.status = site->membership;
+ site_info.flags = site->gmdb_flags;
+ ret = __repmgr_site_info_marshal(env,
+ &site_info, p, (size_t)(&buf[bufsize]-p), &len);
+ }
DB_ASSERT(env, ret == 0);
p += len;
}
@@ -1387,7 +2038,7 @@ read_gmdb(env, ip, bufp, lenp)
DBC *dbc;
DBT key_dbt, data_dbt;
__repmgr_membership_key_args key;
- __repmgr_membership_data_args member_status;
+ __repmgr_membership_data_args member_data;
__repmgr_member_metadata_args metadata;
__repmgr_membr_vers_args membr_vers;
__repmgr_site_info_args site_info;
@@ -1435,8 +2086,13 @@ read_gmdb(env, ip, bufp, lenp)
ret = __repmgr_member_metadata_unmarshal(env,
&metadata, metadata_buf, data_dbt.size, NULL);
DB_ASSERT(env, ret == 0);
- DB_ASSERT(env, metadata.format == REPMGR_GMDB_FMT_VERSION);
+ DB_ASSERT(env, metadata.format >= REPMGR_GMDB_FMT_MIN_VERSION &&
+ metadata.format <= REPMGR_GMDB_FMT_VERSION);
DB_ASSERT(env, metadata.version > 0);
+ /* Automatic conversion of old format gmdb if needed. */
+ if (metadata.format < REPMGR_GMDB_FMT_VERSION &&
+ (ret = convert_gmdb(env, ip, dbp, txn)) != 0)
+ goto err;
bufsize = 1000; /* Initial guess. */
if ((ret = __os_malloc(env, bufsize, &buf)) != 0)
@@ -1459,13 +2115,14 @@ read_gmdb(env, ip, bufp, lenp)
DB_ASSERT(env, key.port > 0);
ret = __repmgr_membership_data_unmarshal(env,
- &member_status, data_buf, data_dbt.size, NULL);
+ &member_data, data_buf, data_dbt.size, NULL);
DB_ASSERT(env, ret == 0);
- DB_ASSERT(env, member_status.flags != 0);
+ DB_ASSERT(env, member_data.status != 0);
site_info.host = key.host;
site_info.port = key.port;
- site_info.flags = member_status.flags;
+ site_info.status = member_data.status;
+ site_info.flags = member_data.flags;
if ((ret = __repmgr_site_info_marshal(env, &site_info,
p, (size_t)(&buf[bufsize]-p), &len)) == ENOMEM) {
bufsize *= 2;
@@ -1501,28 +2158,129 @@ err:
}
/*
+ * Convert an older-format group membership database into the current format.
+ */
+static int
+convert_gmdb(env, ip, dbp, txn)
+	ENV *env;
+	DB_THREAD_INFO *ip;
+	DB *dbp;
+	DB_TXN *txn;
+{
+	DBC *cursor;
+	DBT key_dbt, new_dbt, old_dbt;
+	__repmgr_membership_key_args key;
+	__repmgr_membership_data_args cur_data;
+	__repmgr_v4membership_data_args old_data;
+	__repmgr_member_metadata_args meta;
+	u_int8_t key_buf[MAX_MSG_BUF];
+	u_int8_t meta_buf[__REPMGR_MEMBER_METADATA_SIZE];
+	u_int8_t new_buf[__REPMGR_MEMBERSHIP_DATA_SIZE];
+	u_int8_t old_buf[__REPMGR_V4MEMBERSHIP_DATA_SIZE];
+	int ret, t_ret;
+
+	/*
+	 * Walks every record of the group membership database with one
+	 * cursor, rewriting each record in place in the current format:
+	 * first the metadata record, then each membership record.
+	 */
+	cursor = NULL;
+	ret = __db_cursor(dbp, ip, txn, &cursor, 0);
+	if (ret != 0)
+		goto done;
+
+	/* All three DBTs read into fixed, caller-supplied buffers. */
+	memset(&old_dbt, 0, sizeof(old_dbt));
+	old_dbt.data = old_buf;
+	old_dbt.ulen = sizeof(old_buf);
+	F_SET(&old_dbt, DB_DBT_USERMEM);
+
+	memset(&new_dbt, 0, sizeof(new_dbt));
+	new_dbt.data = meta_buf;
+	new_dbt.ulen = sizeof(meta_buf);
+	F_SET(&new_dbt, DB_DBT_USERMEM);
+
+	memset(&key_dbt, 0, sizeof(key_dbt));
+	key_dbt.data = key_buf;
+	key_dbt.ulen = sizeof(key_buf);
+	F_SET(&key_dbt, DB_DBT_USERMEM);
+
+	/*
+	 * Record one is the metadata record: an empty key whose data holds
+	 * the gmdb format and version.  The caller has already validated
+	 * it.  Only the format is bumped here; the version is written back
+	 * unchanged.
+	 */
+	ret = __dbc_get(cursor, &key_dbt, &new_dbt, DB_NEXT);
+	if (ret != 0)
+		goto done;
+	ret = __repmgr_membership_key_unmarshal(env,
+	    &key, key_buf, key_dbt.size, NULL);
+	DB_ASSERT(env, ret == 0);
+	DB_ASSERT(env, key.host.size == 0);
+	DB_ASSERT(env, key.port == 0);
+	ret = __repmgr_member_metadata_unmarshal(env,
+	    &meta, meta_buf, new_dbt.size, NULL);
+	DB_ASSERT(env, ret == 0);
+	DB_ASSERT(env, meta.version > 0);
+
+	meta.format = REPMGR_GMDB_FMT_VERSION;
+	__repmgr_member_metadata_marshal(env, &meta, meta_buf);
+	DB_INIT_DBT(new_dbt, meta_buf, __REPMGR_MEMBER_METADATA_SIZE);
+	ret = __dbc_put(cursor, &key_dbt, &new_dbt, DB_CURRENT);
+	if (ret != 0)
+		goto done;
+
+	/*
+	 * Every remaining record maps a key (host and port) to membership
+	 * data.  The old format kept the status value in its "flags"
+	 * field, so that value moves to "status" and the new "flags" field
+	 * starts out empty.
+	 */
+	new_dbt.data = new_buf;
+	new_dbt.ulen = sizeof(new_buf);
+	for (;;) {
+		ret = __dbc_get(cursor, &key_dbt, &old_dbt, DB_NEXT);
+		if (ret != 0)
+			break;
+
+		/* Decode the old-format membership data. */
+		ret = __repmgr_v4membership_data_unmarshal(env,
+		    &old_data, old_buf, old_dbt.size, NULL);
+		DB_ASSERT(env, ret == 0);
+		DB_ASSERT(env, old_data.flags != 0);
+
+		/* Re-encode in the current format and overwrite in place. */
+		cur_data.status = old_data.flags;
+		cur_data.flags = 0;
+		__repmgr_membership_data_marshal(env, &cur_data, new_buf);
+		DB_INIT_DBT(new_dbt, new_buf, __REPMGR_MEMBERSHIP_DATA_SIZE);
+		ret = __dbc_put(cursor, &key_dbt, &new_dbt, DB_CURRENT);
+		if (ret != 0)
+			goto done;
+	}
+	/* Running out of records is the normal way for the walk to end. */
+	if (ret == DB_NOTFOUND)
+		ret = 0;
+
+done:
+	if (cursor != NULL) {
+		t_ret = __dbc_close(cursor);
+		if (t_ret != 0 && ret == 0)
+			ret = t_ret;
+	}
+	return (ret);
+}
+
+/*
* Refresh our sites array from the given membership list.
*
* PUBLIC: int __repmgr_refresh_membership __P((ENV *,
- * PUBLIC: u_int8_t *, size_t));
+ * PUBLIC: u_int8_t *, size_t, u_int32_t));
*/
int
-__repmgr_refresh_membership(env, buf, len)
+__repmgr_refresh_membership(env, buf, len, version)
ENV *env;
u_int8_t *buf;
size_t len;
+ u_int32_t version;
{
DB_REP *db_rep;
+ REP *rep;
REPMGR_SITE *site;
__repmgr_membr_vers_args membr_vers;
__repmgr_site_info_args site_info;
+ __repmgr_v4site_info_args v4site_info;
char *host;
u_int8_t *p;
u_int16_t port;
- u_int32_t i, n;
+ u_int32_t i, participants;
int eid, ret;
db_rep = env->rep_handle;
+ rep = db_rep->region;
/*
* Membership list consists of membr_vers followed by a number of
@@ -1546,9 +2304,17 @@ __repmgr_refresh_membership(env, buf, len)
for (i = 0; i < db_rep->site_cnt; i++)
F_CLR(SITE_FROM_EID(i), SITE_TOUCHED);
- for (n = 0; p < &buf[len]; ++n) {
- ret = __repmgr_site_info_unmarshal(env,
- &site_info, p, (size_t)(&buf[len] - p), &p);
+ for (participants = 0; p < &buf[len]; ) {
+ if (version < 5) {
+ ret = __repmgr_v4site_info_unmarshal(env,
+ &v4site_info, p, (size_t)(&buf[len] - p), &p);
+ site_info.host = v4site_info.host;
+ site_info.port = v4site_info.port;
+ site_info.status = v4site_info.flags;
+ site_info.flags = 0;
+ } else
+ ret = __repmgr_site_info_unmarshal(env,
+ &site_info, p, (size_t)(&buf[len] - p), &p);
DB_ASSERT(env, ret == 0);
host = site_info.host.data;
@@ -1556,9 +2322,11 @@ __repmgr_refresh_membership(env, buf, len)
(u_int8_t*)site_info.host.data + site_info.host.size <= p);
host[site_info.host.size-1] = '\0';
port = site_info.port;
+ if (!FLD_ISSET(site_info.flags, SITE_VIEW))
+ participants++;
if ((ret = __repmgr_set_membership(env,
- host, port, site_info.flags)) != 0)
+ host, port, site_info.status, site_info.flags)) != 0)
goto err;
if ((ret = __repmgr_find_site(env, host, port, &eid)) != 0)
@@ -1566,8 +2334,13 @@ __repmgr_refresh_membership(env, buf, len)
DB_ASSERT(env, IS_VALID_EID(eid));
F_SET(SITE_FROM_EID(eid), SITE_TOUCHED);
}
- ret = __rep_set_nsites_int(env, n);
+ ret = __rep_set_nsites_int(env, participants);
DB_ASSERT(env, ret == 0);
+ if (FLD_ISSET(rep->config,
+ REP_C_PREFMAS_MASTER | REP_C_PREFMAS_CLIENT) &&
+ rep->config_nsites > 2)
+ __db_errx(env, DB_STR("3703",
+ "More than two sites in preferred master replication group"));
/* Scan "touched" flags so as to notice sites that have been removed. */
for (i = 0; i < db_rep->site_cnt; i++) {
@@ -1576,7 +2349,8 @@ __repmgr_refresh_membership(env, buf, len)
continue;
host = site->net_addr.host;
port = site->net_addr.port;
- if ((ret = __repmgr_set_membership(env, host, port, 0)) != 0)
+ if ((ret = __repmgr_set_membership(env, host, port,
+ 0, site->gmdb_flags)) != 0)
goto err;
}
@@ -1597,13 +2371,13 @@ __repmgr_reload_gmdb(env)
size_t len;
int ret;
- ENV_ENTER(env, ip);
+ ENV_GET_THREAD_INFO(env, ip);
if ((ret = read_gmdb(env, ip, &buf, &len)) == 0) {
env->rep_handle->have_gmdb = TRUE;
- ret = __repmgr_refresh_membership(env, buf, len);
+ ret = __repmgr_refresh_membership(env, buf, len,
+ DB_REPMGR_VERSION);
__os_free(env, buf);
}
- ENV_LEAVE(env, ip);
return (ret);
}
@@ -1650,7 +2424,8 @@ __repmgr_init_save(env, dbt)
dbt->data = NULL;
dbt->size = 0;
ret = 0;
- } else if ((ret = __repmgr_marshal_member_list(env, &buf, &len)) == 0) {
+ } else if ((ret = __repmgr_marshal_member_list(env,
+ DB_REPMGR_VERSION, &buf, &len)) == 0) {
dbt->data = buf;
dbt->size = (u_int32_t)len;
}
@@ -1700,6 +2475,7 @@ __repmgr_defer_op(env, op)
*/
if ((ret = __os_calloc(env, 1, sizeof(*msg), &msg)) != 0)
return (ret);
+ msg->size = sizeof(*msg);
msg->msg_hdr.type = REPMGR_OWN_MSG;
REPMGR_OWN_MSG_TYPE(msg->msg_hdr) = op;
ret = __repmgr_queue_put(env, msg);
@@ -1771,7 +2547,7 @@ __repmgr_become_client(env)
if ((ret = __repmgr_await_gmdbop(env)) == 0)
db_rep->client_intent = TRUE;
UNLOCK_MUTEX(db_rep->mutex);
- return (ret == 0 ? __repmgr_repstart(env, DB_REP_CLIENT) : ret);
+ return (ret == 0 ? __repmgr_repstart(env, DB_REP_CLIENT, 0) : ret);
}
/*
@@ -1897,16 +2673,17 @@ get_eid(env, host, port, eidp)
* accordingly.
*
* PUBLIC: int __repmgr_set_membership __P((ENV *,
- * PUBLIC: const char *, u_int, u_int32_t));
+ * PUBLIC: const char *, u_int, u_int32_t, u_int32_t));
*
* Caller must host db_rep mutex, and be in ENV_ENTER context.
*/
int
-__repmgr_set_membership(env, host, port, status)
+__repmgr_set_membership(env, host, port, status, flags)
ENV *env;
const char *host;
u_int port;
u_int32_t status;
+ u_int32_t flags;
{
DB_REP *db_rep;
REP *rep;
@@ -1953,7 +2730,9 @@ __repmgr_set_membership(env, host, port, status)
/* Set both private and shared copies of the info. */
site->membership = status;
+ site->gmdb_flags = flags;
sites[eid].status = status;
+ sites[eid].flags = flags;
}
MUTEX_UNLOCK(env, rep->mtx_repmgr);
@@ -1965,7 +2744,8 @@ __repmgr_set_membership(env, host, port, status)
SELECTOR_RUNNING(db_rep)) {
if (eid == db_rep->self_eid && status != SITE_PRESENT)
- ret = DB_DELETED;
+ ret = (status == SITE_ADDING) ?
+ __repmgr_defer_op(env, REPMGR_REJOIN) : DB_DELETED;
else if (orig != SITE_PRESENT && status == SITE_PRESENT &&
site->state == SITE_IDLE) {
/*
@@ -1981,10 +2761,11 @@ __repmgr_set_membership(env, host, port, status)
* failure shouldn't hurt anything, because we'll just
* naturally try again later.
*/
- ret = __repmgr_schedule_connection_attempt(env,
- eid, TRUE);
- if (eid != db_rep->self_eid)
+ if (eid != db_rep->self_eid) {
+ ret = __repmgr_schedule_connection_attempt(env,
+ eid, TRUE);
DB_EVENT(env, DB_EVENT_REP_SITE_ADDED, &eid);
+ }
} else if (orig != 0 && status == 0)
DB_EVENT(env, DB_EVENT_REP_SITE_REMOVED, &eid);
@@ -2084,3 +2865,73 @@ __repmgr_bcast_own_msg(env, type, buf, len)
}
return (0);
}
+
+/*
+ * PUBLIC: int __repmgr_bcast_member_list __P((ENV *));
+ *
+ * Sends the current membership list, as a REPMGR_SHARING message, over
+ * every ready connection (incoming and outgoing) of each remote site whose
+ * state is SITE_CONNECTED.  Does nothing and returns 0 if the selector is
+ * not running.  A connection on which the send fails is busted; the
+ * broadcast stops early only if busting it fails too.
+ *
+ * NOTE(review): this comment previously said "Caller must hold mutex",
+ * but the body acquires db_rep->mutex itself while marshaling the lists
+ * and releases it before sending.  Confirm the intended locking contract.
+ */
+int
+__repmgr_bcast_member_list(env)
+	ENV *env;
+{
+	DB_REP *db_rep;
+	REPMGR_CONNECTION *conn;
+	REPMGR_SITE *site;
+	u_int8_t *cur_list, *old_list, *payload;
+	size_t cur_len, old_len;
+	u_int32_t payload_len;
+	int direction, ret;
+	u_int i;
+
+	db_rep = env->rep_handle;
+	if (!SELECTOR_RUNNING(db_rep))
+		return (0);
+	cur_list = NULL;
+	old_list = NULL;
+
+	/*
+	 * Other sites in the group may be running an older version, so
+	 * build the list twice: once in the current format and once in the
+	 * version 4 format.
+	 */
+	LOCK_MUTEX(db_rep->mutex);
+	ret = __repmgr_marshal_member_list(env,
+	    DB_REPMGR_VERSION, &cur_list, &cur_len);
+	if (ret == 0)
+		ret = __repmgr_marshal_member_list(env,
+		    4, &old_list, &old_len);
+	UNLOCK_MUTEX(db_rep->mutex);
+	if (ret != 0)
+		goto out;
+
+	RPRINT(env, (env, DB_VERB_REPMGR_MISC,
+	    "Broadcast latest membership list"));
+	FOR_EACH_REMOTE_SITE_INDEX(i) {
+		site = SITE_FROM_EID(i);
+		if (site->state != SITE_CONNECTED)
+			continue;
+
+		/* Pass 0 handles the incoming connection, pass 1 outgoing. */
+		for (direction = 0; direction < 2; direction++) {
+			conn = direction == 0 ?
+			    site->ref.conn.in : site->ref.conn.out;
+			if (conn == NULL || conn->state != CONN_READY)
+				continue;
+
+			/* Pick the format the peer's version understands. */
+			if (conn->version < 5) {
+				payload = old_list;
+				payload_len = (u_int32_t)old_len;
+			} else {
+				payload = cur_list;
+				payload_len = (u_int32_t)cur_len;
+			}
+			if ((ret = __repmgr_send_own_msg(env, conn,
+			    REPMGR_SHARING, payload, payload_len)) != 0 &&
+			    (ret = __repmgr_bust_connection(env, conn)) != 0)
+				goto out;
+		}
+	}
+out:
+	if (cur_list != NULL)
+		__os_free(env, cur_list);
+	if (old_list != NULL)
+		__os_free(env, old_list);
+	return (ret);
+}
diff --git a/src/repmgr/repmgr_windows.c b/src/repmgr/repmgr_windows.c
index d9c2a03d..8cf05960 100644
--- a/src/repmgr/repmgr_windows.c
+++ b/src/repmgr/repmgr_windows.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -252,7 +252,7 @@ allocate_wait_slot(env, resultp, table)
* the previous wait but before reacquiring the mutex, and this
* extra signal would incorrectly cause the next wait to return
* immediately.
- */
+ */
(void)WaitForSingleObject(w->event, 0);
*resultp = i;
return (0);
@@ -639,31 +639,40 @@ __repmgr_select_loop(env)
WSAEVENT listen_event;
WSANETWORKEVENTS net_events;
struct io_info io_info;
- int i;
+ int accept_connect, i;
db_rep = env->rep_handle;
io_info.connections = connections;
io_info.events = events;
+ accept_connect = FALSE;
if ((listen_event = WSACreateEvent()) == WSA_INVALID_EVENT) {
__db_err(env, net_errno, DB_STR("3590",
"can't create event for listen socket"));
return (net_errno);
}
- if (!IS_SUBORDINATE(db_rep) &&
- WSAEventSelect(db_rep->listen_fd, listen_event, FD_ACCEPT) ==
- SOCKET_ERROR) {
- ret = net_errno;
- __db_err(env, ret, DB_STR("3591",
- "can't enable event for listener"));
- (void)WSACloseEvent(listen_event);
- goto out;
- }
LOCK_MUTEX(db_rep->mutex);
if ((ret = __repmgr_first_try_connections(env)) != 0)
goto unlock;
for (;;) {
+ /*
+ * Set the event for this process to receive notification of
+ * incoming connections if this process is or has just taken
+ * over as the listener process.
+ */
+ if (!IS_SUBORDINATE(db_rep) && !accept_connect) {
+ if (WSAEventSelect(db_rep->listen_fd, listen_event,
+ FD_ACCEPT) == SOCKET_ERROR) {
+ ret = net_errno;
+ __db_err(env, ret, DB_STR("3700",
+ "can't enable event for listener"));
+ (void)WSACloseEvent(listen_event);
+ goto out;
+ }
+ accept_connect = TRUE;
+ }
+
/* Start with the two events that we always wait for. */
#define SIGNALER_INDEX 0
#define LISTENER_INDEX 1
@@ -714,6 +723,8 @@ __repmgr_select_loop(env)
ret = net_errno;
goto unlock;
}
+ if (net_events.lNetworkEvents == 0)
+ continue;
DB_ASSERT(env,
net_events.lNetworkEvents & FD_ACCEPT);
if ((ret = net_events.iErrorCode[FD_ACCEPT_BIT])
@@ -815,7 +826,16 @@ handle_completion(env, conn)
/* Check both writing and reading. */
if (events.lNetworkEvents & FD_CLOSE) {
error = events.iErrorCode[FD_CLOSE_BIT];
- goto report;
+
+ /*
+ * There could be data for reading when we see FD_CLOSE,
+ * so we should try reading in this case.
+ */
+ if (error != 0)
+ goto report;
+ else if ((ret =
+ __repmgr_read_from_site(env, conn)) != 0)
+ goto err;
}
if (events.lNetworkEvents & FD_WRITE) {
@@ -823,7 +843,7 @@ handle_completion(env, conn)
error = events.iErrorCode[FD_WRITE_BIT];
goto report;
} else if ((ret =
- __repmgr_write_some(env, conn)) != 0)
+ __repmgr_write_some(env, conn)) != 0)
goto err;
}
@@ -832,7 +852,7 @@ handle_completion(env, conn)
error = events.iErrorCode[FD_READ_BIT];
goto report;
} else if ((ret =
- __repmgr_read_from_site(env, conn)) != 0)
+ __repmgr_read_from_site(env, conn)) != 0)
goto err;
}
diff --git a/src/sequence/seq_stat.c b/src/sequence/seq_stat.c
index d5b9a401..28f61174 100644
--- a/src/sequence/seq_stat.c
+++ b/src/sequence/seq_stat.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2004, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2004, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -124,10 +124,12 @@ __seq_stat_print(seq, flags)
DB *dbp;
DB_THREAD_INFO *ip;
ENV *env;
+ u_int32_t orig_flags;
int handle_check, ret, t_ret;
dbp = seq->seq_dbp;
env = dbp->env;
+ ret = 0;
SEQ_ILLEGAL_BEFORE_OPEN(seq, "DB_SEQUENCE->stat_print");
@@ -140,11 +142,16 @@ __seq_stat_print(seq, flags)
goto err;
}
- if ((ret = __seq_print_stats(seq, flags)) != 0)
- goto err;
+ orig_flags = flags;
+ LF_CLR(DB_STAT_CLEAR | DB_STAT_SUBSYSTEM);
+ if (flags == 0 || LF_ISSET(DB_STAT_ALL)) {
+ ret = __seq_print_stats(seq, orig_flags);
+ if (flags == 0 || ret != 0)
+ goto err;
+ }
if (LF_ISSET(DB_STAT_ALL) &&
- (ret = __seq_print_all(seq, flags)) != 0)
+ (ret = __seq_print_all(seq, orig_flags)) != 0)
goto err;
/* Release replication block. */
diff --git a/src/sequence/sequence.c b/src/sequence/sequence.c
index 1c19f838..9ee31123 100644
--- a/src/sequence/sequence.c
+++ b/src/sequence/sequence.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2004, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2004, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -53,24 +53,23 @@
} \
} while (0)
-static int __seq_chk_cachesize __P((ENV *, int32_t, db_seq_t, db_seq_t));
-static int __seq_close __P((DB_SEQUENCE *, u_int32_t));
+static int __seq_chk_cachesize __P((ENV *, u_int32_t, db_seq_t, db_seq_t));
static int __seq_close_pp __P((DB_SEQUENCE *, u_int32_t));
-static int __seq_get
- __P((DB_SEQUENCE *, DB_TXN *, int32_t, db_seq_t *, u_int32_t));
-static int __seq_get_cachesize __P((DB_SEQUENCE *, int32_t *));
+static int __seq_get_pp
+ __P((DB_SEQUENCE *,
+ DB_TXN *, u_int32_t, db_seq_t *, u_int32_t));
+static int __seq_get_cachesize __P((DB_SEQUENCE *, u_int32_t *));
static int __seq_get_db __P((DB_SEQUENCE *, DB **));
static int __seq_get_flags __P((DB_SEQUENCE *, u_int32_t *));
static int __seq_get_key __P((DB_SEQUENCE *, DBT *));
static int __seq_get_range __P((DB_SEQUENCE *, db_seq_t *, db_seq_t *));
-static int __seq_initial_value __P((DB_SEQUENCE *, db_seq_t));
static int __seq_open_pp __P((DB_SEQUENCE *, DB_TXN *, DBT *, u_int32_t));
static int __seq_remove __P((DB_SEQUENCE *, DB_TXN *, u_int32_t));
-static int __seq_set_cachesize __P((DB_SEQUENCE *, int32_t));
+static int __seq_set_cachesize __P((DB_SEQUENCE *, u_int32_t));
static int __seq_set_flags __P((DB_SEQUENCE *, u_int32_t));
static int __seq_set_range __P((DB_SEQUENCE *, db_seq_t, db_seq_t));
static int __seq_update
- __P((DB_SEQUENCE *, DB_THREAD_INFO *, DB_TXN *, int32_t, u_int32_t));
+ __P((DB_SEQUENCE *, DB_THREAD_INFO *, DB_TXN *, u_int32_t, u_int32_t));
/*
* db_sequence_create --
@@ -113,7 +112,7 @@ db_sequence_create(seqp, dbp, flags)
seq->seq_dbp = dbp;
seq->close = __seq_close_pp;
- seq->get = __seq_get;
+ seq->get = __seq_get_pp;
seq->get_cachesize = __seq_get_cachesize;
seq->set_cachesize = __seq_set_cachesize;
seq->get_db = __seq_get_db;
@@ -134,7 +133,7 @@ db_sequence_create(seqp, dbp, flags)
}
/*
- * __seq_open --
+ * __seq_open_pp --
* DB_SEQUENCE->open method.
*
*/
@@ -146,21 +145,18 @@ __seq_open_pp(seq, txn, keyp, flags)
u_int32_t flags;
{
DB *dbp;
- DB_SEQ_RECORD *rp;
DB_THREAD_INFO *ip;
ENV *env;
- u_int32_t tflags;
- int handle_check, txn_local, ret, t_ret;
+ int handle_check, ret, t_ret;
#define SEQ_OPEN_FLAGS (DB_CREATE | DB_EXCL | DB_THREAD)
- dbp = seq->seq_dbp;
- env = dbp->env;
- txn_local = 0;
-
- STRIP_AUTO_COMMIT(flags);
SEQ_ILLEGAL_AFTER_OPEN(seq, "DB_SEQUENCE->open");
+ env = seq->seq_dbp->env;
+ dbp = seq->seq_dbp;
+
ENV_ENTER(env, ip);
+ STRIP_AUTO_COMMIT(flags);
/* Check for replication block. */
handle_check = IS_ENV_REPLICATED(env);
@@ -174,6 +170,41 @@ __seq_open_pp(seq, txn, keyp, flags)
"DB_SEQUENCE->open", flags, SEQ_OPEN_FLAGS)) != 0)
goto err;
+ ret = __seq_open(seq, txn, keyp, flags);
+
+ /* Release replication block. */
+err: if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+ ret = t_ret;
+ ENV_LEAVE(env, ip);
+
+ return (ret);
+}
+
+/*
+ * __seq_open --
+ * Internal open function.
+ *
+ * PUBLIC: int __seq_open __P((DB_SEQUENCE *, DB_TXN *, DBT *, u_int32_t));
+ */
+
+int
+__seq_open(seq, txn, keyp, flags)
+ DB_SEQUENCE *seq;
+ DB_TXN *txn;
+ DBT *keyp;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_SEQ_RECORD *rp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ u_int32_t tflags;
+ int txn_local, ret, t_ret;
+
+ dbp = seq->seq_dbp;
+ env = dbp->env;
+ txn_local = 0;
+
if (keyp->size == 0) {
__db_errx(env, DB_STR("4001",
"Zero length sequence key specified"));
@@ -229,6 +260,7 @@ __seq_open_pp(seq, txn, keyp, flags)
seq->seq_key.size = seq->seq_key.ulen = keyp->size;
seq->seq_key.flags = DB_DBT_USERMEM;
+ ENV_GET_THREAD_INFO(env, ip);
retry: if ((ret = __db_get(dbp, ip,
txn, &seq->seq_key, &seq->seq_data, 0)) != 0) {
if (ret == DB_BUFFER_SMALL &&
@@ -369,11 +401,6 @@ err: if (txn_local &&
__os_free(env, seq->seq_key.data);
seq->seq_key.data = NULL;
}
- /* Release replication block. */
- if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
- ret = t_ret;
-
- ENV_LEAVE(env, ip);
__dbt_userfree(env, keyp, NULL, NULL);
return (ret);
}
@@ -386,10 +413,8 @@ err: if (txn_local &&
static int
__seq_get_cachesize(seq, cachesize)
DB_SEQUENCE *seq;
- int32_t *cachesize;
+ u_int32_t *cachesize;
{
- SEQ_ILLEGAL_BEFORE_OPEN(seq, "DB_SEQUENCE->get_cachesize");
-
*cachesize = seq->seq_cache_size;
return (0);
}
@@ -402,25 +427,9 @@ __seq_get_cachesize(seq, cachesize)
static int
__seq_set_cachesize(seq, cachesize)
DB_SEQUENCE *seq;
- int32_t cachesize;
+ u_int32_t cachesize;
{
- ENV *env;
- int ret;
-
- env = seq->seq_dbp->env;
-
- if (cachesize < 0) {
- __db_errx(env, DB_STR("4007",
- "Cache size must be >= 0"));
- return (EINVAL);
- }
-
- /*
- * It's an error to specify a cache larger than the range of sequences.
- */
- if (SEQ_IS_OPEN(seq) && (ret = __seq_chk_cachesize(env,
- cachesize, seq->seq_rp->seq_max, seq->seq_rp->seq_min)) != 0)
- return (ret);
+ SEQ_ILLEGAL_AFTER_OPEN(seq, "DB_SEQUENCE->set_cachesize");
seq->seq_cache_size = cachesize;
return (0);
@@ -437,8 +446,6 @@ __seq_get_flags(seq, flagsp)
DB_SEQUENCE *seq;
u_int32_t *flagsp;
{
- SEQ_ILLEGAL_BEFORE_OPEN(seq, "DB_SEQUENCE->get_flags");
-
*flagsp = F_ISSET(seq->seq_rp, SEQ_SET_FLAGS);
return (0);
}
@@ -480,8 +487,10 @@ __seq_set_flags(seq, flags)
* __seq_initial_value --
* DB_SEQUENCE->initial_value.
*
+ * PUBLIC: int __seq_initial_value __P((DB_SEQUENCE *, db_seq_t));
+ *
*/
-static int
+int
__seq_initial_value(seq, value)
DB_SEQUENCE *seq;
db_seq_t value;
@@ -515,8 +524,6 @@ __seq_get_range(seq, minp, maxp)
DB_SEQUENCE *seq;
db_seq_t *minp, *maxp;
{
- SEQ_ILLEGAL_BEFORE_OPEN(seq, "DB_SEQUENCE->get_range");
-
*minp = seq->seq_rp->seq_min;
*maxp = seq->seq_rp->seq_max;
return (0);
@@ -557,14 +564,13 @@ __seq_update(seq, ip, txn, delta, flags)
DB_SEQUENCE *seq;
DB_THREAD_INFO *ip;
DB_TXN *txn;
- int32_t delta;
- u_int32_t flags;
+ u_int32_t delta, flags;
{
DB *dbp;
DBT *data, ldata;
DB_SEQ_RECORD *rp;
ENV *env;
- int32_t adjust;
+ db_seq_t adjust;
int ret, txn_local, need_mutex;
dbp = seq->seq_dbp;
@@ -721,29 +727,36 @@ err: if (need_mutex) {
env, txn, LF_ISSET(DB_TXN_NOSYNC), ret) : ret);
}
-static int
+/*
+ * __seq_get --
+ * Internal get function for sequence.
+ *
+ * PUBLIC: int __seq_get
+ * PUBLIC: __P((DB_SEQUENCE *, DB_TXN *, u_int32_t, db_seq_t *, u_int32_t));
+ */
+int
__seq_get(seq, txn, delta, retp, flags)
DB_SEQUENCE *seq;
DB_TXN *txn;
- int32_t delta;
+ u_int32_t delta, flags;
db_seq_t *retp;
- u_int32_t flags;
{
DB *dbp;
DB_SEQ_RECORD *rp;
DB_THREAD_INFO *ip;
ENV *env;
- int handle_check, ret, t_ret;
+ int handle_check, ret;
dbp = seq->seq_dbp;
env = dbp->env;
rp = seq->seq_rp;
ret = 0;
+ ENV_GET_THREAD_INFO(env, ip);
STRIP_AUTO_COMMIT(flags);
SEQ_ILLEGAL_BEFORE_OPEN(seq, "DB_SEQUENCE->get");
- if (delta < 0 || (delta == 0 && !LF_ISSET(DB_CURRENT))) {
+ if (delta == 0 && !LF_ISSET(DB_CURRENT)) {
__db_errx(env, "Sequence delta must be greater than 0");
return (EINVAL);
}
@@ -754,16 +767,9 @@ __seq_get(seq, txn, delta, retp, flags)
return (EINVAL);
}
- ENV_ENTER(env, ip);
-
- /* Check for replication block. */
- handle_check = IS_ENV_REPLICATED(env);
- if (handle_check &&
- (ret = __db_rep_enter(dbp, 1, 0, IS_REAL_TXN(txn))) != 0)
- return (ret);
-
MUTEX_LOCK(env, seq->mtx_seq);
+ handle_check = IS_ENV_REPLICATED(env);
if (handle_check && IS_REP_CLIENT(env) &&
!F_ISSET(dbp, DB_AM_NOT_DURABLE)) {
ret = __db_rdonly(env, "DB_SEQUENCE->get");
@@ -799,6 +805,31 @@ __seq_get(seq, txn, delta, retp, flags)
}
err: MUTEX_UNLOCK(env, seq->mtx_seq);
+ return (ret);
+}
+
+static int
+__seq_get_pp(seq, txn, delta, retp, flags)
+ DB_SEQUENCE *seq;
+ DB_TXN *txn;
+ u_int32_t delta, flags;
+ db_seq_t *retp;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int handle_check, ret, t_ret;
+
+ env = seq->seq_dbp->env;
+
+ ENV_ENTER(env, ip);
+
+ /* Check for replication block. */
+ handle_check = IS_ENV_REPLICATED(env);
+ if (handle_check &&
+ (ret = __db_rep_enter(seq->seq_dbp, 1, 0, IS_REAL_TXN(txn))) != 0)
+ return (ret);
+
+ ret = __seq_get(seq, txn, delta, retp, flags);
/* Release replication block. */
if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
@@ -868,8 +899,9 @@ __seq_close_pp(seq, flags)
* __seq_close --
* Close a sequence
*
+ * PUBLIC: int __seq_close __P((DB_SEQUENCE *, u_int32_t));
*/
-static int
+int
__seq_close(seq, flags)
DB_SEQUENCE *seq;
u_int32_t flags;
@@ -916,19 +948,24 @@ __seq_remove(seq, txn, flags)
dbp = seq->seq_dbp;
env = dbp->env;
+ handle_check = 0;
+ ret = 0;
txn_local = 0;
- SEQ_ILLEGAL_BEFORE_OPEN(seq, "DB_SEQUENCE->remove");
+ if (!SEQ_IS_OPEN(seq))
+ ret = __db_mi_open(env, "DB_SEQUENCE->remove", 0);
/*
* Flags can only be 0, unless the database has DB_AUTO_COMMIT enabled.
* Then DB_TXN_NOSYNC is allowed.
*/
- if (flags != 0 &&
+ if (ret == 0 && flags != 0 &&
(flags != DB_TXN_NOSYNC || !IS_DB_AUTO_COMMIT(dbp, txn)))
- return (__db_ferr(env, "DB_SEQUENCE->remove illegal flag", 0));
+ ret = __db_ferr(env, "DB_SEQUENCE->remove illegal flag", 0);
ENV_ENTER(env, ip);
+ if (ret != 0)
+ goto err;
/* Check for replication block. */
handle_check = IS_ENV_REPLICATED(env);
@@ -945,7 +982,7 @@ __seq_remove(seq, txn, flags)
*/
if (IS_DB_AUTO_COMMIT(dbp, txn)) {
if ((ret = __txn_begin(env, ip, NULL, &txn, flags)) != 0)
- return (ret);
+ goto err;
txn_local = 1;
}
@@ -955,13 +992,14 @@ __seq_remove(seq, txn, flags)
ret = __db_del(dbp, ip, txn, &seq->seq_key, 0);
+err:
if ((t_ret = __seq_close(seq, 0)) != 0 && ret == 0)
ret = t_ret;
/* Release replication block. */
if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
ret = t_ret;
-err: if (txn_local && (t_ret =
+ if (txn_local && (t_ret =
__db_txn_auto_resolve(env, txn, 0, ret)) != 0 && ret == 0)
ret = t_ret;
@@ -976,7 +1014,7 @@ err: if (txn_local && (t_ret =
static int
__seq_chk_cachesize(env, cachesize, max, min)
ENV *env;
- int32_t cachesize;
+ u_int32_t cachesize;
db_seq_t max, min;
{
/*
diff --git a/src/txn/txn.c b/src/txn/txn.c
index 81225e5c..91652cb7 100644
--- a/src/txn/txn.c
+++ b/src/txn/txn.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright (c) 1995, 1996
@@ -227,8 +227,15 @@ __txn_begin(env, ip, parent, txnpp, flags)
if (LF_ISSET(DB_TXN_FAMILY))
F_SET(txn, TXN_FAMILY | TXN_INFAMILY | TXN_READONLY);
if (LF_ISSET(DB_TXN_SNAPSHOT) || F_ISSET(dbenv, DB_ENV_TXN_SNAPSHOT) ||
- (parent != NULL && F_ISSET(parent, TXN_SNAPSHOT)))
- F_SET(txn, TXN_SNAPSHOT);
+ (parent != NULL && F_ISSET(parent, TXN_SNAPSHOT))) {
+ if (IS_REP_CLIENT(env)) {
+ __db_errx(env, DB_STR("4572",
+ "DB_TXN_SNAPSHOT may not be used on a replication client"));
+ ret = (EINVAL);
+ goto err;
+ } else
+ F_SET(txn, TXN_SNAPSHOT);
+ }
if (LF_ISSET(DB_IGNORE_LEASE))
F_SET(txn, TXN_IGNORE_LEASE);
@@ -581,8 +588,7 @@ __txn_continue(env, txn, td, ip, add_to_list)
txn->set_timeout = __txn_set_timeout;
txn->set_txn_lsnp = __txn_set_txn_lsnp;
- /* XXX Do we need to explicitly set a SYNC flag here? */
- txn->flags = TXN_MALLOC |
+ txn->flags = TXN_MALLOC | TXN_SYNC |
(F_ISSET(td, TXN_DTL_NOWAIT) ? TXN_NOWAIT : 0);
txn->xa_thr_status = TXN_XA_THREAD_NOTA;
@@ -795,8 +801,9 @@ __txn_commit(txn, flags)
if (ret == 0) {
DB_LSN s_lsn;
- DB_ASSERT(env, __log_current_lsn_int(
- env, &s_lsn, NULL, NULL) == 0);
+ if ((ret = __log_current_lsn_int(
+ env, &s_lsn, NULL, NULL)) != 0)
+ goto err;
DB_ASSERT(env, LOG_COMPARE(
&td->visible_lsn, &s_lsn) <= 0);
COMPQUIET(s_lsn.file, 0);
@@ -890,17 +897,16 @@ static int
__txn_close_cursors(txn)
DB_TXN *txn;
{
- int ret, tret;
+ int ret, t_ret;
DBC *dbc;
- ret = tret = 0;
+ ret = t_ret = 0;
dbc = NULL;
if (txn == NULL)
return (0);
while ((dbc = TAILQ_FIRST(&txn->my_cursors)) != NULL) {
-
DB_ASSERT(dbc->env, txn == dbc->txn);
/*
@@ -913,21 +919,21 @@ __txn_close_cursors(txn)
/* Removed from the active queue here. */
if (F_ISSET(dbc, DBC_ACTIVE))
- ret = __dbc_close(dbc);
+ t_ret = __dbc_close(dbc);
dbc->txn = NULL;
/* We have to close all cursors anyway, so continue on error. */
- if (ret != 0) {
- __db_err(dbc->env, ret, "__dbc_close");
- if (tret == 0)
- tret = ret;
+ if (t_ret != 0) {
+ __db_err(dbc->env, t_ret, "__dbc_close");
+ if (ret == 0)
+ ret = t_ret;
}
}
txn->my_cursors.tqh_first = NULL;
txn->my_cursors.tqh_last = NULL;
- return (tret);/* Return the first error if any. */
+ return (ret); /* Return the first error, if any. */
}
/*
@@ -1050,7 +1056,7 @@ __txn_abort(txn)
* it, however make sure that it is aborted when the last process
* tries to abort it.
*/
- if (txn->xa_thr_status != TXN_XA_THREAD_NOTA && td->xa_ref > 1) {
+ if (txn->xa_thr_status != TXN_XA_THREAD_NOTA && td->xa_ref > 1) {
td->status = TXN_NEED_ABORT;
return (0);
}
@@ -2165,5 +2171,5 @@ __txn_applied(env, ip, commit_info, timeout)
if (renv->envid == commit_info->envid &&
LOG_COMPARE(&commit_info->lsn, &lsn) <= 0)
return (0);
- return (DB_NOTFOUND);
+ return (USR_ERR(env, DB_NOTFOUND));
}
diff --git a/src/txn/txn.src b/src/txn/txn.src
index 7e82dc82..d9af5318 100644
--- a/src/txn/txn.src
+++ b/src/txn/txn.src
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/txn/txn_chkpt.c b/src/txn/txn_chkpt.c
index 73715b10..a909767f 100644
--- a/src/txn/txn_chkpt.c
+++ b/src/txn/txn_chkpt.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright (c) 1995, 1996
@@ -377,7 +377,7 @@ __txn_getckp(env, lsnp)
TXN_SYSTEM_UNLOCK(env);
if (IS_ZERO_LSN(lsn))
- return (DB_NOTFOUND);
+ return (USR_ERR(env, DB_NOTFOUND));
*lsnp = lsn;
return (0);
diff --git a/src/txn/txn_failchk.c b/src/txn/txn_failchk.c
index b2007ad6..94f22ec2 100644
--- a/src/txn/txn_failchk.c
+++ b/src/txn/txn_failchk.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -57,7 +57,7 @@ retry: TXN_SYSTEM_LOCK(env);
if (F_ISSET(td, TXN_DTL_INMEMORY)) {
TXN_SYSTEM_UNLOCK(env);
- return (__db_failed(env, DB_STR("4501",
+ return (__db_failed(env, DB_STR("4573",
"Transaction has in memory logs"),
td->pid, td->tid));
}
diff --git a/src/txn/txn_method.c b/src/txn/txn_method.c
index 629eac04..357e78c6 100644
--- a/src/txn/txn_method.c
+++ b/src/txn/txn_method.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/txn/txn_rec.c b/src/txn/txn_rec.c
index b39d56d1..708af98a 100644
--- a/src/txn/txn_rec.c
+++ b/src/txn/txn_rec.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright (c) 1996
@@ -210,11 +210,12 @@ __txn_prepare_recover(env, dbtp, lsnp, op, info)
*/
else if ((ret = __db_txnlist_remove(env,
info, argp->txnp->txnid)) != 0) {
-txn_err: __db_errx(env,
+txn_err:
+ ret = USR_ERR(env, DB_NOTFOUND);
+ __db_errx(env,
DB_STR_A("4515",
"transaction not in list %lx", "%lx"),
(u_long)argp->txnp->txnid);
- ret = DB_NOTFOUND;
} else if (IS_ZERO_LSN(headp->trunc_lsn) ||
LOG_COMPARE(&headp->trunc_lsn, lsnp) >= 0) {
if ((ret = __db_txnlist_add(env,
diff --git a/src/txn/txn_recover.c b/src/txn/txn_recover.c
index 67f24439..915a289f 100644
--- a/src/txn/txn_recover.c
+++ b/src/txn/txn_recover.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -305,8 +305,8 @@ __txn_openfiles(env, ip, min, force)
if ((ret = __db_txnlist_init(env, ip, 0, 0, NULL, &txninfo)) != 0)
goto err;
- ret = __env_openfiles(
- env, logc, txninfo, &data, &open_lsn, NULL, (double)0, 0);
+ ret = __env_openfiles(env,
+ logc, txninfo, &data, &open_lsn, NULL, (double)0, 0);
if (txninfo != NULL)
__db_txnlist_end(env, txninfo);
diff --git a/src/txn/txn_region.c b/src/txn/txn_region.c
index 6f43d45f..7fef66e6 100644
--- a/src/txn/txn_region.c
+++ b/src/txn/txn_region.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -13,6 +13,7 @@
#include "dbinc/txn.h"
static int __txn_init __P((ENV *, DB_TXNMGR *));
+static int lsn_hi_to_low __P((const void *, const void *));
/*
* __txn_open --
@@ -57,12 +58,30 @@ __txn_open(env)
env->tx_handle = mgr;
return (0);
-err: env->tx_handle = NULL;
- if (mgr->reginfo.addr != NULL)
- (void)__env_region_detach(env, &mgr->reginfo, 0);
+err: (void)__mutex_free(env, &mgr->mutex);
+ (void)__txn_region_detach(env, mgr);
- (void)__mutex_free(env, &mgr->mutex);
- __os_free(env, mgr);
+ return (ret);
+}
+
+/*
+ * __txn_region_detach --
+ * Detach the region described by mgr->reginfo via __env_region_detach(),
+ * free the DB_TXNMGR itself and clear env->tx_handle.
+ *
+ * A NULL mgr is a no-op that returns 0. Otherwise the return value is
+ * whatever __env_region_detach() returned; the manager is freed and
+ * env->tx_handle is cleared even when that call reports an error.
+ *
+ * PUBLIC: int __txn_region_detach __P((ENV *, DB_TXNMGR *));
+ */
+int
+__txn_region_detach(env, mgr)
+ ENV *env;
+ DB_TXNMGR *mgr;
+{
+ int ret;
+
+ ret = 0;
+ if (mgr != NULL) {
+ ret = __env_region_detach(env, &mgr->reginfo, 0);
+ __os_free(env, mgr);
+ env->tx_handle = NULL;
+ }
return (ret);
}
@@ -409,39 +428,101 @@ __txn_id_set(env, cur_txnid, max_txnid)
}
/*
- * __txn_oldest_reader --
- * Find the oldest "read LSN" of any active transaction'
- * MVCC changes older than this can safely be discarded from the cache.
+ * lsn_hi_to_low --
+ * Compare lsns, sorting them from high to low. This is the opposite of
+ * __rep_lsn_cmp.
+ *
+ * Comparator with the signature qsort() expects: both arguments are
+ * treated as pointers to DB_LSN and the result is LOG_COMPARE() with
+ * its operands swapped (lsn2 first, lsn1 second), which is what gives
+ * the descending order.
+ */
+static int
+lsn_hi_to_low(lsn1, lsn2)
+ const void *lsn1, *lsn2;
+{
+ return (LOG_COMPARE((DB_LSN *)lsn2, (DB_LSN *)lsn1));
+}
+
+/*
+ * __txn_get_readers --
+ * Find the read LSN of all active transactions.
+ * MVCC versions older than the oldest active transaction can safely be
+ * discarded from the cache. MVCC versions not quite so old can be
+ * discarded if they are not visible to any active transaction.
*
- * PUBLIC: int __txn_oldest_reader __P((ENV *, DB_LSN *));
+ * Returns:
+ * An error code, or 0.
+ * *readers and *ntxnsp are always reset to NULL and 0 on entry.
+ * If the environment has no transaction manager (env->tx_handle is
+ * NULL), 0 is returned and they are left that way.
+ * Otherwise, if 0 was returned, *readers has been filled in with an
+ * __os_malloc()'d array holding the current log LSN (as reported by
+ * __log_current_lsn_int()) followed by the read_lsn of each active
+ * transaction whose read_lsn is not the max LSN, sorted from newest
+ * (largest) to oldest (smallest). *ntxnsp indicates how many entries
+ * are there, so it is at least 1 in this case.
+ * The last lsn is that of the oldest active mvcc-supporting transaction.
+ * The caller must __os_free() *readers whenever it is non-NULL.
+ * On error the array is freed here and *readers stays NULL.
+ *
+ * PUBLIC: int __txn_get_readers __P((ENV *, DB_LSN **, int *));
*/
+#define TXN_READERS_SIZE 64 /* Initial number of LSNs to allocate. */
int
-__txn_oldest_reader(env, lsnp)
+__txn_get_readers(env, readers, ntxnsp)
ENV *env;
- DB_LSN *lsnp;
+ DB_LSN **readers;
+ int *ntxnsp;
{
- DB_LSN old_lsn;
+ DB_LSN current, *lsns;
DB_TXNMGR *mgr;
DB_TXNREGION *region;
TXN_DETAIL *td;
- int ret;
+ int cmp, is_sorted, ret;
+ unsigned count, txnmax;
+
+ *ntxnsp = 0;
+ *readers = NULL;
if ((mgr = env->tx_handle) == NULL)
return (0);
region = mgr->reginfo.primary;
+ lsns = NULL;
+
+ if ((ret = __log_current_lsn_int(env, &current, NULL, NULL)) != 0)
+ return (ret);
- if ((ret = __log_current_lsn_int(env, &old_lsn, NULL, NULL)) != 0)
+ txnmax = TXN_READERS_SIZE;
+ if ((ret = __os_malloc(env, txnmax * sizeof(lsns[0]), &lsns)) != 0)
return (ret);
TXN_SYSTEM_LOCK(env);
- SH_TAILQ_FOREACH(td, &region->active_txn, links, __txn_detail)
- if (LOG_COMPARE(&td->read_lsn, &old_lsn) < 0)
- old_lsn = td->read_lsn;
+ /* The array always has at least the current lsn. */
+ lsns[0] = current;
+ count = 1;
+ is_sorted = TRUE;
- *lsnp = old_lsn;
+ /*
+ * Build up our array in most-recent (largest) to first-started (oldest)
+ * order. Delete adjacent dups. Detect when the txns need to be sorted.
+ *
+ * Each candidate is compared only against the entry stored just before
+ * it, so equal LSNs that are not adjacent in the list are both kept.
+ */
+ SH_TAILQ_FOREACH(td, &region->active_txn, links, __txn_detail) {
+ if (IS_MAX_LSN(td->read_lsn) ||
+ (cmp = LOG_COMPARE(&td->read_lsn, &lsns[count - 1])) == 0)
+ continue; /* Skip max-LSN entries and repeats of the previous slot. */
+ if (cmp > 0)
+ is_sorted = FALSE; /* Larger than its predecessor: qsort() is needed afterwards. */
+ if (count >= txnmax) {
+ txnmax += txnmax; /* Double the capacity. */
+ if ((ret = __os_realloc(env,
+ txnmax * sizeof(lsns[0]), &lsns)) != 0)
+ goto err;
+ }
+ lsns[count] = td->read_lsn;
+ count++;
+ }
+
+err: /* Also reached on success; ret tells the two cases apart below. */
TXN_SYSTEM_UNLOCK(env);
- return (0);
+ if (ret != 0)
+ __os_free(env, lsns);
+ else {
+ if (!is_sorted)
+ qsort(lsns, count, sizeof(lsns[0]), lsn_hi_to_low);
+ *ntxnsp = (int)count;
+ *readers = lsns;
+ }
+ return (ret);
}
/*
diff --git a/src/txn/txn_stat.c b/src/txn/txn_stat.c
index 62fe622d..231ac3c5 100644
--- a/src/txn/txn_stat.c
+++ b/src/txn/txn_stat.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/txn/txn_util.c b/src/txn/txn_util.c
index 0ecd7f6c..9f3b8cf6 100644
--- a/src/txn/txn_util.c
+++ b/src/txn/txn_util.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -9,6 +9,7 @@
#include "db_config.h"
#include "db_int.h"
+#include "dbinc/blob.h"
#include "dbinc/db_page.h"
#include "dbinc/lock.h"
#include "dbinc/mp.h"
@@ -209,7 +210,7 @@ __txn_remlock(env, txn, lock, locker)
for (e = TAILQ_FIRST(&txn->events); e != NULL; e = next_e) {
next_e = TAILQ_NEXT(e, links);
- if ((e->op != TXN_TRADE && e->op != TXN_TRADED &&
+ if ((e->op != TXN_TRADE && e->op != TXN_TRADED &&
e->op != TXN_XTRADE) ||
(e->u.t.lock.off != lock->off && e->u.t.locker != locker))
continue;
@@ -280,13 +281,21 @@ __txn_doevents(env, txn, opcode, preprocess)
e != NULL; e = enext) {
enext = TAILQ_NEXT(e, links);
/*
- * Move all exclusive handle locks and
+ * Move all exclusive handle locks and
* read handle locks to the handle locker.
*/
if (!(opcode == TXN_COMMIT && e->op == TXN_XTRADE) &&
- (e->op != TXN_TRADE ||
- IS_WRITELOCK(e->u.t.lock.mode)))
+ (e->op != TXN_TRADE ||
+ IS_WRITELOCK(e->u.t.lock.mode))) {
+ if (opcode == TXN_PREPARE &&
+ e->op == TXN_REMOVE) {
+ __db_errx(env, DB_STR_A("4501",
+"TXN->prepare is not allowed because this transaction removes \"%s\"", "%s"),
+ e->u.r.name);
+ return (EINVAL);
+ }
continue;
+ }
DO_TRADE;
if (txn->parent != NULL) {
TAILQ_REMOVE(&txn->events, e, links);
@@ -321,17 +330,26 @@ __txn_doevents(env, txn, opcode, preprocess)
ret = t_ret;
break;
case TXN_REMOVE:
- if (txn->parent != NULL)
+ if (txn->parent != NULL) {
TAILQ_INSERT_TAIL(
&txn->parent->events, e, links);
- else if (e->u.r.fileid != NULL) {
+ continue;
+ } else if (e->u.r.fileid != NULL) {
if ((t_ret = __memp_nameop(env,
e->u.r.fileid, NULL, e->u.r.name,
NULL, e->u.r.inmem)) != 0 && ret == 0)
ret = t_ret;
- } else if ((t_ret =
- __os_unlink(env, e->u.r.name, 0)) != 0 && ret == 0)
- ret = t_ret;
+ } else if ((t_ret = __os_unlink(
+ env, e->u.r.name, 0)) != 0 && ret == 0) {
+ /*
+ * It is possible for blob files to be deleted
+ * multiple times when truncating a database,
+ * so ignore ENOENT errors with blob files.
+ */
+ if (t_ret != ENOENT || strstr(
+ e->u.r.name, BLOB_FILE_PREFIX) == NULL)
+ ret = t_ret;
+ }
break;
case TXN_TRADE:
case TXN_XTRADE:
@@ -371,8 +389,6 @@ dofree:
/* Free resources here. */
switch (e->op) {
case TXN_REMOVE:
- if (txn->parent != NULL)
- continue;
if (e->u.r.fileid != NULL)
__os_free(env, e->u.r.fileid);
__os_free(env, e->u.r.name);
@@ -548,9 +564,8 @@ __txn_reset_fe_watermarks(txn)
{
DB *db;
- if (txn->parent) {
+ if (txn->parent)
DB_ASSERT(txn->mgrp->env, TAILQ_FIRST(&txn->femfs) == NULL);
- }
while ((db = TAILQ_FIRST(&txn->femfs)))
__clear_fe_watermark(txn, db);
diff --git a/src/xa/xa.c b/src/xa/xa.c
index ee75e792..5ce7842f 100644
--- a/src/xa/xa.c
+++ b/src/xa/xa.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1998, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -233,8 +233,8 @@ __xa_put_txn(env, txnp)
SH_TAILQ_REMOVE(&ip->dbth_xatxn, txnp, xa_links, __db_txn);
TAILQ_REMOVE(&txnp->mgrp->txn_chain, txnp, links);
td = txnp->td;
- DB_ASSERT(env, td->xa_ref > 0);
- td->xa_ref--;
+ if (td->xa_ref > 0)
+ td->xa_ref--;
__os_free(env, txnp);
ip->dbth_xa_status = TXN_XA_THREAD_UNASSOCIATED;
}
@@ -852,9 +852,9 @@ __db_xa_commit(xid, rmid, arg_flags)
return (ret);
/*
- * Because this transaction is currently associated, commit will not free
- * the transaction structure, which is good, because we need to do that
- * in xa_put_txn below.
+ * Because this transaction is currently associated, commit will
+ * not free the transaction structure, which is good, because we
+ * need to do that in xa_put_txn below.
*/
if ((ret = txnp->commit(txnp, 0)) != 0) {
dbenv->err(dbenv, ret, DB_STR("4563",
diff --git a/src/xa/xa_map.c b/src/xa/xa_map.c
index 4dcf4d75..9fd50185 100644
--- a/src/xa/xa_map.c
+++ b/src/xa/xa_map.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/