summaryrefslogtreecommitdiff
path: root/src/dbinc
diff options
context:
space:
mode:
Diffstat (limited to 'src/dbinc')
-rw-r--r--src/dbinc/atomic.h17
-rw-r--r--src/dbinc/blob.h103
-rw-r--r--src/dbinc/btree.h9
-rw-r--r--src/dbinc/clock.h9
-rw-r--r--src/dbinc/crypto.h2
-rw-r--r--src/dbinc/cxx_int.h2
-rw-r--r--src/dbinc/db.in278
-rw-r--r--src/dbinc/db_185.in2
-rw-r--r--src/dbinc/db_am.h12
-rw-r--r--src/dbinc/db_cxx.in86
-rw-r--r--src/dbinc/db_dispatch.h2
-rw-r--r--src/dbinc/db_int.in140
-rw-r--r--src/dbinc/db_join.h2
-rw-r--r--src/dbinc/db_page.h185
-rw-r--r--src/dbinc/db_swap.h88
-rw-r--r--src/dbinc/db_upgrade.h119
-rw-r--r--src/dbinc/db_verify.h9
-rw-r--r--src/dbinc/debug.h47
-rw-r--r--src/dbinc/fop.h16
-rw-r--r--src/dbinc/globals.h22
-rw-r--r--src/dbinc/hash.h4
-rw-r--r--src/dbinc/heap.h5
-rw-r--r--src/dbinc/hmac.h2
-rw-r--r--src/dbinc/lock.h17
-rw-r--r--src/dbinc/log.h27
-rw-r--r--src/dbinc/log_verify.h2
-rw-r--r--src/dbinc/mp.h48
-rw-r--r--src/dbinc/mutex.h110
-rw-r--r--src/dbinc/mutex_int.h67
-rw-r--r--src/dbinc/os.h2
-rw-r--r--src/dbinc/partition.h12
-rw-r--r--src/dbinc/perfmon.h2
-rw-r--r--src/dbinc/qam.h2
-rw-r--r--src/dbinc/queue.h2
-rw-r--r--src/dbinc/region.h27
-rw-r--r--src/dbinc/rep.h287
-rw-r--r--src/dbinc/repmgr.h157
-rw-r--r--src/dbinc/shqueue.h13
-rw-r--r--src/dbinc/tcl_db.h53
-rw-r--r--src/dbinc/txn.h2
-rw-r--r--src/dbinc/win_db.h44
-rw-r--r--src/dbinc/xa.h2
42 files changed, 1574 insertions, 463 deletions
diff --git a/src/dbinc/atomic.h b/src/dbinc/atomic.h
index 096176a5..61f2ead9 100644
--- a/src/dbinc/atomic.h
+++ b/src/dbinc/atomic.h
@@ -1,7 +1,7 @@
/*
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2009, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2009, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -79,12 +79,11 @@ typedef struct {
#define WINCE_ATOMIC_MAGIC(p) \
/* \
* Memory mapped regions on Windows CE cause problems with \
- * InterlockedXXX calls. Each page in a mapped region needs to \
- * have been written to prior to an InterlockedXXX call, or the \
- * InterlockedXXX call hangs. This does not seem to be \
- * documented anywhere. For now, read/write a non-critical \
- * piece of memory from the shared region prior to attempting \
- * shared region prior to attempting an InterlockedExchange \
+ * InterlockedXXX calls. Each process making an InterlockedXXX \
+ * call must make sure that it has written to the page prior to \
+ * the call, or the InterlockedXXX call hangs. This does not \
+ * seem to be documented anywhere. Write a non-critical piece \
+ * of memory from the shared region prior to attempting an \
* InterlockedXXX operation. \
*/ \
(p)->dummy = 0
@@ -144,7 +143,7 @@ typedef LONG volatile *interlocked_val;
#define atomic_inc(env, p) __atomic_inc(p)
#define atomic_dec(env, p) __atomic_dec(p)
#define atomic_compare_exchange(env, p, o, n) \
- __atomic_compare_exchange((p), (o), (n))
+ __atomic_compare_exchange_int((p), (o), (n))
static inline int __atomic_inc(db_atomic_t *p)
{
int temp;
@@ -176,7 +175,7 @@ static inline int __atomic_dec(db_atomic_t *p)
* http://gcc.gnu.org/onlinedocs/gcc-4.1.0/gcc/Atomic-Builtins.html
* which configure could be changed to use.
*/
-static inline int __atomic_compare_exchange(
+static inline int __atomic_compare_exchange_int(
db_atomic_t *p, atomic_value_t oldval, atomic_value_t newval)
{
atomic_value_t was;
diff --git a/src/dbinc/blob.h b/src/dbinc/blob.h
new file mode 100644
index 00000000..f4ff475b
--- /dev/null
+++ b/src/dbinc/blob.h
@@ -0,0 +1,103 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2013, 2015 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _DB_BLOB_H_
+#define _DB_BLOB_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * How many characters can the path for a blob file use?
+ * Up to 6 subdirectory separators.
+ * Up to 6 directory names of up to three characters each.
+ * Up to 21 characters for blob_id identifier.
+ * 7 characters for the standard prefix (__db.bl)
+ * 1 for luck (or the terminating NUL)
+ * The largest blob id, 9,223,372,036,854,775,807 would
+ * produce a path and file name:
+ * 009/223/372/036/854/775/807/__db.bl009223372036854775807
+ */
+#define MAX_BLOB_PATH "009/223/372/036/854/775/807/__db.bl009223372036854775807"
+#define MAX_BLOB_PATH_SZ sizeof(MAX_BLOB_PATH)
+#define BLOB_DEFAULT_DIR "__db_bl"
+#define BLOB_META_FILE_NAME "__db_blob_meta.db"
+#define BLOB_DIR_PREFIX "__db"
+#define BLOB_FILE_PREFIX "__db.bl"
+
+#define BLOB_DIR_ELEMS 1000
+
+#define IS_BLOB_META(name) \
+ (name != NULL && strstr(name, BLOB_META_FILE_NAME) != NULL)
+#define IS_BLOB_FILE(name) \
+ (name != NULL && strstr(name, BLOB_FILE_PREFIX) != NULL)
+
+/*
+ * Combines two unsigned 32 bit integers (lo, hi) into the integer o.  Blob
+ * file ids and sub database ids are 64 bit integers, but are stored on
+ * metadata pages as two 32 bit fields so 32 bit only compilers can read
+ * them.  If o is narrower than 8 bytes and hi is non-zero, an error is
+ * reported and ret is set to EINVAL.  Expands to one statement, no ';'.
+ */
+#define GET_LO_HI(e, lo, hi, o, ret) do { \
+	DB_ASSERT((e), sizeof(o) <= 8); \
+	if (sizeof(o) == 8) { \
+		(o) = (hi); \
+		(o) = ((o) << 32); \
+		(o) += (lo); \
+	} else { \
+		if ((hi) > 0) { \
+			__db_errx((e), DB_STR("0765", \
+			    "Offset or id size overflow.")); \
+			(ret) = EINVAL; \
+		} \
+		(o) = (lo); \
+	} \
+} while (0)
+
+/*
+ * Reads (p)->blob_file_lo and (p)->blob_file_hi into o through GET_LO_HI.
+ * The expansion already ends in a ';'.
+ */
+#define GET_BLOB_FILE_ID(e, p, o, ret) \
+ GET_LO_HI(e, (p)->blob_file_lo, (p)->blob_file_hi, o, ret);
+
+/*
+ * Reads (p)->blob_sdb_lo and (p)->blob_sdb_hi into o through GET_LO_HI.
+ * The expansion already ends in a ';'.
+ */
+#define GET_BLOB_SDB_ID(e, p, o, ret) \
+ GET_LO_HI(e, (p)->blob_sdb_lo, (p)->blob_sdb_hi, o, ret);
+
+/*
+ * Splits the integer v into two unsigned 32 bit integers and copies them to
+ * byte offsets SSZ(type, field_lo) and SSZ(type, field_hi) from p; the high
+ * half is zero when v is not 8 bytes wide.  The halves are written with
+ * memcpy rather than assigned through a typed pointer.  Locals carry a
+ * "__" prefix so they cannot capture a caller's variable named in v.
+ * Expands to a single statement without a trailing ';'.
+ */
+#define SET_LO_HI(p, v, type, field_lo, field_hi) do { \
+	u_int32_t __lo, __hi; \
+	__hi = sizeof(v) == 8 ? (u_int32_t)((v) >> 32) : 0; \
+	__lo = (u_int32_t)(v); \
+	memcpy((u_int8_t *)(p) + SSZ(type, field_hi), &__hi, sizeof(__hi)); \
+	memcpy((u_int8_t *)(p) + SSZ(type, field_lo), &__lo, sizeof(__lo)); \
+} while (0)
+
+/* Splits v into field_lo and field_hi (each assigned a u_int32_t); field_hi
+ * is zero unless v is 8 bytes wide.  Expands to one statement, no ';'.
+ */
+#define SET_LO_HI_VAR(v, field_lo, field_hi) do { \
+	(field_hi) = sizeof(v) == 8 ? (u_int32_t)((v) >> 32) : 0; \
+	(field_lo) = (u_int32_t)(v); \
+} while (0)
+
+/*
+ * Stores v in the blob_file_lo/blob_file_hi fields of p through SET_LO_HI.
+ * The expansion already ends in a ';'.
+ */
+#define SET_BLOB_META_FILE_ID(p, v, type) \
+ SET_LO_HI(p, v, type, blob_file_lo, blob_file_hi);
+
+/*
+ * Stores v in the blob_sdb_lo/blob_sdb_hi fields of p through SET_LO_HI.
+ * The expansion already ends in a ';'.
+ */
+#define SET_BLOB_META_SDB_ID(p, v, type) \
+ SET_LO_HI(p, v, type, blob_sdb_lo, blob_sdb_hi);
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_DB_BLOB_H_ */
diff --git a/src/dbinc/btree.h b/src/dbinc/btree.h
index 86bbec14..a8b9e1ee 100644
--- a/src/dbinc/btree.h
+++ b/src/dbinc/btree.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright (c) 1990, 1993, 1994, 1995, 1996
@@ -472,7 +472,7 @@ struct __btree { /* Btree access method. */
u_int32_t bt_minkey; /* Minimum keys per page. */
/* Btree comparison function. */
- int (*bt_compare) __P((DB *, const DBT *, const DBT *));
+ int (*bt_compare) __P((DB *, const DBT *, const DBT *, size_t *));
/* Btree prefix function. */
size_t (*bt_prefix) __P((DB *, const DBT *, const DBT *));
/* Btree compress function. */
@@ -483,7 +483,8 @@ struct __btree { /* Btree access method. */
int (*bt_decompress) __P((DB *, const DBT *, const DBT *, DBT *, DBT *,
DBT *));
/* dup_compare for compression */
- int (*compress_dup_compare) __P((DB *, const DBT *, const DBT *));
+ int (*compress_dup_compare) __P((DB *, const DBT *, const DBT *,
+ size_t *));
#endif
/* Recno access method. */
@@ -539,7 +540,7 @@ typedef enum {
* Flags for __bam_pinsert.
*/
#define BPI_SPACEONLY 0x01 /* Only check for space to update. */
-#define BPI_NORECNUM 0x02 /* Not update the recnum on the left. */
+#define BPI_NORECNUM 0x02 /* Don't update the left's recnum. */
#define BPI_NOLOGGING 0x04 /* Don't log the update. */
#define BPI_REPLACE 0x08 /* Replace the record. */
diff --git a/src/dbinc/clock.h b/src/dbinc/clock.h
index caeaee70..b2815ea2 100644
--- a/src/dbinc/clock.h
+++ b/src/dbinc/clock.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -125,6 +125,13 @@ typedef struct {
timespecadd((vvp), &__tmp); \
} while (0)
+/*
+ * Subtracts the timeout t from the timespec that vvp points to: t is first
+ * converted into a temporary db_timespec with DB_TIMEOUT_TO_TIMESPEC, and
+ * that temporary is then passed to timespecsub together with vvp.
+ */
+#define TIMESPEC_SUB_DB_TIMEOUT(vvp, t) \
+ do { \
+ db_timespec __tmp; \
+ DB_TIMEOUT_TO_TIMESPEC(t, &__tmp); \
+ timespecsub((vvp), &__tmp); \
+ } while (0)
+
#if defined(__cplusplus)
}
#endif
diff --git a/src/dbinc/crypto.h b/src/dbinc/crypto.h
index ea7a9cf0..4d889fd9 100644
--- a/src/dbinc/crypto.h
+++ b/src/dbinc/crypto.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/dbinc/cxx_int.h b/src/dbinc/cxx_int.h
index 5492ead7..368bac86 100644
--- a/src/dbinc/cxx_int.h
+++ b/src/dbinc/cxx_int.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/dbinc/db.in b/src/dbinc/db.in
index a948910e..b592b746 100644
--- a/src/dbinc/db.in
+++ b/src/dbinc/db.in
@@ -1,7 +1,7 @@
/*
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*
@@ -102,6 +102,7 @@ extern "C" {
@FILE_t_decl@
@off_t_decl@
+@db_off_t_decl@
@pid_t_decl@
@size_t_decl@
#ifdef HAVE_MIXED_SIZE_ADDRESSING
@@ -131,9 +132,9 @@ typedef u_int16_t db_indx_t; /* Page offset type. */
#define DB_MAX_PAGES 0xffffffff /* >= # of pages in a file */
typedef u_int32_t db_recno_t; /* Record number type. */
-#define DB_MAX_RECORDS 0xffffffff /* >= # of records in a tree */
+#define DB_MAX_RECORDS 0xffffffff /* >= # of records in a recno tree. */
-typedef u_int32_t db_timeout_t; /* Type of a timeout. */
+typedef u_int32_t db_timeout_t; /* Type of a timeout in microseconds. */
/*
* Region offsets are the difference between a pointer in a region and the
@@ -157,6 +158,10 @@ struct __db_compact; typedef struct __db_compact DB_COMPACT;
struct __db_dbt; typedef struct __db_dbt DBT;
struct __db_distab; typedef struct __db_distab DB_DISTAB;
struct __db_env; typedef struct __db_env DB_ENV;
+struct __db_event_mutex_died_info;
+ typedef struct __db_event_mutex_died_info DB_EVENT_MUTEX_DIED_INFO;
+struct __db_event_failchk_info;
+ typedef struct __db_event_failchk_info DB_EVENT_FAILCHK_INFO;
struct __db_h_stat; typedef struct __db_h_stat DB_HASH_STAT;
struct __db_heap_rid; typedef struct __db_heap_rid DB_HEAP_RID;
struct __db_heap_stat; typedef struct __db_heap_stat DB_HEAP_STAT;
@@ -189,6 +194,7 @@ struct __db_repmgr_site;typedef struct __db_repmgr_site DB_REPMGR_SITE;
struct __db_repmgr_stat;typedef struct __db_repmgr_stat DB_REPMGR_STAT;
struct __db_seq_record; typedef struct __db_seq_record DB_SEQ_RECORD;
struct __db_seq_stat; typedef struct __db_seq_stat DB_SEQUENCE_STAT;
+struct __db_stream; typedef struct __db_stream DB_STREAM;
struct __db_site; typedef struct __db_site DB_SITE;
struct __db_sequence; typedef struct __db_sequence DB_SEQUENCE;
struct __db_thread_info;typedef struct __db_thread_info DB_THREAD_INFO;
@@ -226,18 +232,20 @@ struct __db_dbt {
void *app_data;
-#define DB_DBT_APPMALLOC 0x001 /* Callback allocated memory. */
-#define DB_DBT_BULK 0x002 /* Internal: Insert if duplicate. */
-#define DB_DBT_DUPOK 0x004 /* Internal: Insert if duplicate. */
-#define DB_DBT_ISSET 0x008 /* Lower level calls set value. */
-#define DB_DBT_MALLOC 0x010 /* Return in malloc'd memory. */
-#define DB_DBT_MULTIPLE 0x020 /* References multiple records. */
-#define DB_DBT_PARTIAL 0x040 /* Partial put/get. */
-#define DB_DBT_REALLOC 0x080 /* Return in realloc'd memory. */
-#define DB_DBT_READONLY 0x100 /* Readonly, don't update. */
-#define DB_DBT_STREAMING 0x200 /* Internal: DBT is being streamed. */
-#define DB_DBT_USERCOPY 0x400 /* Use the user-supplied callback. */
-#define DB_DBT_USERMEM 0x800 /* Return in user's memory. */
+#define DB_DBT_APPMALLOC 0x0001 /* Callback allocated memory. */
+#define DB_DBT_BULK 0x0002 /* Bulk buffer: set by DB_MULTIPLE_*WRITE_INIT. */
+#define DB_DBT_DUPOK 0x0004 /* Internal: Insert if duplicate. */
+#define DB_DBT_ISSET 0x0008 /* Lower level calls set value. */
+#define DB_DBT_MALLOC 0x0010 /* Return in malloc'd memory. */
+#define DB_DBT_MULTIPLE 0x0020 /* References multiple records. */
+#define DB_DBT_PARTIAL 0x0040 /* Partial put/get. */
+#define DB_DBT_REALLOC 0x0080 /* Return in realloc'd memory. */
+#define DB_DBT_READONLY 0x0100 /* Readonly, don't update. */
+#define DB_DBT_STREAMING 0x0200 /* Internal: DBT is being streamed. */
+#define DB_DBT_USERCOPY 0x0400 /* Use the user-supplied callback. */
+#define DB_DBT_USERMEM 0x0800 /* Return in user's memory. */
+#define DB_DBT_BLOB 0x1000 /* Data item is a blob. */
+#define DB_DBT_BLOB_REC 0x2000 /* Internal: Blob database record. */
u_int32_t flags;
};
@@ -274,6 +282,23 @@ struct __db_mutex_stat { /* SHARED */
#endif
};
+/* Buffers passed to __mutex_describe() must be at least this large. */
+#define DB_MUTEX_DESCRIBE_STRLEN 128
+
+/* This is the event info of a DB_EVENT_MUTEX_DIED event notification. */
+struct __db_event_mutex_died_info {
+ pid_t pid; /* Process which last owned the mutex */
+ db_threadid_t tid; /* Thread which last owned the mutex */
+ db_mutex_t mutex; /* ID of the mutex */
+ /*
+  * NOTE(review): sized like a __mutex_describe() buffer; presumably
+  * holds that function's description of the mutex -- confirm.
+  */
+ char desc[DB_MUTEX_DESCRIBE_STRLEN];
+};
+
+/* This is the event info of a DB_EVENT_FAILCHK_PANIC event notification. */
+#define DB_FAILURE_SYMPTOM_SIZE 120
+struct __db_event_failchk_info {
+ /* NOTE(review): presumably the error code of the failure -- confirm. */
+ int error;
+ /* NOTE(review): presumably a text description of the failure -- confirm. */
+ char symptom[DB_FAILURE_SYMPTOM_SIZE];
+};
/* This is the length of the buffer passed to DB_ENV->thread_id_string() */
#define DB_THREADID_STRLEN 128
@@ -400,6 +425,8 @@ struct __db_lock_stat { /* SHARED */
uintmax_t st_lockers_nowait; /* Locker lock granted without wait. */
uintmax_t st_region_wait; /* Region lock granted after wait. */
uintmax_t st_region_nowait; /* Region lock granted without wait. */
+ uintmax_t st_nlockers_hit; /* Lockers found in thread info. */
+ uintmax_t st_nlockers_reused; /* Lockers reallocated from thread info. */
u_int32_t st_hash_len; /* Max length of bucket. */
roff_t st_regsize; /* Region size. */
#endif
@@ -469,7 +496,7 @@ struct __db_lockreq {
/*******************************************************
* Logging.
*******************************************************/
-#define DB_LOGVERSION 19 /* Current log version. */
+#define DB_LOGVERSION 22 /* Current log version. */
#define DB_LOGVERSION_LATCHING 15 /* Log version using latching: db-4.8 */
#define DB_LOGCHKSUM 12 /* Check sum headers: db-4.5 */
#define DB_LOGOLDVER 8 /* Oldest version supported: db-4.2 */
@@ -595,7 +622,8 @@ typedef enum {
LOGREC_PGDDBT,
LOGREC_PGLIST,
LOGREC_POINTER,
- LOGREC_TIME
+ LOGREC_TIME,
+ LOGREC_LONGARG
} log_rec_type_t;
typedef const struct __log_rec_spec {
@@ -755,6 +783,7 @@ struct __db_mpool_stat { /* SHARED */
uintmax_t st_mvcc_frozen; /* Buffers frozen. */
uintmax_t st_mvcc_thawed; /* Buffers thawed. */
uintmax_t st_mvcc_freed; /* Frozen buffers freed. */
+ uintmax_t st_mvcc_reused; /* Outdated invisible buffers reused. */
uintmax_t st_alloc; /* Number of page allocations. */
uintmax_t st_alloc_buckets; /* Buckets checked during allocation. */
uintmax_t st_alloc_max_buckets;/* Max checked during allocation. */
@@ -762,6 +791,8 @@ struct __db_mpool_stat { /* SHARED */
uintmax_t st_alloc_max_pages; /* Max checked during allocation. */
uintmax_t st_io_wait; /* Thread waited on buffer I/O. */
uintmax_t st_sync_interrupted; /* Number of times sync interrupted. */
+ u_int32_t st_oddfsize_detect; /* Odd file size detected. */
+ u_int32_t st_oddfsize_resolve; /* Odd file size resolved. */
roff_t st_regsize; /* Region size. */
roff_t st_regmax; /* Region max. */
#endif
@@ -956,7 +987,7 @@ struct __db_txn {
#define TXN_SNAPSHOT 0x08000 /* Snapshot Isolation. */
#define TXN_SYNC 0x10000 /* Write and sync on prepare/commit. */
#define TXN_WRITE_NOSYNC 0x20000 /* Write only on prepare/commit. */
-#define TXN_BULK 0x40000 /* Enable bulk loading optimization. */
+#define TXN_BULK 0x40000 /* Enable bulk loading optimization. */
u_int32_t flags;
};
@@ -1065,30 +1096,34 @@ struct __db_txn_token {
/*
* Event notification types. (Tcl testing interface currently assumes there are
- * no more than 32 of these.)
+ * no more than 32 of these.)  Comments include any relevant event_info types.
*/
#define DB_EVENT_PANIC 0
-#define DB_EVENT_REG_ALIVE 1
-#define DB_EVENT_REG_PANIC 2
-#define DB_EVENT_REP_CLIENT 3
-#define DB_EVENT_REP_CONNECT_BROKEN 4
-#define DB_EVENT_REP_CONNECT_ESTD 5
-#define DB_EVENT_REP_CONNECT_TRY_FAILED 6
-#define DB_EVENT_REP_DUPMASTER 7
-#define DB_EVENT_REP_ELECTED 8
-#define DB_EVENT_REP_ELECTION_FAILED 9
-#define DB_EVENT_REP_INIT_DONE 10
-#define DB_EVENT_REP_JOIN_FAILURE 11
-#define DB_EVENT_REP_LOCAL_SITE_REMOVED 12
-#define DB_EVENT_REP_MASTER 13
-#define DB_EVENT_REP_MASTER_FAILURE 14
-#define DB_EVENT_REP_NEWMASTER 15
-#define DB_EVENT_REP_PERM_FAILED 16
-#define DB_EVENT_REP_SITE_ADDED 17
-#define DB_EVENT_REP_SITE_REMOVED 18
-#define DB_EVENT_REP_STARTUPDONE 19
-#define DB_EVENT_REP_WOULD_ROLLBACK 20 /* Undocumented; C API only. */
-#define DB_EVENT_WRITE_FAILED 21
+#define DB_EVENT_REG_ALIVE 1 /* int: pid which was in env */
+#define DB_EVENT_REG_PANIC 2 /* int: error causing the panic. */
+#define DB_EVENT_REP_AUTOTAKEOVER_FAILED 3
+#define DB_EVENT_REP_CLIENT 4
+#define DB_EVENT_REP_CONNECT_BROKEN 5 /* DB_REPMGR_CONN_ERR */
+#define DB_EVENT_REP_CONNECT_ESTD 6 /* int: EID of remote site */
+#define DB_EVENT_REP_CONNECT_TRY_FAILED 7 /* DB_REPMGR_CONN_ERR */
+#define DB_EVENT_REP_DUPMASTER 8
+#define DB_EVENT_REP_ELECTED 9
+#define DB_EVENT_REP_ELECTION_FAILED 10
+#define DB_EVENT_REP_INIT_DONE 11
+#define DB_EVENT_REP_INQUEUE_FULL 12
+#define DB_EVENT_REP_JOIN_FAILURE 13
+#define DB_EVENT_REP_LOCAL_SITE_REMOVED 14
+#define DB_EVENT_REP_MASTER 15
+#define DB_EVENT_REP_MASTER_FAILURE 16
+#define DB_EVENT_REP_NEWMASTER 17 /* int: new master's site id */
+#define DB_EVENT_REP_PERM_FAILED 18
+#define DB_EVENT_REP_SITE_ADDED 19 /* int: eid */
+#define DB_EVENT_REP_SITE_REMOVED 20 /* int: eid */
+#define DB_EVENT_REP_STARTUPDONE 21
+#define DB_EVENT_REP_WOULD_ROLLBACK 22 /* Undocumented; C API only. */
+#define DB_EVENT_WRITE_FAILED 23
+#define DB_EVENT_MUTEX_DIED 24 /* DB_EVENT_MUTEX_DIED_INFO */
+#define DB_EVENT_FAILCHK_PANIC 25 /* DB_EVENT_FAILCHK_INFO */
#define DB_EVENT_NO_SUCH_EVENT 0xffffffff /* OOB sentinel value */
/* Replication Manager site status. */
@@ -1102,6 +1137,7 @@ struct __db_repmgr_site {
u_int32_t status;
#define DB_REPMGR_ISPEER 0x01
+#define DB_REPMGR_ISVIEW 0x02
u_int32_t flags;
};
@@ -1117,6 +1153,7 @@ struct __db_rep_stat { /* SHARED */
* circumstances, garbaged).
*/
u_int32_t st_startup_complete; /* Site completed client sync-up. */
+ u_int32_t st_view; /* Site is a view. */
#ifndef __TEST_DB_NO_STATISTICS
uintmax_t st_log_queued; /* Log records currently queued.+ */
u_int32_t st_status; /* Current replication status. */
@@ -1194,6 +1231,7 @@ struct __db_rep_stat { /* SHARED */
/* Undocumented statistics only used by the test system. */
#ifdef CONFIG_TEST
u_int32_t st_filefail_cleanups; /* # of FILE_FAIL cleanups done. */
+ uintmax_t st_log_futuredup; /* Future log records that are dups. */
#endif
#endif
};
@@ -1204,10 +1242,18 @@ struct __db_repmgr_stat { /* SHARED */
uintmax_t st_msgs_queued; /* # msgs queued for network delay. */
uintmax_t st_msgs_dropped; /* # msgs discarded due to excessive
queue length. */
+ u_int32_t st_incoming_queue_gbytes; /* Incoming queue size: GB. */
+ u_int32_t st_incoming_queue_bytes; /* Incoming queue size: B. */
+ uintmax_t st_incoming_msgs_dropped; /* # of msgs discarded due to
+ incoming queue full. */
uintmax_t st_connection_drop; /* Existing connections dropped. */
uintmax_t st_connect_fail; /* Failed new connection attempts. */
- uintmax_t st_elect_threads; /* # of active election threads. */
- uintmax_t st_max_elect_threads; /* Max concurrent e-threads ever. */
+ u_int32_t st_elect_threads; /* # of active election threads. */
+ u_int32_t st_max_elect_threads; /* Max concurrent e-threads ever. */
+ u_int32_t st_site_participants; /* # of repgroup participant sites. */
+ u_int32_t st_site_total; /* # of repgroup total sites. */
+ u_int32_t st_site_views; /* # of repgroup view sites. */
+ uintmax_t st_takeovers; /* # of automatic listener takeovers. */
};
/* Replication Manager connection error. */
@@ -1238,7 +1284,7 @@ struct __db_sequence {
db_mutex_t mtx_seq; /* Mutex if sequence is threaded. */
DB_SEQ_RECORD *seq_rp; /* Pointer to current data. */
DB_SEQ_RECORD seq_record; /* Data from DB_SEQUENCE. */
- int32_t seq_cache_size; /* Number of values cached. */
+ u_int32_t seq_cache_size; /* Number of values cached. */
db_seq_t seq_last_value; /* Last value cached. */
db_seq_t seq_prev_value; /* Last value returned. */
DBT seq_key; /* DBT pointing to sequence key. */
@@ -1250,8 +1296,8 @@ struct __db_sequence {
/* DB_SEQUENCE PUBLIC HANDLE LIST BEGIN */
int (*close) __P((DB_SEQUENCE *, u_int32_t));
int (*get) __P((DB_SEQUENCE *,
- DB_TXN *, int32_t, db_seq_t *, u_int32_t));
- int (*get_cachesize) __P((DB_SEQUENCE *, int32_t *));
+ DB_TXN *, u_int32_t, db_seq_t *, u_int32_t));
+ int (*get_cachesize) __P((DB_SEQUENCE *, u_int32_t *));
int (*get_db) __P((DB_SEQUENCE *, DB **));
int (*get_flags) __P((DB_SEQUENCE *, u_int32_t *));
int (*get_key) __P((DB_SEQUENCE *, DBT *));
@@ -1261,7 +1307,7 @@ struct __db_sequence {
int (*open) __P((DB_SEQUENCE *,
DB_TXN *, DBT *, u_int32_t));
int (*remove) __P((DB_SEQUENCE *, DB_TXN *, u_int32_t));
- int (*set_cachesize) __P((DB_SEQUENCE *, int32_t));
+ int (*set_cachesize) __P((DB_SEQUENCE *, u_int32_t));
int (*set_flags) __P((DB_SEQUENCE *, u_int32_t));
int (*set_range) __P((DB_SEQUENCE *, db_seq_t, db_seq_t));
int (*stat) __P((DB_SEQUENCE *,
@@ -1278,7 +1324,7 @@ struct __db_seq_stat { /* SHARED */
db_seq_t st_last_value; /* Last cached value. */
db_seq_t st_min; /* Minimum value. */
db_seq_t st_max; /* Maximum value. */
- int32_t st_cache_size; /* Cache size. */
+ u_int32_t st_cache_size; /* Cache size. */
u_int32_t st_flags; /* Flag value. */
};
@@ -1300,15 +1346,15 @@ typedef enum {
#define DB_RENAMEMAGIC 0x030800 /* File has been renamed. */
-#define DB_BTREEVERSION 9 /* Current btree version. */
+#define DB_BTREEVERSION 10 /* Current btree version. */
#define DB_BTREEOLDVER 8 /* Oldest btree version supported. */
#define DB_BTREEMAGIC 0x053162
-#define DB_HASHVERSION 9 /* Current hash version. */
+#define DB_HASHVERSION 10 /* Current hash version. */
#define DB_HASHOLDVER 7 /* Oldest hash version supported. */
#define DB_HASHMAGIC 0x061561
-#define DB_HEAPVERSION 1 /* Current heap version. */
+#define DB_HEAPVERSION 2 /* Current heap version. */
#define DB_HEAPOLDVER 1 /* Oldest heap version supported. */
#define DB_HEAPMAGIC 0x074582
@@ -1377,6 +1423,7 @@ typedef enum {
#define DB_LOCK_NOTGRANTED (-30992)/* Lock unavailable. */
#define DB_LOG_BUFFER_FULL (-30991)/* In-memory log buffer full. */
#define DB_LOG_VERIFY_BAD (-30990)/* Log verification failed. */
+#define DB_META_CHKSUM_FAIL (-30968)/* Metadata page checksum failed. */
#define DB_NOSERVER (-30989)/* Server panic return. */
#define DB_NOTFOUND (-30988)/* Key/data pair not found (EOF). */
#define DB_OLD_VERSION (-30987)/* Out-of-date version. */
@@ -1405,6 +1452,8 @@ typedef enum {
#define DB_DELETED (-30897)/* Recovery file marked deleted. */
#define DB_EVENT_NOT_HANDLED (-30896)/* Forward event to application. */
#define DB_NEEDSPLIT (-30895)/* Page needs to be split. */
+#define DB_NOINTMP (-30886)/* Sequences not supported in temporary
+ or in-memory databases. */
#define DB_REP_BULKOVF (-30894)/* Rep bulk buffer overflow. */
#define DB_REP_LOGREADY (-30893)/* Rep log ready for recovery. */
#define DB_REP_NEWMASTER (-30892)/* We have learned of a new master. */
@@ -1415,6 +1464,13 @@ typedef enum {
#define DB_TXN_CKP (-30888)/* Encountered ckp record in log. */
#define DB_VERIFY_FATAL (-30887)/* DB->verify cannot proceed. */
+/*
+ * This exit status indicates that a BDB utility failed because it needed a
+ * resource which had been held by a process which crashed or otherwise did
+ * not exit cleanly.
+ */
+#define DB_EXIT_FAILCHK 3
+
/* Database handle. */
struct __db {
/*******************************************************
@@ -1426,7 +1482,7 @@ struct __db {
/* Callbacks. */
int (*db_append_recno) __P((DB *, DBT *, db_recno_t));
void (*db_feedback) __P((DB *, int, int));
- int (*dup_compare) __P((DB *, const DBT *, const DBT *));
+ int (*dup_compare) __P((DB *, const DBT *, const DBT *, size_t *));
void *app_private; /* Application-private handle. */
@@ -1450,6 +1506,8 @@ struct __db {
u_int32_t adj_fileid; /* File's unique ID for curs. adj. */
+ u_int32_t blob_threshold; /* Blob threshold record size. */
+
#define DB_LOGFILEID_INVALID -1
FNAME *log_filename; /* File's naming info for logging. */
@@ -1593,6 +1651,12 @@ struct __db {
/* Reference to foreign -- set in the secondary. */
DB *s_foreign;
+ DB *blob_meta_db; /* Databases holding blob metadata. */
+ DB_SEQUENCE *blob_seq; /* Sequence of blob ids. */
+ char *blob_sub_dir; /* Subdirectory for blob files */
+ db_seq_t blob_file_id; /* Id of the file blob directory. */
+ db_seq_t blob_sdb_id; /* Id of the subdb blob directory. */
+
/* API-private structure: used by DB 1.85, C++, Java, Perl and Tcl */
void *api_internal;
@@ -1623,8 +1687,11 @@ struct __db {
void *(**)(void *, size_t), void (**)(void *)));
int (*get_append_recno) __P((DB *, int (**)(DB *, DBT *, db_recno_t)));
int (*get_assoc_flags) __P((DB *, u_int32_t *));
+ int (*get_blob_dir) __P((DB *, const char **));
+ int (*get_blob_sub_dir) __P((DB *, const char **));
+ int (*get_blob_threshold) __P((DB *, u_int32_t *));
int (*get_bt_compare)
- __P((DB *, int (**)(DB *, const DBT *, const DBT *)));
+ __P((DB *, int (**)(DB *, const DBT *, const DBT *, size_t *)));
int (*get_bt_compress) __P((DB *,
int (**)(DB *,
const DBT *, const DBT *, const DBT *, const DBT *, DBT *),
@@ -1637,7 +1704,7 @@ struct __db {
int (*get_create_dir) __P((DB *, const char **));
int (*get_dbname) __P((DB *, const char **, const char **));
int (*get_dup_compare)
- __P((DB *, int (**)(DB *, const DBT *, const DBT *)));
+ __P((DB *, int (**)(DB *, const DBT *, const DBT *, size_t *)));
int (*get_encrypt_flags) __P((DB *, u_int32_t *));
DB_ENV *(*get_env) __P((DB *));
void (*get_errcall) __P((DB *,
@@ -1647,7 +1714,7 @@ struct __db {
int (*get_feedback) __P((DB *, void (**)(DB *, int, int)));
int (*get_flags) __P((DB *, u_int32_t *));
int (*get_h_compare)
- __P((DB *, int (**)(DB *, const DBT *, const DBT *)));
+ __P((DB *, int (**)(DB *, const DBT *, const DBT *, size_t *)));
int (*get_h_ffactor) __P((DB *, u_int32_t *));
int (*get_h_hash)
__P((DB *, u_int32_t (**)(DB *, const void *, u_int32_t)));
@@ -1688,8 +1755,10 @@ struct __db {
int (*set_alloc) __P((DB *, void *(*)(size_t),
void *(*)(void *, size_t), void (*)(void *)));
int (*set_append_recno) __P((DB *, int (*)(DB *, DBT *, db_recno_t)));
+ int (*set_blob_dir) __P((DB *, const char *));
+ int (*set_blob_threshold) __P((DB *, u_int32_t, u_int32_t));
int (*set_bt_compare)
- __P((DB *, int (*)(DB *, const DBT *, const DBT *)));
+ __P((DB *, int (*)(DB *, const DBT *, const DBT *, size_t *)));
int (*set_bt_compress) __P((DB *,
int (*)(DB *, const DBT *, const DBT *, const DBT *, const DBT *, DBT *),
int (*)(DB *, const DBT *, const DBT *, DBT *, DBT *, DBT *)));
@@ -1699,7 +1768,7 @@ struct __db {
int (*set_cachesize) __P((DB *, u_int32_t, u_int32_t, int));
int (*set_create_dir) __P((DB *, const char *));
int (*set_dup_compare)
- __P((DB *, int (*)(DB *, const DBT *, const DBT *)));
+ __P((DB *, int (*)(DB *, const DBT *, const DBT *, size_t *)));
int (*set_encrypt) __P((DB *, const char *, u_int32_t));
void (*set_errcall) __P((DB *,
void (*)(const DB_ENV *, const char *, const char *)));
@@ -1708,7 +1777,7 @@ struct __db {
int (*set_feedback) __P((DB *, void (*)(DB *, int, int)));
int (*set_flags) __P((DB *, u_int32_t));
int (*set_h_compare)
- __P((DB *, int (*)(DB *, const DBT *, const DBT *)));
+ __P((DB *, int (*)(DB *, const DBT *, const DBT *, size_t *)));
int (*set_h_ffactor) __P((DB *, u_int32_t));
int (*set_h_hash)
__P((DB *, u_int32_t (*)(DB *, const void *, u_int32_t)));
@@ -1808,13 +1877,34 @@ struct __db {
u_int32_t orig_flags; /* Flags at open, for refresh */
u_int32_t flags;
-#define DB2_AM_EXCL 0x00000001 /* Exclusively lock the handle */
-#define DB2_AM_INTEXCL 0x00000002 /* Internal exclusive lock. */
-#define DB2_AM_NOWAIT 0x00000004 /* Do not wait for handle lock */
- u_int32_t orig_flags2; /* Second flags word; for refresh */
+#define DB2_AM_EXCL 0x00000001 /* Exclusively lock the handle */
+#define DB2_AM_INTEXCL 0x00000002 /* Internal exclusive lock. */
+#define DB2_AM_NOWAIT 0x00000004 /* Do not wait for handle lock */
u_int32_t flags2; /* Second flags word */
};
+/*
+ * Stream interface for blob files.  A DB_STREAM is obtained through the
+ * DBC db_stream method and exposes close, read, size and write methods.
+ */
+struct __db_stream {
+ DBC *dbc; /* Cursor pointing to the db blob record. */
+ /* NOTE(review): presumably the open handle of the blob file -- confirm. */
+ DB_FH *fhp;
+
+ /* DB_STREAM PUBLIC HANDLE LIST BEGIN */
+ int (*close) __P((DB_STREAM *, u_int32_t));
+ int (*read) __P((DB_STREAM *, DBT *, db_off_t, u_int32_t, u_int32_t));
+ int (*size) __P((DB_STREAM *, db_off_t *, u_int32_t));
+ int (*write) __P((DB_STREAM *, DBT *, db_off_t, u_int32_t));
+ /* DB_STREAM PUBLIC HANDLE LIST END */
+
+ u_int32_t flags;
+#define DB_STREAM_READ 0x00000001 /* Stream is read only. */
+#define DB_STREAM_WRITE 0x00000002 /* Stream is writeable. */
+#define DB_STREAM_SYNC_WRITE 0x00000004 /* Sync file on each write. */
+ /* NOTE(review): presumably the id of the blob being streamed -- confirm. */
+ db_seq_t blob_id;
+ /* NOTE(review): presumably the blob file's size in bytes -- confirm. */
+ db_off_t file_size;
+};
+
/*
* Macros for bulk operations. These are only intended for the C API.
* For C++, use DbMultiple*Iterator or DbMultiple*Builder.
@@ -1889,7 +1979,7 @@ struct __db {
pointer = __p; \
} while (0)
-#define DB_MULTIPLE_WRITE_INIT(pointer, dbt) \
+#define DB_MULTIPLE_WRITE_INIT(pointer, dbt) \
do { \
(dbt)->flags |= DB_DBT_BULK; \
pointer = (u_int8_t *)(dbt)->data + \
@@ -1897,7 +1987,7 @@ struct __db {
*(u_int32_t *)(pointer) = (u_int32_t)-1; \
} while (0)
-#define DB_MULTIPLE_RESERVE_NEXT(pointer, dbt, writedata, writedlen) \
+#define DB_MULTIPLE_RESERVE_NEXT(pointer, dbt, writedata, writedlen) \
do { \
u_int32_t *__p = (u_int32_t *)(pointer); \
u_int32_t __off = ((pointer) == (u_int8_t *)(dbt)->data +\
@@ -1914,7 +2004,7 @@ struct __db {
} \
} while (0)
-#define DB_MULTIPLE_WRITE_NEXT(pointer, dbt, writedata, writedlen) \
+#define DB_MULTIPLE_WRITE_NEXT(pointer, dbt, writedata, writedlen) \
do { \
void *__destd; \
DB_MULTIPLE_RESERVE_NEXT((pointer), (dbt), \
@@ -1925,7 +2015,7 @@ struct __db {
memcpy(__destd, (writedata), (writedlen)); \
} while (0)
-#define DB_MULTIPLE_KEY_RESERVE_NEXT(pointer, dbt, writekey, writeklen, writedata, writedlen) \
+#define DB_MULTIPLE_KEY_RESERVE_NEXT(pointer, dbt, writekey, writeklen, writedata, writedlen) \
do { \
u_int32_t *__p = (u_int32_t *)(pointer); \
u_int32_t __off = ((pointer) == (u_int8_t *)(dbt)->data +\
@@ -1948,7 +2038,7 @@ struct __db {
} \
} while (0)
-#define DB_MULTIPLE_KEY_WRITE_NEXT(pointer, dbt, writekey, writeklen, writedata, writedlen) \
+#define DB_MULTIPLE_KEY_WRITE_NEXT(pointer, dbt, writekey, writeklen, writedata, writedlen) \
do { \
void *__destk, *__destd; \
DB_MULTIPLE_KEY_RESERVE_NEXT((pointer), (dbt), \
@@ -1962,7 +2052,7 @@ struct __db {
} \
} while (0)
-#define DB_MULTIPLE_RECNO_WRITE_INIT(pointer, dbt) \
+#define DB_MULTIPLE_RECNO_WRITE_INIT(pointer, dbt) \
do { \
(dbt)->flags |= DB_DBT_BULK; \
pointer = (u_int8_t *)(dbt)->data + \
@@ -1970,7 +2060,7 @@ struct __db {
*(u_int32_t *)(pointer) = 0; \
} while (0)
-#define DB_MULTIPLE_RECNO_RESERVE_NEXT(pointer, dbt, recno, writedata, writedlen) \
+#define DB_MULTIPLE_RECNO_RESERVE_NEXT(pointer, dbt, recno, writedata, writedlen) \
do { \
u_int32_t *__p = (u_int32_t *)(pointer); \
u_int32_t __off = ((pointer) == (u_int8_t *)(dbt)->data +\
@@ -1988,7 +2078,7 @@ struct __db {
} \
} while (0)
-#define DB_MULTIPLE_RECNO_WRITE_NEXT(pointer, dbt, recno, writedata, writedlen)\
+#define DB_MULTIPLE_RECNO_WRITE_NEXT(pointer, dbt, recno, writedata, writedlen)\
do { \
void *__destd; \
DB_MULTIPLE_RECNO_RESERVE_NEXT((pointer), (dbt), \
@@ -2003,7 +2093,7 @@ struct __db_heap_rid {
db_pgno_t pgno; /* Page number. */
db_indx_t indx; /* Index in the offset table. */
};
-#define DB_HEAP_RID_SZ (sizeof(db_pgno_t) + sizeof(db_indx_t))
+#define DB_HEAP_RID_SZ (sizeof(db_pgno_t) + sizeof(db_indx_t))
/*******************************************************
* Access method cursors.
@@ -2074,6 +2164,7 @@ struct __dbc {
int (*close) __P((DBC *));
int (*cmp) __P((DBC *, DBC *, int *, u_int32_t));
int (*count) __P((DBC *, db_recno_t *, u_int32_t));
+ int (*db_stream) __P((DBC *, DB_STREAM **, u_int32_t));
int (*del) __P((DBC *, u_int32_t));
int (*dup) __P((DBC *, DBC **, u_int32_t));
int (*get) __P((DBC *, DBT *, DBT *, u_int32_t));
@@ -2151,6 +2242,7 @@ struct __db_bt_stat { /* SHARED */
u_int32_t bt_pagecnt; /* Page count. */
u_int32_t bt_pagesize; /* Page size. */
u_int32_t bt_minkey; /* Minkey value. */
+ u_int32_t bt_nblobs; /* Number of blobs. */
u_int32_t bt_re_len; /* Fixed-length record length. */
u_int32_t bt_re_pad; /* Fixed-length record pad. */
u_int32_t bt_levels; /* Tree levels. */
@@ -2179,7 +2271,7 @@ struct __db_compact {
u_int32_t compact_deadlock; /* Number of deadlocks. */
db_pgno_t compact_pages_truncated; /* Pages truncated to OS. */
/* Internal. */
- db_pgno_t compact_truncate; /* Page number for truncation */
+ db_pgno_t compact_truncate; /* Exchange pages above here. */
};
/* Hash statistics structure. */
@@ -2189,6 +2281,7 @@ struct __db_h_stat { /* SHARED */
u_int32_t hash_metaflags; /* Metadata flags. */
u_int32_t hash_nkeys; /* Number of unique keys. */
u_int32_t hash_ndata; /* Number of data items. */
+ u_int32_t hash_nblobs; /* Number of blobs. */
u_int32_t hash_pagecnt; /* Page count. */
u_int32_t hash_pagesize; /* Page size. */
u_int32_t hash_ffactor; /* Fill factor specified at create. */
@@ -2208,6 +2301,7 @@ struct __db_heap_stat { /* SHARED */
u_int32_t heap_magic; /* Magic number. */
u_int32_t heap_version; /* Version number. */
u_int32_t heap_metaflags; /* Metadata flags. */
+ u_int32_t heap_nblobs; /* Number of blobs. */
u_int32_t heap_nrecs; /* Number of records. */
u_int32_t heap_pagecnt; /* Page count. */
u_int32_t heap_pagesize; /* Page size. */
@@ -2267,21 +2361,15 @@ typedef enum {
* Backup configuration types.
*/
typedef enum {
- DB_BACKUP_READ_COUNT = 1,
- DB_BACKUP_READ_SLEEP = 2,
- DB_BACKUP_SIZE = 3,
- DB_BACKUP_WRITE_DIRECT = 4
+ DB_BACKUP_READ_COUNT=1,
+ DB_BACKUP_READ_SLEEP=2,
+ DB_BACKUP_SIZE=3,
+ DB_BACKUP_WRITE_DIRECT=4
} DB_BACKUP_CONFIG;
struct __db_env {
ENV *env; /* Linked ENV structure */
- /*
- * The DB_ENV structure can be used concurrently, so field access is
- * protected.
- */
- db_mutex_t mtx_db_env; /* DB_ENV structure mutex */
-
/* Error message callback */
void (*db_errcall) __P((const DB_ENV *, const char *, const char *));
FILE *db_errfile; /* Error message file stream */
@@ -2304,6 +2392,7 @@ struct __db_env {
char *(*thread_id_string) __P((DB_ENV *, pid_t, db_threadid_t, char *));
/* Application specified paths */
+ char *db_blob_dir; /* Blob file directory */
char *db_log_dir; /* Database log file directory */
char *db_md_dir; /* Persistent metadata directory */
char *db_tmp_dir; /* Database tmp file directory */
@@ -2327,6 +2416,8 @@ struct __db_env {
u_int32_t verbose; /* DB_VERB_XXX flags */
+ u_int32_t blob_threshold; /* Blob threshold record size */
+
/* Mutex configuration */
u_int32_t mutex_align; /* Mutex alignment */
u_int32_t mutex_cnt; /* Number of mutexes to configure */
@@ -2395,6 +2486,11 @@ struct __db_env {
* build settings.
*/
db_timeout_t envreg_timeout; /* DB_REGISTER wait timeout */
+ /*
+ * When failchk broadcasting is active, any wait for a mutex will wake
+ * up this frequently in order to check whether the mutex has died.
+ */
+ db_timeout_t mutex_failchk_timeout;
#define DB_ENV_AUTO_COMMIT 0x00000001 /* DB_AUTO_COMMIT */
#define DB_ENV_CDB_ALLDB 0x00000002 /* CDB environment wide locking */
@@ -2414,8 +2510,8 @@ struct __db_env {
#define DB_ENV_TXN_SNAPSHOT 0x00008000 /* DB_TXN_SNAPSHOT set */
#define DB_ENV_TXN_WRITE_NOSYNC 0x00010000 /* DB_TXN_WRITE_NOSYNC set */
#define DB_ENV_YIELDCPU 0x00020000 /* DB_YIELDCPU set */
-#define DB_ENV_HOTBACKUP 0x00040000 /* DB_HOTBACKUP_IN_PROGRESS set */
-#define DB_ENV_NOFLUSH 0x00080000 /* DB_NOFLUSH set */
+#define DB_ENV_HOTBACKUP 0x00040000 /* DB_HOTBACKUP_IN_PROGRESS set */
+#define DB_ENV_NOFLUSH 0x00080000 /* DB_NOFLUSH set */
u_int32_t flags;
/* DB_ENV PUBLIC HANDLE LIST BEGIN */
@@ -2436,6 +2532,8 @@ struct __db_env {
void *(**)(void *, size_t), void (**)(void *)));
int (*get_app_dispatch)
__P((DB_ENV *, int (**)(DB_ENV *, DBT *, DB_LSN *, db_recops)));
+ int (*get_blob_dir) __P((DB_ENV *, const char **));
+ int (*get_blob_threshold) __P((DB_ENV*, u_int32_t *));
int (*get_cache_max) __P((DB_ENV *, u_int32_t *, u_int32_t *));
int (*get_cachesize) __P((DB_ENV *, u_int32_t *, u_int32_t *, int *));
int (*get_create_dir) __P((DB_ENV *, const char **));
@@ -2451,8 +2549,8 @@ struct __db_env {
void (**)(const DB_ENV *, const char *, const char *)));
void (*get_errfile) __P((DB_ENV *, FILE **));
void (*get_errpfx) __P((DB_ENV *, const char **));
- int (*get_flags) __P((DB_ENV *, u_int32_t *));
int (*get_feedback) __P((DB_ENV *, void (**)(DB_ENV *, int, int)));
+ int (*get_flags) __P((DB_ENV *, u_int32_t *));
int (*get_home) __P((DB_ENV *, const char **));
int (*get_intermediate_dir_mode) __P((DB_ENV *, const char **));
int (*get_isalive) __P((DB_ENV *,
@@ -2568,17 +2666,23 @@ struct __db_env {
int (*rep_set_timeout) __P((DB_ENV *, int, db_timeout_t));
int (*rep_set_transport) __P((DB_ENV *, int, int (*)(DB_ENV *,
const DBT *, const DBT *, const DB_LSN *, int, u_int32_t)));
+ int (*rep_set_view) __P((DB_ENV *, int (*)(DB_ENV *,
+ const char *, int *, u_int32_t)));
int (*rep_start) __P((DB_ENV *, DBT *, u_int32_t));
int (*rep_stat) __P((DB_ENV *, DB_REP_STAT **, u_int32_t));
int (*rep_stat_print) __P((DB_ENV *, u_int32_t));
int (*rep_sync) __P((DB_ENV *, u_int32_t));
int (*repmgr_channel) __P((DB_ENV *, int, DB_CHANNEL **, u_int32_t));
int (*repmgr_get_ack_policy) __P((DB_ENV *, int *));
+ int (*repmgr_get_incoming_queue_max)
+ __P((DB_ENV *, u_int32_t *, u_int32_t *));
int (*repmgr_local_site) __P((DB_ENV *, DB_SITE **));
int (*repmgr_msg_dispatch) __P((DB_ENV *,
void (*)(DB_ENV *, DB_CHANNEL *, DBT *, u_int32_t, u_int32_t),
u_int32_t));
int (*repmgr_set_ack_policy) __P((DB_ENV *, int));
+ int (*repmgr_set_incoming_queue_max)
+ __P((DB_ENV *, u_int32_t, u_int32_t));
int (*repmgr_site)
__P((DB_ENV *, const char *, u_int, DB_SITE**, u_int32_t));
int (*repmgr_site_by_eid) __P((DB_ENV *, int, DB_SITE**));
@@ -2590,6 +2694,8 @@ struct __db_env {
void *(*)(void *, size_t), void (*)(void *)));
int (*set_app_dispatch)
__P((DB_ENV *, int (*)(DB_ENV *, DBT *, DB_LSN *, db_recops)));
+ int (*set_blob_dir) __P((DB_ENV *, const char *));
+ int (*set_blob_threshold) __P((DB_ENV *, u_int32_t, u_int32_t));
int (*set_cache_max) __P((DB_ENV *, u_int32_t, u_int32_t));
int (*set_cachesize) __P((DB_ENV *, u_int32_t, u_int32_t, int));
int (*set_create_dir) __P((DB_ENV *, const char *));
@@ -2662,8 +2768,8 @@ struct __db_env {
/* DB_ENV PUBLIC HANDLE LIST END */
/* DB_ENV PRIVATE HANDLE LIST BEGIN */
- int (*prdbt) __P((DBT *, int,
- const char *, void *, int (*)(void *, const void *), int, int));
+ int (*prdbt) __P((DBT *, int, const char *, void *,
+ int (*)(void *, const void *), int, int, int));
/* DB_ENV PRIVATE HANDLE LIST END */
};
diff --git a/src/dbinc/db_185.in b/src/dbinc/db_185.in
index 43735344..3aef2eca 100644
--- a/src/dbinc/db_185.in
+++ b/src/dbinc/db_185.in
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright (c) 1990, 1993, 1994
diff --git a/src/dbinc/db_am.h b/src/dbinc/db_am.h
index f34578c4..2b5c49d2 100644
--- a/src/dbinc/db_am.h
+++ b/src/dbinc/db_am.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -199,12 +199,16 @@ struct __db_foreign_info {
#define DB_IS_PRIMARY(dbp) (LIST_FIRST(&dbp->s_secondaries) != NULL)
/*
* A database should be required to be readonly if it's been explicitly
- * specified as such or if we're a client in a replicated environment
- * and the user did not specify DB_TXN_NOT_DURABLE.
+ * specified as such, if we're a client in a replicated environment
+ * and the user did not specify DB_TXN_NOT_DURABLE, or if we're a master
+ * in a replicated environment and the REP_F_READONLY_MASTER flag has been
+ * set in preparation for a preferred master takeover.
*/
#define DB_IS_READONLY(dbp) \
(F_ISSET(dbp, DB_AM_RDONLY) || \
- (IS_REP_CLIENT((dbp)->env) && !F_ISSET((dbp), DB_AM_NOT_DURABLE)))
+ (IS_REP_CLIENT((dbp)->env) && !F_ISSET((dbp), DB_AM_NOT_DURABLE)) \
+ || (IS_REP_MASTER((dbp)->env) && \
+ F_ISSET((dbp)->env->rep_handle->region, REP_F_READONLY_MASTER)))
#ifdef HAVE_COMPRESSION
/*
diff --git a/src/dbinc/db_cxx.in b/src/dbinc/db_cxx.in
index 84fc0f88..5b29f7e8 100644
--- a/src/dbinc/db_cxx.in
+++ b/src/dbinc/db_cxx.in
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -76,6 +76,7 @@ class DbMpoolFile; // forward
class DbPreplist; // forward
class DbSequence; // forward
class DbSite; // forward
+class DbStream; // forward
class Dbt; // forward
class DbTxn; // forward
@@ -159,13 +160,13 @@ extern "C" {
typedef void (*db_free_fcn_type)
(void *);
typedef int (*bt_compare_fcn_type) /*C++ version available*/
- (DB *, const DBT *, const DBT *);
+ (DB *, const DBT *, const DBT *, size_t *);
typedef size_t (*bt_prefix_fcn_type) /*C++ version available*/
(DB *, const DBT *, const DBT *);
typedef int (*dup_compare_fcn_type) /*C++ version available*/
- (DB *, const DBT *, const DBT *);
+ (DB *, const DBT *, const DBT *, size_t *);
typedef int (*h_compare_fcn_type) /*C++ version available*/
- (DB *, const DBT *, const DBT *);
+ (DB *, const DBT *, const DBT *, size_t *);
typedef u_int32_t (*h_hash_fcn_type) /*C++ version available*/
(DB *, const void *, u_int32_t);
typedef int (*pgin_fcn_type)
@@ -204,7 +205,10 @@ public:
virtual int get_alloc(
db_malloc_fcn_type *, db_realloc_fcn_type *, db_free_fcn_type *);
virtual int get_append_recno(int (**)(Db *, Dbt *, db_recno_t));
- virtual int get_bt_compare(int (**)(Db *, const Dbt *, const Dbt *));
+ virtual int get_blob_dir(const char **);
+ virtual int get_blob_threshold(u_int32_t *);
+ virtual int get_bt_compare(
+ int (**)(Db *, const Dbt *, const Dbt *, size_t *));
virtual int get_bt_compress(
int (**)(
Db *, const Dbt *, const Dbt *, const Dbt *, const Dbt *, Dbt *),
@@ -215,7 +219,8 @@ public:
virtual int get_cachesize(u_int32_t *, u_int32_t *, int *);
virtual int get_create_dir(const char **);
virtual int get_dbname(const char **, const char **);
- virtual int get_dup_compare(int (**)(Db *, const Dbt *, const Dbt *));
+ virtual int get_dup_compare(
+ int (**)(Db *, const Dbt *, const Dbt *, size_t *));
virtual int get_encrypt_flags(u_int32_t *);
virtual void get_errcall(
void (**)(const DbEnv *, const char *, const char *));
@@ -225,7 +230,8 @@ public:
virtual int get_flags(u_int32_t *);
virtual int get_heapsize(u_int32_t *, u_int32_t *);
virtual int get_heap_regionsize(u_int32_t *);
- virtual int get_h_compare(int (**)(Db *, const Dbt *, const Dbt *));
+ virtual int get_h_compare(
+ int (**)(Db *, const Dbt *, const Dbt *, size_t *));
virtual int get_h_ffactor(u_int32_t *);
virtual int get_h_hash(u_int32_t (**)(Db *, const void *, u_int32_t));
virtual int get_h_nelem(u_int32_t *);
@@ -261,8 +267,11 @@ public:
db_malloc_fcn_type, db_realloc_fcn_type, db_free_fcn_type);
virtual void set_app_private(void *);
virtual int set_append_recno(int (*)(Db *, Dbt *, db_recno_t));
+ virtual int set_blob_dir(const char *);
+ virtual int set_blob_threshold(u_int32_t, u_int32_t);
virtual int set_bt_compare(bt_compare_fcn_type); /*deprecated*/
- virtual int set_bt_compare(int (*)(Db *, const Dbt *, const Dbt *));
+ virtual int set_bt_compare(
+ int (*)(Db *, const Dbt *, const Dbt *, size_t *));
virtual int set_bt_compress(
int (*)
(Db *, const Dbt *, const Dbt *, const Dbt *, const Dbt *, Dbt *),
@@ -273,7 +282,8 @@ public:
virtual int set_cachesize(u_int32_t, u_int32_t, int);
virtual int set_create_dir(const char *);
virtual int set_dup_compare(dup_compare_fcn_type); /*deprecated*/
- virtual int set_dup_compare(int (*)(Db *, const Dbt *, const Dbt *));
+ virtual int set_dup_compare(
+ int (*)(Db *, const Dbt *, const Dbt *, size_t *));
virtual int set_encrypt(const char *, u_int32_t);
virtual void set_errcall(
void (*)(const DbEnv *, const char *, const char *));
@@ -284,7 +294,8 @@ public:
virtual int set_heapsize(u_int32_t, u_int32_t);
virtual int set_heap_regionsize(u_int32_t);
virtual int set_h_compare(h_compare_fcn_type); /*deprecated*/
- virtual int set_h_compare(int (*)(Db *, const Dbt *, const Dbt *));
+ virtual int set_h_compare(
+ int (*)(Db *, const Dbt *, const Dbt *, size_t *));
virtual int set_h_ffactor(u_int32_t);
virtual int set_h_hash(h_hash_fcn_type); /*deprecated*/
virtual int set_h_hash(u_int32_t (*)(Db *, const void *, u_int32_t));
@@ -383,16 +394,16 @@ public:
int (*associate_callback_)(Db *, const Dbt *, const Dbt *, Dbt *);
int (*associate_foreign_callback_)
(Db *, const Dbt *, Dbt *, const Dbt *, int *);
- int (*bt_compare_callback_)(Db *, const Dbt *, const Dbt *);
+ int (*bt_compare_callback_)(Db *, const Dbt *, const Dbt *, size_t *);
int (*bt_compress_callback_)(
Db *, const Dbt *, const Dbt *, const Dbt *, const Dbt *, Dbt *);
int (*bt_decompress_callback_)(
Db *, const Dbt *, const Dbt *, Dbt *, Dbt *, Dbt *);
size_t (*bt_prefix_callback_)(Db *, const Dbt *, const Dbt *);
u_int32_t (*db_partition_callback_)(Db *, Dbt *);
- int (*dup_compare_callback_)(Db *, const Dbt *, const Dbt *);
+ int (*dup_compare_callback_)(Db *, const Dbt *, const Dbt *, size_t *);
void (*feedback_callback_)(Db *, int, int);
- int (*h_compare_callback_)(Db *, const Dbt *, const Dbt *);
+ int (*h_compare_callback_)(Db *, const Dbt *, const Dbt *, size_t *);
u_int32_t (*h_hash_callback_)(Db *, const void *, u_int32_t);
};
@@ -407,6 +418,7 @@ public:
int close();
int cmp(Dbc *other_csr, int *result, u_int32_t flags);
int count(db_recno_t *countp, u_int32_t flags);
+ int db_stream(DbStream **dbsp, u_int32_t flags);
int del(u_int32_t flags);
int dup(Dbc** cursorp, u_int32_t flags);
int get(Dbt* key, Dbt *data, u_int32_t flags);
@@ -527,6 +539,10 @@ public:
int (*)(DbEnv *, const char *, void *));
virtual int get_backup_config(DB_BACKUP_CONFIG, u_int32_t *);
virtual int set_backup_config(DB_BACKUP_CONFIG, u_int32_t);
+ virtual int get_blob_dir(const char **);
+ virtual int set_blob_dir(const char *);
+ virtual int get_blob_threshold(u_int32_t *);
+ virtual int set_blob_threshold(u_int32_t, u_int32_t);
virtual int get_cachesize(u_int32_t *, u_int32_t *, int *);
virtual int set_cachesize(u_int32_t, u_int32_t, int);
virtual int get_cache_max(u_int32_t *, u_int32_t *);
@@ -761,10 +777,16 @@ public:
virtual int rep_set_priority(u_int32_t priority);
virtual int rep_get_timeout(int which, db_timeout_t *timeout);
virtual int rep_set_timeout(int which, db_timeout_t timeout);
+ virtual int rep_set_view(int (*)(DbEnv *,
+ const char *, int *, u_int32_t));
virtual int repmgr_channel(int eid, DbChannel **channel,
u_int32_t flags);
virtual int repmgr_get_ack_policy(int *policy);
virtual int repmgr_set_ack_policy(int policy);
+ virtual int repmgr_get_incoming_queue_max(u_int32_t *gbytesp,
+ u_int32_t *bytesp);
+ virtual int repmgr_set_incoming_queue_max(u_int32_t gbytes,
+ u_int32_t bytes);
virtual int repmgr_local_site(DbSite **site);
virtual int repmgr_msg_dispatch(void (*) (DbEnv *,
DbChannel *, Dbt *, u_int32_t, u_int32_t), u_int32_t flags);
@@ -824,6 +846,8 @@ public:
static int _backup_write_intercept(DB_ENV *dbenv, u_int32_t off_gbytes,
u_int32_t off_bytes, u_int32_t size, u_int8_t *buf, void *handle);
static void _paniccall_intercept(DB_ENV *dbenv, int errval);
+ static int _partial_rep_intercept(DB_ENV *dbenv,
+ const char *name, int *result, u_int32_t flags);
static void _feedback_intercept(DB_ENV *dbenv, int opcode, int pct);
static void _event_func_intercept(DB_ENV *dbenv, u_int32_t, void *);
static int _isalive_intercept(DB_ENV *dbenv, pid_t pid,
@@ -872,6 +896,7 @@ private:
void (*feedback_callback_)(DbEnv *, int, int);
void (*message_callback_)(const DbEnv *, const char *);
void (*paniccall_callback_)(DbEnv *, int);
+ int (*partial_rep_callback_)(DbEnv *, const char *, int *, u_int32_t);
void (*event_func_callback_)(DbEnv *, u_int32_t, void *);
int (*rep_send_callback_)(DbEnv *, const Dbt *, const Dbt *,
const DbLsn *, int, u_int32_t);
@@ -1057,9 +1082,9 @@ public:
int stat(DB_SEQUENCE_STAT **sp, u_int32_t flags);
int stat_print(u_int32_t flags);
- int get(DbTxn *txnid, int32_t delta, db_seq_t *retp, u_int32_t flags);
- int get_cachesize(int32_t *sizep);
- int set_cachesize(int32_t size);
+ int get(DbTxn *txnid, u_int32_t delta, db_seq_t *retp, u_int32_t flags);
+ int get_cachesize(u_int32_t *sizep);
+ int set_cachesize(u_int32_t size);
int get_flags(u_int32_t *flagsp);
int set_flags(u_int32_t flags);
int get_range(db_seq_t *minp, db_seq_t *maxp);
@@ -1137,6 +1162,34 @@ private:
};
//
+// DbStream
+//
+class _exported DbStream : protected DB_STREAM
+{
+ // Dbc is a friend because DbStream pointers are handed out by
+ // Dbc::db_stream(); the constructor below is private.
+ friend class Dbc;
+
+public:
+ // Stream operations.
+ // NOTE(review): these presumably forward to the same-named
+ // DB_STREAM handle methods -- confirm in the implementation file.
+ int close(u_int32_t flags);
+ int read(Dbt *data, db_off_t offset, u_int32_t size, u_int32_t flags);
+ int size(db_off_t *size, u_int32_t flags);
+ int write(Dbt *data, db_off_t offset, u_int32_t flags);
+
+private:
+ // No data is permitted in this class (see comment at top)
+
+ // Note: use Dbc::db_stream() to get pointers to a DbStream,
+ // and call DbStream::close() rather than delete to release them.
+ //
+ DbStream();
+ ~DbStream();
+
+ // no copying: declared private so outside code cannot copy or assign
+ DbStream(const DbStream &);
+ DbStream &operator = (const DbStream &);
+
+};
+
+//
// Transaction
//
class _exported DbTxn
@@ -1245,6 +1298,7 @@ class _exported Dbt : private DBT
friend class DbEnv;
friend class DbLogc;
friend class DbSequence;
+ friend class DbStream;
public:
// key/data
diff --git a/src/dbinc/db_dispatch.h b/src/dbinc/db_dispatch.h
index b6382871..b3aedab1 100644
--- a/src/dbinc/db_dispatch.h
+++ b/src/dbinc/db_dispatch.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright (c) 1995, 1996
diff --git a/src/dbinc/db_int.in b/src/dbinc/db_int.in
index 42439107..593deef6 100644
--- a/src/dbinc/db_int.in
+++ b/src/dbinc/db_int.in
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -73,6 +73,17 @@
#endif /* !HAVE_SYSTEM_INCLUDE_FILES */
+/*
+ * The Windows compiler needs to be told about structures that are available
+ * outside a dll.
+ */
+#if defined(DB_WIN32) && defined(_MSC_VER) && \
+ !defined(DB_CREATE_DLL) && !defined(_LIB)
+#define __DB_IMPORT __declspec(dllimport)
+#else
+#define __DB_IMPORT
+#endif
+
#ifdef DB_WIN32
#include "dbinc/win_db.h"
#endif
@@ -88,22 +99,12 @@
#include "dbinc/queue.h"
#include "dbinc/shqueue.h"
#include "dbinc/perfmon.h"
+#include "dbinc/clock.h"
#if defined(__cplusplus)
extern "C" {
#endif
-/*
- * The Windows compiler needs to be told about structures that are available
- * outside a dll.
- */
-#if defined(DB_WIN32) && defined(_MSC_VER) && \
- !defined(DB_CREATE_DLL) && !defined(_LIB)
-#define __DB_IMPORT __declspec(dllimport)
-#else
-#define __DB_IMPORT
-#endif
-
/*******************************************************
* Forward structure declarations.
*******************************************************/
@@ -366,22 +367,27 @@ typedef struct __fn {
/*
* Structure used for callback message aggregation.
*
- * Display values in XXX_stat_print calls.
+ * DB_MSGBUF_FLUSH displays values in XXX_stat_print calls.
+ * DB_MSGBUF_REP_FLUSH displays replication system messages.
*/
typedef struct __db_msgbuf {
char *buf; /* Heap allocated buffer. */
char *cur; /* Current end of message. */
size_t len; /* Allocated length of buffer. */
+ int flags;
} DB_MSGBUF;
+#define DB_MSGBUF_PREALLOCATED 0x0001
+
#define DB_MSGBUF_INIT(a) do { \
(a)->buf = (a)->cur = NULL; \
- (a)->len = 0; \
+ (a)->len = (a)->flags = 0; \
} while (0)
#define DB_MSGBUF_FLUSH(env, a) do { \
if ((a)->buf != NULL) { \
if ((a)->cur != (a)->buf) \
__db_msg(env, "%s", (a)->buf); \
- __os_free(env, (a)->buf); \
+ if (!F_ISSET((a), DB_MSGBUF_PREALLOCATED)) \
+ __os_free(env, (a)->buf); \
DB_MSGBUF_INIT(a); \
} \
} while (0)
@@ -392,18 +398,14 @@ typedef struct __db_msgbuf {
if (regular_msg) \
DB_MSGBUF_FLUSH(env, a); \
else { \
- __os_free(env, (a)->buf); \
+ if (!F_ISSET((a), DB_MSGBUF_PREALLOCATED)) \
+ __os_free(env, (a)->buf); \
DB_MSGBUF_INIT(a); \
} \
} \
} while (0)
-#define STAT_FMT(msg, fmt, type, v) do { \
- DB_MSGBUF __mb; \
- DB_MSGBUF_INIT(&__mb); \
- __db_msgadd(env, &__mb, fmt, (type)(v)); \
- __db_msgadd(env, &__mb, "\t%s", msg); \
- DB_MSGBUF_FLUSH(env, &__mb); \
-} while (0)
+/*
+ * STAT_FMT --
+ * Print one statistics line through __db_msg(): the value v, cast to
+ * type and formatted with fmt, then a tab, then the description msg.
+ * fmt must be a string literal because it is pasted onto "\t%s".
+ * The macro uses the variable `env' from the calling scope.
+ *
+ * There is deliberately no trailing semicolon: the caller supplies it,
+ * so the expansion stays a single expression statement and remains
+ * safe inside an unbraced if/else.
+ */
+#define STAT_FMT(msg, fmt, type, v) \
+ __db_msg(env, fmt "\t%s", (type)(v), msg)
#define STAT_HEX(msg, v) \
__db_msg(env, "%#lx\t%s", (u_long)(v), msg)
#define STAT_ISSET(msg, p) \
@@ -441,25 +443,21 @@ typedef struct __db_msgbuf {
*
* Error message IDs are automatically assigned by dist/s_message_id script.
*/
-#ifdef HAVE_LOCALIZATION
-#define _(msg) msg /* Replace with localization function. */
-#else
-#define _(msg) msg
-#endif
-
#ifdef HAVE_STRIPPED_MESSAGES
#define DB_STR_C(msg, fmt) fmt
#else
-#define DB_STR_C(msg, fmt) _(msg)
+#define DB_STR_C(msg, fmt) msg
#endif
-#define DB_MSGID(id) "BDB" id
-
-#define DB_STR(id, msg) DB_MSGID(id) " " DB_STR_C(msg, "")
-
-#define DB_STR_A(id, msg, fmt) DB_MSGID(id) " " DB_STR_C(msg, fmt)
+#ifdef HAVE_LOCALIZATION
+#define _(msg) (msg) /* Replace with localization function. */
+#else
+#define _(msg) msg
+#endif
-#define DB_STR_P(msg) _(msg)
+#define DB_STR(id, msg) _("BDB" id " " DB_STR_C(msg, ""))
+#define DB_STR_A(id, msg, fmt) _("BDB" id " " DB_STR_C(msg, fmt))
+#define DB_STR_P(msg) _(msg)
/*
* There are quite a few places in Berkeley DB where we want to initialize
@@ -542,6 +540,7 @@ typedef struct __db_msgbuf {
/* Type passed to __db_appname(). */
typedef enum {
DB_APP_NONE=0, /* No type (region). */
+ DB_APP_BLOB, /* Blob file. */
DB_APP_DATA, /* Data file. */
DB_APP_LOG, /* Log file. */
DB_APP_META, /* Persistent metadata file. */
@@ -612,8 +611,13 @@ typedef enum {
if (F_ISSET((env), ENV_OPEN_CALLED)) \
ENV_REQUIRES_CONFIG(env, handle, i, flags)
+/*
+ * The ENV_ENTER and ENV_LEAVE macros announce to other threads that
+ * the current thread is entering or leaving the BDB api.
+ */
#define ENV_ENTER_RET(env, ip, ret) do { \
ret = 0; \
+ DISCARD_HISTORY(env); \
PANIC_CHECK_RET(env, ret); \
if (ret == 0) { \
if ((env)->thr_hashtab == NULL) \
@@ -631,6 +635,10 @@ typedef enum {
return (__ret); \
} while (0)
+/*
+ * Publicize the current thread's intention to run failchk. This invokes
+ * DB_ENV->is_alive() in the mutex code, to avoid hanging on dead processes.
+ */
#define FAILCHK_THREAD(env, ip) do { \
if ((ip) != NULL) \
(ip)->dbth_state = THREAD_FAILCHK; \
@@ -638,20 +646,15 @@ typedef enum {
#define ENV_GET_THREAD_INFO(env, ip) ENV_ENTER(env, ip)
-#ifdef DIAGNOSTIC
#define ENV_LEAVE(env, ip) do { \
- if ((ip) != NULL) { \
- DB_ASSERT(env, ((ip)->dbth_state == THREAD_ACTIVE || \
- (ip)->dbth_state == THREAD_FAILCHK)); \
+ if ((ip) != NULL) { \
+ DB_ASSERT((env), (ip)->dbth_state == THREAD_ACTIVE || \
+ (ip)->dbth_state == THREAD_FAILCHK); \
(ip)->dbth_state = THREAD_OUT; \
} \
} while (0)
-#else
-#define ENV_LEAVE(env, ip) do { \
- if ((ip) != NULL) \
- (ip)->dbth_state = THREAD_OUT; \
-} while (0)
-#endif
+
+
#ifdef DIAGNOSTIC
#define CHECK_THREAD(env) do { \
if ((env)->thr_hashtab != NULL) \
@@ -688,6 +691,23 @@ typedef struct __pin_list {
} PIN_LIST;
#define PINMAX 4
+/* What a thread is currently doing with a shared latch. */
+typedef enum {
+ /* NOTE(review): presumably "entry not in use / latch not held" -- confirm. */
+ MUTEX_ACTION_UNLOCKED=0,
+ MUTEX_ACTION_INTEND_SHARE, /* Thread is attempting a read-lock. */
+ MUTEX_ACTION_SHARED /* Thread has gotten a read lock. */
+} MUTEX_ACTION;
+
+/* One (mutex, action) pair; see the MUTEX_ACTION values for the states. */
+typedef struct __mutex_state { /* SHARED */
+ db_mutex_t mutex; /* Mutex this entry refers to. */
+ MUTEX_ACTION action; /* Current MUTEX_ACTION for that mutex. */
+#ifdef DIAGNOSTIC
+ /* NOTE(review): presumably the time `action' was last set -- confirm. */
+ db_timespec when;
+#endif
+} MUTEX_STATE;
+
+#define MUTEX_STATE_MAX 10 /* It only needs enough for shared latches. */
+
+
struct __db_thread_info { /* SHARED */
pid_t dbth_pid;
db_threadid_t dbth_tid;
@@ -707,11 +727,25 @@ struct __db_thread_info { /* SHARED */
u_int16_t dbth_pinmax; /* Number of slots allocated. */
roff_t dbth_pinlist; /* List of pins. */
PIN_LIST dbth_pinarray[PINMAX]; /* Initial array of slots. */
+
+ /*
+ * While thread tracking is active this caches one of the lockers
+ * created by each thread. This locker remains allocated, with an
+ * invalid id, even after the locker id is freed.
+ */
+ roff_t dbth_local_locker;
+ /*
+ * Each latch shared by this thread has an entry here. Exclusive
+ * ownership, for both latches and mutexes, is recorded in the DB_MUTEX.
+ */
+ MUTEX_STATE dbth_latches[MUTEX_STATE_MAX];
#ifdef DIAGNOSTIC
roff_t dbth_locker; /* Current locker for this thread. */
u_int32_t dbth_check_off; /* Count of number of LOCK_OFF calls. */
#endif
+ db_timespec dbth_failtime; /* Time when its crash was detected. */
};
+
#ifdef DIAGNOSTIC
#define LOCK_CHECK_OFF(ip) if ((ip) != NULL) \
(ip)->dbth_check_off++
@@ -729,7 +763,7 @@ struct __db_thread_info { /* SHARED */
#define LOCK_CHECK(dbc, pgno, mode) NOP_STATEMENT
#endif
-typedef struct __env_thread_info {
+typedef struct __env_thread_info { /* SHARED */
u_int32_t thr_count;
u_int32_t thr_init;
u_int32_t thr_max;
@@ -803,6 +837,11 @@ struct __env {
#define ENV_DEF_DATA_LEN 100
u_int32_t data_len; /* Data length in __db_prbytes. */
+ /* Registered processes */
+ size_t num_active_pids; /* number of entries in active_pids */
+ size_t size_active_pids; /* allocated size of active_pids */
+ pid_t *active_pids; /* array of active pids */
+
/* Thread tracking */
u_int32_t thr_nbucket; /* Number of hash buckets */
DB_HASHTAB *thr_hashtab; /* Hash table of DB_THREAD_INFO */
@@ -866,6 +905,7 @@ struct __env {
#define DB_TEST_PREOPEN 10 /* before __os_open */
#define DB_TEST_REPMGR_PERM 11 /* repmgr perm/archiving tests */
#define DB_TEST_SUBDB_LOCKS 12 /* subdb locking tests */
+#define DB_TEST_REPMGR_HEARTBEAT 13 /* repmgr stop sending heartbeats */
int test_abort; /* Abort value for testing */
int test_check; /* Checkpoint value for testing */
int test_copy; /* Copy value for testing */
@@ -881,7 +921,9 @@ struct __env {
#define ENV_REF_COUNTED 0x00000100 /* Region references this handle */
#define ENV_SYSTEM_MEM 0x00000200 /* DB_SYSTEM_MEM set */
#define ENV_THREAD 0x00000400 /* DB_THREAD set */
-#define ENV_FORCE_TXN_BULK 0x00000800 /* Txns use bulk mode-for testing */
+#define ENV_FORCE_TXN_BULK 0x00000800 /* Txns use bulk mode-for testing */
+#define ENV_REMEMBER_PANIC 0x00001000 /* Panic was on during cleanup. */
+#define ENV_FORCESYNCENV 0x00002000 /* Force msync on closing. */
u_int32_t flags;
};
@@ -1106,7 +1148,6 @@ typedef struct __dbpginfo {
@db_int_def@
#include "dbinc/globals.h"
-#include "dbinc/clock.h"
#include "dbinc/debug.h"
#include "dbinc/region.h"
#include "dbinc_auto/env_ext.h"
@@ -1118,6 +1159,7 @@ typedef struct __dbpginfo {
#include "dbinc/os.h"
#include "dbinc_auto/clib_ext.h"
#include "dbinc_auto/common_ext.h"
+#include "dbinc_auto/blob_ext.h"
/*******************************************************
* Remaining Log.
diff --git a/src/dbinc/db_join.h b/src/dbinc/db_join.h
index aecf059a..8f22adcb 100644
--- a/src/dbinc/db_join.h
+++ b/src/dbinc/db_join.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1998, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/dbinc/db_page.h b/src/dbinc/db_page.h
index 2d4de2e5..4694c4cf 100644
--- a/src/dbinc/db_page.h
+++ b/src/dbinc/db_page.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -93,6 +93,7 @@ typedef struct _dbmeta33 {
u_int8_t uid[DB_FILE_ID_LEN];
} DBMETA33, DBMETA;
+
/************************************************************************
BTREE METADATA PAGE LAYOUT
************************************************************************/
@@ -113,7 +114,13 @@ typedef struct _btmeta33 {
u_int32_t re_len; /* 80-83: Recno: fixed-length record length. */
u_int32_t re_pad; /* 84-87: Recno: fixed-length record pad. */
u_int32_t root; /* 88-91: Root page. */
- u_int32_t unused2[92]; /* 92-459: Unused space. */
+ u_int32_t blob_threshold;
+ /* 92-95: Minimum blob file size. */
+ u_int32_t blob_file_lo; /* 96-99: Blob file dir id lo. */
+ u_int32_t blob_file_hi; /* 100-103: Blob file dir id hi. */
+ u_int32_t blob_sdb_lo; /* 104-107: Blob sdb dir id lo */
+ u_int32_t blob_sdb_hi; /* 108-111: Blob sdb dir id hi */
+ u_int32_t unused2[87]; /* 112-459: Unused space. */
u_int32_t crypto_magic; /* 460-463: Crypto magic number */
u_int32_t trash[3]; /* 464-475: Trash space - Do not use */
u_int8_t iv[DB_IV_BYTES]; /* 476-495: Crypto IV */
@@ -142,7 +149,13 @@ typedef struct _hashmeta33 {
#define NCACHED 32 /* number of spare points */
/* 96-223: Spare pages for overflow */
u_int32_t spares[NCACHED];
- u_int32_t unused[59]; /* 224-459: Unused space */
+ u_int32_t blob_threshold;
+ /* 224-227: Minimum blob file size. */
+ u_int32_t blob_file_lo; /* 228-231: Blob file dir id lo. */
+ u_int32_t blob_file_hi; /* 232-235: Blob file dir id hi. */
+ u_int32_t blob_sdb_lo; /* 236-239: Blob sdb dir id lo. */
+ u_int32_t blob_sdb_hi; /* 240-243: Blob sdb dir id hi. */
+ u_int32_t unused[54]; /* 244-459: Unused space */
u_int32_t crypto_magic; /* 460-463: Crypto magic number */
u_int32_t trash[3]; /* 464-475: Trash space - Do not use */
u_int8_t iv[DB_IV_BYTES]; /* 476-495: Crypto IV */
@@ -168,7 +181,10 @@ typedef struct _heapmeta {
u_int32_t gbytes; /* 80-83: GBytes for fixed size heap. */
u_int32_t bytes; /* 84-87: Bytes for fixed size heap. */
u_int32_t region_size; /* 88-91: Max region size. */
- u_int32_t unused2[92]; /* 92-459: Unused space.*/
+ u_int32_t blob_threshold; /* 92-95: Minimum blob file size. */
+ u_int32_t blob_file_lo; /* 96-99: Blob file dir id lo. */
+ u_int32_t blob_file_hi; /* 100-103: Blob file dir id hi. */
+ u_int32_t unused2[89]; /* 104-459: Unused space.*/
u_int32_t crypto_magic; /* 460-463: Crypto magic number */
u_int32_t trash[3]; /* 464-475: Trash space - Do not use */
u_int8_t iv[DB_IV_BYTES]; /* 476-495: Crypto IV */
@@ -371,6 +387,7 @@ typedef struct __heaphdr {
#define HEAP_RECSPLIT 0x01 /* Heap data record is split */
#define HEAP_RECFIRST 0x02 /* First piece of a split record */
#define HEAP_RECLAST 0x04 /* Last piece of a split record */
+#define HEAP_RECBLOB 0x08 /* Record refers to a blob */
u_int8_t flags; /* 00: Flags describing record. */
u_int8_t unused; /* 01: Padding. */
u_int16_t size; /* 02-03: The size of the stored data piece. */
@@ -384,8 +401,35 @@ typedef struct __heaphdrsplt {
u_int16_t unused; /* 14-15: Padding. */
} HEAPSPLITHDR;
+/*
+ * HEAPBLOB, the blob database record for heap.
+ * Saving bytes is not a concern for the blob record type - if too many
+ * fit onto a single page, then we're likely to introduce unnecessary
+ * contention for blobs. Using blobs implies storing large items, thus slightly
+ * more per-item overhead is acceptable.
+ * If this proves untrue, the crypto section of the record could be optional.
+ * encoding, lsn, encryption, and checksum fields are unused at the moment, but
+ * included to make adding those features easier.
+ */
+typedef struct _heapblob {
+ HEAPHDR std_hdr; /* 00-03: The standard data header */
+ u_int8_t encoding; /* 04: Encoding of blob file. */
+ u_int8_t unused[7]; /* 05-11: Padding, unused. */
+ u_int8_t chksum[DB_MAC_KEY]; /* 12-31: Checksum */
+ u_int8_t iv[DB_IV_BYTES]; /* 32-47: IV */
+ DB_LSN lsn; /* 48-55: LSN for blob file update. */
+ u_int64_t id; /* 56-63: Blob file identifier. */
+ u_int64_t size; /* 64-71: Blob file size. */
+ u_int64_t file_id; /* 72-79: File directory. */
+} HEAPBLOBHDR, HEAPBLOBHDR60P1;
+
#define HEAP_HDRSIZE(hdr) \
- (F_ISSET((hdr), HEAP_RECSPLIT) ? sizeof(HEAPSPLITHDR) : sizeof(HEAPHDR))
+ (F_ISSET((hdr), HEAP_RECSPLIT) ? sizeof(HEAPSPLITHDR) : \
+ sizeof(HEAPHDR))
+
+/*
+ * Sizes of a heap blob record: the whole HEAPBLOBHDR, and the part of it
+ * that follows the standard HEAPHDR. HEAPBLOBREC_DATA yields a byte
+ * pointer just past the HEAPHDR of the record at p; p is parenthesized
+ * so that expression arguments are cast as a whole.
+ */
+#define HEAPBLOBREC_SIZE (sizeof(HEAPBLOBHDR))
+#define HEAPBLOBREC_DSIZE (sizeof(HEAPBLOBHDR) - sizeof(HEAPHDR))
+#define HEAPBLOBREC_DATA(p) (((u_int8_t *)(p)) + sizeof(HEAPHDR))
#define HEAPPG_SZ(dbp) \
(F_ISSET((dbp), DB_AM_ENCRYPT) ? HEAPPG_SEC : \
@@ -441,12 +485,12 @@ typedef struct __heaphdrsplt {
/* Return the amount of free space on a heap data page. */
#define HEAP_FREESPACE(dbp, p) \
- (HOFFSET(p) - HEAPPG_SZ(dbp) - \
+ ((HOFFSET(p) - HEAPPG_SZ(dbp)) - \
(NUM_ENT(p) == 0 ? 0 : ((HEAP_HIGHINDX(p) + 1) * sizeof(db_indx_t))))
/* The maximum amount of data that can fit on an empty heap data page. */
#define HEAP_MAXDATASIZE(dbp) \
- ((dbp)->pgsize - HEAPPG_SZ(dbp) - sizeof(db_indx_t))
+ (((dbp)->pgsize - HEAPPG_SZ(dbp)) - sizeof(db_indx_t))
#define HEAP_FREEINDX(p) (((HEAPPG *)p)->free_indx)
#define HEAP_HIGHINDX(p) (((HEAPPG *)p)->high_indx)
@@ -549,9 +593,9 @@ typedef struct _qpage {
* The amount of overflow data stored on each page is stored in the
* hf_offset field.
*
- * The implementation reference counts overflow items as it's possible
- * for them to be promoted onto btree internal pages. The reference
- * count is stored in the entries field.
+ * Before 4.3 the implementation reference counted overflow items as it
+ * once was possible for them to be promoted onto btree internal pages.
+ * The reference count is stored in the entries field.
*/
#define OV_LEN(p) (((PAGE *)p)->hf_offset)
#define OV_REF(p) (((PAGE *)p)->entries)
@@ -571,6 +615,7 @@ typedef struct _qpage {
#define H_DUPLICATE 2 /* Duplicate key/data item. */
#define H_OFFPAGE 3 /* Overflow key/data item. */
#define H_OFFDUP 4 /* Overflow page of duplicates. */
+#define H_BLOB 5 /* Blob file data item. */
/*
* !!!
@@ -685,6 +730,78 @@ typedef struct _hoffdup {
*/
#define HOFFDUP_SIZE (sizeof(HOFFDUP))
+/*
+ * The fifth type is the H_BLOB, represented by the HBLOB structure.
+ * Saving bytes is not a concern for the blob record type - if too many
+ * fit onto a single page, then we're likely to introduce unnecessary
+ * contention for blobs. Using blobs implies storing large items, thus slightly
+ * more per-item overhead is acceptable.
+ * If this proves untrue, the crypto section of the record could be optional.
+ * encoding, encryption, and checksum fields are unused at the moment, but
+ * included to make adding those features easier.
+ */
+typedef struct _hblob {
+ u_int8_t type; /* 00: Page type and delete flag. */
+ u_int8_t encoding; /* 01: Encoding of blob file. */
+ u_int8_t unused[10]; /* 02-11: Padding, unused. */
+ u_int8_t chksum[DB_MAC_KEY]; /* 12-31: Checksum */
+ u_int8_t iv[DB_IV_BYTES]; /* 32-47: IV */
+ u_int64_t id; /* 48-55: Blob file identifier. */
+ u_int64_t size; /* 56-63: Blob file size. */
+ u_int64_t file_id; /* 64-71: File directory. */
+ u_int64_t sdb_id; /* 72-79: Subdb that owns this blob. */
+} HBLOB, HBLOB60P1;
+
+/*
+ * Byte pointers to the id and file_id fields of the HBLOB at p. The
+ * argument is parenthesized so that expression arguments are cast as a
+ * whole.
+ */
+#define HBLOB_ID(p) (((u_int8_t *)(p)) + SSZ(HBLOB, id))
+#define HBLOB_FILE_ID(p) (((u_int8_t *)(p)) + SSZ(HBLOB, file_id))
+
+/*
+ * Return an off_t version of the u_int64_t blob size.
+ * Since off_t can be a 32- or 64-bit integer on different systems, this
+ * macro is used to catch cases of overflow: when the destination o is
+ * narrower than 8 bytes and the size exceeds INT_MAX, an error is reported
+ * and ret is set to EINVAL (the truncated value is still stored in o).
+ * ret is not modified when there is no overflow.
+ *
+ * The macro expands to a single statement without a trailing semicolon,
+ * so that "if (x) GET_BLOB_SIZE(...); else ..." parses as intended; the
+ * caller supplies the semicolon.
+ */
+#define GET_BLOB_SIZE(e, p, o, ret) do { \
+ DB_ASSERT((e), sizeof(o) <= 8); \
+ if (sizeof(o) == 8) { \
+ (o) = (off_t)(p).size; \
+ } else { \
+ if ((p).size > INT_MAX) { \
+ __db_errx((e), DB_STR("0769", \
+ "Blob size overflow.")); \
+ (ret) = EINVAL; \
+ } \
+ (o) = (int32_t)(p).size; \
+ } \
+} while (0)
+
+/*
+ * Store v, converted to u_int64_t, into the named field of the blob
+ * record of struct type "type" located at p. The value is written with
+ * memcpy at the field's SSZ() offset rather than by direct assignment
+ * (presumably because p may not be aligned for 64-bit access).
+ *
+ * The temporary has a macro-private name so that a caller expression
+ * mentioning a variable called "tmp" is not captured, and the macro
+ * expands to a single statement without a trailing semicolon; the
+ * caller supplies the semicolon.
+ */
+#define SET_BLOB_FIELD(p, v, type, field) do { \
+ u_int64_t _blob_tmp; \
+ _blob_tmp = (u_int64_t)(v); \
+ memcpy((u_int8_t *)(p) + SSZ(type, field), \
+ &_blob_tmp, sizeof(u_int64_t)); \
+} while (0)
+
+/* Convenience wrappers that set one specific 64-bit blob record field. */
+#define SET_BLOB_ID(p, v, type) \
+ SET_BLOB_FIELD(p, v, type, id)
+
+#define SET_BLOB_SIZE(p, v, type) \
+ SET_BLOB_FIELD(p, v, type, size)
+
+#define SET_BLOB_FILE_ID(p, v, type) \
+ SET_BLOB_FIELD(p, v, type, file_id)
+
+#define SET_BLOB_SDB_ID(p, v, type) \
+ SET_BLOB_FIELD(p, v, type, sdb_id)
+
+/*
+ * Page space required to add a new HBLOB item to the page, with and
+ * without the index value.
+ */
+#define HBLOB_SIZE (sizeof(HBLOB))
+#define HBLOB_DSIZE (sizeof(HBLOB) - SSZA(HKEYDATA, data))
+#define HBLOB_PSIZE (HBLOB_SIZE + sizeof(db_indx_t))
+
+
/************************************************************************
BTREE PAGE LAYOUT
************************************************************************/
@@ -693,6 +810,7 @@ typedef struct _hoffdup {
#define B_KEYDATA 1 /* Key/data item. */
#define B_DUPLICATE 2 /* Duplicate key/data item. */
#define B_OVERFLOW 3 /* Overflow key/data item. */
+#define B_BLOB 4 /* Blob file key/data item. */
/*
* We have to store a deleted entry flag in the page. The reason is complex,
@@ -746,6 +864,32 @@ typedef struct _boverflow {
u_int32_t tlen; /* 08-11: Total length of item. */
} BOVERFLOW;
+/*
+ * The fourth type is the B_BLOB, represented by the BBLOB structure.
+ * Saving bytes is not a concern for the blob record type - if too many
+ * fit onto a single page, then we're likely to introduce unnecessary
+ * contention for blobs. Using blobs implies storing large items, thus slightly
+ * more per-item overhead is acceptable.
+ * The len field is set to BBLOB_DSIZE, so that a B_BLOB can be treated just
+ * like a B_KEYDATA for the purposes of moving items between or on a page.
+ * If this proves untrue, the crypto section of the record could be optional.
+ * encoding, lsn, encryption, and checksum fields are unused at the moment, but
+ * included to make adding those features easier.
+ */
+typedef struct _bblob {
+ db_indx_t len; /* 00-01: BBLOB_DSIZE. */
+ u_int8_t type; /* 02: Page type and delete flag. */
+ u_int8_t encoding; /* 03: Encoding of blob file. */
+ u_int8_t unused[8]; /* 04-11: Padding, unused. */
+ u_int8_t chksum[DB_MAC_KEY]; /* 12-31: Checksum */
+ u_int8_t iv[DB_IV_BYTES]; /* 32-47: IV */
+ u_int64_t id; /* 48-55: Blob file identifier. */
+ u_int64_t size; /* 56-63: Blob file size. */
+ u_int64_t file_id; /* 64-71: File directory. */
+ u_int64_t sdb_id; /* 72-79: Subdb that owns this blob. */
+} BBLOB, BBLOB60P1;
+/*
+ * Byte pointer to the data area of the item at p, viewed as a BKEYDATA.
+ * The argument is parenthesized so that expression arguments are cast
+ * as a whole.
+ */
+#define BBLOB_DATA(p) ((u_int8_t *)((BKEYDATA *)(p))->data)
+
/* Get a BOVERFLOW item for a specific index. */
#define GET_BOVERFLOW(dbp, pg, indx) \
((BOVERFLOW *)P_ENTRY(dbp, pg, indx))
@@ -759,13 +903,26 @@ typedef struct _boverflow {
#define BOVERFLOW_PSIZE \
(BOVERFLOW_SIZE + sizeof(db_indx_t))
+/*
+ * Page space required to add a new BBLOB item to the page, with and
+ * without the index value. BBLOB_DSIZE is used so that a B_BLOB item
+ * can be treated just like a B_KEYDATA for the purposes of moving items
+ * between or on a page, such as when doing compaction.
+ */
+#define BBLOB_SIZE \
+ ((u_int16_t)DB_ALIGN(sizeof(BBLOB), sizeof(u_int32_t)))
+#define BBLOB_DSIZE \
+ (BBLOB_SIZE - SSZA(BKEYDATA, data))
+#define BBLOB_PSIZE \
+ (BBLOB_SIZE + sizeof(db_indx_t))
+
#define BITEM_SIZE(bk) \
- (B_TYPE((bk)->type) != B_KEYDATA ? BOVERFLOW_SIZE : \
- BKEYDATA_SIZE((bk)->len))
+ (B_TYPE((bk)->type) == B_KEYDATA ? BKEYDATA_SIZE((bk)->len) : \
+ (B_TYPE((bk)->type) == B_BLOB ? BBLOB_SIZE : BOVERFLOW_SIZE))
#define BITEM_PSIZE(bk) \
- (B_TYPE((bk)->type) != B_KEYDATA ? BOVERFLOW_PSIZE : \
- BKEYDATA_PSIZE((bk)->len))
+ (B_TYPE((bk)->type) == B_KEYDATA ? BKEYDATA_PSIZE((bk)->len) : \
+ (B_TYPE((bk)->type) == B_BLOB ? BBLOB_PSIZE : BOVERFLOW_PSIZE))
/*
* Btree leaf and hash page layouts group indices in sets of two, one for the
diff --git a/src/dbinc/db_swap.h b/src/dbinc/db_swap.h
index 352ae227..06f4eb47 100644
--- a/src/dbinc/db_swap.h
+++ b/src/dbinc/db_swap.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright (c) 1990, 1993, 1994
@@ -51,15 +51,26 @@ extern "C" {
#define M_64_SWAP(a) { \
u_int64_t _tmp; \
_tmp = (u_int64_t)a; \
- ((u_int8_t *)&a)[0] = ((u_int8_t *)&_tmp)[7]; \
- ((u_int8_t *)&a)[1] = ((u_int8_t *)&_tmp)[6]; \
- ((u_int8_t *)&a)[2] = ((u_int8_t *)&_tmp)[5]; \
- ((u_int8_t *)&a)[3] = ((u_int8_t *)&_tmp)[4]; \
- ((u_int8_t *)&a)[4] = ((u_int8_t *)&_tmp)[3]; \
- ((u_int8_t *)&a)[5] = ((u_int8_t *)&_tmp)[2]; \
- ((u_int8_t *)&a)[6] = ((u_int8_t *)&_tmp)[1]; \
- ((u_int8_t *)&a)[7] = ((u_int8_t *)&_tmp)[0]; \
+ ((u_int8_t *)&(a))[0] = ((u_int8_t *)&_tmp)[7]; \
+ ((u_int8_t *)&(a))[1] = ((u_int8_t *)&_tmp)[6]; \
+ ((u_int8_t *)&(a))[2] = ((u_int8_t *)&_tmp)[5]; \
+ ((u_int8_t *)&(a))[3] = ((u_int8_t *)&_tmp)[4]; \
+ ((u_int8_t *)&(a))[4] = ((u_int8_t *)&_tmp)[3]; \
+ ((u_int8_t *)&(a))[5] = ((u_int8_t *)&_tmp)[2]; \
+ ((u_int8_t *)&(a))[6] = ((u_int8_t *)&_tmp)[1]; \
+ ((u_int8_t *)&(a))[7] = ((u_int8_t *)&_tmp)[0]; \
}
+/*
+ * Copy the 8 bytes at a to b in reversed byte order. Both arguments are
+ * evaluated eight times, so they must be free of side effects; they are
+ * parenthesized so that expression arguments are cast as a whole.
+ */
+#undef P_64_COPYSWAP
+#define P_64_COPYSWAP(a, b) do { \
+ ((u_int8_t *)(b))[0] = ((u_int8_t *)(a))[7]; \
+ ((u_int8_t *)(b))[1] = ((u_int8_t *)(a))[6]; \
+ ((u_int8_t *)(b))[2] = ((u_int8_t *)(a))[5]; \
+ ((u_int8_t *)(b))[3] = ((u_int8_t *)(a))[4]; \
+ ((u_int8_t *)(b))[4] = ((u_int8_t *)(a))[3]; \
+ ((u_int8_t *)(b))[5] = ((u_int8_t *)(a))[2]; \
+ ((u_int8_t *)(b))[6] = ((u_int8_t *)(a))[1]; \
+ ((u_int8_t *)(b))[7] = ((u_int8_t *)(a))[0]; \
+} while (0)
#undef P_64_COPY
#define P_64_COPY(a, b) { \
((u_int8_t *)b)[0] = ((u_int8_t *)a)[0]; \
@@ -113,7 +124,7 @@ extern "C" {
P_32_COPYSWAP(&_tmp, a); \
} while (0)
#undef M_32_SWAP
-#define M_32_SWAP(a) P_32_SWAP(&a)
+#define M_32_SWAP(a) P_32_SWAP(&(a))
/*
* Little endian <==> big endian 16-bit swap macros.
@@ -139,8 +150,13 @@ extern "C" {
P_16_COPYSWAP(&_tmp, a); \
} while (0)
#undef M_16_SWAP
-#define M_16_SWAP(a) P_16_SWAP(&a)
+#define M_16_SWAP(a) P_16_SWAP(&(a))
+#undef SWAP64
+#define SWAP64(p) { \
+ P_64_SWAP(p); \
+ (p) += sizeof(u_int64_t); \
+}
#undef SWAP32
#define SWAP32(p) { \
P_32_SWAP(p); \
@@ -168,6 +184,25 @@ extern "C" {
P_32_SWAP(p); \
} while (0)
+/*
+ * Copy a 64-bit value from the bytes at p into i and advance p past
+ * them. When env has ENV_LITTLEENDIAN set, the eight bytes are stored
+ * into i in reverse order (p is advanced one byte at a time); otherwise
+ * they are copied unchanged with memcpy and p is advanced by
+ * sizeof(u_int64_t). p is evaluated repeatedly and assigned to, so it
+ * must be a modifiable pointer lvalue without side effects.
+ */
+#undef DB_NTOHLL_COPYIN
+#define DB_NTOHLL_COPYIN(env, i, p) do { \
+ u_int8_t *tmp; \
+ tmp = (u_int8_t *)&(i); \
+ if (F_ISSET(env, ENV_LITTLEENDIAN)) { \
+ tmp[7] = *p++; \
+ tmp[6] = *p++; \
+ tmp[5] = *p++; \
+ tmp[4] = *p++; \
+ tmp[3] = *p++; \
+ tmp[2] = *p++; \
+ tmp[1] = *p++; \
+ tmp[0] = *p++; \
+ } else { \
+ memcpy(&(i), p, sizeof(u_int64_t)); \
+ p = (u_int8_t *)p + sizeof(u_int64_t); \
+ } \
+} while (0)
+
#undef DB_NTOHL_COPYIN
#define DB_NTOHL_COPYIN(env, i, p) do { \
u_int8_t *tmp; \
@@ -178,7 +213,7 @@ extern "C" {
tmp[1] = *p++; \
tmp[0] = *p++; \
} else { \
- memcpy(&i, p, sizeof(u_int32_t)); \
+ memcpy(&(i), p, sizeof(u_int32_t)); \
p = (u_int8_t *)p + sizeof(u_int32_t); \
} \
} while (0)
@@ -191,11 +226,29 @@ extern "C" {
tmp[1] = *p++; \
tmp[0] = *p++; \
} else { \
- memcpy(&i, p, sizeof(u_int16_t)); \
+ memcpy(&(i), p, sizeof(u_int16_t)); \
p = (u_int8_t *)p + sizeof(u_int16_t); \
} \
} while (0)
+/*
+ * Copy the 64-bit value i out to the bytes at p and advance p past them.
+ * When env has ENV_LITTLEENDIAN set, the eight bytes of i are written in
+ * reverse order through the local cursor tmp; otherwise they are copied
+ * unchanged with memcpy. The else branch has no braces, so the final
+ * assignment that advances p by sizeof(u_int64_t) runs in both cases.
+ * p is assigned to, so it must be a modifiable pointer lvalue.
+ */
+#undef DB_HTONLL_COPYOUT
+#define DB_HTONLL_COPYOUT(env, p, i) do { \
+ u_int8_t *tmp; \
+ tmp = (u_int8_t *)p; \
+ if (F_ISSET(env, ENV_LITTLEENDIAN)) { \
+ *tmp++ = ((u_int8_t *)&(i))[7]; \
+ *tmp++ = ((u_int8_t *)&(i))[6]; \
+ *tmp++ = ((u_int8_t *)&(i))[5]; \
+ *tmp++ = ((u_int8_t *)&(i))[4]; \
+ *tmp++ = ((u_int8_t *)&(i))[3]; \
+ *tmp++ = ((u_int8_t *)&(i))[2]; \
+ *tmp++ = ((u_int8_t *)&(i))[1]; \
+ *tmp++ = ((u_int8_t *)&(i))[0]; \
+ } else \
+ memcpy(p, &(i), sizeof(u_int64_t)); \
+ p = (u_int8_t *)p + sizeof(u_int64_t); \
+} while (0)
+
#undef DB_HTONL_COPYOUT
#define DB_HTONL_COPYOUT(env, p, i) do { \
u_int8_t *tmp; \
@@ -206,7 +259,7 @@ extern "C" {
*tmp++ = ((u_int8_t *)&(i))[1]; \
*tmp++ = ((u_int8_t *)&(i))[0]; \
} else \
- memcpy(p, &i, sizeof(u_int32_t)); \
+ memcpy(p, &(i), sizeof(u_int32_t)); \
p = (u_int8_t *)p + sizeof(u_int32_t); \
} while (0)
@@ -229,6 +282,13 @@ extern "C" {
*/
#define LOG_SWAPPED(env) !F_ISSET(env, ENV_LITTLEENDIAN)
+#define LOGCOPY_64(env, x, p) do { \
+ if (LOG_SWAPPED(env)) \
+ P_64_COPYSWAP((p), (x)); \
+ else \
+ memcpy((x), (p), sizeof(u_int64_t)); \
+} while (0)
+
#define LOGCOPY_32(env, x, p) do { \
if (LOG_SWAPPED(env)) \
P_32_COPYSWAP((p), (x)); \
diff --git a/src/dbinc/db_upgrade.h b/src/dbinc/db_upgrade.h
index 45fb624d..716594c9 100644
--- a/src/dbinc/db_upgrade.h
+++ b/src/dbinc/db_upgrade.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -242,6 +242,123 @@ typedef struct hashhdr { /* Disk resident portion */
*/
} HASHHDR;
+
+/************************************************************************
+ BLOB RECORD LAYOUTS
+ ************************************************************************/
+
+/*
+ * Hash BLOB record layout.
+ */
+typedef struct _hblob60 {
+ u_int8_t type; /* 00: Page type and delete flag. */
+ u_int8_t encoding; /* 01: Encoding of blob file. */
+ u_int8_t unused[2]; /* 02-03: Padding, unused. */
+ u_int32_t id_lo; /* 04-07: Blob file identifier. */
+ u_int32_t id_hi; /* 08-11: Blob file identifier. */
+ u_int32_t size_lo; /* 12-15: Blob file size. */
+ u_int32_t size_hi; /* 16-19: Blob file size. */
+ DB_LSN lsn; /* 20-27: LSN for blob file update. */
+ u_int8_t chksum[DB_MAC_KEY]; /* 28-47: Checksum */
+ u_int8_t iv[DB_IV_BYTES]; /* 48-63: IV */
+ u_int32_t file_id_lo; /* 64-67: File directory lo. */
+ u_int32_t file_id_hi; /* 68-71: File directory hi. */
+ u_int32_t sdb_id_lo; /* 72-75: Subdb that owns this blob. */
+ u_int32_t sdb_id_hi; /* 76-79: Subdb that owns this blob. */
+} HBLOB60;
+
+#define HBLOB60_SIZE (sizeof(HBLOB60))
+
+/*
+ * Btree BLOB record layout.
+ */
+typedef struct _bblob60 {
+ db_indx_t len; /* 00-01: BBLOB_DSIZE. */
+ u_int8_t type; /* 02: Page type and delete flag. */
+ u_int8_t encoding; /* 03: Encoding of blob file. */
+ u_int32_t id_lo; /* 04-07: Blob file identifier. */
+ u_int32_t id_hi; /* 08-11: Blob file identifier. */
+ u_int32_t size_lo; /* 12-15: Blob file size. */
+ u_int32_t size_hi; /* 16-19: Blob file size. */
+ DB_LSN lsn; /* 20-27: LSN for blob file update. */
+ u_int8_t chksum[DB_MAC_KEY]; /* 28-47: Checksum */
+ u_int8_t iv[DB_IV_BYTES]; /* 48-63: IV */
+ u_int32_t file_id_lo; /* 64-67: File directory lo. */
+ u_int32_t file_id_hi; /* 68-71: File directory hi. */
+ u_int32_t sdb_id_lo; /* 72-75: Subdb that owns this blob. */
+ u_int32_t sdb_id_hi; /* 76-79: Subdb that owns this blob. */
+} BBLOB60;
+
+#define BBLOB60_SIZE \
+ ((u_int16_t)DB_ALIGN(sizeof(BBLOB60), sizeof(u_int32_t)))
+/*
+ * Heap BLOB record layout.
+ */
+typedef struct _heapblob60 {
+ u_int8_t flags; /* 00: Flags describing record. */
+ u_int8_t unused; /* 01: Padding. */
+ u_int16_t size; /* 02-03: The size of the stored data piece. */
+ u_int8_t encoding; /* 04: Encoding of blob file. */
+ u_int8_t unused2[3]; /* 05-07: Padding, unused. */
+ u_int32_t id_lo; /* 08-11: Blob file identifier. */
+ u_int32_t id_hi; /* 12-15: Blob file identifier. */
+ u_int32_t size_lo; /* 16-19: Blob file size. */
+ u_int32_t size_hi; /* 20-23: Blob file size. */
+ u_int8_t unused3[4]; /* 24-27: Padding, unused. */
+ u_int8_t chksum[DB_MAC_KEY]; /* 28-47: Checksum */
+ u_int8_t iv[DB_IV_BYTES]; /* 48-63: IV */
+ DB_LSN lsn; /* 64-71: LSN for blob file update. */
+ u_int32_t file_id_lo; /* 72-75: File directory lo. */
+ u_int32_t file_id_hi; /* 76-79: File directory hi. */
+} HEAPBLOBHDR60;
+
+#define HEAPBLOBREC60_SIZE (sizeof(HEAPBLOBHDR60))
+
+/*
+ * Combine the lo/hi halves of the file directory id (or subdatabase id)
+ * of the 6.0-format blob record pointed to by p into o, via GET_LO_HI.
+ * The definitions do not end in a semicolon; the caller supplies it.
+ */
+#define GET_BLOB60_FILE_ID(e, p, o, ret) \
+ GET_LO_HI(e, (p)->file_id_lo, (p)->file_id_hi, o, ret)
+
+#define GET_BLOB60_SDB_ID(e, p, o, ret) \
+ GET_LO_HI(e, (p)->sdb_id_lo, (p)->sdb_id_hi, o, ret)
+
+/*
+ * Return a uintmax_t version of blob_id, built from the id_hi and id_lo
+ * halves of the record p. If o is 8 bytes wide the halves are combined
+ * as (id_hi << 32) + id_lo. Otherwise only id_lo is stored; a non-zero
+ * id_hi is reported as an overflow and ret is set to EINVAL. ret is not
+ * modified when there is no overflow.
+ *
+ * The macro expands to a single statement without a trailing semicolon;
+ * the caller supplies the semicolon.
+ */
+#define GET_BLOB60_ID(e, p, o, ret) do { \
+ DB_ASSERT((e), sizeof(o) <= 8); \
+ if (sizeof(o) == 8) { \
+ (o) = (p).id_hi; \
+ (o) = (o) << 32; \
+ (o) += (p).id_lo; \
+ } else { \
+ if ((p).id_hi > 0) { \
+ __db_errx((e), DB_STR("0766", \
+ "Blob identifier overflow.")); \
+ (ret) = EINVAL; \
+ } \
+ (o) = (p).id_lo; \
+ } \
+} while (0)
+
+/*
+ * Return an off_t version of blob size, built from the size_hi and
+ * size_lo halves of the record p. If o is 8 bytes wide the halves are
+ * combined as (size_hi << 32) + size_lo. Otherwise only size_lo is
+ * stored (cast to int32_t); a non-zero size_hi or a size_lo larger than
+ * INT_MAX is reported as an overflow and ret is set to EINVAL. ret is
+ * not modified when there is no overflow.
+ *
+ * The macro expands to a single statement without a trailing semicolon;
+ * the caller supplies the semicolon.
+ */
+#define GET_BLOB60_SIZE(e, p, o, ret) do { \
+ DB_ASSERT((e), sizeof(o) <= 8); \
+ if (sizeof(o) == 8) { \
+ (o) = (p).size_hi; \
+ (o) = (o) << 32; \
+ (o) += (p).size_lo; \
+ } else { \
+ if ((p).size_hi > 0) { \
+ __db_errx((e), DB_STR("0767", \
+ "Blob size overflow.")); \
+ (ret) = EINVAL; \
+ } \
+ if ((p).size_lo > INT_MAX) { \
+ __db_errx((e), DB_STR("0768", \
+ "Blob size overflow.")); \
+ (ret) = EINVAL; \
+ } \
+ (o) = (int32_t)(p).size_lo; \
+ } \
+} while (0)
+
#if defined(__cplusplus)
}
#endif
diff --git a/src/dbinc/db_verify.h b/src/dbinc/db_verify.h
index 68acbf6c..ea87680f 100644
--- a/src/dbinc/db_verify.h
+++ b/src/dbinc/db_verify.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -120,9 +120,10 @@ struct __vrfy_dbinfo {
#define SALVAGE_PRINTABLE 0x01 /* Output printable chars literally. */
#define SALVAGE_PRINTHEADER 0x02 /* Print the unknown-key header. */
#define SALVAGE_PRINTFOOTER 0x04 /* Print the unknown-key footer. */
-#define SALVAGE_HASSUBDBS 0x08 /* There are subdatabases to salvage. */
-#define VRFY_LEAFCHAIN_BROKEN 0x10 /* Lost one or more Btree leaf pgs. */
-#define VRFY_QMETA_SET 0x20 /* We've seen a QUEUE meta page and
+#define SALVAGE_STREAM_BLOB 0x08 /* Currently streaming a blob. */
+#define SALVAGE_HASSUBDBS 0x10 /* There are subdatabases to salvage. */
+#define SALVAGE_LEAFCHAIN_BROKEN 0x20 /* Lost one or more Btree leaf pgs. */
+#define SALVAGE_QMETA_SET 0x40 /* We've seen a QUEUE meta page and
set things up for it. */
u_int32_t flags;
}; /* VRFY_DBINFO */
diff --git a/src/dbinc/debug.h b/src/dbinc/debug.h
index a8da000d..5388b791 100644
--- a/src/dbinc/debug.h
+++ b/src/dbinc/debug.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1998, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -36,7 +36,13 @@ extern "C" {
#define DB_ASSERT(env, e) \
((e) ? (void)0 : __db_assert(env, #e, __FILE__, __LINE__))
#else
-#define DB_ASSERT(env, e) NOP_STATEMENT
+#define DB_ASSERT(env, e) ((void)0)
+#endif
+
+#if defined(HAVE_ERROR_HISTORY)
+#define DB_DEBUG_MSG __db_debug_msg
+#else
+#define DB_DEBUG_MSG if (0) __db_debug_msg
#endif
/*
@@ -55,10 +61,11 @@ extern "C" {
* of structure fields whose only purpose is padding, as well as when heap
* memory that was never initialized is written to disk.
*/
+#define UMRW_SET(var) UMRW_SET_VALUE((var), 0)
#ifdef UMRW
-#define UMRW_SET(v) (v) = 0
+#define UMRW_SET_VALUE(var, value) (var) = (value)
#else
-#define UMRW_SET(v) NOP_STATEMENT
+#define UMRW_SET_VALUE(var, value) NOP_STATEMENT
#endif
/*
@@ -73,6 +80,34 @@ typedef enum {
} db_error_set_t;
/*
+ * Use these macros wherever an error condition is initially noticed, e.g., when
+ * setting a value to any of the user visible error return codes, whether
+ * defined by Berkeley DB or by the operating environment (EINVAL).
+ * They save the specific source of an instance of an error code, including the
+ * time, stack, db name, current LSN, etc. If the error turns out to be
+ * important, the deferred message text is added to the text produced by
+ * __db_err(), __db_errx, and __db_syserr(). The additional information can be
+ * useful for diagnosing the behavior of applications under error conditions.
+ * It is enabled by configuring with --enable-error_history. The current
+ * implementation requires pthreads' version of thread local storage.
+ */
+#ifdef HAVE_ERROR_HISTORY
+#define USR_ERR(env, errcode) __db_diags((env), (errcode))
+#define DBC_ERR(dbc, errcode) __dbc_diags((dbc), (errcode))
+#define MUTEX_ERR(env, mutex, errcode) __mutex_diags((env), (mutex), (errcode))
+#define DISCARD_HISTORY(env) __db_deferred_discard()
+/* Save at most 10KB of error history in an API call. Adjust this as desired. */
+#define DB_ERROR_HISTORY_SIZE (10 * 1024)
+#else
+#define USR_ERR(env, errcode) (errcode)
+#define DBC_ERR(dbc, errcode) (errcode)
+#define MUTEX_ERR(env, mutex, errcode) (errcode)
+#define DISCARD_HISTORY(env) NOP_STATEMENT
+/* No space is needed when error history is disabled. */
+#define DB_ERROR_HISTORY_SIZE 0
+#endif
+
+/*
* Message handling. Use a macro instead of a function because va_list
* references to variadic arguments cannot be reset to the beginning of the
* variadic argument list (and then rescanned), by functions other than the
@@ -102,6 +137,7 @@ typedef enum {
((app_call) || F_ISSET((dbenv)->env, ENV_NO_OUTPUT_SET)))) \
__db_errfile(dbenv, error, error_set, fmt, __ap); \
va_end(__ap); \
+ DISCARD_HISTORY((dbenv)->env); \
}
#else
#define DB_REAL_ERR(dbenv, error, error_set, app_call, fmt) { \
@@ -127,6 +163,7 @@ typedef enum {
((app_call) || F_ISSET((dbenv)->env, ENV_NO_OUTPUT_SET)))) \
__db_errfile(env, error, error_set, fmt, __ap); \
va_end(__ap); \
+ DISCARD_HISTORY(env); \
}
#endif
#if defined(STDC_HEADERS) || defined(__cplusplus)
@@ -192,7 +229,7 @@ typedef enum {
#define LOG_OP(C, T, O, K, A, F) { \
DB_LSN __lsn; \
DBT __op; \
- if (DBC_LOGGING((C))) { \
+ if ((C)->dbp->log_filename != NULL && DBC_LOGGING((C))) { \
memset(&__op, 0, sizeof(__op)); \
__op.data = O; \
__op.size = (u_int32_t)strlen(O) + 1; \
diff --git a/src/dbinc/fop.h b/src/dbinc/fop.h
index 94f27f9f..7ea62023 100644
--- a/src/dbinc/fop.h
+++ b/src/dbinc/fop.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -23,6 +23,20 @@ extern "C" {
(void)__memp_set_flags((D)->mpf, DB_MPOOL_NOFILE, 0); \
} while (0)
+/*
+ * Never change the value of DB_FOP_CREATE (0x00000002),
+ * DB_FOP_APPEND (0x00000001), and DB_FOP_REDO(0x00000008),
+ * as those values are used in write_file logs.
+ */
+#define DB_FOP_APPEND 0x00000001 /* Appending to a file. */
+#define DB_FOP_CREATE 0x00000002 /* Creating the file. */
+#define DB_FOP_PARTIAL_LOG 0x00000004 /* Partial logging of file data. */
+#define DB_FOP_REDO 0x00000008 /* File operation can be redone. */
+#define DB_FOP_READONLY 0x00000010 /* File is read only. */
+#define DB_FOP_WRITE 0x00000020 /* File is writeable. */
+#define DB_FOP_SYNC_WRITE 0x00000040 /* Sync file on each write. */
+
+
#include "dbinc_auto/fileops_auto.h"
#include "dbinc_auto/fileops_ext.h"
diff --git a/src/dbinc/globals.h b/src/dbinc/globals.h
index 95e5c118..becd6365 100644
--- a/src/dbinc/globals.h
+++ b/src/dbinc/globals.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -52,21 +52,27 @@ typedef struct __db_globals {
char error_buf[40]; /* Error string buffer. */
- int uid_init; /* srand set in UID generator */
+ int random_seeded; /* Has __os_srandom been called? */
- u_long rand_next; /* rand/srand value */
+#if defined(HAVE_RANDOM_R)
+ struct random_data random_data; /* srandom_r/random_r argument */
+ char random_state[64]; /* random number state */
+#elif !defined(HAVE_RAND) && !defined(HAVE_RANDOM)
+ u_long rand_next; /* next rand value for clib/rand.c */
+#endif
u_int32_t fid_serial; /* file id counter */
int db_errno; /* Errno value if not available */
- size_t num_active_pids; /* number of entries in active_pids */
-
- size_t size_active_pids; /* allocated size of active_pids */
+ char *saved_errstr; /* saved error string from backup */
- pid_t *active_pids; /* array active pids */
+ char *time_format; /* strftime-format for printing dates */
- char *saved_errstr; /* saved error string from backup */
+#if defined(HAVE_ERROR_HISTORY) && defined(HAVE_PTHREAD_SELF)
+ pthread_key_t msgs_key;
+ pthread_once_t thread_once;
+#endif
/* Underlying OS interface jump table.*/
void (*j_assert) __P((const char *, const char *, int));
diff --git a/src/dbinc/hash.h b/src/dbinc/hash.h
index f485128a..55a64f87 100644
--- a/src/dbinc/hash.h
+++ b/src/dbinc/hash.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright (c) 1990, 1993, 1994
@@ -56,7 +56,7 @@ typedef struct hash_t {
u_int32_t h_nelem; /* Number of elements. */
/* Hash and compare functions. */
u_int32_t (*h_hash) __P((DB *, const void *, u_int32_t));
- int (*h_compare) __P((DB *, const DBT *, const DBT *));
+ int (*h_compare) __P((DB *, const DBT *, const DBT *, size_t *));
} HASH;
/* Cursor structure definitions. */
diff --git a/src/dbinc/heap.h b/src/dbinc/heap.h
index ca3407e0..bb96ebec 100644
--- a/src/dbinc/heap.h
+++ b/src/dbinc/heap.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2010, 2015 Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _DB_HEAP_H_
@@ -26,7 +26,8 @@ struct __heap { /* Heap access method. */
db_pgno_t curregion; /* The region of the next insert. */
db_pgno_t maxpgno; /* Maximum page number of a fixed size heap. */
- int curpgindx; /* The last used offset in the region's space bitmap. */
+ u_int32_t curpgindx; /* The last used offset in the
+ * region's space bitmap. */
};
struct __heap_cursor {
diff --git a/src/dbinc/hmac.h b/src/dbinc/hmac.h
index 2a495b17..f87965eb 100644
--- a/src/dbinc/hmac.h
+++ b/src/dbinc/hmac.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/dbinc/lock.h b/src/dbinc/lock.h
index eab51832..298b8527 100644
--- a/src/dbinc/lock.h
+++ b/src/dbinc/lock.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -37,7 +37,10 @@ extern "C" {
*/
#define LOCK_INVALID INVALID_ROFF
#define LOCK_ISSET(lock) ((lock).off != LOCK_INVALID)
-#define LOCK_INIT(lock) ((lock).off = LOCK_INVALID)
+#define LOCK_INIT(lock) do { \
+ (lock).off = LOCK_INVALID; \
+ UMRW_SET_VALUE((lock).mode, DB_LOCK_NG); \
+} while(0)
/*
* Macro to identify a write lock for the purpose of counting locks
@@ -66,8 +69,8 @@ extern "C" {
typedef struct __db_lockregion { /* SHARED */
db_mutex_t mtx_region; /* Region mutex. */
- u_int32_t need_dd; /* flag for deadlock detector */
- u_int32_t detect; /* run dd on every conflict */
+ u_int32_t need_dd; /* run dd on every conflict */
+ u_int32_t detect; /* flag for deadlock detector */
db_timespec next_timeout; /* next time to expire a lock */
db_mutex_t mtx_dd; /* mutex for lock object dd list. */
db_mutex_t mtx_lockers; /* mutex for locker allocation. */
@@ -92,7 +95,7 @@ typedef struct __db_lockregion { /* SHARED */
u_int32_t lock_id; /* Current lock(er) id to allocate. */
u_int32_t cur_maxid; /* Current max lock(er) id. */
- u_int32_t nlockers; /* Current number of lockers. */
+ u_int32_t nlockers; /* Current number of locker ids. */
int32_t nmodes; /* Number of modes in conflict table. */
DB_LOCK_STAT stat; /* stats about locking. */
} DB_LOCKREGION;
@@ -157,12 +160,16 @@ struct __db_locker { /* SHARED */
db_timespec lk_expire; /* When current lock expires. */
db_timespec tx_expire; /* When this txn expires. */
db_timeout_t lk_timeout; /* How long do we let locks live. */
+#ifdef DIAGNOSTIC
+ roff_t prev_locker; /* The thread's previous dbth_locker. */
+#endif
#define DB_LOCKER_DIRTY 0x0001 /* Has write locks. */
#define DB_LOCKER_INABORT 0x0002 /* Is aborting, don't abort again. */
#define DB_LOCKER_TIMEOUT 0x0004 /* Has timeout set. */
#define DB_LOCKER_FAMILY_LOCKER 0x0008 /* Part of a family of lockers. */
#define DB_LOCKER_HANDLE_LOCKER 0x0010 /* Not associated with a thread. */
+#define DB_LOCKER_FREE 0x0020 /* Diag: it is on the free list. */
u_int32_t flags;
};
diff --git a/src/dbinc/log.h b/src/dbinc/log.h
index c4dea6fc..2e2929f0 100644
--- a/src/dbinc/log.h
+++ b/src/dbinc/log.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -55,6 +55,8 @@ struct __fname {
/* number of txn referencing + 1 for the db handle. */
u_int32_t txn_ref;
+ db_seq_t blob_file_id; /* BLOB file directory id. */
+
#define DB_FNAME_CLOSED 0x01 /* DBP was closed. */
#define DB_FNAME_DURABLE 0x02 /* File is durable. */
#define DB_FNAME_INMEM 0x04 /* File is in memory. */
@@ -137,16 +139,18 @@ struct __db_log {
ENV *env; /* Environment */
REGINFO reginfo; /* Region information. */
-#define DBLOG_AUTOREMOVE 0x01 /* Autoremove log files. */
-#define DBLOG_DIRECT 0x02 /* Do direct I/O on the log. */
-#define DBLOG_DSYNC 0x04 /* Set OS_DSYNC on the log. */
-#define DBLOG_FORCE_OPEN 0x08 /* Force the DB open even if it appears
+#define DBLOG_AUTOREMOVE 0x001 /* Autoremove log files. */
+#define DBLOG_BLOB 0x002 /* Full logging of blob data. */
+#define DBLOG_DIRECT 0x004 /* Do direct I/O on the log. */
+#define DBLOG_DSYNC 0x008 /* Set OS_DSYNC on the log. */
+#define DBLOG_FORCE_OPEN 0x010 /* Force the DB open even if it appears
* to be deleted. */
-#define DBLOG_INMEMORY 0x10 /* Logging is in memory. */
-#define DBLOG_OPENFILES 0x20 /* Prepared files need to be open. */
-#define DBLOG_RECOVER 0x40 /* We are in recovery. */
-#define DBLOG_ZERO 0x80 /* Zero fill the log. */
-#define DBLOG_VERIFYING 0x100 /* The log is being verified. */
+#define DBLOG_INMEMORY 0x020 /* Logging is in memory. */
+#define DBLOG_NOSYNC 0x040 /* Don't sync log files during flush. */
+#define DBLOG_OPENFILES 0x080 /* Prepared files need to be open. */
+#define DBLOG_RECOVER 0x100 /* We are in recovery. */
+#define DBLOG_ZERO 0x200 /* Zero fill the log. */
+#define DBLOG_VERIFYING 0x400 /* The log is being verified. */
u_int32_t flags;
};
@@ -251,7 +255,8 @@ struct __log { /* SHARED */
* rather than by the region mutex.
*/
db_mutex_t mtx_flush; /* Mutex guarding flushing. */
- int32_t in_flush; /* Log flush in progress. */
+ int32_t in_flush; /* Log flush in progress. */
+ int32_t nosync; /* log_set_config(DB_LOG_NOSYNC) */
DB_LSN s_lsn; /* LSN of the last sync. */
DB_LOG_STAT stat; /* Log statistics. */
diff --git a/src/dbinc/log_verify.h b/src/dbinc/log_verify.h
index fa90ace4..ec43c4d7 100644
--- a/src/dbinc/log_verify.h
+++ b/src/dbinc/log_verify.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/dbinc/mp.h b/src/dbinc/mp.h
index 9a10c6d9..598ca366 100644
--- a/src/dbinc/mp.h
+++ b/src/dbinc/mp.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -226,10 +226,15 @@ struct __mpool { /* SHARED */
#define DB_MEMP_SYNC_INTERRUPT 0x02
u_int32_t config_flags;
- /* Free frozen buffer headers, protected by the region lock. */
+ /* These MVCC fields are protected by the mpool region lock. */
+
+ /* This is the free list of BH_FROZEN_PAGEs, the frozen headers. */
SH_TAILQ_HEAD(__free_frozen) free_frozen;
- /* Allocated blocks of frozen buffer headers. */
+ /*
+ * This list of BH_FROZEN_ALLOCs contains all the BH_FROZEN_PAGEs,
+ * whether they are in free_frozen or busy (in a bh.vc version chain).
+ */
SH_TAILQ_HEAD(__alloc_frozen) alloc_frozen;
};
@@ -550,9 +555,10 @@ struct __bh { /* SHARED */
#define BH_FROZEN 0x040 /* Frozen buffer: allocate & re-read. */
#define BH_TRASH 0x080 /* Page is garbage. */
#define BH_THAWED 0x100 /* Page was thawed. */
+#define BH_UNREACHABLE 0x200 /* Discard this defunct MVCC version. */
u_int16_t flags;
- u_int32_t priority; /* Priority. */
+ u_int32_t priority; /* Cache priority. */
SH_TAILQ_ENTRY hq; /* MPOOL hash bucket queue. */
db_pgno_t pgno; /* Underlying MPOOLFILE page number. */
@@ -587,9 +593,12 @@ struct __bh_frozen_p {
/*
* BH_FROZEN_ALLOC --
- * Frozen buffer headers are allocated a page at a time in general. This
- * structure is allocated at the beginning of the page so that the
- * allocation chunks can be tracked and freed (for private environments).
+ * This structure is the container for one or more frozen buffer headers.
+ * Blocks of BH_FROZEN_PAGE structs are usually allocated a page at a time,
+ * though when an mpool is nearly full and a whole page isn't available
+ * there can be single-item blocks. BH_FROZEN_ALLOC is the block header
+ * allocated at the beginning of the chunk and is linked to the mpool's
+ * alloc_frozen so that the allocation chunks can be tracked and freed.
*/
struct __bh_frozen_a {
SH_TAILQ_ENTRY links;
@@ -602,33 +611,36 @@ struct __bh_frozen_a {
(F_ISSET(PAGE_TO_BH(p), BH_DIRTY|BH_EXCLUSIVE) == (BH_DIRTY|BH_EXCLUSIVE))
#define BH_OWNER(env, bhp) \
- ((TXN_DETAIL *)R_ADDR(&env->tx_handle->reginfo, bhp->td_off))
+ ((TXN_DETAIL *)R_ADDR(&(env)->tx_handle->reginfo, (bhp)->td_off))
#define BH_OWNED_BY(env, bhp, txn) ((txn) != NULL && \
- (bhp)->td_off != INVALID_ROFF && \
- (txn)->td == BH_OWNER(env, bhp))
+ (bhp)->td_off != INVALID_ROFF && (txn)->td == BH_OWNER(env, bhp))
-#define VISIBLE_LSN(env, bhp) \
- (&BH_OWNER(env, bhp)->visible_lsn)
+#define VISIBLE_LSN(env, bhp) (&BH_OWNER(env, bhp)->visible_lsn)
/*
- * Make a copy of the buffer's visible LSN, one field at a time. We rely on the
- * 32-bit operations being atomic. The visible_lsn starts at MAX_LSN and is
- * set during commit or abort to the current LSN.
+ * MVCC Versions are visible only to snapshot transactions whose read_lsn is at
+ * least as recent (large) as the buffer's lsn. Visibility checks must be made
+ * from newest to oldest along bhp.vc, stopping at the first visible one.
+ * Unversioned buffers (those with invalid td_off) are always visible.
+ *
+ * BH_VISIBLE() makes a copy of the buffer's visible LSN, one field at a time.
+ * We rely on the 32-bit operations being atomic. The visible_lsn starts at
+ * MAX_LSN and is set during commit or abort to the current LSN.
*
- * If we race with a commit / abort, we may see either the file or the offset
+ * If we race with a commit or abort, we may see either the file or the offset
* still at UINT32_MAX, so vlsn is guaranteed to be in the future. That's OK,
* since we had to take the log region lock to allocate the read LSN so we were
* never going to see this buffer anyway.
*/
#define BH_VISIBLE(env, bhp, read_lsnp, vlsn) \
(bhp->td_off == INVALID_ROFF || \
- ((vlsn).file = VISIBLE_LSN(env, bhp)->file, \
+ ((vlsn).file = VISIBLE_LSN(env, bhp)->file, \
(vlsn).offset = VISIBLE_LSN(env, bhp)->offset, \
LOG_COMPARE((read_lsnp), &(vlsn)) >= 0))
#define BH_OBSOLETE(bhp, old_lsn, vlsn) (SH_CHAIN_HASNEXT(bhp, vc) ? \
- BH_VISIBLE(env, SH_CHAIN_NEXTP(bhp, vc, __bh), &(old_lsn), vlsn) :\
+ BH_VISIBLE(env, SH_CHAIN_NEXTP(bhp, vc, __bh), &(old_lsn), vlsn) : \
BH_VISIBLE(env, bhp, &(old_lsn), vlsn))
#define MVCC_SKIP_CURADJ(dbc, pgno) (dbc->txn != NULL && \
diff --git a/src/dbinc/mutex.h b/src/dbinc/mutex.h
index b699142c..334d8f96 100644
--- a/src/dbinc/mutex.h
+++ b/src/dbinc/mutex.h
@@ -1,7 +1,7 @@
/*
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -24,10 +24,14 @@ extern "C" {
#endif
/*
- * By default, spin 50 times per processor if fail to acquire a test-and-set
- * mutex, we have anecdotal evidence it's a reasonable value.
+ * These specify the default spin parameters for test-and-set mutexes. A single
+ * processor system spins just once, a multiprocessor system spins 50 times per
+ * processor up to a default maximum of 200. This limit reduces excessive
+ * busy-waiting on machines with many hyperthreads. We have anecdotal evidence
+ * that these are reasonable default values.
*/
#define MUTEX_SPINS_PER_PROCESSOR 50
+#define MUTEX_SPINS_DEFAULT_MAX 200
/*
* Mutexes are represented by unsigned, 32-bit integral values. As the
@@ -163,13 +167,6 @@ static inline int __db_pthread_mutex_tryreadlock(ENV *env, db_mutex_t mutex)
#define __mutex_rdlock(a, b) __db_win32_mutex_readlock(a, b)
#define __mutex_tryrdlock(a, b) __db_win32_mutex_tryreadlock(a, b)
#endif
-#elif defined(HAVE_MUTEX_FCNTL)
-#define __mutex_init(a, b, c) __db_fcntl_mutex_init(a, b, c)
-#define __mutex_lock(a, b) __db_fcntl_mutex_lock(a, b, 0)
-#define __mutex_timedlock(a, b, c) __db_fcntl_lock(a, b, c)
-#define __mutex_trylock(a, b) __db_fcntl_mutex_trylock(a, b)
-#define __mutex_unlock(a, b) __db_fcntl_mutex_unlock(a, b)
-#define __mutex_destroy(a, b) __db_fcntl_mutex_destroy(a, b)
#else
#define __mutex_init(a, b, c) __db_tas_mutex_init(a, b, c)
#define __mutex_lock(a, b) __db_tas_mutex_lock(a, b, 0)
@@ -184,9 +181,8 @@ static inline int __db_pthread_mutex_tryreadlock(ENV *env, db_mutex_t mutex)
#endif
/*
- * When there is no method to get a shared latch, fall back to
- * implementing __mutex_rdlock() as getting an exclusive one.
- * This occurs either when !HAVE_SHARED_LATCHES or HAVE_MUTEX_FCNTL.
+ * When there is no method to get a shared latch, fall back to implementing
+ * __mutex_rdlock() as an exclusive one. This may no longer be supported?
*/
#ifndef __mutex_rdlock
#define __mutex_rdlock(a, b) __mutex_lock(a, b)
@@ -199,17 +195,25 @@ static inline int __db_pthread_mutex_tryreadlock(ENV *env, db_mutex_t mutex)
* Lock/unlock a mutex. If the mutex was never required, the thread of
* control can proceed without it.
*
- * We never fail to acquire or release a mutex without panicing. Simplify
+ * We rarely fail to acquire or release a mutex without panicking. Simplify
* the macros to always return a panic value rather than saving the actual
- * return value of the mutex routine.
+ * return value of the mutex routine. Use MUTEX_LOCK_RET() when the caller has
+ * a code path for a mutex failure, e.g., when cleaning up after a panic.
*/
#ifdef HAVE_MUTEX_SUPPORT
#define MUTEX_LOCK(env, mutex) do { \
- if ((mutex) != MUTEX_INVALID && \
- __mutex_lock(env, mutex) != 0) \
+ if ((mutex) != MUTEX_INVALID && __mutex_lock(env, mutex) != 0) \
return (DB_RUNRECOVERY); \
} while (0)
+#define MUTEX_LOCK_RET(env, mutex) \
+ ((mutex) == MUTEX_INVALID ? 0 : __mutex_lock(env, mutex))
+
+/*
+ * Always check the return value of MUTEX_TRYLOCK()! Expect 0 on success,
+ * or possibly DB_RUNRECOVERY for failchk.
+ */
+
/*
* Always check the return value of MUTEX_TRYLOCK()! Expect 0 on success,
* or DB_LOCK_NOTGRANTED, or possibly DB_RUNRECOVERY for failchk.
@@ -217,9 +221,7 @@ static inline int __db_pthread_mutex_tryreadlock(ENV *env, db_mutex_t mutex)
#define MUTEX_TRYLOCK(env, mutex) \
(((mutex) == MUTEX_INVALID) ? 0 : __mutex_trylock(env, mutex))
-/*
- * Acquire a DB_MUTEX_SHARED "mutex" in shared mode.
- */
+/* Acquire a latch (a DB_MUTEX_SHARED "mutex") in shared mode. */
#define MUTEX_READLOCK(env, mutex) do { \
if ((mutex) != MUTEX_INVALID && \
__mutex_rdlock(env, mutex) != 0) \
@@ -234,30 +236,68 @@ static inline int __db_pthread_mutex_tryreadlock(ENV *env, db_mutex_t mutex)
return (DB_RUNRECOVERY); \
} while (0)
-#define MUTEX_WAIT(env, mutex, duration) do { \
- int __ret; \
- if ((mutex) != MUTEX_INVALID && \
- (__ret = __mutex_timedlock(env, mutex, duration)) != 0 && \
- __ret != DB_TIMEOUT) \
- return (DB_RUNRECOVERY); \
+#define MUTEX_WAIT(env, mutex, duration) do { \
+ int __ret; \
+ if ((mutex) != MUTEX_INVALID && \
+ (__ret = __mutex_timedlock(env, mutex, duration)) != 0 && \
+ __ret != DB_TIMEOUT) \
+ return (DB_RUNRECOVERY); \
} while (0)
+
+/*
+ * Check that a particular mutex is exclusively held at least by someone, not
+ * necessarily the current thread.
+ */
+#define MUTEX_IS_OWNED(env, mutex) \
+ (mutex == MUTEX_INVALID || !MUTEX_ON(env) || \
+ F_ISSET(env->dbenv, DB_ENV_NOLOCKING) || \
+ F_ISSET(MUTEXP_SET(env, mutex), DB_MUTEX_LOCKED))
#else
/*
* There are calls to lock/unlock mutexes outside of #ifdef's -- replace
* the call with something the compiler can discard, but which will make
- * if-then-else blocks work correctly.
+ * if-then-else blocks work correctly, and suppress unused variable messages.
+ */
+#define MUTEX_LOCK(env, mutex) { env = (env); mutex = (mutex); }
+#define MUTEX_LOCK_RET(env, mutex) ( env = (env), mutex = (mutex), 0)
+#define MUTEX_TRYLOCK(env, mutex) ( env = (env), mutex = (mutex), 0)
+#define MUTEX_READLOCK(env, mutex) { env = (env); mutex = (mutex); }
+#define MUTEX_TRY_READLOCK(env, mutex) ( env = (env), mutex = (mutex), 0 )
+#define MUTEX_UNLOCK(env, mutex) { env = (env); mutex = (mutex); }
+#define MUTEX_REQUIRED(env, mutex) { env = (env); mutex = (mutex); }
+#define MUTEX_REQUIRED_READ(env, mutex) { env = (env); mutex = (mutex); }
+#define MUTEX_WAIT(env, mutex, duration) { \
+ (env) = (env); (mutex) = (mutex); (duration) = (duration); \
+}
+
+/*
+ * Every MUTEX_IS_OWNED() caller expects to own it. When there is no mutex
+ * support, act as if we have ownership.
*/
-#define MUTEX_LOCK(env, mutex) (mutex) = (mutex)
-#define MUTEX_TRYLOCK(env, mutex) (mutex) = (mutex)
-#define MUTEX_READLOCK(env, mutex) (mutex) = (mutex)
-#define MUTEX_TRY_READLOCK(env, mutex) (mutex) = (mutex)
-#define MUTEX_UNLOCK(env, mutex) (mutex) = (mutex)
-#define MUTEX_REQUIRED(env, mutex) (mutex) = (mutex)
-#define MUTEX_REQUIRED_READ(env, mutex) (mutex) = (mutex)
-#define MUTEX_WAIT(env, mutex, duration) (mutex) = (mutex)
+#define MUTEX_IS_OWNED(env, mutex) 1
#endif
/*
+ * Bulk initialization of mutexes in regions.
+ */
+
+#define MUTEX_BULK_INIT(env, region, start, howmany) do { \
+ DB_MUTEX *__mutexp; \
+ db_mutex_t __i = start; \
+ u_int32_t __n = howmany; \
+ for (__mutexp = MUTEXP_SET(env, __i); \
+ --__n > 0; \
+ __mutexp = MUTEXP_SET(env, __i)) { \
+ __mutexp->flags = 0; \
+ __i = (F_ISSET(env, ENV_PRIVATE)) ? \
+ ((uintptr_t)__mutexp + region->mutex_size) : __i + 1; \
+ __mutexp->mutex_next_link = __i; \
+ } \
+ __mutexp->flags = 0; \
+ __mutexp->mutex_next_link = MUTEX_INVALID; \
+} while (0)
+
+/*
* Berkeley DB ports may require single-threading at places in the code.
*/
#ifdef HAVE_MUTEX_VXWORKS
diff --git a/src/dbinc/mutex_int.h b/src/dbinc/mutex_int.h
index b9bccdf7..4a4468af 100644
--- a/src/dbinc/mutex_int.h
+++ b/src/dbinc/mutex_int.h
@@ -1,7 +1,7 @@
/*
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -73,6 +73,14 @@ extern "C" {
else \
RET_SET((pthread_mutex_lock(&(mutexp)->u.m.mutex)), ret); \
} while (0)
+#define RET_SET_PTHREAD_TIMEDLOCK(mutexp, timespec, ret) do { \
+ if (F_ISSET(mutexp, DB_MUTEX_SHARED)) \
+ RET_SET(pthread_rwlock_timedwrlock(&(mutexp)->u.rwlock, \
+ (timespec)), ret); \
+ else \
+ RET_SET(pthread_mutex_timedlock(&(mutexp)->u.m.mutex, \
+ (timespec)), ret); \
+} while (0)
#define RET_SET_PTHREAD_TRYLOCK(mutexp, ret) do { \
if (F_ISSET(mutexp, DB_MUTEX_SHARED)) \
RET_SET((pthread_rwlock_trywrlock(&(mutexp)->u.rwlock)), \
@@ -84,6 +92,9 @@ extern "C" {
#else
#define RET_SET_PTHREAD_LOCK(mutexp, ret) \
RET_SET(pthread_mutex_lock(&(mutexp)->u.m.mutex), ret);
+#define RET_SET_PTHREAD_TIMEDLOCK(mutexp, timespec, ret) \
+ RET_SET(pthread_mutex_timedlock(&(mutexp)->u.m.mutex, \
+ (timespec)), ret);
#define RET_SET_PTHREAD_TRYLOCK(mutexp, ret) \
RET_SET(pthread_mutex_trylock(&(mutexp)->u.m.mutex), ret);
#endif
@@ -267,6 +278,11 @@ typedef abilock_t tsl_t;
#include <sys/machlock.h>
typedef lock_t tsl_t;
+/*
+ * Solaris requires 8 byte alignment for pthread_mutex_t values.
+ */
+#define MUTEX_ALIGN 8
+
/*
* The functions are declared in <sys/machlock.h>, but under #ifdef KERNEL.
* Re-declare them here to avoid warnings.
@@ -778,6 +794,7 @@ MUTEX_SET(tsl_t *tsl) {
static inline void
MUTEX_UNSET(tsl_t *tsl) {
__asm__ volatile(
+ " .set mips2 \n"
" .set noreorder \n"
" sync \n"
" sw $0, %0 \n"
@@ -892,15 +909,22 @@ struct __db_mutexmgr {
REGINFO reginfo; /* Region information */
void *mutex_array; /* Base of the mutex array */
+#ifdef HAVE_FAILCHK_BROADCAST
+ /*
+ * The mutex lock functions wait for at most this long between checks
+ * for DB_MUTEX_OWNER_DEAD. This field needs no mutex protection.
+ */
+ db_timeout_t failchk_polltime;
+#endif
};
/* Macros to lock/unlock the mutex region as a whole. */
-#define MUTEX_SYSTEM_LOCK(dbenv) \
- MUTEX_LOCK(dbenv, ((DB_MUTEXREGION *) \
- (dbenv)->mutex_handle->reginfo.primary)->mtx_region)
-#define MUTEX_SYSTEM_UNLOCK(dbenv) \
- MUTEX_UNLOCK(dbenv, ((DB_MUTEXREGION *) \
- (dbenv)->mutex_handle->reginfo.primary)->mtx_region)
+#define MUTEX_SYSTEM_LOCK(env) \
+ MUTEX_LOCK(env, ((DB_MUTEXREGION *) \
+ (env)->mutex_handle->reginfo.primary)->mtx_region)
+#define MUTEX_SYSTEM_UNLOCK(env) \
+ MUTEX_UNLOCK(env, ((DB_MUTEXREGION *) \
+ (env)->mutex_handle->reginfo.primary)->mtx_region)
/*
* DB_MUTEXREGION --
@@ -927,6 +951,16 @@ typedef struct __db_mutexregion { /* SHARED */
} DB_MUTEXREGION;
#ifdef HAVE_MUTEX_SUPPORT
+/*
+ * MUTEX_DIAG turns on the recording of when and where a mutex was locked. It
+ * has a large impact, and should only be turned on when debugging mutexes.
+ */
+#define MUTEX_STACK_TEXT_SIZE 600
+typedef struct __mutex_history { /* SHARED */
+ db_timespec when;
+ char stacktext[MUTEX_STACK_TEXT_SIZE];
+} MUTEX_HISTORY;
+
struct __db_mutex_t { /* SHARED */ /* Mutex. */
#ifdef MUTEX_FIELDS
MUTEX_FIELDS /* Opaque thread mutex structures. */
@@ -959,9 +993,9 @@ struct __db_mutex_t { /* SHARED */ /* Mutex. */
db_mutex_t mutex_next_link; /* Linked list of free mutexes. */
-#ifdef HAVE_STATISTICS
int alloc_id; /* Allocation ID. */
+#ifdef HAVE_STATISTICS
u_int32_t mutex_set_wait; /* Granted after wait. */
u_int32_t mutex_set_nowait; /* Granted without waiting. */
#ifdef HAVE_SHARED_LATCHES
@@ -973,7 +1007,9 @@ struct __db_mutex_t { /* SHARED */ /* Mutex. */
u_int32_t hybrid_wakeup; /* for counting spurious wakeups */
#endif
#endif
-
+#ifdef MUTEX_DIAG
+ MUTEX_HISTORY mutex_history;
+#endif
/*
* A subset of the flag arguments for __mutex_alloc().
*
@@ -992,19 +1028,6 @@ struct __db_mutex_t { /* SHARED */ /* Mutex. */
(indx) * \
((DB_MUTEXREGION *)env->mutex_handle->reginfo.primary)->mutex_size))
-/*
- * Check that a particular mutex is exclusively held at least by someone, not
- * necessarily the current thread.
- */
-#ifdef HAVE_MUTEX_SUPPORT
-#define MUTEX_IS_OWNED(env, mutex) \
- (mutex == MUTEX_INVALID || !MUTEX_ON(env) || \
- F_ISSET(env->dbenv, DB_ENV_NOLOCKING) || \
- F_ISSET(MUTEXP_SET(env, mutex), DB_MUTEX_LOCKED))
-#else
-#define MUTEX_IS_OWNED(env, mutex) 0
-#endif
-
#if defined(HAVE_MUTEX_HYBRID) || defined(DB_WIN32) || \
(defined(HAVE_SHARED_LATCHES) && !defined(HAVE_MUTEX_PTHREADS))
#define MUTEXP_IS_BUSY(mutexp) \
diff --git a/src/dbinc/os.h b/src/dbinc/os.h
index 2515e6ee..ea1fd2c4 100644
--- a/src/dbinc/os.h
+++ b/src/dbinc/os.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/dbinc/partition.h b/src/dbinc/partition.h
index 09e42573..11cdfa6f 100644
--- a/src/dbinc/partition.h
+++ b/src/dbinc/partition.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
/*
* $Id$
@@ -22,6 +22,7 @@ typedef struct __db_partition {
u_int32_t (*callback) (DB *, DBT *);
#define PART_CALLBACK 0x01
#define PART_RANGE 0x02
+#define PART_KEYS_SETUP 0x04
u_int32_t flags;
} DB_PARTITION;
@@ -36,7 +37,14 @@ typedef struct __part_internal {
#ifdef HAVE_PARTITION
#define PART_NAME "__dbp.%s.%03d"
-#define PART_LEN (strlen("__dbp..")+3)
+/*
+ * Currently we support at most 1000000 partitions.
+ * If the limit is changed, the PART_DIGITS and PART_MAXIMUM
+ * should be changed accordingly.
+ */
+#define PART_DIGITS 6
+#define PART_MAXIMUM 1000000
+#define PART_LEN (sizeof("__dbp..") + PART_DIGITS)
#define PART_PREFIX "__dbp."
#define IS_PARTITION_DB_FILE(name) (strncmp(name, PART_PREFIX, \
sizeof(PART_PREFIX) - 1) == 0)
diff --git a/src/dbinc/perfmon.h b/src/dbinc/perfmon.h
index c3b9b9fa..e89eba33 100644
--- a/src/dbinc/perfmon.h
+++ b/src/dbinc/perfmon.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2010, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/dbinc/qam.h b/src/dbinc/qam.h
index 657c11e2..d18f91f3 100644
--- a/src/dbinc/qam.h
+++ b/src/dbinc/qam.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/dbinc/queue.h b/src/dbinc/queue.h
index 5a62741a..c53941ab 100644
--- a/src/dbinc/queue.h
+++ b/src/dbinc/queue.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright (c) 1991, 1993
diff --git a/src/dbinc/region.h b/src/dbinc/region.h
index ac0ff16f..070aff5f 100644
--- a/src/dbinc/region.h
+++ b/src/dbinc/region.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1998, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -134,7 +134,10 @@ typedef enum {
REGION_TYPE_LOG,
REGION_TYPE_MPOOL,
REGION_TYPE_MUTEX,
- REGION_TYPE_TXN } reg_type_t;
+ REGION_TYPE_TXN,
+ /* This enumerator must always be last; it is the largest valid type. */
+ REGION_TYPE_MAX = REGION_TYPE_TXN
+} reg_type_t;
#define INVALID_REGION_SEGID -1 /* Segment IDs are either shmget(2) or
* Win16 segment identifiers. They are
@@ -196,10 +199,10 @@ typedef struct __db_reg_env { /* SHARED */
/*
- * The mtx_regenv mutex protects the environment reference count and
- * memory allocation from the primary shared region (the crypto, thread
- * control block and replication implementations allocate memory from
- * the primary shared region).
+ * The mtx_regenv mutex protects the environment reference count,
+ * blob threshold and memory allocation from the primary shared region
+ * (the crypto, thread control block and replication implementations
+ * allocate memory from the primary shared region).
*
* The rest of the fields are initialized at creation time, and don't
* need mutex protection. The flags, op_timestamp and rep_timestamp
@@ -209,6 +212,7 @@ typedef struct __db_reg_env { /* SHARED */
*/
db_mutex_t mtx_regenv; /* Refcnt, region allocation mutex. */
u_int32_t refcnt; /* References to the environment. */
+ u_int32_t blob_threshold; /* Environment wide blob threshold. */
u_int32_t region_cnt; /* Number of REGIONs. */
roff_t region_off; /* Offset of region array */
@@ -227,6 +231,8 @@ typedef struct __db_reg_env { /* SHARED */
time_t op_timestamp; /* Timestamp for operations. */
time_t rep_timestamp; /* Timestamp for rep db handles. */
u_int32_t reg_panic; /* DB_REGISTER triggered panic */
+ u_int32_t failure_panic; /* Failchk or mutex lock saw a crash. */
+ char failure_symptom[DB_FAILURE_SYMPTOM_SIZE];
uintmax_t unused; /* The ALLOC_LAYOUT structure follows
* the REGENV structure in memory and
* contains uintmax_t fields. Force
@@ -308,11 +314,14 @@ struct __db_reginfo_t { /* __env_region_attach IN parameters. */
/*
* PANIC_ISSET, PANIC_CHECK:
- * Check to see if the DB environment is dead.
+ * Check to see if the DB environment is dead. If the environment is still
+ * attached to its regions, look in the REGENV. Otherwise, check whether
+ * the region had the panic state set when this env detached from it.
*/
#define PANIC_ISSET(env) \
- ((env) != NULL && (env)->reginfo != NULL && \
- ((REGENV *)(env)->reginfo->primary)->panic != 0 && \
+ ((env) != NULL && ((env)->reginfo != NULL ? \
+ ((REGENV *)(env)->reginfo->primary)->panic != 0 : \
+ F_ISSET(env, ENV_REMEMBER_PANIC)) && \
!F_ISSET((env)->dbenv, DB_ENV_NOPANIC))
#define PANIC_CHECK(env) \
diff --git a/src/dbinc/rep.h b/src/dbinc/rep.h
index 75004239..f3bdf481 100644
--- a/src/dbinc/rep.h
+++ b/src/dbinc/rep.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -19,6 +19,7 @@ extern "C" {
* Names of client temp databases.
*/
#define REPFILEPREFIX "__db.rep"
+#define REPBLOBNAME "__db.rep.blob.db"
#define REPDBNAME "__db.rep.db"
#define REPPAGENAME "__db.reppg.db"
@@ -42,43 +43,58 @@ extern "C" {
/*
* Message types
*/
-#define REP_INVALID 0 /* Invalid message type. */
-#define REP_ALIVE 1 /* I am alive message. */
-#define REP_ALIVE_REQ 2 /* Request for alive messages. */
-#define REP_ALL_REQ 3 /* Request all log records greater than LSN. */
-#define REP_BULK_LOG 4 /* Bulk transfer of log records. */
-#define REP_BULK_PAGE 5 /* Bulk transfer of pages. */
-#define REP_DUPMASTER 6 /* Duplicate master detected; propagate. */
-#define REP_FILE 7 /* Page of a database file. NOTUSED */
-#define REP_FILE_FAIL 8 /* File requested does not exist. */
-#define REP_FILE_REQ 9 /* Request for a database file. NOTUSED */
-#define REP_LEASE_GRANT 10 /* Client grants a lease to a master. */
-#define REP_LOG 11 /* Log record. */
-#define REP_LOG_MORE 12 /* There are more log records to request. */
-#define REP_LOG_REQ 13 /* Request for a log record. */
-#define REP_MASTER_REQ 14 /* Who is the master */
-#define REP_NEWCLIENT 15 /* Announces the presence of a new client. */
-#define REP_NEWFILE 16 /* Announce a log file change. */
-#define REP_NEWMASTER 17 /* Announces who the master is. */
-#define REP_NEWSITE 18 /* Announces that a site has heard from a new
- * site; like NEWCLIENT, but indirect. A
- * NEWCLIENT message comes directly from the new
- * client while a NEWSITE comes indirectly from
- * someone who heard about a NEWSITE.
- */
-#define REP_PAGE 19 /* Database page. */
-#define REP_PAGE_FAIL 20 /* Requested page does not exist. */
-#define REP_PAGE_MORE 21 /* There are more pages to request. */
-#define REP_PAGE_REQ 22 /* Request for a database page. */
-#define REP_REREQUEST 23 /* Force rerequest. */
-#define REP_START_SYNC 24 /* Tell client to begin syncing a ckp.*/
-#define REP_UPDATE 25 /* Environment hotcopy information. */
-#define REP_UPDATE_REQ 26 /* Request for hotcopy information. */
-#define REP_VERIFY 27 /* A log record for verification. */
-#define REP_VERIFY_FAIL 28 /* The client is outdated. */
-#define REP_VERIFY_REQ 29 /* Request for a log record to verify. */
-#define REP_VOTE1 30 /* Send out your information for an election. */
-#define REP_VOTE2 31 /* Send a "you are master" vote. */
+#define REP_INVALID 0 /* Invalid message type. */
+#define REP_ALIVE 1 /* I am alive message. */
+#define REP_ALIVE_REQ 2 /* Request for alive messages. */
+#define REP_ALL_REQ 3 /* Request all log records greater than
+ * LSN. */
+#define REP_BLOB_ALL_REQ 4 /* Request all the given blob files. */
+#define REP_BLOB_CHUNK 5 /* A piece of data contained in a blob
+ * file. */
+#define REP_BLOB_CHUNK_REQ 6 /* Request a piece of data from a blob
+ * file. */
+#define REP_BLOB_UPDATE 7 /* A list of blob files for a
+ * database. */
+#define REP_BLOB_UPDATE_REQ 8 /* Request blob files. */
+#define REP_BULK_LOG 9 /* Bulk transfer of log records. */
+#define REP_BULK_PAGE 10 /* Bulk transfer of pages. */
+#define REP_DUPMASTER 11 /* Duplicate master detected;
+ * propagate. */
+#define REP_FILE 12 /* Page of a database file. NOTUSED */
+#define REP_FILE_FAIL 13 /* File requested does not exist. */
+#define REP_FILE_REQ 14 /* Request for a database file.
+ * NOTUSED */
+#define REP_LEASE_GRANT 15 /* Client grants a lease to a master. */
+#define REP_LOG 16 /* Log record. */
+#define REP_LOG_MORE 17 /* There are more log records to
+ * request. */
+#define REP_LOG_REQ 18 /* Request for a log record. */
+#define REP_MASTER_REQ 19 /* Who is the master */
+#define REP_NEWCLIENT 20 /* Announces the presence of a new
+ * client. */
+#define REP_NEWFILE 21 /* Announce a log file change. */
+#define REP_NEWMASTER 22 /* Announces who the master is. */
+#define REP_NEWSITE 23 /* Announces that a site has heard from
+ * a new site; like NEWCLIENT, but
+ * indirect. A NEWCLIENT message comes
+ * directly from the new client while a
+ * NEWSITE comes indirectly from
+ * someone who heard about a NEWSITE.*/
+#define REP_PAGE 24 /* Database page. */
+#define REP_PAGE_FAIL 25 /* Requested page does not exist. */
+#define REP_PAGE_MORE 26 /* There are more pages to request. */
+#define REP_PAGE_REQ 27 /* Request for a database page. */
+#define REP_REREQUEST 28 /* Force rerequest. */
+#define REP_START_SYNC 29 /* Tell client to begin syncing a ckp.*/
+#define REP_UPDATE 30 /* Environment hotcopy information. */
+#define REP_UPDATE_REQ 31 /* Request for hotcopy information. */
+#define REP_VERIFY 32 /* A log record for verification. */
+#define REP_VERIFY_FAIL 33 /* The client is outdated. */
+#define REP_VERIFY_REQ 34 /* Request for a log record to
+ * verify. */
+#define REP_VOTE1 35 /* Send out your information for an
+ * election. */
+#define REP_VOTE2 36 /* Send a "you are master" vote. */
/*
* Maximum message number for conversion tables. Update this
* value as the largest message number above increases.
@@ -90,7 +106,7 @@ extern "C" {
* NOTE: When changing messages above, the two tables for upgrade support
* need adjusting. They are in rep_util.c.
*/
-#define REP_MAX_MSG 31
+#define REP_MAX_MSG 36
/*
* This is the list of client-to-client requests messages.
@@ -99,6 +115,8 @@ extern "C" {
*/
#define REP_MSG_REQ(rectype) \
(rectype == REP_ALL_REQ || \
+ rectype == REP_BLOB_ALL_REQ || \
+ rectype == REP_BLOB_CHUNK_REQ || \
rectype == REP_LOG_REQ || \
rectype == REP_PAGE_REQ || \
rectype == REP_VERIFY_REQ)
@@ -125,6 +143,9 @@ extern "C" {
#define DB_LOGVERSION_51 17
#define DB_LOGVERSION_52 18
#define DB_LOGVERSION_53 19
+#define DB_LOGVERSION_60 20
+#define DB_LOGVERSION_60p1 21
+#define DB_LOGVERSION_61 22
#define DB_LOGVERSION_MIN DB_LOGVERSION_44
#define DB_REPVERSION_INVALID 0
#define DB_REPVERSION_44 3
@@ -132,11 +153,12 @@ extern "C" {
#define DB_REPVERSION_46 4
#define DB_REPVERSION_47 5
#define DB_REPVERSION_48 5
-#define DB_REPVERSION_50 5
#define DB_REPVERSION_51 5
#define DB_REPVERSION_52 6
#define DB_REPVERSION_53 7
-#define DB_REPVERSION DB_REPVERSION_53
+#define DB_REPVERSION_60 7
+#define DB_REPVERSION_61 8
+#define DB_REPVERSION DB_REPVERSION_61
#define DB_REPVERSION_MIN DB_REPVERSION_44
/*
@@ -204,9 +226,20 @@ extern "C" {
#define REP_INITVERSION 3
/*
+ * View/partial replication file name.
+ * The file is empty. It exists as a permanent indicator that this
+ * environment can never be master.
+ */
+#define REPVIEW "__db.rep.view"
+#define IS_VIEW_SITE(env) \
+ (REP_ON(env) && \
+ ((env)->rep_handle->region->stat.st_view != 0))
+
+/*
* Database types for __rep_client_dbinit
*/
typedef enum {
+ REP_BLOB, /* Blob file database. */
REP_DB, /* Log record database. */
REP_PG /* Pg database. */
} repdb_t;
@@ -239,7 +272,7 @@ typedef enum {
typedef enum {
SYNC_OFF, /* No recovery. */
SYNC_LOG, /* Recovery - log. */
- SYNC_PAGE, /* Recovery - pages. */
+ SYNC_PAGE, /* Recovery - pages and blobs. */
SYNC_UPDATE, /* Recovery - update. */
SYNC_VERIFY /* Recovery - verify. */
} repsync_t;
@@ -346,6 +379,17 @@ typedef struct __rep { /* SHARED */
u_int32_t first_vers; /* Log version of first log file. */
DB_LSN last_lsn; /* Latest LSN we need. */
/* These are protected by mtx_clientdb. */
+ db_seq_t gap_bl_hi_id; /* Last id in the blob gap. */
+ db_seq_t gap_bl_hi_sid; /* Last sid in the blob gap. */
+ off_t gap_bl_hi_off; /* Last offset in the blob gap. */
+ db_seq_t last_blob_id; /* Last id on the list to process. */
+ db_seq_t last_blob_sid; /* Last sid on the list to process. */
+ db_seq_t prev_blob_id; /* Previous last id on list. */
+ db_seq_t prev_blob_sid; /* Previous last sid on list. */
+ db_seq_t highest_id; /* Highest file id to request. */
+ u_int32_t blob_more_files;/* More blob files to be processed. */
+ int blob_sync; /* Currently handling blobs. */
+ int blob_rereq; /* When to rereq a blob update msg. */
db_timespec last_pg_ts; /* Last page stored timestamp. */
db_pgno_t ready_pg; /* Next pg expected. */
db_pgno_t waiting_pg; /* First pg after gap. */
@@ -391,11 +435,13 @@ typedef struct __rep { /* SHARED */
roff_t siteinfo_off; /* Offset of site array region. */
u_int site_cnt; /* Array slots in use. */
u_int site_max; /* Total array slots allocated. */
+ u_int sites_avail; /* Total number of available sites. */
int self_eid; /* Where to find the local site. */
u_int siteinfo_seq; /* Number of updates to this info. */
u_int32_t min_log_file; /* Earliest log needed by repgroup. */
pid_t listener;
+ u_int listener_nthreads; /* # of msg threads in listener. */
int perm_policy;
db_timeout_t ack_timeout;
@@ -403,6 +449,11 @@ typedef struct __rep { /* SHARED */
db_timeout_t connection_retry_wait;
db_timeout_t heartbeat_frequency; /* Max period between msgs. */
db_timeout_t heartbeat_monitor_timeout;
+ u_int32_t inqueue_max_gbytes;
+ u_int32_t inqueue_max_bytes;
+ u_int32_t inqueue_rz_gbytes;
+ u_int32_t inqueue_rz_bytes;
+ u_int32_t inqueue_full_event_on;
#endif /* HAVE_REPLICATION_THREADS */
/* Statistics. */
@@ -419,12 +470,16 @@ typedef struct __rep { /* SHARED */
#define REP_C_2SITE_STRICT 0x00001 /* Don't cheat on elections. */
#define REP_C_AUTOINIT 0x00002 /* Auto initialization. */
#define REP_C_AUTOROLLBACK 0x00004 /* Discard client txns: sync. */
-#define REP_C_BULK 0x00008 /* Bulk transfer. */
-#define REP_C_DELAYCLIENT 0x00010 /* Delay client sync-up. */
-#define REP_C_ELECTIONS 0x00020 /* Repmgr to use elections. */
-#define REP_C_INMEM 0x00040 /* In-memory replication. */
-#define REP_C_LEASE 0x00080 /* Leases configured. */
-#define REP_C_NOWAIT 0x00100 /* Immediate error return. */
+#define REP_C_AUTOTAKEOVER 0x00008 /* Auto listener take over. */
+#define REP_C_BULK 0x00010 /* Bulk transfer. */
+#define REP_C_DELAYCLIENT 0x00020 /* Delay client sync-up. */
+#define REP_C_ELECT_LOGLENGTH 0x00040 /* Log length wins election. */
+#define REP_C_ELECTIONS 0x00080 /* Repmgr to use elections. */
+#define REP_C_INMEM 0x00100 /* In-memory replication. */
+#define REP_C_LEASE 0x00200 /* Leases configured. */
+#define REP_C_NOWAIT 0x00400 /* Immediate error return. */
+#define REP_C_PREFMAS_CLIENT 0x00800 /* Preferred master client. */
+#define REP_C_PREFMAS_MASTER 0x01000 /* Preferred master site. */
u_int32_t config; /* Configuration flags. */
/* Election. */
@@ -455,15 +510,17 @@ typedef struct __rep { /* SHARED */
#define REP_F_CLIENT 0x00000008 /* Client replica. */
#define REP_F_DELAY 0x00000010 /* Delaying client sync-up. */
#define REP_F_GROUP_ESTD 0x00000020 /* Rep group is established. */
-#define REP_F_INUPDREQ 0x00000040 /* Thread in rep_update_req. */
-#define REP_F_LEASE_EXPIRED 0x00000080 /* Leases guaranteed expired. */
-#define REP_F_MASTER 0x00000100 /* Master replica. */
-#define REP_F_MASTERELECT 0x00000200 /* Master elect. */
-#define REP_F_NEWFILE 0x00000400 /* Newfile in progress. */
-#define REP_F_NIMDBS_LOADED 0x00000800 /* NIMDBs are materialized. */
-#define REP_F_SKIPPED_APPLY 0x00001000 /* Skipped applying a record. */
-#define REP_F_START_CALLED 0x00002000 /* Rep_start called. */
-#define REP_F_SYS_DB_OP 0x00004000 /* Operation in progress. */
+#define REP_F_HOLD_GEN 0x00000040 /* PrefMas startup hold gen. */
+#define REP_F_INUPDREQ 0x00000080 /* Thread in rep_update_req. */
+#define REP_F_LEASE_EXPIRED 0x00000100 /* Leases guaranteed expired. */
+#define REP_F_MASTER 0x00000200 /* Master replica. */
+#define REP_F_MASTERELECT 0x00000400 /* Master elect. */
+#define REP_F_NEWFILE 0x00000800 /* Newfile in progress. */
+#define REP_F_NIMDBS_LOADED 0x00001000 /* NIMDBs are materialized. */
+#define REP_F_READONLY_MASTER 0x00002000 /* PrefMas readonly master. */
+#define REP_F_SKIPPED_APPLY 0x00004000 /* Skipped applying a record. */
+#define REP_F_START_CALLED 0x00008000 /* Rep_start called. */
+#define REP_F_SYS_DB_OP 0x00010000 /* Operation in progress. */
u_int32_t flags;
} REP;
@@ -525,7 +582,7 @@ do { \
/*
* REP_F_EPHASE0 is not a *real* election phase. It is used for
* master leases and allowing the client to find the master or
- * expire its lease. However, EPHASE0 is cleared by __rep_elect_done.
+ * expire its lease.
*/
#define IN_ELECTION(R) \
FLD_ISSET((R)->elect_flags, REP_E_PHASE1 | REP_E_PHASE2)
@@ -594,6 +651,22 @@ do { \
} while (0)
+/* Macros to determine current replication configuration options. */
+#define REP_CONFIG_IS_SET(env, flags) \
+ (REP_ON(env) ? \
+ FLD_ISSET(((env)->rep_handle->region)->config, flags) : \
+ FLD_ISSET(((env)->rep_handle)->config, flags))
+#ifdef HAVE_REPLICATION_THREADS
+#define PREFMAS_IS_SET(env) \
+ (REP_CONFIG_IS_SET(env, \
+ (REP_C_PREFMAS_MASTER | REP_C_PREFMAS_CLIENT)))
+#else
+#define PREFMAS_IS_SET(env) 0
+#endif
+#define IS_PREFMAS_MODE(env) \
+ (REP_ON(env) && PREFMAS_IS_SET(env) && \
+ ((env)->rep_handle->region)->config_nsites < 3)
+
/*
* Gap processing flags. These provide control over the basic
* gap processing algorithm for some special cases.
@@ -603,11 +676,28 @@ do { \
/* REREQUEST is a superset of FORCE. */
/*
+ * Internal options for rep_start_int(). These are used by preferred master
+ * mode to help coordinate between the sites during changes of master.
+ */
+#define REP_START_FORCE_ROLECHG 0x001 /* Force role change to advance gen. */
+#define REP_START_HOLD_CLIGEN 0x002 /* Hold client gen before doing
+ * lsnhist match. */
+#define REP_START_WAIT_LOCKMSG 0x004 /* Wait for REP_LOCKOUT_MSG. */
+
+/*
* Flags indicating what kind of record we want to back up to, in the log.
*/
-#define REP_REC_COMMIT 0x001 /* Most recent commit record. */
-#define REP_REC_PERM 0x002 /* Most recent perm record. */
+#define REP_REC_COMMIT 0x001 /* Most recent commit record. */
+#define REP_REC_PERM 0x002 /* Most recent perm record. */
/* PERM is a superset of COMMIT. */
+#define REP_REC_PERM_DEL 0x004 /* Most recent PERM, or fail if a
+ * file delete is found first. */
+
+/*
+ * Permanent record types.
+ */
+#define IS_PERM_RECTYPE(rectype) \
+ ((rectype) == DB___txn_regop || (rectype) == DB___txn_ckp)
/*
* Basic pre/post-amble processing.
@@ -692,7 +782,7 @@ do { \
* machine instruction. A single 32-bit integer value is safe without a
* mutex, but most other types of value should use a mutex.
*
- * Any use of a mutex must be inside a matched pair of ENV_ENTER() and
+ * Use of a db_mutex_t mutex must be inside a matched pair of ENV_ENTER() and
* ENV_LEAVE() macros. This ensures that if a thread dies while holding
* a lock (i.e. a mutex), recovery can clean it up so that it does not
* indefinitely block other threads.
@@ -727,6 +817,9 @@ struct __db_rep {
/*
* End of shared configuration information.
*/
+ int (*partial) /* View/partial replication function. */
+ __P((DB_ENV *, const char *, int *, u_int32_t));
+
int (*send) /* Send function. */
__P((DB_ENV *, const DBT *, const DBT *,
const DB_LSN *, int, u_int32_t));
@@ -745,6 +838,7 @@ struct __db_rep {
DB_MPOOLFILE *file_mpf; /* Mpoolfile for current database. */
DB *file_dbp; /* This file's page info. */
DBC *queue_dbc; /* Dbc for a queue file. */
+ DB *blob_dbp; /* Blob file database. */
/*
* Please change __rep_print_all (rep_stat.c) to track any changes made
@@ -759,6 +853,7 @@ struct __db_rep {
/*
* Replication Framework (repmgr) per-process information.
*/
+ int config_nthreads;/* Configured msg processing threads. */
u_int nthreads; /* Msg processing threads. */
u_int athreads; /* Space allocated for msg threads. */
u_int non_rep_th; /* Threads in GMDB or channel msgs. */
@@ -771,10 +866,13 @@ struct __db_rep {
db_timeout_t connection_retry_wait;
db_timeout_t heartbeat_frequency; /* Max period between msgs. */
db_timeout_t heartbeat_monitor_timeout;
+ u_int32_t inqueue_max_gbytes;
+ u_int32_t inqueue_max_bytes;
/* Thread synchronization. */
REPMGR_RUNNABLE *selector, **messengers, **elect_threads;
REPMGR_RUNNABLE *preferred_elect_thr;
+ REPMGR_RUNNABLE *takeover_thread;
db_timespec repstart_time;
mgr_mutex_t *mutex;
cond_var_t check_election, gmdb_idle, msg_avail;
@@ -799,12 +897,18 @@ struct __db_rep {
CONNECTION_LIST connections;
RETRY_Q_HEADER retries; /* Sites needing connection retry. */
struct {
- int size;
+ u_int32_t gbytes;
+ u_int32_t bytes;
STAILQ_HEAD(__repmgr_q_header, __repmgr_message) header;
} input_queue;
socket_t listen_fd;
db_timespec last_bcast; /* Time of last broadcast msg. */
+ db_timespec last_hbeat; /* Time of last heartbeat (prefmas). */
+ db_timespec l_listener_chk; /* Time to check local listener. */
+ db_timeout_t l_listener_wait;/* Timeout to check local listener. */
+ db_timespec m_listener_chk; /* Time to check master listener. */
+ db_timeout_t m_listener_wait;/* Timeout to check master listener. */
/*
* Status of repmgr. It is ready when repmgr is not yet started. It
@@ -813,12 +917,15 @@ struct __db_rep {
*/
enum { ready, running, stopped } repmgr_status;
int new_connection; /* Since last master seek attempt. */
+ int demotion_pending; /* We're being demoted to a view. */
int takeover_pending; /* We've been elected master. */
+ int rejoin_pending; /* Join group retry after rejection. */
int gmdb_busy;
int client_intent; /* Will relinquish master role. */
int gmdb_dirty;
int have_gmdb;
int seen_repmsg;
+ int view_mismatch; /* View callback and gmdb don't match. */
/*
* Flag to show what kind of transaction is currently in progress.
@@ -854,6 +961,16 @@ struct __db_rep {
u_int8_t *restored_list;
size_t restored_list_length;
+ /*
+ * Preferred master mode indicator for a pending action. A
+ * master_switch is initiated when the preferred master site is
+ * ready to take over as master. A start_temp_master is initiated
+ * when the client site needs to start as the temporary master.
+ */
+ enum { no_action, master_switch, start_temp_master } prefmas_pending;
+ /* The LSN at the very beginning of preferred master site startup. */
+ DB_LSN prefmas_init_lsn;
+
/* Application's message dispatch call-back function. */
void (*msg_dispatch) __P((DB_ENV *, DB_CHANNEL *,
DBT *, u_int32_t, u_int32_t));
@@ -920,6 +1037,10 @@ struct __db_rep {
} else if (!F_ISSET((env)->rep_handle, DBREP_APP_REPMGR)) \
F_SET((env)->rep_handle, DBREP_APP_BASEAPI); \
} while (0)
+#define ADJUST_AUTOTAKEOVER_WAITS(db_rep, timeout) do { \
+ (db_rep)->l_listener_wait = timeout; \
+ (db_rep)->m_listener_wait = 3 * timeout; \
+} while (0)
#else
/*
@@ -935,6 +1056,9 @@ struct __db_rep {
#define APP_SET_BASEAPI(env) do { \
; \
} while (0)
+#define ADJUST_AUTOTAKEOVER_WAITS(db_rep, timeout) do { \
+ ; \
+} while (0)
#endif /* HAVE_REPLICATION_THREADS */
/*
@@ -945,22 +1069,27 @@ struct __db_rep {
* compatibility with old versions, these values must be reserved explicitly in
* the list of flag values (below)
*/
-#define DB_LOG_PERM_42_44 0x20
-#define DB_LOG_RESEND_42_44 0x40
-#define REPCTL_INIT_45 0x02 /* Back compatible flag value. */
-
-#define REPCTL_ELECTABLE 0x01 /* Upgraded client is electable. */
-#define REPCTL_FLUSH 0x02 /* Record should be flushed. */
-#define REPCTL_GROUP_ESTD 0x04 /* Message from site in a group. */
-#define REPCTL_INIT 0x08 /* Internal init message. */
-#define REPCTL_LEASE 0x10 /* Lease related message.. */
+#define DB_LOG_PERM_42_44 0x020
+#define DB_LOG_RESEND_42_44 0x040
+#define REPCTL_INIT_45 0x002 /* Back compatible flag value. */
+
+/*
+ * Add new REPCTL flags to the end of this list to preserve compatibility
+ * with old versions.
+ */
+#define REPCTL_ELECTABLE 0x001 /* Upgraded client is electable. */
+#define REPCTL_FLUSH 0x002 /* Record should be flushed. */
+#define REPCTL_GROUP_ESTD 0x004 /* Message from site in a group. */
+#define REPCTL_INIT 0x008 /* Internal init message. */
+#define REPCTL_LEASE 0x010 /* Lease related message. */
/*
* Skip over reserved values 0x20
* and 0x40, as explained above.
*/
-#define REPCTL_LOG_END 0x80 /* Approximate end of group-wide log. */
+#define REPCTL_LOG_END 0x080 /* Approximate end of group-wide log. */
#define REPCTL_PERM DB_LOG_PERM_42_44
#define REPCTL_RESEND DB_LOG_RESEND_42_44
+#define REPCTL_INMEM_ONLY 0x100 /* In-memory databases only. */
/*
* File info flags for internal init. The per-database (i.e., file) flag
@@ -1094,6 +1223,20 @@ typedef struct {
DBT *objs;
} linfo_t;
+/*
+ * Used to store information on the child transaction that opens a blob meta
+ * database (BMD). In partial replication processing the child transaction of
+ * the BMD must be delayed until after processing the child transaction that
+ * opens the database that owns the BMD.
+ */
+typedef struct {
+ db_seq_t blob_file_id;
+ DB_LSN lsn;
+ u_int32_t child;
+ void *next;
+ void *prev;
+} DELAYED_BLOB_LIST;
+
#if defined(__cplusplus)
}
#endif
diff --git a/src/dbinc/repmgr.h b/src/dbinc/repmgr.h
index d8fd199c..a38defa2 100644
--- a/src/dbinc/repmgr.h
+++ b/src/dbinc/repmgr.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2006, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2006, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -47,20 +47,29 @@ extern "C" {
* In protocol version one there were only three message types: 1, 2, and 3; so
* 3 was the max. In protocol version 2 we introduced heartbeats, type 4.
* (Protocol version 3 did not introduce any new message types.) In version 4
- * we introduced a few more new message types, the largest of which had value 7.
+ * we introduced a few more new message types, the largest of which had value 8.
+ * Protocol version 5 did not introduce any new message types, but changed
+ * the format of site info and membership data to support views.
+ *
+ * Protocol version 6 introduced preferred master mode, which added several
+ * new REPMGR_OWN messages.
*/
#define REPMGR_MAX_V1_MSG_TYPE 3
#define REPMGR_MAX_V2_MSG_TYPE 4
#define REPMGR_MAX_V3_MSG_TYPE 4
#define REPMGR_MAX_V4_MSG_TYPE 8
+#define REPMGR_MAX_V5_MSG_TYPE 8
+#define REPMGR_MAX_V6_MSG_TYPE 8
#define HEARTBEAT_MIN_VERSION 2
#define CHANNEL_MIN_VERSION 4
#define CONN_COLLISION_VERSION 4
#define GM_MIN_VERSION 4
#define OWN_MIN_VERSION 4
+#define VIEW_MIN_VERSION 5
+#define PREFMAS_MIN_VERSION 6
/* The range of protocol versions we're willing to support. */
-#define DB_REPMGR_VERSION 4
+#define DB_REPMGR_VERSION 6
#define DB_REPMGR_MIN_VERSION 1
/*
@@ -73,18 +82,30 @@ extern "C" {
* Like the message format types, these message type values should be
* permanently frozen.
*/
-#define REPMGR_CONNECT_REJECT 1
-#define REPMGR_GM_FAILURE 2
-#define REPMGR_GM_FORWARD 3
-#define REPMGR_JOIN_REQUEST 4
-#define REPMGR_JOIN_SUCCESS 5
-#define REPMGR_PARM_REFRESH 6
-#define REPMGR_REJOIN 7
-#define REPMGR_REMOVE_REQUEST 8
-#define REPMGR_REMOVE_SUCCESS 9
-#define REPMGR_RESOLVE_LIMBO 10
-#define REPMGR_SHARING 11
-
+#define REPMGR_CONNECT_REJECT 1
+#define REPMGR_GM_FAILURE 2
+#define REPMGR_GM_FORWARD 3
+#define REPMGR_JOIN_REQUEST 4
+#define REPMGR_JOIN_SUCCESS 5
+#define REPMGR_PARM_REFRESH 6
+#define REPMGR_REJOIN 7
+#define REPMGR_REMOVE_REQUEST 8
+#define REPMGR_REMOVE_SUCCESS 9
+#define REPMGR_RESOLVE_LIMBO 10
+#define REPMGR_SHARING 11
+#define REPMGR_LSNHIST_REQUEST 12
+#define REPMGR_LSNHIST_RESPONSE 13
+#define REPMGR_PREFMAS_FAILURE 14
+#define REPMGR_PREFMAS_SUCCESS 15
+#define REPMGR_READONLY_MASTER 16
+#define REPMGR_READONLY_RESPONSE 17
+#define REPMGR_RESTART_CLIENT 18
+
+/* Detect inconsistencies between view callback and site's gmdb. */
+#define PARTICIPANT_TO_VIEW(db_rep, site) \
+ ((db_rep)->partial && !FLD_ISSET((site)->gmdb_flags, SITE_VIEW))
+#define VIEW_TO_PARTICIPANT(db_rep, site) \
+ (!(db_rep)->partial && FLD_ISSET((site)->gmdb_flags, SITE_VIEW))
struct __repmgr_connection;
typedef struct __repmgr_connection REPMGR_CONNECTION;
@@ -98,7 +119,8 @@ struct __cond_waiters_table;
typedef struct __cond_waiters_table COND_WAITERS_TABLE;
/* Current Group Membership DB format ID. */
-#define REPMGR_GMDB_FMT_VERSION 1
+#define REPMGR_GMDB_FMT_VERSION 2
+#define REPMGR_GMDB_FMT_MIN_VERSION 1
#ifdef DB_WIN32
typedef SOCKET socket_t;
@@ -151,6 +173,17 @@ typedef char SITE_STRING_BUFFER[MAX_SITE_LOC_STRING+1];
#define DB_REPMGR_DEFAULT_ELECTION_RETRY (10 * US_PER_SEC)
#define DB_REPMGR_DEFAULT_CHANNEL_TIMEOUT (5 * US_PER_SEC)
+/* Default preferred master automatic configuration values. */
+#define DB_REPMGR_PREFMAS_ELECTION_RETRY (1 * US_PER_SEC)
+#define DB_REPMGR_PREFMAS_HEARTBEAT_MONITOR (2 * US_PER_SEC)
+#define DB_REPMGR_PREFMAS_HEARTBEAT_SEND (75 * (US_PER_SEC / 100))
+#define DB_REPMGR_PREFMAS_PRIORITY_CLIENT 75
+#define DB_REPMGR_PREFMAS_PRIORITY_MASTER 200
+
+/* Defaults for the undocumented incoming message queue size limits. */
+#define DB_REPMGR_DEFAULT_INQUEUE_MAX (100 * MEGABYTE)
+#define DB_REPMGR_INQUEUE_REDZONE_PERCENT 85
+
typedef TAILQ_HEAD(__repmgr_conn_list, __repmgr_connection) CONNECTION_LIST;
typedef STAILQ_HEAD(__repmgr_out_q_head, __queued_output) OUT_Q_HEADER;
typedef TAILQ_HEAD(__repmgr_retry_q, __repmgr_retry) RETRY_Q_HEADER;
@@ -170,14 +203,20 @@ struct __repmgr_runnable {
/*
* Options governing requested behavior of election thread.
*/
-#define ELECT_F_EVENT_NOTIFY 0x01 /* Notify application of master failure. */
-#define ELECT_F_FAST 0x02 /* First election "fast" (n-1 trick). */
-#define ELECT_F_IMMED 0x04 /* Start with immediate election. */
-#define ELECT_F_INVITEE 0x08 /* Honor (remote) inviter's nsites. */
-#define ELECT_F_STARTUP 0x10 /* Observe repmgr_start() policy. */
+#define ELECT_F_CLIENT_RESTART 0x01 /* Do client restarts but no elections. */
+#define ELECT_F_EVENT_NOTIFY 0x02 /* Notify application of master failure. */
+#define ELECT_F_FAST 0x04 /* First election "fast" (n-1 trick). */
+#define ELECT_F_IMMED 0x08 /* Start with immediate election. */
+#define ELECT_F_INVITEE 0x10 /* Honor (remote) inviter's nsites. */
+#define ELECT_F_STARTUP 0x20 /* Observe repmgr_start() policy. */
u_int32_t flags;
- int eid; /* For Connector thread. */
+ /* For connector thread. */
+ struct {
+ int eid;
+#define CONNECT_F_REFRESH 0x01 /* New connection to replace old one. */
+ u_int32_t flags;
+ } conn_th;
/*
* Args for other thread types can be added here in the future
@@ -265,6 +304,7 @@ struct __queued_output {
*/
typedef struct __repmgr_message {
STAILQ_ENTRY(__repmgr_message) entries;
+ size_t size;
__repmgr_msg_hdr_args msg_hdr;
union {
struct {
@@ -343,6 +383,7 @@ struct __repmgr_connection {
#define CONN_PARAMETERS 5 /* Awaiting parameters handshake. */
#define CONN_READY 6 /* Everything's fine. */
int state;
+ u_int32_t auto_takeover;/* Connection to remote listener candidate. */
/*
* Input: while we're reading a message, we keep track of what phase
@@ -464,6 +505,8 @@ typedef struct {
SITEADDR addr; /* Unprocessed network address of site. */
u_int32_t config; /* Configuration flags: peer, helper, etc. */
u_int32_t status; /* Group membership status. */
+ u_int32_t flags; /* Group membership flags. */
+ u_int32_t listener_cand;/* Number of listener candidates of site. */
} SITEINFO;
/*
@@ -489,6 +532,42 @@ typedef struct {
((u_int)i) < db_rep->site_cnt; \
(int)(++(i)) == db_rep->self_eid ? ++(i) : i)
+/*
+ * Enable replication manager auto listener takeover.
+ */
+#define HAVE_REPLICATION_LISTENER_TAKEOVER 1
+
+/* Listener candidate, that is, a subordinate rep-aware process. */
+#define IS_LISTENER_CAND(db_rep) \
+ (FLD_ISSET((db_rep)->region->config, REP_C_AUTOTAKEOVER) && \
+ IS_SUBORDINATE(db_rep) && (db_rep)->repmgr_status == running)
+
+/*
+ * The number of listener candidates for each remote site is maintained in
+ * the listener process and used in subordinate rep-aware processes.
+ */
+#define SET_LISTENER_CAND(cond, op) \
+ do { \
+ if (FLD_ISSET(rep->config, REP_C_AUTOTAKEOVER) && \
+ !IS_SUBORDINATE(db_rep) && (cond)) { \
+ MUTEX_LOCK(env, rep->mtx_repmgr); \
+ sites = R_ADDR(env->reginfo, rep->siteinfo_off);\
+ (sites[eid].listener_cand)op; \
+ MUTEX_UNLOCK(env, rep->mtx_repmgr); \
+ } \
+ } while (0)
+
+#define CHECK_LISTENER_CAND(val, op, tval, fval) \
+ do { \
+ if (IS_LISTENER_CAND(db_rep)) { \
+ MUTEX_LOCK(env, rep->mtx_repmgr); \
+ sites = R_ADDR(env->reginfo, rep->siteinfo_off);\
+ val = ((sites[eid].listener_cand)op) ? \
+ (tval) : (fval); \
+ MUTEX_UNLOCK(env, rep->mtx_repmgr); \
+ } \
+ } while (0)
+
struct __repmgr_site {
repmgr_netaddr_t net_addr;
@@ -499,12 +578,14 @@ struct __repmgr_site {
* host/port network address is promised to be associated with the
* locally known EID for the life of the environment.
*/
- u_int32_t membership; /* Status flags from GMDB. */
+ u_int32_t membership; /* Status value from GMDB. */
+ u_int32_t gmdb_flags; /* Flags from GMDB. */
u_int32_t config; /* Flags from site->set_config() */
/*
* Everything below here is applicable only to remote sites.
*/
+ u_int32_t max_ack_gen; /* Master generation for max_ack. */
DB_LSN max_ack; /* Best ack we've heard from this site. */
int ack_policy; /* Or 0 if unknown. */
u_int16_t alignment; /* Requirements for app channel msgs. */
@@ -604,11 +685,11 @@ struct __channel {
* connections may be found: (1) SITE->ref.conn, (2) SITE->sub_conns, and
* (3) db_rep->connections.
*
- * 1. SITE->ref.conn points to our connection with the main process running
- * at the given site, if such a connection exists. We may have initiated
- * the connection to the site ourselves, or we may have received it as an
- * incoming connection. Once it is established there is very little
- * difference between those two cases.
+ * 1. SITE->ref.conn points to our connection with the listener process
+ * running at the given site, if such a connection exists. We may have
+ * initiated the connection to the site ourselves, or we may have received
+ * it as an incoming connection. Once it is established there is very
+ * little difference between those two cases.
*
* 2. SITE->sub_conns is a list of connections we have with subordinate
* processes running at the given site. There can be any number of these
@@ -694,6 +775,7 @@ struct __channel {
*/
#define APP_CHANNEL_CONNECTION 0x02 /* Connection used for app channel. */
#define ELECTABLE_SITE 0x04
+#define REPMGR_AUTOTAKEOVER 0x08 /* Could become main connection. */
#define REPMGR_SUBORDINATE 0x01 /* This is a subordinate connection. */
/*
@@ -719,13 +801,20 @@ typedef struct {
* As with message formats, stored formats are defined in repmgr.msg.
*/
/*
- * Flags for the Group Membership data portion of a record. Like message type
- * codes, these values are frozen across releases, in order to avoid pointless
- * churn.
+ * Status values for the Group Membership data portion of a record. Like
+ * message type codes, these values are frozen across releases, in order to
+ * avoid pointless churn. These values are mutually exclusive.
*/
#define SITE_ADDING 0x01
#define SITE_DELETING 0x02
#define SITE_PRESENT 0x04
+/*
+ * Flags for the Group Membership data portion of a record. These values are
+ * also frozen across releases. These values are bit fields and may be OR'ed
+ * together.
+ */
+#define SITE_VIEW 0x01
+#define SITE_JOIN_ELECTABLE 0x02
/*
* Message types whose processing could take a long time. We're careful to
@@ -755,9 +844,9 @@ typedef struct {
* fraction of the code, it's a tiny fraction of the time: repmgr spends most of
* its time in a call to select(), and as well a bit in calls into the Base
* replication API. All of those release the mutex.
- * Access to repmgr's shared list of site addresses is protected by
- * another mutex: mtx_repmgr. And, when changing space allocation for that site
- * list we conform to the convention of acquiring renv->mtx_regenv. These are
+ * Access to repmgr's shared values is protected by another mutex:
+ * mtx_repmgr. And, when changing space allocation for the shared site list
+ * we conform to the convention of acquiring renv->mtx_regenv. These are
* less frequent of course.
* When it's necessary to acquire more than one of these mutexes, the
* ordering priority (or "lock ordering protocol") is:
diff --git a/src/dbinc/shqueue.h b/src/dbinc/shqueue.h
index 22464462..20e0fae7 100644
--- a/src/dbinc/shqueue.h
+++ b/src/dbinc/shqueue.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -140,6 +140,17 @@ struct { \
((struct type *)(((u_int8_t *)(elm)) + (elm)->field.sle_next)))
/*
+ * __SH_LIST_WAS_EMPTY is private API. SH_LIST_FIRST is not thread-safe;
+ * the slh_first field could be evaluated multiple times if the optimizer
+ * does not eliminate the second load. __SH_LIST_WAS_EMPTY tests whether a
+ * prior call of SH_LIST_FIRSTP occurred while the list was empty; i.e., its
+ * relative offset was -1. It is thread-safe to call SH_LIST_FIRSTP and then
+ * test the resulting pointer with __SH_LIST_WAS_EMPTY.
+ */
+#define __SH_LIST_WAS_EMPTY(head, ptr) \
+ ((u_int8_t *)(ptr) == (((u_int8_t *)(head)) + (-1)))
+
+ /*
*__SH_LIST_PREV_OFF is private API. It calculates the address of
* the elm->field.sle_next member of a SH_LIST structure. All offsets
* between elements are relative to that point in SH_LIST structures.
diff --git a/src/dbinc/tcl_db.h b/src/dbinc/tcl_db.h
index 4c56164f..99992467 100644
--- a/src/dbinc/tcl_db.h
+++ b/src/dbinc/tcl_db.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -16,7 +16,7 @@ extern "C" {
#define MSG_SIZE 100 /* Message size */
enum INFOTYPE {
- I_AUX, I_DB, I_DBC, I_ENV, I_LOCK, I_LOGC, I_MP, I_NDBM, I_PG, I_SEQ, I_TXN};
+ I_AUX, I_DB, I_DBC, I_DBSTREAM, I_ENV, I_LOCK, I_LOGC, I_MP, I_NDBM, I_PG, I_SEQ, I_TXN};
#define MAX_ID 8 /* Maximum number of sub-id's we need */
#define DBTCL_PREP 64 /* Size of txn_recover preplist */
@@ -24,9 +24,11 @@ enum INFOTYPE {
#define DBTCL_DBM 1
#define DBTCL_NDBM 2
-#define DBTCL_GETCLOCK 0
-#define DBTCL_GETLIMIT 1
-#define DBTCL_GETREQ 2
+#define DBTCL_GETCLOCK 0
+#define DBTCL_GETINQUEUE_MAX 1
+#define DBTCL_GETINQUEUE_REDZONE 2
+#define DBTCL_GETLIMIT 3
+#define DBTCL_GETREQ 4
#define DBTCL_MUT_ALIGN 0
#define DBTCL_MUT_INCR 1
@@ -36,9 +38,11 @@ enum INFOTYPE {
/*
* Data structure to record information about events that have occurred. Tcl
- * command "env event_info" can retrieve the information. For now, we record
- * only one occurrence per event type; "env event_info -clear" can be used to
- * reset the info.
+ * command "env event_info" can retrieve all the information except the event
+ * counts, and "env event_count" can retrieve the number of times a specific
+ * event has fired. We added "env event_count" instead of merging the count
+ * information into "env event_info" to avoid breaking the existing tests.
+ * Tcl command "env event_info -clear" can be used to reset the info.
*
* Besides the bit flag that records the fact that an event type occurred, some
* event types have associated "info" and we record that here too. When new
@@ -47,16 +51,17 @@ enum INFOTYPE {
* with the "env event_info" results.
*/
typedef struct dbtcl_event_info {
- u_int32_t events; /* Bit flag on for each event fired. */
- int panic_error;
- int newmaster_eid;
- int added_eid;
- int removed_eid;
- pid_t attached_process;
- int connected_eid;
+ u_int32_t events; /* Bit flag on for each event fired. */
+ int panic_error;
+ int newmaster_eid;
+ int added_eid;
+ int removed_eid;
+ pid_t attached_process;
+ int connected_eid;
DB_REPMGR_CONN_ERR conn_broken_info;
DB_REPMGR_CONN_ERR conn_failed_try_info;
- DB_LSN sync_point;
+ DB_LSN sync_point;
+ size_t count[32]; /* The number of times for each event. */
} DBTCL_EVENT_INFO;
/*
@@ -99,6 +104,7 @@ typedef struct dbtcl_info {
DB_LOCK *lock;
DB_LOGC *logc;
DB_MPOOLFILE *mp;
+ DB_STREAM *dbsp;
DB_TXN *txnp;
void *anyp;
} un;
@@ -128,6 +134,7 @@ typedef struct dbtcl_info {
Tcl_Obj *i_isalive;
Tcl_Obj *i_part_callback;
Tcl_Obj *i_rep_send;
+ Tcl_Obj *i_rep_view;
Tcl_Obj *i_second_call;
/* Environment ID for the i_rep_send callback. */
@@ -144,6 +151,7 @@ typedef struct dbtcl_info {
#define i_anyp un.anyp
#define i_dbp un.dbp
#define i_dbcp un.dbcp
+#define i_dbsp un.dbsp
#define i_envp un.envp
#define i_lock un.lock
#define i_logc un.logc
@@ -170,6 +178,8 @@ typedef struct dbtcl_info {
#define i_dbdbcid i_otherid[0]
+#define i_dbcdbsid i_otherid[0]
+
extern int __debug_on, __debug_print, __debug_stop, __debug_test;
typedef struct dbtcl_global {
@@ -202,6 +212,7 @@ extern DBTCL_GLOBAL __dbtcl_global;
* functions this will typically go before the "free" function to free the
* stat structure returned by DB.
*/
+#ifdef HAVE_STATISTICS
#define MAKE_STAT_LIST(s, v) do { \
result = _SetListElemInt(interp, res, (s), (long)(v)); \
if (result != TCL_OK) \
@@ -213,6 +224,11 @@ extern DBTCL_GLOBAL __dbtcl_global;
if (result != TCL_OK) \
goto error; \
} while (0)
+#else
+/* These do-nothing versions streamline the code & reduce warning messages. */
+#define MAKE_STAT_LIST(s, v) if (0) goto error
+#define MAKE_WSTAT_LIST(s, v) if (0) goto error
+#endif
/*
* MAKE_STAT_LSN appends a {name {LSNfile LSNoffset}} pair to a result list
@@ -257,13 +273,14 @@ extern DBTCL_GLOBAL __dbtcl_global;
* This macro also assumes a label "error" to go to in the event of a Tcl
* error.
*/
-#define MAKE_SITE_LIST(e, h, p, s, pr) do { \
- myobjc = 5; \
+#define MAKE_SITE_LIST(e, h, p, s, pr, vw) do { \
+ myobjc = 6; \
myobjv[0] = Tcl_NewIntObj(e); \
myobjv[1] = Tcl_NewStringObj((h), (int)strlen(h)); \
myobjv[2] = Tcl_NewIntObj((int)p); \
myobjv[3] = Tcl_NewStringObj((s), (int)strlen(s)); \
myobjv[4] = Tcl_NewStringObj((pr), (int)strlen(pr)); \
+ myobjv[5] = Tcl_NewStringObj((vw), (int)strlen(vw)); \
thislist = Tcl_NewListObj(myobjc, myobjv); \
result = Tcl_ListObjAppendElement(interp, res, thislist); \
if (result != TCL_OK) \
diff --git a/src/dbinc/txn.h b/src/dbinc/txn.h
index 7cbae263..682d7c42 100644
--- a/src/dbinc/txn.h
+++ b/src/dbinc/txn.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/dbinc/win_db.h b/src/dbinc/win_db.h
index ba57cd1f..e22aba98 100644
--- a/src/dbinc/win_db.h
+++ b/src/dbinc/win_db.h
@@ -1,17 +1,21 @@
/*-
- * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2010, 2015 Oracle and/or its affiliates. All rights reserved.
*
* The following provides the information necessary to build Berkeley
* DB on native Windows, and other Windows environments such as MinGW.
*/
/*
- * Berkeley DB requires at least Windows 2000, tell Visual Studio of the
- * requirement.
+ * Berkeley DB requires at least Windows 2000, or Windows XP when built with
+ * Visual Studio 2012 or later. Tell Visual Studio of the requirement.
*/
#ifndef _WIN32_WINNT
+#if _MSC_VER >= 1700
+#define _WIN32_WINNT 0x0501
+#else
#define _WIN32_WINNT 0x0500
#endif
+#endif
#ifndef DB_WINCE
#include <sys/types.h>
@@ -69,12 +73,46 @@
#endif
#define getpid GetCurrentProcessId
#define snprintf _snprintf
+#ifndef strcasecmp
#define strcasecmp _stricmp
#define strncasecmp _strnicmp
+#endif
#define vsnprintf _vsnprintf
#define h_errno WSAGetLastError()
+#ifdef DB_WINCE
+/* Macros used by setvbuf on WINCE */
+#ifndef _IOFBF
+#define _IOFBF 0x0000
+#endif
+#ifndef _IOLBF
+#define _IOLBF 0x0040
+#endif
+#ifndef _IONBF
+#define _IONBF 0x0004
+#endif
+/* Redirect file and time functions to __ce_* versions; time constants. */
+#define freopen __ce_freopen
+#define gmtime __ce_gmtime
+#define mktime __ce_mktime
+#define remove __ce_remove
+#define SECSPERMIN 60
+#define MINSPERHOUR 60
+#define HOURSPERDAY 24
+#define DAYSPERWEEK 7
+#define DAYSPERNYEAR 365
+#define DAYSPERLYEAR 366
+#define SECSPERHOUR (SECSPERMIN * MINSPERHOUR)
+#define SECSPERDAY ((long) SECSPERHOUR * HOURSPERDAY)
+#define MONSPERYEAR 12
+#define TM_YEAR_BASE 1900
+#define TM_YEAR_EPOCH 1970
+#define isleap(y) ((((y) % 4) == 0 && ((y) % 100) != 0) || ((y) % 400) == 0)
+extern const __DB_IMPORT unsigned int mon_lengths[][MONSPERYEAR];
+extern const __DB_IMPORT unsigned int year_lengths[];
+#endif
+
/*
* Win32 does not have getopt.
*
diff --git a/src/dbinc/xa.h b/src/dbinc/xa.h
index 7283c1ea..7b7e2cb0 100644
--- a/src/dbinc/xa.h
+++ b/src/dbinc/xa.h
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1998, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/