diff options
Diffstat (limited to 'src/dbinc')
42 files changed, 1574 insertions, 463 deletions
diff --git a/src/dbinc/atomic.h b/src/dbinc/atomic.h index 096176a5..61f2ead9 100644 --- a/src/dbinc/atomic.h +++ b/src/dbinc/atomic.h @@ -1,7 +1,7 @@ /* * See the file LICENSE for redistribution information. * - * Copyright (c) 2009, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2009, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -79,12 +79,11 @@ typedef struct { #define WINCE_ATOMIC_MAGIC(p) \ /* \ * Memory mapped regions on Windows CE cause problems with \ - * InterlockedXXX calls. Each page in a mapped region needs to \ - * have been written to prior to an InterlockedXXX call, or the \ - * InterlockedXXX call hangs. This does not seem to be \ - * documented anywhere. For now, read/write a non-critical \ - * piece of memory from the shared region prior to attempting \ - * shared region prior to attempting an InterlockedExchange \ + * InterlockedXXX calls. Each process making an InterlockedXXX \ + * call must make sure that it has written to the page prior to \ + * the call, or the InterlockedXXX call hangs. This does not \ + * seem to be documented anywhere. Write a non-critical piece \ + * of memory from the shared region prior to attempting an \ * InterlockedXXX operation. \ */ \ (p)->dummy = 0 @@ -144,7 +143,7 @@ typedef LONG volatile *interlocked_val; #define atomic_inc(env, p) __atomic_inc(p) #define atomic_dec(env, p) __atomic_dec(p) #define atomic_compare_exchange(env, p, o, n) \ - __atomic_compare_exchange((p), (o), (n)) + __atomic_compare_exchange_int((p), (o), (n)) static inline int __atomic_inc(db_atomic_t *p) { int temp; @@ -176,7 +175,7 @@ static inline int __atomic_dec(db_atomic_t *p) * http://gcc.gnu.org/onlinedocs/gcc-4.1.0/gcc/Atomic-Builtins.html * which configure could be changed to use. 
*/ -static inline int __atomic_compare_exchange( +static inline int __atomic_compare_exchange_int( db_atomic_t *p, atomic_value_t oldval, atomic_value_t newval) { atomic_value_t was; diff --git a/src/dbinc/blob.h b/src/dbinc/blob.h new file mode 100644 index 00000000..f4ff475b --- /dev/null +++ b/src/dbinc/blob.h @@ -0,0 +1,103 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2013, 2015 Oracle and/or its affiliates. All rights reserved. + * + * $Id$ + */ + +#ifndef _DB_BLOB_H_ +#define _DB_BLOB_H_ + +#if defined(__cplusplus) +extern "C" { +#endif + +/* + * How many characters can the path for a blob file use? + * Up to 6 subdirectory separators. + * Up to 6 directory names of up to three characters each. + * Up to 21 characters for blob_id identifier. + * 7 characters for the standard prefix (__db.bl) + * 1 for luck (or the terminating NUL) + * The largest blob id, 9,223,372,036,854,775,807 would + * produce a path and file name: + * 009/223/372/036/854/775/807/__db.bl009223372036854775807 + */ +#define MAX_BLOB_PATH "009/223/372/036/854/775/807/__db.bl009223372036854775807" +#define MAX_BLOB_PATH_SZ sizeof(MAX_BLOB_PATH) +#define BLOB_DEFAULT_DIR "__db_bl" +#define BLOB_META_FILE_NAME "__db_blob_meta.db" +#define BLOB_DIR_PREFIX "__db" +#define BLOB_FILE_PREFIX "__db.bl" + +#define BLOB_DIR_ELEMS 1000 + +#define IS_BLOB_META(name) \ + (name != NULL && strstr(name, BLOB_META_FILE_NAME) != NULL) +#define IS_BLOB_FILE(name) \ + (name != NULL && strstr(name, BLOB_FILE_PREFIX) != NULL) + +/* + * Combines two unsigned 32 bit integers into a 64 bit integer. + * Blob database file ids and sub database ids are 64 bit integers, + * but have to be stored on database metadata pages that must + * be readable on 32 bit only compilers. So the ids are split into + * two 32 bit integers, and combined when needed.
+ */ +#define GET_LO_HI(e, lo, hi, o, ret) do { \ + DB_ASSERT((e), sizeof(o) <= 8); \ + if (sizeof(o) == 8) { \ + (o) = (hi); \ + (o) = ((o) << 32); \ + (o) += (lo); \ + } else { \ + if ((hi) > 0) { \ + __db_errx((e), DB_STR("0765", \ + "Offset or id size overflow.")); \ + (ret) = EINVAL; \ + } \ + (o) = (lo); \ + } \ +} while (0); + +#define GET_BLOB_FILE_ID(e, p, o, ret) \ + GET_LO_HI(e, (p)->blob_file_lo, (p)->blob_file_hi, o, ret); + +#define GET_BLOB_SDB_ID(e, p, o, ret) \ + GET_LO_HI(e, (p)->blob_sdb_lo, (p)->blob_sdb_hi, o, ret); + +/* Splits a 64 bit integer into two unsigned 32 bit integers. */ +#define SET_LO_HI(p, v, type, field_lo, field_hi) do { \ + u_int32_t tmp; \ + if (sizeof((v)) == 8) { \ + tmp = (u_int32_t)((v) >> 32); \ + memcpy(((u_int8_t *)p) + SSZ(type, field_hi), \ + &tmp, sizeof(u_int32_t)); \ + } else { \ + memset(((u_int8_t *)p) + SSZ(type, field_hi), \ + 0, sizeof(u_int32_t)); \ + } \ + tmp = (u_int32_t)(v); \ + memcpy(((u_int8_t *)p) + SSZ(type, field_lo), \ + &tmp, sizeof(u_int32_t)); \ +} while (0); + +#define SET_LO_HI_VAR(v, field_lo, field_hi) do { \ + if (sizeof((v)) == 8) \ + field_hi = (u_int32_t)((v) >> 32); \ + else \ + field_hi = 0; \ + field_lo = (u_int32_t)(v); \ +} while (0); + +#define SET_BLOB_META_FILE_ID(p, v, type) \ + SET_LO_HI(p, v, type, blob_file_lo, blob_file_hi); + +#define SET_BLOB_META_SDB_ID(p, v, type) \ + SET_LO_HI(p, v, type, blob_sdb_lo, blob_sdb_hi); + +#if defined(__cplusplus) +} +#endif +#endif /* !_DB_BLOB_H_ */ diff --git a/src/dbinc/btree.h b/src/dbinc/btree.h index 86bbec14..a8b9e1ee 100644 --- a/src/dbinc/btree.h +++ b/src/dbinc/btree.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 1990, 1993, 1994, 1995, 1996 @@ -472,7 +472,7 @@ struct __btree { /* Btree access method. 
*/ u_int32_t bt_minkey; /* Minimum keys per page. */ /* Btree comparison function. */ - int (*bt_compare) __P((DB *, const DBT *, const DBT *)); + int (*bt_compare) __P((DB *, const DBT *, const DBT *, size_t *)); /* Btree prefix function. */ size_t (*bt_prefix) __P((DB *, const DBT *, const DBT *)); /* Btree compress function. */ @@ -483,7 +483,8 @@ struct __btree { /* Btree access method. */ int (*bt_decompress) __P((DB *, const DBT *, const DBT *, DBT *, DBT *, DBT *)); /* dup_compare for compression */ - int (*compress_dup_compare) __P((DB *, const DBT *, const DBT *)); + int (*compress_dup_compare) __P((DB *, const DBT *, const DBT *, + size_t *)); #endif /* Recno access method. */ @@ -539,7 +540,7 @@ typedef enum { * Flags for __bam_pinsert. */ #define BPI_SPACEONLY 0x01 /* Only check for space to update. */ -#define BPI_NORECNUM 0x02 /* Not update the recnum on the left. */ +#define BPI_NORECNUM 0x02 /* Don't update the left's recnum. */ #define BPI_NOLOGGING 0x04 /* Don't log the update. */ #define BPI_REPLACE 0x08 /* Replace the record. */ diff --git a/src/dbinc/clock.h b/src/dbinc/clock.h index caeaee70..b2815ea2 100644 --- a/src/dbinc/clock.h +++ b/src/dbinc/clock.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2005, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -125,6 +125,13 @@ typedef struct { timespecadd((vvp), &__tmp); \ } while (0) +#define TIMESPEC_SUB_DB_TIMEOUT(vvp, t) \ + do { \ + db_timespec __tmp; \ + DB_TIMEOUT_TO_TIMESPEC(t, &__tmp); \ + timespecsub((vvp), &__tmp); \ + } while (0) + #if defined(__cplusplus) } #endif diff --git a/src/dbinc/crypto.h b/src/dbinc/crypto.h index ea7a9cf0..4d889fd9 100644 --- a/src/dbinc/crypto.h +++ b/src/dbinc/crypto.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. 
All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/dbinc/cxx_int.h b/src/dbinc/cxx_int.h index 5492ead7..368bac86 100644 --- a/src/dbinc/cxx_int.h +++ b/src/dbinc/cxx_int.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/dbinc/db.in b/src/dbinc/db.in index a948910e..b592b746 100644 --- a/src/dbinc/db.in +++ b/src/dbinc/db.in @@ -1,7 +1,7 @@ /* * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ * @@ -102,6 +102,7 @@ extern "C" { @FILE_t_decl@ @off_t_decl@ +@db_off_t_decl@ @pid_t_decl@ @size_t_decl@ #ifdef HAVE_MIXED_SIZE_ADDRESSING @@ -131,9 +132,9 @@ typedef u_int16_t db_indx_t; /* Page offset type. */ #define DB_MAX_PAGES 0xffffffff /* >= # of pages in a file */ typedef u_int32_t db_recno_t; /* Record number type. */ -#define DB_MAX_RECORDS 0xffffffff /* >= # of records in a tree */ +#define DB_MAX_RECORDS 0xffffffff /* >= # of records in a recno tree. */ -typedef u_int32_t db_timeout_t; /* Type of a timeout. */ +typedef u_int32_t db_timeout_t; /* Type of a timeout in microseconds. 
*/ /* * Region offsets are the difference between a pointer in a region and the @@ -157,6 +158,10 @@ struct __db_compact; typedef struct __db_compact DB_COMPACT; struct __db_dbt; typedef struct __db_dbt DBT; struct __db_distab; typedef struct __db_distab DB_DISTAB; struct __db_env; typedef struct __db_env DB_ENV; +struct __db_event_mutex_died_info; + typedef struct __db_event_mutex_died_info DB_EVENT_MUTEX_DIED_INFO; +struct __db_event_failchk_info; + typedef struct __db_event_failchk_info DB_EVENT_FAILCHK_INFO; struct __db_h_stat; typedef struct __db_h_stat DB_HASH_STAT; struct __db_heap_rid; typedef struct __db_heap_rid DB_HEAP_RID; struct __db_heap_stat; typedef struct __db_heap_stat DB_HEAP_STAT; @@ -189,6 +194,7 @@ struct __db_repmgr_site;typedef struct __db_repmgr_site DB_REPMGR_SITE; struct __db_repmgr_stat;typedef struct __db_repmgr_stat DB_REPMGR_STAT; struct __db_seq_record; typedef struct __db_seq_record DB_SEQ_RECORD; struct __db_seq_stat; typedef struct __db_seq_stat DB_SEQUENCE_STAT; +struct __db_stream; typedef struct __db_stream DB_STREAM; struct __db_site; typedef struct __db_site DB_SITE; struct __db_sequence; typedef struct __db_sequence DB_SEQUENCE; struct __db_thread_info;typedef struct __db_thread_info DB_THREAD_INFO; @@ -226,18 +232,20 @@ struct __db_dbt { void *app_data; -#define DB_DBT_APPMALLOC 0x001 /* Callback allocated memory. */ -#define DB_DBT_BULK 0x002 /* Internal: Insert if duplicate. */ -#define DB_DBT_DUPOK 0x004 /* Internal: Insert if duplicate. */ -#define DB_DBT_ISSET 0x008 /* Lower level calls set value. */ -#define DB_DBT_MALLOC 0x010 /* Return in malloc'd memory. */ -#define DB_DBT_MULTIPLE 0x020 /* References multiple records. */ -#define DB_DBT_PARTIAL 0x040 /* Partial put/get. */ -#define DB_DBT_REALLOC 0x080 /* Return in realloc'd memory. */ -#define DB_DBT_READONLY 0x100 /* Readonly, don't update. */ -#define DB_DBT_STREAMING 0x200 /* Internal: DBT is being streamed. 
*/ -#define DB_DBT_USERCOPY 0x400 /* Use the user-supplied callback. */ -#define DB_DBT_USERMEM 0x800 /* Return in user's memory. */ +#define DB_DBT_APPMALLOC 0x0001 /* Callback allocated memory. */ +#define DB_DBT_BULK 0x0002 /* Internal: Insert if duplicate. */ +#define DB_DBT_DUPOK 0x0004 /* Internal: Insert if duplicate. */ +#define DB_DBT_ISSET 0x0008 /* Lower level calls set value. */ +#define DB_DBT_MALLOC 0x0010 /* Return in malloc'd memory. */ +#define DB_DBT_MULTIPLE 0x0020 /* References multiple records. */ +#define DB_DBT_PARTIAL 0x0040 /* Partial put/get. */ +#define DB_DBT_REALLOC 0x0080 /* Return in realloc'd memory. */ +#define DB_DBT_READONLY 0x0100 /* Readonly, don't update. */ +#define DB_DBT_STREAMING 0x0200 /* Internal: DBT is being streamed. */ +#define DB_DBT_USERCOPY 0x0400 /* Use the user-supplied callback. */ +#define DB_DBT_USERMEM 0x0800 /* Return in user's memory. */ +#define DB_DBT_BLOB 0x1000 /* Data item is a blob. */ +#define DB_DBT_BLOB_REC 0x2000 /* Internal: Blob database record. */ u_int32_t flags; }; @@ -274,6 +282,23 @@ struct __db_mutex_stat { /* SHARED */ #endif }; +/* Buffers passed to __mutex_describe() must be at least this large. */ +#define DB_MUTEX_DESCRIBE_STRLEN 128 + +/* This is the info of a DB_EVENT_MUTEX_DIED event notification. */ +struct __db_event_mutex_died_info { + pid_t pid; /* Process which last owned the mutex */ + db_threadid_t tid; /* Thread which last owned the mutex */ + db_mutex_t mutex; /* ID of the mutex */ + char desc[DB_MUTEX_DESCRIBE_STRLEN]; +}; + +/* This is the info of a DB_EVENT_FAILCHK event notification. */ +#define DB_FAILURE_SYMPTOM_SIZE 120 +struct __db_event_failchk_info { + int error; + char symptom[DB_FAILURE_SYMPTOM_SIZE]; +}; /* This is the length of the buffer passed to DB_ENV->thread_id_string() */ #define DB_THREADID_STRLEN 128 @@ -400,6 +425,8 @@ struct __db_lock_stat { /* SHARED */ uintmax_t st_lockers_nowait; /* Locker lock granted without wait. 
*/ uintmax_t st_region_wait; /* Region lock granted after wait. */ uintmax_t st_region_nowait; /* Region lock granted without wait. */ + uintmax_t st_nlockers_hit; /* Lockers found in thread info. */ + uintmax_t st_nlockers_reused; /* Lockers reallocated from thread info. */ u_int32_t st_hash_len; /* Max length of bucket. */ roff_t st_regsize; /* Region size. */ #endif @@ -469,7 +496,7 @@ struct __db_lockreq { /******************************************************* * Logging. *******************************************************/ -#define DB_LOGVERSION 19 /* Current log version. */ +#define DB_LOGVERSION 22 /* Current log version. */ #define DB_LOGVERSION_LATCHING 15 /* Log version using latching: db-4.8 */ #define DB_LOGCHKSUM 12 /* Check sum headers: db-4.5 */ #define DB_LOGOLDVER 8 /* Oldest version supported: db-4.2 */ @@ -595,7 +622,8 @@ typedef enum { LOGREC_PGDDBT, LOGREC_PGLIST, LOGREC_POINTER, - LOGREC_TIME + LOGREC_TIME, + LOGREC_LONGARG } log_rec_type_t; typedef const struct __log_rec_spec { @@ -755,6 +783,7 @@ struct __db_mpool_stat { /* SHARED */ uintmax_t st_mvcc_frozen; /* Buffers frozen. */ uintmax_t st_mvcc_thawed; /* Buffers thawed. */ uintmax_t st_mvcc_freed; /* Frozen buffers freed. */ + uintmax_t st_mvcc_reused; /* Outdated invisible buffers reused. */ uintmax_t st_alloc; /* Number of page allocations. */ uintmax_t st_alloc_buckets; /* Buckets checked during allocation. */ uintmax_t st_alloc_max_buckets;/* Max checked during allocation. */ @@ -762,6 +791,8 @@ struct __db_mpool_stat { /* SHARED */ uintmax_t st_alloc_max_pages; /* Max checked during allocation. */ uintmax_t st_io_wait; /* Thread waited on buffer I/O. */ uintmax_t st_sync_interrupted; /* Number of times sync interrupted. */ + u_int32_t st_oddfsize_detect; /* Odd file size detected. */ + u_int32_t st_oddfsize_resolve; /* Odd file size resolved. */ roff_t st_regsize; /* Region size. */ roff_t st_regmax; /* Region max. 
*/ #endif @@ -956,7 +987,7 @@ struct __db_txn { #define TXN_SNAPSHOT 0x08000 /* Snapshot Isolation. */ #define TXN_SYNC 0x10000 /* Write and sync on prepare/commit. */ #define TXN_WRITE_NOSYNC 0x20000 /* Write only on prepare/commit. */ -#define TXN_BULK 0x40000 /* Enable bulk loading optimization. */ +#define TXN_BULK 0x40000 /* Enable bulk loading optimization. */ u_int32_t flags; }; @@ -1065,30 +1096,34 @@ struct __db_txn_token { /* * Event notification types. (Tcl testing interface currently assumes there are - * no more than 32 of these.) + * no more than 32 of these.) Comments include any relevant event_info types. */ #define DB_EVENT_PANIC 0 -#define DB_EVENT_REG_ALIVE 1 -#define DB_EVENT_REG_PANIC 2 -#define DB_EVENT_REP_CLIENT 3 -#define DB_EVENT_REP_CONNECT_BROKEN 4 -#define DB_EVENT_REP_CONNECT_ESTD 5 -#define DB_EVENT_REP_CONNECT_TRY_FAILED 6 -#define DB_EVENT_REP_DUPMASTER 7 -#define DB_EVENT_REP_ELECTED 8 -#define DB_EVENT_REP_ELECTION_FAILED 9 -#define DB_EVENT_REP_INIT_DONE 10 -#define DB_EVENT_REP_JOIN_FAILURE 11 -#define DB_EVENT_REP_LOCAL_SITE_REMOVED 12 -#define DB_EVENT_REP_MASTER 13 -#define DB_EVENT_REP_MASTER_FAILURE 14 -#define DB_EVENT_REP_NEWMASTER 15 -#define DB_EVENT_REP_PERM_FAILED 16 -#define DB_EVENT_REP_SITE_ADDED 17 -#define DB_EVENT_REP_SITE_REMOVED 18 -#define DB_EVENT_REP_STARTUPDONE 19 -#define DB_EVENT_REP_WOULD_ROLLBACK 20 /* Undocumented; C API only. */ -#define DB_EVENT_WRITE_FAILED 21 +#define DB_EVENT_REG_ALIVE 1 /* int: pid which was in env */ +#define DB_EVENT_REG_PANIC 2 /* int: error causing the panic.
*/ +#define DB_EVENT_REP_AUTOTAKEOVER_FAILED 3 +#define DB_EVENT_REP_CLIENT 4 +#define DB_EVENT_REP_CONNECT_BROKEN 5 /* DB_REPMGR_CONN_ERR */ +#define DB_EVENT_REP_CONNECT_ESTD 6 /* int: EID of remote site */ +#define DB_EVENT_REP_CONNECT_TRY_FAILED 7 /* DB_REPMGR_CONN_ERR */ +#define DB_EVENT_REP_DUPMASTER 8 +#define DB_EVENT_REP_ELECTED 9 +#define DB_EVENT_REP_ELECTION_FAILED 10 +#define DB_EVENT_REP_INIT_DONE 11 +#define DB_EVENT_REP_INQUEUE_FULL 12 +#define DB_EVENT_REP_JOIN_FAILURE 13 +#define DB_EVENT_REP_LOCAL_SITE_REMOVED 14 +#define DB_EVENT_REP_MASTER 15 +#define DB_EVENT_REP_MASTER_FAILURE 16 +#define DB_EVENT_REP_NEWMASTER 17 /* int: new master's site id */ +#define DB_EVENT_REP_PERM_FAILED 18 +#define DB_EVENT_REP_SITE_ADDED 19 /* int: eid */ +#define DB_EVENT_REP_SITE_REMOVED 20 /* int: eid */ +#define DB_EVENT_REP_STARTUPDONE 21 +#define DB_EVENT_REP_WOULD_ROLLBACK 22 /* Undocumented; C API only. */ +#define DB_EVENT_WRITE_FAILED 23 +#define DB_EVENT_MUTEX_DIED 24 /* DB_EVENT_MUTEX_DIED_INFO */ +#define DB_EVENT_FAILCHK_PANIC 25 /* DB_EVENT_FAILCHK_INFO */ #define DB_EVENT_NO_SUCH_EVENT 0xffffffff /* OOB sentinel value */ /* Replication Manager site status. */ @@ -1102,6 +1137,7 @@ struct __db_repmgr_site { u_int32_t status; #define DB_REPMGR_ISPEER 0x01 +#define DB_REPMGR_ISVIEW 0x02 u_int32_t flags; }; @@ -1117,6 +1153,7 @@ struct __db_rep_stat { /* SHARED */ * circumstances, garbaged). */ u_int32_t st_startup_complete; /* Site completed client sync-up. */ + u_int32_t st_view; /* Site is a view. */ #ifndef __TEST_DB_NO_STATISTICS uintmax_t st_log_queued; /* Log records currently queued.+ */ u_int32_t st_status; /* Current replication status. */ @@ -1194,6 +1231,7 @@ struct __db_rep_stat { /* SHARED */ /* Undocumented statistics only used by the test system. */ #ifdef CONFIG_TEST u_int32_t st_filefail_cleanups; /* # of FILE_FAIL cleanups done. */ + uintmax_t st_log_futuredup; /* Future log records that are dups. 
*/ #endif #endif }; @@ -1204,10 +1242,18 @@ struct __db_repmgr_stat { /* SHARED */ uintmax_t st_msgs_queued; /* # msgs queued for network delay. */ uintmax_t st_msgs_dropped; /* # msgs discarded due to excessive queue length. */ + u_int32_t st_incoming_queue_gbytes; /* Incoming queue size: GB. */ + u_int32_t st_incoming_queue_bytes; /* Incoming queue size: B. */ + uintmax_t st_incoming_msgs_dropped; /* # of msgs discarded due to + incoming queue full. */ uintmax_t st_connection_drop; /* Existing connections dropped. */ uintmax_t st_connect_fail; /* Failed new connection attempts. */ - uintmax_t st_elect_threads; /* # of active election threads. */ - uintmax_t st_max_elect_threads; /* Max concurrent e-threads ever. */ + u_int32_t st_elect_threads; /* # of active election threads. */ + u_int32_t st_max_elect_threads; /* Max concurrent e-threads ever. */ + u_int32_t st_site_participants; /* # of repgroup participant sites. */ + u_int32_t st_site_total; /* # of repgroup total sites. */ + u_int32_t st_site_views; /* # of repgroup view sites. */ + uintmax_t st_takeovers; /* # of automatic listener takeovers. */ }; /* Replication Manager connection error. */ @@ -1238,7 +1284,7 @@ struct __db_sequence { db_mutex_t mtx_seq; /* Mutex if sequence is threaded. */ DB_SEQ_RECORD *seq_rp; /* Pointer to current data. */ DB_SEQ_RECORD seq_record; /* Data from DB_SEQUENCE. */ - int32_t seq_cache_size; /* Number of values cached. */ + u_int32_t seq_cache_size; /* Number of values cached. */ db_seq_t seq_last_value; /* Last value cached. */ db_seq_t seq_prev_value; /* Last value returned. */ DBT seq_key; /* DBT pointing to sequence key. 
*/ @@ -1250,8 +1296,8 @@ struct __db_sequence { /* DB_SEQUENCE PUBLIC HANDLE LIST BEGIN */ int (*close) __P((DB_SEQUENCE *, u_int32_t)); int (*get) __P((DB_SEQUENCE *, - DB_TXN *, int32_t, db_seq_t *, u_int32_t)); - int (*get_cachesize) __P((DB_SEQUENCE *, int32_t *)); + DB_TXN *, u_int32_t, db_seq_t *, u_int32_t)); + int (*get_cachesize) __P((DB_SEQUENCE *, u_int32_t *)); int (*get_db) __P((DB_SEQUENCE *, DB **)); int (*get_flags) __P((DB_SEQUENCE *, u_int32_t *)); int (*get_key) __P((DB_SEQUENCE *, DBT *)); @@ -1261,7 +1307,7 @@ struct __db_sequence { int (*open) __P((DB_SEQUENCE *, DB_TXN *, DBT *, u_int32_t)); int (*remove) __P((DB_SEQUENCE *, DB_TXN *, u_int32_t)); - int (*set_cachesize) __P((DB_SEQUENCE *, int32_t)); + int (*set_cachesize) __P((DB_SEQUENCE *, u_int32_t)); int (*set_flags) __P((DB_SEQUENCE *, u_int32_t)); int (*set_range) __P((DB_SEQUENCE *, db_seq_t, db_seq_t)); int (*stat) __P((DB_SEQUENCE *, @@ -1278,7 +1324,7 @@ struct __db_seq_stat { /* SHARED */ db_seq_t st_last_value; /* Last cached value. */ db_seq_t st_min; /* Minimum value. */ db_seq_t st_max; /* Maximum value. */ - int32_t st_cache_size; /* Cache size. */ + u_int32_t st_cache_size; /* Cache size. */ u_int32_t st_flags; /* Flag value. */ }; @@ -1300,15 +1346,15 @@ typedef enum { #define DB_RENAMEMAGIC 0x030800 /* File has been renamed. */ -#define DB_BTREEVERSION 9 /* Current btree version. */ +#define DB_BTREEVERSION 10 /* Current btree version. */ #define DB_BTREEOLDVER 8 /* Oldest btree version supported. */ #define DB_BTREEMAGIC 0x053162 -#define DB_HASHVERSION 9 /* Current hash version. */ +#define DB_HASHVERSION 10 /* Current hash version. */ #define DB_HASHOLDVER 7 /* Oldest hash version supported. */ #define DB_HASHMAGIC 0x061561 -#define DB_HEAPVERSION 1 /* Current heap version. */ +#define DB_HEAPVERSION 2 /* Current heap version. */ #define DB_HEAPOLDVER 1 /* Oldest heap version supported. 
*/ #define DB_HEAPMAGIC 0x074582 @@ -1377,6 +1423,7 @@ typedef enum { #define DB_LOCK_NOTGRANTED (-30992)/* Lock unavailable. */ #define DB_LOG_BUFFER_FULL (-30991)/* In-memory log buffer full. */ #define DB_LOG_VERIFY_BAD (-30990)/* Log verification failed. */ +#define DB_META_CHKSUM_FAIL (-30968)/* Metadata page checksum failed. */ #define DB_NOSERVER (-30989)/* Server panic return. */ #define DB_NOTFOUND (-30988)/* Key/data pair not found (EOF). */ #define DB_OLD_VERSION (-30987)/* Out-of-date version. */ @@ -1405,6 +1452,8 @@ typedef enum { #define DB_DELETED (-30897)/* Recovery file marked deleted. */ #define DB_EVENT_NOT_HANDLED (-30896)/* Forward event to application. */ #define DB_NEEDSPLIT (-30895)/* Page needs to be split. */ +#define DB_NOINTMP (-30886)/* Sequences not supported in temporary + or in-memory databases. */ #define DB_REP_BULKOVF (-30894)/* Rep bulk buffer overflow. */ #define DB_REP_LOGREADY (-30893)/* Rep log ready for recovery. */ #define DB_REP_NEWMASTER (-30892)/* We have learned of a new master. */ @@ -1415,6 +1464,13 @@ typedef enum { #define DB_TXN_CKP (-30888)/* Encountered ckp record in log. */ #define DB_VERIFY_FATAL (-30887)/* DB->verify cannot proceed. */ +/* + * This exit status indicates that a BDB utility failed because it needed a + * resource which had been held by a process which crashed or otherwise did + * not exit cleanly. + */ +#define DB_EXIT_FAILCHK 3 + /* Database handle. */ struct __db { /******************************************************* @@ -1426,7 +1482,7 @@ struct __db { /* Callbacks. */ int (*db_append_recno) __P((DB *, DBT *, db_recno_t)); void (*db_feedback) __P((DB *, int, int)); - int (*dup_compare) __P((DB *, const DBT *, const DBT *)); + int (*dup_compare) __P((DB *, const DBT *, const DBT *, size_t *)); void *app_private; /* Application-private handle. */ @@ -1450,6 +1506,8 @@ struct __db { u_int32_t adj_fileid; /* File's unique ID for curs. adj. 
*/ + u_int32_t blob_threshold; /* Blob threshold record size. */ + #define DB_LOGFILEID_INVALID -1 FNAME *log_filename; /* File's naming info for logging. */ @@ -1593,6 +1651,12 @@ struct __db { /* Reference to foreign -- set in the secondary. */ DB *s_foreign; + DB *blob_meta_db; /* Databases holding blob metadata. */ + DB_SEQUENCE *blob_seq; /* Sequence of blob ids. */ + char *blob_sub_dir; /* Subdirectory for blob files */ + db_seq_t blob_file_id; /* Id of the file blob directory. */ + db_seq_t blob_sdb_id; /* Id of the subdb blob directory. */ + /* API-private structure: used by DB 1.85, C++, Java, Perl and Tcl */ void *api_internal; @@ -1623,8 +1687,11 @@ struct __db { void *(**)(void *, size_t), void (**)(void *))); int (*get_append_recno) __P((DB *, int (**)(DB *, DBT *, db_recno_t))); int (*get_assoc_flags) __P((DB *, u_int32_t *)); + int (*get_blob_dir) __P((DB *, const char **)); + int (*get_blob_sub_dir) __P((DB *, const char **)); + int (*get_blob_threshold) __P((DB *, u_int32_t *)); int (*get_bt_compare) - __P((DB *, int (**)(DB *, const DBT *, const DBT *))); + __P((DB *, int (**)(DB *, const DBT *, const DBT *, size_t *))); int (*get_bt_compress) __P((DB *, int (**)(DB *, const DBT *, const DBT *, const DBT *, const DBT *, DBT *), @@ -1637,7 +1704,7 @@ struct __db { int (*get_create_dir) __P((DB *, const char **)); int (*get_dbname) __P((DB *, const char **, const char **)); int (*get_dup_compare) - __P((DB *, int (**)(DB *, const DBT *, const DBT *))); + __P((DB *, int (**)(DB *, const DBT *, const DBT *, size_t *))); int (*get_encrypt_flags) __P((DB *, u_int32_t *)); DB_ENV *(*get_env) __P((DB *)); void (*get_errcall) __P((DB *, @@ -1647,7 +1714,7 @@ struct __db { int (*get_feedback) __P((DB *, void (**)(DB *, int, int))); int (*get_flags) __P((DB *, u_int32_t *)); int (*get_h_compare) - __P((DB *, int (**)(DB *, const DBT *, const DBT *))); + __P((DB *, int (**)(DB *, const DBT *, const DBT *, size_t *))); int (*get_h_ffactor) __P((DB *, u_int32_t 
*)); int (*get_h_hash) __P((DB *, u_int32_t (**)(DB *, const void *, u_int32_t))); @@ -1688,8 +1755,10 @@ struct __db { int (*set_alloc) __P((DB *, void *(*)(size_t), void *(*)(void *, size_t), void (*)(void *))); int (*set_append_recno) __P((DB *, int (*)(DB *, DBT *, db_recno_t))); + int (*set_blob_dir) __P((DB *, const char *)); + int (*set_blob_threshold) __P((DB *, u_int32_t, u_int32_t)); int (*set_bt_compare) - __P((DB *, int (*)(DB *, const DBT *, const DBT *))); + __P((DB *, int (*)(DB *, const DBT *, const DBT *, size_t *))); int (*set_bt_compress) __P((DB *, int (*)(DB *, const DBT *, const DBT *, const DBT *, const DBT *, DBT *), int (*)(DB *, const DBT *, const DBT *, DBT *, DBT *, DBT *))); @@ -1699,7 +1768,7 @@ struct __db { int (*set_cachesize) __P((DB *, u_int32_t, u_int32_t, int)); int (*set_create_dir) __P((DB *, const char *)); int (*set_dup_compare) - __P((DB *, int (*)(DB *, const DBT *, const DBT *))); + __P((DB *, int (*)(DB *, const DBT *, const DBT *, size_t *))); int (*set_encrypt) __P((DB *, const char *, u_int32_t)); void (*set_errcall) __P((DB *, void (*)(const DB_ENV *, const char *, const char *))); @@ -1708,7 +1777,7 @@ struct __db { int (*set_feedback) __P((DB *, void (*)(DB *, int, int))); int (*set_flags) __P((DB *, u_int32_t)); int (*set_h_compare) - __P((DB *, int (*)(DB *, const DBT *, const DBT *))); + __P((DB *, int (*)(DB *, const DBT *, const DBT *, size_t *))); int (*set_h_ffactor) __P((DB *, u_int32_t)); int (*set_h_hash) __P((DB *, u_int32_t (*)(DB *, const void *, u_int32_t))); @@ -1808,13 +1877,34 @@ struct __db { u_int32_t orig_flags; /* Flags at open, for refresh */ u_int32_t flags; -#define DB2_AM_EXCL 0x00000001 /* Exclusively lock the handle */ -#define DB2_AM_INTEXCL 0x00000002 /* Internal exclusive lock. 
*/ -#define DB2_AM_NOWAIT 0x00000004 /* Do not wait for handle lock */ - u_int32_t orig_flags2; /* Second flags word; for refresh */ +#define DB2_AM_EXCL 0x00000001 /* Exclusively lock the handle */ +#define DB2_AM_INTEXCL 0x00000002 /* Internal exclusive lock. */ +#define DB2_AM_NOWAIT 0x00000004 /* Do not wait for handle lock */ u_int32_t flags2; /* Second flags word */ }; +/* + * Stream interface for blob files. + */ +struct __db_stream { + DBC *dbc; /* Cursor pointing to the db blob record. */ + DB_FH *fhp; + + /* DB_STREAM PUBLIC HANDLE LIST BEGIN */ + int (*close) __P((DB_STREAM *, u_int32_t)); + int (*read) __P((DB_STREAM *, DBT *, db_off_t, u_int32_t, u_int32_t)); + int (*size) __P((DB_STREAM *, db_off_t *, u_int32_t)); + int (*write) __P((DB_STREAM *, DBT *, db_off_t, u_int32_t)); + /* DB_STREAM PUBLIC HANDLE LIST END */ + + u_int32_t flags; +#define DB_STREAM_READ 0x00000001 /* Stream is read only. */ +#define DB_STREAM_WRITE 0x00000002 /* Stream is writeable. */ +#define DB_STREAM_SYNC_WRITE 0x00000004 /* Sync file on each write. */ + db_seq_t blob_id; + db_off_t file_size; +}; + /* * Macros for bulk operations. These are only intended for the C API. * For C++, use DbMultiple*Iterator or DbMultiple*Builder. 
@@ -1889,7 +1979,7 @@ struct __db { pointer = __p; \ } while (0) -#define DB_MULTIPLE_WRITE_INIT(pointer, dbt) \ +#define DB_MULTIPLE_WRITE_INIT(pointer, dbt) \ do { \ (dbt)->flags |= DB_DBT_BULK; \ pointer = (u_int8_t *)(dbt)->data + \ @@ -1897,7 +1987,7 @@ struct __db { *(u_int32_t *)(pointer) = (u_int32_t)-1; \ } while (0) -#define DB_MULTIPLE_RESERVE_NEXT(pointer, dbt, writedata, writedlen) \ +#define DB_MULTIPLE_RESERVE_NEXT(pointer, dbt, writedata, writedlen) \ do { \ u_int32_t *__p = (u_int32_t *)(pointer); \ u_int32_t __off = ((pointer) == (u_int8_t *)(dbt)->data +\ @@ -1914,7 +2004,7 @@ struct __db { } \ } while (0) -#define DB_MULTIPLE_WRITE_NEXT(pointer, dbt, writedata, writedlen) \ +#define DB_MULTIPLE_WRITE_NEXT(pointer, dbt, writedata, writedlen) \ do { \ void *__destd; \ DB_MULTIPLE_RESERVE_NEXT((pointer), (dbt), \ @@ -1925,7 +2015,7 @@ struct __db { memcpy(__destd, (writedata), (writedlen)); \ } while (0) -#define DB_MULTIPLE_KEY_RESERVE_NEXT(pointer, dbt, writekey, writeklen, writedata, writedlen) \ +#define DB_MULTIPLE_KEY_RESERVE_NEXT(pointer, dbt, writekey, writeklen, writedata, writedlen) \ do { \ u_int32_t *__p = (u_int32_t *)(pointer); \ u_int32_t __off = ((pointer) == (u_int8_t *)(dbt)->data +\ @@ -1948,7 +2038,7 @@ struct __db { } \ } while (0) -#define DB_MULTIPLE_KEY_WRITE_NEXT(pointer, dbt, writekey, writeklen, writedata, writedlen) \ +#define DB_MULTIPLE_KEY_WRITE_NEXT(pointer, dbt, writekey, writeklen, writedata, writedlen) \ do { \ void *__destk, *__destd; \ DB_MULTIPLE_KEY_RESERVE_NEXT((pointer), (dbt), \ @@ -1962,7 +2052,7 @@ struct __db { } \ } while (0) -#define DB_MULTIPLE_RECNO_WRITE_INIT(pointer, dbt) \ +#define DB_MULTIPLE_RECNO_WRITE_INIT(pointer, dbt) \ do { \ (dbt)->flags |= DB_DBT_BULK; \ pointer = (u_int8_t *)(dbt)->data + \ @@ -1970,7 +2060,7 @@ struct __db { *(u_int32_t *)(pointer) = 0; \ } while (0) -#define DB_MULTIPLE_RECNO_RESERVE_NEXT(pointer, dbt, recno, writedata, writedlen) \ +#define 
DB_MULTIPLE_RECNO_RESERVE_NEXT(pointer, dbt, recno, writedata, writedlen) \ do { \ u_int32_t *__p = (u_int32_t *)(pointer); \ u_int32_t __off = ((pointer) == (u_int8_t *)(dbt)->data +\ @@ -1988,7 +2078,7 @@ struct __db { } \ } while (0) -#define DB_MULTIPLE_RECNO_WRITE_NEXT(pointer, dbt, recno, writedata, writedlen)\ +#define DB_MULTIPLE_RECNO_WRITE_NEXT(pointer, dbt, recno, writedata, writedlen)\ do { \ void *__destd; \ DB_MULTIPLE_RECNO_RESERVE_NEXT((pointer), (dbt), \ @@ -2003,7 +2093,7 @@ struct __db_heap_rid { db_pgno_t pgno; /* Page number. */ db_indx_t indx; /* Index in the offset table. */ }; -#define DB_HEAP_RID_SZ (sizeof(db_pgno_t) + sizeof(db_indx_t)) +#define DB_HEAP_RID_SZ (sizeof(db_pgno_t) + sizeof(db_indx_t)) /******************************************************* * Access method cursors. @@ -2074,6 +2164,7 @@ struct __dbc { int (*close) __P((DBC *)); int (*cmp) __P((DBC *, DBC *, int *, u_int32_t)); int (*count) __P((DBC *, db_recno_t *, u_int32_t)); + int (*db_stream) __P((DBC *, DB_STREAM **, u_int32_t)); int (*del) __P((DBC *, u_int32_t)); int (*dup) __P((DBC *, DBC **, u_int32_t)); int (*get) __P((DBC *, DBT *, DBT *, u_int32_t)); @@ -2151,6 +2242,7 @@ struct __db_bt_stat { /* SHARED */ u_int32_t bt_pagecnt; /* Page count. */ u_int32_t bt_pagesize; /* Page size. */ u_int32_t bt_minkey; /* Minkey value. */ + u_int32_t bt_nblobs; /* Number of blobs. */ u_int32_t bt_re_len; /* Fixed-length record length. */ u_int32_t bt_re_pad; /* Fixed-length record pad. */ u_int32_t bt_levels; /* Tree levels. */ @@ -2179,7 +2271,7 @@ struct __db_compact { u_int32_t compact_deadlock; /* Number of deadlocks. */ db_pgno_t compact_pages_truncated; /* Pages truncated to OS. */ /* Internal. */ - db_pgno_t compact_truncate; /* Page number for truncation */ + db_pgno_t compact_truncate; /* Exchange pages above here. */ }; /* Hash statistics structure. */ @@ -2189,6 +2281,7 @@ struct __db_h_stat { /* SHARED */ u_int32_t hash_metaflags; /* Metadata flags. 
*/ u_int32_t hash_nkeys; /* Number of unique keys. */ u_int32_t hash_ndata; /* Number of data items. */ + u_int32_t hash_nblobs; /* Number of blobs. */ u_int32_t hash_pagecnt; /* Page count. */ u_int32_t hash_pagesize; /* Page size. */ u_int32_t hash_ffactor; /* Fill factor specified at create. */ @@ -2208,6 +2301,7 @@ struct __db_heap_stat { /* SHARED */ u_int32_t heap_magic; /* Magic number. */ u_int32_t heap_version; /* Version number. */ u_int32_t heap_metaflags; /* Metadata flags. */ + u_int32_t heap_nblobs; /* Number of blobs. */ u_int32_t heap_nrecs; /* Number of records. */ u_int32_t heap_pagecnt; /* Page count. */ u_int32_t heap_pagesize; /* Page size. */ @@ -2267,21 +2361,15 @@ typedef enum { * Backup configuration types. */ typedef enum { - DB_BACKUP_READ_COUNT = 1, - DB_BACKUP_READ_SLEEP = 2, - DB_BACKUP_SIZE = 3, - DB_BACKUP_WRITE_DIRECT = 4 + DB_BACKUP_READ_COUNT=1, + DB_BACKUP_READ_SLEEP=2, + DB_BACKUP_SIZE=3, + DB_BACKUP_WRITE_DIRECT=4 } DB_BACKUP_CONFIG; struct __db_env { ENV *env; /* Linked ENV structure */ - /* - * The DB_ENV structure can be used concurrently, so field access is - * protected. 
- */ - db_mutex_t mtx_db_env; /* DB_ENV structure mutex */ - /* Error message callback */ void (*db_errcall) __P((const DB_ENV *, const char *, const char *)); FILE *db_errfile; /* Error message file stream */ @@ -2304,6 +2392,7 @@ struct __db_env { char *(*thread_id_string) __P((DB_ENV *, pid_t, db_threadid_t, char *)); /* Application specified paths */ + char *db_blob_dir; /* Blob file directory */ char *db_log_dir; /* Database log file directory */ char *db_md_dir; /* Persistent metadata directory */ char *db_tmp_dir; /* Database tmp file directory */ @@ -2327,6 +2416,8 @@ struct __db_env { u_int32_t verbose; /* DB_VERB_XXX flags */ + u_int32_t blob_threshold; /* Blob threshold record size */ + /* Mutex configuration */ u_int32_t mutex_align; /* Mutex alignment */ u_int32_t mutex_cnt; /* Number of mutexes to configure */ @@ -2395,6 +2486,11 @@ struct __db_env { * build settings. */ db_timeout_t envreg_timeout; /* DB_REGISTER wait timeout */ + /* + * When failchk broadcasting is active, any wait for a mutex will wake + * up this frequently in order to check whether the mutex has died. 
+ */ + db_timeout_t mutex_failchk_timeout; #define DB_ENV_AUTO_COMMIT 0x00000001 /* DB_AUTO_COMMIT */ #define DB_ENV_CDB_ALLDB 0x00000002 /* CDB environment wide locking */ @@ -2414,8 +2510,8 @@ struct __db_env { #define DB_ENV_TXN_SNAPSHOT 0x00008000 /* DB_TXN_SNAPSHOT set */ #define DB_ENV_TXN_WRITE_NOSYNC 0x00010000 /* DB_TXN_WRITE_NOSYNC set */ #define DB_ENV_YIELDCPU 0x00020000 /* DB_YIELDCPU set */ -#define DB_ENV_HOTBACKUP 0x00040000 /* DB_HOTBACKUP_IN_PROGRESS set */ -#define DB_ENV_NOFLUSH 0x00080000 /* DB_NOFLUSH set */ +#define DB_ENV_HOTBACKUP 0x00040000 /* DB_HOTBACKUP_IN_PROGRESS set */ +#define DB_ENV_NOFLUSH 0x00080000 /* DB_NOFLUSH set */ u_int32_t flags; /* DB_ENV PUBLIC HANDLE LIST BEGIN */ @@ -2436,6 +2532,8 @@ struct __db_env { void *(**)(void *, size_t), void (**)(void *))); int (*get_app_dispatch) __P((DB_ENV *, int (**)(DB_ENV *, DBT *, DB_LSN *, db_recops))); + int (*get_blob_dir) __P((DB_ENV *, const char **)); + int (*get_blob_threshold) __P((DB_ENV*, u_int32_t *)); int (*get_cache_max) __P((DB_ENV *, u_int32_t *, u_int32_t *)); int (*get_cachesize) __P((DB_ENV *, u_int32_t *, u_int32_t *, int *)); int (*get_create_dir) __P((DB_ENV *, const char **)); @@ -2451,8 +2549,8 @@ struct __db_env { void (**)(const DB_ENV *, const char *, const char *))); void (*get_errfile) __P((DB_ENV *, FILE **)); void (*get_errpfx) __P((DB_ENV *, const char **)); - int (*get_flags) __P((DB_ENV *, u_int32_t *)); int (*get_feedback) __P((DB_ENV *, void (**)(DB_ENV *, int, int))); + int (*get_flags) __P((DB_ENV *, u_int32_t *)); int (*get_home) __P((DB_ENV *, const char **)); int (*get_intermediate_dir_mode) __P((DB_ENV *, const char **)); int (*get_isalive) __P((DB_ENV *, @@ -2568,17 +2666,23 @@ struct __db_env { int (*rep_set_timeout) __P((DB_ENV *, int, db_timeout_t)); int (*rep_set_transport) __P((DB_ENV *, int, int (*)(DB_ENV *, const DBT *, const DBT *, const DB_LSN *, int, u_int32_t))); + int (*rep_set_view) __P((DB_ENV *, int (*)(DB_ENV *, + const char *, 
int *, u_int32_t))); int (*rep_start) __P((DB_ENV *, DBT *, u_int32_t)); int (*rep_stat) __P((DB_ENV *, DB_REP_STAT **, u_int32_t)); int (*rep_stat_print) __P((DB_ENV *, u_int32_t)); int (*rep_sync) __P((DB_ENV *, u_int32_t)); int (*repmgr_channel) __P((DB_ENV *, int, DB_CHANNEL **, u_int32_t)); int (*repmgr_get_ack_policy) __P((DB_ENV *, int *)); + int (*repmgr_get_incoming_queue_max) + __P((DB_ENV *, u_int32_t *, u_int32_t *)); int (*repmgr_local_site) __P((DB_ENV *, DB_SITE **)); int (*repmgr_msg_dispatch) __P((DB_ENV *, void (*)(DB_ENV *, DB_CHANNEL *, DBT *, u_int32_t, u_int32_t), u_int32_t)); int (*repmgr_set_ack_policy) __P((DB_ENV *, int)); + int (*repmgr_set_incoming_queue_max) + __P((DB_ENV *, u_int32_t, u_int32_t)); int (*repmgr_site) __P((DB_ENV *, const char *, u_int, DB_SITE**, u_int32_t)); int (*repmgr_site_by_eid) __P((DB_ENV *, int, DB_SITE**)); @@ -2590,6 +2694,8 @@ struct __db_env { void *(*)(void *, size_t), void (*)(void *))); int (*set_app_dispatch) __P((DB_ENV *, int (*)(DB_ENV *, DBT *, DB_LSN *, db_recops))); + int (*set_blob_dir) __P((DB_ENV *, const char *)); + int (*set_blob_threshold) __P((DB_ENV *, u_int32_t, u_int32_t)); int (*set_cache_max) __P((DB_ENV *, u_int32_t, u_int32_t)); int (*set_cachesize) __P((DB_ENV *, u_int32_t, u_int32_t, int)); int (*set_create_dir) __P((DB_ENV *, const char *)); @@ -2662,8 +2768,8 @@ struct __db_env { /* DB_ENV PUBLIC HANDLE LIST END */ /* DB_ENV PRIVATE HANDLE LIST BEGIN */ - int (*prdbt) __P((DBT *, int, - const char *, void *, int (*)(void *, const void *), int, int)); + int (*prdbt) __P((DBT *, int, const char *, void *, + int (*)(void *, const void *), int, int, int)); /* DB_ENV PRIVATE HANDLE LIST END */ }; diff --git a/src/dbinc/db_185.in b/src/dbinc/db_185.in index 43735344..3aef2eca 100644 --- a/src/dbinc/db_185.in +++ b/src/dbinc/db_185.in @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. 
All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 1990, 1993, 1994 diff --git a/src/dbinc/db_am.h b/src/dbinc/db_am.h index f34578c4..2b5c49d2 100644 --- a/src/dbinc/db_am.h +++ b/src/dbinc/db_am.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -199,12 +199,16 @@ struct __db_foreign_info { #define DB_IS_PRIMARY(dbp) (LIST_FIRST(&dbp->s_secondaries) != NULL) /* * A database should be required to be readonly if it's been explicitly - * specified as such or if we're a client in a replicated environment - * and the user did not specify DB_TXN_NOT_DURABLE. + * specified as such, if we're a client in a replicated environment + * and the user did not specify DB_TXN_NOT_DURABLE, or if we're a master + * in a replicated environment and the REP_F_READONLY_MASTER flag has been + * set in preparation for a preferred master takeover. */ #define DB_IS_READONLY(dbp) \ (F_ISSET(dbp, DB_AM_RDONLY) || \ - (IS_REP_CLIENT((dbp)->env) && !F_ISSET((dbp), DB_AM_NOT_DURABLE))) + (IS_REP_CLIENT((dbp)->env) && !F_ISSET((dbp), DB_AM_NOT_DURABLE)) \ + || (IS_REP_MASTER((dbp)->env) && \ + F_ISSET((dbp)->env->rep_handle->region, REP_F_READONLY_MASTER))) #ifdef HAVE_COMPRESSION /* diff --git a/src/dbinc/db_cxx.in b/src/dbinc/db_cxx.in index 84fc0f88..5b29f7e8 100644 --- a/src/dbinc/db_cxx.in +++ b/src/dbinc/db_cxx.in @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -76,6 +76,7 @@ class DbMpoolFile; // forward class DbPreplist; // forward class DbSequence; // forward class DbSite; // forward +class DbStream; // forward class Dbt; // forward class DbTxn; // forward @@ -159,13 +160,13 @@ extern "C" { typedef void (*db_free_fcn_type) (void *); typedef int (*bt_compare_fcn_type) /*C++ version available*/ - (DB *, const DBT *, const DBT *); + (DB *, const DBT *, const DBT *, size_t *); typedef size_t (*bt_prefix_fcn_type) /*C++ version available*/ (DB *, const DBT *, const DBT *); typedef int (*dup_compare_fcn_type) /*C++ version available*/ - (DB *, const DBT *, const DBT *); + (DB *, const DBT *, const DBT *, size_t *); typedef int (*h_compare_fcn_type) /*C++ version available*/ - (DB *, const DBT *, const DBT *); + (DB *, const DBT *, const DBT *, size_t *); typedef u_int32_t (*h_hash_fcn_type) /*C++ version available*/ (DB *, const void *, u_int32_t); typedef int (*pgin_fcn_type) @@ -204,7 +205,10 @@ public: virtual int get_alloc( db_malloc_fcn_type *, db_realloc_fcn_type *, db_free_fcn_type *); virtual int get_append_recno(int (**)(Db *, Dbt *, db_recno_t)); - virtual int get_bt_compare(int (**)(Db *, const Dbt *, const Dbt *)); + virtual int get_blob_dir(const char **); + virtual int get_blob_threshold(u_int32_t *); + virtual int get_bt_compare( + int (**)(Db *, const Dbt *, const Dbt *, size_t *)); virtual int get_bt_compress( int (**)( Db *, const Dbt *, const Dbt *, const Dbt *, const Dbt *, Dbt *), @@ -215,7 +219,8 @@ public: virtual int get_cachesize(u_int32_t *, u_int32_t *, int *); virtual int get_create_dir(const char **); virtual int get_dbname(const char **, const char **); - virtual int get_dup_compare(int (**)(Db *, const Dbt *, const Dbt *)); + virtual int get_dup_compare( + int (**)(Db *, const Dbt *, const Dbt *, size_t *)); virtual int get_encrypt_flags(u_int32_t *); virtual void get_errcall( void (**)(const DbEnv *, const char *, const char *)); @@ -225,7 +230,8 @@ public: virtual int 
get_flags(u_int32_t *); virtual int get_heapsize(u_int32_t *, u_int32_t *); virtual int get_heap_regionsize(u_int32_t *); - virtual int get_h_compare(int (**)(Db *, const Dbt *, const Dbt *)); + virtual int get_h_compare( + int (**)(Db *, const Dbt *, const Dbt *, size_t *)); virtual int get_h_ffactor(u_int32_t *); virtual int get_h_hash(u_int32_t (**)(Db *, const void *, u_int32_t)); virtual int get_h_nelem(u_int32_t *); @@ -261,8 +267,11 @@ public: db_malloc_fcn_type, db_realloc_fcn_type, db_free_fcn_type); virtual void set_app_private(void *); virtual int set_append_recno(int (*)(Db *, Dbt *, db_recno_t)); + virtual int set_blob_dir(const char *); + virtual int set_blob_threshold(u_int32_t, u_int32_t); virtual int set_bt_compare(bt_compare_fcn_type); /*deprecated*/ - virtual int set_bt_compare(int (*)(Db *, const Dbt *, const Dbt *)); + virtual int set_bt_compare( + int (*)(Db *, const Dbt *, const Dbt *, size_t *)); virtual int set_bt_compress( int (*) (Db *, const Dbt *, const Dbt *, const Dbt *, const Dbt *, Dbt *), @@ -273,7 +282,8 @@ public: virtual int set_cachesize(u_int32_t, u_int32_t, int); virtual int set_create_dir(const char *); virtual int set_dup_compare(dup_compare_fcn_type); /*deprecated*/ - virtual int set_dup_compare(int (*)(Db *, const Dbt *, const Dbt *)); + virtual int set_dup_compare( + int (*)(Db *, const Dbt *, const Dbt *, size_t *)); virtual int set_encrypt(const char *, u_int32_t); virtual void set_errcall( void (*)(const DbEnv *, const char *, const char *)); @@ -284,7 +294,8 @@ public: virtual int set_heapsize(u_int32_t, u_int32_t); virtual int set_heap_regionsize(u_int32_t); virtual int set_h_compare(h_compare_fcn_type); /*deprecated*/ - virtual int set_h_compare(int (*)(Db *, const Dbt *, const Dbt *)); + virtual int set_h_compare( + int (*)(Db *, const Dbt *, const Dbt *, size_t *)); virtual int set_h_ffactor(u_int32_t); virtual int set_h_hash(h_hash_fcn_type); /*deprecated*/ virtual int set_h_hash(u_int32_t (*)(Db *, const void 
*, u_int32_t)); @@ -383,16 +394,16 @@ public: int (*associate_callback_)(Db *, const Dbt *, const Dbt *, Dbt *); int (*associate_foreign_callback_) (Db *, const Dbt *, Dbt *, const Dbt *, int *); - int (*bt_compare_callback_)(Db *, const Dbt *, const Dbt *); + int (*bt_compare_callback_)(Db *, const Dbt *, const Dbt *, size_t *); int (*bt_compress_callback_)( Db *, const Dbt *, const Dbt *, const Dbt *, const Dbt *, Dbt *); int (*bt_decompress_callback_)( Db *, const Dbt *, const Dbt *, Dbt *, Dbt *, Dbt *); size_t (*bt_prefix_callback_)(Db *, const Dbt *, const Dbt *); u_int32_t (*db_partition_callback_)(Db *, Dbt *); - int (*dup_compare_callback_)(Db *, const Dbt *, const Dbt *); + int (*dup_compare_callback_)(Db *, const Dbt *, const Dbt *, size_t *); void (*feedback_callback_)(Db *, int, int); - int (*h_compare_callback_)(Db *, const Dbt *, const Dbt *); + int (*h_compare_callback_)(Db *, const Dbt *, const Dbt *, size_t *); u_int32_t (*h_hash_callback_)(Db *, const void *, u_int32_t); }; @@ -407,6 +418,7 @@ public: int close(); int cmp(Dbc *other_csr, int *result, u_int32_t flags); int count(db_recno_t *countp, u_int32_t flags); + int db_stream(DbStream **dbsp, u_int32_t flags); int del(u_int32_t flags); int dup(Dbc** cursorp, u_int32_t flags); int get(Dbt* key, Dbt *data, u_int32_t flags); @@ -527,6 +539,10 @@ public: int (*)(DbEnv *, const char *, void *)); virtual int get_backup_config(DB_BACKUP_CONFIG, u_int32_t *); virtual int set_backup_config(DB_BACKUP_CONFIG, u_int32_t); + virtual int get_blob_dir(const char **); + virtual int set_blob_dir(const char *); + virtual int get_blob_threshold(u_int32_t *); + virtual int set_blob_threshold(u_int32_t, u_int32_t); virtual int get_cachesize(u_int32_t *, u_int32_t *, int *); virtual int set_cachesize(u_int32_t, u_int32_t, int); virtual int get_cache_max(u_int32_t *, u_int32_t *); @@ -761,10 +777,16 @@ public: virtual int rep_set_priority(u_int32_t priority); virtual int rep_get_timeout(int which, db_timeout_t 
*timeout); virtual int rep_set_timeout(int which, db_timeout_t timeout); + virtual int rep_set_view(int (*)(DbEnv *, + const char *, int *, u_int32_t)); virtual int repmgr_channel(int eid, DbChannel **channel, u_int32_t flags); virtual int repmgr_get_ack_policy(int *policy); virtual int repmgr_set_ack_policy(int policy); + virtual int repmgr_get_incoming_queue_max(u_int32_t *gbytesp, + u_int32_t *bytesp); + virtual int repmgr_set_incoming_queue_max(u_int32_t gbytes, + u_int32_t bytes); virtual int repmgr_local_site(DbSite **site); virtual int repmgr_msg_dispatch(void (*) (DbEnv *, DbChannel *, Dbt *, u_int32_t, u_int32_t), u_int32_t flags); @@ -824,6 +846,8 @@ public: static int _backup_write_intercept(DB_ENV *dbenv, u_int32_t off_gbytes, u_int32_t off_bytes, u_int32_t size, u_int8_t *buf, void *handle); static void _paniccall_intercept(DB_ENV *dbenv, int errval); + static int _partial_rep_intercept(DB_ENV *dbenv, + const char *name, int *result, u_int32_t flags); static void _feedback_intercept(DB_ENV *dbenv, int opcode, int pct); static void _event_func_intercept(DB_ENV *dbenv, u_int32_t, void *); static int _isalive_intercept(DB_ENV *dbenv, pid_t pid, @@ -872,6 +896,7 @@ private: void (*feedback_callback_)(DbEnv *, int, int); void (*message_callback_)(const DbEnv *, const char *); void (*paniccall_callback_)(DbEnv *, int); + int (*partial_rep_callback_)(DbEnv *, const char *, int *, u_int32_t); void (*event_func_callback_)(DbEnv *, u_int32_t, void *); int (*rep_send_callback_)(DbEnv *, const Dbt *, const Dbt *, const DbLsn *, int, u_int32_t); @@ -1057,9 +1082,9 @@ public: int stat(DB_SEQUENCE_STAT **sp, u_int32_t flags); int stat_print(u_int32_t flags); - int get(DbTxn *txnid, int32_t delta, db_seq_t *retp, u_int32_t flags); - int get_cachesize(int32_t *sizep); - int set_cachesize(int32_t size); + int get(DbTxn *txnid, u_int32_t delta, db_seq_t *retp, u_int32_t flags); + int get_cachesize(u_int32_t *sizep); + int set_cachesize(u_int32_t size); int 
get_flags(u_int32_t *flagsp); int set_flags(u_int32_t flags); int get_range(db_seq_t *minp, db_seq_t *maxp); @@ -1137,6 +1162,34 @@ private: }; // +// DbStream +// +class _exported DbStream : protected DB_STREAM +{ + friend class Dbc; + +public: + int close(u_int32_t flags); + int read(Dbt *data, db_off_t offset, u_int32_t size, u_int32_t flags); + int size(db_off_t *size, u_int32_t flags); + int write(Dbt *data, db_off_t offset, u_int32_t flags); + +private: + // No data is permitted in this class (see comment at top) + + // Note: use Dbc::db_stream() to get pointers to a DbStream, + // and call DbStream::close() rather than delete to release them. + // + DbStream(); + ~DbStream(); + + // no copying + DbStream(const DbStream &); + DbStream &operator = (const DbStream &); + +}; + +// // Transaction // class _exported DbTxn @@ -1245,6 +1298,7 @@ class _exported Dbt : private DBT friend class DbEnv; friend class DbLogc; friend class DbSequence; + friend class DbStream; public: // key/data diff --git a/src/dbinc/db_dispatch.h b/src/dbinc/db_dispatch.h index b6382871..b3aedab1 100644 --- a/src/dbinc/db_dispatch.h +++ b/src/dbinc/db_dispatch.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 1995, 1996 diff --git a/src/dbinc/db_int.in b/src/dbinc/db_int.in index 42439107..593deef6 100644 --- a/src/dbinc/db_int.in +++ b/src/dbinc/db_int.in @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -73,6 +73,17 @@ #endif /* !HAVE_SYSTEM_INCLUDE_FILES */ +/* + * The Windows compiler needs to be told about structures that are available + * outside a dll. 
+ */ +#if defined(DB_WIN32) && defined(_MSC_VER) && \ + !defined(DB_CREATE_DLL) && !defined(_LIB) +#define __DB_IMPORT __declspec(dllimport) +#else +#define __DB_IMPORT +#endif + #ifdef DB_WIN32 #include "dbinc/win_db.h" #endif @@ -88,22 +99,12 @@ #include "dbinc/queue.h" #include "dbinc/shqueue.h" #include "dbinc/perfmon.h" +#include "dbinc/clock.h" #if defined(__cplusplus) extern "C" { #endif -/* - * The Windows compiler needs to be told about structures that are available - * outside a dll. - */ -#if defined(DB_WIN32) && defined(_MSC_VER) && \ - !defined(DB_CREATE_DLL) && !defined(_LIB) -#define __DB_IMPORT __declspec(dllimport) -#else -#define __DB_IMPORT -#endif - /******************************************************* * Forward structure declarations. *******************************************************/ @@ -366,22 +367,27 @@ typedef struct __fn { /* * Structure used for callback message aggregation. * - * Display values in XXX_stat_print calls. + * DB_MSGBUF_FLUSH displays values in XXX_stat_print calls. + * DB_MSGBUF_REP_FLUSH displays replication system messages. */ typedef struct __db_msgbuf { char *buf; /* Heap allocated buffer. */ char *cur; /* Current end of message. */ size_t len; /* Allocated length of buffer. 
*/ + int flags; } DB_MSGBUF; +#define DB_MSGBUF_PREALLOCATED 0x0001 + #define DB_MSGBUF_INIT(a) do { \ (a)->buf = (a)->cur = NULL; \ - (a)->len = 0; \ + (a)->len = (a)->flags = 0; \ } while (0) #define DB_MSGBUF_FLUSH(env, a) do { \ if ((a)->buf != NULL) { \ if ((a)->cur != (a)->buf) \ __db_msg(env, "%s", (a)->buf); \ - __os_free(env, (a)->buf); \ + if (!F_ISSET((a), DB_MSGBUF_PREALLOCATED)) \ + __os_free(env, (a)->buf); \ DB_MSGBUF_INIT(a); \ } \ } while (0) @@ -392,18 +398,14 @@ typedef struct __db_msgbuf { if (regular_msg) \ DB_MSGBUF_FLUSH(env, a); \ else { \ - __os_free(env, (a)->buf); \ + if (!F_ISSET((a), DB_MSGBUF_PREALLOCATED)) \ + __os_free(env, (a)->buf); \ DB_MSGBUF_INIT(a); \ } \ } \ } while (0) -#define STAT_FMT(msg, fmt, type, v) do { \ - DB_MSGBUF __mb; \ - DB_MSGBUF_INIT(&__mb); \ - __db_msgadd(env, &__mb, fmt, (type)(v)); \ - __db_msgadd(env, &__mb, "\t%s", msg); \ - DB_MSGBUF_FLUSH(env, &__mb); \ -} while (0) +#define STAT_FMT(msg, fmt, type, v) \ + __db_msg(env, fmt "\t%s", (type)(v), msg); #define STAT_HEX(msg, v) \ __db_msg(env, "%#lx\t%s", (u_long)(v), msg) #define STAT_ISSET(msg, p) \ @@ -441,25 +443,21 @@ typedef struct __db_msgbuf { * * Error message IDs are automatically assigned by dist/s_message_id script. */ -#ifdef HAVE_LOCALIZATION -#define _(msg) msg /* Replace with localization function. */ -#else -#define _(msg) msg -#endif - #ifdef HAVE_STRIPPED_MESSAGES #define DB_STR_C(msg, fmt) fmt #else -#define DB_STR_C(msg, fmt) _(msg) +#define DB_STR_C(msg, fmt) msg #endif -#define DB_MSGID(id) "BDB" id - -#define DB_STR(id, msg) DB_MSGID(id) " " DB_STR_C(msg, "") - -#define DB_STR_A(id, msg, fmt) DB_MSGID(id) " " DB_STR_C(msg, fmt) +#ifdef HAVE_LOCALIZATION +#define _(msg) (msg) /* Replace with localization function. 
*/ +#else +#define _(msg) msg +#endif -#define DB_STR_P(msg) _(msg) +#define DB_STR(id, msg) _("BDB" id " " DB_STR_C(msg, "")) +#define DB_STR_A(id, msg, fmt) _("BDB" id " " DB_STR_C(msg, fmt)) +#define DB_STR_P(msg) _(msg) /* * There are quite a few places in Berkeley DB where we want to initialize @@ -542,6 +540,7 @@ typedef struct __db_msgbuf { /* Type passed to __db_appname(). */ typedef enum { DB_APP_NONE=0, /* No type (region). */ + DB_APP_BLOB, /* Blob file. */ DB_APP_DATA, /* Data file. */ DB_APP_LOG, /* Log file. */ DB_APP_META, /* Persistent metadata file. */ @@ -612,8 +611,13 @@ typedef enum { if (F_ISSET((env), ENV_OPEN_CALLED)) \ ENV_REQUIRES_CONFIG(env, handle, i, flags) +/* + * The ENV_ENTER and ENV_LEAVE macros announce to other threads that + * the current thread is entering or leaving the BDB api. + */ #define ENV_ENTER_RET(env, ip, ret) do { \ ret = 0; \ + DISCARD_HISTORY(env); \ PANIC_CHECK_RET(env, ret); \ if (ret == 0) { \ if ((env)->thr_hashtab == NULL) \ @@ -631,6 +635,10 @@ typedef enum { return (__ret); \ } while (0) +/* + * Publicize the current thread's intention to run failchk. This invokes + * DB_ENV->is_alive() in the mutex code, to avoid hanging on dead processes. 
+ */ #define FAILCHK_THREAD(env, ip) do { \ if ((ip) != NULL) \ (ip)->dbth_state = THREAD_FAILCHK; \ @@ -638,20 +646,15 @@ typedef enum { #define ENV_GET_THREAD_INFO(env, ip) ENV_ENTER(env, ip) -#ifdef DIAGNOSTIC #define ENV_LEAVE(env, ip) do { \ - if ((ip) != NULL) { \ - DB_ASSERT(env, ((ip)->dbth_state == THREAD_ACTIVE || \ - (ip)->dbth_state == THREAD_FAILCHK)); \ + if ((ip) != NULL) { \ + DB_ASSERT((env), (ip)->dbth_state == THREAD_ACTIVE || \ + (ip)->dbth_state == THREAD_FAILCHK); \ (ip)->dbth_state = THREAD_OUT; \ } \ } while (0) -#else -#define ENV_LEAVE(env, ip) do { \ - if ((ip) != NULL) \ - (ip)->dbth_state = THREAD_OUT; \ -} while (0) -#endif + + #ifdef DIAGNOSTIC #define CHECK_THREAD(env) do { \ if ((env)->thr_hashtab != NULL) \ @@ -688,6 +691,23 @@ typedef struct __pin_list { } PIN_LIST; #define PINMAX 4 +typedef enum { + MUTEX_ACTION_UNLOCKED=0, + MUTEX_ACTION_INTEND_SHARE, /* Thread is attempting a read-lock. */ + MUTEX_ACTION_SHARED /* Thread has gotten a read lock. */ +} MUTEX_ACTION; + +typedef struct __mutex_state { /* SHARED */ + db_mutex_t mutex; + MUTEX_ACTION action; +#ifdef DIAGNOSTIC + db_timespec when; +#endif +} MUTEX_STATE; + +#define MUTEX_STATE_MAX 10 /* It only needs enough for shared latches. */ + + struct __db_thread_info { /* SHARED */ pid_t dbth_pid; db_threadid_t dbth_tid; @@ -707,11 +727,25 @@ struct __db_thread_info { /* SHARED */ u_int16_t dbth_pinmax; /* Number of slots allocated. */ roff_t dbth_pinlist; /* List of pins. */ PIN_LIST dbth_pinarray[PINMAX]; /* Initial array of slots. */ + + /* + * While thread tracking is active this caches one of the lockers + * created by each thread. This locker remains allocated, with an + * invalid id, even after the locker id is freed. + */ + roff_t dbth_local_locker; + /* + * Each latch shared by this thread has an entry here. Exclusive + * ownership, for both latches and mutexes, are in the DB_MUTEX. 
*/ + MUTEX_STATE dbth_latches[MUTEX_STATE_MAX]; #ifdef DIAGNOSTIC roff_t dbth_locker; /* Current locker for this thread. */ u_int32_t dbth_check_off; /* Count of number of LOCK_OFF calls. */ #endif + db_timespec dbth_failtime; /* Time when its crash was detected. */ }; + #ifdef DIAGNOSTIC #define LOCK_CHECK_OFF(ip) if ((ip) != NULL) \ (ip)->dbth_check_off++ @@ -729,7 +763,7 @@ struct __db_thread_info { /* SHARED */ #define LOCK_CHECK(dbc, pgno, mode) NOP_STATEMENT #endif -typedef struct __env_thread_info { +typedef struct __env_thread_info { /* SHARED */ u_int32_t thr_count; u_int32_t thr_init; u_int32_t thr_max; @@ -803,6 +837,11 @@ struct __env { #define ENV_DEF_DATA_LEN 100 u_int32_t data_len; /* Data length in __db_prbytes. */ + /* Registered processes */ + size_t num_active_pids; /* number of entries in active_pids */ + size_t size_active_pids; /* allocated size of active_pids */ + pid_t *active_pids; /* array of active pids */ + /* Thread tracking */ u_int32_t thr_nbucket; /* Number of hash buckets */ DB_HASHTAB *thr_hashtab; /* Hash table of DB_THREAD_INFO */ @@ -866,6 +905,7 @@ struct __env { #define DB_TEST_PREOPEN 10 /* before __os_open */ #define DB_TEST_REPMGR_PERM 11 /* repmgr perm/archiving tests */ #define DB_TEST_SUBDB_LOCKS 12 /* subdb locking tests */ +#define DB_TEST_REPMGR_HEARTBEAT 13 /* repmgr stop sending heartbeats */ int test_abort; /* Abort value for testing */ int test_check; /* Checkpoint value for testing */ int test_copy; /* Copy value for testing */ @@ -881,7 +921,9 @@ struct __env { #define ENV_REF_COUNTED 0x00000100 /* Region references this handle */ #define ENV_SYSTEM_MEM 0x00000200 /* DB_SYSTEM_MEM set */ #define ENV_THREAD 0x00000400 /* DB_THREAD set */ -#define ENV_FORCE_TXN_BULK 0x00000800 /* Txns use bulk mode-for testing */ +#define ENV_FORCE_TXN_BULK 0x00000800 /* Txns use bulk mode-for testing */ +#define ENV_REMEMBER_PANIC 0x00001000 /* Panic was on during cleanup. 
*/ +#define ENV_FORCESYNCENV 0x00002000 /* Force msync on closing. */ u_int32_t flags; }; @@ -1106,7 +1148,6 @@ typedef struct __dbpginfo { @db_int_def@ #include "dbinc/globals.h" -#include "dbinc/clock.h" #include "dbinc/debug.h" #include "dbinc/region.h" #include "dbinc_auto/env_ext.h" @@ -1118,6 +1159,7 @@ typedef struct __dbpginfo { #include "dbinc/os.h" #include "dbinc_auto/clib_ext.h" #include "dbinc_auto/common_ext.h" +#include "dbinc_auto/blob_ext.h" /******************************************************* * Remaining Log. diff --git a/src/dbinc/db_join.h b/src/dbinc/db_join.h index aecf059a..8f22adcb 100644 --- a/src/dbinc/db_join.h +++ b/src/dbinc/db_join.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1998, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/dbinc/db_page.h b/src/dbinc/db_page.h index 2d4de2e5..4694c4cf 100644 --- a/src/dbinc/db_page.h +++ b/src/dbinc/db_page.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -93,6 +93,7 @@ typedef struct _dbmeta33 { u_int8_t uid[DB_FILE_ID_LEN]; } DBMETA33, DBMETA; + /************************************************************************ BTREE METADATA PAGE LAYOUT ************************************************************************/ @@ -113,7 +114,13 @@ typedef struct _btmeta33 { u_int32_t re_len; /* 80-83: Recno: fixed-length record length. */ u_int32_t re_pad; /* 84-87: Recno: fixed-length record pad. */ u_int32_t root; /* 88-91: Root page. */ - u_int32_t unused2[92]; /* 92-459: Unused space. */ + u_int32_t blob_threshold; + /* 92-95: Minimum blob file size. */ + u_int32_t blob_file_lo; /* 96-99: Blob file dir id lo. 
*/ + u_int32_t blob_file_hi; /* 100-103: Blob file dir id hi. */ + u_int32_t blob_sdb_lo; /* 104-107: Blob sdb dir id lo */ + u_int32_t blob_sdb_hi; /* 108-111: Blob sdb dir id hi */ + u_int32_t unused2[87]; /* 112-459: Unused space. */ u_int32_t crypto_magic; /* 460-463: Crypto magic number */ u_int32_t trash[3]; /* 464-475: Trash space - Do not use */ u_int8_t iv[DB_IV_BYTES]; /* 476-495: Crypto IV */ @@ -142,7 +149,13 @@ typedef struct _hashmeta33 { #define NCACHED 32 /* number of spare points */ /* 96-223: Spare pages for overflow */ u_int32_t spares[NCACHED]; - u_int32_t unused[59]; /* 224-459: Unused space */ + u_int32_t blob_threshold; + /* 224-227: Minimum blob file size. */ + u_int32_t blob_file_lo; /* 228-231: Blob file dir id lo. */ + u_int32_t blob_file_hi; /* 232-235: Blob file dir id hi. */ + u_int32_t blob_sdb_lo; /* 236-239: Blob sdb dir id lo. */ + u_int32_t blob_sdb_hi; /* 240-243: Blob sdb dir id hi. */ + u_int32_t unused[54]; /* 244-459: Unused space */ u_int32_t crypto_magic; /* 460-463: Crypto magic number */ u_int32_t trash[3]; /* 464-475: Trash space - Do not use */ u_int8_t iv[DB_IV_BYTES]; /* 476-495: Crypto IV */ @@ -168,7 +181,10 @@ typedef struct _heapmeta { u_int32_t gbytes; /* 80-83: GBytes for fixed size heap. */ u_int32_t bytes; /* 84-87: Bytes for fixed size heap. */ u_int32_t region_size; /* 88-91: Max region size. */ - u_int32_t unused2[92]; /* 92-459: Unused space.*/ + u_int32_t blob_threshold; /* 92-95: Minimum blob file size. */ + u_int32_t blob_file_lo; /* 96-99: Blob file dir id lo. */ + u_int32_t blob_file_hi; /* 100-103: Blob file dir id hi. 
*/ + u_int32_t unused2[89]; /* 104-459: Unused space.*/ u_int32_t crypto_magic; /* 460-463: Crypto magic number */ u_int32_t trash[3]; /* 464-475: Trash space - Do not use */ u_int8_t iv[DB_IV_BYTES]; /* 476-495: Crypto IV */ @@ -371,6 +387,7 @@ typedef struct __heaphdr { #define HEAP_RECSPLIT 0x01 /* Heap data record is split */ #define HEAP_RECFIRST 0x02 /* First piece of a split record */ #define HEAP_RECLAST 0x04 /* Last piece of a split record */ +#define HEAP_RECBLOB 0x08 /* Record refers to a blob */ u_int8_t flags; /* 00: Flags describing record. */ u_int8_t unused; /* 01: Padding. */ u_int16_t size; /* 02-03: The size of the stored data piece. */ @@ -384,8 +401,35 @@ typedef struct __heaphdrsplt { u_int16_t unused; /* 14-15: Padding. */ } HEAPSPLITHDR; +/* + * HEAPBLOB, the blob database record for heap. + * Saving bytes is not a concern for the blob record type - if too many + * fit onto a single page, then we're likely to introduce unnecessary + * contention for blobs. Using blobs implies storing large items, thus slightly + * more per-item overhead is acceptable. + * If this proves untrue, the crypto section of the record could be optional. + * encoding, lsn, encryption, and checksum fields are unused at the moment, but + * included to make adding those features easier. + */ +typedef struct _heapblob { + HEAPHDR std_hdr; /* 00-03: The standard data header */ + u_int8_t encoding; /* 04: Encoding of blob file. */ + u_int8_t unused[7]; /* 05-11: Padding, unused. */ + u_int8_t chksum[DB_MAC_KEY]; /* 12-31: Checksum */ + u_int8_t iv[DB_IV_BYTES]; /* 32-47: IV */ + DB_LSN lsn; /* 48-55: LSN for blob file update. */ + u_int64_t id; /* 56-63: Blob file identifier. */ + u_int64_t size; /* 64-71: Blob file size. */ + u_int64_t file_id; /* 72-79: File directory. */ +} HEAPBLOBHDR, HEAPBLOBHDR60P1; + #define HEAP_HDRSIZE(hdr) \ - (F_ISSET((hdr), HEAP_RECSPLIT) ? sizeof(HEAPSPLITHDR) : sizeof(HEAPHDR)) + (F_ISSET((hdr), HEAP_RECSPLIT) ? 
sizeof(HEAPSPLITHDR) : \ + sizeof(HEAPHDR)) + +#define HEAPBLOBREC_SIZE (sizeof(HEAPBLOBHDR)) +#define HEAPBLOBREC_DSIZE (sizeof(HEAPBLOBHDR) - sizeof(HEAPHDR)) +#define HEAPBLOBREC_DATA(p) (((u_int8_t *)p) + sizeof(HEAPHDR)) #define HEAPPG_SZ(dbp) \ (F_ISSET((dbp), DB_AM_ENCRYPT) ? HEAPPG_SEC : \ @@ -441,12 +485,12 @@ typedef struct __heaphdrsplt { /* Return the amount of free space on a heap data page. */ #define HEAP_FREESPACE(dbp, p) \ - (HOFFSET(p) - HEAPPG_SZ(dbp) - \ + ((HOFFSET(p) - HEAPPG_SZ(dbp)) - \ (NUM_ENT(p) == 0 ? 0 : ((HEAP_HIGHINDX(p) + 1) * sizeof(db_indx_t)))) /* The maximum amount of data that can fit on an empty heap data page. */ #define HEAP_MAXDATASIZE(dbp) \ - ((dbp)->pgsize - HEAPPG_SZ(dbp) - sizeof(db_indx_t)) + (((dbp)->pgsize - HEAPPG_SZ(dbp)) - sizeof(db_indx_t)) #define HEAP_FREEINDX(p) (((HEAPPG *)p)->free_indx) #define HEAP_HIGHINDX(p) (((HEAPPG *)p)->high_indx) @@ -549,9 +593,9 @@ typedef struct _qpage { * The amount of overflow data stored on each page is stored in the * hf_offset field. * - * The implementation reference counts overflow items as it's possible - * for them to be promoted onto btree internal pages. The reference - * count is stored in the entries field. + * Before 4.3 the implementation reference counted overflow items as it + * once was possible for them to be promoted onto btree internal pages. + * The reference count is stored in the entries field. */ #define OV_LEN(p) (((PAGE *)p)->hf_offset) #define OV_REF(p) (((PAGE *)p)->entries) @@ -571,6 +615,7 @@ typedef struct _qpage { #define H_DUPLICATE 2 /* Duplicate key/data item. */ #define H_OFFPAGE 3 /* Overflow key/data item. */ #define H_OFFDUP 4 /* Overflow page of duplicates. */ +#define H_BLOB 5 /* Blob file data item. */ /* * !!! @@ -685,6 +730,78 @@ typedef struct _hoffdup { */ #define HOFFDUP_SIZE (sizeof(HOFFDUP)) +/* + * The fifth type is the H_BLOB, represented by the HBLOB structure. 
+ * Saving bytes is not a concern for the blob record type - if too many + * fit onto a single page, then we're likely to introduce unnecessary + * contention for blobs. Using blobs implies storing large items, thus slightly + * more per-item overhead is acceptable. + * If this proves untrue, the crypto section of the record could be optional. + * encoding, encryption, and checksum fields are unused at the moment, but + * included to make adding those features easier. + */ +typedef struct _hblob { + u_int8_t type; /* 00: Page type and delete flag. */ + u_int8_t encoding; /* 01: Encoding of blob file. */ + u_int8_t unused[10]; /* 02-11: Padding, unused. */ + u_int8_t chksum[DB_MAC_KEY]; /* 12-31: Checksum */ + u_int8_t iv[DB_IV_BYTES]; /* 32-47: IV */ + u_int64_t id; /* 48-55: Blob file identifier. */ + u_int64_t size; /* 56-63: Blob file size. */ + u_int64_t file_id; /* 64-71: File directory. */ + u_int64_t sdb_id; /* 72-79: Subdb that owns this blob. */ +} HBLOB, HBLOB60P1; + +#define HBLOB_ID(p) (((u_int8_t *)p) + SSZ(HBLOB, id)) +#define HBLOB_FILE_ID(p) (((u_int8_t *)p) + SSZ(HBLOB, file_id)) + +/* + * Return an off_t version of the u_int64_t blob size. + * Since off_t can be a 32 or 64 bit integer on different systems, this macro + * is used to catch cases of overflow. 
+ */ +#define GET_BLOB_SIZE(e, p, o, ret) do { \ + DB_ASSERT((e), sizeof(o) <= 8); \ + if (sizeof(o) == 8) { \ + (o) = (off_t)(p).size; \ + } else { \ + if ((p).size > INT_MAX) { \ + __db_errx((e), DB_STR("0769", \ + "Blob size overflow.")); \ + (ret) = EINVAL; \ + } \ + (o) = (int32_t)(p).size; \ + } \ +} while (0); + +#define SET_BLOB_FIELD(p, v, type, field) do { \ + u_int64_t tmp; \ + tmp = (u_int64_t)(v); \ + memcpy((u_int8_t *)(p) + SSZ(type, field), \ + &tmp, sizeof(u_int64_t)); \ +} while (0); + +#define SET_BLOB_ID(p, v, type) \ + SET_BLOB_FIELD(p, v, type, id) + +#define SET_BLOB_SIZE(p, v, type) \ + SET_BLOB_FIELD(p, v, type, size) + +#define SET_BLOB_FILE_ID(p, v, type) \ + SET_BLOB_FIELD(p, v, type, file_id) + +#define SET_BLOB_SDB_ID(p, v, type) \ + SET_BLOB_FIELD(p, v, type, sdb_id) + +/* + * Page space required to add a new HBLOB item to the page, with and + * without the index value. + */ +#define HBLOB_SIZE (sizeof(HBLOB)) +#define HBLOB_DSIZE (sizeof(HBLOB) - SSZA(HKEYDATA, data)) +#define HBLOB_PSIZE (HBLOB_SIZE + sizeof(db_indx_t)) + + /************************************************************************ BTREE PAGE LAYOUT ************************************************************************/ @@ -693,6 +810,7 @@ typedef struct _hoffdup { #define B_KEYDATA 1 /* Key/data item. */ #define B_DUPLICATE 2 /* Duplicate key/data item. */ #define B_OVERFLOW 3 /* Overflow key/data item. */ +#define B_BLOB 4 /* Blob file key/data item. */ /* * We have to store a deleted entry flag in the page. The reason is complex, @@ -746,6 +864,32 @@ typedef struct _boverflow { u_int32_t tlen; /* 08-11: Total length of item. */ } BOVERFLOW; +/* + * The fourth type is the B_BLOB, represented by the BBLOB structure. + * Saving bytes is not a concern for the blob record type - if too many + * fit onto a single page, then we're likely to introduce unnecessary + * contention for blobs. 
Using blobs implies storing large items, thus slightly + * more per-item overhead is acceptable. + * The len field is set to BBLOB_DSIZE, so that a B_BLOB can be treated just + * like a B_KEYDATA for the purposes of moving items between or on a page. + * If this proves untrue, the crypto section of the record could be optional. + * encoding, lsn, encryption, and checksum fields are unused at the moment, but + * included to make adding those features easier. + */ +typedef struct _bblob { + db_indx_t len; /* 00-01: BBLOB_DSIZE. */ + u_int8_t type; /* 02: Page type and delete flag. */ + u_int8_t encoding; /* 03: Encoding of blob file. */ + u_int8_t unused[8]; /* 04-11: Padding, unused. */ + u_int8_t chksum[DB_MAC_KEY]; /* 12-31: Checksum */ + u_int8_t iv[DB_IV_BYTES]; /* 32-47: IV */ + u_int64_t id; /* 48-55: Blob file identifier. */ + u_int64_t size; /* 56-63: Blob file size. */ + u_int64_t file_id; /* 64-71: File directory. */ + u_int64_t sdb_id; /* 72-79: Subdb that owns this blob. */ +} BBLOB, BBLOB60P1; +#define BBLOB_DATA(p) ((u_int8_t *)((BKEYDATA *)p)->data) + /* Get a BOVERFLOW item for a specific index. */ #define GET_BOVERFLOW(dbp, pg, indx) \ ((BOVERFLOW *)P_ENTRY(dbp, pg, indx)) @@ -759,13 +903,26 @@ typedef struct _boverflow { #define BOVERFLOW_PSIZE \ (BOVERFLOW_SIZE + sizeof(db_indx_t)) +/* + * Page space required to add a new BBLOB item to the page, with and + * without the index value. BBLOB_DSIZE is used so that a B_BLOB item + * can be treated just like a B_KEYDATA for the purposes of moving items + * between or on a page, such as when doing compaction. + */ +#define BBLOB_SIZE \ + ((u_int16_t)DB_ALIGN(sizeof(BBLOB), sizeof(u_int32_t))) +#define BBLOB_DSIZE \ + (BBLOB_SIZE - SSZA(BKEYDATA, data)) +#define BBLOB_PSIZE \ + (BBLOB_SIZE + sizeof(db_indx_t)) + #define BITEM_SIZE(bk) \ - (B_TYPE((bk)->type) != B_KEYDATA ? BOVERFLOW_SIZE : \ - BKEYDATA_SIZE((bk)->len)) + (B_TYPE((bk)->type) == B_KEYDATA ? 
BKEYDATA_SIZE((bk)->len) : \ + (B_TYPE((bk)->type) == B_BLOB ? BBLOB_SIZE : BOVERFLOW_SIZE)) #define BITEM_PSIZE(bk) \ - (B_TYPE((bk)->type) != B_KEYDATA ? BOVERFLOW_PSIZE : \ - BKEYDATA_PSIZE((bk)->len)) + (B_TYPE((bk)->type) == B_KEYDATA ? BKEYDATA_PSIZE((bk)->len) : \ + (B_TYPE((bk)->type) == B_BLOB ? BBLOB_PSIZE : BOVERFLOW_PSIZE)) /* * Btree leaf and hash page layouts group indices in sets of two, one for the diff --git a/src/dbinc/db_swap.h b/src/dbinc/db_swap.h index 352ae227..06f4eb47 100644 --- a/src/dbinc/db_swap.h +++ b/src/dbinc/db_swap.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 1990, 1993, 1994 @@ -51,15 +51,26 @@ extern "C" { #define M_64_SWAP(a) { \ u_int64_t _tmp; \ _tmp = (u_int64_t)a; \ - ((u_int8_t *)&a)[0] = ((u_int8_t *)&_tmp)[7]; \ - ((u_int8_t *)&a)[1] = ((u_int8_t *)&_tmp)[6]; \ - ((u_int8_t *)&a)[2] = ((u_int8_t *)&_tmp)[5]; \ - ((u_int8_t *)&a)[3] = ((u_int8_t *)&_tmp)[4]; \ - ((u_int8_t *)&a)[4] = ((u_int8_t *)&_tmp)[3]; \ - ((u_int8_t *)&a)[5] = ((u_int8_t *)&_tmp)[2]; \ - ((u_int8_t *)&a)[6] = ((u_int8_t *)&_tmp)[1]; \ - ((u_int8_t *)&a)[7] = ((u_int8_t *)&_tmp)[0]; \ + ((u_int8_t *)&(a))[0] = ((u_int8_t *)&_tmp)[7]; \ + ((u_int8_t *)&(a))[1] = ((u_int8_t *)&_tmp)[6]; \ + ((u_int8_t *)&(a))[2] = ((u_int8_t *)&_tmp)[5]; \ + ((u_int8_t *)&(a))[3] = ((u_int8_t *)&_tmp)[4]; \ + ((u_int8_t *)&(a))[4] = ((u_int8_t *)&_tmp)[3]; \ + ((u_int8_t *)&(a))[5] = ((u_int8_t *)&_tmp)[2]; \ + ((u_int8_t *)&(a))[6] = ((u_int8_t *)&_tmp)[1]; \ + ((u_int8_t *)&(a))[7] = ((u_int8_t *)&_tmp)[0]; \ } +#undef P_64_COPYSWAP +#define P_64_COPYSWAP(a, b) do { \ + ((u_int8_t *)b)[0] = ((u_int8_t *)a)[7]; \ + ((u_int8_t *)b)[1] = ((u_int8_t *)a)[6]; \ + ((u_int8_t *)b)[2] = ((u_int8_t *)a)[5]; \ + ((u_int8_t *)b)[3] = ((u_int8_t *)a)[4]; \ + 
((u_int8_t *)b)[4] = ((u_int8_t *)a)[3]; \ + ((u_int8_t *)b)[5] = ((u_int8_t *)a)[2]; \ + ((u_int8_t *)b)[6] = ((u_int8_t *)a)[1]; \ + ((u_int8_t *)b)[7] = ((u_int8_t *)a)[0]; \ +} while (0) #undef P_64_COPY #define P_64_COPY(a, b) { \ ((u_int8_t *)b)[0] = ((u_int8_t *)a)[0]; \ @@ -113,7 +124,7 @@ extern "C" { P_32_COPYSWAP(&_tmp, a); \ } while (0) #undef M_32_SWAP -#define M_32_SWAP(a) P_32_SWAP(&a) +#define M_32_SWAP(a) P_32_SWAP(&(a)) /* * Little endian <==> big endian 16-bit swap macros. @@ -139,8 +150,13 @@ extern "C" { P_16_COPYSWAP(&_tmp, a); \ } while (0) #undef M_16_SWAP -#define M_16_SWAP(a) P_16_SWAP(&a) +#define M_16_SWAP(a) P_16_SWAP(&(a)) +#undef SWAP64 +#define SWAP64(p) { \ + P_64_SWAP(p); \ + (p) += sizeof(u_int64_t); \ +} #undef SWAP32 #define SWAP32(p) { \ P_32_SWAP(p); \ @@ -168,6 +184,25 @@ extern "C" { P_32_SWAP(p); \ } while (0) +#undef DB_NTOHLL_COPYIN +#define DB_NTOHLL_COPYIN(env, i, p) do { \ + u_int8_t *tmp; \ + tmp = (u_int8_t *)&(i); \ + if (F_ISSET(env, ENV_LITTLEENDIAN)) { \ + tmp[7] = *p++; \ + tmp[6] = *p++; \ + tmp[5] = *p++; \ + tmp[4] = *p++; \ + tmp[3] = *p++; \ + tmp[2] = *p++; \ + tmp[1] = *p++; \ + tmp[0] = *p++; \ + } else { \ + memcpy(&(i), p, sizeof(u_int64_t)); \ + p = (u_int8_t *)p + sizeof(u_int64_t); \ + } \ +} while (0) + #undef DB_NTOHL_COPYIN #define DB_NTOHL_COPYIN(env, i, p) do { \ u_int8_t *tmp; \ @@ -178,7 +213,7 @@ extern "C" { tmp[1] = *p++; \ tmp[0] = *p++; \ } else { \ - memcpy(&i, p, sizeof(u_int32_t)); \ + memcpy(&(i), p, sizeof(u_int32_t)); \ p = (u_int8_t *)p + sizeof(u_int32_t); \ } \ } while (0) @@ -191,11 +226,29 @@ extern "C" { tmp[1] = *p++; \ tmp[0] = *p++; \ } else { \ - memcpy(&i, p, sizeof(u_int16_t)); \ + memcpy(&(i), p, sizeof(u_int16_t)); \ p = (u_int8_t *)p + sizeof(u_int16_t); \ } \ } while (0) +#undef DB_HTONLL_COPYOUT +#define DB_HTONLL_COPYOUT(env, p, i) do { \ + u_int8_t *tmp; \ + tmp = (u_int8_t *)p; \ + if (F_ISSET(env, ENV_LITTLEENDIAN)) { \ + *tmp++ = ((u_int8_t *)&(i))[7]; \ + 
*tmp++ = ((u_int8_t *)&(i))[6]; \ + *tmp++ = ((u_int8_t *)&(i))[5]; \ + *tmp++ = ((u_int8_t *)&(i))[4]; \ + *tmp++ = ((u_int8_t *)&(i))[3]; \ + *tmp++ = ((u_int8_t *)&(i))[2]; \ + *tmp++ = ((u_int8_t *)&(i))[1]; \ + *tmp++ = ((u_int8_t *)&(i))[0]; \ + } else \ + memcpy(p, &(i), sizeof(u_int64_t)); \ + p = (u_int8_t *)p + sizeof(u_int64_t); \ +} while (0) + #undef DB_HTONL_COPYOUT #define DB_HTONL_COPYOUT(env, p, i) do { \ u_int8_t *tmp; \ @@ -206,7 +259,7 @@ extern "C" { *tmp++ = ((u_int8_t *)&(i))[1]; \ *tmp++ = ((u_int8_t *)&(i))[0]; \ } else \ - memcpy(p, &i, sizeof(u_int32_t)); \ + memcpy(p, &(i), sizeof(u_int32_t)); \ p = (u_int8_t *)p + sizeof(u_int32_t); \ } while (0) @@ -229,6 +282,13 @@ extern "C" { */ #define LOG_SWAPPED(env) !F_ISSET(env, ENV_LITTLEENDIAN) +#define LOGCOPY_64(env, x, p) do { \ + if (LOG_SWAPPED(env)) \ + P_64_COPYSWAP((p), (x)); \ + else \ + memcpy((x), (p), sizeof(u_int64_t)); \ +} while (0) + #define LOGCOPY_32(env, x, p) do { \ if (LOG_SWAPPED(env)) \ P_32_COPYSWAP((p), (x)); \ diff --git a/src/dbinc/db_upgrade.h b/src/dbinc/db_upgrade.h index 45fb624d..716594c9 100644 --- a/src/dbinc/db_upgrade.h +++ b/src/dbinc/db_upgrade.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -242,6 +242,123 @@ typedef struct hashhdr { /* Disk resident portion */ */ } HASHHDR; + +/************************************************************************ + BLOB RECORD LAYOUTS + ************************************************************************/ + +/* + * Hash BLOB record layout. + */ +typedef struct _hblob60 { + u_int8_t type; /* 00: Page type and delete flag. */ + u_int8_t encoding; /* 01: Encoding of blob file. */ + u_int8_t unused[2]; /* 02-03: Padding, unused. */ + u_int32_t id_lo; /* 04-07: Blob file identifier. 
*/ + u_int32_t id_hi; /* 08-11: Blob file identifier. */ + u_int32_t size_lo; /* 12-15: Blob file size. */ + u_int32_t size_hi; /* 16-19: Blob file size. */ + DB_LSN lsn; /* 20-27: LSN for blob file update. */ + u_int8_t chksum[DB_MAC_KEY]; /* 28-47: Checksum */ + u_int8_t iv[DB_IV_BYTES]; /* 48-63: IV */ + u_int32_t file_id_lo; /* 64-67: File directory lo. */ + u_int32_t file_id_hi; /* 68-71: File directory hi. */ + u_int32_t sdb_id_lo; /* 72-75: Subdb that owns this blob. */ + u_int32_t sdb_id_hi; /* 76-79: Subdb that owns this blob. */ +} HBLOB60; + +#define HBLOB60_SIZE (sizeof(HBLOB60)) + +/* + * Btree BLOB record layout. + */ +typedef struct _bblob60 { + db_indx_t len; /* 00-01: BBLOB_DSIZE. */ + u_int8_t type; /* 02: Page type and delete flag. */ + u_int8_t encoding; /* 03: Encoding of blob file. */ + u_int32_t id_lo; /* 04-07: Blob file identifier. */ + u_int32_t id_hi; /* 08-11: Blob file identifier. */ + u_int32_t size_lo; /* 12-15: Blob file size. */ + u_int32_t size_hi; /* 16-19: Blob file size. */ + DB_LSN lsn; /* 20-27: LSN for blob file update. */ + u_int8_t chksum[DB_MAC_KEY]; /* 28-47: Checksum */ + u_int8_t iv[DB_IV_BYTES]; /* 48-63: IV */ + u_int32_t file_id_lo; /* 64-67: File directory lo. */ + u_int32_t file_id_hi; /* 68-71: File directory hi. */ + u_int32_t sdb_id_lo; /* 72-75: Subdb that owns this blob. */ + u_int32_t sdb_id_hi; /* 76-79: Subdb that owns this blob. */ +} BBLOB60; + +#define BBLOB60_SIZE \ + ((u_int16_t)DB_ALIGN(sizeof(BBLOB60), sizeof(u_int32_t))) +/* + * Heap BLOB record layout. + */ +typedef struct _heapblob60 { + u_int8_t flags; /* 00: Flags describing record. */ + u_int8_t unused; /* 01: Padding. */ + u_int16_t size; /* 02-03: The size of the stored data piece. */ + u_int8_t encoding; /* 04: Encoding of blob file. */ + u_int8_t unused2[3]; /* 05-07: Padding, unused. */ + u_int32_t id_lo; /* 08-11: Blob file identifier. */ + u_int32_t id_hi; /* 12-15: Blob file identifier. */ + u_int32_t size_lo; /* 16-19: Blob file size. 
*/ + u_int32_t size_hi; /* 20-23: Blob file size. */ + u_int8_t unused3[4]; /* 24-27: Padding, unused. */ + u_int8_t chksum[DB_MAC_KEY]; /* 28-47: Checksum */ + u_int8_t iv[DB_IV_BYTES]; /* 48-63: IV */ + DB_LSN lsn; /* 64-71: LSN for blob file update. */ + u_int32_t file_id_lo; /* 72-75: File directory lo. */ + u_int32_t file_id_hi; /* 76-79: File directory hi. */ +} HEAPBLOBHDR60; + +#define HEAPBLOBREC60_SIZE (sizeof(HEAPBLOBHDR60)) + +#define GET_BLOB60_FILE_ID(e, p, o, ret) \ + GET_LO_HI(e, (p)->file_id_lo, (p)->file_id_hi, o, ret); + +#define GET_BLOB60_SDB_ID(e, p, o, ret) \ + GET_LO_HI(e, (p)->sdb_id_lo, (p)->sdb_id_hi, o, ret); + +/* Return a uintmax_t version of blob_id. */ +#define GET_BLOB60_ID(e, p, o, ret) do { \ + DB_ASSERT((e), sizeof(o) <= 8); \ + if (sizeof(o) == 8) { \ + (o) = (p).id_hi; \ + (o) = (o) << 32; \ + (o) += (p).id_lo; \ + } else { \ + if ((p).id_hi > 0) { \ + __db_errx((e), DB_STR("0766", \ + "Blob identifier overflow.")); \ + (ret) = EINVAL; \ + } \ + (o) = (p).id_lo; \ + } \ +} while (0); + +/* Return an off_t version of blob size. */ +#define GET_BLOB60_SIZE(e, p, o, ret) do { \ + DB_ASSERT((e), sizeof(o) <= 8); \ + if (sizeof(o) == 8) { \ + (o) = (p).size_hi; \ + (o) = (o) << 32; \ + (o) += (p).size_lo; \ + } else { \ + if ((p).size_hi > 0) { \ + __db_errx((e), DB_STR("0767", \ + "Blob size overflow.")); \ + (ret) = EINVAL; \ + } \ + if ((p).size_lo > INT_MAX) { \ + __db_errx((e), DB_STR("0768", \ + "Blob size overflow.")); \ + (ret) = EINVAL; \ + } \ + (o) = (int32_t)(p).size_lo; \ + } \ +} while (0); + #if defined(__cplusplus) } #endif diff --git a/src/dbinc/db_verify.h b/src/dbinc/db_verify.h index 68acbf6c..ea87680f 100644 --- a/src/dbinc/db_verify.h +++ b/src/dbinc/db_verify.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -120,9 +120,10 @@ struct __vrfy_dbinfo { #define SALVAGE_PRINTABLE 0x01 /* Output printable chars literally. */ #define SALVAGE_PRINTHEADER 0x02 /* Print the unknown-key header. */ #define SALVAGE_PRINTFOOTER 0x04 /* Print the unknown-key footer. */ -#define SALVAGE_HASSUBDBS 0x08 /* There are subdatabases to salvage. */ -#define VRFY_LEAFCHAIN_BROKEN 0x10 /* Lost one or more Btree leaf pgs. */ -#define VRFY_QMETA_SET 0x20 /* We've seen a QUEUE meta page and +#define SALVAGE_STREAM_BLOB 0x08 /* Currently streaming a blob. */ +#define SALVAGE_HASSUBDBS 0x10 /* There are subdatabases to salvage. */ +#define SALVAGE_LEAFCHAIN_BROKEN 0x20 /* Lost one or more Btree leaf pgs. */ +#define SALVAGE_QMETA_SET 0x40 /* We've seen a QUEUE meta page and set things up for it. */ u_int32_t flags; }; /* VRFY_DBINFO */ diff --git a/src/dbinc/debug.h b/src/dbinc/debug.h index a8da000d..5388b791 100644 --- a/src/dbinc/debug.h +++ b/src/dbinc/debug.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1998, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -36,7 +36,13 @@ extern "C" { #define DB_ASSERT(env, e) \ ((e) ? (void)0 : __db_assert(env, #e, __FILE__, __LINE__)) #else -#define DB_ASSERT(env, e) NOP_STATEMENT +#define DB_ASSERT(env, e) ((void)0) +#endif + +#if defined(HAVE_ERROR_HISTORY) +#define DB_DEBUG_MSG __db_debug_msg +#else +#define DB_DEBUG_MSG if (0) __db_debug_msg #endif /* @@ -55,10 +61,11 @@ extern "C" { * of structure fields whose only purpose is padding, as well as when heap * memory that was never initialized is written to disk. 
*/ +#define UMRW_SET(var) UMRW_SET_VALUE((var), 0) #ifdef UMRW -#define UMRW_SET(v) (v) = 0 +#define UMRW_SET_VALUE(var, value) (var) = (value) #else -#define UMRW_SET(v) NOP_STATEMENT +#define UMRW_SET_VALUE(var, value) NOP_STATEMENT #endif /* @@ -73,6 +80,34 @@ typedef enum { } db_error_set_t; /* + * Use these macros wherever an error condition is initially noticed, e.g., when + * setting a value to any of the user visible error return codes, whether + * defined by Berkeley DB or by the operating environment (EINVAL), + * saving the specific source of an instance of an error code, including the + * time, stack, db name, current LSN, etc. If the error turns out to be + * important, the deferred message text is added to the text produced by + * __db_err(), __db_errx(), and __db_syserr(). The additional information can be + * useful for diagnosing the behavior of applications under error conditions. + * It is enabled by configuring with --enable-error_history. The current + * implementation requires pthreads' version of thread local storage. + */ +#ifdef HAVE_ERROR_HISTORY +#define USR_ERR(env, errcode) __db_diags((env), (errcode)) +#define DBC_ERR(dbc, errcode) __dbc_diags((dbc), (errcode)) +#define MUTEX_ERR(env, mutex, errcode) __mutex_diags((env), (mutex), (errcode)) +#define DISCARD_HISTORY(env) __db_deferred_discard() +/* Save at most 10KB of error history in an API call. Adjust this as desired. */ +#define DB_ERROR_HISTORY_SIZE (10 * 1024) +#else +#define USR_ERR(env, errcode) (errcode) +#define DBC_ERR(dbc, errcode) (errcode) +#define MUTEX_ERR(env, mutex, errcode) (errcode) +#define DISCARD_HISTORY(env) NOP_STATEMENT +/* No space is needed when error history is disabled. */ +#define DB_ERROR_HISTORY_SIZE 0 +#endif + +/* * Message handling. 
Use a macro instead of a function because va_list * references to variadic arguments cannot be reset to the beginning of the * variadic argument list (and then rescanned), by functions other than the @@ -102,6 +137,7 @@ typedef enum { ((app_call) || F_ISSET((dbenv)->env, ENV_NO_OUTPUT_SET)))) \ __db_errfile(dbenv, error, error_set, fmt, __ap); \ va_end(__ap); \ + DISCARD_HISTORY((dbenv)->env); \ } #else #define DB_REAL_ERR(dbenv, error, error_set, app_call, fmt) { \ @@ -127,6 +163,7 @@ typedef enum { ((app_call) || F_ISSET((dbenv)->env, ENV_NO_OUTPUT_SET)))) \ __db_errfile(env, error, error_set, fmt, __ap); \ va_end(__ap); \ + DISCARD_HISTORY(env); \ } #endif #if defined(STDC_HEADERS) || defined(__cplusplus) @@ -192,7 +229,7 @@ typedef enum { #define LOG_OP(C, T, O, K, A, F) { \ DB_LSN __lsn; \ DBT __op; \ - if (DBC_LOGGING((C))) { \ + if ((C)->dbp->log_filename != NULL && DBC_LOGGING((C))) { \ memset(&__op, 0, sizeof(__op)); \ __op.data = O; \ __op.size = (u_int32_t)strlen(O) + 1; \ diff --git a/src/dbinc/fop.h b/src/dbinc/fop.h index 94f27f9f..7ea62023 100644 --- a/src/dbinc/fop.h +++ b/src/dbinc/fop.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -23,6 +23,20 @@ extern "C" { (void)__memp_set_flags((D)->mpf, DB_MPOOL_NOFILE, 0); \ } while (0) +/* + * Never change the value of DB_FOP_CREATE (0x00000002), + * DB_FOP_APPEND (0x00000001), and DB_FOP_REDO(0x00000008), + * as those values are used in write_file logs. + */ +#define DB_FOP_APPEND 0x00000001 /* Appending to a file. */ +#define DB_FOP_CREATE 0x00000002 /* Creating the file. */ +#define DB_FOP_PARTIAL_LOG 0x00000004 /* Partial logging of file data. */ +#define DB_FOP_REDO 0x00000008 /* File operation can be redone. */ +#define DB_FOP_READONLY 0x00000010 /* File is read only. 
*/ +#define DB_FOP_WRITE 0x00000020 /* File is writeable. */ +#define DB_FOP_SYNC_WRITE 0x00000040 /* Sync file on each write. */ + + #include "dbinc_auto/fileops_auto.h" #include "dbinc_auto/fileops_ext.h" diff --git a/src/dbinc/globals.h b/src/dbinc/globals.h index 95e5c118..becd6365 100644 --- a/src/dbinc/globals.h +++ b/src/dbinc/globals.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -52,21 +52,27 @@ typedef struct __db_globals { char error_buf[40]; /* Error string buffer. */ - int uid_init; /* srand set in UID generator */ + int random_seeded; /* Has __os_srandom been called? */ - u_long rand_next; /* rand/srand value */ +#if defined(HAVE_RANDOM_R) + struct random_data random_data; /* srandom_r/random_r argument */ + char random_state[64]; /* random number state */ +#elif !defined(HAVE_RAND) && !defined(HAVE_RANDOM) + u_long rand_next; /* next rand value for clib/rand.c */ +#endif u_int32_t fid_serial; /* file id counter */ int db_errno; /* Errno value if not available */ - size_t num_active_pids; /* number of entries in active_pids */ - - size_t size_active_pids; /* allocated size of active_pids */ + char *saved_errstr; /* saved error string from backup */ - pid_t *active_pids; /* array active pids */ + char *time_format; /* strftime-format for printing dates */ - char *saved_errstr; /* saved error string from backup */ +#if defined(HAVE_ERROR_HISTORY) && defined(HAVE_PTHREAD_SELF) + pthread_key_t msgs_key; + pthread_once_t thread_once; +#endif /* Underlying OS interface jump table.*/ void (*j_assert) __P((const char *, const char *, int)); diff --git a/src/dbinc/hash.h b/src/dbinc/hash.h index f485128a..55a64f87 100644 --- a/src/dbinc/hash.h +++ b/src/dbinc/hash.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. 
* - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 1990, 1993, 1994 @@ -56,7 +56,7 @@ typedef struct hash_t { u_int32_t h_nelem; /* Number of elements. */ /* Hash and compare functions. */ u_int32_t (*h_hash) __P((DB *, const void *, u_int32_t)); - int (*h_compare) __P((DB *, const DBT *, const DBT *)); + int (*h_compare) __P((DB *, const DBT *, const DBT *, size_t *)); } HASH; /* Cursor structure definitions. */ diff --git a/src/dbinc/heap.h b/src/dbinc/heap.h index ca3407e0..bb96ebec 100644 --- a/src/dbinc/heap.h +++ b/src/dbinc/heap.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2010, 2015 Oracle and/or its affiliates. All rights reserved. */ #ifndef _DB_HEAP_H_ @@ -26,7 +26,8 @@ struct __heap { /* Heap access method. */ db_pgno_t curregion; /* The region of the next insert. */ db_pgno_t maxpgno; /* Maximum page number of a fixed size heap. */ - int curpgindx; /* The last used offset in the region's space bitmap. */ + u_int32_t curpgindx; /* The last used offset in the + * region's space bitmap. */ }; struct __heap_cursor { diff --git a/src/dbinc/hmac.h b/src/dbinc/hmac.h index 2a495b17..f87965eb 100644 --- a/src/dbinc/hmac.h +++ b/src/dbinc/hmac.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/dbinc/lock.h b/src/dbinc/lock.h index eab51832..298b8527 100644 --- a/src/dbinc/lock.h +++ b/src/dbinc/lock.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. 
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -37,7 +37,10 @@ extern "C" { */ #define LOCK_INVALID INVALID_ROFF #define LOCK_ISSET(lock) ((lock).off != LOCK_INVALID) -#define LOCK_INIT(lock) ((lock).off = LOCK_INVALID) +#define LOCK_INIT(lock) do { \ + (lock).off = LOCK_INVALID; \ + UMRW_SET_VALUE((lock).mode, DB_LOCK_NG); \ +} while(0) /* * Macro to identify a write lock for the purpose of counting locks @@ -66,8 +69,8 @@ extern "C" { typedef struct __db_lockregion { /* SHARED */ db_mutex_t mtx_region; /* Region mutex. */ - u_int32_t need_dd; /* flag for deadlock detector */ - u_int32_t detect; /* run dd on every conflict */ + u_int32_t need_dd; /* run dd on every conflict */ + u_int32_t detect; /* flag for deadlock detector */ db_timespec next_timeout; /* next time to expire a lock */ db_mutex_t mtx_dd; /* mutex for lock object dd list. */ db_mutex_t mtx_lockers; /* mutex for locker allocation. */ @@ -92,7 +95,7 @@ typedef struct __db_lockregion { /* SHARED */ u_int32_t lock_id; /* Current lock(er) id to allocate. */ u_int32_t cur_maxid; /* Current max lock(er) id. */ - u_int32_t nlockers; /* Current number of lockers. */ + u_int32_t nlockers; /* Current number of locker ids. */ int32_t nmodes; /* Number of modes in conflict table. */ DB_LOCK_STAT stat; /* stats about locking. */ } DB_LOCKREGION; @@ -157,12 +160,16 @@ struct __db_locker { /* SHARED */ db_timespec lk_expire; /* When current lock expires. */ db_timespec tx_expire; /* When this txn expires. */ db_timeout_t lk_timeout; /* How long do we let locks live. */ +#ifdef DIAGNOSTIC + roff_t prev_locker; /* The thread's previous dbth_locker. */ +#endif #define DB_LOCKER_DIRTY 0x0001 /* Has write locks. */ #define DB_LOCKER_INABORT 0x0002 /* Is aborting, don't abort again. */ #define DB_LOCKER_TIMEOUT 0x0004 /* Has timeout set. */ #define DB_LOCKER_FAMILY_LOCKER 0x0008 /* Part of a family of lockers. 
*/ #define DB_LOCKER_HANDLE_LOCKER 0x0010 /* Not associated with a thread. */ +#define DB_LOCKER_FREE 0x0020 /* Diag: it is on the free list. */ u_int32_t flags; }; diff --git a/src/dbinc/log.h b/src/dbinc/log.h index c4dea6fc..2e2929f0 100644 --- a/src/dbinc/log.h +++ b/src/dbinc/log.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -55,6 +55,8 @@ struct __fname { /* number of txn referencing + 1 for the db handle. */ u_int32_t txn_ref; + db_seq_t blob_file_id; /* BLOB file directory id. */ + #define DB_FNAME_CLOSED 0x01 /* DBP was closed. */ #define DB_FNAME_DURABLE 0x02 /* File is durable. */ #define DB_FNAME_INMEM 0x04 /* File is in memory. */ @@ -137,16 +139,18 @@ struct __db_log { ENV *env; /* Environment */ REGINFO reginfo; /* Region information. */ -#define DBLOG_AUTOREMOVE 0x01 /* Autoremove log files. */ -#define DBLOG_DIRECT 0x02 /* Do direct I/O on the log. */ -#define DBLOG_DSYNC 0x04 /* Set OS_DSYNC on the log. */ -#define DBLOG_FORCE_OPEN 0x08 /* Force the DB open even if it appears +#define DBLOG_AUTOREMOVE 0x001 /* Autoremove log files. */ +#define DBLOG_BLOB 0x002 /* Full logging of blob data. */ +#define DBLOG_DIRECT 0x004 /* Do direct I/O on the log. */ +#define DBLOG_DSYNC 0x008 /* Set OS_DSYNC on the log. */ +#define DBLOG_FORCE_OPEN 0x010 /* Force the DB open even if it appears * to be deleted. */ -#define DBLOG_INMEMORY 0x10 /* Logging is in memory. */ -#define DBLOG_OPENFILES 0x20 /* Prepared files need to be open. */ -#define DBLOG_RECOVER 0x40 /* We are in recovery. */ -#define DBLOG_ZERO 0x80 /* Zero fill the log. */ -#define DBLOG_VERIFYING 0x100 /* The log is being verified. */ +#define DBLOG_INMEMORY 0x020 /* Logging is in memory. */ +#define DBLOG_NOSYNC 0x040 /* Don't sync log files during flush. 
*/ +#define DBLOG_OPENFILES 0x080 /* Prepared files need to be open. */ +#define DBLOG_RECOVER 0x100 /* We are in recovery. */ +#define DBLOG_ZERO 0x200 /* Zero fill the log. */ +#define DBLOG_VERIFYING 0x400 /* The log is being verified. */ u_int32_t flags; }; @@ -251,7 +255,8 @@ struct __log { /* SHARED */ * rather than by the region mutex. */ db_mutex_t mtx_flush; /* Mutex guarding flushing. */ - int32_t in_flush; /* Log flush in progress. */ + int32_t in_flush; /* Log flush in progress. */ + int32_t nosync; /* log_set_config(DB_LOG_NOSYNC) */ DB_LSN s_lsn; /* LSN of the last sync. */ DB_LOG_STAT stat; /* Log statistics. */ diff --git a/src/dbinc/log_verify.h b/src/dbinc/log_verify.h index fa90ace4..ec43c4d7 100644 --- a/src/dbinc/log_verify.h +++ b/src/dbinc/log_verify.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/dbinc/mp.h b/src/dbinc/mp.h index 9a10c6d9..598ca366 100644 --- a/src/dbinc/mp.h +++ b/src/dbinc/mp.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -226,10 +226,15 @@ struct __mpool { /* SHARED */ #define DB_MEMP_SYNC_INTERRUPT 0x02 u_int32_t config_flags; - /* Free frozen buffer headers, protected by the region lock. */ + /* These MVCC fields are protected by the mpool region lock. */ + + /* This is the free list of BH_FROZEN_PAGEs, the frozen headers. */ SH_TAILQ_HEAD(__free_frozen) free_frozen; - /* Allocated blocks of frozen buffer headers. */ + /* + * This list of BH_FROZEN_ALLOCs contains all the BH_FROZEN_PAGEs, + * whether they are in free_frozen or busy (in a bh.vc version chain). 
+ */ SH_TAILQ_HEAD(__alloc_frozen) alloc_frozen; }; @@ -550,9 +555,10 @@ struct __bh { /* SHARED */ #define BH_FROZEN 0x040 /* Frozen buffer: allocate & re-read. */ #define BH_TRASH 0x080 /* Page is garbage. */ #define BH_THAWED 0x100 /* Page was thawed. */ +#define BH_UNREACHABLE 0x200 /* Discard this defunct MVCC version. */ u_int16_t flags; - u_int32_t priority; /* Priority. */ + u_int32_t priority; /* Cache priority. */ SH_TAILQ_ENTRY hq; /* MPOOL hash bucket queue. */ db_pgno_t pgno; /* Underlying MPOOLFILE page number. */ @@ -587,9 +593,12 @@ struct __bh_frozen_p { /* * BH_FROZEN_ALLOC -- - * Frozen buffer headers are allocated a page at a time in general. This - * structure is allocated at the beginning of the page so that the - * allocation chunks can be tracked and freed (for private environments). + * This structure is the container for one or more frozen buffer headers. + * Blocks of BH_FROZEN_PAGE structs are usually allocated a page at a time, + * though when an mpool is nearly full and a whole page isn't available + * there can be single-item blocks. BH_FROZEN_ALLOC is the block header + * allocated at the beginning of the chunk and is linked to the mpool's + * alloc_frozen so that the allocation chunks can be tracked and freed. */ struct __bh_frozen_a { SH_TAILQ_ENTRY links; @@ -602,33 +611,36 @@ struct __bh_frozen_a { (F_ISSET(PAGE_TO_BH(p), BH_DIRTY|BH_EXCLUSIVE) == (BH_DIRTY|BH_EXCLUSIVE)) #define BH_OWNER(env, bhp) \ - ((TXN_DETAIL *)R_ADDR(&env->tx_handle->reginfo, bhp->td_off)) + ((TXN_DETAIL *)R_ADDR(&(env)->tx_handle->reginfo, (bhp)->td_off)) #define BH_OWNED_BY(env, bhp, txn) ((txn) != NULL && \ - (bhp)->td_off != INVALID_ROFF && \ - (txn)->td == BH_OWNER(env, bhp)) + (bhp)->td_off != INVALID_ROFF && (txn)->td == BH_OWNER(env, bhp)) -#define VISIBLE_LSN(env, bhp) \ - (&BH_OWNER(env, bhp)->visible_lsn) +#define VISIBLE_LSN(env, bhp) (&BH_OWNER(env, bhp)->visible_lsn) /* - * Make a copy of the buffer's visible LSN, one field at a time. 
We rely on the - * 32-bit operations being atomic. The visible_lsn starts at MAX_LSN and is - * set during commit or abort to the current LSN. + * MVCC Versions are visible only to snapshot transactions whose read_lsn is at + * least as recent (large) as the buffer's lsn. Visibility checks must be made + * from newest to oldest along bhp.vc, stopping at the first visible one. + * Unversioned buffers (those with invalid td_off) are always visible. + * + * BH_VISIBLE() makes a copy of the buffer's visible LSN, one field at a time. + * We rely on the 32-bit operations being atomic. The visible_lsn starts at + * MAX_LSN and is set during commit or abort to the current LSN. * - * If we race with a commit / abort, we may see either the file or the offset + * If we race with a commit or abort, we may see either the file or the offset * still at UINT32_MAX, so vlsn is guaranteed to be in the future. That's OK, * since we had to take the log region lock to allocate the read LSN so we were * never going to see this buffer anyway. */ #define BH_VISIBLE(env, bhp, read_lsnp, vlsn) \ (bhp->td_off == INVALID_ROFF || \ - ((vlsn).file = VISIBLE_LSN(env, bhp)->file, \ + ((vlsn).file = VISIBLE_LSN(env, bhp)->file, \ (vlsn).offset = VISIBLE_LSN(env, bhp)->offset, \ LOG_COMPARE((read_lsnp), &(vlsn)) >= 0)) #define BH_OBSOLETE(bhp, old_lsn, vlsn) (SH_CHAIN_HASNEXT(bhp, vc) ? \ - BH_VISIBLE(env, SH_CHAIN_NEXTP(bhp, vc, __bh), &(old_lsn), vlsn) :\ + BH_VISIBLE(env, SH_CHAIN_NEXTP(bhp, vc, __bh), &(old_lsn), vlsn) : \ BH_VISIBLE(env, bhp, &(old_lsn), vlsn)) #define MVCC_SKIP_CURADJ(dbc, pgno) (dbc->txn != NULL && \ diff --git a/src/dbinc/mutex.h b/src/dbinc/mutex.h index b699142c..334d8f96 100644 --- a/src/dbinc/mutex.h +++ b/src/dbinc/mutex.h @@ -1,7 +1,7 @@ /* * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -24,10 +24,14 @@ extern "C" { #endif /* - * By default, spin 50 times per processor if fail to acquire a test-and-set - * mutex, we have anecdotal evidence it's a reasonable value. + * These specify the default spin parameters for test-and-set mutexes. A single + * processor system spins just once, a multiprocessor system spins 50 times per + * processor up to a default maximum of 200. This limit reduces excessive + * busy-waiting on machines with many hyperthreads. We have anecdotal evidence + * that these are reasonable default values. */ #define MUTEX_SPINS_PER_PROCESSOR 50 +#define MUTEX_SPINS_DEFAULT_MAX 200 /* * Mutexes are represented by unsigned, 32-bit integral values. As the @@ -163,13 +167,6 @@ static inline int __db_pthread_mutex_tryreadlock(ENV *env, db_mutex_t mutex) #define __mutex_rdlock(a, b) __db_win32_mutex_readlock(a, b) #define __mutex_tryrdlock(a, b) __db_win32_mutex_tryreadlock(a, b) #endif -#elif defined(HAVE_MUTEX_FCNTL) -#define __mutex_init(a, b, c) __db_fcntl_mutex_init(a, b, c) -#define __mutex_lock(a, b) __db_fcntl_mutex_lock(a, b, 0) -#define __mutex_timedlock(a, b, c) __db_fcntl_lock(a, b, c) -#define __mutex_trylock(a, b) __db_fcntl_mutex_trylock(a, b) -#define __mutex_unlock(a, b) __db_fcntl_mutex_unlock(a, b) -#define __mutex_destroy(a, b) __db_fcntl_mutex_destroy(a, b) #else #define __mutex_init(a, b, c) __db_tas_mutex_init(a, b, c) #define __mutex_lock(a, b) __db_tas_mutex_lock(a, b, 0) @@ -184,9 +181,8 @@ static inline int __db_pthread_mutex_tryreadlock(ENV *env, db_mutex_t mutex) #endif /* - * When there is no method to get a shared latch, fall back to - * implementing __mutex_rdlock() as getting an exclusive one. - * This occurs either when !HAVE_SHARED_LATCHES or HAVE_MUTEX_FCNTL. + * When there is no method to get a shared latch, fall back to implementing + * __mutex_rdlock() as an exclusive one. This may no longer be supported? 
*/ #ifndef __mutex_rdlock #define __mutex_rdlock(a, b) __mutex_lock(a, b) @@ -199,17 +195,25 @@ static inline int __db_pthread_mutex_tryreadlock(ENV *env, db_mutex_t mutex) * Lock/unlock a mutex. If the mutex was never required, the thread of * control can proceed without it. * - * We never fail to acquire or release a mutex without panicing. Simplify + * We rarely fail to acquire or release a mutex without panicing. Simplify * the macros to always return a panic value rather than saving the actual - * return value of the mutex routine. + * return value of the mutex routine. Use MUTEX_LOCK_RET() when the caller has + * a code path for a mutex failure, e.g., when cleaning up after a panic. */ #ifdef HAVE_MUTEX_SUPPORT #define MUTEX_LOCK(env, mutex) do { \ - if ((mutex) != MUTEX_INVALID && \ - __mutex_lock(env, mutex) != 0) \ + if ((mutex) != MUTEX_INVALID && __mutex_lock(env, mutex) != 0) \ return (DB_RUNRECOVERY); \ } while (0) +#define MUTEX_LOCK_RET(env, mutex) \ + ((mutex) == MUTEX_INVALID ? 0 : __mutex_lock(env, mutex)) + +/* + * Always check the return value of MUTEX_TRYLOCK()! Expect 0 on success, + * or possibly DB_RUNRECOVERY for failchk. + */ + /* * Always check the return value of MUTEX_TRYLOCK()! Expect 0 on success, * or DB_LOCK_NOTGRANTED, or possibly DB_RUNRECOVERY for failchk. @@ -217,9 +221,7 @@ static inline int __db_pthread_mutex_tryreadlock(ENV *env, db_mutex_t mutex) #define MUTEX_TRYLOCK(env, mutex) \ (((mutex) == MUTEX_INVALID) ? 0 : __mutex_trylock(env, mutex)) -/* - * Acquire a DB_MUTEX_SHARED "mutex" in shared mode. - */ +/* Acquire a latch (a DB_MUTEX_SHARED "mutex") in shared mode. 
*/ #define MUTEX_READLOCK(env, mutex) do { \ if ((mutex) != MUTEX_INVALID && \ __mutex_rdlock(env, mutex) != 0) \ @@ -234,30 +236,68 @@ static inline int __db_pthread_mutex_tryreadlock(ENV *env, db_mutex_t mutex) return (DB_RUNRECOVERY); \ } while (0) -#define MUTEX_WAIT(env, mutex, duration) do { \ - int __ret; \ - if ((mutex) != MUTEX_INVALID && \ - (__ret = __mutex_timedlock(env, mutex, duration)) != 0 && \ - __ret != DB_TIMEOUT) \ - return (DB_RUNRECOVERY); \ +#define MUTEX_WAIT(env, mutex, duration) do { \ + int __ret; \ + if ((mutex) != MUTEX_INVALID && \ + (__ret = __mutex_timedlock(env, mutex, duration)) != 0 && \ + __ret != DB_TIMEOUT) \ + return (DB_RUNRECOVERY); \ } while (0) + +/* + * Check that a particular mutex is exclusively held at least by someone, not + * necessarily the current thread. + */ +#define MUTEX_IS_OWNED(env, mutex) \ + (mutex == MUTEX_INVALID || !MUTEX_ON(env) || \ + F_ISSET(env->dbenv, DB_ENV_NOLOCKING) || \ + F_ISSET(MUTEXP_SET(env, mutex), DB_MUTEX_LOCKED)) #else /* * There are calls to lock/unlock mutexes outside of #ifdef's -- replace * the call with something the compiler can discard, but which will make - * if-then-else blocks work correctly. + * if-then-else blocks work correctly, and suppress unused variable messages. 
+ */ +#define MUTEX_LOCK(env, mutex) { env = (env); mutex = (mutex); } +#define MUTEX_LOCK_RET(env, mutex) ( env = (env), mutex = (mutex), 0) +#define MUTEX_TRYLOCK(env, mutex) ( env = (env), mutex = (mutex), 0) +#define MUTEX_READLOCK(env, mutex) { env = (env); mutex = (mutex); } +#define MUTEX_TRY_READLOCK(env, mutex) ( env = (env), mutex = (mutex), 0 ) +#define MUTEX_UNLOCK(env, mutex) { env = (env); mutex = (mutex); } +#define MUTEX_REQUIRED(env, mutex) { env = (env); mutex = (mutex); } +#define MUTEX_REQUIRED_READ(env, mutex) { env = (env); mutex = (mutex); } +#define MUTEX_WAIT(env, mutex, duration) { \ + (env) = (env); (mutex) = (mutex); (duration) = (duration); \ +} + +/* + * Every MUTEX_IS_OWNED() caller expects to own it. When there is no mutex + * support, act as if we have ownership. */ -#define MUTEX_LOCK(env, mutex) (mutex) = (mutex) -#define MUTEX_TRYLOCK(env, mutex) (mutex) = (mutex) -#define MUTEX_READLOCK(env, mutex) (mutex) = (mutex) -#define MUTEX_TRY_READLOCK(env, mutex) (mutex) = (mutex) -#define MUTEX_UNLOCK(env, mutex) (mutex) = (mutex) -#define MUTEX_REQUIRED(env, mutex) (mutex) = (mutex) -#define MUTEX_REQUIRED_READ(env, mutex) (mutex) = (mutex) -#define MUTEX_WAIT(env, mutex, duration) (mutex) = (mutex) +#define MUTEX_IS_OWNED(env, mutex) 1 #endif /* + * Bulk initialization of mutexes in regions. + */ + +#define MUTEX_BULK_INIT(env, region, start, howmany) do { \ + DB_MUTEX *__mutexp; \ + db_mutex_t __i = start; \ + u_int32_t __n = howmany; \ + for (__mutexp = MUTEXP_SET(env, __i); \ + --__n > 0; \ + __mutexp = MUTEXP_SET(env, __i)) { \ + __mutexp->flags = 0; \ + __i = (F_ISSET(env, ENV_PRIVATE)) ? \ + ((uintptr_t)__mutexp + region->mutex_size) : __i + 1; \ + __mutexp->mutex_next_link = __i; \ + } \ + __mutexp->flags = 0; \ + __mutexp->mutex_next_link = MUTEX_INVALID; \ +} while (0) + +/* * Berkeley DB ports may require single-threading at places in the code. 
*/ #ifdef HAVE_MUTEX_VXWORKS diff --git a/src/dbinc/mutex_int.h b/src/dbinc/mutex_int.h index b9bccdf7..4a4468af 100644 --- a/src/dbinc/mutex_int.h +++ b/src/dbinc/mutex_int.h @@ -1,7 +1,7 @@ /* * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -73,6 +73,14 @@ extern "C" { else \ RET_SET((pthread_mutex_lock(&(mutexp)->u.m.mutex)), ret); \ } while (0) +#define RET_SET_PTHREAD_TIMEDLOCK(mutexp, timespec, ret) do { \ + if (F_ISSET(mutexp, DB_MUTEX_SHARED)) \ + RET_SET(pthread_rwlock_timedwrlock(&(mutexp)->u.rwlock, \ + (timespec)), ret); \ + else \ + RET_SET(pthread_mutex_timedlock(&(mutexp)->u.m.mutex, \ + (timespec)), ret); \ +} while (0) #define RET_SET_PTHREAD_TRYLOCK(mutexp, ret) do { \ if (F_ISSET(mutexp, DB_MUTEX_SHARED)) \ RET_SET((pthread_rwlock_trywrlock(&(mutexp)->u.rwlock)), \ @@ -84,6 +92,9 @@ extern "C" { #else #define RET_SET_PTHREAD_LOCK(mutexp, ret) \ RET_SET(pthread_mutex_lock(&(mutexp)->u.m.mutex), ret); +#define RET_SET_PTHREAD_TIMEDLOCK(mutexp, timespec, ret) \ + RET_SET(pthread_mutex_timedlock(&(mutexp)->u.m.mutex, \ + (timespec)), ret); #define RET_SET_PTHREAD_TRYLOCK(mutexp, ret) \ RET_SET(pthread_mutex_trylock(&(mutexp)->u.m.mutex), ret); #endif @@ -267,6 +278,11 @@ typedef abilock_t tsl_t; #include <sys/machlock.h> typedef lock_t tsl_t; +/* + * Solaris requires 8 byte alignment for pthread_mutex_t values. + */ +#define MUTEX_ALIGN 8 + /* * The functions are declared in <sys/machlock.h>, but under #ifdef KERNEL. * Re-declare them here to avoid warnings. 
@@ -778,6 +794,7 @@ MUTEX_SET(tsl_t *tsl) { static inline void MUTEX_UNSET(tsl_t *tsl) { __asm__ volatile( + " .set mips2 \n" " .set noreorder \n" " sync \n" " sw $0, %0 \n" @@ -892,15 +909,22 @@ struct __db_mutexmgr { REGINFO reginfo; /* Region information */ void *mutex_array; /* Base of the mutex array */ +#ifdef HAVE_FAILCHK_BROADCAST + /* + * The mutex lock functions wait for at most this long between checks + * for DB_MUTEX_OWNER_DEAD. This field needs no mutex protection. + */ + db_timeout_t failchk_polltime; +#endif }; /* Macros to lock/unlock the mutex region as a whole. */ -#define MUTEX_SYSTEM_LOCK(dbenv) \ - MUTEX_LOCK(dbenv, ((DB_MUTEXREGION *) \ - (dbenv)->mutex_handle->reginfo.primary)->mtx_region) -#define MUTEX_SYSTEM_UNLOCK(dbenv) \ - MUTEX_UNLOCK(dbenv, ((DB_MUTEXREGION *) \ - (dbenv)->mutex_handle->reginfo.primary)->mtx_region) +#define MUTEX_SYSTEM_LOCK(env) \ + MUTEX_LOCK(env, ((DB_MUTEXREGION *) \ + (env)->mutex_handle->reginfo.primary)->mtx_region) +#define MUTEX_SYSTEM_UNLOCK(env) \ + MUTEX_UNLOCK(env, ((DB_MUTEXREGION *) \ + (env)->mutex_handle->reginfo.primary)->mtx_region) /* * DB_MUTEXREGION -- @@ -927,6 +951,16 @@ typedef struct __db_mutexregion { /* SHARED */ } DB_MUTEXREGION; #ifdef HAVE_MUTEX_SUPPORT +/* + * MTX_DIAG turns on the recording of when and where a mutex was locked. It has + * a large impact, and should only be turned on when debugging mutexes. + */ +#define MUTEX_STACK_TEXT_SIZE 600 +typedef struct __mutex_history { /* SHARED */ + db_timespec when; + char stacktext[MUTEX_STACK_TEXT_SIZE]; +} MUTEX_HISTORY; + struct __db_mutex_t { /* SHARED */ /* Mutex. */ #ifdef MUTEX_FIELDS MUTEX_FIELDS /* Opaque thread mutex structures. */ @@ -959,9 +993,9 @@ struct __db_mutex_t { /* SHARED */ /* Mutex. */ db_mutex_t mutex_next_link; /* Linked list of free mutexes. */ -#ifdef HAVE_STATISTICS int alloc_id; /* Allocation ID. */ +#ifdef HAVE_STATISTICS u_int32_t mutex_set_wait; /* Granted after wait. 
*/ u_int32_t mutex_set_nowait; /* Granted without waiting. */ #ifdef HAVE_SHARED_LATCHES @@ -973,7 +1007,9 @@ struct __db_mutex_t { /* SHARED */ /* Mutex. */ u_int32_t hybrid_wakeup; /* for counting spurious wakeups */ #endif #endif - +#ifdef MUTEX_DIAG + MUTEX_HISTORY mutex_history; +#endif /* * A subset of the flag arguments for __mutex_alloc(). * @@ -992,19 +1028,6 @@ struct __db_mutex_t { /* SHARED */ /* Mutex. */ (indx) * \ ((DB_MUTEXREGION *)env->mutex_handle->reginfo.primary)->mutex_size)) -/* - * Check that a particular mutex is exclusively held at least by someone, not - * necessarily the current thread. - */ -#ifdef HAVE_MUTEX_SUPPORT -#define MUTEX_IS_OWNED(env, mutex) \ - (mutex == MUTEX_INVALID || !MUTEX_ON(env) || \ - F_ISSET(env->dbenv, DB_ENV_NOLOCKING) || \ - F_ISSET(MUTEXP_SET(env, mutex), DB_MUTEX_LOCKED)) -#else -#define MUTEX_IS_OWNED(env, mutex) 0 -#endif - #if defined(HAVE_MUTEX_HYBRID) || defined(DB_WIN32) || \ (defined(HAVE_SHARED_LATCHES) && !defined(HAVE_MUTEX_PTHREADS)) #define MUTEXP_IS_BUSY(mutexp) \ diff --git a/src/dbinc/os.h b/src/dbinc/os.h index 2515e6ee..ea1fd2c4 100644 --- a/src/dbinc/os.h +++ b/src/dbinc/os.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/dbinc/partition.h b/src/dbinc/partition.h index 09e42573..11cdfa6f 100644 --- a/src/dbinc/partition.h +++ b/src/dbinc/partition.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. 
*/ /* * $Id$ @@ -22,6 +22,7 @@ typedef struct __db_partition { u_int32_t (*callback) (DB *, DBT *); #define PART_CALLBACK 0x01 #define PART_RANGE 0x02 +#define PART_KEYS_SETUP 0x04 u_int32_t flags; } DB_PARTITION; @@ -36,7 +37,14 @@ typedef struct __part_internal { #ifdef HAVE_PARTITION #define PART_NAME "__dbp.%s.%03d" -#define PART_LEN (strlen("__dbp..")+3) +/* + * Currently we only support no more than 1000000 partitions. + * If the limit is changed, the PART_DIGITS and PART_MAXIMUM + * should be changed accordingly. + */ +#define PART_DIGITS 6 +#define PART_MAXIMUM 1000000 +#define PART_LEN (sizeof("__dbp..") + PART_DIGITS) #define PART_PREFIX "__dbp." #define IS_PARTITION_DB_FILE(name) (strncmp(name, PART_PREFIX, \ sizeof(PART_PREFIX) - 1) == 0) diff --git a/src/dbinc/perfmon.h b/src/dbinc/perfmon.h index c3b9b9fa..e89eba33 100644 --- a/src/dbinc/perfmon.h +++ b/src/dbinc/perfmon.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2010, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/dbinc/qam.h b/src/dbinc/qam.h index 657c11e2..d18f91f3 100644 --- a/src/dbinc/qam.h +++ b/src/dbinc/qam.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/dbinc/queue.h b/src/dbinc/queue.h index 5a62741a..c53941ab 100644 --- a/src/dbinc/queue.h +++ b/src/dbinc/queue.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. 
*/ /* * Copyright (c) 1991, 1993 diff --git a/src/dbinc/region.h b/src/dbinc/region.h index ac0ff16f..070aff5f 100644 --- a/src/dbinc/region.h +++ b/src/dbinc/region.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1998, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -134,7 +134,10 @@ typedef enum { REGION_TYPE_LOG, REGION_TYPE_MPOOL, REGION_TYPE_MUTEX, - REGION_TYPE_TXN } reg_type_t; + REGION_TYPE_TXN, + /* This enum always must be the last, and is the largest valid type. */ + REGION_TYPE_MAX = REGION_TYPE_TXN +} reg_type_t; #define INVALID_REGION_SEGID -1 /* Segment IDs are either shmget(2) or * Win16 segment identifiers. They are @@ -196,10 +199,10 @@ typedef struct __db_reg_env { /* SHARED */ /* - * The mtx_regenv mutex protects the environment reference count and - * memory allocation from the primary shared region (the crypto, thread - * control block and replication implementations allocate memory from - * the primary shared region). + * The mtx_regenv mutex protects the environment reference count, + * blob threshold and memory allocation from the primary shared region + * (the crypto, thread control block and replication implementations + * allocate memory from the primary shared region). * * The rest of the fields are initialized at creation time, and don't * need mutex protection. The flags, op_timestamp and rep_timestamp @@ -209,6 +212,7 @@ typedef struct __db_reg_env { /* SHARED */ */ db_mutex_t mtx_regenv; /* Refcnt, region allocation mutex. */ u_int32_t refcnt; /* References to the environment. */ + u_int32_t blob_threshold; /* Environment wide blob threshold. */ u_int32_t region_cnt; /* Number of REGIONs. */ roff_t region_off; /* Offset of region array */ @@ -227,6 +231,8 @@ typedef struct __db_reg_env { /* SHARED */ time_t op_timestamp; /* Timestamp for operations. 
*/ time_t rep_timestamp; /* Timestamp for rep db handles. */ u_int32_t reg_panic; /* DB_REGISTER triggered panic */ + u_int32_t failure_panic; /* Failchk or mutex lock saw a crash. */ + char failure_symptom[DB_FAILURE_SYMPTOM_SIZE]; uintmax_t unused; /* The ALLOC_LAYOUT structure follows * the REGENV structure in memory and * contains uintmax_t fields. Force @@ -308,11 +314,14 @@ struct __db_reginfo_t { /* __env_region_attach IN parameters. */ /* * PANIC_ISSET, PANIC_CHECK: - * Check to see if the DB environment is dead. + * Check to see if the DB environment is dead. If the environment is still + * attached to its regions, look in the REGENV. Otherwise, check whether + * the region had the panic state set when this env detached from it. */ #define PANIC_ISSET(env) \ - ((env) != NULL && (env)->reginfo != NULL && \ - ((REGENV *)(env)->reginfo->primary)->panic != 0 && \ + ((env) != NULL && ((env)->reginfo != NULL ? \ + ((REGENV *)(env)->reginfo->primary)->panic != 0 : \ + F_ISSET(env, ENV_REMEMBER_PANIC)) && \ !F_ISSET((env)->dbenv, DB_ENV_NOPANIC)) #define PANIC_CHECK(env) \ diff --git a/src/dbinc/rep.h b/src/dbinc/rep.h index 75004239..f3bdf481 100644 --- a/src/dbinc/rep.h +++ b/src/dbinc/rep.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -19,6 +19,7 @@ extern "C" { * Names of client temp databases. */ #define REPFILEPREFIX "__db.rep" +#define REPBLOBNAME "__db.rep.blob.db" #define REPDBNAME "__db.rep.db" #define REPPAGENAME "__db.reppg.db" @@ -42,43 +43,58 @@ extern "C" { /* * Message types */ -#define REP_INVALID 0 /* Invalid message type. */ -#define REP_ALIVE 1 /* I am alive message. */ -#define REP_ALIVE_REQ 2 /* Request for alive messages. */ -#define REP_ALL_REQ 3 /* Request all log records greater than LSN.
*/ -#define REP_BULK_LOG 4 /* Bulk transfer of log records. */ -#define REP_BULK_PAGE 5 /* Bulk transfer of pages. */ -#define REP_DUPMASTER 6 /* Duplicate master detected; propagate. */ -#define REP_FILE 7 /* Page of a database file. NOTUSED */ -#define REP_FILE_FAIL 8 /* File requested does not exist. */ -#define REP_FILE_REQ 9 /* Request for a database file. NOTUSED */ -#define REP_LEASE_GRANT 10 /* Client grants a lease to a master. */ -#define REP_LOG 11 /* Log record. */ -#define REP_LOG_MORE 12 /* There are more log records to request. */ -#define REP_LOG_REQ 13 /* Request for a log record. */ -#define REP_MASTER_REQ 14 /* Who is the master */ -#define REP_NEWCLIENT 15 /* Announces the presence of a new client. */ -#define REP_NEWFILE 16 /* Announce a log file change. */ -#define REP_NEWMASTER 17 /* Announces who the master is. */ -#define REP_NEWSITE 18 /* Announces that a site has heard from a new - * site; like NEWCLIENT, but indirect. A - * NEWCLIENT message comes directly from the new - * client while a NEWSITE comes indirectly from - * someone who heard about a NEWSITE. - */ -#define REP_PAGE 19 /* Database page. */ -#define REP_PAGE_FAIL 20 /* Requested page does not exist. */ -#define REP_PAGE_MORE 21 /* There are more pages to request. */ -#define REP_PAGE_REQ 22 /* Request for a database page. */ -#define REP_REREQUEST 23 /* Force rerequest. */ -#define REP_START_SYNC 24 /* Tell client to begin syncing a ckp.*/ -#define REP_UPDATE 25 /* Environment hotcopy information. */ -#define REP_UPDATE_REQ 26 /* Request for hotcopy information. */ -#define REP_VERIFY 27 /* A log record for verification. */ -#define REP_VERIFY_FAIL 28 /* The client is outdated. */ -#define REP_VERIFY_REQ 29 /* Request for a log record to verify. */ -#define REP_VOTE1 30 /* Send out your information for an election. */ -#define REP_VOTE2 31 /* Send a "you are master" vote. */ +#define REP_INVALID 0 /* Invalid message type. */ +#define REP_ALIVE 1 /* I am alive message. 
*/ +#define REP_ALIVE_REQ 2 /* Request for alive messages. */ +#define REP_ALL_REQ 3 /* Request all log records greater than + * LSN. */ +#define REP_BLOB_ALL_REQ 4 /* Request all the given blob files. */ +#define REP_BLOB_CHUNK 5 /* A piece of data contained in a blob + * file. */ +#define REP_BLOB_CHUNK_REQ 6 /* Request a piece of data from a blob + * file. */ +#define REP_BLOB_UPDATE 7 /* A list of blob files for a + * database. */ +#define REP_BLOB_UPDATE_REQ 8 /* Request blob files. */ +#define REP_BULK_LOG 9 /* Bulk transfer of log records. */ +#define REP_BULK_PAGE 10 /* Bulk transfer of pages. */ +#define REP_DUPMASTER 11 /* Duplicate master detected; + * propagate. */ +#define REP_FILE 12 /* Page of a database file. NOTUSED */ +#define REP_FILE_FAIL 13 /* File requested does not exist. */ +#define REP_FILE_REQ 14 /* Request for a database file. + * NOTUSED */ +#define REP_LEASE_GRANT 15 /* Client grants a lease to a master. */ +#define REP_LOG 16 /* Log record. */ +#define REP_LOG_MORE 17 /* There are more log records to + * request. */ +#define REP_LOG_REQ 18 /* Request for a log record. */ +#define REP_MASTER_REQ 19 /* Who is the master */ +#define REP_NEWCLIENT 20 /* Announces the presence of a new + * client. */ +#define REP_NEWFILE 21 /* Announce a log file change. */ +#define REP_NEWMASTER 22 /* Announces who the master is. */ +#define REP_NEWSITE 23 /* Announces that a site has heard from + * a new site; like NEWCLIENT, but + * indirect. A NEWCLIENT message comes + * directly from the new client while a + * NEWSITE comes indirectly from + * someone who heard about a NEWSITE.*/ +#define REP_PAGE 24 /* Database page. */ +#define REP_PAGE_FAIL 25 /* Requested page does not exist. */ +#define REP_PAGE_MORE 26 /* There are more pages to request. */ +#define REP_PAGE_REQ 27 /* Request for a database page. */ +#define REP_REREQUEST 28 /* Force rerequest. 
*/ +#define REP_START_SYNC 29 /* Tell client to begin syncing a ckp.*/ +#define REP_UPDATE 30 /* Environment hotcopy information. */ +#define REP_UPDATE_REQ 31 /* Request for hotcopy information. */ +#define REP_VERIFY 32 /* A log record for verification. */ +#define REP_VERIFY_FAIL 33 /* The client is outdated. */ +#define REP_VERIFY_REQ 34 /* Request for a log record to + * verify. */ +#define REP_VOTE1 35 /* Send out your information for an + * election. */ +#define REP_VOTE2 36 /* Send a "you are master" vote. */ /* * Maximum message number for conversion tables. Update this * value as the largest message number above increases. @@ -90,7 +106,7 @@ extern "C" { * NOTE: When changing messages above, the two tables for upgrade support * need adjusting. They are in rep_util.c. */ -#define REP_MAX_MSG 31 +#define REP_MAX_MSG 36 /* * This is the list of client-to-client requests messages. @@ -99,6 +115,8 @@ extern "C" { */ #define REP_MSG_REQ(rectype) \ (rectype == REP_ALL_REQ || \ + rectype == REP_BLOB_ALL_REQ || \ + rectype == REP_BLOB_CHUNK_REQ || \ rectype == REP_LOG_REQ || \ rectype == REP_PAGE_REQ || \ rectype == REP_VERIFY_REQ) @@ -125,6 +143,9 @@ extern "C" { #define DB_LOGVERSION_51 17 #define DB_LOGVERSION_52 18 #define DB_LOGVERSION_53 19 +#define DB_LOGVERSION_60 20 +#define DB_LOGVERSION_60p1 21 +#define DB_LOGVERSION_61 22 #define DB_LOGVERSION_MIN DB_LOGVERSION_44 #define DB_REPVERSION_INVALID 0 #define DB_REPVERSION_44 3 @@ -132,11 +153,12 @@ extern "C" { #define DB_REPVERSION_46 4 #define DB_REPVERSION_47 5 #define DB_REPVERSION_48 5 -#define DB_REPVERSION_50 5 #define DB_REPVERSION_51 5 #define DB_REPVERSION_52 6 #define DB_REPVERSION_53 7 -#define DB_REPVERSION DB_REPVERSION_53 +#define DB_REPVERSION_60 7 +#define DB_REPVERSION_61 8 +#define DB_REPVERSION DB_REPVERSION_61 #define DB_REPVERSION_MIN DB_REPVERSION_44 /* @@ -204,9 +226,20 @@ extern "C" { #define REP_INITVERSION 3 /* + * View/partial replication file name. + * The file is empty. 
It exists as a permanent indicator that this + * environment can never be master. + */ +#define REPVIEW "__db.rep.view" +#define IS_VIEW_SITE(env) \ + (REP_ON(env) && \ + ((env)->rep_handle->region->stat.st_view != 0)) + +/* * Database types for __rep_client_dbinit */ typedef enum { + REP_BLOB, /* Blob file database. */ REP_DB, /* Log record database. */ REP_PG /* Pg database. */ } repdb_t; @@ -239,7 +272,7 @@ typedef enum { typedef enum { SYNC_OFF, /* No recovery. */ SYNC_LOG, /* Recovery - log. */ - SYNC_PAGE, /* Recovery - pages. */ + SYNC_PAGE, /* Recovery - pages and blobs. */ SYNC_UPDATE, /* Recovery - update. */ SYNC_VERIFY /* Recovery - verify. */ } repsync_t; @@ -346,6 +379,17 @@ typedef struct __rep { /* SHARED */ u_int32_t first_vers; /* Log version of first log file. */ DB_LSN last_lsn; /* Latest LSN we need. */ /* These are protected by mtx_clientdb. */ + db_seq_t gap_bl_hi_id; /* Last id in the blob gap. */ + db_seq_t gap_bl_hi_sid; /* Last sid in the blob gap. */ + off_t gap_bl_hi_off; /* Last offset in the blob gap. */ + db_seq_t last_blob_id; /* Last id on the list to process. */ + db_seq_t last_blob_sid; /* Last sid on the list to process. */ + db_seq_t prev_blob_id; /* Previous last id on list. */ + db_seq_t prev_blob_sid; /* Previous last sid on list. */ + db_seq_t highest_id; /* Highest file id to request. */ + u_int32_t blob_more_files;/* More blob files to be processed. */ + int blob_sync; /* Currently handling blobs. */ + int blob_rereq; /* When to rereq a blob update msg. */ db_timespec last_pg_ts; /* Last page stored timestamp. */ db_pgno_t ready_pg; /* Next pg expected. */ db_pgno_t waiting_pg; /* First pg after gap. */ @@ -391,11 +435,13 @@ typedef struct __rep { /* SHARED */ roff_t siteinfo_off; /* Offset of site array region. */ u_int site_cnt; /* Array slots in use. */ u_int site_max; /* Total array slots allocated. */ + u_int sites_avail; /* Total number of available sites. */ int self_eid; /* Where to find the local site. 
*/ u_int siteinfo_seq; /* Number of updates to this info. */ u_int32_t min_log_file; /* Earliest log needed by repgroup. */ pid_t listener; + u_int listener_nthreads; /* # of msg threads in listener. */ int perm_policy; db_timeout_t ack_timeout; @@ -403,6 +449,11 @@ typedef struct __rep { /* SHARED */ db_timeout_t connection_retry_wait; db_timeout_t heartbeat_frequency; /* Max period between msgs. */ db_timeout_t heartbeat_monitor_timeout; + u_int32_t inqueue_max_gbytes; + u_int32_t inqueue_max_bytes; + u_int32_t inqueue_rz_gbytes; + u_int32_t inqueue_rz_bytes; + u_int32_t inqueue_full_event_on; #endif /* HAVE_REPLICATION_THREADS */ /* Statistics. */ @@ -419,12 +470,16 @@ typedef struct __rep { /* SHARED */ #define REP_C_2SITE_STRICT 0x00001 /* Don't cheat on elections. */ #define REP_C_AUTOINIT 0x00002 /* Auto initialization. */ #define REP_C_AUTOROLLBACK 0x00004 /* Discard client txns: sync. */ -#define REP_C_BULK 0x00008 /* Bulk transfer. */ -#define REP_C_DELAYCLIENT 0x00010 /* Delay client sync-up. */ -#define REP_C_ELECTIONS 0x00020 /* Repmgr to use elections. */ -#define REP_C_INMEM 0x00040 /* In-memory replication. */ -#define REP_C_LEASE 0x00080 /* Leases configured. */ -#define REP_C_NOWAIT 0x00100 /* Immediate error return. */ +#define REP_C_AUTOTAKEOVER 0x00008 /* Auto listener take over. */ +#define REP_C_BULK 0x00010 /* Bulk transfer. */ +#define REP_C_DELAYCLIENT 0x00020 /* Delay client sync-up. */ +#define REP_C_ELECT_LOGLENGTH 0x00040 /* Log length wins election. */ +#define REP_C_ELECTIONS 0x00080 /* Repmgr to use elections. */ +#define REP_C_INMEM 0x00100 /* In-memory replication. */ +#define REP_C_LEASE 0x00200 /* Leases configured. */ +#define REP_C_NOWAIT 0x00400 /* Immediate error return. */ +#define REP_C_PREFMAS_CLIENT 0x00800 /* Preferred master client. */ +#define REP_C_PREFMAS_MASTER 0x01000 /* Preferred master site. */ u_int32_t config; /* Configuration flags. */ /* Election. 
*/ @@ -455,15 +510,17 @@ typedef struct __rep { /* SHARED */ #define REP_F_CLIENT 0x00000008 /* Client replica. */ #define REP_F_DELAY 0x00000010 /* Delaying client sync-up. */ #define REP_F_GROUP_ESTD 0x00000020 /* Rep group is established. */ -#define REP_F_INUPDREQ 0x00000040 /* Thread in rep_update_req. */ -#define REP_F_LEASE_EXPIRED 0x00000080 /* Leases guaranteed expired. */ -#define REP_F_MASTER 0x00000100 /* Master replica. */ -#define REP_F_MASTERELECT 0x00000200 /* Master elect. */ -#define REP_F_NEWFILE 0x00000400 /* Newfile in progress. */ -#define REP_F_NIMDBS_LOADED 0x00000800 /* NIMDBs are materialized. */ -#define REP_F_SKIPPED_APPLY 0x00001000 /* Skipped applying a record. */ -#define REP_F_START_CALLED 0x00002000 /* Rep_start called. */ -#define REP_F_SYS_DB_OP 0x00004000 /* Operation in progress. */ +#define REP_F_HOLD_GEN 0x00000040 /* PrefMas startup hold gen. */ +#define REP_F_INUPDREQ 0x00000080 /* Thread in rep_update_req. */ +#define REP_F_LEASE_EXPIRED 0x00000100 /* Leases guaranteed expired. */ +#define REP_F_MASTER 0x00000200 /* Master replica. */ +#define REP_F_MASTERELECT 0x00000400 /* Master elect. */ +#define REP_F_NEWFILE 0x00000800 /* Newfile in progress. */ +#define REP_F_NIMDBS_LOADED 0x00001000 /* NIMDBs are materialized. */ +#define REP_F_READONLY_MASTER 0x00002000 /* PrefMas readonly master. */ +#define REP_F_SKIPPED_APPLY 0x00004000 /* Skipped applying a record. */ +#define REP_F_START_CALLED 0x00008000 /* Rep_start called. */ +#define REP_F_SYS_DB_OP 0x00010000 /* Operation in progress. */ u_int32_t flags; } REP; @@ -525,7 +582,7 @@ do { \ /* * REP_F_EPHASE0 is not a *real* election phase. It is used for * master leases and allowing the client to find the master or - * expire its lease. However, EPHASE0 is cleared by __rep_elect_done. + * expire its lease. 
*/ #define IN_ELECTION(R) \ FLD_ISSET((R)->elect_flags, REP_E_PHASE1 | REP_E_PHASE2) @@ -594,6 +651,22 @@ do { \ } while (0) +/* Macros to determine current replication configuration options. */ +#define REP_CONFIG_IS_SET(env, flags) \ + (REP_ON(env) ? \ + FLD_ISSET(((env)->rep_handle->region)->config, flags) : \ + FLD_ISSET(((env)->rep_handle)->config, flags)) +#ifdef HAVE_REPLICATION_THREADS +#define PREFMAS_IS_SET(env) \ + (REP_CONFIG_IS_SET(env, \ + (REP_C_PREFMAS_MASTER | REP_C_PREFMAS_CLIENT))) +#else +#define PREFMAS_IS_SET(env) 0 +#endif +#define IS_PREFMAS_MODE(env) \ + (REP_ON(env) && PREFMAS_IS_SET(env) && \ + ((env)->rep_handle->region)->config_nsites < 3) + /* * Gap processing flags. These provide control over the basic * gap processing algorithm for some special cases. @@ -603,11 +676,28 @@ do { \ /* REREQUEST is a superset of FORCE. */ /* + * Internal options for rep_start_int(). These are used by preferred master + * mode to help coordinate between the sites during changes of master. + */ +#define REP_START_FORCE_ROLECHG 0x001 /* Force role change to advance gen. */ +#define REP_START_HOLD_CLIGEN 0x002 /* Hold client gen before doing + * lsnhist match. */ +#define REP_START_WAIT_LOCKMSG 0x004 /* Wait for REP_LOCKOUT_MSG. */ + +/* * Flags indicating what kind of record we want to back up to, in the log. */ -#define REP_REC_COMMIT 0x001 /* Most recent commit record. */ -#define REP_REC_PERM 0x002 /* Most recent perm record. */ +#define REP_REC_COMMIT 0x001 /* Most recent commit record. */ +#define REP_REC_PERM 0x002 /* Most recent perm record. */ /* PERM is a superset of COMMIT. */ +#define REP_REC_PERM_DEL 0x004 /* Most recent PERM, or fail if a + * file delete is found first. */ + +/* + * Permanent record types. + */ +#define IS_PERM_RECTYPE(rectype) \ + ((rectype) == DB___txn_regop || (rectype) == DB___txn_ckp) /* * Basic pre/post-amble processing. @@ -692,7 +782,7 @@ do { \ * machine instruction. 
A single 32-bit integer value is safe without a * mutex, but most other types of value should use a mutex. * - * Any use of a mutex must be inside a matched pair of ENV_ENTER() and + * Use of a db_mutex_t mutex must be inside a matched pair of ENV_ENTER() and * ENV_LEAVE() macros. This ensures that if a thread dies while holding * a lock (i.e. a mutex), recovery can clean it up so that it does not * indefinitely block other threads. @@ -727,6 +817,9 @@ struct __db_rep { /* * End of shared configuration information. */ + int (*partial) /* View/partial replication function. */ + __P((DB_ENV *, const char *, int *, u_int32_t)); + int (*send) /* Send function. */ __P((DB_ENV *, const DBT *, const DBT *, const DB_LSN *, int, u_int32_t)); @@ -745,6 +838,7 @@ struct __db_rep { DB_MPOOLFILE *file_mpf; /* Mpoolfile for current database. */ DB *file_dbp; /* This file's page info. */ DBC *queue_dbc; /* Dbc for a queue file. */ + DB *blob_dbp; /* Blob file database. */ /* * Please change __rep_print_all (rep_stat.c) to track any changes made @@ -759,6 +853,7 @@ struct __db_rep { /* * Replication Framework (repmgr) per-process information. */ + int config_nthreads;/* Configured msg processing threads. */ u_int nthreads; /* Msg processing threads. */ u_int athreads; /* Space allocated for msg threads. */ u_int non_rep_th; /* Threads in GMDB or channel msgs. */ @@ -771,10 +866,13 @@ struct __db_rep { db_timeout_t connection_retry_wait; db_timeout_t heartbeat_frequency; /* Max period between msgs. */ db_timeout_t heartbeat_monitor_timeout; + u_int32_t inqueue_max_gbytes; + u_int32_t inqueue_max_bytes; /* Thread synchronization. */ REPMGR_RUNNABLE *selector, **messengers, **elect_threads; REPMGR_RUNNABLE *preferred_elect_thr; + REPMGR_RUNNABLE *takeover_thread; db_timespec repstart_time; mgr_mutex_t *mutex; cond_var_t check_election, gmdb_idle, msg_avail; @@ -799,12 +897,18 @@ struct __db_rep { CONNECTION_LIST connections; RETRY_Q_HEADER retries; /* Sites needing connection retry. 
*/ struct { - int size; + u_int32_t gbytes; + u_int32_t bytes; STAILQ_HEAD(__repmgr_q_header, __repmgr_message) header; } input_queue; socket_t listen_fd; db_timespec last_bcast; /* Time of last broadcast msg. */ + db_timespec last_hbeat; /* Time of last heartbeat (prefmas). */ + db_timespec l_listener_chk; /* Time to check local listener. */ + db_timeout_t l_listener_wait;/* Timeout to check local listener. */ + db_timespec m_listener_chk; /* Time to check master listener. */ + db_timeout_t m_listener_wait;/* Timeout to check master listener. */ /* * Status of repmgr. It is ready when repmgr is not yet started. It @@ -813,12 +917,15 @@ struct __db_rep { */ enum { ready, running, stopped } repmgr_status; int new_connection; /* Since last master seek attempt. */ + int demotion_pending; /* We're being demoted to a view. */ int takeover_pending; /* We've been elected master. */ + int rejoin_pending; /* Join group retry after rejection. */ int gmdb_busy; int client_intent; /* Will relinquish master role. */ int gmdb_dirty; int have_gmdb; int seen_repmsg; + int view_mismatch; /* View callback and gmdb don't match. */ /* * Flag to show what kind of transaction is currently in progress. @@ -854,6 +961,16 @@ struct __db_rep { u_int8_t *restored_list; size_t restored_list_length; + /* + * Preferred master mode indicator for a pending action. A + * master_switch is initiated when the preferred master site is + * ready to take over as master. A start_temp_master is initiated + * when the client site needs to start as the temporary master. + */ + enum { no_action, master_switch, start_temp_master } prefmas_pending; + /* The LSN at the very beginning of preferred master site startup. */ + DB_LSN prefmas_init_lsn; + /* Application's message dispatch call-back function. 
*/ void (*msg_dispatch) __P((DB_ENV *, DB_CHANNEL *, DBT *, u_int32_t, u_int32_t)); @@ -920,6 +1037,10 @@ struct __db_rep { } else if (!F_ISSET((env)->rep_handle, DBREP_APP_REPMGR)) \ F_SET((env)->rep_handle, DBREP_APP_BASEAPI); \ } while (0) +#define ADJUST_AUTOTAKEOVER_WAITS(db_rep, timeout) do { \ + (db_rep)->l_listener_wait = timeout; \ + (db_rep)->m_listener_wait = 3 * timeout; \ +} while (0) #else /* @@ -935,6 +1056,9 @@ struct __db_rep { #define APP_SET_BASEAPI(env) do { \ ; \ } while (0) +#define ADJUST_AUTOTAKEOVER_WAITS(db_rep, timeout) do { \ + ; \ +} while (0) #endif /* HAVE_REPLICATION_THREADS */ /* @@ -945,22 +1069,27 @@ struct __db_rep { * compatibility with old versions, these values must be reserved explicitly in * the list of flag values (below) */ -#define DB_LOG_PERM_42_44 0x20 -#define DB_LOG_RESEND_42_44 0x40 -#define REPCTL_INIT_45 0x02 /* Back compatible flag value. */ - -#define REPCTL_ELECTABLE 0x01 /* Upgraded client is electable. */ -#define REPCTL_FLUSH 0x02 /* Record should be flushed. */ -#define REPCTL_GROUP_ESTD 0x04 /* Message from site in a group. */ -#define REPCTL_INIT 0x08 /* Internal init message. */ -#define REPCTL_LEASE 0x10 /* Lease related message.. */ +#define DB_LOG_PERM_42_44 0x020 +#define DB_LOG_RESEND_42_44 0x040 +#define REPCTL_INIT_45 0x002 /* Back compatible flag value. */ + +/* + * Add new REPCTL flags to the end of this list to preserve compatibility + * with old versions. + */ +#define REPCTL_ELECTABLE 0x001 /* Upgraded client is electable. */ +#define REPCTL_FLUSH 0x002 /* Record should be flushed. */ +#define REPCTL_GROUP_ESTD 0x004 /* Message from site in a group. */ +#define REPCTL_INIT 0x008 /* Internal init message. */ +#define REPCTL_LEASE 0x010 /* Lease related message. */ /* * Skip over reserved values 0x20 * and 0x40, as explained above. */ -#define REPCTL_LOG_END 0x80 /* Approximate end of group-wide log. */ +#define REPCTL_LOG_END 0x080 /* Approximate end of group-wide log. 
*/ #define REPCTL_PERM DB_LOG_PERM_42_44 #define REPCTL_RESEND DB_LOG_RESEND_42_44 +#define REPCTL_INMEM_ONLY 0x100 /* In-memory databases only. */ /* * File info flags for internal init. The per-database (i.e., file) flag @@ -1094,6 +1223,20 @@ typedef struct { DBT *objs; } linfo_t; +/* + * Used to store information on the child transaction that opens a blob meta + * database. In partial replication processing the child transaction of the + * blob meta database must be delayed until after processing the child + * transaction that opens the database that owns the BMD. + */ +typedef struct { + db_seq_t blob_file_id; + DB_LSN lsn; + u_int32_t child; + void *next; + void *prev; +} DELAYED_BLOB_LIST; + #if defined(__cplusplus) } #endif diff --git a/src/dbinc/repmgr.h b/src/dbinc/repmgr.h index d8fd199c..a38defa2 100644 --- a/src/dbinc/repmgr.h +++ b/src/dbinc/repmgr.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2006, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2006, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -47,20 +47,29 @@ extern "C" { * In protocol version one there were only three message types: 1, 2, and 3; so * 3 was the max. In protocol version 2 we introduced heartbeats, type 4. * (Protocol version 3 did not introduce any new message types.) In version 4 - * we introduced a few more new message types, the largest of which had value 7. + * we introduced a few more new message types, the largest of which had value 8. + * Protocol version 5 did not introduce any new message types, but changed + * the format of site info and membership data to support views. + * + * Protocol version 6 introduced preferred master mode, which added several + * new REPMGR_OWN messages. 
*/ #define REPMGR_MAX_V1_MSG_TYPE 3 #define REPMGR_MAX_V2_MSG_TYPE 4 #define REPMGR_MAX_V3_MSG_TYPE 4 #define REPMGR_MAX_V4_MSG_TYPE 8 +#define REPMGR_MAX_V5_MSG_TYPE 8 +#define REPMGR_MAX_V6_MSG_TYPE 8 #define HEARTBEAT_MIN_VERSION 2 #define CHANNEL_MIN_VERSION 4 #define CONN_COLLISION_VERSION 4 #define GM_MIN_VERSION 4 #define OWN_MIN_VERSION 4 +#define VIEW_MIN_VERSION 5 +#define PREFMAS_MIN_VERSION 6 /* The range of protocol versions we're willing to support. */ -#define DB_REPMGR_VERSION 4 +#define DB_REPMGR_VERSION 6 #define DB_REPMGR_MIN_VERSION 1 /* @@ -73,18 +82,30 @@ extern "C" { * Like the message format types, these message type values should be * permanently frozen. */ -#define REPMGR_CONNECT_REJECT 1 -#define REPMGR_GM_FAILURE 2 -#define REPMGR_GM_FORWARD 3 -#define REPMGR_JOIN_REQUEST 4 -#define REPMGR_JOIN_SUCCESS 5 -#define REPMGR_PARM_REFRESH 6 -#define REPMGR_REJOIN 7 -#define REPMGR_REMOVE_REQUEST 8 -#define REPMGR_REMOVE_SUCCESS 9 -#define REPMGR_RESOLVE_LIMBO 10 -#define REPMGR_SHARING 11 - +#define REPMGR_CONNECT_REJECT 1 +#define REPMGR_GM_FAILURE 2 +#define REPMGR_GM_FORWARD 3 +#define REPMGR_JOIN_REQUEST 4 +#define REPMGR_JOIN_SUCCESS 5 +#define REPMGR_PARM_REFRESH 6 +#define REPMGR_REJOIN 7 +#define REPMGR_REMOVE_REQUEST 8 +#define REPMGR_REMOVE_SUCCESS 9 +#define REPMGR_RESOLVE_LIMBO 10 +#define REPMGR_SHARING 11 +#define REPMGR_LSNHIST_REQUEST 12 +#define REPMGR_LSNHIST_RESPONSE 13 +#define REPMGR_PREFMAS_FAILURE 14 +#define REPMGR_PREFMAS_SUCCESS 15 +#define REPMGR_READONLY_MASTER 16 +#define REPMGR_READONLY_RESPONSE 17 +#define REPMGR_RESTART_CLIENT 18 + +/* Detect inconsistencies between view callback and site's gmdb. 
*/ +#define PARTICIPANT_TO_VIEW(db_rep, site) \ + ((db_rep)->partial && !FLD_ISSET((site)->gmdb_flags, SITE_VIEW)) +#define VIEW_TO_PARTICIPANT(db_rep, site) \ + (!(db_rep)->partial && FLD_ISSET((site)->gmdb_flags, SITE_VIEW)) struct __repmgr_connection; typedef struct __repmgr_connection REPMGR_CONNECTION; @@ -98,7 +119,8 @@ struct __cond_waiters_table; typedef struct __cond_waiters_table COND_WAITERS_TABLE; /* Current Group Membership DB format ID. */ -#define REPMGR_GMDB_FMT_VERSION 1 +#define REPMGR_GMDB_FMT_VERSION 2 +#define REPMGR_GMDB_FMT_MIN_VERSION 1 #ifdef DB_WIN32 typedef SOCKET socket_t; @@ -151,6 +173,17 @@ typedef char SITE_STRING_BUFFER[MAX_SITE_LOC_STRING+1]; #define DB_REPMGR_DEFAULT_ELECTION_RETRY (10 * US_PER_SEC) #define DB_REPMGR_DEFAULT_CHANNEL_TIMEOUT (5 * US_PER_SEC) +/* Default preferred master automatic configuration values. */ +#define DB_REPMGR_PREFMAS_ELECTION_RETRY (1 * US_PER_SEC) +#define DB_REPMGR_PREFMAS_HEARTBEAT_MONITOR (2 * US_PER_SEC) +#define DB_REPMGR_PREFMAS_HEARTBEAT_SEND (75 * (US_PER_SEC / 100)) +#define DB_REPMGR_PREFMAS_PRIORITY_CLIENT 75 +#define DB_REPMGR_PREFMAS_PRIORITY_MASTER 200 + +/* Defaults for undocumented incoming queue maximum messages. */ +#define DB_REPMGR_DEFAULT_INQUEUE_MAX (100 * MEGABYTE) +#define DB_REPMGR_INQUEUE_REDZONE_PERCENT 85 + typedef TAILQ_HEAD(__repmgr_conn_list, __repmgr_connection) CONNECTION_LIST; typedef STAILQ_HEAD(__repmgr_out_q_head, __queued_output) OUT_Q_HEADER; typedef TAILQ_HEAD(__repmgr_retry_q, __repmgr_retry) RETRY_Q_HEADER; @@ -170,14 +203,20 @@ struct __repmgr_runnable { /* * Options governing requested behavior of election thread. */ -#define ELECT_F_EVENT_NOTIFY 0x01 /* Notify application of master failure. */ -#define ELECT_F_FAST 0x02 /* First election "fast" (n-1 trick). */ -#define ELECT_F_IMMED 0x04 /* Start with immediate election. */ -#define ELECT_F_INVITEE 0x08 /* Honor (remote) inviter's nsites. */ -#define ELECT_F_STARTUP 0x10 /* Observe repmgr_start() policy. 
*/ +#define ELECT_F_CLIENT_RESTART 0x01 /* Do client restarts but no elections. */ +#define ELECT_F_EVENT_NOTIFY 0x02 /* Notify application of master failure. */ +#define ELECT_F_FAST 0x04 /* First election "fast" (n-1 trick). */ +#define ELECT_F_IMMED 0x08 /* Start with immediate election. */ +#define ELECT_F_INVITEE 0x10 /* Honor (remote) inviter's nsites. */ +#define ELECT_F_STARTUP 0x20 /* Observe repmgr_start() policy. */ u_int32_t flags; - int eid; /* For Connector thread. */ + /* For connector thread. */ + struct { + int eid; +#define CONNECT_F_REFRESH 0x01 /* New connection to replace old one. */ + u_int32_t flags; + } conn_th; /* * Args for other thread types can be added here in the future @@ -265,6 +304,7 @@ struct __queued_output { */ typedef struct __repmgr_message { STAILQ_ENTRY(__repmgr_message) entries; + size_t size; __repmgr_msg_hdr_args msg_hdr; union { struct { @@ -343,6 +383,7 @@ struct __repmgr_connection { #define CONN_PARAMETERS 5 /* Awaiting parameters handshake. */ #define CONN_READY 6 /* Everything's fine. */ int state; + u_int32_t auto_takeover;/* Connection to remote listener candidate. */ /* * Input: while we're reading a message, we keep track of what phase @@ -464,6 +505,8 @@ typedef struct { SITEADDR addr; /* Unprocessed network address of site. */ u_int32_t config; /* Configuration flags: peer, helper, etc. */ u_int32_t status; /* Group membership status. */ + u_int32_t flags; /* Group membership flags. */ + u_int32_t listener_cand;/* Number of listener candidates of site. */ } SITEINFO; /* @@ -489,6 +532,42 @@ typedef struct { ((u_int)i) < db_rep->site_cnt; \ (int)(++(i)) == db_rep->self_eid ? ++(i) : i) +/* + * Enable replication manager auto listener takeover. + */ +#define HAVE_REPLICATION_LISTENER_TAKEOVER 1 + +/* Listener candidate, that is subordinate rep-aware process. 
*/ +#define IS_LISTENER_CAND(db_rep) \ + (FLD_ISSET((db_rep)->region->config, REP_C_AUTOTAKEOVER) && \ + IS_SUBORDINATE(db_rep) && (db_rep)->repmgr_status == running) + +/* + * The number of listener candidates for each remote site is maintained in + * the listener process and used in subordinate rep-aware processes. + */ +#define SET_LISTENER_CAND(cond, op) \ + do { \ + if (FLD_ISSET(rep->config, REP_C_AUTOTAKEOVER) && \ + !IS_SUBORDINATE(db_rep) && (cond)) { \ + MUTEX_LOCK(env, rep->mtx_repmgr); \ + sites = R_ADDR(env->reginfo, rep->siteinfo_off);\ + (sites[eid].listener_cand)op; \ + MUTEX_UNLOCK(env, rep->mtx_repmgr); \ + } \ + } while (0) + +#define CHECK_LISTENER_CAND(val, op, tval, fval) \ + do { \ + if (IS_LISTENER_CAND(db_rep)) { \ + MUTEX_LOCK(env, rep->mtx_repmgr); \ + sites = R_ADDR(env->reginfo, rep->siteinfo_off);\ + val = ((sites[eid].listener_cand)op) ? \ + (tval) : (fval); \ + MUTEX_UNLOCK(env, rep->mtx_repmgr); \ + } \ + } while (0) + struct __repmgr_site { repmgr_netaddr_t net_addr; @@ -499,12 +578,14 @@ struct __repmgr_site { * host/port network address is promised to be associated with the * locally known EID for the life of the environment. */ - u_int32_t membership; /* Status flags from GMDB. */ + u_int32_t membership; /* Status value from GMDB. */ + u_int32_t gmdb_flags; /* Flags from GMDB. */ u_int32_t config; /* Flags from site->set_config() */ /* * Everything below here is applicable only to remote sites. */ + u_int32_t max_ack_gen; /* Master generation for max_ack. */ DB_LSN max_ack; /* Best ack we've heard from this site. */ int ack_policy; /* Or 0 if unknown. */ u_int16_t alignment; /* Requirements for app channel msgs. */ @@ -604,11 +685,11 @@ struct __channel { * connections may be found: (1) SITE->ref.conn, (2) SITE->sub_conns, and * (3) db_rep->connections. * - * 1. SITE->ref.conn points to our connection with the main process running - * at the given site, if such a connection exists. 
We may have initiated - * the connection to the site ourselves, or we may have received it as an - * incoming connection. Once it is established there is very little - * difference between those two cases. + * 1. SITE->ref.conn points to our connection with the listener process + * running at the given site, if such a connection exists. We may have + * initiated the connection to the site ourselves, or we may have received + * it as an incoming connection. Once it is established there is very + * little difference between those two cases. * * 2. SITE->sub_conns is a list of connections we have with subordinate * processes running at the given site. There can be any number of these @@ -694,6 +775,7 @@ struct __channel { */ #define APP_CHANNEL_CONNECTION 0x02 /* Connection used for app channel. */ #define ELECTABLE_SITE 0x04 +#define REPMGR_AUTOTAKEOVER 0x08 /* Could become main connection. */ #define REPMGR_SUBORDINATE 0x01 /* This is a subordinate connection. */ /* @@ -719,13 +801,20 @@ typedef struct { * As with message formats, stored formats are defined in repmgr.msg. */ /* - * Flags for the Group Membership data portion of a record. Like message type - * codes, these values are frozen across releases, in order to avoid pointless - * churn. + * Status values for the Group Membership data portion of a record. Like + * message type codes, these values are frozen across releases, in order to + * avoid pointless churn. These values are mutually exclusive. */ #define SITE_ADDING 0x01 #define SITE_DELETING 0x02 #define SITE_PRESENT 0x04 +/* + * Flags for the Group Membership data portion of a record. These values are + * also frozen across releases. These values are bit fields and may be OR'ed + * together. + */ +#define SITE_VIEW 0x01 +#define SITE_JOIN_ELECTABLE 0x02 /* * Message types whose processing could take a long time. 
We're careful to @@ -755,9 +844,9 @@ typedef struct { * fraction of the code, it's a tiny fraction of the time: repmgr spends most of * its time in a call to select(), and as well a bit in calls into the Base * replication API. All of those release the mutex. - * Access to repmgr's shared list of site addresses is protected by - * another mutex: mtx_repmgr. And, when changing space allocation for that site - * list we conform to the convention of acquiring renv->mtx_regenv. These are + * Access to repmgr's shared values is protected by another mutex: + * mtx_repmgr. And, when changing space allocation for that site list + * we conform to the convention of acquiring renv->mtx_regenv. These are * less frequent of course. * When it's necessary to acquire more than one of these mutexes, the * ordering priority (or "lock ordering protocol") is: diff --git a/src/dbinc/shqueue.h b/src/dbinc/shqueue.h index 22464462..20e0fae7 100644 --- a/src/dbinc/shqueue.h +++ b/src/dbinc/shqueue.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -140,6 +140,17 @@ struct { \ ((struct type *)(((u_int8_t *)(elm)) + (elm)->field.sle_next))) /* + * __SH_LIST_WAS_EMPTY is private API. SH_LIST_FIRST is not thread-safe; + * the slh_first field could be evaluated multiple times if the optimizer + * does not eliminate the second load. __SH_LIST_WAS_EMPTY tests whether a + * prior call of SH_LIST_FIRSTP occurred while the list was empty; i.e., its + * relative offset was -1. It is thread-safe to call SH_LIST_FIRSTP and then + * test the resulting pointer with __SH_LIST_WAS_EMPTY. + */ +#define __SH_LIST_WAS_EMPTY(head, ptr) \ + ((u_int8_t *)(ptr) == (((u_int8_t *)(head)) + (-1))) + + /* *__SH_LIST_PREV_OFF is private API. 
It calculates the address of * the elm->field.sle_next member of a SH_LIST structure. All offsets * between elements are relative to that point in SH_LIST structures. diff --git a/src/dbinc/tcl_db.h b/src/dbinc/tcl_db.h index 4c56164f..99992467 100644 --- a/src/dbinc/tcl_db.h +++ b/src/dbinc/tcl_db.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -16,7 +16,7 @@ extern "C" { #define MSG_SIZE 100 /* Message size */ enum INFOTYPE { - I_AUX, I_DB, I_DBC, I_ENV, I_LOCK, I_LOGC, I_MP, I_NDBM, I_PG, I_SEQ, I_TXN}; + I_AUX, I_DB, I_DBC, I_DBSTREAM, I_ENV, I_LOCK, I_LOGC, I_MP, I_NDBM, I_PG, I_SEQ, I_TXN}; #define MAX_ID 8 /* Maximum number of sub-id's we need */ #define DBTCL_PREP 64 /* Size of txn_recover preplist */ @@ -24,9 +24,11 @@ enum INFOTYPE { #define DBTCL_DBM 1 #define DBTCL_NDBM 2 -#define DBTCL_GETCLOCK 0 -#define DBTCL_GETLIMIT 1 -#define DBTCL_GETREQ 2 +#define DBTCL_GETCLOCK 0 +#define DBTCL_GETINQUEUE_MAX 1 +#define DBTCL_GETINQUEUE_REDZONE 2 +#define DBTCL_GETLIMIT 3 +#define DBTCL_GETREQ 4 #define DBTCL_MUT_ALIGN 0 #define DBTCL_MUT_INCR 1 @@ -36,9 +38,11 @@ enum INFOTYPE { /* * Data structure to record information about events that have occurred. Tcl - * command "env event_info" can retrieve the information. For now, we record - * only one occurrence per event type; "env event_info -clear" can be used to - * reset the info. + * command "env event_info" can retrieve all the information except the number + * of times, and "env event_count" can retrieve the number of times a specific + * event is fired. We added "env event_count" instead of merging the times + * information into "env event_info" to avoid breaking the existing tests. + * Tcl command "env event_info -clear" can be used to reset the info. 
* * Besides the bit flag that records the fact that an event type occurred, some * event types have associated "info" and we record that here too. When new @@ -47,16 +51,17 @@ enum INFOTYPE { * with the "env event_info" results. */ typedef struct dbtcl_event_info { - u_int32_t events; /* Bit flag on for each event fired. */ - int panic_error; - int newmaster_eid; - int added_eid; - int removed_eid; - pid_t attached_process; - int connected_eid; + u_int32_t events; /* Bit flag on for each event fired. */ + int panic_error; + int newmaster_eid; + int added_eid; + int removed_eid; + pid_t attached_process; + int connected_eid; DB_REPMGR_CONN_ERR conn_broken_info; DB_REPMGR_CONN_ERR conn_failed_try_info; - DB_LSN sync_point; + DB_LSN sync_point; + size_t count[32]; /* The number of times for each event. */ } DBTCL_EVENT_INFO; /* @@ -99,6 +104,7 @@ typedef struct dbtcl_info { DB_LOCK *lock; DB_LOGC *logc; DB_MPOOLFILE *mp; + DB_STREAM *dbsp; DB_TXN *txnp; void *anyp; } un; @@ -128,6 +134,7 @@ typedef struct dbtcl_info { Tcl_Obj *i_isalive; Tcl_Obj *i_part_callback; Tcl_Obj *i_rep_send; + Tcl_Obj *i_rep_view; Tcl_Obj *i_second_call; /* Environment ID for the i_rep_send callback. */ @@ -144,6 +151,7 @@ typedef struct dbtcl_info { #define i_anyp un.anyp #define i_dbp un.dbp #define i_dbcp un.dbcp +#define i_dbsp un.dbsp #define i_envp un.envp #define i_lock un.lock #define i_logc un.logc @@ -170,6 +178,8 @@ typedef struct dbtcl_info { #define i_dbdbcid i_otherid[0] +#define i_dbcdbsid i_otherid[0] + extern int __debug_on, __debug_print, __debug_stop, __debug_test; typedef struct dbtcl_global { @@ -202,6 +212,7 @@ extern DBTCL_GLOBAL __dbtcl_global; * functions this will typically go before the "free" function to free the * stat structure returned by DB. 
*/ +#ifdef HAVE_STATISTICS #define MAKE_STAT_LIST(s, v) do { \ result = _SetListElemInt(interp, res, (s), (long)(v)); \ if (result != TCL_OK) \ @@ -213,6 +224,11 @@ extern DBTCL_GLOBAL __dbtcl_global; if (result != TCL_OK) \ goto error; \ } while (0) +#else +/* These do-nothing versions streamline the code & reduce warning messages. */ +#define MAKE_STAT_LIST(s, v) if (0) goto error +#define MAKE_WSTAT_LIST(s, v) if (0) goto error +#endif /* * MAKE_STAT_LSN appends a {name {LSNfile LSNoffset}} pair to a result list @@ -257,13 +273,14 @@ extern DBTCL_GLOBAL __dbtcl_global; * This macro also assumes a label "error" to go to in the event of a Tcl * error. */ -#define MAKE_SITE_LIST(e, h, p, s, pr) do { \ - myobjc = 5; \ +#define MAKE_SITE_LIST(e, h, p, s, pr, vw) do { \ + myobjc = 6; \ myobjv[0] = Tcl_NewIntObj(e); \ myobjv[1] = Tcl_NewStringObj((h), (int)strlen(h)); \ myobjv[2] = Tcl_NewIntObj((int)p); \ myobjv[3] = Tcl_NewStringObj((s), (int)strlen(s)); \ myobjv[4] = Tcl_NewStringObj((pr), (int)strlen(pr)); \ + myobjv[5] = Tcl_NewStringObj((vw), (int)strlen(vw)); \ thislist = Tcl_NewListObj(myobjc, myobjv); \ result = Tcl_ListObjAppendElement(interp, res, thislist); \ if (result != TCL_OK) \ diff --git a/src/dbinc/txn.h b/src/dbinc/txn.h index 7cbae263..682d7c42 100644 --- a/src/dbinc/txn.h +++ b/src/dbinc/txn.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/dbinc/win_db.h b/src/dbinc/win_db.h index ba57cd1f..e22aba98 100644 --- a/src/dbinc/win_db.h +++ b/src/dbinc/win_db.h @@ -1,17 +1,21 @@ /*- - * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2010, 2015 Oracle and/or its affiliates. All rights reserved. 
* * The following provides the information necessary to build Berkeley * DB on native Windows, and other Windows environments such as MinGW. */ /* - * Berkeley DB requires at least Windows 2000, tell Visual Studio of the - * requirement. + * Berkeley DB requires at least Windows 2000, and Windows XP if we are using + * Visual Studio 2012. Tell Visual Studio of the requirement. */ #ifndef _WIN32_WINNT +#if _MSC_VER >= 1700 +#define _WIN32_WINNT 0x0501 +#else #define _WIN32_WINNT 0x0500 #endif +#endif #ifndef DB_WINCE #include <sys/types.h> @@ -69,12 +73,46 @@ #endif #define getpid GetCurrentProcessId #define snprintf _snprintf +#ifndef strcasecmp #define strcasecmp _stricmp #define strncasecmp _strnicmp +#endif #define vsnprintf _vsnprintf #define h_errno WSAGetLastError() +#ifdef DB_WINCE +/* Macros used by setvbuf on WINCE */ +#ifndef _IOFBF +#define _IOFBF 0x0000 +#endif +#ifndef _IOLBF +#define _IOLBF 0x0040 +#endif +#ifndef _IONBF +#define _IONBF 0x0004 +#endif +/* The macros for time functions */ +#define freopen __ce_freopen +#define gmtime __ce_gmtime +#define mktime __ce_mktime +#define remove __ce_remove +#define SECSPERMIN 60 +#define MINSPERHOUR 60 +#define HOURSPERDAY 24 +#define DAYSPERWEEK 7 +#define DAYSPERNYEAR 365 +#define DAYSPERLYEAR 366 +#define SECSPERHOUR (SECSPERMIN * MINSPERHOUR) +#define SECSPERDAY ((long) SECSPERHOUR * HOURSPERDAY) +#define MONSPERYEAR 12 +#define TM_YEAR_BASE 1900 +#define TM_YEAR_EPOCH 1970 +#define isleap(y) ((((y) % 4) == 0 && ((y) % 100) != 0) || ((y) % 400) == 0) +extern const __DB_IMPORT unsigned int mon_lengths[][MONSPERYEAR]; +extern const __DB_IMPORT unsigned int year_lengths[]; +#endif + /* * Win32 does not have getopt. * diff --git a/src/dbinc/xa.h b/src/dbinc/xa.h index 7283c1ea..7b7e2cb0 100644 --- a/src/dbinc/xa.h +++ b/src/dbinc/xa.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved. 
+ * Copyright (c) 1998, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ |