summaryrefslogtreecommitdiff
path: root/src/third_party/wiredtiger/src/include/session.h
blob: f9a49ae7030cf73880280ab3634801373a6fc054 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
/*-
 * Copyright (c) 2014-present MongoDB, Inc.
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

/*
 * WT_DATA_HANDLE_CACHE --
 *	Per-session cache of handles to avoid synchronization when opening
 *	cursors. Each entry references one data handle and carries two independent tail-queue
 *	linkages, so a single entry can sit on two queues at the same time.
 */
struct __wt_data_handle_cache {
    WT_DATA_HANDLE *dhandle; /* Data handle referenced by this entry */

    /*
     * NOTE(review): presumably q links the entry into the session's simple handle list and hashq
     * links it into one bucket of the session's hashed handle array; confirm against the code
     * that inserts these entries.
     */
    TAILQ_ENTRY(__wt_data_handle_cache) q;
    TAILQ_ENTRY(__wt_data_handle_cache) hashq;
};

/*
 * WT_HAZARD --
 *	A hazard pointer. In diagnostic builds (HAVE_DIAGNOSTIC) each entry also carries the
 *	function name and line number recorded for it.
 */
struct __wt_hazard {
    WT_REF *ref; /* Page reference */
#ifdef HAVE_DIAGNOSTIC
    const char *func; /* Function/line hazard acquired */
    int line;
#endif
};

/* Map a session handle to its connection implementation, read through iface.connection. */
#define S2C(session) ((WT_CONNECTION_IMPL *)(((WT_SESSION_IMPL *)(session))->iface.connection))

/*
 * Map a session to the btree stored in its current data handle. S2BT dereferences the data handle
 * unconditionally; S2BT_SAFE yields NULL when the session has no current data handle.
 */
#define S2BT(session) ((WT_BTREE *)((session)->dhandle->handle))
#define S2BT_SAFE(session) ((session)->dhandle != NULL ? S2BT(session) : NULL)

/*
 * Map a session to its file system: the bucket storage's file system when the session has bucket
 * storage set, the connection's file system otherwise.
 */
#define S2FS(session)                              \
    ((session)->bucket_storage != NULL ?           \
        (session)->bucket_storage->file_system :   \
        S2C(session)->file_system)

/* Tail-queue head type for lists of struct __wt_cursor entries. */
typedef TAILQ_HEAD(__wt_cursor_list, __wt_cursor) WT_CURSOR_LIST;

/*
 * Number of cursors cached to trigger cursor sweep. NOTE(review): presumably the value the
 * session's cursor_sweep_countdown field counts down from; confirm against the sweep code.
 */
#define WT_SESSION_CURSOR_SWEEP_COUNTDOWN 40

/* Minimum number of buckets to visit during a regular cursor sweep. */
#define WT_SESSION_CURSOR_SWEEP_MIN 5

/* Maximum number of buckets to visit during a regular cursor sweep. */
#define WT_SESSION_CURSOR_SWEEP_MAX 64

/* Invalid session ID: the largest value representable in 32 unsigned bits. */
#define WT_SESSION_ID_INVALID 0xffffffff

/*
 * A fake session ID for when we need to refer to a session that is actually NULL. It is one less
 * than WT_SESSION_ID_INVALID, so the two sentinels are distinct.
 */
#define WT_SESSION_ID_NULL 0xfffffffe

/*
 * WT_SESSION_IMPL --
 *	Implementation of WT_SESSION.
 *
 *	Field order is significant: WT_SESSION_CLEAR_SIZE is the offset of the rnd field, so
 *	every field declared before rnd lies inside that span, while rnd and the fields after it
 *	(the ones documented below as persisting past session close) lie outside it.
 */
struct __wt_session_impl {
    WT_SESSION iface;
    WT_EVENT_HANDLER *event_handler; /* Application's event handlers */

    void *lang_private; /* Language specific private storage */

    void (*format_private)(WT_CURSOR *, int, void *); /* Format test program private callback. */
    void *format_private_arg;

    u_int active; /* Non-zero if the session is in-use */

    const char *name;   /* Name */
    const char *lastop; /* Last operation */
    uint32_t id;        /* UID, offset in session array */

    uint64_t cache_wait_us;        /* Wait time for cache for current operation */
    uint64_t operation_start_us;   /* Operation start */
    uint64_t operation_timeout_us; /* Maximum operation period before rollback */
    u_int api_call_counter;        /* Depth of api calls */

    WT_DATA_HANDLE *dhandle;           /* Current data handle */
    WT_BUCKET_STORAGE *bucket_storage; /* Current bucket storage and file system */

    /*
     * Each session keeps a cache of data handles. The set of handles can grow quite large so we
     * maintain both a simple list and a hash table of lists. The hash table key is based on a hash
     * of the data handle's URI. Though all hash entries are discarded on session close, the hash
     * table list itself is kept in allocated memory that lives across session close - so it is
     * declared further down.
     */
    /* Session handle reference list */
    TAILQ_HEAD(__dhandles, __wt_data_handle_cache) dhandles;
    uint64_t last_sweep;        /* Last sweep for dead handles */
    struct timespec last_epoch; /* Last epoch time returned */

    WT_CURSOR_LIST cursors;          /* Cursors closed with the session */
    u_int ncursors;                  /* Count of active file cursors. */
    uint32_t cursor_sweep_countdown; /* Countdown to cursor sweep */
    uint32_t cursor_sweep_position;  /* Position in cursor_cache for sweep */
    uint64_t last_cursor_big_sweep;  /* Last big sweep for dead cursors */
    uint64_t last_cursor_sweep;      /* Last regular sweep for dead cursors */
    u_int sweep_warning_5min;        /* Whether the session was without sweep for 5 min. */
    u_int sweep_warning_60min;       /* Whether the session was without sweep for 60 min. */

    WT_CURSOR_BACKUP *bkp_cursor; /* Hot backup cursor */

    WT_COMPACT_STATE *compact; /* Compaction information */
    enum { WT_COMPACT_NONE = 0, WT_COMPACT_RUNNING, WT_COMPACT_SUCCESS } compact_state;

    WT_IMPORT_LIST *import_list; /* List of metadata entries to import from file. */

    u_int hs_cursor_counter; /* Number of open history store cursors */

    WT_CURSOR *meta_cursor;  /* Metadata file */
    void *meta_track;        /* Metadata operation tracking */
    void *meta_track_next;   /* Current position */
    void *meta_track_sub;    /* Child transaction / save point */
    size_t meta_track_alloc; /* Currently allocated */
    int meta_track_nest;     /* Nesting level of meta transaction */
/* True when the session's meta_track_next is non-NULL. */
#define WT_META_TRACKING(session) ((session)->meta_track_next != NULL)

    /* Current rwlock for callback. */
    WT_RWLOCK *current_rwlock;
    uint8_t current_rwticket;

    WT_ITEM **scratch;     /* Temporary memory for any function */
    u_int scratch_alloc;   /* Currently allocated */
    size_t scratch_cached; /* Scratch bytes cached */
#ifdef HAVE_DIAGNOSTIC
    /*
     * Variables used to look for violations of the contract that a session is only used by a single
     * thread at once.
     */
    volatile uintmax_t api_tid;
    volatile uint32_t api_enter_refcnt;
    /*
     * It's hard to figure out from where a buffer was allocated after it's leaked, so in diagnostic
     * mode we track them; DIAGNOSTIC can't simply add additional fields to WT_ITEM structures
     * because they are visible to applications, create a parallel structure instead.
     */
    struct __wt_scratch_track {
        const char *func; /* Allocating function, line */
        int line;
    } * scratch_track;
#endif

    /* Record the important timestamps of each stage in a reconciliation. */
    struct __wt_reconcile_timeline {
        uint64_t reconcile_start;
        uint64_t image_build_start;
        uint64_t image_build_finish;
        uint64_t hs_wrapup_start;
        uint64_t hs_wrapup_finish;
        uint64_t reconcile_finish;
        uint64_t total_reentry_hs_eviction_time;
    } reconcile_timeline;

    /*
     * Record the important timestamps of each stage in an eviction. If an eviction takes a long
     * time and times out, we can trace the time usage of each stage from this information.
     */
    struct __wt_evict_timeline {
        uint64_t evict_start;
        uint64_t reentry_hs_evict_start;
        uint64_t reentry_hs_evict_finish;
        uint64_t evict_finish;
        bool reentry_hs_eviction;
    } evict_timeline;

    WT_ITEM err; /* Error buffer */

    WT_TXN_ISOLATION isolation;
    WT_TXN *txn; /* Transaction state */

    void *block_manager; /* Block-manager support */
    int (*block_manager_cleanup)(WT_SESSION_IMPL *);

    const char *hs_checkpoint;     /* History store checkpoint name, during checkpoint cursor ops */
    uint64_t checkpoint_write_gen; /* Write generation override, during checkpoint cursor ops */

    /* Checkpoint handles */
    WT_DATA_HANDLE **ckpt_handle; /* Handle list */
    u_int ckpt_handle_next;       /* Next empty slot */
    size_t ckpt_handle_allocated; /* Bytes allocated */

    /* Named checkpoint drop list, during a checkpoint */
    WT_ITEM *ckpt_drop_list;

    /* Checkpoint time of current checkpoint, during a checkpoint */
    uint64_t current_ckpt_sec;

    /*
     * Operations acting on handles.
     *
     * The preferred pattern is to gather all of the required handles at the beginning of an
     * operation, then drop any other locks, perform the operation, then release the handles. This
     * cannot be easily merged with the list of checkpoint handles because some operations (such as
     * compact) do checkpoints internally.
     */
    WT_DATA_HANDLE **op_handle; /* Handle list */
    u_int op_handle_next;       /* Next empty slot */
    size_t op_handle_allocated; /* Bytes allocated */

    void *reconcile; /* Reconciliation support */
    int (*reconcile_cleanup)(WT_SESSION_IMPL *);

    /* Salvage support. */
    void *salvage_track;

    /* Sessions have an associated statistics bucket based on their ID. */
    u_int stat_bucket;          /* Statistics bucket offset */
    uint64_t cache_max_wait_us; /* Maximum time an operation waits for space in cache */

#ifdef HAVE_DIAGNOSTIC
    uint8_t dump_raw; /* Configure debugging page dump */
#endif

#ifdef HAVE_UNITTEST_ASSERTS
/*
 * Unit testing assertions requires overriding abort logic and instead capturing this information to
 * be checked by the unit test.
 */
/* Size, in bytes, of the unittest_assert_msg buffer. */
#define WT_SESSION_UNITTEST_BUF_LEN 100
    bool unittest_assert_hit;
    char unittest_assert_msg[WT_SESSION_UNITTEST_BUF_LEN];
#endif

/* AUTOMATIC FLAG VALUE GENERATION START 0 */
#define WT_SESSION_LOCKED_CHECKPOINT 0x0001u
#define WT_SESSION_LOCKED_HANDLE_LIST_READ 0x0002u
#define WT_SESSION_LOCKED_HANDLE_LIST_WRITE 0x0004u
#define WT_SESSION_LOCKED_HOTBACKUP_READ 0x0008u
#define WT_SESSION_LOCKED_HOTBACKUP_WRITE 0x0010u
#define WT_SESSION_LOCKED_METADATA 0x0020u
#define WT_SESSION_LOCKED_PASS 0x0040u
#define WT_SESSION_LOCKED_SCHEMA 0x0080u
#define WT_SESSION_LOCKED_SLOT 0x0100u
#define WT_SESSION_LOCKED_TABLE_READ 0x0200u
#define WT_SESSION_LOCKED_TABLE_WRITE 0x0400u
#define WT_SESSION_LOCKED_TURTLE 0x0800u
#define WT_SESSION_NO_SCHEMA_LOCK 0x1000u
    /*AUTOMATIC FLAG VALUE GENERATION STOP 32 */
    uint32_t lock_flags;

/* AUTOMATIC FLAG VALUE GENERATION START 0 */
#define WT_SESSION_BACKUP_CURSOR 0x00001u
#define WT_SESSION_BACKUP_DUP 0x00002u
#define WT_SESSION_CACHE_CURSORS 0x00004u
#define WT_SESSION_CAN_WAIT 0x00008u
#define WT_SESSION_DEBUG_DO_NOT_CLEAR_TXN_ID 0x00010u
#define WT_SESSION_DEBUG_RELEASE_EVICT 0x00020u
#define WT_SESSION_EVICTION 0x00040u
#define WT_SESSION_IGNORE_CACHE_SIZE 0x00080u
#define WT_SESSION_IMPORT 0x00100u
#define WT_SESSION_IMPORT_REPAIR 0x00200u
#define WT_SESSION_INTERNAL 0x00400u
#define WT_SESSION_LOGGING_INMEM 0x00800u
#define WT_SESSION_NO_DATA_HANDLES 0x01000u
#define WT_SESSION_NO_RECONCILE 0x02000u
#define WT_SESSION_QUIET_CORRUPT_FILE 0x04000u
#define WT_SESSION_READ_WONT_NEED 0x08000u
#define WT_SESSION_RESOLVING_TXN 0x10000u
#define WT_SESSION_ROLLBACK_TO_STABLE 0x20000u
#define WT_SESSION_SCHEMA_TXN 0x40000u
    /* AUTOMATIC FLAG VALUE GENERATION STOP 32 */
    uint32_t flags;

/*
 * All of the following fields live at the end of the structure so it's easier to clear everything
 * but the fields that persist.
 */
#define WT_SESSION_CLEAR_SIZE (offsetof(WT_SESSION_IMPL, rnd))

    /*
     * The random number state persists past session close because we don't want to repeatedly use
     * the same values for skiplist depth when the application isn't caching sessions.
     */
    WT_RAND_STATE rnd; /* Random number generation state */

    /*
     * Hash tables are allocated lazily as sessions are used to keep the size of this structure from
     * growing too large.
     */
    WT_CURSOR_LIST *cursor_cache; /* Hash table of cached cursors */

    /* Hashed handle reference list array */
    TAILQ_HEAD(__dhandles_hash, __wt_data_handle_cache) * dhhash;

/* Generations manager */
#define WT_GEN_CHECKPOINT 0 /* Checkpoint generation */
#define WT_GEN_COMMIT 1     /* Commit generation */
#define WT_GEN_EVICT 2      /* Eviction generation */
#define WT_GEN_HAZARD 3     /* Hazard pointer */
#define WT_GEN_SPLIT 4      /* Page splits */
#define WT_GENERATIONS 5    /* Total generation manager entries */
    volatile uint64_t generations[WT_GENERATIONS];

    /*
     * Session memory persists past session close because it's accessed by threads of control other
     * than the thread owning the session. For example, btree splits and hazard pointers can "free"
     * memory that's still in use. In order to eventually free it, it's stashed here with its
     * generation number; when no thread is reading in generation, the memory can be freed for real.
     */
    struct __wt_session_stash {
        struct __wt_stash {
            void *p; /* Memory, length */
            size_t len;
            uint64_t gen; /* Generation */
        } * list;
        size_t cnt;   /* Array entries */
        size_t alloc; /* Allocated bytes */
    } stash[WT_GENERATIONS];

/*
 * Hazard pointers.
 *
 * Hazard information persists past session close because it's accessed by threads of control other
 * than the thread owning the session.
 *
 * Use the non-NULL state of the hazard field to know if the session has previously been
 * initialized.
 */
#define WT_SESSION_FIRST_USE(s) ((s)->hazard == NULL)

/*
 * The hazard pointer array grows as necessary, initialize with 250 slots.
 */
#define WT_SESSION_INITIAL_HAZARD_SLOTS 250
    uint32_t hazard_size;  /* Hazard pointer array slots */
    uint32_t hazard_inuse; /* Hazard pointer array slots in-use */
    uint32_t nhazard;      /* Count of active hazard pointers */
    WT_HAZARD *hazard;     /* Hazard pointer array */

    /*
     * Operation tracking.
     */
    WT_OPTRACK_RECORD *optrack_buf;
    u_int optrackbuf_ptr;
    uint64_t optrack_offset;
    WT_FH *optrack_fh;

    WT_SESSION_STATS stats;
};

/*
 * WT_READING_CHECKPOINT --
 *	True when the session has a current data handle, that handle has WT_DHANDLE_OPEN set, and
 *	WT_DHANDLE_IS_CHECKPOINT holds for it. The NULL test comes first, so the handle is not
 *	dereferenced when the session has none. The argument is expanded more than once, so it
 *	should be free of side effects.
 *
 * Consider moving this to session_inline.h if it ever appears.
 */
#define WT_READING_CHECKPOINT(s)                                       \
    ((s)->dhandle != NULL && F_ISSET((s)->dhandle, WT_DHANDLE_OPEN) && \
      WT_DHANDLE_IS_CHECKPOINT((s)->dhandle))