summaryrefslogtreecommitdiff
path: root/storage/tokudb/PerconaFT/ft/cachetable/cachetable.h
blob: 148326562ab81faec708d47ca757c9b00b74e81d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*======
This file is part of PerconaFT.


Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.

    PerconaFT is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License, version 2,
    as published by the Free Software Foundation.

    PerconaFT is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.

----------------------------------------

    PerconaFT is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License, version 3,
    as published by the Free Software Foundation.

    PerconaFT is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
======= */

#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."

#pragma once

#include <fcntl.h>

#include "ft/logger/logger.h"
#include "ft/serialize/block_table.h"
#include "ft/txn/txn.h"
#include "ft/ft-status.h"
#include "util/minicron.h"

// Maintain a cache mapping from cachekeys to values (void*)
// Some of the keys can be pinned.  Don't pin too many or for too long.
// If the cachetable is too full, it will call the flush_callback() function with the key, the value, and the otherargs
// and then remove the key-value pair from the cache.
// The callback won't be any of the currently pinned keys.
// Also when flushing an object, the cachetable drops all references to it,
// so you may need to free() it.
// Note: The cachetable should use a common pool of memory, flushing things across cachetables.
//  (The first implementation doesn't)
// If you pin something twice, you must unpin it twice.
// table_size is the initial size of the cache table hash table (in number of entries)
// size limit is the upper bound of the sum of size of the entries in the cache table (total number of bytes)

// Key type used to identify entries in the cachetable; an alias for BLOCKNUM.
typedef BLOCKNUM CACHEKEY;

// Handle types. Only forward declarations are visible in this header, so
// clients of the cachetable treat all of these as opaque pointers.
class checkpointer;
typedef class checkpointer *CHECKPOINTER;
typedef struct cachetable *CACHETABLE;
typedef struct cachefile *CACHEFILE;
typedef struct ctpair *PAIR;

// This struct holds information about values stored in the cachetable.
// As one can tell from the field names (leaf, nonleaf, rollback), we are
// probably violating an abstraction layer by naming client concepts here.
//
// The purpose of having this struct is to give the cachetable a way to
// accumulate some totals we are interested in.
// Breaking this abstraction layer by having these names was the
// easiest way.
//
typedef struct pair_attr_s {
    long size; // size PAIR's value takes in memory
    long nonleaf_size; // size if PAIR is a nonleaf node, 0 otherwise, used only for engine status
    long leaf_size; // size if PAIR is a leaf node, 0 otherwise, used only for engine status
    long rollback_size; // size if PAIR is a rollback node, 0 otherwise, used only for engine status
    long cache_pressure_size; // amount PAIR contributes to cache pressure, is sum of buffer sizes and workdone counts
    bool is_valid; // make_pair_attr() sets this to true; NOTE(review): the meaning of false is not visible in this header
} PAIR_ATTR;

// Build a PAIR_ATTR for a value whose in-memory size is `size`.
// Every per-kind counter (nonleaf, leaf, rollback, cache pressure) starts at 0
// and the attribute is marked valid.
static inline PAIR_ATTR make_pair_attr(long size) {
    PAIR_ATTR attr;
    attr.size = size;
    attr.nonleaf_size = 0;
    attr.leaf_size = 0;
    attr.rollback_size = 0;
    attr.cache_pressure_size = 0;
    attr.is_valid = true;
    return attr;
}

// Runtime tunables for the cleaner thread and for partial eviction.
// NOTE(review): the unit of the period and the exact difference between the
// plain and the *_unlocked getters are not visible in this header; presumably
// the *_unlocked variants read the value without taking the lock that the
// plain getter takes. Confirm in the implementation.
void toku_set_cleaner_period (CACHETABLE ct, uint32_t new_period);
uint32_t toku_get_cleaner_period_unlocked (CACHETABLE ct);
void toku_set_cleaner_iterations (CACHETABLE ct, uint32_t new_iterations);
uint32_t toku_get_cleaner_iterations (CACHETABLE ct);
uint32_t toku_get_cleaner_iterations_unlocked (CACHETABLE ct);
void toku_set_enable_partial_eviction (CACHETABLE ct, bool enabled);
bool toku_get_enable_partial_eviction (CACHETABLE ct);

// cachetable operations

// Create and initialize a cache table.
// size_limit is the upper limit on the total size of the values in the table;
// pass 0 if you want the default.
// The toku_cachetable_create() wrapper passes 0 for all three *_pool_threads
// arguments. NOTE(review): presumably 0 selects a default there as well; confirm.
int toku_cachetable_create_ex(CACHETABLE *result, long size_limit,
                           unsigned long client_pool_threads,
                           unsigned long cachetable_pool_threads,
                           unsigned long checkpoint_pool_threads,
                           LSN initial_lsn, struct tokulogger *logger);

// Convenience wrapper around toku_cachetable_create_ex() that passes 0 for the
// three thread-pool sizes.
//   r: CACHETABLE *result   s: size_limit   l: initial_lsn   o: logger
// Expands to a single expression with no trailing semicolon, so it can be used
// in conditions, assignments and if/else bodies; the caller writes the ';'.
#define toku_cachetable_create(r, s, l, o) \
    toku_cachetable_create_ex((r), (s), 0, 0, 0, (l), (o))

// Create a new cachetable.
// Effects: a new cachetable is created and initialized.
// The cachetable pointer is stored into result.
// The sum of the sizes of the memory objects is bounded by size_limit, in whatever
// units make sense to the user of the cachetable.
// Returns: If success, returns 0 and result points to the new cachetable. Otherwise,
// returns an error number.

// Returns a pointer to the checkpointer within the given cachetable.
CHECKPOINTER toku_cachetable_get_checkpointer(CACHETABLE ct);

// What is the cachefile that goes with a particular filenum?
// During a transaction, we cannot reuse a filenum.
// NOTE(review): presumably stores the match through cf and returns 0 when
// found, nonzero otherwise; confirm against the implementation.
int toku_cachefile_of_filenum (CACHETABLE t, FILENUM filenum, CACHEFILE *cf);

// What is the cachefile that goes with a particular iname (relative to env)?
// During a transaction, we cannot reuse an iname.
// NOTE(review): presumably the same result convention as toku_cachefile_of_filenum; confirm.
int toku_cachefile_of_iname_in_env (CACHETABLE ct, const char *iname_in_env, CACHEFILE *cf);

// Get the iname (within the cwd) associated with the cachefile
// Return the filename
// NOTE(review): ownership of the returned string is not stated here; confirm
// before freeing or retaining it.
char *toku_cachefile_fname_in_cwd (CACHEFILE cf);

// Begin a checkpoint using the given checkpointer.
// NOTE(review): locking requirements and what is logged through `logger` are
// not visible in this header; confirm in the implementation.
void toku_cachetable_begin_checkpoint (CHECKPOINTER cp, struct tokulogger *logger);

// End the checkpoint started by toku_cachetable_begin_checkpoint.
// NOTE(review): testcallback_f/testextra look like a test hook (a callback and
// the argument passed to it); when it is invoked is not visible here, confirm.
void toku_cachetable_end_checkpoint(CHECKPOINTER cp, struct tokulogger *logger, 
                                   void (*testcallback_f)(void*),  void * testextra);


// Shuts down the checkpoint thread.
// Requires: the caller holds none of the locks that the checkpoint function takes.
void toku_cachetable_minicron_shutdown(CACHETABLE ct);

// Prepare to close the cachetable.  This informs the cachetable that it is about to be closed
// so that it can tune its checkpoint resource use.
void toku_cachetable_prepare_close(CACHETABLE ct);

// Close the cachetable.
// Effects: All of the memory objects are flushed to disk, and the cachetable is destroyed.
// NOTE(review): takes a pointer to the handle; presumably *ct is cleared on return, confirm.
void toku_cachetable_close(CACHETABLE *ct); 

// Open a file and bind the file to a new cachefile object. (For use by test programs only.)
// NOTE(review): flags and mode are presumably passed on to open(2); confirm.
int toku_cachetable_openf(CACHEFILE *,CACHETABLE, const char *fname_in_env, int flags, mode_t mode);

// Bind a file to a new cachefile object.
// The file is identified by an already-open descriptor fd.
int toku_cachetable_openfd(CACHEFILE *,CACHETABLE, int fd, 
                            const char *fname_relative_to_env);
// Variant of toku_cachetable_openfd that also takes the FILENUM to use.
// NOTE(review): was_open presumably reports whether the file was already bound
// to a cachefile; confirm against the implementation.
int toku_cachetable_openfd_with_filenum (CACHEFILE *,CACHETABLE, int fd, 
                                         const char *fname_in_env,
                                         FILENUM filenum, bool* was_open);

// reserve a unique filenum
FILENUM toku_cachetable_reserve_filenum(CACHETABLE ct);

// Effect: Reserve a fraction of the cachetable memory.
// Returns the amount reserved.
// To return the memory to the cachetable, call toku_cachetable_release_reserved_memory
// Requires 0<fraction<1.
// NOTE(review): upper_bound presumably caps the amount reserved; confirm.
uint64_t toku_cachetable_reserve_memory(CACHETABLE, double fraction, uint64_t upper_bound);
// Give back memory obtained from toku_cachetable_reserve_memory.
// NOTE(review): the uint64_t is presumably the amount that call returned; confirm.
void toku_cachetable_release_reserved_memory(CACHETABLE, uint64_t);

// cachefile operations

// Does an fsync of a cachefile.
// Returns nothing, so failures are not reported to the caller through this interface.
void toku_cachefile_fsync(CACHEFILE cf);

// Cost classification that CACHETABLE_PARTIAL_EVICTION_EST_CALLBACK reports
// through its `cost` out-parameter.
enum partial_eviction_cost {
    PE_CHEAP=0, // running partial eviction is cheap, and can be done on the client thread
    PE_EXPENSIVE=1, // running partial eviction is expensive, and should not be done on the client thread
};

// Whether a cachetable pair is clean or dirty with respect to external memory
// (the cachefile). Passed to the unpin functions and used for dependent pairs.
enum cachetable_dirty {
    CACHETABLE_CLEAN=0, // the cached object is clean WRT the cachefile
    CACHETABLE_DIRTY=1, // the cached object is dirty WRT the cachefile
};

// The flush callback is called when a key value pair is being written to storage and possibly removed from the cachetable.
// When write_me is true, the value should be written to storage.
// When keep_me is false, the value should be freed.
// When for_checkpoint is true, this was a 'pending' write
// Returns: nothing. The callback type is void, so it cannot report an error code.
// NOTE(review): new_size presumably receives the value's attributes after the
// flush, and is_clone presumably marks a value produced by the clone callback; confirm.
// Can access fd (fd is protected by a readlock during call)
typedef void (*CACHETABLE_FLUSH_CALLBACK)(CACHEFILE, int fd, CACHEKEY key, void *value, void **disk_data, void *write_extraargs, PAIR_ATTR size, PAIR_ATTR* new_size, bool write_me, bool keep_me, bool for_checkpoint, bool is_clone);

// The fetch callback is called when a thread is attempting to get and pin a memory
// object and it is not in the cachetable.
// Returns: 0 if success, otherwise an error number.  The address and size of the object
// associated with the key are returned (through value_data and sizep).
// NOTE(review): dirtyp presumably reports whether the fetched object starts out dirty; confirm.
// Can access fd (fd is protected by a readlock during call)
typedef int (*CACHETABLE_FETCH_CALLBACK)(CACHEFILE, PAIR p, int fd, CACHEKEY key, uint32_t fullhash, void **value_data, void **disk_data, PAIR_ATTR *sizep, int *dirtyp, void *read_extraargs);

// The cachetable calls the partial eviction estimate callback to determine whether
// partial eviction is a cheap operation that may be run on the client thread
// or whether partial eviction is expensive and should be done on a background (writer) thread.
// The callback conveys this information by setting cost to either PE_CHEAP or PE_EXPENSIVE.
// If cost is PE_EXPENSIVE, then the callback also sets bytes_freed_estimate
// to return an estimate of the number of bytes it will free
// so that the cachetable can estimate how much data is being evicted on background threads.
// If cost is PE_CHEAP, then the callback does not set bytes_freed_estimate.
typedef void (*CACHETABLE_PARTIAL_EVICTION_EST_CALLBACK)(void *ftnode_pv, void* disk_data, long* bytes_freed_estimate, enum partial_eviction_cost *cost, void *write_extraargs);

// The cachetable calls the partial eviction callback to try to partially evict pieces
// of the PAIR. The callback determines the strategy for what to evict. The callback may choose to free
// nothing, or may choose to free as much as possible. When the partial eviction callback is finished,
// it must call finalize with the new PAIR_ATTR and the given finalize_extra. After this point, the
// write lock will be released on the PAIR and it is no longer safe to operate on any of the passed arguments.
// This is useful for doing expensive cleanup work outside of the PAIR's write lock (such as destroying objects, etc)
//
// on entry, requires a write lock to be held on the PAIR in the cachetable while this function is called
// on exit, the finalize continuation is called
// NOTE(review): the meaning of the int return value is not documented here; confirm.
typedef int (*CACHETABLE_PARTIAL_EVICTION_CALLBACK)(void *ftnode_pv, PAIR_ATTR old_attr, void *write_extraargs,
                                                    void (*finalize)(PAIR_ATTR new_attr, void *extra), void *finalize_extra);

// The cachetable calls this function to determine whether a get_and_pin call requires a partial fetch.
// If this function returns true, then the cachetable will subsequently call
// CACHETABLE_PARTIAL_FETCH_CALLBACK to perform a partial fetch.
// If this function returns false, then the PAIR's value is returned to the caller as is.
//
// An alternative to having this callback is to always call CACHETABLE_PARTIAL_FETCH_CALLBACK, and let
// CACHETABLE_PARTIAL_FETCH_CALLBACK decide whether to do any partial fetching or not.
// There is no particular reason why this alternative was not chosen.
// Requires: a read lock to be held on the PAIR
typedef bool (*CACHETABLE_PARTIAL_FETCH_REQUIRED_CALLBACK)(void *ftnode_pv, void *read_extraargs);

// The cachetable calls the partial fetch callback when a thread needs to read or decompress a subset of a PAIR into memory.
// An example is needing to read a basement node into memory. Another example is decompressing an internal node's
// message buffer. The cachetable determines if a partial fetch is necessary by first calling CACHETABLE_PARTIAL_FETCH_REQUIRED_CALLBACK.
// The new PAIR_ATTR of the PAIR is returned in sizep.
// Can access fd (fd is protected by a readlock during call)
// Returns: 0 if success, otherwise an error number.
typedef int (*CACHETABLE_PARTIAL_FETCH_CALLBACK)(void *value_data, void* disk_data, void *read_extraargs, int fd, PAIR_ATTR *sizep);

// The cachetable calls the put callback during a cachetable_put command to provide the opaque PAIR.
// The PAIR can then be used to later unpin the pair.
// Returns: nothing. The callback type is void, so it cannot report an error code.
typedef void (*CACHETABLE_PUT_CALLBACK)(CACHEKEY key, void *value_data, PAIR p);

// TODO(leif): document the cleaner callback contract (was "XXX TODO XXX").
// NOTE(review): presumably invoked by the cleaner thread (see toku_cleaner_thread)
// on a PAIR identified by blocknum/fullhash; the meaning of the int return value
// is not visible in this header. Confirm in the implementation.
typedef int (*CACHETABLE_CLEANER_CALLBACK)(void *ftnode_pv, BLOCKNUM blocknum, uint32_t fullhash, void *write_extraargs);

// NOTE(review): undocumented upstream. Judging only by the parameter names, this
// presumably produces a copy of value_data in *cloned_value_data and reports the
// clone's size and the original's new attributes; confirm before relying on it.
typedef void (*CACHETABLE_CLONE_CALLBACK)(void* value_data, void** cloned_value_data, long* clone_size, PAIR_ATTR* new_attr, bool for_checkpoint, void* write_extraargs);

// NOTE(review): undocumented upstream; presumably called on a value once its
// checkpoint work has completed. Confirm in the implementation.
typedef void (*CACHETABLE_CHECKPOINT_COMPLETE_CALLBACK)(void *value_data);

// Bundle of the write-side callbacks for a PAIR plus the extra argument handed
// to them. Passed by value to the put and get_and_pin family of functions.
typedef struct {
    CACHETABLE_FLUSH_CALLBACK flush_callback;
    CACHETABLE_PARTIAL_EVICTION_EST_CALLBACK pe_est_callback;
    CACHETABLE_PARTIAL_EVICTION_CALLBACK pe_callback; 
    CACHETABLE_CLEANER_CALLBACK cleaner_callback;
    CACHETABLE_CLONE_CALLBACK clone_callback;
    CACHETABLE_CHECKPOINT_COMPLETE_CALLBACK checkpoint_complete_callback;
    // clone_callback also declares a write_extraargs parameter; presumably it receives this same value (confirm).
    void* write_extraargs; // parameter for flush_callback, pe_est_callback, pe_callback, and cleaner_callback
} CACHETABLE_WRITE_CALLBACK;

// Callback type taken by toku_cachetable_put_with_dep_pairs as get_key_and_fullhash.
// NOTE(review): presumably fills in *cachekey and *fullhash for the new pair,
// with `extra` being the caller's get_key_and_fullhash_extra; confirm.
typedef void (*CACHETABLE_GET_KEY_AND_FULLHASH)(CACHEKEY* cachekey, uint32_t* fullhash, void* extra);

// Callback type taken by the unpin_and_remove functions.
// NOTE(review): presumably lets the client release the key being removed; confirm.
typedef void (*CACHETABLE_REMOVE_KEY)(CACHEKEY* cachekey, bool for_checkpoint, void* extra);

void toku_cachefile_set_userdata(CACHEFILE cf, void *userdata,
    void (*log_fassociate_during_checkpoint)(CACHEFILE, void*),
    void (*close_userdata)(CACHEFILE, int, void*, bool, LSN),
    void (*free_userdata)(CACHEFILE, void*),
    void (*checkpoint_userdata)(CACHEFILE, int, void*),
    void (*begin_checkpoint_userdata)(LSN, void*),
    void (*end_checkpoint_userdata)(CACHEFILE, int, void*),
    void (*note_pin_by_checkpoint)(CACHEFILE, void*),
    void (*note_unpin_by_checkpoint)(CACHEFILE, void*));
// Effect: Store some cachefile-specific user data together with the callbacks that operate on it.
// When the last reference to a cachefile is closed, we call close_userdata().
// Before starting a checkpoint, we call begin_checkpoint_userdata().
// NOTE(review): this line used to name checkpoint_prepare_userdata(), which is
// not a parameter of this function; begin_checkpoint_userdata is presumably the
// callback that was meant. Confirm.
// When the cachefile needs to be checkpointed, we call checkpoint_userdata().
// If userdata is already non-NULL, then we simply overwrite it.

void *toku_cachefile_get_userdata(CACHEFILE);
// Effect: Get the user data stored with toku_cachefile_set_userdata.

CACHETABLE toku_cachefile_get_cachetable(CACHEFILE cf);
// Effect: Get the cachetable that the cachefile belongs to.

void toku_cachetable_swap_pair_values(PAIR old_pair, PAIR new_pair);
// Effect: Swaps the value_data of old_pair and new_pair.
// Requires: both old_pair and new_pair to be pinned with write locks.

// Kind of lock to take on a PAIR when pinning it (see the lock_type parameter
// of the get_and_pin functions).
// NOTE(review): the difference between the CHEAP and EXPENSIVE write locks is
// not explained in this header; confirm at the frwlock layer.
typedef enum {
    PL_READ = 0,
    PL_WRITE_CHEAP,
    PL_WRITE_EXPENSIVE
} pair_lock_type;

// Put something into the cachetable and checkpoint the dependent pairs
// if checkpointing is necessary.
// The key and fullhash are not supplied by the caller; get_key_and_fullhash is
// called to produce them.
// NOTE(review): presumably the chosen key/fullhash are also stored through the
// `key` and `fullhash` out-parameters; confirm.
void toku_cachetable_put_with_dep_pairs(
    CACHEFILE cachefile,
    CACHETABLE_GET_KEY_AND_FULLHASH get_key_and_fullhash,
    void *value,
    PAIR_ATTR attr,
    CACHETABLE_WRITE_CALLBACK write_callback,
    void *get_key_and_fullhash_extra,
    uint32_t num_dependent_pairs, // number of dependent pairs that we may need to checkpoint
    PAIR* dependent_pairs,
    enum cachetable_dirty* dependent_dirty, // array stating dirty/cleanness of dependent pairs
    CACHEKEY* key,
    uint32_t* fullhash,
    CACHETABLE_PUT_CALLBACK put_callback
    );

// Put a memory object into the cachetable.
// Effects: Lookup the key in the cachetable. If the key is not in the cachetable,
// then insert the pair and pin it.  Some of the key
// value pairs may be evicted from the cachetable when the cachetable gets too big.
// NOTE(review): this comment used to say "Otherwise return an error", but the
// function returns void; what happens when the key is already present is not
// visible in this header. Confirm in the implementation.
void toku_cachetable_put(CACHEFILE cf, CACHEKEY key, uint32_t fullhash,
			void *value, PAIR_ATTR size,
			CACHETABLE_WRITE_CALLBACK write_callback,
                        CACHETABLE_PUT_CALLBACK put_callback
                        );

// Get and pin the memory object of a PAIR, and write dependent pairs to disk
// if the dependent pairs are pending a checkpoint.
// Effects: If the memory object is in the cachetable, acquire a PAIR lock on it
// (of the kind given by lock_type).
// Otherwise, fetch it from storage by calling the fetch callback.  If the fetch
// succeeded, add the memory object to the cachetable with a PAIR lock on it.
// Before returning to the user, if the PAIR object being retrieved, or any of the
// dependent pairs passed in as parameters must be written to disk for checkpoint,
// then the required PAIRs are written to disk for checkpoint.
// KEY PROPERTY OF DEPENDENT PAIRS: They are already locked by the client
// Returns: 0 if the memory object is in memory, otherwise an error number.
int toku_cachetable_get_and_pin_with_dep_pairs (
    CACHEFILE cachefile,
    CACHEKEY key,
    uint32_t fullhash,
    void**value,
    long *sizep,
    CACHETABLE_WRITE_CALLBACK write_callback,
    CACHETABLE_FETCH_CALLBACK fetch_callback,
    CACHETABLE_PARTIAL_FETCH_REQUIRED_CALLBACK pf_req_callback,
    CACHETABLE_PARTIAL_FETCH_CALLBACK pf_callback,
    pair_lock_type lock_type,
    void* read_extraargs, // parameter for fetch_callback, pf_req_callback, and pf_callback
    uint32_t num_dependent_pairs, // number of dependent pairs that we may need to checkpoint
    PAIR* dependent_pairs,
    enum cachetable_dirty* dependent_dirty // array stating dirty/cleanness of dependent pairs
    );

// Get and pin a memory object.
// Effects: If the memory object is in the cachetable acquire the PAIR lock on it.
// Otherwise, fetch it from storage by calling the fetch callback.  If the fetch
// succeeded, add the memory object to the cachetable with a read lock on it.
// NOTE(review): the "read lock" wording may predate the may_modify_value
// parameter, which presumably selects a write lock when true; confirm.
// Returns: 0 if the memory object is in memory, otherwise an error number.
int toku_cachetable_get_and_pin (
    CACHEFILE cachefile, 
    CACHEKEY key, 
    uint32_t fullhash, 
    void**value, 
    long *sizep,
    CACHETABLE_WRITE_CALLBACK write_callback,
    CACHETABLE_FETCH_CALLBACK fetch_callback, 
    CACHETABLE_PARTIAL_FETCH_REQUIRED_CALLBACK pf_req_callback,
    CACHETABLE_PARTIAL_FETCH_CALLBACK pf_callback,
    bool may_modify_value,
    void* read_extraargs // parameter for fetch_callback, pf_req_callback, and pf_callback
    );

// Does a partial fetch on a pair that the caller has already pinned.
// pf_callback is the partial fetch callback to run and read_extraargs is the
// extra argument passed to it; cf/key/fullhash identify the pinned pair.
// NOTE(review): which lock type the pin must hold is not stated here; confirm.
void toku_cachetable_pf_pinned_pair(
    void* value,
    CACHETABLE_PARTIAL_FETCH_CALLBACK pf_callback,
    void* read_extraargs,
    CACHEFILE cf,
    CACHEKEY key,
    uint32_t fullhash
    ); 

// Singly linked list of unlock actions handed to
// toku_cachetable_get_and_pin_nonblocking, which calls the functions in the
// list before fetching when the block is not already in the cachetable.
struct unlockers {
    bool       locked;   // NOTE(review): presumably whether this entry still holds its lock; confirm
    void (*f)(void* extra); // unlock function
    void      *extra;    // NOTE(review): presumably the argument passed to f; confirm
    struct unlockers *next; // next entry in the list
};
typedef struct unlockers *UNLOCKERS;

// Effect:  If the block is in the cachetable, then return it.
//   Otherwise call the functions in unlockers, fetch the data (but don't pin it,
//   since we'll just end up pinning it again later), and return TOKUDB_TRY_AGAIN.
// On TOKUDB_TRY_AGAIN the caller's locks have been released through unlockers,
// so the caller is expected to re-acquire them and retry.
int toku_cachetable_get_and_pin_nonblocking (
    CACHEFILE cf,
    CACHEKEY key,
    uint32_t fullhash,
    void**value,
    long *sizep,
    CACHETABLE_WRITE_CALLBACK write_callback,
    CACHETABLE_FETCH_CALLBACK fetch_callback,
    CACHETABLE_PARTIAL_FETCH_REQUIRED_CALLBACK pf_req_callback,
    CACHETABLE_PARTIAL_FETCH_CALLBACK pf_callback,
    pair_lock_type lock_type,
    void *read_extraargs, // parameter for fetch_callback, pf_req_callback, and pf_callback
    UNLOCKERS unlockers
    );

int toku_cachetable_maybe_get_and_pin (CACHEFILE, CACHEKEY, uint32_t /*fullhash*/, pair_lock_type, void**);
// Effect: Maybe get and pin a memory object.
//  This function is similar to the get_and_pin function except that it
//  will not attempt to fetch a memory object that is not in the cachetable or requires any kind of blocking to get it.
// Returns: If the item is already in memory, then return 0 and store it in the
// void**.  If the item is not in memory, then return a nonzero error number.

int toku_cachetable_maybe_get_and_pin_clean (CACHEFILE, CACHEKEY, uint32_t /*fullhash*/, pair_lock_type, void**);
// Effect: Like maybe get and pin, but may pin a clean pair.
// NOTE(review): this implies toku_cachetable_maybe_get_and_pin refuses clean
// pairs; that restriction is not stated on that function. Confirm.

int toku_cachetable_unpin(CACHEFILE, PAIR, enum cachetable_dirty dirty, PAIR_ATTR size);
// Effect: Unpin a memory object
// Modifies: If the memory object is in the cachetable, then OR the dirty flag,
// update the size, and release the read lock on the memory object.
// Returns: 0 if success, otherwise returns an error number.
// Requires: The ct is locked.
// NOTE(review): this "Requires" line and the one on
// toku_cachetable_unpin_ct_prelocked_no_flush look swapped: the "ct_prelocked"
// variant is the one whose name says the ct is already locked. Confirm against
// the implementation before relying on either statement.

int toku_cachetable_unpin_ct_prelocked_no_flush(CACHEFILE, PAIR, enum cachetable_dirty dirty, PAIR_ATTR size);
// Effect: The same as toku_cachetable_unpin, except that the ct must not be locked.
// Requires: The ct is NOT locked.
// NOTE(review): contradicts the "ct_prelocked" in the name; see the note on
// toku_cachetable_unpin. The "no_flush" suffix is not explained here either;
// presumably this variant skips the eviction/flush that a plain unpin may trigger.

int toku_cachetable_unpin_and_remove (CACHEFILE, PAIR, CACHETABLE_REMOVE_KEY, void*); /* Removing something already present is OK. */
// Effect: Remove an object from the cachetable.  Don't write it back.
// Requires: The object must be pinned exactly once.

// Test-only wrapper for toku_cachetable_unpin that identifies the pair by
// CACHEKEY and fullhash instead of by PAIR.
int toku_test_cachetable_unpin(CACHEFILE, CACHEKEY, uint32_t fullhash, enum cachetable_dirty dirty, PAIR_ATTR size);

// Test-only wrapper for toku_cachetable_unpin_ct_prelocked_no_flush that
// identifies the pair by CACHEKEY and fullhash instead of by PAIR.
int toku_test_cachetable_unpin_ct_prelocked_no_flush(CACHEFILE, CACHEKEY, uint32_t fullhash, enum cachetable_dirty dirty, PAIR_ATTR size);

// Test-only wrapper for toku_cachetable_unpin_and_remove that identifies the
// pair by CACHEKEY instead of by PAIR.
int toku_test_cachetable_unpin_and_remove (CACHEFILE, CACHEKEY, CACHETABLE_REMOVE_KEY, void*); /* Removing something already present is OK. */

int toku_cachefile_prefetch(CACHEFILE cf, CACHEKEY key, uint32_t fullhash,
                            CACHETABLE_WRITE_CALLBACK write_callback,
                            CACHETABLE_FETCH_CALLBACK fetch_callback,
                            CACHETABLE_PARTIAL_FETCH_REQUIRED_CALLBACK pf_req_callback,
                            CACHETABLE_PARTIAL_FETCH_CALLBACK pf_callback,
                            void *read_extraargs, // parameter for fetch_callback, pf_req_callback, and pf_callback 
                            bool *doing_prefetch);
// Effect: Prefetch a memory object for a given key into the cachetable
// Precondition: The cachetable mutex is NOT held.
// Postcondition: The cachetable mutex is NOT held.
// Returns: 0 if success
// NOTE(review): doing_prefetch presumably reports whether a prefetch was actually started; confirm.
// Implementation note:
//  1) The pair's rwlock is acquired (for write) (there is not a deadlock here because the rwlock is a pthread_cond_wait using the cachetable mutex).
//  Case A:  Single-threaded.
//    A1)  Call cachetable_fetch_pair, which
//      a) Obtains a readlock on the cachefile's fd (to prevent multiple readers at once)
//      b) Unlocks the cachetable
//      c) Does the fetch off disk.
//      d) Locks the cachetable
//      e) Unlocks the fd lock.
//      f) Unlocks the pair rwlock.
//  Case B: Multithreaded
//      a) Enqueue a cachetable_reader into the workqueue.
//      b) Unlock the cache table.
//      c) The enqueue'd job later locks the cachetable, and calls cachetable_fetch_pair (doing the steps in A1 above).

// NOTE(review): undocumented upstream; presumably checks that no pair in the
// cachetable is pinned. The meaning of the return value is not visible here.
int toku_cachetable_assert_all_unpinned (CACHETABLE);

// NOTE(review): undocumented upstream; presumably returns the number of pinned
// pairs belonging to the cachefile, printing them when printthem is nonzero. Confirm.
int toku_cachefile_count_pinned (CACHEFILE, int /*printthem*/ );

// Close the cachefile.
// Effects: All of the cached object associated with the cachefile are evicted from
// the cachetable.  The flush callback is called for each of these objects.  The
// close function does not return until all of the objects are evicted.  The cachefile
// object is freed.
// If oplsn_valid is true then use oplsn as the LSN of the close instead of asking the logger.
// oplsn_valid being true is only allowed during recovery, and requires that you are removing
// the last reference (otherwise the lsn wouldn't make it in.)
void toku_cachefile_close (CACHEFILE*, bool oplsn_valid, LSN oplsn);

// Return on success (different from pread and pwrite)
//int cachefile_pwrite (CACHEFILE, const void *buf, size_t count, toku_off_t offset);
//int cachefile_pread  (CACHEFILE, void *buf, size_t count, toku_off_t offset);

// Get the file descriptor associated with the cachefile
// Return the file descriptor
// Grabs a read lock protecting the fd
// NOTE(review): how that read lock is released is not stated here; confirm.
int toku_cachefile_get_fd (CACHEFILE);

// Get the iname (within the environment) associated with the cachefile
// Return the filename
// NOTE(review): ownership of the returned string is not stated here; confirm
// before freeing or retaining it.
char * toku_cachefile_fname_in_env (CACHEFILE cf);

// Make it so when the cachefile closes, the underlying file is unlinked
void toku_cachefile_unlink_on_close(CACHEFILE cf);

// is this cachefile marked as unlink on close?
bool toku_cachefile_is_unlink_on_close(CACHEFILE cf);

// Return the logger associated with the cachefile
struct tokulogger *toku_cachefile_logger(CACHEFILE cf);

// Return the filenum associated with the cachefile
FILENUM toku_cachefile_filenum(CACHEFILE cf);

// Effect: Return a 32-bit hash key.  The hash key shall be suitable for using with bitmasking for a table of size power-of-two.
uint32_t toku_cachetable_hash(CACHEFILE cf, CACHEKEY key);

// NOTE(review): undocumented upstream; presumably the full hash (as produced by
// toku_cachetable_hash) of the cachefile's header block. Confirm.
uint32_t toku_cachefile_fullhash_of_header(CACHEFILE cf);

// debug functions

// Print the contents of the cachetable. This is mainly used from gdb
void toku_cachetable_print_state (CACHETABLE ct);

// Get the state of the cachetable. This is used to verify the cachetable
// Results are returned through the four pointer arguments.
void toku_cachetable_get_state(CACHETABLE ct, int *num_entries_ptr, int *hash_size_ptr, long *size_current_ptr, long *size_limit_ptr);

// Get the state of a cachetable entry by key. This is used to verify the cachetable
// NOTE(review): presumably returns 0 when the key is found and fills in the
// pointer arguments, nonzero otherwise; confirm.
int toku_cachetable_get_key_state(CACHETABLE ct, CACHEKEY key, CACHEFILE cf,
                                  void **value_ptr,
				  int *dirty_ptr,
                                  long long *pin_ptr,
                                  long *size_ptr);

// Verify the whole cachetable that the cachefile is in.  Slow.
void toku_cachefile_verify (CACHEFILE cf);

// Verify the cachetable. Slow.
void toku_cachetable_verify (CACHETABLE t);

// Not for use in production, but useful for testing.
// Declared with default symbol visibility.
void toku_cachetable_print_hash_histogram (void) __attribute__((__visibility__("default")));

// NOTE(review): undocumented upstream; presumably triggers eviction when the
// cachetable is over its size limit. Confirm.
void toku_cachetable_maybe_flush_some(CACHETABLE ct);

// for stat64
uint64_t toku_cachefile_size(CACHEFILE cf);

// Fill in the cachetable status structure s.
void toku_cachetable_get_status(CACHETABLE ct, CACHETABLE_STATUS s);

// Set the environment directory used by the cachetable.
// NOTE(review): presumably used to resolve fname_in_env names; confirm.
void toku_cachetable_set_env_dir(CACHETABLE ct, const char *env_dir);
// NOTE(review): variadic; presumably joins `count` path components into one
// newly allocated string. Argument types and ownership are not visible here.
char * toku_construct_full_name(int count, ...);
// NOTE(review): presumably maps a name relative to the env dir to a name
// relative to the cwd; ownership of the returned string is not stated here.
char * toku_cachetable_get_fname_in_cwd(CACHETABLE ct, const char * fname_in_env);

void cachefile_kibbutz_enq (CACHEFILE cf, void (*f)(void*), void *extra);
// Effect: Add a job to the cachetable's collection of work to do.
// Note that function f must call remove_background_job_from_cf() when it is done.

void remove_background_job_from_cf (CACHEFILE cf);
// Effect: When a kibbutz job or cleaner thread finishes in a cachefile,
// the cachetable must be notified; this is the call that notifies it.

// test-only function
int toku_cachetable_get_checkpointing_user_data_status(void);

// test-only function
int toku_cleaner_thread_for_test(CACHETABLE ct);
// NOTE(review): undocumented upstream; presumably the cleaner thread's entry
// point, with cleaner_v being its opaque context. Confirm.
int toku_cleaner_thread(void *cleaner_v);

// test function. Exported in the ydb layer and used by tests that want to run DRD
// The default of 1M is too high for drd tests, so this is a mechanism to set a smaller number.
void toku_pair_list_set_lock_size(uint32_t num_locks);

// Used by ft-ops.cc to figure out if it has the write lock on a pair.
// Pretty hacky and not accurate enough, should be improved at the frwlock
// layer.
//
// Deliberately NOT declared __attribute__((const)) (nor pure): the answer
// depends on lock state reached through `pair`, which changes over time.
// GCC forbids `const` on functions that read data through pointer arguments,
// and with it the compiler may merge or hoist repeated calls and hand back a
// stale result. `nonnull` is kept: callers must not pass a NULL pair.
__attribute__((nonnull))
bool toku_ctpair_is_write_locked(PAIR pair);