summaryrefslogtreecommitdiff
path: root/src/third_party/wiredtiger/src/reconcile/rec_dictionary.c
blob: d0cfb69946b595a2e11f2e03332f6a529dbe3bcc (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
/*-
 * Copyright (c) 2014-2019 MongoDB, Inc.
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * __rec_dictionary_skip_search --
 *     Search a dictionary skiplist.
 */
static WT_REC_DICTIONARY *
__rec_dictionary_skip_search(WT_REC_DICTIONARY **head, uint64_t hash)
{
    WT_REC_DICTIONARY **e;
    int i;

    /*
     * Start at the highest skip level, then go as far as possible at each level before stepping
     * down to the next.
     */
    for (i = WT_SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;) {
        if (*e == NULL) { /* Empty levels */
            --i;
            --e;
            continue;
        }

        /*
         * Return any exact matches: we don't care in what search level we found a match.
         */
        if ((*e)->hash == hash) /* Exact match */
            return (*e);
        if ((*e)->hash > hash) { /* Drop down a level */
            --i;
            --e;
        } else /* Keep going at this level */
            e = &(*e)->next[i];
    }
    return (NULL);
}

/*
 * __rec_dictionary_skip_search_stack --
 *     Search a dictionary skiplist, returning an insert/remove stack.
 */
static void
__rec_dictionary_skip_search_stack(
  WT_REC_DICTIONARY **head, WT_REC_DICTIONARY ***stack, uint64_t hash)
{
    WT_REC_DICTIONARY **e;
    int i;

    /*
     * Start at the highest skip level, then go as far as possible at each level before stepping
     * down to the next.
     */
    for (i = WT_SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;)
        if (*e == NULL || (*e)->hash > hash)
            stack[i--] = e--; /* Drop down a level */
        else
            e = &(*e)->next[i]; /* Keep going at this level */
}

/*
 * __rec_dictionary_skip_insert --
 *     Insert an entry into the dictionary skip-list.
 */
static void
__rec_dictionary_skip_insert(WT_REC_DICTIONARY **head, WT_REC_DICTIONARY *e, uint64_t hash)
{
    WT_REC_DICTIONARY **stack[WT_SKIP_MAXDEPTH];
    u_int i;

    /* Insert the new entry into the skiplist. */
    __rec_dictionary_skip_search_stack(head, stack, hash);
    for (i = 0; i < e->depth; ++i) {
        e->next[i] = *stack[i];
        *stack[i] = e;
    }
}

/*
 * __wt_rec_dictionary_init --
 *     Allocate and initialize the dictionary.
 */
int
__wt_rec_dictionary_init(WT_SESSION_IMPL *session, WT_RECONCILE *r, u_int slots)
{
    u_int depth, i;

    /* Free any previous dictionary. */
    __wt_rec_dictionary_free(session, r);

    r->dictionary_slots = slots;
    WT_RET(__wt_calloc(session, r->dictionary_slots, sizeof(WT_REC_DICTIONARY *), &r->dictionary));
    for (i = 0; i < r->dictionary_slots; ++i) {
        depth = __wt_skip_choose_depth(session);
        WT_RET(__wt_calloc(session, 1,
          sizeof(WT_REC_DICTIONARY) + depth * sizeof(WT_REC_DICTIONARY *), &r->dictionary[i]));
        r->dictionary[i]->depth = depth;
    }
    return (0);
}

/*
 * __wt_rec_dictionary_free --
 *     Free the dictionary.
 */
void
__wt_rec_dictionary_free(WT_SESSION_IMPL *session, WT_RECONCILE *r)
{
    u_int i;

    if (r->dictionary == NULL)
        return;

    /*
     * We don't correct dictionary_slots when we fail during allocation, but that's OK, the value is
     * either NULL or a memory reference to be free'd.
     */
    for (i = 0; i < r->dictionary_slots; ++i)
        __wt_free(session, r->dictionary[i]);
    __wt_free(session, r->dictionary);
}

/*
 * __wt_rec_dictionary_reset --
 *     Reset the dictionary when reconciliation restarts and when crossing a page boundary (a
 *     potential split).
 */
void
__wt_rec_dictionary_reset(WT_RECONCILE *r)
{
    if (r->dictionary_slots) {
        r->dictionary_next = 0;
        memset(r->dictionary_head, 0, sizeof(r->dictionary_head));
    }
}

/*
 * __wt_rec_dictionary_lookup --
 *     Check the dictionary for a matching value on this page.
 */
int
__wt_rec_dictionary_lookup(
  WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REC_KV *val, WT_REC_DICTIONARY **dpp)
{
    WT_REC_DICTIONARY *dp, *next;
    uint64_t hash;
    bool match;

    *dpp = NULL;

    /* Search the dictionary, and return any match we find. */
    hash = __wt_hash_fnv64(val->buf.data, val->buf.size);
    for (dp = __rec_dictionary_skip_search(r->dictionary_head, hash);
         dp != NULL && dp->hash == hash; dp = dp->next[0]) {
        WT_RET(__wt_cell_pack_data_match((WT_CELL *)((uint8_t *)r->cur_ptr->image.mem + dp->offset),
          &val->cell, val->buf.data, &match));
        if (match) {
            WT_STAT_DATA_INCR(session, rec_dictionary);
            *dpp = dp;
            return (0);
        }
    }

    /*
     * We're not doing value replacement in the dictionary. We stop adding new entries if we run out
     * of empty dictionary slots (but continue to use the existing entries). I can't think of any
     * reason a leaf page value is more likely to be seen because it was seen more recently than
     * some other value: if we find working sets where that's not the case, it shouldn't be too
     * difficult to maintain a pointer which is the next dictionary slot to re-use.
     */
    if (r->dictionary_next >= r->dictionary_slots)
        return (0);

    /*
     * Set the hash value, we'll add this entry into the dictionary when we write it into the page's
     * disk image buffer (because that's when we know where on the page it will be written).
     */
    next = r->dictionary[r->dictionary_next++];
    next->offset = 0; /* Not necessary, just cautious. */
    next->hash = hash;
    __rec_dictionary_skip_insert(r->dictionary_head, next, hash);
    *dpp = next;
    return (0);
}