summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSergey Poznyakoff <gray@gnu.org>2022-01-06 14:49:30 +0200
committerSergey Poznyakoff <gray@gnu.org>2022-01-06 15:36:10 +0200
commitb8c3d13fd821e90a190cc5cfad3a9e17f18aa416 (patch)
tree1beb84078d59d5800ade4de7972671c2b823b8b7
parent42276af5bd0a48512a23f83db021b6e832c3fb92 (diff)
downloadgdbm-b8c3d13fd821e90a190cc5cfad3a9e17f18aa416.tar.gz
Speed up flushing the bucket cache on disk
The implementation of _gdbm_cache_flush becomes prohibitively inefficient during extensive updates of large databases. The bug was reported at https://github.com/Perl/perl5/issues/19306. To fix it, make sure that all changed cache entries are placed at the head of the cache_mru list, forming a contiguous sequence. This way a potentially long iteration over all cache entries can be cut off at the first entry with ca_changed == FALSE. This commit also gets rid of several superfluous fields in struct gdbm_file_info: - cache_entry Not needed, because the most recently used cache entry (cache_mru) is always the current one. - bucket_changed dbf->cache_mru->ca_changed reflects the status of the current bucket. - second_changed Not needed because _gdbm_cache_flush, which flushes all changed buckets, is now invoked unconditionally by _gdbm_end_update (and also whenever dbf->cache_mru changes). * src/gdbmdefs.h (struct gdbm_file_info): Remove cache_entry. The current cache entry is cache_mru. Remove bucket_changed, and second_changed. All uses changed. * src/proto.h (_gdbm_current_bucket_changed): New inline function. * src/bucket.c (_gdbm_cache_flush): Assume all changed elements form a contiguous sequence beginning with dbf->cache_mru. (set_cache_entry): Remove. All callers changed. (lru_link_elem,lru_unlink_elem): Update dbf->bucket as necessary. (cache_lookup): If the obtained bucket is not changed and is going to become current, flush all changed cache elements. * src/update.c (_gdbm_end_update): Call _gdbm_cache_flush unconditionally. * src/findkey.c: Use dbf->cache_mru instead of the removed dbf->cache_entry. * src/gdbmseq.c: Likewise. * tools/gdbmshell.c (_gdbm_print_bucket_cache): Likewise. * src/falloc.c: Use _gdbm_current_bucket_changed to mark the current bucket as changed. * src/gdbmstore.c: Likewise. * src/gdbmdelete.c: Likewise. Use _gdbm_current_bucket_changed. * tests/gtcacheopt.c: Fix typo. * tests/gtload.c: New option: -cachesize
-rw-r--r--src/bucket.c88
-rw-r--r--src/falloc.c4
-rw-r--r--src/findkey.c20
-rw-r--r--src/gdbmdefs.h7
-rw-r--r--src/gdbmdelete.c8
-rw-r--r--src/gdbmopen.c2
-rw-r--r--src/gdbmseq.c2
-rw-r--r--src/gdbmstore.c3
-rw-r--r--src/proto.h7
-rw-r--r--src/recover.c3
-rw-r--r--src/update.c16
-rw-r--r--tests/gtcacheopt.c2
-rw-r--r--tests/gtload.c15
-rw-r--r--tools/gdbmshell.c2
14 files changed, 97 insertions, 82 deletions
diff --git a/src/bucket.c b/src/bucket.c
index 6ce08ad..289ae11 100644
--- a/src/bucket.c
+++ b/src/bucket.c
@@ -41,14 +41,6 @@ _gdbm_new_bucket (GDBM_FILE dbf, hash_bucket *bucket, int bits)
for (index = 0; index < dbf->header->bucket_elems; index++)
bucket->h_table[index].hash_value = -1;
}
-
-static void
-set_cache_entry (GDBM_FILE dbf, cache_elem *elem)
-{
- dbf->cache_entry = elem;
- dbf->bucket = dbf->cache_entry->ca_bucket;
-}
-
/* Bucket cache table functions */
@@ -90,7 +82,10 @@ cache_tab_lookup_slot (GDBM_FILE dbf, off_t adr)
/* LRU list management */
-/* Link ELEM after REF in DBF cache. If REF is NULL, link at head */
+/*
+ * Link ELEM after REF in DBF cache. If REF is NULL, link at head and
+ * set DBF->bucket to point to the ca_bucket of ELEM.
+ */
static void
lru_link_elem (GDBM_FILE dbf, cache_elem *elem, cache_elem *ref)
{
@@ -103,6 +98,7 @@ lru_link_elem (GDBM_FILE dbf, cache_elem *elem, cache_elem *ref)
else
dbf->cache_lru = elem;
dbf->cache_mru = elem;
+ dbf->bucket = dbf->cache_mru->ca_bucket;
}
else
{
@@ -118,7 +114,10 @@ lru_link_elem (GDBM_FILE dbf, cache_elem *elem, cache_elem *ref)
}
}
-/* Unlink ELEM from the list of cache elements in DBF. */
+/*
+ * Unlink ELEM from the list of cache elements in DBF.
+ * If cache_mru gets updated, update DBF->bucket accordingly.
+ */
static void
lru_unlink_elem (GDBM_FILE dbf, cache_elem *elem)
{
@@ -127,7 +126,10 @@ lru_unlink_elem (GDBM_FILE dbf, cache_elem *elem)
if ((x = elem->ca_prev))
x->ca_next = elem->ca_next;
else
- dbf->cache_mru = elem->ca_next;
+ {
+ dbf->cache_mru = elem->ca_next;
+ dbf->bucket = dbf->cache_mru->ca_bucket;
+ }
if ((x = elem->ca_next))
x->ca_prev = elem->ca_prev;
else
@@ -334,19 +336,34 @@ cache_lookup (GDBM_FILE dbf, off_t adr, cache_elem *ref, cache_elem **ret_elem)
dbf->cache_num++;
}
}
+
+ /*
+ * If the obtained bucket is not changed and is going to become current,
+ * flush all changed cache elements. This ensures that changed cache
+ * elements form a contiguous sequence at the head of the cache list (see
+ * _gdbm_cache_flush).
+ */
+ if (ref == NULL && !elem->ca_changed)
+ _gdbm_cache_flush (dbf);
+
lru_link_elem (dbf, elem, ref);
if (rc != cache_failure)
*ret_elem = elem;
return rc;
}
-/* Find a bucket for DBF that is pointed to by the bucket directory from
- location DIR_INDEX. The bucket cache is first checked to see if it
- is already in memory. If not, a bucket may be tossed to read the new
- bucket. On success, the requested bucket becomes the "current" bucket
- and dbf->bucket points to the correct bucket. On error, the current
- bucket remains unchanged. */
-
+/*
+ * Find a bucket for DBF that is pointed to by the bucket directory from
+ * location DIR_INDEX. The bucket cache is first checked to see if it
+ * is already in memory. If not, the last recently used bucket may be
+ * tossed (if the cache is full) to read the new bucket.
+ *
+ * On success, the cached entry with the requested bucket is placed at
+ * the head of the cache list (cache_mru) and the requested bucket becomes
+ * "current".
+ *
+ * On error, the current bucket remains unchanged.
+ */
int
_gdbm_get_bucket (GDBM_FILE dbf, int dir_index)
{
@@ -367,9 +384,6 @@ _gdbm_get_bucket (GDBM_FILE dbf, int dir_index)
dbf->bucket_dir = dir_index;
bucket_adr = dbf->dir[dir_index];
- if (dbf->cache_entry && dbf->cache_entry->ca_adr == bucket_adr)
- return 0;
-
switch (cache_lookup (dbf, bucket_adr, NULL, &elem))
{
case cache_found:
@@ -427,7 +441,6 @@ _gdbm_get_bucket (GDBM_FILE dbf, int dir_index)
case cache_failure:
return -1;
}
- set_cache_entry (dbf, elem);
return 0;
}
@@ -462,7 +475,14 @@ _gdbm_split_bucket (GDBM_FILE dbf, int next_insert)
new_bits = dbf->bucket->bucket_bits + 1;
- /* Allocate two new buckets */
+ /*
+ * Allocate two new buckets. They will be populated with the entries
+ * from the current bucket (cache_mru->bucket), so make sure that
+ * cache_mru remains unchanged until both buckets are fully formed.
+ * Newly allocated buckets must be linked right after cache_mru, so
+ * that all changed buckets form a contiguous sequence at the beginning
+ * of the cache list (this is needed by _gdbm_cache_flush).
+ */
adr_0 = _gdbm_alloc (dbf, dbf->header->bucket_size);
switch (cache_lookup (dbf, adr_0, dbf->cache_mru, &newcache[0]))
{
@@ -603,17 +623,15 @@ _gdbm_split_bucket (GDBM_FILE dbf, int next_insert)
/* Set changed flags. */
newcache[0]->ca_changed = TRUE;
newcache[1]->ca_changed = TRUE;
- dbf->bucket_changed = TRUE;
dbf->directory_changed = TRUE;
- dbf->second_changed = TRUE;
/* Update the cache! */
dbf->bucket_dir = _gdbm_bucket_dir (dbf, next_insert);
/* Invalidate old cache entry. */
- old_bucket.av_adr = dbf->cache_entry->ca_adr;
+ old_bucket.av_adr = dbf->cache_mru->ca_adr;
old_bucket.av_size = dbf->header->bucket_size;
- cache_elem_free (dbf, dbf->cache_entry);
+ cache_elem_free (dbf, dbf->cache_mru);
/* Set dbf->bucket to the proper bucket. */
if (dbf->dir[dbf->bucket_dir] != adr_0)
@@ -630,7 +648,6 @@ _gdbm_split_bucket (GDBM_FILE dbf, int next_insert)
lru_unlink_elem (dbf, newcache[0]);
lru_link_elem (dbf, newcache[0], NULL);
- set_cache_entry (dbf, newcache[0]);
}
/* Get rid of old directories. */
@@ -725,18 +742,19 @@ _gdbm_cache_free (GDBM_FILE dbf)
}
}
-/* Flush cache content to disk. */
+/*
+ * Flush cache content to disk.
+ * All cache elements with the changed buckets form a contiguous sequence
+ * at the head of the cache list (starting with cache_mru).
+ */
int
_gdbm_cache_flush (GDBM_FILE dbf)
{
cache_elem *elem;
- for (elem = dbf->cache_lru; elem; elem = elem->ca_prev)
+ for (elem = dbf->cache_mru; elem && elem->ca_changed; elem = elem->ca_next)
{
- if (elem->ca_changed)
- {
- if (_gdbm_write_bucket (dbf, elem))
- return -1;
- }
+ if (_gdbm_write_bucket (dbf, elem))
+ return -1;
}
return 0;
}
diff --git a/src/falloc.c b/src/falloc.c
index f1cad4f..1c3115b 100644
--- a/src/falloc.c
+++ b/src/falloc.c
@@ -515,7 +515,7 @@ adjust_bucket_avail (GDBM_FILE dbf)
av_el = dbf->avail->av_table[dbf->avail->count];
_gdbm_put_av_elem (av_el, dbf->bucket->bucket_avail,
&dbf->bucket->av_count, dbf->coalesce_blocks);
- dbf->bucket_changed = TRUE;
+ _gdbm_current_bucket_changed (dbf);
}
return 0;
}
@@ -533,7 +533,7 @@ adjust_bucket_avail (GDBM_FILE dbf)
_gdbm_put_av_elem (av_el, dbf->avail->av_table,
&dbf->avail->count,
dbf->coalesce_blocks);
- dbf->bucket_changed = TRUE;
+ _gdbm_current_bucket_changed (dbf);
}
return 0;
}
diff --git a/src/findkey.c b/src/findkey.c
index d4c0d01..de8cbbf 100644
--- a/src/findkey.c
+++ b/src/findkey.c
@@ -67,8 +67,8 @@ _gdbm_read_entry (GDBM_FILE dbf, int elem_loc)
data_cache_elem *data_ca;
/* Is it already in the cache? */
- if (dbf->cache_entry->ca_data.elem_loc == elem_loc)
- return dbf->cache_entry->ca_data.dptr;
+ if (dbf->cache_mru->ca_data.elem_loc == elem_loc)
+ return dbf->cache_mru->ca_data.dptr;
if (!gdbm_bucket_element_valid_p (dbf, elem_loc))
{
@@ -80,7 +80,7 @@ _gdbm_read_entry (GDBM_FILE dbf, int elem_loc)
key_size = dbf->bucket->h_table[elem_loc].key_size;
data_size = dbf->bucket->h_table[elem_loc].data_size;
dsize = key_size + data_size;
- data_ca = &dbf->cache_entry->ca_data;
+ data_ca = &dbf->cache_mru->ca_data;
/* Make sure data_ca has sufficient space to accommodate both
key and content. */
@@ -179,17 +179,17 @@ _gdbm_findkey (GDBM_FILE dbf, datum key, char **ret_dptr, int *ret_hash_val)
return -1;
/* Is the element the last one found for this bucket? */
- if (dbf->cache_entry->ca_data.elem_loc != -1
- && new_hash_val == dbf->cache_entry->ca_data.hash_val
- && dbf->cache_entry->ca_data.key_size == key.dsize
- && dbf->cache_entry->ca_data.dptr != NULL
- && memcmp (dbf->cache_entry->ca_data.dptr, key.dptr, key.dsize) == 0)
+ if (dbf->cache_mru->ca_data.elem_loc != -1
+ && new_hash_val == dbf->cache_mru->ca_data.hash_val
+ && dbf->cache_mru->ca_data.key_size == key.dsize
+ && dbf->cache_mru->ca_data.dptr != NULL
+ && memcmp (dbf->cache_mru->ca_data.dptr, key.dptr, key.dsize) == 0)
{
GDBM_DEBUG (GDBM_DEBUG_LOOKUP, "%s: found in cache", dbf->name);
/* This is it. Return the cache pointer. */
if (ret_dptr)
- *ret_dptr = dbf->cache_entry->ca_data.dptr + key.dsize;
- return dbf->cache_entry->ca_data.elem_loc;
+ *ret_dptr = dbf->cache_mru->ca_data.dptr + key.dsize;
+ return dbf->cache_mru->ca_data.elem_loc;
}
/* It is not the cached value, search for element in the bucket. */
diff --git a/src/gdbmdefs.h b/src/gdbmdefs.h
index b651f92..3348f9d 100644
--- a/src/gdbmdefs.h
+++ b/src/gdbmdefs.h
@@ -277,10 +277,7 @@ struct gdbm_file_info
cache_elem *cache_lru; /* Last recently used element - tail of the list */
cache_elem *cache_avail; /* Pool of available elements (linked by prev, next)
*/
- /* Pointer to the current bucket's cache entry. */
- cache_elem *cache_entry;
-
- /* Points to the current hash bucket in the cache. */
+ /* Points to dbf->cache_mru.ca_bucket -- the current hash bucket */
hash_bucket *bucket;
/* The directory entry used to get the current hash bucket. */
@@ -294,8 +291,6 @@ struct gdbm_file_info
end of an update. */
unsigned header_changed :1;
unsigned directory_changed :1;
- unsigned bucket_changed :1;
- unsigned second_changed :1;
off_t file_size; /* Cached value of the current disk file size.
If -1, fstat will be used to retrieve it. */
diff --git a/src/gdbmdelete.c b/src/gdbmdelete.c
index 948c8a1..9206658 100644
--- a/src/gdbmdelete.c
+++ b/src/gdbmdelete.c
@@ -85,12 +85,12 @@ gdbm_delete (GDBM_FILE dbf, datum key)
return -1;
/* Set the flags. */
- dbf->bucket_changed = TRUE;
+ _gdbm_current_bucket_changed (dbf);
/* Invalidate data cache for the current bucket. */
- dbf->cache_entry->ca_data.hash_val = -1;
- dbf->cache_entry->ca_data.key_size = 0;
- dbf->cache_entry->ca_data.elem_loc = -1;
+ dbf->cache_mru->ca_data.hash_val = -1;
+ dbf->cache_mru->ca_data.key_size = 0;
+ dbf->cache_mru->ca_data.elem_loc = -1;
/* Do the writes. */
return _gdbm_end_update (dbf);
diff --git a/src/gdbmopen.c b/src/gdbmopen.c
index 08d42bc..183159c 100644
--- a/src/gdbmopen.c
+++ b/src/gdbmopen.c
@@ -673,8 +673,6 @@ gdbm_fd_open (int fd, const char *file_name, int block_size,
dbf->bucket_dir = 0;
dbf->header_changed = FALSE;
dbf->directory_changed = FALSE;
- dbf->bucket_changed = FALSE;
- dbf->second_changed = FALSE;
if (flags & GDBM_XVERIFY)
{
diff --git a/src/gdbmseq.c b/src/gdbmseq.c
index f938c5a..52bddbd 100644
--- a/src/gdbmseq.c
+++ b/src/gdbmseq.c
@@ -68,7 +68,7 @@ get_next_key (GDBM_FILE dbf, int elem_loc, datum *return_val)
/* Find the next bucket. It is possible several entries in
the bucket directory point to the same bucket. */
while (dbf->bucket_dir < GDBM_DIR_COUNT (dbf)
- && dbf->cache_entry->ca_adr == dbf->dir[dbf->bucket_dir])
+ && dbf->cache_mru->ca_adr == dbf->dir[dbf->bucket_dir])
dbf->bucket_dir++;
/* Check to see if there was a next bucket. */
diff --git a/src/gdbmstore.c b/src/gdbmstore.c
index f860d80..7491ffc 100644
--- a/src/gdbmstore.c
+++ b/src/gdbmstore.c
@@ -190,8 +190,7 @@ gdbm_store (GDBM_FILE dbf, datum key, datum content, int flags)
}
/* Current bucket has changed. */
- dbf->cache_entry->ca_changed = TRUE;
- dbf->bucket_changed = TRUE;
+ _gdbm_current_bucket_changed (dbf);
/* Write everything that is needed to the disk. */
return _gdbm_end_update (dbf);
diff --git a/src/proto.h b/src/proto.h
index 3b03c73..40a396a 100644
--- a/src/proto.h
+++ b/src/proto.h
@@ -27,6 +27,13 @@ int _gdbm_cache_init (GDBM_FILE, size_t);
void _gdbm_cache_free (GDBM_FILE dbf);
int _gdbm_cache_flush (GDBM_FILE dbf);
+/* Mark current bucket as changed. */
+static inline void
+_gdbm_current_bucket_changed (GDBM_FILE dbf)
+{
+ dbf->cache_mru->ca_changed = TRUE;
+}
+
/* Return true if the directory entry at DIR_INDEX can be considered
valid. This means that DIR_INDEX is in the valid range for addressing
the dir array, and the offset stored in dir[DIR_INDEX] points past
diff --git a/src/recover.c b/src/recover.c
index dbc187b..9ef43d1 100644
--- a/src/recover.c
+++ b/src/recover.c
@@ -168,12 +168,9 @@ _gdbm_finish_transfer (GDBM_FILE dbf, GDBM_FILE new_dbf,
dbf->cache_mru = new_dbf->cache_mru;
dbf->cache_lru = new_dbf->cache_lru;
dbf->cache_avail = new_dbf->cache_avail;
- dbf->cache_entry = new_dbf->cache_entry;
dbf->header_changed = new_dbf->header_changed;
dbf->directory_changed = new_dbf->directory_changed;
- dbf->bucket_changed = new_dbf->bucket_changed;
- dbf->second_changed = new_dbf->second_changed;
dbf->file_size = -1;
diff --git a/src/update.c b/src/update.c
index 7e16eeb..2faa116 100644
--- a/src/update.c
+++ b/src/update.c
@@ -63,20 +63,8 @@ _gdbm_end_update (GDBM_FILE dbf)
off_t file_pos; /* Return value for lseek. */
int rc;
- /* Write the current bucket. */
- if (dbf->bucket_changed && (dbf->cache_entry != NULL))
- {
- if (_gdbm_write_bucket (dbf, dbf->cache_entry))
- return -1;
- dbf->bucket_changed = FALSE;
- }
-
- /* Write the other changed buckets if there are any. */
- if (dbf->second_changed)
- {
- _gdbm_cache_flush (dbf);
- dbf->second_changed = FALSE;
- }
+ /* Write the changed buckets if there are any. */
+ _gdbm_cache_flush (dbf);
/* Write the directory. */
if (dbf->directory_changed)
diff --git a/tests/gtcacheopt.c b/tests/gtcacheopt.c
index 3f23714..088e6d8 100644
--- a/tests/gtcacheopt.c
+++ b/tests/gtcacheopt.c
@@ -202,7 +202,7 @@ main (int argc, char **argv)
i = CACHE_SIZE;
if (gdbm_setopt (dbf, GDBM_SETCACHESIZE, &i, sizeof (i)))
{
- fprintf (stderr, "GDBM_GETCACHESIZE: %s\n", gdbm_strerror (gdbm_errno));
+ fprintf (stderr, "GDBM_SETCACHESIZE: %s\n", gdbm_strerror (gdbm_errno));
return 1;
}
diff --git a/tests/gtload.c b/tests/gtload.c
index d843111..1fcafb2 100644
--- a/tests/gtload.c
+++ b/tests/gtload.c
@@ -96,6 +96,7 @@ main (int argc, char **argv)
int recover = 0;
gdbm_recovery rcvr;
int rcvr_flags = 0;
+ size_t cache_size = 0;
progname = canonical_progname (argv[0]);
#ifdef GDBM_DEBUG_ENABLE
@@ -135,6 +136,8 @@ main (int argc, char **argv)
delim = arg[7];
else if (strcmp (arg, "-recover") == 0)
recover = 1;
+ else if (strncmp (arg, "-cachesize=", 11) == 0)
+ cache_size = read_size (arg + 11);
else if (strcmp (arg, "-verbose") == 0)
{
verbose = 1;
@@ -213,7 +216,17 @@ main (int argc, char **argv)
gdbm_strerror (gdbm_errno));
exit (1);
}
- }
+ }
+ if (cache_size)
+ {
+ if (gdbm_setopt (dbf, GDBM_SETCACHESIZE, &cache_size,
+ sizeof (cache_size)))
+ {
+ fprintf (stderr, "GDBM_SETCACHESIZE failed: %s\n",
+ gdbm_strerror (gdbm_errno));
+ exit (1);
+ }
+ }
if (verbose)
{
diff --git a/tools/gdbmshell.c b/tools/gdbmshell.c
index 3a85211..22c4938 100644
--- a/tools/gdbmshell.c
+++ b/tools/gdbmshell.c
@@ -373,7 +373,7 @@ _gdbm_print_bucket_cache (FILE *fp, GDBM_FILE dbf)
fprintf (fp,
_("Bucket Cache (size %zu/%zu):\n Index: Address Changed Data_Hash \n"),
dbf->cache_num, dbf->cache_size);
- for (elem = dbf->cache_entry, i = 0; elem; elem = elem->ca_next, i++)
+ for (elem = dbf->cache_mru, i = 0; elem; elem = elem->ca_next, i++)
{
fprintf (fp, " %5d: %15lu %7s %x\n",
i,