summaryrefslogtreecommitdiff
path: root/src/bin/pg_rewind/filemap.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/bin/pg_rewind/filemap.c')
-rw-r--r--src/bin/pg_rewind/filemap.c667
1 files changed, 667 insertions, 0 deletions
diff --git a/src/bin/pg_rewind/filemap.c b/src/bin/pg_rewind/filemap.c
new file mode 100644
index 0000000000..4e02647306
--- /dev/null
+++ b/src/bin/pg_rewind/filemap.c
@@ -0,0 +1,667 @@
+/*-------------------------------------------------------------------------
+ *
+ * filemap.c
+ * A data structure for keeping track of files that have changed.
+ *
+ * Copyright (c) 2013-2015, PostgreSQL Global Development Group
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres_fe.h"
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "datapagemap.h"
+#include "filemap.h"
+#include "logging.h"
+#include "pg_rewind.h"
+
+#include "common/string.h"
+#include "catalog/pg_tablespace.h"
+#include "storage/fd.h"
+
+/* Global file map; assigned by filemap_create() and used by all callbacks. */
+filemap_t *filemap = NULL;
+
+/* Forward declarations of the helpers defined later in this file. */
+static bool isRelDataFile(const char *path);
+static char *datasegpath(RelFileNode rnode, ForkNumber forknum,
+ BlockNumber segno);
+static int path_cmp(const void *a, const void *b);
+static int final_filemap_cmp(const void *a, const void *b);
+static void filemap_list_to_array(void);
+
+/*
+ * Allocate an empty file map and install it as the global 'filemap'.
+ *
+ * The linked list and the lookup array both start out empty.  No map may
+ * have been created before.  Returns the newly created map.
+ */
+filemap_t *
+filemap_create(void)
+{
+    filemap_t  *newmap = pg_malloc(sizeof(filemap_t));
+
+    /* no list entries yet */
+    newmap->first = NULL;
+    newmap->last = NULL;
+    newmap->nlist = 0;
+
+    /* the array is only built later, from the list */
+    newmap->array = NULL;
+    newmap->narray = 0;
+
+    Assert(filemap == NULL);
+    filemap = newmap;
+
+    return filemap;
+}
+
+/*
+ * Callback for processing remote file list.
+ *
+ * This is called once for every file in the source server. We decide what
+ * action needs to be taken for the file, depending on whether the file
+ * exists in the target and whether the size matches, and append an entry
+ * recording that decision to the file map's linked list.
+ *
+ * 'path' is relative to the data directory; 'type' and 'newsize' describe
+ * the file as it is in the source; 'link_target' may be NULL.
+ *
+ * Must be called before the lookup array has been built.  Exits via
+ * pg_fatal() if the target file cannot be examined or if the file types in
+ * source and target are inconsistent.
+ */
+void
+process_remote_file(const char *path, file_type_t type, size_t newsize,
+                    const char *link_target)
+{
+    bool        exists;
+    bool        isrelfile;
+    char        localpath[MAXPGPATH];
+    struct stat statbuf;
+    filemap_t  *map = filemap;
+    file_action_t action = FILE_ACTION_NONE;
+    size_t      oldsize = 0;
+    file_entry_t *entry;
+
+    Assert(map->array == NULL);
+
+    /*
+     * Completely ignore some special files in source and destination.
+     */
+    if (strcmp(path, "postmaster.pid") == 0 ||
+        strcmp(path, "postmaster.opts") == 0)
+        return;
+
+    /*
+     * Skip temporary files, .../pgsql_tmp/... and .../pgsql_tmp.* in source.
+     * This has the effect that all temporary files in the destination will be
+     * removed.
+     */
+    if (strstr(path, "/" PG_TEMP_FILE_PREFIX) != NULL)
+        return;
+    if (strstr(path, "/" PG_TEMP_FILES_DIR "/") != NULL)
+        return;
+
+    /* Classify the path once; the result is needed in several places. */
+    isrelfile = isRelDataFile(path);
+
+    /*
+     * sanity check: a filename that looks like a data file better be a
+     * regular file
+     */
+    if (type != FILE_TYPE_REGULAR && isrelfile)
+        pg_fatal("data file in source \"%s\" is not a regular file\n", path);
+
+    snprintf(localpath, sizeof(localpath), "%s/%s", datadir_target, path);
+
+    /* Does the corresponding local file exist? */
+    if (lstat(localpath, &statbuf) < 0)
+    {
+        if (errno != ENOENT)
+            pg_fatal("could not stat file \"%s\": %s\n",
+                     localpath, strerror(errno));
+
+        /* NB: statbuf is not valid in this case and must not be read */
+        exists = false;
+    }
+    else
+        exists = true;
+
+    switch (type)
+    {
+        case FILE_TYPE_DIRECTORY:
+            if (exists && !S_ISDIR(statbuf.st_mode))
+            {
+                /* it's a directory in source, but not in target. Strange.. */
+                pg_fatal("\"%s\" is not a directory\n", localpath);
+            }
+
+            action = exists ? FILE_ACTION_NONE : FILE_ACTION_CREATE;
+            oldsize = 0;
+            break;
+
+        case FILE_TYPE_SYMLINK:
+            if (exists &&
+#ifndef WIN32
+                !S_ISLNK(statbuf.st_mode)
+#else
+                !pgwin32_is_junction(localpath)
+#endif
+                )
+            {
+                /*
+                 * It's a symbolic link in source, but not in target.
+                 * Strange..
+                 */
+                pg_fatal("\"%s\" is not a symbolic link\n", localpath);
+            }
+
+            action = exists ? FILE_ACTION_NONE : FILE_ACTION_CREATE;
+            oldsize = 0;
+            break;
+
+        case FILE_TYPE_REGULAR:
+            if (exists && !S_ISREG(statbuf.st_mode))
+                pg_fatal("\"%s\" is not a regular file\n", localpath);
+
+            if (exists && pg_str_endswith(path, "PG_VERSION"))
+            {
+                /*
+                 * PG_VERSIONs should be identical, but avoid overwriting an
+                 * existing one for paranoia.  A PG_VERSION that is missing
+                 * from the target is not covered by this exception: there
+                 * is nothing to overwrite, so it is copied like any other
+                 * missing file below.
+                 */
+                action = FILE_ACTION_NONE;
+                oldsize = statbuf.st_size;
+            }
+            else if (!exists || !isrelfile)
+            {
+                /*
+                 * File exists in source, but not in target. Or it's a
+                 * non-data file that we have no special processing for. Copy
+                 * it in toto.
+                 */
+                action = FILE_ACTION_COPY;
+                oldsize = 0;
+            }
+            else
+            {
+                /*
+                 * It's a data file that exists in both.
+                 *
+                 * If it's larger in target, we can truncate it. There will
+                 * also be a WAL record of the truncation in the source
+                 * system, so WAL replay would eventually truncate the target
+                 * too, but we might as well do it now.
+                 *
+                 * If it's smaller in the target, it means that it has been
+                 * truncated in the target, or enlarged in the source, or
+                 * both. If it was truncated locally, we need to copy the
+                 * missing tail from the remote system. If it was enlarged in
+                 * the remote system, there will be WAL records in the remote
+                 * system for the new blocks, so we wouldn't need to copy them
+                 * here. But we don't know which scenario we're dealing with,
+                 * and there's no harm in copying the missing blocks now, so
+                 * do it now.
+                 *
+                 * If it's the same size, do nothing here. Any locally
+                 * modified blocks will be copied based on parsing the local
+                 * WAL, and any remotely modified blocks will be updated after
+                 * rewinding, when the remote WAL is replayed.
+                 */
+                oldsize = statbuf.st_size;
+                if (oldsize < newsize)
+                    action = FILE_ACTION_COPY_TAIL;
+                else if (oldsize > newsize)
+                    action = FILE_ACTION_TRUNCATE;
+                else
+                    action = FILE_ACTION_NONE;
+            }
+            break;
+    }
+
+    /* Create a new entry for this file */
+    entry = pg_malloc(sizeof(file_entry_t));
+    entry->path = pg_strdup(path);
+    entry->type = type;
+    entry->action = action;
+    entry->oldsize = oldsize;
+    entry->newsize = newsize;
+    entry->link_target = link_target ? pg_strdup(link_target) : NULL;
+    entry->next = NULL;
+    entry->pagemap.bitmap = NULL;
+    entry->pagemap.bitmapsize = 0;
+    entry->isrelfile = isrelfile;
+
+    /* Append it to the tail of the linked list */
+    if (map->last)
+    {
+        map->last->next = entry;
+        map->last = entry;
+    }
+    else
+        map->first = map->last = entry;
+    map->nlist++;
+}
+
+/*
+ * Callback for processing local file list.
+ *
+ * All remote files must be already processed before calling this. This only
+ * marks local files that didn't exist in the remote system for deletion, by
+ * appending a FILE_ACTION_REMOVE entry to the file map's linked list.
+ *
+ * On the first call, the entries collected so far are moved into the lookup
+ * array and sorted by path, so that later calls can find them with a binary
+ * search.
+ */
+void
+process_local_file(const char *path, file_type_t type, size_t oldsize,
+                   const char *link_target)
+{
+    bool        exists;
+    char        localpath[MAXPGPATH];
+    struct stat statbuf;
+    file_entry_t key;
+    file_entry_t *key_ptr;
+    filemap_t  *map = filemap;
+    file_entry_t *entry;
+
+    /*
+     * Check that the file can be examined.  A file that has vanished
+     * (ENOENT) is tolerated; any other failure is fatal.  The stat result
+     * itself is not used.
+     */
+    snprintf(localpath, sizeof(localpath), "%s/%s", datadir_target, path);
+    if (lstat(localpath, &statbuf) < 0 && errno != ENOENT)
+        pg_fatal("could not stat file \"%s\": %s\n",
+                 localpath, strerror(errno));
+
+    if (map->array == NULL)
+    {
+        /* on first call, initialize lookup array */
+        if (map->nlist == 0)
+        {
+            /* should not happen */
+            pg_fatal("remote file list is empty\n");
+        }
+
+        filemap_list_to_array();
+        qsort(map->array, map->narray, sizeof(file_entry_t *), path_cmp);
+    }
+
+    /*
+     * Completely ignore some special files
+     */
+    if (strcmp(path, "postmaster.pid") == 0 ||
+        strcmp(path, "postmaster.opts") == 0)
+        return;
+
+    /* Is there an entry for this path from the remote system? */
+    key.path = (char *) path;
+    key_ptr = &key;
+    exists = bsearch(&key_ptr, map->array, map->narray, sizeof(file_entry_t *),
+                     path_cmp) != NULL;
+
+    /*
+     * We already handled all files that exist in the remote system in
+     * process_remote_file().
+     */
+    if (exists)
+        return;
+
+    /* Remove any file or folder that doesn't exist in the remote system. */
+    entry = pg_malloc(sizeof(file_entry_t));
+    entry->path = pg_strdup(path);
+    entry->type = type;
+    entry->action = FILE_ACTION_REMOVE;
+    entry->oldsize = oldsize;
+    entry->newsize = 0;
+    entry->link_target = link_target ? pg_strdup(link_target) : NULL;
+    entry->next = NULL;
+    entry->pagemap.bitmap = NULL;
+    entry->pagemap.bitmapsize = 0;
+    entry->isrelfile = isRelDataFile(path);
+
+    /* Append it to the tail of the linked list */
+    if (map->last == NULL)
+        map->first = entry;
+    else
+        map->last->next = entry;
+    map->last = entry;
+    map->nlist++;
+}
+
+/*
+ * This callback gets called while we read the old WAL, for every block that
+ * has changed in the local system. It makes note of all the changed blocks
+ * in the pagemap of the file.
+ *
+ * The block is identified by relation (rnode), fork (forknum) and block
+ * number within the relation (blkno); it is translated here into a segment
+ * file path and a block number within that segment.  The lookup array must
+ * already have been built.
+ */
+void
+process_block_change(ForkNumber forknum, RelFileNode rnode, BlockNumber blkno)
+{
+    char       *path;
+    file_entry_t key;
+    file_entry_t *key_ptr;
+    file_entry_t *entry;
+    BlockNumber blkno_inseg;
+    BlockNumber segno;
+    size_t      blk_end;
+    filemap_t  *map = filemap;
+    file_entry_t **e;
+
+    Assert(filemap->array);
+
+    segno = blkno / RELSEG_SIZE;
+    blkno_inseg = blkno % RELSEG_SIZE;
+
+    /*
+     * Byte offset just past the end of the block within its segment.  Widen
+     * before multiplying, so that the product cannot wrap around in 32-bit
+     * arithmetic when a large segment size is configured.
+     */
+    blk_end = ((size_t) blkno_inseg + 1) * BLCKSZ;
+
+    path = datasegpath(rnode, forknum, segno);
+
+    key.path = path;
+    key_ptr = &key;
+
+    e = bsearch(&key_ptr, map->array, map->narray, sizeof(file_entry_t *),
+                path_cmp);
+    entry = e ? *e : NULL;
+
+    /* datasegpath() returns a palloc'd string */
+    pfree(path);
+
+    if (entry == NULL)
+    {
+        /*
+         * If we don't have any record of this file in the file map, it means
+         * that it's a relation that doesn't exist in the remote system, and
+         * it was subsequently removed in the local system, too. We can safely
+         * ignore it.
+         */
+        return;
+    }
+
+    Assert(entry->isrelfile);
+
+    switch (entry->action)
+    {
+        case FILE_ACTION_NONE:
+        case FILE_ACTION_TRUNCATE:
+            /* skip if we're truncating away the modified block anyway */
+            if (blk_end <= entry->newsize)
+                datapagemap_add(&entry->pagemap, blkno_inseg);
+            break;
+
+        case FILE_ACTION_COPY_TAIL:
+
+            /*
+             * skip the modified block if it is part of the "tail" that
+             * we're copying anyway.
+             */
+            if (blk_end <= entry->oldsize)
+                datapagemap_add(&entry->pagemap, blkno_inseg);
+            break;
+
+        case FILE_ACTION_COPY:
+        case FILE_ACTION_REMOVE:
+            /* no block is recorded in the pagemap for these actions */
+            break;
+
+        case FILE_ACTION_CREATE:
+            pg_fatal("unexpected page modification for directory or symbolic link \"%s\"\n", entry->path);
+    }
+}
+
+/*
+ * Convert the linked list of entries in filemap->first/last to the array,
+ * filemap->array.
+ *
+ * The list entries are appended after any entries already in the array, and
+ * the list is left empty.  The array is not sorted here; that is up to the
+ * caller.
+ */
+static void
+filemap_list_to_array(void)
+{
+    int         narray;
+    file_entry_t *entry,
+               *next;
+
+    /* The array holds pointers to entries, so size it by pointer size. */
+    filemap->array =
+        pg_realloc(filemap->array,
+                   (filemap->nlist + filemap->narray) * sizeof(file_entry_t *));
+
+    narray = filemap->narray;
+    for (entry = filemap->first; entry != NULL; entry = next)
+    {
+        filemap->array[narray++] = entry;
+
+        /* remember the successor before unlinking the entry */
+        next = entry->next;
+        entry->next = NULL;
+    }
+    Assert(narray == filemap->nlist + filemap->narray);
+    filemap->narray = narray;
+    filemap->nlist = 0;
+    filemap->first = filemap->last = NULL;
+}
+
+/*
+ * Move any entries still in the linked list into the array, and put the
+ * whole array into its final order (see final_filemap_cmp).
+ */
+void
+filemap_finalize(void)
+{
+    filemap_t  *map = filemap;
+
+    filemap_list_to_array();
+
+    qsort(map->array,
+          map->narray,
+          sizeof(file_entry_t *),
+          final_filemap_cmp);
+}
+
+/*
+ * Return a human-readable name for a file action, or "unknown" if the value
+ * is not one of the known actions.
+ */
+static const char *
+action_to_str(file_action_t action)
+{
+    const char *name = "unknown";
+
+    switch (action)
+    {
+        case FILE_ACTION_NONE:
+            name = "NONE";
+            break;
+        case FILE_ACTION_COPY:
+            name = "COPY";
+            break;
+        case FILE_ACTION_TRUNCATE:
+            name = "TRUNCATE";
+            break;
+        case FILE_ACTION_COPY_TAIL:
+            name = "COPY_TAIL";
+            break;
+        case FILE_ACTION_CREATE:
+            name = "CREATE";
+            break;
+        case FILE_ACTION_REMOVE:
+            name = "REMOVE";
+            break;
+        default:
+            break;
+    }
+
+    return name;
+}
+
+/*
+ * Calculate the totals needed for progress reports.
+ *
+ * Sets total_size to the sum of the new sizes of all regular files, and
+ * fetch_size to the number of bytes to be fetched: whole files that are
+ * copied, the missing tails of files that are extended, and one block for
+ * every page recorded in a file's pagemap.
+ */
+void
+calculate_totals(void)
+{
+    filemap_t  *map = filemap;
+    int         idx;
+
+    map->total_size = 0;
+    map->fetch_size = 0;
+
+    for (idx = 0; idx < map->narray; idx++)
+    {
+        file_entry_t *cur = map->array[idx];
+
+        /* only regular files count towards the totals */
+        if (cur->type == FILE_TYPE_REGULAR)
+        {
+            map->total_size += cur->newsize;
+
+            if (cur->action == FILE_ACTION_COPY)
+            {
+                /* the whole file is fetched; the pagemap is not counted */
+                map->fetch_size += cur->newsize;
+            }
+            else
+            {
+                if (cur->action == FILE_ACTION_COPY_TAIL)
+                    map->fetch_size += (cur->newsize - cur->oldsize);
+
+                if (cur->pagemap.bitmapsize > 0)
+                {
+                    datapagemap_iterator_t *pages;
+                    BlockNumber blockno;
+
+                    pages = datapagemap_iterate(&cur->pagemap);
+                    while (datapagemap_next(pages, &blockno))
+                        map->fetch_size += BLCKSZ;
+
+                    pg_free(pages);
+                }
+            }
+        }
+    }
+}
+
+/*
+ * Print the file map to stdout: one line for every entry that has an action
+ * other than NONE or a non-empty pagemap, followed by the pagemap if there
+ * is one.
+ */
+void
+print_filemap(void)
+{
+    int         idx;
+
+    for (idx = 0; idx < filemap->narray; idx++)
+    {
+        file_entry_t *cur = filemap->array[idx];
+        bool        has_pages = (cur->pagemap.bitmapsize > 0);
+
+        /* nothing to report for this file */
+        if (cur->action == FILE_ACTION_NONE && !has_pages)
+            continue;
+
+        printf("%s (%s)\n", cur->path, action_to_str(cur->action));
+
+        if (has_pages)
+            datapagemap_print(&cur->pagemap);
+    }
+
+    fflush(stdout);
+}
+
+/*
+ * Does it look like a relation data file?
+ *
+ * For our purposes, only files belonging to the main fork are considered
+ * relation files. Other forks are always copied in toto, because we cannot
+ * reliably track changes to them, because WAL only contains block references
+ * for the main fork.
+ *
+ * 'path' is relative to the data directory.  Returns true only if the path
+ * is exactly the path that datasegpath() generates for the relation and
+ * segment number parsed out of it.
+ */
+static bool
+isRelDataFile(const char *path)
+{
+    char        buf[20 + 1];
+    RelFileNode rnode;
+    unsigned int segNo;
+    int         nmatch;
+    bool        matched;
+
+    /*----
+     * Relation data files can be in one of the following directories:
+     *
+     * global/
+     *      shared relations
+     *
+     * base/<db oid>/
+     *      regular relations, default tablespace
+     *
+     * pg_tblspc/<tblspc oid>/PG_9.4_201403261/
+     *      within a non-default tablespace (the name of the directory
+     *      depends on version)
+     *
+     * And the relation data files themselves have a filename like:
+     *
+     * <oid>.<segment number>
+     *
+     *----
+     */
+    rnode.spcNode = InvalidOid;
+    rnode.dbNode = InvalidOid;
+    rnode.relNode = InvalidOid;
+    segNo = 0;
+    matched = false;
+
+    nmatch = sscanf(path, "global/%u.%u", &rnode.relNode, &segNo);
+    if (nmatch == 1 || nmatch == 2)
+    {
+        rnode.spcNode = GLOBALTABLESPACE_OID;
+        rnode.dbNode = 0;
+        matched = true;
+    }
+    else
+    {
+        nmatch = sscanf(path, "base/%u/%u.%u",
+                        &rnode.dbNode, &rnode.relNode, &segNo);
+        if (nmatch == 2 || nmatch == 3)
+        {
+            rnode.spcNode = DEFAULTTABLESPACE_OID;
+            matched = true;
+        }
+        else
+        {
+            /*
+             * The version directory name must be scanned with a scanset
+             * that stops at '/'.  A plain %s would also swallow the
+             * following path components, and the rest of the pattern could
+             * then never match.
+             */
+            nmatch = sscanf(path, "pg_tblspc/%u/PG_%20[^/]/%u/%u.%u",
+                            &rnode.spcNode, buf, &rnode.dbNode, &rnode.relNode,
+                            &segNo);
+            if (nmatch == 4 || nmatch == 5)
+                matched = true;
+        }
+    }
+
+    /*
+     * The sscanf tests above can match files that have extra characters at
+     * the end, and the last check can also match a path belonging to a
+     * different version (different TABLESPACE_VERSION_DIRECTORY). To
+     * eliminate such cases, cross-check that datasegpath creates the exact
+     * same filename, when passed the RelFileNode information we extracted
+     * from the filename.
+     */
+    if (matched)
+    {
+        char       *check_path = datasegpath(rnode, MAIN_FORKNUM, segNo);
+
+        if (strcmp(check_path, path) != 0)
+            matched = false;
+
+        pfree(check_path);
+    }
+
+    return matched;
+}
+
+/*
+ * Build the path of one segment file of a relation fork.
+ *
+ * The first segment (segno 0) has the plain relation path; later segments
+ * have ".<segno>" appended to it.
+ *
+ * The returned path is palloc'd
+ */
+static char *
+datasegpath(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
+{
+    char       *result = relpathperm(rnode, forknum);
+
+    if (segno > 0)
+    {
+        char       *basepath = result;
+
+        result = psprintf("%s.%u", basepath, segno);
+        pfree(basepath);
+    }
+
+    return result;
+}
+
+/*
+ * qsort/bsearch comparator for arrays of file_entry_t pointers: orders the
+ * entries by path, using strcmp.
+ */
+static int
+path_cmp(const void *a, const void *b)
+{
+    const file_entry_t *lhs = *(file_entry_t *const *) a;
+    const file_entry_t *rhs = *(file_entry_t *const *) b;
+    int         order = strcmp(lhs->path, rhs->path);
+
+    return order;
+}
+
+/*
+ * Comparator for the final ordering of the filemap, in which removals come
+ * last.  From a disk space usage point of view it would be better to do
+ * removals first, but for now, safety first.
+ *
+ * If a whole directory is deleted, all files and subdirectories inside it
+ * need to be removed first. On creation, the parent directory needs to be
+ * created before the files and directories inside it. To achieve that, the
+ * file_action_t enum is ordered so that we can just sort on the action
+ * first. Entries with the same action are sorted by path, except that REMOVE
+ * entries are sorted in reverse path order, so that the "foo/bar"
+ * subdirectory is removed before "foo".
+ */
+static int
+final_filemap_cmp(const void *a, const void *b)
+{
+    file_entry_t *lhs = *((file_entry_t **) a);
+    file_entry_t *rhs = *((file_entry_t **) b);
+    int         pathorder;
+
+    /* primary key: the action */
+    if (lhs->action != rhs->action)
+        return (lhs->action > rhs->action) ? 1 : -1;
+
+    /* secondary key: the path, reversed for removals */
+    pathorder = strcmp(lhs->path, rhs->path);
+
+    return (lhs->action == FILE_ACTION_REMOVE) ? -pathorder : pathorder;
+}