4 files changed, 125 insertions, 37 deletions
diff --git a/dist/s_string.ok b/dist/s_string.ok
index 65c74b61995..d19c05e802b 100644
--- a/dist/s_string.ok
+++ b/dist/s_string.ok
@@ -863,6 +863,7 @@ lu
 lwsync
 lz
 lzo
+mT
 madvise
 majorp
 malloc
diff --git a/src/log/log.c b/src/log/log.c
index 00e4ea5f441..96b593ec706 100644
--- a/src/log/log.c
+++ b/src/log/log.c
@@ -21,6 +21,60 @@ static int __log_write_internal(
 #define	WT_LOG_OPEN_VERIFY	0x02
 
 /*
+ * __log_wait_for_earlier_slot --
+ *	Wait for write_lsn to catch up to this slot.
+ */
+static void
+__log_wait_for_earlier_slot(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_LOG *log;
+	int yield_count;
+
+	conn = S2C(session);
+	log = conn->log;
+	yield_count = 0;
+
+	while (__wt_log_cmp(&log->write_lsn, &slot->slot_release_lsn) != 0) {
+		/*
+		 * If we're on a locked path and the write LSN is not advancing,
+		 * unlock in case an earlier thread is trying to switch its
+		 * slot and complete its operation.
+		 */
+		if (F_ISSET(session, WT_SESSION_LOCKED_SLOT))
+			__wt_spin_unlock(session, &log->log_slot_lock);
+		__wt_cond_auto_signal(session, conn->log_wrlsn_cond);
+		if (++yield_count < WT_THOUSAND)
+			__wt_yield();
+		else
+			__wt_cond_wait(session, log->log_write_cond, 200);
+		if (F_ISSET(session, WT_SESSION_LOCKED_SLOT))
+			__wt_spin_lock(session, &log->log_slot_lock);
+	}
+}
+
+/*
+ * __log_fs_write --
+ *	Wrapper when writing to a log file.  If we're writing to a new log
+ *	file for the first time wait for writes to the previous log file.
+ */
+static int
+__log_fs_write(WT_SESSION_IMPL *session,
+    WT_LOGSLOT *slot, wt_off_t offset, size_t len, const void *buf)
+{
+	/*
+	 * If we're writing into a new log file, we have to wait for all
+	 * writes to the previous log file to complete otherwise there could
+	 * be a hole at the end of the previous log file that we cannot detect.
+	 */
+	if (slot->slot_release_lsn.l.file < slot->slot_start_lsn.l.file) {
+		__log_wait_for_earlier_slot(session, slot);
+		WT_RET(__wt_log_force_sync(session, &slot->slot_release_lsn));
+	}
+	return (__wt_write(session, slot->slot_fh, offset, len, buf));
+}
+
+/*
  * __wt_log_ckpt --
  *	Record the given LSN as the checkpoint LSN and signal the archive
  *	thread as needed.
@@ -576,7 +630,7 @@ __log_fill(WT_SESSION_IMPL *session,
 		/*
 		 * If this is a force or unbuffered write, write it now.
 		 */
-		WT_ERR(__wt_write(session, myslot->slot->slot_fh,
+		WT_ERR(__log_fs_write(session, myslot->slot,
 		    myslot->offset + myslot->slot->slot_start_offset,
 		    record->size, record->mem));
 
@@ -1352,13 +1406,11 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep)
 	WT_LSN sync_lsn;
 	int64_t release_buffered, release_bytes;
 	uint64_t fsync_duration_usecs;
-	int yield_count;
 	bool locked;
 
 	conn = S2C(session);
 	log = conn->log;
 	locked = false;
-	yield_count = 0;
 	if (freep != NULL)
 		*freep = 1;
 	release_buffered = WT_LOG_SLOT_RELEASED_BUFFERED(slot->slot_state);
@@ -1379,8 +1431,7 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep)
 
 	/* Write the buffered records */
 	if (release_buffered != 0)
-		WT_ERR(__wt_write(session,
-		    slot->slot_fh, slot->slot_start_offset,
+		WT_ERR(__log_fs_write(session, slot, slot->slot_start_offset,
 		    (size_t)release_buffered, slot->slot_buf.mem));
 
 	/*
@@ -1411,22 +1462,7 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep)
 	 * be holes in the log file.
 	 */
 	WT_STAT_CONN_INCR(session, log_release_write_lsn);
-	while (__wt_log_cmp(&log->write_lsn, &slot->slot_release_lsn) != 0) {
-		/*
-		 * If we're on a locked path and the write LSN is not advancing,
-		 * unlock in case an earlier thread is trying to switch its
-		 * slot and complete its operation.
-		 */
-		if (F_ISSET(session, WT_SESSION_LOCKED_SLOT))
-			__wt_spin_unlock(session, &log->log_slot_lock);
-		__wt_cond_auto_signal(session, conn->log_wrlsn_cond);
-		if (++yield_count < WT_THOUSAND)
-			__wt_yield();
-		else
-			__wt_cond_wait(session, log->log_write_cond, 200);
-		if (F_ISSET(session, WT_SESSION_LOCKED_SLOT))
-			__wt_spin_lock(session, &log->log_slot_lock);
-	}
+	__log_wait_for_earlier_slot(session, slot);
 
 	log->write_start_lsn = slot->slot_start_lsn;
 	log->write_lsn = slot->slot_end_lsn;
diff --git a/test/recovery/random-abort.c b/test/recovery/random-abort.c
index 03e67e2f723..21460646017 100644
--- a/test/recovery/random-abort.c
+++ b/test/recovery/random-abort.c
@@ -34,6 +34,7 @@
 static char home[512];			/* Program working dir */
 static const char *progname;		/* Program name */
 static const char * const uri = "table:main";
+bool inmem;
 
 #define	MAX_TH	12
 #define	MIN_TH	5
@@ -41,7 +42,9 @@ static const char * const uri = "table:main";
 #define	MIN_TIME	10
 #define	RECORDS_FILE	"records-%" PRIu32
 
-#define	ENV_CONFIG						\
+#define	ENV_CONFIG_DEF						\
+    "create,log=(file_max=10M,archive=false,enabled)"
+#define	ENV_CONFIG_TXNSYNC					\
     "create,log=(file_max=10M,archive=false,enabled),"		\
     "transaction_sync=(enabled,method=none)"
 #define	ENV_CONFIG_REC "log=(recover=on)"
@@ -73,11 +76,15 @@ thread_run(void *arg)
 	WT_THREAD_DATA *td;
 	uint64_t i;
 	int ret;
-	char buf[MAX_VAL], kname[64];
+	size_t lsize;
+	char buf[MAX_VAL], kname[64], lgbuf[8];
+	char large[128*1024];
 
 	__wt_random_init(&rnd);
 	memset(buf, 0, sizeof(buf));
 	memset(kname, 0, sizeof(kname));
+	lsize = sizeof(large);
+	memset(large, 0, lsize);
 
 	td = (WT_THREAD_DATA *)arg;
 	/*
@@ -85,6 +92,13 @@ thread_run(void *arg)
 	 */
 	snprintf(buf, sizeof(buf), RECORDS_FILE, td->id);
 	/*
+	 * Set up a large value putting our id in it.  Write it in there a
+	 * bunch of times, but the rest of the buffer can just be zero.
+	 */
+	snprintf(lgbuf, sizeof(lgbuf), "th-%" PRIu32, td->id);
+	for (i = 0; i < 128; i += strlen(lgbuf))
+		snprintf(&large[i], lsize - i, "%s", lgbuf);
+	/*
 	 * Keep a separate file with the records we wrote for checking.
 	 */
 	(void)unlink(buf);
@@ -107,8 +121,18 @@ thread_run(void *arg)
 	 */
 	for (i = td->start; ; ++i) {
 		snprintf(kname, sizeof(kname), "%" PRIu64, i);
-		data.size = __wt_random(&rnd) % MAX_VAL;
 		cursor->set_key(cursor, kname);
+		/*
+		 * Every 30th record write a very large record that exceeds the
+		 * log buffer size.  This forces us to use the unbuffered path.
+		 */
+		if (i % 30 == 0) {
+			data.size = 128 * 1024;
+			data.data = large;
+		} else {
+			data.size = __wt_random(&rnd) % MAX_VAL;
+			data.data = buf;
+		}
 		cursor->set_value(cursor, &data);
 		if ((ret = cursor->insert(cursor)) != 0)
 			testutil_die(ret, "WT_CURSOR.insert");
@@ -136,12 +160,17 @@ fill_db(uint32_t nth)
 	WT_THREAD_DATA *td;
 	uint32_t i;
 	int ret;
+	const char *envconf;
 
 	thr = dcalloc(nth, sizeof(pthread_t));
 	td = dcalloc(nth, sizeof(WT_THREAD_DATA));
 	if (chdir(home) != 0)
 		testutil_die(errno, "Child chdir: %s", home);
-	if ((ret = wiredtiger_open(NULL, NULL, ENV_CONFIG, &conn)) != 0)
+	if (inmem)
+		envconf = ENV_CONFIG_DEF;
+	else
+		envconf = ENV_CONFIG_TXNSYNC;
+	if ((ret = wiredtiger_open(NULL, NULL, envconf, &conn)) != 0)
 		testutil_die(ret, "wiredtiger_open");
 	if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0)
 		testutil_die(ret, "WT_CONNECTION:open_session");
@@ -187,11 +216,11 @@ main(int argc, char *argv[])
 	WT_CURSOR *cursor;
 	WT_SESSION *session;
 	WT_RAND_STATE rnd;
-	uint64_t key, last_key;
-	uint32_t absent, count, i, nth, timeout;
+	uint64_t absent, count, key, last_key, middle;
+	uint32_t i, nth, timeout;
 	int ch, status, ret;
 	pid_t pid;
-	bool rand_th, rand_time, verify_only;
+	bool fatal, rand_th, rand_time, verify_only;
 	const char *working_dir;
 	char fname[64], kname[64];
 
@@ -200,17 +229,21 @@ main(int argc, char *argv[])
 	else
 		++progname;
 
+	inmem = false;
 	nth = MIN_TH;
 	rand_th = rand_time = true;
 	timeout = MIN_TIME;
 	verify_only = false;
 	working_dir = "WT_TEST.random-abort";
 
-	while ((ch = __wt_getopt(progname, argc, argv, "h:T:t:v")) != EOF)
+	while ((ch = __wt_getopt(progname, argc, argv, "h:mT:t:v")) != EOF)
 		switch (ch) {
 		case 'h':
 			working_dir = __wt_optarg;
 			break;
+		case 'm':
+			inmem = true;
+			break;
 		case 'T':
 			rand_th = false;
 			nth = (uint32_t)atoi(__wt_optarg);
@@ -303,7 +336,9 @@ main(int argc, char *argv[])
 		testutil_die(ret, "WT_SESSION.open_cursor: %s", uri);
 
 	absent = count = 0;
+	fatal = false;
 	for (i = 0; i < nth; ++i) {
+		middle = 0;
 		snprintf(fname, sizeof(fname), RECORDS_FILE, i);
 		if ((fp = fopen(fname, "r")) == NULL) {
 			fprintf(stderr,
@@ -313,8 +348,10 @@ main(int argc, char *argv[])
 
 		/*
 		 * For every key in the saved file, verify that the key exists
-		 * in the table after recovery.  Since we did write-no-sync, we
-		 * expect every key to have been recovered.
+		 * in the table after recovery.  If we're doing in-memory
+		 * log buffering we never expect a record missing in the middle,
+		 * but records may be missing at the end.  If we did
+		 * write-no-sync, we expect every key to have been recovered.
 		 */
 		for (last_key = UINT64_MAX;; ++count, last_key = key) {
 			ret = fscanf(fp, "%" SCNu64 "\n", &key);
@@ -338,9 +375,20 @@ main(int argc, char *argv[])
 			if ((ret = cursor->search(cursor)) != 0) {
 				if (ret != WT_NOTFOUND)
 					testutil_die(ret, "search");
-				printf("%s: no record with key %" PRIu64 "\n",
-				    fname, key);
-				++absent;
+				if (!inmem)
+					printf("%s: no record with key %"
+					    PRIu64 "\n", fname, key);
+				absent++;
+				middle = key;
+			} else if (middle != 0) {
+				/*
+				 * We should never find an existing key after
+				 * we have detected one missing.
+				 */
+				printf("%s: after absent record at %" PRIu64
+				    " key %" PRIu64 " exists\n",
+				    fname, middle, key);
+				fatal = true;
 			}
 		}
 		if (fclose(fp) != 0)
@@ -348,11 +396,13 @@ main(int argc, char *argv[])
 	}
 	if ((ret = conn->close(conn, NULL)) != 0)
 		testutil_die(ret, "WT_CONNECTION:close");
-	if (absent) {
-		printf("%" PRIu32 " record(s) absent from %" PRIu32 "\n",
+	if (fatal)
+		return (EXIT_FAILURE);
+	if (!inmem && absent) {
+		printf("%" PRIu64 " record(s) absent from %" PRIu64 "\n",
 		    absent, count);
 		return (EXIT_FAILURE);
 	}
-	printf("%" PRIu32 " records verified\n", count);
+	printf("%" PRIu64 " records verified\n", count);
 	return (EXIT_SUCCESS);
 }
diff --git a/test/recovery/smoke.sh b/test/recovery/smoke.sh
index c7677b64503..ce0662d3b2b 100755
--- a/test/recovery/smoke.sh
+++ b/test/recovery/smoke.sh
@@ -5,4 +5,5 @@ set -e
 # Smoke-test recovery as part of running "make check".
 
 $TEST_WRAPPER ./random-abort -t 10 -T 5
+$TEST_WRAPPER ./random-abort -m -t 10 -T 5
 $TEST_WRAPPER ./truncated-log