#! /bin/bash

[ -z "$BASH_VERSION" ] && {
	echo "$0 is a bash script: \$BASH_VERSION not set, exiting"
	exit 1
}

name=$(basename $0)

quit=0
force_quit=0
onintr() {
	echo "$name: interrupted, cleaning up..."
	force_quit=1
}
trap 'onintr' 2

usage() {
	echo "usage: $0 [-aFSv] [-c config]"
	echo "    [-b format-binary] [-h home] [-j parallel-jobs] [-n total-jobs] [-t minutes] [format-configuration]"
	echo
	echo "    -a           abort/recovery testing (defaults to off)"
	echo "    -b binary    format binary (defaults to \"./t\")"
	echo "    -c config    format configuration file (defaults to CONFIG.stress)"
	echo "    -F           quit on first failure (defaults to off)"
	echo "    -h home      run directory (defaults to .)"
	echo "    -j parallel  jobs to execute in parallel (defaults to 8)"
	echo "    -n total     total jobs to execute (defaults to no limit)"
	echo "    -S           run smoke-test configurations (defaults to off)"
	echo "    -t minutes   minutes to run (defaults to no limit)"
	echo "    -v           verbose output (defaults to off)"
	echo "    --           separates $name arguments from format arguments"

	exit 1
}

# Smoke-tests.
smoke_base_1="data_source=table rows=100000 threads=6 timer=4"
smoke_base_2="$smoke_base_1 leaf_page_max=9 internal_page_max=9"
smoke_list=(
	# Three access methods.
	"$smoke_base_1 file_type=fix"
	"$smoke_base_1 file_type=row"
	"$smoke_base_1 file_type=var"

	# Huffman key/value encoding.
	"$smoke_base_1 file_type=row huffman_key=1 huffman_value=1"
	"$smoke_base_1 file_type=var huffman_key=1 huffman_value=1"

	# LSM.
	"$smoke_base_1 file_type=row data_source=lsm"

	# Force tree rebalance and the statistics server.
	"$smoke_base_1 file_type=row statistics_server=1 rebalance=1"

	# Overflow testing.
	"$smoke_base_2 file_type=var value_min=256"
	"$smoke_base_2 file_type=row key_min=256"
	"$smoke_base_2 file_type=row key_min=256 value_min=256"
)
smoke_next=0

abort_test=0
build=""
config="CONFIG.stress"
first_failure=0
format_args=""
home="."
minutes=0
parallel_jobs=8
smoke_test=0
total_jobs=0
verbose=0
format_binary="./t"

while :; do
	case "$1" in
	-a)
		abort_test=1
		shift ;;
	-b)
		format_binary="$2"
		shift ; shift ;;
	-c)
		config="$2"
		shift ; shift ;;
	-F)
		first_failure=1
		shift ;;
	-h)
		home="$2"
		shift ; shift ;;
	-j)
		parallel_jobs="$2"
		[[ "$parallel_jobs" =~ ^[1-9][0-9]*$ ]] || {
			echo "$name: -j option argument must be a non-zero integer"
			exit 1
		}
		shift ; shift ;;
	-n)
		total_jobs="$2"
		[[ "$total_jobs" =~ ^[1-9][0-9]*$ ]] || {
			echo "$name: -n option argument must be a non-zero integer"
			exit 1
		}
		shift ; shift ;;
	-S)
		smoke_test=1
		shift ;;
	-t)
		minutes="$2"
		[[ "$minutes" =~ ^[1-9][0-9]*$ ]] || {
			echo "$name: -t option argument must be a non-zero integer"
			exit 1
		}
		shift ; shift ;;
	-v)
		verbose=1
		shift ;;
	--)
		shift; break ;;
	-*)
		usage ;;
	*)
		break ;;
	esac
done
format_args="$*"

verbose() {
	[[ $verbose -ne 0 ]] && echo "$@"
}

verbose "$name: run starting at $(date)"

# Home is possibly relative to our current directory and we're about to change
# directories. Get an absolute path for home.
[[ -d "$home" ]] || {
	echo "$name: directory \"$home\" not found"
	exit 1
}
home=$(cd "$home" && echo "$PWD")

# Config is possibly relative to our current directory and we're about to
# change directories. Get an absolute path for config if it's local.
config_found=0
[[ -f "$config" ]] && config_found=1 && config="$PWD/$config"

# Move to the format.sh directory (assumed to be in a WiredTiger build tree).
cd $(dirname $0) || exit 1

# If we haven't already found it, check for the config file (by default it's
# CONFIG.stress, which lives in the same directory of the WiredTiger build
# tree as format.sh). We're about to change directories if we don't find the
# format binary here; get an absolute path for config if it's local.
[[ $config_found -eq 0 ]] && [[ -f "$config" ]] && config="$PWD/$config"

# Find the last part of format_binary, which is the format binary file.
# Builds are normally in the WiredTiger source tree, in which case it's in
# the same directory as format.sh, else it's in the build_posix tree. If the
# build is in the build_posix tree, move there; we have to run in the
# directory where the format binary lives because the format binary "knows"
# the wt utility is two directory levels above it.
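# Note: ${format_binary##* } below expands to the text after the last space
# in format_binary, so a wrapper invocation such as "valgrind ./t" (an
# illustrative value, not a default of this script) resolves to "./t".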
[[ -x ${format_binary##* } ]] || {
	build_posix_directory="../../build_posix/test/format"
	[[ ! -d $build_posix_directory ]] || cd $build_posix_directory || exit 1
	[[ -x ${format_binary##* } ]] || {
		echo "$name: format program \"${format_binary##* }\" not found"
		exit 1
	}
}

# Find the wt binary (required for abort/recovery testing).
wt_binary="../../wt"
[[ -x $wt_binary ]] || {
	echo "$name: wt program \"$wt_binary\" not found"
	exit 1
}

# We tested for the CONFIG file in the original directory, then in the
# WiredTiger source directory; the last place to check is the WiredTiger
# build directory. Fail if we don't find it.
[[ -f "$config" ]] || {
	echo "$name: configuration file \"$config\" not found"
	exit 1
}

verbose "$name configuration: $format_binary [-c $config]\
 [-h $home] [-j $parallel_jobs] [-n $total_jobs] [-t $minutes] $format_args"

failure=0
success=0
running=0
status="format.sh-status"

# Report a failure.
# $1 directory name
report_failure() {
	dir=$1
	log="$dir.log"

	echo "$name: failure status reported" > $dir/$status
	failure=$(($failure + 1))

	# Forcibly quit if first-failure configured.
	[[ $first_failure -ne 0 ]] && force_quit=1

	echo "$name: job in $dir failed"
	echo "$name: $dir log:"
	sed 's/^/    > /' < $log
}

# Resolve/cleanup completed jobs.
resolve() {
	running=0
	list=$(ls $home | grep '^RUNDIR.[0-9]*$')
	for i in $list; do
		dir="$home/$i"
		log="$dir.log"

		# Skip directories that aren't ours.
		[[ -f "$log" ]] || continue

		# Skip failures we've already reported.
		[[ -f "$dir/$status" ]] && continue

		# Get the process ID, ignore any jobs that aren't yet running.
		pid=$(grep -E 'process.*running' $log | awk '{print $3}')
		[[ "$pid" =~ ^[1-9][0-9]*$ ]] || continue

		# Leave any process waiting for a gdb attach running, but
		# report it as a failure.
		grep -E 'waiting for debugger' $log > /dev/null && {
			report_failure $dir
			continue
		}

		# If the job is still running, ignore it unless we're
		# forcibly quitting.
		kill -s 0 $pid > /dev/null 2>&1 && {
			[[ $force_quit -eq 0 ]] && {
				running=$((running + 1))
				continue
			}

			# Kill the process group to catch any child processes.
			kill -KILL -- -$pid
			wait $pid

			# Remove jobs we killed; they count as neither
			# success nor failure.
			rm -rf $dir $log
			verbose "$name: job in $dir killed"
			continue
		}

		# Wait for the job and get an exit status.
		wait $pid
		eret=$?

		# Remove successful jobs.
		grep 'successful run completed' $log > /dev/null && {
			rm -rf $dir $log
			success=$(($success + 1))
			verbose "$name: job in $dir successfully completed"
			continue
		}

		# Test recovery on jobs configured for random abort.
		grep 'aborting to test recovery' $log > /dev/null && {
			cp -pr $dir $dir.RECOVER

			(echo
			 echo "$name: running recovery after abort test"
			 echo "$name: original directory copied into $dir.RECOVER"
			 echo) >> $log

			# Everything is a table unless explicitly a file.
			uri="table:wt"
			grep 'data_source=file' $dir/CONFIG > /dev/null &&
			    uri="file:wt"

			# Use the wt utility to recover & verify the object.
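			# (wt's -R flag runs log recovery when opening the
			# database; the abort test is treated as a success
			# only if verify then passes.)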
			if $wt_binary -R -h $dir verify $uri >> $log 2>&1; then
				rm -rf $dir $dir.RECOVER $log
				success=$(($success + 1))
				verbose "$name: job in $dir successfully completed"
			else
				echo "$name: job in $dir failed abort/recovery testing"
				report_failure $dir
			fi
			continue
		}

		# Check for the library abort message, or an error from format.
		grep -E \
		    'aborting WiredTiger library|format alarm timed out|run FAILED' \
		    $log > /dev/null && {
			report_failure $dir
			continue
		}

		# There's some chance we just dropped core. We have the exit
		# status of the process, but there's no way to be sure. There
		# are reasons the process' exit status looks like a core dump
		# was created (format deliberately causes a segfault in the
		# case of abort/recovery testing, and does work that can often
		# segfault in the case of a snapshot-isolation mismatch
		# failure), but those cases have already been handled; format
		# is responsible for logging a failure before the core can
		# happen. If the process exited with a likely failure, call it
		# a failure.
		signame=""
		case $eret in
		$((128 + 3)))
			signame="SIGQUIT";;
		$((128 + 4)))
			signame="SIGILL";;
		$((128 + 6)))
			signame="SIGABRT";;
		$((128 + 7)))
			signame="SIGBUS";;
		$((128 + 8)))
			signame="SIGFPE";;
		$((128 + 11)))
			signame="SIGSEGV";;
		$((128 + 24)))
			signame="SIGXCPU";;
		$((128 + 25)))
			signame="SIGXFSZ";;
		$((128 + 31)))
			signame="SIGSYS";;
		esac
		[[ -z $signame ]] || {
			(echo
			 echo "$name: job in $dir killed with signal $signame"
			 echo "$name: there may be a core dump associated with this failure"
			 echo) >> $log

			echo "$name: job in $dir killed with signal $signame"
			echo "$name: there may be a core dump associated with this failure"

			report_failure $dir
			continue
		}

		# If we don't understand why the job exited, report it as a
		# failure and flag a problem in this script.
		echo "$name: job in $dir exited with status $eret for an unknown reason"
		echo "$name: reporting job in $dir as a failure"
		echo "$name: $name needs to be updated"
		report_failure $dir
	done

	return 0
}

# Start a single job.
count_jobs=0
format() {
	count_jobs=$(($count_jobs + 1))

	dir="$home/RUNDIR.$count_jobs"
	log="$dir.log"

	if [[ $smoke_test -ne 0 ]]; then
		args=${smoke_list[$smoke_next]}
		smoke_next=$(($smoke_next + 1))
		echo "$name: starting smoke-test job in $dir ($(date))"
	else
		args=$format_args

		# If abort/recovery testing is configured, do it 5% of the
		# time.
		[[ $abort_test -ne 0 ]] &&
		    [[ $(($count_jobs % 20)) -eq 0 ]] && args="$args abort=1"

		echo "$name: starting job in $dir ($(date))"
	fi

	cmd="$format_binary -c $config -h $dir -1 $args quiet=1"
	verbose "$name: $cmd"

	# Disassociate the command from the shell script so we can exit and
	# let the command continue to run. Run format in its own session so
	# child processes are in their own process groups and we can
	# individually terminate (and clean up) running jobs and their
	# children.
	nohup setsid $cmd > $log 2>&1 &
}
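# Main loop sketch: start jobs until the parallel limit is reached, then
# periodically resolve finished jobs and start replacements, quitting once a
# stop condition holds (timer, total-job count, smoke-test list exhausted,
# an interrupt, or first failure with -F) and no jobs remain running.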
seconds=$((minutes * 60))
start_time="$(date -u +%s)"
while :; do
	# Check if our time has expired.
	[[ $seconds -ne 0 ]] && {
		now="$(date -u +%s)"
		elapsed=$(($now - $start_time))

		# If we've run out of time, terminate all running jobs.
		[[ $elapsed -ge $seconds ]] && {
			verbose "$name: run timed out at $(date)"
			force_quit=1
		}
	}

	# Start more jobs.
	while :; do
		# Check if we're only running the smoke-tests and we're done.
		[[ $smoke_test -ne 0 ]] &&
		    [[ $smoke_next -ge ${#smoke_list[@]} ]] && quit=1

		# Check if the total number of jobs has been reached.
		[[ $total_jobs -ne 0 ]] &&
		    [[ $count_jobs -ge $total_jobs ]] && quit=1

		# Check if less than 60 seconds are left on any timer. The
		# goal is to avoid killing jobs that haven't yet configured
		# their signal handlers, because we rely on handler output
		# to determine their final status.
		[[ $seconds -ne 0 ]] &&
		    [[ $(($seconds - $elapsed)) -lt 60 ]] && quit=1

		# Don't create more jobs if we're quitting for any reason.
		[[ $force_quit -ne 0 ]] || [[ $quit -ne 0 ]] && break

		# Check if the maximum number of jobs in parallel has been
		# reached.
		[[ $running -ge $parallel_jobs ]] && break
		running=$(($running + 1))

		# Start another job, but don't pound on the system.
		format
		sleep 2
	done

	# Clean up and update status.
	success_save=$success
	failure_save=$failure
	resolve
	[[ $success -ne $success_save ]] ||
	    [[ $failure -ne $failure_save ]] &&
	    echo "$name: $success successful jobs, $failure failed jobs"

	# Quit if we're done and there aren't any jobs left to wait for.
	[[ $quit -ne 0 ]] || [[ $force_quit -ne 0 ]] &&
	    [[ $running -eq 0 ]] && break

	# Wait for a while, unless we're killing everything or there are
	# jobs to start.
	[[ $force_quit -eq 0 ]] && [[ $running -ge $parallel_jobs ]] && sleep 10
done

echo "$name: $success successful jobs, $failure failed jobs"

verbose "$name: run ending at $(date)"

[[ $failure -ne 0 ]] && exit 1
exit 0
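# Example invocations (paths and values below are illustrative, not taken
# from this script):
#	./format.sh -S -h /tmp/format.run
#		run the smoke-test configurations in /tmp/format.run
#	./format.sh -j 4 -t 120 -- file_type=row
#		run 4 jobs in parallel for 2 hours, passing file_type=row
#		through to format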