diff options
author | Julius Goryavsky <julius.goryavsky@mariadb.com> | 2021-06-09 03:41:37 +0200 |
---|---|---|
committer | Julius Goryavsky <julius.goryavsky@mariadb.com> | 2021-06-15 14:27:22 +0200 |
commit | 18d5be5b54b1a05e6107a1c5828d9eed9cf18636 (patch) | |
tree | a8332d06a06906e09c1e59fbd9b2c6998d8edc11 /scripts | |
parent | 1c35a3f6fd92704d7a135a81c7752f5058aaede5 (diff) | |
download | mariadb-git-18d5be5b54b1a05e6107a1c5828d9eed9cf18636.tar.gz |
MDEV-25880: rsync may be mistakenly killed when overlapping SST
This commit fixes a bug was originally discovered during the
galera_nbo_sst_slave mtr test for 10.6 branch. However it is
relevant for all versions and can lead to intermittent SST
crashes via rsync on very fast server restarts - when a new
SST process (for example, after starting a new server instance)
overlaps the old SST process started by the previous, already
terminated server. This overlap can result in the new rsync
being killed instead of the old rsync, or the pid file from
the new rsync being killed, which then lead to problems.
Diffstat (limited to 'scripts')
-rw-r--r-- | scripts/wsrep_sst_common.sh | 2 | ||||
-rw-r--r-- | scripts/wsrep_sst_rsync.sh | 28 |
2 files changed, 25 insertions, 5 deletions
diff --git a/scripts/wsrep_sst_common.sh b/scripts/wsrep_sst_common.sh index 05944ef6035..c2f31b2818d 100644 --- a/scripts/wsrep_sst_common.sh +++ b/scripts/wsrep_sst_common.sh @@ -1190,7 +1190,6 @@ trim_string() check_pid() { local pid_file="$1" - local remove=${2:-0} if [ -r "$pid_file" ]; then local pid=$(cat "$pid_file" 2>/dev/null) if [ -n "$pid" ]; then @@ -1201,6 +1200,7 @@ check_pid() fi fi fi + local remove=${2:-0} if [ $remove -eq 1 ]; then rm -f "$pid_file" fi diff --git a/scripts/wsrep_sst_rsync.sh b/scripts/wsrep_sst_rsync.sh index 19a4d19fded..a602af79af0 100644 --- a/scripts/wsrep_sst_rsync.sh +++ b/scripts/wsrep_sst_rsync.sh @@ -68,6 +68,8 @@ cleanup_joiner() if [ "$WSREP_SST_OPT_ROLE" = 'joiner' ]; then wsrep_cleanup_progress_file fi + + [ -f "$SST_PID" ] && rm -f "$SST_PID" } check_pid_and_port() @@ -281,6 +283,7 @@ then *) wsrep_log_error "Unrecognized ssl-mode option: '$SSLMODE'" exit 22 # EINVAL + ;; esac if [ -z "$CAFILE_OPT" ]; then wsrep_log_error "Can't have ssl-mode='$SSLMODE' without CA file" @@ -499,6 +502,21 @@ elif [ "$WSREP_SST_OPT_ROLE" = 'joiner' ] then check_sockets_utils + SST_PID="$WSREP_SST_OPT_DATA/wsrep_rsync_sst.pid" + + # give some time for lingering stunnel from previous SST to complete + check_round=0 + while check_pid "$SST_PID" 0 + do + wsrep_log_info "previous SST not completed, waiting for it to exit" + check_round=$(( check_round + 1 )) + if [ $check_round -eq 10 ]; then + wsrep_log_error "SST script already running." + exit 114 # EALREADY + fi + sleep 1 + done + # give some time for lingering stunnel from previous SST to complete check_round=0 while check_pid "$STUNNEL_PID" 1 @@ -583,12 +601,14 @@ EOF RSYNC_ADDR="*" fi + echo $$ > "$SST_PID" + if [ -z "$STUNNEL" ] then rsync --daemon --no-detach --port "$RSYNC_PORT" --config "$RSYNC_CONF" $RSYNC_EXTRA_ARGS & RSYNC_REAL_PID=$! - TRANSFER_REAL_PID="$RSYNC_REAL_PID" - TRANSFER_PID=$RSYNC_PID + TRANSFER_REAL_PID=$RSYNC_REAL_PID + TRANSFER_PID="$RSYNC_PID" else # Let's check if the path to the config file contains a space? if [ "${RSYNC_CONF#* }" = "$RSYNC_CONF" ]; then @@ -631,8 +651,8 @@ EOF fi stunnel "$STUNNEL_CONF" & STUNNEL_REAL_PID=$! - TRANSFER_REAL_PID="$STUNNEL_REAL_PID" - TRANSFER_PID=$STUNNEL_PID + TRANSFER_REAL_PID=$STUNNEL_REAL_PID + TRANSFER_PID="$STUNNEL_PID" fi if [ "${SSLMODE#VERIFY}" != "$SSLMODE" ] |